NCBI C++ ToolKit
alnseq.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: alnseq.cpp 100369 2023-07-25 17:29:42Z grichenk $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Kamen Todorov, NCBI
27 *
28 * File Description:
29 * Alignment sequences
30 *
31 * ===========================================================================
32 */
33 
34 #include <ncbi_pch.hpp>
38 
39 #include <objects/seq/Bioseq.hpp>
41 
42 #include <objmgr/scope.hpp>
43 #include <objmgr/bioseq_handle.hpp>
44 
45 #include <algorithm>
46 
47 
49 BEGIN_objects_SCOPE // namespace ncbi::objects::
50 
51 
53  : m_DsCnt(0),
54  m_ContainsAA(false),
55  m_ContainsNA(false)
56 {
57 }
58 
59 
61  : m_DsCnt(0),
62  m_Scope(&scope),
63  m_ContainsAA(false),
64  m_ContainsNA(false)
65 {
66 }
67 
68 
69 inline
70 bool
72  const CRef<CAlnMixSeq>& seq2)
73 {
74  return seq1->m_Score >seq2->m_Score;
75 }
76 
77 
78 inline
79 bool
81  const CRef<CAlnMixSeq>& seq2)
82 {
83  return
84  (seq1->m_ChainScore == seq2->m_ChainScore &&
85  seq1->m_Score > seq2->m_Score) ||
86  seq1->m_ChainScore > seq2->m_ChainScore;
87 }
88 
89 
90 void
92 {
93  stable_sort(m_Seqs.begin(), m_Seqs.end(), x_CompareScores);
94 }
95 
96 
97 void
99 {
100  stable_sort(m_Seqs.begin(), m_Seqs.end(), x_CompareChainScores);
101 }
102 
103 
104 void
106 {
107  m_DsCnt++;
108 
109  vector<CRef<CAlnMixSeq> >& ds_seq = m_DsSeq[&ds];
110 
111  for (CAlnMap::TNumrow row = 0; row < ds.GetDim(); row++) {
112 
113  CRef<CAlnMixSeq> aln_seq;
114 
115  if ( !m_Scope ) {
116  // identify sequences by their seq ids as provided by
117  // the dense seg (not as reliable as with OM, but faster)
118  CRef<CSeq_id> seq_id(new CSeq_id);
119  seq_id->Assign(*ds.GetIds()[row]);
120 
121  TSeqIdMap::iterator it = m_SeqIds.find(seq_id);
122  if (it == m_SeqIds.end()) {
123  // add this seq id
124  aln_seq = new CAlnMixSeq();
125  m_SeqIds[seq_id] = aln_seq;
126  aln_seq->m_SeqId = seq_id;
127  aln_seq->m_DsCnt = 0;
128 
129  // add this sequence
130  m_Seqs.push_back(aln_seq);
131 
132  // AA or NA?
133  if (ds.IsSetWidths()) {
134  if (ds.GetWidths()[row] == 1) {
135  aln_seq->m_IsAA = true;
136  m_ContainsAA = true;
137  } else {
138  aln_seq->m_IsAA = false;
139  m_ContainsNA = true;
140  }
141  }
142 
143  } else {
144  aln_seq = it->second;
145  }
146 
147  } else {
148  // uniquely identify the bioseq
149  x_IdentifyAlnMixSeq(aln_seq, *(ds.GetIds())[row]);
150 #if _DEBUG
151  // Verify the widths (if exist)
152  if (ds.IsSetWidths()) {
153  const int& width = ds.GetWidths()[row];
154  if (width == 1 && aln_seq->m_IsAA != true ||
155  width == 3 && aln_seq->m_IsAA != false) {
156  string errstr = string("CAlnMix::Add(): ")
157  + "Incorrect width("
158  + NStr::IntToString(width) +
159  ") or molecule type(" +
160  (aln_seq->m_IsAA ? "AA" : "NA") +
161  ").";
162  NCBI_THROW(CAlnException, eInvalidSegment,
163  errstr);
164  }
165  }
166 #endif
167  }
168  // Set the width
169  if (ds.IsSetWidths()) {
170  aln_seq->m_Width = ds.GetWidths()[row];
171  }
172 
173 
174  // Preserve the row of the the original sequences if requested.
175  // This is mostly used to allow a sequence to itself.
176  // Create an additional sequence, pointed by m_AntoherRow,
177  // if the row index differs.
178  if (flags & fPreserveRows) {
179  int row_index = aln_seq->m_RowIdx;
180  if (row_index == -1) {
181  // initialization
182  aln_seq->m_RowIdx = row;
183  } else while (row_index != row) {
184  if (aln_seq->m_AnotherRow) {
185  aln_seq = aln_seq->m_AnotherRow;
186  row_index = aln_seq->m_RowIdx;
187  } else {
188  CRef<CAlnMixSeq> another_row (new CAlnMixSeq);
189 
190  another_row->m_BioseqHandle = aln_seq->m_BioseqHandle;
191  another_row->m_SeqId = aln_seq->m_SeqId;
192  another_row->m_Width = aln_seq->m_Width;
193  another_row->m_SeqIdx = aln_seq->m_SeqIdx;
194  another_row->m_ChildIdx = aln_seq->m_ChildIdx + 1;
195  another_row->m_DsIdx = int(m_DsCnt);
196  another_row->m_RowIdx = row;
197 
198  m_Seqs.push_back(another_row);
199 
200  aln_seq = aln_seq->m_AnotherRow = another_row;
201 
202  break;
203  }
204  }
205  }
206 
207  aln_seq->m_DsCnt++;
208  ds_seq.push_back(aln_seq);
209  }
210 }
211 
212 
213 void
215 {
216  if ( !m_Scope ) {
217  string errstr = string("CAlnMix::x_IdentifyAlnMixSeq(): ")
218  + "In order to use this functionality "
219  "scope should be provided in CAlnMix constructor.";
220  NCBI_THROW(CAlnException, eInvalidRequest, errstr);
221  }
222 
223  CBioseq_Handle bioseq_handle =
224  m_Scope->GetBioseqHandle(seq_id);
225 
226  if ( !bioseq_handle ) {
227  string errstr = string("CAlnMix::x_IdentifyAlnMixSeq(): ")
228  + "Seq-id cannot be resolved: "
229  + (seq_id.AsFastaString());
230 
231  NCBI_THROW(CAlnException, eInvalidSeqId, errstr);
232  }
233 
234 
235  TBioseqHandleMap::iterator it = m_BioseqHandles.find(bioseq_handle);
236  if (it == m_BioseqHandles.end()) {
237  // add this bioseq handle
238  aln_seq = new CAlnMixSeq();
239  m_BioseqHandles[bioseq_handle] = aln_seq;
240  aln_seq->m_BioseqHandle =
241  &m_BioseqHandles.find(bioseq_handle)->first;
242 
243  CRef<CSeq_id> seq_id(new CSeq_id);
244  seq_id->Assign(*aln_seq->m_BioseqHandle->GetSeqId());
245  aln_seq->m_SeqId = seq_id;
246  aln_seq->m_DsCnt = 0;
247 
248  // add this sequence
249  m_Seqs.push_back(aln_seq);
250 
251  // AA or NA?
252  if (aln_seq->m_BioseqHandle->IsProtein()) {
253  aln_seq->m_IsAA = true;
254  m_ContainsAA = true;
255  } else {
256  aln_seq->m_IsAA = false;
257  m_ContainsNA = true;
258  }
259  } else {
260  aln_seq = it->second;
261  }
262 }
263 
264 
265 void
267 {
268  m_Rows.clear();
269 
270  int count = 0;
272  CRef<CAlnMixSeq>& seq = *i;
273 
274  if ( !seq->GetStarts().empty() ) {
275  m_Rows.push_back(seq);
276  seq->m_RowIdx = count++;
277  while ((seq = seq->m_ExtraRow) != NULL ) {
278  seq->m_RowIdx = count++;
279  m_Rows.push_back(seq);
280  }
281  }
282  }
283 }
284 
285 
286 void
288 {
289  NON_CONST_ITERATE (TSeqs, row_i, m_Rows) {
290  CAlnMixSeq * row = *row_i;
291  if ( !row->GetStarts().empty() ) {
292  if (row->m_PositiveStrand) {
293  row->SetStarts().current = row->GetStarts().begin();
294  } else {
295  row->SetStarts().current = row->GetStarts().end();
296  row->SetStarts().current--;
297  }
298  } else {
299  row->SetStarts().current = row->GetStarts().end();
300 #if _DEBUG
301  string errstr =
302  string("CAlnMixSequences::InitRowsStartIts():") +
303  " Internal error: no starts for row " +
304  NStr::IntToString(row->m_RowIdx) +
305  " (seq " +
306  NStr::IntToString(row->m_SeqIdx) +
307  " child " +
308  NStr::IntToString(row->m_ChildIdx) +
309  ").";
310  NCBI_THROW(CAlnException, eMergeFailure, errstr);
311 #endif
312  }
313  }
314 }
315 
316 
317 void
319 {
321  CAlnMixSeq * row = *row_i;
322  if ( !row->GetStarts().empty() ) {
323  if (row->m_PositiveStrand) {
324  row->SetStarts().current = row->GetStarts().begin();
325  } else {
326  row->SetStarts().current = row->GetStarts().end();
327  row->SetStarts().current--;
328  }
329  } else {
330  row->SetStarts().current = row->GetStarts().end();
331 #if _DEBUG
332  string errstr =
333  string("CAlnMixSequences::InitExtraRowStartIts():") +
334  " Internal error: no starts for row " +
335  NStr::IntToString(row->m_RowIdx) + ".";
336  NCBI_THROW(CAlnException, eMergeFailure, errstr);
337 #endif
338  }
339  }
340 }
341 
342 
343 void
345 {
346  NON_CONST_ITERATE (TSeqs, row_i, m_Rows) {
347  ITERATE (CAlnMixStarts, st_i, (*row_i)->GetStarts()) {
348  st_i->second->
349  StartItsConsistencyCheck(**row_i,
350  st_i->first,
351  match_idx);
352  }
353  }
354 }
355 
356 
358  : m_DsCnt(0),
359  m_BioseqHandle(0),
360  m_Score(0),
361  m_ChainScore(0),
362  m_StrandScore(0),
363  m_IsAA(false),
364  m_Width(1),
365  m_Frame(-1),
366  m_PositiveStrand(true),
367  m_RefBy(0),
368  m_ExtraRow(0),
369  m_ExtraRowIdx(0),
370  m_AnotherRow(0),
371  m_DsIdx(0),
372  m_SeqIdx(-1),
373  m_ChildIdx(0),
374  m_RowIdx(-1),
375  m_Starts(new CAlnMixStarts())
376 {
377 }
378 
379 
381 {
382 }
383 
384 END_objects_SCOPE // namespace ncbi::objects::
static CRef< CScope > m_Scope
TDim TNumrow
Definition: alnmap.hpp:69
int m_SeqIdx
Definition: alnseq.hpp:162
CAlnMixSeq * m_AnotherRow
Definition: alnseq.hpp:160
int m_Score
Definition: alnseq.hpp:150
int m_DsIdx
Definition: alnseq.hpp:161
CAlnMixSeq(void)
Definition: alnseq.cpp:357
const CAlnMixStarts & GetStarts() const
Definition: alnseq.hpp:167
int m_RowIdx
Definition: alnseq.hpp:164
unsigned m_Width
Definition: alnseq.hpp:154
CAlnMixSeq * m_ExtraRow
Definition: alnseq.hpp:158
int m_DsCnt
Definition: alnseq.hpp:147
int m_ChildIdx
Definition: alnseq.hpp:163
bool m_IsAA
Definition: alnseq.hpp:153
int m_ChainScore
Definition: alnseq.hpp:151
~CAlnMixSeq()
Definition: alnseq.cpp:380
CRef< CSeq_id > m_SeqId
Definition: alnseq.hpp:149
const CBioseq_Handle * m_BioseqHandle
Definition: alnseq.hpp:148
void Add(const CDense_seg &ds, TAddFlags flags=0)
Definition: alnseq.cpp:105
void SortByScore()
Definition: alnseq.cpp:91
CRef< CScope > m_Scope
Definition: alnseq.hpp:127
vector< CRef< CAlnMixSeq > > TSeqs
Definition: alnseq.hpp:65
void SortByChainScore()
Definition: alnseq.cpp:98
vector< CRef< CAlnMixSeq > > m_Rows
Definition: alnseq.hpp:133
size_t m_DsCnt
Definition: alnseq.hpp:124
void x_IdentifyAlnMixSeq(CRef< CAlnMixSeq > &aln_seq, const CSeq_id &seq_id)
Definition: alnseq.cpp:214
TSeqIdMap m_SeqIds
Definition: alnseq.hpp:129
void BuildRows()
Definition: alnseq.cpp:266
list< CRef< CAlnMixSeq > > m_ExtraRows
Definition: alnseq.hpp:134
void RowsStartItsContsistencyCheck(size_t match_idx)
Definition: alnseq.cpp:344
map< const CDense_seg *, vector< CRef< CAlnMixSeq > > > m_DsSeq
Definition: alnseq.hpp:126
static bool x_CompareScores(const CRef< CAlnMixSeq > &seq1, const CRef< CAlnMixSeq > &seq2)
Definition: alnseq.cpp:71
void InitRowsStartIts()
Definition: alnseq.cpp:287
CAlnMixSequences(void)
Definition: alnseq.cpp:52
void InitExtraRowsStartIts()
Definition: alnseq.cpp:318
static bool x_CompareChainScores(const CRef< CAlnMixSeq > &seq1, const CRef< CAlnMixSeq > &seq2)
Definition: alnseq.cpp:80
TBioseqHandleMap m_BioseqHandles
Definition: alnseq.hpp:130
CBioseq_Handle –.
bool IsSetWidths(void) const
Definition: Dense_seg.hpp:196
const TWidths & GetWidths(void) const
Definition: Dense_seg.hpp:210
CScope –.
Definition: scope.hpp:92
const_iterator end() const
Definition: map.hpp:152
bool empty() const
Definition: map.hpp:149
const_iterator find(const key_type &key) const
Definition: map.hpp:153
static uch flags
#define true
Definition: bool.h:35
#define false
Definition: bool.h:36
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
string
Definition: cgiapp.hpp:687
#define NULL
Definition: ncbistd.hpp:225
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
const string AsFastaString(void) const
Definition: Seq_id.cpp:2266
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
Definition: Seq_id.cpp:318
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
CConstRef< CSeq_id > GetSeqId(void) const
Get id which can be used to access this bioseq handle Throws an exception if none is available.
bool IsProtein(void) const
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
TDim GetDim(void) const
Get the Dim member data.
Definition: Dense_seg_.hpp:421
const TIds & GetIds(void) const
Get the Ids member data.
Definition: Dense_seg_.hpp:505
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
#define row(bind, expected)
Definition: string_bind.c:73
Modified on Tue Apr 23 07:39:11 2024 by modify_doxy.py rev. 669887