1 /* $Id: alnmix.cpp 77169 2017-03-30 17:34:38Z grichenk $
2 * ===========================================================================
3 *
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Kamen Todorov, NCBI
27 *
28 * File Description:
29 * Alignment mix
30 *
31 * ===========================================================================
32 */
34 #include <ncbi_pch.hpp>
41 #include <objects/seq/Bioseq.hpp>
44 #include <serial/iterator.hpp>
46 #include <algorithm>
49 BEGIN_objects_SCOPE // namespace ncbi::objects::
// Constructor fragment; the signature line is not visible in this view.
// Starts with no score-calculation method (x_CalculateScore is 0) and then
// runs x_Init().
53  : x_CalculateScore(0)
54 {
55  x_Init();
56 }
// Constructor fragment; the opening of the signature is not visible in this
// view. Stores the address of `scope` in m_Scope and the caller-supplied
// `calc_score` in x_CalculateScore, then runs x_Init().
60  TCalcScoreMethod calc_score)
61  : m_Scope(&scope),
62  x_CalculateScore(calc_score)
63 {
    // Taken when the caller passed a null calc_score. The statement inside
    // this branch is not visible in this view.
64  if ( !x_CalculateScore ) {
66  }
67  x_Init();
68 }
// Empty function body; its signature is not visible in this view
// (presumably the destructor -- TODO confirm).
72 {
73 }
// Function returning void; its name and most of its body are not visible in
// this view (presumably x_Init, which the constructors call -- TODO confirm).
76 void
78 {
    // Visible fragment: a CAlnMixSequences is default-constructed. The
    // trailing ':' suggests this is one arm of a conditional expression whose
    // other parts are not shown.
80  new CAlnMixSequences() :
84 }
// Function returning void; its name and body statements are not visible in
// this view (presumably x_Reset, which Add() calls -- TODO confirm).
87 void
89 {
91 }
// Registers the input alignment `aln` and forwards its contents to the
// Dense-seg Add() overload. The signature line is not visible in this view.
//
// An alignment is identified by its address: if that address is already a key
// in m_InputAlnsMap the call does nothing. Otherwise the alignment is recorded
// in m_InputAlnsMap and m_InputAlns and then dispatched on its segment type.
// Segment types other than Dense-seg, Std and Disc are recorded but nothing
// further is added for them (the if/else-if chain has no final else).
94 void
96 {
97  if (m_InputAlnsMap.find((void *)&aln) == m_InputAlnsMap.end()) {
98  // add only if not already added
99  m_InputAlnsMap[(void *)&aln] = &aln;
100  m_InputAlns.push_back(CConstRef<CSeq_align>(&aln));
    // Dense-seg: hand the segment data straight to the Dense-seg overload.
102  if (aln.GetSegs().IsDenseg()) {
103  Add(aln.GetSegs().GetDenseg(), flags);
104  } else if (aln.GetSegs().IsStd()) {
    // Std: `sa` is created on a line not visible in this view; the visible
    // argument passes `this` only when a scope is available, 0 otherwise.
106  (m_Scope ? this : 0);
107  Add(*sa, flags);
108  } else if (aln.GetSegs().IsDisc()) {
    // Disc: the loop header is partly missing from this view; each contained
    // alignment reached through aln_it is added with the same flags.
110  aln_it,
111  aln.GetSegs().GetDisc().Get()) {
112  Add(**aln_it, flags);
113  }
114  }
115  }
116 }
// Adds one Dense-seg (`ds`) to the mix, honouring `flags`. The signature line
// is not visible in this view.
//
// Dense-segs are tracked by address in m_InputDSsMap; if the address of `ds`
// is already a key there, the call returns without doing anything.
//
// Throws CAlnException(eMergeFailure) when
//   - fForceTranslation is set, `ds` has no widths and no scope is available;
//   - a score-calculation method is set but no scope is available.
119 void
121 {
122  const CDense_seg* dsp = &ds;
124  if (m_InputDSsMap.find((void *)dsp) != m_InputDSsMap.end()) {
125  return; // it has already been added
126  }
127  x_Reset();
    // Debug builds only: run CDense_seg::Validate on the input.
128 #if _DEBUG
129  dsp->Validate(true);
130 #endif
132  // translate (extend with widths) the dense-seg if necessary
    // '&' binds tighter than '&&', so this reads
    // (flags & fForceTranslation) && !dsp->IsSetWidths().
133  if (flags & fForceTranslation && !dsp->IsSetWidths()) {
134  if ( !m_Scope ) {
135  string errstr = string("CAlnMix::Add(): ")
136  + "Cannot force translation for Dense_seg "
137  + NStr::NumericToString(m_InputDSs.size() + 1) + ". "
138  + "Neither CDense_seg::m_Widths are supplied, "
139  + "nor OM is used to identify molecule type.";
140  NCBI_THROW(CAlnException, eMergeFailure, errstr);
141  } else {
    // Store the width-extended copy in m_InputDSs and continue with that
    // copy instead of the caller's object.
142  m_InputDSs.push_back(x_ExtendDSWithWidths(*dsp));
143  dsp = m_InputDSs.back();
144  }
145  } else {
146  m_InputDSs.push_back(CConstRef<CDense_seg>(dsp));
147  }
149  if (flags & fCalcScore) {
150  if ( !x_CalculateScore ) {
151  // provide the default calc method
    // (the statement that installs the default is not visible in this view)
153  }
154  }
155  if ( !m_Scope && x_CalculateScore) {
156  NCBI_THROW(CAlnException, eMergeFailure, "CAlnMix::Add(): "
157  "Score calculation requested without providing "
158  "a scope in the CAlnMix constructor.");
159  }
    // Remember the flags used for this addition.
160  m_AddFlags = flags;
    // NOTE(review): the key stored here is `dsp`, which is the extended copy
    // when translation was forced above, whereas the duplicate check at the top
    // looks up the caller's original address. Re-adding the same original
    // Dense-seg with forced translation would therefore not be detected --
    // confirm whether that is intended.
162  m_InputDSsMap[(void *)dsp] = dsp;
    // Hand the (possibly extended) Dense-seg to the sequence and match
    // collections.
164  m_AlnMixSequences->Add(*dsp, flags);
166  m_AlnMixMatches->Add(*dsp, flags);
167 }
// Body of x_ExtendDSWithWidths (name taken from the error messages below; the
// signature line is not visible in this view).
//
// Builds and returns a copy of `ds` that carries widths, leaving `ds` itself
// untouched.
//
// Throws CAlnException(eMergeFailure) when
//   - `ds` already has widths;
//   - the rows of `ds` include both protein and nucleotide sequences;
//   - nucleotide rows are present and a segment length is not divisible by 3.
172 {
173  if (ds.IsSetWidths()) {
174  NCBI_THROW(CAlnException, eMergeFailure,
175  "CAlnMix::x_ExtendDSWithWidths(): "
176  "Widths already exist for the input alignment");
177  }
    // Classify every row as protein or nucleotide, using the m_IsAA flag of the
    // CAlnMixSeq that x_IdentifyAlnMixSeq produces for the row's Seq-id.
179  bool contains_AA = false, contains_NA = false;
180  CRef<CAlnMixSeq> aln_seq;
181  for (CDense_seg::TDim numrow = 0; numrow < ds.GetDim(); numrow++) {
182  m_AlnMixSequences->x_IdentifyAlnMixSeq(aln_seq, *ds.GetIds()[numrow]);
183  if (aln_seq->m_IsAA) {
184  contains_AA = true;
185  } else {
186  contains_NA = true;
187  }
188  }
189  if (contains_AA && contains_NA) {
190  NCBI_THROW(CAlnException, eMergeFailure,
191  "CAlnMix::x_ExtendDSWithWidths(): "
192  "Incorrect input Dense-seg: Contains both AAs and NAs but "
193  "widths do not exist!");
194  }
196  CRef<CDense_seg> new_ds(new CDense_seg());
198  // copy from the original
199  new_ds->Assign(ds);
    // Past the mixed-type check above, contains_NA means every row is
    // nucleotide. Each segment length is then divided by 3 in the copy.
201  if (contains_NA) {
202  // fix the lengths
203  const CDense_seg::TLens& lens = ds.GetLens();
204  CDense_seg::TLens& new_lens = new_ds->SetLens();
205  for (CDense_seg::TNumseg numseg = 0; numseg < ds.GetNumseg(); numseg++) {
206  if (lens[numseg] % 3) {
207  string errstr =
208  string("CAlnMix::x_ExtendDSWithWidths(): ") +
209  "Length of segment " + NStr::IntToString(numseg) +
210  " is not divisible by 3.";
211  NCBI_THROW(CAlnException, eMergeFailure, errstr);
212  } else {
213  new_lens[numseg] = lens[numseg] / 3;
214  }
215  }
216  }
218  // add the widths
    // One width per row: 3 when the rows are nucleotide, 1 otherwise. resize()
    // only fills newly created elements; this assumes the copied widths vector
    // starts empty, as the IsSetWidths() check at the top suggests.
219  CDense_seg::TWidths& new_widths = new_ds->SetWidths();
220  new_widths.resize(ds.GetDim(), contains_NA ? 3 : 1);
    // Debug builds only: run CDense_seg::Validate on the result.
221 #if _DEBUG
222  new_ds->Validate(true);
223 #endif
224  return new_ds;
225 }
// ChooseSeqId (name taken from the error message below; the signature line is
// not visible in this view).
//
// Compares two Seq-ids that are expected to refer to the same sequence and
// leaves the preferred one in `id1`: when CSeq_id::BestRank(id1) is greater
// than CSeq_id::BestRank(id2), `id1` is reset and overwritten with a copy of
// `id2`; otherwise `id1` is left unchanged. `id2` is never modified here.
//
// Throws CAlnException(eInvalidSeqId) when the two ids do not resolve to the
// same bioseq handle.
228 void
230 {
231  CRef<CAlnMixSeq> aln_seq1, aln_seq2;
232  m_AlnMixSequences->x_IdentifyAlnMixSeq(aln_seq1, id1);
233  m_AlnMixSequences->x_IdentifyAlnMixSeq(aln_seq2, id2);
234  if (aln_seq1->m_BioseqHandle != aln_seq2->m_BioseqHandle) {
235  string errstr =
236  string("CAlnMix::ChooseSeqId(CSeq_id& id1, const CSeq_id& id2):")
237  + " Seq-ids: " + id1.AsFastaString()
238  + " and " + id2.AsFastaString()
239  + " do not resolve to the same bioseq handle,"
240  " but are used on the same 'row' in different segments."
241  " This is legally allowed in a Std-seg, but conversion to"
242  " Dense-seg cannot be performed.";
243  NCBI_THROW(CAlnException, eInvalidSeqId, errstr);
244  }
    // Both ids are wrapped in CRef<CSeq_id> for the BestRank calls below.
    // Constness of id2 is cast away only to build that CRef; in this function
    // the resulting reference is passed to BestRank and nothing else.
245  CRef<CSeq_id> id1cref(&id1);
246  CRef<CSeq_id> id2cref(&(const_cast<CSeq_id&>(id2)));
247  if (CSeq_id::BestRank(id1cref) > CSeq_id::BestRank(id2cref)) {
    // Debug builds only: report when a GI in id1 is about to be replaced by a
    // text accession from id2 that has no version set.
248 #ifdef _DEBUG
249  if (id1.IsGi()) {
250  const CTextseq_id* txt_id = id2.GetTextseq_Id();
251  if (txt_id && !txt_id->IsSetVersion()) {
252  ERR_POST("Using version-less accession " << txt_id->GetAccession()
253  << " instead of GI " << id1.GetGi());
254  }
255  }
256 #endif
257  id1.Reset();
258  SerialAssign<CSeq_id>(id1, id2);
259  }
    // Debug builds only: the mirror case, where id1 is kept even though id2 is
    // a GI and id1 is a text accession with no version set. In non-debug builds
    // this else-branch does not exist.
260 #ifdef _DEBUG
261  else if (id2.IsGi()) {
262  const CTextseq_id* txt_id = id1.GetTextseq_Id();
263  if (txt_id && !txt_id->IsSetVersion()) {
264  ERR_POST("Using version-less accession " << txt_id->GetAccession()
265  << " instead of GI " << id2.GetGi());
266  }
267  }
268 #endif
269 }
// Function returning void that takes `flags`; its signature and the statements
// inside every branch are not visible in this view (presumably the merge entry
// point -- TODO confirm).
//
// Visible skeleton: the task name is set to "Sorting", work is selected by the
// fSortSeqsByScore and fSortInputByScore bits of `flags`, and the task name is
// then set to "Merging".
272 void
274 {
275  x_SetTaskName("Sorting");
    // Only when fSortSeqsByScore is set: one of two actions is chosen by
    // fSortInputByScore.
276  if (flags & fSortSeqsByScore) {
277  if (flags & fSortInputByScore) {
279  } else {
281  }
282  }
    // Always: one of two actions is chosen by fSortInputByScore.
283  if (flags & fSortInputByScore) {
285  } else {
287  }
288  x_SetTaskName("Merging");
291 }
// Returns a const reference to the Dense-seg obtained from
// m_AlnMixMerger->GetDenseg(). The signature line is not visible in this view.
294 const CDense_seg&
296 {
297  return m_AlnMixMerger->GetDenseg();
298 }
// Returns a const reference to the Seq-align obtained from
// m_AlnMixMerger->GetSeqAlign(). The signature line is not visible in this
// view.
301 const CSeq_align&
303 {
304  return m_AlnMixMerger->GetSeqAlign();
305 }
308 END_objects_SCOPE // namespace ncbi::objects::
