1 /* $Id: align_tab_export_job.cpp 37334 2016-12-23 20:41:42Z katargir $
2 * ===========================================================================
3 *
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Roman Katargin
27 *
28 */
30 #include <ncbi_pch.hpp>
32 #include <corelib/ncbifile.hpp>
34 #include <objmgr/feat_ci.hpp>
38 #include <objmgr/align_ci.hpp>
39 #include <objmgr/util/sequence.hpp>
45 #include <gui/objutils/utils.hpp>
// Initializer list and (empty) body of a constructor whose signature line is not visible
// in this excerpt: CAppJob is initialized with the string "Align Tab Export" and the
// `params` argument is copied into m_Params.
52 : CAppJob("Align Tab Export"), m_Params(params)
53 {
54 }
// Body of the anchor-row search. The signature line is not visible in this excerpt;
// presumably this is sFindAnchorRow(handle, align), which Run() calls with a
// CBioseq_Handle and a CSeq_align -- TODO confirm.
// Returns the first row of `align` whose id is accepted for `handle`; raises a
// CException (eUnknown) when the alignment reports zero rows.
58 {
59  vector<CSeq_align::TDim> anchors;
60  CSeq_align::TDim num_row = align.CheckNumRows();
61  if (num_row == 0) {
62  // empty alignment
63  NCBI_THROW(CException, eUnknown, "Get empty alignment!");
64  }
 // First pass: a row qualifies when handle.IsSynonym() accepts its id handle.
 // (The declarations of `row` and `idh` are on lines not visible here.)
66  for (row = 0; row < num_row; ++row) {
69  if ( !idh ) {
70  idh = sequence::GetId(align.GetSeq_id(row),
72  }
73  if ( handle.IsSynonym(idh) ) {
74  anchors.push_back(row);
75  }
76  }
 // Fallback: retry with handle.ContainsSegment() at levels 0 through 5, stopping at the
 // first level that produces at least one anchor.
78  if (anchors.empty()) {
79  /// try a more aggressive matching approach
80  for (size_t level = 0; level <= 5 && anchors.empty(); ++level) {
81  for (row = 0; row < num_row; ++row) {
84  if ( !idh ) {
85  idh = sequence::GetId(align.GetSeq_id(row),
87  }
88  if ( handle.ContainsSegment(idh, level) ) {
89  anchors.push_back(row);
90  }
91  }
92  }
93  }
 // The assertion is followed by an explicit empty check, so anchors[0] below is only
 // reached with a non-empty vector if that check leaves the function.
 // NOTE(review): the message reads "Can find ..." -- probably meant "Can't find ...";
 // the statement that carries this string starts on a line not visible here.
95  _ASSERT( !anchors.empty() );
96  if (anchors.empty()) {
98  "Can find the anchor sequence in the alignment!");
99  }
 // When several rows matched, the lowest row index wins (rows are pushed in ascending order).
101  return anchors[0];
102 }
104 static size_t s_CountSegments(const CSeq_align& align)
105 {
106  switch (align.GetSegs().Which()) {
107  default:
108  return 1;
111  return align.GetSegs().GetDisc().Get().size();
114  return align.GetSegs().GetSpliced().GetExons().size();
115  }
116 }
119 static sequence::CDeflineGenerator s_DefGen; ///< This class seems to be non-thread safe even as a stack var
121 static
122 inline void s_GetDefline(CBioseq_Handle bsh, string& defline )
123 {
125  defline = s_DefGen.GenerateDefline(bsh);
126 }
/// Shorten an id label that is really a Windows-style panfs file path
/// (as seen for cSRA alignments).
///
/// Labels of at most 20 characters, or labels without a "\panfs\" component,
/// are returned unchanged. Otherwise the directory part is dropped; if the
/// remaining file name is still longer than 20 characters and contains at
/// least two dots (the last one not at position 0), only the text after the
/// second-to-last dot is kept.
///
/// @param label  original label
/// @return       the shortened label (or a copy of `label`)
static std::string s_ShortenIdLabel(const std::string& label)
{
    const std::string::size_type kMaxLabelLen = 20;

    if (label.length() <= kMaxLabelLen ||
        label.find("\\panfs\\") == std::string::npos) {
        return label;
    }

    const std::string::size_type last_sep = label.rfind('\\');
    if (last_sep == std::string::npos) {
        return label;
    }

    const std::string file_name = label.substr(last_sep + 1);
    if (file_name.length() <= kMaxLabelLen) {
        return file_name;
    }

    // Keep only the last two dot-separated components, e.g. "run42.csra".
    const std::string::size_type last_dot = file_name.rfind('.');
    if (last_dot == std::string::npos || last_dot == 0) {
        return file_name;
    }
    const std::string::size_type prev_dot = file_name.rfind('.', last_dot - 1);
    if (prev_dot == std::string::npos) {
        return file_name;
    }
    return file_name.substr(prev_dot + 1);
}
/// Strip the panfs base path from a Windows-style path label.
///
/// Looks for the first "\panfs\" component and then for the next backslash
/// after it; everything before that backslash is removed, so the result
/// starts with a backslash. Labels without a "\panfs\" component, or with
/// no further backslash after it, are returned unchanged.
///
/// @param label  original label
/// @return       label with the "...\panfs\<volume>" prefix removed
static std::string s_RemovePanfsBasePath(const std::string& label)
{
    static const char kPanfsMarker[] = "\\panfs\\";
    const std::string::size_type kMarkerLen = sizeof(kPanfsMarker) - 1;

    const std::string::size_type marker_pos = label.find(kPanfsMarker);
    if (marker_pos == std::string::npos) {
        return label;
    }

    // First separator after the marker, i.e. the end of the volume name.
    const std::string::size_type next_sep =
        label.find('\\', marker_pos + kMarkerLen);
    if (next_sep == std::string::npos) {
        return label;
    }
    return label.substr(next_sep);
}
// Body of the per-alignment field builder. The signature line is not visible in this
// excerpt; presumably this is sGenerateFields(handle, align, anchorRow), which Run()
// calls -- TODO confirm.
// Returns a map from field name to formatted text. Keys written here: "Alignment",
// "Anchor", "Query", "Strand", "Aligned bases" / "Aligned residues", "Segments",
// "Coverage", "Identity", "Mismatches" and "Gaps". Apart from "Alignment", each key is
// set only when the corresponding data is available.
166 {
167  map<string, string> fieldData;
 // At most this many rows get a label; alignments with more rows are summarized and
 // the function returns early (see below).
169  const CSeq_align::TDim num_rows_limit = 10;
170  ///
171  /// first, format our seq-ids
172  ///
173  vector<bool> strands;
174  vector<string> ids;
175  CSeq_align::TDim num_rows = align.CheckNumRows();
176  string curr_text;
177  {{
 // Builds "id1 x id2 x ..." in curr_text (shortened labels) while keeping the full
 // labels in `ids`. The declarations of `idh` / `idh_best` and the statement that
 // fills `label` in the else-branch are on lines not visible here.
178  for(CSeq_align::TDim i = 0; i < num_rows && i < num_rows_limit; ++i ){
181  if( idh_best ){
182  idh = idh_best;
183  }
185  if ( !curr_text.empty() ) {
186  curr_text += " x ";
187  }
188  CConstRef<CSeq_id> seq_id(idh.GetSeqIdOrNull());
189  string label;
190  if (!seq_id) {
191  label = "UNK";
192  } else {
194  }
195  ids.push_back(label);
196  curr_text += s_ShortenIdLabel(label);
 // A strand is recorded only when GetSeqStrand() does not throw, so `strands` may end
 // up shorter than `ids`.
197  try {
198  strands.push_back(align.GetSeqStrand(i) == eNa_strand_plus);
199  } catch (CException&) {
200  // ignore the exception
201  }
202  }
203  }}
 // Fewer labels than rows: append the total row count to the summary text.
205  if (num_rows > ids.size()) {
206  curr_text += " x ... [total ";
207  curr_text += NStr::IntToString(num_rows, NStr::fWithCommas);
208  curr_text += "]";
209  }
211  fieldData["Alignment"] = curr_text;
 // For such alignments only the "Alignment" field is reported.
213  if (num_rows > ids.size()) { // early escape?
214  return fieldData;
215  }
217  bool is_protein = false;
 // Pairwise case: report the anchor row and the other row (index 1 - anchorRow) as
 // "<label> (from..to)", with positions printed as GetFrom()+1 and GetTo()+1.
218  if (ids.size() == 2 && num_rows == 2 && anchorRow >= 0 && anchorRow < 2) {
219  TSeqRange range = align.GetSeqRange(anchorRow);
220  curr_text = ids[anchorRow] + " (";
221  curr_text += NStr::IntToString(range.GetFrom() + 1, NStr::fWithCommas);
222  curr_text += "..";
223  curr_text += NStr::IntToString(range.GetTo() + 1, NStr::fWithCommas);
224  curr_text += ")";
225  fieldData["Anchor"] = curr_text;
227  range = align.GetSeqRange(1 - anchorRow);
228  //t_title = s_ShortenIdLabel(ids[1 - anchorRow]);
229  curr_text = s_RemovePanfsBasePath(ids[1 - anchorRow]) + " (";
230  curr_text += NStr::IntToString(range.GetFrom() + 1, NStr::fWithCommas);
231  curr_text += "..";
232  curr_text += NStr::IntToString(range.GetTo() + 1, NStr::fWithCommas);
233  curr_text += ")";
234  fieldData["Query"] = curr_text;
 // The non-anchor sequence decides whether lengths are labelled "residues" or "bases".
236  CBioseq_Handle bsh = handle.GetScope().GetBioseqHandle(align.GetSeq_id(1 - anchorRow));
237  if (bsh) {
238  if (bsh.IsProtein()) {
239  is_protein = true;
240  }
 // NOTE(review): the defline written into curr_text here is never read -- every later
 // use of curr_text assigns it first. Possibly a leftover; confirm before removing.
241  s_GetDefline(bsh, curr_text);
242  }
243  }
245  /// only report strand for pair-wise alignments
 // Requires both strand lookups above to have succeeded (strands.size() == 2).
246  if (!is_protein && num_rows == 2 && strands.size() == 2) {
247  fieldData["Strand"] = (strands[0] == strands[1]) ? "forward" : "reverse";
248  }
250  string tag_name;
251  ///
252  /// next, add a remark about the total aligned range
253  ///
254  CScoreBuilder builder;
255  // new method
256  TSeqPos align_length = builder.GetAlignLength(align);
257  tag_name = "Aligned ";
258  tag_name += is_protein ? "residues" : "bases";
260  fieldData[tag_name] = NStr::IntToString(align_length, NStr::fWithCommas);
262  size_t segs = s_CountSegments(align);
263  fieldData["Segments"] = NStr::SizetToString(segs, NStr::fWithCommas);
 // Scratch buffer for the "%2.1f" percentage formatting below.
265  char buf[255];
 // -1 marks "no coverage known". The value comes from the alignment's named score when
 // GetNamedScore() succeeds, otherwise from the score builder; if that throws, coverage
 // stays negative and no "Coverage" field is written.
266  double coverage = -1.0;
267  if ( !align.GetNamedScore(CSeq_align::eScore_PercentCoverage, coverage) ){
268  try {
269  coverage = builder.GetPercentCoverage( handle.GetScope(), align );
270  } catch (CException&) {
271  // ignore
272  }
273  }
274  if (coverage >= 0.0) {
 // Values strictly between 99.9 and 100 are shown as "99.9+" rather than being
 // rounded up to "100.0" by the one-decimal format.
275  if (coverage < 100.0 && coverage > 99.9) {
276  curr_text = "99.9+";
277  } else {
278  sprintf(buf, "%2.1f", coverage);
279  curr_text = buf;
280  }
281  curr_text += "%";
282  fieldData["Coverage"] = curr_text;
283  }
 // Identity / mismatch figures are only produced for alignments shorter than
 // 1,000,000 positions (presumably to bound the cost of computing them -- confirm).
285  if (align_length < 1000000) {
286  //int identities = builder.GetIdentityCount(*m_Scope, align);
287  //int mismatches = builder.GetMismatchCount(*m_Scope, align);
 // -1 marks "unknown"; both counts are recomputed by the builder when either named
 // score is missing, and stay at their current values if that computation throws.
288  int identities = -1;
289  int mismatches = -1;
290  if (!align.GetNamedScore(CSeq_align::eScore_IdentityCount, identities) ||
291  !align.GetNamedScore(CSeq_align::eScore_MismatchCount, mismatches)) {
292  try {
293  builder.GetMismatchCount(handle.GetScope(), align, identities, mismatches);
294  } catch (CException&) {
295  // ignore
296  }
297  }
299  if (identities >= 0) {
 // Prefer the stored percent-identity score; otherwise derive it from the identity
 // count and the aligned length.
300  double identity = 0.0;
301  if ( !align.GetNamedScore(CSeq_align::eScore_PercentIdentity, identity) ) {
302  identity = identities * 100.0 / align_length;
303  }
304  if (identity < 100.0 && identity > 99.9) {
305  curr_text = "99.9+";
306  } else {
307  sprintf(buf, "%2.1f", identity);
308  curr_text = buf;
309  }
310  curr_text += "%";
311  fieldData["Identity"] = curr_text;
312  }
313  if (mismatches >= 0) {
314  fieldData["Mismatches"] = NStr::NumericToString(mismatches, NStr::fWithCommas);
315  }
316  }
 // "Gaps" is omitted when the gap count cannot be computed.
318  try {
319  int gap_count = builder.GetGapCount(align);
320  fieldData["Gaps"] = NStr::IntToString(gap_count, NStr::fWithCommas);
321  } catch (CException&) {
322  // ignore
323  }
325 // if (at_p != (TSeqPos)-1) {
326 // fieldData["Position"] = NStr::UIntToString(at_p + 1, NStr::fWithCommas);
327 // }
329  return fieldData;
330 }
// Body of the job's run method (the signature line is not visible in this excerpt; the
// log message below names it CAlignTabExportJob::Run()).
// Writes a table to the file named by m_Params.GetFileName(): one header row with the
// field names from m_Params.GetFields(), then one row per alignment visited by CAlign_CI.
// Returns eCanceled when IsCanceled() is true during iteration, eCompleted when no
// CException was caught, and eFailed (with m_Error set) otherwise.
333 {
334  string err_msg;
336  try {
 // NOTE(review): no null check on the dynamic_cast result is visible in this excerpt
 // before `*loc` is dereferenced below -- confirm one exists on the missing lines.
338  const CSeq_loc* loc =
339  dynamic_cast<const CSeq_loc*> (seqLoc.object.GetPointer());
340  CScope& scope = seqLoc.scope.GetObject();
342  CBioseq_Handle handle = scope.GetBioseqHandle(*loc);
 // `sel` and `range` are declared on lines not visible here.
347  CSeqUtils::SetResolveDepth(sel, true, -1);
349  CNcbiOfstream ostr(m_Params.GetFileName().fn_str());
 // Exporter is constructed with '\t' and '"' (presumably field delimiter and quote
 // character -- confirm against CCSVExporter).
350  CCSVExporter exporter(ostr, '\t', '"');
 // Header row: the selected field names, in the order given by the parameters.
352  vector<string> fields = m_Params.GetFields();
353  ITERATE(vector<string>, it, fields)
354  exporter.Field(*it);
355  exporter.NewRow();
357  for (CAlign_CI align_iter(handle, range, sel); align_iter; ++align_iter) {
 // Cancellation is polled once per alignment.
358  if (IsCanceled())
359  return eCanceled;
361  const CSeq_align& align = *align_iter;
362  CSeq_align::TDim anchorRow = sFindAnchorRow(handle, align);
363  map<string, string> fieldData = sGenerateFields(handle, align, anchorRow);
 // map::operator[] yields an empty string for fields that were not generated, so the
 // columns stay aligned with the header row.
365  ITERATE(vector<string>, it, fields)
366  exporter.Field(fieldData[*it]);
367  exporter.NewRow();
368  }
369  }
 // Any CException raised above (including those thrown while processing a single
 // alignment) ends the export and is reported through err_msg.
370  catch (CException& e) {
371  err_msg = "Failed to save file:\n";
372  err_msg += e.GetMsg();
373  }
375  if (err_msg.empty()) {
376  LOG_POST(Info << "CAlignTabExportJob::Run() Finished " << m_Descr);
377  return eCompleted;
378  } else {
379  m_Error.Reset(new CAppJobError(err_msg));
380  return eFailed;
381  }
382 }
