NCBI C++ ToolKit
align_tab_export_job.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: align_tab_export_job.cpp 37334 2016-12-23 20:41:42Z katargir $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Roman Katargin
27 *
28 */
29 
30 #include <ncbi_pch.hpp>
31 
32 #include <corelib/ncbifile.hpp>
33 
34 #include <objmgr/feat_ci.hpp>
35 
37 
38 #include <objmgr/align_ci.hpp>
39 #include <objmgr/util/sequence.hpp>
40 
42 
44 
45 #include <gui/objutils/utils.hpp>
47 
50 
52 : CAppJob("Align Tab Export"), m_Params(params)
53 {
54 }
55 
56 
58 {
59  vector<CSeq_align::TDim> anchors;
60  CSeq_align::TDim num_row = align.CheckNumRows();
61  if (num_row == 0) {
62  // empty alignment
63  NCBI_THROW(CException, eUnknown, "Get empty alignment!");
64  }
66  for (row = 0; row < num_row; ++row) {
69  if ( !idh ) {
70  idh = sequence::GetId(align.GetSeq_id(row),
72  }
73  if ( handle.IsSynonym(idh) ) {
74  anchors.push_back(row);
75  }
76  }
77 
78  if (anchors.empty()) {
79  /// try a more aggressive matching approach
80  for (size_t level = 0; level <= 5 && anchors.empty(); ++level) {
81  for (row = 0; row < num_row; ++row) {
84  if ( !idh ) {
85  idh = sequence::GetId(align.GetSeq_id(row),
87  }
88  if ( handle.ContainsSegment(idh, level) ) {
89  anchors.push_back(row);
90  }
91  }
92  }
93  }
94 
95  _ASSERT( !anchors.empty() );
96  if (anchors.empty()) {
98  "Can find the anchor sequence in the alignment!");
99  }
100 
101  return anchors[0];
102 }
103 
104 static size_t s_CountSegments(const CSeq_align& align)
105 {
106  switch (align.GetSegs().Which()) {
107  default:
108  return 1;
109 
111  return align.GetSegs().GetDisc().Get().size();
112 
114  return align.GetSegs().GetSpliced().GetExons().size();
115  }
116 }
117 
119 static sequence::CDeflineGenerator s_DefGen; ///< This class seems to be non-thread safe even as a stack var
120 
121 static
122 inline void s_GetDefline(CBioseq_Handle bsh, string& defline )
123 {
125  defline = s_DefGen.GenerateDefline(bsh);
126 }
127 
128 
/// Produce a compact form of an id label that is really a long Windows-style
/// path under a "\panfs\" directory (e.g. a cSRA alignment file).
///
/// Labels of at most 20 characters, and labels that do not contain "\panfs\",
/// are returned unchanged.  Otherwise only the text after the last backslash
/// is kept; if that is still longer than 20 characters and contains at least
/// two dots (the last one not at position 0), only the text following the
/// second-to-last dot is kept.
///
/// @param label  Full id label.
/// @return       The shortened label, or a copy of @a label when no rule applies.
static std::string s_ShortenIdLabel(const std::string& label)
{
    const std::string::size_type kMaxLabelLen = 20;
    const std::string::size_type kNotFound = std::string::npos;

    // Short labels and non-panfs labels are left alone.
    if (label.length() <= kMaxLabelLen  ||  label.find("\\panfs\\") == kNotFound) {
        return label;
    }

    // Drop the directory part of the path.
    const std::string::size_type last_sep = label.rfind('\\');
    if (last_sep == kNotFound) {
        return label;
    }
    const std::string file_name = label.substr(last_sep + 1);
    if (file_name.length() <= kMaxLabelLen) {
        return file_name;
    }

    // Still too long: keep only the last two dot-separated components.
    const std::string::size_type last_dot = file_name.rfind('.');
    if (last_dot == kNotFound  ||  last_dot == 0) {
        return file_name;
    }
    const std::string::size_type prev_dot = file_name.rfind('.', last_dot - 1);
    if (prev_dot == kNotFound) {
        return file_name;
    }
    return file_name.substr(prev_dot + 1);
}
150 
/// Strip the leading "...\panfs\<base>" portion from a Windows-style path label.
///
/// Locates the first "\panfs\" marker, then the next backslash after the
/// path component that follows it, and returns the label from that backslash
/// onward (the backslash itself is kept).  If the marker is absent, or no
/// backslash follows the component after it, the label is returned unchanged.
///
/// @param label  Id label, possibly a panfs path.
/// @return       The label without its panfs base path, or a copy of @a label.
static std::string s_RemovePanfsBasePath(const std::string& label)
{
    const char kMarker[] = "\\panfs\\";
    const std::string::size_type kMarkerLen = sizeof(kMarker) - 1;

    const std::string::size_type marker_at = label.find(kMarker);
    if (marker_at == std::string::npos) {
        return label;
    }

    // Skip over the component right after the marker.
    const std::string::size_type tail_at = label.find('\\', marker_at + kMarkerLen);
    if (tail_at == std::string::npos) {
        return label;
    }
    return label.substr(tail_at);
}
164 
166 {
167  map<string, string> fieldData;
168 
169  const CSeq_align::TDim num_rows_limit = 10;
170  ///
171  /// first, format our seq-ids
172  ///
173  vector<bool> strands;
174  vector<string> ids;
175  CSeq_align::TDim num_rows = align.CheckNumRows();
176  string curr_text;
177  {{
178  for(CSeq_align::TDim i = 0; i < num_rows && i < num_rows_limit; ++i ){
181  if( idh_best ){
182  idh = idh_best;
183  }
184 
185  if ( !curr_text.empty() ) {
186  curr_text += " x ";
187  }
188  CConstRef<CSeq_id> seq_id(idh.GetSeqIdOrNull());
189  string label;
190  if (!seq_id) {
191  label = "UNK";
192  } else {
194  }
195  ids.push_back(label);
196  curr_text += s_ShortenIdLabel(label);
197  try {
198  strands.push_back(align.GetSeqStrand(i) == eNa_strand_plus);
199  } catch (CException&) {
200  // ignore the exception
201  }
202  }
203  }}
204 
205  if (num_rows > ids.size()) {
206  curr_text += " x ... [total ";
207  curr_text += NStr::IntToString(num_rows, NStr::fWithCommas);
208  curr_text += "]";
209  }
210 
211  fieldData["Alignment"] = curr_text;
212 
213  if (num_rows > ids.size()) { // early escape?
214  return fieldData;
215  }
216 
217  bool is_protein = false;
218  if (ids.size() == 2 && num_rows == 2 && anchorRow >= 0 && anchorRow < 2) {
219  TSeqRange range = align.GetSeqRange(anchorRow);
220  curr_text = ids[anchorRow] + " (";
221  curr_text += NStr::IntToString(range.GetFrom() + 1, NStr::fWithCommas);
222  curr_text += "..";
223  curr_text += NStr::IntToString(range.GetTo() + 1, NStr::fWithCommas);
224  curr_text += ")";
225  fieldData["Anchor"] = curr_text;
226 
227  range = align.GetSeqRange(1 - anchorRow);
228  //t_title = s_ShortenIdLabel(ids[1 - anchorRow]);
229  curr_text = s_RemovePanfsBasePath(ids[1 - anchorRow]) + " (";
230  curr_text += NStr::IntToString(range.GetFrom() + 1, NStr::fWithCommas);
231  curr_text += "..";
232  curr_text += NStr::IntToString(range.GetTo() + 1, NStr::fWithCommas);
233  curr_text += ")";
234  fieldData["Query"] = curr_text;
235 
236  CBioseq_Handle bsh = handle.GetScope().GetBioseqHandle(align.GetSeq_id(1 - anchorRow));
237  if (bsh) {
238  if (bsh.IsProtein()) {
239  is_protein = true;
240  }
241  s_GetDefline(bsh, curr_text);
242  }
243  }
244 
245  /// only report strand for pair-wise alignments
246  if (!is_protein && num_rows == 2 && strands.size() == 2) {
247  fieldData["Strand"] = (strands[0] == strands[1]) ? "forward" : "reverse";
248  }
249 
250  string tag_name;
251  ///
252  /// next, add a remark about the total aligned range
253  ///
254  CScoreBuilder builder;
255  // new method
256  TSeqPos align_length = builder.GetAlignLength(align);
257  tag_name = "Aligned ";
258  tag_name += is_protein ? "residues" : "bases";
259 
260  fieldData[tag_name] = NStr::IntToString(align_length, NStr::fWithCommas);
261 
262  size_t segs = s_CountSegments(align);
263  fieldData["Segments"] = NStr::SizetToString(segs, NStr::fWithCommas);
264 
265  char buf[255];
266  double coverage = -1.0;
267  if ( !align.GetNamedScore(CSeq_align::eScore_PercentCoverage, coverage) ){
268  try {
269  coverage = builder.GetPercentCoverage( handle.GetScope(), align );
270  } catch (CException&) {
271  // ignore
272  }
273  }
274  if (coverage >= 0.0) {
275  if (coverage < 100.0 && coverage > 99.9) {
276  curr_text = "99.9+";
277  } else {
278  sprintf(buf, "%2.1f", coverage);
279  curr_text = buf;
280  }
281  curr_text += "%";
282  fieldData["Coverage"] = curr_text;
283  }
284 
285  if (align_length < 1000000) {
286  //int identities = builder.GetIdentityCount(*m_Scope, align);
287  //int mismatches = builder.GetMismatchCount(*m_Scope, align);
288  int identities = -1;
289  int mismatches = -1;
290  if (!align.GetNamedScore(CSeq_align::eScore_IdentityCount, identities) ||
291  !align.GetNamedScore(CSeq_align::eScore_MismatchCount, mismatches)) {
292  try {
293  builder.GetMismatchCount(handle.GetScope(), align, identities, mismatches);
294  } catch (CException&) {
295  // ignore
296  }
297  }
298 
299  if (identities >= 0) {
300  double identity = 0.0;
301  if ( !align.GetNamedScore(CSeq_align::eScore_PercentIdentity, identity) ) {
302  identity = identities * 100.0 / align_length;
303  }
304  if (identity < 100.0 && identity > 99.9) {
305  curr_text = "99.9+";
306  } else {
307  sprintf(buf, "%2.1f", identity);
308  curr_text = buf;
309  }
310  curr_text += "%";
311  fieldData["Identity"] = curr_text;
312  }
313  if (mismatches >= 0) {
314  fieldData["Mismatches"] = NStr::NumericToString(mismatches, NStr::fWithCommas);
315  }
316  }
317 
318  try {
319  int gap_count = builder.GetGapCount(align);
320  fieldData["Gaps"] = NStr::IntToString(gap_count, NStr::fWithCommas);
321  } catch (CException&) {
322  // ignore
323  }
324 
325 // if (at_p != (TSeqPos)-1) {
326 // fieldData["Position"] = NStr::UIntToString(at_p + 1, NStr::fWithCommas);
327 // }
328 
329  return fieldData;
330 }
331 
333 {
334  string err_msg;
335 
336  try {
338  const CSeq_loc* loc =
339  dynamic_cast<const CSeq_loc*> (seqLoc.object.GetPointer());
340  CScope& scope = seqLoc.scope.GetObject();
341 
342  CBioseq_Handle handle = scope.GetBioseqHandle(*loc);
343 
347  CSeqUtils::SetResolveDepth(sel, true, -1);
348 
349  CNcbiOfstream ostr(m_Params.GetFileName().fn_str());
350  CCSVExporter exporter(ostr, '\t', '"');
351 
352  vector<string> fields = m_Params.GetFields();
353  ITERATE(vector<string>, it, fields)
354  exporter.Field(*it);
355  exporter.NewRow();
356 
357  for (CAlign_CI align_iter(handle, range, sel); align_iter; ++align_iter) {
358  if (IsCanceled())
359  return eCanceled;
360 
361  const CSeq_align& align = *align_iter;
362  CSeq_align::TDim anchorRow = sFindAnchorRow(handle, align);
363  map<string, string> fieldData = sGenerateFields(handle, align, anchorRow);
364 
365  ITERATE(vector<string>, it, fields)
366  exporter.Field(fieldData[*it]);
367  exporter.NewRow();
368  }
369  }
370  catch (CException& e) {
371  err_msg = "Failed to save file:\n";
372  err_msg += e.GetMsg();
373  }
374 
375  if (err_msg.empty()) {
376  LOG_POST(Info << "CAlignTabExportJob::Run() Finished " << m_Descr);
377  return eCompleted;
378  } else {
379  m_Error.Reset(new CAppJobError(err_msg));
380  return eFailed;
381  }
382 }
383 
User-defined methods of the data storage class.
static CFastMutex s_DFLock
USING_SCOPE(ncbi::objects)
static sequence::CDeflineGenerator s_DefGen
This class seems to be non-thread safe even as a stack var.
static CSeq_align::TDim sFindAnchorRow(CBioseq_Handle handle, const CSeq_align &align)
static string s_ShortenIdLabel(const string &label)
static void s_GetDefline(CBioseq_Handle bsh, string &defline)
static size_t s_CountSegments(const CSeq_align &align)
static string s_RemovePanfsBasePath(const string &label)
static map< string, string > sGenerateFields(CBioseq_Handle handle, const CSeq_align &align, CSeq_align::TDim anchorRow)
CAlignTabExportJob(const CAlignTabExportParams &params)
CAlignTabExportParams m_Params
virtual EJobState Run()
Function that does all the useful work, called by the Engine.
const SConstScopedObject & GetObject() const
vector< string > GetFields() const
CAlign_CI –.
Definition: align_ci.hpp:63
CAppJobError Default implementation for IAppJobError - encapsulates a text error message.
CAppJob - default implementation of IAppJob that could be used as a base class.
CBioseq_Handle –.
void Field(const string &value)
CFastMutex –.
Definition: ncbimtx.hpp:667
CScope –.
Definition: scope.hpp:92
TSeqPos GetAlignLength(const CSeq_align &align, bool ungapped=false)
Compute the length of the alignment (= length of all segments, gaps + aligned)
int GetGapCount(const CSeq_align &align)
Compute the number of gaps in the alignment.
double GetPercentCoverage(CScope &scope, const CSeq_align &align, unsigned query=0)
Compute percent coverage of the query (sequence 0) (range 0-100)
int GetMismatchCount(CScope &scope, const CSeq_align &align)
Compute the number of mismatches in the alignment.
@ eScore_PercentCoverage
Definition: Seq_align.hpp:168
@ eScore_IdentityCount
Definition: Seq_align.hpp:145
@ eScore_PercentIdentity
Definition: Seq_align.hpp:189
@ eScore_MismatchCount
Definition: Seq_align.hpp:154
CRange< TSeqPos > GetSeqRange(TDim row) const
GetSeqRange NB: On a Spliced-seg, in case the product-type is protein, these only return the amin par...
Definition: Seq_align.cpp:153
TDim CheckNumRows(void) const
Validators.
Definition: Seq_align.cpp:73
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
Definition: Seq_align.cpp:317
bool GetNamedScore(const string &id, int &score) const
Get score.
Definition: Seq_align.cpp:563
ENa_strand GetSeqStrand(TDim row) const
Get strand (the first one if segments have different strands).
Definition: Seq_align.cpp:294
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
const string & GetMsg(void) const
Get message string.
Definition: ncbiexpt.cpp:461
void Info(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1185
static objects::SAnnotSelector GetAnnotSelector(TAnnotFlags flags=0)
request an annotation selector for a given type
Definition: utils.cpp:167
static void SetAnnot(objects::SAnnotSelector &sel, const string &annot)
help function for setting up an annotation.
Definition: utils.cpp:320
static void SetResolveDepth(objects::SAnnotSelector &sel, bool adaptive, int depth=-1)
help function for setting selector resolve depth.
Definition: utils.cpp:405
CRef< CAppJobError > m_Error
CRef< objects::CScope > scope
Definition: objects.hpp:53
string m_Descr
mutex to sync our internals
virtual bool IsCanceled() const override
EJobState
Job states (describe FSM)
Definition: app_job.hpp:86
CConstRef< CObject > object
Definition: objects.hpp:52
@ eUnknown
Definition: app_popup.hpp:72
@ eCanceled
Definition: app_job.hpp:91
@ eCompleted
Definition: app_job.hpp:89
@ eFailed
Definition: app_job.hpp:90
void GetLabel(string *label, ELabelType type=eDefault, TLabelFlags flags=fLabel_Default) const
Append a label for this Seq-id to the supplied string.
Definition: Seq_id.cpp:2040
CConstRef< CSeq_id > GetSeqId(void) const
CConstRef< CSeq_id > GetSeqIdOrNull(void) const
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
@ eContent
Untagged human-readable accession or the like.
Definition: Seq_id.hpp:605
TRange GetTotalRange(void) const
Definition: Seq_loc.hpp:913
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
@ eGetId_Best
return the "best" gi (uses FindBestScore(), with CSeq_id::CalculateScore() as the score function
Definition: sequence.hpp:101
@ eGetId_Canonical
Definition: sequence.hpp:114
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
bool IsProtein(void) const
bool ContainsSegment(const CSeq_id &id, size_t resolve_depth=kMax_Int, EFindSegment limit_flag=eFindSegment_NoLimit) const
Check if the seq-id describes a segment of the bioseq.
CScope & GetScope(void) const
Get scope this handle belongs to.
bool IsSynonym(const CSeq_id &id) const
Check if this id can be used to obtain this bioseq handle.
TObjectType * GetPointer(void) const THROWS_NONE
Get pointer.
Definition: ncbiobj.hpp:1684
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
TObjectType & GetObject(void)
Get object.
Definition: ncbiobj.hpp:1011
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::ofstream CNcbiOfstream
Portable alias for ofstream.
Definition: ncbistre.hpp:500
static string SizetToString(size_t value, TNumToStringFlags flags=0, int base=10)
Convert size_t to string.
Definition: ncbistr.cpp:2751
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
@ fWithCommas
Use commas as thousands separator.
Definition: ncbistr.hpp:254
static const char label[]
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_align_.hpp:691
const TSpliced & GetSpliced(void) const
Get the variant data.
Definition: Seq_align_.cpp:219
const TExons & GetExons(void) const
Get the Exons member data.
const TDisc & GetDisc(void) const
Get the variant data.
Definition: Seq_align_.cpp:197
const Tdata & Get(void) const
Get the member data.
const TSegs & GetSegs(void) const
Get the Segs member data.
Definition: Seq_align_.hpp:921
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
char * buf
int i
range(_Ty, _Ty) -> range< _Ty >
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
#define row(bind, expected)
Definition: string_bind.c:73
SAnnotSelector –.
#define _ASSERT
Modified on Wed Apr 17 13:08:03 2024 by modify_doxy.py rev. 669887