NCBI C++ ToolKit
msa_tool_job.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: msa_tool_job.cpp 47464 2023-04-20 00:19:10Z evgeniev $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Roman Katargin, Vladislav Evgeniev
27 */
28 
29 
30 #include <ncbi_pch.hpp>
31 
32 #include <corelib/ncbifile.hpp>
33 #include <corelib/ncbiexec.hpp>
34 #include <objmgr/seq_vector.hpp>
35 #include <objmgr/util/sequence.hpp>
37 
38 #include <serial/iterator.hpp>
39 
42 
44 
45 #include <gui/objutils/label.hpp>
47 
51 
52 #include <wx/filename.h>
53 #include <wx/utils.h>
54 #include <wx/msgdlg.h>
55 
58 
59 /*
60 TODO:
61 IdMap is currently used only for tree remapping.
62 It would probably be more efficient to store labels there rather than SeqLocs.
63 The most efficient approach is probably to retrieve this mapping
64 from CMappingRanges or from CSeq_loc_Mapper; most likely
65 CRef<CSeq_loc> Map(const CSeq_loc& src_loc) can be used for that.
66 */
67 
71 
/// Tree-traversal functor that replaces node labels found in an id map with
/// labels generated from the real Seq-id of the mapped location.
/// NOTE(review): the struct header, the constructor signature and the
/// operator() signature (listing lines 72, 75 and 80) are absent from this
/// listing; the comments below describe only the visible statements.
73 {
// Constructor: starts with no id map attached. operator() asserts that
// idmap has been set before it is used.
76  : idmap(NULL)
77  {
78  }
79 
// Node visitor. Nodes with an empty label, or a label that is not a key of
// *idmap, are left untouched. Every path returns eTreeTraverse, so the
// traversal is never stopped from here.
81  {
82  _ASSERT(idmap);
// Mutable reference: the label is rewritten in place further down.
83  string& label = node.GetValue().SetLabel();
84  if (label.empty()) {
85  return eTreeTraverse;
86  }
87  TIdMap::iterator id_iter = idmap->find(label);
88  if (id_iter == idmap->end()) {
89  return eTreeTraverse;
90  }
91 
// The mapped value is a (location, scope) pair; the scope is used both to
// resolve the location's Seq-id and to build the replacement label.
92  TLocPair p = id_iter->second;
93  CScope& scope = *p.second;
94  const CSeq_id& real_id =
95  sequence::GetId(*p.first, &scope);
// Clear the old label first, then let CLabel::GetLabel fill it again.
96  label.erase();
97  CLabel::GetLabel(real_id, &label, CLabel::eDefault, &scope);
98  return eTreeTraverse;
99  }
100 };
101 
/// Writes every input location to `ostr` as a FASTA record under a generated
/// local id ("lcl|seq-N"), and records what is needed to map results back:
///  - idmap:     "lcl|seq-N" -> (original location, scope)
///  - seq_types: IsNucleotide() flag for both the local and the original id
///  - ranges:    conversion between the local id and the original id
/// NOTE(review): the line carrying the function name (listing line 102) and
/// two argument lines of the AddConversion call (140, 143) are absent from
/// this listing.
103  CNcbiOstream& ostr,
104  TConstScopedObjects& locations,
105  TSeqTypeMap& seq_types,
106  CMappingRanges& ranges,
107  map<string, TLocPair>& idmap
108  ){
// Counter used to generate the sequential "seq-N" ids (pre-incremented,
// so the first id is "seq-1").
109  int dump_count = 0;
110 
111  NON_CONST_ITERATE(TConstScopedObjects, iter, locations){
// NOTE(review): the dynamic_cast result is dereferenced below without a
// null check; an object that is not a CSeq_loc would yield a null `loc`.
112  const CSeq_loc* loc = dynamic_cast<const CSeq_loc*>(iter->object.GetPointer());
113  CScope* scope = iter->scope;
114 
115  string loc_id = "seq-" + NStr::SizetToString(++dump_count);
116  idmap["lcl|" + loc_id] = make_pair(CConstRef<CSeq_loc>(loc), CRef<CScope>(scope));
117 
// FASTA defline followed by the sequence data in IUPAC coding.
118  ostr << ">" << "lcl|" << loc_id << endl;
119 
120  CSeqVector vec(*loc, *scope, CBioseq_Handle::eCoding_Iupac);
121  string data;
122  vec.GetSeqData(0, vec.size(), data);
123  ostr << data << endl;
124 
// NOTE(review): CSeq_loc::GetId() is documented to return NULL when the
// location has multiple ids or no id at all; it is dereferenced here and
// again below without a check.
125  CBioseq_Handle handle = scope->GetBioseqHandle(*loc->GetId());
126 
// Register the sequence type under the generated local id ...
127  CRef<CSeq_id> seq_id(new CSeq_id(CSeq_id::e_Local, loc_id));
128  CSeq_id_Handle local_sihd = CSeq_id_Handle::GetHandle(*seq_id);
129  seq_types[local_sihd] = handle.IsNucleotide();
130 
// ... and under the id of the original location.
131  CSeq_id_Handle base_sihd = CSeq_id_Handle::GetHandle(*loc->GetId());
132  seq_types[base_sihd] = handle.IsNucleotide();
133 
// Coordinate multiplier: 1 for nucleotide sequences, 3 otherwise.
// Presumably the mapper works in nucleotide-width units, so protein
// coordinates are scaled by codon length -- TODO confirm.
134  int trcf = handle.IsNucleotide() ? 1 : 3;
135 
136  // prepare for final mapping
137  ranges.AddConversion(
138  local_sihd,
139  0 * trcf, sequence::GetLength(*loc, scope) *trcf,
141  base_sihd,
142  sequence::GetStart(*loc, scope) *trcf,
144  );
145  }
146 }
147 
148 
149 ///////////////////////////////////////////////////////////////////////////////
150 /// CMSAToolJob
/// Creates a job for the tool called `tool_name`. The process id starts at 0;
/// the other methods treat any value <= 0 as "no process launched".
151 CMSAToolJob::CMSAToolJob(const wxString &tool_name)
152  : m_PId{0}
153  , m_ToolName(tool_name)
154 {
155  // Intentionally empty: all state is set in the initializer list.
156 }
157 
/// Prepares the temporary input/output files, rejects inputs that mix
/// nucleotide and protein sequences, and launches the external tool.
/// Returns false when the input mixes sequence types; the visible code
/// returns true on every other path.
/// NOTE(review): the function signature (listing line 158) and listing lines
/// 171, 192 and 209 are absent from this listing. This body is presumably
/// CMSAToolJob::BeforeRun() -- confirm against the header.
159 {
160  /// we serialize this through a set of temporary files
161  /// (their paths are stored in m_TmpIn and m_TmpOut)
162  m_TmpIn = wxFileName::CreateTempFileName(wxT("in"));
163  m_TmpOut = wxFileName::CreateTempFileName(wxT("out"));
164 
// NOTE(review): is_nuc is assigned below but never read in the visible
// lines; it is presumably consumed on the missing line 192 (command-line
// construction) -- confirm.
165  bool is_nuc = true;
166 
167  /// scoped to make sure we flush and close our file before we start!
168  {{
169  CNcbiOfstream ostr(m_TmpIn.fn_str());
170 
// (listing line 171, the start of the statement that ends on the next
// line, is missing here)
172  wxT("Preparing data for alignment..."));
173 
// ors  == true  if at least one sequence is flagged as nucleotide,
// ands == true  only if every sequence is flagged as nucleotide.
// The two differ exactly when the set mixes both kinds.
174  bool ors = false, ands = true;
175  ITERATE( TSeqTypeMap, iter, m_SeqTypes ){
176  ors = ors || iter->second;
177  ands = ands && iter->second;
178  }
179 
180  if( ands != ors ){
181  // we have mix, quit
182  wxMessageBox(
183  m_ToolName + wxT(" tool: All input sequences must be either DNA or protein."),
184  wxT("Error"), wxOK | wxICON_ERROR
185  );
186  return false;
187  }
188 
189  is_nuc = ors;
190  }}
191 
// NOTE(review): the log text below spells "executabe"; it is left as-is
// because it is a runtime string.
193  LOG_POST(Info << "Launching " << m_ToolName << " executabe:");
194  LOG_POST(Info << m_CmdLine);
195 
// Launch in the directory given by x_GetWorkingDirectory() when it is
// non-empty; otherwise use wxExecute with its default arguments.
196  wxString working_dir = x_GetWorkingDirectory();
197  if (working_dir.IsEmpty()) {
198  m_PId = ::wxExecute(m_CmdLine);
199  }
200  else {
201  wxExecuteEnv env;
202  env.cwd = working_dir;
203  m_PId = ::wxExecute(m_CmdLine, wxEXEC_ASYNC, nullptr, &env);
204  }
// A non-positive id is treated as a launch failure. The statement that
// uses `error` (listing line 209) is missing, so how the failure is
// reported cannot be determined from here.
205  if (m_PId <= 0) {
206  string error("Failed to launch ");
207  error += m_ToolName;
208  error += " executable.";
210  }
211 
212  return true;
213 }
214 
/// Waits for the launched tool to finish, reads its aligned-FASTA output,
/// maps the alignment from the generated local ids back to the original
/// sequences and adds the result (and, if present, the tree) as project items.
/// Throws CException(eUnknown) when the tool was not launched, produced no
/// output, or when reading/mapping the alignment failed.
/// NOTE(review): the function signature (listing line 215) and listing lines
/// 265, 279 and 283 are absent from this listing. The name is taken from the
/// log message inside the body.
216 {
217  if (m_PId <= 0) {
218  NCBI_THROW(CException, eUnknown, string(m_ToolName + " not launched!"));
219  }
220 
// Poll loop: sleep via wxSleep(3), honour cancellation by killing the
// child process, and leave the loop once the process is no longer alive.
221  while (true) {
222  ::wxSleep(3);
223 
224  if (IsCanceled()) {
225  ::wxKill(m_PId, wxSIGKILL);
226  return;
227  }
228 
229  ncbi::CProcess proc(static_cast<int>(m_PId), ncbi::CProcess::ePid);
230  if (!proc.IsAlive()) {
231  break;
232  }
233  }
234 
// An empty or unreadable output file is treated as a tool failure.
// NOTE(review): the message spells "executabale"; left as-is because it
// is a runtime string.
235  wxULongLong size = wxFileName::GetSize(m_TmpOut);
236  if (size == 0 || size == wxInvalidSize) {
237  NCBI_THROW(CException, eUnknown, string(m_ToolName + " executabale failed to produce results."));
238  }
239 
// Errors raised inside the try block are collected into err_msg and
// rethrown as a single CException after the handlers.
240  string err_msg;
241  try {
242  CNcbiIfstream istr(m_TmpOut.fn_str());
243  CStreamLineReader stream_line(istr);
244  CFastaReader fasta_reader(stream_line, CFastaReader::fParseGaps);
245 
246  /// -1 = multiple alignment
247  CRef<CSeq_entry> entry = fasta_reader.ReadAlignedSet(-1);
// Take the first CSeq_align found in the entry; any further one is only
// reported in the log.
248  CRef<CSeq_align> align;
249  CTypeIterator<CSeq_align> iter(*entry);
250  if (iter) {
251  align.Reset(&*iter);
252  ++iter;
253  if (iter) {
254  LOG_POST(Error << "CMSAToolJob::x_CreateProjectItems(): more than one alignment!");
255  }
256  }
257 
258  ///
259  /// perform any necessary remappings
260  ///
261  CSeq_loc_Mapper mapper( &m_Ranges );
// (the second argument of SetSeqTypeById, listing line 265, is missing)
262  ITERATE( TSeqTypeMap, st_itr, m_SeqTypes ){
263  mapper.SetSeqTypeById(
264  st_itr->first,
266  );
267  }
268 
269  //- translate locals to proper ids
270  //- translate coords to proper offsets
// NOTE(review): if no CSeq_align was found above, `align` is still empty
// when it is dereferenced here.
271  align = mapper.Map( *align );
272 
273  ///
274  /// create an annotation to hold our results
275  CRef<CSeq_annot> annot(new CSeq_annot());
276  annot->SetData().SetAlign().push_back(align);
277 
// (listing line 279, which presumably fills `title`, is missing; in the
// visible lines `title` is never assigned)
278  string title;
280  string align_title(m_ToolName + " alignment: ");
281  align_title += title;
282  annot->SetNameDesc(align_title);
284 
// Description attached to every project item created below.
285  string comment = "Generated by ";
286  comment += m_ToolName;
287  comment += " tool with command line : \n";
288  comment += m_CmdLine.ToUTF8();
289 
// `annot` was allocated with new a few lines above and is not reset in
// the visible lines.
290  if (annot) {
291  CRef<CProjectItem> item(new CProjectItem());
292  item->SetItem().SetAnnot(*annot);
293  item->SetLabel(align_title);
294  CRef<CAnnotdesc> descr(new CAnnotdesc());
295  descr->SetComment(comment);
296  item->SetDescr().push_back(descr);
297  AddProjectItem(*item);
298  }
299 
// A tree item is added only when a tree output file name has been set.
300  if (!m_TmpTreeOut.empty())
301  x_AddTreeProjectItem(title, comment);
302 
303  }
304  catch (CException& e) {
305  err_msg = "Failed to generate alignment:\n" + e.GetMsg();
306  }
307  catch (std::exception& e) {
308  err_msg = "Failed to generate alignment:\n" + string(e.what());
309  }
310 
311  if (!err_msg.empty())
312  NCBI_THROW(CException, eUnknown, err_msg);
313 }
314 
/// Reads the Newick tree from m_TmpTreeOut, replaces the generated node
/// labels using m_IdMap, converts the tree to a bio-tree container with an
/// extra "seq-id" node feature, and adds it as a project item.
/// @param title    Appended to "Phylogenetic tree: " to form the tree type
///                 and the project-item label.
/// @param comment  Stored as the project item's description comment.
/// Does nothing when m_TmpTreeOut is empty. Read/convert failures do not
/// propagate: the handlers reset `btc` and clear m_TmpTreeOut, and no item
/// is added.
/// NOTE(review): listing lines 323 (presumably the declaration of `btc`),
/// 350 (inner loop header), 362 and 370 (start of the log statements) are
/// absent from this listing.
315 void CMSAToolJob::x_AddTreeProjectItem(const string &title, const string &comment)
316 {
317  if (m_TmpTreeOut.empty())
318  return;
319 
320  ///
321  /// read the tree, if we created one
322  ///
324 
325  try {
326  CNcbiIfstream tree_istr(m_TmpTreeOut.fn_str());
// unique_ptr takes ownership of the tree returned by ReadNewickTree.
327  unique_ptr<TPhyTreeNode> tree(ReadNewickTree(tree_istr));
328 
329  /// we need to convert the labels to the correct seq-id strings
330  STreeNodeMapper mapper;
331  mapper.idmap = &m_IdMap;
332  TreeDepthFirstTraverse(*tree, mapper);
333 
334  /// convert to the serializable form
335  btc = MakeBioTreeContainer(tree.get());
336 
337  /// adding seq-id prop
// Feature id 0 is the one copied from below; id 2 is registered here
// under the name "seq-id".
338  const int kLabelId = 0;
339  const int kSeqIdId = 2;
340 
341  CRef<CFeatureDescr> feat_descr(new CFeatureDescr);
342  feat_descr->SetId(kSeqIdId);
343  feat_descr->SetName("seq-id");
344  btc->SetFdict().Set().push_back(feat_descr);
345 
// For every node that has features, each feature with id kLabelId gets
// a sibling feature with id kSeqIdId carrying the same value.
346  NON_CONST_ITERATE(CNodeSet::Tdata, node, btc->SetNodes().Set()) {
347  if (!(*node)->CanGetFeatures()) {
348  continue;
349  }
// (inner loop header, listing line 350, is missing; it introduces
// `node_feature`)
351  (*node)->SetFeatures().Set()) {
352  if ((*node_feature)->GetFeatureid() == kLabelId) {
353  CRef<CNodeFeature> id_node_feature(new CNodeFeature);
354  id_node_feature->SetFeatureid(kSeqIdId);
355  id_node_feature->SetValue((*node_feature)->GetValue());
// Appends to the container being iterated. The appended feature has id
// kSeqIdId, so the kLabelId test above does not match it.
356  (*node)->SetFeatures().Set().push_back(id_node_feature);
357  }
358  }
359  }
360  }
// Both handlers reset btc and clear m_TmpTreeOut.
// NOTE(review): once m_TmpTreeOut is cleared here, nothing visible in
// this listing still holds the tree file path, so that file is presumably
// never deleted -- confirm.
361  catch (const CException& e) {
363  << "Failed to read phylogenetic tree:\n"
364  << e.GetMsg()
365  << "\nPhylogenetic tree output will not be available.");
366  btc.Reset();
367  m_TmpTreeOut.erase();
368  }
369  catch (const std::exception& e) {
371  << "Failed to read phylogenetic tree:\n"
372  << e.what()
373  << "\nPhylogenetic tree output will not be available.");
374  btc.Reset();
375  m_TmpTreeOut.erase();
376  }
377 
// Nothing to add if reading or conversion failed.
378  if (!btc)
379  return;
380 
381  string tree_title = "Phylogenetic tree: " + title;
382  btc->SetTreetype(tree_title);
383 
384  CRef<CProjectItem> item(new CProjectItem());
385  item->SetItem().SetOther().Set(*btc);
386  item->SetLabel(tree_title);
387  CRef<CAnnotdesc> descr(new CAnnotdesc());
388  descr->SetComment(comment);
389  item->SetDescr().push_back(descr);
390  AddProjectItem(*item);
391 }
392 
/// Removes the temporary input, output and tree files whose names are set.
/// The return value of wxRemoveFile is ignored.
/// NOTE(review): the signature (listing line 393) is absent from this
/// listing; this body is presumably the destructor ~CMSAToolJob() -- confirm.
394 {
395  if (!m_TmpIn.empty())
396  ::wxRemoveFile(m_TmpIn);
397  if (!m_TmpOut.empty())
398  ::wxRemoveFile(m_TmpOut);
399  if (!m_TmpTreeOut.empty())
400  ::wxRemoveFile(m_TmpTreeOut);
401 }
402 
404 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
std::invoke_result< _Fty, ICanceled & >::type GUI_AsyncExec(_Fty &&_Fnarg, const wxString &msg=wxT("Accessing network..."))
Definition: async_call.hpp:130
CAnnotdesc –.
Definition: Annotdesc.hpp:66
CBioseq_Handle –.
void AddProjectItem(objects::CProjectItem &item)
CRef< objects::CScope > m_Scope
Base class for reading FASTA sequences.
Definition: fasta.hpp:80
CFeatureDescr –.
Storage for multiple mapping ranges.
CNodeFeature –.
Definition: NodeFeature.hpp:66
CScope –.
Definition: scope.hpp:92
CSeqVector –.
Definition: seq_vector.hpp:65
void SetNameDesc(const string &name)
Definition: Seq_annot.cpp:66
void SetCreateDate(const CTime &dt)
Definition: Seq_annot.cpp:121
CSeq_loc_Mapper –.
Simple implementation of ILineReader for i(o)streams.
CTime –.
Definition: ncbitime.hpp:296
definition of a Culling tree
Definition: ncbi_tree.hpp:100
Template class for iteration on objects of class C.
Definition: iterator.hpp:673
Interface for testing cancellation request in a long lasting operation.
Definition: icanceled.hpp:51
const_iterator end() const
Definition: map.hpp:152
const_iterator find(const key_type &key) const
Definition: map.hpp:153
Definition: map.hpp:338
CRef< objects::CBioTreeContainer > MakeBioTreeContainer(const TPhyTreeNode *tree)
Conversion from TPhyTreeNode to CBioTreeContainer.
Operators to edit gaps in sequences.
static const char * proc
Definition: stats.c:21
static HENV env
Definition: transaction2.c:38
char data[12]
Definition: iconv.c:80
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
string
Definition: cgiapp.hpp:690
#define NULL
Definition: ncbistd.hpp:225
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
const string & GetMsg(void) const
Get message string.
Definition: ncbiexpt.cpp:461
virtual const char * what(void) const noexcept
Standard report (includes full backlog).
Definition: ncbiexpt.cpp:342
void Info(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1185
TSeqTypeMap m_SeqTypes
TIdMap m_IdMap
to make the alignment as robust as possible, we assign every sequence a unique identifier independent...
virtual TConstScopedObjects & x_GetObjects()=0
Returns the sequences that will be aligned.
pair< CConstRef< objects::CSeq_loc >, CRef< objects::CScope > > TLocPair
virtual wxString x_GetWorkingDirectory() const
Returns the directory where to execute the msa tool.
virtual void x_CreateProjectItems()
override this function in derived classes and populate m_Items.
wxString m_CmdLine
virtual wxString x_GetCommandLine(const wxString &input, const wxString &output, bool is_nucleotide)=0
Returns the command line, that will be used to execute the third-party tool.
virtual bool BeforeRun()
Function will be called on the main thread before execution of Run. Should not do any lengthy work ret...
wxString m_TmpOut
objects::CMappingRanges m_Ranges
virtual void x_AddTreeProjectItem(const string &title, const string &comment)
wxString m_ToolName
CMSAToolJob(const wxString &tool_name)
CMSAToolJob.
wxString m_TmpTreeOut
virtual ~CMSAToolJob()
wxString m_TmpIn
static void GetLabel(const CObject &obj, string *label, ELabelType type=eDefault)
Definition: label.cpp:140
virtual bool IsCanceled() const override
vector< SConstScopedObject > TConstScopedObjects
Definition: objects.hpp:65
@ eDefault
Definition: label.hpp:73
CRef< CSeq_entry > ReadAlignedSet(int reference_row, ILineErrorListener *pMessageListener=nullptr)
Read as many sequences as are available, and interpret them as an alignment, with hyphens marking rel...
Definition: fasta.cpp:1683
@ fParseGaps
Make a delta sequence if gaps found.
Definition: fasta.hpp:91
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
const CSeq_id * GetId(void) const
Get the id of the location return NULL if has multiple ids or no id at all.
Definition: Seq_loc.hpp:941
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
TSeqPos GetStart(const CSeq_loc &loc, CScope *scope, ESeqLocExtremes ext=eExtreme_Positional)
If only one CBioseq is represented by CSeq_loc, returns the position at the start of the location.
CRef< CSeq_loc > Map(const CSeq_loc &src_loc)
Map seq-loc.
void SetSeqTypeById(const CSeq_id_Handle &idh, ESeqType seqtype) const
Methods for setting sequence types.
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
void AddConversion(CRef< CMappingRange > cvt)
Add new mapping range to the proper place.
bool IsNucleotide(void) const
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
Definition: seq_vector.cpp:304
TSeqPos size(void) const
Definition: seq_vector.hpp:291
TObjectType * GetPointer(void) THROWS_NONE
Get pointer,.
Definition: ncbiobj.hpp:998
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::ofstream CNcbiOfstream
Portable alias for ofstream.
Definition: ncbistre.hpp:500
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
static string SizetToString(size_t value, TNumToStringFlags flags=0, int base=10)
Convert size_t to string.
Definition: ncbistr.cpp:2742
@ eCurrent
Use current time. See also CCurrentTime.
Definition: ncbitime.hpp:300
Fun TreeDepthFirstTraverse(TTreeNode &tree_node, Fun func)
Depth-first tree traversal algorithm.
Definition: ncbi_tree.hpp:504
ETreeTraverseCode
Tree traverse code returned by the traverse predicate function.
Definition: ncbi_tree.hpp:51
const TValue & GetValue(void) const
Return node's value.
Definition: ncbi_tree.hpp:184
@ eTreeTraverse
Keep traversal.
Definition: ncbi_tree.hpp:52
static const char label[]
void SetNodes(TNodes &value)
Assign a value to Nodes data member.
void SetFdict(TFdict &value)
Assign a value to Fdict data member.
list< CRef< CNodeFeature > > Tdata
void SetTreetype(const TTreetype &value)
Assign a value to Treetype data member.
list< CRef< CNode > > Tdata
Definition: NodeSet_.hpp:89
TDescr & SetDescr(void)
Assign a value to Descr data member.
void SetLabel(const TLabel &value)
Assign a value to Label data member.
void SetItem(TItem &value)
Assign a value to Item data member.
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_unknown
Definition: Na_strand_.hpp:65
@ e_Local
local use
Definition: Seq_id_.hpp:95
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_annot_.cpp:244
TComment & SetComment(void)
Select the variant.
Definition: Annotdesc_.hpp:548
CMSAToolJob::TIdMap TIdMap
USING_SCOPE(objects)
CMSAToolJob::TLocPair TLocPair
map< CSeq_id_Handle, bool > TSeqTypeMap
static void s_DumpSequences_wRange(CNcbiOstream &ostr, TConstScopedObjects &locations, TSeqTypeMap &seq_types, CMappingRanges &ranges, map< string, TLocPair > &idmap)
#define wxT(x)
Definition: muParser.cpp:41
const struct ncbi::grid::netcache::search::fields::SIZE size
Defines a portable execute class.
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
TPhyTreeNode * ReadNewickTree(CNcbiIstream &is)
Newick format input.
ETreeTraverseCode operator()(TPhyTreeNode &node, int level)
#define _ASSERT
Modified on Fri Sep 20 14:57:54 2024 by modify_doxy.py rev. 669887