NCBI C++ ToolKit
cleanup_alignments_tool_manager.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: cleanup_alignments_tool_manager.cpp 47080 2022-07-22 18:11:54Z asztalos $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Mike DiCuccio
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
35 
36 #include <gui/objutils/label.hpp>
38 
46 
48 
53 
56 
57 
59 : CAlgoToolManagerBase("Clean Up Alignments",
60  "",
61  "Remove redundant elements from alignments",
62  "Merge and remove redundant elements from alignments to "
63  "create a cleaner diagonalized representation",
64  "https://www.ncbi.nlm.nih.gov/tools/gbench/tutorial1/#manipulate",
65  "Alignment Creation"),
66  m_ParamsPanel(NULL)
67 {
 // Nothing else to set up: the base class receives the tool's display
 // strings and help URL above, and m_ParamsPanel starts out NULL (it is
 // tested against NULL before first use elsewhere in this file).
68 }
69 
70 
72 {
 // Fixed, lowercase identifier string for this tool manager.
73  return "cleanup_alignments_tool_manager";
74 }
75 
76 
78 {
 // Fixed, human-readable label string for this tool manager.
79  return "Clean Up Alignments Tool";
80 }
81 
82 
84 {
86 
88 }
89 
90 
92 {
94 
96 }
97 
98 
100 {
 // Set the panel up only once; when m_ParamsPanel is already non-NULL the
 // function goes straight to the return.
101  if(m_ParamsPanel == NULL) {
103 
 // NOTE(review): panel allocation is not shown here -- confirm that
 // m_ParamsPanel has been assigned before the calls below dereference it.
104  if( x_AreSelfAlignmentsOnly() ){
106  }
107 
109  m_ParamsPanel->Hide(); // to reduce flicker
112 
 // The panel's registry path is this manager's m_RegPath plus ".ParamsPanel".
113  m_ParamsPanel->SetRegistryPath(m_RegPath + ".ParamsPanel");
115  }
 // Always reports success to the caller.
116  return true;
117 }
118 
119 
121 {
 // Validation succeeds only when m_Params.m_Alignments is non-empty.
122  string err;
123  if(m_Params.m_Alignments.size() == 0) {
124  err = "Please select at least one set of alignments!";
125  }
126  if( ! err.empty()) {
 // A non-empty message means validation failed.
 // NOTE(review): how the message reaches the user is not shown here -- confirm.
128  return false;
129  }
130  return true;
131 }
132 
133 
134 /// select only Seq-aligns
136 {
 // Discard any previous selection before rescanning the inputs.
 // Note that m_Params.m_Alignments is only cleared in this function; the
 // matches found below are collected in m_Alignments.
137  m_Params.m_Alignments.clear();
138  m_Alignments.clear();
 // First pass: keep every input object that is itself a CSeq_align.
139  ITERATE(vector<TConstScopedObjects>, it, m_InputObjects) {
140  ITERATE(TConstScopedObjects, it2, *it) {
141  const CSeq_align* align = dynamic_cast<const CSeq_align*>(it2->object.GetPointerOrNull());
142  if (align) m_Alignments.push_back(*it2);
143  }
144  }
145 
 // Fallback: no input was directly a Seq-align, so let
 // x_ConvertInputObjects fill m_Alignments for the CSeq_align type.
146  if (m_Alignments.empty()) {
147  x_ConvertInputObjects(CSeq_align::GetTypeInfo(), m_Alignments);
148  }
149 }
150 
152 {
 // Returns false as soon as a Seq-align is found whose rows contribute more
 // than one distinct entry to idh_set; returns true otherwise.
153 
155 
 // Objects that are not Seq-aligns do not influence the result.
156  const CSeq_align* aln = dynamic_cast<const CSeq_align*>( obj_ir->object.GetPointer() );
157  if( aln ){
158  int num_seqs = aln->CheckNumRows();
 // Alignments reporting no rows are skipped.
159  if( num_seqs <= 0 ){
160  continue;
161  }
162 
 // Collect one entry per row; the set keeps only the distinct ones.
163  set<CSeq_id_Handle> idh_set;
164  for( int q = 0; q < num_seqs; q++ ){
165  const CSeq_id& seq_id = aln->GetSeq_id( q );
167  idh_set.insert( idh );
168  }
169 
 // More than one distinct entry: this alignment spans different
 // sequences, so the overall answer is false.
170  if( idh_set.size() > 1 ){
171  return false;
172  }
173  }
174  }
175 
176  return true;
177 }
178 
180 {
 // Hands back the stored panel pointer as is; it is NULL until a panel
 // has been assigned to m_ParamsPanel.
181  return m_ParamsPanel;
182 }
183 
184 
186 {
 // Returns the address of the m_Params member (never NULL).
187  return &m_Params;
188 }
189 
190 
192 {
 // NOTE(review): the statement that constructs `job` is not shown here --
 // presumably a CCleanupAlignmentsJob built from m_Params; confirm.
194  return job;
195 }
196 
197 
198 
199 
200 ///////////////////////////////////////////////////////////////////////////////
201 /// CCleanupAlignmentsJob
202 
204 : m_Params(params)
205 {
 // Initializes m_Params from the caller's parameters (above) and sets the
 // job description text.
206  m_Descr = "Cleaning alignments"; //TODO
207 }
208 
209 /*
210 static string s_GetAnnotName(const CSeq_annot& annot)
211 {
212  /// determine a base name for the annotation
213  string annot_name;
214  if (annot.IsSetDesc()) {
215  ITERATE (CSeq_annot::TDesc::Tdata, iter, annot.GetDesc().Get()) {
216  const CAnnotdesc& desc = **iter;
217  if ( !desc.IsName() ) {
218  continue;
219  }
220 
221  annot_name = desc.GetName();
222  break;
223  }
224  }
225 
226  return annot_name;
227 }
228  */
229 
231 {
 // Overview: checks that all input alignments share one scope, produces
 // cleaned alignments with the algorithm selected by m_Params.m_Algo,
 // groups the results by strand into Seq-annots, and registers one project
 // item per annot via AddProjectItem.
233 
234  ///
235  /// assure we're all in one scope
236  ///
 // NOTE(review): if `aligns` is empty, `scope` is never set, yet it is
 // dereferenced (`*scope`) further down -- confirm callers guarantee a
 // non-empty selection.
237  CRef<CScope> scope;
238  {{
239  ITERATE (TConstScopedObjects, iter, aligns) {
240  if ( !scope ) {
241  scope.Reset(const_cast<CScope*>(&*iter->scope));
242  } else if (scope != &*iter->scope) {
243  scope.Reset();
244  NCBI_USER_THROW( "All alignments must be within the same project" );
245  }
246  }
247  }}
248 
249  ///
250  /// meat goes here
251  ///
252  TConstScopedObjects aligns_out;
253 
254  switch (m_Params.m_Algo) {
256  ///
257  /// alignment manager version is encapsulated in CAlignCleanup
258  ///
259  {{
260  CAlignCleanup::TConstAligns aligns_in;
261  CAlignCleanup::TAligns aligns_out_tmp;
262 
 // Only objects that really are Seq-aligns are passed to the cleanup.
263  ITERATE (TConstScopedObjects, iter, aligns) {
265  (dynamic_cast<const CSeq_align*>(&*iter->object));
266  if (al) {
267  aligns_in.push_back(al);
268  }
269  }
270 
 // Configure the cleanup from the user-chosen parameters.
271  CAlignCleanup cleanup(*scope);
272  cleanup.SortInputsByScore(m_Params.m_AlnMgr_Sort);
273  //cleanup.AllowTranslocations(m_Params.m_AlnMgr_Transloc);
274  cleanup.PreserveRows(m_Params.m_AlnMgr_PreserveRows);
275  cleanup.FillUnaligned(m_Params.m_AlnMgr_FillUnaligned);
276 
277  cleanup.Cleanup(aligns_in, aligns_out_tmp,
 // Wrap each cleaned alignment together with the shared scope.
279  ITERATE (CAlignCleanup::TAligns, iter, aligns_out_tmp) {
280  aligns_out.push_back(SConstScopedObject(*iter, scope));
281  }
282  }}
283  break;
284 
286  {{
287  ///
288  /// compose a list of hit refs
289  ///
290 
292  double min_idty = m_Params.m_HitFilter_MinIdentity;
293 
294  typedef CBlastTabular THit;
295  typedef CRef<THit> THitRef;
296  typedef vector<THitRef> THitRefs;
297 
 // Each Seq-align (or, for a disc alignment, each of its
 // sub-alignments) becomes one hit. Hits below the identity or
 // length threshold are dropped, and a hit whose GetQueryStrand()
 // is false has FlipStrands() applied before it is stored.
298  THitRefs hitrefs;
299  ITERATE (TConstScopedObjects, iter, aligns) {
301  (dynamic_cast<const CSeq_align*>(&*iter->object));
302  if (al) {
303  if (al->GetSegs().IsDisc()) {
304  const CSeq_align_set::Tdata &sas =
305  al->GetSegs().GetDisc().Get();
306  ITERATE(CSeq_align_set::Tdata, sa_iter, sas) {
307  CRef<CBlastTabular> hitref (new CBlastTabular(**sa_iter, true));
308  if(hitref->GetIdentity() >= min_idty &&
309  hitref->GetLength() >= min_len)
310  {
311  if(hitref->GetQueryStrand() == false) {
312  hitref->FlipStrands();
313  }
314  hitrefs.push_back(hitref);
315  }
316  }
317  } else {
318  CRef<CBlastTabular> hitref (new CBlastTabular(*al, true));
319  if(hitref->GetIdentity() >= min_idty &&
320  hitref->GetLength() >= min_len)
321  {
322  if(hitref->GetQueryStrand() == false) {
323  hitref->FlipStrands();
324  }
325  hitrefs.push_back(hitref);
326  }
327  }
328  }
329  }
330 
 // Nothing passed the filters: leave the switch with aligns_out
 // still empty, so no project item is created below.
331  if ( !hitrefs.size() ) {
332  break;
333  }
334 
335  ///
336  /// run the greedy reconciliation step
337  ///
338  THitRefs hits_new;
339  CHitFilter<THit>::s_RunGreedy(hitrefs.begin(), hitrefs.end(), &hits_new,
340  min_len, min_idty);
 // Remove entries matching s_PNullRef (presumably hits invalidated
 // by the greedy pass -- confirm), then append the hits it returned
 // in hits_new.
341  hitrefs.erase(remove_if(hitrefs.begin(), hitrefs.end(),
342  CHitFilter<THit>::s_PNullRef), hitrefs.end());
343  copy(hits_new.begin(), hits_new.end(), back_inserter(hitrefs));
344 
345  ///
346  /// format our results as a seq-annot
347  ///
 // align_list is a reference to the disc alignment's own list, so
 // the sub-alignments appended below land inside seq_align_disc
 // even though it is added to aligns_out first.
348  CRef<CSeq_align> seq_align_disc (new CSeq_align);
349  seq_align_disc->SetType(CSeq_align::eType_disc);
350  CSeq_align_set::Tdata& align_list = seq_align_disc->SetSegs().SetDisc().Set();
351  aligns_out.push_back(SConstScopedObject(seq_align_disc, scope));
352 
353  ITERATE(THitRefs, ii, hitrefs) {
354 
355  const THit& h = **ii;
356 
 // Rebuild a Dense-seg from the hit's starts, strands and decoded
 // transcript.
357  CRef<CDense_seg> ds (new CDense_seg);
358  const ENa_strand query_strand = h.GetQueryStrand()? eNa_strand_plus:
360  const ENa_strand subj_strand = h.GetSubjStrand()? eNa_strand_plus:
362  const string xcript (CAlignShadow::s_RunLengthDecode(h.GetTranscript()));
363 
364  ds->FromTranscript(h.GetQueryStart(), query_strand,
365  h.GetSubjStart(), subj_strand,
366  xcript);
367 
 // Strand data is reset when both rows are on the plus strand.
368  if(query_strand == eNa_strand_plus && subj_strand == eNa_strand_plus) {
369  ds->ResetStrands();
370  }
371 
 // Copy both ids (index 0 and 1) of the hit into the Dense-seg.
372  vector< CRef< CSeq_id > > &ids = ds->SetIds();
373  for(Uint1 where = 0; where < 2; ++where) {
374 
375  CRef<CSeq_id> id (new CSeq_id);
376  id->Assign(*h.GetId(where));
377  ids.push_back(id);
378  }
379 
 // Carry the hit's score over as a single real-valued score entry.
380  CDense_seg::TScores& scores = ds->SetScores();
381  CRef<CScore> score (new CScore);
382  score->SetValue().SetReal(h.GetScore());
383  scores.push_back(score);
384 
 // NOTE(review): the sub-alignment is typed eType_disc although its
 // segments are a Dense-seg -- confirm this is intended.
385  CRef<CSeq_align> seq_align (new CSeq_align);
386  seq_align->SetType(CSeq_align::eType_disc);
387  seq_align->SetSegs().SetDenseg(*ds);
388  align_list.push_back(seq_align);
389  }
390 
391  ///
392  /// add some scores to the alignments
393  ///
 // Scoring is best-effort: any CException raised for one alignment is
 // swallowed and the loop moves on to the next one.
 // NOTE(review): consider at least logging the failure.
394  NON_CONST_ITERATE (CSeq_align_set::Tdata, iter, align_list) {
395  try {
396  CScoreBuilder builder;
397  builder.AddScore(*scope, **iter,
399  } catch (CException&) {
400  }
401  }
402 
403  }}
404  break;
405  }
406 
407  //
408  // final packaging and reporting
409  //
410  if (aligns_out.size()) {
411 
412  CAlignGroup align_group_sorter;
413  CAlignGroup::TAlignList aligns_out_tmp;
414 
415  // container format conversion...
 // The const is cast away because TAlignList holds non-const
 // alignment references.
416  NON_CONST_ITERATE (TConstScopedObjects, iter, aligns_out) {
418  (const_cast<CSeq_align*>
419  (dynamic_cast<const CSeq_align*>
420  (&*iter->object)
421  ));
422  aligns_out_tmp.push_back(al);
423  }
424 
425  CAlignGroup::TAnnotList annot_list;
426  string annot_base_name("Cleaned Alignment: ");
427 
428  align_group_sorter.GroupByStrand(aligns_out_tmp,
429  annot_list,
430  annot_base_name,
431  *scope);
432  // now create a Project Item for the data
433 
434  ITERATE(CAlignGroup::TAnnotList, iter, annot_list) {
435  CRef<objects::CSeq_annot> annot = *iter;
436 
437  annot->SetCreateDate(CTime(CTime::eCurrent));
438 
439  // encode the name correctly
440  // the 'name' was previously used only for a temporary computation;
441  // here we make this the real 'name' that the object manager will understand
442 
 // NOTE(review): `name` is seeded with a non-empty prefix, so the
 // emptiness check below can only fail if GetLabel clears the
 // string -- confirm against CLabel::GetLabel.
443  string name("Cleaned Alignment: ");
444  CLabel::GetLabel(*annot, &name, CLabel::eDefault, &*scope);
445  if ( !name.empty() ) {
446  annot->SetNameDesc(name);
447  annot->SetTitleDesc(name);
448  }
449 
 // Wrap the annot in a project item labelled with the same name.
450  CRef<CProjectItem> pitem(new CProjectItem());
451 
452  pitem->SetItem().SetAnnot(*annot);
453  pitem->SetLabel(name);
454 
455  AddProjectItem(*pitem);
456  }
457  }
458 }
459 
460 
461 
463 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
void remove_if(Container &c, Predicate *__pred)
Definition: chainer.hpp:69
CAlgoToolManagerBase This is base class for simple algorithmic tool managers.
CUIObject m_Descriptor
describes the Manager's UI properties
virtual void InitUI()
override this function in a derived class and initialize extra members
string m_RegPath
registry path to the settings
wxWindow * m_ParentWindow
a window that will serve as a parent for our panels
virtual void CleanUI()
override this function in a derived class and clean extra members
void x_ConvertInputObjects(const CTypeInfo *typeInfo, map< string, TConstScopedObjects > &results)
vector< TConstScopedObjects > m_InputObjects
original input objects, the tool needs to select a subset of objects that can serve as valid input
CAlgoToolManagerParamsPanel.
class CAlignCleanup implements an alignment cleanup utility based on the C++ alignment manager.
list< CConstRef< CSeq_align > > TConstAligns
list< CRef< CSeq_align > > TAligns
void GroupByStrand(const TAlignList &aligns, TAnnotList &align_groups, const string &annot_base_name, objects::CScope &scope)
Group alignments into bins for each set of strands.
list< CRef< objects::CSeq_align > > TAlignList
Definition: align_group.hpp:55
list< CRef< objects::CSeq_annot > > TAnnotList
Definition: align_group.hpp:56
bool GetQueryStrand(void) const
TCoord GetQueryStart(void) const
static string s_RunLengthDecode(const string &in)
TCoord GetSubjStart(void) const
bool GetSubjStrand(void) const
const TId & GetId(Uint1 where) const
const TTranscript & GetTranscript(void) const
float GetScore(void) const
CDataLoadingAppJob - a base class for Jobs loading data into projects.
void AddProjectItem(objects::CProjectItem &item)
void FromTranscript(TSeqPos query_start, ENa_strand query_strand, TSeqPos subj_start, ENa_strand subj_strand, const string &transcript)
Initialize from pairwise alignment transcript (a string representation produced by CNWAligner)
Definition: Dense_seg.cpp:1273
CScope –.
Definition: scope.hpp:92
void AddScore(CScope &scope, CSeq_align &align, EScoreType score)
deprecated: use CSeq_align::EScoreType directly
Definition: Score.hpp:57
@ eScore_PercentIdentity
Definition: Seq_align.hpp:189
TDim CheckNumRows(void) const
Validators.
Definition: Seq_align.cpp:73
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
Definition: Seq_align.cpp:317
CTime –.
Definition: ncbitime.hpp:296
IRegSettings An interface for objects that save / restore settings using CGuiRegistry.
Definition: set.hpp:45
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
size_type size() const
Definition: set.hpp:132
USING_SCOPE(objects)
CSplign::THitRef THitRef
CSplign::THit THit
CSplign::THitRefs THitRefs
static void cleanup(void)
Definition: ct_dynamic.c:30
static void s_RunGreedy(typename THitRefs::iterator hri_beg, typename THitRefs::iterator hri_end, THitRefs *phits_new, TCoord min_hit_len=100, double min_hit_idty=.9, TCoord margin=1, TCoord retain_overlap=0, EUnique_type unique_type=e_Strict)
Definition: hit_filter.hpp:234
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define NULL
Definition: ncbistd.hpp:225
#define NCBI_USER_THROW(message)
Throw a quick-and-dirty runtime exception of type 'CException' with the given error message and error...
Definition: ncbiexpt.hpp:715
virtual CAlgoToolManagerParamsPanel * x_GetParamsPanel()
returns a pointer to the parameters panel, override in derived classes
virtual string GetExtensionLabel() const
returns a displayable label for this extension ( please capitalize the key words - "My Extension" )
void SetParams(SCleanupAlignmentsParams *params, TConstScopedObjects *objects)
virtual IRegSettings * x_GetParamsAsRegSetting()
return a pointer to Parameters object as IRegSettings interface
virtual CDataLoadingAppJob * x_CreateLoadingJob()
factory method for creating the job that executes the tool algorithm override in derived classes
virtual string GetExtensionIdentifier() const
returns the unique human-readable identifier for the extension the id should use lowercase letters se...
bool Create(wxWindow *parent, wxWindowID id=ID_CCLEANUPALIGNMENTSPARAMSPANEL, const wxPoint &pos=wxDefaultPosition, const wxSize &size=wxSize(400, 300), long style=wxTAB_TRAVERSAL)
virtual void x_CreateProjectItems()
override this function in derived classes and populate m_Items.
CCleanupAlignmentsJob(const SCleanupAlignmentsParams &params)
CCleanupAlignmentsJob.
virtual bool x_CreateParamsPanelIfNeeded()
returns / creates Parameters panel, override in derived classes see cpp file for example
virtual void InitUI()
override this function in a derived class and initialize extra members
CCleanupAlignmentsParamsPanel * m_ParamsPanel
virtual bool x_ValidateParams()
validates user input in Parameters panel, report errors if any
void x_SelectCompatibleInputObjects()
select only Seq-aligns
virtual void SetRegistryPath(const string &path)
CAlgoToolManagerParamsPanel.
virtual void CleanUI()
override this function in a derived class and clean extra members
static void GetLabel(const CObject &obj, string *label, ELabelType type=eDefault)
Definition: label.cpp:140
string m_Descr
mutex to sync our internals
void NcbiErrorBox(const string &message, const string &title="Error")
specialized Message Box function for reporting critical errors
virtual const string & GetLabel() const
Definition: ui_object.cpp:124
vector< SConstScopedObject > TConstScopedObjects
Definition: objects.hpp:65
@ eDefault
Definition: label.hpp:73
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
@ eCurrent
Use current time. See also CCurrentTime.
Definition: ncbitime.hpp:300
void SetLabel(const TLabel &value)
Assign a value to Label data member.
void SetItem(TItem &value)
Assign a value to Item data member.
void SetSegs(TSegs &value)
Assign a value to Segs data member.
Definition: Seq_align_.cpp:310
TScores & SetScores(void)
Assign a value to Scores data member.
Definition: Dense_seg_.hpp:611
void ResetStrands(void)
Reset Strands data member.
Definition: Dense_seg_.cpp:70
void SetType(TType value)
Assign a value to Type data member.
Definition: Seq_align_.hpp:818
bool IsDisc(void) const
Check if variant Disc is selected.
Definition: Seq_align_.hpp:772
vector< CRef< CScore > > TScores
Definition: Dense_seg_.hpp:110
list< CRef< CSeq_align > > Tdata
TIds & SetIds(void)
Assign a value to Ids data member.
Definition: Dense_seg_.hpp:511
const TDisc & GetDisc(void) const
Get the variant data.
Definition: Seq_align_.cpp:197
const Tdata & Get(void) const
Get the member data.
const TSegs & GetSegs(void) const
Get the Segs member data.
Definition: Seq_align_.hpp:921
@ eType_disc
discontinuous alignment
Definition: Seq_align_.hpp:104
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
Modified on Wed Apr 17 13:10:39 2024 by modify_doxy.py rev. 669887