NCBI C++ ToolKit
find_overlap_tool.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: find_overlap_tool.cpp 47080 2022-07-22 18:11:54Z asztalos $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Roman Katargin
27  *
28  */
29 
30 #include <ncbi_pch.hpp>
31 
33 
36 
39 
40 #include <serial/typeinfo.hpp>
41 
43 
44 #include <gui/objutils/label.hpp>
45 
48 
50 
60 
63 
66 
67 //static void s_PrepareAligner( CNgAligner& aligner, CFindOverlapParams& params );
68 
69 ///////////////////////////////////////////////////////////////////////////////
70 /// CFindOverlapTool
72 : CAlgoToolManagerBase("Find Overlap between DNA Sequences",
73  "",
74  "Find overlap between closely related DNA Sequences",
75  "Generate an overlap alignment between "
76  "closely related DNA sequences, particularly for "
77  "exploring clone overlap relationships",
78  "https://www.ncbi.nlm.nih.gov/tools/gbench/tutorial5/",
79  "Alignment Creation"),
80  m_Panel()
81 {
82 }
83 
85 {
86  return "find_overlap_tool";
87 }
88 
89 
91 {
92  return "Find Overlap Tool";
93 }
94 
96 {
98 
99  m_Panel = NULL;
100 }
101 
103 {
104  m_Panel = NULL;
106 }
107 
109 {
110  if (m_Panel == NULL) {
112 
114  wxDefaultPosition, wxSize(0, 0), SYMBOL_CFINDOVERLAPPANEL_STYLE, false);
117 
118  m_Panel->SetRegistryPath(m_RegPath + ".ParamsPanel");
120  }
121  return true;
122 }
123 
125 {
126  return true;
127 }
128 
129 /// select only Seq-ids
131 {
133  x_ConvertInputObjects(CSeq_id::GetTypeInfo(), objects);
134 
135  m_SeqIds.clear();
137  CIRef<IGuiSeqInfo> gui_seq_info(CreateObjectInterface<IGuiSeqInfo>(*it, NULL));
138  if( !gui_seq_info.IsNull() && gui_seq_info->IsDNA() ){
139  m_SeqIds.push_back(*it);
140  }
141  }
142 }
143 
145 {
146  return m_Panel;
147 }
148 
150 {
151  return &m_Params;
152 }
153 
154 ///////////////////////////////////////////////////////////////////////////////
155 /// CFindOverlapJob
157 {
158 public:
159  CFindOverlapJob (const CFindOverlapParams& params);
160 
161 protected:
162  virtual void x_CreateProjectItems(); // overriding virtual function
163 
164 private:
166  void x_PrepareAligner( CNgAligner& aligner, CFindOverlapParams& params );
167 
168 protected:
170 };
171 
173 {
174  if (m_Panel) m_Params = m_Panel->GetData();
176  return job;
177 }
178 
179 ///////////////////////////////////////////////////////////////////////////////
180 /// CFindOverlapJob
182 : m_Params(params)
183 {
184  CFastMutexGuard lock(m_Mutex);
185 
186  m_Descr = "Find Overlap Job"; //TODO
187 }
188 
190 {
193 
194  const CSeq_id& id1 = dynamic_cast<const CSeq_id&>(*seq1.object);
195  const CSeq_id& id2 = dynamic_cast<const CSeq_id&>(*seq2.object);
196  CScope& scope1 = *seq1.scope;
197  CScope& scope2 = *seq2.scope;
198 
199  CRef<CScope> scope(&scope1);
200  if (&scope1 != &scope2) {
201  LOG_POST(Error << "Inconsistent scopes - results may not be viewable");
203  scope.Reset(new CScope(*obj_mgr));
204  scope->AddScope(scope1);
205  scope->AddScope(scope2);
206  }
207 
208  const string blast_params = ToStdString(m_Params.GetBlastParams());
209 
211 
212  // logging
213  {{
214  string name1, name2;
215  CLabel::GetLabel(id1, &name1, CLabel::eDefault, scope);
216  CLabel::GetLabel(id2, &name2, CLabel::eDefault, scope);
217 
218  LOG_POST(Info << "Find Overlap Alignments: \n" << " " << name1 << " x " << name2
219  << "\n "
220  << " blast_params='" << blast_params << "'" << " "
221  << " max-slope=" << m_Params.GetMaxSlop()
222  << " filter-quality=" << m_Params.GetFilterQuality()
223  );
224  }}
225 
226  vector<CRef<CSeq_align> > alns;
227  /*
228  alns = CContigAssembly::Align(id1, id2, blast_params,
229  min_ident, max_slop, *scope, NULL,
230  half_widths
231  );
232  */
233 
235  subject->SetIdList().push_back( CRef<CSeq_id>( (CSeq_id*)&id1 ) );
236 
238  query->SetIdList().push_back( CRef<CSeq_id>( (CSeq_id*)&id2 ) );
239 
240 
241  CNgAligner aligner(*scope);
242 
243  aligner.SetQuery(query);
244  aligner.SetSubject(subject);
245 
246  x_PrepareAligner( aligner, m_Params );
247 
248  CRef<CSeq_align_set> sas = aligner.Align(); //s_CallAligner( m_Params, aligner );
249 
250  if( sas && sas->IsSet() ){
251  alns.insert( alns.end(), sas->Set().begin(), sas->Set().end() );
252  }
253 
254 
255 
256  // make an annotation
257  if (alns.size()) {
258  CRef<CSeq_annot> annot(new CSeq_annot());
259  ITERATE (vector<CRef<CSeq_align> >, iter, alns) {
260  annot->SetData().SetAlign().push_back(*iter);
261  }
262 
263  string name;
264  CLabel::GetLabel(id1, &name, CLabel::eDefault, scope);
265  name += " x ";
266  CLabel::GetLabel(id2, &name, CLabel::eDefault, scope);
267  name += ": Overlap Alignment";
268 
269  annot->SetNameDesc(name);
271 
272  /// now create a Project Item for the data
273  CRef<CProjectItem> item(new CProjectItem());
274  item->SetItem().SetAnnot(*annot);
275 
276  // TODO we need to generate title properly
277  item->SetLabel(name);
278  AddProjectItem(*item);
279 
280  } else {
281  LOG_POST(Info << "No acceptable overlaps are found.");
282 
283  x_SetTextResult( "No overlaps are found according to the settings desired." );
284  }
285 
286 }
287 
289 {
290  CRef<blast::CBlastNucleotideOptionsHandle> opts( new blast::CBlastNucleotideOptionsHandle() );
291 
292  opts->SetTraditionalBlastnDefaults();
293  blast::CBlastOptions& options = opts->SetOptions();
294 
295  //options.SetWordSize (params.GetWordSize());
296  //options.SetEvalueThreshold (params.GetEValue());
297 
298  /*
299  if (params.GetBestHit()) {
300  options.SetBestHitScoreEdge(0.1);
301  options.SetBestHitOverhang(0.1);
302  }
303  */
304 
305  const string blast_params = ToStdString( params.GetBlastParams() );
306 
307  if( !blast_params.empty() ){
308  static const string s_whitespace(" \n\t\r");
309  vector<string> argv;
310  argv.push_back( EProgramToTaskName( blast::eBlastn ) );
311  NStr::Split( blast_params, s_whitespace, argv );
312 
313  // This is a hack against old-fashioned parameters
314  for( unsigned int i = 0; i < argv.size(); i++ ){
315  string& name = argv[i];
316 
317  if (name == "-W") {
318  //options.SetWordSize(NStr::StringToInt(value));
319  name = "-word_size";
320  } else if (name == "-r") {
321  //options.SetMatchReward(NStr::StringToInt(value));
322  name = "-reward";
323  } else if (name == "-q") {
324  //options.SetMismatchPenalty(NStr::StringToInt(value));
325  name = "-penalty";
326  } else if (name == "-e") {
327  //options.SetEvalueThreshold(NStr::StringToDouble(value));
328  name = "-evalue";
329  } else if (name == "-Z") {
330  //options.SetGapXDropoffFinal(NStr::StringToInt(value));
331  name = "-xdrop_gap_final";
332  } else if (name == "-F") {
333  //options.SetFilterString(value.c_str());
334  name = "-dust";
335 
336  string& value = argv[i + 1];
337  value = ( value == "T" ) ? "yes" : "no";
338 
339  } else if (name == "-G") {
340  //options.SetGapOpeningCost(NStr::StringToInt(value));
341  name = "-gapopen";
342  } else if (name == "-E") {
343  //options.SetGapExtensionCost(NStr::StringToInt(value));
344  name = "-gapextend";
345  }
346  }
347 
348 
349  /*
350  for (unsigned int i = 0; i < argv.size(); i += 2) {
351  const string& name = argv[i];
352 
353  if (i + 1 >= argv.size()) {
354  throw runtime_error("no value given for " + name);
355  }
356  const string& value = argv[i + 1];
357  if (name == "-W") {
358  options.SetWordSize(NStr::StringToInt(value));
359  } else if (name == "-r") {
360  options.SetMatchReward(NStr::StringToInt(value));
361  } else if (name == "-q") {
362  options.SetMismatchPenalty(NStr::StringToInt(value));
363  } else if (name == "-e") {
364  options.SetEvalueThreshold(NStr::StringToDouble(value));
365  } else if (name == "-Z") {
366  options.SetGapXDropoffFinal(NStr::StringToInt(value));
367  } else if (name == "-F") {
368  options.SetFilterString(value.c_str());
369  } else if (name == "-G") {
370  options.SetGapOpeningCost(NStr::StringToInt(value));
371  } else if (name == "-E") {
372  options.SetGapExtensionCost(NStr::StringToInt(value));
373  } else {
374  throw runtime_error("invalid option: " + name);
375  }
376  }
377  */
378 
379 
380  try {
381  CRef<blast::CBlastAppArgs> blast_args(new blast::CBlastnAppArgs());
382  unique_ptr<CArgDescriptions> arg_descs( blast_args->SetCommandLine() );
383  unique_ptr<CArgs> args( arg_descs->CreateArgs( argv.size(), argv ) );
384 
385  ///
386  /// Standard processing for BLAST
387  ///
388  blast::CGenericSearchArgs search_args(false /* not protein */,
389  false /* not RPS blast */,
390  true /* show %identity */);
391  search_args.ExtractAlgorithmOptions(*args, options);
392 
393  blast::CFilteringArgs filter_args(false /* not protein */);
394  filter_args.ExtractAlgorithmOptions(*args, options);
395 
396  blast::CNuclArgs nucl_args;
397  nucl_args.ExtractAlgorithmOptions(*args, options);
398 
399  blast::CGappedArgs gapped_args;
400  gapped_args.ExtractAlgorithmOptions(*args, options);
401 
402  /// best hit args here
403  blast::CHspFilteringArgs cull_args;
404  cull_args.ExtractAlgorithmOptions(*args, options);
405 
406  /// Arguments for window size
407  blast::CWindowSizeArg window_args;
408  window_args.ExtractAlgorithmOptions(*args, options);
409 
410  } catch( CException& ex ){
412  eUnknown,
413  "Cannot parse BLAST params, please edit them:\n" + ex.GetMsg()
414  );
415  }
416 
417  }
418 
419  int max_slop = NStr::StringToInt(ToStdString(params.GetMaxSlop()));
420 
421  CBlastAligner* blast_aligner = new CBlastAligner( *opts, 0 );
422  blast_aligner->SetInterruptCallback( x_BlastInterruptCallback, this );
423 
424  aligner.AddAligner( blast_aligner );
425 
426  aligner.AddAligner(new CMergeAligner(1));
427 
428 
429  // filters
430 
431  string perfect_dovetail = " pct_identity_ungap >= 100 AND longest_gap < 50 AND full_dovetail = 1 ";
432  string near_perfect_dovetail = " pct_identity_ungap >= 99.8 AND longest_gap < 50 AND full_dovetail = 1 ";
433  string super_green_contained = " pct_identity_ungap >= 99.8 AND contained > -1 ";
434 
435  string green_dovetail = " pct_identity_ungap >= 99.6 AND longest_gap < 50 AND full_dovetail = 1 ";
436  string green_contained = " pct_identity_ungap >= 99.6 AND contained > -1 ";
437 
438  string yellow_dovetail = " pct_identity_ungap >= 98.0 AND longest_gap < 500 AND full_dovetail = 1 ";
439  string yellow_contained = " pct_identity_ungap >= 98.0 AND contained > -1 ";
440  string yellow_half_dovetail = " pct_identity_ungap >= 98.0 AND longest_gap < 500 AND half_dovetail = 1 AND tail_length <= 50 ";
441 
442  string red_dovetail = " pct_identity_ungap >= 95.0 AND full_dovetail = 1 ";
443  string red_half_dovetail = " pct_identity_ungap >= 95.0 AND half_dovetail = 1 AND tail_length <= 2000 AND tail_length <= align_length_ungap ";
444  string red_contained = " pct_identity_ungap >= 95.0 AND contained > -1 ";
445 
446  string super_red_half_dovetail = " pct_identity_ungap >= 92.0 AND half_dovetail = 1 AND tail_length <= 5000 AND tail_length <= align_length_ungap ";
447  string mega_red_half_dovetail = " pct_identity_ungap >= 92.0 AND half_dovetail = 1 AND tail_length <= 10000 ";
448  string ultra_red_half_dovetail = " pct_identity_ungap >= 92.0 AND half_dovetail = 1 AND tail_length <= MUL(5, align_length_ungap) ";
449  string omega_red_half_dovetail = " pct_identity_ungap >= 92.0 AND half_dovetail = 1 ";
450 
451  int filter_qty = params.GetFilterQuality();
452 
453  if( filter_qty >= 0 ){
454  aligner.AddFilter(new CQueryFilter( 0, perfect_dovetail ) );
455  aligner.AddFilter(new CQueryFilter( 1, near_perfect_dovetail ) );
456  aligner.AddFilter(new CQueryFilter( 2, super_green_contained ) );
457  }
458  if( filter_qty >= 1 ){
459  aligner.AddFilter(new CQueryFilter( 3, green_dovetail ) );
460  aligner.AddFilter(new CQueryFilter( 4, green_contained ) );
461  }
462  if( filter_qty >= 2 ){
463  aligner.AddFilter(new CQueryFilter( 5, yellow_dovetail ) );
464  aligner.AddFilter(new CQueryFilter( 6, yellow_contained ) );
465  aligner.AddFilter(new CQueryFilter( 7, yellow_half_dovetail ) );
466  }
467  if( filter_qty >= 3 ){
468  aligner.AddFilter(new CQueryFilter( 8, red_dovetail ) );
469  aligner.AddFilter(new CQueryFilter( 9, red_half_dovetail ) );
470  aligner.AddFilter(new CQueryFilter( 10, red_contained ) );
471  }
472  if( filter_qty >= 4 ){
473  aligner.AddFilter(new CQueryFilter( 11, super_red_half_dovetail ) );
474  aligner.AddFilter(new CQueryFilter( 12, mega_red_half_dovetail ) );
475  aligner.AddFilter(new CQueryFilter( 13, ultra_red_half_dovetail ) );
476  aligner.AddFilter(new CQueryFilter( 14, omega_red_half_dovetail ) );
477  }
478 
479  aligner.AddScorer( new CBlastScorer() );
480  aligner.AddScorer( new CPctIdentScorer() );
481  aligner.AddScorer( new COverlapScorer( max_slop ) );
482 }
483 
485 {
486  if( prog && prog->user_data ){
487  CFindOverlapJob* job =
488  reinterpret_cast<CFindOverlapJob*>(prog->user_data)
489  ;
490  if( job->IsCanceled() ){
491  return TRUE;
492  }
493  }
494  return FALSE;
495 }
496 
497 
Declares the BLAST exception class.
@ eBlastn
Nucl-Nucl (traditional blastn)
Definition: blast_types.hpp:58
Main argument class for BLASTN application.
CAlgoToolManagerBase This is base class for simple algorithmic tool managers.
virtual void InitUI()
override this function in a derived class and initialize extra members
string m_RegPath
registry path to the settings
wxWindow * m_ParentWindow
a window that will serve as a parent for our panels
virtual void CleanUI()
override this function in a derived class and clean extra members
void x_ConvertInputObjects(const CTypeInfo *typeInfo, map< string, TConstScopedObjects > &results)
CAlgoToolManagerParamsPanel.
virtual void SetRegistryPath(const string &reg_path)
CAlgoToolManagerParamsPanel.
void SetInterruptCallback(TInterruptFnPtr fnptr=NULL, void *user_data=NULL)
CDataLoadingAppJob - a base class for Jobs loading data into projects.
void AddProjectItem(objects::CProjectItem &item)
void x_SetTextResult(const string &result)
CFindOverlapJob.
CFindOverlapJob(const CFindOverlapParams &params)
CFindOverlapJob.
void x_PrepareAligner(CNgAligner &aligner, CFindOverlapParams &params)
static Boolean x_BlastInterruptCallback(SBlastProgress *prog)
CFindOverlapParams m_Params
virtual void x_CreateProjectItems()
override this function in derived classes and populate m_Items.
void SetQuery(ISequenceSet *Set)
Definition: ngalign.cpp:76
void SetSubject(ISequenceSet *Set)
Definition: ngalign.cpp:82
TAlignSetRef Align()
Definition: ngalign.cpp:110
void AddFilter(IAlignmentFilter *Filter)
Definition: ngalign.cpp:88
void AddAligner(IAlignmentFactory *Aligner)
Definition: ngalign.cpp:94
void AddScorer(IAlignmentScorer *Scorer)
Definition: ngalign.cpp:100
CScope –.
Definition: scope.hpp:92
void SetNameDesc(const string &name)
Definition: Seq_annot.cpp:66
void SetCreateDate(const CTime &dt)
Definition: Seq_annot.cpp:121
CTime –.
Definition: ncbitime.hpp:296
IRegSettings An interface for objects that save / restore settings using CGuiRegistry.
#define SYMBOL_CFINDOVERLAPPANEL_STYLE
USING_SCOPE(ncbi::objects)
string EProgramToTaskName(EProgram p)
Convert a EProgram enumeration value to a task name (as those used in the BLAST command line binaries...
Definition: blast_aux.cpp:676
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define NULL
Definition: ncbistd.hpp:225
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
const string & GetMsg(void) const
Get message string.
Definition: ncbiexpt.cpp:461
void Info(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1185
virtual CAlgoToolManagerParamsPanel * x_GetParamsPanel()
returns a pointer to the parameters panel, override in derived classes
void SetObjects(TConstScopedObjects *objects)
const SConstScopedObject & GetSeq1() const
virtual string GetExtensionLabel() const
returns a displayable label for this extension ( please capitalize the key words - "My Extension" )
virtual bool x_CreateParamsPanelIfNeeded()
returns / creates Parameters panel, override in derived classes see cpp file for example
wxString GetMaxSlop() const
virtual IRegSettings * x_GetParamsAsRegSetting()
return a pointer to Parameters object as IRegSettings interface
TConstScopedObjects m_SeqIds
CFindOverlapParams & GetData()
virtual void CleanUI()
override this function in a derived class and clean extra members
virtual void InitUI()
override this function in a derived class and initialize extra members
virtual void x_SelectCompatibleInputObjects()
select only Seq-ids
virtual void LoadSettings()
CFindOverlapParams m_Params
virtual CDataLoadingAppJob * x_CreateLoadingJob()
factory method for creating the job that executes the tool algorithm override in derived classes
virtual string GetExtensionIdentifier() const
returns the unique human-readable identifier for the extension the id should use lowercase letters se...
CFindOverlapTool()
CFindOverlapTool.
void SetData(const CFindOverlapParams &data)
virtual bool x_ValidateParams()
validates user input in Parameters panel, report errors if any
const SConstScopedObject & GetSeq2() const
wxString GetBlastParams() const
CFindOverlapPanel * m_Panel
static void GetLabel(const CObject &obj, string *label, ELabelType type=eDefault)
Definition: label.cpp:140
CRef< objects::CScope > scope
Definition: objects.hpp:53
string m_Descr
mutex to sync our internals
virtual bool IsCanceled() const override
CFastMutex m_Mutex
CConstRef< CObject > object
Definition: objects.hpp:52
vector< SConstScopedObject > TConstScopedObjects
Definition: objects.hpp:65
@ eDefault
Definition: label.hpp:73
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
void AddScope(CScope &scope, TPriority pri=kPriority_Default)
Add the scope's datasources as a single group with the given priority All data sources (data loaders ...
Definition: scope.cpp:516
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool IsNull(void) const THROWS_NONE
Check if pointer is null – same effect as Empty().
Definition: ncbiobj.hpp:735
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3452
@ eCurrent
Use current time. See also CCurrentTime.
Definition: ncbitime.hpp:300
void SetLabel(const TLabel &value)
Assign a value to Label data member.
void SetItem(TItem &value)
Assign a value to Item data member.
Tdata & Set(void)
Assign a value to data member.
bool IsSet(void) const
Check if a value has been assigned to data member.
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_annot_.cpp:244
int i
static char * prog
Definition: mdb_load.c:33
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
Uint1 Boolean
bool replacement for C
Definition: ncbi_std.h:94
#define TRUE
bool replacement for C indicating true.
Definition: ncbi_std.h:97
#define FALSE
bool replacement for C indicating false.
Definition: ncbi_std.h:101
The Object manager core.
static static static wxID_ANY
Progress monitoring structure.
Definition: blast_def.h:341
static string subject
static string query
string ToStdString(const wxString &s)
Definition: wx_utils.hpp:161
Modified on Fri Sep 20 14:57:15 2024 by modify_doxy.py rev. 669887