NCBI C++ ToolKit
ngalign_job.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: ngalign_job.cpp 47479 2023-05-02 13:24:02Z ucko $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Roman Katargin
27 */
28 
29 
30 #include <ncbi_pch.hpp>
31 
32 #include <corelib/ncbifile.hpp>
33 #include <corelib/ncbiexec.hpp>
35 #include <objmgr/seq_vector.hpp>
36 #include <objmgr/util/sequence.hpp>
38 
39 #include <serial/iterator.hpp>
40 
43 
45 
46 #include <gui/objutils/label.hpp>
48 
51 
61 
64 
65 
68 
69 ///////////////////////////////////////////////////////////////////////////////
70 /// CNGAlignJob
71 CNGAlignJob::CNGAlignJob(const CNGAlignParams& params) : m_Params(params)
72 {
73  m_Descr = "NGAlign Job";
74 }
75 
76 ///////////////////////////////////////////////////////////////////////////////
77 
79  /// add BLAST-style 'num_ident' score
81 
82  /// add a 'mismatch' score with a count of mismatches
84 
85  /// add a 'gap_count' score
86  fScore_GapCount = 0x004,
87 
88  /// add scores for ungapped and gapped percent identity
90 
91  /// add a score for percent coverage of query (sequence 0)
93 
94  /// default flags: everything
95  fScore_Default = 0xffffffff
96 };
/// Integer type used to carry a bitwise combination of the fScore_* flags.
using TAlignScoreFlags = int;
98 
100  CSeq_align& align,
102 {
103  CScoreBuilder sb;
104 
105  if (flags & (fScore_Identities |
109  /// this automatically adds num_ident and num_mismatch
110  sb.AddScore(scope, align,
112  sb.AddScore(scope, align,
114  } else if (flags & fScore_Identities) {
115  sb.AddScore(scope, align, CSeq_align::eScore_IdentityCount);
116  } else if (flags & fScore_Mismatches) {
117  sb.AddScore(scope, align, CSeq_align::eScore_MismatchCount);
118  }
119  }
120 
121  if (flags & fScore_GapCount) {
122  /// FIXME: add eScore_GapCount to CSeq_align, CScoreBuilder
123  //sb.AddScore(scope, align, CScoreBuilder::eScore_GapCount);
124  int gap_count = sb.GetGapCount(align);
125  align.SetNamedScore("gap_count", gap_count);
126  }
127 
130  }
131 
132 }
133 
134 //////////////////////////////////////////////////////////////////////////////
135 
137 public:
139 
141  {
143  result_iter, results->Get()) {
145  assm_iter, result_iter->second->Get()) {
147  query_iter, assm_iter->second) {
149  query_iter->second->Set()) {
150  CSeq_align& align = **it;
151  AddStandardAlignmentScores(scope, align);
152 
153  /// additionally, add the gaponly version, used in gbDNA
154  CScoreBuilder sb;
155  sb.AddScore(scope, align,
157  }
158  }
159  }
160  }
161  }
162 };
163 
165 {
166  if( prog && prog->user_data ){
167  CJobCancelable* job =
168  reinterpret_cast<CJobCancelable*>(prog->user_data)
169  ;
170  if( job->IsCanceled() ){
171  return TRUE;
172  }
173  }
174  return FALSE;
175 }
176 
178  CNGAlignParams& params,
179  CNgAligner& aligner,
180  CJobCancelable* job,
181  CUnorderedSplitter* aSplitter = NULL,
182  bool is_splittable_sequences = false
183 ){
185  (new blast::CBlastNucleotideOptionsHandle);
186 
187  opts->SetTraditionalBlastnDefaults();
188  blast::CBlastOptions& options = opts->SetOptions();
189 
190  options.SetWordSize (static_cast<int>(params.GetWordSize()));
191  options.SetEvalueThreshold (params.GetEValue());
192 
193  unique_ptr<CAutoEnvironmentVariable> p_wm_path;
194 
195  if (params.GetWMTaxId() != 0) {
196  if (!params.GetWMDir().empty()) {
197  options.SetWindowMaskerTaxId(params.GetWMTaxId());
198  //p_wm_path.reset(new CAutoEnvironmentVariable("WINDOW_MASKER_PATH", params.GetWMDir().c_str()));
200  } else {
201  ERR_POST(Error << "BLAST window masker tax-id filtering ignored. WindowMasker DB directory not set.");
202  }
203  }
204 
205  if (params.GetBestHit()) {
206  options.SetBestHitScoreEdge(0.1);
207  options.SetBestHitOverhang(0.1);
208  }
209 
210  string advParams = ToStdString(params.GetAdvParams());
211  if (!advParams.empty()) {
212  static const string s_whitespace(" \n\t\r");
213  vector<string> argv;
214  argv.push_back( EProgramToTaskName( blast::eBlastn ) );
215  NStr::Split( advParams, s_whitespace, argv );
216 
217  CRef<blast::CBlastAppArgs> blast_args(new blast::CBlastnAppArgs());
218  unique_ptr<CArgDescriptions> arg_descs( blast_args->SetCommandLine() );
219  unique_ptr<CArgs> args( arg_descs->CreateArgs( argv.size(), argv ) );
220 
221  ///
222  /// Standard processing for BLAST
223  ///
224  blast::CGenericSearchArgs search_args(false /* not protein */,
225  false /* not RPS blast */,
226  true /* show %identity */);
227  search_args.ExtractAlgorithmOptions(*args, options);
228 
229  blast::CFilteringArgs filter_args(false /* not protein */);
230  filter_args.ExtractAlgorithmOptions(*args, options);
231 
232  blast::CNuclArgs nucl_args;
233  nucl_args.ExtractAlgorithmOptions(*args, options);
234 
235  blast::CGappedArgs gapped_args;
236  gapped_args.ExtractAlgorithmOptions(*args, options);
237 
238  /// best hit args here
239  blast::CHspFilteringArgs cull_args;
240  cull_args.ExtractAlgorithmOptions(*args, options);
241 
242  /// Arguments for window size
243  blast::CWindowSizeArg window_args;
244  window_args.ExtractAlgorithmOptions(*args, options);
245  }
246 
247  unique_ptr<CBlastAligner> al(new CBlastAligner(*opts, 0));
248  al->SetInterruptCallback( s_BlastInterruptCallback, job );
249  aligner.AddAligner(al.release());
250 
251  aligner.AddAligner(new CMergeAligner(1));
252  /*
253  aligner.AddAligner(new CInstancedAligner(120,
254  10,
255  5,
256  1));
257  */
258  if( aSplitter && is_splittable_sequences ){
259  /// special split merger for phase 1 alignments
260  aligner.AddAligner( new CSplitSeqAlignMerger(aSplitter) );
261  } else {
262  /// non-phase1 sequences: use the inversion merge aligner
263  aligner.AddAligner( new CInversionMergeAligner(1) );
264  }
265 
266  int filterMode = params.GetFilterMode();
267 
268  if (filterMode == 0 || filterMode == 1) {
269  aligner.AddFilter(new CQueryFilter(0,
270  "pct_identity_gapopen_only >= 99.5 AND pct_coverage >= 99"));
271  aligner.AddFilter(new CQueryFilter(1,
272  "pct_identity_gapopen_only >= 95 AND pct_coverage >= 95"));
273  aligner.AddFilter(new CQueryFilter(2,
274  "pct_identity_gapopen_only >= 95 AND pct_coverage >= 50"));
275  aligner.AddFilter(new CQueryFilter(3,
276  "pct_identity_gapopen_only >= 80 AND pct_coverage >= 25"));
277  }
278  else {
279  aligner.AddFilter(new CQueryFilter(0,
280  "pct_identity_gapopen_only >= 99.5 AND pct_coverage >= 99 AND align_length_ratio <= 2"));
281  aligner.AddFilter(new CQueryFilter(1,
282  "pct_identity_gapopen_only >= 95 AND pct_coverage >= 95"));
283  aligner.AddFilter(new CQueryFilter(2,
284  "pct_identity_gapopen_only >= 95 AND pct_coverage >= 50"));
285  aligner.AddFilter(new CQueryFilter(3,
286  "pct_identity_gapopen_only >= 80 AND pct_coverage >= 25"));
287  }
288 
289  ///
290  /// add scoring
291  /// These add scores
293  aligner.AddScorer( new CGPipeAlignmentScorer() );
294  aligner.AddScorer( new CCommonComponentScorer() );
295 
296  ///
297  /// Run!
298  return aligner.Align();
299 }
300 
302 {
303  ///
304  /// Create the NG aligner
305  ///
306 
307  m_Params.LogDump();
308 
310 
311  /// subject sequence
312  list< CRef<CSeq_loc> > subject_locs;
313  const CSeq_id* seq_id = dynamic_cast<const CSeq_id*>(m_Params.GetSubject().object.GetPointer());
314  if( seq_id != NULL ){
316  loc->SetId( *seq_id );
317 
318  subject_locs.push_back( loc );
319  } else {
320  const CSeq_loc* seq_loc = dynamic_cast<const CSeq_loc*>(m_Params.GetSubject().object.GetPointer());
321  if( seq_loc != NULL ){
322 
323  subject_locs.push_back( CRef<CSeq_loc>(const_cast<CSeq_loc*>(seq_loc)) );
324  } else {
325  LOG_POST(Error << "CNGAlignJob::x_CreateProjectItems(): invalid (non Seq-id/Seq-loc) subject sequence!");
326  return;
327  }
328  }
329 
331  subject->SetLocList().insert( subject->SetLocList().end(), subject_locs.begin(), subject_locs.end() );
332 
333  /// query sequences
334  list< CRef<CSeq_loc> > query_locs;
335 
337  const CSeq_id* seq_id = dynamic_cast<const CSeq_id*>( iter->object.GetPointer() );
338  if( seq_id != NULL ){
340  loc->SetId( *seq_id );
341 
342  query_locs.push_back( loc );
343  } else {
344  const CSeq_loc* seq_loc = dynamic_cast<const CSeq_loc*>( iter->object.GetPointer() );
345  if( seq_loc != NULL ){
346 
347  query_locs.push_back( CRef<CSeq_loc>(const_cast<CSeq_loc*>(seq_loc)) );
348  } else {
349  LOG_POST(Error << "CNGAlignJob::x_CreateProjectItems(): invalid (non Seq-id/Seq-loc) query sequence!");
350  return;
351  }
352  }
353  }
354 
355 
356  TAlignSetRef alignments;
357  unique_ptr<CUnorderedSplitter> splitter;
358 
359  if (m_Params.GetFilterMode() == 0) {
360  ///
361  /// this pass takes care of splittable (i.e., delta-seq) phase 1
362  /// sequences
363  /// not all phase 1 sequences pass through here!
364  ///
365 
366  splitter.reset( new CUnorderedSplitter( *scope ) );
367  CRef<CSplitSeqLocListSet> queries( new CSplitSeqLocListSet( splitter.get() ) );
368 
369  ITERATE(list< CRef<CSeq_loc> >, it, query_locs ){
370  queries->AddSeqLoc(*it);
371  }
372 
373  /*---
374  CRef<CSeqLocListSet> queries( new CSeqLocListSet() );
375  queries->SetLocList().insert( queries->SetLocList().end(), query_locs.begin(), query_locs.end() );
376  */
377 
378  CNgAligner aligner(*scope);
379  aligner.SetQuery(queries);
380  aligner.SetSubject(subject);
381 
382  CRef<CSeq_align_set> sas = s_CallAligner(m_Params, aligner, this, splitter.get(), true);
383  //---CRef<CSeq_align_set> sas = s_CallAligner( m_Params, aligner );
384 
385  if( sas && sas->IsSet() ){
386  if( !alignments ){
387  alignments = sas;
388  } else {
389  alignments->Set().insert(
390  alignments->Set().end(),
391  sas->Set().begin(),
392  sas->Set().end()
393  );
394  }
395  }
396 
397  } else {
398  ///
399  /// all non-splittable sequences are handled differently with different
400  /// aligners
401  ///
402  CRef<CSeqLocListSet> queries( new CSeqLocListSet() );
403  queries->SetLocList().insert( queries->SetLocList().end(), query_locs.begin(), query_locs.end() );
404 
405  CNgAligner aligner(*scope);
406  aligner.SetQuery(queries);
407  aligner.SetSubject(subject);
408 
409  CRef<CSeq_align_set> sas = s_CallAligner( m_Params, aligner, this );
410 
411  if( sas && sas->IsSet() ){
412  if( !alignments ){
413  alignments = sas;
414  } else {
415  alignments->Set().insert(
416  alignments->Set().end(),
417  sas->Set().begin(),
418  sas->Set().end()
419  );
420  }
421  }
422  }
423 
424  if (alignments && ! alignments->Get().empty()) {
425  // pack the alignment in a Seq-annot and label it appropriately
426  CRef<CSeq_annot> annot(new CSeq_annot());
427 
428  typedef CSeq_annot::TData::TAlign TAlign;
429  TAlign& seqannot_align = annot->SetData().SetAlign();
430 
431  string title;
432 
433  ITERATE( CSeq_align_set::Tdata, align, alignments->Get() ){
434  if (title.empty()) {
435  CLabel::GetLabel(**align, &title, CLabel::eDefault, scope);
436  }
437  seqannot_align.push_back(*align);
438  }
439 
440  title += ": Genomic Alignment";
441 
442  annot->SetCreateDate(CurrentTime());
443  annot->SetTitleDesc(title);
444  annot->SetNameDesc(title);
445 
446  /// now create a Project Item for the data
447  CRef<CProjectItem> item(new CProjectItem());
448  item->SetItem().SetAnnot(*annot);
449 
450  item->SetLabel(title);
451  AddProjectItem(*item);
452  }
453 }
454 
456 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
@ eBlastn
Nucl-Nucl (traditional blastn)
Definition: blast_types.hpp:58
Main argument class for BLASTN application.
void AddProjectItem(objects::CProjectItem &item)
void ScoreAlignments(TAlignResultsRef results, CScope &scope)
Base class to build jobs with cancel functionality.
void SetQuery(ISequenceSet *Set)
Definition: ngalign.cpp:76
void SetSubject(ISequenceSet *Set)
Definition: ngalign.cpp:82
TAlignSetRef Align()
Definition: ngalign.cpp:110
void AddFilter(IAlignmentFilter *Filter)
Definition: ngalign.cpp:88
void AddAligner(IAlignmentFactory *Aligner)
Definition: ngalign.cpp:94
void AddScorer(IAlignmentScorer *Scorer)
Definition: ngalign.cpp:100
CRef –.
Definition: ncbiobj.hpp:618
CScope –.
Definition: scope.hpp:92
int GetGapCount(const CSeq_align &align)
Compute the number of gaps in the alignment.
void AddScore(CScope &scope, CSeq_align &align, EScoreType score)
deprecated: use CSeq_align::EScoreType directly
@ eScore_PercentIdentity_GapOpeningOnly
Definition: Seq_align.hpp:165
@ eScore_PercentIdentity_Gapped
Definition: Seq_align.hpp:163
@ eScore_PercentIdentity_Ungapped
Definition: Seq_align.hpp:164
@ eScore_PercentCoverage
Definition: Seq_align.hpp:168
@ eScore_IdentityCount
Definition: Seq_align.hpp:145
@ eScore_MismatchCount
Definition: Seq_align.hpp:154
void SetNamedScore(const string &id, int score)
Definition: Seq_align.cpp:636
void SetNameDesc(const string &name)
Definition: Seq_annot.cpp:66
void SetCreateDate(const CTime &dt)
Definition: Seq_annot.cpp:121
void SetTitleDesc(const string &title)
Definition: Seq_annot.cpp:96
static uch flags
Operators to edit gaps in sequences.
string EProgramToTaskName(EProgram p)
Convert a EProgram enumeration value to a task name (as those used in the BLAST command line binaries...
Definition: blast_aux.cpp:676
int WindowMaskerPathInit(const string &window_masker_path)
Initialize the path to the windowmasker data files.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define NULL
Definition: ncbistd.hpp:225
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
int GetWMTaxId() const
virtual void x_CreateProjectItems()
override this function in derived classes and populate m_Items.
double GetEValue() const
CNGAlignJob(const CNGAlignParams &params)
CNGAlignJob.
Definition: ngalign_job.cpp:71
TConstScopedObjects & SetQueries()
bool GetBestHit() const
string GetWMDir() const
long GetWordSize() const
wxString GetAdvParams() const
int GetFilterMode() const
const SConstScopedObject & GetSubject() const
CNGAlignParams m_Params
Definition: ngalign_job.hpp:59
static void GetLabel(const CObject &obj, string *label, ELabelType type=eDefault)
Definition: label.cpp:140
CRef< objects::CScope > scope
Definition: objects.hpp:53
string m_Descr
mutex to sync our internals
virtual bool IsCanceled() const override
CConstRef< CObject > object
Definition: objects.hpp:52
vector< SConstScopedObject > TConstScopedObjects
Definition: objects.hpp:65
@ eDefault
Definition: label.hpp:73
void SetId(CSeq_id &id)
set the 'id' field in all parts of this location
Definition: Seq_loc.cpp:3474
TObjectType * GetPointer(void) const THROWS_NONE
Get pointer,.
Definition: ncbiobj.hpp:1684
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3461
CTime CurrentTime(CTime::ETimeZone tz=CTime::eLocal, CTime::ETimeZonePrecision tzp=CTime::eTZPrecisionDefault)
Definition: ncbitime.hpp:2185
void SetLabel(const TLabel &value)
Assign a value to Label data member.
void SetItem(TItem &value)
Assign a value to Item data member.
Tdata & Set(void)
Assign a value to data member.
bool IsSet(void) const
Check if a value has been assigned to data member.
list< CRef< CSeq_align > > Tdata
@ e_Whole
whole sequence
Definition: Seq_loc_.hpp:100
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_annot_.cpp:244
list< CRef< CSeq_align > > TAlign
Definition: Seq_annot_.hpp:194
static char * prog
Definition: mdb_load.c:33
Uint1 Boolean
bool replacement for C
Definition: ncbi_std.h:94
#define TRUE
bool replacement for C indicating true.
Definition: ncbi_std.h:97
#define FALSE
bool replacement for C indicating false.
Definition: ncbi_std.h:101
Defines a portable execute class.
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
USING_SCOPE(objects)
int TAlignScoreFlags
Definition: ngalign_job.cpp:97
static Boolean s_BlastInterruptCallback(SBlastProgress *prog)
EAlignScoreTypes
Definition: ngalign_job.cpp:78
@ fScore_GapCount
add a 'gap_count' score
Definition: ngalign_job.cpp:86
@ fScore_Identities
add BLAST-style 'num_ident' score
Definition: ngalign_job.cpp:80
@ fScore_Default
default flags: everything
Definition: ngalign_job.cpp:95
@ fScore_Mismatches
add a 'mismatch' score with a count of mismatches
Definition: ngalign_job.cpp:83
@ fScore_PercentCoverage
add a score for percent coverage of query (sequence 0)
Definition: ngalign_job.cpp:92
@ fScore_PercentIdentity
add scores for ungapped and gapped percent identity
Definition: ngalign_job.cpp:89
static CRef< CSeq_align_set > s_CallAligner(CNGAlignParams &params, CNgAligner &aligner, CJobCancelable *job, CUnorderedSplitter *aSplitter=NULL, bool is_splittable_sequences=false)
void AddStandardAlignmentScores(CScope &scope, CSeq_align &align, TAlignScoreFlags flags=fScore_Default)
Definition: ngalign_job.cpp:99
Progress monitoring structure.
Definition: blast_def.h:341
static string subject
Interface to retrieve list of available windowmasker filtering.
string ToStdString(const wxString &s)
Definition: wx_utils.hpp:161
Modified on Thu Apr 25 08:18:02 2024 by modify_doxy.py rev. 669887