NCBI C++ ToolKit
banded_aligner.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: banded_aligner.cpp 71429 2016-03-03 20:39:42Z whlavina $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Nathan Bouk
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbiexpt.hpp>
34 #include <corelib/ncbi_system.hpp>
35 #include <corelib/ncbi_signal.hpp>
36 #include <math.h>
38 //#include "align_instance.hpp"
39 
42 #include <objmgr/scope.hpp>
52 #include <objmgr/seq_vector.hpp>
53 
56 
60 
64 
65 
68 USING_SCOPE(blast);
69 
70 
71 
72 
73 CRef<CSeq_loc> s_CoverageSeqLoc(TAlignSetRef Alignments, int Row, CScope& Scope)
74 {
75  CRef<CSeq_loc> Accum(new CSeq_loc);
76  ITERATE(CSeq_align_set::Tdata, Iter, Alignments->Get()) {
77 
78  const CSeq_align& Curr = **Iter;
79  const CDense_seg& Denseg = Curr.GetSegs().GetDenseg();
80 
81  size_t Dims = Denseg.GetDim();
82  size_t SegCount = Denseg.GetNumseg();
83  for(size_t CurrSeg = 0; CurrSeg < SegCount; ++CurrSeg) {
84  int Index = (Dims*CurrSeg)+Row;
85  int CurrStart = Denseg.GetStarts()[Index];
86  if( CurrStart != -1) {
87  CRef<CSeq_loc> CurrLoc(new CSeq_loc);
88  CurrLoc->SetInt().SetId().Assign( *Denseg.GetIds()[Row] );
89  CurrLoc->SetInt().SetFrom() = CurrStart;
90  CurrLoc->SetInt().SetTo() = CurrStart + Denseg.GetLens()[CurrSeg];
91  if(Denseg.CanGetStrands())
92  CurrLoc->SetInt().SetStrand() = Denseg.GetStrands()[Index];
93  Accum->SetMix().Set().push_back(CurrLoc);
94  }
95  }
96  }
97 
98  CRef<CSeq_loc> Merged;
99  Merged = sequence::Seq_loc_Merge(*Accum, CSeq_loc::fSortAndMerge_All, &Scope);
100 
101  return Merged;
102 }
103 
104 
105 TSeqPos s_CalcCoverageCount(TAlignSetRef Alignments, int Row, CScope& Scope)
106 {
107 
108  CRef<CSeq_loc> Merged;
109 
110  Merged = s_CoverageSeqLoc(Alignments, Row, Scope);
111  //cout << MSerial_AsnText << *Merged;
112 
113  Merged->ChangeToMix();
114  TSeqPos PosCoveredBases = 0, NegCoveredBases = 0;
115  ITERATE(CSeq_loc_mix::Tdata, LocIter, Merged->GetMix().Get()) {
116 
117 // ITERATE(CPacked_seqint_Base::Tdata, IntIter, Merged->GetPacked_int().Get())
118  if( (*LocIter)->Which() == CSeq_loc::e_Int) {
119  if( (*LocIter)->GetInt().GetStrand() == eNa_strand_plus)
120  PosCoveredBases += (*LocIter)->GetInt().GetLength();
121  else
122  NegCoveredBases += (*LocIter)->GetInt().GetLength();
123  }
124  }
125  //cerr << "Covered: " << PosCoveredBases << " or " << NegCoveredBases
126  // << " len: " << m_Scope->GetBioseqHandle(*Merged->GetId()).GetBioseqLength() << endl;
127 
128  return max(PosCoveredBases, NegCoveredBases);
129 }
130 
131 
132 ///////
133 
134 
135 
137  ISequenceSet* QuerySet,
138  ISequenceSet* SubjectSet,
139  TAlignResultsRef AccumResults)
140 {
141  TAlignResultsRef NewResults(new CAlignResultsSet);
142 
144  AccumResults->Get()) {
145  int BestRank = QueryIter->second->GetBestRank();
146  if(BestRank > m_Threshold || BestRank == -1) {
147 
148  _TRACE("Determined ID: "
149  << QueryIter->second->GetQueryId()->AsFastaString()
150  << " needs Instanced MM Aligner.");
151  if(!x_MinCoverageCheck(*QueryIter->second)) {
152  ERR_POST(Info << "ID: "
153  << QueryIter->second->GetQueryId()->AsFastaString()
154  << " fails the minimum percent coverage cutoff. Skipping.");
155  continue;
156  }
157  x_RunAligner(Scope, *QueryIter->second, NewResults);
158  }
159  }
160 
161  return NewResults;
162 }
163 
164 
165 
166 void CInstancedAligner::x_RunAligner(objects::CScope& Scope,
167  CQuerySet& QueryAligns,
168  TAlignResultsRef Results)
169 {
170  CRef<CSeq_align_set> ResultSet(new CSeq_align_set);
171 
172  vector<CRef<CInstance> > Instances;
173  x_GetCleanupInstances(QueryAligns, Scope, Instances);
174  x_GetDistanceInstances(QueryAligns, Scope, Instances);
175  x_FilterInstances(Instances, m_MaxRatio);
176  if(Instances.empty()) {
177  ERR_POST(Info << " No instances of " << QueryAligns.GetQueryId()->AsFastaString() << " found.");
178  return;
179  }
180  ERR_POST(Info << " Instance Count: " << Instances.size());
181 
182 
183  CRef<CSeq_align> Result(new CSeq_align);
184  ITERATE(vector<CRef<CInstance> >, InstIter, Instances) {
185  if (CSignal::IsSignaled()) {
186  NCBI_THROW(CException, eUnknown, "trapped signal");
187  }
188 
189  const CInstance& Inst = **InstIter;
190 
191  ERR_POST(Info << " Aligning " << Inst.Query.GetId().AsFastaString() << " to "
192  << Inst.Subject.GetId().AsFastaString());
193  ERR_POST(Info << " q:" << Inst.Query.GetFrom() << ":"
194  << Inst.Query.GetTo() << ":"
195  << (Inst.Query.GetStrand() == eNa_strand_plus ? "+" : "-")
196  << " and s: " << Inst.Subject.GetFrom() << ":"
197  << Inst.Subject.GetTo() << ":"
198  << (Inst.Subject.GetStrand() == eNa_strand_plus ? "+" : "-")
199  << " ratio: " << Inst.SubjToQueryRatio());
200 
201  CRef<CDense_seg> GlobalDs;
202  try {
203  GlobalDs = x_RunMMGlobal(
204  Inst.Query.GetId(), Inst.Subject.GetId(),
205  Inst.Query.GetStrand(),
206  Inst.Query.GetFrom(), Inst.Query.GetTo(),
207  Inst.Subject.GetFrom(), Inst.Subject.GetTo(),
208  Scope);
209  } catch(CException& e) {
210  ERR_POST(Error << e.ReportAll());
211  ERR_POST(Error << "CInstancedBandedAligner failed.");
212  throw e;
213  }
214 
216  .Print("instance_query", Inst.Query.GetId().AsFastaString())
217  .Print("instance_subject", Inst.Subject.GetId().AsFastaString())
218  .Print("instance_align", (GlobalDs.IsNull() ? "false" : "true"));
219 
220  if(GlobalDs.IsNull())
221  continue;
222 
223  Result->SetSegs().SetDenseg().Assign(*GlobalDs);
225  Result->SetNamedScore(GetName(), 1);
226  ResultSet->Set().push_back(Result);
227 
228  ERR_POST(Info << " Found alignment ");
229 
230 /*
231  // CContigAssembly::BestLocalSubAlignmentrescores, and finds better-scoring
232  // sub-regions of the Denseg, and keeps only the subregion.
233  // The practical result is, if the alignment has a large gap near the edges
234  // that gap might be trimmed. If there is more matching region past the
235  // too-big gap, that matching region is lost. Sometimes we don't want that
236  // especially in the Instance case, because we trust the region so much
237  // So, (see below) if this function changes the Denseg, we return both, the
238  // trimmed and the untrimmed.
239  CRef<CDense_seg> LocalDs;
240  LocalDs = CContigAssembly::BestLocalSubAlignment(*GlobalDs, Scope);
241 
242  Result->SetSegs().SetDenseg().Assign(*LocalDs);
243  Result->SetType() = CSeq_align::eType_partial;
244  ResultSet->Set().push_back(Result);
245 
246  if(!LocalDs->Equals(*GlobalDs)) {
247  CRef<CSeq_align> Result(new CSeq_align);
248  Result->SetSegs().SetDenseg().Assign(*GlobalDs);
249  Result->SetType() = CSeq_align::eType_partial;
250  ResultSet->Set().push_back(Result);
251  }
252 */
253  }
254 
255  if(!ResultSet->Get().empty()) {
256  Results->Insert(CRef<CQuerySet>(new CQuerySet(*ResultSet)));
257  }
258 }
259 
260 
262 {
266  bool TimedOut;
267 };
268 
269 
271 {
272  SCallbackData* CallbackData = (SCallbackData*)ProgressInfo->m_data;
273  CTime CurrTime(CTime::eCurrent);
274  CTimeSpan Span = (CurrTime-CallbackData->StartTime);
275 
276  if(CallbackData->TimedOut) {
277  return true;
278  }
279 
280  if(CallbackData->PreviousCount == ProgressInfo->m_iter_done ||
281  Span.GetAsDouble() < 1.0) {
282  CallbackData->PreviousCount = ProgressInfo->m_iter_done;
283  return false;
284  }
285 
286  CallbackData->PreviousCount = ProgressInfo->m_iter_done;
287 
288 // ERR_POST(Info << "Counts: " << ProgressInfo->m_iter_done << " of " << ProgressInfo->m_iter_total);
289 
290  double PercentComplete = (double(ProgressInfo->m_iter_done)/ProgressInfo->m_iter_total);
291  double PercentRemaining = 1.0-PercentComplete;
292 
293  double Factor = PercentRemaining/PercentComplete;
294 
295 
296  if( Span.GetCompleteSeconds() > CallbackData->TimeOutSeconds) {
297  ERR_POST(Error << " Instanced Aligner took over 5 minutes. Timed out.");
298  CallbackData->TimedOut = true;
299  return true;
300  }
301 
302  double TimeEstimated = Span.GetAsDouble() * Factor;
303 
304  if(TimeEstimated > CallbackData->TimeOutSeconds) {
305  ERR_POST(Error << " Instanced Aligner expected to take " << TimeEstimated
306  << " seconds. More than " << (CallbackData->TimeOutSeconds/60.0)
307  << " minutes. Terminating Early.");
308  CallbackData->TimedOut = true;
309  return true;
310  }
311 
312 
313  return false;
314 }
315 
316 
317 
319  const CSeq_id& SubjectId,
320  ENa_strand Strand,
321  TSeqPos QueryStart,
322  TSeqPos QueryStop,
323  TSeqPos SubjectStart,
324  TSeqPos SubjectStop,
325  CScope& Scope)
326 {
327 
328  // ASSUMPTION: Query is short, Subject is long.
329  // We should be able to align all of Query against a subsection of Subject
330  // Banded Memory useage is Longer Sequence * Band Width * unit size (1)
331  // The common case is that Subject will be a very sizeable Scaffold.
332  // a bandwidth of anything more than 10-500 can pop RAM on a 16G machine.
333  // So we're gonna snip Subject to a subregion
334  // CMMAligner uses linear RAM, but we still use the restricted region because
335  // we know where the regions are.
336 
338  VecStrand = (Strand == eNa_strand_plus ? CBioseq_Handle::eStrand_Plus
340 
341  CBioseq_Handle QueryHandle, SubjectHandle;
342  QueryHandle = Scope.GetBioseqHandle(QueryId);
343  SubjectHandle = Scope.GetBioseqHandle(SubjectId);
344 
345  CSeqVector QueryVec, SubjectVec;
346  QueryVec = QueryHandle.GetSeqVector(CBioseq_Handle::eCoding_Iupac, VecStrand);
347  SubjectVec = SubjectHandle.GetSeqVector(CBioseq_Handle::eCoding_Iupac);
348 
349 
350  // CSeqVector, when making a minus vector, not only compliments the bases,
351  // flips the order to. But the start and stops provided are in Plus space.
352  // So we need to swap them to Minus space
353  TSeqPos QueryStrandedStart, QueryStrandedStop;
354  if(Strand == eNa_strand_plus) {
355  QueryStrandedStart = QueryStart;
356  QueryStrandedStop = QueryStop;
357  } else {
358  QueryStrandedStart = ( (QueryVec.size()-1) - QueryStop);
359  QueryStrandedStop = ( (QueryVec.size()-1) - QueryStart);
360  }
361 
362  string QuerySeq, SubjectSeq;
363  QueryVec.GetSeqData(QueryStrandedStart, QueryStrandedStop+1, QuerySeq);
364  SubjectVec.GetSeqData(SubjectStart, SubjectStop+1, SubjectSeq);
365 
366 /* ERR_POST(Info << " Query " << QueryStart << " to " << QueryStop << " against");
367  ERR_POST(Info << " QStranded " << QueryStrandedStart << " to " << QueryStrandedStop << " against");
368  ERR_POST(Info << " Subject " << SubjectStart << " to " << SubjectStop);
369 */
370 
371  SCallbackData CallbackData;
372  CallbackData.StartTime = CTime(CTime::eCurrent);
373  CallbackData.TimeOutSeconds = m_TimeOutSeconds;
374  CallbackData.PreviousCount = 0;
375  CallbackData.TimedOut = false;
376 
377  TSeqPos ExtractQueryStart = ((Strand == eNa_strand_plus) ? 0 : QuerySeq.size()-1);
378  CRef<CDense_seg> ResultDenseg;
379 
380  CMMAligner Aligner(QuerySeq, SubjectSeq);
381  Aligner.SetWm(m_Match);
382  Aligner.SetWms(m_Mismatch);
383  Aligner.SetWg(m_GapOpen);
384  Aligner.SetWs(m_GapExtend);
385  Aligner.SetScoreMatrix(NULL);
386  Aligner.SetProgressCallback(s_ProgressCallback, (void*)&CallbackData);
387  try {
388  int Score;
389  Score = Aligner.Run();
390  if(Score == numeric_limits<int>::min()) {
391  return CRef<CDense_seg>();
392  }
393  ResultDenseg = Aligner.GetDense_seg(ExtractQueryStart, Strand, QueryId,
394  0, eNa_strand_plus, SubjectId);
395  } catch(CException& e) {
396  if(!CallbackData.TimedOut) {
397  ERR_POST(Error << "MMAligner: " << e.ReportAll());
398  }
399  return CRef<CDense_seg>();
400  }
401 
402 
403  if(ResultDenseg->GetNumseg() > 0) {
404  ResultDenseg->OffsetRow(0, QueryStart);
405  ResultDenseg->OffsetRow(1, SubjectStart);
406 
407  // TrimEndGaps Segfaults on the only one segment which is a gap case
408  if(ResultDenseg->GetNumseg() == 1 &&
409  (ResultDenseg->GetStarts()[0] == -1 || ResultDenseg->GetStarts()[1] == -1)) {
410  return CRef<CDense_seg>();
411  } else {
412  ResultDenseg->TrimEndGaps();
413  }
414 
415  } else {
416  return CRef<CDense_seg>();
417  }
418 
419  return ResultDenseg;
420 }
421 
422 
424 CInstancedAligner::x_RunCleanup(const objects::CSeq_align_set& AlignSet,
425  objects::CScope& Scope)
426 {
427  CAlignCleanup Cleaner(Scope);
428  //Cleaner.FillUnaligned(true);
429 
430  list<CConstRef<CSeq_align> > In;
431  ITERATE(CSeq_align_set::Tdata, AlignIter, AlignSet.Get()) {
432  CConstRef<CSeq_align> Align(*AlignIter);
433  In.push_back(Align);
434  }
435 
437 
438  try {
439  Cleaner.Cleanup(In, Out->Set());
440  } catch(CException& e) {
441  ERR_POST(Error << "Cleanup Error: " << e.ReportAll());
442  throw e;
443  }
444 
445  NON_CONST_ITERATE(CSeq_align_set::Tdata, AlignIter, Out->Set()) {
446  CDense_seg& Denseg = (*AlignIter)->SetSegs().SetDenseg();
447 
448  if(!Denseg.CanGetStrands() || Denseg.GetStrands().empty()) {
449  Denseg.SetStrands().resize(Denseg.GetDim()*Denseg.GetNumseg(), eNa_strand_plus);
450  }
451 
452  if(Denseg.GetSeqStrand(1) != eNa_strand_plus) {
453  Denseg.Reverse();
454  }
455  //CRef<CDense_seg> Filled = Denseg.FillUnaligned();
456  //Denseg.Assign(*Filled);
457  }
458 
459  return Out;
460 }
461 
462 
464  vector<CRef<CInstance> >& Instances)
465 {
466 
467  ITERATE(CQuerySet::TAssemblyToSubjectSet, AssemIter, QueryAligns.Get()) {
468  ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, AssemIter->second) {
469  //ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, QueryIter->second->Get()) {
470 
471  CRef<CSeq_align_set> Set = SubjectIter->second;
473 
475  Minuses(new CSeq_align_set);
476 
477  ITERATE(CSeq_align_set::Tdata, AlignIter, Set->Get()) {
478  if( (*AlignIter)->GetSeqStrand(0) == eNa_strand_plus)
479  Pluses->Set().push_back(*AlignIter);
480  else if( (*AlignIter)->GetSeqStrand(0) == eNa_strand_minus)
481  Minuses->Set().push_back(*AlignIter);
482  }
483 
484  if(!Pluses->Set().empty()) {
485  Out = x_RunCleanup(*Pluses, Scope);
486  if(!Out.IsNull()) {
487  ITERATE(CSeq_align_set::Tdata, AlignIter, Out->Set()) {
488  CRef<CInstance> Inst(new CInstance(*AlignIter));
489  Instances.push_back(Inst);
490  }
491  }
492  }
493  if(!Minuses->Set().empty()) {
494  Out = x_RunCleanup(*Minuses, Scope);
495  if(!Out.IsNull()) {
496  ITERATE(CSeq_align_set::Tdata, AlignIter, Out->Set()) {
497  CRef<CInstance> Inst(new CInstance(*AlignIter));
498  Instances.push_back(Inst);
499  }
500  }
501  }
502  }
503  }
504 
505 }
506 
507 
508 
510 {
511  Query.SetId().Assign(Align->GetSeq_id(0));
512  Subject.SetId().Assign(Align->GetSeq_id(1));
513 
514  Query.SetStrand() = Align->GetSeqStrand(0);
515  Subject.SetStrand() = Align->GetSeqStrand(1);
516 
517  Query.SetFrom() = Align->GetSeqStart(0);
518  Subject.SetFrom() = Align->GetSeqStart(1);
519 
520  Query.SetTo() = Align->GetSeqStop(0);
521  Subject.SetTo() = Align->GetSeqStop(1);
522 
523  Alignments.Set().push_back(Align);
524 }
525 
526 
527 CInstance::CInstance(const CSeq_align_set& AlignSet)
528 {
529  Query.SetId().Assign(AlignSet.Get().front()->GetSeq_id(0));
530  Subject.SetId().Assign(AlignSet.Get().front()->GetSeq_id(1));
531 
532  Query.SetStrand() = AlignSet.Get().front()->GetSeqStrand(0);
533  Subject.SetStrand() = AlignSet.Get().front()->GetSeqStrand(1);
534 
535  Query.SetFrom() = numeric_limits<TSeqPos>::max();
537 
540 
541  ITERATE(CSeq_align_set::Tdata, AlignIter, AlignSet.Get()) {
542  Query.SetFrom(min(Query.GetFrom(), (*AlignIter)->GetSeqStart(0)));
543  Subject.SetFrom(min(Subject.GetFrom(), (*AlignIter)->GetSeqStart(1)));
544 
545  Query.SetTo(max(Query.GetTo(), (*AlignIter)->GetSeqStop(0)));
546  Subject.SetTo(max(Subject.GetTo(), (*AlignIter)->GetSeqStop(1)));
547  }
548 }
549 
550 
552 {
553  if(Query.GetStrand() != Align.GetSeqStrand(0) ||
554  Subject.GetStrand() != Align.GetSeqStrand(1))
555  return false;
556 
557  if( Align.GetSeqStart(0) >= Query.GetFrom() &&
558  Align.GetSeqStop(0) <= Query.GetTo() &&
559  Align.GetSeqStart(1) >= Subject.GetFrom() &&
560  Align.GetSeqStop(1) <= Subject.GetTo()) {
561  return true;
562  }
563  else {
564  return false;
565  }
566 }
567 
568 
569 int CInstance::GapDistance(const CSeq_align& Align) const
570 {
571  int LongestGap = 0;
572  LongestGap = max((int)Align.GetSeqStart(0)-(int)Query.GetTo(), LongestGap);
573  LongestGap = max((int)Align.GetSeqStart(1)-(int)Subject.GetTo(), LongestGap);
574  LongestGap = max((int)Query.GetFrom()-(int)Align.GetSeqStop(0), LongestGap);
575  LongestGap = max((int)Subject.GetFrom()-(int)Align.GetSeqStop(1), LongestGap);
576  // cerr << "Longest Gap: " << LongestGap << endl;
577  return LongestGap;
578 }
579 
580 
582 {
583  return (Subject.GetLength() / double(Query.GetLength()));
584 }
585 
586 
588 {
589  return Query.GetLength();
590 }
591 
592 
594 {
595  Query.SetFrom(min(Query.GetFrom(), Align->GetSeqStart(0)));
596  Subject.SetFrom(min(Subject.GetFrom(), Align->GetSeqStart(1)));
597 
598  Query.SetTo(max(Query.GetTo(), Align->GetSeqStop(0)));
599  Subject.SetTo(max(Subject.GetTo(), Align->GetSeqStop(1)));
600 
601  Alignments.Set().push_back(Align);
602 }
603 
605  vector<CRef<CInstance> >& Instances)
606 {
607  typedef pair<CRef<CInstance>, CRef<CSeq_align_set> > TInstPair;
608 
609  // Identify the best pct_coverage for every Subject Id.
610  // This will be used to filter out subjects with relatively low
611  // best pct_coverage, so that they can be skipped.
612  typedef map<string, double> TSubjectCoverage;
613  TSubjectCoverage BestCoverage;
614  double MaxCoverage = 0;
615 
616  ITERATE(CQuerySet::TAssemblyToSubjectSet, AssemIter, QueryAligns.Get()) {
617  ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, AssemIter->second) {
618  //ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, QueryIter->second->Get()) {
619 
620  CRef<CSeq_align_set> Set = SubjectIter->second;
621  string IdStr = Set->Get().front()->GetSeq_id(1).AsFastaString();
622  double SubjCoverage = 0;
623  ITERATE(CSeq_align_set::Tdata, AlignIter, Set->Get()) {
624  double PctCov;
625  (*AlignIter)->GetNamedScore("pct_coverage", PctCov);
626  SubjCoverage = max(SubjCoverage, PctCov);
627  }
628  BestCoverage[IdStr] = SubjCoverage;
629  MaxCoverage = max(SubjCoverage, MaxCoverage);
630  }
631  }
632 
633  typedef vector<CRef<CInstance> > TInstVector;
634  ITERATE(CQuerySet::TAssemblyToSubjectSet, AssemIter, QueryAligns.Get()) {
635  ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, AssemIter->second) {
636  //ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, QueryIter->second->Get()) {
637 
638  TInstVector SubjInstances;
639 
640  CRef<CSeq_align_set> Set = SubjectIter->second;
641  string SubjIdStr = Set->Get().front()->GetSeq_id(1).AsFastaString();
642  if(BestCoverage[SubjIdStr] < (MaxCoverage*0.10)) {
643  continue; // Skip any subject id that has less than 10% the best pct_coverage
644  }
645 
646  // Group together alignments that are not contained within each other,
647  // but are within a 'small' gap distance of each other into initial
648  // instances
649  ITERATE(CSeq_align_set::Tdata, AlignIter, Set->Get()) {
650 
651  if((*AlignIter)->GetSegs().Which() != CSeq_align::C_Segs::e_Denseg)
652  continue;
653 
654  bool Inserted = false;
655  bool Contained = false;
656  ITERATE(TInstVector, InstIter, SubjInstances) {
657  bool CurrContained = (*InstIter)->IsAlignmentContained(**AlignIter);
658  Contained |= CurrContained;
659  }
660  if(Contained)
661  continue;
662  NON_CONST_ITERATE(TInstVector, InstIter, SubjInstances) {
663  int GapDist = (*InstIter)->GapDistance(**AlignIter);
664  if(GapDist < 20000) {
665  (*InstIter)->MergeIn(*AlignIter);
666  Inserted = true;
667  break;
668  }
669  }
670  if(!Inserted) {
671  CRef<CInstance> Inst(new CInstance(*AlignIter));
672  SubjInstances.push_back(Inst);
673  }
674  }
675 
676  // For these initial instances, run CAlignCleanup on the instance
677  // CSeq_align_sets. Then make instances from CAlignCleanup results.
678  TInstVector CleanedInstances;
679 
680  ITERATE(TInstVector, InstIter, SubjInstances) {
681  // If the instance only has one alignment in it, skip it.
682  // we already have an identical BLAST result.
683  if((*InstIter)->Alignments.Get().size() <= 1)
684  continue;
685 
686  CRef<CSeq_align_set> Cleaned;
687  Cleaned = x_RunCleanup((*InstIter)->Alignments, Scope);
688  if(!Cleaned.IsNull()) {
689  ITERATE(CSeq_align_set::Tdata, AlignIter, Cleaned->Get()) {
690  // If any of these cleaned alignments are dupes of existing alignments,
691  // skip them
692  bool DupeFound = false;
693  ITERATE(CSeq_align_set::Tdata, SourceIter, (*InstIter)->Alignments.Get()) {
694  if( (*AlignIter)->GetSeqStart(0) == (*SourceIter)->GetSeqStart(0) &&
695  (*AlignIter)->GetSeqStart(1) == (*SourceIter)->GetSeqStart(1) &&
696  (*AlignIter)->GetSeqStop(0) == (*SourceIter)->GetSeqStop(0) &&
697  (*AlignIter)->GetSeqStop(1) == (*SourceIter)->GetSeqStop(1)) {
698  DupeFound = true;
699  break;
700  }
701  }
702  if(DupeFound)
703  continue;
704 
705  // And if any of these cleaned alignments are contained in
706  // established instances, throw them out too.
707  bool Contained = false;
708  ITERATE(TInstVector, CleanIter, CleanedInstances) {
709  bool Curr = (*CleanIter)->IsAlignmentContained(**AlignIter);
710  Contained |= Curr;
711  }
712  if(Contained)
713  continue;
714 
715  // Any Cleaned Seq_align that got this far is a potential instance.
716  bool Dupe = false;
717  CRef<CInstance> Inst(new CInstance(*AlignIter));
718  ITERATE(vector<CRef<CInstance> >, OldInstIter, CleanedInstances) {
719  Dupe |= ((*OldInstIter)->Query.Equals(Inst->Query)
720  && (*OldInstIter)->Subject.Equals(Inst->Subject));
721  }
722  if(!Dupe)
723  CleanedInstances.push_back(Inst);
724  }
725  }
726  }
727 
728  copy(CleanedInstances.begin(), CleanedInstances.end(),
729  insert_iterator<TInstVector>(Instances, Instances.end()));
730  }
731  }
732 
733 }
734 
735 
736 void CInstancedAligner::x_FilterInstances(vector<CRef<CInstance> >& Instances, double MaxRatio)
737 {
738  // Filters out Instances of overly lopsided ratios
739  vector<CRef<CInstance> >::iterator Curr;
740  Curr = Instances.begin();
741  for(Curr = Instances.begin(); Curr != Instances.end(); ) {
742  if( (*Curr)->SubjToQueryRatio() > MaxRatio ||
743  (*Curr)->SubjToQueryRatio() < 0.10 )
744  Curr = Instances.erase(Curr);
745  else
746  ++Curr;
747  }
748 
749  // Find the longest instance now
750  TSeqPos LongestInstance = 0;
751  for(Curr = Instances.begin(); Curr != Instances.end(); ++Curr) {
752  TSeqPos CurrLength = (*Curr)->QueryLength();
753  LongestInstance = max(CurrLength, LongestInstance);
754  }
755 
756  // Filters out instances that are too tiny to both with
757  for(Curr = Instances.begin(); Curr != Instances.end(); ) {
758  if( (*Curr)->QueryLength() <= (LongestInstance*0.05))
759  Curr = Instances.erase(Curr);
760  else
761  ++Curr;
762  }
763 
764  vector<CRef<CInstance> >::iterator Outer, Inner;
765  for(Outer = Instances.begin(); Outer != Instances.end(); ++Outer) {
766  for(Inner = Outer+1; Inner != Instances.end(); ) {
767  if( (*Outer)->Query.Equals((*Inner)->Query) &&
768  (*Outer)->Subject.Equals((*Inner)->Subject) ) {
769  Inner = Instances.erase(Inner);
770  } else {
771  ++Inner;
772  }
773  }
774  }
775 
776 }
777 
778 
780 {
781  double BestPctCoverage = -1.0;
782 
783  ITERATE(CQuerySet::TAssemblyToSubjectSet, AssemIter, QueryAligns.Get()) {
784  ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, AssemIter->second) {
785  //ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, QueryIter->second->Get()) {
786 
787  CRef<CSeq_align_set> Set = SubjectIter->second;
788 
789  ITERATE(CSeq_align_set::Tdata, AlignIter, Set->Get()) {
790  double PctCoverage = -1.0;
791  (*AlignIter)->GetNamedScore("pct_coverage", PctCoverage);
792  BestPctCoverage = max(BestPctCoverage, PctCoverage);
793  }
794  }
795  }
796 
797 
798  return (BestPctCoverage >= m_MinPctCoverage || BestPctCoverage == -1.0);
799 }
800 
802 
USING_SCOPE(objects)
TSeqPos s_CalcCoverageCount(TAlignSetRef Alignments, int Row, CScope &Scope)
bool s_ProgressCallback(CNWAligner::SProgressInfo *ProgressInfo)
CRef< CSeq_loc > s_CoverageSeqLoc(TAlignSetRef Alignments, int Row, CScope &Scope)
Declares the CBl2Seq (BLAST 2 Sequences) class.
Declares the CBlastNucleotideOptionsHandle class.
Declares the CBlastOptionsHandle and CBlastOptionsFactory classes.
Definitions of special type used in BLAST.
Main argument class for BLASTN application.
class CAlignCleanup implements an alignment cleanup utility based on the C++ alignment manager.
void Cleanup(const TAligns &aligns_in, TAligns &aligns_out, EMode mode=eDefault)
CBioseq_Handle –.
void TrimEndGaps()
Trim leading/training gaps if possible.
Definition: Dense_seg.cpp:344
ENa_strand GetSeqStrand(TDim row) const
Definition: Dense_seg.cpp:241
void Reverse(void)
Reverse the segments' orientation.
Definition: Dense_seg.cpp:644
void OffsetRow(TDim row, TSignedSeqPos offset)
Offset row's coords.
Definition: Dense_seg.cpp:890
double SubjToQueryRatio() const
int GapDistance(const objects::CSeq_align &Align) const
objects::CSeq_interval Subject
void MergeIn(CRef< objects::CSeq_align > Align)
objects::CSeq_align_set Alignments
TSeqPos QueryLength() const
bool IsAlignmentContained(const objects::CSeq_align &Align) const
CInstance(const CRef< objects::CSeq_align > Align)
objects::CSeq_interval Query
string GetName() const
TAlignResultsRef GenerateAlignments(objects::CScope &Scope, ISequenceSet *QuerySet, ISequenceSet *SubjectSet, TAlignResultsRef AccumResults)
void x_FilterInstances(vector< CRef< CInstance > > &Instances, double MaxRatio)
CRef< objects::CSeq_align_set > x_RunCleanup(const objects::CSeq_align_set &AlignSet, objects::CScope &Scope)
void x_RunAligner(objects::CScope &Scope, CQuerySet &QueryAligns, TAlignResultsRef Results)
void x_GetDistanceInstances(CQuerySet &QueryAligns, objects::CScope &Scope, vector< CRef< CInstance > > &Instances)
void x_GetCleanupInstances(CQuerySet &QueryAligns, objects::CScope &Scope, vector< CRef< CInstance > > &Instances)
CRef< objects::CDense_seg > x_RunMMGlobal(const objects::CSeq_id &QueryId, const objects::CSeq_id &SubjectId, objects::ENa_strand Strand, TSeqPos QueryStart, TSeqPos QueryStop, TSeqPos SubjectStart, TSeqPos SubjectStop, objects::CScope &Scope)
bool x_MinCoverageCheck(const CQuerySet &QueryAligns)
TAssemblyToSubjectSet & Get()
Definition: result_set.hpp:82
CConstRef< objects::CSeq_id > GetQueryId() const
Definition: result_set.hpp:91
CRef –.
Definition: ncbiobj.hpp:618
CScope –.
Definition: scope.hpp:92
CSeqVector –.
Definition: seq_vector.hpp:65
TSeqPos GetSeqStop(TDim row) const
Definition: Seq_align.cpp:273
void SetNamedScore(const string &id, int score)
Definition: Seq_align.cpp:636
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
Definition: Seq_align.cpp:317
TSeqPos GetSeqStart(TDim row) const
Definition: Seq_align.cpp:252
ENa_strand GetSeqStrand(TDim row) const
Get strand (the first one if segments have different strands).
Definition: Seq_align.cpp:294
CTimeSpan.
Definition: ncbitime.hpp:1313
CTime –.
Definition: ncbitime.hpp:296
Declares the CDiscNucleotideOptionsHandle class.
void SetProgressCallback(FProgressCallback prg_callback, void *data)
void SetWs(TScore value)
Definition: nw_aligner.hpp:122
virtual TScore Run(void)
Definition: nw_aligner.cpp:503
void SetWg(TScore value)
Definition: nw_aligner.hpp:121
void SetScoreMatrix(const SNCBIPackedScoreMatrix *scoremat)
void SetWms(TScore value)
CRef< objects::CDense_seg > GetDense_seg(TSeqPos query_start, objects::ENa_strand query_strand, TSeqPos subj_start, objects::ENa_strand subj_strand, bool trim_end_gaps=false) const
void SetWm(TScore value)
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define NULL
Definition: ncbistd.hpp:225
#define _TRACE(message)
Definition: ncbidbg.hpp:122
CDiagContext_Extra & Print(const string &name, const string &value)
The method does not print the argument, but adds it to the string.
Definition: ncbidiag.cpp:2622
CDiagContext & GetDiagContext(void)
Get diag context instance.
Definition: logging.cpp:818
CDiagContext_Extra Extra(void) const
Create a temporary CDiagContext_Extra object.
Definition: ncbidiag.hpp:2095
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
string ReportAll(TDiagPostFlags flags=eDPF_Exception) const
Report all exceptions.
Definition: ncbiexpt.cpp:370
void Info(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1185
void SetMix(TMix &v)
Definition: Seq_loc.hpp:987
void SetInt(TInt &v)
Definition: Seq_loc.hpp:983
void ChangeToMix(void)
Definition: Seq_loc.cpp:3633
@ fSortAndMerge_All
Definition: Seq_loc.hpp:334
CRef< CSeq_loc > Seq_loc_Merge(const CSeq_loc &loc, CSeq_loc::TOpFlags flags, CScope *scope)
Merge ranges in the seq-loc.
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
CSeqVector GetSeqVector(EVectorCoding coding, ENa_strand strand=eNa_strand_plus) const
Get sequence: Iupacna or Iupacaa if use_iupac_coding is true.
@ eStrand_Plus
Plus strand.
@ eStrand_Minus
Minus strand.
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
Definition: seq_vector.cpp:304
TSeqPos size(void) const
Definition: seq_vector.hpp:291
bool IsNull(void) const THROWS_NONE
Check if pointer is null – same effect as Empty().
Definition: ncbiobj.hpp:735
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
double GetAsDouble(void) const
Return time span as number of seconds.
Definition: ncbitime.hpp:2566
long GetCompleteSeconds(void) const
Get number of complete seconds.
Definition: ncbitime.hpp:2560
@ eCurrent
Use current time. See also CCurrentTime.
Definition: ncbitime.hpp:300
static bool IsSignaled(TSignalMask signals=eSignal_Any)
Check that any of specified signals is received.
const TDenseg & GetDenseg(void) const
Get the variant data.
Definition: Seq_align_.cpp:153
Tdata & Set(void)
Assign a value to data member.
const TStarts & GetStarts(void) const
Get the Starts member data.
Definition: Dense_seg_.hpp:530
void SetSegs(TSegs &value)
Assign a value to Segs data member.
Definition: Seq_align_.cpp:310
const TLens & GetLens(void) const
Get the Lens member data.
Definition: Dense_seg_.hpp:555
void SetType(TType value)
Assign a value to Type data member.
Definition: Seq_align_.hpp:818
TDim GetDim(void) const
Get the Dim member data.
Definition: Dense_seg_.hpp:421
TStrands & SetStrands(void)
Assign a value to Strands data member.
Definition: Dense_seg_.hpp:586
const TIds & GetIds(void) const
Get the Ids member data.
Definition: Dense_seg_.hpp:505
bool CanGetStrands(void) const
Check if it is safe to call GetStrands method.
Definition: Dense_seg_.hpp:574
TNumseg GetNumseg(void) const
Get the Numseg member data.
Definition: Dense_seg_.hpp:465
list< CRef< CSeq_align > > Tdata
const TStrands & GetStrands(void) const
Get the Strands member data.
Definition: Dense_seg_.hpp:580
const Tdata & Get(void) const
Get the member data.
const TSegs & GetSegs(void) const
Get the Segs member data.
Definition: Seq_align_.hpp:921
@ eType_partial
mapping pieces together
Definition: Seq_align_.hpp:103
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
list< CRef< CSeq_loc > > Tdata
const Tdata & Get(void) const
Get the member data.
const TMix & GetMix(void) const
Get the variant data.
Definition: Seq_loc_.cpp:282
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
@ e_Int
from to
Definition: Seq_loc_.hpp:101
Main class to perform a BLAST search on the local machine.
Magic spell ;-) needed for some weird compilers... very empiric.
Setup interrupt signal handling.
Defines NCBI C++ exception handling.
T max(T x_, T y_)
T min(T x_, T y_)
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
void Out(T t, int w, CNcbiOstream &to=cout)
Definition: parse.cpp:467
Modified on Wed Apr 17 13:08:44 2024 by modify_doxy.py rev. 669887