32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbi_system.hpp>
37 #include <objmgr/scope.hpp>
45 #include <objmgr/seq_vector.hpp>
46 #include <objmgr/util/sequence.hpp>
54 USING_SCOPE(blast);
// Tokenize a blast parameter string on unquoted blanks and tabs, e.g.
//   "-W 28 -r 1 -q -3 -e 1e-5 -Z 200 -F 'm L; R -d rodents.lib'"
// yields ["-W", "28", ..., "-F", "m L; R -d rodents.lib"].
//
// Either ' or " opens a quoted run, which is closed by the same character;
// the other quote character is kept literally while inside the run.  The
// delimiting quotes themselves are dropped.  Tokens are appended to `result`.
// Throws runtime_error if the string ends inside a quoted run.
static void s_SplitCommandLine(std::string s, std::vector<std::string>& result)
{
    // A trailing blank lets the loop flush the final token like any other.
    s.push_back(' ');

    std::string token;
    bool inside_token = false;
    char open_quote = 0;  // 0 while not inside a quoted run

    for (std::string::size_type pos = 0; pos < s.size(); ++pos) {
        const char ch = s[pos];
        const bool is_blank = (ch == ' ' || ch == '\t');

        if (is_blank && open_quote == 0) {
            // The first separator after a token terminates it.
            if (inside_token) {
                result.push_back(token);
                inside_token = false;
            }
            continue;
        }

        if (!inside_token) {
            // First character of a new token.
            token.clear();
            inside_token = true;
        }

        const bool is_quote = (ch == '\'' || ch == '"');
        if (!is_quote) {
            token += ch;
        } else if (open_quote == 0) {
            open_quote = ch;       // opening quote
        } else if (ch == open_quote) {
            open_quote = 0;        // matching closing quote
        } else {
            token += ch;           // the other kind of quote, nested literally
        }
    }

    if (open_quote != 0) {
        throw std::runtime_error("Unbalanced quotes (')");
    }
}
// Compute the unaligned "tail" lengths for every row of an alignment.
// One STails entry per row is appended to `tails` (the vector is not cleared
// here).  For a positive-strand row, left = start of the aligned span and
// right = seq_len - stop - 1; for any other row the two values are swapped.
// NOTE(review): the declaration of `tls` is not visible in this excerpt --
// presumably a per-row STails local; confirm against the full file.
109 static void s_GetTails(const CAlnVec& vec,
110  vector<CContigAssembly::SAlignStats::STails>& tails)
111 {
113  for (int j = 0; j < vec.GetNumRows(); ++j) {
114  TSeqPos start = vec.GetSeqStart(j);
115  TSeqPos stop = vec.GetSeqStop(j);
// Full length of the underlying sequence for this row.
117  TSeqPos seq_len = vec.GetBioseqHandle(j).GetBioseqLength();
119  if (vec.IsPositiveStrand(j)) {
120  tls.left = start;
121  tls.right = seq_len - stop - 1;
122  } else {
// Not positive strand: the two tails trade places.
123  tls.right = start;
124  tls.left = seq_len - stop - 1;
125  }
126  tails.push_back(tls);
127  }
128 }
132  const CSeq_id& subject_id,
133  const string& param_string,
134  CScope& scope)
135 {
// NOTE(review): the first line of this signature is not visible in this
// excerpt; presumably this is the Seq-id based Blastn overload -- confirm.
// Wrap each id in a whole-sequence Seq-loc and forward to the Seq-loc
// based Blastn call with the same parameter string and scope.
136  CSeq_loc query_loc;
137  query_loc.SetWhole().Assign(query_id);
138  CSeq_loc subject_loc;
139  subject_loc.SetWhole().Assign(subject_id);
140  return Blastn(query_loc, subject_loc, param_string, scope);
141 }
145  const CSeq_loc& subject_loc,
146  const string& param_string,
147  CScope& scope)
148 {
// NOTE(review): the first line of this signature, the declaration of
// `opts`, and the bodies of most option branches are not visible in this
// excerpt.
// Runs a two-sequence blastn (CBl2Seq with eBlastn) of query_loc against
// subject_loc, configured from param_string.
149  SSeqLoc query_sl(query_loc, scope);
150  SSeqLoc subject_sl(subject_loc, scope);
152  CBl2Seq bl2seq(query_sl, subject_sl, eBlastn);
// Tokenize the parameter string (quotes respected) into name/value items.
153  vector<string> args;
154  s_SplitCommandLine(param_string, args);
156  dynamic_cast<CBlastNucleotideOptionsHandle&>(bl2seq.SetOptionsHandle());
// Arguments are consumed strictly as (name, value) pairs.
158  for (unsigned int i = 0; i < args.size(); i += 2) {
159  const string& name = args[i];
// A trailing option name without a value is an error.
160  if (i + 1 >= args.size()) {
161  throw runtime_error("no value given for " + name);
162  }
163  const string& value = args[i + 1];
164  if (name == "-W") {
166  } else if (name == "-r") {
168  } else if (name == "-q") {
170  } else if (name == "-e") {
172  } else if (name == "-Z") {
174  } else if (name == "-F") {
175  opts.SetFilterString(value.c_str());
176  } else if (name == "-G") {
178  } else if (name == "-E") {
180  } else {
// Any option name other than the ones tested above is rejected.
181  throw runtime_error("invalid option: " + name);
182  }
183  }
// Only the first element of the result vector is returned.
185  TSeqAlignVector res = bl2seq.Run();
186  return res.front();
187 }
190 void CContigAssembly::DiagCounts(const CSeq_align_set& align_set, CScope& scope,
191  vector<unsigned int>& plus_vec,
192  vector<unsigned int>& minus_vec)
193 {
194  const CSeq_id& id0 =
195  *align_set.Get().front()->GetSegs().GetDenseg().GetIds()[0];
196  const CSeq_id& id1 =
197  *align_set.Get().front()->GetSegs().GetDenseg().GetIds()[1];
199  TSeqPos len0 = scope.GetBioseqHandle(id0).GetBioseqLength();
200  TSeqPos len1 = scope.GetBioseqHandle(id1).GetBioseqLength();
202  plus_vec.clear();
203  plus_vec.resize(len0 + len1);
205  minus_vec.clear();
206  minus_vec.resize(len0 + len1);
208  ITERATE (CSeq_align_set::Tdata, aln, align_set.Get()) {
209  if ((*aln)->GetSeqStrand(0) == eNa_strand_minus) {
210  const CDense_seg& ds = (*aln)->GetSegs().GetDenseg();
211  for (int i = 0; i < ds.GetNumseg(); ++i) {
212  TSignedSeqPos start0 = ds.GetStarts()[2 * i];
213  TSignedSeqPos start1 = ds.GetStarts()[2 * i + 1];
214  if (start0 == -1 || start1 == -1) {
215  // do nothing with gaps
216  continue;
217  }
218  TSeqPos seg_len = ds.GetLens()[i];
219  TSeqPos diag = (start0 + seg_len - 1) + start1;
220  minus_vec[diag] += seg_len;
221  }
222  } else {
223  const CDense_seg& ds = (*aln)->GetSegs().GetDenseg();
224  for (int i = 0; i < ds.GetNumseg(); ++i) {
225  TSignedSeqPos start0 = ds.GetStarts()[2 * i];
226  TSignedSeqPos start1 = ds.GetStarts()[2 * i + 1];
227  if (start0 == -1 || start1 == -1) {
228  // do nothing with gaps
229  continue;
230  }
231  TSeqPos seg_len = ds.GetLens()[i];
232  TSeqPos diag = start1 - start0 + len0 - 1 ;
233  plus_vec[diag] += seg_len;
234  }
235  }
236  }
237 }
240 void CContigAssembly::FindMaxRange(const vector<unsigned int>& vec,
241  unsigned int window,
242  unsigned int& max,
243  vector<TRange>& max_range)
244 {
245  unsigned int running_sum = 0;
246  unsigned int i;
247  for (i = 0; i < window; ++i) {
248  running_sum += vec[i];
249  }
250  max = running_sum;
251  max_range.clear();
252  max_range.push_back(TRange(window - 1, window - 1));
254  for (i = window; i < vec.size(); ++i) {
255  running_sum -= vec[i - window];
256  running_sum += vec[i];
257  if (running_sum >= max) {
258  if (running_sum > max) {
259  max_range.clear();
260  max = running_sum;
261  }
262  if (max_range.size() && max_range.back().GetFrom() == i - 1) {
263  max_range.back().SetFrom(i);
264  } else {
265  max_range.push_back(TRange(i, i));
266  }
267  }
268  }
269 }
273  CScope& scope,
274  unsigned int window_size,
275  ENa_strand& strand,
276  unsigned int& diag)
277 {
// NOTE(review): the first line of this signature is not visible in this
// excerpt.
// Histogram aligned length per diagonal for each strand, find the best
// window of window_size diagonals on each, and report the winning strand
// and a diagonal for it through `strand` and `diag`.
279  vector<unsigned int> plus_vec, minus_vec;
280  DiagCounts(align_set, scope, plus_vec, minus_vec);
282  unsigned int plus_count;
283  vector<TRange> plus_range;
284  FindMaxRange(plus_vec, window_size, plus_count, plus_range);
286  unsigned int minus_count;
287  vector<TRange> minus_range;
288  FindMaxRange(minus_vec, window_size, minus_count, minus_range);
// Plus wins only on a strictly larger count; a tie selects minus.
// NOTE(review): `count` is assigned below but never read in the visible code.
290  unsigned int count;
291  vector<TRange>* r = NULL;
292  if (plus_count > minus_count) {
293  strand = eNa_strand_plus;
294  count = plus_count;
295  r = &plus_range;
296  } else {
297  strand = eNa_strand_minus;
298  count = minus_count;
299  r = &minus_range;
300  }
302  // use first continuous range
// Midpoint of the first range, moved back by half a window.
303  diag =
304  (r->front().GetFrom() + r->front().GetTo() + 1) / 2 - window_size / 2;
305 }
308 // Fake a banded NW alignment around an arbitrary diagonal.
309 // diag is specified using same convention as returned by
310 // FindDiagFromAlignSet.
// NOTE(review): the first line of the signature and the declarations of
// `vec_strand` and `ds` are not visible in this excerpt.
313  ENa_strand strand, unsigned int diag,
314  unsigned int half_width, CScope& scope)
315 {
// Fetch sequence 0 as IUPAC text on the requested strand.
317  CBioseq_Handle hand0 = scope.GetBioseqHandle(id0);
318  if (strand == eNa_strand_plus) {
319  vec_strand = hand0.eStrand_Plus;
320  } else {
321  vec_strand = hand0.eStrand_Minus;
322  }
323  CSeqVector vec0 = hand0.GetSeqVector(hand0.eCoding_Iupac, vec_strand);
324  vec0.SetIupacCoding();
325  string seq0;
326  vec0.GetSeqData(0, vec0.size(), seq0);
// Fetch sequence 1 as IUPAC text; no strand argument is passed here.
328  CBioseq_Handle hand1 = scope.GetBioseqHandle(id1);
329  CSeqVector vec1 = hand1.GetSeqVector(hand1.eCoding_Iupac);
330  vec1.SetIupacCoding();
331  string seq1;
332  vec1.GetSeqData(0, vec1.size(), seq1);
334  CBandAligner alnr(seq0, seq1, 0, half_width);
335  alnr.SetEndSpaceFree(true, true, true, true);
// On LP64 builds the aligner's space limit is set to the physical memory
// size when that size is reported as non-zero.
336 #ifdef __LP64__
337  Uint8 phys_ram = GetPhysicalMemorySize();
338  if(phys_ram > 0)
339  alnr.SetSpaceLimit(phys_ram);
340 #endif
341  // Translate shift from one convention (lower left corner is zero)
342  // to another (upper left is zero, direction of shift given separately)
// NOTE(review): when diag == seq0.size() the first branch is taken and
// seq0.size() - 1 - diag wraps around as an unsigned value; the condition
// probably should exclude that case -- confirm.
343  Uint1 direction;
344  size_t shift;
345  if (diag <= seq0.size()) {
346  direction = 0;
347  shift = seq0.size() - 1 - diag;
348  } else {
349  direction = 1;
350  shift = diag - (seq0.size() - 1);
351  }
352  alnr.SetShift(direction, shift);
353  alnr.Run();
// Build the dense-seg from the aligner transcript; row 0 starts at 0 on the
// plus strand or at the last base of seq0 otherwise, row 1 starts at 0 plus.
356  ds->FromTranscript(strand == eNa_strand_plus ? 0 : seq0.size() - 1, strand,
357  0, eNa_strand_plus,
358  alnr.GetTranscriptString());
359  CRef<CSeq_id> cr_id0(new CSeq_id);
360  cr_id0->Assign(id0);
361  CRef<CSeq_id> cr_id1(new CSeq_id);
362  cr_id1->Assign(id1);
363  ds->SetIds().push_back(cr_id0);
364  ds->SetIds().push_back(cr_id1);
366  // Trim any overhanging ends
// NOTE(review): both trims index the starts vector without checking that
// it is non-empty; an alignment reduced to zero segments by the first trim
// would be indexed out of range by the second -- confirm this cannot occur.
367  if (ds->GetStarts().back() == -1
368  || ds->GetStarts()[ds->GetStarts().size() - 2] == -1) {
369  // remove last segment
370  ds->SetStarts().pop_back(); ds->SetStarts().pop_back();
371  ds->SetLens().pop_back();
372  ds->SetStrands().pop_back(); ds->SetStrands().pop_back();
373  ds->SetNumseg(ds->GetNumseg() - 1);
374  }
375  if (ds->GetStarts()[0] == -1 || ds->GetStarts()[1] == -1) {
376  // remove first segment
// Shift starts/lens/strands down by one segment, then shrink each vector.
377  for (unsigned int i = 0; i < ds->GetStarts().size() - 2; ++i) {
378  ds->SetStarts()[i] = ds->GetStarts()[i + 2];
379  }
380  ds->SetStarts().resize(ds->GetStarts().size() - 2);
381  for (unsigned int i = 0; i < ds->GetLens().size() - 1; ++i) {
382  ds->SetLens()[i] = ds->GetLens()[i + 1];
383  }
384  ds->SetLens().resize(ds->GetLens().size() - 1);
385  for (unsigned int i = 0; i < ds->GetStrands().size() - 2; ++i) {
386  ds->SetStrands()[i] = ds->GetStrands()[i + 2];
387  }
388  ds->SetStrands().resize(ds->GetStrands().size() - 2);
389  ds->SetNumseg(ds->GetNumseg() - 1);
390  }
392  return ds;
393 }
398  unsigned int slop, CScope& scope)
399 {
// NOTE(review): the first line of the signature and the declaration of
// `stats` are not visible in this excerpt.
// Returns true when the alignment's tails satisfy either
//   row 0 right tail <= slop and row 1 left tail <= slop, or
//   row 0 left tail <= slop and row 1 right tail <= slop.
401  CAlnVec avec(ds, scope);
402  s_GetTails(avec, stats.tails);
404  if( (stats.tails[0].right <= slop && stats.tails[1].left <= slop) ||
405  (stats.tails[0].left <= slop && stats.tails[1].right <= slop) ) {
406  return true;
407  } else {
408  return false;
409  }
// The block below is commented-out code working directly on the dense-seg
// start/stop coordinates; it is not compiled.
410 /*
411  const CSeq_id& id0 = *ds.GetIds()[0];
412  const CSeq_id& id1 = *ds.GetIds()[1];
413  TSeqPos len0 = scope.GetBioseqHandle(id0).GetBioseqLength();
414  TSeqPos len1 = scope.GetBioseqHandle(id1).GetBioseqLength();
416  // This assumes other sequence is plus strand,
417  // ie., ds.GetSeqStrand(1) == ncbi.eNa_strand_plus
418  if (!ds.CanGetStrands() || ds.GetStrands().empty() ||
419  ds.GetSeqStrand(0) == eNa_strand_plus) {
420  if (ds.GetSeqStart(0) <= slop &&
421  len1 - ds.GetSeqStop(1) - 1 <= slop) {
422  return true;
423  }
424  if (ds.GetSeqStart(1) <= slop &&
425  len0 - ds.GetSeqStop(0) - 1 <= slop) {
426  return true;
427  }
428  return false;
429  } else { // seq0 minus strand
430  if (ds.GetSeqStart(0) <= slop &&
431  ds.GetSeqStart(1) <= slop) {
432  return true;
433  }
434  if (len0 - ds.GetSeqStop(0) - 1 <= slop &&
435  len1 - ds.GetSeqStop(1) - 1 <= slop) {
436  return true;
437  }
438  return false;
439  }*/
440 }
443 // Does the alignment come within slop of either end of
444 // either sequence? Perhaps the criteria should be more stringent.
// NOTE(review): the first line of the signature is not visible in this
// excerpt.
446  unsigned int slop, CScope& scope)
447 {
448  const CSeq_id& id0 = *ds.GetIds()[0];
449  const CSeq_id& id1 = *ds.GetIds()[1];
450  TSeqPos len0 = scope.GetBioseqHandle(id0).GetBioseqLength();
451  TSeqPos len1 = scope.GetBioseqHandle(id1).GetBioseqLength();
// True when any one of the four distances (start or end of either row to
// the corresponding sequence end) is at most slop.
453  return ds.GetSeqStart(0) <= slop
454  || len1 - ds.GetSeqStop(1) - 1 <= slop
455  || ds.GetSeqStart(1) <= slop
456  || len0 - ds.GetSeqStop(0) - 1 <= slop;
457 }
460 // Is one contained in the other, modulo slop?
// NOTE(review): the first line of the signature and the declaration of
// `stats` are not visible in this excerpt.
462  unsigned int slop, CScope& scope)
463 {
// Commented-out earlier implementation based on dense-seg start/stop
// coordinates; not compiled.
464 /* const CSeq_id& id0 = *ds.GetIds()[0];
465  const CSeq_id& id1 = *ds.GetIds()[1];
466  TSeqPos len0 = scope.GetBioseqHandle(id0).GetBioseqLength();
467  TSeqPos len1 = scope.GetBioseqHandle(id1).GetBioseqLength();
468  return (ds.GetSeqStart(0) <= slop
469  && len0 - ds.GetSeqStop(0) - 1 <= slop)
470  ||
471  (ds.GetSeqStart(1) <= slop
472  && len1 - ds.GetSeqStop(1) - 1 <= slop);
473 */
475  CAlnVec avec(ds, scope);
476  s_GetTails(avec, stats.tails);
478  bool FirstContainsSecond;
479  bool SecondContainsFirst;
// One row "contains" the other when, on both the left and the right side,
// its tail minus the other row's tail is at least -slop.
481  FirstContainsSecond = ((((long)stats.tails[0].left - stats.tails[1].left) >= -int(slop)) &&
482  (((long)stats.tails[0].right - stats.tails[1].right) >= -int(slop)));
483  SecondContainsFirst = ((((long)stats.tails[1].left - stats.tails[0].left) >= -int(slop)) &&
484  (((long)stats.tails[1].right - stats.tails[0].right) >= -int(slop)));
486  return (FirstContainsSecond | SecondContainsFirst);
487 }
491 {
// NOTE(review): the signature and the declaration of `stats` are not
// visible in this excerpt.
// Returns the alignment's identity as a fraction: the pct_identity filled
// in by x_GatherIdentStats, divided by 100.
492  double Ident;
493  // This is from the old way to calculate percent identity.
494  // It was only ever used internally to filter overlaps
495  //Ident = CAlnStats(ds, scope).GetFracIdentity();
496  // This way uses the same calculation as the exposed GatherAlignStats()
497  // functions.
499  CAlnVec avec(ds, scope);
500  x_GatherIdentStats(avec, stats);
501  Ident = stats.pct_identity/100.0;
502  return Ident;
503 }
506 // Find the highest-scoring local alignment that is a
507 // sub-alignment of the given alignment
// NOTE(review): the first line of the signature is not visible in this
// excerpt.
509  CScope& scope)
510 {
// Scoring weights: gap opening, match, mismatch, gap extension (per column).
511  int Wg = -5, Wm = 1, Wms = -2, Ws = -2;
513  CAlnVec avec(ds_in, scope);
514  avec.SetEndChar('-');
515  avec.SetGapChar('-');
// Number of alignment columns = sum of all segment lengths.
517  unsigned int sz = 0;
518  for (unsigned int i = 0; i < ds_in.GetLens().size(); ++i) {
519  sz += ds_in.GetLens()[i];
520  }
// scores[i] is the running score through column i, clamped at zero below.
521  vector<int> scores(sz);
522  int previous_score;
523  for (unsigned int i = 0; i < scores.size(); ++i) {
524  unsigned char res0 = avec.GetResidue(0, i);
525  unsigned char res1 = avec.GetResidue(1, i);
526  if (i > 0) {
527  previous_score = scores[i - 1];
528  } else {
529  previous_score = 0;
530  }
// A gap column costs Ws when the same row was already a gap in the previous
// column, and Wg + Ws when it opens a new gap.
531  if (res0 == '-') {
532  if (i > 0 && avec.GetResidue(0, i - 1) == '-') {
533  scores[i] = previous_score + Ws;
534  } else {
535  scores[i] = previous_score + Wg + Ws;
536  }
537  } else if (res1 == '-') {
538  if (i > 0 && avec.GetResidue(1, i - 1) == '-') {
539  scores[i] = previous_score + Ws;
540  } else {
541  scores[i] = previous_score + Wg + Ws;
542  }
543  } else if (res0 == res1) {
544  // match
545  scores[i] = previous_score + Wm;
546  } else {
547  // mismatch
548  scores[i] = previous_score + Wms;
549  }
551  // Don't let the score drop below zero
552  if (scores[i] < 0) {
553  scores[i] = 0;
554  }
555  }
557  // Find the (or a) place where score is max
558  int max_score = 0;
559  unsigned int right_end = 0; // initialize to avoid compiler warning
560  unsigned int left_end;
561  for (unsigned int i = 0; i < scores.size(); ++i) {
562  if (scores[i] > max_score) {
563  max_score = scores[i];
564  right_end = i; // important that we do ">" rather than ">="
565  }
566  }
568  // Find the closest zero prior to this; the column after this
569  // is what we want. If we never hit zero, we want position zero.
570  if (right_end == 0) {
571  left_end = 0; // This will return an alignment of length one.
572  // If score[0] == 0, it is possible that we should
573  // return an alignment of length zero, so this is
574  // not strictly correct.
575  } else {
// Scan backwards from right_end - 1; the loop stops at index 1 at the
// latest, so index 0 is examined separately afterwards.
576  unsigned int i;
577  for (i = right_end - 1; i > 0; --i) {
578  if (scores[i] == 0) {
579  break;
580  }
581  }
582  if (i > 0) {
583  left_end = i + 1;
584  } else {
585  if (scores[0] == 0) {
586  left_end = 1;
587  } else {
588  left_end = 0;
589  }
590  }
591  }
593  // Extract a slice of the ds corresponding to this range
// The column range is converted to row-0 sequence positions for the slice.
594  CRef<CDense_seg> ds_out;
595  ds_out = ds_in.ExtractSlice(0, avec.GetSeqPosFromAlnPos(0, left_end),
596  avec.GetSeqPosFromAlnPos(0, right_end));
597  return ds_out;
598 }
// Align two sequences and return the acceptable alignment(s), trying in
// order:
//   1. blast alignments that are full dovetails;
//   2. a banded global alignment around the best blast diagonal, reduced to
//      its best local sub-alignment, for each band half-width in turn;
//   3. blast alignments that are at least half-dovetails;
//   4. the last banded local alignment, if it is at least a half-dovetail.
// Every candidate must also meet min_ident, the strand constraints and
// min_align_length.  Progress messages are written to ostr when it is
// non-null.  Returns an empty vector when nothing qualifies or blast fails.
// Throws runtime_error when min_ident is outside [0, 1].
// NOTE(review): the declaration of `alns` and the headers of the two loops
// over the blast alignments are not visible in this excerpt.
601 vector<CRef<CSeq_align> >
602 CContigAssembly::Align(const CSeq_id& id0, const CSeq_id& id1,
603  const string& blast_params, double min_ident,
604  unsigned int max_end_slop, CScope& scope,
605  CNcbiOstream* ostr,
606  const vector<unsigned int>& band_halfwidths,
607  unsigned int diag_finding_window,
608  unsigned int min_align_length,
609  ENa_strand strand0, ENa_strand strand1)
610 {
611  if (min_ident > 1 || min_ident < 0) {
612  throw runtime_error("min_ident must be between zero and one (got "
613  + NStr::DoubleToString(min_ident) + ")");
614  }
// NOTE(review): min_ident is validated above as a fraction in [0, 1] but is
// printed below followed by "%".
616  if (ostr) {
617  map<int,string> strandmap;
618  strandmap[eNa_strand_unknown] = "Unknown";
619  strandmap[eNa_strand_plus] = "Plus";
620  strandmap[eNa_strand_minus] = "Minus";
621  *ostr << "Running blast for " << id0.GetSeqIdString(true)
622  << " and " << id1.GetSeqIdString(true) << endl;
623  *ostr << "Filtering on " << min_ident << "%, slop " << max_end_slop
624  << "bp, min length " << min_align_length << "bp"
625  << " and strands " << strandmap[strand0] << ", " << strandmap[strand1] << endl;
626  }
// A blast failure is reported (if logging) and yields an empty result.
628  try {
629  alns = Blastn(id0, id1, blast_params, scope);
630  }
631  catch (exception& e) {
632  if (ostr) {
633  *ostr << "blast failed:\n" << e.what() << endl;
634  }
635  return vector<CRef<CSeq_align> >();
636  }
637 // cerr << "Blast Count Total: " << alns->Get().size() << endl;
// Stage 1: keep blast alignments that are full dovetails and pass all
// filters; kept alignments are oriented in place.
638  vector<CRef<CSeq_align> > good_alns;
640 //double Ident = FracIdent((*aln)->GetSegs().GetDenseg(), scope);
641 //bool Dovetail = IsDovetail((*aln)->GetSegs().GetDenseg(), max_end_slop, scope);
642 //int Len = x_DensegLength((*aln)->GetSegs().GetDenseg());
643 //cerr << " Ident: " << Ident << " Dove: " << Dovetail << " Len: " << Len << endl;
644  if (IsDovetail((*aln)->GetSegs().GetDenseg(), max_end_slop, scope)
645  && FracIdent((*aln)->GetSegs().GetDenseg(), scope) >= min_ident
646  && x_IsAllowedStrands((*aln)->GetSegs().GetDenseg(), strand0, strand1)
647  && x_DensegLength((*aln)->GetSegs().GetDenseg()) >= min_align_length ) {
648  x_OrientAlign((*aln)->SetSegs().SetDenseg(), scope);
649  good_alns.push_back(*aln);
650  }
651  }
652  if (!good_alns.empty()) {
653  if (ostr) {
654  *ostr << "Found "<< good_alns.size() <<
655  " acceptable dovetail alignment(s) by blast "
656  << blast_params << endl;
657  }
658  return good_alns;
659  } else {
660  if (ostr) {
661  *ostr << "Found no acceptable dovetail alignments by blast "
662  << blast_params << endl;
663  }
// Without any blast alignment there is no diagonal to band around.
664  if (alns->Get().empty()) {
665  if (ostr) {
666  *ostr << "No alignments found by blast; "
667  "can't do banded alignment" << endl;
668  }
669  return vector<CRef<CSeq_align> >();
670  }
// Stage 2: pick strand and diagonal from the blast hits, then try each
// band half-width in the order given.
671  ENa_strand strand;
672  unsigned int diag;
673  FindDiagFromAlignSet(*alns, scope, diag_finding_window, strand, diag);
// local_ds is declared outside the loop, so after the loop it holds the
// result from the last half-width that produced an alignment (or stays
// null if none did).
675  CRef<CDense_seg> local_ds;
676  ITERATE(vector<unsigned int>, band_halfwidth, band_halfwidths) {
677  if (ostr) {
678  *ostr << "Trying banded global alignment with bandwidth = "
679  << 2 * *band_halfwidth + 1 << endl;
680  }
681  CRef<CDense_seg> global_ds;
682  try {
683  global_ds =
684  BandedGlobalAlignment(id0, id1, strand,
685  diag, *band_halfwidth, scope);
686  }
// An aligner failure skips to the next half-width.
687  catch (CAlgoAlignException& e) {
688  if (ostr) {
689  *ostr << "banded alignment failed:\n" << e.what() << endl;
690  }
691  continue;
692  }
694  if (global_ds->GetNumseg() == 0) {
695  if (ostr) {
696  *ostr << "banded alignment failed: num segs == 0\n" << endl;
697  }
698  continue;
699  }
701  local_ds = BestLocalSubAlignment(*global_ds, scope);
// NOTE(review): frac_ident is only used for the log message; FracIdent is
// called again in the condition below.
702  double frac_ident = FracIdent(*local_ds, scope);
703  if (ostr) {
704  *ostr << "Fraction identity: " << frac_ident << endl;
705  }
706  if (IsDovetail(*local_ds, max_end_slop, scope)
707  && FracIdent(*local_ds, scope) >= min_ident
708  && x_IsAllowedStrands(*local_ds, strand0, strand1)
709  && x_DensegLength(*local_ds) >= min_align_length ) {
710  if (ostr) {
711  *ostr << "Alignment acceptable (full dovetail)" << endl;
712  }
// The first acceptable banded alignment is returned as a single partial
// Seq-align.
713  x_OrientAlign(*local_ds, scope);
714  CRef<CSeq_align> aln(new CSeq_align);
715  aln->SetSegs().SetDenseg(*local_ds);
716  aln->SetType(aln->eType_partial);
717  return vector<CRef<CSeq_align> >(1, aln);
718  }
719  }
721  if (ostr) {
722  *ostr << "No acceptable alignments from banded alignment algorithm"
723  << endl;
724  }
725  // Check for any half-dovetails (including contained)
726  // in blast results
// Stage 3: same filters as stage 1 but with the weaker half-dovetail test.
727  good_alns.clear();
729  if (IsAtLeastHalfDovetail((*aln)->GetSegs().GetDenseg(),
730  max_end_slop, scope)
731  && FracIdent((*aln)->GetSegs().GetDenseg(), scope) >= min_ident
732  && x_IsAllowedStrands((*aln)->GetSegs().GetDenseg(), strand0, strand1)
733  && x_DensegLength((*aln)->GetSegs().GetDenseg()) >= min_align_length
734  ) {
735  x_OrientAlign((*aln)->SetSegs().SetDenseg(), scope);
736  good_alns.push_back(*aln);
737  }
738  }
739  if (ostr) {
740  *ostr << "Found " << good_alns.size() <<
741  " acceptable half-dovetail "
742  "or contained alignment(s) by blast" << endl;
743  }
744  if (!good_alns.empty()) {
745  return good_alns;
746  } else {
747  // Check whether banded alignment is an
748  // acceptable half-dovetail (including contained)
// Stage 4: local_ds may be null here if no half-width produced an
// alignment; the null test comes first.
749  if (local_ds
750  && IsAtLeastHalfDovetail(*local_ds, max_end_slop, scope)
751  && FracIdent(*local_ds, scope) >= min_ident
752  && x_IsAllowedStrands(*local_ds, strand0, strand1)
753  && x_DensegLength(*local_ds) >= min_align_length) {
// dovetail_string only affects the log message.
754  string dovetail_string;
755  if (IsContained(*local_ds, max_end_slop, scope)) {
756  dovetail_string = "contained";
757  } else {
758  dovetail_string = "half-dovetail";
759  }
760  if (ostr) {
761  *ostr << "Banded alignment acceptable ("
762  << dovetail_string << ")" << endl;
763  }
764  x_OrientAlign(*local_ds, scope);
765  CRef<CSeq_align> aln(new CSeq_align);
766  aln->SetSegs().SetDenseg(*local_ds);
767  aln->SetType(aln->eType_partial);
768  return vector<CRef<CSeq_align> >(1, aln);
770  } else {
771  if (ostr) {
772  *ostr << "Banded alignment not an acceptable "
773  "half-dovetail or contained" << endl;
774  }
775  return vector<CRef<CSeq_align> >();
776  }
777  }
778  }
779 }
// Compare the two rows of a pairwise alignment column by column and count
// the adjusted length, the mismatches and the gaps.
// NOTE(review): the statements that fill row1 and row2, and any statements
// after the counting loop, are only partly visible in this excerpt.
782 CContigAssembly::CAlnStats::CAlnStats(const objects::CDense_seg& ds,
783  objects::CScope& scope)
784 {
785  // Largely stolen from CCOPair::CAln
786  string row1, row2;
787  CAlnVec vec(ds, scope);
788  vec.SetGapChar('-');
790  (0, vec.GetAlnStop()));
792  (0, vec.GetAlnStop()));
793  _ASSERT(row1.size() == row2.size());
795  m_AdjustedLen = m_MM = m_Gaps = 0;
796  for (unsigned int i = 0; i < row1.size(); ++i) {
// Columns where either row is 'N' are excluded from every count.
797  if (row1[i] != 'N' && row2[i] != 'N') {
798  ++m_AdjustedLen;
800  if (row1[i] != row2[i]) {
// A run of consecutive '-' in one row counts as a single gap: the inner
// while advances i past the rest of the run, so those columns add nothing
// further to m_AdjustedLen.
801  if (row1[i] == '-') {
802  ++m_Gaps;
803  while (i+1 < row1.size() && row1[i+1] == '-') ++i;
804  } else if (row2[i] == '-') {
805  ++m_Gaps;
806  while (i+1 < row1.size() && row2[i+1] == '-') ++i;
807  } else {
808  ++m_MM;
809  }
810  }
811  }
812  }
815 }
820  SAlignStats& align_stats)
821 {
// NOTE(review): the first line of the signature and a few interior lines
// (part of the GetSeqData call chain and of the Compare call) are not
// visible in this excerpt.
// Fills align_stats with gap metrics, then identity statistics
// (x_GatherIdentStats) and unaligned tails (s_GetTails).
822  ///
823  /// gap metrics
824  ///
826  align_stats.total_length = 0;
827  align_stats.aligned_length = 0;
828  align_stats.gap_count = 0;
829  align_stats.gaps.clear();
830  align_stats.is_simple.clear();
// dust_locs gets one masked-interval Seq-loc per row, but only when the
// alignment has more than one segment.
832  vector<CRef<CSeq_loc> > dust_locs;
834  if (vec.GetNumSegs() > 1) {
835  // run dust to classify gaps as simple sequence or not
836  CSymDustMasker masker;
837  for (int row = 0; row < vec.GetNumRows(); ++row) {
838  CSeqVector seq_vec = vec.GetBioseqHandle(row).GetSeqVector();
839  seq_vec.SetIupacCoding();
840  CSeq_id id("lcl|dummy");
841  CRef<CPacked_seqint> res = masker.GetMaskedInts(id, seq_vec);
842  CRef<CSeq_loc> loc(new CSeq_loc);
843  loc->SetPacked_int(*res);
844  dust_locs.push_back(loc);
845  }
846  }
// NOTE(review): gap_simple is declared outside the segment loop and is only
// written, never read, in the visible code.
849  int gap_simple = -1; // -1 = not checked, 0 = no, 1 = yes
850  bool simple = false;
851  for (int i = 0; i < vec.GetNumSegs(); ++i) {
852  align_stats.total_length += vec.GetLen(i);
853  bool is_gap = false;
854  for (int j = 0; j < vec.GetNumRows(); ++j) {
855  if (vec.GetStart(j, i) == -1) {
// Row j is a gap in this segment; look at what the other row has there.
// (j + 1) % 2 presumes exactly two rows -- TODO confirm.
856  simple = false;
857  unsigned int other_row = (j + 1) % 2;
858  TSeqPos start = vec.GetStart(other_row, i);
859  TSeqPos stop = start + vec.GetLen(i);
860  string seq;
861  vec.GetBioseqHandle(other_row)
864  .GetSeqData(start, stop, seq);
// The gap counts as "simple" when the other row's interval is contained in,
// or identical to, that row's dust-masked locations.
// NOTE(review): dust_locs is empty for a single-segment alignment, so
// dust_locs[other_row] relies on a gap never occurring in that case --
// confirm.
866  CSeq_loc gap_loc;
867  gap_loc.SetInt().SetId().Set("lcl|dummy");
868  gap_loc.SetInt().SetFrom(vec.GetStart(other_row, i));
869  gap_loc.SetInt().SetTo(vec.GetStop(other_row, i));
870  sequence::ECompare cmp_res
871  = sequence::Compare(gap_loc, *dust_locs[other_row],
872  &vec.GetScope(),
874  simple = cmp_res == sequence::eContained
875  || cmp_res == sequence::eSame;
877  if (simple) {
878  gap_simple = 1;
879  } else if (gap_simple == -1) {
880  gap_simple = 0;
881  }
883  is_gap = true;
884  }
885  }
// Ungapped segments add to aligned_length; gapped ones are recorded with
// their length and the simple-sequence flag.
887  if (!is_gap) {
888  align_stats.aligned_length += vec.GetLen(i);
889  } else {
890  align_stats.gap_count += 1;
891  align_stats.gaps.push_back(vec.GetLen(i));
892  align_stats.is_simple.push_back(simple);
893  }
894  }
896  ///
897  /// identity computation
898  ///
899  x_GatherIdentStats(vec, align_stats);
901  ///
902  /// overhangs (unaligned tails)
903  ///
904  s_GetTails(vec, align_stats.tails);
906 }
910  CScope& scope,
911  SAlignStats& align_stats)
912 {
// NOTE(review): the first line of the signature is not visible in this
// excerpt.
// Builds a CAlnVec from ds and scope and gathers the statistics from it.
913  CAlnVec avec(ds, scope);
914  GatherAlignStats(avec, align_stats);
915 }
919  CScope& scope,
920  SAlignStats& align_stats)
921 {
// NOTE(review): the first line of the signature is not visible in this
// excerpt.
// Gathers the statistics from the alignment's dense-seg segments.
922  GatherAlignStats(aln.GetSegs().GetDenseg(), scope, align_stats);
923 }
927 {
// NOTE(review): the signature and the declaration of `stats` are not
// visible in this excerpt.
// Reverses the dense-seg when row 0's left tail is shorter than row 1's.
928  //return;
930  CAlnVec avec(ds, scope);
931  s_GetTails(avec, stats.tails);
933  if(stats.tails[0].left < stats.tails[1].left) {
934  ds.Reverse();
935  }
936 }
940  ENa_strand strand0,
941  ENa_strand strand1)
942 {
// NOTE(review): the first line of the signature is not visible in this
// excerpt.
// Checks the alignment's row strands against the requested strands;
// eNa_strand_unknown in a request matches anything.
943  ENa_strand align_strands[2];
944  bool matches[2] = {false, false};
// An alignment without strand information is treated as plus/plus.
945  if(!ds.CanGetStrands() || ds.GetStrands().empty()) {
946  align_strands[0] = align_strands[1] = eNa_strand_plus;
947  } else {
948  align_strands[0] = ds.GetSeqStrand(0);
949  align_strands[1] = ds.GetSeqStrand(1);
950  }
// First pass: requested strands against the rows in the given order.
952  if(strand0 == align_strands[0] || strand0 == eNa_strand_unknown)
953  matches[0] = true;
954  if(strand1 == align_strands[1] || strand1 == eNa_strand_unknown)
955  matches[1] = true;
// Second pass: try the rows in swapped order.
// NOTE(review): flags set in the first pass are not cleared, so the result
// can be true when strand0 matches row 0 and strand1 also matches row 0
// (e.g. request plus/plus against plus/minus) -- confirm this is intended.
957  if(!(matches[0] & matches[1])) {
958  if(strand0 == align_strands[1] || strand0 == eNa_strand_unknown)
959  matches[0] = true;
960  if(strand1 == align_strands[0] || strand1 == eNa_strand_unknown)
961  matches[1] = true;
962  }
964  return (matches[0] & matches[1]);
965 }
968 TSeqPos CContigAssembly::x_DensegLength(const objects::CDense_seg& ds)
969 {
970  TSeqPos Length = 0;
971  const CDense_seg::TStarts& Starts = ds.GetStarts();
972  const CDense_seg::TLens& Lens = ds.GetLens();
973  int Dim = ds.GetDim();
975  for(unsigned int Seg = 0; Seg < Lens.size(); Seg++) {
977  if(Starts[(Seg*Dim)] == -1 || Starts[(Seg*Dim)+1] == -1)
978  Length++;
979  else
980  Length += Lens[Seg];
981  }
982  return Length;
983 }
986 void CContigAssembly::x_GatherIdentStats(const objects::CAlnVec& vec,
987  SAlignStats& align_stats)
988 {
989  TSeqPos AlignedLength = 0;
991  for (int i = 0; i < vec.GetNumSegs(); ++i) {
992  bool is_gap = false;
993  for (int j = 0; j < vec.GetNumRows(); ++j) {
994  if (vec.GetStart(j, i) == -1) {
995  is_gap = true;
996  }
997  }
999  if (!is_gap) {
1000  AlignedLength += vec.GetLen(i);
1001  } else {
1002  ;
1003  }
1004  }
1006  unsigned int identities = 0;
1007  for (int i = 0; i < vec.GetNumSegs(); ++i) {
1008  string s1;
1009  vec.GetSegSeqString(s1, 0, i);
1010  for (int j = 1; j < vec.GetNumRows(); ++j) {
1011  string s2;
1012  vec.GetSegSeqString(s2, j, i);
1014  for (unsigned int k = 0; k < min(s1.size(), s2.size()); ++k) {
1015  identities += (s1[k] == s2[k]);
1016  }
1017  }
1018  }
1020  align_stats.mismatches = AlignedLength - identities;
1021  align_stats.pct_identity =
1022  100.0 * double(identities) / double(AlignedLength);
1023 }
