NCBI C++ ToolKit
contig_assembly.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: contig_assembly.cpp 64772 2014-10-08 13:43:15Z grichenk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Josh Cherry
27  *
28  * File Description: Alignment functions intended for use in contig assembly
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbi_system.hpp>
37 #include <objmgr/scope.hpp>
45 #include <objmgr/seq_vector.hpp>
46 #include <objmgr/util/sequence.hpp>
51 
54 USING_SCOPE(blast);
55 
56 
57 
/// Split a blast parameter string into arguments, respecting quotes.
///
/// For example
///   "-W 28 -r 1 -q -3 -e 1e-5 -Z 200 -F 'm L; R -d rodents.lib'"
/// yields ["-W", "28", ..., "-F", "m L; R -d rodents.lib"].
///
/// Rules:
///  - arguments are separated by runs of spaces and/or tabs;
///  - either ' or " opens a quoted region, which is closed by the same
///    character; the delimiting quotes are dropped, and the other kind of
///    quote is kept literally inside the region;
///  - an empty quoted region (e.g. '') produces an empty argument.
///
/// @param s       the parameter string to split
/// @param result  receives the arguments; they are appended, existing
///                contents are left in place
/// @throws std::runtime_error if a quoted region is never closed; the
///         message names the quote character that was left open.  Arguments
///         completed before the open quote have already been appended.
static void s_SplitCommandLine(std::string s, std::vector<std::string>& result)
{
    bool in_quotes = false;
    char quote_char = 0;     // meaningful only once a quote has been opened
    bool in_item = false;    // true while an argument is being accumulated
    std::string item;

    for (std::string::size_type i = 0; i < s.size(); ++i) {
        const char c = s[i];

        if ((c == ' ' || c == '\t') && !in_quotes) {
            // Unquoted whitespace terminates the current argument, if any.
            if (in_item) {
                result.push_back(item);
                item.erase();
                in_item = false;
            }
            continue;
        }

        // Any other character (including an opening quote) starts or
        // continues an argument.
        in_item = true;

        if (c != '\'' && c != '"') {
            item += c;
        } else if (!in_quotes) {
            in_quotes = true;
            quote_char = c;
        } else if (c == quote_char) {
            in_quotes = false;
        } else {
            // one kind of quote inside of the other: keep it literally
            item += c;
        }
    }

    if (in_quotes) {
        // Report the quote that was actually left open, not always "'".
        throw std::runtime_error(std::string("Unbalanced quotes (")
                                 + quote_char + ")");
    }
    if (in_item) {
        result.push_back(item);
    }
}
107 
108 
109 static void s_GetTails(const CAlnVec& vec,
110  vector<CContigAssembly::SAlignStats::STails>& tails)
111 {
113  for (int j = 0; j < vec.GetNumRows(); ++j) {
114  TSeqPos start = vec.GetSeqStart(j);
115  TSeqPos stop = vec.GetSeqStop(j);
116 
117  TSeqPos seq_len = vec.GetBioseqHandle(j).GetBioseqLength();
118 
119  if (vec.IsPositiveStrand(j)) {
120  tls.left = start;
121  tls.right = seq_len - stop - 1;
122  } else {
123  tls.right = start;
124  tls.left = seq_len - stop - 1;
125  }
126  tails.push_back(tls);
127  }
128 }
129 
130 
132  const CSeq_id& subject_id,
133  const string& param_string,
134  CScope& scope)
135 {
136  CSeq_loc query_loc;
137  query_loc.SetWhole().Assign(query_id);
138  CSeq_loc subject_loc;
139  subject_loc.SetWhole().Assign(subject_id);
140  return Blastn(query_loc, subject_loc, param_string, scope);
141 }
142 
143 
145  const CSeq_loc& subject_loc,
146  const string& param_string,
147  CScope& scope)
148 {
149  SSeqLoc query_sl(query_loc, scope);
150  SSeqLoc subject_sl(subject_loc, scope);
151 
152  CBl2Seq bl2seq(query_sl, subject_sl, eBlastn);
153  vector<string> args;
154  s_SplitCommandLine(param_string, args);
156  dynamic_cast<CBlastNucleotideOptionsHandle&>(bl2seq.SetOptionsHandle());
158  for (unsigned int i = 0; i < args.size(); i += 2) {
159  const string& name = args[i];
160  if (i + 1 >= args.size()) {
161  throw runtime_error("no value given for " + name);
162  }
163  const string& value = args[i + 1];
164  if (name == "-W") {
166  } else if (name == "-r") {
168  } else if (name == "-q") {
170  } else if (name == "-e") {
172  } else if (name == "-Z") {
174  } else if (name == "-F") {
175  opts.SetFilterString(value.c_str());
176  } else if (name == "-G") {
178  } else if (name == "-E") {
180  } else {
181  throw runtime_error("invalid option: " + name);
182  }
183  }
184 
185  TSeqAlignVector res = bl2seq.Run();
186  return res.front();
187 }
188 
189 
190 void CContigAssembly::DiagCounts(const CSeq_align_set& align_set, CScope& scope,
191  vector<unsigned int>& plus_vec,
192  vector<unsigned int>& minus_vec)
193 {
194  const CSeq_id& id0 =
195  *align_set.Get().front()->GetSegs().GetDenseg().GetIds()[0];
196  const CSeq_id& id1 =
197  *align_set.Get().front()->GetSegs().GetDenseg().GetIds()[1];
198 
199  TSeqPos len0 = scope.GetBioseqHandle(id0).GetBioseqLength();
200  TSeqPos len1 = scope.GetBioseqHandle(id1).GetBioseqLength();
201 
202  plus_vec.clear();
203  plus_vec.resize(len0 + len1);
204 
205  minus_vec.clear();
206  minus_vec.resize(len0 + len1);
207 
208  ITERATE (CSeq_align_set::Tdata, aln, align_set.Get()) {
209  if ((*aln)->GetSeqStrand(0) == eNa_strand_minus) {
210  const CDense_seg& ds = (*aln)->GetSegs().GetDenseg();
211  for (int i = 0; i < ds.GetNumseg(); ++i) {
212  TSignedSeqPos start0 = ds.GetStarts()[2 * i];
213  TSignedSeqPos start1 = ds.GetStarts()[2 * i + 1];
214  if (start0 == -1 || start1 == -1) {
215  // do nothing with gaps
216  continue;
217  }
218  TSeqPos seg_len = ds.GetLens()[i];
219  TSeqPos diag = (start0 + seg_len - 1) + start1;
220  minus_vec[diag] += seg_len;
221  }
222  } else {
223  const CDense_seg& ds = (*aln)->GetSegs().GetDenseg();
224  for (int i = 0; i < ds.GetNumseg(); ++i) {
225  TSignedSeqPos start0 = ds.GetStarts()[2 * i];
226  TSignedSeqPos start1 = ds.GetStarts()[2 * i + 1];
227  if (start0 == -1 || start1 == -1) {
228  // do nothing with gaps
229  continue;
230  }
231  TSeqPos seg_len = ds.GetLens()[i];
232  TSeqPos diag = start1 - start0 + len0 - 1 ;
233  plus_vec[diag] += seg_len;
234  }
235  }
236  }
237 }
238 
239 
240 void CContigAssembly::FindMaxRange(const vector<unsigned int>& vec,
241  unsigned int window,
242  unsigned int& max,
243  vector<TRange>& max_range)
244 {
245  unsigned int running_sum = 0;
246  unsigned int i;
247  for (i = 0; i < window; ++i) {
248  running_sum += vec[i];
249  }
250  max = running_sum;
251  max_range.clear();
252  max_range.push_back(TRange(window - 1, window - 1));
253 
254  for (i = window; i < vec.size(); ++i) {
255  running_sum -= vec[i - window];
256  running_sum += vec[i];
257  if (running_sum >= max) {
258  if (running_sum > max) {
259  max_range.clear();
260  max = running_sum;
261  }
262  if (max_range.size() && max_range.back().GetFrom() == i - 1) {
263  max_range.back().SetFrom(i);
264  } else {
265  max_range.push_back(TRange(i, i));
266  }
267  }
268  }
269 }
270 
271 
273  CScope& scope,
274  unsigned int window_size,
275  ENa_strand& strand,
276  unsigned int& diag)
277 {
278 
279  vector<unsigned int> plus_vec, minus_vec;
280  DiagCounts(align_set, scope, plus_vec, minus_vec);
281 
282  unsigned int plus_count;
283  vector<TRange> plus_range;
284  FindMaxRange(plus_vec, window_size, plus_count, plus_range);
285 
286  unsigned int minus_count;
287  vector<TRange> minus_range;
288  FindMaxRange(minus_vec, window_size, minus_count, minus_range);
289 
290  unsigned int count;
291  vector<TRange>* r = NULL;
292  if (plus_count > minus_count) {
293  strand = eNa_strand_plus;
294  count = plus_count;
295  r = &plus_range;
296  } else {
297  strand = eNa_strand_minus;
298  count = minus_count;
299  r = &minus_range;
300  }
301 
302  // use first continuous range
303  diag =
304  (r->front().GetFrom() + r->front().GetTo() + 1) / 2 - window_size / 2;
305 }
306 
307 
308 // Fake a banded NW alignment around an arbitrary diagonal.
309 // diag is specified using same convention as returned by
310 // FindDiagFromAlignSet.
313  ENa_strand strand, unsigned int diag,
314  unsigned int half_width, CScope& scope)
315 {
317  CBioseq_Handle hand0 = scope.GetBioseqHandle(id0);
318  if (strand == eNa_strand_plus) {
319  vec_strand = hand0.eStrand_Plus;
320  } else {
321  vec_strand = hand0.eStrand_Minus;
322  }
323  CSeqVector vec0 = hand0.GetSeqVector(hand0.eCoding_Iupac, vec_strand);
324  vec0.SetIupacCoding();
325  string seq0;
326  vec0.GetSeqData(0, vec0.size(), seq0);
327 
328  CBioseq_Handle hand1 = scope.GetBioseqHandle(id1);
329  CSeqVector vec1 = hand1.GetSeqVector(hand1.eCoding_Iupac);
330  vec1.SetIupacCoding();
331  string seq1;
332  vec1.GetSeqData(0, vec1.size(), seq1);
333 
334  CBandAligner alnr(seq0, seq1, 0, half_width);
335  alnr.SetEndSpaceFree(true, true, true, true);
336 #ifdef __LP64__
337  Uint8 phys_ram = GetPhysicalMemorySize();
338  if(phys_ram > 0)
339  alnr.SetSpaceLimit(phys_ram);
340 #endif
341  // Translate shift from one convention (lower left corner is zero)
342  // to another (upper left is zero, direction of shift given separately)
343  Uint1 direction;
344  size_t shift;
345  if (diag <= seq0.size()) {
346  direction = 0;
347  shift = seq0.size() - 1 - diag;
348  } else {
349  direction = 1;
350  shift = diag - (seq0.size() - 1);
351  }
352  alnr.SetShift(direction, shift);
353  alnr.Run();
354 
356  ds->FromTranscript(strand == eNa_strand_plus ? 0 : seq0.size() - 1, strand,
357  0, eNa_strand_plus,
358  alnr.GetTranscriptString());
359  CRef<CSeq_id> cr_id0(new CSeq_id);
360  cr_id0->Assign(id0);
361  CRef<CSeq_id> cr_id1(new CSeq_id);
362  cr_id1->Assign(id1);
363  ds->SetIds().push_back(cr_id0);
364  ds->SetIds().push_back(cr_id1);
365 
366  // Trim any overhanging ends
367  if (ds->GetStarts().back() == -1
368  || ds->GetStarts()[ds->GetStarts().size() - 2] == -1) {
369  // remove last segment
370  ds->SetStarts().pop_back(); ds->SetStarts().pop_back();
371  ds->SetLens().pop_back();
372  ds->SetStrands().pop_back(); ds->SetStrands().pop_back();
373  ds->SetNumseg(ds->GetNumseg() - 1);
374  }
375  if (ds->GetStarts()[0] == -1 || ds->GetStarts()[1] == -1) {
376  // remove first segment
377  for (unsigned int i = 0; i < ds->GetStarts().size() - 2; ++i) {
378  ds->SetStarts()[i] = ds->GetStarts()[i + 2];
379  }
380  ds->SetStarts().resize(ds->GetStarts().size() - 2);
381  for (unsigned int i = 0; i < ds->GetLens().size() - 1; ++i) {
382  ds->SetLens()[i] = ds->GetLens()[i + 1];
383  }
384  ds->SetLens().resize(ds->GetLens().size() - 1);
385  for (unsigned int i = 0; i < ds->GetStrands().size() - 2; ++i) {
386  ds->SetStrands()[i] = ds->GetStrands()[i + 2];
387  }
388  ds->SetStrands().resize(ds->GetStrands().size() - 2);
389  ds->SetNumseg(ds->GetNumseg() - 1);
390  }
391 
392  return ds;
393 }
394 
395 
396 
398  unsigned int slop, CScope& scope)
399 {
401  CAlnVec avec(ds, scope);
402  s_GetTails(avec, stats.tails);
403 
404  if( (stats.tails[0].right <= slop && stats.tails[1].left <= slop) ||
405  (stats.tails[0].left <= slop && stats.tails[1].right <= slop) ) {
406  return true;
407  } else {
408  return false;
409  }
410 /*
411  const CSeq_id& id0 = *ds.GetIds()[0];
412  const CSeq_id& id1 = *ds.GetIds()[1];
413  TSeqPos len0 = scope.GetBioseqHandle(id0).GetBioseqLength();
414  TSeqPos len1 = scope.GetBioseqHandle(id1).GetBioseqLength();
415 
416  // This assumes other sequence is plus strand,
417  // ie., ds.GetSeqStrand(1) == ncbi.eNa_strand_plus
418  if (!ds.CanGetStrands() || ds.GetStrands().empty() ||
419  ds.GetSeqStrand(0) == eNa_strand_plus) {
420  if (ds.GetSeqStart(0) <= slop &&
421  len1 - ds.GetSeqStop(1) - 1 <= slop) {
422  return true;
423  }
424  if (ds.GetSeqStart(1) <= slop &&
425  len0 - ds.GetSeqStop(0) - 1 <= slop) {
426  return true;
427  }
428  return false;
429  } else { // seq0 minus strand
430  if (ds.GetSeqStart(0) <= slop &&
431  ds.GetSeqStart(1) <= slop) {
432  return true;
433  }
434  if (len0 - ds.GetSeqStop(0) - 1 <= slop &&
435  len1 - ds.GetSeqStop(1) - 1 <= slop) {
436  return true;
437  }
438  return false;
439  }*/
440 }
441 
442 
443 // Does the alignment come within slop of either end of
444 // either sequence? Perhaps the criteria should be more stringent.
446  unsigned int slop, CScope& scope)
447 {
448  const CSeq_id& id0 = *ds.GetIds()[0];
449  const CSeq_id& id1 = *ds.GetIds()[1];
450  TSeqPos len0 = scope.GetBioseqHandle(id0).GetBioseqLength();
451  TSeqPos len1 = scope.GetBioseqHandle(id1).GetBioseqLength();
452 
453  return ds.GetSeqStart(0) <= slop
454  || len1 - ds.GetSeqStop(1) - 1 <= slop
455  || ds.GetSeqStart(1) <= slop
456  || len0 - ds.GetSeqStop(0) - 1 <= slop;
457 }
458 
459 
460 // Is one contained in the other, modulo slop?
462  unsigned int slop, CScope& scope)
463 {
464 /* const CSeq_id& id0 = *ds.GetIds()[0];
465  const CSeq_id& id1 = *ds.GetIds()[1];
466  TSeqPos len0 = scope.GetBioseqHandle(id0).GetBioseqLength();
467  TSeqPos len1 = scope.GetBioseqHandle(id1).GetBioseqLength();
468  return (ds.GetSeqStart(0) <= slop
469  && len0 - ds.GetSeqStop(0) - 1 <= slop)
470  ||
471  (ds.GetSeqStart(1) <= slop
472  && len1 - ds.GetSeqStop(1) - 1 <= slop);
473 */
475  CAlnVec avec(ds, scope);
476  s_GetTails(avec, stats.tails);
477 
478  bool FirstContainsSecond;
479  bool SecondContainsFirst;
480 
481  FirstContainsSecond = ((((long)stats.tails[0].left - stats.tails[1].left) >= -int(slop)) &&
482  (((long)stats.tails[0].right - stats.tails[1].right) >= -int(slop)));
483  SecondContainsFirst = ((((long)stats.tails[1].left - stats.tails[0].left) >= -int(slop)) &&
484  (((long)stats.tails[1].right - stats.tails[0].right) >= -int(slop)));
485 
486  return (FirstContainsSecond | SecondContainsFirst);
487 }
488 
489 
491 {
492  double Ident;
493  // This is from the old way to calculate percent identity.
494  // It was only ever used internally to filter overlaps
495  //Ident = CAlnStats(ds, scope).GetFracIdentity();
496  // This way uses the same calculation as the exposed GatherAlignStats()
497  // functions.
499  CAlnVec avec(ds, scope);
500  x_GatherIdentStats(avec, stats);
501  Ident = stats.pct_identity/100.0;
502  return Ident;
503 }
504 
505 
506 // Find the highest-scoring local alignment that is a
507 // sub-alignment of the given alignment
509  CScope& scope)
510 {
511  int Wg = -5, Wm = 1, Wms = -2, Ws = -2;
512 
513  CAlnVec avec(ds_in, scope);
514  avec.SetEndChar('-');
515  avec.SetGapChar('-');
516 
517  unsigned int sz = 0;
518  for (unsigned int i = 0; i < ds_in.GetLens().size(); ++i) {
519  sz += ds_in.GetLens()[i];
520  }
521  vector<int> scores(sz);
522  int previous_score;
523  for (unsigned int i = 0; i < scores.size(); ++i) {
524  unsigned char res0 = avec.GetResidue(0, i);
525  unsigned char res1 = avec.GetResidue(1, i);
526  if (i > 0) {
527  previous_score = scores[i - 1];
528  } else {
529  previous_score = 0;
530  }
531  if (res0 == '-') {
532  if (i > 0 && avec.GetResidue(0, i - 1) == '-') {
533  scores[i] = previous_score + Ws;
534  } else {
535  scores[i] = previous_score + Wg + Ws;
536  }
537  } else if (res1 == '-') {
538  if (i > 0 && avec.GetResidue(1, i - 1) == '-') {
539  scores[i] = previous_score + Ws;
540  } else {
541  scores[i] = previous_score + Wg + Ws;
542  }
543  } else if (res0 == res1) {
544  // match
545  scores[i] = previous_score + Wm;
546  } else {
547  // mismatch
548  scores[i] = previous_score + Wms;
549  }
550 
551  // Don't let the score drop below zero
552  if (scores[i] < 0) {
553  scores[i] = 0;
554  }
555  }
556 
557  // Find the (or a) place where score is max
558  int max_score = 0;
559  unsigned int right_end = 0; // initialize to avoid compiler warning
560  unsigned int left_end;
561  for (unsigned int i = 0; i < scores.size(); ++i) {
562  if (scores[i] > max_score) {
563  max_score = scores[i];
564  right_end = i; // important that we do ">" rather than ">="
565  }
566  }
567 
568  // Find the closest zero prior to this; the column after this
569  // is what we want. If we never hit zero, we want position zero.
570  if (right_end == 0) {
571  left_end = 0; // This will return an alignment of length one.
572  // If score[0] == 0, it is possible that we should
573  // return an alignment of length zero, so this is
574  // not strictly correct.
575  } else {
576  unsigned int i;
577  for (i = right_end - 1; i > 0; --i) {
578  if (scores[i] == 0) {
579  break;
580  }
581  }
582  if (i > 0) {
583  left_end = i + 1;
584  } else {
585  if (scores[0] == 0) {
586  left_end = 1;
587  } else {
588  left_end = 0;
589  }
590  }
591  }
592 
593  // Extract a slice of the ds corresponding to this range
594  CRef<CDense_seg> ds_out;
595  ds_out = ds_in.ExtractSlice(0, avec.GetSeqPosFromAlnPos(0, left_end),
596  avec.GetSeqPosFromAlnPos(0, right_end));
597  return ds_out;
598 }
599 
600 
601 vector<CRef<CSeq_align> >
602 CContigAssembly::Align(const CSeq_id& id0, const CSeq_id& id1,
603  const string& blast_params, double min_ident,
604  unsigned int max_end_slop, CScope& scope,
605  CNcbiOstream* ostr,
606  const vector<unsigned int>& band_halfwidths,
607  unsigned int diag_finding_window,
608  unsigned int min_align_length,
609  ENa_strand strand0, ENa_strand strand1)
610 {
611  if (min_ident > 1 || min_ident < 0) {
612  throw runtime_error("min_ident must be between zero and one (got "
613  + NStr::DoubleToString(min_ident) + ")");
614  }
615 
616  if (ostr) {
617  map<int,string> strandmap;
618  strandmap[eNa_strand_unknown] = "Unknown";
619  strandmap[eNa_strand_plus] = "Plus";
620  strandmap[eNa_strand_minus] = "Minus";
621  *ostr << "Running blast for " << id0.GetSeqIdString(true)
622  << " and " << id1.GetSeqIdString(true) << endl;
623  *ostr << "Filtering on " << min_ident << "%, slop " << max_end_slop
624  << "bp, min length " << min_align_length << "bp"
625  << " and strands " << strandmap[strand0] << ", " << strandmap[strand1] << endl;
626  }
628  try {
629  alns = Blastn(id0, id1, blast_params, scope);
630  }
631  catch (exception& e) {
632  if (ostr) {
633  *ostr << "blast failed:\n" << e.what() << endl;
634  }
635  return vector<CRef<CSeq_align> >();
636  }
637 // cerr << "Blast Count Total: " << alns->Get().size() << endl;
638  vector<CRef<CSeq_align> > good_alns;
640 //double Ident = FracIdent((*aln)->GetSegs().GetDenseg(), scope);
641 //bool Dovetail = IsDovetail((*aln)->GetSegs().GetDenseg(), max_end_slop, scope);
642 //int Len = x_DensegLength((*aln)->GetSegs().GetDenseg());
643 //cerr << " Ident: " << Ident << " Dove: " << Dovetail << " Len: " << Len << endl;
644  if (IsDovetail((*aln)->GetSegs().GetDenseg(), max_end_slop, scope)
645  && FracIdent((*aln)->GetSegs().GetDenseg(), scope) >= min_ident
646  && x_IsAllowedStrands((*aln)->GetSegs().GetDenseg(), strand0, strand1)
647  && x_DensegLength((*aln)->GetSegs().GetDenseg()) >= min_align_length ) {
648  x_OrientAlign((*aln)->SetSegs().SetDenseg(), scope);
649  good_alns.push_back(*aln);
650  }
651  }
652  if (!good_alns.empty()) {
653  if (ostr) {
654  *ostr << "Found "<< good_alns.size() <<
655  " acceptable dovetail alignment(s) by blast "
656  << blast_params << endl;
657  }
658  return good_alns;
659  } else {
660  if (ostr) {
661  *ostr << "Found no acceptable dovetail alignments by blast "
662  << blast_params << endl;
663  }
664  if (alns->Get().empty()) {
665  if (ostr) {
666  *ostr << "No alignments found by blast; "
667  "can't do banded alignment" << endl;
668  }
669  return vector<CRef<CSeq_align> >();
670  }
671  ENa_strand strand;
672  unsigned int diag;
673  FindDiagFromAlignSet(*alns, scope, diag_finding_window, strand, diag);
674 
675  CRef<CDense_seg> local_ds;
676  ITERATE(vector<unsigned int>, band_halfwidth, band_halfwidths) {
677  if (ostr) {
678  *ostr << "Trying banded global alignment with bandwidth = "
679  << 2 * *band_halfwidth + 1 << endl;
680  }
681  CRef<CDense_seg> global_ds;
682  try {
683  global_ds =
684  BandedGlobalAlignment(id0, id1, strand,
685  diag, *band_halfwidth, scope);
686  }
687  catch (CAlgoAlignException& e) {
688  if (ostr) {
689  *ostr << "banded alignment failed:\n" << e.what() << endl;
690  }
691  continue;
692  }
693 
694  if (global_ds->GetNumseg() == 0) {
695  if (ostr) {
696  *ostr << "banded alignment failed: num segs == 0\n" << endl;
697  }
698  continue;
699  }
700 
701  local_ds = BestLocalSubAlignment(*global_ds, scope);
702  double frac_ident = FracIdent(*local_ds, scope);
703  if (ostr) {
704  *ostr << "Fraction identity: " << frac_ident << endl;
705  }
706  if (IsDovetail(*local_ds, max_end_slop, scope)
707  && FracIdent(*local_ds, scope) >= min_ident
708  && x_IsAllowedStrands(*local_ds, strand0, strand1)
709  && x_DensegLength(*local_ds) >= min_align_length ) {
710  if (ostr) {
711  *ostr << "Alignment acceptable (full dovetail)" << endl;
712  }
713  x_OrientAlign(*local_ds, scope);
714  CRef<CSeq_align> aln(new CSeq_align);
715  aln->SetSegs().SetDenseg(*local_ds);
716  aln->SetType(aln->eType_partial);
717  return vector<CRef<CSeq_align> >(1, aln);
718  }
719  }
720 
721  if (ostr) {
722  *ostr << "No acceptable alignments from banded alignment algorithm"
723  << endl;
724  }
725  // Check for any half-dovetails (including contained)
726  // in blast results
727  good_alns.clear();
729  if (IsAtLeastHalfDovetail((*aln)->GetSegs().GetDenseg(),
730  max_end_slop, scope)
731  && FracIdent((*aln)->GetSegs().GetDenseg(), scope) >= min_ident
732  && x_IsAllowedStrands((*aln)->GetSegs().GetDenseg(), strand0, strand1)
733  && x_DensegLength((*aln)->GetSegs().GetDenseg()) >= min_align_length
734  ) {
735  x_OrientAlign((*aln)->SetSegs().SetDenseg(), scope);
736  good_alns.push_back(*aln);
737  }
738  }
739  if (ostr) {
740  *ostr << "Found " << good_alns.size() <<
741  " acceptable half-dovetail "
742  "or contained alignment(s) by blast" << endl;
743  }
744  if (!good_alns.empty()) {
745  return good_alns;
746  } else {
747  // Check whether banded alignment is an
748  // acceptable half-dovetail (including contained)
749  if (local_ds
750  && IsAtLeastHalfDovetail(*local_ds, max_end_slop, scope)
751  && FracIdent(*local_ds, scope) >= min_ident
752  && x_IsAllowedStrands(*local_ds, strand0, strand1)
753  && x_DensegLength(*local_ds) >= min_align_length) {
754  string dovetail_string;
755  if (IsContained(*local_ds, max_end_slop, scope)) {
756  dovetail_string = "contained";
757  } else {
758  dovetail_string = "half-dovetail";
759  }
760  if (ostr) {
761  *ostr << "Banded alignment acceptable ("
762  << dovetail_string << ")" << endl;
763  }
764  x_OrientAlign(*local_ds, scope);
765  CRef<CSeq_align> aln(new CSeq_align);
766  aln->SetSegs().SetDenseg(*local_ds);
767  aln->SetType(aln->eType_partial);
768  return vector<CRef<CSeq_align> >(1, aln);
769 
770  } else {
771  if (ostr) {
772  *ostr << "Banded alignment not an acceptable "
773  "half-dovetail or contained" << endl;
774  }
775  return vector<CRef<CSeq_align> >();
776  }
777  }
778  }
779 }
780 
781 
782 CContigAssembly::CAlnStats::CAlnStats(const objects::CDense_seg& ds,
783  objects::CScope& scope)
784 {
785  // Largely stolen from CCOPair::CAln
786  string row1, row2;
787  CAlnVec vec(ds, scope);
788  vec.SetGapChar('-');
790  (0, vec.GetAlnStop()));
792  (0, vec.GetAlnStop()));
793  _ASSERT(row1.size() == row2.size());
794 
795  m_AdjustedLen = m_MM = m_Gaps = 0;
796  for (unsigned int i = 0; i < row1.size(); ++i) {
797  if (row1[i] != 'N' && row2[i] != 'N') {
798  ++m_AdjustedLen;
799 
800  if (row1[i] != row2[i]) {
801  if (row1[i] == '-') {
802  ++m_Gaps;
803  while (i+1 < row1.size() && row1[i+1] == '-') ++i;
804  } else if (row2[i] == '-') {
805  ++m_Gaps;
806  while (i+1 < row1.size() && row2[i+1] == '-') ++i;
807  } else {
808  ++m_MM;
809  }
810  }
811  }
812  }
813 
815 }
816 
817 
818 
820  SAlignStats& align_stats)
821 {
822  ///
823  /// gap metrics
824  ///
825 
826  align_stats.total_length = 0;
827  align_stats.aligned_length = 0;
828  align_stats.gap_count = 0;
829  align_stats.gaps.clear();
830  align_stats.is_simple.clear();
831 
832  vector<CRef<CSeq_loc> > dust_locs;
833 
834  if (vec.GetNumSegs() > 1) {
835  // run dust to classify gaps as simple sequence or not
836  CSymDustMasker masker;
837  for (int row = 0; row < vec.GetNumRows(); ++row) {
838  CSeqVector seq_vec = vec.GetBioseqHandle(row).GetSeqVector();
839  seq_vec.SetIupacCoding();
840  CSeq_id id("lcl|dummy");
841  CRef<CPacked_seqint> res = masker.GetMaskedInts(id, seq_vec);
842  CRef<CSeq_loc> loc(new CSeq_loc);
843  loc->SetPacked_int(*res);
844  dust_locs.push_back(loc);
845  }
846  }
847 
848 
849  int gap_simple = -1; // -1 = not checked, 0 = no, 1 = yes
850  bool simple = false;
851  for (int i = 0; i < vec.GetNumSegs(); ++i) {
852  align_stats.total_length += vec.GetLen(i);
853  bool is_gap = false;
854  for (int j = 0; j < vec.GetNumRows(); ++j) {
855  if (vec.GetStart(j, i) == -1) {
856  simple = false;
857  unsigned int other_row = (j + 1) % 2;
858  TSeqPos start = vec.GetStart(other_row, i);
859  TSeqPos stop = start + vec.GetLen(i);
860  string seq;
861  vec.GetBioseqHandle(other_row)
864  .GetSeqData(start, stop, seq);
865 
866  CSeq_loc gap_loc;
867  gap_loc.SetInt().SetId().Set("lcl|dummy");
868  gap_loc.SetInt().SetFrom(vec.GetStart(other_row, i));
869  gap_loc.SetInt().SetTo(vec.GetStop(other_row, i));
870  sequence::ECompare cmp_res
871  = sequence::Compare(gap_loc, *dust_locs[other_row],
872  &vec.GetScope(),
874  simple = cmp_res == sequence::eContained
875  || cmp_res == sequence::eSame;
876 
877  if (simple) {
878  gap_simple = 1;
879  } else if (gap_simple == -1) {
880  gap_simple = 0;
881  }
882 
883  is_gap = true;
884  }
885  }
886 
887  if (!is_gap) {
888  align_stats.aligned_length += vec.GetLen(i);
889  } else {
890  align_stats.gap_count += 1;
891  align_stats.gaps.push_back(vec.GetLen(i));
892  align_stats.is_simple.push_back(simple);
893  }
894  }
895 
896  ///
897  /// identity computation
898  ///
899  x_GatherIdentStats(vec, align_stats);
900 
901  ///
902  /// overhangs (unaligned tails)
903  ///
904  s_GetTails(vec, align_stats.tails);
905 
906 }
907 
908 
910  CScope& scope,
911  SAlignStats& align_stats)
912 {
913  CAlnVec avec(ds, scope);
914  GatherAlignStats(avec, align_stats);
915 }
916 
917 
919  CScope& scope,
920  SAlignStats& align_stats)
921 {
922  GatherAlignStats(aln.GetSegs().GetDenseg(), scope, align_stats);
923 }
924 
925 
927 {
928  //return;
930  CAlnVec avec(ds, scope);
931  s_GetTails(avec, stats.tails);
932 
933  if(stats.tails[0].left < stats.tails[1].left) {
934  ds.Reverse();
935  }
936 }
937 
938 
940  ENa_strand strand0,
941  ENa_strand strand1)
942 {
943  ENa_strand align_strands[2];
944  bool matches[2] = {false, false};
945  if(!ds.CanGetStrands() || ds.GetStrands().empty()) {
946  align_strands[0] = align_strands[1] = eNa_strand_plus;
947  } else {
948  align_strands[0] = ds.GetSeqStrand(0);
949  align_strands[1] = ds.GetSeqStrand(1);
950  }
951 
952  if(strand0 == align_strands[0] || strand0 == eNa_strand_unknown)
953  matches[0] = true;
954  if(strand1 == align_strands[1] || strand1 == eNa_strand_unknown)
955  matches[1] = true;
956 
957  if(!(matches[0] & matches[1])) {
958  if(strand0 == align_strands[1] || strand0 == eNa_strand_unknown)
959  matches[0] = true;
960  if(strand1 == align_strands[0] || strand1 == eNa_strand_unknown)
961  matches[1] = true;
962  }
963 
964  return (matches[0] & matches[1]);
965 }
966 
967 
968 TSeqPos CContigAssembly::x_DensegLength(const objects::CDense_seg& ds)
969 {
970  TSeqPos Length = 0;
971  const CDense_seg::TStarts& Starts = ds.GetStarts();
972  const CDense_seg::TLens& Lens = ds.GetLens();
973  int Dim = ds.GetDim();
974 
975  for(unsigned int Seg = 0; Seg < Lens.size(); Seg++) {
976 
977  if(Starts[(Seg*Dim)] == -1 || Starts[(Seg*Dim)+1] == -1)
978  Length++;
979  else
980  Length += Lens[Seg];
981  }
982  return Length;
983 }
984 
985 
986 void CContigAssembly::x_GatherIdentStats(const objects::CAlnVec& vec,
987  SAlignStats& align_stats)
988 {
989  TSeqPos AlignedLength = 0;
990 
991  for (int i = 0; i < vec.GetNumSegs(); ++i) {
992  bool is_gap = false;
993  for (int j = 0; j < vec.GetNumRows(); ++j) {
994  if (vec.GetStart(j, i) == -1) {
995  is_gap = true;
996  }
997  }
998 
999  if (!is_gap) {
1000  AlignedLength += vec.GetLen(i);
1001  } else {
1002  ;
1003  }
1004  }
1005 
1006  unsigned int identities = 0;
1007  for (int i = 0; i < vec.GetNumSegs(); ++i) {
1008  string s1;
1009  vec.GetSegSeqString(s1, 0, i);
1010  for (int j = 1; j < vec.GetNumRows(); ++j) {
1011  string s2;
1012  vec.GetSegSeqString(s2, j, i);
1013 
1014  for (unsigned int k = 0; k < min(s1.size(), s2.size()); ++k) {
1015  identities += (s1[k] == s2[k]);
1016  }
1017  }
1018  }
1019 
1020  align_stats.mismatches = AlignedLength - identities;
1021  align_stats.pct_identity =
1022  100.0 * double(identities) / double(AlignedLength);
1023 }
1024 
1025 
1026 
Declares the CBl2Seq (BLAST 2 Sequences) class.
Declares the CBlastNucleotideOptionsHandle class.
Declares the CBlastOptionsHandle and CBlastOptionsFactory classes.
Definitions of special type used in BLAST.
vector< CRef< objects::CSeq_align_set > > TSeqAlignVector
Vector of Seq-align-sets.
@ eBlastn
Nucl-Nucl (traditional blastn)
Definition: blast_types.hpp:58
TSignedSeqPos GetStop(TNumrow row, TNumseg seg, int offset=0) const
Definition: alnmap.hpp:635
TSignedSeqPos GetStart(TNumrow row, TNumseg seg, int offset=0) const
Definition: alnmap.hpp:614
bool IsPositiveStrand(TNumrow row) const
Definition: alnmap.hpp:600
TSignedSeqPos GetSeqPosFromAlnPos(TNumrow for_row, TSeqPos aln_pos, ESearchDirection dir=eNone, bool try_reverse_dir=true) const
Definition: alnmap.cpp:663
TDim GetNumRows(void) const
Definition: alnmap.hpp:517
TSeqPos GetAlnStop(TNumseg seg) const
Definition: alnmap.hpp:488
TSeqPos GetLen(TNumseg seg, int offset=0) const
Definition: alnmap.hpp:621
TSeqPos GetSeqStop(TNumrow row) const
Definition: alnmap.hpp:675
TNumseg GetNumSegs(void) const
Definition: alnmap.hpp:510
TSeqPos GetSeqStart(TNumrow row) const
Definition: alnmap.hpp:665
const CBioseq_Handle & GetBioseqHandle(TNumrow row) const
Definition: alnvec.cpp:86
void SetEndChar(TResidue gap_char)
Definition: alnvec.hpp:368
void SetGapChar(TResidue gap_char)
Definition: alnvec.hpp:339
CScope & GetScope(void) const
Definition: alnvec.hpp:247
string & GetAlnSeqString(string &buffer, TNumrow row, const CAlnMap::TSignedRange &aln_rng) const
Definition: alnvec.cpp:145
TResidue GetResidue(TNumrow row, TSeqPos aln_pos) const
Definition: alnvec.hpp:254
CBioseq_Handle –.
Runs the BLAST algorithm between 2 sequences.
Definition: bl2seq.hpp:58
Handle to the nucleotide-nucleotide options to the BLAST algorithm.
CAlnStats(unsigned int adjusted_len, unsigned int mm, unsigned int gaps)
static void FindMaxRange(const vector< unsigned int > &vec, unsigned int window, unsigned int &max, vector< TRange > &max_range)
static void x_GatherIdentStats(const objects::CAlnVec &vec, SAlignStats &align_stats)
static CRef< objects::CDense_seg > BestLocalSubAlignment(const objects::CDense_seg &ds_in, objects::CScope &scope)
Find the highest-scoring local subalignment.
static bool IsAtLeastHalfDovetail(const objects::CDense_seg &ds, unsigned int slop, objects::CScope &scope)
static CRef< objects::CSeq_align_set > Blastn(const objects::CSeq_id &query_id, const objects::CSeq_id &subject_id, const string &param_string, objects::CScope &scope)
Utility for running blastn.
static void GatherAlignStats(const objects::CAlnVec &vec, SAlignStats &align_stats)
static bool x_IsAllowedStrands(const objects::CDense_seg &ds, objects::ENa_strand strand0, objects::ENa_strand strand1)
static void FindDiagFromAlignSet(const objects::CSeq_align_set &align_set, objects::CScope &scope, unsigned int window_size, objects::ENa_strand &strand, unsigned int &diag)
Given a set of alignments, pick out a diagonal to use as the center of a band in a banded alignment.
CRange< unsigned int > TRange
Find the range (or more than one tied range) containing the maximal diagonal count,...
static void DiagCounts(const objects::CSeq_align_set &align_set, objects::CScope &scope, vector< unsigned int > &plus_vec, vector< unsigned int > &minus_vec)
Count the cells with "ink" along each diagonal in a dot-matrix-type plot of some set of alignments (e...
static vector< CRef< objects::CSeq_align > > Align(const objects::CSeq_id &id0, const objects::CSeq_id &id1, const string &blast_params, double min_ident, unsigned int max_end_slop, objects::CScope &scope, CNcbiOstream *ostr=0, const vector< unsigned int > &band_halfwidths=vector< unsigned int >(1, 200), unsigned int diag_finding_window=200, unsigned int min_align_length=50, objects::ENa_strand strand0=objects::eNa_strand_unknown, objects::ENa_strand strand1=objects::eNa_strand_unknown)
Most users of the class need only to call this function.
static CRef< objects::CDense_seg > BandedGlobalAlignment(const objects::CSeq_id &id0, const objects::CSeq_id &id1, objects::ENa_strand strand, unsigned int diag, unsigned int half_width, objects::CScope &scope)
Do a banded global alignment using an arbitrary band.
static void x_OrientAlign(objects::CDense_seg &ds, objects::CScope &scope)
static TSeqPos x_DensegLength(const objects::CDense_seg &ds)
static bool IsDovetail(const objects::CDense_seg &ds, unsigned int slop, objects::CScope &scope)
static bool IsContained(const objects::CDense_seg &ds, unsigned int slop, objects::CScope &scope)
static double FracIdent(const objects::CDense_seg &ds, objects::CScope &scope)
ENa_strand GetSeqStrand(TDim row) const
Definition: Dense_seg.cpp:241
TSeqPos GetSeqStop(TDim row) const
Definition: Dense_seg.cpp:203
void Reverse(void)
Reverse the segments' orientation.
Definition: Dense_seg.cpp:644
TSeqPos GetSeqStart(TDim row) const
Definition: Dense_seg.cpp:165
void FromTranscript(TSeqPos query_start, ENa_strand query_strand, TSeqPos subj_start, ENa_strand subj_strand, const string &transcript)
Initialize from pairwise alignment transcript (a string representation produced by CNWAligner)
Definition: Dense_seg.cpp:1273
CRef< CDense_seg > ExtractSlice(TDim row, TSeqPos from, TSeqPos to) const
Extract a slice of the alignment that includes the specified range.
Definition: Dense_seg.cpp:747
CScope –.
Definition: scope.hpp:92
CSeqVector –.
Definition: seq_vector.hpp:65
Looks for low complexity parts of sequences according to the symmetric version of DUST algorithm.
Definition: symdust.hpp:61
CRef< objects::CPacked_seqint > GetMaskedInts(objects::CSeq_id &seq_id, const sequence_type &seq)
Mask a sequence and return result as a CPacked_seqint instance.
Definition: symdust.cpp:309
USING_SCOPE(objects)
static void s_SplitCommandLine(string s, vector< string > &result)
static void s_GetTails(const CAlnVec &vec, vector< CContigAssembly::SAlignStats::STails > &tails)
static ulg window_size
void SetSpaceLimit(const size_t &maxmem)
Definition: nw_aligner.hpp:142
void SetShift(Uint1 where, size_t offset)
string GetTranscriptString(void) const
Definition: nw_aligner.cpp:931
virtual TScore Run(void)
Definition: nw_aligner.cpp:503
void SetEndSpaceFree(bool Left1, bool Right1, bool Left2, bool Right2)
Definition: nw_aligner.cpp:192
void SetEvalueThreshold(double eval)
Sets EvalueThreshold.
void SetMatchReward(int r)
Sets MatchReward.
void SetTraditionalBlastnDefaults()
Sets TraditionalBlastnDefaults.
void SetMismatchPenalty(int p)
Sets MismatchPenalty.
void SetGapXDropoffFinal(double x)
Sets GapXDropoffFinal.
void SetGapExtensionCost(int e)
Sets GapExtensionCost.
void SetFilterString(const char *f, bool clear=true)
Sets FilterString.
void SetWordSize(int ws)
Sets WordSize.
void SetGapOpeningCost(int g)
Sets GapOpeningCost.
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
int TSignedSeqPos
Type for signed sequence position.
Definition: ncbimisc.hpp:887
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define NULL
Definition: ncbistd.hpp:225
virtual const char * what(void) const noexcept
Standard report (includes full backlog).
Definition: ncbiexpt.cpp:342
string GetSeqIdString(bool with_version=false) const
Return seqid string with optional version for text seqid type.
Definition: Seq_id.cpp:2145
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of Assign; the generic CSerialObject::Assign is not so efficient.
Definition: Seq_id.cpp:318
void SetWhole(TWhole &v)
Definition: Seq_loc.hpp:982
void SetInt(TInt &v)
Definition: Seq_loc.hpp:983
sequence::ECompare Compare(const CSeq_loc &loc1, const CSeq_loc &loc2, CScope *scope)
Returns the sequence::ECompare containment relationship between CSeq_locs.
ECompare
@ fCompareOverlapping
Check if seq-locs are overlapping.
@ eSame
CSeq_locs contain each other.
@ eContained
First CSeq_loc contained by second.
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
TSeqPos GetBioseqLength(void) const
CSeqVector GetSeqVector(EVectorCoding coding, ENa_strand strand=eNa_strand_plus) const
Get sequence: Iupacna or Iupacaa if use_iupac_coding is true.
@ eStrand_Plus
Plus strand.
@ eStrand_Minus
Minus strand.
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
Definition: seq_vector.cpp:304
TSeqPos size(void) const
Definition: seq_vector.hpp:291
void SetIupacCoding(void)
Set coding to either Iupacaa or Iupacna depending on molecule type.
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
static string DoubleToString(double value, int precision=-1, TNumToStringFlags flags=0)
Convert double to string.
Definition: ncbistr.hpp:5187
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static double StringToDouble(const CTempStringEx str, TStringToNumFlags flags=0)
Convert string to double.
Definition: ncbistr.cpp:1387
const TDenseg & GetDenseg(void) const
Get the variant data.
Definition: Seq_align_.cpp:153
Tdata & Set(void)
Assign a value to data member.
TLens & SetLens(void)
Assign a value to Lens data member.
Definition: Dense_seg_.hpp:561
vector< TSeqPos > TLens
Definition: Dense_seg_.hpp:108
const TStarts & GetStarts(void) const
Get the Starts member data.
Definition: Dense_seg_.hpp:530
void SetSegs(TSegs &value)
Assign a value to Segs data member.
Definition: Seq_align_.cpp:310
const TLens & GetLens(void) const
Get the Lens member data.
Definition: Dense_seg_.hpp:555
vector< TSignedSeqPos > TStarts
Definition: Dense_seg_.hpp:107
void SetType(TType value)
Assign a value to Type data member.
Definition: Seq_align_.hpp:818
TStarts & SetStarts(void)
Assign a value to Starts data member.
Definition: Dense_seg_.hpp:536
TStrands & SetStrands(void)
Assign a value to Strands data member.
Definition: Dense_seg_.hpp:586
void SetNumseg(TNumseg value)
Assign a value to Numseg data member.
Definition: Dense_seg_.hpp:474
const TIds & GetIds(void) const
Get the Ids member data.
Definition: Dense_seg_.hpp:505
bool CanGetStrands(void) const
Check if it is safe to call GetStrands method.
Definition: Dense_seg_.hpp:574
TNumseg GetNumseg(void) const
Get the Numseg member data.
Definition: Dense_seg_.hpp:465
list< CRef< CSeq_align > > Tdata
TIds & SetIds(void)
Assign a value to Ids data member.
Definition: Dense_seg_.hpp:511
const TStrands & GetStrands(void) const
Get the Strands member data.
Definition: Dense_seg_.hpp:580
const Tdata & Get(void) const
Get the member data.
const TSegs & GetSegs(void) const
Get the Segs member data.
Definition: Seq_align_.hpp:921
@ eType_partial
mapping pieces together
Definition: Seq_align_.hpp:103
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
@ eNa_strand_unknown
Definition: Na_strand_.hpp:65
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
const struct ncbi::grid::netcache::search::fields::SIZE size
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
Uint8 GetPhysicalMemorySize(void)
Return the amount of physical memory available in the system.
T max(T x_, T y_)
T min(T x_, T y_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
#define row(bind, expected)
Definition: string_bind.c:73
Alignment characterization.
vector< STails > tails
unaligned tails
double pct_identity
% identity (varies from 0 to 100)
TSeqPos mismatches
number of mismatched bases
TSeqPos total_length
total covered length of the alignment, including gaps
TSeqPos gap_count
count of total number of gaps
vector< bool > is_simple
for each gap, whether it consists of "simple sequence"
vector< TSeqPos > gaps
the set of gap lengths for this alignment
TSeqPos aligned_length
total number of bases included in the alignment
Structure to represent a single sequence to be fed to BLAST.
Definition: sseqloc.hpp:47
#define _ASSERT
else result
Definition: token2.c:20
Modified on Wed Apr 17 13:10:15 2024 by modify_doxy.py rev. 669887