NCBI C++ ToolKit
prog.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /*
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's offical duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================*/
25 
26 /*****************************************************************************
27 
28 File name: prog.cpp
29 
30 Author: Jason Papadopoulos
31 
32 Contents: Perform a progressive multiple alignment
33 
34 ******************************************************************************/
35 
36 #include <ncbi_pch.hpp>
37 #include <algo/cobalt/cobalt.hpp>
38 #include <algorithm>
39 
40 /// @file prog.cpp
41 /// Perform a progressive multiple alignment
42 
44 BEGIN_SCOPE(cobalt)
45 
46 /// Convert one hit into a constraint usable by CPSSMAligner
47 /// @param constraint list of previously generated constraints [in][out]
48 /// @param hit Constraint to convert [in]
49 ///
50 static void
51 x_AddConstraint(vector<size_t>& constraint,
52  CHit *hit)
53 {
54  int seq1_start = hit->m_SeqRange1.GetFrom();
55  int seq1_stop = hit->m_SeqRange1.GetTo();
56  int seq2_start = hit->m_SeqRange2.GetFrom();
57  int seq2_stop = hit->m_SeqRange2.GetTo();
58 
59  // add the top left corner of the constraint region,
60  // *unless* it is at the beginning of one or both sequences.
61  // This is a workaround to accomodate the handling of
62  // constraint regions within CPSSMAligner
63 
64  if (!constraint.empty()) {
65  int last = constraint.size();
66  if (constraint[last-4] >= (size_t)seq1_start ||
67  constraint[last-2] >= (size_t)seq2_start) {
68  return;
69  }
70  }
71 
72  constraint.push_back(seq1_start);
73  constraint.push_back(seq1_start);
74  constraint.push_back(seq2_start);
75  constraint.push_back(seq2_start);
76 
77  // constraint must be consistent, and be of size > 1
78  // to cause another group of sequence offsets to be added
79 
80  if (seq1_start >= seq1_stop ||
81  seq2_start >= seq2_stop)
82  return;
83 
84  constraint.push_back(seq1_stop);
85  constraint.push_back(seq1_stop);
86  constraint.push_back(seq2_stop);
87  constraint.push_back(seq2_stop);
88 }
89 
90 /// Convert a constraint into a form usable by CPSSMAligner
91 /// @param constraint list of previously generated constraints [in][out]
92 /// @param hit Constraint to convert [in]
93 ///
94 static void x_HitToConstraints(vector<size_t>& constraint,
95  CHit *hit)
96 {
97  if (!(hit->HasSubHits())) {
98  x_AddConstraint(constraint, hit);
99  }
100  else {
101  // every sub-hit gets its own constraint
102 
103  NON_CONST_ITERATE(CHit::TSubHit, subitr, hit->GetSubHit()) {
104  x_AddConstraint(constraint, *subitr);
105  }
106  }
107 }
108 
109 
110 /// Compute the residue frequencies for a specified range of a collection
111 /// of sequences
112 /// @param freq_data The computed frequencies [out]
113 /// @param query_data List of all sequences [in]
114 /// @param node_list List of suubset of sequences in query_data
115 /// that will contribute to the final residue
116 /// frequencies [in]
117 /// @param range Sequnece range for computation of frequencies, full sequence
118 /// if range limits are smaller than zero [in]
119 ///
120 static void
121 x_FillResidueFrequencies(double **freq_data,
122  vector<CSequence>& query_data,
123  vector<CTree::STreeLeaf>& node_list,
124  TRange range = TRange(-1, -1))
125 {
126  _ASSERT(range.GetFrom() < 0 || range.GetTo() >= range.GetFrom());
127 
128  double sum = 0.0;
129 
130  // sum all the distances from the (implicit) tree root
131  // to all sequences in node_list
132 
133  for (size_t i = 0; i < node_list.size(); i++) {
134  sum += node_list[i].distance;
135  }
136  _ASSERT(sum >= 0.0);
137 
138  for (size_t i = 0; i < node_list.size(); i++) {
139 
140  // update the residue frequencies to include the influence
141  // of a new sequence at a leaf in the tree
142 
143  int index = node_list[i].query_idx;
144  double weight = node_list[i].distance / sum;
145 
146  // the residue frequencies of the sequence are scaled by
147  // the fraction of 'sum' taken up by sequence i in the list.
148  // Since node_list stores *reciprocal* distances, this penalizes
149  // sequences further away from the tree root
150 
151  if (node_list[i].distance == 0 && sum == 0)
152  weight = 1;
153  else
154  weight = node_list[i].distance / sum;
155 
156  int start = range.GetFrom() >= 0 ? range.GetFrom() : 0;
157  int size = range.GetFrom() >= 0 ? range.GetTo() - range.GetFrom() + 1
158  : query_data[index].GetLength();
159 
160  CSequence::TFreqMatrix& matrix = query_data[index].GetFreqs();
161 
162  // add in the residue frequencies
163 
164  _ASSERT(start + size <= query_data[index].GetLength());
165  for (int j = 0; j < size; j++) {
166  if (query_data[index].GetLetter(start + j) == CSequence::kGapChar) {
167  freq_data[j][0] += weight;
168  }
169  else {
170  for (int k = 0; k < kAlphabetSize; k++) {
171  freq_data[j][k] += weight * matrix(start + j, k);
172  }
173  }
174  }
175  }
176 }
177 
178 /// Normalize the residue frequencies so that each column
179 /// sums to 1.0
180 /// @param freq_data The residue frequencies [in/modified]
181 /// @param freq_size Number of columns in freq_data [in]
182 ///
183 static void
184 x_NormalizeResidueFrequencies(double **freq_data,
185  int freq_size)
186 {
187  for (int i = 0; i < freq_size; i++) {
188  double sum = 0.0;
189 
190  // Compute the total weight of each row
191 
192  for (int j = 0; j < kAlphabetSize; j++) {
193  sum += freq_data[i][j];
194  }
195  // Gaps are not allowed in initial queries
196  _ASSERT(sum > 0.0);
197 
198  sum = 1.0 / sum;
199  for (int j = 0; j < kAlphabetSize; j++) {
200  freq_data[i][j] *= sum;
201  }
202  }
203 }
204 
205 /// Find a profile position for a input sequence position
206 static int
208 {
209  int i = 0;
210  int s = -1;
211  for (;i < seq.GetLength() && s < seq_pos;i++) {
212  if (seq.GetLetter(i) != CSequence::kGapChar) {
213  s++;
214  }
215  }
216  if (i >= seq.GetLength()) {
217  return -1;
218  }
219 
220  return i - 1;
221 }
222 
223 /// A pair of integers, defined to privide increment and less-then operators
224 class CIntPair : public pair<int, int>
225 {
226 public:
227 
228  CIntPair(int f, int s) : pair<int, int>(f, s) {}
229 
230  /// Increment both elements
232  {
233  first++;
234  second++;
235  return *this;
236  }
237 
238  /// Increment both elements
240  {
241  CIntPair result(this->first, this->second);
242  ++(*this);
243  return result;
244  }
245 
246  /// True if each element of this is smaller than the corresponding element
247  /// of p
248  bool operator<(const CIntPair& p) const
249  { return first < p.first && second < p.second;}
250 };
251 
252 
253 /// Translate ungapped alignment range for pair-wise sequence alignment in
254 /// input sequence coordinates into ungapped matching ranges on profiles.
255 /// If at least one of the sequences in the profile has gaps withing the
256 /// input matching range, there will be a series of the resulting profile
257 /// ranges.
258 static void
259 x_GetProfileMatchRanges(const TRange& seq_range1, const TRange seq_range2,
260  CSequence& seq1, CSequence& seq2,
261  vector<CMultiAligner::TRangePair>& prof_ranges)
262 {
263  // positions in sequence (seq1, seq2)
264  CIntPair s(seq_range1.GetFrom(), seq_range2.GetFrom());
265  // positions in profile
266  CIntPair p(0, 0);
267  // profile lengths
268  CIntPair len(seq1.GetLength(), seq2.GetLength());
269  // ends of the sequence ranges
270  CIntPair end_seq(seq_range1.GetTo(), seq_range2.GetTo());
271 
273 
274  // find profile positions for begining of the sequence ranges
275  p.first = s_SeqToProfilePosition(s.first, seq1);
276  p.second = s_SeqToProfilePosition(s.second, seq2);
277  _ASSERT(p.first >= 0 && p.second >= 0);
278  if (p.first < 0 || p.second < 0) {
279  return;
280  }
281 
282  // start a pair of profile ranges
283  r.first.SetFrom(p.first);
284  r.second.SetFrom(p.second);
285 
286  ++p;
287  ++s;
288  while (p < len && s < end_seq) {
289 
290  // while we are within input sequences ranges and profile sequences
291  // do not have gaps
292  while (p < len && s < end_seq
293  && seq1.GetLetter(p.first) != CSequence::kGapChar
294  && seq2.GetLetter(p.second) != CSequence::kGapChar) {
295  ++p;
296  ++s;
297  }
298  if ((p.first >= len.first && s.first < end_seq.first)
299  || (p.second >= len.second && s.second < end_seq.second)) {
300  return;
301  }
302  // either a gap was encountered or the a sequence range has ended,
303  // close the profile ranges
304  r.first.SetTo(p.first - 1);
305  r.second.SetTo(p.second - 1);
306  _ASSERT(r.first.GetLength() == r.second.GetLength());
307  prof_ranges.push_back(r);
308 
309  // skip all profile postitions where at least one sequence has a gap
310  while (p.first < len.first
311  && seq1.GetLetter(p.first) == CSequence::kGapChar) {
312 
313  p.first++;
314  }
315 
316  while (p.second < len.second
317  && seq2.GetLetter(p.second) == CSequence::kGapChar) {
318 
319  p.second++;
320  }
321 
322  // start a new pair of profile ranges
323  r.first.SetFrom(p.first);
324  r.second.SetFrom(p.second);
325  }
326 }
327 
328 /// Convert a sequence range to reflect gaps added to the
329 /// underlying sequence
330 /// @param range [in][out]
331 /// @param seq The sequence [in]
332 ///
333 static void
335  CSequence& seq)
336 {
337  int len = seq.GetLength();
338  int i, offset;
339 
340  // convert range.From
341 
342  offset = -1;
343  for (i = 0; i < len; i++) {
344  if (seq.GetLetter(i) != CSequence::kGapChar)
345  offset++;
346  if (offset == range.GetFrom()) {
347  range.SetFrom(i);
348  break;
349  }
350  }
351 
352  // convert range.To
353 
354  if (offset == range.GetTo()) {
355  range.SetTo(i);
356  return;
357  }
358  for (i++; i < len; i++) {
359  if (seq.GetLetter(i) != CSequence::kGapChar)
360  offset++;
361  if (offset == range.GetTo()) {
362  range.SetTo(i);
363  break;
364  }
365  }
366  _ASSERT(i < len);
367 }
368 
369 /// Find the set of constraints to use for a profile-profile
370 /// alignment. This routine considers only constraints between
371 /// one collection of equal-size sequences and another collection
372 /// of equal-size sequences
373 /// @param constraint List of compute constraints, in the format
374 /// expected by CPSSMAligner [out]
375 /// @param alignment Current multiple alignment of sequences [in]
376 /// @param node_list1 List of sequences in first collection [in]
377 /// @param node_list2 List of sequences in second collection [in]
378 /// @param pair_info List of pairwise constraints (between sequences) [in]
379 /// @param iteration The iteration number [in]
380 ///
381 void
382 CMultiAligner::x_FindConstraints(vector<size_t>& constraint,
383  vector<CSequence>& alignment,
384  vector<CTree::STreeLeaf>& node_list1,
385  vector<CTree::STreeLeaf>& node_list2,
386  CNcbiMatrix<CHitList>& pair_info,
387  int iteration)
388 {
389  const int kMinAlignLength = 11;
390  CHitList profile_hitlist;
391 
392  // first collect all of the constraints that pass from
393  // one sequence collection to the other
394 
395  for (int i = 0; i < (int)node_list1.size(); i++) {
396 
397  for (int j = 0; j < (int)node_list2.size(); j++) {
398 
399  int seq1 = node_list1[i].query_idx;
400  int seq2 = node_list2[j].query_idx;
401 
402  // for all constraints between seqience i in collection 1
403  // and sequence j in collection 2
404 
405  for (int k = 0; k < pair_info(seq1, seq2).Size(); k++) {
406  CHit *hit = pair_info(seq1, seq2).GetHit(k);
407  CHit *new_hit = new CHit(seq1, seq2);
408  bool swap_ranges = (seq1 != hit->m_SeqIndex1);
409 
410  // make a temporary hit to represent the constraint
411 
412  new_hit->m_Score = hit->m_Score;
413  if (swap_ranges) {
414  new_hit->m_SeqRange1 = hit->m_SeqRange2;
415  new_hit->m_SeqRange2 = hit->m_SeqRange1;
416  }
417  else {
418  new_hit->m_SeqRange1 = hit->m_SeqRange1;
419  new_hit->m_SeqRange2 = hit->m_SeqRange2;
420  }
421 
422  // add in any sub-hits if this is a domain hit (i.e.
423  // all of the individual blocks of the constraint are
424  // treated together)
425 
426  NON_CONST_ITERATE(CHit::TSubHit, subitr, hit->GetSubHit()) {
427  CHit *subhit = *subitr;
428  CHit *new_subhit = new CHit(seq1, seq2);
429  if (swap_ranges) {
430  new_subhit->m_SeqRange1 = subhit->m_SeqRange2;
431  new_subhit->m_SeqRange2 = subhit->m_SeqRange1;
432  }
433  else {
434  new_subhit->m_SeqRange1 = subhit->m_SeqRange1;
435  new_subhit->m_SeqRange2 = subhit->m_SeqRange2;
436  }
437  new_hit->InsertSubHit(new_subhit);
438  }
439  profile_hitlist.AddToHitList(new_hit);
440  }
441 
442  // check for interrupt
445  "Alignment interrupted");
446  }
447  }
448  }
449 
450  if (profile_hitlist.Empty())
451  return;
452 
453  // convert the range of each constraint into a range on
454  // the aligned sequences, i.e. change each set of bounds
455  // to account for gaps that have since been added to the
456  // sequences
457 
458  for (int i = 0; i < profile_hitlist.Size(); i++) {
459  CHit *hit = profile_hitlist.GetHit(i);
460  x_ExpandRange(hit->m_SeqRange1, alignment[hit->m_SeqIndex1]);
461  x_ExpandRange(hit->m_SeqRange2, alignment[hit->m_SeqIndex2]);
462 
463  NON_CONST_ITERATE(CHit::TSubHit, subitr, hit->GetSubHit()) {
464  CHit *subhit = *subitr;
465  x_ExpandRange(subhit->m_SeqRange1, alignment[subhit->m_SeqIndex1]);
466  x_ExpandRange(subhit->m_SeqRange2, alignment[subhit->m_SeqIndex2]);
467  }
468 
469  // check for interrupt
472  "Alignment interrupted");
473  }
474  }
475 
476  // put the constraints in canonical order
477 
478  profile_hitlist.SortByStartOffset();
479 
480  //--------------------------------------
481  if (m_Options->GetVerbose()) {
482  printf("possible constraints (offsets wrt input profiles):\n");
483  for (int i = 0; i < profile_hitlist.Size(); i++) {
484  CHit *hit = profile_hitlist.GetHit(i);
485  if (hit->m_Score != 1)
486  printf("query %3d %4d - %4d query %3d %4d - %4d score %d %c\n",
487  hit->m_SeqIndex1,
488  hit->m_SeqRange1.GetFrom(), hit->m_SeqRange1.GetTo(),
489  hit->m_SeqIndex2,
490  hit->m_SeqRange2.GetFrom(), hit->m_SeqRange2.GetTo(),
491  hit->m_Score,
492  hit->HasSubHits() ? 'd' : 'f');
493  }
494  }
495  //--------------------------------------
496 
497  // build up a graph of constraints that is derived from
498  // profile_hitlist
499 
500  vector<SGraphNode> graph;
501  for (int j = 0; j < profile_hitlist.Size(); j++) {
502  CHit *hit = profile_hitlist.GetHit(j);
503 
504  // for the first progressive alignment only, ignore
505  // constraints that are too small. We do not do this
506  // in later iterations because the constraints will include
507  // low-scoring pattern hits and conserved columns, which
508  // must be included
509 
510  if (iteration == 0 && hit->m_Score < 1000 &&
511  (hit->m_SeqRange1.GetLength() < kMinAlignLength ||
512  hit->m_SeqRange2.GetLength() < kMinAlignLength) )
513  continue;
514 
515  // find out whether constraint j must override a previous
516  // constraint that was added to the graph
517 
518  int i;
519  for (i = 0; i < (int)graph.size(); i++) {
520  CHit *ghit = graph[i].hit;
521 
522  // if constraint i in the graph is already consistent with
523  // constraint j, then no overriding is possible
524 
525  if ((ghit->m_SeqRange1.StrictlyBelow(hit->m_SeqRange1) &&
526  ghit->m_SeqRange2.StrictlyBelow(hit->m_SeqRange2)) ||
527  (hit->m_SeqRange1.StrictlyBelow(ghit->m_SeqRange1) &&
528  hit->m_SeqRange2.StrictlyBelow(ghit->m_SeqRange2)) ) {
529  continue;
530  }
531 
532  // constraint j overwrites constraint i in the graph if
533  // both of its ranges are 'near' constraint i,
534  // the two constraints lie on the same diagonal, and
535  // constraint j has the higher score
536 
537  if ((abs(ghit->m_SeqRange1.GetFrom() -
538  hit->m_SeqRange1.GetFrom()) <= kMinAlignLength / 2 ||
539  abs(ghit->m_SeqRange1.GetTo() -
540  hit->m_SeqRange1.GetTo()) <= kMinAlignLength / 2 ||
541  abs(ghit->m_SeqRange2.GetFrom() -
542  hit->m_SeqRange2.GetFrom()) <= kMinAlignLength / 2 ||
543  abs(ghit->m_SeqRange2.GetTo() -
544  hit->m_SeqRange2.GetTo()) <= kMinAlignLength / 2) &&
545  (ghit->m_SeqRange1.GetFrom() - ghit->m_SeqRange2.GetFrom() ==
546  hit->m_SeqRange1.GetFrom() - hit->m_SeqRange2.GetFrom()) &&
547  (ghit->m_SeqRange1.GetTo() - ghit->m_SeqRange2.GetTo() ==
548  hit->m_SeqRange1.GetTo() - hit->m_SeqRange2.GetTo()) ) {
549 
550  if (ghit->m_Score < hit->m_Score)
551  graph[i].hit = hit;
552  break;
553  }
554  }
555 
556  // constraint j is sufficiently different from the others
557  // in the graph that the graph should expand to include
558  // constraint j
559 
560  if (i == (int)graph.size())
561  graph.push_back(SGraphNode(hit, j));
562 
563  // check for interrupt
566  "Alignment interrupted");
567  }
568  }
569 
570  if (graph.empty())
571  return;
572 
573  // we now have a list of constraints from which a highest-
574  // scoring subset can be selected. First, though, we have to
575  // assign a score to each constraint in the graph
576 
577  if (iteration == 0) {
578 
579  // for the first iteration, the constraints in the graph
580  // have scores assigned to them based on sequence similarity,
581  // and it makes sense to reward constraints that are similar
582  // across many different sequences
583 
584  int num_hits = profile_hitlist.Size();
585  vector<int> list1, list2;
586  list1.reserve(num_hits);
587  list2.reserve(num_hits);
588 
589  // for constraint i
590 
591  for (int i = 0; i < (int)graph.size(); i++) {
592  CHit *ghit = graph[i].hit;
593  list1.clear();
594  list2.clear();
595 
596  // the score in the graph for constraint i is the score
597  // for constraint i, scaled by the product of the number of
598  // sequences in each collection that contain constraints
599  // compatible with constraint i. Two constraints are compatible
600  // if the start and end points of both constraints are 'close'
601  // to each other and lie on the same diagonals
602 
603  for (int j = 0; j < num_hits; j++) {
604  CHit *hit = profile_hitlist.GetHit(j);
605 
606  if ((abs(ghit->m_SeqRange1.GetFrom() -
607  hit->m_SeqRange1.GetFrom()) <= kMinAlignLength / 2 ||
608  abs(ghit->m_SeqRange1.GetTo() -
609  hit->m_SeqRange1.GetTo()) <= kMinAlignLength / 2 ||
610  abs(ghit->m_SeqRange2.GetFrom() -
611  hit->m_SeqRange2.GetFrom()) <= kMinAlignLength / 2 ||
612  abs(ghit->m_SeqRange2.GetTo() -
613  hit->m_SeqRange2.GetTo()) <= kMinAlignLength / 2) &&
614  (ghit->m_SeqRange1.GetFrom() - ghit->m_SeqRange2.GetFrom()==
615  hit->m_SeqRange1.GetFrom() - hit->m_SeqRange2.GetFrom()) &&
616  (ghit->m_SeqRange1.GetTo() - ghit->m_SeqRange2.GetTo() ==
617  hit->m_SeqRange1.GetTo() - hit->m_SeqRange2.GetTo()) ) {
618 
619  // constraint j is compatible with constraint
620  // i. If constraint j is between a different pair
621  // of sequences than we've seen so far, it
622  // constributes to the score of constraint i
623 
624  int k;
625  for (k = 0; k < (int)list1.size(); k++) {
626  if (list1[k] == hit->m_SeqIndex1)
627  break;
628  }
629  if (k == (int)list1.size())
630  list1.push_back(hit->m_SeqIndex1);
631 
632  for (k = 0; k < (int)list2.size(); k++) {
633  if (list2[k] == hit->m_SeqIndex2)
634  break;
635  }
636  if (k == (int)list2.size())
637  list2.push_back(hit->m_SeqIndex2);
638  }
639  }
640 
641  // scale the score of constraint i to reward the situation
642  // when multiple different constraints between the two
643  // sequence collection agree with constraint i
644 
645  graph[i].best_score = graph[i].hit->m_Score *
646  list1.size() * list2.size();
647  // check for interrupt
650  "Alignment interrupted");
651  }
652  }
653  }
654  else {
655 
656  // iterations beyond the first contain constraints whose
657  // scores were assigned arbitraily. In this case, just
658  // propagate those scores directly to the graph
659 
660  NON_CONST_ITERATE(vector<SGraphNode>, itr, graph) {
661  itr->best_score = itr->hit->m_Score;
662  }
663  }
664 
665  // find the highest-scoring consistent subset of constraints
666 
667  SGraphNode *best_path = x_FindBestPath(graph);
668  while (best_path != 0) {
669  CHit *hit = best_path->hit;
670 
671  //--------------------------------------
672  if (m_Options->GetVerbose()) {
673  printf("pick query %3d %4d - %4d query %3d %4d - %4d\n",
674  hit->m_SeqIndex1,
675  hit->m_SeqRange1.GetFrom(), hit->m_SeqRange1.GetTo(),
676  hit->m_SeqIndex2,
677  hit->m_SeqRange2.GetFrom(), hit->m_SeqRange2.GetTo());
678  }
679  //--------------------------------------
680 
681  // convert the constraint into a form the aligner can use
682 
683  x_HitToConstraints(constraint, hit);
684  best_path = best_path->path_next;
685  }
686  _ASSERT(!constraint.empty());
687 }
688 
689 
690 /// Find constraint to use for profile to profile alignment in clusters
691 ///
692 /// Finds in-cluster constraint (blastp hit) between two most similar sequences
693 /// from each profile.
694 /// @param alignment Current multiple alignment of sequences [in]
695 /// @param node_list1 List of sequences in first collection [in]
696 /// @param node_list2 List of sequences in second collection [in]
697 /// @param pair_info List of pairwise constraints (between sequences) [in]
698 /// @param match_ranges A list of pairs of ungapped blastp alignment ranges
699 /// on the profiles [out]
700 void CMultiAligner::x_FindInClusterConstraints(vector<CSequence>& alignment,
701  vector<CTree::STreeLeaf>& node_list1,
702  vector<CTree::STreeLeaf>& node_list2,
703  CNcbiMatrix<CHitList>& pair_info,
704  vector<TRangePair>& match_ranges) const
705 {
706  // Find the pair of most similar sequences, each from another subtree
708 
709  int seq1 = -1, seq2 = -1;
710  double dist = 0.0;
711  for (size_t i=0;i < node_list1.size();i++) {
712  for (size_t j=0;j < node_list2.size();j++) {
713  if (dist > dmat(node_list1[i].query_idx, node_list2[j].query_idx)
714  || seq1 < 0) {
715 
716  seq1 = node_list1[i].query_idx;
717  seq2 = node_list2[j].query_idx;
718  dist = dmat(seq1, seq2);
719  }
720  }
721  }
722  _ASSERT(seq1 != seq2);
723 
724  // Exit if no contraints found
725  if (pair_info(seq1, seq2).Size() == 0) {
726  return;
727  }
728 
729  CHit* hit = pair_info(seq1, seq2).GetHit(0);
730  for (int k=1;k < pair_info(seq1, seq2).Size();k++) {
731  if (pair_info(seq1, seq2).GetHit(k)->m_Score > hit->m_Score) {
732  hit = pair_info(seq1, seq2).GetHit(k);
733  }
734  }
735 
736  // get matching renges for pair-wise sequence alignent
737  // these are in sequence coordinates
738  vector<TOffsetPair> seq_match_regions(
741  hit->m_SeqRange2.GetFrom())));
742 
743  // Translate the match ranges into profile coordinates. Gaps within
744  // the sequence in the profile break up the matching ranges, hence
745  // translating a pair of ranges on the sequences results in a series of
746  // pairs of ranges on the profiles.
747  _ASSERT(seq_match_regions.size() >= 2);
748  for (size_t i=0;i < seq_match_regions.size();i+=2) {
749  _ASSERT(i < seq_match_regions.size() - 1);
750  _ASSERT(seq_match_regions[i + 1].first - seq_match_regions[i].first
751  == seq_match_regions[i + 1].second - seq_match_regions[i].second);
752 
753  // extract sequence ranges
754  TRange seq_range1(seq_match_regions[i].first,
755  seq_match_regions[i + 1].first);
756  TRange seq_range2(seq_match_regions[i].second,
757  seq_match_regions[i + 1].second);
758 
759  // translate matching sequence ranges into matching profile ranges
760  x_GetProfileMatchRanges(seq_range1, seq_range2,
761  alignment[hit->m_SeqIndex1],
762  alignment[hit->m_SeqIndex2],
763  match_ranges);
764  }
765 
766  //--------------------------------------
767  if (m_Options->GetVerbose()) {
768  printf("possible constraints (offsets wrt input profiles):\n");
769  printf("query %3d %4d - %4d query %3d %4d - %4d score %d\n",
770  hit->m_SeqIndex1,
771  match_ranges.front().first.GetFrom(),
772  match_ranges.back().first.GetTo(),
773  hit->m_SeqIndex2,
774  match_ranges.front().second.GetFrom(),
775  match_ranges.back().second.GetTo(),
776  hit->m_Score);
777  }
778  //--------------------------------------
779 }
780 
781 
782 /// Align two collections of sequences. All sequences within
783 /// a single collection begin with the same size
784 /// @param node_list1 List of sequence number in first collection [in]
785 /// @param node_list2 List of sequence number in second collection [in]
786 /// @param alignment Complete list of aligned sequences (may contain
787 /// sequences that will not be aligned). On output, new gaps
788 /// will be propagated to all affected sequences [in][out]
789 /// @param pair_info Constraints that may be used in the alignment [in]
790 /// @param iteration The iteration number [in]
791 ///
792 void
794  vector<CTree::STreeLeaf>& node_list1,
795  vector<CTree::STreeLeaf>& node_list2,
796  vector<CSequence>& alignment,
797  CNcbiMatrix<CHitList>& pair_info,
798  int iteration)
799 {
800  double **freq1_data;
801  double **freq2_data;
803 
804  int freq1_size = alignment[node_list1[0].query_idx].GetLength();
805  int freq2_size = alignment[node_list2[0].query_idx].GetLength();
806 
807  //---------------------------
808  if (m_Options->GetVerbose()) {
809  printf("\nalign profile (size %d) with profile (size %d)\n",
810  freq1_size, freq2_size);
811  }
812  //---------------------------
813 
814  // build a set of residue frequencies for the
815  // sequences in the left subtree
816 
817  freq1_data = new double* [freq1_size];
818  freq1_data[0] = new double[kAlphabetSize * freq1_size];
819 
820  for (int i = 1; i < freq1_size; i++)
821  freq1_data[i] = freq1_data[0] + kAlphabetSize * i;
822 
823  memset(freq1_data[0], 0, kAlphabetSize * freq1_size * sizeof(double));
824  x_FillResidueFrequencies(freq1_data, alignment, node_list1);
825  x_NormalizeResidueFrequencies(freq1_data, freq1_size);
826 
827  // build a set of residue frequencies for the
828  // sequences in the right subtree
829 
830  freq2_data = new double* [freq2_size];
831  freq2_data[0] = new double[kAlphabetSize * freq2_size];
832 
833  for (int i = 1; i < freq2_size; i++)
834  freq2_data[i] = freq2_data[0] + kAlphabetSize * i;
835 
836  memset(freq2_data[0], 0, kAlphabetSize * freq2_size * sizeof(double));
837  x_FillResidueFrequencies(freq2_data, alignment, node_list2);
838  x_NormalizeResidueFrequencies(freq2_data, freq2_size);
839 
840  // Perform dynamic programming global alignment
841 
842  m_Aligner.SetSequences((const double**)freq1_data, freq1_size,
843  (const double**)freq2_data, freq2_size, kScale);
844  m_Aligner.SetEndSpaceFree(false, false, false, false);
845 
846  vector<size_t> constraint;
847 
848  // determine the list of constraints to use
849 
850  x_FindConstraints(constraint, alignment, node_list1,
851  node_list2, pair_info, iteration);
852 
853  // List the query sequences that participate in
854  // each profile
855 
856 
857  //-------------------------------
858  if (m_Options->GetVerbose()) {
859  printf("constraints: ");
860  for (int i = 0; i < (int)constraint.size(); i+=4) {
861  printf("(seq1 %d seq2 %d)->", (int)constraint[i],
862  (int)constraint[i+2]);
863  }
864  printf("\n");
865  }
866  //-------------------------------
867  m_Aligner.SetPattern(constraint);
868 
869  // if there is a large length disparity between the two
870  // profiles, reduce or eliminate end gap penalties. Also
871  // scale up the gap penalties to match those of the score matrix
872 
879 
880  if (freq1_size > 1.2 * freq2_size ||
881  freq2_size > 1.2 * freq1_size) {
884  }
885  if ((freq1_size > 1.5 * freq2_size ||
886  freq2_size > 1.5 * freq1_size) &&
887  !constraint.empty()) {
888  m_Aligner.SetEndSpaceFree(true, true, true, true);
889  }
890 
891  // run the aligner, scale the penalties back down
892 
893  m_Aligner.Run();
900 
901  delete [] freq1_data[0];
902  delete [] freq1_data;
903  delete [] freq2_data[0];
904  delete [] freq2_data;
905 
906  // Retrieve the traceback information from the alignment
907  // and propagate new gaps to the affected sequences
908 
910  for (int i = 0; i < (int)node_list1.size(); i++) {
911  alignment[node_list1[i].query_idx].PropagateGaps(t,
913  }
914  for (int i = 0; i < (int)node_list2.size(); i++) {
915  alignment[node_list2[i].query_idx].PropagateGaps(t,
917  }
918 
919  //-------------------------------------------
920  if (m_Options->GetVerbose()) {
921  printf(" ");
922  for (int i = 0; i < (int)t.size() / 10; i++)
923  printf("%10d", i + 1);
924  printf("\n ");
925  for (int i = 0; i < (int)t.size(); i++)
926  printf("%d", i % 10);
927  printf("\n\n");
928 
929  for (int i = 0; i < (int)node_list1.size(); i++) {
930  int index = node_list1[i].query_idx;
931  CSequence& query = alignment[index];
932  printf("%3d: ", index);
933  for (int j = 0; j < query.GetLength(); j++) {
934  printf("%c", query.GetPrintableLetter(j));
935  }
936  printf("\n");
937  }
938  printf("\n");
939  for (int i = 0; i < (int)node_list2.size(); i++) {
940  int index = node_list2[i].query_idx;
941  CSequence& query = alignment[index];
942  printf("%3d: ", index);
943  for (int j = 0; j < query.GetLength(); j++) {
944  printf("%c", query.GetPrintableLetter(j));
945  }
946  printf("\n");
947  }
948  }
949  //-------------------------------------------
950 }
951 
952 
953 /// Compute profile profile alignmnet for a ranges of given profiles.
954 /// Resambles x_AlignProfileProfile, but works on sequence ranges,
955 /// does not allow end space free for large sequence lengths difference,
956 /// returns transcript, and does not update the vector of input sequences.
957 /// @param node_list1 List of sequence numbers in first collecton [in]
958 /// @param node_list2 List of sequence numbers in second collection [in]
959 /// @param alignment List of sequences [in]
960 /// @param constraints Constraints for alignment [in]
961 /// @param range1 Range for alignment of the first profile [in]
962 /// @param range2 Range for alignment of the second profile [in]
963 /// @param t Alignmet transcript [out]
965  vector<CTree::STreeLeaf>& node_list1,
966  vector<CTree::STreeLeaf>& node_list2,
967  vector<CSequence>& alignment,
968  vector<size_t>& constraints,
969  const TRange& range1, const TRange& range2,
970  int full_prof_len1, int full_prof_len2,
973 {
974  double **freq1_data;
975  double **freq2_data;
977 
978  int freq1_size = range1.GetTo() - range1.GetFrom() + 1;
979  int freq2_size = range2.GetTo() - range2.GetFrom() + 1;
980 
981  // build a set of residue frequencies for the
982  // sequences in the left subtree
983  freq1_data = new double* [freq1_size];
984  freq1_data[0] = new double[kAlphabetSize * freq1_size];
985 
986  for (int i = 1; i < freq1_size; i++)
987  freq1_data[i] = freq1_data[0] + kAlphabetSize * i;
988 
989  memset(freq1_data[0], 0, kAlphabetSize * freq1_size * sizeof(double));
990  x_FillResidueFrequencies(freq1_data, alignment, node_list1, range1);
991  x_NormalizeResidueFrequencies(freq1_data, freq1_size);
992 
993  // build a set of residue frequencies for the
994  // sequences in the right subtree
995  freq2_data = new double* [freq2_size];
996  freq2_data[0] = new double[kAlphabetSize * freq2_size];
997 
998  for (int i = 1; i < freq2_size; i++) {
999  freq2_data[i] = freq2_data[0] + kAlphabetSize * i;
1000  }
1001 
1002  memset(freq2_data[0], 0, kAlphabetSize * freq2_size * sizeof(double));
1003  x_FillResidueFrequencies(freq2_data, alignment, node_list2, range2);
1004  x_NormalizeResidueFrequencies(freq2_data, freq2_size);
1005 
1006  // Perform dynamic programming global alignment
1007  m_Aligner.SetSequences((const double**)freq1_data, freq1_size,
1008  (const double**)freq2_data, freq2_size, kScale);
1009 
1010  m_Aligner.SetEndSpaceFree(false, false, false, false);
1011  m_Aligner.SetPattern(constraints);
1012 
1019 
1020  // If there is a large disparity between lengths of the two full
1021  // profiles reduce or eliminate end gap penalties. Also
1022  // scale up the gap penalties to match those of the score matrix
1023  // Note that this function aligns left or right margin of the profiles,
1024  // hence gaps penalties are reduced on one side only.
1025  if (full_prof_len1 > 1.2 * full_prof_len2 ||
1026  full_prof_len2 > 1.2 * full_prof_len1) {
1027 
1028  if (strat & fReduceLeft) {
1030  * kScale / 2);
1031  }
1032 
1033  if (strat & fReduceRight) {
1035  * kScale / 2);
1036  }
1037  }
1038 
1039  // run the aligner, scale the penalties back down
1040  m_Aligner.Run();
1047 
1048  delete [] freq1_data[0];
1049  delete [] freq1_data;
1050  delete [] freq2_data[0];
1051  delete [] freq2_data;
1052 
1053  t = m_Aligner.GetTranscript(false);
1054 }
1055 
1057  vector<CTree::STreeLeaf>& node_list1,
1058  vector<CTree::STreeLeaf>& node_list2,
1059  vector<CSequence>& alignment,
1060  CNcbiMatrix<CHitList>& pair_info,
1061  int iteration)
1062 {
1063  // If there is a blastp alignment between the most similar sequences,
1064  // then the matching positions from sequence alignment will be also
1065  // matching in the profile alignment. The ranges outsize of the matching
1066  // ranges from blastp alignment are aligned with CPSSMAligner.
1067 
1068  //---------------------------
1069  if (m_Options->GetVerbose()) {
1070  printf("\nalign profile (size %d) with profile (size %d)\n",
1071  alignment[node_list1[0].query_idx].GetLength(),
1072  alignment[node_list2[0].query_idx].GetLength());
1073  }
1074  //---------------------------
1075 
1076  // Find pair-wise constraints and ranges of matching positions from the
1077  // constraint alignment translated into profile coordinates
1078  vector<TRangePair> match_ranges;
1079  x_FindInClusterConstraints(alignment, node_list1, node_list2,
1080  pair_info, match_ranges);
1081 
1082  // Perform standard profile-profile alignment if no constraints are found
1083  if (match_ranges.empty()) {
1084  x_AlignProfileProfile(node_list1, node_list2, alignment, pair_info,
1085  iteration);
1086  string message = "No significant alignments were found for cluster"
1087  " containing sequences: ";
1088  ITERATE(vector<CTree::STreeLeaf>, it, node_list1) {
1089  message += NStr::IntToString(it->query_idx) + ", ";
1090  }
1091  message += " and ";
1092  ITERATE(vector<CTree::STreeLeaf>, it, node_list2) {
1093  message += NStr::IntToString(it->query_idx) + ", ";
1094  }
1095  message += ". Decreasing maximum in-cluster distance or turing off"
1096  " clustering option may improve results.";
1097  m_Messages.push_back(message);
1098 
1099  return;
1100  }
1101 
1102  //-------------------------------
1103  if (m_Options->GetVerbose()) {
1104  ITERATE (vector<TRangePair>, rit, match_ranges) {
1105  printf("in-cluster constraints: "
1106  "(seq1 %d seq2 %d)->(seq1 %d seq2 %d)\n",
1107  rit->first.GetFrom(), rit->second.GetFrom(),
1108  rit->first.GetTo(), rit->second.GetTo());
1109  }
1110  printf("\n");
1111  }
1112  //-------------------------------
1113 
1114  // Align profiles using the hit information
1115 
1116  // transcript will hold information on profile alignment
1117  CNWAligner::TTranscript transcr;
1118 
1119  // input profile lengths
1120  int prof1_length = alignment[node_list1.front().query_idx].GetLength();
1121  int prof2_length = alignment[node_list2.front().query_idx].GetLength();
1122 
1123  // Take care of the left margin: parts of the profiles not included in the
1124  // constraint;
1125  // if both profiles have non-zero left margin length, compute alignment
1126  if (match_ranges.front().first.GetFrom() > 0
1127  && match_ranges.front().second.GetFrom() > 0) {
1128 
1129  TRange range1(0, match_ranges.front().first.GetFrom() - 1);
1130  TRange range2(0, match_ranges.front().second.GetFrom() - 1);
1131 
1132  // if the fast alignment option is requested, then assume these
1133  // ranges in both profiles are unaligned
1134  if (m_Options->GetFastAlign()) {
1135  for (int i=0;i < range2.GetLength();i++) {
1136  transcr.push_back(CNWAligner::eTS_Insert);
1137  }
1138  for (int i=0;i < range1.GetLength();i++) {
1139  transcr.push_back(CNWAligner::eTS_Delete);
1140  }
1141  }
1142  else {
1143  // otherwise align profiles
1144  vector<size_t> constr;
1146  x_ComputeProfileRangeAlignment(node_list1, node_list2, alignment,
1147  constr, range1, range2,
1148  prof1_length, prof2_length, fReduceLeft,
1149  transcr);
1150  }
1151  }
1152  else {
1153  // otherwise put approriate gaps in the transcript
1154  int len1 = match_ranges.front().first.GetFrom();
1155  int len2 = match_ranges.front().second.GetFrom();
1156 
1157  for (int i=0; i < len1;i++) {
1158  transcr.push_back(CNWAligner::eTS_Delete);
1159  }
1160  for (int i=0; i < len2;i++) {
1161  transcr.push_back(CNWAligner::eTS_Insert);
1162  }
1163  }
1164 
1165 
1166  // iterate over the matching ranges from the constraint pair-wise alignment
1167  vector<TRangePair>::const_iterator it(match_ranges.begin());
1168 
1169  // process the first matching range: put matching positions in transcript
1170  for (int i=0;i < it->first.GetLength();i++) {
1171  transcr.push_back(CNWAligner::eTS_Match);
1172  }
1173 
1174  vector<TRangePair>::const_iterator prev_it(it);
1175  ++it;
1176 
1177  // for each following matching range
1178  for (; it != match_ranges.end(); ++it, ++prev_it) {
1179  _ASSERT(it->first.GetLength() == it->second.GetLength());
1180 
1181  // get space between current and previous matching range
1182  TRange space1(prev_it->first.GetToOpen(), it->first.GetFrom() - 1);
1183  TRange space2(prev_it->second.GetToOpen(), it->second.GetFrom() - 1);
1184 
1185  // if the space is empty in the first profile, put insertions into
1186  // the transcript
1187  if (space1.Empty()) {
1188  for (int i=0;i < space2.GetLength();i++) {
1189  transcr.push_back(CNWAligner::eTS_Insert);
1190  }
1191  }
1192  // if the space is empty in the second profile, put deletions into
1193  // the transcript
1194  else if (space2.Empty()) {
1195  for (int i=0;i < space1.GetLength();i++) {
1196  transcr.push_back(CNWAligner::eTS_Delete);
1197  }
1198  }
1199  // otherwise compute alignment for these spaces
1200  else if (space1.NotEmpty() && space2.NotEmpty()) {
1201 
1202  // if the fast alignment option is requested, then assume these
1203  // ranges in both profiles are unaligned
1204  if (m_Options->GetFastAlign()) {
1205  for (int i=0;i < space2.GetLength();i++) {
1206  transcr.push_back(CNWAligner::eTS_Insert);
1207  }
1208  for (int i=0;i < space1.GetLength();i++) {
1209  transcr.push_back(CNWAligner::eTS_Delete);
1210  }
1211  }
1212  else {
1213  vector<size_t> constr;
1215  x_ComputeProfileRangeAlignment(node_list1, node_list2,
1216  alignment, constr,
1217  space1, space2,
1218  prof1_length,
1219  prof2_length,
1220  fReduceBoth, tr);
1221 
1223  transcr.push_back(*t);
1224  }
1225  }
1226  }
1227 
1228  // process matching range: add matching positions to the transcript
1229  for (int i=0;i < it->first.GetLength();i++) {
1230  transcr.push_back(CNWAligner::eTS_Match);
1231  }
1232  }
1233 
1234  // take care of the right margin: profile ranges outside of the constraint
1235  // at the right end of the profiles
1236  if (prof1_length - match_ranges.back().first.GetTo() - 1 > 0
1237  && prof2_length - match_ranges.back().second.GetTo() - 1 > 0) {
1238 
1239  TRange seq1_range(match_ranges.back().first.GetTo() + 1,
1240  prof1_length - 1);
1241  TRange seq2_range(match_ranges.back().second.GetTo() + 1,
1242  prof2_length - 1);
1243 
1244  // if the fast alignment option is requested, then assume these
1245  // ranges in both profiles are unaligned
1246  if (m_Options->GetFastAlign()) {
1247  for (int i=0;i < seq2_range.GetLength();i++) {
1248  transcr.push_back(CNWAligner::eTS_Insert);
1249  }
1250  for (int i=0;i < seq1_range.GetLength();i++) {
1251  transcr.push_back(CNWAligner::eTS_Delete);
1252  }
1253  }
1254  else {
1255  vector<size_t> constr;
1257  x_ComputeProfileRangeAlignment(node_list1, node_list2, alignment,
1258  constr, seq1_range, seq2_range,
1259  prof1_length, prof2_length,
1260  fReduceRight, t);
1261 
1263  transcr.push_back(*it);
1264  }
1265  }
1266  }
1267  else {
1268 
1269  // Put gaps
1270  int len1 = prof1_length - match_ranges.back().first.GetTo() - 1;
1271  int len2 = prof2_length - match_ranges.back().second.GetTo() - 1;
1272 
1273  for (int i=0; i < len1;i++) {
1274  transcr.push_back(CNWAligner::eTS_Delete);
1275  }
1276  for (int i=0; i < len2;i++) {
1277  transcr.push_back(CNWAligner::eTS_Insert);
1278  }
1279  }
1280 
1281  // Propagate gaps in all profile sequences
1282  for (int i=0;i < (int)node_list1.size();i++) {
1283  alignment[node_list1[i].query_idx].PropagateGaps(transcr,
1285  }
1286  for (int i=0;i < (int)node_list2.size();i++) {
1287  alignment[node_list2[i].query_idx].PropagateGaps(transcr,
1289  }
1290 
1291  //-------------------------------------------
1292  if (m_Options->GetVerbose()) {
1293  CNWAligner::TTranscript& t = transcr;
1294  printf(" ");
1295  for (int i = 0; i < (int)t.size() / 10; i++)
1296  printf("%10d", i + 1);
1297  printf("\n ");
1298  for (int i = 0; i < (int)t.size(); i++)
1299  printf("%d", i % 10);
1300  printf("\n\n");
1301 
1302  for (int i = 0; i < (int)node_list1.size(); i++) {
1303  int index = node_list1[i].query_idx;
1304  CSequence& query = alignment[index];
1305  printf("%3d: ", index);
1306  for (int j = 0; j < query.GetLength(); j++) {
1307  printf("%c", query.GetPrintableLetter(j));
1308  }
1309  printf("\n");
1310  }
1311  printf("\n");
1312  for (int i = 0; i < (int)node_list2.size(); i++) {
1313  int index = node_list2[i].query_idx;
1314  CSequence& query = alignment[index];
1315  printf("%3d: ", index);
1316  for (int j = 0; j < query.GetLength(); j++) {
1317  printf("%c", query.GetPrintableLetter(j));
1318  }
1319  printf("\n");
1320  }
1321  }
1322  //-------------------------------------------
1323 }
1324 
1325 
1326 /// Number of 'letters' in the reduced alphabet
1327 static const int kNumClasses = 10;
1328 
1329 /// Background frequencies of the letters in the reduced
1330 /// alphabet (each is the sum of the Robinson-Robinson
1331 /// frequencies of the underlying protein letters that
1332 /// appear in that letter class)
1333 ///
1334 static const double kDerivedFreqs[kNumClasses] = {
1335  0.000000, 0.019250, 0.073770, 0.052030, 0.228450,
1336  0.084020, 0.021990, 0.207660, 0.108730, 0.204100
1337 };
1338 
1339 /// Mapping from protein letters to reduced alphabet letters
1340 static const int kRes2Class[kAlphabetSize] = {
1341  0, 7, 9, 1, 9, 9, 5, 2, 6, 4, 8, 4, 4,
1342  9, 3, 9, 8, 7, 7, 4, 5, 0, 5, 9, 0, 0
1343 };
1344 
1345 /// Calculate the entropy score of one column of a multiple
1346 /// alignment (see the COBALT papaer for details)
1347 /// @param align The alignment [in]
1348 /// @param col the column number to score
1349 /// @return the computed score
1350 double
1351 CMultiAligner::x_GetScoreOneCol(vector<CSequence>& align, int col)
1352 {
1353  int count[kNumClasses];
1354  double freq[kNumClasses];
1355  int num_queries = align.size();
1356 
1357  double r = 1.0 / num_queries;
1358 
1359  for (int j = 0; j < kNumClasses; j++)
1360  count[j] = 0;
1361 
1362  for (int j = 0; j < num_queries; j++)
1363  count[kRes2Class[align[j].GetLetter(col)]]++;
1364 
1365  for (int j = 1; j < kNumClasses; j++)
1366  freq[j] = count[j] * r;
1367 
1368  double H1 = 0;
1369  double H2 = 0;
1370  double H3 = 0;
1371  double pseudocount = m_Options->GetPseudocount();
1372  for (int j = 1; j < kNumClasses; j++) {
1373  if (count[j] > 0 && kDerivedFreqs[j] > 0) {
1374  H1 += freq[j] * log((num_queries - 1) / pseudocount +
1375  (count[j] - 1) / kDerivedFreqs[j]);
1376 
1377  H2 += freq[j];
1378  H3 += kDerivedFreqs[j];
1379  }
1380  }
1381 
1382  return H1 - H2 * log((num_queries - 1) *
1383  (pseudocount + H3) / pseudocount);
1384 }
1385 
1386 
1387 /// Compute the entropy score of a multiple alignment
1388 /// @param align complete multiple alignment [in]
1389 /// @return the alignment score
1390 ///
1391 double
1392 CMultiAligner::x_GetScore(vector<CSequence>& align)
1393 {
1394  int align_len = align[0].GetLength();
1395  double H = 0;
1396 
1397  for (int i = 0; i < align_len; i++)
1398  H += x_GetScoreOneCol(align, i);
1399 
1400  return H;
1401 }
1402 
1403 
1404 /// Perform a single bipartition on a multiple alignment
1405 /// @param input_cluster A tree describing sequences that form one half
1406 /// of the bipartition [in]
1407 /// @param alignment A complete multiple alignment, updated with the
1408 /// the bipartition results if they have a higher score [in][out]
1409 /// @param pair_info Pairwise constraints on the alignment [in]
1410 /// @param score Starting alignment score [in]
1411 /// @param iteration The iteration number [in]
1412 /// @return The score of the new multiple alignment
1413 ///
1414 double
1416  const TPhyTreeNode *input_cluster,
1417  vector<CSequence>& alignment,
1418  CNcbiMatrix<CHitList>& pair_info,
1419  double score,
1420  int iteration)
1421 {
1422  int num_queries = m_QueryData.size();
1423 
1424  // list all the sequences in the tree
1425 
1426  vector<CTree::STreeLeaf> full_seq_list;
1427  CTree::ListTreeLeaves(GetTree(), full_seq_list, 0.0);
1428 
1429  // list the sequence that form one of the clusters to align
1430 
1431  vector<CTree::STreeLeaf> cluster_seq_list;
1432  CTree::ListTreeLeaves(input_cluster, cluster_seq_list, 0.0);
1433 
1434  //--------------------------------
1435  if (m_Options->GetVerbose()) {
1436  printf("cluster: ");
1437  for (int i = 0; i < (int)cluster_seq_list.size(); i++) {
1438  printf("%d ", cluster_seq_list[i].query_idx);
1439  }
1440  printf("\n");
1441  }
1442  //--------------------------------
1443 
1444  vector<int> cluster_idx;
1445  vector<int> other_idx;
1446  vector<CTree::STreeLeaf> other_seq_list;
1447  int cluster_size = cluster_seq_list.size();
1448 
1449  // other_seq_list get the OIDs of sequences that do
1450  // not participate in cluster_seq_list
1451 
1452  for (int i = 0; i < num_queries; i++) {
1453  int j;
1454  for (j = 0; j < cluster_size; j++) {
1455  if (full_seq_list[i].query_idx ==
1456  cluster_seq_list[j].query_idx) {
1457  break;
1458  }
1459  }
1460  if (j == cluster_size) {
1461  other_seq_list.push_back(full_seq_list[i]);
1462  other_idx.push_back(full_seq_list[i].query_idx);
1463  }
1464  }
1465  for (int i = 0; i < cluster_size; i++) {
1466  cluster_idx.push_back(cluster_seq_list[i].query_idx);
1467  }
1468 
1469  vector<CSequence> tmp_align = alignment;
1470 
1471  // squeeze out all-gap columns from each cluster, then
1472  // realign the two clusters
1473 
1474  CSequence::CompressSequences(tmp_align, cluster_idx);
1475  CSequence::CompressSequences(tmp_align, other_idx);
1476  x_AlignProfileProfile(cluster_seq_list, other_seq_list,
1477  tmp_align, pair_info, iteration);
1478 
1479  // replace the input alignment with the alignment result
1480  // if the latter has a higher score
1481 
1482  double new_score = x_GetScore(tmp_align);
1483  if (m_Options->GetVerbose())
1484  printf("realigned score = %f\n\n", new_score);
1485 
1486  if (new_score > score) {
1487  score = new_score;
1488  alignment.swap(tmp_align);
1489  }
1490 
1491  return score;
1492 }
1493 
1494 
1495 /// Find locations of gaps in cluster prototype
1496 /// @param cluster Cluster [in]
1497 /// @param query_data Current alignment of input sequences [in]
1498 /// @param gaps List of gap range locations [out]
1500  const vector<CSequence>& query_data,
1501  vector<TRange>& gaps)
1502 {
1503  gaps.clear();
1504 
1505  const CSequence& seq = query_data[cluster.GetPrototype()];
1506  for (int i=0;i < seq.GetLength();i++) {
1507  if (seq.GetLetter(i) == CSequence::kGapChar) {
1508  TRange range;
1509  range.SetFrom(i);
1510  while (i < seq.GetLength()
1511  && seq.GetLetter(i) == CSequence::kGapChar) {
1512 
1513  i++;
1514  }
1515  i--;
1516  range.SetTo(i);
1517  gaps.push_back(range);
1518  }
1519  }
1520 }
1521 
1522 /// Main driver for progressive alignment
1523 /// @param tree Alignment guide tree [in]
1524 /// @param query_data The sequences to align. The first call assumes
1525 /// this list contains the unaligned sequences, and
1526 /// intermediate alignment stages will update the list
1527 /// as alignment progresses. On output query_data contains
1528 /// the aligned version of all sequences [in][out]
1529 /// @param pair_info List of alignment constraints [in]
1530 /// @param iteration The iteration number [in]
1531 /// @param is_cluster Is the curretly traversed node inside a cluster
1532 /// subtree [in]
1533 void
1535  const TPhyTreeNode *tree,
1536  vector<CSequence>& query_data,
1537  CNcbiMatrix<CHitList>& pair_info,
1538  int iteration, bool is_cluster)
1539 {
1540 
1541  // Nodes with id >= kClusterNodeId are roots of cluster subtrees
1542  if (tree->GetValue().GetId() >= kClusterNodeId) {
1543  is_cluster = true;
1544  }
1545 
1546  TPhyTreeNode::TNodeList_CI child(tree->SubNodeBegin());
1547 
1548  // recursively convert each subtree into a multiple alignment
1549 
1550  const TPhyTreeNode *left_child = *child++;
1551  if (!left_child->IsLeaf())
1552  x_AlignProgressive(left_child, query_data,
1553  pair_info, iteration, is_cluster);
1554 
1555  const TPhyTreeNode *right_child = *child;
1556  if (!right_child->IsLeaf())
1557  x_AlignProgressive(right_child, query_data,
1558  pair_info, iteration, is_cluster);
1559 
1560  // align the two subtrees
1561 
1562  vector<CTree::STreeLeaf> node_list1, node_list2;
1563  CTree::ListTreeLeaves(left_child, node_list1,
1564  left_child->GetValue().GetDist());
1565  CTree::ListTreeLeaves(right_child, node_list2,
1566  right_child->GetValue().GetDist());
1567 
1568  // Use different alignment procedure inside clusters
1569  if (is_cluster && iteration == 0) {
1570  x_AlignProfileProfileUsingHit(node_list1, node_list2,
1571  query_data, pair_info, iteration);
1572  }
1573  else {
1574  x_AlignProfileProfile(node_list1, node_list2,
1575  query_data, pair_info, iteration);
1576  }
1577 
1578  // check for interrupt
1581  "Alignment interrupted");
1582  }
1583 
1584  // If root of a cluster tree then set RPS frequencies for cluster seuquences
1585  // Node id > kClusterNodeId denotes root of a cluster tree
1586  if (tree->GetValue().GetId() >= kClusterNodeId && !m_DomainHits.Empty()) {
1587 
1588  int index = tree->GetValue().GetId() - kClusterNodeId;
1589  const CClusterer::CSingleCluster& cluster
1590  = m_Clusterer.GetClusters()[index];
1591 
1592  vector<TRange> gaps;
1593  x_GetClusterGapLocations(cluster, query_data, gaps);
1594  x_AddRpsFreqsToCluster(cluster, query_data, gaps);
1595  }
1596 }
1597 
1598 
1599 void
1601  vector<CSequence>& query_data,
1602  const vector<TRange>& gaps)
1603 {
1604  // Cdd frequencies must be added to all cluster sequencies, because
1605  // at each profile-to-profile alignment frequencies are taken from
1606  // individual sequences
1607 
1608  _ASSERT(cluster.GetPrototype() < (int)m_RPSLocs.size());
1609 
1610  // Get RPS frequencies for cluster prototype
1611  const CSequence& prot = m_QueryData[cluster.GetPrototype()];
1612  const CSequence::TFreqMatrix& rps_freqs = prot.GetFreqs();
1613 
1614  int offset = 0;
1615  vector<TRange>::const_iterator gap_it(gaps.begin());
1616 
1617  double domain_res_freq_boost = m_Options->GetDomainResFreqBoost();
1618 
1619  // For each range with RPS frequencies
1620  ITERATE(vector<TRange>, it, m_RPSLocs[cluster.GetPrototype()]) {
1621 
1622  // iterate through each specific location
1623  for (int i=it->GetFrom();i < it->GetTo();i++) {
1624 
1625  // update difference between unaligned and aligned locations in
1626  // prototype sequence
1627  while (gap_it != gaps.end() && gap_it->GetFrom() < i + offset) {
1628  offset += gap_it->GetTo() - gap_it->GetFrom() + 1;
1629  gap_it++;
1630  }
1631 
1632  // for each cluster element except for cluster prototype
1633  ITERATE(CClusterer::CSingleCluster, elem, cluster) {
1634  if (*elem == cluster.GetPrototype()) {
1635  continue;
1636  }
1637 
1638  CSequence& seq = query_data[*elem];
1639  CSequence::TFreqMatrix& matrix = seq.GetFreqs();
1640  _ASSERT(rps_freqs.GetRows() + offset <= matrix.GetRows());
1641 
1642  // assign RPS frequencies
1643  for (int j=0;j < (int)matrix.GetCols();j++) {
1644 
1645  if (seq.GetLetter(i + offset) != CSequence::kGapChar) {
1646  _ASSERT(rps_freqs(i, j) >= 0.0);
1647 
1648  matrix(i + offset, j) = rps_freqs(i, j);
1649  }
1650  }
1651 
1652  // if cluster sequence has different letter than prototype
1653  if (seq.GetLetter(i + offset) != prot.GetLetter(i)) {
1654 
1655  // remove domain frequency boost assigned to prototype
1656  matrix(i + offset, prot.GetLetter(i))
1657  -= domain_res_freq_boost;
1658 
1659  // assign conserved domain frequency boost
1660  if (seq.GetLetter(i + offset) != CSequence::kGapChar) {
1661  matrix(i + offset, seq.GetLetter(i + offset))
1662  += domain_res_freq_boost;
1663  }
1664  }
1665  }
1666  }
1667  }
1668 }
1669 
1670 
1671 /// Create a list of constraints that reflect conserved columns
1672 /// in a multiple alignment
1673 /// @param new_alignment The multiple alignment to analyze [in]
1674 /// @param conserved The list of pairwise constraints [out]
1675 ///
1676 void
1678  vector<CSequence>& new_alignment,
1679  CHitList& conserved)
1680 {
1681  int num_queries = new_alignment.size();
1682  int align_length = new_alignment[0].GetLength();
1683  vector<double> hvec(align_length);
1684  vector<TRange> chosen_cols;
1685  int i, j, k, m;
1686 
1687  // find and save the score of each column
1688 
1689  for (i = 0; i < align_length; i++) {
1690  hvec[i] = x_GetScoreOneCol(new_alignment, i);
1691  }
1692 
1693  // use the above to find all the ranges of consecutive
1694  // columns whose score all exceed the cutoff for being conserved.
1695 
1696  for (i = 0; i < align_length; i++) {
1697  if (hvec[i] >= m_Options->GetConservedCutoffScore()) {
1698  if (!chosen_cols.empty() &&
1699  chosen_cols.back().GetTo() == i-1) {
1700  chosen_cols.back().SetTo(i);
1701  }
1702  else {
1703  chosen_cols.push_back(TRange(i, i));
1704  }
1705  }
1706  }
1707 
1708  // A conserved range must be at least 2 columns
1709 
1710  for (i = j = 0; j < (int)chosen_cols.size(); j++) {
1711  if (chosen_cols[j].GetLength() > 1) {
1712  chosen_cols[i++] = chosen_cols[j];
1713  }
1714  }
1715  chosen_cols.resize(i);
1716 
1717  //-------------------------------------
1718  if (m_Options->GetVerbose()) {
1719  for (i = 0; i < (int)chosen_cols.size(); i++) {
1720  printf("constraint at position %3d - %3d\n",
1721  chosen_cols[i].GetFrom(), chosen_cols[i].GetTo());
1722  }
1723  }
1724  //-------------------------------------
1725 
1726  vector<TRange> range_nogap(num_queries);
1727 
1728  for (i = 0; i < (int)chosen_cols.size(); i++) {
1729 
1730  TRange& curr_range = chosen_cols[i];
1731  int start = curr_range.GetFrom();
1732  int length = curr_range.GetLength();
1733 
1734  // find the coordinates of range i on each of the
1735  // unaligned sequences
1736 
1737  for (j = 0; j < num_queries; j++) {
1738 
1739  // a sequence that participates in the constraints
1740  // cannot have a gap within range i
1741 
1742  for (k = 0; k < length; k++) {
1743  if (new_alignment[j].GetLetter(start+k) == CSequence::kGapChar)
1744  break;
1745  }
1746  if (k < length) {
1747  range_nogap[j].SetEmpty();
1748  continue;
1749  }
1750 
1751  // locate the start and end range on sequence j
1752 
1753  int offset1 = -1;
1754  for (m = 0; m <= curr_range.GetFrom(); m++) {
1755  if (new_alignment[j].GetLetter(m) != CSequence::kGapChar)
1756  offset1++;
1757  }
1758 
1759  int offset2 = offset1;
1760  for (; m <= curr_range.GetTo(); m++) {
1761  if (new_alignment[j].GetLetter(m) != CSequence::kGapChar)
1762  offset2++;
1763  }
1764  range_nogap[j].Set(offset1, offset2);
1765  }
1766 
1767  // convert range i of conserved columns into an
1768  // all-against-all collection of pairwise constraints
1769 
1770  for (k = 0; k < num_queries - 1; k++) {
1771  for (m = k + 1; m < num_queries; m++) {
1772  if (!range_nogap[k].Empty() && !range_nogap[m].Empty()) {
1773  conserved.AddToHitList(new CHit(k, m, range_nogap[k],
1774  range_nogap[m], 1000,
1775  CEditScript()));
1776  }
1777  }
1778  }
1779  }
1780 
1781  //------------------------------
1782  if (m_Options->GetVerbose()) {
1783  printf("Per-column score\n");
1784  for (i = 0; i < align_length; i++) {
1785  printf("%6.2f ", hvec[i]);
1786  if ((i+1)%10 == 0)
1787  printf("\n");
1788  }
1789  printf("\n");
1790  }
1791  //------------------------------
1792 }
1793 
1794 
1795 // remove pointers to constraints from pair-wise lists
1797  int num_queries)
1798 {
1799  for (int i = 0; i < num_queries; i++) {
1800  for (int j = 0; j < num_queries; j++) {
1801  pair_info(i, j).ResetList();
1802  }
1803  }
1804 }
1805 
1806 /// Main driver for the progressive alignment process
1807 /// @param edges List of tree edges sorted by decreasing edge length [in]
1808 /// @param cluster_cutoff The minimum distance a tree edge must have
1809 /// to be the subject of a bipartition [in]
1810 ///
1811 void
1813  vector<CTree::STreeEdge>& edges,
1814  double cluster_cutoff)
1815 {
1816  int num_queries = m_QueryData.size();
1817  int iteration = 0;
1818  int conserved_cols;
1819  int new_conserved_cols;
1820  double new_score = 0.0;
1821  double best_score = INT4_MIN;
1822 
1823  // the initial list of constraints consists of the
1824  // filtered list of blast alignments plus any user-defined
1825  // constraints and in-cluster constraints
1826 
1827  CNcbiMatrix<CHitList> pair_info(num_queries, num_queries, CHitList());
1828  for (int i = 0; i < m_CombinedHits.Size(); i++) {
1829  CHit *hit = m_CombinedHits.GetHit(i);
1830  pair_info(hit->m_SeqIndex1, hit->m_SeqIndex2).AddToHitList(hit);
1831  pair_info(hit->m_SeqIndex2, hit->m_SeqIndex1).AddToHitList(hit);
1832  }
1833  for (int i = 0; i < m_UserHits.Size(); i++) {
1834  CHit *hit = m_UserHits.GetHit(i);
1835  pair_info(hit->m_SeqIndex1, hit->m_SeqIndex2).AddToHitList(hit);
1836  pair_info(hit->m_SeqIndex2, hit->m_SeqIndex1).AddToHitList(hit);
1837  }
1838  for (int i = 0; i < m_LocalInClusterHits.Size();i++) {
1840  pair_info(hit->m_SeqIndex1, hit->m_SeqIndex2).AddToHitList(hit);
1841  pair_info(hit->m_SeqIndex2, hit->m_SeqIndex1).AddToHitList(hit);
1842  }
1843  CHitList conserved_regions;
1844 
1845  conserved_cols = 0;
1846  vector<CSequence> tmp_aligned = m_QueryData;
1847 
1848 
1849  try {
1850  // perform the initial progressive alignment
1851 
1852  x_AlignProgressive(GetTree(), tmp_aligned,
1853  pair_info, iteration, false);
1854 
1855  while (1) {
1856 
1857  // compute the previous alignment score
1858 
1859  double realign_score = x_GetScore(tmp_aligned);
1860  if (m_Options->GetVerbose())
1861  printf("start score: %f\n", realign_score);
1862 
1863  // repeat the complete bipartition process until
1864  // either the best score stops improving or we've done
1865  // too many bipartitions
1866 
1867  int num_tries = m_Options->GetRealign() ? 5 : 0;
1868  for (int i = 0; i < num_tries; i++) {
1869  new_score = realign_score;
1870 
1871  // a single bipartition consists of systematically
1872  // breaking the largest edges in the tree and realigning
1873  // the subtrees to either side of that edge. Repeat
1874  // until the edges get too small
1875 
1876  for (int j = 0; j < (int)edges.size() &&
1877  edges[j].distance >= cluster_cutoff; j++) {
1878 
1879  new_score = x_RealignSequences(edges[j].node,
1880  tmp_aligned,
1881  pair_info,
1882  new_score,
1883  iteration);
1884  }
1885 
1886  // quit if the best score from bipartition i improved
1887  // the score of realignment i-1 by less than 2%
1888 
1889  if (new_score - realign_score <= 0.02 * fabs(realign_score)) {
1890  break;
1891  }
1892  realign_score = new_score;
1893  }
1894  if (m_Options->GetRealign()) {
1895  realign_score = max(realign_score, new_score);
1896  }
1897 
1898  //-------------------------------------------------
1899  if (m_Options->GetVerbose()) {
1900  for (int i = 0; i < num_queries; i++) {
1901  for (int j = 0; j < (int)tmp_aligned[i].GetLength(); j++) {
1902  printf("%c", tmp_aligned[i].GetPrintableLetter(j));
1903  }
1904  printf("\n");
1905  }
1906  printf("score = %f\n", realign_score);
1907  }
1908  //-------------------------------------------------
1909 
1910  if (realign_score > best_score) {
1911 
1912  // the current iteration has improved the alignment
1913 
1914  if (m_Options->GetVerbose()) {
1915  printf("REPLACE ALIGNMENT\n\n");
1916  }
1917  best_score = realign_score;
1918  m_Results = tmp_aligned; // will always happen at least once
1919 
1920  if (!m_Options->GetIterate())
1921  break;
1922 
1924  }
1925 
1927  && iteration >= 1) {
1928  break;
1929  }
1930 
1931  // if iteration is allowed: recompute the conserved
1932  // columns based on the new alignment first remove
1933  // the last batch of conserved regions
1934 
1935  for (int i = 0; i < num_queries; i++) {
1936  for (int j = 0; j < num_queries; j++) {
1937  pair_info(i, j).ResetList();
1938  }
1939  }
1940  conserved_regions.PurgeAllHits();
1941 
1942  x_FindConservedColumns(tmp_aligned, conserved_regions);
1943 
1944  // build up the list of conserved columns again, using
1945  // phi-pattern constraints, user-defined constraints and
1946  // the current collection of columns marked as conserved
1947 
1948  new_conserved_cols = 0;
1949  for (int i = 0; i < m_PatternHits.Size(); i++) {
1950  CHit *hit = m_PatternHits.GetHit(i);
1951  pair_info(hit->m_SeqIndex1, hit->m_SeqIndex2).AddToHitList(hit);
1952  pair_info(hit->m_SeqIndex2, hit->m_SeqIndex1).AddToHitList(hit);
1953  new_conserved_cols += hit->m_SeqRange1.GetLength();
1954  }
1955  for (int i = 0; i < m_UserHits.Size(); i++) {
1956  CHit *hit = m_UserHits.GetHit(i);
1957  pair_info(hit->m_SeqIndex1, hit->m_SeqIndex2).AddToHitList(hit);
1958  pair_info(hit->m_SeqIndex2, hit->m_SeqIndex1).AddToHitList(hit);
1959  new_conserved_cols += hit->m_SeqRange1.GetLength();
1960  }
1961  for (int i = 0; i < conserved_regions.Size(); i++) {
1962  CHit *hit = conserved_regions.GetHit(i);
1963  pair_info(hit->m_SeqIndex1, hit->m_SeqIndex2).AddToHitList(hit);
1964  pair_info(hit->m_SeqIndex2, hit->m_SeqIndex1).AddToHitList(hit);
1965  new_conserved_cols += hit->m_SeqRange1.GetLength();
1966  }
1967 
1968  // only perform another iteration of the number of
1969  // columns participating in constraints has increased
1970 
1971  if (new_conserved_cols <= conserved_cols)
1972  break;
1973 
1974  iteration++;
1975  conserved_cols = new_conserved_cols;
1976  tmp_aligned = m_QueryData;
1977 
1978  // do the next progressive alignment
1979 
1980  x_AlignProgressive(GetTree(), tmp_aligned,
1981  pair_info, iteration, false);
1982  }
1983 
1984  m_Score = best_score;
1985  }
1986  catch (std::bad_alloc ex) {
1987  // memory clean up
1988  s_CleanUpConstraints(pair_info, (int)m_QueryData.size());
1989 
1990  NCBI_THROW(CMultiAlignerException, eOutOfMemory, "Out of memory error");
1991  }
1992 
1993  // clean up the constraints
1994  s_CleanUpConstraints(pair_info, num_queries);
1995 }
1996 
1997 
1998 /// Callback for sorting tree edges in order of
1999 /// decreasing edge weight
2001 public:
2003  const CTree::STreeEdge& b) const {
2004  return a.distance > b.distance;
2005  }
2006 };
2007 
2008 
2009 void
2011 {
2013 
2014  // Nodes with id larger of equal to kClusterNodeId are considered root of
2015  // query cluster subtrees.
2016  _ASSERT((int)m_QueryData.size() < kClusterNodeId);
2017 
2018  // write down all the tree edges, along with their weight
2019  // skip edges inside query clusters
2020 
2021  vector<CTree::STreeEdge> edges;
2023  sort(edges.begin(), edges.end(), compare_tree_edge_descending());
2024  int num_edges = edges.size();
2025 
2026  // choose the maximum number of edges that the bipartition
2027  // phase will use; each edge will be split and the subtrees on
2028  // either side of the edge will be realigned
2029 
2030  int num_internal_edges = min(11, (int)(0.3 * num_edges + 0.5));
2031 
2032 
2033  // choose the cluster cutoff; this is the first big jump in
2034  // the length of tree edges, and marks the limit beyond which
2035  // we won't try to bipartition
2036 
2037  double cluster_cutoff = INT4_MAX;
2038  int i;
2039  for (i = 0; i < num_internal_edges - 1; i++) {
2040  if (edges[i].distance > 2 * edges[i+1].distance) {
2041  cluster_cutoff = edges[i].distance;
2042  break;
2043  }
2044  }
2045  if (i == num_internal_edges - 1) {
2046  cluster_cutoff = edges[i].distance;
2047  }
2048 
2049  //---------------------
2050  if (m_Options->GetVerbose()) {
2051  for (i = 0; i < num_edges; i++)
2052  printf("%f ", edges[i].distance);
2053  printf("cutoff = %f\n", cluster_cutoff);
2054  }
2055  //---------------------
2056 
2057  // progressively align all sequences
2058 
2059  x_BuildAlignmentIterative(edges, cluster_cutoff);
2060 }
2061 
2062 END_SCOPE(cobalt)
static const string kScale
static const int kAlphabetSize
The aligner internally works only with the ncbistdaa alphabet.
Definition: base.hpp:119
CLocalRange< TOffset > TRange
define for the fundamental building block of sequence ranges
Definition: base.hpp:115
pair< TOffset, TOffset > TOffsetPair
Basic type specifying a range on a sequence.
Definition: base.hpp:52
int GetPrototype(void) const
Get cluster prototype.
Definition: clusterer.hpp:103
const TDistMatrix & GetDistMatrix(void) const
Get distance matrix.
Definition: clusterer.cpp:118
const TClusters & GetClusters(void) const
Get clusters.
Definition: clusterer.hpp:272
Interface for the traceback from blast hits.
Definition: traceback.hpp:55
vector< TOffsetPair > ListMatchRegions(TOffsetPair start_offsets)
Compile a list of regions in the current edit script that contain substitutions.
Definition: traceback.cpp:444
An ordered collection of CHit objects.
Definition: hitlist.hpp:50
int Size() const
Retrieve number of hits in list.
Definition: hitlist.hpp:75
void PurgeAllHits()
Delete all hits unconditionally.
Definition: hitlist.hpp:148
bool Empty()
Determine whether a list contains no hits.
Definition: hitlist.hpp:79
CHit * GetHit(int index)
Retrieve a hit from the hitlist.
Definition: hitlist.hpp:93
void SortByStartOffset()
Sort the hits in the hitlist by increasing sequence1 index, then by increasing sequence1 start offset...
Definition: hitlist.cpp:379
void AddToHitList(CHit *hit)
Append a hit to the hitlist.
Definition: hitlist.hpp:84
A generalized representation of a pairwise alignment.
Definition: hit.hpp:86
TSubHit & GetSubHit()
Retrieve a list of subhits.
Definition: hit.hpp:185
void InsertSubHit(CHit *hit)
Add a to a CHit's list of subhits.
Definition: hit.hpp:180
int m_Score
Score of alignment.
Definition: hit.hpp:104
CEditScript & GetEditScript()
Retrieve the traceback associated with a CHit.
Definition: hit.hpp:190
int m_SeqIndex1
Numerical identifier for first sequence in alignment.
Definition: hit.hpp:97
int m_SeqIndex2
Numerical identifier for second sequence in alignment.
Definition: hit.hpp:101
TRange m_SeqRange1
The range of offsets on the first sequence.
Definition: hit.hpp:107
TRange m_SeqRange2
The range of offsets on the second sequence.
Definition: hit.hpp:110
bool HasSubHits()
Query if a CHit has a hierarchy of subhits available.
Definition: hit.hpp:195
vector< CHit * > TSubHit
Hits can be grouped hierarchically.
Definition: hit.hpp:93
A pair of integers, defined to privide increment and less-then operators.
Definition: prog.cpp:225
bool operator<(const CIntPair &p) const
True if each element of this is smaller than the corresponding element of p.
Definition: prog.cpp:248
CIntPair & operator++(void)
Increment both elements.
Definition: prog.cpp:231
CIntPair(int f, int s)
Definition: prog.cpp:228
CIntPair operator++(int)
Increment both elements.
Definition: prog.cpp:239
bool StrictlyBelow(const TThisType &r)
Test whether 'this' represents a sequence range strictly less than a given sequence range.
Definition: base.hpp:100
bool GetFastAlign(void) const
Check if fast alignment is to be used.
Definition: options.hpp:723
TScore GetEndGapExtendPenalty(void) const
Get gap extension penalty for end gaps in pairwise global alignment of profiles.
Definition: options.hpp:658
double GetConservedCutoffScore(void) const
Get cutoff score for conserved aligned columns.
Definition: options.hpp:554
bool GetIterate(void) const
Check if iterative alignmnet option is used.
Definition: options.hpp:337
double GetPseudocount(void) const
Get pseudocount for calculating column entropy.
Definition: options.hpp:565
TScore GetGapExtendPenalty(void) const
Get gap extension penlaty for middle gaps in pairwise global alignment of profiles.
Definition: options.hpp:632
TScore GetGapOpenPenalty(void) const
Get gap opening penalty for middle gaps in pairwise global alignment of profiles.
Definition: options.hpp:619
TScore GetEndGapOpenPenalty(void) const
Get gap opening penalty for end gaps in pairwise global alignment of profiles.
Definition: options.hpp:645
bool GetRealign(void) const
Check if MSA is to be realigned for different rooting of progressive alignment tree.
Definition: options.hpp:343
@ eMulti
Alignment guide tree for each cluster is attached to the main alignment guide tree.
Definition: options.hpp:269
bool GetVerbose(void) const
Get verbose mode.
Definition: options.hpp:691
double GetDomainResFreqBoost(void) const
Get boost for residue frequencies in conserved domains from RPS data base.
Definition: options.hpp:514
EEndGapCostStrategy
Strategy for reducing end gap penalties for profile-profile alignment.
Definition: cobalt.hpp:553
@ fReduceLeft
Reduce penalty only for left end gaps.
Definition: cobalt.hpp:555
@ fReduceBoth
Reduce penalty for both end gaps.
Definition: cobalt.hpp:559
@ fReduceRight
Reduce penalty only for right end gaps.
Definition: cobalt.hpp:557
CMultiAlignerOptions::EInClustAlnMethod m_ClustAlnMethod
Definition: cobalt.hpp:744
SProgress m_ProgressMonitor
Definition: cobalt.hpp:737
SGraphNode * x_FindBestPath(vector< SGraphNode > &nodes)
Find a maximum weight path in a directed acyclic graph.
Definition: seg.cpp:52
void x_AlignProfileProfileUsingHit(vector< CTree::STreeLeaf > &node_list1, vector< CTree::STreeLeaf > &node_list2, vector< CSequence > &alignment, CNcbiMatrix< CHitList > &pair_info, int iteration)
Align two profiles with all sequences that belong to the same cluster.
Definition: prog.cpp:1056
void x_AddRpsFreqsToCluster(const CClusterer::CSingleCluster &cluster, vector< CSequence > &query_data, const vector< TRange > &gaps)
Definition: prog.cpp:1600
void x_BuildAlignmentIterative(vector< CTree::STreeEdge > &edges, double cluster_cutoff)
Main driver for the progressive alignment process.
Definition: prog.cpp:1812
CHitList m_CombinedHits
Definition: cobalt.hpp:718
CHitList m_LocalInClusterHits
Definition: cobalt.hpp:720
void x_FindConservedColumns(vector< CSequence > &new_alignment, CHitList &conserved)
Create a list of constraints that reflect conserved columns in a multiple alignment.
Definition: prog.cpp:1677
const TPhyTreeNode * GetTree(void) const
Get ree used guide in progressive alignment.
Definition: cobalt.hpp:245
@ eOutOfMemory
Out of memory error.
Definition: cobalt.hpp:84
@ eInterrupt
Alignment interruped through callback function.
Definition: cobalt.hpp:83
vector< CSequence > m_QueryData
Definition: cobalt.hpp:691
void x_FindInClusterConstraints(vector< CSequence > &alignment, vector< CTree::STreeLeaf > &node_list1, vector< CTree::STreeLeaf > &node_list2, CNcbiMatrix< CHitList > &pair_info, vector< TRangePair > &match_ranges) const
Find constraint to use for profile to profile alignment in clusters.
Definition: prog.cpp:700
vector< vector< TRange > > m_RPSLocs
Definition: cobalt.hpp:731
CHitList m_UserHits
Definition: cobalt.hpp:721
CClusterer m_Clusterer
Definition: cobalt.hpp:712
void x_ComputeProfileRangeAlignment(vector< CTree::STreeLeaf > &node_list1, vector< CTree::STreeLeaf > &node_list2, vector< CSequence > &alignment, vector< size_t > &constraints, const TRange &range1, const TRange &range2, int full_prof_len1, int full_prof_len2, EEndGapCostStrategy strat, CNWAligner::TTranscript &t)
Compute profile profile alignmnet for a ranges of given profiles.
Definition: prog.cpp:964
vector< CSequence > m_Results
Definition: cobalt.hpp:703
CHitList m_DomainHits
Definition: cobalt.hpp:716
CConstRef< CMultiAlignerOptions > m_Options
Definition: cobalt.hpp:686
void x_AlignProfileProfile(vector< CTree::STreeLeaf > &node_list1, vector< CTree::STreeLeaf > &node_list2, vector< CSequence > &alignment, CNcbiMatrix< CHitList > &pair_info, int iteration)
Align two collections of sequences.
Definition: prog.cpp:793
double x_GetScoreOneCol(vector< CSequence > &align, int col)
Calculate the entropy score of one column of a multiple alignment (see the COBALT papaer for details)
Definition: prog.cpp:1351
vector< string > m_Messages
Definition: cobalt.hpp:739
void x_AlignProgressive(const TPhyTreeNode *tree, vector< CSequence > &query_data, CNcbiMatrix< CHitList > &pair_info, int iteration, bool is_cluster)
Main driver for progressive alignment.
Definition: prog.cpp:1534
int m_Score
Alignment score.
Definition: cobalt.hpp:706
CHitList m_PatternHits
Definition: cobalt.hpp:719
void x_BuildAlignment()
Given the current domain, local, pattern and user hits, along with the current tree,...
Definition: prog.cpp:2010
void x_FindConstraints(vector< size_t > &constraint, vector< CSequence > &alignment, vector< CTree::STreeLeaf > &node_list1, vector< CTree::STreeLeaf > &node_list2, CNcbiMatrix< CHitList > &pair_info, int iteration)
Find the set of constraints to use for a profile-profile alignment.
Definition: prog.cpp:382
pair< TRange, TRange > TRangePair
Definition: cobalt.hpp:116
double x_GetScore(vector< CSequence > &align)
Compute the entropy score of a multiple alignment.
Definition: prog.cpp:1392
static const int kRpsScaleFactor
Definition: cobalt.hpp:500
CPSSMAligner m_Aligner
Definition: cobalt.hpp:710
@ eIterativeAlignment
Definition: cobalt.hpp:98
@ eProgressiveAlignment
Definition: cobalt.hpp:97
static const int kClusterNodeId
Definition: cobalt.hpp:747
double x_RealignSequences(const TPhyTreeNode *input_cluster, vector< CSequence > &alignment, CNcbiMatrix< CHitList > &pair_info, double score, int iteration)
Perform a single bipartition on a multiple alignment.
Definition: prog.cpp:1415
FInterruptFn m_Interrupt
Definition: cobalt.hpp:736
size_t GetRows() const
get the number of rows in this matrix
Definition: matrix.hpp:298
size_t GetCols() const
get the number of columns in this matrix
Definition: matrix.hpp:305
Class for representing protein sequences.
Definition: seq.hpp:54
int GetLength() const
Get the length of the current sequence.
Definition: seq.hpp:125
unsigned char GetLetter(int pos) const
Access the sequence letter at a specified position.
Definition: seq.hpp:107
TFreqMatrix & GetFreqs()
Access the list of position frequencies associated with a sequence.
Definition: seq.hpp:89
static void CompressSequences(vector< CSequence > &seq, vector< int > index_list)
Given a collection of sequences, remove all sequence positions where a subset of the sequences all co...
Definition: seq.cpp:232
static const unsigned char kGapChar
The ncbistdaa code for a gap.
Definition: seq.hpp:58
definition of a Culling tree
Definition: ncbi_tree.hpp:100
static void ListTreeEdges(const TPhyTreeNode *node, vector< STreeEdge > &edge_list, int max_id=-1)
Traverse a tree below a given starting point, listing all edges encountered along the way.
Definition: tree.cpp:79
static void ListTreeLeaves(const TPhyTreeNode *node, vector< STreeLeaf > &node_list, double curr_dist=0)
Traverse a tree below a given starting point, listing all leaves encountered along the way.
Definition: tree.cpp:98
Callback for sorting tree edges in order of decreasing edge weight.
Definition: prog.cpp:2000
bool operator()(const CTree::STreeEdge &a, const CTree::STreeEdge &b) const
Definition: prog.cpp:2002
Interface for CMultiAligner.
bool Empty(const CNcbiOstrstream &src)
Definition: fileutil.cpp:523
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:51
int offset
Definition: replacements.h:160
#define H(x, y, z)
Definition: md4.c:180
void SetStartWg(TScore value)
TTranscript GetTranscript(bool reversed=true) const
Definition: nw_aligner.cpp:909
void SetEndWs(TScore value)
virtual CNWAligner::TScore Run(void)
void SetEndWg(TScore value)
vector< ETranscriptSymbol > TTranscript
Definition: nw_aligner.hpp:199
void SetWs(TScore value)
void SetSequences(const char *seq1, size_t len1, const char *seq2, size_t len2, bool verify=true)
void SetWg(TScore value)
void SetPattern(const vector< size_t > &pattern)
Definition: nw_aligner.cpp:845
void SetEndSpaceFree(bool Left1, bool Right1, bool Left2, bool Right2)
Definition: nw_aligner.cpp:192
void SetStartWs(TScore value)
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
position_type GetLength(void) const
Definition: range.hpp:158
bool NotEmpty(void) const
Definition: range.hpp:152
bool Empty(void) const
Definition: range.hpp:148
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
TNodeList::const_iterator TNodeList_CI
Definition: ncbi_tree.hpp:110
bool IsLeaf() const
Report whether this is a leaf node.
Definition: ncbi_tree.hpp:296
const TValue & GetValue(void) const
Return node's value.
Definition: ncbi_tree.hpp:184
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
n font weight
int i
int len
range(_Ty, _Ty) -> range< _Ty >
constexpr auto sort(_Init &&init)
const struct ncbi::grid::netcache::search::fields::SIZE size
#define fabs(v)
Definition: ncbi_dispd.c:46
#define abs(a)
Definition: ncbi_heapmgr.c:130
unsigned int a
Definition: ncbi_localip.c:102
EIPRangeType t
Definition: ncbi_localip.c:101
#define INT4_MAX
largest nubmer represented by signed int
Definition: ncbi_std.h:141
#define INT4_MIN
Smallest (most negative) number represented by signed int.
Definition: ncbi_std.h:146
T max(T x_, T y_)
T min(T x_, T y_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
double f(double x_, const double &y_)
Definition: njn_root.hpp:188
static const int kNumClasses
Number of 'letters' in the reduced alphabet.
Definition: prog.cpp:1327
static void x_NormalizeResidueFrequencies(double **freq_data, int freq_size)
Normalize the residue frequencies so that each column sums to 1.0.
Definition: prog.cpp:184
static void x_HitToConstraints(vector< size_t > &constraint, CHit *hit)
Convert a constraint into a form usable by CPSSMAligner.
Definition: prog.cpp:94
static void x_AddConstraint(vector< size_t > &constraint, CHit *hit)
Convert one hit into a constraint usable by CPSSMAligner.
Definition: prog.cpp:51
static void x_ExpandRange(TRange &range, CSequence &seq)
Convert a sequence range to reflect gaps added to the underlying sequence.
Definition: prog.cpp:334
static void s_CleanUpConstraints(CNcbiMatrix< CHitList > &pair_info, int num_queries)
Definition: prog.cpp:1796
static const int kRes2Class[kAlphabetSize]
Mapping from protein letters to reduced alphabet letters.
Definition: prog.cpp:1340
static void x_GetProfileMatchRanges(const TRange &seq_range1, const TRange seq_range2, CSequence &seq1, CSequence &seq2, vector< CMultiAligner::TRangePair > &prof_ranges)
Translate ungapped alignment range for pair-wise sequence alignment in input sequence coordinates int...
Definition: prog.cpp:259
static void x_FillResidueFrequencies(double **freq_data, vector< CSequence > &query_data, vector< CTree::STreeLeaf > &node_list, TRange range=TRange(-1, -1))
Compute the residue frequencies for a specified range of a collection of sequences.
Definition: prog.cpp:121
static const double kDerivedFreqs[kNumClasses]
Background frequencies of the letters in the reduced alphabet (each is the sum of the Robinson-Robins...
Definition: prog.cpp:1334
static int s_SeqToProfilePosition(int seq_pos, CSequence &seq)
Find a profile position for a input sequence position.
Definition: prog.cpp:207
static void x_GetClusterGapLocations(const CClusterer::CSingleCluster &cluster, const vector< CSequence > &query_data, vector< TRange > &gaps)
Find locations of gaps in cluster prototype.
Definition: prog.cpp:1499
struct SGraphNode * path_next
the optimal path node belongs to
Definition: cobalt.hpp:505
CHit * hit
the alignment represented by this node
Definition: cobalt.hpp:503
EAlignmentStage stage
Definition: cobalt.hpp:103
Structure for listing tree edges.
Definition: tree.hpp:59
static string query
#define _ASSERT
else result
Definition: token2.c:20
Modified on Wed May 22 11:33:19 2024 by modify_doxy.py rev. 669887