NCBI C++ ToolKit
seg.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /*
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's offical duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================*/
25 
26 /*****************************************************************************
27 
28 File name: seg.cpp
29 
30 Author: Jason Papadopoulos
31 
32 Contents: Code that deals with scoring and manipulating
33  alignments ("segments") from BLAST runs
34 
35 ******************************************************************************/
36 
37 #include <ncbi_pch.hpp>
38 #include <algo/cobalt/cobalt.hpp>
39 
40 /// @file seg.cpp
41 /// Code that deals with scoring and manipulating
42 /// alignments ("segments") from BLAST runs
43 
45 BEGIN_SCOPE(cobalt)
46 
47 /// Find a maximum weight path in a directed acyclic graph
48 /// @param nodes The graph [in/modified]
49 /// @return Pointer to the first node in the optimal path
50 ///
52 CMultiAligner::x_FindBestPath(vector<SGraphNode>& nodes)
53 {
54  // Given the nodes of a directed acyclic graph, each
55  // contining a score and referring to a pairwise
56  // alignment between two sequences, find the non-
57  // conflicting subset of the nodes that has the highest
58  // score. Note that the scores optimized here are
59  // local to the graph, and need not depend on the actual
60  // alignment to which a node points.
61  //
62  // The algorithm is a simplified form of dynamic programming,
63  // which relies on the list of alignments being
64  // in sorted order. Optimal paths are built right-to-left,
65  // from the rightmost alignment to the leftmost. For each
66  // node, optimal paths for all nodes to the right have
67  // already been computed, and the task reduces to determining
68  // which path to attach the present node to, in order to
69  // maximize the score of paths starting at the present node.
70  //
71  // Each node stores the best score for the path starting at
72  // that node, and each node lies on at most one optimal path.
73  //
74  // This routine returns a linked list of structures that
75  // give the alignments participating in the optimal path.
76  // The 'best_score' field in the first linked list entry
77  // gives the score of the optimal path.
78 
79  SGraphNode *best_node = NULL;
80  int num_nodes = nodes.size();
81  double best_score = INT4_MIN;
82 
83  // walk backwards through the list of nodes
84 
85  for (int i = num_nodes - 1; i >= 0; i--) {
86 
87  // Find the node that lies strictly to the right of
88  // of node i and is the head of the currently highest-scoring
89  // collection of nodes. Then attach the current node
90  // to it. Update the optimal score for the path that
91  // includes the current node
92 
93  CHit *hit1 = nodes[i].hit;
94  double self_score = nodes[i].best_score;
95  for (int j = i + 1; j < num_nodes; j++) {
96 
97  CHit *hit2 = nodes[j].hit;
98  if (hit1->m_SeqRange1.StrictlyBelow(hit2->m_SeqRange1) &&
99  hit1->m_SeqRange2.StrictlyBelow(hit2->m_SeqRange2) &&
100  nodes[j].best_score + self_score > nodes[i].best_score) {
101 
102  nodes[i].best_score = nodes[j].best_score + self_score;
103  nodes[i].path_next = &(nodes[j]);
104  }
105  }
106 
107  // the current node has been processed. See if a new global
108  // optimum score has been reached
109 
110  if (nodes[i].best_score > best_score) {
111  best_score = nodes[i].best_score;
112  best_node = &(nodes[i]);
113  }
114  }
115 
116  return best_node;
117 }
118 
119 void
121 {
122  // For each pair of sequences, chooses a subset of the
123  // available alignments for which 1) all alignments
124  // occur in order, without crossing, on both sequences; and
125  // 2) the combined score of all alignments is as large as possible.
126  // The following relies on m_CombinedList being sorted in
127  // canonical order
128 
129  int num_queries = m_QueryData.size();
130  CNcbiMatrix< vector<SGraphNode> > nodes(num_queries, num_queries);
131 
132  // first group together alignments to the same pairs of
133  // sequences. Each such collection of alignments becomes
134  // a node in the graph for that sequence pair; the score
135  // of the node is the score of the hit it points to
136 
137  for (int i = 0; i < m_CombinedHits.Size(); i++) {
138 
139  // assume all hits will be deleted
140 
141  m_CombinedHits.SetKeepHit(i, false);
142 
143  CHit *hit = m_CombinedHits.GetHit(i);
144  nodes(hit->m_SeqIndex1, hit->m_SeqIndex2).push_back(SGraphNode(hit, i));
145  nodes(hit->m_SeqIndex1, hit->m_SeqIndex2).back().best_score =
146  hit->m_Score;
147  }
148 
149  // now optimize the graph for each sequence pair
150  // separately
151 
152  for (int i = 0; i < num_queries - 1; i++) {
153 
154  for (int j = i + 1; j < num_queries; j++) {
155 
156  // Find the optimal segment collection for this pair of queries
157 
158  SGraphNode *best_path = x_FindBestPath(nodes(i, j));
159 
160  // signal that the optimal path nodes should not be pruned
161 
162  while (best_path != NULL) {
163  m_CombinedHits.SetKeepHit(best_path->list_pos, true);
164  best_path = best_path->path_next;
165  }
166  }
167  }
168 
169  // Remove hits that do not lie on an optimal path
170 
172 }
173 
174 void
176 {
178 
179  // For each pair of queries, find a maximal-scoring subset
180  // of nonoverlapping alignments
181 
183 
184  //--------------------------------------------------
185  if (m_Options->GetVerbose()) {
186  printf("Saved Segments:\n");
187  for (int i = 0; i < m_CombinedHits.Size(); i++) {
188  CHit *hit = m_CombinedHits.GetHit(i);
189  printf("query %2d %3d - %3d query %2d %3d - %3d score %d\n",
190  hit->m_SeqIndex1,
191  hit->m_SeqRange1.GetFrom(), hit->m_SeqRange1.GetTo(),
192  hit->m_SeqIndex2,
193  hit->m_SeqRange2.GetFrom(), hit->m_SeqRange2.GetTo(),
194  hit->m_Score);
195  }
196  printf("\n\n");
197  }
198  //--------------------------------------------------
199 
200 }
201 
202 END_SCOPE(cobalt)
int Size() const
Retrieve number of hits in list.
Definition: hitlist.hpp:75
void MakeCanonical()
Sort the list in a canonical form: first swap the indices and ranges on all hits and subhits so that ...
Definition: hitlist.cpp:98
void SetKeepHit(int index, bool keep)
Set whether a hit in the hitlist will be scheduled for deletion.
Definition: hitlist.hpp:126
CHit * GetHit(int index)
Retrieve a hit from the hitlist.
Definition: hitlist.hpp:93
void PurgeUnwantedHits()
Delete all hits scheduled to be deleted.
Definition: hitlist.hpp:134
A generalized representation of a pairwise alignment.
Definition: hit.hpp:86
int m_Score
Score of alignment.
Definition: hit.hpp:104
int m_SeqIndex1
Numerical identifier for first sequence in alignment.
Definition: hit.hpp:97
int m_SeqIndex2
Numerical identifier for second sequence in alignment.
Definition: hit.hpp:101
TRange m_SeqRange1
The range of offsets on the first sequence.
Definition: hit.hpp:107
TRange m_SeqRange2
The range of offsets on the second sequence.
Definition: hit.hpp:110
bool StrictlyBelow(const TThisType &r)
Test whether 'this' represents a sequence range strictly less than a given sequence range.
Definition: base.hpp:100
bool GetVerbose(void) const
Get verbose mode.
Definition: options.hpp:691
SGraphNode * x_FindBestPath(vector< SGraphNode > &nodes)
Find a maximum weight path in a directed acyclic graph.
Definition: seg.cpp:52
struct CMultiAligner::SGraphNode SGraphNode
CHitList m_CombinedHits
Definition: cobalt.hpp:718
vector< CSequence > m_QueryData
Definition: cobalt.hpp:691
void x_FindConsistentHitSubset(void)
Find consistent subset of pair-wise hits that can be used as alignment constraints.
Definition: seg.cpp:175
CConstRef< CMultiAlignerOptions > m_Options
Definition: cobalt.hpp:686
void x_FindAlignmentSubsets()
Definition: seg.cpp:120
Interface for CMultiAligner.
#define NULL
Definition: ncbistd.hpp:225
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
int i
#define INT4_MIN
Smallest (most negative) number represented by signed int.
Definition: ncbi_std.h:146
struct SGraphNode * path_next
the optimal path node belongs to
Definition: cobalt.hpp:505
Modified on Wed Mar 27 11:26:45 2024 by modify_doxy.py rev. 669887