NCBI C++ ToolKit
phytree_simplify.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: phytree_simplify.cpp 79933 2017-10-23 15:25:58Z zaretska $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * lavNational Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Greg Boratyn
27  *
28  * File Description: Tree visitor classes for phylogenetic node groupping
29  * and tree simplification
30  */
31 
32 #include <ncbi_pch.hpp>
35 
37 
38 static const string s_kDifferentGroups = "$DIFFERENT_GROUPS";
39 
40 
41 CPhyTreeNodeGroupper::CPhyTreeNodeGroupper(const string& feature_name,
42  const string& feature_color,
44  CNcbiOfstream* ostr)
45  : m_Root(NULL), m_Ostr(ostr)
46 {
47  Init(feature_name, feature_color, tree);
48 }
49 
50 void CPhyTreeNodeGroupper::Init(const string& feature_name,
51  const string& feature_color,
53 {
54  m_LabelFeatureName = feature_name;
55  m_ColorFeatureName = feature_color;
56 
57  const CBioTreeFeatureDictionary& fdict = tree.GetFeatureDict();
58  if (!fdict.HasFeature(m_LabelFeatureName)
59  || !fdict.HasFeature(m_ColorFeatureName)) {
60 
61  m_Error = "Feature not in feature dictionary";
62  }
63  m_LabeledNodes.clear();
64 
65  while (!m_LabelStack.empty()) {
66  m_LabelStack.pop();
67  }
68 
69  while (!m_ParentIdStack.empty()) {
70  m_ParentIdStack.pop();
71  }
72 }
73 
75  CBioTreeDynamic::CBioNode& x_node,
76  int delta)
77 {
78  if (m_Ostr != NULL) {
79  *m_Ostr << "stack top: ";
80  if (m_LabelStack.empty())
81  *m_Ostr << "empty";
82  else
83  *m_Ostr << m_LabelStack.top().first;
84  *m_Ostr << ", num elements on label stack: " << m_LabelStack.size();
85  *m_Ostr << ", num elements on parent stack: " << m_ParentIdStack.size();
86  *m_Ostr << NcbiEndl;
87  }
88 
89  if (!m_Error.empty()) {
90  return eTreeTraverseStop;
91  }
92 
93  if (m_Root == NULL)
94  m_Root = &x_node;
95 
96  switch (delta) {
97  case 0: return x_OnStepDown(x_node);
98  case 1: return x_OnStepRight(x_node);
99  case -1: return x_OnStepLeft(x_node);
100  }
101 
102  return eTreeTraverse;
103 }
104 
106  CBioTreeDynamic::CBioNode& x_node)
107 {
108  //If leaf, then group name is put on stack
109  if (m_Ostr != NULL)
110  *m_Ostr << "x_OnStepRight, Id: " + NStr::IntToString(x_node.GetValue().GetId()) << NcbiEndl;
111 
112  //If this is a leaf, then the group name is saved for comparisons with
113  // other nodes
114  if (x_node.IsLeaf()) {
115  pair<string, string> label_color
116  = make_pair(x_node.GetFeature(m_LabelFeatureName),
117  x_node.GetFeature(m_ColorFeatureName));
118 
119  if (label_color.first.empty() || label_color.second.empty()) {
120  m_Error = "Leafe node has unset feature, Id: "
121  + NStr::IntToString(x_node.GetValue().GetId());
122  return eTreeTraverseStop;
123  }
124 
125  m_LabelStack.push(label_color);
126 
127  if (m_Ostr != NULL)
128  *m_Ostr << "Leaf, m_CurrentGroupName put on stack: "
129  << m_LabelStack.top().first << NcbiEndl;
130 
131  }
132  else {
133  m_LabelStack.push(make_pair(NcbiEmptyString, NcbiEmptyString));
134  }
135 
136  return eTreeTraverse;
137 }
138 
140  CBioTreeDynamic::CBioNode& x_node)
141 {
142  if (m_Ostr != NULL)
143  *m_Ostr << "x_OnStepDown, Id: "
144  + NStr::IntToString(x_node.GetValue().GetId()) << NcbiEndl;
145 
146 
147  if (x_node.IsLeaf()) {
148  //Checking if different groups were alredy found
149  if (m_LabelStack.top().first == s_kDifferentGroups) {
150 
151  if (m_Ostr != NULL)
152  *m_Ostr << "Leaf, m_CurrentGroupName == DIFFERENT_GROUPS"
153  << NcbiEndl;
154 
155  return eTreeTraverse;
156  }
157 
158  const string& name = x_node.GetFeature(m_LabelFeatureName);
159 
160  if (name == NcbiEmptyString) {
161  m_Error = "Leafe node has unset feature, Id: "
162  + NStr::IntToString(x_node.GetValue().GetId());
163  return eTreeTraverseStop;
164  }
165 
166  if (m_Ostr != NULL)
167  *m_Ostr << "Leaf with group name: " << name << NcbiEndl;
168 
169  //If the node's group name is different
170  // this is denoted by changing stack top
171  if (name != m_LabelStack.top().first) {
172  m_LabelStack.top().first = s_kDifferentGroups;
173 
174  if (m_Ostr != NULL)
175  *m_Ostr << " group name different, changing stack top to DIFFERENT_GROUPS"
176  << NcbiEndl;
177 
178 
179 
180  }
181  else {
182  if (m_Ostr != NULL)
183  *m_Ostr << " group name the same, stack top not changed"
184  << NcbiEndl;
185  }
186 
187  }
188 
189  return eTreeTraverse;
190 }
191 
193  CBioTreeDynamic::CBioNode& x_node)
194 {
195  if (m_Ostr != NULL)
196  *m_Ostr << "x_OnStepLeft, Id: " + NStr::IntToString(x_node.GetValue().GetId()) << NcbiEndl;
197 
198  //If stack top holds information about subtree rooted in x_node
199  // empty string means that this is the first subtree examined on this level
200  const pair<string, string> subtree_label_color = m_LabelStack.top();
201 
202  //If subtree name is different from DIFFERENT_GROUPS
203  // then subtree has common group
204  if (subtree_label_color.first != s_kDifferentGroups) {
205 
206  //Checking whether subtree nodes were already marked and unmarking them
207  // so that only the common parent node is marked for each group
208  while (!m_ParentIdStack.empty()
209  && m_ParentIdStack.top() == (*x_node).GetId()) {
210  m_LabeledNodes.pop_back();
211  m_ParentIdStack.pop();
212  }
213 
214  //Marking common group subtree
215  m_LabeledNodes.push_back(CLabeledNode(&x_node, m_LabelStack.top()));
216 
217  if (m_Ostr != NULL)
218  *m_Ostr << "Subtree with common group name: " << m_LabelStack.top().first
219  << NcbiEndl;
220 
221  //Saving parent id so that this node can be unmarked if parent belongs
222  // to the same group
223  if (!x_IsRoot(&x_node)) {
224  m_ParentIdStack.push((**x_node.GetParent()).GetId());
225  }
226 
227  }
228  else {
229  //Cleaning parent id stack if this subtree does not have cmomon label
230  while (!m_ParentIdStack.empty() && m_ParentIdStack.top() == (*x_node).GetId()) {
231  m_ParentIdStack.pop();
232  }
233  }
234 
235  m_LabelStack.pop();
236  if (m_Ostr != NULL)
237  *m_Ostr << " poping from stack" << NcbiEndl;
238 
239  //Comparing subtree group name to sibling nodes group name
240  // and correcting stack top.
241  if (!x_IsRoot(&x_node)) {
242  if (m_LabelStack.top().first == NcbiEmptyString)
243  m_LabelStack.top() = subtree_label_color;
244  else
245  if (m_LabelStack.top().first != subtree_label_color.first)
246  m_LabelStack.top().first = s_kDifferentGroups;
247  }
248 
249  return eTreeTraverse;
250 }
251 
252 
253 CPhyTreeLabelTracker::CPhyTreeLabelTracker(const string& label_feature,
254  const string& color_feature,
256  : m_LabelFeatureTag(label_feature),
257  m_ColorFeatureTag(color_feature),
258  m_FoundQueryNode(false),
259  m_FoundSeqFromType(false),
260  m_FoundSeqFromVerifiedMat(false),
261  m_FoundSeqReferenceDB(false),
262  m_FoundSeqKmerBlast(false)
263 {
264  const CBioTreeFeatureDictionary& fdict = tree.GetFeatureDict();
265  if (!fdict.HasFeature(label_feature) || !fdict.HasFeature(color_feature)) {
266  m_Error = "Feature not in feature dictionary";
267  }
268 
270  m_LeafCount = 0;
271 }
272 
274  CBioTreeDynamic::CBioNode& node,
275  int delta)
276 {
277  if (!m_Error.empty()) {
278  return eTreeTraverseStop;
279  }
280 
281  if (delta == 0 || delta == 1) {
282 
283  // if query node
284  if (!m_FoundQueryNode && x_IsQuery(node)) {
285  m_FoundQueryNode = true;
286  }
287 
288  if (!m_FoundSeqFromType && x_IsSeqFromType(node)) {
289  m_FoundSeqFromType = true;
290  }
293  }
295  m_FoundSeqReferenceDB = true;
296  }
297  if (!m_FoundSeqKmerBlast && x_IsSeqKmerBlast(node)) {
298  m_FoundSeqKmerBlast = true;
299  }
300  if (node.IsLeaf()) {
301  const string& label = node.GetFeature(m_LabelFeatureTag);
302  const string& color = node.GetFeature(m_ColorFeatureTag);
303 
305  m_LeafCount++;
306  }
307  }
308 
309  return eTreeTraverse;
310 }
311 
312 
313 bool CPhyTreeLabelTracker::x_IsQuery(const CBioTreeDynamic::CBioNode& node) const
314 {
315  return node.GetFeature(CPhyTreeFormatter::GetFeatureTag(
318 }
319 
321  const CBioTreeDynamic::CBioNode& node) const
322 {
323  return node.GetFeature(CPhyTreeFormatter::GetFeatureTag(
326 }
327 
329  const CBioTreeDynamic::CBioNode& node) const
330 {
331  return node.GetFeature(CPhyTreeFormatter::GetFeatureTag(
334 }
336  const CBioTreeDynamic::CBioNode& node) const
337 {
338  return node.GetFeature(CPhyTreeFormatter::GetFeatureTag(
341 }
343  const CBioTreeDynamic::CBioNode& node) const
344 {
345  return node.GetFeature(CPhyTreeFormatter::GetFeatureTag(
348 }
349 
351  const string& feature_color,
352  const string& feature_acc,
354  CNcbiOfstream* ostr)
355  : m_Ostr(ostr)
356 {
357  Init(feature_name, feature_color, feature_acc, tree);
358 }
359 
360 void CPhyTreeNodeAnalyzer::Init(const string& feature_name,
361  const string& feature_color,
362  const string& feature_acc,
364 {
365  m_LabelFeatureName = feature_name;
366  m_AccFeatureName = feature_acc;
367  m_ColorFeatureName = feature_color;
368 
369  const CBioTreeFeatureDictionary& fdict = tree.GetFeatureDict();
370  if (!fdict.HasFeature(m_LabelFeatureName) ||
371  !fdict.HasFeature(m_ColorFeatureName) ||
372  !fdict.HasFeature(m_AccFeatureName)) {
373 
374  m_Error = "Feature " + m_LabelFeatureName + " or " + m_AccFeatureName + " not in feature dictionary";
375  }
376  m_LabeledNodes.clear();
377  while (!m_InfoStack.empty()) {
378  m_InfoStack.pop();
379  }
380 }
381 
383  CBioTreeDynamic::CBioNode& x_node,
384  int delta)
385 {
386  if (!m_Error.empty()) {
387  return eTreeTraverseStop;
388  }
389 
390  switch (delta) {
391  case 0: return x_OnStepDown(x_node);
392  case 1: return x_OnStepRight(x_node);
393  case -1: return x_OnStepLeft(x_node);
394  }
395 
396  return eTreeTraverse;
397 }
398 
400 {
401  *m_Ostr << "Number of blast names =" << nodeMap.size() << endl;
402  for (auto it = nodeMap.begin(); it != nodeMap.end(); ++it) {
403  vector <CPhyTreeNodeAnalyzer::TLeafNodeInfo> vecInf = it->second;
404  string printStr = (vecInf.size() > 1) ? " nodeIDs: " : " nodeID: ";
405  *m_Ostr << "Blast name: " << it->first << printStr;
406  for(size_t i = 0;i < vecInf.size(); i++){
407  *m_Ostr << vecInf[i].nodeID << ",";
408  }
409 
410  printStr = (vecInf.size() > 1) ? " Accessions: " : " Accession: ";
411  *m_Ostr << printStr;
412  for(size_t i = 0;i < vecInf.size(); i++){
413  *m_Ostr << vecInf[i].accession << ",";
414  }
415  *m_Ostr << endl;
416  }
417 }
418 
419 void CPhyTreeNodeAnalyzer::x_InitLeafNodeStack(CBioTreeDynamic::CBioNode& x_node)
420 {
421  TLeafNodeInfoMap infoStackMap;
422  vector <TLeafNodeInfo> vecLeafInfoNode;
423 
424  TLeafNodeInfo leafInfoNode;
425  leafInfoNode.nodeID = x_node.GetValue().GetId();
426 
427  leafInfoNode.nodeColor = x_node.GetFeature(m_ColorFeatureName);
428  leafInfoNode.accession = x_node.GetFeature(m_AccFeatureName);
429 
430  vecLeafInfoNode.push_back(leafInfoNode);
431  infoStackMap.insert(TLeafNodeInfoMap::value_type(x_node.GetFeature(m_LabelFeatureName),vecLeafInfoNode));
432 
433  m_InfoStack.push(infoStackMap);
434  if (m_Ostr) {
435  x_PrintNodeMap(infoStackMap);
436  }
437 }
438 
439 
441  CBioTreeDynamic::CBioNode& x_node)
442 {
443  //If leaf, then record blast name and acc number is put on stack
444  if (m_Ostr != NULL)
445  *m_Ostr << "x_OnStepRight, nodeID: " + NStr::IntToString(x_node.GetValue().GetId()) << NcbiEndl;
446 
447  // other nodes
448  if (x_node.IsLeaf()) {
449  if (x_node.GetFeature(m_LabelFeatureName).empty() || x_node.GetFeature(m_AccFeatureName).empty()) {
450  m_Error = "Leaf node has unset feature, Id: "
451  + NStr::IntToString(x_node.GetValue().GetId());
452  return eTreeTraverseStop;
453  }
454  x_InitLeafNodeStack(x_node);
455  }
456 
457  return eTreeTraverse;
458 }
459 
461  CBioTreeDynamic::CBioNode& x_node)
462 {
463  if (m_Ostr != NULL)
464  *m_Ostr << "x_OnStepDown, nodeID: " + NStr::IntToString(x_node.GetValue().GetId()) << NcbiEndl;
465 
466  if (x_node.IsLeaf()) {
467 
468  if (x_node.GetFeature(m_LabelFeatureName).empty() || x_node.GetFeature(m_AccFeatureName).empty()) {
469  m_Error = "Leaf node has unset feature, Id: "
470  + NStr::IntToString(x_node.GetValue().GetId());
471  return eTreeTraverseStop;
472  }
473 
474  x_InitLeafNodeStack(x_node);
475  }
476 
477  return eTreeTraverse;
478 }
479 
480 
482 {
483  TLeafNodeInfoMap mapResult(nodeMap1);
484  for (auto it = nodeMap2.begin(); it != nodeMap2.end(); ++it) {
485  if (mapResult.count(it->first) > 0 ) { //blastname exists in the second map
486  vector <TLeafNodeInfo> vecInf2 = it->second;
487  vector <TLeafNodeInfo> vecInf1 = mapResult[it->first];
488  vecInf1.insert(vecInf1.end(), vecInf2.begin(), vecInf2.end());
489  mapResult[it->first].insert(mapResult[it->first].end(), vecInf2.begin(), vecInf2.end());
490  }
491  else {
492  mapResult.insert(TLeafNodeInfoMap::value_type(it->first,it->second));
493  }
494  }
495  return mapResult;
496 }
497 
499  CBioTreeDynamic::CBioNode& x_node)
500 {
501  if (m_Ostr != NULL)
502  *m_Ostr << "x_OnStepLeft, nodeID: " + NStr::IntToString(x_node.GetValue().GetId()) << NcbiEndl;
503 
504  TLeafNodeInfoMap infoStackMap1, infoStackMap2, infoStackMapCombined;
505  if(!m_InfoStack.empty()) {
506  infoStackMap1 = m_InfoStack.top();
507  m_InfoStack.pop();
508  }
509  //new map = combine maps
510  if(!m_InfoStack.empty()) {
511  infoStackMap2 = m_InfoStack.top();
512  m_InfoStack.pop();
513  }
514  infoStackMapCombined = x_CombineNodeMaps(infoStackMap1, infoStackMap2);
515  m_InfoStack.push(infoStackMapCombined);
516  if (m_Ostr) {
517  x_PrintNodeMap(infoStackMapCombined);
518  }
519  m_LabeledNodes.push_back(CLabeledNode(&x_node, new TLeafNodeInfoMap(infoStackMapCombined)));
520  return eTreeTraverse;
521 }
522 
Feature dictionary.
Definition: bio_tree.hpp:176
static const string kNodeInfoSeqFromType
Node feature "node-info" value for sequences from type.
static string GetFeatureTag(EFeatureID feat)
Get tree feature tag.
static const string kNodeInfoSeqKmerBlast
@ eNodeInfoId
Used for denoting query nodes.
static const string kNodeInfoSeqReferenceDB
static const string kNodeInfoSeqFromVerifiedMat
static const string kNodeInfoQuery
Node feature "node-info" value for query nodes.
bool x_IsSeqReferenceDB(const CBioTreeDynamic::CBioNode &node) const
CPhyTreeLabelTracker(const string &label, const string &color, CBioTreeDynamic &tree)
bool x_IsSeqKmerBlast(const CBioTreeDynamic::CBioNode &node) const
bool x_IsQuery(const CBioTreeDynamic::CBioNode &node) const
ETreeTraverseCode operator()(CBioTreeDynamic::CBioNode &node, int delta)
bool x_IsSeqFromVerifiedMat(const CBioTreeDynamic::CBioNode &node) const
bool x_IsSeqFromType(const CBioTreeDynamic::CBioNode &node) const
TLabelColorMap m_LabelsColors
void x_InitLeafNodeStack(CBioTreeDynamic::CBioNode &x_node)
void x_PrintNodeMap(TLeafNodeInfoMap nodeMap)
stack< TLeafNodeInfoMap > m_InfoStack
CLabeledNodes m_LabeledNodes
ETreeTraverseCode x_OnStepLeft(CBioTreeDynamic::CBioNode &x_node)
TLeafNodeInfoMap x_CombineNodeMaps(TLeafNodeInfoMap nodeMap1, TLeafNodeInfoMap nodeMap2)
ETreeTraverseCode operator()(CBioTreeDynamic::CBioNode &node, int delta)
CPhyTreeNodeAnalyzer(const string &feature_name, const string &feature_acc, const string &feature_color, CBioTreeDynamic &tree, CNcbiOfstream *ostr=NULL)
ETreeTraverseCode x_OnStepRight(CBioTreeDynamic::CBioNode &x_node)
void Init(const string &feature_name, const string &feature_color, const string &feature_acc, CBioTreeDynamic &tree)
ETreeTraverseCode x_OnStepDown(CBioTreeDynamic::CBioNode &x_node)
map< string, vector< TLeafNodeInfo > > TLeafNodeInfoMap
ETreeTraverseCode operator()(CBioTreeDynamic::CBioNode &node, int delta)
bool x_IsRoot(CBioTreeDynamic::CBioNode *node) const
ETreeTraverseCode x_OnStepRight(CBioTreeDynamic::CBioNode &x_node)
stack< pair< string, string > > m_LabelStack
stack< TBioTreeNodeId > m_ParentIdStack
CBioTreeDynamic::CBioNode * m_Root
void Init(const string &feature_name, const string &feature_color, CBioTreeDynamic &tree)
CLabeledNodes m_LabeledNodes
CPhyTreeNodeGroupper(const string &feature_name, const string &feature_color, CBioTreeDynamic &tree, CNcbiOfstream *ostr=NULL)
ETreeTraverseCode x_OnStepLeft(CBioTreeDynamic::CBioNode &x_node)
ETreeTraverseCode x_OnStepDown(CBioTreeDynamic::CBioNode &x_node)
size_type size() const
Definition: map.hpp:148
const_iterator begin() const
Definition: map.hpp:151
const_iterator end() const
Definition: map.hpp:152
iterator_bool insert(const value_type &val)
Definition: map.hpp:165
container_type::value_type value_type
Definition: map.hpp:52
void clear()
Definition: map.hpp:169
Definition: map.hpp:338
#define false
Definition: bool.h:36
#define NULL
Definition: ncbistd.hpp:225
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define NcbiEndl
Definition: ncbistre.hpp:548
IO_PREFIX::ofstream CNcbiOfstream
Portable alias for ofstream.
Definition: ncbistre.hpp:500
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
#define NcbiEmptyString
Definition: ncbistr.hpp:122
ETreeTraverseCode
Tree traverse code returned by the traverse predicate function.
Definition: ncbi_tree.hpp:51
bool HasFeature(const string &feature_name) const
Check if feature is listed in the dictionary.
Definition: bio_tree.cpp:146
@ eTreeTraverseStop
Stop traversal (return form algorithm)
Definition: ncbi_tree.hpp:53
@ eTreeTraverse
Keep traversal.
Definition: ncbi_tree.hpp:52
static const char label[]
n background color
int i
Int4 delta(size_t dimension_, const Int4 *score_)
static const string s_kDifferentGroups
Modified on Sun Apr 21 03:38:38 2024 by modify_doxy.py rev. 669887