NCBI C++ ToolKit
cuTaxNRCriteria.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: cuTaxNRCriteria.cpp 90374 2020-06-08 15:56:03Z grichenk $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Chris Lanczycki
27 *
28 * File Description:
29 * A base concrete class used for non-redundification of a cluster of sequences using
30 * taxonomic criteria.
31 *
32 */
33 
34 
35 #include <ncbi_pch.hpp>
36 #include <algorithm>
37 //#include <algo/structure/cd_utils/cuSequence.hpp>
39 
42 BEGIN_SCOPE(cd_utils)
43 
45 
46 
47 int CTaxNRItem::Compare(const CNRItem& rhs) const
48 {
49  try {
50  const CTaxNRItem& taxNRItem = dynamic_cast<const CTaxNRItem&> (rhs);
51  return CTaxNRItem::CompareItems(*this, taxNRItem);
52  } catch (std::bad_cast) {
53  return 1; // rule in favor of this object if can't perform the cast
54  }
55 }
56 
57 int CTaxNRItem::CompareItems(const CTaxNRItem& lhs, const CTaxNRItem& rhs) {
58  bool gotAnswer = false;
59  int result;
60  TTaxItemId prefNodeIdLHS, prefNodeIdRHS, modelOrgIdLHS, modelOrgIdRHS;
61  string nodeName, orgName;
63 
64  prefNodeIdLHS = lhs.prefTaxnode;
65  prefNodeIdRHS = rhs.prefTaxnode;
66 
67  // Cases where one or both items are not under a preferred tax node. (If neither is
68  // under a preferred node, they are equivalent from point of view of taxonomy alone.)
69  if (prefNodeIdLHS == badId) {
70  result = (prefNodeIdRHS == badId) ? 0 : -1;
71  gotAnswer = true;
72  } else if (prefNodeIdRHS == badId) {
73  result = 1;
74  gotAnswer = true;
75  }
76 
77  // If the items are under different preferred tax nodes, in a binary comparison
78  // the items are equally favored independent of model organism status: if there
79  // is only a non-model org. under a pref. tax node, we'll keep it.
80  if (!gotAnswer && prefNodeIdLHS != prefNodeIdRHS) {
81  result = 0;
82  gotAnswer = true;
83  }
84 
85  // Cases where both items are under the same preferred tax node.
86  if (!gotAnswer) {
87  modelOrgIdLHS = lhs.modelOrg;
88  modelOrgIdRHS = rhs.modelOrg;
89 
90  if (modelOrgIdLHS == badId) {
91  if (modelOrgIdRHS == badId) {
92  result = 0; // they're equally favored; we'd *keep* only one, but w/o tie-breakers it could be either
93  } else {
94  result = -1;
95  }
96  } else if (modelOrgIdRHS == badId) {
97  result = 1;
98  } else if (modelOrgIdLHS != modelOrgIdRHS) {
99  result = 0; // if different model orgs, both kept
100  } else {
101  result = 0; // they're equally favored; we'd *keep* only one, but w/o tie-breakers it could be either
102  }
103  }
104 
105  return result;
106 }
107 
108 
109 
110 ////////////////////////////////////////////////
111 // CTaxNRCriteria class
112 ////////////////////////////////////////////////
113 
114 
116 
120 }
121 
122 CTaxNRCriteria::CTaxNRCriteria(const vector< TTaxId >& priorityTaxIds, const vector< TTaxId >& taxIdsToBeClustered) {
124 
125  m_priorityTaxNodes = (m_taxClient) ? new CPriorityTaxNodes(priorityTaxIds, *m_taxClient) : NULL;
126 
127  for (CBaseClusterer::TId i = 0; i < taxIdsToBeClustered.size(); ++i) {
128  m_id2Tax.insert(TId2TaxidMap::value_type(i, taxIdsToBeClustered[i]));
129  }
130 }
131 
132 CTaxNRCriteria::CTaxNRCriteria(CPriorityTaxNodes* priorityTaxNodes, const vector< TTaxId >& taxIdsToBeClustered) {
134  m_priorityTaxNodes = priorityTaxNodes;
135 
136  for (CBaseClusterer::TId i = 0; i < taxIdsToBeClustered.size(); ++i) {
137  m_id2Tax.insert(TId2TaxidMap::value_type(i, taxIdsToBeClustered[i]));
138  }
139 }
140 
141 CTaxNRCriteria::CTaxNRCriteria(CPriorityTaxNodes* priorityTaxNodes, const TId2TaxidMap& id2TaxidMap) {
143  m_priorityTaxNodes = priorityTaxNodes;
144  m_id2Tax = id2TaxidMap;
145 }
146 
148  delete m_priorityTaxNodes;
149 }
150 
152  TId2TaxidMapCIt cit = m_id2Tax.find(id);
153  TTaxId result = (cit != m_id2Tax.end()) ? cit->second : INVALID_TAX_ID;
154  return result;
155 }
156 
158 
159  m_name = "Taxonomic Non-redundification Criteria";
160  m_shouldMatch = true;
161 
162  // Lazy-initialize tax server.
163  if (!m_taxClient) {
164  m_taxClient = new TaxClient();
165  }
167 
168  m_id2Tax.clear();
169 }
170 
172 {
173  return (m_taxClient) ? m_taxClient->GetTaxIDFromBioseq(bioseq, false) : INVALID_TAX_ID;
174 }
175 
176 unsigned int CTaxNRCriteria::Apply(CBaseClusterer::TCluster*& clusterPtr, string* report) {
177 
178  unsigned int nSubcluster = 0, nMarkedRedundant = 0;
179  int priorityNodeId;
180  TTaxId taxId;
181  string nodeName;
182 
183  if (!clusterPtr || !m_id2ItemMap) return nMarkedRedundant;
184 
185  CTaxNRItem* taxNRItem = NULL;
187 
189  CBaseClusterer::TClusterIt itemIt = clusterPtr->begin(), itemItEnd = clusterPtr->end();
190 
191  // Sort cluster into subclusters based on the priority tax node items are under.
192  // priorityNodeId == INVALID_TAX_ITEM_ID is the case where under no priority tax node.
193  // Apply rules to mark those items in the subclusters that are redundant.
195  for (; itemIt != itemItEnd; ++itemIt) {
196 
197  id = *itemIt;
198  taxId = m_id2Tax[id];
199  nodeName.erase();
200  priorityNodeId = (taxId > ZERO_TAX_ID && m_priorityTaxNodes) ? m_priorityTaxNodes->GetPriorityTaxnode(taxId, nodeName, m_taxClient) : badId;
201  if (priorityNodeId == -1) priorityNodeId = badId;
202 
203  taxNRItem = new CTaxNRItem(id, (CTaxNRItem::TTaxItemId)(priorityNodeId), CTaxNRItem::INVALID_TAX_ITEM_ID, taxId, true);
204  if (!taxNRItem) {
205  continue;
206  }
207 
208  if ((priorityNodeId == badId && m_shouldMatch) || (priorityNodeId != badId && !m_shouldMatch)) {
209  if (report) report->append("\n Toss ID " + NStr::UIntToString(id) + " taxId = " + NStr::NumericToString(taxId) + " nodeName = " + nodeName + ": priorityNodeId " + NStr::IntToString(priorityNodeId));
210  taxNRItem->keep = false;
211  ++nMarkedRedundant;
212  } else {
213  if (report) report->append("\n Keep ID " + NStr::UIntToString(id) + " taxId = " + NStr::NumericToString(taxId) + " nodeName = " + nodeName + ": priorityNodeId " + NStr::IntToString(priorityNodeId));
214  }
215  if (m_subclusters.count(priorityNodeId) == 0) ++nSubcluster;
216 
217  _ASSERT(m_id2ItemMap->count(id) == 0);
218 
219  m_subclusters[priorityNodeId].insert(CBaseClusterer::TClusterVT(id));
220  m_id2ItemMap->insert(TId2ItemVT(id, taxNRItem));
221  }
222 
223  if (report && report->length() > 0) report->append("\n");
224 
225  return nMarkedRedundant;
226 }
227 
228 
229 int CTaxNRCriteria::CompareItems(const CTaxNRItem& lhs, const CTaxNRItem& rhs) const {
230 
231  string nodeName, orgName;
232  CTaxNRItem lhsItem(lhs), rhsItem(rhs);
233 
234  // If don't have valid field in the passed-in items, fill them in the temporaries.
235  if (lhsItem.taxId == INVALID_TAX_ID) {
236  // Doing it this way since operator[] returns non-const reference.
237  TId2TaxidMapCIt citLHS = m_id2Tax.find(lhs.itemId);
238  lhsItem.taxId = citLHS != m_id2Tax.end() ? citLHS->second : ZERO_TAX_ID;
239  }
240  if (rhsItem.taxId == INVALID_TAX_ID) {
241  // Doing it this way since operator[] returns non-const reference.
242  TId2TaxidMapCIt citRHS = m_id2Tax.find(rhs.itemId);
243  rhsItem.taxId = citRHS != m_id2Tax.end() ? citRHS->second : ZERO_TAX_ID;
244  }
245 
246  return (lhsItem.taxId == rhsItem.taxId) ? 0 : (lhsItem.taxId < rhsItem.taxId) ? -1 : 1;
247 }
248 
250  if (m_taxClient && !m_taxClient->IsAlive()) {
252  }
254  return m_isTaxConnected;
255 }
256 
257 END_SCOPE(cd_utils)
TCluster::iterator TClusterIt
TCluster::value_type TClusterVT
unsigned int TId
TId2Item * m_id2ItemMap
TId2Item::value_type TId2ItemVT
static const TItemId INVALID_ITEM_ID
TItemId itemId
int GetPriorityTaxnode(TTaxId taxid, string &nodeName, TaxClient *taxClient=NULL)
TTaxId GetTaxIdFromClient(const CBioseq &bioseq)
virtual int CompareItems(const CTaxNRItem &lhs, const CTaxNRItem &rhs) const
TSubclusterMap m_subclusters
virtual ~CTaxNRCriteria()
CPriorityTaxNodes * m_priorityTaxNodes
TId2TaxidMap::const_iterator TId2TaxidMapCIt
virtual unsigned int Apply(CBaseClusterer::TCluster *&cluster, string *report=NULL)
TTaxId GetTaxIdForId(const CBaseClusterer::TId &id) const
static TaxClient * m_taxClient
TId2TaxidMap m_id2Tax
static const TTaxItemId INVALID_TAX_ITEM_ID
TTaxItemId prefTaxnode
static int CompareItems(const CTaxNRItem &lhs, const CTaxNRItem &rhs)
virtual int Compare(const CNRItem &rhs) const
TTaxItemId modelOrg
virtual TTaxId GetTaxIDFromBioseq(const CBioseq &bioseq, bool lookInBioseq)
virtual bool IsAlive()
Definition: cuTaxClient.cpp:95
virtual bool ConnectToTaxServer()
Definition: cuTaxClient.cpp:82
void erase(iterator pos)
Definition: map.hpp:167
const_iterator end() const
Definition: map.hpp:152
iterator_bool insert(const value_type &val)
Definition: map.hpp:165
void clear()
Definition: map.hpp:169
const_iterator find(const key_type &key) const
Definition: map.hpp:153
Definition: set.hpp:45
const_iterator begin() const
Definition: set.hpp:135
const_iterator end() const
Definition: set.hpp:136
USING_SCOPE(objects)
#define ZERO_TAX_ID
Definition: ncbimisc.hpp:1115
#define INVALID_TAX_ID
Definition: ncbimisc.hpp:1116
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
#define NULL
Definition: ncbistd.hpp:225
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5083
static string UIntToString(unsigned int value, TNumToStringFlags flags=0, int base=10)
Convert UInt to string.
Definition: ncbistr.hpp:5108
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
int i
#define _ASSERT
else result
Definition: token2.c:20
Modified on Wed Dec 06 07:13:19 2023 by modify_doxy.py rev. 669887