NCBI C++ ToolKit
taxid_set.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: taxid_set.cpp 96196 2022-02-22 19:32:10Z boratyng $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Kevin Bealer
27 *
28 */
29 
30 /** @file taxid_set.cpp
31 * Class which defines sequence id to taxid mapping.
32 */
33 #include <ncbi_pch.hpp>
36 #include <serial/typeinfo.hpp>
37 
38 #ifndef SKIP_DOXYGEN_PROCESSING
41 #endif
42 
44 
46 {
47  while(f && (! f.eof())) {
48  string s;
49  NcbiGetlineEOL(f, s);
50 
51  if (s.empty())
52  continue;
53 
54  // Remove leading/trailing spaces.
55  s = NStr::TruncateSpaces(s);
56 
57  vector<string> tokens;
58  NStr::Split(s, " \t", tokens);
59 
60  string gi_str = tokens.front();
61  string tx_str;
62  if (tokens.size() == 2) {
63  tx_str = tokens.back();
64  }
65 
66  if (gi_str.size() && tx_str.size()) {
67  TTaxId taxid = NStr::StringToNumeric<TTaxId>(tx_str, NStr::fAllowLeadingSpaces);
68  string key = AccessionToKey(gi_str);
69 
70  m_TaxIdMap[key] = taxid;
71  }
72  }
74 }
75 
76 
77 void CTaxIdSet::AddTaxId(const CSeq_id& seqid, const TTaxId& taxid)
78 {
79  string key = AccessionToKey(seqid.AsFastaString());
80  m_TaxIdMap[key] = taxid;
81 }
82 
83 
84 TTaxId CTaxIdSet::x_SelectBestTaxid(const objects::CBlast_def_line & defline)
85 {
86  TTaxId retval = m_GlobalTaxId;
87 
88  if (retval != kTaxIdNotSet) {
89  return retval;
90  }
91 
92  if ( !m_TaxIdMap.empty() ) {
93  vector<string> keys;
94  GetDeflineKeys(defline, keys);
95 
96  ITERATE(vector<string>, key, keys) {
97  if (key->empty())
98  continue;
99 
101 
102  if (item != m_TaxIdMap.end()) {
103  retval = item->second;
104  m_Matched = true;
105  break;
106  }
107 
108  // try removing version to see if strings match.
109  // this is most likely to help if the Query ID was parsed as local ID but is really accession.
110  string keyNoVersion;
111  string version;
112  if (NStr::SplitInTwo(*key, ".", keyNoVersion, version) == true)
113  {
114  item = m_TaxIdMap.find(keyNoVersion);
115 
116  if (item != m_TaxIdMap.end()) {
117  retval = item->second;
118  m_Matched = true;
119  break;
120  }
121  }
122  }
123  } else if (defline.IsSetTaxid()) {
124  retval = defline.GetTaxid();
125  }
126 
127  return retval;
128 }
129 
130 void
132 {
133  NON_CONST_ITERATE(CBlast_def_line_set::Tdata, itr, deflines->Set()) {
134  (*itr)->SetTaxid(x_SelectBestTaxid(**itr));
135  }
136 }
137 
CRef –.
Definition: ncbiobj.hpp:618
void FixTaxId(CRef< objects::CBlast_def_line_set > deflines)
Check that each defline has the specified taxid; if not, replace the defline and set the taxid.
Definition: taxid_set.cpp:131
static const TTaxId kTaxIdNotSet
Definition: taxid_set.hpp:49
void AddTaxId(const objects::CSeq_id &seqid, const TTaxId &taxid)
Definition: taxid_set.cpp:77
bool m_Matched
Definition: taxid_set.hpp:69
map< string, TTaxId > m_TaxIdMap
Definition: taxid_set.hpp:68
TTaxId x_SelectBestTaxid(const objects::CBlast_def_line &defline)
Selects the most suitable tax id for the input passed in, checking the global taxid first,...
Definition: taxid_set.cpp:84
TTaxId m_GlobalTaxId
Definition: taxid_set.hpp:67
void SetMappingFromFile(CNcbiIstream &f)
Definition: taxid_set.cpp:45
const_iterator end() const
Definition: map.hpp:152
bool empty() const
Definition: map.hpp:149
const_iterator find(const key_type &key) const
Definition: map.hpp:153
Definition: map.hpp:338
#define ZERO_TAX_ID
Definition: ncbimisc.hpp:1115
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
const string AsFastaString(void) const
Definition: Seq_id.cpp:2265
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
CNcbiIstream & NcbiGetlineEOL(CNcbiIstream &is, string &str, string::size_type *count=NULL)
Read from "is" to "str" the next line (taking into account platform specifics of End-of-Line)
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3457
static bool SplitInTwo(const CTempString str, const CTempString delim, string &str1, string &str2, TSplitFlags flags=0)
Split a string into two pieces using the specified delimiters.
Definition: ncbistr.cpp:3550
static string TruncateSpaces(const string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string.
Definition: ncbistr.cpp:3182
@ fAllowLeadingSpaces
Ignore leading spaces in converted string.
Definition: ncbistr.hpp:294
list< CRef< CBlast_def_line > > Tdata
static int version
Definition: mdb_load.c:29
string AccessionToKey(const string &acc)
void GetDeflineKeys(const objects::CBlast_def_line &defline, vector< string > &keys)
Get all keys for a defline.
const struct ncbi::grid::netcache::search::fields::KEY key
double f(double x_, const double &y_)
Definition: njn_root.hpp:188
USING_SCOPE(objects)
Class which defines sequence id to taxid mapping.
Modified on Fri Dec 01 04:49:55 2023 by modify_doxy.py rev. 669887