NCBI C++ ToolKit
entrez_conversion.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: entrez_conversion.cpp 47296 2022-12-30 03:01:50Z evgeniev $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Andrey Yazhuk
27  *
28  * File Description:
29  * */
30 
31 #include <ncbi_pch.hpp>
32 
34 #include <corelib/ncbitime.hpp>
35 #include <util/static_map.hpp>
36 #include <gui/objutils/label.hpp>
37 #include <gui/objutils/utils.hpp>
39 
42 
// Entrez database names (first) paired with the human-readable labels
// (second) that CEntrezDB::GetVisibleName() returns for them. This array
// backs sc_EntrezDbMap (see DEFINE_STATIC_ARRAY_MAP).
// NOTE(review): the keys are listed in alphabetical order; the static array
// map presumably relies on that ordering - confirm before adding entries.
44 static const TPair sc_Dbs[] = {
45  { "assembly", "Genome Assembly" },
46  { "gene", "Gene" },
47  { "nucleotide", "Nucleotide" },
48  { "protein", "Protein" },
49 };
52 
// "genomeprj_*" link names (first) paired with display labels (second).
// NOTE(review): presumably the backing array for a TGenomeLinkMap; the map
// definition and its users are not visible in this listing - confirm.
53 static const TPair sc_Pairs[] = {
54  { "genomeprj_genome", "Chromosomes" },
55  { "genomeprj_nucleotide", "Contigs" },
56  { "genomeprj_nucleotide_mrna", "mRNAs" },
57  { "genomeprj_nucleotide_wgs", "Whole Genome Shotgun sequences" },
58  { "genomeprj_organella", "Organelles" },
59  { "genomeprj_protein", "Proteins" },
60 };
61 
64 
66 {
67 public:
68  void GetLabel(const CObject& obj, string* str,
69  CLabel::ELabelType type, objects::CScope* scope) const
70  {
71  _ASSERT(str);
72  _ASSERT(scope);
73  const CXmlNodeObject* xmlNode = dynamic_cast<const CXmlNodeObject*>(&obj);
74  if (xmlNode) {
75  switch (type) {
76  case CLabel::eUserType:
77  *str += "Entrez DocumentSummary Record";
78  break;
79 
81  *str += kEmptyStr;
82  break;
83 
84  case CLabel::eType:
85  *str += "Entrez Record";
86  break;
87 
90  {
91  // XPath query cannot be executed on a node
93  do {
94  itDesc = xmlNode->GetNode().find("AssemblyDescription");
95  if (itDesc != xmlNode->GetNode().end())
96  break;
97 
98  itDesc = xmlNode->GetNode().find("Description");
99  if (itDesc != xmlNode->GetNode().end())
100  break;
101 
102  itDesc = xmlNode->GetNode().find("Title");
103  }
104  while(false);
105  if (itDesc == xmlNode->GetNode().end())
106  break;
107  *str += itDesc->get_content();
108  break;
109  }
110  case CLabel::eContent:
111  {
112  // XPath query cannot be executed on a node
114  do {
115  itName = xmlNode->GetNode().find("AssemblyName");
116  if (itName != xmlNode->GetNode().end())
117  break;
118 
119  itName = xmlNode->GetNode().find("Name");
120  if (itName != xmlNode->GetNode().end())
121  break;
122 
123  itName = xmlNode->GetNode().find("Caption");
124  }
125  while(false);
126  if (itName == xmlNode->GetNode().end())
127  break;
128  *str += itName->get_content();
129  break;
130  }
131 
133  {
134  GetLabel(obj, str, CLabel::eUserType, scope);
135  *str += "\n";
136  GetLabel(obj, str, CLabel::eContent, scope);
137  break;
138  }
139  default:
140  break;
141  }
142  }
143  }
144 };
145 
146 void CEntrezDB::GetDbNames(vector<string>& names)
147 {
148  ITERATE(TEntrezDbMap, it, sc_EntrezDbMap) {
149  names.push_back(it->first);
150  }
151 }
152 
153 void CEntrezDB::GetDbNames(vector<TNamePair>& name_pairs)
154 {
155  ITERATE(TEntrezDbMap, it, sc_EntrezDbMap) {
156  name_pairs.push_back(TNamePair(it->first, it->second));
157  }
158 }
159 
160 
161 string CEntrezDB::GetVisibleName(const string& db_name)
162 {
163  TEntrezDbMap::const_iterator it = sc_EntrezDbMap.find(db_name.c_str());
164  return (it == sc_EntrezDbMap.end()) ? kEmptyStr : it->second;
165 }
166 
167 
168 string CEntrezDB::GetDbName(const string& visible_name)
169 {
170  ITERATE (TEntrezDbMap, iter, sc_EntrezDbMap) {
171  if (iter->second == visible_name) {
172  return iter->first;
173  }
174  }
175  return kEmptyStr;
176 }
177 
/// Search an Entrez database and fetch document summaries for the hits.
///
/// Runs CSeqUtils::ESearchQuery() for @a terms in @a db_name, then asks
/// CGuiEutilsClient::Summary() (version "2.0") for the summaries of the
/// returned uids.
///
/// @param db_name     Entrez database to search.
/// @param terms       Query string passed to the search; also echoed in
///                    the error log messages.
/// @param total_uids  Passed to ESearchQuery() as its count argument; if it
///                    is zero after the search, no summary request is made.
/// @param docsums     XML document handed to Summary() to receive results.
/// @param max_return  Passed to the search as ret_max; when non-zero it is
///                    also applied to the summary client via SetMaxReturn().
///
/// A CException thrown by either request is logged with LOG_POST and not
/// rethrown.
178 void CEntrezDB::Query(const string& db_name,
179  const string& terms,
180  size_t& total_uids,
181  xml::document &docsums,
182  size_t max_return)
183 {
184  // Register
 // (checks whether a label handler exists for CXmlNodeObject's type name;
 // the statement inside the if-body is not visible in this listing)
185  string type = typeid(CXmlNodeObject).name();
186  if ( ! CLabel::HasHandler(type)) {
188  }
189 
190  // prepare eSearch request
191  // uid can be GI or 32-bit id depending on db_name
192  // always use TGi, since TGi fits for everything
194  try {
195  CSeqUtils::ESearchQuery(db_name, terms, uids, total_uids, (int) max_return);
196  }
197  catch (const CException& e) {
198  LOG_POST(Error << "CEntrezDB::Query(): error executing search Entrez query: " << terms << ": " << e.GetMsg());
199  }
 // NOTE(review): a failed search is only logged; execution continues with
 // whatever value total_uids holds at this point - confirm this is intended.
200 
 // Nothing matched (count is zero): skip the summary request.
201  if (!total_uids)
202  return;
203 
204  CGuiEutilsClient ecli;
 // max_return == 0 leaves the client's own return limit untouched.
205  if (max_return)
206  ecli.SetMaxReturn((int) max_return);
207 
208  try {
209  ecli.Summary(db_name, uids, docsums, "2.0");
210  }
211  catch (const CException& e) {
212  LOG_POST(Error << "CEntrezDB::Query(): error executing summary Entrez query: " << terms << ": " << e.GetMsg());
213  }
214 }
215 
216 
218  const xml::node_set& ds_set)
219 {
220  CRef<CSeq_annot> annot(new CSeq_annot());
221  ITERATE (xml::node_set, iter, ds_set) {
222  CRef<CSeq_id> id = CreateId_Nuc_Prot(*iter);
223  annot->SetData().SetIds().push_back(id);
224  }
225 
226  string title("Query: ");
227  title += query_str;
228  title += ", database: ";
229  //TODO title += GetVisibleDbName();
230  title += ": ";
231  title += NStr::SizetToString(ds_set.size(), NStr::fWithCommas);
232  title += " sequence";
233  if (ds_set.size() != 1) {
234  title += "s";
235  }
236  annot->SetNameDesc(title);
238  return annot;
239 }
240 
241 
243 {
245  TGi gi = GetUidAttr(ds);
246  if (GI_FROM(TIntId, 0) != gi) {
247  result.Reset(new CSeq_id());
248  result->SetGi(gi);
249  }
250  return result;
251 }
252 
253 
255 {
256  CRef<CEntrezgene> egene;
257 
258  TGi uid = GetUidAttr(ds);
259  if (GI_FROM(TIntId, 0) == uid)
260  return egene;
261 
262  vector<TEntrezId> uids;
263  uids.push_back(GI_TO(TEntrezId, uid));
264 
265  // prepare eFetch request
266  CGuiEutilsClient ecli;
267 
268  CNcbiStrstream asnGene;
269 
270  egene.Reset(new CEntrezgene());
271  try {
272  ecli.Fetch("gene", uids, asnGene, "asn.1");
273  asnGene >> MSerial_AsnText >> *egene;
274  }
275  catch (const CException& e) {
276  LOG_POST(Error << "CEntrezDB::CreateGene_Gene(): error fetching ID " << uid << ": " << e.GetMsg());
277  egene.Reset();
278  }
279 
280  return egene;
281 }
282 
283 
285 {
286  CRef<CEntrezgene_Set> gene_set;
287 
288  vector<TEntrezId> uids;
289 
290  // prepare eFetch request
291  CGuiEutilsClient ecli;
292 
293  CNcbiStrstream asnGeneSet;
294 
295  /// for each docsum, retrieve the Entrezgene object
296  xml::node_set::const_iterator iter = ds_set.begin();
297  do {
298  TGi uid = GetUidAttr(*iter);
299  if (GI_FROM(TIntId, 0) != uid)
300  uids.push_back(GI_TO(TEntrezId, uid));
301 
302  ++iter;
303  if (uids.size() % 5 == 4 || iter == ds_set.end()) {
304 
305  CNcbiStrstream asnGene;
306  try {
307  ecli.Fetch("gene", uids, asnGene, "asn.1");
308  }
309  catch (const CException& e) {
310  LOG_POST(Error << "CEntrezDB_Gene::CreateGeneSet_Gene(): error retrieving IDs " << CSeqUtils::CreateIdStr(uids) << ": " << e.GetMsg());
311  uids.clear();
312  break;
313  }
314 
315  while(true) {
316  try {
317  CRef<CEntrezgene> egene(new CEntrezgene());
318 
319  asnGene >> MSerial_AsnText >> *egene;
320 
321  if ( !gene_set ) {
322  gene_set.Reset(new CEntrezgene_Set);
323  }
324  gene_set->Set().push_back(egene);
325  }
326  catch (const CEofException&) {
327  /// ignore end-of-file - we expect this on completion
328  break;
329  }
330  catch (const CException& e) {
331  LOG_POST(Error << "CEntrezDB_Gene::CreateGeneSet_Gene(): error retrieving IDs " << CSeqUtils::CreateIdStr(uids) << ": " << e.GetMsg());
332  break;
333  }
334  catch (const std::exception& e) {
335  LOG_POST(Error << "CEntrezDB_Gene::CreateGeneSet_Gene(): error retrieving IDs " << CSeqUtils::CreateIdStr(uids) << ": " << e.what());
336  break;
337  }
338  }
339  uids.clear();
340  }
341  }
342  while (iter != ds_set.end());
343  if ( gene_set && gene_set->IsSet() && !gene_set->Get().size() ) {
344  gene_set.Reset();
345  }
346 
347  return gene_set;
348 }
349 
351 {
353  if (itUid == ds.get_attributes().end())
354  return GI_FROM(TIntId, 0);
355  return GI_FROM(Int8, NStr::StringToInt8(itUid->get_value()));
356 }
357 
static CRef< objects::CEntrezgene > CreateGene_Gene(const xml::node &ds)
static void GetDbNames(vector< string > &names)
static TGi GetUidAttr(const xml::node &ds)
static string GetDbName(const string &visible_name)
static string GetVisibleName(const string &db_name)
static CRef< objects::CSeq_id > CreateId_Nuc_Prot(const xml::node &ds)
static CRef< objects::CEntrezgene_Set > CreateGeneSet_Gene(const xml::node_set &ds_set)
pair< string, string > TNamePair
static void Query(const string &db_name, const string &terms, size_t &total_uids, xml::document &docsums, size_t max_return=0)
static CRef< objects::CSeq_annot > CreateAnnot_Nuc_Prot(const string &query_str, const xml::node_set &ds_set)
CEntrezgene_Set –.
void Fetch(const string &db, const vector< objects::CSeq_id_Handle > &uids, CNcbiOstream &ostr, const string &retmode="xml")
void SetMaxReturn(int ret_max)
void Summary(const string &db, const vector< objects::CSeq_id_Handle > &uids, xml::document &docsums, const string &version="")
CObject –.
Definition: ncbiobj.hpp:180
void SetNameDesc(const string &name)
Definition: Seq_annot.cpp:66
void SetCreateDate(const CTime &dt)
Definition: Seq_annot.cpp:121
class CStaticArrayMap<> is an array adaptor that provides an STLish interface to statically-defined a...
Definition: static_map.hpp:105
TBase::const_iterator const_iterator
Definition: static_map.hpp:109
CTime –.
Definition: ncbitime.hpp:296
void GetLabel(const CObject &obj, string *str, CLabel::ELabelType type, objects::CScope *scope) const
const xml::node & GetNode() const
ILabelHandler interface.
Definition: label.hpp:136
const char * get_value(void) const
Get the value of this attribute.
Definition: ait_impl.cpp:350
Const Iterator class for accessing attribute pairs.
Definition: attributes.hpp:320
iterator end(void)
Get an iterator that points one past the last attribute.
Definition: attributes.cpp:174
The xml::document class is used to hold the XML tree and various bits of information about it.
Definition: document.hpp:80
The xml::node::const_iterator provides a way to access children nodes similar to a standard C++ conta...
Definition: node.hpp:746
The xml::node_set::const_iterator class is used to iterate over nodes in a node set.
Definition: node_set.hpp:226
The xml::node_set class is used to store xpath query result set.
Definition: node_set.hpp:68
iterator begin()
Get an iterator that points to the beginning of the xpath query result node set.
Definition: node_set.cpp:173
size_t size() const
Get the number of nodes in the xpath query result node set.
Definition: node_set.cpp:167
iterator end()
Get an iterator that points one past the last node in the xpath query result node set.
Definition: node_set.cpp:185
The xml::node class is used to hold information about one XML node.
Definition: node.hpp:106
attributes::iterator find_attribute(const char *name, const ns *nspace=NULL)
Search for a node attribute.
Definition: node.cpp:851
iterator end(void)
Get an iterator that points one past the last child for this node.
Definition: node.hpp:835
iterator find(const char *name, const ns *nspace=NULL)
Find the first child node that has the given name and namespace.
Definition: node.cpp:1258
const char * get_content(void) const
Get the content for this text node.
Definition: node.cpp:797
xml::attributes & get_attributes(void)
Get the list of attributes.
Definition: node.cpp:831
USING_SCOPE(objects)
CStaticPairArrayMap< const char *, const char *, PNocase_CStr > TEntrezDbMap
static const TPair sc_Pairs[]
DEFINE_STATIC_ARRAY_MAP(TEntrezDbMap, sc_EntrezDbMap, sc_Dbs)
CStaticPairArrayMap< const char *, const char *, PNocase_CStr > TGenomeLinkMap
SStaticPair< const char *, const char * > TPair
static const TPair sc_Dbs[]
static const struct name_t names[]
static const char * str(char *buf, int n)
Definition: stats.c:84
SStrictId_Entrez::TId TEntrezId
TEntrezId type for entrez ids which require the same strictness as TGi.
Definition: ncbimisc.hpp:1041
#define GI_FROM(T, value)
Definition: ncbimisc.hpp:1086
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
Int8 TIntId
Definition: ncbimisc.hpp:999
#define GI_TO(T, gi)
Definition: ncbimisc.hpp:1085
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
const string & GetMsg(void) const
Get message string.
Definition: ncbiexpt.cpp:461
virtual const char * what(void) const noexcept
Standard report (includes full backlog).
Definition: ncbiexpt.cpp:342
static void ESearchQuery(const string &db, const string &term, TEntrezIds &uids, size_t &count, const int ret_max, const string &xpath="//IdList/Id/text()")
Queries esearch.fcgi and returns a vector of uids/seq-ids (seq-ids preferred for future compatibility...
Definition: utils.cpp:1680
vector< TEntrezId > TEntrezIds
Definition: utils.hpp:125
static string CreateIdStr(const vector< T > &uids)
Convert a list of ids into a comma-delimited string.
Definition: utils.hpp:552
static void RegisterLabelHandler(const string &type, ILabelHandler &handler)
Definition: label.cpp:229
static bool HasHandler(const CTypeInfo &type)
Definition: label.cpp:243
ELabelType
Definition: label.hpp:60
@ eType
Definition: label.hpp:65
@ eUserSubtype
Definition: label.hpp:64
@ eUserTypeAndContent
Definition: label.hpp:66
@ eContent
Definition: label.hpp:62
@ eDescription
Definition: label.hpp:68
@ eUserType
Definition: label.hpp:63
@ eDescriptionBrief
Definition: label.hpp:67
#define MSerial_AsnText
I/O stream manipulators –.
Definition: serialbase.hpp:696
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static string SizetToString(size_t value, TNumToStringFlags flags=0, int base=10)
Convert size_t to string.
Definition: ncbistr.cpp:2751
#define kEmptyStr
Definition: ncbistr.hpp:123
static Int8 StringToInt8(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to Int8.
Definition: ncbistr.cpp:793
@ fWithCommas
Use commas as thousands separator.
Definition: ncbistr.hpp:254
@ eCurrent
Use current time. See also CCurrentTime.
Definition: ncbitime.hpp:300
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_annot_.cpp:244
Defines: CTimeFormat - storage class for time format.
Template structure SStaticPair is a simplified replacement of STL pair<>. Main reason of introducing this...
Definition: static_set.hpp:60
Definition: type.c:6
#define _ASSERT
else result
Definition: token2.c:20
Modified on Wed Apr 24 14:12:51 2024 by modify_doxy.py rev. 669887