NCBI C++ ToolKit
process_eutils.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /*
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author:
27 *
28 * File Description:
29 *
30 * ===========================================================================
31 */
32 
33 #include <corelib/ncbistd.hpp>
34 #include <corelib/ncbi_system.hpp>
38 
39 // #include <misc/eutils_client/eutils_client.hpp>
40 // #include <objects/pubmed/Pubmed_entry.hpp>
41 
42 #include <cmath>
43 
44 #ifndef __process_eutils__hpp__
45 #define __process_eutils__hpp__
46 
47 
48 
50 {
51 public:
53  {
54  }
55 
56  bool TryQuery (string query)
57  {
58  if (m_Host.empty() || m_Url.empty()) return false;
59  for (int attempt = 0; attempt < 5; ++attempt) {
60  try {
62 
63  /*
64  NcbiStreamCopy(cout, istr);
65  return true;
66  */
67 
69  parse_stream(istr, &msgs);
70 
71  if (msgs.has_errors() || msgs.has_fatal_errors()) {
72  ERR_POST(Warning << "error parsing xml: " << msgs.print());
73  return false;
74  }
75 
76  if (Succeeded ()) return true;
77  return false;
78  }
79  catch (CException& e) {
80  ERR_POST(Warning << "failed on attempt " << attempt + 1
81  << ": " << e);
82  }
83 
84  int sleep_secs = ::sqrt((double)attempt);
85  if (sleep_secs) {
86  SleepSec(sleep_secs);
87  }
88  }
89  return false;
90  }
91 
92 protected:
93  virtual bool StartElement (const std::string &name)
94  {
95  return true;
96  }
97  virtual bool EndElement (const std::string &name)
98  {
99  return true;
100  }
101  virtual bool Text (const std::string &contents)
102  {
103  return true;
104  }
105  virtual bool Succeeded ()
106  {
107  return true;
108  }
109 
110  bool PathSuffixIs (const char* suffix)
111  {
112  string::size_type pos = m_Path.rfind(suffix);
113  return (pos != string::npos && pos == m_Path.size() - strlen(suffix));
114  }
115 
116 protected:
117  bool error(const string& message)
118  {
119  ERR_POST(Error << "parse error: " << message);
120  return false;
121  }
122 
123  bool warning(const string& message)
124  {
125  ERR_POST(Warning << "parse warning: " << message);
126  return false;
127  }
128 
129  bool start_element(const string& name, const attrs_type& attrs)
130  {
131  if ( !m_Path.empty() ) {
132  m_Path += "/";
133  }
134  m_Path += name;
135  if (! StartElement(name)) {
136  return false;
137  }
138  return true;
139  }
140 
141  bool end_element(const string& name)
142  {
143  bool bail = false;
144  if (! EndElement(name)) {
145  bail = true;
146  }
147  string::size_type pos = m_Path.find_last_of("/");
148  if (pos != string::npos) {
149  m_Path.erase(pos);
150  }
151  if (bail) {
152  return false;
153  }
154  return true;
155  }
156 
157  bool text(const string& contents)
158  {
159  if (contents.empty()) {
160  return true;
161  }
162 
163  bool empty_text = true;
164  int i;
165  for (i = 0; i < contents.length(); i++) {
166  if (contents [i] != ' ' &&
167  contents [i] != '\n' &&
168  contents [i] != '\r' &&
169  contents [i] != '\t') {
170  empty_text = false;
171  }
172  }
173  if (empty_text) {
174  return true;
175  }
176 
177  if (! Text(contents)) {
178  return false;
179  }
180 
181  return true;
182  }
183 protected:
184  string m_Host;
185  string m_Url;
186  string m_Path;
187 };
188 
189 
191 {
192 public:
193  CESearchGeneric(vector<int>& uids)
194  : m_Uids(uids)
195  {
196  m_Host = "eutils.ncbi.nlm.nih.gov";
197  m_Url = "/entrez/eutils/esearch.fcgi";
198  }
199 
200 protected:
201  bool Text (const string& contents)
202  {
203  if (PathSuffixIs("/IdList/Id")) {
204  m_Uids.push_back(NStr::StringToInt(contents));
205  }
206  return true;
207  }
208 
209  bool Succeeded ()
210  {
211  if (m_Uids.size() > 0) return true;
212  return false;
213  }
214 
215 private:
216  vector<int>& m_Uids;
217 };
218 
220 {
221 public:
222  CESummaryGeneric(vector<string>& strs)
223  : m_Strs(strs)
224  {
225  m_Host = "eutils.ncbi.nlm.nih.gov";
226  m_Url = "/entrez/eutils/esummary.fcgi";
227  }
228 
229 protected:
230  bool StartElement (const string &name)
231  {
232  if (PathSuffixIs("/DocumentSummary")) {
233  m_Iso.clear();
234  m_Title.clear();
235  m_Issn.clear();
236  }
237  return true;
238  }
239 
240  bool EndElement (const string &name)
241  {
242  if (! PathSuffixIs("/DocumentSummary")) return true;
243 
244  if (m_Iso.empty()) return true;
245 
246  if (! m_Title.empty() && ! m_Issn.empty()) {
247  m_Strs.push_back(m_Iso + "||(" + m_Title + ":" + m_Issn + ")");
248  } else if (! m_Title.empty()) {
249  m_Strs.push_back(m_Iso + "||(" + m_Title + ")");
250  } else if (! m_Issn.empty()) {
251  m_Strs.push_back(m_Iso + "||(" + m_Issn + ")");
252  } else {
253  m_Strs.push_back(m_Iso);
254  }
255 
256  return true;
257  }
258 
259  bool Text (const string& contents)
260  {
261  if (PathSuffixIs("/DocumentSummary/ISOAbbreviation")) {
262  if (m_Iso.empty()) {
263  m_Iso = contents;
264  }
265  }
266  if (PathSuffixIs("/TitleMain/Title")) {
267  if (m_Title.empty()) {
268  m_Title = contents;
269  }
270  }
271  if (PathSuffixIs("/ISSNInfo/issn")) {
272  if (m_Issn.empty()) {
273  m_Issn = contents;
274  }
275  }
276  return true;
277  }
278 
279  bool Succeeded ()
280  {
281  if (m_Strs.size() > 0) return true;
282  return false;
283  }
284 
285 private:
286  string m_Iso;
287  string m_Title;
288  string m_Issn;
289 
290 private:
291  vector<string>& m_Strs;
292 };
293 
// Byte-to-byte translation table, indexed by an unsigned character value.
// As the entries show: ASCII letters map to lower case; digits and the
// characters ' , / @ ` map to themselves; every other printable ASCII
// character and DEL map to a space (0x20); bytes 0x80-0xFF map to themselves.
// Among the control characters, 0x00, 0x01 and 0x1F keep their value and the
// rest become a space.
static unsigned char _ToKey[256] = {
    /* 0x00 - 0x07 */
    0x00, 0x01, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x08 - 0x0F */
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x10 - 0x17 */
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x18 - 0x1F */
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x1F,
    /*  sp    !     "     #     $     %     &     '  */
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x27,
    /*  (     )     *     +     ,     -     .     /  */
    0x20, 0x20, 0x20, 0x20, 0x2C, 0x20, 0x20, 0x2F,
    /*  0     1     2     3     4     5     6     7  */
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
    /*  8     9     :     ;     <     =     >     ?  */
    0x38, 0x39, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /*  @     A     B     C     D     E     F     G  */
    0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    /*  H     I     J     K     L     M     N     O  */
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    /*  P     Q     R     S     T     U     V     W  */
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    /*  X     Y     Z     [     \     ]     ^     _  */
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    /*  `     a     b     c     d     e     f     g  */
    0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    /*  h     i     j     k     l     m     n     o  */
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    /*  p     q     r     s     t     u     v     w  */
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    /*  x     y     z     {     |     }     ~    DEL */
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x80 - 0x87 */
    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
    /* 0x88 - 0x8F */
    0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
    /* 0x90 - 0x97 */
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
    /* 0x98 - 0x9F */
    0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
    /* 0xA0 - 0xA7 */
    0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
    /* 0xA8 - 0xAF */
    0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
    /* 0xB0 - 0xB7 */
    0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7,
    /* 0xB8 - 0xBF */
    0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
    /* 0xC0 - 0xC7 */
    0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
    /* 0xC8 - 0xCF */
    0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
    /* 0xD0 - 0xD7 */
    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
    /* 0xD8 - 0xDF */
    0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
    /* 0xE0 - 0xE7 */
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    /* 0xE8 - 0xEF */
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    /* 0xF0 - 0xF7 */
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
    /* 0xF8 - 0xFF */
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF
};
359 
361 {
362 public:
364  {
365  }
366 
367 protected:
368  bool DoOneESearch (string db, string term, string field, vector<int>& uids)
369 
370  {
371  string query;
372 
373  query += "db=" + NStr::URLEncode(db);
374  query += "&term=" + NStr::URLEncode(term);
375  query += NStr::URLEncode(field);
376  query += "&retmode=xml&retmax=200";
377 
378  uids.clear();
379 
380  CESearchGeneric parser(uids);
381  return parser.TryQuery (query);
382  }
383 
384  bool DoOneESummary (string db, vector<int>& uids, vector<string>& strs)
385 
386  {
387  string query;
388 
389  query += "db=" + NStr::URLEncode(db);
390  query += "&retmax=200&version=2.0&id=";
391  for (size_t i = 0; i < uids.size(); i++) {
392  if (i > 0) {
393  query += ",";
394  }
395  query += NStr::IntToString(uids [i]);
396  }
397 
398  strs.clear();
399 
400  CESummaryGeneric parser(strs);
401  return parser.TryQuery (query);
402  }
403 
/// Checks whether @p str has the shape of an ISSN: exactly nine characters,
/// four digits, a separator ('-', '+' or ' '), three more digits and a final
/// digit or 'X'/'x'.  Only the shape is checked; no checksum is verified.
/// @param str  Candidate text.
/// @return true when the text matches that pattern.
bool LooksLikeISSN (const std::string& str)
{
    if (str.length() != 9) return false;

    const char separator = str [4];
    if (separator != '-' && separator != ' ' && separator != '+') return false;

    for (size_t i = 0; i < 9; ++i) {
        if (i == 4) continue;                    // separator already validated
        const char ch = str [i];
        if (ch >= '0' && ch <= '9') continue;
        if (i == 8 && (ch == 'X' || ch == 'x')) continue;   // check character
        return false;
    }

    return true;
}
426 
427 public:
428  bool DoJournalSearch (string journal, vector<int>& uids)
429 
430  {
431  for (size_t i = 0; i < journal.size(); i++) {
432  journal [i] = _ToKey [(int) (unsigned char) journal [i]];
433  }
434 
435  if (LooksLikeISSN (journal)) {
436  if (journal [4] == '+' || journal [4] == ' ') {
437  journal [4] = '-';
438  }
439  if (journal [8] == 'x') {
440  journal [8] = 'X';
441  }
442  if (DoOneESearch ("nlmcatalog", journal, "[issn]", uids)) {
443  return true;
444  }
445  }
446 
447  if (DoOneESearch ("nlmcatalog", journal, "[multi] AND ncbijournals[sb]", uids)) {
448  return true;
449  }
450 
451  if (DoOneESearch ("nlmcatalog", journal, "[jour]", uids)) {
452  return true;
453  }
454 
455  return false;
456  }
457 
458  bool DoJournalSummary (vector<int>& uids, vector<string>& strs)
459 
460  {
461  if (DoOneESummary ("nlmcatalog", uids, strs)) {
462  return true;
463  }
464 
465  return false;
466  }
467 };
468 
469 // ============================================================================
471 // ============================================================================
472  : public CScopedProcess
473 {
474 public:
475  // ------------------------------------------------------------------------
477  // ------------------------------------------------------------------------
478  : CScopedProcess()
479  , m_out (0)
480  {};
481 
482  // ------------------------------------------------------------------------
484  // ------------------------------------------------------------------------
485  {
486  };
487 
488  // ------------------------------------------------------------------------
490  const CArgs& args )
491  // ------------------------------------------------------------------------
492  {
494 
495  m_out = args["o"] ? &(args["o"].AsOutputFile()) : &cout;
496  m_journal = args["journal"].AsString();
497  };
498 
499  // ------------------------------------------------------------------------
501  // ------------------------------------------------------------------------
502  {
503  }
504 
505  // ------------------------------------------------------------------------
506  virtual void SeqEntryInitialize(
507  CRef<CSeq_entry>& se )
508  // ------------------------------------------------------------------------
509  {
511  };
512 
513  /*
514  string s_LookupJournals(const string& title)
515 
516  {
517  if (title.empty()) {
518  return "";
519  }
520 
521  string tmp = title;
522  NStr::ToLower(tmp);
523 
524  string qry;
525  if (NStr::StartsWith(title, "the ")) {
526  NStr::TrimPrefixInPlace(tmp, "the ");
527  qry = tmp + " [MULT] OR the " + tmp + " [MULT]";
528  } else if (NStr::StartsWith(title, "journal ")) {
529  qry = tmp + " [MULT] OR the " + tmp + " [MULT]";
530  } else {
531  qry = tmp + " [MULT]";
532  }
533 
534  string sch = ncbi::edirect::Execute("esearch", { "-db", "nlmcatalog", "-query", qry } );
535  string xml = ncbi::edirect::Execute("efetch", { "-format", "docsum" }, sch);
536  string res = ncbi::edirect::Execute("xtract", { "-pattern", "DocumentSummary", "-element", "MedlineTA" }, xml);
537 
538  return res;
539  }
540  */
541 
542  // ------------------------------------------------------------------------
544  // ------------------------------------------------------------------------
545  {
546  /*
547  vector<int> uids;
548  vector<string> strs;
549  CEJournalSearch searcher;
550  if (searcher.DoJournalSearch (m_journal, uids)) {
551  if (searcher.DoJournalSummary (uids, strs)) {
552  // cout << "Success" << NcbiEndl;
553  // cout << "Vector of " + NStr::IntToString((int) strs.size()) + " strings" << NcbiEndl;
554  for (int i = 0; i < strs.size(); i++) {
555  cout << strs [i] << NcbiEndl;
556  }
557  }
558  }
559  // cout << "Vector of " + NStr::IntToString((int) uids.size()) + " elements" << NcbiEndl;
560  */
561 
562  /*
563  struct timeval start_time, end_time;
564  long milli_time, seconds, useconds;
565  gettimeofday(&start_time, NULL);
566 
567  // string jrs = s_LookupJournals(m_journal);
568 
569  gettimeofday(&end_time, NULL);
570  seconds = end_time.tv_sec - start_time.tv_sec; //seconds
571  useconds = end_time.tv_usec - start_time.tv_usec; //microseconds
572  milli_time = ((seconds) * 1000 + useconds/1000.0);
573 
574  // cout << endl << jrs << endl;
575 
576  cout << "Elapsed time: " << milli_time <<" milliseconds" << endl;
577  */
578  };
579 
580 protected:
582  string m_journal;
583 };
584 
585 #endif
CArgs –.
Definition: ncbiargs.hpp:379
This stream exchanges data with an HTTP server located at the URL: http[s]://host[:port]/path[?...
bool LooksLikeISSN(string str)
bool DoOneESummary(string db, vector< int > &uids, vector< string > &strs)
bool DoJournalSearch(string journal, vector< int > &uids)
bool DoOneESearch(string db, string term, string field, vector< int > &uids)
bool DoJournalSummary(vector< int > &uids, vector< string > &strs)
vector< int > & m_Uids
bool Text(const string &contents)
CESearchGeneric(vector< int > &uids)
vector< string > & m_Strs
bool Text(const string &contents)
bool EndElement(const string &name)
bool StartElement(const string &name)
CESummaryGeneric(vector< string > &strs)
virtual bool EndElement(const std::string &name)
virtual bool Succeeded()
bool error(const string &message)
bool start_element(const string &name, const attrs_type &attrs)
bool PathSuffixIs(const char *suffix)
bool TryQuery(string query)
virtual bool StartElement(const std::string &name)
bool warning(const string &message)
bool end_element(const string &name)
virtual bool Text(const std::string &contents)
bool text(const string &contents)
virtual void SeqEntryInitialize(CRef< CSeq_entry > &se)
void ProcessInitialize(const CArgs &args)
CNcbiOstream * m_out
void ProcessInitialize(const CArgs &args)
virtual void SeqEntryInitialize(CRef< CSeq_entry > &se)
The xml::error_messages class is used to store all the error messages that are collected while parsing...
Definition: errors.hpp:137
bool has_fatal_errors(void) const
Check if there are fatal errors in the error messages.
Definition: errors.cpp:126
bool has_errors(void) const
Check if there are errors in the error messages.
Definition: errors.cpp:122
std::string print(void) const
Convert error messages into a single printable string.
Definition: errors.cpp:130
The xml::event_parser is used to parse an XML document by calling member functions when certain thing...
std::map< std::string, std::string > attrs_type
a type for holding XML node attributes
bool parse_stream(std::istream &stream, error_messages *messages, warnings_as_errors_type how=type_warnings_not_errors)
Parse whatever data can be read from the given stream.
Include a standard set of the NCBI C++ Toolkit most basic headers.
This file contains the definition of the xml::event_parser class.
static const char * str(char *buf, int n)
Definition: stats.c:84
string
Definition: cgiapp.hpp:690
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1191
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5078
static string URLEncode(const CTempString str, EUrlEncode flag=eUrlEnc_SkipMarkChars)
URL-encode string.
Definition: ncbistr.cpp:6053
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
void SleepSec(unsigned long sec, EInterruptOnSignal onsignal=eRestartOnSignal)
Sleep.
static unsigned char _ToKey[256]
CRef< CPub > journal(ParserPtr pp, char *bptr, char *eptr, CRef< CAuth_list > &auth_list, CRef< CTitle::C_E > &title, bool has_muid, CRef< CCit_art > &cit_art, Int4 er)
Definition: ref.cpp:1457
static string query
Modified on Fri Sep 20 14:57:55 2024 by modify_doxy.py rev. 669887