NCBI C++ ToolKit
pubmed_citmatch.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: pubmed_citmatch.cpp 101369 2023-12-06 17:53:25Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Vitaly Stakhovsky, NCBI
27  *
28  * File Description:
29  * PubMed Citation Match test application using EUtils
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbiapp.hpp>
35 
36 #include <objects/pub/Pub.hpp>
37 
39 
43 
44 namespace
45 {
47  void dump_cm_list(CNcbiOstream& os, const vector<SCitMatch>& cm_list)
48  {
49  for (const auto& cm : cm_list) {
50  if (! cm.Journal.empty()) {
51  os << "Journal=" << cm.Journal << endl;
52  }
53  if (! cm.Volume.empty()) {
54  os << "Volume=" << cm.Volume << endl;
55  }
56  if (! cm.Page.empty()) {
57  os << "Page=" << cm.Page << endl;
58  }
59  if (! cm.Year.empty()) {
60  os << "Year=" << cm.Year << endl;
61  }
62  if (! cm.Author.empty()) {
63  os << "Author=" << cm.Author << endl;
64  }
65  if (! cm.Issue.empty()) {
66  os << "Issue=" << cm.Issue << endl;
67  }
68  if (! cm.Title.empty()) {
69  os << "Title=" << cm.Title << endl;
70  }
71  if (cm.InPress) {
72  os << "InPress=" << cm.InPress << endl;
73  }
74  os << endl;
75  }
76  }
77 
78  void read_cm_list(CNcbiIstream& is, vector<SCitMatch>& cm_list)
79  {
80  string line;
81  unique_ptr<SCitMatch> cm;
82 
83  while (NcbiGetlineEOL(is, line)) {
84  NStr::Sanitize(line);
85  if (line.empty()) {
86  if (cm) {
87  cm_list.push_back(*cm);
88  cm.reset();
89  } else {
90  break;
91  }
92  } else {
93  string key, val;
94  NStr::SplitInTwo(line, "=", key, val);
95 
96  if (! val.empty()) {
97  if (NStr::EqualNocase(key, "Journal")) {
98  if (! cm)
99  cm.reset(new SCitMatch);
100  cm->Journal = val;
101  } else if (NStr::EqualNocase(key, "Volume")) {
102  if (! cm)
103  cm.reset(new SCitMatch);
104  cm->Volume = val;
105  } else if (NStr::EqualNocase(key, "Page")) {
106  if (! cm)
107  cm.reset(new SCitMatch);
108  cm->Page = val;
109  } else if (NStr::EqualNocase(key, "Year")) {
110  if (! cm)
111  cm.reset(new SCitMatch);
112  cm->Year = val;
113  } else if (NStr::EqualNocase(key, "Author")) {
114  if (! cm)
115  cm.reset(new SCitMatch);
116  cm->Author = val;
117  } else if (NStr::EqualNocase(key, "Issue")) {
118  if (! cm)
119  cm.reset(new SCitMatch);
120  cm->Issue = val;
121  } else if (NStr::EqualNocase(key, "Title")) {
122  if (! cm)
123  cm.reset(new SCitMatch);
124  cm->Title = val;
125  } else if (NStr::EqualNocase(key, "InPress")) {
126  if (! cm)
127  cm.reset(new SCitMatch);
128  cm->InPress = NStr::EqualNocase(key, "true");
129  }
130  }
131  }
132  }
133  _ASSERT(! cm);
134  }
135 }
136 
138 {
139 public:
141  {
143  }
144 
145  void Init() override
146  {
147  unique_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
148  arg_desc->SetUsageContext("", "Match publications from PubMed and print Ids");
149  arg_desc->AddKey("i", "InFile", "Data to match", CArgDescriptions::eInputFile);
150  arg_desc->AddOptionalKey("pubmed", "source", "Always eutils", CArgDescriptions::eString, CArgDescriptions::fHidden);
151  arg_desc->AddOptionalKey("url", "url", "eutils base URL (http://eutils.ncbi.nlm.nih.gov/entrez/eutils/ by default)", CArgDescriptions::eString);
152  arg_desc->AddOptionalKey("o", "OutFile", "Output File", CArgDescriptions::eOutputFile);
153  arg_desc->AddFlag("stats", "Also print execution statistics");
154  SetupArgDescriptions(arg_desc.release());
155  }
156 
157  int Run() override
158  {
159  const CArgs& args = GetArgs();
160 
161  vector<SCitMatch> cm_list;
162  CNcbiIstream& is = args["i"].AsInputFile();
163  read_cm_list(is, cm_list);
164 
165  if (cm_list.empty()) {
166  cerr << "Warning: No input data" << endl;
167  return 0;
168  }
169 
170  if (args["url"]) {
171  string url = args["url"].AsString();
173  }
174 
175  ostream* output = nullptr;
176  if (args["o"]) {
177  output = &args["o"].AsOutputFile();
178  } else {
179  output = &NcbiCout;
180  }
181 
182  unique_ptr<CEUtilsUpdater> upd(new CEUtilsUpdater());
183 
184  bool bstats = args["stats"];
185  unsigned nruns = 0;
186  unsigned ngood = 0;
187  vector<string> results;
188  results.reserve(cm_list.size());
189  CStopWatch sw;
190 
191  sw.Start();
192  for (const SCitMatch& cm : cm_list) {
193  ++nruns;
194  try {
195  EPubmedError err;
196  TEntrezId pmid = upd->CitMatch(cm, &err);
197  if (pmid != ZERO_ENTREZ_ID) {
198  results.push_back(to_string(ENTREZ_ID_TO(TIntId, pmid)));
199  ++ngood;
200  } else {
201  ostringstream oss;
202  oss << "Error: " << err;
203  results.push_back(oss.str());
204  }
205  } catch (const CException& e) {
206  results.push_back(e.what());
207  }
208  }
209  sw.Stop();
210 
211  for (const auto& r : results) {
212  *output << r << endl;
213  }
214 
215  if (bstats) {
216  *output << " * Number of runs: " << nruns << endl;
217  *output << " * successful: " << ngood << endl;
218  *output << " * Elapsed time: " << sw << endl;
219  }
220 
221  if (args["o"]) {
222  args["o"].CloseFile();
223  }
224 
225  return 0;
226  }
227 };
228 
229 int main(int argc, const char* argv[])
230 {
231  return CPubmedFetchApplication().AppMain(argc, argv);
232 }
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgs –.
Definition: ncbiargs.hpp:379
int Run() override
Run the application.
void Init() override
Initialize the application.
CStopWatch –.
Definition: ncbitime.hpp:1938
CVersionInfo –.
EPubmedError
SStrictId_Entrez::TId TEntrezId
TEntrezId type for entrez ids which require the same strictness as TGi.
Definition: ncbimisc.hpp:1041
#define ENTREZ_ID_TO(T, entrez_id)
Definition: ncbimisc.hpp:1097
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:285
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
Definition: ncbiapp.cpp:799
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
Definition: ncbiapp.cpp:1175
Int8 TIntId
Definition: ncbimisc.hpp:999
void SetVersion(const CVersionInfo &version)
Set the version number for the program.
Definition: ncbiapp.cpp:1135
#define ZERO_ENTREZ_ID
Definition: ncbimisc.hpp:1102
@ fHidden
Hide it in Usage.
Definition: ncbiargs.hpp:662
@ eInputFile
Name of file (must exist and be readable)
Definition: ncbiargs.hpp:595
@ eString
An arbitrary string.
Definition: ncbiargs.hpp:589
@ eOutputFile
Name of file (must be writable)
Definition: ncbiargs.hpp:596
static void SetBaseURL(const string &url)
Set new base url for all e-utils requests.
Definition: eutils.cpp:137
virtual const char * what(void) const noexcept
Standard report (includes full backlog).
Definition: ncbiexpt.cpp:342
#define NCBI_UNUSED
CNcbiIstream & NcbiGetlineEOL(CNcbiIstream &is, string &str, string::size_type *count=NULL)
Read from "is" to "str" the next line (taking into account platform specifics of End-of-Line)
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
#define NcbiCout
Definition: ncbistre.hpp:543
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
static bool SplitInTwo(const CTempString str, const CTempString delim, string &str1, string &str2, TSplitFlags flags=0)
Split a string into two pieces using the specified delimiters.
Definition: ncbistr.cpp:3550
static string Sanitize(CTempString str, TSS_Flags flags=fSS_print)
Sanitize a string, allowing only specified classes of characters.
Definition: ncbistr.hpp:2876
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5353
void Stop(void)
Suspend the timer.
Definition: ncbitime.hpp:2793
void Start(void)
Start the timer.
Definition: ncbitime.hpp:2765
static CStopWatch sw
Definition: fix_pub.hpp:45
const struct ncbi::grid::netcache::search::fields::KEY key
#define NCBI_SC_VERSION_PROXY
#define NCBI_TEAMCITY_BUILD_NUMBER_PROXY
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
static SQLCHAR output[256]
Definition: print.c:5
USING_SCOPE(objects)
int main(int argc, const char *argv[])
USING_NCBI_SCOPE
#define _ASSERT
Modified on Sat Mar 02 10:53:00 2024 by modify_doxy.py rev. 669887