NCBI C++ ToolKit
unpub_report.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /*
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Alexey Dobronadezhdin, Vitaly Stakhovsky
27  *
28  * File Description:
29  *
30  * ===========================================================================
31  */
32 #include <ncbi_pch.hpp>
33 
34 #include <iostream>
35 
36 #include <corelib/ncbistr.hpp>
37 
38 #include <objects/pub/Pub.hpp>
39 
43 
50 
53 
57 
62 
64 
65 #include "utils.hpp"
66 #include "unpub_report.hpp"
67 
68 
69 /////////////////////////////////////////////////
70 // CUnpublishedReport
71 
72 namespace pub_report
73 {
74 
75 using namespace std;
76 
// Removes every space, period and comma from `initials`, in place
// (for example "J. K." becomes "JK").
static void ProcessInitials(string& initials)
{
    string::size_type kept = 0;

    for (string::size_type pos = 0; pos < initials.size(); ++pos) {
        const char ch = initials[pos];
        switch (ch) {
        case ' ':
        case '.':
        case ',':
            break; // separator: drop it
        default:
            initials[kept++] = ch;
            break;
        }
    }

    initials.resize(kept);
}
89 
90 static void GetAuthorsFromList(list<string>& authors, const CAuth_list& auth_list)
91 {
92  if (auth_list.IsSetNames()) {
93  const CAuth_list::C_Names& names = auth_list.GetNames();
94  if (names.IsStd()) {
95  for (const CRef<CAuthor>& auth : names.GetStd()) {
96  if (auth->IsSetName()) {
97  string cur_auth, cur_initials;
98  const CPerson_id& person = auth->GetName();
99  if (person.IsName()) {
100  const CName_std& std_name = person.GetName();
101  if (std_name.IsSetLast()) {
102  cur_auth = std_name.GetLast();
103  if (std_name.IsSetInitials()) {
104  cur_initials = std_name.GetInitials();
105  ProcessInitials(cur_initials);
106  }
107  }
108  } else if (person.IsStr()) {
109  cur_auth = person.GetStr();
110  } else if (person.IsMl()) {
111  cur_auth = person.GetMl();
112  }
113 
114  if (! cur_auth.empty()) {
115  if (! cur_initials.empty()) {
116  cur_auth += ' ' + cur_initials;
117  }
118  authors.push_back(cur_auth);
119  }
120  }
121  }
122  } else {
123  for (const string& auth : names.IsStr() ? names.GetStr() : names.GetMl()) {
124  if (! auth.empty()) {
125  authors.push_back(auth);
126  }
127  }
128  }
129  }
130 }
131 
132 
133 /////////////////////////////////////////////////
134 // CPubData
135 class CPubData
136 {
137 public:
139 
141  m_title_words_set(false),
142  m_full_title_set(false)
143  {
144  }
145 
146  const list<string>& GetAuthors() const
147  {
148  return m_authors;
149  }
150 
151  const TSeqIds& GetSeqIds() const
152  {
153  return m_seq_ids;
154  }
155 
156  const list<string>& GetTitleWords() const
157  {
158  if (! m_title_words_set) {
159  CreateTitleWords();
160  m_title_words_set = true;
161  }
162  return m_titlewords;
163  }
164 
165  const string& GetFullTitle() const
166  {
167  if (! m_full_title_set) {
168  CreateFullTitle();
169  m_full_title_set = true;
170  }
171  return m_full_title;
172  }
173 
174  const string& GetTitle() const
175  {
176  return m_title;
177  }
178 
179  const string& GetJournal() const
180  {
181  return m_journal;
182  }
183 
184  const string& GetUnique() const
185  {
186  return m_unique;
187  }
188 
189  const string& GetVolume() const
190  {
191  return m_volume;
192  }
193 
194  const string& GetPages() const
195  {
196  return m_pages;
197  }
198 
199  void SetAuthors(const CAuth_list& auth_list)
200  {
201  m_authors.clear();
202  GetAuthorsFromList(m_authors, auth_list);
203  }
204 
205  void SetTitle(const string& title)
206  {
207  m_title = title;
208  m_title_words_set = false;
209  m_full_title_set = false;
210 
211  m_full_title.clear();
212  m_titlewords.clear();
213  }
214 
215  void SetJournal(const string& journal)
216  {
217  m_journal = journal;
218  }
219 
220  void SetUnique(const string& unique)
221  {
222  m_unique = unique;
223  }
224 
225  void SetDate(const CDate_std& date)
226  {
227  if (date.IsSetYear() && date.IsSetMonth()) {
228  m_date.Reset(new CDate_std);
229  m_date->Assign(date);
230  }
231  }
232 
233  bool IsSetDate() const
234  {
235  return m_date.NotEmpty();
236  }
237 
238  int GetYear() const
239  {
240  if (IsSetDate() && m_date->IsSetYear()) {
241  return m_date->GetYear();
242  }
243  return 0;
244  }
245 
246  int GetMonth() const
247  {
248  if (IsSetDate() && m_date->IsSetMonth()) {
249  return m_date->GetMonth();
250  }
251  return 0;
252  }
253 
254  void AddSeqId(const string& seq_id)
255  {
256  m_seq_ids.insert(seq_id);
257  }
258 
259  void SetVolume(const string& volume)
260  {
261  m_volume = volume;
262  }
263 
264  void SetPages(const string& pages)
265  {
266  m_pages = pages;
267  }
268 
269 private:
270  list<string> m_authors;
272 
273  mutable list<string> m_titlewords;
274 
275  string m_journal;
276  string m_title;
277  string m_unique;
278 
279  mutable string m_full_title;
281 
282  mutable bool m_title_words_set,
284 
285  string m_volume,
287 
288  void CreateTitleWords() const
289  {
290  string title = NStr::Sanitize(m_title);
291  NStr::ReplaceInPlace(title, ",", " ");
292  NStr::ReplaceInPlace(title, ":", " ");
293  NStr::ReplaceInPlace(title, "(", " ");
294  NStr::ReplaceInPlace(title, ")", " ");
295  NStr::ReplaceInPlace(title, "-", " ");
296  NStr::ReplaceInPlace(title, ".", " ");
297 
298  NStr::Split(title, " ", m_titlewords, NStr::fSplit_Tokenize);
299  }
300 
301  void CreateFullTitle() const
302  {
303  m_full_title = NStr::Sanitize(m_title);
304 
305  if (! m_full_title.empty() && m_full_title.front() == '[' && m_full_title.back() == ']') {
306  m_full_title = m_full_title.substr(1, m_full_title.size() - 2);
307  m_full_title = NStr::Sanitize(m_full_title);
308  }
309 
310  if (m_full_title.back() == '.')
311  m_full_title.pop_back();
312  }
313 };
314 
315 /////////////////////////////////////////////////
316 
317 
318 CUnpublishedReport::CUnpublishedReport(CNcbiOstream& out, int max_date_check, bool nohydra) :
319  m_out(out),
320  m_max_date_check(max_date_check),
321  m_nohydra(nohydra)
322 {
323 }
324 
325 static void CollectDataGen(const CCit_gen& cit, CPubData& data)
326 {
327  if (cit.IsSetAuthors()) {
328  data.SetAuthors(cit.GetAuthors());
329  }
330 
331  if (cit.IsSetJournal()) {
332  data.SetJournal(GetBestTitle(cit.GetJournal()));
333  }
334 
335  if (cit.IsSetTitle()) {
336  data.SetTitle(cit.GetTitle());
337  }
338 
339  if (cit.IsSetDate() && cit.GetDate().IsStd()) {
340  data.SetDate(cit.GetDate().GetStd());
341  }
342 }
343 
344 static void CollectDataArt(const CCit_art& cit, CPubData& data)
345 {
346  NCBI_ASSERT(cit.IsSetFrom() && cit.GetFrom().IsJournal(), "cit should be a journal");
347  const CCit_jour& from_journal = cit.GetFrom().GetJournal();
348 
349  NCBI_ASSERT(from_journal.IsSetImp(), "Imprint should be set");
350  const CImprint& imprint = from_journal.GetImp();
351 
352  if (cit.IsSetAuthors()) {
353  data.SetAuthors(cit.GetAuthors());
354  }
355 
356  if (from_journal.IsSetTitle()) {
357  data.SetJournal(GetBestTitle(from_journal.GetTitle()));
358  }
359 
360  if (cit.IsSetTitle() && cit.GetTitle().IsSet()) {
361  for (const auto& cur_title : cit.GetTitle().Get()) {
362  if (cur_title->IsName()) {
363  data.SetTitle(cur_title->GetName());
364  break;
365  }
366  }
367  }
368 
369  if (imprint.IsSetDate() && imprint.GetDate().IsStd()) {
370  data.SetDate(imprint.GetDate().GetStd());
371  }
372 
373  if (imprint.IsSetVolume()) {
374  data.SetVolume(imprint.GetVolume());
375  }
376 
377  if (imprint.IsSetPages()) {
378  data.SetPages(imprint.GetPages());
379  }
380 }
381 
// Fills `data` from `pub`. A Cit-gen is handled by CollectDataGen().
// The statement of the article branch and the statement that fills `label`
// are not visible in this listing.
// NOTE(review): presumably CollectDataArt() and CPub::GetLabel() with
// fLabel_Unique -- confirm against the repository.
382 static void CollectData(const CPub& pub, CPubData& data)
383 {
384  if (pub.IsGen()) {
385  CollectDataGen(pub.GetGen(), data);
386  } else if (pub.IsArticle()) {
388  }
389 
390  string label;
// Whatever ends up in `label` is stored as the publication's "unique" string.
392  data.SetUnique(label);
393 }
394 
402  eLastValue // should always be the last enum item
403 };
404 
405 static AuthorNameMatch CompareAuthorNames(string first, string second)
406 {
407  if (NStr::EqualNocase(first, second)) {
408  return eFullMatch;
409  }
410 
411  auto pred = [](char c) { return c == '-'; };
412  first.erase(remove_if(first.begin(), first.end(), pred), first.end());
413  second.erase(remove_if(second.begin(), second.end(), pred), second.end());
414 
415  if (NStr::EqualNocase(first, second)) {
416  return eNoHyphenMatch;
417  }
418 
419  size_t space_pos_first = first.find(' ');
420  if (space_pos_first != string::npos && space_pos_first + 2 < first.size()) {
421  first.resize(space_pos_first + 3);
422  }
423 
424  size_t space_pos_second = second.find(' ');
425  if (space_pos_second != string::npos && space_pos_second + 2 < second.size()) {
426  second.resize(space_pos_second + 3);
427  }
428 
429  if (NStr::EqualNocase(first, second)) {
430  return eTwoInitialsMatch;
431  }
432 
433  if (space_pos_first != string::npos && space_pos_first + 1 < first.size()) {
434  first.resize(space_pos_first + 2);
435  }
436  if (space_pos_second != string::npos && space_pos_second + 1 < second.size()) {
437  second.resize(space_pos_second + 2);
438  }
439 
440  if (NStr::EqualNocase(first, second)) {
441  return eOneInitialMatch;
442  }
443 
444  if (space_pos_first != string::npos) {
445  first.resize(space_pos_first);
446  }
447  if (space_pos_second != string::npos) {
448  second.resize(space_pos_second);
449  }
450 
451  if (NStr::EqualNocase(first, second)) {
452  return eLastNameMatch;
453  }
454 
455  return eNoMatch;
456 }
457 
// Compares two author lists position by position with CompareAuthorNames().
// Lists of different length never match. Otherwise the result is the lowest
// grade found among the pairs; the scan stops as soon as a pair yields
// eNoMatch.
// NOTE(review): the declaration of `ret` is on a line that is missing from
// this listing, so its initial value cannot be seen here.
458 static AuthorNameMatch CompareAuthors(const list<string>& first, const list<string>& second)
459 {
460  if (first.size() != second.size()) {
461  return eNoMatch;
462  }
463 
464  auto first_it = first.begin(),
465  second_it = second.begin();
466 
// Sizes are equal, so testing first_it alone also bounds second_it.
468  for (; ret != eNoMatch && first_it != first.end(); ++first_it, ++second_it) {
469  AuthorNameMatch cur = CompareAuthorNames(*first_it, *second_it);
470  if (cur < ret) {
471  ret = cur;
472  }
473  }
474 
475  return ret;
476 }
477 
479 {
// Registers `pub` for the report (the signature line is missing from this
// listing). Data is collected into a new CPubData and compared with the
// publications already stored in m_pubs.
480  const string& cur_seq_id = GetCurrentSeqId();
481 
482  shared_ptr<CPubData> data(new CPubData);
483  CollectData(pub, *data);
484 
485  bool need_to_add = true;
486  for (const auto& cur_pub : m_pubs) {
// A stored publication counts as the same one when the author lists match
// fully and both the full title and the unique string are equal (ignoring case).
487  AuthorNameMatch match = CompareAuthors(cur_pub->GetAuthors(), data->GetAuthors());
488  if (match == eFullMatch &&
489  NStr::EqualNocase(cur_pub->GetFullTitle(), data->GetFullTitle()) &&
490  NStr::EqualNocase(cur_pub->GetUnique(), data->GetUnique())) {
491 
// Known publication: attach the current sequence id, or queue the
// publication until an id becomes available.
492  if (! cur_seq_id.empty()) {
493  cur_pub->AddSeqId(cur_seq_id);
494  } else {
495  m_pubs_need_id.push_back(cur_pub);
496  }
497 
498  need_to_add = false;
499  break;
500  }
501  }
502 
503  if (need_to_add) {
// New publication: store it and queue it for a sequence id and/or a date
// if either is still missing.
504  m_pubs.push_back(data);
505 
506  if (cur_seq_id.empty()) {
507  m_pubs_need_id.push_back(data);
508  } else {
509  data->AddSeqId(cur_seq_id);
510  }
511 
512  if (! data->IsSetDate()) {
513  m_pubs_need_date.push_back(data);
514  }
515  }
516 }
517 
519 {
// Returns the E-Utils client held in m_eutils, creating it on first use
// (the signature line is missing from this listing).
520  if (! m_eutils) {
521  m_eutils.reset(new CEutilsClient);
522  }
523  return *m_eutils;
524 }
525 
// Reduces an author name to everything up to and including its last space
// plus the single character that follows it, e.g. "Smith JK" -> "Smith J".
// A name without any space is copied unchanged. The result goes to `name`.
static void GetOneInitialAuthorName(const string& author, string& name)
{
    const size_t last_space = author.rfind(' ');
    if (last_space == string::npos) {
        name = author;
        return;
    }

    const size_t initial_pos = last_space + 1;
    name.assign(author, 0, initial_pos);
    if (initial_pos < author.size()) {
        name.push_back(author[initial_pos]);
    }
}
538 
539 static void GetNameFromStdName(const CPerson_id& person, string& name)
540 {
541  if (person.IsName()) {
542  if (person.GetName().IsSetLast()) {
543  name = person.GetName().GetLast();
544 
545  if (person.GetName().IsSetInitials()) {
546  name += ' ' + person.GetName().GetInitials();
547  }
548  }
549  } else if (person.IsMl()) {
550  name = person.GetMl();
551  } else if (person.IsStr()) {
552  name = person.GetStr();
553  }
554 }
555 
556 static bool FirstOrLastAuthorMatches(const list<string>& authors, const CAuth_list::C_Names& pubmed_authors)
557 {
558  string first_author,
559  last_author;
560 
561  if (authors.size()) {
562  GetOneInitialAuthorName(authors.front(), first_author);
563  }
564 
565  if (authors.size() > 1) {
566  GetOneInitialAuthorName(authors.back(), last_author);
567  }
568 
569  bool ret = false;
570  if (pubmed_authors.IsStd()) {
571  for (const auto& auth : pubmed_authors.GetStd()) {
572  if (auth->IsSetName()) {
573  string name;
574  GetNameFromStdName(auth->GetName(), name);
575  string cur_name;
576  GetOneInitialAuthorName(name, cur_name);
577  if (NStr::EqualNocase(cur_name, first_author) || NStr::EqualNocase(cur_name, last_author)) {
578  ret = true;
579  break;
580  }
581  }
582  }
583  } else {
584  const list<string>& names = pubmed_authors.IsMl() ? pubmed_authors.GetMl() : pubmed_authors.GetStr();
585  for (const string& name : names) {
586  string cur_name;
587  GetOneInitialAuthorName(name, cur_name);
588  if (cur_name == first_author || cur_name == last_author) {
589  ret = true;
590  break;
591  }
592  }
593  }
594 
595  return ret;
596 }
597 
598 static bool CheckRefs(const CMedline_entry& medline_entry, const CPubData::TSeqIds& seq_ids)
599 {
600  bool ret = true;
601 
602  if (medline_entry.IsSetXref()) {
603  ret = false;
604  for (const auto& xref : medline_entry.GetXref()) {
605  if (xref->IsSetCit()) {
606  if (seq_ids.find(xref->GetCit()) != seq_ids.end()) {
607  return true;
608  }
609  }
610  }
611  }
612 
613  return ret;
614 }
615 
616 static bool CheckDate(int year, int month, int max_date_check, const CCit_jour& juornal)
617 {
618  bool ret = true;
619  if (max_date_check && year && juornal.IsSetImp() && juornal.GetImp().IsSetDate()) {
620  const CDate& pub_date = juornal.GetImp().GetDate();
621  if (pub_date.IsStd() && pub_date.GetStd().IsSetYear()) {
622 
623  const CDate_std& std_pub_date = pub_date.GetStd();
624 
625  CDate_std date_before,
626  date_after;
627 
628  date_before.SetYear(year);
629  if (month > 1) {
630  date_before.SetMonth(month - 1);
631  } else {
632  date_before.SetYear(year - 1);
633  date_before.SetMonth(12);
634  }
635 
636  date_after.SetYear(year + max_date_check);
637  date_after.SetMonth(month);
638 
639  CDate::ECompare before = date_before.Compare(std_pub_date),
640  after = date_after.Compare(std_pub_date);
641 
642  ret = (before == CDate::eCompare_before || before == CDate::eCompare_same) &&
643  (after == CDate::eCompare_after || after == CDate::eCompare_same);
644  }
645  }
646 
647  return ret;
648 }
649 
650 
652 {
// Looks the publication up through edit::CEUtilsUpdater::DoPubSearch() using
// the title words and one token per author (the signature line is missing
// from this listing). Returns the id only when exactly one was found,
// otherwise ZERO_ENTREZ_ID.
653  vector<string> query;
654 
655  // title
656  for (const string& w : data.GetTitleWords()) {
657  query.push_back(w);
658  }
659 
660  // authors
661  for (const string& author : data.GetAuthors()) {
662  list<CTempString> names;
// NOTE(review): the statement that fills `names` is missing from this
// listing; only its first element is added to the query.
664  if (! names.empty()) {
665  query.push_back(names.front());
666  }
667  }
668 
669  vector<TEntrezId> uids;
670 
671  try {
672  edit::CEUtilsUpdater::DoPubSearch(query, uids);
673  } catch (CException& e) {
// A failed search is only logged; `uids` then stays as it is.
674  ERR_POST(Warning << "failed while Hydra search: " << e);
675  }
676 
677  TEntrezId pmid = ZERO_ENTREZ_ID;
678  if (uids.size() == 1) {
679  pmid = uids.front();
680  }
681 
682  return pmid;
683 }
684 
686 {
// Converts the PMC id `pmc` into a PubMed id by querying the PMC id-converter
// URL below (the signature line is missing from this listing). Up to five
// attempts are made. Returns ZERO_ENTREZ_ID if no pmid could be extracted.
687  static const string BASE_URL = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?tool=pub_report&versions=no&format=xml&ids=PMC";
688  static const size_t BUF_SIZE = 1024;
689 
690  string url = BASE_URL + NStr::IntToString(ENTREZ_ID_TO(int, pmc));
691 
692  TEntrezId pmid = ZERO_ENTREZ_ID;
693 
694  for (int attempt = 1; attempt <= 5; attempt++) {
695  try {
696  CConn_HttpStream https(url);
697 
698  string result;
699 
// Read the whole response in BUF_SIZE chunks; gcount() gives the number of
// bytes actually read, including a final short read.
700  vector<char> buf(BUF_SIZE);
701  while (! https.fail()) {
702  https.read(&buf[0], BUF_SIZE);
703  result.append(&buf[0], https.gcount());
704  }
705 
// Only a response starting with the "ok" root element ends the retry loop;
// any other response leads to another attempt.
706  if (NStr::StartsWith(result, "<pmcids status=\"ok\">")) {
// NOTE(review): the error marker searched for contains spaces around '=';
// confirm that the service really formats the attribute this way.
707  if (result.find("status = \"error\"") == string::npos && result.find("<errmsg>") == string::npos) {
708  static const char pmid_start[] = "pmid=\"";
709  size_t pmid_pos = result.find(pmid_start);
710  if (pmid_pos != string::npos) {
// sizeof includes the terminating NUL, hence the "- 1" to land right after
// the opening quote; trailing non-digits are tolerated by the flag.
711  pmid = ENTREZ_ID_FROM(int, NStr::StringToInt(result.c_str() + pmid_pos + sizeof(pmid_start) - 1, NStr::fAllowTrailingSymbols));
712  }
713  }
714 
715  break;
716  }
717 
718  } catch (CException& e) {
719  ERR_POST(Warning << "failed on attempt " << attempt
720  << ": " << e);
721  }
722  }
723 
724  return pmid;
725 }
726 
// Prepares a title for use as a search term, in place:
//   - the characters . " ( ) [ ] : are replaced by a space,
//   - upper-case letters are converted to lower case.
// All other characters are kept as they are.
static void NormalizeTitle(string& s)
{
    for (char& c : s) {
        switch (c) {
        case '.':
        case '"':
        case '(':
        case ')':
        case '[':
        case ']':
        case ':':
            c = ' ';
            break;
        default: {
            // isupper()/tolower() require a value representable as
            // unsigned char; passing a negative plain char (any non-ASCII
            // byte where char is signed) is undefined behaviour.
            const unsigned char uc = static_cast<unsigned char>(c);
            if (isupper(uc)) {
                c = static_cast<char>(tolower(uc));
            }
            break;
        }
        }
    }
}
748 
749 static TEntrezId DoEUtilsSearch(CEutilsClient& eutils, const string& database, const string& term)
750 {
751  vector<TEntrezId> uids;
752  eutils.Search(database, term, uids);
753 
754  TEntrezId pmid = ZERO_ENTREZ_ID;
755  if (uids.size() == 1) {
756  pmid = uids.front();
757  }
758 
759  return pmid;
760 }
761 
763 {
764  CEutilsClient& eutils = GetEUtils();
765 
766  // TODO: CEUtilsUpdater::CitMatch()
767  string title = data.GetTitle();
768  NormalizeTitle(title);
769  string term = title + "[title]";
770 
771  TEntrezId pmid = ZERO_ENTREZ_ID;
772 
773  if (! term.empty()) {
774  try {
775  pmid = DoEUtilsSearch(eutils, "pubmed", term);
776  if (pmid == ZERO_ENTREZ_ID) {
777  pmid = DoEUtilsSearch(eutils, "pmc", term);
778  if (pmid != ZERO_ENTREZ_ID) {
779  pmid = ConvertPMCtoPMID(pmid);
780  }
781  }
782  } catch (...) {
783  pmid = ZERO_ENTREZ_ID;
784  }
785  }
786 
787  if (pmid == ZERO_ENTREZ_ID && ! m_nohydra) {
788  pmid = DoHydraSearch(data);
789  }
790 
791  return pmid;
792 }
793 
795 {
// Fetches the PubMed record for `pmid`, converts it into `pubmed_entry` and
// decides whether it plausibly describes the publication in `data` (the
// signature line is missing from this listing). Returns true only when the
// record is a journal article that passes CheckDate(), CheckRefs() and
// FirstOrLastAuthorMatches().
796  CNcbiStrstream xml_stream;
797  eutils::CPubmedArticleSet pas;
798 
799  CEutilsClient& eutils = GetEUtils();
800  vector<TEntrezId> uids { pmid };
801 
802  try {
803  eutils.Fetch("PubMed", uids, xml_stream);
804  if (xml_stream) {
805  xml_stream >> MSerial_Xml >> pas;
806  }
807  } catch (CException& e) {
808  // swallow exceptions that may occur during Fetch(...) and '>>'
809  ERR_POST(Warning << "failed while fetching data from PubMed: " << e);
810  return false;
811  }
812 
// NOTE(review): `pas` is read here even if the stream was bad and nothing
// was parsed into it -- confirm that GetPP() is safe on an unfilled set.
813  const auto& pp = pas.GetPP().GetPP();
814  if (! pp.empty()) {
// Only the first element of the returned set is used.
815  const auto& ppf = *pp.front();
816  if (ppf.IsPubmedArticle()) {
817  const eutils::CPubmedArticle& article = ppf.GetPubmedArticle();
818  pubmed_entry.Reset(article.ToPubmed_entry());
819  } else if (ppf.IsPubmedBookArticle()) {
820  const eutils::CPubmedBookArticle& article = ppf.GetPubmedBookArticle();
821  pubmed_entry.Reset(article.ToPubmed_entry());
822  }
823  }
824 
825  if (pubmed_entry && pubmed_entry->IsSetMedent() && pubmed_entry->GetMedent().IsSetCit()) {
826  const CCit_art& cit_art = pubmed_entry->GetMedent().GetCit();
827  if (cit_art.IsSetFrom() && cit_art.GetFrom().IsJournal()) {
828  bool proceed = CheckDate(data.GetYear(), data.GetMonth(), m_max_date_check, cit_art.GetFrom().GetJournal()) && CheckRefs(pubmed_entry->GetMedent(), data.GetSeqIds());
829  if (proceed && cit_art.IsSetAuthors()) {
830  const CAuth_list& authors = cit_art.GetAuthors();
831  if (authors.IsSetNames() && FirstOrLastAuthorMatches(data.GetAuthors(), authors.GetNames())) {
832  return true;
833  }
834  }
835  }
836  }
837 
838  return false;
839 }
840 
// Looks for `author` in `auths` using CompareAuthorNames(). Returns eNoMatch
// when no entry matches at all; otherwise returns `res` lowered to the
// lowest grade seen among the matching entries.
// NOTE(review): the declaration of `res` is on a line that is missing from
// this listing, so its initial value cannot be seen here.
841 static AuthorNameMatch IsAuthorInList(const list<string>& auths, const string& author)
842 {
844 
845  bool found = false;
846  for (const string& cur_author : auths) {
847  AuthorNameMatch cur_cmp_res = CompareAuthorNames(cur_author, author);
848  if (cur_cmp_res == eNoMatch) {
849  continue;
850  }
851 
852  found = true;
853  if (cur_cmp_res < res) {
854  res = cur_cmp_res;
855  }
856  }
857 
858  return found ? res : eNoMatch;
859 }
860 
862 {
// Writes every id as "SEQID |<id>|" followed by a tab (the signature line is
// missing from this listing).
863  for (const string& id : ids) {
864  out << "SEQID |" << id << "|\t";
865  }
866 }
867 
// Report labels for author comparison results. The array is indexed with an
// AuthorNameMatch value, so the order of the entries must follow the order
// of the enumerators.
// NOTE(review): most of the enum definition is missing from this listing;
// confirm that it has exactly these six values before eLastValue.
868 static string authors_cmp_result_label[] = {
869  "AUTH_MISMATCH", "LAST_NAMES", "ONE_INIT", "TWO_INITS", "NO_HYPHENS", "FULL_NAMES"
870 };
871 
873 {
// Maps a comparison result to its report label (the signature line is
// missing from this listing). `res` must be below eLastValue; this is only
// checked through NCBI_ASSERT.
874  NCBI_ASSERT(res < eLastValue, "Invalid res value");
875  return authors_cmp_result_label[res];
876 }
877 
878 static bool ReportAuthorDiff(CNcbiOstream& out, const list<string>& pubmed_auths, const list<string>& auths)
879 {
880  AuthorNameMatch best_match = eFullMatch;
881 
882  size_t matches = 0;
883  for (const string& author : auths) {
884  AuthorNameMatch cur_match = IsAuthorInList(pubmed_auths, author);
885  if (cur_match == eNoMatch) {
886  continue;
887  }
888 
889  ++matches;
890  if (cur_match < best_match) {
891  best_match = cur_match;
892  }
893  }
894 
895  bool both_ok = true;
896 
897  size_t pubmed_size = pubmed_auths.size(),
898  cur_size = auths.size();
899  string result_str = GetAuthorsCmpResultStr(best_match);
900 
901  if (! auths.empty() && matches == cur_size) {
902  if (cur_size < 3 && pubmed_size > 4) {
903  out << "AUTHORS_QUESTIONABLE [" << result_str << "] " << cur_size << " -> " << pubmed_size << '\t';
904  both_ok = false;
905  } else if (cur_size < pubmed_size) {
906  out << "AUTHORS_ADDED [" << result_str << "] " << pubmed_size - cur_size << '\t';
907  } else {
908  out << "AUTHORS_REORDERED [" << result_str << "]\t";
909  }
910  } else {
911  out << "AUTHORS_CHANGED [" << result_str << "] " << matches << " / " << pubmed_size << '\t';
912  both_ok = false;
913  }
914 
915  return both_ok;
916 }
917 
918 static bool ReportTitleDiff(CNcbiOstream& out, const list<string>& pubmed_title_words, const list<string>& title_words)
919 {
920  size_t matches = 0;
921  for (const string& word : title_words) {
922  if (NStr::FindNoCase(pubmed_title_words, word)) {
923  ++matches;
924  }
925  }
926 
927  bool both_ok = true;
928 
929  size_t pubmed_size = pubmed_title_words.size(),
930  cur_size = title_words.size();
931 
932  if (cur_size < 3 && pubmed_size > 4) {
933  out << "TITLE_QUESTIONABLE " << cur_size << " -> " << pubmed_size << '\t';
934  both_ok = false;
935  } else if (pubmed_size && cur_size && NStr::EqualNocase(pubmed_title_words.front(), title_words.front()) &&
936  matches == pubmed_size) {
937  out << "TITLE_SAME [SIMILAR] " << matches << '\t';
938  } else if (pubmed_size && matches == pubmed_size) {
939  out << "TITLE_ALTERED " << matches << '\t';
940  both_ok = false;
941  } else {
942  out << "TITLE_DIFFERS " << matches << " / " << pubmed_size << '\t';
943  both_ok = false;
944  }
945 
946  return both_ok;
947 }
948 
949 static void ReportAuththors(CNcbiOstream& out, const char* prefix, const list<string>& auths)
950 {
951  out << prefix;
952  if (! auths.empty()) {
953 
954  out << auths.front();
955  auto auth = auths.begin();
956  for (++auth; auth != auths.end(); ++auth)
957  out << ", " << *auth;
958  }
959  out << '\t';
960 }
961 
962 static void ReportTitle(CNcbiOstream& out, const char* prefix, const CPubData& data)
963 {
964  out << prefix;
965 
966  if (! data.GetFullTitle().empty()) {
967  out << data.GetFullTitle();
968  } else {
969 
970  const list<string>& words = data.GetTitleWords();
971  if (! words.empty()) {
972  out << words.front();
973  auto word = words.begin();
974  for (++word; word != words.end(); ++word)
975  out << ' ' << *word;
976  }
977  }
978  out << '\t';
979 }
980 
981 static void ReportJournal(CNcbiOstream& out, const char* prefix, const CPubData& data)
982 {
983  out << prefix;
984  int year = data.GetYear();
985 
986  if (data.GetJournal().empty()) {
987 
988  out << "Unpublished";
989  if (year) {
990  out << " [" << year << ']';
991  }
992  } else {
993  out << data.GetJournal();
994 
995  if (year) {
996  out << " [" << year << ']';
997  }
998 
999  if (! data.GetVolume().empty()) {
1000  out << ' ' << data.GetVolume();
1001  }
1002 
1003  if (! data.GetPages().empty()) {
1004  out << " : " << data.GetPages();
1005  }
1006  }
1007 }
1008 
1009 static void ReportOnePub(CNcbiOstream& out, const CCit_art& pubmed_cit_art, const CPubData& data, TEntrezId pmid)
1010 {
1011  CPubData pubmed_data;
1012  CollectDataArt(pubmed_cit_art, pubmed_data);
1013 
1014  if (! pubmed_data.GetAuthors().empty() && ! pubmed_data.GetTitleWords().empty()) {
1015 
1016  AuthorNameMatch authors_cmp_res = CompareAuthors(pubmed_data.GetAuthors(), data.GetAuthors());
1017  bool title_same = NStr::EqualNocase(pubmed_data.GetFullTitle(), data.GetFullTitle());
1018 
1019  out << "PMID " << ENTREZ_ID_TO(int, pmid) << '\t';
1020  ReportSeqIds(out, data.GetSeqIds());
1021 
1022  if (data.GetUnique().empty()) {
1023  out << "?\t";
1024  } else {
1025  out << "UNIQ_CIT " << data.GetUnique() << '\t';
1026  }
1027 
1028  bool both_ok = true;
1029  if (authors_cmp_res == eNoMatch) {
1030  both_ok = ReportAuthorDiff(out, pubmed_data.GetAuthors(), data.GetAuthors());
1031  } else {
1032  out << "AUTHORS_SAME [" << GetAuthorsCmpResultStr(authors_cmp_res) << "]\t";
1033  }
1034 
1035  if (title_same) {
1036  out << "TITLE_SAME [IDENTICAL]\t";
1037  } else {
1038  if (! ReportTitleDiff(out, pubmed_data.GetTitleWords(), data.GetTitleWords()))
1039  both_ok = false;
1040  }
1041 
1042  out << (both_ok ? "PROBABLE\t" : "POSSIBLE\t");
1043 
1044  ReportAuththors(out, "OLD_AUTH ", data.GetAuthors());
1045  ReportAuththors(out, "NEW_AUTH ", pubmed_data.GetAuthors());
1046 
1047  ReportTitle(out, "OLD_TITL ", data);
1048  ReportTitle(out, "NEW_TITL ", pubmed_data);
1049 
1050  ReportJournal(out, "OLD_JOUR ", data);
1051  out << '\t';
1052  ReportJournal(out, "NEW_JOUR ", pubmed_data);
1053 
1054  out << " <" << ENTREZ_ID_TO(int, pmid) << '>';
1055  out << '\n';
1056  }
1057 }
1058 
1060 {
// Body of a member function whose signature line is missing from this
// listing. For every collected publication it looks up a PubMed id, fetches
// the record and, if FetchPub() accepts it, writes one report line to m_out.
1061  m_out << "Trying " << m_pubs.size() << " Entrez Queries\n\n";
1062  for (const auto& pub : m_pubs) {
1063  TEntrezId pmid = RetrievePMid(*pub);
1064  if (pmid != ZERO_ENTREZ_ID) {
1065  CRef<CPubmed_entry> pubmed_entry;
1066  if (FetchPub(pmid, *pub, pubmed_entry)) {
// FetchPub() only returns true after checking both conditions itself.
1067  NCBI_ASSERT(pubmed_entry->IsSetMedent() && pubmed_entry->GetMedent().IsSetCit(),
1068  "MedEntry and MedEntry.Cit should be present at this point");
1069  ReportOnePub(m_out, pubmed_entry->GetMedent().GetCit(), *pub, pmid);
1070  }
1071  }
1072  }
1073 }
1074 
1076 {
// When a non-empty id arrives, it is attached to every publication that was
// queued in m_pubs_need_id, and the queue is emptied (the signature line is
// missing from this listing).
1077  if (! name.empty() && ! m_pubs_need_id.empty()) {
1078  for (const auto& pub_need_id : m_pubs_need_id) {
1079  pub_need_id->AddSeqId(name);
1080  }
1081  m_pubs_need_id.clear();
1082  }
1083 
// NOTE(review): one statement is missing from this listing here.
1085 }
1086 
1088 {
// Body of a member function whose signature line is missing from this
// listing. If a current date is set, it is copied into every queued
// publication that still has none; then the queue and the current date are
// cleared in any case.
1089  if (IsSetDate()) {
1090  for (const auto& pub : m_pubs_need_date) {
1091  if (! pub->IsSetDate()) {
1092  pub->SetDate(GetDate());
1093  }
1094  }
1095  }
1096 
1097  m_pubs_need_date.clear();
1098  ResetDate(); // clears the current date
1099 }
1100 
1102 {
// True when m_date currently holds a date object (the signature line is
// missing from this listing).
1103  return m_date.NotEmpty();
1104 }
1105 
1107 {
// Stores a copy of `date` in m_date, but only when it has both a year and a
// month; otherwise nothing changes (the signature line is missing from this
// listing).
1108  if (date.IsSetYear() && date.IsSetMonth()) {
1109  m_date.Reset(new CDate_std);
1110  m_date->Assign(date);
1111  }
1112 }
1113 
1115 {
// Returns the stored date, or a shared default-constructed CDate_std when
// none is set (the signature line is missing from this listing).
1116  static CDate_std invalid_date;
1117  return IsSetDate() ? *m_date : invalid_date;
1118 }
1119 
1121 {
// Drops the stored date, so that m_date.NotEmpty() is false afterwards (the
// signature line is missing from this listing).
1122  m_date.Reset();
1123 }
1124 
1125 }
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
void remove_if(Container &c, Predicate *__pred)
Definition: chainer.hpp:69
@Auth_list.hpp User-defined methods of the data storage class.
Definition: Auth_list.hpp:57
This stream exchanges data with an HTTP server located at the URL: http[s]://host[:port]/path[?...
CDate::ECompare Compare(const CDate_std &date) const
Indicate how *this relates to another date.
Definition: Date_std.cpp:91
Definition: Date.hpp:53
ECompare
How *this relates to another date.
Definition: Date.hpp:73
@ eCompare_before
*this comes first.
Definition: Date.hpp:74
@ eCompare_same
They're equivalent.
Definition: Date.hpp:75
@ eCompare_after
*this comes second.
Definition: Date.hpp:76
Class for querying via E-Utils.
void Fetch(const string &db, const vector< objects::CSeq_id_Handle > &uids, CNcbiOstream &ostr, const string &retmode="xml")
Uint8 Search(const string &db, const string &term, vector< objects::CSeq_id_Handle > &uids, const string &xml_path=kEmptyStr)
CImprint –.
Definition: Imprint.hpp:66
@Name_std.hpp User-defined methods of the data storage class.
Definition: Name_std.hpp:56
Definition: Pub.hpp:56
bool GetLabel(string *label, ELabelType type=eContent, TLabelFlags flags=0, ELabelVersion version=eLabel_DefaultVersion) const
Concatenate a label for this pub to label.
Definition: Pub.cpp:76
@ fLabel_Unique
Append a unique tag [V1].
const std::string & GetCurrentSeqId() const
Definition: base_report.hpp:46
virtual void SetCurrentSeqId(const std::string &name)
Definition: base_report.hpp:51
const string & GetPages() const
void SetDate(const CDate_std &date)
void SetAuthors(const CAuth_list &auth_list)
const string & GetJournal() const
const TSeqIds & GetSeqIds() const
CRef< CDate_std > m_date
const string & GetFullTitle() const
const string & GetTitle() const
void SetTitle(const string &title)
void SetUnique(const string &unique)
const string & GetVolume() const
const list< string > & GetAuthors() const
list< string > m_authors
void CreateFullTitle() const
void SetPages(const string &pages)
void AddSeqId(const string &seq_id)
void SetJournal(const string &journal)
void SetVolume(const string &volume)
bool IsSetDate() const
const list< string > & GetTitleWords() const
const string & GetUnique() const
list< string > m_titlewords
set< string > TSeqIds
void CreateTitleWords() const
ncbi::CNcbiOstream & m_out
const CDate_std & GetDate() const
TEntrezId RetrievePMid(const CPubData &data) const
void SetCurrentSeqId(const std::string &name) override
CUnpublishedReport(ncbi::CNcbiOstream &out, int max_date_check, bool nohydra)
void ReportUnpublished(const CPub &pub)
bool FetchPub(TEntrezId pmid, const CPubData &data, CRef< CPubmed_entry > &pubmed_entry) const
shared_ptr< CEutilsClient > m_eutils
CEutilsClient & GetEUtils() const
void SetDate(const CDate_std &date)
const_iterator find(const key_type &key) const
Definition: set.hpp:137
const_iterator end() const
Definition: set.hpp:136
The NCBI C++ standard methods for dealing with std::string.
std::ofstream out("events_result.xml")
main entry point for tests
static const struct name_t names[]
#define false
Definition: bool.h:36
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
char data[12]
Definition: iconv.c:80
SStrictId_Entrez::TId TEntrezId
TEntrezId type for entrez ids which require the same strictness as TGi.
Definition: ncbimisc.hpp:1041
#define ENTREZ_ID_TO(T, entrez_id)
Definition: ncbimisc.hpp:1097
#define ENTREZ_ID_FROM(T, value)
Definition: ncbimisc.hpp:1098
#define ZERO_ENTREZ_ID
Definition: ncbimisc.hpp:1102
string
Definition: cgiapp.hpp:687
#define NCBI_ASSERT(expr, mess)
Definition: ncbidbg.hpp:130
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1191
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
#define MSerial_Xml
Definition: serialbase.hpp:698
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3461
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2993
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
static string Sanitize(CTempString str, TSS_Flags flags=fSS_print)
Sanitize a string, allowing only specified classes of characters.
Definition: ncbistr.hpp:2876
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5353
static string & ReplaceInPlace(string &src, const string &search, const string &replace, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3405
@ fAllowTrailingSymbols
Ignore trailing non-numeric characters.
Definition: ncbistr.hpp:298
@ fSplit_Truncate
Definition: ncbistr.hpp:2501
@ fSplit_Tokenize
All delimiters are merged and trimmed, to get non-empty tokens only.
Definition: ncbistr.hpp:2508
@ fSplit_MergeDelimiters
Merge adjacent delimiters.
Definition: ncbistr.hpp:2498
static const char label[]
bool IsSetVolume(void) const
Check if a value has been assigned to Volume data member.
Definition: Imprint_.hpp:746
bool IsSetDate(void) const
Check if a value has been assigned to Date data member.
Definition: Cit_gen_.hpp:853
bool IsSetAuthors(void) const
Check if a value has been assigned to Authors data member.
Definition: Cit_gen_.hpp:623
bool IsSetAuthors(void) const
Authors (ANSI requires). Check if a value has been assigned to Authors data member.
Definition: Cit_art_.hpp:534
const TJournal & GetJournal(void) const
Get the variant data.
Definition: Cit_art_.cpp:111
bool IsSetTitle(void) const
Title of journal. Check if a value has been assigned to Title data member.
Definition: Cit_jour_.hpp:201
const TVolume & GetVolume(void) const
Get the Volume member data.
Definition: Imprint_.hpp:758
const TPages & GetPages(void) const
Get the Pages member data.
Definition: Imprint_.hpp:852
const TFrom & GetFrom(void) const
Get the From member data.
Definition: Cit_art_.hpp:567
const TAuthors & GetAuthors(void) const
Get the Authors member data.
Definition: Cit_gen_.hpp:635
const TDate & GetDate(void) const
Get the Date member data.
Definition: Cit_gen_.hpp:865
bool IsSetTitle(void) const
Title of paper (ANSI requires). Check if a value has been assigned to Title data member.
Definition: Cit_art_.hpp:513
const TTitle & GetTitle(void) const
Get the Title member data.
Definition: Cit_art_.hpp:525
bool IsSetFrom(void) const
Check if a value has been assigned to From data member.
Definition: Cit_art_.hpp:555
bool IsSetImp(void) const
Check if a value has been assigned to Imp data member.
Definition: Cit_jour_.hpp:231
const TTitle & GetTitle(void) const
Get the Title member data.
Definition: Cit_gen_.hpp:933
bool IsSetNames(void) const
Check if a value has been assigned to Names data member.
Definition: Auth_list_.hpp:464
const TJournal & GetJournal(void) const
Get the Journal member data.
Definition: Cit_gen_.hpp:703
bool IsSetTitle(void) const
eg.
Definition: Cit_gen_.hpp:921
bool IsSetJournal(void) const
Check if a value has been assigned to Journal data member.
Definition: Cit_gen_.hpp:691
bool IsSetDate(void) const
Date of publication. Check if a value has been assigned to Date data member.
Definition: Imprint_.hpp:716
const TStr & GetStr(void) const
Get the variant data.
Definition: Auth_list_.hpp:450
bool IsSet(void) const
Check if a value has been assigned to data member.
Definition: Title_.hpp:769
const TImp & GetImp(void) const
Get the Imp member data.
Definition: Cit_jour_.hpp:243
bool IsJournal(void) const
Check if variant Journal is selected.
Definition: Cit_art_.hpp:495
const TNames & GetNames(void) const
Get the Names member data.
Definition: Auth_list_.hpp:478
bool IsMl(void) const
Check if variant Ml is selected.
Definition: Auth_list_.hpp:424
const TStd & GetStd(void) const
Get the variant data.
Definition: Auth_list_.hpp:410
const TDate & GetDate(void) const
Get the Date member data.
Definition: Imprint_.hpp:728
const TAuthors & GetAuthors(void) const
Get the Authors member data.
Definition: Cit_art_.hpp:546
const TTitle & GetTitle(void) const
Get the Title member data.
Definition: Cit_jour_.hpp:213
const TMl & GetMl(void) const
Get the variant data.
Definition: Auth_list_.hpp:430
bool IsSetPages(void) const
Check if a value has been assigned to Pages data member.
Definition: Imprint_.hpp:840
const Tdata & Get(void) const
Get the member data.
Definition: Title_.hpp:781
bool IsStd(void) const
Check if variant Std is selected.
Definition: Auth_list_.hpp:404
const TStr & GetStr(void) const
Get the variant data.
Definition: Person_id_.hpp:391
bool IsMl(void) const
Check if variant Ml is selected.
Definition: Person_id_.hpp:365
void SetYear(TYear value)
Assign a value to Year data member.
Definition: Date_std_.hpp:435
bool IsSetYear(void) const
Full year (including 1900). Check if a value has been assigned to Year data member.
Definition: Date_std_.hpp:407
bool IsStd(void) const
Check if variant Std is selected.
Definition: Date_.hpp:320
const TInitials & GetInitials(void) const
Get the Initials member data.
Definition: Name_std_.hpp:610
void SetMonth(TMonth value)
Assign a value to Month data member.
Definition: Date_std_.hpp:482
bool IsName(void) const
Check if variant Name is selected.
Definition: Person_id_.hpp:359
bool IsSetInitials(void) const
First + middle initials. Check if a value has been assigned to Initials data member.
Definition: Name_std_.hpp:598
const TMl & GetMl(void) const
Get the variant data.
Definition: Person_id_.hpp:371
bool IsSetLast(void) const
Check if a value has been assigned to Last data member.
Definition: Name_std_.hpp:410
bool IsStr(void) const
Check if variant Str is selected.
Definition: Person_id_.hpp:385
bool IsSetMonth(void) const
Month (1-12). Check if a value has been assigned to Month data member.
Definition: Date_std_.hpp:454
const TLast & GetLast(void) const
Get the Last member data.
Definition: Name_std_.hpp:422
const TName & GetName(void) const
Get the variant data.
Definition: Person_id_.cpp:137
const TStd & GetStd(void) const
Get the variant data.
Definition: Date_.cpp:109
bool IsSetXref(void) const
Check if a value has been assigned to Xref data member.
const TXref & GetXref(void) const
Get the Xref member data.
const TArticle & GetArticle(void) const
Get the variant data.
Definition: Pub_.cpp:233
const TGen & GetGen(void) const
Get the variant data.
Definition: Pub_.cpp:167
bool IsArticle(void) const
Check if variant Article is selected.
Definition: Pub_.hpp:629
bool IsGen(void) const
Check if variant Gen is selected.
Definition: Pub_.hpp:584
use only Cassandra database for the lookups; yes: do not use tables BIOSEQ_INFO and BLOB_PROP in the Cassandra database
char * buf
static void ReportJournal(CNcbiOstream &out, const char *prefix, const CPubData &data)
static void ProcessInitials(string &initials)
static void CollectData(const CPub &pub, CPubData &data)
static void ReportTitle(CNcbiOstream &out, const char *prefix, const CPubData &data)
static TEntrezId ConvertPMCtoPMID(TEntrezId pmc)
static void GetOneInitialAuthorName(const string &author, string &name)
static void CollectDataArt(const CCit_art &cit, CPubData &data)
static string authors_cmp_result_label[]
static void NormalizeTitle(string &s)
static void GetAuthorsFromList(list< string > &authors, const CAuth_list &auth_list)
static TEntrezId DoHydraSearch(const CPubData &data)
static void CollectDataGen(const CCit_gen &cit, CPubData &data)
static bool FirstOrLastAuthorMatches(const list< string > &authors, const CAuth_list::C_Names &pubmed_authors)
static AuthorNameMatch CompareAuthorNames(string first, string second)
static void ReportSeqIds(CNcbiOstream &out, const CPubData::TSeqIds &ids)
static bool ReportTitleDiff(CNcbiOstream &out, const list< string > &pubmed_title_words, const list< string > &title_words)
static void ReportAuththors(CNcbiOstream &out, const char *prefix, const list< string > &auths)
string GetBestTitle(const CTitle &titles)
Definition: utils.cpp:44
static AuthorNameMatch CompareAuthors(const list< string > &first, const list< string > &second)
static TEntrezId DoEUtilsSearch(CEutilsClient &eutils, const string &database, const string &term)
static string GetAuthorsCmpResultStr(AuthorNameMatch res)
static void GetNameFromStdName(const CPerson_id &person, string &name)
static void ReportOnePub(CNcbiOstream &out, const CCit_art &pubmed_cit_art, const CPubData &data, TEntrezId pmid)
static AuthorNameMatch IsAuthorInList(const list< string > &auths, const string &author)
static bool ReportAuthorDiff(CNcbiOstream &out, const list< string > &pubmed_auths, const list< string > &auths)
static bool CheckRefs(const CMedline_entry &medline_entry, const CPubData::TSeqIds &seq_ids)
static bool CheckDate(int year, int month, int max_date_check, const CCit_jour &juornal)
int tolower(Uchar c)
Definition: ncbictype.hpp:72
int isupper(Uchar c)
Definition: ncbictype.hpp:70
static int match(register const pcre_uchar *eptr, register const pcre_uchar *ecode, const pcre_uchar *mstart, int offset_top, match_data *md, eptrblock *eptrb, unsigned int rdepth)
Definition: pcre_exec.c:513
static const char * prefix[]
Definition: pcregrep.c:405
CRef< CPub > journal(ParserPtr pp, char *bptr, char *eptr, CRef< CAuth_list > &auth_list, CRef< CTitle::C_E > &title, bool has_muid, CRef< CCit_art > &cit_art, Int4 er)
Definition: ref.cpp:1457
static string query
else result
Definition: token2.c:20
Modified on Thu Apr 25 08:18:07 2024 by modify_doxy.py rev. 669887