81 for (
char c : initials) {
82 if (c !=
' ' && c !=
'.' && c !=
',') {
96 if (auth->IsSetName()) {
97 string cur_auth, cur_initials;
108 }
else if (person.
IsStr()) {
109 cur_auth = person.
GetStr();
110 }
else if (person.
IsMl()) {
111 cur_auth = person.
GetMl();
114 if (! cur_auth.empty()) {
115 if (! cur_initials.empty()) {
116 cur_auth +=
' ' + cur_initials;
118 authors.push_back(cur_auth);
123 for (
const string& auth :
names.IsStr() ?
names.GetStr() :
names.GetMl()) {
124 if (! auth.empty()) {
125 authors.push_back(auth);
141 m_title_words_set(
false),
142 m_full_title_set(
false)
158 if (! m_title_words_set) {
160 m_title_words_set =
true;
167 if (! m_full_title_set) {
169 m_full_title_set =
true;
208 m_title_words_set =
false;
209 m_full_title_set =
false;
211 m_full_title.clear();
212 m_titlewords.clear();
229 m_date->Assign(date);
235 return m_date.NotEmpty();
240 if (IsSetDate() && m_date->IsSetYear()) {
241 return m_date->GetYear();
248 if (IsSetDate() && m_date->IsSetMonth()) {
249 return m_date->GetMonth();
256 m_seq_ids.insert(seq_id);
305 if (! m_full_title.empty() && m_full_title.front() ==
'[' && m_full_title.back() ==
']') {
306 m_full_title = m_full_title.substr(1, m_full_title.size() - 2);
310 if (m_full_title.back() ==
'.')
311 m_full_title.pop_back();
320 m_max_date_check(max_date_check),
361 for (
const auto& cur_title : cit.
GetTitle().
Get()) {
362 if (cur_title->IsName()) {
363 data.SetTitle(cur_title->GetName());
411 auto pred = [](
char c) {
return c ==
'-'; };
413 second.erase(
remove_if(second.begin(), second.end(), pred), second.end());
419 size_t space_pos_first =
first.find(
' ');
420 if (space_pos_first != string::npos && space_pos_first + 2 <
first.size()) {
421 first.resize(space_pos_first + 3);
424 size_t space_pos_second = second.find(
' ');
425 if (space_pos_second != string::npos && space_pos_second + 2 < second.size()) {
426 second.resize(space_pos_second + 3);
433 if (space_pos_first != string::npos && space_pos_first + 1 <
first.size()) {
434 first.resize(space_pos_first + 2);
436 if (space_pos_second != string::npos && space_pos_second + 1 < second.size()) {
437 second.resize(space_pos_second + 2);
444 if (space_pos_first != string::npos) {
445 first.resize(space_pos_first);
447 if (space_pos_second != string::npos) {
448 second.resize(space_pos_second);
460 if (
first.size() != second.size()) {
464 auto first_it =
first.begin(),
465 second_it = second.begin();
468 for (; ret !=
eNoMatch && first_it !=
first.end(); ++first_it, ++second_it) {
485 bool need_to_add =
true;
486 for (
const auto& cur_pub :
m_pubs) {
492 if (! cur_seq_id.empty()) {
493 cur_pub->AddSeqId(cur_seq_id);
506 if (cur_seq_id.empty()) {
509 data->AddSeqId(cur_seq_id);
512 if (!
data->IsSetDate()) {
528 size_t space = author.rfind(
' ');
529 if (space == string::npos) {
532 name = author.substr(0, space + 1);
533 if (space + 1 < author.size()) {
534 name += author[space + 1];
549 }
else if (person.
IsMl()) {
550 name = person.
GetMl();
551 }
else if (person.
IsStr()) {
561 if (authors.size()) {
565 if (authors.size() > 1) {
570 if (pubmed_authors.
IsStd()) {
571 for (
const auto& auth : pubmed_authors.
GetStd()) {
572 if (auth->IsSetName()) {
584 const list<string>&
names = pubmed_authors.
IsMl() ? pubmed_authors.
GetMl() : pubmed_authors.
GetStr();
585 for (
const string& name :
names) {
588 if (cur_name == first_author || cur_name == last_author) {
604 for (
const auto& xref : medline_entry.
GetXref()) {
605 if (xref->IsSetCit()) {
606 if (seq_ids.
find(xref->GetCit()) != seq_ids.
end()) {
636 date_after.
SetYear(year + max_date_check);
640 after = date_after.
Compare(std_pub_date);
653 vector<string>
query;
656 for (
const string& w :
data.GetTitleWords()) {
661 for (
const string& author :
data.GetAuthors()) {
662 list<CTempString>
names;
664 if (!
names.empty()) {
669 vector<TEntrezId> uids;
672 edit::CEUtilsUpdater::DoPubSearch(
query, uids);
678 if (uids.size() == 1) {
687 static const string BASE_URL =
"https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?tool=pub_report&versions=no&format=xml&ids=PMC";
688 static const size_t BUF_SIZE = 1024;
694 for (
int attempt = 1; attempt <= 5; attempt++) {
700 vector<char>
buf(BUF_SIZE);
701 while (! https.fail()) {
702 https.read(&
buf[0], BUF_SIZE);
707 if (
result.find(
"status = \"error\"") == string::npos &&
result.find(
"<errmsg>") == string::npos) {
708 static const char pmid_start[] =
"pmid=\"";
709 size_t pmid_pos =
result.find(pmid_start);
710 if (pmid_pos != string::npos) {
751 vector<TEntrezId> uids;
755 if (uids.size() == 1) {
767 string title =
data.GetTitle();
769 string term = title +
"[title]";
773 if (! term.empty()) {
797 eutils::CPubmedArticleSet pas;
800 vector<TEntrezId> uids { pmid };
803 eutils.
Fetch(
"PubMed", uids, xml_stream);
813 const auto& pp = pas.GetPP().GetPP();
815 const auto& ppf = *pp.front();
816 if (ppf.IsPubmedArticle()) {
817 const eutils::CPubmedArticle& article = ppf.GetPubmedArticle();
818 pubmed_entry.
Reset(article.ToPubmed_entry());
819 }
else if (ppf.IsPubmedBookArticle()) {
820 const eutils::CPubmedBookArticle& article = ppf.GetPubmedBookArticle();
821 pubmed_entry.
Reset(article.ToPubmed_entry());
825 if (pubmed_entry && pubmed_entry->IsSetMedent() && pubmed_entry->GetMedent().IsSetCit()) {
826 const CCit_art& cit_art = pubmed_entry->GetMedent().GetCit();
846 for (
const string& cur_author : auths) {
853 if (cur_cmp_res < res) {
863 for (
const string&
id : ids) {
864 out <<
"SEQID |" <<
id <<
"|\t";
869 "AUTH_MISMATCH",
"LAST_NAMES",
"ONE_INIT",
"TWO_INITS",
"NO_HYPHENS",
"FULL_NAMES"
883 for (
const string& author : auths) {
890 if (cur_match < best_match) {
891 best_match = cur_match;
897 size_t pubmed_size = pubmed_auths.size(),
898 cur_size = auths.size();
901 if (! auths.empty() && matches == cur_size) {
902 if (cur_size < 3 && pubmed_size > 4) {
903 out <<
"AUTHORS_QUESTIONABLE [" << result_str <<
"] " << cur_size <<
" -> " << pubmed_size <<
'\t';
905 }
else if (cur_size < pubmed_size) {
906 out <<
"AUTHORS_ADDED [" << result_str <<
"] " << pubmed_size - cur_size <<
'\t';
908 out <<
"AUTHORS_REORDERED [" << result_str <<
"]\t";
911 out <<
"AUTHORS_CHANGED [" << result_str <<
"] " << matches <<
" / " << pubmed_size <<
'\t';
921 for (
const string& word : title_words) {
929 size_t pubmed_size = pubmed_title_words.size(),
930 cur_size = title_words.size();
932 if (cur_size < 3 && pubmed_size > 4) {
933 out <<
"TITLE_QUESTIONABLE " << cur_size <<
" -> " << pubmed_size <<
'\t';
935 }
else if (pubmed_size && cur_size &&
NStr::EqualNocase(pubmed_title_words.front(), title_words.front()) &&
936 matches == pubmed_size) {
937 out <<
"TITLE_SAME [SIMILAR] " << matches <<
'\t';
938 }
else if (pubmed_size && matches == pubmed_size) {
939 out <<
"TITLE_ALTERED " << matches <<
'\t';
942 out <<
"TITLE_DIFFERS " << matches <<
" / " << pubmed_size <<
'\t';
952 if (! auths.empty()) {
954 out << auths.front();
955 auto auth = auths.begin();
956 for (++auth; auth != auths.end(); ++auth)
957 out <<
", " << *auth;
966 if (!
data.GetFullTitle().empty()) {
970 const list<string>& words =
data.GetTitleWords();
971 if (! words.empty()) {
972 out << words.front();
973 auto word = words.begin();
974 for (++word; word != words.end(); ++word)
984 int year =
data.GetYear();
986 if (
data.GetJournal().empty()) {
988 out <<
"Unpublished";
990 out <<
" [" << year <<
']';
996 out <<
" [" << year <<
']';
999 if (!
data.GetVolume().empty()) {
1000 out <<
' ' <<
data.GetVolume();
1003 if (!
data.GetPages().empty()) {
1004 out <<
" : " <<
data.GetPages();
1022 if (
data.GetUnique().empty()) {
1025 out <<
"UNIQ_CIT " <<
data.GetUnique() <<
'\t';
1028 bool both_ok =
true;
1036 out <<
"TITLE_SAME [IDENTICAL]\t";
1042 out << (both_ok ?
"PROBABLE\t" :
"POSSIBLE\t");
1061 m_out <<
"Trying " <<
m_pubs.size() <<
" Entrez Queries\n\n";
1062 for (
const auto& pub :
m_pubs) {
1066 if (
FetchPub(pmid, *pub, pubmed_entry)) {
1067 NCBI_ASSERT(pubmed_entry->IsSetMedent() && pubmed_entry->GetMedent().IsSetCit(),
1068 "MedEntry and MedEntry.Cit should be present at this point");
1079 pub_need_id->AddSeqId(name);
1091 if (! pub->IsSetDate()) {
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
void remove_if(Container &c, Predicate *__pred)
@Auth_list.hpp User-defined methods of the data storage class.
This stream exchanges data with an HTTP server located at the URL: http[s]://host[:port]/path[?...
CDate::ECompare Compare(const CDate_std &date) const
Indicate how *this relates to another date.
ECompare
How *this relates to another date.
@ eCompare_before
*this comes first.
@ eCompare_same
They're equivalent.
@ eCompare_after
*this comes second.
Class for querying via E-Utils.
void Fetch(const string &db, const vector< objects::CSeq_id_Handle > &uids, CNcbiOstream &ostr, const string &retmode="xml")
Uint8 Search(const string &db, const string &term, vector< objects::CSeq_id_Handle > &uids, const string &xml_path=kEmptyStr)
@Name_std.hpp User-defined methods of the data storage class.
bool GetLabel(string *label, ELabelType type=eContent, TLabelFlags flags=0, ELabelVersion version=eLabel_DefaultVersion) const
Concatenate a label for this pub to label.
@ fLabel_Unique
Append a unique tag [V1].
const std::string & GetCurrentSeqId() const
virtual void SetCurrentSeqId(const std::string &name)
const string & GetPages() const
void SetDate(const CDate_std &date)
void SetAuthors(const CAuth_list &auth_list)
const string & GetJournal() const
const TSeqIds & GetSeqIds() const
const string & GetFullTitle() const
const string & GetTitle() const
void SetTitle(const string &title)
void SetUnique(const string &unique)
const string & GetVolume() const
const list< string > & GetAuthors() const
void CreateFullTitle() const
void SetPages(const string &pages)
void AddSeqId(const string &seq_id)
void SetJournal(const string &journal)
void SetVolume(const string &volume)
const list< string > & GetTitleWords() const
const string & GetUnique() const
list< string > m_titlewords
void CreateTitleWords() const
ncbi::CNcbiOstream & m_out
const CDate_std & GetDate() const
TEntrezId RetrievePMid(const CPubData &data) const
void SetCurrentSeqId(const std::string &name) override
void CompleteReport() override
void ClearData() override
CUnpublishedReport(ncbi::CNcbiOstream &out, int max_date_check, bool nohydra)
void ReportUnpublished(const CPub &pub)
bool FetchPub(TEntrezId pmid, const CPubData &data, CRef< CPubmed_entry > &pubmed_entry) const
shared_ptr< CEutilsClient > m_eutils
CEutilsClient & GetEUtils() const
void SetDate(const CDate_std &date)
const_iterator find(const key_type &key) const
const_iterator end() const
The NCBI C++ standard methods for dealing with std::string.
std::ofstream out("events_result.xml")
main entry point for tests
static const struct name_t names[]
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
SStrictId_Entrez::TId TEntrezId
TEntrezId type for entrez ids which require the same strictness as TGi.
#define ENTREZ_ID_TO(T, entrez_id)
#define ENTREZ_ID_FROM(T, value)
#define NCBI_ASSERT(expr, mess)
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
void Warning(CExceptionArgs_Base &args)
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
void Reset(void)
Reset reference object.
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
static string Sanitize(CTempString str, TSS_Flags flags=fSS_print)
Sanitize a string, allowing only specified classes of characters.
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
static string & ReplaceInPlace(string &src, const string &search, const string &replace, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
@ fAllowTrailingSymbols
Ignore trailing non-numerics characters.
@ fSplit_Tokenize
All delimiters are merged and trimmed, to get non-empty tokens only.
@ fSplit_MergeDelimiters
Merge adjacent delimiters.
static const char label[]
bool IsSetVolume(void) const
Check if a value has been assigned to Volume data member.
bool IsSetDate(void) const
Check if a value has been assigned to Date data member.
bool IsSetAuthors(void) const
Check if a value has been assigned to Authors data member.
bool IsSetAuthors(void) const
authors (ANSI requires) Check if a value has been assigned to Authors data member.
const TJournal & GetJournal(void) const
Get the variant data.
bool IsSetTitle(void) const
title of journal Check if a value has been assigned to Title data member.
const TVolume & GetVolume(void) const
Get the Volume member data.
const TPages & GetPages(void) const
Get the Pages member data.
const TFrom & GetFrom(void) const
Get the From member data.
const TAuthors & GetAuthors(void) const
Get the Authors member data.
const TDate & GetDate(void) const
Get the Date member data.
bool IsSetTitle(void) const
title of paper (ANSI requires) Check if a value has been assigned to Title data member.
const TTitle & GetTitle(void) const
Get the Title member data.
bool IsSetFrom(void) const
Check if a value has been assigned to From data member.
bool IsSetImp(void) const
Check if a value has been assigned to Imp data member.
const TTitle & GetTitle(void) const
Get the Title member data.
bool IsSetNames(void) const
Check if a value has been assigned to Names data member.
const TJournal & GetJournal(void) const
Get the Journal member data.
bool IsSetTitle(void) const
eg.
bool IsSetJournal(void) const
Check if a value has been assigned to Journal data member.
bool IsSetDate(void) const
date of publication Check if a value has been assigned to Date data member.
const TStr & GetStr(void) const
Get the variant data.
bool IsSet(void) const
Check if a value has been assigned to data member.
const TImp & GetImp(void) const
Get the Imp member data.
bool IsJournal(void) const
Check if variant Journal is selected.
const TNames & GetNames(void) const
Get the Names member data.
bool IsMl(void) const
Check if variant Ml is selected.
const TStd & GetStd(void) const
Get the variant data.
const TDate & GetDate(void) const
Get the Date member data.
const TAuthors & GetAuthors(void) const
Get the Authors member data.
const TTitle & GetTitle(void) const
Get the Title member data.
const TMl & GetMl(void) const
Get the variant data.
bool IsSetPages(void) const
Check if a value has been assigned to Pages data member.
const Tdata & Get(void) const
Get the member data.
bool IsStd(void) const
Check if variant Std is selected.
const TStr & GetStr(void) const
Get the variant data.
bool IsMl(void) const
Check if variant Ml is selected.
void SetYear(TYear value)
Assign a value to Year data member.
bool IsSetYear(void) const
full year (including 1900) Check if a value has been assigned to Year data member.
bool IsStd(void) const
Check if variant Std is selected.
const TInitials & GetInitials(void) const
Get the Initials member data.
void SetMonth(TMonth value)
Assign a value to Month data member.
bool IsName(void) const
Check if variant Name is selected.
bool IsSetInitials(void) const
first + middle initials Check if a value has been assigned to Initials data member.
const TMl & GetMl(void) const
Get the variant data.
bool IsSetLast(void) const
Check if a value has been assigned to Last data member.
bool IsStr(void) const
Check if variant Str is selected.
bool IsSetMonth(void) const
month (1-12) Check if a value has been assigned to Month data member.
const TLast & GetLast(void) const
Get the Last member data.
const TName & GetName(void) const
Get the variant data.
const TStd & GetStd(void) const
Get the variant data.
bool IsSetXref(void) const
Check if a value has been assigned to Xref data member.
const TXref & GetXref(void) const
Get the Xref member data.
const TArticle & GetArticle(void) const
Get the variant data.
const TGen & GetGen(void) const
Get the variant data.
bool IsArticle(void) const
Check if variant Article is selected.
bool IsGen(void) const
Check if variant Gen is selected.
use only n Cassandra database for the lookups</td > n</tr > n< tr > n< td > yes</td > n< td > do not use tables BIOSEQ_INFO and BLOB_PROP in the Cassandra database
static void ReportJournal(CNcbiOstream &out, const char *prefix, const CPubData &data)
static void ProcessInitials(string &initials)
static void CollectData(const CPub &pub, CPubData &data)
static void ReportTitle(CNcbiOstream &out, const char *prefix, const CPubData &data)
static TEntrezId ConvertPMCtoPMID(TEntrezId pmc)
static void GetOneInitialAuthorName(const string &author, string &name)
static void CollectDataArt(const CCit_art &cit, CPubData &data)
static string authors_cmp_result_label[]
static void NormalizeTitle(string &s)
static void GetAuthorsFromList(list< string > &authors, const CAuth_list &auth_list)
static TEntrezId DoHydraSearch(const CPubData &data)
static void CollectDataGen(const CCit_gen &cit, CPubData &data)
static bool FirstOrLastAuthorMatches(const list< string > &authors, const CAuth_list::C_Names &pubmed_authors)
static AuthorNameMatch CompareAuthorNames(string first, string second)
static void ReportSeqIds(CNcbiOstream &out, const CPubData::TSeqIds &ids)
static bool ReportTitleDiff(CNcbiOstream &out, const list< string > &pubmed_title_words, const list< string > &title_words)
static void ReportAuththors(CNcbiOstream &out, const char *prefix, const list< string > &auths)
string GetBestTitle(const CTitle &titles)
static AuthorNameMatch CompareAuthors(const list< string > &first, const list< string > &second)
static TEntrezId DoEUtilsSearch(CEutilsClient &eutils, const string &database, const string &term)
static string GetAuthorsCmpResultStr(AuthorNameMatch res)
static void GetNameFromStdName(const CPerson_id &person, string &name)
static void ReportOnePub(CNcbiOstream &out, const CCit_art &pubmed_cit_art, const CPubData &data, TEntrezId pmid)
static AuthorNameMatch IsAuthorInList(const list< string > &auths, const string &author)
static bool ReportAuthorDiff(CNcbiOstream &out, const list< string > &pubmed_auths, const list< string > &auths)
static bool CheckRefs(const CMedline_entry &medline_entry, const CPubData::TSeqIds &seq_ids)
static bool CheckDate(int year, int month, int max_date_check, const CCit_jour &juornal)
static int match(register const pcre_uchar *eptr, register const pcre_uchar *ecode, const pcre_uchar *mstart, int offset_top, match_data *md, eptrblock *eptrb, unsigned int rdepth)
static const char * prefix[]
CRef< CPub > journal(ParserPtr pp, char *bptr, char *eptr, CRef< CAuth_list > &auth_list, CRef< CTitle::C_E > &title, bool has_muid, CRef< CCit_art > &cit_art, Int4 er)