NCBI C++ ToolKit
SubSource.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: SubSource.cpp 99436 2023-03-27 19:11:29Z kans $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: .......
27  *
28  * File Description:
29  * .......
30  *
31  * Remark:
32  * This code was originally generated by application DATATOOL
33  * using the following specifications:
34  * 'seqfeat.asn'.
35  */
36 
37 // standard includes
38 #include <ncbi_pch.hpp>
39 #include <serial/enumvalues.hpp>
40 
41 // generated includes
43 
44 #include <math.h>
46 #include <corelib/ncbitime.hpp>
47 
49 #include <mutex>
50 #include <util/compile_time.hpp>
51 
52 // generated classes
53 
55 
56 BEGIN_objects_SCOPE // namespace ncbi::objects::
57 
58 unique_ptr<CLatLonCountryMap> CSubSource::m_LatLonCountryMap;
59 unique_ptr<CLatLonCountryMap> CSubSource::m_LatLonWaterMap;
60 
61 
62 // destructor
64 {
65 }
66 
67 void CSubSource::GetLabel(string* str) const
68 {
69  *str += '/';
70  string type_name;
71  if (GetSubtype() == eSubtype_other) {
72  type_name = "other";
73  } else {
74  try {
75  // eVocabulary_insdc has some special cases not (historically)
76  // used here.
77  type_name = GetSubtypeName(GetSubtype());
78  replace(type_name.begin(), type_name.end(), '_', '-');
79  } catch (const CSerialException&) {
80  type_name = "unknown";
81  }
82  }
83  *str += type_name;
84  *str += '=';
85  *str += GetName();
86  if (IsSetAttrib()) {
87  *str += " (";
88  *str += GetAttrib();
89  *str += ")";
90  }
91 }
92 
93 
95  EVocabulary vocabulary)
96 {
97  string name = NStr::TruncateSpaces(str);
98  NStr::ToLower(name);
99  replace(name.begin(), name.end(), '_', '-');
100  replace(name.begin(), name.end(), ' ', '-');
101 
102  if ( NStr::EqualNocase(name, "note") ||
103  NStr::EqualNocase(name, "subsource-note") ||
104  NStr::EqualNocase(name, "subsrc-note") ||
105  NStr::EqualNocase(name, "note-subsource")) {
106  return eSubtype_other;
107  } else if (vocabulary == eVocabulary_insdc) {
108  // consider a table if more special cases arise.
109  if (name == "insertion-seq") {
111  } else if (name == "plasmid") {
112  return eSubtype_plasmid_name;
113  } else if (name == "transposon") {
115  } else if (name == "sub-clone") {
116  return eSubtype_subclone;
117  }
118  }
119  return ENUM_METHOD_NAME(ESubtype)()->FindValue(name);
120 }
121 
122 
124  EVocabulary vocabulary)
125 {
126 
127  string name = NStr::TruncateSpaces(str);
128  NStr::ToLower(name);
129  replace(name.begin(), name.end(), '_', '-');
130  replace(name.begin(), name.end(), ' ', '-');
131 
132  if ( NStr::EqualNocase(name, "note") ||
133  NStr::EqualNocase(name, "subsource-note") ||
134  NStr::EqualNocase(name, "subsrc-note") ||
135  NStr::EqualNocase(name, "note-subsource")) {
136  return true;
137  }
138  if (vocabulary == eVocabulary_insdc) {
139  // consider a table if more special cases arise.
140  if (name == "insertion-seq" ||
141  name == "plasmid" ||
142  name == "transposon" ||
143  name == "sub-clone") {
144  return true;
145  }
146  }
147  return ENUM_METHOD_NAME(ESubtype)()->IsValidName(name);
148 }
149 
150 
152  EVocabulary vocabulary)
153 {
154  if (stype == CSubSource::eSubtype_other) {
155  return "note";
156  } else if (vocabulary == eVocabulary_insdc) {
157  switch (stype) {
158  case eSubtype_subclone: return "sub_clone";
159  case eSubtype_plasmid_name: return "plasmid";
160  case eSubtype_transposon_name: return "transposon";
161  case eSubtype_insertion_seq_name: return "insertion_seq";
162  default:
163  return NStr::Replace
164  (ENUM_METHOD_NAME(ESubtype)()->FindName(stype, true),
165  "-", "_");
166  }
167  } else {
168  return ENUM_METHOD_NAME(ESubtype)()->FindName(stype, true);
169  }
170 }
171 
172 
173 
175 {
176  return subtype != eSubtype_chromosome
177  && subtype != eSubtype_sex
178  && subtype != eSubtype_germline
179  && subtype != eSubtype_rearranged
180  && subtype != eSubtype_plasmid_name
181  && subtype != eSubtype_segment
182  && subtype != eSubtype_country
183  && subtype != eSubtype_transgenic
184  && subtype != eSubtype_environmental_sample
185  && subtype != eSubtype_lat_lon
186  && subtype != eSubtype_collection_date
187  && subtype != eSubtype_collected_by
188  && subtype != eSubtype_identified_by
189  && subtype != eSubtype_fwd_primer_seq
190  && subtype != eSubtype_rev_primer_seq
191  && subtype != eSubtype_fwd_primer_name
192  && subtype != eSubtype_rev_primer_name
193  && subtype != eSubtype_metagenomic
194  && subtype != eSubtype_altitude
195  && subtype != eSubtype_clone;
196 }
197 
198 
199 bool CSubSource::NeedsNoText(const TSubtype& subtype)
200 {
201  if (subtype == eSubtype_germline
202  || subtype == eSubtype_rearranged
203  || subtype == eSubtype_transgenic
204  || subtype == eSubtype_environmental_sample
205  || subtype == eSubtype_metagenomic) {
206  return true;
207  } else {
208  return false;
209  }
210 }
211 
212 
214 {
215  if (subtype == eSubtype_frequency
216  || subtype == eSubtype_insertion_seq_name
217  || subtype == eSubtype_phenotype
218  || subtype == eSubtype_plastid_name
219  || subtype == eSubtype_transposon_name
220  || subtype == eSubtype_fwd_primer_seq
221  || subtype == eSubtype_rev_primer_seq
222  || subtype == eSubtype_fwd_primer_name
223  || subtype == eSubtype_rev_primer_name
224  || subtype == eSubtype_whole_replicon) { // metagenomic subsrc qualifier taken off this list: GB-3384
225  return true;
226  } else {
227  return false;
228  }
229 }
230 
231 
232 bool CSubSource::IsDayValueOkForMonth(int day, int month, int year)
233 {
234  if (month < 1 || month > 12 || day < 1) {
235  return false;
236  }
237  bool rval = true;
238  if (year < 100) {
239  year += 2000;
240  } else if (year > 3000) {
241  return false;
242  } else if (year < 1538) {
243  return false;
244  }
245  CTime month_o(year, month, 1);
246  if (day > month_o.DaysInMonth()) {
247  rval = false;
248  }
249  return rval;
250 }
251 
252 
254 {
255  if (NStr::IsBlank(test)) {
257  "collection-date string is blank");
258  }
259  string str = NStr::TruncateSpaces(test);
260 
261  if (IsISOFormatDate(str)) {
262  return GetDateFromISODate(str);
263  }
264 
265  size_t pos = NStr::Find(str, "-");
266  string year;
267  string month;
268  string day;
269 
270  if (pos == NPOS) {
271  year = str;
272  } else {
273  size_t pos2 = NStr::Find(str, "-", pos + 1);
274  if (pos2 == NPOS) {
275  month = str.substr(0, pos);
276  year = str.substr(pos + 1);
277  if (NStr::IsBlank(month)) {
279  "collection-date string is improperly formatted");
280  }
281  } else {
282  day = str.substr(0, pos);
283  month = str.substr(pos + 1, pos2 - pos - 1);
284  year = str.substr(pos2 + 1);
285  if (NStr::IsBlank(month) || NStr::IsBlank(day)) {
287  "collection-date string is improperly formatted");
288  }
289  }
290  }
291 
292  int month_val = 0;
293  if (!NStr::IsBlank(month)) {
294  try {
295  month_val = CTime::MonthNameToNum(month);
296  } catch (const CTimeException&) {
298  "collection-date string has invalid month");
299  }
300  }
301 
302  int day_val = 0;
303  if (!NStr::IsBlank(day)) {
304  try {
305  day_val = NStr::StringToInt (day);
306  if (day_val < 1) {
308  "collection-date string has invalid day value");
309  }
310  } catch ( const exception& ) {
311  // threw exception while converting to int
313  "collection-date string is improperly formatted");
314  }
315  }
316 
317  if (NStr::IsBlank(year)) {
319  "collection-date string is improperly formatted");
320  }
321 
322  int year_val = 0;
323  try {
324  year_val = NStr::StringToInt (year);
325  } catch ( const exception& ) {
326  // threw exception while converting to int
328  "collection-date string is improperly formatted");
329  }
330 
331  /*
332  if (year_val < 1000 || year_val >= 2100) {
333  NCBI_THROW (CException, eUnknown,
334  "collection-date year is out of range");
335  }
336  */
337 
338  if (year_val < 1000) {
340  "collection-date year is out of range");
341  }
342 
343  if (year_val >= 2100) {
345  "collection-date year is out of range");
346  }
347 
348  if (day_val > 0 && month_val > 0 && !IsDayValueOkForMonth(day_val, month_val, year_val)) {
350  "collection-date day is greater than monthly maximum");
351  }
352 
353  CRef<CDate> date(new CDate);
354 
355  date->SetStd().SetYear (year_val);
356  if (month_val > 0) {
357  date->SetStd().SetMonth (month_val);
358  }
359  if (day_val > 0) {
360  date->SetStd().SetDay (day_val);
361  }
362 
363  time_t t;
364 
365  time(&t);
366 
367  CDate now(t);
368 
369  /*
370  if (IsCollectionDateAfterTime(*date, t)) {
371  NCBI_THROW (CException, eUnknown,
372  "collection-date year is out of range");
373  }
374  */
375 
376  return date;
377 }
378 
379 
380 bool CSubSource::IsCollectionDateAfterTime(const string& collection_date, time_t t, bool& bad_format)
381 {
382  bad_format = false;
383  bool in_future = false;
384  vector<string> pieces;
385  NStr::Split(collection_date, "/", pieces);
386  if (pieces.size() > 2) {
387  bad_format = true;
388  } else {
389  ITERATE(vector<string>, it, pieces) {
390  CRef<CDate> coll_date = DateFromCollectionDate (*it);
391  if (!coll_date) {
392  bad_format = true;
393  } else if (IsCollectionDateAfterTime(*coll_date, t)) {
394  in_future = true;
395  }
396  }
397  }
398  return in_future;
399 }
400 
401 
402 bool CSubSource::IsCollectionDateAfterTime(const CDate& collection_date, time_t t)
403 {
404  CDate now(t);
405  if (collection_date.Compare(now) == CDate::eCompare_after) {
406  return true;
407  } else {
408  return false;
409  }
410 }
411 
412 
413 bool CSubSource::IsCollectionDateAfterTime(const CDate& collection_date, CTime& ctime)
414 {
415  time_t t = ctime.GetTimeT();
416  return IsCollectionDateAfterTime(collection_date, t);
417 }
418 
419 
420 void CSubSource::IsCorrectDateFormat(const string& date_string, bool& bad_format, bool& in_future)
421 {
422  bad_format = false;
423  in_future = false;
424 
425  vector<string> pieces;
426  NStr::Split(date_string, "/", pieces);
427  if (pieces.size() > 2) {
428  bad_format = true;
429  return;
430  } else if (pieces.size() == 2) {
431  bool first_bad = false;
432  bool first_future = false;
433  bool second_bad = false;
434  bool second_future = false;
435  IsCorrectDateFormat(pieces[0], first_bad, first_future);
436  IsCorrectDateFormat(pieces[1], second_bad, second_future);
437  bad_format = first_bad || second_bad;
438  if (!bad_format) {
439  in_future = first_future || second_future;
440  }
441  return;
442  }
443 
444  try {
445  CRef<CDate> coll_date = CSubSource::DateFromCollectionDate (date_string);
446 
447  if (!IsISOFormatDate(date_string)) {
448  // if there are two dashes, then the first token needs to be the day, and the
449  // day has to have two numbers, a leading zero if the day is less than 10
450  size_t pos = NStr::Find(date_string, "-");
451  if (pos != NPOS) {
452  size_t pos2 = NStr::Find(date_string, "-", pos + 1);
453  if (pos2 != NPOS && pos != 2) {
454  bad_format = true;
455  }
456  }
457  }
458 
459  if (!bad_format) {
460  time_t t;
461 
462  time(&t);
463 
464  in_future = IsCollectionDateAfterTime(*coll_date, t);
465  }
466  } catch (const CException& ) {
467  bad_format = true;
468  }
469 }
470 
471 size_t CSubSource::CheckDateFormat(const string& date_string)
472 {
473  size_t rval = eDateFormatFlag_ok;
474  vector<string> pieces;
475  NStr::Split(date_string, "/", pieces);
476  if (pieces.size() > 2) {
478  } else if (pieces.size() == 2) {
479  rval |= CheckDateFormat(pieces[0]);
480  rval |= CheckDateFormat(pieces[1]);
481  if (rval == eDateFormatFlag_ok) {
482  try {
485  if (d2->Compare(*d1) == CDate::eCompare_before) {
487  }
488  } catch (const CException&) {
490  }
491  }
492  return rval;
493  }
494 
495  try {
496  CRef<CDate> coll_date = CSubSource::DateFromCollectionDate(date_string);
497 
498  if (!IsISOFormatDate(date_string)) {
499  // if there are two dashes, then the first token needs to be the day, and the
500  // day has to have two numbers, a leading zero if the day is less than 10
501  size_t pos = NStr::Find(date_string, "-");
502  if (pos != NPOS) {
503  size_t pos2 = NStr::Find(date_string, "-", pos + 1);
504  if (pos2 != NPOS && pos != 2) {
506  }
507  }
508  }
509 
510  if (rval == eDateFormatFlag_ok) {
511  time_t t;
512 
513  time(&t);
514  if (IsCollectionDateAfterTime(*coll_date, t)) {
516  }
517  }
518  } catch (const CException&) {
520  }
521  return rval;
522 }
523 
525 
526 // null term exemption values, order is not important
527 MAKE_CONST_SET(s_Null_CollectionDatesSet, ct::tagStrCase,
528 {
529  "missing",
530  "missing: control sample",
531  "missing: data agreement established pre-2023",
532  "missing: endangered species",
533  "missing: human-identifiable",
534  "missing: lab stock",
535  "missing: sample group",
536  "missing: synthetic construct",
537  "missing: third party data",
538  "not applicable",
539  "not collected",
540  "not provided",
541  "restricted access",
542 })
543 
544 string CSubSource::GetCollectionDateProblem (const string& date_string)
545 {
546  string problem;
547  if (s_Null_CollectionDatesSet.find(date_string.c_str()) != s_Null_CollectionDatesSet.end()) {
548  return problem;
549  }
550  size_t rval = CheckDateFormat(date_string);
551  if (rval & eDateFormatFlag_bad_format) {
552  problem = "Collection_date format is not in DD-Mmm-YYYY format";
553  } else if (rval & eDateFormatFlag_in_future) {
554  problem = "Collection_date is in the future";
555  } else if (rval & eDateFormatFlag_out_of_order) {
556  problem = "Collection_dates are out of order";
557  }
558  return problem;
559 }
560 
561 
562 string CSubSource::x_ParseDateRangeWithDelimiter(const string& orig_date, CTempString delim)
563 {
564  size_t pos = NStr::Find(orig_date, delim, NStr::eNocase);
565  if (pos == NPOS) {
566  return kEmptyStr;
567  }
568  size_t second_pos = NStr::Find(orig_date.substr(pos + 1), delim, NStr::eNocase);
569  if (second_pos != NPOS) {
570  return kEmptyStr;
571  }
572  bool month_ambig = false;
573  string first_date = FixDateFormat(orig_date.substr(0, pos), true, month_ambig);
574  if (month_ambig || NStr::IsBlank(first_date)) {
575  return kEmptyStr;
576  }
577  string second_date = FixDateFormat(orig_date.substr(pos + delim.length()), true, month_ambig);
578  if (month_ambig || NStr::IsBlank(second_date)) {
579  return kEmptyStr;
580  }
581  string fix = first_date + "/" + second_date;
582  return fix;
583 }
584 
585 
586 string CSubSource::FixDateFormat (const string& orig_date)
587 {
588  bool month_ambiguous = false;
589 
590  string fix = FixDateFormat(orig_date, true, month_ambiguous);
591  if (month_ambiguous) {
592  fix.clear();
593  } else if (NStr::IsBlank(fix)) {
594  static const char* delimiters[] = {"/", " to ", " and ", "-", "_"};
595  for (size_t i = 0; i < ArraySize(delimiters); i++) {
596  fix = x_ParseDateRangeWithDelimiter(orig_date, delimiters[i]);
597  if (!NStr::IsBlank(fix)) {
598  break;
599  }
600  }
601  }
602  return fix;
603 }
604 
605 // ISO Format for time is one of these:
606 // HH:MM:SS
607 // HH:MM
608 // HH
609 // Followed by either Z or +hh:mm to indicate an offset from Zulu
610 bool CSubSource::IsISOFormatTime(const string& orig_time, int& hour, int& min, int& sec, bool require_time_zone)
611 {
612  int offset_hour = 0;
613  int offset_min = 0;
614  size_t suffix = NStr::Find(orig_time, "Z");
615  if (suffix == NPOS) {
616  suffix = NStr::Find(orig_time, "+");
617  if (suffix == NPOS) {
618  if (require_time_zone) {
619  return false;
620  } else {
621  suffix = orig_time.length();
622  }
623  } else {
624  if (orig_time.substr(suffix).length() != 6 ||
625  !isdigit((unsigned char)orig_time[suffix + 1]) ||
626  !isdigit((unsigned char)orig_time[suffix + 2]) ||
627  orig_time[suffix + 3] != ':' ||
628  !isdigit((unsigned char)orig_time[suffix + 4]) ||
629  !isdigit((unsigned char)orig_time[suffix + 5])) {
630  return false;
631  }
632  try {
633  offset_hour = NStr::StringToInt(orig_time.substr(suffix + 1, 2));
634  offset_min = NStr::StringToInt(orig_time.substr(suffix + 4, 2));
635  } catch (...) {
636  return false;
637  }
638  }
639  }
640  if (suffix != 2 && suffix != 5 && suffix != 8) {
641  return false;
642  }
643 
644  if (!isdigit((unsigned char)orig_time[0]) || !isdigit((unsigned char)orig_time[1])) {
645  return false;
646  }
647  hour = 0;
648  min = 0;
649  sec = 0;
650  try {
651  hour = NStr::StringToInt(orig_time.substr(0, 2));
652  if (hour < 0 || hour > 23) {
653  return false;
654  }
655  hour -= offset_hour;
656  } catch (...) {
657  return false;
658  }
659  if (suffix > 2) {
660  if (!isdigit((unsigned char)orig_time[3]) || !isdigit((unsigned char)orig_time[4])) {
661  return false;
662  }
663  try {
664  min = NStr::StringToInt(orig_time.substr(3, 2));
665  if (min < 0 || min > 59) {
666  return false;
667  }
668  } catch (...) {
669  return false;
670  }
671  min -= offset_min;
672  }
673  if (suffix == 8) {
674  if (!isdigit((unsigned char)orig_time[6]) || !isdigit((unsigned char)orig_time[7])) {
675  return false;
676  }
677  try {
678  sec = NStr::StringToInt(orig_time.substr(6, 2));
679  if (sec < 0) {
680  // negative number bad
681  return false;
682  } else if (sec > 59) {
683  // too big
684  return false;
685  }
686  } catch (...) {
687  return false;
688  }
689  }
690 
691  return true;
692 }
693 
694 // ISO Format for date is exactly 10 characters long OR exactly 7 characters long.
695 // For ten characters:
696 // First four characters must be digits, represent year.
697 // Fifth character must be dash.
698 // Sixth and seventh characters must be digits, represent month, use zero padding.
699 // Eighth character must be dash.
700 // Ninth and tenth characters must be digits, represent day, use zero padding.
701 // For 7 characters:
702 // First four characters must be digits, represent year.
703 // Fifth character must be dash.
704 // Sixth and seventh characters must be digits, represent month, use zero padding.
705 bool CSubSource::IsISOFormatDateOnly (const string& cpy)
706 {
707  if (cpy.length() != 10 && cpy.length() != 7) {
708  return false;
709  }
710  bool rval = true;
711  size_t pos = 0;
712  string::const_iterator it = cpy.begin();
713  while (it != cpy.end() && rval) {
714  if (pos == 4 || pos == 7) {
715  if (*it != '-') {
716  rval = false;
717  }
718  } else if (!isdigit(*it)) {
719  rval = false;
720  }
721  ++it;
722  ++pos;
723  }
724  if (rval) {
725  try {
726  int year = NStr::StringToInt(cpy.substr(0, 4));
727  int month = NStr::StringToInt(cpy.substr(5, 2));
728  if (month < 1 || month > 12) {
729  rval = false;
730  }
731  if (cpy.length() == 10) { // has day
732  int day = NStr::StringToInt(cpy.substr(8, 2));
733  if (!IsDayValueOkForMonth(day, month, year)) {
734  rval = false;
735  }
736  }
737  } catch (...) {
738  rval = false;
739  }
740  }
741  return rval;
742 }
743 
744 
745 bool CSubSource::x_IsFixableIsoDate(const string& orig_date)
746 {
747  string cpy = orig_date;
749  size_t time_pos = NStr::Find(cpy, "T");
750  bool rval = false;
751  if (time_pos == NPOS) {
752  rval = false;
753  } else {
754  if (!IsISOFormatDateOnly(cpy.substr(0, time_pos))) {
755  rval = false;
756  } else {
757  int h, m, s;
758  if (IsISOFormatTime(cpy.substr(time_pos + 1), h, m, s, true)) {
759  // already fine, not fixable
760  rval = false;
761  } else {
762  rval = IsISOFormatTime(cpy.substr(time_pos + 1), h, m, s, false);
763  }
764  }
765  }
766  return rval;
767 }
768 
769 
770 string CSubSource::x_RemoveIsoTime(const string& orig_date)
771 {
772  string cpy = orig_date;
774  size_t time_pos = NStr::Find(cpy, "T");
775  if (time_pos != NPOS) {
776  cpy = cpy.substr(0, time_pos);
777  }
778  return cpy;
779 }
780 
781 
782 bool CSubSource::IsISOFormatDate(const string& orig_date)
783 {
784  string cpy = orig_date;
786  size_t time_pos = NStr::Find(cpy, "T");
787  if (time_pos == NPOS) {
788  return IsISOFormatDateOnly(cpy);
789  } else {
790  int h, m, s;
791  return (IsISOFormatDateOnly(cpy.substr(0, time_pos)) &&
792  IsISOFormatTime(cpy.substr(time_pos + 1), h, m, s));
793  }
794 
795 }
796 
798 {
799  try {
800  string cpy = orig_date;
802  CRef<CDate> date(new CDate());
803  int year_val = NStr::StringToInt(cpy.substr(0, 4));
804  int month_val = NStr::StringToInt(cpy.substr(5, 2));
805  date->SetStd().SetYear (year_val);
806  date->SetStd().SetMonth (month_val);
807  if (cpy.length() > 7) {
808  int day_val = NStr::StringToInt(cpy.substr(8, 2));
809  date->SetStd().SetDay (day_val);
810  }
811  return date;
812  } catch (...) {
813  return CRef<CDate>();
814  }
815 }
816 
817 
818 vector<string> CSubSource::x_GetDateTokens(const string& orig_date)
819 {
820  vector<string> tokens;
821  string token_delimiters = " ,-/=_.";
822 
823  string cpy = orig_date;
825 
826  string curr_token;
827  bool is_chars = false;
828  ITERATE(string, s, cpy) {
829  if (token_delimiters.find(*s) != NPOS) {
830  if (!NStr::IsBlank(curr_token)) {
831  tokens.push_back(curr_token);
832  }
833  curr_token.clear();
834  is_chars = false;
835  } else if (is_chars && !isalpha((unsigned char)(*s))) {
836  // previous token was all letters, do not add non-letter characters
837  if (!NStr::IsBlank(curr_token)) {
838  tokens.push_back(curr_token);
839  }
840  curr_token = *s;
841  is_chars = false;
842  } else if (!NStr::IsBlank(curr_token) && !is_chars && isalpha(*s)) {
843  // previous token had no letters
844  tokens.push_back(curr_token);
845  curr_token = *s;
846  is_chars = true;
847  } else {
848  curr_token += *s;
849  if (isalpha(*s)) {
850  is_chars = true;
851  }
852  }
853  }
854  if (!NStr::IsBlank(curr_token)) {
855  tokens.push_back(curr_token);
856  }
857 
858  // reattach 'st', 'nd', 'rd', and 'th' to numbers if present
859  if (tokens.size() > 3) {
860  vector<string>::iterator p = tokens.begin();
861  bool prev_is_number = isdigit((unsigned char)(*p)[0]);
862  vector<string>::iterator s = p;
863  ++s;
864  while (s != tokens.end()) {
865  if (prev_is_number &&
866  (NStr::EqualNocase(*s, "st") ||
867  NStr::EqualNocase(*s, "nd") ||
868  NStr::EqualNocase(*s, "rd") ||
869  NStr::EqualNocase(*s, "th"))) {
870  *p += *s;
871  s = tokens.erase(s);
872  prev_is_number = false;
873  } else {
874  ++p;
875  ++s;
876  prev_is_number = isdigit((unsigned char)(*p)[0]);
877  }
878  }
879  }
880 
881  return tokens;
882 }
883 
884 
885 bool s_ChooseMonthAndDay(const string& token1, const string& token2, bool month_first, string& month, int& day, bool& month_ambiguous)
886 {
887  try {
888  int val1 = NStr::StringToInt (token1);
889  int val2 = NStr::StringToInt (token2);
890  if (val1 > 12 && val2 > 12) {
891  // both numbers too big for month
892  return false;
893  } else if (val1 < 13 && val2 < 13) {
894  if (val1 == val2) {
895  // no need to call this ambiguous
896  month = CTime::MonthNumToName(val1, CTime::eAbbr);
897  day = val2;
898  } else {
899  // both numbers could be month
900  month_ambiguous = true;
901  if (month_first) {
902  month = CTime::MonthNumToName(val1, CTime::eAbbr);
903  day = val2;
904  } else {
905  month = CTime::MonthNumToName(val2, CTime::eAbbr);
906  day = val1;
907  }
908  }
909  } else if (val1 < 13) {
910  month = CTime::MonthNumToName(val1, CTime::eAbbr);
911  day = val2;
912  } else {
913  month = CTime::MonthNumToName(val2, CTime::eAbbr);
914  day = val1;
915  }
916  return true;
917  } catch ( ... ) {
918  return false;
919  }
920 }
921 
922 
923 string CSubSource::FixDateFormat (const string& test, bool month_first, bool& month_ambiguous)
924 {
925  string orig_date = test;
926  NStr::TruncateSpacesInPlace(orig_date);
927 
928  if (IsISOFormatDate(orig_date)) {
929  return orig_date;
930  } else if (x_IsFixableIsoDate(orig_date)) {
931  return x_RemoveIsoTime(orig_date);
932  }
933 
934  string reformatted_date;
935  string month;
936  int year = 0, day = 0;
937  //string token_delimiters = " ,-/=_.";
938  size_t num_original_tokens = 0;
939 
940  month_ambiguous = false;
941  vector<string> tokens = x_GetDateTokens(orig_date);
942 
943  num_original_tokens = tokens.size();
944  if (tokens.size() < 1 || tokens.size() > 3) {
945  // no tokens or too many tokens
946  return kEmptyStr;
947  }
948 
949  string one_token;
950  vector<string>::iterator it = tokens.begin();
951  while (it != tokens.end()) {
952  one_token = *it;
953  bool found = false;
954  if (NStr::EqualNocase(one_token, "1st") || NStr::EqualNocase(one_token, "first")) {
955  day = 1;
956  found = true;
957  } else if (NStr::EqualNocase(one_token, "2nd") || NStr::EqualNocase(one_token, "second")) {
958  day = 2;
959  found = true;
960  } else if (NStr::EqualNocase(one_token, "3rd") || NStr::EqualNocase (one_token, "third")) {
961  day = 3;
962  found = true;
963  } else if (one_token.length() > 0
964  && isdigit((unsigned char)one_token[0])
965  && NStr::EndsWith(one_token, "th")) {
966  try {
967  day = NStr::StringToInt (one_token.substr(0, one_token.length() - 2));
968  found = true;
969  } catch ( ... ) {
970  // threw exception while converting to int
971  return kEmptyStr;
972  }
973  } else if (isalpha((unsigned char)one_token[0])) {
974  if (!NStr::IsBlank(month)) {
975  // already have month, error
976  return kEmptyStr;
977  }
978  if (one_token.length() > 3) {
979  one_token = one_token.substr(0, 3);
980  }
981  try {
982  int month_num = CTime::MonthNameToNum(one_token);
983  found = true;
984  month = CTime::MonthNumToName(month_num, CTime::eAbbr);
985  } catch (const CTimeException&) {
986  }
987  } else {
988  try {
989  int this_val = NStr::StringToInt (one_token);
990  int min = 1;
991  int max = 31;
992  if (this_val < min) {
993  return kEmptyStr;
994  } else if (this_val > max) {
995  if (year > 0) {
996  // already have year, error
997  return kEmptyStr;
998  }
999  year = this_val;
1000  found = true;
1001  }
1002  } catch ( ... ) {
1003  // threw exception while converting to int
1004  return kEmptyStr;
1005  }
1006  }
1007  if (found) {
1008  it = tokens.erase(it);
1009  } else {
1010  it++;
1011  }
1012  }
1013 
1014  if (tokens.size() == 0) {
1015  // good - all tokens assigned to values
1016  } else if (tokens.size() > 2) {
1017  // three numbers: treat last one as year
1018  try {
1019  year = NStr::StringToInt(tokens[2]);
1020  if (year < 100) {
1021  year += 2000;
1022  }
1023  if (!s_ChooseMonthAndDay(tokens[0], tokens[1], month_first, month, day, month_ambiguous)) {
1024  return kEmptyStr;
1025  }
1026  // mark month as ambiguous, since we are guessing about year
1027  month_ambiguous = true;
1028  } catch ( ... ) {
1029  // threw exception while converting to int
1030  return kEmptyStr;
1031  }
1032  } else if (tokens.size() == 1) {
1033  try {
1034  int val = NStr::StringToInt (tokens[0]);
1035  if (year == 0) {
1036  year = val;
1037  } else {
1038  if (NStr::IsBlank (month)) {
1039  if (val > 0 && val < 13) {
1041  } else {
1042  // month number out of range
1043  return kEmptyStr;
1044  }
1045  } else {
1046  day = val;
1047  }
1048  }
1049  } catch ( ... ) {
1050  // threw exception while converting to int
1051  return kEmptyStr;
1052  }
1053  } else if (!NStr::IsBlank (month)) {
1054  if (tokens.size() == 2) {
1055  // we have a month and two other numbers (we hope)
1056  int val1 = 0;
1057  int val2 = 0;
1058  try {
1059  val1 = NStr::StringToInt (tokens[0]);
1060  val2 = NStr::StringToInt (tokens[1]);
1061  } catch (CException& /*e*/) {
1062  // not actually numbers
1063  return kEmptyStr;
1064  }
1065  bool zero_pad_1 = NStr::StartsWith(tokens[0], "0");
1066  bool zero_pad_2 = NStr::StartsWith(tokens[1], "0");
1067  if (val1 < 10 && !zero_pad_1 && (val2 > 10 || zero_pad_2)) {
1068  // if one token is not zero-padded and less than 10,
1069  // the other either is zero-padded and greater than 10,
1070  // the "small" token is the day and the second (+2000) is the year
1071  day = val1;
1072  year = val2 + 2000;
1073  } else if (val2 < 10 && !zero_pad_2 && (val1 > 10 || zero_pad_1)) {
1074  // if one token is not zero-padded and less than 10,
1075  // the other either is zero-padded and greater than 10,
1076  // the "small" token is the day and the second (+2000) is the year
1077  day = val2;
1078  year = val1 + 2000;
1079  } else {
1080  int month_num = CTime::MonthNameToNum(month);
1081  if (IsDayValueOkForMonth(val1, month_num, val2 + 2000)) {
1082  day = val1;
1083  year = val2 + 2000;
1084  } else {
1085  day = val2;
1086  year = val1 + 2000;
1087  }
1088  }
1089  } else {
1090  return kEmptyStr;
1091  }
1092  } else {
1093  if (!s_ChooseMonthAndDay(tokens[0], tokens[1], month_first, month, day, month_ambiguous)) {
1094  return kEmptyStr;
1095  }
1096  }
1097 
1098  // make sure day is valid
1099  if (day > 0 && !NStr::IsBlank(month) && year > -1) {
1100  try {
1101  int month_num = CTime::MonthNameToNum(month);
1102  if (!IsDayValueOkForMonth(day, month_num, year)) {
1103  return kEmptyStr;
1104  }
1105  } catch (const CTimeException&) {
1106  return kEmptyStr;
1107  }
1108  }
1109 
1110  if (year > 0 && year < 100 && num_original_tokens > 1) {
1111  // try to guess year from two-digit year provided,
1112  // only if it could not possibly be a day of the month
1113  // and if there were at least two tokens provided
1114  string year_date = NStr::NumericToString(year + 2000);
1115  bool format_bad = false;
1116  bool in_future = false;
1117  IsCorrectDateFormat(year_date, format_bad, in_future);
1118  if (in_future) {
1119  year += 1900;
1120  } else {
1121  year += 2000;
1122  }
1123  }
1124  if (year >= 1000 && year < 2100) {
1125  reformatted_date = NStr::NumericToString (year);
1126  if (!NStr::IsBlank (month)) {
1127  reformatted_date = month + "-" + reformatted_date;
1128  if (day > 0) {
1129  string day_str = NStr::NumericToString (day);
1130  if (day_str.length() < 2) {
1131  day_str = "0" + day_str;
1132  }
1133  reformatted_date = day_str + "-" + reformatted_date;
1134  }
1135  }
1136  }
1137 
1138  return reformatted_date;
1139 }
1140 
1141 
1142 void CSubSource::DetectDateFormat(const string& orig_date, bool& ambiguous, bool &day_first)
1143 {
1144  ambiguous = false;
1145  day_first = false;
1146  vector<string> tokens = x_GetDateTokens(orig_date);
1147  if (tokens.size() != 3) {
1148  // can't do detection if there are more or less than three tokens
1149  ambiguous = true;
1150  return;
1151  }
1152  vector<int> nums;
1153 
1154  // detection is only valid if all tokens are numbers and at least one is known to be the year
1155  try {
1156  ITERATE(vector<string>, it, tokens) {
1157  nums.push_back(NStr::StringToInt (*it));
1158  }
1159  } catch ( ... ) {
1160  // threw exception while converting to int
1161  ambiguous = true;
1162  return;
1163  }
1164  enum EPos { eDay = 0, eMonth = 1, eYear = 2 };
1165  vector<int> positions;
1166  positions.push_back(0);
1167  positions.push_back(0);
1168  positions.push_back(0);
1169 
1170  int token_pos = 1;
1171  ITERATE(vector<int>, it, nums) {
1172  if (*it > 31) {
1173  if (positions[eYear] > 0) {
1174  // already found a year
1175  ambiguous = true;
1176  return;
1177  }
1178  positions[eYear] = token_pos;
1179  } else if (*it > 12) {
1180  if (positions[eDay] > 0) {
1181  // already found a day
1182  ambiguous = true;
1183  return;
1184  }
1185  positions[eDay] = token_pos;
1186  } else if (positions[eMonth] > 0) {
1187  // already found a month
1188  ambiguous = true;
1189  return;
1190  } else {
1191  positions[eMonth] = token_pos;
1192  }
1193  token_pos++;
1194  }
1195  if (positions[eDay] < positions[eMonth]) {
1196  day_first = true;
1197  } else {
1198  day_first = false;
1199  }
1200 }
1201 
1202 
1203 void CSubSource::IsCorrectLatLonFormat (string lat_lon, bool& format_correct, bool& precision_correct,
1204  bool& lat_in_range, bool& lon_in_range,
1205  double& lat_value, double& lon_value)
1206 {
1207  format_correct = false;
1208  lat_in_range = false;
1209  lon_in_range = false;
1210  precision_correct = false;
1211  double ns, ew;
1212  char lon, lat;
1213  int processed;
1214 
1215  lat_value = 0.0;
1216  lon_value = 0.0;
1217 
1218  if (NStr::IsBlank(lat_lon)) {
1219  return;
1220  } else if (sscanf (lat_lon.c_str(), "%lf %c %lf %c%n", &ns, &lat, &ew, &lon, &processed) != 4
1221  || size_t(processed) != lat_lon.length()) {
1222  return;
1223  } else if ((lat != 'N' && lat != 'S') || (lon != 'E' && lon != 'W')) {
1224  return;
1225  } else {
1226  // init values found
1227  if (lat == 'N') {
1228  lat_value = ns;
1229  } else {
1230  lat_value = 0.0 - ns;
1231  }
1232  if (lon == 'E') {
1233  lon_value = ew;
1234  } else {
1235  lon_value = 0.0 - ew;
1236  }
1237 
1238  // make sure format is correct
1239  vector<string> pieces;
1240  NStr::Split(lat_lon, " ", pieces);
1241  if (pieces.size() > 3) {
1242  int precision_lat = x_GetPrecision(pieces[0]);
1243  int precision_lon = x_GetPrecision(pieces[2]);
1244 
1245  char reformatted[1000];
1246  sprintf (reformatted, "%.*lf %c %.*lf %c", precision_lat, ns, lat,
1247  precision_lon, ew, lon);
1248 
1249  size_t len = strlen (reformatted);
1250  if (NStr::StartsWith(lat_lon, reformatted)
1251  && (len == lat_lon.length()
1252  || (len < lat_lon.length()
1253  && lat_lon[len] == ';'))) {
1254  format_correct = true;
1255  if (ns <= 90 && ns >= 0) {
1256  lat_in_range = true;
1257  }
1258  if (ew <= 180 && ew >= 0) {
1259  lon_in_range = true;
1260  }
1261  if (precision_lat < 3 && precision_lon < 3) {
1262  precision_correct = true;
1263  }
1264  }
1265  }
1266  }
1267 }
1268 
1269 
// Body of the lat-lon precision fixer. NOTE(review): the signature line is not
// visible in this excerpt; the body reads a string named `orig` and returns a
// string.
//
// `orig` is returned unchanged unless IsCorrectLatLonFormat() reports it as
// well-formed, in range, and NOT precision-correct. In that case it is
// re-printed with each coordinate capped at 4 decimal places.
1271 {
1272  bool format_correct = false;
1273  bool precision_correct = false;
1274  bool lat_in_range = false;
1275  bool lon_in_range = false;
1276  double lat_value = 0.0;
1277  double lon_value = 0.0;
1278  IsCorrectLatLonFormat(orig, format_correct, precision_correct,
1279  lat_in_range, lon_in_range,
1280  lat_value, lon_value);
// Nothing to fix: either the value is not valid, or its precision is already fine.
1281  if (!format_correct || !lat_in_range || !lon_in_range || precision_correct) {
1282  return orig;
1283  }
1284  vector<string> pieces;
1285  NStr::Split(orig, " ", pieces);
1286  if (pieces.size() > 3) {
1287  int precision_lat = x_GetPrecision(pieces[0]);
1288  int precision_lon = x_GetPrecision(pieces[2]);
// Cap both coordinates at 4 decimal places.
1289  if (precision_lat > 4) {
1290  precision_lat = 4;
1291  }
1292  if (precision_lon > 4) {
1293  precision_lon = 4;
1294  }
1295 
// Re-print using the magnitudes and the original hemisphere letters
// (first character of pieces[1] and pieces[3]).
1296  char reformatted[1000];
1297  sprintf(reformatted, "%.*lf %c %.*lf %c", precision_lat, fabs(lat_value), pieces[1].c_str()[0],
1298  precision_lon, fabs(lon_value), pieces[3].c_str()[0]);
// NOTE(review): new_val is never read; the buffer itself is returned.
1299  string new_val = reformatted;
1300  return reformatted;
1301  }
// Fewer than four space-separated pieces: return the empty string.
1302  return kEmptyStr;
1303 }
1304 
1305 /*
1306 1. String should be converted to UTF8 string, this will get rid of \xC0 and similar substrings
1307 2. Every codepoint (note that this is not regular ascii "char") that is not a digit or a decimal point or a letter should be prepended with a space.
1308  Transitions from alpha to digit/point and from digit/point to alpha should also be prepended with a space.
1309 3. NStr::Split is called with space as a separator and Tokenize flag - need to check if Split works with UTF8 strings properly.
1310 4. After this we should have a vector of tokens, some of which are numbers and others are "modifiers" such as ', '', degrees, N, S, E, W, etc.
1311 5. A pattern string is created where each number is replaced with "1" and modifiers are normalized to "lat", or "N"; the actual numerical values are kept in a separate vector
1312 6. Based on the pattern, the vector of numbers is parsed into degrees, minutes, or seconds.
1313 7. NSEW and "latitude/longitude" are applied to degrees in the order of appearance; if none are present, another heuristic is used to determine which is latitude and which is longitude.
1314 */
1315 
// Return a copy of old_str with spaces inserted so that later splitting on
// spaces yields separate tokens:
//  - an ASCII character that is not a letter, digit, '.', '-' or '+' gets a
//    space before and after it;
//  - a space is inserted where the output switches between a letter and a
//    non-letter;
//  - a symbol with a value of 0x80 or above is replaced by a single space.
// NOTE(review): the line that declares `sym` is not visible in this excerpt;
// presumably it decodes one symbol at iterator `i` - confirm.
1316 static string s_InsertSpacesBetweenTokens(const string &old_str)
1317 {
1318  string new_str;
1319  for (string::const_iterator i = old_str.begin(); i != old_str.end(); ++i)
1320  {
1322  if (sym < 0x80)
1323  {
1324  char c = static_cast<char>(sym);
// punctuation and other non-token characters: pad on the left
1325  if (!isalpha(c) && !isdigit(c) && c != '.' && c != '-' && c != '+')
1326  {
1327  new_str += ' ';
1328  }
// transition between a letter and a non-letter: start a new token
1329  else if (!new_str.empty() &&
1330  ((isalpha(new_str.back()) && !isalpha(c)) ||
1331  (!isalpha(new_str.back()) && isalpha(c))))
1332  {
1333  new_str += ' ';
1334  }
1335  new_str += c;
// same test as above: pad non-token characters on the right as well
1336  if (!isalpha(c) && !isdigit(c) && c != '.' && c != '-' && c != '+')
1337  {
1338  new_str += ' ';
1339  }
1340  }
1341  else
1342  {
// symbol outside the 7-bit range: replaced by a space
1343  new_str += ' ';
1344  }
1345  }
1346  return new_str;
1347 }
1348 
// Return a copy of old_str in which a decimal point separated from its digits
// by single spaces ("12 . 5") is joined back into one number ("12.5").
// Symbols with a value of 0x80 or above are replaced by a space.
// If the whole input consists only of digits, '+', '-', '.' and whitespace,
// every '+' and '-' in the result is additionally preceded by a space.
// NOTE(review): the line that declares `sym` is not visible in this excerpt;
// presumably it decodes one symbol at iterator `i` - confirm.
1349 static string s_RemoveSpacesWithinNumbers(const string &old_str)
1350 {
1351  string new_str;
// stays true only while every symbol seen is a digit, sign, '.', or whitespace
1352  bool is_number = true;
1353  for (string::const_iterator i = old_str.begin(); i != old_str.end(); ++i)
1354  {
1356  if (sym < 0x80)
1357  {
1358  char c = static_cast<char>(sym);
1359  size_t j = new_str.size();
// output ends with "<digit> . " and a digit follows: collapse " . " to "."
1360  if (j >= 4 && new_str[j-1] == ' ' && new_str[j-2] == '.' && new_str[j-3] == ' ' && isdigit(new_str[j-4]) && isdigit(c))
1361  {
1362  new_str.pop_back();
1363  new_str.pop_back();
1364  new_str.pop_back();
1365  new_str += '.';
1366  }
1367  new_str += c;
1368  if (!isdigit(c) && c != '+' && c != '-' && c != '.' && !isspace(c)) {
1369  is_number = false;
1370  }
1371  }
1372  else
1373  {
1374  new_str += ' ';
1375  is_number = false;
1376  }
1377  }
// purely numeric input: put a space in front of each sign
1378  if (is_number)
1379  {
1380  NStr::ReplaceInPlace(new_str, "+", " +");
1381  NStr::ReplaceInPlace(new_str, "-", " -");
1382  }
1383  return new_str;
1384 }
1385 
1386 static bool s_IsNumber(const string &token, double *result = NULL)
1387 {
1388  double num = NStr::StringToDouble(token, NStr::fConvErr_NoThrow);
1389  if (!num && errno)
1390  {
1391  return false;
1392  }
1393  if (result) {
1394  *result = num;
1395  }
1396  return true;
1397 }
1398 
1399 static string s_NormalizeTokens(vector<string> &tokens, vector<double> &numbers, vector<string> &anum, vector<int> &precision, vector<string> &lat_long, vector<string> &nsew)
1400 {
1401  vector<string> pattern;
1402  for (size_t i = 0; i < tokens.size(); i++)
1403  {
1404  string &token = tokens[i];
1405 
1406  double num;
1407  if (s_IsNumber(token, &num))
1408  {
1409  numbers.push_back(num);
1410  anum.push_back(token);
1411  pattern.push_back("1");
1412  precision.push_back(0);
1413  if (NStr::Find(token, ".") != NPOS && !NStr::EndsWith(token, "."))
1414  {
1415  precision.back()
1416  = static_cast<int>(token.length() - token.find('.') - 1);
1417  }
1418  continue;
1419  }
1420 
1421  {
1422  vector<string> tmp;
1423  NStr::Split(token, ".", tmp);
1424  double num0, num1, num2;
1425  if (tmp.size() == 3 && s_IsNumber(tmp[0], &num0) && s_IsNumber(tmp[1], &num1) && s_IsNumber(tmp[2], &num2))
1426  {
1427  numbers.push_back(num0);
1428  anum.push_back(tmp[0]);
1429  pattern.push_back("1");
1430  precision.push_back(0);
1431  numbers.push_back(num1);
1432  anum.push_back(tmp[1]);
1433  pattern.push_back("1");
1434  precision.push_back(0);
1435  numbers.push_back(num2);
1436  anum.push_back(tmp[2]);
1437  pattern.push_back("1");
1438  precision.push_back(0);
1439  continue;
1440  }
1441  }
1442 
1443  if (token == "\'" && i >= 3 && s_IsNumber(tokens[i - 1]) && tokens[i - 2] == "\'" && s_IsNumber(tokens[i - 3]))
1444  {
1445  token = "\"";
1446  }
1447 
1448  if (NStr::EqualNocase(token, "degrees") || NStr::EqualNocase(token, "deg") || NStr::EqualNocase(token, "deg.") || NStr::EqualNocase(token, "degree"))
1449  {
1450  token = "degrees";
1451  pattern.push_back("degrees");
1452  }
1453  else if ( token == "\'" || NStr::EqualNocase(token, "min") || NStr::EqualNocase(token, "min.") || NStr::EqualNocase(token, "minute") || NStr::EqualNocase(token, "minutes"))
1454  {
1455  token = "\'";
1456  pattern.push_back("\'");
1457  }
1458  else if (token == "\"" || NStr::EqualNocase(token, "sec") || NStr::EqualNocase(token, "sec.") || NStr::EqualNocase(token, "second") || NStr::EqualNocase(token, "seconds"))
1459  {
1460  token = "\"";
1461  pattern.push_back("\"");
1462  }
1463  else if (token == "," || token == ":" || token == "_" || token == "&" || token == "." || token == ";" || token == "#" || NStr::EqualNocase(token, "and"))
1464  {
1465  }
1466  else if (NStr::EqualNocase(token, "lattitude") || NStr::EqualNocase(token, "latitude") || NStr::EqualNocase(token, "lat") || NStr::EqualNocase(token, "lat."))
1467  {
1468  pattern.push_back("lat");
1469  lat_long.push_back("lat");
1470  }
1471  else if (NStr::EqualNocase(token, "longitude") || NStr::EqualNocase(token, "lo") || NStr::EqualNocase(token, "lon") || NStr::EqualNocase(token, "long")
1472  || NStr::EqualNocase(token, "lo.") || NStr::EqualNocase(token, "lon.") || NStr::EqualNocase(token, "long."))
1473  {
1474  pattern.push_back("lat");
1475  lat_long.push_back("long");
1476  }
1477  else if (token == "N" || NStr::EqualNocase(token, "north"))
1478  {
1479  pattern.push_back("N");
1480  nsew.push_back("N");
1481  }
1482  else if (token == "S" || NStr::EqualNocase(token, "south"))
1483  {
1484  pattern.push_back("N");
1485  nsew.push_back("S");
1486  }
1487  else if (token == "E" || NStr::EqualNocase(token, "east"))
1488  {
1489  pattern.push_back("N");
1490  nsew.push_back("E");
1491  }
1492  else if (token == "W" || NStr::EqualNocase(token, "west") || token == "Wdeg")
1493  {
1494  pattern.push_back("N");
1495  nsew.push_back("W");
1496  }
1497  else if (token == "NW")
1498  {
1499  nsew.push_back("N");
1500  nsew.push_back("W");
1501  }
1502  else if (token == "NE")
1503  {
1504  nsew.push_back("N");
1505  nsew.push_back("E");
1506  }
1507  else if (token == "SW")
1508  {
1509  nsew.push_back("S");
1510  nsew.push_back("W");
1511  }
1512  else if (token == "SE")
1513  {
1514  nsew.push_back("S");
1515  nsew.push_back("E");
1516  }
1517  else
1518  {
1519  //cout << "Token: " << token << endl;
1520  numbers.clear();
1521  return kEmptyStr;
1522  }
1523  }
1524  //cout << "Pattern: " << NStr::Join(pattern, " ") << endl;
1525  return NStr::Join(pattern, " ");
1526 }
1527 
// Put a pair of coordinates into (latitude, longitude) order and apply the
// hemisphere signs.
//
// numbers    in/out: exactly two values; emptied when the input is
//            inconsistent or out of range.
// precision  in/out: swapped together with numbers.
// lat_long   "lat"/"long" labels in order of appearance (zero or two entries).
// nsew       in/out: compass letters in order of appearance (zero or two
//            entries); swapped together with numbers.
//
// With no labels and no compass letters, the values are swapped when only the
// first one is too large to be a latitude.
static void s_ReorderNorthSouthEastWest(vector<double> &numbers, vector<int> &precision, const vector<string> &lat_long, vector<string> &nsew)
{
    auto exchange_coords = [&]() {
        swap(numbers[0], numbers[1]);
        swap(precision[0], precision[1]);
    };

    // Apply a hemisphere letter to one value. Returns false if the letter is
    // neither `positive` nor `negative`. Zero is never negated.
    auto apply_hemisphere = [](double &value, const string &letter,
                               const char* positive, const char* negative) {
        if (letter == positive) {
            value = fabs(value);
            return true;
        }
        if (letter == negative) {
            if (value != 0) {
                value = -fabs(value);
            }
            return true;
        }
        return false;
    };

    if (numbers.size() != 2) {
        numbers.clear();
        return;
    }

    // labels must come as a pair or not at all
    if (!lat_long.empty() && lat_long.size() != 2) {
        numbers.clear();
        return;
    }
    if (lat_long.size() == 2 && lat_long.front() == "long") {
        exchange_coords();
        if (nsew.size() == 2) {
            swap(nsew[0], nsew[1]);
        }
    }

    // compass letters must come as a pair or not at all
    if (!nsew.empty() && nsew.size() != 2) {
        numbers.clear();
        return;
    }
    if (nsew.size() == 2) {
        const bool longitude_first = (nsew[0] == "E" || nsew[0] == "W")
                                  && (nsew[1] == "N" || nsew[1] == "S");
        if (longitude_first) {
            exchange_coords();
            swap(nsew[0], nsew[1]);
        }
        if (!apply_hemisphere(numbers[0], nsew[0], "N", "S")) {
            numbers.clear();
            return;
        }
        if (!apply_hemisphere(numbers[1], nsew[1], "E", "W")) {
            numbers.clear();
            return;
        }
    }

    // no hints at all: a first value above 90 cannot be a latitude
    const bool no_hints = lat_long.empty() && nsew.empty();
    if (no_hints && fabs(numbers[0]) > 90 && fabs(numbers[1]) < 90) {
        exchange_coords();
    }

    if (fabs(numbers[0]) > 90 || fabs(numbers[1]) > 180) {
        numbers.clear();
    }
}
1606 
// Parse a space-separated lat-lon string into two decimal-degree values.
//
// new_str    text already split into space-separated tokens by the caller.
// numbers    out: two values on success; left empty when the text cannot be
//            interpreted.
// precision  out: decimal places for each value.
//
// The tokens are normalized into a pattern by s_NormalizeTokens() and the
// pattern is compared against a list of known layouts. Each layout says which
// numbers are degrees, minutes (divided by 60) or seconds (divided by 3600).
// A minutes part adds 2 to the precision and a seconds part adds 4. When parts
// are combined, the sign is read from the leading '-' of the degree token's
// text and applied to the whole sum. The pair is finally handed to
// s_ReorderNorthSouthEastWest().
1607 static void s_GetLatLong(const string &new_str, vector<double> &numbers, vector<int> &precision)
1608 {
1609  vector<string> tokens;
1610  NStr::Split(new_str, " ", tokens, NStr::fSplit_Tokenize);
1611  vector<string> lat_long;
1612  vector<string> nsew;
1613  vector<string> anum;
1614  string pattern = s_NormalizeTokens(tokens, numbers, anum, precision, lat_long, nsew);
1615  if (pattern.empty())
1616  {
1617  numbers.clear();
1618  return;
1619  }
// results are built here and swapped into numbers/precision at the end
1620  vector<double> degrees(2, 0);
1621  vector<int> prec(2, 0);
1622  int sign1 = 1;
1623  int sign2 = 1;
// two plain degree values
1624  if ( pattern == "1 1" ||
1625  pattern == "1 N 1 N" ||
1626  pattern == "N 1 N 1" ||
1627  pattern == "1 degrees N 1 degrees N" ||
1628  pattern == "lat 1 lat 1" ||
1629  pattern == "1 N lat 1 N lat" ||
1630  pattern == "1 degrees N lat 1 degrees N lat")
1631  {
1632  degrees[0] = numbers[0];
1633  degrees[1] = numbers[1];
1634  prec[0] = precision[0];
1635  prec[1] = precision[1];
1636  }
// first: degrees + seconds; second: degrees + minutes
1637  else if ((pattern == "1 1 \" 1 1 '" ||
1638  pattern == "1 degrees 1 \" N 1 degrees 1 ' N")
1639  && numbers[1] < 60 && numbers[3] < 60
1640  && numbers[1] >= 0 && numbers[3] >= 0)
1641  {
1642  sign1 = anum[0][0] == '-' ? -1 : 1;
1643  sign2 = anum[2][0] == '-' ? -1 : 1;
1644  degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 3600);
1645  degrees[1] = sign2*(fabs(numbers[2]) + numbers[3] / 60);
1646  prec[0] = max(precision[0], precision[1] + 4);
1647  prec[1] = max(precision[2], precision[3] + 2);
1648  }
// first: degrees + minutes; second: degrees only
1649  else if ( (pattern == "1 1 ' 1" ||
1650  pattern == "1 degrees 1 ' N 1 degrees N")
1651  && numbers[1] < 60
1652  && numbers[1] >= 0)
1653  {
1654  sign1 = anum[0][0] == '-' ? -1 : 1;
1655  degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60);
1656  degrees[1] = numbers[2];
1657  prec[0] = max(precision[0], precision[1] + 2);
1658  prec[1] = precision[2];
1659  }
// first: degrees + minutes + seconds; second: degrees only
1660  else if (pattern == "1 1 ' 1 \" 1"
1661  && numbers[1] < 60 && numbers[2] < 60
1662  && numbers[1] >= 0 && numbers[2] >= 0)
1663  {
1664  sign1 = anum[0][0] == '-' ? -1 : 1;
1665  degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60 + numbers[2] / 3600);
1666  degrees[1] = numbers[3];
1667  prec[0] = max(max(precision[0], precision[1] + 2), precision[2] + 4);
1668  prec[1] = precision[3];
1669  }
// first: degrees + minutes + seconds; second: degrees + minutes
1670  else if ((pattern == "1 1 ' 1 \" 1 1 '" ||
1671  pattern == "1 1 1 N 1 1 N" ||
1672  pattern == "1 degrees 1 ' 1 \" N 1 degrees 1 ' N")
1673  && numbers[1] < 60 && numbers[2] < 60 && numbers[4] < 60
1674  && numbers[1] >= 0 && numbers[2] >= 0 && numbers[4] >= 0)
1675  {
1676  sign1 = anum[0][0] == '-' ? -1 : 1;
1677  sign2 = anum[3][0] == '-' ? -1 : 1;
1678  degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60 + numbers[2] / 3600);
1679  degrees[1] = sign2*(fabs(numbers[3]) + numbers[4] / 60);
1680  prec[0] = max(max(precision[0], precision[1] + 2), precision[2] + 4);
1681  prec[1] = max(precision[3], precision[4] + 2);
1682  }
// both: degrees + minutes + seconds
1683  else if (( pattern == "1 1 ' 1 \" 1 1 ' 1 \"" ||
1684  pattern == "1 1 ' 1 \" N 1 1 ' 1 \" N" ||
1685  pattern == "1 degrees 1 ' 1 \" 1 degrees 1 ' 1 \"" ||
1686  pattern == "1 degrees 1 ' 1 \" N 1 degrees 1 ' 1 \" N" ||
1687  pattern == "N 1 degrees 1 ' 1 \" N 1 degrees 1 ' 1 \"" ||
1688  pattern == "1 degrees 1 ' 1 N 1 degrees 1 ' 1 N" ||
1689  pattern == "1 degrees 1 1 N 1 degrees 1 1 N" ||
1690  pattern == "1 1 1 N 1 1 1 N")
1691  && numbers[1] < 60 && numbers[2] < 60 && numbers[4] < 60 && numbers[5] < 60
1692  && numbers[1] >= 0 && numbers[2] >= 0 && numbers[4] >= 0 && numbers[5] >= 0)
1693  {
1694  sign1 = anum[0][0] == '-' ? -1 : 1;
1695  sign2 = anum[3][0] == '-' ? -1 : 1;
1696  degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60 + numbers[2] / 3600);
1697  degrees[1] = sign2*(fabs(numbers[3]) + numbers[4] / 60 + numbers[5] / 3600);
1698  prec[0] = max(max(precision[0], precision[1] + 2), precision[2] + 4);
1699  prec[1] = max(max(precision[3], precision[4] + 2), precision[5] + 4);
1700  }
// both: degrees + minutes
1701  else if (( pattern == "1 1 ' 1 1 '" ||
1702  pattern == "1 1 N 1 1 N" ||
1703  pattern == "1 1 ' N 1 1 ' N" ||
1704  pattern == "1 degrees 1 ' N 1 degrees 1 ' N" ||
1705  pattern == "lat 1 degrees 1 ' N lat 1 degrees 1 ' N" ||
1706  pattern == "1 degrees 1 N 1 degrees 1 N" ||
1707  pattern == "1 degrees 1 N 1 degrees 1 ' N" ||
1708  pattern == "1 degrees 1 ' N 1 degrees 1 N" ||
1709  pattern == "N 1 degrees 1 ' N 1 degrees 1" ||
1710  pattern == "N 1 degrees 1 ' N 1 degrees 1 '" ||
1711  pattern == "N 1 degrees 1 ' N 1 1 '")
1712  && numbers[1] < 60 && numbers[3] < 60
1713  && numbers[1] >= 0 && numbers[3] >= 0)
1714  {
1715  sign1 = anum[0][0] == '-' ? -1 : 1;
1716  sign2 = anum[2][0] == '-' ? -1 : 1;
1717  degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60);
1718  degrees[1] = sign2*(fabs(numbers[2]) + numbers[3] / 60);
1719  prec[0] = max(precision[0], precision[1] + 2);
1720  prec[1] = max(precision[2], precision[3] + 2);
1721  }
// first: degrees only; second: degrees + minutes
1722  else if ((pattern == "1 N 1 1 N" ||
1723  pattern == "1 degrees N 1 degrees 1 ' N")
1724  && numbers[2] < 60
1725  && numbers[2] >= 0)
1726  {
1727  sign2 = anum[1][0] == '-' ? -1 : 1;
1728  degrees[0] = numbers[0];
1729  degrees[1] = sign2*(fabs(numbers[1]) + numbers[2] / 60);
1730  prec[0] = precision[0];
1731  prec[1] = max(precision[1], precision[2] + 2);
1732  }
// first: degrees + minutes; second: degrees + minutes + seconds
1733  else if ((pattern == "1 degrees 1 ' 1 degrees 1 ' 1 \"" ||
1734  pattern == "N 1 1 N 1 1 1")
1735  && numbers[1] < 60 && numbers[3] < 60 && numbers[4] < 60
1736  && numbers[1] >= 0 && numbers[3] >= 0 && numbers[4] >= 0)
1737  {
1738  sign1 = anum[0][0] == '-' ? -1 : 1;
1739  sign2 = anum[2][0] == '-' ? -1 : 1;
1740  degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60);
1741  degrees[1] = sign2*(fabs(numbers[2]) + numbers[3] / 60 + numbers[4] / 3600);
1742  prec[0] = max(precision[0], precision[1] + 2);
1743  prec[1] = max(max(precision[2], precision[3] + 2), precision[4] + 4);
1744  }
// first: degrees only; second: degrees + minutes + seconds
1745  else if (pattern == "1 degrees 1 degrees 1 ' 1 \""
1746  && numbers[2] < 60 && numbers[3] < 60
1747  && numbers[2] >= 0 && numbers[3] >= 0)
1748  {
1749  sign2 = anum[1][0] == '-' ? -1 : 1;
1750  degrees[0] = numbers[0];
1751  degrees[1] = sign2*(fabs(numbers[1]) + numbers[2] / 60 + numbers[3] / 3600);
1752  prec[0] = precision[0];
1753  prec[1] = max(max(precision[1], precision[2] + 2), precision[3] + 4);
1754  }
// first: degrees + minutes + seconds; second: degrees + seconds
1755  else if (pattern == "1 degrees 1 ' 1 \" N 1 degrees 1 \" N"
1756  && numbers[1] < 60 && numbers[2] < 60 && numbers[4] < 60
1757  && numbers[1] >= 0 && numbers[2] >= 0 && numbers[4] >= 0)
1758  {
1759  sign1 = anum[0][0] == '-' ? -1 : 1;
1760  sign2 = anum[3][0] == '-' ? -1 : 1;
1761  degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60 + numbers[2] / 3600);
1762  degrees[1] = sign2*(fabs(numbers[3]) + numbers[4] / 3600);
1763  prec[0] = max(max(precision[0], precision[1] + 2), precision[2] + 4);
1764  prec[1] = max(precision[3], precision[4] + 4);
1765  }
// unknown layout, or a minutes/seconds value outside [0, 60): report failure
// through empty vectors
1766  else
1767  {
1768  degrees.clear();
1769  prec.clear();
1770  }
1771  swap(degrees, numbers);
1772  swap(prec, precision);
// an empty `numbers` is rejected there by its size check
1773  s_ReorderNorthSouthEastWest(numbers, precision, lat_long, nsew);
1774 }
1775 
1776 
1777 string s_ShortenLatLon( string &subname ) {
1778  string lat;
1779  string north_or_south;
1780  string lon;
1781  string east_or_west;
1782 
1783  if (subname.length() < 1) {
1784  return subname;
1785  }
1786  char ch = subname[0];
1787  if (ch < '0' || ch > '9') {
1788  return subname;
1789  }
1790 
1791  // extract the pieces
1792  CNcbiIstrstream lat_lon_stream( subname );
1793  lat_lon_stream >> lat;
1794  lat_lon_stream >> north_or_south;
1795  lat_lon_stream >> lon;
1796  lat_lon_stream >> east_or_west;
1797  if( lat_lon_stream.bad() ) {
1798  return subname;
1799  }
1800 
1801  if( north_or_south != "N" && north_or_south != "S" ) {
1802  return subname;
1803  }
1804 
1805  if( east_or_west != "E" && east_or_west != "W" ) {
1806  return subname;
1807  }
1808 
1809  size_t pos = NStr::Find(lat, ".");
1810  if (pos > 0) {
1811  size_t len = lat.length();
1812  if (pos + 9 < len) {
1813  lat.erase(pos + 9);
1814  }
1815  }
1816 
1817  pos = NStr::Find(lon, ".");
1818  if (pos > 0) {
1819  size_t len = lon.length();
1820  if (pos + 9 < len) {
1821  lon.erase(pos + 9);
1822  }
1823  }
1824 
1825  return lat + " " + north_or_south + " " + lon + " " + east_or_west;
1826 }
1827 
1828 string CSubSource::FixLatLonFormat (string orig_lat_lon, bool guess)
1829 {
1830  //cout << "Before: " << orig_lat_lon << endl;
1831  NStr::ParseEscapes(orig_lat_lon);
1832  CStringUTF8 old_str = CUtf8::AsUTF8(orig_lat_lon, CUtf8::GuessEncoding(orig_lat_lon));
1833  if (NStr::StartsWith(old_str, "\""))
1834  {
1835  NStr::TrimPrefixInPlace(old_str, "\"");
1836  NStr::TrimSuffixInPlace(old_str, "\"");
1837  }
1838  NStr::ReplaceInPlace(old_str, "\'\'", "\"");
1839  string fixed_str = s_RemoveSpacesWithinNumbers(old_str);
1840  string new_str = s_InsertSpacesBetweenTokens(fixed_str);
1841  NStr::Sanitize(new_str);
1842  vector<double> numbers;
1843  vector<int> precision;
1844  s_GetLatLong(new_str, numbers, precision);
1845  string res;
1846  if (!numbers.empty())
1847  {
1848  res = MakeLatLon(numbers[0], numbers[1], precision[0], precision[1]);
1849  }
1850  //cout << "After: " << res << endl;
1851  res = s_ShortenLatLon(res);
1852  return res;
1853 }
1854 
1855 
1856 string CSubSource::MakeLatLon(double lat_value, double lon_value, int lat_precision, int lon_precision )
1857 {
1858  char ns = 'N';
1859  if (lat_value < 0) {
1860  ns = 'S';
1861  lat_value = -lat_value;
1862  }
1863  char ew = 'E';
1864  if (lon_value < 0) {
1865  ew = 'W';
1866  lon_value = -lon_value;
1867  }
1868  string lat = NStr::DoubleToString(lat_value, lat_precision);
1869  string lon = NStr::DoubleToString(lon_value, lon_precision);
1870 
1871  NStr::TrimSuffixInPlace(lat, ".");
1872  NStr::TrimSuffixInPlace(lon, ".");
1873  string res = lat + " " + ns + " " + lon + " " + ew;
1874  return res;
1875 }
1876 
1877 
1878 CLatLonCountryId *CSubSource::x_CalculateLatLonId(float lat_value, float lon_value, string country, string province)
1879 {
1880  CLatLonCountryId *id = new CLatLonCountryId(lat_value, lon_value);
1881 
1882  bool goodmatch = false;
1883 
1884  // lookup region by coordinates, or find nearest region and calculate distance
1885  const CCountryExtreme * guess = m_LatLonCountryMap->GuessRegionForLatLon(lat_value, lon_value, country, province);
1886  if (guess) {
1887  id->SetFullGuess(guess->GetCountry());
1888  id->SetGuessCountry(guess->GetLevel0());
1889  id->SetGuessProvince(guess->GetLevel1());
1890  if (NStr::EqualNocase(country, id->GetGuessCountry())
1891  && (NStr::IsBlank(province) || NStr::EqualNocase(province, id->GetGuessProvince()))) {
1892  goodmatch = true;
1893  }
1894  } else {
1895  // not inside a country, check water
1896  guess = m_LatLonWaterMap->GuessRegionForLatLon(lat_value, lon_value, country);
1897  if (guess) {
1898  // found inside water
1899  id->SetGuessWater(guess->GetCountry());
1900  if (NStr::EqualNocase(country, id->GetGuessWater())) {
1901  goodmatch = true;
1902  }
1903 
1904  // also see if close to land for coastal warning (if country is land)
1905  // or proximity message (if country is water)
1906  double landdistance = 0.0;
1907  guess = m_LatLonCountryMap->FindClosestToLatLon (lat_value, lon_value, 5.0, landdistance);
1908  if (guess) {
1909  id->SetClosestFull(guess->GetCountry());
1910  id->SetClosestCountry(guess->GetLevel0());
1911  id->SetClosestProvince(guess->GetLevel1());
1912  id->SetLandDistance(m_LatLonCountryMap->AdjustAndRoundDistance (landdistance));
1913  if (NStr::EqualNocase(country, id->GetClosestCountry())
1914  && (NStr::IsBlank(province) || NStr::EqualNocase(province, guess->GetLevel1()))) {
1915  goodmatch = true;
1916  }
1917  }
1918  } else {
1919  // may be coastal inlet, area of data insufficiency
1920  double landdistance = 0.0;
1921  guess = m_LatLonCountryMap->FindClosestToLatLon (lat_value, lon_value, 5.0, landdistance);
1922  if (guess) {
1923  id->SetClosestFull(guess->GetCountry());
1924  id->SetClosestCountry(guess->GetLevel0());
1925  id->SetClosestProvince(guess->GetLevel1());
1926  id->SetLandDistance(m_LatLonCountryMap->AdjustAndRoundDistance (landdistance));
1927  if (NStr::EqualNocase(country, id->GetClosestCountry())
1928  && (NStr::IsBlank(province) || NStr::EqualNocase(province, guess->GetLevel1()))) {
1929  goodmatch = true;
1930  }
1931  }
1932 
1933  double waterdistance = 0.0;
1934  guess = m_LatLonWaterMap->FindClosestToLatLon (lat_value, lon_value, 5.0, waterdistance);
1935  if (guess) {
1936  id->SetClosestWater(guess->GetLevel0());
1937  id->SetWaterDistance(m_LatLonWaterMap->AdjustAndRoundDistance (waterdistance));
1938  if (NStr::EqualNocase(country, id->GetClosestWater())) {
1939  goodmatch = true;
1940  }
1941  }
1942  }
1943  }
1944 
1945  // if guess is not the provided country or province, calculate distance to claimed country
1946  if (!goodmatch) {
1947  double distance = 0.0;
1948  guess = m_LatLonCountryMap->IsNearLatLon (lat_value, lon_value, 5.0, distance, country, province);
1949  if (guess) {
1950  if (distance < ErrorDistance(lat_value, lon_value, m_LatLonCountryMap->GetScale())) {
1951  // close enough
1952  id->SetGuessCountry(country);
1953  id->SetGuessProvince(province);
1954  id->SetFullGuess(guess->GetCountry());
1955  } else {
1956  id->SetClaimedFull(guess->GetCountry());
1957  id->SetClaimedDistance(m_LatLonCountryMap->AdjustAndRoundDistance (distance));
1958  }
1959  } else if (NStr::IsBlank(province)) {
1960  guess = m_LatLonWaterMap->IsNearLatLon (lat_value, lon_value, 5.0, distance, country, province);
1961  if (guess) {
1962  id->SetClaimedFull(guess->GetCountry());
1963  id->SetClaimedDistance(m_LatLonWaterMap->AdjustAndRoundDistance (distance));
1964  }
1965  }
1966  }
1967 
1968  return id;
1969 }
1970 
1971 
1972 
// Rows of a name-pair table: the first string names a sea, bay, gulf, strait
// or similar; the second names the larger body of water paired with it.
// NOTE(review): the declaration that opens this initializer is not visible in
// this excerpt; presumably this is the data behind sc_WaterPairMap, which
// x_FindSurroundingOcean() searches by the first string - confirm.
1975  {"Adriatic Sea", "Mediterranean Sea"},
1976  {"Aegean Sea", "Mediterranean Sea"},
1977  {"Alboran Sea", "Mediterranean Sea"},
1978  {"Andaman Sea", "Indian Ocean"},
1979  {"Arabian Sea", "Indian Ocean"},
1980  {"Argentine Sea", "Atlantic Ocean"},
1981  {"Ariake Sea", "Pacific Ocean"},
1982  {"Baffin Bay", "Atlantic Ocean"},
1983  {"Balearic Sea", "Mediterranean Sea"},
1984  {"Baltic Sea", "Atlantic Ocean"},
1985  {"Barents Sea", "Arctic Ocean"},
1986  {"Bay of Bengal", "Indian Ocean"},
1987  {"Beaufort Sea", "Arctic Ocean"},
1988  {"Bering Sea", "Pacific Ocean"},
1989  {"Bismarck Sea", "Pacific Ocean"},
1990  {"Black Sea", "Mediterranean Sea"},
1991  {"Bohai Sea", "Pacific Ocean"},
1992  {"Caribbean Sea", "Atlantic Ocean"},
1993  {"Celebes Sea", "Pacific Ocean"},
1994  {"Champlain Sea", "Atlantic Ocean"},
1995  {"Chilean Sea", "Pacific Ocean"},
1996  {"China Seas", "Pacific Ocean"},
1997  {"Chukchi Sea", "Arctic Ocean"},
1998  {"Coral Sea", "Pacific Ocean"},
1999  {"Davis Strait", "Atlantic Ocean"},
2000  {"East China Sea", "Pacific Ocean"},
2001  {"East Siberian Sea", "Arctic Ocean"},
2002  {"English Channel", "Atlantic Ocean"},
2003  {"Erythraean Sea", "Indian Ocean"},
2004  {"Golfo de California", "Pacific Ocean"},
2005  {"Greenland Sea", "Arctic Ocean"},
2006  {"Gulf of Mexico", "Atlantic Ocean"},
2007  {"Gulf of Thailand", "Pacific Ocean"},
2008  {"Gulf of Tonkin", "Pacific Ocean"},
2009  {"Hudson Bay", "Arctic Ocean"},
2010  {"Ionian Sea", "Mediterranean Sea"},
2011  {"Irish Sea", "Atlantic Ocean"},
2012  {"Irminger Sea", "Atlantic Ocean"},
2013  {"James Bay", "Atlantic Ocean"},
2014  {"Java Sea", "Indian Ocean"},
2015  {"Kara Sea", "Arctic Ocean"},
2016  {"Koro Sea", "Pacific Ocean"},
2017  {"Labrador Sea", "Atlantic Ocean"},
2018  {"Laccadive Sea", "Indian Ocean"},
2019  {"Laptev Sea", "Arctic Ocean"},
2020  {"Ligurian Sea", "Mediterranean Sea"},
2021  {"Lincoln Sea", "Arctic Ocean"},
2022  {"Myrtoan Sea", "Mediterranean Sea"},
2023  {"North Sea", "Atlantic Ocean"},
2024  {"Norwegian Sea", "Atlantic Ocean"},
2025  {"Pechora Sea", "Arctic Ocean"},
2026  {"Persian Gulf", "Indian Ocean"},
2027  {"Philippine Sea", "Pacific Ocean"},
2028  {"Red Sea", "Indian Ocean"},
2029  {"Salish Sea", "Pacific Ocean"},
2030  {"Sargasso Sea", "Atlantic Ocean"},
2031  {"Scotia Sea", "Southern Ocean"},
2032  {"Sea of Azov", "Black Sea"},
2033  {"Sea of Chiloe", "Pacific Ocean"},
2034  {"Sea of Crete", "Mediterranean Sea"},
2035  {"Sea of Japan", "Pacific Ocean"},
2036  {"Sea of Okhotsk", "Pacific Ocean"},
2037  {"Sea of the Hebrides", "Atlantic Ocean"},
2038  {"Sea of Zanj", "Indian Ocean"},
2039  {"Seas of Greenland", "Atlantic Ocean"},
2040  {"Sethusamudram", "Indian Ocean"},
2041  {"Sibutu Passage", "Pacific Ocean"},
2042  {"Solomon Sea", "Pacific Ocean"},
2043  {"South China Sea", "Pacific Ocean"},
2044  {"Sulu Sea", "Pacific Ocean"},
2045  {"Tasman Sea", "Pacific Ocean"},
2046  {"Thracian Sea", "Mediterranean Sea"},
2047  {"Timor Sea", "Indian Ocean"},
2048  {"Tyrrhenian Sea", "Mediterranean Sea"},
2049  {"Wandel Sea", "Arctic Ocean"},
2050  {"White Sea", "Arctic Ocean"},
2051  {"Yellow Sea", "Pacific Ocean"}
2052 };
2055 
2056 static string x_FindSurroundingOcean (string& water)
2057 
2058 {
2059  TWaterPairMap::const_iterator new_water_pair_iter = sc_WaterPairMap.find(water.c_str());
2060  if( new_water_pair_iter != sc_WaterPairMap.end() ) {
2061  return new_water_pair_iter->second;
2062  }
2063  return kEmptyStr;
2064 }
2065 
2066 
// Cross-checks a country qualifier against a lat_lon qualifier and returns a
// diagnostic message describing any disagreement.
//
//  input_countryname  country text; anything after the first ';' or ',' is
//                     dropped, and text after ':' may be kept as a province.
//  lat_lon            in/out: truncated at its last comma when the full string
//                     does not parse, and overwritten with MakeLatLon() of the
//                     adjusted values when a flip/negation is reported.
//  check_state        gates some of the province-level ("state") reports.
//  errcode            out: reset to eLatLonCountryErr_None on entry, then set
//                     to the category of the reported problem.
//
// Returns kEmptyStr when there is nothing to report, including the cases in
// which the inputs cannot be evaluated at all.
//
// NOTE(review): several lines of this function (original numbers 2135, 2138,
// 2194, 2280, 2294, 2312 and 2344) are not visible in this view, among them
// the declaration of `adjustment`. The comments describe only what is visible.
2067 string CSubSource::ValidateLatLonCountry (const string& input_countryname, string& lat_lon, bool check_state, ELatLonCountryErr& errcode)
2068 {
2069  errcode = eLatLonCountryErr_None;
2070  string countryname = input_countryname;
2071  if (NStr::IsBlank(countryname) || NStr::IsBlank(lat_lon)) {
2072  return kEmptyStr;
2073  }
2074 
// Create the two lookup maps on first use; the function-local mutex is held
// only for the duration of this scope.
2075  {
2076  static std::mutex m;
2077 
2078  std::lock_guard g(m);
2079 
2080  if ( m_LatLonCountryMap.get() == 0 ) {
2081  m_LatLonCountryMap.reset (new CLatLonCountryMap(false));
2082  }
2083  if ( m_LatLonWaterMap.get() == 0 ) {
2084  m_LatLonWaterMap.reset (new CLatLonCountryMap(true));
2085  }
2086  }
2087 
2088  // only do these checks if the latlon format is good
2089  bool format_correct, lat_in_range, lon_in_range, precision_correct;
2090  double lat_value = 0.0, lon_value = 0.0;
2091  CSubSource::IsCorrectLatLonFormat (lat_lon, format_correct, precision_correct,
2092  lat_in_range, lon_in_range,
2093  lat_value, lon_value);
2094  if (!format_correct) {
2095  // may have comma and then altitude, so just get lat_lon component
// The caller's lat_lon string is shortened here (everything from the last
// comma onwards is removed) before the format check is retried.
2096  size_t pos = NStr::Find(lat_lon, ",", NStr::eNocase, NStr::eReverseSearch);
2097  if (pos != NPOS) {
2098  lat_lon = lat_lon.substr(0, pos);
2099  CSubSource::IsCorrectLatLonFormat (lat_lon, format_correct, precision_correct,
2100  lat_in_range, lon_in_range,
2101  lat_value, lon_value);
2102  }
2103  }
2104 
2105  // reality checks
2106  if (!format_correct || !lat_in_range || !lon_in_range) {
2107  // incorrect lat_lon format should be reported elsewhere
2108  // incorrect latitude range should be reported elsewhere
2109  // incorrect longitude range should be reported elsewhere
2110  return kEmptyStr;
2111  }
2112 
2113  // get rid of comments after semicolon or comma in country name
2114  size_t pos = NStr::Find(countryname, ";");
2115  if (pos != NPOS) {
2116  countryname = countryname.substr(0, pos);
2117  }
2118  pos = NStr::Find(countryname, ",");
2119  if (pos != NPOS) {
2120  countryname = countryname.substr(0, pos);
2121  }
2122 
2123  // adjust for special cases
2124  if (NStr::StartsWith(countryname, "Norway: Svalbard")) {
2125  countryname = "Svalbard";
2126  }
2127 
// Split "country: province". The province part is kept only when the full
// "country: province" string is itself a region known to the country map.
2128  string country = countryname;
2129  string province;
2130  pos = NStr::Find(country, ":");
2131  if (pos != NPOS) {
2132  // is the full string in the list?
2133  if (m_LatLonCountryMap->HaveLatLonForRegion(countryname)) {
2134  province = country.substr(pos + 1);
2136  }
2137  country = country.substr(0, pos);
2139  }
2140  if (NStr::IsBlank(country)) {
2141  return kEmptyStr;
2142  }
2143 
2144  // known exceptions - don't even bother calculating any further
2145  if (NStr::EqualNocase (country, "Antarctica") && lat_value < -60.0) {
2146  return kEmptyStr;
2147  }
2148 
// Quick exit: with no province, a point that IsCountryInLatLon() accepts for
// the named country (or named body of water) needs no further analysis.
2149  if (! NStr::IsBlank(province)) {
2150  // do not attempt quick exit
2151  } else if (m_LatLonCountryMap->HaveLatLonForRegion(country)) {
2152  if (m_LatLonCountryMap->IsCountryInLatLon(country, lat_value, lon_value)) {
2153  return kEmptyStr;
2154  }
2155  } else if (m_LatLonWaterMap->HaveLatLonForRegion(country)) {
2156  if (m_LatLonWaterMap->IsCountryInLatLon(country, lat_value, lon_value)) {
2157  return kEmptyStr;
2158  }
2159  } else if (NStr::EqualNocase (country, "State of Palestine")) {
// deliberately empty: this name falls through to the full analysis below
2160  } else {
2161  // report unrecognized country
2162  return kEmptyStr;
2163  }
2164 
// `id` is a raw pointer that this function deletes on every visible exit path
// from here on.
// NOTE(review): the next statement allows for id == NULL, but id is then
// dereferenced unconditionally (GetGuessWater() and later) - confirm that
// x_CalculateLatLonId() can never return NULL, or add a guard.
2165  CLatLonCountryId *id = x_CalculateLatLonId(lat_value, lon_value, country, province);
2166  CLatLonCountryId::TClassificationFlags flags = (id == NULL ? 0 : id->Classify(country, province));
2167 
2168  string wguess = id->GetGuessWater();
2169  string cguess = id->GetGuessCountry();
2170 
2171  // special case where subsection of country has been identified but is not in coordinates of country
2172  // VR-840
2173  if (province.empty() && NStr::Equal(cguess, country)) {
2174  delete id;
2175  return kEmptyStr;
2176  }
2177 
// "State of Palestine" is accepted when the guess is "Gaza Strip" or
// "West Bank".
2178  if (NStr::EqualNocase (country, "State of Palestine") &&
2179  (NStr::EqualNocase (cguess, "Gaza Strip") ||
2180  NStr::EqualNocase (cguess, "West Bank"))) {
2181  delete id;
2182  return kEmptyStr;
2183  }
2184 
// Water guess only: accept when the value that x_FindSurroundingOcean() maps
// the guessed water to equals the claimed name.
2185  if (NStr::IsBlank (cguess) && (! NStr::IsBlank (wguess))) {
2186  string parent = x_FindSurroundingOcean (wguess);
2187  if ((! NStr::IsBlank (parent)) && NStr::EqualNocase (country, parent)) {
2188  delete id;
2189  return kEmptyStr;
2190  }
2191  }
2192 
2193  double neardist = 0.0;
2195  CLatLonCountryId::TClassificationFlags adjusted_flags = 0;
2196 
// Unclassified, but IsNearLatLon() succeeds with 2.0 and reports
// neardist < 5.0: adopt the claimed country as the guess and classify again.
2197  if (!flags && m_LatLonCountryMap->IsNearLatLon(lat_value, lon_value, 2.0, neardist, country) && neardist < 5.0) {
2198  id->SetGuessCountry (country);
2199  id->SetGuessProvince (kEmptyStr);
2200  flags = id->Classify(country, province);
2201  }
2202 
// Still unclassified and IsNearLatLon() fails with 20.0 in both maps: try, in
// order, swapping lat/lon, negating latitude, and negating longitude. A
// candidate replaces `id` only when it classifies and guesses a country with
// no water guess.
// NOTE(review): when a candidate classifies (adjusted_flags != 0) but fails
// the water/country guess test, adjust_id is neither adopted nor deleted in
// the visible code - looks like a leak; confirm.
2203  if (!flags && !m_LatLonCountryMap->IsNearLatLon(lat_value, lon_value, 20.0, neardist, country)
2204  && !m_LatLonWaterMap->IsNearLatLon(lat_value, lon_value, 20.0, neardist, country)) {
2205  /* do not flip from water */
2206  CLatLonCountryId *adjust_id = x_CalculateLatLonId(lon_value, lat_value, country, province);
2207  adjusted_flags = adjust_id == NULL ? 0 : adjust_id->Classify(country, province);
2208  if (adjusted_flags) {
2209  string awguess = adjust_id->GetGuessWater();
2210  string acguess = adjust_id->GetGuessCountry();
2211  if (NStr::IsBlank (awguess) && (! NStr::IsBlank (acguess))) {
2212  delete id;
2213  id = adjust_id;
2214  flags = adjusted_flags;
2215  adjustment = CLatLonCountryMap::fFlip;
2216  }
2217  } else {
2218  if (adjust_id) {
2219  delete adjust_id;
2220  }
2221  adjust_id = x_CalculateLatLonId(-lat_value, lon_value, country, province);
2222  adjusted_flags = adjust_id == NULL ? 0 : adjust_id->Classify(country, province);
2223  if (adjusted_flags) {
2224  string awguess = adjust_id->GetGuessWater();
2225  string acguess = adjust_id->GetGuessCountry();
2226  if (NStr::IsBlank (awguess) && (! NStr::IsBlank (acguess))) {
2227  delete id;
2228  id = adjust_id;
2229  flags = adjusted_flags;
2230  adjustment = CLatLonCountryMap::fNegateLat;
2231  }
2232  } else {
2233  if (adjust_id) {
2234  delete adjust_id;
2235  }
2236  adjust_id = x_CalculateLatLonId(lat_value, -lon_value, country, province);
2237  adjusted_flags = adjust_id == NULL ? 0 : adjust_id->Classify(country, province);
2238  if (adjusted_flags) {
2239  string awguess = adjust_id->GetGuessWater();
2240  string acguess = adjust_id->GetGuessCountry();
2241  if (NStr::IsBlank (awguess) && (! NStr::IsBlank (acguess))) {
2242  delete id;
2243  id = adjust_id;
2244  flags = adjusted_flags;
2245  adjustment = CLatLonCountryMap::fNegateLon;
2246  }
2247  } else {
2248  if (adjust_id) {
2249  delete adjust_id;
2250  }
2251  }
2252  }
2253  }
2254  }
2255 
2256  string error;
2257 
// Build the report. An accepted adjustment takes priority: the message names
// the correction and lat_lon is rewritten from the adjusted values.
2258  if (adjustment != CLatLonCountryMap::fNone) {
2259  if (adjustment == CLatLonCountryMap::fFlip) {
2260  errcode = eLatLonCountryErr_Value;
2261  error = "Latitude and longitude values appear to be exchanged";
2262  lat_lon = MakeLatLon(lon_value, lat_value);
2263  } else if (adjustment == CLatLonCountryMap::fNegateLat) {
2264  errcode = eLatLonCountryErr_Value;
2265  if (lat_value < 0.0) {
2266  error = "Latitude should be set to N (northern hemisphere)";
2267  } else {
2268  error = "Latitude should be set to S (southern hemisphere)";
2269  }
2270  lat_lon = MakeLatLon(-lat_value, lon_value);
2271  } else if (adjustment == CLatLonCountryMap::fNegateLon) {
2272  errcode = eLatLonCountryErr_Value;
2273  if (lon_value < 0.0) {
2274  error = "Longitude should be set to E (eastern hemisphere)";
2275  } else {
2276  error = "Longitude should be set to W (western hemisphere)";
2277  }
2278  lat_lon = MakeLatLon(lat_value, -lon_value);
2279  }
// NOTE(review): the `else if` that opens the next branch (original line 2280)
// is not visible in this view.
2281  // success! nothing to report
2282  } else if (flags & CLatLonCountryId::fWaterMatch) {
2283  // success! nothing to report
2284  } else if (flags & CLatLonCountryId::fCountryMatch && NStr::IsBlank(province)) {
// Country matches and no province was supplied: optionally report that the
// point resolves to something more specific than the bare country.
2285  if (check_state) {
2286  string full_guess = id->GetFullGuess();
2287  if (!NStr::Equal(full_guess, country)) {
2288  errcode = eLatLonCountryErr_State;
2289  error = "Lat_lon " + lat_lon + " is in " + id->GetFullGuess()
2290  + " (more specific than " + country + ")";
2291  }
2292  }
2293  } else if (!NStr::IsBlank(id->GetGuessWater())) {
// The point is in water.
// NOTE(review): the condition opening the nested block (original line 2294)
// is not visible in this view.
2295  bool suppress = false;
2296  string reportregion;
2297  string nosubphrase;
2298  string desphrase = "designated subregion ";
2299  string subphrase = "another subregion ";
2300  string phrase = nosubphrase;
2301  bool show_claimed = false;
2302 
2303  if (id->GetLandDistance() < 100) {
2304  // for now, will not report
2305  // this is a policy decision
2306  suppress = true;
2307  } else if (NStr::Find(countryname, "Island") != NPOS) {
2308  suppress = true;
2309  }
2310 
2311 
// NOTE(review): the `if` for the following two assignments (original line
// 2312) is not visible in this view.
2313  reportregion = countryname;
2314  phrase = desphrase;
2315  } else {
2316  // wasn't closest province, so must be closest country
2317  if (!NStr::IsBlank(province) && check_state) {
2318  phrase = subphrase;
2319  reportregion = id->GetClosestFull();
2320  } else {
2321  reportregion = id->GetClosestCountry();
2322  }
2323  if (!NStr::IsBlank(id->GetClaimedFull())) {
2324  show_claimed = true;
2325  }
2326  }
// No report for Red Sea / Gulf of Mexico points whose report region is one of
// the countries listed below.
// NOTE(review): "Dijibouti" is spelled differently from the "Djibouti" entry
// in s_Countries, so that comparison probably never matches - confirm.
2327  string water = id->GetGuessWater();
2328  if (NStr::EqualNocase (water, "Red Sea") &&
2329  (NStr::EqualNocase (reportregion, "Egypt") ||
2330  NStr::EqualNocase (reportregion, "Saudi Arabia") ||
2331  NStr::EqualNocase (reportregion, "Sudan") ||
2332  NStr::EqualNocase (reportregion, "Eritrea") ||
2333  NStr::EqualNocase (reportregion, "Dijibouti") ||
2334  NStr::EqualNocase (reportregion, "Yemen") ||
2335  NStr::EqualNocase (reportregion, "Israel") ||
2336  NStr::EqualNocase (reportregion, "Jordan"))) {
2337  } else if (NStr::EqualNocase (water, "Gulf of Mexico") &&
2338  (NStr::EqualNocase (reportregion, "USA") ||
2339  NStr::EqualNocase (reportregion, "Mexico"))) {
2340  } else if (!suppress) {
2341  errcode = eLatLonCountryErr_Water;
2342  if (show_claimed) {
// NOTE(review): one operand of this concatenation (original line 2344) is not
// visible in this view.
2343  error = "Lat_lon '" + lat_lon + "' is closest to " + phrase + "'" + reportregion + "' at distance "
2345  + " km, but in water '" + id->GetGuessWater()
2346  + "' - claimed region '" + id->GetClaimedFull()
2347  + "' is at distance " + NStr::IntToString(id->GetClaimedDistance()) + " km";
2348  } else {
2349  error = "Lat_lon '" + lat_lon + "' is closest to " + phrase + "'" + reportregion
2350  + "' at distance " + NStr::IntToString(id->GetLandDistance()) + " km, but in water '"
2351  + id->GetGuessWater() + "'";
2352  }
2353  }
2354  } else if (neardist > 0.0) {
2355  errcode = eLatLonCountryErr_Water;
2356  error = "Lat_lon '" + lat_lon + "' is in water '" + id->GetGuessWater() + "', '"
2357  + countryname + "' is " + NStr::IntToString(m_LatLonCountryMap->AdjustAndRoundDistance(neardist)) + " km away";
2358  } else {
2359  errcode = eLatLonCountryErr_Water;
2360  error = "Lat_lon '" + lat_lon + "' is in water '" + id->GetGuessWater() + "'";
2361  }
2362  } else if (!NStr::IsBlank(id->GetGuessCountry())) {
// The point maps to a country other than the claimed one. The error code is
// _State when the guessed country equals the claimed country, otherwise
// _Country.
2363  string full_guess = id->GetFullGuess();
2364  if (NStr::EqualNocase (country, "China") && NStr::EqualNocase (full_guess, "Hong Kong")) {
2365  // skip
2366  } else if (NStr::IsBlank(id->GetClaimedFull())) {
2367  if (NStr::Equal(id->GetGuessCountry(), country) && !NStr::Equal(id->GetGuessProvince(), province)) {
2368  errcode = eLatLonCountryErr_State;
2369  } else {
2370  errcode = eLatLonCountryErr_Country;
2371  }
2372  error = "Lat_lon '" + lat_lon + "' maps to '" + id->GetFullGuess() + "' instead of '"
2373  + countryname + "'";
2374  } else {
2375  if (NStr::IsBlank(province)) {
2376  errcode = eLatLonCountryErr_Country;
2377  error = "Lat_lon '" + lat_lon + "' maps to '" + id->GetFullGuess() + "' instead of '"
2378  + country + "' - claimed region '" + id->GetClaimedFull()
2379  + "' is at distance " + NStr::IntToString(id->GetClaimedDistance()) + " km";
2380  } else {
2381  errcode = eLatLonCountryErr_Country;
2382  if (NStr::EqualNocase(id->GetGuessCountry(), country)) {
2383  errcode = eLatLonCountryErr_State;
2384  }
// A state-level mismatch is reported only when check_state is set; otherwise
// the error code is reset and no message is produced.
2385  if (errcode == eLatLonCountryErr_Country || check_state) {
2386  error = "Lat_lon '" + lat_lon + "' maps to '" + id->GetFullGuess() + "' instead of '"
2387  + countryname + "' - claimed region '" + id->GetClaimedFull()
2388  + "' is at distance " + NStr::IntToString(id->GetClaimedDistance()) + " km";
2389  } else {
2390  errcode = eLatLonCountryErr_None;
2391  }
2392  }
2393  }
2394  } else if (!NStr::IsBlank(id->GetClosestCountry())) {
2395  errcode = eLatLonCountryErr_Country;
2396  error = "Lat_lon '" + lat_lon + "' is closest to '" + id->GetClosestCountry() + "' instead of '"
2397  + countryname + "'";
2398  } else if (!NStr::IsBlank(id->GetClosestWater())) {
2399  errcode = eLatLonCountryErr_Water;
2400  error = "Lat_lon '" + lat_lon + "' is closest to '" + id->GetClosestWater() + "' instead of '"
2401  + countryname + "'";
2402  } else {
2403  errcode = eLatLonCountryErr_Country;
2404  error = "Unable to determine mapping for lat_lon '" + lat_lon + "' and country '" + countryname + "'";
2405  }
2406 
2407 
2408  delete id;
2409  return error;
2410 }
2411 
2412 
// Lower-case single-word entries of a string table whose opening line is not
// visible in this view.
// NOTE(review): presumably sm_ValidSexQualifierTokens, the table the token
// loops further down search with find() - confirm against the full file.
2414  "asexual",
2415  "bisexual",
2416  "diecious",
2417  "dioecious",
2418  "f",
2419  "female",
2420  "gelding",
2421  "hermaphrodite",
2422  "intersex",
2423  "m",
2424  "male",
2425  "mixed",
2426  "monecious",
2427  "monoecious",
2428  "neuter",
2429  "unisexual",
2430 };
2431 
2432 
// Multi-word entries of a string table whose opening line is not visible in
// this view.
// NOTE(review): presumably sm_ValidSexQualifierPhrases, which the phrase
// check below searches as a whole-value match - confirm against the full file.
2434  "pooled males and females",
2435  "pooled male and female",
2436 };
2437 
2438 
// Body of a function whose signature line is not visible in this view.
// Returns true when `value` is found among the entries of
// sm_ValidSexQualifierPhrases, false otherwise.
// NOTE(review): find() compares each `const char*` entry with `value` via
// operator==; this is a content comparison only if `value` is a string
// object - confirm against the signature.
2440 {
// number of entries in the table
2441  size_t max = sizeof(sm_ValidSexQualifierPhrases) / sizeof(const char*);
2442 
2443  const char* *begin = sm_ValidSexQualifierPhrases;
2444  const char* *end = &(sm_ValidSexQualifierPhrases[max]);
2445 
2446  if (find(begin, end, value) != end) {
2447  return true;
2448  } else {
2449  return false;
2450  }
2451 }
2452 
2453 
2455 
// Body of a function whose signature line is not visible in this view.
// Works on a lower-cased copy of `value`. After an early-accept test (see the
// note below), the copy is split on space, comma and slash, and the result is
// true only when every word other than "and" is an entry of
// sm_ValidSexQualifierTokens and at least one such word is present.
2456 {
2457  string str = value;
2458  NStr::ToLower(str);
2459 
// NOTE(review): the condition guarding this early `return true` (original
// line 2460) is not visible in this view.
2461  return true;
2462  }
2463 
2464  vector<string> words;
2465  NStr::Split(str, " ,/", words);
2466  if (words.size() == 0) {
2467  return false;
2468  }
2469 
2470  size_t max = sizeof(sm_ValidSexQualifierTokens) / sizeof(const char*);
2471 
2472  const char* *begin = sm_ValidSexQualifierTokens;
2473  const char* *end = &(sm_ValidSexQualifierTokens[max]);
2474 
// stays false when the words consist solely of "and"
2475  bool is_good = false;
2476 
2477  ITERATE(vector<string>, w, words) {
2478  if (NStr::Equal(*w, "and")) {
2479  // ok, skip it
2480  } else {
2481  if (find(begin, end, *w) != end) {
2482  is_good = true;
2483  } else {
// the first unrecognised word rejects the whole value
2484  is_good = false;
2485  break;
2486  }
2487  }
2488  }
2489  return is_good;
2490 }
2491 
2492 
// Body of a function whose signature line is not visible in this view.
// Builds a normalised form of `value`: the lower-cased text is split on
// space, comma and slash; "and" is dropped; "pooled" / "(pooled)" sets a
// flag; "m" and "f" are expanded to "male" and "female"; the remaining
// recognised words are joined as "a and b" or "a, b, and c", prefixed with
// "pooled " when the flag is set. Returns kEmptyStr when any word is
// unrecognised or when no recognised word remains.
2494 {
2495  string str = value;
2496  NStr::ToLower(str);
2497 
// NOTE(review): the condition guarding this early return of the lower-cased
// string (original line 2498) is not visible in this view.
2499  return str;
2500  }
2501 
2502  vector<string> words;
2503  NStr::Split(str, " ,/", words);
2504 
2505  if (words.size() == 0) {
2506  return kEmptyStr;
2507  }
// NOTE(review): the declaration of `max` used below (original line 2508) is
// not visible in this view.
2509 
2510  const char* *begin = sm_ValidSexQualifierTokens;
2511  const char* *end = &(sm_ValidSexQualifierTokens[max]);
2512 
2513  vector<string> good_values;
2514  bool pooled = false;
2515 
2516  ITERATE(vector<string>, w, words) {
2517  if (NStr::Equal(*w, "and")) {
2518  // ok, skip it
2519  } else if (NStr::EqualNocase(*w, "(pooled)") || NStr::EqualNocase(*w, "pooled")) {
2520  // set pooled flag
2521  pooled = true;
2522  } else {
2523  if (find(begin, end, *w) != end) {
2524  if (NStr::Equal(*w, "m")) {
2525  good_values.push_back("male");
2526  } else if (NStr::Equal(*w, "f")) {
2527  good_values.push_back("female");
2528  } else {
2529  good_values.push_back(*w);
2530  }
2531  } else {
2532  // if any bad values, can't autofix
2533  return kEmptyStr;
2534  }
2535  }
2536  }
2537  if (good_values.size() == 0) {
2538  // no good tokens, can't autofix
2539  return kEmptyStr;
2540  }
2541 
// Join the words: a comma separator is used only when there are more than two
// words, and " and" precedes the last word.
2542  string fixed = good_values[0];
2543  for (size_t i = 1; i < good_values.size(); i++) {
2544  if (good_values.size() > 2) {
2545  fixed += ",";
2546  }
2547  if (i == good_values.size() - 1) {
2548  fixed += " and";
2549  }
2550  fixed += " " + good_values[i];
2551  }
2552  if (pooled) {
2553  fixed = "pooled " + fixed;
2554  }
2555  return fixed;
2556 }
2557 
2558 
2559 void s_CollectNumberAndUnits(const string& value, string& number, string& units)
2560 {
2561  number.clear();
2562  units.clear();
2563 
2564  if (NStr::IsBlank(value)) {
2565  return;
2566  }
2567 
2568  string::const_iterator it = value.begin();
2569  if (*it == '+' || *it == '-') {
2570  number += *it;
2571  it++;
2572  }
2573 
2574  bool any_digit = false;
2575  bool skip_comma = true;
2576  while (it != value.end() && (isdigit(*it) || *it == ',')) {
2577  if (*it == ',') {
2578  if (skip_comma) {
2579  // only skip the first comma
2580  skip_comma = false;
2581  } else {
2582  break;
2583  }
2584  } else {
2585  any_digit = true;
2586  number += *it;
2587  }
2588  it++;
2589  }
2590 
2591  if (it == value.end()) {
2592  number.clear();
2593  return;
2594  }
2595 
2596  if (*it == '.') {
2597  number += *it;
2598  it++;
2599  while (it != value.end() && isdigit(*it)) {
2600  any_digit = true;
2601  number += *it;
2602  it++;
2603  }
2604  }
2605 
2606  if (it == value.end() || *it != ' ' || !any_digit) {
2607  number.clear();
2608  return;
2609  }
2610 
2611  it++;
2612  while (it != value.end()) {
2613  units += *it;
2614  it++;
2615  }
2616 }
2617 
2618 
// Body of a function whose signature line is not visible in this view.
// Returns false for a blank `value`. Otherwise the result is true only when
// `number` is non-blank and `units` is exactly "m" (case-sensitive).
2620 {
2621  if (NStr::IsBlank(value)) {
2622  return false;
2623  }
2624 
2625  string number;
2626  string units;
// NOTE(review): the statement that fills `number` and `units` (original line
// 2627) is not visible in this view - presumably a call to
// s_CollectNumberAndUnits(value, number, units); confirm.
2628  if (NStr::IsBlank(number) || !NStr::EqualCase(units, "m")) {
2629  return false;
2630  } else {
2631  return true;
2632  }
2633 
2634 }
2635 
2636 
2637 int CSubSource::x_GetPrecision(const string& num_str)
2638 {
2639  int precision = 0;
2640  size_t pos = NStr::Find(num_str, ".");
2641  if (pos != NPOS) {
2642  precision = int(num_str.length() - pos - 1);
2643  }
2644  return precision;
2645 }
2646 
2647 
// Body of a function whose signature line is not visible in this view.
// Formats `val` in fixed notation with `precision` digits after the decimal
// point and returns the text as a string.
// NOTE(review): sprintf() does not bound the write to the 1000-byte buffer;
// a very large magnitude or precision could overflow it. Consider snprintf.
2649 {
2650  char reformatted[1000];
2651  sprintf(reformatted, "%.*lf", precision, val);
2652  string rval = reformatted;
2653  return rval;
2654 }
2655 
// Normalises an altitude value to the form "<number> m".
// Returns kEmptyStr for a blank value, when no number is available, or when
// the units are not one of the recognised spellings of feet or metres.
// Values given in feet are multiplied by 0.3048.
2656 string CSubSource::FixAltitude (const string& value)
2657 {
2658  if (NStr::IsBlank(value)) {
2659  return kEmptyStr;
2660  }
2661 
2662  string number;
2663  string units;
// NOTE(review): the statement that fills `number` and `units` (original line
// 2664) is not visible in this view.
2665  if (NStr::IsBlank(number)) {
2666  return kEmptyStr;
2667  } else if (NStr::Equal(units, "ft.") || NStr::Equal(units, "ft") || NStr::Equal(units, "feet") || NStr::Equal(units, "foot")) {
// Feet to metres.
// NOTE(review): original lines 2668 and 2671 are not visible in this view;
// presumably they capture the input precision and write the converted value
// back into `number` - confirm.
2669  double val = NStr::StringToDouble(number);
2670  val *= 0.3048;
2672  units = "m";
2673  }
2674 
// accept the metre spellings, including the "m" set by the conversion above
2675  string rval = kEmptyStr;
2676  if (NStr::Equal(units, "m.")
2677  || NStr::Equal(units, "meters")
2678  || NStr::Equal(units, "meter")
2679  || NStr::Equal(units, "m")) {
2680 
2681  rval = number + " " + "m";
2682  }
2683  return rval;
2684 }
2685 
2686 
2687 // From VR-793:
2688 // A. For segment, endogenous_virus_name:
2689 // 1. Must begin with a letter or number
2690 // 2. Spaces and other printable characters are permitted
2691 // 3. Must not be empty, must not be longer than 240 characters
2692 
// Body of a function whose signature line is not visible in this view.
// Returns true only when `value` is non-blank, starts with an alphanumeric
// character, is at most 240 characters long, and contains only printable
// characters.
// NOTE(review): isalnum()/isprint() receive plain char values here; for bytes
// outside the ASCII range a negative argument is undefined behaviour unless
// cast to unsigned char first.
2694 {
2695  if (NStr::IsBlank(value)) {
2696  return false;
2697  } else if (!isalnum(value.c_str()[0])) {
2698  return false;
2699  } else if (value.length() > 240) {
2700  return false;
2701  }
2702 
2703  for (auto it : value) {
2704  if (!isprint(it)) {
2705  return false;
2706  }
2707  }
2708 
2709  return true;
2710 }
2711 
2712 
// NOTE(review): only the braces of this function are visible in this view;
// its signature and its single statement (original lines 2713 and 2715) are
// missing, so nothing can be said here about its behaviour.
2714 {
2716 }
2717 
2718 
// NOTE(review): only the braces of this function are visible in this view;
// its signature and its single statement (original lines 2719 and 2721) are
// missing, so nothing can be said here about its behaviour.
2720 {
2722 }
2723 
2724 
2725 // From VR-793:
2726 // B. For chromosome, linkage_group and plasmid_name values:
2727 // 4. Must begin with a letter or number
2728 // 5. Must not be empty, must not be longer than 32 characters
2729 // 6. Must not contain <tab>
2730 // 7. Spaces and other printable characters are permitted
2731 // 8. Must not contain the word "plasmid" (ignoring case)
2732 // 9. Must not contain the word "chromosome" (ignoring case)
2733 // 10. Must not contain the phrase "linkage group" (ignoring case)
2734 // 11. Must not contain the series of letters "chr" (ignoring case)
2735 // 12. Must not contain the taxname (ignoring case)
2736 // 14. Must not contain the genus (ignoring case)
2737 // 15. Must not contain the species (ignoring case)
2738 // except allow the species to match the value after an initial 'p' (e.g., JX416328)
2739 // 16. Must not contain the series of letters "chrm" (ignoring case)
2740 // 17. Must not contain the series of letters "chrom" (ignoring case)
2741 // 18. Must not contain the phrase "linkage-group" (ignoring case)
2742 static bool s_FailsGenusOrSpeciesTest(const string& value, const string& taxname)
2743 { // See RW-1436
2744  if (NStr::IsBlank(taxname) ||
2745  NStr::StartsWith(taxname, "Plasmid ", NStr::eNocase) ||
2746  NStr::StartsWith(taxname, "IncQ plasmid", NStr::eNocase)) {
2747  return false;
2748  }
2749 
2750  size_t pos = NStr::Find(taxname, " ");
2751  if (pos != NPOS) {
2752  string genus = taxname.substr(0, pos);
2753  if (NStr::FindNoCase(value, genus) != NPOS) {
2754  // B.14
2755  return true;
2756  }
2757  string species = taxname.substr(pos + 1);
2758 
2759  pos = NStr::FindNoCase(value, species);
2760  if (pos != NPOS) {
2761  if (pos != 1 || value[0] != 'p') {
2762  // B.15
2763  return true;
2764  }
2765  }
2766  }
2767 
2768  return false;
2769 }
2770 
// Body of a function whose signature line is not visible in this view.
// Applies the naming rules shared by chromosome, linkage-group and plasmid
// names to `value`, using `taxname` for the organism-dependent checks.
// Returns true when the name is acceptable.
2772 {
// For a taxname containing "Borrelia" or "Borreliella", any value starting
// with "cp" or "lp" is accepted without further checks.
2773  if (NStr::FindNoCase(taxname, "Borrelia") != NPOS || NStr::FindNoCase(taxname, "Borreliella") != NPOS) {
2774  if (NStr::StartsWith(value, "cp") || NStr::StartsWith(value, "lp")) {
2775  return true;
2776  }
2777  }
// NOTE(review): the `if` that opens this chain (original line 2778) is not
// visible in this view.
2779  // checks for isalnum start, blankness and unprintable characters
2780  // B.4, B.5, B.7
2781  return false;
2782  } else if (value.length() > 32) {
2783  // B.5
2784  return false;
2785  }
2786 
2787  if (s_FailsGenusOrSpeciesTest(value, taxname)) {
2788  return false;
2789  }
2790 
// Substrings that may not occur anywhere in the name (matched ignoring case).
2791  static string s_ForbiddenPhrases[] = {
2792  "\t", // B.6.
2793  "plasmid", // B.8
2794  "chromosome", // B.9
2795  "linkage group", // B.10
2796  "chr", // B.11
2797  "linkage_group", // B.15
2798  "chrm", // B.16
2799  "chrom", // B.17
2800  "linkage-group" // B.18
2801  };
2802 
// NOTE(review): `auto it` copies each string per iteration; `const auto&`
// would avoid the copies.
2803  for (auto it : s_ForbiddenPhrases) {
2804  if (NStr::FindNoCase(value, it) != NPOS) {
2805  return false;
2806  }
2807  }
2808  return true;
2809 }
2810 
2811 
// Validates a chromosome name. Blank values and values starting with "LG"
// (ignoring case) are rejected.
// NOTE(review): the statement in the final else branch (original line 2820)
// is not visible in this view, so the result for other values cannot be
// described from here.
2812 bool CSubSource::IsChromosomeNameValid(const string& value, const string& taxname)
2813 {
2814  if (NStr::IsBlank(value)) {
2815  return false;
2816  }
2817  if (NStr::StartsWith(value, "LG", NStr::eNocase)) {
2818  return false;
2819  } else {
2821  }
2822 }
2823 
2824 
// Validates a linkage-group name. Blank values are rejected.
// NOTE(review): the statement that produces the result for non-blank values
// (original line 2830) is not visible in this view.
2825 bool CSubSource::IsLinkageGroupNameValid(const string& value, const string& taxname)
2826 {
2827  if (NStr::IsBlank(value)) {
2828  return false;
2829  }
2831 }
2832 
2833 
2834 // VR-793
2835 // C. For plasmid_name values:
2836 // 19. Exception- megaplasmid is legal
// Validates a plasmid name.
//  - blank values are rejected;
//  - "megaplasmid", and "megaplasmid " followed by text without a space, are
//    accepted;
//  - "F", "F factor" and "F plasmid" are accepted;
//  - any other value containing "plasmid" (ignoring case) is accepted only
//    when it is one of the listed exceptions.
// NOTE(review): the final statement, which handles values that do not contain
// "plasmid" (original line 2873), is not visible in this view.
2837 bool CSubSource::IsPlasmidNameValid(const string& value, const string& taxname)
2838 {
2839  if (NStr::IsBlank(value)) {
2840  return false;
2841  }
2842  if (NStr::Equal(value, "megaplasmid")) {
2843  return true;
2844  }
// 12 is the length of "megaplasmid "
2845  if (NStr::StartsWith(value, "megaplasmid ") && value.length() > 12 && NStr::Find(value.substr(12), " ") == NPOS) {
2846  return true;
2847  }
2848  if (NStr::Equal(value, "F") || NStr::Equal(value, "F factor") || NStr::Equal(value, "F plasmid")) {
2849  return true;
2850  }
2851 
2852  if (NStr::FindNoCase(value,"plasmid") != NPOS) {
// NOTE(review): the set uses the PNocase_Conditional comparator; presumably
// the lookup is case-sensitive by default - confirm.
2853  static const set<string, PNocase_Conditional> s_PlasmidNameExceptions =
2854  { // This list comes from RW-1436/RW-1430
2855  "Plasmid F",
2856  "Plasmid R",
2857  "Plasmid pIP630",
2858  "Plasmid pNG2",
2859  "Plasmid pGT633",
2860  "Plasmid pE5",
2861  "Plasmid pIP1527",
2862  "Plasmid pAM77",
2863  "Plasmid pAZ1",
2864  "Plasmid RP4"
2865  };
2866 
2867  if (s_PlasmidNameExceptions.find(value) != end(s_PlasmidNameExceptions)) {
2868  return true;
2869  }
2870  return false;
2871  }
2872 
2874 }
2875 
2876 
// Pair stored in s_CellLineContaminationMap for each (cell line, organism)
// key. s_ProcessCellLineLine() fills it from the third and fourth columns of
// a table row; CheckCellLine() prints .first after "contaminated by" and
// .second after "from".
2877 typedef pair<string, string> TContaminatingCellLine;
2880 
// Mutex taken (via CFastMutexGuard) by the routine that fills the
// contamination map from kCellLine.
2883 DEFINE_STATIC_FAST_MUTEX(s_CellLineContaminationMutex);
2884 
2885 #include "cell_line.inc"
2886 
2887 static void s_ProcessCellLineLine(const CTempString& line)
2888 {
2889  vector<string> tokens;
2890  NStr::Split(line, "\t", tokens);
2891  if (tokens.size() < 4) {
2892  ERR_POST_X(1, Warning << "Not enough columns in cell_line entry " << line
2893  << "; disregarding");
2894  } else {
2895  NStr::ToUpper(tokens[0]);
2896  (s_CellLineContaminationMap[tokens[0]])[tokens[1]] = TContaminatingCellLine(tokens[2], tokens[3]);
2897  }
2898 }
2899 
2900 
// Body of a function whose signature line is not visible in this view.
// Holds s_CellLineContaminationMutex for its whole duration and passes every
// entry of the kCellLine table to s_ProcessCellLineLine().
2902 {
2903  CFastMutexGuard GUARD(s_CellLineContaminationMutex);
// NOTE(review): the condition guarding this early return (original line 2904)
// is not visible in this view - presumably an "already initialized" flag;
// confirm.
2905  return;
2906  }
2907 
2908  // read table
2909 
// number of entries in kCellLine
2910  size_t count = sizeof(kCellLine) / sizeof (*kCellLine);
2911  const char * const * start = kCellLine;
2912  while (count--) {
2913  s_ProcessCellLineLine(*start++);
2914  }
2915 
2916 
// NOTE(review): the last statement (original line 2917) is not visible in
// this view.
2918 }
2919 
2920 
// Returns a warning message when the contamination map has a non-blank entry
// for this cell line (looked up upper-cased) and organism; returns an empty
// string otherwise.
// NOTE(review): the lookups use operator[], which inserts an empty entry for
// every key that is not yet present, so each call may modify
// s_CellLineContaminationMap, and no lock is visible around these accesses.
// A find()-based lookup would avoid both; confirm the threading expectations.
2921 string CSubSource::CheckCellLine(const string& cell_line, const string& organism)
2922 {
2923  string rval;
2924 
// NOTE(review): original line 2925 is not visible in this view - presumably
// the call that makes sure the map has been filled; confirm.
2926  string cell_line_search = cell_line;
2927  NStr::ToUpper(cell_line_search);
2928 
2929  if (!NStr::IsBlank(((s_CellLineContaminationMap[cell_line_search])[organism]).first)) {
2930  rval = "The International Cell Line Authentication Committee database indicates that " +
2931  cell_line + " from " + organism + " is known to be contaminated by " +
2932  ((s_CellLineContaminationMap[cell_line_search])[organism]).first +
2933  " from " + ((s_CellLineContaminationMap[cell_line_search])[organism]).second +
2934  ". Please see http://iclac.org/databases/cross-contaminations/ for more information and references.";
2935  }
2936  return rval;
2937 }
2938 
2939 
2940 // =============================================================================
2941 // Country Names
2942 // =============================================================================
2943 
2944 
2945 // Currently legal country names. Entries must be kept in case-sensitive
// alphabetical order: upper-case letters sort before lower-case ones, which
// is why "USA" precedes "Uganda". The names are exposed through
// s_CountriesSet, which CCountries::IsValid() searches.
// NOTE(review): the ordering requirement presumably exists because TCStrSet
// relies on sorted input - confirm.
2946 static const char* const s_Countries[] = {
2947  "Afghanistan",
2948  "Albania",
2949  "Algeria",
2950  "American Samoa",
2951  "Andorra",
2952  "Angola",
2953  "Anguilla",
2954  "Antarctica",
2955  "Antigua and Barbuda",
2956  "Arctic Ocean",
2957  "Argentina",
2958  "Armenia",
2959  "Aruba",
2960  "Ashmore and Cartier Islands",
2961  "Atlantic Ocean",
2962  "Australia",
2963  "Austria",
2964  "Azerbaijan",
2965  "Bahamas",
2966  "Bahrain",
2967  "Baker Island",
2968  "Baltic Sea",
2969  "Bangladesh",
2970  "Barbados",
2971  "Bassas da India",
2972  "Belarus",
2973  "Belgium",
2974  "Belize",
2975  "Benin",
2976  "Bermuda",
2977  "Bhutan",
2978  "Bolivia",
2979  "Borneo",
2980  "Bosnia and Herzegovina",
2981  "Botswana",
2982  "Bouvet Island",
2983  "Brazil",
2984  "British Virgin Islands",
2985  "Brunei",
2986  "Bulgaria",
2987  "Burkina Faso",
2988  "Burundi",
2989  "Cambodia",
2990  "Cameroon",
2991  "Canada",
2992  "Cape Verde",
2993  "Cayman Islands",
2994  "Central African Republic",
2995  "Chad",
2996  "Chile",
2997  "China",
2998  "Christmas Island",
2999  "Clipperton Island",
3000  "Cocos Islands",
3001  "Colombia",
3002  "Comoros",
3003  "Cook Islands",
3004  "Coral Sea Islands",
3005  "Costa Rica",
3006  "Cote d'Ivoire",
3007  "Croatia",
3008  "Cuba",
3009  "Curacao",
3010  "Cyprus",
3011  "Czech Republic",
3012  "Democratic Republic of the Congo",
3013  "Denmark",
3014  "Djibouti",
3015  "Dominica",
3016  "Dominican Republic",
3017  "Ecuador",
3018  "Egypt",
3019  "El Salvador",
3020  "Equatorial Guinea",
3021  "Eritrea",
3022  "Estonia",
3023  "Eswatini",
3024  "Ethiopia",
3025  "Europa Island",
3026  "Falkland Islands (Islas Malvinas)",
3027  "Faroe Islands",
3028  "Fiji",
3029  "Finland",
3030  "France",
3031  "French Guiana",
3032  "French Polynesia",
3033  "French Southern and Antarctic Lands",
3034  "Gabon",
3035  "Gambia",
3036  "Gaza Strip",
3037  "Georgia",
3038  "Germany",
3039  "Ghana",
3040  "Gibraltar",
3041  "Glorioso Islands",
3042  "Greece",
3043  "Greenland",
3044  "Grenada",
3045  "Guadeloupe",
3046  "Guam",
3047  "Guatemala",
3048  "Guernsey",
3049  "Guinea",
3050  "Guinea-Bissau",
3051  "Guyana",
3052  "Haiti",
3053  "Heard Island and McDonald Islands",
3054  "Honduras",
3055  "Hong Kong",
3056  "Howland Island",
3057  "Hungary",
3058  "Iceland",
3059  "India",
3060  "Indian Ocean",
3061  "Indonesia",
3062  "Iran",
3063  "Iraq",
3064  "Ireland",
3065  "Isle of Man",
3066  "Israel",
3067  "Italy",
3068  "Jamaica",
3069  "Jan Mayen",
3070  "Japan",
3071  "Jarvis Island",
3072  "Jersey",
3073  "Johnston Atoll",
3074  "Jordan",
3075  "Juan de Nova Island",
3076  "Kazakhstan",
3077  "Kenya",
3078  "Kerguelen Archipelago",
3079  "Kingman Reef",
3080  "Kiribati",
3081  "Kosovo",
3082  "Kuwait",
3083  "Kyrgyzstan",
3084  "Laos",
3085  "Latvia",
3086  "Lebanon",
3087  "Lesotho",
3088  "Liberia",
3089  "Libya",
3090  "Liechtenstein",
3091  "Line Islands",
3092  "Lithuania",
3093  "Luxembourg",
3094  "Macau",
3095  "Madagascar",
3096  "Malawi",
3097  "Malaysia",
3098  "Maldives",
3099  "Mali",
3100  "Malta",
3101  "Marshall Islands",
3102  "Martinique",
3103  "Mauritania",
3104  "Mauritius",
3105  "Mayotte",
3106  "Mediterranean Sea",
3107  "Mexico",
3108  "Micronesia, Federated States of",
3109  "Midway Islands",
3110  "Moldova",
3111  "Monaco",
3112  "Mongolia",
3113  "Montenegro",
3114  "Montserrat",
3115  "Morocco",
3116  "Mozambique",
3117  "Myanmar",
3118  "Namibia",
3119  "Nauru",
3120  "Navassa Island",
3121  "Nepal",
3122  "Netherlands",
3123  "New Caledonia",
3124  "New Zealand",
3125  "Nicaragua",
3126  "Niger",
3127  "Nigeria",
3128  "Niue",
3129  "Norfolk Island",
3130  "North Korea",
3131  "North Macedonia",
3132  "North Sea",
3133  "Northern Mariana Islands",
3134  "Norway",
3135  "Oman",
3136  "Pacific Ocean",
3137  "Pakistan",
3138  "Palau",
3139  "Palmyra Atoll",
3140  "Panama",
3141  "Papua New Guinea",
3142  "Paracel Islands",
3143  "Paraguay",
3144  "Peru",
3145  "Philippines",
3146  "Pitcairn Islands",
3147  "Poland",
3148  "Portugal",
3149  "Puerto Rico",
3150  "Qatar",
3151  "Republic of the Congo",
3152  "Reunion",
3153  "Romania",
3154  "Ross Sea",
3155  "Russia",
3156  "Rwanda",
3157  "Saint Barthelemy",
3158  "Saint Helena",
3159  "Saint Kitts and Nevis",
3160  "Saint Lucia",
3161  "Saint Martin",
3162  "Saint Pierre and Miquelon",
3163  "Saint Vincent and the Grenadines",
3164  "Samoa",
3165  "San Marino",
3166  "Sao Tome and Principe",
3167  "Saudi Arabia",
3168  "Senegal",
3169  "Serbia",
3170  "Seychelles",
3171  "Sierra Leone",
3172  "Singapore",
3173  "Sint Maarten",
3174  "Slovakia",
3175  "Slovenia",
3176  "Solomon Islands",
3177  "Somalia",
3178  "South Africa",
3179  "South Georgia and the South Sandwich Islands",
3180  "South Korea",
3181  "South Sudan",
3182  "Southern Ocean",
3183  "Spain",
3184  "Spratly Islands",
3185  "Sri Lanka",
3186  "State of Palestine",
3187  "Sudan",
3188  "Suriname",
3189  "Svalbard",
3190  "Sweden",
3191  "Switzerland",
3192  "Syria",
3193  "Taiwan",
3194  "Tajikistan",
3195  "Tanzania",
3196  "Tasman Sea",
3197  "Thailand",
3198  "Timor-Leste",
3199  "Togo",
3200  "Tokelau",
3201  "Tonga",
3202  "Trinidad and Tobago",
3203  "Tromelin Island",
3204  "Tunisia",
3205  "Turkey",
3206  "Turkmenistan",
3207  "Turks and Caicos Islands",
3208  "Tuvalu",
3209  "USA",
3210  "Uganda",
3211  "Ukraine",
3212  "United Arab Emirates",
3213  "United Kingdom",
3214  "Uruguay",
3215  "Uzbekistan",
3216  "Vanuatu",
3217  "Venezuela",
3218  "Viet Nam",
3219  "Virgin Islands",
3220  "Wake Island",
3221  "Wallis and Futuna",
3222  "West Bank",
3223  "Western Sahara",
3224  "Yemen",
3225  "Zambia",
3226  "Zimbabwe"
3227 };
// Set view over s_Countries; the second argument is the array size in bytes.
3228 static const TCStrSet s_CountriesSet(s_Countries, sizeof(s_Countries), __FILE__, __LINE__);
3229 
3230 // Formerly legal country names. Entries must be kept in case-sensitive
// alphabetical order. The names are exposed through s_Former_CountriesSet,
// which CCountries::IsValid() and CCountries::WasValid() search.
3231 static const char* const s_Former_Countries[] = {
3232  "Belgian Congo",
3233  "British Guiana",
3234  "Burma",
3235  "Czechoslovakia",
3236  "East Timor",
3237  "Korea",
3238  "Macedonia",
3239  "Micronesia",
3240  "Netherlands Antilles",
3241  "Serbia and Montenegro",
3242  "Siam",
3243  "Swaziland",
3244  "The former Yugoslav Republic of Macedonia",
3245  "USSR",
3246  "Yugoslavia",
3247  "Zaire"
3248 };
// Set view over s_Former_Countries; the second argument is the array size in
// bytes.
3249 static const TCStrSet s_Former_CountriesSet(s_Former_Countries, sizeof(s_Former_Countries), __FILE__, __LINE__);
3250 
3251 // Null-term exemption values that are accepted in place of a country name.
// Entries must be kept in case-sensitive alphabetical order. Because
// CCountries::IsValid() looks up only the text before the first ':', the
// "missing: ..." entries are reached through the plain "missing" entry.
3252 static const char* const s_Null_Countries[] = {
3253  "missing",
3254  "missing: control sample",
3255  "missing: data agreement established pre-2023",
3256  "missing: endangered species",
3257  "missing: human-identifiable",
3258  "missing: lab stock",
3259  "missing: sample group",
3260  "missing: synthetic construct",
3261  "missing: third party data",
3262  "not applicable",
3263  "not collected",
3264  "not provided",
3265  "restricted access"
3266 };
// Set view over s_Null_Countries; the second argument is the array size in
// bytes.
3267 static const TCStrSet s_Null_CountriesSet(s_Null_Countries, sizeof(s_Null_Countries), __FILE__, __LINE__);
3268 
3269 bool CCountries::IsValid(const string& country)
3270 {
3271  string name = country;
3272  size_t pos = country.find(':');
3273 
3274  if ( pos != NPOS ) {
3275  if (pos == country.length() - 1) {
3276  return false;
3277  }
3278  name = country.substr(0, pos);
3279  }
3280 
3281  // try current countries
3282  if (s_CountriesSet.find(name.c_str()) != s_CountriesSet.end()) {
3283  return true;
3284  } else if (s_Former_CountriesSet.find(name.c_str()) != s_Former_CountriesSet.end()) {
3285  return true;
3286  } else if (s_Null_CountriesSet.find(name.c_str()) != s_Null_CountriesSet.end()) {
3287  return true;
3288  } else {
3289  return false;
3290  }
3291 }
3292 
3293 
// Like the single-argument IsValid(), but also accepts a name that differs
// from a listed name only by letter case, and reports that through
// is_miscapitalized.
//
//  country           text to check; only the part before the first ':' is
//                    looked up
//  is_miscapitalized out: true when the name matched only in the
//                    case-insensitive pass
//
// NOTE(review): when the first ':' is the last character the function returns
// false before is_miscapitalized has been assigned, so the caller's variable
// keeps whatever value it had - confirm this is intended.
3294 bool CCountries::IsValid(const string& country, bool& is_miscapitalized)
3295 {
3296  string name = country;
3297  size_t pos = country.find(':');
3298 
3299  if ( pos != NPOS ) {
3300  name = country.substr(0, pos);
3301  if (pos == country.length() - 1) {
3302  return false;
3303  }
3304  }
3305 
3306  is_miscapitalized = false;
3307  // try current countries
3308  // fast check for properly capitalized
3309  if ( s_CountriesSet.find(name.c_str()) != s_CountriesSet.end() ) {
3310  return true;
3311  }
3312  if ( s_Former_CountriesSet.find(name.c_str()) != s_Former_CountriesSet.end() ) {
3313  return true;
3314  }
3315  if ( s_Null_CountriesSet.find(name.c_str()) != s_Null_CountriesSet.end() ) {
3316  return true;
3317  }
3318  // slow check for miscapitalized
3319  ITERATE ( TCStrSet, it, s_CountriesSet ) {
3320  if ( NStr::EqualNocase(name, *it) ) {
3321  is_miscapitalized = true;
3322  return true;
3323  }
3324  }
// NOTE(review): the loop headers of the next two blocks (original lines 3325
// and 3331) are not visible in this view - presumably the same scan over the
// former-country and null-term sets; confirm.
3326  if ( NStr::EqualNocase(name, *it) ) {
3327  is_miscapitalized = true;
3328  return true;
3329  }
3330  }
3332  if ( NStr::EqualNocase(name, *it) ) {
3333  is_miscapitalized = true;
3334  return true;
3335  }
3336  }
3337 
3338  return false;
3339 }
3340 
3341 
3342 bool CCountries::WasValid(const string& country)
3343 {
3344  string name = country;
3345  size_t pos = country.find(':');
3346 
3347  if ( pos != NPOS ) {
3348  name = country.substr(0, pos);
3349  }
3350 
3351  // try formerly-valid countries
3352  return s_Former_CountriesSet.find(name.c_str()) != s_Former_CountriesSet.end();
3353 }
3354 
3355 
3356 bool CCountries::WasValid(const string& country, bool& is_miscapitalized)
3357 {
3358  string name = country;
3359  size_t pos = country.find(':');
3360 
3361  if ( pos != NPOS ) {
3362  name = country.substr(0, pos);
3363  }
3364 
3365  is_miscapitalized = false;
3366  // try formerly-valid countries
3367  // fast check for properly capitalized
3368  if ( s_Former_CountriesSet.find(name.c_str()) != s_Former_CountriesSet.end() ) {
3369  return true;
3370  }
3371  // slow check for miscapitalized
3373  if ( NStr::EqualNocase(name, *it) ) {
3374  is_miscapitalized = true;
3375  return true;
3376  }
3377  }
3378  return false;
3379 }
3380 
3381 /////////////////////////////////////////////////////////////////////////////
3382 ////// Country Capitalization Fix ///////////////////////////////////////////
3383 
3385 {
3386  {"england", "United Kingdom: England"},
3387  {"great britain", "United Kingdom: Great Britain"},
3388  {"new jersey, usa", "USA: New Jersey"}
3389 };
3392 
3394 {"ABW", "Aruba"},
3395 {"AFG", "Afghanistan"},
3396 {"AGO", "Angola"},
3397 {"AIA", "Anguilla"},
3398 {"ALA", "Aland Islands"},
3399 {"ALB", "Albania"},
3400 {"AND", "Andorra"},
3401 {"ARE", "United Arab Emirates"},
3402 {"ARG", "Argentina"},
3403 {"ARM", "Armenia"},
3404 {"ASM", "American Samoa"},
3405 {"ATA", "Antarctica"},
3406 {"ATF", "French Southern Territories"},
3407 {"ATG", "Antigua and Barbuda"},
3408 {"AUS", "Australia"},
3409 {"AUT", "Austria"},
3410 {"AZE", "Azerbaijan"},
3411 {"Antigua & Barbuda", "Antigua and Barbuda"},
3412 {"Ashmore & Cartier Islands", "Ashmore and Cartier Islands"},
3413 {"BDI", "Burundi"},
3414 {"BEL", "Belgium"},
3415 {"BEN", "Benin"},
3416 {"BES", "Bonaire, Sint Eustatius and Saba"},
3417 {"BFA", "Burkina Faso"},
3418 {"BGD", "Bangladesh"},
3419 {"BGR", "Bulgaria"},
3420 {"BHR", "Bahrain"},
3421 {"BHS", "Bahamas"},
3422 {"BIH", "Bosnia and Herzegovina"},
3423 {"BLM", "Saint Barthelemy"},
3424 {"BLR", "Belarus"},
3425 {"BLZ", "Belize"},
3426 {"BMU", "Bermuda"},
3427 {"BOL", "Bolivia"},
3428 {"BRA", "Brazil"},
3429 {"BRB", "Barbados"},
3430 {"BRN", "Brunei"},
3431 {"BTN", "Bhutan"},
3432 {"BVT", "Bouvet Island"},
3433 {"BWA", "Botswana"},
3434 {"Brasil", "Brazil"},
3435 {"CAF", "Central African Republic"},
3436 {"CAN", "Canada"},
3437 {"CCK", "Cocos Islands"},
3438 {"CHE", "Switzerland"},
3439 {"CHL", "Chile"},
3440 {"CHN", "China"},
3441 {"CIV", "Cote d'Ivoire"},
3442 {"CMR", "Cameroon"},
3443 {"COD", "Democratic Republic of the Congo"},
3444 {"COG", "Republic of the Congo"},
3445 {"COK", "Cook Islands"},
3446 {"COL", "Colombia"},
3447 {"COM", "Comoros"},
3448 {"CPV", "Cape Verde"},
3449 {"CRI", "Costa Rica"},
3450 {"CUB", "Cuba"},
3451 {"CUW", "Curacao"},
3452 {"CXR", "Christmas Island"},
3453 {"CYM", "Cayman Islands"},
3454 {"CYP", "Cyprus"},
3455 {"CZE", "Czech Republic"},
3456 {"Cape Verde Islands", "Cape Verde"},
3457 {"DEU", "Germany"},
3458 {"DJI", "Djibouti"},
3459 {"DMA", "Dominica"},
3460 {"DNK", "Denmark"},
3461 {"DOM", "Dominican Republic"},
3462 {"DZA", "Algeria"},
3463 {"Democratic Republic of Congo", "Democratic Republic of the Congo"},
3464 {"ECU", "Ecuador"},
3465 {"EGY", "Egypt"},
3466 {"ERI", "Eritrea"},
3467 {"ESH", "Western Sahara"},
3468 {"ESP", "Spain"},
3469 {"EST", "Estonia"},
3470 {"ETH", "Ethiopia"},
3471 {"FIN", "Finland"},
3472 {"FJI", "Fiji"},
3473 {"FLK", "Falkland Islands (Islas Malvinas)"},
3474 {"FRA", "France"},
3475 {"FRO", "Faroe Islands"},
3476 {"FSM", "Micronesia, Federated States of"},
3477 {"Falkland Islands", "Falkland Islands (Islas Malvinas)"},
3478 {"French Southern & Antarctic Lands", "French Southern and Antarctic Lands"},
3479 {"GAB", "Gabon"},
3480 {"GBR", "United Kingdom"},
3481 {"GEO", "Georgia"},
3482 {"GGY", "Guernsey"},
3483 {"GHA", "Ghana"},
3484 {"GIB", "Gibraltar"},
3485 {"GIN", "Guinea"},
3486 {"GLP", "Guadeloupe"},
3487 {"GMB", "Gambia"},
3488 {"GNB", "Guinea-Bissau"},
3489 {"GNQ", "Equatorial Guinea"},
3490 {"GRC", "Greece"},
3491 {"GRD", "Grenada"},
3492 {"GRL", "Greenland"},
3493 {"GTM", "Guatemala"},
3494 {"GUF", "French Guiana"},
3495 {"GUM", "Guam"},
3496 {"GUY", "Guyana"},
3497 {"HKG", "Hong Kong"},
3498 {"HMD", "Heard Island and McDonald Islands"},
3499 {"HND", "Honduras"},
3500 {"HRV", "Croatia"},
3501 {"HTI", "Haiti"},
3502 {"HUN", "Hungary"},
3503 {"Heard Island & McDonald Islands", "Heard Island and McDonald Islands"},
3504 {"IDN", "Indonesia"},
3505 {"IMN", "Isle of Man"},
3506 {"IND", "India"},
3507 {"IOT", "British Indian Ocean Territory"},
3508 {"IRL", "Ireland"},
3509 {"IRN", "Iran"},
3510 {"IRQ", "Iraq"},
3511 {"ISL", "Iceland"},
3512 {"ISR", "Israel"},
3513 {"ITA", "Italy"},
3514 {"Ivory Coast", "Cote d'Ivoire"},
3515 {"JAM", "Jamaica"},
3516 {"JEY", "Jersey"},
3517 {"JOR", "Jordan"},
3518 {"JPN", "Japan"},
3519 {"KAZ", "Kazakhstan"},
3520 {"KEN", "Kenya"},
3521 {"KGZ", "Kyrgyzstan"},
3522 {"KHM", "Cambodia"},
3523 {"KIR", "Kiribati"},
3524 {"KNA", "Saint Kitts and Nevis"},
3525 {"KOR", "South Korea"},
3526 {"KWT", "Kuwait"},
3527 {"LAO", "Lao People's Democratic Republic"},
3528 {"LBN", "Lebanon"},
3529 {"LBR", "Liberia"},
3530 {"LBY", "Libyan Arab Jamahiriya"},
3531 {"LCA", "Saint Lucia"},
3532 {"LIE", "Liechtenstein"},
3533 {"LKA", "Sri Lanka"},
3534 {"LSO", "Lesotho"},
3535 {"LTU", "Lithuania"},
3536 {"LUX", "Luxembourg"},
3537 {"LVA", "Latvia"},
3538 {"La Reunion Island", "Reunion"},
3539 {"Luxemburg", "Luxembourg"},
3540 {"MAC", "Macao"},
3541 {"MAF", "Saint Martin (French part)"},
3542 {"MAR", "Morocco"},
3543 {"MCO", "Monaco"},
3544 {"MDA", "Moldova"},
3545 {"MDG", "Madagascar"},
3546 {"MDV", "Maldives"},
3547 {"MEX", "Mexico"},
3548 {"MHL", "Marshall Islands"},
3549 {"MKD", "North Macedonia"},
3550 {"MLI", "Mali"},
3551 {"MLT", "Malta"},
3552 {"MMR", "Myanmar"},
3553 {"MNE", "Montenegro"},
3554 {"MNG", "Mongolia"},
3555 {"MNP", "Northern Mariana Islands"},
3556 {"MOZ", "Mozambique"},
3557 {"MRT", "Mauritania"},
3558 {"MSR", "Montserrat"},
3559 {"MTQ", "Martinique"},
3560 {"MUS", "Mauritius"},
3561 {"MWI", "Malawi"},
3562 {"MYS", "Malaysia"},
3563 {"MYT", "Mayotte"},
3564 {"Macedonia", "North Macedonia"},
3565 {"NAM", "Namibia"},
3566 {"NCL", "New Caledonia"},
3567 {"NER", "Niger"},
3568 {"NFK", "Norfolk Island"},
3569 {"NGA", "Nigeria"},
3570 {"NIC", "Nicaragua"},
3571 {"NIU", "Niue"},
3572 {"NLD", "Netherlands"},
3573 {"NOR", "Norway"},
3574 {"NPL", "Nepal"},
3575 {"NRU", "Nauru"},
3576 {"NZL", "New Zealand"},
3577 {"Netherland", "Netherlands"},
3578 {"New Guinea", "Papua New Guinea"},
3579 {"OMN", "Oman"},
3580 {"P, R, China", "China"},
3581 {"P.R. China", "China"},
3582 {"P.R.China", "China"},
3583 {"PAK", "Pakistan"},
3584 {"PAN", "Panama"},
3585 {"PCN", "Pitcairn"},
3586 {"PER", "Peru"},
3587 {"PHL", "Philippines"},
3588 {"PLW", "Palau"},
3589 {"PNG", "Papua New Guinea"},
3590 {"POL", "Poland"},
3591 {"PRI", "Puerto Rico"},
3592 {"PRK", "North Korea"},
3593 {"PRT", "Portugal"},
3594 {"PRY", "Paraguay"},
3595 {"PSE", "Palestinian Territory"},
3596 {"PYF", "French Polynesia"},
3597 {"People's Republic of China", "China"},
3598 {"Pr China", "China"},
3599 {"Prchina", "China"},
3600 {"QAT", "Qatar"},
3601 {"REU", "Reunion"},
3602 {"ROU", "Romania"},
3603 {"RUS", "Russia"},
3604 {"RWA", "Rwanda"},
3605 {"Republic of Congo", "Republic of the Congo"},
3606 {"SAU", "Saudi Arabia"},
3607 {"SDN", "Sudan"},
3608 {"SEN", "Senegal"},
3609 {"SGP", "Singapore"},
3610 {"SGS", "South Georgia and the South Sandwich Islands"},
3611 {"SHN", "Saint Helena"},
3612 {"SJM", "Svalbard and Jan Mayen"},
3613 {"SLB", "Solomon Islands"},
3614 {"SLE", "Sierra Leone"},
3615 {"SLV", "El Salvador"},
3616 {"SMR", "San Marino"},
3617 {"SOM", "Somalia"},
3618 {"SPM", "Saint Pierre and Miquelon"},
3619 {"SRB", "Serbia"},
3620 {"SSD", "South Sudan"},
3621 {"STP", "Sao Tome and Principe"},
3622 {"SUR", "Suriname"},
3623 {"SVK", "Slovakia"},
3624 {"SVN", "Slovenia"},
3625 {"SWE", "Sweden"},
3626 {"SWZ", "Eswatini"},
3627 {"SXM", "Sint Maarten (Dutch part)"},
3628 {"SYC", "Seychelles"},
3629 {"SYR", "Syrian Arab Republic"},
3630 {"Saint Kitts & Nevis", "Saint Kitts and Nevis"},
3631 {"Saint Pierre & Miquelon", "Saint Pierre and Miquelon"},
3632 {"Saint Vincent & Grenadines", "Saint Vincent and the Grenadines"},
3633 {"Saint Vincent & the Grenadines", "Saint Vincent and the Grenadines"},
3634 {"Saint Vincent and Grenadines", "Saint Vincent and the Grenadines"},
3635 {"San Tome and Principe Island", "Sao Tome and Principe"},
3636 {"Sao Tome & Principe", "Sao Tome and Principe"},
3637 {"South Georgia & South Sandwich Islands", "South Georgia and the South Sandwich Islands"},
3638 {"South Georgia & the South Sandwich Islands", "South Georgia and the South Sandwich Islands"},
3639 {"St Helena", "Saint Helena"},
3640 {"St Lucia", "Saint Lucia"},
3641 {"St Pierre and Miquelon", "Saint Pierre and Miquelon"},
3642 {"St Vincent and the Grenadines", "Saint Vincent and the Grenadines"},
3643 {"St. Helena", "Saint Helena"},
3644 {"St. Lucia", "Saint Lucia"},
3645 {"St. Pierre and Miquelon", "Saint Pierre and Miquelon"},
3646 {"St. Vincent and the Grenadines", "Saint Vincent and the Grenadines"},
3647 {"TCA", "Turks and Caicos Islands"},
3648 {"TCD", "Chad"},
3649 {"TGO", "Togo"},
3650 {"THA", "Thailand"},
3651 {"TJK", "Tajikistan"},
3652 {"TKL", "Tokelau"},
3653 {"TKM", "Turkmenistan"},
3654 {"TLS", "Timor-Leste"},
3655 {"TON", "Tonga"},
3656 {"TTO", "Trinidad and Tobago"},
3657 {"TUN", "Tunisia"},
3658 {"TUR", "Turkey"},
3659 {"TUV", "Tuvalu"},
3660 {"TWN", "Taiwan"},
3661 {"TZA", "Tanzania"},
3662 {"The Netherlands", "Netherlands"},
3663 {"Trinidad & Tobago", "Trinidad and Tobago"},
3664 {"Turks & Caicos", "Turks and Caicos Islands"},
3665 {"Turks & Caicos Islands", "Turks and Caicos Islands"},
3666 {"Turks and Caicos", "Turks and Caicos Islands"},
3667 {"U.S.A.", "USA"},
3668 {"UGA", "Uganda"},
3669 {"UK", "United Kingdom"},
3670 {"UKR", "Ukraine"},
3671 {"UMI", "United States Minor Outlying Islands"},
3672 {"URY", "Uruguay"},
3673 {"UZB", "Uzbekistan"},
3674 {"United States", "USA"},
3675 {"United States of America", "USA"},
3676 {"VAT", "Holy See (Vatican City State)"},
3677 {"VCT", "Saint Vincent and the Grenadines"},
3678 {"VEN", "Venezuela"},
3679 {"VGB", "British Virgin Islands"},
3680 {"VIR", "Virgin Islands"},
3681 {"VNM", "Viet Nam"},
3682 {"VUT", "Vanuatu"},
3683 {"Vietnam", "Viet Nam"},
3684 {"WLF", "Wallis and Futuna"},
3685 {"WSM", "Samoa"},
3686 {"YEM", "Yemen"},
3687 {"ZAF", "South Africa"},
3688 {"ZMB", "Zambia"},
3689 {"ZWE", "Zimbabwe"},
3690 {"the Netherlands", "Netherlands"}
3691 };
3692 
3694 
3695 // for GP-24841
3697 {"Burma", "Myanmar"},
3698 {"Siam", "Thailand"}
3699 };
3701 
3702 // for GB-7408
3704 {"Antigua", "Antigua and Barbuda: Antigua"},
3705 {"Ashmore Island", "Ashmore and Cartier Islands: Ashmore Island"},
3706 {"Autonomous Region of the Azores", "Portugal: Azores"},
3707 {"Azores", "Portugal: Azores"},
3708 {"Barbuda", "Antigua and Barbuda: Barbuda"},
3709 {"Bassas da India", "French Southern and Antarctic Lands: Bassas da India"},
3710 {"Caicos Islands", "Turks and Caicos Islands: Caicos Islands"},
3711 {"Canary Islands", "Spain: Canary Islands"},
3712 {"Cartier Island", "Ashmore and Cartier Islands: Cartier Island"},
3713 {"East Germany", "Germany: East Germany"},
3714 {"El Hierro", "Spain: El Hierro"},
3715 {"Europa Island", "French Southern and Antarctic Lands: Europa Island"},
3716 {"Fuerteventura", "Spain: Fuerteventura"},
3717 {"Glorioso Islands", "French Southern and Antarctic Lands: Glorioso Islands"},
3718 {"Gran Canaria", "Spain: Gran Canaria"},
3719 {"Grenadines", "Saint Vincent and the Grenadines: Grenadines"},
3720 {"Heard Island", "Heard Island and McDonald Islands: Heard Island"},
3721 {"Ile Amsterdam", "French Southern and Antarctic Lands: Ile Amsterdam"},
3722 {"Ile Saint-Paul", "French Southern and Antarctic Lands: Ile Saint-Paul"},
3723 {"Iles Crozet", "French Southern and Antarctic Lands: Iles Crozet"},
3724 {"Iles Kerguelen", "French Southern and Antarctic Lands: Iles Kerguelen"},
3725 {"Juan de Nova Island", "French Southern and Antarctic Lands: Juan de Nova Island"},
3726 {"La Gomera", "Spain: La Gomera"},
3727 {"La Graciosa", "Spain: La Graciosa"},
3728 {"La Palma", "Spain: La Palma"},
3729 {"Lanzarote", "Spain: Lanzarote"},
3730 {"Madeira", "Portugal: Madeira"},
3731 {"McDonald Island", "Heard Island and McDonald Islands: McDonald Island"},
3732 {"McDonald Islands", "Heard Island and McDonald Islands: McDonald Islands"},
3733 {"Miquelon", "Saint Pierre and Miquelon: Miquelon"},
3734 {"Nevis", "Saint Kitts and Nevis: Nevis"},
3735 {"Principe", "Sao Tome and Principe: Principe"},
3736 {"Saint Kitts", "Saint Kitts and Nevis: Saint Kitts"},
3737 {"Saint Pierre", "Saint Pierre and Miquelon: Saint Pierre"},
3738 {"Saint Vincent", "Saint Vincent and the Grenadines: Saint Vincent"},
3739 {"Sao Tome", "Sao Tome and Principe: Sao Tome"},
3740 {"Scotland", "United Kingdom: Scotland"},
3741 {"South Sandwich Islands", "South Georgia and the South Sandwich Islands: South Sandwich Islands"},
3742 {"St Kitts", "Saint Kitts and Nevis: Saint Kitts"},
3743 {"St Pierre", "Saint Pierre and Miquelon: Saint Pierre"},
3744 {"St Thomas", "USA: Saint Thomas"},
3745 {"St Vincent", "Saint Vincent and the Grenadines: Saint Vincent"},
3746 {"St. Kitts", "Saint Kitts and Nevis: Saint Kitts"},
3747 {"St. Pierre", "Saint Pierre and Miquelon: Saint Pierre"},
3748 {"St. Thomas", "USA: Saint Thomas"},
3749 {"St. Vincent", "Saint Vincent and the Grenadines: Saint Vincent"},
3750 {"Tenerife", "Spain: Tenerife"},
3751 {"Tobago", "Trinidad and Tobago: Tobago"},
3752 {"Trinidad", "Trinidad and Tobago: Trinidad"},
3753 {"Tromelin Island", "French Southern and Antarctic Lands: Tromelin Island"},
3754 {"Turks Islands", "Turks and Caicos Islands: Turks Islands"},
3755 {"Wales", "United Kingdom: Wales"},
3756 {"West Germany", "Germany: West Germany"},
3757 
3758 };
3760 
3761 
// Names matched case-insensitively by CCountries::WholeCountryFix to turn a
// bare state name into "USA: <state>". The list holds the 50 states plus the
// District of Columbia.
// Both the strings and the pointers are const, matching the other name tables
// in this file, so the table cannot be modified at run time.
static const char* const s_USAStates[] = {
    "Alabama",
    "Alaska",
    "Arizona",
    "Arkansas",
    "California",
    "Colorado",
    "Connecticut",
    "Delaware",
    "District of Columbia",
    "Florida",
    "Georgia",
    "Hawaii",
    "Idaho",
    "Illinois",
    "Indiana",
    "Iowa",
    "Kansas",
    "Kentucky",
    "Louisiana",
    "Maine",
    "Maryland",
    "Massachusetts",
    "Michigan",
    "Minnesota",
    "Mississippi",
    "Missouri",
    "Montana",
    "Nebraska",
    "Nevada",
    "New Hampshire",
    "New Jersey",
    "New Mexico",
    "New York",
    "North Carolina",
    "North Dakota",
    "Ohio",
    "Oklahoma",
    "Oregon",
    "Pennsylvania",
    "Rhode Island",
    "South Carolina",
    "South Dakota",
    "Tennessee",
    "Texas",
    "Utah",
    "Vermont",
    "Virginia",
    "Washington",
    "West Virginia",
    "Wisconsin",
    "Wyoming"
};
3815 
3817 {
3818  vector<string> words;
3819  NStr::Split(phrase, " \t\r\n", words);
3820  for(vector<string>::iterator word = words.begin(); word != words.end(); ++word)
3821  if (!word->empty() && isalpha(word->at(0)))
3822  word->at(0) = (unsigned char)toupper(word->at(0));
3823  return NStr::Join(words," ");
3824 }
3825 
3826 string CCountries::WholeCountryFix(string country)
3827 {
3828  string new_country;
3829  TCStringPairsMap::const_iterator found = k_whole_country_fixes.find(NStr::ToLower(country).c_str());
3830  if (found != k_whole_country_fixes.end()) {
3831  new_country = found->second;
3832  return new_country;
3833  }
3834 
3835  const size_t num_states = sizeof(s_USAStates) / sizeof(s_USAStates[0]);
3836  for (size_t i = 0; i < num_states; ++i) {
3837  if (NStr::EqualNocase(s_USAStates[i], country)) {
3838  new_country = "USA: " + CTempString(s_USAStates[i]);
3839  break;
3840  }
3841  }
3842 
3843  return new_country;
3844 }
3845 
3846 bool CCountries::IsSubstringOfStringInList(const string& phrase, const string& country1, size_t pos1)
3847 {
3848  bool r = false;
3850  {
3851  string country2(*c);
3852  if (country2.length() > country1.length() && NStr::FindNoCase(country2,country1) != NPOS)
3853  {
3854  SIZE_TYPE pos2 = NStr::FindNoCase(phrase,country2);
3855  while (pos2 != NPOS)
3856  {
3857  if (pos2 <= pos1 && pos2+country2.length() >= pos1+country1.length())
3858  r = true;
3859  pos2 = NStr::FindNoCase(phrase,country2,pos2+country2.length());
3860  }
3861  }
3862  }
3863  return r;
3864 }
3865 
3866 bool CCountries::ContainsMultipleCountryNames (const string &phrase)
3867 {
3868  int num_matches = 0;
3870  {
3871  string country(*c);
3872  size_t pos = NStr::FindNoCase(phrase,country);
3873  while (pos != NPOS)
3874  {
3875  if (!((pos+country.length()<phrase.length() && isalpha(phrase[pos+country.length()]))
3876  || (pos > 0 && isalpha(phrase[pos-1]))
3877  || IsSubstringOfStringInList(phrase,country,pos)))
3878  num_matches++;
3879  pos = NStr::FindNoCase(phrase,country,pos+country.length());
3880  }
3881 
3882  }
3883  return (num_matches > 1);
3884 }
3885 
3887 {
3888  string output = country;
3889  ITERATE ( TCStrSet, it, s_CountriesSet ) {
3890  if ( NStr::EqualNocase(country, *it) ) {
3891  output = *it;
3892  }
3893  }
3894  return output;
3895 }
3896 
3897 
// Strip delimiter characters from both ends of val, in place.
//
// Each pass of the loop applies at most one of the rules below (first match
// wins) and the loop repeats until a pass changes nothing or val is empty:
//   - leading ',', ':' or '.', and leading ')' unless except_paren is set
//   - trailing ',' or ':', and trailing '(' unless except_paren is set
//   - trailing "the" that is preceded by a non-letter (that character is
//     removed too)
//   - trailing '.', but only when it follows whitespace or when none of the
//     four characters before it is whitespace or punctuation
//
// val           string to clean up; modified in place
// except_paren  when true, parentheses at the ends are left alone
3898 void CCountries::x_RemoveDelimitersFromEnds(string& val, bool except_paren)
3899 {
3901  bool any_found = true;
3902  while (!val.empty() && any_found) {
3903  any_found = false;
// rule 1: drop one leading delimiter
3904  if (NStr::StartsWith(val, ",")
3905  || NStr::StartsWith(val, ":")
3906  || NStr::StartsWith(val, ".")
3907  || (!except_paren && NStr::StartsWith(val, ")"))) {
3908  val = val.substr(1);
3909  any_found = true;
// rule 2: drop one trailing delimiter
3911  } else if (NStr::EndsWith(val, ",")
3912  || NStr::EndsWith(val, ":")
3913  || (!except_paren && NStr::EndsWith(val, "("))) {
3914  val = val.substr(0, val.length() - 1);
3915  any_found = true;
// rule 3: drop a trailing "the" together with the non-letter in front of it
3917  } else if (NStr::EndsWith(val, "the") && val.length() > 3 && !isalpha((unsigned char)val[val.length() - 4])) {
3918  val = val.substr(0, val.length() - 4);
3919  any_found = true;
// rule 4: a trailing period is dropped only in the two cases below
3920  } else if (NStr::EndsWith(val, ".")) {
3921  size_t len = val.length();
// period directly after whitespace
3922  if (len > 1 && isspace((unsigned char)val[len - 2])) {
3923  val = val.substr(0, val.length() - 1);
3924  any_found = true;
// len > 5 guarantees the four characters inspected below all exist
3926  } else if (len > 5) {
3927  // make sure no spaces or punctuation within 4 characters before '.'
3928  bool do_remove = true;
3929  size_t pos = val.length() - 2;
3930  size_t dist = 0;
3931  while (dist < 4 && do_remove) {
3932  if (isspace((unsigned char)val[pos]) || ispunct((unsigned char)val[pos])) {
3933  do_remove = false;
3934  }
3935  pos--;
3936  dist++;
3937  }
3938  if (do_remove) {
3939  val = val.substr(0, val.length() - 1);
3940  any_found = true;
3941  }
3942  }
3943  }
3944  }
3945 }
3946 
3947 
3948 vector<string> CCountries::x_Tokenize(const string& val)
3949 {
3950  vector<string> tokens;
3951  NStr::Split(val, ",:()", tokens);
3952  // special tokenizing - if tokens contain periods but resulting token is at least four characters long
3953  vector<string>::iterator it = tokens.begin();
3954  while (it != tokens.end()) {
3955  size_t pos = NStr::Find(*it, ".");
3956  if (pos != NPOS && pos > 3 && (*it).length() - pos > 4) {
3957  string first = (*it).substr(0, pos);
3958  string remainder = (*it).substr(pos + 1);
3959  size_t space_pos = NStr::Find(first, " ");
3960  size_t len_to_space = first.length();
3961  while (space_pos != NPOS) {
3962  first = first.substr(space_pos + 1);
3963  len_to_space = first.length();
3964  space_pos = NStr::Find(first, " ");
3965  }
3966  if (len_to_space > 4) {
3967  (*it) = (*it).substr(0, pos);
3968  it = tokens.insert(it, remainder);
3969  } else {
3970  it++;
3971  }
3972  } else {
3973  it++;
3974  }
3975  }
3976  return tokens;
3977 }
3978 
3979 
3980 bool s_ContainsWholeWord(const CTempString test, const CTempString word, NStr::ECase case_sense)
3981 {
3982  size_t start = 0;
3983  size_t tlen = test.length();
3984  size_t wlen = word.length();
3985 
3986  size_t pos = NStr::Find(test, word, case_sense);
3987  while (pos != NPOS) {
3988  size_t p = start + pos;
3989  if ( (p == 0 || !isalpha((unsigned char)test[p - 1])) &&
3990  (p + wlen >= tlen || !isalpha((unsigned char)test[p + wlen])) ) {
3991  return true;
3992  }
3993  start = p + 1;
3994  pos = NStr::Find(CTempString(test, start, tlen - start), word, case_sense);
3995  }
3996  return false;
3997 }
3998 
3999 
4000 bool s_SuppressCountryFix(const string& test)
4001 {
4002  if (s_ContainsWholeWord(test, "Sea", NStr::eNocase)) {
4003  return true;
4004  } else if (s_ContainsWholeWord(test, "USSR", NStr::eNocase)) {
4005  return true;
4006  }
4007  return false;
4008 }
4009 
4010 
4012 (const TCStringPairsMap& fix_map,
4013  const vector<string>& countries,
4014  string& valid_country,
4015  string& orig_valid_country,
4016  bool& too_many_countries,
4017  bool& bad_cap)
4018 {
4019  for (auto country : countries) {
4020  if (!country.empty() && !too_many_countries)
4021  {
4022  string check = country;
4025 
4026  bool check_has_bad_cap = false;
4027  if (IsValid(check,check_has_bad_cap))
4028  {
4029  if (valid_country.empty())
4030  {
4031  valid_country = check;
4032  orig_valid_country = check;
4033  bad_cap = check_has_bad_cap;
4034  }
4035  else
4036  {
4037  too_many_countries = true;
4038  }
4039  }
4040  else // see if this is a fixable country
4041  {
4042  TCStringPairsMap::const_iterator found = fix_map.find(check.c_str());
4043  if (found != fix_map.end())
4044  {
4045  if (valid_country.empty())
4046  {
4047  valid_country = found->second;
4048  orig_valid_country = check;
4049  }
4050  else
4051  {
4052  too_many_countries = true;
4053  }
4054  }
4055  }
4056  }
4057  }
4058 }
4059 
4060 // start of RW-1278
4061 
// Collapse every run of consecutive ' ' characters in val into a single
// space, in place.
//
// Only the space character is affected. Leading and trailing runs are reduced
// to one space, not removed. The string is compacted within its own buffer,
// so no temporary allocation is made and bytes following an embedded NUL are
// kept.
//
// val      string to compress; modified only when it contains a run of spaces
// returns  true when val was changed, false otherwise (including empty input)
bool s_CompressRunsOfSpaces(string& val)
{
    string::size_type out = 0;      // next write position
    bool prev_was_space = false;    // was the last kept character a space?

    for (string::size_type in = 0; in < val.size(); ++in) {
        const char ch = val[in];
        const bool is_space = (ch == ' ');
        if (is_space && prev_was_space) {
            continue;  // drop the 2nd, 3rd, ... space of a run
        }
        val[out++] = ch;
        prev_was_space = is_space;
    }

    if (out == val.size()) {
        return false;  // nothing was dropped
    }
    val.resize(out);
    return true;
}
4108 
4111  { "Acadia Parish", "Acadia Parish" },
4112  { "AcadiaParish", "Acadia Parish" },
4113  { "Allen Parish", "Allen Parish" },
4114  { "AllenParish", "Allen Parish" },
4115  { "Ascension Parish", "Ascension Parish" },
4116  { "AscensionParish", "Ascension Parish" },
4117  { "Assumption Parish", "Assumption Parish" },
4118  { "AssumptionParish", "Assumption Parish" },
4119  { "Avoyelles Parish", "Avoyelles Parish" },
4120  { "AvoyellesParish", "Avoyelles Parish" },
4121  { "Beauregard Parish", "Beauregard Parish" },
4122  { "BeauregardParish", "Beauregard Parish" },
4123  { "Bienville Parish", "Bienville Parish" },
4124  { "BienvilleParish", "Bienville Parish" },
4125  { "Bossier Parish", "Bossier Parish" },
4126  { "BossierParish", "Bossier Parish" },
4127  { "Caddo Parish", "Caddo Parish" },
4128  { "CaddoParish", "Caddo Parish" },
4129  { "Calcasieu Parish", "Calcasieu Parish" },
4130  { "CalcasieuParish", "Calcasieu Parish" },
4131  { "Caldwell Parish", "Caldwell Parish" },
4132  { "CaldwellParish", "Caldwell Parish" },
4133  { "Cameron Parish", "Cameron Parish" },
4134  { "CameronParish", "Cameron Parish" },
4135  { "Catahoula Parish", "Catahoula Parish" },
4136  { "CatahoulaParish", "Catahoula Parish" },
4137  { "Claiborne Parish", "Claiborne Parish" },
4138  { "ClaiborneParish", "Claiborne Parish" },
4139  { "Concordia Parish", "Concordia Parish" },
4140  { "ConcordiaParish", "Concordia Parish" },
4141  { "DeSoto Parish", "DeSoto Parish" },
4142  { "DeSotoParish", "DeSoto Parish" },
4143  { "East Baton Rouge Parish", "East Baton Rouge Parish" },
4144  { "East Carroll Parish", "East Carroll Parish" },
4145  { "East Feliciana Parish", "East Feliciana Parish" },
4146  { "EastBatonRougeParish", "East Baton Rouge Parish" },
4147  { "EastCarrollParish", "East Carroll Parish" },
4148  { "EastFelicianaParish", "East Feliciana Parish" },
4149  { "Evangeline Parish", "Evangeline Parish" },
4150  { "EvangelineParish", "Evangeline Parish" },
4151  { "Franklin Parish", "Franklin Parish" },
4152  { "FranklinParish", "Franklin Parish" },
4153  { "Grant Parish", "Grant Parish" },
4154  { "GrantParish", "Grant Parish" },
4155  { "Iberia Parish", "Iberia Parish" },
4156  { "IberiaParish", "Iberia Parish" },
4157  { "Iberville Parish", "Iberville Parish" },
4158  { "IbervilleParish", "Iberville Parish" },
4159  { "Jackson Parish", "Jackson Parish" },
4160  { "JacksonParish", "Jackson Parish" },
4161  { "Jefferson Davis Parish", "Jefferson Davis Parish" },
4162  { "Jefferson Parish", "Jefferson Parish" },
4163  { "JeffersonDavisParish", "Jefferson Davis Parish" },
4164  { "JeffersonParish", "Jefferson Parish" },
4165  { "Lafayette Parish", "Lafayette Parish" },
4166  { "LafayetteParish", "Lafayette Parish" },
4167  { "Lafourche Parish", "Lafourche Parish" },
4168  { "LafourcheParish", "Lafourche Parish" },
4169  { "LaSalle Parish", "LaSalle Parish" },
4170  { "LaSalleParish", "LaSalle Parish" },
4171  { "Lincoln Parish", "Lincoln Parish" },
4172  { "LincolnParish", "Lincoln Parish" },
4173  { "Livingston Parish", "Livingston Parish" },
4174  { "LivingstonParish", "Livingston Parish" },
4175  { "Madison Parish", "Madison Parish" },
4176  { "MadisonParish", "Madison Parish" },
4177  { "Morehouse Parish", "Morehouse Parish" },
4178  { "MorehouseParish", "Morehouse Parish" },
4179  { "Natchitoches Parish", "Natchitoches Parish" },
4180  { "NatchitochesParish", "Natchitoches Parish" },
4181  { "Orleans Parish", "Orleans Parish" },
4182  { "OrleansParish", "Orleans Parish" },
4183  { "Ouachita Parish", "Ouachita Parish" },
4184  { "OuachitaParish", "Ouachita Parish" },
4185  { "Plaquemines Parish", "Plaquemines Parish" },
4186  { "PlaqueminesParish", "Plaquemines Parish" },
4187  { "Pointe Coupee Parish", "Pointe Coupee Parish" },
4188  { "PointeCoupeeParish", "Pointe Coupee Parish" },
4189  { "Rapides Parish", "Rapides Parish" },
4190  { "RapidesParish", "Rapides Parish" },
4191  { "Red River Parish", "Red River Parish" },
4192  { "RedRiverParish", "Red River Parish" },
4193  { "Richland Parish", "Richland Parish" },
4194  { "RichlandParish", "Richland Parish" },
4195  { "Sabine Parish", "Sabine Parish" },
4196  { "SabineParish", "Sabine Parish" },
4197  { "St. Bernard Parish", "St. Bernard Parish" },
4198  { "St. Charles Parish", "St. Charles Parish" },
4199  { "St. Helena Parish", "St. Helena Parish" },
4200  { "St. James Parish", "St. James Parish" },
4201  { "St. John the Baptist Parish", "St. John the Baptist Parish" },
4202  { "St. Landry Parish", "St. Landry Parish" },
4203  { "St. Martin Parish", "St. Martin Parish" },
4204  { "St. Mary Parish", "St. Mary Parish" },
4205  { "St. Tammany Parish", "St. Tammany Parish" },
4206  { "St.BernardParish", "St. Bernard Parish" },
4207  { "St.CharlesParish", "St. Charles Parish" },
4208  { "St.HelenaParish", "St. Helena Parish" },
4209  { "St.JamesParish", "St. James Parish" },
4210  { "St.JohntheBaptistParish", "St. John the Baptist Parish" },
4211  { "St.LandryParish", "St. Landry Parish" },
4212  { "St.MartinParish", "St. Martin Parish" },
4213  { "St.MaryParish", "St. Mary Parish" },
4214  { "St.TammanyParish", "St. Tammany Parish" },
4215  { "Tangipahoa Parish", "Tangipahoa Parish" },
4216  { "TangipahoaParish", "Tangipahoa Parish" },
4217  { "Tensas Parish", "Tensas Parish" },
4218  { "TensasParish", "Tensas Parish" },
4219  { "Terrebonne Parish", "Terrebonne Parish" },
4220  { "TerrebonneParish", "Terrebonne Parish" },
4221  { "Union Parish", "Union Parish" },
4222  { "UnionParish", "Union Parish" },
4223  { "Vermilion Parish", "Vermilion Parish" },
4224  { "VermilionParish", "Vermilion Parish" },
4225  { "Vernon Parish", "Vernon Parish" },
4226  { "VernonParish", "Vernon Parish" },
4227  { "Washington Parish", "Washington Parish" },
4228  { "WashingtonParish", "Washington Parish" },
4229  { "Webster Parish", "Webster Parish" },
4230  { "WebsterParish", "Webster Parish" },
4231  { "West Baton Rouge Parish", "West Baton Rouge Parish" },
4232  { "West Carroll Parish", "West Carroll Parish" },
4233  { "West Feliciana Parish", "West Feliciana Parish" },
4234  { "WestBatonRougeParish", "West Baton Rouge Parish" },
4235  { "WestCarrollParish", "West Carroll Parish" },
4236  { "WestFelicianaParish", "West Feliciana Parish" },
4237  { "Winn Parish", "Winn Parish" },
4238  { "WinnParish", "Winn Parish" }
4239 };
4240 
4243 
4244 bool s_IsParish ( string& parish ) {
4245 
4246  if ( parish.empty() ) {
4247  return false;
4248  }
4249 
4250  TParishMap::const_iterator parish_find_iter = parishAbbrevMap.find(parish.c_str());
4251  if ( parish_find_iter != parishAbbrevMap.end() ) {
4252  // replace with full parish name
4253  parish = parish_find_iter->second;
4254  return true;
4255  }
4256 
4257  return false;
4258 }
4259 
4262  { "AK", "Alaska" },
4263  { "AL", "Alabama" },
4264  { "Alabama", "Alabama" },
4265  { "Alaska", "Alaska" },
4266  { "American Samoa", "American Samoa" },
4267  { "AR", "Arkansas" },
4268  { "Arizona", "Arizona" },
4269  { "Arkansas", "Arkansas" },
4270  { "AS", "American Samoa" },
4271  { "AZ", "Arizona" },
4272  { "CA", "California" },
4273  { "California", "California" },
4274  { "CO", "Colorado" },
4275  { "Colorado", "Colorado" },
4276  { "Connecticut", "Connecticut" },
4277  { "CT", "Connecticut" },
4278  { "DC", "District of Columbia" },
4279  { "DE", "Delaware" },
4280  { "Delaware", "Delaware" },
4281  { "District of Columbia", "District of Columbia" },
4282  { "FL", "Florida" },
4283  { "Florida", "Florida" },
4284  { "GA", "Georgia" },
4285  { "Georgia", "Georgia" },
4286  { "GU", "Guam" },
4287  { "Guam", "Guam" },
4288  { "Hawaii", "Hawaii" },
4289  { "HI", "Hawaii" },
4290  { "IA", "Iowa" },
4291  { "ID", "Idaho" },
4292  { "Idaho", "Idaho" },
4293  { "IL", "Illinois" },
4294  { "Illinois", "Illinois" },
4295  { "IN", "Indiana" },
4296  { "Indiana", "Indiana" },
4297  { "Iowa", "Iowa" },
4298  { "Kansas", "Kansas" },
4299  { "Kentucky", "Kentucky" },
4300  { "KS", "Kansas" },
4301  { "KY", "Kentucky" },
4302  { "LA", "Louisiana" },
4303  { "Louisiana", "Louisiana" },
4304  { "MA", "Massachusetts" },
4305  { "Maine", "Maine" },
4306  { "Maryland", "Maryland" },
4307  { "Massachusetts", "Massachusetts" },
4308  { "MD", "Maryland" },
4309  { "ME", "Maine" },
4310  { "MI", "Michigan" },
4311  { "Michigan", "Michigan" },
4312  { "Minnesota", "Minnesota" },
4313  { "Mississippi", "Mississippi" },
4314  { "Missouri", "Missouri" },
4315  { "MN", "Minnesota" },
4316  { "MO", "Missouri" },
4317  { "Montana", "Montana" },
4318  { "MS", "Mississippi" },
4319  { "MT", "Montana" },
4320  { "NC", "North Carolina" },
4321  { "ND", "North Dakota" },
4322  { "NE", "Nebraska" },
4323  { "Nebraska", "Nebraska" },
4324  { "Nevada", "Nevada" },
4325  { "New Hampshire", "New Hampshire" },
4326  { "New Jersey", "New Jersey" },
4327  { "New Mexico", "New Mexico" },
4328  { "New York", "New York" },
4329  { "NH", "New Hampshire" },
4330  { "NJ", "New Jersey" },
4331  { "NM", "New Mexico" },
4332  { "North Carolina", "North Carolina" },
4333  { "North Dakota", "North Dakota" },
4334  { "NV", "Nevada" },
4335  { "NY", "New York" },
4336  { "OH", "Ohio" },
4337  { "Ohio", "Ohio" },
4338  { "OK", "Oklahoma" },
4339  { "Oklahoma", "Oklahoma" },
4340  { "OR", "Oregon" },
4341  { "Oregon", "Oregon" },
4342  { "PA", "Pennsylvania" },
4343  { "Pennsylvania", "Pennsylvania" },
4344  { "PR", "Puerto Rico" },
4345  { "Puerto Rico", "Puerto Rico" },
4346  { "Rhode Island", "Rhode Island" },
4347  { "RI", "Rhode Island" },
4348  { "SC", "South Carolina" },
4349  { "SD", "South Dakota" },
4350  { "South Carolina", "South Carolina" },
4351  { "South Dakota", "South Dakota" },
4352  { "Tennessee", "Tennessee" },
4353  { "Texas", "Texas" },
4354  { "TN", "Tennessee" },
4355  { "TX", "Texas" },
4356  { "US Virgin Islands", "US Virgin Islands" },
4357  { "UT", "Utah" },
4358  { "Utah", "Utah" },
4359  { "VA", "Virginia" },
4360  { "Vermont", "Vermont" },
4361  { "VI", "US Virgin Islands" },
4362  { "Virgin Islands", "US Virgin Islands" },
4363  { "Virginia", "Virginia" },
4364  { "VT", "Vermont" },
4365  { "WA", "Washington" },
4366  { "Washington", "Washington" },
4367  { "West Virginia", "West Virginia" },
4368  { "WI", "Wisconsin" },
4369  { "Wisconsin", "Wisconsin" },
4370  { "WV", "West Virginia" },
4371  { "WY", "Wyoming" },
4372  { "Wyoming", "Wyoming" }
4373 };
4374 
4377 
4378 bool s_IsState ( string& state, bool& modified ) {
4379 
4380  if ( state.empty() ) {
4381  return false;
4382  }
4383 
4384  string original = state;
4385  string working = state;
4386 
4387  if ( NStr::StartsWith ( working, "State of ", NStr::eNocase )) {
4388  NStr::TrimPrefixInPlace ( working, "State of ", NStr::eNocase );
4389  }
4390 
4391  if ( NStr::StartsWith ( working, "Commonwealth of ", NStr::eNocase )) {
4392  NStr::TrimPrefixInPlace ( working, "Commonwealth of ", NStr::eNocase );
4393  }
4394 
4395  if ( NStr::EndsWith ( working, " State", NStr::eNocase )) {
4396  NStr::TrimSuffixInPlace ( working, " State", NStr::eNocase );
4397  }
4398 
4399  NStr::TruncateSpacesInPlace ( working );
4400 
4401  TStateMap::const_iterator state_find_iter = stateAbbrevMap.find(working.c_str());
4402  if ( state_find_iter != stateAbbrevMap.end() ) {
4403  // replace with full state name
4404  state = state_find_iter->second;
4405  // report conversion from two-letter, changed capitalization, or prefix/suffix removal
4406  if ( ! NStr::Equal ( original, state )) {
4407  modified = true;
4408  }
4409  return true;
4410  }
4411 
4412  return false;
4413 }
4414 
4416 
4417  if ( country.empty() ) {
4418  return CCountries::e_NoResult;
4419  }
4420 
4421  // make working copy
4422  string original = country;
4423  string working = country;
4424 
4425  // remove flanking quotation marks - if CCountries::NewFixCountry not called
4426  if ( NStr::StartsWith ( working, "\"" ) && NStr::EndsWith ( working, "\"" )) {
4427  working = working.substr ( 1, working.length() - 2 );
4428  }
4429 
4430  // remove flanking spaces
4431  NStr::TruncateSpacesInPlace ( working );
4432 
4433  // separate strings before and after colon
4434  string frst, scnd;
4435  NStr::SplitInTwo ( working, ":", frst, scnd );
4436 
4437  NStr::TruncateSpacesInPlace ( frst );
4438  NStr::TruncateSpacesInPlace ( scnd );
4439 
4440  // confirm that country is USA
4441  if ( ! NStr::EqualNocase ( frst, "USA") && ! NStr::EqualNocase ( frst, "US")) {
4442  // if not, first try rescuing US territory
4443  working = CCountries::NewFixCountry(working, true);
4444  NStr::SplitInTwo ( working, ":", frst, scnd );
4445  NStr::TruncateSpacesInPlace ( frst );
4446  NStr::TruncateSpacesInPlace ( scnd );
4447  if ( ! NStr::EqualNocase ( frst, "USA") && ! NStr::EqualNocase ( frst, "US")) {
4448  return CCountries::e_NotUSA;
4449  }
4450  }
4451 
4452  // split state/county/city clauses at commas
4453  vector<string> components;
4454  NStr::Split(scnd, ",", components);
4455 
4456  // check for only country
4457  if ( components.size() < 1 ) {
4458  country = "USA";
4459  return CCountries::e_Valid;
4460  }
4461 
4462  for ( size_t j = 0; j < components.size(); j++ ) {
4463  // remove flanking spaces around components
4464  NStr::TruncateSpacesInPlace ( components[j] );
4465  s_CompressRunsOfSpaces ( components[j] );
4466  // clean up runon strings like EastBatonRougeParish
4467  if ( NStr::EndsWith ( components[j], "Parish", NStr::eNocase )) {
4468  s_IsParish( components[j] );
4469  }
4470  }
4471 
4472  // bool any_modified = false;
4473  int num_states = 0;
4474  int match = -1;
4475 
4476  // string* first = 0;
4477  // string* last = 0;
4478 
4479  // has multiple components
4480  // int max = components.size() - 1;
4481  for ( int j = 0; j < components.size(); j++ ) {
4482  bool modified = false;
4483  if ( s_IsState ( components[j], modified )) {
4484  /*
4485  if (modified) {
4486  any_modified = true;
4487  }
4488  */
4489  if ( match < 0 ) {
4490  // record position of first s_IsState match
4491  match = j;
4492  }
4493  // count successful matches
4494  num_states++;
4495  /*
4496  if ( j == 0 ) {
4497  first = &(components[j]);
4498  }
4499  if ( j == max ) {
4500  last = &(components[j]);
4501  }
4502  */
4503  }
4504  }
4505 
4506  // generate result
4507  string res;
4508  res.append ("USA: ");
4509  string pfx = "";
4510 
4511  if ( match >= 0 ) {
4512  // move first state matched to first position
4513  res.append ( components[match] );
4514  pfx = ", ";
4515  }
4516 
4517  for ( size_t j = 0; j < components.size(); j++ ) {
4518  if ( j == match) continue;
4519  res.append ( pfx );
4520  res.append ( components[j] );
4521  pfx = ", ";
4522  }
4523 
4524  country = res;
4525 
4526  if ( match < 0 ) {
4527  return CCountries::e_Missing;
4528  } else if ( num_states > 1 ) {
4529  return CCountries::e_Ambiguous;
4530  } else if ( ! NStr::Equal ( original, res )) {
4531  return CCountries::e_Corrected;
4532  }
4533 
4534  return CCountries::e_Valid;
4535 }
4536 
4538 
4540 static bool exceptions_initialized = false;
4541 
4542 void CCountries::ReadUSAExceptionMap (CCountries::TUsaExceptionMap& exceptions, const string& exception_file ) {
4543 
4544  if ( ! exception_file.empty()) {
4545 
4546  TNCBITSVStream my_stream (exception_file);
4547  for ( const auto & row : my_stream ) {
4548  TFieldNo number_of_fields = row. GetNumberOfFields();
4549  if ( number_of_fields != 2 ) continue;
4550  string fr = row[0].Get<string>();
4551  string to = row[1].Get<string>();
4552  exceptions [fr] = to;
4553  }
4554  }
4555 }
4556 
4558 
4559  // clear previous map
4560  exception_map.clear();
4561 
4562  // initialize internal exception map
4563  for ( const auto & itm : exceptions ) {
4564  string fr = itm.first;
4565  string to = itm.second;
4566 
4567  // ensure colon is followed by space to match initial correction
4568  string f1, f2;
4569  NStr::SplitInTwo ( fr, ":", f1, f2 );
4572  if ( ! f1.empty() && ! f2.empty()) {
4573  fr = f1 + ": " + f2;
4574  }
4575 
4576  exception_map [fr] = to;
4577  }
4578 
4579  exceptions_initialized = true;
4580 }
4581 
4582 void CCountries::LoadUSAExceptionMap (const string& exception_file ) {
4583 
4584  if ( ! exception_file.empty()) {
4585 
4586  TUsaExceptionMap exceptions;
4587  ReadUSAExceptionMap ( exceptions, exception_file );
4588  LoadUSAExceptionMap ( exceptions );
4589  }
4590 }
4591 
4592 string CCountries::USAStateCleanup ( const string& country, CCountries::EStateCleanup& type ) {
4593 
4594  // call algorithmic mapping function
4595  string working = country;
4596  type = s_DoUSAStateCleanup ( working );
4597 
4598  // apply exceptions from preloaded data file
4599  if ( exceptions_initialized ) {
4600  string corrected = exception_map [working];
4601  if ( ! corrected.empty()) {
4602  // presence in map here will disambiguate otherwise ambiguous name pair,
4603  // thus self-entries need to be added to the ambiguous state exception list
4604  if ( ! NStr::StartsWith ( corrected, "USA" )) {
4605  type = e_NotUSA;
4606  } else if ( NStr::Equal ( corrected, working ) && NStr::Equal ( corrected, country )) {
4607  type = e_Valid;
4608  } else {
4609  type = e_Corrected;
4610  }
4611  return corrected;
4612  }
4613  }
4614 
4615  if ( ! NStr::StartsWith ( working, "USA" )) {
4616  type = e_NotUSA;
4617  }
4618  return working;
4619 }
4620 
4621 string CCountries::USAStateCleanup ( const string& country ) {
4622 
4624  return USAStateCleanup ( country, type );
4625 }
4626 
4627 // end of RW-1278
4628 
4629 string CCountries::NewFixCountry (const string& test, bool us_territories)
4630 {
4631  // change requested for JIRA:SQD-1410
4632  if (s_SuppressCountryFix(test)) {
4633  if (IsValid(test)) {
4634  return test;
4635  } else {
4636  return kEmptyStr;
4637  }
4638  }
4639 
4640  string input = test;
4641  if (NStr::StartsWith(input, "\"") && NStr::EndsWith(input, "\"")) {
4642  input = input.substr(1, input.length() - 2);
4643  }
4645 
4646  if (NStr::EndsWith(input, ":")) {
4647  input = input.substr(0, input.length() - 1);
4649  }
4650 
4651  string usa1,usa2;
4652  NStr::SplitInTwo(input, ":", usa1, usa2);
4653  if (!usa1.empty() && !usa2.empty()) {
4656  if (NStr::EqualNocase(usa1, "U.S.A.") || NStr::EqualNocase(usa1, "United States") || NStr::EqualNocase(usa1, "United States of America")) {
4657  input = "USA: " + usa2;
4658  }
4659  }
4660 
4661  auto old_name_fix = k_old_country_name_fixes.find(input.c_str());
4662  if (old_name_fix != k_old_country_name_fixes.end()) {
4663  input = old_name_fix->second;
4664  return input;
4665  }
4666 
4667  if (us_territories) {
4668  if ( NStr::StartsWith( input, "Puerto Rico", NStr::eNocase) || NStr::StartsWith( input, "Guam", NStr::eNocase) || NStr::StartsWith( input, "American Samoa", NStr::eNocase) ) {
4669  input = "USA: " + input;
4672  return input;
4673  } else if ( NStr::StartsWith( input, "Virgin Islands", NStr::eNocase) ) {
4674  input = "USA: US " + input;
4677  return input;
4678  }
4679  }
4680 
4681  if (IsValid(input)) {
4683  return input;
4684  }
4685  string new_country = WholeCountryFix(input);
4686  if (!new_country.empty())
4687  return new_country;
4688 
4689  bool too_many_countries = false;
4690  bool bad_cap = false;
4691  vector<string> countries = x_Tokenize(input);
4692  string valid_country;
4693  string orig_valid_country;
4694 
4695  x_FindCountryName(k_country_name_fixes, countries, valid_country, orig_valid_country, too_many_countries, bad_cap);
4696  if (valid_country.empty()) {
4697  x_FindCountryName(k_subregion_fixes, countries, valid_country, orig_valid_country, too_many_countries, bad_cap);
4698  }
4699 
4700  if (!valid_country.empty() && !too_many_countries)
4701  too_many_countries = ContainsMultipleCountryNames (input);
4702 
4703  if (!valid_country.empty() && too_many_countries && valid_country == input)
4704  {
4705  string str1,str2;
4706  NStr::SplitInTwo(valid_country,":",str1,str2);
4707  if (!str1.empty() && !str2.empty() && !NStr::StartsWith(str2," "))
4708  new_country = str1+": "+str2;
4709 
4711  }
4712  else if(!valid_country.empty() && !too_many_countries)
4713  {
4714  // find valid_country in input
4715  size_t pos = NStr::Find(input,orig_valid_country);
4716  // save preceeding string without trailing spaces or delimiters ":,"
4717  string before = input.substr(0,pos);
4718 
4721  // save trailing string without initial spaces or delimiters
4722  string after = input.substr(pos+orig_valid_country.length());
4723  x_RemoveDelimitersFromEnds(after, true);
4725  if (bad_cap) new_country = GetCorrectedCountryCapitalization(valid_country);
4726  else new_country = valid_country;
4727  if (!before.empty() || !after.empty()) {
4728  if (NStr::Find(valid_country, ":") == NPOS) {
4729  new_country += ": ";
4730  } else {
4731  new_country += ", ";
4732  }
4733  }
4734  if (!before.empty())
4735  new_country += before;
4736  if (!before.empty() && !after.empty() && !NStr::Equal(after, ")"))
4737  new_country += ", ";
4738  if (!after.empty())
4739  new_country += after;
4741  }
4742 
4743  return new_country;
4744 }
4745 
4746 
4748 {
4749  // requested in SQD-4516
4750  bool rval = false;
4751  int count = 0;
4752  for (size_t i = 0; i < country.length(); i++) {
4753  if (country[i] == ':') {
4754  count++;
4755  if (count > 1) {
4756  country[i] = ',';
4757  rval = true;
4758  }
4759  }
4760  }
4761  return rval;
4762 }
4763 
4764 
4765 string CCountries::CountryFixupItem(const string &input, bool capitalize_after_colon)
4766 {
4767  string country = NewFixCountry (input);
4768  string new_country = country;
4769  SIZE_TYPE country_end_pos = NStr::Find(country,":");
4770  if (country_end_pos != NPOS)
4771  {
4772  SIZE_TYPE pos = country_end_pos;
4773  while (country[pos] == ',' || country[pos] == ':' || isspace((unsigned char)country[pos]))
4774  {
4775  pos++;
4776  }
4777  string after = country.substr(pos);
4778  if (after.empty()) {
4779  if (pos > country_end_pos) {
4780  new_country = country.substr(0, country_end_pos);
4781  }
4782  } else {
4784  if (capitalize_after_colon)
4785  after = CapitalizeFirstLetterOfEveryWord (after);
4786  new_country = country.substr(0,country_end_pos);
4787  new_country += ": " + after;
4788  }
4789  }
4790  return new_country;
4791 }
4792 
4793 
4794 // SubSource Qual Fixups
4797 
4799  { "adult", "adult" },
4800  { "egg", "egg" },
4801  { "juvenile", "juvenile" },
4802  { "larva", "larva" }
4803 };
4804 
4806 
4807 
4809 {
4810  string fix = value;
4811 
4812  TStaticQualFixMap::const_iterator it = sc_DevStagePairs.find(value.c_str());
4813  if (it != sc_DevStagePairs.end()) {
4814  fix = it->second;
4815  }
4816  return fix;
4817 }
4818 
4819 
4821  { "hemocyte", "hemocyte" },
4822  { "hepatocyte", "hepatocyte" },
4823  { "lymphocyte", "lymphocyte" },
4824  { "neuroblast", "neuroblast" }
4825 };
4826 
4828 
4830 {
4831  string fix = value;
4832 
4833  TStaticQualFixMap::const_iterator it = sc_CellTypePairs.find(value.c_str());
4834  if (it != sc_CellTypePairs.end()) {
4835  fix = it->second;
4836  }
4837  return fix;
4838 
4839 }
4840 
4843 
4845 static bool s_QualFixupMapsInitialized = false;
4846 
4847 static void s_ProcessQualMapLine(const CTempString& line, TQualFixMap& qual_map)
4848 {
4849  vector<CTempString> tokens;
4850  NStr::Split(line, "\t", tokens);
4851  if (tokens.size() > 1) {
4852  qual_map[tokens[0]] = tokens[1];
4853  }
4854 }
4855 
4856 
4857 void s_AddOneDataFile(const string& file_name, const string& data_name,
4858  const char **built_in, size_t num_built_in,
4859  TQualFixMap& qual_map)
4860 {
4861  string file = g_FindDataFile(file_name);
4862  CRef<ILineReader> lr;
4863  if (!file.empty()) {
4864  try {
4865  lr = ILineReader::New(file);
4866  } NCBI_CATCH("s_InitializeQualMaps")
4867  }
4868 
4869  if (lr.Empty()) {
4870  if (built_in == NULL) {
4871  ERR_POST(Note << "No data for " + data_name);
4872  } else {
4873  if (getenv("NCBI_DEBUG")) {
4874  ERR_POST(Note << "Falling back on built-in data for " + data_name);
4875  }
4876  for (size_t i = 0; i < num_built_in; i++) {
4877  const char *p = built_in[i];
4878  s_ProcessQualMapLine(p, qual_map);
4879  }
4880  }
4881  } else {
4882  if (getenv("NCBI_DEBUG")) {
4883  ERR_POST(Note << "Reading from " + file + " for " + data_name);
4884  }
4885  do {
4886  s_ProcessQualMapLine(*++*lr, qual_map);
4887  } while (!lr->AtEOF());
4888  }
4889 }
4890 
4891 #include "isolation_sources.inc"
4892 
4893 static void s_InitializeQualMaps(void)
4894 {
4895  CFastMutexGuard GUARD(s_QualFixMutex);
4897  return;
4898  }
4899 
4900  // tissue types
4901  s_AddOneDataFile("isolation_sources.txt", "isolation sources", (const char **)k_isolation_sources, sizeof(k_isolation_sources) / sizeof(char *), s_IsolationSourceMap);
4903 }
4904 
4905 
4906 
4907 
4908 
4910 {
4911  string fix = value;
4912 
4914 
4916  if (it != s_IsolationSourceMap.end()) {
4917  return it->second;
4918  }
4919 
4920  size_t max = sizeof(sm_ValidSexQualifierTokens) / sizeof(const char*);
4921  for (size_t i = 0; i < max; i++) {
4924  break;
4925  }
4926  }
4927 
4928  fix = COrgMod::FixHostCapitalization(fix);
4929  fix = FixDevStageCapitalization(fix);
4930  fix = FixCellTypeCapitalization(fix);
4931 
4932  return fix;
4933 }
4934 
4935 
4937 {
4938  string fix = value;
4939 
4942  if (it != s_IsolationSourceMap.end()) {
4943  return it->second;
4944  }
4945 
4946 
4947  size_t max = sizeof(sm_ValidSexQualifierTokens) / sizeof(const char*);
4948  for (size_t i = 0; i < max; i++) {
4951  break;
4952  }
4953  }
4954 
4955  fix = COrgMod::FixHostCapitalization(fix);
4956  fix = FixDevStageCapitalization(fix);
4957  fix = FixCellTypeCapitalization(fix);
4958 
4959  return fix;
4960 }
4961 
4962 
4964 {
4966 }
4967 
4968 
4969 string CSubSource::FixCapitalization(TSubtype subtype, const string& value)
4970 {
4971  string new_val = value;
4972  switch (subtype) {
4974  new_val = FixSexQualifierValue(value);
4975  if (NStr::IsBlank(new_val)) {
4976  new_val = value;
4977  }
4978  break;
4981  break;
4983  new_val = FixLabHostCapitalization(value);
4984  break;
4987  break;
4989  new_val = FixDevStageCapitalization(value);
4990  break;
4992  new_val = FixCellTypeCapitalization(value);
4993  break;
4994  default:
4995  new_val = value;
4996  break;
4997  }
4998  return new_val;
4999 }
5000 
5001 
5003 {
5004  if (!IsSetSubtype() || !IsSetName()) {
5005  return;
5006  }
5007 
5008  TSubtype subtype = GetSubtype();
5009 
5010  if (subtype == CSubSource::eSubtype_sex) {
5011  string upr = GetName();
5012  string lwr = upr;
5013  NStr::ToLower(lwr);
5014  if (! NStr::Equal(upr, lwr)) {
5015  SetName(lwr);
5016  }
5017  }
5018 
5019  const string& name = GetName();
5020 
5021  string new_val = FixCapitalization(subtype, name);
5022 
5023  if (!NStr::IsBlank(new_val)) {
5024  SetName(new_val);
5025  }
5026 
5027 }
5028 
5029 
5030 string CSubSource::AutoFix(TSubtype subtype, const string& value)
5031 {
5032  string new_val;
5033  switch (subtype) {
5035  new_val = CCountries::NewFixCountry(value);
5036  break;
5038  new_val = FixDateFormat(value);
5039  break;
5041  new_val = FixLatLonFormat(value);
5042  break;
5044  new_val = FixSexQualifierValue(value);
5045  break;
5047  new_val = FixAltitude(value);
5048  break;
5049  default:
5050  break;
5051  }
5052  return new_val;
5053 }
5054 
5055 
5057 {
5058  if (!IsSetSubtype() || !IsSetName()) {
5059  return;
5060  }
5061 
5062  TSubtype subtype = GetSubtype();
5063  string new_val = AutoFix(subtype, GetName());
5064 
5065  if (!NStr::IsBlank(new_val)) {
5066  SetName(new_val);
5067  } else if (subtype == CSubSource::eSubtype_sex) {
5068  string upr = GetName();
5069  string lwr = upr;
5070  NStr::ToLower(lwr);
5071  if (! NStr::Equal(upr, lwr)) {
5072  SetName(lwr);
5073  }
5074  }
5075 }
5076 
5077 
5078 
// NOTE (applies to both arrays below): if string A is a prefix of string B, string B must be
// placed BEFORE string A, i.e. the longer string comes first.
5081 static const char * s_RemovableCultureNotes[] = {
5082  "[BankIt_uncultured16S_wizard]; [universal primers]; [tgge]",
5083  "[BankIt_uncultured16S_wizard]; [universal primers]; [dgge]",
5084  "[BankIt_uncultured16S_wizard]; [universal primers]",
5085  "[BankIt_cultured16S_wizard]",
5086  "[BankIt_organellerRNA_wizard]",
5087  "[BankIt_ITS_wizard]; [rRNAITS_notfound]",
5088  "[BankIt_ITS_wizard]",
5089  "[uncultured (using universal primers)]",
5090  "[uncultured (using universal primers) bacterial source]",
5091  "[cultured bacterial source]",
5092  "[enrichment culture bacterial source]",
5093  "[mixed bacterial source (cultured and uncultured)]",
5094  "[uncultured]; [universal primers]",
5095  "[mixed bacterial source]",
5096  "[virus wizard]",
5097  "[cDNA derived from mRNA, purified viral particles]",
5098  "[cDNA derived from mRNA, whole cell/tissue lysate]",
5099  "[cDNA derived from genomic RNA, whole cell/tissue lysate]",
5100  "[cDNA derived from genomic RNA, purified viral particles]",
5101  "[universal primers]",
5102  "[uncultured; wizard]",
5103  "[uncultured; wizard; spans unknown]",
5104  "[cultured; wizard]",
5105  "[cultured; wizard; spans unknown]",
5106  "[intergenic wizard]",
5107  "[intergenic wizard; spans unknown]",
5108  "[Microsatellite wizard]",
5109  "[Microsatellite wizard; multiple repeats]",
5110  "[D-loop wizard]",
5111  "[D-loop wizard; spans unknown]",
5112  "[D-loop wizard; spans known]",
5113  NULL
5114 };
5115 
5116 static const char * s_ReplaceableCultureNotes[] = {
5117  "[BankIt_uncultured16S_wizard]; [species_specific primers]; [tgge]",
5118  "[BankIt_uncultured16S_wizard]; [species_specific primers]; [dgge]",
5119  "[BankIt_uncultured16S_wizard]; [species_specific primers]",
5120  "[uncultured (with species-specific primers)]",
5121  "[uncultured]; [amplified with species-specific primers]",
5122  "[uncultured (using species-specific primers) bacterial source]",
5123  "[amplified with species-specific primers]",
5124  NULL
5125 };
5126 
5127 
5129 {
5130  for (size_t i = 0; s_RemovableCultureNotes[i] != NULL; i++) {
5132  if (pos != string::npos) {
5133  return true;
5134  }
5135  }
5136  for (size_t i = 0; s_ReplaceableCultureNotes[i] != NULL; i++) {
5138  return true;
5139  }
5140  }
5141  return false;
5142 }
5143 
5144 
5145 void CSubSource::RemoveCultureNotes (string& value, bool is_species_level)
5146 {
5147  if (NStr::IsBlank(value)) {
5148  return;
5149  }
5150 
5151  for (size_t i = 0; s_RemovableCultureNotes[i] != NULL; i++) {
5152  string to_remove = s_RemovableCultureNotes[i];
5153  size_t remove_len = to_remove.length();
5154  size_t pos = NStr::FindNoCase(value, to_remove);
5155  while (pos != NPOS) {
5156  size_t extra_len = strspn (value.c_str() + pos + remove_len, " ;");
5157  value = value.substr(0, pos) + value.substr(pos + remove_len + extra_len);
5158  pos = NStr::FindNoCase(value, to_remove);
5159  }
5160  }
5161  // remove leading/trailing semicolons
5162  while (NStr::StartsWith(value, " ") || NStr::StartsWith(value, ";")) {
5163  value = value.substr(1);
5164  }
5165  while (NStr::EndsWith(value, " ") || NStr::EndsWith(value, ";")) {
5166  value = value.substr(0, value.length() - 1);
5167  }
5168 
5169  if (is_species_level) {
5170  for (size_t i = 0; s_ReplaceableCultureNotes[i] != NULL; i++) {
5172  value = "amplified with species-specific primers";
5173  break;
5174  }
5175  }
5176  }
5177 }
5178 
5179 
5180 void CSubSource::RemoveCultureNotes (bool is_species_level)
5181 {
5182  if (IsSetName()) {
5183  RemoveCultureNotes(SetName(), is_species_level);
5184  if (NStr::IsBlank(GetName())) {
5185  ResetName();
5186  }
5187  }
5188 }
5189 
5190 
5191 // CCountryLine
5193 (const string & country_name, double y, double min_x, double max_x, double scale)
5194 : m_CountryName(country_name) ,
5195  m_Scale (scale)
5196 {
5197  m_Y = x_ConvertLat(y);
5198  m_MinX = x_ConvertLon(min_x);
5199  m_MaxX = x_ConvertLon(max_x);
5200 
5201 }
5202 
5203 
5205 {
5206 }
5207 
5208 
5209 #define EPSILON 0.001
5210 
5211 int CCountryLine::ConvertLat (double y, double scale)
5212 {
5213 
5214  int val = 0;
5215 
5216  if (y < -90.0) {
5217  y = -90.0;
5218  }
5219  if (y > 90.0) {
5220  y = 90.0;
5221  }
5222 
5223  if (y > 0) {
5224  val = (int) (y * scale + EPSILON);
5225  } else {
5226  val = (int) (-(-y * scale + EPSILON));
5227  }
5228 
5229  return val;
5230 }
5231 
5232 
5234 {
5235  return ConvertLat(y, m_Scale);
5236 }
5237 
5238 int CCountryLine::ConvertLon (double x, double scale)
5239 {
5240 
5241  int val = 0;
5242 
5243  if (x < -180.0) {
5244  x = -180.0;
5245  }
5246  if (x > 180.0) {
5247  x = 180.0;
5248  }
5249 
5250  if (x > 0) {
5251  val = (int) (x * scale + EPSILON);
5252  } else {
5253  val = (int) (-(-x * scale + EPSILON));
5254  }
5255 
5256  return val;
5257 }
5258 
5259 
5261 {
5262  return ConvertLon(x, m_Scale);
5263 }
5264 
5265 
5266 CCountryExtreme::CCountryExtreme (const string & country_name, int min_x, int min_y, int max_x, int max_y)
5267 : m_CountryName(country_name) , m_MinX (min_x), m_MinY (min_y), m_MaxX(max_x), m_MaxY (max_y)
5268 {
5269  m_Area = (1 + m_MaxY - m_MinY) * (1 + m_MaxX - m_MinX);
5270  size_t pos = NStr::Find(country_name, ":");
5271  if (pos == NPOS) {
5272  m_Level0 = country_name;
5273  m_Level1.clear();
5274  } else {
5275  m_Level0 = country_name.substr(0, pos);
5277  m_Level1 = country_name.substr(pos + 1);
5279  }
5280 
5281 }
5282 
5283 
5285 {
5286 
5287 }
5288 
5289 
5291 {
5292  if (min_x < m_MinX) {
5293  m_MinX = min_x;
5294  return true;
5295  } else {
5296  return false;
5297  }
5298 }
5299 
5300 
5302 {
5303  if (max_x > m_MaxX) {
5304  m_MaxX = max_x;
5305  return true;
5306  } else {
5307  return false;
5308  }
5309 }
5310 
5311 
5313 {
5314  if (min_y < m_MinY) {
5315  m_MinY = min_y;
5316  return true;
5317  } else {
5318  return false;
5319  }
5320 }
5321 
5322 
5324 {
5325  if (max_y > m_MaxY) {
5326  m_MaxY = max_y;
5327  return true;
5328  } else {
5329  return false;
5330  }
5331 }
5332 
5333 
5335 {
5336  if (line) {
5337  SetMinX(line->GetMinX());
5338  SetMaxX(line->GetMaxX());
5339  SetMinY(line->GetY());
5340  SetMaxY(line->GetY());
5341  m_Area += 1 + line->GetMaxX() - line->GetMinX();
5342  }
5343 }
5344 
5345 
5346 bool CCountryExtreme::DoesOverlap(const CCountryExtreme* other_block) const
5347 {
5348  if (!other_block) {
5349  return false;
5350  } else if (m_MaxX >= other_block->GetMinX()
5351  && m_MaxX <= other_block->GetMaxX()
5352  && m_MaxY >= other_block->GetMinY()
5353  && m_MinY <= other_block->GetMaxY()) {
5354  return true;
5355  } else if (other_block->GetMaxX() >= m_MinX
5356  && other_block->GetMaxX() <= m_MaxX
5357  && other_block->GetMaxY() >= m_MinY
5358  && other_block->GetMinY() <= m_MaxY) {
5359  return true;
5360  } else {
5361  return false;
5362  }
5363 }
5364 
5365 
5366 bool CCountryExtreme::PreferTo(const CCountryExtreme* other_block, const string country, const string province, const bool prefer_new) const
5367 {
5368  if (!other_block) {
5369  return true;
5370  }
5371 
5372  // if no preferred country, these are equal
5373  if (NStr::IsBlank(country)) {
5374  return prefer_new;
5375  }
5376 
5377  // if match to preferred country
5378  if (NStr::EqualNocase(country, m_Level0)) {
5379  // if best was not preferred country, take new match
5380  if (!NStr::EqualNocase(country, other_block->GetLevel0())) {
5381  return true;
5382  }
5383  // if match to preferred province
5384  if (!NStr::IsBlank(province) && NStr::EqualNocase(province, m_Level1)) {
5385  // if best was not preferred province, take new match
5386  if (!NStr::EqualNocase(province, other_block->GetLevel1())) {
5387  return true;
5388  }
5389  }
5390 
5391  // if both match province, or neither does, or no preferred province, take smallest
5392  return prefer_new;
5393  }
5394 
5395  // if best matches preferred country, keep
5396  if (NStr::EqualNocase(country, other_block->GetLevel0())) {
5397  return false;
5398  }
5399 
5400  // otherwise take smallest
5401  return prefer_new;
5402 }
5403 
5404 
5406  : m_Lat(lat),
5407  m_Lon(lon),
5408  m_LandDistance(-1),
5409  m_WaterDistance(-1),
5410  m_ClaimedDistance(-1)
5411 {}
5412 
5413 
5415 {
5417 
5418  // compare guesses or closest regions to indicated country and province
5419  if (!NStr::IsBlank(GetGuessCountry())) {
5420  // if top level countries match
5421  if (NStr::EqualNocase(country, GetGuessCountry())) {
5423  // if both are empty, still call it a match
5424  if (NStr::EqualNocase(province, GetGuessProvince())) {
5426  }
5427  }
5428  // if they don't match, are they closest?
5429  if (!(rval & CLatLonCountryId::fCountryMatch)) {
5430  if (NStr::EqualNocase(country, GetClosestCountry())) {
5432  if (NStr::EqualNocase(province, GetClosestProvince())) {
5434  }
5435  }
5436  } else if (!(rval & CLatLonCountryId::fProvinceMatch) && !NStr::IsBlank(province)) {
5437  if (NStr::EqualNocase (province, GetClosestProvince())) {
5439  }
5440  }
5441  }
5442 
5443  if (!NStr::IsBlank(GetGuessWater())) {
5444  // was the non-approved body of water correctly indicated?
5445  if (NStr::EqualNocase(country, GetGuessWater())) {
5447  } else if (NStr::EqualNocase(country, GetClosestWater())) {
5449  }
5450  }
5451 
5461  }
5462  } else {
5466  }
5467  }
5468  }
5469  return rval;
5470 }
5471 
5472 
5474 {
5475 }
5476 
5477 
5478 #include "lat_lon_country.inc"
5479 static const size_t k_NumLatLonCountryText = ArraySize(s_DefaultLatLonCountryText);
5480 
5481 #include "lat_lon_water.inc"
5482 static const size_t k_NumLatLonWaterText = ArraySize(s_DefaultLatLonWaterText);
5483 
5484 void CLatLonCountryMap::x_InitFromDefaultList(const char * const *list, int num)
5485 {
5486  if (getenv("NCBI_DEBUG")) {
5487  ERR_POST(Note << "Falling back on built-in data for latlon / water data.");
5488  }
5489  // initialize list of country lines
5490  m_CountryLineList.clear();
5491  m_Scale = 20.0;
5492  string current_country;
5493 
5494  for (int i = 0; i < num; i++) {
5495  CTempString line = list[i];
5496  if (line[0] == '-') {
5497  // skip comment
5498  } else if (isalpha ((unsigned char)line[0])) {
5499  current_country = line;
5500  } else if (isdigit ((unsigned char)line[0])) {
5501  m_Scale = NStr::StringToDouble(line);
5502  } else {
5503  vector<string> tokens;
5504  NStr::Split(line, "\t", tokens);
5505  if (tokens.size() > 3) {
5506  double x = NStr::StringToDouble(tokens[1]);
5507  for (size_t j = 2; j < tokens.size() - 1; j+=2) {
5508  m_CountryLineList.push_back(new CCountryLine(current_country, x, NStr::StringToDouble(