NCBI C++ ToolKit
SubSource.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: SubSource.cpp 102247 2024-04-10 22:44:36Z kans $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: .......
27  *
28  * File Description:
29  * .......
30  *
31  * Remark:
32  * This code was originally generated by application DATATOOL
33  * using the following specifications:
34  * 'seqfeat.asn'.
35  */
36 
37 // standard includes
38 #include <ncbi_pch.hpp>
39 #include <serial/enumvalues.hpp>
40 
41 // generated includes
43 
44 #include <math.h>
46 #include <corelib/ncbitime.hpp>
47 
49 #include <mutex>
50 #include <util/compile_time.hpp>
51 
52 // generated classes
53 
55 
56 BEGIN_objects_SCOPE // namespace ncbi::objects::
57 
// Out-of-class definitions of two static CSubSource data members. Each is a
// default-constructed (empty) unique_ptr<CLatLonCountryMap>; nothing in this
// part of the file populates them.
// NOTE(review): presumably lazily-built lookup tables for lat-lon -> country
// and lat-lon -> body-of-water checks; confirm against the code that fills them.
58 unique_ptr<CLatLonCountryMap> CSubSource::m_LatLonCountryMap;
59 unique_ptr<CLatLonCountryMap> CSubSource::m_LatLonWaterMap;
60 
61 
62 // destructor
64 {
65 }
66 
67 
69 {
71  return false;
72  }
73 
75  string fromEnv = env.Get("NCBI_GEO_LOC_NAME_FOR_COUNTRY");
76  NStr::ToLower(fromEnv);
77  if (fromEnv == "true") {
78  return true;
79  } else if (fromEnv == "false") {
80  return false;
81  }
82 
84  string fromConfig = reg.GetString("OrgSubSource", "UseGeoLocNameForCountry", "off");
85  NStr::ToLower(fromConfig);
86  if (fromConfig == "1" || fromConfig == "on" || fromConfig == "true" || fromConfig == "yes") {
87  return true;
88  }
89 
90  return false;
91 }
92 
93 
95 {
96  static bool value = s_init_UseGeoLocNameForCountry();
97  return value;
98 }
99 
100 
101 void CSubSource::GetLabel(string* str) const
102 {
103  *str += '/';
104  string type_name;
105  if (GetSubtype() == eSubtype_other) {
106  type_name = "other";
107  } else {
108  try {
109  // eVocabulary_insdc has some special cases not (historically)
110  // used here.
112  replace(type_name.begin(), type_name.end(), '_', '-');
113  } catch (const CSerialException&) {
114  type_name = "unknown";
115  }
116  }
117  *str += type_name;
118  *str += '=';
119  *str += GetName();
120  if (IsSetAttrib()) {
121  *str += " (";
122  *str += GetAttrib();
123  *str += ")";
124  }
125 }
126 
127 
129  EVocabulary vocabulary)
130 {
131  string name = NStr::TruncateSpaces(str);
132  NStr::ToLower(name);
133  replace(name.begin(), name.end(), '_', '-');
134  replace(name.begin(), name.end(), ' ', '-');
135 
136  if ( NStr::EqualNocase(name, "note") ||
137  NStr::EqualNocase(name, "subsource-note") ||
138  NStr::EqualNocase(name, "subsrc-note") ||
139  NStr::EqualNocase(name, "note-subsource")) {
140  return eSubtype_other;
141  } else if (vocabulary == eVocabulary_insdc) {
142  // consider a table if more special cases arise.
143  if (name == "insertion-seq") {
145  } else if (name == "plasmid") {
146  return eSubtype_plasmid_name;
147  } else if (name == "transposon") {
149  } else if (name == "sub-clone") {
150  return eSubtype_subclone;
151  }
152  }
153  return ENUM_METHOD_NAME(ESubtype)()->FindValue(name);
154 }
155 
156 
158  EVocabulary vocabulary)
159 {
160 
161  string name = NStr::TruncateSpaces(str);
162  NStr::ToLower(name);
163  replace(name.begin(), name.end(), '_', '-');
164  replace(name.begin(), name.end(), ' ', '-');
165 
166  if ( NStr::EqualNocase(name, "note") ||
167  NStr::EqualNocase(name, "subsource-note") ||
168  NStr::EqualNocase(name, "subsrc-note") ||
169  NStr::EqualNocase(name, "note-subsource")) {
170  return true;
171  }
172  if (vocabulary == eVocabulary_insdc) {
173  // consider a table if more special cases arise.
174  if (name == "insertion-seq" ||
175  name == "plasmid" ||
176  name == "transposon" ||
177  name == "sub-clone") {
178  return true;
179  }
180  }
181  return ENUM_METHOD_NAME(ESubtype)()->IsValidName(name);
182 }
183 
184 
186  EVocabulary vocabulary)
187 {
188  if (stype == CSubSource::eSubtype_other) {
189  return "note";
190  } else if (vocabulary == eVocabulary_insdc) {
191  switch (stype) {
192  case eSubtype_subclone: return "sub_clone";
193  case eSubtype_plasmid_name: return "plasmid";
194  case eSubtype_transposon_name: return "transposon";
195  case eSubtype_insertion_seq_name: return "insertion_seq";
196  default:
197  return NStr::Replace
198  (ENUM_METHOD_NAME(ESubtype)()->FindName(stype, true),
199  "-", "_");
200  }
201  } else {
202  return ENUM_METHOD_NAME(ESubtype)()->FindName(stype, true);
203  }
204 }
205 
206 
207 
209 {
210  return subtype != eSubtype_chromosome
211  && subtype != eSubtype_sex
212  && subtype != eSubtype_germline
213  && subtype != eSubtype_rearranged
214  && subtype != eSubtype_plasmid_name
215  && subtype != eSubtype_segment
216  && subtype != eSubtype_country
217  && subtype != eSubtype_transgenic
218  && subtype != eSubtype_environmental_sample
219  && subtype != eSubtype_lat_lon
220  && subtype != eSubtype_collection_date
221  && subtype != eSubtype_collected_by
222  && subtype != eSubtype_identified_by
223  && subtype != eSubtype_fwd_primer_seq
224  && subtype != eSubtype_rev_primer_seq
225  && subtype != eSubtype_fwd_primer_name
226  && subtype != eSubtype_rev_primer_name
227  && subtype != eSubtype_metagenomic
228  && subtype != eSubtype_altitude
229  && subtype != eSubtype_clone;
230 }
231 
232 
233 bool CSubSource::NeedsNoText(const TSubtype& subtype)
234 {
235  if (subtype == eSubtype_germline
236  || subtype == eSubtype_rearranged
237  || subtype == eSubtype_transgenic
238  || subtype == eSubtype_environmental_sample
239  || subtype == eSubtype_metagenomic) {
240  return true;
241  } else {
242  return false;
243  }
244 }
245 
246 
248 {
249  if (subtype == eSubtype_frequency
250  || subtype == eSubtype_insertion_seq_name
251  || subtype == eSubtype_phenotype
252  || subtype == eSubtype_plastid_name
253  || subtype == eSubtype_transposon_name
254  || subtype == eSubtype_fwd_primer_seq
255  || subtype == eSubtype_rev_primer_seq
256  || subtype == eSubtype_fwd_primer_name
257  || subtype == eSubtype_rev_primer_name
258  || subtype == eSubtype_whole_replicon) { // metagenomic subsrc qualifier taken off this list: GB-3384
259  return true;
260  } else {
261  return false;
262  }
263 }
264 
265 
266 bool CSubSource::IsDayValueOkForMonth(int day, int month, int year)
267 {
268  if (month < 1 || month > 12 || day < 1) {
269  return false;
270  }
271  bool rval = true;
272  if (year < 100) {
273  year += 2000;
274  } else if (year > 3000) {
275  return false;
276  } else if (year < 1538) {
277  return false;
278  }
279  CTime month_o(year, month, 1);
280  if (day > month_o.DaysInMonth()) {
281  rval = false;
282  }
283  return rval;
284 }
285 
286 
288 {
289  if (NStr::IsBlank(test)) {
291  "collection-date string is blank");
292  }
293  string str = NStr::TruncateSpaces(test);
294 
295  if (IsISOFormatDate(str)) {
296  return GetDateFromISODate(str);
297  }
298 
299  size_t pos = NStr::Find(str, "-");
300  string year;
301  string month;
302  string day;
303 
304  if (pos == NPOS) {
305  year = str;
306  } else {
307  size_t pos2 = NStr::Find(str, "-", pos + 1);
308  if (pos2 == NPOS) {
309  month = str.substr(0, pos);
310  year = str.substr(pos + 1);
311  if (NStr::IsBlank(month)) {
313  "collection-date string is improperly formatted");
314  }
315  } else {
316  day = str.substr(0, pos);
317  month = str.substr(pos + 1, pos2 - pos - 1);
318  year = str.substr(pos2 + 1);
319  if (NStr::IsBlank(month) || NStr::IsBlank(day)) {
321  "collection-date string is improperly formatted");
322  }
323  }
324  }
325 
326  int month_val = 0;
327  if (!NStr::IsBlank(month)) {
328  try {
329  month_val = CTime::MonthNameToNum(month);
330  } catch (const CTimeException&) {
332  "collection-date string has invalid month");
333  }
334  }
335 
336  int day_val = 0;
337  if (!NStr::IsBlank(day)) {
338  try {
339  day_val = NStr::StringToInt (day);
340  if (day_val < 1) {
342  "collection-date string has invalid day value");
343  }
344  } catch ( const exception& ) {
345  // threw exception while converting to int
347  "collection-date string is improperly formatted");
348  }
349  }
350 
351  if (NStr::IsBlank(year)) {
353  "collection-date string is improperly formatted");
354  }
355 
356  int year_val = 0;
357  try {
358  year_val = NStr::StringToInt (year);
359  } catch ( const exception& ) {
360  // threw exception while converting to int
362  "collection-date string is improperly formatted");
363  }
364 
365  /*
366  if (year_val < 1000 || year_val >= 2100) {
367  NCBI_THROW (CException, eUnknown,
368  "collection-date year is out of range");
369  }
370  */
371 
372  if (year_val < 1000) {
374  "collection-date year is out of range");
375  }
376 
377  if (year_val >= 2100) {
379  "collection-date year is out of range");
380  }
381 
382  if (day_val > 0 && month_val > 0 && !IsDayValueOkForMonth(day_val, month_val, year_val)) {
384  "collection-date day is greater than monthly maximum");
385  }
386 
387  CRef<CDate> date(new CDate);
388 
389  date->SetStd().SetYear (year_val);
390  if (month_val > 0) {
391  date->SetStd().SetMonth (month_val);
392  }
393  if (day_val > 0) {
394  date->SetStd().SetDay (day_val);
395  }
396 
397  time_t t;
398 
399  time(&t);
400 
401  CDate now(t);
402 
403  /*
404  if (IsCollectionDateAfterTime(*date, t)) {
405  NCBI_THROW (CException, eUnknown,
406  "collection-date year is out of range");
407  }
408  */
409 
410  return date;
411 }
412 
413 
414 bool CSubSource::IsCollectionDateAfterTime(const string& collection_date, time_t t, bool& bad_format)
415 {
416  bad_format = false;
417  bool in_future = false;
418  vector<string> pieces;
419  NStr::Split(collection_date, "/", pieces);
420  if (pieces.size() > 2) {
421  bad_format = true;
422  } else {
423  ITERATE(vector<string>, it, pieces) {
424  CRef<CDate> coll_date = DateFromCollectionDate (*it);
425  if (!coll_date) {
426  bad_format = true;
427  } else if (IsCollectionDateAfterTime(*coll_date, t)) {
428  in_future = true;
429  }
430  }
431  }
432  return in_future;
433 }
434 
435 
436 bool CSubSource::IsCollectionDateAfterTime(const CDate& collection_date, time_t t)
437 {
438  CDate now(t);
439  if (collection_date.Compare(now) == CDate::eCompare_after) {
440  return true;
441  } else {
442  return false;
443  }
444 }
445 
446 
447 bool CSubSource::IsCollectionDateAfterTime(const CDate& collection_date, CTime& ctime)
448 {
449  time_t t = ctime.GetTimeT();
450  return IsCollectionDateAfterTime(collection_date, t);
451 }
452 
453 
454 void CSubSource::IsCorrectDateFormat(const string& date_string, bool& bad_format, bool& in_future)
455 {
456  bad_format = false;
457  in_future = false;
458 
459  vector<string> pieces;
460  NStr::Split(date_string, "/", pieces);
461  if (pieces.size() > 2) {
462  bad_format = true;
463  return;
464  } else if (pieces.size() == 2) {
465  bool first_bad = false;
466  bool first_future = false;
467  bool second_bad = false;
468  bool second_future = false;
469  IsCorrectDateFormat(pieces[0], first_bad, first_future);
470  IsCorrectDateFormat(pieces[1], second_bad, second_future);
471  bad_format = first_bad || second_bad;
472  if (!bad_format) {
473  in_future = first_future || second_future;
474  }
475  return;
476  }
477 
478  try {
479  CRef<CDate> coll_date = CSubSource::DateFromCollectionDate (date_string);
480 
481  if (!IsISOFormatDate(date_string)) {
482  // if there are two dashes, then the first token needs to be the day, and the
483  // day has to have two numbers, a leading zero if the day is less than 10
484  size_t pos = NStr::Find(date_string, "-");
485  if (pos != NPOS) {
486  size_t pos2 = NStr::Find(date_string, "-", pos + 1);
487  if (pos2 != NPOS && pos != 2) {
488  bad_format = true;
489  }
490  }
491  }
492 
493  if (!bad_format) {
494  time_t t;
495 
496  time(&t);
497 
498  in_future = IsCollectionDateAfterTime(*coll_date, t);
499  }
500  } catch (const CException& ) {
501  bad_format = true;
502  }
503 }
504 
505 size_t CSubSource::CheckDateFormat(const string& date_string)
506 {
507  size_t rval = eDateFormatFlag_ok;
508  vector<string> pieces;
509  NStr::Split(date_string, "/", pieces);
510  if (pieces.size() > 2) {
512  } else if (pieces.size() == 2) {
513  rval |= CheckDateFormat(pieces[0]);
514  rval |= CheckDateFormat(pieces[1]);
515  if (rval == eDateFormatFlag_ok) {
516  try {
519  if (d2->Compare(*d1) == CDate::eCompare_before) {
521  }
522  } catch (const CException&) {
524  }
525  }
526  return rval;
527  }
528 
529  try {
530  CRef<CDate> coll_date = CSubSource::DateFromCollectionDate(date_string);
531 
532  if (!IsISOFormatDate(date_string)) {
533  // if there are two dashes, then the first token needs to be the day, and the
534  // day has to have two numbers, a leading zero if the day is less than 10
535  size_t pos = NStr::Find(date_string, "-");
536  if (pos != NPOS) {
537  size_t pos2 = NStr::Find(date_string, "-", pos + 1);
538  if (pos2 != NPOS && pos != 2) {
540  }
541  }
542  }
543 
544  if (rval == eDateFormatFlag_ok) {
545  time_t t;
546 
547  time(&t);
548  if (IsCollectionDateAfterTime(*coll_date, t)) {
550  }
551  }
552  } catch (const CException&) {
554  }
555  return rval;
556 }
557 
559 
// Null-term exemption values for collection dates: GetCollectionDateProblem
// looks the incoming string up in this set first and reports no problem for
// a match, without attempting to parse it as a date.
// The set is declared with ct::tagStrCase.
// NOTE(review): presumably that makes the lookup case-sensitive - confirm.
560 // null term exemption values, order is not important
561 MAKE_CONST_SET(s_Null_CollectionDatesSet, ct::tagStrCase,
562 {
563  "missing",
564  "missing: control sample",
565  "missing: data agreement established pre-2023",
566  "missing: endangered species",
567  "missing: human-identifiable",
568  "missing: lab stock",
569  "missing: sample group",
570  "missing: synthetic construct",
571  "missing: third party data",
572  "not applicable",
573  "not collected",
574  "not provided",
575  "restricted access",
576 })
577 
578 string CSubSource::GetCollectionDateProblem (const string& date_string)
579 {
580  string problem;
581  if (s_Null_CollectionDatesSet.find(date_string.c_str()) != s_Null_CollectionDatesSet.end()) {
582  return problem;
583  }
584  size_t rval = CheckDateFormat(date_string);
585  if (rval & eDateFormatFlag_bad_format) {
586  problem = "Collection_date format is not in DD-Mmm-YYYY format";
587  } else if (rval & eDateFormatFlag_in_future) {
588  problem = "Collection_date is in the future";
589  } else if (rval & eDateFormatFlag_out_of_order) {
590  problem = "Collection_dates are out of order";
591  }
592  return problem;
593 }
594 
595 
596 string CSubSource::x_ParseDateRangeWithDelimiter(const string& orig_date, CTempString delim)
597 {
598  size_t pos = NStr::Find(orig_date, delim, NStr::eNocase);
599  if (pos == NPOS) {
600  return kEmptyStr;
601  }
602  size_t second_pos = NStr::Find(orig_date.substr(pos + 1), delim, NStr::eNocase);
603  if (second_pos != NPOS) {
604  return kEmptyStr;
605  }
606  bool month_ambig = false;
607  string first_date = FixDateFormat(orig_date.substr(0, pos), true, month_ambig);
608  if (month_ambig || NStr::IsBlank(first_date)) {
609  return kEmptyStr;
610  }
611  string second_date = FixDateFormat(orig_date.substr(pos + delim.length()), true, month_ambig);
612  if (month_ambig || NStr::IsBlank(second_date)) {
613  return kEmptyStr;
614  }
615  string fix = first_date + "/" + second_date;
616  return fix;
617 }
618 
619 
620 string CSubSource::FixDateFormat (const string& orig_date)
621 {
622  bool month_ambiguous = false;
623 
624  string fix = FixDateFormat(orig_date, true, month_ambiguous);
625  if (month_ambiguous) {
626  fix.clear();
627  } else if (NStr::IsBlank(fix)) {
628  static const char* delimiters[] = {"/", " to ", " and ", "-", "_"};
629  for (size_t i = 0; i < ArraySize(delimiters); i++) {
630  fix = x_ParseDateRangeWithDelimiter(orig_date, delimiters[i]);
631  if (!NStr::IsBlank(fix)) {
632  break;
633  }
634  }
635  }
636  return fix;
637 }
638 
639 // ISO Format for time is one of these:
640 // HH:MM:SS
641 // HH:MM
642 // HH
643 // Followed by either Z or +hh:mm to indicate an offset from Zulu
644 bool CSubSource::IsISOFormatTime(const string& orig_time, int& hour, int& min, int& sec, bool require_time_zone)
645 {
646  int offset_hour = 0;
647  int offset_min = 0;
648  size_t suffix = NStr::Find(orig_time, "Z");
649  if (suffix == NPOS) {
650  suffix = NStr::Find(orig_time, "+");
651  if (suffix == NPOS) {
652  if (require_time_zone) {
653  return false;
654  } else {
655  suffix = orig_time.length();
656  }
657  } else {
658  if (orig_time.substr(suffix).length() != 6 ||
659  !isdigit((unsigned char)orig_time[suffix + 1]) ||
660  !isdigit((unsigned char)orig_time[suffix + 2]) ||
661  orig_time[suffix + 3] != ':' ||
662  !isdigit((unsigned char)orig_time[suffix + 4]) ||
663  !isdigit((unsigned char)orig_time[suffix + 5])) {
664  return false;
665  }
666  try {
667  offset_hour = NStr::StringToInt(orig_time.substr(suffix + 1, 2));
668  offset_min = NStr::StringToInt(orig_time.substr(suffix + 4, 2));
669  } catch (...) {
670  return false;
671  }
672  }
673  }
674  if (suffix != 2 && suffix != 5 && suffix != 8) {
675  return false;
676  }
677 
678  if (!isdigit((unsigned char)orig_time[0]) || !isdigit((unsigned char)orig_time[1])) {
679  return false;
680  }
681  hour = 0;
682  min = 0;
683  sec = 0;
684  try {
685  hour = NStr::StringToInt(orig_time.substr(0, 2));
686  if (hour < 0 || hour > 23) {
687  return false;
688  }
689  hour -= offset_hour;
690  } catch (...) {
691  return false;
692  }
693  if (suffix > 2) {
694  if (!isdigit((unsigned char)orig_time[3]) || !isdigit((unsigned char)orig_time[4])) {
695  return false;
696  }
697  try {
698  min = NStr::StringToInt(orig_time.substr(3, 2));
699  if (min < 0 || min > 59) {
700  return false;
701  }
702  } catch (...) {
703  return false;
704  }
705  min -= offset_min;
706  }
707  if (suffix == 8) {
708  if (!isdigit((unsigned char)orig_time[6]) || !isdigit((unsigned char)orig_time[7])) {
709  return false;
710  }
711  try {
712  sec = NStr::StringToInt(orig_time.substr(6, 2));
713  if (sec < 0) {
714  // negative number bad
715  return false;
716  } else if (sec > 59) {
717  // too big
718  return false;
719  }
720  } catch (...) {
721  return false;
722  }
723  }
724 
725  return true;
726 }
727 
728 // ISO Format for date is exactly 10 characters long OR exactly 7 characters long.
729 // For ten characters:
730 // First four characters must be digits, represent year.
731 // Fifth character must be dash.
732 // Sixth and seventh characters must be digits, represent month, use zero padding.
733 // Eighth character must be dash.
734 // Ninth and tenth characters must be digits, represent day, use zero padding.
735 // For 7 characters:
736 // First four characters must be digits, represent year.
737 // Fifth character must be dash.
738 // Sixth and seventh characters must be digits, represent month, use zero padding.
739 bool CSubSource::IsISOFormatDateOnly (const string& cpy)
740 {
741  if (cpy.length() != 10 && cpy.length() != 7) {
742  return false;
743  }
744  bool rval = true;
745  size_t pos = 0;
746  string::const_iterator it = cpy.begin();
747  while (it != cpy.end() && rval) {
748  if (pos == 4 || pos == 7) {
749  if (*it != '-') {
750  rval = false;
751  }
752  } else if (!isdigit(*it)) {
753  rval = false;
754  }
755  ++it;
756  ++pos;
757  }
758  if (rval) {
759  try {
760  int year = NStr::StringToInt(cpy.substr(0, 4));
761  int month = NStr::StringToInt(cpy.substr(5, 2));
762  if (month < 1 || month > 12) {
763  rval = false;
764  }
765  if (cpy.length() == 10) { // has day
766  int day = NStr::StringToInt(cpy.substr(8, 2));
767  if (!IsDayValueOkForMonth(day, month, year)) {
768  rval = false;
769  }
770  }
771  } catch (...) {
772  rval = false;
773  }
774  }
775  return rval;
776 }
777 
778 
779 bool CSubSource::x_IsFixableIsoDate(const string& orig_date)
780 {
781  string cpy = orig_date;
783  size_t time_pos = NStr::Find(cpy, "T");
784  bool rval = false;
785  if (time_pos == NPOS) {
786  rval = false;
787  } else {
788  if (!IsISOFormatDateOnly(cpy.substr(0, time_pos))) {
789  rval = false;
790  } else {
791  int h, m, s;
792  if (IsISOFormatTime(cpy.substr(time_pos + 1), h, m, s, true)) {
793  // already fine, not fixable
794  rval = false;
795  } else {
796  rval = IsISOFormatTime(cpy.substr(time_pos + 1), h, m, s, false);
797  }
798  }
799  }
800  return rval;
801 }
802 
803 
804 string CSubSource::x_RemoveIsoTime(const string& orig_date)
805 {
806  string cpy = orig_date;
808  size_t time_pos = NStr::Find(cpy, "T");
809  if (time_pos != NPOS) {
810  cpy = cpy.substr(0, time_pos);
811  }
812  return cpy;
813 }
814 
815 
816 bool CSubSource::IsISOFormatDate(const string& orig_date)
817 {
818  string cpy = orig_date;
820  size_t time_pos = NStr::Find(cpy, "T");
821  if (time_pos == NPOS) {
822  return IsISOFormatDateOnly(cpy);
823  } else {
824  int h, m, s;
825  return (IsISOFormatDateOnly(cpy.substr(0, time_pos)) &&
826  IsISOFormatTime(cpy.substr(time_pos + 1), h, m, s));
827  }
828 
829 }
830 
832 {
833  try {
834  string cpy = orig_date;
836  CRef<CDate> date(new CDate());
837  int year_val = NStr::StringToInt(cpy.substr(0, 4));
838  int month_val = NStr::StringToInt(cpy.substr(5, 2));
839  date->SetStd().SetYear (year_val);
840  date->SetStd().SetMonth (month_val);
841  if (cpy.length() > 7) {
842  int day_val = NStr::StringToInt(cpy.substr(8, 2));
843  date->SetStd().SetDay (day_val);
844  }
845  return date;
846  } catch (...) {
847  return CRef<CDate>();
848  }
849 }
850 
851 
852 vector<string> CSubSource::x_GetDateTokens(const string& orig_date)
853 {
854  vector<string> tokens;
855  string token_delimiters = " ,-/=_.";
856 
857  string cpy = orig_date;
859 
860  string curr_token;
861  bool is_chars = false;
862  ITERATE(string, s, cpy) {
863  if (token_delimiters.find(*s) != NPOS) {
864  if (!NStr::IsBlank(curr_token)) {
865  tokens.push_back(curr_token);
866  }
867  curr_token.clear();
868  is_chars = false;
869  } else if (is_chars && !isalpha((unsigned char)(*s))) {
870  // previous token was all letters, do not add non-letter characters
871  if (!NStr::IsBlank(curr_token)) {
872  tokens.push_back(curr_token);
873  }
874  curr_token = *s;
875  is_chars = false;
876  } else if (!NStr::IsBlank(curr_token) && !is_chars && isalpha(*s)) {
877  // previous token had no letters
878  tokens.push_back(curr_token);
879  curr_token = *s;
880  is_chars = true;
881  } else {
882  curr_token += *s;
883  if (isalpha(*s)) {
884  is_chars = true;
885  }
886  }
887  }
888  if (!NStr::IsBlank(curr_token)) {
889  tokens.push_back(curr_token);
890  }
891 
892  // reattach 'st', 'nd', 'rd', and 'th' to numbers if present
893  if (tokens.size() > 3) {
894  vector<string>::iterator p = tokens.begin();
895  bool prev_is_number = isdigit((unsigned char)(*p)[0]);
896  vector<string>::iterator s = p;
897  ++s;
898  while (s != tokens.end()) {
899  if (prev_is_number &&
900  (NStr::EqualNocase(*s, "st") ||
901  NStr::EqualNocase(*s, "nd") ||
902  NStr::EqualNocase(*s, "rd") ||
903  NStr::EqualNocase(*s, "th"))) {
904  *p += *s;
905  s = tokens.erase(s);
906  prev_is_number = false;
907  } else {
908  ++p;
909  ++s;
910  prev_is_number = isdigit((unsigned char)(*p)[0]);
911  }
912  }
913  }
914 
915  return tokens;
916 }
917 
918 
919 bool s_ChooseMonthAndDay(const string& token1, const string& token2, bool month_first, string& month, int& day, bool& month_ambiguous)
920 {
921  try {
922  int val1 = NStr::StringToInt (token1);
923  int val2 = NStr::StringToInt (token2);
924  if (val1 > 12 && val2 > 12) {
925  // both numbers too big for month
926  return false;
927  } else if (val1 < 13 && val2 < 13) {
928  if (val1 == val2) {
929  // no need to call this ambiguous
930  month = CTime::MonthNumToName(val1, CTime::eAbbr);
931  day = val2;
932  } else {
933  // both numbers could be month
934  month_ambiguous = true;
935  if (month_first) {
936  month = CTime::MonthNumToName(val1, CTime::eAbbr);
937  day = val2;
938  } else {
939  month = CTime::MonthNumToName(val2, CTime::eAbbr);
940  day = val1;
941  }
942  }
943  } else if (val1 < 13) {
944  month = CTime::MonthNumToName(val1, CTime::eAbbr);
945  day = val2;
946  } else {
947  month = CTime::MonthNumToName(val2, CTime::eAbbr);
948  day = val1;
949  }
950  return true;
951  } catch ( ... ) {
952  return false;
953  }
954 }
955 
956 
957 string CSubSource::FixDateFormat (const string& test, bool month_first, bool& month_ambiguous)
958 {
959  string orig_date = test;
960  NStr::TruncateSpacesInPlace(orig_date);
961 
962  if (IsISOFormatDate(orig_date)) {
963  return orig_date;
964  } else if (x_IsFixableIsoDate(orig_date)) {
965  return x_RemoveIsoTime(orig_date);
966  }
967 
968  string reformatted_date;
969  string month;
970  int year = 0, day = 0;
971  //string token_delimiters = " ,-/=_.";
972  size_t num_original_tokens = 0;
973 
974  month_ambiguous = false;
975  vector<string> tokens = x_GetDateTokens(orig_date);
976 
977  num_original_tokens = tokens.size();
978  if (tokens.size() < 1 || tokens.size() > 3) {
979  // no tokens or too many tokens
980  return kEmptyStr;
981  }
982 
983  string one_token;
984  vector<string>::iterator it = tokens.begin();
985  while (it != tokens.end()) {
986  one_token = *it;
987  bool found = false;
988  if (NStr::EqualNocase(one_token, "1st") || NStr::EqualNocase(one_token, "first")) {
989  day = 1;
990  found = true;
991  } else if (NStr::EqualNocase(one_token, "2nd") || NStr::EqualNocase(one_token, "second")) {
992  day = 2;
993  found = true;
994  } else if (NStr::EqualNocase(one_token, "3rd") || NStr::EqualNocase (one_token, "third")) {
995  day = 3;
996  found = true;
997  } else if (one_token.length() > 0
998  && isdigit((unsigned char)one_token[0])
999  && NStr::EndsWith(one_token, "th")) {
1000  try {
1001  day = NStr::StringToInt (one_token.substr(0, one_token.length() - 2));
1002  found = true;
1003  } catch ( ... ) {
1004  // threw exception while converting to int
1005  return kEmptyStr;
1006  }
1007  } else if (isalpha((unsigned char)one_token[0])) {
1008  if (!NStr::IsBlank(month)) {
1009  // already have month, error
1010  return kEmptyStr;
1011  }
1012  if (one_token.length() > 3) {
1013  one_token = one_token.substr(0, 3);
1014  }
1015  try {
1016  int month_num = CTime::MonthNameToNum(one_token);
1017  found = true;
1018  month = CTime::MonthNumToName(month_num, CTime::eAbbr);
1019  } catch (const CTimeException&) {
1020  }
1021  } else {
1022  try {
1023  int this_val = NStr::StringToInt (one_token);
1024  int min = 1;
1025  int max = 31;
1026  if (this_val < min) {
1027  return kEmptyStr;
1028  } else if (this_val > max) {
1029  if (year > 0) {
1030  // already have year, error
1031  return kEmptyStr;
1032  }
1033  year = this_val;
1034  found = true;
1035  }
1036  } catch ( ... ) {
1037  // threw exception while converting to int
1038  return kEmptyStr;
1039  }
1040  }
1041  if (found) {
1042  it = tokens.erase(it);
1043  } else {
1044  it++;
1045  }
1046  }
1047 
1048  if (tokens.size() == 0) {
1049  // good - all tokens assigned to values
1050  } else if (tokens.size() > 2) {
1051  // three numbers: treat last one as year
1052  try {
1053  year = NStr::StringToInt(tokens[2]);
1054  if (year < 100) {
1055  year += 2000;
1056  }
1057  if (!s_ChooseMonthAndDay(tokens[0], tokens[1], month_first, month, day, month_ambiguous)) {
1058  return kEmptyStr;
1059  }
1060  // mark month as ambiguous, since we are guessing about year
1061  month_ambiguous = true;
1062  } catch ( ... ) {
1063  // threw exception while converting to int
1064  return kEmptyStr;
1065  }
1066  } else if (tokens.size() == 1) {
1067  try {
1068  int val = NStr::StringToInt (tokens[0]);
1069  if (year == 0) {
1070  year = val;
1071  } else {
1072  if (NStr::IsBlank (month)) {
1073  if (val > 0 && val < 13) {
1075  } else {
1076  // month number out of range
1077  return kEmptyStr;
1078  }
1079  } else {
1080  day = val;
1081  }
1082  }
1083  } catch ( ... ) {
1084  // threw exception while converting to int
1085  return kEmptyStr;
1086  }
1087  } else if (!NStr::IsBlank (month)) {
1088  if (tokens.size() == 2) {
1089  // we have a month and two other numbers (we hope)
1090  int val1 = 0;
1091  int val2 = 0;
1092  try {
1093  val1 = NStr::StringToInt (tokens[0]);
1094  val2 = NStr::StringToInt (tokens[1]);
1095  } catch (CException& /*e*/) {
1096  // not actually numbers
1097  return kEmptyStr;
1098  }
1099  bool zero_pad_1 = NStr::StartsWith(tokens[0], "0");
1100  bool zero_pad_2 = NStr::StartsWith(tokens[1], "0");
1101  if (val1 < 10 && !zero_pad_1 && (val2 > 10 || zero_pad_2)) {
1102  // if one token is not zero-padded and less than 10,
1103  // the other either is zero-padded and greater than 10,
1104  // the "small" token is the day and the second (+2000) is the year
1105  day = val1;
1106  year = val2 + 2000;
1107  } else if (val2 < 10 && !zero_pad_2 && (val1 > 10 || zero_pad_1)) {
1108  // if one token is not zero-padded and less than 10,
1109  // the other either is zero-padded and greater than 10,
1110  // the "small" token is the day and the second (+2000) is the year
1111  day = val2;
1112  year = val1 + 2000;
1113  } else {
1114  int month_num = CTime::MonthNameToNum(month);
1115  if (IsDayValueOkForMonth(val1, month_num, val2 + 2000)) {
1116  day = val1;
1117  year = val2 + 2000;
1118  } else {
1119  day = val2;
1120  year = val1 + 2000;
1121  }
1122  }
1123  } else {
1124  return kEmptyStr;
1125  }
1126  } else {
1127  if (!s_ChooseMonthAndDay(tokens[0], tokens[1], month_first, month, day, month_ambiguous)) {
1128  return kEmptyStr;
1129  }
1130  }
1131 
1132  // make sure day is valid
1133  if (day > 0 && !NStr::IsBlank(month) && year > -1) {
1134  try {
1135  int month_num = CTime::MonthNameToNum(month);
1136  if (!IsDayValueOkForMonth(day, month_num, year)) {
1137  return kEmptyStr;
1138  }
1139  } catch (const CTimeException&) {
1140  return kEmptyStr;
1141  }
1142  }
1143 
1144  if (year > 0 && year < 100 && num_original_tokens > 1) {
1145  // try to guess year from two-digit year provided,
1146  // only if it could not possibly be a day of the month
1147  // and if there were at least two tokens provided
1148  string year_date = NStr::NumericToString(year + 2000);
1149  bool format_bad = false;
1150  bool in_future = false;
1151  IsCorrectDateFormat(year_date, format_bad, in_future);
1152  if (in_future) {
1153  year += 1900;
1154  } else {
1155  year += 2000;
1156  }
1157  }
1158  if (year >= 1000 && year < 2100) {
1159  reformatted_date = NStr::NumericToString (year);
1160  if (!NStr::IsBlank (month)) {
1161  reformatted_date = month + "-" + reformatted_date;
1162  if (day > 0) {
1163  string day_str = NStr::NumericToString (day);
1164  if (day_str.length() < 2) {
1165  day_str = "0" + day_str;
1166  }
1167  reformatted_date = day_str + "-" + reformatted_date;
1168  }
1169  }
1170  }
1171 
1172  return reformatted_date;
1173 }
1174 
1175 
1176 void CSubSource::DetectDateFormat(const string& orig_date, bool& ambiguous, bool &day_first)
1177 {
1178  ambiguous = false;
1179  day_first = false;
1180  vector<string> tokens = x_GetDateTokens(orig_date);
1181  if (tokens.size() != 3) {
1182  // can't do detection if there are more or less than three tokens
1183  ambiguous = true;
1184  return;
1185  }
1186  vector<int> nums;
1187 
1188  // detection is only valid if all tokens are numbers and at least one is known to be the year
1189  try {
1190  ITERATE(vector<string>, it, tokens) {
1191  nums.push_back(NStr::StringToInt (*it));
1192  }
1193  } catch ( ... ) {
1194  // threw exception while converting to int
1195  ambiguous = true;
1196  return;
1197  }
1198  enum EPos { eDay = 0, eMonth = 1, eYear = 2 };
1199  vector<int> positions;
1200  positions.push_back(0);
1201  positions.push_back(0);
1202  positions.push_back(0);
1203 
1204  int token_pos = 1;
1205  ITERATE(vector<int>, it, nums) {
1206  if (*it > 31) {
1207  if (positions[eYear] > 0) {
1208  // already found a year
1209  ambiguous = true;
1210  return;
1211  }
1212  positions[eYear] = token_pos;
1213  } else if (*it > 12) {
1214  if (positions[eDay] > 0) {
1215  // already found a day
1216  ambiguous = true;
1217  return;
1218  }
1219  positions[eDay] = token_pos;
1220  } else if (positions[eMonth] > 0) {
1221  // already found a month
1222  ambiguous = true;
1223  return;
1224  } else {
1225  positions[eMonth] = token_pos;
1226  }
1227  token_pos++;
1228  }
1229  if (positions[eDay] < positions[eMonth]) {
1230  day_first = true;
1231  } else {
1232  day_first = false;
1233  }
1234 }
1235 
1236 
1237 void CSubSource::IsCorrectLatLonFormat (string lat_lon, bool& format_correct, bool& precision_correct,
1238  bool& lat_in_range, bool& lon_in_range,
1239  double& lat_value, double& lon_value)
1240 {
1241  format_correct = false;
1242  lat_in_range = false;
1243  lon_in_range = false;
1244  precision_correct = false;
1245  double ns, ew;
1246  char lon, lat;
1247  int processed;
1248 
1249  lat_value = 0.0;
1250  lon_value = 0.0;
1251 
1252  if (NStr::IsBlank(lat_lon)) {
1253  return;
1254  } else if (sscanf (lat_lon.c_str(), "%lf %c %lf %c%n", &ns, &lat, &ew, &lon, &processed) != 4
1255  || size_t(processed) != lat_lon.length()) {
1256  return;
1257  } else if ((lat != 'N' && lat != 'S') || (lon != 'E' && lon != 'W')) {
1258  return;
1259  } else {
1260  // init values found
1261  if (lat == 'N') {
1262  lat_value = ns;
1263  } else {
1264  lat_value = 0.0 - ns;
1265  }
1266  if (lon == 'E') {
1267  lon_value = ew;
1268  } else {
1269  lon_value = 0.0 - ew;
1270  }
1271 
1272  // make sure format is correct
1273  vector<string> pieces;
1274  NStr::Split(lat_lon, " ", pieces);
1275  if (pieces.size() > 3) {
1276  int precision_lat = x_GetPrecision(pieces[0]);
1277  int precision_lon = x_GetPrecision(pieces[2]);
1278 
1279  char reformatted[1000];
1280  sprintf (reformatted, "%.*lf %c %.*lf %c", precision_lat, ns, lat,
1281  precision_lon, ew, lon);
1282 
1283  size_t len = strlen (reformatted);
1284  if (NStr::StartsWith(lat_lon, reformatted)
1285  && (len == lat_lon.length()
1286  || (len < lat_lon.length()
1287  && lat_lon[len] == ';'))) {
1288  format_correct = true;
1289  if (ns <= 90 && ns >= 0) {
1290  lat_in_range = true;
1291  }
1292  if (ew <= 180 && ew >= 0) {
1293  lon_in_range = true;
1294  }
1295  if (precision_lat < 3 && precision_lon < 3) {
1296  precision_correct = true;
1297  }
1298  }
1299  }
1300  }
1301 }
1302 
1303 
1305 {
 // NOTE(review): the signature line of this definition is missing from this
 // listing; `orig` below is the lat-lon string being processed.
 //
 // Re-renders a lat-lon string with each coordinate limited to at most four
 // decimal places.  The input is handed back untouched when
 // IsCorrectLatLonFormat() reports a bad format, an out-of-range coordinate,
 // or sets precision_correct.
1306  bool format_correct = false;
1307  bool precision_correct = false;
1308  bool lat_in_range = false;
1309  bool lon_in_range = false;
1310  double lat_value = 0.0;
1311  double lon_value = 0.0;
1312  IsCorrectLatLonFormat(orig, format_correct, precision_correct,
1313  lat_in_range, lon_in_range,
1314  lat_value, lon_value);
 // nothing to fix: either the value is unusable or IsCorrectLatLonFormat()
 // already considers its precision acceptable
1315  if (!format_correct || !lat_in_range || !lon_in_range || precision_correct) {
1316  return orig;
1317  }
 // pieces: [0] latitude text, [1] N/S, [2] longitude text, [3] E/W
1318  vector<string> pieces;
1319  NStr::Split(orig, " ", pieces);
1320  if (pieces.size() > 3) {
1321  int precision_lat = x_GetPrecision(pieces[0]);
1322  int precision_lon = x_GetPrecision(pieces[2]);
 // cap both precisions at four decimal places
1323  if (precision_lat > 4) {
1324  precision_lat = 4;
1325  }
1326  if (precision_lon > 4) {
1327  precision_lon = 4;
1328  }
1329 
 // magnitudes are printed unsigned; the hemisphere letters are copied from
 // the first character of pieces[1] and pieces[3]
1330  char reformatted[1000];
1331  sprintf(reformatted, "%.*lf %c %.*lf %c", precision_lat, fabs(lat_value), pieces[1].c_str()[0],
1332  precision_lon, fabs(lon_value), pieces[3].c_str()[0]);
 // NOTE(review): new_val is never read; the buffer itself is returned
1333  string new_val = reformatted;
1334  return reformatted;
1335  }
 // fewer than four space-separated pieces: an empty string is returned
1336  return kEmptyStr;
1337 }
1338 
1339 /*
1340 1. String should be converted to UTF8 string, this will get rid of \xC0 and similar substrings
1341 2. Every codepoint (note that this is not regular ascii "char") that is not a digit or a decimal point or a letter should be prepended with a space.
1342  Transitions from alpha to digit/point and from digit/point to alpha should also be prepended with a space.
1343 3. NStr::Split is called with space as a separator and Tokenize flag - need to check if Split works with UTF8 strings properly.
1344 4. After this we should have a vector of tokens, some of which are numbers and others are "modifiers" such as ', '', degrees, N, S, E, W, etc.
1345 5. A pattern string is created where each number is replaced with "1" and modifiers are normalized (e.g. to "lat" or "N"); the actual numerical values are kept in a separate vector.
1346 6. Based on the pattern, the vector of numbers is parsed into degrees, minutes, and seconds.
1347 7. NSEW and "latitude/longitude" are applied to degrees in the order of appearance; if none are present, another heuristic is used to determine which is latitude and which is longitude.
1348 */
1349 
// Returns a copy of old_str padded with spaces so that a later split on
// whitespace yields separate tokens:
//  - a character that is not a letter, digit, '.', '-' or '+' gets a space
//    on both sides;
//  - a space is inserted wherever a letter meets a non-letter (in either
//    order);
//  - every symbol with a value of 0x80 or above is replaced by one space.
1350 static string s_InsertSpacesBetweenTokens(const string &old_str)
1351 {
1352  string new_str;
1353  for (string::const_iterator i = old_str.begin(); i != old_str.end(); ++i)
1354  {
 // NOTE(review): the statement declaring `sym` is missing from this listing
 // (one source line is absent here); it is presumably derived from the
 // character(s) at `i` - confirm against the real file.
1356  if (sym < 0x80)
1357  {
1358  char c = static_cast<char>(sym);
 // separator-like character: pad it on the left ...
1359  if (!isalpha(c) && !isdigit(c) && c != '.' && c != '-' && c != '+')
1360  {
1361  new_str += ' ';
1362  }
 // ... otherwise split where the output so far ends in a letter and c is
 // not one, or the other way round
1363  else if (!new_str.empty() &&
1364  ((isalpha(new_str.back()) && !isalpha(c)) ||
1365  (!isalpha(new_str.back()) && isalpha(c))))
1366  {
1367  new_str += ' ';
1368  }
1369  new_str += c;
 // ... and pad a separator-like character on the right as well
1370  if (!isalpha(c) && !isdigit(c) && c != '.' && c != '-' && c != '+')
1371  {
1372  new_str += ' ';
1373  }
1374  }
1375  else
1376  {
 // symbol outside the 7-bit range: dropped, a space takes its place
1377  new_str += ' ';
1378  }
1379  }
1380  return new_str;
1381 }
1382 
// Returns a copy of old_str in which a decimal point that was written with
// spaces around it ("12 . 5") is joined back onto its digits ("12.5").
// Symbols with a value of 0x80 or above are replaced by a space.  If the
// whole input consists only of digits, '+', '-', '.' and whitespace, a space
// is additionally inserted before every '+' and '-'.
1383 static string s_RemoveSpacesWithinNumbers(const string &old_str)
1384 {
1385  string new_str;
 // stays true only while every character seen is a digit, sign, '.' or space
1386  bool is_number = true;
1387  for (string::const_iterator i = old_str.begin(); i != old_str.end(); ++i)
1388  {
 // NOTE(review): the statement declaring `sym` is missing from this listing
 // (one source line is absent here); it is presumably derived from the
 // character(s) at `i` - confirm against the real file.
1390  if (sym < 0x80)
1391  {
1392  char c = static_cast<char>(sym);
1393  size_t j = new_str.size();
 // output currently ends in "<digit> . " and the next character is a digit:
 // drop the trailing " . " and put back a bare '.'
1394  if (j >= 4 && new_str[j-1] == ' ' && new_str[j-2] == '.' && new_str[j-3] == ' ' && isdigit(new_str[j-4]) && isdigit(c))
1395  {
1396  new_str.pop_back();
1397  new_str.pop_back();
1398  new_str.pop_back();
1399  new_str += '.';
1400  }
1401  new_str += c;
1402  if (!isdigit(c) && c != '+' && c != '-' && c != '.' && !isspace(c)) {
1403  is_number = false;
1404  }
1405  }
1406  else
1407  {
 // symbol outside the 7-bit range: replaced by a space, input is not purely numeric
1408  new_str += ' ';
1409  is_number = false;
1410  }
1411  }
 // purely numeric input: put a space in front of each sign character
1412  if (is_number)
1413  {
1414  NStr::ReplaceInPlace(new_str, "+", " +");
1415  NStr::ReplaceInPlace(new_str, "-", " -");
1416  }
1417  return new_str;
1418 }
1419 
1420 static bool s_IsNumber(const string &token, double *result = NULL)
1421 {
1422  double num = NStr::StringToDouble(token, NStr::fConvErr_NoThrow);
1423  if (!num && errno)
1424  {
1425  return false;
1426  }
1427  if (result) {
1428  *result = num;
1429  }
1430  return true;
1431 }
1432 
1433 static string s_NormalizeTokens(vector<string> &tokens, vector<double> &numbers, vector<string> &anum, vector<int> &precision, vector<string> &lat_long, vector<string> &nsew)
1434 {
1435  vector<string> pattern;
1436  for (size_t i = 0; i < tokens.size(); i++)
1437  {
1438  string &token = tokens[i];
1439 
1440  double num;
1441  if (s_IsNumber(token, &num))
1442  {
1443  numbers.push_back(num);
1444  anum.push_back(token);
1445  pattern.push_back("1");
1446  precision.push_back(0);
1447  if (NStr::Find(token, ".") != NPOS && !NStr::EndsWith(token, "."))
1448  {
1449  precision.back()
1450  = static_cast<int>(token.length() - token.find('.') - 1);
1451  }
1452  continue;
1453  }
1454 
1455  {
1456  vector<string> tmp;
1457  NStr::Split(token, ".", tmp);
1458  double num0, num1, num2;
1459  if (tmp.size() == 3 && s_IsNumber(tmp[0], &num0) && s_IsNumber(tmp[1], &num1) && s_IsNumber(tmp[2], &num2))
1460  {
1461  numbers.push_back(num0);
1462  anum.push_back(tmp[0]);
1463  pattern.push_back("1");
1464  precision.push_back(0);
1465  numbers.push_back(num1);
1466  anum.push_back(tmp[1]);
1467  pattern.push_back("1");
1468  precision.push_back(0);
1469  numbers.push_back(num2);
1470  anum.push_back(tmp[2]);
1471  pattern.push_back("1");
1472  precision.push_back(0);
1473  continue;
1474  }
1475  }
1476 
1477  if (token == "\'" && i >= 3 && s_IsNumber(tokens[i - 1]) && tokens[i - 2] == "\'" && s_IsNumber(tokens[i - 3]))
1478  {
1479  token = "\"";
1480  }
1481 
1482  if (NStr::EqualNocase(token, "degrees") || NStr::EqualNocase(token, "deg") || NStr::EqualNocase(token, "deg.") || NStr::EqualNocase(token, "degree"))
1483  {
1484  token = "degrees";
1485  pattern.push_back("degrees");
1486  }
1487  else if ( token == "\'" || NStr::EqualNocase(token, "min") || NStr::EqualNocase(token, "min.") || NStr::EqualNocase(token, "minute") || NStr::EqualNocase(token, "minutes"))
1488  {
1489  token = "\'";
1490  pattern.push_back("\'");
1491  }
1492  else if (token == "\"" || NStr::EqualNocase(token, "sec") || NStr::EqualNocase(token, "sec.") || NStr::EqualNocase(token, "second") || NStr::EqualNocase(token, "seconds"))
1493  {
1494  token = "\"";
1495  pattern.push_back("\"");
1496  }
1497  else if (token == "," || token == ":" || token == "_" || token == "&" || token == "." || token == ";" || token == "#" || NStr::EqualNocase(token, "and"))
1498  {
1499  }
1500  else if (NStr::EqualNocase(token, "lattitude") || NStr::EqualNocase(token, "latitude") || NStr::EqualNocase(token, "lat") || NStr::EqualNocase(token, "lat."))
1501  {
1502  pattern.push_back("lat");
1503  lat_long.push_back("lat");
1504  }
1505  else if (NStr::EqualNocase(token, "longitude") || NStr::EqualNocase(token, "lo") || NStr::EqualNocase(token, "lon") || NStr::EqualNocase(token, "long")
1506  || NStr::EqualNocase(token, "lo.") || NStr::EqualNocase(token, "lon.") || NStr::EqualNocase(token, "long."))
1507  {
1508  pattern.push_back("lat");
1509  lat_long.push_back("long");
1510  }
1511  else if (token == "N" || NStr::EqualNocase(token, "north"))
1512  {
1513  pattern.push_back("N");
1514  nsew.push_back("N");
1515  }
1516  else if (token == "S" || NStr::EqualNocase(token, "south"))
1517  {
1518  pattern.push_back("N");
1519  nsew.push_back("S");
1520  }
1521  else if (token == "E" || NStr::EqualNocase(token, "east"))
1522  {
1523  pattern.push_back("N");
1524  nsew.push_back("E");
1525  }
1526  else if (token == "W" || NStr::EqualNocase(token, "west") || token == "Wdeg")
1527  {
1528  pattern.push_back("N");
1529  nsew.push_back("W");
1530  }
1531  else if (token == "NW")
1532  {
1533  nsew.push_back("N");
1534  nsew.push_back("W");
1535  }
1536  else if (token == "NE")
1537  {
1538  nsew.push_back("N");
1539  nsew.push_back("E");
1540  }
1541  else if (token == "SW")
1542  {
1543  nsew.push_back("S");
1544  nsew.push_back("W");
1545  }
1546  else if (token == "SE")
1547  {
1548  nsew.push_back("S");
1549  nsew.push_back("E");
1550  }
1551  else
1552  {
1553  //cout << "Token: " << token << endl;
1554  numbers.clear();
1555  return kEmptyStr;
1556  }
1557  }
1558  //cout << "Pattern: " << NStr::Join(pattern, " ") << endl;
1559  return NStr::Join(pattern, " ");
1560 }
1561 
// Puts a parsed coordinate pair into (latitude, longitude) order and applies
// hemisphere signs.
//
// numbers / precision  the two coordinates and their decimal counts; on any
//                      inconsistency numbers is cleared to signal failure
// lat_long             "lat" / "long" labels in order of appearance; must be
//                      empty or hold exactly two entries
// nsew                 compass letters in order of appearance; must be empty
//                      or hold exactly two entries (reordered in place to
//                      match the coordinates)
//
// Rules, applied in order:
//  1. if the first label is "long", the pair (and its letters) are swapped;
//  2. if the letters come as E/W then N/S, the pair is swapped; N and E make
//     the value positive, S and W negative (zero is left untouched so that it
//     does not become -0);
//  3. with no labels and no letters, a first value above 90 in magnitude
//     paired with a second value of at most 90 is taken to be lon/lat and
//     swapped;
//  4. the result must satisfy |lat| <= 90 and |lon| <= 180.
static void s_ReorderNorthSouthEastWest(vector<double> &numbers, vector<int> &precision, const vector<string> &lat_long, vector<string> &nsew)
{
    // both vectors are indexed as pairs below
    if (numbers.size() != 2 || precision.size() != 2)
    {
        numbers.clear();
        return;
    }

    if (lat_long.size() == 2)
    {
        if (lat_long.front() == "long")
        {
            swap(numbers[0], numbers[1]);
            swap(precision[0], precision[1]);
            if (nsew.size() == 2) {
                swap(nsew[0], nsew[1]);
            }
        }
    }
    else if (!lat_long.empty())
    {
        // a single label (or more than two) cannot be paired up
        numbers.clear();
        return;
    }

    if (nsew.size() == 2)
    {
        if ((nsew[0] == "E" || nsew[0] == "W") &&
            (nsew[1] == "N" || nsew[1] == "S"))
        {
            swap(numbers[0], numbers[1]);
            swap(precision[0], precision[1]);
            swap(nsew[0], nsew[1]);
        }

        if (nsew[0] == "N")
        {
            numbers[0] = fabs(numbers[0]);
        }
        else if (nsew[0] == "S")
        {
            if (numbers[0] != 0)
                numbers[0] = -fabs(numbers[0]);
        }
        else
        {
            numbers.clear();
            return;
        }

        if (nsew[1] == "E")
        {
            numbers[1] = fabs(numbers[1]);
        }
        else if (nsew[1] == "W")
        {
            if (numbers[1] != 0)
                numbers[1] = -fabs(numbers[1]);
        }
        else
        {
            numbers.clear();
            return;
        }
    }
    else if (!nsew.empty())
    {
        numbers.clear();
        return;
    }

    // A latitude of exactly 90 is valid (see the range check below), so the
    // second value qualifies as a latitude when its magnitude is <= 90.
    if (lat_long.empty() && nsew.empty() && fabs(numbers[0]) > 90 && fabs(numbers[1]) <= 90)
    {
        swap(numbers[0], numbers[1]);
        swap(precision[0], precision[1]);
    }

    if (fabs(numbers[0]) > 90 || fabs(numbers[1]) > 180)
    {
        numbers.clear();
        return;
    }
}
1640 
// Converts a space-separated, pre-tokenized lat-lon string into two decimal
// degree values.
//
// new_str    input text, split on spaces and classified by s_NormalizeTokens()
// numbers    out: the two coordinates after s_ReorderNorthSouthEastWest();
//            left empty when the token pattern is not recognized or a
//            minutes/seconds value lies outside [0, 60)
// precision  out: number of decimals to print for each coordinate
//
// The pattern returned by s_NormalizeTokens() is compared against a fixed list
// of known layouts.  In each layout numbers[] holds the numeric tokens in
// order of appearance; values divided by 60 are minutes and values divided by
// 3600 are seconds.  The sign of a coordinate is read from the first
// character of the degree token's text (anum) rather than from its numeric
// value, and the magnitude is built from fabs(degrees) plus the fractional
// parts.  A minutes component adds 2 and a seconds component adds 4 to the
// decimals carried by that component when computing the output precision.
1641 static void s_GetLatLong(const string &new_str, vector<double> &numbers, vector<int> &precision)
1642 {
1643  vector<string> tokens;
1644  NStr::Split(new_str, " ", tokens, NStr::fSplit_Tokenize);
1645  vector<string> lat_long;
1646  vector<string> nsew;
1647  vector<string> anum;
1648  string pattern = s_NormalizeTokens(tokens, numbers, anum, precision, lat_long, nsew);
1649  if (pattern.empty())
1650  {
1651  numbers.clear();
1652  return;
1653  }
 // results are assembled here and swapped into the out-parameters at the end
1654  vector<double> degrees(2, 0);
1655  vector<int> prec(2, 0);
1656  int sign1 = 1;
1657  int sign2 = 1;
 // both coordinates given as plain decimal degrees
1658  if ( pattern == "1 1" ||
1659  pattern == "1 N 1 N" ||
1660  pattern == "N 1 N 1" ||
1661  pattern == "1 degrees N 1 degrees N" ||
1662  pattern == "lat 1 lat 1" ||
1663  pattern == "1 N lat 1 N lat" ||
1664  pattern == "1 degrees N lat 1 degrees N lat")
1665  {
1666  degrees[0] = numbers[0];
1667  degrees[1] = numbers[1];
1668  prec[0] = precision[0];
1669  prec[1] = precision[1];
1670  }
 // first: degrees + seconds, second: degrees + minutes
1671  else if ((pattern == "1 1 \" 1 1 '" ||
1672  pattern == "1 degrees 1 \" N 1 degrees 1 ' N")
1673  && numbers[1] < 60 && numbers[3] < 60
1674  && numbers[1] >= 0 && numbers[3] >= 0)
1675  {
1676  sign1 = anum[0][0] == '-' ? -1 : 1;
1677  sign2 = anum[2][0] == '-' ? -1 : 1;
1678  degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 3600);
1679  degrees[1] = sign2*(fabs(numbers[2]) + numbers[3] / 60);
1680  prec[0] = max(precision[0], precision[1] + 4);
1681  prec[1] = max(precision[2], precision[3] + 2);
1682  }
 // first: degrees + minutes, second: degrees only
1683  else if ( (pattern == "1 1 ' 1" ||
1684  pattern == "1 degrees 1 ' N 1 degrees N")
1685  && numbers[1] < 60
1686  && numbers[1] >= 0)
1687  {
1688  sign1 = anum[0][0] == '-' ? -1 : 1;
1689  degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60);
1690  degrees[1] = numbers[2];
1691  prec[0] = max(precision[0], precision[1] + 2);
1692  prec[1] = precision[2];
1693  }
 // first: degrees + minutes + seconds, second: degrees only
1694  else if (pattern == "1 1 ' 1 \" 1"
1695  && numbers[1] < 60 && numbers[2] < 60
1696  && numbers[1] >= 0 && numbers[2] >= 0)
1697  {
1698  sign1 = anum[0][0] == '-' ? -1 : 1;
1699  degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60 + numbers[2] / 3600);
1700  degrees[1] = numbers[3];
1701  prec[0] = max(max(precision[0], precision[1] + 2), precision[2] + 4);
1702  prec[1] = precision[3];
1703  }
 // first: degrees + minutes + seconds, second: degrees + minutes
1704  else if ((pattern == "1 1 ' 1 \" 1 1 '" ||
1705  pattern == "1 1 1 N 1 1 N" ||
1706  pattern == "1 degrees 1 ' 1 \" N 1 degrees 1 ' N")
1707  && numbers[1] < 60 && numbers[2] < 60 && numbers[4] < 60
1708  && numbers[1] >= 0 && numbers[2] >= 0 && numbers[4] >= 0)
1709  {
1710  sign1 = anum[0][0] == '-' ? -1 : 1;
1711  sign2 = anum[3][0] == '-' ? -1 : 1;
1712  degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60 + numbers[2] / 3600);
1713  degrees[1] = sign2*(fabs(numbers[3]) + numbers[4] / 60);
1714  prec[0] = max(max(precision[0], precision[1] + 2), precision[2] + 4);
1715  prec[1] = max(precision[3], precision[4] + 2);
1716  }
 // both coordinates: degrees + minutes + seconds
1717  else if (( pattern == "1 1 ' 1 \" 1 1 ' 1 \"" ||
1718  pattern == "1 1 ' 1 \" N 1 1 ' 1 \" N" ||
1719  pattern == "1 degrees 1 ' 1 \" 1 degrees 1 ' 1 \"" ||
1720  pattern == "1 degrees 1 ' 1 \" N 1 degrees 1 ' 1 \" N" ||
1721  pattern == "N 1 degrees 1 ' 1 \" N 1 degrees 1 ' 1 \"" ||
1722  pattern == "1 degrees 1 ' 1 N 1 degrees 1 ' 1 N" ||
1723  pattern == "1 degrees 1 1 N 1 degrees 1 1 N" ||
1724  pattern == "1 1 1 N 1 1 1 N")
1725  && numbers[1] < 60 && numbers[2] < 60 && numbers[4] < 60 && numbers[5] < 60
1726  && numbers[1] >= 0 && numbers[2] >= 0 && numbers[4] >= 0 && numbers[5] >= 0)
1727  {
1728  sign1 = anum[0][0] == '-' ? -1 : 1;
1729  sign2 = anum[3][0] == '-' ? -1 : 1;
1730  degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60 + numbers[2] / 3600);
1731  degrees[1] = sign2*(fabs(numbers[3]) + numbers[4] / 60 + numbers[5] / 3600);
1732  prec[0] = max(max(precision[0], precision[1] + 2), precision[2] + 4);
1733  prec[1] = max(max(precision[3], precision[4] + 2), precision[5] + 4);
1734  }
 // both coordinates: degrees + minutes
1735  else if (( pattern == "1 1 ' 1 1 '" ||
1736  pattern == "1 1 N 1 1 N" ||
1737  pattern == "1 1 ' N 1 1 ' N" ||
1738  pattern == "1 degrees 1 ' N 1 degrees 1 ' N" ||
1739  pattern == "lat 1 degrees 1 ' N lat 1 degrees 1 ' N" ||
1740  pattern == "1 degrees 1 N 1 degrees 1 N" ||
1741  pattern == "1 degrees 1 N 1 degrees 1 ' N" ||
1742  pattern == "1 degrees 1 ' N 1 degrees 1 N" ||
1743  pattern == "N 1 degrees 1 ' N 1 degrees 1" ||
1744  pattern == "N 1 degrees 1 ' N 1 degrees 1 '" ||
1745  pattern == "N 1 degrees 1 ' N 1 1 '")
1746  && numbers[1] < 60 && numbers[3] < 60
1747  && numbers[1] >= 0 && numbers[3] >= 0)
1748  {
1749  sign1 = anum[0][0] == '-' ? -1 : 1;
1750  sign2 = anum[2][0] == '-' ? -1 : 1;
1751  degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60);
1752  degrees[1] = sign2*(fabs(numbers[2]) + numbers[3] / 60);
1753  prec[0] = max(precision[0], precision[1] + 2);
1754  prec[1] = max(precision[2], precision[3] + 2);
1755  }
 // first: degrees only, second: degrees + minutes
1756  else if ((pattern == "1 N 1 1 N" ||
1757  pattern == "1 degrees N 1 degrees 1 ' N")
1758  && numbers[2] < 60
1759  && numbers[2] >= 0)
1760  {
1761  sign2 = anum[1][0] == '-' ? -1 : 1;
1762  degrees[0] = numbers[0];
1763  degrees[1] = sign2*(fabs(numbers[1]) + numbers[2] / 60);
1764  prec[0] = precision[0];
1765  prec[1] = max(precision[1], precision[2] + 2);
1766  }
 // first: degrees + minutes, second: degrees + minutes + seconds
1767  else if ((pattern == "1 degrees 1 ' 1 degrees 1 ' 1 \"" ||
1768  pattern == "N 1 1 N 1 1 1")
1769  && numbers[1] < 60 && numbers[3] < 60 && numbers[4] < 60
1770  && numbers[1] >= 0 && numbers[3] >= 0 && numbers[4] >= 0)
1771  {
1772  sign1 = anum[0][0] == '-' ? -1 : 1;
1773  sign2 = anum[2][0] == '-' ? -1 : 1;
1774  degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60);
1775  degrees[1] = sign2*(fabs(numbers[2]) + numbers[3] / 60 + numbers[4] / 3600);
1776  prec[0] = max(precision[0], precision[1] + 2);
1777  prec[1] = max(max(precision[2], precision[3] + 2), precision[4] + 4);
1778  }
 // first: degrees only, second: degrees + minutes + seconds
1779  else if (pattern == "1 degrees 1 degrees 1 ' 1 \""
1780  && numbers[2] < 60 && numbers[3] < 60
1781  && numbers[2] >= 0 && numbers[3] >= 0)
1782  {
1783  sign2 = anum[1][0] == '-' ? -1 : 1;
1784  degrees[0] = numbers[0];
1785  degrees[1] = sign2*(fabs(numbers[1]) + numbers[2] / 60 + numbers[3] / 3600);
1786  prec[0] = precision[0];
1787  prec[1] = max(max(precision[1], precision[2] + 2), precision[3] + 4);
1788  }
 // first: degrees + minutes + seconds, second: degrees + seconds
1789  else if (pattern == "1 degrees 1 ' 1 \" N 1 degrees 1 \" N"
1790  && numbers[1] < 60 && numbers[2] < 60 && numbers[4] < 60
1791  && numbers[1] >= 0 && numbers[2] >= 0 && numbers[4] >= 0)
1792  {
1793  sign1 = anum[0][0] == '-' ? -1 : 1;
1794  sign2 = anum[3][0] == '-' ? -1 : 1;
1795  degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60 + numbers[2] / 3600);
1796  degrees[1] = sign2*(fabs(numbers[3]) + numbers[4] / 3600);
1797  prec[0] = max(max(precision[0], precision[1] + 2), precision[2] + 4);
1798  prec[1] = max(precision[3], precision[4] + 4);
1799  }
 // unknown layout or out-of-range component: empty result vectors, which
 // s_ReorderNorthSouthEastWest() turns into an empty `numbers`
1800  else
1801  {
1802  degrees.clear();
1803  prec.clear();
1804  }
 // hand the computed pair back through the out-parameters, then fix the
 // lat/lon order and hemisphere signs
1805  swap(degrees, numbers);
1806  swap(prec, precision);
1807  s_ReorderNorthSouthEastWest(numbers, precision, lat_long, nsew);
1808 }
1809 
1810 
1811 string s_ShortenLatLon( string &subname ) {
1812  string lat;
1813  string north_or_south;
1814  string lon;
1815  string east_or_west;
1816 
1817  if (subname.length() < 1) {
1818  return subname;
1819  }
1820  char ch = subname[0];
1821  if (ch < '0' || ch > '9') {
1822  return subname;
1823  }
1824 
1825  // extract the pieces
1826  CNcbiIstrstream lat_lon_stream( subname );
1827  lat_lon_stream >> lat;
1828  lat_lon_stream >> north_or_south;
1829  lat_lon_stream >> lon;
1830  lat_lon_stream >> east_or_west;
1831  if( lat_lon_stream.bad() ) {
1832  return subname;
1833  }
1834 
1835  if( north_or_south != "N" && north_or_south != "S" ) {
1836  return subname;
1837  }
1838 
1839  if( east_or_west != "E" && east_or_west != "W" ) {
1840  return subname;
1841  }
1842 
1843  size_t pos = NStr::Find(lat, ".");
1844  if (pos > 0) {
1845  size_t len = lat.length();
1846  if (pos + 9 < len) {
1847  lat.erase(pos + 9);
1848  }
1849  }
1850 
1851  pos = NStr::Find(lon, ".");
1852  if (pos > 0) {
1853  size_t len = lon.length();
1854  if (pos + 9 < len) {
1855  lon.erase(pos + 9);
1856  }
1857  }
1858 
1859  return lat + " " + north_or_south + " " + lon + " " + east_or_west;
1860 }
1861 
1862 string CSubSource::FixLatLonFormat (string orig_lat_lon, bool guess)
1863 {
1864  //cout << "Before: " << orig_lat_lon << endl;
1865  NStr::ParseEscapes(orig_lat_lon);
1866  CStringUTF8 old_str = CUtf8::AsUTF8(orig_lat_lon, CUtf8::GuessEncoding(orig_lat_lon));
1867  if (NStr::StartsWith(old_str, "\""))
1868  {
1869  NStr::TrimPrefixInPlace(old_str, "\"");
1870  NStr::TrimSuffixInPlace(old_str, "\"");
1871  }
1872  NStr::ReplaceInPlace(old_str, "\'\'", "\"");
1873  string fixed_str = s_RemoveSpacesWithinNumbers(old_str);
1874  string new_str = s_InsertSpacesBetweenTokens(fixed_str);
1875  NStr::Sanitize(new_str);
1876  vector<double> numbers;
1877  vector<int> precision;
1878  s_GetLatLong(new_str, numbers, precision);
1879  string res;
1880  if (!numbers.empty())
1881  {
1882  res = MakeLatLon(numbers[0], numbers[1], precision[0], precision[1]);
1883  }
1884  //cout << "After: " << res << endl;
1885  res = s_ShortenLatLon(res);
1886  return res;
1887 }
1888 
1889 
1890 string CSubSource::MakeLatLon(double lat_value, double lon_value, int lat_precision, int lon_precision )
1891 {
1892  char ns = 'N';
1893  if (lat_value < 0) {
1894  ns = 'S';
1895  lat_value = -lat_value;
1896  }
1897  char ew = 'E';
1898  if (lon_value < 0) {
1899  ew = 'W';
1900  lon_value = -lon_value;
1901  }
1902  string lat = NStr::DoubleToString(lat_value, lat_precision);
1903  string lon = NStr::DoubleToString(lon_value, lon_precision);
1904 
1905  NStr::TrimSuffixInPlace(lat, ".");
1906  NStr::TrimSuffixInPlace(lon, ".");
1907  string res = lat + " " + ns + " " + lon + " " + ew;
1908  return res;
1909 }
1910 
1911 
1912 CLatLonCountryId *CSubSource::x_CalculateLatLonId(float lat_value, float lon_value, string country, string province)
1913 {
1914  CLatLonCountryId *id = new CLatLonCountryId(lat_value, lon_value);
1915 
1916  bool goodmatch = false;
1917 
1918  // lookup region by coordinates, or find nearest region and calculate distance
1919  const CCountryExtreme * guess = m_LatLonCountryMap->GuessRegionForLatLon(lat_value, lon_value, country, province);
1920  if (guess) {
1921  id->SetFullGuess(guess->GetCountry());
1922  id->SetGuessCountry(guess->GetLevel0());
1923  id->SetGuessProvince(guess->GetLevel1());
1924  if (NStr::EqualNocase(country, id->GetGuessCountry())
1925  && (NStr::IsBlank(province) || NStr::EqualNocase(province, id->GetGuessProvince()))) {
1926  goodmatch = true;
1927  }
1928  } else {
1929  // not inside a country, check water
1930  guess = m_LatLonWaterMap->GuessRegionForLatLon(lat_value, lon_value, country);
1931  if (guess) {
1932  // found inside water
1933  id->SetGuessWater(guess->GetCountry());
1934  if (NStr::EqualNocase(country, id->GetGuessWater())) {
1935  goodmatch = true;
1936  }
1937 
1938  // also see if close to land for coastal warning (if country is land)
1939  // or proximity message (if country is water)
1940  double landdistance = 0.0;
1941  guess = m_LatLonCountryMap->FindClosestToLatLon (lat_value, lon_value, 5.0, landdistance);
1942  if (guess) {
1943  id->SetClosestFull(guess->GetCountry());
1944  id->SetClosestCountry(guess->GetLevel0());
1945  id->SetClosestProvince(guess->GetLevel1());
1946  id->SetLandDistance(m_LatLonCountryMap->AdjustAndRoundDistance (landdistance));
1947  if (NStr::EqualNocase(country, id->GetClosestCountry())
1948  && (NStr::IsBlank(province) || NStr::EqualNocase(province, guess->GetLevel1()))) {
1949  goodmatch = true;
1950  }
1951  }
1952  } else {
1953  // may be coastal inlet, area of data insufficiency
1954  double landdistance = 0.0;
1955  guess = m_LatLonCountryMap->FindClosestToLatLon (lat_value, lon_value, 5.0, landdistance);
1956  if (guess) {
1957  id->SetClosestFull(guess->GetCountry());
1958  id->SetClosestCountry(guess->GetLevel0());
1959  id->SetClosestProvince(guess->GetLevel1());
1960  id->SetLandDistance(m_LatLonCountryMap->AdjustAndRoundDistance (landdistance));
1961  if (NStr::EqualNocase(country, id->GetClosestCountry())
1962  && (NStr::IsBlank(province) || NStr::EqualNocase(province, guess->GetLevel1()))) {
1963  goodmatch = true;
1964  }
1965  }
1966 
1967  double waterdistance = 0.0;
1968  guess = m_LatLonWaterMap->FindClosestToLatLon (lat_value, lon_value, 5.0, waterdistance);
1969  if (guess) {
1970  id->SetClosestWater(guess->GetLevel0());
1971  id->SetWaterDistance(m_LatLonWaterMap->AdjustAndRoundDistance (waterdistance));
1972  if (NStr::EqualNocase(country, id->GetClosestWater())) {
1973  goodmatch = true;
1974  }
1975  }
1976  }
1977  }
1978 
1979  // if guess is not the provided country or province, calculate distance to claimed country
1980  if (!goodmatch) {
1981  double distance = 0.0;
1982  guess = m_LatLonCountryMap->IsNearLatLon (lat_value, lon_value, 5.0, distance, country, province);
1983  if (guess) {
1984  if (distance < ErrorDistance(lat_value, lon_value, m_LatLonCountryMap->GetScale())) {
1985  // close enough
1986  id->SetGuessCountry(country);
1987  id->SetGuessProvince(province);
1988  id->SetFullGuess(guess->GetCountry());
1989  } else {
1990  id->SetClaimedFull(guess->GetCountry());
1991  id->SetClaimedDistance(m_LatLonCountryMap->AdjustAndRoundDistance (distance));
1992  }
1993  } else if (NStr::IsBlank(province)) {
1994  guess = m_LatLonWaterMap->IsNearLatLon (lat_value, lon_value, 5.0, distance, country, province);
1995  if (guess) {
1996  id->SetClaimedFull(guess->GetCountry());
1997  id->SetClaimedDistance(m_LatLonWaterMap->AdjustAndRoundDistance (distance));
1998  }
1999  }
2000  }
2001 
2002  return id;
2003 }
2004 
2005 
2006 
2009  {"Adriatic Sea", "Mediterranean Sea"},
2010  {"Aegean Sea", "Mediterranean Sea"},
2011  {"Alboran Sea", "Mediterranean Sea"},
2012  {"Andaman Sea", "Indian Ocean"},
2013  {"Arabian Sea", "Indian Ocean"},
2014  {"Argentine Sea", "Atlantic Ocean"},
2015  {"Ariake Sea", "Pacific Ocean"},
2016  {"Baffin Bay", "Atlantic Ocean"},
2017  {"Balearic Sea", "Mediterranean Sea"},
2018  {"Baltic Sea", "Atlantic Ocean"},
2019  {"Barents Sea", "Arctic Ocean"},
2020  {"Bay of Bengal", "Indian Ocean"},
2021  {"Beaufort Sea", "Arctic Ocean"},
2022  {"Bering Sea", "Pacific Ocean"},
2023  {"Bismarck Sea", "Pacific Ocean"},
2024  {"Black Sea", "Mediterranean Sea"},
2025  {"Bohai Sea", "Pacific Ocean"},
2026  {"Caribbean Sea", "Atlantic Ocean"},
2027  {"Celebes Sea", "Pacific Ocean"},
2028  {"Champlain Sea", "Atlantic Ocean"},
2029  {"Chilean Sea", "Pacific Ocean"},
2030  {"China Seas", "Pacific Ocean"},
2031  {"Chukchi Sea", "Arctic Ocean"},
2032  {"Coral Sea", "Pacific Ocean"},
2033  {"Davis Strait", "Atlantic Ocean"},
2034  {"East China Sea", "Pacific Ocean"},
2035  {"East Siberian Sea", "Arctic Ocean"},
2036  {"English Channel", "Atlantic Ocean"},
2037  {"Erythraean Sea", "Indian Ocean"},
2038  {"Golfo de California", "Pacific Ocean"},
2039  {"Greenland Sea", "Arctic Ocean"},
2040  {"Gulf of Mexico", "Atlantic Ocean"},
2041  {"Gulf of Thailand", "Pacific Ocean"},
2042  {"Gulf of Tonkin", "Pacific Ocean"},
2043  {"Hudson Bay", "Arctic Ocean"},
2044  {"Ionian Sea", "Mediterranean Sea"},
2045  {"Irish Sea", "Atlantic Ocean"},
2046  {"Irminger Sea", "Atlantic Ocean"},
2047  {"James Bay", "Atlantic Ocean"},
2048  {"Java Sea", "Indian Ocean"},
2049  {"Kara Sea", "Arctic Ocean"},
2050  {"Koro Sea", "Pacific Ocean"},
2051  {"Labrador Sea", "Atlantic Ocean"},
2052  {"Laccadive Sea", "Indian Ocean"},
2053  {"Laptev Sea", "Arctic Ocean"},
2054  {"Ligurian Sea", "Mediterranean Sea"},
2055  {"Lincoln Sea", "Arctic Ocean"},
2056  {"Myrtoan Sea", "Mediterranean Sea"},
2057  {"North Sea", "Atlantic Ocean"},
2058  {"Norwegian Sea", "Atlantic Ocean"},
2059  {"Pechora Sea", "Arctic Ocean"},
2060  {"Persian Gulf", "Indian Ocean"},
2061  {"Philippine Sea", "Pacific Ocean"},
2062  {"Red Sea", "Indian Ocean"},
2063  {"Salish Sea", "Pacific Ocean"},
2064  {"Sargasso Sea", "Atlantic Ocean"},
2065  {"Scotia Sea", "Southern Ocean"},
2066  {"Sea of Azov", "Black Sea"},
2067  {"Sea of Chiloe", "Pacific Ocean"},
2068  {"Sea of Crete", "Mediterranean Sea"},
2069  {"Sea of Japan", "Pacific Ocean"},
2070  {"Sea of Okhotsk", "Pacific Ocean"},
2071  {"Sea of the Hebrides", "Atlantic Ocean"},
2072  {"Sea of Zanj", "Indian Ocean"},
2073  {"Seas of Greenland", "Atlantic Ocean"},
2074  {"Sethusamudram", "Indian Ocean"},
2075  {"Sibutu Passage", "Pacific Ocean"},
2076  {"Solomon Sea", "Pacific Ocean"},
2077  {"South China Sea", "Pacific Ocean"},
2078  {"Sulu Sea", "Pacific Ocean"},
2079  {"Tasman Sea", "Pacific Ocean"},
2080  {"Thracian Sea", "Mediterranean Sea"},
2081  {"Timor Sea", "Indian Ocean"},
2082  {"Tyrrhenian Sea", "Mediterranean Sea"},
2083  {"Wandel Sea", "Arctic Ocean"},
2084  {"White Sea", "Arctic Ocean"},
2085  {"Yellow Sea", "Pacific Ocean"}
2086 };
2089 
2090 static string x_FindSurroundingOcean (string& water)
2091 
2092 {
2093  TWaterPairMap::const_iterator new_water_pair_iter = sc_WaterPairMap.find(water.c_str());
2094  if( new_water_pair_iter != sc_WaterPairMap.end() ) {
2095  return new_water_pair_iter->second;
2096  }
2097  return kEmptyStr;
2098 }
2099 
2100 
// Checks whether the coordinates in lat_lon are consistent with the named country/region.
//   input_countryname  name to check; text from a ';' or ',' onward is discarded and a ':'
//                      separates the country from an optional province
//   lat_lon            in/out: when the value does not parse, everything from the last ','
//                      onward is removed; when a swapped or sign-flipped fix is detected it
//                      is replaced by MakeLatLon() of the adjusted values
//   check_state        gates some of the province-level reports below
//   errcode            out: reset to eLatLonCountryErr_None, then set to the category of
//                      the report (Value, State, Water or Country)
// Returns the report text, or an empty string when nothing is reported.
//
// NOTE(review): this listing has lost several source lines inside this function (inner
// numbers 2169, 2172, 2228, 2314, 2328, 2346 and 2378 are absent). Each gap is marked below;
// restore the statements from the repository copy before editing the logic.
2101 string CSubSource::ValidateLatLonCountry (const string& input_countryname, string& lat_lon, bool check_state, ELatLonCountryErr& errcode)
2102 {
2103  errcode = eLatLonCountryErr_None;
2104  string countryname = input_countryname;
2105  if (NStr::IsBlank(countryname) || NStr::IsBlank(lat_lon)) {
2106  return kEmptyStr;
2107  }
2108 
2109  {
// create the two shared lookup maps on first use; the function-local mutex serializes the check-and-create
2110  static std::mutex m;
2111 
2112  std::lock_guard g(m);
2113 
2114  if ( m_LatLonCountryMap.get() == 0 ) {
2115  m_LatLonCountryMap.reset (new CLatLonCountryMap(false));
2116  }
2117  if ( m_LatLonWaterMap.get() == 0 ) {
2118  m_LatLonWaterMap.reset (new CLatLonCountryMap(true));
2119  }
2120  }
2121 
2122  // only do these checks if the latlon format is good
2123  bool format_correct, lat_in_range, lon_in_range, precision_correct;
2124  double lat_value = 0.0, lon_value = 0.0;
2125  CSubSource::IsCorrectLatLonFormat (lat_lon, format_correct, precision_correct,
2126  lat_in_range, lon_in_range,
2127  lat_value, lon_value);
2128  if (!format_correct) {
2129  // may have comma and then altitude, so just get lat_lon component */
// note: the caller's lat_lon is truncated here even if the second parse also fails
2130  size_t pos = NStr::Find(lat_lon, ",", NStr::eNocase, NStr::eReverseSearch);
2131  if (pos != NPOS) {
2132  lat_lon = lat_lon.substr(0, pos);
2133  CSubSource::IsCorrectLatLonFormat (lat_lon, format_correct, precision_correct,
2134  lat_in_range, lon_in_range,
2135  lat_value, lon_value);
2136  }
2137  }
2138 
2139  // reality checks
2140  if (!format_correct || !lat_in_range || !lon_in_range) {
2141  // incorrect lat_lon format should be reported elsewhere
2142  // incorrect latitude range should be reported elsewhere
2143  // incorrect longitude range should be reported elsewhere
2144  return kEmptyStr;
2145  }
2146 
2147  // get rid of comments after semicolon or comma in country name
2148  size_t pos = NStr::Find(countryname, ";");
2149  if (pos != NPOS) {
2150  countryname = countryname.substr(0, pos);
2151  }
2152  pos = NStr::Find(countryname, ",");
2153  if (pos != NPOS) {
2154  countryname = countryname.substr(0, pos);
2155  }
2156 
2157  // adjust for special cases
2158  if (NStr::StartsWith(countryname, "Norway: Svalbard")) {
2159  countryname = "Svalbard";
2160  }
2161 
// split "country: province"; the province is kept only when the full name is a known region
2162  string country = countryname;
2163  string province;
2164  pos = NStr::Find(country, ":");
2165  if (pos != NPOS) {
2166  // is the full string in the list?
2167  if (m_LatLonCountryMap->HaveLatLonForRegion(countryname)) {
2168  province = country.substr(pos + 1);
// NOTE(review): listing line 2169 is missing here
2170  }
2171  country = country.substr(0, pos);
// NOTE(review): listing line 2172 is missing here
2173  }
2174  if (NStr::IsBlank(country)) {
2175  return kEmptyStr;
2176  }
2177 
2178  // known exceptions - don't even bother calculating any further
2179  if (NStr::EqualNocase (country, "Antarctica") && lat_value < -60.0) {
2180  return kEmptyStr;
2181  }
2182 
// quick exits: with no province, a point already inside the named country or water body needs no report
2183  if (! NStr::IsBlank(province)) {
2184  // do not attempt quick exit
2185  } else if (m_LatLonCountryMap->HaveLatLonForRegion(country)) {
2186  if (m_LatLonCountryMap->IsCountryInLatLon(country, lat_value, lon_value)) {
2187  return kEmptyStr;
2188  }
2189  } else if (m_LatLonWaterMap->HaveLatLonForRegion(country)) {
2190  if (m_LatLonWaterMap->IsCountryInLatLon(country, lat_value, lon_value)) {
2191  return kEmptyStr;
2192  }
2193  } else if (NStr::EqualNocase (country, "State of Palestine")) {
// deliberately empty: this name continues to the full calculation even though neither map lists it
2194  } else {
2195  // report unrecognized country
2196  return kEmptyStr;
2197  }
2198 
// id is owned by this function: every return below deletes it first
// NOTE(review): flags is computed with an id == NULL guard, but id is dereferenced unconditionally
// right after; confirm x_CalculateLatLonId never returns NULL or add a guard.
2199  CLatLonCountryId *id = x_CalculateLatLonId(lat_value, lon_value, country, province);
2200  CLatLonCountryId::TClassificationFlags flags = (id == NULL ? 0 : id->Classify(country, province));
2201 
2202  string wguess = id->GetGuessWater();
2203  string cguess = id->GetGuessCountry();
2204 
2205  // special case where subsection of country has been identified but is not in coordinates of country
2206  // VR-840
2207  if (province.empty() && NStr::Equal(cguess, country)) {
2208  delete id;
2209  return kEmptyStr;
2210  }
2211 
2212  if (NStr::EqualNocase (country, "State of Palestine") &&
2213  (NStr::EqualNocase (cguess, "Gaza Strip") ||
2214  NStr::EqualNocase (cguess, "West Bank"))) {
2215  delete id;
2216  return kEmptyStr;
2217  }
2218 
// a point in a sea whose parent ocean (per x_FindSurroundingOcean) is the claimed name is accepted
2219  if (NStr::IsBlank (cguess) && (! NStr::IsBlank (wguess))) {
2220  string parent = x_FindSurroundingOcean (wguess);
2221  if ((! NStr::IsBlank (parent)) && NStr::EqualNocase (country, parent)) {
2222  delete id;
2223  return kEmptyStr;
2224  }
2225  }
2226 
2227  double neardist = 0.0;
// NOTE(review): listing line 2228 is missing here; `adjustment`, used below, has no visible declaration
2229  CLatLonCountryId::TClassificationFlags adjusted_flags = 0;
2230 
2231  if (!flags && m_LatLonCountryMap->IsNearLatLon(lat_value, lon_value, 2.0, neardist, country) && neardist < 5.0) {
2232  id->SetGuessCountry (country);
2233  id->SetGuessProvince (kEmptyStr);
2234  flags = id->Classify(country, province);
2235  }
2236 
// Nothing matched and the point is not near the claimed region in either map: retry with the
// values swapped, then with latitude negated, then with longitude negated. An adjusted result
// is adopted only when it guesses a country and no water.
// NOTE(review): in each of the three attempts, when adjusted_flags is non-zero but the guess is
// rejected by the inner condition, adjust_id is neither adopted nor deleted - it appears to leak.
2237  if (!flags && !m_LatLonCountryMap->IsNearLatLon(lat_value, lon_value, 20.0, neardist, country)
2238  && !m_LatLonWaterMap->IsNearLatLon(lat_value, lon_value, 20.0, neardist, country)) {
2239  /* do not flip from water */
2240  CLatLonCountryId *adjust_id = x_CalculateLatLonId(lon_value, lat_value, country, province);
2241  adjusted_flags = adjust_id == NULL ? 0 : adjust_id->Classify(country, province);
2242  if (adjusted_flags) {
2243  string awguess = adjust_id->GetGuessWater();
2244  string acguess = adjust_id->GetGuessCountry();
2245  if (NStr::IsBlank (awguess) && (! NStr::IsBlank (acguess))) {
2246  delete id;
2247  id = adjust_id;
2248  flags = adjusted_flags;
2249  adjustment = CLatLonCountryMap::fFlip;
2250  }
2251  } else {
2252  if (adjust_id) {
2253  delete adjust_id;
2254  }
2255  adjust_id = x_CalculateLatLonId(-lat_value, lon_value, country, province);
2256  adjusted_flags = adjust_id == NULL ? 0 : adjust_id->Classify(country, province);
2257  if (adjusted_flags) {
2258  string awguess = adjust_id->GetGuessWater();
2259  string acguess = adjust_id->GetGuessCountry();
2260  if (NStr::IsBlank (awguess) && (! NStr::IsBlank (acguess))) {
2261  delete id;
2262  id = adjust_id;
2263  flags = adjusted_flags;
2264  adjustment = CLatLonCountryMap::fNegateLat;
2265  }
2266  } else {
2267  if (adjust_id) {
2268  delete adjust_id;
2269  }
2270  adjust_id = x_CalculateLatLonId(lat_value, -lon_value, country, province);
2271  adjusted_flags = adjust_id == NULL ? 0 : adjust_id->Classify(country, province);
2272  if (adjusted_flags) {
2273  string awguess = adjust_id->GetGuessWater();
2274  string acguess = adjust_id->GetGuessCountry();
2275  if (NStr::IsBlank (awguess) && (! NStr::IsBlank (acguess))) {
2276  delete id;
2277  id = adjust_id;
2278  flags = adjusted_flags;
2279  adjustment = CLatLonCountryMap::fNegateLon;
2280  }
2281  } else {
2282  if (adjust_id) {
2283  delete adjust_id;
2284  }
2285  }
2286  }
2287  }
2288  }
2289 
2290  string error;
2291 
// Build the report. An adopted adjustment takes precedence and also rewrites lat_lon with the
// corrected values; otherwise the classification flags and the guesses stored in id decide.
2292  if (adjustment != CLatLonCountryMap::fNone) {
2293  if (adjustment == CLatLonCountryMap::fFlip) {
2294  errcode = eLatLonCountryErr_Value;
2295  error = "Latitude and longitude values appear to be exchanged";
2296  lat_lon = MakeLatLon(lon_value, lat_value);
2297  } else if (adjustment == CLatLonCountryMap::fNegateLat) {
2298  errcode = eLatLonCountryErr_Value;
2299  if (lat_value < 0.0) {
2300  error = "Latitude should be set to N (northern hemisphere)";
2301  } else {
2302  error = "Latitude should be set to S (southern hemisphere)";
2303  }
2304  lat_lon = MakeLatLon(-lat_value, lon_value);
2305  } else if (adjustment == CLatLonCountryMap::fNegateLon) {
2306  errcode = eLatLonCountryErr_Value;
2307  if (lon_value < 0.0) {
2308  error = "Longitude should be set to E (eastern hemisphere)";
2309  } else {
2310  error = "Longitude should be set to W (western hemisphere)";
2311  }
2312  lat_lon = MakeLatLon(lat_value, -lon_value);
2313  }
// NOTE(review): listing line 2314 is missing here; presumably the header of another
// `else if` branch, since a "success" comment and a further `else if` follow - confirm
2315  // success! nothing to report
2316  } else if (flags & CLatLonCountryId::fWaterMatch) {
2317  // success! nothing to report
2318  } else if (flags & CLatLonCountryId::fCountryMatch && NStr::IsBlank(province)) {
// country matches and no province was claimed: optionally report that the point resolves more specifically
2319  if (check_state) {
2320  string full_guess = id->GetFullGuess();
2321  if (!NStr::Equal(full_guess, country)) {
2322  errcode = eLatLonCountryErr_State;
2323  error = "Lat_lon " + lat_lon + " is in " + id->GetFullGuess()
2324  + " (more specific than " + country + ")";
2325  }
2326  }
2327  } else if (!NStr::IsBlank(id->GetGuessWater())) {
// NOTE(review): listing line 2328 is missing here; the `else if (neardist > 0.0)` / `else`
// pair near the end of this branch suggests it opened a nested `if` - confirm
2329  bool suppress = false;
2330  string reportregion;
2331  string nosubphrase;
2332  string desphrase = "designated subregion ";
2333  string subphrase = "another subregion ";
2334  string phrase = nosubphrase;
2335  bool show_claimed = false;
2336 
2337  if (id->GetLandDistance() < 100) {
2338  // for now, will not report
2339  // this is a policy decision
2340  suppress = true;
2341  } else if (NStr::Find(countryname, "Island") != NPOS) {
2342  suppress = true;
2343  }
2344 
2345 
// NOTE(review): listing line 2346 is missing here; presumably the `if` that pairs with the
// `else` below - confirm
2347  reportregion = countryname;
2348  phrase = desphrase;
2349  } else {
2350  // wasn't closest province, so must be closest country
2351  if (!NStr::IsBlank(province) && check_state) {
2352  phrase = subphrase;
2353  reportregion = id->GetClosestFull();
2354  } else {
2355  reportregion = id->GetClosestCountry();
2356  }
2357  if (!NStr::IsBlank(id->GetClaimedFull())) {
2358  show_claimed = true;
2359  }
2360  }
// Red Sea and Gulf of Mexico points are never reported for the bordering countries listed below.
// NOTE(review): "Dijibouti" looks like a misspelling of "Djibouti" (the spelling used in
// s_Countries in this file), so that comparison probably never matches - confirm and fix.
2361  string water = id->GetGuessWater();
2362  if (NStr::EqualNocase (water, "Red Sea") &&
2363  (NStr::EqualNocase (reportregion, "Egypt") ||
2364  NStr::EqualNocase (reportregion, "Saudi Arabia") ||
2365  NStr::EqualNocase (reportregion, "Sudan") ||
2366  NStr::EqualNocase (reportregion, "Eritrea") ||
2367  NStr::EqualNocase (reportregion, "Dijibouti") ||
2368  NStr::EqualNocase (reportregion, "Yemen") ||
2369  NStr::EqualNocase (reportregion, "Israel") ||
2370  NStr::EqualNocase (reportregion, "Jordan"))) {
2371  } else if (NStr::EqualNocase (water, "Gulf of Mexico") &&
2372  (NStr::EqualNocase (reportregion, "USA") ||
2373  NStr::EqualNocase (reportregion, "Mexico"))) {
2374  } else if (!suppress) {
2375  errcode = eLatLonCountryErr_Water;
2376  if (show_claimed) {
2377  error = "Lat_lon '" + lat_lon + "' is closest to " + phrase + "'" + reportregion + "' at distance "
// NOTE(review): listing line 2378 is missing here; presumably the distance operand of this
// concatenation - confirm
2379  + " km, but in water '" + id->GetGuessWater()
2380  + "' - claimed region '" + id->GetClaimedFull()
2381  + "' is at distance " + NStr::IntToString(id->GetClaimedDistance()) + " km";
2382  } else {
2383  error = "Lat_lon '" + lat_lon + "' is closest to " + phrase + "'" + reportregion
2384  + "' at distance " + NStr::IntToString(id->GetLandDistance()) + " km, but in water '"
2385  + id->GetGuessWater() + "'";
2386  }
2387  }
2388  } else if (neardist > 0.0) {
2389  errcode = eLatLonCountryErr_Water;
2390  error = "Lat_lon '" + lat_lon + "' is in water '" + id->GetGuessWater() + "', '"
2391  + countryname + "' is " + NStr::IntToString(m_LatLonCountryMap->AdjustAndRoundDistance(neardist)) + " km away";
2392  } else {
2393  errcode = eLatLonCountryErr_Water;
2394  error = "Lat_lon '" + lat_lon + "' is in water '" + id->GetGuessWater() + "'";
2395  }
2396  } else if (!NStr::IsBlank(id->GetGuessCountry())) {
// the point resolves to a different country or province than the one claimed
2397  string full_guess = id->GetFullGuess();
2398  if (NStr::EqualNocase (country, "China") && NStr::EqualNocase (full_guess, "Hong Kong")) {
2399  // skip
2400  } else if (NStr::IsBlank(id->GetClaimedFull())) {
2401  if (NStr::Equal(id->GetGuessCountry(), country) && !NStr::Equal(id->GetGuessProvince(), province)) {
2402  errcode = eLatLonCountryErr_State;
2403  } else {
2404  errcode = eLatLonCountryErr_Country;
2405  }
2406  error = "Lat_lon '" + lat_lon + "' maps to '" + id->GetFullGuess() + "' instead of '"
2407  + countryname + "'";
2408  } else {
2409  if (NStr::IsBlank(province)) {
2410  errcode = eLatLonCountryErr_Country;
2411  error = "Lat_lon '" + lat_lon + "' maps to '" + id->GetFullGuess() + "' instead of '"
2412  + country + "' - claimed region '" + id->GetClaimedFull()
2413  + "' is at distance " + NStr::IntToString(id->GetClaimedDistance()) + " km";
2414  } else {
2415  errcode = eLatLonCountryErr_Country;
2416  if (NStr::EqualNocase(id->GetGuessCountry(), country)) {
2417  errcode = eLatLonCountryErr_State;
2418  }
// a State-level mismatch is only reported when check_state is set; otherwise the code is reset
2419  if (errcode == eLatLonCountryErr_Country || check_state) {
2420  error = "Lat_lon '" + lat_lon + "' maps to '" + id->GetFullGuess() + "' instead of '"
2421  + countryname + "' - claimed region '" + id->GetClaimedFull()
2422  + "' is at distance " + NStr::IntToString(id->GetClaimedDistance()) + " km";
2423  } else {
2424  errcode = eLatLonCountryErr_None;
2425  }
2426  }
2427  }
2428  } else if (!NStr::IsBlank(id->GetClosestCountry())) {
2429  errcode = eLatLonCountryErr_Country;
2430  error = "Lat_lon '" + lat_lon + "' is closest to '" + id->GetClosestCountry() + "' instead of '"
2431  + countryname + "'";
2432  } else if (!NStr::IsBlank(id->GetClosestWater())) {
2433  errcode = eLatLonCountryErr_Water;
2434  error = "Lat_lon '" + lat_lon + "' is closest to '" + id->GetClosestWater() + "' instead of '"
2435  + countryname + "'";
2436  } else {
2437  errcode = eLatLonCountryErr_Country;
2438  error = "Unable to determine mapping for lat_lon '" + lat_lon + "' and country '" + countryname + "'";
2439  }
2440 
2441 
2442  delete id;
2443  return error;
2444 }
2445 
2446 
// Lower-case single-word tokens, including the abbreviations "f" and "m".
// NOTE(review): the declaration line that opens this initializer list (listing line 2447) is
// missing from this listing; presumably this is the array the functions below search with
// find() under the name sm_ValidSexQualifierTokens - confirm against the repository.
2448  "asexual",
2449  "bisexual",
2450  "diecious",
2451  "dioecious",
2452  "f",
2453  "female",
2454  "gelding",
2455  "hermaphrodite",
2456  "intersex",
2457  "m",
2458  "male",
2459  "mixed",
2460  "monecious",
2461  "monoecious",
2462  "neuter",
2463  "unisexual",
2464 };
2465 
2466 
// Whole multi-word phrases.
// NOTE(review): the declaration line that opens this initializer list (listing line 2467) is
// missing from this listing; presumably this is the array searched below under the name
// sm_ValidSexQualifierPhrases - confirm against the repository.
2468  "pooled males and females",
2469  "pooled male and female",
2470 };
2471 
2472 
// NOTE(review): the signature line of this function (listing line 2473) is missing from this
// listing; the body reads a parameter named `value` and returns bool.
// Returns true when `value` is found among the entries of sm_ValidSexQualifierPhrases.
2474 {
// element count of the array, used to form the one-past-the-end pointer for find()
2475  size_t max = sizeof(sm_ValidSexQualifierPhrases) / sizeof(const char*);
2476 
2477  const char* *begin = sm_ValidSexQualifierPhrases;
2478  const char* *end = &(sm_ValidSexQualifierPhrases[max]);
2479 
2480  if (find(begin, end, value) != end) {
2481  return true;
2482  } else {
2483  return false;
2484  }
2485 }
2486 
2487 
// NOTE(review): the signature line of this function (listing line 2488) is missing from this
// listing; the body reads a parameter named `value` and returns bool.
// The input is lower-cased, then split on space, ',' and '/'. The result is true only when
// every word other than "and" is listed in sm_ValidSexQualifierTokens and at least one such
// word is present.
2489 
2490 {
2491  string str = value;
2492  NStr::ToLower(str);
2493 
// NOTE(review): listing line 2494 is missing here; presumably the `if` that accepts a whole
// recognized phrase before tokenizing - confirm
2495  return true;
2496  }
2497 
2498  vector<string> words;
2499  NStr::Split(str, " ,/", words);
2500  if (words.size() == 0) {
2501  return false;
2502  }
2503 
2504  size_t max = sizeof(sm_ValidSexQualifierTokens) / sizeof(const char*);
2505 
2506  const char* *begin = sm_ValidSexQualifierTokens;
2507  const char* *end = &(sm_ValidSexQualifierTokens[max]);
2508 
// stays false when the only words are "and"
2509  bool is_good = false;
2510 
2511  ITERATE(vector<string>, w, words) {
2512  if (NStr::Equal(*w, "and")) {
2513  // ok, skip it
2514  } else {
2515  if (find(begin, end, *w) != end) {
2516  is_good = true;
2517  } else {
// the first unrecognized word rejects the whole value
2518  is_good = false;
2519  break;
2520  }
2521  }
2522  }
2523  return is_good;
2524 }
2525 
2526 
// NOTE(review): the signature line of this function (listing line 2527) is missing from this
// listing; the body reads a parameter named `value` and returns a string.
// Produces a normalized form of a sex qualifier: the input is lower-cased, "m"/"f" are
// expanded to "male"/"female", the recognized words are re-joined ("a and b" for two words,
// "a, b, and c" for three or more) and "pooled " is prefixed when a pooled marker was seen.
// Returns an empty string when any word is unrecognized or no recognized word is present.
2528 {
2529  string str = value;
2530  NStr::ToLower(str);
2531 
// NOTE(review): listing line 2532 is missing here; presumably the `if` that returns an
// already-recognized phrase unchanged - confirm
2533  return str;
2534  }
2535 
2536  vector<string> words;
2537  NStr::Split(str, " ,/", words);
2538 
2539  if (words.size() == 0) {
2540  return kEmptyStr;
2541  }
// NOTE(review): listing line 2542 is missing here; `max`, used two lines below, has no
// visible declaration in this listing
2543 
2544  const char* *begin = sm_ValidSexQualifierTokens;
2545  const char* *end = &(sm_ValidSexQualifierTokens[max]);
2546 
2547  vector<string> good_values;
2548  bool pooled = false;
2549 
2550  ITERATE(vector<string>, w, words) {
2551  if (NStr::Equal(*w, "and")) {
2552  // ok, skip it
2553  } else if (NStr::EqualNocase(*w, "(pooled)") || NStr::EqualNocase(*w, "pooled")) {
2554  // set pooled flag
2555  pooled = true;
2556  } else {
2557  if (find(begin, end, *w) != end) {
2558  if (NStr::Equal(*w, "m")) {
2559  good_values.push_back("male");
2560  } else if (NStr::Equal(*w, "f")) {
2561  good_values.push_back("female");
2562  } else {
2563  good_values.push_back(*w);
2564  }
2565  } else {
2566  // if any bad values, can't autofix
2567  return kEmptyStr;
2568  }
2569  }
2570  }
2571  if (good_values.size() == 0) {
2572  // no good tokens, can't autofix
2573  return kEmptyStr;
2574  }
2575 
// join: a comma after each item only when there are three or more, and " and" before the last item
2576  string fixed = good_values[0];
2577  for (size_t i = 1; i < good_values.size(); i++) {
2578  if (good_values.size() > 2) {
2579  fixed += ",";
2580  }
2581  if (i == good_values.size() - 1) {
2582  fixed += " and";
2583  }
2584  fixed += " " + good_values[i];
2585  }
2586  if (pooled) {
2587  fixed = "pooled " + fixed;
2588  }
2589  return fixed;
2590 }
2591 
2592 
2593 void s_CollectNumberAndUnits(const string& value, string& number, string& units)
2594 {
2595  number.clear();
2596  units.clear();
2597 
2598  if (NStr::IsBlank(value)) {
2599  return;
2600  }
2601 
2602  string::const_iterator it = value.begin();
2603  if (*it == '+' || *it == '-') {
2604  number += *it;
2605  it++;
2606  }
2607 
2608  bool any_digit = false;
2609  bool skip_comma = true;
2610  while (it != value.end() && (isdigit(*it) || *it == ',')) {
2611  if (*it == ',') {
2612  if (skip_comma) {
2613  // only skip the first comma
2614  skip_comma = false;
2615  } else {
2616  break;
2617  }
2618  } else {
2619  any_digit = true;
2620  number += *it;
2621  }
2622  it++;
2623  }
2624 
2625  if (it == value.end()) {
2626  number.clear();
2627  return;
2628  }
2629 
2630  if (*it == '.') {
2631  number += *it;
2632  it++;
2633  while (it != value.end() && isdigit(*it)) {
2634  any_digit = true;
2635  number += *it;
2636  it++;
2637  }
2638  }
2639 
2640  if (it == value.end() || *it != ' ' || !any_digit) {
2641  number.clear();
2642  return;
2643  }
2644 
2645  it++;
2646  while (it != value.end()) {
2647  units += *it;
2648  it++;
2649  }
2650 }
2651 
2652 
// NOTE(review): the signature line of this function (listing line 2653) is missing from this
// listing; the body reads a parameter named `value` and returns bool.
// Returns true only when `number` is non-blank and `units` is exactly "m" (case-sensitive).
2654 {
2655  if (NStr::IsBlank(value)) {
2656  return false;
2657  }
2658 
2659  string number;
2660  string units;
// NOTE(review): listing line 2661 is missing here; number and units are tested below without
// any visible assignment, so it presumably fills them from `value` - confirm
2662  if (NStr::IsBlank(number) || !NStr::EqualCase(units, "m")) {
2663  return false;
2664  } else {
2665  return true;
2666  }
2667 
2668 }
2669 
2670 
2671 int CSubSource::x_GetPrecision(const string& num_str)
2672 {
2673  int precision = 0;
2674  size_t pos = NStr::Find(num_str, ".");
2675  if (pos != NPOS) {
2676  precision = int(num_str.length() - pos - 1);
2677  }
2678  return precision;
2679 }
2680 
2681 
// NOTE(review): the signature line of this function (listing line 2682) is missing from this
// listing; the body reads `val` and `precision` and returns a string.
// Formats `val` in fixed notation with `precision` digits after the decimal point.
// NOTE(review): sprintf writes into a 1000-byte stack buffer with no bound; a very large
// magnitude or precision could overrun it - consider snprintf.
2683 {
2684  char reformatted[1000];
2685  sprintf(reformatted, "%.*lf", precision, val);
2686  string rval = reformatted;
2687  return rval;
2688 }
2689 
// Normalizes an altitude string to the form "<number> m".
// Values given in "ft.", "ft", "feet" or "foot" are multiplied by 0.3048 and treated as metres;
// the metre spellings "m.", "meters", "meter" and "m" are accepted as they are.
// Returns an empty string when `value` is blank, no number is available, or the unit is not
// one of the spellings above.
// NOTE(review): this listing has lost three source lines inside this function (2698, 2702 and
// 2705); each gap is marked below.
2690 string CSubSource::FixAltitude (const string& value)
2691 {
2692  if (NStr::IsBlank(value)) {
2693  return kEmptyStr;
2694  }
2695 
2696  string number;
2697  string units;
// NOTE(review): listing line 2698 is missing here; number and units are used below without
// any visible assignment, so it presumably fills them from `value` - confirm
2699  if (NStr::IsBlank(number)) {
2700  return kEmptyStr;
2701  } else if (NStr::Equal(units, "ft.") || NStr::Equal(units, "ft") || NStr::Equal(units, "feet") || NStr::Equal(units, "foot")) {
// NOTE(review): listing line 2702 is missing here
2703  double val = NStr::StringToDouble(number);
// feet to metres
2704  val *= 0.3048;
// NOTE(review): listing line 2705 is missing here; as shown, the converted `val` is never
// written back to `number`, so the missing line presumably does that - confirm
2706  units = "m";
2707  }
2708 
2709  string rval = kEmptyStr;
2710  if (NStr::Equal(units, "m.")
2711  || NStr::Equal(units, "meters")
2712  || NStr::Equal(units, "meter")
2713  || NStr::Equal(units, "m")) {
2714 
2715  rval = number + " " + "m";
2716  }
2717  return rval;
2718 }
2719 
2720 
2721 // From VR-793:
2722 // A. For segment, endogenous_virus_name:
2723 // 1. Must begin with a letter or number
2724 // 2. Spaces and other printable characters are permitted
2725 // 3. Must not be empty, must not be longer than 240 characters
2726 
// NOTE(review): the signature line of this function (listing line 2727) is missing from this
// listing; the body reads a parameter named `value` and returns bool.
// Returns true when `value` is non-blank, starts with an alphanumeric character, is at most
// 240 characters long and contains only characters accepted by isprint().
2728 {
2729  if (NStr::IsBlank(value)) {
2730  return false;
2731  } else if (!isalnum(value.c_str()[0])) {
2732  return false;
2733  } else if (value.length() > 240) {
2734  return false;
2735  }
2736 
// NOTE(review): isalnum/isprint receive a plain char here; for negative (non-ASCII) values the
// <cctype> functions have undefined behavior - consider casting to unsigned char
2737  for (auto it : value) {
2738  if (!isprint(it)) {
2739  return false;
2740  }
2741  }
2742 
2743  return true;
2744 }
2745 
2746 
// NOTE(review): only the braces of this definition survive in this listing; its signature
// (listing line 2747) and its single body line (2749) are missing. Restore from the repository.
2748 {
2750 }
2751 
2752 
// NOTE(review): only the braces of this definition survive in this listing; its signature
// (listing line 2753) and its single body line (2755) are missing. Restore from the repository.
2754 {
2756 }
2757 
2758 
2759 // From VR-793:
2760 // B. For chromosome, linkage_group and plasmid_name values:
2761 // 4. Must begin with a letter or number
2762 // 5. Must not be empty, must not be longer than 32 characters
2763 // 6. Must not contain <tab>
2764 // 7. Spaces and other printable characters are permitted
2765 // 8. Must not contain the word "plasmid" (ignoring case)
2766 // 9. Must not contain the word "chromosome" (ignoring case)
2767 // 10. Must not contain the phrase "linkage group" (ignoring case)
2768 // 11. Must not contain the series of letters "chr" (ignoring case)
2769 // 12. Must not contain the taxname (ignoring case)
2770 // 14. Must not contain the genus (ignoring case)
2771 // 15. Must not contain the species (ignoring case)
2772 // except allow the species to match the value after an initial 'p' (e.g., JX416328)
2773 // 16. Must not contain the series of letters "chrm" (ignoring case)
2774 // 17. Must not contain the series of letters "chrom" (ignoring case)
2775 // 18. Must not contain the phrase "linkage-group" (ignoring case)
2776 static bool s_FailsGenusOrSpeciesTest(const string& value, const string& taxname)
2777 { // See RW-1436
2778  if (NStr::IsBlank(taxname) ||
2779  NStr::StartsWith(taxname, "Plasmid ", NStr::eNocase) ||
2780  NStr::StartsWith(taxname, "IncQ plasmid", NStr::eNocase)) {
2781  return false;
2782  }
2783 
2784  size_t pos = NStr::Find(taxname, " ");
2785  if (pos != NPOS) {
2786  string genus = taxname.substr(0, pos);
2787  if (NStr::FindNoCase(value, genus) != NPOS) {
2788  // B.14
2789  return true;
2790  }
2791  string species = taxname.substr(pos + 1);
2792 
2793  pos = NStr::FindNoCase(value, species);
2794  if (pos != NPOS) {
2795  if (pos != 1 || value[0] != 'p') {
2796  // B.15
2797  return true;
2798  }
2799  }
2800  }
2801 
2802  return false;
2803 }
2804 
// NOTE(review): the signature line of this function (listing line 2805) is missing from this
// listing; the body reads parameters named `value` and `taxname` and returns bool.
// Order of checks: the Borrelia/Borreliella exemption, the generic validity check, the
// 32-character limit, the genus/species test, then the forbidden-phrase scan.
2806 {
// names starting with "cp" or "lp" are accepted outright for taxnames containing Borrelia/Borreliella
2807  if (NStr::FindNoCase(taxname, "Borrelia") != NPOS || NStr::FindNoCase(taxname, "Borreliella") != NPOS) {
2808  if (NStr::StartsWith(value, "cp") || NStr::StartsWith(value, "lp")) {
2809  return true;
2810  }
2811  }
// NOTE(review): listing line 2812 is missing here; presumably the `if` that runs the generic
// check described by the next comment - confirm
2813  // checks for isalnum start, blankness and unprintable characters
2814  // B.4, B.5, B.7
2815  return false;
2816  } else if (value.length() > 32) {
2817  // B.5
2818  return false;
2819  }
2820 
2821  if (s_FailsGenusOrSpeciesTest(value, taxname)) {
2822  return false;
2823  }
2824 
// Substrings that must not occur in the name (compared case-insensitively below).
// "chr" already matches every value containing "chrm", "chrom" or "chromosome".
// NOTE(review): "linkage_group" is tagged B.15, but the rule list above this group of
// functions uses 15 for the species rule and does not list an underscore form - confirm.
2825  static string s_ForbiddenPhrases[] = {
2826  "\t", // B.6.
2827  "plasmid", // B.8
2828  "chromosome", // B.9
2829  "linkage group", // B.10
2830  "chr", // B.11
2831  "linkage_group", // B.15
2832  "chrm", // B.16
2833  "chrom", // B.17
2834  "linkage-group" // B.18
2835  };
2836 
2837  for (auto it : s_ForbiddenPhrases) {
2838  if (NStr::FindNoCase(value, it) != NPOS) {
2839  return false;
2840  }
2841  }
2842  return true;
2843 }
2844 
2845 
// Validates a chromosome name. Blank values and values starting with "LG" (any case) are
// rejected here.
// NOTE(review): the body of the `else` branch (listing line 2854) is missing from this
// listing, so the result for all other values is not visible.
2846 bool CSubSource::IsChromosomeNameValid(const string& value, const string& taxname)
2847 {
2848  if (NStr::IsBlank(value)) {
2849  return false;
2850  }
2851  if (NStr::StartsWith(value, "LG", NStr::eNocase)) {
2852  return false;
2853  } else {
2855  }
2856 }
2857 
2858 
// Validates a linkage group name. Blank values are rejected here.
// NOTE(review): the final statement (listing line 2864) is missing from this listing, so the
// result for non-blank values is not visible.
2859 bool CSubSource::IsLinkageGroupNameValid(const string& value, const string& taxname)
2860 {
2861  if (NStr::IsBlank(value)) {
2862  return false;
2863  }
2865 }
2866 
2867 
2868 // VR-793
2869 // C. For plasmid_name values:
2870 // 19. Exception- megaplasmid is legal
// Validates a plasmid name.
// Accepted outright: "megaplasmid"; "megaplasmid " followed by text containing no further
// space; "F", "F factor" and "F plasmid". Any other value containing "plasmid" (any case) is
// valid only when it is found in s_PlasmidNameExceptions.
// NOTE(review): the final statement (listing line 2907) is missing from this listing, so the
// result for values that contain no "plasmid" is not visible.
2871 bool CSubSource::IsPlasmidNameValid(const string& value, const string& taxname)
2872 {
2873  if (NStr::IsBlank(value)) {
2874  return false;
2875  }
2876  if (NStr::Equal(value, "megaplasmid")) {
2877  return true;
2878  }
// 12 is the length of the prefix "megaplasmid " including its trailing space
2879  if (NStr::StartsWith(value, "megaplasmid ") && value.length() > 12 && NStr::Find(value.substr(12), " ") == NPOS) {
2880  return true;
2881  }
2882  if (NStr::Equal(value, "F") || NStr::Equal(value, "F factor") || NStr::Equal(value, "F plasmid")) {
2883  return true;
2884  }
2885 
2886  if (NStr::FindNoCase(value,"plasmid") != NPOS) {
2887  static const set<string, PNocase_Conditional> s_PlasmidNameExceptions =
2888  { // This list comes from RW-1436/RW-1430
2889  "Plasmid F",
2890  "Plasmid R",
2891  "Plasmid pIP630",
2892  "Plasmid pNG2",
2893  "Plasmid pGT633",
2894  "Plasmid pE5",
2895  "Plasmid pIP1527",
2896  "Plasmid pAM77",
2897  "Plasmid pAZ1",
2898  "Plasmid RP4"
2899  };
2900 
2901  if (s_PlasmidNameExceptions.find(value) != end(s_PlasmidNameExceptions)) {
2902  return true;
2903  }
2904  return false;
2905  }
2906 
2908 }
2909 
2910 
// Pair stored per (cell line, organism) entry: columns 3 and 4 of a cell_line record, which
// CheckCellLine reports as the contaminating cell line and the organism it comes from.
2911 typedef pair<string, string> TContaminatingCellLine;
// NOTE(review): listing lines 2912-2913 and 2915-2916 are missing from this listing; the
// declaration of s_CellLineContaminationMap, used by the functions below, is not visible.
2914 
// mutex taken at the top of the map initialization below
2917 DEFINE_STATIC_FAST_MUTEX(s_CellLineContaminationMutex);
2918 
// provides kCellLine, the table of records read by the map initialization below
2919 #include "cell_line.inc"
2920 
2921 static void s_ProcessCellLineLine(const CTempString& line)
2922 {
2923  vector<string> tokens;
2924  NStr::Split(line, "\t", tokens);
2925  if (tokens.size() < 4) {
2926  ERR_POST_X(1, Warning << "Not enough columns in cell_line entry " << line
2927  << "; disregarding");
2928  } else {
2929  NStr::ToUpper(tokens[0]);
2930  (s_CellLineContaminationMap[tokens[0]])[tokens[1]] = TContaminatingCellLine(tokens[2], tokens[3]);
2931  }
2932 }
2933 
2934 
// NOTE(review): the signature line of this function (listing line 2935) is missing from this
// listing.
// Feeds every record of kCellLine to s_ProcessCellLineLine while holding
// s_CellLineContaminationMutex.
2936 {
2937  CFastMutexGuard GUARD(s_CellLineContaminationMutex);
// NOTE(review): listing line 2938 is missing here; presumably the `if` that detects an
// already-loaded table and guards the early return below - confirm
2939  return;
2940  }
2941 
2942  // read table
2943 
2944  size_t count = sizeof(kCellLine) / sizeof (*kCellLine);
2945  const char * const * start = kCellLine;
2946  while (count--) {
2947  s_ProcessCellLineLine(*start++);
2948  }
2949 
2950 
// NOTE(review): listing line 2951 is missing here; presumably it records that the table has
// been loaded - confirm
2952 }
2953 
2954 
// Looks up a cell line (compared upper-cased) and organism in s_CellLineContaminationMap.
// Returns a warning naming the contaminating cell line and its organism when the stored entry
// has a non-blank first member; otherwise returns an empty string.
2955 string CSubSource::CheckCellLine(const string& cell_line, const string& organism)
2956 {
2957  string rval;
2958 
// NOTE(review): listing line 2959 is missing here; presumably the call that ensures the map
// has been loaded before it is read - confirm
2960  string cell_line_search = cell_line;
2961  NStr::ToUpper(cell_line_search);
2962 
// NOTE(review): the map's type is not visible in this listing. If operator[] has std::map
// semantics, each lookup below inserts an empty entry for an unknown cell line/organism, and
// the map is accessed here without holding s_CellLineContaminationMutex - confirm.
2963  if (!NStr::IsBlank(((s_CellLineContaminationMap[cell_line_search])[organism]).first)) {
2964  rval = "The International Cell Line Authentication Committee database indicates that " +
2965  cell_line + " from " + organism + " is known to be contaminated by " +
2966  ((s_CellLineContaminationMap[cell_line_search])[organism]).first +
2967  " from " + ((s_CellLineContaminationMap[cell_line_search])[organism]).second +
2968  ". Please see http://iclac.org/databases/cross-contaminations/ for more information and references.";
2969  }
2970  return rval;
2971 }
2972 
2973 
2974 // =============================================================================
2975 // Country Names
2976 // =============================================================================
2977 
2978 
2979 // legal country names, must be in alphabetical order (case sensitive)
2980 static const char* const s_Countries[] = {
2981  "Afghanistan",
2982  "Albania",
2983  "Algeria",
2984  "American Samoa",
2985  "Andorra",
2986  "Angola",
2987  "Anguilla",
2988  "Antarctica",
2989  "Antigua and Barbuda",
2990  "Arctic Ocean",
2991  "Argentina",
2992  "Armenia",
2993  "Aruba",
2994  "Ashmore and Cartier Islands",
2995  "Atlantic Ocean",
2996  "Australia",
2997  "Austria",
2998  "Azerbaijan",
2999  "Bahamas",
3000  "Bahrain",
3001  "Baker Island",
3002  "Baltic Sea",
3003  "Bangladesh",
3004  "Barbados",
3005  "Bassas da India",
3006  "Belarus",
3007  "Belgium",
3008  "Belize",
3009  "Benin",
3010  "Bermuda",
3011  "Bhutan",
3012  "Bolivia",
3013  "Borneo",
3014  "Bosnia and Herzegovina",
3015  "Botswana",
3016  "Bouvet Island",
3017  "Brazil",
3018  "British Virgin Islands",
3019  "Brunei",
3020  "Bulgaria",
3021  "Burkina Faso",
3022  "Burundi",
3023  "Cambodia",
3024  "Cameroon",
3025  "Canada",
3026  "Cape Verde",
3027  "Cayman Islands",
3028  "Central African Republic",
3029  "Chad",
3030  "Chile",
3031  "China",
3032  "Christmas Island",
3033  "Clipperton Island",
3034  "Cocos Islands",
3035  "Colombia",
3036  "Comoros",
3037  "Cook Islands",
3038  "Coral Sea Islands",
3039  "Costa Rica",
3040  "Cote d'Ivoire",
3041  "Croatia",
3042  "Cuba",
3043  "Curacao",
3044  "Cyprus",
3045  "Czechia",
3046  "Democratic Republic of the Congo",
3047  "Denmark",
3048  "Djibouti",
3049  "Dominica",
3050  "Dominican Republic",
3051  "Ecuador",
3052  "Egypt",
3053  "El Salvador",
3054  "Equatorial Guinea",
3055  "Eritrea",
3056  "Estonia",
3057  "Eswatini",
3058  "Ethiopia",
3059  "Europa Island",
3060  "Falkland Islands (Islas Malvinas)",
3061  "Faroe Islands",
3062  "Fiji",
3063  "Finland",
3064  "France",
3065  "French Guiana",
3066  "French Polynesia",
3067  "French Southern and Antarctic Lands",
3068  "Gabon",
3069  "Gambia",
3070  "Gaza Strip",
3071  "Georgia",
3072  "Germany",
3073  "Ghana",
3074  "Gibraltar",
3075  "Glorioso Islands",
3076  "Greece",
3077  "Greenland",
3078  "Grenada",
3079  "Guadeloupe",
3080  "Guam",
3081  "Guatemala",
3082  "Guernsey",
3083  "Guinea",
3084  "Guinea-Bissau",
3085  "Guyana",
3086  "Haiti",
3087  "Heard Island and McDonald Islands",
3088  "Honduras",
3089  "Hong Kong",
3090  "Howland Island",
3091  "Hungary",
3092  "Iceland",
3093  "India",
3094  "Indian Ocean",
3095  "Indonesia",
3096  "Iran",
3097  "Iraq",
3098  "Ireland",
3099  "Isle of Man",
3100  "Israel",
3101  "Italy",
3102  "Jamaica",
3103  "Jan Mayen",
3104  "Japan",
3105  "Jarvis Island",
3106  "Jersey",
3107  "Johnston Atoll",
3108  "Jordan",
3109  "Juan de Nova Island",
3110  "Kazakhstan",
3111  "Kenya",
3112  "Kerguelen Archipelago",
3113  "Kingman Reef",
3114  "Kiribati",
3115  "Kosovo",
3116  "Kuwait",
3117  "Kyrgyzstan",
3118  "Laos",
3119  "Latvia",
3120  "Lebanon",
3121  "Lesotho",
3122  "Liberia",
3123  "Libya",
3124  "Liechtenstein",
3125  "Line Islands",
3126  "Lithuania",
3127  "Luxembourg",
3128  "Macau",
3129  "Madagascar",
3130  "Malawi",
3131  "Malaysia",
3132  "Maldives",
3133  "Mali",
3134  "Malta",
3135  "Marshall Islands",
3136  "Martinique",
3137  "Mauritania",
3138  "Mauritius",
3139  "Mayotte",
3140  "Mediterranean Sea",
3141  "Mexico",
3142  "Micronesia, Federated States of",
3143  "Midway Islands",
3144  "Moldova",
3145  "Monaco",
3146  "Mongolia",
3147  "Montenegro",
3148  "Montserrat",
3149  "Morocco",
3150  "Mozambique",
3151  "Myanmar",
3152  "Namibia",
3153  "Nauru",
3154  "Navassa Island",
3155  "Nepal",
3156  "Netherlands",
3157  "New Caledonia",
3158  "New Zealand",
3159  "Nicaragua",
3160  "Niger",
3161  "Nigeria",
3162  "Niue",
3163  "Norfolk Island",
3164  "North Korea",
3165  "North Macedonia",
3166  "North Sea",
3167  "Northern Mariana Islands",
3168  "Norway",
3169  "Oman",
3170  "Pacific Ocean",
3171  "Pakistan",
3172  "Palau",
3173  "Palmyra Atoll",
3174  "Panama",
3175  "Papua New Guinea",
3176  "Paracel Islands",
3177  "Paraguay",
3178  "Peru",
3179  "Philippines",
3180  "Pitcairn Islands",
3181  "Poland",
3182  "Portugal",
3183  "Puerto Rico",
3184  "Qatar",
3185  "Republic of the Congo",
3186  "Reunion",
3187  "Romania",
3188  "Ross Sea",
3189  "Russia",
3190  "Rwanda",
3191  "Saint Barthelemy",
3192  "Saint Helena",
3193  "Saint Kitts and Nevis",
3194  "Saint Lucia",
3195  "Saint Martin",
3196  "Saint Pierre and Miquelon",
3197  "Saint Vincent and the Grenadines",
3198  "Samoa",
3199  "San Marino",
3200  "Sao Tome and Principe",
3201  "Saudi Arabia",
3202  "Senegal",
3203  "Serbia",
3204  "Seychelles",
3205  "Sierra Leone",
3206  "Singapore",
3207  "Sint Maarten",
3208  "Slovakia",
3209  "Slovenia",
3210  "Solomon Islands",
3211  "Somalia",
3212  "South Africa",
3213  "South Georgia and the South Sandwich Islands",
3214  "South Korea",
3215  "South Sudan",
3216  "Southern Ocean",
3217  "Spain",
3218  "Spratly Islands",
3219  "Sri Lanka",
3220  "State of Palestine",
3221  "Sudan",
3222  "Suriname",
3223  "Svalbard",
3224  "Sweden",
3225  "Switzerland",
3226  "Syria",
3227  "Taiwan",
3228  "Tajikistan",
3229  "Tanzania",
3230  "Tasman Sea",
3231  "Thailand",
3232  "Timor-Leste",
3233  "Togo",
3234  "Tokelau",
3235  "Tonga",
3236  "Trinidad and Tobago",
3237  "Tromelin Island",
3238  "Tunisia",
3239  "Turkey",
3240  "Turkmenistan",
3241  "Turks and Caicos Islands",
3242  "Tuvalu",
3243  "USA",
3244  "Uganda",
3245  "Ukraine",
3246  "United Arab Emirates",
3247  "United Kingdom",
3248  "Uruguay",
3249  "Uzbekistan",
3250  "Vanuatu",
3251  "Venezuela",
3252  "Viet Nam",
3253  "Virgin Islands",
3254  "Wake Island",
3255  "Wallis and Futuna",
3256  "West Bank",
3257  "Western Sahara",
3258  "Yemen",
3259  "Zambia",
3260  "Zimbabwe"
3261 };
3262 static const TCStrSet s_CountriesSet(s_Countries, sizeof(s_Countries), __FILE__, __LINE__);
3263 
3264 // former legal country names, must be in alphabetical order (case sensitive)
3265 static const char* const s_Former_Countries[] = {
3266  "Belgian Congo",
3267  "British Guiana",
3268  "Burma",
3269  "Czech Republic",
3270  "Czechoslovakia",
3271  "East Timor",
3272  "Korea",
3273  "Macedonia",
3274  "Micronesia",
3275  "Netherlands Antilles",
3276  "Serbia and Montenegro",
3277  "Siam",
3278  "Swaziland",
3279  "The former Yugoslav Republic of Macedonia",
3280  "USSR",
3281  "Yugoslavia",
3282  "Zaire"
3283 };
3284 static const TCStrSet s_Former_CountriesSet(s_Former_Countries, sizeof(s_Former_Countries), __FILE__, __LINE__);
3285 
3286 // null term exemption values, must be in alphabetical order (case sensitive)
3287 static const char* const s_Null_Countries[] = {
3288  "missing",
3289  "missing: control sample",
3290  "missing: data agreement established pre-2023",
3291  "missing: endangered species",
3292  "missing: human-identifiable",
3293  "missing: lab stock",
3294  "missing: sample group",
3295  "missing: synthetic construct",
3296  "missing: third party data",
3297  "not applicable",
3298  "not collected",
3299  "not provided",
3300  "restricted access"
3301 };
3302 static const TCStrSet s_Null_CountriesSet(s_Null_Countries, sizeof(s_Null_Countries), __FILE__, __LINE__);
3303 
3304 bool CCountries::IsValid(const string& country)
3305 {
3306  string name = country;
3307  size_t pos = country.find(':');
3308 
3309  if ( pos != NPOS ) {
3310  if (pos == country.length() - 1) {
3311  return false;
3312  }
3313  name = country.substr(0, pos);
3314  }
3315 
3316  // try current countries
3317  if (s_CountriesSet.find(name.c_str()) != s_CountriesSet.end()) {
3318  return true;
3319  } else if (s_Former_CountriesSet.find(name.c_str()) != s_Former_CountriesSet.end()) {
3320  return true;
3321  } else if (s_Null_CountriesSet.find(name.c_str()) != s_Null_CountriesSet.end()) {
3322  return true;
3323  } else {
3324  return false;
3325  }
3326 }
3327 
3328 
3329 bool CCountries::IsValid(const string& country, bool& is_miscapitalized)
3330 {
3331  string name = country;
3332  size_t pos = country.find(':');
3333 
3334  if ( pos != NPOS ) {
3335  name = country.substr(0, pos);
3336  if (pos == country.length() - 1) {
3337  return false;
3338  }
3339  }
3340 
3341  is_miscapitalized = false;
3342  // try current countries
3343  // fast check for properly capitalized
3344  if ( s_CountriesSet.find(name.c_str()) != s_CountriesSet.end() ) {
3345  return true;
3346  }
3347  if ( s_Former_CountriesSet.find(name.c_str()) != s_Former_CountriesSet.end() ) {
3348  return true;
3349  }
3350  if ( s_Null_CountriesSet.find(name.c_str()) != s_Null_CountriesSet.end() ) {
3351  return true;
3352  }
3353  // slow check for miscapitalized
3354  ITERATE ( TCStrSet, it, s_CountriesSet ) {
3355  if ( NStr::EqualNocase(name, *it) ) {
3356  is_miscapitalized = true;
3357  return true;
3358  }
3359  }
3361  if ( NStr::EqualNocase(name, *it) ) {
3362  is_miscapitalized = true;
3363  return true;
3364  }
3365  }
3367  if ( NStr::EqualNocase(name, *it) ) {
3368  is_miscapitalized = true;
3369  return true;
3370  }
3371  }
3372 
3373  return false;
3374 }
3375 
3376 
3377 bool CCountries::WasValid(const string& country)
3378 {
3379  string name = country;
3380  size_t pos = country.find(':');
3381 
3382  if ( pos != NPOS ) {
3383  name = country.substr(0, pos);
3384  }
3385 
3386  // try formerly-valid countries
3387  return s_Former_CountriesSet.find(name.c_str()) != s_Former_CountriesSet.end();
3388 }
3389 
3390 
3391 bool CCountries::WasValid(const string& country, bool& is_miscapitalized)
3392 {
3393  string name = country;
3394  size_t pos = country.find(':');
3395 
3396  if ( pos != NPOS ) {
3397  name = country.substr(0, pos);
3398  }
3399 
3400  is_miscapitalized = false;
3401  // try formerly-valid countries
3402  // fast check for properly capitalized
3403  if ( s_Former_CountriesSet.find(name.c_str()) != s_Former_CountriesSet.end() ) {
3404  return true;
3405  }
3406  // slow check for miscapitalized
3408  if ( NStr::EqualNocase(name, *it) ) {
3409  is_miscapitalized = true;
3410  return true;
3411  }
3412  }
3413  return false;
3414 }
3415 
3416 /////////////////////////////////////////////////////////////////////////////
3417 ////// Country Capitalization Fix ///////////////////////////////////////////
3418 
3420 {
3421  {"england", "United Kingdom: England"},
3422  {"great britain", "United Kingdom: Great Britain"},
3423  {"new jersey, usa", "USA: New Jersey"}
3424 };
3427 
3429 {"ABW", "Aruba"},
3430 {"AFG", "Afghanistan"},
3431 {"AGO", "Angola"},
3432 {"AIA", "Anguilla"},
3433 {"ALA", "Aland Islands"},
3434 {"ALB", "Albania"},
3435 {"AND", "Andorra"},
3436 {"ARE", "United Arab Emirates"},
3437 {"ARG", "Argentina"},
3438 {"ARM", "Armenia"},
3439 {"ASM", "American Samoa"},
3440 {"ATA", "Antarctica"},
3441 {"ATF", "French Southern Territories"},
3442 {"ATG", "Antigua and Barbuda"},
3443 {"AUS", "Australia"},
3444 {"AUT", "Austria"},
3445 {"AZE", "Azerbaijan"},
3446 {"Antigua & Barbuda", "Antigua and Barbuda"},
3447 {"Ashmore & Cartier Islands", "Ashmore and Cartier Islands"},
3448 {"BDI", "Burundi"},
3449 {"BEL", "Belgium"},
3450 {"BEN", "Benin"},
3451 {"BES", "Bonaire, Sint Eustatius and Saba"},
3452 {"BFA", "Burkina Faso"},
3453 {"BGD", "Bangladesh"},
3454 {"BGR", "Bulgaria"},
3455 {"BHR", "Bahrain"},
3456 {"BHS", "Bahamas"},
3457 {"BIH", "Bosnia and Herzegovina"},
3458 {"BLM", "Saint Barthelemy"},
3459 {"BLR", "Belarus"},
3460 {"BLZ", "Belize"},
3461 {"BMU", "Bermuda"},
3462 {"BOL", "Bolivia"},
3463 {"BRA", "Brazil"},
3464 {"BRB", "Barbados"},
3465 {"BRN", "Brunei"},
3466 {"BTN", "Bhutan"},
3467 {"BVT", "Bouvet Island"},
3468 {"BWA", "Botswana"},
3469 {"Brasil", "Brazil"},
3470 {"CAF", "Central African Republic"},
3471 {"CAN", "Canada"},
3472 {"CCK", "Cocos Islands"},
3473 {"CHE", "Switzerland"},
3474 {"CHL", "Chile"},
3475 {"CHN", "China"},
3476 {"CIV", "Cote d'Ivoire"},
3477 {"CMR", "Cameroon"},
3478 {"COD", "Democratic Republic of the Congo"},
3479 {"COG", "Republic of the Congo"},
3480 {"COK", "Cook Islands"},
3481 {"COL", "Colombia"},
3482 {"COM", "Comoros"},
3483 {"CPV", "Cape Verde"},
3484 {"CRI", "Costa Rica"},
3485 {"CUB", "Cuba"},
3486 {"CUW", "Curacao"},
3487 {"CXR", "Christmas Island"},
3488 {"CYM", "Cayman Islands"},
3489 {"CYP", "Cyprus"},
3490 {"CZE", "Czechia"},
3491 {"Cape Verde Islands", "Cape Verde"},
3492 {"DEU", "Germany"},
3493 {"DJI", "Djibouti"},
3494 {"DMA", "Dominica"},
3495 {"DNK", "Denmark"},
3496 {"DOM", "Dominican Republic"},
3497 {"DZA", "Algeria"},
3498 {"Democratic Republic of Congo", "Democratic Republic of the Congo"},
3499 {"ECU", "Ecuador"},
3500 {"EGY", "Egypt"},
3501 {"ERI", "Eritrea"},
3502 {"ESH", "Western Sahara"},
3503 {"ESP", "Spain"},
3504 {"EST", "Estonia"},
3505 {"ETH", "Ethiopia"},
3506 {"FIN", "Finland"},
3507 {"FJI", "Fiji"},
3508 {"FLK", "Falkland Islands (Islas Malvinas)"},
3509 {"FRA", "France"},
3510 {"FRO", "Faroe Islands"},
3511 {"FSM", "Micronesia, Federated States of"},
3512 {"Falkland Islands", "Falkland Islands (Islas Malvinas)"},
3513 {"French Southern & Antarctic Lands", "French Southern and Antarctic Lands"},
3514 {"GAB", "Gabon"},
3515 {"GBR", "United Kingdom"},
3516 {"GEO", "Georgia"},
3517 {"GGY", "Guernsey"},
3518 {"GHA", "Ghana"},
3519 {"GIB", "Gibraltar"},
3520 {"GIN", "Guinea"},
3521 {"GLP", "Guadeloupe"},
3522 {"GMB", "Gambia"},
3523 {"GNB", "Guinea-Bissau"},
3524 {"GNQ", "Equatorial Guinea"},
3525 {"GRC", "Greece"},
3526 {"GRD", "Grenada"},
3527 {"GRL", "Greenland"},
3528 {"GTM", "Guatemala"},
3529 {"GUF", "French Guiana"},
3530 {"GUM", "Guam"},
3531 {"GUY", "Guyana"},
3532 {"HKG", "Hong Kong"},
3533 {"HMD", "Heard Island and McDonald Islands"},
3534 {"HND", "Honduras"},
3535 {"HRV", "Croatia"},
3536 {"HTI", "Haiti"},
3537 {"HUN", "Hungary"},
3538 {"Heard Island & McDonald Islands", "Heard Island and McDonald Islands"},
3539 {"IDN", "Indonesia"},
3540 {"IMN", "Isle of Man"},
3541 {"IND", "India"},
3542 {"IOT", "British Indian Ocean Territory"},
3543 {"IRL", "Ireland"},
3544 {"IRN", "Iran"},
3545 {"IRQ", "Iraq"},
3546 {"ISL", "Iceland"},
3547 {"ISR", "Israel"},
3548 {"ITA", "Italy"},
3549 {"Ivory Coast", "Cote d'Ivoire"},
3550 {"JAM", "Jamaica"},
3551 {"JEY", "Jersey"},
3552 {"JOR", "Jordan"},
3553 {"JPN", "Japan"},
3554 {"KAZ", "Kazakhstan"},
3555 {"KEN", "Kenya"},
3556 {"KGZ", "Kyrgyzstan"},
3557 {"KHM", "Cambodia"},
3558 {"KIR", "Kiribati"},
3559 {"KNA", "Saint Kitts and Nevis"},
3560 {"KOR", "South Korea"},
3561 {"KWT", "Kuwait"},
3562 {"LAO", "Lao People's Democratic Republic"},
3563 {"LBN", "Lebanon"},
3564 {"LBR", "Liberia"},
3565 {"LBY", "Libyan Arab Jamahiriya"},
3566 {"LCA", "Saint Lucia"},
3567 {"LIE", "Liechtenstein"},
3568 {"LKA", "Sri Lanka"},
3569 {"LSO", "Lesotho"},
3570 {"LTU", "Lithuania"},
3571 {"LUX", "Luxembourg"},
3572 {"LVA", "Latvia"},
3573 {"La Reunion Island", "Reunion"},
3574 {"Luxemburg", "Luxembourg"},
3575 {"MAC", "Macao"},
3576 {"MAF", "Saint Martin (French part)"},
3577 {"MAR", "Morocco"},
3578 {"MCO", "Monaco"},
3579 {"MDA", "Moldova"},
3580 {"MDG", "Madagascar"},
3581 {"MDV", "Maldives"},
3582 {"MEX", "Mexico"},
3583 {"MHL", "Marshall Islands"},
3584 {"MKD", "North Macedonia"},
3585 {"MLI", "Mali"},
3586 {"MLT", "Malta"},
3587 {"MMR", "Myanmar"},
3588 {"MNE", "Montenegro"},
3589 {"MNG", "Mongolia"},
3590 {"MNP", "Northern Mariana Islands"},
3591 {"MOZ", "Mozambique"},
3592 {"MRT", "Mauritania"},
3593 {"MSR", "Montserrat"},
3594 {"MTQ", "Martinique"},
3595 {"MUS", "Mauritius"},
3596 {"MWI", "Malawi"},
3597 {"MYS", "Malaysia"},
3598 {"MYT", "Mayotte"},
3599 {"Macedonia", "North Macedonia"},
3600 {"NAM", "Namibia"},
3601 {"NCL", "New Caledonia"},
3602 {"NER", "Niger"},
3603 {"NFK", "Norfolk Island"},
3604 {"NGA", "Nigeria"},
3605 {"NIC", "Nicaragua"},
3606 {"NIU", "Niue"},
3607 {"NLD", "Netherlands"},
3608 {"NOR", "Norway"},
3609 {"NPL", "Nepal"},
3610 {"NRU", "Nauru"},
3611 {"NZL", "New Zealand"},
3612 {"Netherland", "Netherlands"},
3613 {"New Guinea", "Papua New Guinea"},
3614 {"OMN", "Oman"},
3615 {"P, R, China", "China"},
3616 {"P.R. China", "China"},
3617 {"P.R.China", "China"},
3618 {"PAK", "Pakistan"},
3619 {"PAN", "Panama"},
3620 {"PCN", "Pitcairn"},
3621 {"PER", "Peru"},
3622 {"PHL", "Philippines"},
3623 {"PLW", "Palau"},
3624 {"PNG", "Papua New Guinea"},
3625 {"POL", "Poland"},
3626 {"PRI", "Puerto Rico"},
3627 {"PRK", "North Korea"},
3628 {"PRT", "Portugal"},
3629 {"PRY", "Paraguay"},
3630 {"PSE", "Palestinian Territory"},
3631 {"PYF", "French Polynesia"},
3632 {"People's Republic of China", "China"},
3633 {"Pr China", "China"},
3634 {"Prchina", "China"},
3635 {"QAT", "Qatar"},
3636 {"REU", "Reunion"},
3637 {"ROU", "Romania"},
3638 {"RUS", "Russia"},
3639 {"RWA", "Rwanda"},
3640 {"Republic of Congo", "Republic of the Congo"},
3641 {"SAU", "Saudi Arabia"},
3642 {"SDN", "Sudan"},
3643 {"SEN", "Senegal"},
3644 {"SGP", "Singapore"},
3645 {"SGS", "South Georgia and the South Sandwich Islands"},
3646 {"SHN", "Saint Helena"},
3647 {"SJM", "Svalbard and Jan Mayen"},
3648 {"SLB", "Solomon Islands"},
3649 {"SLE", "Sierra Leone"},
3650 {"SLV", "El Salvador"},
3651 {"SMR", "San Marino"},
3652 {"SOM", "Somalia"},
3653 {"SPM", "Saint Pierre and Miquelon"},
3654 {"SRB", "Serbia"},
3655 {"SSD", "South Sudan"},
3656 {"STP", "Sao Tome and Principe"},
3657 {"SUR", "Suriname"},
3658 {"SVK", "Slovakia"},
3659 {"SVN", "Slovenia"},
3660 {"SWE", "Sweden"},
3661 {"SWZ", "Eswatini"},
3662 {"SXM", "Sint Maarten (Dutch part)"},
3663 {"SYC", "Seychelles"},
3664 {"SYR", "Syrian Arab Republic"},
3665 {"Saint Kitts & Nevis", "Saint Kitts and Nevis"},
3666 {"Saint Pierre & Miquelon", "Saint Pierre and Miquelon"},
3667 {"Saint Vincent & Grenadines", "Saint Vincent and the Grenadines"},
3668 {"Saint Vincent & the Grenadines", "Saint Vincent and the Grenadines"},
3669 {"Saint Vincent and Grenadines", "Saint Vincent and the Grenadines"},
3670 {"San Tome and Principe Island", "Sao Tome and Principe"},
3671 {"Sao Tome & Principe", "Sao Tome and Principe"},
3672 {"South Georgia & South Sandwich Islands", "South Georgia and the South Sandwich Islands"},
3673 {"South Georgia & the South Sandwich Islands", "South Georgia and the South Sandwich Islands"},
3674 {"St Helena", "Saint Helena"},
3675 {"St Lucia", "Saint Lucia"},
3676 {"St Pierre and Miquelon", "Saint Pierre and Miquelon"},
3677 {"St Vincent and the Grenadines", "Saint Vincent and the Grenadines"},
3678 {"St. Helena", "Saint Helena"},
3679 {"St. Lucia", "Saint Lucia"},
3680 {"St. Pierre and Miquelon", "Saint Pierre and Miquelon"},
3681 {"St. Vincent and the Grenadines", "Saint Vincent and the Grenadines"},
3682 {"TCA", "Turks and Caicos Islands"},
3683 {"TCD", "Chad"},
3684 {"TGO", "Togo"},
3685 {"THA", "Thailand"},
3686 {"TJK", "Tajikistan"},
3687 {"TKL", "Tokelau"},
3688 {"TKM", "Turkmenistan"},
3689 {"TLS", "Timor-Leste"},
3690 {"TON", "Tonga"},
3691 {"TTO", "Trinidad and Tobago"},
3692 {"TUN", "Tunisia"},
3693 {"TUR", "Turkey"},
3694 {"TUV", "Tuvalu"},
3695 {"TWN", "Taiwan"},
3696 {"TZA", "Tanzania"},
3697 {"The Netherlands", "Netherlands"},
3698 {"Trinidad & Tobago", "Trinidad and Tobago"},
3699 {"Turks & Caicos", "Turks and Caicos Islands"},
3700 {"Turks & Caicos Islands", "Turks and Caicos Islands"},
3701 {"Turks and Caicos", "Turks and Caicos Islands"},
3702 {"U.S.A.", "USA"},
3703 {"UGA", "Uganda"},
3704 {"UK", "United Kingdom"},
3705 {"UKR", "Ukraine"},
3706 {"UMI", "United States Minor Outlying Islands"},
3707 {"URY", "Uruguay"},
3708 {"UZB", "Uzbekistan"},
3709 {"United States", "USA"},
3710 {"United States of America", "USA"},
3711 {"VAT", "Holy See (Vatican City State)"},
3712 {"VCT", "Saint Vincent and the Grenadines"},
3713 {"VEN", "Venezuela"},
3714 {"VGB", "British Virgin Islands"},
3715 {"VIR", "Virgin Islands"},
3716 {"VNM", "Viet Nam"},
3717 {"VUT", "Vanuatu"},
3718 {"Vietnam", "Viet Nam"},
3719 {"WLF", "Wallis and Futuna"},
3720 {"WSM", "Samoa"},
3721 {"YEM", "Yemen"},
3722 {"ZAF", "South Africa"},
3723 {"ZMB", "Zambia"},
3724 {"ZWE", "Zimbabwe"},
3725 {"the Netherlands", "Netherlands"}
3726 };
3727 
3729 
3730 // for GP-24841
3732 {"Burma", "Myanmar"},
3733 {"Siam", "Thailand"}
3734 };
3736 
3737 // for GB-7408
3739 {"Antigua", "Antigua and Barbuda: Antigua"},
3740 {"Ashmore Island", "Ashmore and Cartier Islands: Ashmore Island"},
3741 {"Autonomous Region of the Azores", "Portugal: Azores"},
3742 {"Azores", "Portugal: Azores"},
3743 {"Barbuda", "Antigua and Barbuda: Barbuda"},
3744 {"Bassas da India", "French Southern and Antarctic Lands: Bassas da India"},
3745 {"Caicos Islands", "Turks and Caicos Islands: Caicos Islands"},
3746 {"Canary Islands", "Spain: Canary Islands"},
3747 {"Cartier Island", "Ashmore and Cartier Islands: Cartier Island"},
3748 {"East Germany", "Germany: East Germany"},
3749 {"El Hierro", "Spain: El Hierro"},
3750 {"Europa Island", "French Southern and Antarctic Lands: Europa Island"},
3751 {"Fuerteventura", "Spain: Fuerteventura"},
3752 {"Glorioso Islands", "French Southern and Antarctic Lands: Glorioso Islands"},
3753 {"Gran Canaria", "Spain: Gran Canaria"},
3754 {"Grenadines", "Saint Vincent and the Grenadines: Grenadines"},
3755 {"Heard Island", "Heard Island and McDonald Islands: Heard Island"},
3756 {"Ile Amsterdam", "French Southern and Antarctic Lands: Ile Amsterdam"},
3757 {"Ile Saint-Paul", "French Southern and Antarctic Lands: Ile Saint-Paul"},
3758 {"Iles Crozet", "French Southern and Antarctic Lands: Iles Crozet"},
3759 {"Iles Kerguelen", "French Southern and Antarctic Lands: Iles Kerguelen"},
3760 {"Juan de Nova Island", "French Southern and Antarctic Lands: Juan de Nova Island"},
3761 {"La Gomera", "Spain: La Gomera"},
3762 {"La Graciosa", "Spain: La Graciosa"},
3763 {"La Palma", "Spain: La Palma"},
3764 {"Lanzarote", "Spain: Lanzarote"},
3765 {"Madeira", "Portugal: Madeira"},
3766 {"McDonald Island", "Heard Island and McDonald Islands: McDonald Island"},
3767 {"McDonald Islands", "Heard Island and McDonald Islands: McDonald Islands"},
3768 {"Miquelon", "Saint Pierre and Miquelon: Miquelon"},
3769 {"Nevis", "Saint Kitts and Nevis: Nevis"},
3770 {"Principe", "Sao Tome and Principe: Principe"},
3771 {"Saint Kitts", "Saint Kitts and Nevis: Saint Kitts"},
3772 {"Saint Pierre", "Saint Pierre and Miquelon: Saint Pierre"},
3773 {"Saint Vincent", "Saint Vincent and the Grenadines: Saint Vincent"},
3774 {"Sao Tome", "Sao Tome and Principe: Sao Tome"},
3775 {"Scotland", "United Kingdom: Scotland"},
3776 {"South Sandwich Islands", "South Georgia and the South Sandwich Islands: South Sandwich Islands"},
3777 {"St Kitts", "Saint Kitts and Nevis: Saint Kitts"},
3778 {"St Pierre", "Saint Pierre and Miquelon: Saint Pierre"},
3779 {"St Thomas", "USA: Saint Thomas"},
3780 {"St Vincent", "Saint Vincent and the Grenadines: Saint Vincent"},
3781 {"St. Kitts", "Saint Kitts and Nevis: Saint Kitts"},
3782 {"St. Pierre", "Saint Pierre and Miquelon: Saint Pierre"},
3783 {"St. Thomas", "USA: Saint Thomas"},
3784 {"St. Vincent", "Saint Vincent and the Grenadines: Saint Vincent"},
3785 {"Tenerife", "Spain: Tenerife"},
3786 {"Tobago", "Trinidad and Tobago: Tobago"},
3787 {"Trinidad", "Trinidad and Tobago: Trinidad"},
3788 {"Tromelin Island", "French Southern and Antarctic Lands: Tromelin Island"},
3789 {"Turks Islands", "Turks and Caicos Islands: Turks Islands"},
3790 {"Wales", "United Kingdom: Wales"},
3791 {"West Germany", "Germany: West Germany"},
3792 
3793 };
3795 
3796 
// US state names plus "District of Columbia" (51 entries). WholeCountryFix
// compares its input against these case-insensitively and uses the spelling
// stored here in the corrected value.
// Both the characters and the pointers are const, matching the other name
// tables in this file, so the table cannot be modified at run time.
static const char* const s_USAStates[] = {
    "Alabama",
    "Alaska",
    "Arizona",
    "Arkansas",
    "California",
    "Colorado",
    "Connecticut",
    "Delaware",
    "District of Columbia",
    "Florida",
    "Georgia",
    "Hawaii",
    "Idaho",
    "Illinois",
    "Indiana",
    "Iowa",
    "Kansas",
    "Kentucky",
    "Louisiana",
    "Maine",
    "Maryland",
    "Massachusetts",
    "Michigan",
    "Minnesota",
    "Mississippi",
    "Missouri",
    "Montana",
    "Nebraska",
    "Nevada",
    "New Hampshire",
    "New Jersey",
    "New Mexico",
    "New York",
    "North Carolina",
    "North Dakota",
    "Ohio",
    "Oklahoma",
    "Oregon",
    "Pennsylvania",
    "Rhode Island",
    "South Carolina",
    "South Dakota",
    "Tennessee",
    "Texas",
    "Utah",
    "Vermont",
    "Virginia",
    "Washington",
    "West Virginia",
    "Wisconsin",
    "Wyoming"
};
3850 
3852 {
3853  vector<string> words;
3854  NStr::Split(phrase, " \t\r\n", words);
3855  for(vector<string>::iterator word = words.begin(); word != words.end(); ++word)
3856  if (!word->empty() && isalpha(word->at(0)))
3857  word->at(0) = (unsigned char)toupper(word->at(0));
3858  return NStr::Join(words," ");
3859 }
3860 
3861 string CCountries::WholeCountryFix(string country)
3862 {
3863  string new_country;
3864  TCStringPairsMap::const_iterator found = k_whole_country_fixes.find(NStr::ToLower(country).c_str());
3865  if (found != k_whole_country_fixes.end()) {
3866  new_country = found->second;
3867  return new_country;
3868  }
3869 
3870  const size_t num_states = sizeof(s_USAStates) / sizeof(s_USAStates[0]);
3871  for (size_t i = 0; i < num_states; ++i) {
3872  if (NStr::EqualNocase(s_USAStates[i], country)) {
3873  new_country = "USA: " + CTempString(s_USAStates[i]);
3874  break;
3875  }
3876  }
3877 
3878  return new_country;
3879 }
3880 
3881 bool CCountries::IsSubstringOfStringInList(const string& phrase, const string& country1, size_t pos1)
3882 {
3883  bool r = false;
3885  {
3886  string country2(*c);
3887  if (country2.length() > country1.length() && NStr::FindNoCase(country2,country1) != NPOS)
3888  {
3889  SIZE_TYPE pos2 = NStr::FindNoCase(phrase,country2);
3890  while (pos2 != NPOS)
3891  {
3892  if (pos2 <= pos1 && pos2+country2.length() >= pos1+country1.length())
3893  r = true;
3894  pos2 = NStr::FindNoCase(phrase,country2,pos2+country2.length());
3895  }
3896  }
3897  }
3898  return r;
3899 }
3900 
3901 bool CCountries::ContainsMultipleCountryNames (const string &phrase)
3902 {
3903  int num_matches = 0;
3905  {
3906  string country(*c);
3907  size_t pos = NStr::FindNoCase(phrase,country);
3908  while (pos != NPOS)
3909  {
3910  if (!((pos+country.length()<phrase.length() && isalpha(phrase[pos+country.length()]))
3911  || (pos > 0 && isalpha(phrase[pos-1]))
3912  || IsSubstringOfStringInList(phrase,country,pos)))
3913  num_matches++;
3914  pos = NStr::FindNoCase(phrase,country,pos+country.length());
3915  }
3916 
3917  }
3918  return (num_matches > 1);
3919 }
3920 
3922 {
3923  string output = country;
3924  ITERATE ( TCStrSet, it, s_CountriesSet ) {
3925  if ( NStr::EqualNocase(country, *it) ) {
3926  output = *it;
3927  }
3928  }
3929  return output;
3930 }
3931 
3932 
3933 void CCountries::x_RemoveDelimitersFromEnds(string& val, bool except_paren)
3934 {
3936  bool any_found = true;
3937  while (!val.empty() && any_found) {
3938  any_found = false;
3939  if (NStr::StartsWith(val, ",")
3940  || NStr::StartsWith(val, ":")
3941  || NStr::StartsWith(val, ".")
3942  || (!except_paren && NStr::StartsWith(val, ")"))) {
3943  val = val.substr(1);
3944  any_found = true;
3946  } else if (NStr::EndsWith(val, ",")
3947  || NStr::EndsWith(val, ":")
3948  || (!except_paren && NStr::EndsWith(val, "("))) {
3949  val = val.substr(0, val.length() - 1);
3950  any_found = true;
3952  } else if (NStr::EndsWith(val, "the") && val.length() > 3 && !isalpha((unsigned char)val[val.length() - 4])) {
3953  val = val.substr(0, val.length() - 4);
3954  any_found = true;
3955  } else if (NStr::EndsWith(val, ".")) {
3956  size_t len = val.length();
3957  if (len > 1 && isspace((unsigned char)val[len - 2])) {
3958  val = val.substr(0, val.length() - 1);
3959  any_found = true;
3961  } else if (len > 5) {
3962  // make sure no spaces or punctuation within 4 characters before '.'
3963  bool do_remove = true;
3964  size_t pos = val.length() - 2;
3965  size_t dist = 0;
3966  while (dist < 4 && do_remove) {
3967  if (isspace((unsigned char)val[pos]) || ispunct((unsigned char)val[pos])) {
3968  do_remove = false;
3969  }
3970  pos--;
3971  dist++;
3972  }
3973  if (do_remove) {
3974  val = val.substr(0, val.length() - 1);
3975  any_found = true;
3976  }
3977  }
3978  }
3979  }
3980 }
3981 
3982 
3983 vector<string> CCountries::x_Tokenize(const string& val)
3984 {
3985  vector<string> tokens;
3986  NStr::Split(val, ",:()", tokens);
3987  // special tokenizing - if tokens contain periods but resulting token is at least four characters long
3988  vector<string>::iterator it = tokens.begin();
3989  while (it != tokens.end()) {
3990  size_t pos = NStr::Find(*it, ".");
3991  if (pos != NPOS && pos > 3 && (*it).length() - pos > 4) {
3992  string first = (*it).substr(0, pos);
3993  string remainder = (*it).substr(pos + 1);
3994  size_t space_pos = NStr::Find(first, " ");
3995  size_t len_to_space = first.length();
3996  while (space_pos != NPOS) {
3997  first = first.substr(space_pos + 1);
3998  len_to_space = first.length();
3999  space_pos = NStr::Find(first, " ");
4000  }
4001  if (len_to_space > 4) {
4002  (*it) = (*it).substr(0, pos);
4003  it = tokens.insert(it, remainder);
4004  } else {
4005  it++;
4006  }
4007  } else {
4008  it++;
4009  }
4010  }
4011  return tokens;
4012 }
4013 
4014 
4015 bool s_ContainsWholeWord(const CTempString test, const CTempString word, NStr::ECase case_sense)
4016 {
4017  size_t start = 0;
4018  size_t tlen = test.length();
4019  size_t wlen = word.length();
4020 
4021  size_t pos = NStr::Find(test, word, case_sense);
4022  while (pos != NPOS) {
4023  size_t p = start + pos;
4024  if ( (p == 0 || !isalpha((unsigned char)test[p - 1])) &&
4025  (p + wlen >= tlen || !isalpha((unsigned char)test[p + wlen])) ) {
4026  return true;
4027  }
4028  start = p + 1;
4029  pos = NStr::Find(CTempString(test, start, tlen - start), word, case_sense);
4030  }
4031  return false;
4032 }
4033 
4034 
4035 bool s_SuppressCountryFix(const string& test)
4036 {
4037  if (s_ContainsWholeWord(test, "Sea", NStr::eNocase)) {
4038  return true;
4039  } else if (s_ContainsWholeWord(test, "USSR", NStr::eNocase)) {
4040  return true;
4041  }
4042  return false;
4043 }
4044 
4045 
4047 (const TCStringPairsMap& fix_map,
4048  const vector<string>& countries,
4049  string& valid_country,
4050  string& orig_valid_country,
4051  bool& too_many_countries,
4052  bool& bad_cap)
4053 {
4054  for (auto country : countries) {
4055  if (!country.empty() && !too_many_countries)
4056  {
4057  string check = country;
4060 
4061  bool check_has_bad_cap = false;
4062  if (IsValid(check,check_has_bad_cap))
4063  {
4064  if (valid_country.empty())
4065  {
4066  valid_country = check;
4067  orig_valid_country = check;
4068  bad_cap = check_has_bad_cap;
4069  }
4070  else
4071  {
4072  too_many_countries = true;
4073  }
4074  }
4075  else // see if this is a fixable country
4076  {
4077  TCStringPairsMap::const_iterator found = fix_map.find(check.c_str());
4078  if (found != fix_map.end())
4079  {
4080  if (valid_country.empty())
4081  {
4082  valid_country = found->second;
4083  orig_valid_country = check;
4084  }
4085  else
4086  {
4087  too_many_countries = true;
4088  }
4089  }
4090  }
4091  }
4092  }
4093 }
4094 
4095 // start of RW-1278
4096 
// Collapses every run of consecutive ' ' characters in `val` to a single
// space, in place. Only the space character is affected; tabs and other
// whitespace are left as they are, and single leading/trailing spaces are
// kept.
//
// Returns true when `val` was changed, false when it was empty or already
// free of repeated spaces.
//
// The compaction is done inside the string itself, so no temporary buffer is
// allocated and bytes after an embedded NUL are preserved rather than cut off.
bool s_CompressRunsOfSpaces(string& val)
{
    string::size_type out = 0;      // next write position
    bool prev_was_space = false;

    for (string::size_type in = 0; in < val.size(); ++in) {
        const char ch = val[in];
        const bool is_space = (ch == ' ');
        if (is_space && prev_was_space) {
            continue;               // skip the rest of a run of spaces
        }
        val[out++] = ch;
        prev_was_space = is_space;
    }

    if (out == val.size()) {
        return false;               // nothing was removed
    }
    val.resize(out);
    return true;
}
4143 
4146  { "Acadia Parish", "Acadia Parish" },
4147  { "AcadiaParish", "Acadia Parish" },
4148  { "Allen Parish", "Allen Parish" },
4149  { "AllenParish", "Allen Parish" },
4150  { "Ascension Parish", "Ascension Parish" },
4151  { "AscensionParish", "Ascension Parish" },
4152  { "Assumption Parish", "Assumption Parish" },
4153  { "AssumptionParish", "Assumption Parish" },
4154  { "Avoyelles Parish", "Avoyelles Parish" },
4155  { "AvoyellesParish", "Avoyelles Parish" },
4156  { "Beauregard Parish", "Beauregard Parish" },
4157  { "BeauregardParish", "Beauregard Parish" },
4158  { "Bienville Parish", "Bienville Parish" },
4159  { "BienvilleParish", "Bienville Parish" },
4160  { "Bossier Parish", "Bossier Parish" },
4161  { "BossierParish", "Bossier Parish" },
4162  { "Caddo Parish", "Caddo Parish" },
4163  { "CaddoParish", "Caddo Parish" },
4164  { "Calcasieu Parish", "Calcasieu Parish" },
4165  { "CalcasieuParish", "Calcasieu Parish" },
4166  { "Caldwell Parish", "Caldwell Parish" },
4167  { "CaldwellParish", "Caldwell Parish" },
4168  { "Cameron Parish", "Cameron Parish" },
4169  { "CameronParish", "Cameron Parish" },
4170  { "Catahoula Parish", "Catahoula Parish" },
4171  { "CatahoulaParish", "Catahoula Parish" },
4172  { "Claiborne Parish", "Claiborne Parish" },
4173  { "ClaiborneParish", "Claiborne Parish" },
4174  { "Concordia Parish", "Concordia Parish" },
4175  { "ConcordiaParish", "Concordia Parish" },
4176  { "DeSoto Parish", "DeSoto Parish" },
4177  { "DeSotoParish", "DeSoto Parish" },
4178  { "East Baton Rouge Parish", "East Baton Rouge Parish" },
4179  { "East Carroll Parish", "East Carroll Parish" },
4180  { "East Feliciana Parish", "East Feliciana Parish" },
4181  { "EastBatonRougeParish", "East Baton Rouge Parish" },
4182  { "EastCarrollParish", "East Carroll Parish" },
4183  { "EastFelicianaParish", "East Feliciana Parish" },
4184  { "Evangeline Parish", "Evangeline Parish" },
4185  { "EvangelineParish", "Evangeline Parish" },
4186  { "Franklin Parish", "Franklin Parish" },
4187  { "FranklinParish", "Franklin Parish" },
4188  { "Grant Parish", "Grant Parish" },
4189  { "GrantParish", "Grant Parish" },
4190  { "Iberia Parish", "Iberia Parish" },
4191  { "IberiaParish", "Iberia Parish" },
4192  { "Iberville Parish", "Iberville Parish" },
4193  { "IbervilleParish", "Iberville Parish" },
4194  { "Jackson Parish", "Jackson Parish" },
4195  { "JacksonParish", "Jackson Parish" },
4196  { "Jefferson Davis Parish", "Jefferson Davis Parish" },
4197  { "Jefferson Parish", "Jefferson Parish" },
4198  { "JeffersonDavisParish", "Jefferson Davis Parish" },
4199  { "JeffersonParish", "Jefferson Parish" },
4200  { "Lafayette Parish", "Lafayette Parish" },
4201  { "LafayetteParish", "Lafayette Parish" },
4202  { "Lafourche Parish", "Lafourche Parish" },
4203  { "LafourcheParish", "Lafourche Parish" },
4204  { "LaSalle Parish", "LaSalle Parish" },
4205  { "LaSalleParish", "LaSalle Parish" },
4206  { "Lincoln Parish", "Lincoln Parish" },
4207  { "LincolnParish", "Lincoln Parish" },
4208  { "Livingston Parish", "Livingston Parish" },
4209  { "LivingstonParish", "Livingston Parish" },
4210  { "Madison Parish", "Madison Parish" },
4211  { "MadisonParish", "Madison Parish" },
4212  { "Morehouse Parish", "Morehouse Parish" },
4213  { "MorehouseParish", "Morehouse Parish" },
4214  { "Natchitoches Parish", "Natchitoches Parish" },
4215  { "NatchitochesParish", "Natchitoches Parish" },
4216  { "Orleans Parish", "Orleans Parish" },
4217  { "OrleansParish", "Orleans Parish" },
4218  { "Ouachita Parish", "Ouachita Parish" },
4219  { "OuachitaParish", "Ouachita Parish" },
4220  { "Plaquemines Parish", "Plaquemines Parish" },
4221  { "PlaqueminesParish", "Plaquemines Parish" },
4222  { "Pointe Coupee Parish", "Pointe Coupee Parish" },
4223  { "PointeCoupeeParish", "Pointe Coupee Parish" },
4224  { "Rapides Parish", "Rapides Parish" },
4225  { "RapidesParish", "Rapides Parish" },
4226  { "Red River Parish", "Red River Parish" },
4227  { "RedRiverParish", "Red River Parish" },
4228  { "Richland Parish", "Richland Parish" },
4229  { "RichlandParish", "Richland Parish" },
4230  { "Sabine Parish", "Sabine Parish" },
4231  { "SabineParish", "Sabine Parish" },
4232  { "St. Bernard Parish", "St. Bernard Parish" },
4233  { "St. Charles Parish", "St. Charles Parish" },
4234  { "St. Helena Parish", "St. Helena Parish" },
4235  { "St. James Parish", "St. James Parish" },
4236  { "St. John the Baptist Parish", "St. John the Baptist Parish" },
4237  { "St. Landry Parish", "St. Landry Parish" },
4238  { "St. Martin Parish", "St. Martin Parish" },
4239  { "St. Mary Parish", "St. Mary Parish" },
4240  { "St. Tammany Parish", "St. Tammany Parish" },
4241  { "St.BernardParish", "St. Bernard Parish" },
4242  { "St.CharlesParish", "St. Charles Parish" },
4243  { "St.HelenaParish", "St. Helena Parish" },
4244  { "St.JamesParish", "St. James Parish" },
4245  { "St.JohntheBaptistParish", "St. John the Baptist Parish" },
4246  { "St.LandryParish", "St. Landry Parish" },
4247  { "St.MartinParish", "St. Martin Parish" },
4248  { "St.MaryParish", "St. Mary Parish" },
4249  { "St.TammanyParish", "St. Tammany Parish" },
4250  { "Tangipahoa Parish", "Tangipahoa Parish" },
4251  { "TangipahoaParish", "Tangipahoa Parish" },
4252  { "Tensas Parish", "Tensas Parish" },
4253  { "TensasParish", "Tensas Parish" },
4254  { "Terrebonne Parish", "Terrebonne Parish" },
4255  { "TerrebonneParish", "Terrebonne Parish" },
4256  { "Union Parish", "Union Parish" },
4257  { "UnionParish", "Union Parish" },
4258  { "Vermilion Parish", "Vermilion Parish" },
4259  { "VermilionParish", "Vermilion Parish" },
4260  { "Vernon Parish", "Vernon Parish" },
4261  { "VernonParish", "Vernon Parish" },
4262  { "Washington Parish", "Washington Parish" },
4263  { "WashingtonParish", "Washington Parish" },
4264  { "Webster Parish", "Webster Parish" },
4265  { "WebsterParish", "Webster Parish" },
4266  { "West Baton Rouge Parish", "West Baton Rouge Parish" },
4267  { "West Carroll Parish", "West Carroll Parish" },
4268  { "West Feliciana Parish", "West Feliciana Parish" },
4269  { "WestBatonRougeParish", "West Baton Rouge Parish" },
4270  { "WestCarrollParish", "West Carroll Parish" },
4271  { "WestFelicianaParish", "West Feliciana Parish" },
4272  { "Winn Parish", "Winn Parish" },
4273  { "WinnParish", "Winn Parish" }
4274 };
4275 
4278 
4279 bool s_IsParish ( string& parish ) {
4280 
4281  if ( parish.empty() ) {
4282  return false;
4283  }
4284 
4285  TParishMap::const_iterator parish_find_iter = parishAbbrevMap.find(parish.c_str());
4286  if ( parish_find_iter != parishAbbrevMap.end() ) {
4287  // replace with full parish name
4288  parish = parish_find_iter->second;
4289  return true;
4290  }
4291 
4292  return false;
4293 }
4294 
4297  { "AK", "Alaska" },
4298  { "AL", "Alabama" },
4299  { "Alabama", "Alabama" },
4300  { "Alaska", "Alaska" },
4301  { "American Samoa", "American Samoa" },
4302  { "AR", "Arkansas" },
4303  { "Arizona", "Arizona" },
4304  { "Arkansas", "Arkansas" },
4305  { "AS", "American Samoa" },
4306  { "AZ", "Arizona" },
4307  { "CA", "California" },
4308  { "California", "California" },
4309  { "CO", "Colorado" },
4310  { "Colorado", "Colorado" },
4311  { "Connecticut", "Connecticut" },
4312  { "CT", "Connecticut" },
4313  { "DC", "District of Columbia" },
4314  { "DE", "Delaware" },
4315  { "Delaware", "Delaware" },
4316  { "District of Columbia", "District of Columbia" },
4317  { "FL", "Florida" },
4318  { "Florida", "Florida" },
4319  { "GA", "Georgia" },
4320  { "Georgia", "Georgia" },
4321  { "GU", "Guam" },
4322  { "Guam", "Guam" },
4323  { "Hawaii", "Hawaii" },
4324  { "HI", "Hawaii" },
4325  { "IA", "Iowa" },
4326  { "ID", "Idaho" },
4327  { "Idaho", "Idaho" },
4328  { "IL", "Illinois" },
4329  { "Illinois", "Illinois" },
4330  { "IN", "Indiana" },
4331  { "Indiana", "Indiana" },
4332  { "Iowa", "Iowa" },
4333  { "Kansas", "Kansas" },
4334  { "Kentucky", "Kentucky" },
4335  { "KS", "Kansas" },
4336  { "KY", "Kentucky" },
4337  { "LA", "Louisiana" },
4338  { "Louisiana", "Louisiana" },
4339  { "MA", "Massachusetts" },
4340  { "Maine", "Maine" },
4341  { "Maryland", "Maryland" },
4342  { "Massachusetts", "Massachusetts" },
4343  { "MD", "Maryland" },
4344  { "ME", "Maine" },
4345  { "MI", "Michigan" },
4346  { "Michigan", "Michigan" },
4347  { "Minnesota", "Minnesota" },
4348  { "Mississippi", "Mississippi" },
4349  { "Missouri", "Missouri" },
4350  { "MN", "Minnesota" },
4351  { "MO", "Missouri" },
4352  { "Montana", "Montana" },
4353  { "MS", "Mississippi" },
4354  { "MT", "Montana" },
4355  { "NC", "North Carolina" },
4356  { "ND", "North Dakota" },
4357  { "NE", "Nebraska" },
4358  { "Nebraska", "Nebraska" },
4359  { "Nevada", "Nevada" },
4360  { "New Hampshire", "New Hampshire" },
4361  { "New Jersey", "New Jersey" },
4362  { "New Mexico", "New Mexico" },
4363  { "New York", "New York" },
4364  { "NH", "New Hampshire" },
4365  { "NJ", "New Jersey" },
4366  { "NM", "New Mexico" },
4367  { "North Carolina", "North Carolina" },
4368  { "North Dakota", "North Dakota" },
4369  { "NV", "Nevada" },
4370  { "NY", "New York" },
4371  { "OH", "Ohio" },
4372  { "Ohio", "Ohio" },
4373  { "OK", "Oklahoma" },
4374  { "Oklahoma", "Oklahoma" },
4375  { "OR", "Oregon" },
4376  { "Oregon", "Oregon" },
4377  { "PA", "Pennsylvania" },
4378  { "Pennsylvania", "Pennsylvania" },
4379  { "PR", "Puerto Rico" },
4380  { "Puerto Rico", "Puerto Rico" },
4381  { "Rhode Island", "Rhode Island" },
4382  { "RI", "Rhode Island" },
4383  { "SC", "South Carolina" },
4384  { "SD", "South Dakota" },
4385  { "South Carolina", "South Carolina" },
4386  { "South Dakota", "South Dakota" },
4387  { "Tennessee", "Tennessee" },
4388  { "Texas", "Texas" },
4389  { "TN", "Tennessee" },
4390  { "TX", "Texas" },
4391  { "US Virgin Islands", "US Virgin Islands" },
4392  { "UT", "Utah" },
4393  { "Utah", "Utah" },
4394  { "VA", "Virginia" },
4395  { "Vermont", "Vermont" },
4396  { "VI", "US Virgin Islands" },
4397  { "Virgin Islands", "US Virgin Islands" },
4398  { "Virginia", "Virginia" },
4399  { "VT", "Vermont" },
4400  { "WA", "Washington" },
4401  { "Washington", "Washington" },
4402  { "West Virginia", "West Virginia" },
4403  { "WI", "Wisconsin" },
4404  { "Wisconsin", "Wisconsin" },
4405  { "WV", "West Virginia" },
4406  { "WY", "Wyoming" },
4407  { "Wyoming", "Wyoming" }
4408 };
4409 
4412 
4413 bool s_IsState ( string& state, bool& modified ) {
4414 
4415  if ( state.empty() ) {
4416  return false;
4417  }
4418 
4419  string original = state;
4420  string working = state;
4421 
4422  if ( NStr::StartsWith ( working, "State of ", NStr::eNocase )) {
4423  NStr::TrimPrefixInPlace ( working, "State of ", NStr::eNocase );
4424  }
4425 
4426  if ( NStr::StartsWith ( working, "Commonwealth of ", NStr::eNocase )) {
4427  NStr::TrimPrefixInPlace ( working, "Commonwealth of ", NStr::eNocase );
4428  }
4429 
4430  if ( NStr::EndsWith ( working, " State", NStr::eNocase )) {
4431  NStr::TrimSuffixInPlace ( working, " State", NStr::eNocase );
4432  }
4433 
4434  NStr::TruncateSpacesInPlace ( working );
4435 
4436  TStateMap::const_iterator state_find_iter = stateAbbrevMap.find(working.c_str());
4437  if ( state_find_iter != stateAbbrevMap.end() ) {
4438  // replace with full state name
4439  state = state_find_iter->second;
4440  // report conversion from two-letter, changed capitalization, or prefix/suffix removal
4441  if ( ! NStr::Equal ( original, state )) {
4442  modified = true;
4443  }
4444  return true;
4445  }
4446 
4447  return false;
4448 }
4449 
4451 
4452  if ( country.empty() ) {
4453  return CCountries::e_NoResult;
4454  }
4455 
4456  // make working copy
4457  string original = country;
4458  string working = country;
4459 
4460  // remove flanking quotation marks - if CCountries::NewFixCountry not called
4461  if ( NStr::StartsWith ( working, "\"" ) && NStr::EndsWith ( working, "\"" )) {
4462  working = working.substr ( 1, working.length() - 2 );
4463  }
4464 
4465  // remove flanking spaces
4466  NStr::TruncateSpacesInPlace ( working );
4467 
4468  // separate strings before and after colon
4469  string frst, scnd;
4470  NStr::SplitInTwo ( working, ":", frst, scnd );
4471 
4472  NStr::TruncateSpacesInPlace ( frst );
4473  NStr::TruncateSpacesInPlace ( scnd );
4474 
4475  // confirm that country is USA
4476  if ( ! NStr::EqualNocase ( frst, "USA") && ! NStr::EqualNocase ( frst, "US")) {
4477  // if not, first try rescuing US territory
4478  working = CCountries::NewFixCountry(working, true);
4479  NStr::SplitInTwo ( working, ":", frst, scnd );
4480  NStr::TruncateSpacesInPlace ( frst );
4481  NStr::TruncateSpacesInPlace ( scnd );
4482  if ( ! NStr::EqualNocase ( frst, "USA") && ! NStr::EqualNocase ( frst, "US")) {
4483  return CCountries::e_NotUSA;
4484  }
4485  }
4486 
4487  // split state/county/city clauses at commas
4488  vector<string> components;
4489  NStr::Split(scnd, ",", components);
4490 
4491  // check for only country
4492  if ( components.size() < 1 ) {
4493  country = "USA";
4494  return CCountries::e_Valid;
4495  }
4496 
4497  for ( int j = 0; j < components.size(); j++ ) {
4498  // remove flanking spaces around components
4499  NStr::TruncateSpacesInPlace ( components[j] );
4500  s_CompressRunsOfSpaces ( components[j] );
4501  // clean up runon strings like EastBatonRougeParish
4502  if ( NStr::EndsWith ( components[j], "Parish", NStr::eNocase )) {
4503  s_IsParish( components[j] );
4504  }
4505  }
4506 
4507  // bool any_modified = false;
4508  int num_states = 0;
4509  int match = -1;
4510 
4511  // string* first = 0;
4512  // string* last = 0;
4513 
4514  // has multiple components
4515  // int max = components.size() - 1;
4516  for ( int j = 0; j < components.size(); j++ ) {
4517  bool modified = false;
4518  if ( s_IsState ( components[j], modified )) {
4519  /*
4520  if (modified) {
4521  any_modified = true;
4522  }
4523  */
4524  if ( match < 0 ) {
4525  // record position of first s_IsState match
4526  match = j;
4527  }
4528  // count successful matches
4529  num_states++;
4530  /*
4531  if ( j == 0 ) {
4532  first = &(components[j]);
4533  }
4534  if ( j == max ) {
4535  last = &(components[j]);
4536  }
4537  */
4538  }
4539  }
4540 
4541  // generate result
4542  string res;
4543  res.append ("USA: ");
4544  string pfx = "";
4545 
4546  if ( match >= 0 ) {
4547  // move first state matched to first position
4548  res.append ( components[match] );
4549  pfx = ", ";
4550  }
4551 
4552  for ( int j = 0; j < components.size(); j++ ) {
4553  if ( j == match) continue;
4554  res.append ( pfx );
4555  res.append ( components[j] );
4556  pfx = ", ";
4557  }
4558 
4559  country = res;
4560 
4561  if ( match < 0 ) {
4562  return CCountries::e_Missing;
4563  } else if ( num_states > 1 ) {
4564  return CCountries::e_Ambiguous;
4565  } else if ( ! NStr::Equal ( original, res )) {
4566  return CCountries::e_Corrected;
4567  }
4568 
4569  return CCountries::e_Valid;
4570 }
4571 
4573 
4575 static bool exceptions_initialized = false;
4576 
4577 void CCountries::ReadUSAExceptionMap (CCountries::TUsaExceptionMap& exceptions, const string& exception_file ) {
4578 
4579  if ( ! exception_file.empty()) {
4580 
4581  TNCBITSVStream my_stream (exception_file);
4582  for ( const auto & row : my_stream ) {
4583  TFieldNo number_of_fields = row. GetNumberOfFields();
4584  if ( number_of_fields != 2 ) continue;
4585  string fr = row[0].Get<string>();
4586  string to = row[1].Get<string>();
4587  exceptions [fr] = to;
4588  }
4589  }
4590 }
4591 
4593 
4594  // clear previous map
4595  exception_map.clear();
4596 
4597  // initialize internal exception map
4598  for ( const auto & itm : exceptions ) {
4599  string fr = itm.first;
4600  string to = itm.second;
4601 
4602  // ensure colon is followed by space to match initial correction
4603  string f1, f2;
4604  NStr::SplitInTwo ( fr, ":", f1, f2 );
4607  if ( ! f1.empty() && ! f2.empty()) {
4608  fr = f1 + ": " + f2;
4609  }
4610 
4611  exception_map [fr] = to;
4612  }
4613 
4614  exceptions_initialized = true;
4615 }
4616 
4617 void CCountries::LoadUSAExceptionMap (const string& exception_file ) {
4618 
4619  if ( ! exception_file.empty()) {
4620 
4621  TUsaExceptionMap exceptions;
4622  ReadUSAExceptionMap ( exceptions, exception_file );
4623  LoadUSAExceptionMap ( exceptions );
4624  }
4625 }
4626 
4627 string CCountries::USAStateCleanup ( const string& country, CCountries::EStateCleanup& type ) {
4628 
4629  // call algorithmic mapping function
4630  string working = country;
4631  type = s_DoUSAStateCleanup ( working );
4632 
4633  // apply exceptions from preloaded data file
4634  if ( exceptions_initialized ) {
4635  string corrected = exception_map [working];
4636  if ( ! corrected.empty()) {
4637  // presence in map here will disambiguate otherwise ambiguous name pair,
4638  // thus self-entries need to be added to the ambiguous state exception list
4639  if ( ! NStr::StartsWith ( corrected, "USA" )) {
4640  type = e_NotUSA;
4641  } else if ( NStr::Equal ( corrected, working ) && NStr::Equal ( corrected, country )) {
4642  type = e_Valid;
4643  } else {
4644  type = e_Corrected;
4645  }
4646  return corrected;
4647  }
4648  }
4649 
4650  if ( ! NStr::StartsWith ( working, "USA" )) {
4651  type = e_NotUSA;
4652  }
4653  return working;
4654 }
4655 
4656 string CCountries::USAStateCleanup ( const string& country ) {
4657 
4659  return USAStateCleanup ( country, type );
4660 }
4661 
4662 // end of RW-1278
4663 
4664 string CCountries::NewFixCountry (const string& test, bool us_territories)
4665 {
4666  // change requested for JIRA:SQD-1410
4667  if (s_SuppressCountryFix(test)) {
4668  if (IsValid(test)) {
4669  return test;
4670  } else {
4671  return kEmptyStr;
4672  }
4673  }
4674 
4675  // JIRA:RW-2243 Micronesia is the only entry with a comma, special case test here
4676  string micronesia = "Micronesia, Federated States of";
4677  if (NStr::EqualNocase(test, micronesia)) {
4678  if (! NStr::EqualCase(test, micronesia)) {
4679  return micronesia;
4680  }
4681  }
4682  // JIRA:RW-2243 also special case to convert old Micronesia name to new name
4683  if (NStr::EqualNocase(test, "Micronesia")) {
4684  return micronesia;
4685  }
4686 
4687  string input = test;
4688  if (NStr::StartsWith(input, "\"") && NStr::EndsWith(input, "\"")) {
4689  input = input.substr(1, input.length() - 2);
4690  }
4692 
4693  if (NStr::EndsWith(input, ":")) {
4694  input = input.substr(0, input.length() - 1);
4696  }
4697 
4698  string usa1,usa2;
4699  NStr::SplitInTwo(input, ":", usa1, usa2);
4700  if (!usa1.empty() && !usa2.empty()) {
4703  if (NStr::EqualNocase(usa1, "U.S.A.") || NStr::EqualNocase(usa1, "United States") || NStr::EqualNocase(usa1, "United States of America")) {
4704  input = "USA: " + usa2;
4705  }
4706  }
4707 
4708  auto old_name_fix = k_old_country_name_fixes.find(input.c_str());
4709  if (old_name_fix != k_old_country_name_fixes.end()) {
4710  input = old_name_fix->second;
4711  return input;
4712  }
4713 
4714  if (us_territories) {
4715  if ( NStr::StartsWith( input, "Puerto Rico", NStr::eNocase) || NStr::StartsWith( input, "Guam", NStr::eNocase) || NStr::StartsWith( input, "American Samoa", NStr::eNocase) ) {
4716  input = "USA: " + input;
4719  return input;
4720  } else if ( NStr::StartsWith( input, "Virgin Islands", NStr::eNocase) ) {
4721  input = "USA: US " + input;
4724  return input;
4725  }
4726  }
4727 
4728  if (IsValid(input)) {
4730  return input;
4731  }
4732  string new_country = WholeCountryFix(input);
4733  if (!new_country.empty())
4734  return new_country;
4735 
4736  bool too_many_countries = false;
4737  bool bad_cap = false;
4738  vector<string> countries = x_Tokenize(input);
4739  string valid_country;
4740  string orig_valid_country;
4741 
4742  x_FindCountryName(k_country_name_fixes, countries, valid_country, orig_valid_country, too_many_countries, bad_cap);
4743  if (valid_country.empty()) {
4744  x_FindCountryName(k_subregion_fixes, countries, valid_country, orig_valid_country, too_many_countries, bad_cap);
4745  }
4746 
4747  if (!valid_country.empty() && !too_many_countries)
4748  too_many_countries = ContainsMultipleCountryNames (input);
4749 
4750  if (!valid_country.empty() && too_many_countries && valid_country == input)
4751  {
4752  string str1,str2;
4753  NStr::SplitInTwo(valid_country,":",str1,str2);
4754  if (!str1.empty() && !str2.empty() && !NStr::StartsWith(str2," "))
4755  new_country = str1+": "+str2;
4756 
4758  }
4759  else if(!valid_country.empty() && !too_many_countries)
4760  {
4761  // find valid_country in input
4762  size_t pos = NStr::Find(input,orig_valid_country);
4763  // save preceeding string without trailing spaces or delimiters ":,"
4764  string before = input.substr(0,pos);
4765 
4768  // save trailing string without initial spaces or delimiters
4769  string after = input.substr(pos+orig_valid_country.length());
4770  x_RemoveDelimitersFromEnds(after, true);
4772  if (bad_cap) new_country = GetCorrectedCountryCapitalization(valid_country);
4773  else new_country = valid_country;
4774  if (!before.empty() || !after.empty()) {
4775  if (NStr::Find(valid_country, ":") == NPOS) {
4776  new_country += ": ";
4777  } else {
4778  new_country += ", ";
4779  }
4780  }
4781  if (!before.empty())
4782  new_country += before;
4783  if (!before.empty() && !after.empty() && !NStr::Equal(after, ")"))
4784  new_country += ", ";
4785  if (!after.empty())
4786  new_country += after;
4788  }
4789 
4790  return new_country;
4791 }
4792 
4793 
4795 {
4796  // requested in SQD-4516
4797  bool rval = false;
4798  int count = 0;
4799  for (size_t i = 0; i < country.length(); i++) {
4800  if (country[i] == ':') {
4801  count++;
4802  if (count > 1) {
4803  country[i] = ',';
4804  rval = true;
4805  }
4806  }
4807  }
4808  return rval;
4809 }
4810 
4811 
4812 string CCountries::CountryFixupItem(const string &input, bool capitalize_after_colon)
4813 {
4814  string country = NewFixCountry (input);
4815  string new_country = country;
4816  SIZE_TYPE country_end_pos = NStr::Find(country,":");
4817  if (country_end_pos != NPOS)
4818  {
4819  SIZE_TYPE pos = country_end_pos;
4820  while (country[pos] == ',' || country[pos] == ':' || isspace((unsigned char)country[pos]))
4821  {
4822  pos++;
4823  }
4824  string after = country.substr(pos);
4825  if (after.empty()) {
4826  if (pos > country_end_pos) {
4827  new_country = country.substr(0, country_end_pos);
4828  }
4829  } else {
4831  if (capitalize_after_colon)
4832  after = CapitalizeFirstLetterOfEveryWord (after);
4833  new_country = country.substr(0,country_end_pos);
4834  new_country += ": " + after;
4835  }
4836  }
4837  return new_country;
4838 }
4839 
4840 
4841 // SubSource Qual Fixups
4844 
4846  { "adult", "adult" },
4847  { "egg", "egg" },
4848  { "juvenile", "juvenile" },
4849  { "larva", "larva" }
4850 };
4851 
4853 
4854 
4856 {
4857  string fix = value;
4858 
4859  TStaticQualFixMap::const_iterator it = sc_DevStagePairs.find(value.c_str());
4860  if (it != sc_DevStagePairs.end()) {
4861  fix = it->second;
4862  }
4863  return fix;
4864 }
4865 
4866 
4868  { "hemocyte", "hemocyte" },
4869  { "hepatocyte", "hepatocyte" },
4870  { "lymphocyte", "lymphocyte" },
4871  { "neuroblast", "neuroblast" }
4872 };
4873 
4875 
4877 {
4878  string fix = value;
4879 
4880  TStaticQualFixMap::const_iterator it = sc_CellTypePairs.find(value.c_str());
4881  if (it != sc_CellTypePairs.end()) {
4882  fix = it->second;
4883  }
4884  return fix;
4885 
4886 }
4887 
4890 
4892 static bool s_QualFixupMapsInitialized = false;
4893 
4894 static void s_ProcessQualMapLine(const CTempString& line, TQualFixMap& qual_map)
4895 {
4896  vector<CTempString> tokens;
4897  NStr::Split(line, "\t", tokens);
4898  if (tokens.size() > 1) {
4899  qual_map[tokens[0]] = tokens[1];
4900  }
4901 }
4902 
4903 
4904 void s_AddOneDataFile(const string& file_name, const string& data_name,
4905  const char **built_in, size_t num_built_in,
4906  TQualFixMap& qual_map)
4907 {
4908  string file = g_FindDataFile(file_name);
4909  CRef<ILineReader> lr;
4910  if (!file.empty()) {
4911  try {
4912  lr = ILineReader::New(file);
4913  } NCBI_CATCH("s_InitializeQualMaps")
4914  }
4915 
4916  if (lr.Empty()) {
4917  if (built_in == NULL) {
4918  ERR_POST(Note << "No data for " + data_name);
4919  } else {
4920  if (getenv("NCBI_DEBUG")) {
4921  ERR_POST(Note << "Falling back on built-in data for " + data_name);
4922  }
4923  for (size_t i = 0; i < num_built_in; i++) {
4924  const char *p = built_in[i];
4925  s_ProcessQualMapLine(p, qual_map);
4926  }
4927  }
4928  } else {
4929  if (getenv("NCBI_DEBUG")) {
4930  ERR_POST(Note << "Reading from " + file + " for " + data_name);
4931  }
4932  do {
4933  s_ProcessQualMapLine(*++*lr, qual_map);
4934  } while (!lr->AtEOF());
4935  }
4936 }
4937 
4938 #include "isolation_sources.inc"
4939 
4940 static void s_InitializeQualMaps(void)
4941 {
4942  CFastMutexGuard GUARD(s_QualFixMutex);
4944  return;
4945  }
4946 
4947  // tissue types
4948  s_AddOneDataFile("isolation_sources.txt", "isolation sources", (const char **)k_isolation_sources, sizeof(k_isolation_sources) / sizeof(char *), s_IsolationSourceMap);
4950 }
4951 
4952 
4953 
4954 
4955 
4957 {
4958  string fix = value;
4959 
4961 
4963  if (it != s_IsolationSourceMap.end()) {
4964  return it->second;
4965  }
4966 
4967  size_t max = sizeof(sm_ValidSexQualifierTokens) / sizeof(const char*);
4968  for (size_t i = 0; i < max; i++) {
4971  break;
4972  }
4973  }
4974 
4975  fix = COrgMod::FixHostCapitalization(fix);
4976  fix = FixDevStageCapitalization(fix);
4977  fix = FixCellTypeCapitalization(fix);
4978 
4979  return fix;
4980 }
4981 
4982 
4984 {
4985  string fix = value;
4986 
4989  if (it != s_IsolationSourceMap.end()) {
4990  return it->second;
4991  }
4992 
4993 
4994  size_t max = sizeof(sm_ValidSexQualifierTokens) / sizeof(const char*);
4995  for (size_t i = 0; i < max; i++) {
4998  break;
4999  }
5000  }
5001 
5002  fix = COrgMod::FixHostCapitalization(fix);
5003  fix = FixDevStageCapitalization(fix);
5004  fix = FixCellTypeCapitalization(fix);
5005 
5006  return fix;
5007 }
5008 
5009 
5011 {
5013 }
5014 
5015 
5016 string CSubSource::FixCapitalization(TSubtype subtype, const string& value)
5017 {
5018  string new_val = value;
5019  switch (subtype) {
5021  new_val = FixSexQualifierValue(value);
5022  if (NStr::IsBlank(new_val)) {
5023  new_val = value;
5024  }
5025  break;
5028  break;
5030  new_val = FixLabHostCapitalization(value);
5031  break;
5034  break;
5036  new_val = FixDevStageCapitalization(value);
5037  break;
5039  new_val = FixCellTypeCapitalization(value);
5040  break;
5041  default:
5042  new_val = value;
5043  break;
5044  }
5045  return new_val;
5046 }
5047 
5048 
5050 {
5051  if (!IsSetSubtype() || !IsSetName()) {
5052  return;
5053  }
5054 
5055  TSubtype subtype = GetSubtype();
5056 
5057  if (subtype == CSubSource::eSubtype_sex) {
5058  string upr = GetName();
5059  string lwr = upr;
5060  NStr::ToLower(lwr);
5061  if (! NStr::Equal(upr, lwr)) {
5062  SetName(lwr);
5063  }
5064  }
5065 
5066  const string& name = GetName();
5067 
5068  string new_val = FixCapitalization(subtype, name);
5069 
5070  if (!NStr::IsBlank(new_val)) {
5071  SetName(new_val);
5072  }
5073 
5074 }
5075 
5076 
5077 string CSubSource::AutoFix(TSubtype subtype, const string& value)
5078 {
5079  string new_val;
5080  switch (subtype) {
5082  new_val = CCountries::NewFixCountry(value);
5083  break;
5085  new_val = FixDateFormat(value);
5086  break;
5088  new_val = FixLatLonFormat(value);
5089  break;
5091  new_val = FixSexQualifierValue(value);
5092  break;
5094  new_val = FixAltitude(value);
5095  break;
5096  default:
5097  break;
5098  }
5099  return new_val;
5100 }
5101 
5102 
5104 {
5105  if (!IsSetSubtype() || !IsSetName()) {
5106  return;
5107  }
5108 
5109  TSubtype subtype = GetSubtype();
5110  string new_val = AutoFix(subtype, GetName());
5111 
5112  if (!NStr::IsBlank(new_val)) {
5113  SetName(new_val);
5114  } else if (subtype == CSubSource::eSubtype_sex) {
5115  string upr = GetName();
5116  string lwr = upr;
5117  NStr::ToLower(lwr);
5118  if (! NStr::Equal(upr, lwr)) {
5119  SetName(lwr);
5120  }
5121  }
5122 }
5123 
5124 
5125 
// NOTE (for two arrays below): If string A is a prefix of string B, string B should be placed
// BEFORE string A, i.e. the longer string must come earlier in the array.

// NULL-terminated list of bracketed notes.  CSubSource::RemoveCultureNotes
// walks this array in order and deletes every case-insensitive occurrence of
// each entry from a value, together with any spaces and semicolons that
// immediately follow the occurrence.
static const char * s_RemovableCultureNotes[] = {
    "[BankIt_uncultured16S_wizard]; [universal primers]; [tgge]",
    "[BankIt_uncultured16S_wizard]; [universal primers]; [dgge]",
    "[BankIt_uncultured16S_wizard]; [universal primers]",
    "[BankIt_cultured16S_wizard]",
    "[BankIt_organellerRNA_wizard]",
    "[BankIt_ITS_wizard]; [rRNAITS_notfound]",
    "[BankIt_ITS_wizard]",
    "[uncultured (using universal primers)]",
    "[uncultured (using universal primers) bacterial source]",
    "[cultured bacterial source]",
    "[enrichment culture bacterial source]",
    "[mixed bacterial source (cultured and uncultured)]",
    "[uncultured]; [universal primers]",
    "[mixed bacterial source]",
    "[virus wizard]",
    "[cDNA derived from mRNA, purified viral particles]",
    "[cDNA derived from mRNA, whole cell/tissue lysate]",
    "[cDNA derived from genomic RNA, whole cell/tissue lysate]",
    "[cDNA derived from genomic RNA, purified viral particles]",
    "[universal primers]",
    "[uncultured; wizard]",
    "[uncultured; wizard; spans unknown]",
    "[cultured; wizard]",
    "[cultured; wizard; spans unknown]",
    "[intergenic wizard]",
    "[intergenic wizard; spans unknown]",
    "[Microsatellite wizard]",
    "[Microsatellite wizard; multiple repeats]",
    "[D-loop wizard]",
    "[D-loop wizard; spans unknown]",
    "[D-loop wizard; spans known]",
    NULL  // end-of-list sentinel; loops over this array stop here
};
5162 
// NULL-terminated list of notes about species-specific primers.
// CSubSource::RemoveCultureNotes consults this array only for species-level
// values and, on a match, replaces the whole value with
// "amplified with species-specific primers".
// NOTE(review): the exact match test is not visible in this excerpt - confirm
// whether it is an equality or a substring comparison before reordering.
static const char * s_ReplaceableCultureNotes[] = {
    "[BankIt_uncultured16S_wizard]; [species_specific primers]; [tgge]",
    "[BankIt_uncultured16S_wizard]; [species_specific primers]; [dgge]",
    "[BankIt_uncultured16S_wizard]; [species_specific primers]",
    "[uncultured (with species-specific primers)]",
    "[uncultured]; [amplified with species-specific primers]",
    "[uncultured (using species-specific primers) bacterial source]",
    "[amplified with species-specific primers]",
    NULL  // end-of-list sentinel; loops over this array stop here
};
5173 
5174 
5176 {
5177  for (size_t i = 0; s_RemovableCultureNotes[i] != NULL; i++) {
5179  if (pos != string::npos) {
5180  return true;
5181  }
5182  }
5183  for (size_t i = 0; s_ReplaceableCultureNotes[i] != NULL; i++) {
5185  return true;
5186  }
5187  }
5188  return false;
5189 }
5190 
5191 
5192 void CSubSource::RemoveCultureNotes (string& value, bool is_species_level)
5193 {
5194  if (NStr::IsBlank(value)) {
5195  return;
5196  }
5197 
5198  for (size_t i = 0; s_RemovableCultureNotes[i] != NULL; i++) {
5199  string to_remove = s_RemovableCultureNotes[i];
5200  size_t remove_len = to_remove.length();
5201  size_t pos = NStr::FindNoCase(value, to_remove);
5202  while (pos != NPOS) {
5203  size_t extra_len = strspn (value.c_str() + pos + remove_len, " ;");
5204  value = value.substr(0, pos) + value.substr(pos + remove_len + extra_len);
5205  pos = NStr::FindNoCase(value, to_remove);
5206  }
5207  }
5208  // remove leading/trailing semicolons
5209  while (NStr::StartsWith(value, " ") || NStr::StartsWith(value, ";")) {
5210  value = value.substr(1);
5211  }
5212  while (NStr::EndsWith(value, " ") || NStr::EndsWith(value, ";")) {
5213  value = value.substr(0, value.length() - 1);
5214  }
5215 
5216  if (is_species_level) {
5217  for (size_t i = 0; s_ReplaceableCultureNotes[i] != NULL; i++) {
5219  value = "amplified with species-specific primers";
5220  break;
5221  }
5222  }
5223  }
5224 }
5225 
5226 
5227 void CSubSource::RemoveCultureNotes (bool is_species_level)
5228 {
5229  if (IsSetName()) {
5230  RemoveCultureNotes(SetName(), is_species_level);
5231  if (NStr::IsBlank(GetName())) {
5232  ResetName();
5233  }
5234  }
5235 }
5236 
5237 
5238 // CCountryLine
5240 (const string & country_name, double y, double min_x, double max_x, double scale)
5241 : m_CountryName(country_name) ,
5242  m_Scale (scale)
5243 {
5244  m_Y = x_ConvertLat(y);
5245  m_MinX = x_ConvertLon(min_x);
5246  m_MaxX = x_ConvertLon(max_x);
5247 
5248 }
5249 
5250 
5252 {
5253 }
5254 
5255 
5256 #define EPSILON 0.001
5257 
5258 int CCountryLine::ConvertLat (double y, double scale)
5259 {
5260 
5261  int val = 0;
5262 
5263  if (y < -90.0) {
5264  y = -90.0;
5265  }
5266  if (y > 90.0) {
5267  y = 90.0;
5268  }
5269 
5270  if (y > 0) {
5271  val = (int) (y * scale + EPSILON);
5272  } else {
5273  val = (int) (-(-y * scale + EPSILON));
5274  }
5275 
5276  return val;
5277 }
5278 
5279 
5281 {
5282  return ConvertLat(y, m_Scale);
5283 }
5284 
5285 int CCountryLine::ConvertLon (double x, double scale)
5286 {
5287 
5288  int val = 0;
5289 
5290  if (x < -180.0) {
5291  x = -180.0;
5292  }
5293  if (x > 180.0) {
5294  x = 180.0;
5295  }
5296 
5297  if (x > 0) {
5298  val = (int) (x * scale + EPSILON);
5299  } else {
5300  val = (int) (-(-x * scale + EPSILON));
5301  }
5302 
5303  return val;
5304 }
5305 
5306 
5308 {
5309  return ConvertLon(x, m_Scale);
5310 }
5311 
5312 
5313 CCountryExtreme::CCountryExtreme (const string & country_name, int min_x, int min_y, int max_x, int max_y)
5314 : m_CountryName(country_name) , m_MinX (min_x), m_MinY (min_y), m_MaxX(max_x), m_MaxY (max_y)
5315 {
5316  m_Area = (1 + m_MaxY - m_MinY) * (1 + m_MaxX - m_MinX);
5317  size_t pos = NStr::Find(country_name, ":");
5318  if (pos == NPOS) {
5319  m_Level0 = country_name;
5320  m_Level1.clear();
5321  } else {
5322  m_Level0 = country_name.substr(0, pos);
5324  m_Level1 = country_name.substr(pos + 1);
5326  }
5327 
5328 }
5329 
5330 
5332 {
5333 
5334 }
5335 
5336 
5338 {
5339  if (min_x < m_MinX) {
5340  m_MinX = min_x;
5341  return true;
5342  } else {
5343  return false;
5344  }
5345 }
5346 
5347 
5349 {
5350  if (max_x > m_MaxX) {
5351  m_MaxX = max_x;
5352  return true;
5353  } else {
5354  return false;
5355  }
5356 }
5357 
5358 
5360 {
5361  if (min_y < m_MinY) {
5362  m_MinY = min_y;
5363  return true;
5364  } else {
5365  return false;
5366  }
5367 }
5368 
5369 
5371 {
5372  if (max_y > m_MaxY) {
5373  m_MaxY = max_y;
5374  return true;
5375  } else {
5376  return false;
5377  }
5378 }
5379 
5380 
5382 {
5383  if (line) {
5384  SetMinX(line->GetMinX());
5385  SetMaxX(line->GetMaxX());
5386  SetMinY(line->GetY());
5387  SetMaxY(line->GetY());
5388  m_Area += 1 + line->GetMaxX() - line->GetMinX();
5389  }
5390 }
5391 
5392 
5393 bool CCountryExtreme::DoesOverlap(const CCountryExtreme* other_block) const
5394 {
5395  if (!other_block) {
5396  return false;
5397  } else if (m_MaxX >= other_block->GetMinX()
5398  && m_MaxX <= other_block->GetMaxX()
5399  && m_MaxY >= other_block->GetMinY()
5400  && m_MinY <= other_block->GetMaxY()) {
5401  return true;
5402  } else if (other_block->GetMaxX() >= m_MinX
5403  && other_block->GetMaxX() <= m_MaxX
5404  && other_block->GetMaxY() >= m_MinY
5405  && other_block->GetMinY() <= m_MaxY) {
5406  return true;
5407  } else {
5408  return false;
5409  }
5410 }
5411 
5412 
5413 bool CCountryExtreme::PreferTo(const CCountryExtreme* other_block, const string country, const string province, const bool prefer_new) const
5414 {
5415  if (!other_block) {
5416  return true;
5417  }
5418 
5419  // if no preferred country, these are equal
5420  if (NStr::IsBlank(country)) {
5421  return prefer_new;
5422  }
5423 
5424  // if match to preferred country
5425  if (NStr::EqualNocase(country, m_Level0)) {
5426  // if best was not preferred country, take new match
5427  if (!NStr::EqualNocase(country, other_block->GetLevel0())) {
5428  return true;
5429  }
5430  // if match to preferred province
5431  if (!NStr::IsBlank(province) && NStr::EqualNocase(province, m_Level1)) {
5432  // if best was not preferred province, take new match
5433  if (!NStr::EqualNocase(province, other_block->GetLevel1())) {
5434  return true;
5435  }
5436  }
5437 
5438  // if both match province, or neither does, or no preferred province, take smallest
5439  return prefer_new;
5440  }
5441 
5442  // if best matches preferred country, keep
5443  if (NStr::EqualNocase(country, other_block->GetLevel0())) {
5444  return false;
5445  }
5446 
5447  // otherwise take smallest
5448  return prefer_new;
5449 }
5450 
5451 
5453  : m_Lat(lat),
5454  m_Lon(lon),
5455  m_LandDistance(-1),
5456  m_WaterDistance(-1),
5457  m_ClaimedDistance(-1)
5458 {}
5459 
5460 
5462 {
5464 
5465  // compare guesses or closest regions to indicated country and province
5466  if (!NStr::IsBlank(GetGuessCountry())) {
5467  // if top level countries match
5468  if (NStr::EqualNocase(country, GetGuessCountry())) {
5470  // if both are empty, still call it a match
5471  if (NStr::EqualNocase(province, GetGuessProvince())) {
5473  }
5474  }
5475  // if they don't match, are they closest?
5476  if (!(rval & CLatLonCountryId::fCountryMatch)) {
5477  if (NStr::EqualNocase(country, GetClosestCountry())) {
5479  if (NStr::EqualNocase(province, GetClosestProvince())) {
5481  }
5482  }
5483  } else if (!(rval &