NCBI C++ ToolKit
SubSource.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: SubSource.cpp 102247 2024-04-10 22:44:36Z kans $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: .......
27  *
28  * File Description:
29  * .......
30  *
31  * Remark:
32  * This code was originally generated by application DATATOOL
33  * using the following specifications:
34  * 'seqfeat.asn'.
35  */
36 
37 // standard includes
38 #include <ncbi_pch.hpp>
39 #include <serial/enumvalues.hpp>
40 
41 // generated includes
43 
44 #include <math.h>
46 #include <corelib/ncbitime.hpp>
47 
49 #include <mutex>
50 #include <util/compile_time.hpp>
51 
52 // generated classes
53 
55 
56 BEGIN_objects_SCOPE // namespace ncbi::objects::
57 
// Out-of-class definitions of CSubSource's two static CLatLonCountryMap
// holders (country map and water map). Both are default-constructed here,
// so each unique_ptr starts out null.
58 unique_ptr<CLatLonCountryMap> CSubSource::m_LatLonCountryMap;
59 unique_ptr<CLatLonCountryMap> CSubSource::m_LatLonWaterMap;
60 
61 
62 // destructor -- the body is empty; no explicit cleanup is performed here
64 {
65 }
66 
67 
69 {
 // Computes the "use geo_loc_name for country" switch.
 // Resolution order visible below:
 //   1. an early `return false` (its guarding condition is not visible here),
 //   2. NCBI_GEO_LOC_NAME_FOR_COUNTRY from `env`: "true" / "false",
 //      compared case-insensitively, decides immediately,
 //   3. registry entry [OrgSubSource] UseGeoLocNameForCountry (default "off"):
 //      "1", "on", "true" or "yes" (case-insensitive) enables the switch.
 // Anything else yields false.
71  return false;
72  }
73 
75  string fromEnv = env.Get("NCBI_GEO_LOC_NAME_FOR_COUNTRY");
76  NStr::ToLower(fromEnv);
77  if (fromEnv == "true") {
78  return true;
79  } else if (fromEnv == "false") {
80  return false;
81  }
82 
 // Environment value absent or unrecognized: fall back to the registry.
84  string fromConfig = reg.GetString("OrgSubSource", "UseGeoLocNameForCountry", "off");
85  NStr::ToLower(fromConfig);
86  if (fromConfig == "1" || fromConfig == "on" || fromConfig == "true" || fromConfig == "yes") {
87  return true;
88  }
89 
90  return false;
91 }
92 
93 
95 {
 // The switch is evaluated once, on the first call, and the cached result
 // is returned on every later call (function-local static).
96  static bool value = s_init_UseGeoLocNameForCountry();
97  return value;
98 }
99 
100 
// Appends a textual label for this subsource to *str in the form
//   "/<type>=<name>"  or  "/<type>=<name> (<attrib>)"  when an attrib is set.
// <type> is "other" for eSubtype_other; otherwise it is the subtype name
// with '_' replaced by '-', or "unknown" if a CSerialException is thrown
// while obtaining that name.
// @param str  output string; the label is appended, existing content is kept.
101 void CSubSource::GetLabel(string* str) const
102 {
103  *str += '/';
104  string type_name;
105  if (GetSubtype() == eSubtype_other) {
106  type_name = "other";
107  } else {
108  try {
109  // eVocabulary_insdc has some special cases not (historically)
110  // used here.
112  replace(type_name.begin(), type_name.end(), '_', '-');
113  } catch (const CSerialException&) {
114  type_name = "unknown";
115  }
116  }
117  *str += type_name;
118  *str += '=';
119  *str += GetName();
120  if (IsSetAttrib()) {
121  *str += " (";
122  *str += GetAttrib();
123  *str += ")";
124  }
125 }
126 
127 
129  EVocabulary vocabulary)
130 {
 // Maps a textual subtype name to its ESubtype value.
 // The input is trimmed, lower-cased, and '_' / ' ' are turned into '-'
 // before any comparison.
131  string name = NStr::TruncateSpaces(str);
132  NStr::ToLower(name);
133  replace(name.begin(), name.end(), '_', '-');
134  replace(name.begin(), name.end(), ' ', '-');
135 
 // The various spellings of "note" all mean eSubtype_other.
136  if ( NStr::EqualNocase(name, "note") ||
137  NStr::EqualNocase(name, "subsource-note") ||
138  NStr::EqualNocase(name, "subsrc-note") ||
139  NStr::EqualNocase(name, "note-subsource")) {
140  return eSubtype_other;
141  } else if (vocabulary == eVocabulary_insdc) {
142  // consider a table if more special cases arise.
 // INSDC spellings that differ from the enum's own names.
143  if (name == "insertion-seq") {
145  } else if (name == "plasmid") {
146  return eSubtype_plasmid_name;
147  } else if (name == "transposon") {
149  } else if (name == "sub-clone") {
150  return eSubtype_subclone;
151  }
152  }
 // Everything else is looked up in the ESubtype enum's name table.
153  return ENUM_METHOD_NAME(ESubtype)()->FindValue(name);
154 }
155 
156 
158  EVocabulary vocabulary)
159 {
 // Tells whether a textual subtype name is recognized.
 // The name is normalized the same way before testing: trimmed,
 // lower-cased, '_' and ' ' replaced by '-'.
160 
161  string name = NStr::TruncateSpaces(str);
162  NStr::ToLower(name);
163  replace(name.begin(), name.end(), '_', '-');
164  replace(name.begin(), name.end(), ' ', '-');
165 
 // All "note" spellings are accepted regardless of vocabulary.
166  if ( NStr::EqualNocase(name, "note") ||
167  NStr::EqualNocase(name, "subsource-note") ||
168  NStr::EqualNocase(name, "subsrc-note") ||
169  NStr::EqualNocase(name, "note-subsource")) {
170  return true;
171  }
172  if (vocabulary == eVocabulary_insdc) {
173  // consider a table if more special cases arise.
 // INSDC-only spellings that are not names in the enum table.
174  if (name == "insertion-seq" ||
175  name == "plasmid" ||
176  name == "transposon" ||
177  name == "sub-clone") {
178  return true;
179  }
180  }
 // Otherwise valid only if the ESubtype enum knows the name.
181  return ENUM_METHOD_NAME(ESubtype)()->IsValidName(name);
182 }
183 
184 
186  EVocabulary vocabulary)
187 {
 // Returns the textual name for a subtype value.
 //  - eSubtype_other is always "note".
 //  - With eVocabulary_insdc, four subtypes have fixed INSDC spellings;
 //    any other subtype uses the enum name with '-' replaced by '_'.
 //  - With any other vocabulary the enum name is returned unchanged.
188  if (stype == CSubSource::eSubtype_other) {
189  return "note";
190  } else if (vocabulary == eVocabulary_insdc) {
191  switch (stype) {
192  case eSubtype_subclone: return "sub_clone";
193  case eSubtype_plasmid_name: return "plasmid";
194  case eSubtype_transposon_name: return "transposon";
195  case eSubtype_insertion_seq_name: return "insertion_seq";
196  default:
197  return NStr::Replace
198  (ENUM_METHOD_NAME(ESubtype)()->FindName(stype, true),
199  "-", "_");
200  }
201  } else {
202  return ENUM_METHOD_NAME(ESubtype)()->FindName(stype, true);
203  }
204 }
205 
206 
207 
209 {
 // Returns false for every subtype listed below and true for any other
 // subtype value.
210  return subtype != eSubtype_chromosome
211  && subtype != eSubtype_sex
212  && subtype != eSubtype_germline
213  && subtype != eSubtype_rearranged
214  && subtype != eSubtype_plasmid_name
215  && subtype != eSubtype_segment
216  && subtype != eSubtype_country
217  && subtype != eSubtype_transgenic
218  && subtype != eSubtype_environmental_sample
219  && subtype != eSubtype_lat_lon
220  && subtype != eSubtype_collection_date
221  && subtype != eSubtype_collected_by
222  && subtype != eSubtype_identified_by
223  && subtype != eSubtype_fwd_primer_seq
224  && subtype != eSubtype_rev_primer_seq
225  && subtype != eSubtype_fwd_primer_name
226  && subtype != eSubtype_rev_primer_name
227  && subtype != eSubtype_metagenomic
228  && subtype != eSubtype_altitude
229  && subtype != eSubtype_clone;
230 }
231 
232 
233 bool CSubSource::NeedsNoText(const TSubtype& subtype)
234 {
235  if (subtype == eSubtype_germline
236  || subtype == eSubtype_rearranged
237  || subtype == eSubtype_transgenic
238  || subtype == eSubtype_environmental_sample
239  || subtype == eSubtype_metagenomic) {
240  return true;
241  } else {
242  return false;
243  }
244 }
245 
246 
248 {
 // Returns true exactly for the subtypes enumerated in the condition
 // below, false for all others.
249  if (subtype == eSubtype_frequency
250  || subtype == eSubtype_insertion_seq_name
251  || subtype == eSubtype_phenotype
252  || subtype == eSubtype_plastid_name
253  || subtype == eSubtype_transposon_name
254  || subtype == eSubtype_fwd_primer_seq
255  || subtype == eSubtype_rev_primer_seq
256  || subtype == eSubtype_fwd_primer_name
257  || subtype == eSubtype_rev_primer_name
258  || subtype == eSubtype_whole_replicon) { // metagenomic subsrc qualifier taken off this list: GB-3384
259  return true;
260  } else {
261  return false;
262  }
263 }
264 
265 
266 bool CSubSource::IsDayValueOkForMonth(int day, int month, int year)
267 {
268  if (month < 1 || month > 12 || day < 1) {
269  return false;
270  }
271  bool rval = true;
272  if (year < 100) {
273  year += 2000;
274  } else if (year > 3000) {
275  return false;
276  } else if (year < 1538) {
277  return false;
278  }
279  CTime month_o(year, month, 1);
280  if (day > month_o.DaysInMonth()) {
281  rval = false;
282  }
283  return rval;
284 }
285 
286 
288 {
 // Parses a single collection-date string into a CDate.
 // ISO-format input is delegated to GetDateFromISODate(); otherwise the
 // string is split on '-' into one of:
 //     YYYY            (no dash)
 //     Mmm-YYYY        (one dash)
 //     DD-Mmm-YYYY     (two dashes)
 // Each failure path below carries an error message; the statements that
 // raise it are not visible in this listing (presumably NCBI_THROW, as in
 // the commented-out code further down -- TODO confirm).
289  if (NStr::IsBlank(test)) {
291  "collection-date string is blank");
292  }
293  string str = NStr::TruncateSpaces(test);
294 
295  if (IsISOFormatDate(str)) {
296  return GetDateFromISODate(str);
297  }
298 
299  size_t pos = NStr::Find(str, "-");
300  string year;
301  string month;
302  string day;
303 
 // Split into day / month / year pieces according to the number of dashes.
304  if (pos == NPOS) {
305  year = str;
306  } else {
307  size_t pos2 = NStr::Find(str, "-", pos + 1);
308  if (pos2 == NPOS) {
309  month = str.substr(0, pos);
310  year = str.substr(pos + 1);
311  if (NStr::IsBlank(month)) {
313  "collection-date string is improperly formatted");
314  }
315  } else {
316  day = str.substr(0, pos);
317  month = str.substr(pos + 1, pos2 - pos - 1);
318  year = str.substr(pos2 + 1);
319  if (NStr::IsBlank(month) || NStr::IsBlank(day)) {
321  "collection-date string is improperly formatted");
322  }
323  }
324  }
325 
 // Month is given by name and converted with CTime::MonthNameToNum.
326  int month_val = 0;
327  if (!NStr::IsBlank(month)) {
328  try {
329  month_val = CTime::MonthNameToNum(month);
330  } catch (const CTimeException&) {
332  "collection-date string has invalid month");
333  }
334  }
335 
336  int day_val = 0;
337  if (!NStr::IsBlank(day)) {
338  try {
339  day_val = NStr::StringToInt (day);
340  if (day_val < 1) {
342  "collection-date string has invalid day value");
343  }
344  } catch ( const exception& ) {
345  // threw exception while converting to int
347  "collection-date string is improperly formatted");
348  }
349  }
350 
351  if (NStr::IsBlank(year)) {
353  "collection-date string is improperly formatted");
354  }
355 
356  int year_val = 0;
357  try {
358  year_val = NStr::StringToInt (year);
359  } catch ( const exception& ) {
360  // threw exception while converting to int
362  "collection-date string is improperly formatted");
363  }
364 
365  /*
366  if (year_val < 1000 || year_val >= 2100) {
367  NCBI_THROW (CException, eUnknown,
368  "collection-date year is out of range");
369  }
370  */
371 
 // Accepted year range is 1000..2099; the two bounds are checked separately.
372  if (year_val < 1000) {
374  "collection-date year is out of range");
375  }
376 
377  if (year_val >= 2100) {
379  "collection-date year is out of range");
380  }
381 
 // The calendar check only applies when both day and month were supplied.
382  if (day_val > 0 && month_val > 0 && !IsDayValueOkForMonth(day_val, month_val, year_val)) {
384  "collection-date day is greater than monthly maximum");
385  }
386 
387  CRef<CDate> date(new CDate);
388 
 // Year is always set; month and day only when present in the input.
389  date->SetStd().SetYear (year_val);
390  if (month_val > 0) {
391  date->SetStd().SetMonth (month_val);
392  }
393  if (day_val > 0) {
394  date->SetStd().SetDay (day_val);
395  }
396 
 // NOTE(review): `t` and `now` are only referenced by the commented-out
 // future-date check below; they have no effect on the returned date.
397  time_t t;
398 
399  time(&t);
400 
401  CDate now(t);
402 
403  /*
404  if (IsCollectionDateAfterTime(*date, t)) {
405  NCBI_THROW (CException, eUnknown,
406  "collection-date year is out of range");
407  }
408  */
409 
410  return date;
411 }
412 
413 
414 bool CSubSource::IsCollectionDateAfterTime(const string& collection_date, time_t t, bool& bad_format)
415 {
416  bad_format = false;
417  bool in_future = false;
418  vector<string> pieces;
419  NStr::Split(collection_date, "/", pieces);
420  if (pieces.size() > 2) {
421  bad_format = true;
422  } else {
423  ITERATE(vector<string>, it, pieces) {
424  CRef<CDate> coll_date = DateFromCollectionDate (*it);
425  if (!coll_date) {
426  bad_format = true;
427  } else if (IsCollectionDateAfterTime(*coll_date, t)) {
428  in_future = true;
429  }
430  }
431  }
432  return in_future;
433 }
434 
435 
436 bool CSubSource::IsCollectionDateAfterTime(const CDate& collection_date, time_t t)
437 {
438  CDate now(t);
439  if (collection_date.Compare(now) == CDate::eCompare_after) {
440  return true;
441  } else {
442  return false;
443  }
444 }
445 
446 
447 bool CSubSource::IsCollectionDateAfterTime(const CDate& collection_date, CTime& ctime)
448 {
449  time_t t = ctime.GetTimeT();
450  return IsCollectionDateAfterTime(collection_date, t);
451 }
452 
453 
454 void CSubSource::IsCorrectDateFormat(const string& date_string, bool& bad_format, bool& in_future)
455 {
456  bad_format = false;
457  in_future = false;
458 
459  vector<string> pieces;
460  NStr::Split(date_string, "/", pieces);
461  if (pieces.size() > 2) {
462  bad_format = true;
463  return;
464  } else if (pieces.size() == 2) {
465  bool first_bad = false;
466  bool first_future = false;
467  bool second_bad = false;
468  bool second_future = false;
469  IsCorrectDateFormat(pieces[0], first_bad, first_future);
470  IsCorrectDateFormat(pieces[1], second_bad, second_future);
471  bad_format = first_bad || second_bad;
472  if (!bad_format) {
473  in_future = first_future || second_future;
474  }
475  return;
476  }
477 
478  try {
479  CRef<CDate> coll_date = CSubSource::DateFromCollectionDate (date_string);
480 
481  if (!IsISOFormatDate(date_string)) {
482  // if there are two dashes, then the first token needs to be the day, and the
483  // day has to have two numbers, a leading zero if the day is less than 10
484  size_t pos = NStr::Find(date_string, "-");
485  if (pos != NPOS) {
486  size_t pos2 = NStr::Find(date_string, "-", pos + 1);
487  if (pos2 != NPOS && pos != 2) {
488  bad_format = true;
489  }
490  }
491  }
492 
493  if (!bad_format) {
494  time_t t;
495 
496  time(&t);
497 
498  in_future = IsCollectionDateAfterTime(*coll_date, t);
499  }
500  } catch (const CException& ) {
501  bad_format = true;
502  }
503 }
504 
// Validates a collection-date string (single date or "first/second" range)
// and reports the outcome as a bit mask of eDateFormatFlag_* values,
// starting from eDateFormatFlag_ok.
// NOTE(review): the statements that set individual flags are not visible in
// this listing; judging by GetCollectionDateProblem the flags involved are
// bad_format, in_future and out_of_order -- TODO confirm against the file.
505 size_t CSubSource::CheckDateFormat(const string& date_string)
506 {
507  size_t rval = eDateFormatFlag_ok;
508  vector<string> pieces;
509  NStr::Split(date_string, "/", pieces);
510  if (pieces.size() > 2) {
512  } else if (pieces.size() == 2) {
 // Range: check each end recursively and merge their flags; only when
 // both ends are clean are the two dates compared with each other.
513  rval |= CheckDateFormat(pieces[0]);
514  rval |= CheckDateFormat(pieces[1]);
515  if (rval == eDateFormatFlag_ok) {
516  try {
519  if (d2->Compare(*d1) == CDate::eCompare_before) {
521  }
522  } catch (const CException&) {
524  }
525  }
526  return rval;
527  }
528 
 // Single date.
529  try {
530  CRef<CDate> coll_date = CSubSource::DateFromCollectionDate(date_string);
531 
532  if (!IsISOFormatDate(date_string)) {
533  // if there are two dashes, then the first token needs to be the day, and the
534  // day has to have two numbers, a leading zero if the day is less than 10
535  size_t pos = NStr::Find(date_string, "-");
536  if (pos != NPOS) {
537  size_t pos2 = NStr::Find(date_string, "-", pos + 1);
538  if (pos2 != NPOS && pos != 2) {
540  }
541  }
542  }
543 
 // The future-date test runs only if no problem has been flagged so far.
544  if (rval == eDateFormatFlag_ok) {
545  time_t t;
546 
547  time(&t);
548  if (IsCollectionDateAfterTime(*coll_date, t)) {
550  }
551  }
552  } catch (const CException&) {
554  }
555  return rval;
556 }
557 
559 
560 // null term exemption values, order is not important
// Placeholder strings that stand in for a missing collection date.
// GetCollectionDateProblem() looks its input up in this set first and
// reports no problem for any string found here.
561 MAKE_CONST_SET(s_Null_CollectionDatesSet, ct::tagStrCase,
562 {
563  "missing",
564  "missing: control sample",
565  "missing: data agreement established pre-2023",
566  "missing: endangered species",
567  "missing: human-identifiable",
568  "missing: lab stock",
569  "missing: sample group",
570  "missing: synthetic construct",
571  "missing: third party data",
572  "not applicable",
573  "not collected",
574  "not provided",
575  "restricted access",
576 })
577 
578 string CSubSource::GetCollectionDateProblem (const string& date_string)
579 {
580  string problem;
581  if (s_Null_CollectionDatesSet.find(date_string.c_str()) != s_Null_CollectionDatesSet.end()) {
582  return problem;
583  }
584  size_t rval = CheckDateFormat(date_string);
585  if (rval & eDateFormatFlag_bad_format) {
586  problem = "Collection_date format is not in DD-Mmm-YYYY format";
587  } else if (rval & eDateFormatFlag_in_future) {
588  problem = "Collection_date is in the future";
589  } else if (rval & eDateFormatFlag_out_of_order) {
590  problem = "Collection_dates are out of order";
591  }
592  return problem;
593 }
594 
595 
596 string CSubSource::x_ParseDateRangeWithDelimiter(const string& orig_date, CTempString delim)
597 {
598  size_t pos = NStr::Find(orig_date, delim, NStr::eNocase);
599  if (pos == NPOS) {
600  return kEmptyStr;
601  }
602  size_t second_pos = NStr::Find(orig_date.substr(pos + 1), delim, NStr::eNocase);
603  if (second_pos != NPOS) {
604  return kEmptyStr;
605  }
606  bool month_ambig = false;
607  string first_date = FixDateFormat(orig_date.substr(0, pos), true, month_ambig);
608  if (month_ambig || NStr::IsBlank(first_date)) {
609  return kEmptyStr;
610  }
611  string second_date = FixDateFormat(orig_date.substr(pos + delim.length()), true, month_ambig);
612  if (month_ambig || NStr::IsBlank(second_date)) {
613  return kEmptyStr;
614  }
615  string fix = first_date + "/" + second_date;
616  return fix;
617 }
618 
619 
620 string CSubSource::FixDateFormat (const string& orig_date)
621 {
622  bool month_ambiguous = false;
623 
624  string fix = FixDateFormat(orig_date, true, month_ambiguous);
625  if (month_ambiguous) {
626  fix.clear();
627  } else if (NStr::IsBlank(fix)) {
628  static const char* delimiters[] = {"/", " to ", " and ", "-", "_"};
629  for (size_t i = 0; i < ArraySize(delimiters); i++) {
630  fix = x_ParseDateRangeWithDelimiter(orig_date, delimiters[i]);
631  if (!NStr::IsBlank(fix)) {
632  break;
633  }
634  }
635  }
636  return fix;
637 }
638 
639 // ISO Format for time is one of these:
640 // HH:MM:SS
641 // HH:MM
642 // HH
643 // Followed by either Z or +hh:mm to indicate an offset from Zulu
644 bool CSubSource::IsISOFormatTime(const string& orig_time, int& hour, int& min, int& sec, bool require_time_zone)
645 {
646  int offset_hour = 0;
647  int offset_min = 0;
648  size_t suffix = NStr::Find(orig_time, "Z");
649  if (suffix == NPOS) {
650  suffix = NStr::Find(orig_time, "+");
651  if (suffix == NPOS) {
652  if (require_time_zone) {
653  return false;
654  } else {
655  suffix = orig_time.length();
656  }
657  } else {
658  if (orig_time.substr(suffix).length() != 6 ||
659  !isdigit((unsigned char)orig_time[suffix + 1]) ||
660  !isdigit((unsigned char)orig_time[suffix + 2]) ||
661  orig_time[suffix + 3] != ':' ||
662  !isdigit((unsigned char)orig_time[suffix + 4]) ||
663  !isdigit((unsigned char)orig_time[suffix + 5])) {
664  return false;
665  }
666  try {
667  offset_hour = NStr::StringToInt(orig_time.substr(suffix + 1, 2));
668  offset_min = NStr::StringToInt(orig_time.substr(suffix + 4, 2));
669  } catch (...) {
670  return false;
671  }
672  }
673  }
674  if (suffix != 2 && suffix != 5 && suffix != 8) {
675  return false;
676  }
677 
678  if (!isdigit((unsigned char)orig_time[0]) || !isdigit((unsigned char)orig_time[1])) {
679  return false;
680  }
681  hour = 0;
682  min = 0;
683  sec = 0;
684  try {
685  hour = NStr::StringToInt(orig_time.substr(0, 2));
686  if (hour < 0 || hour > 23) {
687  return false;
688  }
689  hour -= offset_hour;
690  } catch (...) {
691  return false;
692  }
693  if (suffix > 2) {
694  if (!isdigit((unsigned char)orig_time[3]) || !isdigit((unsigned char)orig_time[4])) {
695  return false;
696  }
697  try {
698  min = NStr::StringToInt(orig_time.substr(3, 2));
699  if (min < 0 || min > 59) {
700  return false;
701  }
702  } catch (...) {
703  return false;
704  }
705  min -= offset_min;
706  }
707  if (suffix == 8) {
708  if (!isdigit((unsigned char)orig_time[6]) || !isdigit((unsigned char)orig_time[7])) {
709  return false;
710  }
711  try {
712  sec = NStr::StringToInt(orig_time.substr(6, 2));
713  if (sec < 0) {
714  // negative number bad
715  return false;
716  } else if (sec > 59) {
717  // too big
718  return false;
719  }
720  } catch (...) {
721  return false;
722  }
723  }
724 
725  return true;
726 }
727 
728 // ISO Format for date is exactly 10 characters long OR exactly 7 characters long.
729 // For ten characters:
730 // First four characters must be digits, represent year.
731 // Fifth character must be dash.
732 // Sixth and seventh characters must be digits, represent month, use zero padding.
733 // Eighth character must be dash.
734 // Ninth and tenth characters must be digits, represent day, use zero padding.
735 // For 7 characters:
736 // First four characters must be digits, represent year.
737 // Fifth character must be dash.
738 // Sixth and seventh characters must be digits, represent month, use zero padding.
739 bool CSubSource::IsISOFormatDateOnly (const string& cpy)
740 {
741  if (cpy.length() != 10 && cpy.length() != 7) {
742  return false;
743  }
744  bool rval = true;
745  size_t pos = 0;
746  string::const_iterator it = cpy.begin();
747  while (it != cpy.end() && rval) {
748  if (pos == 4 || pos == 7) {
749  if (*it != '-') {
750  rval = false;
751  }
752  } else if (!isdigit(*it)) {
753  rval = false;
754  }
755  ++it;
756  ++pos;
757  }
758  if (rval) {
759  try {
760  int year = NStr::StringToInt(cpy.substr(0, 4));
761  int month = NStr::StringToInt(cpy.substr(5, 2));
762  if (month < 1 || month > 12) {
763  rval = false;
764  }
765  if (cpy.length() == 10) { // has day
766  int day = NStr::StringToInt(cpy.substr(8, 2));
767  if (!IsDayValueOkForMonth(day, month, year)) {
768  rval = false;
769  }
770  }
771  } catch (...) {
772  rval = false;
773  }
774  }
775  return rval;
776 }
777 
778 
// Returns true when `orig_date` is "<ISO date>T<time>" where the date part
// is valid and the time part is a valid ISO time only if the time-zone
// requirement is dropped. Returns false when there is no 'T', when the date
// part is invalid, or when the time already passes the strict check (then
// nothing needs fixing).
779 bool CSubSource::x_IsFixableIsoDate(const string& orig_date)
780 {
781  string cpy = orig_date;
783  size_t time_pos = NStr::Find(cpy, "T");
784  bool rval = false;
785  if (time_pos == NPOS) {
786  rval = false;
787  } else {
788  if (!IsISOFormatDateOnly(cpy.substr(0, time_pos))) {
789  rval = false;
790  } else {
791  int h, m, s;
 // strict check first (time zone required) ...
792  if (IsISOFormatTime(cpy.substr(time_pos + 1), h, m, s, true)) {
793  // already fine, not fixable
794  rval = false;
795  } else {
 // ... then the relaxed check without a mandatory time zone
796  rval = IsISOFormatTime(cpy.substr(time_pos + 1), h, m, s, false);
797  }
798  }
799  }
800  return rval;
801 }
802 
803 
// Returns a copy of `orig_date` cut off just before the first 'T', i.e. with
// the time portion removed. If there is no 'T', the copy is returned as is.
804 string CSubSource::x_RemoveIsoTime(const string& orig_date)
805 {
806  string cpy = orig_date;
808  size_t time_pos = NStr::Find(cpy, "T");
809  if (time_pos != NPOS) {
810  cpy = cpy.substr(0, time_pos);
811  }
812  return cpy;
813 }
814 
815 
// Tells whether `orig_date` is an ISO date, optionally followed by 'T' and
// an ISO time. Without a 'T' only the date is checked; with a 'T' both the
// part before it (date) and the part after it (time) must be valid.
816 bool CSubSource::IsISOFormatDate(const string& orig_date)
817 {
818  string cpy = orig_date;
820  size_t time_pos = NStr::Find(cpy, "T");
821  if (time_pos == NPOS) {
822  return IsISOFormatDateOnly(cpy);
823  } else {
 // h/m/s receive the parsed time fields but are not used afterwards
824  int h, m, s;
825  return (IsISOFormatDateOnly(cpy.substr(0, time_pos)) &&
826  IsISOFormatTime(cpy.substr(time_pos + 1), h, m, s));
827  }
828 
829 }
830 
832 {
 // Builds a CDate from text laid out as YYYY-MM[-DD...]: year from
 // characters 0-3, month from 5-6 and, when the text is longer than 7
 // characters, day from 8-9. Any exception (e.g. non-numeric fields or a
 // too-short string) yields an empty CRef instead of propagating.
833  try {
834  string cpy = orig_date;
836  CRef<CDate> date(new CDate());
837  int year_val = NStr::StringToInt(cpy.substr(0, 4));
838  int month_val = NStr::StringToInt(cpy.substr(5, 2));
839  date->SetStd().SetYear (year_val);
840  date->SetStd().SetMonth (month_val);
841  if (cpy.length() > 7) {
842  int day_val = NStr::StringToInt(cpy.substr(8, 2));
843  date->SetStd().SetDay (day_val);
844  }
845  return date;
846  } catch (...) {
847  return CRef<CDate>();
848  }
849 }
850 
851 
// Splits a free-form date string into tokens.
// Token boundaries are (a) any of the delimiter characters " ,-/=_." and
// (b) every switch between a run containing letters and a run without
// letters, so "12Jan2020" yields "12", "Jan", "2020".
// Afterwards, when more than three tokens were produced, ordinal suffixes
// ("st", "nd", "rd", "th") are glued back onto a preceding token that
// starts with a digit.
852 vector<string> CSubSource::x_GetDateTokens(const string& orig_date)
853 {
854  vector<string> tokens;
855  string token_delimiters = " ,-/=_.";
856 
857  string cpy = orig_date;
859 
860  string curr_token;
 // true while curr_token contains at least one letter
861  bool is_chars = false;
862  ITERATE(string, s, cpy) {
863  if (token_delimiters.find(*s) != NPOS) {
 // delimiter: close the current token (if any) and discard the delimiter
864  if (!NStr::IsBlank(curr_token)) {
865  tokens.push_back(curr_token);
866  }
867  curr_token.clear();
868  is_chars = false;
869  } else if (is_chars && !isalpha((unsigned char)(*s))) {
870  // previous token was all letters, do not add non-letter characters
871  if (!NStr::IsBlank(curr_token)) {
872  tokens.push_back(curr_token);
873  }
874  curr_token = *s;
875  is_chars = false;
 // NOTE(review): isalpha(*s) here and below is called without the
 // unsigned char cast used in the branch above.
876  } else if (!NStr::IsBlank(curr_token) && !is_chars && isalpha(*s)) {
877  // previous token had no letters
878  tokens.push_back(curr_token);
879  curr_token = *s;
880  is_chars = true;
881  } else {
882  curr_token += *s;
883  if (isalpha(*s)) {
884  is_chars = true;
885  }
886  }
887  }
888  if (!NStr::IsBlank(curr_token)) {
889  tokens.push_back(curr_token);
890  }
891 
892  // reattach 'st', 'nd', 'rd', and 'th' to numbers if present
893  if (tokens.size() > 3) {
 // p trails s by one element: p is the candidate number, s the candidate suffix
894  vector<string>::iterator p = tokens.begin();
895  bool prev_is_number = isdigit((unsigned char)(*p)[0]);
896  vector<string>::iterator s = p;
897  ++s;
898  while (s != tokens.end()) {
899  if (prev_is_number &&
900  (NStr::EqualNocase(*s, "st") ||
901  NStr::EqualNocase(*s, "nd") ||
902  NStr::EqualNocase(*s, "rd") ||
903  NStr::EqualNocase(*s, "th"))) {
904  *p += *s;
905  s = tokens.erase(s);
906  prev_is_number = false;
907  } else {
908  ++p;
909  ++s;
910  prev_is_number = isdigit((unsigned char)(*p)[0]);
911  }
912  }
913  }
914 
915  return tokens;
916 }
917 
918 
919 bool s_ChooseMonthAndDay(const string& token1, const string& token2, bool month_first, string& month, int& day, bool& month_ambiguous)
920 {
921  try {
922  int val1 = NStr::StringToInt (token1);
923  int val2 = NStr::StringToInt (token2);
924  if (val1 > 12 && val2 > 12) {
925  // both numbers too big for month
926  return false;
927  } else if (val1 < 13 && val2 < 13) {
928  if (val1 == val2) {
929  // no need to call this ambiguous
930  month = CTime::MonthNumToName(val1, CTime::eAbbr);
931  day = val2;
932  } else {
933  // both numbers could be month
934  month_ambiguous = true;
935  if (month_first) {
936  month = CTime::MonthNumToName(val1, CTime::eAbbr);
937  day = val2;
938  } else {
939  month = CTime::MonthNumToName(val2, CTime::eAbbr);
940  day = val1;
941  }
942  }
943  } else if (val1 < 13) {
944  month = CTime::MonthNumToName(val1, CTime::eAbbr);
945  day = val2;
946  } else {
947  month = CTime::MonthNumToName(val2, CTime::eAbbr);
948  day = val1;
949  }
950  return true;
951  } catch ( ... ) {
952  return false;
953  }
954 }
955 
956 
// Tries to rewrite a free-form single date into "YYYY", "Mmm-YYYY" or
// "DD-Mmm-YYYY".
// @param test             text to fix (surrounding whitespace is ignored)
// @param month_first      preference used when two numbers could each be
//                         the month
// @param month_ambiguous  set to true when month/day (or the year) had to
//                         be guessed. NOTE(review): not assigned on the two
//                         early ISO returns at the top.
// @return the reformatted date; the input itself if it is already an ISO
//         date; the input without its time part if it is a fixable ISO
//         date; an empty string when no date could be derived.
957 string CSubSource::FixDateFormat (const string& test, bool month_first, bool& month_ambiguous)
958 {
959  string orig_date = test;
960  NStr::TruncateSpacesInPlace(orig_date);
961 
962  if (IsISOFormatDate(orig_date)) {
963  return orig_date;
964  } else if (x_IsFixableIsoDate(orig_date)) {
965  return x_RemoveIsoTime(orig_date);
966  }
967 
968  string reformatted_date;
969  string month;
970  int year = 0, day = 0;
971  //string token_delimiters = " ,-/=_.";
972  size_t num_original_tokens = 0;
973 
974  month_ambiguous = false;
975  vector<string> tokens = x_GetDateTokens(orig_date);
976 
977  num_original_tokens = tokens.size();
978  if (tokens.size() < 1 || tokens.size() > 3) {
979  // no tokens or too many tokens
980  return kEmptyStr;
981  }
982 
 // Pass 1: consume every token whose role is unambiguous (ordinal day,
 // month name, number above 31 = year). Consumed tokens are erased; the
 // remaining ones are numbers in 1..31 or unrecognized words.
983  string one_token;
984  vector<string>::iterator it = tokens.begin();
985  while (it != tokens.end()) {
986  one_token = *it;
987  bool found = false;
988  if (NStr::EqualNocase(one_token, "1st") || NStr::EqualNocase(one_token, "first")) {
989  day = 1;
990  found = true;
991  } else if (NStr::EqualNocase(one_token, "2nd") || NStr::EqualNocase(one_token, "second")) {
992  day = 2;
993  found = true;
994  } else if (NStr::EqualNocase(one_token, "3rd") || NStr::EqualNocase (one_token, "third")) {
995  day = 3;
996  found = true;
997  } else if (one_token.length() > 0
998  && isdigit((unsigned char)one_token[0])
999  && NStr::EndsWith(one_token, "th")) {
1000  try {
1001  day = NStr::StringToInt (one_token.substr(0, one_token.length() - 2));
1002  found = true;
1003  } catch ( ... ) {
1004  // threw exception while converting to int
1005  return kEmptyStr;
1006  }
1007  } else if (isalpha((unsigned char)one_token[0])) {
1008  if (!NStr::IsBlank(month)) {
1009  // already have month, error
1010  return kEmptyStr;
1011  }
 // only the first three letters are used to recognize a month name
1012  if (one_token.length() > 3) {
1013  one_token = one_token.substr(0, 3);
1014  }
1015  try {
1016  int month_num = CTime::MonthNameToNum(one_token);
1017  found = true;
1018  month = CTime::MonthNumToName(month_num, CTime::eAbbr);
1019  } catch (const CTimeException&) {
 // not a month name: the token stays in the list
1020  }
1021  } else {
1022  try {
1023  int this_val = NStr::StringToInt (one_token);
1024  int min = 1;
1025  int max = 31;
1026  if (this_val < min) {
1027  return kEmptyStr;
1028  } else if (this_val > max) {
1029  if (year > 0) {
1030  // already have year, error
1031  return kEmptyStr;
1032  }
1033  year = this_val;
1034  found = true;
1035  }
1036  } catch ( ... ) {
1037  // threw exception while converting to int
1038  return kEmptyStr;
1039  }
1040  }
1041  if (found) {
1042  it = tokens.erase(it);
1043  } else {
1044  it++;
1045  }
1046  }
1047 
 // Pass 2: assign roles to whatever tokens are left.
1048  if (tokens.size() == 0) {
1049  // good - all tokens assigned to values
1050  } else if (tokens.size() > 2) {
1051  // three numbers: treat last one as year
1052  try {
1053  year = NStr::StringToInt(tokens[2]);
1054  if (year < 100) {
1055  year += 2000;
1056  }
1057  if (!s_ChooseMonthAndDay(tokens[0], tokens[1], month_first, month, day, month_ambiguous)) {
1058  return kEmptyStr;
1059  }
1060  // mark month as ambiguous, since we are guessing about year
1061  month_ambiguous = true;
1062  } catch ( ... ) {
1063  // threw exception while converting to int
1064  return kEmptyStr;
1065  }
1066  } else if (tokens.size() == 1) {
 // one number left: it fills the first missing field in the order
 // year, month, day
1067  try {
1068  int val = NStr::StringToInt (tokens[0]);
1069  if (year == 0) {
1070  year = val;
1071  } else {
1072  if (NStr::IsBlank (month)) {
1073  if (val > 0 && val < 13) {
1075  } else {
1076  // month number out of range
1077  return kEmptyStr;
1078  }
1079  } else {
1080  day = val;
1081  }
1082  }
1083  } catch ( ... ) {
1084  // threw exception while converting to int
1085  return kEmptyStr;
1086  }
1087  } else if (!NStr::IsBlank (month)) {
1088  if (tokens.size() == 2) {
1089  // we have a month and two other numbers (we hope)
1090  int val1 = 0;
1091  int val2 = 0;
1092  try {
1093  val1 = NStr::StringToInt (tokens[0]);
1094  val2 = NStr::StringToInt (tokens[1]);
1095  } catch (CException& /*e*/) {
1096  // not actually numbers
1097  return kEmptyStr;
1098  }
1099  bool zero_pad_1 = NStr::StartsWith(tokens[0], "0");
1100  bool zero_pad_2 = NStr::StartsWith(tokens[1], "0");
1101  if (val1 < 10 && !zero_pad_1 && (val2 > 10 || zero_pad_2)) {
1102  // if one token is not zero-padded and less than 10,
1103  // the other either is zero-padded and greater than 10,
1104  // the "small" token is the day and the second (+2000) is the year
1105  day = val1;
1106  year = val2 + 2000;
1107  } else if (val2 < 10 && !zero_pad_2 && (val1 > 10 || zero_pad_1)) {
1108  // if one token is not zero-padded and less than 10,
1109  // the other either is zero-padded and greater than 10,
1110  // the "small" token is the day and the second (+2000) is the year
1111  day = val2;
1112  year = val1 + 2000;
1113  } else {
 // no padding hint: prefer (day=val1, year=val2) if that is a
 // valid calendar day, otherwise swap the roles
1114  int month_num = CTime::MonthNameToNum(month);
1115  if (IsDayValueOkForMonth(val1, month_num, val2 + 2000)) {
1116  day = val1;
1117  year = val2 + 2000;
1118  } else {
1119  day = val2;
1120  year = val1 + 2000;
1121  }
1122  }
1123  } else {
1124  return kEmptyStr;
1125  }
1126  } else {
 // two numbers left and no month name: pick month and day among them
1127  if (!s_ChooseMonthAndDay(tokens[0], tokens[1], month_first, month, day, month_ambiguous)) {
1128  return kEmptyStr;
1129  }
1130  }
1131 
1132  // make sure day is valid
1133  if (day > 0 && !NStr::IsBlank(month) && year > -1) {
1134  try {
1135  int month_num = CTime::MonthNameToNum(month);
1136  if (!IsDayValueOkForMonth(day, month_num, year)) {
1137  return kEmptyStr;
1138  }
1139  } catch (const CTimeException&) {
1140  return kEmptyStr;
1141  }
1142  }
1143 
1144  if (year > 0 && year < 100 && num_original_tokens > 1) {
1145  // try to guess year from two-digit year provided,
1146  // only if it could not possibly be a day of the month
1147  // and if there were at least two tokens provided
 // 20xx is used unless that year would lie in the future, then 19xx
1148  string year_date = NStr::NumericToString(year + 2000);
1149  bool format_bad = false;
1150  bool in_future = false;
1151  IsCorrectDateFormat(year_date, format_bad, in_future);
1152  if (in_future) {
1153  year += 1900;
1154  } else {
1155  year += 2000;
1156  }
1157  }
 // Output is produced only for years 1000..2099; otherwise the result
 // stays empty.
1158  if (year >= 1000 && year < 2100) {
1159  reformatted_date = NStr::NumericToString (year);
1160  if (!NStr::IsBlank (month)) {
1161  reformatted_date = month + "-" + reformatted_date;
1162  if (day > 0) {
 // day is zero-padded to two digits
1163  string day_str = NStr::NumericToString (day);
1164  if (day_str.length() < 2) {
1165  day_str = "0" + day_str;
1166  }
1167  reformatted_date = day_str + "-" + reformatted_date;
1168  }
1169  }
1170  }
1171 
1172  return reformatted_date;
1173 }
1174 
1175 
1176 void CSubSource::DetectDateFormat(const string& orig_date, bool& ambiguous, bool &day_first)
1177 {
1178  ambiguous = false;
1179  day_first = false;
1180  vector<string> tokens = x_GetDateTokens(orig_date);
1181  if (tokens.size() != 3) {
1182  // can't do detection if there are more or less than three tokens
1183  ambiguous = true;
1184  return;
1185  }
1186  vector<int> nums;
1187 
1188  // detection is only valid if all tokens are numbers and at least one is known to be the year
1189  try {
1190  ITERATE(vector<string>, it, tokens) {
1191  nums.push_back(NStr::StringToInt (*it));
1192  }
1193  } catch ( ... ) {
1194  // threw exception while converting to int
1195  ambiguous = true;
1196  return;
1197  }
1198  enum EPos { eDay = 0, eMonth = 1, eYear = 2 };
1199  vector<int> positions;
1200  positions.push_back(0);
1201  positions.push_back(0);
1202  positions.push_back(0);
1203 
1204  int token_pos = 1;
1205  ITERATE(vector<int>, it, nums) {
1206  if (*it > 31) {
1207  if (positions[eYear] > 0) {
1208  // already found a year
1209  ambiguous = true;
1210  return;
1211  }
1212  positions[eYear] = token_pos;
1213  } else if (*it > 12) {
1214  if (positions[eDay] > 0) {
1215  // already found a day
1216  ambiguous = true;
1217  return;
1218  }
1219  positions[eDay] = token_pos;
1220  } else if (positions[eMonth] > 0) {
1221  // already found a month
1222  ambiguous = true;
1223  return;
1224  } else {
1225  positions[eMonth] = token_pos;
1226  }
1227  token_pos++;
1228  }
1229  if (positions[eDay] < positions[eMonth]) {
1230  day_first = true;
1231  } else {
1232  day_first = false;
1233  }
1234 }
1235 
1236 
1237 void CSubSource::IsCorrectLatLonFormat (string lat_lon, bool& format_correct, bool& precision_correct,
1238  bool& lat_in_range, bool& lon_in_range,
1239  double& lat_value, double& lon_value)
1240 {
1241  format_correct = false;
1242  lat_in_range = false;
1243  lon_in_range = false;
1244  precision_correct = false;
1245  double ns, ew;
1246  char lon, lat;
1247  int processed;
1248 
1249  lat_value = 0.0;
1250  lon_value = 0.0;
1251 
1252  if (NStr::IsBlank(lat_lon)) {
1253  return;
1254  } else if (sscanf (lat_lon.c_str(), "%lf %c %lf %c%n", &ns, &lat, &ew, &lon, &processed) != 4
1255  || size_t(processed) != lat_lon.length()) {
1256  return;
1257  } else if ((lat != 'N' && lat != 'S') || (lon != 'E' && lon != 'W')) {
1258  return;
1259  } else {
1260  // init values found
1261  if (lat == 'N') {
1262  lat_value = ns;
1263  } else {
1264  lat_value = 0.0 - ns;
1265  }
1266  if (lon == 'E') {
1267  lon_value = ew;
1268  } else {
1269  lon_value = 0.0 - ew;
1270  }
1271 
1272  // make sure format is correct
1273  vector<string> pieces;
1274  NStr::Split(lat_lon, " ", pieces);
1275  if (pieces.size() > 3) {
1276  int precision_lat = x_GetPrecision(pieces[0]);
1277  int precision_lon = x_GetPrecision(pieces[2]);
1278 
1279  char reformatted[1000];
1280  sprintf (reformatted, "%.*lf %c %.*lf %c", precision_lat, ns, lat,
1281  precision_lon, ew, lon);
1282 
1283  size_t len = strlen (reformatted);
1284  if (NStr::StartsWith(lat_lon, reformatted)
1285  && (len == lat_lon.length()
1286  || (len < lat_lon.length()
1287  && lat_lon[len] == ';'))) {
1288  format_correct = true;
1289  if (ns <= 90 && ns >= 0) {
1290  lat_in_range = true;
1291  }
1292  if (ew <= 180 && ew >= 0) {
1293  lon_in_range = true;
1294  }
1295  if (precision_lat < 3 && precision_lon < 3) {
1296  precision_correct = true;
1297  }
1298  }
1299  }
1300  }
1301 }
1302 
1303 
1305 {
1306  bool format_correct = false;
1307  bool precision_correct = false;
1308  bool lat_in_range = false;
1309  bool lon_in_range = false;
1310  double lat_value = 0.0;
1311  double lon_value = 0.0;
1312  IsCorrectLatLonFormat(orig, format_correct, precision_correct,
1313  lat_in_range, lon_in_range,
1314  lat_value, lon_value);
1315  if (!format_correct || !lat_in_range || !lon_in_range || precision_correct) {
1316  return orig;
1317  }
1318  vector<string> pieces;
1319  NStr::Split(orig, " ", pieces);
1320  if (pieces.size() > 3) {
1321  int precision_lat = x_GetPrecision(pieces[0]);
1322  int precision_lon = x_GetPrecision(pieces[2]);
1323  if (precision_lat > 4) {
1324  precision_lat = 4;
1325  }
1326  if (precision_lon > 4) {
1327  precision_lon = 4;
1328  }
1329 
1330  char reformatted[1000];
1331  sprintf(reformatted, "%.*lf %c %.*lf %c", precision_lat, fabs(lat_value), pieces[1].c_str()[0],
1332  precision_lon, fabs(lon_value), pieces[3].c_str()[0]);
1333  string new_val = reformatted;
1334  return reformatted;
1335  }
1336  return kEmptyStr;
1337 }
1338 
1339 /*
1340 1. String should be converted to UTF8 string, this will get rid of \xC0 and similar substrings
1341 2. Every codepoint (note that this is not regular ascii "char") that is not a digit or a decimal point or a letter should be prepended with a space.
1342  Transitions from alpha to digit/point and from digit/point to alpha should also be prepended with a space.
1343 3. NStr::Split is called with space as a separator and Tokenize flag - need to check if Split works with UTF8 strings properly.
1344 4. After this we should have a vector of tokens, some of which are numbers and others are "modifiers" such as ', '', degrees, N, S, E, W, etc.
1345 5. A pattern string is created where each number is replaced with "1" and modifiers are normalized to tokens such as "lat" or "N"; the actual numerical values are kept in a separate vector.
1346 6. Based on the pattern, the vector of numbers is parsed into degrees, minutes, and seconds.
1347 7. NSEW and "latitude/longitude" are applied to degrees in the order of appearance; if none are present, another heuristic is used to determine which value is latitude and which is longitude.
1348 */
1349 
// Returns a copy of old_str with spaces inserted so that letters, numeric
// text and punctuation end up as separate space-delimited tokens.
//
// Symbols below 0x80 are copied; anything else is replaced by one space.
// A character that is not a letter, a digit, '.', '-' or '+' gets a space on
// both sides. A space is also inserted wherever the text switches between a
// letter and a non-letter.
1350 static string s_InsertSpacesBetweenTokens(const string &old_str)
1351 {
1352  string new_str;
1353  for (string::const_iterator i = old_str.begin(); i != old_str.end(); ++i)
1354  {
// NOTE(review): the statement that declares and assigns `sym` is not present
// in this view of the file - confirm how it is derived from `i`.
1356  if (sym < 0x80)
1357  {
1358  char c = static_cast<char>(sym);
// punctuation-like character: put a space in front of it
1359  if (!isalpha(c) && !isdigit(c) && c != '.' && c != '-' && c != '+')
1360  {
1361  new_str += ' ';
1362  }
// otherwise separate a letter from a preceding non-letter, and vice versa
1363  else if (!new_str.empty() &&
1364  ((isalpha(new_str.back()) && !isalpha(c)) ||
1365  (!isalpha(new_str.back()) && isalpha(c))))
1366  {
1367  new_str += ' ';
1368  }
1369  new_str += c;
// punctuation-like character: also put a space after it
1370  if (!isalpha(c) && !isdigit(c) && c != '.' && c != '-' && c != '+')
1371  {
1372  new_str += ' ';
1373  }
1374  }
1375  else
1376  {
// symbol at or above 0x80: emit a single space in its place
1377  new_str += ' ';
1378  }
1379  }
1380  return new_str;
1381 }
1382 
// Returns a copy of old_str in which a decimal point that was separated from
// its digits by single spaces ("12 . 5") is joined back up ("12.5").
//
// Symbols at or above 0x80 are replaced by one space. If the whole input
// consists only of digits, '+', '-', '.' and whitespace, every '+' and '-'
// in the result is additionally given a leading space.
1383 static string s_RemoveSpacesWithinNumbers(const string &old_str)
1384 {
1385  string new_str;
// stays true only while every symbol seen could belong to plain numeric text
1386  bool is_number = true;
1387  for (string::const_iterator i = old_str.begin(); i != old_str.end(); ++i)
1388  {
// NOTE(review): the statement that declares and assigns `sym` is not present
// in this view of the file - confirm how it is derived from `i`.
1390  if (sym < 0x80)
1391  {
1392  char c = static_cast<char>(sym);
1393  size_t j = new_str.size();
// output ends in "<digit> . " and a digit follows: drop " . " and re-add "."
1394  if (j >= 4 && new_str[j-1] == ' ' && new_str[j-2] == '.' && new_str[j-3] == ' ' && isdigit(new_str[j-4]) && isdigit(c))
1395  {
1396  new_str.pop_back();
1397  new_str.pop_back();
1398  new_str.pop_back();
1399  new_str += '.';
1400  }
1401  new_str += c;
1402  if (!isdigit(c) && c != '+' && c != '-' && c != '.' && !isspace(c)) {
1403  is_number = false;
1404  }
1405  }
1406  else
1407  {
// symbol at or above 0x80: replaced by a space, and the text is not purely numeric
1408  new_str += ' ';
1409  is_number = false;
1410  }
1411  }
1412  if (is_number)
1413  {
// purely numeric text: put a space before each sign character
1414  NStr::ReplaceInPlace(new_str, "+", " +");
1415  NStr::ReplaceInPlace(new_str, "-", " -");
1416  }
1417  return new_str;
1418 }
1419 
1420 static bool s_IsNumber(const string &token, double *result = NULL)
1421 {
1422  double num = NStr::StringToDouble(token, NStr::fConvErr_NoThrow);
1423  if (!num && errno)
1424  {
1425  return false;
1426  }
1427  if (result) {
1428  *result = num;
1429  }
1430  return true;
1431 }
1432 
1433 static string s_NormalizeTokens(vector<string> &tokens, vector<double> &numbers, vector<string> &anum, vector<int> &precision, vector<string> &lat_long, vector<string> &nsew)
1434 {
1435  vector<string> pattern;
1436  for (size_t i = 0; i < tokens.size(); i++)
1437  {
1438  string &token = tokens[i];
1439 
1440  double num;
1441  if (s_IsNumber(token, &num))
1442  {
1443  numbers.push_back(num);
1444  anum.push_back(token);
1445  pattern.push_back("1");
1446  precision.push_back(0);
1447  if (NStr::Find(token, ".") != NPOS && !NStr::EndsWith(token, "."))
1448  {
1449  precision.back()
1450  = static_cast<int>(token.length() - token.find('.') - 1);
1451  }
1452  continue;
1453  }
1454 
1455  {
1456  vector<string> tmp;
1457  NStr::Split(token, ".", tmp);
1458  double num0, num1, num2;
1459  if (tmp.size() == 3 && s_IsNumber(tmp[0], &num0) && s_IsNumber(tmp[1], &num1) && s_IsNumber(tmp[2], &num2))
1460  {
1461  numbers.push_back(num0);
1462  anum.push_back(tmp[0]);
1463  pattern.push_back("1");
1464  precision.push_back(0);
1465  numbers.push_back(num1);
1466  anum.push_back(tmp[1]);
1467  pattern.push_back("1");
1468  precision.push_back(0);
1469  numbers.push_back(num2);
1470  anum.push_back(tmp[2]);
1471  pattern.push_back("1");
1472  precision.push_back(0);
1473  continue;
1474  }
1475  }
1476 
1477  if (token == "\'" && i >= 3 && s_IsNumber(tokens[i - 1]) && tokens[i - 2] == "\'" && s_IsNumber(tokens[i - 3]))
1478  {
1479  token = "\"";
1480  }
1481 
1482  if (NStr::EqualNocase(token, "degrees") || NStr::EqualNocase(token, "deg") || NStr::EqualNocase(token, "deg.") || NStr::EqualNocase(token, "degree"))
1483  {
1484  token = "degrees";
1485  pattern.push_back("degrees");
1486  }
1487  else if ( token == "\'" || NStr::EqualNocase(token, "min") || NStr::EqualNocase(token, "min.") || NStr::EqualNocase(token, "minute") || NStr::EqualNocase(token, "minutes"))
1488  {
1489  token = "\'";
1490  pattern.push_back("\'");
1491  }
1492  else if (token == "\"" || NStr::EqualNocase(token, "sec") || NStr::EqualNocase(token, "sec.") || NStr::EqualNocase(token, "second") || NStr::EqualNocase(token, "seconds"))
1493  {
1494  token = "\"";
1495  pattern.push_back("\"");
1496  }
1497  else if (token == "," || token == ":" || token == "_" || token == "&" || token == "." || token == ";" || token == "#" || NStr::EqualNocase(token, "and"))
1498  {
1499  }
1500  else if (NStr::EqualNocase(token, "lattitude") || NStr::EqualNocase(token, "latitude") || NStr::EqualNocase(token, "lat") || NStr::EqualNocase(token, "lat."))
1501  {
1502  pattern.push_back("lat");
1503  lat_long.push_back("lat");
1504  }
1505  else if (NStr::EqualNocase(token, "longitude") || NStr::EqualNocase(token, "lo") || NStr::EqualNocase(token, "lon") || NStr::EqualNocase(token, "long")
1506  || NStr::EqualNocase(token, "lo.") || NStr::EqualNocase(token, "lon.") || NStr::EqualNocase(token, "long."))
1507  {
1508  pattern.push_back("lat");
1509  lat_long.push_back("long");
1510  }
1511  else if (token == "N" || NStr::EqualNocase(token, "north"))
1512  {
1513  pattern.push_back("N");
1514  nsew.push_back("N");
1515  }
1516  else if (token == "S" || NStr::EqualNocase(token, "south"))
1517  {
1518  pattern.push_back("N");
1519  nsew.push_back("S");
1520  }
1521  else if (token == "E" || NStr::EqualNocase(token, "east"))
1522  {
1523  pattern.push_back("N");
1524  nsew.push_back("E");
1525  }
1526  else if (token == "W" || NStr::EqualNocase(token, "west") || token == "Wdeg")
1527  {
1528  pattern.push_back("N");
1529  nsew.push_back("W");
1530  }
1531  else if (token == "NW")
1532  {
1533  nsew.push_back("N");
1534  nsew.push_back("W");
1535  }
1536  else if (token == "NE")
1537  {
1538  nsew.push_back("N");
1539  nsew.push_back("E");
1540  }
1541  else if (token == "SW")
1542  {
1543  nsew.push_back("S");
1544  nsew.push_back("W");
1545  }
1546  else if (token == "SE")
1547  {
1548  nsew.push_back("S");
1549  nsew.push_back("E");
1550  }
1551  else
1552  {
1553  //cout << "Token: " << token << endl;
1554  numbers.clear();
1555  return kEmptyStr;
1556  }
1557  }
1558  //cout << "Pattern: " << NStr::Join(pattern, " ") << endl;
1559  return NStr::Join(pattern, " ");
1560 }
1561 
// Puts a pair of coordinates into (latitude, longitude) order and applies the
// hemisphere signs.
//
// numbers    in/out; must hold exactly two values, otherwise it is cleared
// precision  in/out; swapped together with numbers
// lat_long   "lat"/"long" words found in the input; either none or two
// nsew       direction letters found in the input; either none or two.
//            Swapped in place when the order is reversed.
//
// numbers is cleared whenever the input is inconsistent: a wrong count of
// words or letters, a letter on the wrong axis, or a value out of range
// (|latitude| > 90 or |longitude| > 180).
static void s_ReorderNorthSouthEastWest(vector<double> &numbers, vector<int> &precision, const vector<string> &lat_long, vector<string> &nsew)
{
    // exchanges the two coordinates along with their precisions
    const auto swap_coordinates = [&]() {
        swap(numbers[0], numbers[1]);
        swap(precision[0], precision[1]);
    };
    // forces the sign of value according to letter; false if letter is neither
    const auto apply_hemisphere = [](double& value, const string& letter,
                                     const char* positive, const char* negative) {
        if (letter == positive) {
            value = fabs(value);
            return true;
        }
        if (letter == negative) {
            if (value != 0) {
                value = -fabs(value);
            }
            return true;
        }
        return false;
    };

    if (numbers.size() != 2) {
        numbers.clear();
        return;
    }

    // explicit latitude/longitude words: longitude named first means reversed
    if (lat_long.size() == 2) {
        if (lat_long.front() == "long") {
            swap_coordinates();
            if (nsew.size() == 2) {
                swap(nsew[0], nsew[1]);
            }
        }
    } else if (!lat_long.empty()) {
        numbers.clear();
        return;
    }

    // direction letters: E/W followed by N/S means reversed
    if (nsew.size() == 2) {
        const bool first_is_east_west = (nsew[0] == "E" || nsew[0] == "W");
        const bool second_is_north_south = (nsew[1] == "N" || nsew[1] == "S");
        if (first_is_east_west && second_is_north_south) {
            swap_coordinates();
            swap(nsew[0], nsew[1]);
        }
        if (!apply_hemisphere(numbers[0], nsew[0], "N", "S")
            || !apply_hemisphere(numbers[1], nsew[1], "E", "W")) {
            numbers.clear();
            return;
        }
    } else if (!nsew.empty()) {
        numbers.clear();
        return;
    }

    // no hints at all: a first value that cannot be a latitude must be the longitude
    if (lat_long.empty() && nsew.empty() && fabs(numbers[0]) > 90 && fabs(numbers[1]) < 90) {
        swap_coordinates();
    }

    if (fabs(numbers[0]) > 90 || fabs(numbers[1]) > 180) {
        numbers.clear();
    }
}
1640 
1641 static void s_GetLatLong(const string &new_str, vector<double> &numbers, vector<int> &precision)
1642 {
1643  vector<string> tokens;
1644  NStr::Split(new_str, " ", tokens, NStr::fSplit_Tokenize);
1645  vector<string> lat_long;
1646  vector<string> nsew;
1647  vector<string> anum;
1648  string pattern = s_NormalizeTokens(tokens, numbers, anum, precision, lat_long, nsew);
1649  if (pattern.empty())
1650  {
1651  numbers.clear();
1652  return;
1653  }
1654  vector<double> degrees(2, 0);
1655  vector<int> prec(2, 0);
1656  int sign1 = 1;
1657  int sign2 = 1;
1658  if ( pattern == "1 1" ||
1659  pattern == "1 N 1 N" ||
1660  pattern == "N 1 N 1" ||
1661  pattern == "1 degrees N 1 degrees N" ||
1662  pattern == "lat 1 lat 1" ||
1663  pattern == "1 N lat 1 N lat" ||
1664  pattern == "1 degrees N lat 1 degrees N lat")
1665  {
1666  degrees[0] = numbers[0];
1667  degrees[1] = numbers[1];
1668  prec[0] = precision[0];
1669  prec[1] = precision[1];
1670  }
1671  else if ((pattern == "1 1 \" 1 1 '" ||
1672  pattern == "1 degrees 1 \" N 1 degrees 1 ' N")
1673  && numbers[1] < 60 && numbers[3] < 60
1674  && numbers[1] >= 0 && numbers[3] >= 0)
1675  {
1676  sign1 = anum[0][0] == '-' ? -1 : 1;
1677  sign2 = anum[2][0] == '-' ? -1 : 1;
1678  degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 3600);
1679  degrees[1] = sign2*(fabs(numbers[2]) + numbers[3] / 60);
1680  prec[0] = max(precision[0], precision[1] + 4);
1681  prec[1] = max(precision[2], precision[3] + 2);
1682  }
1683  else if ( (pattern == "1 1 ' 1" ||
1684  pattern == "1 degrees 1 ' N 1 degrees N")
1685  && numbers[1] < 60
1686  && numbers[1] >= 0)
1687  {
1688  sign1 = anum[0][0] == '-' ? -1 : 1;
1689  degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60);
1690  degrees[1] = numbers[2];
1691  prec[0] = max(precision[0], precision[1] + 2);
1692  prec[1] = precision[2];
1693  }
1694  else if (pattern == "1 1 ' 1 \" 1"
1695  && numbers[1] < 60 && numbers[2] < 60
1696  && numbers[1] >= 0 && numbers[2] >= 0)
1697  {
1698  sign1 = anum[0][0] == '-' ? -1 : 1;
1699  degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60 + numbers[2] / 3600);
1700  degrees[1] = numbers[3];
1701  prec[0] = max(max(precision[0], precision[1] + 2), precision[2] + 4);
1702  prec[1] = precision[3];
1703  }
1704  else if ((pattern == "1 1 ' 1 \" 1 1 '" ||
1705  pattern == "1 1 1 N 1 1 N" ||
1706  pattern == "1 degrees 1 ' 1 \" N 1 degrees 1 ' N")
1707  && numbers[1] < 60 && numbers[2] < 60 && numbers[4] < 60
1708  && numbers[1] >= 0 && numbers[2] >= 0 && numbers[4] >= 0)
1709  {
1710  sign1 = anum[0][0] == '-' ? -1 : 1;
1711  sign2 = anum[3][0] == '-' ? -1 : 1;
1712  degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60 + numbers[2] / 3600);
1713  degrees[1] = sign2*(fabs(numbers[3]) + numbers[4] / 60);
1714  prec[0] = max(max(precision[0], precision[1] + 2), precision[2] + 4);
1715  prec[1] = max(precision[3], precision[4] + 2);
1716  }
1717  else if (( pattern == "1 1 ' 1 \" 1 1 ' 1 \"" ||
1718  pattern == "1 1 ' 1 \" N 1 1 ' 1 \" N" ||
1719  pattern == "1 degrees 1 ' 1 \" 1 degrees 1 ' 1 \"" ||
1720  pattern == "1 degrees 1 ' 1 \" N 1 degrees 1 ' 1 \" N" ||
1721  pattern == "N 1 degrees 1 ' 1 \" N 1 degrees 1 ' 1 \"" ||
1722  pattern == "1 degrees 1 ' 1 N 1 degrees 1 ' 1 N" ||
1723  pattern == "1 degrees 1 1 N 1 degrees 1 1 N" ||
1724  pattern == "1 1 1 N 1 1 1 N")
1725  && numbers[1] < 60 && numbers[2] < 60 && numbers[4] < 60 && numbers[5] < 60
1726  && numbers[1] >= 0 && numbers[2] >= 0 && numbers[4] >= 0 && numbers[5] >= 0)
1727  {
1728  sign1 = anum[0][0] == '-' ? -1 : 1;
1729  sign2 = anum[3][0] == '-' ? -1 : 1;
1730  degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60 + numbers[2] / 3600);
1731  degrees[1] = sign2*(fabs(numbers[3]) + numbers[4] / 60 + numbers[5] / 3600);
1732  prec[0] = max(max(precision[0], precision[1] + 2), precision[2] + 4);
1733  prec[1] = max(max(precision[3], precision[4] + 2), precision[5] + 4);
1734  }
1735  else if (( pattern == "1 1 ' 1 1 '" ||
1736  pattern == "1 1 N 1 1 N" ||
1737  pattern == "1 1 ' N 1 1 ' N" ||
1738  pattern == "1 degrees 1 ' N 1 degrees 1 ' N" ||
1739  pattern == "lat 1 degrees 1 ' N lat 1 degrees 1 ' N" ||
1740  pattern == "1 degrees 1 N 1 degrees 1 N" ||
1741  pattern == "1 degrees 1 N 1 degrees 1 ' N" ||
1742  pattern == "1 degrees 1 ' N 1 degrees 1 N" ||
1743  pattern == "N 1 degrees 1 ' N 1 degrees 1" ||
1744  pattern == "N 1 degrees 1 ' N 1 degrees 1 '" ||
1745  pattern == "N 1 degrees 1 ' N 1 1 '")
1746  && numbers[1] < 60 && numbers[3] < 60
1747  && numbers[1] >= 0 && numbers[3] >= 0)
1748  {
1749  sign1 = anum[0][0] == '-' ? -1 : 1;
1750  sign2 = anum[2][0] == '-' ? -1 : 1;
1751  degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60);
1752  degrees[1] = sign2*(fabs(numbers[2]) + numbers[3] / 60);
1753  prec[0] = max(precision[0], precision[1] + 2);
1754  prec[1] = max(precision[2], precision[3] + 2);
1755  }
1756  else if ((pattern == "1 N 1 1 N" ||
1757  pattern == "1 degrees N 1 degrees 1 ' N")
1758  && numbers[2] < 60
1759  && numbers[2] >= 0)
1760  {
1761  sign2 = anum[1][0] == '-' ? -1 : 1;
1762  degrees[0] = numbers[0];
1763  degrees[1] = sign2*(fabs(numbers[1]) + numbers[2] / 60);
1764  prec[0] = precision[0];
1765  prec[1] = max(precision[1], precision[2] + 2);
1766  }
1767  else if ((pattern == "1 degrees 1 ' 1 degrees 1 ' 1 \"" ||
1768  pattern == "N 1 1 N 1 1 1")
1769  && numbers[1] < 60 && numbers[3] < 60 && numbers[4] < 60
1770  && numbers[1] >= 0 && numbers[3] >= 0 && numbers[4] >= 0)
1771  {
1772  sign1 = anum[0][0] == '-' ? -1 : 1;
1773  sign2 = anum[2][0] == '-' ? -1 : 1;
1774  degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60);
1775  degrees[1] = sign2*(fabs(numbers[2]) + numbers[3] / 60 + numbers[4] / 3600);
1776  prec[0] = max(precision[0], precision[1] + 2);
1777  prec[1] = max(max(precision[2], precision[3] + 2), precision[4] + 4);
1778  }
1779  else if (pattern == "1 degrees 1 degrees 1 ' 1 \""
1780  && numbers[2] < 60 && numbers[3] < 60
1781  && numbers[2] >= 0 && numbers[3] >= 0)
1782  {
1783  sign2 = anum[1][0] == '-' ? -1 : 1;
1784  degrees[0] = numbers[0];
1785  degrees[1] = sign2*(fabs(numbers[1]) + numbers[2] / 60 + numbers[3] / 3600);
1786  prec[0] = precision[0];
1787  prec[1] = max(max(precision[1], precision[2] + 2), precision[3] + 4);
1788  }
1789  else if (pattern == "1 degrees 1 ' 1 \" N 1 degrees 1 \" N"
1790  && numbers[1] < 60 && numbers[2] < 60 && numbers[4] < 60
1791  && numbers[1] >= 0 && numbers[2] >= 0 && numbers[4] >= 0)
1792  {
1793  sign1 = anum[0][0] == '-' ? -1 : 1;
1794  sign2 = anum[3][0] == '-' ? -1 : 1;
1795  degrees[0] = sign1*(fabs(numbers[0]) + numbers[1] / 60 + numbers[2] / 3600);
1796  degrees[1] = sign2*(fabs(numbers[3]) + numbers[4] / 3600);
1797  prec[0] = max(max(precision[0], precision[1] + 2), precision[2] + 4);
1798  prec[1] = max(precision[3], precision[4] + 4);
1799  }
1800  else
1801  {
1802  degrees.clear();
1803  prec.clear();
1804  }
1805  swap(degrees, numbers);
1806  swap(prec, precision);
1807  s_ReorderNorthSouthEastWest(numbers, precision, lat_long, nsew);
1808 }
1809 
1810 
1811 string s_ShortenLatLon( string &subname ) {
1812  string lat;
1813  string north_or_south;
1814  string lon;
1815  string east_or_west;
1816 
1817  if (subname.length() < 1) {
1818  return subname;
1819  }
1820  char ch = subname[0];
1821  if (ch < '0' || ch > '9') {
1822  return subname;
1823  }
1824 
1825  // extract the pieces
1826  CNcbiIstrstream lat_lon_stream( subname );
1827  lat_lon_stream >> lat;
1828  lat_lon_stream >> north_or_south;
1829  lat_lon_stream >> lon;
1830  lat_lon_stream >> east_or_west;
1831  if( lat_lon_stream.bad() ) {
1832  return subname;
1833  }
1834 
1835  if( north_or_south != "N" && north_or_south != "S" ) {
1836  return subname;
1837  }
1838 
1839  if( east_or_west != "E" && east_or_west != "W" ) {
1840  return subname;
1841  }
1842 
1843  size_t pos = NStr::Find(lat, ".");
1844  if (pos > 0) {
1845  size_t len = lat.length();
1846  if (pos + 9 < len) {
1847  lat.erase(pos + 9);
1848  }
1849  }
1850 
1851  pos = NStr::Find(lon, ".");
1852  if (pos > 0) {
1853  size_t len = lon.length();
1854  if (pos + 9 < len) {
1855  lon.erase(pos + 9);
1856  }
1857  }
1858 
1859  return lat + " " + north_or_south + " " + lon + " " + east_or_west;
1860 }
1861 
1862 string CSubSource::FixLatLonFormat (string orig_lat_lon, bool guess)
1863 {
1864  //cout << "Before: " << orig_lat_lon << endl;
1865  NStr::ParseEscapes(orig_lat_lon);
1866  CStringUTF8 old_str = CUtf8::AsUTF8(orig_lat_lon, CUtf8::GuessEncoding(orig_lat_lon));
1867  if (NStr::StartsWith(old_str, "\""))
1868  {
1869  NStr::TrimPrefixInPlace(old_str, "\"");
1870  NStr::TrimSuffixInPlace(old_str, "\"");
1871  }
1872  NStr::ReplaceInPlace(old_str, "\'\'", "\"");
1873  string fixed_str = s_RemoveSpacesWithinNumbers(old_str);
1874  string new_str = s_InsertSpacesBetweenTokens(fixed_str);
1875  NStr::Sanitize(new_str);
1876  vector<double> numbers;
1877  vector<int> precision;
1878  s_GetLatLong(new_str, numbers, precision);
1879  string res;
1880  if (!numbers.empty())
1881  {
1882  res = MakeLatLon(numbers[0], numbers[1], precision[0], precision[1]);
1883  }
1884  //cout << "After: " << res << endl;
1885  res = s_ShortenLatLon(res);
1886  return res;
1887 }
1888 
1889 
1890 string CSubSource::MakeLatLon(double lat_value, double lon_value, int lat_precision, int lon_precision )
1891 {
1892  char ns = 'N';
1893  if (lat_value < 0) {
1894  ns = 'S';
1895  lat_value = -lat_value;
1896  }
1897  char ew = 'E';
1898  if (lon_value < 0) {
1899  ew = 'W';
1900  lon_value = -lon_value;
1901  }
1902  string lat = NStr::DoubleToString(lat_value, lat_precision);
1903  string lon = NStr::DoubleToString(lon_value, lon_precision);
1904 
1905  NStr::TrimSuffixInPlace(lat, ".");
1906  NStr::TrimSuffixInPlace(lon, ".");
1907  string res = lat + " " + ns + " " + lon + " " + ew;
1908  return res;
1909 }
1910 
1911 
1912 CLatLonCountryId *CSubSource::x_CalculateLatLonId(float lat_value, float lon_value, string country, string province)
1913 {
1914  CLatLonCountryId *id = new CLatLonCountryId(lat_value, lon_value);
1915 
1916  bool goodmatch = false;
1917 
1918  // lookup region by coordinates, or find nearest region and calculate distance
1919  const CCountryExtreme * guess = m_LatLonCountryMap->GuessRegionForLatLon(lat_value, lon_value, country, province);
1920  if (guess) {
1921  id->SetFullGuess(guess->GetCountry());
1922  id->SetGuessCountry(guess->GetLevel0());
1923  id->SetGuessProvince(guess->GetLevel1());
1924  if (NStr::EqualNocase(country, id->GetGuessCountry())
1925  && (NStr::IsBlank(province) || NStr::EqualNocase(province, id->GetGuessProvince()))) {
1926  goodmatch = true;
1927  }
1928  } else {
1929  // not inside a country, check water
1930  guess = m_LatLonWaterMap->GuessRegionForLatLon(lat_value, lon_value, country);
1931  if (guess) {
1932  // found inside water
1933  id->SetGuessWater(guess->GetCountry());
1934  if (NStr::EqualNocase(country, id->GetGuessWater())) {
1935  goodmatch = true;
1936  }
1937 
1938  // also see if close to land for coastal warning (if country is land)
1939  // or proximity message (if country is water)
1940  double landdistance = 0.0;
1941  guess = m_LatLonCountryMap->FindClosestToLatLon (lat_value, lon_value, 5.0, landdistance);
1942  if (guess) {
1943  id->SetClosestFull(guess->GetCountry());
1944  id->SetClosestCountry(guess->GetLevel0());
1945  id->SetClosestProvince(guess->GetLevel1());
1946  id->SetLandDistance(m_LatLonCountryMap->AdjustAndRoundDistance (landdistance));
1947  if (NStr::EqualNocase(country, id->GetClosestCountry())
1948  && (NStr::IsBlank(province) || NStr::EqualNocase(province, guess->GetLevel1()))) {
1949  goodmatch = true;
1950  }
1951  }
1952  } else {
1953  // may be coastal inlet, area of data insufficiency
1954  double landdistance = 0.0;
1955  guess = m_LatLonCountryMap->FindClosestToLatLon (lat_value, lon_value, 5.0, landdistance);
1956  if (guess) {
1957  id->SetClosestFull(guess->GetCountry());
1958  id->SetClosestCountry(guess->GetLevel0());
1959  id->SetClosestProvince(guess->GetLevel1());
1960  id->SetLandDistance(m_LatLonCountryMap->AdjustAndRoundDistance (landdistance));
1961  if (NStr::EqualNocase(country, id->GetClosestCountry())
1962  && (NStr::IsBlank(province) || NStr::EqualNocase(province, guess->GetLevel1()))) {
1963  goodmatch = true;
1964  }
1965  }
1966 
1967  double waterdistance = 0.0;
1968  guess = m_LatLonWaterMap->FindClosestToLatLon (lat_value, lon_value, 5.0, waterdistance);
1969  if (guess) {
1970  id->SetClosestWater(guess->GetLevel0());
1971  id->SetWaterDistance(m_LatLonWaterMap->AdjustAndRoundDistance (waterdistance));
1972  if (NStr::EqualNocase(country, id->GetClosestWater())) {
1973  goodmatch = true;
1974  }
1975  }
1976  }
1977  }
1978 
1979  // if guess is not the provided country or province, calculate distance to claimed country
1980  if (!goodmatch) {
1981  double distance = 0.0;
1982  guess = m_LatLonCountryMap->IsNearLatLon (lat_value, lon_value, 5.0, distance, country, province);
1983  if (guess) {
1984  if (distance < ErrorDistance(lat_value, lon_value, m_LatLonCountryMap->GetScale())) {
1985  // close enough
1986  id->SetGuessCountry(country);
1987  id->SetGuessProvince(province);
1988  id->SetFullGuess(guess->GetCountry());
1989  } else {
1990  id->SetClaimedFull(guess->GetCountry());
1991  id->SetClaimedDistance(m_LatLonCountryMap->AdjustAndRoundDistance (distance));
1992  }
1993  } else if (NStr::IsBlank(province)) {
1994  guess = m_LatLonWaterMap->IsNearLatLon (lat_value, lon_value, 5.0, distance, country, province);
1995  if (guess) {
1996  id->SetClaimedFull(guess->GetCountry());
1997  id->SetClaimedDistance(m_LatLonWaterMap->AdjustAndRoundDistance (distance));
1998  }
1999  }
2000  }
2001 
2002  return id;
2003 }
2004 
2005 
2006 
2009  {"Adriatic Sea", "Mediterranean Sea"},
2010  {"Aegean Sea", "Mediterranean Sea"},
2011  {"Alboran Sea", "Mediterranean Sea"},
2012  {"Andaman Sea", "Indian Ocean"},
2013  {"Arabian Sea", "Indian Ocean"},
2014  {"Argentine Sea", "Atlantic Ocean"},
2015  {"Ariake Sea", "Pacific Ocean"},
2016  {"Baffin Bay", "Atlantic Ocean"},
2017  {"Balearic Sea", "Mediterranean Sea"},
2018  {"Baltic Sea", "Atlantic Ocean"},
2019  {"Barents Sea", "Arctic Ocean"},
2020  {"Bay of Bengal", "Indian Ocean"},
2021  {"Beaufort Sea", "Arctic Ocean"},
2022  {"Bering Sea", "Pacific Ocean"},
2023  {"Bismarck Sea", "Pacific Ocean"},
2024  {"Black Sea", "Mediterranean Sea"},
2025  {"Bohai Sea", "Pacific Ocean"},
2026  {"Caribbean Sea", "Atlantic Ocean"},
2027  {"Celebes Sea", "Pacific Ocean"},
2028  {"Champlain Sea", "Atlantic Ocean"},
2029  {"Chilean Sea", "Pacific Ocean"},
2030  {"China Seas", "Pacific Ocean"},
2031  {"Chukchi Sea", "Arctic Ocean"},
2032  {"Coral Sea", "Pacific Ocean"},
2033  {"Davis Strait", "Atlantic Ocean"},
2034  {"East China Sea", "Pacific Ocean"},
2035  {"East Siberian Sea", "Arctic Ocean"},
2036  {"English Channel", "Atlantic Ocean"},
2037  {"Erythraean Sea", "Indian Ocean"},
2038  {"Golfo de California", "Pacific Ocean"},
2039  {"Greenland Sea", "Arctic Ocean"},
2040  {"Gulf of Mexico", "Atlantic Ocean"},
2041  {"Gulf of Thailand", "Pacific Ocean"},
2042  {"Gulf of Tonkin", "Pacific Ocean"},
2043  {"Hudson Bay", "Arctic Ocean"},
2044  {"Ionian Sea", "Mediterranean Sea"},
2045  {"Irish Sea", "Atlantic Ocean"},
2046  {"Irminger Sea", "Atlantic Ocean"},
2047  {"James Bay", "Atlantic Ocean"},
2048  {"Java Sea", "Indian Ocean"},
2049  {"Kara Sea", "Arctic Ocean"},
2050  {"Koro Sea", "Pacific Ocean"},
2051  {"Labrador Sea", "Atlantic Ocean"},
2052  {"Laccadive Sea", "Indian Ocean"},
2053  {"Laptev Sea", "Arctic Ocean"},
2054  {"Ligurian Sea", "Mediterranean Sea"},
2055  {"Lincoln Sea", "Arctic Ocean"},
2056  {"Myrtoan Sea", "Mediterranean Sea"},
2057  {"North Sea", "Atlantic Ocean"},
2058  {"Norwegian Sea", "Atlantic Ocean"},
2059  {"Pechora Sea", "Arctic Ocean"},
2060  {"Persian Gulf", "Indian Ocean"},
2061  {"Philippine Sea", "Pacific Ocean"},
2062  {"Red Sea", "Indian Ocean"},
2063  {"Salish Sea", "Pacific Ocean"},
2064  {"Sargasso Sea", "Atlantic Ocean"},
2065  {"Scotia Sea", "Southern Ocean"},
2066  {"Sea of Azov", "Black Sea"},
2067  {"Sea of Chiloe", "Pacific Ocean"},
2068  {"Sea of Crete", "Mediterranean Sea"},
2069  {"Sea of Japan", "Pacific Ocean"},
2070  {"Sea of Okhotsk", "Pacific Ocean"},
2071  {"Sea of the Hebrides", "Atlantic Ocean"},
2072  {"Sea of Zanj", "Indian Ocean"},
2073  {"Seas of Greenland", "Atlantic Ocean"},
2074  {"Sethusamudram", "Indian Ocean"},
2075  {"Sibutu Passage", "Pacific Ocean"},
2076  {"Solomon Sea", "Pacific Ocean"},
2077  {"South China Sea", "Pacific Ocean"},
2078  {"Sulu Sea", "Pacific Ocean"},
2079  {"Tasman Sea", "Pacific Ocean"},
2080  {"Thracian Sea", "Mediterranean Sea"},
2081  {"Timor Sea", "Indian Ocean"},
2082  {"Tyrrhenian Sea", "Mediterranean Sea"},
2083  {"Wandel Sea", "Arctic Ocean"},
2084  {"White Sea", "Arctic Ocean"},
2085  {"Yellow Sea", "Pacific Ocean"}
2086 };
2089 
2090 static string x_FindSurroundingOcean (string& water)
2091 
2092 {
2093  TWaterPairMap::const_iterator new_water_pair_iter = sc_WaterPairMap.find(water.c_str());
2094  if( new_water_pair_iter != sc_WaterPairMap.end() ) {
2095  return new_water_pair_iter->second;
2096  }
2097  return kEmptyStr;
2098 }
2099 
2100 
// Checks whether a lat_lon value is consistent with the claimed country name.
//
// input_countryname: claimed country text; anything after the first ';' or ','
//                    is dropped, and a ':' separates country from subregion.
// lat_lon:           in/out. If the first format check fails, the text after the
//                    last ',' is removed and the check is retried. When swapping
//                    or negating the coordinates looks like the fix, it is
//                    overwritten with the corrected value from MakeLatLon.
// check_state:       enables the subregion-level ("more specific than",
//                    "another subregion") reports.
// errcode:           out. Reset to eLatLonCountryErr_None on entry, then set to
//                    the category of the problem found.
// Returns the error text, or an empty string when there is nothing to report.
//
// NOTE(review): this listing skips source lines 2169, 2172, 2228, 2314, 2328,
// 2346 and 2378 (see the embedded numbering). `adjustment` is used below with
// no visible declaration, and two branch headers in the final if/else chain
// are absent.
2101 string CSubSource::ValidateLatLonCountry (const string& input_countryname, string& lat_lon, bool check_state, ELatLonCountryErr& errcode)
2102 {
2103  errcode = eLatLonCountryErr_None;
2104  string countryname = input_countryname;
2105  if (NStr::IsBlank(countryname) || NStr::IsBlank(lat_lon)) {
2106  return kEmptyStr;
2107  }
2108 
// Create the shared country and water lookup maps on first use, serialized
// by a function-local mutex.
2109  {
2110  static std::mutex m;
2111 
2112  std::lock_guard g(m);
2113 
2114  if ( m_LatLonCountryMap.get() == 0 ) {
2115  m_LatLonCountryMap.reset (new CLatLonCountryMap(false));
2116  }
2117  if ( m_LatLonWaterMap.get() == 0 ) {
2118  m_LatLonWaterMap.reset (new CLatLonCountryMap(true));
2119  }
2120  }
2121 
2122  // only do these checks if the latlon format is good
2123  bool format_correct, lat_in_range, lon_in_range, precision_correct;
2124  double lat_value = 0.0, lon_value = 0.0;
2125  CSubSource::IsCorrectLatLonFormat (lat_lon, format_correct, precision_correct,
2126  lat_in_range, lon_in_range,
2127  lat_value, lon_value);
2128  if (!format_correct) {
2129  // may have comma and then altitude, so just get lat_lon component
2130  size_t pos = NStr::Find(lat_lon, ",", NStr::eNocase, NStr::eReverseSearch);
2131  if (pos != NPOS) {
// This truncation modifies the caller's lat_lon string.
2132  lat_lon = lat_lon.substr(0, pos);
2133  CSubSource::IsCorrectLatLonFormat (lat_lon, format_correct, precision_correct,
2134  lat_in_range, lon_in_range,
2135  lat_value, lon_value);
2136  }
2137  }
2138 
2139  // reality checks
2140  if (!format_correct || !lat_in_range || !lon_in_range) {
2141  // incorrect lat_lon format should be reported elsewhere
2142  // incorrect latitude range should be reported elsewhere
2143  // incorrect longitude range should be reported elsewhere
2144  return kEmptyStr;
2145  }
2146 
2147  // get rid of comments after semicolon or comma in country name
2148  size_t pos = NStr::Find(countryname, ";");
2149  if (pos != NPOS) {
2150  countryname = countryname.substr(0, pos);
2151  }
2152  pos = NStr::Find(countryname, ",");
2153  if (pos != NPOS) {
2154  countryname = countryname.substr(0, pos);
2155  }
2156 
2157  // adjust for special cases
2158  if (NStr::StartsWith(countryname, "Norway: Svalbard")) {
2159  countryname = "Svalbard";
2160  }
2161 
// Split "country: province". The province is kept only when the full
// "country: province" string is itself a region known to the country map.
2162  string country = countryname;
2163  string province;
2164  pos = NStr::Find(country, ":");
2165  if (pos != NPOS) {
2166  // is the full string in the list?
2167  if (m_LatLonCountryMap->HaveLatLonForRegion(countryname)) {
2168  province = country.substr(pos + 1);
2170  }
2171  country = country.substr(0, pos);
2173  }
2174  if (NStr::IsBlank(country)) {
2175  return kEmptyStr;
2176  }
2177 
2178  // known exceptions - don't even bother calculating any further
2179  if (NStr::EqualNocase (country, "Antarctica") && lat_value < -60.0) {
2180  return kEmptyStr;
2181  }
2182 
// Quick exit: with no province, a point inside the claimed country or water
// region needs no further analysis. Names found in neither map are ignored,
// except "State of Palestine", which falls through to the full check.
2183  if (! NStr::IsBlank(province)) {
2184  // do not attempt quick exit
2185  } else if (m_LatLonCountryMap->HaveLatLonForRegion(country)) {
2186  if (m_LatLonCountryMap->IsCountryInLatLon(country, lat_value, lon_value)) {
2187  return kEmptyStr;
2188  }
2189  } else if (m_LatLonWaterMap->HaveLatLonForRegion(country)) {
2190  if (m_LatLonWaterMap->IsCountryInLatLon(country, lat_value, lon_value)) {
2191  return kEmptyStr;
2192  }
2193  } else if (NStr::EqualNocase (country, "State of Palestine")) {
2194  } else {
2195  // report unrecognized country
2196  return kEmptyStr;
2197  }
2198 
// From here on `id` is a raw owning pointer: every return path below deletes it.
// NOTE(review): `id` is tested against NULL when computing `flags` but is
// dereferenced unconditionally on the next statements. Confirm that
// x_CalculateLatLonId never returns NULL, or guard these calls.
2199  CLatLonCountryId *id = x_CalculateLatLonId(lat_value, lon_value, country, province);
2200  CLatLonCountryId::TClassificationFlags flags = (id == NULL ? 0 : id->Classify(country, province));
2201 
2202  string wguess = id->GetGuessWater();
2203  string cguess = id->GetGuessCountry();
2204 
2205  // special case where subsection of country has been identified but is not in coordinates of country
2206  // VR-840
2207  if (province.empty() && NStr::Equal(cguess, country)) {
2208  delete id;
2209  return kEmptyStr;
2210  }
2211 
2212  if (NStr::EqualNocase (country, "State of Palestine") &&
2213  (NStr::EqualNocase (cguess, "Gaza Strip") ||
2214  NStr::EqualNocase (cguess, "West Bank"))) {
2215  delete id;
2216  return kEmptyStr;
2217  }
2218 
// A point in a sea or gulf is accepted when the claimed name is the parent
// body of water listed for it in the water-pair table.
2219  if (NStr::IsBlank (cguess) && (! NStr::IsBlank (wguess))) {
2220  string parent = x_FindSurroundingOcean (wguess);
2221  if ((! NStr::IsBlank (parent)) && NStr::EqualNocase (country, parent)) {
2222  delete id;
2223  return kEmptyStr;
2224  }
2225  }
2226 
2227  double neardist = 0.0;
2229  CLatLonCountryId::TClassificationFlags adjusted_flags = 0;
2230 
// No classification yet: if IsNearLatLon (called with 2.0) succeeds and
// reports a neardist below 5.0, treat the claimed country as the guess and
// classify again.
2231  if (!flags && m_LatLonCountryMap->IsNearLatLon(lat_value, lon_value, 2.0, neardist, country) && neardist < 5.0) {
2232  id->SetGuessCountry (country);
2233  id->SetGuessProvince (kEmptyStr);
2234  flags = id->Classify(country, province);
2235  }
2236 
// Still nothing, and not near the claimed region in either map: try, in
// order, swapping lat/lon, negating latitude, then negating longitude. A
// candidate is adopted only when it classifies and lands on land (country
// guess set, water guess blank).
// NOTE(review): when adjusted_flags is non-zero but that land test fails,
// adjust_id is neither adopted nor deleted in the visible code, which looks
// like a leak. This applies to all three attempts.
2237  if (!flags && !m_LatLonCountryMap->IsNearLatLon(lat_value, lon_value, 20.0, neardist, country)
2238  && !m_LatLonWaterMap->IsNearLatLon(lat_value, lon_value, 20.0, neardist, country)) {
2239  /* do not flip from water */
2240  CLatLonCountryId *adjust_id = x_CalculateLatLonId(lon_value, lat_value, country, province);
2241  adjusted_flags = adjust_id == NULL ? 0 : adjust_id->Classify(country, province);
2242  if (adjusted_flags) {
2243  string awguess = adjust_id->GetGuessWater();
2244  string acguess = adjust_id->GetGuessCountry();
2245  if (NStr::IsBlank (awguess) && (! NStr::IsBlank (acguess))) {
2246  delete id;
2247  id = adjust_id;
2248  flags = adjusted_flags;
2249  adjustment = CLatLonCountryMap::fFlip;
2250  }
2251  } else {
2252  if (adjust_id) {
2253  delete adjust_id;
2254  }
2255  adjust_id = x_CalculateLatLonId(-lat_value, lon_value, country, province);
2256  adjusted_flags = adjust_id == NULL ? 0 : adjust_id->Classify(country, province);
2257  if (adjusted_flags) {
2258  string awguess = adjust_id->GetGuessWater();
2259  string acguess = adjust_id->GetGuessCountry();
2260  if (NStr::IsBlank (awguess) && (! NStr::IsBlank (acguess))) {
2261  delete id;
2262  id = adjust_id;
2263  flags = adjusted_flags;
2264  adjustment = CLatLonCountryMap::fNegateLat;
2265  }
2266  } else {
2267  if (adjust_id) {
2268  delete adjust_id;
2269  }
2270  adjust_id = x_CalculateLatLonId(lat_value, -lon_value, country, province);
2271  adjusted_flags = adjust_id == NULL ? 0 : adjust_id->Classify(country, province);
2272  if (adjusted_flags) {
2273  string awguess = adjust_id->GetGuessWater();
2274  string acguess = adjust_id->GetGuessCountry();
2275  if (NStr::IsBlank (awguess) && (! NStr::IsBlank (acguess))) {
2276  delete id;
2277  id = adjust_id;
2278  flags = adjusted_flags;
2279  adjustment = CLatLonCountryMap::fNegateLon;
2280  }
2281  } else {
2282  if (adjust_id) {
2283  delete adjust_id;
2284  }
2285  }
2286  }
2287  }
2288  }
2289 
2290  string error;
2291 
// Build the report. An adopted adjustment takes priority: lat_lon is
// overwritten with the corrected coordinates and errcode becomes
// eLatLonCountryErr_Value.
2292  if (adjustment != CLatLonCountryMap::fNone) {
2293  if (adjustment == CLatLonCountryMap::fFlip) {
2294  errcode = eLatLonCountryErr_Value;
2295  error = "Latitude and longitude values appear to be exchanged";
2296  lat_lon = MakeLatLon(lon_value, lat_value);
2297  } else if (adjustment == CLatLonCountryMap::fNegateLat) {
2298  errcode = eLatLonCountryErr_Value;
2299  if (lat_value < 0.0) {
2300  error = "Latitude should be set to N (northern hemisphere)";
2301  } else {
2302  error = "Latitude should be set to S (southern hemisphere)";
2303  }
2304  lat_lon = MakeLatLon(-lat_value, lon_value);
2305  } else if (adjustment == CLatLonCountryMap::fNegateLon) {
2306  errcode = eLatLonCountryErr_Value;
2307  if (lon_value < 0.0) {
2308  error = "Longitude should be set to E (eastern hemisphere)";
2309  } else {
2310  error = "Longitude should be set to W (western hemisphere)";
2311  }
2312  lat_lon = MakeLatLon(lat_value, -lon_value);
2313  }
// NOTE(review): source line 2314 (the condition for the next "success"
// branch) is missing from this listing.
2315  // success! nothing to report
2316  } else if (flags & CLatLonCountryId::fWaterMatch) {
2317  // success! nothing to report
2318  } else if (flags & CLatLonCountryId::fCountryMatch && NStr::IsBlank(province)) {
2319  if (check_state) {
2320  string full_guess = id->GetFullGuess();
2321  if (!NStr::Equal(full_guess, country)) {
2322  errcode = eLatLonCountryErr_State;
2323  error = "Lat_lon " + lat_lon + " is in " + id->GetFullGuess()
2324  + " (more specific than " + country + ")";
2325  }
2326  }
2327  } else if (!NStr::IsBlank(id->GetGuessWater())) {
// The point is in water although a land region was claimed.
// NOTE(review): source lines 2328 and 2346 (two `if` headers of this section)
// are missing from this listing, so the nesting below cannot be fully
// reconstructed.
2329  bool suppress = false;
2330  string reportregion;
2331  string nosubphrase;
2332  string desphrase = "designated subregion ";
2333  string subphrase = "another subregion ";
2334  string phrase = nosubphrase;
2335  bool show_claimed = false;
2336 
2337  if (id->GetLandDistance() < 100) {
2338  // for now, will not report
2339  // this is a policy decision
2340  suppress = true;
2341  } else if (NStr::Find(countryname, "Island") != NPOS) {
2342  suppress = true;
2343  }
2344 
2345 
2347  reportregion = countryname;
2348  phrase = desphrase;
2349  } else {
2350  // wasn't closest province, so must be closest country
2351  if (!NStr::IsBlank(province) && check_state) {
2352  phrase = subphrase;
2353  reportregion = id->GetClosestFull();
2354  } else {
2355  reportregion = id->GetClosestCountry();
2356  }
2357  if (!NStr::IsBlank(id->GetClaimedFull())) {
2358  show_claimed = true;
2359  }
2360  }
// Red Sea and Gulf of Mexico points closest to a listed bordering country are
// deliberately not reported.
// NOTE(review): "Dijibouti" looks like a misspelling of "Djibouti" (the
// spelling used in s_Countries), so that comparison is unlikely ever to match.
2361  string water = id->GetGuessWater();
2362  if (NStr::EqualNocase (water, "Red Sea") &&
2363  (NStr::EqualNocase (reportregion, "Egypt") ||
2364  NStr::EqualNocase (reportregion, "Saudi Arabia") ||
2365  NStr::EqualNocase (reportregion, "Sudan") ||
2366  NStr::EqualNocase (reportregion, "Eritrea") ||
2367  NStr::EqualNocase (reportregion, "Dijibouti") ||
2368  NStr::EqualNocase (reportregion, "Yemen") ||
2369  NStr::EqualNocase (reportregion, "Israel") ||
2370  NStr::EqualNocase (reportregion, "Jordan"))) {
2371  } else if (NStr::EqualNocase (water, "Gulf of Mexico") &&
2372  (NStr::EqualNocase (reportregion, "USA") ||
2373  NStr::EqualNocase (reportregion, "Mexico"))) {
2374  } else if (!suppress) {
2375  errcode = eLatLonCountryErr_Water;
2376  if (show_claimed) {
// NOTE(review): source line 2378 (the distance operand of this message) is
// missing from this listing.
2377  error = "Lat_lon '" + lat_lon + "' is closest to " + phrase + "'" + reportregion + "' at distance "
2379  + " km, but in water '" + id->GetGuessWater()
2380  + "' - claimed region '" + id->GetClaimedFull()
2381  + "' is at distance " + NStr::IntToString(id->GetClaimedDistance()) + " km";
2382  } else {
2383  error = "Lat_lon '" + lat_lon + "' is closest to " + phrase + "'" + reportregion
2384  + "' at distance " + NStr::IntToString(id->GetLandDistance()) + " km, but in water '"
2385  + id->GetGuessWater() + "'";
2386  }
2387  }
2388  } else if (neardist > 0.0) {
2389  errcode = eLatLonCountryErr_Water;
2390  error = "Lat_lon '" + lat_lon + "' is in water '" + id->GetGuessWater() + "', '"
2391  + countryname + "' is " + NStr::IntToString(m_LatLonCountryMap->AdjustAndRoundDistance(neardist)) + " km away";
2392  } else {
2393  errcode = eLatLonCountryErr_Water;
2394  error = "Lat_lon '" + lat_lon + "' is in water '" + id->GetGuessWater() + "'";
2395  }
2396  } else if (!NStr::IsBlank(id->GetGuessCountry())) {
// The point maps to land in a different country or subregion than claimed.
// Same country with a different province is a State error; otherwise it is a
// Country error.
2397  string full_guess = id->GetFullGuess();
2398  if (NStr::EqualNocase (country, "China") && NStr::EqualNocase (full_guess, "Hong Kong")) {
2399  // skip
2400  } else if (NStr::IsBlank(id->GetClaimedFull())) {
2401  if (NStr::Equal(id->GetGuessCountry(), country) && !NStr::Equal(id->GetGuessProvince(), province)) {
2402  errcode = eLatLonCountryErr_State;
2403  } else {
2404  errcode = eLatLonCountryErr_Country;
2405  }
2406  error = "Lat_lon '" + lat_lon + "' maps to '" + id->GetFullGuess() + "' instead of '"
2407  + countryname + "'";
2408  } else {
2409  if (NStr::IsBlank(province)) {
2410  errcode = eLatLonCountryErr_Country;
2411  error = "Lat_lon '" + lat_lon + "' maps to '" + id->GetFullGuess() + "' instead of '"
2412  + country + "' - claimed region '" + id->GetClaimedFull()
2413  + "' is at distance " + NStr::IntToString(id->GetClaimedDistance()) + " km";
2414  } else {
2415  errcode = eLatLonCountryErr_Country;
2416  if (NStr::EqualNocase(id->GetGuessCountry(), country)) {
2417  errcode = eLatLonCountryErr_State;
2418  }
// State-level mismatches are reported only when check_state is set; otherwise
// the error code is cleared again.
2419  if (errcode == eLatLonCountryErr_Country || check_state) {
2420  error = "Lat_lon '" + lat_lon + "' maps to '" + id->GetFullGuess() + "' instead of '"
2421  + countryname + "' - claimed region '" + id->GetClaimedFull()
2422  + "' is at distance " + NStr::IntToString(id->GetClaimedDistance()) + " km";
2423  } else {
2424  errcode = eLatLonCountryErr_None;
2425  }
2426  }
2427  }
2428  } else if (!NStr::IsBlank(id->GetClosestCountry())) {
2429  errcode = eLatLonCountryErr_Country;
2430  error = "Lat_lon '" + lat_lon + "' is closest to '" + id->GetClosestCountry() + "' instead of '"
2431  + countryname + "'";
2432  } else if (!NStr::IsBlank(id->GetClosestWater())) {
2433  errcode = eLatLonCountryErr_Water;
2434  error = "Lat_lon '" + lat_lon + "' is closest to '" + id->GetClosestWater() + "' instead of '"
2435  + countryname + "'";
2436  } else {
2437  errcode = eLatLonCountryErr_Country;
2438  error = "Unable to determine mapping for lat_lon '" + lat_lon + "' and country '" + countryname + "'";
2439  }
2440 
2441 
2442  delete id;
2443  return error;
2444 }
2445 
2446 
// Single-word values accepted in a sex qualifier. The validation and fix-up
// code below compares these against lower-cased tokens of the input.
// NOTE(review): the listing skips source line 2447, which presumably declares
// this array (it is referenced below as sm_ValidSexQualifierTokens).
2448  "asexual",
2449  "bisexual",
2450  "diecious",
2451  "dioecious",
2452  "f",
2453  "female",
2454  "gelding",
2455  "hermaphrodite",
2456  "intersex",
2457  "m",
2458  "male",
2459  "mixed",
2460  "monecious",
2461  "monoecious",
2462  "neuter",
2463  "unisexual",
2464 };
2465 
2466 
// Multi-word phrases that are matched as a whole, in addition to the
// single-word tokens.
// NOTE(review): the listing skips source line 2467, which presumably declares
// this array (it is referenced below as sm_ValidSexQualifierPhrases).
2468  "pooled males and females",
2469  "pooled male and female",
2470 };
2471 
2472 
// Returns true when `value` compares equal to one of the entries of
// sm_ValidSexQualifierPhrases (linear search with find).
// NOTE(review): the function signature (source line 2473) is missing from this
// listing.
2474 {
// Element count of the phrase table.
2475  size_t max = sizeof(sm_ValidSexQualifierPhrases) / sizeof(const char*);
2476 
2477  const char* *begin = sm_ValidSexQualifierPhrases;
2478  const char* *end = &(sm_ValidSexQualifierPhrases[max]);
2479 
2480  if (find(begin, end, value) != end) {
2481  return true;
2482  } else {
2483  return false;
2484  }
2485 }
2486 
2487 
2489 
// Decides whether a sex qualifier value is acceptable. The value is
// lower-cased and split on space, comma and slash; "and" is ignored. Every
// other word must be in sm_ValidSexQualifierTokens, and at least one such
// word must be present.
// NOTE(review): the signature (source line 2488) and the condition on source
// line 2494 are missing from this listing.
2490 {
2491  string str = value;
2492  NStr::ToLower(str);
2493 
2495  return true;
2496  }
2497 
2498  vector<string> words;
2499  NStr::Split(str, " ,/", words);
2500  if (words.size() == 0) {
2501  return false;
2502  }
2503 
2504  size_t max = sizeof(sm_ValidSexQualifierTokens) / sizeof(const char*);
2505 
2506  const char* *begin = sm_ValidSexQualifierTokens;
2507  const char* *end = &(sm_ValidSexQualifierTokens[max]);
2508 
// Stays false when the input holds nothing but "and".
2509  bool is_good = false;
2510 
2511  ITERATE(vector<string>, w, words) {
2512  if (NStr::Equal(*w, "and")) {
2513  // ok, skip it
2514  } else {
2515  if (find(begin, end, *w) != end) {
2516  is_good = true;
2517  } else {
// One unknown word rejects the whole value.
2518  is_good = false;
2519  break;
2520  }
2521  }
2522  }
2523  return is_good;
2524 }
2525 
2526 
// Builds a normalized sex qualifier string from `value`, or returns kEmptyStr
// when it cannot be fixed automatically (any unknown word, or no usable word).
// "m" and "f" are expanded to "male" and "female". "pooled" or "(pooled)"
// sets a flag that prefixes the result with "pooled ".
// NOTE(review): the signature (source line 2527), the condition on source
// line 2532 and the declaration of `max` (source line 2542) are missing from
// this listing.
2528 {
2529  string str = value;
2530  NStr::ToLower(str);
2531 
2533  return str;
2534  }
2535 
2536  vector<string> words;
2537  NStr::Split(str, " ,/", words);
2538 
2539  if (words.size() == 0) {
2540  return kEmptyStr;
2541  }
2543 
2544  const char* *begin = sm_ValidSexQualifierTokens;
2545  const char* *end = &(sm_ValidSexQualifierTokens[max]);
2546 
2547  vector<string> good_values;
2548  bool pooled = false;
2549 
2550  ITERATE(vector<string>, w, words) {
2551  if (NStr::Equal(*w, "and")) {
2552  // ok, skip it
2553  } else if (NStr::EqualNocase(*w, "(pooled)") || NStr::EqualNocase(*w, "pooled")) {
2554  // set pooled flag
2555  pooled = true;
2556  } else {
2557  if (find(begin, end, *w) != end) {
2558  if (NStr::Equal(*w, "m")) {
2559  good_values.push_back("male");
2560  } else if (NStr::Equal(*w, "f")) {
2561  good_values.push_back("female");
2562  } else {
2563  good_values.push_back(*w);
2564  }
2565  } else {
2566  // if any bad values, can't autofix
2567  return kEmptyStr;
2568  }
2569  }
2570  }
2571  if (good_values.size() == 0) {
2572  // no good tokens, can't autofix
2573  return kEmptyStr;
2574  }
2575 
// Join as "a and b" for two values and "a, b, and c" for three or more.
2576  string fixed = good_values[0];
2577  for (size_t i = 1; i < good_values.size(); i++) {
2578  if (good_values.size() > 2) {
2579  fixed += ",";
2580  }
2581  if (i == good_values.size() - 1) {
2582  fixed += " and";
2583  }
2584  fixed += " " + good_values[i];
2585  }
2586  if (pooled) {
2587  fixed = "pooled " + fixed;
2588  }
2589  return fixed;
2590 }
2591 
2592 
2593 void s_CollectNumberAndUnits(const string& value, string& number, string& units)
2594 {
2595  number.clear();
2596  units.clear();
2597 
2598  if (NStr::IsBlank(value)) {
2599  return;
2600  }
2601 
2602  string::const_iterator it = value.begin();
2603  if (*it == '+' || *it == '-') {
2604  number += *it;
2605  it++;
2606  }
2607 
2608  bool any_digit = false;
2609  bool skip_comma = true;
2610  while (it != value.end() && (isdigit(*it) || *it == ',')) {
2611  if (*it == ',') {
2612  if (skip_comma) {
2613  // only skip the first comma
2614  skip_comma = false;
2615  } else {
2616  break;
2617  }
2618  } else {
2619  any_digit = true;
2620  number += *it;
2621  }
2622  it++;
2623  }
2624 
2625  if (it == value.end()) {
2626  number.clear();
2627  return;
2628  }
2629 
2630  if (*it == '.') {
2631  number += *it;
2632  it++;
2633  while (it != value.end() && isdigit(*it)) {
2634  any_digit = true;
2635  number += *it;
2636  it++;
2637  }
2638  }
2639 
2640  if (it == value.end() || *it != ' ' || !any_digit) {
2641  number.clear();
2642  return;
2643  }
2644 
2645  it++;
2646  while (it != value.end()) {
2647  units += *it;
2648  it++;
2649  }
2650 }
2651 
2652 
// Returns true only when `number` is non-blank and `units` is exactly "m"
// (case-sensitive comparison).
// NOTE(review): the signature (source line 2653) and source line 2661 are
// missing from this listing. Line 2661 presumably fills `number` and `units`,
// e.g. via s_CollectNumberAndUnits; as shown they are never assigned.
2654 {
2655  if (NStr::IsBlank(value)) {
2656  return false;
2657  }
2658 
2659  string number;
2660  string units;
2662  if (NStr::IsBlank(number) || !NStr::EqualCase(units, "m")) {
2663  return false;
2664  } else {
2665  return true;
2666  }
2667 
2668 }
2669 
2670 
2671 int CSubSource::x_GetPrecision(const string& num_str)
2672 {
2673  int precision = 0;
2674  size_t pos = NStr::Find(num_str, ".");
2675  if (pos != NPOS) {
2676  precision = int(num_str.length() - pos - 1);
2677  }
2678  return precision;
2679 }
2680 
2681 
// Formats `val` in fixed notation with `precision` digits after the decimal
// point and returns it as a string.
// NOTE(review): the signature (source line 2682) is missing from this listing.
// NOTE(review): sprintf writes into a 1000-byte stack buffer with no length
// limit. A very large value or precision could overflow it; snprintf with
// sizeof(reformatted) would bound the write.
2683 {
2684  char reformatted[1000];
2685  sprintf(reformatted, "%.*lf", precision, val);
2686  string rval = reformatted;
2687  return rval;
2688 }
2689 
// Normalizes an altitude string to the form "<number> m".
// Returns kEmptyStr when the value is blank, when no number was collected, or
// when the units are not one of the recognized metre or foot spellings.
// Foot values are converted with the factor 0.3048.
// NOTE(review): source lines 2698, 2702 and 2705 are missing from this
// listing. They presumably fill `number` and `units` and write the converted
// value back into `number`; as shown `val` is computed but never used.
2690 string CSubSource::FixAltitude (const string& value)
2691 {
2692  if (NStr::IsBlank(value)) {
2693  return kEmptyStr;
2694  }
2695 
2696  string number;
2697  string units;
2699  if (NStr::IsBlank(number)) {
2700  return kEmptyStr;
2701  } else if (NStr::Equal(units, "ft.") || NStr::Equal(units, "ft") || NStr::Equal(units, "feet") || NStr::Equal(units, "foot")) {
2703  double val = NStr::StringToDouble(number);
// Feet to metres.
2704  val *= 0.3048;
2706  units = "m";
2707  }
2708 
// Accept the metre spellings and always emit the canonical "m".
2709  string rval = kEmptyStr;
2710  if (NStr::Equal(units, "m.")
2711  || NStr::Equal(units, "meters")
2712  || NStr::Equal(units, "meter")
2713  || NStr::Equal(units, "m")) {
2714 
2715  rval = number + " " + "m";
2716  }
2717  return rval;
2718 }
2719 
2720 
2721 // From VR-793:
2722 // A. For segment, endogenous_virus_name:
2723 // 1. Must begin with a letter or number
2724 // 2. Spaces and other printable characters are permitted
2725 // 3. Must not be empty, must not be longer than 240 characters
2726 
// Implements the VR-793 section A rules: the value must be non-blank, start
// with an alphanumeric character, be at most 240 characters long and contain
// only printable characters.
// NOTE(review): the signature (source line 2727) is missing from this listing.
// NOTE(review): isalnum and isprint receive plain char here; for bytes above
// 0x7F on platforms where char is signed that is undefined behaviour. Consider
// casting to unsigned char.
2728 {
2729  if (NStr::IsBlank(value)) {
2730  return false;
2731  } else if (!isalnum(value.c_str()[0])) {
2732  return false;
2733  } else if (value.length() > 240) {
2734  return false;
2735  }
2736 
2737  for (auto it : value) {
2738  if (!isprint(it)) {
2739  return false;
2740  }
2741  }
2742 
2743  return true;
2744 }
2745 
2746 
// NOTE(review): only the braces of this function survive in the listing; its
// signature (source line 2747) and its single body line (2749) are missing.
2748 {
2750 }
2751 
2752 
// NOTE(review): only the braces of this function survive in the listing; its
// signature (source line 2753) and its single body line (2755) are missing.
2754 {
2756 }
2757 
2758 
2759 // From VR-793:
2760 // B. For chromosome, linkage_group and plasmid_name values:
2761 // 4. Must begin with a letter or number
2762 // 5. Must not be empty, must not be longer than 32 characters
2763 // 6. Must not contain <tab>
2764 // 7. Spaces and other printable characters are permitted
2765 // 8. Must not contain the word "plasmid" (ignoring case)
2766 // 9. Must not contain the word "chromosome" (ignoring case)
2767 // 10. Must not contain the phrase "linkage group" (ignoring case)
2768 // 11. Must not contain the series of letters "chr" (ignoring case)
2769 // 12. Must not contain the taxname (ignoring case)
2770 // 14. Must not contain the genus (ignoring case)
2771 // 15. Must not contain the species (ignoring case)
2772 // except allow the species to match the value after an initial 'p' (e.g., JX416328)
2773 // 16. Must not contain the series of letters "chrm" (ignoring case)
2774 // 17. Must not contain the series of letters "chrom" (ignoring case)
2775 // 18. Must not contain the phrase "linkage-group" (ignoring case)
2776 static bool s_FailsGenusOrSpeciesTest(const string& value, const string& taxname)
2777 { // See RW-1436
2778  if (NStr::IsBlank(taxname) ||
2779  NStr::StartsWith(taxname, "Plasmid ", NStr::eNocase) ||
2780  NStr::StartsWith(taxname, "IncQ plasmid", NStr::eNocase)) {
2781  return false;
2782  }
2783 
2784  size_t pos = NStr::Find(taxname, " ");
2785  if (pos != NPOS) {
2786  string genus = taxname.substr(0, pos);
2787  if (NStr::FindNoCase(value, genus) != NPOS) {
2788  // B.14
2789  return true;
2790  }
2791  string species = taxname.substr(pos + 1);
2792 
2793  pos = NStr::FindNoCase(value, species);
2794  if (pos != NPOS) {
2795  if (pos != 1 || value[0] != 'p') {
2796  // B.15
2797  return true;
2798  }
2799  }
2800  }
2801 
2802  return false;
2803 }
2804 
// Shared validity check for replicon names (VR-793 section B).
// Order of checks: Borrelia/Borreliella names starting with "cp" or "lp" are
// accepted outright. Otherwise a value over 32 characters, one that fails the
// genus/species test, or one containing a forbidden phrase (case-insensitive)
// is rejected.
// NOTE(review): the signature (source line 2805) and the condition on source
// line 2812 are missing from this listing.
2806 {
2807  if (NStr::FindNoCase(taxname, "Borrelia") != NPOS || NStr::FindNoCase(taxname, "Borreliella") != NPOS) {
2808  if (NStr::StartsWith(value, "cp") || NStr::StartsWith(value, "lp")) {
2809  return true;
2810  }
2811  }
2813  // checks for isalnum start, blankness and unprintable characters
2814  // B.4, B.5, B.7
2815  return false;
2816  } else if (value.length() > 32) {
2817  // B.5
2818  return false;
2819  }
2820 
2821  if (s_FailsGenusOrSpeciesTest(value, taxname)) {
2822  return false;
2823  }
2824 
// NOTE(review): "chrm" and "chrom" can never be the deciding entry because
// any value containing them also contains "chr". The "linkage_group" entry is
// tagged B.15, which the rule list above uses for the species rule.
2825  static string s_ForbiddenPhrases[] = {
2826  "\t", // B.6.
2827  "plasmid", // B.8
2828  "chromosome", // B.9
2829  "linkage group", // B.10
2830  "chr", // B.11
2831  "linkage_group", // B.15
2832  "chrm", // B.16
2833  "chrom", // B.17
2834  "linkage-group" // B.18
2835  };
2836 
// `auto it` copies each string per iteration; `const auto&` would avoid that.
2837  for (auto it : s_ForbiddenPhrases) {
2838  if (NStr::FindNoCase(value, it) != NPOS) {
2839  return false;
2840  }
2841  }
2842  return true;
2843 }
2844 
2845 
// Validates a chromosome name. Blank values and values starting with "LG"
// (any case) are rejected.
// NOTE(review): source line 2854 (the return statement of the else branch) is
// missing from this listing; it presumably delegates to the shared
// replicon-name check.
2846 bool CSubSource::IsChromosomeNameValid(const string& value, const string& taxname)
2847 {
2848  if (NStr::IsBlank(value)) {
2849  return false;
2850  }
2851  if (NStr::StartsWith(value, "LG", NStr::eNocase)) {
2852  return false;
2853  } else {
2855  }
2856 }
2857 
2858 
// Validates a linkage group name. Blank values are rejected.
// NOTE(review): source line 2864 (the final return statement) is missing from
// this listing; it presumably delegates to the shared replicon-name check.
2859 bool CSubSource::IsLinkageGroupNameValid(const string& value, const string& taxname)
2860 {
2861  if (NStr::IsBlank(value)) {
2862  return false;
2863  }
2865 }
2866 
2867 
2868 // VR-793
2869 // C. For plasmid_name values:
2870 // 19. Exception- megaplasmid is legal
// Validates a plasmid name.
// Accepted outright: "megaplasmid"; "megaplasmid " followed by a single
// space-free word; "F", "F factor" and "F plasmid".
// Any other value containing "plasmid" (case-insensitive) is valid only when
// it is found in the fixed exception set.
// NOTE(review): source line 2907 (the final return statement for all other
// values) is missing from this listing; it presumably delegates to the shared
// replicon-name check.
2871 bool CSubSource::IsPlasmidNameValid(const string& value, const string& taxname)
2872 {
2873  if (NStr::IsBlank(value)) {
2874  return false;
2875  }
2876  if (NStr::Equal(value, "megaplasmid")) {
2877  return true;
2878  }
// 12 is the length of "megaplasmid " including the trailing space.
2879  if (NStr::StartsWith(value, "megaplasmid ") && value.length() > 12 && NStr::Find(value.substr(12), " ") == NPOS) {
2880  return true;
2881  }
2882  if (NStr::Equal(value, "F") || NStr::Equal(value, "F factor") || NStr::Equal(value, "F plasmid")) {
2883  return true;
2884  }
2885 
2886  if (NStr::FindNoCase(value,"plasmid") != NPOS) {
// Lookup uses the PNocase_Conditional comparator; presumably
// case-insensitive by default. TODO confirm.
2887  static const set<string, PNocase_Conditional> s_PlasmidNameExceptions =
2888  { // This list comes from RW-1436/RW-1430
2889  "Plasmid F",
2890  "Plasmid R",
2891  "Plasmid pIP630",
2892  "Plasmid pNG2",
2893  "Plasmid pGT633",
2894  "Plasmid pE5",
2895  "Plasmid pIP1527",
2896  "Plasmid pAM77",
2897  "Plasmid pAZ1",
2898  "Plasmid RP4"
2899  };
2900 
2901  if (s_PlasmidNameExceptions.find(value) != end(s_PlasmidNameExceptions)) {
2902  return true;
2903  }
2904  return false;
2905  }
2906 
2908 }
2909 
2910 
// Pair stored per (cell line, organism) entry. CheckCellLine prints `first`
// as the contaminating cell line and `second` as the organism it comes from.
// NOTE(review): source lines 2912-2913 and 2915-2916 are missing from this
// listing; they presumably declare s_CellLineContaminationMap and related
// state.
2911 typedef pair<string, string> TContaminatingCellLine;
2914 
// Mutex taken (via CFastMutexGuard) by the table-loading code below.
2917 DEFINE_STATIC_FAST_MUTEX(s_CellLineContaminationMutex);
2918 
// Presumably defines the kCellLine string table read below. TODO confirm.
2919 #include "cell_line.inc"
2920 
2921 static void s_ProcessCellLineLine(const CTempString& line)
2922 {
2923  vector<string> tokens;
2924  NStr::Split(line, "\t", tokens);
2925  if (tokens.size() < 4) {
2926  ERR_POST_X(1, Warning << "Not enough columns in cell_line entry " << line
2927  << "; disregarding");
2928  } else {
2929  NStr::ToUpper(tokens[0]);
2930  (s_CellLineContaminationMap[tokens[0]])[tokens[1]] = TContaminatingCellLine(tokens[2], tokens[3]);
2931  }
2932 }
2933 
2934 
// Loads the built-in kCellLine table into the contamination map by passing
// each entry to s_ProcessCellLineLine. It holds s_CellLineContaminationMutex
// for the whole load.
// NOTE(review): the signature (source line 2935), the early-exit condition on
// source line 2938 and source line 2951 are missing from this listing. They
// presumably test and set an "already initialized" flag.
2936 {
2937  CFastMutexGuard GUARD(s_CellLineContaminationMutex);
2939  return;
2940  }
2941 
2942  // read table
2943 
// Number of entries in the kCellLine array.
2944  size_t count = sizeof(kCellLine) / sizeof (*kCellLine);
2945  const char * const * start = kCellLine;
2946  while (count--) {
2947  s_ProcessCellLineLine(*start++);
2948  }
2949 
2950 
2952 }
2954 
// Returns a warning message when the (cell_line, organism) pair is listed in
// the contamination table, or an empty string otherwise. The cell line name
// is upper-cased for the lookup; the organism is matched as given.
// NOTE(review): source line 2959 is missing from this listing; it presumably
// triggers the one-time table load.
// NOTE(review): the lookups use operator[]. If s_CellLineContaminationMap is a
// std::map-like container, every miss inserts an empty entry, and the map is
// modified here without the mutex. find() would avoid both; confirm against
// the map's declaration.
2955 string CSubSource::CheckCellLine(const string& cell_line, const string& organism)
2956 {
2957  string rval;
2958 
2960  string cell_line_search = cell_line;
2961  NStr::ToUpper(cell_line_search);
2962 
2963  if (!NStr::IsBlank(((s_CellLineContaminationMap[cell_line_search])[organism]).first)) {
2964  rval = "The International Cell Line Authentication Committee database indicates that " +
2965  cell_line + " from " + organism + " is known to be contaminated by " +
2966  ((s_CellLineContaminationMap[cell_line_search])[organism]).first +
2967  " from " + ((s_CellLineContaminationMap[cell_line_search])[organism]).second +
2968  ". Please see http://iclac.org/databases/cross-contaminations/ for more information and references.";
2969  }
2970  return rval;
2971 }
2972 
2973 
2974 // =============================================================================
2975 // Country Names
2976 // =============================================================================
2977 
2978 
// Currently accepted country names; the list also holds oceans and seas such
// as "Atlantic Ocean" and "Baltic Sea". This is the backing array for
// s_CountriesSet.
// Entries must stay in case-sensitive alphabetical order, where uppercase
// sorts before lowercase (hence "USA" before "Uganda").
2980 static const char* const s_Countries[] = {
2981  "Afghanistan",
2982  "Albania",
2983  "Algeria",
2984  "American Samoa",
2985  "Andorra",
2986  "Angola",
2987  "Anguilla",
2988  "Antarctica",
2989  "Antigua and Barbuda",
2990  "Arctic Ocean",
2991  "Argentina",
2992  "Armenia",
2993  "Aruba",
2994  "Ashmore and Cartier Islands",
2995  "Atlantic Ocean",
2996  "Australia",
2997  "Austria",
2998  "Azerbaijan",
2999  "Bahamas",
3000  "Bahrain",
3001  "Baker Island",
3002  "Baltic Sea",
3003  "Bangladesh",
3004  "Barbados",
3005  "Bassas da India",
3006  "Belarus",
3007  "Belgium",
3008  "Belize",
3009  "Benin",
3010  "Bermuda",
3011  "Bhutan",
3012  "Bolivia",
3013  "Borneo",
3014  "Bosnia and Herzegovina",
3015  "Botswana",
3016  "Bouvet Island",
3017  "Brazil",
3018  "British Virgin Islands",
3019  "Brunei",
3020  "Bulgaria",
3021  "Burkina Faso",
3022  "Burundi",
3023  "Cambodia",
3024  "Cameroon",
3025  "Canada",
3026  "Cape Verde",
3027  "Cayman Islands",
3028  "Central African Republic",
3029  "Chad",
3030  "Chile",
3031  "China",
3032  "Christmas Island",
3033  "Clipperton Island",
3034  "Cocos Islands",
3035  "Colombia",
3036  "Comoros",
3037  "Cook Islands",
3038  "Coral Sea Islands",
3039  "Costa Rica",
3040  "Cote d'Ivoire",
3041  "Croatia",
3042  "Cuba",
3043  "Curacao",
3044  "Cyprus",
3045  "Czechia",
3046  "Democratic Republic of the Congo",
3047  "Denmark",
3048  "Djibouti",
3049  "Dominica",
3050  "Dominican Republic",
3051  "Ecuador",
3052  "Egypt",
3053  "El Salvador",
3054  "Equatorial Guinea",
3055  "Eritrea",
3056  "Estonia",
3057  "Eswatini",
3058  "Ethiopia",
3059  "Europa Island",
3060  "Falkland Islands (Islas Malvinas)",
3061  "Faroe Islands",
3062  "Fiji",
3063  "Finland",
3064  "France",
3065  "French Guiana",
3066  "French Polynesia",
3067  "French Southern and Antarctic Lands",
3068  "Gabon",
3069  "Gambia",
3070  "Gaza Strip",
3071  "Georgia",
3072  "Germany",
3073  "Ghana",
3074  "Gibraltar",
3075  "Glorioso Islands",
3076  "Greece",
3077  "Greenland",
3078  "Grenada",
3079  "Guadeloupe",
3080  "Guam",
3081  "Guatemala",
3082  "Guernsey",
3083  "Guinea",
3084  "Guinea-Bissau",
3085  "Guyana",
3086  "Haiti",
3087  "Heard Island and McDonald Islands",
3088  "Honduras",
3089  "Hong Kong",
3090  "Howland Island",
3091  "Hungary",
3092  "Iceland",
3093  "India",
3094  "Indian Ocean",
3095  "Indonesia",
3096  "Iran",
3097  "Iraq",
3098  "Ireland",
3099  "Isle of Man",
3100  "Israel",
3101  "Italy",
3102  "Jamaica",
3103  "Jan Mayen",
3104  "Japan",
3105  "Jarvis Island",
3106  "Jersey",
3107  "Johnston Atoll",
3108  "Jordan",
3109  "Juan de Nova Island",
3110  "Kazakhstan",
3111  "Kenya",
3112  "Kerguelen Archipelago",
3113  "Kingman Reef",
3114  "Kiribati",
3115  "Kosovo",
3116  "Kuwait",
3117  "Kyrgyzstan",
3118  "Laos",
3119  "Latvia",
3120  "Lebanon",
3121  "Lesotho",
3122  "Liberia",
3123  "Libya",
3124  "Liechtenstein",
3125  "Line Islands",
3126  "Lithuania",
3127  "Luxembourg",
3128  "Macau",
3129  "Madagascar",
3130  "Malawi",
3131  "Malaysia",
3132  "Maldives",
3133  "Mali",
3134  "Malta",
3135  "Marshall Islands",
3136  "Martinique",
3137  "Mauritania",
3138  "Mauritius",
3139  "Mayotte",
3140  "Mediterranean Sea",
3141  "Mexico",
3142  "Micronesia, Federated States of",
3143  "Midway Islands",
3144  "Moldova",
3145  "Monaco",
3146  "Mongolia",
3147  "Montenegro",
3148  "Montserrat",
3149  "Morocco",
3150  "Mozambique",
3151  "Myanmar",
3152  "Namibia",
3153  "Nauru",
3154  "Navassa Island",
3155  "Nepal",
3156  "Netherlands",
3157  "New Caledonia",
3158  "New Zealand",
3159  "Nicaragua",
3160  "Niger",
3161  "Nigeria",
3162  "Niue",
3163  "Norfolk Island",
3164  "North Korea",
3165  "North Macedonia",
3166  "North Sea",
3167  "Northern Mariana Islands",
3168  "Norway",
3169  "Oman",
3170  "Pacific Ocean",
3171  "Pakistan",
3172  "Palau",
3173  "Palmyra Atoll",
3174  "Panama",
3175  "Papua New Guinea",
3176  "Paracel Islands",
3177  "Paraguay",
3178  "Peru",
3179  "Philippines",
3180  "Pitcairn Islands",
3181  "Poland",
3182  "Portugal",
3183  "Puerto Rico",
3184  "Qatar",
3185  "Republic of the Congo",
3186  "Reunion",
3187  "Romania",
3188  "Ross Sea",
3189  "Russia",
3190  "Rwanda",
3191  "Saint Barthelemy",
3192  "Saint Helena",
3193  "Saint Kitts and Nevis",
3194  "Saint Lucia",
3195  "Saint Martin",
3196  "Saint Pierre and Miquelon",
3197  "Saint Vincent and the Grenadines",
3198  "Samoa",
3199  "San Marino",
3200  "Sao Tome and Principe",
3201  "Saudi Arabia",
3202  "Senegal",
3203  "Serbia",
3204  "Seychelles",
3205  "Sierra Leone",
3206  "Singapore",
3207  "Sint Maarten",
3208  "Slovakia",
3209  "Slovenia",
3210  "Solomon Islands",
3211  "Somalia",
3212  "South Africa",
3213  "South Georgia and the South Sandwich Islands",
3214  "South Korea",
3215  "South Sudan",
3216  "Southern Ocean",
3217  "Spain",
3218  "Spratly Islands",
3219  "Sri Lanka",
3220  "State of Palestine",
3221  "Sudan",
3222  "Suriname",
3223  "Svalbard",
3224  "Sweden",
3225  "Switzerland",
3226  "Syria",
3227  "Taiwan",
3228  "Tajikistan",
3229  "Tanzania",
3230  "Tasman Sea",
3231  "Thailand",
3232  "Timor-Leste",
3233  "Togo",
3234  "Tokelau",
3235  "Tonga",
3236  "Trinidad and Tobago",
3237  "Tromelin Island",
3238  "Tunisia",
3239  "Turkey",
3240  "Turkmenistan",
3241  "Turks and Caicos Islands",
3242  "Tuvalu",
3243  "USA",
3244  "Uganda",
3245  "Ukraine",
3246  "United Arab Emirates",
3247  "United Kingdom",
3248  "Uruguay",
3249  "Uzbekistan",
3250  "Vanuatu",
3251  "Venezuela",
3252  "Viet Nam",
3253  "Virgin Islands",
3254  "Wake Island",
3255  "Wallis and Futuna",
3256  "West Bank",
3257  "Western Sahara",
3258  "Yemen",
3259  "Zambia",
3260  "Zimbabwe"
3261 };
// Lookup set over s_Countries; queried with find() by the CCountries
// validity checks.
3262 static const TCStrSet s_CountriesSet(s_Countries, sizeof(s_Countries), __FILE__, __LINE__);
3263 
// Country names that used to be legal. They are still accepted by
// CCountries::IsValid and are the names CCountries::WasValid looks up.
// Entries must stay in case-sensitive alphabetical order.
3265 static const char* const s_Former_Countries[] = {
3266  "Belgian Congo",
3267  "British Guiana",
3268  "Burma",
3269  "Czech Republic",
3270  "Czechoslovakia",
3271  "East Timor",
3272  "Korea",
3273  "Macedonia",
3274  "Micronesia",
3275  "Netherlands Antilles",
3276  "Serbia and Montenegro",
3277  "Siam",
3278  "Swaziland",
3279  "The former Yugoslav Republic of Macedonia",
3280  "USSR",
3281  "Yugoslavia",
3282  "Zaire"
3283 };
// Lookup set over s_Former_Countries.
3284 static const TCStrSet s_Former_CountriesSet(s_Former_Countries, sizeof(s_Former_Countries), __FILE__, __LINE__);
3285 
// Null-term exemption values accepted in place of a country name.
// Entries must stay in case-sensitive alphabetical order.
// NOTE(review): the IsValid functions cut the input at the first ':' before
// the lookup, so the "missing: ..." entries are never compared in full; such
// inputs match through the plain "missing" entry.
3287 static const char* const s_Null_Countries[] = {
3288  "missing",
3289  "missing: control sample",
3290  "missing: data agreement established pre-2023",
3291  "missing: endangered species",
3292  "missing: human-identifiable",
3293  "missing: lab stock",
3294  "missing: sample group",
3295  "missing: synthetic construct",
3296  "missing: third party data",
3297  "not applicable",
3298  "not collected",
3299  "not provided",
3300  "restricted access"
3301 };
// Lookup set over s_Null_Countries.
3302 static const TCStrSet s_Null_CountriesSet(s_Null_Countries, sizeof(s_Null_Countries), __FILE__, __LINE__);
3303 
3304 bool CCountries::IsValid(const string& country)
3305 {
3306  string name = country;
3307  size_t pos = country.find(':');
3308 
3309  if ( pos != NPOS ) {
3310  if (pos == country.length() - 1) {
3311  return false;
3312  }
3313  name = country.substr(0, pos);
3314  }
3315 
3316  // try current countries
3317  if (s_CountriesSet.find(name.c_str()) != s_CountriesSet.end()) {
3318  return true;
3319  } else if (s_Former_CountriesSet.find(name.c_str()) != s_Former_CountriesSet.end()) {
3320  return true;
3321  } else if (s_Null_CountriesSet.find(name.c_str()) != s_Null_CountriesSet.end()) {
3322  return true;
3323  } else {
3324  return false;
3325  }
3326 }
3327 
3328 
3329 bool CCountries::IsValid(const string& country, bool& is_miscapitalized)
3330 {
3331  string name = country;
3332  size_t pos = country.find(':');
3333 
3334  if ( pos != NPOS ) {
3335  name = country.substr(0, pos);
3336  if (pos == country.length() - 1) {
3337  return false;
3338  }
3339  }
3340 
3341  is_miscapitalized = false;
3342  // try current countries
3343  // fast check for properly capitalized
3344  if ( s_CountriesSet.find(name.c_str()) != s_CountriesSet.end() ) {
3345  return true;
3346  }
3347  if ( s_Former_CountriesSet.find(name.c_str()) != s_Former_CountriesSet.end() ) {
3348  return true;
3349  }
3350  if ( s_Null_CountriesSet.find(name.c_str()) != s_Null_CountriesSet.end() ) {
3351  return true;
3352  }
3353  // slow check for miscapitalized
3354  ITERATE ( TCStrSet, it, s_CountriesSet ) {
3355  if ( NStr::EqualNocase(name, *it) ) {
3356  is_miscapitalized = true;
3357  return true;
3358  }
3359  }
3361  if ( NStr::EqualNocase(name, *it) ) {
3362  is_miscapitalized = true;
3363  return true;
3364  }
3365  }
3367  if ( NStr::EqualNocase(name, *it) ) {
3368  is_miscapitalized = true;
3369  return true;
3370  }
3371  }
3372 
3373  return false;
3374 }
3375 
3376 
3377 bool CCountries::WasValid(const string& country)
3378 {
3379  string name = country;
3380  size_t pos = country.find(':');
3381 
3382  if ( pos != NPOS ) {
3383  name = country.substr(0, pos);
3384  }
3385 
3386  // try formerly-valid countries
3387  return s_Former_CountriesSet.find(name.c_str()) != s_Former_CountriesSet.end();
3388 }
3389 
3390 
3391 bool CCountries::WasValid(const string& country, bool& is_miscapitalized)
3392 {
3393  string name = country;
3394  size_t pos = country.find(':');
3395 
3396  if ( pos != NPOS ) {
3397  name = country.substr(0, pos);
3398  }
3399 
3400  is_miscapitalized = false;
3401  // try formerly-valid countries
3402  // fast check for properly capitalized
3403  if ( s_Former_CountriesSet.find(name.c_str()) != s_Former_CountriesSet.end() ) {
3404  return true;
3405  }
3406  // slow check for miscapitalized
3408  if ( NStr::EqualNocase(name, *it) ) {
3409  is_miscapitalized = true;
3410  return true;
3411  }
3412  }
3413  return false;
3414 }
3415 
3416 /////////////////////////////////////////////////////////////////////////////
3417 ////// Country Capitalization Fix ///////////////////////////////////////////
3418 
3420 {
3421  {"england", "United Kingdom: England"},
3422  {"great britain", "United Kingdom: Great Britain"},
3423  {"new jersey, usa", "USA: New Jersey"}
3424 };
3427 
3429 {"ABW", "Aruba"},
3430 {"AFG", "Afghanistan"},
3431 {"AGO", "Angola"},
3432 {"AIA", "Anguilla"},
3433 {"ALA", "Aland Islands"},
3434 {"ALB", "Albania"},
3435 {"AND", "Andorra"},
3436 {"ARE", "United Arab Emirates"},
3437 {"ARG", "Argentina"},
3438 {"ARM", "Armenia"},
3439 {"ASM", "American Samoa"},
3440 {"ATA", "Antarctica"},
3441 {"ATF", "French Southern Territories"},
3442 {"ATG", "Antigua and Barbuda"},
3443 {"AUS", "Australia"},
3444 {"AUT", "Austria"},
3445 {"AZE", "Azerbaijan"},
3446 {"Antigua & Barbuda", "Antigua and Barbuda"},
3447 {"Ashmore & Cartier Islands", "Ashmore and Cartier Islands"},
3448 {"BDI", "Burundi"},
3449 {"BEL", "Belgium"},
3450 {"BEN", "Benin"},
3451 {"BES", "Bonaire, Sint Eustatius and Saba"},
3452 {"BFA", "Burkina Faso"},
3453 {"BGD", "Bangladesh"},
3454 {"BGR", "Bulgaria"},
3455 {"BHR", "Bahrain"},
3456 {"BHS", "Bahamas"},
3457 {"BIH", "Bosnia and Herzegovina"},
3458 {"BLM", "Saint Barthelemy"},
3459 {"BLR", "Belarus"},
3460 {"BLZ", "Belize"},
3461 {"BMU", "Bermuda"},
3462 {"BOL", "Bolivia"},
3463 {"BRA", "Brazil"},
3464 {"BRB", "Barbados"},
3465 {"BRN", "Brunei"},
3466 {"BTN", "Bhutan"},
3467 {"BVT", "Bouvet Island"},
3468 {"BWA", "Botswana"},
3469 {"Brasil", "Brazil"},
3470 {"CAF", "Central African Republic"},
3471 {"CAN", "Canada"},
3472 {"CCK", "Cocos Islands"},
3473 {"CHE", "Switzerland"},
3474 {"CHL", "Chile"},
3475 {"CHN", "China"},
3476 {"CIV", "Cote d'Ivoire"},
3477 {"CMR", "Cameroon"},
3478 {"COD", "Democratic Republic of the Congo"},
3479 {"COG", "Republic of the Congo"},
3480 {"COK", "Cook Islands"},
3481 {"COL", "Colombia"},
3482 {"COM", "Comoros"},
3483 {"CPV", "Cape Verde"},
3484 {"CRI", "Costa Rica"},
3485 {"CUB", "Cuba"},
3486 {"CUW", "Curacao"},
3487 {"CXR", "Christmas Island"},
3488 {"CYM", "Cayman Islands"},
3489 {"CYP", "Cyprus"},
3490 {"CZE", "Czechia"},
3491 {"Cape Verde Islands", "Cape Verde"},
3492 {"DEU", "Germany"},
3493 {"DJI", "Djibouti"},
3494 {"DMA", "Dominica"},
3495 {"DNK", "Denmark"},
3496 {"DOM", "Dominican Republic"},
3497 {"DZA", "Algeria"},
3498 {"Democratic Republic of Congo", "Democratic Republic of the Congo"},
3499 {"ECU", "Ecuador"},
3500 {"EGY", "Egypt"},
3501 {"ERI", "Eritrea"},
3502 {"ESH", "Western Sahara"},
3503 {"ESP", "Spain"},
3504 {"EST", "Estonia"},
3505 {"ETH", "Ethiopia"},
3506 {"FIN", "Finland"},
3507 {"FJI", "Fiji"},
3508 {"FLK", "Falkland Islands (Islas Malvinas)"},
3509 {"FRA", "France"},
3510 {"FRO", "Faroe Islands"},
3511 {"FSM", "Micronesia, Federated States of"},
3512 {"Falkland Islands", "Falkland Islands (Islas Malvinas)"},
3513 {"French Southern & Antarctic Lands", "French Southern and Antarctic Lands"},
3514 {"GAB", "Gabon"},
3515 {"GBR", "United Kingdom"},
3516 {"GEO", "Georgia"},
3517 {"GGY", "Guernsey"},
3518 {"GHA", "Ghana"},
3519 {"GIB", "Gibraltar"},
3520 {"GIN", "Guinea"},
3521 {"GLP", "Guadeloupe"},
3522 {"GMB", "Gambia"},
3523 {"GNB", "Guinea-Bissau"},
3524 {"GNQ", "Equatorial Guinea"},
3525 {"GRC", "Greece"},
3526 {"GRD", "Grenada"},
3527 {"GRL", "Greenland"},
3528 {"GTM", "Guatemala"},
3529 {"GUF", "French Guiana"},
3530 {"GUM", "Guam"},
3531 {"GUY", "Guyana"},
3532 {"HKG", "Hong Kong"},
3533 {"HMD", "Heard Island and McDonald Islands"},
3534 {"HND", "Honduras"},
3535 {"HRV", "Croatia"},
3536 {"HTI", "Haiti"},
3537 {"HUN", "Hungary"},
3538 {"Heard Island & McDonald Islands", "Heard Island and McDonald Islands"},
3539 {"IDN", "Indonesia"},
3540 {"IMN", "Isle of Man"},
3541 {"IND", "India"},
3542 {"IOT", "British Indian Ocean Territory"},
3543 {"IRL", "Ireland"},
3544 {"IRN", "Iran"},
3545 {"IRQ", "Iraq"},
3546 {"ISL", "Iceland"},
3547 {"ISR", "Israel"},
3548 {"ITA", "Italy"},
3549 {"Ivory Coast", "Cote d'Ivoire"},
3550 {"JAM", "Jamaica"},
3551 {"JEY", "Jersey"},
3552 {"JOR", "Jordan"},
3553 {"JPN", "Japan"},
3554 {"KAZ", "Kazakhstan"},
3555 {"KEN", "Kenya"},
3556 {"KGZ", "Kyrgyzstan"},
3557 {"KHM", "Cambodia"},
3558 {"KIR", "Kiribati"},
3559 {"KNA", "Saint Kitts and Nevis"},
3560 {"KOR", "South Korea"},
3561 {"KWT", "Kuwait"},
3562 {"LAO", "Lao People's Democratic Republic"},
3563 {"LBN", "Lebanon"},
3564 {"LBR", "Liberia"},
3565 {"LBY", "Libyan Arab Jamahiriya"},
3566 {"LCA", "Saint Lucia"},
3567 {"LIE", "Liechtenstein"},
3568 {"LKA", "Sri Lanka"},
3569 {"LSO", "Lesotho"},
3570 {"LTU", "Lithuania"},
3571 {"LUX", "Luxembourg"},
3572 {"LVA", "Latvia"},
3573 {"La Reunion Island", "Reunion"},
3574 {"Luxemburg", "Luxembourg"},
3575 {"MAC", "Macao"},
3576 {"MAF", "Saint Martin (French part)"},
3577 {"MAR", "Morocco"},
3578 {"MCO", "Monaco"},
3579 {"MDA", "Moldova"},
3580 {"MDG", "Madagascar"},
3581 {"MDV", "Maldives"},
3582 {"MEX", "Mexico"},
3583 {"MHL", "Marshall Islands"},
3584 {"MKD", "North Macedonia"},
3585 {"MLI", "Mali"},
3586 {"MLT", "Malta"},
3587 {"MMR", "Myanmar"},
3588 {"MNE", "Montenegro"},
3589 {"MNG", "Mongolia"},
3590 {"MNP", "Northern Mariana Islands"},
3591 {"MOZ", "Mozambique"},
3592 {"MRT", "Mauritania"},
3593 {"MSR", "Montserrat"},
3594 {"MTQ", "Martinique"},
3595 {"MUS", "Mauritius"},
3596 {"MWI", "Malawi"},
3597 {"MYS", "Malaysia"},
3598 {"MYT", "Mayotte"},
3599 {"Macedonia", "North Macedonia"},
3600 {"NAM", "Namibia"},
3601 {"NCL", "New Caledonia"},
3602 {"NER", "Niger"},
3603 {"NFK", "Norfolk Island"},
3604 {"NGA", "Nigeria"},
3605 {"NIC", "Nicaragua"},
3606 {"NIU", "Niue"},
3607 {"NLD", "Netherlands"},
3608 {"NOR", "Norway"},
3609 {"NPL", "Nepal"},
3610 {"NRU", "Nauru"},
3611 {"NZL", "New Zealand"},
3612 {"Netherland", "Netherlands"},
3613 {"New Guinea", "Papua New Guinea"},
3614 {"OMN", "Oman"},
3615 {"P, R, China", "China"},
3616 {"P.R. China", "China"},
3617 {"P.R.China", "China"},
3618 {"PAK", "Pakistan"},
3619 {"PAN", "Panama"},
3620 {"PCN", "Pitcairn"},
3621 {"PER", "Peru"},
3622 {"PHL", "Philippines"},
3623 {"PLW", "Palau"},
3624 {"PNG", "Papua New Guinea"},
3625 {"POL", "Poland"},
3626 {"PRI", "Puerto Rico"},
3627 {"PRK", "North Korea"},
3628 {"PRT", "Portugal"},
3629 {"PRY", "Paraguay"},
3630 {"PSE", "Palestinian Territory"},
3631 {"PYF", "French Polynesia"},
3632 {"People's Republic of China", "China"},
3633 {"Pr China", "China"},
3634 {"Prchina", "China"},
3635 {"QAT", "Qatar"},
3636 {"REU", "Reunion"},
3637 {"ROU", "Romania"},
3638 {"RUS", "Russia"},
3639 {"RWA", "Rwanda"},
3640 {"Republic of Congo", "Republic of the Congo"},
3641 {"SAU", "Saudi Arabia"},
3642 {"SDN", "Sudan"},
3643 {"SEN", "Senegal"},
3644 {"SGP", "Singapore"},
3645 {"SGS", "South Georgia and the South Sandwich Islands"},
3646 {"SHN", "Saint Helena"},
3647 {"SJM", "Svalbard and Jan Mayen"},
3648 {"SLB", "Solomon Islands"},
3649 {"SLE", "Sierra Leone"},
3650 {"SLV", "El Salvador"},
3651 {"SMR", "San Marino"},
3652 {"SOM", "Somalia"},
3653 {"SPM", "Saint Pierre and Miquelon"},
3654 {"SRB", "Serbia"},
3655 {"SSD", "South Sudan"},
3656 {"STP", "Sao Tome and Principe"},
3657 {"SUR", "Suriname"},
3658 {"SVK", "Slovakia"},
3659 {"SVN", "Slovenia"},
3660 {"SWE", "Sweden"},
3661 {"SWZ", "Eswatini"},
3662 {"SXM", "Sint Maarten (Dutch part)"},
3663 {"SYC", "Seychelles"},
3664 {"SYR", "Syrian Arab Republic"},
3665 {"Saint Kitts & Nevis", "Saint Kitts and Nevis"},
3666 {"Saint Pierre & Miquelon", "Saint Pierre and Miquelon"},
3667 {"Saint Vincent & Grenadines", "Saint Vincent and the Grenadines"},
3668 {"Saint Vincent & the Grenadines", "Saint Vincent and the Grenadines"},
3669 {"Saint Vincent and Grenadines", "Saint Vincent and the Grenadines"},
3670 {"San Tome and Principe Island", "Sao Tome and Principe"},
3671 {"Sao Tome & Principe", "Sao Tome and Principe"},
3672 {"South Georgia & South Sandwich Islands", "South Georgia and the South Sandwich Islands"},
3673 {"South Georgia & the South Sandwich Islands", "South Georgia and the South Sandwich Islands"},
3674 {"St Helena", "Saint Helena"},
3675 {"St Lucia", "Saint Lucia"},
3676 {"St Pierre and Miquelon", "Saint Pierre and Miquelon"},
3677 {"St Vincent and the Grenadines", "Saint Vincent and the Grenadines"},
3678 {"St. Helena", "Saint Helena"},
3679 {"St. Lucia", "Saint Lucia"},
3680 {"St. Pierre and Miquelon", "Saint Pierre and Miquelon"},
3681 {"St. Vincent and the Grenadines", "Saint Vincent and the Grenadines"},
3682 {"TCA", "Turks and Caicos Islands"},
3683 {"TCD", "Chad"},
3684 {"TGO", "Togo"},
3685 {"THA", "Thailand"},
3686 {"TJK", "Tajikistan"},
3687 {"TKL", "Tokelau"},
3688 {"TKM", "Turkmenistan"},
3689 {"TLS", "Timor-Leste"},
3690 {"TON", "Tonga"},
3691 {"TTO", "Trinidad and Tobago"},
3692 {"TUN", "Tunisia"},
3693 {"TUR", "Turkey"},
3694 {"TUV", "Tuvalu"},
3695 {"TWN", "Taiwan"},
3696 {"TZA", "Tanzania"},
3697 {"The Netherlands", "Netherlands"},
3698 {"Trinidad & Tobago", "Trinidad and Tobago"},
3699 {"Turks & Caicos", "Turks and Caicos Islands"},
3700 {"Turks & Caicos Islands", "Turks and Caicos Islands"},
3701 {"Turks and Caicos", "Turks and Caicos Islands"},
3702 {"U.S.A.", "USA"},
3703 {"UGA", "Uganda"},
3704 {"UK", "United Kingdom"},
3705 {"UKR", "Ukraine"},
3706 {"UMI", "United States Minor Outlying Islands"},
3707 {"URY", "Uruguay"},
3708 {"UZB", "Uzbekistan"},
3709 {"United States", "USA"},
3710 {"United States of America", "USA"},
3711 {"VAT", "Holy See (Vatican City State)"},
3712 {"VCT", "Saint Vincent and the Grenadines"},
3713 {"VEN", "Venezuela"},
3714 {"VGB", "British Virgin Islands"},
3715 {"VIR", "Virgin Islands"},
3716 {"VNM", "Viet Nam"},
3717 {"VUT", "Vanuatu"},
3718 {"Vietnam", "Viet Nam"},
3719 {"WLF", "Wallis and Futuna"},
3720 {"WSM", "Samoa"},
3721 {"YEM", "Yemen"},
3722 {"ZAF", "South Africa"},
3723 {"ZMB", "Zambia"},
3724 {"ZWE", "Zimbabwe"},
3725 {"the Netherlands", "Netherlands"}
3726 };
3727 
3729 
3730 // for GP-24841
3732 {"Burma", "Myanmar"},
3733 {"Siam", "Thailand"}
3734 };
3736 
3737 // for GB-7408
3739 {"Antigua", "Antigua and Barbuda: Antigua"},
3740 {"Ashmore Island", "Ashmore and Cartier Islands: Ashmore Island"},
3741 {"Autonomous Region of the Azores", "Portugal: Azores"},
3742 {"Azores", "Portugal: Azores"},
3743 {"Barbuda", "Antigua and Barbuda: Barbuda"},
3744 {"Bassas da India", "French Southern and Antarctic Lands: Bassas da India"},
3745 {"Caicos Islands", "Turks and Caicos Islands: Caicos Islands"},
3746 {"Canary Islands", "Spain: Canary Islands"},
3747 {"Cartier Island", "Ashmore and Cartier Islands: Cartier Island"},
3748 {"East Germany", "Germany: East Germany"},
3749 {"El Hierro", "Spain: El Hierro"},
3750 {"Europa Island", "French Southern and Antarctic Lands: Europa Island"},
3751 {"Fuerteventura", "Spain: Fuerteventura"},
3752 {"Glorioso Islands", "French Southern and Antarctic Lands: Glorioso Islands"},
3753 {"Gran Canaria", "Spain: Gran Canaria"},
3754 {"Grenadines", "Saint Vincent and the Grenadines: Grenadines"},
3755 {"Heard Island", "Heard Island and McDonald Islands: Heard Island"},
3756 {"Ile Amsterdam", "French Southern and Antarctic Lands: Ile Amsterdam"},
3757 {"Ile Saint-Paul", "French Southern and Antarctic Lands: Ile Saint-Paul"},
3758 {"Iles Crozet", "French Southern and Antarctic Lands: Iles Crozet"},
3759 {"Iles Kerguelen", "French Southern and Antarctic Lands: Iles Kerguelen"},
3760 {"Juan de Nova Island", "French Southern and Antarctic Lands: Juan de Nova Island"},
3761 {"La Gomera", "Spain: La Gomera"},
3762 {"La Graciosa", "Spain: La Graciosa"},
3763 {"La Palma", "Spain: La Palma"},
3764 {"Lanzarote", "Spain: Lanzarote"},
3765 {"Madeira", "Portugal: Madeira"},
3766 {"McDonald Island", "Heard Island and McDonald Islands: McDonald Island"},
3767 {"McDonald Islands", "Heard Island and McDonald Islands: McDonald Islands"},
3768 {"Miquelon", "Saint Pierre and Miquelon: Miquelon"},
3769 {"Nevis", "Saint Kitts and Nevis: Nevis"},
3770 {"Principe", "Sao Tome and Principe: Principe"},
3771 {"Saint Kitts", "Saint Kitts and Nevis: Saint Kitts"},
3772 {"Saint Pierre", "Saint Pierre and Miquelon: Saint Pierre"},
3773 {"Saint Vincent", "Saint Vincent and the Grenadines: Saint Vincent"},
3774 {"Sao Tome", "Sao Tome and Principe: Sao Tome"},
3775 {"Scotland", "United Kingdom: Scotland"},
3776 {"South Sandwich Islands", "South Georgia and the South Sandwich Islands: South Sandwich Islands"},
3777 {"St Kitts", "Saint Kitts and Nevis: Saint Kitts"},
3778 {"St Pierre", "Saint Pierre and Miquelon: Saint Pierre"},
3779 {"St Thomas", "USA: Saint Thomas"},
3780 {"St Vincent", "Saint Vincent and the Grenadines: Saint Vincent"},
3781 {"St. Kitts", "Saint Kitts and Nevis: Saint Kitts"},
3782 {"St. Pierre", "Saint Pierre and Miquelon: Saint Pierre"},
3783 {"St. Thomas", "USA: Saint Thomas"},
3784 {"St. Vincent", "Saint Vincent and the Grenadines: Saint Vincent"},
3785 {"Tenerife", "Spain: Tenerife"},
3786 {"Tobago", "Trinidad and Tobago: Tobago"},
3787 {"Trinidad", "Trinidad and Tobago: Trinidad"},
3788 {"Tromelin Island", "French Southern and Antarctic Lands: Tromelin Island"},
3789 {"Turks Islands", "Turks and Caicos Islands: Turks Islands"},
3790 {"Wales", "United Kingdom: Wales"},
3791 {"West Germany", "Germany: West Germany"},
3792 
3793 };
3795 
3796 
// Names of the 50 US states plus the District of Columbia, in alphabetical
// order. CCountries::WholeCountryFix() compares a bare value against these
// case-insensitively and, on a match, produces "USA: <state>".
// Both the strings and the pointers are const, matching the other name tables.
static const char* const s_USAStates[] = {
    "Alabama",
    "Alaska",
    "Arizona",
    "Arkansas",
    "California",
    "Colorado",
    "Connecticut",
    "Delaware",
    "District of Columbia",
    "Florida",
    "Georgia",
    "Hawaii",
    "Idaho",
    "Illinois",
    "Indiana",
    "Iowa",
    "Kansas",
    "Kentucky",
    "Louisiana",
    "Maine",
    "Maryland",
    "Massachusetts",
    "Michigan",
    "Minnesota",
    "Mississippi",
    "Missouri",
    "Montana",
    "Nebraska",
    "Nevada",
    "New Hampshire",
    "New Jersey",
    "New Mexico",
    "New York",
    "North Carolina",
    "North Dakota",
    "Ohio",
    "Oklahoma",
    "Oregon",
    "Pennsylvania",
    "Rhode Island",
    "South Carolina",
    "South Dakota",
    "Tennessee",
    "Texas",
    "Utah",
    "Vermont",
    "Virginia",
    "Washington",
    "West Virginia",
    "Wisconsin",
    "Wyoming"
};
3850 
3852 {
3853  vector<string> words;
3854  NStr::Split(phrase, " \t\r\n", words);
3855  for(vector<string>::iterator word = words.begin(); word != words.end(); ++word)
3856  if (!word->empty() && isalpha(word->at(0)))
3857  word->at(0) = (unsigned char)toupper(word->at(0));
3858  return NStr::Join(words," ");
3859 }
3860 
3861 string CCountries::WholeCountryFix(string country)
3862 {
3863  string new_country;
3864  TCStringPairsMap::const_iterator found = k_whole_country_fixes.find(NStr::ToLower(country).c_str());
3865  if (found != k_whole_country_fixes.end()) {
3866  new_country = found->second;
3867  return new_country;
3868  }
3869 
3870  const size_t num_states = sizeof(s_USAStates) / sizeof(s_USAStates[0]);
3871  for (size_t i = 0; i < num_states; ++i) {
3872  if (NStr::EqualNocase(s_USAStates[i], country)) {
3873  new_country = "USA: " + CTempString(s_USAStates[i]);
3874  break;
3875  }
3876  }
3877 
3878  return new_country;
3879 }
3880 
3881 bool CCountries::IsSubstringOfStringInList(const string& phrase, const string& country1, size_t pos1)
3882 {
3883  bool r = false;
3885  {
3886  string country2(*c);
3887  if (country2.length() > country1.length() && NStr::FindNoCase(country2,country1) != NPOS)
3888  {
3889  SIZE_TYPE pos2 = NStr::FindNoCase(phrase,country2);
3890  while (pos2 != NPOS)
3891  {
3892  if (pos2 <= pos1 && pos2+country2.length() >= pos1+country1.length())
3893  r = true;
3894  pos2 = NStr::FindNoCase(phrase,country2,pos2+country2.length());
3895  }
3896  }
3897  }
3898  return r;
3899 }
3900 
3901 bool CCountries::ContainsMultipleCountryNames (const string &phrase)
3902 {
3903  int num_matches = 0;
3905  {
3906  string country(*c);
3907  size_t pos = NStr::FindNoCase(phrase,country);
3908  while (pos != NPOS)
3909  {
3910  if (!((pos+country.length()<phrase.length() && isalpha(phrase[pos+country.length()]))
3911  || (pos > 0 && isalpha(phrase[pos-1]))
3912  || IsSubstringOfStringInList(phrase,country,pos)))
3913  num_matches++;
3914  pos = NStr::FindNoCase(phrase,country,pos+country.length());
3915  }
3916 
3917  }
3918  return (num_matches > 1);
3919 }
3920 
3922 {
3923  string output = country;
3924  ITERATE ( TCStrSet, it, s_CountriesSet ) {
3925  if ( NStr::EqualNocase(country, *it) ) {
3926  output = *it;
3927  }
3928  }
3929  return output;
3930 }
3931 
3932 
// Strip delimiter characters from both ends of val, in place.
//
// Each pass of the loop applies at most one rule and the loop repeats until a
// pass changes nothing or val becomes empty:
//   - leading  ',' ':' '.'  (and ')' unless except_paren) is dropped;
//   - trailing ',' ':'      (and '(' unless except_paren) is dropped;
//   - trailing "the" preceded by a non-letter is dropped together with that
//     preceding character;
//   - trailing '.' is dropped when preceded by whitespace, or when val is
//     longer than 5 characters and none of the 4 characters before the '.'
//     is whitespace or punctuation.
//
// NOTE(review): the embedded listing numbers skip 3935, 3945, 3951 and 3960
// inside this function, so statements may be missing at those points - confirm
// against the upstream file before relying on this text.
3933 void CCountries::x_RemoveDelimitersFromEnds(string& val, bool except_paren)
3934 {
3936  bool any_found = true;
3937  while (!val.empty() && any_found) {
3938  any_found = false;
// rule 1: delimiter at the front
3939  if (NStr::StartsWith(val, ",")
3940  || NStr::StartsWith(val, ":")
3941  || NStr::StartsWith(val, ".")
3942  || (!except_paren && NStr::StartsWith(val, ")"))) {
3943  val = val.substr(1);
3944  any_found = true;
// rule 2: delimiter at the back
3946  } else if (NStr::EndsWith(val, ",")
3947  || NStr::EndsWith(val, ":")
3948  || (!except_paren && NStr::EndsWith(val, "("))) {
3949  val = val.substr(0, val.length() - 1);
3950  any_found = true;
// rule 3: dangling "the" as a separate word; the length check keeps the
// val[length - 4] access in range
3952  } else if (NStr::EndsWith(val, "the") && val.length() > 3 && !isalpha((unsigned char)val[val.length() - 4])) {
3953  val = val.substr(0, val.length() - 4);
3954  any_found = true;
// rule 4: trailing period
3955  } else if (NStr::EndsWith(val, ".")) {
3956  size_t len = val.length();
3957  if (len > 1 && isspace((unsigned char)val[len - 2])) {
3958  val = val.substr(0, val.length() - 1);
3959  any_found = true;
// len > 5 guarantees the four characters inspected below all exist
3961  } else if (len > 5) {
3962  // make sure no spaces or punctuation within 4 characters before '.'
3963  bool do_remove = true;
3964  size_t pos = val.length() - 2;
3965  size_t dist = 0;
3966  while (dist < 4 && do_remove) {
3967  if (isspace((unsigned char)val[pos]) || ispunct((unsigned char)val[pos])) {
3968  do_remove = false;
3969  }
3970  pos--;
3971  dist++;
3972  }
3973  if (do_remove) {
3974  val = val.substr(0, val.length() - 1);
3975  any_found = true;
3976  }
3977  }
3978  }
3979  }
3980 }
3981 
3982 
3983 vector<string> CCountries::x_Tokenize(const string& val)
3984 {
3985  vector<string> tokens;
3986  NStr::Split(val, ",:()", tokens);
3987  // special tokenizing - if tokens contain periods but resulting token is at least four characters long
3988  vector<string>::iterator it = tokens.begin();
3989  while (it != tokens.end()) {
3990  size_t pos = NStr::Find(*it, ".");
3991  if (pos != NPOS && pos > 3 && (*it).length() - pos > 4) {
3992  string first = (*it).substr(0, pos);
3993  string remainder = (*it).substr(pos + 1);
3994  size_t space_pos = NStr::Find(first, " ");
3995  size_t len_to_space = first.length();
3996  while (space_pos != NPOS) {
3997  first = first.substr(space_pos + 1);
3998  len_to_space = first.length();
3999  space_pos = NStr::Find(first, " ");
4000  }
4001  if (len_to_space > 4) {
4002  (*it) = (*it).substr(0, pos);
4003  it = tokens.insert(it, remainder);
4004  } else {
4005  it++;
4006  }
4007  } else {
4008  it++;
4009  }
4010  }
4011  return tokens;
4012 }
4013 
4014 
4015 bool s_ContainsWholeWord(const CTempString test, const CTempString word, NStr::ECase case_sense)
4016 {
4017  size_t start = 0;
4018  size_t tlen = test.length();
4019  size_t wlen = word.length();
4020 
4021  size_t pos = NStr::Find(test, word, case_sense);
4022  while (pos != NPOS) {
4023  size_t p = start + pos;
4024  if ( (p == 0 || !isalpha((unsigned char)test[p - 1])) &&
4025  (p + wlen >= tlen || !isalpha((unsigned char)test[p + wlen])) ) {
4026  return true;
4027  }
4028  start = p + 1;
4029  pos = NStr::Find(CTempString(test, start, tlen - start), word, case_sense);
4030  }
4031  return false;
4032 }
4033 
4034 
4035 bool s_SuppressCountryFix(const string& test)
4036 {
4037  if (s_ContainsWholeWord(test, "Sea", NStr::eNocase)) {
4038  return true;
4039  } else if (s_ContainsWholeWord(test, "USSR", NStr::eNocase)) {
4040  return true;
4041  }
4042  return false;
4043 }
4044 
4045 
4047 (const TCStringPairsMap& fix_map,
4048  const vector<string>& countries,
4049  string& valid_country,
4050  string& orig_valid_country,
4051  bool& too_many_countries,
4052  bool& bad_cap)
4053 {
4054  for (auto country : countries) {
4055  if (!country.empty() && !too_many_countries)
4056  {
4057  string check = country;
4060 
4061  bool check_has_bad_cap = false;
4062  if (IsValid(check,check_has_bad_cap))
4063  {
4064  if (valid_country.empty())
4065  {
4066  valid_country = check;
4067  orig_valid_country = check;
4068  bad_cap = check_has_bad_cap;
4069  }
4070  else
4071  {
4072  too_many_countries = true;
4073  }
4074  }
4075  else // see if this is a fixable country
4076  {
4077  TCStringPairsMap::const_iterator found = fix_map.find(check.c_str());
4078  if (found != fix_map.end())
4079  {
4080  if (valid_country.empty())
4081  {
4082  valid_country = found->second;
4083  orig_valid_country = check;
4084  }
4085  else
4086  {
4087  too_many_countries = true;
4088  }
4089  }
4090  }
4091  }
4092  }
4093 }
4094 
4095 // start of RW-1278
4096 
// Collapse every run of consecutive space characters in val to one space.
//
// Only ' ' is affected; tabs and other whitespace are left alone, and leading
// or trailing runs are reduced to a single space rather than removed. The
// whole string is processed in place, with no scratch buffer.
//
// @param val  string to compress in place
// @return true if val was modified, false if it was empty or had no
//         repeated spaces
bool s_CompressRunsOfSpaces(std::string& val)
{
    std::size_t out = 0;
    bool prev_was_space = false;

    for (std::size_t in = 0; in < val.length(); ++in) {
        const char ch = val[in];
        const bool is_space = (ch == ' ');
        if (is_space && prev_was_space) {
            continue;   // second or later space of a run: drop it
        }
        val[out++] = ch;
        prev_was_space = is_space;
    }

    // Compaction only ever removes characters, so an unchanged length means
    // the content is unchanged too.
    if (out == val.length()) {
        return false;
    }
    val.resize(out);
    return true;
}
4143 
4146  { "Acadia Parish", "Acadia Parish" },
4147  { "AcadiaParish", "Acadia Parish" },
4148  { "Allen Parish", "Allen Parish" },
4149  { "AllenParish", "Allen Parish" },
4150  { "Ascension Parish", "Ascension Parish" },
4151  { "AscensionParish", "Ascension Parish" },
4152  { "Assumption Parish", "Assumption Parish" },
4153  { "AssumptionParish", "Assumption Parish" },
4154  { "Avoyelles Parish", "Avoyelles Parish" },
4155  { "AvoyellesParish", "Avoyelles Parish" },
4156  { "Beauregard Parish", "Beauregard Parish" },
4157  { "BeauregardParish", "Beauregard Parish" },
4158  { "Bienville Parish", "Bienville Parish" },
4159  { "BienvilleParish", "Bienville Parish" },
4160  { "Bossier Parish", "Bossier Parish" },
4161  { "BossierParish", "Bossier Parish" },
4162  { "Caddo Parish", "Caddo Parish" },
4163  { "CaddoParish", "Caddo Parish" },
4164  { "Calcasieu Parish", "Calcasieu Parish" },
4165  { "CalcasieuParish", "Calcasieu Parish" },
4166  { "Caldwell Parish", "Caldwell Parish" },
4167  { "CaldwellParish", "Caldwell Parish" },
4168  { "Cameron Parish", "Cameron Parish" },
4169  { "CameronParish", "Cameron Parish" },
4170  { "Catahoula Parish", "Catahoula Parish" },
4171  { "CatahoulaParish", "Catahoula Parish" },
4172  { "Claiborne Parish", "Claiborne Parish" },
4173  { "ClaiborneParish", "Claiborne Parish" },
4174  { "Concordia Parish", "Concordia Parish" },
4175  { "ConcordiaParish", "Concordia Parish" },
4176  { "DeSoto Parish", "DeSoto Parish" },
4177  { "DeSotoParish", "DeSoto Parish" },
4178  { "East Baton Rouge Parish", "East Baton Rouge Parish" },
4179  { "East Carroll Parish", "East Carroll Parish" },
4180  { "East Feliciana Parish", "East Feliciana Parish" },
4181  { "EastBatonRougeParish", "East Baton Rouge Parish" },
4182  { "EastCarrollParish", "East Carroll Parish" },
4183  { "EastFelicianaParish", "East Feliciana Parish" },
4184  { "Evangeline Parish", "Evangeline Parish" },
4185  { "EvangelineParish", "Evangeline Parish" },
4186  { "Franklin Parish", "Franklin Parish" },
4187  { "FranklinParish", "Franklin Parish" },
4188  { "Grant Parish", "Grant Parish" },
4189  { "GrantParish", "Grant Parish" },
4190  { "Iberia Parish", "Iberia Parish" },
4191  { "IberiaParish", "Iberia Parish" },
4192  { "Iberville Parish", "Iberville Parish" },
4193  { "IbervilleParish", "Iberville Parish" },
4194  { "Jackson Parish", "Jackson Parish" },
4195  { "JacksonParish", "Jackson Parish" },
4196  { "Jefferson Davis Parish", "Jefferson Davis Parish" },
4197  { "Jefferson Parish", "Jefferson Parish" },
4198  { "JeffersonDavisParish", "Jefferson Davis Parish" },
4199  { "JeffersonParish", "Jefferson Parish" },
4200  { "Lafayette Parish", "Lafayette Parish" },
4201  { "LafayetteParish", "Lafayette Parish" },
4202  { "Lafourche Parish", "Lafourche Parish" },
4203  { "LafourcheParish", "Lafourche Parish" },
4204  { "LaSalle Parish", "LaSalle Parish" },
4205  { "LaSalleParish", "LaSalle Parish" },
4206  { "Lincoln Parish", "Lincoln Parish" },
4207  { "LincolnParish", "Lincoln Parish" },
4208  { "Livingston Parish", "Livingston Parish" },
4209  { "LivingstonParish", "Livingston Parish" },
4210  { "Madison Parish", "Madison Parish" },
4211  { "MadisonParish", "Madison Parish" },
4212  { "Morehouse Parish", "Morehouse Parish" },
4213  { "MorehouseParish", "Morehouse Parish" },
4214  { "Natchitoches Parish", "Natchitoches Parish" },
4215  { "NatchitochesParish", "Natchitoches Parish" },
4216  { "Orleans Parish", "Orleans Parish" },
4217  { "OrleansParish", "Orleans Parish" },
4218  { "Ouachita Parish", "Ouachita Parish" },
4219  { "OuachitaParish", "Ouachita Parish" },
4220  { "Plaquemines Parish", "Plaquemines Parish" },
4221  { "PlaqueminesParish", "Plaquemines Parish" },
4222  { "Pointe Coupee Parish", "Pointe Coupee Parish" },
4223  { "PointeCoupeeParish", "Pointe Coupee Parish" },
4224  { "Rapides Parish", "Rapides Parish" },
4225  { "RapidesParish", "Rapides Parish" },
4226  { "Red River Parish", "Red River Parish" },
4227  { "RedRiverParish", "Red River Parish" },
4228  { "Richland Parish", "Richland Parish" },
4229  { "RichlandParish", "Richland Parish" },
4230  { "Sabine Parish", "Sabine Parish" },
4231  { "SabineParish", "Sabine Parish" },
4232  { "St. Bernard Parish", "St. Bernard Parish" },
4233  { "St. Charles Parish", "St. Charles Parish" },
4234  { "St. Helena Parish", "St. Helena Parish" },
4235  { "St. James Parish", "St. James Parish" },
4236  { "St. John the Baptist Parish", "St. John the Baptist Parish" },
4237  { "St. Landry Parish", "St. Landry Parish" },
4238  { "St. Martin Parish", "St. Martin Parish" },
4239  { "St. Mary Parish", "St. Mary Parish" },
4240  { "St. Tammany Parish", "St. Tammany Parish" },
4241  { "St.BernardParish", "St. Bernard Parish" },
4242  { "St.CharlesParish", "St. Charles Parish" },
4243  { "St.HelenaParish", "St. Helena Parish" },
4244  { "St.JamesParish", "St. James Parish" },
4245  { "St.JohntheBaptistParish", "St. John the Baptist Parish" },
4246  { "St.LandryParish", "St. Landry Parish" },
4247  { "St.MartinParish", "St. Martin Parish" },
4248  { "St.MaryParish", "St. Mary Parish" },
4249  { "St.TammanyParish", "St. Tammany Parish" },
4250  { "Tangipahoa Parish", "Tangipahoa Parish" },
4251  { "TangipahoaParish", "Tangipahoa Parish" },
4252  { "Tensas Parish", "Tensas Parish" },
4253  { "TensasParish", "Tensas Parish" },
4254  { "Terrebonne Parish", "Terrebonne Parish" },
4255  { "TerrebonneParish", "Terrebonne Parish" },
4256  { "Union Parish", "Union Parish" },
4257  { "UnionParish", "Union Parish" },
4258  { "Vermilion Parish", "Vermilion Parish" },
4259  { "VermilionParish", "Vermilion Parish" },
4260  { "Vernon Parish", "Vernon Parish" },
4261  { "VernonParish", "Vernon Parish" },
4262  { "Washington Parish", "Washington Parish" },
4263  { "WashingtonParish", "Washington Parish" },
4264  { "Webster Parish", "Webster Parish" },
4265  { "WebsterParish", "Webster Parish" },
4266  { "West Baton Rouge Parish", "West Baton Rouge Parish" },
4267  { "West Carroll Parish", "West Carroll Parish" },
4268  { "West Feliciana Parish", "West Feliciana Parish" },
4269  { "WestBatonRougeParish", "West Baton Rouge Parish" },
4270  { "WestCarrollParish", "West Carroll Parish" },
4271  { "WestFelicianaParish", "West Feliciana Parish" },
4272  { "Winn Parish", "Winn Parish" },
4273  { "WinnParish", "Winn Parish" }
4274 };
4275 
4278 
4279 bool s_IsParish ( string& parish ) {
4280 
4281  if ( parish.empty() ) {
4282  return false;
4283  }
4284 
4285  TParishMap::const_iterator parish_find_iter = parishAbbrevMap.find(parish.c_str());
4286  if ( parish_find_iter != parishAbbrevMap.end() ) {
4287  // replace with full parish name
4288  parish = parish_find_iter->second;
4289  return true;
4290  }
4291 
4292  return false;
4293 }
4294 
4297  { "AK", "Alaska" },
4298  { "AL", "Alabama" },
4299  { "Alabama", "Alabama" },
4300  { "Alaska", "Alaska" },
4301  { "American Samoa", "American Samoa" },
4302  { "AR", "Arkansas" },
4303  { "Arizona", "Arizona" },
4304  { "Arkansas", "Arkansas" },
4305  { "AS", "American Samoa" },
4306  { "AZ", "Arizona" },
4307  { "CA", "California" },
4308  { "California", "California" },
4309  { "CO", "Colorado" },
4310  { "Colorado", "Colorado" },
4311  { "Connecticut", "Connecticut" },
4312  { "CT", "Connecticut" },
4313  { "DC", "District of Columbia" },
4314  { "DE", "Delaware" },
4315  { "Delaware", "Delaware" },
4316  { "District of Columbia", "District of Columbia" },
4317  { "FL", "Florida" },
4318  { "Florida", "Florida" },
4319  { "GA", "Georgia" },
4320  { "Georgia", "Georgia" },
4321  { "GU", "Guam" },
4322  { "Guam", "Guam" },
4323  { "Hawaii", "Hawaii" },
4324  { "HI", "Hawaii" },
4325  { "IA", "Iowa" },
4326  { "ID", "Idaho" },
4327  { "Idaho", "Idaho" },
4328  { "IL", "Illinois" },
4329  { "Illinois", "Illinois" },
4330  { "IN", "Indiana" },
4331  { "Indiana", "Indiana" },
4332  { "Iowa", "Iowa" },
4333  { "Kansas", "Kansas" },
4334  { "Kentucky", "Kentucky" },
4335  { "KS", "Kansas" },
4336  { "KY", "Kentucky" },
4337  { "LA", "Louisiana" },
4338  { "Louisiana", "Louisiana" },
4339  { "MA", "Massachusetts" },
4340  { "Maine", "Maine" },
4341  { "Maryland", "Maryland" },
4342  { "Massachusetts", "Massachusetts" },
4343  { "MD", "Maryland" },
4344  { "ME", "Maine" },
4345  { "MI", "Michigan" },
4346  { "Michigan", "Michigan" },
4347  { "Minnesota", "Minnesota" },
4348  { "Mississippi", "Mississippi" },
4349  { "Missouri", "Missouri" },
4350  { "MN", "Minnesota" },
4351  { "MO", "Missouri" },
4352  { "Montana", "Montana" },
4353  { "MS", "Mississippi" },
4354  { "MT", "Montana" },
4355  { "NC", "North Carolina" },
4356  { "ND", "North Dakota" },
4357  { "NE", "Nebraska" },
4358  { "Nebraska", "Nebraska" },
4359  { "Nevada", "Nevada" },
4360  { "New Hampshire", "New Hampshire" },
4361  { "New Jersey", "New Jersey" },
4362  { "New Mexico", "New Mexico" },
4363  { "New York", "New York" },
4364  { "NH", "New Hampshire" },
4365  { "NJ", "New Jersey" },
4366  { "NM", "New Mexico" },
4367  { "North Carolina", "North Carolina" },
4368  { "North Dakota", "North Dakota" },
4369  { "NV", "Nevada" },
4370  { "NY", "New York" },
4371  { "OH", "Ohio" },
4372  { "Ohio", "Ohio" },
4373  { "OK", "Oklahoma" },
4374  { "Oklahoma", "Oklahoma" },
4375  { "OR", "Oregon" },
4376  { "Oregon", "Oregon" },
4377  { "PA", "Pennsylvania" },
4378  { "Pennsylvania", "Pennsylvania" },
4379  { "PR", "Puerto Rico" },
4380  { "Puerto Rico", "Puerto Rico" },
4381  { "Rhode Island", "Rhode Island" },
4382  { "RI", "Rhode Island" },
4383  { "SC", "South Carolina" },
4384  { "SD", "South Dakota" },
4385  { "South Carolina", "South Carolina" },
4386  { "South Dakota", "South Dakota" },
4387  { "Tennessee", "Tennessee" },
4388  { "Texas", "Texas" },
4389  { "TN", "Tennessee" },
4390  { "TX", "Texas" },
4391  { "US Virgin Islands", "US Virgin Islands" },
4392  { "UT", "Utah" },
4393  { "Utah", "Utah" },
4394  { "VA", "Virginia" },
4395  { "Vermont", "Vermont" },
4396  { "VI", "US Virgin Islands" },
4397  { "Virgin Islands", "US Virgin Islands" },
4398  { "Virginia", "Virginia" },
4399  { "VT", "Vermont" },
4400  { "WA", "Washington" },
4401  { "Washington", "Washington" },
4402  { "West Virginia", "West Virginia" },
4403  { "WI", "Wisconsin" },
4404  { "Wisconsin", "Wisconsin" },
4405  { "WV", "West Virginia" },
4406  { "WY", "Wyoming" },
4407  { "Wyoming", "Wyoming" }
4408 };
4409 
4412 
4413 bool s_IsState ( string& state, bool& modified ) {
4414 
4415  if ( state.empty() ) {
4416  return false;
4417  }
4418 
4419  string original = state;
4420  string working = state;
4421 
4422  if ( NStr::StartsWith ( working, "State of ", NStr::eNocase )) {
4423  NStr::TrimPrefixInPlace ( working, "State of ", NStr::eNocase );
4424  }
4425 
4426  if ( NStr::StartsWith ( working, "Commonwealth of ", NStr::eNocase )) {
4427  NStr::TrimPrefixInPlace ( working, "Commonwealth of ", NStr::eNocase );
4428  }
4429 
4430  if ( NStr::EndsWith ( working, " State", NStr::eNocase )) {
4431  NStr::TrimSuffixInPlace ( working, " State", NStr::eNocase );
4432  }
4433 
4434  NStr::TruncateSpacesInPlace ( working );
4435 
4436  TStateMap::const_iterator state_find_iter = stateAbbrevMap.find(working.c_str());
4437  if ( state_find_iter != stateAbbrevMap.end() ) {
4438  // replace with full state name
4439  state = state_find_iter->second;
4440  // report conversion from two-letter, changed capitalization, or prefix/suffix removal
4441  if ( ! NStr::Equal ( original, state )) {
4442  modified = true;
4443  }
4444  return true;
4445  }
4446 
4447  return false;
4448 }
4449 
4451 
4452  if ( country.empty() ) {
4453  return CCountries::e_NoResult;
4454  }
4455 
4456  // make working copy
4457  string original = country;
4458  string working = country;
4459 
4460  // remove flanking quotation marks - if CCountries::NewFixCountry not called
4461  if ( NStr::StartsWith ( working, "\"" ) && NStr::EndsWith ( working, "\"" )) {
4462  working = working.substr ( 1, working.length() - 2 );
4463  }
4464 
4465  // remove flanking spaces
4466  NStr::TruncateSpacesInPlace ( working );
4467 
4468  // separate strings before and after colon
4469  string frst, scnd;
4470  NStr::SplitInTwo ( working, ":", frst, scnd );
4471 
4472  NStr::TruncateSpacesInPlace ( frst );
4473  NStr::TruncateSpacesInPlace ( scnd );
4474 
4475  // confirm that country is USA
4476  if ( ! NStr::EqualNocase ( frst, "USA") && ! NStr::EqualNocase ( frst, "US")) {
4477  // if not, first try rescuing US territory
4478  working = CCountries::NewFixCountry(working, true);
4479  NStr::SplitInTwo ( working, ":", frst, scnd );
4480  NStr::TruncateSpacesInPlace ( frst );
4481  NStr::TruncateSpacesInPlace ( scnd );
4482  if ( ! NStr::EqualNocase ( frst, "USA") && ! NStr::EqualNocase ( frst, "US")) {
4483  return CCountries::e_NotUSA;
4484  }
4485  }
4486 
4487  // split state/county/city clauses at commas
4488  vector<string> components;
4489  NStr::Split(scnd, ",", components);
4490 
4491  // check for only country
4492  if ( components.size() < 1 ) {
4493  country = "USA";
4494  return CCountries::e_Valid;
4495  }
4496 
4497  for ( int j = 0; j < components.size(); j++ ) {
4498  // remove flanking spaces around components
4499  NStr::TruncateSpacesInPlace ( components[j] );
4500  s_CompressRunsOfSpaces ( components[j] );
4501  // clean up runon strings like EastBatonRougeParish
4502  if ( NStr::EndsWith ( components[j], "Parish", NStr::eNocase )) {
4503  s_IsParish( components[j] );
4504  }
4505  }
4506 
4507  // bool any_modified = false;
4508  int num_states = 0;
4509  int match = -1;
4510 
4511  // string* first = 0;
4512  // string* last = 0;
4513 
4514  // has multiple components
4515  // int max = components.size() - 1;
4516  for ( int j = 0; j < components.size(); j++ ) {
4517  bool modified = false;
4518  if ( s_IsState ( components[j], modified )) {
4519  /*
4520  if (modified) {
4521  any_modified = true;
4522  }
4523  */
4524  if ( match < 0 ) {
4525  // record position of first s_IsState match
4526  match = j;
4527  }
4528  // count successful matches
4529  num_states++;
4530  /*
4531  if ( j == 0 ) {
4532  first = &(components[j]);
4533  }
4534  if ( j == max ) {
4535  last = &(components[j]);
4536  }
4537  */
4538  }
4539  }
4540 
4541  // generate result
4542  string res;
4543  res.append ("USA: ");
4544  string pfx = "";
4545 
4546  if ( match >= 0 ) {
4547  // move first state matched to first position
4548  res.append ( components[match] );
4549  pfx = ", ";
4550  }
4551 
4552  for ( int j = 0; j < components.size(); j++ ) {
4553  if ( j == match) continue;
4554  res.append ( pfx );
4555  res.append ( components[j] );
4556  pfx = ", ";
4557  }
4558 
4559  country = res;
4560 
4561  if ( match < 0 ) {
4562  return CCountries::e_Missing;
4563  } else if ( num_states > 1 ) {
4564  return CCountries::e_Ambiguous;
4565  } else if ( ! NStr::Equal ( original, res )) {
4566  return CCountries::e_Corrected;
4567  }
4568 
4569  return CCountries::e_Valid;
4570 }
4571 
4573 
4575 static bool exceptions_initialized = false;
4576 
4577 void CCountries::ReadUSAExceptionMap (CCountries::TUsaExceptionMap& exceptions, const string& exception_file ) {
4578 
4579  if ( ! exception_file.empty()) {
4580 
4581  TNCBITSVStream my_stream (exception_file);
4582  for ( const auto & row : my_stream ) {
4583  TFieldNo number_of_fields = row. GetNumberOfFields();
4584  if ( number_of_fields != 2 ) continue;
4585  string fr = row[0].Get<string>();
4586  string to = row[1].Get<string>();
4587  exceptions [fr] = to;
4588  }
4589  }
4590 }
4591 
4593 
4594  // clear previous map
4595  exception_map.clear();
4596 
4597  // initialize internal exception map
4598  for ( const auto & itm : exceptions ) {
4599  string fr = itm.first;
4600  string to = itm.second;
4601 
4602  // ensure colon is followed by space to match initial correction
4603  string f1, f2;
4604  NStr::SplitInTwo ( fr, ":", f1, f2 );
4607  if ( ! f1.empty() && ! f2.empty()) {
4608  fr = f1 + ": " + f2;
4609  }
4610 
4611  exception_map [fr] = to;
4612  }
4613 
4614  exceptions_initialized = true;
4615 }
4616 
4617 void CCountries::LoadUSAExceptionMap (const string& exception_file ) {
4618 
4619  if ( ! exception_file.empty()) {
4620 
4621  TUsaExceptionMap exceptions;
4622  ReadUSAExceptionMap ( exceptions, exception_file );
4623  LoadUSAExceptionMap ( exceptions );
4624  }
4625 }
4626 
4627 string CCountries::USAStateCleanup ( const string& country, CCountries::EStateCleanup& type ) {
4628 
4629  // call algorithmic mapping function
4630  string working = country;
4631  type = s_DoUSAStateCleanup ( working );
4632 
4633  // apply exceptions from preloaded data file
4634  if ( exceptions_initialized ) {
4635  string corrected = exception_map [working];
4636  if ( ! corrected.empty()) {
4637  // presence in map here will disambiguate otherwise ambiguous name pair,
4638  // thus self-entries need to be added to the ambiguous state exception list
4639  if ( ! NStr::StartsWith ( corrected, "USA" )) {
4640  type = e_NotUSA;
4641  } else if ( NStr::Equal ( corrected, working ) && NStr::Equal ( corrected, country )) {
4642  type = e_Valid;
4643  } else {
4644  type = e_Corrected;
4645  }
4646  return corrected;
4647  }
4648  }
4649 
4650  if ( ! NStr::StartsWith ( working, "USA" )) {
4651  type = e_NotUSA;
4652  }
4653  return working;
4654 }
4655 
4656 string CCountries::USAStateCleanup ( const string& country ) {
4657 
4659  return USAStateCleanup ( country, type );
4660 }
4661 
4662 // end of RW-1278
4663 
4664 string CCountries::NewFixCountry (const string& test, bool us_territories)
4665 {
4666  // change requested for JIRA:SQD-1410
4667  if (s_SuppressCountryFix(test)) {
4668  if (IsValid(test)) {
4669  return test;
4670  } else {
4671  return kEmptyStr;
4672  }
4673  }
4674 
4675  // JIRA:RW-2243 Micronesia is the only entry with a comma, special case test here
4676  string micronesia = "Micronesia, Federated States of";
4677  if (NStr::EqualNocase(test, micronesia)) {
4678  if (! NStr::EqualCase(test, micronesia)) {
4679  return micronesia;
4680  }
4681  }
4682  // JIRA:RW-2243 also special case to convert old Micronesia name to new name
4683  if (NStr::EqualNocase(test, "Micronesia")) {
4684  return micronesia;
4685  }
4686 
4687  string input = test;
4688  if (NStr::StartsWith(input, "\"") && NStr::EndsWith(input, "\"")) {
4689  input = input.substr(1, input.length() - 2);
4690  }
4692 
4693  if (NStr::EndsWith(input, ":")) {
4694  input = input.substr(0, input.length() - 1);
4696  }
4697 
4698  string usa1,usa2;
4699  NStr::SplitInTwo(input, ":", usa1, usa2);
4700  if (!usa1.empty() && !usa2.empty()) {
4703  if (NStr::EqualNocase(usa1, "U.S.A.") || NStr::EqualNocase(usa1, "United States") || NStr::EqualNocase(usa1, "United States of America")) {
4704  input = "USA: " + usa2;
4705  }
4706  }
4707 
4708  auto old_name_fix = k_old_country_name_fixes.find(input.c_str());
4709  if (old_name_fix != k_old_country_name_fixes.end()) {
4710  input = old_name_fix->second;
4711  return input;
4712  }
4713 
4714  if (us_territories) {
4715  if ( NStr::StartsWith( input, "Puerto Rico", NStr::eNocase) || NStr::StartsWith( input, "Guam", NStr::eNocase) || NStr::StartsWith( input, "American Samoa", NStr::eNocase) ) {
4716  input = "USA: " + input;
4719  return input;
4720  } else if ( NStr::StartsWith( input, "Virgin Islands", NStr::eNocase) ) {
4721  input = "USA: US " + input;
4724  return input;
4725  }
4726  }
4727 
4728  if (IsValid(input)) {
4730  return input;
4731  }
4732  string new_country = WholeCountryFix(input);
4733  if (!new_country.empty())
4734  return new_country;
4735 
4736  bool too_many_countries = false;
4737  bool bad_cap = false;
4738  vector<string> countries = x_Tokenize(input);
4739  string valid_country;
4740  string orig_valid_country;
4741 
4742  x_FindCountryName(k_country_name_fixes, countries, valid_country, orig_valid_country, too_many_countries, bad_cap);
4743  if (valid_country.empty()) {
4744  x_FindCountryName(k_subregion_fixes, countries, valid_country, orig_valid_country, too_many_countries, bad_cap);
4745  }
4746 
4747  if (!valid_country.empty() && !too_many_countries)
4748  too_many_countries = ContainsMultipleCountryNames (input);
4749 
4750  if (!valid_country.empty() && too_many_countries && valid_country == input)
4751  {
4752  string str1,str2;
4753  NStr::SplitInTwo(valid_country,":",str1,str2);
4754  if (!str1.empty() && !str2.empty() && !NStr::StartsWith(str2," "))
4755  new_country = str1+": "+str2;
4756 
4758  }
4759  else if(!valid_country.empty() && !too_many_countries)
4760  {
4761  // find valid_country in input
4762  size_t pos = NStr::Find(input,orig_valid_country);
4763  // save preceeding string without trailing spaces or delimiters ":,"
4764  string before = input.substr(0,pos);
4765 
4768  // save trailing string without initial spaces or delimiters
4769  string after = input.substr(pos+orig_valid_country.length());
4770  x_RemoveDelimitersFromEnds(after, true);
4772  if (bad_cap) new_country = GetCorrectedCountryCapitalization(valid_country);
4773  else new_country = valid_country;
4774  if (!before.empty() || !after.empty()) {
4775  if (NStr::Find(valid_country, ":") == NPOS) {
4776  new_country += ": ";
4777  } else {
4778  new_country += ", ";
4779  }
4780  }
4781  if (!before.empty())
4782  new_country += before;
4783  if (!before.empty() && !after.empty() && !NStr::Equal(after, ")"))
4784  new_country += ", ";
4785  if (!after.empty())
4786  new_country += after;
4788  }
4789 
4790  return new_country;
4791 }
4792 
4793 
4795 {
4796  // requested in SQD-4516
4797  bool rval = false;
4798  int count = 0;
4799  for (size_t i = 0; i < country.length(); i++) {
4800  if (country[i] == ':') {
4801  count++;
4802  if (count > 1) {
4803  country[i] = ',';
4804  rval = true;
4805  }
4806  }
4807  }
4808  return rval;
4809 }
4810 
4811 
4812 string CCountries::CountryFixupItem(const string &input, bool capitalize_after_colon)
4813 {
4814  string country = NewFixCountry (input);
4815  string new_country = country;
4816  SIZE_TYPE country_end_pos = NStr::Find(country,":");
4817  if (country_end_pos != NPOS)
4818  {
4819  SIZE_TYPE pos = country_end_pos;
4820  while (country[pos] == ',' || country[pos] == ':' || isspace((unsigned char)country[pos]))
4821  {
4822  pos++;
4823  }
4824  string after = country.substr(pos);
4825  if (after.empty()) {
4826  if (pos > country_end_pos) {
4827  new_country = country.substr(0, country_end_pos);
4828  }
4829  } else {
4831  if (capitalize_after_colon)
4832  after = CapitalizeFirstLetterOfEveryWord (after);
4833  new_country = country.substr(0,country_end_pos);
4834  new_country += ": " + after;
4835  }
4836  }
4837  return new_country;
4838 }
4839 
4840 
4841 // SubSource Qual Fixups
4844 
4846  { "adult", "adult" },
4847  { "egg", "egg" },
4848  { "juvenile", "juvenile" },
4849  { "larva", "larva" }
4850 };
4851 
4853 
4854 
4856 {
4857  string fix = value;
4858 
4859  TStaticQualFixMap::const_iterator it = sc_DevStagePairs.find(value.c_str());
4860  if (it != sc_DevStagePairs.end()) {
4861  fix = it->second;
4862  }
4863  return fix;
4864 }
4865 
4866 
4868  { "hemocyte", "hemocyte" },
4869  { "hepatocyte", "hepatocyte" },
4870  { "lymphocyte", "lymphocyte" },
4871  { "neuroblast", "neuroblast" }
4872 };
4873 
4875 
4877 {
4878  string fix = value;
4879 
4880  TStaticQualFixMap::const_iterator it = sc_CellTypePairs.find(value.c_str());
4881  if (it != sc_CellTypePairs.end()) {
4882  fix = it->second;
4883  }
4884  return fix;
4885 
4886 }
4887 
4890 
4892 static bool s_QualFixupMapsInitialized = false;
4893 
4894 static void s_ProcessQualMapLine(const CTempString& line, TQualFixMap& qual_map)
4895 {
4896  vector<CTempString> tokens;
4897  NStr::Split(line, "\t", tokens);
4898  if (tokens.size() > 1) {
4899  qual_map[tokens[0]] = tokens[1];
4900  }
4901 }
4902 
4903 
4904 void s_AddOneDataFile(const string& file_name, const string& data_name,
4905  const char **built_in, size_t num_built_in,
4906  TQualFixMap& qual_map)
4907 {
4908  string file = g_FindDataFile(file_name);
4909  CRef<ILineReader> lr;
4910  if (!file.empty()) {
4911  try {
4912  lr = ILineReader::New(file);
4913  } NCBI_CATCH("s_InitializeQualMaps")
4914  }
4915 
4916  if (lr.Empty()) {
4917  if (built_in == NULL) {
4918  ERR_POST(Note << "No data for " + data_name);
4919  } else {
4920  if (getenv("NCBI_DEBUG")) {
4921  ERR_POST(Note << "Falling back on built-in data for " + data_name);
4922  }
4923  for (size_t i = 0; i < num_built_in; i++) {
4924  const char *p = built_in[i];
4925  s_ProcessQualMapLine(p, qual_map);
4926  }
4927  }
4928  } else {
4929  if (getenv("NCBI_DEBUG")) {
4930  ERR_POST(Note << "Reading from " + file + " for " + data_name);
4931  }
4932  do {
4933  s_ProcessQualMapLine(*++*lr, qual_map);
4934  } while (!lr->AtEOF());
4935  }
4936 }
4937 
4938 #include "isolation_sources.inc"
4939 
4940 static void s_InitializeQualMaps(void)
4941 {
4942  CFastMutexGuard GUARD(s_QualFixMutex);
4944  return;
4945  }
4946 
4947  // tissue types
4948  s_AddOneDataFile("isolation_sources.txt", "isolation sources", (const char **)k_isolation_sources, sizeof(k_isolation_sources) / sizeof(char *), s_IsolationSourceMap);
4950 }
4951 
4952 
4953 
4954 
4955 
4957 {
4958  string fix = value;
4959 
4961 
4963  if (it != s_IsolationSourceMap.end()) {
4964  return it->second;
4965  }
4966 
4967  size_t max = sizeof(sm_ValidSexQualifierTokens) / sizeof(const char*);
4968  for (size_t i = 0; i < max; i++) {
4971  break;
4972  }
4973  }
4974 
4975  fix = COrgMod::FixHostCapitalization(fix);
4976  fix = FixDevStageCapitalization(fix);
4977  fix = FixCellTypeCapitalization(fix);
4978 
4979  return fix;
4980 }
4981 
4982 
4984 {
4985  string fix = value;
4986 
4989  if (it != s_IsolationSourceMap.end()) {
4990  return it->second;
4991  }
4992 
4993 
4994  size_t max = sizeof(sm_ValidSexQualifierTokens) / sizeof(const char*);
4995  for (size_t i = 0; i < max; i++) {
4998  break;
4999  }
5000  }
5001 
5002  fix = COrgMod::FixHostCapitalization(fix);
5003  fix = FixDevStageCapitalization(fix);
5004  fix = FixCellTypeCapitalization(fix);
5005 
5006  return fix;
5007 }
5008 
5009 
5011 {
5013 }
5014 
5015 
5016 string CSubSource::FixCapitalization(TSubtype subtype, const string& value)
5017 {
5018  string new_val = value;
5019  switch (subtype) {
5021  new_val = FixSexQualifierValue(value);
5022  if (NStr::IsBlank(new_val)) {
5023  new_val = value;
5024  }
5025  break;
5028  break;
5030  new_val = FixLabHostCapitalization(value);
5031  break;
5034  break;
5036  new_val = FixDevStageCapitalization(value);
5037  break;
5039  new_val = FixCellTypeCapitalization(value);
5040  break;
5041  default:
5042  new_val = value;
5043  break;
5044  }
5045  return new_val;
5046 }
5047 
5048 
5050 {
5051  if (!IsSetSubtype() || !IsSetName()) {
5052  return;
5053  }
5054 
5055  TSubtype subtype = GetSubtype();
5056 
5057  if (subtype == CSubSource::eSubtype_sex) {
5058  string upr = GetName();
5059  string lwr = upr;
5060  NStr::ToLower(lwr);
5061  if (! NStr::Equal(upr, lwr)) {
5062  SetName(lwr);
5063  }
5064  }
5065 
5066  const string& name = GetName();
5067 
5068  string new_val = FixCapitalization(subtype, name);
5069 
5070  if (!NStr::IsBlank(new_val)) {
5071  SetName(new_val);
5072  }
5073 
5074 }
5075 
5076 
5077 string CSubSource::AutoFix(TSubtype subtype, const string& value)
5078 {
5079  string new_val;
5080  switch (subtype) {
5082  new_val = CCountries::NewFixCountry(value);
5083  break;
5085  new_val = FixDateFormat(value);
5086  break;
5088  new_val = FixLatLonFormat(value);
5089  break;
5091  new_val = FixSexQualifierValue(value);
5092  break;
5094  new_val = FixAltitude(value);
5095  break;
5096  default:
5097  break;
5098  }
5099  return new_val;
5100 }
5101 
5102 
5104 {
5105  if (!IsSetSubtype() || !IsSetName()) {
5106  return;
5107  }
5108 
5109  TSubtype subtype = GetSubtype();
5110  string new_val = AutoFix(subtype, GetName());
5111 
5112  if (!NStr::IsBlank(new_val)) {
5113  SetName(new_val);
5114  } else if (subtype == CSubSource::eSubtype_sex) {
5115  string upr = GetName();
5116  string lwr = upr;
5117  NStr::ToLower(lwr);
5118  if (! NStr::Equal(upr, lwr)) {
5119  SetName(lwr);
5120  }
5121  }
5122 }
5123 
5124 
5125 
// NOTE (applies to s_RemovableCultureNotes and s_ReplaceableCultureNotes):
// ordering matters. If string A is a prefix of string B, string B must be
// listed BEFORE string A, i.e. the longer string comes first.
//
// Notes that are deleted outright from a value. The list is terminated by
// a null pointer.
static const char * s_RemovableCultureNotes[] = {
    // BankIt wizard tags
    "[BankIt_uncultured16S_wizard]; [universal primers]; [tgge]",
    "[BankIt_uncultured16S_wizard]; [universal primers]; [dgge]",
    "[BankIt_uncultured16S_wizard]; [universal primers]",
    "[BankIt_cultured16S_wizard]",
    "[BankIt_organellerRNA_wizard]",
    "[BankIt_ITS_wizard]; [rRNAITS_notfound]",
    "[BankIt_ITS_wizard]",
    // cultured / uncultured source descriptions
    "[uncultured (using universal primers)]",
    "[uncultured (using universal primers) bacterial source]",
    "[cultured bacterial source]",
    "[enrichment culture bacterial source]",
    "[mixed bacterial source (cultured and uncultured)]",
    "[uncultured]; [universal primers]",
    "[mixed bacterial source]",
    // virus wizard and cDNA derivations
    "[virus wizard]",
    "[cDNA derived from mRNA, purified viral particles]",
    "[cDNA derived from mRNA, whole cell/tissue lysate]",
    "[cDNA derived from genomic RNA, whole cell/tissue lysate]",
    "[cDNA derived from genomic RNA, purified viral particles]",
    "[universal primers]",
    // remaining wizard tags
    "[uncultured; wizard]",
    "[uncultured; wizard; spans unknown]",
    "[cultured; wizard]",
    "[cultured; wizard; spans unknown]",
    "[intergenic wizard]",
    "[intergenic wizard; spans unknown]",
    "[Microsatellite wizard]",
    "[Microsatellite wizard; multiple repeats]",
    "[D-loop wizard]",
    "[D-loop wizard; spans unknown]",
    "[D-loop wizard; spans known]",
    nullptr  // end-of-list sentinel
};
5162 
// Notes about species-specific primers. The list is terminated by a null
// pointer. Ordering rule: when one entry is a prefix of another, the longer
// entry must come first.
static const char * s_ReplaceableCultureNotes[] = {
    // BankIt wizard tags
    "[BankIt_uncultured16S_wizard]; [species_specific primers]; [tgge]",
    "[BankIt_uncultured16S_wizard]; [species_specific primers]; [dgge]",
    "[BankIt_uncultured16S_wizard]; [species_specific primers]",
    // free-text variants
    "[uncultured (with species-specific primers)]",
    "[uncultured]; [amplified with species-specific primers]",
    "[uncultured (using species-specific primers) bacterial source]",
    "[amplified with species-specific primers]",
    nullptr  // end-of-list sentinel
};
5173 
5174 
5176 {
5177  for (size_t i = 0; s_RemovableCultureNotes[i] != NULL; i++) {
5179  if (pos != string::npos) {
5180  return true;
5181  }
5182  }
5183  for (size_t i = 0; s_ReplaceableCultureNotes[i] != NULL; i++) {
5185  return true;
5186  }
5187  }
5188  return false;
5189 }
5190 
5191 
5192 void CSubSource::RemoveCultureNotes (string& value, bool is_species_level)
5193 {
5194  if (NStr::IsBlank(value)) {
5195  return;
5196  }
5197 
5198  for (size_t i = 0; s_RemovableCultureNotes[i] != NULL; i++) {
5199  string to_remove = s_RemovableCultureNotes[i];
5200  size_t remove_len = to_remove.length();
5201  size_t pos = NStr::FindNoCase(value, to_remove);
5202  while (pos != NPOS) {
5203  size_t extra_len = strspn (value.c_str() + pos + remove_len, " ;");
5204  value = value.substr(0, pos) + value.substr(pos + remove_len + extra_len);
5205  pos = NStr::FindNoCase(value, to_remove);
5206  }
5207  }
5208  // remove leading/trailing semicolons
5209  while (NStr::StartsWith(value, " ") || NStr::StartsWith(value, ";")) {
5210  value = value.substr(1);
5211  }
5212  while (NStr::EndsWith(value, " ") || NStr::EndsWith(value, ";")) {
5213  value = value.substr(0, value.length() - 1);
5214  }
5215 
5216  if (is_species_level) {
5217  for (size_t i = 0; s_ReplaceableCultureNotes[i] != NULL; i++) {
5219  value = "amplified with species-specific primers";
5220  break;
5221  }
5222  }
5223  }
5224 }
5225 
5226 
5227 void CSubSource::RemoveCultureNotes (bool is_species_level)
5228 {
5229  if (IsSetName()) {
5230  RemoveCultureNotes(SetName(), is_species_level);
5231  if (NStr::IsBlank(GetName())) {
5232  ResetName();
5233  }
5234  }
5235 }
5236 
5237 
5238 // CCountryLine
5240 (const string & country_name, double y, double min_x, double max_x, double scale)
5241 : m_CountryName(country_name) ,
5242  m_Scale (scale)
5243 {
5244  m_Y = x_ConvertLat(y);
5245  m_MinX = x_ConvertLon(min_x);
5246  m_MaxX = x_ConvertLon(max_x);
5247 
5248 }
5249 
5250 
5252 {
5253 }
5254 
5255 
5256 #define EPSILON 0.001
5257 
5258 int CCountryLine::ConvertLat (double y, double scale)
5259 {
5260 
5261  int val = 0;
5262 
5263  if (y < -90.0) {
5264  y = -90.0;
5265  }
5266  if (y > 90.0) {
5267  y = 90.0;
5268  }
5269 
5270  if (y > 0) {
5271  val = (int) (y * scale + EPSILON);
5272  } else {
5273  val = (int) (-(-y * scale + EPSILON));
5274  }
5275 
5276  return val;
5277 }
5278 
5279 
5281 {
5282  return ConvertLat(y, m_Scale);
5283 }
5284 
5285 int CCountryLine::ConvertLon (double x, double scale)
5286 {
5287 
5288  int val = 0;
5289 
5290  if (x < -180.0) {
5291  x = -180.0;
5292  }
5293  if (x > 180.0) {
5294  x = 180.0;
5295  }
5296 
5297  if (x > 0) {
5298  val = (int) (x * scale + EPSILON);
5299  } else {
5300  val = (int) (-(-x * scale + EPSILON));
5301  }
5302 
5303  return val;
5304 }
5305 
5306 
5308 {
5309  return ConvertLon(x, m_Scale);
5310 }
5311 
5312 
5313 CCountryExtreme::CCountryExtreme (const string & country_name, int min_x, int min_y, int max_x, int max_y)
5314 : m_CountryName(country_name) , m_MinX (min_x), m_MinY (min_y), m_MaxX(max_x), m_MaxY (max_y)
5315 {
5316  m_Area = (1 + m_MaxY - m_MinY) * (1 + m_MaxX - m_MinX);
5317  size_t pos = NStr::Find(country_name, ":");
5318  if (pos == NPOS) {
5319  m_Level0 = country_name;
5320  m_Level1.clear();
5321  } else {
5322  m_Level0 = country_name.substr(0, pos);
5324  m_Level1 = country_name.substr(pos + 1);
5326  }
5327 
5328 }
5329 
5330 
5332 {
5333 
5334 }
5335 
5336 
5338 {
5339  if (min_x < m_MinX) {
5340  m_MinX = min_x;
5341  return true;
5342  } else {
5343  return false;
5344  }
5345 }
5346 
5347 
5349 {
5350  if (max_x > m_MaxX) {
5351  m_MaxX = max_x;
5352  return true;
5353  } else {
5354  return false;
5355  }
5356 }
5357 
5358 
5360 {
5361  if (min_y < m_MinY) {
5362  m_MinY = min_y;
5363  return true;
5364  } else {
5365  return false;
5366  }
5367 }
5368 
5369 
5371 {
5372  if (max_y > m_MaxY) {
5373  m_MaxY = max_y;
5374  return true;
5375  } else {
5376  return false;
5377  }
5378 }
5379 
5380 
5382 {
5383  if (line) {
5384  SetMinX(line->GetMinX());
5385  SetMaxX(line->GetMaxX());
5386  SetMinY(line->GetY());
5387  SetMaxY(line->GetY());
5388  m_Area += 1 + line->GetMaxX() - line->GetMinX();
5389  }
5390 }
5391 
5392 
5393 bool CCountryExtreme::DoesOverlap(const CCountryExtreme* other_block) const
5394 {
5395  if (!other_block) {
5396  return false;
5397  } else if (m_MaxX >= other_block->GetMinX()
5398  && m_MaxX <= other_block->GetMaxX()
5399  && m_MaxY >= other_block->GetMinY()
5400  && m_MinY <= other_block->GetMaxY()) {
5401  return true;
5402  } else if (other_block->GetMaxX() >= m_MinX
5403  && other_block->GetMaxX() <= m_MaxX
5404  && other_block->GetMaxY() >= m_MinY
5405  && other_block->GetMinY() <= m_MaxY) {
5406  return true;
5407  } else {
5408  return false;
5409  }
5410 }
5411 
5412 
5413 bool CCountryExtreme::PreferTo(const CCountryExtreme* other_block, const string country, const string province, const bool prefer_new) const
5414 {
5415  if (!other_block) {
5416  return true;
5417  }
5418 
5419  // if no preferred country, these are equal
5420  if (NStr::IsBlank(country)) {
5421  return prefer_new;
5422  }
5423 
5424  // if match to preferred country
5425  if (NStr::EqualNocase(country, m_Level0)) {
5426  // if best was not preferred country, take new match
5427  if (!NStr::EqualNocase(country, other_block->GetLevel0())) {
5428  return true;
5429  }
5430  // if match to preferred province
5431  if (!NStr::IsBlank(province) && NStr::EqualNocase(province, m_Level1)) {
5432  // if best was not preferred province, take new match
5433  if (!NStr::EqualNocase(province, other_block->GetLevel1())) {
5434  return true;
5435  }
5436  }
5437 
5438  // if both match province, or neither does, or no preferred province, take smallest
5439  return prefer_new;
5440  }
5441 
5442  // if best matches preferred country, keep
5443  if (NStr::EqualNocase(country, other_block->GetLevel0())) {
5444  return false;
5445  }
5446 
5447  // otherwise take smallest
5448  return prefer_new;
5449 }
5450 
5451 
5453  : m_Lat(lat),
5454  m_Lon(lon),
5455  m_LandDistance(-1),
5456  m_WaterDistance(-1),
5457  m_ClaimedDistance(-1)
5458 {}
5459 
5460 
5462 {
5464 
5465  // compare guesses or closest regions to indicated country and province
5466  if (!NStr::IsBlank(GetGuessCountry())) {
5467  // if top level countries match
5468  if (NStr::EqualNocase(country, GetGuessCountry())) {
5470  // if both are empty, still call it a match
5471  if (NStr::EqualNocase(province, GetGuessProvince())) {
5473  }
5474  }
5475  // if they don't match, are they closest?
5476  if (!(rval & CLatLonCountryId::fCountryMatch)) {
5477  if (NStr::EqualNocase(country, GetClosestCountry())) {
5479  if (NStr::EqualNocase(province, GetClosestProvince())) {
5481  }
5482  }
5483  } else if (!(rval & CLatLonCountryId::fProvinceMatch) && !NStr::IsBlank(province)) {
5484  if (NStr::EqualNocase (province, GetClosestProvince())) {
5486  }
5487  }
5488  }
5489 
5490  if (!NStr::IsBlank(GetGuessWater())) {
5491  // was the non-approved body of water correctly indicated?
5492  if (NStr::EqualNocase(country, GetGuessWater())) {
5494  } else if (NStr::EqualNocase(country, GetClosestWater())) {
5496  }
5497  }
5498 
5508  }
5509  } else {
5513  }
5514  }
5515  }
5516  return rval;
5517 }
5518 
5519 
5521 {
5522 }
5523 
5524 
5525 #include "lat_lon_country.inc"
5526 static const size_t k_NumLatLonCountryText = ArraySize(s_DefaultLatLonCountryText);
5527 
5528 #include "lat_lon_water.inc"
5529 static const size_t k_NumLatLonWaterText = ArraySize(s_DefaultLatLonWaterText);
5530 
5531 void CLatLonCountryMap::x_InitFromDefaultList(const char * const *list, int num)
5532 {
5533  if (getenv("NCBI_DEBUG")) {
5534  ERR_POST(Note << "Falling back on built-in data for latlon / water data.");
5535  }
5536  // initialize list of country lines
5537  m_CountryLineList.clear();
5538  m_Scale = 20.0;
5539  string current_country;
5540 
5541  for (int i = 0; i < num; i++) {
5542  CTempString line = list[i];
5543  if (line[0] == '-') {
5544  // skip comment
5545  } else if (isalpha ((unsigned char)line[0])) {
5546  current_country = line;
5547  } else if (isdigit ((unsigned char)line[0])) {
5548  m_Scale = NStr::StringToDouble(line);
5549  } else {
5550  vector<string> tokens;
5551  NStr::Split(line, "\t", tokens);
5552  if (tokens.size() > 3) {
5553  double x = NStr::StringToDouble(tokens[1]);
5554  for (size_t j = 2; j < tokens.size() - 1; j+=2) {
5555  m_CountryLineList.push_back(new CCountryLine(current_country, x, NStr::StringToDouble(tokens[j]), NStr::StringToDouble(tokens[j + 1]), m_Scale));
5556  }
5557  }
5558  }
5559  }
5560 }
5561 
5562 
5563 
5564 
5565 bool CLatLonCountryMap::x_InitFromFile(const string& filename)
5566 {
5567  string fname = g_FindDataFile (filename);
5568  if (NStr::IsBlank (fname)) {
5569  return false;
5570  }
5571  if (getenv("NCBI_DEBUG")) {
5572  ERR_POST(Note << "Reading from " + filename + " for latlon/water data.");
5573  }
5574  CRef<ILineReader> lr = ILineReader::New (fname);
5575  if (lr.Empty()) {
5576  return false;
5577  } else {
5578  m_Scale = 20.0;
5579  string current_country;
5580 
5581  // make sure to clear before using. in this outer
5582  // scope in the interest of speed (avoid repeated
5583  // construction/destruction)
5584  vector<SIZE_TYPE> tab_positions;
5585 
5586  do {
5587  // const string& line = *++*lr;
5588  CTempString line = *++*lr;
5589  if (line[0] == '-') {
5590  // skip comment
5591  } else if (isalpha ((unsigned char)line[0])) {
5592  current_country = line;
5593  } else if (isdigit ((unsigned char)line[0])) {
5594  m_Scale = NStr::StringToDouble(line);
5595  } else {
5596  // NStr::Tokenize would be much simpler, but
5597  // it's just too slow in this case, especially
5598  // in debug mode.
5599 
5600  // for the future, if we need even more speed,
5601  // it should be possible to eliminate the tab_positions
5602  // vector and collect tab positions on the fly without
5603  // any heap-allocated memory
5604 
5605  // find position of all tabs on this line
5606  tab_positions.clear();
5607  SIZE_TYPE tab_pos = line.find('\t');
5608  while( tab_pos != NPOS ) {
5609  tab_positions.push_back(tab_pos);
5610  tab_pos = line.find('\t', tab_pos+1);
5611  }
5612  // an imaginary sentinel tab
5613  tab_positions.push_back(line.length());
5614 
5615  const char * line_start = line.data();
5616  if( tab_positions.size() >= 4 ) {
5617  CTempString y_str( line_start + tab_positions[0]+1, tab_positions[1] - tab_positions[0] - 1 );
5618  double y = NStr::StringToDouble( y_str );
5619 
5620  // convert into line list
5621  for (size_t j = 1; j < tab_positions.size() - 2; j+=2) {
5622  const SIZE_TYPE pos1 = tab_positions[j];
5623  const SIZE_TYPE pos2 = tab_positions[j+1];
5624  const SIZE_TYPE pos3 = tab_positions[j+2];
5625  CTempString first_num( line_start + pos1 + 1, pos2 - pos1 - 1 );
5626  CTempString second_num( line_start + pos2 + 1, pos3 - pos2 - 1 );
5627  m_CountryLineList.push_back(new CCountryLine(current_country, y, NStr::StringToDouble(first_num), NStr::StringToDouble(second_num), m_Scale));
5628  }
5629  }
5630  }
5631  } while ( !lr->AtEOF() );
5632 
5633  return true;
5634  }
5635 }
5636 
5637 bool
5639  const CCountryLine* line1,
5640  const CCountryLine* line2)
5641 {
5642  if (line1->GetY() < line2->GetY()) {
5643  return true;
5644  } else if (line1->GetY() > line2->GetY()) {
5645  return false;
5646  } else {
5647  if (line1->GetMinX() < line2->GetMinX()) {
5648  return true;
5649  } else {
5650  return false;
5651  }
5652  }
5653 }
5654 
5657  const CCountryLine* line2)
5658 {
5659  int cmp = NStr::CompareNocase(line1->GetCountry(), line2->GetCountry());
5660  if (cmp == 0) {
5662  } else if (cmp < 0) {
5663  return true;
5664  } else {
5665  return false;
5666  }
5667 }
5668 
5669 
5672  const CCountryLine* line2)
5673 {
5674  if (line1->GetY() < line2->GetY()) {
5675  return true;
5676  } else if (line1->GetY() > line2->GetY()) {
5677  return false;
5678  } if (line1->GetMinX() < line2->GetMinX()) {
5679  return true;
5680  } else if (line1->GetMinX() > line2->GetMinX()) {
5681  return false;
5682  } else if (line1->GetMaxX() < line2->GetMaxX()) {
5683  return true;
5684  } else if (line1->GetMaxX() > line2->GetMaxX()) {
5685  return false;
5686  } else {
5687  int cmp = NStr::CompareNocase(line1->GetCountry(), line2->GetCountry());
5688  if (cmp < 0) {
5689  return true;
5690  } else {
5691  return false;
5692  }
5693  }
5694 }
5695 
5696 
5698 {
5699  // initialize list of country lines
5700  m_CountryLineList.clear();
5701 
5702  const char* env_val = getenv("NCBI_LAT_LON_DATA_PATH");
5703  string data_path;
5704  if (env_val) {
5705  data_path = (string) env_val;
5706  if (! NStr::EndsWith(data_path, "/")) {
5707  data_path = data_path + "/";
5708  }
5709  }
5710 
5711  if (is_water) {
5712  if (!x_InitFromFile("lat_lon_water.txt")) {
5713  if (data_path.empty() || !x_InitFromFile(data_path + "lat_lon_water.txt")) {
5714  x_InitFromDefaultList(s_DefaultLatLonWaterText, k_NumLatLonWaterText);
5715  }
5716  }
5717  } else {
5718  if (!x_InitFromFile("lat_lon_country.txt")) {
5719  if (data_path.empty() || !x_InitFromFile(data_path + "lat_lon_country.txt")) {
5720  x_InitFromDefaultList(s_DefaultLatLonCountryText, k_NumLatLonCountryText);
5721  }
5722  }
5723  }
5724 
5725  // Instead of doing a plain sort, we take advantage of the fact that
5726  // there are few unique country names versus the number
5727  // of lines.
5728  typedef map<CTempString, TCountryLineList, PNocase> TCountryToLinesMap;
5729  // this map maps a country name (case insens) to all the lines that
5730  // belong to that country.
5731  TCountryToLinesMap countryToLinesMap;
5733  countryToLinesMap[(*line_it)->GetCountry()].push_back(*line_it);
5734  }
5735 
5736  // build new m_CountryLineList here:
5737  TCountryLineList new_country_line_list;
5738  NON_CONST_ITERATE(TCountryToLinesMap, country_lines_it, countryToLinesMap)
5739  {
5740  // sort the lines for each country by lat/lon only, since we've already
5741  // implicitly sorted by country in countryToLinesMap
5742  TCountryLineList & line_list_for_this_country =
5743  country_lines_it->second;
5744  stable_sort(
5745  BEGIN_COMMA_END(line_list_for_this_country),
5747  copy(BEGIN_COMMA_END(line_list_for_this_country),
5748  back_inserter(new_country_line_list));
5749  }
5750  // swap should be constant time
5751  m_CountryLineList.swap(new_country_line_list);
5752 
5753  // set up extremes index and copy into LatLon index
5754  m_CountryExtremes.clear();
5755  m_LatLonSortedList.clear();
5756  size_t i, ext = 0;
5757 
5758  for (i = 0; i < m_CountryLineList.size(); i++) {
5759  if (ext > 0 && NStr::Equal(m_CountryLineList[i]->GetCountry(), m_CountryExtremes[ext - 1]->GetCountry())) {
5760  m_CountryExtremes[ext - 1]->AddLine(m_CountryLineList[i]);
5761  } else {
5762  m_CountryExtremes.push_back(new CCountryExtreme(m_CountryLineList[i]->GetCountry(),
5763  m_CountryLineList[i]->GetMinX(),
5764  m_CountryLineList[i]->GetY(),
5765  m_CountryLineList[i]->GetMaxX(),
5766  m_CountryLineList[i]->GetY()));
5767  ext++;
5768  }
5770  m_CountryLineList[i]->SetBlock(m_CountryExtremes[ext - 1]);
5771  }
5773 
5774 }
5775 
5776 
5778 {
5779  size_t i;
5780 
5781  for (i = 0; i < m_CountryLineList.size(); i++) {
5782  delete (m_CountryLineList[i]);
5783  }
5784  m_CountryLineList.clear();
5785 
5786  for (i = 0; i < m_CountryExtremes.size(); i++) {
5787  delete (m_CountryExtremes[i]);
5788  }
5789  m_CountryExtremes.clear();
5790  // note - do not delete items in m_LatLonSortedList, they are pointing to the same objects as m_CountryLineList
5791  m_LatLonSortedList.clear();
5792 }
5793 
5794 
5795 bool CLatLonCountryMap::IsCountryInLatLon(const string& country, double lat,
5796  double lon) const
5797 {
5798  int x = CCountryLine::ConvertLon(lon, m_Scale);
5799  int y = CCountryLine::ConvertLat(lat, m_Scale);
5800 
5801  size_t L, R, mid;
5802 
5803  L = 0;
5804  R = m_CountryLineList.size() - 1;
5805  mid = 0;
5806 
5807  while (L < R) {
5808  mid = (L + R) / 2;
5809  int cmp = NStr::Compare(m_CountryLineList[mid]->GetCountry(), country);
5810  if (cmp < 0) {
5811  L = mid + 1;
5812  } else if (cmp > 0) {
5813  R = mid;
5814  } else {
5815  while (mid > 0
5816  && NStr::Compare(m_CountryLineList[mid - 1]->GetCountry(), country) == 0
5817  && m_CountryLineList[mid - 1]->GetY() >= y) {
5818  mid--;
5819  }
5820  L = mid;
5821  R = mid;
5822  }
5823  }
5824 
5825  while (R < m_CountryLineList.size()
5826  && NStr::EqualNocase(country, m_CountryLineList[R]->GetCountry())
5827  && m_CountryLineList[R]->GetY() < y) {
5828  R++;
5829  }
5830 
5831  while (R < m_CountryLineList.size()
5832  && NStr::EqualNocase(country, m_CountryLineList[R]->GetCountry())
5833  && m_CountryLineList[R]->GetY() == y
5834  && m_CountryLineList[R]->GetMaxX() < x) {
5835  R++;
5836  }
5837  if (R < m_CountryLineList.size()
5838  && NStr::EqualNocase(country, m_CountryLineList[R]->GetCountry())
5839  && m_CountryLineList[R]->GetY() == y
5840  && m_CountryLineList[R]->GetMinX() <= x
5841  && m_CountryLineList[R]->GetMaxX() >= x) {
5842  return true;
5843  } else {
5844  return false;
5845  }
5846 }
5847 
5848 
5849 const CCountryExtreme *
5850 CLatLonCountryMap::x_FindCountryExtreme(const string& country) const
5851 {
5852  size_t L, R, mid;
5853 
5854  if (NStr::IsBlank (country)) return NULL;
5855 
5856  L = 0;
5857  R = m_CountryExtremes.size() - 1;
5858 
5859  while (L < R) {
5860  mid = (L + R) / 2;
5861  if (NStr::CompareNocase(m_CountryExtremes[mid]->GetCountry(), country) < 0) {
5862  L = mid + 1;
5863  } else {
5864  R = mid;
5865  }
5866  }
5867  if (!NStr::EqualNocase(m_CountryExtremes[R]->GetCountry(), country)) {
5868  return NULL;
5869  } else {
5870  return m_CountryExtremes[R];
5871  }
5872 }
5873 
5874 
5875 bool CLatLonCountryMap::HaveLatLonForRegion(const string& region) const
5876 {
5877  if (x_FindCountryExtreme(region) == NULL) {
5878  return false;
5879  } else {
5880  return true;
5881  }
5882 }
5883 
5884 
5886 {
5887  size_t L, R, mid;
5888 
5889  L = 0;
5890  R = m_LatLonSortedList.size() - 1;
5891  mid = 0;
5892 
5893  while (L < R) {
5894  mid = (L + R) / 2;
5895  if (m_LatLonSortedList[mid]->GetY() < y) {
5896  L = mid + 1;
5897  } else if (m_LatLonSortedList[mid]->GetY() > y) {
5898  R = mid;
5899  } else {
5900  while (mid > 0 && m_LatLonSortedList[mid - 1]->GetY() == y) {
5901  mid--;
5902  }
5903  L = mid;
5904  R = mid;
5905  }
5906  }
5907  return R;
5908 }
5909 
5910 
5911 const CCountryExtreme *
5913  const string& country,
5914  const string& province) const
5915 {
5916  int x = CCountryLine::ConvertLon(lon, m_Scale);
5917  int y = CCountryLine::ConvertLon(lat, m_Scale);
5918 
5919  size_t R = x_GetLatStartIndex(y);
5920 
5921  const CCountryExtreme *best = NULL;
5922 
5923  while (R < m_LatLonSortedList.size() && m_LatLonSortedList[R]->GetY() == y) {
5924  if (m_LatLonSortedList[R]->GetMinX() <= x
5925  && m_LatLonSortedList[R]->GetMaxX() >= x) {
5926  const CCountryExtreme *other = m_LatLonSortedList[R]->GetBlock();
5927  if (best == NULL) {
5928  best = other;
5929  } else if (!best->PreferTo(other, country, province, (bool)(best->GetArea() <= other->GetArea()))) {
5930  best = other;
5931  }
5932  }
5933  R++;
5934  }
5935  return best;
5936 }
5937 
5938 
5939 //Distance on a spherical surface calculation adapted from
5940 //http://www.linuxjournal.com/magazine/
5941 //work-shell-calculating-distance-between-two-latitudelongitude-points
5942 
#define EARTH_RADIUS 6371.0 /* average radius of non-spherical earth in kilometers */
#define CONST_PI 3.14159265359

// Convert an angle from degrees to radians.
static double DegreesToRadians (
    double degrees
)

{
    return (degrees * (CONST_PI / 180.0));
}

// Great-circle distance, in kilometers, between point A (latA, lonA) and
// point B (latB, lonB), both given in degrees, using the haversine formula
// on a sphere of radius EARTH_RADIUS.
static double DistanceOnGlobe (
    double latA,
    double lonA,
    double latB,
    double lonB
)

{
    const double lat1 = DegreesToRadians (latA);
    const double lon1 = DegreesToRadians (lonA);
    const double lat2 = DegreesToRadians (latB);
    const double lon2 = DegreesToRadians (lonB);

    const double dLat = lat2 - lat1;
    const double dLon = lon2 - lon1;

    double a = sin (dLat / 2) * sin (dLat / 2) +
               cos (lat1) * cos (lat2) * sin (dLon / 2) * sin (dLon / 2);

    // Mathematically a lies in [0, 1] for valid latitudes, but rounding
    // (near-antipodal points) or out-of-range latitudes can push it just
    // outside, which would hand sqrt() a negative argument and yield NaN.
    if (a < 0.0) {
        a = 0.0;
    } else if (a > 1.0) {
        a = 1.0;
    }

    const double c = 2 * atan2 (sqrt (a), sqrt (1 - a));

    return (double) (EARTH_RADIUS * c);
}
5979 
5980 
5982  double latA,
5983  double lonA,
5984  double scale)
5985 {
5986  double lat1, lon1, lat2, lon2;
5987  double dLat, dLon, a, c;
5988 
5989  lat1 = DegreesToRadians (latA);
5990  lon1 = DegreesToRadians (lonA);
5991  lat2 = DegreesToRadians (latA + (1.0 / scale));
5992  lon2 = DegreesToRadians (lonA + (1.0 / scale));
5993 
5994  dLat = lat2 - lat1;
5995  dLon = lon2 - lon1;
5996 
5997  a = sin (dLat / 2) * sin (dLat / 2) +
5998  cos (lat1) * cos (lat2) * sin (dLon / 2) * sin (dLon / 2);
5999  c = 2 * atan2 (sqrt (a), sqrt (1 - a));
6000 
6001  return (double) (EARTH_RADIUS * c);
6002 
6003 }
6004 
6005 
6007  double lon,
6008  double range,
6009  double &distance)
6010 {
6011  int x = CCountryLine::ConvertLon(lon, m_Scale);
6012  int y = CCountryLine::ConvertLon(lat, m_Scale);
6013 
6014  int maxDelta = (int) (range * m_Scale + EPSILON);
6015  int min_y = y - maxDelta;
6016  int max_y = y + maxDelta;
6017  int min_x = x - maxDelta;
6018  int max_x = x + maxDelta;
6019 
6020  // binary search to lowest lat
6021  size_t R = x_GetLatStartIndex(min_y);
6022 
6023  double closest = 0.0;
6024  CCountryExtreme *rval = NULL;
6025 
6026  while (R < m_LatLonSortedList.size() && m_LatLonSortedList[R]->GetY() <= max_y) {
6027  if (m_LatLonSortedList[R]->GetMaxX() < min_x || m_LatLonSortedList[R]->GetMinX() > max_x) {
6028  // out of range, don't bother calculating distance
6029  } else {
6030  double end;
6031  if (x < m_LatLonSortedList[R]->GetMinX()) {
6032  end = m_LatLonSortedList[R]->GetMinLon();
6033  } else if (x > m_LatLonSortedList[R]->GetMaxX()) {
6034  end = m_LatLonSortedList[R]->GetMaxLon();
6035  } else {
6036  end = lon;
6037  }
6038  double dist = DistanceOnGlobe (lat, lon, m_LatLonSortedList[R]->GetLat(), end);
6039  if (rval == NULL || closest > dist
6040  || (closest == dist
6041  && (rval->GetArea() > m_LatLonSortedList[R]->GetBlock()->GetArea()
6042  || (rval->GetArea() == m_LatLonSortedList[R]->GetBlock()->GetArea()
6043  && NStr::IsBlank(rval->GetLevel1())
6044  && !NStr::IsBlank(m_LatLonSortedList[R]->GetBlock()->GetLevel1()))))) {
6045  rval = m_LatLonSortedList[R]->GetBlock();
6046  closest = dist;
6047  }
6048  }
6049  R++;
6050  }
6051  distance = closest;
6052  return rval;
6053 }
6054 
6055 
6056 bool CLatLonCountryMap::IsClosestToLatLon(const string& comp_country,
6057  double lat, double lon,
6058  double range, double &distance) const
6059 {
6060  int x = CCountryLine::ConvertLon(lon, m_Scale);
6061  int y = CCountryLine::ConvertLon(lat, m_Scale);
6062 
6063  int maxDelta = (int) (range * m_Scale + EPSILON);
6064  int min_y = y - maxDelta;
6065  int max_y = y + maxDelta;
6066  int min_x = x - maxDelta;
6067  int max_x = x + maxDelta;
6068 
6069  // binary search to lowest lat
6070  size_t R = x_GetLatStartIndex(min_y);
6071 
6072  string country;
6073  double closest = 0.0;
6074  int smallest_area = -1;
6075 
6076  while (R < m_LatLonSortedList.size() && m_LatLonSortedList[R]->GetY() <= max_y) {
6077  if (m_LatLonSortedList[R]->GetMaxX() < min_x || m_LatLonSortedList[R]->GetMinX() > max_x) {
6078  // out of range, don't bother calculating distance
6079  } else {
6080  double end;
6081  if (x < m_LatLonSortedList[R]->GetMinX()) {
6082  end = m_LatLonSortedList[R]->GetMinLon();
6083  } else {
6084  end = m_LatLonSortedList[R]->GetMaxLon();
6085  }
6086  double dist = DistanceOnGlobe (lat, lon, m_LatLonSortedList[R]->GetLat(), end);
6087  if (NStr::IsBlank (country) || closest > dist) {
6088  country = m_LatLonSortedList[R]->GetCountry();
6089  closest = dist;
6090  const CCountryExtreme * ext = x_FindCountryExtreme(country);
6091  if (ext) {
6092  smallest_area = ext->GetArea();
6093  }
6094  } else if (closest == dist) {
6095  // if the distances are the same, prefer the input country, otherwise prefer the smaller region
6096  if (NStr::Equal(country, comp_country)) {
6097  // keep country we're searching for
6098  } else if (!NStr::Equal(m_LatLonSortedList[R]->GetCountry(), country)) {
6099  const CCountryExtreme * ext = x_FindCountryExtreme(m_LatLonSortedList[R]->GetCountry());
6100  if (ext
6101  && (ext->GetArea() < smallest_area
6102  || NStr::Equal(m_LatLonSortedList[R]->GetCountry(), comp_country))) {
6103  country = m_LatLonSortedList[R]->GetCountry();
6104  smallest_area = ext->GetArea();
6105  }
6106  }
6107  }
6108  }
6109  R++;
6110  }
6111  distance = closest;
6112  return NStr::Equal(country, comp_country);
6113 }
6114 
6115 
6116 const CCountryExtreme * CLatLonCountryMap::IsNearLatLon(double lat, double lon,
6117  double range,
6118  double &distance,
6119  const string& country,
6120  const string& province) const
6121 {
6122  int x = CCountryLine::ConvertLon(lon, m_Scale);
6123  int y = CCountryLine::ConvertLat(lat, m_Scale);
6124  double closest = -1.0;
6125  int maxDelta = (int) (range * m_Scale + EPSILON);
6126  int min_y = y - maxDelta;
6127  int max_y = y + maxDelta;
6128  int min_x = x - maxDelta;
6129  int max_x = x + maxDelta;
6130  CCountryExtreme *ext = NULL;
6131 
6132  // binary search to lowest lat
6133  size_t R = x_GetLatStartIndex(min_y);
6134 
6135  while (R < m_LatLonSortedList.size() && m_LatLonSortedList[R]->GetY() <= max_y) {
6136  if (m_LatLonSortedList[R]->GetMaxX() < min_x || m_LatLonSortedList[R]->GetMinX() > max_x) {
6137  // out of range, don't bother calculating distance
6138  } else if (!NStr::EqualNocase(m_LatLonSortedList[R]->GetBlock()->GetLevel0(), country)) {
6139  // wrong country, skip
6140  } else if (!NStr::IsBlank(province) && !NStr::EqualNocase(m_LatLonSortedList[R]->GetBlock()->GetLevel1(), province)) {
6141  // wrong province, skip
6142  } else {
6143  double end;
6144  if (x < m_LatLonSortedList[R]->GetMinX()) {
6145  end = m_LatLonSortedList[R]->GetMinLon();
6146  } else if (x > m_LatLonSortedList[R]->GetMaxX()) {
6147  end = m_LatLonSortedList[R]->GetMaxLon();
6148  } else {
6149  end = lon;
6150  }
6151  double dist = DistanceOnGlobe (lat, lon, m_LatLonSortedList[R]->GetLat(), end);
6152  if (closest < 0.0 || closest > dist) {
6153  closest = dist;
6154  ext = m_LatLonSortedList[R]->GetBlock();
6155  }
6156  }
6157  R++;
6158  }
6159  distance = closest;
6160  return ext;
6161 }
6162 
6163 
6164 
6165 
6166 
6167 bool CLatLonCountryMap::DoCountryBoxesOverlap(const string& country1,
6168  const string& country2) const
6169 {
6170  if (NStr::IsBlank (country1) || NStr::IsBlank(country2)) return false;
6171 
6172  const CCountryExtreme *ext1 = x_FindCountryExtreme (country1);
6173  if (!ext1) {
6174  return false;
6175  }
6176  const CCountryExtreme *ext2 = x_FindCountryExtreme (country2);
6177  if (!ext2) {
6178  return false;
6179  }
6180 
6181 
6182  return ext1->DoesOverlap(ext2);
6183 }
6184 
6185 
6186 int CLatLonCountryMap::AdjustAndRoundDistance (double distance, double scale)
6187 
6188 {
6189  if (scale < 1.1) {
6190  distance += 111.19;
6191  } else if (scale > 19.5 && scale < 20.5) {
6192  distance += 5.56;
6193  } else if (scale > 99.5 && scale < 100.5) {
6194  distance += 1.11;
6195  }
6196 
6197  return (int) (distance + 0.5);
6198 }
6199 
6200 
6202 
6203 {
6204  return AdjustAndRoundDistance (distance, m_Scale);
6205 }
6206 
6207 
6208 
6209 
6210 END_objects_SCOPE // namespace ncbi::objects::
6211 
6213 
6214 /* Original file checksum: lines: 65, chars: 1891, CRC32: 7724f0c5 */
#define EPSILON
Definition: SubSource.cpp:5256
#define CONST_PI
Definition: SubSource.cpp:5944
static void s_ProcessCellLineLine(const CTempString &line)
Definition: SubSource.cpp:2921
SStaticPair< const char *, const char * > TParishMapEntry
Definition: SubSource.cpp:4144
static const char *const s_Null_Countries[]
Definition: SubSource.cpp:3287
static string s_InsertSpacesBetweenTokens(const string &old_str)
Definition: SubSource.cpp:1350
double ErrorDistance(double latA, double lonA, double scale)
Definition: SubSource.cpp:5981
static TCellLineContaminationMap s_CellLineContaminationMap
Definition: SubSource.cpp:2915
void s_AddOneDataFile(const string &file_name, const string &data_name, const char **built_in, size_t num_built_in, TQualFixMap &qual_map)
Definition: SubSource.cpp:4904
static string s_NormalizeTokens(vector< string > &tokens, vector< double > &numbers, vector< string > &anum, vector< int > &precision, vector< string > &lat_long, vector< string > &nsew)
Definition: SubSource.cpp:1433
map< string, string, PNocase > TQualFixMap
Definition: SubSource.cpp:4889
DEFINE_STATIC_ARRAY_MAP(TWaterPairMap, sc_WaterPairMap, k_water_pair_map)
static void s_InitializeCellLineContaminationMap(void)
Definition: SubSource.cpp:2935
static const size_t k_NumLatLonCountryText
Definition: SubSource.cpp:5526
CStaticArrayMap< const char *, const char *, PNocase_CStr > TWaterPairMap
Definition: SubSource.cpp:2087
const char * sm_ValidSexQualifierTokens[]
Definition: SubSource.cpp:2447
map< string, TSpeciesContaminant > TCellLineContaminationMap
Definition: SubSource.cpp:2913
CCountries::EStateCleanup s_DoUSAStateCleanup(string &country)
Definition: SubSource.cpp:4450
SStaticPair< const char *, const char * > TStateMapEntry
Definition: SubSource.cpp:4295
static const TCStrSet s_Former_CountriesSet(s_Former_Countries, sizeof(s_Former_Countries), __FILE__, __LINE__)
static CCountries::TUsaExceptionMap exception_map
Definition: SubSource.cpp:4574
static double DegreesToRadians(double degrees)
Definition: SubSource.cpp:5946
static const char * s_ReplaceableCultureNotes[]
Definition: SubSource.cpp:5163
static TQualFixMap s_IsolationSourceMap
Definition: SubSource.cpp:4891
static bool s_CellLineContaminationMapInitialized
Definition: SubSource.cpp:2916
static void s_InitializeQualMaps(void)
Definition: SubSource.cpp:4940
static const TCStrSet s_Null_CountriesSet(s_Null_Countries, sizeof(s_Null_Countries), __FILE__, __LINE__)
bool s_IsState(string &state, bool &modified)
Definition: SubSource.cpp:4413
static bool s_FailsGenusOrSpeciesTest(const string &value, const string &taxname)
Definition: SubSource.cpp:2776
string s_ShortenLatLon(string &subname)
Definition: SubSource.cpp:1811
bool s_IsParish(string &parish)
Definition: SubSource.cpp:4279
static const TWaterPairElem k_water_pair_map[]
Definition: SubSource.cpp:2008
static const SStaticPair< const char *, const char * > s_map_subregion_fixes[]
Definition: SubSource.cpp:3738
CStaticPairArrayMap< const char *, const char *, PCase_CStr > TCStringPairsMap
Definition: SubSource.cpp:3425
static bool s_init_UseGeoLocNameForCountry(void)
Definition: SubSource.cpp:68
static void s_ProcessQualMapLine(const CTempString &line, TQualFixMap &qual_map)
Definition: SubSource.cpp:4894
CStaticArraySet< const char *, PCase_CStr > TCStrSet
Definition: SubSource.cpp:558
static const SStaticPair< const char *, const char * > s_map_old_country_name_fixes[]
Definition: SubSource.cpp:3731
static const TStaticQualFixPair kCellTypePairs[]
Definition: SubSource.cpp:4867
static const TStaticQualFixPair kDevStagePairs[]
Definition: SubSource.cpp:4845
static string s_RemoveSpacesWithinNumbers(const string &old_str)
Definition: SubSource.cpp:1383
static bool s_QualFixupMapsInitialized
Definition: SubSource.cpp:4892
static const char * s_RemovableCultureNotes[]
Definition: SubSource.cpp:5128
CStaticPairArrayMap< const char *, const char *, PNocase_CStr > TParishMap
Definition: SubSource.cpp:4276
static string x_FindSurroundingOcean(string &water)
Definition: SubSource.cpp:2090
SStaticPair< const char *, const char * > TWaterPairElem
Definition: SubSource.cpp:2007
#define EARTH_RADIUS
Definition: SubSource.cpp:5943
DEFINE_STATIC_FAST_MUTEX(s_CellLineContaminationMutex)
static bool exceptions_initialized
Definition: SubSource.cpp:4575
CStaticPairArrayMap< const char *, const char *, PNocase_CStr > TStaticQualFixMap
Definition: SubSource.cpp:4843
static void s_GetLatLong(const string &new_str, vector< double > &numbers, vector< int > &precision)
Definition: SubSource.cpp:1641
bool s_SuppressCountryFix(const string &test)
Definition: SubSource.cpp:4035
static bool s_IsNumber(const string &token, double *result=NULL)
Definition: SubSource.cpp:1420
static const TParishMapEntry parish_abbrev_array[]
Definition: SubSource.cpp:4145
bool s_ChooseMonthAndDay(const string &token1, const string &token2, bool month_first, string &month, int &day, bool &month_ambiguous)
Definition: SubSource.cpp:919
static const TCStrSet s_CountriesSet(s_Countries, sizeof(s_Countries), __FILE__, __LINE__)
const char * sm_ValidSexQualifierPhrases[]
Definition: SubSource.cpp:2467
static const char *const s_Countries[]
Definition: SubSource.cpp:2980
map< string, TContaminatingCellLine > TSpeciesContaminant
Definition: SubSource.cpp:2912
CRowReader< CRowReaderStream_NCBI_TSV > TNCBITSVStream
Definition: SubSource.cpp:4572
static const SStaticPair< const char *, const char * > s_map_country_name_fixes[]
Definition: SubSource.cpp:3428
bool s_IsValidSexQualifierPhrase(const string &value)
Definition: SubSource.cpp:2473
bool s_CompressRunsOfSpaces(string &val)
Definition: SubSource.cpp:4097
static const SStaticPair< const char *, const char * > s_map_whole_country_fixes[]
Definition: SubSource.cpp:3419
static void s_ReorderNorthSouthEastWest(vector< double > &numbers, vector< int > &precision, const vector< string > &lat_long, vector< string > &nsew)
Definition: SubSource.cpp:1562
static const char * s_USAStates[]
Definition: SubSource.cpp:3797
SStaticPair< const char *, const char * > TStaticQualFixPair
Definition: SubSource.cpp:4842
pair< string, string > TContaminatingCellLine
Definition: SubSource.cpp:2911
MAKE_CONST_SET(s_Null_CollectionDatesSet, ct::tagStrCase, { "missing", "missing: control sample", "missing: data agreement established pre-2023", "missing: endangered species", "missing: human-identifiable", "missing: lab stock", "missing: sample group", "missing: synthetic construct", "missing: third party data", "not applicable", "not collected", "not provided", "restricted access", }) string CSubSource
Definition: SubSource.cpp:561
CStaticPairArrayMap< const char *, const char *, PNocase_CStr > TStateMap
Definition: SubSource.cpp:4410
void s_CollectNumberAndUnits(const string &value, string &number, string &units)
Definition: SubSource.cpp:2593
bool s_ContainsWholeWord(const CTempString test, const CTempString word, NStr::ECase case_sense)
Definition: SubSource.cpp:4015
static double DistanceOnGlobe(double latA, double lonA, double latB, double lonB)
Definition: SubSource.cpp:5954
static const char *const s_Former_Countries[]
Definition: SubSource.cpp:3265
static const TStateMapEntry state_abbrev_array[]
Definition: SubSource.cpp:4296
static const size_t k_NumLatLonWaterText
Definition: SubSource.cpp:5529
static vector< string > x_Tokenize(const string &val)
Definition: SubSource.cpp:3983
static string NewFixCountry(const string &input, bool us_territories=false)
Definition: SubSource.cpp:4664
static bool WasValid(const string &country)
Definition: SubSource.cpp:3377
static string USAStateCleanup(const string &country)
Definition: SubSource.cpp:4656
static string WholeCountryFix(string country)
Definition: SubSource.cpp:3861
static void x_RemoveDelimitersFromEnds(string &val, bool except_paren=false)
Definition: SubSource.cpp:3933
static bool IsValid(const string &country)
Definition: SubSource.cpp:3304
static bool ContainsMultipleCountryNames(const string &phrase)
Definition: SubSource.cpp:3901
static bool IsSubstringOfStringInList(const string &phrase, const string &country1, size_t pos1)
Definition: SubSource.cpp:3881
static void x_FindCountryName(const TCStringPairsMap &fix_map, const vector< string > &countries, string &valid_country, string &orig_valid_country, bool &too_many_countries, bool &bad_cap)
Definition: SubSource.cpp:4047
static void ReadUSAExceptionMap(TUsaExceptionMap &exceptions, const string &filepath)
Definition: SubSource.cpp:4577
static bool ChangeExtraColonsToCommas(string &country)
Definition: SubSource.cpp:4794
static string CapitalizeFirstLetterOfEveryWord(const string &phrase)
Definition: SubSource.cpp:3851
static string CountryFixupItem(const string &input, bool capitalize_after_colon)
Definition: SubSource.cpp:4812
static string GetCorrectedCountryCapitalization(const string &country)
Definition: SubSource.cpp:3921
static void LoadUSAExceptionMap(const TUsaExceptionMap &exceptions)
Definition: SubSource.cpp:4592
void AddLine(const CCountryLine *line)
Definition: SubSource.cpp:5381
bool SetMinX(int min_x)
Definition: SubSource.cpp:5337
bool SetMaxY(int max_y)
Definition: SubSource.cpp:5370
bool DoesOverlap(const CCountryExtreme *other_block) const
Definition: SubSource.cpp:5393
int GetMinX(void) const
Definition: SubSource.hpp:338
string GetLevel0(void) const
Definition: SubSource.hpp:336
CCountryExtreme(const string &country_name, int min_x, int min_y, int max_x, int max_y)
Definition: SubSource.cpp:5313
string GetCountry(void) const
Definition: SubSource.hpp:335
int GetArea(void) const
Definition: SubSource.hpp:342
int GetMaxX(void) const
Definition: SubSource.hpp:340
int GetMaxY(void) const
Definition: SubSource.hpp:341
int GetMinY(void) const
Definition: SubSource.hpp:339
bool SetMaxX(int max_x)
Definition: SubSource.cpp:5348
~CCountryExtreme(void)
Definition: SubSource.cpp:5331
bool SetMinY(int min_y)
Definition: SubSource.cpp:5359
bool PreferTo(const CCountryExtreme *other_block, const string country, const string province, const bool prefer_new) const
Definition: SubSource.cpp:5413
string GetLevel1(void) const
Definition: SubSource.hpp:337
~CCountryLine(void)
Definition: SubSource.cpp:5251
static int ConvertLat(double y, double scale)
Definition: SubSource.cpp:5258
int GetMaxX(void) const
Definition: SubSource.hpp:375
int GetMinX(void) const
Definition: SubSource.hpp:374
int GetY(void) const
Definition: SubSource.hpp:373
int x_ConvertLat(double y)
Definition: SubSource.cpp:5280
double m_Scale
Definition: SubSource.hpp:392
CCountryLine(const string &country_name, double y, double min_x, double max_x, double scale)
Definition: SubSource.cpp:5240
int x_ConvertLon(double x)
Definition: SubSource.cpp:5307
static int ConvertLon(double x, double scale)
Definition: SubSource.cpp:5285
Definition: Date.hpp:53
ECompare Compare(const CDate &date) const
Definition: Date.cpp:83
@ eCompare_before
*this comes first.
Definition: Date.hpp:74
@ eCompare_after
*this comes second.
Definition: Date.hpp:76
int GetClaimedDistance(void) const
Definition: SubSource.hpp:429
string GetClosestProvince(void) const
Definition: SubSource.hpp:418
void SetFullGuess(string guess)
Definition: SubSource.hpp:407
string GetClaimedFull(void) const
Definition: SubSource.hpp:422
string GetClosestWater(void) const
Definition: SubSource.hpp:420
void SetGuessProvince(string guess)
Definition: SubSource.hpp:411
CLatLonCountryId(float lat, float lon)
Definition: SubSource.cpp:5452
int TClassificationFlags
Bitwise OR of "EClassificationFlags".
Definition: SubSource.hpp:442
string GetGuessCountry(void) const
Definition: SubSource.hpp:408
string GetGuessWater(void) const
Definition: SubSource.hpp:412
CLatLonCountryId::TClassificationFlags Classify(string country, string province)
Definition: SubSource.cpp:5461
string GetClosestFull(void) const
Definition: SubSource.hpp:414
int GetLandDistance(void) const
Definition: SubSource.hpp:425
string GetClosestCountry(void) const
Definition: SubSource.hpp:416
string GetGuessProvince(void) const
Definition: SubSource.hpp:410
void SetGuessCountry(string guess)
Definition: SubSource.hpp:409
const CCountryExtreme * x_FindCountryExtreme(const string &country) const
Definition: SubSource.cpp:5850
CLatLonCountryMap(bool is_water)
Definition: SubSource.cpp:5697
size_t x_GetLatStartIndex(int y) const
Definition: SubSource.cpp:5885
static bool s_CompareTwoLinesByLatLonThenCountry(const CCountryLine *line1, const CCountryLine *line2)
Definition: SubSource.cpp:5671
static bool s_CompareTwoLinesByCountry(const CCountryLine *line1, const CCountryLine *line2)
Definition: SubSource.cpp:5656
TCountryLineList m_CountryLineList
Definition: SubSource.hpp:513
static int AdjustAndRoundDistance(double distance, double scale)
Definition: SubSource.cpp:6186
static bool s_CompareTwoLinesByLatLonOnly(const CCountryLine *line1, const CCountryLine *line2)
Definition: SubSource.cpp:5638
const CCountryExtreme * IsNearLatLon(double lat, double lon, double range, double &distance, const string &country, const string &province=kEmptyStr) const
Definition: SubSource.cpp:6116
bool DoCountryBoxesOverlap(const string &country1, const string &country2) const
Definition: SubSource.cpp:6167
const CCountryExtreme * FindClosestToLatLon(double lat, double lon, double range, double &distance)
Definition: SubSource.cpp:6006
int TLatLonAdjustFlags
Bitwise OR of "ELatLonAdjustFlags".
Definition: SubSource.hpp:493
const CCountryExtreme * GuessRegionForLatLon(double lat, double lon, const string &country=kEmptyStr, const string &province=kEmptyStr) const
Definition: SubSource.cpp:5912
bool IsCountryInLatLon(const string &country, double lat, double lon) const
Definition: SubSource.cpp:5795
bool x_InitFromFile(const string &filename)
Definition: SubSource.cpp:5565
bool HaveLatLonForRegion(const string &country) const
Definition: SubSource.cpp:5875
TCountryExtremeList m_CountryExtremes
Definition: SubSource.hpp:519
TCountryLineList m_LatLonSortedList
Definition: SubSource.hpp:514
bool IsClosestToLatLon(const string &country, double lat, double lon, double range, double &distance) const
Definition: SubSource.cpp:6056
void x_InitFromDefaultList(const char *const *list, int num)
Definition: SubSource.cpp:5531
vector< CCountryLine * > TCountryLineList
Definition: SubSource.hpp:510
static CNcbiApplication * Instance(void)
Singleton method.
Definition: ncbiapp.cpp:264
CNcbiEnvironment –.
Definition: ncbienv.hpp:110
CNcbiRegistry –.
Definition: ncbireg.hpp:913
static string FixHostCapitalization(const string &value)
Definition: OrgMod.cpp:965
Callback style template to iterate over a row stream.
Definition: row_reader.hpp:358
Root class for all serialization exceptions.
Definition: exception.hpp:50
class CStaticArrayMap<> provides access to a static array in much the same way as CStaticArraySet<>,...
Definition: static_map.hpp:175
TBase::const_iterator const_iterator
Definition: static_map.hpp:179
const_iterator find(const key_type &key) const
Return a const_iterator pointing to the specified element, or to the end if the element is not found.
Definition: static_set.hpp:680
const_iterator end() const
Return the end of the controlled sequence.
Definition: static_set.hpp:647
class CStaticArrayMap<> is an array adaptor that provides an STLish interface to statically-defined a...
Definition: static_map.hpp:105
TBase::const_iterator const_iterator
Definition: static_map.hpp:109
static bool IsISOFormatDate(const string &orig_date)
Definition: SubSource.cpp:816
static string GetCollectionDateProblem(const string &date_string)
static bool NCBI_UseGeoLocNameForCountry(void)
Definition: SubSource.cpp:94
static string FixTissueTypeCapitalization(const string &value)
Definition: SubSource.cpp:4983
static string FixLatLonPrecision(const string &orig)
Definition: SubSource.cpp:1304
static string x_RemoveIsoTime(const string &orig_date)
Definition: SubSource.cpp:804
static string x_ParseDateRangeWithDelimiter(const string &orig_date, CTempString delim)
Definition: SubSource.cpp:596
static string FixSexQualifierValue(const string &value)
Definition: SubSource.cpp:2527
static bool IsISOFormatTime(const string &orig_time, int &hour, int &min, int &sec, bool require_time_zone=true)
Definition: SubSource.cpp:644
@ eDateFormatFlag_bad_format
Definition: SubSource.hpp:112
@ eDateFormatFlag_in_future
Definition: SubSource.hpp:113
@ eDateFormatFlag_ok
Definition: SubSource.hpp:111
@ eDateFormatFlag_out_of_order
Definition: SubSource.hpp:114
static TSubtype GetSubtypeValue(const string &str, EVocabulary vocabulary=eVocabulary_raw)
Definition: SubSource.cpp:128
static bool x_MeetsCommonChromosomeLinkageGroupPlasmidNameRules(const string &value, const string &taxname)
Definition: SubSource.cpp:2805
static bool IsValidSubtypeName(const string &str, EVocabulary vocabulary=eVocabulary_raw)
Definition: SubSource.cpp:157
static string FixLatLonFormat(string orig_lat_lon, bool guess=false)
Definition: SubSource.cpp:1862
static unique_ptr< CLatLonCountryMap > m_LatLonWaterMap
Definition: SubSource.hpp:248
static bool IsPlasmidNameValid(const string &value, const string &taxname)
Definition: SubSource.cpp:2871
~CSubSource(void)
Definition: SubSource.cpp:63
static bool x_IsFixableIsoDate(const string &orig_date)
Definition: SubSource.cpp:779
static CRef< CDate > GetDateFromISODate(const string &orig_date)
Definition: SubSource.cpp:831
static string FixIsolationSourceCapitalization(const string &value)
Definition: SubSource.cpp:4956
static bool HasCultureNotes(const string &value)
Definition: SubSource.cpp:5175
static bool IsValidSexQualifierValue(const string &value)
Definition: SubSource.cpp:2488
static string FixCellTypeCapitalization(const string &value)
Definition: SubSource.cpp:4876
static vector< string > x_GetDateTokens(const string &orig_date)
Definition: SubSource.cpp:852
void GetLabel(string *str) const
Definition: SubSource.cpp:101
static bool IsMultipleValuesAllowed(TSubtype)
Definition: SubSource.cpp:208
@ eLatLonCountryErr_None
Definition: SubSource.hpp:190
@ eLatLonCountryErr_Value
Definition: SubSource.hpp:194
@ eLatLonCountryErr_State
Definition: SubSource.hpp:192
@ eLatLonCountryErr_Water
Definition: SubSource.hpp:193
@ eLatLonCountryErr_Country
Definition: SubSource.hpp:191
static CLatLonCountryId * x_CalculateLatLonId(float lat_value, float lon_value, string country, string province)
Definition: SubSource.cpp:1912
static bool IsISOFormatDateOnly(const string &date)
Definition: SubSource.cpp:739
static bool IsDayValueOkForMonth(int day, int month, int year)
Determine whether day number could occur in month.
Definition: SubSource.cpp:266
static bool IsAltitudeValid(const string &value)
Definition: SubSource.cpp:2653
static string ValidateLatLonCountry(const string &countryname, string &lat_lon, bool check_state, ELatLonCountryErr &errcode)
Definition: SubSource.cpp:2101
static string FixDateFormat(const string &orig_date)
Attempt to fix the format of the date. Returns a blank if the format of the date cannot be determined.
Definition: SubSource.cpp:620
void AutoFix()
Definition: SubSource.cpp:5103
static string CheckCellLine(const string &cell_line, const string &organism)
Definition: SubSource.cpp:2955
static string MakeLatLon(double lat_value, double lon_value, int lat_precision=2, int lon_precision=2)
Definition: SubSource.cpp:1890
void FixCapitalization()
Definition: SubSource.cpp:5049
@ eVocabulary_insdc
Definition: SubSource.hpp:83
static bool IsCollectionDateAfterTime(const string &collection_date, time_t t, bool &bad_format)
Definition: SubSource.cpp:414
static size_t CheckDateFormat(const string &date_string)
Definition: SubSource.cpp:505
static string x_FormatWithPrecision(double val, int precision)
Definition: SubSource.cpp:2682
static string GetSubtypeName(CSubSource::TSubtype stype, EVocabulary vocabulary=eVocabulary_raw)
Definition: SubSource.cpp:185
static int x_GetPrecision(const string &num_str)
Definition: SubSource.cpp:2671
static bool NeedsNoText(const TSubtype &subtype)
Definition: SubSource.cpp:233
static bool IsEndogenousVirusNameValid(const string &value)
Definition: SubSource.cpp:2753
static bool IsChromosomeNameValid(const string &value, const string &taxname)
Definition: SubSource.cpp:2846
static bool x_GenericRepliconNameValid(const string &value)
Definition: SubSource.cpp:2727
static void IsCorrectLatLonFormat(string lat_lon, bool &format_correct, bool &precision_correct, bool &lat_in_range, bool &lon_in_range, double &lat_value, double &lon_value)
Definition: SubSource.cpp:1237
static CRef< CDate > DateFromCollectionDate(const string &str) THROWS((CException))
Definition: SubSource.cpp:287
static bool IsSegmentValid(const string &value)
Definition: SubSource.cpp:2747
static string FixDevStageCapitalization(const string &value)
Definition: SubSource.cpp:4855
static unique_ptr< CLatLonCountryMap > m_LatLonCountryMap
Definition: SubSource.hpp:247
static bool IsLinkageGroupNameValid(const string &value, const string &taxname)
Definition: SubSource.cpp:2859
static string FixAltitude(const string &value)
Definition: SubSource.cpp:2690
static bool IsDiscouraged(const TSubtype subtype)
Definition: SubSource.cpp:247
static void RemoveCultureNotes(string &value, bool is_species_level=true)
Definition: SubSource.cpp:5192
static string FixLabHostCapitalization(const string &value)
Definition: SubSource.cpp:5010
static void IsCorrectDateFormat(const string &date_string, bool &bad_format, bool &in_future)
Definition: SubSource.cpp:454
static void DetectDateFormat(const string &orig_date, bool &ambiguous, bool &day_first)
Definition: SubSource.cpp:1176
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
CTimeException –.
Definition: ncbitime.hpp:2076
CTime –.
Definition: ncbitime.hpp:296
const_iterator end() const
Definition: map.hpp:152
void clear()
Definition: map.hpp:169
const_iterator find(const key_type &key) const
Definition: map.hpp:153
Definition: map.hpp:338
const_iterator find(const key_type &key) const
Definition: set.hpp:137
static uch flags
const char * file_name[]
static void check_state(const char name[], prfunc print, int erc)
Definition: done_handling.c:80
#define test(a, b, c, d, e)
Definition: numeric.c:170
static char line1[1024 *16]
Definition: t0016.c:98
static char line2[1024 *16]
Definition: t0016.c:99
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
#define check(s)
Definition: describecol2.c:21
static char precision
Definition: genparams.c:28
static SQLCHAR output[256]
Definition: print.c:5
static const char * str(char *buf, int n)
Definition: stats.c:84
static HENV env
Definition: transaction2.c:38
static char tmp[3200]
Definition: utf8.c:42
const CNcbiEnvironment & GetEnvironment(void) const
Get the application's cached environment.
const CNcbiRegistry & GetConfig(void) const
Get the application's cached configuration parameters (read-only).
constexpr size_t ArraySize(const Element(&)[Size])
Definition: ncbimisc.hpp:1532
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
Definition: ncbimisc.hpp:1508
string
Definition: cgiapp.hpp:687
#define NULL
Definition: ncbistd.hpp:225
#define ERR_POST_X(err_subcode, message)
Error posting with default error code and given error subcode.
Definition: ncbidiag.hpp:550
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
#define NCBI_CATCH(message)
Catch CExceptions as well. This macro is deprecated - use *_X or *_XX variant instead of it.
Definition: ncbiexpt.hpp:580
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1191
#define THROWS(x)
Definition: ncbiexpt.hpp:75
@ eUnknown
Definition: app_popup.hpp:72
#define ENUM_METHOD_NAME(EnumName)
Definition: serialbase.hpp:994
static CRef< ILineReader > New(const string &filename)
Return a new ILineReader object corresponding to the given filename, taking "-" (but not "....
Definition: line_reader.cpp:49
virtual bool AtEOF(void) const =0
Indicates (negatively) whether there is any more input.
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
virtual string GetString(const string &section, const string &name, const string &default_value, TFlags flags=0) const
Get the parameter string value.
Definition: ncbireg.cpp:321
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
std::string CStringUTF8
Definition: ncbistl.hpp:254
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
static string DoubleToString(double value, int precision=-1, TNumToStringFlags flags=0)
Convert double to string.
Definition: ncbistr.hpp:5187
#define kEmptyStr
Definition: ncbistr.hpp:123
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3461
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2993
static bool EndsWith(const CTempString str, const CTempString end, ECase use_case=eCase)
Check if a string ends with a specified suffix value.
Definition: ncbistr.hpp:5430
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
static double StringToDouble(const CTempStringEx str, TStringToNumFlags flags=0)
Convert string to double.
Definition: ncbistr.cpp:1387
#define NPOS
Definition: ncbistr.hpp:133
Uint4 TUnicodeSymbol
Unicode character.
Definition: ncbistr.hpp:141
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
Definition: ncbistr.cpp:3201
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
static EEncoding GuessEncoding(const CTempString &src)
Guess the encoding of the C/C++ string.
Definition: ncbistr.cpp:6691
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2891
static string Join(const TContainer &arr, const CTempString &delim)
Join strings using the specified delimiter.
Definition: ncbistr.hpp:2697
static string ParseEscapes(const CTempString str, EEscSeqRange mode=eEscSeqRange_Standard, char user_char='?')
Parse C-style escape sequences in the specified string.
Definition: ncbistr.cpp:4793
static bool EqualCase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-sensitive equality of a substring with another string.
Definition: ncbistr.hpp:5325
const char * data(void) const
Return a pointer to the array represented.
Definition: tempstr.hpp:313
static string & Replace(const string &src, const string &search, const string &replace, string &dst, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3314
static int Compare(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Compare of a substring with another string.
Definition: ncbistr.hpp:5297
static CStringUTF8 AsUTF8(const CTempString &src, EEncoding encoding, EValidate validate=eNoValidate)
Convert into UTF8 from a C/C++ string.
Definition: ncbistr.hpp:3889
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
static void TrimSuffixInPlace(string &str, const CTempString suffix, ECase use_case=eCase)
Trim suffix from a string (in-place)
Definition: ncbistr.cpp:3278
static bool SplitInTwo(const CTempString str, const CTempString delim, string &str1, string &str2, TSplitFlags flags=0)
Split a string into two pieces using the specified delimiters.
Definition: ncbistr.cpp:3554
size_type length(void) const
Return the length of the represented array.
Definition: tempstr.hpp:320
static string Sanitize(CTempString str, TSS_Flags flags=fSS_print)
Sanitize a string, allowing only specified classes of characters.
Definition: ncbistr.hpp:2876
static TUnicodeSymbol Decode(const char *&src)
Convert sequence of UTF8 code units into Unicode code point.
Definition: ncbistr.hpp:5662
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5353
static void TrimPrefixInPlace(string &str, const CTempString prefix, ECase use_case=eCase)
Trim prefix from a string (in-place)
Definition: ncbistr.cpp:3242
ECase
Which type of string comparison.
Definition: ncbistr.hpp:1204
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
size_type find(const CTempString match, size_type pos=0) const
Find the first instance of the entire matching string within the current string, beginning at an opti...
Definition: tempstr.hpp:655
static bool Equal(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Test for equality of a substring with another string.
Definition: ncbistr.hpp:5384
static string & ReplaceInPlace(string &src, const string &search, const string &replace, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3405
static string & ToUpper(string &str)
Convert string to upper case – string& version.
Definition: ncbistr.cpp:424
static string TruncateSpaces(const string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string.
Definition: ncbistr.cpp:3186
static string & ToLower(string &str)
Convert string to lower case – string& version.
Definition: ncbistr.cpp:405
@ fConvErr_NoThrow
Do not throw an exception on error.
Definition: ncbistr.hpp:285
@ fSplit_Tokenize
All delimiters are merged and trimmed, to get non-empty tokens only.
Definition: ncbistr.hpp:2508
@ eReverseSearch
Search in a backward direction.
Definition: ncbistr.hpp:1947
@ eTrunc_Both
Truncate spaces at both begin and end of string.
Definition: ncbistr.hpp:2242
@ eTrunc_Begin
Truncate leading spaces only.
Definition: ncbistr.hpp:2240
@ eNocase
Case insensitive compare.
Definition: ncbistr.hpp:1206
int DaysInMonth(void) const
Get number of days in the month.
Definition: ncbitime.cpp:1198
time_t GetTimeT(void) const
Get time in time_t format.
Definition: ncbitime.cpp:1395
static int MonthNameToNum(const string &month)
Get numerical value of the month by name.
Definition: ncbitime.cpp:1211
static string MonthNumToName(int month, ENameFormat format=eFull)
Get name of the month by numerical value.
Definition: ncbitime.cpp:1229
@ eAbbr
Use abbreviated name.
Definition: ncbitime.hpp:319
const TAttrib & GetAttrib(void) const
Get the Attrib member data.
Definition: SubSource_.hpp:397
TSubtype GetSubtype(void) const
Get the Subtype member data.
Definition: SubSource_.hpp:310
bool IsSetSubtype(void) const
Check if a value has been assigned to Subtype data member.
Definition: SubSource_.hpp:291
void ResetName(void)
Reset Name data member.
Definition: SubSource_.cpp:101
TName & SetName(void)
Assign a value to Name data member.
Definition: SubSource_.hpp:373
const TName & GetName(void) const
Get the Name member data.
Definition: SubSource_.hpp:350
bool IsSetAttrib(void) const
Attribution/source of this name. Check if a value has been assigned to Attrib data member.
Definition: SubSource_.hpp:385
bool IsSetName(void) const
Check if a value has been assigned to Name data member.
Definition: SubSource_.hpp:338
@ eSubtype_collection_date
DD-MMM-YYYY format.
Definition: SubSource_.hpp:114
@ eSubtype_fwd_primer_seq
sequence (possibly more than one; semicolon-separated)
Definition: SubSource_.hpp:117
@ eSubtype_lat_lon
+/- decimal degrees
Definition: SubSource_.hpp:113
@ eSubtype_collected_by
name of person who collected the sample
Definition: SubSource_.hpp:115
@ eSubtype_rev_primer_seq
sequence (possibly more than one; semicolon-separated)
Definition: SubSource_.hpp:118
@ eSubtype_environmental_sample
Definition: SubSource_.hpp:111
@ eSubtype_identified_by
name of person who identified the sample
Definition: SubSource_.hpp:116
void SetYear(TYear value)
Assign a value to Year data member.
Definition: Date_std_.hpp:435
void SetMonth(TMonth value)
Assign a value to Month data member.
Definition: Date_std_.hpp:482
TStd & SetStd(void)
Select the variant.
Definition: Date_.cpp:115
void SetDay(TDay value)
Assign a value to Day data member.
Definition: Date_std_.hpp:529
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
where both of them are integers Note
where boath are integers</td > n< td ></td > n</tr > n< tr > n< td > tse</td > n< td > optional</td > n< td > String</td > n< td class=\"description\"> TSE option controls what blob is orig
FILE * file
static int input()
int i
int len
static char * subname
Definition: mdb_load.c:26
range(_Ty, _Ty) -> range< _Ty >
std::integral_constant< ncbi::NStr::ECase, ncbi::NStr::eCase > tagStrCase
constexpr auto sort(_Init &&init)
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
#define fabs(v)
Definition: ncbi_dispd.c:46
unsigned int a
Definition: ncbi_localip.c:102
EIPRangeType t
Definition: ncbi_localip.c:101
int isalpha(Uchar c)
Definition: ncbictype.hpp:61
int isspace(Uchar c)
Definition: ncbictype.hpp:69
int isalnum(Uchar c)
Definition: ncbictype.hpp:62
int isdigit(Uchar c)
Definition: ncbictype.hpp:64
int toupper(Uchar c)
Definition: ncbictype.hpp:73
int isprint(Uchar c)
Definition: ncbictype.hpp:67
int ispunct(Uchar c)
Definition: ncbictype.hpp:68
@ eYear
Definition: ncbitime.cpp:2753
@ eDay
Definition: ncbitime.cpp:2755
Defines: CTimeFormat - storage class for time format.
T max(T x_, T y_)
T min(T x_, T y_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
static int match(register const pcre_uchar *eptr, register const pcre_uchar *ecode, const pcre_uchar *mstart, int offset_top, match_data *md, eptrblock *eptrb, unsigned int rdepth)
Definition: pcre_exec.c:513
static BOOL number
Definition: pcregrep.c:193
static const char * suffix[]
Definition: pcregrep.c:408
Uint4 TFieldNo
Field number (zero based)
Definition: row_reader.hpp:53
Generic utility macros and templates for exploring NCBI objects.
#define BEGIN_COMMA_END(container)
#define R(t)
#define row(bind, expected)
Definition: string_bind.c:73
Template structure SStaticPair is a simplified replacement of STL pair<>. Main reason of introducing this...
Definition: static_set.hpp:60
Definition: type.c:6
int g(Seg_Gsm *spe, Seq_Mtf *psm, Thd_Gsm *tdg)
Definition: thrddgri.c:44
else result
Definition: token2.c:20
string g_FindDataFile(const CTempString &name, CDirEntry::EType type=CDirEntry::eFile)
Look for an NCBI application data file or directory of the given name and type; in general,...
Definition: util_misc.cpp:139
static const char * type_name(CS_INT value)
Definition: will_convert.c:122
Modified on Wed Apr 17 13:10:21 2024 by modify_doxy.py rev. 669887