NCBI C++ ToolKit
utilfun.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: utilfun.cpp 102982 2024-08-15 12:44:06Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * File Name: utilfun.cpp
27  *
28  * Author: Karl Sirotkin, Hsiu-Chuan Chen
29  *
30  * File Description:
31  * Utility functions for parser and indexing.
32  *
33  */
34 
35 #include <ncbi_pch.hpp>
36 #include <corelib/ncbitime.hpp>
37 
38 #include "ftacpp.hpp"
39 
40 #include <corelib/ncbistr.hpp>
41 #include <objmgr/scope.hpp>
43 #include <objects/seq/MolInfo.hpp>
45 #include <corelib/tempstr.hpp>
46 
47 #include "index.h"
48 
49 #include "ftaerr.hpp"
50 #include "indx_def.h"
51 #include "utilfun.h"
52 
53 #ifdef THIS_FILE
54 # undef THIS_FILE
55 #endif
56 #define THIS_FILE "utilfun.cpp"
57 
59 
61 
63 {
64  static CScope scope(*CObjectManager::GetInstance());
65  return scope;
66 }
67 
68 
/* Keyword tables. Each table ends with a nullptr entry because the
 * matching helpers in this file walk an array until they reach a null
 * pointer.
 */
/* Keywords that set the EST flag in fta_keywords_check(). */
69 static const char* ParFlat_EST_kw_array[] = {
70  "EST",
71  "EST PROTO((expressed sequence tag)",
72  "expressed sequence tag",
73  "EST (expressed sequence tag)",
74  "EST (expressed sequence tags)",
75  "EST(expressed sequence tag)",
76  "transcribed sequence fragment",
77  nullptr
78 };
79 
/* Keywords that set the GSS flag in fta_keywords_check(). */
80 static const char* ParFlat_GSS_kw_array[] = {
81  "GSS",
82  "GSS (genome survey sequence)",
83  "trapped exon",
84  nullptr
85 };
86 
/* Keywords that set the STS flag in fta_keywords_check(). */
87 static const char* ParFlat_STS_kw_array[] = {
88  "STS",
89  "STS(sequence tagged site)",
90  "STS (sequence tagged site)",
91  "STS sequence",
92  "sequence tagged site",
93  nullptr
94 };
95 
/* Keyword that sets the HTC flag in fta_keywords_check(). */
96 static const char* ParFlat_HTC_kw_array[] = {
97  "HTC",
98  nullptr
99 };
100 
/* Keyword that sets the FLI_CDNA flag in fta_keywords_check(). */
101 static const char* ParFlat_FLI_kw_array[] = {
102  "FLI_CDNA",
103  nullptr
104 };
105 
/* Keyword that sets the WGS flag in fta_keywords_check(). */
106 static const char* ParFlat_WGS_kw_array[] = {
107  "WGS",
108  nullptr
109 };
110 
/* Keywords that set the MGA flag in fta_keywords_check(). */
111 static const char* ParFlat_MGA_kw_array[] = {
112  "MGA",
113  "CAGE (Cap Analysis Gene Expression)",
114  "5'-SAGE",
115  nullptr
116 };
117 
/* Additional MGA-related keywords.
 * NOTE(review): no use of this table is visible in this part of the
 * file; confirm where it is consumed before changing it.
 */
118 static const char* ParFlat_MGA_more_kw_array[] = {
119  "CAGE (Cap Analysis Gene Expression)",
120  "5'-SAGE",
121  "5'-end tag",
122  "unspecified tag",
123  "small RNA",
124  nullptr
125 };
126 
127 /* Any change of contents of next array below requires proper
128  * modifications in function fta_tsa_keywords_check().
129  */
/* TSA keywords: used by fta_is_tsa_keyword() (case-insensitive lookup)
 * and by fta_keywords_check(). Terminated by nullptr.
 */
130 static const char* ParFlat_TSA_kw_array[] = {
131  "TSA",
132  "Transcriptome Shotgun Assembly",
133  nullptr
134 };
135 
136 /* Any change of contents of next array below requires proper
137  * modifications in function fta_tls_keywords_check().
138  */
/* TLS keywords: used by fta_is_tls_keyword() and fta_keywords_check(). */
139 static const char* ParFlat_TLS_kw_array[] = {
140  "TLS",
141  "Targeted Locus Study",
142  nullptr
143 };
144 
145 /* Any change of contents of next 2 arrays below requires proper
146  * modifications in function fta_tpa_keywords_check().
147  */
/* TPA keywords: used by fta_is_tpa_keyword() and fta_keywords_check().
 * NOTE(review): the TPA keyword check branches on index values 0..7,
 * which presumably are positions in this table - keep the order stable
 * and confirm against that function.
 */
148 static const char* ParFlat_TPA_kw_array[] = {
149  "TPA",
150  "THIRD PARTY ANNOTATION",
151  "THIRD PARTY DATA",
152  "TPA:INFERENTIAL",
153  "TPA:EXPERIMENTAL",
154  "TPA:REASSEMBLY",
155  "TPA:ASSEMBLY",
156  "TPA:SPECIALIST_DB",
157  nullptr
158 };
159 
/* Subset of the TPA keywords.
 * NOTE(review): presumably the keywords stripped from a record, judging
 * by the name; no use is visible in this part of the file.
 */
160 static const char* ParFlat_TPA_kw_array_to_remove[] = {
161  "TPA",
162  "THIRD PARTY ANNOTATION",
163  "THIRD PARTY DATA",
164  nullptr
165 };
166 
/* Keyword that sets the ENV flag in fta_keywords_check(). */
167 static const char* ParFlat_ENV_kw_array[] = {
168  "ENV",
169  nullptr
170 };
171 
/* MAG keywords.
 * NOTE(review): no use of this table is visible in this part of the file.
 */
172 static const char* ParFlat_MAG_kw_array[] = {
173  "Metagenome Assembled Genome",
174  "MAG",
175  nullptr
176 };
177 
178 /**********************************************************/
// Returns the decimal representation of "m"; negative values get a
// leading '-'.
static string FTAitoa(Int4 m)
{
    // The previous hand-rolled digit loop negated "m" first, which
    // overflows for the most negative Int4 value. std::to_string yields
    // the same text for every other input and handles that one too.
    return std::to_string(m);
}
195 
196 /**********************************************************/
198 {
199  Int4 num1;
200  Int4 num2;
201 
203 
204  for (const string& acc : extra_accs) {
205  if (acc.empty())
206  continue;
207 
208  size_t dash = acc.find('-');
209  if (dash == string::npos) {
210  ret.push_back(acc);
211  continue;
212  }
213 
214  string first(acc.begin(), acc.begin() + dash),
215  last(acc.begin() + dash + 1, acc.end());
216  size_t acclen = first.size();
217 
218  const Char* p = first.c_str();
219  for (; (*p >= 'A' && *p <= 'Z') || *p == '_';)
220  p++;
221 
222  size_t preflen = p - first.c_str();
223 
224  string prefix = first.substr(0, preflen);
225  while (*p == '0')
226  p++;
227 
228  const Char* q;
229  for (q = p; *p >= '0' && *p <= '9';)
230  p++;
231  num1 = atoi(q);
232 
233  for (p = last.c_str() + preflen; *p == '0';)
234  p++;
235  for (q = p; *p >= '0' && *p <= '9';)
236  p++;
237  num2 = atoi(q);
238 
239  ret.push_back(first);
240 
241  if (num1 == num2)
242  continue;
243 
244  for (num1++; num1 <= num2; num1++) {
245  string new_acc = prefix;
246  string num_str = FTAitoa(num1);
247  size_t j = acclen - preflen - num_str.size();
248 
249  for (size_t i = 0; i < j; i++)
250  new_acc += '0';
251 
252  new_acc += num_str;
253  ret.push_back(new_acc);
254  }
255  }
256 
257  ret.swap(hist);
258 }
259 
// True for the characters accepted in an accession prefix: upper-case
// ASCII letters and the underscore.
static bool sIsPrefixChar(char c)
{
    if (c == '_') {
        return true;
    }
    return c >= 'A' && c <= 'Z';
}
264 /**********************************************************/
265 bool ParseAccessionRange(list<string>& tokens, unsigned skip)
266 {
267  bool bad = false;
268 
269  if (tokens.empty()) {
270  return true;
271  }
272 
273  if (tokens.size() <= skip + 1) {
274  return true;
275  }
276 
277 
278  auto it = tokens.begin();
279  if (skip) {
280  advance(it, skip);
281  }
282 
283  for (; it != tokens.end(); ++it) {
284  const auto& token = *it;
285  if (token.empty()) {
286  continue;
287  }
288 
290  if (! NStr::SplitInTwo(token, "-", first, last)) {
291  continue;
292  }
293  if (first.size() != last.size()) {
294  bad = true;
295  break;
296  }
297 
298  auto first_it =
299  find_if_not(begin(first), end(first), sIsPrefixChar);
300 
301  if (first_it == first.end()) {
302  bad = true;
303  break;
304  }
305 
306 
307  auto last_it =
308  find_if_not(begin(last), end(last), sIsPrefixChar);
309  if (last_it == last.end()) {
310  bad = true;
311  break;
312  }
313 
314  auto prefixLength = distance(first.begin(), first_it);
315  if (prefixLength != distance(last.begin(), last_it) ||
316  ! NStr::EqualCase(first, 0, prefixLength, last.substr(0, prefixLength))) {
317  ErrPostEx(SEV_REJECT, ERR_ACCESSION_2ndAccPrefixMismatch, "Inconsistent prefix found in secondary accession range \"%s\".", token.c_str());
318  break;
319  }
320 
321  auto num1 = NStr::StringToInt(first.substr(prefixLength));
322  auto num2 = NStr::StringToInt(last.substr(prefixLength));
323 
324  if (num2 <= num1) {
325  ErrPostEx(SEV_REJECT, ERR_ACCESSION_Invalid2ndAccRange, "Invalid start/end values in secondary accession range \"%s\".", token.c_str());
326  }
327 
328  *it = first;
329  it = tokens.insert(it, "-");
330  it = tokens.insert(it, last);
331  }
332 
333  if (bad) {
334  ErrPostEx(SEV_REJECT, ERR_ACCESSION_Invalid2ndAccRange, "Incorrect secondary accession range provided: \"%s\".", it->c_str());
335  }
336  return false;
337 }
338 
339 
// True when "c" is an upper-case ASCII letter, the only kind of
// character allowed to start an accession prefix.
inline bool IsLeadPrefixChar(char c)
{
    return ! (c < 'A' || c > 'Z');
}
// True when "c" is an ASCII decimal digit.
inline bool IsDigit(char c)
{
    return ! (c < '0' || c > '9');
}
348 /**********************************************************/
349 bool ParseAccessionRange(TokenStatBlkPtr tsbp, unsigned skip)
350 {
351  auto& tokens = tsbp->list;
352  if (tokens.empty())
353  return true;
354  if ((int)skip >= tsbp->num)
355  return true;
356 
357  auto tbp = tokens.begin();
358  if (skip > 0)
359  advance(tbp, skip);
360 
361  bool bad = false, msg_issued = false;
362  for (; tbp != tokens.end(); ++tbp) {
363  const string& token = *tbp;
364  string_view tok_view = token;
365  if (token.empty())
366  continue;
367  size_t dash = token.find('-');
368  if (dash == string::npos)
369  continue;
370  if (dash == 0 || tok_view.size() != (dash + 1 + dash)) {
371  bad = true;
372  break;
373  }
374 
375  string_view first(tok_view.substr(0, dash));
376  string_view last(tok_view.substr(dash + 1));
377  if (! IsLeadPrefixChar(first.front()) || ! IsLeadPrefixChar(last.front())) {
378  bad = true;
379  break;
380  }
381 
382  auto first_it = find_if_not(first.begin(), first.end(), sIsPrefixChar);
383  if (first_it == first.end() || ! IsDigit(*first_it)) {
384  bad = true;
385  break;
386  }
387  auto last_it = find_if_not(last.begin(), last.end(), sIsPrefixChar);
388  if (last_it == last.end() || ! IsDigit(*last_it)) {
389  bad = true;
390  break;
391  }
392 
393  size_t preflen = first_it - first.begin();
394  size_t preflen2 = last_it - last.begin();
395  string_view first_prefix = first.substr(0, preflen);
396  string_view last_prefix = last.substr(0, preflen2);
397  if (first_prefix != last_prefix) {
398  msg_issued = true;
399  ErrPostEx(SEV_REJECT, ERR_ACCESSION_2ndAccPrefixMismatch, "Inconsistent prefix found in secondary accession range \"%s\".", token.c_str());
400  bad = true;
401  break;
402  }
403 
404  string_view first_digits = first.substr(preflen);
405  string_view last_digits = last.substr(preflen);
406  if (! all_of(first_digits.begin(), first_digits.end(), IsDigit) ||
407  ! all_of(last_digits.begin(), last_digits.end(), IsDigit)) {
408  bad = true;
409  break;
410  }
411 
412  auto num1 = NStr::StringToInt(first_digits, NStr::fConvErr_NoThrow);
413  auto num2 = NStr::StringToInt(last_digits, NStr::fConvErr_NoThrow);
414  if (num2 < num1) {
415  msg_issued = true;
416  ErrPostEx(SEV_REJECT, ERR_ACCESSION_Invalid2ndAccRange, "Invalid start/end values in secondary accession range \"%s\".", token.c_str());
417  bad = true;
418  break;
419  }
420 
421  // cut in half
422  string tmp(last);
423  tbp->resize(dash);
424  tbp = tokens.insert_after(tbp, "-");
425  tbp = tokens.insert_after(tbp, tmp);
426  tsbp->num += 2;
427  }
428  if (! bad)
429  return true;
430  if (! msg_issued) {
431  ErrPostEx(SEV_REJECT, ERR_ACCESSION_Invalid2ndAccRange, "Incorrect secondary accession range provided: \"%s\".", tbp->c_str());
432  }
433  return false;
434 }
435 
436 /**********************************************************
437  *
438  * TokenStatBlkPtr TokenString(str, delimiter):
439  *
440  * Parsing string "str" by delimiter or tab key, blank.
441  * Parsing stop at newline ('\n') or end of string ('\0').
442  * Return a statistics of link list token.
443  *
444  **********************************************************/
445 unique_ptr<TokenStatBlk> TokenString(const char* str, Char delimiter)
446 {
447  const char* bptr;
448  const char* ptr;
449  Int2 num;
450  TokenStatBlkPtr token;
451 
452  token = new TokenStatBlk;
453  auto tail = token->list.before_begin();
454 
455  /* skip first several delimiters if any existed
456  */
457  for (ptr = str; *ptr == delimiter;)
458  ptr++;
459 
460  for (num = 0; *ptr != '\0' && *ptr != '\r' && *ptr != '\n';) {
461  for (bptr = ptr; *ptr != delimiter && *ptr != '\r' && *ptr != '\n' &&
462  *ptr != '\t' && *ptr != ' ' && *ptr != '\0';)
463  ptr++;
464 
465  tail = token->list.insert_after(tail, string(bptr, ptr));
466  num++;
467 
468  while (*ptr == delimiter || *ptr == '\t' || *ptr == ' ')
469  ptr++;
470  }
471 
472  token->num = num;
473 
474  return unique_ptr<TokenStatBlk>(token);
475 }
476 
477 /**********************************************************
478  *
479  * Int2 fta_StringMatch(array, text):
480  *
481  * Return array position of the matched length
482  * of string in array.
483  * Return -1 if no match.
484  *
485  **********************************************************/
// Walks the nullptr-terminated "array" and returns the index of the
// entry that satisfies the match test against "text", or -1 when the
// end of the array is reached without a match.
// NOTE(review): the line holding the match condition (between the loop
// header and "return i") is not visible in this view, so the exact
// comparison cannot be confirmed here - verify against the full source.
486 Int2 fta_StringMatch(const Char** array, string_view text)
487 {
488  Int2 i;
489 
// "i" counts entries while "array" itself is advanced.
490  for (i = 0; *array; i++, array++) {
492  return i;
493  }
494 
495  return -1;
496 }
497 
498 /**********************************************************
499  *
500  * Int2 StringMatchIcase(array, text):
501  *
502  * Return array position of the matched length of
503  * string (ignored case) in array.
504  * Return -1 if no match.
505  *
506  **********************************************************/
// Walks the nullptr-terminated "array" and returns the index of the
// entry that satisfies the match test against "text", or -1 when no
// entry matches.
// NOTE(review): the line holding the match condition (between the
// empty-entry check and "return i") is not visible in this view; the
// header comment says the comparison ignores case - verify against the
// full source.
507 Int2 StringMatchIcase(const Char** array, string_view text)
508 {
509  Int2 i;
510 
511  for (i = 0; *array; i++, array++) {
512  // An empty array entry has length 0 and would compare equal to any other string.
513  // The next 'if' statement skips such entries unless "text" itself is empty.
514  if (! text.empty() && *array[0] == 0)
515  continue;
516 
518  return i;
519  }
520 
521  return -1;
522 }
523 
524 /**********************************************************
525  *
526  * Int2 MatchArrayString(array, text):
527  *
528  * Return array position of the string in the
529  * array.
530  * Return -1 if no match.
531  *
532  **********************************************************/
533 Int2 MatchArrayString(const char** array, const char* text)
534 {
535  Int2 i;
536 
537  if (! text)
538  return (-1);
539 
540  for (i = 0; *array; i++, array++) {
541  if (NStr::Equal(*array, text))
542  return i;
543  }
544 
545  return -1;
546 }
547 
548 /**********************************************************/
550 {
551  Int2 i;
552 
553  if (! text)
554  return (-1);
555 
556  for (i = 0; *array; i++, array++) {
557  // If string from an array is empty its length == 0 and would be equval to any other string
558  // The next 'if' statement will avoid that behavior
559  if (text[0] != 0 && *array[0] == 0)
560  continue;
561 
563  return i;
564  }
565 
566  return -1;
567 }
568 
569 /**********************************************************
570  *
571  * Int2 MatchArraySubString(array, text):
572  *
573  * Return array position of the string in the array
574  * if any array is in the substring of "text".
575  * Return -1 if no match.
576  *
577  **********************************************************/
578 Int2 MatchArraySubString(const Char** array, string_view text)
579 {
580  Int2 i;
581 
582  for (i = 0; *array; i++, array++) {
583  if (NStr::Find(text, *array) != NPOS)
584  return i;
585  }
586 
587  return -1;
588 }
589 
590 /**********************************************************/
// Case-insensitive substring search: returns a pointer to the first
// occurrence of "what" inside "where", or nullptr when there is none.
// Only the ASCII letters A-Z / a-z are compared without regard to case.
// A null or empty argument yields nullptr.
Char* StringIStr(const Char* where, const Char* what)
{
    if (! where || ! what || *where == '\0' || *what == '\0')
        return nullptr;

    // Map upper-case ASCII letters onto lower case; leave the rest alone.
    const auto fold = [](char c) {
        return (c >= 'A' && c <= 'Z') ? static_cast<char>(c + 32) : c;
    };

    const Char* const where_end = where + strlen(where);
    const Char* const what_end  = what + strlen(what);

    const Char* const hit = std::search(where, where_end, what, what_end, [&fold](char a, char b) {
        return fold(a) == fold(b);
    });

    if (hit == where_end)
        return nullptr;
    return const_cast<char*>(hit);
}
621 
622 /**********************************************************/
623 Int2 MatchArrayISubString(const Char** array, string_view text)
624 {
625  Int2 i;
626 
627  for (i = 0; *array; i++, array++) {
628  if (NStr::FindNoCase(text, *array) != NPOS)
629  return i;
630  }
631 
632  return -1;
633 }
634 
635 /**********************************************************
636  *
637  * char* GetBlkDataReplaceNewLine(bptr, eptr,
638  * start_col_data):
639  *
640  * Return a string which replace newline to blank
641  * and skip "XX" line data.
642  *
643  **********************************************************/
644 string GetBlkDataReplaceNewLine(string_view instr, Uint2 indent)
645 {
646  vector<string> lines;
647  NStr::Split(instr, "\n", lines);
648  string replaced;
649  for (auto line : lines) {
650  if (line.empty() || NStr::StartsWith(line, "XX") || line.size() <= indent) {
651  continue;
652  }
653  replaced += line.substr(indent);
654  auto last = line.size() - 1;
655  if (line[last] != '-') {
656  replaced += ' ';
657  } else if (line[last - 1] == ' ') {
658  replaced += ' ';
659  }
660  }
661  NStr::TruncateSpacesInPlace(replaced);
662  return replaced;
663 }
664 
665 
666 /**********************************************************/
// Returns the length "str" would have after dropping every trailing
// ' ', '\n', '\\', ',', ';', '~', '.' and ':' from its first "len"
// characters, i.e. the index just past the last character that is
// not one of those. Returns 0 when "str" is null, "len" is 0 or all
// characters belong to that set.
static size_t SeekLastAlphaChar(const Char* str, size_t len)
{
    if (! str || len == 0)
        return 0;

    const size_t pos = string_view(str, len).find_last_not_of(" \n\\,;~.:");
    return pos == string_view::npos ? 0 : pos + 1;
}
681 
682 /**********************************************************/
684 {
685  size_t ret = SeekLastAlphaChar(str.c_str(), str.size());
686  str = str.substr(0, ret);
687 }
688 
689 /**********************************************************
690  *
691  * void CleanTailNoneAlphaChar(str):
692  *
693  * Delete any trailing ' ', '\n', '\\', ',', ';', '~',
694  * '.', ':' characters.
695  *
696  **********************************************************/
698 {
699  if (! str || *str == '\0')
700  return;
701 
702  size_t last = SeekLastAlphaChar(str, strlen(str));
703  str[last] = '\0';
704 }
705 
706 /**********************************************************/
// Skips the blank-delimited token "ptr" points at and the blanks that
// follow it. Returns a pointer to the start of the next token, or to
// the string terminator when there is no further token. A null "ptr"
// is returned unchanged.
char* PointToNextToken(char* ptr)
{
    if (! ptr)
        return nullptr;

    // Stop at the terminator as well: without this check the scan ran
    // past the end of a string that contains no blank.
    while (*ptr != '\0' && *ptr != ' ')
        ++ptr;
    while (*ptr == ' ')
        ++ptr;

    return ptr;
}
717 
718 /**********************************************************
719  *
720  * char* GetTheCurrentToken(ptr):
721  *
722  * Return the current token (also CleanTailNoneAlphaChar)
723  * which ptr points to; ptr will point to the next token
724  * after the routine returns.
725  *
726  **********************************************************/
// Copies the blank-delimited token at *ptr into a new string obtained
// from StringSave() and advances *ptr to the start of the next token
// (or to the terminator). Returns nullptr, leaving *ptr unchanged, when
// *ptr is null or points at an empty string.
// NOTE(review): the returned buffer is presumably owned by the caller -
// confirm against StringSave().
727 char* GetTheCurrentToken(char** ptr)
728 {
729  char* retptr;
730  char* bptr;
731  char* str;
732 
733  bptr = retptr = *ptr;
734  if (! retptr || *retptr == '\0')
735  return nullptr;
736 
// find the end of the current token
737  while (*retptr != '\0' && *retptr != ' ')
738  retptr++;
739 
740  str = StringSave(string_view(bptr, retptr - bptr));
741 
742  while (*retptr != '\0' && *retptr == ' ') /* skip blanks */
743  retptr++;
744  *ptr = retptr;
745 
// NOTE(review): a statement between the assignment above and the return
// is not visible in this view; the header comment says the token is
// also passed through CleanTailNoneAlphaChar - confirm against the
// full source.
747  return (str);
748 }
749 
/**********************************************************
 *
 *   char* SrchTheChar(bptr, eptr, letter):
 *
 *      Searches the range [bptr, eptr) for "letter".
 *      Returns a pointer to its first occurrence, or
 *   nullptr if the range does not contain it.
 *
 **********************************************************/
char* SrchTheChar(char* bptr, char* eptr, Char letter)
{
    char* const hit = std::find(bptr, eptr, letter);
    return hit != eptr ? hit : nullptr;
}
769 
/**********************************************************
 *
 *   char* SrchTheStr(bptr, eptr, leadstr):
 *
 *      Searches the range [bptr, eptr) for the string
 *   "leadstr".
 *      Returns a pointer to its first occurrence, or
 *   nullptr if the range does not contain it. An empty
 *   "leadstr" matches at "bptr".
 *
 **********************************************************/
char* SrchTheStr(char* bptr, char* eptr, const char* leadstr)
{
    const size_t need = strlen(leadstr);
    if (need == 0)
        return bptr; // an empty pattern is found at the very start

    char* const hit = std::search(bptr, eptr, leadstr, leadstr + need);
    return hit != eptr ? hit : nullptr;
}
789 
790 /**********************************************************/
791 void CpSeqId(InfoBioseqPtr ibp, const CSeq_id& id)
792 {
793  const CTextseq_id* text_id = id.GetTextseq_Id();
794  if (text_id) {
795  if (text_id->IsSetName())
796  ibp->mLocus = text_id->GetName();
797 
798  CRef<CSeq_id> new_id(new CSeq_id);
799  if (text_id->IsSetAccession()) {
800  ibp->mAccNum = text_id->GetAccession();
801 
802  CRef<CTextseq_id> new_text_id(new CTextseq_id);
803  new_text_id->SetAccession(text_id->GetAccession());
804  if (text_id->IsSetVersion())
805  new_text_id->SetVersion(text_id->GetVersion());
806 
807  SetTextId(id.Which(), *new_id, *new_text_id);
808  } else {
809  new_id->Assign(id);
810  }
811 
812  ibp->ids.push_back(new_id);
813  } else {
814  auto pId = Ref(new CSeq_id());
815  pId->Assign(id);
816  ibp->ids.push_back(std::move(pId));
817  }
818 }
819 
820 /**********************************************************
821  *
822  * CRef<CDate_std> get_full_date(s, is_ref, source):
823  *
824  * Get year, month, day and return CRef<CDate_std>.
825  *
826  **********************************************************/
828 {
829  CRef<CDate_std> date;
830 
831  if (! s || *s == '\0')
832  return date;
833 
834  int parse_day = 0;
835  if (isdigit(*s) != 0) {
836  parse_day = atoi(s);
837  s += 3;
838  // should we make at least a token effort of validation (like <32)?
839  }
840 
841  static const vector<string> months{
842  "JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"
843  };
844  CTempString maybe_month(s, 3);
845  auto it = find(months.begin(), months.end(), maybe_month);
846  if (it == months.end()) {
847  char msg[11];
848  StringNCpy(msg, s, 10);
849  msg[10] = '\0';
850  is_ref ? ErrPostEx(
851  SEV_WARNING, ERR_REFERENCE_IllegalDate, "Unrecognized month: %s", msg)
852  : ErrPostEx(
853  SEV_WARNING, ERR_DATE_IllegalDate, "Unrecognized month: %s", msg);
854  return date;
855  }
856  int parse_month = int(it - months.begin()) + 1;
857 
858  s += 4;
859 
860  int parse_year = atoi(s);
861  int cur_year = CCurrentTime().Year();
862  if (1900 <= parse_year && parse_year <= cur_year) {
863  // all set
864  } else if (0 <= parse_year && parse_year <= 99 && '0' <= s[1] && s[1] <= '9') {
865  // insist that short form year has exactly two digits
866  (parse_year < 70) ? (parse_year += 2000) : (parse_year += 1900);
867  } else {
868  if (is_ref) {
869  ErrPostEx(
870  SEV_ERROR, ERR_REFERENCE_IllegalDate, "Illegal year: %d, current year: %d", parse_year, cur_year);
871  } else if (source != Parser::ESource::SPROT || parse_year - cur_year > 1) {
872  ErrPostEx(
873  SEV_WARNING, ERR_DATE_IllegalDate, "Illegal year: %d, current year: %d", parse_year, cur_year);
874  }
875  // treat bad year like bad month above:
876  return date;
877  }
878  date.Reset(new CDate_std);
879  date->SetYear(parse_year);
880  date->SetMonth(parse_month);
881  date->SetDay(parse_day);
882 
883  return date;
884 }
885 
886 /**********************************************************
887  *
888  * int SrchKeyword(ptr, kwl):
889  *
890  * Compare first kwl.len byte in ptr to kwl.str.
891  * Return the position of keyword block array;
892  * return unknown keyword, UNKW, if not found.
893  *
894  * 3-25-93
895  *
896  **********************************************************/
897 int SrchKeyword(const CTempString& ptr, const vector<string>& keywordList)
898 {
899  SIZE_TYPE keywordCount = keywordList.size();
900 
901  for (unsigned i = 0; i < keywordCount; ++i) {
902  if (NStr::StartsWith(ptr, keywordList[i])) {
903  return (int)i;
904  }
905  }
906  return ParFlat_UNKW;
907 }
908 
909 /**********************************************************/
910 bool CheckLineType(char* ptr, Int4 line, const vector<string>& keywordList, bool after_origin)
911 {
912  char* p;
913  Char msg[51];
914 
915  if (after_origin) {
916  for (p = ptr; *p >= '0' && *p <= '9';)
917  p++;
918  if (*p == ' ')
919  return true;
920  }
921 
922  auto keywordCount = keywordList.size();
923  for (unsigned i = 0; i < keywordCount; i++) {
924  auto keyword = keywordList[i];
925  if (StringEquN(ptr, keyword.c_str(), keyword.size()))
926  return true;
927  }
928 
929  StringNCpy(msg, ptr, 50);
930  msg[50] = '\0';
931  p = StringChr(msg, '\n');
932  if (p)
933  *p = '\0';
934  ErrPostEx(SEV_ERROR, ERR_ENTRY_InvalidLineType, "Unknown linetype \"%s\". Line number %d.", msg, line);
935  if (p)
936  *p = '\n';
937 
938  return false;
939 }
940 
941 /**********************************************************
942  *
943  * char* SrchNodeType(entry, type, len):
944  *
945  * Return a memory location of the node which has
946  * the "type".
947  *
948  **********************************************************/
949 char* SrchNodeType(DataBlkPtr entry, Int4 type, size_t* len)
950 {
951  DataBlkPtr temp;
952 
953  temp = TrackNodeType(*entry, (Int2)type);
954  if (temp) {
955  *len = temp->len;
956  return (temp->mOffset);
957  }
958 
959  *len = 0;
960  return nullptr;
961 }
962 
963 char* xSrchNodeType(const DataBlk& entry, Int4 type, size_t* len)
964 {
965  DataBlkPtr temp;
966 
967  temp = TrackNodeType(entry, (Int2)type);
968  if (temp) {
969  *len = temp->len;
970  return (temp->mOffset);
971  }
972 
973  *len = 0;
974  return nullptr;
975 }
976 
977 string xGetNodeData(const DataBlk& entry, int nodeType)
978 {
979  auto tmp = TrackNodeType(entry, (Int2)nodeType);
980  if (! tmp) {
981  return "";
982  }
983  return string(tmp->mOffset, tmp->len);
984 }
985 
986 /**********************************************************
987  *
988  * DataBlkPtr TrackNodeType(entry, type):
989  *
990  * Return a pointer points to the Node which has
991  * the "type".
992  *
993  **********************************************************/
995 {
996  DataBlkPtr temp;
997  EntryBlkPtr ebp;
998 
999  ebp = static_cast<EntryBlk*>(entry.mpData);
1000  temp = ebp->chain;
1001  while (temp && temp->mType != type)
1002  temp = temp->mpNext;
1003 
1004  return (temp);
1005 }
1006 
1007 
1008 const Section* xTrackNodeType(const Entry& entry, int type)
1009 {
1010  for (const Section* sectionPtr : entry.mSections) {
1011  if (sectionPtr->mType == type) {
1012  return sectionPtr;
1013  }
1014  }
1015  return nullptr;
1016 }
1017 
1018 
1019 /**********************************************************/
1021 {
1022  const char* b[4];
1023 
1024  bool kwd_tpa = false;
1025  bool kwd_party = false;
1026  bool kwd_inf = false;
1027  bool kwd_exp = false;
1028  bool kwd_asm = false;
1029  bool kwd_spedb = false;
1030  bool ret = true;
1031 
1032  Int4 j;
1033  Int2 i;
1034 
1035  if (kwds.empty())
1036  return true;
1037 
1038  size_t len = 0;
1039  j = 0;
1040  for (const string& key : kwds) {
1041  if (key.empty())
1042  continue;
1043 
1044  const char* p = key.c_str();
1046  if (i == 0)
1047  kwd_tpa = true;
1048  else if (i == 1 || i == 2)
1049  kwd_party = true;
1050  else if (i == 3)
1051  kwd_inf = true;
1052  else if (i == 4)
1053  kwd_exp = true;
1054  else if (i == 5 || i == 6)
1055  kwd_asm = true;
1056  else if (i == 7)
1057  kwd_spedb = true;
1058  else if (NStr::EqualNocase(p, 0, 3, "TPA")) {
1059  if (p[3] == ':') {
1060  ErrPostEx(SEV_REJECT, ERR_KEYWORD_InvalidTPATier, "Keyword \"%s\" is not a valid TPA-tier keyword.", p);
1061  ret = false;
1062  } else if (p[3] != '\0' && p[4] != '\0') {
1063  ErrPostEx(SEV_WARNING, ERR_KEYWORD_UnexpectedTPA, "Keyword \"%s\" looks like it might be TPA-related, but it is not a recognized TPA keyword.", p);
1064  }
1065  }
1066  if (i > 2 && i < 8 && j < 4) {
1067  b[j] = p;
1068  ++j;
1069  len += key.size() + 1;
1070  }
1071  }
1072 
1073  if (kwd_tpa && ! kwd_party) {
1074  ErrPostStr(SEV_REJECT, ERR_KEYWORD_MissingTPAKeywords, "This TPA-record should have keyword \"Third Party Annotation\" or \"Third Party Data\" in addition to \"TPA\".");
1075  ret = false;
1076  } else if (! kwd_tpa && kwd_party) {
1077  ErrPostStr(SEV_REJECT, ERR_KEYWORD_MissingTPAKeywords, "This TPA-record should have keyword \"TPA\" in addition to \"Third Party Annotation\" or \"Third Party Data\".");
1078  ret = false;
1079  }
1080  if (! kwd_tpa && (kwd_inf || kwd_exp)) {
1081  ErrPostStr(SEV_REJECT, ERR_KEYWORD_MissingTPAKeywords, "This TPA-record should have keyword \"TPA\" in addition to its TPA-tier keyword.");
1082  ret = false;
1083  } else if (kwd_tpa && kwd_inf == false && kwd_exp == false &&
1084  kwd_asm == false && kwd_spedb == false) {
1085  ErrPostStr(SEV_ERROR, ERR_KEYWORD_MissingTPATier, "This TPA record lacks a keyword to indicate which tier it belongs to: experimental, inferential, reassembly or specialist_db.");
1086  }
1087  if (j > 1) {
1088  string buf;
1089  for (i = 0; i < j; i++) {
1090  if (i > 0)
1091  buf += ';';
1092  buf += b[i];
1093  }
1094  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ConflictingTPATiers, "Keywords for multiple TPA tiers exist on this record: \"%s\". A TPA record can only be in one tier.", buf.c_str());
1095  ret = false;
1096  }
1097 
1098  return (ret);
1099 }
1100 
1101 /**********************************************************/
1103 {
1104  bool kwd_tsa = false;
1105  bool kwd_assembly = false;
1106  bool ret = true;
1107  Int2 i;
1108 
1109  if (kwds.empty())
1110  return true;
1111 
1112  for (const string& key : kwds) {
1113  if (key.empty())
1114  continue;
1116  if (i == 0)
1117  kwd_tsa = true;
1118  else if (i == 1)
1119  kwd_assembly = true;
1120  else if (source == Parser::ESource::EMBL &&
1121  NStr::EqualNocase(key, "Transcript Shotgun Assembly"))
1122  kwd_assembly = true;
1123  }
1124 
1125  if (kwd_tsa && ! kwd_assembly) {
1126  ErrPostStr(SEV_REJECT, ERR_KEYWORD_MissingTSAKeywords, "This TSA-record should have keyword \"Transcriptome Shotgun Assembly\" in addition to \"TSA\".");
1127  ret = false;
1128  } else if (! kwd_tsa && kwd_assembly) {
1129  ErrPostStr(SEV_REJECT, ERR_KEYWORD_MissingTSAKeywords, "This TSA-record should have keyword \"TSA\" in addition to \"Transcriptome Shotgun Assembly\".");
1130  ret = false;
1131  }
1132  return (ret);
1133 }
1134 
1135 /**********************************************************/
1137 {
1138  bool kwd_tls = false;
1139  bool kwd_study = false;
1140  bool ret = true;
1141  Int2 i;
1142 
1143  if (kwds.empty())
1144  return true;
1145 
1146  for (const string& key : kwds) {
1147  if (key.empty())
1148  continue;
1150  if (i == 0)
1151  kwd_tls = true;
1152  else if (i == 1)
1153  kwd_study = true;
1154  else if (source == Parser::ESource::EMBL &&
1155  NStr::EqualNocase(key, "Targeted Locus Study"))
1156  kwd_study = true;
1157  }
1158 
1159  if (kwd_tls && ! kwd_study) {
1160  ErrPostStr(SEV_REJECT, ERR_KEYWORD_MissingTLSKeywords, "This TLS-record should have keyword \"Targeted Locus Study\" in addition to \"TLS\".");
1161  ret = false;
1162  } else if (! kwd_tls && kwd_study) {
1163  ErrPostStr(SEV_REJECT, ERR_KEYWORD_MissingTLSKeywords, "This TLS-record should have keyword \"TLS\" in addition to \"Targeted Locus Study\".");
1164  ret = false;
1165  }
1166  return (ret);
1167 }
1168 
1169 /**********************************************************/
1170 bool fta_is_tpa_keyword(const char* str)
1171 {
1172  if (! str || *str == '\0' || MatchArrayIString(ParFlat_TPA_kw_array, str) < 0)
1173  return false;
1174 
1175  return true;
1176 }
1177 
1178 /**********************************************************/
1179 bool fta_is_tsa_keyword(const char* str)
1180 {
1181  if (! str || *str == '\0' || MatchArrayIString(ParFlat_TSA_kw_array, str) < 0)
1182  return false;
1183  return true;
1184 }
1185 
1186 /**********************************************************/
1187 bool fta_is_tls_keyword(const char* str)
1188 {
1189  if (! str || *str == '\0' || MatchArrayIString(ParFlat_TLS_kw_array, str) < 0)
1190  return false;
1191  return true;
1192 }
1193 
1194 /**********************************************************/
1195 void fta_keywords_check(const char* str, bool* estk, bool* stsk, bool* gssk, bool* htck, bool* flik, bool* wgsk, bool* tpak, bool* envk, bool* mgak, bool* tsak, bool* tlsk)
1196 {
1197  if (estk && MatchArrayString(ParFlat_EST_kw_array, str) != -1)
1198  *estk = true;
1199 
1200  if (stsk && MatchArrayString(ParFlat_STS_kw_array, str) != -1)
1201  *stsk = true;
1202 
1203  if (gssk && MatchArrayString(ParFlat_GSS_kw_array, str) != -1)
1204  *gssk = true;
1205 
1206  if (htck && MatchArrayString(ParFlat_HTC_kw_array, str) != -1)
1207  *htck = true;
1208 
1209  if (flik && MatchArrayString(ParFlat_FLI_kw_array, str) != -1)
1210  *flik = true;
1211 
1212  if (wgsk && MatchArrayString(ParFlat_WGS_kw_array, str) != -1)
1213  *wgsk = true;
1214 
1215  if (tpak && MatchArrayString(ParFlat_TPA_kw_array, str) != -1)
1216  *tpak = true;
1217 
1218  if (envk && MatchArrayString(ParFlat_ENV_kw_array, str) != -1)
1219  *envk = true;
1220 
1221  if (mgak && MatchArrayString(ParFlat_MGA_kw_array, str) != -1)
1222  *mgak = true;
1223 
1224  if (tsak && MatchArrayString(ParFlat_TSA_kw_array, str) != -1)
1225  *tsak = true;
1226 
1227  if (tlsk && MatchArrayString(ParFlat_TLS_kw_array, str) != -1)
1228  *tlsk = true;
1229 }
1230 
1231 /**********************************************************/
1233 {
1234  const char** b;
1235 
1236  if (kwds.empty())
1237  return;
1238 
1239  if (tech == CMolInfo::eTech_est)
1241  else if (tech == CMolInfo::eTech_sts)
1243  else if (tech == CMolInfo::eTech_survey)
1245  else if (tech == CMolInfo::eTech_htc)
1247  else if (tech == CMolInfo::eTech_fli_cdna)
1249  else if (tech == CMolInfo::eTech_wgs)
1251  else
1252  return;
1253 
1254  for (TKeywordList::iterator key = kwds.begin(); key != kwds.end();) {
1255  if (key->empty() || MatchArrayString(b, key->c_str()) != -1) {
1256  key = kwds.erase(key);
1257  } else
1258  ++key;
1259  }
1260 }
1261 
1262 /**********************************************************/
1264 {
1265  if (kwds.empty())
1266  return;
1267 
1268  for (TKeywordList::iterator key = kwds.begin(); key != kwds.end();) {
1269  if (key->empty() || MatchArrayIString(ParFlat_TPA_kw_array_to_remove, key->c_str()) != -1) {
1270  key = kwds.erase(key);
1271  } else
1272  ++key;
1273  }
1274 }
1275 
1276 /**********************************************************/
1278 {
1279  if (kwds.empty())
1280  return;
1281 
1282  for (TKeywordList::iterator key = kwds.begin(); key != kwds.end();) {
1283  if (key->empty() || MatchArrayIString(ParFlat_TSA_kw_array, key->c_str()) != -1 ||
1284  (source == Parser::ESource::EMBL && NStr::EqualNocase(*key, "Transcript Shotgun Assembly"))) {
1285  key = kwds.erase(key);
1286  } else
1287  ++key;
1288  }
1289 }
1290 
1291 /**********************************************************/
1293 {
1294  if (kwds.empty())
1295  return;
1296 
1297  for (TKeywordList::iterator key = kwds.begin(); key != kwds.end();) {
1298  if (key->empty() || MatchArrayIString(ParFlat_TLS_kw_array, key->c_str()) != -1 ||
1299  (source == Parser::ESource::EMBL && NStr::EqualNocase(*key, "Targeted Locus Study"))) {
1300  key = kwds.erase(key);
1301  } else
1302  ++key;
1303  }
1304 }
1305 
1306 /**********************************************************/
1308 {
1309  if (kwds.empty())
1310  return;
1311 
1312  for (TKeywordList::iterator key = kwds.begin(); key != kwds.end();) {
1313  if (key->empty() || MatchArrayIString(ParFlat_ENV_kw_array, key->c_str()) != -1) {
1314  key = kwds.erase(key);
1315  } else
1316  ++key;
1317  }
1318 }
1319 
1320 /**********************************************************/
1322 {
1323  if (kwds.empty())
1324  return;
1325 
1326  for (TKeywordList::iterator key = kwds.begin(); key != kwds.end();) {
1327  if (key->empty() || MatchArrayIString(ParFlat_MAG_kw_array, key->c_str()) != -1) {
1328  key = kwds.erase(key);
1329  } else
1330  ++key;
1331  }
1332 }
1333 
1334 /**********************************************************/
1336  const list<string> keywordList,
1337  bool tpa_check,
1338  IndexblkPtr entry
1339  // bool& specialist_db,
1340  // bool& inferential,
1341  // bool& experimental,
1342  // bool& assembly
1343 )
1344 {
1345  if (keywordList.empty()) {
1346  return;
1347  }
1348  for (auto keyword : keywordList) {
1350  keyword.c_str(), &entry->EST, &entry->STS, &entry->GSS, &entry->HTC, nullptr, nullptr, (tpa_check ? &entry->is_tpa : nullptr), nullptr, nullptr, nullptr, nullptr);
1351  if (NStr::EqualNocase(keyword, "TPA:assembly")) {
1352  entry->specialist_db = true;
1353  entry->assembly = true;
1354  continue;
1355  }
1356  if (NStr::EqualNocase(keyword, "TPA:specialist_db")) {
1357  entry->specialist_db = true;
1358  continue;
1359  }
1360  if (NStr::EqualNocase(keyword, "TPA:inferential")) {
1361  entry->inferential = true;
1362  continue;
1363  }
1364  if (NStr::EqualNocase(keyword, "TPA:experimental")) {
1365  entry->experimental = true;
1366  continue;
1367  }
1368  }
1369 }
1370 
1371 void check_est_sts_gss_tpa_kwds(ValNodePtr kwds, size_t len, IndexblkPtr entry, bool tpa_check, bool& specialist_db, bool& inferential, bool& experimental, bool& assembly)
1372 {
1373  char* line;
1374  char* p;
1375  char* q;
1376 
1377  if (! kwds || ! kwds->data || len < 1)
1378  return;
1379 
1380  line = StringNew(len);
1381  line[0] = '\0';
1382  for (; kwds; kwds = kwds->next) {
1383  StringCat(line, kwds->data);
1384  }
1385  for (p = line; *p != '\0'; p++)
1386  if (*p == '\n' || *p == '\t')
1387  *p = ' ';
1388  for (p = line; *p == ' ' || *p == '.' || *p == ';';)
1389  p++;
1390  if (*p == '\0') {
1391  MemFree(line);
1392  return;
1393  }
1394  for (q = p; *q != '\0';)
1395  q++;
1396  for (q--; *q == ' ' || *q == '.' || *q == ';'; q--)
1397  *q = '\0';
1398  for (q = p, p = line; *q != '\0';) {
1399  if (*q != ' ' && *q != ';') {
1400  *p++ = *q++;
1401  continue;
1402  }
1403  if (*q == ' ') {
1404  for (q++; *q == ' ';)
1405  q++;
1406  if (*q != ';')
1407  *p++ = ' ';
1408  }
1409  if (*q == ';') {
1410  *p++ = *q++;
1411  while (*q == ' ' || *q == ';')
1412  q++;
1413  }
1414  }
1415  *p++ = ';';
1416  *p = '\0';
1417  for (p = line;; p = q + 1) {
1418  q = StringChr(p, ';');
1419  if (! q)
1420  break;
1421  *q = '\0';
1422  fta_keywords_check(p, &entry->EST, &entry->STS, &entry->GSS, &entry->HTC, nullptr, nullptr, (tpa_check ? &entry->is_tpa : nullptr), nullptr, nullptr, nullptr, nullptr);
1423  if (NStr::EqualNocase(p, "TPA:specialist_db") ||
1424  NStr::EqualNocase(p, "TPA:assembly")) {
1425  specialist_db = true;
1426  if (NStr::EqualNocase(p, "TPA:assembly"))
1427  assembly = true;
1428  } else if (NStr::EqualNocase(p, "TPA:inferential"))
1429  inferential = true;
1430  else if (NStr::EqualNocase(p, "TPA:experimental"))
1431  experimental = true;
1432  }
1433  MemFree(line);
1434 }
1435 
1436 /**********************************************************/
1438 {
1439  ValNodePtr res;
1440 
1441  res = ValNodeNew(nullptr, data);
1442  res->choice = choice;
1443  return (res);
1444 }
1445 
1446 /**********************************************************/
1447 bool fta_check_mga_keywords(CMolInfo& mol_info, const TKeywordList& kwds)
1448 {
1449  bool is_cage;
1450  bool is_sage;
1451 
1452  TKeywordList::const_iterator key_it = kwds.end();
1453 
1454  bool got = false;
1455  if (! kwds.empty() && NStr::EqualNocase(kwds.front(), "MGA")) {
1456  for (TKeywordList::const_iterator key = kwds.begin(); key != kwds.end(); ++key) {
1458  key->c_str()) < 0)
1459  continue;
1460  got = true;
1461  key_it = key;
1462  break;
1463  }
1464  }
1465 
1466  if (! got) {
1467  ErrPostStr(SEV_REJECT, ERR_KEYWORD_MissingMGAKeywords, "This is apparently a CAGE record, but it lacks the required keywords. Entry dropped.");
1468  return false;
1469  }
1470 
1471  if (! mol_info.IsSetTechexp() || ! kwds.empty() ||
1472  mol_info.GetTechexp() != "cage")
1473  return true;
1474 
1475  for (is_sage = false, is_cage = false; key_it != kwds.end(); ++key_it) {
1476  const char* p = key_it->c_str();
1477 
1478  if (NStr::EqualNocase(p, "5'-SAGE"))
1479  is_sage = true;
1480  else if (NStr::EqualNocase(p, "CAGE (Cap Analysis Gene Expression)"))
1481  is_cage = true;
1482  }
1483 
1484  if (is_sage) {
1485  if (is_cage) {
1486  ErrPostStr(SEV_REJECT, ERR_KEYWORD_ConflictingMGAKeywords, "This MGA record contains more than one of the special keywords indicating different techniques.");
1487  return false;
1488  }
1489  mol_info.SetTechexp("5'-sage");
1490  }
1491 
1492  return true;
1493 }
1494 
1495 /**********************************************************/
/*
 * Copies the NUL-terminated string src into dst, terminator included.
 *
 * Bytes are transferred one at a time from the front, so the copy also
 * works for shifting text towards the start of the same buffer
 * (dst located before src). dst must be large enough for the result.
 */
void fta_StringCpy(char* dst, const char* src)
{
    while ((*dst = *src) != '\0') {
        ++dst;
        ++src;
    }
}
1505 
1506 /**********************************************************/
1507 bool SetTextId(Uint1 seqtype, CSeq_id& seqId, CTextseq_id& textId)
1508 {
1509  bool wasSet = true;
1510 
1511  switch (seqtype) {
1512  case CSeq_id::e_Genbank:
1513  seqId.SetGenbank(textId);
1514  break;
1515  case CSeq_id::e_Embl:
1516  seqId.SetEmbl(textId);
1517  break;
1518  case CSeq_id::e_Pir:
1519  seqId.SetPir(textId);
1520  break;
1521  case CSeq_id::e_Swissprot:
1522  seqId.SetSwissprot(textId);
1523  break;
1524  case CSeq_id::e_Other:
1525  seqId.SetOther(textId);
1526  break;
1527  case CSeq_id::e_Ddbj:
1528  seqId.SetDdbj(textId);
1529  break;
1530  case CSeq_id::e_Prf:
1531  seqId.SetPrf(textId);
1532  break;
1533  case CSeq_id::e_Pdb: {
1534  // TODO: test this branch
1535  CPDB_seq_id pdbId;
1536  pdbId.SetChain_id();
1537  seqId.SetPdb(pdbId);
1538  } break;
1539  case CSeq_id::e_Tpg:
1540  seqId.SetTpg(textId);
1541  break;
1542  case CSeq_id::e_Tpe:
1543  seqId.SetTpe(textId);
1544  break;
1545  case CSeq_id::e_Tpd:
1546  seqId.SetTpd(textId);
1547  break;
1548  case CSeq_id::e_Gpipe:
1549  seqId.SetGpipe(textId);
1550  break;
1552  seqId.SetNamed_annot_track(textId);
1553  break;
1554 
1555  default:
1556  wasSet = false;
1557  }
1558 
1559  return wasSet;
1560 }
1561 
1562 /**********************************************************/
1563 bool IsCancelled(const TKeywordList& keywords)
1564 {
1565  for (const string& key : keywords) {
1566  if (NStr::EqualNocase(key, "HTGS_CANCELLED"))
1567  return true;
1568  }
1569 
1570  return false;
1571 }
1572 
1573 /**********************************************************/
1574 bool HasHtg(const TKeywordList& keywords)
1575 {
1576  for (const string& key : keywords) {
1577  if (key == "HTG" || key == "HTGS_PHASE0" ||
1578  key == "HTGS_PHASE1" || key == "HTGS_PHASE2" ||
1579  key == "HTGS_PHASE3") {
1580  return true;
1581  }
1582  }
1583 
1584  return false;
1585 }
1586 
1587 /**********************************************************/
1589 {
1590  for (TKeywordList::iterator key = keywords.begin(); key != keywords.end();) {
1591  const char* p = key->c_str();
1592  if (NStr::EqualNocase(p, 0, 10, "HTGS_PHASE") &&
1593  (p[10] == '0' || p[10] == '1' || p[10] == '2' ||
1594  p[10] == '3') &&
1595  p[11] == '\0') {
1596  key = keywords.erase(key);
1597  } else
1598  ++key;
1599  }
1600 }
1601 
1602 /**********************************************************/
1603 bool HasHtc(const TKeywordList& keywords)
1604 {
1605  for (const string& key : keywords) {
1606  if (NStr::EqualNocase(key, "HTC")) {
1607  return true;
1608  }
1609  }
1610 
1611  return false;
1612 }
1613 
CCurrentTime –.
Definition: ncbitime.hpp:1283
CScope –.
Definition: scope.hpp:92
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
char * mOffset
Definition: ftablock.h:329
size_t len
Definition: ftablock.h:330
CFlatFileData * mpData
Definition: ftablock.h:328
DataBlk * mpNext
Definition: ftablock.h:333
int mType
Definition: ftablock.h:327
The NCBI C++ standard methods for dealing with std::string.
#define ERR_REFERENCE_IllegalDate
Definition: flat2err.h:282
#define ERR_DATE_IllegalDate
Definition: flat2err.h:102
std::list< std::string > TKeywordList
Definition: ftablock.h:163
bool StringEquN(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:121
void StringNCpy(char *d, const char *s, size_t n)
Definition: ftacpp.hpp:90
void MemFree(char *p)
Definition: ftacpp.hpp:55
size_t StringLen(const char *s)
Definition: ftacpp.hpp:60
void StringCat(char *d, const char *s)
Definition: ftacpp.hpp:88
char * StringNew(size_t sz)
Definition: ftacpp.hpp:43
const char * months[]
Definition: ftaerr.cpp:118
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:51
static const char * str(char *buf, int n)
Definition: stats.c:84
static char tmp[3200]
Definition: utf8.c:42
char data[12]
Definition: iconv.c:80
#define SEV_WARNING
Definition: gicache.c:90
#define SEV_ERROR
Definition: gicache.c:91
#define SEV_REJECT
Definition: gicache.c:92
string
Definition: cgiapp.hpp:690
#define StringSave
Definition: ncbistr.hpp:326
#define ErrPostStr
Definition: ncbierr.hpp:68
#define StringChr
Definition: ncbistr.hpp:317
#define ErrPostEx(sev, err_code,...)
Definition: ncbierr.hpp:78
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
Definition: Seq_id.cpp:318
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
Definition: ncbiobj.hpp:2015
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
char Char
Alias for char.
Definition: ncbitype.h:93
uint16_t Uint2
2-byte (16-bit) unsigned integer
Definition: ncbitype.h:101
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3452
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2984
#define NPOS
Definition: ncbistr.hpp:133
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate whitespace in a string (in-place)
Definition: ncbistr.cpp:3192
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2882
static bool EqualCase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-sensitive equality of a substring with another string.
Definition: ncbistr.hpp:5319
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5406
static bool SplitInTwo(const CTempString str, const CTempString delim, string &str1, string &str2, TSplitFlags flags=0)
Split a string into two pieces using the specified delimiters.
Definition: ncbistr.cpp:3545
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5347
static bool Equal(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Test for equality of a substring with another string.
Definition: ncbistr.hpp:5378
@ fConvErr_NoThrow
Do not throw an exception on error.
Definition: ncbistr.hpp:285
int Year(void) const
Get year.
Definition: ncbitime.hpp:2265
list< string > TExtra_accessions
Definition: GB_block_.hpp:91
void SetYear(TYear value)
Assign a value to Year data member.
Definition: Date_std_.hpp:435
void SetMonth(TMonth value)
Assign a value to Month data member.
Definition: Date_std_.hpp:482
void SetDay(TDay value)
Assign a value to Day data member.
Definition: Date_std_.hpp:529
TNamed_annot_track & SetNamed_annot_track(void)
Select the variant.
Definition: Seq_id_.cpp:551
bool IsSetAccession(void) const
Check if a value has been assigned to Accession data member.
TEmbl & SetEmbl(void)
Select the variant.
Definition: Seq_id_.cpp:265
TOther & SetOther(void)
Select the variant.
Definition: Seq_id_.cpp:353
const TName & GetName(void) const
Get the Name member data.
TTpe & SetTpe(void)
Select the variant.
Definition: Seq_id_.cpp:485
TTpg & SetTpg(void)
Select the variant.
Definition: Seq_id_.cpp:463
TPir & SetPir(void)
Select the variant.
Definition: Seq_id_.cpp:287
TTpd & SetTpd(void)
Select the variant.
Definition: Seq_id_.cpp:507
TVersion GetVersion(void) const
Get the Version member data.
TGpipe & SetGpipe(void)
Select the variant.
Definition: Seq_id_.cpp:529
E_Choice
Choice variants.
Definition: Seq_id_.hpp:93
TDdbj & SetDdbj(void)
Select the variant.
Definition: Seq_id_.cpp:397
TPrf & SetPrf(void)
Select the variant.
Definition: Seq_id_.cpp:419
TGenbank & SetGenbank(void)
Select the variant.
Definition: Seq_id_.cpp:243
TSwissprot & SetSwissprot(void)
Select the variant.
Definition: Seq_id_.cpp:309
bool IsSetVersion(void) const
Check if a value has been assigned to Version data member.
void SetChain_id(const TChain_id &value)
Assign a value to Chain_id data member.
bool IsSetName(void) const
Check if a value has been assigned to Name data member.
const TAccession & GetAccession(void) const
Get the Accession member data.
TPdb & SetPdb(void)
Select the variant.
Definition: Seq_id_.cpp:441
@ e_Other
for historical reasons, 'other' = 'refseq'
Definition: Seq_id_.hpp:104
@ e_Gpipe
Internal NCBI genome pipeline processing ID.
Definition: Seq_id_.hpp:113
@ e_Tpe
Third Party Annot/Seq EMBL.
Definition: Seq_id_.hpp:111
@ e_Tpd
Third Party Annot/Seq DDBJ.
Definition: Seq_id_.hpp:112
@ e_Ddbj
DDBJ.
Definition: Seq_id_.hpp:107
@ e_Prf
PRF SEQDB.
Definition: Seq_id_.hpp:108
@ e_Named_annot_track
Internal named annotation tracking ID.
Definition: Seq_id_.hpp:114
@ e_Tpg
Third Party Annot/Seq Genbank.
Definition: Seq_id_.hpp:110
@ e_Pdb
PDB sequence.
Definition: Seq_id_.hpp:109
bool IsSetTechexp(void) const
explanation if tech not enough
Definition: MolInfo_.hpp:522
const TTechexp & GetTechexp(void) const
Get the Techexp member data.
Definition: MolInfo_.hpp:534
void SetTechexp(const TTechexp &value)
Assign a value to Techexp data member.
Definition: MolInfo_.hpp:543
@ eTech_htc
high throughput cDNA
Definition: MolInfo_.hpp:142
@ eTech_sts
Sequence Tagged Site.
Definition: MolInfo_.hpp:126
@ eTech_wgs
whole genome shotgun sequencing
Definition: MolInfo_.hpp:143
@ eTech_survey
one-pass genomic sequence
Definition: MolInfo_.hpp:127
@ eTech_fli_cdna
full length insert cDNA
Definition: MolInfo_.hpp:140
@ eTech_est
Expressed Sequence Tag.
Definition: MolInfo_.hpp:125
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
#define ERR_KEYWORD_MissingTPAKeywords
Definition: indx_err.h:111
#define ERR_ACCESSION_Invalid2ndAccRange
Definition: indx_err.h:71
#define ERR_ACCESSION_2ndAccPrefixMismatch
Definition: indx_err.h:70
#define ERR_KEYWORD_InvalidTPATier
Definition: indx_err.h:109
#define ERR_KEYWORD_UnexpectedTPA
Definition: indx_err.h:110
#define ERR_KEYWORD_MissingTSAKeywords
Definition: indx_err.h:114
#define ERR_KEYWORD_MissingTPATier
Definition: indx_err.h:112
#define ERR_KEYWORD_ConflictingTPATiers
Definition: indx_err.h:113
#define ERR_KEYWORD_MissingTLSKeywords
Definition: indx_err.h:117
#define ERR_ENTRY_InvalidLineType
Definition: indx_err.h:64
#define ERR_KEYWORD_MissingMGAKeywords
Definition: indx_err.h:115
#define ERR_KEYWORD_ConflictingMGAKeywords
Definition: indx_err.h:116
char * buf
int i
int len
static void text(MDB_val *v)
Definition: mdb_dump.c:62
const struct ncbi::grid::netcache::search::fields::KEY key
const CharType(& source)[N]
Definition: pointer.h:1149
int isdigit(Uchar c)
Definition: ncbictype.hpp:64
Defines: CTimeFormat - storage class for time format.
The Object manager core.
static const char delimiter[]
string indent(" ")
static SLJIT_INLINE sljit_ins msg(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
DataBlkPtr chain
Definition: ftablock.h:341
Definition: entry.h:57
list< SectionPtr > mSections
Definition: entry.h:99
bool assembly
Definition: ftablock.h:241
bool is_tpa
Definition: ftablock.h:206
bool STS
Definition: ftablock.h:193
bool HTC
Definition: ftablock.h:195
bool experimental
Definition: ftablock.h:247
bool inferential
Definition: ftablock.h:245
bool EST
Definition: ftablock.h:192
bool specialist_db
Definition: ftablock.h:243
bool GSS
Definition: ftablock.h:194
string mAccNum
Definition: ftablock.h:85
string mLocus
Definition: ftablock.h:84
TSeqIdList ids
Definition: ftablock.h:83
Definition: entry.h:13
int mType
Definition: entry.h:47
TokenBlkList list
Definition: ftablock.h:137
ValNode * next
Definition: valnode.h:51
char * data
Definition: valnode.h:49
unsigned char choice
Definition: valnode.h:47
Definition: type.c:6
static const char * ParFlat_TLS_kw_array[]
Definition: utilfun.cpp:139
USING_SCOPE(objects)
Int2 MatchArrayIString(const Char **array, const Char *text)
Definition: utilfun.cpp:549
CScope & GetScope()
Definition: utilfun.cpp:62
bool HasHtg(const TKeywordList &keywords)
Definition: utilfun.cpp:1574
int SrchKeyword(const CTempString &ptr, const vector< string > &keywordList)
Definition: utilfun.cpp:897
static const char * ParFlat_STS_kw_array[]
Definition: utilfun.cpp:87
bool HasHtc(const TKeywordList &keywords)
Definition: utilfun.cpp:1603
static const char * ParFlat_MGA_kw_array[]
Definition: utilfun.cpp:111
char * SrchTheChar(char *bptr, char *eptr, Char letter)
Definition: utilfun.cpp:759
static const char * ParFlat_MAG_kw_array[]
Definition: utilfun.cpp:172
bool fta_tls_keywords_check(const TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1136
void RemoveHtgPhase(TKeywordList &keywords)
Definition: utilfun.cpp:1588
bool fta_is_tsa_keyword(const char *str)
Definition: utilfun.cpp:1179
static bool sIsPrefixChar(char c)
Definition: utilfun.cpp:260
bool fta_is_tls_keyword(const char *str)
Definition: utilfun.cpp:1187
bool CheckLineType(char *ptr, Int4 line, const vector< string > &keywordList, bool after_origin)
Definition: utilfun.cpp:910
static const char * ParFlat_TPA_kw_array_to_remove[]
Definition: utilfun.cpp:160
bool SetTextId(Uint1 seqtype, CSeq_id &seqId, CTextseq_id &textId)
Definition: utilfun.cpp:1507
string GetBlkDataReplaceNewLine(string_view instr, Uint2 indent)
Definition: utilfun.cpp:644
Int2 StringMatchIcase(const Char **array, string_view text)
Definition: utilfun.cpp:507
void fta_remove_tsa_keywords(TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1277
void check_est_sts_gss_tpa_kwds(ValNodePtr kwds, size_t len, IndexblkPtr entry, bool tpa_check, bool &specialist_db, bool &inferential, bool &experimental, bool &assembly)
Definition: utilfun.cpp:1371
Int2 MatchArrayISubString(const Char **array, string_view text)
Definition: utilfun.cpp:623
void fta_remove_tpa_keywords(TKeywordList &kwds)
Definition: utilfun.cpp:1263
Int2 MatchArraySubString(const Char **array, string_view text)
Definition: utilfun.cpp:578
unique_ptr< TokenStatBlk > TokenString(const char *str, Char delimiter)
Definition: utilfun.cpp:445
static const char * ParFlat_FLI_kw_array[]
Definition: utilfun.cpp:101
CRef< CDate_std > get_full_date(const char *s, bool is_ref, Parser::ESource source)
Definition: utilfun.cpp:827
static const char * ParFlat_ENV_kw_array[]
Definition: utilfun.cpp:167
bool fta_is_tpa_keyword(const char *str)
Definition: utilfun.cpp:1170
void CleanTailNoneAlphaCharInString(string &str)
Definition: utilfun.cpp:683
static const char * ParFlat_TPA_kw_array[]
Definition: utilfun.cpp:148
const Section * xTrackNodeType(const Entry &entry, int type)
Definition: utilfun.cpp:1008
char * SrchNodeType(DataBlkPtr entry, Int4 type, size_t *len)
Definition: utilfun.cpp:949
static const char * ParFlat_TSA_kw_array[]
Definition: utilfun.cpp:130
void fta_remove_keywords(CMolInfo::TTech tech, TKeywordList &kwds)
Definition: utilfun.cpp:1232
static const char * ParFlat_MGA_more_kw_array[]
Definition: utilfun.cpp:118
void fta_remove_tls_keywords(TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1292
char * xSrchNodeType(const DataBlk &entry, Int4 type, size_t *len)
Definition: utilfun.cpp:963
string xGetNodeData(const DataBlk &entry, int nodeType)
Definition: utilfun.cpp:977
bool ParseAccessionRange(list< string > &tokens, unsigned skip)
Definition: utilfun.cpp:265
char * GetTheCurrentToken(char **ptr)
Definition: utilfun.cpp:727
Int2 fta_StringMatch(const Char **array, string_view text)
Definition: utilfun.cpp:486
void fta_keywords_check(const char *str, bool *estk, bool *stsk, bool *gssk, bool *htck, bool *flik, bool *wgsk, bool *tpak, bool *envk, bool *mgak, bool *tsak, bool *tlsk)
Definition: utilfun.cpp:1195
bool IsLeadPrefixChar(char c)
Definition: utilfun.cpp:340
static const char * ParFlat_WGS_kw_array[]
Definition: utilfun.cpp:106
void fta_StringCpy(char *dst, const char *src)
Definition: utilfun.cpp:1496
DataBlkPtr TrackNodeType(const DataBlk &entry, Int2 type)
Definition: utilfun.cpp:994
static const char * ParFlat_HTC_kw_array[]
Definition: utilfun.cpp:96
void fta_remove_mag_keywords(TKeywordList &kwds)
Definition: utilfun.cpp:1321
void CleanTailNoneAlphaChar(char *str)
Definition: utilfun.cpp:697
static size_t SeekLastAlphaChar(const Char *str, size_t len)
Definition: utilfun.cpp:667
bool IsCancelled(const TKeywordList &keywords)
Definition: utilfun.cpp:1563
Int2 MatchArrayString(const char **array, const char *text)
Definition: utilfun.cpp:533
static const char * ParFlat_GSS_kw_array[]
Definition: utilfun.cpp:80
static string FTAitoa(Int4 m)
Definition: utilfun.cpp:179
Char * StringIStr(const Char *where, const Char *what)
Definition: utilfun.cpp:591
bool fta_tsa_keywords_check(const TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1102
void CpSeqId(InfoBioseqPtr ibp, const CSeq_id &id)
Definition: utilfun.cpp:791
static const char * ParFlat_EST_kw_array[]
Definition: utilfun.cpp:69
void fta_remove_env_keywords(TKeywordList &kwds)
Definition: utilfun.cpp:1307
char * SrchTheStr(char *bptr, char *eptr, const char *leadstr)
Definition: utilfun.cpp:779
bool fta_tpa_keywords_check(const TKeywordList &kwds)
Definition: utilfun.cpp:1020
char * PointToNextToken(char *ptr)
Definition: utilfun.cpp:707
bool fta_check_mga_keywords(CMolInfo &mol_info, const TKeywordList &kwds)
Definition: utilfun.cpp:1447
void xCheckEstStsGssTpaKeywords(const list< string > keywordList, bool tpa_check, IndexblkPtr entry)
Definition: utilfun.cpp:1335
void UnwrapAccessionRange(const CGB_block::TExtra_accessions &extra_accs, CGB_block::TExtra_accessions &hist)
Definition: utilfun.cpp:197
bool IsDigit(char c)
Definition: utilfun.cpp:344
ValNodePtr ConstructValNode(CSeq_id::E_Choice choice, const char *data)
Definition: utilfun.cpp:1437
#define ParFlat_UNKW
Definition: utilfun.h:44
ValNodePtr ValNodeNew(ValNodePtr prev, const char *data)
Definition: valnode.cpp:53
static Uint4 letter(char c)
Modified on Fri Sep 20 14:58:02 2024 by modify_doxy.py rev. 669887