NCBI C++ ToolKit
utilfun.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: utilfun.cpp 102490 2024-05-14 11:53:07Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * File Name: utilfun.cpp
27  *
28  * Author: Karl Sirotkin, Hsiu-Chuan Chen
29  *
30  * File Description:
31  * Utility functions for parser and indexing.
32  *
33  */
34 
35 #include <ncbi_pch.hpp>
36 #include <corelib/ncbitime.hpp>
37 
38 #include "ftacpp.hpp"
39 
40 #include <corelib/ncbistr.hpp>
41 #include <objmgr/scope.hpp>
43 #include <objects/seq/MolInfo.hpp>
45 #include <corelib/tempstr.hpp>
46 
47 #include "index.h"
48 
49 #include "ftaerr.hpp"
50 #include "indx_def.h"
51 #include "utilfun.h"
52 
53 #ifdef THIS_FILE
54 # undef THIS_FILE
55 #endif
56 #define THIS_FILE "utilfun.cpp"
57 
59 
61 
63 {
64  static CScope scope(*CObjectManager::GetInstance());
65  return scope;
66 }
67 
68 
/* Keyword tables.
 *
 * Each table is an array of C strings closed by a nullptr sentinel; the
 * MatchArray*() helpers in this file walk a table until they reach that
 * sentinel and report the position of the matching entry.
 */

/* EST keywords */
static const char* ParFlat_EST_kw_array[] = {
    "EST",
    "EST PROTO((expressed sequence tag)",
    "expressed sequence tag",
    "EST (expressed sequence tag)",
    "EST (expressed sequence tags)",
    "EST(expressed sequence tag)",
    "transcribed sequence fragment",
    nullptr
};

/* GSS keywords */
static const char* ParFlat_GSS_kw_array[] = {
    "GSS", "GSS (genome survey sequence)", "trapped exon", nullptr
};

/* STS keywords */
static const char* ParFlat_STS_kw_array[] = {
    "STS",
    "STS(sequence tagged site)",
    "STS (sequence tagged site)",
    "STS sequence",
    "sequence tagged site",
    nullptr
};

/* HTC keywords */
static const char* ParFlat_HTC_kw_array[] = { "HTC", nullptr };

/* FLI_CDNA keywords */
static const char* ParFlat_FLI_kw_array[] = { "FLI_CDNA", nullptr };

/* WGS keywords */
static const char* ParFlat_WGS_kw_array[] = { "WGS", nullptr };

/* MGA keywords */
static const char* ParFlat_MGA_kw_array[] = {
    "MGA", "CAGE (Cap Analysis Gene Expression)", "5'-SAGE", nullptr
};

/* Additional MGA-related keywords */
static const char* ParFlat_MGA_more_kw_array[] = {
    "CAGE (Cap Analysis Gene Expression)",
    "5'-SAGE",
    "5'-end tag",
    "unspecified tag",
    "small RNA",
    nullptr
};

/* TSA keywords.
 * NOTE: fta_tsa_keywords_check() must be updated whenever the contents
 * of this table change.
 */
static const char* ParFlat_TSA_kw_array[] = {
    "TSA", "Transcriptome Shotgun Assembly", nullptr
};

/* TLS keywords.
 * NOTE: fta_tls_keywords_check() must be updated whenever the contents
 * of this table change.
 */
static const char* ParFlat_TLS_kw_array[] = {
    "TLS", "Targeted Locus Study", nullptr
};

/* TPA keywords.
 * NOTE: fta_tpa_keywords_check() must be updated whenever the contents
 * of either of the next two tables change.
 */
static const char* ParFlat_TPA_kw_array[] = {
    "TPA",
    "THIRD PARTY ANNOTATION",
    "THIRD PARTY DATA",
    "TPA:INFERENTIAL",
    "TPA:EXPERIMENTAL",
    "TPA:REASSEMBLY",
    "TPA:ASSEMBLY",
    "TPA:SPECIALIST_DB",
    nullptr
};

static const char* ParFlat_TPA_kw_array_to_remove[] = {
    "TPA", "THIRD PARTY ANNOTATION", "THIRD PARTY DATA", nullptr
};

/* ENV keywords */
static const char* ParFlat_ENV_kw_array[] = { "ENV", nullptr };

/* MAG keywords */
static const char* ParFlat_MAG_kw_array[] = {
    "Metagenome Assembled Genome", "MAG", nullptr
};
177 
178 /**********************************************************/
179 static string FTAitoa(Int4 m)
180 {
181  Int4 sign = (m < 0) ? -1 : 1;
182  string res;
183 
184  for (m *= sign; m > 9; m /= 10)
185  res += m % 10 + '0';
186 
187  res += m + '0';
188 
189  if (sign < 0)
190  res += '-';
191 
192  std::reverse(res.begin(), res.end());
193  return res;
194 }
195 
196 /**********************************************************/
198 {
199  Int4 num1;
200  Int4 num2;
201 
203 
204  for (const string& acc : extra_accs) {
205  if (acc.empty())
206  continue;
207 
208  size_t dash = acc.find('-');
209  if (dash == string::npos) {
210  ret.push_back(acc);
211  continue;
212  }
213 
214  string first(acc.begin(), acc.begin() + dash),
215  last(acc.begin() + dash + 1, acc.end());
216  size_t acclen = first.size();
217 
218  const Char* p = first.c_str();
219  for (; (*p >= 'A' && *p <= 'Z') || *p == '_';)
220  p++;
221 
222  size_t preflen = p - first.c_str();
223 
224  string prefix = first.substr(0, preflen);
225  while (*p == '0')
226  p++;
227 
228  const Char* q;
229  for (q = p; *p >= '0' && *p <= '9';)
230  p++;
231  num1 = atoi(q);
232 
233  for (p = last.c_str() + preflen; *p == '0';)
234  p++;
235  for (q = p; *p >= '0' && *p <= '9';)
236  p++;
237  num2 = atoi(q);
238 
239  ret.push_back(first);
240 
241  if (num1 == num2)
242  continue;
243 
244  for (num1++; num1 <= num2; num1++) {
245  string new_acc = prefix;
246  string num_str = FTAitoa(num1);
247  size_t j = acclen - preflen - num_str.size();
248 
249  for (size_t i = 0; i < j; i++)
250  new_acc += '0';
251 
252  new_acc += num_str;
253  ret.push_back(new_acc);
254  }
255  }
256 
257  ret.swap(hist);
258 }
259 
// True when "c" may appear in the alphabetic prefix of an accession:
// an upper-case ASCII letter or an underscore.
static bool sIsPrefixChar(char c)
{
    if (c == '_') {
        return true;
    }
    return c >= 'A' && c <= 'Z';
}
264 /**********************************************************/
265 bool ParseAccessionRange(list<string>& tokens, unsigned skip)
266 {
267  bool bad = false;
268 
269  if (tokens.empty()) {
270  return true;
271  }
272 
273  if (tokens.size() <= skip + 1) {
274  return true;
275  }
276 
277 
278  auto it = tokens.begin();
279  if (skip) {
280  advance(it, skip);
281  }
282 
283  for (; it != tokens.end(); ++it) {
284  const auto& token = *it;
285  if (token.empty()) {
286  continue;
287  }
288 
290  if (! NStr::SplitInTwo(token, "-", first, last)) {
291  continue;
292  }
293  if (first.size() != last.size()) {
294  bad = true;
295  break;
296  }
297 
298  auto first_it =
299  find_if_not(begin(first), end(first), sIsPrefixChar);
300 
301  if (first_it == first.end()) {
302  bad = true;
303  break;
304  }
305 
306 
307  auto last_it =
308  find_if_not(begin(last), end(last), sIsPrefixChar);
309  if (last_it == last.end()) {
310  bad = true;
311  break;
312  }
313 
314  auto prefixLength = distance(first.begin(), first_it);
315  if (prefixLength != distance(last.begin(), last_it) ||
316  ! NStr::EqualCase(first, 0, prefixLength, last.substr(0, prefixLength))) {
317  ErrPostEx(SEV_REJECT, ERR_ACCESSION_2ndAccPrefixMismatch, "Inconsistent prefix found in secondary accession range \"%s\".", token.c_str());
318  break;
319  }
320 
321  auto num1 = NStr::StringToInt(first.substr(prefixLength));
322  auto num2 = NStr::StringToInt(last.substr(prefixLength));
323 
324  if (num2 <= num1) {
325  ErrPostEx(SEV_REJECT, ERR_ACCESSION_Invalid2ndAccRange, "Invalid start/end values in secondary accession range \"%s\".", token.c_str());
326  }
327 
328  *it = first;
329  it = tokens.insert(it, "-");
330  it = tokens.insert(it, last);
331  }
332 
333  if (bad) {
334  ErrPostEx(SEV_REJECT, ERR_ACCESSION_Invalid2ndAccRange, "Incorrect secondary accession range provided: \"%s\".", it->c_str());
335  }
336  return false;
337 }
338 
339 
// True when "c" is an upper-case ASCII letter ('A'..'Z').
inline bool IsLeadPrefixChar(char c)
{
    const bool below_range = c < 'A';
    const bool above_range = c > 'Z';
    return ! (below_range || above_range);
}
// True when "c" is an ASCII decimal digit ('0'..'9').
inline bool IsDigit(char c)
{
    const bool below_range = c < '0';
    const bool above_range = c > '9';
    return ! (below_range || above_range);
}
348 /**********************************************************/
349 bool ParseAccessionRange(TokenStatBlkPtr tsbp, unsigned skip)
350 {
351  auto& tokens = tsbp->list;
352  if (tokens.empty())
353  return true;
354  if ((int)skip >= tsbp->num)
355  return true;
356 
357  auto tbp = tokens.begin();
358  if (skip > 0)
359  advance(tbp, skip);
360 
361  bool bad = false, msg_issued = false;
362  for (; tbp != tokens.end(); ++tbp) {
363  const string& token = *tbp;
364  string_view tok_view = token;
365  if (token.empty())
366  continue;
367  size_t dash = token.find('-');
368  if (dash == string::npos)
369  continue;
370  if (dash == 0 || tok_view.size() != (dash + 1 + dash)) {
371  bad = true;
372  break;
373  }
374 
375  string_view first(tok_view.substr(0, dash));
376  string_view last(tok_view.substr(dash + 1));
377  if (! IsLeadPrefixChar(first.front()) || ! IsLeadPrefixChar(last.front())) {
378  bad = true;
379  break;
380  }
381 
382  auto first_it = find_if_not(first.begin(), first.end(), sIsPrefixChar);
383  if (first_it == first.end() || ! IsDigit(*first_it)) {
384  bad = true;
385  break;
386  }
387  auto last_it = find_if_not(last.begin(), last.end(), sIsPrefixChar);
388  if (last_it == last.end() || ! IsDigit(*last_it)) {
389  bad = true;
390  break;
391  }
392 
393  size_t preflen = first_it - first.begin();
394  size_t preflen2 = last_it - last.begin();
395  string_view first_prefix = first.substr(0, preflen);
396  string_view last_prefix = last.substr(0, preflen2);
397  if (first_prefix != last_prefix) {
398  msg_issued = true;
399  ErrPostEx(SEV_REJECT, ERR_ACCESSION_2ndAccPrefixMismatch, "Inconsistent prefix found in secondary accession range \"%s\".", token.c_str());
400  bad = true;
401  break;
402  }
403 
404  string_view first_digits = first.substr(preflen);
405  string_view last_digits = last.substr(preflen);
406  if (! all_of(first_digits.begin(), first_digits.end(), IsDigit) ||
407  ! all_of(last_digits.begin(), last_digits.end(), IsDigit)) {
408  bad = true;
409  break;
410  }
411 
412  auto num1 = NStr::StringToInt(first_digits, NStr::fConvErr_NoThrow);
413  auto num2 = NStr::StringToInt(last_digits, NStr::fConvErr_NoThrow);
414  if (num2 < num1) {
415  msg_issued = true;
416  ErrPostEx(SEV_REJECT, ERR_ACCESSION_Invalid2ndAccRange, "Invalid start/end values in secondary accession range \"%s\".", token.c_str());
417  bad = true;
418  break;
419  }
420 
421  // cut in half
422  string tmp(last);
423  tbp->resize(dash);
424  tbp = tokens.insert_after(tbp, "-");
425  tbp = tokens.insert_after(tbp, tmp);
426  tsbp->num += 2;
427  }
428  if (! bad)
429  return true;
430  if (! msg_issued) {
431  ErrPostEx(SEV_REJECT, ERR_ACCESSION_Invalid2ndAccRange, "Incorrect secondary accession range provided: \"%s\".", tbp->c_str());
432  }
433  return false;
434 }
435 
436 /**********************************************************
437  *
438  * TokenStatBlkPtr TokenString(str, delimiter):
439  *
440  * Parsing string "str" by delimiter or tab key, blank.
441  * Parsing stop at newline ('\n') or end of string ('\0').
442  * Return a statistics of link list token.
443  *
444  **********************************************************/
445 unique_ptr<TokenStatBlk> TokenString(const char* str, Char delimiter)
446 {
447  const char* bptr;
448  const char* ptr;
449  Int2 num;
450  TokenStatBlkPtr token;
451 
452  token = new TokenStatBlk;
453  auto tail = token->list.before_begin();
454 
455  /* skip first several delimiters if any existed
456  */
457  for (ptr = str; *ptr == delimiter;)
458  ptr++;
459 
460  for (num = 0; *ptr != '\0' && *ptr != '\r' && *ptr != '\n';) {
461  for (bptr = ptr; *ptr != delimiter && *ptr != '\r' && *ptr != '\n' &&
462  *ptr != '\t' && *ptr != ' ' && *ptr != '\0';)
463  ptr++;
464 
465  tail = token->list.insert_after(tail, string(bptr, ptr));
466  num++;
467 
468  while (*ptr == delimiter || *ptr == '\t' || *ptr == ' ')
469  ptr++;
470  }
471 
472  token->num = num;
473 
474  return unique_ptr<TokenStatBlk>(token);
475 }
476 
477 /**********************************************************
478  *
479  * Int2 fta_StringMatch(array, text):
480  *
481  * Return array position of the matched length
482  * of string in array.
483  * Return -1 if no match.
484  *
485  **********************************************************/
486 Int2 fta_StringMatch(const Char** array, string_view text)
487 {
488  Int2 i;
489 
490  for (i = 0; *array; i++, array++) {
492  return i;
493  }
494 
495  return -1;
496 }
497 
498 /**********************************************************
499  *
500  * Int2 StringMatchIcase(array, text):
501  *
502  * Return array position of the matched length of
503  * string (ignored case) in array.
504  * Return -1 if no match.
505  *
506  **********************************************************/
507 Int2 StringMatchIcase(const Char** array, string_view text)
508 {
509  Int2 i;
510 
511  for (i = 0; *array; i++, array++) {
512  // If string from an array is empty its length == 0 and would be equal to any other string
513  // The next 'if' statement will avoid that behavior
514  if (! text.empty() && *array[0] == 0)
515  continue;
516 
518  return i;
519  }
520 
521  return -1;
522 }
523 
524 /**********************************************************
525  *
526  * Int2 MatchArrayString(array, text):
527  *
528  * Return array position of the string in the
529  * array.
530  * Return -1 if no match.
531  *
532  **********************************************************/
533 Int2 MatchArrayString(const char** array, const char* text)
534 {
535  Int2 i;
536 
537  if (! text)
538  return (-1);
539 
540  for (i = 0; *array; i++, array++) {
541  if (NStr::Equal(*array, text))
542  return i;
543  }
544 
545  return -1;
546 }
547 
548 /**********************************************************/
550 {
551  Int2 i;
552 
553  if (! text)
554  return (-1);
555 
556  for (i = 0; *array; i++, array++) {
557  // If string from an array is empty its length == 0 and would be equal to any other string
558  // The next 'if' statement will avoid that behavior
559  if (text[0] != 0 && *array[0] == 0)
560  continue;
561 
563  return i;
564  }
565 
566  return -1;
567 }
568 
569 /**********************************************************
570  *
571  * Int2 MatchArraySubString(array, text):
572  *
573  * Return array position of the string in the array
574  * if any array is in the substring of "text".
575  * Return -1 if no match.
576  *
577  **********************************************************/
578 Int2 MatchArraySubString(const Char** array, string_view text)
579 {
580  Int2 i;
581 
582  for (i = 0; *array; i++, array++) {
583  if (NStr::Find(text, *array) != NPOS)
584  return i;
585  }
586 
587  return -1;
588 }
589 
590 /**********************************************************/
591 Char* StringIStr(const Char* where, const Char* what)
592 {
593  const Char* p;
594  const Char* q;
595 
596  if (! where || *where == '\0' || ! what || *what == '\0')
597  return nullptr;
598 
599  q = nullptr;
600  for (; *where != '\0'; where++) {
601  for (q = what, p = where; *q != '\0' && *p != '\0'; q++, p++) {
602  if (*q == *p)
603  continue;
604 
605  if (*q >= 'A' && *q <= 'Z') {
606  if (*q + 32 == *p)
607  continue;
608  } else if (*q >= 'a' && *q <= 'z') {
609  if (*q - 32 == *p)
610  continue;
611  }
612  break;
613  }
614  if (*p == '\0' || *q == '\0')
615  break;
616  }
617  if (q && *q == '\0')
618  return const_cast<char*>(where);
619  return nullptr;
620 }
621 
622 /**********************************************************/
623 Int2 MatchArrayISubString(const Char** array, string_view text)
624 {
625  Int2 i;
626 
627  for (i = 0; *array; i++, array++) {
628  if (NStr::FindNoCase(text, *array) != NPOS)
629  return i;
630  }
631 
632  return -1;
633 }
634 
635 /**********************************************************
636  *
637  * char* GetBlkDataReplaceNewLine(bptr, eptr,
638  * start_col_data):
639  *
640  * Return a string which replace newline to blank
641  * and skip "XX" line data.
642  *
643  **********************************************************/
644 string GetBlkDataReplaceNewLine(string_view instr, Uint2 indent)
645 {
646  vector<string> lines;
647  NStr::Split(instr, "\n", lines);
648  string replaced;
649  for (auto line : lines) {
650  if (line.empty() || NStr::StartsWith(line, "XX") || line.size() <= indent) {
651  continue;
652  }
653  replaced += line.substr(indent);
654  auto last = line.size() - 1;
655  if (line[last] != '-') {
656  replaced += ' ';
657  } else if (line[last - 1] == ' ') {
658  replaced += ' ';
659  }
660  }
661  NStr::TruncateSpacesInPlace(replaced);
662  return replaced;
663 }
664 
665 
666 /**********************************************************/
667 static size_t SeekLastAlphaChar(const Char* str, size_t len)
668 {
669  if (str && len != 0) {
670  for (size_t ret = len; ret > 0;) {
671  char c = str[--ret];
672  if (c != ' ' && c != '\n' && c != '\\' && c != ',' &&
673  c != ';' && c != '~' && c != '.' && c != ':') {
674  return ret + 1;
675  }
676  }
677  }
678 
679  return 0;
680 }
681 
682 /**********************************************************/
684 {
685  size_t ret = SeekLastAlphaChar(str.c_str(), str.size());
686  str = str.substr(0, ret);
687 }
688 
689 /**********************************************************
690  *
691  * void CleanTailNoneAlphaChar(str):
692  *
693  * Delete any tailing ' ', '\n', '\\', ',', ';', '~',
694  * '.', ':' characters.
695  *
696  **********************************************************/
698 {
699  if (! str || *str == '\0')
700  return;
701 
702  size_t last = SeekLastAlphaChar(str, strlen(str));
703  str[last] = '\0';
704 }
705 
/**********************************************************
 *
 *   char* PointToNextToken(ptr):
 *
 *      Skip the current blank-delimited token and the
 *   blanks that follow it.
 *      Return a pointer to the next token, or to the
 *   terminating '\0' if there is none. A null "ptr" is
 *   returned unchanged.
 *
 **********************************************************/
char* PointToNextToken(char* ptr)
{
    if (ptr) {
        // Stop at the end of the string as well as at a blank, so a final
        // token is never scanned past its terminator.
        while (*ptr != '\0' && *ptr != ' ')
            ptr++;
        while (*ptr == ' ')
            ptr++;
    }
    return ptr;
}
717 
718 /**********************************************************
719  *
720  * char* GetTheCurrentToken(ptr):
721  *
722  * Return the current token (also CleanTailNoneAlphaChar)
723  * which ptr points to and ptr will points to next token
724  * after the routine return.
725  *
726  **********************************************************/
727 char* GetTheCurrentToken(char** ptr)
728 {
729  char* retptr;
730  char* bptr;
731  char* str;
732 
733  bptr = retptr = *ptr;
734  if (! retptr || *retptr == '\0')
735  return nullptr;
736 
737  while (*retptr != '\0' && *retptr != ' ')
738  retptr++;
739 
740  str = StringSave(string_view(bptr, retptr - bptr));
741 
742  while (*retptr != '\0' && *retptr == ' ') /* skip blanks */
743  retptr++;
744  *ptr = retptr;
745 
747  return (str);
748 }
749 
750 /**********************************************************
751  *
752  * char* SrchTheChar(bptr, eptr, letter):
753  *
754  * Search The character letter.
755  * Return NULL if not found; otherwise, return
756  * a pointer points first occurrence The character.
757  *
758  **********************************************************/
759 char* SrchTheChar(char* bptr, char* eptr, Char letter)
760 {
761  while (bptr < eptr && *bptr != letter)
762  bptr++;
763 
764  if (bptr < eptr)
765  return (bptr);
766 
767  return nullptr;
768 }
769 
770 /**********************************************************
771  *
772  * char* SrchTheStr(bptr, eptr, leadstr):
773  *
774  * Search The leading string.
775  * Return NULL if not found; otherwise, return
776  * a pointer points first occurrence The leading string.
777  *
778  **********************************************************/
779 char* SrchTheStr(char* bptr, char* eptr, const char* leadstr)
780 {
781  char* p;
782  Char c;
783 
784  c = *eptr;
785  *eptr = '\0';
786  p = StringStr(bptr, leadstr);
787  *eptr = c;
788  return (p);
789 }
790 
791 /**********************************************************/
792 void CpSeqId(InfoBioseqPtr ibp, const CSeq_id& id)
793 {
794  const CTextseq_id* text_id = id.GetTextseq_Id();
795  if (text_id) {
796  if (text_id->IsSetName())
797  ibp->mLocus = text_id->GetName();
798 
799  CRef<CSeq_id> new_id(new CSeq_id);
800  if (text_id->IsSetAccession()) {
801  ibp->mAccNum = text_id->GetAccession();
802 
803  CRef<CTextseq_id> new_text_id(new CTextseq_id);
804  new_text_id->SetAccession(text_id->GetAccession());
805  if (text_id->IsSetVersion())
806  new_text_id->SetVersion(text_id->GetVersion());
807 
808  SetTextId(id.Which(), *new_id, *new_text_id);
809  } else {
810  new_id->Assign(id);
811  }
812 
813  ibp->ids.push_back(new_id);
814  } else {
815  auto pId = Ref(new CSeq_id());
816  pId->Assign(id);
817  ibp->ids.push_back(std::move(pId));
818  }
819 }
820 
821 /**********************************************************
822  *
823  * CRef<CDate_std> get_full_date(s, is_ref, source):
824  *
825  * Get year, month, day and return CRef<CDate_std>.
826  *
827  **********************************************************/
829 {
830  CRef<CDate_std> date;
831 
832  if (! s || *s == '\0')
833  return date;
834 
835  int parse_day = 0;
836  if (isdigit(*s) != 0) {
837  parse_day = atoi(s);
838  s += 3;
839  // should we make at least a token effort of validation (like <32)?
840  }
841 
842  static const vector<string> months{
843  "JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"
844  };
845  CTempString maybe_month(s, 3);
846  auto it = find(months.begin(), months.end(), maybe_month);
847  if (it == months.end()) {
848  char msg[11];
849  StringNCpy(msg, s, 10);
850  msg[10] = '\0';
851  is_ref ? ErrPostEx(
852  SEV_WARNING, ERR_REFERENCE_IllegalDate, "Unrecognized month: %s", msg)
853  : ErrPostEx(
854  SEV_WARNING, ERR_DATE_IllegalDate, "Unrecognized month: %s", msg);
855  return date;
856  }
857  int parse_month = int(it - months.begin()) + 1;
858 
859  s += 4;
860 
861  int parse_year = atoi(s);
862  int cur_year = CCurrentTime().Year();
863  if (1900 <= parse_year && parse_year <= cur_year) {
864  // all set
865  } else if (0 <= parse_year && parse_year <= 99 && '0' <= s[1] && s[1] <= '9') {
866  // insist that short form year has exactly two digits
867  (parse_year < 70) ? (parse_year += 2000) : (parse_year += 1900);
868  } else {
869  if (is_ref) {
870  ErrPostEx(
871  SEV_ERROR, ERR_REFERENCE_IllegalDate, "Illegal year: %d, current year: %d", parse_year, cur_year);
872  } else if (source != Parser::ESource::SPROT || parse_year - cur_year > 1) {
873  ErrPostEx(
874  SEV_WARNING, ERR_DATE_IllegalDate, "Illegal year: %d, current year: %d", parse_year, cur_year);
875  }
876  // treat bad year like bad month above:
877  return date;
878  }
879  date.Reset(new CDate_std);
880  date->SetYear(parse_year);
881  date->SetMonth(parse_month);
882  date->SetDay(parse_day);
883 
884  return date;
885 }
886 
887 /**********************************************************
888  *
889  * int SrchKeyword(ptr, kwl):
890  *
891  * Compare first kwl.len byte in ptr to kwl.str.
892  * Return the position of keyword block array;
893  * return unknown keyword, UNKW, if not found.
894  *
895  * 3-25-93
896  *
897  **********************************************************/
898 int SrchKeyword(const CTempString& ptr, const vector<string>& keywordList)
899 {
900  SIZE_TYPE keywordCount = keywordList.size();
901 
902  for (unsigned i = 0; i < keywordCount; ++i) {
903  if (NStr::StartsWith(ptr, keywordList[i])) {
904  return (int)i;
905  }
906  }
907  return ParFlat_UNKW;
908 }
909 
910 /**********************************************************/
911 bool CheckLineType(char* ptr, Int4 line, const vector<string>& keywordList, bool after_origin)
912 {
913  char* p;
914  Char msg[51];
915 
916  if (after_origin) {
917  for (p = ptr; *p >= '0' && *p <= '9';)
918  p++;
919  if (*p == ' ')
920  return true;
921  }
922 
923  auto keywordCount = keywordList.size();
924  for (unsigned i = 0; i < keywordCount; i++) {
925  auto keyword = keywordList[i];
926  if (StringEquN(ptr, keyword.c_str(), keyword.size()))
927  return true;
928  }
929 
930  StringNCpy(msg, ptr, 50);
931  msg[50] = '\0';
932  p = StringChr(msg, '\n');
933  if (p)
934  *p = '\0';
935  ErrPostEx(SEV_ERROR, ERR_ENTRY_InvalidLineType, "Unknown linetype \"%s\". Line number %d.", msg, line);
936  if (p)
937  *p = '\n';
938 
939  return false;
940 }
941 
942 /**********************************************************
943  *
944  * char* SrchNodeType(entry, type, len):
945  *
946  * Return a memory location of the node which has
947  * the "type".
948  *
949  **********************************************************/
950 char* SrchNodeType(DataBlkPtr entry, Int4 type, size_t* len)
951 {
952  DataBlkPtr temp;
953 
954  temp = TrackNodeType(*entry, (Int2)type);
955  if (temp) {
956  *len = temp->len;
957  return (temp->mOffset);
958  }
959 
960  *len = 0;
961  return nullptr;
962 }
963 
964 char* xSrchNodeType(const DataBlk& entry, Int4 type, size_t* len)
965 {
966  DataBlkPtr temp;
967 
968  temp = TrackNodeType(entry, (Int2)type);
969  if (temp) {
970  *len = temp->len;
971  return (temp->mOffset);
972  }
973 
974  *len = 0;
975  return nullptr;
976 }
977 
978 string xGetNodeData(const DataBlk& entry, int nodeType)
979 {
980  auto tmp = TrackNodeType(entry, (Int2)nodeType);
981  if (! tmp) {
982  return "";
983  }
984  return string(tmp->mOffset, tmp->len);
985 }
986 
987 /**********************************************************
988  *
989  * DataBlkPtr TrackNodeType(entry, type):
990  *
991  * Return a pointer points to the Node which has
992  * the "type".
993  *
994  **********************************************************/
996 {
997  DataBlkPtr temp;
998  EntryBlkPtr ebp;
999 
1000  ebp = static_cast<EntryBlk*>(entry.mpData);
1001  temp = ebp->chain;
1002  while (temp && temp->mType != type)
1003  temp = temp->mpNext;
1004 
1005  return (temp);
1006 }
1007 
1008 
1009 const Section* xTrackNodeType(const Entry& entry, int type)
1010 {
1011  for (const Section* sectionPtr : entry.mSections) {
1012  if (sectionPtr->mType == type) {
1013  return sectionPtr;
1014  }
1015  }
1016  return nullptr;
1017 }
1018 
1019 
1020 /**********************************************************/
1022 {
1023  const char* b[4];
1024 
1025  bool kwd_tpa = false;
1026  bool kwd_party = false;
1027  bool kwd_inf = false;
1028  bool kwd_exp = false;
1029  bool kwd_asm = false;
1030  bool kwd_spedb = false;
1031  bool ret = true;
1032 
1033  Int4 j;
1034  Int2 i;
1035 
1036  if (kwds.empty())
1037  return true;
1038 
1039  size_t len = 0;
1040  j = 0;
1041  for (const string& key : kwds) {
1042  if (key.empty())
1043  continue;
1044 
1045  const char* p = key.c_str();
1047  if (i == 0)
1048  kwd_tpa = true;
1049  else if (i == 1 || i == 2)
1050  kwd_party = true;
1051  else if (i == 3)
1052  kwd_inf = true;
1053  else if (i == 4)
1054  kwd_exp = true;
1055  else if (i == 5 || i == 6)
1056  kwd_asm = true;
1057  else if (i == 7)
1058  kwd_spedb = true;
1059  else if (NStr::EqualNocase(p, 0, 3, "TPA")) {
1060  if (p[3] == ':') {
1061  ErrPostEx(SEV_REJECT, ERR_KEYWORD_InvalidTPATier, "Keyword \"%s\" is not a valid TPA-tier keyword.", p);
1062  ret = false;
1063  } else if (p[3] != '\0' && p[4] != '\0') {
1064  ErrPostEx(SEV_WARNING, ERR_KEYWORD_UnexpectedTPA, "Keyword \"%s\" looks like it might be TPA-related, but it is not a recognized TPA keyword.", p);
1065  }
1066  }
1067  if (i > 2 && i < 8 && j < 4) {
1068  b[j] = p;
1069  ++j;
1070  len += key.size() + 1;
1071  }
1072  }
1073 
1074  if (kwd_tpa && ! kwd_party) {
1075  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTPAKeywords, "This TPA-record should have keyword \"Third Party Annotation\" or \"Third Party Data\" in addition to \"TPA\".");
1076  ret = false;
1077  } else if (! kwd_tpa && kwd_party) {
1078  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTPAKeywords, "This TPA-record should have keyword \"TPA\" in addition to \"Third Party Annotation\" or \"Third Party Data\".");
1079  ret = false;
1080  }
1081  if (! kwd_tpa && (kwd_inf || kwd_exp)) {
1082  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTPAKeywords, "This TPA-record should have keyword \"TPA\" in addition to its TPA-tier keyword.");
1083  ret = false;
1084  } else if (kwd_tpa && kwd_inf == false && kwd_exp == false &&
1085  kwd_asm == false && kwd_spedb == false) {
1086  ErrPostEx(SEV_ERROR, ERR_KEYWORD_MissingTPATier, "This TPA record lacks a keyword to indicate which tier it belongs to: experimental, inferential, reassembly or specialist_db.");
1087  }
1088  if (j > 1) {
1089  string buf;
1090  for (i = 0; i < j; i++) {
1091  if (i > 0)
1092  buf += ';';
1093  buf += b[i];
1094  }
1095  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ConflictingTPATiers, "Keywords for multiple TPA tiers exist on this record: \"%s\". A TPA record can only be in one tier.", buf.c_str());
1096  ret = false;
1097  }
1098 
1099  return (ret);
1100 }
1101 
1102 /**********************************************************/
1104 {
1105  bool kwd_tsa = false;
1106  bool kwd_assembly = false;
1107  bool ret = true;
1108  Int2 i;
1109 
1110  if (kwds.empty())
1111  return true;
1112 
1113  for (const string& key : kwds) {
1114  if (key.empty())
1115  continue;
1117  if (i == 0)
1118  kwd_tsa = true;
1119  else if (i == 1)
1120  kwd_assembly = true;
1121  else if (source == Parser::ESource::EMBL &&
1122  NStr::EqualNocase(key, "Transcript Shotgun Assembly"))
1123  kwd_assembly = true;
1124  }
1125 
1126  if (kwd_tsa && ! kwd_assembly) {
1127  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTSAKeywords, "This TSA-record should have keyword \"Transcriptome Shotgun Assembly\" in addition to \"TSA\".");
1128  ret = false;
1129  } else if (! kwd_tsa && kwd_assembly) {
1130  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTSAKeywords, "This TSA-record should have keyword \"TSA\" in addition to \"Transcriptome Shotgun Assembly\".");
1131  ret = false;
1132  }
1133  return (ret);
1134 }
1135 
1136 /**********************************************************/
1138 {
1139  bool kwd_tls = false;
1140  bool kwd_study = false;
1141  bool ret = true;
1142  Int2 i;
1143 
1144  if (kwds.empty())
1145  return true;
1146 
1147  for (const string& key : kwds) {
1148  if (key.empty())
1149  continue;
1151  if (i == 0)
1152  kwd_tls = true;
1153  else if (i == 1)
1154  kwd_study = true;
1155  else if (source == Parser::ESource::EMBL &&
1156  NStr::EqualNocase(key, "Targeted Locus Study"))
1157  kwd_study = true;
1158  }
1159 
1160  if (kwd_tls && ! kwd_study) {
1161  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTLSKeywords, "This TLS-record should have keyword \"Targeted Locus Study\" in addition to \"TLS\".");
1162  ret = false;
1163  } else if (! kwd_tls && kwd_study) {
1164  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTLSKeywords, "This TLS-record should have keyword \"TLS\" in addition to \"Targeted Locus Study\".");
1165  ret = false;
1166  }
1167  return (ret);
1168 }
1169 
1170 /**********************************************************/
1171 bool fta_is_tpa_keyword(const char* str)
1172 {
1173  if (! str || *str == '\0' || MatchArrayIString(ParFlat_TPA_kw_array, str) < 0)
1174  return false;
1175 
1176  return true;
1177 }
1178 
1179 /**********************************************************/
1180 bool fta_is_tsa_keyword(const char* str)
1181 {
1182  if (! str || *str == '\0' || MatchArrayIString(ParFlat_TSA_kw_array, str) < 0)
1183  return false;
1184  return true;
1185 }
1186 
1187 /**********************************************************/
1188 bool fta_is_tls_keyword(const char* str)
1189 {
1190  if (! str || *str == '\0' || MatchArrayIString(ParFlat_TLS_kw_array, str) < 0)
1191  return false;
1192  return true;
1193 }
1194 
1195 /**********************************************************/
1196 void fta_keywords_check(const char* str, bool* estk, bool* stsk, bool* gssk, bool* htck, bool* flik, bool* wgsk, bool* tpak, bool* envk, bool* mgak, bool* tsak, bool* tlsk)
1197 {
1198  if (estk && MatchArrayString(ParFlat_EST_kw_array, str) != -1)
1199  *estk = true;
1200 
1201  if (stsk && MatchArrayString(ParFlat_STS_kw_array, str) != -1)
1202  *stsk = true;
1203 
1204  if (gssk && MatchArrayString(ParFlat_GSS_kw_array, str) != -1)
1205  *gssk = true;
1206 
1207  if (htck && MatchArrayString(ParFlat_HTC_kw_array, str) != -1)
1208  *htck = true;
1209 
1210  if (flik && MatchArrayString(ParFlat_FLI_kw_array, str) != -1)
1211  *flik = true;
1212 
1213  if (wgsk && MatchArrayString(ParFlat_WGS_kw_array, str) != -1)
1214  *wgsk = true;
1215 
1216  if (tpak && MatchArrayString(ParFlat_TPA_kw_array, str) != -1)
1217  *tpak = true;
1218 
1219  if (envk && MatchArrayString(ParFlat_ENV_kw_array, str) != -1)
1220  *envk = true;
1221 
1222  if (mgak && MatchArrayString(ParFlat_MGA_kw_array, str) != -1)
1223  *mgak = true;
1224 
1225  if (tsak && MatchArrayString(ParFlat_TSA_kw_array, str) != -1)
1226  *tsak = true;
1227 
1228  if (tlsk && MatchArrayString(ParFlat_TLS_kw_array, str) != -1)
1229  *tlsk = true;
1230 }
1231 
1232 /**********************************************************/
1234 {
1235  const char** b;
1236 
1237  if (kwds.empty())
1238  return;
1239 
1240  if (tech == CMolInfo::eTech_est)
1242  else if (tech == CMolInfo::eTech_sts)
1244  else if (tech == CMolInfo::eTech_survey)
1246  else if (tech == CMolInfo::eTech_htc)
1248  else if (tech == CMolInfo::eTech_fli_cdna)
1250  else if (tech == CMolInfo::eTech_wgs)
1252  else
1253  return;
1254 
1255  for (TKeywordList::iterator key = kwds.begin(); key != kwds.end();) {
1256  if (key->empty() || MatchArrayString(b, key->c_str()) != -1) {
1257  key = kwds.erase(key);
1258  } else
1259  ++key;
1260  }
1261 }
1262 
1263 /**********************************************************/
1265 {
1266  if (kwds.empty())
1267  return;
1268 
1269  for (TKeywordList::iterator key = kwds.begin(); key != kwds.end();) {
1270  if (key->empty() || MatchArrayIString(ParFlat_TPA_kw_array_to_remove, key->c_str()) != -1) {
1271  key = kwds.erase(key);
1272  } else
1273  ++key;
1274  }
1275 }
1276 
1277 /**********************************************************/
1279 {
1280  if (kwds.empty())
1281  return;
1282 
1283  for (TKeywordList::iterator key = kwds.begin(); key != kwds.end();) {
1284  if (key->empty() || MatchArrayIString(ParFlat_TSA_kw_array, key->c_str()) != -1 ||
1285  (source == Parser::ESource::EMBL && NStr::EqualNocase(*key, "Transcript Shotgun Assembly"))) {
1286  key = kwds.erase(key);
1287  } else
1288  ++key;
1289  }
1290 }
1291 
1292 /**********************************************************/
1294 {
1295  if (kwds.empty())
1296  return;
1297 
1298  for (TKeywordList::iterator key = kwds.begin(); key != kwds.end();) {
1299  if (key->empty() || MatchArrayIString(ParFlat_TLS_kw_array, key->c_str()) != -1 ||
1300  (source == Parser::ESource::EMBL && NStr::EqualNocase(*key, "Targeted Locus Study"))) {
1301  key = kwds.erase(key);
1302  } else
1303  ++key;
1304  }
1305 }
1306 
1307 /**********************************************************/
1309 {
1310  if (kwds.empty())
1311  return;
1312 
1313  for (TKeywordList::iterator key = kwds.begin(); key != kwds.end();) {
1314  if (key->empty() || MatchArrayIString(ParFlat_ENV_kw_array, key->c_str()) != -1) {
1315  key = kwds.erase(key);
1316  } else
1317  ++key;
1318  }
1319 }
1320 
1321 /**********************************************************/
1323 {
1324  if (kwds.empty())
1325  return;
1326 
1327  for (TKeywordList::iterator key = kwds.begin(); key != kwds.end();) {
1328  if (key->empty() || MatchArrayIString(ParFlat_MAG_kw_array, key->c_str()) != -1) {
1329  key = kwds.erase(key);
1330  } else
1331  ++key;
1332  }
1333 }
1334 
1335 /**********************************************************/
1337  const list<string> keywordList,
1338  bool tpa_check,
1339  IndexblkPtr entry
1340  // bool& specialist_db,
1341  // bool& inferential,
1342  // bool& experimental,
1343  // bool& assembly
1344 )
1345 {
1346  if (keywordList.empty()) {
1347  return;
1348  }
1349  for (auto keyword : keywordList) {
1351  keyword.c_str(), &entry->EST, &entry->STS, &entry->GSS, &entry->HTC, nullptr, nullptr, (tpa_check ? &entry->is_tpa : nullptr), nullptr, nullptr, nullptr, nullptr);
1352  if (NStr::EqualNocase(keyword, "TPA:assembly")) {
1353  entry->specialist_db = true;
1354  entry->assembly = true;
1355  continue;
1356  }
1357  if (NStr::EqualNocase(keyword, "TPA:specialist_db")) {
1358  entry->specialist_db = true;
1359  continue;
1360  }
1361  if (NStr::EqualNocase(keyword, "TPA:inferential")) {
1362  entry->inferential = true;
1363  continue;
1364  }
1365  if (NStr::EqualNocase(keyword, "TPA:experimental")) {
1366  entry->experimental = true;
1367  continue;
1368  }
1369  }
1370 }
1371 
1372 void check_est_sts_gss_tpa_kwds(ValNodePtr kwds, size_t len, IndexblkPtr entry, bool tpa_check, bool& specialist_db, bool& inferential, bool& experimental, bool& assembly)
1373 {
1374  char* line;
1375  char* p;
1376  char* q;
1377 
1378  if (! kwds || ! kwds->data || len < 1)
1379  return;
1380 
1381  line = StringNew(len);
1382  line[0] = '\0';
1383  for (; kwds; kwds = kwds->next) {
1384  StringCat(line, kwds->data);
1385  }
1386  for (p = line; *p != '\0'; p++)
1387  if (*p == '\n' || *p == '\t')
1388  *p = ' ';
1389  for (p = line; *p == ' ' || *p == '.' || *p == ';';)
1390  p++;
1391  if (*p == '\0') {
1392  MemFree(line);
1393  return;
1394  }
1395  for (q = p; *q != '\0';)
1396  q++;
1397  for (q--; *q == ' ' || *q == '.' || *q == ';'; q--)
1398  *q = '\0';
1399  for (q = p, p = line; *q != '\0';) {
1400  if (*q != ' ' && *q != ';') {
1401  *p++ = *q++;
1402  continue;
1403  }
1404  if (*q == ' ') {
1405  for (q++; *q == ' ';)
1406  q++;
1407  if (*q != ';')
1408  *p++ = ' ';
1409  }
1410  if (*q == ';') {
1411  *p++ = *q++;
1412  while (*q == ' ' || *q == ';')
1413  q++;
1414  }
1415  }
1416  *p++ = ';';
1417  *p = '\0';
1418  for (p = line;; p = q + 1) {
1419  q = StringChr(p, ';');
1420  if (! q)
1421  break;
1422  *q = '\0';
1423  fta_keywords_check(p, &entry->EST, &entry->STS, &entry->GSS, &entry->HTC, nullptr, nullptr, (tpa_check ? &entry->is_tpa : nullptr), nullptr, nullptr, nullptr, nullptr);
1424  if (NStr::EqualNocase(p, "TPA:specialist_db") ||
1425  NStr::EqualNocase(p, "TPA:assembly")) {
1426  specialist_db = true;
1427  if (NStr::EqualNocase(p, "TPA:assembly"))
1428  assembly = true;
1429  } else if (NStr::EqualNocase(p, "TPA:inferential"))
1430  inferential = true;
1431  else if (NStr::EqualNocase(p, "TPA:experimental"))
1432  experimental = true;
1433  }
1434  MemFree(line);
1435 }
1436 
1437 /**********************************************************/
1439 {
1440  ValNodePtr res;
1441 
1442  res = ValNodeNew(nullptr, data);
1443  res->choice = choice;
1444  return (res);
1445 }
1446 
1447 /**********************************************************/
1448 bool fta_check_mga_keywords(CMolInfo& mol_info, const TKeywordList& kwds)
1449 {
1450  bool is_cage;
1451  bool is_sage;
1452 
1453  TKeywordList::const_iterator key_it = kwds.end();
1454 
1455  bool got = false;
1456  if (! kwds.empty() && NStr::EqualNocase(kwds.front(), "MGA")) {
1457  for (TKeywordList::const_iterator key = kwds.begin(); key != kwds.end(); ++key) {
1459  key->c_str()) < 0)
1460  continue;
1461  got = true;
1462  key_it = key;
1463  break;
1464  }
1465  }
1466 
1467  if (! got) {
1468  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingMGAKeywords, "This is apparently a CAGE record, but it lacks the required keywords. Entry dropped.");
1469  return false;
1470  }
1471 
1472  if (! mol_info.IsSetTechexp() || ! kwds.empty() ||
1473  mol_info.GetTechexp() != "cage")
1474  return true;
1475 
1476  for (is_sage = false, is_cage = false; key_it != kwds.end(); ++key_it) {
1477  const char* p = key_it->c_str();
1478 
1479  if (NStr::EqualNocase(p, "5'-SAGE"))
1480  is_sage = true;
1481  else if (NStr::EqualNocase(p, "CAGE (Cap Analysis Gene Expression)"))
1482  is_cage = true;
1483  }
1484 
1485  if (is_sage) {
1486  if (is_cage) {
1487  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ConflictingMGAKeywords, "This MGA record contains more than one of the special keywords indicating different techniques.");
1488  return false;
1489  }
1490  mol_info.SetTechexp("5'-sage");
1491  }
1492 
1493  return true;
1494 }
1495 
1496 /**********************************************************/
/*
 * Copies the NUL-terminated string src to dst one character at a time,
 * front to back, and terminates dst.
 *
 * Because the copy runs strictly forward, it also works for shifting a
 * string towards the start of its own buffer (dst pointing before src in
 * the same array). dst must have room for the string and its terminator.
 */
void fta_StringCpy(char* dst, const char* src)
{
    while (*src != '\0')
        *dst++ = *src++;
    *dst = '\0';
}
1506 
1507 /**********************************************************/
1508 bool SetTextId(Uint1 seqtype, CSeq_id& seqId, CTextseq_id& textId)
1509 {
1510  bool wasSet = true;
1511 
1512  switch (seqtype) {
1513  case CSeq_id::e_Genbank:
1514  seqId.SetGenbank(textId);
1515  break;
1516  case CSeq_id::e_Embl:
1517  seqId.SetEmbl(textId);
1518  break;
1519  case CSeq_id::e_Pir:
1520  seqId.SetPir(textId);
1521  break;
1522  case CSeq_id::e_Swissprot:
1523  seqId.SetSwissprot(textId);
1524  break;
1525  case CSeq_id::e_Other:
1526  seqId.SetOther(textId);
1527  break;
1528  case CSeq_id::e_Ddbj:
1529  seqId.SetDdbj(textId);
1530  break;
1531  case CSeq_id::e_Prf:
1532  seqId.SetPrf(textId);
1533  break;
1534  case CSeq_id::e_Pdb: {
1535  // TODO: test this branch
1536  CPDB_seq_id pdbId;
1537  pdbId.SetChain_id();
1538  seqId.SetPdb(pdbId);
1539  } break;
1540  case CSeq_id::e_Tpg:
1541  seqId.SetTpg(textId);
1542  break;
1543  case CSeq_id::e_Tpe:
1544  seqId.SetTpe(textId);
1545  break;
1546  case CSeq_id::e_Tpd:
1547  seqId.SetTpd(textId);
1548  break;
1549  case CSeq_id::e_Gpipe:
1550  seqId.SetGpipe(textId);
1551  break;
1553  seqId.SetNamed_annot_track(textId);
1554  break;
1555 
1556  default:
1557  wasSet = false;
1558  }
1559 
1560  return wasSet;
1561 }
1562 
1563 /**********************************************************/
1564 bool IsCancelled(const TKeywordList& keywords)
1565 {
1566  for (const string& key : keywords) {
1567  if (NStr::EqualNocase(key, "HTGS_CANCELLED"))
1568  return true;
1569  }
1570 
1571  return false;
1572 }
1573 
1574 /**********************************************************/
1575 bool HasHtg(const TKeywordList& keywords)
1576 {
1577  for (const string& key : keywords) {
1578  if (key == "HTG" || key == "HTGS_PHASE0" ||
1579  key == "HTGS_PHASE1" || key == "HTGS_PHASE2" ||
1580  key == "HTGS_PHASE3") {
1581  return true;
1582  }
1583  }
1584 
1585  return false;
1586 }
1587 
1588 /**********************************************************/
1590 {
1591  for (TKeywordList::iterator key = keywords.begin(); key != keywords.end();) {
1592  const char* p = key->c_str();
1593  if (NStr::EqualNocase(p, 0, 10, "HTGS_PHASE") &&
1594  (p[10] == '0' || p[10] == '1' || p[10] == '2' ||
1595  p[10] == '3') &&
1596  p[11] == '\0') {
1597  key = keywords.erase(key);
1598  } else
1599  ++key;
1600  }
1601 }
1602 
1603 /**********************************************************/
1604 bool HasHtc(const TKeywordList& keywords)
1605 {
1606  for (const string& key : keywords) {
1607  if (NStr::EqualNocase(key, "HTC")) {
1608  return true;
1609  }
1610  }
1611 
1612  return false;
1613 }
1614 
CCurrentTime –.
Definition: ncbitime.hpp:1283
CScope –.
Definition: scope.hpp:92
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
char * mOffset
Definition: ftablock.h:329
size_t len
Definition: ftablock.h:330
CFlatFileData * mpData
Definition: ftablock.h:328
DataBlk * mpNext
Definition: ftablock.h:333
int mType
Definition: ftablock.h:327
The NCBI C++ standard methods for dealing with std::string.
#define ERR_REFERENCE_IllegalDate
Definition: flat2err.h:282
#define ERR_DATE_IllegalDate
Definition: flat2err.h:102
std::list< std::string > TKeywordList
Definition: ftablock.h:163
bool StringEquN(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:121
void StringNCpy(char *d, const char *s, size_t n)
Definition: ftacpp.hpp:90
void MemFree(char *p)
Definition: ftacpp.hpp:55
size_t StringLen(const char *s)
Definition: ftacpp.hpp:60
void StringCat(char *d, const char *s)
Definition: ftacpp.hpp:88
char * StringNew(size_t sz)
Definition: ftacpp.hpp:43
const char * months[]
Definition: ftaerr.cpp:118
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:51
static const char * str(char *buf, int n)
Definition: stats.c:84
static char tmp[3200]
Definition: utf8.c:42
char data[12]
Definition: iconv.c:80
#define SEV_WARNING
Definition: gicache.c:90
#define SEV_ERROR
Definition: gicache.c:91
#define SEV_REJECT
Definition: gicache.c:92
string
Definition: cgiapp.hpp:687
#define StringStr
Definition: ncbistr.hpp:322
#define StringSave
Definition: ncbistr.hpp:326
#define StringChr
Definition: ncbistr.hpp:317
#define ErrPostEx(sev, err_code,...)
Definition: ncbierr.hpp:78
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
Definition: Seq_id.cpp:318
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
Definition: ncbiobj.hpp:2015
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
char Char
Alias for char.
Definition: ncbitype.h:93
uint16_t Uint2
2-byte (16-bit) unsigned integer
Definition: ncbitype.h:101
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3461
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2993
#define NPOS
Definition: ncbistr.hpp:133
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
Definition: ncbistr.cpp:3201
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2891
static bool EqualCase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-sensitive equality of a substring with another string.
Definition: ncbistr.hpp:5325
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
static bool SplitInTwo(const CTempString str, const CTempString delim, string &str1, string &str2, TSplitFlags flags=0)
Split a string into two pieces using the specified delimiters.
Definition: ncbistr.cpp:3554
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5353
static bool Equal(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Test for equality of a substring with another string.
Definition: ncbistr.hpp:5384
@ fConvErr_NoThrow
Do not throw an exception on error.
Definition: ncbistr.hpp:285
int Year(void) const
Get year.
Definition: ncbitime.hpp:2265
list< string > TExtra_accessions
Definition: GB_block_.hpp:91
void SetYear(TYear value)
Assign a value to Year data member.
Definition: Date_std_.hpp:435
void SetMonth(TMonth value)
Assign a value to Month data member.
Definition: Date_std_.hpp:482
void SetDay(TDay value)
Assign a value to Day data member.
Definition: Date_std_.hpp:529
TNamed_annot_track & SetNamed_annot_track(void)
Select the variant.
Definition: Seq_id_.cpp:551
bool IsSetAccession(void) const
Check if a value has been assigned to Accession data member.
TEmbl & SetEmbl(void)
Select the variant.
Definition: Seq_id_.cpp:265
TOther & SetOther(void)
Select the variant.
Definition: Seq_id_.cpp:353
const TName & GetName(void) const
Get the Name member data.
TTpe & SetTpe(void)
Select the variant.
Definition: Seq_id_.cpp:485
TTpg & SetTpg(void)
Select the variant.
Definition: Seq_id_.cpp:463
TPir & SetPir(void)
Select the variant.
Definition: Seq_id_.cpp:287
TTpd & SetTpd(void)
Select the variant.
Definition: Seq_id_.cpp:507
TVersion GetVersion(void) const
Get the Version member data.
TGpipe & SetGpipe(void)
Select the variant.
Definition: Seq_id_.cpp:529
E_Choice
Choice variants.
Definition: Seq_id_.hpp:93
TDdbj & SetDdbj(void)
Select the variant.
Definition: Seq_id_.cpp:397
TPrf & SetPrf(void)
Select the variant.
Definition: Seq_id_.cpp:419
TGenbank & SetGenbank(void)
Select the variant.
Definition: Seq_id_.cpp:243
TSwissprot & SetSwissprot(void)
Select the variant.
Definition: Seq_id_.cpp:309
bool IsSetVersion(void) const
Check if a value has been assigned to Version data member.
void SetChain_id(const TChain_id &value)
Assign a value to Chain_id data member.
bool IsSetName(void) const
Check if a value has been assigned to Name data member.
const TAccession & GetAccession(void) const
Get the Accession member data.
TPdb & SetPdb(void)
Select the variant.
Definition: Seq_id_.cpp:441
@ e_Other
for historical reasons, 'other' = 'refseq'
Definition: Seq_id_.hpp:104
@ e_Gpipe
Internal NCBI genome pipeline processing ID.
Definition: Seq_id_.hpp:113
@ e_Tpe
Third Party Annot/Seq EMBL.
Definition: Seq_id_.hpp:111
@ e_Tpd
Third Party Annot/Seq DDBJ.
Definition: Seq_id_.hpp:112
@ e_Ddbj
DDBJ.
Definition: Seq_id_.hpp:107
@ e_Prf
PRF SEQDB.
Definition: Seq_id_.hpp:108
@ e_Named_annot_track
Internal named annotation tracking ID.
Definition: Seq_id_.hpp:114
@ e_Tpg
Third Party Annot/Seq Genbank.
Definition: Seq_id_.hpp:110
@ e_Pdb
PDB sequence.
Definition: Seq_id_.hpp:109
bool IsSetTechexp(void) const
explanation if tech not enough
Definition: MolInfo_.hpp:522
const TTechexp & GetTechexp(void) const
Get the Techexp member data.
Definition: MolInfo_.hpp:534
void SetTechexp(const TTechexp &value)
Assign a value to Techexp data member.
Definition: MolInfo_.hpp:543
@ eTech_htc
high throughput cDNA
Definition: MolInfo_.hpp:142
@ eTech_sts
Sequence Tagged Site.
Definition: MolInfo_.hpp:126
@ eTech_wgs
whole genome shotgun sequencing
Definition: MolInfo_.hpp:143
@ eTech_survey
one-pass genomic sequence
Definition: MolInfo_.hpp:127
@ eTech_fli_cdna
full length insert cDNA
Definition: MolInfo_.hpp:140
@ eTech_est
Expressed Sequence Tag.
Definition: MolInfo_.hpp:125
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
#define ERR_KEYWORD_MissingTPAKeywords
Definition: indx_err.h:111
#define ERR_ACCESSION_Invalid2ndAccRange
Definition: indx_err.h:71
#define ERR_ACCESSION_2ndAccPrefixMismatch
Definition: indx_err.h:70
#define ERR_KEYWORD_InvalidTPATier
Definition: indx_err.h:109
#define ERR_KEYWORD_UnexpectedTPA
Definition: indx_err.h:110
#define ERR_KEYWORD_MissingTSAKeywords
Definition: indx_err.h:114
#define ERR_KEYWORD_MissingTPATier
Definition: indx_err.h:112
#define ERR_KEYWORD_ConflictingTPATiers
Definition: indx_err.h:113
#define ERR_KEYWORD_MissingTLSKeywords
Definition: indx_err.h:117
#define ERR_ENTRY_InvalidLineType
Definition: indx_err.h:64
#define ERR_KEYWORD_MissingMGAKeywords
Definition: indx_err.h:115
#define ERR_KEYWORD_ConflictingMGAKeywords
Definition: indx_err.h:116
char * buf
int i
int len
static void text(MDB_val *v)
Definition: mdb_dump.c:62
const struct ncbi::grid::netcache::search::fields::KEY key
const CharType(& source)[N]
Definition: pointer.h:1149
int isdigit(Uchar c)
Definition: ncbictype.hpp:64
Defines: CTimeFormat - storage class for time format.
The Object manager core.
static const char * prefix[]
Definition: pcregrep.c:405
static const char delimiter[]
string indent(" ")
DataBlkPtr chain
Definition: ftablock.h:341
Definition: entry.h:57
list< SectionPtr > mSections
Definition: entry.h:99
bool assembly
Definition: ftablock.h:241
bool is_tpa
Definition: ftablock.h:206
bool STS
Definition: ftablock.h:193
bool HTC
Definition: ftablock.h:195
bool experimental
Definition: ftablock.h:247
bool inferential
Definition: ftablock.h:245
bool EST
Definition: ftablock.h:192
bool specialist_db
Definition: ftablock.h:243
bool GSS
Definition: ftablock.h:194
string mAccNum
Definition: ftablock.h:85
string mLocus
Definition: ftablock.h:84
TSeqIdList ids
Definition: ftablock.h:83
Definition: entry.h:13
int mType
Definition: entry.h:47
TokenBlkList list
Definition: ftablock.h:137
ValNode * next
Definition: valnode.h:51
char * data
Definition: valnode.h:49
unsigned char choice
Definition: valnode.h:47
Definition: type.c:6
static const char * ParFlat_TLS_kw_array[]
Definition: utilfun.cpp:139
USING_SCOPE(objects)
Int2 MatchArrayIString(const Char **array, const Char *text)
Definition: utilfun.cpp:549
CScope & GetScope()
Definition: utilfun.cpp:62
bool HasHtg(const TKeywordList &keywords)
Definition: utilfun.cpp:1575
int SrchKeyword(const CTempString &ptr, const vector< string > &keywordList)
Definition: utilfun.cpp:898
static const char * ParFlat_STS_kw_array[]
Definition: utilfun.cpp:87
bool HasHtc(const TKeywordList &keywords)
Definition: utilfun.cpp:1604
static const char * ParFlat_MGA_kw_array[]
Definition: utilfun.cpp:111
char * SrchTheChar(char *bptr, char *eptr, Char letter)
Definition: utilfun.cpp:759
static const char * ParFlat_MAG_kw_array[]
Definition: utilfun.cpp:172
bool fta_tls_keywords_check(const TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1137
void RemoveHtgPhase(TKeywordList &keywords)
Definition: utilfun.cpp:1589
bool fta_is_tsa_keyword(const char *str)
Definition: utilfun.cpp:1180
static bool sIsPrefixChar(char c)
Definition: utilfun.cpp:260
bool fta_is_tls_keyword(const char *str)
Definition: utilfun.cpp:1188
bool CheckLineType(char *ptr, Int4 line, const vector< string > &keywordList, bool after_origin)
Definition: utilfun.cpp:911
static const char * ParFlat_TPA_kw_array_to_remove[]
Definition: utilfun.cpp:160
bool SetTextId(Uint1 seqtype, CSeq_id &seqId, CTextseq_id &textId)
Definition: utilfun.cpp:1508
string GetBlkDataReplaceNewLine(string_view instr, Uint2 indent)
Definition: utilfun.cpp:644
Int2 StringMatchIcase(const Char **array, string_view text)
Definition: utilfun.cpp:507
void fta_remove_tsa_keywords(TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1278
void check_est_sts_gss_tpa_kwds(ValNodePtr kwds, size_t len, IndexblkPtr entry, bool tpa_check, bool &specialist_db, bool &inferential, bool &experimental, bool &assembly)
Definition: utilfun.cpp:1372
Int2 MatchArrayISubString(const Char **array, string_view text)
Definition: utilfun.cpp:623
void fta_remove_tpa_keywords(TKeywordList &kwds)
Definition: utilfun.cpp:1264
Int2 MatchArraySubString(const Char **array, string_view text)
Definition: utilfun.cpp:578
unique_ptr< TokenStatBlk > TokenString(const char *str, Char delimiter)
Definition: utilfun.cpp:445
static const char * ParFlat_FLI_kw_array[]
Definition: utilfun.cpp:101
CRef< CDate_std > get_full_date(const char *s, bool is_ref, Parser::ESource source)
Definition: utilfun.cpp:828
static const char * ParFlat_ENV_kw_array[]
Definition: utilfun.cpp:167
bool fta_is_tpa_keyword(const char *str)
Definition: utilfun.cpp:1171
void CleanTailNoneAlphaCharInString(string &str)
Definition: utilfun.cpp:683
static const char * ParFlat_TPA_kw_array[]
Definition: utilfun.cpp:148
const Section * xTrackNodeType(const Entry &entry, int type)
Definition: utilfun.cpp:1009
char * SrchNodeType(DataBlkPtr entry, Int4 type, size_t *len)
Definition: utilfun.cpp:950
static const char * ParFlat_TSA_kw_array[]
Definition: utilfun.cpp:130
void fta_remove_keywords(CMolInfo::TTech tech, TKeywordList &kwds)
Definition: utilfun.cpp:1233
static const char * ParFlat_MGA_more_kw_array[]
Definition: utilfun.cpp:118
void fta_remove_tls_keywords(TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1293
char * xSrchNodeType(const DataBlk &entry, Int4 type, size_t *len)
Definition: utilfun.cpp:964
string xGetNodeData(const DataBlk &entry, int nodeType)
Definition: utilfun.cpp:978
bool ParseAccessionRange(list< string > &tokens, unsigned skip)
Definition: utilfun.cpp:265
char * GetTheCurrentToken(char **ptr)
Definition: utilfun.cpp:727
Int2 fta_StringMatch(const Char **array, string_view text)
Definition: utilfun.cpp:486
void fta_keywords_check(const char *str, bool *estk, bool *stsk, bool *gssk, bool *htck, bool *flik, bool *wgsk, bool *tpak, bool *envk, bool *mgak, bool *tsak, bool *tlsk)
Definition: utilfun.cpp:1196
bool IsLeadPrefixChar(char c)
Definition: utilfun.cpp:340
static const char * ParFlat_WGS_kw_array[]
Definition: utilfun.cpp:106
void fta_StringCpy(char *dst, const char *src)
Definition: utilfun.cpp:1497
DataBlkPtr TrackNodeType(const DataBlk &entry, Int2 type)
Definition: utilfun.cpp:995
static const char * ParFlat_HTC_kw_array[]
Definition: utilfun.cpp:96
void fta_remove_mag_keywords(TKeywordList &kwds)
Definition: utilfun.cpp:1322
void CleanTailNoneAlphaChar(char *str)
Definition: utilfun.cpp:697
static size_t SeekLastAlphaChar(const Char *str, size_t len)
Definition: utilfun.cpp:667
bool IsCancelled(const TKeywordList &keywords)
Definition: utilfun.cpp:1564
Int2 MatchArrayString(const char **array, const char *text)
Definition: utilfun.cpp:533
static const char * ParFlat_GSS_kw_array[]
Definition: utilfun.cpp:80
static string FTAitoa(Int4 m)
Definition: utilfun.cpp:179
Char * StringIStr(const Char *where, const Char *what)
Definition: utilfun.cpp:591
bool fta_tsa_keywords_check(const TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1103
void CpSeqId(InfoBioseqPtr ibp, const CSeq_id &id)
Definition: utilfun.cpp:792
static const char * ParFlat_EST_kw_array[]
Definition: utilfun.cpp:69
void fta_remove_env_keywords(TKeywordList &kwds)
Definition: utilfun.cpp:1308
char * SrchTheStr(char *bptr, char *eptr, const char *leadstr)
Definition: utilfun.cpp:779
bool fta_tpa_keywords_check(const TKeywordList &kwds)
Definition: utilfun.cpp:1021
char * PointToNextToken(char *ptr)
Definition: utilfun.cpp:707
bool fta_check_mga_keywords(CMolInfo &mol_info, const TKeywordList &kwds)
Definition: utilfun.cpp:1448
void xCheckEstStsGssTpaKeywords(const list< string > keywordList, bool tpa_check, IndexblkPtr entry)
Definition: utilfun.cpp:1336
void UnwrapAccessionRange(const CGB_block::TExtra_accessions &extra_accs, CGB_block::TExtra_accessions &hist)
Definition: utilfun.cpp:197
bool IsDigit(char c)
Definition: utilfun.cpp:344
ValNodePtr ConstructValNode(CSeq_id::E_Choice choice, const char *data)
Definition: utilfun.cpp:1438
#define ParFlat_UNKW
Definition: utilfun.h:44
ValNodePtr ValNodeNew(ValNodePtr prev, const char *data)
Definition: valnode.cpp:53
static Uint4 letter(char c)
Modified on Mon May 27 04:37:04 2024 by modify_doxy.py rev. 669887