NCBI C++ ToolKit
utilfun.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: utilfun.cpp 101058 2023-10-24 13:28:57Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * File Name: utilfun.cpp
27  *
28  * Author: Karl Sirotkin, Hsiu-Chuan Chen
29  *
30  * File Description:
31  * Utility functions for parser and indexing.
32  *
33  */
34 
35 #include <ncbi_pch.hpp>
36 #include <corelib/ncbitime.hpp>
37 
38 #include "ftacpp.hpp"
39 
40 #include <corelib/ncbistr.hpp>
41 #include <objmgr/scope.hpp>
43 #include <objects/seq/MolInfo.hpp>
45 #include <corelib/tempstr.hpp>
46 
47 #include "index.h"
48 
49 #include "ftaerr.hpp"
50 #include "indx_def.h"
51 #include "utilfun.h"
52 
53 #ifdef THIS_FILE
54 # undef THIS_FILE
55 #endif
56 #define THIS_FILE "utilfun.cpp"
57 
59 
61 
63 {
// Function-local static: the CScope is constructed once, on the first call,
// from the object returned by CObjectManager::GetInstance(); every call
// returns a reference to that same instance.
64  static CScope scope(*CObjectManager::GetInstance());
65  return scope;
66 }
67 
68 
// Keyword tables.  Each table is an array of C strings closed by a nullptr
// entry, which is the layout the MatchArray*() lookups in this file walk
// (they stop at the first null entry).

// Spellings grouped under the EST keyword class.
69 static const char* ParFlat_EST_kw_array[] = {
70  "EST",
71  "EST PROTO((expressed sequence tag)",
72  "expressed sequence tag",
73  "EST (expressed sequence tag)",
74  "EST (expressed sequence tags)",
75  "EST(expressed sequence tag)",
76  "transcribed sequence fragment",
77  nullptr
78 };
79 
// Spellings grouped under the GSS keyword class.
80 static const char* ParFlat_GSS_kw_array[] = {
81  "GSS",
82  "GSS (genome survey sequence)",
83  "trapped exon",
84  nullptr
85 };
86 
// Spellings grouped under the STS keyword class.
87 static const char* ParFlat_STS_kw_array[] = {
88  "STS",
89  "STS(sequence tagged site)",
90  "STS (sequence tagged site)",
91  "STS sequence",
92  "sequence tagged site",
93  nullptr
94 };
95 
// Single-entry table for the HTC keyword.
96 static const char* ParFlat_HTC_kw_array[] = {
97  "HTC",
98  nullptr
99 };
100 
// Single-entry table for the FLI_CDNA keyword.
101 static const char* ParFlat_FLI_kw_array[] = {
102  "FLI_CDNA",
103  nullptr
104 };
105 
// Single-entry table for the WGS keyword.
106 static const char* ParFlat_WGS_kw_array[] = {
107  "WGS",
108  nullptr
109 };
110 
// Spellings grouped under the MGA keyword class.
111 static const char* ParFlat_MGA_kw_array[] = {
112  "MGA",
113  "CAGE (Cap Analysis Gene Expression)",
114  "5'-SAGE",
115  nullptr
116 };
117 
// Additional MGA-related spellings; the first two entries repeat entries of
// ParFlat_MGA_kw_array, while "MGA" itself is not listed here.
118 static const char* ParFlat_MGA_more_kw_array[] = {
119  "CAGE (Cap Analysis Gene Expression)",
120  "5'-SAGE",
121  "5'-end tag",
122  "unspecified tag",
123  "small RNA",
124  nullptr
125 };
126 
127 /* Any change of contents of next array below requires proper
128  * modifications in function fta_tsa_keywords_check().
129  */
// Looked up (case-insensitively, via MatchArrayIString) by fta_is_tsa_keyword().
130 static const char* ParFlat_TSA_kw_array[] = {
131  "TSA",
132  "Transcriptome Shotgun Assembly",
133  nullptr
134 };
135 
136 /* Any change of contents of next array below requires proper
137  * modifications in function fta_tls_keywords_check().
138  */
// Looked up by fta_is_tls_keyword().
139 static const char* ParFlat_TLS_kw_array[] = {
140  "TLS",
141  "Targeted Locus Study",
142  nullptr
143 };
144 
145 /* Any change of contents of next 2 arrays below requires proper
146  * modifications in function fta_tpa_keywords_check().
147  */
// Looked up by fta_is_tpa_keyword().
// NOTE(review): fta_tpa_keywords_check() branches on the numeric positions
// 0..7, which appear to be indices into this table - keep the order of the
// entries in sync with that function (confirm before reordering).
148 static const char* ParFlat_TPA_kw_array[] = {
149  "TPA",
150  "THIRD PARTY ANNOTATION",
151  "THIRD PARTY DATA",
152  "TPA:INFERENTIAL",
153  "TPA:EXPERIMENTAL",
154  "TPA:REASSEMBLY",
155  "TPA:ASSEMBLY",
156  "TPA:SPECIALIST_DB",
157  nullptr
158 };
159 
// The first three entries of ParFlat_TPA_kw_array (the non-tier spellings).
160 static const char* ParFlat_TPA_kw_array_to_remove[] = {
161  "TPA",
162  "THIRD PARTY ANNOTATION",
163  "THIRD PARTY DATA",
164  nullptr
165 };
166 
// Single-entry table for the ENV keyword.
167 static const char* ParFlat_ENV_kw_array[] = {
168  "ENV",
169  nullptr
170 };
171 
// Long and short spellings of the MAG keyword.
172 static const char* ParFlat_MAG_kw_array[] = {
173  "Metagenome Assembled Genome",
174  "MAG",
175  nullptr
176 };
177 
178 /**********************************************************/
179 static string FTAitoa(Int4 m)
180 {
181  Int4 sign = (m < 0) ? -1 : 1;
182  string res;
183 
184  for (m *= sign; m > 9; m /= 10)
185  res += m % 10 + '0';
186 
187  res += m + '0';
188 
189  if (sign < 0)
190  res += '-';
191 
192  std::reverse(res.begin(), res.end());
193  return res;
194 }
195 
196 /**********************************************************/
198 {
// Expands accession ranges of the form "PREFIX0001-PREFIX0005" found in
// extra_accs into individual accessions.  Entries without a '-' are copied
// unchanged.  The expanded list is accumulated in "ret" and swapped into
// "hist" at the end.
199  Int4 num1;
200  Int4 num2;
201 
203 
204  for (const string& acc : extra_accs) {
205  if (acc.empty())
206  continue;
207 
208  size_t dash = acc.find('-');
209  if (dash == string::npos) {
210  ret.push_back(acc);
211  continue;
212  }
213 
// Split "first-last" at the dash; acclen is the full length of "first" and
// is used below to zero-pad the generated accessions to the same width.
214  string first(acc.begin(), acc.begin() + dash),
215  last(acc.begin() + dash + 1, acc.end());
216  size_t acclen = first.size();
217 
// The prefix is the leading run of upper-case letters and underscores.
218  const Char* p = first.c_str();
219  for (; (*p >= 'A' && *p <= 'Z') || *p == '_';)
220  p++;
221 
222  size_t preflen = p - first.c_str();
223 
224  string prefix = first.substr(0, preflen);
// Skip leading zeros, then read the number of the first accession.
225  while (*p == '0')
226  p++;
227 
228  const Char* q;
229  for (q = p; *p >= '0' && *p <= '9';)
230  p++;
231  num1 = atoi(q);
232 
// Same for the last accession, assuming it has a prefix of the same length.
// NOTE(review): no check here that "last" is at least preflen characters
// long - presumably guaranteed by earlier validation; confirm.
233  for (p = last.c_str() + preflen; *p == '0';)
234  p++;
235  for (q = p; *p >= '0' && *p <= '9';)
236  p++;
237  num2 = atoi(q);
238 
239  ret.push_back(first);
240 
241  if (num1 == num2)
242  continue;
243 
// Generate every accession after "first" up to and including num2.
244  for (num1++; num1 <= num2; num1++) {
245  string new_acc = prefix;
246  string num_str = FTAitoa(num1);
// Number of '0' characters needed to reach the width of "first".
// NOTE(review): unsigned arithmetic - wraps if num_str is longer than the
// numeric part of "first".
247  size_t j = acclen - preflen - num_str.size();
248 
249  for (size_t i = 0; i < j; i++)
250  new_acc += '0';
251 
252  new_acc += num_str;
253  ret.push_back(new_acc);
254  }
255  }
256 
257  ret.swap(hist);
258 }
259 
// True when "c" may be part of an accession prefix: an upper-case ASCII
// letter or an underscore.
static bool sIsPrefixChar(char c)
{
    if (c == '_') {
        return true;
    }
    return c >= 'A' && c <= 'Z';
}
264 /**********************************************************/
265 bool ParseAccessionRange(list<string>& tokens, unsigned skip)
266 {
267  bool bad = false;
268 
269  if (tokens.empty()) {
270  return true;
271  }
272 
273  if (tokens.size() <= skip + 1) {
274  return true;
275  }
276 
277 
278  auto it = tokens.begin();
279  if (skip) {
280  advance(it, skip);
281  }
282 
283  for (; it != tokens.end(); ++it) {
284  const auto& token = *it;
285  if (token.empty()) {
286  continue;
287  }
288 
290  if (! NStr::SplitInTwo(token, "-", first, last)) {
291  continue;
292  }
293  if (first.size() != last.size()) {
294  bad = true;
295  break;
296  }
297 
298  auto first_it =
299  find_if_not(begin(first), end(first), sIsPrefixChar);
300 
301  if (first_it == first.end()) {
302  bad = true;
303  break;
304  }
305 
306 
307  auto last_it =
308  find_if_not(begin(last), end(last), sIsPrefixChar);
309  if (last_it == last.end()) {
310  bad = true;
311  break;
312  }
313 
314  auto prefixLength = distance(first.begin(), first_it);
315  if (prefixLength != distance(last.begin(), last_it) ||
316  ! NStr::EqualCase(first, 0, prefixLength, last.substr(0, prefixLength))) {
317  ErrPostEx(SEV_REJECT, ERR_ACCESSION_2ndAccPrefixMismatch, "Inconsistent prefix found in secondary accession range \"%s\".", token.c_str());
318  break;
319  }
320 
321  auto num1 = NStr::StringToInt(first.substr(prefixLength));
322  auto num2 = NStr::StringToInt(last.substr(prefixLength));
323 
324  if (num2 <= num1) {
325  ErrPostEx(SEV_REJECT, ERR_ACCESSION_Invalid2ndAccRange, "Invalid start/end values in secondary accession range \"%s\".", token.c_str());
326  }
327 
328  *it = first;
329  it = tokens.insert(it, "-");
330  it = tokens.insert(it, last);
331  }
332 
333  if (bad) {
334  ErrPostEx(SEV_REJECT, ERR_ACCESSION_Invalid2ndAccRange, "Incorrect secondary accession range provided: \"%s\".", it->c_str());
335  }
336  return false;
337 }
338 
339 /**********************************************************/
// Scans the token list of "tsbp", starting "skip" nodes in, for tokens of the
// form "FIRST-LAST".  Each valid range is split in place: the node keeps
// FIRST and two new nodes holding "-" and LAST are linked in right after it.
// Returns true when the whole list was processed (or there was nothing to
// process), false when a malformed range stopped the scan; a SEV_REJECT
// message is posted in the latter case.
340 bool ParseAccessionRange(TokenStatBlkPtr tsbp, unsigned skip)
341 {
342  TokenBlkPtr tbp;
343  TokenBlkPtr tbpnext;
344  char* dash;
345  const char* first;
346  char* last;
347  const char* p;
348  const char* q;
349  bool bad;
350  Int4 num1;
351  Int4 num2;
352 
353  if (! tsbp->list)
354  return true;
355 
// Position tbp on the first node to examine.  Any skip value above 1 is
// handled as 2 (the third node).
356  tbp = nullptr;
357  if (skip == 0)
358  tbp = tsbp->list;
359  else if (skip == 1) {
360  if (tsbp->list)
361  tbp = tsbp->list->next;
362  } else {
363  if (tsbp->list && tsbp->list->next)
364  tbp = tsbp->list->next->next;
365  }
366  if (! tbp)
367  return true;
368 
// tbpnext remembers the original successor so that nodes inserted below are
// not re-examined.
369  for (bad = false; tbp; tbp = tbpnext) {
370  tbpnext = tbp->next;
371  if (! tbp->str)
372  continue;
373  dash = StringChr(tbp->str, '-');
374  if (! dash)
375  continue;
// Temporarily cut the token at the dash.  Every failure path below puts the
// '-' back before leaving so that tbp->str is intact for the error message;
// on success the cut stays and tbp->str holds FIRST only.
376  *dash = '\0';
377  first = tbp->str;
378  last = dash + 1;
// Both halves must have the same length and start with an upper-case letter.
379  if (StringLen(first) != StringLen(last) || *first < 'A' ||
380  *first > 'Z' || *last < 'A' || *last > 'Z') {
381  *dash = '-';
382  bad = true;
383  break;
384  }
385 
// The prefix (upper-case letters and '_') must be followed by a digit in
// both halves.
386  for (p = first; (*p >= 'A' && *p <= 'Z') || *p == '_';)
387  p++;
388  if (*p < '0' || *p > '9') {
389  *dash = '-';
390  bad = true;
391  break;
392  }
393  for (q = last; (*q >= 'A' && *q <= 'Z') || *q == '_';)
394  q++;
395  if (*q < '0' || *q > '9') {
396  *dash = '-';
397  bad = true;
398  break;
399  }
// Prefixes must be identical; a mismatch gets its own message and leaves
// "bad" false so that the generic message at the end is not posted as well.
400  size_t preflen = p - first;
401  if (preflen != (size_t)(q - last) || ! StringEquN(first, last, preflen)) {
402  *dash = '-';
403  ErrPostEx(SEV_REJECT, ERR_ACCESSION_2ndAccPrefixMismatch, "Inconsistent prefix found in secondary accession range \"%s\".", tbp->str);
404  break;
405  }
406 
// Numeric part of FIRST: skip leading zeros, require digits up to the end.
407  while (*p == '0')
408  p++;
409  for (q = p; *p >= '0' && *p <= '9';)
410  p++;
411  if (*p != '\0') {
412  *dash = '-';
413  bad = true;
414  break;
415  }
416  num1 = atoi(q);
417 
// Numeric part of LAST, same rules.
418  for (p = last + preflen; *p == '0';)
419  p++;
420  for (q = p; *p >= '0' && *p <= '9';)
421  p++;
422  if (*p != '\0') {
423  *dash = '-';
424  bad = true;
425  break;
426  }
427  num2 = atoi(q);
428 
// Equal start and end values are accepted here; only a descending range is
// rejected.
429  if (num1 > num2) {
430  *dash = '-';
431  ErrPostEx(SEV_REJECT, ERR_ACCESSION_Invalid2ndAccRange, "Invalid start/end values in secondary accession range \"%s\".", tbp->str);
432  break;
433  }
434 
// Link two new nodes ("-" and LAST) after the current one and account for
// them in the token count.
435  tbp->next = new TokenBlk;
436  tbp = tbp->next;
437  tbp->str = StringSave("-");
438  tbp->next = new TokenBlk;
439  tbp = tbp->next;
440  tbp->str = StringSave(last);
441  tsbp->num += 2;
442 
443  tbp->next = tbpnext;
444  }
// tbp is null only when the loop ran off the end of the list, i.e. success.
445  if (! tbp)
446  return true;
447  if (bad) {
448  ErrPostEx(SEV_REJECT, ERR_ACCESSION_Invalid2ndAccRange, "Incorrect secondary accession range provided: \"%s\".", tbp->str);
449  }
450  return false;
451 }
452 
453 /**********************************************************/
455 {
// Allocates a new TokenBlk and, if "tbp" is not null, links it after the last
// node of the list that "tbp" belongs to.  The new node is returned either way.
456  TokenBlkPtr newnode = new TokenBlk;
457 
458  if (tbp) {
// Walk to the tail of the list.
459  while (tbp->next)
460  tbp = tbp->next;
461  tbp->next = newnode;
462  }
463 
464  return (newnode);
465 }
466 
467 /**********************************************************/
468 static void InsertTokenVal(TokenBlkPtr* tbp, const char* str)
469 {
470  TokenBlkPtr ltbp;
471 
472  ltbp = *tbp;
473  ltbp = TokenNodeNew(ltbp);
474  ltbp->str = StringSave(str);
475 
476  if (! *tbp)
477  *tbp = ltbp;
478 }
479 
480 /**********************************************************
481  *
482  * TokenStatBlkPtr TokenString(str, delimiter):
483  *
484  * Parsing string "str" by delimiter or tab key, blank.
485  * Parsing stop at newline ('\n') or end of string ('\0').
486  * Return a statistics of link list token.
487  *
488  **********************************************************/
490 {
// The result is a newly allocated TokenStatBlk whose "list" holds one node
// per token and whose "num" is the number of tokens.  "str" is modified
// temporarily while a token is copied and restored right afterwards.
491  char* bptr;
492  char* ptr;
493  char* curtoken;
494  Int2 num;
495  TokenStatBlkPtr token;
496  Char ch;
497 
498  token = new TokenStatBlk;
499 
500  /* skip first several delimiters if any existed
501  */
502  for (ptr = str; *ptr == delimiter;)
503  ptr++;
504 
// A carriage return ends the scan as well as a newline or the terminator.
505  for (num = 0; *ptr != '\0' && *ptr != '\r' && *ptr != '\n';) {
// Advance ptr to the end of the current token.
506  for (bptr = ptr; *ptr != delimiter && *ptr != '\r' && *ptr != '\n' &&
507  *ptr != '\t' && *ptr != ' ' && *ptr != '\0';)
508  ptr++;
509 
// Terminate the token in place just long enough to copy it.
510  ch = *ptr;
511  *ptr = '\0';
512  curtoken = StringSave(bptr);
513  *ptr = ch;
514 
// InsertTokenVal() stores its own StringSave() copy, so the temporary one
// is released here.
515  InsertTokenVal(&token->list, curtoken);
516  num++;
517  MemFree(curtoken);
518 
// Skip the separators that follow the token.
519  while (*ptr == delimiter || *ptr == '\t' || *ptr == ' ')
520  ptr++;
521  }
522 
523  token->num = num;
524 
525  return (token);
526 }
527 
528 /**********************************************************/
530 {
// Releases every node of the list starting at "tbp": the string with
// MemFree(), the node itself with delete.  A null "tbp" is a no-op.
531  TokenBlkPtr temp;
532 
533  while (tbp) {
// Fetch the successor before the node is destroyed.
534  temp = tbp;
535  tbp = tbp->next;
536  MemFree(temp->str);
537  delete temp;
538  }
539 }
540 
541 /**********************************************************/
543 {
// Releases the token list owned by "tsbp", then the block itself.
// "tsbp" is dereferenced unconditionally, so it must not be null.
544  FreeTokenblk(tsbp->list);
545  delete tsbp;
546 }
547 
548 /**********************************************************
549  *
550  * Int2 fta_StringMatch(array, text):
551  *
552  * Return array position of the matched length
553  * of string in array.
554  * Return -1 if no match.
555  *
556  **********************************************************/
558 {
// "array" is walked until its null entry; "i" counts the entries passed.
559  Int2 i;
560 
// A null "text" never matches.
561  if (! text)
562  return (-1);
563 
564  for (i = 0; *array; i++, array++) {
566  break;
567  }
568 
// Reaching the null entry means no entry stopped the loop.
569  if (! *array)
570  return (-1);
571 
572  return (i);
573 }
574 
575 /**********************************************************
576  *
577  * Int2 StringMatchIcase(array, text):
578  *
579  * Return array position of the matched length of
580  * string (ignored case) in array.
581  * Return -1 if no match.
582  *
583  **********************************************************/
585 {
// "array" is walked until its null entry; "i" counts the entries passed.
586  Int2 i;
587 
// A null "text" never matches.
588  if (! text)
589  return (-1);
590 
591  for (i = 0; *array; i++, array++) {
592  // If string from an array is empty its length == 0 and would be equal to any other string
593  // The next 'if' statement will avoid that behavior
594  if (text[0] != 0 && *array[0] == 0)
595  continue;
596 
598  break;
599  }
600 
// Reaching the null entry means no entry stopped the loop.
601  if (! *array)
602  return (-1);
603  return (i);
604 }
605 
606 /**********************************************************
607  *
608  * Int2 MatchArrayString(array, text):
609  *
610  * Return array position of the string in the
611  * array.
612  * Return -1 if no match.
613  *
614  **********************************************************/
615 Int2 MatchArrayString(const char** array, const char* text)
616 {
617  Int2 i;
618 
619  if (! text)
620  return (-1);
621 
622  for (i = 0; *array; i++, array++) {
623  if (NStr::Equal(*array, text))
624  break;
625  }
626 
627  if (! *array)
628  return (-1);
629  return (i);
630 }
631 
632 /**********************************************************/
634 {
// Walks the nullptr-terminated "array" and returns the position of the entry
// that stops the loop, or -1 if "text" is null or the end of the array is
// reached.
635  Int2 i;
636 
637  if (! text)
638  return (-1);
639 
640  for (i = 0; *array; i++, array++) {
641  // If string from an array is empty its length == 0 and would be equal to any other string
642  // The next 'if' statement will avoid that behavior
643  if (text[0] != 0 && *array[0] == 0)
644  continue;
645 
647  break;
648  }
649 
// Reaching the null entry means no entry stopped the loop.
650  if (! *array)
651  return (-1);
652  return (i);
653 }
654 
655 /**********************************************************
656  *
657  * Int2 MatchArraySubString(array, text):
658  *
659  * Return array position of the string in the array
660  * if any array is in the substring of "text".
661  * Return -1 if no match.
662  *
663  **********************************************************/
665 {
// In other words: returns the position of the first entry of the
// nullptr-terminated "array" that occurs somewhere inside "text".
666  Int2 i;
667 
// A null "text" never matches.
668  if (! text)
669  return (-1);
670 
671  for (i = 0; *array; i++, array++) {
// NStr::Find() searches "text" for the current entry.
672  if (NStr::Find(text, *array) != NPOS)
673  break;
674  }
675 
// Reaching the null entry means nothing was found.
676  if (! *array)
677  return (-1);
678  return (i);
679 }
680 
681 /**********************************************************/
682 Char* StringIStr(const Char* where, const Char* what)
683 {
684  const Char* p;
685  const Char* q;
686 
687  if (! where || *where == '\0' || ! what || *what == '\0')
688  return nullptr;
689 
690  q = nullptr;
691  for (; *where != '\0'; where++) {
692  for (q = what, p = where; *q != '\0' && *p != '\0'; q++, p++) {
693  if (*q == *p)
694  continue;
695 
696  if (*q >= 'A' && *q <= 'Z') {
697  if (*q + 32 == *p)
698  continue;
699  } else if (*q >= 'a' && *q <= 'z') {
700  if (*q - 32 == *p)
701  continue;
702  }
703  break;
704  }
705  if (*p == '\0' || *q == '\0')
706  break;
707  }
708  if (q && *q == '\0')
709  return const_cast<char*>(where);
710  return nullptr;
711 }
712 
713 /**********************************************************/
715 {
// Returns the position of the first entry of the nullptr-terminated "array"
// that NStr::FindNoCase() locates inside "text", or -1 if "text" is null or
// no entry is found.
716  Int2 i;
717 
718  if (! text)
719  return (-1);
720 
721  for (i = 0; *array; i++, array++) {
722  if (NStr::FindNoCase(text, *array) != NPOS)
723  break;
724  }
725 
// Reaching the null entry means nothing was found.
726  if (! *array)
727  return (-1);
728  return (i);
729 }
730 
731 /**********************************************************
732  *
733  * char* GetBlkDataReplaceNewLine(bptr, eptr,
734  * start_col_data):
735  *
736  * Return a string which replace newline to blank
737  * and skip "XX" line data.
738  *
739  **********************************************************/
740 char* GetBlkDataReplaceNewLine(char* bptr, char* eptr, Int2 start_col_data)
741 {
742  string instr(bptr, eptr - bptr);
743  xGetBlkDataReplaceNewLine(instr, start_col_data);
744 
745  char* ptr;
746 
747  if (bptr + start_col_data >= eptr)
748  return nullptr;
749 
750  size_t size = eptr - bptr;
751  char* retstr = MemNew(size + 1);
752  char* str = retstr;
753 
754  while (bptr < eptr) {
755  if (NStr::Equal(bptr, 0, 2, "XX")) /* skip XX line data */
756  {
757  ptr = SrchTheChar(bptr, eptr, '\n');
758  bptr = ptr + 1;
759  continue;
760  }
761 
762  bptr += start_col_data;
763  ptr = SrchTheChar(bptr, eptr, '\n');
764 
765  if (ptr) {
766  size = ptr - bptr;
767  MemCpy(str, bptr, size);
768  str += size;
769  if (*(ptr - 1) != '-' || *(ptr - 2) == ' ') {
770  StringCpy(str, " ");
771  str++;
772  }
773  bptr = ptr;
774  }
775  bptr++;
776  }
777 
778  string tstr = NStr::TruncateSpaces(string(retstr), NStr::eTrunc_End);
779  MemFree(retstr);
780  retstr = StringSave(tstr.c_str());
781  return (retstr);
782 }
783 
784 void xGetBlkDataReplaceNewLine(string& instr, int indent)
785 {
786  vector<string> lines;
787  NStr::Split(instr, "\n", lines);
788  string replaced;
789  for (auto line : lines) {
790  if (line.empty() || NStr::StartsWith(line, "XX")) {
791  continue;
792  }
793  replaced += line.substr(indent);
794  auto last = line.size() - 1;
795  if (line[last] != '-') {
796  replaced += ' ';
797  } else if (line[last - 1] == ' ') {
798  replaced += ' ';
799  }
800  }
801  NStr::TruncateSpacesInPlace(replaced);
802  instr = replaced;
803 }
804 
805 
806 /**********************************************************/
807 static size_t SeekLastAlphaChar(const Char* str, size_t len)
808 {
809  if (str && len != 0) {
810  for (size_t ret = len; ret > 0;) {
811  char c = str[--ret];
812  if (c != ' ' && c != '\n' && c != '\\' && c != ',' &&
813  c != ';' && c != '~' && c != '.' && c != ':') {
814  return ret + 1;
815  }
816  }
817  }
818 
819  return 0;
820 }
821 
822 /**********************************************************/
824 {
// std::string flavour: SeekLastAlphaChar() yields the length to keep, and
// "str" is cut down to that length (possibly to an empty string).
825  size_t ret = SeekLastAlphaChar(str.c_str(), str.size());
826  str = str.substr(0, ret);
827 }
828 
829 /**********************************************************
830  *
831  * void CleanTailNoneAlphaChar(str):
832  *
833  * Delete any tailing ' ', '\n', '\\', ',', ';', '~',
834  * '.', ':' characters.
835  *
836  **********************************************************/
838 {
// Null or empty input is left alone.
839  if (! str || *str == '\0')
840  return;
841 
// The string is shortened in place by writing a terminator at the length
// reported by SeekLastAlphaChar().
842  size_t last = SeekLastAlphaChar(str, strlen(str));
843  str[last] = '\0';
844 }
845 
846 /**********************************************************/
/**********************************************************
 *
 *   char* PointToNextToken(ptr):
 *
 *      Skip the blank-delimited token "ptr" points into
 *   and the blanks that follow it.
 *      Return a pointer to the first character of the
 *   next token, or to the terminating '\0' if there is no
 *   further token.  A null "ptr" is returned unchanged.
 *
 **********************************************************/
char* PointToNextToken(char* ptr)
{
    if (ptr) {
        // Stop at the terminator as well as at a blank, so that a string
        // without any blank does not send the scan past its end.
        while (*ptr != '\0' && *ptr != ' ')
            ptr++;
        while (*ptr == ' ')
            ptr++;
    }
    return (ptr);
}
857 
858 /**********************************************************
859  *
860  * char* GetTheCurrentToken(ptr):
861  *
862  * Return the current token (also CleanTailNoneAlphaChar)
863  * which ptr points to and ptr will points to next token
864  * after the routine return.
865  *
866  **********************************************************/
867 char* GetTheCurrentToken(char** ptr)
868 {
869  char* retptr;
870  char* bptr;
871  char* str;
872  Char ch;
873 
// Nothing to return for a null or empty input; *ptr is left untouched.
874  bptr = retptr = *ptr;
875  if (! retptr || *retptr == '\0')
876  return nullptr;
877 
// Find the end of the token: the next blank or the terminator.
878  while (*retptr != '\0' && *retptr != ' ')
879  retptr++;
880 
// Terminate the token in place just long enough to copy it with
// StringSave(); the returned copy belongs to the caller.
881  ch = *retptr;
882  *retptr = '\0';
883  str = StringSave(bptr);
884  *retptr = ch;
885 
886  while (*retptr != '\0' && *retptr == ' ') /* skip blanks */
887  retptr++;
// Hand the position of the next token (or of the terminator) back.
888  *ptr = retptr;
889 
891  return (str);
892 }
893 
894 /**********************************************************
895  *
896  * char* SrchTheChar(bptr, eptr, letter):
897  *
898  * Search The character letter.
899  * Return NULL if not found; otherwise, return
900  * a pointer points first occurrence The character.
901  *
902  **********************************************************/
903 char* SrchTheChar(char* bptr, char* eptr, Char letter)
904 {
905  while (bptr < eptr && *bptr != letter)
906  bptr++;
907 
908  if (bptr < eptr)
909  return (bptr);
910 
911  return nullptr;
912 }
913 
914 /**********************************************************
915  *
916  * char* SrchTheStr(bptr, eptr, leadstr):
917  *
918  * Search The leading string.
919  * Return NULL if not found; otherwise, return
920  * a pointer points first occurrence The leading string.
921  *
922  **********************************************************/
923 char* SrchTheStr(char* bptr, char* eptr, const char* leadstr)
924 {
925  char* p;
926  Char c;
927 
928  c = *eptr;
929  *eptr = '\0';
930  p = StringStr(bptr, leadstr);
931  *eptr = c;
932  return (p);
933 }
934 
935 /**********************************************************/
936 void CpSeqId(InfoBioseqPtr ibp, const CSeq_id& id)
937 {
938  const CTextseq_id* text_id = id.GetTextseq_Id();
939  if (text_id) {
940  if (text_id->IsSetName())
941  ibp->mLocus = text_id->GetName();
942 
943  CRef<CSeq_id> new_id(new CSeq_id);
944  if (text_id->IsSetAccession()) {
945  ibp->mAccNum = text_id->GetAccession();
946 
947  CRef<CTextseq_id> new_text_id(new CTextseq_id);
948  new_text_id->SetAccession(text_id->GetAccession());
949  if (text_id->IsSetVersion())
950  new_text_id->SetVersion(text_id->GetVersion());
951 
952  SetTextId(id.Which(), *new_id, *new_text_id);
953  } else {
954  new_id->Assign(id);
955  }
956 
957  ibp->ids.push_back(new_id);
958  } else {
959  auto pId = Ref(new CSeq_id());
960  pId->Assign(id);
961  ibp->ids.push_back(std::move(pId));
962  }
963 }
964 
965 /**********************************************************
966  *
967  * CRef<CDate_std> get_full_date(s, is_ref, source):
968  *
969  * Get year, month, day and return CRef<CDate_std>.
970  *
971  **********************************************************/
973 {
// An empty CRef is returned for null/empty input and for any date that is
// rejected below.
974  CRef<CDate_std> date;
975 
976  if (! s || *s == '\0')
977  return date;
978 
// Optional leading day.  When present, three characters are skipped -
// presumably a two-digit day plus a separator (e.g. "DD-"); confirm against
// the callers.  Without a leading digit the day stays 0.
979  int parse_day = 0;
980  if (isdigit(*s) != 0) {
981  parse_day = atoi(s);
982  s += 3;
983  // should we make at least a token effort of validation (like <32)?
984  }
985 
// The next three characters must be one of the upper-case month names below.
986  static const vector<string> months{
987  "JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"
988  };
989  CTempString maybe_month(s, 3);
990  auto it = find(months.begin(), months.end(), maybe_month);
991  if (it == months.end()) {
// Quote at most 10 characters of the offending text in the message.
992  char msg[11];
993  StringNCpy(msg, s, 10);
994  msg[10] = '\0';
995  is_ref ? ErrPostEx(
996  SEV_WARNING, ERR_REFERENCE_IllegalDate, "Unrecognized month: %s", msg)
997  : ErrPostEx(
998  SEV_WARNING, ERR_DATE_IllegalDate, "Unrecognized month: %s", msg);
999  return date;
1000  }
1001  int parse_month = int(it - months.begin()) + 1;
1002 
// Skip the month name and the single character that follows it.
1003  s += 4;
1004 
// Years 1900..current are taken as is; a two-digit year below 70 is read as
// 20xx, otherwise as 19xx; anything else is reported and rejected.
1005  int parse_year = atoi(s);
1006  int cur_year = CCurrentTime().Year();
1007  if (1900 <= parse_year && parse_year <= cur_year) {
1008  // all set
1009  } else if (0 <= parse_year && parse_year <= 99 && '0' <= s[1] && s[1] <= '9') {
1010  // insist that short form year has exactly two digits
1011  (parse_year < 70) ? (parse_year += 2000) : (parse_year += 1900);
1012  } else {
// For non-reference dates the warning is suppressed when the source is SPROT
// and the year is at most one ahead of the current year; the date is
// rejected either way.
1013  if (is_ref) {
1014  ErrPostEx(
1015  SEV_ERROR, ERR_REFERENCE_IllegalDate, "Illegal year: %d, current year: %d", parse_year, cur_year);
1016  } else if (source != Parser::ESource::SPROT || parse_year - cur_year > 1) {
1017  ErrPostEx(
1018  SEV_WARNING, ERR_DATE_IllegalDate, "Illegal year: %d, current year: %d", parse_year, cur_year);
1019  }
1020  // treat bad year like bad month above:
1021  return date;
1022  }
1023  date.Reset(new CDate_std);
1024  date->SetYear(parse_year);
1025  date->SetMonth(parse_month);
1026  date->SetDay(parse_day);
1027 
1028  return date;
1029 }
1030 
1031 /**********************************************************
1032  *
1033  * int SrchKeyword(ptr, kwl):
1034  *
1035  * Compare first kwl.len byte in ptr to kwl.str.
1036  * Return the position of keyword block array;
1037  * return unknown keyword, UNKW, if not found.
1038  *
1039  * 3-25-93
1040  *
1041  **********************************************************/
1042 int SrchKeyword(const CTempString& ptr, const vector<string>& keywordList)
1043 {
1044  SIZE_TYPE keywordCount = keywordList.size();
1045 
1046  for (unsigned i = 0; i < keywordCount; ++i) {
1047  if (NStr::StartsWith(ptr, keywordList[i])) {
1048  return (int)i;
1049  }
1050  }
1051  return ParFlat_UNKW;
1052 }
1053 
1054 /**********************************************************/
1055 bool CheckLineType(char* ptr, Int4 line, const vector<string>& keywordList, bool after_origin)
1056 {
1057  char* p;
1058  Char msg[51];
1059 
1060  if (after_origin) {
1061  for (p = ptr; *p >= '0' && *p <= '9';)
1062  p++;
1063  if (*p == ' ')
1064  return true;
1065  }
1066 
1067  auto keywordCount = keywordList.size();
1068  for (unsigned i = 0; i < keywordCount; i++) {
1069  auto keyword = keywordList[i];
1070  if (StringEquN(ptr, keyword.c_str(), keyword.size()))
1071  return true;
1072  }
1073 
1074  StringNCpy(msg, ptr, 50);
1075  msg[50] = '\0';
1076  p = StringChr(msg, '\n');
1077  if (p)
1078  *p = '\0';
1079  ErrPostEx(SEV_ERROR, ERR_ENTRY_InvalidLineType, "Unknown linetype \"%s\". Line number %d.", msg, line);
1080  if (p)
1081  *p = '\n';
1082 
1083  return false;
1084 }
1085 
1086 /**********************************************************
1087  *
1088  * char* SrchNodeType(entry, type, len):
1089  *
1090  * Return a memory location of the node which has
1091  * the "type".
1092  *
1093  **********************************************************/
1094 char* SrchNodeType(DataBlkPtr entry, Int4 type, size_t* len)
1095 {
1096  DataBlkPtr temp;
1097 
1098  temp = TrackNodeType(*entry, (Int2)type);
1099  if (temp) {
1100  *len = temp->len;
1101  return (temp->mOffset);
1102  }
1103 
1104  *len = 0;
1105  return nullptr;
1106 }
1107 
1108 char* xSrchNodeType(const DataBlk& entry, Int4 type, size_t* len)
1109 {
1110  DataBlkPtr temp;
1111 
1112  temp = TrackNodeType(entry, (Int2)type);
1113  if (temp) {
1114  *len = temp->len;
1115  return (temp->mOffset);
1116  }
1117 
1118  *len = 0;
1119  return nullptr;
1120 }
1121 
1122 string xGetNodeData(const DataBlk& entry, int nodeType)
1123 {
1124  auto tmp = TrackNodeType(entry, (Int2)nodeType);
1125  if (! tmp) {
1126  return "";
1127  }
1128  return string(tmp->mOffset, tmp->len);
1129 }
1130 
1131 /**********************************************************
1132  *
1133  * DataBlkPtr TrackNodeType(entry, type):
1134  *
1135  * Return a pointer points to the Node which has
1136  * the "type".
1137  *
1138  **********************************************************/
1140 {
1141  DataBlkPtr temp;
1142  EntryBlkPtr ebp;
1143 
// entry.mpData is treated as an EntryBlk whose "chain" member heads a list
// of nodes linked through mpNext.
1144  ebp = static_cast<EntryBlk*>(entry.mpData);
1145  temp = ebp->chain;
// Linear search; temp ends up null when no node has the requested type.
1146  while (temp && temp->mType != type)
1147  temp = temp->mpNext;
1148 
1149  return (temp);
1150 }
1151 
1152 
1153 const Section* xTrackNodeType(const Entry& entry, int type)
1154 {
1155  for (const Section* sectionPtr : entry.mSections) {
1156  if (sectionPtr->mType == type) {
1157  return sectionPtr;
1158  }
1159  }
1160  return nullptr;
1161 }
1162 
1163 
1164 /**********************************************************/
1166 {
// Validates the TPA-related keywords in "kwds".  Returns false when one of
// the SEV_REJECT conditions below is hit, true otherwise (including an empty
// keyword list).
// "b" collects up to four tier keywords for the conflict message.
1167  const char* b[4];
1168 
1169  bool kwd_tpa = false;
1170  bool kwd_party = false;
1171  bool kwd_inf = false;
1172  bool kwd_exp = false;
1173  bool kwd_asm = false;
1174  bool kwd_spedb = false;
1175  bool ret = true;
1176 
1177  Int4 j;
1178  Int2 i;
1179 
1180  if (kwds.empty())
1181  return true;
1182 
1183  size_t len = 0;
1184  j = 0;
1185  for (const string& key : kwds) {
1186  if (key.empty())
1187  continue;
1188 
1189  const char* p = key.c_str();
// NOTE(review): "i" is tested as a position 0..7, apparently the index of
// the keyword in ParFlat_TPA_kw_array (0 = "TPA", 1-2 = third-party
// spellings, 3-7 = tier keywords) - confirm where it is assigned.
1191  if (i == 0)
1192  kwd_tpa = true;
1193  else if (i == 1 || i == 2)
1194  kwd_party = true;
1195  else if (i == 3)
1196  kwd_inf = true;
1197  else if (i == 4)
1198  kwd_exp = true;
1199  else if (i == 5 || i == 6)
1200  kwd_asm = true;
1201  else if (i == 7)
1202  kwd_spedb = true;
// Not one of the positions above, but the keyword starts with "TPA"
// (case-insensitively): "TPA:<something>" is rejected as an invalid tier,
// and text that continues at least two characters past "TPA" only draws a
// warning.
1203  else if (NStr::EqualNocase(p, 0, 3, "TPA")) {
1204  if (p[3] == ':') {
1205  ErrPostEx(SEV_REJECT, ERR_KEYWORD_InvalidTPATier, "Keyword \"%s\" is not a valid TPA-tier keyword.", p);
1206  ret = false;
1207  } else if (p[3] != '\0' && p[4] != '\0') {
1208  ErrPostEx(SEV_WARNING, ERR_KEYWORD_UnexpectedTPA, "Keyword \"%s\" looks like it might be TPA-related, but it is not a recognized TPA keyword.", p);
1209  }
1210  }
// Remember tier keywords (positions 3..7); more than one is a conflict.
// "p" points into "key", which stays alive as long as "kwds" does.
1211  if (i > 2 && i < 8 && j < 4) {
1212  b[j] = p;
1213  ++j;
1214  len += key.size() + 1;
1215  }
1216  }
1217 
// "TPA" and one of the third-party spellings must appear together.
1218  if (kwd_tpa && ! kwd_party) {
1219  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTPAKeywords, "This TPA-record should have keyword \"Third Party Annotation\" or \"Third Party Data\" in addition to \"TPA\".");
1220  ret = false;
1221  } else if (! kwd_tpa && kwd_party) {
1222  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTPAKeywords, "This TPA-record should have keyword \"TPA\" in addition to \"Third Party Annotation\" or \"Third Party Data\".");
1223  ret = false;
1224  }
// An inferential/experimental tier keyword without "TPA" is rejected;
// "TPA" without any tier keyword is an error but does not change the result.
1225  if (! kwd_tpa && (kwd_inf || kwd_exp)) {
1226  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTPAKeywords, "This TPA-record should have keyword \"TPA\" in addition to its TPA-tier keyword.");
1227  ret = false;
1228  } else if (kwd_tpa && kwd_inf == false && kwd_exp == false &&
1229  kwd_asm == false && kwd_spedb == false) {
1230  ErrPostEx(SEV_ERROR, ERR_KEYWORD_MissingTPATier, "This TPA record lacks a keyword to indicate which tier it belongs to: experimental, inferential, reassembly or specialist_db.");
1231  }
// More than one tier keyword: list them, separated by ';', and reject.
1232  if (j > 1) {
1233  string buf;
1234  for (i = 0; i < j; i++) {
1235  if (i > 0)
1236  buf += ';';
1237  buf += b[i];
1238  }
1239  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ConflictingTPATiers, "Keywords for multiple TPA tiers exist on this record: \"%s\". A TPA record can only be in one tier.", buf.c_str());
1240  ret = false;
1241  }
1242 
1243  return (ret);
1244 }
1245 
1246 /**********************************************************/
1248 {
// Checks that the keywords "TSA" and "Transcriptome Shotgun Assembly" appear
// together in "kwds".  Returns false (after a SEV_REJECT message) when only
// one of the two is present, true otherwise, including an empty list.
1249  bool kwd_tsa = false;
1250  bool kwd_assembly = false;
1251  bool ret = true;
1252  Int2 i;
1253 
1254  if (kwds.empty())
1255  return true;
1256 
1257  for (const string& key : kwds) {
1258  if (key.empty())
1259  continue;
// NOTE(review): "i" is tested as a position, apparently the index of the
// keyword in ParFlat_TSA_kw_array (0 = "TSA", 1 = long form) - confirm
// where it is assigned.
1261  if (i == 0)
1262  kwd_tsa = true;
1263  else if (i == 1)
1264  kwd_assembly = true;
// For EMBL sources the spelling "Transcript Shotgun Assembly" is accepted
// as the long form too.
1265  else if (source == Parser::ESource::EMBL &&
1266  NStr::EqualNocase(key, "Transcript Shotgun Assembly"))
1267  kwd_assembly = true;
1268  }
1269 
1270  if (kwd_tsa && ! kwd_assembly) {
1271  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTSAKeywords, "This TSA-record should have keyword \"Transcriptome Shotgun Assembly\" in addition to \"TSA\".");
1272  ret = false;
1273  } else if (! kwd_tsa && kwd_assembly) {
1274  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTSAKeywords, "This TSA-record should have keyword \"TSA\" in addition to \"Transcriptome Shotgun Assembly\".");
1275  ret = false;
1276  }
1277  return (ret);
1278 }
1279 
1280 /**********************************************************/
1282 {
1283  bool kwd_tls = false;
1284  bool kwd_study = false;
1285  bool ret = true;
1286  Int2 i;
1287 
1288  if (kwds.empty())
1289  return true;
1290 
1291  for (const string& key : kwds) {
1292  if (key.empty())
1293  continue;
1295  if (i == 0)
1296  kwd_tls = true;
1297  else if (i == 1)
1298  kwd_study = true;
1299  else if (source == Parser::ESource::EMBL &&
1300  NStr::EqualNocase(key, "Targeted Locus Study"))
1301  kwd_study = true;
1302  }
1303 
1304  if (kwd_tls && ! kwd_study) {
1305  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTLSKeywords, "This TLS-record should have keyword \"Targeted Locus Study\" in addition to \"TLS\".");
1306  ret = false;
1307  } else if (! kwd_tls && kwd_study) {
1308  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTLSKeywords, "This TLS-record should have keyword \"TLS\" in addition to \"Targeted Locus Study\".");
1309  ret = false;
1310  }
1311  return (ret);
1312 }
1313 
1314 /**********************************************************/
1315 bool fta_is_tpa_keyword(const char* str)
1316 {
1317  if (! str || *str == '\0' || MatchArrayIString(ParFlat_TPA_kw_array, str) < 0)
1318  return false;
1319 
1320  return true;
1321 }
1322 
1323 /**********************************************************/
1324 bool fta_is_tsa_keyword(const char* str)
1325 {
1326  if (! str || *str == '\0' || MatchArrayIString(ParFlat_TSA_kw_array, str) < 0)
1327  return false;
1328  return true;
1329 }
1330 
1331 /**********************************************************/
1332 bool fta_is_tls_keyword(const char* str)
1333 {
1334  if (! str || *str == '\0' || MatchArrayIString(ParFlat_TLS_kw_array, str) < 0)
1335  return false;
1336  return true;
1337 }
1338 
1339 /**********************************************************/
1340 void fta_keywords_check(const char* str, bool* estk, bool* stsk, bool* gssk, bool* htck, bool* flik, bool* wgsk, bool* tpak, bool* envk, bool* mgak, bool* tsak, bool* tlsk)
1341 {
1342  if (estk && MatchArrayString(ParFlat_EST_kw_array, str) != -1)
1343  *estk = true;
1344 
1345  if (stsk && MatchArrayString(ParFlat_STS_kw_array, str) != -1)
1346  *stsk = true;
1347 
1348  if (gssk && MatchArrayString(ParFlat_GSS_kw_array, str) != -1)
1349  *gssk = true;
1350 
1351  if (htck && MatchArrayString(ParFlat_HTC_kw_array, str) != -1)
1352  *htck = true;
1353 
1354  if (flik && MatchArrayString(ParFlat_FLI_kw_array, str) != -1)
1355  *flik = true;
1356 
1357  if (wgsk && MatchArrayString(ParFlat_WGS_kw_array, str) != -1)
1358  *wgsk = true;
1359 
1360  if (tpak && MatchArrayString(ParFlat_TPA_kw_array, str) != -1)
1361  *tpak = true;
1362 
1363  if (envk && MatchArrayString(ParFlat_ENV_kw_array, str) != -1)
1364  *envk = true;
1365 
1366  if (mgak && MatchArrayString(ParFlat_MGA_kw_array, str) != -1)
1367  *mgak = true;
1368 
1369  if (tsak && MatchArrayString(ParFlat_TSA_kw_array, str) != -1)
1370  *tsak = true;
1371 
1372  if (tlsk && MatchArrayString(ParFlat_TLS_kw_array, str) != -1)
1373  *tlsk = true;
1374 }
1375 
1376 /**********************************************************/
1378 {
1379  const char** b;
1380 
1381  if (kwds.empty())
1382  return;
1383 
1384  if (tech == CMolInfo::eTech_est)
1386  else if (tech == CMolInfo::eTech_sts)
1388  else if (tech == CMolInfo::eTech_survey)
1390  else if (tech == CMolInfo::eTech_htc)
1392  else if (tech == CMolInfo::eTech_fli_cdna)
1394  else if (tech == CMolInfo::eTech_wgs)
1396  else
1397  return;
1398 
1399  for (TKeywordList::iterator key = kwds.begin(); key != kwds.end();) {
1400  if (key->empty() || MatchArrayString(b, key->c_str()) != -1) {
1401  key = kwds.erase(key);
1402  } else
1403  ++key;
1404  }
1405 }
1406 
1407 /**********************************************************/
1409 {
1410  if (kwds.empty())
1411  return;
1412 
1413  for (TKeywordList::iterator key = kwds.begin(); key != kwds.end();) {
1414  if (key->empty() || MatchArrayIString(ParFlat_TPA_kw_array_to_remove, key->c_str()) != -1) {
1415  key = kwds.erase(key);
1416  } else
1417  ++key;
1418  }
1419 }
1420 
1421 /**********************************************************/
1423 {
1424  if (kwds.empty())
1425  return;
1426 
1427  for (TKeywordList::iterator key = kwds.begin(); key != kwds.end();) {
1428  if (key->empty() || MatchArrayIString(ParFlat_TSA_kw_array, key->c_str()) != -1 ||
1429  (source == Parser::ESource::EMBL && NStr::EqualNocase(*key, "Transcript Shotgun Assembly"))) {
1430  key = kwds.erase(key);
1431  } else
1432  ++key;
1433  }
1434 }
1435 
1436 /**********************************************************/
1438 {
1439  if (kwds.empty())
1440  return;
1441 
1442  for (TKeywordList::iterator key = kwds.begin(); key != kwds.end();) {
1443  if (key->empty() || MatchArrayIString(ParFlat_TLS_kw_array, key->c_str()) != -1 ||
1444  (source == Parser::ESource::EMBL && NStr::EqualNocase(*key, "Targeted Locus Study"))) {
1445  key = kwds.erase(key);
1446  } else
1447  ++key;
1448  }
1449 }
1450 
1451 /**********************************************************/
1453 {
1454  if (kwds.empty())
1455  return;
1456 
1457  for (TKeywordList::iterator key = kwds.begin(); key != kwds.end();) {
1458  if (key->empty() || MatchArrayIString(ParFlat_ENV_kw_array, key->c_str()) != -1) {
1459  key = kwds.erase(key);
1460  } else
1461  ++key;
1462  }
1463 }
1464 
1465 /**********************************************************/
1467 {
1468  if (kwds.empty())
1469  return;
1470 
1471  for (TKeywordList::iterator key = kwds.begin(); key != kwds.end();) {
1472  if (key->empty() || MatchArrayIString(ParFlat_MAG_kw_array, key->c_str()) != -1) {
1473  key = kwds.erase(key);
1474  } else
1475  ++key;
1476  }
1477 }
1478 
1479 /**********************************************************/
1481  const list<string> keywordList,
1482  bool tpa_check,
1483  IndexblkPtr entry
1484  // bool& specialist_db,
1485  // bool& inferential,
1486  // bool& experimental,
1487  // bool& assembly
1488 )
1489 {
1490  if (keywordList.empty()) {
1491  return;
1492  }
1493  for (auto keyword : keywordList) {
1495  keyword.c_str(), &entry->EST, &entry->STS, &entry->GSS, &entry->HTC, nullptr, nullptr, (tpa_check ? &entry->is_tpa : nullptr), nullptr, nullptr, nullptr, nullptr);
1496  if (NStr::EqualNocase(keyword, "TPA:assembly")) {
1497  entry->specialist_db = true;
1498  entry->assembly = true;
1499  continue;
1500  }
1501  if (NStr::EqualNocase(keyword, "TPA:specialist_db")) {
1502  entry->specialist_db = true;
1503  continue;
1504  }
1505  if (NStr::EqualNocase(keyword, "TPA:inferential")) {
1506  entry->inferential = true;
1507  continue;
1508  }
1509  if (NStr::EqualNocase(keyword, "TPA:experimental")) {
1510  entry->experimental = true;
1511  continue;
1512  }
1513  }
1514 }
1515 
1516 void check_est_sts_gss_tpa_kwds(ValNodePtr kwds, size_t len, IndexblkPtr entry, bool tpa_check, bool& specialist_db, bool& inferential, bool& experimental, bool& assembly)
1517 {
1518  char* line;
1519  char* p;
1520  char* q;
1521 
1522  if (! kwds || ! kwds->data || len < 1)
1523  return;
1524 
1525  line = MemNew(len + 1);
1526  line[0] = '\0';
1527  for (; kwds; kwds = kwds->next) {
1528  StringCat(line, kwds->data);
1529  }
1530  for (p = line; *p != '\0'; p++)
1531  if (*p == '\n' || *p == '\t')
1532  *p = ' ';
1533  for (p = line; *p == ' ' || *p == '.' || *p == ';';)
1534  p++;
1535  if (*p == '\0') {
1536  MemFree(line);
1537  return;
1538  }
1539  for (q = p; *q != '\0';)
1540  q++;
1541  for (q--; *q == ' ' || *q == '.' || *q == ';'; q--)
1542  *q = '\0';
1543  for (q = p, p = line; *q != '\0';) {
1544  if (*q != ' ' && *q != ';') {
1545  *p++ = *q++;
1546  continue;
1547  }
1548  if (*q == ' ') {
1549  for (q++; *q == ' ';)
1550  q++;
1551  if (*q != ';')
1552  *p++ = ' ';
1553  }
1554  if (*q == ';') {
1555  *p++ = *q++;
1556  while (*q == ' ' || *q == ';')
1557  q++;
1558  }
1559  }
1560  *p++ = ';';
1561  *p = '\0';
1562  for (p = line;; p = q + 1) {
1563  q = StringChr(p, ';');
1564  if (! q)
1565  break;
1566  *q = '\0';
1567  fta_keywords_check(p, &entry->EST, &entry->STS, &entry->GSS, &entry->HTC, nullptr, nullptr, (tpa_check ? &entry->is_tpa : nullptr), nullptr, nullptr, nullptr, nullptr);
1568  if (NStr::EqualNocase(p, "TPA:specialist_db") ||
1569  NStr::EqualNocase(p, "TPA:assembly")) {
1570  specialist_db = true;
1571  if (NStr::EqualNocase(p, "TPA:assembly"))
1572  assembly = true;
1573  } else if (NStr::EqualNocase(p, "TPA:inferential"))
1574  inferential = true;
1575  else if (NStr::EqualNocase(p, "TPA:experimental"))
1576  experimental = true;
1577  }
1578  MemFree(line);
1579 }
1580 
1581 /**********************************************************/
1583 {
1584  ValNodePtr res;
1585 
1586  res = ValNodeNew(nullptr, data);
1587  res->choice = choice;
1588  return (res);
1589 }
1590 
1591 /**********************************************************/
1592 bool fta_check_mga_keywords(CMolInfo& mol_info, const TKeywordList& kwds)
1593 {
1594  bool is_cage;
1595  bool is_sage;
1596 
1597  TKeywordList::const_iterator key_it = kwds.end();
1598 
1599  bool got = false;
1600  if (! kwds.empty() && NStr::EqualNocase(kwds.front(), "MGA")) {
1601  for (TKeywordList::const_iterator key = kwds.begin(); key != kwds.end(); ++key) {
1603  key->c_str()) < 0)
1604  continue;
1605  got = true;
1606  key_it = key;
1607  break;
1608  }
1609  }
1610 
1611  if (! got) {
1612  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingMGAKeywords, "This is apparently a CAGE record, but it lacks the required keywords. Entry dropped.");
1613  return false;
1614  }
1615 
1616  if (! mol_info.IsSetTechexp() || ! kwds.empty() ||
1617  mol_info.GetTechexp() != "cage")
1618  return true;
1619 
1620  for (is_sage = false, is_cage = false; key_it != kwds.end(); ++key_it) {
1621  const char* p = key_it->c_str();
1622 
1623  if (NStr::EqualNocase(p, "5'-SAGE"))
1624  is_sage = true;
1625  else if (NStr::EqualNocase(p, "CAGE (Cap Analysis Gene Expression)"))
1626  is_cage = true;
1627  }
1628 
1629  if (is_sage) {
1630  if (is_cage) {
1631  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ConflictingMGAKeywords, "This MGA record contains more than one of the special keywords indicating different techniques.");
1632  return false;
1633  }
1634  mol_info.SetTechexp("5'-sage");
1635  }
1636 
1637  return true;
1638 }
1639 
1640 /**********************************************************/
// Copies the NUL-terminated string src into dst, terminator included.
// Characters are transferred strictly front to back, one at a time.
// NOTE(review): presumably kept instead of a library copy so that shifting
// text towards the start of the same buffer stays well defined - confirm.
void fta_StringCpy(char* dst, const char* src)
{
    // Store each character first, then stop once the one just stored was
    // the terminator.
    while ((*dst = *src) != '\0') {
        ++dst;
        ++src;
    }
}
1650 
1651 /**********************************************************/
1652 bool SetTextId(Uint1 seqtype, CSeq_id& seqId, CTextseq_id& textId)
1653 {
1654  bool wasSet = true;
1655 
1656  switch (seqtype) {
1657  case CSeq_id::e_Genbank:
1658  seqId.SetGenbank(textId);
1659  break;
1660  case CSeq_id::e_Embl:
1661  seqId.SetEmbl(textId);
1662  break;
1663  case CSeq_id::e_Pir:
1664  seqId.SetPir(textId);
1665  break;
1666  case CSeq_id::e_Swissprot:
1667  seqId.SetSwissprot(textId);
1668  break;
1669  case CSeq_id::e_Other:
1670  seqId.SetOther(textId);
1671  break;
1672  case CSeq_id::e_Ddbj:
1673  seqId.SetDdbj(textId);
1674  break;
1675  case CSeq_id::e_Prf:
1676  seqId.SetPrf(textId);
1677  break;
1678  case CSeq_id::e_Pdb: {
1679  // TODO: test this branch
1680  CPDB_seq_id pdbId;
1681  pdbId.SetChain_id();
1682  seqId.SetPdb(pdbId);
1683  } break;
1684  case CSeq_id::e_Tpg:
1685  seqId.SetTpg(textId);
1686  break;
1687  case CSeq_id::e_Tpe:
1688  seqId.SetTpe(textId);
1689  break;
1690  case CSeq_id::e_Tpd:
1691  seqId.SetTpd(textId);
1692  break;
1693  case CSeq_id::e_Gpipe:
1694  seqId.SetGpipe(textId);
1695  break;
1697  seqId.SetNamed_annot_track(textId);
1698  break;
1699 
1700  default:
1701  wasSet = false;
1702  }
1703 
1704  return wasSet;
1705 }
1706 
1707 /**********************************************************/
1708 bool IsCancelled(const TKeywordList& keywords)
1709 {
1710  for (const string& key : keywords) {
1711  if (NStr::EqualNocase(key, "HTGS_CANCELLED"))
1712  return true;
1713  }
1714 
1715  return false;
1716 }
1717 
1718 /**********************************************************/
1719 bool HasHtg(const TKeywordList& keywords)
1720 {
1721  for (const string& key : keywords) {
1722  if (key == "HTG" || key == "HTGS_PHASE0" ||
1723  key == "HTGS_PHASE1" || key == "HTGS_PHASE2" ||
1724  key == "HTGS_PHASE3") {
1725  return true;
1726  }
1727  }
1728 
1729  return false;
1730 }
1731 
1732 /**********************************************************/
1734 {
1735  for (TKeywordList::iterator key = keywords.begin(); key != keywords.end();) {
1736  const char* p = key->c_str();
1737  if (NStr::EqualNocase(p, 0, 10, "HTGS_PHASE") &&
1738  (p[10] == '0' || p[10] == '1' || p[10] == '2' ||
1739  p[10] == '3') &&
1740  p[11] == '\0') {
1741  key = keywords.erase(key);
1742  } else
1743  ++key;
1744  }
1745 }
1746 
1747 /**********************************************************/
1748 bool HasHtc(const TKeywordList& keywords)
1749 {
1750  for (const string& key : keywords) {
1751  if (NStr::EqualNocase(key, "HTC")) {
1752  return true;
1753  }
1754  }
1755 
1756  return false;
1757 }
1758 
CCurrentTime –.
Definition: ncbitime.hpp:1283
CScope –.
Definition: scope.hpp:92
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
char * mOffset
Definition: ftablock.h:332
size_t len
Definition: ftablock.h:333
CFlatFileData * mpData
Definition: ftablock.h:331
DataBlk * mpNext
Definition: ftablock.h:336
int mType
Definition: ftablock.h:330
The NCBI C++ standard methods for dealing with std::string.
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:51
#define ERR_REFERENCE_IllegalDate
Definition: flat2err.h:282
#define ERR_DATE_IllegalDate
Definition: flat2err.h:102
std::list< std::string > TKeywordList
Definition: ftablock.h:166
char * StringSave(const char *s)
Definition: ftacpp.hpp:61
bool StringEquN(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:106
void StringCpy(char *d, const char *s)
Definition: ftacpp.hpp:74
void StringNCpy(char *d, const char *s, size_t n)
Definition: ftacpp.hpp:75
void MemFree(char *p)
Definition: ftacpp.hpp:55
size_t StringLen(const char *s)
Definition: ftacpp.hpp:60
char * MemNew(size_t sz)
Definition: ftacpp.hpp:43
void StringCat(char *d, const char *s)
Definition: ftacpp.hpp:73
void MemCpy(void *p, const void *q, size_t sz)
Definition: ftacpp.hpp:50
const char * months[]
Definition: ftaerr.cpp:118
#define SEV_WARNING
Definition: gicache.c:90
#define SEV_ERROR
Definition: gicache.c:91
#define SEV_REJECT
Definition: gicache.c:92
string
Definition: cgiapp.hpp:687
#define StringStr
Definition: ncbistr.hpp:322
#define StringChr
Definition: ncbistr.hpp:317
#define ErrPostEx(sev, err_code,...)
Definition: ncbierr.hpp:78
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
Definition: Seq_id.cpp:318
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
Definition: ncbiobj.hpp:2015
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
char Char
Alias for char.
Definition: ncbitype.h:93
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3457
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2989
#define NPOS
Definition: ncbistr.hpp:133
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
Definition: ncbistr.cpp:3197
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2887
static bool EqualCase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-sensitive equality of a substring with another string.
Definition: ncbistr.hpp:5324
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5411
static bool SplitInTwo(const CTempString str, const CTempString delim, string &str1, string &str2, TSplitFlags flags=0)
Split a string into two pieces using the specified delimiters.
Definition: ncbistr.cpp:3550
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5352
static bool Equal(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Test for equality of a substring with another string.
Definition: ncbistr.hpp:5383
static string TruncateSpaces(const string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string.
Definition: ncbistr.cpp:3182
@ eTrunc_End
Truncate trailing spaces only.
Definition: ncbistr.hpp:2241
int Year(void) const
Get year.
Definition: ncbitime.hpp:2266
list< string > TExtra_accessions
Definition: GB_block_.hpp:91
void SetYear(TYear value)
Assign a value to Year data member.
Definition: Date_std_.hpp:435
void SetMonth(TMonth value)
Assign a value to Month data member.
Definition: Date_std_.hpp:482
void SetDay(TDay value)
Assign a value to Day data member.
Definition: Date_std_.hpp:529
TNamed_annot_track & SetNamed_annot_track(void)
Select the variant.
Definition: Seq_id_.cpp:551
bool IsSetAccession(void) const
Check if a value has been assigned to Accession data member.
TEmbl & SetEmbl(void)
Select the variant.
Definition: Seq_id_.cpp:265
TOther & SetOther(void)
Select the variant.
Definition: Seq_id_.cpp:353
const TName & GetName(void) const
Get the Name member data.
TTpe & SetTpe(void)
Select the variant.
Definition: Seq_id_.cpp:485
TTpg & SetTpg(void)
Select the variant.
Definition: Seq_id_.cpp:463
TPir & SetPir(void)
Select the variant.
Definition: Seq_id_.cpp:287
TTpd & SetTpd(void)
Select the variant.
Definition: Seq_id_.cpp:507
TVersion GetVersion(void) const
Get the Version member data.
TGpipe & SetGpipe(void)
Select the variant.
Definition: Seq_id_.cpp:529
E_Choice
Choice variants.
Definition: Seq_id_.hpp:93
TDdbj & SetDdbj(void)
Select the variant.
Definition: Seq_id_.cpp:397
TPrf & SetPrf(void)
Select the variant.
Definition: Seq_id_.cpp:419
TGenbank & SetGenbank(void)
Select the variant.
Definition: Seq_id_.cpp:243
TSwissprot & SetSwissprot(void)
Select the variant.
Definition: Seq_id_.cpp:309
bool IsSetVersion(void) const
Check if a value has been assigned to Version data member.
void SetChain_id(const TChain_id &value)
Assign a value to Chain_id data member.
bool IsSetName(void) const
Check if a value has been assigned to Name data member.
const TAccession & GetAccession(void) const
Get the Accession member data.
TPdb & SetPdb(void)
Select the variant.
Definition: Seq_id_.cpp:441
@ e_Other
for historical reasons, 'other' = 'refseq'
Definition: Seq_id_.hpp:104
@ e_Gpipe
Internal NCBI genome pipeline processing ID.
Definition: Seq_id_.hpp:113
@ e_Tpe
Third Party Annot/Seq EMBL.
Definition: Seq_id_.hpp:111
@ e_Tpd
Third Party Annot/Seq DDBJ.
Definition: Seq_id_.hpp:112
@ e_Ddbj
DDBJ.
Definition: Seq_id_.hpp:107
@ e_Prf
PRF SEQDB.
Definition: Seq_id_.hpp:108
@ e_Named_annot_track
Internal named annotation tracking ID.
Definition: Seq_id_.hpp:114
@ e_Tpg
Third Party Annot/Seq Genbank.
Definition: Seq_id_.hpp:110
@ e_Pdb
PDB sequence.
Definition: Seq_id_.hpp:109
bool IsSetTechexp(void) const
explanation if tech not enough
Definition: MolInfo_.hpp:522
const TTechexp & GetTechexp(void) const
Get the Techexp member data.
Definition: MolInfo_.hpp:534
void SetTechexp(const TTechexp &value)
Assign a value to Techexp data member.
Definition: MolInfo_.hpp:543
@ eTech_htc
high throughput cDNA
Definition: MolInfo_.hpp:142
@ eTech_sts
Sequence Tagged Site.
Definition: MolInfo_.hpp:126
@ eTech_wgs
whole genome shotgun sequencing
Definition: MolInfo_.hpp:143
@ eTech_survey
one-pass genomic sequence
Definition: MolInfo_.hpp:127
@ eTech_fli_cdna
full length insert cDNA
Definition: MolInfo_.hpp:140
@ eTech_est
Expressed Sequence Tag.
Definition: MolInfo_.hpp:125
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
#define ERR_KEYWORD_MissingTPAKeywords
Definition: indx_err.h:111
#define ERR_ACCESSION_Invalid2ndAccRange
Definition: indx_err.h:71
#define ERR_ACCESSION_2ndAccPrefixMismatch
Definition: indx_err.h:70
#define ERR_KEYWORD_InvalidTPATier
Definition: indx_err.h:109
#define ERR_KEYWORD_UnexpectedTPA
Definition: indx_err.h:110
#define ERR_KEYWORD_MissingTSAKeywords
Definition: indx_err.h:114
#define ERR_KEYWORD_MissingTPATier
Definition: indx_err.h:112
#define ERR_KEYWORD_ConflictingTPATiers
Definition: indx_err.h:113
#define ERR_KEYWORD_MissingTLSKeywords
Definition: indx_err.h:117
#define ERR_ENTRY_InvalidLineType
Definition: indx_err.h:64
#define ERR_KEYWORD_MissingMGAKeywords
Definition: indx_err.h:115
#define ERR_KEYWORD_ConflictingMGAKeywords
Definition: indx_err.h:116
char * buf
int i
int len
static void text(MDB_val *v)
Definition: mdb_dump.c:62
const struct ncbi::grid::netcache::search::fields::SIZE size
const struct ncbi::grid::netcache::search::fields::KEY key
const CharType(& source)[N]
Definition: pointer.h:1149
int isdigit(Uchar c)
Definition: ncbictype.hpp:64
Defines: CTimeFormat - storage class for time format.
The Object manager core.
static char tmp[2048]
Definition: utf8.c:42
static const char * prefix[]
Definition: pcregrep.c:405
static const char delimiter[]
string indent(" ")
static const char * str(char *buf, int n)
Definition: stats.c:84
DataBlkPtr chain
Definition: ftablock.h:344
Definition: entry.h:57
list< SectionPtr > mSections
Definition: entry.h:99
bool assembly
Definition: ftablock.h:244
bool is_tpa
Definition: ftablock.h:209
bool STS
Definition: ftablock.h:196
bool HTC
Definition: ftablock.h:198
bool experimental
Definition: ftablock.h:250
bool inferential
Definition: ftablock.h:248
bool EST
Definition: ftablock.h:195
bool specialist_db
Definition: ftablock.h:246
bool GSS
Definition: ftablock.h:197
string mAccNum
Definition: ftablock.h:84
string mLocus
Definition: ftablock.h:83
TSeqIdList ids
Definition: ftablock.h:82
Definition: entry.h:13
int mType
Definition: entry.h:47
TokenBlk * next
Definition: ftablock.h:135
char * str
Definition: ftablock.h:134
TokenBlk * list
Definition: ftablock.h:140
ValNode * next
Definition: valnode.h:51
char * data
Definition: valnode.h:49
unsigned char choice
Definition: valnode.h:47
Definition: type.c:6
Int2 MatchArraySubString(const Char **array, const Char *text)
Definition: utilfun.cpp:664
static const char * ParFlat_TLS_kw_array[]
Definition: utilfun.cpp:139
USING_SCOPE(objects)
Int2 MatchArrayIString(const Char **array, const Char *text)
Definition: utilfun.cpp:633
CScope & GetScope()
Definition: utilfun.cpp:62
bool HasHtg(const TKeywordList &keywords)
Definition: utilfun.cpp:1719
int SrchKeyword(const CTempString &ptr, const vector< string > &keywordList)
Definition: utilfun.cpp:1042
static const char * ParFlat_STS_kw_array[]
Definition: utilfun.cpp:87
bool HasHtc(const TKeywordList &keywords)
Definition: utilfun.cpp:1748
static const char * ParFlat_MGA_kw_array[]
Definition: utilfun.cpp:111
char * GetBlkDataReplaceNewLine(char *bptr, char *eptr, Int2 start_col_data)
Definition: utilfun.cpp:740
char * SrchTheChar(char *bptr, char *eptr, Char letter)
Definition: utilfun.cpp:903
static const char * ParFlat_MAG_kw_array[]
Definition: utilfun.cpp:172
bool fta_tls_keywords_check(const TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1281
void RemoveHtgPhase(TKeywordList &keywords)
Definition: utilfun.cpp:1733
bool fta_is_tsa_keyword(const char *str)
Definition: utilfun.cpp:1324
static bool sIsPrefixChar(char c)
Definition: utilfun.cpp:260
bool fta_is_tls_keyword(const char *str)
Definition: utilfun.cpp:1332
bool CheckLineType(char *ptr, Int4 line, const vector< string > &keywordList, bool after_origin)
Definition: utilfun.cpp:1055
static const char * ParFlat_TPA_kw_array_to_remove[]
Definition: utilfun.cpp:160
bool SetTextId(Uint1 seqtype, CSeq_id &seqId, CTextseq_id &textId)
Definition: utilfun.cpp:1652
void fta_remove_tsa_keywords(TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1422
void check_est_sts_gss_tpa_kwds(ValNodePtr kwds, size_t len, IndexblkPtr entry, bool tpa_check, bool &specialist_db, bool &inferential, bool &experimental, bool &assembly)
Definition: utilfun.cpp:1516
void fta_remove_tpa_keywords(TKeywordList &kwds)
Definition: utilfun.cpp:1408
static TokenBlkPtr TokenNodeNew(TokenBlkPtr tbp)
Definition: utilfun.cpp:454
Int2 fta_StringMatch(const Char **array, const Char *text)
Definition: utilfun.cpp:557
static const char * ParFlat_FLI_kw_array[]
Definition: utilfun.cpp:101
CRef< CDate_std > get_full_date(const char *s, bool is_ref, Parser::ESource source)
Definition: utilfun.cpp:972
static const char * ParFlat_ENV_kw_array[]
Definition: utilfun.cpp:167
bool fta_is_tpa_keyword(const char *str)
Definition: utilfun.cpp:1315
void CleanTailNoneAlphaCharInString(string &str)
Definition: utilfun.cpp:823
static const char * ParFlat_TPA_kw_array[]
Definition: utilfun.cpp:148
const Section * xTrackNodeType(const Entry &entry, int type)
Definition: utilfun.cpp:1153
char * SrchNodeType(DataBlkPtr entry, Int4 type, size_t *len)
Definition: utilfun.cpp:1094
void xGetBlkDataReplaceNewLine(string &instr, int indent)
Definition: utilfun.cpp:784
static const char * ParFlat_TSA_kw_array[]
Definition: utilfun.cpp:130
void fta_remove_keywords(CMolInfo::TTech tech, TKeywordList &kwds)
Definition: utilfun.cpp:1377
static const char * ParFlat_MGA_more_kw_array[]
Definition: utilfun.cpp:118
void fta_remove_tls_keywords(TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1437
char * xSrchNodeType(const DataBlk &entry, Int4 type, size_t *len)
Definition: utilfun.cpp:1108
string xGetNodeData(const DataBlk &entry, int nodeType)
Definition: utilfun.cpp:1122
bool ParseAccessionRange(list< string > &tokens, unsigned skip)
Definition: utilfun.cpp:265
char * GetTheCurrentToken(char **ptr)
Definition: utilfun.cpp:867
static void InsertTokenVal(TokenBlkPtr *tbp, const char *str)
Definition: utilfun.cpp:468
void fta_keywords_check(const char *str, bool *estk, bool *stsk, bool *gssk, bool *htck, bool *flik, bool *wgsk, bool *tpak, bool *envk, bool *mgak, bool *tsak, bool *tlsk)
Definition: utilfun.cpp:1340
Int2 StringMatchIcase(const Char **array, const Char *text)
Definition: utilfun.cpp:584
static const char * ParFlat_WGS_kw_array[]
Definition: utilfun.cpp:106
void fta_StringCpy(char *dst, const char *src)
Definition: utilfun.cpp:1641
TokenStatBlkPtr TokenString(char *str, Char delimiter)
Definition: utilfun.cpp:489
DataBlkPtr TrackNodeType(const DataBlk &entry, Int2 type)
Definition: utilfun.cpp:1139
void FreeTokenblk(TokenBlkPtr tbp)
Definition: utilfun.cpp:529
static const char * ParFlat_HTC_kw_array[]
Definition: utilfun.cpp:96
void fta_remove_mag_keywords(TKeywordList &kwds)
Definition: utilfun.cpp:1466
void CleanTailNoneAlphaChar(char *str)
Definition: utilfun.cpp:837
static size_t SeekLastAlphaChar(const Char *str, size_t len)
Definition: utilfun.cpp:807
bool IsCancelled(const TKeywordList &keywords)
Definition: utilfun.cpp:1708
Int2 MatchArrayString(const char **array, const char *text)
Definition: utilfun.cpp:615
static const char * ParFlat_GSS_kw_array[]
Definition: utilfun.cpp:80
static string FTAitoa(Int4 m)
Definition: utilfun.cpp:179
Char * StringIStr(const Char *where, const Char *what)
Definition: utilfun.cpp:682
bool fta_tsa_keywords_check(const TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1247
void CpSeqId(InfoBioseqPtr ibp, const CSeq_id &id)
Definition: utilfun.cpp:936
static const char * ParFlat_EST_kw_array[]
Definition: utilfun.cpp:69
void fta_remove_env_keywords(TKeywordList &kwds)
Definition: utilfun.cpp:1452
char * SrchTheStr(char *bptr, char *eptr, const char *leadstr)
Definition: utilfun.cpp:923
bool fta_tpa_keywords_check(const TKeywordList &kwds)
Definition: utilfun.cpp:1165
void FreeTokenstatblk(TokenStatBlkPtr tsbp)
Definition: utilfun.cpp:542
char * PointToNextToken(char *ptr)
Definition: utilfun.cpp:847
bool fta_check_mga_keywords(CMolInfo &mol_info, const TKeywordList &kwds)
Definition: utilfun.cpp:1592
void xCheckEstStsGssTpaKeywords(const list< string > keywordList, bool tpa_check, IndexblkPtr entry)
Definition: utilfun.cpp:1480
void UnwrapAccessionRange(const CGB_block::TExtra_accessions &extra_accs, CGB_block::TExtra_accessions &hist)
Definition: utilfun.cpp:197
ValNodePtr ConstructValNode(CSeq_id::E_Choice choice, const char *data)
Definition: utilfun.cpp:1582
Int2 MatchArrayISubString(const Char **array, const Char *text)
Definition: utilfun.cpp:714
#define ParFlat_UNKW
Definition: utilfun.h:44
ValNodePtr ValNodeNew(ValNodePtr prev, const char *data)
Definition: valnode.cpp:53
static Uint4 letter(char c)
Modified on Thu Dec 07 10:08:32 2023 by modify_doxy.py rev. 669887