NCBI C++ ToolKit
asci_blk.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: asci_blk.cpp 102487 2024-05-13 19:56:07Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * File Name: asci_blk.cpp
27  *
28  * Author: Karl Sirotkin, Hsiu-Chuan Chen
29  *
30  * File Description:
31  * Common for all formats function processing ascii blocks to asn.
32  *
33  */
34 
35 #include <ncbi_pch.hpp>
36 
37 #include <set>
38 
39 #include "ftacpp.hpp"
40 
46 #include <objects/seq/Bioseq.hpp>
48 #include <objects/seq/Seq_inst.hpp>
50 #include <objects/seq/Seq_data.hpp>
52 #include <objects/seq/Seq_ext.hpp>
54 #include <objects/seq/Seg_ext.hpp>
59 #include <objects/general/Date.hpp>
60 #include <objects/seq/Pubdesc.hpp>
63 #include <objects/pub/Pub.hpp>
68 #include <objects/pub/Pub_set.hpp>
74 #include <serial/iterator.hpp>
77 #include <objects/seq/MolInfo.hpp>
78 
79 #include "index.h"
80 #include "genbank.h"
81 #include "embl.h"
82 #include "sprot.h"
83 
85 
86 #include "ftaerr.hpp"
87 #include "indx_blk.h"
88 #include "asci_blk.h"
89 #include "utilfun.h"
90 #include "fta_xml.h"
91 
92 #include "add.h"
93 
94 #ifdef THIS_FILE
95 # undef THIS_FILE
96 #endif
97 #define THIS_FILE "asci_blk.cpp"
98 
99 #define Seq_descr_pub_same 50
100 
103 
/* nullptr-terminated table of marker phrases.  Its consumers are not visible
 * in this part of the file.
 * NOTE(review): presumably matched against "sequencing in progress"-style
 * text of draft records -- confirm against the callers. */
104 const char* magic_phrases[] = {
105  "*** SEQUENCING IN PROGRESS ***",
106  "***SEQUENCING IN PROGRESS***",
107  "WORKING DRAFT SEQUENCE",
108  "LOW-PASS SEQUENCE SAMPLING",
109  "*** IN PROGRESS ***",
110  nullptr
111 };
112 
113 extern vector<string> genbankKeywords;
114 extern vector<string> emblKeywords;
115 extern vector<string> swissProtKeywords;
116 
117 /**********************************************************/
118 void ShrinkSpaces(char* line)
119 {
120  char* p;
121  char* q;
122  bool got_nl;
123 
124  if (! line || *line == '\0')
125  return;
126 
127  for (p = line; *p != '\0'; p++) {
128  if (*p == '\t')
129  *p = ' ';
130  if ((*p == ',' && p[1] == ',') || (*p == ';' && p[1] == ';'))
131  p[1] = ' ';
132  if ((p[1] == ',' || p[1] == ';') && p[0] == ' ') {
133  p[0] = p[1];
134  p[1] = ' ';
135  }
136  }
137 
138  for (p = line, q = line; *p != '\0';) {
139  *q = *p;
140  if (*p == ' ' || *p == '\n') {
141  for (got_nl = false; *p == ' ' || *p == '\n'; p++) {
142  if (*p == '\n')
143  got_nl = true;
144  }
145 
146  if (got_nl)
147  *q = '\n';
148  } else
149  p++;
150  q++;
151  }
152  if (q > line) {
153  for (q--; q > line && (*q == ' ' || *q == ';' || *q == '\n');)
154  q--;
155  if (*q != ' ' && *q != ';' && *q != '\n')
156  q++;
157  }
158  *q = '\0';
159 
160  for (p = line; *p == ' ' || *p == ';' || *p == '\n';)
161  p++;
162  if (p > line)
163  fta_StringCpy(line, p);
164 }
165 
/* std::string flavour of ShrinkSpaces: tabs become spaces, the second of a
 * doubled ",," / ";;" becomes a space, a space directly before ',' or ';' is
 * swapped behind it, whitespace runs collapse to a single ' ' (or '\n' when
 * the run contains a newline), and leading/trailing ' ', ';' and '\n' are
 * stripped.  The string is modified in place. */
void ShrinkSpaces(string& line)
{
    if (line.empty())
        return;

    const size_t total = line.size();

    /* Pass 1: punctuation clean-up. */
    for (size_t pos = 0; pos < total; ++pos) {
        char& here = line[pos];
        if (here == '\t')
            here = ' ';
        if (pos + 1 >= total)
            continue;

        char& after = line[pos + 1];
        const bool doubled = (here == ',' && after == ',') ||
                             (here == ';' && after == ';');
        if (doubled)
            after = ' ';
        if (here == ' ' && (after == ',' || after == ';')) {
            here  = after;
            after = ' ';
        }
    }

    /* Pass 2: squeeze runs of spaces/newlines into one character. */
    size_t out = 0;
    size_t in  = 0;
    while (in < total) {
        char ch = line[in++];
        if (ch == ' ' || ch == '\n') {
            while (in < total && (line[in] == ' ' || line[in] == '\n')) {
                if (line[in] == '\n')
                    ch = '\n';
                ++in;
            }
        }
        line[out++] = ch;
    }
    line.resize(out);

    /* Pass 3: trim ' ', ';' and '\n' from both ends. */
    const char* const trimSet = " ;\n";
    const size_t      lastKept = line.find_last_not_of(trimSet);
    if (lastKept == string::npos) {
        line.clear();
        return;
    }
    line.erase(lastKept + 1);

    const size_t firstKept = line.find_first_not_of(trimSet);
    if (firstKept > 0)
        line.erase(0, firstKept);
}
219 
220 /**********************************************************
221  *
222  * static void InsertDatablkVal(dbp, type, offset, len):
223  *
224  * Allocate a memory, then assign data-block value
225  * to a new node.
226  * *dbp is set to the new node if *dbp is NULL.
227  *
228  * 3-18-93
229  *
230  **********************************************************/
231 static void InsertDatablkVal(DataBlkPtr* dbp, Int2 type, char* offset, size_t len)
232 {
233  DataBlk* ldp = new DataBlk(*dbp, type, offset, len);
234  if (! *dbp) {
235  *dbp = ldp;
236  }
237 }
238 
239 /**********************************************************
240  *
241  * char* GetGenBankBlock(chain, ptr, retkw, eptr):
242  *
243  * Enters knowing current keyword.type and offset,
244  * finds the length of the current keyword block,
245  * and builds the block to "chain".
246  * Since each keyword block always starts in the first
247  * column of a line, the loop stops when it finds the
248  * first non-blank (not space, newline, or tab) character
249  * after a newline character.
250  * Each data block will append to the "chain".
251  * Return a pointer points to next key-word block.
252  *
253  * 3-21-93
254  *
255  **********************************************************/
/* NOTE(review): the signature line of this function is missing from this
 * view; only the body is visible.
 *
 * Splits entry.mBaseData on '\n' and groups the lines into Section objects
 * that are appended to entry.mSections.  The first section is assumed to be
 * of type ParFlat_LOCUS.  A new section starts whenever SrchKeyword() reports
 * a keyword different from the current one, or the line starts with
 * "REFERENCE" (so consecutive references become separate sections).  Lines
 * with no recognised keyword (ParFlat_UNKW) stay in the current section. */
257 {
258  vector<string> lines;
259  NStr::Split(entry.mBaseData, "\n", lines);
260 
261  vector<string> sectionLines;
262  int currentKw = ParFlat_LOCUS;
263  int nextKw;
/* sectionText is not used anywhere in the visible body. */
264  string sectionText;
265  for (const string& line : lines) {
266  nextKw = SrchKeyword(line, genbankKeywords);
267  if (nextKw == ParFlat_UNKW) {
268  nextKw = currentKw;
269  }
270  if (nextKw != currentKw || NStr::StartsWith(line, "REFERENCE")) {
/* Close the section collected so far, then start the next one with
   the line that triggered the switch. */
271  auto* secPtr = new Section(currentKw, sectionLines);
272  // secPtr->DumpText(cerr);
273  entry.mSections.push_back(secPtr);
274  currentKw = nextKw;
275  sectionLines.clear();
276  sectionLines.push_back(line);
277  continue;
278  }
279  sectionLines.push_back(line);
280  }
/* Flush the final section, which no later keyword terminates. */
281  entry.mSections.push_back(new Section(currentKw, sectionLines));
282 }
283 
284 char* GetGenBankBlock(DataBlkPtr* chain, char* ptr, Int2* retkw, char* eptr)
285 {
286  char* offset;
287  int curkw;
288  int nextkw;
289  Int4 len;
290 
291  len = 0;
292  offset = ptr;
293  curkw = *retkw;
294 
295  do /* repeat loop until it finds next key-word */
296  {
297  for (; ptr < eptr && *ptr != '\n'; ptr++)
298  len++;
299  if (ptr >= eptr)
300  return (ptr);
301 
302  ++ptr; /* newline character */
303  ++len;
304 
305  nextkw = SrchKeyword(CTempString(ptr, eptr - ptr), genbankKeywords);
306  if (nextkw == ParFlat_UNKW) /* it can be "XX" line,
307  treat as same line */
308  nextkw = curkw;
309 
310  if (StringEquN(ptr, "REFERENCE", 9)) /* treat as one block */
311  break;
312  } while (nextkw == curkw);
313 
314  nextkw = SrchKeyword(ptr, genbankKeywords);
315 
316  InsertDatablkVal(chain, curkw, offset, len);
317  *retkw = nextkw;
318  return (ptr);
319 }
320 
321 
322 /**********************************************************
323  *
324  * static void GetGenBankRefType(dbp, bases):
325  *
326  * Check the data in the "REFERENCE" line,
327  * - ParFlat_REF_END if it contains
328  * "(bases 1 to endbases)", pub for "descr"
329  * or no base range at all;
330  * - ParFlat_REF_SITES if it contains "(sites)",
331  * for ImpFeatPub;
332  * - ParFlat_REF_BTW, otherwise, for SeqFeatPub.
333  *
334  * 5-19-93
335  *
336  **********************************************************/
/* Re-types a REFERENCE block according to the text of the block:
 *   - ParFlat_REF_END   if it contains "(bases 1 to <bases>)",
 *                       "(bases 1 to <bases>;" or "(residues 1 to <bases>aa)";
 *   - ParFlat_REF_SITES if it contains "(sites)";
 *   - ParFlat_REF_BTW   otherwise.
 * dbp   - block whose mType is updated;
 * bases - sequence length used to build the search patterns. */
337 static void GetGenBankRefType(DataBlkPtr dbp, size_t bases)
338 {
339  char* bptr;
340  char* eptr;
341 
342  bptr = dbp->mOffset;
343  eptr = bptr + dbp->len;
344 
345  const string s = to_string(bases);
346  const string str = "(bases 1 to " + s + ")";
347  const string str1 = "(bases 1 to " + s + ";";
348  const string str2 = "(residues 1 to " + s + "aa)";
349 
/* The pattern searches below run over a copy of the whole block. */
350  string ref(bptr, bptr + dbp->len);
351 
/* Advance to the first '(' or newline of the block, then over blanks.
   NOTE(review): the second loop is not bounded by eptr. */
352  while (bptr < eptr && *bptr != '\n' && *bptr != '(')
353  bptr++;
354  while (*bptr == ' ')
355  bptr++;
356 
/* NOTE(review): the statement belonging to this first branch (embedded
   listing line 358) is missing from this view. */
357  if (*bptr == '\n')
359  else if (NStr::Find(ref, str) != NPOS || NStr::Find(ref, str1) != NPOS ||
360  NStr::Find(ref, str2) != NPOS)
361  dbp->mType = ParFlat_REF_END;
362  else if (NStr::Find(ref, "(sites)") != NPOS)
363  dbp->mType = ParFlat_REF_SITES;
364  else
365  dbp->mType = ParFlat_REF_BTW;
366 }
367 
368 /**********************************************************
369  *
370  * static void BuildFeatureBlock(dbp):
371  *
372  * The feature key in column 6-20.
373  *
374  * 5-3-93
375  *
376  **********************************************************/
/* NOTE(review): the signature line is missing from this view; the body uses
 * a block pointer named dbp.
 *
 * Skips the first line of the block, then adds one ParFlat_FEATBLOCK
 * sub-block to dbp->mpData per feature.  A feature starts on a line whose
 * character at offset ParFlat_COL_FEATKEY is not a blank; lines starting with
 * "XX" are skipped.  Each sub-block is inserted with the length "up to the
 * end of the parent block". */
378 {
379  char* bptr;
380  char* eptr;
381  char* ptr;
382  bool skip;
383 
384  bptr = dbp->mOffset;
385  eptr = bptr + dbp->len;
386  ptr = SrchTheChar(bptr, eptr, '\n');
387 
/* No newline at all: the block has only its header line. */
388  if (! ptr)
389  return;
390 
391  bptr = ptr + 1;
392 
393  while (bptr < eptr) {
394  InsertDatablkVal(reinterpret_cast<DataBlk**>(&dbp->mpData), ParFlat_FEATBLOCK, bptr, eptr - bptr);
395 
/* Move line by line to the start of the next feature.
   NOTE(review): the result of SrchTheChar is incremented without a null
   check, and bptr/ptr are read before being compared with eptr. */
396  do {
397  bptr = SrchTheChar(bptr, eptr, '\n');
398  bptr++;
399 
400  skip = false;
401  if (! StringEquN(bptr, "XX", 2))
402  ptr = bptr + ParFlat_COL_FEATKEY;
403  else
404  skip = true;
405  } while ((*ptr == ' ' && ptr < eptr) || skip);
406  }
407 }
408 
409 /**********************************************************/
410 static void fta_check_mult_ids(DataBlkPtr dbp, const char* mtag, const char* ptag)
411 {
412  char* p;
413  Char ch;
414  Int4 muids;
415  Int4 pmids;
416 
417  if (! dbp || ! dbp->mOffset || (! mtag && ! ptag))
418  return;
419 
420  ch = dbp->mOffset[dbp->len];
421  dbp->mOffset[dbp->len] = '\0';
422 
423  size_t mlen = mtag ? StringLen(mtag) : 0;
424  size_t plen = ptag ? StringLen(ptag) : 0;
425 
426  muids = 0;
427  pmids = 0;
428  for (p = dbp->mOffset;; p++) {
429  p = StringChr(p, '\n');
430  if (! p)
431  break;
432  if (mtag && StringEquN(p + 1, mtag, mlen))
433  muids++;
434  else if (ptag && StringEquN(p + 1, ptag, plen))
435  pmids++;
436  }
437  dbp->mOffset[dbp->len] = ch;
438 
439  if (muids > 1) {
440  ErrPostEx(SEV_ERROR, ERR_REFERENCE_MultipleIdentifiers, "Reference has multiple MEDLINE identifiers. Ignoring all but the first.");
441  }
442  if (pmids > 1) {
443  ErrPostEx(SEV_ERROR, ERR_REFERENCE_MultipleIdentifiers, "Reference has multiple PUBMED identifiers. Ignoring all but the first.");
444  }
445 }
446 
447 /**********************************************************
448  *
449  * void GetGenBankSubBlock(entry, bases):
450  *
451  * 4-7-93
452  *
453  **********************************************************/
454 void GetGenBankSubBlock(const DataBlk& entry, size_t bases)
455 {
456  DataBlkPtr dbp;
457 
458  dbp = TrackNodeType(entry, ParFlat_SOURCE);
459  if (dbp) {
460  BuildSubBlock(dbp, ParFlat_ORGANISM, " ORGANISM");
461  GetLenSubNode(dbp);
462  }
463 
464  dbp = TrackNodeType(entry, ParFlat_REFERENCE);
465  for (; dbp; dbp = dbp->mpNext) {
466  if (dbp->mType != ParFlat_REFERENCE)
467  continue;
468 
469  fta_check_mult_ids(dbp, " MEDLINE", " PUBMED");
470  BuildSubBlock(dbp, ParFlat_AUTHORS, " AUTHORS");
471  BuildSubBlock(dbp, ParFlat_CONSRTM, " CONSRTM");
472  BuildSubBlock(dbp, ParFlat_TITLE, " TITLE");
473  BuildSubBlock(dbp, ParFlat_JOURNAL, " JOURNAL");
474  BuildSubBlock(dbp, ParFlat_MEDLINE, " MEDLINE");
475  BuildSubBlock(dbp, ParFlat_PUBMED, " PUBMED");
476  BuildSubBlock(dbp, ParFlat_STANDARD, " STANDARD");
477  BuildSubBlock(dbp, ParFlat_REMARK, " REMARK");
478  GetLenSubNode(dbp);
479  GetGenBankRefType(dbp, bases);
480  }
481 
482  dbp = TrackNodeType(entry, ParFlat_FEATURES);
483  for (; dbp; dbp = dbp->mpNext) {
484  if (dbp->mType != ParFlat_FEATURES)
485  continue;
486 
487  BuildFeatureBlock(dbp);
488  GetLenSubNode(dbp);
489  }
490 }
491 
492 // ----------------------------------------------------------------------------
493 void xGetGenBankSubBlocks(Entry& entry, size_t bases)
494 // ----------------------------------------------------------------------------
495 {
496  for (auto secPtr : entry.mSections) {
497  auto secType = secPtr->mType;
498  if (secType == ParFlat_SOURCE) {
499  secPtr->xBuildSubBlock(ParFlat_ORGANISM, " ORGANISM");
500  // GetLenSubNode(dbp);
501  }
502  if (secType == ParFlat_REFERENCE) {
503  // fta_check_mult_ids(dbp, " MEDLINE", " PUBMED");
504  secPtr->xBuildSubBlock(ParFlat_AUTHORS, " AUTHORS");
505  secPtr->xBuildSubBlock(ParFlat_CONSRTM, " CONSRTM");
506  secPtr->xBuildSubBlock(ParFlat_TITLE, " TITLE");
507  secPtr->xBuildSubBlock(ParFlat_JOURNAL, " JOURNAL");
508  secPtr->xBuildSubBlock(ParFlat_MEDLINE, " MEDLINE");
509  secPtr->xBuildSubBlock(ParFlat_PUBMED, " PUBMED");
510  secPtr->xBuildSubBlock(ParFlat_STANDARD, " STANDARD");
511  secPtr->xBuildSubBlock(ParFlat_REMARK, " REMARK");
512  // GetLenSubNode(dbp);
513  // GetGenBankRefType(dbp, bases);
514  }
515  if (secType == ParFlat_FEATURES) {
516  secPtr->xBuildFeatureBlocks();
517  // GetLenSubNode(dbp);
518  }
519  }
520 }
521 
522 /**********************************************************
523  *
524  * char* GetEmblBlock(chain, ptr, retkw, format, eptr):
525  *
526  * Enters knowing current keyword.type and offset,
527  * finds the length of the current keyword block, and
528  * builds the block to "chain".
529  * Loop will continue until it finds the next keyword
530  * or next "RN" after the newline character.
531  * Each data block will append to the "chain".
532  * Return a pointer points to next key-word block.
533  *
534  * 3-21-93
535  *
536  * The OS block can be
537  * - OS OS OC OC XX OG ==> this normal
538  * or
539  * - OS OC OC XX OS OS OC OC XX OG ==> this hybrids
540  * For case 2, two OS blocks need to be made.
541  *
542  * 12-15-93
543  *
544  **********************************************************/
/* chain  - list that receives the finished block;
 * ptr    - start of the current block;
 * retkw  - in: keyword of the current block; out: keyword of the next block,
 *          or ParFlat_END when the data runs out before a newline;
 * format - NOTE(review): its use is not visible here, because the second
 *          half of the SrchKeyword() call (embedded listing line 569) is
 *          missing from this view;
 * eptr   - end of the entry data.
 * Returns the position where the next block starts. */
545 char* GetEmblBlock(DataBlkPtr* chain, char* ptr, short* retkw, Parser::EFormat format, char* eptr)
546 {
547  char* offset;
548  Int2 curkw;
549  Int2 nextkw;
550  bool seen_oc = false;
551 
552  size_t len = 0;
553  offset = ptr;
554  curkw = *retkw;
555 
556  do /* repeat loop until it finds next key-word */
557  {
558  for (; ptr < eptr && *ptr != '\n'; ptr++)
559  len++;
/* End of data reached: no block is inserted for the pending text. */
560  if (ptr >= eptr) {
561  *retkw = ParFlat_END;
562  return (ptr);
563  }
564  ++ptr; /* newline character */
565  ++len;
566 
567  nextkw = SrchKeyword(
568  CTempString(ptr, eptr - ptr),
570  if (nextkw == ParFlat_UNKW) /* it can be "XX" line,
571  treat as same line */
572  nextkw = curkw;
573  if (StringEquN(ptr, "RN", 2)) /* treat each RN per block */
574  break;
575  if (StringEquN(ptr, "ID", 2)) /* treat each ID per block */
576  break;
577 
/* An "OS" line seen after an "OC" line starts a second organism block. */
578  if (StringEquN(ptr, "OC", 2))
579  seen_oc = true;
580 
581  if (StringEquN(ptr, "OS", 2) && seen_oc)
582  break; /* treat as next OS block */
583 
584  } while (nextkw == curkw);
585 
586  InsertDatablkVal(chain, curkw, offset, len);
587 
588  *retkw = nextkw;
589  return (ptr);
590 }
591 
592 /**********************************************************
593  *
594  * static bool TrimEmblFeatBlk(dbp):
595  *
596  * Routine return TRUE if found FT data.
597  * The routine do the following things:
598  * - only leave last one FH line;
599  * - replace all "FT" to " " in the beginning of line.
600  *
601  * 6-15-93
602  *
603  **********************************************************/
604 static bool TrimEmblFeatBlk(DataBlkPtr dbp)
605 {
606  char* bptr;
607  char* eptr;
608  char* ptr;
609  bool flag = false;
610 
611  bptr = dbp->mOffset;
612  eptr = bptr + dbp->len;
613  ptr = SrchTheChar(bptr, eptr, '\n');
614 
615  while (ptr && ptr + 1 < eptr) {
616  if (ptr[2] == 'H') {
617  dbp->len = dbp->len - (ptr - dbp->mOffset + 1);
618  dbp->mOffset = ptr + 1;
619 
620  bptr = dbp->mOffset;
621  eptr = bptr + dbp->len;
622  } else {
623  bptr = ptr + 1;
624 
625  if (bptr[1] == 'T') {
626  flag = true;
627  *bptr = ' ';
628  bptr[1] = ' ';
629  }
630  }
631 
632  ptr = SrchTheChar(bptr, eptr, '\n');
633  }
634 
635  return (flag);
636 }
637 
638 /**********************************************************
639  *
640  * static bool GetSubNodeType(subkw, retbptr, eptr):
641  *
642  * Return TRUE and memory location which has
643  * the "subkw".
644  *
645  * 6-15-93
646  *
647  **********************************************************/
648 static bool GetSubNodeType(const char* subkw, char** retbptr, char* eptr)
649 {
650  char* bptr;
651  char* ptr;
652 
653  bptr = *retbptr;
654  size_t sublen = StringLen(subkw);
655 
656  while (bptr < eptr) {
657  if (StringEquN(bptr, subkw, sublen)) {
658  *retbptr = bptr;
659  return true;
660  }
661 
662  ptr = SrchTheChar(bptr, eptr, '\n');
663  if (ptr)
664  bptr = ptr;
665  bptr++;
666  }
667 
668  *retbptr = bptr;
669  return false;
670 }
671 
672 /**********************************************************
673  *
674  * static void GetEmblRefType(bases, source, dbp):
675  *
676  * If there is no "RP" line, default, or there is "RP"
677  * line and it contains "1-endbases", then
678  * type = ParFlat_REF_END, pub for "descr".
679  * Otherwise, ParFlat_REF_BTW, for SeqFeatPub.
680  *
681  * 6-15-93
682  *
683  **********************************************************/
/* Re-types an EMBL reference block from its "RP" line.
 * bases  - sequence length, used to build the " 1-<bases>" pattern;
 * source - data source; selects the extra EMBL and NCBI checks below;
 * dbp    - block whose mType is updated.
 * NOTE(review): embedded listing lines 695-696 and 712 are missing from this
 * view, so two of the branches below are shown without their full text. */
684 static void GetEmblRefType(size_t bases, Parser::ESource source, DataBlkPtr dbp)
685 {
686  char* ptr;
687  char* bptr;
688  char* eptr;
689  char* sptr;
690 
691  bptr = dbp->mOffset;
692  eptr = bptr + dbp->len;
693 
/* No "RP" line in the block. */
694  if (! GetSubNodeType("RP", &bptr, eptr)) {
697  else
698  dbp->mType = ParFlat_REF_END;
699  return;
700  }
701 
/* From here on bptr points at the "RP" line. */
702  const string str = " 1-" + to_string(bases);
703  ptr = SrchTheStr(bptr, eptr, str.c_str());
704  if (ptr) {
705  dbp->mType = ParFlat_REF_END;
706  return;
707  }
708 
709  if (source == Parser::ESource::EMBL) {
710  ptr = SrchTheStr(bptr, eptr, " 0-0");
711  if (ptr) {
713  return;
714  }
715  }
716 
717  dbp->mType = ParFlat_REF_BTW;
718  if (source == Parser::ESource::NCBI) {
/* Search for "sites" only up to the next 'R' character after the
   start of the RP line. */
719  for (sptr = bptr + 1; sptr < eptr && *sptr != 'R';)
720  sptr++;
721  if (SrchTheStr(bptr, sptr, "sites"))
722  dbp->mType = ParFlat_REF_SITES;
723  }
724 }
725 
726 /**********************************************************
727  *
728  * void GetEmblSubBlock(bases, source, entry):
729  *
730  * To build feature block:
731  * - report error if no FT data in the FH block;
732  * - to fit genbank feature table parsing:
733  * - only leave first FH line;
734  * - replace "FT" to " ";
735  * - delete any XX blocks.
736  *
737  * 5-27-93
738  *
739  **********************************************************/
740 void GetEmblSubBlock(size_t bases, Parser::ESource source, const DataBlk& entry)
741 {
742  DataBlkPtr temp;
743  DataBlkPtr curdbp;
744  DataBlkPtr predbp;
745  EntryBlkPtr ebp;
746 
747  temp = TrackNodeType(entry, ParFlat_OS);
748  for (; temp; temp = temp->mpNext) {
749  if (temp->mType != ParFlat_OS)
750  continue;
751 
752  BuildSubBlock(temp, ParFlat_OC, "OC");
753  BuildSubBlock(temp, ParFlat_OG, "OG");
754  GetLenSubNode(temp);
755  }
756 
757  temp = TrackNodeType(entry, ParFlat_RN);
758  for (; temp; temp = temp->mpNext) {
759  if (temp->mType != ParFlat_RN)
760  continue;
761 
762  fta_check_mult_ids(temp, "RX MEDLINE;", "RX PUBMED;");
763  BuildSubBlock(temp, ParFlat_RC, "RC");
764  BuildSubBlock(temp, ParFlat_RP, "RP");
765  BuildSubBlock(temp, ParFlat_RX, "RX");
766  BuildSubBlock(temp, ParFlat_RG, "RG");
767  BuildSubBlock(temp, ParFlat_RA, "RA");
768  BuildSubBlock(temp, ParFlat_RT, "RT");
769  BuildSubBlock(temp, ParFlat_RL, "RL");
770  GetEmblRefType(bases, source, temp);
771  GetLenSubNode(temp);
772  }
773 
774  ebp = static_cast<EntryBlk*>(entry.mpData);
775  temp = ebp->chain;
776  predbp = temp;
777  curdbp = temp->mpNext;
778  while (curdbp) {
779  if (curdbp->mType != ParFlat_FH) {
780  predbp = curdbp;
781  curdbp = curdbp->mpNext;
782  continue;
783  }
784 
785  if (TrimEmblFeatBlk(curdbp)) {
786  BuildFeatureBlock(curdbp);
787  GetLenSubNode(curdbp);
788 
789  predbp = curdbp;
790  curdbp = curdbp->mpNext;
791  } else /* report error, free this node */
792  {
793  ErrPostStr(SEV_WARNING, ERR_FEATURE_NoFeatData, "No feature data in the FH block (Embl)");
794 
795  predbp->mpNext = curdbp->mpNext;
796  curdbp->mpNext = nullptr;
797  delete curdbp;
798  curdbp = predbp->mpNext;
799  }
800  }
801 }
802 
803 /**********************************************************
804  *
805  * void BuildSubBlock(dbp, subtype, subkw):
806  *
807  * Some of sub-keyword may not be exist in every entry.
808  *
809  * 4-7-93
810  *
811  **********************************************************/
812 void BuildSubBlock(DataBlkPtr dbp, Int2 subtype, const char* subkw)
813 {
814  char* bptr;
815  char* eptr;
816 
817  bptr = dbp->mOffset;
818  eptr = bptr + dbp->len;
819 
820  if (GetSubNodeType(subkw, &bptr, eptr)) {
821  InsertDatablkVal(reinterpret_cast<DataBlk**>(&dbp->mpData), subtype, bptr, eptr - bptr);
822  }
823 }
824 
825 /**********************************************************
826  *
827  * void GetLenSubNode(dbp):
828  *
829  * Recalculate the length for the node which has
830  * subkeywords.
831  *
832  * 4-7-93
833  *
834  **********************************************************/
/* NOTE(review): the signature line is missing from this view; the body uses
 * a block pointer named dbp.
 *
 * Shrinks dbp->len so that the block ends where its nearest sub-block
 * starts, then shrinks every sub-block except the last one in the list so
 * that it ends where the nearest following sub-block starts.  Posts a
 * LineTypeOrder warning, at most once for the parent-level check, when the
 * nearest sub-block is not the one the list order predicts. */
836 {
837  DataBlkPtr curdbp;
838  DataBlkPtr ndbp;
839  DataBlkPtr ldbp;
840  char* offset;
841  char* s;
842  Int2 n;
843  bool done = false;
844 
845  if (! dbp->mpData) /* no sublocks in this block */
846  return;
847 
/* n is the first number found in the block text; it is used only in the
   warning messages below. */
848  offset = dbp->mOffset;
849  for (s = offset; *s != '\0' && isdigit(*s) == 0;)
850  s++;
851  n = atoi(s);
852  ldbp = nullptr;
853  for (ndbp = static_cast<DataBlk*>(dbp->mpData); ndbp; ndbp = ndbp->mpNext) {
/* l is unsigned, so "l > 0" only rejects a zero distance.
   NOTE(review): a sub-block that starts before offset presumably
   wraps to a huge value and is rejected by the "<" test -- confirm. */
854  size_t l = ndbp->mOffset - offset;
855  if (l > 0 && l < dbp->len) {
856  dbp->len = l;
857  ldbp = ndbp;
858  }
859  }
860 
861  if (ldbp != dbp->mpData && ldbp) {
862  ErrPostEx(SEV_WARNING, ERR_FORMAT_LineTypeOrder, "incorrect line type order for reference %d", n);
863  done = true;
864  }
865 
/* The last sub-block keeps the length it was created with. */
866  curdbp = static_cast<DataBlk*>(dbp->mpData);
867  for (; curdbp->mpNext; curdbp = curdbp->mpNext) {
868  offset = curdbp->mOffset;
869  ldbp = nullptr;
870  for (ndbp = static_cast<DataBlk*>(dbp->mpData); ndbp; ndbp = ndbp->mpNext) {
871  size_t l = ndbp->mOffset - offset;
872  if (l > 0 && l < curdbp->len) {
873  curdbp->len = l;
874  ldbp = ndbp;
875  }
876  }
877  if (ldbp != curdbp->mpNext && ldbp && ! done) {
878  ErrPostEx(SEV_WARNING, ERR_FORMAT_LineTypeOrder, "incorrect line type order for reference %d", n);
879  }
880  }
881 }
882 
883 /**********************************************************/
/* NOTE(review): the signature line is missing from this view; the body reads
 * a C string named acc and returns a CRef<CPatent_seq_id>.
 *
 * Builds a patent seq-id from a '|'-separated string.  The fields after the
 * first four '|' characters are used, in order, as country, number, document
 * type and the numeric seqid.  Returns an empty reference for a null or empty
 * acc.
 * NOTE(review): none of the StringChr() results is checked, so the code
 * presumably relies on acc always containing at least four '|' characters --
 * confirm against the caller. */
885 {
886  CRef<CPatent_seq_id> pat_id;
887  const char* p;
888  const char* q;
889 
890  if (! acc || *acc == '\0')
891  return (pat_id);
892 
893  pat_id = new CPatent_seq_id;
894 
/* Text before the first '|' is ignored. */
895  p = StringChr(acc, '|');
896 
897  q = StringChr(p + 1, '|');
898  pat_id->SetCit().SetCountry(string(p + 1, q));
899 
900  p = StringChr(q + 1, '|');
901  pat_id->SetCit().SetId().SetNumber(string(q + 1, p));
902 
903  q = StringChr(p + 1, '|');
904  pat_id->SetCit().SetDoc_type(string(p + 1, q));
905 
906  pat_id->SetSeqid(atoi(q + 1));
907 
908  return (pat_id);
909 }
910 
911 /**********************************************************
912  *
913  * static Uint ValidSeqType(accession, type, is_nuc, is_tpa):
914  *
915  * 9-16-93
916  *
917  **********************************************************/
/* Reconciles the requested seq-id type with the type implied by the
 * accession.  When "type" and the accession's type fall in the same pair
 * (Genbank/Tpg, Ddbj/Tpd or Embl/Tpe) the accession's type is returned;
 * otherwise "type" is returned unchanged.
 * NOTE(review): the conditions guarding the first two returns (embedded
 * listing lines 921-923 and 926-927) are missing from this view. */
918 static Uint1 ValidSeqType(const char* accession, Uint1 type)
919 {
920  // CSeq_id::E_Choice cho;
921 
924  return (type);
925 
928  return (CSeq_id::e_not_set);
929 
/* Without an accession there is nothing to compare "type" against. */
930  if (! accession)
931  return (type);
932 
933  const auto cho = CSeq_id::GetAccType(CSeq_id::IdentifyAccession(accession));
934  /*
935  if (is_nuc)
936  cho = GetNucAccOwner(accession);
937  else
938  cho = GetProtAccOwner(accession);
939  */
940  if ((type == CSeq_id::e_Genbank || type == CSeq_id::e_Tpg) &&
941  (cho == CSeq_id::e_Genbank || cho == CSeq_id::e_Tpg))
942  return (cho);
943  else if ((type == CSeq_id::e_Ddbj || type == CSeq_id::e_Tpd) &&
944  (cho == CSeq_id::e_Ddbj || cho == CSeq_id::e_Tpd))
945  return (cho);
946  else if ((type == CSeq_id::e_Embl || type == CSeq_id::e_Tpe) &&
947  (cho == CSeq_id::e_Embl || cho == CSeq_id::e_Tpe))
948  return (cho);
949  return type;
950 }
951 
952 /**********************************************************
953  *
954  * CRef<CSeq_id> MakeAccSeqId(acc, seqtype, accver, vernum,
955  * is_nuc, is_tpa):
956  *
957  * 5-10-93
958  *
959  **********************************************************/
960 CRef<CSeq_id> MakeAccSeqId(const char* acc, Uint1 seqtype, bool accver, Int2 vernum)
961 {
962  CRef<CSeq_id> id;
963 
964  if (! acc || *acc == '\0')
965  return id;
966 
967  seqtype = ValidSeqType(acc, seqtype);
968 
969  if (seqtype == CSeq_id::e_not_set)
970  return id;
971 
972  CRef<CTextseq_id> text_id(new CTextseq_id);
973  text_id->SetAccession(acc);
974 
975  if (accver && vernum > 0)
976  text_id->SetVersion(vernum);
977 
978  id = new CSeq_id;
979  SetTextId(seqtype, *id, *text_id);
980  return id;
981 }
982 
983 /**********************************************************
984  *
985  * SeqIdPtr MakeLocusSeqId(locus, seqtype):
986  *
987  * 5-13-93
988  *
989  **********************************************************/
990 CRef<CSeq_id> MakeLocusSeqId(const char* locus, CSeq_id::E_Choice seqtype)
991 {
992  CRef<CSeq_id> res;
993  if (! locus || *locus == '\0')
994  return res;
995 
996  CRef<CTextseq_id> text_id(new CTextseq_id);
997  text_id->SetName(locus);
998 
999  res.Reset(new CSeq_id);
1000  SetTextId(seqtype, *res, *text_id);
1001 
1002  return res;
1003 }
1004 
1005 // LCOV_EXCL_START
1006 // Excluded per Mark's request on 12/14/2016
1007 /**********************************************************/
1008 static CRef<CSeq_id> MakeSegSetSeqId(const char* accession, const string& locus, Uint1 seqtype, bool is_tpa)
1009 {
1010  CRef<CSeq_id> res;
1011  if (locus.empty())
1012  return res;
1013 
1014  seqtype = ValidSeqType(accession, seqtype);
1015 
1016  if (seqtype == CSeq_id::e_not_set)
1017  return res;
1018 
1019  CRef<CTextseq_id> text_id(new CTextseq_id);
1020  text_id->SetName(locus);
1021 
1022  res.Reset(new CSeq_id);
1023  SetTextId(seqtype, *res, *text_id);
1024 
1025  return res;
1026 }
1027 // LCOV_EXCL_STOP
1028 
1029 /**********************************************************
1030  *
1031  * char* SrchNodeSubType(entry, type, subtype, len):
1032  *
1033  * Return a memory location of the node which has
1034  * the "subtype".
1035  *
1036  * 4-7-93
1037  *
1038  **********************************************************/
1039 char* SrchNodeSubType(const DataBlk& entry, Int2 type, Int2 subtype, size_t* len)
1040 {
1041  DataBlkPtr mdbp;
1042  DataBlkPtr sdbp;
1043 
1044  *len = 0;
1045  mdbp = TrackNodeType(entry, type);
1046  if (! mdbp)
1047  return nullptr;
1048 
1049  sdbp = static_cast<DataBlk*>(mdbp->mpData);
1050 
1051  while (sdbp && sdbp->mType != subtype)
1052  sdbp = sdbp->mpNext;
1053 
1054  if (! sdbp)
1055  return nullptr;
1056 
1057  *len = sdbp->len;
1058  return (sdbp->mOffset);
1059 }
1060 
1061 /**********************************************************/
1062 static void SetEmptyId(CBioseq& bioseq)
1063 {
1064  CRef<CObject_id> emptyId(new CObject_id);
1065  emptyId->SetId8(0);
1066 
1067  CRef<CSeq_id> seqId(new CSeq_id);
1068  seqId->SetLocal(*emptyId);
1069 
1070  bioseq.SetId().push_back(seqId);
1071 }
1072 
1073 /**********************************************************/
/* NOTE(review): the signature line is missing from this view; the body reads
 * a parser object named pp and returns a CRef<CBioseq>.
 *
 * Creates a new CBioseq and gives it one seq-id derived from the current
 * index entry (pp->entrylist[pp->curindx]): a text seq-id built from locus,
 * accession and version when a seq-id type can be determined, otherwise a
 * local id, falling back to the "empty" id from SetEmptyId(). */
1075 {
1076  IndexblkPtr ibp;
1077 
1078  char* locus;
1079  const char* acc;
1080  Uint1 seqtype;
1081 
1082  CRef<CBioseq> res(new CBioseq);
1083 
1084  /* create the entry framework */
1085 
1086  ibp = pp->entrylist[pp->curindx];
1087  locus = ibp->locusname;
1088  acc = ibp->acnum;
1089 
1090  /* get the SeqId */
1091  if (pp->source == Parser::ESource::USPTO) {
/* NOTE(review): the line that defines psip (embedded listing line
   1093) is missing from this view.  In the visible code "id" is
   never added to res before returning -- confirm that this is
   intended. */
1092  CRef<CSeq_id> id(new CSeq_id);
1094  id->SetPatent(*psip);
1095  return (res);
1096  }
1097  if (pp->source == Parser::ESource::EMBL && ibp->is_tpa)
1098  seqtype = CSeq_id::e_Tpe;
1099  else
1100  seqtype = ValidSeqType(acc, pp->seqtype);
1101 
1102  if (seqtype == CSeq_id::e_not_set) {
/* No usable type: use a local id made from the accession, or from the
   locus in Relaxed mode, or the empty id as a last resort. */
1103  if (acc && ! NStr::IsBlank(acc)) {
1104  auto pId = Ref(new CSeq_id(CSeq_id::e_Local, acc));
1105  res->SetId().push_back(std::move(pId));
1106  } else if (pp->mode == Parser::EMode::Relaxed && locus) {
1107  auto pId = Ref(new CSeq_id(CSeq_id::e_Local, locus));
1108  res->SetId().push_back(std::move(pId));
1109  } else {
1110  SetEmptyId(*res);
1111  }
1112  } else if ((! locus || *locus == '\0') && (! acc || *acc == '\0')) {
1113  SetEmptyId(*res);
1114  } else {
1115  CRef<CTextseq_id> textId(new CTextseq_id);
1116 
/* The locus becomes the name only when the entry is not flagged
   embl_new_ID and the locus differs from the accession. */
1117  if (ibp->embl_new_ID == false && locus && *locus != '\0' &&
1118  (! acc || ! StringEqu(acc, locus)))
1119  textId->SetName(locus);
1120 
1121  if (acc && *acc != '\0')
1122  textId->SetAccession(acc);
1123 
1124  if (pp->accver && ibp->vernum > 0)
1125  textId->SetVersion(ibp->vernum);
1126 
1127  CRef<CSeq_id> seqId(new CSeq_id);
1128  if (SetTextId(seqtype, *seqId, *textId))
1129  res->SetId().push_back(seqId);
1130  else
1131  SetEmptyId(*res);
1132  }
1133 
1134  return res;
1135 }
1136 
1137 /**********************************************************
1138  *
1139  * char* GetDescrComment(offset, len, col_data, is_htg):
1140  *
1141  * Return a pointer to a string comment.
1142  * Strip trailing or leading blanks, unless one of the
1143  * following rules applies (all lengths include
1144  * leading and trailing blanks):
1145  * - replace "\n" with "~~ ~~" if the length of a
1146  * line <= 12, except for the first blank line;
1147  * - if column 13 is blank in the current line
1148  * and no "~" was added at the end of the previous
1149  * line, then add "~" at the beginning of the line
1150  * (indent format);
1151  * - replace "\n" to "~" if the length of a
1152  * line < 50 and (not a last line or not a first
1153  * line);
1154  * -- otherwise, change "\n" to a space.
1155  *
1156  * 4-28-93
1157  *
1158  **********************************************************/
/* offset   - start of the comment block;
 * len      - length of the block; also the size requested from StringNew();
 * col_data - column at which the comment text starts on each line;
 * is_htg   - skip lines whose character at col_data is '*';
 * is_pat   - end a line with '~' when it starts with two capital letters
 *            followed by the blank pattern tested below.
 * Returns a newly allocated string, or nullptr (after freeing the buffer)
 * when the resulting comment is empty. */
1159 char* GetDescrComment(char* offset, size_t len, Uint2 col_data, bool is_htg, bool is_pat)
1160 {
1161  char* p;
1162  char* q;
1163  char* r;
1164  char* str;
1165 
1166  bool within = false;
1167  char* bptr = offset;
1168  char* eptr = bptr + len;
1169  char* com = StringNew(len);
1170 
/* One iteration per input line; str is the write position in com.
   NOTE(review): p is used without a null check, so every line is
   presumably expected to end with '\n' before eptr -- confirm. */
1171  for (str = com; bptr < eptr; bptr = p + 1) {
1172  p = SrchTheChar(bptr, eptr, '\n');
1173 
1174  /* skip HTG generated comments starting with '*' */
1175  if ((is_htg && bptr[col_data] == '*') ||
1176  StringEquN(bptr, "XX", 2))
1177  continue;
1178 
/* "within" is set once a line contains "-START##" and stays set until
   a line contains "-END##"; the line is NUL-terminated temporarily so
   that the search stays inside it. */
1179  if (! within) {
1180  *p = '\0';
1181  r = StringStr(bptr, "-START##");
1182  *p = '\n';
1183  if (r)
1184  within = true;
1185  }
1186 
/* Skip up to two leading 'C' characters and the blanks after them; a
   line with nothing else is emitted as "~~" (one '~' if the previous
   output character already is '~').
   NOTE(review): *(str - 1) reads in front of the buffer while
   str == com. */
1187  q = bptr;
1188  if (*q == 'C')
1189  q++;
1190  if (*q == 'C')
1191  q++;
1192  while (*q == ' ')
1193  q++;
1194  if (q == p) {
1195  if (*(str - 1) != '~')
1196  *str++ = '~';
1197  *str++ = '~';
1198  continue;
1199  }
1200 
/* Lines shorter than col_data carry no comment text. */
1201  if (p - bptr < col_data)
1202  continue;
1203 
1204  bptr += col_data;
1205  size_t size = p - bptr;
1206 
/* Indented text starts a new line in the output. */
1207  if (*bptr == ' ' && *(str - 1) != '~')
1208  *str++ = '~';
1209  MemCpy(str, bptr, size);
1210  str += size;
/* NOTE(review): the literal below is compared over 3 characters but
   appears here as a single blank; whitespace may have been collapsed
   in this view -- check the repository copy. */
1211  if (is_pat && size > 4 &&
1212  q[0] >= 'A' && q[0] <= 'Z' && q[1] >= 'A' && q[1] <= 'Z' &&
1213  StringEquN(q + 2, " ", 3))
1214  *str++ = '~';
1215  else if (size < 50 || within)
1216  *str++ = '~';
1217  else
1218  *str++ = ' ';
1219 
1220  if (within) {
1221  *p = '\0';
1222  r = StringStr(bptr, "-END##");
1223  *p = '\n';
1224  if (r)
1225  within = false;
1226  }
1227  }
1228 
/* Remove the extra blanks that follow each "; " in the output. */
1229  for (p = com;;) {
1230  p = StringStr(p, "; ");
1231  if (! p)
1232  break;
1233  for (p += 2, eptr = p; *eptr == ' ';)
1234  eptr++;
1235  if (eptr > p)
1236  fta_StringCpy(p, eptr);
1237  }
/* Remove leading blanks. */
1238  for (p = com; *p == ' ';)
1239  p++;
1240  if (p > com)
1241  fta_StringCpy(com, p);
1242  for (p = com; *p != '\0';)
1243  p++;
1244  if (p > com) {
/* Walk back over trailing blanks, tabs, ';', ',', '.' and '~'; if
   the whole string consists of them it becomes empty. */
1245  for (p--;; p--) {
1246  if (*p == ' ' || *p == '\t' || *p == ';' || *p == ',' ||
1247  *p == '.' || *p == '~') {
1248  if (p > com)
1249  continue;
1250  *p = '\0';
1251  }
1252  break;
1253  }
/* Cut the tail after the last kept character, but keep a leading
   "..." or a single '.' if the tail contained a period. */
1254  if (*p != '\0') {
1255  p++;
1256  if (StringEquN(p, "...", 3))
1257  p[3] = '\0';
1258  else if (StringChr(p, '.')) {
1259  *p = '.';
1260  p[1] = '\0';
1261  } else
1262  *p = '\0';
1263  }
1264  }
1265  if (*com != '\0')
1266  return (com);
1267  MemFree(com);
1268  return nullptr;
1269 }
1270 
1271 /**********************************************************/
// Body of a helper working on the token list "secs" (its signature line is
// not part of this listing). When the first token is non-empty, the second
// token is exactly "-" and fta_if_wgs_acc() returns 0 for the first token,
// a copy of the first token whose last character is replaced by '1' is
// inserted right after the first token. In every other case the list is
// left untouched.
1273 {
1274  auto it1 = secs.begin();
1275  if (it1 == secs.end() || it1->empty())
1276  return;
1277  auto it2 = next(it1);
1278  if (it2 == secs.end() || *it2 != "-" || fta_if_wgs_acc(*it1) != 0)
1279  return;
1280 
     // duplicate the first token and turn its last character into '1'
1281  auto tbp = secs.insert_after(it1, *it1);
1282  tbp->back() = '1';
1283 }
1284 
1285 
1286 /**********************************************************/
1287 /*
1288 static void fta_fix_secondaries(list<string>& secondaries)
1289 {
1290  if (secondaries.size() < 2) {
1291  return;
1292  }
1293 
1294  auto it = secondaries.begin();
1295  const auto& first = *it;
1296  const auto& second = *next(it);
1297 
1298  if (first.empty()||
1299  second.empty() ||
1300  fta_if_wgs_acc(second) != 0 ||
1301  second != "-") {
1302  return;
1303  }
1304 
1305  string newSecondary = *it;
1306  newSecondary.back() = '1';
1307  ++it;
1308  secondaries.insert(it, newSecondary);
1309 }
1310 */
1311 
1312 /**********************************************************
1313  *
1314  * void GetExtraAccession(ibp, allow_uwsec, source, accessions):
1315  *
1316  * Skip first accession, put remaining accessions
1317  * to link list 'accessions'.
1318  * Each accession separated by ";" or blanks.
1319  *
1320  **********************************************************/
// Walks ibp->secaccs and appends the secondary accessions that are accepted
// to "accessions". On the way it posts WARNING/REJECT messages for
// combinations of primary and secondary accession classes that are not
// allowed and sets ibp->drop for the rejected ones; "allow_uwsec" downgrades
// several of those rejections to warnings. Accession classes are the values
// returned by fta_if_wgs_acc(); the in-line comments below name the ones the
// code relies on (1/5/11 = WGS/TSA/TLS contig, 2 = WGS scaffold, 3/6 =
// master like AAAA00000000, ...).
1321 void GetExtraAccession(IndexblkPtr ibp, bool allow_uwsec, Parser::ESource source, TAccessionList& accessions)
1322 {
1323  Int4 pri_acc;
1324  Int4 sec_acc;
1325  const char* text;
1326  char* acc;
1327  size_t i = 0;
1328 
1329  bool unusual_wgs;
1330  bool unusual_wgs_msg;
1331  bool is_cp;
1332 
1333  CSeq_id::E_Choice pri_owner;
1334  CSeq_id::E_Choice sec_owner;
1335 
1336  if (ibp->secaccs.empty()) {
1337  return;
1338  }
1339 
     // Work on a private copy of the primary accession: it is truncated
     // below and released with MemFree() at the end of the function.
1340  acc = StringSave(ibp->acnum);
1341  is_cp = (acc[0] == 'C' && acc[1] == 'P');
1342  pri_acc = fta_if_wgs_acc(acc);
1343  pri_owner = GetNucAccOwner(acc);
1344  if (pri_acc == 1 || pri_acc == 4) {
     // keep only the leading run of 'A'-'Z' / '_' and remember its length
1345  char* p;
1346  for (p = acc; (*p >= 'A' && *p <= 'Z') || *p == '_';)
1347  p++;
1348  *p = '\0';
1349  i = StringLen(acc);
1350  }
1351 
     // NOTE(review): the statement inside this EMBL-only branch is not
     // visible in this listing.
1352  if (source == Parser::ESource::EMBL) {
1354  }
1355 
     // unusual_wgs: an UnusualWGS_Secondary message was already posted,
     // so it is not repeated for further secondaries.
1356  unusual_wgs = false;
1357  for (auto tbp = ibp->secaccs.begin(); tbp != ibp->secaccs.end(); ++tbp) {
     // "-" marks a range: glue '-' and the following token onto the
     // accession collected last (if there is one).
1358  if (*tbp == "-"s) {
1359  ++tbp;
1360  if (tbp == ibp->secaccs.end())
1361  break;
1362  if (! accessions.empty()) {
1363  accessions.back() += '-';
1364  accessions.back() += *tbp;
1365  }
1366  continue;
1367  }
1368 
     // DelNonDigitTail() edits the stored token in place (presumably it
     // strips trailing non-digit characters - confirm in its definition).
1369  DelNonDigitTail(*tbp);
1370  const string& a = *tbp;
1371  sec_acc = fta_if_wgs_acc(a);
1372 
1373  unusual_wgs_msg = true;
1374  if (sec_acc == 0 || sec_acc == 3 ||
1375  sec_acc == 4 || sec_acc == 6 ||
1376  sec_acc == 10 || sec_acc == 12) /* 0 = AAAA01000000,
1377  3 = AAAA00000000,
1378  4 = GAAA01000000,
1379  6 = GAAA00000000,
1380  10 = KAAA01000000,
1381  12 = KAAA00000000 */
1382  {
     // the first such secondary is remembered in ibp->wgssec; for
     // contig records a secondary sharing at least 4 trailing
     // characters with it suppresses the "unusual" message
1383  if (ibp->is_contig &&
1384  (ibp->wgssec.empty() || NStr::CommonSuffixSize(ibp->wgssec, a) >= 4))
1385  unusual_wgs_msg = false;
1386  if (ibp->wgssec.empty())
1387  ibp->wgssec = a;
1388  }
1389 
1390  sec_owner = GetNucAccOwner(a);
1391 
     // secondary is not a WGS/TSA/TLS accession (or is a scaffold):
     // always collected, but complained about for contig primaries
1392  if (sec_acc < 0 || sec_acc == 2) {
1393  if (pri_acc == 1 || pri_acc == 5 || pri_acc == 11) {
1394  if (! allow_uwsec) {
1395  ErrPostEx(SEV_REJECT, ERR_ACCESSION_WGSWithNonWGS_Sec, "This WGS/TSA/TLS record has non-WGS/TSA/TLS secondary accession \"%s\". WGS/TSA/TLS records are not currently allowed to replace finished sequence records, scaffolds, etc. without human review and confirmation.", a.c_str());
1396  ibp->drop = true;
1397  } else {
1398  ErrPostEx(SEV_WARNING, ERR_ACCESSION_WGSWithNonWGS_Sec, "This WGS/TSA/TLS record has non-WGS/TSA/TLS secondary accession \"%s\". This is being allowed via the use of a special parser flag.", a.c_str());
1399  }
1400  }
1401 
1402  accessions.push_back(a);
1403  continue;
1404  }
1405 
     // NOTE(review): several lines of this branch are missing from the
     // listing; what is visible never collects a master accession.
1406  if (sec_acc == 3 || sec_acc == 6) /* like AAAA00000000 */
1407  {
1408  if (pri_owner == CSeq_id::e_Embl && sec_owner == CSeq_id::e_Embl &&
1409  (pri_acc == 1 || pri_acc == 5 || pri_acc == 11) &&
1411  continue;
1414  ErrPostEx(SEV_REJECT, ERR_ACCESSION_WGSMasterAsSecondary, "WGS/TSA/TLS master accession \"%s\" is not allowed to be used as a secondary accession number.", a.c_str());
1415  ibp->drop = true;
1416  }
1417  continue;
1418  }
1419 
1420  if (pri_acc == 1 || pri_acc == 5 || pri_acc == 11) /* WGS/TSA/TLS
1421  contig */
1422  {
     // "i" is overwritten here: project prefix length is 7 for
     // "NZ_"-prefixed secondaries and 4 otherwise. The same value is
     // used again by the prefix comparison near the end of the loop.
1423  i = (StringEquN(a.c_str(), "NZ_", 3)) ? 7 : 4;
1424  if (! StringEquN(a.c_str(), ibp->acnum, i)) {
1425  if (! allow_uwsec) {
1426  ErrPostEx(SEV_REJECT, ERR_ACCESSION_UnusualWGS_Secondary, "This record has one or more WGS/TSA/TLS secondary accession numbers which imply that a WGS/TSA/TLS project is being replaced (either by another project or by finished sequence). This is not allowed without human review and confirmation.");
1427  ibp->drop = true;
1428  } else if (! is_cp || source != Parser::ESource::NCBI) {
1429  ErrPostEx(SEV_WARNING, ERR_ACCESSION_UnusualWGS_Secondary, "This record has one or more WGS/TSA/TLS secondary accession numbers which imply that a WGS/TSA project is being replaced (either by another project or by finished sequence). This is being allowed via the use of a special parser flag.");
1430  }
1431  }
1432  } else if (pri_acc == 2) /* WGS scaffold */
1433  {
1434  if (sec_acc == 1 || sec_acc == 5 || sec_acc == 11) /* WGS/TSA/TLS
1435  contig */
1436  {
1437  ErrPostEx(SEV_REJECT, ERR_ACCESSION_ScfldHasWGSContigSec, "This record, which appears to be a scaffold, has one or more WGS/TSA/TLS contig accessions as secondary. Currently, it does not make sense for a contig to replace a scaffold.");
1438  ibp->drop = true;
1439  }
1440  } else if (unusual_wgs_msg) {
     // the two branches below differ only in severity, in the message
     // tail and in whether the record is dropped
1441  if (! allow_uwsec) {
1442  if (! unusual_wgs) {
1443  if (sec_acc == 1 || sec_acc == 5 || sec_acc == 11)
1444  text = "WGS/TSA/TLS contig secondaries are present, implying that a scaffold is replacing a contig";
1445  else
1446  text = "This record has one or more WGS/TSA/TLS secondary accession numbers which imply that a WGS/TSA/TLS project is being replaced (either by another project or by finished sequence)";
1447  ErrPostEx(SEV_REJECT, ERR_ACCESSION_UnusualWGS_Secondary, "%s. This is not allowed without human review and confirmation.", text);
1448  }
1449  unusual_wgs = true;
1450  ibp->drop = true;
1451  } else if (! is_cp || source != Parser::ESource::NCBI) {
1452  if (! unusual_wgs) {
1453  if (sec_acc == 1 || sec_acc == 5 || sec_acc == 11)
1454  text = "WGS/TSA/TLS contig secondaries are present, implying that a scaffold is replacing a contig";
1455  else
1456  text = "This record has one or more WGS/TSA/TLS secondary accession numbers which imply that a WGS/TSA/TLS project is being replaced (either by another project or by finished sequence)";
1457  ErrPostEx(SEV_WARNING, ERR_ACCESSION_UnusualWGS_Secondary, "%s. This is being allowed via the use of a special parser flag.", text);
1458  }
1459  unusual_wgs = true;
1460  }
1461  }
1462 
     // finally decide whether the secondary is collected
1463  if (pri_acc == 1 || pri_acc == 5 || pri_acc == 11) {
1464  if (StringEquN(acc, a.c_str(), i) && a[i] >= '0' && a[i] <= '9') {
     // NOTE(review): the third operand tests pri_acc, while every
     // other test of this shape in the function uses
     // "sec_acc == 11" - confirm that this is intended.
1465  if (sec_acc == 1 || sec_acc == 5 || pri_acc == 11)
1466  accessions.push_back(a);
1467  } else if (allow_uwsec) {
1468  accessions.push_back(a);
1469  }
1470  } else if (pri_acc == 2) {
1471  if (sec_acc == 0 || sec_acc == 4) /* like AAAA10000000 */
1472  accessions.push_back(a);
1473  } else if (allow_uwsec || (! unusual_wgs_msg && (source == Parser::ESource::DDBJ || source == Parser::ESource::EMBL))) {
1474  accessions.push_back(a);
1475  }
1476  }
1477 
1478  MemFree(acc);
1479 }
1480 
1481 /**********************************************************/
1482 static void fta_fix_tpa_keywords(TKeywordList& keywords)
1483 {
1484  const char* p;
1485 
1486  for (string& key : keywords) {
1487  if (key.empty())
1488  continue;
1489 
1490  if (NStr::CompareNocase(key.c_str(), "TPA") == 0)
1491  key = "TPA";
1492  else if (StringEquNI(key.c_str(), "TPA:", 4)) {
1493  string buf("TPA:");
1494 
1495  for (p = key.c_str() + 4; *p == ' ' || *p == '\t';)
1496  p++;
1497 
1498  buf += p;
1499  if (fta_is_tpa_keyword(buf.c_str())) {
1500  for (string::iterator p = buf.begin() + 4; p != buf.end(); ++p) {
1501  if (*p >= 'A' && *p <= 'Z')
1502  *p |= 040;
1503  }
1504  }
1505 
1506  swap(key, buf);
1507  }
1508  }
1509 }
1510 
1511 // ----------------------------------------------------------------------------
// Rewrites a keyword "WGS Third Party Data" (matched without regard to case)
// inside keywordData as the two keywords "WGS; Third Party Data". The rewrite
// only happens when the match is directly followed by ';' or '.', and, if it
// is not at the very start of the string, when nothing but blanks separates
// it from the preceding ';'. In every other case keywordData is left alone.
// (The first line of the signature is not part of this listing.)
1513  string& keywordData)
1514 // ----------------------------------------------------------------------------
1515 {
1516  const string problematic("WGS Third Party Data");
1517  const string desired("WGS; Third Party Data");
1518 
1519  if (keywordData.empty()) {
1520  return;
1521  }
1522  auto wgsStart = NStr::FindNoCase(keywordData, problematic);
1523  if (wgsStart == string::npos) {
1524  return;
1525  }
     // the keyword has to end right here, i.e. be followed by ';' or '.'
1526  auto afterProblematic = keywordData[wgsStart + problematic.size()];
1527  if (afterProblematic != ';' && afterProblematic != '.') {
1528  return;
1529  }
1530 
1531  string fixedKeywords;
1532  if (wgsStart > 0) {
     // the match must be a whole keyword: only blanks are allowed
     // between the preceding ';' and the match
1533  auto semiBefore = keywordData.rfind(';', wgsStart - 1);
1534  if (semiBefore == string::npos) {
1535  return;
1536  }
1537  for (auto i = semiBefore + 1; i < wgsStart; ++i) {
1538  if (keywordData[i] != ' ') {
1539  return;
1540  }
1541  }
     // NOTE(review): this copies everything up to, but excluding, the
     // character right before the match. With at least one blank in
     // front of the match that drops a blank; when the match follows
     // the ';' directly (semiBefore == wgsStart - 1) the ';' itself is
     // dropped and the previous keyword gets fused with "WGS".
     // Confirm whether substr(0, wgsStart) was intended.
1542  fixedKeywords = keywordData.substr(0, wgsStart - 1);
1543  }
1544  fixedKeywords += desired;
1545  fixedKeywords += keywordData.substr(wgsStart + problematic.size());
1546  keywordData = fixedKeywords;
1547 }
1548 
1549 
1550 // ----------------------------------------------------------------------------
// Fills "keywords" with the keywords found in the node of the given "type"
// inside "entry". "keywords" is cleared first and stays empty when the node
// has no data. The node text is flattened with GetBlkDataReplaceNewLine(),
// passed through StripECO() for ParFlatSP_KW nodes and through
// xFixEMBLKeywords(), split at ';', trimmed, and stripped of the final '.';
// empty items are removed. fta_fix_tpa_keywords() is applied to the result.
// (The first line of the signature is not part of this listing.)
1552  const DataBlk& entry,
1553  int type,
1554  Uint2 col_data,
1555  TKeywordList& keywords)
1556 // ----------------------------------------------------------------------------
1557 {
1558  // Expectation: Each keyword separated by ";", the last one ends with "."
1559 
1560  keywords.clear();
1561  auto keywordData = xGetNodeData(entry, type);
1562  if (keywordData.empty()) {
1563  return;
1564  }
1565  keywordData = GetBlkDataReplaceNewLine(keywordData, col_data);
1566  if (type == ParFlatSP_KW) {
1567  StripECO(keywordData);
1568  }
1569  xFixEMBLKeywords(keywordData);
1570 
1571  NStr::Split(keywordData, ";", keywords);
1572  auto it = keywords.begin();
     // NOTE(review): decrementing end() is only valid if the split
     // produced at least one item - confirm that NStr::Split cannot leave
     // "keywords" empty here (e.g. for data consisting of ";" only).
1573  auto last = --keywords.end();
1574  while (it != keywords.end()) {
1575  auto& keyword = *it;
1576  NStr::TruncateSpacesInPlace(keyword);
     // only the last item may carry the terminating '.'
1577  if (it == last) {
1578  NStr::TrimSuffixInPlace(keyword, ".");
1579  NStr::TruncateSpacesInPlace(keyword);
1580  }
1581  if (keyword.empty()) {
1582  keywords.erase(it++);
1583  } else {
1584  it++;
1585  }
1586  }
1587 
1588  fta_fix_tpa_keywords(keywords);
1589 }
1590 
1591 
1592 /**********************************************************
1593  *
1594  * Int4 ScanSequence(warn, seqptr, bsp, conv,
1595  * replacechar, numns):
1596  *
1597  * Scans a block of text converting characters to
1598  * sequence and storing in the ByteStorePtr bsp.
1599  * conv is a 255 Uint1 array where cells are indexed
1600  * by the ASCII value of the character in ptr:
1601  * - a value of 0 indicates skip;
1602  * - a value of 1 indicates an character is
1603  * unexpected (error);
1604  * - otherwise, it is a IUPACaa (protein) or a IUPACna
1605  * (nucleic acid) letter.
1606  * Function returns count of valid characters
1607  * converted to sequence.
1608  *
1609  * When sequence is presented in columns, this
1610  * function should be called once per line, so that
1611  * numbers can be recognized as errors.
1612  *
1613  * 3-30-93
1614  *
1615  * In order to skip the input flatfile put residue
1616  * label count at end, add blank variable to assume each
1617  * line only allow 6 blanks between residue.
1618  *
1619  * 7-28-93
1620  *
1621  **********************************************************/
1622 Int4 ScanSequence(bool warn, char** seqptr, std::vector<char>& bsp, unsigned char* conv, Char replacechar, int* numns)
1623 {
1624  Int2 blank;
1625  Int2 count;
1626  Uint1 residue;
1627  char* ptr;
1628  static Uint1 buf[133];
1629  unsigned char* bu;
1630 
1631  blank = count = 0;
1632  ptr = *seqptr;
1633 
1634  bu = buf;
1635  while (*ptr != '\n' && *ptr != '\0' && blank < 6 && count < 100) {
1636  if (numns && (*ptr == 'n' || *ptr == 'N'))
1637  (*numns)++;
1638 
1639  residue = conv[(int)*ptr];
1640 
1641  if (*ptr == ' ')
1642  blank++;
1643 
1644  if (residue > 2) {
1645  *bu++ = residue;
1646  count++;
1647  } else if (residue == 1 && (warn || isalpha(*ptr) != 0)) {
1648  /* it can be punctuation or alpha character */
1649  *bu++ = replacechar;
1650  count++;
1651  ErrPostEx(SEV_ERROR, ERR_SEQUENCE_BadResidue, "Invalid residue [%c]", *ptr);
1652  return (0);
1653  }
1654  ptr++;
1655  }
1656 
1657  *seqptr = ptr;
1658  std::copy(buf, bu, std::back_inserter(bsp));
1659  // BSWrite(bsp, buf, (Int4)(bu - buf));
1660  return (count);
1661 }
1662 
1663 /**********************************************************
1664  *
1665  * bool GetSeqData(pp, entry, bsp, nodetype, seqconv,
1666  * seq_data_type):
1667  *
1668  * Replace any bad residue to "N" if DNA sequence,
1669  * "X" if protein sequence.
1670  * PIR format allow punctuation in the sequence data,
1671  * so no warning message if found punctuation in the
1672  * sequence data.
1673  * Tatiana (mv from ScanSequence)
1674  *
1675  * 04-19-94
1676  *
1677  **********************************************************/
// Reads the sequence text of the current entry (pp->entrylist[pp->curindx]),
// converts it line by line with ScanSequence() using the table "seqconv" and
// stores the result in "bioseq" as Seq-data of type "seq_data_type".
// The Seq-inst length is always set to ibp->bases. For contig and MGA
// records nothing else is done and true is returned.
// Returns false when no sequence text is found or when ScanSequence()
// reports a bad residue (returns 0); true otherwise. A mismatch between the
// measured and the declared length only produces a warning.
1678 bool GetSeqData(ParserPtr pp, const DataBlk& entry, CBioseq& bioseq, Int4 nodetype, unsigned char* seqconv, Uint1 seq_data_type)
1679 {
1680  // ByteStorePtr bp;
1681  IndexblkPtr ibp;
1682  char* seqptr;
1683  char* endptr;
1684  char* str;
1685  Char replacechar;
1686  size_t len = 0;
1687  Int4 numns;
1688 
1689  ibp = pp->entrylist[pp->curindx];
1690 
1691  bioseq.SetInst().SetLength(static_cast<TSeqPos>(ibp->bases));
1692 
1693  if (ibp->is_contig || ibp->is_mga)
1694  return true;
1695 
     // XML input: "str" owns the sequence text and is released with
     // MemFree() before returning (the line that assigns it is not visible
     // in this listing); letters are lower-cased unless this is a USPTO
     // protein. Other formats: the text is located inside "entry" and "str"
     // stays null.
1696  if (pp->format == Parser::EFormat::XML) {
1698  seqptr = str;
1699  if (seqptr) {
1700  len = StringLen(seqptr);
1701  if (pp->source != Parser::ESource::USPTO || ! ibp->is_prot)
1702  for (char* p = seqptr; *p != '\0'; p++)
1703  if (*p >= 'A' && *p <= 'Z')
1704  *p |= 040; // tolower
1705  }
1706  } else {
1707  str = nullptr;
1708  seqptr = xSrchNodeType(entry, nodetype, &len);
1709  }
1710 
1711  if (! seqptr)
1712  return false;
1713 
1714  endptr = seqptr + len;
1715 
     // NOTE(review): the condition selecting between 'N' and 'X' is on
     // lines missing from this listing.
1718  replacechar = 'N';
1719  else
1720  replacechar = 'X';
1721 
1722  /* the sequence data will be located in next line of nodetype */
1723  if (pp->format == Parser::EFormat::XML) {
1724  while (*seqptr == ' ' || *seqptr == '\n' || *seqptr == '\t')
1725  seqptr++;
1726  } else {
     // NOTE(review): neither loop checks seqptr against endptr; they
     // rely on a '\n' and a letter being present in the block.
1727  while (*seqptr != '\n')
1728  seqptr++;
1729  while (isalpha(*seqptr) == 0) /* skip leading blanks and digits */
1730  seqptr++;
1731  }
1732 
1733  std::vector<char> buf;
1734  size_t seqlen = 0;
1735  for (numns = 0; seqptr < endptr;) {
1736  len = ScanSequence(true, &seqptr, buf, seqconv, replacechar, &numns);
     // 0 letters: bad residue (already reported) or a line without
     // any letter - the whole sequence is given up
1737  if (len == 0) {
1738  if (str)
1739  MemFree(str);
1740  return false;
1741  }
1742 
1743  seqlen += len;
     // NOTE(review): *seqptr is read before the bound is tested, so the
     // byte at endptr is inspected once.
1744  while (isalpha(*seqptr) == 0 && seqptr < endptr)
1745  seqptr++;
1746  }
1747 
1748  if (seqlen != bioseq.GetLength()) {
1749  ErrPostEx(SEV_WARNING, ERR_SEQUENCE_SeqLenNotEq, "Measured seqlen [%ld] != given [%ld]", (long int)seqlen, (long int)bioseq.GetLength());
1750  }
1751 
1752  if (str)
1753  MemFree(str);
1754 
     // NOTE(review): the checks below talk about basepairs and all-N
     // nucleotide sequences, yet they are gated on the amino-acid coding
     // (e_Iupacaa). Confirm whether e_Iupacna was intended. The condition
     // choosing between the WARNING/INFO and the REJECT variants is on a
     // line missing from this listing.
1755  if (seq_data_type == CSeq_data::e_Iupacaa) {
1756  if (bioseq.GetLength() < 10) {
1758  if (ibp->is_pat == false)
1759  ErrPostEx(SEV_WARNING, ERR_SEQUENCE_TooShort, "This sequence for this record falls below the minimum length requirement of 10 basepairs.");
1760  else
1761  ErrPostEx(SEV_INFO, ERR_SEQUENCE_TooShortIsPatent, "This sequence for this patent record falls below the minimum length requirement of 10 basepairs.");
1762  } else {
1763  if (ibp->is_pat == false)
1764  ErrPostEx(SEV_REJECT, ERR_SEQUENCE_TooShort, "This sequence for this record falls below the minimum length requirement of 10 basepairs.");
1765  else
1766  ErrPostEx(SEV_REJECT, ERR_SEQUENCE_TooShortIsPatent, "This sequence for this patent record falls below the minimum length requirement of 10 basepairs.");
1767  ibp->drop = true;
1768  }
1769  }
     // every scanned letter was an 'n'/'N'
1770  if (seqlen == static_cast<Uint4>(numns)) {
1771  ErrPostEx(SEV_REJECT, ERR_SEQUENCE_AllNs, "This nucleotide sequence for this record contains nothing but unknown (N) basepairs.");
1772  ibp->drop = true;
1773  }
1774  }
1775 
1776  bioseq.SetInst().SetSeq_data().Assign(CSeq_data(buf, static_cast<CSeq_data::E_Choice>(seq_data_type)));
1777 
1778  return true;
1779 }
1780 
1781 /**********************************************************
1782  *
1783  * unsigned char* GetDNAConv():
1784  *
1785  * DNA conversion table array.
1786  *
1787  * 3-29-93
1788  *
1789  **********************************************************/
// Builds the character conversion table used for nucleotide sequences.
// The table has 255 entries (indices 0..254). Every entry starts as 1
// ("unexpected character"), the blank (32) is set to 0 ("skip"), and for
// each code produced by the loop both the code letter and its lower-case
// form map to the code letter. The definitions of "range" and "code" are on
// lines missing from this listing.
1790 unique_ptr<unsigned char[]> GetDNAConv(void)
1791 {
1792 
1793  unique_ptr<unsigned char[]> dnaconv(new unsigned char[255]());
1794  MemSet((char*)dnaconv.get(), (Uint1)1, (size_t)255);
1795 
1796  dnaconv[32] = 0; /* blank */
1797 
1799  for (CSeqportUtil::TIndex i = range.first; i <= range.second; ++i) {
1801 
1802  dnaconv[static_cast<int>(code[0])] = code[0];
1803  dnaconv[(int)tolower(code[0])] = code[0];
1804  }
1805 
1806  return dnaconv;
1807 }
1808 
1809 /**********************************************************
1810  *
1811  * unsigned char* GetProteinConv():
1812  *
1813  * Protein conversion table array.
1814  *
1815  * 3-29-93
1816  *
1817  **********************************************************/
// Builds the character conversion table used for protein sequences.
// The table has 255 entries (indices 0..254). Every entry starts as 1
// ("unexpected character"), the blank (32) is set to 0 ("skip"), and each
// code produced by the loop maps to itself; lower-case letters are not
// added. The definitions of "range" and "code" are on lines missing from
// this listing.
1818 unique_ptr<unsigned char[]> GetProteinConv(void)
1819 {
1820  // unsigned char* protconv;
1821  unique_ptr<unsigned char[]> protconv(new unsigned char[255]());
1822 
1823  // protconv = (unsigned char*)MemNew((size_t)255); /* proteins */
1824  MemSet((char*)protconv.get(), (Uint1)1, (size_t)255); /* everything
1825  an error */
1826  protconv[32] = 0; /* blank */
1827 
1829  for (CSeqportUtil::TIndex i = range.first; i <= range.second; ++i) {
1831  protconv[(int)code[0]] = code[0]; /* swiss-prot, pir uses upper case
1832  protein code */
1833  }
1834 
1835  return (protconv);
1836 }
1837 
1838 /***********************************************************/
1839 static CSeq_descr::Tdata::const_iterator GetDescrByChoice(const CSeq_descr& descr, Uint1 choice)
1840 {
1841  const CSeq_descr::Tdata& descr_list = descr.Get();
1842 
1843  CSeq_descr::Tdata::const_iterator cur_descr = descr_list.begin();
1844  for (; cur_descr != descr_list.end(); ++cur_descr) {
1845  if ((*cur_descr)->Which() == choice)
1846  break;
1847  }
1848 
1849  return cur_descr;
1850 }
1851 
1852 // LCOV_EXCL_START
1853 // Excluded per Mark's request on 12/14/2016
1854 /**********************************************************
1855  *
1856  * static void GetFirstSegDescrChoice(bio_set, choice,
1857  * descr_new):
1858  *
1859  * 10-14-93
1860  *
1861  **********************************************************/
1862 static void GetFirstSegDescrChoice(CBioseq& bioseq, Uint1 choice, CSeq_descr& descr_new)
1863 {
1864  CSeq_descr& descr = bioseq.SetDescr();
1865  CSeq_descr::Tdata& descr_list = descr.Set();
1866 
1867  // Don't use GetDescrByChoice here just because GCC version does not support erase(const_iterator)
1868  CSeq_descr::Tdata::iterator cur_descr = descr_list.begin();
1869  for (; cur_descr != descr_list.end(); ++cur_descr) {
1870  if ((*cur_descr)->Which() == choice) {
1871  /* found the "choice" node, isolated node */
1872  descr_new.Set().push_back(*cur_descr);
1873  descr_list.erase(cur_descr);
1874  break;
1875  }
1876  }
1877 }
1878 // LCOV_EXCL_STOP
1879 
1880 // SameCitation and 'PubEquivMatch' have a bit different logic,
1881 // so below is an additional function that makes a check
1882 // for equality according to 'PubEquivMatch' rules
// Body of the pub-equiv comparison helper (its signature line is not part of
// this listing). Returns true as soon as some pub of "a" and some pub of "b"
// are the same citation according to CPub::SameCitation(), with one
// exception taken over from 'PubEquivMatch': two Cit-gen pubs that carry the
// same serial number and nothing else (no volume, issue, pages, title, cit,
// authors, muid, journal or date) are NOT regarded as a match. Returns false
// if no pair matches.
1884 {
1885  for (const CRef<CPub>& it1 : a.Get()) {
1886  for (const CRef<CPub>& it2 : b.Get()) {
1887  if (it1->SameCitation(*it2)) {
1888  bool same = true;
1889 
1890  if (it1->Which() == CPub::e_Gen && it2->Which() == CPub::e_Gen) {
1891  const CCit_gen& cit_a = it1->GetGen();
1892  const CCit_gen& cit_b = it2->GetGen();
1893 
1894  if (cit_a.IsSetSerial_number() && cit_b.IsSetSerial_number() && cit_a.GetSerial_number() == cit_b.GetSerial_number()) {
1895  // The special condition of 'PubEquivMatch'
1896  // a->volume == NULL && b->volume == NULL &&
1897  // a->issue == NULL && b->issue == NULL &&
1898  // a->pages == NULL && b->pages == NULL &&
1899  // a->title == NULL && b->title == NULL &&
1900  // a->cit == NULL && b->cit == NULL &&
1901  // a->authors == NULL && b->authors == NULL &&
1902  // a->muid == -1 && b->muid == -1 &&
1903  // a->journal == NULL && b->journal == NULL &&
1904  // a->date == NULL && b->date == NULL &&
1905  // a->serial_number != -1 && b->serial_number != -1
1906 
     // both pubs consist of the serial number only: deliberately
     // treated as different, the search goes on with the next pair
1907  if (! cit_a.IsSetVolume() && ! cit_b.IsSetVolume() &&
1908  ! cit_a.IsSetIssue() && ! cit_b.IsSetIssue() &&
1909  ! cit_a.IsSetPages() && ! cit_b.IsSetPages() &&
1910  ! cit_a.IsSetTitle() && ! cit_b.IsSetTitle() &&
1911  ! cit_a.IsSetCit() && ! cit_b.IsSetCit() &&
1912  ! cit_a.IsSetAuthors() && ! cit_b.IsSetAuthors() &&
1913  ! cit_a.IsSetMuid() && ! cit_b.IsSetMuid() &&
1914  ! cit_a.IsSetJournal() && ! cit_b.IsSetJournal() &&
1915  ! cit_a.IsSetDate() && ! cit_b.IsSetDate())
1916  same = false; // SIC!!!
1917  }
1918  }
1919 
1920  if (same)
1921  return true;
1922  }
1923  }
1924  }
1925 
1926  return false;
1927 }
1928 
1929 // LCOV_EXCL_START
1930 // Excluded per Mark's request on 12/14/2016
1931 /**********************************************************
1932  *
1933  * static bool CheckSegPub(pub, entries, same_pub_descr):
1934  *
1935  * 5-21-93
1936  *
1937  **********************************************************/
1938 static bool CheckSegPub(const CPubdesc& pub, TEntryList& entries, std::set<CSeqdesc*>& same_pub_descr)
1939 {
1940  if (! pub.IsSetPub() || ! pub.GetPub().IsSet() || pub.GetPub().Get().empty())
1941  return true;
1942 
1943  CRef<CPub> pub_ref = pub.GetPub().Get().front();
1944 
1945  if (! pub_ref->IsGen() || ! pub_ref->GetGen().IsSetSerial_number())
1946  return true;
1947 
1948  int num0 = pub_ref->GetGen().GetSerial_number();
1949 
1950  TEntryList::iterator next_seq = entries.begin();
1951  for (++next_seq; next_seq != entries.end(); ++next_seq) {
1952  if (! (*next_seq)->IsSetDescr())
1953  continue;
1954 
1955  CSeq_descr& descr = (*next_seq)->SetDescr();
1956 
1957  bool not_found = true;
1958  for (auto& cur_descr : descr.Set()) {
1959  if (! cur_descr->IsPub() || ! cur_descr->GetPub().IsSetPub() || ! cur_descr->GetPub().GetPub().IsSet() ||
1960  cur_descr->GetPub().GetPub().Get().empty())
1961  continue;
1962 
1963  const CPubdesc& cur_pub = cur_descr->GetPub();
1964  const CPub& cur_pub_ref = *cur_pub.GetPub().Get().front();
1965 
1966  if (! cur_pub_ref.IsGen() || ! cur_pub_ref.GetGen().IsSetSerial_number())
1967  continue;
1968 
1969  int num = cur_pub_ref.GetGen().GetSerial_number();
1970 
1971  if (! SameCitation_PubEquivMatch_Logic(cur_pub.GetPub(), pub.GetPub()))
1972  continue;
1973 
1974  if (num == num0) {
1975  same_pub_descr.insert(cur_descr); // store pointer to the same descr for future use
1976  not_found = false;
1977  break;
1978  }
1979 
1980  ErrPostStr(SEV_WARNING, ERR_SEGMENT_PubMatch, "Matching references with different serial numbers");
1981  }
1982 
1983  if (not_found)
1984  break;
1985  }
1986 
1987  return (next_seq == entries.end());
1988 }
1989 // LCOV_EXCL_STOP
1990 
1991 /***********************************************************/
1992 static void RemoveDescrByChoice(CSeq_descr& descr, Uint1 choice)
1993 {
1994  CSeq_descr::Tdata& descr_list = descr.Set();
1995 
1996  for (CSeq_descr::Tdata::iterator cur_descr = descr_list.begin(); cur_descr != descr_list.end();) {
1997  if ((*cur_descr)->Which() == choice)
1998  cur_descr = descr_list.erase(cur_descr);
1999  else
2000  ++cur_descr;
2001  }
2002 }
2003 
2004 /**********************************************************
2005  *
2006  * static void CleanUpSeqDescrChoice(entries, choice):
2007  *
2008  * 5-21-93
2009  *
2010  **********************************************************/
// Body of CleanUpSeqDescrChoice (its signature line is not part of this
// listing): removes the descriptors of type "choice" from every entry of
// "entries" except the first one.
// NOTE(review): the iterator is advanced past begin() without a check, so
// "entries" is expected to be non-empty.
2012 {
2013  TEntryList::iterator next_seq = entries.begin();
2014  ++next_seq;
2015 
2016  for (; next_seq != entries.end(); ++next_seq)
2017  RemoveDescrByChoice((*next_seq)->SetDescr(), choice);
2018 }
2019 
2020 /**********************************************************
2021  *
2022  * static void CleanUpSeqDescrPub(entries, to_clean):
2023  *
2024  * 1-13-16
2025  *
2026  **********************************************************/
2027 static void CleanUpSeqDescrPub(TEntryList& entries, std::set<CSeqdesc*>& to_clean)
2028 {
2029  TEntryList::iterator next_seq = entries.begin();
2030  ++next_seq;
2031 
2032  for (; next_seq != entries.end(); ++next_seq) {
2033  CSeq_descr::Tdata& descr_list = (*next_seq)->SetDescr().Set();
2034  for (CSeq_descr::Tdata::iterator cur_descr = descr_list.begin(); cur_descr != descr_list.end();) {
2035  std::set<CSeqdesc*>::iterator it = to_clean.find(*cur_descr);
2036  if (it != to_clean.end()) {
2037  cur_descr = descr_list.erase(cur_descr);
2038  to_clean.erase(it);
2039  } else
2040  ++cur_descr;
2041  }
2042  }
2043 }
2044 
2045 // LCOV_EXCL_START
2046 // Excluded per Mark's request on 12/14/2016
2047 /**********************************************************
2048  *
2049  * static void GetSegPub(entries, descr):
2050  *
2051  * 5-21-93
2052  *
2053  **********************************************************/
// Body of GetSegPub (its signature line is not part of this listing).
// Goes through the descriptors of the first entry's Bioseq; every pub
// descriptor for which CheckSegPub() returns true is moved into "descr",
// and the matching descriptors CheckSegPub() collected from the other
// entries are removed with CleanUpSeqDescrPub(). All other descriptors stay
// where they are.
2055 {
2056  CBioseq& bioseq = entries.front()->SetSeq();
2057  CSeq_descr::Tdata& descr_list = bioseq.SetDescr().Set();
2058 
2059  for (CSeq_descr::Tdata::iterator cur_descr = descr_list.begin(); cur_descr != descr_list.end();) {
2060  if ((*cur_descr)->IsPub()) {
2061  CPubdesc& pubdesc = (*cur_descr)->SetPub();
2062 
     // filled by CheckSegPub() with the equivalent descriptors found
     // in the other entries
2063  std::set<CSeqdesc*> same_pub_descr;
2064  if (CheckSegPub(pubdesc, entries, same_pub_descr)) {
2065  descr.Set().push_back(*cur_descr);
2066  cur_descr = descr_list.erase(cur_descr);
2067 
2068  CleanUpSeqDescrPub(entries, same_pub_descr);
2069  } else
2070  ++cur_descr;
2071  } else
2072  ++cur_descr;
2073  }
2074 }
2075 
2076 /**********************************************************
2077  *
2078  * static bool CheckSegDescrChoice(entry, choice):
2079  *
2080  * 5-18-93
2081  *
2082  **********************************************************/
2083 static bool CheckSegDescrChoice(const TEntryList& entries, Uint1 choice)
2084 {
2085  string org;
2086  CDate date;
2087  Int4 modif = -1;
2088 
2089  bool no_problem_found = true;
2090  for (TEntryList::const_iterator seq = entries.begin(); seq != entries.end(); ++seq) {
2091  const CSeq_descr& descr = (*seq)->GetDescr();
2092  const CSeq_descr::Tdata& descr_list = descr.Get();
2093 
2094  CSeq_descr::Tdata::const_iterator cur_descr = GetDescrByChoice(descr, choice);
2095 
2096  if (cur_descr == descr_list.end()) {
2097  no_problem_found = false;
2098  break;
2099  }
2100 
2101  if (choice == CSeqdesc::e_Org) {
2102  if (org.empty())
2103  org = (*cur_descr)->GetOrg().GetTaxname();
2104  else if (org != (*cur_descr)->GetOrg().GetTaxname()) {
2105  no_problem_found = false;
2106  break;
2107  }
2108  } else if (choice == CSeqdesc::e_Modif) {
2109  Int4 val = *(*cur_descr)->GetModif().begin();
2110  if (modif == -1)
2111  modif = val;
2112  else if (modif != val) {
2113  no_problem_found = false;
2114  break;
2115  }
2116  } else /* Seq_descr_update_date */
2117  {
2118  if (date.Which() == CDate::e_not_set)
2119  date.Assign((*cur_descr)->GetUpdate_date());
2120  else if (date.Compare((*cur_descr)->GetUpdate_date()) != CDate::eCompare_same) {
2121  no_problem_found = false;
2122  break;
2123  }
2124  }
2125  }
2126 
2127  return no_problem_found;
2128 }
2129 // LCOV_EXCL_STOP
2130 
2131 /**********************************************************
2132  *
2133  * static char* GetBioseqSetDescrTitle(descr):
2134  *
2135  * Copy title from the first one, truncate before
2136  * "complete cds" or "exon".
2137  *
2138  * 5-18-93
2139  *
2140  **********************************************************/
// Returns a copy of the first title descriptor found in "descr", cut off in
// front of the first occurrence of "complete cds" or, if that text is not
// present, in front of the first occurrence of "exon". Returns an empty
// optional when "descr" holds no title descriptor.
2141 static optional<string> GetBioseqSetDescrTitle(const CSeq_descr& descr)
2142 {
2143  const string* found = nullptr;
2144  for (auto it : descr.Get()) {
2145  if (it->IsTitle()) {
2146  found = &it->GetTitle();
2147  break;
2148  }
2149  }
2150 
2151  if (! found)
2152  return {};
2153 
2154  string title = *found;
2155 
2156  auto pos = title.find("complete cds");
2157  if (pos == string::npos) {
2158  pos = title.find("exon");
2159  }
2160 
     // NOTE(review): one statement of this branch (after the resize) is
     // missing from this listing.
2161  if (pos != string::npos) {
2162  title.resize(pos);
2164  }
2165 
2166  return title;
2167 }
2168 
2169 // LCOV_EXCL_START
2170 // Excluded per Mark's request on 12/14/2016
2171 /**********************************************************
2172  *
2173  * static void SrchSegDescr(TEntryList& entries, CSeq_descr& descr):
2174  *
2175  * Copy title from first one, truncate before
2176  * "complete cds" or "exon"
2177  * org, if they are all from one organism, then move
2178  * the data to this set, and make NULL to the sep chains
2179  * in which sep->mpData->descr->choice = Seq_descr_org.
2180  * modif, if they are all same modifier, then move
2181  * the data to this set, and make NULL to the sep chains
2182  * in which sep->mpData->descr->choice = Seq_descr_modif.
2183  *
2184  **********************************************************/
// Body of SrchSegDescr. The signature line and the conditions guarding the
// three blocks below are missing from this listing, so only the visible
// statements are described: the title of the first entry's Bioseq (as
// returned by GetBioseqSetDescrTitle()) is added to "descr" as a new title
// descriptor; the first Org and Modif descriptors of that Bioseq can be
// moved to "descr" with GetFirstSegDescrChoice(); shared publications are
// moved with GetSegPub().
2186 {
2187  CRef<CSeq_entry>& entry = entries.front();
2188  CBioseq& bioseq = entry->SetSeq();
2189 
2190  if (auto title = GetBioseqSetDescrTitle(bioseq.GetDescr())) {
2191  CRef<CSeqdesc> desc_new(new CSeqdesc);
2192  desc_new->SetTitle(*title);
2193  descr.Set().push_back(desc_new);
2194  }
2195 
2197  GetFirstSegDescrChoice(bioseq, CSeqdesc::e_Org, descr);
2199  }
2201  GetFirstSegDescrChoice(bioseq, CSeqdesc::e_Modif, descr);
2203  }
2204 
2205  GetSegPub(entries, descr);
2206 
2210  }
2211 }
2212 
2213 /**********************************************************/
2214 static void GetSegSetDblink(CSeq_descr& descr, TEntryList& entries /*SeqEntryPtr headsep*/, bool* drop)
2215 {
2216  if (entries.empty())
2217  return;
2218 
2219  CRef<CSeqdesc> gpid,
2220  dblink,
2221  cur_gpid,
2222  cur_dblink;
2223 
2224  Uint4 dblink_count = 0;
2225  Uint4 gpid_count = 0;
2226 
2227  bool bad_gpid = false;
2228  bool bad_dblink = false;
2229 
2230  for (auto& entry : entries) {
2231  cur_gpid.Reset();
2232  cur_dblink.Reset();
2233 
2234  CSeq_descr::Tdata& descr_list = entry->SetDescr();
2235 
2236  for (CSeq_descr::Tdata::iterator cur_descr = descr_list.begin(); cur_descr != descr_list.end();) {
2237  if (! (*cur_descr)->IsUser()) {
2238  ++cur_descr;
2239  continue;
2240  }
2241 
2242  const CUser_object& user = (*cur_descr)->GetUser();
2243  if (! user.CanGetType() || user.GetType().GetStr().empty()) {
2244  ++cur_descr;
2245  continue;
2246  }
2247 
2248  string type_str = user.GetType().GetStr();
2249 
2250  if (type_str == "DBLink") {
2251  if (cur_dblink.NotEmpty())
2252  continue;
2253 
2254  dblink_count++;
2255  cur_dblink = *cur_descr;
2256 
2257  if (dblink.Empty())
2258  dblink = cur_dblink;
2259 
2260  cur_descr = descr_list.erase(cur_descr);
2261  } else if (type_str == "GenomeProjectsDB") {
2262  if (cur_gpid.NotEmpty())
2263  continue;
2264 
2265  gpid_count++;
2266  cur_gpid = *cur_descr;
2267 
2268  if (gpid.Empty())
2269  gpid = cur_gpid;
2270 
2271  cur_descr = descr_list.erase(cur_descr);
2272  } else
2273  ++cur_descr;
2274  }
2275 
2276  if (cur_dblink.NotEmpty()) {
2277  if (dblink.Empty())
2278  dblink = cur_dblink;
2279  else {
2280  if (! cur_dblink->Equals(*dblink)) {
2281  bad_dblink = true;
2282  break;
2283  }
2284  }
2285  }
2286 
2287  if (cur_gpid.NotEmpty()) {
2288  if (gpid.Empty())
2289  gpid = cur_gpid;
2290  else {
2291  if (! cur_gpid->Equals(*gpid)) {
2292  bad_gpid = true;
2293  break;
2294  }
2295  }
2296  }
2297  }
2298 
2299  if (bad_dblink == false && bad_gpid == false) {
2300  if (dblink_count > 0 && entries.size() != dblink_count)
2301  bad_dblink = true;
2302  if (gpid_count > 0 && entries.size() != gpid_count)
2303  bad_gpid = true;
2304  }
2305 
2306  if (bad_dblink) {
2307  ErrPostEx(SEV_REJECT, ERR_SEGMENT_DBLinkMissingOrNonUnique, "One or more member of segmented set has missing or non-unique DBLink user-object. Entry dropped.");
2308  *drop = true;
2309  }
2310 
2311  if (bad_gpid) {
2312  ErrPostEx(SEV_REJECT, ERR_SEGMENT_GPIDMissingOrNonUnique, "One or more member of segmented set has missing or non-unique GPID user-object. Entry dropped.");
2313  *drop = true;
2314  }
2315 
2316  if (bad_gpid || bad_dblink ||
2317  (dblink.Empty() && gpid.Empty()) ||
2318  descr.Get().empty())
2319  return;
2320 
2321  if (dblink.NotEmpty())
2322  descr.Set().push_back(dblink);
2323  if (gpid.NotEmpty())
2324  descr.Set().push_back(gpid);
2325 }
2326 
2327 /**********************************************************
2328  *
2329  * static void GetBioseqSetDescr(entries, descr, drop)
2330  *
2331  * 1-20-16
2332  *
2333  **********************************************************/
2334 static void GetBioseqSetDescr(TEntryList& entries, CSeq_descr& descr, bool* drop)
2335 {
2336  SrchSegDescr(entries, descr); /* get from ASN.1 tree */
2337  GetSegSetDblink(descr, entries, drop);
2338 }
2339 
2340 /**********************************************************
2341  *
2342  * static const char *GetMoleculeClassString(mol):
2343  *
2344  * 6-25-93
2345  *
2346  **********************************************************/
2347 static const char* GetMoleculeClassString(Uint1 mol)
2348 {
2349  if (mol == 0)
2350  return ("not-set");
2351  if (mol == 1)
2352  return ("DNA");
2353  if (mol == 2)
2354  return ("RNA");
2355  if (mol == 3)
2356  return ("AA");
2357  if (mol == 4)
2358  return ("NA");
2359  return ("other");
2360 }
2361 
2362 /**********************************************************
2363  *
2364  * static CSeq_inst::EMol SrchSegSeqMol(entries):
2365  *
2366  * 5-14-93
2367  *
2368  **********************************************************/
2370 {
2371  const CBioseq& orig_bioseq = entries.front()->GetSeq();
2372  CSeq_inst::EMol mol = orig_bioseq.GetInst().GetMol();
2373 
2374  for (const auto& entry : entries) {
2375  const CBioseq& cur_bioseq = entry->GetSeq();
2376  if (mol == cur_bioseq.GetInst().GetMol())
2377  continue;
2378 
2379  ErrPostEx(SEV_WARNING, ERR_SEGMENT_DiffMolType, "Different molecule type in the segment set, \"%s\" to \"%s\"", GetMoleculeClassString(mol), GetMoleculeClassString(cur_bioseq.GetInst().GetMol()));
2380 
2381  return CSeq_inst::eMol_na;
2382  }
2383 
2384  return mol;
2385 }
2386 
2387 /**********************************************************
2388  *
2389  * static Int4 SrchSegLength(entries):
2390  *
2391  * 5-14-93
2392  *
2393  **********************************************************/
2395 {
2396  Int4 length = 0;
2397 
2398  for (const auto& entry : entries) {
2399  const CBioseq& cur_bioseq = entry->GetSeq();
2400  length += cur_bioseq.GetLength();
2401  }
2402 
2403  return (length);
2404 }
2405 
/**********************************************************
 *
 *   static CRef<CBioseq> GetBioseq(pp, entries, slp):
 *
 *                                              5-12-93
 *
 **********************************************************/
2414 {
2415  IndexblkPtr ibp = pp->entrylist[pp->curindx];
2416  CRef<CBioseq> bioseq(new CBioseq);
2417 
2418  {
2419  string locusname = "SEG_";
2420  locusname.append(ibp->blocusname);
2421  bioseq->SetId().push_back(MakeSegSetSeqId(ibp->acnum, locusname, pp->seqtype, ibp->is_tpa));
2422  }
2423 
2424  if (pp->seg_acc) {
2425  string locusname = "SEG_";
2426  locusname.append(ibp->acnum);
2427  bioseq->SetId().push_back(MakeSegSetSeqId(ibp->acnum, locusname, pp->seqtype, ibp->is_tpa));
2428  }
2429 
2430  const CSeq_entry& first_entry = *(entries.front());
2431  const CBioseq& original = first_entry.GetSeq();
2432 
2433  if (auto title = GetBioseqSetDescrTitle(original.GetDescr())) {
2434  CRef<CSeqdesc> descr(new CSeqdesc);
2435  descr->SetTitle(*title);
2436  bioseq->SetDescr().Set().push_back(descr);
2437  }
2438 
2439  CSeq_inst& inst = bioseq->SetInst();
2441  inst.SetMol(SrchSegSeqMol(entries));
2442 
2443  bool need_null = false;
2444 
2445  CRef<CSeq_loc> null_loc(new CSeq_loc());
2446  null_loc->SetNull();
2447 
2448  for (CSeq_loc::const_iterator seq_it = slp.begin(); seq_it != slp.end(); ++seq_it) {
2449  if (need_null)
2450  inst.SetExt().SetSeg().Set().push_back(null_loc);
2451  else
2452  need_null = true;
2453 
2454  CRef<CSeq_loc> seqloc(new CSeq_loc());
2455  seqloc->Assign(seq_it.GetEmbeddingSeq_loc());
2456  inst.SetExt().SetSeg().Set().push_back(seqloc);
2457  }
2458 
2460  inst.SetFuzz().SetLim(CInt_fuzz::eLim_gt);
2461 
2462  return bioseq;
2463 }
2464 // LCOV_EXCL_STOP
2465 
2466 /**********************************************************
2467  *
2468  * void GetSeqExt(pp, slp):
2469  *
2470  * 5-12-93
2471  *
2472  **********************************************************/
2473 void GetSeqExt(ParserPtr pp, CSeq_loc& seq_loc)
2474 {
2475  const Indexblk* ibp;
2476 
2477  ibp = pp->entrylist[pp->curindx];
2478 
2479  CRef<CSeq_id> id = MakeAccSeqId(ibp->acnum, pp->seqtype, pp->accver, ibp->vernum);
2480 
2481  if (id.NotEmpty()) {
2482  CSeq_loc loc;
2483  loc.SetWhole(*id);
2484 
2485  seq_loc.Add(loc);
2486  }
2487 }
2488 
2489 // LCOV_EXCL_START
2490 // Excluded per Mark's request on 12/14/2016
/**********************************************************
 *
 *   void BuildBioSegHeader(pp, entries, seqloc):
 *
 *                                              2-24-94
 *
 **********************************************************/
2499 {
2500  if (entries.empty())
2501  return;
2502 
2503  IndexblkPtr ibp = pp->entrylist[pp->curindx];
2504 
2505  CRef<CBioseq> bioseq = GetBioseq(pp, entries, seqloc); /* Bioseq, ext */
2506 
2507  CRef<CSeq_entry> bioseq_entry(new CSeq_entry);
2508  bioseq_entry->SetSeq(*bioseq);
2509 
2510  CRef<CBioseq_set> bioseq_set(new CBioseq_set);
2511  bioseq_set->SetSeq_set().assign(entries.begin(), entries.end());
2512  bioseq_set->SetClass(CBioseq_set::eClass_parts);
2513 
2514  CRef<CSeq_entry> bioseq_set_entry(new CSeq_entry);
2515  bioseq_set_entry->SetSet(*bioseq_set);
2516 
2517  CRef<CBioseq_set> bioseq_set_head(new CBioseq_set);
2518  bioseq_set_head->SetSeq_set().push_back(bioseq_entry);
2519  bioseq_set_head->SetSeq_set().push_back(bioseq_set_entry);
2520 
2521  CRef<CSeq_descr> descr(new CSeq_descr);
2522  GetBioseqSetDescr(bioseq_set->SetSeq_set(), *descr, &ibp->drop);
2523  bioseq_set_head->SetDescr(*descr);
2524  bioseq_set_head->SetClass(CBioseq_set::eClass_segset);
2525 
2526  CRef<CSeq_entry> bioseq_set_head_entry(new CSeq_entry);
2527  bioseq_set_head_entry->SetSet(*bioseq_set_head);
2528 
2529  entries.clear();
2530  entries.push_back(bioseq_set_head_entry);
2531 }
2532 
2533 /**********************************************************
2534  *
2535  * bool IsSegBioseq(const CSeq_id& id):
2536  *
2537  * 8-16-93
2538  *
2539  **********************************************************/
2540 bool IsSegBioseq(const CSeq_id& id)
2541 {
2542  if (id.Which() == CSeq_id::e_Patent)
2543  return false;
2544 
2545  const CTextseq_id* text_id = id.GetTextseq_Id();
2546 
2547  if (! text_id)
2548  return (false);
2549 
2550  if (! text_id->IsSetAccession() && text_id->IsSetName() &&
2551  StringEquN(text_id->GetName().c_str(), "SEG_", 4))
2552  return (true);
2553  return (false);
2554 }
2555 // LCOV_EXCL_STOP
2556 
2557 /**********************************************************
2558  *
2559  * char* check_div(pat_acc, pat_ref, est_kwd, sts_kwd,
2560  * gss_kwd, if_cds, div, tech, bases,
2561  * source, drop):
2562  *
2563  * 8-16-93
2564  *
2565  * gss and 1000 limit added.
2566  * 9-09-96
2567  *
2568  **********************************************************/
2569 bool check_div(bool pat_acc, bool pat_ref, bool est_kwd, bool sts_kwd, bool gss_kwd, bool if_cds, string& div, CMolInfo::TTech* tech, size_t bases, Parser::ESource source, bool& drop)
2570 {
2571  if (div.empty())
2572  return false;
2573 
2574  if (pat_acc || pat_ref || StringEqu(div.c_str(), "PAT")) {
2575  if (pat_ref == false) {
2576  ErrPostEx(SEV_REJECT, ERR_DIVISION_MissingPatentRef, "Record in the patent division lacks a reference to a patent document. Entry dropped.");
2577  drop = true;
2578  }
2579  if (est_kwd) {
2580  ErrPostEx(SEV_WARNING, ERR_DIVISION_PATHasESTKeywords, "EST keywords present on patent sequence.");
2581  }
2582  if (sts_kwd) {
2583  ErrPostEx(SEV_WARNING, ERR_DIVISION_PATHasSTSKeywords, "STS keywords present on patent sequence.");
2584  }
2585  if (gss_kwd) {
2586  ErrPostEx(SEV_WARNING, ERR_DIVISION_PATHasGSSKeywords, "GSS keywords present on patent sequence.");
2587  }
2588  if (if_cds && source != Parser::ESource::EMBL) {
2589  ErrPostEx(SEV_INFO, ERR_DIVISION_PATHasCDSFeature, "CDS features present on patent sequence.");
2590  }
2591  if (! StringEqu(div.c_str(), "PAT")) {
2592  if (pat_acc)
2593  ErrPostEx(SEV_WARNING, ERR_DIVISION_ShouldBePAT, "Based on the accession number prefix letters, this is a patent sequence, but the division code is not PAT.");
2594 
2595  ErrPostEx(SEV_INFO, ERR_DIVISION_MappedtoPAT, "Division %s mapped to PAT based on %s.", div.c_str(), (pat_acc == false) ? "patent reference" : "accession number");
2596  div = "PAT";
2597  }
2598  } else if (est_kwd) {
2599  if (if_cds) {
2600  if (StringEqu(div.c_str(), "EST")) {
2601  ErrPostEx(SEV_WARNING, ERR_DIVISION_ESTHasCDSFeature, "Coding region features exist and division is EST; EST might not be appropriate.");
2602  } else {
2603  ErrPostEx(SEV_INFO, ERR_DIVISION_NotMappedtoEST, "EST keywords exist, but this entry was not mapped to the EST division because of the presence of CDS features.");
2604  if (*tech == CMolInfo::eTech_est)
2605  *tech = CMolInfo::eTech_unknown;
2606  }
2607  } else if (bases > 1000) {
2608  if (StringEqu(div.c_str(), "EST")) {
2609  ErrPostEx(SEV_WARNING, ERR_DIVISION_LongESTSequence, "Division code is EST, but the length of the sequence is %ld.", bases);
2610  } else {
2611  ErrPostEx(SEV_WARNING, ERR_DIVISION_NotMappedtoEST, "EST keywords exist, but this entry was not mapped to the EST division because of the sequence length %ld.", bases);
2612  if (*tech == CMolInfo::eTech_est)
2613  *tech = CMolInfo::eTech_unknown;
2614  }
2615  } else {
2616  if (! StringEqu(div.c_str(), "EST"))
2617  ErrPostEx(SEV_INFO, ERR_DIVISION_MappedtoEST, "%s division mapped to EST.", div.c_str());
2618  *tech = CMolInfo::eTech_est;
2619  div.clear();
2620  }
2621  } else if (StringEqu(div.c_str(), "EST")) {
2622  ErrPostEx(SEV_WARNING, ERR_DIVISION_MissingESTKeywords, "Division is EST, but entry lacks EST-related keywords.");
2623  if (sts_kwd) {
2624  ErrPostEx(SEV_WARNING, ERR_DIVISION_ESTHasSTSKeywords, "STS keywords present on EST sequence.");
2625  }
2626  if (if_cds) {
2627  ErrPostEx(SEV_WARNING, ERR_DIVISION_ESTHasCDSFeature, "Coding region features exist and division is EST; EST might not be appropriate.");
2628  }
2629  } else if (sts_kwd) {
2630  if (if_cds) {
2631  if (StringEqu(div.c_str(), "STS")) {
2632  ErrPostEx(SEV_WARNING, ERR_DIVISION_STSHasCDSFeature, "Coding region features exist and division is STS; STS might not be appropriate.");
2633  } else {
2634  ErrPostEx(SEV_WARNING, ERR_DIVISION_NotMappedtoSTS, "STS keywords exist, but this entry was not mapped to the STS division because of the presence of CDS features.");
2635  if (*tech == CMolInfo::eTech_sts)
2636  *tech = CMolInfo::eTech_unknown;
2637  }
2638  } else if (bases > 1000) {
2639  if (StringEqu(div.c_str(), "STS")) {
2640  ErrPostEx(SEV_WARNING, ERR_DIVISION_LongSTSSequence, "Division code is STS, but the length of the sequence is %ld.", bases);
2641  } else {
2642  ErrPostEx(SEV_WARNING, ERR_DIVISION_NotMappedtoSTS, "STS keywords exist, but this entry was not mapped to the STS division because of the sequence length %ld.", bases);
2643  if (*tech == CMolInfo::eTech_sts)
2644  *tech = CMolInfo::eTech_unknown;
2645  }
2646  } else {
2647  if (! StringEqu(div.c_str(), "STS"))
2648  ErrPostEx(SEV_INFO, ERR_DIVISION_MappedtoSTS, "%s division mapped to STS.", div.c_str());
2649  *tech = CMolInfo::eTech_sts;
2650  div.clear();
2651  }
2652  } else if (StringEqu(div.c_str(), "STS")) {
2653  ErrPostEx(SEV_WARNING, ERR_DIVISION_MissingSTSKeywords, "Division is STS, but entry lacks STS-related keywords.");
2654  if (if_cds) {
2655  ErrPostEx(SEV_WARNING, ERR_DIVISION_STSHasCDSFeature, "Coding region features exist and division is STS; STS might not be appropriate.");
2656  }
2657  } else if (gss_kwd) {
2658  if (if_cds) {
2659  if (StringEqu(div.c_str(), "GSS")) {
2660  ErrPostEx(SEV_WARNING, ERR_DIVISION_GSSHasCDSFeature, "Coding region features exist and division is GSS; GSS might not be appropriate.");
2661  } else {
2662  ErrPostEx(SEV_WARNING, ERR_DIVISION_NotMappedtoGSS, "GSS keywords exist, but this entry was not mapped to the GSS division because of the presence of CDS features.");
2663  if (*tech == CMolInfo::eTech_survey)
2664  *tech = CMolInfo::eTech_unknown;
2665  }
2666  } else if (bases > 2500) {
2667  if (StringEqu(div.c_str(), "GSS")) {
2668  ErrPostEx(SEV_WARNING, ERR_DIVISION_LongGSSSequence, "Division code is GSS, but the length of the sequence is %ld.", bases);
2669  } else {
2670  ErrPostEx(SEV_WARNING, ERR_DIVISION_NotMappedtoGSS, "GSS keywords exist, but this entry was not mapped to the GSS division because of the sequence length %ld.", bases);
2671  if (*tech == CMolInfo::eTech_survey)
2672  *tech = CMolInfo::eTech_unknown;
2673  }
2674  } else {
2675  if (! StringEqu(div.c_str(), "GSS"))
2676  ErrPostEx(SEV_INFO, ERR_DIVISION_MappedtoGSS, "%s division mapped to GSS.", div.c_str());
2677  *tech = CMolInfo::eTech_survey;
2678  div.clear();
2679  }
2680  } else if (StringEqu(div.c_str(), "GSS")) {
2681  ErrPostEx(SEV_WARNING, ERR_DIVISION_MissingGSSKeywords, "Division is GSS, but entry lacks GSS-related keywords.");
2682  if (if_cds) {
2683  ErrPostEx(SEV_WARNING, ERR_DIVISION_GSSHasCDSFeature, "Coding region features exist and division is GSS; GSS might not be appropriate.");
2684  }
2685  } else if (StringEqu(div.c_str(), "TSA")) {
2686  *tech = CMolInfo::eTech_tsa;
2687  div.clear();
2688  }
2689 
2690  return ! div.empty();
2691 }
2692 
2693 /**********************************************************/
2694 CRef<CSeq_id> StrToSeqId(const char* pch, bool pid)
2695 {
2696  long lID;
2697  char* pchEnd;
2698 
2699  CRef<CSeq_id> id;
2700 
2701  /* Figure out--what source is it */
2702  if (*pch == 'd' || *pch == 'e') {
2703  /* Get ID */
2704  errno = 0; /* clear errors, the error flag from stdlib */
2705  lID = strtol(pch + 1, &pchEnd, 10);
2706 
2707  if (! ((lID == 0 && pch + 1 == pchEnd) || (lID == LONG_MAX && errno == ERANGE))) {
2708  /* Allocate new SeqId */
2709 
2710  id = new CSeq_id;
2712  tag->SetStr(string(pch, pchEnd - pch));
2713 
2714  CRef<CDbtag> dbtag(new CDbtag);
2715  dbtag->SetTag(*tag);
2716  dbtag->SetDb(pid ? "PID" : "NID");
2717 
2718  id->SetGeneral(*dbtag);
2719  }
2720  }
2721 
2722  return id;
2723 }
2724 
2725 /**********************************************************/
2726 void AddNIDSeqId(CBioseq& bioseq, const DataBlk& entry, Int2 type, Int2 coldata, Parser::ESource source)
2727 {
2728  DataBlkPtr dbp;
2729  char* offset;
2730 
2731  dbp = TrackNodeType(entry, type);
2732  if (! dbp)
2733  return;
2734 
2735  offset = dbp->mOffset + coldata;
2736  CRef<CSeq_id> sid = StrToSeqId(offset, false);
2737  if (sid.Empty())
2738  return;
2739 
2740  if (! (*offset == 'g' && (source == Parser::ESource::DDBJ || source == Parser::ESource::EMBL)))
2741  bioseq.SetId().push_back(sid);
2742 }
2743 
2744 /**********************************************************/
2745 static void CheckDivCode(TEntryList& seq_entries, ParserPtr pp)
2746 {
2747  for (auto& entry : seq_entries) {
2748  for (CTypeIterator<CBioseq> bioseq(Begin(*entry)); bioseq; ++bioseq) {
2749  if (bioseq->IsSetDescr()) {
2750  CGB_block* gb_block = nullptr;
2751  CMolInfo* molinfo = nullptr;
2753 
2754  for (auto& descr : bioseq->SetDescr().Set()) {
2755  if (descr->IsGenbank() && ! gb_block)
2756  gb_block = &descr->SetGenbank();
2757  else if (descr->IsMolinfo() && ! molinfo) {
2758  molinfo = &descr->SetMolinfo();
2759  tech = molinfo->GetTech();
2760  }
2761 
2762  if (gb_block && molinfo)
2763  break;
2764  }
2765 
2766  if (! gb_block)
2767  continue;
2768 
2769  IndexblkPtr ibp = pp->entrylist[pp->curindx];
2770 
2771  if (tech == CMolInfo::eTech_tsa &&
2772  ! NStr::CompareNocase(ibp->division, "TSA"))
2773  continue;
2774 
2775  if (! gb_block->IsSetDiv()) {
2776  ErrPostEx(SEV_WARNING, ERR_DIVISION_GBBlockDivision, "input division code is preserved in GBBlock");
2777  gb_block->SetDiv(ibp->division);
2778  }
2779  }
2780  }
2781  }
2782 }
2783 
2784 /**********************************************************/
2785 static const CBioSource* GetTopBiosource(const CSeq_entry& entry)
2786 {
2787  const TSeqdescList& descrs = GetDescrPointer(entry);
2788  for (const auto& descr : descrs) {
2789  if (descr->IsSource())
2790  return &(descr->GetSource());
2791  }
2792 
2793  return nullptr;
2794 }
2795 
2796 /**********************************************************/
2797 static bool SeqEntryCheckTaxonDiv(const CSeq_entry& entry)
2798 {
2799  const CBioSource* bio_src = GetTopBiosource(entry);
2800  if (! bio_src)
2801  return false;
2802 
2803  if (! bio_src->IsSetOrg() || ! bio_src->GetOrg().IsSetOrgname() || ! bio_src->GetOrg().GetOrgname().IsSetDiv())
2804  return false;
2805 
2806  return true;
2807 }
2808 
2809 /**********************************************************/
2811 {
2812  if (seq_entries.empty())
2813  return;
2814 
2815  if (! SeqEntryCheckTaxonDiv(*seq_entries.front())) {
2816  CheckDivCode(seq_entries, pp);
2817  }
2818 }
2819 
2820 /**********************************************************/
2821 void DefVsHTGKeywords(CMolInfo::TTech tech, const DataBlk& entry, Int2 what, Int2 ori, bool cancelled)
2822 {
2823  DataBlkPtr dbp;
2824  const char** b;
2825  char* tmp;
2826  char* p;
2827  char* q;
2828  char* r;
2829  Int2 count;
2830 
2831  dbp = TrackNodeType(entry, what);
2832  if (! dbp || ! dbp->mOffset || dbp->len < 1)
2833  p = nullptr;
2834  else {
2835  tmp = StringSave(string_view(dbp->mOffset, dbp->len - 1));
2836  for (q = tmp; *q != '\0'; q++) {
2837  if (*q == '\n' && StringEquN(q + 1, "DE ", 5))
2838  fta_StringCpy(q, q + 5);
2839  else if (*q == '\n' || *q == '\t')
2840  *q = ' ';
2841  }
2842  for (q = tmp, p = tmp; *p != '\0'; p++) {
2843  if (*p == ' ' && p[1] == ' ')
2844  continue;
2845  *q++ = *p;
2846  }
2847  *q = '\0';
2848  for (b = magic_phrases, p = nullptr; *b && ! p; b++)
2849  p = StringStr(tmp, *b);
2850  MemFree(tmp);
2851  }
2852 
2853  if ((tech == CMolInfo::eTech_htgs_0 || tech == CMolInfo::eTech_htgs_1 ||
2854  tech == CMolInfo::eTech_htgs_2) &&
2855  ! p && ! cancelled) {
2856  ErrPostEx(SEV_WARNING, ERR_DEFINITION_HTGNotInProgress, "This Phase 0, 1 or 2 HTGS sequence is lacking an indication that sequencing is still in progress on its definition/description line.");
2857  } else if (tech == CMolInfo::eTech_htgs_3 && p) {
2858  ErrPostEx(SEV_ERROR, ERR_DEFINITION_HTGShouldBeComplete, "This complete Phase 3 sequence has a definition/description line indicating that its sequencing is still in progress.");
2859  }
2860 
2861  if (tech != CMolInfo::eTech_htgs_3)
2862  return;
2863 
2864  dbp = TrackNodeType(entry, ori);
2865  if (! dbp || ! dbp->mOffset || dbp->len < 1)
2866  return;
2867  r = new char[dbp->len + 1];
2868  if (! r)
2869  return;
2870  StringNCpy(r, dbp->mOffset, dbp->len);
2871  r[dbp->len] = '\0';
2872  for (p = r, q = r; *p != '\0'; p++)
2873  if (*p >= 'a' && *p <= 'z')
2874  *q++ = *p;
2875  *q = '\0';
2876 
2877  for (count = 0, p = r; *p != '\0'; p++) {
2878  if (*p != 'n')
2879  count = 0;
2880  else if (++count > 10) {
2881  ErrPostEx(SEV_WARNING, ERR_SEQUENCE_UnknownBaseHTG3, "This complete Phase 3 HTGS sequence has one or more runs of 10 contiguous unknown ('n') bases.");
2882  break;
2883  }
2884  }
2885  delete[] r;
2886 }
2887 
2888 /**********************************************************/
2889 void XMLDefVsHTGKeywords(CMolInfo::TTech tech, const char* entry, XmlIndexPtr xip, bool cancelled)
2890 {
2891  const char** b;
2892  char* tmp;
2893  char* p;
2894  char* q;
2895  char* r;
2896  Int2 count;
2897 
2898  if (! entry || ! xip)
2899  return;
2900 
2902  if (! tmp)
2903  p = nullptr;
2904  else {
2905  for (q = tmp; *q != '\0'; q++)
2906  if (*q == '\n' || *q == '\t')
2907  *q = ' ';
2908  for (q = tmp, p = tmp; *p != '\0'; p++) {
2909  if (*p == ' ' && p[1] == ' ')
2910  continue;
2911  *q++ = *p;
2912  }
2913  *q = '\0';
2914  for (b = magic_phrases, p = nullptr; *b && ! p; b++)
2915  p = StringStr(tmp, *b);
2916  MemFree(tmp);
2917  }
2918 
2919  if ((tech == CMolInfo::eTech_htgs_0 || tech == CMolInfo::eTech_htgs_1 ||
2920  tech == CMolInfo::eTech_htgs_2) &&
2921  ! p && ! cancelled) {
2922  ErrPostEx(SEV_WARNING, ERR_DEFINITION_HTGNotInProgress, "This Phase 0, 1 or 2 HTGS sequence is lacking an indication that sequencing is still in progress on its definition/description line.");
2923  } else if (tech == CMolInfo::eTech_htgs_3 && p) {
2924  ErrPostEx(SEV_ERROR, ERR_DEFINITION_HTGShouldBeComplete, "This complete Phase 3 sequence has a definition/description line indicating that its sequencing is still in progress.");
2925  }
2926 
2927  if (tech != CMolInfo::eTech_htgs_3)
2928  return;
2929 
2930  r = StringSave(XMLFindTagValue(entry, xip, INSDSEQ_SEQUENCE));
2931  if (! r)
2932  return;
2933 
2934  for (count = 0, p = r; *p != '\0'; p++) {
2935  if (*p != 'n')
2936  count = 0;
2937  else if (++count > 10) {
2938  ErrPostEx(SEV_WARNING, ERR_SEQUENCE_UnknownBaseHTG3, "This complete Phase 3 HTGS sequence has one or more runs of 10 contiguous unknown ('n') bases.");
2939  break;
2940  }
2941  }
2942  MemFree(r);
2943 }
2944 
2945 /**********************************************************/
2946 void CheckHTGDivision(const char* div, CMolInfo::TTech tech)
2947 {
2948  if (div && StringEqu(div, "HTG") && tech == CMolInfo::eTech_htgs_3) {
2949  ErrPostEx(SEV_WARNING, ERR_DIVISION_ShouldNotBeHTG, "This Phase 3 HTGS sequence is still in the HTG division. If truly complete, it should move to a non-HTG division.");
2950  } else if ((! div || ! StringEqu(div, "HTG")) &&
2951  (tech == CMolInfo::eTech_htgs_0 || tech == CMolInfo::eTech_htgs_1 ||
2952  tech == CMolInfo::eTech_htgs_2)) {
2953  ErrPostEx(SEV_ERROR, ERR_DIVISION_ShouldBeHTG, "Phase 0, 1 or 2 HTGS sequences should have division code HTG.");
2954  }
2955 }
2956 
2957 /**********************************************************/
2959 {
2960  if (entry.IsSeq())
2961  return entry.GetSeq().GetDescr();
2962 
2963  return entry.GetSet().GetDescr();
2964 }
2965 
/**********************************************************
 *
 *   static void CleanVisString(str):
 *
 *      Trims "str" in place:
 *    - leading blanks and control characters are removed;
 *    - trailing blanks, control characters and ';' are
 *      removed, except that one ';' is kept when the
 *      remaining text ends with a token starting with
 *      '&' (the ';' is then taken to close it, as in
 *      "&lt;").
 *      A string with nothing else in it becomes empty.
 *
 **********************************************************/
static void CleanVisString(string& str)
{
    if (str.empty())
        return;

    /* Compare as unsigned so that bytes above 0x7F are never taken for
       blanks, whatever the signedness of plain char is. */
    auto is_blank = [](char c) { return static_cast<unsigned char>(c) <= ' '; };

    size_t start_pos = 0;
    while (start_pos < str.size() && is_blank(str[start_pos]))
        ++start_pos;

    if (start_pos == str.size()) {
        str.clear();
        return;
    }

    str.erase(0, start_pos);

    /* end_pos: one past the last character which is neither ';' nor blank */
    size_t end_pos = str.size();
    while (end_pos > 0 && (str[end_pos - 1] == ';' || is_blank(str[end_pos - 1])))
        --end_pos;

    if (end_pos == 0) {
        str.clear();
        return;
    }

    /* Nothing, or something other than ';', follows the kept text */
    if (end_pos == str.size() || str[end_pos] != ';') {
        str.resize(end_pos);
        return;
    }

    /* A ';' follows. Walk back to the start of the last token; if that
       token begins with '&', the ';' belongs to it and is kept. */
    size_t amp_pos = end_pos - 1;
    for (; amp_pos; --amp_pos) {
        if (str[amp_pos] == ' ' || str[amp_pos] == '&' || str[amp_pos] == ';')
            break;
    }

    if (str[amp_pos] == '&')
        ++end_pos;

    str.resize(end_pos);
}
3013 
3014 /**********************************************************/
3015 static void CleanVisStringList(list<string>& str_list)
3016 {
3017  for (list<string>::iterator it = str_list.begin(); it != str_list.end();) {
3018  CleanVisString(*it);
3019 
3020  if (it->empty())
3021  it = str_list.erase(it);
3022  else
3023  ++it;
3024  }
3025 }
3026 
3027 /**********************************************************/
3028 static void CheckGBBlock(TSeqdescList& descrs, bool& got)
3029 {
3030  const Char* div = nullptr;
3031 
3032  for (const auto& descr : descrs) {
3033  if (! descr->IsEmbl())
3034  continue;
3035 
3036  if (! descr->GetEmbl().IsSetDiv() || descr->GetEmbl().GetDiv() > 15)
3037  continue;
3038 
3039  div = GetEmblDiv(descr->GetEmbl().GetDiv());
3040  break;
3041  }
3042 
3043  for (TSeqdescList::iterator descr = descrs.begin(); descr != descrs.end();) {
3044  if (! (*descr)->IsGenbank()) {
3045  ++descr;
3046  continue;
3047  }
3048 
3049  CGB_block& gb_block = (*descr)->SetGenbank();
3050  if (div && gb_block.IsSetDiv() && NStr::CompareNocase(div, gb_block.GetDiv().c_str()) == 0)
3051  gb_block.ResetDiv();
3052 
3053  if (gb_block.IsSetSource()) {
3054  got = true;
3055  } else if (gb_block.IsSetDiv() && gb_block.GetDiv() != "PAT" &&
3056  gb_block.GetDiv() != "SYN") {
3057  got = true;
3058  }
3059 
3060  if (gb_block.IsSetExtra_accessions()) {
3062  if (gb_block.GetExtra_accessions().empty())
3063  gb_block.ResetExtra_accessions();
3064  }
3065 
3066 
3067  if (gb_block.IsSetKeywords()) {
3068  CleanVisStringList(gb_block.SetKeywords());
3069  if (gb_block.GetKeywords().empty())
3070  gb_block.ResetKeywords();
3071  }
3072 
3073  if (gb_block.IsSetSource()) {
3074  string& buf = gb_block.SetSource();
3076  if (buf.empty())
3077  gb_block.ResetSource();
3078  }
3079 
3080  if (gb_block.IsSetOrigin()) {
3081  string& buf = gb_block.SetOrigin();
3083  if (buf.empty())
3084  gb_block.ResetOrigin();
3085  }
3086 
3087  if (gb_block.IsSetDate()) {
3088  string& buf = gb_block.SetDate();
3090  if (buf.empty())
3091  gb_block.ResetDate();
3092  }
3093 
3094  if (gb_block.IsSetDiv()) {
3095  string& buf = gb_block.SetDiv();
3097  if (buf.empty())
3098  gb_block.ResetDiv();
3099  }
3100 
3101  if (! gb_block.IsSetExtra_accessions() && ! gb_block.IsSetSource() &&
3102  ! gb_block.IsSetKeywords() && ! gb_block.IsSetOrigin() &&
3103  ! gb_block.IsSetDate() && ! gb_block.IsSetEntry_date() &&
3104  ! gb_block.IsSetDiv()) {
3105  descr = descrs.erase(descr);
3106  } else {
3107  ++descr;
3108  }
3109  }
3110 }
3111 
3112 /**********************************************************/
3114 {
3115  bool got = false;
3116 
3117  for (auto& entry : seq_entries) {
3118  for (CTypeIterator<CBioseq> bioseq(Begin(*entry)); bioseq; ++bioseq) {
3119  if (bioseq->IsSetDescr())
3120  CheckGBBlock(bioseq->SetDescr().Set(), got);
3121  }
3122 
3123  for (CTypeIterator<CBioseq_set> bio_set(Begin(*entry)); bio_set; ++bio_set) {
3124  if (bio_set->IsSetDescr())
3125  CheckGBBlock(bio_set->SetDescr().Set(), got);
3126  }
3127  }
3128 
3129  return (got);
3130 }
3131 
3132 /**********************************************************/
3133 static int GetSerialNumFromPubEquiv(const CPub_equiv& pub_eq)
3134 {
3135  int ret = -1;
3136  for (const auto& pub : pub_eq.Get()) {
3137  if (pub->IsGen()) {
3138  if (pub->GetGen().IsSetSerial_number()) {
3139  ret = pub->GetGen().GetSerial_number();
3140  break;
3141  }
3142  }
3143  }
3144 
3145  return ret;
3146 }
3147 
3148 /**********************************************************/
3149 static bool fta_if_pubs_sorted(const CPub_equiv& pub1, const CPub_equiv& pub2)
3150 {
3151  Int4 num1 = GetSerialNumFromPubEquiv(pub1);
3152  Int4 num2 = GetSerialNumFromPubEquiv(pub2);
3153 
3154  return num1 < num2;
3155 }
3156 
3157 /**********************************************************/
3158 static bool descr_cmp(const CRef<CSeqdesc>& desc1,
3159  const CRef<CSeqdesc>& desc2)
3160 {
3161  if (desc1->Which() == desc2->Which() && desc1->IsPub()) {
3162  const CPub_equiv& pub1 = desc1->GetPub().GetPub();
3163  const CPub_equiv& pub2 = desc2->GetPub().GetPub();
3164  return fta_if_pubs_sorted(pub1, pub2);
3165  }
3166  if (desc1->Which() == desc2->Which() && desc1->IsUser()) {
3167  const CUser_object& uop1 = desc1->GetUser();
3168  const CUser_object& uop2 = desc2->GetUser();
3169  const char* str1;
3170  const char* str2;
3171  if (uop1.IsSetType() && uop1.GetType().IsStr() &&
3172  uop2.IsSetType() && uop2.GetType().IsStr()) {
3173  str1 = uop1.GetType().GetStr().c_str();
3174  str2 = uop2.GetType().GetStr().c_str();
3175  if (strcmp(str1, str2) <= 0)
3176  return (true);
3177  return (false);
3178  }
3179  }
3180 
3181  return desc1->Which() < desc2->Which();
3182 }
3183 
3184 /**********************************************************/
3185 void fta_sort_descr(TEntryList& seq_entries)
3186 {
3187  for (auto& entry : seq_entries) {
3188  for (CTypeIterator<CBioseq> bioseq(Begin(*entry)); bioseq; ++bioseq) {
3189  if (bioseq->IsSetDescr())
3190  bioseq->SetDescr().Set().sort(descr_cmp);
3191  }
3192 
3193  for (CTypeIterator<CBioseq_set> bio_set(Begin(*entry)); bio_set; ++bio_set) {
3194  if (bio_set->IsSetDescr())
3195  bio_set->SetDescr().Set().sort(descr_cmp);
3196  }
3197  }
3198 }
3199 
3200 /**********************************************************/
3201 static bool pub_cmp(const CRef<CPub>& pub1, const CRef<CPub>& pub2)
3202 {
3203  if (pub1->Which() == pub2->Which()) {
3204  if (pub1->IsMuid()) {
3205  return pub1->GetMuid() < pub2->GetMuid();
3206  } else if (pub1->IsGen()) {
3207  const CCit_gen& cit1 = pub1->GetGen();
3208  const CCit_gen& cit2 = pub2->GetGen();
3209 
3210  if (cit1.IsSetCit() && cit2.IsSetCit())
3211  return cit1.GetCit() < cit2.GetCit();
3212  }
3213  }
3214 
3215  return pub1->Which() < pub2->Which();
3216 }
3217 
3218 /**********************************************************/
3219 static void sort_feat_cit(CBioseq::TAnnot& annots)
3220 {
3221  for (auto& annot : annots) {
3222  if (annot->IsFtable()) {
3223  for (auto& feat : annot->SetData().SetFtable()) {
3224  if (feat->IsSetCit() && feat->GetCit().IsPub()) {
3225  // feat->SetCit().SetPub().sort(pub_cmp); TODO: may be this sort would be OK, the only difference with original one is it is stable
3226 
3227  TPubList& pubs = feat->SetCit().SetPub();
3228  for (TPubList::iterator pub = pubs.begin(); pub != pubs.end(); ++pub) {
3229  TPubList::iterator next_pub = pub;
3230  for (++next_pub; next_pub != pubs.end(); ++next_pub) {
3231  if (pub_cmp(*next_pub, *pub))
3232  swap(*next_pub, *pub);
3233  }
3234  }
3235  }
3236  }
3237  }
3238  }
3239 }
3240 
3241 /**********************************************************/
3243 {
3244  for (auto& entry : seq_entries) {
3245  for (CTypeIterator<CBioseq> bioseq(Begin(*entry)); bioseq; ++bioseq) {
3246  if (bioseq->IsSetAnnot())
3247  sort_feat_cit(bioseq->SetAnnot());
3248  }
3249 
3250  for (CTypeIterator<CBioseq_set> bio_set(Begin(*entry)); bio_set; ++bio_set) {
3251  if (bio_set->IsSetAnnot())
3252  sort_feat_cit(bio_set->SetAnnot());
3253  }
3254  }
3255 }
3256 
3257 /**********************************************************/
3259 {
3260  for (const auto& tag : dbtags) {
3261  if (tag->IsSetDb() && tag->IsSetTag() &&
3262  ! tag->GetTag().IsStr() && tag->GetTag().GetId() > 0 &&
3263  tag->GetDb() == "taxon")
3264  return true;
3265  }
3266  return false;
3267 }
3268 
3269 /**********************************************************/
3270 void fta_fix_orgref_div(const CBioseq::TAnnot& annots, COrg_ref* org_ref, CGB_block& gbb)
3271 {
3272  Int4 count;
3273 
3274  if (! org_ref || ! gbb.IsSetDiv())
3275  return;
3276 
3277  count = 1;
3278  if (org_ref->IsSetOrgname() && ! org_ref->GetOrgname().IsSetDiv() &&
3279  ! fta_orgref_has_taxid(org_ref->GetDb())) {
3280  org_ref->SetOrgname().SetDiv(gbb.GetDiv());
3281  count--;
3282  }
3283 
3284  for (const auto& annot : annots) {
3285  if (! annot->IsFtable())
3286  continue;
3287 
3288  const CSeq_annot::C_Data::TFtable& feats = annot->GetData().GetFtable();
3289  for (const auto& feat : feats) {
3290  if (! feat->IsSetData() || ! feat->GetData().IsBiosrc())
3291  continue;
3292 
3293  count++;
3294 
3295  const CBioSource& bio_src = feat->GetData().GetBiosrc();
3296  if (bio_src.IsSetOrg() && ! fta_orgref_has_taxid(bio_src.GetOrg().GetDb())) {
3297  org_ref->SetOrgname().SetDiv(gbb.GetDiv());
3298  count--;
3299  }
3300  }
3301  }
3302 
3303  if (count > 0)
3304  return;
3305 
3306  gbb.ResetDiv();
3307 }
3308 
3309 /**********************************************************/
3310 bool XMLCheckCDS(const char* entry, XmlIndexPtr xip)
3311 {
3312  XmlIndexPtr txip;
3313  XmlIndexPtr fxip;
3314 
3315  if (! entry || ! xip)
3316  return (false);
3317 
3318  for (; xip; xip = xip->next)
3319  if (xip->tag == INSDSEQ_FEATURE_TABLE && xip->subtags)
3320  break;
3321  if (! xip)
3322  return (false);
3323 
3324  for (txip = xip->subtags; txip; txip = txip->next) {
3325  if (! txip->subtags)
3326  continue;
3327  for (fxip = txip->subtags; fxip; fxip = fxip->next)
3328  if (fxip->tag == INSDFEATURE_KEY && fxip->end - fxip->start == 3 &&
3329  StringEquN(entry + fxip->start, "CDS", 3))
3330  break;
3331  if (fxip)
3332  break;
3333  }
3334 
3335  if (! txip)
3336  return (false);
3337  return (true);
3338 }
3339 
3340 /**********************************************************/
3342 {
3343  for (auto& entry : seq_entries) {
3344  for (CTypeIterator<CBioseq> bioseq(Begin(*entry)); bioseq; ++bioseq) {
3345  if (bioseq->IsSetInst() && bioseq->GetInst().IsSetStrand())
3346  continue;
3347 
3348  if (bioseq->GetInst().IsSetMol()) {
3349  CSeq_inst::EMol mol = bioseq->GetInst().GetMol();
3350  if (mol == CSeq_inst::eMol_dna)
3351  bioseq->SetInst().SetStrand(CSeq_inst::eStrand_ds);
3352  else if (mol == CSeq_inst::eMol_rna || mol == CSeq_inst::eMol_aa)
3353  bioseq->SetInst().SetStrand(CSeq_inst::eStrand_ss);
3354  }
3355  }
3356  }
3357 }
3358 
3359 /*****************************************************************************/
3360 static bool SwissProtIDPresent(const TEntryList& seq_entries)
3361 {
3362  for (const auto& entry : seq_entries) {
3363  for (CTypeConstIterator<CBioseq> bioseq(Begin(*entry)); bioseq; ++bioseq) {
3364  if (bioseq->IsSetId()) {
3365  for (const auto& id : bioseq->GetId()) {
3366  if (id->IsSwissprot())
3367  return true;
3368  }
3369  }
3370  }
3371  }
3372 
3373  return false;
3374 }
3375 
3376 /*****************************************************************************/
3377 static bool IsCitEmpty(const CCit_gen& cit)
3378 {
3379  if (cit.IsSetCit() || cit.IsSetAuthors() || cit.IsSetMuid() ||
3380  cit.IsSetJournal() || cit.IsSetVolume() || cit.IsSetIssue() ||
3381  cit.IsSetPages() || cit.IsSetDate() || cit.IsSetTitle() ||
3382  cit.IsSetPmid() || cit.IsSetSerial_number())
3383  return false;
3384 
3385  return true;
3386 }
3387 
3388 /*****************************************************************************/
3389 static void RemoveSerials(TPubList& pubs)
3390 {
3391  for (TPubList::iterator pub = pubs.begin(); pub != pubs.end();) {
3392  if ((*pub)->IsGen()) {
3393  if ((*pub)->GetGen().IsSetSerial_number())
3394  (*pub)->SetGen().ResetSerial_number();
3395 
3396  if (IsCitEmpty((*pub)->GetGen()))
3397  pub = pubs.erase(pub);
3398  else
3399  ++pub;
3400  } else
3401  ++pub;
3402  }
3403 }
3404 
3405 /*****************************************************************************/
3406 void StripSerialNumbers(TEntryList& seq_entries)
3407 {
3408  if (! SwissProtIDPresent(seq_entries)) {
3409  for (auto& entry : seq_entries) {
3410  for (CTypeIterator<CPubdesc> pubdesc(Begin(*entry)); pubdesc; ++pubdesc) {
3411  if (pubdesc->IsSetPub()) {
3412  RemoveSerials(pubdesc->SetPub().Set());
3413  if (pubdesc->GetPub().Get().empty())
3414  pubdesc->ResetPub();
3415  }
3416  }
3417 
3418  for (CTypeIterator<CSeq_feat> feat(Begin(*entry)); feat; ++feat) {
3419  if (feat->IsSetData()) {
3420  if (feat->GetData().IsPub()) {
3421  RemoveSerials(feat->SetData().SetPub().SetPub().Set());
3422  if (feat->GetData().GetPub().GetPub().Get().empty())
3423  feat->SetData().SetPub().ResetPub();
3424  } else if (feat->GetData().IsImp()) {
3425  CImp_feat& imp = feat->SetData().SetImp();
3426  if (imp.IsSetKey() && imp.GetKey() == "Site-ref" && feat->IsSetCit() && feat->GetCit().IsPub()) {
3427  RemoveSerials(feat->SetCit().SetPub());
3428  if (feat->GetCit().GetPub().empty())
3429  feat->SetCit().Reset();
3430  }
3431  }
3432  }
3433  }
3434  }
3435  }
3436 }
3437 
3438 /*****************************************************************************/
3440 {
3441  const string* seq_str = nullptr;
3442  const vector<Char>* seq_vec = nullptr;
3443 
3445  size_t old_size = 0;
3446 
3447  switch (code) {
3448  case CSeq_data::e_Iupacaa:
3449  seq_str = &seq_data.GetIupacaa().Get();
3450  old_coding = CSeqUtil::e_Iupacaa;
3451  old_size = seq_str->size();
3452  break;
3453 
3454  case CSeq_data::e_Ncbi8aa:
3455  seq_vec = &seq_data.GetNcbi8aa().Get();
3456  old_coding = CSeqUtil::e_Ncbi8aa;
3457  old_size = seq_vec->size();
3458  break;
3459 
3461  seq_vec = &seq_data.GetNcbistdaa().Get();
3462  old_coding = CSeqUtil::e_Ncbistdaa;
3463  old_size = seq_vec->size();
3464  break;
3465 
3466  default:; // do nothing
3467  }
3468 
3469  std::vector<Char> new_seq(old_size);
3470  size_t new_size = 0;
3471  if (seq_str)
3472  new_size = CSeqConvert::Convert(seq_str->c_str(), old_coding, 0, static_cast<TSeqPos>(old_size), &new_seq[0], CSeqUtil::e_Ncbieaa);
3473  else if (seq_vec)
3474  new_size = CSeqConvert::Convert(&(*seq_vec)[0], old_coding, 0, static_cast<TSeqPos>(old_size), &new_seq[0], CSeqUtil::e_Ncbieaa);
3475 
3476  if (! new_seq.empty()) {
3477  seq_data.SetNcbieaa().Set().assign(new_seq.begin(), new_seq.begin() + new_size);
3478  }
3479 }
3480 
3481 /*****************************************************************************/
3482 static void RawBioseqPack(CBioseq& bioseq)
3483 {
3484  if (bioseq.GetInst().IsSetSeq_data()) {
3485  if (! bioseq.GetInst().IsSetMol() || ! bioseq.GetInst().IsNa()) {
3487  PackSeqData(code, bioseq.SetInst().SetSeq_data());
3488  } else if (! bioseq.GetInst().GetSeq_data().IsGap()) {
3489  CSeqportUtil::Pack(&bioseq.SetInst().SetSeq_data());
3490  }
3491  }
3492 }
3493 
3494 static void DeltaBioseqPack(CBioseq& bioseq)
3495 {
3496  if (bioseq.GetInst().IsSetExt() && bioseq.GetInst().GetExt().IsDelta()) {
3497  for (auto& delta : bioseq.SetInst().SetExt().SetDelta().Set()) {
3498  if (delta->IsLiteral() && delta->GetLiteral().IsSetSeq_data() && ! delta->GetLiteral().GetSeq_data().IsGap()) {
3499  CSeqportUtil::Pack(&delta->SetLiteral().SetSeq_data());
3500  }
3501  }
3502  }
3503 }
3504 
3505 /*****************************************************************************/
3506 void PackEntries(TEntryList& seq_entries)
3507 {
3508  for (auto& entry : seq_entries) {
3509  for (CTypeIterator<CBioseq> bioseq(Begin(*entry)); bioseq; ++bioseq) {
3510  if (bioseq->IsSetInst() && bioseq->GetInst().IsSetRepr()) {
3511  CSeq_inst::ERepr repr = bioseq->GetInst().GetRepr();
3512  if (repr == CSeq_inst::eRepr_raw || repr == CSeq_inst::eRepr_const)
3513  RawBioseqPack(*bioseq);
3514  else if (repr == CSeq_inst::eRepr_delta)
3515  DeltaBioseqPack(*bioseq);
3516  }
3517  }
3518  }
3519 }
3520 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
Data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
void StripECO(string &str)
Definition: add.cpp:2811
void xFixEMBLKeywords(string &keywordData)
Definition: asci_blk.cpp:1512
static void fta_check_mult_ids(DataBlkPtr dbp, const char *mtag, const char *ptag)
Definition: asci_blk.cpp:410
USING_SCOPE(objects)
static CSeq_inst::EMol SrchSegSeqMol(const TEntryList &entries)
Definition: asci_blk.cpp:2369
void BuildSubBlock(DataBlkPtr dbp, Int2 subtype, const char *subkw)
Definition: asci_blk.cpp:812
vector< string > genbankKeywords
Definition: gb_index.cpp:56
static void CleanUpSeqDescrPub(TEntryList &entries, std::set< CSeqdesc * > &to_clean)
Definition: asci_blk.cpp:2027
static bool CheckSegPub(const CPubdesc &pub, TEntryList &entries, std::set< CSeqdesc * > &same_pub_descr)
Definition: asci_blk.cpp:1938
static bool SwissProtIDPresent(const TEntryList &seq_entries)
Definition: asci_blk.cpp:3360
void GetGenBankSubBlock(const DataBlk &entry, size_t bases)
Definition: asci_blk.cpp:454
void StripSerialNumbers(TEntryList &seq_entries)
Definition: asci_blk.cpp:3406
static void RemoveSerials(TPubList &pubs)
Definition: asci_blk.cpp:3389
vector< string > emblKeywords
Definition: em_index.cpp:56
CRef< CSeq_id > StrToSeqId(const char *pch, bool pid)
Definition: asci_blk.cpp:2694
static void RemoveDescrByChoice(CSeq_descr &descr, Uint1 choice)
Definition: asci_blk.cpp:1992
static void GetSegPub(TEntryList &entries, CSeq_descr &descr)
Definition: asci_blk.cpp:2054
static void InsertDatablkVal(DataBlkPtr *dbp, Int2 type, char *offset, size_t len)
Definition: asci_blk.cpp:231
static optional< string > GetBioseqSetDescrTitle(const CSeq_descr &descr)
Definition: asci_blk.cpp:2141
static void fta_fix_secondaries(TokenBlkList &secs)
Definition: asci_blk.cpp:1272
static const CBioSource * GetTopBiosource(const CSeq_entry &entry)
Definition: asci_blk.cpp:2785
void AddNIDSeqId(CBioseq &bioseq, const DataBlk &entry, Int2 type, Int2 coldata, Parser::ESource source)
Definition: asci_blk.cpp:2726
void fta_fix_orgref_div(const CBioseq::TAnnot &annots, COrg_ref *org_ref, CGB_block &gbb)
Definition: asci_blk.cpp:3270
static void GetGenBankRefType(DataBlkPtr dbp, size_t bases)
Definition: asci_blk.cpp:337
static const char * GetMoleculeClassString(Uint1 mol)
Definition: asci_blk.cpp:2347
void DefVsHTGKeywords(CMolInfo::TTech tech, const DataBlk &entry, Int2 what, Int2 ori, bool cancelled)
Definition: asci_blk.cpp:2821
static bool CheckSegDescrChoice(const TEntryList &entries, Uint1 choice)
Definition: asci_blk.cpp:2083
static void CheckDivCode(TEntryList &seq_entries, ParserPtr pp)
Definition: asci_blk.cpp:2745
vector< string > swissProtKeywords
Definition: sp_index.cpp:54
CRef< CPatent_seq_id > MakeUsptoPatSeqId(const char *acc)
Definition: asci_blk.cpp:884
bool IsSegBioseq(const CSeq_id &id)
Definition: asci_blk.cpp:2540
void xGetGenBankSubBlocks(Entry &entry, size_t bases)
Definition: asci_blk.cpp:493
static void GetBioseqSetDescr(TEntryList &entries, CSeq_descr &descr, bool *drop)
Definition: asci_blk.cpp:2334
void fta_sort_seqfeat_cit(TEntryList &seq_entries)
Definition: asci_blk.cpp:3242
void PackEntries(TEntryList &seq_entries)
Definition: asci_blk.cpp:3506
static Int4 SrchSegLength(const TEntryList &entries)
Definition: asci_blk.cpp:2394
void fta_set_strandedness(TEntryList &seq_entries)
Definition: asci_blk.cpp:3341
static void fta_fix_tpa_keywords(TKeywordList &keywords)
Definition: asci_blk.cpp:1482
void CheckHTGDivision(const char *div, CMolInfo::TTech tech)
Definition: asci_blk.cpp:2946
unique_ptr< unsigned char[]> GetDNAConv(void)
Definition: asci_blk.cpp:1790
bool XMLCheckCDS(const char *entry, XmlIndexPtr xip)
Definition: asci_blk.cpp:3310
unique_ptr< unsigned char[]> GetProteinConv(void)
Definition: asci_blk.cpp:1818
static bool fta_if_pubs_sorted(const CPub_equiv &pub1, const CPub_equiv &pub2)
Definition: asci_blk.cpp:3149
static void GetFirstSegDescrChoice(CBioseq &bioseq, Uint1 choice, CSeq_descr &descr_new)
Definition: asci_blk.cpp:1862
static bool SeqEntryCheckTaxonDiv(const CSeq_entry &entry)
Definition: asci_blk.cpp:2797
bool fta_orgref_has_taxid(const COrg_ref::TDb &dbtags)
Definition: asci_blk.cpp:3258
char * GetDescrComment(char *offset, size_t len, Uint2 col_data, bool is_htg, bool is_pat)
Definition: asci_blk.cpp:1159
void GetSequenceOfKeywords(const DataBlk &entry, int type, Uint2 col_data, TKeywordList &keywords)
Definition: asci_blk.cpp:1551
static void CheckGBBlock(TSeqdescList &descrs, bool &got)
Definition: asci_blk.cpp:3028
static void CleanUpSeqDescrChoice(TEntryList &entries, Uint1 choice)
Definition: asci_blk.cpp:2011
static void CleanVisString(string &str)
Definition: asci_blk.cpp:2967
static void CleanVisStringList(list< string > &str_list)
Definition: asci_blk.cpp:3015
static bool pub_cmp(const CRef< CPub > &pub1, const CRef< CPub > &pub2)
Definition: asci_blk.cpp:3201
static void SrchSegDescr(TEntryList &entries, CSeq_descr &descr)
Definition: asci_blk.cpp:2185
void EntryCheckDivCode(TEntryList &seq_entries, ParserPtr pp)
Definition: asci_blk.cpp:2810
void GetEmblSubBlock(size_t bases, Parser::ESource source, const DataBlk &entry)
Definition: asci_blk.cpp:740
char * GetEmblBlock(DataBlkPtr *chain, char *ptr, short *retkw, Parser::EFormat format, char *eptr)
Definition: asci_blk.cpp:545
const char * magic_phrases[]
Definition: asci_blk.cpp:104
static void PackSeqData(CSeq_data::E_Choice code, CSeq_data &seq_data)
Definition: asci_blk.cpp:3439
static bool IsCitEmpty(const CCit_gen &cit)
Definition: asci_blk.cpp:3377
static Uint1 ValidSeqType(const char *accession, Uint1 type)
Definition: asci_blk.cpp:918
static bool descr_cmp(const CRef< CSeqdesc > &desc1, const CRef< CSeqdesc > &desc2)
Definition: asci_blk.cpp:3158
char * GetGenBankBlock(DataBlkPtr *chain, char *ptr, Int2 *retkw, char *eptr)
Definition: asci_blk.cpp:284
void GetSeqExt(ParserPtr pp, CSeq_loc &seq_loc)
Definition: asci_blk.cpp:2473
static bool GetSubNodeType(const char *subkw, char **retbptr, char *eptr)
Definition: asci_blk.cpp:648
static CRef< CBioseq > GetBioseq(ParserPtr pp, const TEntryList &entries, const CSeq_loc &slp)
Definition: asci_blk.cpp:2413
static void GetEmblRefType(size_t bases, Parser::ESource source, DataBlkPtr dbp)
Definition: asci_blk.cpp:684
CRef< CSeq_id > MakeAccSeqId(const char *acc, Uint1 seqtype, bool accver, Int2 vernum)
Definition: asci_blk.cpp:960
bool GetSeqData(ParserPtr pp, const DataBlk &entry, CBioseq &bioseq, Int4 nodetype, unsigned char *seqconv, Uint1 seq_data_type)
Definition: asci_blk.cpp:1678
void GetLenSubNode(DataBlkPtr dbp)
Definition: asci_blk.cpp:835
bool fta_EntryCheckGBBlock(TEntryList &seq_entries)
Definition: asci_blk.cpp:3113
char * SrchNodeSubType(const DataBlk &entry, Int2 type, Int2 subtype, size_t *len)
Definition: asci_blk.cpp:1039
void xGetGenBankBlocks(Entry &entry)
Definition: asci_blk.cpp:256
static void sort_feat_cit(CBioseq::TAnnot &annots)
Definition: asci_blk.cpp:3219
static void SetEmptyId(CBioseq &bioseq)
Definition: asci_blk.cpp:1062
CRef< CSeq_id > MakeLocusSeqId(const char *locus, CSeq_id::E_Choice seqtype)
Definition: asci_blk.cpp:990
void fta_sort_descr(TEntryList &seq_entries)
Definition: asci_blk.cpp:3185
static void DeltaBioseqPack(CBioseq &bioseq)
Definition: asci_blk.cpp:3494
void XMLDefVsHTGKeywords(CMolInfo::TTech tech, const char *entry, XmlIndexPtr xip, bool cancelled)
Definition: asci_blk.cpp:2889
void BuildBioSegHeader(ParserPtr pp, TEntryList &entries, const CSeq_loc &seqloc)
Definition: asci_blk.cpp:2498
static CRef< CSeq_id > MakeSegSetSeqId(const char *accession, const string &locus, Uint1 seqtype, bool is_tpa)
Definition: asci_blk.cpp:1008
static int GetSerialNumFromPubEquiv(const CPub_equiv &pub_eq)
Definition: asci_blk.cpp:3133
void GetExtraAccession(IndexblkPtr ibp, bool allow_uwsec, Parser::ESource source, TAccessionList &accessions)
Definition: asci_blk.cpp:1321
void ShrinkSpaces(char *line)
Definition: asci_blk.cpp:118
static void RawBioseqPack(CBioseq &bioseq)
Definition: asci_blk.cpp:3482
static bool TrimEmblFeatBlk(DataBlkPtr dbp)
Definition: asci_blk.cpp:604
static void GetSegSetDblink(CSeq_descr &descr, TEntryList &entries, bool *drop)
Definition: asci_blk.cpp:2214
static CSeq_descr::Tdata::const_iterator GetDescrByChoice(const CSeq_descr &descr, Uint1 choice)
Definition: asci_blk.cpp:1839
static bool SameCitation_PubEquivMatch_Logic(const CPub_equiv &a, const CPub_equiv &b)
Definition: asci_blk.cpp:1883
bool check_div(bool pat_acc, bool pat_ref, bool est_kwd, bool sts_kwd, bool gss_kwd, bool if_cds, string &div, CMolInfo::TTech *tech, size_t bases, Parser::ESource source, bool &drop)
Definition: asci_blk.cpp:2569
CRef< CBioseq > CreateEntryBioseq(ParserPtr pp)
Definition: asci_blk.cpp:1074
const CSeq_descr & GetDescrPointer(const CSeq_entry &entry)
Definition: asci_blk.cpp:2958
static void BuildFeatureBlock(DataBlkPtr dbp)
Definition: asci_blk.cpp:377
Int4 ScanSequence(bool warn, char **seqptr, std::vector< char > &bsp, unsigned char *conv, Char replacechar, int *numns)
Definition: asci_blk.cpp:1622
const char * GetEmblDiv(Uint1 num)
Definition: em_ascii.cpp:2401
TSeqPos GetLength(void) const
Definition: Bioseq.cpp:360
Definition: Date.hpp:53
ECompare Compare(const CDate &date) const
Definition: Date.cpp:83
@ eCompare_same
They're equivalent.
Definition: Date.hpp:75
Definition: Dbtag.hpp:53
@Imp_feat.hpp User-defined methods of the data storage class.
Definition: Imp_feat.hpp:54
void SetId8(TId8 value)
Definition: Object_id.cpp:175
Definition: Pub.hpp:56
@Pubdesc.hpp User-defined methods of the data storage class.
Definition: Pubdesc.hpp:54
CRef –.
Definition: ncbiobj.hpp:618
static SIZE_TYPE Convert(const CTempString &src, TCoding src_coding, TSeqPos pos, TSeqPos length, string &dst, TCoding dst_coding)
@ e_Ncbieaa
Definition: sequtil.hpp:57
@ e_not_set
Definition: sequtil.hpp:44
@ e_Ncbi8aa
Definition: sequtil.hpp:56
@ e_Ncbistdaa
Definition: sequtil.hpp:58
@ e_Iupacaa
Definition: sequtil.hpp:55
@Seq_descr.hpp User-defined methods of the data storage class.
Definition: Seq_descr.hpp:55
Definition: Seq_entry.hpp:56
static bool IsNa(EMol mol)
Definition: Seq_inst.hpp:90
Seq-loc iterator class – iterates all intervals from a seq-loc in the correct order.
Definition: Seq_loc.hpp:453
static TPair GetCodeIndexFromTo(CSeq_data::E_Choice code_type)
unsigned int TIndex
static const string & GetCode(CSeq_data::E_Choice code_type, TIndex idx)
pair< TIndex, TIndex > TPair
static TSeqPos Pack(CSeq_data *in_seq, TSeqPos uLength=ncbi::numeric_limits< TSeqPos >::max())
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
Template class for iteration on objects of class C (non-modifiable version)
Definition: iterator.hpp:767
char * mOffset
Definition: ftablock.h:329
size_t len
Definition: ftablock.h:330
CFlatFileData * mpData
Definition: ftablock.h:328
DataBlk * mpNext
Definition: ftablock.h:333
int mType
Definition: ftablock.h:327
@ ParFlat_OC
Definition: embl.h:61
@ ParFlat_RG
Definition: embl.h:66
@ ParFlat_RL
Definition: embl.h:69
@ ParFlat_RP
Definition: embl.h:64
@ ParFlat_RT
Definition: embl.h:68
@ ParFlat_OS
Definition: embl.h:48
@ ParFlat_OG
Definition: embl.h:62
@ ParFlat_RN
Definition: embl.h:49
@ ParFlat_RX
Definition: embl.h:65
@ ParFlat_RA
Definition: embl.h:67
@ ParFlat_RC
Definition: embl.h:63
@ ParFlat_FH
Definition: embl.h:52
#define ERR_DIVISION_NotMappedtoEST
Definition: flat2err.h:237
#define ERR_ACCESSION_UnusualWGS_Secondary
Definition: flat2err.h:175
#define ERR_DIVISION_ShouldBePAT
Definition: flat2err.h:256
#define ERR_DIVISION_MappedtoPAT
Definition: flat2err.h:224
#define ERR_DIVISION_MappedtoSTS
Definition: flat2err.h:225
#define ERR_SEQUENCE_TooShort
Definition: flat2err.h:155
#define ERR_SEQUENCE_TooShortIsPatent
Definition: flat2err.h:157
#define ERR_SEQUENCE_UnknownBaseHTG3
Definition: flat2err.h:147
#define ERR_DIVISION_LongGSSSequence
Definition: flat2err.h:246
#define ERR_SEGMENT_GPIDMissingOrNonUnique
Definition: flat2err.h:167
#define ERR_DIVISION_ESTHasCDSFeature
Definition: flat2err.h:236
#define ERR_DIVISION_PATHasGSSKeywords
Definition: flat2err.h:243
#define ERR_REFERENCE_MultipleIdentifiers
Definition: flat2err.h:313
#define ERR_DIVISION_MissingSTSKeywords
Definition: flat2err.h:228
#define ERR_DIVISION_MissingPatentRef
Definition: flat2err.h:229
#define ERR_SEQUENCE_BadResidue
Definition: flat2err.h:149
#define ERR_DIVISION_PATHasESTKeywords
Definition: flat2err.h:230
#define ERR_ACCESSION_ScfldHasWGSContigSec
Definition: flat2err.h:176
#define ERR_SEGMENT_PubMatch
Definition: flat2err.h:164
#define ERR_FORMAT_LineTypeOrder
Definition: flat2err.h:40
#define ERR_SEGMENT_DBLinkMissingOrNonUnique
Definition: flat2err.h:168
#define ERR_DIVISION_MappedtoGSS
Definition: flat2err.h:242
#define ERR_DIVISION_GSSHasCDSFeature
Definition: flat2err.h:240
#define ERR_DIVISION_MappedtoEST
Definition: flat2err.h:223
#define ERR_DEFINITION_HTGNotInProgress
Definition: flat2err.h:265
#define ERR_ACCESSION_WGSMasterAsSecondary
Definition: flat2err.h:174
#define ERR_DIVISION_STSHasCDSFeature
Definition: flat2err.h:233
#define ERR_FEATURE_NoFeatData
Definition: flat2err.h:325
#define ERR_SEGMENT_DiffMolType
Definition: flat2err.h:163
#define ERR_DIVISION_ShouldBeHTG
Definition: flat2err.h:238
#define ERR_DIVISION_MissingESTKeywords
Definition: flat2err.h:227
#define ERR_DIVISION_NotMappedtoGSS
Definition: flat2err.h:241
#define ERR_SEQUENCE_SeqLenNotEq
Definition: flat2err.h:148
#define ERR_DIVISION_PATHasCDSFeature
Definition: flat2err.h:232
#define ERR_DIVISION_MissingGSSKeywords
Definition: flat2err.h:239
#define ERR_DIVISION_NotMappedtoSTS
Definition: flat2err.h:234
#define ERR_DIVISION_LongSTSSequence
Definition: flat2err.h:245
#define ERR_DIVISION_GBBlockDivision
Definition: flat2err.h:247
#define ERR_SEQUENCE_AllNs
Definition: flat2err.h:156
#define ERR_ACCESSION_WGSWithNonWGS_Sec
Definition: flat2err.h:173
#define ERR_DIVISION_PATHasSTSKeywords
Definition: flat2err.h:231
#define ERR_DIVISION_LongESTSequence
Definition: flat2err.h:244
#define ERR_DEFINITION_HTGShouldBeComplete
Definition: flat2err.h:267
#define ERR_DIVISION_ESTHasSTSKeywords
Definition: flat2err.h:235
#define ERR_DIVISION_ShouldNotBeHTG
Definition: flat2err.h:250
list< CRef< objects::CSeq_entry > > TEntryList
#define INSDSEQ_DEFINITION
Definition: fta_xml.h:52
#define INSDSEQ_FEATURE_TABLE
Definition: fta_xml.h:68
#define INSDFEATURE_KEY
Definition: fta_xml.h:77
#define INSDSEQ_SEQUENCE
Definition: fta_xml.h:69
unique_ptr< string > XMLFindTagValue(const char *entry, const XmlIndex *xip, Int4 tag)
Definition: xm_index.cpp:214
std::list< std::string > TKeywordList
Definition: ftablock.h:163
std::list< CRef< objects::CPub > > TPubList
Definition: ftablock.h:63
forward_list< string > TokenBlkList
Definition: ftablock.h:134
std::list< CRef< objects::CSeqdesc > > TSeqdescList
Definition: ftablock.h:61
std::list< std::string > TAccessionList
Definition: ftablock.h:57
void MemSet(void *p, int n, size_t sz)
Definition: ftacpp.hpp:49
bool StringEquNI(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:131
bool StringEquN(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:121
bool StringEqu(const char *s1, const char *s2)
Definition: ftacpp.hpp:111
void StringNCpy(char *d, const char *s, size_t n)
Definition: ftacpp.hpp:90
void MemFree(char *p)
Definition: ftacpp.hpp:55
size_t StringLen(const char *s)
Definition: ftacpp.hpp:60
void MemCpy(void *p, const void *q, size_t sz)
Definition: ftacpp.hpp:50
char * StringNew(size_t sz)
Definition: ftacpp.hpp:43
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:51
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:56
static int type
Definition: getdata.c:31
static const char * str(char *buf, int n)
Definition: stats.c:84
static char tmp[3200]
Definition: utf8.c:42
int offset
Definition: replacements.h:160
static TDSICONV * conv
Definition: charconv.c:168
@ ParFlat_FEATBLOCK
Definition: genbank.h:72
@ ParFlat_AUTHORS
Definition: genbank.h:67
@ ParFlat_FEATURES
Definition: genbank.h:51
@ ParFlat_SOURCE
Definition: genbank.h:48
@ ParFlat_JOURNAL
Definition: genbank.h:70
@ ParFlat_STANDARD
Definition: genbank.h:71
@ ParFlat_REFERENCE
Definition: genbank.h:49
@ ParFlat_LOCUS
Definition: genbank.h:41
@ ParFlat_CONSRTM
Definition: genbank.h:68
@ ParFlat_END
Definition: genbank.h:54
@ ParFlat_ORGANISM
Definition: genbank.h:66
@ ParFlat_REMARK
Definition: genbank.h:74
@ ParFlat_MEDLINE
Definition: genbank.h:73
@ ParFlat_TITLE
Definition: genbank.h:69
@ ParFlat_PUBMED
Definition: genbank.h:75
#define SEV_INFO
Definition: gicache.c:89
#define SEV_WARNING
Definition: gicache.c:90
#define SEV_ERROR
Definition: gicache.c:91
#define SEV_REJECT
Definition: gicache.c:92
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
Definition: ncbimisc.hpp:1508
#define StringStr
Definition: ncbistr.hpp:322
#define StringSave
Definition: ncbistr.hpp:326
#define ErrPostStr
Definition: ncbierr.hpp:68
#define StringChr
Definition: ncbistr.hpp:317
#define ErrPostEx(sev, err_code,...)
Definition: ncbierr.hpp:78
TPrim & Set(void)
Definition: serialbase.hpp:351
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
const TPrim & Get(void) const
Definition: serialbase.hpp:347
virtual bool Equals(const CSerialObject &object, ESerialRecursionMode how=eRecursive) const
Check if both objects contain the same values.
static EAccessionInfo IdentifyAccession(const CTempString &accession, TParseFlags flags=fParse_AnyRaw)
Deduces information from a bare accession a la WHICH_db_accession; may report false negatives on prop...
Definition: Seq_id.cpp:1634
static E_Choice GetAccType(EAccessionInfo info)
Definition: Seq_id.hpp:562
void SetWhole(TWhole &v)
Definition: Seq_loc.hpp:982
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Override Assign() to incorporate cache invalidation.
Definition: Seq_loc.cpp:337
const_iterator end(void) const
Definition: Seq_loc.cpp:1034
const_iterator begin(void) const
Definition: Seq_loc.cpp:1028
void Add(const CSeq_loc &other)
Simple adding of seq-locs.
Definition: Seq_loc.cpp:3875
void SetNull(void)
Override all setters to incorporate cache invalidation.
Definition: Seq_loc.hpp:960
CBeginInfo Begin(C &obj)
Get starting point of object hierarchy.
Definition: iterator.hpp:1004
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
Definition: ncbiobj.hpp:2015
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
char Char
Alias for char.
Definition: ncbitype.h:93
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
uint16_t Uint2
2-byte (16-bit) unsigned integer
Definition: ncbitype.h:101
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3461
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2993
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
#define NPOS
Definition: ncbistr.hpp:133
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
Definition: ncbistr.cpp:3201
static SIZE_TYPE CommonSuffixSize(const CTempString s1, const CTempString s2)
Determine the common suffix of two strings.
Definition: ncbistr.hpp:5462
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2891
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
static void TrimSuffixInPlace(string &str, const CTempString suffix, ECase use_case=eCase)
Trim suffix from a string (in-place)
Definition: ncbistr.cpp:3278
void SetSource(const TSource &value)
Assign a value to Source data member.
Definition: GB_block_.hpp:488
TKeywords & SetKeywords(void)
Assign a value to Keywords data member.
Definition: GB_block_.hpp:532
bool IsSetExtra_accessions(void) const
Check if a value has been assigned to Extra_accessions data member.
Definition: GB_block_.hpp:442
void ResetKeywords(void)
Reset Keywords data member.
Definition: GB_block_.cpp:63
void ResetOrigin(void)
Reset Origin data member.
Definition: GB_block_.cpp:69
bool IsSetDiv(void) const
GenBank division Check if a value has been assigned to Div data member.
Definition: GB_block_.hpp:654
void ResetSource(void)
Reset Source data member.
Definition: GB_block_.cpp:57
void ResetDate(void)
Reset Date data member.
Definition: GB_block_.cpp:75
bool IsSetSource(void) const
source line Check if a value has been assigned to Source data member.
Definition: GB_block_.hpp:467
void SetDate(const TDate &value)
Assign a value to Date data member.
Definition: GB_block_.hpp:607
bool IsSetEntry_date(void) const
replaces date Check if a value has been assigned to Entry_date data member.
Definition: GB_block_.hpp:633
const TDiv & GetDiv(void) const
Get the Div member data.
Definition: GB_block_.hpp:666
TExtra_accessions & SetExtra_accessions(void)
Assign a value to Extra_accessions data member.
Definition: GB_block_.hpp:460
const TExtra_accessions & GetExtra_accessions(void) const
Get the Extra_accessions member data.
Definition: GB_block_.hpp:454
const TKeywords & GetKeywords(void) const
Get the Keywords member data.
Definition: GB_block_.hpp:526
bool IsSetOrigin(void) const
Check if a value has been assigned to Origin data member.
Definition: GB_block_.hpp:539
void SetDiv(const TDiv &value)
Assign a value to Div data member.
Definition: GB_block_.hpp:675
bool IsSetKeywords(void) const
Check if a value has been assigned to Keywords data member.
Definition: GB_block_.hpp:514
void ResetExtra_accessions(void)
Reset Extra_accessions data member.
Definition: GB_block_.cpp:51
void ResetDiv(void)
Reset Div data member.
Definition: GB_block_.cpp:98
bool IsSetDate(void) const
OBSOLETE old form Entry Date. Check if a value has been assigned to Date data member.
Definition: GB_block_.hpp:586
void SetOrigin(const TOrigin &value)
Assign a value to Origin data member.
Definition: GB_block_.hpp:560
bool IsSetPages(void) const
Check if a value has been assigned to Pages data member.
Definition: Cit_gen_.hpp:806
bool IsSetDate(void) const
Check if a value has been assigned to Date data member.
Definition: Cit_gen_.hpp:853
TSerial_number GetSerial_number(void) const
Get the Serial_number member data.
Definition: Cit_gen_.hpp:893
bool IsSetAuthors(void) const
Check if a value has been assigned to Authors data member.
Definition: Cit_gen_.hpp:623
bool IsSetVolume(void) const
Check if a value has been assigned to Volume data member.
Definition: Cit_gen_.hpp:712
const TCit & GetCit(void) const
Get the Cit member data.
Definition: Cit_gen_.hpp:588
bool IsSetSerial_number(void) const
For GenBank style references. Check if a value has been assigned to Serial_number data member.
Definition: Cit_gen_.hpp:874
bool IsSetCit(void) const
Anything, not parsable. Check if a value has been assigned to Cit data member.
Definition: Cit_gen_.hpp:576
bool IsSetTitle(void) const
Check if a value has been assigned to Title data member.
Definition: Cit_gen_.hpp:921
bool IsSetJournal(void) const
Check if a value has been assigned to Journal data member.
Definition: Cit_gen_.hpp:691
bool IsSetPmid(void) const
PubMed Id. Check if a value has been assigned to Pmid data member.
Definition: Cit_gen_.hpp:968
bool IsSetIssue(void) const
Check if a value has been assigned to Issue data member.
Definition: Cit_gen_.hpp:759
bool IsSetMuid(void) const
Medline uid. Check if a value has been assigned to Muid data member.
Definition: Cit_gen_.hpp:644
bool IsSetOrg(void) const
Check if a value has been assigned to Org data member.
Definition: BioSource_.hpp:497
const TOrg & GetOrg(void) const
Get the Org member data.
Definition: BioSource_.hpp:509
bool IsStr(void) const
Check if variant Str is selected.
Definition: Object_id_.hpp:291
bool CanGetType(void) const
Check if it is safe to call GetType method.
bool IsSetType(void) const
Type of object within class. Check if a value has been assigned to Type data member.
void SetTag(TTag &value)
Assign a value to Tag data member.
Definition: Dbtag_.cpp:66
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
const TType & GetType(void) const
Get the Type member data.
E_Choice Which(void) const
Which variant is currently selected.
Definition: Date_.hpp:271
void SetDb(const TDb &value)
Assign a value to Db data member.
Definition: Dbtag_.hpp:229
@ eLim_gt
greater than
Definition: Int_fuzz_.hpp:211
@ e_not_set
No variant selected.
Definition: Date_.hpp:127
vector< CRef< CDbtag > > TDb
Definition: Org_ref_.hpp:101
const TDb & GetDb(void) const
Get the Db member data.
Definition: Org_ref_.hpp:491
bool IsSetDiv(void) const
GenBank division code. Check if a value has been assigned to Div data member.
Definition: OrgName_.hpp:993
bool IsSetOrgname(void) const
Check if a value has been assigned to Orgname data member.
Definition: Org_ref_.hpp:529
void SetOrgname(TOrgname &value)
Assign a value to Orgname data member.
Definition: Org_ref_.cpp:87
const TOrgname & GetOrgname(void) const
Get the Orgname member data.
Definition: Org_ref_.hpp:541
bool IsSet(void) const
Check if a value has been assigned to data member.
Definition: Pub_equiv_.hpp:153
const Tdata & Get(void) const
Get the member data.
Definition: Pub_equiv_.hpp:165
E_Choice Which(void) const
Which variant is currently selected.
Definition: Pub_.hpp:555
const TGen & GetGen(void) const
Get the variant data.
Definition: Pub_.cpp:167
TMuid GetMuid(void) const
Get the variant data.
Definition: Pub_.hpp:608
bool IsMuid(void) const
Check if variant Muid is selected.
Definition: Pub_.hpp:602
bool IsGen(void) const
Check if variant Gen is selected.
Definition: Pub_.hpp:584
@ e_Gen
general or generic unparsed
Definition: Pub_.hpp:102
@ eSeq_code_type_iupacaa
IUPAC 1 letter amino acid code.
@ eSeq_code_type_iupacna
IUPAC 1 letter nuc acid code.
const TKey & GetKey(void) const
Get the Key member data.
Definition: Imp_feat_.hpp:259
bool IsSetKey(void) const
Check if a value has been assigned to Key data member.
Definition: Imp_feat_.hpp:247
bool IsSetAccession(void) const
Check if a value has been assigned to Accession data member.
const TName & GetName(void) const
Get the Name member data.
E_Choice
Choice variants.
Definition: Seq_id_.hpp:93
TLocal & SetLocal(void)
Select the variant.
Definition: Seq_id_.cpp:199
bool IsSetName(void) const
Check if a value has been assigned to Name data member.
@ e_Other
for historical reasons, 'other' = 'refseq'
Definition: Seq_id_.hpp:104
@ e_Tpe
Third Party Annot/Seq EMBL.
Definition: Seq_id_.hpp:111
@ e_Tpd
Third Party Annot/Seq DDBJ.
Definition: Seq_id_.hpp:112
@ e_Ddbj
DDBJ.
Definition: Seq_id_.hpp:107
@ e_Prf
PRF SEQDB.
Definition: Seq_id_.hpp:108
@ e_not_set
No variant selected.
Definition: Seq_id_.hpp:94
@ e_Tpg
Third Party Annot/Seq Genbank.
Definition: Seq_id_.hpp:110
@ e_Local
local use
Definition: Seq_id_.hpp:95
@ e_Pdb
PDB sequence.
Definition: Seq_id_.hpp:109
const TSeq & GetSeq(void) const
Get the variant data.
Definition: Seq_entry_.cpp:102
const TDescr & GetDescr(void) const
Get the Descr member data.
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
const TSet & GetSet(void) const
Get the variant data.
Definition: Seq_entry_.cpp:124
bool IsSeq(void) const
Check if variant Seq is selected.
Definition: Seq_entry_.hpp:257
void SetClass(TClass value)
Assign a value to Class data member.
void SetDescr(TDescr &value)
Assign a value to Descr data member.
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
TSeq_set & SetSeq_set(void)
Assign a value to Seq_set data member.
@ eClass_parts
parts for 2 or 3
@ eClass_segset
segmented sequence + parts
const TIupacaa & GetIupacaa(void) const
Get the variant data.
Definition: Seq_data_.hpp:530
list< CRef< CSeqdesc > > Tdata
Definition: Seq_descr_.hpp:91
TId & SetId(void)
Assign a value to Id data member.
Definition: Bioseq_.hpp:296
const TUser & GetUser(void) const
Get the variant data.
Definition: Seqdesc_.cpp:384
bool IsSetSeq_data(void) const
The sequence. Check if a value has been assigned to Seq_data data member.
Definition: Seq_inst_.hpp:805
ERepr
representation class
Definition: Seq_inst_.hpp:91
void SetPub(TPub &value)
Assign a value to Pub data member.
Definition: Pubdesc_.cpp:72
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
TTitle & SetTitle(void)
Select the variant.
Definition: Seqdesc_.hpp:1039
void SetExt(TExt &value)
Assign a value to Ext data member.
Definition: Seq_inst_.cpp:147
bool IsSetMol(void) const
Check if a value has been assigned to Mol data member.
Definition: Seq_inst_.hpp:593
const TPub & GetPub(void) const
Get the variant data.
Definition: Seqdesc_.cpp:356
const TNcbi8aa & GetNcbi8aa(void) const
Get the variant data.
Definition: Seq_data_.hpp:630
TNcbieaa & SetNcbieaa(void)
Select the variant.
Definition: Seq_data_.hpp:657
E_Choice
Choice variants.
Definition: Seq_data_.hpp:102
TTech GetTech(void) const
Get the Tech member data.
Definition: MolInfo_.hpp:497
bool IsSetExt(void) const
Extensions for special types. Check if a value has been assigned to Ext data member.
Definition: Seq_inst_.hpp:826
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
TMol GetMol(void) const
Get the Mol member data.
Definition: Seq_inst_.hpp:612
bool IsDelta(void) const
Check if variant Delta is selected.
Definition: Seq_ext_.hpp:336
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
const TNcbistdaa & GetNcbistdaa(void) const
Get the variant data.
Definition: Seq_data_.hpp:690
const TExt & GetExt(void) const
Get the Ext member data.
Definition: Seq_inst_.hpp:838
EMol
molecule class in living organism
Definition: Seq_inst_.hpp:108
bool IsPub(void) const
Check if variant Pub is selected.
Definition: Seqdesc_.hpp:1096
void SetDescr(TDescr &value)
Assign a value to Descr data member.
Definition: Bioseq_.cpp:65
bool IsSetPub(void) const
The citation(s). Check if a value has been assigned to Pub data member.
Definition: Pubdesc_.hpp:593
void SetRepr(TRepr value)
Assign a value to Repr data member.
Definition: Seq_inst_.hpp:574
list< CRef< CSeq_feat > > TFtable
Definition: Seq_annot_.hpp:193
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seqdesc_.hpp:903
void SetFuzz(TFuzz &value)
Assign a value to Fuzz data member.
Definition: Seq_inst_.cpp:113
Tdata & Set(void)
Assign a value to data member.
Definition: Seq_descr_.hpp:172
list< CRef< CSeq_annot > > TAnnot
Definition: Bioseq_.hpp:97
void SetLength(TLength value)
Assign a value to Length data member.
Definition: Seq_inst_.hpp:668
bool IsGap(void) const
Check if variant Gap is selected.
Definition: Seq_data_.hpp:704
const TPub & GetPub(void) const
Get the Pub member data.
Definition: Pubdesc_.hpp:605
const TSeq_data & GetSeq_data(void) const
Get the Seq_data member data.
Definition: Seq_inst_.hpp:817
const TDescr & GetDescr(void) const
Get the Descr member data.
Definition: Bioseq_.hpp:315
void SetMol(TMol value)
Assign a value to Mol data member.
Definition: Seq_inst_.hpp:621
bool IsUser(void) const
Check if variant User is selected.
Definition: Seqdesc_.hpp:1122
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_data_.hpp:475
@ eRepr_const
constructed sequence
Definition: Seq_inst_.hpp:96
@ eRepr_seg
segmented sequence
Definition: Seq_inst_.hpp:95
@ eRepr_delta
sequence made by changes (delta) to others
Definition: Seq_inst_.hpp:100
@ eRepr_raw
continuous sequence
Definition: Seq_inst_.hpp:94
@ eTech_htgs_2
ordered High Throughput sequence contig
Definition: MolInfo_.hpp:138
@ eTech_sts
Sequence Tagged Site.
Definition: MolInfo_.hpp:126
@ eTech_htgs_3
finished High Throughput sequence
Definition: MolInfo_.hpp:139
@ eTech_htgs_1
unordered High Throughput sequence contig
Definition: MolInfo_.hpp:137
@ eTech_tsa
transcriptome shotgun assembly
Definition: MolInfo_.hpp:146
@ eTech_survey
one-pass genomic sequence
Definition: MolInfo_.hpp:127
@ eTech_htgs_0
single genomic reads for coordination
Definition: MolInfo_.hpp:141
@ eTech_est
Expressed Sequence Tag.
Definition: MolInfo_.hpp:125
@ e_Ncbistdaa
consecutive codes for std aas
Definition: Seq_data_.hpp:113
@ e_Iupacaa
IUPAC 1 letter amino acid code.
Definition: Seq_data_.hpp:105
@ e_Ncbi8aa
8 bit extended amino acid codes
Definition: Seq_data_.hpp:110
@ e_Org
if all from one organism
Definition: Seqdesc_.hpp:116
@ e_Update_date
date of last update
Definition: Seqdesc_.hpp:129
@ e_Modif
modifiers
Definition: Seqdesc_.hpp:112
@ eMol_na
just a nucleic acid
Definition: Seq_inst_.hpp:113
@ eStrand_ds
double strand
Definition: Seq_inst_.hpp:136
@ eStrand_ss
single strand
Definition: Seq_inst_.hpp:135
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
@ ParFlat_REF_BTW
Definition: index.h:61
@ ParFlat_REF_NO_TARGET
Definition: index.h:63
@ ParFlat_COL_FEATKEY
Definition: index.h:65
@ ParFlat_REF_SITES
Definition: index.h:62
@ ParFlat_REF_END
Definition: index.h:60
CSeq_id::E_Choice GetNucAccOwner(const CTempString &acc)
Definition: indx_blk.cpp:2250
int fta_if_wgs_acc(string_view accession)
Definition: indx_blk.cpp:1189
void DelNonDigitTail(string &str)
Definition: indx_blk.cpp:957
char * buf
int i
yy_size_t n
int len
static void text(MDB_val *v)
Definition: mdb_dump.c:62
range(_Ty, _Ty) -> range< _Ty >
const struct ncbi::grid::netcache::search::fields::SIZE size
const struct ncbi::grid::netcache::search::fields::KEY key
const CharType(& source)[N]
Definition: pointer.h:1149
int strcmp(const char *str1, const char *str2)
Definition: odbc_utils.hpp:160
unsigned int a
Definition: ncbi_localip.c:102
const char * tag
int isalpha(Uchar c)
Definition: ncbictype.hpp:61
int tolower(Uchar c)
Definition: ncbictype.hpp:72
int isdigit(Uchar c)
Definition: ncbictype.hpp:64
static Format format
Definition: njn_ioutil.cpp:53
Int4 delta(size_t dimension_, const Int4 *score_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
@ ParFlatSP_KW
Definition: sprot.h:52
DataBlkPtr chain
Definition: ftablock.h:341
Definition: entry.h:57
list< SectionPtr > mSections
Definition: entry.h:99
string mBaseData
Definition: entry.h:98
Char acnum[200]
Definition: ftablock.h:166
Char division[4]
Definition: ftablock.h:171
bool is_mga
Definition: ftablock.h:199
TokenBlkList secaccs
Definition: ftablock.h:216
Char blocusname[200]
Definition: ftablock.h:178
Int2 vernum
Definition: ftablock.h:167
bool is_tpa
Definition: ftablock.h:206
bool embl_new_ID
Definition: ftablock.h:218
bool is_prot
Definition: ftablock.h:222
bool is_contig
Definition: ftablock.h:197
bool is_pat
Definition: ftablock.h:202
bool drop
Definition: ftablock.h:182
size_t bases
Definition: ftablock.h:172
string wgssec
Definition: ftablock.h:236
Char locusname[200]
Definition: ftablock.h:170
XmlIndexPtr xip
Definition: ftablock.h:217
vector< IndexblkPtr > entrylist
Definition: entry.h:13
size_t start
Definition: ftablock.h:152
XmlIndex * next
Definition: ftablock.h:158
XmlIndex * subtags
Definition: ftablock.h:157
size_t end
Definition: ftablock.h:153
Int4 tag
Definition: ftablock.h:150
Definition: inftrees.h:24
Definition: type.c:6
done
Definition: token1.c:1
int SrchKeyword(const CTempString &ptr, const vector< string > &keywordList)
Definition: utilfun.cpp:898
char * SrchTheChar(char *bptr, char *eptr, Char letter)
Definition: utilfun.cpp:759
bool SetTextId(Uint1 seqtype, CSeq_id &seqId, CTextseq_id &textId)
Definition: utilfun.cpp:1508
string GetBlkDataReplaceNewLine(string_view instr, Uint2 indent)
Definition: utilfun.cpp:644
bool fta_is_tpa_keyword(const char *str)
Definition: utilfun.cpp:1171
void CleanTailNoneAlphaCharInString(string &str)
Definition: utilfun.cpp:683
char * xSrchNodeType(const DataBlk &entry, Int4 type, size_t *len)
Definition: utilfun.cpp:964
string xGetNodeData(const DataBlk &entry, int nodeType)
Definition: utilfun.cpp:978
void fta_StringCpy(char *dst, const char *src)
Definition: utilfun.cpp:1497
DataBlkPtr TrackNodeType(const DataBlk &entry, Int2 type)
Definition: utilfun.cpp:995
char * SrchTheStr(char *bptr, char *eptr, const char *leadstr)
Definition: utilfun.cpp:779
#define ParFlat_UNKW
Definition: utilfun.h:44
static wxAcceleratorEntry entries[3]
Modified on Thu May 30 12:20:48 2024 by modify_doxy.py rev. 669887