NCBI C++ ToolKit
asci_blk.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: asci_blk.cpp 102982 2024-08-15 12:44:06Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * File Name: asci_blk.cpp
27  *
28  * Author: Karl Sirotkin, Hsiu-Chuan Chen
29  *
30  * File Description:
31  * Common for all formats function processing ascii blocks to asn.
32  *
33  */
34 
35 #include <ncbi_pch.hpp>
36 
37 #include <set>
38 
39 #include "ftacpp.hpp"
40 
46 #include <objects/seq/Bioseq.hpp>
48 #include <objects/seq/Seq_inst.hpp>
50 #include <objects/seq/Seq_data.hpp>
52 #include <objects/seq/Seq_ext.hpp>
54 #include <objects/seq/Seg_ext.hpp>
59 #include <objects/general/Date.hpp>
60 #include <objects/seq/Pubdesc.hpp>
63 #include <objects/pub/Pub.hpp>
68 #include <objects/pub/Pub_set.hpp>
74 #include <serial/iterator.hpp>
77 #include <objects/seq/MolInfo.hpp>
78 
79 #include "index.h"
80 #include "genbank.h"
81 #include "embl.h"
82 #include "sprot.h"
83 
85 
86 #include "ftaerr.hpp"
87 #include "indx_blk.h"
88 #include "asci_blk.h"
89 #include "utilfun.h"
90 #include "fta_xml.h"
91 
92 #include "add.h"
93 
94 #ifdef THIS_FILE
95 # undef THIS_FILE
96 #endif
97 #define THIS_FILE "asci_blk.cpp"
98 
99 #define Seq_descr_pub_same 50
100 
103 
/* Null-terminated table of fixed phrases; iterate until the nullptr
 * sentinel is reached.  The consumers of this table are not visible in
 * this part of the file. */
const char* magic_phrases[] = {
    "*** SEQUENCING IN PROGRESS ***",
    "***SEQUENCING IN PROGRESS***",
    "WORKING DRAFT SEQUENCE",
    "LOW-PASS SEQUENCE SAMPLING",
    "*** IN PROGRESS ***",
    nullptr
};
112 
113 extern vector<string> genbankKeywords;
114 extern vector<string> emblKeywords;
115 extern vector<string> swissProtKeywords;
116 
117 /**********************************************************/
118 void ShrinkSpaces(char* line)
119 {
120  char* p;
121  char* q;
122  bool got_nl;
123 
124  if (! line || *line == '\0')
125  return;
126 
127  for (p = line; *p != '\0'; p++) {
128  if (*p == '\t')
129  *p = ' ';
130  if ((*p == ',' && p[1] == ',') || (*p == ';' && p[1] == ';'))
131  p[1] = ' ';
132  if ((p[1] == ',' || p[1] == ';') && p[0] == ' ') {
133  p[0] = p[1];
134  p[1] = ' ';
135  }
136  }
137 
138  for (p = line, q = line; *p != '\0';) {
139  *q = *p;
140  if (*p == ' ' || *p == '\n') {
141  for (got_nl = false; *p == ' ' || *p == '\n'; p++) {
142  if (*p == '\n')
143  got_nl = true;
144  }
145 
146  if (got_nl)
147  *q = '\n';
148  } else
149  p++;
150  q++;
151  }
152  if (q > line) {
153  for (q--; q > line && (*q == ' ' || *q == ';' || *q == '\n');)
154  q--;
155  if (*q != ' ' && *q != ';' && *q != '\n')
156  q++;
157  }
158  *q = '\0';
159 
160  for (p = line; *p == ' ' || *p == ';' || *p == '\n';)
161  p++;
162  if (p > line)
163  fta_StringCpy(line, p);
164 }
165 
/* std::string counterpart of ShrinkSpaces(char*): converts tabs to blanks,
 * blanks out the second of a doubled ',' / ';', moves a blank that precedes
 * ',' or ';' behind it, collapses each run of blanks/newlines to a single
 * character ('\n' if the run held a newline, otherwise the run's first
 * character) and finally strips blanks, ';' and newlines from both ends. */
void ShrinkSpaces(string& line)
{
    if (line.empty())
        return;

    const size_t total = line.size();

    /* Pass 1: tabs and separator clean-up. */
    for (size_t pos = 0; pos < total; ++pos) {
        if (line[pos] == '\t')
            line[pos] = ' ';
        if (pos + 1 == total)
            break; /* the last character has no successor to inspect */

        char& cur  = line[pos];
        char& next = line[pos + 1];
        if ((cur == ',' || cur == ';') && next == cur)
            next = ' ';
        if (cur == ' ' && (next == ',' || next == ';')) {
            cur  = next;
            next = ' ';
        }
    }

    /* Pass 2: collapse blank/newline runs, compacting towards the front. */
    size_t out = 0;
    size_t in  = 0;
    while (in < total) {
        char ch = line[in++];
        if (ch == ' ' || ch == '\n') {
            while (in < total && (line[in] == ' ' || line[in] == '\n')) {
                if (line[in] == '\n')
                    ch = '\n';
                ++in;
            }
        }
        line[out++] = ch;
    }
    line.resize(out);

    /* Pass 3: strip blanks, ';' and newlines from the end, then the front. */
    static const char kEdgeChars[] = " ;\n";

    const size_t lastKept = line.find_last_not_of(kEdgeChars);
    if (lastKept == string::npos) {
        line.clear();
        return;
    }
    line.erase(lastKept + 1);

    const size_t firstKept = line.find_first_not_of(kEdgeChars);
    if (firstKept > 0)
        line.erase(0, firstKept);
}
219 
220 /**********************************************************
221  *
222  * static void InsertDatablkVal(dbp, type, offset, len):
223  *
224  * Allocate a memory, then assign data-block value
225  * to a new node.
226  * dbp points to the new node if dbp is NULL.
227  *
228  * 3-18-93
229  *
230  **********************************************************/
231 static void InsertDatablkVal(DataBlkPtr* dbp, Int2 type, char* offset, size_t len)
232 {
233  DataBlk* ldp = new DataBlk(*dbp, type, offset, len);
234  if (! *dbp) {
235  *dbp = ldp;
236  }
237 }
238 
239 /**********************************************************
240  *
241  * char* GetGenBankBlock(chain, ptr, retkw, eptr):
242  *
243  * Enters knowing current keyword.type and offset,
244  * finds the length of the current keyword block,
245  * and builds the block to "chain".
246  * Since each keyword block always starts in the first
247  * column of a line, the loop stops when it finds the
248  * first character other than a blank, newline, or tab
249  * right after a newline character.
250  * Each data block is appended to the "chain".
251  * Returns a pointer to the next keyword block.
252  *
253  * 3-21-93
254  *
255  **********************************************************/
257 {
258  vector<string> lines;
259  NStr::Split(entry.mBaseData, "\n", lines);
260 
261  vector<string> sectionLines;
262  int currentKw = ParFlat_LOCUS;
263  int nextKw;
264  string sectionText;
265  for (const string& line : lines) {
266  nextKw = SrchKeyword(line, genbankKeywords);
267  if (nextKw == ParFlat_UNKW) {
268  nextKw = currentKw;
269  }
270  if (nextKw != currentKw || NStr::StartsWith(line, "REFERENCE")) {
271  auto* secPtr = new Section(currentKw, sectionLines);
272  // secPtr->DumpText(cerr);
273  entry.mSections.push_back(secPtr);
274  currentKw = nextKw;
275  sectionLines.clear();
276  sectionLines.push_back(line);
277  continue;
278  }
279  sectionLines.push_back(line);
280  }
281  entry.mSections.push_back(new Section(currentKw, sectionLines));
282 }
283 
284 char* GetGenBankBlock(DataBlkPtr* chain, char* ptr, Int2* retkw, char* eptr)
285 {
286  char* offset;
287  int curkw;
288  int nextkw;
289  Int4 len;
290 
291  len = 0;
292  offset = ptr;
293  curkw = *retkw;
294 
295  do /* repeat loop until it finds next key-word */
296  {
297  for (; ptr < eptr && *ptr != '\n'; ptr++)
298  len++;
299  if (ptr >= eptr)
300  return (ptr);
301 
302  ++ptr; /* newline character */
303  ++len;
304 
305  nextkw = SrchKeyword(CTempString(ptr, eptr - ptr), genbankKeywords);
306  if (nextkw == ParFlat_UNKW) /* it can be "XX" line,
307  treat as same line */
308  nextkw = curkw;
309 
310  if (StringEquN(ptr, "REFERENCE", 9)) /* treat as one block */
311  break;
312  } while (nextkw == curkw);
313 
314  nextkw = SrchKeyword(ptr, genbankKeywords);
315 
316  InsertDatablkVal(chain, curkw, offset, len);
317  *retkw = nextkw;
318  return (ptr);
319 }
320 
321 
322 /**********************************************************
323  *
324  * static void GetGenBankRefType(dbp, bases):
325  *
326  * Check the data in the "REFERENCE" line,
327  * - ParFlat_REF_END if it contains
328  * "(bases 1 to endbases)", pub for "descr"
329  * or no base range at all;
330  * - ParFlat_REF_SITES if it contains "(sites)",
331  * for ImpFeatPub;
332  * - ParFlat_REF_BTW, otherwise, for SeqFeatPub.
333  *
334  * 5-19-93
335  *
336  **********************************************************/
/* Classifies a REFERENCE block by writing dbp->mType; "bases" is the length
 * that is expected inside the "(bases 1 to N)" / "(residues 1 to Naa)"
 * patterns built below. */
337 static void GetGenBankRefType(DataBlkPtr dbp, size_t bases)
338 {
339  char* bptr;
340  char* eptr;
341 
342  bptr = dbp->mOffset;
343  eptr = bptr + dbp->len;
344 
/* the three spellings that denote a reference covering the whole sequence */
345  const string s = to_string(bases);
346  const string str = "(bases 1 to " + s + ")";
347  const string str1 = "(bases 1 to " + s + ";";
348  const string str2 = "(residues 1 to " + s + "aa)";
349 
/* copy of the complete block text; all pattern searches below run on it */
350  string ref(bptr, bptr + dbp->len);
351 
/* advance to the first '(' on the first line, or to the end of that line */
352  while (bptr < eptr && *bptr != '\n' && *bptr != '(')
353  bptr++;
/* NOTE(review): this loop has no bptr < eptr bound */
354  while (*bptr == ' ')
355  bptr++;
356 
/* NOTE(review): the statement executed when the first line ends without a
 * '(' is not present in this listing (embedded numbering jumps from 357 to
 * 359) - restore it from the original file before relying on this branch */
357  if (*bptr == '\n')
359  else if (NStr::Find(ref, str) != NPOS || NStr::Find(ref, str1) != NPOS ||
360  NStr::Find(ref, str2) != NPOS)
361  dbp->mType = ParFlat_REF_END;
362  else if (NStr::Find(ref, "(sites)") != NPOS)
363  dbp->mType = ParFlat_REF_SITES;
364  else
365  dbp->mType = ParFlat_REF_BTW;
366 }
367 
368 /**********************************************************
369  *
370  * static void BuildFeatureBlock(dbp):
371  *
372  * The feature key in column 6-20.
373  *
374  * 5-3-93
375  *
376  **********************************************************/
378 {
379  char* bptr;
380  char* eptr;
381  char* ptr;
382  bool skip;
383 
384  bptr = dbp->mOffset;
385  eptr = bptr + dbp->len;
386  ptr = SrchTheChar(bptr, eptr, '\n');
387 
388  if (! ptr)
389  return;
390 
391  bptr = ptr + 1;
392 
393  while (bptr < eptr) {
394  InsertDatablkVal(reinterpret_cast<DataBlk**>(&dbp->mpData), ParFlat_FEATBLOCK, bptr, eptr - bptr);
395 
396  do {
397  bptr = SrchTheChar(bptr, eptr, '\n');
398  bptr++;
399 
400  skip = false;
401  if (! StringEquN(bptr, "XX", 2))
402  ptr = bptr + ParFlat_COL_FEATKEY;
403  else
404  skip = true;
405  } while ((*ptr == ' ' && ptr < eptr) || skip);
406  }
407 }
408 
409 /**********************************************************/
410 static void fta_check_mult_ids(DataBlkPtr dbp, const char* mtag, const char* ptag)
411 {
412  char* p;
413  Char ch;
414  Int4 muids;
415  Int4 pmids;
416 
417  if (! dbp || ! dbp->mOffset || (! mtag && ! ptag))
418  return;
419 
420  ch = dbp->mOffset[dbp->len];
421  dbp->mOffset[dbp->len] = '\0';
422 
423  size_t mlen = mtag ? StringLen(mtag) : 0;
424  size_t plen = ptag ? StringLen(ptag) : 0;
425 
426  muids = 0;
427  pmids = 0;
428  for (p = dbp->mOffset;; p++) {
429  p = StringChr(p, '\n');
430  if (! p)
431  break;
432  if (mtag && StringEquN(p + 1, mtag, mlen))
433  muids++;
434  else if (ptag && StringEquN(p + 1, ptag, plen))
435  pmids++;
436  }
437  dbp->mOffset[dbp->len] = ch;
438 
439  if (muids > 1) {
440  ErrPostStr(SEV_ERROR, ERR_REFERENCE_MultipleIdentifiers, "Reference has multiple MEDLINE identifiers. Ignoring all but the first.");
441  }
442  if (pmids > 1) {
443  ErrPostStr(SEV_ERROR, ERR_REFERENCE_MultipleIdentifiers, "Reference has multiple PUBMED identifiers. Ignoring all but the first.");
444  }
445 }
446 
447 /**********************************************************
448  *
449  * void GetGenBankSubBlock(entry, bases):
450  *
451  * 4-7-93
452  *
453  **********************************************************/
454 void GetGenBankSubBlock(const DataBlk& entry, size_t bases)
455 {
456  DataBlkPtr dbp;
457 
458  dbp = TrackNodeType(entry, ParFlat_SOURCE);
459  if (dbp) {
460  BuildSubBlock(dbp, ParFlat_ORGANISM, " ORGANISM");
461  GetLenSubNode(dbp);
462  }
463 
464  dbp = TrackNodeType(entry, ParFlat_REFERENCE);
465  for (; dbp; dbp = dbp->mpNext) {
466  if (dbp->mType != ParFlat_REFERENCE)
467  continue;
468 
469  fta_check_mult_ids(dbp, " MEDLINE", " PUBMED");
470  BuildSubBlock(dbp, ParFlat_AUTHORS, " AUTHORS");
471  BuildSubBlock(dbp, ParFlat_CONSRTM, " CONSRTM");
472  BuildSubBlock(dbp, ParFlat_TITLE, " TITLE");
473  BuildSubBlock(dbp, ParFlat_JOURNAL, " JOURNAL");
474  BuildSubBlock(dbp, ParFlat_MEDLINE, " MEDLINE");
475  BuildSubBlock(dbp, ParFlat_PUBMED, " PUBMED");
476  BuildSubBlock(dbp, ParFlat_STANDARD, " STANDARD");
477  BuildSubBlock(dbp, ParFlat_REMARK, " REMARK");
478  GetLenSubNode(dbp);
479  GetGenBankRefType(dbp, bases);
480  }
481 
482  dbp = TrackNodeType(entry, ParFlat_FEATURES);
483  for (; dbp; dbp = dbp->mpNext) {
484  if (dbp->mType != ParFlat_FEATURES)
485  continue;
486 
487  BuildFeatureBlock(dbp);
488  GetLenSubNode(dbp);
489  }
490 }
491 
492 // ----------------------------------------------------------------------------
493 void xGetGenBankSubBlocks(Entry& entry, size_t bases)
494 // ----------------------------------------------------------------------------
495 {
496  for (auto secPtr : entry.mSections) {
497  auto secType = secPtr->mType;
498  if (secType == ParFlat_SOURCE) {
499  secPtr->xBuildSubBlock(ParFlat_ORGANISM, " ORGANISM");
500  // GetLenSubNode(dbp);
501  }
502  if (secType == ParFlat_REFERENCE) {
503  // fta_check_mult_ids(dbp, " MEDLINE", " PUBMED");
504  secPtr->xBuildSubBlock(ParFlat_AUTHORS, " AUTHORS");
505  secPtr->xBuildSubBlock(ParFlat_CONSRTM, " CONSRTM");
506  secPtr->xBuildSubBlock(ParFlat_TITLE, " TITLE");
507  secPtr->xBuildSubBlock(ParFlat_JOURNAL, " JOURNAL");
508  secPtr->xBuildSubBlock(ParFlat_MEDLINE, " MEDLINE");
509  secPtr->xBuildSubBlock(ParFlat_PUBMED, " PUBMED");
510  secPtr->xBuildSubBlock(ParFlat_STANDARD, " STANDARD");
511  secPtr->xBuildSubBlock(ParFlat_REMARK, " REMARK");
512  // GetLenSubNode(dbp);
513  // GetGenBankRefType(dbp, bases);
514  }
515  if (secType == ParFlat_FEATURES) {
516  secPtr->xBuildFeatureBlocks();
517  // GetLenSubNode(dbp);
518  }
519  }
520 }
521 
522 /**********************************************************
523  *
524  * char* GetEmblBlock(chain, ptr, retkw, format, eptr):
525  *
526  * Enters knowing current keyword.type and offset,
527  * finds the length of the current keyword block, and
528  * builds the block to "chain".
529  * Loop will continue until it finds the next keyword
530  * or next "RN" after the newline character.
531  * Each data block will append to the "chain".
532  * Return a pointer points to next key-word block.
533  *
534  * 3-21-93
535  *
536  * The OS block can be
537  * - OS OS OC OC XX OG ==> the normal case
538  * or
539  * - OS OC OC XX OS OS OC OC XX OG ==> a hybrid
540  * In the second case two OS blocks have to be made.
541  *
542  * 12-15-93
543  *
544  **********************************************************/
/* Measures the EMBL keyword block starting at ptr, registers it in "chain"
 * via InsertDatablkVal and returns the position of the following block.
 * *retkw holds the current keyword on entry and the next keyword on return;
 * when the end of the text (eptr) is reached first, *retkw is set to
 * ParFlat_END and nothing is added to the chain. */
545 char* GetEmblBlock(DataBlkPtr* chain, char* ptr, short* retkw, Parser::EFormat format, char* eptr)
546 {
547  char* offset;
548  Int2 curkw;
549  Int2 nextkw;
/* set once an "OC" line has been seen inside this block */
550  bool seen_oc = false;
551 
552  size_t len = 0;
553  offset = ptr;
554  curkw = *retkw;
555 
556  do /* repeat loop until it finds next key-word */
557  {
/* consume the rest of the current line, counting its characters */
558  for (; ptr < eptr && *ptr != '\n'; ptr++)
559  len++;
560  if (ptr >= eptr) {
561  *retkw = ParFlat_END;
562  return (ptr);
563  }
564  ++ptr; /* newline character */
565  ++len;
566 
/* NOTE(review): the second argument line of this call is not present in
 * this listing (embedded numbering jumps from 568 to 570) - restore it from
 * the original file */
567  nextkw = SrchKeyword(
568  CTempString(ptr, eptr - ptr),
570  if (nextkw == ParFlat_UNKW) /* it can be "XX" line,
571  treat as same line */
572  nextkw = curkw;
573  if (StringEquN(ptr, "RN", 2)) /* treat each RN per block */
574  break;
575  if (StringEquN(ptr, "ID", 2)) /* treat each ID per block */
576  break;
577 
578  if (StringEquN(ptr, "OC", 2))
579  seen_oc = true;
580 
/* an OS line that follows OC lines starts a second OS block */
581  if (StringEquN(ptr, "OS", 2) && seen_oc)
582  break; /* treat as next OS block */
583 
584  } while (nextkw == curkw);
585 
586  InsertDatablkVal(chain, curkw, offset, len);
587 
588  *retkw = nextkw;
589  return (ptr);
590 }
591 
592 /**********************************************************
593  *
594  * static bool TrimEmblFeatBlk(dbp):
595  *
596  * Routine return TRUE if found FT data.
597  * The routine do the following things:
598  * - only leave last one FH line;
599  * - replace all "FT" to " " in the beginning of line.
600  *
601  * 6-15-93
602  *
603  **********************************************************/
604 static bool TrimEmblFeatBlk(DataBlkPtr dbp)
605 {
606  char* bptr;
607  char* eptr;
608  char* ptr;
609  bool flag = false;
610 
611  bptr = dbp->mOffset;
612  eptr = bptr + dbp->len;
613  ptr = SrchTheChar(bptr, eptr, '\n');
614 
615  while (ptr && ptr + 1 < eptr) {
616  if (ptr[2] == 'H') {
617  dbp->len = dbp->len - (ptr - dbp->mOffset + 1);
618  dbp->mOffset = ptr + 1;
619 
620  bptr = dbp->mOffset;
621  eptr = bptr + dbp->len;
622  } else {
623  bptr = ptr + 1;
624 
625  if (bptr[1] == 'T') {
626  flag = true;
627  *bptr = ' ';
628  bptr[1] = ' ';
629  }
630  }
631 
632  ptr = SrchTheChar(bptr, eptr, '\n');
633  }
634 
635  return (flag);
636 }
637 
638 /**********************************************************
639  *
640  * static bool GetSubNodeType(subkw, retbptr, eptr):
641  *
642  * Return TRUE and memory location which has
643  * the "subkw".
644  *
645  * 6-15-93
646  *
647  **********************************************************/
648 static bool GetSubNodeType(const char* subkw, char** retbptr, char* eptr)
649 {
650  char* bptr;
651  char* ptr;
652 
653  bptr = *retbptr;
654  size_t sublen = StringLen(subkw);
655 
656  while (bptr < eptr) {
657  if (StringEquN(bptr, subkw, sublen)) {
658  *retbptr = bptr;
659  return true;
660  }
661 
662  ptr = SrchTheChar(bptr, eptr, '\n');
663  if (ptr)
664  bptr = ptr;
665  bptr++;
666  }
667 
668  *retbptr = bptr;
669  return false;
670 }
671 
672 /**********************************************************
673  *
674  * static void GetEmblRefType(bases, source, dbp):
675  *
676  * If there is no "RP" line, default, or there is "RP"
677  * line and it contains "1-endbases", then
678  * type = ParFlat_REF_END, pub for "descr".
679  * Otherwise, ParFlat_REF_BTW, for SeqFeatPub.
680  *
681  * 6-15-93
682  *
683  **********************************************************/
/* Sets dbp->mType for an EMBL reference block, based on its "RP" line, the
 * sequence length "bases" and the data source. */
684 static void GetEmblRefType(size_t bases, Parser::ESource source, DataBlkPtr dbp)
685 {
686  char* ptr;
687  char* bptr;
688  char* eptr;
689  char* sptr;
690 
691  bptr = dbp->mOffset;
692  eptr = bptr + dbp->len;
693 
/* NOTE(review): the first branch of this if/else is not present in this
 * listing (embedded numbering jumps from 694 to 697); only the fallback
 * that sets ParFlat_REF_END when there is no "RP" line is visible */
694  if (! GetSubNodeType("RP", &bptr, eptr)) {
697  else
698  dbp->mType = ParFlat_REF_END;
699  return;
700  }
701 
/* bptr now points at the "RP" line; a range starting at 1 and ending at
 * "bases" means the reference covers the whole sequence */
702  const string str = " 1-" + to_string(bases);
703  ptr = SrchTheStr(bptr, eptr, str.c_str());
704  if (ptr) {
705  dbp->mType = ParFlat_REF_END;
706  return;
707  }
708 
709  if (source == Parser::ESource::EMBL) {
710  ptr = SrchTheStr(bptr, eptr, " 0-0");
711  if (ptr) {
/* NOTE(review): the statement for the " 0-0" case is not present in this
 * listing (embedded numbering jumps from 711 to 713) */
713  return;
714  }
715  }
716 
717  dbp->mType = ParFlat_REF_BTW;
718  if (source == Parser::ESource::NCBI) {
/* limit the "sites" search to the text in front of the next 'R' character */
719  for (sptr = bptr + 1; sptr < eptr && *sptr != 'R';)
720  sptr++;
721  if (SrchTheStr(bptr, sptr, "sites"))
722  dbp->mType = ParFlat_REF_SITES;
723  }
724 }
725 
726 /**********************************************************
727  *
728  * void GetEmblSubBlock(bases, source, entry):
729  *
730  * To build feature block:
731  * - report error if no FT data in the FH block;
732  * - to fit genbank feature table parsing:
733  * - only leave first FH line;
734  * - replace "FT" to " ";
735  * - delete any XX blocks.
736  *
737  * 5-27-93
738  *
739  **********************************************************/
740 void GetEmblSubBlock(size_t bases, Parser::ESource source, const DataBlk& entry)
741 {
742  DataBlkPtr temp;
743  DataBlkPtr curdbp;
744  DataBlkPtr predbp;
745  EntryBlkPtr ebp;
746 
747  temp = TrackNodeType(entry, ParFlat_OS);
748  for (; temp; temp = temp->mpNext) {
749  if (temp->mType != ParFlat_OS)
750  continue;
751 
752  BuildSubBlock(temp, ParFlat_OC, "OC");
753  BuildSubBlock(temp, ParFlat_OG, "OG");
754  GetLenSubNode(temp);
755  }
756 
757  temp = TrackNodeType(entry, ParFlat_RN);
758  for (; temp; temp = temp->mpNext) {
759  if (temp->mType != ParFlat_RN)
760  continue;
761 
762  fta_check_mult_ids(temp, "RX MEDLINE;", "RX PUBMED;");
763  BuildSubBlock(temp, ParFlat_RC, "RC");
764  BuildSubBlock(temp, ParFlat_RP, "RP");
765  BuildSubBlock(temp, ParFlat_RX, "RX");
766  BuildSubBlock(temp, ParFlat_RG, "RG");
767  BuildSubBlock(temp, ParFlat_RA, "RA");
768  BuildSubBlock(temp, ParFlat_RT, "RT");
769  BuildSubBlock(temp, ParFlat_RL, "RL");
770  GetEmblRefType(bases, source, temp);
771  GetLenSubNode(temp);
772  }
773 
774  ebp = static_cast<EntryBlk*>(entry.mpData);
775  temp = ebp->chain;
776  predbp = temp;
777  curdbp = temp->mpNext;
778  while (curdbp) {
779  if (curdbp->mType != ParFlat_FH) {
780  predbp = curdbp;
781  curdbp = curdbp->mpNext;
782  continue;
783  }
784 
785  if (TrimEmblFeatBlk(curdbp)) {
786  BuildFeatureBlock(curdbp);
787  GetLenSubNode(curdbp);
788 
789  predbp = curdbp;
790  curdbp = curdbp->mpNext;
791  } else /* report error, free this node */
792  {
793  ErrPostStr(SEV_WARNING, ERR_FEATURE_NoFeatData, "No feature data in the FH block (Embl)");
794 
795  predbp->mpNext = curdbp->mpNext;
796  curdbp->mpNext = nullptr;
797  delete curdbp;
798  curdbp = predbp->mpNext;
799  }
800  }
801 }
802 
803 /**********************************************************
804  *
805  * void BuildSubBlock(dbp, subtype, subkw):
806  *
807  * Some of sub-keyword may not be exist in every entry.
808  *
809  * 4-7-93
810  *
811  **********************************************************/
812 void BuildSubBlock(DataBlkPtr dbp, Int2 subtype, const char* subkw)
813 {
814  char* bptr;
815  char* eptr;
816 
817  bptr = dbp->mOffset;
818  eptr = bptr + dbp->len;
819 
820  if (GetSubNodeType(subkw, &bptr, eptr)) {
821  InsertDatablkVal(reinterpret_cast<DataBlk**>(&dbp->mpData), subtype, bptr, eptr - bptr);
822  }
823 }
824 
825 /**********************************************************
826  *
827  * void GetLenSubNode(dbp):
828  *
829  * Recalculate the length for the node which has
830  * subkeywords.
831  *
832  * 4-7-93
833  *
834  **********************************************************/
836 {
837  DataBlkPtr curdbp;
838  DataBlkPtr ndbp;
839  DataBlkPtr ldbp;
840  char* offset;
841  char* s;
842  Int2 n;
843  bool done = false;
844 
845  if (! dbp->mpData) /* no sublocks in this block */
846  return;
847 
848  offset = dbp->mOffset;
849  for (s = offset; *s != '\0' && isdigit(*s) == 0;)
850  s++;
851  n = atoi(s);
852  ldbp = nullptr;
853  for (ndbp = static_cast<DataBlk*>(dbp->mpData); ndbp; ndbp = ndbp->mpNext) {
854  size_t l = ndbp->mOffset - offset;
855  if (l > 0 && l < dbp->len) {
856  dbp->len = l;
857  ldbp = ndbp;
858  }
859  }
860 
861  if (ldbp != dbp->mpData && ldbp) {
862  ErrPostEx(SEV_WARNING, ERR_FORMAT_LineTypeOrder, "incorrect line type order for reference %d", n);
863  done = true;
864  }
865 
866  curdbp = static_cast<DataBlk*>(dbp->mpData);
867  for (; curdbp->mpNext; curdbp = curdbp->mpNext) {
868  offset = curdbp->mOffset;
869  ldbp = nullptr;
870  for (ndbp = static_cast<DataBlk*>(dbp->mpData); ndbp; ndbp = ndbp->mpNext) {
871  size_t l = ndbp->mOffset - offset;
872  if (l > 0 && l < curdbp->len) {
873  curdbp->len = l;
874  ldbp = ndbp;
875  }
876  }
877  if (ldbp != curdbp->mpNext && ldbp && ! done) {
878  ErrPostEx(SEV_WARNING, ERR_FORMAT_LineTypeOrder, "incorrect line type order for reference %d", n);
879  }
880  }
881 }
882 
883 /**********************************************************/
885 {
886  CRef<CPatent_seq_id> pat_id;
887  const char* p;
888  const char* q;
889 
890  if (! acc || *acc == '\0')
891  return (pat_id);
892 
893  pat_id = new CPatent_seq_id;
894 
895  p = StringChr(acc, '|');
896 
897  q = StringChr(p + 1, '|');
898  pat_id->SetCit().SetCountry(string(p + 1, q));
899 
900  p = StringChr(q + 1, '|');
901  pat_id->SetCit().SetId().SetNumber(string(q + 1, p));
902 
903  q = StringChr(p + 1, '|');
904  pat_id->SetCit().SetDoc_type(string(p + 1, q));
905 
906  pat_id->SetSeqid(atoi(q + 1));
907 
908  return (pat_id);
909 }
910 
911 /**********************************************************
912  *
913  * static Uint1 ValidSeqType(accession, type):
914  *
915  * 9-16-93
916  *
917  **********************************************************/
/* Reconciles the requested Seq-id type with the type derived from the
 * accession itself and returns the type to use. */
918 static Uint1 ValidSeqType(const char* accession, Uint1 type)
919 {
920  // CSeq_id::E_Choice cho;
921 
/* NOTE(review): the conditions guarding the next two returns are not present
 * in this listing (embedded numbering jumps 921 -> 924 and 925 -> 928) -
 * restore them from the original file */
924  return (type);
925 
928  return (CSeq_id::e_not_set);
929 
/* without an accession there is nothing to cross-check */
930  if (! accession)
931  return (type);
932 
933  const auto cho = CSeq_id::GetAccType(CSeq_id::IdentifyAccession(accession));
934  /*
935  if (is_nuc)
936  cho = GetNucAccOwner(accession);
937  else
938  cho = GetProtAccOwner(accession);
939  */
/* within each pair (Genbank/Tpg, Ddbj/Tpd, Embl/Tpe) the type derived from
 * the accession replaces the requested one; in every other case the
 * requested type is returned unchanged */
940  if ((type == CSeq_id::e_Genbank || type == CSeq_id::e_Tpg) &&
941  (cho == CSeq_id::e_Genbank || cho == CSeq_id::e_Tpg))
942  return (cho);
943  else if ((type == CSeq_id::e_Ddbj || type == CSeq_id::e_Tpd) &&
944  (cho == CSeq_id::e_Ddbj || cho == CSeq_id::e_Tpd))
945  return (cho);
946  else if ((type == CSeq_id::e_Embl || type == CSeq_id::e_Tpe) &&
947  (cho == CSeq_id::e_Embl || cho == CSeq_id::e_Tpe))
948  return (cho);
949  return type;
950 }
951 
952 /**********************************************************
953  *
954  * CRef<CSeq_id> MakeAccSeqId(acc, seqtype, accver, vernum,
955  * is_nuc, is_tpa):
956  *
957  * 5-10-93
958  *
959  **********************************************************/
960 CRef<CSeq_id> MakeAccSeqId(const char* acc, Uint1 seqtype, bool accver, Int2 vernum)
961 {
962  CRef<CSeq_id> id;
963 
964  if (! acc || *acc == '\0')
965  return id;
966 
967  seqtype = ValidSeqType(acc, seqtype);
968 
969  if (seqtype == CSeq_id::e_not_set)
970  return id;
971 
972  CRef<CTextseq_id> text_id(new CTextseq_id);
973  text_id->SetAccession(acc);
974 
975  if (accver && vernum > 0)
976  text_id->SetVersion(vernum);
977 
978  id = new CSeq_id;
979  SetTextId(seqtype, *id, *text_id);
980  return id;
981 }
982 
983 /**********************************************************
984  *
985  * SeqIdPtr MakeLocusSeqId(locus, seqtype):
986  *
987  * 5-13-93
988  *
989  **********************************************************/
990 CRef<CSeq_id> MakeLocusSeqId(const char* locus, CSeq_id::E_Choice seqtype)
991 {
992  CRef<CSeq_id> res;
993  if (! locus || *locus == '\0')
994  return res;
995 
996  CRef<CTextseq_id> text_id(new CTextseq_id);
997  text_id->SetName(locus);
998 
999  res.Reset(new CSeq_id);
1000  SetTextId(seqtype, *res, *text_id);
1001 
1002  return res;
1003 }
1004 
1005 // LCOV_EXCL_START
1006 // Excluded per Mark's request on 12/14/2016
1007 /**********************************************************/
1008 static CRef<CSeq_id> MakeSegSetSeqId(const char* accession, const string& locus, Uint1 seqtype, bool is_tpa)
1009 {
1010  CRef<CSeq_id> res;
1011  if (locus.empty())
1012  return res;
1013 
1014  seqtype = ValidSeqType(accession, seqtype);
1015 
1016  if (seqtype == CSeq_id::e_not_set)
1017  return res;
1018 
1019  CRef<CTextseq_id> text_id(new CTextseq_id);
1020  text_id->SetName(locus);
1021 
1022  res.Reset(new CSeq_id);
1023  SetTextId(seqtype, *res, *text_id);
1024 
1025  return res;
1026 }
1027 // LCOV_EXCL_STOP
1028 
1029 /**********************************************************
1030  *
1031  * char* SrchNodeSubType(entry, type, subtype, len):
1032  *
1033  * Return a memory location of the node which has
1034  * the "subtype".
1035  *
1036  * 4-7-93
1037  *
1038  **********************************************************/
1039 char* SrchNodeSubType(const DataBlk& entry, Int2 type, Int2 subtype, size_t* len)
1040 {
1041  DataBlkPtr mdbp;
1042  DataBlkPtr sdbp;
1043 
1044  *len = 0;
1045  mdbp = TrackNodeType(entry, type);
1046  if (! mdbp)
1047  return nullptr;
1048 
1049  sdbp = static_cast<DataBlk*>(mdbp->mpData);
1050 
1051  while (sdbp && sdbp->mType != subtype)
1052  sdbp = sdbp->mpNext;
1053 
1054  if (! sdbp)
1055  return nullptr;
1056 
1057  *len = sdbp->len;
1058  return (sdbp->mOffset);
1059 }
1060 
1061 /**********************************************************/
1062 static void SetEmptyId(CBioseq& bioseq)
1063 {
1064  CRef<CObject_id> emptyId(new CObject_id);
1065  emptyId->SetId8(0);
1066 
1067  CRef<CSeq_id> seqId(new CSeq_id);
1068  seqId->SetLocal(*emptyId);
1069 
1070  bioseq.SetId().push_back(seqId);
1071 }
1072 
1073 /**********************************************************/
1075 {
1076  IndexblkPtr ibp;
1077 
1078  char* locus;
1079  const char* acc;
1080  Uint1 seqtype;
1081 
1082  CRef<CBioseq> res(new CBioseq);
1083 
1084  /* create the entry framework */
1085 
1086  ibp = pp->entrylist[pp->curindx];
1087  locus = ibp->locusname;
1088  acc = ibp->acnum;
1089 
1090  /* get the SeqId */
1091  if (pp->source == Parser::ESource::USPTO) {
1092  CRef<CSeq_id> id(new CSeq_id);
1094  id->SetPatent(*psip);
1095  return (res);
1096  }
1097  if (pp->source == Parser::ESource::EMBL && ibp->is_tpa)
1098  seqtype = CSeq_id::e_Tpe;
1099  else
1100  seqtype = ValidSeqType(acc, pp->seqtype);
1101 
1102  if (seqtype == CSeq_id::e_not_set) {
1103  if (acc && ! NStr::IsBlank(acc)) {
1104  auto pId = Ref(new CSeq_id(CSeq_id::e_Local, acc));
1105  res->SetId().push_back(std::move(pId));
1106  } else if (pp->mode == Parser::EMode::Relaxed && locus) {
1107  auto pId = Ref(new CSeq_id(CSeq_id::e_Local, locus));
1108  res->SetId().push_back(std::move(pId));
1109  } else {
1110  SetEmptyId(*res);
1111  }
1112  } else if ((! locus || *locus == '\0') && (! acc || *acc == '\0')) {
1113  SetEmptyId(*res);
1114  } else {
1115  CRef<CTextseq_id> textId(new CTextseq_id);
1116 
1117  if (ibp->embl_new_ID == false && locus && *locus != '\0' &&
1118  (! acc || ! StringEqu(acc, locus)))
1119  textId->SetName(locus);
1120 
1121  if (acc && *acc != '\0')
1122  textId->SetAccession(acc);
1123 
1124  if (pp->accver && ibp->vernum > 0)
1125  textId->SetVersion(ibp->vernum);
1126 
1127  CRef<CSeq_id> seqId(new CSeq_id);
1128  if (SetTextId(seqtype, *seqId, *textId))
1129  res->SetId().push_back(seqId);
1130  else
1131  SetEmptyId(*res);
1132  }
1133 
1134  return res;
1135 }
1136 
1137 /**********************************************************
1138  *
1139  * char* GetDescrComment(offset, len, col_data, is_htg):
1140  *
1141  * Return a pointer to a string comment.
1142  * Strip tailing or leading blanks, unless the
1143  * following rules occurrs (all the length will count
1144  * leading or tailing blanks):
1145  * - replace "\n" to "~~ ~~" if the length of a
1146  * line <= 12, except first blank line;
1147  * - if the column 13 is blank in the current line
1148  * and the previous line does not be added "~" at
1149  * end, then add "~" the beginning of the line
1150  * (indent format);
1151  * - replace "\n" to "~" if the length of a
1152  * line < 50 and (not a last line or not a first
1153  * line);
1154  * -- otherwise, change "\n" to a space.
1155  *
1156  * 4-28-93
1157  *
1158  **********************************************************/
1159 char* GetDescrComment(char* offset, size_t len, Uint2 col_data, bool is_htg, bool is_pat)
1160 {
1161  char* p;
1162  char* q;
1163  char* r;
1164  char* str;
1165 
1166  bool within = false;
1167  char* bptr = offset;
1168  char* eptr = bptr + len;
1169  char* com = StringNew(len);
1170 
1171  for (str = com; bptr < eptr; bptr = p + 1) {
1172  p = SrchTheChar(bptr, eptr, '\n');
1173 
1174  /* skip HTG generated comments starting with '*' */
1175  if ((is_htg && bptr[col_data] == '*') ||
1176  StringEquN(bptr, "XX", 2))
1177  continue;
1178 
1179  if (! within) {
1180  r = SrchTheStr(bptr, p, "-START##");
1181  if (r)
1182  within = true;
1183  }
1184 
1185  q = bptr;
1186  if (*q == 'C')
1187  q++;
1188  if (*q == 'C')
1189  q++;
1190  while (*q == ' ')
1191  q++;
1192  if (q == p) {
1193  if (*(str - 1) != '~')
1194  *str++ = '~';
1195  *str++ = '~';
1196  continue;
1197  }
1198 
1199  if (p - bptr < col_data)
1200  continue;
1201 
1202  bptr += col_data;
1203  size_t size = p - bptr;
1204 
1205  if (*bptr == ' ' && *(str - 1) != '~')
1206  *str++ = '~';
1207  MemCpy(str, bptr, size);
1208  str += size;
1209  if (is_pat && size > 4 &&
1210  q[0] >= 'A' && q[0] <= 'Z' && q[1] >= 'A' && q[1] <= 'Z' &&
1211  StringEquN(q + 2, " ", 3))
1212  *str++ = '~';
1213  else if (size < 50 || within)
1214  *str++ = '~';
1215  else
1216  *str++ = ' ';
1217 
1218  if (within) {
1219  r = SrchTheStr(bptr, p, "-END##");
1220  if (r)
1221  within = false;
1222  }
1223  }
1224 
1225  for (p = com;;) {
1226  p = StringStr(p, "; ");
1227  if (! p)
1228  break;
1229  for (p += 2, eptr = p; *eptr == ' ';)
1230  eptr++;
1231  if (eptr > p)
1232  fta_StringCpy(p, eptr);
1233  }
1234  for (p = com; *p == ' ';)
1235  p++;
1236  if (p > com)
1237  fta_StringCpy(com, p);
1238  for (p = com; *p != '\0';)
1239  p++;
1240  if (p > com) {
1241  for (p--;; p--) {
1242  if (*p == ' ' || *p == '\t' || *p == ';' || *p == ',' ||
1243  *p == '.' || *p == '~') {
1244  if (p > com)
1245  continue;
1246  *p = '\0';
1247  }
1248  break;
1249  }
1250  if (*p != '\0') {
1251  p++;
1252  if (StringEquN(p, "...", 3))
1253  p[3] = '\0';
1254  else if (StringChr(p, '.')) {
1255  *p = '.';
1256  p[1] = '\0';
1257  } else
1258  *p = '\0';
1259  }
1260  }
1261  if (*com != '\0')
1262  return (com);
1263  MemFree(com);
1264  return nullptr;
1265 }
1266 
1267 /**********************************************************/
1269 {
/* NOTE(review): the declaration line of this function (listing line 1268) is
 * missing from this extract; only the body is visible. "secs" is used as a
 * container of strings that supports insert_after() - confirm its type and
 * the function name against the repository. */
1270  auto it1 = secs.begin();
/* Nothing to do without a first element, or when that element is empty. */
1271  if (it1 == secs.end() || it1->empty())
1272  return;
1273  auto it2 = next(it1);
/* Go on only if the second element is exactly "-" and fta_if_wgs_acc()
 * returns 0 for the first element. */
1274  if (it2 == secs.end() || *it2 != "-" || fta_if_wgs_acc(*it1) != 0)
1275  return;
1276 
/* Insert a copy of the first element right after it and overwrite the last
 * character of that copy with '1'. */
1277  auto tbp = secs.insert_after(it1, *it1);
1278  tbp->back() = '1';
1279 }
1280 
1281 
1282 /**********************************************************/
1283 /*
1284 static void fta_fix_secondaries(list<string>& secondaries)
1285 {
1286  if (secondaries.size() < 2) {
1287  return;
1288  }
1289 
1290  auto it = secondaries.begin();
1291  const auto& first = *it;
1292  const auto& second = *next(it);
1293 
1294  if (first.empty()||
1295  second.empty() ||
1296  fta_if_wgs_acc(second) != 0 ||
1297  second != "-") {
1298  return;
1299  }
1300 
1301  string newSecondary = *it;
1302  newSecondary.back() = '1';
1303  ++it;
1304  secondaries.insert(it, newSecondary);
1305 }
1306 */
1307 
1308 /**********************************************************
1309  *
1310  * void GetExtraAccession(ibp, allow_uwsec, source, accessions):
1311  *
1312  * Skip first accession, put remaining accessions
1313  * to link list 'accessions'.
1314  * Each accession separated by ";" or blanks.
1315  *
1316  **********************************************************/
/* Walks ibp->secaccs, classifies every secondary accession with
 * fta_if_wgs_acc() and GetNucAccOwner(), posts the WGS/TSA/TLS related
 * diagnostics (setting ibp->drop on the SEV_REJECT ones) and appends the
 * accepted accessions to "accessions". An element equal to "-" is followed by
 * the end of a range: "-" plus the next element are appended to the last
 * accepted accession. ibp->wgssec is filled with the first secondary whose
 * class is one of 0, 3, 4, 6, 10, 12, if it was empty.
 * NOTE(review): listing lines 1349, 1406 and 1408-1409 are missing from this
 * extract, so the statements around them are incomplete as shown. */
1317 void GetExtraAccession(IndexblkPtr ibp, bool allow_uwsec, Parser::ESource source, TAccessionList& accessions)
1318 {
1319  Int4 pri_acc;
1320  Int4 sec_acc;
1321  const char* text;
1322  char* acc;
1323  size_t i = 0;
1324 
1325  bool unusual_wgs;
1326  bool unusual_wgs_msg;
1327  bool is_cp;
1328 
1329  CSeq_id::E_Choice pri_owner;
1330  CSeq_id::E_Choice sec_owner;
1331 
1332  if (ibp->secaccs.empty()) {
1333  return;
1334  }
1335 
/* Private copy of the primary accession; released with MemFree() at the end. */
1336  acc = StringSave(ibp->acnum);
1337  is_cp = (acc[0] == 'C' && acc[1] == 'P');
1338  pri_acc = fta_if_wgs_acc(acc);
1339  pri_owner = GetNucAccOwner(acc);
/* For primary classes 1 and 4 "acc" is cut down to its leading run of
 * upper-case letters and '_', and "i" becomes the length of that prefix. */
1340  if (pri_acc == 1 || pri_acc == 4) {
1341  char* p;
1342  for (p = acc; (*p >= 'A' && *p <= 'Z') || *p == '_';)
1343  p++;
1344  *p = '\0';
1345  i = StringLen(acc);
1346  }
1347 
1348  if (source == Parser::ESource::EMBL) {
/* NOTE(review): listing line 1349 (the body of this "if") is missing here. */
1350  }
1351 
1352  unusual_wgs = false;
1353  for (auto tbp = ibp->secaccs.begin(); tbp != ibp->secaccs.end(); ++tbp) {
/* "-" introduces the end of a range: glue it and the following element onto
 * the most recently accepted accession, if there is one. */
1354  if (*tbp == "-"s) {
1355  ++tbp;
1356  if (tbp == ibp->secaccs.end())
1357  break;
1358  if (! accessions.empty()) {
1359  accessions.back() += '-';
1360  accessions.back() += *tbp;
1361  }
1362  continue;
1363  }
1364 
1365  DelNonDigitTail(*tbp);
1366  const string& a = *tbp;
1367  sec_acc = fta_if_wgs_acc(a);
1368 
1369  unusual_wgs_msg = true;
1370  if (sec_acc == 0 || sec_acc == 3 ||
1371  sec_acc == 4 || sec_acc == 6 ||
1372  sec_acc == 10 || sec_acc == 12) /* 0 = AAAA01000000,
1373  3 = AAAA00000000,
1374  4 = GAAA01000000,
1375  6 = GAAA00000000,
1376  10 = KAAA01000000,
1377  12 = KAAA00000000 */
1378  {
/* The "unusual WGS" message is suppressed for contig records whose stored
 * wgssec is empty or shares a suffix of at least 4 characters with "a". */
1379  if (ibp->is_contig &&
1380  (ibp->wgssec.empty() || NStr::CommonSuffixSize(ibp->wgssec, a) >= 4))
1381  unusual_wgs_msg = false;
1382  if (ibp->wgssec.empty())
1383  ibp->wgssec = a;
1384  }
1385 
1386  sec_owner = GetNucAccOwner(a);
1387 
/* Secondary classes < 0 and 2 are always accepted; a primary of class
 * 1/5/11 additionally triggers the "non-WGS secondary" diagnostic. */
1388  if (sec_acc < 0 || sec_acc == 2) {
1389  if (pri_acc == 1 || pri_acc == 5 || pri_acc == 11) {
1390  if (! allow_uwsec) {
1391  ErrPostEx(SEV_REJECT, ERR_ACCESSION_WGSWithNonWGS_Sec, "This WGS/TSA/TLS record has non-WGS/TSA/TLS secondary accession \"%s\". WGS/TSA/TLS records are not currently allowed to replace finished sequence records, scaffolds, etc. without human review and confirmation.", a.c_str());
1392  ibp->drop = true;
1393  } else {
1394  ErrPostEx(SEV_WARNING, ERR_ACCESSION_WGSWithNonWGS_Sec, "This WGS/TSA/TLS record has non-WGS/TSA/TLS secondary accession \"%s\". This is being allowed via the use of a special parser flag.", a.c_str());
1395  }
1396  }
1397 
1398  accessions.push_back(a);
1399  continue;
1400  }
1401 
1402  if (sec_acc == 3 || sec_acc == 6) /* like AAAA00000000 */
1403  {
1404  if (pri_owner == CSeq_id::e_Embl && sec_owner == CSeq_id::e_Embl &&
1405  (pri_acc == 1 || pri_acc == 5 || pri_acc == 11) &&
/* NOTE(review): listing lines 1406 and 1408-1409 are missing here, so the rest
 * of this condition and the branch structure cannot be read from this view. */
1407  continue;
1410  ErrPostEx(SEV_REJECT, ERR_ACCESSION_WGSMasterAsSecondary, "WGS/TSA/TLS master accession \"%s\" is not allowed to be used as a secondary accession number.", a.c_str());
1411  ibp->drop = true;
1412  }
1413  continue;
1414  }
1415 
1416  if (pri_acc == 1 || pri_acc == 5 || pri_acc == 11) /* WGS/TSA/TLS
1417  contig */
1418  {
/* Prefix length to compare: 7 for "NZ_" accessions, 4 otherwise. */
1419  i = (StringEquN(a.c_str(), "NZ_", 3)) ? 7 : 4;
1420  if (! StringEquN(a.c_str(), ibp->acnum, i)) {
1421  if (! allow_uwsec) {
1422  ErrPostStr(SEV_REJECT, ERR_ACCESSION_UnusualWGS_Secondary, "This record has one or more WGS/TSA/TLS secondary accession numbers which imply that a WGS/TSA/TLS project is being replaced (either by another project or by finished sequence). This is not allowed without human review and confirmation.");
1423  ibp->drop = true;
1424  } else if (! is_cp || source != Parser::ESource::NCBI) {
1425  ErrPostStr(SEV_WARNING, ERR_ACCESSION_UnusualWGS_Secondary, "This record has one or more WGS/TSA/TLS secondary accession numbers which imply that a WGS/TSA project is being replaced (either by another project or by finished sequence). This is being allowed via the use of a special parser flag.");
1426  }
1427  }
1428  } else if (pri_acc == 2) /* WGS scaffold */
1429  {
1430  if (sec_acc == 1 || sec_acc == 5 || sec_acc == 11) /* WGS/TSA/TLS
1431  contig */
1432  {
1433  ErrPostStr(SEV_REJECT, ERR_ACCESSION_ScfldHasWGSContigSec, "This record, which appears to be a scaffold, has one or more WGS/TSA/TLS contig accessions as secondary. Currently, it does not make sense for a contig to replace a scaffold.");
1434  ibp->drop = true;
1435  }
1436  } else if (unusual_wgs_msg) {
/* "unusual_wgs" makes sure the message is posted only once per record. */
1437  if (! allow_uwsec) {
1438  if (! unusual_wgs) {
1439  if (sec_acc == 1 || sec_acc == 5 || sec_acc == 11)
1440  text = "WGS/TSA/TLS contig secondaries are present, implying that a scaffold is replacing a contig";
1441  else
1442  text = "This record has one or more WGS/TSA/TLS secondary accession numbers which imply that a WGS/TSA/TLS project is being replaced (either by another project or by finished sequence)";
1443  ErrPostEx(SEV_REJECT, ERR_ACCESSION_UnusualWGS_Secondary, "%s. This is not allowed without human review and confirmation.", text);
1444  }
1445  unusual_wgs = true;
1446  ibp->drop = true;
1447  } else if (! is_cp || source != Parser::ESource::NCBI) {
1448  if (! unusual_wgs) {
1449  if (sec_acc == 1 || sec_acc == 5 || sec_acc == 11)
1450  text = "WGS/TSA/TLS contig secondaries are present, implying that a scaffold is replacing a contig";
1451  else
1452  text = "This record has one or more WGS/TSA/TLS secondary accession numbers which imply that a WGS/TSA/TLS project is being replaced (either by another project or by finished sequence)";
1453  ErrPostEx(SEV_WARNING, ERR_ACCESSION_UnusualWGS_Secondary, "%s. This is being allowed via the use of a special parser flag.", text);
1454  }
1455  unusual_wgs = true;
1456  }
1457  }
1458 
/* Decide whether the secondary is actually stored. */
1459  if (pri_acc == 1 || pri_acc == 5 || pri_acc == 11) {
1460  if (StringEquN(acc, a.c_str(), i) && a[i] >= '0' && a[i] <= '9') {
/* NOTE(review): every other test of this triple checks the same variable for
 * 1, 5 and 11; "pri_acc == 11" here may be a typo for "sec_acc == 11" -
 * confirm the intent before changing it. */
1461  if (sec_acc == 1 || sec_acc == 5 || pri_acc == 11)
1462  accessions.push_back(a);
1463  } else if (allow_uwsec) {
1464  accessions.push_back(a);
1465  }
1466  } else if (pri_acc == 2) {
1467  if (sec_acc == 0 || sec_acc == 4) /* like AAAA10000000 */
1468  accessions.push_back(a);
1469  } else if (allow_uwsec || (! unusual_wgs_msg && (source == Parser::ESource::DDBJ || source == Parser::ESource::EMBL))) {
1470  accessions.push_back(a);
1471  }
1472  }
1473 
1474  MemFree(acc);
1475 }
1476 
1477 /**********************************************************/
1478 static void fta_fix_tpa_keywords(TKeywordList& keywords)
1479 {
1480  const char* p;
1481 
1482  for (string& key : keywords) {
1483  if (key.empty())
1484  continue;
1485 
1486  if (NStr::CompareNocase(key.c_str(), "TPA") == 0)
1487  key = "TPA";
1488  else if (StringEquNI(key.c_str(), "TPA:", 4)) {
1489  string buf("TPA:");
1490 
1491  for (p = key.c_str() + 4; *p == ' ' || *p == '\t';)
1492  p++;
1493 
1494  buf += p;
1495  if (fta_is_tpa_keyword(buf.c_str())) {
1496  for (string::iterator p = buf.begin() + 4; p != buf.end(); ++p) {
1497  if (*p >= 'A' && *p <= 'Z')
1498  *p |= 040;
1499  }
1500  }
1501 
1502  swap(key, buf);
1503  }
1504  }
1505 }
1506 
1507 // ----------------------------------------------------------------------------
/* NOTE(review): the first line of the declaration (listing line 1508) is
 * missing from this extract; the call at listing line 1565 suggests the name
 * xFixEMBLKeywords - confirm against the repository.
 *
 * Rewrites the first case-insensitive occurrence of "WGS Third Party Data" in
 * keywordData to "WGS; Third Party Data", but only if the occurrence is
 * directly followed by ';' or '.' and - unless it is at position 0 - is
 * preceded by a ';' with nothing but blanks in between. keywordData is left
 * unchanged in every other case. */
1509  string& keywordData)
1510 // ----------------------------------------------------------------------------
1511 {
1512  const string problematic("WGS Third Party Data");
1513  const string desired("WGS; Third Party Data");
1514 
1515  if (keywordData.empty()) {
1516  return;
1517  }
1518  auto wgsStart = NStr::FindNoCase(keywordData, problematic);
1519  if (wgsStart == string::npos) {
1520  return;
1521  }
/* When the match ends the string this index equals size(); std::string
 * operator[] yields the terminating '\0' there, so the test below fails. */
1522  auto afterProblematic = keywordData[wgsStart + problematic.size()];
1523  if (afterProblematic != ';' && afterProblematic != '.') {
1524  return;
1525  }
1526 
1527  string fixedKeywords;
1528  if (wgsStart > 0) {
1529  auto semiBefore = keywordData.rfind(';', wgsStart - 1);
1530  if (semiBefore == string::npos) {
1531  return;
1532  }
1533  for (auto i = semiBefore + 1; i < wgsStart; ++i) {
1534  if (keywordData[i] != ' ') {
1535  return;
1536  }
1537  }
/* Keeps everything up to, but not including, the character right before the
 * match.
 * NOTE(review): when the match directly follows the ';' (no blank between
 * them) that dropped character is the ';' itself - confirm this is intended. */
1538  fixedKeywords = keywordData.substr(0, wgsStart - 1);
1539  }
1540  fixedKeywords += desired;
1541  fixedKeywords += keywordData.substr(wgsStart + problematic.size());
1542  keywordData = fixedKeywords;
1543 }
1544 
1545 
1546 // ----------------------------------------------------------------------------
/* NOTE(review): the first line of the declaration (listing line 1547, which
 * holds the return type and the function name) is missing from this extract;
 * the name used for this block is a guess.
 *
 * Fills "keywords" from the node of type "type" inside "entry": the node text
 * is run through GetBlkDataReplaceNewLine() (and StripECO() for ParFlatSP_KW),
 * fixed up by xFixEMBLKeywords(), split on ';', trimmed, stripped of the
 * final '.', purged of empty items and finally passed to
 * fta_fix_tpa_keywords(). "keywords" is always cleared first. */
1548  const DataBlk& entry,
1549  int type,
1550  Uint2 col_data,
1551  TKeywordList& keywords)
1552 // ----------------------------------------------------------------------------
1553 {
1554  // Expectation: Each keyword separated by ";", the last one ends with "."
1555 
1556  keywords.clear();
1557  auto keywordData = xGetNodeData(entry, type);
1558  if (keywordData.empty()) {
1559  return;
1560  }
1561  keywordData = GetBlkDataReplaceNewLine(keywordData, col_data);
1562  if (type == ParFlatSP_KW) {
1563  StripECO(keywordData);
1564  }
1565  xFixEMBLKeywords(keywordData);
1566 
1567  NStr::Split(keywordData, ";", keywords);
1568  auto it = keywords.begin();
/* NOTE(review): "--keywords.end()" is only valid if Split produced at least
 * one item - confirm that cannot be violated (e.g. data that became empty
 * after the calls above). */
1569  auto last = --keywords.end();
1570  while (it != keywords.end()) {
1571  auto& keyword = *it;
1572  NStr::TruncateSpacesInPlace(keyword);
/* Only the final item may carry the terminating '.'. */
1573  if (it == last) {
1574  NStr::TrimSuffixInPlace(keyword, ".");
1575  NStr::TruncateSpacesInPlace(keyword);
1576  }
1577  if (keyword.empty()) {
1578  keywords.erase(it++);
1579  } else {
1580  it++;
1581  }
1582  }
1583 
1584  fta_fix_tpa_keywords(keywords);
1585 }
1586 
1587 
1588 /**********************************************************
1589  *
1590  * Int4 ScanSequence(warn, seqptr, bsp, conv,
1591  * replacechar, numns):
1592  *
1593  * Scans a block of text converting characters to
1594  * sequence and storing in the ByteStorePtr bsp.
1595  * conv is a 255 Uint1 array where cells are indexed
1596  * by the ASCII value of the character in ptr:
1597  * - a value of 0 indicates skip;
1598  * - a value of 1 indicates an character is
1599  * unexpected (error);
1600  * - otherwise, it is a IUPACaa (protein) or a IUPACna
1601  * (nucleic acid) letter.
1602  * Function returns count of valid characters
1603  * converted to sequence.
1604  *
1605  * When sequence is presented in columns, this
1606  * function should be called once per line, so that
1607  * numbers can be recognized as errors.
1608  *
1609  * 3-30-93
1610  *
1611  * In order to skip the input flatfile put residue
1612  * label count at end, add blank variable to assume each
1613  * line only allow 6 blanks between residue.
1614  *
1615  * 7-28-93
1616  *
1617  **********************************************************/
1618 Int4 ScanSequence(bool warn, char** seqptr, std::vector<char>& bsp, unsigned char* conv, Char replacechar, int* numns)
1619 {
1620  Int2 blank;
1621  Int2 count;
1622  Uint1 residue;
1623  char* ptr;
1624  static Uint1 buf[133];
1625  unsigned char* bu;
1626 
1627  blank = count = 0;
1628  ptr = *seqptr;
1629 
1630  bu = buf;
1631  while (*ptr != '\n' && *ptr != '\0' && blank < 6 && count < 100) {
1632  if (numns && (*ptr == 'n' || *ptr == 'N'))
1633  (*numns)++;
1634 
1635  residue = conv[(int)*ptr];
1636 
1637  if (*ptr == ' ')
1638  blank++;
1639 
1640  if (residue > 2) {
1641  *bu++ = residue;
1642  count++;
1643  } else if (residue == 1 && (warn || isalpha(*ptr) != 0)) {
1644  /* it can be punctuation or alpha character */
1645  *bu++ = replacechar;
1646  count++;
1647  ErrPostEx(SEV_ERROR, ERR_SEQUENCE_BadResidue, "Invalid residue [%c]", *ptr);
1648  return (0);
1649  }
1650  ptr++;
1651  }
1652 
1653  *seqptr = ptr;
1654  std::copy(buf, bu, std::back_inserter(bsp));
1655  // BSWrite(bsp, buf, (Int4)(bu - buf));
1656  return (count);
1657 }
1658 
1659 /**********************************************************
1660  *
1661  * bool GetSeqData(pp, entry, bioseq, nodetype, seqconv,
1662  * seq_data_type):
1663  *
1664  * Replace any bad residue with "N" for a DNA sequence or
1665  * with "X" for a protein sequence.
1666  * PIR format allows punctuation in the sequence data,
1667  * so no warning message is issued if punctuation is found
1668  * in the sequence data.
1669  * Tatiana (moved from ScanSequence)
1670  *
1671  * 04-19-94
1672  *
1673  **********************************************************/
/* Reads the sequence text of the current entry (pp->entrylist[pp->curindx]),
 * converts it line by line with ScanSequence() and stores the result in
 * bioseq as Seq-data of type seq_data_type. The instance length is set from
 * ibp->bases up front. Returns true at once for contig and MGA records,
 * false if no sequence text is found or ScanSequence() reports 0, and true
 * otherwise. May set ibp->drop on the SEV_REJECT diagnostics below.
 * NOTE(review): listing lines 1693, 1712-1713 and 1753 are missing from this
 * extract, so the statements around them are incomplete as shown. */
1674 bool GetSeqData(ParserPtr pp, const DataBlk& entry, CBioseq& bioseq, Int4 nodetype, unsigned char* seqconv, Uint1 seq_data_type)
1675 {
1676  // ByteStorePtr bp;
1677  IndexblkPtr ibp;
1678  char* seqptr;
1679  char* endptr;
1680  char* str;
1681  Char replacechar;
1682  size_t len = 0;
1683  Int4 numns;
1684 
1685  ibp = pp->entrylist[pp->curindx];
1686 
1687  bioseq.SetInst().SetLength(static_cast<TSeqPos>(ibp->bases));
1688 
1689  if (ibp->is_contig || ibp->is_mga)
1690  return true;
1691 
1692  if (pp->format == Parser::EFormat::XML) {
/* NOTE(review): listing line 1693, which assigns "str", is missing here. "str"
 * is released with MemFree() further down, so it presumably owns a freshly
 * allocated copy of the sequence text - confirm. */
1694  seqptr = str;
1695  if (seqptr) {
1696  len = StringLen(seqptr);
/* Lower-case ASCII letters in place, except for USPTO protein records. */
1697  if (pp->source != Parser::ESource::USPTO || ! ibp->is_prot)
1698  for (char* p = seqptr; *p != '\0'; p++)
1699  if (*p >= 'A' && *p <= 'Z')
1700  *p |= 040; // tolower
1701  }
1702  } else {
1703  str = nullptr;
1704  seqptr = xSrchNodeType(entry, nodetype, &len);
1705  }
1706 
1707  if (! seqptr)
1708  return false;
1709 
1710  endptr = seqptr + len;
1711 
/* NOTE(review): listing lines 1712-1713 (the condition that selects 'N' versus
 * 'X') are missing here. */
1714  replacechar = 'N';
1715  else
1716  replacechar = 'X';
1717 
1718  /* the sequence data will be located in next line of nodetype */
1719  if (pp->format == Parser::EFormat::XML) {
1720  while (*seqptr == ' ' || *seqptr == '\n' || *seqptr == '\t')
1721  seqptr++;
1722  } else {
/* NOTE(review): neither of the two scans below is bounded by endptr; they
 * rely on a '\n' and an alphabetic character being present in the block. */
1723  while (*seqptr != '\n')
1724  seqptr++;
1725  while (isalpha(*seqptr) == 0) /* skip leading blanks and digits */
1726  seqptr++;
1727  }
1728 
1729  std::vector<char> buf;
1730  size_t seqlen = 0;
1731  for (numns = 0; seqptr < endptr;) {
1732  len = ScanSequence(true, &seqptr, buf, seqconv, replacechar, &numns);
/* ScanSequence() returned 0: give up on this entry. */
1733  if (len == 0) {
1734  if (str)
1735  MemFree(str);
1736  return false;
1737  }
1738 
1739  seqlen += len;
/* Skip everything up to the next letter.
 * NOTE(review): *seqptr is read before the "seqptr < endptr" test. */
1740  while (isalpha(*seqptr) == 0 && seqptr < endptr)
1741  seqptr++;
1742  }
1743 
1744  if (seqlen != bioseq.GetLength()) {
1745  ErrPostEx(SEV_WARNING, ERR_SEQUENCE_SeqLenNotEq, "Measured seqlen [%ld] != given [%ld]", (long int)seqlen, (long int)bioseq.GetLength());
1746  }
1747 
1748  if (str)
1749  MemFree(str);
1750 
1751  if (seq_data_type == CSeq_data::e_Iupacaa) {
1752  if (bioseq.GetLength() < 10) {
/* NOTE(review): listing line 1753 (the condition choosing between the
 * warning/info branch and the reject branch) is missing here. */
1754  if (ibp->is_pat == false)
1755  ErrPostStr(SEV_WARNING, ERR_SEQUENCE_TooShort, "This sequence for this record falls below the minimum length requirement of 10 basepairs.");
1756  else
1757  ErrPostStr(SEV_INFO, ERR_SEQUENCE_TooShortIsPatent, "This sequence for this patent record falls below the minimum length requirement of 10 basepairs.");
1758  } else {
1759  if (ibp->is_pat == false)
1760  ErrPostStr(SEV_REJECT, ERR_SEQUENCE_TooShort, "This sequence for this record falls below the minimum length requirement of 10 basepairs.");
1761  else
1762  ErrPostStr(SEV_REJECT, ERR_SEQUENCE_TooShortIsPatent, "This sequence for this patent record falls below the minimum length requirement of 10 basepairs.");
1763  ibp->drop = true;
1764  }
1765  }
/* Every counted residue was an 'n'/'N'. */
1766  if (seqlen == static_cast<Uint4>(numns)) {
1767  ErrPostStr(SEV_REJECT, ERR_SEQUENCE_AllNs, "This nucleotide sequence for this record contains nothing but unknown (N) basepairs.");
1768  ibp->drop = true;
1769  }
1770  }
1771 
1772  bioseq.SetInst().SetSeq_data().Assign(CSeq_data(buf, static_cast<CSeq_data::E_Choice>(seq_data_type)));
1773 
1774  return true;
1775 }
1776 
1777 /**********************************************************
1778  *
1779  * unique_ptr<unsigned char[]> GetDNAConv():
1780  *
1781  * DNA conversion table array.
1782  *
1783  * 3-29-93
1784  *
1785  **********************************************************/
/* Builds the 255 entry character conversion table for nucleotide input:
 * every entry starts out as 1, the blank (32) is set to 0, and for each code
 * produced by the loop both the code's first character and its lower-case
 * form map to that first character. Ownership of the table passes to the
 * caller through the returned unique_ptr.
 * NOTE(review): listing lines 1794 and 1796, which declare "range" and
 * "code", are missing from this extract. */
1786 unique_ptr<unsigned char[]> GetDNAConv(void)
1787 {
1788 
1789  unique_ptr<unsigned char[]> dnaconv(new unsigned char[255]());
1790  MemSet((char*)dnaconv.get(), (Uint1)1, (size_t)255);
1791 
1792  dnaconv[32] = 0; /* blank */
1793 
1795  for (CSeqportUtil::TIndex i = range.first; i <= range.second; ++i) {
1797 
1798  dnaconv[static_cast<int>(code[0])] = code[0];
1799  dnaconv[(int)tolower(code[0])] = code[0];
1800  }
1801 
1802  return dnaconv;
1803 }
1804 
1805 /**********************************************************
1806  *
1807  * unique_ptr<unsigned char[]> GetProteinConv():
1808  *
1809  * Protein conversion table array.
1810  *
1811  * 3-29-93
1812  *
1813  **********************************************************/
/* Builds the 255 entry character conversion table for protein input: every
 * entry starts out as 1, the blank (32) is set to 0, and for each code
 * produced by the loop the code's first character maps to itself (no
 * lower-case entry is added). Ownership of the table passes to the caller
 * through the returned unique_ptr.
 * NOTE(review): listing lines 1824 and 1826, which declare "range" and
 * "code", are missing from this extract. */
1814 unique_ptr<unsigned char[]> GetProteinConv(void)
1815 {
1816  // unsigned char* protconv;
1817  unique_ptr<unsigned char[]> protconv(new unsigned char[255]());
1818 
1819  // protconv = (unsigned char*)MemNew((size_t)255); /* proteins */
1820  MemSet((char*)protconv.get(), (Uint1)1, (size_t)255); /* everything
1821  an error */
1822  protconv[32] = 0; /* blank */
1823 
1825  for (CSeqportUtil::TIndex i = range.first; i <= range.second; ++i) {
1827  protconv[(int)code[0]] = code[0]; /* swiss-prot, pir uses upper case
1828  protein code */
1829  }
1830 
1831  return (protconv);
1832 }
1833 
1834 /***********************************************************/
1835 static CSeq_descr::Tdata::const_iterator GetDescrByChoice(const CSeq_descr& descr, Uint1 choice)
1836 {
1837  const CSeq_descr::Tdata& descr_list = descr.Get();
1838 
1839  CSeq_descr::Tdata::const_iterator cur_descr = descr_list.begin();
1840  for (; cur_descr != descr_list.end(); ++cur_descr) {
1841  if ((*cur_descr)->Which() == choice)
1842  break;
1843  }
1844 
1845  return cur_descr;
1846 }
1847 
1848 // LCOV_EXCL_START
1849 // Excluded per Mark's request on 12/14/2016
1850 /**********************************************************
1851  *
1852  * static void GetFirstSegDescrChoice(bio_set, choice,
1853  * descr_new):
1854  *
1855  * 10-14-93
1856  *
1857  **********************************************************/
1858 static void GetFirstSegDescrChoice(CBioseq& bioseq, Uint1 choice, CSeq_descr& descr_new)
1859 {
1860  CSeq_descr& descr = bioseq.SetDescr();
1861  CSeq_descr::Tdata& descr_list = descr.Set();
1862 
1863  // Don't use GetDescrByChoice here just because GCC version does not support erase(const_iterator)
1864  CSeq_descr::Tdata::iterator cur_descr = descr_list.begin();
1865  for (; cur_descr != descr_list.end(); ++cur_descr) {
1866  if ((*cur_descr)->Which() == choice) {
1867  /* found the "choice" node, isolated node */
1868  descr_new.Set().push_back(*cur_descr);
1869  descr_list.erase(cur_descr);
1870  break;
1871  }
1872  }
1873 }
1874 // LCOV_EXCL_STOP
1875 
1876 // SameCitation and 'PubEquivMatch' have a bit different logic,
1877 // so below is an additional function that makes a check
1878 // for equality according to 'PubEquivMatch' rules
1880 {
/* NOTE(review): the declaration line (listing line 1879) is missing from this
 * extract; the call at listing line 1967 shows the name
 * SameCitation_PubEquivMatch_Logic and two arguments obtained from
 * CPubdesc::GetPub().
 *
 * Returns true as soon as some pub of "a" and some pub of "b" satisfy
 * SameCitation(), unless that pair is the special case handled below: both
 * are Cit-gen with equal serial numbers and none of volume, issue, pages,
 * title, cit, authors, muid, journal or date set on either side. Returns
 * false if no pair qualifies. */
1881  for (const CRef<CPub>& it1 : a.Get()) {
1882  for (const CRef<CPub>& it2 : b.Get()) {
1883  if (it1->SameCitation(*it2)) {
1884  bool same = true;
1885 
1886  if (it1->Which() == CPub::e_Gen && it2->Which() == CPub::e_Gen) {
1887  const CCit_gen& cit_a = it1->GetGen();
1888  const CCit_gen& cit_b = it2->GetGen();
1889 
1890  if (cit_a.IsSetSerial_number() && cit_b.IsSetSerial_number() && cit_a.GetSerial_number() == cit_b.GetSerial_number()) {
1891  // The special condition of 'PubEquivMatch'
1892  // a->volume == NULL && b->volume == NULL &&
1893  // a->issue == NULL && b->issue == NULL &&
1894  // a->pages == NULL && b->pages == NULL &&
1895  // a->title == NULL && b->title == NULL &&
1896  // a->cit == NULL && b->cit == NULL &&
1897  // a->authors == NULL && b->authors == NULL &&
1898  // a->muid == -1 && b->muid == -1 &&
1899  // a->journal == NULL && b->journal == NULL &&
1900  // a->date == NULL && b->date == NULL &&
1901  // a->serial_number != -1 && b->serial_number != -1
1902 
1903  if (! cit_a.IsSetVolume() && ! cit_b.IsSetVolume() &&
1904  ! cit_a.IsSetIssue() && ! cit_b.IsSetIssue() &&
1905  ! cit_a.IsSetPages() && ! cit_b.IsSetPages() &&
1906  ! cit_a.IsSetTitle() && ! cit_b.IsSetTitle() &&
1907  ! cit_a.IsSetCit() && ! cit_b.IsSetCit() &&
1908  ! cit_a.IsSetAuthors() && ! cit_b.IsSetAuthors() &&
1909  ! cit_a.IsSetMuid() && ! cit_b.IsSetMuid() &&
1910  ! cit_a.IsSetJournal() && ! cit_b.IsSetJournal() &&
1911  ! cit_a.IsSetDate() && ! cit_b.IsSetDate())
/* Two Cit-gens that carry nothing but the same serial number are
 * deliberately reported as NOT matching. */
1912  same = false; // SIC!!!
1913  }
1914  }
1915 
1916  if (same)
1917  return true;
1918  }
1919  }
1920  }
1921 
1922  return false;
1923 }
1924 
1925 // LCOV_EXCL_START
1926 // Excluded per Mark's request on 12/14/2016
1927 /**********************************************************
1928  *
1929  * static bool CheckSegPub(pub, entries, same_pub_descr):
1930  *
1931  * 5-21-93
1932  *
1933  **********************************************************/
1934 static bool CheckSegPub(const CPubdesc& pub, TEntryList& entries, std::set<CSeqdesc*>& same_pub_descr)
1935 {
1936  if (! pub.IsSetPub() || ! pub.GetPub().IsSet() || pub.GetPub().Get().empty())
1937  return true;
1938 
1939  CRef<CPub> pub_ref = pub.GetPub().Get().front();
1940 
1941  if (! pub_ref->IsGen() || ! pub_ref->GetGen().IsSetSerial_number())
1942  return true;
1943 
1944  int num0 = pub_ref->GetGen().GetSerial_number();
1945 
1946  TEntryList::iterator next_seq = entries.begin();
1947  for (++next_seq; next_seq != entries.end(); ++next_seq) {
1948  if (! (*next_seq)->IsSetDescr())
1949  continue;
1950 
1951  CSeq_descr& descr = (*next_seq)->SetDescr();
1952 
1953  bool not_found = true;
1954  for (auto& cur_descr : descr.Set()) {
1955  if (! cur_descr->IsPub() || ! cur_descr->GetPub().IsSetPub() || ! cur_descr->GetPub().GetPub().IsSet() ||
1956  cur_descr->GetPub().GetPub().Get().empty())
1957  continue;
1958 
1959  const CPubdesc& cur_pub = cur_descr->GetPub();
1960  const CPub& cur_pub_ref = *cur_pub.GetPub().Get().front();
1961 
1962  if (! cur_pub_ref.IsGen() || ! cur_pub_ref.GetGen().IsSetSerial_number())
1963  continue;
1964 
1965  int num = cur_pub_ref.GetGen().GetSerial_number();
1966 
1967  if (! SameCitation_PubEquivMatch_Logic(cur_pub.GetPub(), pub.GetPub()))
1968  continue;
1969 
1970  if (num == num0) {
1971  same_pub_descr.insert(cur_descr); // store pointer to the same descr for future use
1972  not_found = false;
1973  break;
1974  }
1975 
1976  ErrPostStr(SEV_WARNING, ERR_SEGMENT_PubMatch, "Matching references with different serial numbers");
1977  }
1978 
1979  if (not_found)
1980  break;
1981  }
1982 
1983  return (next_seq == entries.end());
1984 }
1985 // LCOV_EXCL_STOP
1986 
1987 /***********************************************************/
1988 static void RemoveDescrByChoice(CSeq_descr& descr, Uint1 choice)
1989 {
1990  CSeq_descr::Tdata& descr_list = descr.Set();
1991 
1992  for (CSeq_descr::Tdata::iterator cur_descr = descr_list.begin(); cur_descr != descr_list.end();) {
1993  if ((*cur_descr)->Which() == choice)
1994  cur_descr = descr_list.erase(cur_descr);
1995  else
1996  ++cur_descr;
1997  }
1998 }
1999 
2000 /**********************************************************
2001  *
2002  * static void CleanUpSeqDescrChoice(entries, choice):
2003  *
2004  * 5-21-93
2005  *
2006  **********************************************************/
2008 {
/* NOTE(review): the declaration line (listing line 2007) is missing from this
 * extract; the banner above names the function CleanUpSeqDescrChoice(entries,
 * choice).
 *
 * Removes all descriptors of kind "choice" from every entry except the first
 * one. Assumes "entries" is not empty: begin() is advanced unconditionally. */
2009  TEntryList::iterator next_seq = entries.begin();
2010  ++next_seq;
2011 
2012  for (; next_seq != entries.end(); ++next_seq)
2013  RemoveDescrByChoice((*next_seq)->SetDescr(), choice);
2014 }
2015 
2016 /**********************************************************
2017  *
2018  * static void CleanUpSeqDescrPub(entries, to_clean):
2019  *
2020  * 1-13-16
2021  *
2022  **********************************************************/
2023 static void CleanUpSeqDescrPub(TEntryList& entries, std::set<CSeqdesc*>& to_clean)
2024 {
2025  TEntryList::iterator next_seq = entries.begin();
2026  ++next_seq;
2027 
2028  for (; next_seq != entries.end(); ++next_seq) {
2029  CSeq_descr::Tdata& descr_list = (*next_seq)->SetDescr().Set();
2030  for (CSeq_descr::Tdata::iterator cur_descr = descr_list.begin(); cur_descr != descr_list.end();) {
2031  std::set<CSeqdesc*>::iterator it = to_clean.find(*cur_descr);
2032  if (it != to_clean.end()) {
2033  cur_descr = descr_list.erase(cur_descr);
2034  to_clean.erase(it);
2035  } else
2036  ++cur_descr;
2037  }
2038  }
2039 }
2040 
2041 // LCOV_EXCL_START
2042 // Excluded per Mark's request on 12/14/2016
2043 /**********************************************************
2044  *
2045  * static void GetSegPub(entries, descr):
2046  *
2047  * 5-21-93
2048  *
2049  **********************************************************/
2051 {
/* NOTE(review): the declaration line (listing line 2050) is missing from this
 * extract; the banner above names the function GetSegPub(entries, descr).
 *
 * Goes through the descriptors of the first entry's Bioseq. Each Pub
 * descriptor for which CheckSegPub() returns true is moved to "descr", and
 * the descriptors CheckSegPub() collected from the other entries are removed
 * there by CleanUpSeqDescrPub(). All other descriptors stay in place. */
2052  CBioseq& bioseq = entries.front()->SetSeq();
2053  CSeq_descr::Tdata& descr_list = bioseq.SetDescr().Set();
2054 
2055  for (CSeq_descr::Tdata::iterator cur_descr = descr_list.begin(); cur_descr != descr_list.end();) {
2056  if ((*cur_descr)->IsPub()) {
2057  CPubdesc& pubdesc = (*cur_descr)->SetPub();
2058 
2059  std::set<CSeqdesc*> same_pub_descr;
2060  if (CheckSegPub(pubdesc, entries, same_pub_descr)) {
/* Copy the reference first, then unlink it from the first entry. */
2061  descr.Set().push_back(*cur_descr);
2062  cur_descr = descr_list.erase(cur_descr);
2063 
2064  CleanUpSeqDescrPub(entries, same_pub_descr);
2065  } else
2066  ++cur_descr;
2067  } else
2068  ++cur_descr;
2069  }
2070 }
2071 
2072 /**********************************************************
2073  *
2074  * static bool CheckSegDescrChoice(entry, choice):
2075  *
2076  * 5-18-93
2077  *
2078  **********************************************************/
2079 static bool CheckSegDescrChoice(const TEntryList& entries, Uint1 choice)
2080 {
2081  string org;
2082  CDate date;
2083  Int4 modif = -1;
2084 
2085  bool no_problem_found = true;
2086  for (TEntryList::const_iterator seq = entries.begin(); seq != entries.end(); ++seq) {
2087  const CSeq_descr& descr = (*seq)->GetDescr();
2088  const CSeq_descr::Tdata& descr_list = descr.Get();
2089 
2090  CSeq_descr::Tdata::const_iterator cur_descr = GetDescrByChoice(descr, choice);
2091 
2092  if (cur_descr == descr_list.end()) {
2093  no_problem_found = false;
2094  break;
2095  }
2096 
2097  if (choice == CSeqdesc::e_Org) {
2098  if (org.empty())
2099  org = (*cur_descr)->GetOrg().GetTaxname();
2100  else if (org != (*cur_descr)->GetOrg().GetTaxname()) {
2101  no_problem_found = false;
2102  break;
2103  }
2104  } else if (choice == CSeqdesc::e_Modif) {
2105  Int4 val = *(*cur_descr)->GetModif().begin();
2106  if (modif == -1)
2107  modif = val;
2108  else if (modif != val) {
2109  no_problem_found = false;
2110  break;
2111  }
2112  } else /* Seq_descr_update_date */
2113  {
2114  if (date.Which() == CDate::e_not_set)
2115  date.Assign((*cur_descr)->GetUpdate_date());
2116  else if (date.Compare((*cur_descr)->GetUpdate_date()) != CDate::eCompare_same) {
2117  no_problem_found = false;
2118  break;
2119  }
2120  }
2121  }
2122 
2123  return no_problem_found;
2124 }
2125 // LCOV_EXCL_STOP
2126 
2127 /**********************************************************
2128  *
2129  * static char* GetBioseqSetDescrTitle(descr):
2130  *
2131  * Copy title from the first one, truncate before
2132  * "complete cds" or "exon".
2133  *
2134  * 5-18-93
2135  *
2136  **********************************************************/
/* Returns a copy of the first Title descriptor found in "descr", cut off at
 * the first occurrence of "complete cds" or, if that is absent, of "exon".
 * Returns an empty optional if there is no Title descriptor.
 * NOTE(review): listing line 2159 (a statement after the resize) is missing
 * from this extract. */
2137 static optional<string> GetBioseqSetDescrTitle(const CSeq_descr& descr)
2138 {
2139  const string* found = nullptr;
2140  for (auto it : descr.Get()) {
2141  if (it->IsTitle()) {
/* Points into the descriptor object itself, not into the loop variable. */
2142  found = &it->GetTitle();
2143  break;
2144  }
2145  }
2146 
2147  if (! found)
2148  return {};
2149 
2150  string title = *found;
2151 
2152  auto pos = title.find("complete cds");
2153  if (pos == string::npos) {
2154  pos = title.find("exon");
2155  }
2156 
2157  if (pos != string::npos) {
2158  title.resize(pos);
2160  }
2161 
2162  return title;
2163 }
2164 
2165 // LCOV_EXCL_START
2166 // Excluded per Mark's request on 12/14/2016
2167 /**********************************************************
2168  *
2169  * static void SrchSegDescr(TEntryList& entries, CSeq_descr& descr):
2170  *
2171  * Copy title from first one, truncate before
2172  * "complete cds" or "exon"
2173  * org, if they are all from one organism, then move
2174  * the data to this set, and make NULL to the sep chains
2175  * in which sep->mpData->descr->choice = Seq_descr_org.
2176  * modif, if they are all same modifier, then move
2177  * the data to this set, and make NULL to the sep chains
2178  * in which sep->mpData->descr->choice = Seq_descr_modif.
2179  *
2180  **********************************************************/
2182 {
/* NOTE(review): this block is heavily truncated in this extract: the
 * declaration (listing line 2181) and lines 2192, 2194, 2196, 2198 and
 * 2203-2205 are missing, so the conditions guarding the calls below are not
 * visible. The banner above names the function SrchSegDescr(entries, descr).
 *
 * Visible behaviour: a Title descriptor built from GetBioseqSetDescrTitle()
 * of the first entry's Bioseq is appended to "descr" when a title exists;
 * Org and Modif descriptors of that Bioseq are moved to "descr" by
 * GetFirstSegDescrChoice(); GetSegPub() is then applied to the entries. */
2183  CRef<CSeq_entry>& entry = entries.front();
2184  CBioseq& bioseq = entry->SetSeq();
2185 
2186  if (auto title = GetBioseqSetDescrTitle(bioseq.GetDescr())) {
2187  CRef<CSeqdesc> desc_new(new CSeqdesc);
2188  desc_new->SetTitle(*title);
2189  descr.Set().push_back(desc_new);
2190  }
2191 
2193  GetFirstSegDescrChoice(bioseq, CSeqdesc::e_Org, descr);
2195  }
2197  GetFirstSegDescrChoice(bioseq, CSeqdesc::e_Modif, descr);
2199  }
2200 
2201  GetSegPub(entries, descr);
2202 
2206  }
2207 }
2208 
2209 /**********************************************************/
2210 static void GetSegSetDblink(CSeq_descr& descr, TEntryList& entries /*SeqEntryPtr headsep*/, bool* drop)
2211 {
2212  if (entries.empty())
2213  return;
2214 
2215  CRef<CSeqdesc> gpid,
2216  dblink,
2217  cur_gpid,
2218  cur_dblink;
2219 
2220  Uint4 dblink_count = 0;
2221  Uint4 gpid_count = 0;
2222 
2223  bool bad_gpid = false;
2224  bool bad_dblink = false;
2225 
2226  for (auto& entry : entries) {
2227  cur_gpid.Reset();
2228  cur_dblink.Reset();
2229 
2230  CSeq_descr::Tdata& descr_list = entry->SetDescr();
2231 
2232  for (CSeq_descr::Tdata::iterator cur_descr = descr_list.begin(); cur_descr != descr_list.end();) {
2233  if (! (*cur_descr)->IsUser()) {
2234  ++cur_descr;
2235  continue;
2236  }
2237 
2238  const CUser_object& user = (*cur_descr)->GetUser();
2239  if (! user.CanGetType() || user.GetType().GetStr().empty()) {
2240  ++cur_descr;
2241  continue;
2242  }
2243 
2244  string type_str = user.GetType().GetStr();
2245 
2246  if (type_str == "DBLink") {
2247  if (cur_dblink.NotEmpty())
2248  continue;
2249 
2250  dblink_count++;
2251  cur_dblink = *cur_descr;
2252 
2253  if (dblink.Empty())
2254  dblink = cur_dblink;
2255 
2256  cur_descr = descr_list.erase(cur_descr);
2257  } else if (type_str == "GenomeProjectsDB") {
2258  if (cur_gpid.NotEmpty())
2259  continue;
2260 
2261  gpid_count++;
2262  cur_gpid = *cur_descr;
2263 
2264  if (gpid.Empty())
2265  gpid = cur_gpid;
2266 
2267  cur_descr = descr_list.erase(cur_descr);
2268  } else
2269  ++cur_descr;
2270  }
2271 
2272  if (cur_dblink.NotEmpty()) {
2273  if (dblink.Empty())
2274  dblink = cur_dblink;
2275  else {
2276  if (! cur_dblink->Equals(*dblink)) {
2277  bad_dblink = true;
2278  break;
2279  }
2280  }
2281  }
2282 
2283  if (cur_gpid.NotEmpty()) {
2284  if (gpid.Empty())
2285  gpid = cur_gpid;
2286  else {
2287  if (! cur_gpid->Equals(*gpid)) {
2288  bad_gpid = true;
2289  break;
2290  }
2291  }
2292  }
2293  }
2294 
2295  if (bad_dblink == false && bad_gpid == false) {
2296  if (dblink_count > 0 && entries.size() != dblink_count)
2297  bad_dblink = true;
2298  if (gpid_count > 0 && entries.size() != gpid_count)
2299  bad_gpid = true;
2300  }
2301 
2302  if (bad_dblink) {
2303  ErrPostStr(SEV_REJECT, ERR_SEGMENT_DBLinkMissingOrNonUnique, "One or more member of segmented set has missing or non-unique DBLink user-object. Entry dropped.");
2304  *drop = true;
2305  }
2306 
2307  if (bad_gpid) {
2308  ErrPostStr(SEV_REJECT, ERR_SEGMENT_GPIDMissingOrNonUnique, "One or more member of segmented set has missing or non-unique GPID user-object. Entry dropped.");
2309  *drop = true;
2310  }
2311 
2312  if (bad_gpid || bad_dblink ||
2313  (dblink.Empty() && gpid.Empty()) ||
2314  descr.Get().empty())
2315  return;
2316 
2317  if (dblink.NotEmpty())
2318  descr.Set().push_back(dblink);
2319  if (gpid.NotEmpty())
2320  descr.Set().push_back(gpid);
2321 }
2322 
2323 /**********************************************************
2324  *
2325  * static void GetBioseqSetDescr(entries, descr, drop)
2326  *
2327  * 1-20-16
2328  *
2329  **********************************************************/
2330 static void GetBioseqSetDescr(TEntryList& entries, CSeq_descr& descr, bool* drop)
2331 {
2332  SrchSegDescr(entries, descr); /* get from ASN.1 tree */
2333  GetSegSetDblink(descr, entries, drop);
2334 }
2335 
2336 /**********************************************************
2337  *
2338  * static const char *GetMoleculeClassString(mol):
2339  *
2340  * 6-25-93
2341  *
2342  **********************************************************/
2343 static const char* GetMoleculeClassString(Uint1 mol)
2344 {
2345  if (mol == 0)
2346  return ("not-set");
2347  if (mol == 1)
2348  return ("DNA");
2349  if (mol == 2)
2350  return ("RNA");
2351  if (mol == 3)
2352  return ("AA");
2353  if (mol == 4)
2354  return ("NA");
2355  return ("other");
2356 }
2357 
2358 /**********************************************************
2359  *
2360  * static CSeq_inst::EMol SrchSegSeqMol(entries):
2361  *
2362  * 5-14-93
2363  *
2364  **********************************************************/
2366 {
2367  const CBioseq& orig_bioseq = entries.front()->GetSeq();
2368  CSeq_inst::EMol mol = orig_bioseq.GetInst().GetMol();
2369 
2370  for (const auto& entry : entries) {
2371  const CBioseq& cur_bioseq = entry->GetSeq();
2372  if (mol == cur_bioseq.GetInst().GetMol())
2373  continue;
2374 
2375  ErrPostEx(SEV_WARNING, ERR_SEGMENT_DiffMolType, "Different molecule type in the segment set, \"%s\" to \"%s\"", GetMoleculeClassString(mol), GetMoleculeClassString(cur_bioseq.GetInst().GetMol()));
2376 
2377  return CSeq_inst::eMol_na;
2378  }
2379 
2380  return mol;
2381 }
2382 
2383 /**********************************************************
2384  *
2385  * static Int4 SrchSegLength(entries):
2386  *
2387  * 5-14-93
2388  *
2389  **********************************************************/
2391 {
2392  Int4 length = 0;
2393 
2394  for (const auto& entry : entries) {
2395  const CBioseq& cur_bioseq = entry->GetSeq();
2396  length += cur_bioseq.GetLength();
2397  }
2398 
2399  return (length);
2400 }
2401 
2402 /**********************************************************
2403  *
2404  * static CRef<CBioseq> GetBioseq(pp, orig_bioseq, slp):
2405  *
2406  * 5-12-93
2407  *
2408  **********************************************************/
2410 {
2411  IndexblkPtr ibp = pp->entrylist[pp->curindx];
2412  CRef<CBioseq> bioseq(new CBioseq);
2413 
2414  {
2415  string locusname = "SEG_";
2416  locusname.append(ibp->blocusname);
2417  bioseq->SetId().push_back(MakeSegSetSeqId(ibp->acnum, locusname, pp->seqtype, ibp->is_tpa));
2418  }
2419 
2420  if (pp->seg_acc) {
2421  string locusname = "SEG_";
2422  locusname.append(ibp->acnum);
2423  bioseq->SetId().push_back(MakeSegSetSeqId(ibp->acnum, locusname, pp->seqtype, ibp->is_tpa));
2424  }
2425 
2426  const CSeq_entry& first_entry = *(entries.front());
2427  const CBioseq& original = first_entry.GetSeq();
2428 
2429  if (auto title = GetBioseqSetDescrTitle(original.GetDescr())) {
2430  CRef<CSeqdesc> descr(new CSeqdesc);
2431  descr->SetTitle(*title);
2432  bioseq->SetDescr().Set().push_back(descr);
2433  }
2434 
2435  CSeq_inst& inst = bioseq->SetInst();
2437  inst.SetMol(SrchSegSeqMol(entries));
2438 
2439  bool need_null = false;
2440 
2441  CRef<CSeq_loc> null_loc(new CSeq_loc());
2442  null_loc->SetNull();
2443 
2444  for (CSeq_loc::const_iterator seq_it = slp.begin(); seq_it != slp.end(); ++seq_it) {
2445  if (need_null)
2446  inst.SetExt().SetSeg().Set().push_back(null_loc);
2447  else
2448  need_null = true;
2449 
2450  CRef<CSeq_loc> seqloc(new CSeq_loc());
2451  seqloc->Assign(seq_it.GetEmbeddingSeq_loc());
2452  inst.SetExt().SetSeg().Set().push_back(seqloc);
2453  }
2454 
2456  inst.SetFuzz().SetLim(CInt_fuzz::eLim_gt);
2457 
2458  return bioseq;
2459 }
2460 // LCOV_EXCL_STOP
2461 
2462 /**********************************************************
2463  *
2464  * void GetSeqExt(pp, slp):
2465  *
2466  * 5-12-93
2467  *
2468  **********************************************************/
2469 void GetSeqExt(ParserPtr pp, CSeq_loc& seq_loc)
2470 {
2471  const Indexblk* ibp;
2472 
2473  ibp = pp->entrylist[pp->curindx];
2474 
2475  CRef<CSeq_id> id = MakeAccSeqId(ibp->acnum, pp->seqtype, pp->accver, ibp->vernum);
2476 
2477  if (id.NotEmpty()) {
2478  CSeq_loc loc;
2479  loc.SetWhole(*id);
2480 
2481  seq_loc.Add(loc);
2482  }
2483 }
2484 
2485 // LCOV_EXCL_START
2486 // Excluded per Mark's request on 12/14/2016
2487 /**********************************************************
2488  *
2489  * SeqEntryPtr BuildBioSegHeader(pp, headsep, seqloc):
2490  *
2491  * 2-24-94
2492  *
2493  **********************************************************/
2495 {
2496  if (entries.empty())
2497  return;
2498 
2499  IndexblkPtr ibp = pp->entrylist[pp->curindx];
2500 
2501  CRef<CBioseq> bioseq = GetBioseq(pp, entries, seqloc); /* Bioseq, ext */
2502 
2503  CRef<CSeq_entry> bioseq_entry(new CSeq_entry);
2504  bioseq_entry->SetSeq(*bioseq);
2505 
2506  CRef<CBioseq_set> bioseq_set(new CBioseq_set);
2507  bioseq_set->SetSeq_set().assign(entries.begin(), entries.end());
2508  bioseq_set->SetClass(CBioseq_set::eClass_parts);
2509 
2510  CRef<CSeq_entry> bioseq_set_entry(new CSeq_entry);
2511  bioseq_set_entry->SetSet(*bioseq_set);
2512 
2513  CRef<CBioseq_set> bioseq_set_head(new CBioseq_set);
2514  bioseq_set_head->SetSeq_set().push_back(bioseq_entry);
2515  bioseq_set_head->SetSeq_set().push_back(bioseq_set_entry);
2516 
2517  CRef<CSeq_descr> descr(new CSeq_descr);
2518  GetBioseqSetDescr(bioseq_set->SetSeq_set(), *descr, &ibp->drop);
2519  bioseq_set_head->SetDescr(*descr);
2520  bioseq_set_head->SetClass(CBioseq_set::eClass_segset);
2521 
2522  CRef<CSeq_entry> bioseq_set_head_entry(new CSeq_entry);
2523  bioseq_set_head_entry->SetSet(*bioseq_set_head);
2524 
2525  entries.clear();
2526  entries.push_back(bioseq_set_head_entry);
2527 }
2528 
2529 /**********************************************************
2530  *
2531  * bool IsSegBioseq(const CSeq_id& id):
2532  *
2533  * 8-16-93
2534  *
2535  **********************************************************/
2536 bool IsSegBioseq(const CSeq_id& id)
2537 {
2538  if (id.Which() == CSeq_id::e_Patent)
2539  return false;
2540 
2541  const CTextseq_id* text_id = id.GetTextseq_Id();
2542 
2543  if (! text_id)
2544  return (false);
2545 
2546  if (! text_id->IsSetAccession() && text_id->IsSetName() &&
2547  StringEquN(text_id->GetName().c_str(), "SEG_", 4))
2548  return (true);
2549  return (false);
2550 }
2551 // LCOV_EXCL_STOP
2552 
2553 /**********************************************************
2554  *
2555  * char* check_div(pat_acc, pat_ref, est_kwd, sts_kwd,
2556  * gss_kwd, if_cds, div, tech, bases,
2557  * source, drop):
2558  *
2559  * 8-16-93
2560  *
2561  * gss and 1000 limit added.
2562  * 9-09-96
2563  *
2564  **********************************************************/
2565 bool check_div(bool pat_acc, bool pat_ref, bool est_kwd, bool sts_kwd, bool gss_kwd, bool if_cds, string& div, CMolInfo::TTech* tech, size_t bases, Parser::ESource source, bool& drop)
2566 {
2567  if (div.empty())
2568  return false;
2569 
2570  if (pat_acc || pat_ref || StringEqu(div.c_str(), "PAT")) {
2571  if (pat_ref == false) {
2572  ErrPostStr(SEV_REJECT, ERR_DIVISION_MissingPatentRef, "Record in the patent division lacks a reference to a patent document. Entry dropped.");
2573  drop = true;
2574  }
2575  if (est_kwd) {
2576  ErrPostStr(SEV_WARNING, ERR_DIVISION_PATHasESTKeywords, "EST keywords present on patent sequence.");
2577  }
2578  if (sts_kwd) {
2579  ErrPostStr(SEV_WARNING, ERR_DIVISION_PATHasSTSKeywords, "STS keywords present on patent sequence.");
2580  }
2581  if (gss_kwd) {
2582  ErrPostStr(SEV_WARNING, ERR_DIVISION_PATHasGSSKeywords, "GSS keywords present on patent sequence.");
2583  }
2584  if (if_cds && source != Parser::ESource::EMBL) {
2585  ErrPostStr(SEV_INFO, ERR_DIVISION_PATHasCDSFeature, "CDS features present on patent sequence.");
2586  }
2587  if (! StringEqu(div.c_str(), "PAT")) {
2588  if (pat_acc)
2589  ErrPostStr(SEV_WARNING, ERR_DIVISION_ShouldBePAT, "Based on the accession number prefix letters, this is a patent sequence, but the division code is not PAT.");
2590 
2591  ErrPostEx(SEV_INFO, ERR_DIVISION_MappedtoPAT, "Division %s mapped to PAT based on %s.", div.c_str(), (pat_acc == false) ? "patent reference" : "accession number");
2592  div = "PAT";
2593  }
2594  } else if (est_kwd) {
2595  if (if_cds) {
2596  if (StringEqu(div.c_str(), "EST")) {
2597  ErrPostStr(SEV_WARNING, ERR_DIVISION_ESTHasCDSFeature, "Coding region features exist and division is EST; EST might not be appropriate.");
2598  } else {
2599  ErrPostStr(SEV_INFO, ERR_DIVISION_NotMappedtoEST, "EST keywords exist, but this entry was not mapped to the EST division because of the presence of CDS features.");
2600  if (*tech == CMolInfo::eTech_est)
2601  *tech = CMolInfo::eTech_unknown;
2602  }
2603  } else if (bases > 1000) {
2604  if (StringEqu(div.c_str(), "EST")) {
2605  ErrPostEx(SEV_WARNING, ERR_DIVISION_LongESTSequence, "Division code is EST, but the length of the sequence is %ld.", bases);
2606  } else {
2607  ErrPostEx(SEV_WARNING, ERR_DIVISION_NotMappedtoEST, "EST keywords exist, but this entry was not mapped to the EST division because of the sequence length %ld.", bases);
2608  if (*tech == CMolInfo::eTech_est)
2609  *tech = CMolInfo::eTech_unknown;
2610  }
2611  } else {
2612  if (! StringEqu(div.c_str(), "EST"))
2613  ErrPostEx(SEV_INFO, ERR_DIVISION_MappedtoEST, "%s division mapped to EST.", div.c_str());
2614  *tech = CMolInfo::eTech_est;
2615  div.clear();
2616  }
2617  } else if (StringEqu(div.c_str(), "EST")) {
2618  ErrPostStr(SEV_WARNING, ERR_DIVISION_MissingESTKeywords, "Division is EST, but entry lacks EST-related keywords.");
2619  if (sts_kwd) {
2620  ErrPostStr(SEV_WARNING, ERR_DIVISION_ESTHasSTSKeywords, "STS keywords present on EST sequence.");
2621  }
2622  if (if_cds) {
2623  ErrPostStr(SEV_WARNING, ERR_DIVISION_ESTHasCDSFeature, "Coding region features exist and division is EST; EST might not be appropriate.");
2624  }
2625  } else if (sts_kwd) {
2626  if (if_cds) {
2627  if (StringEqu(div.c_str(), "STS")) {
2628  ErrPostStr(SEV_WARNING, ERR_DIVISION_STSHasCDSFeature, "Coding region features exist and division is STS; STS might not be appropriate.");
2629  } else {
2630  ErrPostStr(SEV_WARNING, ERR_DIVISION_NotMappedtoSTS, "STS keywords exist, but this entry was not mapped to the STS division because of the presence of CDS features.");
2631  if (*tech == CMolInfo::eTech_sts)
2632  *tech = CMolInfo::eTech_unknown;
2633  }
2634  } else if (bases > 1000) {
2635  if (StringEqu(div.c_str(), "STS")) {
2636  ErrPostEx(SEV_WARNING, ERR_DIVISION_LongSTSSequence, "Division code is STS, but the length of the sequence is %ld.", bases);
2637  } else {
2638  ErrPostEx(SEV_WARNING, ERR_DIVISION_NotMappedtoSTS, "STS keywords exist, but this entry was not mapped to the STS division because of the sequence length %ld.", bases);
2639  if (*tech == CMolInfo::eTech_sts)
2640  *tech = CMolInfo::eTech_unknown;
2641  }
2642  } else {
2643  if (! StringEqu(div.c_str(), "STS"))
2644  ErrPostEx(SEV_INFO, ERR_DIVISION_MappedtoSTS, "%s division mapped to STS.", div.c_str());
2645  *tech = CMolInfo::eTech_sts;
2646  div.clear();
2647  }
2648  } else if (StringEqu(div.c_str(), "STS")) {
2649  ErrPostStr(SEV_WARNING, ERR_DIVISION_MissingSTSKeywords, "Division is STS, but entry lacks STS-related keywords.");
2650  if (if_cds) {
2651  ErrPostStr(SEV_WARNING, ERR_DIVISION_STSHasCDSFeature, "Coding region features exist and division is STS; STS might not be appropriate.");
2652  }
2653  } else if (gss_kwd) {
2654  if (if_cds) {
2655  if (StringEqu(div.c_str(), "GSS")) {
2656  ErrPostStr(SEV_WARNING, ERR_DIVISION_GSSHasCDSFeature, "Coding region features exist and division is GSS; GSS might not be appropriate.");
2657  } else {
2658  ErrPostStr(SEV_WARNING, ERR_DIVISION_NotMappedtoGSS, "GSS keywords exist, but this entry was not mapped to the GSS division because of the presence of CDS features.");
2659  if (*tech == CMolInfo::eTech_survey)
2660  *tech = CMolInfo::eTech_unknown;
2661  }
2662  } else if (bases > 2500) {
2663  if (StringEqu(div.c_str(), "GSS")) {
2664  ErrPostEx(SEV_WARNING, ERR_DIVISION_LongGSSSequence, "Division code is GSS, but the length of the sequence is %ld.", bases);
2665  } else {
2666  ErrPostEx(SEV_WARNING, ERR_DIVISION_NotMappedtoGSS, "GSS keywords exist, but this entry was not mapped to the GSS division because of the sequence length %ld.", bases);
2667  if (*tech == CMolInfo::eTech_survey)
2668  *tech = CMolInfo::eTech_unknown;
2669  }
2670  } else {
2671  if (! StringEqu(div.c_str(), "GSS"))
2672  ErrPostEx(SEV_INFO, ERR_DIVISION_MappedtoGSS, "%s division mapped to GSS.", div.c_str());
2673  *tech = CMolInfo::eTech_survey;
2674  div.clear();
2675  }
2676  } else if (StringEqu(div.c_str(), "GSS")) {
2677  ErrPostStr(SEV_WARNING, ERR_DIVISION_MissingGSSKeywords, "Division is GSS, but entry lacks GSS-related keywords.");
2678  if (if_cds) {
2679  ErrPostStr(SEV_WARNING, ERR_DIVISION_GSSHasCDSFeature, "Coding region features exist and division is GSS; GSS might not be appropriate.");
2680  }
2681  } else if (StringEqu(div.c_str(), "TSA")) {
2682  *tech = CMolInfo::eTech_tsa;
2683  div.clear();
2684  }
2685 
2686  return ! div.empty();
2687 }
2688 
2689 /**********************************************************/
2690 CRef<CSeq_id> StrToSeqId(const char* pch, bool pid)
2691 {
2692  long lID;
2693  char* pchEnd;
2694 
2695  CRef<CSeq_id> id;
2696 
2697  /* Figure out--what source is it */
2698  if (*pch == 'd' || *pch == 'e') {
2699  /* Get ID */
2700  errno = 0; /* clear errors, the error flag from stdlib */
2701  lID = strtol(pch + 1, &pchEnd, 10);
2702 
2703  if (! ((lID == 0 && pch + 1 == pchEnd) || (lID == LONG_MAX && errno == ERANGE))) {
2704  /* Allocate new SeqId */
2705 
2706  id = new CSeq_id;
2708  tag->SetStr(string(pch, pchEnd - pch));
2709 
2710  CRef<CDbtag> dbtag(new CDbtag);
2711  dbtag->SetTag(*tag);
2712  dbtag->SetDb(pid ? "PID" : "NID");
2713 
2714  id->SetGeneral(*dbtag);
2715  }
2716  }
2717 
2718  return id;
2719 }
2720 
2721 /**********************************************************/
2722 void AddNIDSeqId(CBioseq& bioseq, const DataBlk& entry, Int2 type, Int2 coldata, Parser::ESource source)
2723 {
2724  DataBlkPtr dbp;
2725  char* offset;
2726 
2727  dbp = TrackNodeType(entry, type);
2728  if (! dbp)
2729  return;
2730 
2731  offset = dbp->mOffset + coldata;
2732  CRef<CSeq_id> sid = StrToSeqId(offset, false);
2733  if (sid.Empty())
2734  return;
2735 
2736  if (! (*offset == 'g' && (source == Parser::ESource::DDBJ || source == Parser::ESource::EMBL)))
2737  bioseq.SetId().push_back(sid);
2738 }
2739 
2740 /**********************************************************/
2741 static void CheckDivCode(TEntryList& seq_entries, ParserPtr pp)
2742 {
2743  for (auto& entry : seq_entries) {
2744  for (CTypeIterator<CBioseq> bioseq(Begin(*entry)); bioseq; ++bioseq) {
2745  if (bioseq->IsSetDescr()) {
2746  CGB_block* gb_block = nullptr;
2747  CMolInfo* molinfo = nullptr;
2749 
2750  for (auto& descr : bioseq->SetDescr().Set()) {
2751  if (descr->IsGenbank() && ! gb_block)
2752  gb_block = &descr->SetGenbank();
2753  else if (descr->IsMolinfo() && ! molinfo) {
2754  molinfo = &descr->SetMolinfo();
2755  tech = molinfo->GetTech();
2756  }
2757 
2758  if (gb_block && molinfo)
2759  break;
2760  }
2761 
2762  if (! gb_block)
2763  continue;
2764 
2765  IndexblkPtr ibp = pp->entrylist[pp->curindx];
2766 
2767  if (tech == CMolInfo::eTech_tsa &&
2768  ! NStr::CompareNocase(ibp->division, "TSA"))
2769  continue;
2770 
2771  if (! gb_block->IsSetDiv()) {
2772  ErrPostStr(SEV_WARNING, ERR_DIVISION_GBBlockDivision, "input division code is preserved in GBBlock");
2773  gb_block->SetDiv(ibp->division);
2774  }
2775  }
2776  }
2777  }
2778 }
2779 
2780 /**********************************************************/
2781 static const CBioSource* GetTopBiosource(const CSeq_entry& entry)
2782 {
2783  const TSeqdescList& descrs = GetDescrPointer(entry);
2784  for (const auto& descr : descrs) {
2785  if (descr->IsSource())
2786  return &(descr->GetSource());
2787  }
2788 
2789  return nullptr;
2790 }
2791 
2792 /**********************************************************/
2793 static bool SeqEntryCheckTaxonDiv(const CSeq_entry& entry)
2794 {
2795  const CBioSource* bio_src = GetTopBiosource(entry);
2796  if (! bio_src)
2797  return false;
2798 
2799  if (! bio_src->IsSetOrg() || ! bio_src->GetOrg().IsSetOrgname() || ! bio_src->GetOrg().GetOrgname().IsSetDiv())
2800  return false;
2801 
2802  return true;
2803 }
2804 
2805 /**********************************************************/
2807 {
2808  if (seq_entries.empty())
2809  return;
2810 
2811  if (! SeqEntryCheckTaxonDiv(*seq_entries.front())) {
2812  CheckDivCode(seq_entries, pp);
2813  }
2814 }
2815 
2816 /**********************************************************/
2817 void DefVsHTGKeywords(CMolInfo::TTech tech, const DataBlk& entry, Int2 what, Int2 ori, bool cancelled)
2818 {
2819  DataBlkPtr dbp;
2820  const char** b;
2821  char* tmp;
2822  char* p;
2823  char* q;
2824  char* r;
2825  Int2 count;
2826 
2827  dbp = TrackNodeType(entry, what);
2828  if (! dbp || ! dbp->mOffset || dbp->len < 1)
2829  p = nullptr;
2830  else {
2831  tmp = StringSave(string_view(dbp->mOffset, dbp->len - 1));
2832  for (q = tmp; *q != '\0'; q++) {
2833  if (*q == '\n' && StringEquN(q + 1, "DE ", 5))
2834  fta_StringCpy(q, q + 5);
2835  else if (*q == '\n' || *q == '\t')
2836  *q = ' ';
2837  }
2838  for (q = tmp, p = tmp; *p != '\0'; p++) {
2839  if (*p == ' ' && p[1] == ' ')
2840  continue;
2841  *q++ = *p;
2842  }
2843  *q = '\0';
2844  for (b = magic_phrases, p = nullptr; *b && ! p; b++)
2845  p = StringStr(tmp, *b);
2846  MemFree(tmp);
2847  }
2848 
2849  if ((tech == CMolInfo::eTech_htgs_0 || tech == CMolInfo::eTech_htgs_1 ||
2850  tech == CMolInfo::eTech_htgs_2) &&
2851  ! p && ! cancelled) {
2852  ErrPostStr(SEV_WARNING, ERR_DEFINITION_HTGNotInProgress, "This Phase 0, 1 or 2 HTGS sequence is lacking an indication that sequencing is still in progress on its definition/description line.");
2853  } else if (tech == CMolInfo::eTech_htgs_3 && p) {
2854  ErrPostStr(SEV_ERROR, ERR_DEFINITION_HTGShouldBeComplete, "This complete Phase 3 sequence has a definition/description line indicating that its sequencing is still in progress.");
2855  }
2856 
2857  if (tech != CMolInfo::eTech_htgs_3)
2858  return;
2859 
2860  dbp = TrackNodeType(entry, ori);
2861  if (! dbp || ! dbp->mOffset || dbp->len < 1)
2862  return;
2863  r = new char[dbp->len + 1];
2864  if (! r)
2865  return;
2866  StringNCpy(r, dbp->mOffset, dbp->len);
2867  r[dbp->len] = '\0';
2868  for (p = r, q = r; *p != '\0'; p++)
2869  if (*p >= 'a' && *p <= 'z')
2870  *q++ = *p;
2871  *q = '\0';
2872 
2873  for (count = 0, p = r; *p != '\0'; p++) {
2874  if (*p != 'n')
2875  count = 0;
2876  else if (++count > 10) {
2877  ErrPostStr(SEV_WARNING, ERR_SEQUENCE_UnknownBaseHTG3, "This complete Phase 3 HTGS sequence has one or more runs of 10 contiguous unknown ('n') bases.");
2878  break;
2879  }
2880  }
2881  delete[] r;
2882 }
2883 
2884 /**********************************************************/
2885 void XMLDefVsHTGKeywords(CMolInfo::TTech tech, const char* entry, XmlIndexPtr xip, bool cancelled)
2886 {
2887  const char** b;
2888  char* tmp;
2889  char* p;
2890  char* q;
2891  char* r;
2892  Int2 count;
2893 
2894  if (! entry || ! xip)
2895  return;
2896 
2898  if (! tmp)
2899  p = nullptr;
2900  else {
2901  for (q = tmp; *q != '\0'; q++)
2902  if (*q == '\n' || *q == '\t')
2903  *q = ' ';
2904  for (q = tmp, p = tmp; *p != '\0'; p++) {
2905  if (*p == ' ' && p[1] == ' ')
2906  continue;
2907  *q++ = *p;
2908  }
2909  *q = '\0';
2910  for (b = magic_phrases, p = nullptr; *b && ! p; b++)
2911  p = StringStr(tmp, *b);
2912  MemFree(tmp);
2913  }
2914 
2915  if ((tech == CMolInfo::eTech_htgs_0 || tech == CMolInfo::eTech_htgs_1 ||
2916  tech == CMolInfo::eTech_htgs_2) &&
2917  ! p && ! cancelled) {
2918  ErrPostStr(SEV_WARNING, ERR_DEFINITION_HTGNotInProgress, "This Phase 0, 1 or 2 HTGS sequence is lacking an indication that sequencing is still in progress on its definition/description line.");
2919  } else if (tech == CMolInfo::eTech_htgs_3 && p) {
2920  ErrPostStr(SEV_ERROR, ERR_DEFINITION_HTGShouldBeComplete, "This complete Phase 3 sequence has a definition/description line indicating that its sequencing is still in progress.");
2921  }
2922 
2923  if (tech != CMolInfo::eTech_htgs_3)
2924  return;
2925 
2926  r = StringSave(XMLFindTagValue(entry, xip, INSDSEQ_SEQUENCE));
2927  if (! r)
2928  return;
2929 
2930  for (count = 0, p = r; *p != '\0'; p++) {
2931  if (*p != 'n')
2932  count = 0;
2933  else if (++count > 10) {
2934  ErrPostStr(SEV_WARNING, ERR_SEQUENCE_UnknownBaseHTG3, "This complete Phase 3 HTGS sequence has one or more runs of 10 contiguous unknown ('n') bases.");
2935  break;
2936  }
2937  }
2938  MemFree(r);
2939 }
2940 
2941 /**********************************************************/
2942 void CheckHTGDivision(const char* div, CMolInfo::TTech tech)
2943 {
2944  if (div && StringEqu(div, "HTG") && tech == CMolInfo::eTech_htgs_3) {
2945  ErrPostStr(SEV_WARNING, ERR_DIVISION_ShouldNotBeHTG, "This Phase 3 HTGS sequence is still in the HTG division. If truly complete, it should move to a non-HTG division.");
2946  } else if ((! div || ! StringEqu(div, "HTG")) &&
2947  (tech == CMolInfo::eTech_htgs_0 || tech == CMolInfo::eTech_htgs_1 ||
2948  tech == CMolInfo::eTech_htgs_2)) {
2949  ErrPostStr(SEV_ERROR, ERR_DIVISION_ShouldBeHTG, "Phase 0, 1 or 2 HTGS sequences should have division code HTG.");
2950  }
2951 }
2952 
2953 /**********************************************************/
2955 {
2956  if (entry.IsSeq())
2957  return entry.GetSeq().GetDescr();
2958 
2959  return entry.GetSet().GetDescr();
2960 }
2961 
2962 /**********************************************************/
// Trims a visible string in place:
//   - leading characters <= ' ' are removed;
//   - trailing ';' and characters <= ' ' are removed, except that a single
//     ';' is kept when it closes an "&...;" entity (an '&' is found before
//     any blank or ';' when scanning backwards from it);
//   - a string consisting only of such characters becomes empty.
static void CleanVisString(string& str)
{
    if (str.empty())
        return;

    // Skip leading blanks and control characters. The bound has to be
    // "start_pos < size": with the comparison reversed the loop never runs
    // and leading whitespace is silently kept.
    size_t start_pos = 0;
    while (start_pos < str.size() && str[start_pos] <= ' ')
        ++start_pos;

    if (start_pos == str.size()) {
        str.clear();
        return;
    }

    str.erase(0, start_pos);

    // Walk back over the trailing ';' / blank run. On exit end_pos is the
    // index of the first character of that run (str.size() if there is no
    // run), or 0 if the whole string belongs to it.
    size_t end_pos = str.size() - 1;
    for (;; --end_pos) {
        if (str[end_pos] == ';' || str[end_pos] <= ' ') {
            if (end_pos == 0)
                break;
            continue;
        }
        ++end_pos;
        break;
    }

    // str[str.size()] is the terminating null character, so the test below
    // is safe when nothing trails.
    if (str[end_pos] != ';' || end_pos == 0) {
        if (end_pos == 0)
            str.clear();
        else
            str = str.substr(0, end_pos);

        return;
    }

    // The trailing run starts with ';': keep that one ';' if it terminates
    // an entity such as "&amp;".
    size_t amp_pos = end_pos - 1;
    for (; amp_pos; --amp_pos) {
        if (str[amp_pos] == ' ' || str[amp_pos] == '&' || str[amp_pos] == ';')
            break;
    }

    if (str[amp_pos] == '&')
        ++end_pos;

    str = str.substr(0, end_pos);
}
3009 
3010 /**********************************************************/
3011 static void CleanVisStringList(list<string>& str_list)
3012 {
3013  for (list<string>::iterator it = str_list.begin(); it != str_list.end();) {
3014  CleanVisString(*it);
3015 
3016  if (it->empty())
3017  it = str_list.erase(it);
3018  else
3019  ++it;
3020  }
3021 }
3022 
3023 /**********************************************************/
3024 static void CheckGBBlock(TSeqdescList& descrs, bool& got)
3025 {
3026  const Char* div = nullptr;
3027 
3028  for (const auto& descr : descrs) {
3029  if (! descr->IsEmbl())
3030  continue;
3031 
3032  if (! descr->GetEmbl().IsSetDiv() || descr->GetEmbl().GetDiv() > 15)
3033  continue;
3034 
3035  div = GetEmblDiv(descr->GetEmbl().GetDiv());
3036  break;
3037  }
3038 
3039  for (TSeqdescList::iterator descr = descrs.begin(); descr != descrs.end();) {
3040  if (! (*descr)->IsGenbank()) {
3041  ++descr;
3042  continue;
3043  }
3044 
3045  CGB_block& gb_block = (*descr)->SetGenbank();
3046  if (div && gb_block.IsSetDiv() && NStr::CompareNocase(div, gb_block.GetDiv().c_str()) == 0)
3047  gb_block.ResetDiv();
3048 
3049  if (gb_block.IsSetSource()) {
3050  got = true;
3051  } else if (gb_block.IsSetDiv() && gb_block.GetDiv() != "PAT" &&
3052  gb_block.GetDiv() != "SYN") {
3053  got = true;
3054  }
3055 
3056  if (gb_block.IsSetExtra_accessions()) {
3058  if (gb_block.GetExtra_accessions().empty())
3059  gb_block.ResetExtra_accessions();
3060  }
3061 
3062 
3063  if (gb_block.IsSetKeywords()) {
3064  CleanVisStringList(gb_block.SetKeywords());
3065  if (gb_block.GetKeywords().empty())
3066  gb_block.ResetKeywords();
3067  }
3068 
3069  if (gb_block.IsSetSource()) {
3070  string& buf = gb_block.SetSource();
3072  if (buf.empty())
3073  gb_block.ResetSource();
3074  }
3075 
3076  if (gb_block.IsSetOrigin()) {
3077  string& buf = gb_block.SetOrigin();
3079  if (buf.empty())
3080  gb_block.ResetOrigin();
3081  }
3082 
3083  if (gb_block.IsSetDate()) {
3084  string& buf = gb_block.SetDate();
3086  if (buf.empty())
3087  gb_block.ResetDate();
3088  }
3089 
3090  if (gb_block.IsSetDiv()) {
3091  string& buf = gb_block.SetDiv();
3093  if (buf.empty())
3094  gb_block.ResetDiv();
3095  }
3096 
3097  if (! gb_block.IsSetExtra_accessions() && ! gb_block.IsSetSource() &&
3098  ! gb_block.IsSetKeywords() && ! gb_block.IsSetOrigin() &&
3099  ! gb_block.IsSetDate() && ! gb_block.IsSetEntry_date() &&
3100  ! gb_block.IsSetDiv()) {
3101  descr = descrs.erase(descr);
3102  } else {
3103  ++descr;
3104  }
3105  }
3106 }
3107 
3108 /**********************************************************/
3110 {
3111  bool got = false;
3112 
3113  for (auto& entry : seq_entries) {
3114  for (CTypeIterator<CBioseq> bioseq(Begin(*entry)); bioseq; ++bioseq) {
3115  if (bioseq->IsSetDescr())
3116  CheckGBBlock(bioseq->SetDescr().Set(), got);
3117  }
3118 
3119  for (CTypeIterator<CBioseq_set> bio_set(Begin(*entry)); bio_set; ++bio_set) {
3120  if (bio_set->IsSetDescr())
3121  CheckGBBlock(bio_set->SetDescr().Set(), got);
3122  }
3123  }
3124 
3125  return (got);
3126 }
3127 
3128 /**********************************************************/
3129 static int GetSerialNumFromPubEquiv(const CPub_equiv& pub_eq)
3130 {
3131  int ret = -1;
3132  for (const auto& pub : pub_eq.Get()) {
3133  if (pub->IsGen()) {
3134  if (pub->GetGen().IsSetSerial_number()) {
3135  ret = pub->GetGen().GetSerial_number();
3136  break;
3137  }
3138  }
3139  }
3140 
3141  return ret;
3142 }
3143 
3144 /**********************************************************/
3145 static bool fta_if_pubs_sorted(const CPub_equiv& pub1, const CPub_equiv& pub2)
3146 {
3147  Int4 num1 = GetSerialNumFromPubEquiv(pub1);
3148  Int4 num2 = GetSerialNumFromPubEquiv(pub2);
3149 
3150  return num1 < num2;
3151 }
3152 
3153 /**********************************************************/
3154 static bool descr_cmp(const CRef<CSeqdesc>& desc1,
3155  const CRef<CSeqdesc>& desc2)
3156 {
3157  if (desc1->Which() == desc2->Which() && desc1->IsPub()) {
3158  const CPub_equiv& pub1 = desc1->GetPub().GetPub();
3159  const CPub_equiv& pub2 = desc2->GetPub().GetPub();
3160  return fta_if_pubs_sorted(pub1, pub2);
3161  }
3162  if (desc1->Which() == desc2->Which() && desc1->IsUser()) {
3163  const CUser_object& uop1 = desc1->GetUser();
3164  const CUser_object& uop2 = desc2->GetUser();
3165  const char* str1;
3166  const char* str2;
3167  if (uop1.IsSetType() && uop1.GetType().IsStr() &&
3168  uop2.IsSetType() && uop2.GetType().IsStr()) {
3169  str1 = uop1.GetType().GetStr().c_str();
3170  str2 = uop2.GetType().GetStr().c_str();
3171  if (strcmp(str1, str2) <= 0)
3172  return (true);
3173  return (false);
3174  }
3175  }
3176 
3177  return desc1->Which() < desc2->Which();
3178 }
3179 
3180 /**********************************************************/
3181 void fta_sort_descr(TEntryList& seq_entries)
3182 {
3183  for (auto& entry : seq_entries) {
3184  for (CTypeIterator<CBioseq> bioseq(Begin(*entry)); bioseq; ++bioseq) {
3185  if (bioseq->IsSetDescr())
3186  bioseq->SetDescr().Set().sort(descr_cmp);
3187  }
3188 
3189  for (CTypeIterator<CBioseq_set> bio_set(Begin(*entry)); bio_set; ++bio_set) {
3190  if (bio_set->IsSetDescr())
3191  bio_set->SetDescr().Set().sort(descr_cmp);
3192  }
3193  }
3194 }
3195 
3196 /**********************************************************/
3197 static bool pub_cmp(const CRef<CPub>& pub1, const CRef<CPub>& pub2)
3198 {
3199  if (pub1->Which() == pub2->Which()) {
3200  if (pub1->IsMuid()) {
3201  return pub1->GetMuid() < pub2->GetMuid();
3202  } else if (pub1->IsGen()) {
3203  const CCit_gen& cit1 = pub1->GetGen();
3204  const CCit_gen& cit2 = pub2->GetGen();
3205 
3206  if (cit1.IsSetCit() && cit2.IsSetCit())
3207  return cit1.GetCit() < cit2.GetCit();
3208  }
3209  }
3210 
3211  return pub1->Which() < pub2->Which();
3212 }
3213 
3214 /**********************************************************/
3215 static void sort_feat_cit(CBioseq::TAnnot& annots)
3216 {
3217  for (auto& annot : annots) {
3218  if (annot->IsFtable()) {
3219  for (auto& feat : annot->SetData().SetFtable()) {
3220  if (feat->IsSetCit() && feat->GetCit().IsPub()) {
3221  // feat->SetCit().SetPub().sort(pub_cmp); TODO: may be this sort would be OK, the only difference with original one is it is stable
3222 
3223  TPubList& pubs = feat->SetCit().SetPub();
3224  for (TPubList::iterator pub = pubs.begin(); pub != pubs.end(); ++pub) {
3225  TPubList::iterator next_pub = pub;
3226  for (++next_pub; next_pub != pubs.end(); ++next_pub) {
3227  if (pub_cmp(*next_pub, *pub))
3228  swap(*next_pub, *pub);
3229  }
3230  }
3231  }
3232  }
3233  }
3234  }
3235 }
3236 
3237 /**********************************************************/
3239 {
3240  for (auto& entry : seq_entries) {
3241  for (CTypeIterator<CBioseq> bioseq(Begin(*entry)); bioseq; ++bioseq) {
3242  if (bioseq->IsSetAnnot())
3243  sort_feat_cit(bioseq->SetAnnot());
3244  }
3245 
3246  for (CTypeIterator<CBioseq_set> bio_set(Begin(*entry)); bio_set; ++bio_set) {
3247  if (bio_set->IsSetAnnot())
3248  sort_feat_cit(bio_set->SetAnnot());
3249  }
3250  }
3251 }
3252 
3253 /**********************************************************/
3255 {
3256  for (const auto& tag : dbtags) {
3257  if (tag->IsSetDb() && tag->IsSetTag() &&
3258  ! tag->GetTag().IsStr() && tag->GetTag().GetId() > 0 &&
3259  tag->GetDb() == "taxon")
3260  return true;
3261  }
3262  return false;
3263 }
3264 
3265 /**********************************************************/
3266 void fta_fix_orgref_div(const CBioseq::TAnnot& annots, COrg_ref* org_ref, CGB_block& gbb)
3267 {
3268  Int4 count;
3269 
3270  if (! org_ref || ! gbb.IsSetDiv())
3271  return;
3272 
3273  count = 1;
3274  if (org_ref->IsSetOrgname() && ! org_ref->GetOrgname().IsSetDiv() &&
3275  ! fta_orgref_has_taxid(org_ref->GetDb())) {
3276  org_ref->SetOrgname().SetDiv(gbb.GetDiv());
3277  count--;
3278  }
3279 
3280  for (const auto& annot : annots) {
3281  if (! annot->IsFtable())
3282  continue;
3283 
3284  const CSeq_annot::C_Data::TFtable& feats = annot->GetData().GetFtable();
3285  for (const auto& feat : feats) {
3286  if (! feat->IsSetData() || ! feat->GetData().IsBiosrc())
3287  continue;
3288 
3289  count++;
3290 
3291  const CBioSource& bio_src = feat->GetData().GetBiosrc();
3292  if (bio_src.IsSetOrg() && ! fta_orgref_has_taxid(bio_src.GetOrg().GetDb())) {
3293  org_ref->SetOrgname().SetDiv(gbb.GetDiv());
3294  count--;
3295  }
3296  }
3297  }
3298 
3299  if (count > 0)
3300  return;
3301 
3302  gbb.ResetDiv();
3303 }
3304 
3305 /**********************************************************/
3306 bool XMLCheckCDS(const char* entry, XmlIndexPtr xip)
3307 {
3308  XmlIndexPtr txip;
3309  XmlIndexPtr fxip;
3310 
3311  if (! entry || ! xip)
3312  return (false);
3313 
3314  for (; xip; xip = xip->next)
3315  if (xip->tag == INSDSEQ_FEATURE_TABLE && xip->subtags)
3316  break;
3317  if (! xip)
3318  return (false);
3319 
3320  for (txip = xip->subtags; txip; txip = txip->next) {
3321  if (! txip->subtags)
3322  continue;
3323  for (fxip = txip->subtags; fxip; fxip = fxip->next)
3324  if (fxip->tag == INSDFEATURE_KEY && fxip->end - fxip->start == 3 &&
3325  StringEquN(entry + fxip->start, "CDS", 3))
3326  break;
3327  if (fxip)
3328  break;
3329  }
3330 
3331  if (! txip)
3332  return (false);
3333  return (true);
3334 }
3335 
3336 /**********************************************************/
3338 {
3339  for (auto& entry : seq_entries) {
3340  for (CTypeIterator<CBioseq> bioseq(Begin(*entry)); bioseq; ++bioseq) {
3341  if (bioseq->IsSetInst() && bioseq->GetInst().IsSetStrand())
3342  continue;
3343 
3344  if (bioseq->GetInst().IsSetMol()) {
3345  CSeq_inst::EMol mol = bioseq->GetInst().GetMol();
3346  if (mol == CSeq_inst::eMol_dna)
3347  bioseq->SetInst().SetStrand(CSeq_inst::eStrand_ds);
3348  else if (mol == CSeq_inst::eMol_rna || mol == CSeq_inst::eMol_aa)
3349  bioseq->SetInst().SetStrand(CSeq_inst::eStrand_ss);
3350  }
3351  }
3352  }
3353 }
3354 
3355 /*****************************************************************************/
3356 static bool SwissProtIDPresent(const TEntryList& seq_entries)
3357 {
3358  for (const auto& entry : seq_entries) {
3359  for (CTypeConstIterator<CBioseq> bioseq(Begin(*entry)); bioseq; ++bioseq) {
3360  if (bioseq->IsSetId()) {
3361  for (const auto& id : bioseq->GetId()) {
3362  if (id->IsSwissprot())
3363  return true;
3364  }
3365  }
3366  }
3367  }
3368 
3369  return false;
3370 }
3371 
3372 /*****************************************************************************/
3373 static bool IsCitEmpty(const CCit_gen& cit)
3374 {
3375  if (cit.IsSetCit() || cit.IsSetAuthors() || cit.IsSetMuid() ||
3376  cit.IsSetJournal() || cit.IsSetVolume() || cit.IsSetIssue() ||
3377  cit.IsSetPages() || cit.IsSetDate() || cit.IsSetTitle() ||
3378  cit.IsSetPmid() || cit.IsSetSerial_number())
3379  return false;
3380 
3381  return true;
3382 }
3383 
3384 /*****************************************************************************/
3385 static void RemoveSerials(TPubList& pubs)
3386 {
3387  for (TPubList::iterator pub = pubs.begin(); pub != pubs.end();) {
3388  if ((*pub)->IsGen()) {
3389  if ((*pub)->GetGen().IsSetSerial_number())
3390  (*pub)->SetGen().ResetSerial_number();
3391 
3392  if (IsCitEmpty((*pub)->GetGen()))
3393  pub = pubs.erase(pub);
3394  else
3395  ++pub;
3396  } else
3397  ++pub;
3398  }
3399 }
3400 
3401 /*****************************************************************************/
3402 void StripSerialNumbers(TEntryList& seq_entries)
3403 {
3404  if (! SwissProtIDPresent(seq_entries)) {
3405  for (auto& entry : seq_entries) {
3406  for (CTypeIterator<CPubdesc> pubdesc(Begin(*entry)); pubdesc; ++pubdesc) {
3407  if (pubdesc->IsSetPub()) {
3408  RemoveSerials(pubdesc->SetPub().Set());
3409  if (pubdesc->GetPub().Get().empty())
3410  pubdesc->ResetPub();
3411  }
3412  }
3413 
3414  for (CTypeIterator<CSeq_feat> feat(Begin(*entry)); feat; ++feat) {
3415  if (feat->IsSetData()) {
3416  if (feat->GetData().IsPub()) {
3417  RemoveSerials(feat->SetData().SetPub().SetPub().Set());
3418  if (feat->GetData().GetPub().GetPub().Get().empty())
3419  feat->SetData().SetPub().ResetPub();
3420  } else if (feat->GetData().IsImp()) {
3421  CImp_feat& imp = feat->SetData().SetImp();
3422  if (imp.IsSetKey() && imp.GetKey() == "Site-ref" && feat->IsSetCit() && feat->GetCit().IsPub()) {
3423  RemoveSerials(feat->SetCit().SetPub());
3424  if (feat->GetCit().GetPub().empty())
3425  feat->SetCit().Reset();
3426  }
3427  }
3428  }
3429  }
3430  }
3431  }
3432 }
3433 
3434 /*****************************************************************************/
3436 {
3437  const string* seq_str = nullptr;
3438  const vector<Char>* seq_vec = nullptr;
3439 
3441  size_t old_size = 0;
3442 
3443  switch (code) {
3444  case CSeq_data::e_Iupacaa:
3445  seq_str = &seq_data.GetIupacaa().Get();
3446  old_coding = CSeqUtil::e_Iupacaa;
3447  old_size = seq_str->size();
3448  break;
3449 
3450  case CSeq_data::e_Ncbi8aa:
3451  seq_vec = &seq_data.GetNcbi8aa().Get();
3452  old_coding = CSeqUtil::e_Ncbi8aa;
3453  old_size = seq_vec->size();
3454  break;
3455 
3457  seq_vec = &seq_data.GetNcbistdaa().Get();
3458  old_coding = CSeqUtil::e_Ncbistdaa;
3459  old_size = seq_vec->size();
3460  break;
3461 
3462  default:; // do nothing
3463  }
3464 
3465  std::vector<Char> new_seq(old_size);
3466  size_t new_size = 0;
3467  if (seq_str)
3468  new_size = CSeqConvert::Convert(seq_str->c_str(), old_coding, 0, static_cast<TSeqPos>(old_size), &new_seq[0], CSeqUtil::e_Ncbieaa);
3469  else if (seq_vec)
3470  new_size = CSeqConvert::Convert(&(*seq_vec)[0], old_coding, 0, static_cast<TSeqPos>(old_size), &new_seq[0], CSeqUtil::e_Ncbieaa);
3471 
3472  if (! new_seq.empty()) {
3473  seq_data.SetNcbieaa().Set().assign(new_seq.begin(), new_seq.begin() + new_size);
3474  }
3475 }
3476 
3477 /*****************************************************************************/
3478 static void RawBioseqPack(CBioseq& bioseq)
3479 {
3480  if (bioseq.GetInst().IsSetSeq_data()) {
3481  if (! bioseq.GetInst().IsSetMol() || ! bioseq.GetInst().IsNa()) {
3483  PackSeqData(code, bioseq.SetInst().SetSeq_data());
3484  } else if (! bioseq.GetInst().GetSeq_data().IsGap()) {
3485  CSeqportUtil::Pack(&bioseq.SetInst().SetSeq_data());
3486  }
3487  }
3488 }
3489 
3490 static void DeltaBioseqPack(CBioseq& bioseq)
3491 {
3492  if (bioseq.GetInst().IsSetExt() && bioseq.GetInst().GetExt().IsDelta()) {
3493  for (auto& delta : bioseq.SetInst().SetExt().SetDelta().Set()) {
3494  if (delta->IsLiteral() && delta->GetLiteral().IsSetSeq_data() && ! delta->GetLiteral().GetSeq_data().IsGap()) {
3495  CSeqportUtil::Pack(&delta->SetLiteral().SetSeq_data());
3496  }
3497  }
3498  }
3499 }
3500 
3501 /*****************************************************************************/
3502 void PackEntries(TEntryList& seq_entries)
3503 {
3504  for (auto& entry : seq_entries) {
3505  for (CTypeIterator<CBioseq> bioseq(Begin(*entry)); bioseq; ++bioseq) {
3506  if (bioseq->IsSetInst() && bioseq->GetInst().IsSetRepr()) {
3507  CSeq_inst::ERepr repr = bioseq->GetInst().GetRepr();
3508  if (repr == CSeq_inst::eRepr_raw || repr == CSeq_inst::eRepr_const)
3509  RawBioseqPack(*bioseq);
3510  else if (repr == CSeq_inst::eRepr_delta)
3511  DeltaBioseqPack(*bioseq);
3512  }
3513  }
3514  }
3515 }
3516 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
Data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
void StripECO(string &str)
Definition: add.cpp:2805
void xFixEMBLKeywords(string &keywordData)
Definition: asci_blk.cpp:1508
static void fta_check_mult_ids(DataBlkPtr dbp, const char *mtag, const char *ptag)
Definition: asci_blk.cpp:410
USING_SCOPE(objects)
static CSeq_inst::EMol SrchSegSeqMol(const TEntryList &entries)
Definition: asci_blk.cpp:2365
void BuildSubBlock(DataBlkPtr dbp, Int2 subtype, const char *subkw)
Definition: asci_blk.cpp:812
vector< string > genbankKeywords
Definition: gb_index.cpp:56
static void CleanUpSeqDescrPub(TEntryList &entries, std::set< CSeqdesc * > &to_clean)
Definition: asci_blk.cpp:2023
static bool CheckSegPub(const CPubdesc &pub, TEntryList &entries, std::set< CSeqdesc * > &same_pub_descr)
Definition: asci_blk.cpp:1934
static bool SwissProtIDPresent(const TEntryList &seq_entries)
Definition: asci_blk.cpp:3356
void GetGenBankSubBlock(const DataBlk &entry, size_t bases)
Definition: asci_blk.cpp:454
void StripSerialNumbers(TEntryList &seq_entries)
Definition: asci_blk.cpp:3402
static void RemoveSerials(TPubList &pubs)
Definition: asci_blk.cpp:3385
vector< string > emblKeywords
Definition: em_index.cpp:56
CRef< CSeq_id > StrToSeqId(const char *pch, bool pid)
Definition: asci_blk.cpp:2690
static void RemoveDescrByChoice(CSeq_descr &descr, Uint1 choice)
Definition: asci_blk.cpp:1988
static void GetSegPub(TEntryList &entries, CSeq_descr &descr)
Definition: asci_blk.cpp:2050
static void InsertDatablkVal(DataBlkPtr *dbp, Int2 type, char *offset, size_t len)
Definition: asci_blk.cpp:231
static optional< string > GetBioseqSetDescrTitle(const CSeq_descr &descr)
Definition: asci_blk.cpp:2137
static void fta_fix_secondaries(TokenBlkList &secs)
Definition: asci_blk.cpp:1268
static const CBioSource * GetTopBiosource(const CSeq_entry &entry)
Definition: asci_blk.cpp:2781
void AddNIDSeqId(CBioseq &bioseq, const DataBlk &entry, Int2 type, Int2 coldata, Parser::ESource source)
Definition: asci_blk.cpp:2722
void fta_fix_orgref_div(const CBioseq::TAnnot &annots, COrg_ref *org_ref, CGB_block &gbb)
Definition: asci_blk.cpp:3266
static void GetGenBankRefType(DataBlkPtr dbp, size_t bases)
Definition: asci_blk.cpp:337
static const char * GetMoleculeClassString(Uint1 mol)
Definition: asci_blk.cpp:2343
void DefVsHTGKeywords(CMolInfo::TTech tech, const DataBlk &entry, Int2 what, Int2 ori, bool cancelled)
Definition: asci_blk.cpp:2817
static bool CheckSegDescrChoice(const TEntryList &entries, Uint1 choice)
Definition: asci_blk.cpp:2079
static void CheckDivCode(TEntryList &seq_entries, ParserPtr pp)
Definition: asci_blk.cpp:2741
vector< string > swissProtKeywords
Definition: sp_index.cpp:54
CRef< CPatent_seq_id > MakeUsptoPatSeqId(const char *acc)
Definition: asci_blk.cpp:884
bool IsSegBioseq(const CSeq_id &id)
Definition: asci_blk.cpp:2536
void xGetGenBankSubBlocks(Entry &entry, size_t bases)
Definition: asci_blk.cpp:493
static void GetBioseqSetDescr(TEntryList &entries, CSeq_descr &descr, bool *drop)
Definition: asci_blk.cpp:2330
void fta_sort_seqfeat_cit(TEntryList &seq_entries)
Definition: asci_blk.cpp:3238
void PackEntries(TEntryList &seq_entries)
Definition: asci_blk.cpp:3502
static Int4 SrchSegLength(const TEntryList &entries)
Definition: asci_blk.cpp:2390
void fta_set_strandedness(TEntryList &seq_entries)
Definition: asci_blk.cpp:3337
static void fta_fix_tpa_keywords(TKeywordList &keywords)
Definition: asci_blk.cpp:1478
void CheckHTGDivision(const char *div, CMolInfo::TTech tech)
Definition: asci_blk.cpp:2942
unique_ptr< unsigned char[]> GetDNAConv(void)
Definition: asci_blk.cpp:1786
bool XMLCheckCDS(const char *entry, XmlIndexPtr xip)
Definition: asci_blk.cpp:3306
unique_ptr< unsigned char[]> GetProteinConv(void)
Definition: asci_blk.cpp:1814
static bool fta_if_pubs_sorted(const CPub_equiv &pub1, const CPub_equiv &pub2)
Definition: asci_blk.cpp:3145
static void GetFirstSegDescrChoice(CBioseq &bioseq, Uint1 choice, CSeq_descr &descr_new)
Definition: asci_blk.cpp:1858
static bool SeqEntryCheckTaxonDiv(const CSeq_entry &entry)
Definition: asci_blk.cpp:2793
bool fta_orgref_has_taxid(const COrg_ref::TDb &dbtags)
Definition: asci_blk.cpp:3254
char * GetDescrComment(char *offset, size_t len, Uint2 col_data, bool is_htg, bool is_pat)
Definition: asci_blk.cpp:1159
void GetSequenceOfKeywords(const DataBlk &entry, int type, Uint2 col_data, TKeywordList &keywords)
Definition: asci_blk.cpp:1547
static void CheckGBBlock(TSeqdescList &descrs, bool &got)
Definition: asci_blk.cpp:3024
static void CleanUpSeqDescrChoice(TEntryList &entries, Uint1 choice)
Definition: asci_blk.cpp:2007
static void CleanVisString(string &str)
Definition: asci_blk.cpp:2963
static void CleanVisStringList(list< string > &str_list)
Definition: asci_blk.cpp:3011
static bool pub_cmp(const CRef< CPub > &pub1, const CRef< CPub > &pub2)
Definition: asci_blk.cpp:3197
static void SrchSegDescr(TEntryList &entries, CSeq_descr &descr)
Definition: asci_blk.cpp:2181
void EntryCheckDivCode(TEntryList &seq_entries, ParserPtr pp)
Definition: asci_blk.cpp:2806
void GetEmblSubBlock(size_t bases, Parser::ESource source, const DataBlk &entry)
Definition: asci_blk.cpp:740
char * GetEmblBlock(DataBlkPtr *chain, char *ptr, short *retkw, Parser::EFormat format, char *eptr)
Definition: asci_blk.cpp:545
const char * magic_phrases[]
Definition: asci_blk.cpp:104
static void PackSeqData(CSeq_data::E_Choice code, CSeq_data &seq_data)
Definition: asci_blk.cpp:3435
static bool IsCitEmpty(const CCit_gen &cit)
Definition: asci_blk.cpp:3373
static Uint1 ValidSeqType(const char *accession, Uint1 type)
Definition: asci_blk.cpp:918
static bool descr_cmp(const CRef< CSeqdesc > &desc1, const CRef< CSeqdesc > &desc2)
Definition: asci_blk.cpp:3154
char * GetGenBankBlock(DataBlkPtr *chain, char *ptr, Int2 *retkw, char *eptr)
Definition: asci_blk.cpp:284
void GetSeqExt(ParserPtr pp, CSeq_loc &seq_loc)
Definition: asci_blk.cpp:2469
static bool GetSubNodeType(const char *subkw, char **retbptr, char *eptr)
Definition: asci_blk.cpp:648
static CRef< CBioseq > GetBioseq(ParserPtr pp, const TEntryList &entries, const CSeq_loc &slp)
Definition: asci_blk.cpp:2409
static void GetEmblRefType(size_t bases, Parser::ESource source, DataBlkPtr dbp)
Definition: asci_blk.cpp:684
CRef< CSeq_id > MakeAccSeqId(const char *acc, Uint1 seqtype, bool accver, Int2 vernum)
Definition: asci_blk.cpp:960
bool GetSeqData(ParserPtr pp, const DataBlk &entry, CBioseq &bioseq, Int4 nodetype, unsigned char *seqconv, Uint1 seq_data_type)
Definition: asci_blk.cpp:1674
void GetLenSubNode(DataBlkPtr dbp)
Definition: asci_blk.cpp:835
bool fta_EntryCheckGBBlock(TEntryList &seq_entries)
Definition: asci_blk.cpp:3109
char * SrchNodeSubType(const DataBlk &entry, Int2 type, Int2 subtype, size_t *len)
Definition: asci_blk.cpp:1039
void xGetGenBankBlocks(Entry &entry)
Definition: asci_blk.cpp:256
static void sort_feat_cit(CBioseq::TAnnot &annots)
Definition: asci_blk.cpp:3215
static void SetEmptyId(CBioseq &bioseq)
Definition: asci_blk.cpp:1062
CRef< CSeq_id > MakeLocusSeqId(const char *locus, CSeq_id::E_Choice seqtype)
Definition: asci_blk.cpp:990
void fta_sort_descr(TEntryList &seq_entries)
Definition: asci_blk.cpp:3181
static void DeltaBioseqPack(CBioseq &bioseq)
Definition: asci_blk.cpp:3490
void XMLDefVsHTGKeywords(CMolInfo::TTech tech, const char *entry, XmlIndexPtr xip, bool cancelled)
Definition: asci_blk.cpp:2885
void BuildBioSegHeader(ParserPtr pp, TEntryList &entries, const CSeq_loc &seqloc)
Definition: asci_blk.cpp:2494
static CRef< CSeq_id > MakeSegSetSeqId(const char *accession, const string &locus, Uint1 seqtype, bool is_tpa)
Definition: asci_blk.cpp:1008
static int GetSerialNumFromPubEquiv(const CPub_equiv &pub_eq)
Definition: asci_blk.cpp:3129
void GetExtraAccession(IndexblkPtr ibp, bool allow_uwsec, Parser::ESource source, TAccessionList &accessions)
Definition: asci_blk.cpp:1317
void ShrinkSpaces(char *line)
Definition: asci_blk.cpp:118
static void RawBioseqPack(CBioseq &bioseq)
Definition: asci_blk.cpp:3478
static bool TrimEmblFeatBlk(DataBlkPtr dbp)
Definition: asci_blk.cpp:604
static void GetSegSetDblink(CSeq_descr &descr, TEntryList &entries, bool *drop)
Definition: asci_blk.cpp:2210
static CSeq_descr::Tdata::const_iterator GetDescrByChoice(const CSeq_descr &descr, Uint1 choice)
Definition: asci_blk.cpp:1835
static bool SameCitation_PubEquivMatch_Logic(const CPub_equiv &a, const CPub_equiv &b)
Definition: asci_blk.cpp:1879
bool check_div(bool pat_acc, bool pat_ref, bool est_kwd, bool sts_kwd, bool gss_kwd, bool if_cds, string &div, CMolInfo::TTech *tech, size_t bases, Parser::ESource source, bool &drop)
Definition: asci_blk.cpp:2565
CRef< CBioseq > CreateEntryBioseq(ParserPtr pp)
Definition: asci_blk.cpp:1074
const CSeq_descr & GetDescrPointer(const CSeq_entry &entry)
Definition: asci_blk.cpp:2954
static void BuildFeatureBlock(DataBlkPtr dbp)
Definition: asci_blk.cpp:377
Int4 ScanSequence(bool warn, char **seqptr, std::vector< char > &bsp, unsigned char *conv, Char replacechar, int *numns)
Definition: asci_blk.cpp:1618
const char * GetEmblDiv(Uint1 num)
Definition: em_ascii.cpp:2401
TSeqPos GetLength(void) const
Definition: Bioseq.cpp:360
Definition: Date.hpp:53
ECompare Compare(const CDate &date) const
Definition: Date.cpp:83
@ eCompare_same
They're equivalent.
Definition: Date.hpp:75
Definition: Dbtag.hpp:53
@Imp_feat.hpp User-defined methods of the data storage class.
Definition: Imp_feat.hpp:54
void SetId8(TId8 value)
Definition: Object_id.cpp:175
Definition: Pub.hpp:56
@Pubdesc.hpp User-defined methods of the data storage class.
Definition: Pubdesc.hpp:54
CRef –.
Definition: ncbiobj.hpp:618
static SIZE_TYPE Convert(const CTempString &src, TCoding src_coding, TSeqPos pos, TSeqPos length, string &dst, TCoding dst_coding)
@ e_Ncbieaa
Definition: sequtil.hpp:57
@ e_not_set
Definition: sequtil.hpp:44
@ e_Ncbi8aa
Definition: sequtil.hpp:56
@ e_Ncbistdaa
Definition: sequtil.hpp:58
@ e_Iupacaa
Definition: sequtil.hpp:55
@Seq_descr.hpp User-defined methods of the data storage class.
Definition: Seq_descr.hpp:55
Definition: Seq_entry.hpp:56
static bool IsNa(EMol mol)
Definition: Seq_inst.hpp:90
Seq-loc iterator class – iterates all intervals from a seq-loc in the correct order.
Definition: Seq_loc.hpp:453
static TPair GetCodeIndexFromTo(CSeq_data::E_Choice code_type)
unsigned int TIndex
static const string & GetCode(CSeq_data::E_Choice code_type, TIndex idx)
pair< TIndex, TIndex > TPair
static TSeqPos Pack(CSeq_data *in_seq, TSeqPos uLength=ncbi::numeric_limits< TSeqPos >::max())
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
Template class for iteration on objects of class C (non-modifiable version)
Definition: iterator.hpp:767
char * mOffset
Definition: ftablock.h:329
size_t len
Definition: ftablock.h:330
CFlatFileData * mpData
Definition: ftablock.h:328
DataBlk * mpNext
Definition: ftablock.h:333
int mType
Definition: ftablock.h:327
@ ParFlat_OC
Definition: embl.h:61
@ ParFlat_RG
Definition: embl.h:66
@ ParFlat_RL
Definition: embl.h:69
@ ParFlat_RP
Definition: embl.h:64
@ ParFlat_RT
Definition: embl.h:68
@ ParFlat_OS
Definition: embl.h:48
@ ParFlat_OG
Definition: embl.h:62
@ ParFlat_RN
Definition: embl.h:49
@ ParFlat_RX
Definition: embl.h:65
@ ParFlat_RA
Definition: embl.h:67
@ ParFlat_RC
Definition: embl.h:63
@ ParFlat_FH
Definition: embl.h:52
#define ERR_DIVISION_NotMappedtoEST
Definition: flat2err.h:237
#define ERR_ACCESSION_UnusualWGS_Secondary
Definition: flat2err.h:175
#define ERR_DIVISION_ShouldBePAT
Definition: flat2err.h:256
#define ERR_DIVISION_MappedtoPAT
Definition: flat2err.h:224
#define ERR_DIVISION_MappedtoSTS
Definition: flat2err.h:225
#define ERR_SEQUENCE_TooShort
Definition: flat2err.h:155
#define ERR_SEQUENCE_TooShortIsPatent
Definition: flat2err.h:157
#define ERR_SEQUENCE_UnknownBaseHTG3
Definition: flat2err.h:147
#define ERR_DIVISION_LongGSSSequence
Definition: flat2err.h:246
#define ERR_SEGMENT_GPIDMissingOrNonUnique
Definition: flat2err.h:167
#define ERR_DIVISION_ESTHasCDSFeature
Definition: flat2err.h:236
#define ERR_DIVISION_PATHasGSSKeywords
Definition: flat2err.h:243
#define ERR_REFERENCE_MultipleIdentifiers
Definition: flat2err.h:313
#define ERR_DIVISION_MissingSTSKeywords
Definition: flat2err.h:228
#define ERR_DIVISION_MissingPatentRef
Definition: flat2err.h:229
#define ERR_SEQUENCE_BadResidue
Definition: flat2err.h:149
#define ERR_DIVISION_PATHasESTKeywords
Definition: flat2err.h:230
#define ERR_ACCESSION_ScfldHasWGSContigSec
Definition: flat2err.h:176
#define ERR_SEGMENT_PubMatch
Definition: flat2err.h:164
#define ERR_FORMAT_LineTypeOrder
Definition: flat2err.h:40
#define ERR_SEGMENT_DBLinkMissingOrNonUnique
Definition: flat2err.h:168
#define ERR_DIVISION_MappedtoGSS
Definition: flat2err.h:242
#define ERR_DIVISION_GSSHasCDSFeature
Definition: flat2err.h:240
#define ERR_DIVISION_MappedtoEST
Definition: flat2err.h:223
#define ERR_DEFINITION_HTGNotInProgress
Definition: flat2err.h:265
#define ERR_ACCESSION_WGSMasterAsSecondary
Definition: flat2err.h:174
#define ERR_DIVISION_STSHasCDSFeature
Definition: flat2err.h:233
#define ERR_FEATURE_NoFeatData
Definition: flat2err.h:325
#define ERR_SEGMENT_DiffMolType
Definition: flat2err.h:163
#define ERR_DIVISION_ShouldBeHTG
Definition: flat2err.h:238
#define ERR_DIVISION_MissingESTKeywords
Definition: flat2err.h:227
#define ERR_DIVISION_NotMappedtoGSS
Definition: flat2err.h:241
#define ERR_SEQUENCE_SeqLenNotEq
Definition: flat2err.h:148
#define ERR_DIVISION_PATHasCDSFeature
Definition: flat2err.h:232
#define ERR_DIVISION_MissingGSSKeywords
Definition: flat2err.h:239
#define ERR_DIVISION_NotMappedtoSTS
Definition: flat2err.h:234
#define ERR_DIVISION_LongSTSSequence
Definition: flat2err.h:245
#define ERR_DIVISION_GBBlockDivision
Definition: flat2err.h:247
#define ERR_SEQUENCE_AllNs
Definition: flat2err.h:156
#define ERR_ACCESSION_WGSWithNonWGS_Sec
Definition: flat2err.h:173
#define ERR_DIVISION_PATHasSTSKeywords
Definition: flat2err.h:231
#define ERR_DIVISION_LongESTSequence
Definition: flat2err.h:244
#define ERR_DEFINITION_HTGShouldBeComplete
Definition: flat2err.h:267
#define ERR_DIVISION_ESTHasSTSKeywords
Definition: flat2err.h:235
#define ERR_DIVISION_ShouldNotBeHTG
Definition: flat2err.h:250
list< CRef< objects::CSeq_entry > > TEntryList
#define INSDSEQ_DEFINITION
Definition: fta_xml.h:52
#define INSDSEQ_FEATURE_TABLE
Definition: fta_xml.h:68
#define INSDFEATURE_KEY
Definition: fta_xml.h:77
#define INSDSEQ_SEQUENCE
Definition: fta_xml.h:69
unique_ptr< string > XMLFindTagValue(const char *entry, const XmlIndex *xip, Int4 tag)
Definition: xm_index.cpp:214
std::list< std::string > TKeywordList
Definition: ftablock.h:163
std::list< CRef< objects::CPub > > TPubList
Definition: ftablock.h:63
forward_list< string > TokenBlkList
Definition: ftablock.h:134
std::list< CRef< objects::CSeqdesc > > TSeqdescList
Definition: ftablock.h:61
std::list< std::string > TAccessionList
Definition: ftablock.h:57
void MemSet(void *p, int n, size_t sz)
Definition: ftacpp.hpp:49
bool StringEquNI(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:131
bool StringEquN(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:121
bool StringEqu(const char *s1, const char *s2)
Definition: ftacpp.hpp:111
void StringNCpy(char *d, const char *s, size_t n)
Definition: ftacpp.hpp:90
void MemFree(char *p)
Definition: ftacpp.hpp:55
size_t StringLen(const char *s)
Definition: ftacpp.hpp:60
void MemCpy(void *p, const void *q, size_t sz)
Definition: ftacpp.hpp:50
char * StringNew(size_t sz)
Definition: ftacpp.hpp:43
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:51
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:56
static int type
Definition: getdata.c:31
static const char * str(char *buf, int n)
Definition: stats.c:84
static char tmp[3200]
Definition: utf8.c:42
int offset
Definition: replacements.h:160
static TDSICONV * conv
Definition: charconv.c:168
@ ParFlat_FEATBLOCK
Definition: genbank.h:72
@ ParFlat_AUTHORS
Definition: genbank.h:67
@ ParFlat_FEATURES
Definition: genbank.h:51
@ ParFlat_SOURCE
Definition: genbank.h:48
@ ParFlat_JOURNAL
Definition: genbank.h:70
@ ParFlat_STANDARD
Definition: genbank.h:71
@ ParFlat_REFERENCE
Definition: genbank.h:49
@ ParFlat_LOCUS
Definition: genbank.h:41
@ ParFlat_CONSRTM
Definition: genbank.h:68
@ ParFlat_END
Definition: genbank.h:54
@ ParFlat_ORGANISM
Definition: genbank.h:66
@ ParFlat_REMARK
Definition: genbank.h:74
@ ParFlat_MEDLINE
Definition: genbank.h:73
@ ParFlat_TITLE
Definition: genbank.h:69
@ ParFlat_PUBMED
Definition: genbank.h:75
#define SEV_INFO
Definition: gicache.c:89
#define SEV_WARNING
Definition: gicache.c:90
#define SEV_ERROR
Definition: gicache.c:91
#define SEV_REJECT
Definition: gicache.c:92
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
Definition: ncbimisc.hpp:1508
#define StringStr
Definition: ncbistr.hpp:322
#define StringSave
Definition: ncbistr.hpp:326
#define ErrPostStr
Definition: ncbierr.hpp:68
#define StringChr
Definition: ncbistr.hpp:317
#define ErrPostEx(sev, err_code,...)
Definition: ncbierr.hpp:78
TPrim & Set(void)
Definition: serialbase.hpp:351
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
const TPrim & Get(void) const
Definition: serialbase.hpp:347
virtual bool Equals(const CSerialObject &object, ESerialRecursionMode how=eRecursive) const
Check if both objects contain the same values.
static EAccessionInfo IdentifyAccession(const CTempString &accession, TParseFlags flags=fParse_AnyRaw)
Deduces information from a bare accession a la WHICH_db_accession; may report false negatives on prop...
Definition: Seq_id.cpp:1634
static E_Choice GetAccType(EAccessionInfo info)
Definition: Seq_id.hpp:562
void SetWhole(TWhole &v)
Definition: Seq_loc.hpp:982
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Override Assign() to incorporate cache invalidation.
Definition: Seq_loc.cpp:337
const_iterator end(void) const
Definition: Seq_loc.cpp:1034
const_iterator begin(void) const
Definition: Seq_loc.cpp:1028
void Add(const CSeq_loc &other)
Simple adding of seq-locs.
Definition: Seq_loc.cpp:3875
void SetNull(void)
Override all setters to incorporate cache invalidation.
Definition: Seq_loc.hpp:960
CBeginInfo Begin(C &obj)
Get starting point of object hierarchy.
Definition: iterator.hpp:1004
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
Definition: ncbiobj.hpp:2015
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
char Char
Alias for char.
Definition: ncbitype.h:93
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
uint16_t Uint2
2-byte (16-bit) unsigned integer
Definition: ncbitype.h:101
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3452
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2984
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
#define NPOS
Definition: ncbistr.hpp:133
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate whitespace in a string (in-place)
Definition: ncbistr.cpp:3192
static SIZE_TYPE CommonSuffixSize(const CTempString s1, const CTempString s2)
Determine the common suffix of two strings.
Definition: ncbistr.hpp:5456
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2882
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5406
static void TrimSuffixInPlace(string &str, const CTempString suffix, ECase use_case=eCase)
Trim suffix from a string (in-place)
Definition: ncbistr.cpp:3269
void SetSource(const TSource &value)
Assign a value to Source data member.
Definition: GB_block_.hpp:488
TKeywords & SetKeywords(void)
Assign a value to Keywords data member.
Definition: GB_block_.hpp:532
bool IsSetExtra_accessions(void) const
Check if a value has been assigned to Extra_accessions data member.
Definition: GB_block_.hpp:442
void ResetKeywords(void)
Reset Keywords data member.
Definition: GB_block_.cpp:63
void ResetOrigin(void)
Reset Origin data member.
Definition: GB_block_.cpp:69
bool IsSetDiv(void) const
GenBank division Check if a value has been assigned to Div data member.
Definition: GB_block_.hpp:654
void ResetSource(void)
Reset Source data member.
Definition: GB_block_.cpp:57
void ResetDate(void)
Reset Date data member.
Definition: GB_block_.cpp:75
bool IsSetSource(void) const
source line Check if a value has been assigned to Source data member.
Definition: GB_block_.hpp:467
void SetDate(const TDate &value)
Assign a value to Date data member.
Definition: GB_block_.hpp:607
bool IsSetEntry_date(void) const
replaces date Check if a value has been assigned to Entry_date data member.
Definition: GB_block_.hpp:633
const TDiv & GetDiv(void) const
Get the Div member data.
Definition: GB_block_.hpp:666
TExtra_accessions & SetExtra_accessions(void)
Assign a value to Extra_accessions data member.
Definition: GB_block_.hpp:460
const TExtra_accessions & GetExtra_accessions(void) const
Get the Extra_accessions member data.
Definition: GB_block_.hpp:454
const TKeywords & GetKeywords(void) const
Get the Keywords member data.
Definition: GB_block_.hpp:526
bool IsSetOrigin(void) const
Check if a value has been assigned to Origin data member.
Definition: GB_block_.hpp:539
void SetDiv(const TDiv &value)
Assign a value to Div data member.
Definition: GB_block_.hpp:675
bool IsSetKeywords(void) const
Check if a value has been assigned to Keywords data member.
Definition: GB_block_.hpp:514
void ResetExtra_accessions(void)
Reset Extra_accessions data member.
Definition: GB_block_.cpp:51
void ResetDiv(void)
Reset Div data member.
Definition: GB_block_.cpp:98
bool IsSetDate(void) const
OBSOLETE old form Entry Date. Check if a value has been assigned to Date data member.
Definition: GB_block_.hpp:586
void SetOrigin(const TOrigin &value)
Assign a value to Origin data member.
Definition: GB_block_.hpp:560
bool IsSetPages(void) const
Check if a value has been assigned to Pages data member.
Definition: Cit_gen_.hpp:806
bool IsSetDate(void) const
Check if a value has been assigned to Date data member.
Definition: Cit_gen_.hpp:853
TSerial_number GetSerial_number(void) const
Get the Serial_number member data.
Definition: Cit_gen_.hpp:893
bool IsSetAuthors(void) const
Check if a value has been assigned to Authors data member.
Definition: Cit_gen_.hpp:623
bool IsSetVolume(void) const
Check if a value has been assigned to Volume data member.
Definition: Cit_gen_.hpp:712
const TCit & GetCit(void) const
Get the Cit member data.
Definition: Cit_gen_.hpp:588
bool IsSetSerial_number(void) const
For GenBank style references. Check if a value has been assigned to Serial_number data member.
Definition: Cit_gen_.hpp:874
bool IsSetCit(void) const
Anything, not parsable. Check if a value has been assigned to Cit data member.
Definition: Cit_gen_.hpp:576
bool IsSetTitle(void) const
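E.g. [example truncated in the generated documentation]. Check if a value has been assigned to Title data member.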
Definition: Cit_gen_.hpp:921
bool IsSetJournal(void) const
Check if a value has been assigned to Journal data member.
Definition: Cit_gen_.hpp:691
bool IsSetPmid(void) const
PubMed Id. Check if a value has been assigned to Pmid data member.
Definition: Cit_gen_.hpp:968
bool IsSetIssue(void) const
Check if a value has been assigned to Issue data member.
Definition: Cit_gen_.hpp:759
bool IsSetMuid(void) const
Medline uid. Check if a value has been assigned to Muid data member.
Definition: Cit_gen_.hpp:644
bool IsSetOrg(void) const
Check if a value has been assigned to Org data member.
Definition: BioSource_.hpp:497
const TOrg & GetOrg(void) const
Get the Org member data.
Definition: BioSource_.hpp:509
bool IsStr(void) const
Check if variant Str is selected.
Definition: Object_id_.hpp:291
bool CanGetType(void) const
Check if it is safe to call GetType method.
bool IsSetType(void) const
Type of object within class. Check if a value has been assigned to Type data member.
void SetTag(TTag &value)
Assign a value to Tag data member.
Definition: Dbtag_.cpp:66
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
const TType & GetType(void) const
Get the Type member data.
E_Choice Which(void) const
Which variant is currently selected.
Definition: Date_.hpp:271
void SetDb(const TDb &value)
Assign a value to Db data member.
Definition: Dbtag_.hpp:229
@ eLim_gt
greater than
Definition: Int_fuzz_.hpp:211
@ e_not_set
No variant selected.
Definition: Date_.hpp:127
vector< CRef< CDbtag > > TDb
Definition: Org_ref_.hpp:101
const TDb & GetDb(void) const
Get the Db member data.
Definition: Org_ref_.hpp:491
bool IsSetDiv(void) const
GenBank division code. Check if a value has been assigned to Div data member.
Definition: OrgName_.hpp:993
bool IsSetOrgname(void) const
Check if a value has been assigned to Orgname data member.
Definition: Org_ref_.hpp:529
void SetOrgname(TOrgname &value)
Assign a value to Orgname data member.
Definition: Org_ref_.cpp:87
const TOrgname & GetOrgname(void) const
Get the Orgname member data.
Definition: Org_ref_.hpp:541
bool IsSet(void) const
Check if a value has been assigned to data member.
Definition: Pub_equiv_.hpp:153
const Tdata & Get(void) const
Get the member data.
Definition: Pub_equiv_.hpp:165
E_Choice Which(void) const
Which variant is currently selected.
Definition: Pub_.hpp:555
const TGen & GetGen(void) const
Get the variant data.
Definition: Pub_.cpp:167
TMuid GetMuid(void) const
Get the variant data.
Definition: Pub_.hpp:608
bool IsMuid(void) const
Check if variant Muid is selected.
Definition: Pub_.hpp:602
bool IsGen(void) const
Check if variant Gen is selected.
Definition: Pub_.hpp:584
@ e_Gen
general or generic unparsed
Definition: Pub_.hpp:102
@ eSeq_code_type_iupacaa
IUPAC 1 letter amino acid code.
@ eSeq_code_type_iupacna
IUPAC 1 letter nuc acid code.
const TKey & GetKey(void) const
Get the Key member data.
Definition: Imp_feat_.hpp:259
bool IsSetKey(void) const
Check if a value has been assigned to Key data member.
Definition: Imp_feat_.hpp:247
bool IsSetAccession(void) const
Check if a value has been assigned to Accession data member.
const TName & GetName(void) const
Get the Name member data.
E_Choice
Choice variants.
Definition: Seq_id_.hpp:93
TLocal & SetLocal(void)
Select the variant.
Definition: Seq_id_.cpp:199
bool IsSetName(void) const
Check if a value has been assigned to Name data member.
@ e_Other
for historical reasons, 'other' = 'refseq'
Definition: Seq_id_.hpp:104
@ e_Tpe
Third Party Annot/Seq EMBL.
Definition: Seq_id_.hpp:111
@ e_Tpd
Third Party Annot/Seq DDBJ.
Definition: Seq_id_.hpp:112
@ e_Ddbj
DDBJ.
Definition: Seq_id_.hpp:107
@ e_Prf
PRF SEQDB.
Definition: Seq_id_.hpp:108
@ e_not_set
No variant selected.
Definition: Seq_id_.hpp:94
@ e_Tpg
Third Party Annot/Seq Genbank.
Definition: Seq_id_.hpp:110
@ e_Local
local use
Definition: Seq_id_.hpp:95
@ e_Pdb
PDB sequence.
Definition: Seq_id_.hpp:109
const TSeq & GetSeq(void) const
Get the variant data.
Definition: Seq_entry_.cpp:102
const TDescr & GetDescr(void) const
Get the Descr member data.
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
const TSet & GetSet(void) const
Get the variant data.
Definition: Seq_entry_.cpp:124
bool IsSeq(void) const
Check if variant Seq is selected.
Definition: Seq_entry_.hpp:257
void SetClass(TClass value)
Assign a value to Class data member.
void SetDescr(TDescr &value)
Assign a value to Descr data member.
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
TSeq_set & SetSeq_set(void)
Assign a value to Seq_set data member.
@ eClass_parts
parts for 2 or 3
@ eClass_segset
segmented sequence + parts
const TIupacaa & GetIupacaa(void) const
Get the variant data.
Definition: Seq_data_.hpp:530
list< CRef< CSeqdesc > > Tdata
Definition: Seq_descr_.hpp:91
TId & SetId(void)
Assign a value to Id data member.
Definition: Bioseq_.hpp:296
const TUser & GetUser(void) const
Get the variant data.
Definition: Seqdesc_.cpp:384
bool IsSetSeq_data(void) const
The sequence. Check if a value has been assigned to Seq_data data member.
Definition: Seq_inst_.hpp:805
ERepr
representation class
Definition: Seq_inst_.hpp:91
void SetPub(TPub &value)
Assign a value to Pub data member.
Definition: Pubdesc_.cpp:72
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
TTitle & SetTitle(void)
Select the variant.
Definition: Seqdesc_.hpp:1039
void SetExt(TExt &value)
Assign a value to Ext data member.
Definition: Seq_inst_.cpp:147
bool IsSetMol(void) const
Check if a value has been assigned to Mol data member.
Definition: Seq_inst_.hpp:593
const TPub & GetPub(void) const
Get the variant data.
Definition: Seqdesc_.cpp:356
const TNcbi8aa & GetNcbi8aa(void) const
Get the variant data.
Definition: Seq_data_.hpp:630
TNcbieaa & SetNcbieaa(void)
Select the variant.
Definition: Seq_data_.hpp:657
E_Choice
Choice variants.
Definition: Seq_data_.hpp:102
TTech GetTech(void) const
Get the Tech member data.
Definition: MolInfo_.hpp:497
bool IsSetExt(void) const
Extensions for special types. Check if a value has been assigned to Ext data member.
Definition: Seq_inst_.hpp:826
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
TMol GetMol(void) const
Get the Mol member data.
Definition: Seq_inst_.hpp:612
bool IsDelta(void) const
Check if variant Delta is selected.
Definition: Seq_ext_.hpp:336
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
const TNcbistdaa & GetNcbistdaa(void) const
Get the variant data.
Definition: Seq_data_.hpp:690
const TExt & GetExt(void) const
Get the Ext member data.
Definition: Seq_inst_.hpp:838
EMol
molecule class in living organism
Definition: Seq_inst_.hpp:108
bool IsPub(void) const
Check if variant Pub is selected.
Definition: Seqdesc_.hpp:1096
void SetDescr(TDescr &value)
Assign a value to Descr data member.
Definition: Bioseq_.cpp:65
bool IsSetPub(void) const
The citation(s). Check if a value has been assigned to Pub data member.
Definition: Pubdesc_.hpp:593
void SetRepr(TRepr value)
Assign a value to Repr data member.
Definition: Seq_inst_.hpp:574
list< CRef< CSeq_feat > > TFtable
Definition: Seq_annot_.hpp:193
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seqdesc_.hpp:903
void SetFuzz(TFuzz &value)
Assign a value to Fuzz data member.
Definition: Seq_inst_.cpp:113
Tdata & Set(void)
Assign a value to data member.
Definition: Seq_descr_.hpp:172
list< CRef< CSeq_annot > > TAnnot
Definition: Bioseq_.hpp:97
void SetLength(TLength value)
Assign a value to Length data member.
Definition: Seq_inst_.hpp:668
bool IsGap(void) const
Check if variant Gap is selected.
Definition: Seq_data_.hpp:704
const TPub & GetPub(void) const
Get the Pub member data.
Definition: Pubdesc_.hpp:605
const TSeq_data & GetSeq_data(void) const
Get the Seq_data member data.
Definition: Seq_inst_.hpp:817
const TDescr & GetDescr(void) const
Get the Descr member data.
Definition: Bioseq_.hpp:315
void SetMol(TMol value)
Assign a value to Mol data member.
Definition: Seq_inst_.hpp:621
bool IsUser(void) const
Check if variant User is selected.
Definition: Seqdesc_.hpp:1122
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_data_.hpp:475
@ eRepr_const
constructed sequence
Definition: Seq_inst_.hpp:96
@ eRepr_seg
segmented sequence
Definition: Seq_inst_.hpp:95
@ eRepr_delta
sequence made by changes (delta) to others
Definition: Seq_inst_.hpp:100
@ eRepr_raw
continuous sequence
Definition: Seq_inst_.hpp:94
@ eTech_htgs_2
ordered High Throughput sequence contig
Definition: MolInfo_.hpp:138
@ eTech_sts
Sequence Tagged Site.
Definition: MolInfo_.hpp:126
@ eTech_htgs_3
finished High Throughput sequence
Definition: MolInfo_.hpp:139
@ eTech_htgs_1
unordered High Throughput sequence contig
Definition: MolInfo_.hpp:137
@ eTech_tsa
transcriptome shotgun assembly
Definition: MolInfo_.hpp:146
@ eTech_survey
one-pass genomic sequence
Definition: MolInfo_.hpp:127
@ eTech_htgs_0
single genomic reads for coordination
Definition: MolInfo_.hpp:141
@ eTech_est
Expressed Sequence Tag.
Definition: MolInfo_.hpp:125
@ e_Ncbistdaa
consecutive codes for std aas
Definition: Seq_data_.hpp:113
@ e_Iupacaa
IUPAC 1 letter amino acid code.
Definition: Seq_data_.hpp:105
@ e_Ncbi8aa
8 bit extended amino acid codes
Definition: Seq_data_.hpp:110
@ e_Org
if all from one organism
Definition: Seqdesc_.hpp:116
@ e_Update_date
date of last update
Definition: Seqdesc_.hpp:129
@ e_Modif
modifiers
Definition: Seqdesc_.hpp:112
@ eMol_na
just a nucleic acid
Definition: Seq_inst_.hpp:113
@ eStrand_ds
double strand
Definition: Seq_inst_.hpp:136
@ eStrand_ss
single strand
Definition: Seq_inst_.hpp:135
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
@ ParFlat_REF_BTW
Definition: index.h:61
@ ParFlat_REF_NO_TARGET
Definition: index.h:63
@ ParFlat_COL_FEATKEY
Definition: index.h:65
@ ParFlat_REF_SITES
Definition: index.h:62
@ ParFlat_REF_END
Definition: index.h:60
CSeq_id::E_Choice GetNucAccOwner(const CTempString &acc)
Definition: indx_blk.cpp:2244
int fta_if_wgs_acc(string_view accession)
Definition: indx_blk.cpp:1190
void DelNonDigitTail(string &str)
Definition: indx_blk.cpp:958
char * buf
int i
yy_size_t n
int len
static void text(MDB_val *v)
Definition: mdb_dump.c:62
range(_Ty, _Ty) -> range< _Ty >
const struct ncbi::grid::netcache::search::fields::SIZE size
const struct ncbi::grid::netcache::search::fields::KEY key
const CharType(& source)[N]
Definition: pointer.h:1149
int strcmp(const char *str1, const char *str2)
Definition: odbc_utils.hpp:160
unsigned int a
Definition: ncbi_localip.c:102
const char * tag
int isalpha(Uchar c)
Definition: ncbictype.hpp:61
int tolower(Uchar c)
Definition: ncbictype.hpp:72
int isdigit(Uchar c)
Definition: ncbictype.hpp:64
static Format format
Definition: njn_ioutil.cpp:53
Int4 delta(size_t dimension_, const Int4 *score_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
#define count
static SLJIT_INLINE sljit_ins l(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
@ ParFlatSP_KW
Definition: sprot.h:52
DataBlkPtr chain
Definition: ftablock.h:341
Definition: entry.h:57
list< SectionPtr > mSections
Definition: entry.h:99
string mBaseData
Definition: entry.h:98
Char acnum[200]
Definition: ftablock.h:166
Char division[4]
Definition: ftablock.h:171
bool is_mga
Definition: ftablock.h:199
TokenBlkList secaccs
Definition: ftablock.h:216
Char blocusname[200]
Definition: ftablock.h:178
Int2 vernum
Definition: ftablock.h:167
bool is_tpa
Definition: ftablock.h:206
bool embl_new_ID
Definition: ftablock.h:218
bool is_prot
Definition: ftablock.h:222
bool is_contig
Definition: ftablock.h:197
bool is_pat
Definition: ftablock.h:202
bool drop
Definition: ftablock.h:182
size_t bases
Definition: ftablock.h:172
string wgssec
Definition: ftablock.h:236
Char locusname[200]
Definition: ftablock.h:170
XmlIndexPtr xip
Definition: ftablock.h:217
vector< IndexblkPtr > entrylist
Definition: entry.h:13
size_t start
Definition: ftablock.h:152
XmlIndex * next
Definition: ftablock.h:158
XmlIndex * subtags
Definition: ftablock.h:157
size_t end
Definition: ftablock.h:153
Int4 tag
Definition: ftablock.h:150
Definition: inftrees.h:24
Definition: type.c:6
done
Definition: token1.c:1
int SrchKeyword(const CTempString &ptr, const vector< string > &keywordList)
Definition: utilfun.cpp:897
char * SrchTheChar(char *bptr, char *eptr, Char letter)
Definition: utilfun.cpp:759
bool SetTextId(Uint1 seqtype, CSeq_id &seqId, CTextseq_id &textId)
Definition: utilfun.cpp:1507
string GetBlkDataReplaceNewLine(string_view instr, Uint2 indent)
Definition: utilfun.cpp:644
bool fta_is_tpa_keyword(const char *str)
Definition: utilfun.cpp:1170
void CleanTailNoneAlphaCharInString(string &str)
Definition: utilfun.cpp:683
char * xSrchNodeType(const DataBlk &entry, Int4 type, size_t *len)
Definition: utilfun.cpp:963
string xGetNodeData(const DataBlk &entry, int nodeType)
Definition: utilfun.cpp:977
void fta_StringCpy(char *dst, const char *src)
Definition: utilfun.cpp:1496
DataBlkPtr TrackNodeType(const DataBlk &entry, Int2 type)
Definition: utilfun.cpp:994
char * SrchTheStr(char *bptr, char *eptr, const char *leadstr)
Definition: utilfun.cpp:779
#define ParFlat_UNKW
Definition: utilfun.h:44
static wxAcceleratorEntry entries[3]
Modified on Fri Sep 20 14:58:20 2024 by modify_doxy.py rev. 669887