NCBI C++ ToolKit
asci_blk.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: asci_blk.cpp 101896 2024-02-29 12:12:01Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * File Name: asci_blk.cpp
27  *
28  * Author: Karl Sirotkin, Hsiu-Chuan Chen
29  *
30  * File Description:
31  * Common for all formats function processing ascii blocks to asn.
32  *
33  */
34 
35 #include <ncbi_pch.hpp>
36 
37 #include <set>
38 
39 #include "ftacpp.hpp"
40 
46 #include <objects/seq/Bioseq.hpp>
48 #include <objects/seq/Seq_inst.hpp>
50 #include <objects/seq/Seq_data.hpp>
52 #include <objects/seq/Seq_ext.hpp>
54 #include <objects/seq/Seg_ext.hpp>
59 #include <objects/general/Date.hpp>
60 #include <objects/seq/Pubdesc.hpp>
63 #include <objects/pub/Pub.hpp>
68 #include <objects/pub/Pub_set.hpp>
74 #include <serial/iterator.hpp>
77 #include <objects/seq/MolInfo.hpp>
78 
79 #include "index.h"
80 #include "genbank.h"
81 #include "embl.h"
82 #include "sprot.h"
83 
85 
86 #include "ftaerr.hpp"
87 #include "indx_blk.h"
88 #include "asci_blk.h"
89 #include "utilfun.h"
90 #include "fta_xml.h"
91 
92 #include "add.h"
93 
94 #ifdef THIS_FILE
95 # undef THIS_FILE
96 #endif
97 #define THIS_FILE "asci_blk.cpp"
98 
99 #define Seq_descr_pub_same 50
100 
103 
104 const char* magic_phrases[] = {
105  "*** SEQUENCING IN PROGRESS ***",
106  "***SEQUENCING IN PROGRESS***",
107  "WORKING DRAFT SEQUENCE",
108  "LOW-PASS SEQUENCE SAMPLING",
109  "*** IN PROGRESS ***",
110  nullptr
111 };
112 
113 extern vector<string> genbankKeywords;
114 extern vector<string> emblKeywords;
115 extern vector<string> swissProtKeywords;
116 
117 /**********************************************************/
118 void ShrinkSpaces(char* line)
119 {
120  char* p;
121  char* q;
122  bool got_nl;
123 
124  if (! line || *line == '\0')
125  return;
126 
127  for (p = line; *p != '\0'; p++) {
128  if (*p == '\t')
129  *p = ' ';
130  if ((*p == ',' && p[1] == ',') || (*p == ';' && p[1] == ';'))
131  p[1] = ' ';
132  if ((p[1] == ',' || p[1] == ';') && p[0] == ' ') {
133  p[0] = p[1];
134  p[1] = ' ';
135  }
136  }
137 
138  for (p = line, q = line; *p != '\0';) {
139  *q = *p;
140  if (*p == ' ' || *p == '\n') {
141  for (got_nl = false; *p == ' ' || *p == '\n'; p++) {
142  if (*p == '\n')
143  got_nl = true;
144  }
145 
146  if (got_nl)
147  *q = '\n';
148  } else
149  p++;
150  q++;
151  }
152  if (q > line) {
153  for (q--; q > line && (*q == ' ' || *q == ';' || *q == '\n');)
154  q--;
155  if (*q != ' ' && *q != ';' && *q != '\n')
156  q++;
157  }
158  *q = '\0';
159 
160  for (p = line; *p == ' ' || *p == ';' || *p == '\n';)
161  p++;
162  if (p > line)
163  fta_StringCpy(line, p);
164 }
165 
166 /**********************************************************
167  *
168  * static void InsertDatablkVal(dbp, type, offset, len):
169  *
170  * Allocate a memory, then assign data-block value
171  * to a new node.
172  * dbp points to the new node if dbp is NULL.
173  *
174  * 3-18-93
175  *
176  **********************************************************/
177 static void InsertDatablkVal(DataBlkPtr* dbp, Int2 type, char* offset, size_t len)
178 {
179  DataBlk* ldp = new DataBlk(*dbp, type, offset, len);
180  if (! *dbp) {
181  *dbp = ldp;
182  }
183 }
184 
/**********************************************************
 *
 *   char* GetGenBankBlock(chain, ptr, retkw, eptr):
 *
 *      Enters knowing the current keyword type and offset,
 *   finds the length of the current keyword block,
 *   and appends the block to "chain".
 *      Since each keyword block always starts in the first
 *   column of a line, the loop stops when it finds the
 *   first character that is not a blank, newline, or tab
 *   immediately after a newline character.
 *      Each data block is appended to "chain".
 *      Returns a pointer to the next keyword block.
 *
 *                                              3-21-93
 *
 **********************************************************/
203 {
204  vector<string> lines;
205  NStr::Split(entry.mBaseData, "\n", lines);
206 
207  vector<string> sectionLines;
208  int currentKw = ParFlat_LOCUS;
209  int nextKw;
210  string sectionText;
211  for (const string& line : lines) {
212  nextKw = SrchKeyword(line, genbankKeywords);
213  if (nextKw == ParFlat_UNKW) {
214  nextKw = currentKw;
215  }
216  if (nextKw != currentKw || NStr::StartsWith(line, "REFERENCE")) {
217  auto* secPtr = new Section(currentKw, sectionLines);
218  // secPtr->DumpText(cerr);
219  entry.mSections.push_back(secPtr);
220  currentKw = nextKw;
221  sectionLines.clear();
222  sectionLines.push_back(line);
223  continue;
224  }
225  sectionLines.push_back(line);
226  }
227  entry.mSections.push_back(new Section(currentKw, sectionLines));
228 }
229 
230 char* GetGenBankBlock(DataBlkPtr* chain, char* ptr, Int2* retkw, char* eptr)
231 {
232  char* offset;
233  int curkw;
234  int nextkw;
235  Int4 len;
236 
237  len = 0;
238  offset = ptr;
239  curkw = *retkw;
240 
241  do /* repeat loop until it finds next key-word */
242  {
243  for (; ptr < eptr && *ptr != '\n'; ptr++)
244  len++;
245  if (ptr >= eptr)
246  return (ptr);
247 
248  ++ptr; /* newline character */
249  ++len;
250 
251  nextkw = SrchKeyword(CTempString(ptr, eptr - ptr), genbankKeywords);
252  if (nextkw == ParFlat_UNKW) /* it can be "XX" line,
253  treat as same line */
254  nextkw = curkw;
255 
256  if (StringEquN(ptr, "REFERENCE", 9)) /* treat as one block */
257  break;
258  } while (nextkw == curkw);
259 
260  nextkw = SrchKeyword(ptr, genbankKeywords);
261 
262  InsertDatablkVal(chain, curkw, offset, len);
263  *retkw = nextkw;
264  return (ptr);
265 }
266 
267 
268 /**********************************************************
269  *
270  * static void GetGenBankRefType(dbp, bases):
271  *
272  * Check the data in the "REFERENCE" line,
273  * - ParFlat_REF_END if it contains
274  * "(bases 1 to endbases)", pub for "descr"
275  * or no base range at all;
276  * - ParFlat_REF_SITES if it contains "(sites)",
277  * for ImpFeatPub;
278  * - ParFlat_REF_BTW, otherwise, for SeqFeatPub.
279  *
280  * 5-19-93
281  *
282  **********************************************************/
283 static void GetGenBankRefType(DataBlkPtr dbp, size_t bases)
284 {
285  char* bptr;
286  char* eptr;
287 
288  bptr = dbp->mOffset;
289  eptr = bptr + dbp->len;
290 
291  const string s = to_string(bases);
292  const string str = "(bases 1 to " + s + ")";
293  const string str1 = "(bases 1 to " + s + ";";
294  const string str2 = "(residues 1 to " + s + "aa)";
295 
296  string ref(bptr, bptr + dbp->len);
297 
298  while (bptr < eptr && *bptr != '\n' && *bptr != '(')
299  bptr++;
300  while (*bptr == ' ')
301  bptr++;
302 
303  if (*bptr == '\n')
305  else if (NStr::Find(ref, str) != NPOS || NStr::Find(ref, str1) != NPOS ||
306  NStr::Find(ref, str2) != NPOS)
307  dbp->mType = ParFlat_REF_END;
308  else if (NStr::Find(ref, "(sites)") != NPOS)
309  dbp->mType = ParFlat_REF_SITES;
310  else
311  dbp->mType = ParFlat_REF_BTW;
312 }
313 
314 /**********************************************************
315  *
316  * static void BuildFeatureBlock(dbp):
317  *
318  * The feature key in column 6-20.
319  *
320  * 5-3-93
321  *
322  **********************************************************/
324 {
325  char* bptr;
326  char* eptr;
327  char* ptr;
328  bool skip;
329 
330  bptr = dbp->mOffset;
331  eptr = bptr + dbp->len;
332  ptr = SrchTheChar(bptr, eptr, '\n');
333 
334  if (! ptr)
335  return;
336 
337  bptr = ptr + 1;
338 
339  while (bptr < eptr) {
340  InsertDatablkVal(reinterpret_cast<DataBlk**>(&dbp->mpData), ParFlat_FEATBLOCK, bptr, eptr - bptr);
341 
342  do {
343  bptr = SrchTheChar(bptr, eptr, '\n');
344  bptr++;
345 
346  skip = false;
347  if (! StringEquN(bptr, "XX", 2))
348  ptr = bptr + ParFlat_COL_FEATKEY;
349  else
350  skip = true;
351  } while ((*ptr == ' ' && ptr < eptr) || skip);
352  }
353 }
354 
355 /**********************************************************/
356 static void fta_check_mult_ids(DataBlkPtr dbp, const char* mtag, const char* ptag)
357 {
358  char* p;
359  Char ch;
360  Int4 muids;
361  Int4 pmids;
362 
363  if (! dbp || ! dbp->mOffset || (! mtag && ! ptag))
364  return;
365 
366  ch = dbp->mOffset[dbp->len];
367  dbp->mOffset[dbp->len] = '\0';
368 
369  size_t mlen = mtag ? StringLen(mtag) : 0;
370  size_t plen = ptag ? StringLen(ptag) : 0;
371 
372  muids = 0;
373  pmids = 0;
374  for (p = dbp->mOffset;; p++) {
375  p = StringChr(p, '\n');
376  if (! p)
377  break;
378  if (mtag && StringEquN(p + 1, mtag, mlen))
379  muids++;
380  else if (ptag && StringEquN(p + 1, ptag, plen))
381  pmids++;
382  }
383  dbp->mOffset[dbp->len] = ch;
384 
385  if (muids > 1) {
386  ErrPostEx(SEV_ERROR, ERR_REFERENCE_MultipleIdentifiers, "Reference has multiple MEDLINE identifiers. Ignoring all but the first.");
387  }
388  if (pmids > 1) {
389  ErrPostEx(SEV_ERROR, ERR_REFERENCE_MultipleIdentifiers, "Reference has multiple PUBMED identifiers. Ignoring all but the first.");
390  }
391 }
392 
393 /**********************************************************
394  *
395  * void GetGenBankSubBlock(entry, bases):
396  *
397  * 4-7-93
398  *
399  **********************************************************/
400 void GetGenBankSubBlock(const DataBlk& entry, size_t bases)
401 {
402  DataBlkPtr dbp;
403 
404  dbp = TrackNodeType(entry, ParFlat_SOURCE);
405  if (dbp) {
406  BuildSubBlock(dbp, ParFlat_ORGANISM, " ORGANISM");
407  GetLenSubNode(dbp);
408  }
409 
410  dbp = TrackNodeType(entry, ParFlat_REFERENCE);
411  for (; dbp; dbp = dbp->mpNext) {
412  if (dbp->mType != ParFlat_REFERENCE)
413  continue;
414 
415  fta_check_mult_ids(dbp, " MEDLINE", " PUBMED");
416  BuildSubBlock(dbp, ParFlat_AUTHORS, " AUTHORS");
417  BuildSubBlock(dbp, ParFlat_CONSRTM, " CONSRTM");
418  BuildSubBlock(dbp, ParFlat_TITLE, " TITLE");
419  BuildSubBlock(dbp, ParFlat_JOURNAL, " JOURNAL");
420  BuildSubBlock(dbp, ParFlat_MEDLINE, " MEDLINE");
421  BuildSubBlock(dbp, ParFlat_PUBMED, " PUBMED");
422  BuildSubBlock(dbp, ParFlat_STANDARD, " STANDARD");
423  BuildSubBlock(dbp, ParFlat_REMARK, " REMARK");
424  GetLenSubNode(dbp);
425  GetGenBankRefType(dbp, bases);
426  }
427 
428  dbp = TrackNodeType(entry, ParFlat_FEATURES);
429  for (; dbp; dbp = dbp->mpNext) {
430  if (dbp->mType != ParFlat_FEATURES)
431  continue;
432 
433  BuildFeatureBlock(dbp);
434  GetLenSubNode(dbp);
435  }
436 }
437 
438 // ----------------------------------------------------------------------------
439 void xGetGenBankSubBlocks(Entry& entry, size_t bases)
440 // ----------------------------------------------------------------------------
441 {
442  for (auto secPtr : entry.mSections) {
443  auto secType = secPtr->mType;
444  if (secType == ParFlat_SOURCE) {
445  secPtr->xBuildSubBlock(ParFlat_ORGANISM, " ORGANISM");
446  // GetLenSubNode(dbp);
447  }
448  if (secType == ParFlat_REFERENCE) {
449  // fta_check_mult_ids(dbp, " MEDLINE", " PUBMED");
450  secPtr->xBuildSubBlock(ParFlat_AUTHORS, " AUTHORS");
451  secPtr->xBuildSubBlock(ParFlat_CONSRTM, " CONSRTM");
452  secPtr->xBuildSubBlock(ParFlat_TITLE, " TITLE");
453  secPtr->xBuildSubBlock(ParFlat_JOURNAL, " JOURNAL");
454  secPtr->xBuildSubBlock(ParFlat_MEDLINE, " MEDLINE");
455  secPtr->xBuildSubBlock(ParFlat_PUBMED, " PUBMED");
456  secPtr->xBuildSubBlock(ParFlat_STANDARD, " STANDARD");
457  secPtr->xBuildSubBlock(ParFlat_REMARK, " REMARK");
458  // GetLenSubNode(dbp);
459  // GetGenBankRefType(dbp, bases);
460  }
461  if (secType == ParFlat_FEATURES) {
462  secPtr->xBuildFeatureBlocks();
463  // GetLenSubNode(dbp);
464  }
465  }
466 }
467 
/**********************************************************
 *
 *   char* GetEmblBlock(chain, ptr, retkw, format, eptr):
 *
 *      Enters knowing the current keyword type and offset,
 *   finds the length of the current keyword block, and
 *   appends the block to "chain".
 *      The loop continues until it finds the next keyword
 *   or the next "RN" after a newline character.
 *      Each data block is appended to "chain".
 *      Returns a pointer to the next keyword block.
 *
 *                                              3-21-93
 *
 *      The OS block can be
 *      - OS OS OC OC XX OG  ==> the normal case
 *        or
 *      - OS OC OC XX OS OS OC OC XX OG  ==> the hybrid case
 *      In the second case two OS blocks need to be made.
 *
 *                                              12-15-93
 *
 **********************************************************/
491 char* GetEmblBlock(DataBlkPtr* chain, char* ptr, short* retkw, Parser::EFormat format, char* eptr)
492 {
493  char* offset;
494  Int2 curkw;
495  Int2 nextkw;
496  bool seen_oc = false;
497 
498  size_t len = 0;
499  offset = ptr;
500  curkw = *retkw;
501 
502  do /* repeat loop until it finds next key-word */
503  {
504  for (; ptr < eptr && *ptr != '\n'; ptr++)
505  len++;
506  if (ptr >= eptr) {
507  *retkw = ParFlat_END;
508  return (ptr);
509  }
510  ++ptr; /* newline character */
511  ++len;
512 
513  nextkw = SrchKeyword(
514  CTempString(ptr, eptr - ptr),
516  if (nextkw == ParFlat_UNKW) /* it can be "XX" line,
517  treat as same line */
518  nextkw = curkw;
519  if (StringEquN(ptr, "RN", 2)) /* treat each RN per block */
520  break;
521  if (StringEquN(ptr, "ID", 2)) /* treat each ID per block */
522  break;
523 
524  if (StringEquN(ptr, "OC", 2))
525  seen_oc = true;
526 
527  if (StringEquN(ptr, "OS", 2) && seen_oc)
528  break; /* treat as next OS block */
529 
530  } while (nextkw == curkw);
531 
532  InsertDatablkVal(chain, curkw, offset, len);
533 
534  *retkw = nextkw;
535  return (ptr);
536 }
537 
538 /**********************************************************
539  *
540  * static bool TrimEmblFeatBlk(dbp):
541  *
542  * Routine return TRUE if found FT data.
543  * The routine do the following things:
544  * - only leave last one FH line;
545  * - replace all "FT" to " " in the beginning of line.
546  *
547  * 6-15-93
548  *
549  **********************************************************/
550 static bool TrimEmblFeatBlk(DataBlkPtr dbp)
551 {
552  char* bptr;
553  char* eptr;
554  char* ptr;
555  bool flag = false;
556 
557  bptr = dbp->mOffset;
558  eptr = bptr + dbp->len;
559  ptr = SrchTheChar(bptr, eptr, '\n');
560 
561  while (ptr && ptr + 1 < eptr) {
562  if (ptr[2] == 'H') {
563  dbp->len = dbp->len - (ptr - dbp->mOffset + 1);
564  dbp->mOffset = ptr + 1;
565 
566  bptr = dbp->mOffset;
567  eptr = bptr + dbp->len;
568  } else {
569  bptr = ptr + 1;
570 
571  if (bptr[1] == 'T') {
572  flag = true;
573  *bptr = ' ';
574  bptr[1] = ' ';
575  }
576  }
577 
578  ptr = SrchTheChar(bptr, eptr, '\n');
579  }
580 
581  return (flag);
582 }
583 
584 /**********************************************************
585  *
586  * static bool GetSubNodeType(subkw, retbptr, eptr):
587  *
588  * Return TRUE and memory location which has
589  * the "subkw".
590  *
591  * 6-15-93
592  *
593  **********************************************************/
594 static bool GetSubNodeType(const char* subkw, char** retbptr, char* eptr)
595 {
596  char* bptr;
597  char* ptr;
598 
599  bptr = *retbptr;
600  size_t sublen = StringLen(subkw);
601 
602  while (bptr < eptr) {
603  if (StringEquN(bptr, subkw, sublen)) {
604  *retbptr = bptr;
605  return true;
606  }
607 
608  ptr = SrchTheChar(bptr, eptr, '\n');
609  if (ptr)
610  bptr = ptr;
611  bptr++;
612  }
613 
614  *retbptr = bptr;
615  return false;
616 }
617 
618 /**********************************************************
619  *
620  * static void GetEmblRefType(bases, source, dbp):
621  *
622  * If there is no "RP" line, default, or there is "RP"
623  * line and it contains "1-endbases", then
624  * type = ParFlat_REF_END, pub for "descr".
625  * Otherwise, ParFlat_REF_BTW, for SeqFeatPub.
626  *
627  * 6-15-93
628  *
629  **********************************************************/
630 static void GetEmblRefType(size_t bases, Parser::ESource source, DataBlkPtr dbp)
631 {
632  char* ptr;
633  char* bptr;
634  char* eptr;
635  char* sptr;
636 
637  bptr = dbp->mOffset;
638  eptr = bptr + dbp->len;
639 
640  if (! GetSubNodeType("RP", &bptr, eptr)) {
643  else
644  dbp->mType = ParFlat_REF_END;
645  return;
646  }
647 
648  const string str = " 1-" + to_string(bases);
649  ptr = SrchTheStr(bptr, eptr, str.c_str());
650  if (ptr) {
651  dbp->mType = ParFlat_REF_END;
652  return;
653  }
654 
655  if (source == Parser::ESource::EMBL) {
656  ptr = SrchTheStr(bptr, eptr, " 0-0");
657  if (ptr) {
659  return;
660  }
661  }
662 
663  dbp->mType = ParFlat_REF_BTW;
664  if (source == Parser::ESource::NCBI) {
665  for (sptr = bptr + 1; sptr < eptr && *sptr != 'R';)
666  sptr++;
667  if (SrchTheStr(bptr, sptr, "sites"))
668  dbp->mType = ParFlat_REF_SITES;
669  }
670 }
671 
672 /**********************************************************
673  *
674  * void GetEmblSubBlock(bases, source, entry):
675  *
676  * To build feature block:
677  * - report error if no FT data in the FH block;
678  * - to fit genbank feature table parsing:
679  * - only leave first FH line;
680  * - replace "FT" to " ";
681  * - delete any XX blocks.
682  *
683  * 5-27-93
684  *
685  **********************************************************/
686 void GetEmblSubBlock(size_t bases, Parser::ESource source, const DataBlk& entry)
687 {
688  DataBlkPtr temp;
689  DataBlkPtr curdbp;
690  DataBlkPtr predbp;
691  EntryBlkPtr ebp;
692 
693  temp = TrackNodeType(entry, ParFlat_OS);
694  for (; temp; temp = temp->mpNext) {
695  if (temp->mType != ParFlat_OS)
696  continue;
697 
698  BuildSubBlock(temp, ParFlat_OC, "OC");
699  BuildSubBlock(temp, ParFlat_OG, "OG");
700  GetLenSubNode(temp);
701  }
702 
703  temp = TrackNodeType(entry, ParFlat_RN);
704  for (; temp; temp = temp->mpNext) {
705  if (temp->mType != ParFlat_RN)
706  continue;
707 
708  fta_check_mult_ids(temp, "RX MEDLINE;", "RX PUBMED;");
709  BuildSubBlock(temp, ParFlat_RC, "RC");
710  BuildSubBlock(temp, ParFlat_RP, "RP");
711  BuildSubBlock(temp, ParFlat_RX, "RX");
712  BuildSubBlock(temp, ParFlat_RG, "RG");
713  BuildSubBlock(temp, ParFlat_RA, "RA");
714  BuildSubBlock(temp, ParFlat_RT, "RT");
715  BuildSubBlock(temp, ParFlat_RL, "RL");
716  GetEmblRefType(bases, source, temp);
717  GetLenSubNode(temp);
718  }
719 
720  ebp = static_cast<EntryBlk*>(entry.mpData);
721  temp = ebp->chain;
722  predbp = temp;
723  curdbp = temp->mpNext;
724  while (curdbp) {
725  if (curdbp->mType != ParFlat_FH) {
726  predbp = curdbp;
727  curdbp = curdbp->mpNext;
728  continue;
729  }
730 
731  if (TrimEmblFeatBlk(curdbp)) {
732  BuildFeatureBlock(curdbp);
733  GetLenSubNode(curdbp);
734 
735  predbp = curdbp;
736  curdbp = curdbp->mpNext;
737  } else /* report error, free this node */
738  {
739  ErrPostStr(SEV_WARNING, ERR_FEATURE_NoFeatData, "No feature data in the FH block (Embl)");
740 
741  predbp->mpNext = curdbp->mpNext;
742  curdbp->mpNext = nullptr;
743  delete curdbp;
744  curdbp = predbp->mpNext;
745  }
746  }
747 }
748 
749 /**********************************************************
750  *
751  * void BuildSubBlock(dbp, subtype, subkw):
752  *
753  * Some of sub-keyword may not be exist in every entry.
754  *
755  * 4-7-93
756  *
757  **********************************************************/
758 void BuildSubBlock(DataBlkPtr dbp, Int2 subtype, const char* subkw)
759 {
760  char* bptr;
761  char* eptr;
762 
763  bptr = dbp->mOffset;
764  eptr = bptr + dbp->len;
765 
766  if (GetSubNodeType(subkw, &bptr, eptr)) {
767  InsertDatablkVal(reinterpret_cast<DataBlk**>(&dbp->mpData), subtype, bptr, eptr - bptr);
768  }
769 }
770 
771 /**********************************************************
772  *
773  * void GetLenSubNode(dbp):
774  *
775  * Recalculate the length for the node which has
776  * subkeywords.
777  *
778  * 4-7-93
779  *
780  **********************************************************/
782 {
783  DataBlkPtr curdbp;
784  DataBlkPtr ndbp;
785  DataBlkPtr ldbp;
786  char* offset;
787  char* s;
788  Int2 n;
789  bool done = false;
790 
791  if (! dbp->mpData) /* no sublocks in this block */
792  return;
793 
794  offset = dbp->mOffset;
795  for (s = offset; *s != '\0' && isdigit(*s) == 0;)
796  s++;
797  n = atoi(s);
798  ldbp = nullptr;
799  for (ndbp = static_cast<DataBlk*>(dbp->mpData); ndbp; ndbp = ndbp->mpNext) {
800  size_t l = ndbp->mOffset - offset;
801  if (l > 0 && l < dbp->len) {
802  dbp->len = l;
803  ldbp = ndbp;
804  }
805  }
806 
807  if (ldbp != dbp->mpData && ldbp) {
808  ErrPostEx(SEV_WARNING, ERR_FORMAT_LineTypeOrder, "incorrect line type order for reference %d", n);
809  done = true;
810  }
811 
812  curdbp = static_cast<DataBlk*>(dbp->mpData);
813  for (; curdbp->mpNext; curdbp = curdbp->mpNext) {
814  offset = curdbp->mOffset;
815  ldbp = nullptr;
816  for (ndbp = static_cast<DataBlk*>(dbp->mpData); ndbp; ndbp = ndbp->mpNext) {
817  size_t l = ndbp->mOffset - offset;
818  if (l > 0 && l < curdbp->len) {
819  curdbp->len = l;
820  ldbp = ndbp;
821  }
822  }
823  if (ldbp != curdbp->mpNext && ldbp && ! done) {
824  ErrPostEx(SEV_WARNING, ERR_FORMAT_LineTypeOrder, "incorrect line type order for reference %d", n);
825  }
826  }
827 }
828 
829 /**********************************************************/
831 {
832  CRef<CPatent_seq_id> pat_id;
833  const char* p;
834  const char* q;
835 
836  if (! acc || *acc == '\0')
837  return (pat_id);
838 
839  pat_id = new CPatent_seq_id;
840 
841  p = StringChr(acc, '|');
842 
843  q = StringChr(p + 1, '|');
844  pat_id->SetCit().SetCountry(string(p + 1, q));
845 
846  p = StringChr(q + 1, '|');
847  pat_id->SetCit().SetId().SetNumber(string(q + 1, p));
848 
849  q = StringChr(p + 1, '|');
850  pat_id->SetCit().SetDoc_type(string(p + 1, q));
851 
852  pat_id->SetSeqid(atoi(q + 1));
853 
854  return (pat_id);
855 }
856 
857 /**********************************************************
858  *
859  * static Uint ValidSeqType(accession, type, is_nuc, is_tpa):
860  *
861  * 9-16-93
862  *
863  **********************************************************/
864 static Uint1 ValidSeqType(const char* accession, Uint1 type)
865 {
866  // CSeq_id::E_Choice cho;
867 
870  return (type);
871 
874  return (CSeq_id::e_not_set);
875 
876  if (! accession)
877  return (type);
878 
879  const auto cho = CSeq_id::GetAccType(CSeq_id::IdentifyAccession(accession));
880  /*
881  if (is_nuc)
882  cho = GetNucAccOwner(accession);
883  else
884  cho = GetProtAccOwner(accession);
885  */
886  if ((type == CSeq_id::e_Genbank || type == CSeq_id::e_Tpg) &&
887  (cho == CSeq_id::e_Genbank || cho == CSeq_id::e_Tpg))
888  return (cho);
889  else if ((type == CSeq_id::e_Ddbj || type == CSeq_id::e_Tpd) &&
890  (cho == CSeq_id::e_Ddbj || cho == CSeq_id::e_Tpd))
891  return (cho);
892  else if ((type == CSeq_id::e_Embl || type == CSeq_id::e_Tpe) &&
893  (cho == CSeq_id::e_Embl || cho == CSeq_id::e_Tpe))
894  return (cho);
895  return type;
896 }
897 
898 /**********************************************************
899  *
900  * CRef<CSeq_id> MakeAccSeqId(acc, seqtype, accver, vernum,
901  * is_nuc, is_tpa):
902  *
903  * 5-10-93
904  *
905  **********************************************************/
906 CRef<CSeq_id> MakeAccSeqId(const char* acc, Uint1 seqtype, bool accver, Int2 vernum)
907 {
908  CRef<CSeq_id> id;
909 
910  if (! acc || *acc == '\0')
911  return id;
912 
913  seqtype = ValidSeqType(acc, seqtype);
914 
915  if (seqtype == CSeq_id::e_not_set)
916  return id;
917 
918  CRef<CTextseq_id> text_id(new CTextseq_id);
919  text_id->SetAccession(acc);
920 
921  if (accver && vernum > 0)
922  text_id->SetVersion(vernum);
923 
924  id = new CSeq_id;
925  SetTextId(seqtype, *id, *text_id);
926  return id;
927 }
928 
929 /**********************************************************
930  *
931  * SeqIdPtr MakeLocusSeqId(locus, seqtype):
932  *
933  * 5-13-93
934  *
935  **********************************************************/
936 CRef<CSeq_id> MakeLocusSeqId(const char* locus, CSeq_id::E_Choice seqtype)
937 {
938  CRef<CSeq_id> res;
939  if (! locus || *locus == '\0')
940  return res;
941 
942  CRef<CTextseq_id> text_id(new CTextseq_id);
943  text_id->SetName(locus);
944 
945  res.Reset(new CSeq_id);
946  SetTextId(seqtype, *res, *text_id);
947 
948  return res;
949 }
950 
951 // LCOV_EXCL_START
952 // Excluded per Mark's request on 12/14/2016
953 /**********************************************************/
954 static CRef<CSeq_id> MakeSegSetSeqId(const char* accession, const string& locus, Uint1 seqtype, bool is_tpa)
955 {
956  CRef<CSeq_id> res;
957  if (locus.empty())
958  return res;
959 
960  seqtype = ValidSeqType(accession, seqtype);
961 
962  if (seqtype == CSeq_id::e_not_set)
963  return res;
964 
965  CRef<CTextseq_id> text_id(new CTextseq_id);
966  text_id->SetName(locus);
967 
968  res.Reset(new CSeq_id);
969  SetTextId(seqtype, *res, *text_id);
970 
971  return res;
972 }
973 // LCOV_EXCL_STOP
974 
975 /**********************************************************
976  *
977  * char* SrchNodeSubType(entry, type, subtype, len):
978  *
979  * Return a memory location of the node which has
980  * the "subtype".
981  *
982  * 4-7-93
983  *
984  **********************************************************/
985 char* SrchNodeSubType(const DataBlk& entry, Int2 type, Int2 subtype, size_t* len)
986 {
987  DataBlkPtr mdbp;
988  DataBlkPtr sdbp;
989 
990  *len = 0;
991  mdbp = TrackNodeType(entry, type);
992  if (! mdbp)
993  return nullptr;
994 
995  sdbp = static_cast<DataBlk*>(mdbp->mpData);
996 
997  while (sdbp && sdbp->mType != subtype)
998  sdbp = sdbp->mpNext;
999 
1000  if (! sdbp)
1001  return nullptr;
1002 
1003  *len = sdbp->len;
1004  return (sdbp->mOffset);
1005 }
1006 
1007 /**********************************************************/
1008 static void SetEmptyId(CBioseq& bioseq)
1009 {
1010  CRef<CObject_id> emptyId(new CObject_id);
1011  emptyId->SetId8(0);
1012 
1013  CRef<CSeq_id> seqId(new CSeq_id);
1014  seqId->SetLocal(*emptyId);
1015 
1016  bioseq.SetId().push_back(seqId);
1017 }
1018 
1019 /**********************************************************/
1021 {
1022  IndexblkPtr ibp;
1023 
1024  char* locus;
1025  const char* acc;
1026  Uint1 seqtype;
1027 
1028  CRef<CBioseq> res(new CBioseq);
1029 
1030  /* create the entry framework */
1031 
1032  ibp = pp->entrylist[pp->curindx];
1033  locus = ibp->locusname;
1034  acc = ibp->acnum;
1035 
1036  /* get the SeqId */
1037  if (pp->source == Parser::ESource::USPTO) {
1038  CRef<CSeq_id> id(new CSeq_id);
1040  id->SetPatent(*psip);
1041  return (res);
1042  }
1043  if (pp->source == Parser::ESource::EMBL && ibp->is_tpa)
1044  seqtype = CSeq_id::e_Tpe;
1045  else
1046  seqtype = ValidSeqType(acc, pp->seqtype);
1047 
1048  if (seqtype == CSeq_id::e_not_set) {
1049  if (acc && ! NStr::IsBlank(acc)) {
1050  auto pId = Ref(new CSeq_id(CSeq_id::e_Local, acc));
1051  res->SetId().push_back(std::move(pId));
1052  } else if (pp->mode == Parser::EMode::Relaxed && locus) {
1053  auto pId = Ref(new CSeq_id(CSeq_id::e_Local, locus));
1054  res->SetId().push_back(std::move(pId));
1055  } else {
1056  SetEmptyId(*res);
1057  }
1058  } else if ((! locus || *locus == '\0') && (! acc || *acc == '\0')) {
1059  SetEmptyId(*res);
1060  } else {
1061  CRef<CTextseq_id> textId(new CTextseq_id);
1062 
1063  if (ibp->embl_new_ID == false && locus && *locus != '\0' &&
1064  (! acc || ! StringEqu(acc, locus)))
1065  textId->SetName(locus);
1066 
1067  if (acc && *acc != '\0')
1068  textId->SetAccession(acc);
1069 
1070  if (pp->accver && ibp->vernum > 0)
1071  textId->SetVersion(ibp->vernum);
1072 
1073  CRef<CSeq_id> seqId(new CSeq_id);
1074  if (SetTextId(seqtype, *seqId, *textId))
1075  res->SetId().push_back(seqId);
1076  else
1077  SetEmptyId(*res);
1078  }
1079 
1080  return res;
1081 }
1082 
1083 /**********************************************************
1084  *
1085  * char* GetDescrComment(offset, len, col_data, is_htg):
1086  *
1087  * Return a pointer to a string comment.
1088  * Strip tailing or leading blanks, unless the
1089  * following rules occurrs (all the length will count
1090  * leading or tailing blanks):
1091  * - replace "\n" to "~~ ~~" if the length of a
1092  * line <= 12, except first blank line;
1093  * - if the column 13 is blank in the current line
1094  * and the previous line does not be added "~" at
1095  * end, then add "~" the beginning of the line
1096  * (indent format);
1097  * - replace "\n" to "~" if the length of a
1098  * line < 50 and (not a last line or not a first
1099  * line);
1100  * -- otherwise, change "\n" to a space.
1101  *
1102  * 4-28-93
1103  *
1104  **********************************************************/
1105 char* GetDescrComment(char* offset, size_t len, Int2 col_data, bool is_htg, bool is_pat)
1106 {
1107  char* p;
1108  char* q;
1109  char* r;
1110  char* str;
1111 
1112  bool within = false;
1113  char* bptr = offset;
1114  char* eptr = bptr + len;
1115  char* com = StringNew(len);
1116 
1117  for (str = com; bptr < eptr; bptr = p + 1) {
1118  p = SrchTheChar(bptr, eptr, '\n');
1119 
1120  /* skip HTG generated comments starting with '*' */
1121  if ((is_htg && bptr[col_data] == '*') ||
1122  StringEquN(bptr, "XX", 2))
1123  continue;
1124 
1125  if (! within) {
1126  *p = '\0';
1127  r = StringStr(bptr, "-START##");
1128  *p = '\n';
1129  if (r)
1130  within = true;
1131  }
1132 
1133  q = bptr;
1134  if (*q == 'C')
1135  q++;
1136  if (*q == 'C')
1137  q++;
1138  while (*q == ' ')
1139  q++;
1140  if (q == p) {
1141  if (*(str - 1) != '~')
1142  *str++ = '~';
1143  *str++ = '~';
1144  continue;
1145  }
1146 
1147  if (p - bptr < col_data)
1148  continue;
1149 
1150  bptr += col_data;
1151  size_t size = p - bptr;
1152 
1153  if (*bptr == ' ' && *(str - 1) != '~')
1154  *str++ = '~';
1155  MemCpy(str, bptr, size);
1156  str += size;
1157  if (is_pat && size > 4 &&
1158  q[0] >= 'A' && q[0] <= 'Z' && q[1] >= 'A' && q[1] <= 'Z' &&
1159  StringEquN(q + 2, " ", 3))
1160  *str++ = '~';
1161  else if (size < 50 || within)
1162  *str++ = '~';
1163  else
1164  *str++ = ' ';
1165 
1166  if (within) {
1167  *p = '\0';
1168  r = StringStr(bptr, "-END##");
1169  *p = '\n';
1170  if (r)
1171  within = false;
1172  }
1173  }
1174 
1175  for (p = com;;) {
1176  p = StringStr(p, "; ");
1177  if (! p)
1178  break;
1179  for (p += 2, eptr = p; *eptr == ' ';)
1180  eptr++;
1181  if (eptr > p)
1182  fta_StringCpy(p, eptr);
1183  }
1184  for (p = com; *p == ' ';)
1185  p++;
1186  if (p > com)
1187  fta_StringCpy(com, p);
1188  for (p = com; *p != '\0';)
1189  p++;
1190  if (p > com) {
1191  for (p--;; p--) {
1192  if (*p == ' ' || *p == '\t' || *p == ';' || *p == ',' ||
1193  *p == '.' || *p == '~') {
1194  if (p > com)
1195  continue;
1196  *p = '\0';
1197  }
1198  break;
1199  }
1200  if (*p != '\0') {
1201  p++;
1202  if (StringEquN(p, "...", 3))
1203  p[3] = '\0';
1204  else if (StringChr(p, '.')) {
1205  *p = '.';
1206  p[1] = '\0';
1207  } else
1208  *p = '\0';
1209  }
1210  }
1211  if (*com != '\0')
1212  return (com);
1213  MemFree(com);
1214  return nullptr;
1215 }
1216 
1217 /**********************************************************/
// NOTE(review): the signature line of this function is not part of this listing;
// the body below works on a token-list node pointer named 'secs'.
//
// If the list begins with two nodes whose strings are non-null, the second
// string is exactly "-" and fta_if_wgs_acc() returns 0 for the first string,
// a new node holding a copy of the first string is linked in right after the
// first node and the last character of that copy is overwritten with '1'.
// In every other case the list is left untouched.
1219 {
1220  TokenBlkPtr tbp;
1221  char* p;
1222 
1223  if (! secs || ! secs->next || ! secs->str ||
1224  ! secs->next->str || fta_if_wgs_acc(secs->str) != 0 ||
1225  ! StringEqu(secs->next->str, "-"))
1226  return;
1227 
// Insert the copy between the first node and the "-" node.
1228  tbp = new TokenBlk;
1229  tbp->str = StringSave(secs->str);
1230  tbp->next = secs->next;
1231  secs->next = tbp;
1232 
// Walk to the last character of the copy and replace it with '1'.
// NOTE(review): this reads p[1] first, so it assumes the string is not empty;
// presumably fta_if_wgs_acc() == 0 guarantees that - confirm.
1233  for (p = tbp->str; *(p + 1) != '\0';)
1234  p++;
1235  *p = '1';
1236 }
1237 
1238 
1239 /**********************************************************/
1240 /*
1241 static void fta_fix_secondaries(list<string>& secondaries)
1242 {
1243  if (secondaries.size() < 2) {
1244  return;
1245  }
1246 
1247  auto it = secondaries.begin();
1248  const auto& first = *it;
1249  const auto& second = *next(it);
1250 
1251  if (first.empty()||
1252  second.empty() ||
1253  fta_if_wgs_acc(second.c_str()) != 0 ||
1254  second != "-") {
1255  return;
1256  }
1257 
1258  string newSecondary = *it;
1259  newSecondary.back() = '1';
1260  ++it;
1261  secondaries.insert(it, newSecondary);
1262 }
1263 */
1264 
1265 /**********************************************************
1266  *
1267  * void GetExtraAccession(ibp, allow_uwsec, source, accessions):
1268  *
1269  * Skips the first accession and puts the remaining
1270  * accessions into the list 'accessions'.
1271  * Accessions are separated by ";" or blanks.
1272  *
1273  **********************************************************/
// Walks the secondary-accession token list ibp->secaccs and appends the
// accepted tokens to 'accessions'. Depending on the WGS class codes that
// fta_if_wgs_acc() returns for the primary accession (ibp->acnum) and for each
// secondary, it posts warnings or rejects; every reject path also sets
// ibp->drop. 'allow_uwsec' downgrades several rejects to warnings.
// Returns immediately when ibp->secaccs is null.
1274 void GetExtraAccession(IndexblkPtr ibp, bool allow_uwsec, Parser::ESource source, TAccessionList& accessions)
1275 {
1276  TokenBlkPtr tbp;
1277  Int4 pri_acc;
1278  Int4 sec_acc;
1279  const char* text;
1280  char* acc;
1281  char* p;
1282  size_t i = 0;
1283 
1284  bool unusual_wgs;
1285  bool unusual_wgs_msg;
1286  bool is_cp;
1287 
1288  CSeq_id::E_Choice pri_owner;
1289  CSeq_id::E_Choice sec_owner;
1290 
1291  if (! ibp->secaccs) {
1292  return;
1293  }
1294 
// 'acc' is a private copy of the primary accession; it is released with
// MemFree() at the end of the function.
1295  acc = StringSave(ibp->acnum);
1296  is_cp = (acc[0] == 'C' && acc[1] == 'P');
1297  pri_acc = fta_if_wgs_acc(acc);
1298  pri_owner = GetNucAccOwner(acc);
// For primary classes 1 and 4 the copy is cut down to its leading run of
// upper-case letters and underscores; 'i' becomes the length of that prefix.
1299  if (pri_acc == 1 || pri_acc == 4) {
1300  for (p = acc; (*p >= 'A' && *p <= 'Z') || *p == '_';)
1301  p++;
1302  *p = '\0';
1303  i = StringLen(acc);
1304  }
1305 
// NOTE(review): original line 1307 (the body of this branch) is missing from
// this listing.
1306  if (source == Parser::ESource::EMBL) {
1308  }
1309 
1310  unusual_wgs = false;
1311  for (tbp = ibp->secaccs; tbp; tbp = tbp->next) {
1312  p = tbp->str;
// A lone "-" token denotes a range: "-" plus the following token are appended
// to the most recently stored accession (if there is one).
1313  if (p[0] == '-' && p[1] == '\0') {
1314  tbp = tbp->next;
1315  if (! tbp)
1316  break;
1317  if (! accessions.empty()) {
1318  accessions.back() += '-';
1319  accessions.back() += tbp->str;
1320  }
1321  continue;
1322  }
1323 
1324  DelNoneDigitTail(p);
1325  sec_acc = fta_if_wgs_acc(p);
1326 
1327  unusual_wgs_msg = true;
1328  if (sec_acc == 0 || sec_acc == 3 ||
1329  sec_acc == 4 || sec_acc == 6 ||
1330  sec_acc == 10 || sec_acc == 12) /* 0 = AAAA01000000,
1331  3 = AAAA00000000,
1332  4 = GAAA01000000,
1333  6 = GAAA00000000,
1334  10 = KAAA01000000,
1335  12 = KAAA00000000 */
1336  {
// NOTE(review): this compares the common *suffix* of the two accessions, while
// the project check further down compares the leading characters with
// StringEquN(); confirm that a suffix comparison is intended here.
1337  if (ibp->is_contig &&
1338  (ibp->wgssec.empty() || NStr::CommonSuffixSize(ibp->wgssec, p) >= 4))
1339  unusual_wgs_msg = false;
1340  if (ibp->wgssec.empty())
1341  ibp->wgssec = p;
1342  }
1343 
1344  sec_owner = GetNucAccOwner(p);
1345 
// Secondary classes < 0 and 2 are always stored; for primary classes 1, 5 and
// 11 a message is posted first (reject unless allow_uwsec).
1346  if (sec_acc < 0 || sec_acc == 2) {
1347  if (pri_acc == 1 || pri_acc == 5 || pri_acc == 11) {
1348  if (! allow_uwsec) {
1349  ErrPostEx(SEV_REJECT, ERR_ACCESSION_WGSWithNonWGS_Sec, "This WGS/TSA/TLS record has non-WGS/TSA/TLS secondary accession \"%s\". WGS/TSA/TLS records are not currently allowed to replace finished sequence records, scaffolds, etc. without human review and confirmation.", p);
1350  ibp->drop = true;
1351  } else {
1352  ErrPostEx(SEV_WARNING, ERR_ACCESSION_WGSWithNonWGS_Sec, "This WGS/TSA/TLS record has non-WGS/TSA/TLS secondary accession \"%s\". This is being allowed via the use of a special parser flag.", p);
1353  }
1354  }
1355 
1356  accessions.push_back(p);
1357  continue;
1358  }
1359 
// NOTE(review): original lines 1364, 1366 and 1367 are missing from this
// listing, so the condition and braces of the block below are incomplete.
1360  if (sec_acc == 3 || sec_acc == 6) /* like AAAA00000000 */
1361  {
1362  if (pri_owner == CSeq_id::e_Embl && sec_owner == CSeq_id::e_Embl &&
1363  (pri_acc == 1 || pri_acc == 5 || pri_acc == 11) &&
1365  continue;
1368  ErrPostEx(SEV_REJECT, ERR_ACCESSION_WGSMasterAsSecondary, "WGS/TSA/TLS master accession \"%s\" is not allowed to be used as a secondary accession number.", p);
1369  ibp->drop = true;
1370  }
1371  continue;
1372  }
1373 
1374  if (pri_acc == 1 || pri_acc == 5 || pri_acc == 11) /* WGS/TSA/TLS
1375  contig */
1376  {
// 'i' is overwritten here: 7 leading characters are compared for "NZ_"
// accessions, 4 otherwise. The same 'i' is used again in the storing step
// at the bottom of the loop.
1377  i = (StringEquN(p, "NZ_", 3)) ? 7 : 4;
1378  if (! StringEquN(p, ibp->acnum, i)) {
1379  if (! allow_uwsec) {
1380  ErrPostEx(SEV_REJECT, ERR_ACCESSION_UnusualWGS_Secondary, "This record has one or more WGS/TSA/TLS secondary accession numbers which imply that a WGS/TSA/TLS project is being replaced (either by another project or by finished sequence). This is not allowed without human review and confirmation.");
1381  ibp->drop = true;
1382  } else if (! is_cp || source != Parser::ESource::NCBI) {
1383  ErrPostEx(SEV_WARNING, ERR_ACCESSION_UnusualWGS_Secondary, "This record has one or more WGS/TSA/TLS secondary accession numbers which imply that a WGS/TSA project is being replaced (either by another project or by finished sequence). This is being allowed via the use of a special parser flag.");
1384  }
1385  }
1386  } else if (pri_acc == 2) /* WGS scaffold */
1387  {
1388  if (sec_acc == 1 || sec_acc == 5 || sec_acc == 11) /* WGS/TSA/TLS
1389  contig */
1390  {
1391  ErrPostEx(SEV_REJECT, ERR_ACCESSION_ScfldHasWGSContigSec, "This record, which appears to be a scaffold, has one or more WGS/TSA/TLS contig accessions as secondary. Currently, it does not make sense for a contig to replace a scaffold.");
1392  ibp->drop = true;
1393  }
// 'unusual_wgs' makes sure the message below is posted at most once per call.
1394  } else if (unusual_wgs_msg) {
1395  if (! allow_uwsec) {
1396  if (! unusual_wgs) {
1397  if (sec_acc == 1 || sec_acc == 5 || sec_acc == 11)
1398  text = "WGS/TSA/TLS contig secondaries are present, implying that a scaffold is replacing a contig";
1399  else
1400  text = "This record has one or more WGS/TSA/TLS secondary accession numbers which imply that a WGS/TSA/TLS project is being replaced (either by another project or by finished sequence)";
1401  ErrPostEx(SEV_REJECT, ERR_ACCESSION_UnusualWGS_Secondary, "%s. This is not allowed without human review and confirmation.", text);
1402  }
1403  unusual_wgs = true;
1404  ibp->drop = true;
1405  } else if (! is_cp || source != Parser::ESource::NCBI) {
1406  if (! unusual_wgs) {
1407  if (sec_acc == 1 || sec_acc == 5 || sec_acc == 11)
1408  text = "WGS/TSA/TLS contig secondaries are present, implying that a scaffold is replacing a contig";
1409  else
1410  text = "This record has one or more WGS/TSA/TLS secondary accession numbers which imply that a WGS/TSA/TLS project is being replaced (either by another project or by finished sequence)";
1411  ErrPostEx(SEV_WARNING, ERR_ACCESSION_UnusualWGS_Secondary, "%s. This is being allowed via the use of a special parser flag.", text);
1412  }
1413  unusual_wgs = true;
1414  }
1415  }
1416 
// Final decision whether this secondary is stored.
1417  if (pri_acc == 1 || pri_acc == 5 || pri_acc == 11) {
1418  if (StringEquN(acc, p, i) && p[i] >= '0' && p[i] <= '9') {
// NOTE(review): the third test reads pri_acc, not sec_acc, unlike the other
// "1 / 5 / 11" checks in this function - confirm this is intended.
1419  if (sec_acc == 1 || sec_acc == 5 || pri_acc == 11)
1420  accessions.push_back(p);
1421  } else if (allow_uwsec) {
1422  accessions.push_back(p);
1423  }
1424  } else if (pri_acc == 2) {
1425  if (sec_acc == 0 || sec_acc == 4) /* like AAAA10000000 */
1426  accessions.push_back(p);
1427  } else if (allow_uwsec || (! unusual_wgs_msg && (source == Parser::ESource::DDBJ || source == Parser::ESource::EMBL))) {
1428  accessions.push_back(p);
1429  }
1430  }
1431 
1432  MemFree(acc);
1433 }
1434 
1435 /**********************************************************/
1436 static void fta_fix_tpa_keywords(TKeywordList& keywords)
1437 {
1438  const char* p;
1439 
1440  for (string& key : keywords) {
1441  if (key.empty())
1442  continue;
1443 
1444  if (NStr::CompareNocase(key.c_str(), "TPA") == 0)
1445  key = "TPA";
1446  else if (StringEquNI(key.c_str(), "TPA:", 4)) {
1447  string buf("TPA:");
1448 
1449  for (p = key.c_str() + 4; *p == ' ' || *p == '\t';)
1450  p++;
1451 
1452  buf += p;
1453  if (fta_is_tpa_keyword(buf.c_str())) {
1454  for (string::iterator p = buf.begin() + 4; p != buf.end(); ++p) {
1455  if (*p >= 'A' && *p <= 'Z')
1456  *p |= 040;
1457  }
1458  }
1459 
1460  swap(key, buf);
1461  }
1462  }
1463 }
1464 
1465 // ----------------------------------------------------------------------------
1467  string& keywordData)
1468 // ----------------------------------------------------------------------------
1469 {
// NOTE(review): the first line of the signature is missing from this listing.
//
// Rewrites the first case-insensitive occurrence of "WGS Third Party Data" in
// 'keywordData' to "WGS; Third Party Data". The rewrite only happens when the
// match is directly followed by ';' or '.', and - unless the match is at the
// very start - is preceded by a ';' with nothing but blanks in between.
// In every other case 'keywordData' is left unchanged.
1470  const string problematic("WGS Third Party Data");
1471  const string desired("WGS; Third Party Data");
1472 
1473  if (keywordData.empty()) {
1474  return;
1475  }
1476  auto wgsStart = NStr::FindNoCase(keywordData, problematic);
1477  if (wgsStart == string::npos) {
1478  return;
1479  }
// When the match ends the string this indexes position size(); the result
// then matches neither ';' nor '.', so the function returns.
1480  auto afterProblematic = keywordData[wgsStart + problematic.size()];
1481  if (afterProblematic != ';' && afterProblematic != '.') {
1482  return;
1483  }
1484 
1485  string fixedKeywords;
1486  if (wgsStart > 0) {
1487  auto semiBefore = keywordData.rfind(';', wgsStart - 1);
1488  if (semiBefore == string::npos) {
1489  return;
1490  }
1491  for (auto i = semiBefore + 1; i < wgsStart; ++i) {
1492  if (keywordData[i] != ' ') {
1493  return;
1494  }
1495  }
// NOTE(review): substr(0, wgsStart - 1) leaves out the character right before
// the match. With a blank in front ("A; WGS ...") that drops the blank, but
// with no blank ("A;WGS ...") it drops the ';' itself - confirm intended.
1496  fixedKeywords = keywordData.substr(0, wgsStart - 1);
1497  }
1498  fixedKeywords += desired;
1499  fixedKeywords += keywordData.substr(wgsStart + problematic.size());
1500  keywordData = fixedKeywords;
1501 }
1502 
1503 
1504 // ----------------------------------------------------------------------------
1506  const DataBlk& entry,
1507  int type,
1508  int col_data,
1509  TKeywordList& keywords)
1510 // ----------------------------------------------------------------------------
1511 {
// NOTE(review): the first line of the signature (return type and function
// name) is missing from this listing.
//
// Rebuilds 'keywords' from the text of the 'type' node of 'entry': the text is
// flattened with xGetBlkDataReplaceNewLine(), passed through xStripECO() for
// ParFlatSP_KW blocks and through xFixEMBLKeywords(), split on ';', trimmed,
// and finally handed to fta_fix_tpa_keywords(). Empty pieces are discarded.
1512  // Expectation: Each keyword separated by ";", the last one ends with "."
1513 
1514  keywords.clear();
1515  auto keywordData = xGetNodeData(entry, type);
1516  if (keywordData.empty()) {
1517  return;
1518  }
1519  xGetBlkDataReplaceNewLine(keywordData, col_data);
1520  if (type == ParFlatSP_KW) {
1521  xStripECO(keywordData);
1522  }
1523  xFixEMBLKeywords(keywordData);
1524 
1525  NStr::Split(keywordData, ";", keywords);
1526  auto it = keywords.begin();
// NOTE(review): '--keywords.end()' assumes Split() produced at least one
// element; presumably guaranteed for non-empty input - confirm.
1527  auto last = --keywords.end();
1528  while (it != keywords.end()) {
1529  auto& keyword = *it;
1530  NStr::TruncateSpacesInPlace(keyword);
// Only the last piece may carry the terminating "."
1531  if (it == last) {
1532  NStr::TrimSuffixInPlace(keyword, ".");
1533  NStr::TruncateSpacesInPlace(keyword);
1534  }
1535  if (keyword.empty()) {
1536  keywords.erase(it++);
1537  } else {
1538  it++;
1539  }
1540  }
1541 
1542  fta_fix_tpa_keywords(keywords);
1543 }
1544 
1545 
1546 /**********************************************************
1547  *
1548  * Int4 ScanSequence(warn, seqptr, bsp, conv,
1549  * replacechar, numns):
1550  *
1551  * Scans a block of text converting characters to
1552  * sequence and storing in the ByteStorePtr bsp.
1553  * conv is a 255 Uint1 array where cells are indexed
1554  * by the ASCII value of the character in ptr:
1555  * - a value of 0 indicates skip;
1556  * - a value of 1 indicates a character is
1557  * unexpected (error);
1558  * - otherwise, it is a IUPACaa (protein) or a IUPACna
1559  * (nucleic acid) letter.
1560  * Function returns count of valid characters
1561  * converted to sequence.
1562  *
1563  * When sequence is presented in columns, this
1564  * function should be called once per line, so that
1565  * numbers can be recognized as errors.
1566  *
1567  * 3-30-93
1568  *
1569  * In order to skip the input flatfile put residue
1570  * label count at end, add blank variable to assume each
1571  * line only allow 6 blanks between residue.
1572  *
1573  * 7-28-93
1574  *
1575  **********************************************************/
1576 Int4 ScanSequence(bool warn, char** seqptr, std::vector<char>& bsp, unsigned char* conv, Char replacechar, int* numns)
1577 {
1578  Int2 blank;
1579  Int2 count;
1580  Uint1 residue;
1581  char* ptr;
1582  static Uint1 buf[133];
1583  unsigned char* bu;
1584 
1585  blank = count = 0;
1586  ptr = *seqptr;
1587 
1588  bu = buf;
1589  while (*ptr != '\n' && *ptr != '\0' && blank < 6 && count < 100) {
1590  if (numns && (*ptr == 'n' || *ptr == 'N'))
1591  (*numns)++;
1592 
1593  residue = conv[(int)*ptr];
1594 
1595  if (*ptr == ' ')
1596  blank++;
1597 
1598  if (residue > 2) {
1599  *bu++ = residue;
1600  count++;
1601  } else if (residue == 1 && (warn || isalpha(*ptr) != 0)) {
1602  /* it can be punctuation or alpha character */
1603  *bu++ = replacechar;
1604  count++;
1605  ErrPostEx(SEV_ERROR, ERR_SEQUENCE_BadResidue, "Invalid residue [%c]", *ptr);
1606  return (0);
1607  }
1608  ptr++;
1609  }
1610 
1611  *seqptr = ptr;
1612  std::copy(buf, bu, std::back_inserter(bsp));
1613  // BSWrite(bsp, buf, (Int4)(bu - buf));
1614  return (count);
1615 }
1616 
1617 /**********************************************************
1618  *
1619  * bool GetSeqData(pp, entry, bsp, nodetype, seqconv,
1620  * seq_data_type):
1621  *
1622  * Replace any bad residue to "N" if DNA sequence,
1623  * "X" if protein sequence.
1624  * PIR format allow punctuation in the sequence data,
1625  * so no warning message if found punctuation in the
1626  * sequence data.
1627  * Tatiana (mv from ScanSequence)
1628  *
1629  * 04-19-94
1630  *
1631  **********************************************************/
// Reads the sequence text of the current entry (pp->entrylist[pp->curindx]),
// converts it line by line with ScanSequence() and stores the result as
// Seq-data of 'bioseq'. The declared length ibp->bases is always set first;
// contig and MGA records return true without any sequence data.
// Returns false when no sequence text is found or ScanSequence() reports a
// bad residue.
// NOTE(review): original lines 1651, 1670-1671 and 1711 are missing from this
// listing, so some statements below appear without their context.
1632 bool GetSeqData(ParserPtr pp, const DataBlk& entry, CBioseq& bioseq, Int4 nodetype, unsigned char* seqconv, Uint1 seq_data_type)
1633 {
1634  // ByteStorePtr bp;
1635  IndexblkPtr ibp;
1636  char* seqptr;
1637  char* endptr;
1638  char* str;
1639  Char replacechar;
1640  size_t len = 0;
1641  Int4 numns;
1642 
1643  ibp = pp->entrylist[pp->curindx];
1644 
1645  bioseq.SetInst().SetLength(static_cast<TSeqPos>(ibp->bases));
1646 
1647  if (ibp->is_contig || ibp->is_mga)
1648  return true;
1649 
// XML input: 'str' is a buffer that is released with MemFree() before
// returning (the line that assigns it is missing here). Upper-case letters
// are folded to lower case unless this is a USPTO protein.
1650  if (pp->format == Parser::EFormat::XML) {
1652  seqptr = str;
1653  if (seqptr) {
1654  len = StringLen(seqptr);
1655  if (pp->source != Parser::ESource::USPTO || ! ibp->is_prot)
1656  for (char* p = seqptr; *p != '\0'; p++)
1657  if (*p >= 'A' && *p <= 'Z')
1658  *p |= 040; // tolower
1659  }
1660  } else {
1661  str = nullptr;
1662  seqptr = xSrchNodeType(entry, nodetype, &len);
1663  }
1664 
1665  if (! seqptr)
1666  return false;
1667 
1668  endptr = seqptr + len;
1669 
// NOTE(review): the condition that selects between 'N' and 'X' (original
// lines 1670-1671) is missing from this listing.
1672  replacechar = 'N';
1673  else
1674  replacechar = 'X';
1675 
1676  /* the sequence data will be located in next line of nodetype */
1677  if (pp->format == Parser::EFormat::XML) {
1678  while (*seqptr == ' ' || *seqptr == '\n' || *seqptr == '\t')
1679  seqptr++;
1680  } else {
// NOTE(review): neither loop below checks 'endptr'; they rely on the block
// containing a newline followed by a letter - confirm.
1681  while (*seqptr != '\n')
1682  seqptr++;
1683  while (isalpha(*seqptr) == 0) /* skip leading blanks and digits */
1684  seqptr++;
1685  }
1686 
1687  std::vector<char> buf;
1688  size_t seqlen = 0;
1689  for (numns = 0; seqptr < endptr;) {
1690  len = ScanSequence(true, &seqptr, buf, seqconv, replacechar, &numns);
1691  if (len == 0) {
1692  if (str)
1693  MemFree(str);
1694  return false;
1695  }
1696 
1697  seqlen += len;
// NOTE(review): *seqptr is read before the 'seqptr < endptr' test.
1698  while (isalpha(*seqptr) == 0 && seqptr < endptr)
1699  seqptr++;
1700  }
1701 
1702  if (seqlen != bioseq.GetLength()) {
1703  ErrPostEx(SEV_WARNING, ERR_SEQUENCE_SeqLenNotEq, "Measured seqlen [%ld] != given [%ld]", (long int)seqlen, (long int)bioseq.GetLength());
1704  }
1705 
1706  if (str)
1707  MemFree(str);
1708 
// NOTE(review): this branch tests for the amino-acid coding (e_Iupacaa), yet
// every message inside talks about basepairs / nucleotide Ns - confirm that
// the nucleotide coding was not meant.
1709  if (seq_data_type == CSeq_data::e_Iupacaa) {
1710  if (bioseq.GetLength() < 10) {
// NOTE(review): original line 1711 (the condition opening the warning/info
// variant of this check) is missing from this listing.
1712  if (ibp->is_pat == false)
1713  ErrPostEx(SEV_WARNING, ERR_SEQUENCE_TooShort, "This sequence for this record falls below the minimum length requirement of 10 basepairs.");
1714  else
1715  ErrPostEx(SEV_INFO, ERR_SEQUENCE_TooShortIsPatent, "This sequence for this patent record falls below the minimum length requirement of 10 basepairs.");
1716  } else {
1717  if (ibp->is_pat == false)
1718  ErrPostEx(SEV_REJECT, ERR_SEQUENCE_TooShort, "This sequence for this record falls below the minimum length requirement of 10 basepairs.");
1719  else
1720  ErrPostEx(SEV_REJECT, ERR_SEQUENCE_TooShortIsPatent, "This sequence for this patent record falls below the minimum length requirement of 10 basepairs.");
1721  ibp->drop = true;
1722  }
1723  }
// A sequence consisting only of 'n'/'N' is rejected.
1724  if (seqlen == static_cast<Uint4>(numns)) {
1725  ErrPostEx(SEV_REJECT, ERR_SEQUENCE_AllNs, "This nucleotide sequence for this record contains nothing but unknown (N) basepairs.");
1726  ibp->drop = true;
1727  }
1728  }
1729 
1730  bioseq.SetInst().SetSeq_data().Assign(CSeq_data(buf, static_cast<CSeq_data::E_Choice>(seq_data_type)));
1731 
1732  return true;
1733 }
1734 
1735 /**********************************************************
1736  *
1737  * unsigned char* GetDNAConv():
1738  *
1739  * DNA conversion table array.
1740  *
1741  * 3-29-93
1742  *
1743  **********************************************************/
// Builds the 255-cell character conversion table for nucleotide sequences:
// every cell starts as 1 (unexpected character), the blank (32) is 0 (skip),
// and for each code delivered by the loop below both the code's first
// character and its lower-case form map to that first character.
1744 unique_ptr<unsigned char[]> GetDNAConv(void)
1745 {
1746 
// The '()' value-initializes the array; MemSet() then overwrites every cell
// with 1.
1747  unique_ptr<unsigned char[]> dnaconv(new unsigned char[255]());
1748  MemSet((char*)dnaconv.get(), (Uint1)1, (size_t)255);
1749 
1750  dnaconv[32] = 0; /* blank */
1751 
// NOTE(review): original lines 1752 and 1754, which define 'range' and
// 'code', are missing from this listing.
1753  for (CSeqportUtil::TIndex i = range.first; i <= range.second; ++i) {
1755 
1756  dnaconv[static_cast<int>(code[0])] = code[0];
1757  dnaconv[(int)tolower(code[0])] = code[0];
1758  }
1759 
1760  return dnaconv;
1761 }
1762 
1763 /**********************************************************
1764  *
1765  * unsigned char* GetProteinConv():
1766  *
1767  * Protein conversion table array.
1768  *
1769  * 3-29-93
1770  *
1771  **********************************************************/
// Builds the 255-cell character conversion table for protein sequences:
// every cell starts as 1 (unexpected character), the blank (32) is 0 (skip),
// and for each code delivered by the loop below the code's first character
// maps to itself. Unlike GetDNAConv(), no lower-case entries are added.
1772 unique_ptr<unsigned char[]> GetProteinConv(void)
1773 {
1774  // unsigned char* protconv;
1775  unique_ptr<unsigned char[]> protconv(new unsigned char[255]());
1776 
1777  // protconv = (unsigned char*)MemNew((size_t)255); /* proteins */
1778  MemSet((char*)protconv.get(), (Uint1)1, (size_t)255); /* everything
1779  an error */
1780  protconv[32] = 0; /* blank */
1781 
// NOTE(review): original lines 1782 and 1784, which define 'range' and
// 'code', are missing from this listing.
1783  for (CSeqportUtil::TIndex i = range.first; i <= range.second; ++i) {
1785  protconv[(int)code[0]] = code[0]; /* swiss-prot, pir uses upper case
1786  protein code */
1787  }
1788 
1789  return (protconv);
1790 }
1791 
1792 /***********************************************************/
1793 static CSeq_descr::Tdata::const_iterator GetDescrByChoice(const CSeq_descr& descr, Uint1 choice)
1794 {
1795  const CSeq_descr::Tdata& descr_list = descr.Get();
1796 
1797  CSeq_descr::Tdata::const_iterator cur_descr = descr_list.begin();
1798  for (; cur_descr != descr_list.end(); ++cur_descr) {
1799  if ((*cur_descr)->Which() == choice)
1800  break;
1801  }
1802 
1803  return cur_descr;
1804 }
1805 
1806 // LCOV_EXCL_START
1807 // Excluded per Mark's request on 12/14/2016
1808 /**********************************************************
1809  *
1810  * static void GetFirstSegDescrChoice(bio_set, choice,
1811  * descr_new):
1812  *
1813  * 10-14-93
1814  *
1815  **********************************************************/
1816 static void GetFirstSegDescrChoice(CBioseq& bioseq, Uint1 choice, CSeq_descr& descr_new)
1817 {
1818  CSeq_descr& descr = bioseq.SetDescr();
1819  CSeq_descr::Tdata& descr_list = descr.Set();
1820 
1821  // Don't use GetDescrByChoice here just because GCC version does not support erase(const_iterator)
1822  CSeq_descr::Tdata::iterator cur_descr = descr_list.begin();
1823  for (; cur_descr != descr_list.end(); ++cur_descr) {
1824  if ((*cur_descr)->Which() == choice) {
1825  /* found the "choice" node, isolated node */
1826  descr_new.Set().push_back(*cur_descr);
1827  descr_list.erase(cur_descr);
1828  break;
1829  }
1830  }
1831 }
1832 // LCOV_EXCL_STOP
1833 
1834 // SameCitation and 'PubEquivMatch' have a bit different logic,
1835 // so below is an additional function that makes a check
1836 // for equality according to 'PubEquivMatch' rules
// NOTE(review): the signature line is missing from this listing; the body
// compares two pub containers named 'a' and 'b'.
//
// Returns true as soon as some pub of 'a' and some pub of 'b' satisfy
// SameCitation(), with one exception: two Cit-gen pubs that carry the same
// serial number and have none of volume, issue, pages, title, cit, authors,
// muid, journal or date set do NOT count as a match.
1838 {
1839  for (const CRef<CPub>& it1 : a.Get()) {
1840  for (const CRef<CPub>& it2 : b.Get()) {
1841  if (it1->SameCitation(*it2)) {
1842  bool same = true;
1843 
1844  if (it1->Which() == CPub::e_Gen && it2->Which() == CPub::e_Gen) {
1845  const CCit_gen& cit_a = it1->GetGen();
1846  const CCit_gen& cit_b = it2->GetGen();
1847 
1848  if (cit_a.IsSetSerial_number() && cit_b.IsSetSerial_number() && cit_a.GetSerial_number() == cit_b.GetSerial_number()) {
1849  // The special condition of 'PubEquivMatch'
1850  // a->volume == NULL && b->volume == NULL &&
1851  // a->issue == NULL && b->issue == NULL &&
1852  // a->pages == NULL && b->pages == NULL &&
1853  // a->title == NULL && b->title == NULL &&
1854  // a->cit == NULL && b->cit == NULL &&
1855  // a->authors == NULL && b->authors == NULL &&
1856  // a->muid == -1 && b->muid == -1 &&
1857  // a->journal == NULL && b->journal == NULL &&
1858  // a->date == NULL && b->date == NULL &&
1859  // a->serial_number != -1 && b->serial_number != -1
1860 
1861  if (! cit_a.IsSetVolume() && ! cit_b.IsSetVolume() &&
1862  ! cit_a.IsSetIssue() && ! cit_b.IsSetIssue() &&
1863  ! cit_a.IsSetPages() && ! cit_b.IsSetPages() &&
1864  ! cit_a.IsSetTitle() && ! cit_b.IsSetTitle() &&
1865  ! cit_a.IsSetCit() && ! cit_b.IsSetCit() &&
1866  ! cit_a.IsSetAuthors() && ! cit_b.IsSetAuthors() &&
1867  ! cit_a.IsSetMuid() && ! cit_b.IsSetMuid() &&
1868  ! cit_a.IsSetJournal() && ! cit_b.IsSetJournal() &&
1869  ! cit_a.IsSetDate() && ! cit_b.IsSetDate())
1870  same = false; // SIC!!!
1871  }
1872  }
1873 
1874  if (same)
1875  return true;
1876  }
1877  }
1878  }
1879 
1880  return false;
1881 }
1882 
1883 // LCOV_EXCL_START
1884 // Excluded per Mark's request on 12/14/2016
1885 /**********************************************************
1886  *
1887  * static bool CheckSegPub(pub, entries, same_pub_descr):
1888  *
1889  * 5-21-93
1890  *
1891  **********************************************************/
1892 static bool CheckSegPub(const CPubdesc& pub, TEntryList& entries, std::set<CSeqdesc*>& same_pub_descr)
1893 {
1894  if (! pub.IsSetPub() || ! pub.GetPub().IsSet() || pub.GetPub().Get().empty())
1895  return true;
1896 
1897  CRef<CPub> pub_ref = pub.GetPub().Get().front();
1898 
1899  if (! pub_ref->IsGen() || ! pub_ref->GetGen().IsSetSerial_number())
1900  return true;
1901 
1902  int num0 = pub_ref->GetGen().GetSerial_number();
1903 
1904  TEntryList::iterator next_seq = entries.begin();
1905  for (++next_seq; next_seq != entries.end(); ++next_seq) {
1906  if (! (*next_seq)->IsSetDescr())
1907  continue;
1908 
1909  CSeq_descr& descr = (*next_seq)->SetDescr();
1910 
1911  bool not_found = true;
1912  for (auto& cur_descr : descr.Set()) {
1913  if (! cur_descr->IsPub() || ! cur_descr->GetPub().IsSetPub() || ! cur_descr->GetPub().GetPub().IsSet() ||
1914  cur_descr->GetPub().GetPub().Get().empty())
1915  continue;
1916 
1917  const CPubdesc& cur_pub = cur_descr->GetPub();
1918  const CPub& cur_pub_ref = *cur_pub.GetPub().Get().front();
1919 
1920  if (! cur_pub_ref.IsGen() || ! cur_pub_ref.GetGen().IsSetSerial_number())
1921  continue;
1922 
1923  int num = cur_pub_ref.GetGen().GetSerial_number();
1924 
1925  if (! SameCitation_PubEquivMatch_Logic(cur_pub.GetPub(), pub.GetPub()))
1926  continue;
1927 
1928  if (num == num0) {
1929  same_pub_descr.insert(cur_descr); // store pointer to the same descr for future use
1930  not_found = false;
1931  break;
1932  }
1933 
1934  ErrPostStr(SEV_WARNING, ERR_SEGMENT_PubMatch, "Matching references with different serial numbers");
1935  }
1936 
1937  if (not_found)
1938  break;
1939  }
1940 
1941  return (next_seq == entries.end());
1942 }
1943 // LCOV_EXCL_STOP
1944 
1945 /***********************************************************/
1946 static void RemoveDescrByChoice(CSeq_descr& descr, Uint1 choice)
1947 {
1948  CSeq_descr::Tdata& descr_list = descr.Set();
1949 
1950  for (CSeq_descr::Tdata::iterator cur_descr = descr_list.begin(); cur_descr != descr_list.end();) {
1951  if ((*cur_descr)->Which() == choice)
1952  cur_descr = descr_list.erase(cur_descr);
1953  else
1954  ++cur_descr;
1955  }
1956 }
1957 
1958 /**********************************************************
1959  *
1960  * static void CleanUpSeqDescrChoice(entries, choice):
1961  *
1962  * 5-21-93
1963  *
1964  **********************************************************/
// NOTE(review): the signature line is missing from this listing; the body
// uses an entry list 'entries' and a descriptor kind 'choice'.
//
// Removes all descriptors of kind 'choice' from every entry except the first.
// The iterator is advanced past begin() unconditionally, so 'entries' is
// assumed to be non-empty.
1966 {
1967  TEntryList::iterator next_seq = entries.begin();
1968  ++next_seq;
1969 
1970  for (; next_seq != entries.end(); ++next_seq)
1971  RemoveDescrByChoice((*next_seq)->SetDescr(), choice);
1972 }
1973 
1974 /**********************************************************
1975  *
1976  * static void CleanUpSeqDescrPub(entries, to_clean):
1977  *
1978  * 1-13-16
1979  *
1980  **********************************************************/
1981 static void CleanUpSeqDescrPub(TEntryList& entries, std::set<CSeqdesc*>& to_clean)
1982 {
1983  TEntryList::iterator next_seq = entries.begin();
1984  ++next_seq;
1985 
1986  for (; next_seq != entries.end(); ++next_seq) {
1987  CSeq_descr::Tdata& descr_list = (*next_seq)->SetDescr().Set();
1988  for (CSeq_descr::Tdata::iterator cur_descr = descr_list.begin(); cur_descr != descr_list.end();) {
1989  std::set<CSeqdesc*>::iterator it = to_clean.find(*cur_descr);
1990  if (it != to_clean.end()) {
1991  cur_descr = descr_list.erase(cur_descr);
1992  to_clean.erase(it);
1993  } else
1994  ++cur_descr;
1995  }
1996  }
1997 }
1998 
1999 // LCOV_EXCL_START
2000 // Excluded per Mark's request on 12/14/2016
2001 /**********************************************************
2002  *
2003  * static void GetSegPub(entries, descr):
2004  *
2005  * 5-21-93
2006  *
2007  **********************************************************/
// NOTE(review): the signature line is missing from this listing; the body
// uses an entry list 'entries' and a target descriptor set 'descr'.
//
// For every publication descriptor of the first entry's Bioseq that
// CheckSegPub() accepts, the descriptor is moved to 'descr' and the matching
// descriptors collected by CheckSegPub() are removed from the other entries.
// Publications that are not accepted stay where they are.
2009 {
2010  CBioseq& bioseq = entries.front()->SetSeq();
2011  CSeq_descr::Tdata& descr_list = bioseq.SetDescr().Set();
2012 
2013  for (CSeq_descr::Tdata::iterator cur_descr = descr_list.begin(); cur_descr != descr_list.end();) {
2014  if ((*cur_descr)->IsPub()) {
2015  CPubdesc& pubdesc = (*cur_descr)->SetPub();
2016 
2017  std::set<CSeqdesc*> same_pub_descr;
2018  if (CheckSegPub(pubdesc, entries, same_pub_descr)) {
// Hand the descriptor over to the set level, then drop it from this list.
2019  descr.Set().push_back(*cur_descr);
2020  cur_descr = descr_list.erase(cur_descr);
2021 
2022  CleanUpSeqDescrPub(entries, same_pub_descr);
2023  } else
2024  ++cur_descr;
2025  } else
2026  ++cur_descr;
2027  }
2028 }
2029 
2030 /**********************************************************
2031  *
2032  * static bool CheckSegDescrChoice(entry, choice):
2033  *
2034  * 5-18-93
2035  *
2036  **********************************************************/
2037 static bool CheckSegDescrChoice(const TEntryList& entries, Uint1 choice)
2038 {
2039  string org;
2040  CDate date;
2041  Int4 modif = -1;
2042 
2043  bool no_problem_found = true;
2044  for (TEntryList::const_iterator seq = entries.begin(); seq != entries.end(); ++seq) {
2045  const CSeq_descr& descr = (*seq)->GetDescr();
2046  const CSeq_descr::Tdata& descr_list = descr.Get();
2047 
2048  CSeq_descr::Tdata::const_iterator cur_descr = GetDescrByChoice(descr, choice);
2049 
2050  if (cur_descr == descr_list.end()) {
2051  no_problem_found = false;
2052  break;
2053  }
2054 
2055  if (choice == CSeqdesc::e_Org) {
2056  if (org.empty())
2057  org = (*cur_descr)->GetOrg().GetTaxname();
2058  else if (org != (*cur_descr)->GetOrg().GetTaxname()) {
2059  no_problem_found = false;
2060  break;
2061  }
2062  } else if (choice == CSeqdesc::e_Modif) {
2063  Int4 val = *(*cur_descr)->GetModif().begin();
2064  if (modif == -1)
2065  modif = val;
2066  else if (modif != val) {
2067  no_problem_found = false;
2068  break;
2069  }
2070  } else /* Seq_descr_update_date */
2071  {
2072  if (date.Which() == CDate::e_not_set)
2073  date.Assign((*cur_descr)->GetUpdate_date());
2074  else if (date.Compare((*cur_descr)->GetUpdate_date()) != CDate::eCompare_same) {
2075  no_problem_found = false;
2076  break;
2077  }
2078  }
2079  }
2080 
2081  return no_problem_found;
2082 }
2083 // LCOV_EXCL_STOP
2084 
2085 /**********************************************************
2086  *
2087  * static char* GetBioseqSetDescrTitle(descr):
2088  *
2089  * Copy title from the first one, truncate before
2090  * "complete cds" or "exon".
2091  *
2092  * 5-18-93
2093  *
2094  **********************************************************/
// Returns a newly allocated (StringSave) copy of the first title descriptor
// in 'descr', cut off right before the first "complete cds" or, failing that,
// the first "exon". Returns nullptr when 'descr' has no title descriptor.
2095 static char* GetBioseqSetDescrTitle(const CSeq_descr& descr)
2096 {
2097  const Char* title;
2098  const Char* ptr;
2099 
2100  char* str;
2101 
2102  const CSeq_descr::Tdata& descr_list = descr.Get();
2103 
2104  CSeq_descr::Tdata::const_iterator cur_descr = descr_list.begin();
2105  for (; cur_descr != descr_list.end(); ++cur_descr) {
2106  if ((*cur_descr)->IsTitle())
2107  break;
2108  }
2109 
2110  if (cur_descr == descr_list.end())
2111  return nullptr;
2112 
2113  title = (*cur_descr)->GetTitle().c_str();
2114 
2115  ptr = StringStr(title, "complete cds");
2116  if (! ptr) {
2117  ptr = StringStr(title, "exon");
2118  }
2119 
2120  if (ptr) {
// Copy only the part of the title in front of the keyword.
// NOTE(review): original line 2122, which followed this statement, is missing
// from this listing.
2121  str = StringSave(string(title, ptr).c_str());
2123  } else {
2124  str = StringSave(title);
2125  }
2126 
2127  return str;
2128 }
2129 
2130 // LCOV_EXCL_START
2131 // Excluded per Mark's request on 12/14/2016
2132 /**********************************************************
2133  *
2134  * static void SrchSegDescr(TEntryList& entries, CSeq_descr& descr):
2135  *
2136  * Copy title from first one, truncate before
2137  * "complete cds" or "exon"
2138  * org, if they are all from one organism, then move
2139  * the data to this set, and make NULL to the sep chains
2140  * in which sep->mpData->descr->choice = Seq_descr_org.
2141  * modif, if they are all same modifier, then move
2142  * the data to this set, and make NULL to the sep chains
2143  * in which sep->mpData->descr->choice = Seq_descr_modif.
2144  *
2145  **********************************************************/
// NOTE(review): the signature line and original lines 2158, 2160, 2162, 2164
// and 2169-2171 are missing from this listing, so the conditions guarding the
// calls below are not visible.
//
// Collects set-level descriptors from the members of a segmented set into
// 'descr': a title derived from the first member, then the first member's
// org and modif descriptors, then the shared publications via GetSegPub().
2147 {
2148  CRef<CSeq_entry>& entry = entries.front();
2149  CBioseq& bioseq = entry->SetSeq();
2150 
// NOTE(review): 'title' comes from StringSave() inside
// GetBioseqSetDescrTitle(); no release is visible in the lines shown here -
// confirm it is not leaked.
2151  char* title = GetBioseqSetDescrTitle(bioseq.GetDescr());
2152  if (title) {
2153  CRef<CSeqdesc> desc_new(new CSeqdesc);
2154  desc_new->SetTitle(title);
2155  descr.Set().push_back(desc_new);
2156  }
2157 
2159  GetFirstSegDescrChoice(bioseq, CSeqdesc::e_Org, descr);
2161  }
2163  GetFirstSegDescrChoice(bioseq, CSeqdesc::e_Modif, descr);
2165  }
2166 
2167  GetSegPub(entries, descr);
2168 
2172  }
2173 }
2174 
2175 /**********************************************************/
2176 static void GetSegSetDblink(CSeq_descr& descr, TEntryList& entries /*SeqEntryPtr headsep*/, bool* drop)
2177 {
2178  if (entries.empty())
2179  return;
2180 
2181  CRef<CSeqdesc> gpid,
2182  dblink,
2183  cur_gpid,
2184  cur_dblink;
2185 
2186  Uint4 dblink_count = 0;
2187  Uint4 gpid_count = 0;
2188 
2189  bool bad_gpid = false;
2190  bool bad_dblink = false;
2191 
2192  for (auto& entry : entries) {
2193  cur_gpid.Reset();
2194  cur_dblink.Reset();
2195 
2196  CSeq_descr::Tdata& descr_list = entry->SetDescr();
2197 
2198  for (CSeq_descr::Tdata::iterator cur_descr = descr_list.begin(); cur_descr != descr_list.end();) {
2199  if (! (*cur_descr)->IsUser()) {
2200  ++cur_descr;
2201  continue;
2202  }
2203 
2204  const CUser_object& user = (*cur_descr)->GetUser();
2205  if (! user.CanGetType() || user.GetType().GetStr().empty()) {
2206  ++cur_descr;
2207  continue;
2208  }
2209 
2210  string type_str = user.GetType().GetStr();
2211 
2212  if (type_str == "DBLink") {
2213  if (cur_dblink.NotEmpty())
2214  continue;
2215 
2216  dblink_count++;
2217  cur_dblink = *cur_descr;
2218 
2219  if (dblink.Empty())
2220  dblink = cur_dblink;
2221 
2222  cur_descr = descr_list.erase(cur_descr);
2223  } else if (type_str == "GenomeProjectsDB") {
2224  if (cur_gpid.NotEmpty())
2225  continue;
2226 
2227  gpid_count++;
2228  cur_gpid = *cur_descr;
2229 
2230  if (gpid.Empty())
2231  gpid = cur_gpid;
2232 
2233  cur_descr = descr_list.erase(cur_descr);
2234  } else
2235  ++cur_descr;
2236  }
2237 
2238  if (cur_dblink.NotEmpty()) {
2239  if (dblink.Empty())
2240  dblink = cur_dblink;
2241  else {
2242  if (! cur_dblink->Equals(*dblink)) {
2243  bad_dblink = true;
2244  break;
2245  }
2246  }
2247  }
2248 
2249  if (cur_gpid.NotEmpty()) {
2250  if (gpid.Empty())
2251  gpid = cur_gpid;
2252  else {
2253  if (! cur_gpid->Equals(*gpid)) {
2254  bad_gpid = true;
2255  break;
2256  }
2257  }
2258  }
2259  }
2260 
2261  if (bad_dblink == false && bad_gpid == false) {
2262  if (dblink_count > 0 && entries.size() != dblink_count)
2263  bad_dblink = true;
2264  if (gpid_count > 0 && entries.size() != gpid_count)
2265  bad_gpid = true;
2266  }
2267 
2268  if (bad_dblink) {
2269  ErrPostEx(SEV_REJECT, ERR_SEGMENT_DBLinkMissingOrNonUnique, "One or more member of segmented set has missing or non-unique DBLink user-object. Entry dropped.");
2270  *drop = true;
2271  }
2272 
2273  if (bad_gpid) {
2274  ErrPostEx(SEV_REJECT, ERR_SEGMENT_GPIDMissingOrNonUnique, "One or more member of segmented set has missing or non-unique GPID user-object. Entry dropped.");
2275  *drop = true;
2276  }
2277 
2278  if (bad_gpid || bad_dblink ||
2279  (dblink.Empty() && gpid.Empty()) ||
2280  descr.Get().empty())
2281  return;
2282 
2283  if (dblink.NotEmpty())
2284  descr.Set().push_back(dblink);
2285  if (gpid.NotEmpty())
2286  descr.Set().push_back(gpid);
2287 }
2288 
2289 /**********************************************************
2290  *
2291  * static void GetBioseqSetDescr(entries, descr, drop)
2292  *
2293  * 1-20-16
2294  *
2295  **********************************************************/
2296 static void GetBioseqSetDescr(TEntryList& entries, CSeq_descr& descr, bool* drop)
2297 {
2298  SrchSegDescr(entries, descr); /* get from ASN.1 tree */
2299  GetSegSetDblink(descr, entries, drop);
2300 }
2301 
2302 /**********************************************************
2303  *
2304  * static const char *GetMoleculeClassString(mol):
2305  *
2306  * 6-25-93
2307  *
2308  **********************************************************/
2309 static const char* GetMoleculeClassString(Uint1 mol)
2310 {
2311  if (mol == 0)
2312  return ("not-set");
2313  if (mol == 1)
2314  return ("DNA");
2315  if (mol == 2)
2316  return ("RNA");
2317  if (mol == 3)
2318  return ("AA");
2319  if (mol == 4)
2320  return ("NA");
2321  return ("other");
2322 }
2323 
2324 /**********************************************************
2325  *
2326  * static CSeq_inst::EMol SrchSegSeqMol(entries):
2327  *
2328  * 5-14-93
2329  *
2330  **********************************************************/
2332 {
2333  const CBioseq& orig_bioseq = entries.front()->GetSeq();
2334  CSeq_inst::EMol mol = orig_bioseq.GetInst().GetMol();
2335 
2336  for (const auto& entry : entries) {
2337  const CBioseq& cur_bioseq = entry->GetSeq();
2338  if (mol == cur_bioseq.GetInst().GetMol())
2339  continue;
2340 
2341  ErrPostEx(SEV_WARNING, ERR_SEGMENT_DiffMolType, "Different molecule type in the segment set, \"%s\" to \"%s\"", GetMoleculeClassString(mol), GetMoleculeClassString(cur_bioseq.GetInst().GetMol()));
2342 
2343  return CSeq_inst::eMol_na;
2344  }
2345 
2346  return mol;
2347 }
2348 
2349 /**********************************************************
2350  *
2351  * static Int4 SrchSegLength(entries):
2352  *
2353  * 5-14-93
2354  *
2355  **********************************************************/
2357 {
2358  Int4 length = 0;
2359 
2360  for (const auto& entry : entries) {
2361  const CBioseq& cur_bioseq = entry->GetSeq();
2362  length += cur_bioseq.GetLength();
2363  }
2364 
2365  return (length);
2366 }
2367 
2368 /**********************************************************
2369  *
2370  * static CRef<CBioseq> GetBioseq(pp, orig_bioseq, slp):
2371  *
2372  * 5-12-93
2373  *
2374  **********************************************************/
2376 {
2377  IndexblkPtr ibp = pp->entrylist[pp->curindx];
2378  CRef<CBioseq> bioseq(new CBioseq);
2379 
2380  {
2381  string locusname = "SEG_";
2382  locusname.append(ibp->blocusname);
2383  bioseq->SetId().push_back(MakeSegSetSeqId(ibp->acnum, locusname, pp->seqtype, ibp->is_tpa));
2384  }
2385 
2386  if (pp->seg_acc) {
2387  string locusname = "SEG_";
2388  locusname.append(ibp->acnum);
2389  bioseq->SetId().push_back(MakeSegSetSeqId(ibp->acnum, locusname, pp->seqtype, ibp->is_tpa));
2390  }
2391 
2392  const CSeq_entry& first_entry = *(entries.front());
2393  const CBioseq& original = first_entry.GetSeq();
2394 
2395  char* title = GetBioseqSetDescrTitle(original.GetDescr());
2396 
2397  if (title) {
2398  CRef<CSeqdesc> descr(new CSeqdesc);
2399  descr->SetTitle(title);
2400 
2401  MemFree(title);
2402  bioseq->SetDescr().Set().push_back(descr);
2403  }
2404 
2405  CSeq_inst& inst = bioseq->SetInst();
2407  inst.SetMol(SrchSegSeqMol(entries));
2408 
2409  bool need_null = false;
2410 
2411  CRef<CSeq_loc> null_loc(new CSeq_loc());
2412  null_loc->SetNull();
2413 
2414  for (CSeq_loc::const_iterator seq_it = slp.begin(); seq_it != slp.end(); ++seq_it) {
2415  if (need_null)
2416  inst.SetExt().SetSeg().Set().push_back(null_loc);
2417  else
2418  need_null = true;
2419 
2420  CRef<CSeq_loc> seqloc(new CSeq_loc());
2421  seqloc->Assign(seq_it.GetEmbeddingSeq_loc());
2422  inst.SetExt().SetSeg().Set().push_back(seqloc);
2423  }
2424 
2426  inst.SetFuzz().SetLim(CInt_fuzz::eLim_gt);
2427 
2428  return bioseq;
2429 }
2430 // LCOV_EXCL_STOP
2431 
2432 /**********************************************************
2433  *
2434  * void GetSeqExt(pp, slp):
2435  *
2436  * 5-12-93
2437  *
2438  **********************************************************/
2439 void GetSeqExt(ParserPtr pp, CSeq_loc& seq_loc)
2440 {
2441  const Indexblk* ibp;
2442 
2443  ibp = pp->entrylist[pp->curindx];
2444 
2445  CRef<CSeq_id> id = MakeAccSeqId(ibp->acnum, pp->seqtype, pp->accver, ibp->vernum);
2446 
2447  if (id.NotEmpty()) {
2448  CSeq_loc loc;
2449  loc.SetWhole(*id);
2450 
2451  seq_loc.Add(loc);
2452  }
2453 }
2454 
2455 // LCOV_EXCL_START
2456 // Excluded per Mark's request on 12/14/2016
2457 /**********************************************************
2458  *
2459  * SeqEntryPtr BuildBioSegHeader(pp, headsep, seqloc):
2460  *
2461  * 2-24-94
2462  *
2463  **********************************************************/
2465 {
2466  if (entries.empty())
2467  return;
2468 
2469  IndexblkPtr ibp = pp->entrylist[pp->curindx];
2470 
2471  CRef<CBioseq> bioseq = GetBioseq(pp, entries, seqloc); /* Bioseq, ext */
2472 
2473  CRef<CSeq_entry> bioseq_entry(new CSeq_entry);
2474  bioseq_entry->SetSeq(*bioseq);
2475 
2476  CRef<CBioseq_set> bioseq_set(new CBioseq_set);
2477  bioseq_set->SetSeq_set().assign(entries.begin(), entries.end());
2478  bioseq_set->SetClass(CBioseq_set::eClass_parts);
2479 
2480  CRef<CSeq_entry> bioseq_set_entry(new CSeq_entry);
2481  bioseq_set_entry->SetSet(*bioseq_set);
2482 
2483  CRef<CBioseq_set> bioseq_set_head(new CBioseq_set);
2484  bioseq_set_head->SetSeq_set().push_back(bioseq_entry);
2485  bioseq_set_head->SetSeq_set().push_back(bioseq_set_entry);
2486 
2487  CRef<CSeq_descr> descr(new CSeq_descr);
2488  GetBioseqSetDescr(bioseq_set->SetSeq_set(), *descr, &ibp->drop);
2489  bioseq_set_head->SetDescr(*descr);
2490  bioseq_set_head->SetClass(CBioseq_set::eClass_segset);
2491 
2492  CRef<CSeq_entry> bioseq_set_head_entry(new CSeq_entry);
2493  bioseq_set_head_entry->SetSet(*bioseq_set_head);
2494 
2495  entries.clear();
2496  entries.push_back(bioseq_set_head_entry);
2497 }
2498 
2499 /**********************************************************
2500  *
2501  * bool IsSegBioseq(const CSeq_id& id):
2502  *
2503  * 8-16-93
2504  *
2505  **********************************************************/
2506 bool IsSegBioseq(const CSeq_id& id)
2507 {
2508  if (id.Which() == CSeq_id::e_Patent)
2509  return false;
2510 
2511  const CTextseq_id* text_id = id.GetTextseq_Id();
2512 
2513  if (! text_id)
2514  return (false);
2515 
2516  if (! text_id->IsSetAccession() && text_id->IsSetName() &&
2517  StringEquN(text_id->GetName().c_str(), "SEG_", 4))
2518  return (true);
2519  return (false);
2520 }
2521 // LCOV_EXCL_STOP
2522 
2523 /**********************************************************
2524  *
2525  * char* check_div(pat_acc, pat_ref, est_kwd, sts_kwd,
2526  * gss_kwd, if_cds, div, tech, bases,
2527  * source, drop):
2528  *
2529  * 8-16-93
2530  *
2531  * gss and 1000 limit added.
2532  * 9-09-96
2533  *
2534  **********************************************************/
2535 bool check_div(bool pat_acc, bool pat_ref, bool est_kwd, bool sts_kwd, bool gss_kwd, bool if_cds, string& div, CMolInfo::TTech* tech, size_t bases, Parser::ESource source, bool& drop)
2536 {
2537  if (div.empty())
2538  return false;
2539 
2540  if (pat_acc || pat_ref || StringEqu(div.c_str(), "PAT")) {
2541  if (pat_ref == false) {
2542  ErrPostEx(SEV_REJECT, ERR_DIVISION_MissingPatentRef, "Record in the patent division lacks a reference to a patent document. Entry dropped.");
2543  drop = true;
2544  }
2545  if (est_kwd) {
2546  ErrPostEx(SEV_WARNING, ERR_DIVISION_PATHasESTKeywords, "EST keywords present on patent sequence.");
2547  }
2548  if (sts_kwd) {
2549  ErrPostEx(SEV_WARNING, ERR_DIVISION_PATHasSTSKeywords, "STS keywords present on patent sequence.");
2550  }
2551  if (gss_kwd) {
2552  ErrPostEx(SEV_WARNING, ERR_DIVISION_PATHasGSSKeywords, "GSS keywords present on patent sequence.");
2553  }
2554  if (if_cds && source != Parser::ESource::EMBL) {
2555  ErrPostEx(SEV_INFO, ERR_DIVISION_PATHasCDSFeature, "CDS features present on patent sequence.");
2556  }
2557  if (! StringEqu(div.c_str(), "PAT")) {
2558  if (pat_acc)
2559  ErrPostEx(SEV_WARNING, ERR_DIVISION_ShouldBePAT, "Based on the accession number prefix letters, this is a patent sequence, but the division code is not PAT.");
2560 
2561  ErrPostEx(SEV_INFO, ERR_DIVISION_MappedtoPAT, "Division %s mapped to PAT based on %s.", div.c_str(), (pat_acc == false) ? "patent reference" : "accession number");
2562  div = "PAT";
2563  }
2564  } else if (est_kwd) {
2565  if (if_cds) {
2566  if (StringEqu(div.c_str(), "EST")) {
2567  ErrPostEx(SEV_WARNING, ERR_DIVISION_ESTHasCDSFeature, "Coding region features exist and division is EST; EST might not be appropriate.");
2568  } else {
2569  ErrPostEx(SEV_INFO, ERR_DIVISION_NotMappedtoEST, "EST keywords exist, but this entry was not mapped to the EST division because of the presence of CDS features.");
2570  if (*tech == CMolInfo::eTech_est)
2571  *tech = CMolInfo::eTech_unknown;
2572  }
2573  } else if (bases > 1000) {
2574  if (StringEqu(div.c_str(), "EST")) {
2575  ErrPostEx(SEV_WARNING, ERR_DIVISION_LongESTSequence, "Division code is EST, but the length of the sequence is %ld.", bases);
2576  } else {
2577  ErrPostEx(SEV_WARNING, ERR_DIVISION_NotMappedtoEST, "EST keywords exist, but this entry was not mapped to the EST division because of the sequence length %ld.", bases);
2578  if (*tech == CMolInfo::eTech_est)
2579  *tech = CMolInfo::eTech_unknown;
2580  }
2581  } else {
2582  if (! StringEqu(div.c_str(), "EST"))
2583  ErrPostEx(SEV_INFO, ERR_DIVISION_MappedtoEST, "%s division mapped to EST.", div.c_str());
2584  *tech = CMolInfo::eTech_est;
2585  div.clear();
2586  }
2587  } else if (StringEqu(div.c_str(), "EST")) {
2588  ErrPostEx(SEV_WARNING, ERR_DIVISION_MissingESTKeywords, "Division is EST, but entry lacks EST-related keywords.");
2589  if (sts_kwd) {
2590  ErrPostEx(SEV_WARNING, ERR_DIVISION_ESTHasSTSKeywords, "STS keywords present on EST sequence.");
2591  }
2592  if (if_cds) {
2593  ErrPostEx(SEV_WARNING, ERR_DIVISION_ESTHasCDSFeature, "Coding region features exist and division is EST; EST might not be appropriate.");
2594  }
2595  } else if (sts_kwd) {
2596  if (if_cds) {
2597  if (StringEqu(div.c_str(), "STS")) {
2598  ErrPostEx(SEV_WARNING, ERR_DIVISION_STSHasCDSFeature, "Coding region features exist and division is STS; STS might not be appropriate.");
2599  } else {
2600  ErrPostEx(SEV_WARNING, ERR_DIVISION_NotMappedtoSTS, "STS keywords exist, but this entry was not mapped to the STS division because of the presence of CDS features.");
2601  if (*tech == CMolInfo::eTech_sts)
2602  *tech = CMolInfo::eTech_unknown;
2603  }
2604  } else if (bases > 1000) {
2605  if (StringEqu(div.c_str(), "STS")) {
2606  ErrPostEx(SEV_WARNING, ERR_DIVISION_LongSTSSequence, "Division code is STS, but the length of the sequence is %ld.", bases);
2607  } else {
2608  ErrPostEx(SEV_WARNING, ERR_DIVISION_NotMappedtoSTS, "STS keywords exist, but this entry was not mapped to the STS division because of the sequence length %ld.", bases);
2609  if (*tech == CMolInfo::eTech_sts)
2610  *tech = CMolInfo::eTech_unknown;
2611  }
2612  } else {
2613  if (! StringEqu(div.c_str(), "STS"))
2614  ErrPostEx(SEV_INFO, ERR_DIVISION_MappedtoSTS, "%s division mapped to STS.", div.c_str());
2615  *tech = CMolInfo::eTech_sts;
2616  div.clear();
2617  }
2618  } else if (StringEqu(div.c_str(), "STS")) {
2619  ErrPostEx(SEV_WARNING, ERR_DIVISION_MissingSTSKeywords, "Division is STS, but entry lacks STS-related keywords.");
2620  if (if_cds) {
2621  ErrPostEx(SEV_WARNING, ERR_DIVISION_STSHasCDSFeature, "Coding region features exist and division is STS; STS might not be appropriate.");
2622  }
2623  } else if (gss_kwd) {
2624  if (if_cds) {
2625  if (StringEqu(div.c_str(), "GSS")) {
2626  ErrPostEx(SEV_WARNING, ERR_DIVISION_GSSHasCDSFeature, "Coding region features exist and division is GSS; GSS might not be appropriate.");
2627  } else {
2628  ErrPostEx(SEV_WARNING, ERR_DIVISION_NotMappedtoGSS, "GSS keywords exist, but this entry was not mapped to the GSS division because of the presence of CDS features.");
2629  if (*tech == CMolInfo::eTech_survey)
2630  *tech = CMolInfo::eTech_unknown;
2631  }
2632  } else if (bases > 2500) {
2633  if (StringEqu(div.c_str(), "GSS")) {
2634  ErrPostEx(SEV_WARNING, ERR_DIVISION_LongGSSSequence, "Division code is GSS, but the length of the sequence is %ld.", bases);
2635  } else {
2636  ErrPostEx(SEV_WARNING, ERR_DIVISION_NotMappedtoGSS, "GSS keywords exist, but this entry was not mapped to the GSS division because of the sequence length %ld.", bases);
2637  if (*tech == CMolInfo::eTech_survey)
2638  *tech = CMolInfo::eTech_unknown;
2639  }
2640  } else {
2641  if (! StringEqu(div.c_str(), "GSS"))
2642  ErrPostEx(SEV_INFO, ERR_DIVISION_MappedtoGSS, "%s division mapped to GSS.", div.c_str());
2643  *tech = CMolInfo::eTech_survey;
2644  div.clear();
2645  }
2646  } else if (StringEqu(div.c_str(), "GSS")) {
2647  ErrPostEx(SEV_WARNING, ERR_DIVISION_MissingGSSKeywords, "Division is GSS, but entry lacks GSS-related keywords.");
2648  if (if_cds) {
2649  ErrPostEx(SEV_WARNING, ERR_DIVISION_GSSHasCDSFeature, "Coding region features exist and division is GSS; GSS might not be appropriate.");
2650  }
2651  } else if (StringEqu(div.c_str(), "TSA")) {
2652  *tech = CMolInfo::eTech_tsa;
2653  div.clear();
2654  }
2655 
2656  return ! div.empty();
2657 }
2658 
2659 /**********************************************************/
2660 CRef<CSeq_id> StrToSeqId(const char* pch, bool pid)
2661 {
2662  long lID;
2663  char* pchEnd;
2664 
2665  CRef<CSeq_id> id;
2666 
2667  /* Figure out--what source is it */
2668  if (*pch == 'd' || *pch == 'e') {
2669  /* Get ID */
2670  errno = 0; /* clear errors, the error flag from stdlib */
2671  lID = strtol(pch + 1, &pchEnd, 10);
2672 
2673  if (! ((lID == 0 && pch + 1 == pchEnd) || (lID == LONG_MAX && errno == ERANGE))) {
2674  /* Allocate new SeqId */
2675 
2676  id = new CSeq_id;
2678  tag->SetStr(string(pch, pchEnd - pch));
2679 
2680  CRef<CDbtag> dbtag(new CDbtag);
2681  dbtag->SetTag(*tag);
2682  dbtag->SetDb(pid ? "PID" : "NID");
2683 
2684  id->SetGeneral(*dbtag);
2685  }
2686  }
2687 
2688  return id;
2689 }
2690 
2691 /**********************************************************/
2692 void AddNIDSeqId(CBioseq& bioseq, const DataBlk& entry, Int2 type, Int2 coldata, Parser::ESource source)
2693 {
2694  DataBlkPtr dbp;
2695  char* offset;
2696 
2697  dbp = TrackNodeType(entry, type);
2698  if (! dbp)
2699  return;
2700 
2701  offset = dbp->mOffset + coldata;
2702  CRef<CSeq_id> sid = StrToSeqId(offset, false);
2703  if (sid.Empty())
2704  return;
2705 
2706  if (! (*offset == 'g' && (source == Parser::ESource::DDBJ || source == Parser::ESource::EMBL)))
2707  bioseq.SetId().push_back(sid);
2708 }
2709 
2710 /**********************************************************/
2711 static void CheckDivCode(TEntryList& seq_entries, ParserPtr pp)
2712 {
2713  for (auto& entry : seq_entries) {
2714  for (CTypeIterator<CBioseq> bioseq(Begin(*entry)); bioseq; ++bioseq) {
2715  if (bioseq->IsSetDescr()) {
2716  CGB_block* gb_block = nullptr;
2717  CMolInfo* molinfo = nullptr;
2719 
2720  for (auto& descr : bioseq->SetDescr().Set()) {
2721  if (descr->IsGenbank() && ! gb_block)
2722  gb_block = &descr->SetGenbank();
2723  else if (descr->IsMolinfo() && ! molinfo) {
2724  molinfo = &descr->SetMolinfo();
2725  tech = molinfo->GetTech();
2726  }
2727 
2728  if (gb_block && molinfo)
2729  break;
2730  }
2731 
2732  if (! gb_block)
2733  continue;
2734 
2735  IndexblkPtr ibp = pp->entrylist[pp->curindx];
2736 
2737  if (tech == CMolInfo::eTech_tsa &&
2738  ! NStr::CompareNocase(ibp->division, "TSA"))
2739  continue;
2740 
2741  if (! gb_block->IsSetDiv()) {
2742  ErrPostEx(SEV_WARNING, ERR_DIVISION_GBBlockDivision, "input division code is preserved in GBBlock");
2743  gb_block->SetDiv(ibp->division);
2744  }
2745  }
2746  }
2747  }
2748 }
2749 
2750 /**********************************************************/
2751 static const CBioSource* GetTopBiosource(const CSeq_entry& entry)
2752 {
2753  const TSeqdescList& descrs = GetDescrPointer(entry);
2754  for (const auto& descr : descrs) {
2755  if (descr->IsSource())
2756  return &(descr->GetSource());
2757  }
2758 
2759  return nullptr;
2760 }
2761 
2762 /**********************************************************/
2763 static bool SeqEntryCheckTaxonDiv(const CSeq_entry& entry)
2764 {
2765  const CBioSource* bio_src = GetTopBiosource(entry);
2766  if (! bio_src)
2767  return false;
2768 
2769  if (! bio_src->IsSetOrg() || ! bio_src->GetOrg().IsSetOrgname() || ! bio_src->GetOrg().GetOrgname().IsSetDiv())
2770  return false;
2771 
2772  return true;
2773 }
2774 
2775 /**********************************************************/
2777 {
2778  if (seq_entries.empty())
2779  return;
2780 
2781  if (! SeqEntryCheckTaxonDiv(*seq_entries.front())) {
2782  CheckDivCode(seq_entries, pp);
2783  }
2784 }
2785 
2786 /**********************************************************/
2787 void DefVsHTGKeywords(CMolInfo::TTech tech, const DataBlk& entry, Int2 what, Int2 ori, bool cancelled)
2788 {
2789  DataBlkPtr dbp;
2790  const char** b;
2791  char* tmp;
2792  char* p;
2793  char* q;
2794  char* r;
2795  Char c;
2796  Int2 count;
2797 
2798  dbp = TrackNodeType(entry, what);
2799  if (! dbp || ! dbp->mOffset || dbp->len < 1)
2800  p = nullptr;
2801  else {
2802  q = dbp->mOffset + dbp->len - 1;
2803  c = *q;
2804  *q = '\0';
2805  tmp = StringSave(dbp->mOffset);
2806  *q = c;
2807  for (q = tmp; *q != '\0'; q++) {
2808  if (*q == '\n' && StringEquN(q + 1, "DE ", 5))
2809  fta_StringCpy(q, q + 5);
2810  else if (*q == '\n' || *q == '\t')
2811  *q = ' ';
2812  }
2813  for (q = tmp, p = tmp; *p != '\0'; p++) {
2814  if (*p == ' ' && p[1] == ' ')
2815  continue;
2816  *q++ = *p;
2817  }
2818  *q = '\0';
2819  for (b = magic_phrases, p = nullptr; *b && ! p; b++)
2820  p = StringStr(tmp, *b);
2821  MemFree(tmp);
2822  }
2823 
2824  if ((tech == CMolInfo::eTech_htgs_0 || tech == CMolInfo::eTech_htgs_1 ||
2825  tech == CMolInfo::eTech_htgs_2) &&
2826  ! p && ! cancelled) {
2827  ErrPostEx(SEV_WARNING, ERR_DEFINITION_HTGNotInProgress, "This Phase 0, 1 or 2 HTGS sequence is lacking an indication that sequencing is still in progress on its definition/description line.");
2828  } else if (tech == CMolInfo::eTech_htgs_3 && p) {
2829  ErrPostEx(SEV_ERROR, ERR_DEFINITION_HTGShouldBeComplete, "This complete Phase 3 sequence has a definition/description line indicating that its sequencing is still in progress.");
2830  }
2831 
2832  if (tech != CMolInfo::eTech_htgs_3)
2833  return;
2834 
2835  dbp = TrackNodeType(entry, ori);
2836  if (! dbp || ! dbp->mOffset || dbp->len < 1)
2837  return;
2838  r = new char[dbp->len + 1];
2839  if (! r)
2840  return;
2841  StringNCpy(r, dbp->mOffset, dbp->len);
2842  r[dbp->len] = '\0';
2843  for (p = r, q = r; *p != '\0'; p++)
2844  if (*p >= 'a' && *p <= 'z')
2845  *q++ = *p;
2846  *q = '\0';
2847 
2848  for (count = 0, p = r; *p != '\0'; p++) {
2849  if (*p != 'n')
2850  count = 0;
2851  else if (++count > 10) {
2852  ErrPostEx(SEV_WARNING, ERR_SEQUENCE_UnknownBaseHTG3, "This complete Phase 3 HTGS sequence has one or more runs of 10 contiguous unknown ('n') bases.");
2853  break;
2854  }
2855  }
2856  delete[] r;
2857 }
2858 
2859 /**********************************************************/
2860 void XMLDefVsHTGKeywords(CMolInfo::TTech tech, const char* entry, XmlIndexPtr xip, bool cancelled)
2861 {
2862  const char** b;
2863  char* tmp;
2864  char* p;
2865  char* q;
2866  char* r;
2867  Int2 count;
2868 
2869  if (! entry || ! xip)
2870  return;
2871 
2872  tmp = XMLFindTagValue(entry, xip, INSDSEQ_DEFINITION);
2873  if (! tmp)
2874  p = nullptr;
2875  else {
2876  for (q = tmp; *q != '\0'; q++)
2877  if (*q == '\n' || *q == '\t')
2878  *q = ' ';
2879  for (q = tmp, p = tmp; *p != '\0'; p++) {
2880  if (*p == ' ' && p[1] == ' ')
2881  continue;
2882  *q++ = *p;
2883  }
2884  *q = '\0';
2885  for (b = magic_phrases, p = nullptr; *b && ! p; b++)
2886  p = StringStr(tmp, *b);
2887  MemFree(tmp);
2888  }
2889 
2890  if ((tech == CMolInfo::eTech_htgs_0 || tech == CMolInfo::eTech_htgs_1 ||
2891  tech == CMolInfo::eTech_htgs_2) &&
2892  ! p && ! cancelled) {
2893  ErrPostEx(SEV_WARNING, ERR_DEFINITION_HTGNotInProgress, "This Phase 0, 1 or 2 HTGS sequence is lacking an indication that sequencing is still in progress on its definition/description line.");
2894  } else if (tech == CMolInfo::eTech_htgs_3 && p) {
2895  ErrPostEx(SEV_ERROR, ERR_DEFINITION_HTGShouldBeComplete, "This complete Phase 3 sequence has a definition/description line indicating that its sequencing is still in progress.");
2896  }
2897 
2898  if (tech != CMolInfo::eTech_htgs_3)
2899  return;
2900 
2901  r = XMLFindTagValue(entry, xip, INSDSEQ_SEQUENCE);
2902  if (! r)
2903  return;
2904 
2905  for (count = 0, p = r; *p != '\0'; p++) {
2906  if (*p != 'n')
2907  count = 0;
2908  else if (++count > 10) {
2909  ErrPostEx(SEV_WARNING, ERR_SEQUENCE_UnknownBaseHTG3, "This complete Phase 3 HTGS sequence has one or more runs of 10 contiguous unknown ('n') bases.");
2910  break;
2911  }
2912  }
2913  MemFree(r);
2914 }
2915 
2916 /**********************************************************/
2917 void CheckHTGDivision(const char* div, CMolInfo::TTech tech)
2918 {
2919  if (div && StringEqu(div, "HTG") && tech == CMolInfo::eTech_htgs_3) {
2920  ErrPostEx(SEV_WARNING, ERR_DIVISION_ShouldNotBeHTG, "This Phase 3 HTGS sequence is still in the HTG division. If truly complete, it should move to a non-HTG division.");
2921  } else if ((! div || ! StringEqu(div, "HTG")) &&
2922  (tech == CMolInfo::eTech_htgs_0 || tech == CMolInfo::eTech_htgs_1 ||
2923  tech == CMolInfo::eTech_htgs_2)) {
2924  ErrPostEx(SEV_ERROR, ERR_DIVISION_ShouldBeHTG, "Phase 0, 1 or 2 HTGS sequences should have division code HTG.");
2925  }
2926 }
2927 
2928 /**********************************************************/
2930 {
2931  if (entry.IsSeq())
2932  return entry.GetSeq().GetDescr();
2933 
2934  return entry.GetSet().GetDescr();
2935 }
2936 
2937 /**********************************************************/
/*
 * Trims a visible string in place:
 *   - leading characters that are <= ' ' are removed;
 *   - trailing characters that are ';' or <= ' ' are removed, except that a
 *     single ';' is kept when it appears to close an "&...;" sequence (the
 *     nearest '&', ' ' or ';' before it is an '&').
 * A string consisting only of such characters becomes empty.
 */
static void CleanVisString(std::string& str)
{
    if (str.empty())
        return;

    // Skip leading blanks and control characters.
    size_t start_pos = 0;
    while (start_pos < str.size() && str[start_pos] <= ' ')
        ++start_pos;

    if (start_pos == str.size()) {
        str.clear();
        return;
    }

    str = str.substr(start_pos);

    // Walk back over trailing ';' and blanks. On exit end_pos is one past
    // the last character to keep, or 0 when nothing is left.
    size_t end_pos = str.size() - 1;
    for (;; --end_pos) {
        if (str[end_pos] == ';' || str[end_pos] <= ' ') {
            if (end_pos == 0)
                break;
            continue;
        }
        ++end_pos;
        break;
    }

    // str[str.size()] is the terminating '\0', so the index is always valid.
    if (str[end_pos] != ';' || end_pos == 0) {
        if (end_pos == 0)
            str.clear();
        else
            str = str.substr(0, end_pos);

        return;
    }

    // The first dropped character is a ';': keep it if it terminates an
    // "&...;" sequence.
    size_t amp_pos = end_pos - 1;
    for (; amp_pos; --amp_pos) {
        if (str[amp_pos] == ' ' || str[amp_pos] == '&' || str[amp_pos] == ';')
            break;
    }

    if (str[amp_pos] == '&')
        ++end_pos;

    str = str.substr(0, end_pos);
}
2984 
2985 /**********************************************************/
2986 static void CleanVisStringList(list<string>& str_list)
2987 {
2988  for (list<string>::iterator it = str_list.begin(); it != str_list.end();) {
2989  CleanVisString(*it);
2990 
2991  if (it->empty())
2992  it = str_list.erase(it);
2993  else
2994  ++it;
2995  }
2996 }
2997 
2998 /**********************************************************/
2999 static void CheckGBBlock(TSeqdescList& descrs, bool& got)
3000 {
3001  const Char* div = nullptr;
3002 
3003  for (const auto& descr : descrs) {
3004  if (! descr->IsEmbl())
3005  continue;
3006 
3007  if (! descr->GetEmbl().IsSetDiv() || descr->GetEmbl().GetDiv() > 15)
3008  continue;
3009 
3010  div = GetEmblDiv(descr->GetEmbl().GetDiv());
3011  break;
3012  }
3013 
3014  for (TSeqdescList::iterator descr = descrs.begin(); descr != descrs.end();) {
3015  if (! (*descr)->IsGenbank()) {
3016  ++descr;
3017  continue;
3018  }
3019 
3020  CGB_block& gb_block = (*descr)->SetGenbank();
3021  if (div && gb_block.IsSetDiv() && NStr::CompareNocase(div, gb_block.GetDiv().c_str()) == 0)
3022  gb_block.ResetDiv();
3023 
3024  if (gb_block.IsSetSource()) {
3025  got = true;
3026  } else if (gb_block.IsSetDiv() && gb_block.GetDiv() != "PAT" &&
3027  gb_block.GetDiv() != "SYN") {
3028  got = true;
3029  }
3030 
3031  if (gb_block.IsSetExtra_accessions()) {
3033  if (gb_block.GetExtra_accessions().empty())
3034  gb_block.ResetExtra_accessions();
3035  }
3036 
3037 
3038  if (gb_block.IsSetKeywords()) {
3039  CleanVisStringList(gb_block.SetKeywords());
3040  if (gb_block.GetKeywords().empty())
3041  gb_block.ResetKeywords();
3042  }
3043 
3044  if (gb_block.IsSetSource()) {
3045  string& buf = gb_block.SetSource();
3047  if (buf.empty())
3048  gb_block.ResetSource();
3049  }
3050 
3051  if (gb_block.IsSetOrigin()) {
3052  string& buf = gb_block.SetOrigin();
3054  if (buf.empty())
3055  gb_block.ResetOrigin();
3056  }
3057 
3058  if (gb_block.IsSetDate()) {
3059  string& buf = gb_block.SetDate();
3061  if (buf.empty())
3062  gb_block.ResetDate();
3063  }
3064 
3065  if (gb_block.IsSetDiv()) {
3066  string& buf = gb_block.SetDiv();
3068  if (buf.empty())
3069  gb_block.ResetDiv();
3070  }
3071 
3072  if (! gb_block.IsSetExtra_accessions() && ! gb_block.IsSetSource() &&
3073  ! gb_block.IsSetKeywords() && ! gb_block.IsSetOrigin() &&
3074  ! gb_block.IsSetDate() && ! gb_block.IsSetEntry_date() &&
3075  ! gb_block.IsSetDiv()) {
3076  descr = descrs.erase(descr);
3077  } else {
3078  ++descr;
3079  }
3080  }
3081 }
3082 
3083 /**********************************************************/
3085 {
3086  bool got = false;
3087 
3088  for (auto& entry : seq_entries) {
3089  for (CTypeIterator<CBioseq> bioseq(Begin(*entry)); bioseq; ++bioseq) {
3090  if (bioseq->IsSetDescr())
3091  CheckGBBlock(bioseq->SetDescr().Set(), got);
3092  }
3093 
3094  for (CTypeIterator<CBioseq_set> bio_set(Begin(*entry)); bio_set; ++bio_set) {
3095  if (bio_set->IsSetDescr())
3096  CheckGBBlock(bio_set->SetDescr().Set(), got);
3097  }
3098  }
3099 
3100  return (got);
3101 }
3102 
3103 /**********************************************************/
3104 static int GetSerialNumFromPubEquiv(const CPub_equiv& pub_eq)
3105 {
3106  int ret = -1;
3107  for (const auto& pub : pub_eq.Get()) {
3108  if (pub->IsGen()) {
3109  if (pub->GetGen().IsSetSerial_number()) {
3110  ret = pub->GetGen().GetSerial_number();
3111  break;
3112  }
3113  }
3114  }
3115 
3116  return ret;
3117 }
3118 
3119 /**********************************************************/
3120 static bool fta_if_pubs_sorted(const CPub_equiv& pub1, const CPub_equiv& pub2)
3121 {
3122  Int4 num1 = GetSerialNumFromPubEquiv(pub1);
3123  Int4 num2 = GetSerialNumFromPubEquiv(pub2);
3124 
3125  return num1 < num2;
3126 }
3127 
3128 /**********************************************************/
3129 static bool descr_cmp(const CRef<CSeqdesc>& desc1,
3130  const CRef<CSeqdesc>& desc2)
3131 {
3132  if (desc1->Which() == desc2->Which() && desc1->IsPub()) {
3133  const CPub_equiv& pub1 = desc1->GetPub().GetPub();
3134  const CPub_equiv& pub2 = desc2->GetPub().GetPub();
3135  return fta_if_pubs_sorted(pub1, pub2);
3136  }
3137  if (desc1->Which() == desc2->Which() && desc1->IsUser()) {
3138  const CUser_object& uop1 = desc1->GetUser();
3139  const CUser_object& uop2 = desc2->GetUser();
3140  const char* str1;
3141  const char* str2;
3142  if (uop1.IsSetType() && uop1.GetType().IsStr() &&
3143  uop2.IsSetType() && uop2.GetType().IsStr()) {
3144  str1 = uop1.GetType().GetStr().c_str();
3145  str2 = uop2.GetType().GetStr().c_str();
3146  if (strcmp(str1, str2) <= 0)
3147  return (true);
3148  return (false);
3149  }
3150  }
3151 
3152  return desc1->Which() < desc2->Which();
3153 }
3154 
3155 /**********************************************************/
3156 void fta_sort_descr(TEntryList& seq_entries)
3157 {
3158  for (auto& entry : seq_entries) {
3159  for (CTypeIterator<CBioseq> bioseq(Begin(*entry)); bioseq; ++bioseq) {
3160  if (bioseq->IsSetDescr())
3161  bioseq->SetDescr().Set().sort(descr_cmp);
3162  }
3163 
3164  for (CTypeIterator<CBioseq_set> bio_set(Begin(*entry)); bio_set; ++bio_set) {
3165  if (bio_set->IsSetDescr())
3166  bio_set->SetDescr().Set().sort(descr_cmp);
3167  }
3168  }
3169 }
3170 
3171 /**********************************************************/
3172 static bool pub_cmp(const CRef<CPub>& pub1, const CRef<CPub>& pub2)
3173 {
3174  if (pub1->Which() == pub2->Which()) {
3175  if (pub1->IsMuid()) {
3176  return pub1->GetMuid() < pub2->GetMuid();
3177  } else if (pub1->IsGen()) {
3178  const CCit_gen& cit1 = pub1->GetGen();
3179  const CCit_gen& cit2 = pub2->GetGen();
3180 
3181  if (cit1.IsSetCit() && cit2.IsSetCit())
3182  return cit1.GetCit() < cit2.GetCit();
3183  }
3184  }
3185 
3186  return pub1->Which() < pub2->Which();
3187 }
3188 
3189 /**********************************************************/
3190 static void sort_feat_cit(CBioseq::TAnnot& annots)
3191 {
3192  for (auto& annot : annots) {
3193  if (annot->IsFtable()) {
3194  for (auto& feat : annot->SetData().SetFtable()) {
3195  if (feat->IsSetCit() && feat->GetCit().IsPub()) {
3196  // feat->SetCit().SetPub().sort(pub_cmp); TODO: may be this sort would be OK, the only difference with original one is it is stable
3197 
3198  TPubList& pubs = feat->SetCit().SetPub();
3199  for (TPubList::iterator pub = pubs.begin(); pub != pubs.end(); ++pub) {
3200  TPubList::iterator next_pub = pub;
3201  for (++next_pub; next_pub != pubs.end(); ++next_pub) {
3202  if (pub_cmp(*next_pub, *pub))
3203  swap(*next_pub, *pub);
3204  }
3205  }
3206  }
3207  }
3208  }
3209  }
3210 }
3211 
3212 /**********************************************************/
3214 {
3215  for (auto& entry : seq_entries) {
3216  for (CTypeIterator<CBioseq> bioseq(Begin(*entry)); bioseq; ++bioseq) {
3217  if (bioseq->IsSetAnnot())
3218  sort_feat_cit(bioseq->SetAnnot());
3219  }
3220 
3221  for (CTypeIterator<CBioseq_set> bio_set(Begin(*entry)); bio_set; ++bio_set) {
3222  if (bio_set->IsSetAnnot())
3223  sort_feat_cit(bio_set->SetAnnot());
3224  }
3225  }
3226 }
3227 
3228 /**********************************************************/
3230 {
3231  for (const auto& tag : dbtags) {
3232  if (tag->IsSetDb() && tag->IsSetTag() &&
3233  ! tag->GetTag().IsStr() && tag->GetTag().GetId() > 0 &&
3234  tag->GetDb() == "taxon")
3235  return true;
3236  }
3237  return false;
3238 }
3239 
3240 /**********************************************************/
3241 void fta_fix_orgref_div(const CBioseq::TAnnot& annots, COrg_ref* org_ref, CGB_block& gbb)
3242 {
3243  Int4 count;
3244 
3245  if (! org_ref || ! gbb.IsSetDiv())
3246  return;
3247 
3248  count = 1;
3249  if (org_ref->IsSetOrgname() && ! org_ref->GetOrgname().IsSetDiv() &&
3250  ! fta_orgref_has_taxid(org_ref->GetDb())) {
3251  org_ref->SetOrgname().SetDiv(gbb.GetDiv());
3252  count--;
3253  }
3254 
3255  for (const auto& annot : annots) {
3256  if (! annot->IsFtable())
3257  continue;
3258 
3259  const CSeq_annot::C_Data::TFtable& feats = annot->GetData().GetFtable();
3260  for (const auto& feat : feats) {
3261  if (! feat->IsSetData() || ! feat->GetData().IsBiosrc())
3262  continue;
3263 
3264  count++;
3265 
3266  const CBioSource& bio_src = feat->GetData().GetBiosrc();
3267  if (bio_src.IsSetOrg() && ! fta_orgref_has_taxid(bio_src.GetOrg().GetDb())) {
3268  org_ref->SetOrgname().SetDiv(gbb.GetDiv());
3269  count--;
3270  }
3271  }
3272  }
3273 
3274  if (count > 0)
3275  return;
3276 
3277  gbb.ResetDiv();
3278 }
3279 
3280 /**********************************************************/
3281 bool XMLCheckCDS(const char* entry, XmlIndexPtr xip)
3282 {
3283  XmlIndexPtr txip;
3284  XmlIndexPtr fxip;
3285 
3286  if (! entry || ! xip)
3287  return (false);
3288 
3289  for (; xip; xip = xip->next)
3290  if (xip->tag == INSDSEQ_FEATURE_TABLE && xip->subtags)
3291  break;
3292  if (! xip)
3293  return (false);
3294 
3295  for (txip = xip->subtags; txip; txip = txip->next) {
3296  if (! txip->subtags)
3297  continue;
3298  for (fxip = txip->subtags; fxip; fxip = fxip->next)
3299  if (fxip->tag == INSDFEATURE_KEY && fxip->end - fxip->start == 3 &&
3300  StringEquN(entry + fxip->start, "CDS", 3))
3301  break;
3302  if (fxip)
3303  break;
3304  }
3305 
3306  if (! txip)
3307  return (false);
3308  return (true);
3309 }
3310 
3311 /**********************************************************/
3313 {
3314  for (auto& entry : seq_entries) {
3315  for (CTypeIterator<CBioseq> bioseq(Begin(*entry)); bioseq; ++bioseq) {
3316  if (bioseq->IsSetInst() && bioseq->GetInst().IsSetStrand())
3317  continue;
3318 
3319  if (bioseq->GetInst().IsSetMol()) {
3320  CSeq_inst::EMol mol = bioseq->GetInst().GetMol();
3321  if (mol == CSeq_inst::eMol_dna)
3322  bioseq->SetInst().SetStrand(CSeq_inst::eStrand_ds);
3323  else if (mol == CSeq_inst::eMol_rna || mol == CSeq_inst::eMol_aa)
3324  bioseq->SetInst().SetStrand(CSeq_inst::eStrand_ss);
3325  }
3326  }
3327  }
3328 }
3329 
3330 /*****************************************************************************/
3331 static bool SwissProtIDPresent(const TEntryList& seq_entries)
3332 {
3333  for (const auto& entry : seq_entries) {
3334  for (CTypeConstIterator<CBioseq> bioseq(Begin(*entry)); bioseq; ++bioseq) {
3335  if (bioseq->IsSetId()) {
3336  for (const auto& id : bioseq->GetId()) {
3337  if (id->IsSwissprot())
3338  return true;
3339  }
3340  }
3341  }
3342  }
3343 
3344  return false;
3345 }
3346 
3347 /*****************************************************************************/
3348 static bool IsCitEmpty(const CCit_gen& cit)
3349 {
3350  if (cit.IsSetCit() || cit.IsSetAuthors() || cit.IsSetMuid() ||
3351  cit.IsSetJournal() || cit.IsSetVolume() || cit.IsSetIssue() ||
3352  cit.IsSetPages() || cit.IsSetDate() || cit.IsSetTitle() ||
3353  cit.IsSetPmid() || cit.IsSetSerial_number())
3354  return false;
3355 
3356  return true;
3357 }
3358 
3359 /*****************************************************************************/
3360 static void RemoveSerials(TPubList& pubs)
3361 {
3362  for (TPubList::iterator pub = pubs.begin(); pub != pubs.end();) {
3363  if ((*pub)->IsGen()) {
3364  if ((*pub)->GetGen().IsSetSerial_number())
3365  (*pub)->SetGen().ResetSerial_number();
3366 
3367  if (IsCitEmpty((*pub)->GetGen()))
3368  pub = pubs.erase(pub);
3369  else
3370  ++pub;
3371  } else
3372  ++pub;
3373  }
3374 }
3375 
3376 /*****************************************************************************/
3377 void StripSerialNumbers(TEntryList& seq_entries)
3378 {
3379  if (! SwissProtIDPresent(seq_entries)) {
3380  for (auto& entry : seq_entries) {
3381  for (CTypeIterator<CPubdesc> pubdesc(Begin(*entry)); pubdesc; ++pubdesc) {
3382  if (pubdesc->IsSetPub()) {
3383  RemoveSerials(pubdesc->SetPub().Set());
3384  if (pubdesc->GetPub().Get().empty())
3385  pubdesc->ResetPub();
3386  }
3387  }
3388 
3389  for (CTypeIterator<CSeq_feat> feat(Begin(*entry)); feat; ++feat) {
3390  if (feat->IsSetData()) {
3391  if (feat->GetData().IsPub()) {
3392  RemoveSerials(feat->SetData().SetPub().SetPub().Set());
3393  if (feat->GetData().GetPub().GetPub().Get().empty())
3394  feat->SetData().SetPub().ResetPub();
3395  } else if (feat->GetData().IsImp()) {
3396  CImp_feat& imp = feat->SetData().SetImp();
3397  if (imp.IsSetKey() && imp.GetKey() == "Site-ref" && feat->IsSetCit() && feat->GetCit().IsPub()) {
3398  RemoveSerials(feat->SetCit().SetPub());
3399  if (feat->GetCit().GetPub().empty())
3400  feat->SetCit().Reset();
3401  }
3402  }
3403  }
3404  }
3405  }
3406  }
3407 }
3408 
3409 /*****************************************************************************/
3411 {
3412  const string* seq_str = nullptr;
3413  const vector<Char>* seq_vec = nullptr;
3414 
3416  size_t old_size = 0;
3417 
3418  switch (code) {
3419  case CSeq_data::e_Iupacaa:
3420  seq_str = &seq_data.GetIupacaa().Get();
3421  old_coding = CSeqUtil::e_Iupacaa;
3422  old_size = seq_str->size();
3423  break;
3424 
3425  case CSeq_data::e_Ncbi8aa:
3426  seq_vec = &seq_data.GetNcbi8aa().Get();
3427  old_coding = CSeqUtil::e_Ncbi8aa;
3428  old_size = seq_vec->size();
3429  break;
3430 
3432  seq_vec = &seq_data.GetNcbistdaa().Get();
3433  old_coding = CSeqUtil::e_Ncbistdaa;
3434  old_size = seq_vec->size();
3435  break;
3436 
3437  default:; // do nothing
3438  }
3439 
3440  std::vector<Char> new_seq(old_size);
3441  size_t new_size = 0;
3442  if (seq_str)
3443  new_size = CSeqConvert::Convert(seq_str->c_str(), old_coding, 0, static_cast<TSeqPos>(old_size), &new_seq[0], CSeqUtil::e_Ncbieaa);
3444  else if (seq_vec)
3445  new_size = CSeqConvert::Convert(&(*seq_vec)[0], old_coding, 0, static_cast<TSeqPos>(old_size), &new_seq[0], CSeqUtil::e_Ncbieaa);
3446 
3447  if (! new_seq.empty()) {
3448  seq_data.SetNcbieaa().Set().assign(new_seq.begin(), new_seq.begin() + new_size);
3449  }
3450 }
3451 
3452 /*****************************************************************************/
3453 static void RawBioseqPack(CBioseq& bioseq)
3454 {
3455  if (bioseq.GetInst().IsSetSeq_data()) {
3456  if (! bioseq.GetInst().IsSetMol() || ! bioseq.GetInst().IsNa()) {
3458  PackSeqData(code, bioseq.SetInst().SetSeq_data());
3459  } else if (! bioseq.GetInst().GetSeq_data().IsGap()) {
3460  CSeqportUtil::Pack(&bioseq.SetInst().SetSeq_data());
3461  }
3462  }
3463 }
3464 
3465 static void DeltaBioseqPack(CBioseq& bioseq)
3466 {
3467  if (bioseq.GetInst().IsSetExt() && bioseq.GetInst().GetExt().IsDelta()) {
3468  for (auto& delta : bioseq.SetInst().SetExt().SetDelta().Set()) {
3469  if (delta->IsLiteral() && delta->GetLiteral().IsSetSeq_data() && ! delta->GetLiteral().GetSeq_data().IsGap()) {
3470  CSeqportUtil::Pack(&delta->SetLiteral().SetSeq_data());
3471  }
3472  }
3473  }
3474 }
3475 
3476 /*****************************************************************************/
3477 void PackEntries(TEntryList& seq_entries)
3478 {
3479  for (auto& entry : seq_entries) {
3480  for (CTypeIterator<CBioseq> bioseq(Begin(*entry)); bioseq; ++bioseq) {
3481  if (bioseq->IsSetInst() && bioseq->GetInst().IsSetRepr()) {
3482  CSeq_inst::ERepr repr = bioseq->GetInst().GetRepr();
3483  if (repr == CSeq_inst::eRepr_raw || repr == CSeq_inst::eRepr_const)
3484  RawBioseqPack(*bioseq);
3485  else if (repr == CSeq_inst::eRepr_delta)
3486  DeltaBioseqPack(*bioseq);
3487  }
3488  }
3489  }
3490 }
3491 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
Data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
void xStripECO(string &instr)
Definition: add.cpp:2833
void xFixEMBLKeywords(string &keywordData)
Definition: asci_blk.cpp:1466
static void fta_check_mult_ids(DataBlkPtr dbp, const char *mtag, const char *ptag)
Definition: asci_blk.cpp:356
USING_SCOPE(objects)
static CSeq_inst::EMol SrchSegSeqMol(const TEntryList &entries)
Definition: asci_blk.cpp:2331
void BuildSubBlock(DataBlkPtr dbp, Int2 subtype, const char *subkw)
Definition: asci_blk.cpp:758
vector< string > genbankKeywords
Definition: gb_index.cpp:56
static void CleanUpSeqDescrPub(TEntryList &entries, std::set< CSeqdesc * > &to_clean)
Definition: asci_blk.cpp:1981
static bool CheckSegPub(const CPubdesc &pub, TEntryList &entries, std::set< CSeqdesc * > &same_pub_descr)
Definition: asci_blk.cpp:1892
static bool SwissProtIDPresent(const TEntryList &seq_entries)
Definition: asci_blk.cpp:3331
void GetGenBankSubBlock(const DataBlk &entry, size_t bases)
Definition: asci_blk.cpp:400
void StripSerialNumbers(TEntryList &seq_entries)
Definition: asci_blk.cpp:3377
static void RemoveSerials(TPubList &pubs)
Definition: asci_blk.cpp:3360
vector< string > emblKeywords
Definition: em_index.cpp:56
CRef< CSeq_id > StrToSeqId(const char *pch, bool pid)
Definition: asci_blk.cpp:2660
static void RemoveDescrByChoice(CSeq_descr &descr, Uint1 choice)
Definition: asci_blk.cpp:1946
static void GetSegPub(TEntryList &entries, CSeq_descr &descr)
Definition: asci_blk.cpp:2008
static void InsertDatablkVal(DataBlkPtr *dbp, Int2 type, char *offset, size_t len)
Definition: asci_blk.cpp:177
static const CBioSource * GetTopBiosource(const CSeq_entry &entry)
Definition: asci_blk.cpp:2751
void AddNIDSeqId(CBioseq &bioseq, const DataBlk &entry, Int2 type, Int2 coldata, Parser::ESource source)
Definition: asci_blk.cpp:2692
void fta_fix_orgref_div(const CBioseq::TAnnot &annots, COrg_ref *org_ref, CGB_block &gbb)
Definition: asci_blk.cpp:3241
static void GetGenBankRefType(DataBlkPtr dbp, size_t bases)
Definition: asci_blk.cpp:283
char * GetDescrComment(char *offset, size_t len, Int2 col_data, bool is_htg, bool is_pat)
Definition: asci_blk.cpp:1105
static const char * GetMoleculeClassString(Uint1 mol)
Definition: asci_blk.cpp:2309
void DefVsHTGKeywords(CMolInfo::TTech tech, const DataBlk &entry, Int2 what, Int2 ori, bool cancelled)
Definition: asci_blk.cpp:2787
static bool CheckSegDescrChoice(const TEntryList &entries, Uint1 choice)
Definition: asci_blk.cpp:2037
static void CheckDivCode(TEntryList &seq_entries, ParserPtr pp)
Definition: asci_blk.cpp:2711
vector< string > swissProtKeywords
Definition: sp_index.cpp:54
CRef< CPatent_seq_id > MakeUsptoPatSeqId(const char *acc)
Definition: asci_blk.cpp:830
bool IsSegBioseq(const CSeq_id &id)
Definition: asci_blk.cpp:2506
void xGetGenBankSubBlocks(Entry &entry, size_t bases)
Definition: asci_blk.cpp:439
static void GetBioseqSetDescr(TEntryList &entries, CSeq_descr &descr, bool *drop)
Definition: asci_blk.cpp:2296
void fta_sort_seqfeat_cit(TEntryList &seq_entries)
Definition: asci_blk.cpp:3213
void PackEntries(TEntryList &seq_entries)
Definition: asci_blk.cpp:3477
static Int4 SrchSegLength(const TEntryList &entries)
Definition: asci_blk.cpp:2356
void fta_set_strandedness(TEntryList &seq_entries)
Definition: asci_blk.cpp:3312
static void fta_fix_tpa_keywords(TKeywordList &keywords)
Definition: asci_blk.cpp:1436
static void fta_fix_secondaries(TokenBlkPtr secs)
Definition: asci_blk.cpp:1218
void CheckHTGDivision(const char *div, CMolInfo::TTech tech)
Definition: asci_blk.cpp:2917
unique_ptr< unsigned char[]> GetDNAConv(void)
Definition: asci_blk.cpp:1744
bool XMLCheckCDS(const char *entry, XmlIndexPtr xip)
Definition: asci_blk.cpp:3281
unique_ptr< unsigned char[]> GetProteinConv(void)
Definition: asci_blk.cpp:1772
static bool fta_if_pubs_sorted(const CPub_equiv &pub1, const CPub_equiv &pub2)
Definition: asci_blk.cpp:3120
static void GetFirstSegDescrChoice(CBioseq &bioseq, Uint1 choice, CSeq_descr &descr_new)
Definition: asci_blk.cpp:1816
static bool SeqEntryCheckTaxonDiv(const CSeq_entry &entry)
Definition: asci_blk.cpp:2763
bool fta_orgref_has_taxid(const COrg_ref::TDb &dbtags)
Definition: asci_blk.cpp:3229
static void CheckGBBlock(TSeqdescList &descrs, bool &got)
Definition: asci_blk.cpp:2999
static void CleanUpSeqDescrChoice(TEntryList &entries, Uint1 choice)
Definition: asci_blk.cpp:1965
static void CleanVisString(string &str)
Definition: asci_blk.cpp:2938
static void CleanVisStringList(list< string > &str_list)
Definition: asci_blk.cpp:2986
static bool pub_cmp(const CRef< CPub > &pub1, const CRef< CPub > &pub2)
Definition: asci_blk.cpp:3172
static void SrchSegDescr(TEntryList &entries, CSeq_descr &descr)
Definition: asci_blk.cpp:2146
void GetSequenceOfKeywords(const DataBlk &entry, int type, int col_data, TKeywordList &keywords)
Definition: asci_blk.cpp:1505
void EntryCheckDivCode(TEntryList &seq_entries, ParserPtr pp)
Definition: asci_blk.cpp:2776
void GetEmblSubBlock(size_t bases, Parser::ESource source, const DataBlk &entry)
Definition: asci_blk.cpp:686
char * GetEmblBlock(DataBlkPtr *chain, char *ptr, short *retkw, Parser::EFormat format, char *eptr)
Definition: asci_blk.cpp:491
const char * magic_phrases[]
Definition: asci_blk.cpp:104
static void PackSeqData(CSeq_data::E_Choice code, CSeq_data &seq_data)
Definition: asci_blk.cpp:3410
static bool IsCitEmpty(const CCit_gen &cit)
Definition: asci_blk.cpp:3348
static Uint1 ValidSeqType(const char *accession, Uint1 type)
Definition: asci_blk.cpp:864
static bool descr_cmp(const CRef< CSeqdesc > &desc1, const CRef< CSeqdesc > &desc2)
Definition: asci_blk.cpp:3129
char * GetGenBankBlock(DataBlkPtr *chain, char *ptr, Int2 *retkw, char *eptr)
Definition: asci_blk.cpp:230
void GetSeqExt(ParserPtr pp, CSeq_loc &seq_loc)
Definition: asci_blk.cpp:2439
static bool GetSubNodeType(const char *subkw, char **retbptr, char *eptr)
Definition: asci_blk.cpp:594
static CRef< CBioseq > GetBioseq(ParserPtr pp, const TEntryList &entries, const CSeq_loc &slp)
Definition: asci_blk.cpp:2375
static void GetEmblRefType(size_t bases, Parser::ESource source, DataBlkPtr dbp)
Definition: asci_blk.cpp:630
CRef< CSeq_id > MakeAccSeqId(const char *acc, Uint1 seqtype, bool accver, Int2 vernum)
Definition: asci_blk.cpp:906
bool GetSeqData(ParserPtr pp, const DataBlk &entry, CBioseq &bioseq, Int4 nodetype, unsigned char *seqconv, Uint1 seq_data_type)
Definition: asci_blk.cpp:1632
void GetLenSubNode(DataBlkPtr dbp)
Definition: asci_blk.cpp:781
bool fta_EntryCheckGBBlock(TEntryList &seq_entries)
Definition: asci_blk.cpp:3084
char * SrchNodeSubType(const DataBlk &entry, Int2 type, Int2 subtype, size_t *len)
Definition: asci_blk.cpp:985
void xGetGenBankBlocks(Entry &entry)
Definition: asci_blk.cpp:202
static void sort_feat_cit(CBioseq::TAnnot &annots)
Definition: asci_blk.cpp:3190
static char * GetBioseqSetDescrTitle(const CSeq_descr &descr)
Definition: asci_blk.cpp:2095
static void SetEmptyId(CBioseq &bioseq)
Definition: asci_blk.cpp:1008
CRef< CSeq_id > MakeLocusSeqId(const char *locus, CSeq_id::E_Choice seqtype)
Definition: asci_blk.cpp:936
void fta_sort_descr(TEntryList &seq_entries)
Definition: asci_blk.cpp:3156
static void DeltaBioseqPack(CBioseq &bioseq)
Definition: asci_blk.cpp:3465
void XMLDefVsHTGKeywords(CMolInfo::TTech tech, const char *entry, XmlIndexPtr xip, bool cancelled)
Definition: asci_blk.cpp:2860
void BuildBioSegHeader(ParserPtr pp, TEntryList &entries, const CSeq_loc &seqloc)
Definition: asci_blk.cpp:2464
static CRef< CSeq_id > MakeSegSetSeqId(const char *accession, const string &locus, Uint1 seqtype, bool is_tpa)
Definition: asci_blk.cpp:954
static int GetSerialNumFromPubEquiv(const CPub_equiv &pub_eq)
Definition: asci_blk.cpp:3104
void GetExtraAccession(IndexblkPtr ibp, bool allow_uwsec, Parser::ESource source, TAccessionList &accessions)
Definition: asci_blk.cpp:1274
void ShrinkSpaces(char *line)
Definition: asci_blk.cpp:118
static void RawBioseqPack(CBioseq &bioseq)
Definition: asci_blk.cpp:3453
static bool TrimEmblFeatBlk(DataBlkPtr dbp)
Definition: asci_blk.cpp:550
static void GetSegSetDblink(CSeq_descr &descr, TEntryList &entries, bool *drop)
Definition: asci_blk.cpp:2176
static CSeq_descr::Tdata::const_iterator GetDescrByChoice(const CSeq_descr &descr, Uint1 choice)
Definition: asci_blk.cpp:1793
static bool SameCitation_PubEquivMatch_Logic(const CPub_equiv &a, const CPub_equiv &b)
Definition: asci_blk.cpp:1837
bool check_div(bool pat_acc, bool pat_ref, bool est_kwd, bool sts_kwd, bool gss_kwd, bool if_cds, string &div, CMolInfo::TTech *tech, size_t bases, Parser::ESource source, bool &drop)
Definition: asci_blk.cpp:2535
CRef< CBioseq > CreateEntryBioseq(ParserPtr pp)
Definition: asci_blk.cpp:1020
const CSeq_descr & GetDescrPointer(const CSeq_entry &entry)
Definition: asci_blk.cpp:2929
static void BuildFeatureBlock(DataBlkPtr dbp)
Definition: asci_blk.cpp:323
Int4 ScanSequence(bool warn, char **seqptr, std::vector< char > &bsp, unsigned char *conv, Char replacechar, int *numns)
Definition: asci_blk.cpp:1576
const char * GetEmblDiv(Uint1 num)
Definition: em_ascii.cpp:2419
static TDSICONV * conv
Definition: charconv.c:168
TSeqPos GetLength(void) const
Definition: Bioseq.cpp:360
Definition: Date.hpp:53
ECompare Compare(const CDate &date) const
Definition: Date.cpp:83
@ eCompare_same
They're equivalent.
Definition: Date.hpp:75
Definition: Dbtag.hpp:53
@Imp_feat.hpp User-defined methods of the data storage class.
Definition: Imp_feat.hpp:54
void SetId8(TId8 value)
Definition: Object_id.cpp:175
Definition: Pub.hpp:56
@Pubdesc.hpp User-defined methods of the data storage class.
Definition: Pubdesc.hpp:54
CRef –.
Definition: ncbiobj.hpp:618
static SIZE_TYPE Convert(const CTempString &src, TCoding src_coding, TSeqPos pos, TSeqPos length, string &dst, TCoding dst_coding)
@ e_Ncbieaa
Definition: sequtil.hpp:57
@ e_not_set
Definition: sequtil.hpp:44
@ e_Ncbi8aa
Definition: sequtil.hpp:56
@ e_Ncbistdaa
Definition: sequtil.hpp:58
@ e_Iupacaa
Definition: sequtil.hpp:55
@Seq_descr.hpp User-defined methods of the data storage class.
Definition: Seq_descr.hpp:55
Definition: Seq_entry.hpp:56
static bool IsNa(EMol mol)
Definition: Seq_inst.hpp:90
Seq-loc iterator class – iterates all intervals from a seq-loc in the correct order.
Definition: Seq_loc.hpp:453
static TPair GetCodeIndexFromTo(CSeq_data::E_Choice code_type)
unsigned int TIndex
static const string & GetCode(CSeq_data::E_Choice code_type, TIndex idx)
pair< TIndex, TIndex > TPair
static TSeqPos Pack(CSeq_data *in_seq, TSeqPos uLength=ncbi::numeric_limits< TSeqPos >::max())
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
Template class for iteration on objects of class C (non-medifiable version)
Definition: iterator.hpp:767
char * mOffset
Definition: ftablock.h:332
size_t len
Definition: ftablock.h:333
CFlatFileData * mpData
Definition: ftablock.h:331
DataBlk * mpNext
Definition: ftablock.h:336
int mType
Definition: ftablock.h:330
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:51
@ ParFlat_OC
Definition: embl.h:61
@ ParFlat_RG
Definition: embl.h:66
@ ParFlat_RL
Definition: embl.h:69
@ ParFlat_RP
Definition: embl.h:64
@ ParFlat_RT
Definition: embl.h:68
@ ParFlat_OS
Definition: embl.h:48
@ ParFlat_OG
Definition: embl.h:62
@ ParFlat_RN
Definition: embl.h:49
@ ParFlat_RX
Definition: embl.h:65
@ ParFlat_RA
Definition: embl.h:67
@ ParFlat_RC
Definition: embl.h:63
@ ParFlat_FH
Definition: embl.h:52
#define ERR_DIVISION_NotMappedtoEST
Definition: flat2err.h:237
#define ERR_ACCESSION_UnusualWGS_Secondary
Definition: flat2err.h:175
#define ERR_DIVISION_ShouldBePAT
Definition: flat2err.h:256
#define ERR_DIVISION_MappedtoPAT
Definition: flat2err.h:224
#define ERR_DIVISION_MappedtoSTS
Definition: flat2err.h:225
#define ERR_SEQUENCE_TooShort
Definition: flat2err.h:155
#define ERR_SEQUENCE_TooShortIsPatent
Definition: flat2err.h:157
#define ERR_SEQUENCE_UnknownBaseHTG3
Definition: flat2err.h:147
#define ERR_DIVISION_LongGSSSequence
Definition: flat2err.h:246
#define ERR_SEGMENT_GPIDMissingOrNonUnique
Definition: flat2err.h:167
#define ERR_DIVISION_ESTHasCDSFeature
Definition: flat2err.h:236
#define ERR_DIVISION_PATHasGSSKeywords
Definition: flat2err.h:243
#define ERR_REFERENCE_MultipleIdentifiers
Definition: flat2err.h:313
#define ERR_DIVISION_MissingSTSKeywords
Definition: flat2err.h:228
#define ERR_DIVISION_MissingPatentRef
Definition: flat2err.h:229
#define ERR_SEQUENCE_BadResidue
Definition: flat2err.h:149
#define ERR_DIVISION_PATHasESTKeywords
Definition: flat2err.h:230
#define ERR_ACCESSION_ScfldHasWGSContigSec
Definition: flat2err.h:176
#define ERR_SEGMENT_PubMatch
Definition: flat2err.h:164
#define ERR_FORMAT_LineTypeOrder
Definition: flat2err.h:40
#define ERR_SEGMENT_DBLinkMissingOrNonUnique
Definition: flat2err.h:168
#define ERR_DIVISION_MappedtoGSS
Definition: flat2err.h:242
#define ERR_DIVISION_GSSHasCDSFeature
Definition: flat2err.h:240
#define ERR_DIVISION_MappedtoEST
Definition: flat2err.h:223
#define ERR_DEFINITION_HTGNotInProgress
Definition: flat2err.h:265
#define ERR_ACCESSION_WGSMasterAsSecondary
Definition: flat2err.h:174
#define ERR_DIVISION_STSHasCDSFeature
Definition: flat2err.h:233
#define ERR_FEATURE_NoFeatData
Definition: flat2err.h:325
#define ERR_SEGMENT_DiffMolType
Definition: flat2err.h:163
#define ERR_DIVISION_ShouldBeHTG
Definition: flat2err.h:238
#define ERR_DIVISION_MissingESTKeywords
Definition: flat2err.h:227
#define ERR_DIVISION_NotMappedtoGSS
Definition: flat2err.h:241
#define ERR_SEQUENCE_SeqLenNotEq
Definition: flat2err.h:148
#define ERR_DIVISION_PATHasCDSFeature
Definition: flat2err.h:232
#define ERR_DIVISION_MissingGSSKeywords
Definition: flat2err.h:239
#define ERR_DIVISION_NotMappedtoSTS
Definition: flat2err.h:234
#define ERR_DIVISION_LongSTSSequence
Definition: flat2err.h:245
#define ERR_DIVISION_GBBlockDivision
Definition: flat2err.h:247
#define ERR_SEQUENCE_AllNs
Definition: flat2err.h:156
#define ERR_ACCESSION_WGSWithNonWGS_Sec
Definition: flat2err.h:173
#define ERR_DIVISION_PATHasSTSKeywords
Definition: flat2err.h:231
#define ERR_DIVISION_LongESTSequence
Definition: flat2err.h:244
#define ERR_DEFINITION_HTGShouldBeComplete
Definition: flat2err.h:267
#define ERR_DIVISION_ESTHasSTSKeywords
Definition: flat2err.h:235
#define ERR_DIVISION_ShouldNotBeHTG
Definition: flat2err.h:250
list< CRef< objects::CSeq_entry > > TEntryList
#define INSDSEQ_DEFINITION
Definition: fta_xml.h:52
char * XMLFindTagValue(const char *entry, const XmlIndex *xip, Int4 tag)
Definition: xm_index.cpp:213
#define INSDSEQ_FEATURE_TABLE
Definition: fta_xml.h:68
#define INSDFEATURE_KEY
Definition: fta_xml.h:77
#define INSDSEQ_SEQUENCE
Definition: fta_xml.h:69
std::list< std::string > TKeywordList
Definition: ftablock.h:166
std::list< CRef< objects::CPub > > TPubList
Definition: ftablock.h:62
std::list< CRef< objects::CSeqdesc > > TSeqdescList
Definition: ftablock.h:60
std::list< std::string > TAccessionList
Definition: ftablock.h:56
void MemSet(void *p, int n, size_t sz)
Definition: ftacpp.hpp:49
char * StringSave(const char *s)
Definition: ftacpp.hpp:61
bool StringEquNI(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:116
bool StringEquN(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:106
bool StringEqu(const char *s1, const char *s2)
Definition: ftacpp.hpp:96
void StringNCpy(char *d, const char *s, size_t n)
Definition: ftacpp.hpp:75
void MemFree(char *p)
Definition: ftacpp.hpp:55
size_t StringLen(const char *s)
Definition: ftacpp.hpp:60
void MemCpy(void *p, const void *q, size_t sz)
Definition: ftacpp.hpp:50
char * StringNew(size_t sz)
Definition: ftacpp.hpp:43
@ ParFlat_FEATBLOCK
Definition: genbank.h:72
@ ParFlat_AUTHORS
Definition: genbank.h:67
@ ParFlat_FEATURES
Definition: genbank.h:51
@ ParFlat_SOURCE
Definition: genbank.h:48
@ ParFlat_JOURNAL
Definition: genbank.h:70
@ ParFlat_STANDARD
Definition: genbank.h:71
@ ParFlat_REFERENCE
Definition: genbank.h:49
@ ParFlat_LOCUS
Definition: genbank.h:41
@ ParFlat_CONSRTM
Definition: genbank.h:68
@ ParFlat_END
Definition: genbank.h:54
@ ParFlat_ORGANISM
Definition: genbank.h:66
@ ParFlat_REMARK
Definition: genbank.h:74
@ ParFlat_MEDLINE
Definition: genbank.h:73
@ ParFlat_TITLE
Definition: genbank.h:69
@ ParFlat_PUBMED
Definition: genbank.h:75
static int type
Definition: getdata.c:31
#define SEV_INFO
Definition: gicache.c:89
#define SEV_WARNING
Definition: gicache.c:90
#define SEV_ERROR
Definition: gicache.c:91
#define SEV_REJECT
Definition: gicache.c:92
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
Definition: ncbimisc.hpp:1508
#define StringStr
Definition: ncbistr.hpp:322
#define ErrPostStr
Definition: ncbierr.hpp:68
#define StringChr
Definition: ncbistr.hpp:317
#define ErrPostEx(sev, err_code,...)
Definition: ncbierr.hpp:78
TPrim & Set(void)
Definition: serialbase.hpp:351
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
const TPrim & Get(void) const
Definition: serialbase.hpp:347
virtual bool Equals(const CSerialObject &object, ESerialRecursionMode how=eRecursive) const
Check if both objects contain the same values.
static EAccessionInfo IdentifyAccession(const CTempString &accession, TParseFlags flags=fParse_AnyRaw)
Deduces information from a bare accession a la WHICH_db_accession; may report false negatives on prop...
Definition: Seq_id.cpp:1634
static E_Choice GetAccType(EAccessionInfo info)
Definition: Seq_id.hpp:530
void SetWhole(TWhole &v)
Definition: Seq_loc.hpp:982
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Override Assign() to incorporate cache invalidation.
Definition: Seq_loc.cpp:337
const_iterator end(void) const
Definition: Seq_loc.cpp:1034
const_iterator begin(void) const
Definition: Seq_loc.cpp:1028
void Add(const CSeq_loc &other)
Simple adding of seq-locs.
Definition: Seq_loc.cpp:3875
void SetNull(void)
Override all setters to incorporate cache invalidation.
Definition: Seq_loc.hpp:960
CBeginInfo Begin(C &obj)
Get starting point of object hierarchy.
Definition: iterator.hpp:1004
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
Definition: ncbiobj.hpp:2015
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
char Char
Alias for char.
Definition: ncbitype.h:93
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3457
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2989
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
#define NPOS
Definition: ncbistr.hpp:133
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
Definition: ncbistr.cpp:3197
static SIZE_TYPE CommonSuffixSize(const CTempString s1, const CTempString s2)
Determine the common suffix of two strings.
Definition: ncbistr.hpp:5462
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2887
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
static void TrimSuffixInPlace(string &str, const CTempString suffix, ECase use_case=eCase)
Trim suffix from a string (in-place)
Definition: ncbistr.cpp:3274
void SetSource(const TSource &value)
Assign a value to Source data member.
Definition: GB_block_.hpp:488
TKeywords & SetKeywords(void)
Assign a value to Keywords data member.
Definition: GB_block_.hpp:532
bool IsSetExtra_accessions(void) const
Check if a value has been assigned to Extra_accessions data member.
Definition: GB_block_.hpp:442
void ResetKeywords(void)
Reset Keywords data member.
Definition: GB_block_.cpp:63
void ResetOrigin(void)
Reset Origin data member.
Definition: GB_block_.cpp:69
bool IsSetDiv(void) const
GenBank division Check if a value has been assigned to Div data member.
Definition: GB_block_.hpp:654
void ResetSource(void)
Reset Source data member.
Definition: GB_block_.cpp:57
void ResetDate(void)
Reset Date data member.
Definition: GB_block_.cpp:75
bool IsSetSource(void) const
source line Check if a value has been assigned to Source data member.
Definition: GB_block_.hpp:467
void SetDate(const TDate &value)
Assign a value to Date data member.
Definition: GB_block_.hpp:607
bool IsSetEntry_date(void) const
replaces date Check if a value has been assigned to Entry_date data member.
Definition: GB_block_.hpp:633
const TDiv & GetDiv(void) const
Get the Div member data.
Definition: GB_block_.hpp:666
TExtra_accessions & SetExtra_accessions(void)
Assign a value to Extra_accessions data member.
Definition: GB_block_.hpp:460
const TExtra_accessions & GetExtra_accessions(void) const
Get the Extra_accessions member data.
Definition: GB_block_.hpp:454
const TKeywords & GetKeywords(void) const
Get the Keywords member data.
Definition: GB_block_.hpp:526
bool IsSetOrigin(void) const
Check if a value has been assigned to Origin data member.
Definition: GB_block_.hpp:539
void SetDiv(const TDiv &value)
Assign a value to Div data member.
Definition: GB_block_.hpp:675
bool IsSetKeywords(void) const
Check if a value has been assigned to Keywords data member.
Definition: GB_block_.hpp:514
void ResetExtra_accessions(void)
Reset Extra_accessions data member.
Definition: GB_block_.cpp:51
void ResetDiv(void)
Reset Div data member.
Definition: GB_block_.cpp:98
bool IsSetDate(void) const
OBSOLETE old form Entry Date Check if a value has been assigned to Date data member.
Definition: GB_block_.hpp:586
void SetOrigin(const TOrigin &value)
Assign a value to Origin data member.
Definition: GB_block_.hpp:560
bool IsSetPages(void) const
Check if a value has been assigned to Pages data member.
Definition: Cit_gen_.hpp:806
bool IsSetDate(void) const
Check if a value has been assigned to Date data member.
Definition: Cit_gen_.hpp:853
TSerial_number GetSerial_number(void) const
Get the Serial_number member data.
Definition: Cit_gen_.hpp:893
bool IsSetAuthors(void) const
Check if a value has been assigned to Authors data member.
Definition: Cit_gen_.hpp:623
bool IsSetVolume(void) const
Check if a value has been assigned to Volume data member.
Definition: Cit_gen_.hpp:712
const TCit & GetCit(void) const
Get the Cit member data.
Definition: Cit_gen_.hpp:588
bool IsSetSerial_number(void) const
For GenBank style references. Check if a value has been assigned to Serial_number data member.
Definition: Cit_gen_.hpp:874
bool IsSetCit(void) const
Anything, not parsable. Check if a value has been assigned to Cit data member.
Definition: Cit_gen_.hpp:576
bool IsSetTitle(void) const
Check if a value has been assigned to Title data member.
Definition: Cit_gen_.hpp:921
bool IsSetJournal(void) const
Check if a value has been assigned to Journal data member.
Definition: Cit_gen_.hpp:691
bool IsSetPmid(void) const
PubMed Id. Check if a value has been assigned to Pmid data member.
Definition: Cit_gen_.hpp:968
bool IsSetIssue(void) const
Check if a value has been assigned to Issue data member.
Definition: Cit_gen_.hpp:759
bool IsSetMuid(void) const
Medline uid. Check if a value has been assigned to Muid data member.
Definition: Cit_gen_.hpp:644
bool IsSetOrg(void) const
Check if a value has been assigned to Org data member.
Definition: BioSource_.hpp:497
const TOrg & GetOrg(void) const
Get the Org member data.
Definition: BioSource_.hpp:509
bool IsStr(void) const
Check if variant Str is selected.
Definition: Object_id_.hpp:291
bool CanGetType(void) const
Check if it is safe to call GetType method.
bool IsSetType(void) const
Type of object within class. Check if a value has been assigned to Type data member.
void SetTag(TTag &value)
Assign a value to Tag data member.
Definition: Dbtag_.cpp:66
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
const TType & GetType(void) const
Get the Type member data.
E_Choice Which(void) const
Which variant is currently selected.
Definition: Date_.hpp:271
void SetDb(const TDb &value)
Assign a value to Db data member.
Definition: Dbtag_.hpp:229
@ eLim_gt
greater than
Definition: Int_fuzz_.hpp:211
@ e_not_set
No variant selected.
Definition: Date_.hpp:127
vector< CRef< CDbtag > > TDb
Definition: Org_ref_.hpp:101
const TDb & GetDb(void) const
Get the Db member data.
Definition: Org_ref_.hpp:491
bool IsSetDiv(void) const
GenBank division code. Check if a value has been assigned to Div data member.
Definition: OrgName_.hpp:993
bool IsSetOrgname(void) const
Check if a value has been assigned to Orgname data member.
Definition: Org_ref_.hpp:529
void SetOrgname(TOrgname &value)
Assign a value to Orgname data member.
Definition: Org_ref_.cpp:87
const TOrgname & GetOrgname(void) const
Get the Orgname member data.
Definition: Org_ref_.hpp:541
bool IsSet(void) const
Check if a value has been assigned to data member.
Definition: Pub_equiv_.hpp:153
const Tdata & Get(void) const
Get the member data.
Definition: Pub_equiv_.hpp:165
E_Choice Which(void) const
Which variant is currently selected.
Definition: Pub_.hpp:555
const TGen & GetGen(void) const
Get the variant data.
Definition: Pub_.cpp:167
TMuid GetMuid(void) const
Get the variant data.
Definition: Pub_.hpp:608
bool IsMuid(void) const
Check if variant Muid is selected.
Definition: Pub_.hpp:602
bool IsGen(void) const
Check if variant Gen is selected.
Definition: Pub_.hpp:584
@ e_Gen
general or generic unparsed
Definition: Pub_.hpp:102
@ eSeq_code_type_iupacaa
IUPAC 1 letter amino acid code.
@ eSeq_code_type_iupacna
IUPAC 1 letter nuc acid code.
const TKey & GetKey(void) const
Get the Key member data.
Definition: Imp_feat_.hpp:259
bool IsSetKey(void) const
Check if a value has been assigned to Key data member.
Definition: Imp_feat_.hpp:247
bool IsSetAccession(void) const
Check if a value has been assigned to Accession data member.
const TName & GetName(void) const
Get the Name member data.
E_Choice
Choice variants.
Definition: Seq_id_.hpp:93
TLocal & SetLocal(void)
Select the variant.
Definition: Seq_id_.cpp:199
bool IsSetName(void) const
Check if a value has been assigned to Name data member.
@ e_Other
for historical reasons, 'other' = 'refseq'
Definition: Seq_id_.hpp:104
@ e_Tpe
Third Party Annot/Seq EMBL.
Definition: Seq_id_.hpp:111
@ e_Tpd
Third Party Annot/Seq DDBJ.
Definition: Seq_id_.hpp:112
@ e_Ddbj
DDBJ.
Definition: Seq_id_.hpp:107
@ e_Prf
PRF SEQDB.
Definition: Seq_id_.hpp:108
@ e_not_set
No variant selected.
Definition: Seq_id_.hpp:94
@ e_Tpg
Third Party Annot/Seq Genbank.
Definition: Seq_id_.hpp:110
@ e_Local
local use
Definition: Seq_id_.hpp:95
@ e_Pdb
PDB sequence.
Definition: Seq_id_.hpp:109
const TSeq & GetSeq(void) const
Get the variant data.
Definition: Seq_entry_.cpp:102
const TDescr & GetDescr(void) const
Get the Descr member data.
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
const TSet & GetSet(void) const
Get the variant data.
Definition: Seq_entry_.cpp:124
bool IsSeq(void) const
Check if variant Seq is selected.
Definition: Seq_entry_.hpp:257
void SetClass(TClass value)
Assign a value to Class data member.
void SetDescr(TDescr &value)
Assign a value to Descr data member.
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
TSeq_set & SetSeq_set(void)
Assign a value to Seq_set data member.
@ eClass_parts
parts for 2 or 3
@ eClass_segset
segmented sequence + parts
const TIupacaa & GetIupacaa(void) const
Get the variant data.
Definition: Seq_data_.hpp:530
list< CRef< CSeqdesc > > Tdata
Definition: Seq_descr_.hpp:91
TId & SetId(void)
Assign a value to Id data member.
Definition: Bioseq_.hpp:296
const TUser & GetUser(void) const
Get the variant data.
Definition: Seqdesc_.cpp:384
bool IsSetSeq_data(void) const
The sequence. Check if a value has been assigned to Seq_data data member.
Definition: Seq_inst_.hpp:805
ERepr
representation class
Definition: Seq_inst_.hpp:91
void SetPub(TPub &value)
Assign a value to Pub data member.
Definition: Pubdesc_.cpp:72
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
TTitle & SetTitle(void)
Select the variant.
Definition: Seqdesc_.hpp:1039
void SetExt(TExt &value)
Assign a value to Ext data member.
Definition: Seq_inst_.cpp:147
bool IsSetMol(void) const
Check if a value has been assigned to Mol data member.
Definition: Seq_inst_.hpp:593
const TPub & GetPub(void) const
Get the variant data.
Definition: Seqdesc_.cpp:356
const TNcbi8aa & GetNcbi8aa(void) const
Get the variant data.
Definition: Seq_data_.hpp:630
TNcbieaa & SetNcbieaa(void)
Select the variant.
Definition: Seq_data_.hpp:657
E_Choice
Choice variants.
Definition: Seq_data_.hpp:102
TTech GetTech(void) const
Get the Tech member data.
Definition: MolInfo_.hpp:497
bool IsSetExt(void) const
Extensions for special types. Check if a value has been assigned to Ext data member.
Definition: Seq_inst_.hpp:826
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
TMol GetMol(void) const
Get the Mol member data.
Definition: Seq_inst_.hpp:612
bool IsDelta(void) const
Check if variant Delta is selected.
Definition: Seq_ext_.hpp:336
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
const TNcbistdaa & GetNcbistdaa(void) const
Get the variant data.
Definition: Seq_data_.hpp:690
const TExt & GetExt(void) const
Get the Ext member data.
Definition: Seq_inst_.hpp:838
EMol
molecule class in living organism
Definition: Seq_inst_.hpp:108
bool IsPub(void) const
Check if variant Pub is selected.
Definition: Seqdesc_.hpp:1096
void SetDescr(TDescr &value)
Assign a value to Descr data member.
Definition: Bioseq_.cpp:65
bool IsSetPub(void) const
The citation(s). Check if a value has been assigned to Pub data member.
Definition: Pubdesc_.hpp:593
void SetRepr(TRepr value)
Assign a value to Repr data member.
Definition: Seq_inst_.hpp:574
list< CRef< CSeq_feat > > TFtable
Definition: Seq_annot_.hpp:193
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seqdesc_.hpp:903
void SetFuzz(TFuzz &value)
Assign a value to Fuzz data member.
Definition: Seq_inst_.cpp:113
Tdata & Set(void)
Assign a value to data member.
Definition: Seq_descr_.hpp:172
list< CRef< CSeq_annot > > TAnnot
Definition: Bioseq_.hpp:97
void SetLength(TLength value)
Assign a value to Length data member.
Definition: Seq_inst_.hpp:668
bool IsGap(void) const
Check if variant Gap is selected.
Definition: Seq_data_.hpp:704
const TPub & GetPub(void) const
Get the Pub member data.
Definition: Pubdesc_.hpp:605
const TSeq_data & GetSeq_data(void) const
Get the Seq_data member data.
Definition: Seq_inst_.hpp:817
const TDescr & GetDescr(void) const
Get the Descr member data.
Definition: Bioseq_.hpp:315
void SetMol(TMol value)
Assign a value to Mol data member.
Definition: Seq_inst_.hpp:621
bool IsUser(void) const
Check if variant User is selected.
Definition: Seqdesc_.hpp:1122
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_data_.hpp:475
@ eRepr_const
constructed sequence
Definition: Seq_inst_.hpp:96
@ eRepr_seg
segmented sequence
Definition: Seq_inst_.hpp:95
@ eRepr_delta
sequence made by changes (delta) to others
Definition: Seq_inst_.hpp:100
@ eRepr_raw
continuous sequence
Definition: Seq_inst_.hpp:94
@ eTech_htgs_2
ordered High Throughput sequence contig
Definition: MolInfo_.hpp:138
@ eTech_sts
Sequence Tagged Site.
Definition: MolInfo_.hpp:126
@ eTech_htgs_3
finished High Throughput sequence
Definition: MolInfo_.hpp:139
@ eTech_htgs_1
unordered High Throughput sequence contig
Definition: MolInfo_.hpp:137
@ eTech_tsa
transcriptome shotgun assembly
Definition: MolInfo_.hpp:146
@ eTech_survey
one-pass genomic sequence
Definition: MolInfo_.hpp:127
@ eTech_htgs_0
single genomic reads for coordination
Definition: MolInfo_.hpp:141
@ eTech_est
Expressed Sequence Tag.
Definition: MolInfo_.hpp:125
@ e_Ncbistdaa
consecutive codes for std aas
Definition: Seq_data_.hpp:113
@ e_Iupacaa
IUPAC 1 letter amino acid code.
Definition: Seq_data_.hpp:105
@ e_Ncbi8aa
8 bit extended amino acid codes
Definition: Seq_data_.hpp:110
@ e_Org
if all from one organism
Definition: Seqdesc_.hpp:116
@ e_Update_date
date of last update
Definition: Seqdesc_.hpp:129
@ e_Modif
modifiers
Definition: Seqdesc_.hpp:112
@ eMol_na
just a nucleic acid
Definition: Seq_inst_.hpp:113
@ eStrand_ds
double strand
Definition: Seq_inst_.hpp:136
@ eStrand_ss
single strand
Definition: Seq_inst_.hpp:135
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
@ ParFlat_REF_BTW
Definition: index.h:61
@ ParFlat_REF_NO_TARGET
Definition: index.h:63
@ ParFlat_COL_FEATKEY
Definition: index.h:65
@ ParFlat_REF_SITES
Definition: index.h:62
@ ParFlat_REF_END
Definition: index.h:60
CSeq_id::E_Choice GetNucAccOwner(const CTempString &acc)
Definition: indx_blk.cpp:2271
int fta_if_wgs_acc(const CTempString &accession)
Definition: indx_blk.cpp:1193
void DelNoneDigitTail(char *str)
Definition: indx_blk.cpp:946
char * buf
int i
yy_size_t n
int len
static void text(MDB_val *v)
Definition: mdb_dump.c:62
range(_Ty, _Ty) -> range< _Ty >
const struct ncbi::grid::netcache::search::fields::SIZE size
const struct ncbi::grid::netcache::search::fields::KEY key
const CharType(& source)[N]
Definition: pointer.h:1149
int strcmp(const char *str1, const char *str2)
Definition: odbc_utils.hpp:160
unsigned int a
Definition: ncbi_localip.c:102
const char * tag
int isalpha(Uchar c)
Definition: ncbictype.hpp:61
int tolower(Uchar c)
Definition: ncbictype.hpp:72
int isdigit(Uchar c)
Definition: ncbictype.hpp:64
static Format format
Definition: njn_ioutil.cpp:53
Int4 delta(size_t dimension_, const Int4 *score_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
static char tmp[2048]
Definition: utf8.c:42
int offset
Definition: replacements.h:160
@ ParFlatSP_KW
Definition: sprot.h:52
static const char * str(char *buf, int n)
Definition: stats.c:84
DataBlkPtr chain
Definition: ftablock.h:344
Definition: entry.h:57
list< SectionPtr > mSections
Definition: entry.h:99
string mBaseData
Definition: entry.h:98
Char acnum[200]
Definition: ftablock.h:169
Char division[4]
Definition: ftablock.h:174
bool is_mga
Definition: ftablock.h:202
Char blocusname[200]
Definition: ftablock.h:181
Int2 vernum
Definition: ftablock.h:170
bool is_tpa
Definition: ftablock.h:209
bool embl_new_ID
Definition: ftablock.h:221
bool is_prot
Definition: ftablock.h:225
bool is_contig
Definition: ftablock.h:200
bool is_pat
Definition: ftablock.h:205
bool drop
Definition: ftablock.h:185
size_t bases
Definition: ftablock.h:175
string wgssec
Definition: ftablock.h:239
TokenBlkPtr secaccs
Definition: ftablock.h:219
Char locusname[200]
Definition: ftablock.h:173
XmlIndexPtr xip
Definition: ftablock.h:220
vector< IndexblkPtr > entrylist
Definition: entry.h:13
TokenBlk * next
Definition: ftablock.h:135
char * str
Definition: ftablock.h:134
size_t start
Definition: ftablock.h:155
XmlIndex * next
Definition: ftablock.h:161
XmlIndex * subtags
Definition: ftablock.h:160
size_t end
Definition: ftablock.h:156
Int4 tag
Definition: ftablock.h:153
Definition: inftrees.h:24
Definition: type.c:6
done
Definition: token1.c:1
int SrchKeyword(const CTempString &ptr, const vector< string > &keywordList)
Definition: utilfun.cpp:1042
char * SrchTheChar(char *bptr, char *eptr, Char letter)
Definition: utilfun.cpp:903
bool SetTextId(Uint1 seqtype, CSeq_id &seqId, CTextseq_id &textId)
Definition: utilfun.cpp:1652
bool fta_is_tpa_keyword(const char *str)
Definition: utilfun.cpp:1315
void xGetBlkDataReplaceNewLine(string &instr, int indent)
Definition: utilfun.cpp:784
char * xSrchNodeType(const DataBlk &entry, Int4 type, size_t *len)
Definition: utilfun.cpp:1108
string xGetNodeData(const DataBlk &entry, int nodeType)
Definition: utilfun.cpp:1122
void fta_StringCpy(char *dst, const char *src)
Definition: utilfun.cpp:1641
DataBlkPtr TrackNodeType(const DataBlk &entry, Int2 type)
Definition: utilfun.cpp:1139
void CleanTailNoneAlphaChar(char *str)
Definition: utilfun.cpp:837
char * SrchTheStr(char *bptr, char *eptr, const char *leadstr)
Definition: utilfun.cpp:923
#define ParFlat_UNKW
Definition: utilfun.h:44
static wxAcceleratorEntry entries[3]
Modified on Thu Mar 28 17:05:07 2024 by modify_doxy.py rev. 669887