NCBI C++ ToolKit
sp_ascii.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: sp_ascii.cpp 102411 2024-05-02 10:00:24Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * File Name: sp_ascii.cpp
27  *
28  * Author: Karl Sirotkin, Hsiu-Chuan Chen
29  *
30  * File Description:
31  * Build SWISS-PROT format entry block. All external variables
32  * are in sp_global.c.
33  * Parse SP image in memory to asn.
34  *
35  */
36 
37 #include <ncbi_pch.hpp>
38 
39 #include "ftacpp.hpp"
40 
46 #include <objects/seq/Seq_hist.hpp>
49 #include <objects/seq/MolInfo.hpp>
50 #include <objmgr/scope.hpp>
54 #include <objects/seq/Seq_inst.hpp>
55 #include <objects/seq/Seq_ext.hpp>
61 #include <objects/seq/Pubdesc.hpp>
62 
63 #include "index.h"
64 #include "sprot.h"
65 
67 #include "ftanet.h"
69 
70 #include "ftaerr.hpp"
71 #include "indx_blk.h"
72 #include "asci_blk.h"
73 #include "sp_ascii.h"
74 #include "utilfeat.h"
75 #include "add.h"
76 #include "nucprot.h"
77 #include "utilfun.h"
78 #include "entry.h"
79 #include "ref.h"
80 #include "xutils.h"
81 
82 #ifdef THIS_FILE
83 # undef THIS_FILE
84 #endif
85 #define THIS_FILE "sp_ascii.cpp"
86 
89 
/* Topic labels, each ending with ':', that this parser knows for
 * SWISS-PROT comment text.  The list is closed by a nullptr sentinel.
 * NOTE(review): the code that consumes this table is outside this
 * chunk - presumably it is matched against the start of CC topics;
 * confirm before reordering entries.
 */
const char* ParFlat_SPComTopics[] = {
    "ALLERGEN:",              "ALTERNATIVE PRODUCTS:",
    "BIOPHYSICOCHEMICAL PROPERTIES:",
    "BIOTECHNOLOGY:",         "CATALYTIC ACTIVITY:",
    "CAUTION:",               "COFACTOR:",
    "DATABASE:",              "DEVELOPMENTAL STAGE:",
    "DISEASE:",               "DISRUPTION PHENOTYPE:",
    "DOMAIN:",                "ENZYME REGULATION:",
    "FUNCTION:",              "INDUCTION:",
    "INTERACTION:",           "MASS SPECTROMETRY:",
    "MISCELLANEOUS:",         "PATHWAY:",
    "PHARMACEUTICAL:",        "POLYMORPHISM:",
    "PTM:",                   "RNA EDITING:",
    "SEQUENCE CAUTION:",      "SIMILARITY:",
    "SUBCELLULAR LOCATION:",  "SUBUNIT:",
    "TISSUE SPECIFICITY:",    "TOXIC DOSE:",
    "WEB RESOURCE:",
    nullptr
};
123 
124 /* bsv : 03/04/2020 : no Seq-feat.exp-ev setting anymore
125 const char* ParFlat_SPFeatNoExp[] = {
126  "(PROBABLE).",
127  "(PROBABLE)",
128  "PROBABLE.",
129  "(POTENTIAL).",
130  "(POTENTIAL)",
131  "POTENTIAL.",
132  "(BY SIMILARITY).",
133  "(BY SIMILARITY)",
134  "BY SIMILARITY.",
135  nullptr
136 };
137 
138 const char* ParFlat_SPFeatNoExpW[] = {
139  "(PUTATIVE).",
140  "(PUTATIVE)",
141  "PUTATIVE.",
142  "(SIMILARITY).",
143  "(SIMILARITY)",
144  "SIMILARITY.",
145  "(POSSIBLE).",
146  "(POSSIBLE)",
147  "POSSIBLE.",
148  "(POSTULATED).",
149  "(POSTULATED)",
150  "POSTULATED.",
151  "(BY HOMOLOGY).",
152  "(BY HOMOLOGY)",
153  "BY HOMOLOGY.",
154  nullptr
155 };
156 */
157 
159  { "ACT_SITE", ParFlatSPSites, 1, nullptr },
160  { "BINDING", ParFlatSPSites, 2, nullptr },
161  { "CARBOHYD", ParFlatSPSites, 6, nullptr },
162  { "MUTAGEN", ParFlatSPSites, 8, nullptr },
163  { "METAL", ParFlatSPSites, 9, nullptr },
164  { "LIPID", ParFlatSPSites, 20, nullptr },
165  { "NP_BIND", ParFlatSPSites, 21, nullptr },
166  { "DNA_BIND", ParFlatSPSites, 22, nullptr },
167  { "SITE", ParFlatSPSites, 255, nullptr },
168  { "MOD_RES", ParFlatSPSites, 5, nullptr }, /* 9 */
169  { "MOD_RES", ParFlatSPSites, 10, "4-aspartylphosphate" },
170  { "MOD_RES", ParFlatSPSites, 10, "5-glutamyl glycerylphosphorylethanolamine" },
171  { "MOD_RES", ParFlatSPSites, 10, "Phosphoarginine" },
172  { "MOD_RES", ParFlatSPSites, 10, "Phosphocysteine" },
173  { "MOD_RES", ParFlatSPSites, 10, "Phosphohistidine" },
174  { "MOD_RES", ParFlatSPSites, 10, "PHOSPHORYLATION" },
175  { "MOD_RES", ParFlatSPSites, 10, "Phosphoserine" },
176  { "MOD_RES", ParFlatSPSites, 10, "Phosphothreonine" },
177  { "MOD_RES", ParFlatSPSites, 10, "Phosphotyrosine" },
178  { "MOD_RES", ParFlatSPSites, 10, "Pros-phosphohistidine" },
179  { "MOD_RES", ParFlatSPSites, 10, "Tele-phosphohistidine" },
180  { "MOD_RES", ParFlatSPSites, 11, "ACETYLATION" },
181  { "MOD_RES", ParFlatSPSites, 11, "N2-acetylarginine" },
182  { "MOD_RES", ParFlatSPSites, 11, "N6-acetyllysine" },
183  { "MOD_RES", ParFlatSPSites, 11, "N-acetylalanine" },
184  { "MOD_RES", ParFlatSPSites, 11, "N-acetylaspartate" },
185  { "MOD_RES", ParFlatSPSites, 11, "N-acetylated lysine" },
186  { "MOD_RES", ParFlatSPSites, 11, "N-acetylcysteine" },
187  { "MOD_RES", ParFlatSPSites, 11, "N-acetylglutamate" },
188  { "MOD_RES", ParFlatSPSites, 11, "N-acetylglycine" },
189  { "MOD_RES", ParFlatSPSites, 11, "N-acetylisoleucine" },
190  { "MOD_RES", ParFlatSPSites, 11, "N-acetylmethionine" },
191  { "MOD_RES", ParFlatSPSites, 11, "N-acetylproline" },
192  { "MOD_RES", ParFlatSPSites, 11, "N-acetylserine" },
193  { "MOD_RES", ParFlatSPSites, 11, "N-acetylthreonine" },
194  { "MOD_RES", ParFlatSPSites, 11, "N-acetyltyrosine" },
195  { "MOD_RES", ParFlatSPSites, 11, "N-acetylvaline" },
196  { "MOD_RES", ParFlatSPSites, 11, "O-acetylserine" },
197  { "MOD_RES", ParFlatSPSites, 11, "O-acetylthreonine" },
198  { "MOD_RES", ParFlatSPSites, 12, "Alanine amide" },
199  { "MOD_RES", ParFlatSPSites, 12, "AMIDATION" },
200  { "MOD_RES", ParFlatSPSites, 12, "Arginine amide" },
201  { "MOD_RES", ParFlatSPSites, 12, "Asparagine amide" },
202  { "MOD_RES", ParFlatSPSites, 12, "Aspartic acid 1-amide" },
203  { "MOD_RES", ParFlatSPSites, 12, "Cysteine amide" },
204  { "MOD_RES", ParFlatSPSites, 12, "Glutamic acid 1-amide" },
205  { "MOD_RES", ParFlatSPSites, 12, "Glutamine amide" },
206  { "MOD_RES", ParFlatSPSites, 12, "Glycine amide" },
207  { "MOD_RES", ParFlatSPSites, 12, "Histidine amide" },
208  { "MOD_RES", ParFlatSPSites, 12, "Isoleucine amide" },
209  { "MOD_RES", ParFlatSPSites, 12, "Leucine amide" },
210  { "MOD_RES", ParFlatSPSites, 12, "Lysine amide" },
211  { "MOD_RES", ParFlatSPSites, 12, "Methionine amide" },
212  { "MOD_RES", ParFlatSPSites, 12, "Phenylalanine amide" },
213  { "MOD_RES", ParFlatSPSites, 12, "Proline amide" },
214  { "MOD_RES", ParFlatSPSites, 12, "Serine amide" },
215  { "MOD_RES", ParFlatSPSites, 12, "Threonine amide" },
216  { "MOD_RES", ParFlatSPSites, 12, "Tryptophan amide" },
217  { "MOD_RES", ParFlatSPSites, 12, "Tyrosine amide" },
218  { "MOD_RES", ParFlatSPSites, 12, "Valine amide" },
219  { "MOD_RES", ParFlatSPSites, 13, "2-methylglutamine" },
220  { "MOD_RES", ParFlatSPSites, 13, "2'-methylsulfonyltryptophan" },
221  { "MOD_RES", ParFlatSPSites, 13, "3-methylthioaspartic acid" },
222  { "MOD_RES", ParFlatSPSites, 13, "5-methylarginine" },
223  { "MOD_RES", ParFlatSPSites, 13, "Asymmetric dimethylarginine" },
224  { "MOD_RES", ParFlatSPSites, 13, "Cysteine methyl disulfide" },
225  { "MOD_RES", ParFlatSPSites, 13, "Cysteine methyl ester" },
226  { "MOD_RES", ParFlatSPSites, 13, "Dimethylated arginine" },
227  { "MOD_RES", ParFlatSPSites, 13, "Glutamate methyl ester (Gln)" },
228  { "MOD_RES", ParFlatSPSites, 13, "Glutamate methyl ester (Glu)" },
229  { "MOD_RES", ParFlatSPSites, 13, "Leucine methyl ester" },
230  { "MOD_RES", ParFlatSPSites, 13, "Lysine methyl ester" },
231  { "MOD_RES", ParFlatSPSites, 13, "METHYLATION" },
232  { "MOD_RES", ParFlatSPSites, 13, "Methylhistidine" },
233  { "MOD_RES", ParFlatSPSites, 13, "N,N,N-trimethylalanine" },
234  { "MOD_RES", ParFlatSPSites, 13, "N,N,N-trimethylglycine" },
235  { "MOD_RES", ParFlatSPSites, 13, "N,N,N-trimethylserine" },
236  { "MOD_RES", ParFlatSPSites, 13, "N,N-dimethylalanine" },
237  { "MOD_RES", ParFlatSPSites, 13, "N,N-dimethylglycine" },
238  { "MOD_RES", ParFlatSPSites, 13, "N,N-dimethylleucine" },
239  { "MOD_RES", ParFlatSPSites, 13, "N,N-dimethylproline" },
240  { "MOD_RES", ParFlatSPSites, 13, "N,N-dimethylserine" },
241  { "MOD_RES", ParFlatSPSites, 13, "N2,N2-dimethylarginine" },
242  { "MOD_RES", ParFlatSPSites, 13, "N4,N4-dimethylasparagine" },
243  { "MOD_RES", ParFlatSPSites, 13, "N4-methylasparagine" },
244  { "MOD_RES", ParFlatSPSites, 13, "N5-methylarginine" },
245  { "MOD_RES", ParFlatSPSites, 13, "N5-methylglutamine" },
246  { "MOD_RES", ParFlatSPSites, 13, "N6,N6,N6-trimethyl-5-hydroxylysine" },
247  { "MOD_RES", ParFlatSPSites, 13, "N6,N6,N6-trimethyllysine" },
248  { "MOD_RES", ParFlatSPSites, 13, "N6,N6-dimethyllysine" },
249  { "MOD_RES", ParFlatSPSites, 13, "N6-methylated lysine" },
250  { "MOD_RES", ParFlatSPSites, 13, "N6-methyllysine" },
251  { "MOD_RES", ParFlatSPSites, 13, "N6-poly(methylaminopropyl)lysine" },
252  { "MOD_RES", ParFlatSPSites, 13, "N-methylalanine" },
253  { "MOD_RES", ParFlatSPSites, 13, "N-methylglycine" },
254  { "MOD_RES", ParFlatSPSites, 13, "N-methylisoleucine" },
255  { "MOD_RES", ParFlatSPSites, 13, "N-methylleucine" },
256  { "MOD_RES", ParFlatSPSites, 13, "N-methylmethionine" },
257  { "MOD_RES", ParFlatSPSites, 13, "N-methylphenylalanine" },
258  { "MOD_RES", ParFlatSPSites, 13, "N-methylproline" },
259  { "MOD_RES", ParFlatSPSites, 13, "N-methylserine" },
260  { "MOD_RES", ParFlatSPSites, 13, "N-methyltyrosine" },
261  { "MOD_RES", ParFlatSPSites, 13, "Omega-N-methylarginine" },
262  { "MOD_RES", ParFlatSPSites, 13, "Omega-N-methylated arginine" },
263  { "MOD_RES", ParFlatSPSites, 13, "O-methylthreonine" },
264  { "MOD_RES", ParFlatSPSites, 13, "Pros-methylhistidine" },
265  { "MOD_RES", ParFlatSPSites, 13, "S-methylcysteine" },
266  { "MOD_RES", ParFlatSPSites, 13, "Symmetric dimethylarginine" },
267  { "MOD_RES", ParFlatSPSites, 13, "Tele-methylhistidine" },
268  { "MOD_RES", ParFlatSPSites, 13, "Threonine methyl ester" },
269  { "MOD_RES", ParFlatSPSites, 14, "(3R)-3-hydroxyarginine" },
270  { "MOD_RES", ParFlatSPSites, 14, "(3R)-3-hydroxyasparagine" },
271  { "MOD_RES", ParFlatSPSites, 14, "(3R)-3-hydroxyaspartate" },
272  { "MOD_RES", ParFlatSPSites, 14, "(3S)-3-hydroxyhistidine" },
273  { "MOD_RES", ParFlatSPSites, 14, "(3R,4R)-3,4-dihydroxyproline" },
274  { "MOD_RES", ParFlatSPSites, 14, "(3R,4R)-4,5-dihydroxyisoleucine" },
275  { "MOD_RES", ParFlatSPSites, 14, "(3R,4S)-3,4-dihydroxyproline" },
276  { "MOD_RES", ParFlatSPSites, 14, "(3R,4S)-4-hydroxyisoleucine" },
277  { "MOD_RES", ParFlatSPSites, 14, "(3S)-3-hydroxyasparagine" },
278  { "MOD_RES", ParFlatSPSites, 14, "(3S)-3-hydroxyaspartate" },
279  { "MOD_RES", ParFlatSPSites, 14, "(3S,4R)-3,4-dihydroxyisoleucine" },
280  { "MOD_RES", ParFlatSPSites, 14, "(4R)-5-hydroxyleucine" },
281  { "MOD_RES", ParFlatSPSites, 14, "(4R)-4,5-dihydroxyleucine" },
282  { "MOD_RES", ParFlatSPSites, 14, "3,4-dihydroxyarginine" },
283  { "MOD_RES", ParFlatSPSites, 14, "3',4'-dihydroxyphenylalanine" },
284  { "MOD_RES", ParFlatSPSites, 14, "3,4-dihydroxyproline" },
285  { "MOD_RES", ParFlatSPSites, 14, "3-hydroxyasparagine" },
286  { "MOD_RES", ParFlatSPSites, 14, "3-hydroxyaspartate" },
287  { "MOD_RES", ParFlatSPSites, 14, "3-hydroxyphenylalanine" },
288  { "MOD_RES", ParFlatSPSites, 14, "3-hydroxyproline" },
289  { "MOD_RES", ParFlatSPSites, 14, "3-hydroxytryptophan" },
290  { "MOD_RES", ParFlatSPSites, 14, "3-hydroxyvaline" },
291  { "MOD_RES", ParFlatSPSites, 14, "4,5,5'-trihydroxyleucine" },
292  { "MOD_RES", ParFlatSPSites, 14, "4,5-dihydroxylysine" },
293  { "MOD_RES", ParFlatSPSites, 14, "4-hydroxyarginine" },
294  { "MOD_RES", ParFlatSPSites, 14, "4-hydroxyglutamate" },
295  { "MOD_RES", ParFlatSPSites, 14, "4-hydroxyproline" },
296  { "MOD_RES", ParFlatSPSites, 14, "5-hydroxy-3-methylproline (Ile)" },
297  { "MOD_RES", ParFlatSPSites, 14, "5-hydroxylysine" },
298  { "MOD_RES", ParFlatSPSites, 14, "(5R)-5-hydroxylysine" },
299  { "MOD_RES", ParFlatSPSites, 14, "(5S)-5-hydroxylysine" },
300  { "MOD_RES", ParFlatSPSites, 14, "7'-hydroxytryptophan" },
301  { "MOD_RES", ParFlatSPSites, 14, "D-4-hydroxyvaline" },
302  { "MOD_RES", ParFlatSPSites, 14, "HYDROXYLATION" },
303  { "MOD_RES", ParFlatSPSites, 14, "Hydroxyproline" },
304  { "MOD_RES", ParFlatSPSites, 14, "N6-(3,6-diaminohexanoyl)-5-hydroxylysine" },
305  { "MOD_RES", ParFlatSPSites, 15, "SULFATATION" },
306  { "MOD_RES", ParFlatSPSites, 15, "Sulfoserine" },
307  { "MOD_RES", ParFlatSPSites, 15, "Sulfothreonine" },
308  { "MOD_RES", ParFlatSPSites, 15, "Sulfotyrosine" },
309  { "MOD_RES", ParFlatSPSites, 16, "OXIDATIVE DEAMINATION" },
310  { "MOD_RES", ParFlatSPSites, 17, "Pyrrolidone carboxylic acid" },
311  { "MOD_RES", ParFlatSPSites, 17, "Pyrrolidone carboxylic acid (Glu)" },
312  { "MOD_RES", ParFlatSPSites, 18, "4-carboxyglutamate" },
313  { "MOD_RES", ParFlatSPSites, 18, "GAMMA-CARBOXYGLUTAMIC ACID" },
314  { "MOD_RES", ParFlatSPSites, 19, "Blocked" },
315  { "MOD_RES", ParFlatSPSites, 19, "Blocked amino end (Ala)" },
316  { "MOD_RES", ParFlatSPSites, 19, "Blocked amino end (Arg)" },
317  { "MOD_RES", ParFlatSPSites, 19, "Blocked amino end (Asn)" },
318  { "MOD_RES", ParFlatSPSites, 19, "Blocked amino end (Asp)" },
319  { "MOD_RES", ParFlatSPSites, 19, "Blocked amino end (Asx)" },
320  { "MOD_RES", ParFlatSPSites, 19, "Blocked amino end (Cys)" },
321  { "MOD_RES", ParFlatSPSites, 19, "Blocked amino end (Gln)" },
322  { "MOD_RES", ParFlatSPSites, 19, "Blocked amino end (Glu)" },
323  { "MOD_RES", ParFlatSPSites, 19, "Blocked amino end (Gly)" },
324  { "MOD_RES", ParFlatSPSites, 19, "Blocked amino end (Ile)" },
325  { "MOD_RES", ParFlatSPSites, 19, "Blocked amino end (Leu)" },
326  { "MOD_RES", ParFlatSPSites, 19, "Blocked amino end (Met)" },
327  { "MOD_RES", ParFlatSPSites, 19, "Blocked amino end (Pro)" },
328  { "MOD_RES", ParFlatSPSites, 19, "Blocked amino end (Ser)" },
329  { "MOD_RES", ParFlatSPSites, 19, "Blocked amino end (Thr)" },
330  { "MOD_RES", ParFlatSPSites, 19, "Blocked amino end (Val)" },
331  { "MOD_RES", ParFlatSPSites, 19, "Blocked amino end (Xaa)" },
332  { "MOD_RES", ParFlatSPSites, 19, "Blocked carboxyl end (Arg)" },
333  { "MOD_RES", ParFlatSPSites, 19, "Blocked carboxyl end (His)" }, /* 174 */
334  { "DISULFID", ParFlatSPBonds, 1, nullptr },
335  { "THIOLEST", ParFlatSPBonds, 2, nullptr },
336  { "CROSSLNK", ParFlatSPBonds, 3, nullptr },
337  { "THIOETH", ParFlatSPBonds, 4, nullptr },
338  { "SIGNAL", ParFlatSPRegions, -1, "Signal" },
339  { "PROPEP", ParFlatSPRegions, -1, "Propeptide" },
340  { "CHAIN", ParFlatSPRegions, -1, "Mature chain" },
341  { "TRANSIT", ParFlatSPRegions, -1, "Transit peptide" },
342  { "PEPTIDE", ParFlatSPRegions, -1, "Processed active peptide" },
343  { "DOMAIN", ParFlatSPRegions, -1, "Domain" },
344  { "CA_BIND", ParFlatSPRegions, -1, "Calcium binding region" },
345  { "TRANSMEM", ParFlatSPRegions, -1, "Transmembrane region" },
346  { "ZN_FING", ParFlatSPRegions, -1, "Zinc finger region" },
347  { "SIMILAR", ParFlatSPRegions, -1, "Similarity" },
348  { "REPEAT", ParFlatSPRegions, -1, "Repetitive region" },
349  { "HELIX", ParFlatSPRegions, -1, "Helical region" },
350  { "STRAND", ParFlatSPRegions, -1, "Beta-strand region" },
351  { "TURN", ParFlatSPRegions, -1, "Hydrogen bonded turn" },
352  { "CONFLICT", ParFlatSPRegions, -1, "Conflict" },
353  { "VARIANT", ParFlatSPRegions, -1, "Variant" },
354  { "SE_CYS", ParFlatSPRegions, -1, "Selenocysteine" },
355  { "VARSPLIC", ParFlatSPRegions, -1, "Splicing variant" },
356  { "VAR_SEQ", ParFlatSPRegions, -1, "Splicing variant" },
357  { "COILED", ParFlatSPRegions, -1, "Coiled-coil region" },
358  { "COMPBIAS", ParFlatSPRegions, -1, "Compositionally biased region" },
359  { "MOTIF", ParFlatSPRegions, -1, "Short sequence motif of biological interest" },
360  { "REGION", ParFlatSPRegions, -1, "Region of interest in the sequence" },
361  { "TOPO_DOM", ParFlatSPRegions, -1, "Topological domain" },
362  { "INTRAMEM", ParFlatSPRegions, -1, "Intramembrane region" },
363  { "UNSURE", ParFlatSPImports, -1, "unsure" },
364  { "INIT_MET", ParFlatSPInitMet, -1, "INIT_MET" },
365  { "NON_TER", ParFlatSPNonTer, -1, "NON_TER" },
366  { "NON_CONS", ParFlatSPNonCons, -1, "NON_CONS" },
367  { nullptr, 0, 0, nullptr }
368 };
369 
/* Inclusive index range of the MOD_RES rows inside the
 * "ParFlat_SPFeat" table (the rows tagged 9 and 174 there).
 * Both values must be kept in step with that table.
 */
#define ParFlatSPSitesModB 9   /* first MOD_RES row */
#define ParFlatSPSitesModE 174 /* last MOD_RES row */
374 
/* Copyright phrases of SWISS-PROT / UniProt entries.
 * NOTE(review): the users of these macros are outside this chunk -
 * presumably they are used to recognise the copyright notice inside
 * comment text; confirm.
 */
#define COPYRIGHT  "This Swiss-Prot entry is copyright."
#define COPYRIGHT1 "Copyrighted by the UniProt Consortium,"
377 
/* Single-bit codes for the tokens of a structured DE line.  The
 * values are unchanged; they are spelled in hexadecimal so that the
 * one-bit-per-token layout is easy to see.
 */
#define SPDE_RECNAME    0x0001
#define SPDE_ALTNAME    0x0002
#define SPDE_SUBNAME    0x0004
#define SPDE_FLAGS      0x0008
#define SPDE_INCLUDES   0x0010
#define SPDE_CONTAINS   0x0020
#define SPDE_FULL       0x0040
#define SPDE_SHORT      0x0080
#define SPDE_EC         0x0100
#define SPDE_ALLERGEN   0x0200
#define SPDE_BIOTECH    0x0400
#define SPDE_CD_ANTIGEN 0x0800
#define SPDE_INN        0x1000
391 
392 struct CharIntLen {
393  const char* str;
396 };
397 
398 struct SPDEFields {
399  Int4 tag = 0;
400  char* start = nullptr;
401  char* end = nullptr;
402  SPDEFields* next = nullptr;
403 };
405 
406 struct SPFeatInput {
407  string key; /* column 6-13 */
408  string from; /* column 15-20 */
409  string to; /* column 22-27 */
410  string descrip; /* column 35-75, continue line if a blank key */
411  SPFeatInput* next = nullptr; /* next FT */
412 };
414 
/* Set of boolean flags, all cleared by default.
 * NOTE(review): judging by the names they record INIT_MET / NON_TER
 * features and open (partial) ends of the protein - the code that
 * sets and reads them is outside this chunk; confirm.
 */
struct SPFeatBln {
    bool initmet = false;
    bool nonter  = false;
    bool noright = false;
    bool noleft  = false;
};
422 
423 /* segment location, data from NON_CONS
424  */
425 struct SPSegLoc {
426  Int4 from = 0; /* the beginning point of the segment */
427  Int4 len = 0; /* total length of the segment */
428  SPSegLoc* next = nullptr;
429 };
431 
/* Node of a singly linked list of synonym strings. */
struct SetOfSyns {
    char*      synname = nullptr; /* the synonym text */
    SetOfSyns* next    = nullptr; /* following node, nullptr at the tail */
};
437 
438 struct SetOfSpecies {
439  char* fullname = nullptr;
440  char* name = nullptr;
441  SetOfSyns* syn = nullptr;
442 };
444 
445 struct ViralHost {
447  char* name = nullptr;
448  ViralHost* next = nullptr;
449 };
451 
452 // clang-format off
454  {"RecName:", SPDE_RECNAME, 8},
455  {"AltName:", SPDE_ALTNAME, 8},
456  {"SubName:", SPDE_SUBNAME, 8},
457  {"Includes:", SPDE_INCLUDES, 9},
458  {"Contains:", SPDE_CONTAINS, 9},
459  {"Flags:", SPDE_FLAGS, 6},
460  {"Full=", SPDE_FULL, 5},
461  {"Short=", SPDE_SHORT, 6},
462  {"EC=", SPDE_EC, 3},
463  {"Allergen=", SPDE_ALLERGEN, 9},
464  {"Biotech=", SPDE_BIOTECH, 8},
465  {"CD_antigen=", SPDE_CD_ANTIGEN, 11},
466  {"INN=", SPDE_INN, 4},
467  {nullptr, 0, 0},
468 };
469 
/* Upper-case organism modifier keywords, closed by a nullptr
 * sentinel.
 * NOTE(review): the lookup code is outside this chunk - presumably
 * used while parsing organism names; confirm.
 */
const char* org_mods[] = {
    "STRAIN",    "SUBSTRAIN", "TYPE",     "SUBTYPE",
    "VAR.",      "SEROTYPE",  "SEROGROUP", "SEROVAR",
    "CULTIVAR",  "PATHOVAR",  "CHEMOVAR", "BIOVAR",
    "BIOTYPE",   "GROUP",     "SUBGROUP", "ISOLATE",
    "ACRONYM",   "DOSAGE",    "NAT_HOST", "SUBSP.",
    nullptr
};
476 
/* Upper-case database names, closed by a nullptr sentinel.
 * NOTE(review): judging by the identifier these are cross-reference
 * databases regarded as obsolete; how a match is reported is decided
 * by code outside this chunk - confirm.
 */
const char* obsolete_dbs[] = {
    "2DBASE-ECOLI",           "AARHUS/GHENT-2DPAGE", "AGD",
    "ANU-2DPAGE",             "BURULIST",            "CARBBANK",
    "CMR",                    "CORNEA-2DPAGE",       "DICTYDB",
    "DOMO",                   "ECO2DBASE",           "GCRDB",
    "GENEVESTIGATOR",         "GENEW",               "GENOMEREVIEWS",
    "GERMONLINE",             "HIV",                 "HSC-2DPAGE",
    "HSSP",                   "IPI",                 "LINKHUB",
    "LISTILIST",              "MAIZE-2DPAGE",        "MENDEL",
    "MGD",                    "MYPULIST",            "NMPDR",
    "PATHWAY_INTERACTION_DB", "PHCI-2DPAGE",         "PHOSSITE",
    "PPTASEDB",               "PROTCLUSTDB",         "PHOTOLIST",
    "PMMA-2DPAGE",            "RAT-HEART-2DPAGE",    "RZPD-PROTEXP",
    "SAGALIST",               "SIENA-2DPAGE",        "STYGENE",
    "SUBTILIST",              "TIGR",                "TRANSFAC",
    "WORMPEP",                "YEPD",                "YPD",
    nullptr
};
495 
/* Upper-case database names, closed by a nullptr sentinel.
 * NOTE(review): judging by the identifier these are the accepted
 * cross-reference databases; the lookup code is outside this chunk -
 * confirm.
 */
const char* valid_dbs[] = {
    "ALLERGOME",           "ARACHNOSERVER",     "ARAPORT",
    "ARRAYEXPRESS",        "BEEBASE",           "BGD",
    "BGEE",                "BINDINGDB",         "BIOCYC",
    "BIOGRID",             "BIOMUTA",           "BRENDA",
    "CAZY",                "CCDS",              "CDD",
    "CGD",                 "CHEMBL",            "CHITARS",
    "CLEANEX",             "COLLECTF",          "COMPLUYEAST-2DPAGE",
    "CONOSERVER",          "CTD",               "CYGD",
    "DBSNP",               "DEPOD",             "DICTYBASE",
    "DIP",                 "DISGENET",          "DISPROT",
    "DMDM",                "DNASU",             "DOSAC-COBS-2DPAGE",
    "DRUGBANK",            "ECHOBASE",          "ECOGENE",
    "EGGNOG",              "EMBL",              "ENSEMBL",
    "ENSEMBLBACTERIA",     "ENSEMBLFUNGI",      "ENSEMBLMETAZOA",
    "ENSEMBLPLANTS",       "ENSEMBLPROTISTS",   "EPD",
    "ESTHER",              "EUHCVDB",           "EUPATHDB",
    "EUROPEPMC",           "EVOLUTIONARYTRACE", "EXPRESSIONATLAS",
    "FLYBASE",             "GENE3D",            "GENECARDS",
    "GENEDB",              "GENEDB_SPOMBE",     "GENEFARM",
    "GENEID",              "GENEREVIEWS",       "GENETREE",
    "GENEVISIBLE",         "GENEWIKI",          "GENOLIST",
    "GENOMERNAI",          "GK",                "GLYCOSUITEDB",
    "GRAINGENES",          "GO",                "GRAMENE",
    "GUIDETOPHARMACOLOGY", "H-INVDB",           "HAMAP",
    "HGNC",                "HOGENOM",           "HOVERGEN",
    "HPA",                 "IMGT/GENE-DB",      "IMGT/HLA",
    "IMGT/LIGM",           "IMGT_GENE-DB",      "INPARANOID",
    "INTACT",              "INTERPRO",          "IPD-KIR",
    "IPTMNET",             "KEGG",              "KO",
    "LEGIOLIST",           "LEPROMA",           "MAIZEDB",
    "MAIZEGDB",            "MALACARDS",         "MAXQB",
    "MEROPS",              "MGI",               "MIM",
    "MINT",                "MIRBASE",           "MOONPROT",
    "MYCOCLAP",            "NEXTBIO",           "NEXTPROT",
    "OGP",                 "OMA",               "OPENTARGETS",
    "ORPHANET",            "ORTHODB",           "PANTHER",
    "PATRIC",              "PAXDB",             "PDB",
    "PDBSUM",              "PEPTIDEATLAS",      "PEROXIBASE",
    "PFAM",                "PHARMGKB",          "PHOSPHOSITE",
    "PHOSPHOSITEPLUS",     "PHYLOMEDB",         "PIR",
    "PIRSF",               "PMAP-CUTDB",        "POMBASE",
    "PR",                  "PR2",               "PRIDE",
    "PRINTS",              "PRO",               "PRODOM",
    "PROMEX",              "PROSITE",           "PROTEINMODELPORTAL",
    "PROTEOMES",           "PSEUDOCAP",         "REACTOME",
    "REBASE",              "REFSEQ",            "REPRODUCTION-2DPAGE",
    "RGD",                 "RZPD",              "SABIO-RK",
    "SFLD",                "SGD",               "SIGNALINK",
    "SIGNALLINK",          "SIGNOR",            "SMART",
    "SMR",                 "STRING",            "SUPFAM",
    "SWISS-2DPAGE",        "SWISSLIPIDS",       "SWISSPALM",
    "TAIR",                "TCDB",              "TIGRFAMS",
    "TOPDOWNPROTEOMICS",   "TREEFAM",           "TUBERCULIST",
    "UCD-2DPAGE",          "UCSC",              "UNICARBKB",
    "UNIGENE",             "UNILIB",            "UNIPATHWAY",
    "UNITE",               "VBASE2",            "VECTORBASE",
    "VEGA-TR",             "VEGA-GN",           "VGNC",
    "WBPARASITE",          "WORLD-2DPAGE",      "WORMBASE",
    "XENBASE",             "ZFIN",
    nullptr
};
557 
/* Organelle keywords looked up, case-insensitively, in the text of
 * an OG line.  ORDER IS SIGNIFICANT: the position returned by
 * StringMatchIcase() is tested against the literal indices 0..8 by
 * the OG-handling code in this file, with positions 7 and 8 handled
 * together.  Append or reorder only together with that code.
 */
const char* SP_organelle[] = {
    "CHLOROPLAST",              /* 0 */
    "CYANELLE",                 /* 1 */
    "MITOCHONDRION",            /* 2 */
    "PLASMID",                  /* 3 */
    "NUCLEOMORPH",              /* 4 */
    "HYDROGENOSOME",            /* 5 */
    "APICOPLAST",               /* 6 */
    "CHROMATOPHORE",            /* 7 */
    "ORGANELLAR CHROMATOPHORE", /* 8 */
    nullptr
};
563 
/* Texts of the protein-existence levels, closed by a nullptr
 * sentinel.
 * NOTE(review): presumably compared with the value of a PE line by
 * code outside this chunk; confirm whether the position in the
 * table carries meaning before reordering.
 */
const char* PE_values[] = {
    "Evidence at protein level",
    "Evidence at transcript level",
    "Inferred from homology",
    "Predicted",
    "Uncertain",
    nullptr
};
572 // clang-format on
573 
574 /**********************************************************
575  *
576  * static char* StringCombine(str1, str2, delim):
577  *
578  * Return a string which is combined str1 and str2,
579  * put blank between two strings if "blank" = TRUE;
580  * also memory free out str1 and str2.
581  *
582  **********************************************************/
583 static void StringCombine(string& dest, const string& to_add, const Char* delim)
584 {
585  if (to_add.empty())
586  return;
587 
588  if (delim && *delim != '\0' && ! dest.empty())
589  dest += delim[0];
590 
591  dest += to_add;
592 }
593 
594 /**********************************************************
595  *
596  * static CRef<CDbtag> MakeStrDbtag(dbname, str):
597  *
598  * 10-1-93
599  *
600  **********************************************************/
601 static CRef<CDbtag> MakeStrDbtag(const char* dbname, const char* str)
602 {
604 
605  if (dbname && str) {
606  tag.Reset(new CDbtag);
607  tag->SetDb(dbname);
608  tag->SetTag().SetStr(str);
609  }
610 
611  return tag;
612 }
613 
614 /**********************************************************
615  *
616  * static CRef<CDate> MakeDatePtr(str, source):
617  *
618  * Return a DatePtr with "std" type if dd-mmm-yyyy
619  * or dd-mmm-yy format; with "str" type if not
620  * a dd-mmm-yyyy format.
621  *
622  **********************************************************/
624 {
625  static Char msg[11];
626 
627  CRef<CDate> res(new CDate);
628 
629  if (! str)
630  return res;
631 
632  if (StringChr(str, '-') && (isdigit(*str) != 0 || *str == ' ')) {
633  CRef<CDate_std> std_date = get_full_date(str, true, source);
634  res->SetStd(*std_date);
635  if (XDateCheck(*std_date) != 0) {
636  StringNCpy(msg, str, 10);
637  msg[10] = '\0';
638  ErrPostEx(SEV_WARNING, ERR_REFERENCE_IllegalDate, "Illegal date: %s", msg);
639  }
640  }
641 
642  if (res->Which() == CDate::e_not_set) {
643  res->SetStr(str);
644  }
645 
646  return res;
647 }
648 
649 /**********************************************************/
650 static void fta_create_pdb_seq_id(CSP_block_Base::TSeqref& refs, const char* mol, Uint1 chain)
651 {
652  if (! mol)
653  return;
654 
655  CRef<CPDB_seq_id> pdb_seq_id(new CPDB_seq_id);
656  pdb_seq_id->SetMol(CPDB_mol_id(mol));
657 
658  if (chain > 0) {
659  pdb_seq_id->SetChain(chain);
660  }
661 
662  CRef<CSeq_id> sid(new CSeq_id);
663  sid->SetPdb(*pdb_seq_id);
664  refs.push_back(sid);
665 }
666 
667 /**********************************************************/
668 static void MakeChainPDBSeqId(CSP_block_Base::TSeqref& refs, const char* mol, char* chain)
669 {
670  char* fourth;
671  char* p;
672  char* q;
673  char* r;
674 
675  bool bad;
676  bool got;
677 
678  if (! mol || ! chain)
679  return;
680 
681  fourth = StringSave(chain);
682  for (bad = false, got = false, q = chain; *q != '\0'; q = p) {
683  while (*q == ' ' || *q == ',')
684  q++;
685  for (p = q; *p != '\0' && *p != ' ' && *p != ',';)
686  p++;
687  if (*p != '\0')
688  *p++ = '\0';
689  r = StringRChr(q, '=');
690  if (! r && ! got) {
691  fta_create_pdb_seq_id(refs, mol, 0);
692  continue;
693  }
694  *r = '\0';
695  for (r = q; *r != '\0'; r++) {
696  if (*r == '/')
697  continue;
698  if (r[1] != '/' && r[1] != '\0') {
699  while (*r != '/' && *r != '\0')
700  r++;
701  r--;
702  bad = true;
703  continue;
704  }
705  got = true;
706  fta_create_pdb_seq_id(refs, mol, *r);
707  }
708  }
709 
710  if (bad) {
711  ErrPostEx(SEV_ERROR, ERR_FORMAT_InvalidPDBCrossRef, "PDB cross-reference \"%s\" contains one or more chain identifiers that are more than a single character in length.", fourth);
712  if (! got)
713  fta_create_pdb_seq_id(refs, mol, 0);
714  }
715 
716  MemFree(fourth);
717 }
718 
719 /**********************************************************
720  *
721  * static void MakePDBSeqId(refs, mol, rel, chain, drop, source):
722  *
723  * 10-1-93
724  *
725  **********************************************************/
726 static void MakePDBSeqId(CSP_block_Base::TSeqref& refs, const char* mol, const char* rel, char* chain, bool* drop, Parser::ESource source)
727 {
728  if (! mol)
729  return;
730 
731  if (! chain) {
732  CRef<CPDB_seq_id> pdb_seq_id(new CPDB_seq_id);
733  pdb_seq_id->SetMol(CPDB_mol_id(mol));
734 
735  if (rel) {
736  CRef<CDate> date = MakeDatePtr(rel, source);
737  pdb_seq_id->SetRel(*date);
738  }
739 
740  CRef<CSeq_id> sid(new CSeq_id);
741  sid->SetPdb(*pdb_seq_id);
742  refs.push_back(sid);
743  } else
744  MakeChainPDBSeqId(refs, mol, chain);
745 }
746 
747 /**********************************************************/
748 static void GetIntFuzzPtr(Uint1 choice, Int4 a, Int4 b, CInt_fuzz& fuzz)
749 {
750  if (choice < 1 || choice > 4)
751  return;
752 
753  if (choice == 2) {
754  fuzz.SetRange().SetMax(a);
755  if (b >= 0)
756  fuzz.SetRange().SetMin(b);
757  } else if (choice == 4) {
758  fuzz.SetLim(static_cast<CInt_fuzz::ELim>(a));
759  }
760 }
761 
762 /**********************************************************/
764 {
765  DataBlkPtr subdbp;
766  char* p;
767  Int4 gmod = -1;
768 
769  for (; dbp; dbp = dbp->mpNext)
770  if (dbp->mType == ParFlatSP_OS) {
771  subdbp = static_cast<DataBlk*>(dbp->mpData);
772  for (; subdbp; subdbp = subdbp->mpNext)
773  if (subdbp->mType == ParFlatSP_OG) {
774  p = subdbp->mOffset + ParFlat_COL_DATA_SP;
775  if (StringEquNI(p, "Plastid;", 8))
776  for (p += 8; *p == ' ';)
777  p++;
778  gmod = StringMatchIcase(SP_organelle, p);
779  }
780  }
781  if (gmod < 0)
783  if (gmod == 0)
785  if (gmod == 1)
787  if (gmod == 2)
789  if (gmod == 3)
791  if (gmod == 4)
793  if (gmod == 5)
795  if (gmod == 6)
797  if (gmod == 7 || gmod == 8)
800 }
801 
802 /**********************************************************/
803 static void SpAddToIndexBlk(DataBlkPtr entry, IndexblkPtr pIndex)
804 {
805  char* eptr;
806  char* offset;
807  size_t len = 0;
808 
809  offset = SrchNodeType(entry, ParFlatSP_ID, &len);
810  if (! offset || len == 0)
811  return;
812 
813  eptr = offset + len - 1;
814  if (len > 5 && StringEquN(eptr - 3, "AA.", 3))
815  eptr -= 4;
816 
817  while (*eptr == ' ' && eptr > offset)
818  eptr--;
819  while (isdigit(*eptr) != 0 && eptr > offset)
820  eptr--;
821  pIndex->bases = atoi(eptr + 1);
822  while (*eptr == ' ' && eptr > offset)
823  eptr--;
824  if (*eptr == ';')
825  eptr--;
826  while (isalpha(*eptr) != 0 && eptr > offset)
827  eptr--;
828 
829  StringNCpy(pIndex->division, eptr + 1, 3);
830  pIndex->division[3] = '\0';
831 }
832 
833 /**********************************************************
834  *
835  * static void GetSprotSubBlock(pp, entry):
836  *
837  * 9-23-93
838  *
839  **********************************************************/
840 static void GetSprotSubBlock(ParserPtr pp, const DataBlk* entry)
841 {
842  DataBlkPtr dbp;
843 
844  dbp = TrackNodeType(*entry, ParFlatSP_OS);
845  if (dbp) {
846  BuildSubBlock(dbp, ParFlatSP_OG, "OG");
847  BuildSubBlock(dbp, ParFlatSP_OC, "OC");
848  BuildSubBlock(dbp, ParFlatSP_OX, "OX");
849  BuildSubBlock(dbp, ParFlatSP_OH, "OH");
850  GetLenSubNode(dbp);
851  }
852 
853  dbp = TrackNodeType(*entry, ParFlatSP_RN);
854  for (; dbp; dbp = dbp->mpNext) {
855  if (dbp->mType != ParFlatSP_RN)
856  continue;
857 
858  BuildSubBlock(dbp, ParFlatSP_RP, "RP");
859  BuildSubBlock(dbp, ParFlatSP_RC, "RC");
860  BuildSubBlock(dbp, ParFlatSP_RM, "RM");
861  BuildSubBlock(dbp, ParFlatSP_RX, "RX");
862  BuildSubBlock(dbp, ParFlatSP_RG, "RG");
863  BuildSubBlock(dbp, ParFlatSP_RA, "RA");
864  BuildSubBlock(dbp, ParFlatSP_RT, "RT");
865  BuildSubBlock(dbp, ParFlatSP_RL, "RL");
866  GetLenSubNode(dbp);
867  dbp->mType = ParFlat_REF_END; /* swiss-prot only has one type */
868  }
869 }
870 
871 /**********************************************************
872  *
873  * static char* GetSPDescrTitle(bptr, eptr, fragment)
874  *
875  * Return title string without "(EC ...)" and
876  * "(FRAGMENT)".
877  *
878  * 10-8-93
879  *
880  **********************************************************/
881 static string GetSPDescrTitle(string_view sv, bool* fragment)
882 {
883  const char* tag;
884  char* ptr;
885  char* str;
886  char* p;
887  char* q;
888  Char symb;
889  Int4 shift;
890  bool ret;
891 
893  StripECO(str_);
894 
895  if (str_.find("(GENE NAME") != string::npos) {
896  ErrPostStr(SEV_WARNING, ERR_GENENAME_DELineGeneName, "Old format, found gene_name in the DE data line");
897  }
898 
899  ShrinkSpaces(str_);
900  str = StringSave(str_);
901 
902  /* Delete (EC ...)
903  */
904  if (StringEquNI(str, "RecName: ", 9) ||
905  StringEquNI(str, "AltName: ", 9) ||
906  StringEquNI(str, "SubName: ", 9)) {
907  tag = "; EC=";
908  symb = ';';
909  shift = 5;
910  } else {
911  tag = "(EC ";
912  symb = ')';
913  shift = 4;
914  }
915 
916  for (ptr = str;;) {
917  ptr = StringStr(ptr, tag);
918  if (! ptr)
919  break;
920 
921  for (p = ptr + shift; *p == ' ';)
922  p++;
923 
924  if (*p == symb || *p == '\0') {
925  ptr = p;
926  continue;
927  }
928 
929  while (*p == '.' || *p == '-' || *p == 'n' || isdigit(*p) != 0)
930  p++;
931  if (symb == ')')
932  while (*p == ' ' || *p == ')')
933  p++;
934 
935  fta_StringCpy(ptr, p);
936  }
937 
938  if (symb == ';') {
939  for (ptr = str;;) {
940  ptr = StringIStr(ptr, "; Flags:");
941  if (! ptr)
942  break;
943  if (ptr[8] == '\0') {
944  *ptr = '\0';
945  break;
946  }
947  if (ptr[8] != ' ') {
948  ptr += 8;
949  continue;
950  ;
951  }
952  for (q = ptr + 8;;) {
953  p = StringChr(q, ':');
954  q = StringIStr(q, " Fragment");
955  if (! q || (p && q > p))
956  break;
957 
958  ret = true;
959  if (q[9] == ';')
960  fta_StringCpy(q, q + 10);
961  else if (q[9] == '\0')
962  *q = '\0';
963  else if (q[9] == 's' || q[9] == 'S') {
964  if (q[10] == ';')
965  fta_StringCpy(q, q + 11);
966  else if (q[10] == '\0')
967  *q = '\0';
968  else {
969  q++;
970  ret = false;
971  }
972  } else {
973  q++;
974  ret = false;
975  }
976  if (ret)
977  *fragment = true;
978  }
979  if (ptr[8] == '\0') {
980  *ptr = '\0';
981  break;
982  }
983  q = StringChr(ptr + 8, ';');
984  p = StringChr(ptr + 8, ':');
985  if (! q) {
986  if (! p)
987  break;
988  else
989  fta_StringCpy(ptr + 2, ptr + 9);
990  } else {
991  if (! p)
992  ptr += 9;
993  else {
994  if (p < q)
995  fta_StringCpy(ptr + 2, ptr + 9);
996  else
997  ptr += 9;
998  }
999  }
1000  }
1001  } else {
1002  ptr = StringIStr(str, "(FRAGMENT");
1003  if (ptr) {
1004  /* delete (FRAGMENTS) or (FRAGMENT)
1005  */
1006  *fragment = true;
1007 
1008  for (p = ptr + 8; *p != '\0' && *p != ')';)
1009  p++;
1010  while (*p == ' ' || *p == ')')
1011  p++;
1012 
1013  fta_StringCpy(ptr, p);
1014  }
1015  }
1016 
1017  string s = tata_save(str);
1018  MemFree(str);
1019  if (! s.empty() && s.back() == '.') {
1020  s.pop_back();
1021  while (! s.empty() && s.back() == ' ')
1022  s.pop_back();
1023  s.push_back('.');
1024  }
1025  return s;
1026 }
1027 
1028 /**********************************************************/
1029 static char* GetLineOSorOC(DataBlkPtr dbp, const char* pattern)
1030 {
1031  char* res;
1032  char* p;
1033  char* q;
1034 
1035  size_t len = dbp->len;
1036  if (len == 0)
1037  return nullptr;
1038  for (size_t i = 0; i < dbp->len; i++)
1039  if (dbp->mOffset[i] == '\n')
1040  len -= 5;
1041  res = StringNew(len - 1);
1042  p = res;
1043  for (q = dbp->mOffset; *q != '\0';) {
1044  if (! StringEquN(q, pattern, 5))
1045  break;
1046  if (p > res)
1047  *p++ = ' ';
1048  for (q += 5; *q != '\n' && *q != '\0'; q++)
1049  *p++ = *q;
1050  if (*q == '\n')
1051  q++;
1052  }
1053  *p = '\0';
1054  if (p > res)
1055  p--;
1056  while (*p == '.' || *p == ' ' || *p == '\t') {
1057  *p = '\0';
1058  if (p > res)
1059  p--;
1060  }
1061  return (res);
1062 }
1063 
1064 /**********************************************************/
1066 {
1067  SetOfSpeciesPtr res;
1068  SetOfSynsPtr ssp;
1069  SetOfSynsPtr tssp;
1070  char* p;
1071  char* q;
1072  char* r;
1073  char* temp;
1074  Int2 i;
1075 
1076  if (! line || line[0] == '\0')
1077  return nullptr;
1078  for (p = line; *p == ' ' || *p == '\t' || *p == '.' || *p == ',';)
1079  p++;
1080  if (*p == '\0')
1081  return nullptr;
1082 
1083  res = new SetOfSpecies;
1084  res->fullname = StringSave(p);
1085 
1086  temp = StringSave(res->fullname);
1087  p = StringChr(temp, '(');
1088  if (! p)
1089  res->name = StringSave(temp);
1090  else {
1091  *p = '\0';
1092  q = temp;
1093  if (p > q) {
1094  for (r = p - 1; *r == ' ' || *r == '\t'; r--) {
1095  *r = '\0';
1096  if (r == q)
1097  break;
1098  }
1099  }
1100  res->name = StringSave(temp);
1101  *p = '(';
1102  ssp = new SetOfSyns;
1103  tssp = ssp;
1104  for (;;) {
1105  for (p++; *p == ' ' || *p == '\t';)
1106  p++;
1107  q = p;
1108  for (i = 1; *p != '\0'; p++) {
1109  if (*p == '(')
1110  i++;
1111  else if (*p == ')')
1112  i--;
1113  if (i == 0)
1114  break;
1115  }
1116  if (*p == '\0') {
1117  tssp->next = new SetOfSyns;
1118  tssp = tssp->next;
1119  tssp->synname = StringSave(q);
1120  break;
1121  }
1122  *p = '\0';
1123  if (p > q) {
1124  for (r = p - 1; *r == ' ' || *r == '\t'; r--) {
1125  *r = '\0';
1126  if (r == q)
1127  break;
1128  }
1129  }
1130  tssp->next = new SetOfSyns;
1131  tssp = tssp->next;
1132  tssp->synname = StringSave(q);
1133  *p = ')';
1134  p = StringChr(p, '(');
1135  if (! p)
1136  break;
1137  }
1138 
1139  res->syn = ssp->next;
1140  delete ssp;
1141  }
1142 
1143  MemFree(temp);
1144  return (res);
1145 }
1146 
1147 /**********************************************************/
1148 static void fix_taxname_dot(COrg_ref& org_ref)
1149 {
1150  if (! org_ref.IsSetTaxname())
1151  return;
1152 
1153  string& taxname = org_ref.SetTaxname();
1154 
1155  size_t len = taxname.size();
1156  if (len < 3)
1157  return;
1158 
1159  const Char* p = taxname.c_str() + len - 3;
1160  if ((p[0] == ' ' || p[0] == '\t') && (p[1] == 's' || p[1] == 'S') &&
1161  (p[2] == 'p' || p[2] == 'P') && p[3] == '\0') {
1162  if (NStr::CompareNocase(taxname.c_str(), "BACTERIOPHAGE SP") == 0)
1163  return;
1164 
1165  taxname += ".";
1166  }
1167 }
1168 
1169 /**********************************************************/
1171 {
1172  SetOfSynsPtr synsp;
1173 
1174  const char** b;
1175 
1176  char* p;
1177  char* q;
1178  Uint1 num;
1179  size_t i;
1180 
1181  CRef<COrg_ref> org_ref;
1182 
1183  if (! sosp)
1184  return org_ref;
1185 
1186  org_ref.Reset(new COrg_ref);
1187 
1188  if (sosp->name && sosp->name[0] != '\0')
1189  org_ref->SetTaxname(sosp->name);
1190 
1191  for (synsp = sosp->syn; synsp; synsp = synsp->next) {
1192  p = synsp->synname;
1193  if (! p || *p == '\0')
1194  continue;
1195 
1196  q = StringIStr(p, "PLASMID");
1197  if (! q)
1198  q = StringIStr(p, "CLONE");
1199  if (q) {
1200  i = (*q == 'C' || *q == 'c') ? 5 : 7;
1201  if (q > p) {
1202  q--;
1203  i++;
1204  }
1205  if ((q == p || q[0] == ' ' || q[0] == '\t') &&
1206  (q[i] == ' ' || q[i] == '\t' || q[i] == '\0')) {
1207  if (! org_ref->IsSetTaxname())
1208  org_ref->SetTaxname(p);
1209  else {
1210  string& taxname = org_ref->SetTaxname();
1211  taxname += " (";
1212  taxname += p;
1213  taxname += ")";
1214  }
1215  continue;
1216  }
1217  }
1218 
1219  if ((StringEquNI(p, "PV.", 3) && (p[3] == ' ' || p[3] == '\t' || p[3] == '\0')) ||
1220  NStr::CompareNocase(p, "AD11A") == 0 || NStr::CompareNocase(p, "AD11P") == 0) {
1221  if (! org_ref->IsSetTaxname())
1222  org_ref->SetTaxname(p);
1223  else {
1224  string& taxname = org_ref->SetTaxname();
1225  taxname += " (";
1226  taxname += p;
1227  taxname += ")";
1228  }
1229  continue;
1230  }
1231 
1232  for (q = p; *p != '\0' && *p != ' ' && *p != '\t';)
1233  p++;
1234  if (*p == '\0') {
1235  org_ref->SetSyn().push_back(q);
1236  continue;
1237  }
1238 
1239  *p = '\0';
1240  for (q = p + 1; *q == ' ' || *q == '\t';)
1241  q++;
1242 
1243  if (NStr::CompareNocase(synsp->synname, "COMMON") == 0) {
1244  if (! org_ref->IsSetCommon())
1245  org_ref->SetCommon(q);
1246  else
1247  org_ref->SetSyn().push_back(q);
1248  continue;
1249  }
1250 
1251  for (b = org_mods, num = 2; *b; b++, num++)
1252  if (NStr::CompareNocase(synsp->synname, *b) == 0)
1253  break;
1254  *p = ' ';
1255 
1256  if (! *b) {
1257  for (b = org_mods, num = 2; *b; b++, num++) {
1258  if (NStr::CompareNocase(*b, "ISOLATE") != 0 &&
1259  NStr::CompareNocase(*b, "STRAIN") != 0)
1260  continue;
1261  p = StringIStr(synsp->synname, *b);
1262  if (! p)
1263  continue;
1264 
1265  p--;
1266  i = StringLen(*b) + 1;
1267  if (*p == ' ' && (p[i] == ' ' || p[i] == '\t' || p[i] == '\0')) {
1268  string& taxname = org_ref->SetTaxname();
1269  taxname += " (";
1270  taxname += synsp->synname;
1271  taxname += ")";
1272  break;
1273  }
1274  }
1275 
1276  if (! *b)
1277  org_ref->SetSyn().push_back(synsp->synname);
1278  continue;
1279  }
1280 
1281  string& taxname = org_ref->SetTaxname();
1282  if (! taxname.empty())
1283  taxname += " ";
1284 
1285  taxname += "(";
1286  taxname += synsp->synname;
1287  taxname += ")";
1288  }
1289 
1290  fix_taxname_dot(*org_ref);
1291  if (org_ref->IsSetSyn() && org_ref->GetSyn().empty())
1292  org_ref->ResetSyn();
1293 
1294  return org_ref;
1295 }
1296 
1297 /**********************************************************/
1299 {
1300  SetOfSynsPtr ssp;
1301  SetOfSynsPtr tssp;
1302 
1303  if (sosp->fullname)
1304  MemFree(sosp->fullname);
1305  if (sosp->name)
1306  MemFree(sosp->name);
1307  for (ssp = sosp->syn; ssp; ssp = tssp) {
1308  tssp = ssp->next;
1309  if (ssp->synname)
1310  MemFree(ssp->synname);
1311  delete ssp;
1312  }
1313  delete sosp;
1314 }
1315 
1316 /**********************************************************/
1318 {
1319  ViralHostPtr vhp;
1320  ViralHostPtr tvhp;
1321  char* line;
1322  char* p;
1323  char* q;
1324  char* r;
1325  Char ch;
1326 
1327  for (; dbp; dbp = dbp->mpNext)
1328  if (dbp->mType == ParFlatSP_OS)
1329  break;
1330  if (! dbp)
1331  return nullptr;
1332 
1333  for (dbp = static_cast<DataBlk*>(dbp->mpData); dbp; dbp = dbp->mpNext)
1334  if (dbp->mType == ParFlatSP_OH)
1335  break;
1336  if (! dbp)
1337  return nullptr;
1338 
1339  vhp = new ViralHost;
1340  tvhp = vhp;
1341 
1342  line = StringNew(dbp->len + 1);
1343  ch = dbp->mOffset[dbp->len - 1];
1344  dbp->mOffset[dbp->len - 1] = '\0';
1345  line[0] = '\n';
1346  line[1] = '\0';
1347  StringCat(line, dbp->mOffset);
1348  dbp->mOffset[dbp->len - 1] = ch;
1349 
1350  if (! StringEquNI(line, "\nOH NCBI_TaxID=", 17)) {
1351  ch = '\0';
1352  p = StringChr(line + 1, '\n');
1353  if (p)
1354  *p = '\0';
1355  if (StringLen(line + 1) > 20) {
1356  ch = line[21];
1357  line[21] = '\0';
1358  }
1359  ErrPostEx(SEV_ERROR, ERR_SOURCE_UnknownOHType, "Unknown beginning of OH block: \"%s[...]\".", line + 1);
1360  if (ch != '\0')
1361  line[21] = ch;
1362  if (p)
1363  *p = '\n';
1364  }
1365 
1366  for (p = line;;) {
1367  p = StringIStr(p, "\nOH NCBI_TaxID=");
1368  if (! p)
1369  break;
1370  for (p += 17, q = p; *q == ' ';)
1371  q++;
1372  r = StringChr(q, '\n');
1373  p = StringChr(q, ';');
1374  if ((! r || r > p) && p) {
1375  tvhp->next = new ViralHost;
1376  tvhp = tvhp->next;
1377  for (p--; *p == ';' || *p == ' ';)
1378  p--;
1379  p++;
1380  for (r = q; *r >= '0' && *r <= '9';)
1381  r++;
1382  *p = '\0';
1383  if (r != p) {
1384  ErrPostEx(SEV_ERROR, ERR_SOURCE_InvalidNcbiTaxID, "Invalid NCBI TaxID in OH line : \"%s\".", q);
1385  tvhp->taxid = ZERO_TAX_ID;
1386  } else
1387  tvhp->taxid = TAX_ID_FROM(int, atoi(q));
1388  for (p++; *p == ' ' || *p == ';';)
1389  p++;
1390  r = StringChr(p, '\n');
1391  if (! r)
1392  r = p + StringLen(p);
1393  else
1394  r--;
1395  while ((*r == ' ' || *r == '.' || *r == '\0') && r > p)
1396  r--;
1397  if (*r != '\0' && *r != '.' && *r != ' ')
1398  r++;
1399  ch = *r;
1400  *r = '\0';
1401  tvhp->name = StringSave(p);
1402  ShrinkSpaces(tvhp->name);
1403  *r = ch;
1404  p = r;
1405  } else {
1406  if (r)
1407  *r = '\0';
1408  ErrPostEx(SEV_ERROR, ERR_SOURCE_IncorrectOHLine, "Incorrect OH line content skipped: \"%s\".", q);
1409  if (r)
1410  *r = '\n';
1411  p = q;
1412  }
1413  }
1414  MemFree(line);
1415 
1416  tvhp = vhp->next;
1417  delete vhp;
1418 
1419  if (! tvhp)
1420  ErrPostEx(SEV_WARNING, ERR_SOURCE_NoNcbiTaxIDLookup, "No legal NCBI TaxIDs found in OH line.");
1421 
1422  return (tvhp);
1423 }
1424 
1425 /**********************************************************/
1427 {
1428  DataBlkPtr subdbp;
1429  char* line;
1430  char* p;
1431  char* q;
1432  bool got;
1433  TTaxId taxid;
1434 
1435  for (got = false, taxid = ZERO_TAX_ID; dbp; dbp = dbp->mpNext) {
1436  if (dbp->mType != ParFlatSP_OS)
1437  continue;
1438 
1439  subdbp = static_cast<DataBlk*>(dbp->mpData);
1440  for (; subdbp; subdbp = subdbp->mpNext) {
1441  if (subdbp->mType != ParFlatSP_OX)
1442  continue;
1443  got = true;
1444  line = StringSave(string_view(subdbp->mOffset, subdbp->len - 1));
1445  p = StringChr(line, '\n');
1446  if (p)
1447  *p = '\0';
1448  if (! StringEquNI(line, "OX NCBI_TaxID=", 16)) {
1449  if (StringLen(line) > 20)
1450  line[20] = '\0';
1451  ErrPostEx(SEV_ERROR, ERR_SOURCE_UnknownOXType, "Unknown beginning of OX line: \"%s\".", line);
1452  MemFree(line);
1453  break;
1454  }
1455  p = StringChr(line + 16, ';');
1456  if (p) {
1457  *p++ = '\0';
1458  for (q = p; *q == ' ';)
1459  q++;
1460  if (*q != '\0') {
1461  ErrPostEx(SEV_ERROR, ERR_FORMAT_UnexpectedData, "Encountered unexpected data while parsing OX line: \"%s\" : Ignored.", p);
1462  }
1463  }
1464  for (p = line + 16; *p == ' ';)
1465  p++;
1466  if (*p == '\0') {
1467  MemFree(line);
1468  break;
1469  }
1470  for (q = p; *q >= '0' && *q <= '9';)
1471  q++;
1472  if (*q == ' ' || *q == '\0')
1473  taxid = TAX_ID_FROM(int, atoi(p));
1474  if (taxid <= ZERO_TAX_ID || (*q != ' ' && *q != '\0')) {
1475  ErrPostEx(SEV_ERROR, ERR_SOURCE_InvalidNcbiTaxID, "Invalid NCBI TaxID on OX line : \"%s\" : Ignored.", p);
1476  }
1477  MemFree(line);
1478  break;
1479  }
1480  break;
1481  }
1482 
1483  if (got && taxid <= ZERO_TAX_ID)
1484  ErrPostEx(SEV_WARNING, ERR_SOURCE_NoNcbiTaxIDLookup, "No legal NCBI TaxID found on OX line : will use organism names for lookup instead.");
1485 
1486  return (taxid);
1487 }
1488 
1489 /**********************************************************/
1491 {
1492  SetOfSpeciesPtr sosp;
1493  DataBlkPtr dbp;
1494  char* line_OS;
1495  char* line_OC;
1496 
1497  CRef<COrg_ref> org_ref;
1498 
1499  line_OS = nullptr;
1500  line_OC = nullptr;
1501 
1502  for (dbp = entry; dbp; dbp = dbp->mpNext) {
1503  if (dbp->mType != ParFlatSP_OS)
1504  continue;
1505  line_OS = GetLineOSorOC(dbp, "OS ");
1506  for (dbp = static_cast<DataBlk*>(dbp->mpData); dbp; dbp = dbp->mpNext) {
1507  if (dbp->mType != ParFlatSP_OC)
1508  continue;
1509  line_OC = GetLineOSorOC(dbp, "OC ");
1510  break;
1511  }
1512  break;
1513  }
1514 
1515  if (line_OS && line_OS[0] != '\0') {
1516  sosp = GetSetOfSpecies(line_OS);
1517  if (sosp && sosp->name && sosp->name[0] != '\0') {
1518  org_ref = fill_orgref(sosp);
1519  }
1520 
1521  SetOfSpeciesFree(sosp);
1522  MemFree(line_OS);
1523  }
1524 
1525  if (org_ref.NotEmpty() && line_OC && line_OC[0] != '\0') {
1526  org_ref->SetOrgname().SetLineage(line_OC);
1527  MemFree(line_OC);
1528  }
1529 
1530  return org_ref;
1531 }
1532 
1533 /**********************************************************/
/* Appends to `plasms` the plasmid names found after the word "PLASMID"
 * in the OG sub-block of the entry's OS block(s). Reports
 * ERR_SOURCE_MissingPlasmidName when "PLASMID" is not followed by a name
 * of at least two characters.
 *
 * NOTE(review): nothing in this listing assigns gmod after its
 * initialisation to -1, so as shown the function returns before the
 * extraction loop unless Seq_descr_GIBB_mod_plasmid is -1. Original line
 * 1556 is absent from the listing and presumably sets gmod - confirm
 * against the repository.
 */
1534 static void get_plasmid(DataBlkPtr entry, CSP_block::TPlasnm& plasms)
1535 {
1536  DataBlkPtr dbp;
1537  DataBlkPtr subdbp;
1538  char* offset = nullptr;
1539  char* eptr = nullptr;
1540  char* str;
1541  char* ptr;
1542  Int4 gmod = -1;
1543 
1544  dbp = TrackNodeType(*entry, ParFlatSP_OS);
1545  for (; dbp; dbp = dbp->mpNext) {
1546  if (dbp->mType != ParFlatSP_OS)
1547  continue;
1548 
1549  subdbp = static_cast<DataBlk*>(dbp->mpData);
1550  for (; subdbp; subdbp = subdbp->mpNext) {
1551  if (subdbp->mType != ParFlatSP_OG)
1552  continue;
1553 
 /* offset: start of the OG data past the line tag columns;
  * eptr: upper limit for the search below. If several OG sub-blocks
  * exist, the values of the last one are kept. */
1554  offset = subdbp->mOffset + ParFlat_COL_DATA_SP;
1555  eptr = offset + subdbp->len;
1557  }
1558  }
1559  if (gmod != Seq_descr_GIBB_mod_plasmid)
1560  return;
1561 
 /* Each pass handles one occurrence of "PLASMID" (any letter case). */
1562  while ((str = StringIStr(offset, "PLASMID"))) {
1563  if (str > eptr)
1564  break;
1565 
1566  str += StringLen("PLASMID");
1567  while (*str == ' ')
1568  str++;
1569 
 /* The name ends at the next blank or newline; the scan does not test
  * for the terminating NUL. */
1570  for (ptr = str; *ptr != '\n' && *ptr != ' ';)
1571  ptr++;
 /* Step back one character so that the last character before the
  * blank/newline is left out of the name - presumably the '.', ',' or
  * ';' that follows it; TODO confirm. */
1572  ptr--;
1573  if (ptr > str) {
1574  plasms.push_back(string(str, ptr));
1575  } else
1576  ErrPostEx(SEV_ERROR, ERR_SOURCE_MissingPlasmidName, "Plasmid name is missing from OG line of SwissProt record.");
 /* Continue the search after this name. */
1577  offset = ptr;
1578  }
1579 }
1580 
/**********************************************************
 *
 *   static char* GetDRToken(ptr):
 *
 *      Cuts the next field off a DR line. A field ends at
 *   a ';' or '.' that is followed by a blank or a newline.
 *   The terminator is overwritten with NUL, *ptr is moved
 *   past it and past any following blanks, ';' and '.',
 *   and the start of the field is returned.
 *      Returns nullptr, leaving *ptr unchanged, when *ptr
 *   is null or empty or when the line ends before any
 *   terminator. Returns nullptr with *ptr advanced when
 *   the field is empty.
 *
 *      From GetTheCurrentToken.
 *
 **********************************************************/
static char* GetDRToken(char** ptr)
{
    char* const start = *ptr;
    if (! start || *start == '\0')
        return nullptr;

    /* Look for the terminator without leaving the current line. */
    char* cur = start;
    while (*cur != '\0' && *cur != '\n') {
        const bool punct = (*cur == ';' || *cur == '.');
        if (punct && (cur[1] == ' ' || cur[1] == '\n'))
            break;
        cur++;
    }

    /* The line ran out first: no field, caller's position is kept. */
    if (*cur == '\0' || *cur == '\n')
        return nullptr;

    *cur = '\0';
    cur++;
    while (*cur == ' ' || *cur == ';' || *cur == '.')
        cur++;
    *ptr = cur;

    return (*start == '\0') ? nullptr : start;
}
1620 /**********************************************************/
1621 static CRef<CSeq_id> AddPIDToSeqId(char* str, char* acc)
1622 {
1623  long long lID;
1624  char* end = nullptr;
1625 
1626  CRef<CSeq_id> sid;
1627 
1628  if (! str || *str == '\0')
1629  return sid;
1630 
1631  if (str[0] == '-') {
1632  ErrPostEx(SEV_WARNING, ERR_SPROT_DRLine, "Not annotated CDS [ACC:%s, PID:%s]", acc, str);
1633  return sid;
1634  }
1635  errno = 0; /* clear errors, the error flag from stdlib */
1636  lID = strtoll(str + 1, &end, 10);
1637  if ((lID == 0 && str + 1 == end) || (lID == LLONG_MAX && errno == ERANGE)) {
1638  /* Bad or too large number
1639  */
1640  ErrPostEx(SEV_WARNING, ERR_SPROT_DRLine, "Invalid PID value [ACC:%s, PID:%s]", acc, str);
1641  return sid;
1642  }
1643 
1644  if (*str == 'G') {
1645  sid.Reset(new CSeq_id);
1646  sid->SetGi(GI_FROM(long long, lID));
1647  } else if (*str == 'E' || *str == 'D') {
1648  CRef<CDbtag> tag(new CDbtag);
1649  tag->SetDb("PID");
1650  tag->SetTag().SetStr(str);
1651 
1652  sid.Reset(new CSeq_id);
1653  sid->SetGeneral(*tag);
1654  } else {
1655  ErrPostEx(SEV_WARNING, ERR_SPROT_DRLine, "Unrecognized PID data base type [ACC:%s, PID:%s]", acc, str);
1656  }
1657  return sid;
1658 }
1659 
1660 /**********************************************************/
/* Records an identifier in the list *head unless it is already there.
 *
 * Returns false when str is null or when an equal string is already in
 * the list; returns true otherwise, including for the placeholder "-",
 * which is accepted without being stored. When str has the form
 * "accession.version" and the list already holds the same accession with
 * a different version, a warning is posted and processing continues.
 *
 * NOTE(review): original line 1696 is absent from this listing. As
 * shown, vnp is always null when ValNodeLink() is reached (both loops
 * above leave it null on this path), so the missing statement presumably
 * creates the node that holds a copy of str - confirm against the
 * repository.
 */
1661 static bool AddToList(ValNodePtr* head, char* str)
1662 {
1663  ValNodePtr vnp;
1664  char* data;
1665  char* dot;
1666  char* d;
1667 
1668  if (! str)
1669  return false;
1670 
 /* "-" stands for "no value": nothing to store, not a duplicate. */
1671  if (str[0] == '-' && str[1] == '\0')
1672  return true;
1673 
 /* Exact duplicate (same accession and version)? */
1674  dot = StringChr(str, '.');
1675  for (vnp = *head; vnp; vnp = vnp->next)
1676  if (StringEqu(vnp->data, str))
1677  break;
1678  if (vnp)
1679  return false;
1680 
1681  if (dot) {
 /* Compare the accessions alone: both strings are cut at their '.'
  * for the comparison and restored right after it. */
1682  *dot = '\0';
1683  for (vnp = *head; vnp; vnp = vnp->next) {
1684  data = vnp->data;
1685  d = StringChr(data, '.');
1686  if (! d)
1687  continue;
1688  *d = '\0';
1689  if (StringEqu(data, str)) {
1690  ErrPostEx(SEV_WARNING, ERR_SPROT_DRLine, "Same protein accessions with different versions found in DR line [PID1:%s.%s; PID2:%s.%s].", data, d + 1, str, dot + 1);
1691  }
1692  *d = '.';
1693  }
1694  *dot = '.';
1695  }
1697  ValNodeLink(head, vnp);
1698 
1699  return true;
1700 }
1701 
1702 /**********************************************************/
1704 {
1705  for (CSP_block::TSeqref::iterator cur_ref = refs.begin(); cur_ref != refs.end(); ++cur_ref) {
1706  if ((*cur_ref)->Which() != CSeq_id::e_Pdb || (*cur_ref)->GetPdb().IsSetRel())
1707  continue;
1708 
1709  bool got = false;
1710 
1711  const CPDB_seq_id& cur_id = (*cur_ref)->GetPdb();
1712  CSP_block::TSeqref::iterator next_ref = cur_ref;
1713 
1714  for (++next_ref; next_ref != refs.end();) {
1715  if ((*next_ref)->Which() != CSeq_id::e_Pdb ||
1716  (*next_ref)->GetPdb().IsSetRel())
1717  continue;
1718 
1719  const CPDB_seq_id& next_id = (*next_ref)->GetPdb();
1720 
1721  if (cur_id.GetMol().Get() != next_id.GetMol().Get()) {
1722  ++next_ref;
1723  continue;
1724  }
1725 
1726  if (next_id.GetChain() != 32) {
1727  if (! got && cur_id.GetChain() == 32) {
1728  got = true;
1729  /* Commented out until the proper handling of PDB chain contents
1730  ErrPostEx(SEV_WARNING, ERR_FORMAT_DuplicateCrossRef,
1731  "Duplicate PDB cross reference removed, mol = \"%s\", chain = \"%d\".",
1732  psip1->mol, (int) psip1->chain);
1733 */
1734  }
1735  if (cur_id.GetChain() != next_id.GetChain()) {
1736  ++next_ref;
1737  continue;
1738  }
1739  }
1740 
1741  next_ref = refs.erase(next_ref);
1742  /* Commented out until the proper handling of PDB chain contents
1743  ErrPostEx(SEV_WARNING, ERR_FORMAT_DuplicateCrossRef,
1744  "Duplicate PDB cross reference removed, mol = \"%s\", chain = \"%d\".",
1745  psip2->mol, (int) psip2->chain);
1746 */
1747  }
1748  }
1749 }
1750 
1751 /**********************************************************/
1752 static void fta_check_embl_drxref_dups(ValNodePtr embl_acc_list)
1753 {
1754  ValNodePtr vnp;
1755  ValNodePtr vnpn;
1756  const char* n;
1757  const char* p;
1758  const char* q;
1759 
1760  if (! embl_acc_list || ! embl_acc_list->next->next)
1761  return;
1762 
1763  for (vnp = embl_acc_list; vnp; vnp = vnp->next->next) {
1764  p = vnp->data;
1765  q = StringChr(p, '.');
1766  if (q) {
1767  for (p = q + 1; *p >= '0' && *p <= '9';)
1768  p++;
1769  if (*p != '\0')
1770  q = nullptr;
1771  p = vnp->data;
1772  }
1773  n = vnp->next->data;
1774  for (vnpn = vnp->next->next; vnpn; vnpn = vnpn->next->next) {
1775  if (vnp->next->choice != vnpn->next->choice &&
1776  StringEqu(p, vnpn->data)) {
1777  if (GetProtAccOwner(q ? CTempString(p, q - p) : CTempString(p)) > CSeq_id::e_not_set)
1778  ErrPostEx(SEV_WARNING, ERR_SPROT_DRLineCrossDBProtein, "Protein accession \"%s\" associated with \"%s\" and \"%s\".", vnpn->data, n, vnpn->next->data);
1779  }
1780  }
1781  }
1782 }
1783 
1784 /**********************************************************
1785  *
1786  * static void GetDRlineDataSP(entry, spb, drop, source):
1787  *
1788  * Database identifiers on the DR lines which point to
1789  * entries in GenBank, EMBL, DDBJ, PIR, or PDB are output
1790  * as Seq-id's of the appropriate type:
1791  * - For GenBank and DDBJ, only the primary identifier
1792  * (accession number) is captured; and their database
1793  * references are actually labelled as "EMBL". Their
1794  * true nature is determined by the accession number
1795  * ownership rules described by accession prefix.
1796  * - For EMBL, both the primary and secondary
1797  * identifiers are captured.
1798  * - For PIR, we only capture the secondary
1799  * identifier (name).
1800  * - For PDB, we capture both the primary identifier
1801  * (molecule name) and the secondary identifier
1802  * (release date).
1803  * example:
1804  * DR EMBL; J05536; RNPCBB.
1805  * DR EMBL; X51318; RN10SP.
1806  * DR PIR; A36581; A36581.
1807  * DR PDB; 1CCD; PRELIMINARY.
1808  * Release 33.0 Cross-references to EMBL/GenBank/DDBJ
1809  *
1810  * DR EMBL; X51318; G63880; -.
1811  *
1812  * seqref {
1813  * genbank {
1814  * accession "J05536" } ,
1815  * embl {
1816  * name "RN10SP" ,
1817  * accession "X51318" } ,
1818  * pir {
1819  * name "A36581" } ,
1820  * pdb {
1821  * mol "1CCD" ,
1822  * rel
1823  * str "PRELIMINARY" }
1824  *
1825  * Release 33.0
1826  *
1827  * seqref {
1828  * gi 63880 ,
1829  * } ,
1830  *
1831  * All other databank references are output using Dbtag.
1832  * In these cases, secondary identifiers, whether
1833  * entry-names, release numbers, or date-stamps, are not
1834  * captured since Dbtag has no provision for them.
1835  * example:
1836  * DR PROSITE; PS00403; UTEROGLOBIN_1.
1837  * DR PROSITE; PS00404; UTEROGLOBIN_2.
1838  * dbref {
1839  * {
1840  * db "PROSITE" ,
1841  * tag
1842  * str "PS00403" } ,
1843  * {
1844  * db "PROSITE" ,
1845  * tag
1846  * str "PS00404" } } ,
1847  *
1848  * Also need to delete duplicated DR line.
1849  *
1850  **********************************************************/
/* Parses the DR lines of the entry into spb: sequence ids go to
 * spb.SetSeqref(), all other cross-references go to spb.SetDbref() as
 * Dbtags. Both containers are reset first. *drop is set to true when old
 * and new style PDB cross-references are mixed; parsing also stops as
 * soon as *drop is true.
 *
 * NOTE(review): original lines 1943 and 2107 are absent from this
 * listing (see the notes at the places concerned).
 */
1851 static void GetDRlineDataSP(DataBlkPtr entry, CSP_block& spb, bool* drop, Parser::ESource source)
1852 {
1853  ValNodePtr embl_vnp;
1854  ValNodePtr acc_list = nullptr;
1855  ValNodePtr pid_list = nullptr;
1856  ValNodePtr ens_tran_list = nullptr;
1857  ValNodePtr ens_prot_list = nullptr;
1858  ValNodePtr ens_gene_list = nullptr;
1859  ValNodePtr embl_acc_list = nullptr;
1860  const char** b;
1861  char* offset;
1862  const char* token1;
1863  char* token2;
1864  char* token3;
1865  char* token4;
1866  char* token5;
1867  char* str;
1868  char* ptr;
1869  char* p;
1870  char* q;
1871  bool pdbold;
1872  bool pdbnew;
1873  bool check_embl_prot;
1874  size_t len = 0;
1875  Char ch;
1876 
1877  CSeq_id::E_Choice ptype;
1878  CSeq_id::E_Choice ntype;
1879 
1880  spb.ResetSeqref();
1881  spb.ResetDbref();
1882 
1883  offset = SrchNodeType(entry, ParFlatSP_DR, &len);
1884  if (! offset)
1885  return;
1886 
 /* Work on a private copy of the DR text that starts with '\n', so that
  * every line, the first included, is found by searching for a newline.
  * The tokenizer writes NULs into this copy. */
1887  ch = offset[len];
1888  offset[len] = '\0';
1889  str = StringNew(len + 1);
1890  StringCpy(str, "\n");
1891  StringCat(str, offset);
1892  offset[len] = ch;
1893  pdbold = false;
1894  pdbnew = false;
 /* embl_acc_list starts with an empty head node; the pairs appended
  * behind it (protein id node, then nucleotide accession node) are what
  * fta_check_embl_drxref_dups() receives below. */
1895  embl_acc_list = ValNodeNew(nullptr);
1896  embl_vnp = embl_acc_list;
1897  check_embl_prot = false;
1898  for (ptr = str;;) {
1899  if (*drop)
1900  break;
1901  ptr = StringChr(ptr, '\n');
1902  if (! ptr)
1903  break;
1904  ptr++;
 /* NOTE(review): five characters are compared but the literal shown is
  * three characters long; runs of blanks appear to be collapsed in this
  * listing, so this is presumably the 5-column "DR" line tag - confirm. */
1905  if (! StringEquN(ptr, "DR ", 5))
1906  continue;
1907  ptr += ParFlat_COL_DATA_SP;
 /* Up to five fields of the line: token1 is the database name, the
  * others are its identifiers. Fields that are absent are nullptr. */
1908  token1 = GetDRToken(&ptr);
1909  token2 = GetDRToken(&ptr);
1910  token3 = GetDRToken(&ptr);
1911  token4 = GetDRToken(&ptr);
1912  token5 = GetDRToken(&ptr);
1913  if (! token1 || ! token2 || ! token3 ||
1914  (StringEqu(token2, "-") && StringEqu(token3, "-"))) {
1915  ErrPostEx(SEV_ERROR, ERR_SPROT_DRLine, "Badly formatted DR line. Skipped.");
1916  continue;
1917  }
1918 
1919  if (NStr::CompareNocase(token1, "MD5") == 0)
1920  continue;
1921 
 /* A database name that is in neither table, or only in the obsolete
  * one, is reported; the line is processed in either case. */
1922  for (b = valid_dbs; *b; b++)
1923  if (NStr::CompareNocase(*b, token1) == 0)
1924  break;
1925  if (! *b) {
1926  for (b = obsolete_dbs; *b; b++)
1927  if (NStr::CompareNocase(*b, token1) == 0)
1928  break;
1929  if (! *b)
1930  ErrPostEx(SEV_WARNING, ERR_DRXREF_UnknownDBname, "Encountered a new/unknown database name in DR line: \"%s\".", token1);
1931  else
1932  ErrPostEx(SEV_WARNING, ERR_SPROT_DRLine, "Obsolete database name found in DR line: \"%s\".", token1);
1933  }
1934 
1935  if (NStr::CompareNocase(token1, "PDB") == 0) {
 /* A PDB line without a fourth field counts as old style, one with
  * it as new style. */
1936  if (! token4)
1937  pdbold = true;
1938  else
1939  pdbnew = true;
1940 
1941  MakePDBSeqId(spb.SetSeqref(), token2, token3, token5 ? token5 : token4, drop, source);
1942  } else if (NStr::CompareNocase(token1, "PIR") == 0) {
 /* NOTE(review): `id` is used here but its declaration (original line
  * 1943) is absent from this listing - confirm against the repository. */
1944  if (id.NotEmpty())
1945  spb.SetSeqref().push_back(id);
1946  } else if (NStr::CompareNocase(token1, "EMBL") == 0) {
 /* token2: nucleotide accession[.version]; token3: protein id. */
1947  p = StringChr(token2, '.');
1948  ntype = GetNucAccOwner(p ? CTempString(token2, p - token2) : CTempString(token2));
1949  if (ntype == CSeq_id::e_not_set) {
1950  ErrPostEx(SEV_ERROR, ERR_SPROT_DRLine, "Incorrect NA accession is used in DR line: \"%s\". Skipped...", token2);
1951  } else if (AddToList(&acc_list, token2)) {
1952  CRef<CSeq_id> id(MakeAccSeqId(token2, ntype, p ? true : false,
1953  p ? (Int2) atoi(p + 1) : 0));
1954  if (id.NotEmpty())
1955  spb.SetSeqref().push_back(id);
1956  }
 /* From here on token2 is the accession without its version. */
1957  if (p)
1958  *p = '\0';
1959 
 /* A protein id that starts with two capital letters must be a
  * recognised accession with a numeric version; after this block p
  * points at its '.' in that case and is nullptr otherwise. */
1960  ptype = CSeq_id::e_not_set;
1961  if (token3[0] >= 'A' && token3[0] <= 'Z' &&
1962  token3[1] >= 'A' && token3[1] <= 'Z') {
1963  p = StringChr(token3, '.');
1964  if (p) {
1965  ptype = GetProtAccOwner(CTempString(token3, p - token3));
1966  for (q = p + 1; *q >= '0' && *q <= '9';)
1967  q++;
1968  if (q == p + 1 || *q != '\0')
1969  p = nullptr;
1970  }
1971  if (! p || ptype == CSeq_id::e_not_set) {
1972  ErrPostEx(SEV_ERROR, ERR_SPROT_DRLine, "Incorrect protein accession is used in DR line [ACC:%s; PID:%s]. Skipped...", token2, token3);
1973  continue;
1974  }
1975  } else
1976  p = nullptr;
1977 
1978  if (ntype > CSeq_id::e_not_set) {
1979  embl_vnp->next = ConstructValNode(ptype, token3);
1980  embl_vnp = embl_vnp->next;
1981  embl_vnp->next = ConstructValNode(ntype, token2);
1982  embl_vnp = embl_vnp->next;
1983  }
1984 
 /* A protein id seen before is not added again; the cross-database
  * check after the loop is requested instead. */
1985  if (! AddToList(&pid_list, token3)) {
1986  check_embl_prot = true;
1987  continue;
1988  }
1989 
1990  CRef<CSeq_id> id;
1991  if (! p)
1992  id = AddPIDToSeqId(token3, token2);
1993  else {
1994  *p++ = '\0';
1995  id = MakeAccSeqId(token3, ptype, true, (Int2)atoi(p));
1996  }
1997 
1998  if (id.NotEmpty())
1999  spb.SetSeqref().push_back(id);
2000  } else if (NStr::CompareNocase(token1, "ENSEMBL") == 0 ||
2001  NStr::CompareNocase(token1, "ENSEMBLBACTERIA") == 0 ||
2002  NStr::CompareNocase(token1, "ENSEMBLFUNGI") == 0 ||
2003  NStr::CompareNocase(token1, "ENSEMBLMETAZOA") == 0 ||
2004  NStr::CompareNocase(token1, "ENSEMBLPLANTS") == 0 ||
2005  NStr::CompareNocase(token1, "ENSEMBLPROTISTS") == 0 ||
2006  NStr::CompareNocase(token1, "WORMBASE") == 0) {
 /* token2, token3 and the optional token4 each become a Dbtag the
  * first time they are seen; the three ens_*_list lists hold the
  * values already used. */
2007  if (AddToList(&ens_tran_list, token2)) {
2008  CRef<CDbtag> tag = MakeStrDbtag(token1, token2);
2009  if (tag.NotEmpty())
2010  spb.SetDbref().push_back(tag);
2011  }
2012 
2013  if (! AddToList(&ens_prot_list, token3)) {
2014  ErrPostEx(SEV_WARNING, ERR_SPROT_DRLine, "Duplicated protein id \"%s\" in \"%s\" DR line.", token3, token1);
2015  } else {
2016  CRef<CDbtag> tag = MakeStrDbtag(token1, token3);
2017  if (tag.NotEmpty())
2018  spb.SetDbref().push_back(tag);
2019  }
2020 
2021  if (token4 && AddToList(&ens_gene_list, token4)) {
2022  CRef<CDbtag> tag = MakeStrDbtag(token1, token4);
2023  if (tag.NotEmpty())
2024  spb.SetDbref().push_back(tag);
2025  }
2026  } else if (NStr::CompareNocase(token1, "REFSEQ") == 0) {
 /* token2 has to be accession.version with a numeric version and an
  * accession for which GetProtAccOwner() returns e_Other. */
2027  ptype = CSeq_id::e_not_set;
2028  if (token2[0] >= 'A' && token2[0] <= 'Z' &&
2029  token2[1] >= 'A' && token2[1] <= 'Z') {
2030  p = StringChr(token2, '.');
2031  if (p) {
2032  ptype = GetProtAccOwner(CTempString(token2, p - token2));
2033  for (q = p + 1; *q >= '0' && *q <= '9';)
2034  q++;
2035  if (q == p + 1 || *q != '\0')
2036  p = nullptr;
2037  }
2038  if (ptype != CSeq_id::e_Other)
2039  p = nullptr;
2040  } else
2041  p = nullptr;
2042 
2043  if (! p) {
2044  ErrPostEx(SEV_ERROR, ERR_SPROT_DRLine, "Incorrect protein accession.version is used in RefSeq DR line: \"%s\". Skipped...", token2);
2045  continue;
2046  }
2047 
2048  if (! AddToList(&pid_list, token2))
2049  continue;
2050 
2051  *p++ = '\0';
2052  CRef<CSeq_id> id(MakeAccSeqId(token2, ptype, true, (Int2)atoi(p)));
2053  if (id.NotEmpty())
2054  spb.SetSeqref().push_back(id);
2055  } else {
 /* Every other database becomes a Dbtag. Some names are replaced
  * first, and a leading "PomBase:" is dropped from a PomBase id. */
2056  if (NStr::CompareNocase(token1, "GK") == 0)
2057  token1 = "Reactome";
2058  else if (NStr::CompareNocase(token1, "GENEW") == 0)
2059  token1 = "HGNC";
2060  else if (NStr::CompareNocase(token1, "GeneDB_Spombe") == 0)
2061  token1 = "PomBase";
2062  else if (NStr::CompareNocase(token1, "PomBase") == 0 &&
2063  StringEquNI(token2, "PomBase:", 8))
2064  token2 += 8;
2065 
2066  CRef<CDbtag> tag = MakeStrDbtag(token1, token2);
2067  if (tag.NotEmpty()) {
2068  bool not_found = true;
2069 
 /* Add the tag only if no matching tag is stored already. */
2070  for (const auto& cur_tag : spb.SetDbref()) {
2071  if (tag->Match(*cur_tag)) {
2072  not_found = false;
2073  break;
2074  }
2075  }
2076  if (not_found)
2077  spb.SetDbref().push_back(tag);
2078  }
2079  }
2080  }
2081 
2082  if (embl_acc_list->next) {
2083  if (check_embl_prot)
2084  fta_check_embl_drxref_dups(embl_acc_list->next);
2085  ValNodeFreeData(embl_acc_list->next);
2086  }
2087  delete embl_acc_list;
2088 
2089  if (acc_list)
2090  ValNodeFreeData(acc_list);
2091  if (pid_list)
2092  ValNodeFreeData(pid_list);
2093  if (ens_tran_list)
2094  ValNodeFreeData(ens_tran_list);
2095  if (ens_prot_list)
2096  ValNodeFreeData(ens_prot_list);
2097  if (ens_gene_list)
2098  ValNodeFreeData(ens_gene_list);
2099  MemFree(str);
2100 
2101  if (pdbold && pdbnew) {
2102  ErrPostEx(SEV_REJECT, ERR_FORMAT_MixedPDBXrefs, "Both old and new types of PDB cross-references exist on this record. Only one style is allowed.");
2103  *drop = true;
2104  }
2105 
 /* NOTE(review): the statement controlled by this `if` (original line
  * 2107) is absent from this listing; presumably it removes duplicate
  * PDB ids from the seqref list - confirm against the repository. */
2106  if (pdbnew && spb.SetSeqref().size() > 1)
2108 }
2109 
2110 /**********************************************************
2111  *
2112  * static bool GetSPDate(pp, entry, crdate, sequpd,
2113  * annotupd, ver_num):
2114  *
2115  * Contain three lines in order created, last sequence
2116  * update, last annotation update.
2117  *
2118  * 9-30-93
2119  *
2120  **********************************************************/
/*
 * GetSPDate: reads the DT lines of an entry and extracts the creation date,
 * the last sequence update date and the last annotation update date.
 *
 * pp       - parser context; only pp->source is read here (it is passed on
 *            to GetUpdateDate()).
 * entry    - entry whose DT block is located with SrchNodeType().
 * crdate, sequpd, annotupd
 *          - receive the parsed dates via SetStd(); written only when every
 *            check below passes.
 * ver_num  - optional; reset to 0 on entry and, for new-style DT lines, set
 *            from the number that follows "sequence version".
 *
 * Returns true when the entry has no DT block at all, or when all three
 * dates were found exactly once and parsed. Otherwise a SEV_REJECT message
 * is posted and false is returned.
 *
 * NOTE(review): the embedded listing numbers skip 2177 and 2207 (directly
 * after each "offset = tvnp->data;"), so a statement may be missing from
 * this listing at those two points - confirm against the repository copy.
 */
2121 static bool GetSPDate(ParserPtr pp, DataBlkPtr entry, CDate& crdate, CDate& sequpd, CDate& annotupd, short* ver_num)
2122 {
2123  ValNodePtr vnp;
2124  ValNodePtr tvnp;
2125  char* offset;
2126  char* p;
2127  char* q;
2128  bool new_style;
2129  bool ret;
2130  Char ch;
2131  Int4 first;
2132  Int4 second;
2133  Int4 third;
2134  size_t len;
2135 
2136  CRef<CDate_std> std_crdate,
2137  std_sequpd,
2138  std_annotupd;
2139 
2140  if (ver_num)
2141  *ver_num = 0;
2142 
2143  offset = SrchNodeType(entry, ParFlatSP_DT, &len);
2144  if (! offset)
2145  return true;
2146 
 /* Temporarily NUL-terminate the DT block, then split it into one list
  * node per line. The first node created here is only a list head; it is
  * detached and deleted right after the loop. Each '\n' is overwritten
  * with '\0' while the line is handed to ValNodeNew() and restored after. */
2147  ch = offset[len];
2148  offset[len] = '\0';
2149  vnp = ValNodeNew(nullptr);
2150  for (q = offset, tvnp = vnp;;) {
2151  p = StringChr(q, '\n');
2152  if (p == q)
2153  break;
2154  if (p)
2155  *p = '\0';
2156  tvnp = ValNodeNew(tvnp, q);
2157  if (! p)
2158  break;
2159  *p++ = '\n';
2160  q = p;
2161  if (*q == '\0')
2162  break;
2163  }
2164  offset[len] = ch;
2165  tvnp = vnp->next;
2166  vnp->next = nullptr;
2167  delete vnp;
2168  vnp = tvnp;
2169 
 /* first/second/third count the created / sequence-update /
  * annotation-update lines seen. */
2170  first = 0;
2171  second = 0;
2172  third = 0;
 /* A first DT line without '(' selects the new-style wording.
  * NOTE(review): vnp is null here if the loop above collected no line
  * (block starting with '\n'); vnp->data would then be dereferenced
  * through a null pointer - confirm this cannot happen for real input. */
2173  if (! StringChr(vnp->data, '(')) {
2174  new_style = true;
2175  for (tvnp = vnp; tvnp; tvnp = tvnp->next) {
2176  offset = tvnp->data;
2178  if (StringIStr(offset, "integrated into")) {
2179  first++;
2180  std_crdate = GetUpdateDate(offset, pp->source);
2181  } else if (StringIStr(offset, "entry version")) {
2182  third++;
2183  std_annotupd = GetUpdateDate(offset, pp->source);
2184  } else {
2185  p = StringIStr(offset, "sequence version");
2186  if (p) {
2187  second++;
2188  std_sequpd = GetUpdateDate(offset, pp->source);
2189  if (ver_num) {
 /* Skip the 16 characters of "sequence version" plus blanks, then
  * accept the digits only when they are followed by a '.' that ends
  * the line; the '.' is zeroed just for the atoi() call. */
2190  for (p += 16; *p == ' ';)
2191  p++;
2192  for (q = p; *p >= '0' && *p <= '9';)
2193  p++;
2194  if (*p == '.' && p[1] == '\0') {
2195  *p = '\0';
2196  *ver_num = atoi(q);
2197  *p = '.';
2198  }
2199  }
2200  }
2201  }
2202  }
2203  } else {
 /* Old-style wording: "Created", "Last sequence update",
  * "Last annotation update". */
2204  new_style = false;
2205  for (tvnp = vnp; tvnp; tvnp = tvnp->next) {
2206  offset = tvnp->data;
2208  if (StringIStr(offset, "Created")) {
2209  first++;
2210  std_crdate = GetUpdateDate(offset, pp->source);
2211  } else if (StringIStr(offset, "Last annotation update")) {
2212  third++;
2213  std_annotupd = GetUpdateDate(offset, pp->source);
2214  } else if (StringIStr(offset, "Last sequence update")) {
2215  second++;
2216  std_sequpd = GetUpdateDate(offset, pp->source);
2217  }
2218  }
2219  }
2220 
2221  ValNodeFreeData(vnp);
2222 
 /* Validation: only the first failing condition is reported. Each of the
  * three line kinds must occur exactly once and must have yielded a date;
  * when a version was requested it must be at least 1. */
2223  ret = true;
2224  if (first == 0) {
2225  ErrPostEx(SEV_REJECT, ERR_FORMAT_Date, "Missing required \"%s\" DT line.", (new_style ? "integrated into" : "Created"));
2226  ret = false;
2227  } else if (first > 1) {
2228  ErrPostEx(SEV_REJECT, ERR_FORMAT_Date, "Multiple \"%s\" DT lines are present.", (new_style ? "integrated into" : "Created"));
2229  ret = false;
2230  } else if (second == 0) {
2231  ErrPostEx(SEV_REJECT, ERR_FORMAT_Date, "Missing required \"%s\" DT line.", (new_style ? "sequence version" : "Last sequence update"));
2232  ret = false;
2233  } else if (second > 1) {
2234  ErrPostEx(SEV_REJECT, ERR_FORMAT_Date, "Multiple \"%s\" DT lines are present.", (new_style ? "sequence version" : "Last sequence update"));
2235  ret = false;
2236  } else if (third == 0) {
2237  ErrPostEx(SEV_REJECT, ERR_FORMAT_Date, "Missing required \"%s\" DT line.", (new_style ? "entry version" : "Last annotation update"));
2238  ret = false;
2239  } else if (third > 1) {
2240  ErrPostEx(SEV_REJECT, ERR_FORMAT_Date, "Multiple \"%s\" DT lines are present.", (new_style ? "entry version" : "Last annotation update"));
2241  ret = false;
2242  } else if (std_crdate.Empty()) {
2243  ErrPostEx(SEV_REJECT, ERR_FORMAT_Date, "Missing or incorrect create date in \"%s\" DT line.", (new_style ? "integrated into" : "Created"));
2244  ret = false;
2245  } else if (std_sequpd.Empty()) {
2246  ErrPostEx(SEV_REJECT, ERR_FORMAT_Date, "Missing or incorrect update date in \"%s\" DT line.", (new_style ? "sequence version" : "Last sequence update"));
2247  ret = false;
2248  } else if (std_annotupd.Empty()) {
2249  ErrPostEx(SEV_REJECT, ERR_FORMAT_Date, "Missing or incorrect update date in \"%s\" DT line.", (new_style ? "entry version" : "Last annotation update"));
2250  ret = false;
2251  } else if (ver_num && *ver_num < 1) {
2252  ErrPostEx(SEV_REJECT, ERR_FORMAT_Date, "Invalidly formatted sequence version DT line is present.");
2253  ret = false;
2254  }
2255 
 /* The output dates are touched only on full success. */
2256  if (ret) {
2257  crdate.SetStd(*std_crdate);
2258  sequpd.SetStd(*std_sequpd);
2259  annotupd.SetStd(*std_annotupd);
2260  return true;
2261  }
2262 
2263  return false;
2264 }
2265 
2266 /**********************************************************
2267  *
2268  * static SPBlockPtr GetDescrSPBlock(pp, entry, bsp):
2269  *
2270  * 9-30-93
2271  *
2272  **********************************************************/
2273 static CRef<CSP_block>
2275 {
2276  IndexblkPtr ibp;
2277 
2278  CRef<CSP_block> spb(new CSP_block);
2279 
2280  char* bptr;
2281  bool reviewed;
2282  bool i;
2283  Int2 ver_num;
2284 
2285  /* first ID line, 2nd token
2286  */
2287  bptr = PointToNextToken(entry->mOffset + ParFlat_COL_DATA_SP);
2288  reviewed = StringEquNI(bptr, "reviewed", 8);
2289  if (reviewed || StringEquNI(bptr, "standard", 8)) {
2290  spb->SetClass(CSP_block::eClass_standard);
2291  } else if (StringEquNI(bptr, "preliminary", 11) ||
2292  StringEquNI(bptr, "unreviewed", 10)) {
2293  spb->SetClass(CSP_block::eClass_prelim);
2294  } else {
2295  spb->SetClass(CSP_block::eClass_not_set);
2296  ErrPostStr(SEV_WARNING, ERR_DATACLASS_UnKnownClass, "Not a standard/reviewed or preliminary/unreviewed class in SWISS-PROT");
2297  }
2298 
2299  GetSequenceOfKeywords(*entry, ParFlatSP_KW, ParFlat_COL_DATA_SP, spb->SetKeywords());
2300 
2301  ibp = pp->entrylist[pp->curindx];
2302  ibp->wgssec[0] = '\0';
2303 
2304  GetExtraAccession(ibp, pp->allow_uwsec, pp->source, spb->SetExtra_acc());
2305  if (spb->SetExtra_acc().empty())
2306  spb->ResetExtra_acc();
2307 
2308  /* DT data ==> create-date, seqence update, annotation update
2309  */
2310  ver_num = 0;
2311  if (reviewed && pp->sp_dt_seq_ver)
2312  i = GetSPDate(pp, entry, spb->SetCreated(), spb->SetSequpd(), spb->SetAnnotupd(), &ver_num);
2313  else
2314  i = GetSPDate(pp, entry, spb->SetCreated(), spb->SetSequpd(), spb->SetAnnotupd(), nullptr);
2315 
2316  get_plasmid(entry, spb->SetPlasnm());
2317  if (spb->SetPlasnm().empty())
2318  spb->ResetPlasnm();
2319 
2320  GetDRlineDataSP(entry, *spb, &ibp->drop, pp->source);
2321 
2322  if (! i)
2323  ibp->drop = true;
2324  else if (spb->GetClass() == CSP_block::eClass_standard ||
2325  spb->GetClass() == CSP_block::eClass_prelim) {
2326  for (auto& cur_id : bioseq.SetId()) {
2327  if (! cur_id->IsSwissprot())
2328  continue;
2329 
2330  CTextseq_id& id = cur_id->SetSwissprot();
2331  if (ver_num > 0)
2332  id.SetVersion(ver_num);
2333 
2334  if (spb->GetClass() == CSP_block::eClass_standard)
2335  id.SetRelease("reviewed");
2336  else
2337  id.SetRelease("reviewed");
2338 
2339  break;
2340  }
2341  }
2342 
2343  CRef<CSeqdesc> descr(new CSeqdesc);
2344  descr->SetSp(*spb);
2345  bioseq.SetDescr().Set().push_back(descr);
2346 
2347  return spb;
2348 }
2349 
2350 /**********************************************************
2351  *
2352  * static void ParseSpComment(descrs, line):
2353  *
2354  * 10-1-93
2355  *
2356  **********************************************************/
/*
 * ParseSpComment: turns one raw comment (possibly spanning several lines)
 * into a single-line string and appends it to descrs as a comment
 * descriptor.
 *
 * descrs - descriptor list that receives the new CSeqdesc.
 * line   - NUL-terminated comment text; it is only read here.
 *
 * Nothing is appended when the resulting text is empty.
 *
 * NOTE(review): the embedded listing numbers skip 2369, and no assignment
 * to "i" is visible before it is tested. The statement that sets "i"
 * (presumably a lookup of the comment topic) appears to be missing from
 * this listing - confirm against the repository copy.
 */
2357 static void ParseSpComment(CSeq_descr::Tdata& descrs, char* line)
2358 {
2359  char* com;
2360  char* p;
2361  char* q;
2362  Int2 i;
2363 
 /* Skip leading blanks. */
2364  for (p = line; *p == ' ';)
2365  p++;
2366 
 /* The buffer is one character longer than the input text, which leaves
  * room for the '[' that may be written in front of it. */
2367  com = StringNew(StringLen(p) + 1);
2368  q = com;
2370  if (i >= 0)
2371  *q++ = '[';
2372 
 /* Copy the text, joining lines: a line break becomes a single blank
  * unless the preceding character is '-'; blanks and a "CC " prefix at
  * the start of the next line are dropped. */
2373  while (*p != '\0') {
2374  if (*p != '\n') {
2375  *q++ = *p++;
2376  continue;
2377  }
2378 
2379  if (p > line && *(p - 1) != '-')
2380  *q++ = ' ';
2381  for (++p; *p == ' ';)
2382  p++;
2383  if (StringEquN(p, "CC ", 3))
2384  for (p += 3; *p == ' ';)
2385  p++;
2386  }
 /* Nothing was copied at all. */
2387  if (q == com) {
2388  MemFree(com);
2389  return;
2390  }
 /* Cut trailing blanks and terminate the string. */
2391  for (--q; q > com && *q == ' ';)
2392  q--;
2393  if (*q != ' ')
2394  q++;
2395  *q = '\0';
 /* When "i" is non-negative the first ':' is turned into the closing ']'
  * that matches the '[' written above.
  * NOTE(review): StringChr() is not checked for a null result here. */
2396  if (i >= 0) {
2397  p = StringChr(com, ':');
2398  *p = ']';
2399  }
2400 
2401  if (com[0] != 0) {
2402  CRef<CSeqdesc> descr(new CSeqdesc);
2403  descr->SetComment(com);
2404  descrs.push_back(descr);
2405  }
2406  MemFree(com);
2407 }
2408 
2409 /**********************************************************
2410  *
2411  * static void GetSPDescrComment(entry, descrs, acc, cla):
2412  *
2413  * CC line ==> comment, separate each by "-!-".
2414  *
2415  * 10-1-93
2416  *
2417  **********************************************************/
/*
 * GetSPDescrComment: converts the CC block of an entry into comment
 * descriptors, one per "-!-" topic.
 *
 * entry  - entry whose CC block is located with SrchNodeType().
 * descrs - descriptor list that receives the comments (through
 *          ParseSpComment()).
 * acc    - accession, used only in the missing-copyright warning.
 * cla    - entry class; the value 2 suppresses the missing-copyright
 *          warning (the caller passes the SP-block class).
 *
 * The CC block is edited in place: copyright sections delimited by
 * "----------" rule lines are cut out of it before the comments are split.
 *
 * NOTE(review): the embedded listing numbers skip 2455, so the second half
 * of the condition that starts with "! StringEquNI(p, COPYRIGHT, ..." is
 * missing from this listing - confirm against the repository copy.
 */
2418 static void GetSPDescrComment(DataBlkPtr entry, CSeq_descr::Tdata& descrs, char* acc, Uint1 cla)
2419 {
2420  char* offset;
2421  char* bptr;
2422  char* eptr;
2423  char* tmp;
2424  char* p;
2425  char* q;
2426  Char ch;
2427  Int2 count;
2428  Int4 i;
2429 
2430  size_t len = 0;
2431  offset = SrchNodeType(entry, ParFlatSP_CC, &len);
2432  if (! offset)
2433  return;
2434 
 /* NUL-terminate the block for the string searches below; the saved
  * character is put back before every return. */
2435  eptr = offset + len;
2436  ch = *eptr;
2437  *eptr = '\0';
 /* Remove copyright sections. For each "----------" rule: q is moved to
  * the start of the rule's line, p to the text of the following line
  * (ParFlat_COL_DATA_SP + 1 characters in). If that text is the copyright
  * notice, everything up to and including the line of the closing rule is
  * deleted by shifting the tail of the block down over it. */
2438  for (count = 0, p = offset;;) {
2439  p = StringStr(p, "----------");
2440  if (! p)
2441  break;
2442  for (q = p; q > offset && *q != '\n';)
2443  q--;
2444  if (*q == '\n')
2445  q++;
2446 
2447  p = StringChr(p, '\n');
2448  if (! p)
2449  break;
2450  for (i = 0; *p != '\0' && i < ParFlat_COL_DATA_SP + 1; i++)
2451  p++;
2452  if (*p == '\0')
2453  break;
2454  if (! StringEquNI(p, COPYRIGHT, StringLen(COPYRIGHT)) &&
2456  break;
2457  p = StringStr(p, "----------");
2458  if (! p)
2459  break;
2460  p = StringChr(p, '\n');
2461  if (! p)
2462  break;
2463  p++;
2464  len -= (p - q);
2465  fta_StringCpy(q, p);
2466  p = q;
2467  count++;
2468  }
2469 
2470  if (count == 0 && cla != 2) /* not PRELIMINARY or UNREVIEWED */
2471  ErrPostEx(SEV_WARNING, ERR_FORMAT_MissingCopyright, "The expected copyright notice for UniProt/Swiss-Prot entry %s was not found.", acc);
2472 
 /* Nothing left after the copyright text was removed. */
2473  if (len < 1) {
2474  *eptr = ch;
2475  return;
2476  }
2477 
 /* Start just past the line prefix and the first topic marker. */
2478  bptr = offset + ParFlat_COL_DATA_SP + 4;
2479 
 /* Every further "-!-" ends the current comment: back up to the newline
  * in front of the marker, terminate the text there for the duration of
  * the call, and continue 4 characters past the marker. */
2480  for (; (tmp = StringStr(bptr, "-!-")); bptr = tmp + 4) {
2481  /* found a new comment
2482  */
2483  for (p = tmp; p > bptr && *p != '\n';)
2484  p--;
2485  if (p == bptr)
2486  continue;
2487  *p = '\0';
2488  ParseSpComment(descrs, bptr);
2489  *p = '\n';
2490  }
2491 
 /* The last (or only) comment runs to the end of the block. */
2492  ParseSpComment(descrs, bptr);
2493  *eptr = ch;
2494 }
2495 
2496 /**********************************************************/
2497 static void SPAppendPIRToHist(CBioseq& bioseq, const CSP_block& spb)
2498 {
2499  if (spb.GetSeqref().empty())
2500  return;
2501 
2502  CSeq_hist_rec::TIds rep_ids;
2503 
2504  for (const auto& cur_ref : spb.GetSeqref()) {
2505  if (! cur_ref->IsPir())
2506  continue;
2507 
2508  CRef<CTextseq_id> text_id(new CTextseq_id);
2509  text_id->Assign(cur_ref->GetPir());
2510 
2511  CRef<CSeq_id> rep_id(new CSeq_id);
2512  rep_id->SetPir(*text_id);
2513 
2514  rep_ids.push_back(rep_id);
2515  }
2516 
2517  if (rep_ids.empty())
2518  return;
2519 
2520  CSeq_hist& hist = bioseq.SetInst().SetHist();
2521  hist.SetReplaces().SetIds().splice(hist.SetReplaces().SetIds().end(), rep_ids);
2522 }
2523 
2524 /**********************************************************/
/*
 * IfOHTaxIdMatchOHName: tells whether the organism name obtained for an
 * OH-line TaxId agrees with the host name given on the OH line.
 *
 * orpname - organism name to look for.
 * ohname  - host name from the OH line.
 *
 * The comparison ignores the case of ASCII letters. It succeeds when
 * ohname starts with the whole of orpname (trailing blanks of orpname
 * aside) and what follows in ohname is, after optional blanks, either the
 * end of the string or a '('. Two null pointers match; a single null
 * pointer does not.
 */
static bool IfOHTaxIdMatchOHName(const char* orpname, const char* ohname)
{
    if (! orpname || ! ohname)
        return (! orpname && ! ohname);

    const char* name = orpname;
    const char* host = ohname;

    /* Walk both strings while they agree, ASCII case-insensitively. */
    while (*name != '\0' && *host != '\0') {
        char a = *name;
        char b = *host;

        if (a >= 'a' && a <= 'z')
            a = (char)(a - ('a' - 'A'));
        if (b >= 'a' && b <= 'z')
            b = (char)(b - ('a' - 'A'));
        if (a != b)
            break;

        name++;
        host++;
    }

    /* Only blanks may remain of the organism name. */
    while (*name == ' ')
        name++;
    if (*name != '\0')
        return false;

    /* The host name may go on only with a parenthesised remark. */
    while (*host == ' ')
        host++;
    return (*host == '(' || *host == '\0');
}
2559 
2560 /**********************************************************/
/*
 * GetSprotDescr: fills the descriptor list of the Bioseq from one entry.
 *
 * In the order the descriptors are appended: title (DE lines), SP-block
 * (added inside GetDescrSPBlock()), comments (CC lines), create-date,
 * update-date, source, mol-info and publications. It also records
 * replaced ids in Seq-hist (secondary accessions and PIR references).
 *
 * bioseq - Bioseq being built.
 * pp     - parser context; reads entrylist/curindx, accver, histacc and
 *          source.
 * entry  - entry being parsed.
 *
 * NOTE(review): the embedded listing numbers skip 2726-2727 (the start of
 * the call that reports an OH-line name mismatch) and 2750-2751 (right
 * after the mol-info descriptor is created), so statements are missing
 * from this listing at those points - confirm against the repository copy.
 */
2561 static void GetSprotDescr(CBioseq& bioseq, ParserPtr pp, DataBlkPtr entry)
2562 {
2563  DataBlkPtr dbp;
2564  char* offset;
2565  CBioSource::TGenome gmod;
2566  bool fragment = false;
2567  TTaxId taxid;
2568 
2569  IndexblkPtr ibp;
2570  ViralHostPtr vhp;
2571  ViralHostPtr tvhp;
2572 
2573  CSeq_descr& descr = bioseq.SetDescr();
2574 
2575  ibp = pp->entrylist[pp->curindx];
 /* DE data ==> title; GetSPDescrTitle() also receives &fragment. */
2576  size_t len = 0;
2577  offset = SrchNodeType(entry, ParFlatSP_DE, &len);
2578  if (offset) {
2579  string title = GetSPDescrTitle(string_view(offset, len), &fragment);
2580  if (! title.empty()) {
2581  CRef<CSeqdesc> desc_new(new CSeqdesc);
2582  desc_new->SetTitle(title);
2583  descr.Set().push_back(desc_new);
2584  }
2585  }
2586 
2587  /* sp-block
2588  */
2589  CRef<CSP_block> spb = GetDescrSPBlock(pp, entry, bioseq);
2590 
2591  GetSPDescrComment(entry, descr.Set(), ibp->acnum, spb->GetClass());
2592 
 /* Secondary accessions that are Swiss-Prot accessions become the ids
  * this record replaces. */
2593  if (spb.NotEmpty() && pp->accver && pp->histacc && pp->source == Parser::ESource::SPROT) {
2594  CSeq_hist_rec::TIds rep_ids;
2595 
2596  for (const string& cur_acc : spb->GetExtra_acc()) {
2597  if (cur_acc.empty() || ! IsSPROTAccession(cur_acc.c_str()))
2598  continue;
2599 
2600  CRef<CTextseq_id> text_id(new CTextseq_id);
2601  text_id->SetAccession(cur_acc);
2602 
2603  CRef<CSeq_id> rep_id(new CSeq_id);
2604  rep_id->SetSwissprot(*text_id);
2605  rep_ids.push_back(rep_id);
2606  }
2607 
2608  if (! rep_ids.empty()) {
2609  CSeq_hist& hist = bioseq.SetInst().SetHist();
2610  hist.SetReplaces().SetIds().swap(rep_ids);
2611  }
2612  }
2613 
2614  if (spb->CanGetCreated()) {
2615  CRef<CSeqdesc> create_date_descr(new CSeqdesc);
2616  create_date_descr->SetCreate_date().Assign(spb->GetCreated());
2617 
2618  descr.Set().push_back(create_date_descr);
2619  }
2620 
 /* The update-date is the annotation update only when it compares as
  * strictly after the sequence update; otherwise the sequence update. */
2621  bool has_update_date = spb->CanGetAnnotupd() || spb->CanGetSequpd();
2622  CDate upd_date;
2623 
2624  if (has_update_date) {
2625  if (spb->CanGetAnnotupd() && spb->CanGetSequpd()) {
2626  upd_date.Assign(spb->GetAnnotupd().Compare(spb->GetSequpd()) == CDate::eCompare_after ? spb->GetAnnotupd() : spb->GetSequpd());
2627  } else if (spb->CanGetAnnotupd())
2628  upd_date.Assign(spb->GetAnnotupd());
2629  else
2630  upd_date.Assign(spb->GetSequpd());
2631 
2632  CRef<CSeqdesc> upd_date_descr(new CSeqdesc);
2633  upd_date_descr->SetUpdate_date().Assign(upd_date);
2634 
2635  descr.Set().push_back(upd_date_descr);
2636  }
2637 
 /* A create-date later than the update-date is reported, not corrected. */
2638  if (spb->CanGetCreated() && has_update_date &&
2639  spb->GetCreated().Compare(upd_date) == CDate::eCompare_after) {
2640  string upd_date_str, create_date_str;
2641 
2642  upd_date.GetDate(&upd_date_str);
2643  spb->GetCreated().GetDate(&create_date_str);
2644 
2645  ErrPostEx(SEV_ERROR, ERR_DATE_IllegalDate, "Update-date \"%s\" precedes create-date \"%s\".", upd_date_str.c_str(), create_date_str.c_str());
2646  }
2647 
2648  dbp = TrackNodeType(*entry, ParFlatSP_OS);
2649  gmod = GetSPGenome(dbp);
2650 
2651  /* Org-ref from ID lines
2652  */
 /* Only the first ID block is used: the loop body always ends with a
  * break, either without a source (nothing usable) or after the source
  * descriptor has been added. */
2653  for (dbp = TrackNodeType(*entry, ParFlatSP_ID); dbp; dbp = dbp->mpNext) {
2654  if (dbp->mType != ParFlatSP_ID)
2655  continue;
2656 
2657  CRef<CBioSource> bio_src;
2658 
 /* First choice: the organism looked up by the TaxId of the OX line. */
2659  taxid = GetTaxIdFrom_OX(dbp);
2660  if (taxid > ZERO_TAX_ID) {
2661  CRef<COrg_ref> org_ref = fta_fix_orgref_byid(pp, taxid, &ibp->drop, false);
2662  if (org_ref.Empty())
2663  ErrPostEx(SEV_ERROR, ERR_SOURCE_NcbiTaxIDLookupFailure, "NCBI TaxID lookup for %d failed : will use organism name for lookup instead.", TAX_ID_TO(int, taxid));
2664  else {
2665  bio_src.Reset(new CBioSource);
2666 
2667  if (gmod != CBioSource::eGenome_unknown)
2668  bio_src->SetGenome(gmod);
2669  bio_src->SetOrg(*org_ref);
2670  }
2671  }
2672 
 /* The OS/OC organism is the fallback when the TaxId gave nothing;
  * otherwise its name is only checked against the looked-up one. */
2673  CRef<COrg_ref> org_ref = GetOrganismFrom_OS_OC(dbp);
2674  if (org_ref.NotEmpty()) {
2675  if (bio_src.Empty()) {
2676  bio_src.Reset(new CBioSource);
2677 
2678  if (gmod != CBioSource::eGenome_unknown)
2679  bio_src->SetGenome(gmod);
2680  fta_fix_orgref(pp, *org_ref, &ibp->drop, nullptr);
2681  bio_src->SetOrg(*org_ref);
2682  } else if (org_ref->IsSetTaxname()) {
2683  if (! bio_src->IsSetOrg() || ! bio_src->GetOrg().IsSetTaxname() ||
2684  NStr::CompareNocase(org_ref->GetTaxname().c_str(), bio_src->GetOrg().GetTaxname().c_str()) != 0)
2685  ErrPostEx(SEV_ERROR, ERR_SOURCE_OrgNameVsTaxIDMissMatch, "Organism name \"%s\" from OS line does not match the organism name \"%s\" obtained by lookup of NCBI TaxID.", org_ref->GetTaxname().c_str(), bio_src->GetOrg().GetTaxname().c_str());
2686  }
2687  }
2688 
2689  if (bio_src.Empty())
2690  break;
2691 
 /* OH lines ==> one nat-host modifier per host. The list nodes are
  * deleted as they are consumed (vhp is advanced before tvhp is freed). */
2692  vhp = GetViralHostsFrom_OH(dbp);
2693  if (vhp) {
2694  COrgName& orgname = bio_src->SetOrg().SetOrgname();
2695 
2696  for (tvhp = vhp; tvhp; tvhp = vhp) {
2697  vhp = tvhp->next;
2698 
2699  CRef<COrgMod> mod(new COrgMod);
2700  mod->SetSubtype(COrgMod::eSubtype_nat_host);
2701  mod->SetSubname(tvhp->name);
2702  orgname.SetMod().push_back(mod);
2703 
2704  if (tvhp->taxid <= ZERO_TAX_ID) {
2705  delete tvhp;
2706  continue;
2707  }
2708 
 /* The host TaxId is looked up only to validate the host name; a local
  * "drop" flag is used, so the entry's own drop flag is not passed to
  * this lookup. */
2709  bool drop = false;
2710  CRef<COrg_ref> org_ref_cur = fta_fix_orgref_byid(pp, tvhp->taxid, &drop, true);
2711  if (org_ref_cur.Empty()) {
2712  if (! drop)
2713  ErrPostEx(SEV_ERROR, ERR_SOURCE_InvalidNcbiTaxID, "OH-line TaxId \"%d\" was not found via the NCBI TaxArch service.", TAX_ID_TO(int, tvhp->taxid));
2714  else
2715  ErrPostEx(SEV_ERROR, ERR_SOURCE_NcbiTaxIDLookupFailure, "Taxonomy lookup for OH-line TaxId \"%d\" failed.", TAX_ID_TO(int, tvhp->taxid));
2716  } else {
 /* NUL-terminated copy of the taxname (an empty string when unset). */
2717  vector<Char> org_taxname;
2718  if (org_ref_cur->IsSetTaxname()) {
2719  const string& cur_taxname = org_ref_cur->GetTaxname();
2720  org_taxname.assign(cur_taxname.begin(), cur_taxname.end());
2721  }
2722 
2723  org_taxname.push_back(0);
2724 
2725  if (! IfOHTaxIdMatchOHName(&org_taxname[0], tvhp->name))
2728  "OH-line HostName \"%s\" does not match NCBI organism name \"%s\" obtained by lookup of NCBI TaxID \"%d\".",
2729  tvhp->name,
2730  &org_taxname[0],
2731  TAX_ID_TO(int, tvhp->taxid));
2732  }
2733  delete tvhp;
2734  }
2735  }
2736 
2737  fta_sort_biosource(*bio_src);
2738 
2739  CRef<CSeqdesc> bio_src_desc(new CSeqdesc);
2740  bio_src_desc->SetSource(*bio_src);
2741  descr.Set().push_back(bio_src_desc);
2742  break;
2743  }
2744 
2745  if (spb.NotEmpty())
2746  SPAppendPIRToHist(bioseq, *spb);
2747 
2748  CRef<CSeqdesc> mol_info_descr(new CSeqdesc);
2749  CMolInfo& mol_info = mol_info_descr->SetMolinfo();
2752 
2753  descr.Set().push_back(mol_info_descr);
2754 
2755  /* RN data ==> pub
2756  */
2757  dbp = TrackNodeType(*entry, ParFlat_REF_END);
2758  for (; dbp; dbp = dbp->mpNext) {
2759  if (dbp->mType != ParFlat_REF_END)
2760  continue;
2761 
2762  CRef<CPubdesc> pub_desc = DescrRefs(pp, dbp, ParFlat_COL_DATA_SP);
2763  if (pub_desc.NotEmpty()) {
2764  CRef<CSeqdesc> pub_desc_descr(new CSeqdesc);
2765  pub_desc_descr->SetPub(*pub_desc);
2766 
2767  descr.Set().push_back(pub_desc_descr);
2768  }
2769  }
2770 }
2771 
2772 /**********************************************************
2773  *
2774  * static void GetSPInst(pp, entry, protconv):
2775  *
2776  * Fills in Seq-inst for an entry. Assumes Bioseq
2777  * already allocated.
2778  *
2779  * 10-8-93
2780  *
2781  **********************************************************/
2782 static void GetSPInst(ParserPtr pp, DataBlkPtr entry, unsigned char* protconv)
2783 {
2784  EntryBlkPtr ebp;
2785 
2786  ebp = static_cast<EntryBlk*>(entry->mpData);
2787 
2788  CBioseq& bioseq = ebp->seq_entry->SetSeq();
2789 
2790  bioseq.SetInst().SetRepr(CSeq_inst::eRepr_raw);
2791  bioseq.SetInst().SetMol(CSeq_inst::eMol_aa);
2792 
2793  GetSeqData(pp, *entry, bioseq, ParFlatSP_SQ, protconv, CSeq_data::e_Iupacaa);
2794 }
2795 
2796 /**********************************************************/
2798 {
2799  delete spfip;
2800 }
2801 
2802 /**********************************************************
2803  *
2804  * static void FreeSPFeatInputSet(spfip):
2805  *
2806  * 10-18-93
2807  *
2808  **********************************************************/
2810 {
2812 
2813  for (; spfip; spfip = next) {
2814  next = spfip->next;
2815  FreeSPFeatInput(spfip);
2816  }
2817 }
2818 
2819 /**********************************************************/
2821 {
2822  if (! fip1 && ! fip2)
2823  return true;
2824 
2825  if (! fip1 || ! fip2 ||
2826  fip1->key != fip2->key ||
2827  fip1->from != fip2->from ||
2828  fip1->to != fip2->to ||
2829  fip1->descrip != fip2->descrip)
2830  return false;
2831 
2832  return true;
2833 }
2834 
2835 /**********************************************************/
2837 {
2838  SPFeatInputPtr fip;
2839  SPFeatInputPtr fipnext;
2840  SPFeatInputPtr fipprev;
2841 
2842  if (! spfip || ! spfip->next)
2843  return;
2844 
2845  for (; spfip && spfip->next; spfip = spfip->next) {
2846  fipprev = spfip;
2847  for (fip = spfip->next; fip; fip = fipnext) {
2848  fipnext = fip->next;
2849  if (! fta_spfeats_same(spfip, fip)) {
2850  fipprev = fip;
2851  continue;
2852  }
2853  fipprev->next = fip->next;
2854  ErrPostEx(SEV_WARNING, ERR_FEATURE_DuplicateRemoved, "Duplicated feature \"%s\" at location \"%s..%s\" removed.", fip->key.empty() ? "???" : fip->key.c_str(), fip->from.empty() ? "???" : fip->from.c_str(), fip->to.empty() ? "???" : fip->to.c_str());
2855  FreeSPFeatInput(fip);
2856  }
2857  }
2858 }
2859 
2860 /**********************************************************/
2861 static void SPPostProcVarSeq(string& varseq)
2862 {
2863  char* temp;
2864  char* end;
2865  char* p;
2866  char* q;
2867 
2868  if (varseq.empty())
2869  return;
2870 
2871  temp = StringSave(varseq);
2872  p = StringStr(temp, "->");
2873  if (! p || p == temp ||
2874  (*(p - 1) != ' ' && *(p - 1) != '\n') || (p[2] != ' ' && p[2] != '\n')) {
2875  NStr::ReplaceInPlace(varseq, "\n", " ");
2876  MemFree(temp);
2877  return;
2878  }
2879 
2880  for (p--; p > temp && (*p == ' ' || *p == '\n');)
2881  p--;
2882  if (*p < 'A' || *p > 'Z') {
2883  NStr::ReplaceInPlace(varseq, "\n", " ");
2884  MemFree(temp);
2885  return;
2886  }
2887 
2888  end = p + 1;
2889  while (p > temp && (*p == '\n' || (*p >= 'A' && *p <= 'Z')))
2890  p--;
2891  if (p > temp)
2892  p++;
2893  while (*p == '\n')
2894  p++;
2895  for (;;) {
2896  while (*p >= 'A' && *p <= 'Z' && p < end)
2897  p++;
2898  if (p == end)
2899  break;
2900  for (q = p; *p == '\n'; p++)
2901  end--;
2902  fta_StringCpy(q, p);
2903  }
2904 
2905  while (*p == ' ' || *p == '\n')
2906  p++;
2907  for (p += 2; *p == ' ' || *p == '\n';)
2908  p++;
2909 
2910  if (*p < 'A' || *p > 'Z') {
2911  NStr::ReplaceInPlace(varseq, "\n", " ");
2912  MemFree(temp);
2913  return;
2914  }
2915 
2916  for (q = p; *q == '\n' || (*q >= 'A' && *q <= 'Z');)
2917  q++;
2918  if (q > p && *(q - 1) == '\n') {
2919  for (q--; *q == '\n' && q > p;)
2920  q--;
2921  if (*q != '\n')
2922  q++;
2923  }
2924  end = q;
2925 
2926  for (;;) {
2927  while (*p >= 'A' && *p <= 'Z' && p < end)
2928  p++;
2929  if (p == end)
2930  break;
2931  for (q = p; *p == '\n'; p++)
2932  end--;
2933  fta_StringCpy(q, p);
2934  }
2935 
2936  for (p = temp; *p != '\0'; p++)
2937  if (*p == '\n')
2938  *p = ' ';
2939 
2940  varseq = temp;
2941  MemFree(temp);
2942 }
2943 
2944 /**********************************************************
2945  *
2946  * static SPFeatInputPtr ParseSPFeat(entry, seqlen):
2947  *
2948  * Return a link list of feature input data, including
2949  * key, from, to, description.
2950  *
2951  * 10-15-93
2952  *
2953  **********************************************************/
/*
 * ParseSPFeat: reads the FT block of an entry into a linked list of
 * feature-input nodes (key, from, to, description).
 *
 * entry  - entry whose FT block is located with SrchNodeType().
 * seqlen - sequence length; features whose endpoints exceed it are dropped.
 *
 * Returns the head of the list (null when there is no FT block or no
 * feature survives). Duplicates are removed before returning. A feature is
 * dropped, with a message, when its location cannot be read, when it
 * carries a qualifier other than /note, /evidence or /id, or when its
 * location lies beyond seqlen.
 *
 * NOTE(review): the embedded listing numbers skip 2996 (right after the key
 * is assigned), so a statement may be missing from this listing there -
 * confirm against the repository copy.
 */
2954 static SPFeatInputPtr ParseSPFeat(DataBlkPtr entry, size_t seqlen)
2955 {
2956  SPFeatInputPtr temp;
2957  SPFeatInputPtr current;
2958  SPFeatInputPtr spfip;
2959  const char* defdelim;
2960  char* fromstart;
2961  char* fromend;
2962  char* bptr;
2963  char* eptr;
2964  char* ptr1;
2965  char* offset;
2966  char* endline;
2967  char* str;
2968  const char* delim;
2969  char* quotes;
2970  char* location;
2971  char* p;
2972  char* q;
2973  int i;
2974  bool badqual;
2975  bool new_format;
2976  bool extra_text;
2977  Char ch;
2978 
2979  size_t len = 0;
2980  offset = SrchNodeType(entry, ParFlatSP_FT, &len);
2981  if (! offset)
2982  return nullptr;
2983 
2984  bptr = offset + ParFlat_COL_DATA_SP;
2985  eptr = offset + len;
2986 
2987  spfip = nullptr;
2988  current = nullptr;
2989 
 /* One iteration per feature: bptr points at the key of the feature's
  * first line, endline at the end of that line. */
2990  while (bptr < eptr && (endline = SrchTheChar(bptr, eptr, '\n'))) {
2991  temp = new SPFeatInput;
2992 
 /* The key is at most 8 characters, ended by a blank or the line end. */
2993  for (p = bptr, i = 0; *p != ' ' && *p != '\n' && i < 8; i++)
2994  p++;
2995  temp->key.assign(bptr, p);
2997 
 /* Description lines of VAR_SEQ are joined with '\n' (and fixed up by
  * SPPostProcVarSeq() later); those of other features with a blank. */
2998  if (temp->key == "VAR_SEQ")
2999  defdelim = "\n";
3000  else
3001  defdelim = " ";
3002 
3003  for (bptr += 8; *bptr == ' ' && bptr <= endline;)
3004  bptr++;
3005 
3006  location = bptr;
3007 
 /* Skip a prefix made of a letter-led 6-character token, '-', digits and
  * ':' in front of the position.
  * NOTE(review): presumably an isoform accession such as "P12345-2:" -
  * confirm. */
3008  if (((*bptr >= 'a' && *bptr <= 'z') || (*bptr >= 'A' && *bptr <= 'Z')) &&
3009  bptr[6] == '-') {
3010  for (bptr += 7; *bptr >= '0' && *bptr <= '9' && bptr <= endline;)
3011  bptr++;
3012  for (; *bptr == ':' && bptr <= endline;)
3013  bptr++;
3014  }
3015 
 /* "from" position: any run of '?', '<', '>' and digits. */
3016  for (ptr1 = bptr; *ptr1 == '?' || *ptr1 == '>' || *ptr1 == '<' ||
3017  (*ptr1 >= '0' && *ptr1 <= '9');)
3018  ptr1++;
3019 
3020  if (bptr < ptr1 && ptr1 <= endline) {
3021  temp->from.assign(bptr, ptr1);
3022  fromstart = bptr;
3023  fromend = ptr1;
3024  } else {
 /* No usable position: report the location token (cut at the first
  * blank or line end just for the message) and mark the feature with
  * from = "-1", which makes it be dropped further down. */
3025  ch = '\0';
3026  p = StringChr(location, ' ');
3027  q = StringChr(location, '\n');
3028  if (! p || (q && q < p))
3029  p = q;
3030  if (p) {
3031  ch = *p;
3032  *p = '\0';
3033  }
3034  if (bptr == ptr1)
3035  ErrPostEx(SEV_ERROR, ERR_FEATURE_BadLocation, "Invalid location \"%s\" at feature \"%s\". Feature dropped.", location, temp->key.c_str());
3036  else
3037  ErrPostEx(SEV_ERROR, ERR_FEATURE_BadLocation, "Empty location at feature \"%s\". Feature dropped.", temp->key.c_str());
3038  if (p)
3039  *p = ch;
3040  temp->from.assign("-1");
3041  fromstart = nullptr;
3042  fromend = nullptr;
3043  }
3044 
 /* A '.' between the two positions ("from..to") marks the new format. */
3045  new_format = false;
3046  bptr = ptr1;
3047  for (; (*bptr == ' ' || *bptr == '.') && bptr <= endline; bptr++)
3048  if (*bptr == '.')
3049  new_format = true;
3050  for (ptr1 = bptr; *ptr1 == '?' || *ptr1 == '>' || *ptr1 == '<' ||
3051  (*ptr1 >= '0' && *ptr1 <= '9');)
3052  ptr1++;
3053 
 /* p: the "from" text without a leading '<' or '>'. */
3054  p = (char*)temp->from.c_str();
3055  if (*p == '<' || *p == '>')
3056  p++;
3057 
 /* "to" position. Without one, "to" repeats "from" (single position),
  * or is "-1" when "from" was unusable too. extra_text records that
  * something other than the line end follows the location although the
  * "from" position was readable. */
3058  for (q = ptr1; *q == ' ';)
3059  q++;
3060  extra_text = false;
3061  if (bptr < ptr1 && ptr1 <= endline) {
3062  if (*q != '\n' && new_format && (*p == '?' || atoi(p) != -1))
3063  extra_text = true;
3064  temp->to.assign(bptr, ptr1);
3065  } else if (fromstart) {
3066  if (*q != '\n' && (*p == '?' || atoi(p) != -1))
3067  extra_text = true;
3068  temp->to.assign(fromstart, fromend);
3069  } else {
3070  if (*q != '\n' && (*p == '?' || atoi(p) != -1))
3071  extra_text = true;
3072  temp->to.assign("-1");
3073  }
3074 
 /* Reject trailing text and from > to; again from = "-1" marks the
  * feature for dropping. */
3075  q = (char*)temp->to.c_str();
3076  if (*q == '<' || *q == '>')
3077  q++;
3078  if (extra_text || (*p != '?' && *q != '?' && (atoi(p) > atoi(q)))) {
3079  ch = '\0';
3080  p = extra_text ? nullptr : StringChr(location, ' ');
3081  q = StringChr(location, '\n');
3082  if (! p || (q && q < p))
3083  p = q;
3084  if (p) {
3085  ch = *p;
3086  *p = '\0';
3087  }
3088  ErrPostEx(SEV_ERROR, ERR_FEATURE_BadLocation, "Invalid location \"%s\" at feature \"%s\". Feature dropped.", location, temp->key.c_str());
3089  if (p)
3090  *p = ch;
3091  temp->from.assign("-1");
3092  }
3093 
3094  for (bptr = ptr1; *bptr == ' ' && bptr <= endline;)
3095  bptr++;
3096 
 /* Choose the delimiter used to join the NEXT line: none when this line
  * ends in '-' that is not preceded by a blank, else the default. */
3097  str = endline;
3098  delim = defdelim;
3099  if (str > bptr)
3100  if (*--str == '-' && str > bptr)
3101  if (*--str != ' ')
3102  delim = nullptr;
3103  if (bptr <= endline)
3104  temp->descrip.assign(bptr, endline);
3105 
3106  for (bptr = endline; *bptr == ' ' || *bptr == '\n';)
3107  bptr++;
3108 
3109  badqual = false;
3110  bptr += ParFlat_COL_DATA_SP;
 /* Continuation lines have a blank where a new feature has its key. */
3111  while (bptr < eptr && (*bptr == ' ')) /* continue description data */
3112  {
3113  while (*bptr == ' ')
3114  bptr++;
3115 
 /* Recognized qualifiers: for /note only the value is kept; for
  * /evidence and /id "quotes" marks the opening quote, which is left
  * out when the text is appended below. Any other /name="..." makes
  * the whole feature invalid. */
3116  if (StringEquN(bptr, "/note=\"", 7)) {
3117  bptr += 7;
3118  quotes = nullptr;
3119  } else if (StringEquN(bptr, "/evidence=\"", 11)) {
3120  quotes = bptr + 10;
3121  if (! StringEquN(quotes + 1, "ECO:", 4)) {
3122  p = StringChr(bptr, '\n');
3123  if (p)
3124  *p = '\0';
3125  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_InvalidEvidence, "/evidence qualifier does not have expected \"ECO:\" prefix : \"%s\".", bptr);
3126  if (p)
3127  *p = '\n';
3128  }
3129  } else if (StringEquN(bptr, "/id=\"", 5))
3130  quotes = bptr + 4;
3131  else {
3132  if (*bptr == '/') {
3133  for (p = bptr + 1; (*p >= 'a' && *p <= 'z') || (*p >= 'A' && *p <= 'Z') || (*p >= '0' && *p <= '9') || *p == '_';)
3134  p++;
3135  if (*p == '=' && p[1] == '\"') {
3136  *p = '\0';
3137  badqual = true;
3138  ErrPostEx(SEV_ERROR, ERR_FEATURE_InvalidQualifier, "Qualifier %s is invalid for the feature \"%s\" at \"%s..%s\".", bptr, temp->key.c_str(), temp->from.c_str(), temp->to.c_str());
3139  *p = '=';
3140  }
3141  }
3142  quotes = nullptr;
3143  }
3144 
 /* A closing quote at the end of the line is temporarily replaced by
  * '.'; when the character before it is already a '.', the appended
  * text stops one character earlier so the period is not doubled.
  * NOTE(review): endline is not checked for null before "endline - 1". */
3145  endline = SrchTheChar(bptr, eptr, '\n');
3146  p = endline - 1;
3147  if (p >= bptr && *p == '\"')
3148  *p = '.';
3149  else
3150  p = nullptr;
3151 
3152  if (quotes) {
3153  StringCombine(temp->descrip, string(bptr, quotes), delim);
3154  if (p && p - 1 >= bptr && *(p - 1) == '.')
3155  StringCombine(temp->descrip, string(quotes + 1, endline - 1), "");
3156  else
3157  StringCombine(temp->descrip, string(quotes + 1, endline), "");
3158  } else {
3159  if (p && p - 1 >= bptr && *(p - 1) == '.')
3160  StringCombine(temp->descrip, string(bptr, endline - 1), delim);
3161  else
3162  StringCombine(temp->descrip, string(bptr, endline), delim);
3163  }
3164 
 /* Put the closing quote back. */
3165  if (p)
3166  *p = '\"';
3167 
 /* Same delimiter choice as after the first line. */
3168  str = endline;
3169  delim = defdelim;
3170  if (str > bptr)
3171  if (*--str == '-' && str > bptr)
3172  if (*--str != ' ')
3173  delim = nullptr;
3174  for (bptr = endline; *bptr == ' ' || *bptr == '\n';)
3175  bptr++;
3176 
3177  bptr += ParFlat_COL_DATA_SP;
3178  }
3179 
3180  if (badqual) {
3181  ErrPostEx(SEV_ERROR, ERR_FEATURE_Dropped, "Invalid qualifier(s) found within the feature \"%s\" at \"%s..%s\". Feature dropped.", temp->key.c_str(), temp->from.c_str(), temp->to.c_str());
3182  FreeSPFeatInputSet(temp);
3183  continue;
3184  }
3185 
3186  if (*defdelim == '\n')
3187  SPPostProcVarSeq(temp->descrip);
3188 
 /* Features marked with from = "-1" above are dropped here; their
  * message has already been posted. */
3189  p = (char*)temp->from.c_str();
3190  if (*p == '<' || *p == '>')
3191  p++;
3192  if (*p != '?' && atoi(p) < 0) {
3193  FreeSPFeatInputSet(temp);
3194  continue;
3195  }
3196 
3197  q = (char*)temp->to.c_str();
3198  if (*q == '<' || *q == '>')
3199  q++;
 /* NOTE(review): seqlen is a size_t but is printed with "%d" below. */
3200  if ((*p != '?' && atoi(p) > (Int4)seqlen) || (*q != '?' && atoi(q) > (Int4)seqlen)) {
3201  ErrPostEx(SEV_WARNING, ERR_LOCATION_FailedCheck, "Location range exceeds the sequence length: feature=%s, length=%d, from=%s, to=%s", temp->key.c_str(), seqlen, temp->from.c_str(), temp->to.c_str());
3202  ErrPostEx(SEV_ERROR, ERR_FEATURE_Dropped, "Location range exceeds the sequence length: feature=%s, length=%d, from=%s, to=%s", temp->key.c_str(), seqlen, temp->from.c_str(), temp->to.c_str());
3203  FreeSPFeatInputSet(temp);
3204  continue;
3205  }
3206 
 /* Append the accepted feature to the end of the list. */
3207  if (! spfip)
3208  spfip = temp;
3209  else
3210  current->next = temp;
3211  current = temp;
3212  }
3213 
3214  fta_remove_dup_spfeats(spfip);
3215 
3216  return (spfip);
3217 }
3218 
3219 /**********************************************************
3220  *
3221  * static CRef<CSeq_loc> GetSPSeqLoc(pp, spfip, bond, initmet,
3222  * signal):
3223  *
3224  * The following rules are assumptions, since I am
3225  * still waiting for Mark's mail:
3226  * - subtract one if from > 0, and
3227  * - the unknown endpoint "?" is not implemented.
3228  *
3229  * 10-18-93
3230  *
3231  **********************************************************/
3232 static CRef<CSeq_loc> GetSPSeqLoc(ParserPtr pp, SPFeatInputPtr spfip, bool bond, bool initmet, bool signal)
3233 {
3234  CRef<CSeq_loc> loc;
3235 
3236  IndexblkPtr ibp;
3237 
3238  const char* ptr;
3239 
3240  bool fuzzfrom = false;
3241  bool fuzzto = false;
3242  bool nofrom = false;
3243  bool noto = false;
3244  bool pntfuzz = false;
3245  Int4 from;
3246  Int4 to;
3247 
3248  if (! spfip || spfip->from.empty() || spfip->to.empty())
3249  return loc;
3250 
3251  ibp = pp->entrylist[pp->curindx];
3252 
3253  loc.Reset(new CSeq_loc);
3254 
3255  ptr = spfip->from.c_str();
3256  if (StringChr(ptr, '<')) {
3257  fuzzfrom = true;
3258 
3259  while (*ptr != '\0' && isdigit(*ptr) == 0)
3260  ptr++;
3261  from = (Int4)atoi(ptr);
3262  } else if (StringChr(ptr, '?')) {
3263  from = 0;
3264  nofrom = true;
3265  } else {
3266  from = (Int4)atoi(ptr);
3267  }
3268  if ((initmet == false && from != 0) ||
3269  (initmet && signal && from == 1))
3270  from--;
3271 
3272  ptr = spfip->to.c_str();
3273  if (StringChr(ptr, '>')) {
3274  fuzzto = true;
3275  while (*ptr != '\0' && isdigit(*ptr) == 0)
3276  ptr++;
3277  to = (Int4)atoi(ptr);
3278  } else if (StringChr(ptr, '?')) {
3279  to = static_cast<Int4>(ibp->bases);
3280  noto = true;
3281  } else
3282  to = (Int4)atoi(ptr);
3283 
3284  if (initmet == false && to != 0)
3285  to--;
3286  if (nofrom && noto)
3287  pntfuzz = true;
3288 
3289  if (bond) {
3290  CSeq_bond& bond = loc->SetBond();
3291  CSeq_point& point_a = bond.SetA();
3292 
3293  point_a.SetPoint(from);
3294  point_a.SetId(*MakeAccSeqId(ibp->acnum, pp->seqtype, pp->accver, ibp->vernum));
3295 
3296  if (fuzzfrom)
3297  GetIntFuzzPtr(4, 2, 0, point_a.SetFuzz());
3298 
3299  if (from != to) {
3300  CSeq_point& point_b = bond.SetB();
3301  point_b.SetPoint(to);
3302  point_b.SetId(*MakeAccSeqId(ibp->acnum, pp->seqtype, pp->accver, ibp->vernum));
3303 
3304  if (fuzzto)
3305  GetIntFuzzPtr(4, 1, 0, point_b.SetFuzz());
3306  }
3307  } else if (from != to && ! pntfuzz) {
3308  CSeq_interval& interval = loc->SetInt();
3309  interval.SetFrom(from);
3310  interval.SetTo(to);
3311  interval.SetId(*MakeAccSeqId(ibp->acnum, pp->seqtype, pp->accver, ibp->vernum));
3312 
3313  if (fuzzfrom)
3314  GetIntFuzzPtr(4, 2, 0, interval.SetFuzz_from()); /* lim, lt, no-min */
3315 
3316  if (nofrom)
3317  GetIntFuzzPtr(2, to - 1, 0, interval.SetFuzz_from()); /* range, max, min */
3318 
3319  if (noto)
3320  GetIntFuzzPtr(2, to, from + 1, interval.SetFuzz_to()); /* range, max, min */
3321 
3322  if (fuzzto)
3323  GetIntFuzzPtr(4, 1, 0, interval.SetFuzz_to()); /* lim, gt, no-min */
3324  } else {
3325  CSeq_point& point = loc->SetPnt();
3326  point.SetPoint(from);
3327  point.SetId(*MakeAccSeqId(ibp->acnum, pp->seqtype, pp->accver, ibp->vernum));
3328 
3329  if (pntfuzz) {
3330  GetIntFuzzPtr(2, to, from, point.SetFuzz()); /* range, max, min */
3331  } else if (fuzzfrom) {
3332  GetIntFuzzPtr(4, 2, 0, point.SetFuzz());
3333  }
3334  }
3335 
3336  return loc;
3337 }
3338 
3339 /**********************************************************
3340  *
 * static char* DelTheStr(sourcestr, targetstr):
3342  *
3343  * Return a string with deleted "targetstr".
3344  * Also Free out "sourcestr".
3345  *
3346  **********************************************************/
3347 /* bsv : 03/04/2020 : no Seq-feat.exp-ev setting anymore
3348 static void DelTheStr(string& sourcestr, const string& targetstr)
3349 {
3350  NStr::ReplaceInPlace(sourcestr, targetstr, "", 0, 1);
3351  NStr::TruncateSpacesInPlace(sourcestr, NStr::eTrunc_End);
3352 }
3353 */
3354 
3355 /**********************************************************
3356  *
3357  * static bool SPFeatNoExp(pp, spfip):
3358  *
3359  * Return TRUE if "str" containing any string in the
3360  * ParFlat_SPFeatNoExp or ParFlat_SPFeatNoExpW (old
3361  * patterns, put warning message).
3362  *
3363  * 10-18-93
3364  *
3365  **********************************************************/
3366 /* bsv : 03/04/2020 : no Seq-feat.exp-ev setting anymore
3367 static bool SPFeatNoExp(ParserPtr pp, SPFeatInputPtr spfip)
3368 {
3369  Int2 indx;
3370  Int4 len = 0;
3371 
3372  if (!spfip)
3373  return false;
3374 
3375  if (MatchArrayISubString(ParFlat_SPFeatNoExp, spfip->descrip) != -1)
3376  return true;
3377 
3378  indx = MatchArrayISubString(ParFlat_SPFeatNoExpW, spfip->descrip);
3379  if (indx < 0)
3380  return false;
3381 
3382  DelTheStr(spfip->descrip, ParFlat_SPFeatNoExpW[indx]);
3383  if (len > 0 && spfip->descrip[len-1] != '.')
3384  {
3385  StringCombine(spfip->descrip, ".", nullptr);
3386  }
3387 
3388  ErrPostEx(SEV_WARNING, ERR_FEATURE_OldNonExp,
3389  "Old Non-experimental feature description, %s",
3390  ParFlat_SPFeatNoExpW[indx]);
3391 
3392  return true;
3393 }
3394 */
3395 
3396 /**********************************************************
3397  *
3398  * static Int2 GetSPSitesMod(retstr):
3399  *
3400  * Return an index array of ParFlat_SPFEAT for
3401  * a specific type of modified residue because the first
3402  * several words of a MOD_RES feature's description can
3403  * indicate a more specific type of modified residue.
3404  *
3405  * 10-18-93
3406  *
3407  **********************************************************/
3408 static Int2 GetSPSitesMod(string& retstr)
3409 {
3410  Int2 ret = ParFlatSPSitesModB;
3411 
3412  for (Int2 i = ParFlatSPSitesModB; i <= ParFlatSPSitesModE; i++) {
3413  size_t pos = NStr::FindNoCase(retstr, ParFlat_SPFeat[i].keystring, 0);
3414  if (pos == NPOS)
3415  continue;
3416 
3417  size_t len = StringLen(ParFlat_SPFeat[i].keystring);
3418  if ((pos != 0 && retstr[pos - 1] != ' ' && retstr[pos - 1] != '.') ||
3419  (retstr[pos + len] != '\0' && retstr[pos + len] != ' ' &&
3420  retstr[pos + len] != '.' && retstr[pos + len] != ';'))
3421  continue;
3422 
3423  ret = i;
3424  break;
3425  }
3426 
3427  return (ret);
3428 }
3429 
3430 /**********************************************************
3431  *
3432  * Int2 SpFeatKeyNameValid(keystr):
3433  *
3434  * 10-18-93
3435  *
3436  **********************************************************/
3438 {
3439  Int2 i;
3440 
3441  for (i = 0; ParFlat_SPFeat[i].inkey; i++)
3442  if (NStr::CompareNocase(ParFlat_SPFeat[i].inkey, keystr) == 0)
3443  break;
3444 
3445  if (! ParFlat_SPFeat[i].inkey)
3446  return (-1);
3447  return (i);
3448 }
3449 
3450 /**********************************************************/
3452 {
3453  string descrip;
3454  char* loc;
3455  char* p;
3456  Uint1 type;
3457  Int2 indx;
3458  bool err = false;
3459 
3460  descrip.assign(CpTheQualValue(fbp->quals, "note"));
3461 
3462  if (NStr::EqualNocase(fbp->key, "VARSPLIC")) {
3463  ErrPostStr(SEV_WARNING, ERR_FEATURE_ObsoleteFeature, "Obsolete UniProt feature \"VARSPLIC\" found. Replaced with \"VAR_SEQ\".");
3464  fbp->key = "VAR_SEQ";
3465  }
3466 
3467  if (NStr::EqualNocase(fbp->key, "NON_STD")) {
3468  if (NStr::EqualNocase(descrip, "Selenocysteine.")) {
3469  fbp->key = "SE_CYS";
3470  descrip.clear();
3471  } else
3472  fbp->key = "MOD_RES";
3473  }
3474 
3475  CRef<CSeq_feat> feat(new CSeq_feat);
3476  indx = fbp->spindex;
3477  type = ParFlat_SPFeat[indx].type;
3478  if (type == ParFlatSPSites) {
3479  if (indx == ParFlatSPSitesModB && ! descrip.empty())
3480  indx = GetSPSitesMod(descrip);
3481 
3482  feat->SetData().SetSite(static_cast<CSeqFeatData::ESite>(ParFlat_SPFeat[indx].keyint));
3483  } else if (type == ParFlatSPBonds) {
3484  feat->SetData().SetBond(static_cast<CSeqFeatData::EBond>(ParFlat_SPFeat[indx].keyint));
3485  } else if (type == ParFlatSPRegions) {
3486  feat->SetData().SetRegion(ParFlat_SPFeat[indx].keystring);
3487  } else if (type == ParFlatSPImports) {
3488  feat->SetData().SetImp().SetKey(ParFlat_SPFeat[indx].keystring);
3489  feat->SetData().SetImp().SetDescr("uncertain amino acids");
3490  } else {
3491  if (type != ParFlatSPInitMet && type != ParFlatSPNonTer &&
3492  type != ParFlatSPNonCons) {
3493  ErrPostEx(SEV_WARNING, ERR_FEATURE_Dropped, "Swiss-Prot feature \"%s\" with unknown type dropped.", fbp->key.c_str());
3494  }
3495  feat->Reset();
3496  return (null);
3497  }
3498 
3499  if (fbp->location_isset()) {
3500  loc = fbp->location;
3501  for (p = loc; *p; p++)
3502  if (*p != ' ')
3503  *loc++ = *p;
3504  *loc = '\0';
3505  pp->buf = fbp->key + " : " + fbp->location_get();
3506  GetSeqLocation(*feat, fbp->location, seqids, &err, pp, fbp->key);
3507  pp->buf.reset();
3508  }
3509  if (err) {
3510  if (! pp->debug) {
3511  ErrPostEx(SEV_ERROR, ERR_FEATURE_Dropped, "%s|%s| range check detects problems", fbp->key.c_str(), fbp->location_c_str());
3512  if (! descrip.empty())
3513  descrip.clear();
3514  feat->Reset();
3515  return (null);
3516  }
3517  ErrPostEx(SEV_WARNING, ERR_LOCATION_FailedCheck, "%s|%s| range check detects problems", fbp->key.c_str(), fbp->location_c_str());
3518  }
3519 
3520  if (SeqLocHaveFuzz(feat->GetLocation()))
3521  feat->SetPartial(true);
3522 
3523  if (! descrip.empty())
3524  feat->SetComment(descrip);
3525 
3526  return (feat);
3527 }
3528 
3529 /**********************************************************
3530  *
3531  * static void SPFeatGeneral(pp, spfip, initmet):
3532  *
3533  * 10-18-93
3534  *
3535  **********************************************************/
3536 static void SPFeatGeneral(ParserPtr pp, SPFeatInputPtr spfip, bool initmet, CSeq_annot::C_Data::TFtable& feats)
3537 {
3538  SPFeatInputPtr temp;
3539 
3540  Int2 indx;
3541  bool signal;
3542  bool bond;
3543  /* bsv : 03/04/2020 : no Seq-feat.exp-ev setting anymore
3544  bool noexp;
3545 */
3546  Uint1 type;
3547 
3548  for (temp = spfip; temp; temp = temp->next) {
3549  FtaInstallPrefix(PREFIX_FEATURE, temp->key.c_str(), temp->from.c_str());
3550 
3551  if (NStr::EqualNocase("VARSPLIC", temp->key)) {
3552  ErrPostStr(SEV_WARNING, ERR_FEATURE_ObsoleteFeature, "Obsolete UniProt feature \"VARSPLIC\" found. Replaced with \"VAR_SEQ\".");
3553  temp->key = "VAR_SEQ";
3554  }
3555 
3556  if (NStr::EqualNocase(temp->key, "NON_STD")) {
3557  if (NStr::EqualNocase(temp->descrip, "Selenocysteine.")) {
3558  temp->key = "SE_CYS";
3559  temp->descrip.clear();
3560  } else
3561  temp->key = "MOD_RES";
3562  }
3563 
3564  indx = SpFeatKeyNameValid(temp->key.c_str());
3565  if (indx == -1) {
3568  continue;
3569  }
3570 
3571  signal = false;
3572  bond = false;
3573 
3574  /* bsv : 03/04/2020 : no Seq-feat.exp-ev setting anymore
3575  noexp = SPFeatNoExp(pp, temp);
3576 */
3577 
3578  CRef<CSeq_feat> feat(new CSeq_feat);
3579 
3580  type = ParFlat_SPFeat[indx].type;
3581  if (type == ParFlatSPSites) {
3582  if (indx == ParFlatSPSitesModB)
3583  indx = GetSPSitesMod(temp->descrip);
3584 
3585  feat->SetData().SetSite(static_cast<CSeqFeatData::ESite>(ParFlat_SPFeat[indx].keyint));
3586  } else if (type == ParFlatSPBonds) {
3587  feat->SetData().SetBond(static_cast<CSeqFeatData::EBond>(ParFlat_SPFeat[indx].keyint));
3588  bond = true;
3589  } else if (type == ParFlatSPRegions) {
3590  feat->SetData().SetRegion(ParFlat_SPFeat[indx].keystring);
3591  if (feat->GetData().GetRegion() == "Signal")
3592  signal = true;
3593  } else if (type == ParFlatSPImports) {
3594  feat->SetData().SetImp().SetKey(ParFlat_SPFeat[indx].keystring);
3595  feat->SetData().SetImp().SetDescr("uncertain amino acids");
3596  } else {
3597  if (type != ParFlatSPInitMet && type != ParFlatSPNonTer &&
3598  type != ParFlatSPNonCons) {
3599  ErrPostEx(SEV_WARNING, ERR_FEATURE_Dropped, "Swiss-Prot feature \"%s\" with unknown type dropped.", temp->key.c_str());
3600  }
3602  continue;
3603  }
3604 
3605  /* bsv : 03/04/2020 : no Seq-feat.exp-ev setting anymore
3606  if(noexp)
3607  feat->SetExp_ev(CSeq_feat::eExp_ev_not_experimental);
3608  else
3609  feat->SetExp_ev(CSeq_feat::eExp_ev_experimental);
3610 */
3611 
3612 
3613  CRef<CSeq_loc> loc = GetSPSeqLoc(pp, temp, bond, initmet, signal);
3614  if (loc.NotEmpty())
3615  feat->SetLocation(*loc);
3616 
3617  if (SeqLocHaveFuzz(*loc))
3618  feat->SetPartial(true);
3619 
3620  if (! temp->descrip.empty())
3621  feat->SetComment(NStr::Sanitize(temp->descrip));
3622 
3623  feats.push_back(feat);
3624 
3626  }
3627 }
3628 
/**********************************************************
 *
 *   static void DelParenthesis(str):
 *
 *      Cleans "str" in place:
 *      - strips leading and trailing blanks and tabs;
 *      - strips the run of '(' at the start and the run of
 *        ')' at the end;
 *      - if the remaining text still has unbalanced
 *        parentheses, gives back as many of the stripped
 *        leading '(' or trailing ')' as are needed (and
 *        available) to balance it.
 *
 *      Examples: "((abc))" -> "abc", "f(x)" -> "f(x)",
 *   "(a)(b)" -> "(a)(b)". An empty or all-blank string
 *   becomes "".
 *
 *      The result is never longer than the input, so the
 *   caller's buffer is always large enough.
 *
 **********************************************************/
static void DelParenthesis(char* str)
{
    char* first; /* first non-blank character */
    char* last;  /* last non-blank character */
    char* head;  /* first character that is kept */
    char* tail;  /* one past the last character that is kept */
    char* scan;
    char* dst;
    int   opened;
    int   closed;
    int   balance;

    for (first = str; *first == ' ' || *first == '\t';)
        first++;

    /* Nothing but blanks: there is no "last character", so return
       before the code below would write past the terminator. */
    if (*first == '\0') {
        *str = '\0';
        return;
    }

    for (last = first; last[1] != '\0';)
        last++;
    while (last > first && (*last == ' ' || *last == '\t'))
        *last-- = '\0';

    for (head = first; *head == '(';)
        head++;

    /* bounds are tested before the character is read */
    for (tail = last + 1; tail > head && tail[-1] == ')';)
        tail--;

    /* "balance" is (number of '(') - (number of ')') as seen at the
       last ')' inside the kept text; negative means some ')' has no
       partner, so stripped leading '(' are given back. */
    balance = 0;
    opened  = 0;
    closed  = 0;
    for (scan = head; scan < tail; scan++) {
        if (*scan == '(')
            opened++;
        else if (*scan == ')') {
            closed++;
            balance = opened - closed;
        }
    }
    for (; balance < 0 && head > first; head--)
        balance++;

    /* Now (number of ')') - (number of '(') over the kept text;
       negative means some '(' is unclosed, so stripped trailing ')'
       are given back. */
    balance = 0;
    for (scan = head; scan < tail; scan++) {
        if (*scan == '(')
            balance--;
        else if (*scan == ')')
            balance++;
    }
    for (; balance < 0 && tail <= last; tail++)
        balance++;

    *tail = '\0';

    /* Shift the kept text down to the start of the buffer. Source and
       destination overlap, but the destination is always below the
       source, so a forward byte copy is safe. */
    if (head != str) {
        dst = str;
        while ((*dst++ = *head++) != '\0')
            ;
    }
}
3678 
3679 /**********************************************************
3680  *
3681  * static void CkGeneNameSP(gname):
3682  *
3683  * Legal characters for gene_name are 0-9, a-z, A-Z,
3684  * under-score, dash, period, single quote, back single
3685  * quote, slash.
3686  *
3687  * 10-25-93
3688  *
3689  **********************************************************/
3690 static void CkGeneNameSP(char* gname)
3691 {
3692  char* p;
3693 
3694  DelParenthesis(gname);
3695  for (p = gname; *p != '\0'; p++)
3696  if (! (isalnum(*p) || *p == '_' || *p == '-' || *p == '.' ||
3697  *p == '\'' || *p == '`' || *p == '/' || *p == '(' || *p == ')'))
3698  break;
3699  if (*p != '\0')
3700  ErrPostEx(SEV_WARNING, ERR_GENENAME_IllegalGeneName, "gene_name contains unusual characters, %s, in SWISS-PROT", gname);
3701 }
3702 
3703 /**********************************************************
3704  *
3705  * static void ParseGeneNameSP(str, feat):
3706  *
3707  * gene_name and synonyms separated by " OR ".
3708  *
3709  * 10-25-93
3710  *
3711  **********************************************************/
3712 static void ParseGeneNameSP(char* str, CSeq_feat& feat)
3713 {
3714  char* p;
3715  char* q;
3716  Int2 count = 0;
3717 
3718  CGene_ref& gene = feat.SetData().SetGene();
3719 
3720  for (p = str; *p != '\0';) {
3721  while (*p == ' ')
3722  p++;
3723  for (q = p; *p != '\0' && *p != ' ';)
3724  p++;
3725  if (*p != '\0')
3726  *p++ = '\0';
3727  if (StringEqu(q, "AND") || StringEqu(q, "OR"))
3728  continue;
3729  char* gname = StringSave(q);
3730  CkGeneNameSP(gname);
3731  if (count == 0) {
3732  count++;
3733  gene.SetLocus(gname);
3734  } else {
3735  gene.SetSyn().push_back(gname);
3736  }
3737  MemFree(gname);
3738  }
3739 }
3740 
3741 /**********************************************************
3742  *
3743  * static CRef<CSeq_loc> GetSeqLocIntSP(seqlen, acnum,
3744  * accver, vernum):
3745  *
3746  * 10-18-93
3747  *
3748  **********************************************************/
3749 static CRef<CSeq_loc> GetSeqLocIntSP(size_t seqlen, char* acnum, bool accver, Int2 vernum)
3750 {
3751  CRef<CSeq_loc> loc(new CSeq_loc);
3752  CSeq_interval& interval = loc->SetInt();
3753 
3754  interval.SetFrom(0);
3755  interval.SetTo(static_cast<TSeqPos>(seqlen) - 1);
3756  interval.SetId(*MakeAccSeqId(acnum, CSeq_id::e_Swissprot, accver, vernum));
3757 
3758  return loc;
3759 }
3760 
3761 /**********************************************************
3762  *
3763  * static void GetOneGeneRef(pp, hsfp, bptr,
3764  * seqlen):
3765  *
3766  * Each Gene-ref separated by " AND ".
3767  *
3768  * 10-25-93
3769  *
3770  **********************************************************/
3771 static void GetOneGeneRef(ParserPtr pp, CSeq_annot::C_Data::TFtable& feats, char* bptr, size_t seqlen)
3772 {
3773  IndexblkPtr ibp;
3774 
3775  char* str;
3776  char* ptr;
3777 
3778  if (! pp || pp->entrylist.empty())
3779  return;
3780 
3781  ibp = pp->entrylist[pp->curindx];
3782  if (! ibp)
3783  return;
3784 
3785  str = StringSave(bptr);
3786  for (ptr = str; *ptr != '\0'; ptr++)
3787  if (*ptr == '\t')
3788  *ptr = ' ';
3789 
3791 
3792  CRef<CSeq_feat> feat(new CSeq_feat);
3793  ParseGeneNameSP(str, *feat);
3794  feat->SetLocation(*GetSeqLocIntSP(seqlen, ibp->acnum, pp->accver, ibp->vernum));
3795 
3796  feats.push_back(feat);
3797 }
3798 
3799 /**********************************************************/
3800 static void SPFreeGenRefTokens(char* name, char* syns, char* ltags, char* orfs)
3801 {
3802  if (name)
3803  MemFree(name);
3804  if (syns)
3805  MemFree(syns);
3806  if (ltags)
3807  MemFree(ltags);
3808  if (orfs)
3809  MemFree(orfs);
3810 }
3811 
3812 /**********************************************************/
3813 static void SPParseGeneRefTag(char* str, CGene_ref& gene, bool set_locus_tag)
3814 {
3815  char* p;
3816  char* q;
3817 
3818  if (! str)
3819  return;
3820 
3821  for (p = str; p && *p != '\0'; p = q) {
3822  while (*p == ' ' || *p == ',')
3823  p++;
3824  q = StringChr(p, ',');
3825  if (q)
3826  *q++ = '\0';
3827  if (q == p)
3828  continue;
3829  if (set_locus_tag && ! gene.IsSetLocus_tag()) {
3830  gene.SetLocus_tag(p);
3831  continue;
3832  }
3833 
3834  gene.SetSyn().push_back(p);
3835  }
3836 }
3837 
3838 /**********************************************************/
3839 static void SPGetOneGeneRefNew(ParserPtr pp, CSeq_annot::C_Data::TFtable& feats, size_t seqlen, char* name, char* syns, char* ltags, char* orfs)
3840 {
3841  IndexblkPtr ibp;
3842 
3843  if (! pp || pp->entrylist.empty() ||
3844  (! name && ! syns && ! ltags && ! orfs))
3845  return;
3846 
3847  ibp = pp->entrylist[pp->curindx];
3848  if (! ibp)
3849  return;
3850 
3851  CRef<CSeq_feat> feat(new CSeq_feat);
3852  CGene_ref& gene = feat->SetData().SetGene();
3853 
3854  if (name)
3855  gene.SetLocus(name);
3856 
3857 
3858  SPParseGeneRefTag(syns, gene, false);
3859  SPParseGeneRefTag(ltags, gene, true);
3860  SPParseGeneRefTag(orfs, gene, true);
3861 
3862  feat->SetLocation(*GetSeqLocIntSP(seqlen, ibp->acnum, pp->accver, ibp->vernum));
3863 
3864  feats.push_back(feat);
3865 }
3866 
3867 /**********************************************************/
3868 static void SPGetGeneRefsNew(ParserPtr pp, CSeq_annot::C_Data::TFtable& feats, char* bptr, size_t seqlen)
3869 {
3870  IndexblkPtr ibp;
3871 
3872  char* name;
3873  char* syns;
3874  char* ltags;
3875  char* orfs;
3876  char* str;
3877  char* p;
3878  char* q;
3879  char* r;
3880 
3881  if (! pp || pp->entrylist.empty() || ! bptr)
3882  return;
3883 
3884  ibp = pp->entrylist[pp->curindx];
3885  if (! ibp)
3886  return;
3887 
3888  str = StringSave(bptr);
3889 
3890  name = nullptr;
3891  syns = nullptr;
3892  ltags = nullptr;
3893  orfs = nullptr;
3894  for (p = str; p && *p != '\0'; p = q) {
3895  while (*p == ' ' || *p == ';')
3896  p++;
3897  for (r = p;; r = q + 1) {
3898  q = StringChr(r, ';');
3899  if (! q || q[1] == ' ' || q[1] == '\n' || q[1] == '\0')
3900  break;
3901  }
3902  if (q)
3903  *q++ = '\0';
3904  if (StringEquNI(p, "Name=", 5)) {
3905  if (name) {
3906  ErrPostEx(SEV_REJECT, ERR_FORMAT_ExcessGeneFields, "Field \"Name=\" occurs multiple times within a GN line. Entry dropped.");
3907  ibp->drop = true;
3908  break;
3909  }
3910  p += 5;
3911  if (p != q)
3912  name = StringSave(p);
3913  } else if (StringEquNI(p, "Synonyms=", 9)) {
3914  if (syns) {
3915  ErrPostEx(SEV_REJECT, ERR_FORMAT_ExcessGeneFields, "Field \"Synonyms=\" occurs multiple times within a GN line. Entry dropped.");
3916  ibp->drop = true;
3917  break;
3918  }
3919  p += 9;
3920  if (p != q)
3921  syns = StringSave(p);
3922  } else if (StringEquNI(p, "OrderedLocusNames=", 18)) {
3923  if (ltags) {
3924  ErrPostEx(SEV_REJECT, ERR_FORMAT_ExcessGeneFields, "Field \"OrderedLocusNames=\" occurs multiple times within a GN line. Entry dropped.");
3925  ibp->drop = true;
3926  break;
3927  }
3928  p += 18;
3929  if (p != q)
3930  ltags = StringSave(p);
3931  } else if (StringEquNI(p, "ORFNames=", 9)) {
3932  if (orfs) {
3933  ErrPostEx(SEV_REJECT, ERR_FORMAT_ExcessGeneFields, "Field \"ORFNames=\" occurs multiple times within a GN line. Entry dropped.");
3934  ibp->drop = true;
3935  break;
3936  }
3937  p += 9;
3938  if (p != q)
3939  orfs = StringSave(p);
3940  } else if (StringEquNI(p, "and ", 4)) {
3941  if (q)
3942  *--q = ';';
3943  q = p + 4;
3944 
3945  if (! name && ! syns && ! ltags && ! orfs)
3946  continue;
3947 
3948  if (! name && syns) {
3949  ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingGeneName, "Encountered a gene with synonyms \"%s\" that lacks a gene symbol.", syns);
3950  }
3951 
3952  SPGetOneGeneRefNew(pp, feats, seqlen, name, syns, ltags, orfs);
3953  SPFreeGenRefTokens(name, syns, ltags, orfs);
3954  name = nullptr;
3955  syns = nullptr;
3956  ltags = nullptr;
3957  orfs = nullptr;
3958  } else {
3959  ErrPostEx(SEV_REJECT, ERR_FORMAT_UnknownGeneField, "Field \"%s\" is not a legal field for the GN linetype. Entry dropped.", p);
3960  ibp->drop = true;
3961  break;
3962  }
3963  }
3964 
3965  MemFree(str);
3966 
3967  if (! name && ! syns && ! ltags && ! orfs)
3968  return;
3969 
3970  if (ibp->drop) {
3971  SPFreeGenRefTokens(name, syns, ltags, orfs);
3972  return;
3973  }
3974 
3975  SPGetOneGeneRefNew(pp, feats, seqlen, name, syns, ltags, orfs);
3976 
3977  SPFreeGenRefTokens(name, syns, ltags, orfs);
3978 }
3979 
3980 /**********************************************************
3981  *
3982  * static Int4 GetSeqLen(entry):
3983  *
3984  * 11-3-93
3985  *
3986  **********************************************************/
3988 {
3989  EntryBlkPtr ebp = static_cast<EntryBlk*>(entry->mpData);
3990  const CBioseq& bioseq = ebp->seq_entry->GetSeq();
3991  return bioseq.GetLength();
3992 }
3993 
3994 /**********************************************************
3995  *
3996  * static void SPFeatGeneRef(pp, hsfp, entry):
3997  *
3998  * sfp->mpData: gene (Gene-ref).
3999  * Data from GN lines:
4000  * - legal characters for gene_name are 0-9, a-z, A-Z,
4001  * under-score, dash, period, single quote, back
4002  * single quote, slash;
4003  * - each Gene-ref separated by " AND ";
4004  * - gene_name and synonyms separated by " OR ", the
4005  * first one before " OR " is gene_name, others are
4006  * synonyms.
4007  *
 * sfp->location: SEQLOC_INT, always from 0 to
 * sequence_length.
4010  *
4011  * Output warning message:
4012  * - if DE line containing "(GENE NAME:...)" clause
4013  * (SPFeatProtRef routine);
4014  * - or other illegal character s.t. white space in the
4015  * gene_name.
4016  *
4017  * 10-25-93
4018  *
4019  **********************************************************/
4021 {
4022  char* offset;
4023  char* str;
4024 
4025  size_t len = 0;
4026  offset = SrchNodeType(entry, ParFlatSP_GN, &len);
4027  if (! offset)
4028  return;
4029 
4030  string str_ = GetBlkDataReplaceNewLine(string_view(offset, len), ParFlat_COL_DATA_SP);
4031  StripECO(str_);
4032  str = StringSave(str_);
4033  if (! str)
4034  return;
4035 
4036  len = GetSeqLen(entry);
4037  if (! StringIStr(str, "Name=") &&
4038  ! StringIStr(str, "Synonyms=") &&
4039  ! StringIStr(str, "OrderedLocusNames=") &&
4040  ! StringIStr(str, "ORFNames="))
4041  GetOneGeneRef(pp, feats, str, len);
4042  else
4043  SPGetGeneRefsNew(pp, feats, str, len);
4044 
4045  MemFree(str);
4046 }
4047 
4048 /**********************************************************/
4049 static void SPValidateEcnum(string& ecnum)
4050 {
4051  char* p;
4052  char* q;
4053  char* buf;
4054  Int4 count;
4055 
4056  buf = StringSave(ecnum);
4057  for (count = 0, q = buf;; q = p) {
4058  p = q;
4059  count++;
4060  if (*p == '-') {
4061  p++;
4062  if (*p != '.')
4063  break;
4064  p++;
4065  continue;
4066  }
4067  if (*p == 'n') {
4068  p++;
4069  if (*p == '.' || *p == '\0') {
4070  count = 0;
4071  break;
4072  }
4073  }
4074  while (*p >= '0' && *p <= '9')
4075  p++;
4076  if (*q == 'n' && (*p == '.' || *p == '\0')) {
4077  fta_StringCpy(q + 1, p);
4078  p = q + 1;
4079  }
4080  if (p == q) {
4081  count = 0;
4082  break;
4083  }
4084  if (*p != '.')
4085  break;
4086  p++;
4087  }
4088 
4089  if (count != 4 || *p != '\0') {
4090  ErrPostEx(SEV_ERROR, ERR_FORMAT_InvalidECNumber, "Invalid EC number provided in SwissProt DE line: \"%s\". Preserve it anyway.", ecnum.c_str());
4091  } else
4092  ecnum = buf;
4093  MemFree(buf);
4094 }
4095 
4096 /**********************************************************/
4098 {
4099  Char ch;
4100 
4101  for (; sfp; sfp = sfp->next) {
4102  if (sfp->tag == SPDE_RECNAME || sfp->tag == SPDE_ALTNAME ||
4103  sfp->tag == SPDE_SUBNAME || sfp->tag == SPDE_FLAGS)
4104  break;
4105  if (sfp->tag != tag)
4106  continue;
4107 
4108  ch = *sfp->end;
4109  *sfp->end = '\0';
4110 
4111  prot.SetName().push_back(sfp->start);
4112  *sfp->end = ch;
4113  }
4114 }
4115 
4116 /**********************************************************/
4117 static void SPValidateDefinition(SPDEFieldsPtr sfp, bool* drop, bool is_trembl)
4118 {
4119  SPDEFieldsPtr tsfp;
4120  Int4 rcount;
4121  Int4 scount;
4122  Int4 fcount;
4123 
4124  for (rcount = 0, scount = 0, tsfp = sfp; tsfp; tsfp = tsfp->next) {
4125  if (tsfp->tag == SPDE_RECNAME)
4126  rcount++;
4127  else if (tsfp->tag == SPDE_SUBNAME)
4128  scount++;
4129  }
4130 
4131  for (fcount = 0, tsfp = sfp; tsfp; tsfp = tsfp->next) {
4132  if (tsfp->tag != SPDE_RECNAME)
4133  continue;
4134  for (tsfp = tsfp->next; tsfp; tsfp = tsfp->next) {
4135  if (tsfp->tag == SPDE_RECNAME || tsfp->tag == SPDE_ALTNAME ||
4136  tsfp->tag == SPDE_SUBNAME || tsfp->tag == SPDE_FLAGS)
4137  break;
4138  if (tsfp->tag == SPDE_FULL)
4139  fcount++;
4140  }
4141  if (! tsfp)
4142  break;
4143  }
4144 
4145  if (rcount > 1) {
4146  ErrPostEx(SEV_REJECT, ERR_FORMAT_MultipleRecName, "This UniProt record has multiple RecName protein-name categories, but only one is allowed. Entry dropped.");
4147  *drop = true;
4148  } else if (rcount == 0 && ! is_trembl) {
4149  ErrPostEx(SEV_REJECT, ERR_FORMAT_MissingRecName, "This UniProt/Swiss-Prot record lacks required RecName protein-name categorie. Entry dropped.");
4150  *drop = true;
4151  }
4152 
4153  if (scount > 0 && ! is_trembl) {
4154  ErrPostEx(SEV_REJECT, ERR_FORMAT_SwissProtHasSubName, "This UniProt/Swiss-Prot record includes a SubName protein-name category, which should be used only for UniProt/TrEMBL. Entry dropped.");
4155  *drop = true;
4156  }
4157 
4158  if (fcount == 0 && rcount > 0) {
4159  ErrPostEx(SEV_REJECT, ERR_FORMAT_MissingFullRecName, "This UniProt record lacks a Full name in the RecName protein-name category.");
4160  *drop = true;
4161  }
4162 }
4163 
4164 /**********************************************************/
4165 static void SPParseDefinition(char* str, const CBioseq::TId& ids, IndexblkPtr ibp, CProt_ref& prot)
4166 {
4167  CharIntLen* cilp;
4168  SPDEFieldsPtr sfp;
4169  SPDEFieldsPtr tsfp;
4170 
4171  bool is_trembl;
4172  char* p;
4173  char* q;
4174  char* r;
4175  Int4 count;
4176  Char ch;
4177 
4178  if (! str || (! StringEquNI(str, "RecName: ", 9) &&
4179  ! StringEquNI(str, "AltName: ", 9) &&
4180  ! StringEquNI(str, "SubName: ", 9)))
4181  return;
4182 
4183  is_trembl = false;
4184 
4185  for (const auto& id : ids) {
4186  if (! id->IsSwissprot())
4187  continue;
4188 
4189  if (id->GetSwissprot().IsSetRelease() &&
4190  NStr::CompareNocase(id->GetSwissprot().GetRelease().c_str(), "unreviewed") == 0)
4191  is_trembl = true;
4192  }
4193 
4194  sfp = new SPDEFields;
4195  sfp->tag = 0;
4196  sfp->next = nullptr;
4197 
4198  for (tsfp = sfp, p = str, count = 0; *p != '\0';) {
4199  while (*p == ' ')
4200  p++;
4201  for (q = p; *p != '\0' && *p != ' ';)
4202  p++;
4203  ch = *p;
4204  *p = '\0';
4205  for (cilp = spde_tags; cilp->str; cilp++)
4206  if (StringEquNI(cilp->str, q, cilp->len))
4207  break;
4208 
4209  *p = ch;
4210  if (! cilp->str)
4211  continue;
4212 
4213  if (tsfp->tag != 0) {
4214  if (q == tsfp->start)
4215  tsfp->end = q;
4216  else {
4217  for (r = q - 1; *r == ' ' || *r == ';';)
4218  r--;
4219  tsfp->end = r + 1;
4220  }
4221  }
4222 
4223  if (cilp->num == SPDE_INCLUDES || cilp->num == SPDE_CONTAINS)
4224  break;
4225 
4226  count++;
4227  tsfp->next = new SPDEFields;
4228  tsfp = tsfp->next;
4229  tsfp->tag = cilp->num;
4230  for (r = q + cilp->len; *r == ' ';)
4231  r++;
4232  tsfp->start = r;
4233  tsfp->next = nullptr;
4234  }
4235 
4236  if (*p == '\0')
4237  tsfp->end = p;
4238 
4239  SPValidateDefinition(sfp->next, &ibp->drop, is_trembl);
4240 
4241  for (tsfp = sfp->next; tsfp; tsfp = tsfp->next)
4242  if (tsfp->tag == SPDE_RECNAME)
4244  for (tsfp = sfp->next; tsfp; tsfp = tsfp->next)
4245  if (tsfp->tag == SPDE_RECNAME)
4247 
4248  for (tsfp = sfp->next; tsfp; tsfp = tsfp->next)
4249  if (tsfp->tag == SPDE_ALTNAME)
4251  for (tsfp = sfp->next; tsfp; tsfp = tsfp->next)
4252  if (tsfp->tag == SPDE_ALTNAME)
4254 
4255  for (tsfp = sfp->next; tsfp; tsfp = tsfp->next)
4256  if (tsfp->tag == SPDE_SUBNAME)
4258  for (tsfp = sfp->next; tsfp; tsfp = tsfp->next)
4259  if (tsfp->tag == SPDE_SUBNAME)
4261 }
4262 
4263 /**********************************************************/
4264 static void SPGetPEValue(DataBlkPtr entry, CSeq_feat& feat)
4265 {
4266  char* offset;
4267  char* buf;
4268  char* p;
4269  char* q;
4270 
4271  size_t len = 0;
4272  offset = SrchNodeType(entry, ParFlatSP_PE, &len);
4273  if (! offset || len < 1)
4274  return;
4275 
4276  buf = StringSave(string_view(offset, len - 1));
4277 
4278  for (q = buf + 2; *q == ' ';)
4279  q++;
4280  p = StringChr(q, ':');
4281  if (p)
4282  for (p++; *p == ' ';)
4283  p++;
4284  else
4285  p = q;
4286 
4287  q = StringRChr(p, ';');
4288  if (! q)
4289  q = StringChr(p, '\n');
4290  if (q)
4291  *q = '\0';
4292 
4293  if (MatchArrayIString(PE_values, p) < 0)
4294  ErrPostEx(SEV_ERROR, ERR_SPROT_PELine, "Unrecognized value is encountered in PE (Protein Existence) line: \"%s\".", p);
4295 
4296  CRef<CGb_qual> qual(new CGb_qual);
4297  qual->SetQual("UniProtKB_evidence");
4298  qual->SetVal(p);
4299  feat.SetQual().push_back(qual);
4300 
4301  MemFree(buf);
4302 }
4303 
4304 /**********************************************************
4305  *
4306  * static SeqFeatPtr SPFeatProtRef(pp, hsfp, entry,
4307  * spfbp):
4308  *
4309  * sfp->data: prot (Prot-ref):
4310  * - name: DE line, delete everything after " (" or "/";
4311  * - EC_number: if DE lines contains "(EC ...)".
4312  *
 * sfp->location: SEQLOC_INT, always from 0 to
 * sequence_length.
4315  *
4316  * 10-20-93
4317  *
4318  **********************************************************/
4320 {
 /*
  * Builds the protein (Prot-ref) feature for the current entry from its
  * DE block and appends it to `feats`.
  *
  * Reads:  entry (DE block, sequence length), pp->entrylist[pp->curindx],
  *         pp->accver, spfbp->nonter / noleft / noright.
  * Writes: feats (one feature appended), ibp->drop (set when the DE text
  *         starts with a Contains:/Includes: section).
  * Returns early, adding nothing, when the entry has no DE block.
  *
  * NOTE(review): the signature line of this function is not present in
  * this listing; pp, feats, entry and spfbp are used below without a
  * visible declaration and are presumably its parameters - confirm.
  */
4321  IndexblkPtr ibp;
4322 
4323  char* offset;
4324 
4325  char* str;
4326  string str1;
4327 
4328  char* ptr;
4329 
4330  const char* tag;
4331  Char symb;
4332  Int4 shift;
4333 
4334  EntryBlkPtr ebp;
4335 
4336  ebp = static_cast<EntryBlk*>(entry->mpData);
4337 
4338  CSeq_entry& seq_entry = *ebp->seq_entry;
4339  CBioseq& bioseq = seq_entry.SetSeq();
4340 
4341  size_t len = 0;
 /* Locate the DE block; without one there is nothing to build. */
4342  offset = SrchNodeType(entry, ParFlatSP_DE, &len);
4343  if (! offset)
4344  return;
4345 
4346  CRef<CSeq_feat> feat(new CSeq_feat);
4347  CProt_ref& prot = feat->SetData().SetProt();
4348 
 /* Fetch the DE text, pass it through StripECO, then drop any run of
  * trailing '.', ';' and ',' characters. */
4349  string str_ = GetBlkDataReplaceNewLine(string_view(offset, len), ParFlat_COL_DATA_SP);
4350  StripECO(str_);
4351  while (! str_.empty()) {
4352  char c = str_.back();
4353  if (c == '.' || c == ';' || c == ',')
4354  str_.pop_back();
4355  else
4356  break;
4357  }
4358 
 /* From here on the text is handled as a heap C string (`str`) that is
  * released with MemFree and re-created each time it is shortened. */
4359  ShrinkSpaces(str_);
4360  str = StringSave(str_);
4361 
4362  ibp = pp->entrylist[pp->curindx];
4363 
 /* DE text that begins with a Contains:/Includes: section: report it and
  * flag the entry as dropped. Processing still continues below. */
4364  if (StringEquNI(str, "Contains: ", 10) ||
4365  StringEquNI(str, "Includes: ", 10)) {
4366  ErrPostEx(SEV_REJECT, ERR_FORMAT_NoProteinNameCategory, "DE lines do not have a non-Includes/non-Contains RecName, AltName or SubName protein name category. Entry dropped.");
4367  ibp->drop = true;
4368  }
4369 
 /* Choose how EC numbers are delimited:
  *  - text starting with RecName:/AltName:/SubName: uses "; EC=<number>;"
  *    and is additionally handed to SPParseDefinition;
  *  - any other text uses "(EC <number>)".
  * `shift` is the length of `tag`; `symb` is the character that ends the
  * number. */
4370  if (StringEquNI(str, "RecName: ", 9) ||
4371  StringEquNI(str, "AltName: ", 9) ||
4372  StringEquNI(str, "SubName: ", 9)) {
4373  tag = "; EC=";
4374  symb = ';';
4375  shift = 5;
4376  SPParseDefinition(str, bioseq.GetId(), ibp, prot);
4377  } else {
4378  tag = "(EC";
4379  symb = ')';
4380  shift = 3;
4381  }
4382 
 /* Extract every EC number. Each pass removes one tag plus its number
  * from `str`, so the loop ends when no tag is left. */
4383  while ((ptr = StringStr(str, tag))) {
4384  len = StringLen(str);
 /* str1 = everything in front of the tag. */
4385  str1.assign(str, ptr);
4386 
 /* Step over the tag and any blanks that follow it. */
4387  ptr += shift;
4388  while (*ptr == ' ')
4389  ptr++;
4390 
 /* The number runs up to the next blank, `symb`, or end of string. */
4391  char* bptr;
4392  for (bptr = ptr; *ptr != '\0' && *ptr != ' ' && *ptr != symb;)
4393  ptr++;
4394  if (ptr > bptr) {
4395  string ecnum(bptr, ptr);
 /* NOTE(review): the emptiness check right below suggests that
  * SPValidateEcnum may clear a rejected value - confirm. */
4396  SPValidateEcnum(ecnum);
4397 
4398  if (! ecnum.empty())
4399  prot.SetEc().push_back(ecnum);
4400  } else {
4401  ErrPostEx(SEV_WARNING, ERR_FORMAT_ECNumberNotPresent, "Empty EC number provided in SwissProt DE line.");
4402  }
4403 
 /* "(EC ...)" form only: also skip the closing ')' and blanks. */
4404  if (symb == ')') {
4405  while (*ptr != '\0' && (*ptr == ' ' || *ptr == symb))
4406  ptr++;
 /* NOTE(review): the statement controlled by this `if` is not present
  * in this listing. */
4407  if (StringLen(ptr) <= 1)
4409  }
4410 
 /* Re-join the text before and after the EC number and scan again. */
4411  str1 += ptr;
4412 
4413  MemFree(str);
4414  str = StringSave(str1);
4415  }
4416 
 /* "(EC ...)" form only: cut the text at each remaining " (" or " /",
  * keeping what precedes it.
  * NOTE(review): one line of this loop body appears to be absent from
  * this listing. */
4417  if (symb == ')') {
4418  while ((ptr = StringStr(str, " (")) ||
4419  (ptr = StringStr(str, " /"))) {
4420  str1.assign(str, ptr);
4422 
4423  MemFree(str);
4424  str = StringSave(str1);
4425  }
4426  }
4427 
 /* Use the remaining text as the protein name only if no name has been
  * set on `prot` so far. */
4428  if (! prot.IsSetName())
4429  prot.SetName().push_back(str);
4430 
4431  MemFree(str);
4432 
 /* Location is produced by GetSeqLocIntSP from the entry's sequence
  * length and the accession/version taken from the index block. */
4433  feat->SetLocation(*GetSeqLocIntSP(GetSeqLen(entry), ibp->acnum, pp->accver, ibp->vernum));
4434 
 /* When `nonter` is set, mark the feature partial and add fuzz on the
  * end(s) flagged by noleft / noright. */
4435  if (spfbp->nonter) {
4436  feat->SetPartial(true);
4437 
4438  if (spfbp->noleft)
4439  GetIntFuzzPtr(4, 2, 0, feat->SetLocation().SetInt().SetFuzz_from()); /* lim, lt, no-min */
4440  if (spfbp->noright)
4441  GetIntFuzzPtr(4, 1, 0, feat->SetLocation().SetInt().SetFuzz_to()); /* lim, gt, no-min */
4442  }
4443 
 /* Let SPGetPEValue add what it derives from the entry to the feature,
  * then hand the finished feature to the caller's list. */
4444  SPGetPEValue(entry, *feat);
4445 
4446  feats.push_back(feat);
4447 }
4448 
4449 /**********************************************************
4450  *
4451  * static SPSegLocPtr GetSPSegLocInfo(sep, spfip, spfbp):
4452  *
4453  * Return a link list of segment location information,
4454  * data from NON_CONS and change the modif of the sep of
4455  * the bsp->descr to partial.
4456  *
4457  * If input has NON_CONS: 17..18, 31..32, 65..66, and
4458  * total seqlen = 100, then SPSegLocPtr, spslp, will have
4459  * 4 nodes, each node has
4460  * 0, 16, 16-0+1=17, 1, 4, XXXX_1, descr of XXXX_1, add no-right
4461  * 17, 30, 30-17+1=14, 2, 4, XXXX_2, descr of XXXX_2, add no-right, no-left
4462  * 31, 64, 64-31+1=34, 3, 4, XXXX_3, descr of XXXX_3, add no-right, no-left
4463  * 65, 99, 99-65+1=35, 4, 4, XXXX_4, descr of XXXX_4, add no-left
4464  * where XXXX is locus (ID) name.
4465  *
4466  * Set hspslp->fuzzfrom = TRUE if spfbp->noleft = TRUE.
4467  * Set hspslp->fuzzto = TRUE if spfbp->noright = TRUE.
4468  *
4469  * 11-5-93
4470  *
4471  **********************************************************/
4473 {
4474  SPSegLocPtr curspslp = nullptr;
4475  SPSegLocPtr hspslp = nullptr;
4476  SPSegLocPtr spslp;
4477  const char* p;
4478 
4479  if (! spfip)
4480  return nullptr;
4481 
4482  /* get location range
4483  */
4484  for (; spfip; spfip = spfip->next) {
4485  if (spfip->key != "NON_CONS")
4486  continue;
4487 
4488  if (! hspslp) {
4489  spslp = new SPSegLoc;
4490  p = spfip->from.c_str();
4491  if (*p == '<' || *p == '>' || *p == '?')
4492  p++;
4493 
4494  spslp->len = atoi(p);
4495  hspslp = spslp;
4496  curspslp = spslp;
4497  } else {
4498  p = spfip->from.c_str();
4499  if (*p == '<' || *p == '>' || *p == '?')
4500  p++;
4501  curspslp->len = atoi(p) - curspslp->from;
4502  }
4503 
4504  spslp = new SPSegLoc;
4505  p = spfip->from.c_str();
4506  if (*p == '<' || *p == '>' || *p == '?')
4507  p++;
4508  spslp->from = atoi(p);
4509  curspslp->next = spslp;
4510  curspslp = spslp;
4511  }
4512 
4513  for (auto& descr : bioseq.SetDescr().Set()) {
4514  if (! descr->IsMolinfo())
4515  continue;
4516 
4517  if (spfbp->noleft && spfbp->noright)
4518  descr->SetMolinfo().SetCompleteness(CMolInfo::eCompleteness_no_ends);
4519  else if (spfbp->noleft)
4520  descr->SetMolinfo().SetCompleteness(CMolInfo::eCompleteness_no_left);
4521  else if (spfbp->noright)
4522  descr->SetMolinfo().SetCompleteness(CMolInfo::eCompleteness_no_right);
4523  }
4524 
4525  if (hspslp)
4526  curspslp->len = bioseq.GetLength() - curspslp->from;
4527 
4528  return (hspslp);
4529 }
4530 
4531 /**********************************************************
4532  *
4533  * static void CkInitMetSP(pp, spfip, sep, spfbp):
4534  *
4535  * 11-1-93
4536  *
4537  **********************************************************/
4538 static void CkInitMetSP(ParserPtr pp, SPFeatInputPtr spfip, CSeq_entry& seq_entry, SPFeatBlnPtr spfbp)
4539 {
4540  SPFeatInputPtr temp;
4541  const char* p;
4542  Int2 count;
4543  Int4 from = 0;
4544  Int4 to;
4545 
4546  for (count = 0; spfip; spfip = spfip->next) {
4547  if (spfip->key != "INIT_MET")
4548  continue;
4549 
4550  if (count > 0)
4551  break;
4552 
4553  count++;
4554  p = spfip->from.c_str();
4555  if (*p == '<' || *p == '>' || *p == '?')
4556  p++;
4557  from = atoi(p);
4558  p = spfip->to.c_str();
4559  if (*p == '<' || *p == '>' || *p == '?')
4560  p++;
4561  to = atoi(p);
4562 
4563  if ((from != 0 || to != 0) && (from != 1 || to != 1))
4564  break;
4565  temp = spfip;
4566  }
4567 
4568  if (count == 0)
4569  return;
4570 
4571  if (spfip) {
4572  ErrPostEx(SEV_ERROR, ERR_FEATURE_Invalid_INIT_MET, "Either incorrect or more than one INIT_MET feature provided.");
4573  return;
4574  }
4575 
4576  if (! temp->descrip.empty()) {
4577  ErrPostEx(SEV_WARNING, ERR_FEATURE_ExpectEmptyComment, "%s:%d-%d has description: %s", temp->key.c_str(), from, to, temp->descrip.c_str());
4578  }
4579 
4580 
4581  CBioseq& bioseq = seq_entry.SetSeq();
4582 
4583  CSeq_data& data = bioseq.SetInst().SetSeq_data();
4584  string& sequence = data.SetIupacaa().Set();
4585 
4586  if (from == 0) {
4587  spfbp->initmet = true;
4588 
4589  /* insert "M" in the front
4590  */
4591  sequence.insert(sequence.begin(), 'M');
4592  bioseq.SetInst().SetLength(static_cast<TSeqPos>(sequence.size()));
4593  } else if (sequence.empty() || sequence[0] != 'M')
4594  ErrPostEx(SEV_ERROR, ERR_FEATURE_MissingInitMet, "The required Init Met is missing from the sequence.");
4595 }
4596 
4597 /**********************************************************
4598  *
4599  * static void CkNonTerSP(pp, spfip, sep, spfbp):
4600  *
4601  * Set spfbp->nonter = spfbp->noleft = TRUE if
4602  * NON_TER 1..1.
4603  * Set spfbp->nonter = spfbp->noright = TRUE if
4604  * NON_TER base..base.
4605  * Set bsp->descr of modif = partial if there is more
4606  * than 5 contiguous unsequenced residues, X.
4607  *
4608  * 11-2-93
4609  *
4610  **********************************************************/
 /*
  * Scans the feature list for NON_TER and NON_CONS keys, records the
  * result in spfbp (nonter / noleft / noright) and reports mismatches
  * between those features and the completeness stored in the entry's
  * first MolInfo descriptor.
  *
  *   pp        - parser context; supplies the current index block.
  *   spfip     - head of the parsed feature list.
  *   seq_entry - entry whose Bioseq descriptors and sequence are examined.
  *   spfbp     - feature flags updated here.
  *
  * NOTE(review): three lines of the completeness section appear to be
  * absent from this listing (both NoFragment branches and the body of
  * `if (ctr == 5)`), so the effect of those branches beyond what is shown
  * cannot be determined from here.
  */
4611 static void CkNonTerSP(ParserPtr pp, SPFeatInputPtr spfip, CSeq_entry& seq_entry, SPFeatBlnPtr spfbp)
4612 {
4613  SPFeatInputPtr temp;
4614  Int4 from;
4615  Int4 ctr;
4616  bool segm;
4617 
4618  CMolInfo* mol_info = nullptr;
4619  CBioseq& bioseq = seq_entry.SetSeq();
4620 
4621  ctr = 0;
 /* Remember the first MolInfo descriptor, if any. */
4622  for (auto& descr : bioseq.SetDescr().Set()) {
4623  if (! descr->IsMolinfo())
4624  continue;
4625 
4626  mol_info = &(descr->SetMolinfo());
4627  break;
4628  }
4629 
 /* segm becomes true as soon as any NON_CONS feature is seen. */
4630  segm = false;
4631  for (temp = spfip; temp; temp = temp->next) {
4632  if (temp->key == "NON_CONS") {
4633  segm = true;
4634  continue;
4635  }
4636 
4637  if (temp->key != "NON_TER")
4638  continue;
4639 
 /* Endpoints are converted with NStr::StringToInt as they are; no
  * leading '<', '>' or '?' is stripped here. */
4640  from = NStr::StringToInt(temp->from);
4641  if (from != NStr::StringToInt(temp->to)) {
4642  ErrPostStr(SEV_WARNING, ERR_FEATURE_UnEqualEndPoint, "NON_TER has unequal endpoints");
4643  continue;
4644  }
4645 
 /* Position 1 flags the left end; a position equal to the index
  * block's `bases` value flags the right end; anything else is only
  * reported. */
4646  if (from == 1) {
4647  spfbp->nonter = true;
4648  spfbp->noleft = true;
4649  } else if (from == (Int4)pp->entrylist[pp->curindx]->bases) {
4650  spfbp->nonter = true;
4651  spfbp->noright = true;
4652  } else {
4653  ErrPostStr(SEV_WARNING, ERR_FEATURE_NotSeqEndPoint, "NON_TER is not at a sequence endpoint.");
4654  }
4655  }
4656 
 /* The consistency checks below need a MolInfo descriptor. */
4657  if (! mol_info)
4658  return;
4659 
 /* NOTE(review): the literal 2 is compared here while the neighbouring
  * branches use CMolInfo::eCompleteness_partial - presumably the same
  * value; confirm. */
4660  if (segm && mol_info->GetCompleteness() != 2) {
4662  ErrPostEx(SEV_WARNING, ERR_FEATURE_NoFragment, "Found NON_CONS in FT line but no FRAGMENT in DE line.");
4663  } else if (spfbp->nonter && mol_info->GetCompleteness() != CMolInfo::eCompleteness_partial) {
4665  ErrPostEx(SEV_WARNING, ERR_FEATURE_NoFragment, "Found NON_TER in FT line but no FRAGMENT in DE line.");
4666  } else if (! spfbp->nonter && mol_info->GetCompleteness() == CMolInfo::eCompleteness_partial && ! segm) {
4667  ErrPostEx(SEV_WARNING, ERR_FEATURE_PartialNoNonTerNonCons, "Entry is partial but has no NON_TER or NON_CONS features.");
4668  } else if (mol_info->GetCompleteness() != 2) {
 /* Look for a run of 'X' residues in the IUPAC amino-acid data; the
  * scan stops at the fifth consecutive 'X'. */
4669  if (bioseq.GetInst().IsSetSeq_data()) {
4670  const CSeq_data& data = bioseq.GetInst().GetSeq_data();
4671  const string& sequence = data.GetIupacaa().Get();
4672 
4673  for (string::const_iterator value = sequence.begin(); value != sequence.end(); ++value) {
4674  if (*value != 'X') {
4675  ctr = 0; /* reset counter */
4676  continue;
4677  }
4678 
4679  ctr++;
4680  if (ctr == 5) {
4682  break;
4683  }
4684  }
4685  }
4686  }
4687 }
4688 
4689 /**********************************************************/
 /*
  * Splits the Bioseq's IUPAC amino-acid data into delta literals, one per
  * node of the segment list `spslp`, with a zero-length literal carrying
  * an Int-fuzz placed between consecutive segments.
  *
  * Does nothing when `spslp` is null or the instance has no seq-data.
  * If more than one delta element results, the representation is switched
  * to delta and the raw seq-data is removed; otherwise the extension that
  * was just created is reset again.
  *
  * NOTE(review): the declaration of `delta` (first statement of the loop
  * body) is not present in this listing.
  */
4690 static void SeqToDeltaSP(CBioseq& bioseq, SPSegLocPtr spslp)
4691 {
4692  if (! spslp || ! bioseq.GetInst().IsSetSeq_data())
4693  return;
4694 
4695  CSeq_ext::TDelta& deltas = bioseq.SetInst().SetExt().SetDelta();
4696  const string& bioseq_data = bioseq.GetInst().GetSeq_data().GetIupacaa().Get();
4697 
4698  for (; spslp; spslp = spslp->next) {
 /* Not the first segment: emit the zero-length separator first, then
  * start a fresh element for the segment itself. */
4700  if (! deltas.Set().empty()) {
4701  delta->SetLiteral().SetLength(0);
4702  delta->SetLiteral().SetFuzz().SetLim();
4703  deltas.Set().push_back(delta);
4704 
4705  delta.Reset(new CDelta_seq);
4706  }
4707 
4708  delta->SetLiteral().SetLength(spslp->len);
4709 
4710 
 /* Copy this segment's residues out of the full sequence string. */
4711  string data_str = bioseq_data.substr(spslp->from, spslp->len);
4712 
4713  delta->SetLiteral().SetSeq_data().SetIupacaa().Set(data_str);
4714  deltas.Set().push_back(delta);
4715  }
4716 
 /* A single element means the sequence was not really segmented. */
4717  if (deltas.Set().size() > 1) {
4718  bioseq.SetInst().SetRepr(CSeq_inst::eRepr_delta);
4719  bioseq.SetInst().ResetSeq_data();
4720  } else
4721  bioseq.SetInst().SetExt().Reset();
4722 }
4723 
4724 /**********************************************************
4725  *
4726  * static void GetSPAnnot(pp, entry, protconv):
4727  *
4728  * 10-15-93
4729  *
4730  **********************************************************/
 /*
  * Builds the feature table of the current entry and attaches it to the
  * entry's Bioseq as a Seq-annot.
  *
  *   pp       - parser context; supplies the current index block.
  *   entry    - data block of the entry being processed.
  *   protconv - not referenced in the lines visible here.
  *
  * Steps: parse the feature lines, run the NON_TER and INIT_MET checks,
  * convert the general features, add the gene and protein features,
  * convert the sequence to delta form when NON_CONS segments exist, then
  * free the temporary structures.
  *
  * NOTE(review): the declaration of `feats` is not present in this
  * listing (one line is absent right after the ParseSPFeat call).
  */
4731 static void GetSPAnnot(ParserPtr pp, DataBlkPtr entry, unsigned char* protconv)
4732 {
4733  SPFeatInputPtr spfip;
4734  EntryBlkPtr ebp;
4735 
4736  SPFeatBlnPtr spfbp;
4737  SPSegLocPtr spslp; /* segment location, data from NON_CONS */
4738  SPSegLocPtr next;
4739 
4740  ebp = static_cast<EntryBlk*>(entry->mpData);
4741  CSeq_entry& seq_entry = *ebp->seq_entry;
4742 
 /* spfbp is owned by this function and deleted at the end; spfip is
  * released through FreeSPFeatInputSet. */
4743  spfbp = new SPFeatBln;
4744  spfip = ParseSPFeat(entry, pp->entrylist[pp->curindx]->bases);
4745 
4747 
 /* The checks run first because they fill in spfbp, which the later
  * calls read (spfbp->initmet here, nonter/noleft/noright further on). */
4748  if (spfip) {
4749  CkNonTerSP(pp, spfip, seq_entry, spfbp);
4750  CkInitMetSP(pp, spfip, seq_entry, spfbp);
4751  SPFeatGeneral(pp, spfip, spfbp->initmet, feats);
4752  }
4753 
4754  SPFeatGeneRef(pp, feats, entry); /* GN line */
4755  SPFeatProtRef(pp, feats, entry, spfbp); /* DE line */
4756 
4757  CBioseq& bioseq = seq_entry.SetSeq();
4758 
4759  spslp = GetSPSegLocInfo(bioseq, spfip, spfbp); /* checking NON_CONS key */
4760  if (spslp)
4761  SeqToDeltaSP(bioseq, spslp);
4762 
 /* Move the collected features into a new annotation; `feats` is left
  * holding whatever the fresh table contained before the swap. */
4763  if (! feats.empty()) {
4764  CRef<CSeq_annot> annot(new CSeq_annot);
4765  annot->SetData().SetFtable().swap(feats);
4766  bioseq.SetAnnot().push_back(annot);
4767  }
4768 
 /* Free the segment list node by node. */
4769  for (; spslp; spslp = next) {
4770  next = spslp->next;
4771  delete spslp;
4772  }
4773 
4774  FreeSPFeatInputSet(spfip);
4775  delete spfbp;
4776 }
4777 
4778 /**********************************************************/
 /*
  * Turns one loaded flat-file entry into a Seq-entry stored in
  * ebp->seq_entry: splits the text into blocks and sub-blocks, creates
  * the Bioseq, registers it with the scope, and fills in descriptors,
  * instance data and annotations.
  *
  *   pp       - parser context; supplies format and the current index block.
  *   entry    - data block holding the raw entry text (mOffset / len).
  *   protconv - conversion table forwarded to GetSPInst and GetSPAnnot.
  *
  * NOTE(review): three lines near the end of this function are absent
  * from this listing, including the declaration of `entries` and the body
  * of `if (pp->citat)`.
  */
4779 static void SpPrepareEntry(ParserPtr pp, DataBlkPtr entry, unsigned char* protconv)
4780 {
4781  Int2 curkw;
4782  char* ptr;
4783  char* eptr;
4784  EntryBlkPtr ebp;
4785 
4786  ebp = static_cast<EntryBlk*>(entry->mpData);
4787  ptr = entry->mOffset;
4788  eptr = ptr + entry->len;
 /* GetEmblBlock advances `ptr` and updates `curkw`; repeat until the
  * end keyword is reached. */
4789  for (curkw = ParFlatSP_ID; curkw != ParFlatSP_END;) {
4790  ptr = GetEmblBlock(&ebp->chain, ptr, &curkw, pp->format, eptr);
4791  }
4792  GetSprotSubBlock(pp, entry);
4793 
 /* Only when the index block has a zero `bases` value. */
4794  if (pp->entrylist[pp->curindx]->bases == 0) {
4795  SpAddToIndexBlk(entry, pp->entrylist[pp->curindx]);
4796  }
4797 
 /* Create the Bioseq, wrap it in a new Seq-entry and add it to the
  * scope before descriptors, instance and annotations are filled in. */
4798  CRef<CBioseq> bioseq = CreateEntryBioseq(pp);
4799  ebp->seq_entry.Reset(new CSeq_entry);
4800  ebp->seq_entry->SetSeq(*bioseq);
4801  GetScope().AddBioseq(*bioseq);
4802 
4803  GetSprotDescr(*bioseq, pp, entry);
4804 
4805  GetSPInst(pp, entry, protconv);
4806  GetSPAnnot(pp, entry, protconv);
4807 
4809  entries.push_back(ebp->seq_entry);
4811 
4812  if (pp->citat) {
4814  }
4815 }
4816 
4817 /**********************************************************
4818  *
4819  * bool SprotAscii(pp):
4820  *
4821  * Return FALSE if allocate entry block failed.
4822  *
4823  * 3-23-93
4824  *
4825  **********************************************************/
4827 {
 /*
  * Main loop over all indexed entries: loads each entry that is not
  * marked as dropped, converts it with SpPrepareEntry and, if it is still
  * not dropped afterwards, moves the resulting Seq-entry into
  * pp->entries. Posts one message per entry plus a final summary.
  *
  * Returns false as soon as LoadEntry fails for an entry, true otherwise.
  *
  * NOTE(review): the signature line and two further lines (one inside the
  * LoadEntry failure branch, one just before the final message) are not
  * present in this listing; `pp` is presumably the only parameter -
  * confirm.
  */
4828  DataBlkPtr entry;
4829 
4830  Int4 total;
4831  Int4 i;
4832  IndexblkPtr ibp;
4833  Int4 imax;
4834 
 /* Conversion table shared by all entries; released automatically when
  * `protconv` goes out of scope. */
4835  auto protconv = GetProteinConv();
4836 
4837  for (total = 0, i = 0, imax = pp->indx; i < imax; i++) {
4838  pp->curindx = i;
4839  ibp = pp->entrylist[i];
4840 
4841  err_install(ibp, pp->accver);
4842 
4843  if (! ibp->drop) {
4844  entry = LoadEntry(pp, ibp->offset, ibp->len);
4845  if (! entry) {
4847  return false;
4848  }
4849 
4850  SpPrepareEntry(pp, entry, protconv.get());
4851 
 /* Re-check the flag after SpPrepareEntry; SPFeatProtRef in this
  * file, for one, can set ibp->drop. */
4852  if (! ibp->drop) {
4853  CRef<CSeq_entry>& cur_entry = (static_cast<EntryBlk*>(entry->mpData))->seq_entry;
4854  pp->entries.push_back(cur_entry);
4855 
 /* Release the entry block's own reference. */
4856  cur_entry.Reset();
4857  }
4858  // delete entry;
4859  }
 /* `total` counts only entries that ended up not dropped. */
4860  if (! ibp->drop) {
4861  total++;
4862  ErrPostEx(SEV_INFO, ERR_ENTRY_Parsed, "OK - entry \"%s|%s\" parsed successfully", ibp->locusname, ibp->acnum);
4863  } else {
4864  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry \"%s|%s\" skipped", ibp->locusname, ibp->acnum);
4865  }
4866  }
4867 
4869 
4870  ErrPostEx(SEV_INFO, ERR_ENTRY_ParsingComplete, "Parsing completed, %d entr%s parsed", total, (total == 1) ? "y" : "ies");
4871  return true;
4872 }
4873 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
void err_install(const Indexblk *ibp, bool accver)
Definition: add.cpp:302
string tata_save(string_view t)
Definition: add.cpp:148
void StripECO(string &str)
Definition: add.cpp:2811
void BuildSubBlock(DataBlkPtr dbp, Int2 subtype, const char *subkw)
Definition: asci_blk.cpp:812
void StripSerialNumbers(TEntryList &seq_entries)
Definition: asci_blk.cpp:3406
unique_ptr< unsigned char[]> GetProteinConv(void)
Definition: asci_blk.cpp:1818
void GetSequenceOfKeywords(const DataBlk &entry, int type, Uint2 col_data, TKeywordList &keywords)
Definition: asci_blk.cpp:1551
char * GetEmblBlock(DataBlkPtr *chain, char *ptr, short *retkw, Parser::EFormat format, char *eptr)
Definition: asci_blk.cpp:545
CRef< CSeq_id > MakeAccSeqId(const char *acc, Uint1 seqtype, bool accver, Int2 vernum)
Definition: asci_blk.cpp:960
bool GetSeqData(ParserPtr pp, const DataBlk &entry, CBioseq &bioseq, Int4 nodetype, unsigned char *seqconv, Uint1 seq_data_type)
Definition: asci_blk.cpp:1678
void GetLenSubNode(DataBlkPtr dbp)
Definition: asci_blk.cpp:835
CRef< CSeq_id > MakeLocusSeqId(const char *locus, CSeq_id::E_Choice seqtype)
Definition: asci_blk.cpp:990
void GetExtraAccession(IndexblkPtr ibp, bool allow_uwsec, Parser::ESource source, TAccessionList &accessions)
Definition: asci_blk.cpp:1321
void ShrinkSpaces(char *line)
Definition: asci_blk.cpp:118
CRef< CBioseq > CreateEntryBioseq(ParserPtr pp)
Definition: asci_blk.cpp:1074
void fta_sort_biosource(objects::CBioSource &bio)
TSeqPos GetLength(void) const
Definition: Bioseq.cpp:360
Definition: Date.hpp:53
void GetDate(string *label, bool year_only=false) const
Append a standardized string representation of the date to the label.
Definition: Date.hpp:149
@ eCompare_after
*this comes second.
Definition: Date.hpp:76
Definition: Dbtag.hpp:53
CDelta_seq –.
Definition: Delta_seq.hpp:66
@Gb_qual.hpp User-defined methods of the data storage class.
Definition: Gb_qual.hpp:61
@OrgMod.hpp User-defined methods of the data storage class.
Definition: OrgMod.hpp:54
CPDB_mol_id –.
Definition: PDB_mol_id.hpp:66
CSP_block –.
Definition: SP_block.hpp:66
@Seq_descr.hpp User-defined methods of the data storage class.
Definition: Seq_descr.hpp:55
Definition: Seq_entry.hpp:56
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
CSeq_hist –.
Definition: Seq_hist.hpp:66
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
char * mOffset
Definition: ftablock.h:329
size_t len
Definition: ftablock.h:330
CFlatFileData * mpData
Definition: ftablock.h:328
DataBlk * mpNext
Definition: ftablock.h:333
int mType
Definition: ftablock.h:327
#define head
Definition: ct_nlmzip_i.h:138
DataBlkPtr LoadEntry(ParserPtr pp, size_t offset, size_t len)
Definition: entry.cpp:300
#define ERR_GENENAME_IllegalGeneName
Definition: flat2err.h:409
#define ERR_SOURCE_HostNameVsTaxIDMissMatch
Definition: flat2err.h:540
#define ERR_FEATURE_PartialNoNonTerNonCons
Definition: flat2err.h:381
#define ERR_FORMAT_UnexpectedData
Definition: flat2err.h:51
#define ERR_SPROT_PELine
Definition: flat2err.h:481
#define ERR_FEATURE_ExpectEmptyComment
Definition: flat2err.h:321
#define ERR_FORMAT_NoProteinNameCategory
Definition: flat2err.h:64
#define ERR_FORMAT_MultipleRecName
Definition: flat2err.h:65
#define ERR_SOURCE_InvalidNcbiTaxID
Definition: flat2err.h:530
#define ERR_SOURCE_UnknownOHType
Definition: flat2err.h:538
#define ERR_GENENAME_DELineGeneName
Definition: flat2err.h:410
#define ERR_SOURCE_NcbiTaxIDLookupFailure
Definition: flat2err.h:532
#define ERR_FORMAT_InvalidPDBCrossRef
Definition: flat2err.h:59
#define ERR_FORMAT_Date
Definition: flat2err.h:62
#define ERR_FORMAT_ECNumberNotPresent
Definition: flat2err.h:63
#define ERR_FORMAT_MixedPDBXrefs
Definition: flat2err.h:60
#define ERR_ENTRY_Skipped
Definition: flat2err.h:80
#define ERR_FEATURE_UnEqualEndPoint
Definition: flat2err.h:332
#define ERR_SOURCE_OrgNameVsTaxIDMissMatch
Definition: flat2err.h:534
#define ERR_FORMAT_MissingCopyright
Definition: flat2err.h:46
#define ERR_SOURCE_MissingPlasmidName
Definition: flat2err.h:537
#define ERR_FEATURE_Invalid_INIT_MET
Definition: flat2err.h:375
#define ERR_FEATURE_InvalidQualifier
Definition: flat2err.h:384
#define ERR_FEATURE_BadLocation
Definition: flat2err.h:347
#define ERR_REFERENCE_IllegalDate
Definition: flat2err.h:282
#define ERR_FORMAT_MissingFullRecName
Definition: flat2err.h:68
#define ERR_FORMAT_SwissProtHasSubName
Definition: flat2err.h:67
#define ERR_FEATURE_UnknownFeatKey
Definition: flat2err.h:333
#define ERR_SOURCE_UnknownOXType
Definition: flat2err.h:529
#define ERR_DRXREF_UnknownDBname
Definition: flat2err.h:596
#define ERR_SOURCE_NoNcbiTaxIDLookup
Definition: flat2err.h:531
#define ERR_FEATURE_ObsoleteFeature
Definition: flat2err.h:342
#define ERR_FEATURE_Dropped
Definition: flat2err.h:337
#define ERR_ENTRY_ParsingComplete
Definition: flat2err.h:79
#define ERR_FEATURE_MissingInitMet
Definition: flat2err.h:377
#define ERR_SPROT_DRLine
Definition: flat2err.h:480
#define ERR_SOURCE_IncorrectOHLine
Definition: flat2err.h:539
#define ERR_FORMAT_MissingGeneName
Definition: flat2err.h:58
#define ERR_LOCATION_FailedCheck
Definition: flat2err.h:393
#define ERR_FORMAT_InvalidECNumber
Definition: flat2err.h:52
#define ERR_QUALIFIER_InvalidEvidence
Definition: flat2err.h:117
#define ERR_DATE_IllegalDate
Definition: flat2err.h:102
#define ERR_ENTRY_Parsed
Definition: flat2err.h:83
#define ERR_FORMAT_UnknownGeneField
Definition: flat2err.h:56
#define ERR_FEATURE_NotSeqEndPoint
Definition: flat2err.h:327
#define ERR_FEATURE_NoFragment
Definition: flat2err.h:326
#define ERR_SPROT_DRLineCrossDBProtein
Definition: flat2err.h:482
#define ERR_DATACLASS_UnKnownClass
Definition: flat2err.h:76
#define ERR_FORMAT_ExcessGeneFields
Definition: flat2err.h:57
#define ERR_FORMAT_MissingRecName
Definition: flat2err.h:66
#define ERR_FEATURE_DuplicateRemoved
Definition: flat2err.h:349
list< CRef< objects::CSeq_entry > > TEntryList
std::list< CRef< objects::CSeq_id > > TSeqIdList
Definition: ftablock.h:58
bool StringEquNI(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:131
bool StringEquN(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:121
bool StringEqu(const char *s1, const char *s2)
Definition: ftacpp.hpp:111
void StringCpy(char *d, const char *s)
Definition: ftacpp.hpp:89
void StringNCpy(char *d, const char *s, size_t n)
Definition: ftacpp.hpp:90
void MemFree(char *p)
Definition: ftacpp.hpp:55
size_t StringLen(const char *s)
Definition: ftacpp.hpp:60
void StringCat(char *d, const char *s)
Definition: ftacpp.hpp:88
char * StringRChr(char *s, const char c)
Definition: ftacpp.hpp:93
char * StringNew(size_t sz)
Definition: ftacpp.hpp:43
void FtaDeletePrefix(int prefix)
Definition: ftaerr.cpp:346
void FtaInstallPrefix(int prefix, const char *name, const char *location)
Definition: ftaerr.cpp:321
#define PREFIX_FEATURE
Definition: ftaerr.hpp:16
#define PREFIX_LOCUS
Definition: ftaerr.hpp:15
#define PREFIX_ACCESSION
Definition: ftaerr.hpp:14
void fta_find_pub_explore(ParserPtr pp, TEntryList &seq_entries)
Definition: ftanet.cpp:753
CRef< COrg_ref > fta_fix_orgref_byid(ParserPtr pp, TTaxId taxid, bool *drop, bool isoh)
Definition: ftanet.cpp:848
void fta_fix_orgref(ParserPtr pp, COrg_ref &org_ref, bool *drop, char *organelle)
Definition: ftanet.cpp:936
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:56
static int type
Definition: getdata.c:31
static const char * str(char *buf, int n)
Definition: stats.c:84
static char tmp[3200]
Definition: utf8.c:42
int offset
Definition: replacements.h:160
static const char location[]
Definition: config.c:97
char data[12]
Definition: iconv.c:80
#define SEV_INFO
Definition: gicache.c:89
#define SEV_WARNING
Definition: gicache.c:90
#define SEV_ERROR
Definition: gicache.c:91
#define SEV_REJECT
Definition: gicache.c:92
#define ZERO_TAX_ID
Definition: ncbimisc.hpp:1115
#define GI_FROM(T, value)
Definition: ncbimisc.hpp:1086
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define TAX_ID_TO(T, tax_id)
Definition: ncbimisc.hpp:1110
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
#define TAX_ID_FROM(T, value)
Definition: ncbimisc.hpp:1111
#define StringStr
Definition: ncbistr.hpp:322
#define StringSave
Definition: ncbistr.hpp:326
#define ErrPostStr
Definition: ncbierr.hpp:68
#define StringChr
Definition: ncbistr.hpp:317
#define ErrPostEx(sev, err_code,...)
Definition: ncbierr.hpp:78
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
const TPrim & Get(void) const
Definition: serialbase.hpp:347
void SetPnt(TPnt &v)
Definition: Seq_loc.hpp:985
void SetInt(TInt &v)
Definition: Seq_loc.hpp:983
void SetBond(TBond &v)
Definition: Seq_loc.hpp:989
CBioseq_Handle AddBioseq(CBioseq &bioseq, TPriority pri=kPriority_Default, EExist action=eExist_Throw)
Add bioseq, return bioseq handle.
Definition: scope.cpp:530
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
char Char
Alias for char.
Definition: ncbitype.h:93
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2993
#define NPOS
Definition: ncbistr.hpp:133
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
Definition: ncbistr.cpp:3201
static string Sanitize(CTempString str, TSS_Flags flags=fSS_print)
Sanitize a string, allowing only specified classes of characters.
Definition: ncbistr.hpp:2876
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5353
static string & ReplaceInPlace(string &src, const string &search, const string &replace, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3405
@ eTrunc_End
Truncate trailing spaces only.
Definition: ncbistr.hpp:2241
bool IsSetOrg(void) const
Check if a value has been assigned to Org data member.
Definition: BioSource_.hpp:497