NCBI C++ ToolKit
sp_ascii.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: sp_ascii.cpp 100429 2023-07-31 15:03:27Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * File Name: sp_ascii.cpp
27  *
28  * Author: Karl Sirotkin, Hsiu-Chuan Chen
29  *
30  * File Description:
31  * Build SWISS-PROT format entry block. All external variables
32  * are in sp_global.c.
33  * Parse SP image in memory to asn.
34  *
35  */
36 
37 #include <ncbi_pch.hpp>
38 
39 #include "ftacpp.hpp"
40 
46 #include <objects/seq/Seq_hist.hpp>
49 #include <objects/seq/MolInfo.hpp>
50 #include <objmgr/scope.hpp>
54 #include <objects/seq/Seq_inst.hpp>
55 #include <objects/seq/Seq_ext.hpp>
61 #include <objects/seq/Pubdesc.hpp>
62 
63 #include "index.h"
64 #include "sprot.h"
65 
67 #include "ftanet.h"
69 
70 #include "ftaerr.hpp"
71 #include "indx_blk.h"
72 #include "asci_blk.h"
73 #include "sp_ascii.h"
74 #include "utilfeat.h"
75 #include "add.h"
76 #include "nucprot.h"
77 #include "utilfun.h"
78 #include "entry.h"
79 #include "ref.h"
80 #include "xutils.h"
81 
82 #ifdef THIS_FILE
83 # undef THIS_FILE
84 #endif
85 #define THIS_FILE "sp_ascii.cpp"
86 
89 
/* nullptr-terminated list of upper-case topic keywords, each ending with ':'.
 * NOTE(review): presumably the topics recognised in Swiss-Prot comment (CC)
 * lines - the code that consumes this table is not visible here; confirm.
 */
90 const char* ParFlat_SPComTopics[] = {
91  "ALLERGEN:",
92  "ALTERNATIVE PRODUCTS:",
93  "BIOPHYSICOCHEMICAL PROPERTIES:",
94  "BIOTECHNOLOGY:",
95  "CATALYTIC ACTIVITY:",
96  "CAUTION:",
97  "COFACTOR:",
98  "DATABASE:",
99  "DEVELOPMENTAL STAGE:",
100  "DISEASE:",
101  "DISRUPTION PHENOTYPE:",
102  "DOMAIN:",
103  "ENZYME REGULATION:",
104  "FUNCTION:",
105  "INDUCTION:",
106  "INTERACTION:",
107  "MASS SPECTROMETRY:",
108  "MISCELLANEOUS:",
109  "PATHWAY:",
110  "PHARMACEUTICAL:",
111  "POLYMORPHISM:",
112  "PTM:",
113  "RNA EDITING:",
114  "SEQUENCE CAUTION:",
115  "SIMILARITY:",
116  "SUBCELLULAR LOCATION:",
117  "SUBUNIT:",
118  "TISSUE SPECIFICITY:",
119  "TOXIC DOSE:",
120  "WEB RESOURCE:",
121  nullptr
122 };
123 
124 /* bsv : 03/04/2020 : no Seq-feat.exp-ev setting anymore
125 const char* ParFlat_SPFeatNoExp[] = {
126  "(PROBABLE).",
127  "(PROBABLE)",
128  "PROBABLE.",
129  "(POTENTIAL).",
130  "(POTENTIAL)",
131  "POTENTIAL.",
132  "(BY SIMILARITY).",
133  "(BY SIMILARITY)",
134  "BY SIMILARITY.",
135  nullptr
136 };
137 
138 const char* ParFlat_SPFeatNoExpW[] = {
139  "(PUTATIVE).",
140  "(PUTATIVE)",
141  "PUTATIVE.",
142  "(SIMILARITY).",
143  "(SIMILARITY)",
144  "SIMILARITY.",
145  "(POSSIBLE).",
146  "(POSSIBLE)",
147  "POSSIBLE.",
148  "(POSTULATED).",
149  "(POSTULATED)",
150  "POSTULATED.",
151  "(BY HOMOLOGY).",
152  "(BY HOMOLOGY)",
153  "BY HOMOLOGY.",
154  nullptr
155 };
156 */
157 
159  { "ACT_SITE", ParFlatSPSites, 1, nullptr },
160  { "BINDING", ParFlatSPSites, 2, nullptr },
161  { "CARBOHYD", ParFlatSPSites, 6, nullptr },
162  { "MUTAGEN", ParFlatSPSites, 8, nullptr },
163  { "METAL", ParFlatSPSites, 9, nullptr },
164  { "LIPID", ParFlatSPSites, 20, nullptr },
165  { "NP_BIND", ParFlatSPSites, 21, nullptr },
166  { "DNA_BIND", ParFlatSPSites, 22, nullptr },
167  { "SITE", ParFlatSPSites, 255, nullptr },
168  { "MOD_RES", ParFlatSPSites, 5, nullptr }, /* 9 */
169  { "MOD_RES", ParFlatSPSites, 10, "4-aspartylphosphate" },
170  { "MOD_RES", ParFlatSPSites, 10, "5-glutamyl glycerylphosphorylethanolamine" },
171  { "MOD_RES", ParFlatSPSites, 10, "Phosphoarginine" },
172  { "MOD_RES", ParFlatSPSites, 10, "Phosphocysteine" },
173  { "MOD_RES", ParFlatSPSites, 10, "Phosphohistidine" },
174  { "MOD_RES", ParFlatSPSites, 10, "PHOSPHORYLATION" },
175  { "MOD_RES", ParFlatSPSites, 10, "Phosphoserine" },
176  { "MOD_RES", ParFlatSPSites, 10, "Phosphothreonine" },
177  { "MOD_RES", ParFlatSPSites, 10, "Phosphotyrosine" },
178  { "MOD_RES", ParFlatSPSites, 10, "Pros-phosphohistidine" },
179  { "MOD_RES", ParFlatSPSites, 10, "Tele-phosphohistidine" },
180  { "MOD_RES", ParFlatSPSites, 11, "ACETYLATION" },
181  { "MOD_RES", ParFlatSPSites, 11, "N2-acetylarginine" },
182  { "MOD_RES", ParFlatSPSites, 11, "N6-acetyllysine" },
183  { "MOD_RES", ParFlatSPSites, 11, "N-acetylalanine" },
184  { "MOD_RES", ParFlatSPSites, 11, "N-acetylaspartate" },
185  { "MOD_RES", ParFlatSPSites, 11, "N-acetylated lysine" },
186  { "MOD_RES", ParFlatSPSites, 11, "N-acetylcysteine" },
187  { "MOD_RES", ParFlatSPSites, 11, "N-acetylglutamate" },
188  { "MOD_RES", ParFlatSPSites, 11, "N-acetylglycine" },
189  { "MOD_RES", ParFlatSPSites, 11, "N-acetylisoleucine" },
190  { "MOD_RES", ParFlatSPSites, 11, "N-acetylmethionine" },
191  { "MOD_RES", ParFlatSPSites, 11, "N-acetylproline" },
192  { "MOD_RES", ParFlatSPSites, 11, "N-acetylserine" },
193  { "MOD_RES", ParFlatSPSites, 11, "N-acetylthreonine" },
194  { "MOD_RES", ParFlatSPSites, 11, "N-acetyltyrosine" },
195  { "MOD_RES", ParFlatSPSites, 11, "N-acetylvaline" },
196  { "MOD_RES", ParFlatSPSites, 11, "O-acetylserine" },
197  { "MOD_RES", ParFlatSPSites, 11, "O-acetylthreonine" },
198  { "MOD_RES", ParFlatSPSites, 12, "Alanine amide" },
199  { "MOD_RES", ParFlatSPSites, 12, "AMIDATION" },
200  { "MOD_RES", ParFlatSPSites, 12, "Arginine amide" },
201  { "MOD_RES", ParFlatSPSites, 12, "Asparagine amide" },
202  { "MOD_RES", ParFlatSPSites, 12, "Aspartic acid 1-amide" },
203  { "MOD_RES", ParFlatSPSites, 12, "Cysteine amide" },
204  { "MOD_RES", ParFlatSPSites, 12, "Glutamic acid 1-amide" },
205  { "MOD_RES", ParFlatSPSites, 12, "Glutamine amide" },
206  { "MOD_RES", ParFlatSPSites, 12, "Glycine amide" },
207  { "MOD_RES", ParFlatSPSites, 12, "Histidine amide" },
208  { "MOD_RES", ParFlatSPSites, 12, "Isoleucine amide" },
209  { "MOD_RES", ParFlatSPSites, 12, "Leucine amide" },
210  { "MOD_RES", ParFlatSPSites, 12, "Lysine amide" },
211  { "MOD_RES", ParFlatSPSites, 12, "Methionine amide" },
212  { "MOD_RES", ParFlatSPSites, 12, "Phenylalanine amide" },
213  { "MOD_RES", ParFlatSPSites, 12, "Proline amide" },
214  { "MOD_RES", ParFlatSPSites, 12, "Serine amide" },
215  { "MOD_RES", ParFlatSPSites, 12, "Threonine amide" },
216  { "MOD_RES", ParFlatSPSites, 12, "Tryptophan amide" },
217  { "MOD_RES", ParFlatSPSites, 12, "Tyrosine amide" },
218  { "MOD_RES", ParFlatSPSites, 12, "Valine amide" },
219  { "MOD_RES", ParFlatSPSites, 13, "2-methylglutamine" },
220  { "MOD_RES", ParFlatSPSites, 13, "2'-methylsulfonyltryptophan" },
221  { "MOD_RES", ParFlatSPSites, 13, "3-methylthioaspartic acid" },
222  { "MOD_RES", ParFlatSPSites, 13, "5-methylarginine" },
223  { "MOD_RES", ParFlatSPSites, 13, "Asymmetric dimethylarginine" },
224  { "MOD_RES", ParFlatSPSites, 13, "Cysteine methyl disulfide" },
225  { "MOD_RES", ParFlatSPSites, 13, "Cysteine methyl ester" },
226  { "MOD_RES", ParFlatSPSites, 13, "Dimethylated arginine" },
227  { "MOD_RES", ParFlatSPSites, 13, "Glutamate methyl ester (Gln)" },
228  { "MOD_RES", ParFlatSPSites, 13, "Glutamate methyl ester (Glu)" },
229  { "MOD_RES", ParFlatSPSites, 13, "Leucine methyl ester" },
230  { "MOD_RES", ParFlatSPSites, 13, "Lysine methyl ester" },
231  { "MOD_RES", ParFlatSPSites, 13, "METHYLATION" },
232  { "MOD_RES", ParFlatSPSites, 13, "Methylhistidine" },
233  { "MOD_RES", ParFlatSPSites, 13, "N,N,N-trimethylalanine" },
234  { "MOD_RES", ParFlatSPSites, 13, "N,N,N-trimethylglycine" },
235  { "MOD_RES", ParFlatSPSites, 13, "N,N,N-trimethylserine" },
236  { "MOD_RES", ParFlatSPSites, 13, "N,N-dimethylalanine" },
237  { "MOD_RES", ParFlatSPSites, 13, "N,N-dimethylglycine" },
238  { "MOD_RES", ParFlatSPSites, 13, "N,N-dimethylleucine" },
239  { "MOD_RES", ParFlatSPSites, 13, "N,N-dimethylproline" },
240  { "MOD_RES", ParFlatSPSites, 13, "N,N-dimethylserine" },
241  { "MOD_RES", ParFlatSPSites, 13, "N2,N2-dimethylarginine" },
242  { "MOD_RES", ParFlatSPSites, 13, "N4,N4-dimethylasparagine" },
243  { "MOD_RES", ParFlatSPSites, 13, "N4-methylasparagine" },
244  { "MOD_RES", ParFlatSPSites, 13, "N5-methylarginine" },
245  { "MOD_RES", ParFlatSPSites, 13, "N5-methylglutamine" },
246  { "MOD_RES", ParFlatSPSites, 13, "N6,N6,N6-trimethyl-5-hydroxylysine" },
247  { "MOD_RES", ParFlatSPSites, 13, "N6,N6,N6-trimethyllysine" },
248  { "MOD_RES", ParFlatSPSites, 13, "N6,N6-dimethyllysine" },
249  { "MOD_RES", ParFlatSPSites, 13, "N6-methylated lysine" },
250  { "MOD_RES", ParFlatSPSites, 13, "N6-methyllysine" },
251  { "MOD_RES", ParFlatSPSites, 13, "N6-poly(methylaminopropyl)lysine" },
252  { "MOD_RES", ParFlatSPSites, 13, "N-methylalanine" },
253  { "MOD_RES", ParFlatSPSites, 13, "N-methylglycine" },
254  { "MOD_RES", ParFlatSPSites, 13, "N-methylisoleucine" },
255  { "MOD_RES", ParFlatSPSites, 13, "N-methylleucine" },
256  { "MOD_RES", ParFlatSPSites, 13, "N-methylmethionine" },
257  { "MOD_RES", ParFlatSPSites, 13, "N-methylphenylalanine" },
258  { "MOD_RES", ParFlatSPSites, 13, "N-methylproline" },
259  { "MOD_RES", ParFlatSPSites, 13, "N-methylserine" },
260  { "MOD_RES", ParFlatSPSites, 13, "N-methyltyrosine" },
261  { "MOD_RES", ParFlatSPSites, 13, "Omega-N-methylarginine" },
262  { "MOD_RES", ParFlatSPSites, 13, "Omega-N-methylated arginine" },
263  { "MOD_RES", ParFlatSPSites, 13, "O-methylthreonine" },
264  { "MOD_RES", ParFlatSPSites, 13, "Pros-methylhistidine" },
265  { "MOD_RES", ParFlatSPSites, 13, "S-methylcysteine" },
266  { "MOD_RES", ParFlatSPSites, 13, "Symmetric dimethylarginine" },
267  { "MOD_RES", ParFlatSPSites, 13, "Tele-methylhistidine" },
268  { "MOD_RES", ParFlatSPSites, 13, "Threonine methyl ester" },
269  { "MOD_RES", ParFlatSPSites, 14, "(3R)-3-hydroxyarginine" },
270  { "MOD_RES", ParFlatSPSites, 14, "(3R)-3-hydroxyasparagine" },
271  { "MOD_RES", ParFlatSPSites, 14, "(3R)-3-hydroxyaspartate" },
272  { "MOD_RES", ParFlatSPSites, 14, "(3S)-3-hydroxyhistidine" },
273  { "MOD_RES", ParFlatSPSites, 14, "(3R,4R)-3,4-dihydroxyproline" },
274  { "MOD_RES", ParFlatSPSites, 14, "(3R,4R)-4,5-dihydroxyisoleucine" },
275  { "MOD_RES", ParFlatSPSites, 14, "(3R,4S)-3,4-dihydroxyproline" },
276  { "MOD_RES", ParFlatSPSites, 14, "(3R,4S)-4-hydroxyisoleucine" },
277  { "MOD_RES", ParFlatSPSites, 14, "(3S)-3-hydroxyasparagine" },
278  { "MOD_RES", ParFlatSPSites, 14, "(3S)-3-hydroxyaspartate" },
279  { "MOD_RES", ParFlatSPSites, 14, "(3S,4R)-3,4-dihydroxyisoleucine" },
280  { "MOD_RES", ParFlatSPSites, 14, "(4R)-5-hydroxyleucine" },
281  { "MOD_RES", ParFlatSPSites, 14, "(4R)-4,5-dihydroxyleucine" },
282  { "MOD_RES", ParFlatSPSites, 14, "3,4-dihydroxyarginine" },
283  { "MOD_RES", ParFlatSPSites, 14, "3',4'-dihydroxyphenylalanine" },
284  { "MOD_RES", ParFlatSPSites, 14, "3,4-dihydroxyproline" },
285  { "MOD_RES", ParFlatSPSites, 14, "3-hydroxyasparagine" },
286  { "MOD_RES", ParFlatSPSites, 14, "3-hydroxyaspartate" },
287  { "MOD_RES", ParFlatSPSites, 14, "3-hydroxyphenylalanine" },
288  { "MOD_RES", ParFlatSPSites, 14, "3-hydroxyproline" },
289  { "MOD_RES", ParFlatSPSites, 14, "3-hydroxytryptophan" },
290  { "MOD_RES", ParFlatSPSites, 14, "3-hydroxyvaline" },
291  { "MOD_RES", ParFlatSPSites, 14, "4,5,5'-trihydroxyleucine" },
292  { "MOD_RES", ParFlatSPSites, 14, "4,5-dihydroxylysine" },
293  { "MOD_RES", ParFlatSPSites, 14, "4-hydroxyarginine" },
294  { "MOD_RES", ParFlatSPSites, 14, "4-hydroxyglutamate" },
295  { "MOD_RES", ParFlatSPSites, 14, "4-hydroxyproline" },
296  { "MOD_RES", ParFlatSPSites, 14, "5-hydroxy-3-methylproline (Ile)" },
297  { "MOD_RES", ParFlatSPSites, 14, "5-hydroxylysine" },
298  { "MOD_RES", ParFlatSPSites, 14, "(5R)-5-hydroxylysine" },
299  { "MOD_RES", ParFlatSPSites, 14, "(5S)-5-hydroxylysine" },
300  { "MOD_RES", ParFlatSPSites, 14, "7'-hydroxytryptophan" },
301  { "MOD_RES", ParFlatSPSites, 14, "D-4-hydroxyvaline" },
302  { "MOD_RES", ParFlatSPSites, 14, "HYDROXYLATION" },
303  { "MOD_RES", ParFlatSPSites, 14, "Hydroxyproline" },
304  { "MOD_RES", ParFlatSPSites, 14, "N6-(3,6-diaminohexanoyl)-5-hydroxylysine" },
305  { "MOD_RES", ParFlatSPSites, 15, "SULFATATION" },
306  { "MOD_RES", ParFlatSPSites, 15, "Sulfoserine" },
307  { "MOD_RES", ParFlatSPSites, 15, "Sulfothreonine" },
308  { "MOD_RES", ParFlatSPSites, 15, "Sulfotyrosine" },
309  { "MOD_RES", ParFlatSPSites, 16, "OXIDATIVE DEAMINATION" },
310  { "MOD_RES", ParFlatSPSites, 17, "Pyrrolidone carboxylic acid" },
311  { "MOD_RES", ParFlatSPSites, 17, "Pyrrolidone carboxylic acid (Glu)" },
312  { "MOD_RES", ParFlatSPSites, 18, "4-carboxyglutamate" },
313  { "MOD_RES", ParFlatSPSites, 18, "GAMMA-CARBOXYGLUTAMIC ACID" },
314  { "MOD_RES", ParFlatSPSites, 19, "Blocked" },
315  { "MOD_RES", ParFlatSPSites, 19, "Blocked amino end (Ala)" },
316  { "MOD_RES", ParFlatSPSites, 19, "Blocked amino end (Arg)" },
317  { "MOD_RES", ParFlatSPSites, 19, "Blocked amino end (Asn)" },
318  { "MOD_RES", ParFlatSPSites, 19, "Blocked amino end (Asp)" },
319  { "MOD_RES", ParFlatSPSites, 19, "Blocked amino end (Asx)" },
320  { "MOD_RES", ParFlatSPSites, 19, "Blocked amino end (Cys)" },
321  { "MOD_RES", ParFlatSPSites, 19, "Blocked amino end (Gln)" },
322  { "MOD_RES", ParFlatSPSites, 19, "Blocked amino end (Glu)" },
323  { "MOD_RES", ParFlatSPSites, 19, "Blocked amino end (Gly)" },
324  { "MOD_RES", ParFlatSPSites, 19, "Blocked amino end (Ile)" },
325  { "MOD_RES", ParFlatSPSites, 19, "Blocked amino end (Leu)" },
326  { "MOD_RES", ParFlatSPSites, 19, "Blocked amino end (Met)" },
327  { "MOD_RES", ParFlatSPSites, 19, "Blocked amino end (Pro)" },
328  { "MOD_RES", ParFlatSPSites, 19, "Blocked amino end (Ser)" },
329  { "MOD_RES", ParFlatSPSites, 19, "Blocked amino end (Thr)" },
330  { "MOD_RES", ParFlatSPSites, 19, "Blocked amino end (Val)" },
331  { "MOD_RES", ParFlatSPSites, 19, "Blocked amino end (Xaa)" },
332  { "MOD_RES", ParFlatSPSites, 19, "Blocked carboxyl end (Arg)" },
333  { "MOD_RES", ParFlatSPSites, 19, "Blocked carboxyl end (His)" }, /* 174 */
334  { "DISULFID", ParFlatSPBonds, 1, nullptr },
335  { "THIOLEST", ParFlatSPBonds, 2, nullptr },
336  { "CROSSLNK", ParFlatSPBonds, 3, nullptr },
337  { "THIOETH", ParFlatSPBonds, 4, nullptr },
338  { "SIGNAL", ParFlatSPRegions, -1, "Signal" },
339  { "PROPEP", ParFlatSPRegions, -1, "Propeptide" },
340  { "CHAIN", ParFlatSPRegions, -1, "Mature chain" },
341  { "TRANSIT", ParFlatSPRegions, -1, "Transit peptide" },
342  { "PEPTIDE", ParFlatSPRegions, -1, "Processed active peptide" },
343  { "DOMAIN", ParFlatSPRegions, -1, "Domain" },
344  { "CA_BIND", ParFlatSPRegions, -1, "Calcium binding region" },
345  { "TRANSMEM", ParFlatSPRegions, -1, "Transmembrane region" },
346  { "ZN_FING", ParFlatSPRegions, -1, "Zinc finger region" },
347  { "SIMILAR", ParFlatSPRegions, -1, "Similarity" },
348  { "REPEAT", ParFlatSPRegions, -1, "Repetitive region" },
349  { "HELIX", ParFlatSPRegions, -1, "Helical region" },
350  { "STRAND", ParFlatSPRegions, -1, "Beta-strand region" },
351  { "TURN", ParFlatSPRegions, -1, "Hydrogen bonded turn" },
352  { "CONFLICT", ParFlatSPRegions, -1, "Conflict" },
353  { "VARIANT", ParFlatSPRegions, -1, "Variant" },
354  { "SE_CYS", ParFlatSPRegions, -1, "Selenocysteine" },
355  { "VARSPLIC", ParFlatSPRegions, -1, "Splicing variant" },
356  { "VAR_SEQ", ParFlatSPRegions, -1, "Splicing variant" },
357  { "COILED", ParFlatSPRegions, -1, "Coiled-coil region" },
358  { "COMPBIAS", ParFlatSPRegions, -1, "Compositionally biased region" },
359  { "MOTIF", ParFlatSPRegions, -1, "Short sequence motif of biological interest" },
360  { "REGION", ParFlatSPRegions, -1, "Region of interest in the sequence" },
361  { "TOPO_DOM", ParFlatSPRegions, -1, "Topological domain" },
362  { "INTRAMEM", ParFlatSPRegions, -1, "Intramembrane region" },
363  { "UNSURE", ParFlatSPImports, -1, "unsure" },
364  { "INIT_MET", ParFlatSPInitMet, -1, "INIT_MET" },
365  { "NON_TER", ParFlatSPNonTer, -1, "NON_TER" },
366  { "NON_CONS", ParFlatSPNonCons, -1, "NON_CONS" },
367  { nullptr, 0, 0, nullptr }
368 };
369 
370 /* Inclusive array-index range of the MOD_RES entries in the "ParFlat_SPFeat" table; the values match the "9" and "174" markers written next to the first and last MOD_RES rows of that table
371  */
372 #define ParFlatSPSitesModB 9 /* index of the first MOD_RES entry */
373 #define ParFlatSPSitesModE 174 /* index of the last MOD_RES entry */
374 
/* Copyright notice fragments.
 * NOTE(review): presumably compared against entry text to recognise the
 * copyright notice - the code using them is not visible here; confirm.
 */
375 #define COPYRIGHT "This Swiss-Prot entry is copyright."
376 #define COPYRIGHT1 "Copyrighted by the UniProt Consortium,"
377 
/* Tags for DE-line keywords ("RecName:", "Full=", "EC=", ...); the keyword
 * table defined after the structs below pairs each keyword with its tag.
 * The values are OCTAL literals (leading zero) and each one sets a different
 * single bit.
 * NOTE(review): whether callers ever OR several tags together is not visible
 * here; confirm before relying on it.
 */
378 #define SPDE_RECNAME 000001
379 #define SPDE_ALTNAME 000002
380 #define SPDE_SUBNAME 000004
381 #define SPDE_FLAGS 000010
382 #define SPDE_INCLUDES 000020
383 #define SPDE_CONTAINS 000040
384 #define SPDE_FULL 000100
385 #define SPDE_SHORT 000200
386 #define SPDE_EC 000400
387 #define SPDE_ALLERGEN 001000
388 #define SPDE_BIOTECH 002000
389 #define SPDE_CD_ANTIGEN 004000
390 #define SPDE_INN 010000
391 
392 struct CharIntLen {
393  const char* str;
396 };
397 
398 struct SPDEFields {
399  Int4 tag = 0;
400  char* start = nullptr;
401  char* end = nullptr;
402  SPDEFields* next = nullptr;
403 };
405 
/* One FT (feature table) record as read from the flat file; records are
 * chained through "next".
 */
struct SPFeatInput {
    /* column 6-13 */
    string key;
    /* column 15-20 */
    string from;
    /* column 22-27 */
    string to;
    /* column 35-75, continue line if a blank key */
    string descrip;
    /* next FT */
    SPFeatInput* next{ nullptr };
};
414 
/* Set of boolean feature flags; every flag starts out false. */
struct SPFeatBln {
    bool initmet{ false };
    bool nonter{ false };
    bool noright{ false };
    bool noleft{ false };
};
422 
423 /* segment location, data from NON_CONS
424  */
425 struct SPSegLoc {
426  Int4 from = 0; /* the beginning point of the segment */
427  Int4 len = 0; /* total length of the segment */
428  SPSegLoc* next = nullptr;
429 };
431 
/* Node of a singly linked list of synonym names. */
struct SetOfSyns {
    char*      synname{ nullptr };
    SetOfSyns* next{ nullptr };
};
437 
438 struct SetOfSpecies {
439  char* fullname = nullptr;
440  char* name = nullptr;
441  SetOfSyns* syn = nullptr;
442 };
444 
445 struct ViralHost {
447  char* name = nullptr;
448  ViralHost* next = nullptr;
449 };
451 
452 // clang-format off
454  {"RecName:", SPDE_RECNAME, 8},
455  {"AltName:", SPDE_ALTNAME, 8},
456  {"SubName:", SPDE_SUBNAME, 8},
457  {"Includes:", SPDE_INCLUDES, 9},
458  {"Contains:", SPDE_CONTAINS, 9},
459  {"Flags:", SPDE_FLAGS, 6},
460  {"Full=", SPDE_FULL, 5},
461  {"Short=", SPDE_SHORT, 6},
462  {"EC=", SPDE_EC, 3},
463  {"Allergen=", SPDE_ALLERGEN, 9},
464  {"Biotech=", SPDE_BIOTECH, 8},
465  {"CD_antigen=", SPDE_CD_ANTIGEN, 11},
466  {"INN=", SPDE_INN, 4},
467  {nullptr, 0, 0},
468 };
469 
/* nullptr-terminated list of upper-case organism modifier keywords.
 * NOTE(review): the code consuming this table is not visible here.
 */
470 const char* org_mods[] = {
471  "STRAIN", "SUBSTRAIN", "TYPE", "SUBTYPE", "VAR.", "SEROTYPE",
472  "SEROGROUP", "SEROVAR", "CULTIVAR", "PATHOVAR", "CHEMOVAR", "BIOVAR",
473  "BIOTYPE", "GROUP", "SUBGROUP", "ISOLATE", "ACRONYM", "DOSAGE",
474  "NAT_HOST", "SUBSP.", nullptr
475 };
476 
/* nullptr-terminated list of upper-case database names treated as obsolete.
 * NOTE(review): presumably checked against cross-reference database names -
 * the consumer is not visible here; confirm.
 */
477 const char* obsolete_dbs[] = {
478  "2DBASE-ECOLI", "AARHUS/GHENT-2DPAGE", "AGD",
479  "ANU-2DPAGE", "BURULIST", "CARBBANK",
480  "CMR", "CORNEA-2DPAGE", "DICTYDB",
481  "DOMO", "ECO2DBASE", "GCRDB",
482  "GENEVESTIGATOR", "GENEW", "GENOMEREVIEWS",
483  "GERMONLINE", "HIV", "HSC-2DPAGE",
484  "HSSP", "IPI", "LINKHUB",
485  "LISTILIST", "MAIZE-2DPAGE", "MENDEL",
486  "MGD", "MYPULIST", "NMPDR",
487  "PATHWAY_INTERACTION_DB", "PHCI-2DPAGE", "PHOSSITE",
488  "PPTASEDB", "PROTCLUSTDB", "PHOTOLIST",
489  "PMMA-2DPAGE", "RAT-HEART-2DPAGE", "RZPD-PROTEXP",
490  "SAGALIST", "SIENA-2DPAGE", "STYGENE",
491  "SUBTILIST", "TIGR", "TRANSFAC",
492  "WORMPEP", "YEPD", "YPD",
493  nullptr
494 };
495 
/* nullptr-terminated list of upper-case database names accepted as valid.
 * NOTE(review): presumably checked against cross-reference database names -
 * the consumer is not visible here; confirm.
 */
496 const char* valid_dbs[] = {
497  "ALLERGOME", "ARACHNOSERVER", "ARAPORT",
498  "ARRAYEXPRESS", "BEEBASE", "BGD",
499  "BGEE", "BINDINGDB", "BIOCYC",
500  "BIOGRID", "BIOMUTA", "BRENDA",
501  "CAZY", "CCDS", "CDD",
502  "CGD", "CHEMBL", "CHITARS",
503  "CLEANEX", "COLLECTF", "COMPLUYEAST-2DPAGE",
504  "CONOSERVER", "CTD", "CYGD",
505  "DBSNP", "DEPOD", "DICTYBASE",
506  "DIP", "DISGENET", "DISPROT",
507  "DMDM", "DNASU", "DOSAC-COBS-2DPAGE",
508  "DRUGBANK", "ECHOBASE", "ECOGENE",
509  "EGGNOG", "EMBL", "ENSEMBL",
510  "ENSEMBLBACTERIA", "ENSEMBLFUNGI", "ENSEMBLMETAZOA",
511  "ENSEMBLPLANTS", "ENSEMBLPROTISTS", "EPD",
512  "ESTHER", "EUHCVDB", "EUPATHDB",
513  "EUROPEPMC", "EVOLUTIONARYTRACE", "EXPRESSIONATLAS",
514  "FLYBASE", "GENE3D", "GENECARDS",
515  "GENEDB", "GENEDB_SPOMBE", "GENEFARM",
516  "GENEID", "GENEREVIEWS", "GENETREE",
517  "GENEVISIBLE", "GENEWIKI", "GENOLIST",
518  "GENOMERNAI", "GK", "GLYCOSUITEDB",
519  "GRAINGENES", "GO", "GRAMENE",
520  "GUIDETOPHARMACOLOGY", "H-INVDB", "HAMAP",
521  "HGNC", "HOGENOM", "HOVERGEN",
522  "HPA", "IMGT/GENE-DB", "IMGT/HLA",
523  "IMGT/LIGM", "IMGT_GENE-DB", "INPARANOID",
524  "INTACT", "INTERPRO", "IPD-KIR",
525  "IPTMNET", "KEGG", "KO",
526  "LEGIOLIST", "LEPROMA", "MAIZEDB",
527  "MAIZEGDB", "MALACARDS", "MAXQB",
528  "MEROPS", "MGI", "MIM",
529  "MINT", "MIRBASE", "MOONPROT",
530  "MYCOCLAP", "NEXTBIO", "NEXTPROT",
531  "OGP", "OMA", "OPENTARGETS",
532  "ORPHANET", "ORTHODB", "PANTHER",
533  "PATRIC", "PAXDB", "PDB",
534  "PDBSUM", "PEPTIDEATLAS", "PEROXIBASE",
535  "PFAM", "PHARMGKB", "PHOSPHOSITE",
536  "PHOSPHOSITEPLUS", "PHYLOMEDB", "PIR",
537  "PIRSF", "PMAP-CUTDB", "POMBASE",
538  "PR", "PR2", "PRIDE",
539  "PRINTS", "PRO", "PRODOM",
540  "PROMEX", "PROSITE", "PROTEINMODELPORTAL",
541  "PROTEOMES", "PSEUDOCAP", "REACTOME",
542  "REBASE", "REFSEQ", "REPRODUCTION-2DPAGE",
543  "RGD", "RZPD", "SABIO-RK",
544  "SFLD", "SGD", "SIGNALINK",
545  "SIGNALLINK", "SIGNOR", "SMART",
546  "SMR", "STRING", "SUPFAM",
547  "SWISS-2DPAGE", "SWISSLIPIDS", "SWISSPALM",
548  "TAIR", "TCDB", "TIGRFAMS",
549  "TOPDOWNPROTEOMICS", "TREEFAM", "TUBERCULIST",
550  "UCD-2DPAGE", "UCSC", "UNICARBKB",
551  "UNIGENE", "UNILIB", "UNIPATHWAY",
552  "UNITE", "VBASE2", "VECTORBASE",
553  "VEGA-TR", "VEGA-GN", "VGNC",
554  "WBPARASITE", "WORLD-2DPAGE", "WORMBASE",
555  "XENBASE", "ZFIN", nullptr
556 };
557 
/* nullptr-terminated list of organelle names matched (via StringMatchIcase)
 * against the text of the OG sub-block. The caller tests the returned array
 * index against the literal values 0..8, so the ORDER of the entries is
 * significant: do not reorder or insert in the middle.
 */
558 const char* SP_organelle[] = {
559  "CHLOROPLAST", "CYANELLE", "MITOCHONDRION", "PLASMID", "NUCLEOMORPH",
560  "HYDROGENOSOME", "APICOPLAST", "CHROMATOPHORE",
561  "ORGANELLAR CHROMATOPHORE", nullptr
562 };
563 
/* nullptr-terminated list of evidence-level phrases.
 * NOTE(review): presumably the values accepted on the PE line - the code
 * consuming this table is not visible here; confirm.
 */
564 const char* PE_values[] = {
565  "Evidence at protein level",
566  "Evidence at transcript level",
567  "Inferred from homology",
568  "Predicted",
569  "Uncertain",
570  nullptr
571 };
572 // clang-format on
573 
574 /**********************************************************
575  *
576  * static char* StringCombine(str1, str2, delim):
577  *
578  * Return a string which is combined str1 and str2,
579  * put blank between two strings if "blank" = TRUE;
580  * also memory free out str1 and str2.
581  *
582  **********************************************************/
583 static void StringCombine(string& dest, const string& to_add, const Char* delim)
584 {
585  if (to_add.empty())
586  return;
587 
588  if (delim && *delim != '\0' && ! dest.empty())
589  dest += delim[0];
590 
591  dest += to_add;
592 }
593 
594 /**********************************************************
595  *
596  * static CRef<CDbtag> MakeStrDbtag(dbname, str):
597  *
598  * 10-1-93
599  *
600  **********************************************************/
601 static CRef<CDbtag> MakeStrDbtag(const char* dbname, const char* str)
602 {
604 
605  if (dbname && str) {
606  tag.Reset(new CDbtag);
607  tag->SetDb(dbname);
608  tag->SetTag().SetStr(str);
609  }
610 
611  return tag;
612 }
613 
614 /**********************************************************
615  *
616  * static CRef<CDate> MakeDatePtr(str, source):
617  *
618  * Return a DatePtr with "std" type if dd-mmm-yyyy
619  * or dd-mmm-yy format; with "str" type if not
620  * a dd-mmm-yyyy format.
621  *
622  **********************************************************/
624 {
625  static Char msg[11];
626 
627  CRef<CDate> res(new CDate);
628 
629  if (! str)
630  return res;
631 
632  if (StringChr(str, '-') && (isdigit(*str) != 0 || *str == ' ')) {
633  CRef<CDate_std> std_date = get_full_date(str, true, source);
634  res->SetStd(*std_date);
635  if (XDateCheck(*std_date) != 0) {
636  StringNCpy(msg, str, 10);
637  msg[10] = '\0';
638  ErrPostEx(SEV_WARNING, ERR_REFERENCE_IllegalDate, "Illegal date: %s", msg);
639  }
640  }
641 
642  if (res->Which() == CDate::e_not_set) {
643  res->SetStr(str);
644  }
645 
646  return res;
647 }
648 
649 /**********************************************************/
650 static void fta_create_pdb_seq_id(CSP_block_Base::TSeqref& refs, const char* mol, Uint1 chain)
651 {
652  if (! mol)
653  return;
654 
655  CRef<CPDB_seq_id> pdb_seq_id(new CPDB_seq_id);
656  pdb_seq_id->SetMol(CPDB_mol_id(mol));
657 
658  if (chain > 0) {
659  pdb_seq_id->SetChain(chain);
660  }
661 
662  CRef<CSeq_id> sid(new CSeq_id);
663  sid->SetPdb(*pdb_seq_id);
664  refs.push_back(sid);
665 }
666 
667 /**********************************************************/
668 static void MakeChainPDBSeqId(CSP_block_Base::TSeqref& refs, const char* mol, char* chain)
669 {
670  char* fourth;
671  char* p;
672  char* q;
673  char* r;
674 
675  bool bad;
676  bool got;
677 
678  if (! mol || ! chain)
679  return;
680 
681  fourth = StringSave(chain);
682  for (bad = false, got = false, q = chain; *q != '\0'; q = p) {
683  while (*q == ' ' || *q == ',')
684  q++;
685  for (p = q; *p != '\0' && *p != ' ' && *p != ',';)
686  p++;
687  if (*p != '\0')
688  *p++ = '\0';
689  r = StringRChr(q, '=');
690  if (! r && ! got) {
691  fta_create_pdb_seq_id(refs, mol, 0);
692  continue;
693  }
694  *r = '\0';
695  for (r = q; *r != '\0'; r++) {
696  if (*r == '/')
697  continue;
698  if (r[1] != '/' && r[1] != '\0') {
699  while (*r != '/' && *r != '\0')
700  r++;
701  r--;
702  bad = true;
703  continue;
704  }
705  got = true;
706  fta_create_pdb_seq_id(refs, mol, *r);
707  }
708  }
709 
710  if (bad) {
711  ErrPostEx(SEV_ERROR, ERR_FORMAT_InvalidPDBCrossRef, "PDB cross-reference \"%s\" contains one or more chain identifiers that are more than a single character in length.", fourth);
712  if (! got)
713  fta_create_pdb_seq_id(refs, mol, 0);
714  }
715 
716  MemFree(fourth);
717 }
718 
719 /**********************************************************
720  *
721  * static void MakePDBSeqId(refs, mol, rel, chain, drop, source):
722  *
723  * 10-1-93
724  *
725  **********************************************************/
726 static void MakePDBSeqId(CSP_block_Base::TSeqref& refs, const char* mol, const char* rel, char* chain, bool* drop, Parser::ESource source)
727 {
728  if (! mol)
729  return;
730 
731  if (! chain) {
732  CRef<CPDB_seq_id> pdb_seq_id(new CPDB_seq_id);
733  pdb_seq_id->SetMol(CPDB_mol_id(mol));
734 
735  if (rel) {
736  CRef<CDate> date = MakeDatePtr(rel, source);
737  pdb_seq_id->SetRel(*date);
738  }
739 
740  CRef<CSeq_id> sid(new CSeq_id);
741  sid->SetPdb(*pdb_seq_id);
742  refs.push_back(sid);
743  } else
744  MakeChainPDBSeqId(refs, mol, chain);
745 }
746 
747 /**********************************************************/
748 static void GetIntFuzzPtr(Uint1 choice, Int4 a, Int4 b, CInt_fuzz& fuzz)
749 {
750  if (choice < 1 || choice > 4)
751  return;
752 
753  if (choice == 2) {
754  fuzz.SetRange().SetMax(a);
755  if (b >= 0)
756  fuzz.SetRange().SetMin(b);
757  } else if (choice == 4) {
758  fuzz.SetLim(static_cast<CInt_fuzz::ELim>(a));
759  }
760 }
761 
762 /**********************************************************/
764 {
765  DataBlkPtr subdbp;
766  char* p;
767  Int4 gmod;
768 
769  for (gmod = -1; dbp; dbp = dbp->mpNext)
770  if (dbp->mType == ParFlatSP_OS) {
771  subdbp = static_cast<DataBlk*>(dbp->mpData);
772  for (; subdbp; subdbp = subdbp->mpNext)
773  if (subdbp->mType == ParFlatSP_OG) {
774  p = subdbp->mOffset + ParFlat_COL_DATA_SP;
775  if (StringEquNI(p, "Plastid;", 8))
776  for (p += 8; *p == ' ';)
777  p++;
778  gmod = StringMatchIcase(SP_organelle, p);
779  }
780  }
781  if (gmod == -1)
783  if (gmod == 0)
785  if (gmod == 1)
787  if (gmod == 2)
789  if (gmod == 3)
791  if (gmod == 4)
793  if (gmod == 5)
795  if (gmod == 6)
797  if (gmod == 7 || gmod == 8)
800 }
801 
802 /**********************************************************/
803 static void SpAddToIndexBlk(DataBlkPtr entry, IndexblkPtr pIndex)
804 {
805  char* eptr;
806  char* offset;
807  size_t len = 0;
808 
809  offset = SrchNodeType(entry, ParFlatSP_ID, &len);
810  if (! offset || len == 0)
811  return;
812 
813  eptr = offset + len - 1;
814  if (len > 5 && StringEquN(eptr - 3, "AA.", 3))
815  eptr -= 4;
816 
817  while (*eptr == ' ' && eptr > offset)
818  eptr--;
819  while (isdigit(*eptr) != 0 && eptr > offset)
820  eptr--;
821  pIndex->bases = atoi(eptr + 1);
822  while (*eptr == ' ' && eptr > offset)
823  eptr--;
824  if (*eptr == ';')
825  eptr--;
826  while (isalpha(*eptr) != 0 && eptr > offset)
827  eptr--;
828 
829  StringNCpy(pIndex->division, eptr + 1, 3);
830  pIndex->division[3] = '\0';
831 }
832 
833 /**********************************************************
834  *
835  * static void GetSprotSubBlock(pp, entry):
836  *
837  * 9-23-93
838  *
839  **********************************************************/
840 static void GetSprotSubBlock(ParserPtr pp, const DataBlk* entry)
841 {
842  DataBlkPtr dbp;
843 
844  dbp = TrackNodeType(*entry, ParFlatSP_OS);
845  if (dbp) {
846  BuildSubBlock(dbp, ParFlatSP_OG, "OG");
847  BuildSubBlock(dbp, ParFlatSP_OC, "OC");
848  BuildSubBlock(dbp, ParFlatSP_OX, "OX");
849  BuildSubBlock(dbp, ParFlatSP_OH, "OH");
850  GetLenSubNode(dbp);
851  }
852 
853  dbp = TrackNodeType(*entry, ParFlatSP_RN);
854  for (; dbp; dbp = dbp->mpNext) {
855  if (dbp->mType != ParFlatSP_RN)
856  continue;
857 
858  BuildSubBlock(dbp, ParFlatSP_RP, "RP");
859  BuildSubBlock(dbp, ParFlatSP_RC, "RC");
860  BuildSubBlock(dbp, ParFlatSP_RM, "RM");
861  BuildSubBlock(dbp, ParFlatSP_RX, "RX");
862  BuildSubBlock(dbp, ParFlatSP_RG, "RG");
863  BuildSubBlock(dbp, ParFlatSP_RA, "RA");
864  BuildSubBlock(dbp, ParFlatSP_RT, "RT");
865  BuildSubBlock(dbp, ParFlatSP_RL, "RL");
866  GetLenSubNode(dbp);
867  dbp->mType = ParFlat_REF_END; /* swiss-prot only has one type */
868  }
869 }
870 
871 /**********************************************************
872  *
873  * static char* GetSPDescrTitle(bptr, eptr, fragment)
874  *
875  * Return title string without "(EC ...)" and
876  * "(FRAGMENT)".
877  *
878  * 10-8-93
879  *
880  **********************************************************/
/*
 * Review notes (comments only, code unchanged):
 *  - All editing happens in the buffer "str"; the value returned is the
 *    result of tata_save(str) and "str" itself is released with MemFree().
 *  - *fragment is set to true when a fragment marker is found; it is never
 *    set to false here, so the caller must initialise it.
 *  - NOTE(review): no assignment to "str" is visible in this listing
 *    (listing line 892 is absent); presumably it receives a copy of the
 *    text between bptr and eptr - confirm against the repository.
 */
881 static char* GetSPDescrTitle(char* bptr, char* eptr, bool* fragment)
882 {
883  const char* tag;
884  char* ptr;
885  char* str;
886  char* p;
887  char* q;
888  Char symb;
889  Int4 shift;
890  bool ret;
891
893  StripECO(str);
894
895  ptr = StringStr(str, "(GENE NAME");
896  if (ptr) {
897  ErrPostStr(SEV_WARNING, ERR_GENENAME_DELineGeneName, "Old format, found gene_name in the DE data line");
898  }
899
900  ShrinkSpaces(str);
901
902  /* Delete (EC ...)
903  */
     /* Text starting with RecName:/AltName:/SubName: (any case) uses the
        "; EC=" marker closed by ';'; anything else uses "(EC " closed by ')'.
        "shift" is the length of the chosen marker. */
904  if (StringEquNI(str, "RecName: ", 9) ||
905  StringEquNI(str, "AltName: ", 9) ||
906  StringEquNI(str, "SubName: ", 9)) {
907  tag = "; EC=";
908  symb = ';';
909  shift = 5;
910  } else {
911  tag = "(EC ";
912  symb = ')';
913  shift = 4;
914  }
915
916  for (ptr = str;;) {
917  ptr = StringStr(ptr, tag);
918  if (! ptr)
919  break;
920
921  for (p = ptr + shift; *p == ' ';)
922  p++;
923
     /* marker with no number behind it: leave it in place, keep searching */
924  if (*p == symb || *p == '\0') {
925  ptr = p;
926  continue;
927  }
928
     /* skip the EC number itself: digits, '.', '-' and 'n' */
     /* NOTE(review): isdigit() receives a plain char; a negative value is
        undefined behaviour - consider casting to unsigned char. */
929  while (*p == '.' || *p == '-' || *p == 'n' || isdigit(*p) != 0)
930  p++;
931  if (symb == ')')
932  while (*p == ' ' || *p == ')')
933  p++;
934
     /* move the remainder of the text over the removed marker */
935  fta_StringCpy(ptr, p);
936  }
937
938  if (symb == ';') {
     /* new style: look inside every "; Flags:" section for Fragment(s) */
939  for (ptr = str;;) {
940  ptr = StringIStr(ptr, "; Flags:");
941  if (! ptr)
942  break;
     /* "; Flags:" at the very end of the text is simply cut off */
943  if (ptr[8] == '\0') {
944  *ptr = '\0';
945  break;
946  }
947  if (ptr[8] != ' ') {
948  ptr += 8;
949  continue;
950  ;
951  }
     /* scan for " Fragment" before the next ':' found in the text */
952  for (q = ptr + 8;;) {
953  p = StringChr(q, ':');
954  q = StringIStr(q, " Fragment");
955  if (! q || (p && q > p))
956  break;
957
     /* accepted only when followed by ';', end of text, or 's'/'S' plus
        ';' or end of text; otherwise step past this match and retry */
958  ret = true;
959  if (q[9] == ';')
960  fta_StringCpy(q, q + 10);
961  else if (q[9] == '\0')
962  *q = '\0';
963  else if (q[9] == 's' || q[9] == 'S') {
964  if (q[10] == ';')
965  fta_StringCpy(q, q + 11);
966  else if (q[10] == '\0')
967  *q = '\0';
968  else {
969  q++;
970  ret = false;
971  }
972  } else {
973  q++;
974  ret = false;
975  }
976  if (ret)
977  *fragment = true;
978  }
979  if (ptr[8] == '\0') {
980  *ptr = '\0';
981  break;
982  }
     /* decide, from the positions of the next ';' and ':', whether to drop
        the "Flags:" label itself (copy over it) or to step past it */
983  q = StringChr(ptr + 8, ';');
984  p = StringChr(ptr + 8, ':');
985  if (! q) {
986  if (! p)
987  break;
988  else
989  fta_StringCpy(ptr + 2, ptr + 9);
990  } else {
991  if (! p)
992  ptr += 9;
993  else {
994  if (p < q)
995  fta_StringCpy(ptr + 2, ptr + 9);
996  else
997  ptr += 9;
998  }
999  }
1000  }
1001  } else {
     /* old style: only the first "(FRAGMENT" (any case) is handled */
1002  ptr = StringIStr(str, "(FRAGMENT");
1003  if (ptr) {
1004  /* delete (FRAGMENTS) or (FRAGMENT)
1005  */
1006  *fragment = true;
1007
1008  for (p = ptr + 8; *p != '\0' && *p != ')';)
1009  p++;
1010  while (*p == ' ' || *p == ')')
1011  p++;
1012
1013  fta_StringCpy(ptr, p);
1014  }
1015  }
1016
     /* if the text ends with '.', remove the blanks directly before it */
     /* NOTE(review): when the result is empty, p points one byte before
        the buffer and *p is read - confirm tata_save() never yields "". */
1017  ptr = tata_save(str);
1018  p = ptr + StringLen(ptr) - 1;
1019  if (*p == '.') {
1020  while (p > ptr && *(p - 1) == ' ')
1021  p--;
1022  *p = '.';
1023  p[1] = '\0';
1024  }
1025  MemFree(str);
1026  return (ptr);
1027 }
1028 
1029 /**********************************************************/
1030 static char* GetLineOSorOC(DataBlkPtr dbp, const char* pattern)
1031 {
1032  char* res;
1033  char* p;
1034  char* q;
1035 
1036  size_t len = dbp->len;
1037  if (len == 0)
1038  return nullptr;
1039  for (size_t i = 0; i < dbp->len; i++)
1040  if (dbp->mOffset[i] == '\n')
1041  len -= 5;
1042  res = MemNew(len);
1043  p = res;
1044  for (q = dbp->mOffset; *q != '\0';) {
1045  if (! StringEquN(q, pattern, 5))
1046  break;
1047  if (p > res)
1048  *p++ = ' ';
1049  for (q += 5; *q != '\n' && *q != '\0'; q++)
1050  *p++ = *q;
1051  if (*q == '\n')
1052  q++;
1053  }
1054  *p = '\0';
1055  if (p > res)
1056  p--;
1057  while (*p == '.' || *p == ' ' || *p == '\t') {
1058  *p = '\0';
1059  if (p > res)
1060  p--;
1061  }
1062  return (res);
1063 }
1064 
1065 /**********************************************************/
1067 {
     /*
      * NOTE(review): the signature line is absent from this listing; the
      * body reads a "line" argument and is presumably GetSetOfSpecies(),
      * which is called with the joined OS text elsewhere in this file.
      *
      * Splits "line" into a SetOfSpecies record:
      *   fullname - the text after leading blanks, tabs, '.' and ',';
      *   name     - the part of fullname before the first '(' with
      *              trailing blanks/tabs removed (all of it if no '(');
      *   syn      - one SetOfSyns node per top-level "( ... )" group.
      * Returns nullptr when "line" is null, empty, or holds nothing but
      * the skipped leading characters.
      *
      * NOTE(review): "syn" is assigned only when a '(' exists and the
      * last node's "next" is never assigned here - this presumably relies
      * on default initialisers in SetOfSpecies/SetOfSyns; confirm.
      */
1068  SetOfSpeciesPtr res;
1069  SetOfSynsPtr ssp;
1070  SetOfSynsPtr tssp;
1071  char* p;
1072  char* q;
1073  char* r;
1074  char* temp;
1075  Int2 i;
1076
1077  if (! line || line[0] == '\0')
1078  return nullptr;
1079  for (p = line; *p == ' ' || *p == '\t' || *p == '.' || *p == ',';)
1080  p++;
1081  if (*p == '\0')
1082  return nullptr;
1083
1084  res = new SetOfSpecies;
1085  res->fullname = StringSave(p);
1086
     /* "temp" is a scratch copy that is cut up in place and freed below */
1087  temp = StringSave(res->fullname);
1088  p = StringChr(temp, '(');
1089  if (! p)
1090  res->name = StringSave(temp);
1091  else {
1092  *p = '\0';
1093  q = temp;
1094  if (p > q) {
1095  for (r = p - 1; *r == ' ' || *r == '\t'; r--) {
1096  *r = '\0';
1097  if (r == q)
1098  break;
1099  }
1100  }
1101  res->name = StringSave(temp);
1102  *p = '(';
     /* "ssp" is a dummy list head; it is deleted once the list is built */
1103  ssp = new SetOfSyns;
1104  tssp = ssp;
1105  for (;;) {
1106  for (p++; *p == ' ' || *p == '\t';)
1107  p++;
1108  q = p;
     /* "i" counts open parentheses so nested groups stay in one synonym */
1109  for (i = 1; *p != '\0'; p++) {
1110  if (*p == '(')
1111  i++;
1112  else if (*p == ')')
1113  i--;
1114  if (i == 0)
1115  break;
1116  }
     /* unbalanced group: the rest of the text becomes the last synonym */
1117  if (*p == '\0') {
1118  tssp->next = new SetOfSyns;
1119  tssp = tssp->next;
1120  tssp->synname = StringSave(q);
1121  break;
1122  }
     /* cut at the closing ')', trim trailing blanks, save, restore ')' */
1123  *p = '\0';
1124  if (p > q) {
1125  for (r = p - 1; *r == ' ' || *r == '\t'; r--) {
1126  *r = '\0';
1127  if (r == q)
1128  break;
1129  }
1130  }
1131  tssp->next = new SetOfSyns;
1132  tssp = tssp->next;
1133  tssp->synname = StringSave(q);
1134  *p = ')';
1135  p = StringChr(p, '(');
1136  if (! p)
1137  break;
1138  }
1139
1140  res->syn = ssp->next;
1141  delete ssp;
1142  }
1143
1144  MemFree(temp);
1145  return (res);
1146 }
1147 
1148 /**********************************************************/
1149 static void fix_taxname_dot(COrg_ref& org_ref)
1150 {
1151  if (! org_ref.IsSetTaxname())
1152  return;
1153 
1154  string& taxname = org_ref.SetTaxname();
1155 
1156  size_t len = taxname.size();
1157  if (len < 3)
1158  return;
1159 
1160  const Char* p = taxname.c_str() + len - 3;
1161  if ((p[0] == ' ' || p[0] == '\t') && (p[1] == 's' || p[1] == 'S') &&
1162  (p[2] == 'p' || p[2] == 'P') && p[3] == '\0') {
1163  if (NStr::CompareNocase(taxname.c_str(), "BACTERIOPHAGE SP") == 0)
1164  return;
1165 
1166  taxname += ".";
1167  }
1168 }
1169 
1170 /**********************************************************/
1172 {
     /*
      * NOTE(review): the signature line is absent from this listing; the
      * body reads a "sosp" argument and returns CRef<COrg_ref>, so it is
      * presumably fill_orgref(), which is called that way elsewhere in
      * this file.
      *
      * Builds an Org-ref from a parsed species record:
      *   - taxname is taken from sosp->name when that is non-empty;
      *   - every synonym is appended to taxname in parentheses, stored as
      *     the common name, or pushed to the Syn list, as decided below.
      * Returns an empty CRef when sosp is null.
      *
      * NOTE(review): "num" is assigned in the org_mods loops but never
      * read in this body.
      */
1173  SetOfSynsPtr synsp;
1174
1175  const char** b;
1176
1177  char* p;
1178  char* q;
1179  Uint1 num;
1180  size_t i;
1181
1182  CRef<COrg_ref> org_ref;
1183
1184  if (! sosp)
1185  return org_ref;
1186
1187  org_ref.Reset(new COrg_ref);
1188
1189  if (sosp->name && sosp->name[0] != '\0')
1190  org_ref->SetTaxname(sosp->name);
1191
1192  for (synsp = sosp->syn; synsp; synsp = synsp->next) {
1193  p = synsp->synname;
1194  if (! p || *p == '\0')
1195  continue;
1196
     /* PLASMID or CLONE as a separate word: the synonym joins the taxname.
        "i" is the word length, plus one when a preceding character exists,
        so q[0] and q[i] are the characters on either side of the word. */
1197  q = StringIStr(p, "PLASMID");
1198  if (! q)
1199  q = StringIStr(p, "CLONE");
1200  if (q) {
1201  i = (*q == 'C' || *q == 'c') ? 5 : 7;
1202  if (q > p) {
1203  q--;
1204  i++;
1205  }
1206  if ((q == p || q[0] == ' ' || q[0] == '\t') &&
1207  (q[i] == ' ' || q[i] == '\t' || q[i] == '\0')) {
1208  if (! org_ref->IsSetTaxname())
1209  org_ref->SetTaxname(p);
1210  else {
1211  string& taxname = org_ref->SetTaxname();
1212  taxname += " (";
1213  taxname += p;
1214  taxname += ")";
1215  }
1216  continue;
1217  }
1218  }
1219
     /* "PV." as first word, or exactly AD11A / AD11P: same treatment */
1220  if ((StringEquNI(p, "PV.", 3) && (p[3] == ' ' || p[3] == '\t' || p[3] == '\0')) ||
1221  NStr::CompareNocase(p, "AD11A") == 0 || NStr::CompareNocase(p, "AD11P") == 0) {
1222  if (! org_ref->IsSetTaxname())
1223  org_ref->SetTaxname(p);
1224  else {
1225  string& taxname = org_ref->SetTaxname();
1226  taxname += " (";
1227  taxname += p;
1228  taxname += ")";
1229  }
1230  continue;
1231  }
1232
     /* a synonym consisting of a single word goes to the Syn list */
1233  for (q = p; *p != '\0' && *p != ' ' && *p != '\t';)
1234  p++;
1235  if (*p == '\0') {
1236  org_ref->SetSyn().push_back(q);
1237  continue;
1238  }
1239
     /* temporarily end the string after the first word; q = the rest */
1240  *p = '\0';
1241  for (q = p + 1; *q == ' ' || *q == '\t';)
1242  q++;
1243
     /* NOTE(review): this branch leaves the synonym cut at the first
        word (the blank is not put back) - confirm nothing reads it later. */
1244  if (NStr::CompareNocase(synsp->synname, "COMMON") == 0) {
1245  if (! org_ref->IsSetCommon())
1246  org_ref->SetCommon(q);
1247  else
1248  org_ref->SetSyn().push_back(q);
1249  continue;
1250  }
1251
     /* is the first word one of the entries of org_mods? */
1252  for (b = org_mods, num = 2; *b; b++, num++)
1253  if (NStr::CompareNocase(synsp->synname, *b) == 0)
1254  break;
1255  *p = ' ';
1256
1257  if (! *b) {
     /* no: look for ISOLATE or STRAIN as a separate word further inside */
1258  for (b = org_mods, num = 2; *b; b++, num++) {
1259  if (NStr::CompareNocase(*b, "ISOLATE") != 0 &&
1260  NStr::CompareNocase(*b, "STRAIN") != 0)
1261  continue;
1262  p = StringIStr(synsp->synname, *b);
1263  if (! p)
1264  continue;
1265
     /* NOTE(review): when the match is at the very start of synname,
        p-- steps before the buffer and *p is then read - confirm. */
1266  p--;
1267  i = StringLen(*b) + 1;
1268  if (*p == ' ' && (p[i] == ' ' || p[i] == '\t' || p[i] == '\0')) {
1269  string& taxname = org_ref->SetTaxname();
1270  taxname += " (";
1271  taxname += synsp->synname;
1272  taxname += ")";
1273  break;
1274  }
1275  }
1276
1277  if (! *b)
1278  org_ref->SetSyn().push_back(synsp->synname);
1279  continue;
1280  }
1281
     /* first word is an org_mods entry: append the synonym to taxname */
1282  string& taxname = org_ref->SetTaxname();
1283  if (! taxname.empty())
1284  taxname += " ";
1285
1286  taxname += "(";
1287  taxname += synsp->synname;
1288  taxname += ")";
1289  }
1290
1291  fix_taxname_dot(*org_ref);
1292  if (org_ref->IsSetSyn() && org_ref->GetSyn().empty())
1293  org_ref->ResetSyn();
1294
1295  return org_ref;
1296 }
1297 
1298 /**********************************************************/
1300 {
1301  SetOfSynsPtr ssp;
1302  SetOfSynsPtr tssp;
1303 
1304  if (sosp->fullname)
1305  MemFree(sosp->fullname);
1306  if (sosp->name)
1307  MemFree(sosp->name);
1308  for (ssp = sosp->syn; ssp; ssp = tssp) {
1309  tssp = ssp->next;
1310  if (ssp->synname)
1311  MemFree(ssp->synname);
1312  delete ssp;
1313  }
1314  delete sosp;
1315 }
1316 
1317 /**********************************************************/
1319 {
     /*
      * NOTE(review): the signature line is absent from this listing; the
      * body reads a "dbp" argument and returns a ViralHost list.
      *
      * Finds the first OS block in the chain starting at dbp, then the
      * first OH sub-block below it, and turns every "OH NCBI_TaxID=" line
      * of that sub-block into a ViralHost node (taxid + name).
      * Returns nullptr when there is no OS block, no OH sub-block, or no
      * usable OH line (the last case also posts a warning).
      *
      * NOTE(review): dbp->mOffset[dbp->len - 1] assumes len > 0, and the
      * "next"/"name" members of new ViralHost nodes are not assigned on
      * every path - presumably default-initialised; confirm.
      */
1320  ViralHostPtr vhp;
1321  ViralHostPtr tvhp;
1322  char* line;
1323  char* p;
1324  char* q;
1325  char* r;
1326  Char ch;
1327
1328  for (; dbp; dbp = dbp->mpNext)
1329  if (dbp->mType == ParFlatSP_OS)
1330  break;
1331  if (! dbp)
1332  return nullptr;
1333
1334  for (dbp = static_cast<DataBlk*>(dbp->mpData); dbp; dbp = dbp->mpNext)
1335  if (dbp->mType == ParFlatSP_OH)
1336  break;
1337  if (! dbp)
1338  return nullptr;
1339
     /* "vhp" is a dummy list head; it is deleted before returning */
1340  vhp = new ViralHost;
1341  tvhp = vhp;
1342
     /* work on a copy of the block without its last character, prefixed
        with '\n' so that the first line matches the "\nOH ..." searches */
1343  line = MemNew(dbp->len + 2);
1344  ch = dbp->mOffset[dbp->len - 1];
1345  dbp->mOffset[dbp->len - 1] = '\0';
1346  line[0] = '\n';
1347  line[1] = '\0';
1348  StringCat(line, dbp->mOffset);
1349  dbp->mOffset[dbp->len - 1] = ch;
1350
     /* unexpected start: report the first line, cut to at most 20
        characters, then restore the two characters changed for printing */
1351  if (! StringEquNI(line, "\nOH NCBI_TaxID=", 17)) {
1352  ch = '\0';
1353  p = StringChr(line + 1, '\n');
1354  if (p)
1355  *p = '\0';
1356  if (StringLen(line + 1) > 20) {
1357  ch = line[21];
1358  line[21] = '\0';
1359  }
1360  ErrPostEx(SEV_ERROR, ERR_SOURCE_UnknownOHType, "Unknown beginning of OH block: \"%s[...]\".", line + 1);
1361  if (ch != '\0')
1362  line[21] = ch;
1363  if (p)
1364  *p = '\n';
1365  }
1366
1367  for (p = line;;) {
1368  p = StringIStr(p, "\nOH NCBI_TaxID=");
1369  if (! p)
1370  break;
     /* q = start of the id; r = end of this line; p = first ';' */
1371  for (p += 17, q = p; *q == ' ';)
1372  q++;
1373  r = StringChr(q, '\n');
1374  p = StringChr(q, ';');
     /* a usable line has its ';' before the end of the line */
1375  if ((! r || r > p) && p) {
1376  tvhp->next = new ViralHost;
1377  tvhp = tvhp->next;
     /* back up over ';' and blanks so that p ends right after the id */
1378  for (p--; *p == ';' || *p == ' ';)
1379  p--;
1380  p++;
     /* the id is valid only if the digits run exactly up to p */
1381  for (r = q; *r >= '0' && *r <= '9';)
1382  r++;
1383  *p = '\0';
1384  if (r != p) {
1385  ErrPostEx(SEV_ERROR, ERR_SOURCE_InvalidNcbiTaxID, "Invalid NCBI TaxID in OH line : \"%s\".", q);
1386  tvhp->taxid = ZERO_TAX_ID;
1387  } else
1388  tvhp->taxid = TAX_ID_FROM(int, atoi(q));
     /* host name: the text after the separator up to the end of the line,
        without trailing blanks and dots */
1389  for (p++; *p == ' ' || *p == ';';)
1390  p++;
1391  r = StringChr(p, '\n');
1392  if (! r)
1393  r = p + StringLen(p);
1394  else
1395  r--;
1396  while ((*r == ' ' || *r == '.' || *r == '\0') && r > p)
1397  r--;
1398  if (*r != '\0' && *r != '.' && *r != ' ')
1399  r++;
1400  ch = *r;
1401  *r = '\0';
1402  tvhp->name = StringSave(p);
1403  ShrinkSpaces(tvhp->name);
1404  *r = ch;
1405  p = r;
1406  } else {
     /* no ';' on this line: report the line content and move on */
1407  if (r)
1408  *r = '\0';
1409  ErrPostEx(SEV_ERROR, ERR_SOURCE_IncorrectOHLine, "Incorrect OH line content skipped: \"%s\".", q);
1410  if (r)
1411  *r = '\n';
1412  p = q;
1413  }
1414  }
1415  MemFree(line);
1416
1417  tvhp = vhp->next;
1418  delete vhp;
1419
1420  if (! tvhp)
1421  ErrPostEx(SEV_WARNING, ERR_SOURCE_NoNcbiTaxIDLookup, "No legal NCBI TaxIDs found in OH line.");
1422
1423  return (tvhp);
1424 }
1425 
1426 /**********************************************************/
1428 {
     /*
      * NOTE(review): the signature line is absent from this listing; the
      * body reads a "dbp" argument and returns a TTaxId.
      *
      * Reads the tax id from the first OX sub-block of the first OS block
      * in the chain starting at dbp (both loops break after their first
      * hit). Returns ZERO_TAX_ID when no valid id is found; when an OX
      * sub-block existed but gave no positive id, a warning is posted.
      *
      * NOTE(review): subdbp->mOffset[subdbp->len - 1] assumes len > 0.
      */
1429  DataBlkPtr subdbp;
1430  char* line;
1431  char* p;
1432  char* q;
1433  bool got;
1434  Char ch;
1435  TTaxId taxid;
1436
1437  for (got = false, taxid = ZERO_TAX_ID; dbp; dbp = dbp->mpNext) {
1438  if (dbp->mType != ParFlatSP_OS)
1439  continue;
1440
1441  subdbp = static_cast<DataBlk*>(dbp->mpData);
1442  for (; subdbp; subdbp = subdbp->mpNext) {
1443  if (subdbp->mType != ParFlatSP_OX)
1444  continue;
1445  got = true;
     /* copy the block without its last character, then keep line one */
1446  ch = subdbp->mOffset[subdbp->len - 1];
1447  subdbp->mOffset[subdbp->len - 1] = '\0';
1448  line = StringSave(subdbp->mOffset);
1449  subdbp->mOffset[subdbp->len - 1] = ch;
1450
1451  p = StringChr(line, '\n');
1452  if (p)
1453  *p = '\0';
1454  if (! StringEquNI(line, "OX NCBI_TaxID=", 16)) {
1455  if (StringLen(line) > 20)
1456  line[20] = '\0';
1457  ErrPostEx(SEV_ERROR, ERR_SOURCE_UnknownOXType, "Unknown beginning of OX line: \"%s\".", line);
1458  MemFree(line);
1459  break;
1460  }
     /* cut at the first ';'; anything but blanks after it is reported */
1461  p = StringChr(line + 16, ';');
1462  if (p) {
1463  *p++ = '\0';
1464  for (q = p; *q == ' ';)
1465  q++;
1466  if (*q != '\0') {
1467  ErrPostEx(SEV_ERROR, ERR_FORMAT_UnexpectedData, "Encountered unexpected data while parsing OX line: \"%s\" : Ignored.", p);
1468  }
1469  }
1470  for (p = line + 16; *p == ' ';)
1471  p++;
1472  if (*p == '\0') {
1473  MemFree(line);
1474  break;
1475  }
     /* accept the value only if the digits are followed by ' ' or the end */
1476  for (q = p; *q >= '0' && *q <= '9';)
1477  q++;
1478  if (*q == ' ' || *q == '\0')
1479  taxid = TAX_ID_FROM(int, atoi(p));
1480  if (taxid <= ZERO_TAX_ID || (*q != ' ' && *q != '\0')) {
1481  ErrPostEx(SEV_ERROR, ERR_SOURCE_InvalidNcbiTaxID, "Invalid NCBI TaxID on OX line : \"%s\" : Ignored.", p);
1482  }
1483  MemFree(line);
1484  break;
1485  }
1486  break;
1487  }
1488
1489  if (got && taxid <= ZERO_TAX_ID)
1490  ErrPostEx(SEV_WARNING, ERR_SOURCE_NoNcbiTaxIDLookup, "No legal NCBI TaxID found on OX line : will use organism names for lookup instead.");
1491
1492  return (taxid);
1493 }
1494 
1495 /**********************************************************/
1497 {
1498  SetOfSpeciesPtr sosp;
1499  DataBlkPtr dbp;
1500  char* line_OS;
1501  char* line_OC;
1502 
1503  CRef<COrg_ref> org_ref;
1504 
1505  line_OS = nullptr;
1506  line_OC = nullptr;
1507 
1508  for (dbp = entry; dbp; dbp = dbp->mpNext) {
1509  if (dbp->mType != ParFlatSP_OS)
1510  continue;
1511  line_OS = GetLineOSorOC(dbp, "OS ");
1512  for (dbp = static_cast<DataBlk*>(dbp->mpData); dbp; dbp = dbp->mpNext) {
1513  if (dbp->mType != ParFlatSP_OC)
1514  continue;
1515  line_OC = GetLineOSorOC(dbp, "OC ");
1516  break;
1517  }
1518  break;
1519  }
1520 
1521  if (line_OS && line_OS[0] != '\0') {
1522  sosp = GetSetOfSpecies(line_OS);
1523  if (sosp && sosp->name && sosp->name[0] != '\0') {
1524  org_ref = fill_orgref(sosp);
1525  }
1526 
1527  SetOfSpeciesFree(sosp);
1528  MemFree(line_OS);
1529  }
1530 
1531  if (org_ref.NotEmpty() && line_OC && line_OC[0] != '\0') {
1532  org_ref->SetOrgname().SetLineage(line_OC);
1533  MemFree(line_OC);
1534  }
1535 
1536  return org_ref;
1537 }
1538 
1539 /**********************************************************/
/*
 * Collects plasmid names from the OG sub-block of the OS blocks of "entry"
 * and appends them to "plasms". Nothing is collected unless "gmod" equals
 * Seq_descr_GIBB_mod_plasmid.
 *
 * NOTE(review): no assignment to "gmod" other than its initialiser (-1) is
 * visible in this listing (listing line 1562 is absent); presumably it is
 * set from the OG sub-block there - confirm against the repository.
 */
1540 static void get_plasmid(DataBlkPtr entry, CSP_block::TPlasnm& plasms)
1541 {
1542  DataBlkPtr dbp;
1543  DataBlkPtr subdbp;
1544  char* offset = nullptr;
1545  char* eptr = nullptr;
1546  char* str;
1547  char* ptr;
1548  Int4 gmod = -1;
1549
1550  dbp = TrackNodeType(*entry, ParFlatSP_OS);
1551  for (; dbp; dbp = dbp->mpNext) {
1552  if (dbp->mType != ParFlatSP_OS)
1553  continue;
1554
1555  subdbp = static_cast<DataBlk*>(dbp->mpData);
1556  for (; subdbp; subdbp = subdbp->mpNext) {
1557  if (subdbp->mType != ParFlatSP_OG)
1558  continue;
1559
     /* no break here: the values of the last OG sub-block seen are kept */
1560  offset = subdbp->mOffset + ParFlat_COL_DATA_SP;
1561  eptr = offset + subdbp->len;
1563  }
1564  }
1565  if (gmod != Seq_descr_GIBB_mod_plasmid)
1566  return;
1567
     /* every "PLASMID" (any case) up to eptr is followed by one name */
1568  while ((str = StringIStr(offset, "PLASMID"))) {
1569  if (str > eptr)
1570  break;
1571
1572  str += StringLen("PLASMID");
1573  while (*str == ' ')
1574  str++;
1575
     /* NOTE(review): the scan stops only at '\n' or ' ', not at '\0'. */
1576  for (ptr = str; *ptr != '\n' && *ptr != ' ';)
1577  ptr++;
     /* step back one character, so the last character of the word
        (presumably trailing punctuation) is left out of the name */
1578  ptr--;
1579  if (ptr > str) {
1580  plasms.push_back(string(str, ptr));
1581  } else
1582  ErrPostEx(SEV_ERROR, ERR_SOURCE_MissingPlasmidName, "Plasmid name is missing from OG line of SwissProt record.");
1583  offset = ptr;
1584  }
1585 }
1586 
/**********************************************************
 *
 *   static char* GetDRToken(ptr):
 *
 *      Cuts the next token off the text at *ptr, in place.
 *   A token ends at ';' or '.' when that character is
 *   followed by a blank, a newline or the end of the text.
 *   The separator is overwritten with '\0' and *ptr is moved
 *   past it and past any following ' ', ';' and '.'.
 *
 *      Returns the start of the token, or nullptr when:
 *      - ptr or *ptr is null, or *ptr is an empty string;
 *      - no separator occurs before the end of the line
 *        (*ptr is left untouched in this case);
 *      - the token is empty (*ptr is still advanced).
 *
 *      Scanning never crosses a newline, so repeated calls
 *   at the end of a line keep returning nullptr.
 *
 **********************************************************/
static char* GetDRToken(char** ptr)
{
    if (! ptr)
        return nullptr;

    char* const start = *ptr;
    if (! start || *start == '\0')
        return nullptr;

    char* cur = start;
    for (; *cur != '\0' && *cur != '\n'; ++cur) {
        /* a separator directly before the end of the text counts too, so
           the last token of a buffer without a final newline is kept */
        if ((*cur == ';' || *cur == '.') &&
            (cur[1] == ' ' || cur[1] == '\n' || cur[1] == '\0'))
            break;
    }

    if (*cur == '\0' || *cur == '\n')
        return nullptr;

    *cur++ = '\0';

    while (*cur == ' ' || *cur == ';' || *cur == '.')
        ++cur;
    *ptr = cur;

    return (*start == '\0') ? nullptr : start;
}
1625 
1626 /**********************************************************/
1627 static CRef<CSeq_id> AddPIDToSeqId(char* str, char* acc)
1628 {
1629  long long lID;
1630  char* end = nullptr;
1631 
1632  CRef<CSeq_id> sid;
1633 
1634  if (! str || *str == '\0')
1635  return sid;
1636 
1637  if (str[0] == '-') {
1638  ErrPostEx(SEV_WARNING, ERR_SPROT_DRLine, "Not annotated CDS [ACC:%s, PID:%s]", acc, str);
1639  return sid;
1640  }
1641  errno = 0; /* clear errors, the error flag from stdlib */
1642  lID = strtoll(str + 1, &end, 10);
1643  if ((lID == 0 && str + 1 == end) || (lID == LLONG_MAX && errno == ERANGE)) {
1644  /* Bad or too large number
1645  */
1646  ErrPostEx(SEV_WARNING, ERR_SPROT_DRLine, "Invalid PID value [ACC:%s, PID:%s]", acc, str);
1647  return sid;
1648  }
1649 
1650  if (*str == 'G') {
1651  sid.Reset(new CSeq_id);
1652  sid->SetGi(GI_FROM(long long, lID));
1653  } else if (*str == 'E' || *str == 'D') {
1654  CRef<CDbtag> tag(new CDbtag);
1655  tag->SetDb("PID");
1656  tag->SetTag().SetStr(str);
1657 
1658  sid.Reset(new CSeq_id);
1659  sid->SetGeneral(*tag);
1660  } else {
1661  ErrPostEx(SEV_WARNING, ERR_SPROT_DRLine, "Unrecognized PID data base type [ACC:%s, PID:%s]", acc, str);
1662  }
1663  return sid;
1664 }
1665 
1666 /**********************************************************/
/*
 * Adds "str" to the list at *head unless an equal string is already there.
 * Returns false when str is null or already listed; returns true when str
 * is exactly "-" (nothing is added) or after the link step at the end.
 * For a string containing '.', a warning is posted for every listed entry
 * whose part before its own '.' equals the part of str before its '.'.
 *
 * NOTE(review): in this listing "vnp" is always null when ValNodeLink() is
 * reached (listing line 1702 is absent); presumably the new node holding
 * str is created there - confirm against the repository.
 */
1667 static bool AddToList(ValNodePtr* head, char* str)
1668 {
1669  ValNodePtr vnp;
1670  char* data;
1671  char* dot;
1672  char* d;
1673
1674  if (! str)
1675  return false;
1676
1677  if (str[0] == '-' && str[1] == '\0')
1678  return true;
1679
1680  dot = StringChr(str, '.');
1681  for (vnp = *head; vnp; vnp = vnp->next)
1682  if (StringEqu(vnp->data, str))
1683  break;
1684  if (vnp)
1685  return false;
1686
1687  if (dot) {
     /* compare only the parts before the dots; both strings are cut
        temporarily and restored right after the comparison */
1688  *dot = '\0';
1689  for (vnp = *head; vnp; vnp = vnp->next) {
1690  data = vnp->data;
1691  d = StringChr(data, '.');
1692  if (! d)
1693  continue;
1694  *d = '\0';
1695  if (StringEqu(data, str)) {
1696  ErrPostEx(SEV_WARNING, ERR_SPROT_DRLine, "Same protein accessions with different versions found in DR line [PID1:%s.%s; PID2:%s.%s].", data, d + 1, str, dot + 1);
1697  }
1698  *d = '.';
1699  }
1700  *dot = '.';
1701  }
1703  ValNodeLink(head, vnp);
1704
1705  return true;
1706 }
1707 
1708 /**********************************************************/
1710 {
1711  for (CSP_block::TSeqref::iterator cur_ref = refs.begin(); cur_ref != refs.end(); ++cur_ref) {
1712  if ((*cur_ref)->Which() != CSeq_id::e_Pdb || (*cur_ref)->GetPdb().IsSetRel())
1713  continue;
1714 
1715  bool got = false;
1716 
1717  const CPDB_seq_id& cur_id = (*cur_ref)->GetPdb();
1718  CSP_block::TSeqref::iterator next_ref = cur_ref;
1719 
1720  for (++next_ref; next_ref != refs.end();) {
1721  if ((*next_ref)->Which() != CSeq_id::e_Pdb ||
1722  (*next_ref)->GetPdb().IsSetRel())
1723  continue;
1724 
1725  const CPDB_seq_id& next_id = (*next_ref)->GetPdb();
1726 
1727  if (cur_id.GetMol().Get() != next_id.GetMol().Get()) {
1728  ++next_ref;
1729  continue;
1730  }
1731 
1732  if (next_id.GetChain() != 32) {
1733  if (! got && cur_id.GetChain() == 32) {
1734  got = true;
1735  /* Commented out until the proper handling of PDB chain contents
1736  ErrPostEx(SEV_WARNING, ERR_FORMAT_DuplicateCrossRef,
1737  "Duplicate PDB cross reference removed, mol = \"%s\", chain = \"%d\".",
1738  psip1->mol, (int) psip1->chain);
1739 */
1740  }
1741  if (cur_id.GetChain() != next_id.GetChain()) {
1742  ++next_ref;
1743  continue;
1744  }
1745  }
1746 
1747  next_ref = refs.erase(next_ref);
1748  /* Commented out until the proper handling of PDB chain contents
1749  ErrPostEx(SEV_WARNING, ERR_FORMAT_DuplicateCrossRef,
1750  "Duplicate PDB cross reference removed, mol = \"%s\", chain = \"%d\".",
1751  psip2->mol, (int) psip2->chain);
1752 */
1753  }
1754  }
1755 }
1756 
1757 /**********************************************************/
/*
 * Walks a list made of node pairs (the loops step by ->next->next) and
 * posts a warning whenever the "data" string of the first node of a pair
 * equals that of a later pair while the "choice" values of the two second
 * nodes differ. In GetDRlineDataSP() the pairs are built from EMBL DR
 * lines as (protein accession, nucleotide accession).
 *
 * NOTE(review): the list is assumed to hold complete pairs - the guard and
 * both loops dereference ->next without a null check. Listing line 1785 is
 * absent from this listing.
 */
1758 static void fta_check_embl_drxref_dups(ValNodePtr embl_acc_list)
1759 {
1760  ValNodePtr vnp;
1761  ValNodePtr vnpn;
1762  const char* n;
1763  char* p;
1764  char* q;
1765
1766  if (! embl_acc_list || ! embl_acc_list->next->next)
1767  return;
1768
1769  for (vnp = embl_acc_list; vnp; vnp = vnp->next->next) {
     /* q = position of '.', kept only when nothing but digits follows it */
1770  p = vnp->data;
1771  q = StringChr(p, '.');
1772  if (q) {
1773  for (p = q + 1; *p >= '0' && *p <= '9';)
1774  p++;
1775  if (*p != '\0')
1776  q = nullptr;
1777  p = vnp->data;
1778  }
1779  n = vnp->next->data;
1780  for (vnpn = vnp->next->next; vnpn; vnpn = vnpn->next->next) {
1781  if (vnp->next->choice != vnpn->next->choice &&
1782  StringEqu(p, vnpn->data)) {
     /* the '.' is blanked only while the message is posted */
1783  if (q)
1784  *q = '\0';
1786  ErrPostEx(SEV_WARNING, ERR_SPROT_DRLineCrossDBProtein, "Protein accession \"%s\" associated with \"%s\" and \"%s\".", vnpn->data, n, vnpn->next->data);
1787  if (q)
1788  *q = '.';
1789  }
1790  }
1791  if (q)
1792  *q = '.';
1793  }
1794 }
1795 
1796 /**********************************************************
1797  *
1798  * static void GetDRlineDataSP(entry, spbp, drop, source):
1799  *
1800  * Database identifiers on the DR lines which point to
1801  * entries in GenBank, EMBL, DDBJ, PIR, or PDB are output
1802  * as Seq-id's of the appropriate type:
1803  * - For GenBank and DDBJ, only the primary identifier
1804  * (accession number) is captured; and their database
1805  * references are actually labelled as "EMBL". Their
1806  * true nature is determined by the accession number
1807  * ownership rules described by accession prefix.
1808  * - For EMBL, both the primary and secondary
1809  * identifiers are captured.
1810  * - For PIR, we only capture the secondary
1811  * identifier (name).
1812  * - For PDB, we capture both the primary identifier
1813  * (molecule name) and the secondary identifier
1814  * (release date).
1815  * example:
1816  * DR EMBL; J05536; RNPCBB.
1817  * DR EMBL; X51318; RN10SP.
1818  * DR PIR; A36581; A36581.
1819  * DR PDB; 1CCD; PRELIMINARY.
1820  * Release 33.0 Cross-references to EMBL/GenBank/DDBJ
1821  *
1822  * DR EMBL; X51318; G63880; -.
1823  *
1824  * seqref {
1825  * genbank {
1826  * accession "J05536" } ,
1827  * embl {
1828  * name "RN10SP" ,
1829  * accession "X51318" } ,
1830  * pir {
1831  * name "A36581" } ,
1832  * pdb {
1833  * mol "1CCD" ,
1834  * rel
1835  * str "PRELIMINARY" }
1836  *
1837  * Release 33.0
1838  *
1839  * seqref {
1840  * gi 63880 ,
1841  * } ,
1842  *
1843  * All other databank references are output using Dbtag.
1844  * In these cases, secondary identifiers, whether
1845  * entry-names, release numbers, or date-stamps, are not
1846  * captured since Dbtag has no provision for them.
1847  * example:
1848  * DR PROSITE; PS00403; UTEROGLOBIN_1.
1849  * DR PROSITE; PS00404; UTEROGLOBIN_2.
1850  * dbref {
1851  * {
1852  * db "PROSITE" ,
1853  * tag
1854  * str "PS00403" } ,
1855  * {
1856  * db "PROSITE" ,
1857  * tag
1858  * str "PS00404" } } ,
1859  *
1860  * Also need to delete duplicated DR line.
1861  *
1862  **********************************************************/
/*
 * Review notes (comments only, code unchanged):
 *  - Resets the Seqref and Dbref lists of "spb", then fills them from the
 *    DR block of "entry"; returns at once when there is no DR block.
 *  - Parsing works on "str", a private copy of the block prefixed with
 *    '\n'; GetDRToken() cuts tokens inside that copy.
 *  - *drop is read at the top of every iteration (a true value ends the
 *    loop) and is set to true here when old- and new-style PDB
 *    cross-references are mixed.
 *  - NOTE(review): listing lines 1955 (declaration of "id" in the PIR
 *    branch) and 2127 (the statement controlled by the final "if") are
 *    absent from this listing - confirm against the repository.
 */
1863 static void GetDRlineDataSP(DataBlkPtr entry, CSP_block& spb, bool* drop, Parser::ESource source)
1864 {
1865  ValNodePtr embl_vnp;
1866  ValNodePtr acc_list = nullptr;
1867  ValNodePtr pid_list = nullptr;
1868  ValNodePtr ens_tran_list = nullptr;
1869  ValNodePtr ens_prot_list = nullptr;
1870  ValNodePtr ens_gene_list = nullptr;
1871  ValNodePtr embl_acc_list = nullptr;
1872  const char** b;
1873  char* offset;
1874  const char* token1;
1875  char* token2;
1876  char* token3;
1877  char* token4;
1878  char* token5;
1879  char* str;
1880  char* ptr;
1881  char* p;
1882  char* q;
1883  bool pdbold;
1884  bool pdbnew;
1885  bool check_embl_prot;
1886  size_t len = 0;
1887  Char ch;
1888
1889  CSeq_id::E_Choice ptype;
1890  CSeq_id::E_Choice ntype;
1891
1892  spb.ResetSeqref();
1893  spb.ResetDbref();
1894
1895  offset = SrchNodeType(entry, ParFlatSP_DR, &len);
1896  if (! offset)
1897  return;
1898
     /* copy the block, prefixed with '\n', so every line starts after '\n' */
1899  ch = offset[len];
1900  offset[len] = '\0';
1901  str = MemNew(len + 2);
1902  StringCpy(str, "\n");
1903  StringCat(str, offset);
1904  offset[len] = ch;
1905  pdbold = false;
1906  pdbnew = false;
     /* dummy head for the (protein, nucleotide) accession pair list */
1907  embl_acc_list = ValNodeNew(nullptr);
1908  embl_vnp = embl_acc_list;
1909  check_embl_prot = false;
1910  for (ptr = str;;) {
1911  if (*drop)
1912  break;
1913  ptr = StringChr(ptr, '\n');
1914  if (! ptr)
1915  break;
1916  ptr++;
1917  if (! StringEquN(ptr, "DR ", 5))
1918  continue;
1919  ptr += ParFlat_COL_DATA_SP;
     /* token1 = database name, token2..token5 = the fields that follow;
        fields that are not present come back as null */
1920  token1 = GetDRToken(&ptr);
1921  token2 = GetDRToken(&ptr);
1922  token3 = GetDRToken(&ptr);
1923  token4 = GetDRToken(&ptr);
1924  token5 = GetDRToken(&ptr);
1925  if (! token1 || ! token2 || ! token3 ||
1926  (StringEqu(token2, "-") && StringEqu(token3, "-"))) {
1927  ErrPostEx(SEV_ERROR, ERR_SPROT_DRLine, "Badly formatted DR line. Skipped.");
1928  continue;
1929  }
1930
1931  if (NStr::CompareNocase(token1, "MD5") == 0)
1932  continue;
1933
     /* a database name found in neither table, or found only in the
        obsolete table, gets a warning; the line is processed regardless */
1934  for (b = valid_dbs; *b; b++)
1935  if (NStr::CompareNocase(*b, token1) == 0)
1936  break;
1937  if (! *b) {
1938  for (b = obsolete_dbs; *b; b++)
1939  if (NStr::CompareNocase(*b, token1) == 0)
1940  break;
1941  if (! *b)
1942  ErrPostEx(SEV_WARNING, ERR_DRXREF_UnknownDBname, "Encountered a new/unknown database name in DR line: \"%s\".", token1);
1943  else
1944  ErrPostEx(SEV_WARNING, ERR_SPROT_DRLine, "Obsolete database name found in DR line: \"%s\".", token1);
1945  }
1946
1947  if (NStr::CompareNocase(token1, "PDB") == 0) {
     /* a line without a fourth field counts as old style */
1948  if (! token4)
1949  pdbold = true;
1950  else
1951  pdbnew = true;
1952
1953  MakePDBSeqId(spb.SetSeqref(), token2, token3, token5 ? token5 : token4, drop, source);
1954  } else if (NStr::CompareNocase(token1, "PIR") == 0) {
1956  if (id.NotEmpty())
1957  spb.SetSeqref().push_back(id);
1958  } else if (NStr::CompareNocase(token1, "EMBL") == 0) {
     /* nucleotide accession: the owner is looked up without the version */
1959  p = StringChr(token2, '.');
1960  if (p)
1961  *p = '\0';
1962  ntype = GetNucAccOwner(token2);
1963  if (ntype == CSeq_id::e_not_set) {
1964  if (p)
1965  *p = '.';
1966  ErrPostEx(SEV_ERROR, ERR_SPROT_DRLine, "Incorrect NA accession is used in DR line: \"%s\". Skipped...", token2);
1967  } else if (AddToList(&acc_list, token2)) {
1968  CRef<CSeq_id> id(MakeAccSeqId(token2, ntype, p ? true : false,
1969  p ? (Int2) atoi(p + 1) : 0));
1970  if (id.NotEmpty())
1971  spb.SetSeqref().push_back(id);
1972  }
1973  if (p)
1974  *p = '\0';
1975
     /* protein id: two leading capitals mean accession.version, whose
        version must be all digits; otherwise p stays null and the id is
        handled by AddPIDToSeqId() below */
1976  ptype = CSeq_id::e_not_set;
1977  if (token3[0] >= 'A' && token3[0] <= 'Z' &&
1978  token3[1] >= 'A' && token3[1] <= 'Z') {
1979  p = StringChr(token3, '.');
1980  if (p) {
1981  *p = '\0';
1982  ptype = GetProtAccOwner(token3);
1983  *p = '.';
1984  for (q = p + 1; *q >= '0' && *q <= '9';)
1985  q++;
1986  if (q == p + 1 || *q != '\0')
1987  p = nullptr;
1988  }
1989  if (! p || ptype == CSeq_id::e_not_set) {
1990  ErrPostEx(SEV_ERROR, ERR_SPROT_DRLine, "Incorrect protein accession is used in DR line [ACC:%s; PID:%s]. Skipped...", token2, token3);
1991  continue;
1992  }
1993  } else
1994  p = nullptr;
1995
     /* remember the (protein, nucleotide) pair for the cross-check below */
1996  if (ntype > CSeq_id::e_not_set) {
1997  embl_vnp->next = ConstructValNode(ptype, token3);
1998  embl_vnp = embl_vnp->next;
1999  embl_vnp->next = ConstructValNode(ntype, token2);
2000  embl_vnp = embl_vnp->next;
2001  }
2002
     /* a protein id seen before triggers the cross-database check */
2003  if (! AddToList(&pid_list, token3)) {
2004  check_embl_prot = true;
2005  continue;
2006  }
2007
2008  CRef<CSeq_id> id;
2009  if (! p)
2010  id = AddPIDToSeqId(token3, token2);
2011  else {
2012  *p++ = '\0';
2013  id = MakeAccSeqId(token3, ptype, true, (Int2)atoi(p));
2014  }
2015
2016  if (id.NotEmpty())
2017  spb.SetSeqref().push_back(id);
2018  } else if (NStr::CompareNocase(token1, "ENSEMBL") == 0 ||
2019  NStr::CompareNocase(token1, "ENSEMBLBACTERIA") == 0 ||
2020  NStr::CompareNocase(token1, "ENSEMBLFUNGI") == 0 ||
2021  NStr::CompareNocase(token1, "ENSEMBLMETAZOA") == 0 ||
2022  NStr::CompareNocase(token1, "ENSEMBLPLANTS") == 0 ||
2023  NStr::CompareNocase(token1, "ENSEMBLPROTISTS") == 0 ||
2024  NStr::CompareNocase(token1, "WORMBASE") == 0) {
     /* fields 2, 3 and 4 each become a Dbtag the first time they occur;
        only a repeated field 3 is reported */
2025  if (AddToList(&ens_tran_list, token2)) {
2026  CRef<CDbtag> tag = MakeStrDbtag(token1, token2);
2027  if (tag.NotEmpty())
2028  spb.SetDbref().push_back(tag);
2029  }
2030
2031  if (! AddToList(&ens_prot_list, token3)) {
2032  ErrPostEx(SEV_WARNING, ERR_SPROT_DRLine, "Duplicated protein id \"%s\" in \"%s\" DR line.", token3, token1);
2033  } else {
2034  CRef<CDbtag> tag = MakeStrDbtag(token1, token3);
2035  if (tag.NotEmpty())
2036  spb.SetDbref().push_back(tag);
2037  }
2038
2039  if (token4 && AddToList(&ens_gene_list, token4)) {
2040  CRef<CDbtag> tag = MakeStrDbtag(token1, token4);
2041  if (tag.NotEmpty())
2042  spb.SetDbref().push_back(tag);
2043  }
2044  } else if (NStr::CompareNocase(token1, "REFSEQ") == 0) {
     /* accepted only as accession.version whose owner is CSeq_id::e_Other */
2045  ptype = CSeq_id::e_not_set;
2046  if (token2[0] >= 'A' && token2[0] <= 'Z' &&
2047  token2[1] >= 'A' && token2[1] <= 'Z') {
2048  p = StringChr(token2, '.');
2049  if (p) {
2050  *p = '\0';
2051  ptype = GetProtAccOwner(token2);
2052  *p = '.';
2053  for (q = p + 1; *q >= '0' && *q <= '9';)
2054  q++;
2055  if (q == p + 1 || *q != '\0')
2056  p = nullptr;
2057  }
2058  if (ptype != CSeq_id::e_Other)
2059  p = nullptr;
2060  } else
2061  p = nullptr;
2062
2063  if (! p) {
2064  ErrPostEx(SEV_ERROR, ERR_SPROT_DRLine, "Incorrect protein accession.version is used in RefSeq DR line: \"%s\". Skipped...", token2);
2065  continue;
2066  }
2067
2068  if (! AddToList(&pid_list, token2))
2069  continue;
2070
2071  *p++ = '\0';
2072  CRef<CSeq_id> id(MakeAccSeqId(token2, ptype, true, (Int2)atoi(p)));
2073  if (id.NotEmpty())
2074  spb.SetSeqref().push_back(id);
2075  } else {
     /* every other database: a few names are replaced, a "PomBase:"
        prefix is dropped from the id, and a Dbtag is added unless an
        equal one is already in the list */
2076  if (NStr::CompareNocase(token1, "GK") == 0)
2077  token1 = "Reactome";
2078  else if (NStr::CompareNocase(token1, "GENEW") == 0)
2079  token1 = "HGNC";
2080  else if (NStr::CompareNocase(token1, "GeneDB_Spombe") == 0)
2081  token1 = "PomBase";
2082  else if (NStr::CompareNocase(token1, "PomBase") == 0 &&
2083  StringEquNI(token2, "PomBase:", 8))
2084  token2 += 8;
2085
2086  CRef<CDbtag> tag = MakeStrDbtag(token1, token2);
2087  if (tag.NotEmpty()) {
2088  bool not_found = true;
2089
2090  for (const auto& cur_tag : spb.SetDbref()) {
2091  if (tag->Match(*cur_tag)) {
2092  not_found = false;
2093  break;
2094  }
2095  }
2096  if (not_found)
2097  spb.SetDbref().push_back(tag);
2098  }
2099  }
2100  }
2101
     /* cross-check the EMBL pairs, then release all helper lists */
2102  if (embl_acc_list->next) {
2103  if (check_embl_prot)
2104  fta_check_embl_drxref_dups(embl_acc_list->next);
2105  ValNodeFreeData(embl_acc_list->next);
2106  }
2107  delete embl_acc_list;
2108
2109  if (acc_list)
2110  ValNodeFreeData(acc_list);
2111  if (pid_list)
2112  ValNodeFreeData(pid_list);
2113  if (ens_tran_list)
2114  ValNodeFreeData(ens_tran_list);
2115  if (ens_prot_list)
2116  ValNodeFreeData(ens_prot_list);
2117  if (ens_gene_list)
2118  ValNodeFreeData(ens_gene_list);
2119  MemFree(str);
2120
2121  if (pdbold && pdbnew) {
2122  ErrPostEx(SEV_REJECT, ERR_FORMAT_MixedPDBXrefs, "Both old and new types of PDB cross-references exist on this record. Only one style is allowed.");
2123  *drop = true;
2124  }
2125
2126  if (pdbnew && spb.SetSeqref().size() > 1)
2128 }
2129 
2130 /**********************************************************
2131  *
2132  * static bool GetSPDate(pp, entry, crdate, sequpd,
2133  * annotupd, ver_num):
2134  *
2135  * Contain three lines in order created, last sequence
2136  * update, last annotation update.
2137  *
2138  * 9-30-93
2139  *
2140  **********************************************************/
/* Reads the DT block of "entry" and extracts the create date, the last
 * sequence update date and the last annotation update date.
 *
 *   pp       - parser state; only pp->source is used here (it is handed to
 *              GetUpdateDate);
 *   entry    - data block searched for the ParFlatSP_DT node;
 *   crdate, sequpd, annotupd
 *            - output dates, filled in only when every check passes;
 *   ver_num  - optional output: sequence version parsed from a new-style
 *              "sequence version N." line; reset to 0 on entry when given.
 *
 * Returns true when there is no DT block at all or when all three dates
 * were parsed; returns false (after posting a SEV_REJECT message) otherwise.
 */
2141 static bool GetSPDate(ParserPtr pp, DataBlkPtr entry, CDate& crdate, CDate& sequpd, CDate& annotupd, short* ver_num)
2142 {
2143  ValNodePtr vnp;
2144  ValNodePtr tvnp;
2145  char* offset;
2146  char* p;
2147  char* q;
2148  bool new_style;
2149  bool ret;
2150  Char ch;
2151  Int4 first;
2152  Int4 second;
2153  Int4 third;
2154  size_t len;
2155 
2156  CRef<CDate_std> std_crdate,
2157  std_sequpd,
2158  std_annotupd;
2159 
2160  if (ver_num)
2161  *ver_num = 0;
2162 
2163  offset = SrchNodeType(entry, ParFlatSP_DT, &len);
2164  if (! offset)
2165  return true;
2166 
 /* Temporarily NUL-terminate the DT block, then split it into one list
  * node per line.  Each '\n' is overwritten only while the line is handed
  * to ValNodeNew and is restored right after.  The list starts with a
  * dummy head node that is discarded below.
  */
2167  ch = offset[len];
2168  offset[len] = '\0';
2169  vnp = ValNodeNew(nullptr);
2170  for (q = offset, tvnp = vnp;;) {
2171  p = StringChr(q, '\n');
2172  if (p == q)
2173  break;
2174  if (p)
2175  *p = '\0';
2176  tvnp = ValNodeNew(tvnp, q);
2177  if (! p)
2178  break;
2179  *p++ = '\n';
2180  q = p;
2181  if (*q == '\0')
2182  break;
2183  }
2184  offset[len] = ch;
2185  tvnp = vnp->next;
2186  vnp->next = nullptr;
2187  delete vnp;
2188  vnp = tvnp;
2189 
 /* first/second/third count the create, sequence-update and
  * annotation-update lines respectively.
  * NOTE(review): vnp is null here if the loop above collected no line
  * (e.g. the block starts with '\n'); vnp->data would then be a null
  * dereference - confirm whether that input can occur.
  */
2190  first = 0;
2191  second = 0;
2192  third = 0;
2193  if (! StringChr(vnp->data, '(')) {
 /* No '(' in the first line: lines are recognised by the phrases
  * "integrated into", "sequence version" and "entry version".
  */
2194  new_style = true;
2195  for (tvnp = vnp; tvnp; tvnp = tvnp->next) {
2196  offset = tvnp->data;
 /* NOTE(review): the listing skips source line 2197 here. */
2198  if (StringIStr(offset, "integrated into")) {
2199  first++;
2200  std_crdate = GetUpdateDate(offset, pp->source);
2201  } else if (StringIStr(offset, "entry version")) {
2202  third++;
2203  std_annotupd = GetUpdateDate(offset, pp->source);
2204  } else {
2205  p = StringIStr(offset, "sequence version");
2206  if (p) {
2207  second++;
2208  std_sequpd = GetUpdateDate(offset, pp->source);
2209  if (ver_num) {
 /* Skip the 16 characters of "sequence version" and any blanks, then
  * accept the number only if it is followed by '.' at end of line.
  * The '.' is replaced by NUL just for the atoi call.
  */
2210  for (p += 16; *p == ' ';)
2211  p++;
2212  for (q = p; *p >= '0' && *p <= '9';)
2213  p++;
2214  if (*p == '.' && p[1] == '\0') {
2215  *p = '\0';
2216  *ver_num = atoi(q);
2217  *p = '.';
2218  }
2219  }
2220  }
2221  }
2222  }
2223  } else {
 /* First line contains '(': lines are recognised by "Created",
  * "Last sequence update" and "Last annotation update".
  */
2224  new_style = false;
2225  for (tvnp = vnp; tvnp; tvnp = tvnp->next) {
2226  offset = tvnp->data;
 /* NOTE(review): the listing skips source line 2227 here. */
2228  if (StringIStr(offset, "Created")) {
2229  first++;
2230  std_crdate = GetUpdateDate(offset, pp->source);
2231  } else if (StringIStr(offset, "Last annotation update")) {
2232  third++;
2233  std_annotupd = GetUpdateDate(offset, pp->source);
2234  } else if (StringIStr(offset, "Last sequence update")) {
2235  second++;
2236  std_sequpd = GetUpdateDate(offset, pp->source);
2237  }
2238  }
2239  }
2240 
2241  ValNodeFreeData(vnp);
2242 
 /* Validation: only the first failing condition is reported.  Each line
  * kind must occur exactly once and must have produced a date; when
  * ver_num was requested it must have been parsed as a value >= 1.
  */
2243  ret = true;
2244  if (first == 0) {
2245  ErrPostEx(SEV_REJECT, ERR_FORMAT_Date, "Missing required \"%s\" DT line.", (new_style ? "integrated into" : "Created"));
2246  ret = false;
2247  } else if (first > 1) {
2248  ErrPostEx(SEV_REJECT, ERR_FORMAT_Date, "Multiple \"%s\" DT lines are present.", (new_style ? "integrated into" : "Created"));
2249  ret = false;
2250  } else if (second == 0) {
2251  ErrPostEx(SEV_REJECT, ERR_FORMAT_Date, "Missing required \"%s\" DT line.", (new_style ? "sequence version" : "Last sequence update"));
2252  ret = false;
2253  } else if (second > 1) {
2254  ErrPostEx(SEV_REJECT, ERR_FORMAT_Date, "Multiple \"%s\" DT lines are present.", (new_style ? "sequence version" : "Last sequence update"));
2255  ret = false;
2256  } else if (third == 0) {
2257  ErrPostEx(SEV_REJECT, ERR_FORMAT_Date, "Missing required \"%s\" DT line.", (new_style ? "entry version" : "Last annotation update"));
2258  ret = false;
2259  } else if (third > 1) {
2260  ErrPostEx(SEV_REJECT, ERR_FORMAT_Date, "Multiple \"%s\" DT lines are present.", (new_style ? "entry version" : "Last annotation update"));
2261  ret = false;
2262  } else if (std_crdate.Empty()) {
2263  ErrPostEx(SEV_REJECT, ERR_FORMAT_Date, "Missing or incorrect create date in \"%s\" DT line.", (new_style ? "integrated into" : "Created"));
2264  ret = false;
2265  } else if (std_sequpd.Empty()) {
2266  ErrPostEx(SEV_REJECT, ERR_FORMAT_Date, "Missing or incorrect update date in \"%s\" DT line.", (new_style ? "sequence version" : "Last sequence update"));
2267  ret = false;
2268  } else if (std_annotupd.Empty()) {
2269  ErrPostEx(SEV_REJECT, ERR_FORMAT_Date, "Missing or incorrect update date in \"%s\" DT line.", (new_style ? "entry version" : "Last annotation update"));
2270  ret = false;
2271  } else if (ver_num && *ver_num < 1) {
2272  ErrPostEx(SEV_REJECT, ERR_FORMAT_Date, "Invalidly formatted sequence version DT line is present.");
2273  ret = false;
2274  }
2275 
 /* The output dates are written only on success. */
2276  if (ret) {
2277  crdate.SetStd(*std_crdate);
2278  sequpd.SetStd(*std_sequpd);
2279  annotupd.SetStd(*std_annotupd);
2280  return true;
2281  }
2282 
2283  return false;
2284 }
2285 
2286 /**********************************************************
2287  *
2288  * static SPBlockPtr GetDescrSPBlock(pp, entry, bsp):
2289  *
2290  * 9-30-93
2291  *
2292  **********************************************************/
2293 static CRef<CSP_block>
2295 {
2296  IndexblkPtr ibp;
2297 
2298  CRef<CSP_block> spb(new CSP_block);
2299 
2300  char* bptr;
2301  bool reviewed;
2302  bool i;
2303  Int2 ver_num;
2304 
2305  /* first ID line, 2nd token
2306  */
2307  bptr = PointToNextToken(entry->mOffset + ParFlat_COL_DATA_SP);
2308  reviewed = StringEquNI(bptr, "reviewed", 8);
2309  if (reviewed || StringEquNI(bptr, "standard", 8)) {
2310  spb->SetClass(CSP_block::eClass_standard);
2311  } else if (StringEquNI(bptr, "preliminary", 11) ||
2312  StringEquNI(bptr, "unreviewed", 10)) {
2313  spb->SetClass(CSP_block::eClass_prelim);
2314  } else {
2315  spb->SetClass(CSP_block::eClass_not_set);
2316  ErrPostStr(SEV_WARNING, ERR_DATACLASS_UnKnownClass, "Not a standard/reviewed or preliminary/unreviewed class in SWISS-PROT");
2317  }
2318 
2319  GetSequenceOfKeywords(*entry, ParFlatSP_KW, ParFlat_COL_DATA_SP, spb->SetKeywords());
2320 
2321  ibp = pp->entrylist[pp->curindx];
2322  ibp->wgssec[0] = '\0';
2323 
2324  GetExtraAccession(ibp, pp->allow_uwsec, pp->source, spb->SetExtra_acc());
2325  if (spb->SetExtra_acc().empty())
2326  spb->ResetExtra_acc();
2327 
2328  /* DT data ==> create-date, seqence update, annotation update
2329  */
2330  ver_num = 0;
2331  if (reviewed && pp->sp_dt_seq_ver)
2332  i = GetSPDate(pp, entry, spb->SetCreated(), spb->SetSequpd(), spb->SetAnnotupd(), &ver_num);
2333  else
2334  i = GetSPDate(pp, entry, spb->SetCreated(), spb->SetSequpd(), spb->SetAnnotupd(), nullptr);
2335 
2336  get_plasmid(entry, spb->SetPlasnm());
2337  if (spb->SetPlasnm().empty())
2338  spb->ResetPlasnm();
2339 
2340  GetDRlineDataSP(entry, *spb, &ibp->drop, pp->source);
2341 
2342  if (! i)
2343  ibp->drop = true;
2344  else if (spb->GetClass() == CSP_block::eClass_standard ||
2345  spb->GetClass() == CSP_block::eClass_prelim) {
2346  for (auto& cur_id : bioseq.SetId()) {
2347  if (! cur_id->IsSwissprot())
2348  continue;
2349 
2350  CTextseq_id& id = cur_id->SetSwissprot();
2351  if (ver_num > 0)
2352  id.SetVersion(ver_num);
2353 
2354  if (spb->GetClass() == CSP_block::eClass_standard)
2355  id.SetRelease("reviewed");
2356  else
2357  id.SetRelease("reviewed");
2358 
2359  break;
2360  }
2361  }
2362 
2363  CRef<CSeqdesc> descr(new CSeqdesc);
2364  descr->SetSp(*spb);
2365  bioseq.SetDescr().Set().push_back(descr);
2366 
2367  return spb;
2368 }
2369 
2370 /**********************************************************
2371  *
2372  * static void ParseSpComment(descrs, line):
2373  *
2374  * 10-1-93
2375  *
2376  **********************************************************/
/* Turns one CC-line comment ("line", possibly spanning several physical
 * lines) into a single-line string and appends it to "descrs" as a comment
 * descriptor.  Nothing is appended when the resulting text is empty.
 */
2377 static void ParseSpComment(CSeq_descr::Tdata& descrs, char* line)
2378 {
2379  char* com;
2380  char* p;
2381  char* q;
2382  Int2 i;
2383 
2384  for (p = line; *p == ' ';)
2385  p++;
2386 
 /* Two extra bytes: presumably one for the optional leading '[' and one
  * for the terminating NUL.
  */
2387  com = MemNew(StringLen(p) + 2);
2388  q = com;
 /* NOTE(review): the listing skips source line 2389, which is where "i"
  * must be assigned; its meaning is not visible here.  A value >= 0 makes
  * the comment start with '[' and, below, turns the first ':' into ']'.
  */
2390  if (i >= 0)
2391  *q++ = '[';
2392 
 /* Copy the text, joining physical lines: a newline becomes one blank
  * unless the character before it is '-', and the leading blanks plus a
  * "CC " prefix of the continuation line are skipped.
  */
2393  while (*p != '\0') {
2394  if (*p != '\n') {
2395  *q++ = *p++;
2396  continue;
2397  }
2398 
2399  if (p > line && *(p - 1) != '-')
2400  *q++ = ' ';
2401  for (++p; *p == ' ';)
2402  p++;
2403  if (StringEquN(p, "CC ", 3))
2404  for (p += 3; *p == ' ';)
2405  p++;
2406  }
2407  if (q == com) {
2408  MemFree(com);
2409  return;
2410  }
 /* Trim trailing blanks and terminate the string. */
2411  for (--q; q > com && *q == ' ';)
2412  q--;
2413  if (*q != ' ')
2414  q++;
2415  *q = '\0';
 /* NOTE(review): the StringChr result is written through without a null
  * check; this relies on a ':' being present whenever i >= 0.
  */
2416  if (i >= 0) {
2417  p = StringChr(com, ':');
2418  *p = ']';
2419  }
2420 
2421  if (com[0] != 0) {
2422  CRef<CSeqdesc> descr(new CSeqdesc);
2423  descr->SetComment(com);
2424  descrs.push_back(descr);
2425  }
2426  MemFree(com);
2427 }
2428 
2429 /**********************************************************
2430  *
2431  * static void GetSPDescrComment(entry, descrs, acc, cla):
2432  *
2433  * CC line ==> comment, separate each by "-!-".
2434  *
2435  * 10-1-93
2436  *
2437  **********************************************************/
/* Converts the CC block of "entry" into comment descriptors appended to
 * "descrs".  Dash-delimited copyright sections are cut out of the block
 * first; the remaining text is split on "-!-" and each piece is passed to
 * ParseSpComment.
 *
 *   acc - accession used in the missing-copyright warning;
 *   cla - entry class; the warning is suppressed when it equals 2.
 */
2438 static void GetSPDescrComment(DataBlkPtr entry, CSeq_descr::Tdata& descrs, char* acc, Uint1 cla)
2439 {
2440  char* offset;
2441  char* bptr;
2442  char* eptr;
2443  char* tmp;
2444  char* p;
2445  char* q;
2446  Char ch;
2447  Int2 count;
2448  Int4 i;
2449 
2450  size_t len = 0;
2451  offset = SrchNodeType(entry, ParFlatSP_CC, &len);
2452  if (! offset)
2453  return;
2454 
 /* NUL-terminate the CC block for the duration of this function; the
  * saved character is restored before every return below.
  */
2455  eptr = offset + len;
2456  ch = *eptr;
2457  *eptr = '\0';
 /* Each iteration looks for a "----------" line, checks the text of the
  * following line (after the column prefix) against COPYRIGHT, finds the
  * closing "----------" line and removes everything from the start of the
  * opening line through the end of the closing line.  "count" is the
  * number of sections removed.
  */
2458  for (count = 0, p = offset;;) {
2459  p = StringStr(p, "----------");
2460  if (! p)
2461  break;
 /* q = start of the physical line containing the dashes. */
2462  for (q = p; q > offset && *q != '\n';)
2463  q--;
2464  if (*q == '\n')
2465  q++;
2466 
2467  p = StringChr(p, '\n');
2468  if (! p)
2469  break;
2470  for (i = 0; *p != '\0' && i < ParFlat_COL_DATA_SP + 1; i++)
2471  p++;
2472  if (*p == '\0')
2473  break;
 /* NOTE(review): the listing skips source line 2475, which holds the
  * second operand of this condition.
  */
2474  if (! StringEquNI(p, COPYRIGHT, StringLen(COPYRIGHT)) &&
2476  break;
2477  p = StringStr(p, "----------");
2478  if (! p)
2479  break;
2480  p = StringChr(p, '\n');
2481  if (! p)
2482  break;
2483  p++;
2484  len -= (p - q);
2485  fta_StringCpy(q, p);
2486  p = q;
2487  count++;
2488  }
2489 
2490  if (count == 0 && cla != 2) /* not PRELIMINARY or UNREVIEWED */
2491  ErrPostEx(SEV_WARNING, ERR_FORMAT_MissingCopyright, "The expected copyright notice for UniProt/Swiss-Prot entry %s was not found.", acc);
2492 
2493  if (len < 1) {
2494  *eptr = ch;
2495  return;
2496  }
2497 
 /* Start past the column prefix and the first 4 characters of the data
  * (presumably the leading "-!- " marker - TODO confirm).
  */
2498  bptr = offset + ParFlat_COL_DATA_SP + 4;
2499 
2500  for (; (tmp = StringStr(bptr, "-!-")); bptr = tmp + 4) {
2501  /* found a new comment
2502  */
 /* Back up to the newline that precedes the marker and cut the previous
  * comment there; the newline is restored after the call.
  */
2503  for (p = tmp; p > bptr && *p != '\n';)
2504  p--;
2505  if (p == bptr)
2506  continue;
2507  *p = '\0';
2508  ParseSpComment(descrs, bptr);
2509  *p = '\n';
2510  }
2511 
 /* Last (or only) comment of the block. */
2512  ParseSpComment(descrs, bptr);
2513  *eptr = ch;
2514 }
2515 
2516 /**********************************************************/
2517 static void SPAppendPIRToHist(CBioseq& bioseq, const CSP_block& spb)
2518 {
2519  if (spb.GetSeqref().empty())
2520  return;
2521 
2522  CSeq_hist_rec::TIds rep_ids;
2523 
2524  for (const auto& cur_ref : spb.GetSeqref()) {
2525  if (! cur_ref->IsPir())
2526  continue;
2527 
2528  CRef<CTextseq_id> text_id(new CTextseq_id);
2529  text_id->Assign(cur_ref->GetPir());
2530 
2531  CRef<CSeq_id> rep_id(new CSeq_id);
2532  rep_id->SetPir(*text_id);
2533 
2534  rep_ids.push_back(rep_id);
2535  }
2536 
2537  if (rep_ids.empty())
2538  return;
2539 
2540  CSeq_hist& hist = bioseq.SetInst().SetHist();
2541  hist.SetReplaces().SetIds().splice(hist.SetReplaces().SetIds().end(), rep_ids);
2542 }
2543 
2544 /**********************************************************/
/* Checks whether "ohname" starts with "orpname", ignoring ASCII case.
 *
 * Trailing blanks of "orpname" are ignored.  What follows the matched
 * prefix in "ohname" may only be blanks and then either the end of the
 * string or an opening parenthesis.  Two null pointers match each other;
 * a single null pointer never matches.
 */
static bool IfOHTaxIdMatchOHName(const char* orpname, const char* ohname)
{
    if (! orpname || ! ohname)
        return ! orpname && ! ohname;

    const char* tax  = orpname;
    const char* host = ohname;

    /* advance over the common prefix, upper-casing a-z before comparing */
    while (*tax != '\0' && *host != '\0') {
        char a = *tax;
        char b = *host;

        if (a >= 'a' && a <= 'z')
            a &= ~040;
        if (b >= 'a' && b <= 'z')
            b &= ~040;
        if (a != b)
            break;

        tax++;
        host++;
    }

    /* the whole organism name must have been consumed */
    while (*tax == ' ')
        tax++;
    if (*tax != '\0')
        return false;

    while (*host == ' ')
        host++;
    return *host == '(' || *host == '\0';
}
2579 
2580 /**********************************************************/
/* Fills the descriptor list of "bioseq" for the current SWISS-PROT entry:
 * title, SP-block, comments, create/update dates, source, molinfo and
 * publications.  When the accver/histacc options are set for an SPROT
 * source, the extra accessions are also put into the Bioseq history.
 *
 *   bioseq - Bioseq being built;
 *   pp     - parser state (current entry is pp->entrylist[pp->curindx]);
 *   entry  - data block of the entry.
 */
2581 static void GetSprotDescr(CBioseq& bioseq, ParserPtr pp, DataBlkPtr entry)
2582 {
2583  DataBlkPtr dbp;
2584  char* offset;
2585  CBioSource::TGenome gmod;
2586  bool fragment = false;
2587  TTaxId taxid;
2588 
2589  IndexblkPtr ibp;
2590  ViralHostPtr vhp;
2591  ViralHostPtr tvhp;
2592 
2593  CSeq_descr& descr = bioseq.SetDescr();
2594 
 /* DE data ==> title */
2595  ibp = pp->entrylist[pp->curindx];
2596  size_t len = 0;
2597  offset = SrchNodeType(entry, ParFlatSP_DE, &len);
2598  if (offset) {
2599  char* title = GetSPDescrTitle(offset, offset + len, &fragment);
2600  if (title) {
2601  CRef<CSeqdesc> desc_new(new CSeqdesc);
2602  desc_new->SetTitle(title);
2603  descr.Set().push_back(desc_new);
2604  }
2605  }
2606 
2607  /* sp-block
2608  */
2609  CRef<CSP_block> spb = GetDescrSPBlock(pp, entry, bioseq);
2610 
 /* NOTE(review): spb is dereferenced here and further down without a
  * check, although some later uses are guarded by spb.NotEmpty().
  */
2611  GetSPDescrComment(entry, descr.Set(), ibp->acnum, spb->GetClass());
2612 
 /* Secondary SPROT accessions become the "replaces" ids of the history. */
2613  if (spb.NotEmpty() && pp->accver && pp->histacc && pp->source == Parser::ESource::SPROT) {
2614  CSeq_hist_rec::TIds rep_ids;
2615 
2616  for (const string& cur_acc : spb->GetExtra_acc()) {
2617  if (cur_acc.empty() || ! IsSPROTAccession(cur_acc.c_str()))
2618  continue;
2619 
2620  CRef<CTextseq_id> text_id(new CTextseq_id);
2621  text_id->SetAccession(cur_acc);
2622 
2623  CRef<CSeq_id> rep_id(new CSeq_id);
2624  rep_id->SetSwissprot(*text_id);
2625  rep_ids.push_back(rep_id);
2626  }
2627 
2628  if (! rep_ids.empty()) {
2629  CSeq_hist& hist = bioseq.SetInst().SetHist();
2630  hist.SetReplaces().SetIds().swap(rep_ids);
2631  }
2632  }
2633 
 /* create-date descriptor */
2634  if (spb->CanGetCreated()) {
2635  CRef<CSeqdesc> create_date_descr(new CSeqdesc);
2636  create_date_descr->SetCreate_date().Assign(spb->GetCreated());
2637 
2638  descr.Set().push_back(create_date_descr);
2639  }
2640 
 /* update-date descriptor: the annotation update date is used only when
  * it compares as "after" the sequence update date; otherwise the
  * sequence update date is used.
  */
2641  bool has_update_date = spb->CanGetAnnotupd() || spb->CanGetSequpd();
2642  CDate upd_date;
2643 
2644  if (has_update_date) {
2645  if (spb->CanGetAnnotupd() && spb->CanGetSequpd()) {
2646  upd_date.Assign(spb->GetAnnotupd().Compare(spb->GetSequpd()) == CDate::eCompare_after ? spb->GetAnnotupd() : spb->GetSequpd());
2647  } else if (spb->CanGetAnnotupd())
2648  upd_date.Assign(spb->GetAnnotupd());
2649  else
2650  upd_date.Assign(spb->GetSequpd());
2651 
2652  CRef<CSeqdesc> upd_date_descr(new CSeqdesc);
2653  upd_date_descr->SetUpdate_date().Assign(upd_date);
2654 
2655  descr.Set().push_back(upd_date_descr);
2656  }
2657 
 /* A create date later than the update date is reported, not fixed. */
2658  if (spb->CanGetCreated() && has_update_date &&
2659  spb->GetCreated().Compare(upd_date) == CDate::eCompare_after) {
2660  string upd_date_str, create_date_str;
2661 
2662  upd_date.GetDate(&upd_date_str);
2663  spb->GetCreated().GetDate(&create_date_str);
2664 
2665  ErrPostEx(SEV_ERROR, ERR_DATE_IllegalDate, "Update-date \"%s\" precedes create-date \"%s\".", upd_date_str.c_str(), create_date_str.c_str());
2666  }
2667 
2668  dbp = TrackNodeType(*entry, ParFlatSP_OS);
2669  gmod = GetSPGenome(dbp);
2670 
2671  /* Org-ref from ID lines
2672  */
 /* Only the first ParFlatSP_ID node is processed: every path through the
  * loop body below ends with "break".
  */
2673  for (dbp = TrackNodeType(*entry, ParFlatSP_ID); dbp; dbp = dbp->mpNext) {
2674  if (dbp->mType != ParFlatSP_ID)
2675  continue;
2676 
2677  CRef<CBioSource> bio_src;
2678 
 /* Preferred route: organism looked up by the OX-line TaxID. */
2679  taxid = GetTaxIdFrom_OX(dbp);
2680  if (taxid > ZERO_TAX_ID) {
2681  CRef<COrg_ref> org_ref = fta_fix_orgref_byid(pp, taxid, &ibp->drop, false);
2682  if (org_ref.Empty())
2683  ErrPostEx(SEV_ERROR, ERR_SOURCE_NcbiTaxIDLookupFailure, "NCBI TaxID lookup for %d failed : will use organism name for lookup instead.", TAX_ID_TO(int, taxid));
2684  else {
2685  bio_src.Reset(new CBioSource);
2686 
2687  if (gmod != CBioSource::eGenome_unknown)
2688  bio_src->SetGenome(gmod);
2689  bio_src->SetOrg(*org_ref);
2690  }
2691  }
2692 
 /* Fallback: organism from the OS/OC lines.  When the TaxID route already
  * succeeded, the OS name is only compared against the looked-up name.
  */
2693  CRef<COrg_ref> org_ref = GetOrganismFrom_OS_OC(dbp);
2694  if (org_ref.NotEmpty()) {
2695  if (bio_src.Empty()) {
2696  bio_src.Reset(new CBioSource);
2697 
2698  if (gmod != CBioSource::eGenome_unknown)
2699  bio_src->SetGenome(gmod);
2700  fta_fix_orgref(pp, *org_ref, &ibp->drop, nullptr);
2701  bio_src->SetOrg(*org_ref);
2702  } else if (org_ref->IsSetTaxname()) {
2703  if (! bio_src->IsSetOrg() || ! bio_src->GetOrg().IsSetTaxname() ||
2704  NStr::CompareNocase(org_ref->GetTaxname().c_str(), bio_src->GetOrg().GetTaxname().c_str()) != 0)
2705  ErrPostEx(SEV_ERROR, ERR_SOURCE_OrgNameVsTaxIDMissMatch, "Organism name \"%s\" from OS line does not match the organism name \"%s\" obtained by lookup of NCBI TaxID.", org_ref->GetTaxname().c_str(), bio_src->GetOrg().GetTaxname().c_str());
2706  }
2707  }
2708 
2709  if (bio_src.Empty())
2710  break;
2711 
 /* OH data ==> nat_host OrgMods.  The host list is consumed here: every
  * node is deleted after use.
  */
2712  vhp = GetViralHostsFrom_OH(dbp);
2713  if (vhp) {
2714  COrgName& orgname = bio_src->SetOrg().SetOrgname();
2715 
2716  for (tvhp = vhp; tvhp; tvhp = vhp) {
2717  vhp = tvhp->next;
2718 
2719  CRef<COrgMod> mod(new COrgMod);
2720  mod->SetSubtype(COrgMod::eSubtype_nat_host);
2721  mod->SetSubname(tvhp->name);
2722  orgname.SetMod().push_back(mod);
2723 
2724  if (tvhp->taxid <= ZERO_TAX_ID) {
2725  delete tvhp;
2726  continue;
2727  }
2728 
 /* Validate the host TaxID; a local "drop" flag is used so that a failed
  * host lookup does not mark the entry itself.
  */
2729  bool drop = false;
2730  CRef<COrg_ref> org_ref_cur = fta_fix_orgref_byid(pp, tvhp->taxid, &drop, true);
2731  if (org_ref_cur.Empty()) {
2732  if (! drop)
2733  ErrPostEx(SEV_ERROR, ERR_SOURCE_InvalidNcbiTaxID, "OH-line TaxId \"%d\" was not found via the NCBI TaxArch service.", TAX_ID_TO(int, tvhp->taxid));
2734  else
2735  ErrPostEx(SEV_ERROR, ERR_SOURCE_NcbiTaxIDLookupFailure, "Taxonomy lookup for OH-line TaxId \"%d\" failed.", TAX_ID_TO(int, tvhp->taxid));
2736  } else {
 /* NUL-terminated copy of the looked-up name (empty when not set). */
2737  vector<Char> org_taxname;
2738  if (org_ref_cur->IsSetTaxname()) {
2739  const string& cur_taxname = org_ref_cur->GetTaxname();
2740  org_taxname.assign(cur_taxname.begin(), cur_taxname.end());
2741  }
2742 
2743  org_taxname.push_back(0);
2744 
 /* NOTE(review): the listing skips source lines 2746-2747, which hold the
  * beginning of the statement whose arguments follow.
  */
2745  if (! IfOHTaxIdMatchOHName(&org_taxname[0], tvhp->name))
2748  "OH-line HostName \"%s\" does not match NCBI organism name \"%s\" obtained by lookup of NCBI TaxID \"%d\".",
2749  tvhp->name,
2750  &org_taxname[0],
2751  TAX_ID_TO(int, tvhp->taxid));
2752  }
2753  delete tvhp;
2754  }
2755  }
2756 
2757  fta_sort_biosource(*bio_src);
2758 
2759  CRef<CSeqdesc> bio_src_desc(new CSeqdesc);
2760  bio_src_desc->SetSource(*bio_src);
2761  descr.Set().push_back(bio_src_desc);
2762  break;
2763  }
2764 
2765  if (spb.NotEmpty())
2766  SPAppendPIRToHist(bioseq, *spb);
2767 
 /* NOTE(review): the listing skips source lines 2770-2771, where mol_info
  * is presumably filled in - not visible here.
  */
2768  CRef<CSeqdesc> mol_info_descr(new CSeqdesc);
2769  CMolInfo& mol_info = mol_info_descr->SetMolinfo();
2772 
2773  descr.Set().push_back(mol_info_descr);
2774 
2775  /* RN data ==> pub
2776  */
2777  dbp = TrackNodeType(*entry, ParFlat_REF_END);
2778  for (; dbp; dbp = dbp->mpNext) {
2779  if (dbp->mType != ParFlat_REF_END)
2780  continue;
2781 
2782  CRef<CPubdesc> pub_desc = DescrRefs(pp, dbp, ParFlat_COL_DATA_SP);
2783  if (pub_desc.NotEmpty()) {
2784  CRef<CSeqdesc> pub_desc_descr(new CSeqdesc);
2785  pub_desc_descr->SetPub(*pub_desc);
2786 
2787  descr.Set().push_back(pub_desc_descr);
2788  }
2789  }
2790 }
2791 
2792 /**********************************************************
2793  *
2794  * static void GetSPInst(pp, entry, protconv):
2795  *
2796  * Fills in Seq-inst for an entry. Assumes Bioseq
2797  * already allocated.
2798  *
2799  * 10-8-93
2800  *
2801  **********************************************************/
2802 static void GetSPInst(ParserPtr pp, DataBlkPtr entry, unsigned char* protconv)
2803 {
2804  EntryBlkPtr ebp;
2805 
2806  ebp = static_cast<EntryBlk*>(entry->mpData);
2807 
2808  CBioseq& bioseq = ebp->seq_entry->SetSeq();
2809 
2810  bioseq.SetInst().SetRepr(CSeq_inst::eRepr_raw);
2811  bioseq.SetInst().SetMol(CSeq_inst::eMol_aa);
2812 
2813  GetSeqData(pp, *entry, bioseq, ParFlatSP_SQ, protconv, CSeq_data::e_Iupacaa);
2814 }
2815 
2816 /**********************************************************/
/* Releases a single feature-input node with "delete"; nodes linked after
 * it are not touched.
 * NOTE(review): the listing skips source line 2817 (the signature);
 * presumably this is FreeSPFeatInput(spfip), as called elsewhere in this
 * file.
 */
2818 {
2819  delete spfip;
2820 }
2821 
2822 /**********************************************************
2823  *
2824  * static void FreeSPFeatInputSet(spfip):
2825  *
2826  * 10-18-93
2827  *
2828  **********************************************************/
/* Walks the linked list starting at "spfip" and frees every node, saving
 * the "next" link before the node is released.
 * NOTE(review): the listing skips source lines 2829 (the signature) and
 * 2831 (presumably the declaration of "next").
 */
2830 {
2832 
2833  for (; spfip; spfip = next) {
2834  next = spfip->next;
2835  FreeSPFeatInput(spfip);
2836  }
2837 }
2838 
2839 /**********************************************************/
/* Returns true when both nodes are null, or when both are non-null and
 * agree on key, from, to and descrip; returns false otherwise.
 * NOTE(review): the listing skips source line 2840 (the signature);
 * presumably this is fta_spfeats_same(fip1, fip2).
 */
2841 {
2842  if (! fip1 && ! fip2)
2843  return true;
2844 
2845  if (! fip1 || ! fip2 ||
2846  fip1->key != fip2->key ||
2847  fip1->from != fip2->from ||
2848  fip1->to != fip2->to ||
2849  fip1->descrip != fip2->descrip)
2850  return false;
2851 
2852  return true;
2853 }
2854 
2855 /**********************************************************/
/* Removes duplicate features from the list starting at "spfip".  For each
 * node, every later node that fta_spfeats_same() reports as equal is
 * unlinked, reported with a warning and freed; the first occurrence is
 * kept.
 * NOTE(review): the listing skips source line 2856 (the signature);
 * presumably this is fta_remove_dup_spfeats(spfip).
 */
2857 {
2858  SPFeatInputPtr fip;
2859  SPFeatInputPtr fipnext;
2860  SPFeatInputPtr fipprev;
2861 
2862  if (! spfip || ! spfip->next)
2863  return;
2864 
2865  for (; spfip && spfip->next; spfip = spfip->next) {
 /* fipprev trails fip so that a duplicate can be unlinked in place. */
2866  fipprev = spfip;
2867  for (fip = spfip->next; fip; fip = fipnext) {
2868  fipnext = fip->next;
2869  if (! fta_spfeats_same(spfip, fip)) {
2870  fipprev = fip;
2871  continue;
2872  }
2873  fipprev->next = fip->next;
2874  ErrPostEx(SEV_WARNING, ERR_FEATURE_DuplicateRemoved, "Duplicated feature \"%s\" at location \"%s..%s\" removed.", fip->key.empty() ? "???" : fip->key.c_str(), fip->from.empty() ? "???" : fip->from.c_str(), fip->to.empty() ? "???" : fip->to.c_str());
2875  FreeSPFeatInput(fip);
2876  }
2877  }
2878 }
2879 
2880 /**********************************************************/
/* Post-processes the description of a VAR_SEQ feature in place.
 *
 * When the text contains "->" delimited by a blank or newline on both
 * sides, with a run of upper-case letters ending just before it and
 * another starting just after it, newlines inside those two runs are
 * removed and all remaining newlines become blanks.  In every other case
 * the newlines are simply replaced by blanks.
 */
2881 static void SPPostProcVarSeq(string& varseq)
2882 {
2883  char* temp;
2884  char* end;
2885  char* p;
2886  char* q;
2887 
2888  if (varseq.empty())
2889  return;
2890 
 /* Work on a private copy; it is freed on every exit path below. */
2891  temp = StringSave(varseq.c_str());
2892  p = StringStr(temp, "->");
2893  if (! p || p == temp ||
2894  (*(p - 1) != ' ' && *(p - 1) != '\n') || (p[2] != ' ' && p[2] != '\n')) {
2895  NStr::ReplaceInPlace(varseq, "\n", " ");
2896  MemFree(temp);
2897  return;
2898  }
2899 
 /* Left side: step back over the separators; the character found there
  * must be an upper-case letter.
  */
2900  for (p--; p > temp && (*p == ' ' || *p == '\n');)
2901  p--;
2902  if (*p < 'A' || *p > 'Z') {
2903  NStr::ReplaceInPlace(varseq, "\n", " ");
2904  MemFree(temp);
2905  return;
2906  }
2907 
 /* "end" is one past the last letter of the left run; p is moved back to
  * its first letter.
  */
2908  end = p + 1;
2909  while (p > temp && (*p == '\n' || (*p >= 'A' && *p <= 'Z')))
2910  p--;
2911  if (p > temp)
2912  p++;
2913  while (*p == '\n')
2914  p++;
 /* Squeeze newlines out of the left run; "end" shrinks by one for every
  * newline removed.
  */
2915  for (;;) {
2916  while (*p >= 'A' && *p <= 'Z' && p < end)
2917  p++;
2918  if (p == end)
2919  break;
2920  for (q = p; *p == '\n'; p++)
2921  end--;
2922  fta_StringCpy(q, p);
2923  }
2924 
 /* Skip the separators, the two characters of "->" and the separators
  * after it.
  */
2925  while (*p == ' ' || *p == '\n')
2926  p++;
2927  for (p += 2; *p == ' ' || *p == '\n';)
2928  p++;
2929 
2930  if (*p < 'A' || *p > 'Z') {
2931  NStr::ReplaceInPlace(varseq, "\n", " ");
2932  MemFree(temp);
2933  return;
2934  }
2935 
 /* Right side: find the end of the letters/newlines run, excluding any
  * trailing newlines.
  */
2936  for (q = p; *q == '\n' || (*q >= 'A' && *q <= 'Z');)
2937  q++;
2938  if (q > p && *(q - 1) == '\n') {
2939  for (q--; *q == '\n' && q > p;)
2940  q--;
2941  if (*q != '\n')
2942  q++;
2943  }
2944  end = q;
2945 
 /* Same squeeze as for the left run. */
2946  for (;;) {
2947  while (*p >= 'A' && *p <= 'Z' && p < end)
2948  p++;
2949  if (p == end)
2950  break;
2951  for (q = p; *p == '\n'; p++)
2952  end--;
2953  fta_StringCpy(q, p);
2954  }
2955 
 /* Any newline left anywhere in the text becomes a blank. */
2956  for (p = temp; *p != '\0'; p++)
2957  if (*p == '\n')
2958  *p = ' ';
2959 
2960  varseq = temp;
2961  MemFree(temp);
2962 }
2963 
2964 /**********************************************************
2965  *
2966  * static SPFeatInputPtr ParseSPFeat(entry, seqlen):
2967  *
2968  * Return a link list of feature input data, including
2969  * key, from, to, description.
2970  *
2971  * 10-15-93
2972  *
2973  **********************************************************/
/* Parses the FT block of "entry" into a linked list of SPFeatInput nodes
 * (key, from, to, descrip).
 *
 *   entry  - data block searched for the ParFlatSP_FT node;
 *   seqlen - sequence length; features whose from/to exceed it are dropped.
 *
 * Returns the head of the list after duplicates have been removed, or
 * nullptr when there is no FT block or no feature was kept.  Features with
 * a bad location, an invalid qualifier or an out-of-range location are
 * reported and dropped.
 */
2974 static SPFeatInputPtr ParseSPFeat(DataBlkPtr entry, size_t seqlen)
2975 {
2976  SPFeatInputPtr temp;
2977  SPFeatInputPtr current;
2978  SPFeatInputPtr spfip;
2979  const char* defdelim;
2980  char* fromstart;
2981  char* fromend;
2982  char* bptr;
2983  char* eptr;
2984  char* ptr1;
2985  char* offset;
2986  char* endline;
2987  char* str;
2988  const char* delim;
2989  char* quotes;
2990  char* location;
2991  char* p;
2992  char* q;
2993  int i;
2994  bool badqual;
2995  bool new_format;
2996  bool extra_text;
2997  Char ch;
2998 
2999  size_t len = 0;
3000  offset = SrchNodeType(entry, ParFlatSP_FT, &len);
3001  if (! offset)
3002  return nullptr;
3003 
3004  bptr = offset + ParFlat_COL_DATA_SP;
3005  eptr = offset + len;
3006 
3007  spfip = nullptr;
3008  current = nullptr;
3009 
 /* One iteration per feature: the key line plus its continuation lines. */
3010  while (bptr < eptr && (endline = SrchTheChar(bptr, eptr, '\n'))) {
3011  temp = new SPFeatInput;
3012 
 /* Feature key: at most 8 characters, ended by a blank or newline. */
3013  for (p = bptr, i = 0; *p != ' ' && *p != '\n' && i < 8; i++)
3014  p++;
3015  temp->key.assign(bptr, p);
 /* NOTE(review): the listing skips source line 3016 here. */
3017 
 /* VAR_SEQ descriptions are joined with newlines so that
  * SPPostProcVarSeq can process them later; all others use a blank.
  */
3018  if (temp->key == "VAR_SEQ")
3019  defdelim = "\n";
3020  else
3021  defdelim = " ";
3022 
3023  for (bptr += 8; *bptr == ' ' && bptr <= endline;)
3024  bptr++;
3025 
3026  location = bptr;
3027 
 /* Optional prefix in front of the position: a letter-initial token with
  * '-' at offset 6, followed by digits and ':' (presumably an isoform
  * accession such as "XXXXXX-N:" - TODO confirm).
  */
3028  if (((*bptr >= 'a' && *bptr <= 'z') || (*bptr >= 'A' && *bptr <= 'Z')) &&
3029  bptr[6] == '-') {
3030  for (bptr += 7; *bptr >= '0' && *bptr <= '9' && bptr <= endline;)
3031  bptr++;
3032  for (; *bptr == ':' && bptr <= endline;)
3033  bptr++;
3034  }
3035 
 /* "from" position: a run of digits and '?', '<', '>'. */
3036  for (ptr1 = bptr; *ptr1 == '?' || *ptr1 == '>' || *ptr1 == '<' ||
3037  (*ptr1 >= '0' && *ptr1 <= '9');)
3038  ptr1++;
3039 
3040  if (bptr < ptr1 && ptr1 <= endline) {
3041  temp->from.assign(bptr, ptr1);
3042  fromstart = bptr;
3043  fromend = ptr1;
3044  } else {
 /* No usable "from": report it with the location token temporarily
  * NUL-terminated, and mark the feature with from = "-1" so that it is
  * dropped further down.
  */
3045  ch = '\0';
3046  p = StringChr(location, ' ');
3047  q = StringChr(location, '\n');
3048  if (! p || (q && q < p))
3049  p = q;
3050  if (p) {
3051  ch = *p;
3052  *p = '\0';
3053  }
3054  if (bptr == ptr1)
3055  ErrPostEx(SEV_ERROR, ERR_FEATURE_BadLocation, "Invalid location \"%s\" at feature \"%s\". Feature dropped.", location, temp->key.c_str());
3056  else
3057  ErrPostEx(SEV_ERROR, ERR_FEATURE_BadLocation, "Empty location at feature \"%s\". Feature dropped.", temp->key.c_str());
3058  if (p)
3059  *p = ch;
3060  temp->from.assign("-1");
3061  fromstart = nullptr;
3062  fromend = nullptr;
3063  }
3064 
 /* Separator between from and to: blanks and/or dots; a dot marks the
  * "from..to" form (new_format).
  */
3065  new_format = false;
3066  bptr = ptr1;
3067  for (; (*bptr == ' ' || *bptr == '.') && bptr <= endline; bptr++)
3068  if (*bptr == '.')
3069  new_format = true;
3070  for (ptr1 = bptr; *ptr1 == '?' || *ptr1 == '>' || *ptr1 == '<' ||
3071  (*ptr1 >= '0' && *ptr1 <= '9');)
3072  ptr1++;
3073 
3074  p = (char*)temp->from.c_str();
3075  if (*p == '<' || *p == '>')
3076  p++;
3077 
 /* "to" position: when absent, it defaults to the "from" text (single
  * position), or to "-1" when "from" was bad as well.  extra_text is set
  * when something other than a newline follows the position of a feature
  * whose "from" is not the "-1" marker.
  */
3078  for (q = ptr1; *q == ' ';)
3079  q++;
3080  extra_text = false;
3081  if (bptr < ptr1 && ptr1 <= endline) {
3082  if (*q != '\n' && new_format && (*p == '?' || atoi(p) != -1))
3083  extra_text = true;
3084  temp->to.assign(bptr, ptr1);
3085  } else if (fromstart) {
3086  if (*q != '\n' && (*p == '?' || atoi(p) != -1))
3087  extra_text = true;
3088  temp->to.assign(fromstart, fromend);
3089  } else {
3090  if (*q != '\n' && (*p == '?' || atoi(p) != -1))
3091  extra_text = true;
3092  temp->to.assign("-1");
3093  }
3094 
 /* Reject trailing text and from > to; the feature is marked with
  * from = "-1".
  */
3095  q = (char*)temp->to.c_str();
3096  if (*q == '<' || *q == '>')
3097  q++;
3098  if (extra_text || (*p != '?' && *q != '?' && (atoi(p) > atoi(q)))) {
3099  ch = '\0';
3100  p = extra_text ? nullptr : StringChr(location, ' ');
3101  q = StringChr(location, '\n');
3102  if (! p || (q && q < p))
3103  p = q;
3104  if (p) {
3105  ch = *p;
3106  *p = '\0';
3107  }
3108  ErrPostEx(SEV_ERROR, ERR_FEATURE_BadLocation, "Invalid location \"%s\" at feature \"%s\". Feature dropped.", location, temp->key.c_str());
3109  if (p)
3110  *p = ch;
3111  temp->from.assign("-1");
3112  }
3113 
3114  for (bptr = ptr1; *bptr == ' ' && bptr <= endline;)
3115  bptr++;
3116 
 /* Description text on the key line.  If the line ends with '-' that is
  * not preceded by a blank, the next piece is appended without a
  * delimiter.
  */
3117  str = endline;
3118  delim = defdelim;
3119  if (str > bptr)
3120  if (*--str == '-' && str > bptr)
3121  if (*--str != ' ')
3122  delim = nullptr;
3123  if (bptr <= endline)
3124  temp->descrip.assign(bptr, endline);
3125 
3126  for (bptr = endline; *bptr == ' ' || *bptr == '\n';)
3127  bptr++;
3128 
3129  badqual = false;
3130  bptr += ParFlat_COL_DATA_SP;
3131  while (bptr < eptr && (*bptr == ' ')) /* continue description data */
3132  {
3133  while (*bptr == ' ')
3134  bptr++;
3135 
 /* Qualifier handling: /note="..." contributes only its value;
  * /evidence="..." and /id="..." keep the qualifier name but lose the
  * opening quote ("quotes" points at it); any other /name="..." makes the
  * feature invalid.
  */
3136  if (StringEquN(bptr, "/note=\"", 7)) {
3137  bptr += 7;
3138  quotes = nullptr;
3139  } else if (StringEquN(bptr, "/evidence=\"", 11)) {
3140  quotes = bptr + 10;
3141  if (! StringEquN(quotes + 1, "ECO:", 4)) {
3142  p = StringChr(bptr, '\n');
3143  if (p)
3144  *p = '\0';
3145  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_InvalidEvidence, "/evidence qualifier does not have expected \"ECO:\" prefix : \"%s\".", bptr);
3146  if (p)
3147  *p = '\n';
3148  }
3149  } else if (StringEquN(bptr, "/id=\"", 5))
3150  quotes = bptr + 4;
3151  else {
3152  if (*bptr == '/') {
3153  for (p = bptr + 1; (*p >= 'a' && *p <= 'z') || (*p >= 'A' && *p <= 'Z') || (*p >= '0' && *p <= '9') || *p == '_';)
3154  p++;
3155  if (*p == '=' && p[1] == '\"') {
3156  *p = '\0';
3157  badqual = true;
3158  ErrPostEx(SEV_ERROR, ERR_FEATURE_InvalidQualifier, "Qualifier %s is invalid for the feature \"%s\" at \"%s..%s\".", bptr, temp->key.c_str(), temp->from.c_str(), temp->to.c_str());
3159  *p = '=';
3160  }
3161  }
3162  quotes = nullptr;
3163  }
3164 
 /* A closing quote at the end of the line is temporarily turned into '.'
  * (restored below); when the character before it is already '.', the
  * last character is left out of the appended text instead.
  * NOTE(review): endline is used without a null check here - confirm
  * SrchTheChar cannot fail at this point.
  */
3165  endline = SrchTheChar(bptr, eptr, '\n');
3166  p = endline - 1;
3167  if (p >= bptr && *p == '\"')
3168  *p = '.';
3169  else
3170  p = nullptr;
3171 
3172  if (quotes) {
3173  StringCombine(temp->descrip, string(bptr, quotes), delim);
3174  if (p && p - 1 >= bptr && *(p - 1) == '.')
3175  StringCombine(temp->descrip, string(quotes + 1, endline - 1), "");
3176  else
3177  StringCombine(temp->descrip, string(quotes + 1, endline), "");
3178  } else {
3179  if (p && p - 1 >= bptr && *(p - 1) == '.')
3180  StringCombine(temp->descrip, string(bptr, endline - 1), delim);
3181  else
3182  StringCombine(temp->descrip, string(bptr, endline), delim);
3183  }
3184 
3185  if (p)
3186  *p = '\"';
3187 
 /* Recompute the delimiter for the next continuation line. */
3188  str = endline;
3189  delim = defdelim;
3190  if (str > bptr)
3191  if (*--str == '-' && str > bptr)
3192  if (*--str != ' ')
3193  delim = nullptr;
3194  for (bptr = endline; *bptr == ' ' || *bptr == '\n';)
3195  bptr++;
3196 
3197  bptr += ParFlat_COL_DATA_SP;
3198  }
3199 
3200  if (badqual) {
3201  ErrPostEx(SEV_ERROR, ERR_FEATURE_Dropped, "Invalid qualifier(s) found within the feature \"%s\" at \"%s..%s\". Feature dropped.", temp->key.c_str(), temp->from.c_str(), temp->to.c_str());
3202  FreeSPFeatInputSet(temp);
3203  continue;
3204  }
3205 
 /* defdelim is "\n" only for VAR_SEQ features. */
3206  if (*defdelim == '\n')
3207  SPPostProcVarSeq(temp->descrip);
3208 
 /* Features marked with from = "-1" above are dropped silently here; the
  * error was already posted.
  */
3209  p = (char*)temp->from.c_str();
3210  if (*p == '<' || *p == '>')
3211  p++;
3212  if (*p != '?' && atoi(p) < 0) {
3213  FreeSPFeatInputSet(temp);
3214  continue;
3215  }
3216 
3217  q = (char*)temp->to.c_str();
3218  if (*q == '<' || *q == '>')
3219  q++;
 /* NOTE(review): seqlen is a size_t but is formatted with "%d" in the two
  * messages below.
  */
3220  if ((*p != '?' && atoi(p) > (Int4)seqlen) || (*q != '?' && atoi(q) > (Int4)seqlen)) {
3221  ErrPostEx(SEV_WARNING, ERR_LOCATION_FailedCheck, "Location range exceeds the sequence length: feature=%s, length=%d, from=%s, to=%s", temp->key.c_str(), seqlen, temp->from.c_str(), temp->to.c_str());
3222  ErrPostEx(SEV_ERROR, ERR_FEATURE_Dropped, "Location range exceeds the sequence length: feature=%s, length=%d, from=%s, to=%s", temp->key.c_str(), seqlen, temp->from.c_str(), temp->to.c_str());
3223  FreeSPFeatInputSet(temp);
3224  continue;
3225  }
3226 
 /* Append the accepted feature to the result list. */
3227  if (! spfip)
3228  spfip = temp;
3229  else
3230  current->next = temp;
3231  current = temp;
3232  }
3233 
3234  fta_remove_dup_spfeats(spfip);
3235 
3236  return (spfip);
3237 }
3238 
3239 /**********************************************************
3240  *
3241  * static CRef<CSeq_loc> GetSPSeqLoc(pp, spfip, bond, initmet,
3242  * signal):
3243  *
3244  * The following rules are assumptions, pending
3245  * confirmation from Mark:
3246  * - subtract one if from > 0 and
3247  * - unknown endpoint "?" is not implemented.
3248  *
3249  * 10-18-93
3250  *
3251  **********************************************************/
3252 static CRef<CSeq_loc> GetSPSeqLoc(ParserPtr pp, SPFeatInputPtr spfip, bool bond, bool initmet, bool signal)
3253 {
3254  CRef<CSeq_loc> loc;
3255 
3256  IndexblkPtr ibp;
3257 
3258  const char* ptr;
3259 
3260  bool fuzzfrom = false;
3261  bool fuzzto = false;
3262  bool nofrom = false;
3263  bool noto = false;
3264  bool pntfuzz = false;
3265  Int4 from;
3266  Int4 to;
3267 
3268  if (! spfip || spfip->from.empty() || spfip->to.empty())
3269  return loc;
3270 
3271  ibp = pp->entrylist[pp->curindx];
3272 
3273  loc.Reset(new CSeq_loc);
3274 
3275  ptr = spfip->from.c_str();
3276  if (StringChr(ptr, '<')) {
3277  fuzzfrom = true;
3278 
3279  while (*ptr != '\0' && isdigit(*ptr) == 0)
3280  ptr++;
3281  from = (Int4)atoi(ptr);
3282  } else if (StringChr(ptr, '?')) {
3283  from = 0;
3284  nofrom = true;
3285  } else {
3286  from = (Int4)atoi(ptr);
3287  }
3288  if ((initmet == false && from != 0) ||
3289  (initmet && signal && from == 1))
3290  from--;
3291 
3292  ptr = spfip->to.c_str();
3293  if (StringChr(ptr, '>')) {
3294  fuzzto = true;
3295  while (*ptr != '\0' && isdigit(*ptr) == 0)
3296  ptr++;
3297  to = (Int4)atoi(ptr);
3298  } else if (StringChr(ptr, '?')) {
3299  to = static_cast<Int4>(ibp->bases);
3300  noto = true;
3301  } else
3302  to = (Int4)atoi(ptr);
3303 
3304  if (initmet == false && to != 0)
3305  to--;
3306  if (nofrom && noto)
3307  pntfuzz = true;
3308 
3309  if (bond) {
3310  CSeq_bond& bond = loc->SetBond();
3311  CSeq_point& point_a = bond.SetA();
3312 
3313  point_a.SetPoint(from);
3314  point_a.SetId(*MakeAccSeqId(ibp->acnum, pp->seqtype, pp->accver, ibp->vernum));
3315 
3316  if (fuzzfrom)
3317  GetIntFuzzPtr(4, 2, 0, point_a.SetFuzz());
3318 
3319  if (from != to) {
3320  CSeq_point& point_b = bond.SetB();
3321  point_b.SetPoint(to);
3322  point_b.SetId(*MakeAccSeqId(ibp->acnum, pp->seqtype, pp->accver, ibp->vernum));
3323 
3324  if (fuzzto)
3325  GetIntFuzzPtr(4, 1, 0, point_b.SetFuzz());
3326  }
3327  } else if (from != to && ! pntfuzz) {
3328  CSeq_interval& interval = loc->SetInt();
3329  interval.SetFrom(from);
3330  interval.SetTo(to);
3331  interval.SetId(*MakeAccSeqId(ibp->acnum, pp->seqtype, pp->accver, ibp->vernum));
3332 
3333  if (fuzzfrom)
3334  GetIntFuzzPtr(4, 2, 0, interval.SetFuzz_from()); /* lim, lt, no-min */
3335 
3336  if (nofrom)
3337  GetIntFuzzPtr(2, to - 1, 0, interval.SetFuzz_from()); /* range, max, min */
3338 
3339  if (noto)
3340  GetIntFuzzPtr(2, to, from + 1, interval.SetFuzz_to()); /* range, max, min */
3341 
3342  if (fuzzto)
3343  GetIntFuzzPtr(4, 1, 0, interval.SetFuzz_to()); /* lim, gt, no-min */
3344  } else {
3345  CSeq_point& point = loc->SetPnt();
3346  point.SetPoint(from);
3347  point.SetId(*MakeAccSeqId(ibp->acnum, pp->seqtype, pp->accver, ibp->vernum));
3348 
3349  if (pntfuzz) {
3350  GetIntFuzzPtr(2, to, from, point.SetFuzz()); /* range, max, min */
3351  } else if (fuzzfrom) {
3352  GetIntFuzzPtr(4, 2, 0, point.SetFuzz());
3353  }
3354  }
3355 
3356  return loc;
3357 }
3358 
3359 /**********************************************************
3360  *
3361  * static char* DelTheStr(sourcestr, targetstr):
3362  *
3363  * Return a string with deleted "targetstr".
3364  * Also Free out "sourcestr".
3365  *
3366  **********************************************************/
3367 /* bsv : 03/04/2020 : no Seq-feat.exp-ev setting anymore
3368 static void DelTheStr(string& sourcestr, const string& targetstr)
3369 {
3370  NStr::ReplaceInPlace(sourcestr, targetstr, "", 0, 1);
3371  NStr::TruncateSpacesInPlace(sourcestr, NStr::eTrunc_End);
3372 }
3373 */
3374 
3375 /**********************************************************
3376  *
3377  * static bool SPFeatNoExp(pp, spfip):
3378  *
3379  * Return TRUE if "str" containing any string in the
3380  * ParFlat_SPFeatNoExp or ParFlat_SPFeatNoExpW (old
3381  * patterns, put warning message).
3382  *
3383  * 10-18-93
3384  *
3385  **********************************************************/
3386 /* bsv : 03/04/2020 : no Seq-feat.exp-ev setting anymore
3387 static bool SPFeatNoExp(ParserPtr pp, SPFeatInputPtr spfip)
3388 {
3389  Int2 indx;
3390  Int4 len = 0;
3391 
3392  if (!spfip)
3393  return false;
3394 
3395  if(MatchArrayISubString(ParFlat_SPFeatNoExp, spfip->descrip.c_str()) != -1)
3396  return true;
3397 
3398  indx = MatchArrayISubString(ParFlat_SPFeatNoExpW, spfip->descrip.c_str());
3399  if(indx == -1)
3400  return false;
3401 
3402  DelTheStr(spfip->descrip, ParFlat_SPFeatNoExpW[indx]);
3403  if (len > 0 && spfip->descrip[len-1] != '.')
3404  {
3405  StringCombine(spfip->descrip, ".", nullptr);
3406  }
3407 
3408  ErrPostEx(SEV_WARNING, ERR_FEATURE_OldNonExp,
3409  "Old Non-experimental feature description, %s",
3410  ParFlat_SPFeatNoExpW[indx]);
3411 
3412  return true;
3413 }
3414 */
3415 
3416 /**********************************************************
3417  *
3418  * static Int2 GetSPSitesMod(retstr):
3419  *
3420  * Return an index array of ParFlat_SPFEAT for
3421  * a specific type of modified residue because the first
3422  * several words of a MOD_RES feature's description can
3423  * indicate a more specific type of modified residue.
3424  *
3425  * 10-18-93
3426  *
3427  **********************************************************/
3428 static Int2 GetSPSitesMod(string& retstr)
3429 {
3430  Int2 ret = ParFlatSPSitesModB;
3431 
3432  for (Int2 i = ParFlatSPSitesModB; i <= ParFlatSPSitesModE; i++) {
3433  size_t pos = NStr::FindNoCase(retstr, ParFlat_SPFeat[i].keystring, 0);
3434  if (pos == NPOS)
3435  continue;
3436 
3437  size_t len = StringLen(ParFlat_SPFeat[i].keystring);
3438  if ((pos != 0 && retstr[pos - 1] != ' ' && retstr[pos - 1] != '.') ||
3439  (retstr[pos + len] != '\0' && retstr[pos + len] != ' ' &&
3440  retstr[pos + len] != '.' && retstr[pos + len] != ';'))
3441  continue;
3442 
3443  ret = i;
3444  break;
3445  }
3446 
3447  return (ret);
3448 }
3449 
3450 /**********************************************************
3451  *
3452  * Int2 SpFeatKeyNameValid(keystr):
3453  *
3454  * 10-18-93
3455  *
3456  **********************************************************/
3458 {
      /* Walk ParFlat_SPFeat until the entry whose inkey is null,
         comparing each inkey with keystr ignoring case. */
3459  Int2 i;
3460 
3461  for (i = 0; ParFlat_SPFeat[i].inkey; i++)
3462  if (NStr::CompareNocase(ParFlat_SPFeat[i].inkey, keystr) == 0)
3463  break;
3464 
      /* The scan stopped on the null-inkey entry: nothing matched. */
3465  if (! ParFlat_SPFeat[i].inkey)
3466  return (-1);
      /* Otherwise i is the index of the matching table entry. */
3467  return (i);
3468 }
3469 
3470 /**********************************************************/
3472 {
3473  string descrip;
3474  char* loc;
3475  char* p;
3476  Uint1 type;
3477  Int2 indx;
3478  bool err = false;
3479 
3480  descrip.assign(CpTheQualValue(fbp->quals, "note"));
3481 
3482  if (NStr::EqualNocase("VARSPLIC", fbp->key)) {
3483  ErrPostStr(SEV_WARNING, ERR_FEATURE_ObsoleteFeature, "Obsolete UniProt feature \"VARSPLIC\" found. Replaced with \"VAR_SEQ\".");
3484  fbp->key = (char*)"VAR_SEQ";
3485  }
3486 
3487  if (NStr::EqualNocase(fbp->key, "NON_STD")) {
3488  if (NStr::EqualNocase(descrip, "Selenocysteine.")) {
3489  fbp->key = (char*)"SE_CYS";
3490  descrip.clear();
3491  } else
3492  fbp->key = (char*)"MOD_RES";
3493  }
3494 
3495  CRef<CSeq_feat> feat(new CSeq_feat);
3496  indx = fbp->spindex;
3497  type = ParFlat_SPFeat[indx].type;
3498  if (type == ParFlatSPSites) {
3499  if (indx == ParFlatSPSitesModB && ! descrip.empty())
3500  indx = GetSPSitesMod(descrip);
3501 
3502  feat->SetData().SetSite(static_cast<CSeqFeatData::ESite>(ParFlat_SPFeat[indx].keyint));
3503  } else if (type == ParFlatSPBonds) {
3504  feat->SetData().SetBond(static_cast<CSeqFeatData::EBond>(ParFlat_SPFeat[indx].keyint));
3505  } else if (type == ParFlatSPRegions) {
3506  feat->SetData().SetRegion(ParFlat_SPFeat[indx].keystring);
3507  } else if (type == ParFlatSPImports) {
3508  feat->SetData().SetImp().SetKey(ParFlat_SPFeat[indx].keystring);
3509  feat->SetData().SetImp().SetDescr("uncertain amino acids");
3510  } else {
3511  if (type != ParFlatSPInitMet && type != ParFlatSPNonTer &&
3512  type != ParFlatSPNonCons) {
3513  ErrPostEx(SEV_WARNING, ERR_FEATURE_Dropped, "Swiss-Prot feature \"%s\" with unknown type dropped.", fbp->key);
3514  }
3515  feat->Reset();
3516  return (null);
3517  }
3518 
3519  if (fbp->location) {
3520  loc = fbp->location;
3521  for (p = loc; *p; p++)
3522  if (*p != ' ')
3523  *loc++ = *p;
3524  *loc = '\0';
3525  if (pp->buf)
3526  MemFree(pp->buf);
3527  pp->buf = MemNew(StringLen(fbp->key) + StringLen(fbp->location) + 4);
3528  StringCpy(pp->buf, fbp->key);
3529  StringCpy(pp->buf, " : ");
3530  StringCpy(pp->buf, fbp->location);
3531  GetSeqLocation(*feat, fbp->location, seqids, &err, pp, fbp->key);
3532  if (pp->buf)
3533  MemFree(pp->buf);
3534  pp->buf = nullptr;
3535  }
3536  if (err) {
3537  if (! pp->debug) {
3538  ErrPostEx(SEV_ERROR, ERR_FEATURE_Dropped, "%s|%s| range check detects problems", fbp->key, fbp->location);
3539  if (! descrip.empty())
3540  descrip.clear();
3541  feat->Reset();
3542  return (null);
3543  }
3544  ErrPostEx(SEV_WARNING, ERR_LOCATION_FailedCheck, "%s|%s| range check detects problems", fbp->key, fbp->location);
3545  }
3546 
3547  if (SeqLocHaveFuzz(feat->GetLocation()))
3548  feat->SetPartial(true);
3549 
3550  if (! descrip.empty())
3551  feat->SetComment(descrip);
3552 
3553  return (feat);
3554 }
3555 
3556 /**********************************************************
3557  *
3558  * static void SPFeatGeneral(pp, spfip, initmet):
3559  *
3560  * 10-18-93
3561  *
3562  **********************************************************/
3563 static void SPFeatGeneral(ParserPtr pp, SPFeatInputPtr spfip, bool initmet, CSeq_annot::C_Data::TFtable& feats)
3564 {
3565  SPFeatInputPtr temp;
3566 
3567  Int2 indx;
3568  bool signal;
3569  bool bond;
3570  /* bsv : 03/04/2020 : no Seq-feat.exp-ev setting anymore
3571  bool noexp;
3572 */
3573  Uint1 type;
3574 
3575  for (temp = spfip; temp; temp = temp->next) {
3576  FtaInstallPrefix(PREFIX_FEATURE, temp->key.c_str(), temp->from.c_str());
3577 
3578  if (NStr::EqualNocase("VARSPLIC", temp->key)) {
3579  ErrPostStr(SEV_WARNING, ERR_FEATURE_ObsoleteFeature, "Obsolete UniProt feature \"VARSPLIC\" found. Replaced with \"VAR_SEQ\".");
3580  temp->key = "VAR_SEQ";
3581  }
3582 
3583  if (NStr::EqualNocase(temp->key, "NON_STD")) {
3584  if (NStr::EqualNocase(temp->descrip, "Selenocysteine.")) {
3585  temp->key = "SE_CYS";
3586  temp->descrip.clear();
3587  } else
3588  temp->key = "MOD_RES";
3589  }
3590 
3591  indx = SpFeatKeyNameValid(temp->key.c_str());
3592  if (indx == -1) {
3595  continue;
3596  }
3597 
3598  signal = false;
3599  bond = false;
3600 
3601  /* bsv : 03/04/2020 : no Seq-feat.exp-ev setting anymore
3602  noexp = SPFeatNoExp(pp, temp);
3603 */
3604 
3605  CRef<CSeq_feat> feat(new CSeq_feat);
3606 
3607  type = ParFlat_SPFeat[indx].type;
3608  if (type == ParFlatSPSites) {
3609  if (indx == ParFlatSPSitesModB)
3610  indx = GetSPSitesMod(temp->descrip);
3611 
3612  feat->SetData().SetSite(static_cast<CSeqFeatData::ESite>(ParFlat_SPFeat[indx].keyint));
3613  } else if (type == ParFlatSPBonds) {
3614  feat->SetData().SetBond(static_cast<CSeqFeatData::EBond>(ParFlat_SPFeat[indx].keyint));
3615  bond = true;
3616  } else if (type == ParFlatSPRegions) {
3617  feat->SetData().SetRegion(ParFlat_SPFeat[indx].keystring);
3618  if (feat->GetData().GetRegion() == "Signal")
3619  signal = true;
3620  } else if (type == ParFlatSPImports) {
3621  feat->SetData().SetImp().SetKey(ParFlat_SPFeat[indx].keystring);
3622  feat->SetData().SetImp().SetDescr("uncertain amino acids");
3623  } else {
3624  if (type != ParFlatSPInitMet && type != ParFlatSPNonTer &&
3625  type != ParFlatSPNonCons) {
3626  ErrPostEx(SEV_WARNING, ERR_FEATURE_Dropped, "Swiss-Prot feature \"%s\" with unknown type dropped.", temp->key.c_str());
3627  }
3629  continue;
3630  }
3631 
3632  /* bsv : 03/04/2020 : no Seq-feat.exp-ev setting anymore
3633  if(noexp)
3634  feat->SetExp_ev(CSeq_feat::eExp_ev_not_experimental);
3635  else
3636  feat->SetExp_ev(CSeq_feat::eExp_ev_experimental);
3637 */
3638 
3639 
3640  CRef<CSeq_loc> loc = GetSPSeqLoc(pp, temp, bond, initmet, signal);
3641  if (loc.NotEmpty())
3642  feat->SetLocation(*loc);
3643 
3644  if (SeqLocHaveFuzz(*loc))
3645  feat->SetPartial(true);
3646 
3647  if (! temp->descrip.empty())
3648  feat->SetComment(NStr::Sanitize(temp->descrip));
3649 
3650  feats.push_back(feat);
3651 
3653  }
3654 }
3655 
3656 /**********************************************************/
3657 static void DelParenthesis(char* str)
3658 {
3659  char* p;
3660  char* q;
3661  char* pp;
3662  char* qq;
3663  char* r;
3664  Int2 count;
3665  Int2 left;
3666  Int2 right;
3667 
3668  for (p = str; *p == ' ' || *p == '\t';)
3669  p++;
3670  for (q = p; *q != '\0';)
3671  q++;
3672  if (q > p)
3673  for (q--; (*q == ' ' || *q == '\t') && q > p;)
3674  *q-- = '\0';
3675  if (q == p && (*q == ' ' || *q == '\t'))
3676  *q = '\0';
3677  for (pp = p; *pp == '(';)
3678  pp++;
3679  for (qq = q; *qq == ')' && qq >= pp;)
3680  qq--;
3681  for (count = 0, left = 0, right = 0, r = pp; r <= qq; r++) {
3682  if (*r == '(')
3683  left++;
3684  else if (*r == ')') {
3685  right++;
3686  count = left - right;
3687  }
3688  }
3689  if (count < 0)
3690  for (; count < 0 && pp > p; pp--)
3691  count++;
3692  for (count = 0, r = qq; r >= pp; r--) {
3693  if (*r == '(')
3694  count--;
3695  else if (*r == ')')
3696  count++;
3697  }
3698  if (count < 0)
3699  for (; count < 0 && qq < q; qq++)
3700  count++;
3701  *++qq = '\0';
3702  if (pp != str)
3703  fta_StringCpy(str, pp);
3704 }
3705 
3706 /**********************************************************
3707  *
3708  * static void CkGeneNameSP(gname):
3709  *
3710  * Legal characters for gene_name are 0-9, a-z, A-Z,
3711  * under-score, dash, period, single quote, back single
3712  * quote, slash.
3713  *
3714  * 10-25-93
3715  *
3716  **********************************************************/
3717 static void CkGeneNameSP(char* gname)
3718 {
3719  char* p;
3720 
3721  DelParenthesis(gname);
3722  for (p = gname; *p != '\0'; p++)
3723  if (! (isalnum(*p) || *p == '_' || *p == '-' || *p == '.' ||
3724  *p == '\'' || *p == '`' || *p == '/' || *p == '(' || *p == ')'))
3725  break;
3726  if (*p != '\0')
3727  ErrPostEx(SEV_WARNING, ERR_GENENAME_IllegalGeneName, "gene_name contains unusual characters, %s, in SWISS-PROT", gname);
3728 }
3729 
3730 /**********************************************************
3731  *
3732  * static void ParseGeneNameSP(str, feat):
3733  *
3734  * gene_name and synonyms separated by " OR ".
3735  *
3736  * 10-25-93
3737  *
3738  **********************************************************/
3739 static void ParseGeneNameSP(char* str, CSeq_feat& feat)
3740 {
3741  char* p;
3742  char* q;
3743  Int2 count = 0;
3744 
3745  CGene_ref& gene = feat.SetData().SetGene();
3746 
3747  for (p = str; *p != '\0';) {
3748  while (*p == ' ')
3749  p++;
3750  for (q = p; *p != '\0' && *p != ' ';)
3751  p++;
3752  if (*p != '\0')
3753  *p++ = '\0';
3754  if (StringEqu(q, "AND") || StringEqu(q, "OR"))
3755  continue;
3756  char* gname = StringSave(q);
3757  CkGeneNameSP(gname);
3758  if (count == 0) {
3759  count++;
3760  gene.SetLocus(gname);
3761  } else {
3762  gene.SetSyn().push_back(gname);
3763  }
3764  MemFree(gname);
3765  }
3766 }
3767 
3768 /**********************************************************
3769  *
3770  * static CRef<CSeq_loc> GetSeqLocIntSP(seqlen, acnum,
3771  * accver, vernum):
3772  *
3773  * 10-18-93
3774  *
3775  **********************************************************/
3776 static CRef<CSeq_loc> GetSeqLocIntSP(size_t seqlen, char* acnum, bool accver, Int2 vernum)
3777 {
3778  CRef<CSeq_loc> loc(new CSeq_loc);
3779  CSeq_interval& interval = loc->SetInt();
3780 
3781  interval.SetFrom(0);
3782  interval.SetTo(static_cast<TSeqPos>(seqlen) - 1);
3783  interval.SetId(*MakeAccSeqId(acnum, CSeq_id::e_Swissprot, accver, vernum));
3784 
3785  return loc;
3786 }
3787 
3788 /**********************************************************
3789  *
3790  * static void GetOneGeneRef(pp, hsfp, bptr,
3791  * seqlen):
3792  *
3793  * Each Gene-ref separated by " AND ".
3794  *
3795  * 10-25-93
3796  *
3797  **********************************************************/
3798 static void GetOneGeneRef(ParserPtr pp, CSeq_annot::C_Data::TFtable& feats, char* bptr, size_t seqlen)
3799 {
3800  IndexblkPtr ibp;
3801 
3802  char* str;
3803  char* ptr;
3804 
3805  if (! pp || pp->entrylist.empty())
3806  return;
3807 
3808  ibp = pp->entrylist[pp->curindx];
3809  if (! ibp)
3810  return;
3811 
3812  str = StringSave(bptr);
3813  for (ptr = str; *ptr != '\0'; ptr++)
3814  if (*ptr == '\t')
3815  *ptr = ' ';
3816 
3818 
3819  CRef<CSeq_feat> feat(new CSeq_feat);
3820  ParseGeneNameSP(str, *feat);
3821  feat->SetLocation(*GetSeqLocIntSP(seqlen, ibp->acnum, pp->accver, ibp->vernum));
3822 
3823  feats.push_back(feat);
3824 }
3825 
3826 /**********************************************************/
3827 static void SPFreeGenRefTokens(char* name, char* syns, char* ltags, char* orfs)
3828 {
3829  if (name)
3830  MemFree(name);
3831  if (syns)
3832  MemFree(syns);
3833  if (ltags)
3834  MemFree(ltags);
3835  if (orfs)
3836  MemFree(orfs);
3837 }
3838 
3839 /**********************************************************/
3840 static void SPParseGeneRefTag(char* str, CGene_ref& gene, bool set_locus_tag)
3841 {
3842  char* p;
3843  char* q;
3844 
3845  if (! str)
3846  return;
3847 
3848  for (p = str; p && *p != '\0'; p = q) {
3849  while (*p == ' ' || *p == ',')
3850  p++;
3851  q = StringChr(p, ',');
3852  if (q)
3853  *q++ = '\0';
3854  if (q == p)
3855  continue;
3856  if (set_locus_tag && ! gene.IsSetLocus_tag()) {
3857  gene.SetLocus_tag(p);
3858  continue;
3859  }
3860 
3861  gene.SetSyn().push_back(p);
3862  }
3863 }
3864 
3865 /**********************************************************/
3866 static void SPGetOneGeneRefNew(ParserPtr pp, CSeq_annot::C_Data::TFtable& feats, size_t seqlen, char* name, char* syns, char* ltags, char* orfs)
3867 {
3868  IndexblkPtr ibp;
3869 
3870  if (! pp || pp->entrylist.empty() ||
3871  (! name && ! syns && ! ltags && ! orfs))
3872  return;
3873 
3874  ibp = pp->entrylist[pp->curindx];
3875  if (! ibp)
3876  return;
3877 
3878  CRef<CSeq_feat> feat(new CSeq_feat);
3879  CGene_ref& gene = feat->SetData().SetGene();
3880 
3881  if (name)
3882  gene.SetLocus(name);
3883 
3884 
3885  SPParseGeneRefTag(syns, gene, false);
3886  SPParseGeneRefTag(ltags, gene, true);
3887  SPParseGeneRefTag(orfs, gene, true);
3888 
3889  feat->SetLocation(*GetSeqLocIntSP(seqlen, ibp->acnum, pp->accver, ibp->vernum));
3890 
3891  feats.push_back(feat);
3892 }
3893 
3894 /**********************************************************/
3895 static void SPGetGeneRefsNew(ParserPtr pp, CSeq_annot::C_Data::TFtable& feats, char* bptr, size_t seqlen)
3896 {
3897  IndexblkPtr ibp;
3898 
3899  char* name;
3900  char* syns;
3901  char* ltags;
3902  char* orfs;
3903  char* str;
3904  char* p;
3905  char* q;
3906  char* r;
3907 
3908  if (! pp || pp->entrylist.empty() || ! bptr)
3909  return;
3910 
3911  ibp = pp->entrylist[pp->curindx];
3912  if (! ibp)
3913  return;
3914 
3915  str = StringSave(bptr);
3916 
3917  name = nullptr;
3918  syns = nullptr;
3919  ltags = nullptr;
3920  orfs = nullptr;
3921  for (p = str; p && *p != '\0'; p = q) {
3922  while (*p == ' ' || *p == ';')
3923  p++;
3924  for (r = p;; r = q + 1) {
3925  q = StringChr(r, ';');
3926  if (! q || q[1] == ' ' || q[1] == '\n' || q[1] == '\0')
3927  break;
3928  }
3929  if (q)
3930  *q++ = '\0';
3931  if (StringEquNI(p, "Name=", 5)) {
3932  if (name) {
3933  ErrPostEx(SEV_REJECT, ERR_FORMAT_ExcessGeneFields, "Field \"Name=\" occurs multiple times within a GN line. Entry dropped.");
3934  ibp->drop = true;
3935  break;
3936  }
3937  p += 5;
3938  if (p != q)
3939  name = StringSave(p);
3940  } else if (StringEquNI(p, "Synonyms=", 9)) {
3941  if (syns) {
3942  ErrPostEx(SEV_REJECT, ERR_FORMAT_ExcessGeneFields, "Field \"Synonyms=\" occurs multiple times within a GN line. Entry dropped.");
3943  ibp->drop = true;
3944  break;
3945  }
3946  p += 9;
3947  if (p != q)
3948  syns = StringSave(p);
3949  } else if (StringEquNI(p, "OrderedLocusNames=", 18)) {
3950  if (ltags) {
3951  ErrPostEx(SEV_REJECT, ERR_FORMAT_ExcessGeneFields, "Field \"OrderedLocusNames=\" occurs multiple times within a GN line. Entry dropped.");
3952  ibp->drop = true;
3953  break;
3954  }
3955  p += 18;
3956  if (p != q)
3957  ltags = StringSave(p);
3958  } else if (StringEquNI(p, "ORFNames=", 9)) {
3959  if (orfs) {
3960  ErrPostEx(SEV_REJECT, ERR_FORMAT_ExcessGeneFields, "Field \"ORFNames=\" occurs multiple times within a GN line. Entry dropped.");
3961  ibp->drop = true;
3962  break;
3963  }
3964  p += 9;
3965  if (p != q)
3966  orfs = StringSave(p);
3967  } else if (StringEquNI(p, "and ", 4)) {
3968  if (q)
3969  *--q = ';';
3970  q = p + 4;
3971 
3972  if (! name && ! syns && ! ltags && ! orfs)
3973  continue;
3974 
3975  if (! name && syns) {
3976  ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingGeneName, "Encountered a gene with synonyms \"%s\" that lacks a gene symbol.", syns);
3977  }
3978 
3979  SPGetOneGeneRefNew(pp, feats, seqlen, name, syns, ltags, orfs);
3980  SPFreeGenRefTokens(name, syns, ltags, orfs);
3981  name = nullptr;
3982  syns = nullptr;
3983  ltags = nullptr;
3984  orfs = nullptr;
3985  } else {
3986  ErrPostEx(SEV_REJECT, ERR_FORMAT_UnknownGeneField, "Field \"%s\" is not a legal field for the GN linetype. Entry dropped.", p);
3987  ibp->drop = true;
3988  break;
3989  }
3990  }
3991 
3992  MemFree(str);
3993 
3994  if (! name && ! syns && ! ltags && ! orfs)
3995  return;
3996 
3997  if (ibp->drop) {
3998  SPFreeGenRefTokens(name, syns, ltags, orfs);
3999  return;
4000  }
4001 
4002  SPGetOneGeneRefNew(pp, feats, seqlen, name, syns, ltags, orfs);
4003 
4004  SPFreeGenRefTokens(name, syns, ltags, orfs);
4005 }
4006 
4007 /**********************************************************
4008  *
4009  * static Int4 GetSeqLen(entry):
4010  *
4011  * 11-3-93
4012  *
4013  **********************************************************/
4015 {
      /* entry->mpData is used as an EntryBlk; the value returned is the
         length reported by the Bioseq held in its seq_entry. */
4016  EntryBlkPtr ebp = static_cast<EntryBlk*>(entry->mpData);
4017  const CBioseq& bioseq = ebp->seq_entry->GetSeq();
4018  return bioseq.GetLength();
4019 }
4020 
4021 /**********************************************************
4022  *
4023  * static void SPFeatGeneRef(pp, hsfp, entry):
4024  *
4025  * sfp->mpData: gene (Gene-ref).
4026  * Data from GN lines:
4027  * - legal characters for gene_name are 0-9, a-z, A-Z,
4028  * under-score, dash, period, single quote, back
4029  * single quote, slash;
4030  * - each Gene-ref separated by " AND ";
4031  * - gene_name and synonyms separated by " OR ", the
4032  * first one before " OR " is gene_name, others are
4033  * synonyms.
4034  *
4035  * sfp->location: SEQLOC_INT, always from 0 to
4036  * sequence_length.
4037  *
4038  * Output warning message:
4039  * - if DE line containing "(GENE NAME:...)" clause
4040  * (SPFeatProtRef routine);
4041  * - or other illegal character s.t. white space in the
4042  * gene_name.
4043  *
4044  * 10-25-93
4045  *
4046  **********************************************************/
4048 {
      /* Looks up the GN block of the entry and turns its text into gene
         features appended to "feats". Nothing is done if the entry has
         no GN block. */
4049  char* offset;
4050  char* str;
4051 
4052  size_t len = 0;
4053  offset = SrchNodeType(entry, ParFlatSP_GN, &len);
4054  if (! offset)
4055  return;
4056 
      /* NOTE(review): no assignment to str is visible before this call
         (the listing numbering jumps from 4056 to 4058) - confirm against
         the repository copy that str is set from the GN block here. */
4058  StripECO(str);
4059  if (! str)
4060  return;
4061 
      /* from here on len holds the sequence length, not the block size */
4062  len = GetSeqLen(entry);
      /* Text without any of the four field names goes to the parser for
         the plain gene-name list; otherwise the field parser is used. */
4063  if (! StringIStr(str, "Name=") &&
4064  ! StringIStr(str, "Synonyms=") &&
4065  ! StringIStr(str, "OrderedLocusNames=") &&
4066  ! StringIStr(str, "ORFNames="))
4067  GetOneGeneRef(pp, feats, str, len);
4068  else
4069  SPGetGeneRefsNew(pp, feats, str, len);
4070 
4071  MemFree(str);
4072 }
4073 
4074 /**********************************************************/
4075 static void SPValidateEcnum(string& ecnum)
4076 {
4077  char* p;
4078  char* q;
4079  char* buf;
4080  Int4 count;
4081 
4082  buf = StringSave(ecnum.c_str());
4083  for (count = 0, q = buf;; q = p) {
4084  p = q;
4085  count++;
4086  if (*p == '-') {
4087  p++;
4088  if (*p != '.')
4089  break;
4090  p++;
4091  continue;
4092  }
4093  if (*p == 'n') {
4094  p++;
4095  if (*p == '.' || *p == '\0') {
4096  count = 0;
4097  break;
4098  }
4099  }
4100  while (*p >= '0' && *p <= '9')
4101  p++;
4102  if (*q == 'n' && (*p == '.' || *p == '\0')) {
4103  fta_StringCpy(q + 1, p);
4104  p = q + 1;
4105  }
4106  if (p == q) {
4107  count = 0;
4108  break;
4109  }
4110  if (*p != '.')
4111  break;
4112  p++;
4113  }
4114 
4115  if (count != 4 || *p != '\0') {
4116  ErrPostEx(SEV_ERROR, ERR_FORMAT_InvalidECNumber, "Invalid EC number provided in SwissProt DE line: \"%s\". Preserve it anyway.", ecnum.c_str());
4117  } else
4118  ecnum = buf;
4119  MemFree(buf);
4120 }
4121 
4122 /**********************************************************/
4124 {
      /* Walks the field list from sfp and adds to the protein names the
         text of every field whose tag equals "tag". The walk ends at the
         first RecName/AltName/SubName/Flags field or at the end of the
         list. */
4125  Char ch;
4126 
4127  for (; sfp; sfp = sfp->next) {
4128  if (sfp->tag == SPDE_RECNAME || sfp->tag == SPDE_ALTNAME ||
4129  sfp->tag == SPDE_SUBNAME || sfp->tag == SPDE_FLAGS)
4130  break;
4131  if (sfp->tag != tag)
4132  continue;
4133 
      /* The field text is the range start..end: terminate it at "end"
         for the copy, then put the saved character back. */
4134  ch = *sfp->end;
4135  *sfp->end = '\0';
4136 
4137  prot.SetName().push_back(sfp->start);
4138  *sfp->end = ch;
4139  }
4140 }
4141 
4142 /**********************************************************/
4143 static void SPValidateDefinition(SPDEFieldsPtr sfp, bool* drop, bool is_trembl)
4144 {
4145  SPDEFieldsPtr tsfp;
4146  Int4 rcount;
4147  Int4 scount;
4148  Int4 fcount;
4149 
4150  for (rcount = 0, scount = 0, tsfp = sfp; tsfp; tsfp = tsfp->next) {
4151  if (tsfp->tag == SPDE_RECNAME)
4152  rcount++;
4153  else if (tsfp->tag == SPDE_SUBNAME)
4154  scount++;
4155  }
4156 
4157  for (fcount = 0, tsfp = sfp; tsfp; tsfp = tsfp->next) {
4158  if (tsfp->tag != SPDE_RECNAME)
4159  continue;
4160  for (tsfp = tsfp->next; tsfp; tsfp = tsfp->next) {
4161  if (tsfp->tag == SPDE_RECNAME || tsfp->tag == SPDE_ALTNAME ||
4162  tsfp->tag == SPDE_SUBNAME || tsfp->tag == SPDE_FLAGS)
4163  break;
4164  if (tsfp->tag == SPDE_FULL)
4165  fcount++;
4166  }
4167  if (! tsfp)
4168  break;
4169  }
4170 
4171  if (rcount > 1) {
4172  ErrPostEx(SEV_REJECT, ERR_FORMAT_MultipleRecName, "This UniProt record has multiple RecName protein-name categories, but only one is allowed. Entry dropped.");
4173  *drop = true;
4174  } else if (rcount == 0 && ! is_trembl) {
4175  ErrPostEx(SEV_REJECT, ERR_FORMAT_MissingRecName, "This UniProt/Swiss-Prot record lacks required RecName protein-name categorie. Entry dropped.");
4176  *drop = true;
4177  }
4178 
4179  if (scount > 0 && ! is_trembl) {
4180  ErrPostEx(SEV_REJECT, ERR_FORMAT_SwissProtHasSubName, "This UniProt/Swiss-Prot record includes a SubName protein-name category, which should be used only for UniProt/TrEMBL. Entry dropped.");
4181  *drop = true;
4182  }
4183 
4184  if (fcount == 0 && rcount > 0) {
4185  ErrPostEx(SEV_REJECT, ERR_FORMAT_MissingFullRecName, "This UniProt record lacks a Full name in the RecName protein-name category.");
4186  *drop = true;
4187  }
4188 }
4189 
4190 /**********************************************************/
/* Splits a DE text that starts with "RecName: ", "AltName: " or
   "SubName: " into a linked list of tagged fields and validates that
   list with SPValidateDefinition(), which may set ibp->drop. Any other
   text, or a null str, is ignored. */
4191 static void SPParseDefinition(char* str, const CBioseq::TId& ids, IndexblkPtr ibp, CProt_ref& prot)
4192 {
4193  CharIntLen* cilp;
4194  SPDEFieldsPtr sfp;
4195  SPDEFieldsPtr tsfp;
4196 
4197  bool is_trembl;
4198  char* p;
4199  char* q;
4200  char* r;
4201  Int4 count;
4202  Char ch;
4203 
4204  if (! str || (! StringEquNI(str, "RecName: ", 9) &&
4205  ! StringEquNI(str, "AltName: ", 9) &&
4206  ! StringEquNI(str, "SubName: ", 9)))
4207  return;
4208 
4209  is_trembl = false;
4210 
      /* A Swiss-Prot id whose release is "unreviewed" (any case) marks
         the record as TrEMBL for the validation below. */
4211  for (const auto& id : ids) {
4212  if (! id->IsSwissprot())
4213  continue;
4214 
4215  if (id->GetSwissprot().IsSetRelease() &&
4216  NStr::CompareNocase(id->GetSwissprot().GetRelease().c_str(), "unreviewed") == 0)
4217  is_trembl = true;
4218  }
4219 
      /* sfp is a head node with tag 0; the fields hang off sfp->next */
4220  sfp = new SPDEFields;
4221  sfp->tag = 0;
4222  sfp->next = nullptr;
4223 
      /* Go through str one blank-separated word at a time; q is the start
         of the word, p the character right after it. */
4224  for (tsfp = sfp, p = str, count = 0; *p != '\0';) {
4225  while (*p == ' ')
4226  p++;
4227  for (q = p; *p != '\0' && *p != ' ';)
4228  p++;
      /* terminate the word for the tag lookup in spde_tags, then restore */
4229  ch = *p;
4230  *p = '\0';
4231  for (cilp = spde_tags; cilp->str; cilp++)
4232  if (StringEquNI(cilp->str, q, cilp->len))
4233  break;
4234 
4235  *p = ch;
      /* not a tag: the word belongs to the value of the current field */
4236  if (! cilp->str)
4237  continue;
4238 
      /* A new tag closes the previous field: its end is put right after
         the last character that is neither a blank nor ";". */
4239  if (tsfp->tag != 0) {
4240  if (q == tsfp->start)
4241  tsfp->end = q;
4242  else {
4243  for (r = q - 1; *r == ' ' || *r == ';';)
4244  r--;
4245  tsfp->end = r + 1;
4246  }
4247  }
4248 
      /* parsing stops at the first Includes/Contains tag */
4249  if (cilp->num == SPDE_INCLUDES || cilp->num == SPDE_CONTAINS)
4250  break;
4251 
      /* append a node for the new field; its value starts after the tag
         and any blanks following it */
4252  count++;
4253  tsfp->next = new SPDEFields;
4254  tsfp = tsfp->next;
4255  tsfp->tag = cilp->num;
4256  for (r = q + cilp->len; *r == ' ';)
4257  r++;
4258  tsfp->start = r;
4259  tsfp->next = nullptr;
4260  }
4261 
      /* the text ran out inside the last field: it ends at the terminator */
4262  if (*p == '\0')
4263  tsfp->end = p;
4264 
4265  SPValidateDefinition(sfp->next, &ibp->drop, is_trembl);
4266 
      /* NOTE(review): the statements controlled by the six "if" lines
         below are not visible in this listing (its numbering skips 4269,
         4272, 4276, 4279, 4283 and 4286), and no release of the nodes
         allocated above is visible either - confirm both against the
         repository copy. */
4267  for (tsfp = sfp->next; tsfp; tsfp = tsfp->next)
4268  if (tsfp->tag == SPDE_RECNAME)
4270  for (tsfp = sfp->next; tsfp; tsfp = tsfp->next)
4271  if (tsfp->tag == SPDE_RECNAME)
4273 
4274  for (tsfp = sfp->next; tsfp; tsfp = tsfp->next)
4275  if (tsfp->tag == SPDE_ALTNAME)
4277  for (tsfp = sfp->next; tsfp; tsfp = tsfp->next)
4278  if (tsfp->tag == SPDE_ALTNAME)
4280 
4281  for (tsfp = sfp->next; tsfp; tsfp = tsfp->next)
4282  if (tsfp->tag == SPDE_SUBNAME)
4284  for (tsfp = sfp->next; tsfp; tsfp = tsfp->next)
4285  if (tsfp->tag == SPDE_SUBNAME)
4287 }
4288 
4289 /**********************************************************/
4290 static void SPGetPEValue(DataBlkPtr entry, CSeq_feat& feat)
4291 {
4292  char* offset;
4293  char* buf;
4294  char* p;
4295  char* q;
4296  Char ch;
4297 
4298  size_t len = 0;
4299  offset = SrchNodeType(entry, ParFlatSP_PE, &len);
4300  if (! offset || len < 1)
4301  return;
4302 
4303  ch = offset[len - 1];
4304  offset[len - 1] = '\0';
4305  buf = StringSave(offset);
4306  offset[len - 1] = ch;
4307 
4308  for (q = buf + 2; *q == ' ';)
4309  q++;
4310  p = StringChr(q, ':');
4311  if (p)
4312  for (p++; *p == ' ';)
4313  p++;
4314  else
4315  p = q;
4316 
4317  q = StringRChr(p, ';');
4318  if (! q)
4319  q = StringChr(p, '\n');
4320  if (q)
4321  *q = '\0';
4322 
4323  if (MatchArrayIString(PE_values, p) < 0)
4324  ErrPostEx(SEV_ERROR, ERR_SPROT_PELine, "Unrecognized value is encountered in PE (Protein Existence) line: \"%s\".", p);
4325 
4326  CRef<CGb_qual> qual(new CGb_qual);
4327  qual->SetQual("UniProtKB_evidence");
4328  qual->SetVal(p);
4329  feat.SetQual().push_back(qual);
4330 
4331  MemFree(buf);
4332 }
4333 
4334 /**********************************************************
4335  *
4336  * static SeqFeatPtr SPFeatProtRef(pp, hsfp, entry,
4337  * spfbp):
4338  *
4339  * sfp->data: prot (Prot-ref):
4340  * - name: DE line, delete everything after " (" or "/";
4341  * - EC_number: if DE lines contains "(EC ...)".
4342  *
4343  * sfp->location: SEQLOC_INT, always from 0 to
4344  * sequence_length.
4345  *
4346  * 10-20-93
4347  *
4348  **********************************************************/
4350 {
      /* Builds the protein feature of the entry from its DE block: names
         and EC numbers go into the Prot-ref, the location covers the
         whole sequence, and the PE value is added as a qualifier. Nothing
         is done if the entry has no DE block. */
4351  IndexblkPtr ibp;
4352 
4353  char* offset;
4354 
4355  char* str;
4356  string str1;
4357 
4358  char* ptr;
4359  char* bptr;
4360  char* eptr;
4361  char* s;
4362 
4363  const char* tag;
4364  Char symb;
4365  Int4 shift;
4366 
4367  EntryBlkPtr ebp;
4368 
4369  ebp = static_cast<EntryBlk*>(entry->mpData);
4370 
4371  CSeq_entry& seq_entry = *ebp->seq_entry;
4372  CBioseq& bioseq = seq_entry.SetSeq();
4373 
4374  size_t len = 0;
4375  offset = SrchNodeType(entry, ParFlatSP_DE, &len);
4376  if (! offset)
4377  return;
4378 
4379  CRef<CSeq_feat> feat(new CSeq_feat);
4380  CProt_ref& prot = feat->SetData().SetProt();
4381 
4382  bptr = offset;
4383  eptr = bptr + len;
4384 
      /* NOTE(review): no assignment to str is visible before this call
         (the listing numbering jumps from 4384 to 4386) - confirm against
         the repository copy that str is set from the DE block here. */
4386  StripECO(str);
      /* drop the periods, semicolons and commas that end the text */
4387  s = str + StringLen(str) - 1;
4388  while (s >= str && (*s == '.' || *s == ';' || *s == ','))
4389  *s-- = '\0';
4390 
4391  ShrinkSpaces(str);
4392 
4393  ibp = pp->entrylist[pp->curindx];
4394 
      /* text that starts right away with Contains/Includes has no name
         category of its own: the entry is marked as dropped */
4395  if (StringEquNI(str, "Contains: ", 10) ||
4396  StringEquNI(str, "Includes: ", 10)) {
4397  ErrPostEx(SEV_REJECT, ERR_FORMAT_NoProteinNameCategory, "DE lines do not have a non-Includes/non-Contains RecName, AltName or SubName protein name category. Entry dropped.");
4398  ibp->drop = true;
4399  }
4400 
      /* tag, symb and shift describe how an EC number is marked up: as
         "; EC=<number>;" in text with name categories, as "(EC <number>)"
         otherwise. Only the former goes through SPParseDefinition(). */
4401  if (StringEquNI(str, "RecName: ", 9) ||
4402  StringEquNI(str, "AltName: ", 9) ||
4403  StringEquNI(str, "SubName: ", 9)) {
4404  tag = "; EC=";
4405  symb = ';';
4406  shift = 5;
4407  SPParseDefinition(str, bioseq.GetId(), ibp, prot);
4408  } else {
4409  tag = "(EC";
4410  symb = ')';
4411  shift = 3;
4412  }
4413 
      /* Each pass takes one EC number out of str: str1 gets the text in
         front of the tag, the number is validated and stored, and str is
         rebuilt from str1 plus the text that follows the number. */
4414  while ((ptr = StringStr(str, tag))) {
4415  len = StringLen(str);
4416  str1.assign(str, ptr);
4417 
4418  ptr += shift;
4419  while (*ptr == ' ')
4420  ptr++;
4421 
      /* the number runs up to a blank, the closing symbol or the end */
4422  for (bptr = ptr; *ptr != '\0' && *ptr != ' ' && *ptr != symb;)
4423  ptr++;
4424  if (ptr > bptr) {
4425  string ecnum(bptr, ptr);
4426  SPValidateEcnum(ecnum);
4427 
4428  if (! ecnum.empty())
4429  prot.SetEc().push_back(ecnum);
4430  } else {
4431  ErrPostEx(SEV_WARNING, ERR_FORMAT_ECNumberNotPresent, "Empty EC number provided in SwissProt DE line.");
4432  }
4433 
      /* "(EC ...)" form: also skip the blanks and ")" after the number.
         NOTE(review): the statement controlled by the StringLen() test
         below is not visible in this listing (numbering skips 4438) -
         confirm against the repository copy. */
4434  if (symb == ')') {
4435  while (*ptr != '\0' && (*ptr == ' ' || *ptr == symb))
4436  ptr++;
4437  if (StringLen(ptr) <= 1)
4439  }
4440 
4441  str1 += ptr;
4442 
4443  MemFree(str);
4444  str = StringSave(str1.c_str());
4445  }
4446 
      /* "(EC ...)" form only: keep cutting str at the first " (" or,
         if there is none, the first " /".
         NOTE(review): a statement appears to be missing after the
         assign() below (numbering skips 4451) - confirm against the
         repository copy. */
4447  if (symb == ')') {
4448  while ((ptr = StringStr(str, " (")) ||
4449  (ptr = StringStr(str, " /"))) {
4450  str1.assign(str, ptr);
4452 
4453  MemFree(str);
4454  str = StringSave(str1.c_str());
4455  }
4456  }
4457 
      /* no name was set so far: what is left of str is the name */
4458  if (! prot.IsSetName())
4459  prot.SetName().push_back(str);
4460 
4461  MemFree(str);
4462 
4463  feat->SetLocation(*GetSeqLocIntSP(GetSeqLen(entry), ibp->acnum, pp->accver, ibp->vernum));
4464 
      /* a non-terminal sequence makes the feature partial, with a fuzzy
         limit on each end that is flagged as open */
4465  if (spfbp->nonter) {
4466  feat->SetPartial(true);
4467 
4468  if (spfbp->noleft)
4469  GetIntFuzzPtr(4, 2, 0, feat->SetLocation().SetInt().SetFuzz_from()); /* lim, lt, no-min */
4470  if (spfbp->noright)
4471  GetIntFuzzPtr(4, 1, 0, feat->SetLocation().SetInt().SetFuzz_to()); /* lim, gt, no-min */
4472  }
4473 
4474  SPGetPEValue(entry, *feat);
4475 
4476  feats.push_back(feat);
4477 }
4478 
4479 /**********************************************************
4480  *
4481  * static SPSegLocPtr GetSPSegLocInfo(sep, spfip, spfbp):
4482  *
4483  * Return a linked list of segment location information,
4484  * data from NON_CONS and change the modif of the sep of
4485  * the bsp->descr to partial.
4486  *
4487  * If input has NON_CONS: 17..18, 31..32, 65..66, and
4488  * total seqlen = 100, then SPSegLocPtr, spslp, will have
4489  * 4 nodes, each node has
4490  * 0, 16, 16-0+1=17, 1, 4, XXXX_1, descr of XXXX_1, add no-right
4491  * 17, 30, 30-17+1=14, 2, 4, XXXX_2, descr of XXXX_2, add no-right, no-left
4492  * 31, 64, 64-31+1=34, 3, 4, XXXX_3, descr of XXXX_3, add no-right, no-left
4493  * 65, 99, 99-65+1=35, 4, 4, XXXX_4, descr of XXXX_4, add no-left
4494  * where XXXX is locus (ID) name.
4495  *
4496  * Set hspslp->fuzzfrom = TRUE if spfbp->noleft = TRUE.
4497  * Set hspslp->fuzzto = TRUE if spfbp->noright = TRUE.
4498  *
4499  * 11-5-93
4500  *
4501  **********************************************************/
4503 {
4504  SPSegLocPtr curspslp = nullptr;
4505  SPSegLocPtr hspslp = nullptr;
4506  SPSegLocPtr spslp;
4507  const char* p;
4508 
4509  if (! spfip)
4510  return nullptr;
4511 
4512  /* get location range
4513  */
4514  for (; spfip; spfip = spfip->next) {
4515  if (spfip->key != "NON_CONS")
4516  continue;
4517 
4518  if (! hspslp) {
4519  spslp = new SPSegLoc;
4520  p = spfip->from.c_str();
4521  if (*p == '<' || *p == '>' || *p == '?')
4522  p++;
4523 
4524  spslp->len = atoi(p);
4525  hspslp = spslp;
4526  curspslp = spslp;
4527  } else {
4528  p = spfip->from.c_str();
4529  if (*p == '<' || *p == '>' || *p == '?')
4530  p++;
4531  curspslp->len = atoi(p) - curspslp->from;
4532  }
4533 
4534  spslp = new SPSegLoc;
4535  p = spfip->from.c_str();
4536  if (*p == '<' || *p == '>' || *p == '?')
4537  p++;
4538  spslp->from = atoi(p);
4539  curspslp->next = spslp;
4540  curspslp = spslp;
4541  }
4542 
4543  for (auto& descr : bioseq.SetDescr().Set()) {
4544  if (! descr->IsMolinfo())
4545  continue;
4546 
4547  if (spfbp->noleft && spfbp->noright)
4548  descr->SetMolinfo().SetCompleteness(CMolInfo::eCompleteness_no_ends);
4549  else if (spfbp->noleft)
4550  descr->SetMolinfo().SetCompleteness(CMolInfo::eCompleteness_no_left);
4551  else if (spfbp->noright)
4552  descr->SetMolinfo().SetCompleteness(CMolInfo::eCompleteness_no_right);
4553  }
4554 
4555  if (hspslp)
4556  curspslp->len = bioseq.GetLength() - curspslp->from;
4557 
4558  return (hspslp);
4559 }
4560 
4561 /**********************************************************
4562  *
4563  * static void CkInitMetSP(pp, spfip, sep, spfbp):
4564  *
4565  * 11-1-93
4566  *
4567  **********************************************************/
4568 static void CkInitMetSP(ParserPtr pp, SPFeatInputPtr spfip, CSeq_entry& seq_entry, SPFeatBlnPtr spfbp)
4569 {
4570  SPFeatInputPtr temp;
4571  const char* p;
4572  Int2 count;
4573  Int4 from = 0;
4574  Int4 to;
4575 
4576  for (count = 0; spfip; spfip = spfip->next) {
4577  if (spfip->key != "INIT_MET")
4578  continue;
4579 
4580  if (count > 0)
4581  break;
4582 
4583  count++;
4584  p = spfip->from.c_str();
4585  if (*p == '<' || *p == '>' || *p == '?')
4586  p++;
4587  from = atoi(p);
4588  p = spfip->to.c_str();
4589  if (*p == '<' || *p == '>' || *p == '?')
4590  p++;
4591  to = atoi(p);
4592 
4593  if ((from != 0 || to != 0) && (from != 1 || to != 1))
4594  break;
4595  temp = spfip;
4596  }
4597 
4598  if (count == 0)
4599  return;
4600 
4601  if (spfip) {
4602  ErrPostEx(SEV_ERROR, ERR_FEATURE_Invalid_INIT_MET, "Either incorrect or more than one INIT_MET feature provided.");
4603  return;
4604  }
4605 
4606  if (! temp->descrip.empty()) {
4607  ErrPostEx(SEV_WARNING, ERR_FEATURE_ExpectEmptyComment, "%s:%d-%d has description: %s", temp->key.c_str(), from, to, temp->descrip.c_str());
4608  }
4609 
4610 
4611  CBioseq& bioseq = seq_entry.SetSeq();
4612 
4613  CSeq_data& data = bioseq.SetInst().SetSeq_data();
4614  string& sequence = data.SetIupacaa().Set();
4615 
4616  if (from == 0) {
4617  spfbp->initmet = true;
4618 
4619  /* insert "M" in the front
4620  */
4621  sequence.insert(sequence.begin(), 'M');
4622  bioseq.SetInst().SetLength(static_cast<TSeqPos>(sequence.size()));
4623  } else if (sequence.empty() || sequence[0] != 'M')
4624  ErrPostEx(SEV_ERROR, ERR_FEATURE_MissingInitMet, "The required Init Met is missing from the sequence.");
4625 }
4626 
4627 /**********************************************************
4628  *
4629  * static void CkNonTerSP(pp, spfip, sep, spfbp):
4630  *
4631  * Set spfbp->nonter = spfbp->noleft = TRUE if
4632  * NON_TER 1..1.
4633  * Set spfbp->nonter = spfbp->noright = TRUE if
4634  * NON_TER base..base.
4635  * Set bsp->descr of modif = partial if there is more
4636  * than 5 contiguous unsequenced residues, X.
4637  *
4638  * 11-2-93
4639  *
4640  **********************************************************/
/* Records NON_TER information in `spfbp` and cross-checks it, together with the
 * presence of NON_CONS features, against the MolInfo completeness of the
 * entry's bioseq. */
4641 static void CkNonTerSP(ParserPtr pp, SPFeatInputPtr spfip, CSeq_entry& seq_entry, SPFeatBlnPtr spfbp)
4642 {
4643  SPFeatInputPtr temp;
4644  Int4 from;
4645  Int4 ctr;
4646  bool segm;
4647 
4648  CMolInfo* mol_info = nullptr;
4649  CBioseq& bioseq = seq_entry.SetSeq();
4650 
4651  ctr = 0;
/* Pick the first MolInfo descriptor, if any. */
4652  for (auto& descr : bioseq.SetDescr().Set()) {
4653  if (! descr->IsMolinfo())
4654  continue;
4655 
4656  mol_info = &(descr->SetMolinfo());
4657  break;
4658  }
4659 
/* Scan the features: any NON_CONS sets `segm`; a NON_TER with equal endpoints
 * sets nonter plus noleft (position 1) or noright (position == bases). */
4660  segm = false;
4661  for (temp = spfip; temp; temp = temp->next) {
4662  if (temp->key == "NON_CONS") {
4663  segm = true;
4664  continue;
4665  }
4666 
4667  if (temp->key != "NON_TER")
4668  continue;
4669 
/* NOTE(review): unlike CkInitMetSP, no leading '<', '>' or '?' is skipped
 * before the conversion - confirm NStr::StringToInt accepts every value that
 * can appear here. */
4670  from = NStr::StringToInt(temp->from);
4671  if (from != NStr::StringToInt(temp->to)) {
4672  ErrPostStr(SEV_WARNING, ERR_FEATURE_UnEqualEndPoint, "NON_TER has unequal endpoints");
4673  continue;
4674  }
4675 
4676  if (from == 1) {
4677  spfbp->nonter = true;
4678  spfbp->noleft = true;
4679  } else if (from == (Int4)pp->entrylist[pp->curindx]->bases) {
4680  spfbp->nonter = true;
4681  spfbp->noright = true;
4682  } else {
4683  ErrPostStr(SEV_WARNING, ERR_FEATURE_NotSeqEndPoint, "NON_TER is not at a sequence endpoint.");
4684  }
4685  }
4686 
/* The flags above are set even when there is no MolInfo; only the consistency
 * checks below are skipped. */
4687  if (! mol_info)
4688  return;
4689 
/* NOTE(review): the literal 2 is presumably CMolInfo::eCompleteness_partial,
 * the named constant used by the sibling branches - confirm.
 * NOTE(review): the listing numbering skips 4691, 4694 and 4711, so one
 * statement is absent from each of the first two branches and from the
 * `ctr == 5` block. The header comment says completeness is set to partial for
 * a run of X residues; whether the other two did the same cannot be told from
 * here. */
4690  if (segm && mol_info->GetCompleteness() != 2) {
4692  ErrPostEx(SEV_WARNING, ERR_FEATURE_NoFragment, "Found NON_CONS in FT line but no FRAGMENT in DE line.");
4693  } else if (spfbp->nonter && mol_info->GetCompleteness() != CMolInfo::eCompleteness_partial) {
4695  ErrPostEx(SEV_WARNING, ERR_FEATURE_NoFragment, "Found NON_TER in FT line but no FRAGMENT in DE line.");
4696  } else if (! spfbp->nonter && mol_info->GetCompleteness() == CMolInfo::eCompleteness_partial && ! segm) {
4697  ErrPostEx(SEV_WARNING, ERR_FEATURE_PartialNoNonTerNonCons, "Entry is partial but has no NON_TER or NON_CONS features.");
4698  } else if (mol_info->GetCompleteness() != 2) {
4699  if (bioseq.GetInst().IsSetSeq_data()) {
4700  const CSeq_data& data = bioseq.GetInst().GetSeq_data();
4701  const string& sequence = data.GetIupacaa().Get();
4702 
/* Look for a run of consecutive 'X' residues; the scan stops at the fifth one
 * in a row (the header comment says "more than 5"). */
4703  for (string::const_iterator value = sequence.begin(); value != sequence.end(); ++value) {
4704  if (*value != 'X') {
4705  ctr = 0; /* reset counter */
4706  continue;
4707  }
4708 
4709  ctr++;
4710  if (ctr == 5) {
4712  break;
4713  }
4714  }
4715  }
4716  }
4717 }
4718 
4719 /**********************************************************/
4720 static void SeqToDeltaSP(CBioseq& bioseq, SPSegLocPtr spslp)
4721 {
4722  if (! spslp || ! bioseq.GetInst().IsSetSeq_data())
4723  return;
4724 
4725  CSeq_ext::TDelta& deltas = bioseq.SetInst().SetExt().SetDelta();
4726  const string& bioseq_data = bioseq.GetInst().GetSeq_data().GetIupacaa().Get();
4727 
4728  for (; spslp; spslp = spslp->next) {
4730  if (! deltas.Set().empty()) {
4731  delta->SetLiteral().SetLength(0);
4732  delta->SetLiteral().SetFuzz().SetLim();
4733  deltas.Set().push_back(delta);
4734 
4735  delta.Reset(new CDelta_seq);
4736  }
4737 
4738  delta->SetLiteral().SetLength(spslp->len);
4739 
4740 
4741  string data_str = bioseq_data.substr(spslp->from, spslp->len);
4742 
4743  delta->SetLiteral().SetSeq_data().SetIupacaa().Set(data_str);
4744  deltas.Set().push_back(delta);
4745  }
4746 
4747  if (deltas.Set().size() > 1) {
4748  bioseq.SetInst().SetRepr(CSeq_inst::eRepr_delta);
4749  bioseq.SetInst().ResetSeq_data();
4750  } else
4751  bioseq.SetInst().SetExt().Reset();
4752 }
4753 
4754 /**********************************************************
4755  *
4756  * static void GetSPAnnot(pp, entry, protconv):
4757  *
4758  * 10-15-93
4759  *
4760  **********************************************************/
4761 static void GetSPAnnot(ParserPtr pp, DataBlkPtr entry, unsigned char* protconv)
4762 {
4763  SPFeatInputPtr spfip;
4764  EntryBlkPtr ebp;
4765 
4766  SPFeatBlnPtr spfbp;
4767  SPSegLocPtr spslp; /* segment location, data from NON_CONS */
4768  SPSegLocPtr next;
4769 
4770  ebp = static_cast<EntryBlk*>(entry->mpData);
4771  CSeq_entry& seq_entry = *ebp->seq_entry;
4772 
4773  spfbp = new SPFeatBln;
4774  spfip = ParseSPFeat(entry, pp->entrylist[pp->curindx]->bases);
4775 
4777 
4778  if (spfip) {
4779  CkNonTerSP(pp, spfip, seq_entry, spfbp);
4780  CkInitMetSP(pp, spfip, seq_entry, spfbp);
4781  SPFeatGeneral(pp, spfip, spfbp->initmet, feats);
4782  }
4783 
4784  SPFeatGeneRef(pp, feats, entry); /* GN line */
4785  SPFeatProtRef(pp, feats, entry, spfbp); /* DE line */
4786 
4787  CBioseq& bioseq = seq_entry.SetSeq();
4788 
4789  spslp = GetSPSegLocInfo(bioseq, spfip, spfbp); /* checking NON_CONS key */
4790  if (spslp)
4791  SeqToDeltaSP(bioseq, spslp);
4792 
4793  if (! feats.empty()) {
4794  CRef<CSeq_annot> annot(new CSeq_annot);
4795  annot->SetData().SetFtable().swap(feats);
4796  bioseq.SetAnnot().push_back(annot);
4797  }
4798 
4799  for (; spslp; spslp = next) {
4800  next = spslp->next;
4801  delete spslp;
4802  }
4803 
4804  FreeSPFeatInputSet(spfip);
4805  delete spfbp;
4806 }
4807 
4808 /**********************************************************/
/* Turns one loaded SWISS-PROT entry into a Seq-entry stored in its EntryBlk:
 * splits the text into keyword blocks and sub-blocks, creates the bioseq,
 * then fills in descriptors, instance and annotation. */
4809 static void SpPrepareEntry(ParserPtr pp, DataBlkPtr entry, unsigned char* protconv)
4810 {
4811  Int2 curkw;
4812  char* ptr;
4813  char* eptr;
4814  EntryBlkPtr ebp;
4815 
4816  ebp = static_cast<EntryBlk*>(entry->mpData);
4817  ptr = entry->mOffset;
4818  eptr = ptr + entry->len;
/* Chain the keyword blocks until GetEmblBlock reports ParFlatSP_END through curkw. */
4819  for (curkw = ParFlatSP_ID; curkw != ParFlatSP_END;) {
4820  ptr = GetEmblBlock(&ebp->chain, ptr, &curkw, pp->format, eptr);
4821  }
4822  GetSprotSubBlock(pp, entry);
4823 
/* Only when the index block has no base count yet. */
4824  if (pp->entrylist[pp->curindx]->bases == 0) {
4825  SpAddToIndexBlk(entry, pp->entrylist[pp->curindx]);
4826  }
4827 
/* The new bioseq is stored in the entry and also registered with the scope. */
4828  CRef<CBioseq> bioseq = CreateEntryBioseq(pp);
4829  ebp->seq_entry.Reset(new CSeq_entry);
4830  ebp->seq_entry->SetSeq(*bioseq);
4831  GetScope().AddBioseq(*bioseq);
4832 
4833  GetSprotDescr(*bioseq, pp, entry);
4834 
4835  GetSPInst(pp, entry, protconv);
4836  GetSPAnnot(pp, entry, protconv);
4837 
/* NOTE(review): `entries` has no visible declaration, and the listing
 * numbering skips 4838, 4840 and 4843, so the statements that declare and
 * consume it (including the body of the `pp->citat` branch) are absent -
 * restore them from the real source. */
4839  entries.push_back(ebp->seq_entry);
4841 
4842  if (pp->citat) {
4844  }
4845 }
4846 
4847 /**********************************************************
4848  *
4849  * bool SprotAscii(pp):
4850  *
4851  * Return FALSE if loading an entry block failed.
4852  *
4853  * 3-23-93
4854  *
4855  **********************************************************/
/* Body of SprotAscii (the name given in the header comment above): parses every
 * indexed entry that is not dropped and moves the resulting Seq-entries into
 * pp->entries.
 * NOTE(review): the function's signature line is not present in this listing;
 * `pp` is used below without a visible declaration and is presumably its only
 * parameter - confirm. */
4857 {
4858  DataBlkPtr entry;
4859 
4860  Int4 total;
4861  Int4 i;
4862  IndexblkPtr ibp;
4863  Int4 imax;
4864 
4865  auto protconv = GetProteinConv();
4866 
4867  for (total = 0, i = 0, imax = pp->indx; i < imax; i++) {
4868  pp->curindx = i;
4869  ibp = pp->entrylist[i];
4870 
4871  err_install(ibp, pp->accver);
4872 
4873  if (! ibp->drop) {
4874  entry = LoadEntry(pp, ibp->offset, ibp->len);
/* NOTE(review): the listing numbering skips 4876, so a statement that ran
 * before this early return is absent. */
4875  if (! entry) {
4877  return false;
4878  }
4879 
4880  SpPrepareEntry(pp, entry, protconv.get());
4881 
/* `drop` is tested again because preparing the entry can set it. */
4882  if (! ibp->drop) {
4883  CRef<CSeq_entry>& cur_entry = (static_cast<EntryBlk*>(entry->mpData))->seq_entry;
4884  pp->entries.push_back(cur_entry);
4885 
4886  cur_entry.Reset();
4887  }
/* NOTE(review): `entry` is not released in the visible code (the delete below
 * is commented out) - confirm who owns the block returned by LoadEntry. */
4888  // delete entry;
4889  }
4890  if (! ibp->drop) {
4891  total++;
4892  ErrPostEx(SEV_INFO, ERR_ENTRY_Parsed, "OK - entry \"%s|%s\" parsed successfully", ibp->locusname, ibp->acnum);
4893  } else {
4894  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry \"%s|%s\" skipped", ibp->locusname, ibp->acnum);
4895  }
4896  }
4897 
/* NOTE(review): the listing numbering skips 4898; a statement may be missing here. */
4899 
4900  ErrPostEx(SEV_INFO, ERR_ENTRY_ParsingComplete, "Parsing completed, %d entr%s parsed", total, (total == 1) ? "y" : "ies");
4901  return true;
4902 }
4902 }
4903 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
char * tata_save(char *str)
Definition: add.cpp:147
void StripECO(char *str)
Definition: add.cpp:2834
void err_install(const Indexblk *ibp, bool accver)
Definition: add.cpp:290
void BuildSubBlock(DataBlkPtr dbp, Int2 subtype, const char *subkw)
Definition: asci_blk.cpp:758
void StripSerialNumbers(TEntryList &seq_entries)
Definition: asci_blk.cpp:3377
unique_ptr< unsigned char[]> GetProteinConv(void)
Definition: asci_blk.cpp:1772
void GetSequenceOfKeywords(const DataBlk &entry, int type, int col_data, TKeywordList &keywords)
Definition: asci_blk.cpp:1505
char * GetEmblBlock(DataBlkPtr *chain, char *ptr, short *retkw, Parser::EFormat format, char *eptr)
Definition: asci_blk.cpp:491
CRef< CSeq_id > MakeAccSeqId(const char *acc, Uint1 seqtype, bool accver, Int2 vernum)
Definition: asci_blk.cpp:906
bool GetSeqData(ParserPtr pp, const DataBlk &entry, CBioseq &bioseq, Int4 nodetype, unsigned char *seqconv, Uint1 seq_data_type)
Definition: asci_blk.cpp:1632
void GetLenSubNode(DataBlkPtr dbp)
Definition: asci_blk.cpp:781
CRef< CSeq_id > MakeLocusSeqId(const char *locus, CSeq_id::E_Choice seqtype)
Definition: asci_blk.cpp:936
void GetExtraAccession(IndexblkPtr ibp, bool allow_uwsec, Parser::ESource source, TAccessionList &accessions)
Definition: asci_blk.cpp:1274
void ShrinkSpaces(char *line)
Definition: asci_blk.cpp:118
CRef< CBioseq > CreateEntryBioseq(ParserPtr pp)
Definition: asci_blk.cpp:1020
void fta_sort_biosource(objects::CBioSource &bio)
TSeqPos GetLength(void) const
Definition: Bioseq.cpp:360
Definition: Date.hpp:53
void GetDate(string *label, bool year_only=false) const
Append a standardized string representation of the date to the label.
Definition: Date.hpp:149
@ eCompare_after
*this comes second.
Definition: Date.hpp:76
Definition: Dbtag.hpp:53
CDelta_seq –.
Definition: Delta_seq.hpp:66
@Gb_qual.hpp User-defined methods of the data storage class.
Definition: Gb_qual.hpp:61
@OrgMod.hpp User-defined methods of the data storage class.
Definition: OrgMod.hpp:54
CPDB_mol_id –.
Definition: PDB_mol_id.hpp:66
CSP_block –.
Definition: SP_block.hpp:66
@Seq_descr.hpp User-defined methods of the data storage class.
Definition: Seq_descr.hpp:55
Definition: Seq_entry.hpp:56
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
CSeq_hist –.
Definition: Seq_hist.hpp:66
char * mOffset
Definition: ftablock.h:332
size_t len
Definition: ftablock.h:333
CFlatFileData * mpData
Definition: ftablock.h:331
DataBlk * mpNext
Definition: ftablock.h:336
int mType
Definition: ftablock.h:330
static const char location[]
Definition: config.c:97
char value[7]
Definition: config.c:431
#define head
Definition: ct_nlmzip_i.h:138
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:56
DataBlkPtr LoadEntry(ParserPtr pp, size_t offset, size_t len)
Definition: entry.cpp:300
#define ERR_GENENAME_IllegalGeneName
Definition: flat2err.h:409
#define ERR_SOURCE_HostNameVsTaxIDMissMatch
Definition: flat2err.h:540
#define ERR_FEATURE_PartialNoNonTerNonCons
Definition: flat2err.h:381
#define ERR_FORMAT_UnexpectedData
Definition: flat2err.h:51
#define ERR_SPROT_PELine
Definition: flat2err.h:481
#define ERR_FEATURE_ExpectEmptyComment
Definition: flat2err.h:321
#define ERR_FORMAT_NoProteinNameCategory
Definition: flat2err.h:64
#define ERR_FORMAT_MultipleRecName
Definition: flat2err.h:65
#define ERR_SOURCE_InvalidNcbiTaxID
Definition: flat2err.h:530
#define ERR_SOURCE_UnknownOHType
Definition: flat2err.h:538
#define ERR_GENENAME_DELineGeneName
Definition: flat2err.h:410
#define ERR_SOURCE_NcbiTaxIDLookupFailure
Definition: flat2err.h:532
#define ERR_FORMAT_InvalidPDBCrossRef
Definition: flat2err.h:59
#define ERR_FORMAT_Date
Definition: flat2err.h:62
#define ERR_FORMAT_ECNumberNotPresent
Definition: flat2err.h:63
#define ERR_FORMAT_MixedPDBXrefs
Definition: flat2err.h:60
#define ERR_ENTRY_Skipped
Definition: flat2err.h:80
#define ERR_FEATURE_UnEqualEndPoint
Definition: flat2err.h:332
#define ERR_SOURCE_OrgNameVsTaxIDMissMatch
Definition: flat2err.h:534
#define ERR_FORMAT_MissingCopyright
Definition: flat2err.h:46
#define ERR_SOURCE_MissingPlasmidName
Definition: flat2err.h:537
#define ERR_FEATURE_Invalid_INIT_MET
Definition: flat2err.h:375
#define ERR_FEATURE_InvalidQualifier
Definition: flat2err.h:384
#define ERR_FEATURE_BadLocation
Definition: flat2err.h:347
#define ERR_REFERENCE_IllegalDate
Definition: flat2err.h:282
#define ERR_FORMAT_MissingFullRecName
Definition: flat2err.h:68
#define ERR_FORMAT_SwissProtHasSubName
Definition: flat2err.h:67
#define ERR_FEATURE_UnknownFeatKey
Definition: flat2err.h:333
#define ERR_SOURCE_UnknownOXType
Definition: flat2err.h:529
#define ERR_DRXREF_UnknownDBname
Definition: flat2err.h:596
#define ERR_SOURCE_NoNcbiTaxIDLookup
Definition: flat2err.h:531
#define ERR_FEATURE_ObsoleteFeature
Definition: flat2err.h:342
#define ERR_FEATURE_Dropped
Definition: flat2err.h:337
#define ERR_ENTRY_ParsingComplete
Definition: flat2err.h:79
#define ERR_FEATURE_MissingInitMet
Definition: flat2err.h:377
#define ERR_SPROT_DRLine
Definition: flat2err.h:480
#define ERR_SOURCE_IncorrectOHLine
Definition: flat2err.h:539
#define ERR_FORMAT_MissingGeneName
Definition: flat2err.h:58
#define ERR_LOCATION_FailedCheck
Definition: flat2err.h:393
#define ERR_FORMAT_InvalidECNumber
Definition: flat2err.h:52
#define ERR_QUALIFIER_InvalidEvidence
Definition: flat2err.h:117
#define ERR_DATE_IllegalDate
Definition: flat2err.h:102
#define ERR_ENTRY_Parsed
Definition: flat2err.h:83
#define ERR_FORMAT_UnknownGeneField
Definition: flat2err.h:56
#define ERR_FEATURE_NotSeqEndPoint
Definition: flat2err.h:327
#define ERR_FEATURE_NoFragment
Definition: flat2err.h:326
#define ERR_SPROT_DRLineCrossDBProtein
Definition: flat2err.h:482
#define ERR_DATACLASS_UnKnownClass
Definition: flat2err.h:76
#define ERR_FORMAT_ExcessGeneFields
Definition: flat2err.h:57
#define ERR_FORMAT_MissingRecName
Definition: flat2err.h:66
#define ERR_FEATURE_DuplicateRemoved
Definition: flat2err.h:349
list< CRef< objects::CSeq_entry > > TEntryList
std::list< CRef< objects::CSeq_id > > TSeqIdList
Definition: ftablock.h:57
char * StringSave(const char *s)
Definition: ftacpp.hpp:61
bool StringEquNI(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:116
bool StringEquN(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:106
bool StringEqu(const char *s1, const char *s2)
Definition: ftacpp.hpp:96
void StringCpy(char *d, const char *s)
Definition: ftacpp.hpp:74
void StringNCpy(char *d, const char *s, size_t n)
Definition: ftacpp.hpp:75
void MemFree(char *p)
Definition: ftacpp.hpp:55
size_t StringLen(const char *s)
Definition: ftacpp.hpp:60
char * MemNew(size_t sz)
Definition: ftacpp.hpp:43
void StringCat(char *d, const char *s)
Definition: ftacpp.hpp:73
char * StringRChr(char *s, const char c)
Definition: ftacpp.hpp:78
void FtaDeletePrefix(int prefix)
Definition: ftaerr.cpp:344
void FtaInstallPrefix(int prefix, const char *name, const char *location)
Definition: ftaerr.cpp:319
#define PREFIX_FEATURE
Definition: ftaerr.hpp:16
#define PREFIX_LOCUS
Definition: ftaerr.hpp:15
#define PREFIX_ACCESSION
Definition: ftaerr.hpp:14
void fta_find_pub_explore(ParserPtr pp, TEntryList &seq_entries)
Definition: ftanet.cpp:762
CRef< COrg_ref > fta_fix_orgref_byid(ParserPtr pp, TTaxId taxid, bool *drop, bool isoh)
Definition: ftanet.cpp:857
void fta_fix_orgref(ParserPtr pp, COrg_ref &org_ref, bool *drop, char *organelle)
Definition: ftanet.cpp:945
static int type
Definition: getdata.c:31
#define SEV_INFO
Definition: gicache.c:89
#define SEV_WARNING
Definition: gicache.c:90
#define SEV_ERROR
Definition: gicache.c:91
#define SEV_REJECT
Definition: gicache.c:92
#define ZERO_TAX_ID
Definition: ncbimisc.hpp:1115
#define GI_FROM(T, value)
Definition: ncbimisc.hpp:1086
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define TAX_ID_TO(T, tax_id)
Definition: ncbimisc.hpp:1110
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
#define TAX_ID_FROM(T, value)
Definition: ncbimisc.hpp:1111
#define StringStr
Definition: ncbistr.hpp:322
#define ErrPostStr
Definition: ncbierr.hpp:68
#define StringChr
Definition: ncbistr.hpp:317
#define ErrPostEx(sev, err_code,...)
Definition: ncbierr.hpp:78
TPrim & Set(void)
Definition: serialbase.hpp:351
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
const TPrim & Get(void) const
Definition: serialbase.hpp:347
void SetPnt(TPnt &v)
Definition: Seq_loc.hpp:985
void SetInt(TInt &v)
Definition: Seq_loc.hpp:983
void SetBond(TBond &v)
Definition: Seq_loc.hpp:989
CBioseq_Handle AddBioseq(CBioseq &bioseq, TPriority pri=kPriority_Default, EExist action=eExist_Throw)
Add bioseq, return bioseq handle.
Definition: scope.cpp:530
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
char Char
Alias for char.
Definition: ncbitype.h:93
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2989
#define NPOS
Definition: ncbistr.hpp:133
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
Definition: ncbistr.cpp:3197
static string Sanitize(CTempString str, TSS_Flags flags=fSS_print)
Sanitize a string, allowing only specified classes of characters.
Definition: ncbistr.hpp:2876
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5352
static string & ReplaceInPlace(string &src, const string &search, const string &replace, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3401
@ eTrunc_End
Truncate trailing spaces only.
Definition: ncbistr.hpp:2241
bool IsSetOrg(void) const
Check if a value has been assigned to Org data member.
Definition: BioSource_.hpp:497
const TOrg & GetOrg(void) const
Get the Org member data.
Definition: BioSource_.hpp:509
void SetGenome(TGenome value)
Assign a value to Genome data member.
Definition: BioSource_.hpp:428
void SetOrg(TOrg &value)
Assign a value to Org data member.
Definition: BioSource_.cpp:108
EGenome
biological context
Definition: BioSource_.hpp:97