NCBI C++ ToolKit
loadfeat.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: loadfeat.cpp 99051 2023-02-07 19:58:28Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * File Name: loadfeat.cpp
27  *
28  * Author: Karl Sirotkin, Hsiu-Chuan Chen
29  *
30  * File Description:
31  * Parse features block to subblock.
32  * Process each subblock.
33  * Output each subblock.
34  * Free out subblock.
35  */
36 
37 #include <ncbi_pch.hpp>
38 
39 #include "ftacpp.hpp"
40 
43 #include <objmgr/bioseq_handle.hpp>
44 #include <objmgr/scope.hpp>
57 #include <objects/pub/Pub_set.hpp>
58 #include <objects/pub/Pub.hpp>
59 #include <serial/objostr.hpp>
62 #include <objects/seq/Pubdesc.hpp>
65 #include <objects/seq/MolInfo.hpp>
66 #include <objects/seq/Seq_inst.hpp>
67 #include <objects/seq/Seq_ext.hpp>
70 
71 #include "index.h"
72 #include "embl.h"
73 #include "genbank.h"
74 #include "qual_parse.hpp"
75 
78 
79 #include "ftaerr.hpp"
80 #include "indx_blk.h"
81 #include "asci_blk.h"
82 #include "utilfeat.h"
83 #include "loadfeat.h"
84 #include "add.h"
85 #include "fta_src.h"
86 #include "buf_data_loader.h"
87 #include "utilfun.h"
88 #include "ref.h"
89 #include "xgbfeat.h"
90 #include "xgbparint.h"
91 #include "fta_xml.h"
92 
93 #ifdef THIS_FILE
94 # undef THIS_FILE
95 #endif
96 #define THIS_FILE "loadfeat.cpp"
97 
100 
/* Seq_descr_GIBB_mol_* aliases for the CMolInfo::eBiomol_* values.
 * Both the uRNA and the snRNA alias expand to CMolInfo::eBiomol_snRNA.
 */
101 #define Seq_descr_GIBB_mol_unknown CMolInfo::eBiomol_unknown
102 #define Seq_descr_GIBB_mol_genomic CMolInfo::eBiomol_genomic
103 #define Seq_descr_GIBB_mol_preRNA CMolInfo::eBiomol_pre_RNA
104 #define Seq_descr_GIBB_mol_mRNA CMolInfo::eBiomol_mRNA
105 #define Seq_descr_GIBB_mol_rRNA CMolInfo::eBiomol_rRNA
106 #define Seq_descr_GIBB_mol_tRNA CMolInfo::eBiomol_tRNA
107 #define Seq_descr_GIBB_mol_uRNA CMolInfo::eBiomol_snRNA
108 #define Seq_descr_GIBB_mol_snRNA CMolInfo::eBiomol_snRNA
109 #define Seq_descr_GIBB_mol_scRNA CMolInfo::eBiomol_scRNA
110 #define Seq_descr_GIBB_mol_other_genetic CMolInfo::eBiomol_other_genetic
111 #define Seq_descr_GIBB_mol_cRNA CMolInfo::eBiomol_cRNA
112 #define Seq_descr_GIBB_mol_snoRNA CMolInfo::eBiomol_snoRNA
113 #define Seq_descr_GIBB_mol_trRNA CMolInfo::eBiomol_transcribed_RNA
114 #define Seq_descr_GIBB_mol_other CMolInfo::eBiomol_other
115 
/* Entry type of the taa[] table: an amino-acid name plus its code.
 * NOTE(review): only `name` is visible in this listing, while every taa[]
 * initializer supplies a second, character-valued field; one member line
 * appears to be missing here -- confirm against the repository copy.
 */
116 struct TrnaAa {
117  const char* name;
119 };
120 
/* Pairs a C string with an integer value. */
121 struct StrNum {
122  const char* str; /* text form */
123  int num;         /* integer associated with the text */
124 };
125 
/* Lower-case full amino-acid names paired with one-letter codes.
 * Aspartic acid and glutamic acid are each listed under two spellings
 * ("aspartic acid"/"aspartate", "glutamic acid"/"glutamate").
 * The list ends with a { nullptr, '\0' } sentinel entry.
 */
126 const TrnaAa taa[] = {
127  { "alanine", 'A' },
128  { "arginine", 'R' },
129  { "asparagine", 'N' },
130  { "aspartic acid", 'D' },
131  { "aspartate", 'D' },
132  { "cysteine", 'C' },
133  { "glutamine", 'Q' },
134  { "glutamic acid", 'E' },
135  { "glutamate", 'E' },
136  { "glycine", 'G' },
137  { "histidine", 'H' },
138  { "isoleucine", 'I' },
139  { "leucine", 'L' },
140  { "lysine", 'K' },
141  { "methionine", 'M' },
142  { "phenylalanine", 'F' },
143  { "proline", 'P' },
144  { "selenocysteine", 'U' },
145  { "serine", 'S' },
146  { "threonine", 'T' },
147  { "tryptophan", 'W' },
148  { "tyrosine", 'Y' },
149  { "valine", 'V' },
150  { nullptr, '\0' }
151 };
152 
/* Entry type of the aacodons[] table.
 * NOTE(review): the aacodons[] initializers supply four fields (string,
 * character, integer, array) but only the first and last members are
 * visible in this listing; two member lines appear to be missing --
 * confirm against the repository copy.
 */
153 struct AaCodons {
154  const char* straa; /* amino-acid abbreviation, e.g. "Ala" */
157  Int4 vals[8];      /* up to eight codon indices, padded with -1 */
158 };
159 
/* Codon sets per amino acid.
 * Each row holds: abbreviation, one-letter code, an integer selector, and
 * up to eight codon indices padded with -1. Judging by the per-row codon
 * comments, a codon index is 16*b1 + 4*b2 + b3 with T=0, C=1, A=2, G=3
 * (TTT = 0, GCT = 52, GGG = 63).
 * The integer selector is presumably a genetic-code id, with 0 acting as
 * the default row for an amino acid -- the lookup code is not visible
 * here, so confirm at the use site.
 * The table ends with a row whose abbreviation is nullptr.
 */
160 const AaCodons aacodons[] = {
161  { "Ala", 'A', 0, { 52, 53, 54, 55, -1, -1, -1, -1 } }, /* GCT, GCC, GCA, GCG */
162  { "Arg", 'R', 2, { 28, 29, 30, 31, -1, -1, -1, -1 } }, /* CGT, CGC, CGA, CGG */
163  { "Arg", 'R', 5, { 28, 29, 30, 31, -1, -1, -1, -1 } }, /* CGT, CGC, CGA, CGG */
164  { "Arg", 'R', 9, { 28, 29, 30, 31, -1, -1, -1, -1 } }, /* CGT, CGC, CGA, CGG */
165  { "Arg", 'R', 13, { 28, 29, 30, 31, -1, -1, -1, -1 } }, /* CGT, CGC, CGA, CGG */
166  { "Arg", 'R', 14, { 28, 29, 30, 31, -1, -1, -1, -1 } }, /* CGT, CGC, CGA, CGG */
167  { "Arg", 'R', 0, { 28, 29, 30, 31, 46, 47, -1, -1 } }, /* CGT, CGC, CGA, CGG, AGA, AGG */
168  { "Asn", 'N', 9, { 40, 41, 42, -1, -1, -1, -1, -1 } }, /* AAT, AAC, AAA */
169  { "Asn", 'N', 14, { 40, 41, 42, -1, -1, -1, -1, -1 } }, /* AAT, AAC, AAA */
170  { "Asn", 'N', 0, { 40, 41, -1, -1, -1, -1, -1, -1 } }, /* AAT, AAC */
171  { "Asp", 'D', 0, { 56, 57, -1, -1, -1, -1, -1, -1 } }, /* GAT, GAC */
172  { "Asx", 'B', 9, { 40, 41, 42, 56, 57, -1, -1, -1 } }, /* Asn + Asp */
173  { "Asx", 'B', 14, { 40, 41, 42, 56, 57, -1, -1, -1 } }, /* Asn + Asp */
174  { "Asx", 'B', 0, { 40, 41, 56, 57, -1, -1, -1, -1 } }, /* Asn + Asp */
175  { "Cys", 'C', 10, { 12, 13, 14, -1, -1, -1, -1, -1 } }, /* TGT, TGC, TGA */
176  { "Cys", 'C', 0, { 12, 13, -1, -1, -1, -1, -1, -1 } }, /* TGT, TGC */
177  { "Gln", 'Q', 6, { 10, 11, 26, 27, -1, -1, -1, -1 } }, /* TAA, TAG, CAA, CAG */
178  { "Gln", 'Q', 15, { 11, 26, 27, -1, -1, -1, -1, -1 } }, /* TAG, CAA, CAG */
179  { "Gln", 'Q', 0, { 26, 27, -1, -1, -1, -1, -1, -1 } }, /* CAA, CAG */
180  { "Glu", 'E', 0, { 58, 59, -1, -1, -1, -1, -1, -1 } }, /* GAA, GAG */
181  { "Glx", 'Z', 6, { 10, 11, 26, 27, 58, 59, -1, -1 } }, /* Gln + Glu */
/* NOTE(review): the next two "Glx" rows both carry 0 in the third field.
 * The parallel "Gln" rows use 6, 15 and 0, and the row below includes TAG
 * (11) exactly like the "Gln" row for 15, so it looks as if 15 was
 * intended here -- confirm before relying on the Glx lookup.
 */
182  { "Glx", 'Z', 0, { 11, 26, 27, 58, 59, -1, -1, -1 } }, /* Gln + Glu */
183  { "Glx", 'Z', 0, { 26, 27, 58, 59, -1, -1, -1, -1 } }, /* Gln + Glu */
184  { "Gly", 'G', 13, { 46, 47, 60, 61, 62, 63, -1, -1 } }, /* AGA, AGG, GGT, GGC, GGA, GGG */
185  { "Gly", 'G', 0, { 60, 61, 62, 63, -1, -1, -1, -1 } }, /* GGT, GGC, GGA, GGG */
186  { "His", 'H', 0, { 24, 25, -1, -1, -1, -1, -1, -1 } }, /* CAT, CAC */
187  { "Ile", 'I', 2, { 32, 33, -1, -1, -1, -1, -1, -1 } }, /* ATT, ATC */
188  { "Ile", 'I', 3, { 32, 33, -1, -1, -1, -1, -1, -1 } }, /* ATT, ATC */
189  { "Ile", 'I', 5, { 32, 33, -1, -1, -1, -1, -1, -1 } }, /* ATT, ATC */
190  { "Ile", 'I', 13, { 32, 33, -1, -1, -1, -1, -1, -1 } }, /* ATT, ATC */
191  { "Ile", 'I', 0, { 32, 33, 34, -1, -1, -1, -1, -1 } }, /* ATT, ATC, ATA */
192  { "Leu", 'L', 3, { 2, 3, -1, -1, -1, -1, -1, -1 } }, /* TTA, TTG */
193  { "Leu", 'L', 12, { 2, 3, 16, 17, 18, -1, -1, -1 } }, /* TTA, TTG, CTT, CTC, CTA */
194  { "Leu", 'L', 0, { 2, 3, 16, 17, 18, 19, -1, -1 } }, /* TTA, TTG, CTT, CTC, CTA, CTG */
195  { "Lys", 'K', 9, { 43, -1, -1, -1, -1, -1, -1, -1 } }, /* AAG */
196  { "Lys", 'K', 14, { 43, -1, -1, -1, -1, -1, -1, -1 } }, /* AAG */
197  { "Lys", 'K', 0, { 42, 43, -1, -1, -1, -1, -1, -1 } }, /* AAA, AAG */
198  { "Met", 'M', 2, { 34, 35, -1, -1, -1, -1, -1, -1 } }, /* ATA, ATG */
199  { "Met", 'M', 3, { 34, 35, -1, -1, -1, -1, -1, -1 } }, /* ATA, ATG */
200  { "Met", 'M', 5, { 34, 35, -1, -1, -1, -1, -1, -1 } }, /* ATA, ATG */
201  { "Met", 'M', 13, { 34, 35, -1, -1, -1, -1, -1, -1 } }, /* ATA, ATG */
202  { "Met", 'M', 0, { 35, -1, -1, -1, -1, -1, -1, -1 } }, /* ATG */
203  { "fMet", 'M', 2, { 34, 35, -1, -1, -1, -1, -1, -1 } }, /* ATA, ATG */
204  { "fMet", 'M', 3, { 34, 35, -1, -1, -1, -1, -1, -1 } }, /* ATA, ATG */
205  { "fMet", 'M', 5, { 34, 35, -1, -1, -1, -1, -1, -1 } }, /* ATA, ATG */
206  { "fMet", 'M', 13, { 34, 35, -1, -1, -1, -1, -1, -1 } }, /* ATA, ATG */
207  { "fMet", 'M', 0, { 35, -1, -1, -1, -1, -1, -1, -1 } }, /* ATG */
208  { "Phe", 'F', 0, { 0, 1, -1, -1, -1, -1, -1, -1 } }, /* TTT, TTC */
209  { "Pro", 'P', 0, { 20, 21, 22, 23, -1, -1, -1, -1 } }, /* CCT, CCC, CCA, CCG */
210  { "Sec", 'U', 0, { -1, -1, -1, -1, -1, -1, -1, -1 } },
211  { "Ser", 'S', 5, { 4, 5, 6, 7, 44, 45, 46, 47 } }, /* TCT, TCC, TCA, TCG, AGT, AGC, AGA, AGG */
212  { "Ser", 'S', 9, { 4, 5, 6, 7, 44, 45, 46, 47 } }, /* TCT, TCC, TCA, TCG, AGT, AGC, AGA, AGG */
213  { "Ser", 'S', 12, { 4, 5, 6, 7, 19, 44, 45, -1 } }, /* TCT, TCC, TCA, TCG, CTG, AGT, AGC */
214  { "Ser", 'S', 14, { 4, 5, 6, 7, 44, 45, 46, 47 } }, /* TCT, TCC, TCA, TCG, AGT, AGC, AGA, AGG */
215  { "Ser", 'S', 0, { 4, 5, 6, 7, 44, 45, -1, -1 } }, /* TCT, TCC, TCA, TCG, AGT, AGC */
216  { "Thr", 'T', 3, { 16, 17, 18, 19, 36, 37, 38, 39 } }, /* CTT, CTC, CTA, CTG, ACT, ACC, ACA, ACG */
217  { "Thr", 'T', 0, { 36, 37, 38, 39, -1, -1, -1, -1 } }, /* ACT, ACC, ACA, ACG */
218  { "Trp", 'W', 1, { 15, -1, -1, -1, -1, -1, -1, -1 } }, /* TGG */
219  { "Trp", 'W', 6, { 15, -1, -1, -1, -1, -1, -1, -1 } }, /* TGG */
220  { "Trp", 'W', 10, { 15, -1, -1, -1, -1, -1, -1, -1 } }, /* TGG */
221  { "Trp", 'W', 11, { 15, -1, -1, -1, -1, -1, -1, -1 } }, /* TGG */
222  { "Trp", 'W', 12, { 15, -1, -1, -1, -1, -1, -1, -1 } }, /* TGG */
223  { "Trp", 'W', 15, { 15, -1, -1, -1, -1, -1, -1, -1 } }, /* TGG */
224  { "Trp", 'W', 0, { 14, 15, -1, -1, -1, -1, -1, -1 } }, /* TGA, TGG */
225  { "Tyr", 'Y', 14, { 8, 9, 10, -1, -1, -1, -1, -1 } }, /* TAT, TAC, TAA */
226  { "Tyr", 'Y', 0, { 8, 9, -1, -1, -1, -1, -1, -1 } }, /* TAT, TAC */
227  { "Val", 'V', 0, { 48, 49, 50, 51, -1, -1, -1, -1 } }, /* GTT, GTC, GTA, GTG */
228  { "TERM", '*', 1, { 10, 11, 14, -1, -1, -1, -1, -1 } }, /* TAA, TAG, TGA */
229  { "TERM", '*', 2, { 10, 11, 46, 47, -1, -1, -1, -1 } }, /* TAA, TAG, AGA, AGG */
230  { "TERM", '*', 6, { 14, -1, -1, -1, -1, -1, -1, -1 } }, /* TGA */
231  { "TERM", '*', 11, { 10, 11, 14, -1, -1, -1, -1, -1 } }, /* TAA, TAG, TGA */
232  { "TERM", '*', 12, { 10, 11, 14, -1, -1, -1, -1, -1 } }, /* TAA, TAG, TGA */
233  { "TERM", '*', 14, { 11, -1, -1, -1, -1, -1, -1, -1 } }, /* TAG */
234  { "TERM", '*', 15, { 10, 14, -1, -1, -1, -1, -1, -1 } }, /* TAA, TGA */
235  { "TERM", '*', 0, { 10, 11, -1, -1, -1, -1, -1, -1 } }, /* TAA, TAG */
236  { "OTHER", 'X', 0, { -1, -1, -1, -1, -1, -1, -1, -1 } },
237  { nullptr, '\0', 0, { -1, -1, -1, -1, -1, -1, -1, -1 } }
238 };
239 
/* nullptr-terminated list of upper-case spellings of "transfer RNA",
 * including several misspelled variants ("TRASNFER RNA", "TRANSDER RNA",
 * "TRANFER RNA", ...) and the short forms "T RNA" and "TRNA".
 */
240 static const char* trna_tags[] = {
241  "TRANSFERN RNA",
242  "TRANSFER RRNA",
243  "TRANSFER TRNA",
244  "TRANSFER RNA",
245  "TRASNFER RNA",
246  "TRANSDER RNA",
247  "TRANSFERRNA",
248  "TRANFER RNA",
249  "T RNA",
250  "TRNA",
251  nullptr
252 };
253 
/* nullptr-terminated list of phrases, presumably those that mark a record
 * as an EST -- the code that consults it is not visible here, confirm at
 * the use site. Not declared static, so it has external linkage.
 */
254 const char* ParFlat_ESTmod[] = {
255  "EST",
256  "expressed sequence tag",
257  "partial cDNA sequence",
258  "transcribed sequence fragment",
259  "TSR",
260  "putatively transcribed partial sequence",
261  "UK putts",
262  "Plastid",
263  nullptr
264 };
265 
/* nullptr-terminated list of RNA feature key names. */
266 static const char* ParFlat_RNA_array[] = {
267  "precursor_RNA",
268  "mRNA",
269  "tRNA",
270  "rRNA",
271  "snRNA",
272  "scRNA",
273  "snoRNA",
274  "ncRNA",
275  "tmRNA",
276  "misc_RNA",
277  nullptr
278 };
279 
/* /db_xref database names (upper case, nullptr-terminated) whose id may be
 * either numeric or textual: the db_xref conversion code in this file
 * stores an all-digit id without a leading '0' as an integer and anything
 * else as a string, without reporting a warning.
 */
280 static const char* DbxrefTagAny[] = {
281  "ASAP",
282  "CDD",
283  "DBEST",
284  "DBSTS",
285  "GDB",
286  "HMP",
287  "MAIZEGDB",
288  nullptr
289 };
290 
/* Obsolete /db_xref database names (upper case, nullptr-terminated).
 * The db_xref conversion code in this file warns about them and renames
 * them: BHB and BIOHEALTHBASE -> IRD, GENEW -> HGNC, IFO -> NBRC,
 * SWISS-PROT -> UniProt/Swiss-Prot, everything else -> UniProt/TrEMBL.
 */
291 static const char* DbxrefObsolete[] = {
292  "BHB",
293  "BIOHEALTHBASE",
294  "GENEW",
295  "IFO",
296  "SWISS-PROT",
297  "SPTREMBL",
298  "TREMBL",
299  nullptr
300 };
301 
/* Additional /db_xref database names (upper case, nullptr-terminated)
 * that are handled like the DbxrefTagStr names, i.e. expected to carry a
 * textual id.
 * NOTE(review): part of the condition that enables this table is missing
 * from this listing; the name suggests it applies to EMBL-sourced records
 * only -- confirm.
 */
302 static const char* EMBLDbxrefTagStr[] = {
303  "BIOMUTA",
304  "DEPOD",
305  "ENSEMBLGENOMES-GN",
306  "ENSEMBLGENOMES-TR",
307  "ESTHER",
308  "GENEVISIBLE",
309  "MOONPROT",
310  "PROTEOMES",
311  "UNITE",
312  "WBPARASITE",
313  nullptr
314 };
315 
/* /db_xref database names (upper case, nullptr-terminated) whose id is
 * expected to be textual: the db_xref conversion code in this file posts
 * ERR_QUALIFIER_DbxrefWrongType (warning) when such an id consists of
 * digits only.
 */
316 static const char* DbxrefTagStr[] = {
317  "ACEVIEW/WORMGENES",
318  "APHIDBASE",
319  "APIDB",
320  "ARAPORT",
321  "BEEBASE",
322  "BEETLEBASE",
323  "BGD",
324  "BOLD",
325  "CGD",
326  "COLLECTF",
327  "DBSNP",
328  "DICTYBASE",
329  "ECOCYC",
330  "ECOGENE",
331  "ENSEMBL",
332  "ENSEMBLGENOMES",
333  "ERIC",
334  "FANTOM_DB",
335  "FLYBASE",
336  "GABI",
337  "GENEDB",
338  "GOA",
339  "H-INVDB",
340  "HGNC",
341  "HOMD",
342  "HSSP",
343  "I5KNAL",
344  "IMGT/GENE-DB",
345  "IMGT/HLA",
346  "IMGT/LIGM",
347  "INTERPRO",
348  "IRD",
349  "ISD",
350  "ISFINDER",
351  "ISHAM-ITS",
352  "JGIDB",
353  "MARPOLBASE",
354  "MEDGEN",
355  "MGI",
356  "MIRBASE",
357  "NEXTDB",
358  "NIAEST",
359  "NMPDR",
360  "NRESTDB",
361  "OSA1",
362  "PATHEMA",
363  "PDB",
364  "PFAM",
365  "PGN",
366  "PHYTOZOME",
367  "PIR",
368  "POMBASE",
369  "PSEUDO",
370  "PSEUDOCAP",
371  "RAP-DB",
372  "REMTREMBL",
373  "RFAM",
374  "RICEGENES",
375  "RZPD",
376  "SEED",
377  "SGD",
378  "SGN",
379  "SPTREMBL",
380  "SRPDB",
381  "SUBTILIST",
382  "SWISS-PROT",
383  "TAIR",
384  "TIGRFAM",
385  "TREMBL",
386  "TUBERCULIST",
387  "UNIPROT/SWISS-PROT",
388  "UNIPROT/TREMBL",
389  "UNIPROTKB/SWISS-PROT",
390  "UNIPROTKB/TREMBL",
391  "UNITE",
392  "VBASE2",
393  "VECTORBASE",
394  "VGNC",
395  "VIPR",
396  "VISTA",
397  "WORFDB",
398  "WORMBASE",
399  "XENBASE",
400  "ZFIN",
401  nullptr
402 };
403 
/* /db_xref database names (upper case, nullptr-terminated) whose id must
 * be a number greater than zero: the db_xref conversion code in this file
 * drops the qualifier when the id is all zeros or contains a non-digit.
 */
404 static const char* DbxrefTagInt[] = {
405  "ATCC",
406  "ATCC(DNA)",
407  "ATCC(IN HOST)",
408  "BDGP_EST",
409  "BDGP_INS",
410  "ESTLIB",
411  "GENEID",
412  "GI",
413  "GO",
414  "GREENGENES",
415  "INTREPIDBIO",
416  "JCM",
417  "LOCUSID",
418  "MIM",
419  "MYCOBANK",
420  "NBRC",
421  "PBMICE",
422  "RATMAP",
423  "RGD",
424  "UNILIB",
425  "UNISTS",
426  nullptr
427 };
428 
/* nullptr-terminated list of qualifier names, presumably those that may
 * legitimately appear without a value -- the code that consults it is not
 * visible here, confirm at the use site. Two entries are flagged by the
 * original author as "fake": they are listed only so that an empty
 * occurrence of them is caught.
 */
429 static const char* EmptyQuals[] = {
430  "artificial_location", /* Fake. Put here to catch
431  it's empty */
432  "chloroplast",
433  "chromoplast",
434  "cyanelle",
435  "environmental_sample",
436  "focus",
437  "germline",
438  "kinetoplast",
439  "macronuclear",
440  "metagenomic",
441  "mitochondrion",
442  "mobile_element_type", /* Fake. Put here to catch
443  it's empty */
444  "partial",
445  "proviral",
446  "pseudo",
447  "rearranged",
448  "ribosomal_slippage",
449  "trans_splicing",
450  "transgenic",
451  "virion",
452  nullptr
453 };
454 
/* nullptr-terminated list of feature key names, presumably the features
 * on which /trans_splicing is accepted -- confirm at the use site (not
 * visible here). Not declared static, so it has external linkage.
 */
455 const char* TransSplicingFeats[] = {
456  "3'UTR",
457  "5'UTR",
458  "CDS",
459  "gene",
460  "mRNA",
461  "misc_RNA",
462  "precursor_RNA",
463  "tRNA",
464  nullptr
465 };
466 
/* nullptr-terminated list of ncRNA class names, presumably the accepted
 * values of the /ncRNA_class qualifier -- confirm at the use site (not
 * visible here). Not declared static, so it has external linkage.
 */
467 const char* ncRNA_class_values[] = {
468  "antisense_RNA",
469  "autocatalytically_spliced_intron",
470  "hammerhead_ribozyme",
471  "lncRNA",
472  "RNase_P_RNA",
473  "RNase_MRP_RNA",
474  "telomerase_RNA",
475  "guide_RNA",
476  "rasiRNA",
477  "ribozyme",
478  "scRNA",
479  "siRNA",
480  "miRNA",
481  "piRNA",
482  "pre_miRNA",
483  "snoRNA",
484  "snRNA",
485  "SRP_RNA",
486  "vault_RNA",
487  "Y_RNA",
488  "other",
489  nullptr
490 };
491 
/* nullptr-terminated list of satellite type names, presumably the accepted
 * leading words of a /satellite qualifier value -- confirm at the use site
 * (not visible here).
 */
492 const char* SatelliteValues[] = {
493  "satellite",
494  "minisatellite",
495  "microsatellite",
496  nullptr
497 };
498 
/* nullptr-terminated list of pseudogene type names, presumably the
 * accepted values of the /pseudogene qualifier -- confirm at the use site
 * (not visible here).
 */
499 const char* PseudoGeneValues[] = {
500  "allelic",
501  "processed",
502  "unitary",
503  "unknown",
504  "unprocessed",
505  nullptr
506 };
507 
/* nullptr-terminated list of regulatory class names, presumably the
 * accepted values of the /regulatory_class qualifier -- confirm at the use
 * site (not visible here).
 */
508 const char* RegulatoryClassValues[] = {
509  "attenuator",
510  "CAAT_signal",
511  "DNase_I_hypersensitive_site",
512  "enhancer",
513  "enhancer_blocking_element",
514  "GC_signal",
515  "imprinting_control_region",
516  "insulator",
517  "locus_control_region",
518  "matrix_attachment_region",
519  "minus_35_signal",
520  "minus_10_signal",
521  "response_element",
522  "polyA_signal_sequence",
523  "promoter",
524  "recoding_stimulatory_region",
525  "replication_regulatory_region",
526  "ribosome_binding_site",
527  "riboswitch",
528  "silencer",
529  "TATA_box",
530  "terminator",
531  "transcriptional_cis_regulatory_region",
532  "other",
533  nullptr
534 };
535 
/* Two lookup tables pairing a text label with an enum value; each ends
 * with a { nullptr, -1 } sentinel.
 * NOTE(review): the declaration line of each table (its type and name) is
 * missing from this listing. The first table maps gap-type labels to
 * CSeq_gap::eType_* values, the second maps linkage-evidence labels to
 * CLinkage_evidence::eType_* values. Both "repeat within scaffold" and
 * "repeat between scaffolds" map to CSeq_gap::eType_repeat.
 */
536 // clang-format off
538  { "between scaffolds", CSeq_gap::eType_contig },
539  { "within scaffold", CSeq_gap::eType_scaffold },
540  { "telomere", CSeq_gap::eType_telomere },
541  { "centromere", CSeq_gap::eType_centromere },
542  { "short arm", CSeq_gap::eType_short_arm },
543  { "heterochromatin", CSeq_gap::eType_heterochromatin },
544  { "repeat within scaffold", CSeq_gap::eType_repeat },
545  { "repeat between scaffolds", CSeq_gap::eType_repeat },
546  { "unknown", CSeq_gap::eType_unknown },
547  { nullptr, -1 }
548 };
549
551  { "paired-ends", CLinkage_evidence::eType_paired_ends },
552  { "align genus", CLinkage_evidence::eType_align_genus },
553  { "align xgenus", CLinkage_evidence::eType_align_xgenus },
554  { "align trnscpt", CLinkage_evidence::eType_align_trnscpt },
555  { "within clone", CLinkage_evidence::eType_within_clone },
556  { "clone contig", CLinkage_evidence::eType_clone_contig },
557  { "map", CLinkage_evidence::eType_map },
558  { "strobe", CLinkage_evidence::eType_strobe },
559  { "unspecified", CLinkage_evidence::eType_unspecified },
560  { "pcr", CLinkage_evidence::eType_pcr },
561  { "proximity ligation", CLinkage_evidence::eType_proximity_ligation },
562  { nullptr, -1 }
563 };
564 // clang-format on
565 
/* NOTE(review): the header line of this definition is missing from this
 * listing. The body releases the `key` and `location` buffers with
 * MemFree() and resets both pointers to nullptr so a repeated call is
 * harmless.
 */
567 {
568  if (key) {
569  MemFree(key);
570  key = nullptr;
571  }
572  if (location) {
573  MemFree(location);
574  location = nullptr;
575  }
576 }
577 
/* Declaration only; no definition is visible in this part of the file. */
578 extern Int2 SpFeatKeyNameValid(const Char* keystr);
580 
581 /**********************************************************/
/* NOTE(review): the signature line of this function (and one line inside
 * the loop, between the `if` block and SimpleDelete()) is missing from
 * this listing.
 *
 * Walks the DataBlk chain starting at `dbp`. For each node it deletes the
 * FeatBlk stored in mpData (if any), clears mpData, and then calls
 * SimpleDelete() on the node. The next pointer is read before the node is
 * released.
 */
583 {
584  DataBlkPtr dbpnext;
585  FeatBlkPtr fbp;
586
587  for (; dbp; dbp = dbpnext) {
588  dbpnext = dbp->mpNext; /* saved first: dbp is released below */
589  fbp = static_cast<FeatBlk*>(dbp->mpData);
590  if (fbp) {
591  delete fbp;
592  dbp->mpData = nullptr;
593  }
595  dbp->SimpleDelete();
596  }
597 }
598 
/**********************************************************
 *
 *   static void DelCharBtwData(value):
 *
 *      Removes every blank (' ') from "value" in place and
 *   re-terminates the shortened string. Only the space
 *   character is removed; tabs and other whitespace stay.
 *      A null "value" is accepted and ignored.
 *
 **********************************************************/
static void DelCharBtwData(char* value)
{
    if (! value)
        return;

    /* "dst" trails "src": it only advances when a character is kept */
    char* dst = value;
    for (const char* src = value; *src != '\0'; ++src) {
        if (*src != ' ')
            *dst++ = *src;
    }
    *dst = '\0';
}
615 
616 /**********************************************************
617  *
618  * static Int4 flat2asn_range_func(pp_ptr, id):
619  *
620  * Length callback: returns the length of the sequence
621  * identified by "id", or -1 when it cannot be determined.
622  * "pp_ptr" is the parser context (a ParserPtr passed as void*).
623  *
624  * ks 1/13/94
625  *
626  **********************************************************/
627 static Int4 flat2asn_range_func(void* pp_ptr, const CSeq_id& id)
628 {
629  ParserPtr pp = reinterpret_cast<ParserPtr>(pp_ptr);
630
/* Defaults to the entry currently being parsed; replaced below when "id"
 * carries an accession that has to be looked up.
 */
631  int use_indx = pp->curindx;
632  char* acnum;
633
634  Int2 vernum;
635
636 #ifdef BIOSEQ_FIND_METHOD
637
/* NOTE(review): this branch uses "bsp" and "sip", neither of which is
 * declared in the function as shown, so it would presumably not compile
 * if BIOSEQ_FIND_METHOD were defined -- confirm whether it is dead code.
 */
638  bsp = BioseqFind(sip);
639  if (bsp)
640  return (bsp->length);
641
642  // could try ID0 server
643  //
644  return (-1);
645
646 #else
647
/* Only these six id types are looked up by accession. */
648  const CTextseq_id* text_id = nullptr;
649  if (id.IsGenbank() || id.IsEmbl() || id.IsDdbj() || id.IsTpg() ||
650  id.IsTpe() || id.IsTpd())
651  text_id = id.GetTextseq_Id();
652
653  if (text_id) {
/* An id without a version gets the smallest short as a placeholder. */
654  Int2 text_id_ver = text_id->IsSetVersion() ? text_id->GetVersion() : numeric_limits<short>::min();
655  const string& text_id_acc = text_id->GetAccession();
/* Search the entries of this input for the accession; the version must
 * also match when pp->accver is set.
 */
656  for (use_indx = 0; use_indx < pp->indx; use_indx++) {
657  acnum = pp->entrylist[use_indx]->acnum;
658  vernum = pp->entrylist[use_indx]->vernum;
659  if (text_id_acc == acnum &&
660  (pp->accver == false || vernum == text_id_ver))
661  break;
662  }
663
664  if (use_indx >= pp->indx) {
665  // entry is not present in this file use remote fetch function
666  // use_indx = pp->curindx;
667  //
/* Without pp->ffdb, "len" is -1 converted to size_t, which the next
 * comparison treats as "not found".
 */
668  size_t len = (! pp->ffdb) ? -1 : CheckOutsideEntry(pp, text_id_acc.c_str(), text_id_ver);
669  if (len != static_cast<size_t>(-1))
670  return static_cast<Int4>(len);
671
672  if (! pp->buf) {
/* With pp->farseq set, the failure is returned without a message. */
673  if (pp->farseq)
674  return -1;
675
/* The version is printed only when versions are in use and this id has a
 * non-negative one.
 */
676  if (pp->accver == false || text_id_ver < 0) {
677  Nlm_ErrSetContext("validatr", __FILE__, __LINE__);
678  Nlm_ErrPostEx(SEV_WARNING, ERR_LOCATION_FailedCheck, "Location points to outside entry %s", text_id_acc.c_str());
679  } else {
680  Nlm_ErrSetContext("validatr", __FILE__, __LINE__);
681  Nlm_ErrPostEx(SEV_WARNING, ERR_LOCATION_FailedCheck, "Location points to outside entry %s.%d", text_id_acc.c_str(), text_id_ver);
682  }
683  return (-1);
684  }
685
686  if (*pp->buf == '\0')
687  return (-1);
688
/* NOTE(review): the "if" line that selects between the two messages below
 * is missing from this listing; only its two branches are visible.
 */
690  ErrPostEx(SEV_WARNING, ERR_LOCATION_NCBIRefersToExternalRecord, "Feature location references an interval on another record : %s", pp->buf);
691  else
692  ErrPostEx(SEV_WARNING, ERR_LOCATION_RefersToExternalRecord, "Feature location references an interval on another record : %s", pp->buf);
/* After reporting, pp->buf is replaced by a fresh one-byte empty string,
 * so the "*pp->buf == '\0'" test above returns early on the next call.
 */
693  MemFree(pp->buf);
694  pp->buf = MemNew(1);
695  *pp->buf = '\0';
696  return (-1);
697  }
698  }
/* Reached with the matching entry, or with pp->curindx when "id" is not
 * one of the accession-based types handled above.
 */
699  return static_cast<Int4>(pp->entrylist[use_indx]->bases);
700
701 #endif
702 }
703 
704 /**********************************************************/
705 static bool CheckForeignLoc(const CSeq_loc& loc, const CSeq_id& sid)
706 {
707  const CSeq_id& pid = *loc.GetId();
708 
709  if (loc.IsMix() || loc.IsEquiv() ||
710  sid.Compare(pid) == CSeq_id::e_YES)
711  return false;
712 
713  return true;
714 }
715 
716 /**********************************************************/
/* Converts one /db_xref qualifier into a Dbtag.
 *
 * NOTE(review): the signature line, the declaration of "tag" and one line
 * of the DbxrefTagStr condition are missing from this listing. From the
 * body: "qual" is the qualifier being converted, and "tag" is returned
 * unset whenever the qualifier is skipped or dropped.
 *
 * The value is split at the first ':' into a database name and an id.
 * Obsolete database names are renamed, then the id is validated according
 * to the table the name is found in (DbxrefTagAny, DbxrefTagStr /
 * EMBLDbxrefTagStr, DbxrefTagInt) or as a "PID". Unknown names are
 * dropped with an error.
 */
718 {
720
/* Not a db_xref at all: nothing to convert. */
721  if (! qual.IsSetQual() ||
722  qual.GetQual() != "db_xref")
723  return tag;
724
725  if (! qual.IsSetVal() || qual.GetVal().empty()) {
726  ErrPostEx(SEV_WARNING, ERR_QUALIFIER_EmptyQual, "Found empty /db_xref qualifier. Qualifier dropped.");
727  return tag;
728  }
729
/* A value that is exactly "taxon" (any case) is skipped without a message. */
730  const string& val = qual.GetVal();
731  if (NStr::CompareNocase(val.c_str(), "taxon") == 0)
732  return tag;
733
734  string line = val;
735
/* "MGD:MGI:..." loses its leading "MGD:" so the database becomes "MGI". */
736  if (StringEquNI(line.c_str(), "MGD:MGI:", 8))
737  line = line.substr(4);
738
739  size_t colon = line.find(':');
740  if (colon == string::npos) {
741  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_DbxrefIncorrect, "Badly formatted /db_xref qualifier: \"%s\". Qualifier dropped.", val.c_str());
742  return tag;
743  }
744
/* From here on "line" is the database name and "tail" the id text. */
745  string tail = line.substr(colon + 1);
746  line = line.substr(0, colon);
747
748  if (MatchArrayIString(DbxrefObsolete, line.c_str()) > -1) {
749  ErrPostEx(SEV_WARNING, ERR_FEATURE_ObsoleteDbXref, "/db_xref type \"%s\" is obsolete.", line.c_str());
750
/* Replace the obsolete name; the final else covers the remaining
 * DbxrefObsolete entries (SPTREMBL, TREMBL).
 */
751  string buf;
752  if (NStr::CompareNocase(line.c_str(), "BHB") == 0)
753  buf = "IRD";
754  else if (NStr::CompareNocase(line.c_str(), "BioHealthBase") == 0)
755  buf = "IRD";
756  else if (NStr::CompareNocase(line.c_str(), "GENEW") == 0)
757  buf = "HGNC";
758  else if (NStr::CompareNocase(line.c_str(), "IFO") == 0)
759  buf = "NBRC";
760  else if (NStr::CompareNocase(line.c_str(), "SWISS-PROT") == 0)
761  buf = "UniProt/Swiss-Prot";
762  else
763  buf = "UniProt/TrEMBL";
764
765  line = buf;
766  }
767
/* "UniProt/..." becomes "UniProtKB/...": the first 7 characters
 * ("UniProt") are replaced and the rest of the name is kept as written.
 */
768  if (NStr::CompareNocase(line.c_str(), "UNIPROT/SWISS-PROT") == 0 ||
769  NStr::CompareNocase(line.c_str(), "UNIPROT/TREMBL") == 0) {
770  string buf("UniProtKB");
771  buf += line.substr(7);
772
773  line = buf;
774  }
775
/* Exactly one of these ends up describing the id: "strid" when non-null,
 * otherwise "intid".
 */
776  const Char* strid = nullptr;
777  Int4 intid = 0;
778
779  const Char* p = tail.c_str();
780  if (MatchArrayIString(DbxrefTagAny, line.c_str()) > -1) {
/* Either form is fine: all digits with no leading '0' is stored as an
 * integer, anything else as a string.
 * NOTE(review): atoi() is applied without a length check, so a very long
 * digit string may not fit in an int -- confirm whether that can occur.
 */
781  for (strid = p; *p >= '0' && *p <= '9';)
782  p++;
783  if (*p == '\0' && *strid != '0') {
784  intid = atoi(strid);
785  strid = nullptr;
786  }
787  } else if (MatchArrayIString(DbxrefTagStr, line.c_str()) > -1 ||
789  MatchArrayIString(EMBLDbxrefTagStr, line.c_str()) > -1)) {
/* A string id is expected: an all-digit id is reported as a warning but
 * still kept (as an integer unless it starts with '0').
 */
790  for (strid = p; *p >= '0' && *p <= '9';)
791  p++;
792  if (*p == '\0') {
793  ErrPostEx(SEV_WARNING, ERR_QUALIFIER_DbxrefWrongType, "/db_xref qualifier \"%s\" is supposed to be a string, but its value consists of digits only.", val.c_str());
794  if (*strid != '0') {
795  intid = atoi(strid);
796  strid = nullptr;
797  }
798  }
799  } else if (MatchArrayIString(DbxrefTagInt, line.c_str()) > -1) {
/* A numeric id greater than zero is required. "q" skips leading zeros;
 * an empty or all-zero id is rejected.
 */
800  const Char* q = p;
801  for (; *q == '0';)
802  q++;
803  if (*q == '\0') {
804  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_DbxrefShouldBeNumeric, "/db_xref qual should have numeric value greater than 0: \"%s\". Qualifier dropped.", val.c_str());
805  return tag;
806  }
807
808  const Char* r = q;
809  for (; *r >= '0' && *r <= '9';)
810  r++;
811  if (*r != '\0') {
812  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_DbxrefWrongType, "/db_xref qualifier \"%s\" is supposed to be a numeric identifier, but its value includes alphabetic characters. Qualifier dropped.", val.c_str());
813  return tag;
814  }
/* *r is always '\0' at this point (the other case returned above), so the
 * test below is effectively "q != p": an id with leading zeros is kept as
 * a string so the zeros are preserved.
 */
815  if (*r != '\0' || q != p)
816  strid = p;
817  else if (NStr::CompareNocase(line.c_str(), "IntrepidBio") == 0 && fta_number_is_huge(q))
818  strid = q;
819  else
820  intid = atoi(q);
821  } else if (NStr::CompareNocase(line.c_str(), "PID") == 0) {
/* PID ids are one of the letters 'e', 'g', 'd' followed by a number
 * greater than zero; they are always stored as strings.
 */
822  if (*p != 'e' && *p != 'g' && *p != 'd') {
823  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_DbxrefIncorrect, "Badly formatted /db_xref qual \"PID\": \"%s\". Qualifier dropped.", val.c_str());
824  return tag;
825  }
826
827  const Char* q = p + 1;
828  for (; *q == '0';)
829  q++;
830
831  const Char* r;
832  for (r = q; *r >= '0' && *r <= '9';)
833  r++;
834  if (*q == '\0' || *r != '\0') {
835  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_DbxrefShouldBeNumeric, "/db_xref qual \"PID\" should contain numeric value greater than 0: \"%s\". Qualifier dropped.", val.c_str());
836  return tag;
837  }
838  strid = p;
839  } else {
840  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_DbxrefUnknownDBName, "Unknown data base name /db_xref = \"%s\". Qualifier dropped.", val.c_str());
841  return tag;
842  }
843
844
/* Validation passed: build the Dbtag from the (possibly renamed) database
 * name and the id.
 */
845  tag.Reset(new CDbtag);
846
847  tag->SetDb(line);
848
849  if (strid)
850  tag->SetTag().SetStr(strid);
851  else
852  tag->SetTag().SetId(intid);
853
854  return tag;
855 }
856 
857 /**********************************************************
858  *
859  * Function:
860  * static void FilterDb_xref(feat, source)
861  *
862  * Purpose:
863  * Looks through the feature's qualifiers whose qual
864  * field is "db_xref", converts each of them into a
865  * Dbtag with DbxrefQualToDbtag(), removes the
866  * qualifier from the feature's qualifier list and
867  * appends every successfully built Dbtag to the feature's dbxref list.
868  *
869  * Parameters:
870  * feat - the feature to process; "source" is passed on to DbxrefQualToDbtag()
871  *
872  * Return:
873  * None.
874  *
875  **********************************************************/
/* NOTE(review): the signature line of this function is missing from this
 * listing; the parameter names above are taken from the body.
 */
877 {
878  if (! feat.IsSetQual())
879  return;
880
881  CSeq_feat::TDbxref& db_refs = feat.SetDbxref();
882
/* The iterator is advanced either by ++ (qualifier kept) or by erase()
 * (qualifier consumed), never both.
 */
883  for (CSeq_feat::TQual::iterator qual = feat.SetQual().begin(); qual != feat.SetQual().end();) {
884  if (! (*qual)->IsSetQual() || (*qual)->GetQual() != "db_xref") {
885  /* Just skip this qualifier, it isn't db_xref
886  */
887  ++qual;
888  continue;
889  }
890
891  /* Current qualifier is db_xref, process it
892  */
893  CRef<CDbtag> dbtag = DbxrefQualToDbtag(*(*qual), source);
894  if (dbtag.NotEmpty()) {
895  db_refs.push_back(dbtag);
896  }
897
898  /* Remove the db_xref qualifier from the list, whether or not
899  it could be converted */
900  qual = feat.SetQual().erase(qual);
901  }
902
/* Leave neither list set-but-empty on the feature. */
903  if (feat.GetQual().empty())
904  feat.ResetQual();
905
906  if (db_refs.empty())
907  feat.ResetDbxref();
908 }
909 
910 bool GetSeqLocation(CSeq_feat& feat, char* location, TSeqIdList& ids, bool* hard_err, ParserPtr pp, const char* name)
911 {
912  bool locmap = true;
913  int num_errs;
914 
915  *hard_err = false;
916  num_errs = 0;
917 
918  CRef<CSeq_loc> loc = xgbparseint_ver(location, locmap, num_errs, ids, pp->accver);
919 
920  if (loc.NotEmpty()) {
921  TSeqLocList locs;
922  locs.push_back(loc);
923  fta_fix_seq_loc_id(locs, pp, location, name, false);
924 
925  feat.SetLocation(*loc);
926  }
927 
928  if (num_errs > 0) {
929  feat.ResetLocation();
930  CSeq_loc& cur_loc = feat.SetLocation();
931  cur_loc.SetWhole(*(*ids.begin()));
932  *hard_err = true;
933  } else if (! feat.GetLocation().IsEmpty()) {
934  if (feat.GetLocation().IsMix()) {
935  if (feat.GetLocation().GetMix().Get().size() == 1) {
936  CRef<CSeq_loc> cur_loc(new CSeq_loc);
937 
938  cur_loc->Assign(*feat.GetLocation().GetMix().GetFirstLoc());
939  if (cur_loc->IsInt())
940  feat.SetLocation(*cur_loc);
941  }
942  }
943  }
944 
945  return locmap;
946 }
947 
948 /**********************************************************
949  *
950  * static char* CheckLocStr(str):
951  *
952  * Nlm_gbparseint routine does not parse certain types
953  * of interval correctly, so this routine will save input
954  * form in fbp before passing it:
955  * (bases 100 to 300) ==> 100 to 300;
956  * (bases 1 to 100; 200 to 300) no change.
957  *
958  * 5-20-93
959  *
960  **********************************************************/
961 static char* CheckLocStr(const Char* str)
962 {
963  const Char* ptr;
964  const Char* eptr;
965  char* location;
966 
967  ptr = StringChr(str, ';');
968  if (ptr)
969  return StringSave(str);
970 
971  for (ptr = str; *ptr != ' ' && *ptr != '\0';)
972  ptr++;
973  while (*ptr == ' ')
974  ptr++;
975 
976  eptr = StringChr(str, ')');
977  if (! eptr)
978  return nullptr;
979 
980  while (*eptr == ' ' || *eptr == ')')
981  --eptr;
982 
983  location = StringSave(string(ptr, eptr + 1).c_str());
984  return (location);
985 }
986 
987 /*****************************************************************************
988  *
989  * bool SeqIntCheckCpp(loc) replaces the C-toolkit 'bool SeqIntCheck(sip)':
990  * checks that a seq interval is valid, i.e. from <= to and to < length.
991  *
992  *****************************************************************************/
993 static bool SeqIntCheckCpp(const CSeq_loc& loc)
994 {
/* NOTE(review): the declaration of "len" (and therefore the length used
 * when the Bioseq cannot be resolved) is missing from this listing.
 */
996
997  CBioseq_Handle bio_h = GetScope().GetBioseqHandle(*loc.GetId());
/* Use the real sequence length only when the handle can supply it. */
998  if (bio_h.CanGetInst() && bio_h.CanGetInst_Length())
999  len = bio_h.GetBioseqLength();
1000
1001  return loc.GetInt().GetFrom() <= loc.GetInt().GetTo() && loc.GetInt().GetTo() < len;
1002 }
1003 
1004 /*****************************************************************************
1005  *
1006  * bool SeqPntCheckCpp(loc) replaces the C-toolkit 'Boolean SeqPntCheck(SeqPntPtr spp)':
1007  * checks that a seq point is valid, i.e. point < length.
1008  *
1009  *****************************************************************************/
1010 static bool SeqPntCheckCpp(const CSeq_loc& loc)
1011 {
/* NOTE(review): the declaration of "len" (and therefore the length used
 * when the Bioseq cannot be resolved) is missing from this listing.
 */
1013
1014  CBioseq_Handle bio_h = GetScope().GetBioseqHandle(*loc.GetId());
/* Use the real sequence length only when the handle can supply it. */
1015  if (bio_h.CanGetInst() && bio_h.CanGetInst_Length())
1016  len = bio_h.GetBioseqLength();
1017
1018  return loc.GetPnt().GetPoint() < len;
1019 }
1020 
1021 /*****************************************************************************
1022  *
1023  * bool PackSeqPntCheckCpp(loc) replaces the C-toolkit 'Boolean PackSeqPntCheck (pspp)':
1024  * checks that every point of a packed-point location is < length.
1025  *****************************************************************************/
1026 static bool PackSeqPntCheckCpp(const CSeq_loc& loc)
1027 {
/* NOTE(review): the declaration of "len" (and therefore the length used
 * when the Bioseq cannot be resolved) is missing from this listing.
 */
1029
1030  CBioseq_Handle bio_h = GetScope().GetBioseqHandle(*loc.GetId());
/* Use the real sequence length only when the handle can supply it. */
1031  if (bio_h.CanGetInst() && bio_h.CanGetInst_Length())
1032  len = bio_h.GetBioseqLength();
1033
/* The first out-of-range point makes the whole location invalid. */
1034  for (TSeqPos point : loc.GetPacked_pnt().GetPoints()) {
1035  if (point >= len)
1036  return false;
1037  }
1038
1039  return true;
1040 }
1041 
1042 /**********************************************************/
1043 /* Checks every range of "locs"; ranges on the record named by "accession"
1044  are also compared by strand. returns : 2 = Ok, 1 = mixed strands, 0 = error in location */
1045 static Uint1 FTASeqLocCheck(const CSeq_loc& locs, char* accession)
1046 {
/* 99 marks "no strand recorded yet". */
1047  Uint1 strand = 99;
1048  Uint1 retval = 2;
1049
1050  CSeq_loc_CI ci(locs);
1051
1052  bool good = true;
1053  for (; ci; ++ci) {
1054  CConstRef<CSeq_loc> cur_loc = ci.GetRangeAsSeq_loc();
1055
1056  const CSeq_id* cur_id = nullptr;
1057
/* Validate the range according to its type; cur_id is filled in only for
 * ranges that passed.
 */
1058  switch (cur_loc->Which()) {
1059  case CSeq_loc::e_Int:
1060  good = SeqIntCheckCpp(*cur_loc);
1061  if (good)
1062  cur_id = cur_loc->GetId();
1063  break;
1064
1065  case CSeq_loc::e_Pnt:
1066  good = SeqPntCheckCpp(*cur_loc);
1067  if (good)
1068  cur_id = cur_loc->GetId();
1069  break;
1070
/* NOTE(review): the "case" label for the next branch is missing from this
 * listing; since it calls PackSeqPntCheckCpp() it is presumably the
 * packed-point case -- confirm.
 */
1072  good = PackSeqPntCheckCpp(*cur_loc);
1073  if (good)
1074  cur_id = cur_loc->GetId();
1075  break;
1076
1077  case CSeq_loc::e_Bond:
1078  if (! cur_loc->GetBond().CanGetA())
1079  good = false;
1080
1081  if (good)
1082  cur_id = cur_loc->GetId();
1083  break;
1084
1085  case CSeq_loc::e_Empty:
1086  case CSeq_loc::e_Whole:
1087  cur_id = cur_loc->GetId();
1088  break;
1089
/* Any other type is neither validated nor strand-checked. */
1090  default:
1091  continue;
1092  }
1093
/* The first invalid range ends the scan; 0 is returned below. */
1094  if (! good)
1095  break;
1096
1097  if (! accession || ! cur_id)
1098  continue;
1099
/* Only these id types are compared against "accession". */
1100  if (! cur_id->IsGenbank() && ! cur_id->IsEmbl() && ! cur_id->IsPir() &&
1101  ! cur_id->IsSwissprot() && ! cur_id->IsOther() && ! cur_id->IsDdbj() &&
1102  ! cur_id->IsPrf() && ! cur_id->IsTpg() && ! cur_id->IsTpe() &&
1103  ! cur_id->IsTpd() && ! cur_id->IsGpipe())
1104  continue;
1105
1106  const CTextseq_id* text_id = cur_id->GetTextseq_Id();
1107
1108  if (! text_id || ! text_id->CanGetAccession())
1109  continue;
1110
/* The first matching range fixes the reference strand; any later matching
 * range on a different strand downgrades the result to 1.
 */
1111  if (text_id->GetAccession() == accession) {
1112  if (strand == 99)
1113  strand = cur_loc->GetStrand();
1114  else if (strand != cur_loc->GetStrand())
1115  retval = 1;
1116  }
1117  }
1118
1119  if (! good)
1120  return 0;
1121
1122  return retval;
1123 }
1124 
1125 /**********************************************************/
1126 static void fta_strip_aa(char* str)
1127 {
1128  if (! str || *str == '\0')
1129  return;
1130 
1131  while (str) {
1132  str = StringStr(str, "aa");
1133  if (str)
1134  fta_StringCpy(str, str + 2);
1135  }
1136 }
1137 
1138 /**********************************************************
1139  *
1140  * static SeqFeatPtr SeqFeatPub(pp, entry, hsfp, seq_id,
1141  * col_data, ibp):
1142  *
1143  * 5-26-93
1144  *
1145  **********************************************************/
1146 static void SeqFeatPub(ParserPtr pp, const DataBlk& entry, TSeqFeatList& feats, TSeqIdList& seqids, Int4 col_data, IndexblkPtr ibp)
1147 {
1148  DataBlkPtr dbp;
1149  DataBlkPtr subdbp;
1150  char* p;
1151  char* q;
1152  char* location = nullptr;
1153 
1154  bool err = false;
1155  Uint1 i;
1156 
1157  /* REFERENCE, to Seq-feat
1158  */
1159  if (pp->format == Parser::EFormat::XML)
1160  dbp = XMLBuildRefDataBlk(entry.mOffset, ibp->xip, ParFlat_REF_BTW);
1161  else
1162  dbp = TrackNodeType(entry, ParFlat_REF_BTW);
1163  if (! dbp)
1164  return;
1165 
1166 
1167  for (; dbp; dbp = dbp->mpNext) {
1168  if (dbp->mType != ParFlat_REF_BTW)
1169  continue;
1170 
1171  CRef<CPubdesc> pubdesc = DescrRefs(pp, dbp, col_data);
1172  if (pubdesc.Empty())
1173  continue;
1174 
1175  CRef<CSeq_feat> feat(new CSeq_feat);
1176  feat->SetData().SetPub(*pubdesc);
1177 
1178  location = nullptr;
1179  if (pp->format == Parser::EFormat::XML) {
1180  location = XMLFindTagValue(dbp->mOffset, static_cast<XmlIndex*>(dbp->mpData), INSDREFERENCE_POSITION);
1181  if (! location) {
1182  q = XMLFindTagValue(dbp->mOffset, static_cast<XmlIndex*>(dbp->mpData), INSDREFERENCE_REFERENCE);
1183  if (q) {
1184  for (p = q; *p != '\0' && *p != '(';)
1185  p++;
1186  if (*p != '\0')
1187  location = CheckLocStr(p + 1);
1188  MemFree(q);
1189  }
1190  } else {
1191  p = StringChr(location, ';');
1192  if (p) {
1193  string s("join(");
1194  s.append(location);
1195  s.append(")");
1196  MemFree(location);
1197  location = StringSave(s.c_str());
1198  }
1199  }
1200  } else if (pp->format == Parser::EFormat::GenBank) {
1201  for (p = dbp->mOffset + col_data; *p != '\0' && *p != '(';)
1202  p++;
1203  location = CheckLocStr(string(p, dbp->mOffset + dbp->len - p).c_str());
1204  } else if (pp->format == Parser::EFormat::EMBL) {
1205  subdbp = static_cast<DataBlk*>(dbp->mpData);
1206  for (; subdbp; subdbp = subdbp->mpNext) {
1207  if (subdbp->mType != ParFlat_RP)
1208  continue;
1209 
1210  for (p = subdbp->mOffset; *p != '\0' && isdigit(*p) == 0;)
1211  p++;
1212  if (StringChr(p, ',')) {
1213  string s = "join(";
1214  s += p;
1215  s += ")";
1216  location = StringSave(s.c_str());
1217  } else
1218  location = StringSave(p);
1219  break;
1220  }
1221  }
1222  if (! location || *location == '\0') {
1223  ErrPostEx(SEV_REJECT, ERR_REFERENCE_UnparsableLocation, "NULL or empty reference location. Entry dropped.");
1224  err = true;
1225  if (location)
1226  MemFree(location);
1227  break;
1228  }
1229 
1230  if (ibp->is_prot)
1232 
1233  if (pp->buf)
1234  MemFree(pp->buf);
1235  pp->buf = nullptr;
1236 
1237  GetSeqLocation(*feat, location, seqids, &err, pp, "pub");
1238 
1239  if (err) {
1240  ErrPostEx(SEV_REJECT, ERR_REFERENCE_UnparsableLocation, "Unparsable reference location. Entry dropped.");
1241  MemFree(location);
1242  break;
1243  }
1244 
1245  i = FTASeqLocCheck(feat->GetLocation(), ibp->acnum);
1246 
1247  if (i == 0) {
1249  if (pp->debug) {
1250  feats.push_back(feat);
1251  }
1252  } else {
1253  if (i == 1) {
1254  ErrPostEx(SEV_WARNING, ERR_LOCATION_MixedStrand, "Mixed strands in SeqLoc: %s", location);
1255  }
1256  feats.push_back(feat);
1257  }
1258  if (location)
1259  MemFree(location);
1260  }
1261 
1262  if (! err)
1263  return;
1264 
1265  ibp->drop = true;
1266  feats.clear();
1267 }
1268 
1269 /**********************************************************
1270  *
1271  * static SeqFeatPtr ImpFeatPub(pp, entry, hsfp, seq_id,
1272  * col_data, ibp):
1273  *
1274  * 5-26-93
1275  *
1276  **********************************************************/
1277 static void ImpFeatPub(ParserPtr pp, const DataBlk& entry, TSeqFeatList& feats, CSeq_id& seq_id, Int4 col_data, IndexblkPtr ibp)
1278 {
1279  DataBlkPtr dbp;
1280 
1281  bool first;
1282 
1283  /* REFERENCE, Imp-feat
1284  */
1285  if (pp->format == Parser::EFormat::XML)
1286  dbp = XMLBuildRefDataBlk(entry.mOffset, ibp->xip, ParFlat_REF_SITES);
1287  else
1288  dbp = TrackNodeType(entry, ParFlat_REF_SITES);
1289  if (! dbp)
1290  return;
1291 
1292  CRef<CSeq_feat> feat;
1293  for (first = true; dbp; dbp = dbp->mpNext) {
1294  if (dbp->mType != ParFlat_REF_SITES)
1295  continue;
1296 
1297  CRef<CPubdesc> pubdesc = DescrRefs(pp, dbp, col_data);
1298  if (pubdesc.Empty() || ! pubdesc->IsSetPub())
1299  continue;
1300 
1301  if (first) {
1302  feat.Reset(new CSeq_feat);
1303 
1304  CImp_feat& imp_feat = feat->SetData().SetImp();
1305  imp_feat.SetKey("Site-ref");
1306  imp_feat.SetLoc("sites");
1307 
1308  feat->SetLocation(*fta_get_seqloc_int_whole(seq_id, ibp->bases));
1309  first = false;
1310  }
1311 
1312  CRef<CPub> pub(new CPub);
1313  pub->SetEquiv(pubdesc->SetPub());
1314 
1315  feat->SetCit().SetPub().push_back(pub);
1316 
1317  if (pubdesc->IsSetComment())
1318  feat->SetComment(pubdesc->GetComment());
1319  else
1320  feat->ResetComment();
1321  }
1322 
1323  if (! first && feat.NotEmpty())
1324  feats.push_back(feat);
1325 }
1326 
1327 /**********************************************************/
1328 static void fta_fake_gbparse_err_handler(const Char*, const Char*)
1329 {
1330 }
1331 
1332 /**********************************************************/
// NOTE(review): the signature line of this function is not present in this
// view. Call sites in this file invoke it as
// location_to_string_or_unknown(feat.GetLocation()) and store the result in a
// 'string' -- confirm the exact declaration against the repository.
//
// Returns the text produced by location_to_string(loc), or the literal
// "unknown location" when that text is empty.
1334 {
1335  auto ret = location_to_string(loc);
1336  if (! ret.empty())
1337  return ret;
1338 
     // Fallback so that callers always get a printable, non-empty text.
1339  return "unknown location";
1340 }
1341 
1342 /**********************************************************/
1343 static CRef<CSeq_loc> GetTrnaAnticodon(const CSeq_feat& feat, char* qval, const TSeqIdList& seqids, bool accver)
1344 {
1345  char* loc_str;
1346  char* p;
1347  char* q;
1348  bool fake1;
1349  Int4 range;
1350  Int4 pars;
1351  Char ch;
1352  int fake3;
1353 
1354  CRef<CSeq_loc> ret;
1355 
1356  if (! qval)
1357  return ret;
1358 
1359  p = StringStr(qval, "pos:");
1360  if (! p)
1361  return ret;
1362 
1363  for (q = p + 4; *q == ' ';)
1364  q++;
1365 
1366  for (pars = 0, p = q; *p != '\0'; p++) {
1367  if (*p == ',' && pars == 0)
1368  break;
1369  if (*p == '(')
1370  pars++;
1371  else if (*p == ')') {
1372  pars--;
1373  if (pars == 0) {
1374  p++;
1375  break;
1376  }
1377  }
1378  }
1379 
1380  ch = *p;
1381  *p = '\0';
1382  loc_str = StringSave(q);
1383  *p = ch;
1384 
1386  ret = xgbparseint_ver(loc_str, fake1, fake3, seqids, accver);
1388 
1389  if (ret.Empty()) {
1390  string loc = location_to_string_or_unknown(feat.GetLocation());
1391 
1392  ErrPostEx(SEV_ERROR, ERR_FEATURE_InvalidAnticodonPos, "Invalid position element for an /anticodon qualifier : \"%s\" : qualifier dropped : feature location \"%s\".", loc_str, (loc.empty()) ? "unknown" : loc.c_str());
1393 
1394  MemFree(loc_str);
1395 
1396  return ret;
1397  }
1398 
1399  range = sequence::GetLength(*ret, &GetScope());
1400  if (range != 3) {
1401  string loc = location_to_string_or_unknown(feat.GetLocation());
1402 
1403  if (range == 4)
1404  ErrPostEx(SEV_WARNING, ERR_FEATURE_FourBaseAntiCodon, "tRNA feature at \"%s\" has anticodon with location spanning four bases: \"%s\". Cannot generate corresponding codon value from the DNA sequence.", loc.empty() ? "unknown" : loc.c_str(), loc_str);
1405  else
1406  ErrPostEx(SEV_ERROR, ERR_FEATURE_StrangeAntiCodonSize, "tRNA feature at \"%s\" has anticodon of an unusual size: \"%s\". Cannot generate corresponding codon value from the DNA sequence.", loc.empty() ? "unknown" : loc.c_str(), loc_str);
1407  }
1408 
1409  // Comparing two locations ignoring their IDs
1410  // Anticodon should be inside the original location (may be the same)
1411  CRange<TSeqPos> anticodon_range = ret->GetTotalRange();
1412  CRange<TSeqPos> xrange = feat.GetLocation().GetTotalRange().IntersectionWith(anticodon_range);
1413 
1414  if (xrange != anticodon_range) {
1415  string loc = location_to_string_or_unknown(feat.GetLocation());
1416 
1417  ErrPostEx(SEV_ERROR, ERR_FEATURE_BadAnticodonLoc, "Anticodon location \"%s\" does not fall within tRNA feature at \"%s\".", loc_str, loc.empty() ? "unknown" : loc.c_str());
1418 
1419  MemFree(loc_str);
1420  ret.Reset();
1421  return ret;
1422  }
1423 
1424  MemFree(loc_str);
1425  return ret;
1426 }
1427 
1428 /**********************************************************/
// Determines a name for an rRNA feature and stores it with
// rna_ref.SetExt().SetName(). The text comes from the "product" qualifier or,
// when that is empty and rna_ref is of type rRNA, from a short feature comment
// ending in "S ribosomal RNA" / "S rRNA". The text is then rewritten in several
// passes towards the spelling "ribosomal RNA". Nothing is stored when no text
// is found.
1429 static void fta_parse_rrna_feat(CSeq_feat& feat, CRNA_ref& rna_ref)
1430 {
1431  char* qval;
1432  char* p;
1433  char* q;
1434  Char ch;
1435 
     // presumably GetTheQualValue() detaches the qualifier and returns an
     // allocated copy of its value (the result is released with MemFree below)
     // -- confirm against its definition.
1436  qval = GetTheQualValue(feat.SetQual(), "product");
1437  if (feat.GetQual().empty())
1438  feat.ResetQual();
1439 
1440  string qval_str;
1441  if (qval) {
1442  qval_str = qval;
1443  MemFree(qval);
1444  qval = nullptr;
1445  }
1446 
1447  size_t len = 0;
     // Fallback: promote the comment to the product text when it is short and
     // ends with one of the two recognised rRNA suffixes. The second branch is
     // only reached for lengths 7..15 because 16..19 are taken by the first.
1448  if (qval_str.empty() && feat.IsSetComment() && rna_ref.GetType() == CRNA_ref::eType_rRNA) {
1449  string comment = feat.GetComment();
1450  len = comment.size();
1451 
1452  if (len > 15 && len < 20) {
1453  if (StringEquNI(comment.c_str() + len - 15, "S ribosomal RNA", 15)) {
1454  qval_str = comment;
1455  feat.ResetComment();
1456  }
1457  } else if (len > 6 && len < 20) {
1458  if (StringEquNI(comment.c_str() + len - 6, "S rRNA", 6)) {
1459  qval_str = comment;
1460  feat.ResetComment();
1461  }
1462  }
1463  }
1464 
1465  if (qval_str.empty())
1466  return;
1467 
     // Pass 1: "ribosomal rrna" -> "ribosomal rna" (the character at offset 10
     // is removed by shifting the tail left by one).
1468  qval = StringSave(qval_str.c_str());
1469  for (p = qval; p; p += 13) {
1470  p = StringIStr(p, "ribosomal rrna");
1471  if (! p)
1472  break;
1473  fta_StringCpy(p + 10, p + 11);
1474  }
1475 
     // Pass 2: "ribosomalrna" -> "ribosomal RNA". The buffer is rebuilt, so
     // 'len' records the offset at which the search resumes in the new buffer.
1476  for (p = qval; p; p = qval + len) {
1477  p = StringIStr(p, "ribosomalrna");
1478  if (! p)
1479  break;
1480  p[9] = '\0';
1481  string s(qval);
1482  s.append(" RNA");
1483  s.append(p + 12);
1484  len = p - qval + 13;
1485  MemFree(qval);
1486  qval = StringSave(s.c_str());
1487  }
1488 
     // Pass 3: the first " rrna" is expanded to " ribosomal RNA".
1489  if (qval) {
1490  p = StringIStr(qval, " rrna");
1491  if (p) {
1492  *p = '\0';
1493  string s(qval);
1494  s.append(" ribosomal RNA");
1495  s.append(p + 5);
1496  MemFree(qval);
1497  qval = StringSave(s.c_str());
1498  }
1499  }
1500 
     // Pass 4: every "ribosomal DNA" / "ribosomal RNA" gets the three
     // characters after "ribosomal " overwritten with "RNA".
1501  for (p = qval, q = p; q; q = p + 13) {
1502  p = StringIStr(q, "ribosomal DNA");
1503  if (! p) {
1504  p = StringIStr(q, "ribosomal RNA");
1505  if (! p)
1506  break;
1507  }
1508  p[10] = 'R';
1509  p[11] = 'N';
1510  p[12] = 'A';
1511  }
1512 
     // Pass 5: a trailing "s ribosomal RNA" directly preceded by a digit gets
     // its 's' replaced by an upper-case 'S'.
1513  p = StringIStr(qval, "s ribosomal RNA");
1514  if (p && p > qval && p[15] == '\0') {
1515  p--;
1516  if (*p >= '0' && *p <= '9')
1517  *++p = 'S';
1518  }
1519 
     // Pass 6: a "ribosomal" that is not at the start of the text, is followed
     // by a blank or the end of the text, and is not already followed by
     // " RNA" gets " RNA" inserted after it.
1520  for (p = qval;;) {
1521  p = StringIStr(p, "ribosomal");
1522  if (! p)
1523  break;
1524  if (p == qval || (p[9] != ' ' && p[9] != '\0')) {
1525  p += 9;
1526  continue;
1527  }
1528  if (StringEquN(p + 9, " RNA", 4)) {
1529  p += 13;
1530  continue;
1531  }
1532  len = p - qval + 14;
1533  p += 9;
1534  ch = *p;
1535  *p = '\0';
1536  string s(qval);
1537  s.append(" RNA");
1538  *p = ch;
1539  s.append(p);
1540  MemFree(qval);
1541  qval = StringSave(s.c_str());
1542  p = qval + len;
1543  }
1544 
     // Pass 7: an immediately repeated " ribosomal RNA" is collapsed.
1545  for (p = qval;;) {
1546  p = StringIStr(p, " ribosomal RNA");
1547  if (! p)
1548  break;
1549  p += 14;
1550  if (StringEquNI(p, " ribosomal RNA", 14))
1551  fta_StringCpy(p, p + 14);
1552  }
1553 
1554  DeleteQual(feat.SetQual(), "product");
1555  if (feat.GetQual().empty())
1556  feat.ResetQual();
1557 
     // Names longer than 511 characters are cut to 511, ending in '>'.
1558  if (StringLen(qval) > 511) {
1559  qval[510] = '>';
1560  qval[511] = '\0';
1561  p = StringSave(qval);
1562  MemFree(qval);
1563  qval = p;
1564  }
1565 
1566  rna_ref.SetExt().SetName(qval);
1567  MemFree(qval);
1568 }
1569 
1570 /**********************************************************/
// NOTE(review): the signature line of this function is not present in this
// view. It is called elsewhere in this file as fta_get_aa_from_symbol(comm[9])
// with the result passed to SetNcbieaa() -- confirm the exact declaration.
//
// Scans the 'aacodons' table (terminated by an entry whose 'straa' is null)
// for an entry whose 'intaa' equals 'ch'. Returns that 'intaa', or 0 when no
// entry matches.
1572 {
1573  const AaCodons* acp;
1574 
1575  for (acp = aacodons; acp->straa; acp++)
1576  if (acp->intaa == ch)
1577  break;
     // A non-null 'straa' here means the loop stopped on a match.
1578  if (acp->straa)
1579  return (acp->intaa);
1580 
1581  return (0);
1582 }
1583 
1584 /**********************************************************/
// NOTE(review): the signature line of this function is not present in this
// view. It is called elsewhere in this file as fta_get_aa_from_string(<char*>)
// with the result passed to SetNcbieaa() -- confirm the exact declaration.
//
// Translates an amino acid name into its code. 'str' is compared without
// regard to case first against the names in the 'taa' table (returning the
// entry's 'aa'), then against 'straa' in the 'aacodons' table (returning the
// entry's 'intaa'). Returns 0 when neither table has a match.
1586 {
1587  const AaCodons* acp;
1588  const TrnaAa* tap;
1589 
     // Both tables are terminated by an entry with a null name pointer.
1590  for (tap = taa; tap->name; tap++)
1591  if (NStr::CompareNocase(str, tap->name) == 0)
1592  break;
1593  if (tap->name)
1594  return (tap->aa);
1595 
1596  for (acp = aacodons; acp->straa; acp++)
1597  if (NStr::CompareNocase(acp->straa, str) == 0)
1598  break;
1599  if (acp->straa)
1600  return (acp->intaa);
1601 
1602  return (0);
1603 }
1604 
1605 /**********************************************************/
1606 static int get_aa_from_trna(const CTrna_ext& trna)
1607 {
1608  int ret = 0;
1609  if (trna.IsSetAa() && trna.GetAa().IsNcbieaa())
1610  ret = trna.GetAa().GetNcbieaa();
1611 
1612  return ret;
1613 }
1614 
1615 /**********************************************************/
// Tries to recognise an amino acid in the text of a tRNA product (or comment).
//
// feat     feature whose comment receives "fMet" when an fMet form is found.
// product  text to analyse; null or shorter than 7 characters yields an
//          extension with no amino acid set.
// remove   optional output flag: set to 1 when the text consists of nothing
//          but the tRNA tag and the recognised amino acid and contained no
//          digits; 0 otherwise.
// Returns a newly created CTrna_ext; its amino acid is set only when one of
// the words around the tag was looked up (the lookup may yield 0).
1616 static CRef<CTrna_ext> fta_get_trna_from_product(CSeq_feat& feat, const Char* product, unsigned char* remove)
1617 {
1618  const char** b;
1619 
1620  char* p;
1621  char* q;
1622  char* start;
1623  char* end;
1624  char* first;
1625  char* second;
1626  char* third;
1627  char* fourth;
1628  bool fmet;
1629  char* prod;
1630 
1631  if (remove)
1632  *remove = 0;
1633 
1634  CRef<CTrna_ext> ret(new CTrna_ext);
1635 
1636  if (! product || StringLen(product) < 7)
1637  return ret;
1638 
     // Normalise a private copy: lower-case letters are upper-cased by
     // clearing bit 040, and everything that is neither a letter nor a
     // parenthesis becomes a blank. Digits are remembered in 'digits'.
1639  bool digits = false;
1640  prod = StringSave(product);
1641  for (p = prod; *p != '\0'; p++) {
1642  if (*p >= 'a' && *p <= 'z')
1643  *p &= ~040;
1644  else if ((*p < 'A' || *p > 'Z') && *p != '(' && *p != ')') {
1645  if (*p >= '0' && *p <= '9')
1646  digits = true;
1647  *p = ' ';
1648  }
1649  }
1650  ShrinkSpaces(prod);
1651 
     // Locate the first tag from the null-terminated 'trna_tags' list.
1652  for (b = trna_tags; *b; b++) {
1653  start = StringStr(prod, *b);
1654  if (start)
1655  break;
1656  }
1657  if (! *b) {
1658  MemFree(prod);
1659  return ret;
1660  }
1661 
     // Parentheses after the tag are turned into blanks.
     // NOTE(review): 'start' and 'end' are computed before the second
     // ShrinkSpaces() call; presumably that call edits the buffer in place, so
     // they may no longer line up with the tag afterwards -- verify.
1662  end = start + StringLen(*b);
1663  for (p = end; *p != '\0'; p++)
1664  if (*p == '(' || *p == ')')
1665  *p = ' ';
1666  ShrinkSpaces(prod);
1667 
     // The text is nothing but the tag itself.
1668  if (start == prod && *end == '\0') {
1669  if (remove && ! digits)
1670  *remove = 1;
1671  MemFree(prod);
1672  return ret;
1673  }
1674 
     // 'first' is the word before the tag; 'second', 'third' and 'fourth' are
     // taken from the text after it.
1675  first = nullptr;
1676  second = nullptr;
1677  third = nullptr;
1678  fourth = nullptr;
1679  for (p = end; *p == ' ' || *p == ')' || *p == '(';)
1680  p++;
1681  q = p;
     // "F MET" / "F MT" contain a blank, so they are stepped over as a unit
     // before the rest of the word is scanned.
1682  if (StringEquN(p, "F MET", 5))
1683  p += 5;
1684  else if (StringEquN(p, "F MT", 4))
1685  p += 4;
1686  while (*p >= 'A' && *p <= 'Z')
1687  p++;
1688  if (p > q) {
1689  if (*p != '\0')
1690  *p++ = '\0';
1691  second = q;
1692  }
1693  while (*p == ' ' || *p == ')' || *p == '(')
1694  p++;
1695  for (q = p; *p >= 'A' && *p <= 'Z';)
1696  p++;
1697  if (p > q) {
1698  if (*p != '\0')
1699  *p++ = '\0';
     // A single-letter word is skipped and the word after it becomes 'third'.
1700  if (q[1] == '\0') {
1701  while (*p == ' ' || *p == ')' || *p == '(')
1702  p++;
1703  for (q = p; *p >= 'A' && *p <= 'Z';)
1704  p++;
1705  if (p > q) {
1706  if (*p != '\0')
1707  *p++ = '\0';
1708  third = q;
1709  }
1710  } else
1711  third = q;
1712 
1713  while (*p == ' ' || *p == '(' || *p == ')')
1714  p++;
1715  if (*p != '\0')
1716  fourth = p;
1717  }
     // Walk backwards from the tag to isolate the word in front of it.
1718  if (start > prod) {
1719  for (p = start - 1; *p == ' ' || *p == ')' || *p == '('; p--)
1720  if (p == prod)
1721  break;
1722 
     // A parenthesised group directly before the tag is skipped.
1723  if (p > prod && p[1] == ')') {
1724  for (p--; *p != '('; p--)
1725  if (p == prod)
1726  break;
1727  if (p > prod) {
     // NOTE(review): '(' is tested twice on the next line; the sibling loops
     // test ' ', ')' and '(' -- the second '(' was probably meant to be ')'.
1728  for (p--; *p == ' ' || *p == '(' || *p == '('; p--)
1729  if (p == prod)
1730  break;
1731  }
1732  }
1733  if (p > prod) {
1734  for (q = p++; *q >= 'A' && *q <= 'Z'; q--)
1735  if (q == prod)
1736  break;
1737  if (*q < 'A' || *q > 'Z')
1738  q++;
1739  if (p > q) {
1740  *p = '\0';
1741  first = q;
1742  }
1743  }
1744  }
1745 
     // The word after the tag is tried first; fMet spellings are unified to
     // "FMET" before the lookup.
1746  fmet = false;
1747  if (second) {
1748  if (StringEqu(second, "F MET") ||
1749  StringEqu(second, "FMET") ||
1750  StringEqu(second, "F MT")) {
1751  StringCpy(second, "FMET");
1752  fmet = true;
1753  }
1754 
1755  ret->SetAa().SetNcbieaa(fta_get_aa_from_string(second));
1756  if (get_aa_from_trna(*ret) != 0)
1757  second = nullptr;
1758  }
1759 
     // Otherwise the word before the tag is tried.
1760  if (get_aa_from_trna(*ret) == 0 && first) {
1761  ret->SetAa().SetNcbieaa(fta_get_aa_from_string(first));
1762  if (get_aa_from_trna(*ret) != 0 && first == prod)
1763  first = nullptr;
1764  }
1765 
     // Every word was consumed and no digits were seen: report through
     // 'remove' that the text carries no further information.
1766  if (! first && ! second && ! third && ! fourth && remove && ! digits)
1767  *remove = 1;
1768  MemFree(prod);
1769 
1770  if (! fmet)
1771  return ret;
1772 
     // Record the fMet form in the feature comment unless already mentioned.
1773  if (! feat.IsSetComment())
1774  feat.SetComment("fMet");
1775  else if (! StringIStr(feat.GetComment().c_str(), "fmet")) {
1776  string& comment = feat.SetComment();
1777  comment += "; fMet";
1778  }
1779 
1780  return ret;
1781 }
1782 
1783 /**********************************************************/
1784 static CRef<CTrna_ext> fta_get_trna_from_comment(const Char* comment, unsigned char* remove)
1785 {
1786  char* comm;
1787  char* p;
1788  char* q;
1789 
1790  CRef<CTrna_ext> ret(new CTrna_ext);
1791 
1792  *remove = 0;
1793  if (! comment)
1794  return ret;
1795 
1796  comm = StringSave(comment);
1797  for (p = comm; *p != '\0'; p++) {
1798  if (*p >= 'a' && *p <= 'z')
1799  *p &= ~040;
1800  else if (*p < 'A' || *p > 'Z')
1801  *p = ' ';
1802  }
1803  ShrinkSpaces(comm);
1804 
1805  if (StringEquN(comm, "CODON RECOGNIZED ", 17)) {
1806  p = comm + 17;
1807  q = StringChr(p, ' ');
1808  if (q && StringEqu(q + 1, "PUTATIVE"))
1809  *q = '\0';
1810  if (! StringChr(p, ' ') && StringLen(p) == 3) {
1811  MemFree(comm);
1812  *remove = q ? 2 : 1;
1813  return ret;
1814  }
1815  }
1816 
1817  if (StringEquN(comm, "PUTATIVE ", 9) && comm[10] == ' ' &&
1818  comm[14] == ' ' && StringEquN(&comm[15], "TRNA", 4)) {
1819  ret->SetAa().SetNcbieaa(fta_get_aa_from_symbol(comm[9]));
1820  if (get_aa_from_trna(*ret) != 0) {
1821  MemFree(comm);
1822  return ret;
1823  }
1824  }
1825 
1826  for (q = comm, p = q; p;) {
1827  p = StringChr(p, ' ');
1828  if (p)
1829  *p++ = '\0';
1830 
1831  ret->SetAa().SetNcbieaa(fta_get_aa_from_string(q));
1832  if (get_aa_from_trna(*ret) != 0)
1833  break;
1834  q = p;
1835  }
1836 
1837  MemFree(comm);
1838  return ret;
1839 }
1840 
1841 /**********************************************************/
1842 static int get_first_codon_from_trna(const CTrna_ext& trna)
1843 {
1844  int ret = 255;
1845  if (trna.IsSetCodon() && ! trna.GetCodon().empty())
1846  ret = *trna.GetCodon().begin();
1847 
1848  return ret;
1849 }
1850 
1851 /**********************************************************/
// Converts an Imp-feat whose key names an RNA type into an RNA-ref feature.
//
// The RNA type is derived from the key's position in ParFlat_RNA_array (255
// when the key is not listed). rRNA is delegated to fta_parse_rrna_feat().
// For the other types the product / class / tag_peptide qualifiers are moved
// into the RNA-ref, and for tRNA the amino acid and anticodon are gathered
// from the /anticodon and /product qualifiers and the feature comment.
// Nothing is done when the feature data is not an Imp-feat.
//
// NOTE(review): two lines appear to be missing from this view (marked below),
// so the block as shown is not syntactically complete; 'source' is not
// referenced by any visible line.
1852 static void GetRnaRef(CSeq_feat& feat, CBioseq& bioseq, Parser::ESource source, bool accver)
1853 {
1854  char* qval = nullptr;
1855  char* p;
1856 
1857  Uint1 remove;
1858 
1859  Int2 type;
1860 
1861  if (! feat.GetData().IsImp())
1862  return;
1863 
1864  const CImp_feat& imp_feat = feat.GetData().GetImp();
1865 
1866  CRef<CRNA_ref> rna_ref(new CRNA_ref);
1867 
     // The array index is shifted by one to obtain the RNA type value.
1868  type = MatchArrayString(ParFlat_RNA_array, imp_feat.GetKey().c_str());
1869  if (type < 0)
1870  type = 255;
1871  else
1872  ++type;
1873 
1874  rna_ref->SetType(static_cast<CRNA_ref::EType>(type));
1875 
     // NOTE(review): 'imp_feat' refers to the Imp data that SetRna() replaces
     // here; it is not used after this line.
1876  feat.SetData().SetRna(*rna_ref);
1877 
1878  if (type == CRNA_ref::eType_rRNA) {
1879  fta_parse_rrna_feat(feat, *rna_ref);
1880  return;
1881  }
1882 
1883  CRef<CRNA_gen> rna_gen;
1884  CRef<CRNA_qual_set> rna_quals;
1885 
     // NOTE(review): if GetTheQualValue() returns allocated memory (see the
     // remark on the "product" call below), 'p' is not released in the two
     // branches that follow -- possible leak, verify.
1886  if (type == CRNA_ref::eType_ncRNA) {
1887  p = GetTheQualValue(feat.SetQual(), "ncRNA_class");
1888  if (p) {
1889  rna_gen.Reset(new CRNA_gen);
1890  rna_gen->SetClass(p);
1891  }
1892  } else if (type == CRNA_ref::eType_tmRNA) {
1893  p = GetTheQualValue(feat.SetQual(), "tag_peptide");
1894  if (p) {
1895  CRef<CRNA_qual> rna_qual(new CRNA_qual);
1896  rna_qual->SetQual("tag_peptide");
1897  rna_qual->SetVal(p);
1898 
1899  rna_quals.Reset(new CRNA_qual_set);
1900  rna_quals->Set().push_back(rna_qual);
1901 
1902  rna_gen.Reset(new CRNA_gen);
1903  rna_gen->SetQuals(*rna_quals);
1904  }
1905  }
1906 
1907  if (type != CRNA_ref::eType_premsg && type != CRNA_ref::eType_tRNA) /* mRNA, snRNA, scRNA or other */
1908  {
1909  qval = GetTheQualValue(feat.SetQual(), "product"); // may return newly allocated memory!!!
     // The first "product" value becomes the name; a second one, when
     // present and non-empty, is appended to the feature comment.
1910  if (qval) {
1911  p = GetTheQualValue(feat.SetQual(), "product");
1912  if (p && p[0] != 0) {
1913  if (! feat.IsSetComment())
1914  feat.SetComment(p);
1915  else {
1916  string& comment = feat.SetComment();
1917  comment += "; ";
1918  comment += p;
1919  }
1920  }
1921  MemFree(p);
1922  }
1923 
     // NOTE(review): the continuation of this condition (listing line 1925)
     // is missing from this view.
1924  if (! qval && type == CRNA_ref::eType_mRNA &&
1926  qval = GetTheQualValue(feat.SetQual(), "standard_name");
1927 
     // For mRNA without a name: when the comment ends with " mRNA" or " RNA"
     // (optionally followed by blanks, tabs, ',' or ';'), the whole comment
     // is used as the name and the comment is removed.
1928  if (! qval && feat.IsSetComment() && type == CRNA_ref::eType_mRNA) {
1929  const Char* c_p = feat.GetComment().c_str();
1930  const Char* c_q = nullptr;
     // c_q ends up just past the last " mRNA", c_r just past the last " RNA".
1931  for (;; c_p += 5, c_q = c_p) {
1932  c_p = StringIStr(c_p, " mRNA");
1933  if (! c_p)
1934  break;
1935  }
1936 
1937  const Char* c_r = nullptr;
1938  for (c_p = feat.GetComment().c_str();; c_p += 4, c_r = c_p) {
1939  c_p = StringIStr(c_p, " RNA");
1940  if (! c_p)
1941  break;
1942  }
1943 
1944  if (c_q && c_r) {
1945  c_p = (c_q > c_r) ? c_q : c_r;
1946  } else if (c_q)
1947  c_p = c_q;
1948  else
1949  c_p = c_r;
1950 
1951  if (c_p) {
1952  while (*c_p == ' ' || *c_p == '\t' || *c_p == ',' || *c_p == ';')
1953  ++c_p;
1954 
1955  if (*c_p == '\0') {
1956  qval = StringSave(feat.GetComment().c_str());
1957  feat.ResetComment();
1958  }
1959  }
1960  }
1961 
1962  if (qval) {
     // Names longer than 511 characters are cut to 511, ending in '>'.
1963  if (StringLen(qval) > 511) {
1964  qval[510] = '>';
1965  qval[511] = '\0';
1966  p = StringSave(qval);
1967  MemFree(qval);
1968  qval = p;
1969  }
1970 
     // NOTE(review): the 'if (...) {' that opens the branch closed by
     // '} else {' below (listing line 1971) is missing from this view.
1972  if (rna_gen.Empty())
1973  rna_gen.Reset(new CRNA_gen);
1974 
1975  rna_gen->SetProduct(qval);
1976  } else {
1977  rna_ref->SetExt().SetName(qval);
1978  }
1979  }
1980  MemFree(qval);
1981  }
1982 
1983  if (feat.GetQual().empty())
1984  feat.ResetQual();
1985 
1986  if (rna_gen.NotEmpty()) {
1987  rna_ref->SetExt().SetGen(*rna_gen);
1988  }
1989 
1990  if (type != CRNA_ref::eType_tRNA) /* if tRNA and codon value exist */
1991  return;
1992 
     // Only tRNA reaches this point, and tRNA skips the block above that
     // assigns 'qval', so 'qval' is still null here.
1993  if (qval) {
1994  MemFree(qval);
1995  }
     // trnaa: extension built from the /anticodon qualifier.
1996  qval = GetTheQualValue(feat.SetQual(), "anticodon");
1997  CRef<CTrna_ext> trnaa;
1998  if (qval) {
1999  bioseq.SetInst().SetMol(CSeq_inst::eMol_na);
2000 
2001  CRef<CSeq_loc> anticodon = GetTrnaAnticodon(feat, qval, bioseq.GetId(), accver);
2002  if (anticodon.NotEmpty()) {
2003  trnaa.Reset(new CTrna_ext);
2004 
2005  /* value has format: (pos:base_range, aa:amino_acid)
2006  */
2007  trnaa->SetAa().SetNcbieaa(GetQualValueAa(qval, true));
2008  trnaa->SetAnticodon(*anticodon);
2009  rna_ref->SetExt().SetTRNA(*trnaa);
2010  }
2011 
2012  MemFree(qval);
2013  qval = nullptr;
2014  }
2015 
     // trnap: extension derived from the /product qualifier.
2016  qval = CpTheQualValue(feat.SetQual(), "product");
2017 
2018  CRef<CTrna_ext> trnap;
2019  if (qval) {
2020  trnap = fta_get_trna_from_product(feat, qval, nullptr);
2021  MemFree(qval);
2022  qval = nullptr;
2023  }
2024 
2025  if (feat.IsSetComment() && feat.GetComment().empty()) {
2026  feat.ResetComment();
2027  }
2028 
     // trnac: extension derived from the feature comment; 'remove' reports
     // whether the comment became redundant. trnac is discarded when it
     // carries neither an amino acid nor a codon.
2029  remove = 0;
2030  CRef<CTrna_ext> trnac;
2031  if (feat.IsSetComment()) {
2032  trnac = fta_get_trna_from_product(feat, feat.GetComment().c_str(), &remove);
2033 
2034  if (get_aa_from_trna(*trnac) == 0) {
2035  trnac = fta_get_trna_from_comment(feat.GetComment().c_str(), &remove);
2036  }
2037 
2038  if (get_aa_from_trna(*trnac) == 0 && get_first_codon_from_trna(*trnac) == 255) {
2039  trnac.Reset();
2040  }
2041  }
2042 
     // Merge the three candidates. Without an anticodon-based extension the
     // product-based one is preferred over the comment-based one; with one,
     // the comment-based extension is used only when its amino acid agrees
     // with it (or the anticodon one holds the value 88).
2043  if (trnaa.Empty()) {
2044  if (trnap.Empty()) {
2045  if (trnac.NotEmpty() && get_aa_from_trna(*trnac) != 0) {
2046  rna_ref->SetExt().SetTRNA(*trnac);
2047  if (remove != 0) {
2048  feat.ResetComment();
2049  }
2050  }
2051  } else {
2052  rna_ref->SetExt().SetTRNA(*trnap);
2053 
2054  if (get_aa_from_trna(*trnap) == 0) {
2055  if (trnac.NotEmpty() && get_aa_from_trna(*trnac) != 0)
2056  rna_ref->SetExt().SetTRNA(*trnac);
2057  } else if (trnac.NotEmpty()) {
2058  if (get_aa_from_trna(*trnac) == 0 && get_first_codon_from_trna(*trnac) != 255 &&
2059  get_first_codon_from_trna(*trnap) == 255 && remove != 0) {
2060  trnap->SetCodon().assign(trnac->GetCodon().begin(), trnac->GetCodon().end());
2061 
2062  feat.ResetComment();
2063  if (remove == 2)
2064  feat.SetComment("putative");
2065  }
2066 
2067  if (get_aa_from_trna(*trnac) == get_aa_from_trna(*trnap) && remove != 0) {
2068  feat.ResetComment();
2069  }
2070  }
2071  }
2072  } else {
2073  if (trnap.NotEmpty()) {
2074  trnap.Reset();
2075  }
2076 
2077  if (trnac.NotEmpty() && get_aa_from_trna(*trnac) != 0) {
2078  if (get_aa_from_trna(*trnac) == get_aa_from_trna(*trnaa) || get_aa_from_trna(*trnaa) == 88) {
     // The anticodon location is moved from trnaa to trnac.
2079  trnac->SetAnticodon(trnaa->SetAnticodon());
2080  trnaa->ResetAnticodon();
2081 
2082  if (get_first_codon_from_trna(*trnac) == 255) {
2083  trnac->SetCodon().assign(trnaa->GetCodon().begin(), trnaa->GetCodon().end());
2084  }
2085 
2086  rna_ref->SetExt().SetTRNA(*trnac);
2087  if (remove != 0) {
2088  feat.ResetComment();
2089  }
2090  }
2091  }
2092  }
2093 
2094  if (feat.GetQual().empty())
2095  feat.ResetQual();
2096 
     // A tRNA extension with neither amino acid nor anticodon is dropped.
2097  if (rna_ref->IsSetExt() && rna_ref->GetExt().IsTRNA()) {
2098  const CTrna_ext& trna = rna_ref->GetExt().GetTRNA();
2099  if (get_aa_from_trna(trna) == 0 && ! trna.IsSetAnticodon()) {
2100  rna_ref->ResetExt();
2101  }
2102  }
2103 }
2104 
2105 /**********************************************************
2106  *
2107  * static void GetImpFeat(sfp, fbp, locmap):
2108  *
2109  * 'replace' in loc will be changed later
2110  * in SeqEntryToAsn3Ex.
2111  *
2112  * 01/07/97
2113  *
2114  **********************************************************/
2115 static void GetImpFeat(CSeq_feat& feat, FeatBlkPtr fbp, bool locmap)
2116 {
2117  CRef<CImp_feat> imp_feat(new CImp_feat);
2118  imp_feat->SetKey(fbp->key);
2119 
2120  if (locmap)
2121  imp_feat->SetLoc(fbp->location);
2122 
2123  feat.SetData().SetImp(*imp_feat);
2124 }
2125 
2126 /**********************************************************/
// NOTE(review): the signature line of this function is not present in this
// view; the body works on an object named 'bio' (presumably a CBioSource&,
// given the CBioSource::TSubtype use below) -- confirm the declaration.
//
// Puts three lists of 'bio' into ascending order by exchanging out-of-order
// pairs: the organism's db cross-references (by db name, then tag), the
// orgname modifiers (by subtype, then subname) and the subtypes (by subtype,
// then name).
2128 {
2129  if (bio.CanGetOrg() && ! bio.GetOrg().GetDb().empty()) {
2130  for (COrg_ref::TDb::iterator db = bio.SetOrg().SetDb().begin(); db != bio.SetOrg().SetDb().end(); ++db) {
2131  if (! (*db)->CanGetDb())
2132  continue;
2133 
     // Compare *db with every later element and swap when *db sorts after it.
2134  COrg_ref::TDb::iterator tdb = db;
2135  for (++tdb; tdb != bio.SetOrg().SetDb().end(); ++tdb) {
2136  if (! (*tdb)->IsSetDb())
2137  continue;
2138 
2139  if ((*db)->GetDb() < (*tdb)->GetDb())
2140  continue;
2141 
     // Same db name: order by tag. Non-string tags stay in front of string
     // tags; tags of the same kind are compared by value.
2142  if ((*db)->GetDb() == (*tdb)->GetDb()) {
2143  const CObject_id& db_id = (*db)->GetTag();
2144  const CObject_id& tdb_id = (*tdb)->GetTag();
2145 
2146  if (! db_id.IsStr() && tdb_id.IsStr())
2147  continue;
2148 
2149  if (db_id.IsStr() && tdb_id.IsStr() &&
2150  db_id.GetStr() <= tdb_id.GetStr())
2151  continue;
2152 
2153  if (! db_id.IsStr() && ! tdb_id.IsStr() &&
2154  db_id.GetId() <= tdb_id.GetId())
2155  continue;
2156  }
2157 
2158  db->Swap(*tdb);
2159  }
2160  }
2161 
     // NOTE(review): the orgname modifiers are only sorted inside the
     // "db list is not empty" branch above -- confirm this nesting is intended.
2162  if (bio.GetOrg().IsSetOrgname() && bio.GetOrg().GetOrgname().IsSetMod()) {
2163  COrgName::TMod& rmod = bio.SetOrg().SetOrgname().SetMod();
2164  for (COrgName::TMod::iterator mod = rmod.begin(); mod != rmod.end(); ++mod) {
2165  COrgName::TMod::iterator tmod = mod;
2166  for (++tmod; tmod != rmod.end(); ++tmod) {
2167  if ((*mod)->GetSubtype() < (*tmod)->GetSubtype())
2168  continue;
2169 
2170  if ((*mod)->GetSubtype() == (*tmod)->GetSubtype() &&
2171  (*mod)->GetSubname() <= (*tmod)->GetSubname())
2172  continue;
2173 
2174  mod->Swap(*tmod);
2175  }
2176  }
2177  }
2178  }
2179 
2180  if (! bio.IsSetSubtype())
2181  return;
2182 
     // Subtypes: ascending by subtype, ties broken by name.
2183  CBioSource::TSubtype& rsub = bio.SetSubtype();
2184  for (CBioSource::TSubtype::iterator sub = rsub.begin(); sub != rsub.end(); ++sub) {
2185  CBioSource::TSubtype::iterator tsub = sub;
2186  for (++tsub; tsub != rsub.end(); ++tsub) {
2187  if ((*sub)->GetSubtype() < (*tsub)->GetSubtype())
2188  continue;
2189 
2190  if ((*sub)->GetSubtype() == (*tsub)->GetSubtype() &&
2191  (*sub)->GetName() <= (*tsub)->GetName())
2192  continue;
2193 
2194  sub->Swap(*tsub);
2195  }
2196  }
2197 }
2198 
2199 /**********************************************************/
// NOTE(review): the signature line of this function is not present in this
// view. It is called elsewhere in this file as ConvertQualifierValue(*qual)
// on an element of a TQualVector -- confirm the exact declaration.
//
// Replaces every ',' in the qualifier's value with ';' and posts a warning
// when at least one comma was replaced. Values without commas are left as is.
2201 {
2202  string val = qual->GetVal();
2203  bool has_comma = val.find(',') != string::npos;
2204 
2205  if (has_comma) {
2206  std::replace(val.begin(), val.end(), ',', ';');
2207  qual->SetVal(val);
2208  }
2209 
2210  if (has_comma)
2211  ErrPostEx(SEV_WARNING, ERR_QUALIFIER_MultRptUnitComma, "Converting commas to semi-colons due to format conventions for multiple /rpt_unit qualifiers.");
2212 }
2213 
2214 /**********************************************************/
// NOTE(review): the signature line of this function is not present in this
// view; the body works on a feature block pointer named 'fbp' -- confirm the
// declaration. Two body lines (listing lines 2247 and 2259) are also missing.
//
// Handles the /rpt_unit qualifiers of a feature block: each occurrence is
// reported as obsolete, empty ones are removed, and when more than one
// non-empty value exists they are merged into a single parenthesised,
// comma-separated value.
2216 {
2217  if (! fbp || fbp->quals.empty())
2218  return;
2219 
     // 'first' remembers the first non-empty rpt_unit; 'len' and 'count'
     // accumulate the total value length and number of such qualifiers.
2220  TQualVector::iterator first = fbp->quals.end();
2221  size_t len = 0, count = 0;
2222 
2223  for (TQualVector::iterator qual = fbp->quals.begin(); qual != fbp->quals.end();) {
2224  if ((*qual)->GetQual() != "rpt_unit") {
2225  ++qual;
2226  continue;
2227  }
2228 
2229  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_ObsoleteRptUnit, "Obsolete /rpt_unit qualifier found on feature \"%s\" at location \"%s\".", fbp->key ? fbp->key : "Unknown", fbp->location ? fbp->location : "unknown");
2230 
2231  if ((*qual)->GetVal().empty()) {
2232  qual = fbp->quals.erase(qual);
2233  continue;
2234  }
2235 
2236  count++;
2237  len += (*qual)->GetVal().size();
2238  if (first == fbp->quals.end())
2239  first = qual;
2240 
2241  if (count == 1) {
2242  ++qual;
2243  continue;
2244  }
2245 
     // NOTE(review): the statement guarded by this test (listing line 2247)
     // is missing from this view; as shown, the test guards the
     // ConvertQualifierValue(*qual) call below.
2246  if (count == 2)
2248 
2249  ConvertQualifierValue(*qual);
2250  ++qual;
2251  }
2252 
2253  if (count == 0)
2254  return;
2255 
2256  if (count == 1) {
2257  const string& val = (*first)->GetVal();
     // NOTE(review): the body of this branch (listing line 2259) is missing
     // from this view.
2258  if (*val.begin() == '(' && *val.rbegin() == ')') {
2260  }
2261  return;
2262  }
2263 
     // Build "(v1,v2,...)" from all collected values.
2264  string p;
2265  p.reserve(len + count + 1);
2266  p.assign("(");
2267  p.append((*first)->GetVal());
2268 
     // NOTE(review): this loop starts at 'first' itself, so as shown the first
     // value is appended a second time and the element 'first' refers to is
     // erased before (*first)->SetVal(p) runs -- verify against the repository.
2269  for (TQualVector::iterator qual = first; qual != fbp->quals.end();) {
2270  if ((*qual)->GetQual() != "rpt_unit") {
2271  ++qual;
2272  continue;
2273  }
2274 
2275  p.append(",");
2276  p.append((*qual)->GetVal());
2277  qual = fbp->quals.erase(qual);
2278  }
2279  p.append(")");
2280  (*first)->SetVal(p);
2281 }
2282 
2283 /**********************************************************/
/*
 * Validates the evidence-related qualifiers (/evidence, /experiment,
 * /inference) of one feature block.
 *
 * A single pass over fbp->quals does the following:
 *   - /experiment and /inference qualifiers whose value is the special
 *     "... no additional details recorded" text are counted and erased;
 *     any other value is counted and the qualifier is kept;
 *   - /evidence qualifiers are always erased: "experimental" and
 *     "not_experimental" (compared case-insensitively) are counted, any
 *     other value is reported with ERR_QUALIFIER_InvalidEvidence.
 *
 * Three conflicting combinations of those counters are then reported
 * with SEV_REJECT and make the function return false.
 *
 * Returns true when fbp is null, when it has no qualifiers, or when no
 * conflict was detected.
 *
 * NOTE(review): this copy of the file lacks the statements controlled by
 * the final "if" / "else if" (listing lines 2389 and 2391). Presumably
 * they record the experimental / not-experimental result on "feat" --
 * confirm against the repository before relying on this listing.
 */
2284 static bool fta_check_evidence(CSeq_feat& feat, FeatBlkPtr fbp)
2285 {
2286  Int4 evi_exp;
2287  Int4 evi_not;
2288  Int4 exp_good;
2289  Int4 exp_bad;
2290  Int4 inf_good;
2291  Int4 inf_bad;
2292  Char ch;
2293 
2294  if (! fbp || fbp->quals.empty())
2295  return true;
2296 
/* evi_*: old-style /evidence values; exp_* and inf_*: /experiment and
 * /inference with ("good") or without ("bad") the special
 * "no additional details recorded" value. */
2297  evi_exp = 0;
2298  evi_not = 0;
2299  exp_good = 0;
2300  exp_bad = 0;
2301  inf_good = 0;
2302  inf_bad = 0;
2303 
/* The iterator is advanced by hand because several branches erase the
 * current element and continue from the iterator returned by erase(). */
2304  for (TQualVector::iterator qual = fbp->quals.begin(); qual != fbp->quals.end();) {
2305  const string& qual_str = (*qual)->IsSetQual() ? (*qual)->GetQual() : "";
2306  const string& val_str = (*qual)->IsSetVal() ? (*qual)->GetVal() : "";
2307  if (qual_str == "experiment") {
2308  if (val_str == "experimental evidence, no additional details recorded") {
2309  exp_good++;
2310  qual = fbp->quals.erase(qual);
2311  } else {
2312  exp_bad++;
2313  ++qual;
2314  }
2315  continue;
2316  }
2317 
2318  if (qual_str == "inference") {
2319  if (val_str == "non-experimental evidence, no additional details recorded") {
2320  inf_good++;
2321  qual = fbp->quals.erase(qual);
2322  } else {
2323  inf_bad++;
2324  ++qual;
2325  }
2326  continue;
2327  }
2328 
2329  if (qual_str != "evidence") {
2330  ++qual;
2331  continue;
2332  }
2333 
/* From here on the qualifier is /evidence; it is erased at the bottom of
 * the loop whatever its value is. */
2334  if (NStr::CompareNocase(val_str.c_str(), "not_experimental") == 0)
2335  evi_not++;
2336  else if (NStr::CompareNocase(val_str.c_str(), "experimental") == 0)
2337  evi_exp++;
2338  else {
/* Locations longer than 50 characters are cut in place for the message
 * and the overwritten character is put back right after posting. */
2339  if (fbp->location && StringLen(fbp->location) > 50) {
2340  ch = fbp->location[50];
2341  fbp->location[50] = '\0';
2342  } else
2343  ch = '\0';
2344  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_InvalidEvidence, "Illegal value \"%s\" for /evidence qualifier on the \"%s\" feature at \"%s\". Qualifier dropped.", val_str.empty() ? "Unknown" : val_str.c_str(), fbp->key ? fbp->key : "Unknown", fbp->location ? fbp->location : "unknown location");
2345  if (ch != '\0')
2346  fbp->location[50] = ch;
2347  }
2348 
2349  qual = fbp->quals.erase(qual);
2350  }
2351 
/* Conflict 1: old /evidence together with any /experiment or /inference. */
2352  if (evi_exp + evi_not > 0 && exp_good + exp_bad + inf_good + inf_bad > 0) {
2353  if (fbp->location && StringLen(fbp->location) > 50) {
2354  ch = fbp->location[50];
2355  fbp->location[50] = '\0';
2356  } else
2357  ch = '\0';
2358  ErrPostEx(SEV_REJECT, ERR_QUALIFIER_Conflict, "Old /evidence and new /experiment or /inference qualifiers both exist on the \"%s\" feature at \"%s\". This is currently unsupported.", fbp->key ? fbp->key : "Unknown", fbp->location ? fbp->location : "unknown location");
2359  if (ch != '\0')
2360  fbp->location[50] = ch;
2361  return false;
2362  }
2363 
/* Conflict 2: an "experimental" indication and a "not experimental"
 * indication are both present. */
2364  if (evi_exp + exp_good > 0 && evi_not + inf_good > 0) {
2365  if (fbp->location && StringLen(fbp->location) > 50) {
2366  ch = fbp->location[50];
2367  fbp->location[50] = '\0';
2368  } else
2369  ch = '\0';
2370  ErrPostEx(SEV_REJECT, ERR_QUALIFIER_Conflict, "The special \"no additional details recorded\" values for both /experiment and /inference exist on the \"%s\" feature at \"%s\". This is currently unsupported.", fbp->key ? fbp->key : "Unknown", fbp->location ? fbp->location : "unknown location");
2371  if (ch != '\0')
2372  fbp->location[50] = ch;
2373  return false;
2374  }
2375 
/* Conflict 3: the special value is mixed with ordinary values of the
 * same qualifier. */
2376  if ((exp_good > 0 && exp_bad > 0) || (inf_good > 0 && inf_bad > 0)) {
2377  if (fbp->location && StringLen(fbp->location) > 50) {
2378  ch = fbp->location[50];
2379  fbp->location[50] = '\0';
2380  } else
2381  ch = '\0';
2382  ErrPostEx(SEV_REJECT, ERR_QUALIFIER_Conflict, "The special \"no additional details recorded\" value for /experiment or /inference exists in conjunction with other /experiment or /inference qualifiers on the \"%s\" feature at \"%s\". This is currently unsupported.", fbp->key ? fbp->key : "Unknown", fbp->location ? fbp->location : "unknown location");
2383  if (ch != '\0')
2384  fbp->location[50] = ch;
2385  return false;
2386  }
2387 
/* NOTE(review): the statements guarded by the two conditions below are
 * missing from this listing (see the note in the function header). */
2388  if (exp_good + evi_exp > 0)
2390  else if (inf_good + evi_not > 0)
2392  return true;
2393 }
2394 
2395 /**********************************************************
2396  *
2397  * static CRef<CSeq_feat> ProcFeatBlk(pp, fbp, seqids):
2398  *
2399  * Process each feature sub-block.
2400  * location, SeqLocPtr by calling Karl's routine,
2401  * Nml_gbparseint which returns locmap = TRUE if mapping
2402  * location rules do not work, then SeqLocPtr->whole = seqids[0].
2403  * sitesmap = TRUE if found "(sites" string, num_errs > 0
2404  * if any errors occurred.
2405  * If there is an illegal location, then assign
2406  * qualifier to be an Imp-feat.
2407  *
2408  **********************************************************/
/*
 * NOTE(review): the signature line of this function (listing line 2409)
 * is missing from this copy of the file, and the header above appears to
 * describe an older implementation: the visible body parses the location
 * through GetSeqLocation() and mentions neither Nml_gbparseint nor
 * sitesmap.
 *
 * What the visible body does, in order:
 *   1. parses fbp->location into a newly allocated CSeq_feat, publishing
 *      "<key> : <location>" through pp->buf while the parser runs;
 *   2. on a location error either drops the feature (returns an empty
 *      CRef) or, when pp->debug is set, only posts a warning;
 *   3. converts the /partial, /pseudo and /trans_splicing qualifiers into
 *      feature flags, removes /gsdb_id and merges /rpt_unit qualifiers;
 *   4. validates the evidence qualifiers and marks the current entry as
 *      dropped when that validation fails;
 *   5. moves /note into the feature comment, fills the Imp-feat data for
 *      every key that does not contain "source", and copies the
 *      remaining qualifiers onto the feature.
 *
 * NOTE(review): "feat" is allocated only when fbp->location is non-null,
 * yet it is dereferenced unconditionally further down, and "loc" may
 * still be nullptr when handed to StringStr(). Presumably callers always
 * supply a location -- confirm.
 */
2410 {
2411  const char** b;
2412 
2413  char* loc = nullptr;
2414 
2415  bool locmap = false;
2416  bool err = false;
2417 
2418  CRef<CSeq_feat> feat;
2419 
2420  if (fbp->location) {
2421  loc = fbp->location;
2422  DelCharBtwData(loc);
/* pp->buf holds "<key> : <location>" only for the duration of the
 * GetSeqLocation() call; it is freed and reset right afterwards. */
2423  if (pp->buf)
2424  MemFree(pp->buf);
2425  string s(fbp->key);
2426  s.append(" : ");
2427  s.append(loc);
2428  pp->buf = StringSave(s.c_str());
2429 
2430  feat.Reset(new CSeq_feat);
2431  locmap = GetSeqLocation(*feat, loc, seqids, &err, pp, fbp->key);
2432 
2433  if (pp->buf)
2434  MemFree(pp->buf);
2435  pp->buf = nullptr;
2436  }
2437  if (err) {
/* Outside debug mode a failed range check drops the feature; in debug
 * mode it is only reported and processing continues. */
2438  if (pp->debug == false) {
2439  ErrPostEx(SEV_ERROR, ERR_FEATURE_Dropped, "%s|%s| range check detects problems", fbp->key, loc);
2440  feat.Reset();
2441  return feat;
2442  }
2443  ErrPostEx(SEV_WARNING, ERR_LOCATION_FailedCheck, "%s|%s| range check detects problems", fbp->key, loc);
2444  }
2445 
2446  if (! fbp->quals.empty()) {
2447  if (DeleteQual(fbp->quals, "partial"))
2448  feat->SetPartial(true);
2449  }
2450 
/* A location text containing "order" also marks the feature partial. */
2451  if (StringStr(loc, "order"))
2452  feat->SetPartial(true);
2453 
2454  if (! fbp->quals.empty()) {
2455  if (DeleteQual(fbp->quals, "pseudo"))
2456  feat->SetPseudo(true);
2457  }
2458 
2459  if (! fbp->quals.empty())
2460  DeleteQual(fbp->quals, "gsdb_id");
2461 
2462  if (! fbp->quals.empty())
2463  fta_parse_rpt_units(fbp);
2464 
2465  if (! fbp->quals.empty()) {
/* /trans_splicing is turned into an exception text, but only for feature
 * keys listed in the null-terminated TransSplicingFeats array. */
2466  for (b = TransSplicingFeats; *b; b++)
2467  if (StringEqu(fbp->key, *b))
2468  break;
2469  if (*b && DeleteQual(fbp->quals, "trans_splicing")) {
2470  feat->SetExcept(true);
2471  if (! feat->IsSetExcept_text())
2472  feat->SetExcept_text("trans-splicing");
2473  else {
2474  string& exc_text = feat->SetExcept_text();
2475  exc_text += ", trans-splicing";
2476  }
2477  }
2478  }
2479 
/* Conflicting evidence qualifiers drop the whole entry; the feature built
 * so far is still returned to the caller. */
2480  if (! fta_check_evidence(*feat, fbp)) {
2481  pp->entrylist[pp->curindx]->drop = true;
2482  return feat;
2483  }
2484 
/* Except for "gap" features, a fuzzy location implies a partial feature. */
2485  if ((! feat->IsSetPartial() || ! feat->GetPartial()) && ! StringEqu(fbp->key, "gap")) {
2486  if (SeqLocHaveFuzz(feat->GetLocation()))
2487  feat->SetPartial(true);
2488  }
2489 
2490  if (! fbp->quals.empty()) {
2491  char* comment = GetTheQualValue(fbp->quals, "note");
2492 
2493  if (comment) {
2494  if (comment[0]) {
2495  feat->SetComment(comment);
2496  }
2497  MemFree(comment);
2498  }
2499  }
2500 
2501  /* assume all imp for now
2502  */
2503  if (! StringStr(fbp->key, "source"))
2504  GetImpFeat(*feat, fbp, locmap);
2505 
/* Copy the remaining qualifiers onto the feature; /pseudogene additionally
 * sets the pseudo flag. */
2506  for (const auto& cur : fbp->quals) {
2507  const string& qual_str = cur->GetQual();
2508  if (qual_str == "pseudogene")
2509  feat->SetPseudo(true);
2510 
2511  // Do nothing for 'translation' qualifier in case of its value is empty
2512  if (qual_str == "translation" && (! cur->IsSetVal() || cur->GetVal().empty()))
2513  continue;
2514 
2515  if (! qual_str.empty())
2516  feat->SetQual().push_back(cur);
2517  }
2518 
2519  return feat;
2520 }
2521 
2522 /**********************************************************/
2523 static void fta_get_gcode_from_biosource(const CBioSource& bio_src, IndexblkPtr ibp)
2524 {
2525  if (! bio_src.IsSetOrg() || ! bio_src.GetOrg().IsSetOrgname())
2526  return;
2527 
2528  ibp->gc_genomic = bio_src.GetOrg().GetOrgname().IsSetGcode() ? bio_src.GetOrg().GetOrgname().GetGcode() : 0;
2529  ibp->gc_mito = bio_src.GetOrg().GetOrgname().IsSetMgcode() ? bio_src.GetOrg().GetOrgname().GetMgcode() : 0;
2530 }
2531 
2532 /**********************************************************/
2533 static void fta_sort_quals(FeatBlkPtr fbp, bool qamode)
2534 {
2535  if (! fbp)
2536  return;
2537 
2538  for (TQualVector::iterator q = fbp->quals.begin(); q != fbp->quals.end(); ++q) {
2539  if ((*q)->GetQual() == "gene" ||
2540  (! qamode && (*q)->GetQual() == "product"))
2541  continue;
2542 
2543  TQualVector::iterator tq = q;
2544  for (++tq; tq != fbp->quals.end(); ++tq) {
2545  const string& q_qual = (*q)->GetQual();
2546  const string& tq_qual = (*tq)->GetQual();
2547 
2548  if (! tq_qual.empty()) {
2549  if (q_qual == "gene")
2550  continue;
2551 
2552  Int4 i = NStr::CompareNocase(q_qual.c_str(), tq_qual.c_str());
2553  if (i < 0)
2554  continue;
2555  if (i == 0) {
2556  /* Do not sort /gene qualifiers
2557  */
2558  const string q_val = (*q)->GetVal();
2559  const string tq_val = (*tq)->GetVal();
2560 
2561  if (q_val.empty())
2562  continue;
2563 
2564  if (! tq_val.empty()) {
2565  if (q_val[0] >= '0' && q_val[0] <= '9' &&
2566  tq_val[0] >= '0' && tq_val[0] <= '9') {
2567  if (atoi(q_val.c_str()) <= atoi(tq_val.c_str()))
2568  continue;
2569  } else if (q_val <= tq_val)
2570  continue;
2571  }
2572  }
2573  }
2574 
2575  q->Swap(*tq);
2576  }
2577  }
2578 }
2579 
2580 /**********************************************************/
2581 static bool fta_qual_a_in_b(const TQualVector& qual1, const TQualVector& qual2)
2582 {
2583  bool found = false;
2584 
2585  for (const auto& gbqp1 : qual1) {
2586  found = false;
2587  for (const auto& gbqp2 : qual2) {
2588  const Char* qual_a = gbqp1->IsSetQual() ? gbqp1->GetQual().c_str() : nullptr;
2589  const Char* qual_b = gbqp2->IsSetQual() ? gbqp2->GetQual().c_str() : nullptr;
2590 
2591  const Char* val_a = gbqp1->IsSetVal() ? gbqp1->GetVal().c_str() : nullptr;
2592  const Char* val_b = gbqp2->IsSetVal() ? gbqp2->GetVal().c_str() : nullptr;
2593 
2594  if (fta_strings_same(qual_a, qual_b) && fta_strings_same(val_a, val_b)) {
2595  found = true;
2596  break;
2597  }
2598  }
2599  if (! found)
2600  break;
2601  }
2602 
2603  if (! found)
2604  return false;
2605 
2606  return true;
2607 }
2608 
2609 /**********************************************************/
2610 static bool fta_feats_same(const FeatBlk* fbp1, const FeatBlk* fbp2)
2611 {
2612  if (! fbp1 && ! fbp2)
2613  return true;
2614  if (! fbp1 || ! fbp2 ||
2615  fta_strings_same(fbp1->key, fbp2->key) == false ||
2616  fta_strings_same(fbp1->location, fbp2->location) == false)
2617  return false;
2618 
2619  if (fta_qual_a_in_b(fbp1->quals, fbp2->quals) && fta_qual_a_in_b(fbp2->quals, fbp1->quals))
2620  return true;
2621 
2622  return false;
2623 }
2624 
2625 /**********************************************************/
/*
 * Checks that val has the exact form "<from>..<to>", where both parts are
 * non-empty runs of decimal digits and nothing follows <to>, and that the
 * range satisfies 1 <= from <= to <= length.
 *
 * Parameters:
 *   val    - qualifier value to check; may be null or empty
 *   length - upper bound allowed for <to>
 *
 * Returns true only for a well-formed range within those bounds.
 *
 * The numbers are accumulated by hand with an explicit overflow check:
 * atoi() has undefined behaviour for values that do not fit into an int,
 * and the digit runs here are not bounded. A number that does not fit
 * into size_t makes the span invalid. Comparing against length as size_t
 * also avoids truncating large lengths to a signed 32-bit value.
 */
static bool fta_check_rpt_unit_span(const char* val, size_t length)
{
    if (! val || *val == '\0')
        return false;

    bool overflow = false;

    /* Consumes a run of digits at cur and returns its value; sets
     * "overflow" when the value does not fit into size_t. */
    auto read_number = [&overflow](const char*& cur) -> size_t {
        const size_t limit = static_cast<size_t>(-1);
        size_t       value = 0;

        for (; *cur >= '0' && *cur <= '9'; ++cur) {
            const size_t digit = static_cast<size_t>(*cur - '0');
            if (value > (limit - digit) / 10)
                overflow = true;
            else
                value = value * 10 + digit;
        }
        return value;
    };

    const char*  p     = val;
    const size_t first = read_number(p);

    /* at least one digit, followed by exactly ".." */
    if (p == val || p[0] != '.' || p[1] != '.')
        return false;

    p += 2;
    const char*  second_start = p;
    const size_t last         = read_number(p);

    /* at least one digit, and nothing after the second number */
    if (p == second_start || *p != '\0')
        return false;

    if (overflow)
        return false;

    return first != 0 && first <= last && last <= length;
}
2653 
2654 /**********************************************************/
2655 static void fta_check_rpt_unit_range(FeatBlkPtr fbp, size_t length)
2656 {
2657  Char ch;
2658 
2659  if (! fbp || fbp->quals.empty())
2660  return;
2661 
2662  for (TQualVector::iterator cur = fbp->quals.begin(); cur != fbp->quals.end();) {
2663  if (! (*cur)->IsSetQual() || ! (*cur)->IsSetVal()) {
2664  ++cur;
2665  continue;
2666  }
2667 
2668  const string& qual_str = (*cur)->GetQual();
2669  const string& val_str = (*cur)->GetVal();
2670 
2671  if (qual_str != "rpt_unit_range" || fta_check_rpt_unit_span(val_str.c_str(), length)) {
2672  ++cur;
2673  continue;
2674  }
2675 
2676  if (fbp->location && StringLen(fbp->location) > 20) {
2677  ch = fbp->location[20];
2678  fbp->location[20] = '\0';
2679  } else
2680  ch = '\0';
2681  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_InvalidRptUnitRange, "/rpt_unit_range qualifier \"%s\" on feature \"%s\" at location \"%s%s\" is not a valid basepair range. Qualifier dropped.", val_str.empty() ? "(EMPTY)" : val_str.c_str(), fbp->key ? fbp->key : "Unknown", fbp->location ? fbp->location : "unknown", (ch == '\0') ? "" : "...");
2682  if (ch != '\0')
2683  fbp->location[20] = ch;
2684 
2685  cur = fbp->quals.erase(cur);
2686  }
2687 }
2688 
2689 /**********************************************************/
2691 {
/*
 * Removes duplicated feature blocks from the singly linked list that
 * starts at dbp.
 *
 * Every node that carries data is compared against the nodes after it.
 * While scanning, nodes without data are unlinked and destroyed. A later
 * node whose feature block is the same as the current one per
 * fta_feats_same() is reported with ERR_FEATURE_DuplicateRemoved, its
 * feature block is deleted and the node is unlinked and destroyed.
 *
 * The inner scan stops as soon as the current location compares less
 * than the later one; presumably the list is sorted by location at this
 * point -- confirm with the caller.
 *
 * NOTE(review): the signature line of this function (listing line 2690)
 * is missing from this copy of the file.
 */
2692  DataBlkPtr tdbp;
2693  DataBlkPtr tdbpprev;
2694  DataBlkPtr tdbpnext;
2695  const FeatBlk* fbp1;
2696  FeatBlkPtr fbp2;
2697  Char ch;
2698 
/* Fewer than two nodes: nothing can be duplicated. */
2699  if (! dbp || ! dbp->mpNext)
2700  return;
2701 
2702  for (; dbp; dbp = dbp->mpNext) {
2703  if (! dbp->mpData)
2704  continue;
2705 
2706  fbp1 = static_cast<const FeatBlk*>(dbp->mpData);
/* tdbpprev always points at the last node kept, so that a removed node
 * can be bypassed through tdbpprev->mpNext. */
2707  tdbpprev = dbp;
2708  for (tdbp = dbp->mpNext; tdbp; tdbp = tdbpnext) {
/* The successor is saved first because tdbp may be destroyed below. */
2709  tdbpnext = tdbp->mpNext;
2710  if (! tdbp->mpData) {
2711  tdbpprev->mpNext = tdbpnext;
2712  tdbp->SimpleDelete();
2713  continue;
2714  }
2715 
2716  fbp2 = static_cast<FeatBlk*>(tdbp->mpData);
2717 
2718  if (fbp1->location && fbp2->location &&
2719  StringCmp(fbp1->location, fbp2->location) < 0)
2720  break;
2721 
2722  if (! fta_feats_same(fbp1, fbp2)) {
2723  tdbpprev = tdbp;
2724  continue;
2725  }
2726 
/* The location is cut to 20 characters for the message. The overwritten
 * character is not put back: fbp2 is deleted immediately afterwards. */
2727  if (fbp2->location && StringLen(fbp2->location) > 20) {
2728  ch = fbp2->location[20];
2729  fbp2->location[20] = '\0';
2730  } else
2731  ch = '\0';
2732  ErrPostEx(SEV_WARNING, ERR_FEATURE_DuplicateRemoved, "Duplicated feature \"%s\" at location \"%s%s\" removed.", fbp2->key ? fbp2->key : "???", fbp2->location ? fbp2->location : "???", (ch == '\0') ? "" : "...");
2733 
2734  delete fbp2;
2735  tdbpprev->mpNext = tdbpnext;
2736  tdbp->SimpleDelete();
2737  }
2738  }
2739 }
2740 
2741 /**********************************************************/
2743 {
/*
 * Unary predicate over qualifier references: it is constructed with a
 * qualifier name and answers whether a given qualifier carries exactly
 * that name.
 *
 * NOTE(review): the class-head line (listing line 2742) is missing from
 * this copy of the file; the class name is taken from the constructor.
 */
2744 public:
/* Stores a copy of the qualifier name to look for. */
2745  PredIsGivenQual(const string& qual) :
2746  qual_(qual) {}
2747 
/* Returns true when qual->GetQual() equals the stored name; the
 * comparison is the case-sensitive std::string operator==. */
2748  bool operator()(const CRef<CGb_qual>& qual)
2749  {
2750  return qual->GetQual() == qual_;
2751  }
2752 
2753 private:
/* Name of the qualifier this predicate matches. */
2754  string qual_;
2755 };
2756 
2757 static void fta_check_multiple_locus_tag(DataBlkPtr dbp, bool* drop)
2758 {
2759  FeatBlkPtr fbp;
2760  Char ch;
2761 
2762  for (; dbp; dbp = dbp->mpNext) {
2763  fbp = static_cast<FeatBlk*>(dbp->mpData);
2764  if (! fbp)
2765  continue;
2766 
2767  size_t i = std::count_if(fbp->quals.begin(), fbp->quals.end(), PredIsGivenQual("locus_tag"));
2768  if (i < 2)
2769  continue;
2770 
2771  if (fbp->location && StringLen(fbp->location) > 50) {
2772  ch = fbp->location[50];
2773  fbp->location[50] = '\0';
2774  } else
2775  ch = '\0';
2776  ErrPostEx(SEV_REJECT, ERR_FEATURE_MultipleLocusTags, "Multiple /locus_tag values for \"%s\" feature at \"%s\".", fbp->key ? fbp->key : "Unknown", fbp->location ? fbp->location : "unknown location");
2777  if (ch != '\0')
2778  fbp->location[50] = ch;
2779  *drop = true;
2780  break;
2781  }
2782 }
2783 
2784 /**********************************************************/
2785 static void fta_check_old_locus_tags(DataBlkPtr dbp, bool* drop)
2786 {
2787  Int4 i;
2788 
2789  PredIsGivenQual isOldLocusTag("old_locus_tag"),
2790  isLocusTag("locus_tag");
2791 
2792  for (; dbp; dbp = dbp->mpNext) {
2793  FeatBlkPtr fbp = static_cast<FeatBlk*>(dbp->mpData);
2794  if (! fbp)
2795  continue;
2796  size_t olt = std::count_if(fbp->quals.begin(), fbp->quals.end(), isOldLocusTag);
2797  size_t lt = std::count_if(fbp->quals.begin(), fbp->quals.end(), isLocusTag);
2798 
2799  if (olt == 0)
2800  continue;
2801 
2802  if (lt == 0) {
2803  ErrPostEx(SEV_REJECT, ERR_FEATURE_OldLocusTagWithoutNew, "Feature \"%s\" at \"%s\" has an /old_locus_tag qualifier but lacks a /locus_tag qualifier. Entry dropped.", fbp->key ? fbp->key : "Unknown", fbp->location ? fbp->location : "unknown location");
2804  *drop = true;
2805  } else {
2806  i = 0;
2807  for (const auto& gbqp1 : fbp->quals) {
2808  if (! gbqp1->IsSetQual() || ! gbqp1->IsSetVal() || ! isLocusTag(gbqp1))
2809  continue;
2810 
2811  i++;
2812 
2813  const string& gbqp1_val = gbqp1->GetVal();
2814  if (gbqp1_val.empty())
2815  continue;
2816 
2817  for (const auto& gbqp2 : fbp->quals) {
2818  if (! gbqp2->IsSetQual() || ! gbqp2->IsSetVal())
2819  continue;
2820 
2821  const string& gbqp2_val = gbqp2->GetVal();
2822 
2823  if (! isOldLocusTag(gbqp2) || ! NStr::EqualNocase(gbqp1_val, gbqp2_val))
2824  continue;
2825 
2826  ErrPostEx(SEV_REJECT, ERR_FEATURE_MatchingOldNewLocusTag, "Feature \"%s\" at \"%s\" has an /old_locus_tag qualifier with a value that is identical to that of a /locus_tag qualifier: \"%s\". Entry dropped.", fbp->key ? fbp->key : "Unknown", fbp->location ? fbp->location : "unknown location", gbqp1_val.c_str());
2827  *drop = true;
2828  }
2829  }
2830  }
2831 
2832  if (olt == 1)
2833  continue;
2834 
2835  for (TQualVector::const_iterator gbqp1 = fbp->quals.begin(); gbqp1 != fbp->quals.end(); ++gbqp1) {
2836  const string& gbqp1_val = (*gbqp1)->GetVal();
2837  if (isOldLocusTag(*gbqp1) || gbqp1_val.empty())
2838  continue;
2839 
2840  TQualVector::const_iterator gbqp2 = gbqp1;
2841  for (++gbqp2; gbqp2 != fbp->quals.end(); ++gbqp2) {
2842  const string& gbqp2_val = (*gbqp2)->GetVal();
2843  if (isOldLocusTag(*gbqp2) || gbqp2_val.empty())
2844  continue;
2845 
2846  if (NStr::CompareNocase(gbqp1_val.c_str(), gbqp2_val.c_str()) == 0) {
2847  ErrPostEx(SEV_ERROR, ERR_FEATURE_RedundantOldLocusTag, "Feature \"%s\" at \"%s\" has redundant /old_locus_tag qualifiers. Dropping all but the first.", fbp->key ? fbp->key : "Unknown", fbp->location ? fbp->location : "unknown location");
2848  break;
2849  }
2850  }
2851 
2852  if (gbqp2 != fbp->quals.end())
2853  break;
2854  }
2855  }
2856 }
2857 
2858 /**********************************************************/
2860 {
/*
 * Cleans up the /pseudogene qualifiers of every feature block in the
 * list that starts at dbp.
 *
 * Per feature:
 *   - only the first /pseudogene qualifier is considered; every later
 *     one is dropped with ERR_QUALIFIER_MultiplePseudoGeneQuals;
 *   - that first qualifier is dropped with
 *     ERR_QUALIFIER_InvalidPseudoGeneValue when its value is empty or is
 *     not found in PseudoGeneValues by MatchArrayString();
 *   - when both a /pseudo and a /pseudogene qualifier were seen,
 *     ERR_QUALIFIER_OldPseudoWithPseudoGene is posted and /pseudo is
 *     deleted.
 *
 * NOTE(review): got_pseudogene stays true even when the only /pseudogene
 * qualifier was dropped as empty or invalid, so /pseudo is removed in
 * that case too -- confirm that this is intended.
 * NOTE(review): fbp->key and fbp->location are passed to "%s" without the
 * null guards used by neighbouring functions.
 * NOTE(review): the signature line of this function (listing line 2859)
 * is missing from this copy of the file.
 */
2861  FeatBlkPtr fbp;
2862  bool got_pseudogene;
2863  bool got_pseudo;
2864 
2865  for (; dbp; dbp = dbp->mpNext) {
2866  fbp = static_cast<FeatBlk*>(dbp->mpData);
2867  if (! fbp)
2868  continue;
2869 
2870  got_pseudo = false;
2871  got_pseudogene = false;
2872 
/* The iterator is advanced by hand because dropped qualifiers are erased
 * and the loop continues from the iterator returned by erase(). */
2873  for (TQualVector::iterator cur = fbp->quals.begin(); cur != fbp->quals.end();) {
2874  const string& qual_str = (*cur)->GetQual();
2875  const string& val_str = (*cur)->IsSetVal() ? (*cur)->GetVal() : "";
2876 
2877  if (qual_str != "pseudogene") {
2878  if (! got_pseudo && qual_str == "pseudo")
2879  got_pseudo = true;
2880  ++cur;
2881  continue;
2882  }
2883 
/* Second or later /pseudogene on the same feature. */
2884  if (got_pseudogene) {
2885  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_MultiplePseudoGeneQuals, "Dropping a /pseudogene qualifier because multiple /pseudogene qualifiers are present : <%s> : Feature key <%s> : Feature location <%s>.", val_str.empty() ? "[empty]" : val_str.c_str(), fbp->key, fbp->location);
2886 
2887  cur = fbp->quals.erase(cur);
2888  continue;
2889  }
2890 
2891  got_pseudogene = true;
2892 
2893  if (val_str.empty()) {
2894  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_InvalidPseudoGeneValue, "Dropping a /pseudogene qualifier because its value is empty : Feature key <%s> : Feature location <%s>.", fbp->key, fbp->location);
2895 
2896  cur = fbp->quals.erase(cur);
2897  continue;
2898  }
2899 
/* A value found in PseudoGeneValues is accepted and the qualifier kept. */
2900  if (MatchArrayString(PseudoGeneValues, val_str.c_str()) >= 0) {
2901  ++cur;
2902  continue;
2903  }
2904 
2905  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_InvalidPseudoGeneValue, "Dropping a /pseudogene qualifier because its value is invalid : <%s> : Feature key <%s> : Feature location <%s>.", val_str.c_str(), fbp->key, fbp->location);
2906 
2907  cur = fbp->quals.erase(cur);
2908  }
2909 
2910  if (! got_pseudogene || ! got_pseudo)
2911  continue;
2912 
2913  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_OldPseudoWithPseudoGene, "A legacy /pseudo qualifier and a /pseudogene qualifier are present on the same feature : Dropping /pseudo : Feature key <%s> : Feature location <%s>.", fbp->key, fbp->location);
2914  DeleteQual(fbp->quals, "pseudo");
2915  }
2916 }
2917 
2918 /**********************************************************/
2919 static void fta_check_compare_qual(DataBlkPtr dbp, bool is_tpa)
2920 {
2921  FeatBlkPtr fbp;
2922  char* p;
2923  char* q;
2924  bool badcom;
2925  Char ch;
2926  Int4 com_count;
2927  Int4 cit_count;
2928 
2929  for (; dbp; dbp = dbp->mpNext) {
2930  fbp = static_cast<FeatBlk*>(dbp->mpData);
2931  if (! fbp)
2932  continue;
2933 
2934  com_count = 0;
2935  cit_count = 0;
2936 
2937  for (TQualVector::iterator cur = fbp->quals.begin(); cur != fbp->quals.end();) {
2938  const string& qual_str = (*cur)->GetQual();
2939  string dummy;
2940  string& val_str = (*cur)->IsSetVal() ? (*cur)->SetVal() : dummy;
2941 
2942  if (qual_str == "compare") {
2943  badcom = true;
2944  if (! val_str.empty()) {
2945  q = StringChr(val_str.data(), '.');
2946  if (q && q[1] != '\0') {
2947  for (p = q + 1; *p >= '0' && *p <= '9';)
2948  p++;
2949  if (*p == '\0') {
2950  *q = '\0';
2951  if (GetNucAccOwner(val_str.c_str()) > CSeq_id::e_not_set)
2952  badcom = false;
2953  *q = '.';
2954  }
2955  }
2956  }
2957 
2958  if (badcom) {
2959  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_IllegalCompareQualifier, "/compare qualifier value is not a legal Accession.Version : feature \"%s\" at \"%s\" : value \"%s\" : qualifier has been dropped.", fbp->key, fbp->location, val_str.empty() ? "[empty]" : val_str.c_str());
2960 
2961  cur = fbp->quals.erase(cur);
2962  continue;
2963  }
2964  com_count++;
2965  } else if (qual_str == "citation")
2966  cit_count++;
2967 
2968  ++cur;
2969  }
2970 
2971  if (com_count > 0 || cit_count > 0 ||
2972  (! StringEqu(fbp->key, "old_sequence") &&
2973  ! StringEqu(fbp->key, "conflict")))
2974  continue;
2975 
2976  ch = '\0';
2977  if (StringLen(fbp->location) > 30) {
2978  ch = fbp->location[30];
2979  fbp->location[30] = '\0';
2980  }
2981  ErrPostEx(SEV_ERROR, ERR_FEATURE_RequiredQualifierMissing, "Feature \"%s\" at \"%s\" lacks required /citation and/or /compare qualifier : feature has been dropped.", fbp->key, fbp->location);
2982  if (ch != '\0')
2983  fbp->location[30] = ch;
2984  dbp->mDrop = true;
2985  }
2986 }
2987 
2988 /**********************************************************/
2990  IndexblkPtr ibp)
2991 {
/*
 * Looks for the first feature in the list whose location refers, through
 * an "<accession>:" or "<accession>.<version>:" prefix, to an accession
 * that is not acceptable for this record; if one is found the entry is
 * rejected.
 *
 * Accepted references (the scan moves on to the next ':'):
 *   - GetNucAccOwner() returns CSeq_id::e_Genbank, the token starts with
 *     "EZ" in any letter case and ibp->is_tpa is false;
 *   - ibp->is_tpa is set and GetNucAccOwner() returns e_Tpg, e_Tpd or
 *     e_Tpe.
 * Any other token in front of a ':' ends the search with that feature.
 *
 * On rejection ibp->drop is set and one SEV_REJECT message is posted,
 * chosen by ibp->is_tsa, then ibp->is_tls, otherwise the TPA wording.
 *
 * NOTE(review): the first line of the signature (listing line 2989) is
 * missing from this copy of the file.
 */
2992  FeatBlkPtr fbp;
2993  char* location;
2994  char* p;
2995  char* q;
2996  char* r;
2997  Uint1 i;
2998 
2999  location = nullptr;
3000  for (; dbp; dbp = dbp->mpNext) {
3001  fbp = static_cast<FeatBlk*>(dbp->mpData);
3002  if (! fbp || ! fbp->location)
3003  continue;
/* Work on a private copy of the location with blanks, tabs and newlines
 * squeezed out. */
3004  location = StringSave(fbp->location);
3005  for (p = location, q = p; *p != '\0'; p++)
3006  if (*p != ' ' && *p != '\t' && *p != '\n')
3007  *q++ = *p;
3008  *q = '\0';
3009  if (q == location) {
3010  MemFree(location);
3011  location = nullptr;
3012  continue;
3013  }
3014 
3015  for (p = location + 1; *p != '\0'; p++) {
3016  if (*p != ':')
3017  continue;
/* Walk backwards from the ':' over letters, digits and '_' to find where
 * the token starts. The first '.' met on the way is remembered in r (the
 * version separator); a second '.' ends the token. */
3018  for (r = nullptr, q = p - 1;; q--) {
3019  if (q == location) {
3020  if (*q != '_' && (*q < '0' || *q > '9') &&
3021  (*q < 'a' || *q > 'z') && (*q < 'A' || *q > 'Z'))
3022  q++;
3023  break;
3024  }
3025  if (*q == '.') {
3026  if (! r) {
3027  r = q;
3028  continue;
3029  }
3030  q++;
3031  break;
3032  }
3033  if (*q != '_' && (*q < '0' || *q > '9') &&
3034  (*q < 'a' || *q > 'z') && (*q < 'A' || *q > 'Z')) {
3035  q++;
3036  break;
3037  }
3038  }
/* Empty token in front of the ':' -- nothing to check. */
3039  if (q == p)
3040  continue;
/* Terminate the token at the '.' (when there is one) or at the ':' just
 * for the lookup, then restore the overwritten character. */
3041  if (r)
3042  *r = '\0';
3043  else
3044  *p = '\0';
3045  i = GetNucAccOwner(q);
3046  if (r)
3047  *r = '.';
3048  else
3049  *p = ':';
3050 
3051 
3052  if (i == CSeq_id::e_Genbank && (q[0] == 'e' || q[0] == 'E') &&
3053  (q[1] == 'z' || q[1] == 'Z') && ibp->is_tpa == false)
3054  continue;
3055  if (ibp->is_tpa && (i == CSeq_id::e_Tpg || i == CSeq_id::e_Tpd ||
3056  i == CSeq_id::e_Tpe))
3057  continue;
3058  break;
3059  }
/* Stopping before the end of the string means an unacceptable reference
 * was found: leave the feature loop with dbp, fbp and location still
 * describing the offending feature. */
3060  if (*p != '\0')
3061  break;
3062  if (location) {
3063  MemFree(location);
3064  location = nullptr;
3065  }
3066  }
/* The whole list was scanned without finding an offending location. */
3067  if (! dbp)
3068  return;
3069 
3070  ibp->drop = true;
/* Locations longer than 45 characters are shown as their first 40
 * characters followed by "..."; the result fits in the existing buffer. */
3071  if (location && StringLen(location) > 45) {
3072  location[40] = '\0';
3073  StringCat(location, "...");
3074  }
3075  if (ibp->is_tsa)
3076  ErrPostEx(SEV_REJECT, ERR_LOCATION_AccessionNotTSA, "Feature \"%s\" at \"%s\" on a TSA record cannot point to a non-TSA record.", fbp->key, location ? location : "empty_location");
3077  else if (ibp->is_tls)
3078  ErrPostEx(SEV_REJECT, ERR_LOCATION_AccessionNotTLS, "Feature \"%s\" at \"%s\" on a TLS record cannot point to a non-TLS record.", fbp->key, location ? location : "empty_location");
3079  else
3080  ErrPostEx(SEV_REJECT, ERR_LOCATION_AccessionNotTPA, "Feature \"%s\" at \"%s\" on a TPA record cannot point to a non-TPA record.", fbp->key, location ? location : "empty_location");
3081  if (location)
3082  MemFree(location);
3083 }
3084 
3085 /**********************************************************/
3087 {
/*
 * Cross-checks the /operon qualifiers of the Imp features in "feats".
 *
 * Pass 1 creates one FTAOperon per non-empty /operon qualifier and sorts
 * it, by IsOperon(), into operonList or residentList. It rejects:
 *   - two operon entries sharing the same /operon value;
 *   - a feature with more than one /operon qualifier;
 *   - a feature with key "operon" and no /operon qualifier.
 * Pass 2 rejects every resident whose /operon value matches no operon
 * entry, or whose location compares as neither eContained nor eSame
 * against a matching operon.
 *
 * Returns true for an empty list or when nothing was rejected, false
 * otherwise. All FTAOperon objects created here are deleted before
 * returning.
 *
 * NOTE(review): the signature line (listing line 3086) and the first line
 * of the statement that declares "compare" (listing line 3154) are
 * missing from this copy of the file.
 */
3088  using FTAOperonList = list<FTAOperon*>;
3089  FTAOperonList operonList;
3090  FTAOperonList residentList;
3091  bool success = true;
3092 
3093  if (feats.empty()) {
3094  return true;
3095  }
3096 
3097  for (const auto& pFeat : feats) {
3098  if (! pFeat->GetData().IsImp())
3099  continue;
3100 
3101  const auto& featLocation = pFeat->GetLocation();
3102  const CImp_feat& featImp = pFeat->GetData().GetImp();
3103  FTAOperon* pLatest(nullptr);
3104  int opQualCount(0);
3105 
3106  for (const auto& pQual : pFeat->GetQual()) {
3107  const auto& qual = *pQual;
3108  if (! qual.IsSetQual() || qual.GetQual() != "operon" ||
3109  ! qual.IsSetVal() || qual.GetVal().empty()) {
3110  continue;
3111  }
3112  opQualCount++;
3113 
/* Ownership of the new object passes to whichever list receives it; both
 * lists are emptied with delete at the end of the function. */
3114  pLatest = new FTAOperon(
3115  featImp.IsSetKey() ? featImp.GetKey().c_str() : "Unknown",
3116  qual.GetVal(),
3117  featLocation);
3118  if (pLatest->IsOperon()) {
3119  operonList.push_back(pLatest);
3120  } else {
3121  residentList.push_back(pLatest);
3122  continue;
3123  }
/* A new operon entry is compared with every operon collected so far,
 * itself excluded, to detect reused /operon values. */
3124  for (const auto& operon : operonList) {
3125  if (pLatest == operon) {
3126  continue;
3127  }
3128  if (pLatest->mOperon != operon->mOperon) {
3129  continue;
3130  }
3131  ErrPostEx(SEV_REJECT, ERR_FEATURE_OperonQualsNotUnique, "The operon features at \"%s\" and \"%s\" utilize the same /operon qualifier : \"%s\".", operon->LocationStr().c_str(), pLatest->LocationStr().c_str(), pLatest->mOperon.c_str());
3132  success = false;
3133  }
3134  }
3135 
/* pLatest is non-null here: opQualCount > 1 implies an object was
 * created in the loop above. */
3136  if (opQualCount > 1) {
3137  ErrPostEx(SEV_REJECT, ERR_FEATURE_MultipleOperonQuals, "Feature \"%s\" at \"%s\" has more than one operon qualifier.", pLatest->mFeatname.c_str(), pLatest->LocationStr().c_str());
3138  success = false;
3139  }
3140 
3141  if (opQualCount == 0 && featImp.IsSetKey() && featImp.GetKey() == "operon") {
3142  ErrPostEx(SEV_REJECT, ERR_FEATURE_MissingOperonQual, "The operon feature at \"%s\" lacks an /operon qualifier.", location_to_string_or_unknown(featLocation).c_str());
3143  success = false;
3144  }
3145  }
3146 
3147  for (const auto& resident : residentList) {
3148  bool matched = false;
3149  for (const auto& operon : operonList) {
3150  if (resident->mOperon != operon->mOperon) {
3151  continue;
3152  }
3153  matched = true;
/* NOTE(review): the line that opens this statement is missing from the
 * listing; presumably it declares "compare" as the result of a location
 * comparison call taking the arguments below -- confirm. */
3155  *resident->mLocation, *operon->mLocation, nullptr, sequence::fCompareOverlapping);
3156  if (compare != sequence::eContained && compare != sequence::eSame) {
3157  ErrPostEx(SEV_REJECT, ERR_FEATURE_OperonLocationMisMatch, "Feature \"%s\" at \"%s\" with /operon qualifier \"%s\" does not fall within the span of the operon feature at \"%s\".", resident->mFeatname.c_str(), resident->LocationStr().c_str(), resident->mOperon.c_str(), operon->LocationStr().c_str());
3158  success = false;
3159  }
3160  }
3161  if (! matched) {
3162  ErrPostEx(SEV_REJECT, ERR_FEATURE_InvalidOperonQual, "/operon qualifier \"%s\" on feature \"%s\" at \"%s\" has a value that does not match any of the /operon qualifiers on operon features.", resident->mOperon.c_str(), resident->mFeatname.c_str(), resident->LocationStr().c_str());
3163  success = false;
3164  }
3165  }
/* Release every FTAOperon allocated in the first pass. */
3166  for (auto& resident : residentList) {
3167  delete resident;
3168  }
3169  for (auto& operon : operonList) {
3170  delete operon;
3171  }
3172  return success;
3173 }
3174 
3175 /**********************************************************/
3177 {
/*
 * Removes duplicated qualifiers from a feature block.
 *
 * Each qualifier is compared with all qualifiers after it; a later one
 * whose name and value are both the same per fta_strings_same() (an unset
 * name or value is passed as nullptr) is reported with
 * ERR_QUALIFIER_DuplicateRemoved and erased, so the first occurrence is
 * the one that stays.
 *
 * Does nothing when fbp is null or has no qualifiers.
 *
 * NOTE(review): the signature line of this function (listing line 3176)
 * is missing from this copy of the file.
 */
3178  Char ch;
3179 
3180  if (! fbp || fbp->quals.empty())
3181  return;
3182 
3183  for (TQualVector::iterator cur = fbp->quals.begin(); cur != fbp->quals.end(); ++cur) {
3184  const char* cur_qual = (*cur)->IsSetQual() ? (*cur)->GetQual().c_str() : nullptr;
3185  const char* cur_val = (*cur)->IsSetVal() ? (*cur)->GetVal().c_str() : nullptr;
3186 
/* "next" is advanced by hand: after an erase the loop continues from the
 * iterator returned by erase(). Only elements after "cur" are erased. */
3187  TQualVector::iterator next = cur;
3188  for (++next; next != fbp->quals.end();) {
3189  const char* next_qual = (*next)->IsSetQual() ? (*next)->GetQual().c_str() : nullptr;
3190  const char* next_val = (*next)->IsSetVal() ? (*next)->GetVal().c_str() : nullptr;
3191 
3192  if (! fta_strings_same(cur_qual, next_qual) || ! fta_strings_same(cur_val, next_val)) {
3193  ++next;
3194  continue;
3195  }
3196 
/* The location is cut to 20 characters in place for the message ("..."
 * is appended through the format) and restored right after posting. */
3197  if (fbp->location && StringLen(fbp->location) > 20) {
3198  ch = fbp->location[20];
3199  fbp->location[20] = '\0';
3200  } else
3201  ch = '\0';
3202 
3203  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_DuplicateRemoved, "Duplicated qualifier \"%s\" in feature \"%s\" at location \"%s%s\" removed.", cur_qual ? cur_qual : "???", fbp->key ? fbp->key : "???", fbp->location ? fbp->location : "???", (ch == '\0') ? "" : "...");
3204 
3205  if (ch != '\0')
3206  fbp->location[20] = ch;
3207 
3208  next = fbp->quals.erase(next);
3209  }
3210  }
3211 }
3212 
3213 /**********************************************************/
3214 static void CollectGapFeats(const DataBlk& entry, DataBlkPtr dbp, ParserPtr pp, Int2 type)
3215 {
3216  IndexblkPtr ibp;
3217  GapFeatsPtr gfp = nullptr;
3218  GapFeatsPtr tgfp;
3219  DataBlkPtr tdbp;
3220  FeatBlkPtr fbp;
3221 
3222  CLinkage_evidence::TLinkage_evidence asn_linkage_evidence;
3223  list<string> linkage_evidence_names;
3224 
3225  StrNum* snp;
3226  char* p;
3227  char* q;
3228  const char* gap_type;
3229  bool finished_gap;
3230  ErrSev sev;
3231  Int4 estimated_length;
3232  Int4 is_htg;
3233  Int4 from;
3234  Int4 to;
3235  Int4 prev_gap; /* 0 - initial, 1 - "gap",
3236  2 - "assembly_gap" */
3237  Int4 curr_gap; /* 0 - initial, 1 - "gap",
3238  2 - "assembly_gap" */
3239  CSeq_gap::TType asn_gap_type;
3240 
3241  ibp = pp->entrylist[pp->curindx];
3242 
3243  if (ibp->keywords.empty()) {
3244  if (pp->format == Parser::EFormat::GenBank)
3246  else if (pp->format == Parser::EFormat::EMBL)
3248  else if (pp->format == Parser::EFormat::XML)
3249  XMLGetKeywords(entry.mOffset, ibp->xip, ibp->keywords);
3250  }
3251 
3252  is_htg = -1;
3253  for (const string& key : ibp->keywords) {
3254  if (is_htg >= 0 && is_htg <= 2)
3255  break;
3256  if (key == "HTG")
3257  is_htg = 3;
3258  else if (key == "HTGS_PHASE0")
3259  is_htg = 0;
3260  else if (key == "HTGS_PHASE1")
3261  is_htg = 1;
3262  else if (key == "HTGS_PHASE2")
3263  is_htg = 2;
3264  else if (key == "HTGS_PHASE3")
3265  is_htg = 3;
3266  }
3267 
3268  // prev_gap = 0;
3269  curr_gap = 0;
3270  finished_gap = false;
3271  for (ibp->gaps = nullptr; dbp; dbp = dbp->mpNext) {
3272  if (ibp->drop)
3273  break;
3274  if (dbp->mType != type)
3275  continue;
3276 
3277  linkage_evidence_names.clear();
3278  asn_linkage_evidence.clear();
3279 
3280  for (tdbp = static_cast<DataBlk*>(dbp->mpData); tdbp; tdbp = tdbp->mpNext) {
3281  if (ibp->drop)
3282  break;
3283  fbp = static_cast<FeatBlk*>(tdbp->mpData);
3284  if (! fbp || ! fbp->key)
3285  continue;
3286  if (StringEqu(fbp->key, "gap")) {
3287  prev_gap = curr_gap;
3288  curr_gap = 1;
3289  } else if (StringEqu(fbp->key, "assembly_gap")) {
3290  prev_gap = curr_gap;
3291  curr_gap = 2;
3292  } else
3293  continue;
3294 
3295  from = 0;
3296  to = 0;
3297  gap_type = nullptr;
3298  linkage_evidence_names.clear();
3299  asn_gap_type = -1;
3300  asn_linkage_evidence.clear();
3301  estimated_length = -1;
3302 
3303  for (const auto& cur : fbp->quals) {
3304  if (! cur->IsSetQual() || ! cur->IsSetVal())
3305  continue;
3306 
3307  const string& cur_qual = cur->GetQual();
3308  const string& cur_val = cur->GetVal();
3309 
3310  if (cur_qual.empty() || cur_val.empty())
3311  continue;
3312 
3313  if (cur_qual == "estimated_length") {
3314  if (cur_val == "unknown")
3315  estimated_length = -100;
3316  else {
3317  const char* cp = cur_val.c_str();
3318  for (; *cp >= '0' && *cp <= '9';)
3319  ++cp;
3320  if (*cp == '\0')
3321  estimated_length = atoi(cur_val.c_str());
3322  }
3323  } else if (cur_qual == "gap_type")
3324  gap_type = cur_val.c_str();
3325  else if (cur_qual == "linkage_evidence") {
3326  linkage_evidence_names.push_back(cur_val);
3327  }
3328  }
3329 
3330  if (fbp->location) {
3331  p = fbp->location;
3332  if (*p == '<')
3333  p++;
3334  for (q = p; *p >= '0' && *p <= '9';)
3335  p++;
3336  if (*p == '\0') {
3337  from = atoi(q);
3338  to = from;
3339  } else if (*p == '.') {
3340  *p = '\0';
3341  from = atoi(q);
3342  *p++ = '.';
3343  if (*fbp->location == '<' && from != 1)
3344  from = 0;
3345  else if (*p == '.') {
3346  if (*++p == '>')
3347  p++;
3348  for (q = p; *p >= '0' && *p <= '9';)
3349  p++;
3350  if (*p == '\0')
3351  to = atoi(q);
3352  if (*(q - 1) == '>' && to != (int)ibp->bases)
3353  to = 0;
3354  }
3355  }
3356  }
3357 
3358  if (from == 0 || to == 0 || from > to) {
3359  if (curr_gap == 1)
3360  ErrPostEx(SEV_REJECT, ERR_FEATURE_InvalidGapLocation, "Invalid gap feature location : \"%s\" : all gap features must have a simple X..Y location on the plus strand.", fbp->location ? fbp->location : "unknown");
3361  else
3362  ErrPostEx(SEV_REJECT, ERR_FEATURE_InvalidAssemblyGapLocation, "Invalid assembly_gap location : \"%s\".", fbp->location ? fbp->location : "unknown");
3363  ibp->drop = true;
3364  break;
3365  }
3366 
3367  if (curr_gap == 2) /* "assembly_gap" feature */
3368  {
3369  if (gap_type && is_htg > -1 &&
3370  ! StringEqu(gap_type, "within scaffold") &&
3371  ! StringEqu(gap_type, "repeat within scaffold"))
3372  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_UnexpectedGapTypeForHTG, "assembly_gap has /gap_type of \"%s\", but clone-based HTG records are only expected to have \"within scaffold\" or \"repeat within scaffold\" gaps. assembly_gap feature located at \"%d..%d\".", gap_type, from, to);
3373 
3374  if (is_htg == 0 || is_htg == 1) {
3375  for (const string& evidence : linkage_evidence_names) {
3377  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_LinkageShouldBeUnspecified, "assembly gap has /linkage_evidence of \"%s\", but unoriented and unordered Phase0/Phase1 HTG records are expected to have \"unspecified\" evidence. assembly_gap feature located at \"%d..%d\".", evidence.c_str(), from, to);
3378  }
3379  }
3380  } else if (is_htg == 2 || is_htg == 3) {
3381  for (const string& evidence : linkage_evidence_names) {
3383  continue;
3384 
3385  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_LinkageShouldNotBeUnspecified, "assembly gap has /linkage_evidence of \"unspecified\", but ordered and oriented HTG records are expected to have some level of linkage for their gaps. assembly_gap feature located at \"%d..%d\".", from, to);
3386  }
3387  }
3388 
3389  if (is_htg == 3 && ! finished_gap) {
3390  ErrPostEx(SEV_ERROR, ERR_FEATURE_FinishedHTGHasAssemblyGap, "Finished Phase-3 HTG records are not expected to have any gaps. First assembly_gap feature encountered at \"%d..%d\".", from, to);
3391  finished_gap = true;
3392  }
3393 
3394  if (! gap_type) {
3395  ErrPostEx(SEV_REJECT, ERR_QUALIFIER_MissingGapType, "assembly_gap feature at \"%d..%d\" lacks the required /gap_type qualifier.", from, to);
3396  ibp->drop = true;
3397  break;
3398  }
3399 
3400  for (snp = GapTypeValues; snp->str; snp++)
3401  if (StringEqu(snp->str, gap_type))
3402  break;
3403  if (! snp->str) {
3404  ErrPostEx(SEV_REJECT, ERR_QUALIFIER_InvalidGapType, "assembly_gap feature at \"%d..%d\" has an invalid gap type : \"%s\".", from, to, gap_type);
3405  ibp->drop = true;
3406  break;
3407  }
3408  asn_gap_type = snp->num;
3409 
3410  if (linkage_evidence_names.empty() &&
3411  (StringEqu(gap_type, "within scaffold") ||
3412  StringEqu(gap_type, "repeat within scaffold"))) {
3413  ErrPostEx(SEV_REJECT, ERR_QUALIFIER_MissingLinkageEvidence, "assembly_gap feature at \"%d..%d\" with gap type \"%s\" lacks a /linkage_evidence qualifier.", from, to, gap_type);
3414  ibp->drop = true;
3415  break;
3416  }
3417  if (! linkage_evidence_names.empty()) {
3418  if (! StringEqu(gap_type, "unknown") &&
3419  ! StringEqu(gap_type, "within scaffold") &&
3420  ! StringEqu(gap_type, "repeat within scaffold")) {
3423  "The /linkage_evidence qualifier is not legal for the assembly_gap feature at \"%d..%d\" with /gap_type \"%s\".",
3424  from,
3425  to,
3426  gap_type);
3427  ibp->drop = true;
3428  break;
3429  }
3430 
3431  for (const string& evidence : linkage_evidence_names) {
3432  for (snp = LinkageEvidenceValues; snp->str; snp++)
3433  if (evidence == snp->str)
3434  break;
3435  if (! snp->str) {
3438  "assembly_gap feature at \"%d..%d\" has an invalid linkage evidence : \"%s\".",
3439  from,
3440  to,
3441  evidence.c_str());
3442  ibp->drop = true;
3443  break;
3444  }
3445 
3446  CRef<CLinkage_evidence> new_evidence(new CLinkage_evidence);
3447  new_evidence->SetType(snp->num);
3448  asn_linkage_evidence.push_back(new_evidence);
3449  }
3450  }
3451  }
3452 
3453  if (prev_gap + curr_gap == 3) {
3454  if (curr_gap == 1)
3455  ErrPostEx(SEV_REJECT, ERR_FEATURE_AssemblyGapAndLegacyGap, "Legacy gap feature at \"%d..%d\" co-exists with a new AGP 2.0 assembly_gap feature at \"%d..%d\".", from, to, gfp->from, gfp->to);
3456  else
3457  ErrPostEx(SEV_REJECT, ERR_FEATURE_AssemblyGapAndLegacyGap, "Legacy gap feature at \"%d..%d\" co-exists with a new AGP 2.0 assembly_gap feature at \"%d..%d\".", gfp->from, gfp->to, from, to);
3458  ibp->drop = true;
3459  break;
3460  }
3461 
3462  if (estimated_length == -1) /* missing qual */
3463  {
3464  ErrPostEx(SEV_REJECT, ERR_FEATURE_RequiredQualifierMissing, "The gap feature at \"%d..%d\" lacks the required /estimated_length qualifier.", from, to);
3465  ibp->drop = true;
3466  } else if (estimated_length == 0) {
3467  ErrPostEx(SEV_REJECT, ERR_FEATURE_IllegalEstimatedLength, "Gap feature at \"%d..%d\" has an illegal /estimated_length qualifier : \"%s\" : should be \"unknown\" or an integer.",
3468  // from, to, gbqp->val); // at this point gbqp is definitely = NULL
3469  from,
3470  to,
3471  "");
3472  ibp->drop = true;
3473  } else if (estimated_length == -100) {
3474  if (is_htg >= 0 && to - from != 99) {
3475  ErrPostEx(SEV_ERROR, ERR_FEATURE_UnknownGapNot100, "Gap feature at \"%d..%d\" has /estimated_length \"unknown\" but the gap size is not 100 bases.", from, to);
3476  }
3477  } else if (estimated_length != to - from + 1) {
3479  sev = SEV_ERROR;
3480  else {
3481  sev = SEV_REJECT;
3482  ibp->drop = true;
3483  }
3484 
3485  ErrPostEx(sev, ERR_FEATURE_GapSizeEstLengthMissMatch, "Gap feature at \"%d..%d\" has a size that does not match the /estimated_length : %d.", from, to, estimated_length);
3486  }
3487 
3488  for (gfp = ibp->gaps; gfp; gfp = gfp->next) {
3489  if ((gfp->from >= from && gfp->from <= to) ||
3490  (gfp->to >= from && gfp->to <= to) ||
3491  (gfp->from <= from && gfp->to >= to)) {
3492  ErrPostEx(SEV_REJECT, ERR_FEATURE_OverlappingGaps, "Gap features at \"%d..%d\" and \"%d..%d\" overlap.", from, to, gfp->from, gfp->to);
3493  ibp->drop = true;
3494  } else if (to + 1 == gfp->from || from - 1 == gfp->to) {
3495  if (pp->source == Parser::ESource::EMBL)
3496  sev = SEV_ERROR;
3497  else {
3498  sev = SEV_REJECT;
3499  ibp->drop = true;
3500  }
3501 
3502  ErrPostEx(sev, ERR_FEATURE_ContiguousGaps, "Gap features at \"%d..%d\" and \"%d..%d\" are contiguous, and should probably be represented by a single gap that spans both.", from, to, gfp->from, gfp->to);
3503  }
3504  }
3505  if (ibp->drop)
3506  break;
3507 
3508  gfp = new GapFeats;
3509  gfp->from = from;
3510  gfp->to = to;
3511  gfp->estimated_length = estimated_length;
3512  if (curr_gap == 2) /* /assembly_gap feature */
3513  gfp->assembly_gap = true;
3514  if (gap_type) {
3515  gfp->gap_type = gap_type;
3516  gfp->asn_gap_type = asn_gap_type;
3517  }
3518  if (! asn_linkage_evidence.empty()) {
3519  gfp->asn_linkage_evidence.swap(asn_linkage_evidence);
3520  asn_linkage_evidence.clear();
3521  }
3522  gfp->next = nullptr;
3523 
3524  if (! ibp->gaps) {
3525  ibp->gaps = gfp;
3526  continue;
3527  }
3528 
3529  if (ibp->gaps->from > from) {
3530  gfp->next = ibp->gaps;
3531  ibp->gaps = gfp;
3532  continue;
3533  }
3534 
3535  if (! ibp->gaps->next) {
3536  ibp->gaps->next = gfp;
3537  continue;
3538  }
3539 
3540  for (tgfp = ibp->gaps; tgfp; tgfp = tgfp->next) {
3541  if (tgfp->next && tgfp->next->from < from)
3542  continue;
3543  gfp->next = tgfp->next;
3544  tgfp->next = gfp;
3545  break;
3546  }
3547  }
3548  if (ibp->drop) {
3549  linkage_evidence_names.clear();
3550  asn_linkage_evidence.clear();
3551  }
3552  }
3553 
3554  if (! ibp->gaps)
3555  return;
3556 
3557  if (ibp->drop) {
3558  GapFeatsFree(ibp->gaps);
3559  ibp->gaps = nullptr;
3560  }
3561 }
3562 
3563 /**********************************************************/
3564 static void XMLGetQuals(char* entry, XmlIndexPtr xip, TQualVector& quals)
3565 {
3566  XmlIndexPtr xipqual;
3567 
3568  if (! entry || ! xip)
3569  return;
3570 
3571  for (; xip; xip = xip->next) {
3572  if (! xip->subtags)
3573  continue;
3574 
3575  CRef<CGb_qual> qual(new CGb_qual);
3576  for (xipqual = xip->subtags; xipqual; xipqual = xipqual->next) {
3577  if (xipqual->tag == INSDQUALIFIER_NAME)
3578  qual->SetQual(XMLGetTagValue(entry, xipqual));
3579  else if (xipqual->tag == INSDQUALIFIER_VALUE)
3580  qual->SetVal(XMLGetTagValue(entry, xipqual));
3581  }
3582 
3583  if (qual->GetQual() == "replace" && ! qual->IsSetVal()) {
3584  qual->SetVal("");
3585  }
3586 
3587  if (qual->IsSetQual() && ! qual->GetQual().empty())
3588  quals.push_back(qual);
3589  }
3590 }
3591 
3592 /**********************************************************/
3593 static DataBlkPtr XMLLoadFeatBlk(char* entry, XmlIndexPtr xip)
3594 {
3595  XmlIndexPtr xipfeat;
3596  DataBlkPtr headdbp;
3597  DataBlkPtr dbp;
3598  DataBlkPtr ret;
3599  FeatBlkPtr fbp;
3600 
3601  if (! entry || ! xip)
3602  return nullptr;
3603 
3604  for (; xip; xip = xip->next)
3605  if (xip->tag == INSDSEQ_FEATURE_TABLE)
3606  break;
3607 
3608  if (! xip || ! xip->subtags)
3609  return nullptr;
3610 
3611  headdbp = nullptr;
3612  for (xip = xip->subtags; xip; xip = xip->next) {
3613  if (! xip->subtags)
3614  continue;
3615  fbp = new FeatBlk;
3616  fbp->spindex = -1;
3617  for (xipfeat = xip->subtags; xipfeat; xipfeat = xipfeat->next) {
3618  if (xipfeat->tag == INSDFEATURE_KEY)
3619  fbp->key = XMLGetTagValue(entry, xipfeat);
3620  else if (xipfeat->tag == INSDFEATURE_LOCATION)
3621  fbp->location = XMLGetTagValue(entry, xipfeat);
3622  else if (xipfeat->tag == INSDFEATURE_QUALS)
3623  XMLGetQuals(entry, xipfeat->subtags, fbp->quals);
3624  }
3625  if (! headdbp) {
3626  headdbp = new DataBlk;
3627  dbp = headdbp;
3628  } else {
3629  dbp->mpNext = new DataBlk;
3630  dbp = dbp->mpNext;
3631  }
3632  dbp->mpData = fbp;
3633  }
3634  ret = new DataBlk;
3635  ret->mType = XML_FEATURES;
3636  ret->mpData = headdbp;
3637  ret->mpNext = nullptr;
3638  return (ret);
3639 }
3640 
3641 /**********************************************************
3642  *
3643  * static FeatBlkPtr MergeNoteQual(fbp):
3644  *
3645  * Only one note on every key feature block,
3646  * not complete.
3647  *
3648  * 5-28-93
3649  *
  *   Details:
  *   - Pass 1 normalises every non-empty /note value in place
  *     (a ';' followed by blanks and/or further ';' becomes
  *     "; ", or a bare ';' at the end of the value) and adds up
  *     the buffer size needed for the merged note.
  *   - If no non-empty /note exists, fbp is returned untouched.
  *   - Pass 2 joins the note values with ";~", doubles every
  *     '~' inside a value, erases every /note qualifier that
  *     has a value set, and appends one merged /note qualifier
  *     at the end of fbp->quals.
  *   Returns the same fbp pointer that was passed in.
  *
  *   NOTE(review): the signature line (listing line 3651) is
  *   not present in this extract.
  *
3650  **********************************************************/
3652 {
3653  char* p;
3654  char* q;
3655 
3656  size_t size = 0;
3657 
  // Pass 1: clean up each non-empty note and compute the merged length.
3658  for (auto& cur : fbp->quals) {
3659  if (! cur->IsSetQual() || ! cur->IsSetVal())
3660  continue;
3661 
3662  const string& cur_qual = cur->GetQual();
3663  const string& cur_val = cur->GetVal();
3664 
3665  if (cur_qual != "note" || cur_val.empty())
3666  continue;
3667 
  // Two extra bytes per note: room for the ";~" separator (or the final NUL).
3668  size += 2;
  // The normalised text is never longer than the input, so size() + 1 suffices.
3669  vector<Char> buf(cur_val.size() + 1);
3670 
3671  const char* cp = cur_val.c_str();
3672  for (q = &buf[0]; *cp != '\0'; ++cp) {
3673  *q++ = *cp;
3674  if (*cp == ';' && (cp[1] == ' ' || cp[1] == ';')) {
  // Skip the whole run of blanks/semicolons that follows the ';' ...
3675  for (++cp; *cp == ' ' || *cp == ';';)
3676  ++cp;
  // ... and replace it by a single blank unless the value ends here.
3677  if (*cp != '\0')
3678  *q++ = ' ';
  // Step back so the loop's ++cp lands on the first character after the run.
3679  --cp;
3680  }
3681  }
3682 
3683  *q = '\0';
3684  cur->SetVal(&buf[0]);
3685 
  // Count the cleaned length plus one extra byte for every '~' (doubled in pass 2).
3686  size += cur->GetVal().size();
3687  for (cp = cur->GetVal().c_str(); *cp != '\0'; ++cp)
3688  if (*cp == '~')
3689  ++size;
3690  }
3691 
  // size stays 0 when there is no /note with a non-empty value: nothing to merge.
3692  if (size == 0)
3693  return (fbp);
3694 
3695  char* note = MemNew(size);
3696  p = note;
3697 
  // Pass 2: concatenate the notes into "note" and remove the originals.
3698  for (TQualVector::iterator cur = fbp->quals.begin(); cur != fbp->quals.end();) {
3699  if (! (*cur)->IsSetQual() || ! (*cur)->IsSetVal()) {
3700  ++cur;
3701  continue;
3702  }
3703 
3704  const string& cur_qual = (*cur)->GetQual();
3705  const string& cur_val = (*cur)->GetVal();
3706 
3707  if (cur_qual != "note") {
3708  ++cur;
3709  continue;
3710  }
3711 
3712  if (! cur_val.empty()) {
3713  /* sometime we get note qual w/o value
3714  */
  // Separate this note from the previously copied one with ";~".
3715  if (p > note) {
3716  *p++ = ';';
3717  *p++ = '~';
3718  }
3719 
  // Copy the value; each '~' is preceded by an extra '~', i.e. written twice.
3720  for (const char* cq = cur_val.c_str(); *cq != '\0'; *p++ = *cq++)
3721  if (*cq == '~')
3722  *p++ = '~';
3723  }
3724 
  // erase() returns the iterator to the next element, so no ++cur here.
3725  cur = fbp->quals.erase(cur);
3726  }
3727  *p = '\0';
3728 
  // Append the single merged /note; the temporary buffer is released after the copy.
3729  CRef<CGb_qual> qual_new(new CGb_qual);
3730  qual_new->SetQual("note");
3731  qual_new->SetVal(note);
3732  MemFree(note);
3733 
3734  fbp->quals.push_back(qual_new);
3735 
3736  return (fbp);
3737 }
3738 
3739 /**********************************************************/
// Extracts a candidate qualifier name from the start of "val": the leading run of
// letters and underscores, stopping at "ch" or at the end of the string.
// Returns false when the check on the collected name fails; otherwise stores the
// name in *qual (when qual is not null) and returns true.
// NOTE(review): listing lines 3746-3747, which hold the condition guarding the
// "return false" below, are not present in this extract.
3740 static bool CheckLegalQual(const Char* val, Char ch, string* qual)
3741 {
3742  string qual_name;
  // Collect characters while they are alphabetic or '_' and differ from "ch".
3743  for (; *val && *val != ch && (isalpha(*val) || *val == '_'); ++val)
3744  qual_name += *val;
3745 
3748  return false;
3749 
  // The output argument is optional.
3750  if (qual)
3751  *qual = qual_name;
3752 
3753  return true;
3754 }
3755 
3756 /**********************************************************/
/**********************************************************
 *
 *   static void fta_convert_to_lower_case(str):
 *
 *      Converts, in place, every upper case letter 'A'..'Z'
 *   of the NUL-terminated string "str" to lower case.
 *   All other characters are left alone. A null or empty
 *   string is accepted and ignored.
 *
 **********************************************************/
static void fta_convert_to_lower_case(char* str)
{
    if (! str)
        return;

    for (char* cur = str; *cur != '\0'; ++cur) {
        const bool is_upper = (*cur >= 'A' && *cur <= 'Z');
        if (is_upper)
            *cur = static_cast<char>(*cur | 0x20); /* set the ASCII lower case bit */
    }
}
3768 
3769 /**********************************************************/
/**********************************************************
 *
 *   static void fta_process_con_slice(val_buf):
 *
 *      "val_buf" holds a NUL-terminated string. Every comma
 *   that is followed by something other than a blank or the
 *   end of the string gets a blank inserted right after it.
 *   If there is no such comma the buffer is left untouched;
 *   otherwise it is replaced by a new, larger buffer.
 *
 **********************************************************/
static void fta_process_con_slice(vector<char>& val_buf)
{
    /* true if "s" points at a comma that needs a blank after it */
    const auto needs_blank = [](const char* s) {
        return s[0] == ',' && s[1] != ' ' && s[1] != '\0';
    };

    size_t to_insert = 0;
    for (const char* src = &val_buf[0]; *src != '\0'; ++src) {
        if (needs_blank(src))
            ++to_insert;
    }

    if (to_insert == 0)
        return;

    /* one byte per inserted blank plus one spare byte on top of the old size */
    vector<char> spaced(val_buf.size() + to_insert + 1);
    size_t       out = 0;
    for (const char* src = &val_buf[0]; *src != '\0'; ++src) {
        spaced[out++] = *src;
        if (needs_blank(src))
            spaced[out++] = ' ';
    }
    spaced[out] = '\0';

    val_buf.swap(spaced);
}
3794 
3795 
3797  const string& str,
3798  vector<string>& lines)
3799 {
  // Breaks "str" at newline characters and stores the pieces in "lines".
  // NOTE(review): the first line of the signature (listing line 3796) is not present in
  // this extract; the effect of passing 0 as the flags argument should be confirmed
  // against the NStr::Split documentation.
3800  NStr::Split(str, "\n", lines, 0);
3801 }
3802 
3803 /**********************************************************
3804  *
3805  * static void ParseQualifiers(fbp, bptr, eptr,
3806  * format):
3807  *
3808  * Parses the qualifier text in [bptr, eptr) and appends
  * one CGb_qual per parsed qualifier to fbp->quals.
3809  * Some qualifiers may not have value.
3810  * genbank qualifier format: /qualifier=value
3811  * embl qualifier format: /qualifier= value
3812  *
3813  * 10-12-93
3814  *
3815  **********************************************************/
3816 static void ParseQualifiers(
3817  FeatBlkPtr fbp,
3818  const char* bptr,
3819  const char* eptr,
  // NOTE(review): listing line 3820 (presumably the "format" parameter used below) and
  // listing line 3823 are not present in this extract.
3821 {
  // Copy the qualifier text so it can be split into lines.
3822  string bstr(bptr, eptr);
3824  // cerr << "bstr:\n" << bstr.c_str() << "\n\n";
3825  vector<string> qualLines;
3826  xSplitLines(bstr, qualLines);
3827 
3828  string qualKey, qualVal;
  // The feature key and location are handed to the parser along with the lines.
3829  string featKey(fbp->key);
3830  string featLocation(fbp->location);
3831  CQualParser qualParser(format, featKey, featLocation, qualLines);
  // Pull qualifiers one at a time; only successfully extracted ones are stored.
3832  while (! qualParser.Done()) {
3833  if (qualParser.GetNextQualifier(qualKey, qualVal)) {
3834  // cerr << "Key: " << qualKey.c_str() << "\n";
3835  // cerr << "Val: " << qualVal.c_str() << "\n";
3836  CRef<CGb_qual> pQual(new CGb_qual);
3837  pQual->SetQual(qualKey);
3838  pQual->SetVal(qualVal);
3839  fbp->quals.push_back(pQual);
3840  }
3841  }
3842 }
3843 
3844 
3845 /**********************************************************/
// Validates the value of a /satellite qualifier, which is examined as "<type>" or
// "<type>:<rest>". Posts a REJECT-level error and sets *drop to true when the type
// part is not recognised (i < 0), or when a ':' is present but nothing follows it.
// A null or empty "str" is ignored. "str" is modified temporarily but restored
// before the function returns.
3846 static void fta_check_satellite(char* str, bool* drop)
3847 {
3848  char* p;
3849  Int2 i;
3850 
3851  if (! str || *str == '\0')
3852  return;
3853 
  // Temporarily cut the string at the first ':' so only the type part is visible.
3854  p = StringChr(str, ':');
3855  if (p)
3856  *p = '\0';
3857 
  // NOTE(review): listing line 3858, which assigns "i" (presumably the result of looking
  // up the type part in a table of valid satellite types), is not present in this extract.
  // Put the ':' back so error messages show the complete value.
3859  if (p)
3860  *p = ':';
3861  if (i < 0) {
3862  ErrPostEx(SEV_REJECT, ERR_FEATURE_InvalidSatelliteType, "/satellite qualifier \"%s\" does not begin with a valid satellite type.", str);
3863  *drop = true;
3864  } else if (p && p[1] == '\0') {
3865  ErrPostEx(SEV_REJECT, ERR_FEATURE_NoSatelliteClassOrIdentifier, "/satellite qualifier \"%s\" does not include a class or identifier after the satellite type.", str);
3866  *drop = true;
3867  }
3868 }
3869 
3870 /**********************************************************
3871  *
3872  * int ParseFeatureBlock(ibp, deb, dbp, source, format):
3873  *
3874  * Parsing each feature sub-block, dbp, to
3875  * FeatBlkPtr, fbp.
3876  * Put warning message if bad qualifier's value or
3877  * unknown feature key found.
3878  * fdbp->drop = true, if found unknown feature key, or
3879  * do not go through 2nd time of qualifiers sematic
3880  * check (i.e. drop bad qualifier if the value if illegal
3881  * format in the 1st time)
3882  *
3883  * 11-22-93
3884  *
3885  * The location begins at column 22, and qualifier
3886  * begin on subsequent lines at column 22, they may
3887  * extend from column 22-80.
3888  * Qualifiers take the form of a slash, "/", followed
3889  * by the qualifier name and, if applicable, an equal
3890  * sign, "=", and a value (i.e. some qualifiers only
3891  * have name w/o value, s.t. /pseudo).
3892  *
3893  * 5-4-93
3894  *
3895  **********************************************************/
3897 {
  // NOTE(review): the signature (listing line 3896) is not present in this extract. The body
  // uses ibp, deb, dbp, source and format without declaring them, so they are presumably the
  // parameters named in the header comment above.
3898  char* bptr;
3899  char* eptr;
3900  char* ptr1;
3901  char* ptr2;
3902  char* p;
3903  char* q;
3904  string loc;
3905  Char ch;
3906 
3907  FeatBlkPtr fbp;
3908  Int4 num;
3909  size_t i;
  // Worst status seen so far; this is what the function returns.
3910  int retval = GB_FEAT_ERR_NONE;
3911  int ret;
3912 
  // For MGA records prepare a location covering the whole sequence; it replaces an
  // empty feature location further down.
3913  if (ibp->is_mga)
3914  loc = "1.." + to_string(ibp->bases);
  // One iteration per feature sub-block; "num" is the running feature index.
3915  for (num = 0; dbp; dbp = dbp->mpNext, num++) {
3916  fbp = new FeatBlk;
3917  fbp->spindex = -1;
3918  fbp->num = num;
  // The FeatBlk is attached right away, so the "continue" paths below do not lose it.
3919  dbp->mpData = fbp;
3920 
3921  bptr = dbp->mOffset;
3922  eptr = bptr + dbp->len;
3923 
  // Temporarily terminate the first line so it can be installed as the error prefix.
3924  for (p = bptr; *p != '\n';)
3925  p++;
3926  *p = '\0';
3927  FtaInstallPrefix(PREFIX_FEATURE, "Parsing FT line: ", bptr);
3928  *p = '\n';
3929  ptr1 = bptr + ParFlat_COL_FEATKEY;
  // NOTE(review): the statement inside this "if" (listing line 3931) is not present in this extract.
3930  if (*ptr1 == ' ') {
3932  }
  // The feature key is the first blank-delimited token of the line.
3933  for (ptr1 = bptr; *ptr1 == ' ';)
3934  ptr1++;
3935 
3936  for (ptr2 = ptr1; *ptr2 != ' ' && *ptr2 != '\n';)
3937  ptr2++;
3938 
3939  if (StringEquN(ptr1, "- ", 2)) {
3940  ErrPostStr(SEV_WARNING, ERR_FEATURE_FeatureKeyReplaced, "Featkey '-' is replaced by 'misc_feature'");
3941  fbp->key = StringSave("misc_feature");
3942  } else
3943  fbp->key = StringSave(string(ptr1, ptr2).c_str());
3944 
  // Move to the start of the location text.
3945  for (ptr1 = ptr2; *ptr1 == ' ';)
3946  ptr1++;
3947  if (*ptr1 == '\n') {
  // No location on the key line: the feature is dropped unless this is an MGA record.
3948  if (ibp->is_mga == false) {
3949  ErrPostEx(SEV_WARNING, ERR_FEATURE_LocationParsing, "Location missing");
3950  dbp->mDrop = true;
3951  retval = GB_FEAT_ERR_DROP;
3952  continue;
3953  }
3954  } else {
  // Only warn when the location does not start at the expected column.
3955  i = ptr1 - bptr;
3956  if (i < ParFlat_COL_FEATDAT)
3957  ErrPostEx(SEV_WARNING, ERR_FEATURE_LocationParsing, "Location data is shifted to the left");
3958  else if (i > ParFlat_COL_FEATDAT)
3959  ErrPostEx(SEV_WARNING, ERR_FEATURE_LocationParsing, "Location data is shifted to the right");
3960  }
3961 
  // The location runs up to the first '/' (start of the qualifiers) or the end of the block.
  // NOTE(review): *ptr2 is read before "ptr2 < eptr" is tested, and *ptr2 is read and
  // written below even when ptr2 == eptr; confirm the buffer is readable/writable there.
3962  for (ptr2 = ptr1; *ptr2 != '/' && ptr2 < eptr;)
3963  ptr2++;
  // Temporarily terminate the location text so it can be copied, then restore the byte.
3964  ch = *ptr2;
3965  *ptr2 = '\0';
3966  fbp->location = StringSave(ptr1);
3967  if (ibp->is_prot)
3968  fta_strip_aa(fbp->location);
3969  *ptr2 = ch;
  // Remove all blanks and newlines from the copied location, in place.
3970  for (p = fbp->location, q = p; *p != '\0'; p++)
3971  if (*p != ' ' && *p != '\n')
3972  *q++ = *p;
3973  *q = '\0';
3974 
  // MGA records with an empty location get the whole-sequence location prepared above.
3975  if (fbp->location[0] == '\0' && ibp->is_mga) {
3976  MemFree(fbp->location);
3977  fbp->location = StringSave(loc.c_str());
3978  }
3979 
  // NOTE(review): listing line 3980 is not present in this extract.
3981  if (StringEqu(fbp->key, "allele") ||
3982  StringEqu(fbp->key, "mutation")) {
3983  ErrPostEx(SEV_ERROR, ERR_FEATURE_ObsoleteFeature, "Obsolete feature \"%s\" found. Replaced with \"variation\".", fbp->key);
3984  MemFree(fbp->key);
3985  fbp->key = StringSave("variation");
3986  }
3987 
  // NOTE(review): listing line 3988, which presumably declares and computes "subtype"
  // from the feature key, is not present in this extract.
3989 
  // Unknown feature key outside debug mode: drop the feature.
  // NOTE(review): fbp->key is passed in the format-string position here, with
  // "Feature dropped" as an argument; a key containing '%' would be interpreted
  // as a format. Confirm this is intended.
3990  if (subtype == CSeqFeatData::eSubtype_bad && ! deb) {
3991  ErrPostEx(SEV_ERROR, ERR_FEATURE_UnknownFeatKey, fbp->key, "Feature dropped");
3992  dbp->mDrop = true;
3993  retval = GB_FEAT_ERR_DROP;
3994  continue;
3995  }
3996 
3997  if (*ptr2 == '/') /* qualifier start in first "/" */
3998  {
3999  ParseQualifiers(fbp, ptr2, eptr, format);
4000 
  // /gap_type and /assembly_evidence are rejected on anything but assembly_gap;
  // the whole entry is marked to be dropped.
4001  if (! StringEqu(fbp->key, "assembly_gap")) {
4002  for (const auto& cur : fbp->quals) {
4003  const string& cur_qual = cur->GetQual();
4004  if (cur_qual == "gap_type" ||
4005  cur_qual == "assembly_evidence") {
4006  ErrPostEx(SEV_REJECT, ERR_FEATURE_InvalidQualifier, "Qualifier /%s is invalid for the feature \"%s\" at \"%s\".", cur_qual.c_str(), fbp->key, fbp->location ? fbp->location : "Unknown");
4007  ibp->drop = true;
4008  }
4009  }
4010  }
4011 
  // /submitter_seqid is rejected on anything but the source feature.
4012  if (! StringEqu(fbp->key, "source")) {
4013  for (const auto& cur : fbp->quals) {
4014  const string& cur_qual = cur->GetQual();
4015  if (cur_qual == "submitter_seqid") {
4016  ErrPostEx(SEV_REJECT, ERR_FEATURE_InvalidQualifier, "Qualifier /%s is invalid for the feature \"%s\" at \"%s\".", cur_qual.c_str(), fbp->key, fbp->location ? fbp->location : "Unknown");
4017  ibp->drop = true;
4018  }
4019  }
4020  }
4021 
4022  fbp = MergeNoteQual(fbp); /* allow more than one
4023  notes w/i a key */
4024 
  // NOTE(review): listing line 4026 (first statement of this branch) is not present in this extract.
4025  if (subtype == CSeqFeatData::eSubtype_bad) {
4027  ret = GB_FEAT_ERR_REPAIRABLE;
4028  } else {
4029  /* last argument is perform_corrections if debug
4030  * mode is FALSE
4031  */
4032  ret = XGBFeatKeyQualValid(subtype, fbp->quals, true, (source == Parser::ESource::Flybase ? false : ! deb));
4033  }
  // Keep the most severe status seen over all features.
4034  if (ret > retval)
4035  retval = ret;
4036 
  // Anything worse than "repairable" drops the feature, except for ncRNA.
4037  if (ret > GB_FEAT_ERR_REPAIRABLE && ! StringEqu(fbp->key, "ncRNA"))
4038  dbp->mDrop = true;
  // No qualifiers at all: report a missing mandatory qualifier.
  // NOTE(review): this branch is taken only when subtype IS eSubtype_bad, yet it then asks
  // for the mandatory qualifiers of that subtype; confirm whether "!=" was intended.
4039  } else if (subtype == CSeqFeatData::eSubtype_bad && ! CSeqFeatData::GetMandatoryQualifiers(subtype).empty()) {
4040  if (! StringEqu(fbp->key, "mobile_element")) {
4041  auto qual_idx = *CSeqFeatData::GetMandatoryQualifiers(subtype).begin();
4042  string str1 = CSeqFeatData::GetQualifierAsString(qual_idx);
4043  const char* str = str1.c_str();
  // old_sequence and conflict are allowed to lack /citation.
4044  if ((! StringEqu(fbp->key, "old_sequence") &&
4045  ! StringEqu(fbp->key, "conflict")) ||
4046  ! StringEqu(str, "citation")) {
4047  ErrPostEx(SEV_ERROR, ERR_FEATURE_RequiredQualifierMissing, "lacks required /%s qualifier : feature has been dropped.", str);
4048  if (! deb) {
4049  dbp->mDrop = true;
4050  retval = GB_FEAT_ERR_DROP;
4051  }
4052  }
4053  }
4054  } else if (StringEqu(fbp->key, "misc_feature") && fbp->quals.empty()) {
  // A misc_feature without qualifiers is dropped, or only flagged in debug mode.
4055  if (! deb) {
4056  dbp->mDrop = true;
4057  retval = GB_FEAT_ERR_DROP;
4058  ErrPostStr(SEV_WARNING, ERR_FEATURE_Dropped, "Empty 'misc_feature' dropped");
4059  } else
4060  retval = GB_FEAT_ERR_REPAIRABLE;
4061  }
4062 
  // Final clean-up of qualifier values: run ShrinkSpaces on each value and reset
  // values that end up empty (except for /replace, which may be empty).
4063  for (auto& cur : fbp->quals) {
4064  if (! cur->IsSetQual() || ! cur->IsSetVal())
4065  continue;
4066 
4067  const string& qual_str = cur->GetQual();
4068  const string& val_str = cur->GetVal();
4069 
  // Work on a NUL-terminated, writable copy of the value.
4070  vector<Char> val_buf(val_str.begin(), val_str.end());
4071  val_buf.push_back(0);
4072 
4073  p = &val_buf[0];
4074  ShrinkSpaces(p);
4075  if (*p == '\0' && qual_str != "replace") {
4076  cur->ResetVal();
4077  val_buf[0] = 0;
4078  } else {
  // NOTE(review): the statement controlled by this "if" (listing line 4080) is not present
  // in this extract.
4079  if (qual_str == "replace")
4081  cur->SetVal(p);
4082  }
4083 
4084  if (qual_str == "satellite")
4085  fta_check_satellite(&val_buf[0], &ibp->drop);
4086  }
4087  } /* for, each sub-block, or each feature key */
  // NOTE(review): listing line 4088 is not present in this extract.
4089  return (retval);
4090 }
4091 
4092 /**********************************************************/
4094 {
  // Cleans up and validates the qualifiers of one feature block (fbp):
  //  - applies qualifier-specific fix-ups to the value (translation, rpt_unit,
  //    cons_splice) and warns about /note values that seem to contain other qualifiers;
  //  - resets values consisting only of double quotes, blanks and tabs (an empty
  //    string is kept for /replace);
  //  - removes qualifiers that have no value unless their name is listed in
  //    EmptyQuals, and resets the value of those that are listed there;
  //  - replaces double quotes by single quotes in /note values.
  // NOTE(review): the signature (listing line 4093) is not present in this extract;
  // "fbp" is presumably its only parameter.
4095  const char** b;
4096  char* p;
4097  Char ch;
4098 
4099  if (! fbp || fbp->quals.empty())
4100  return;
4101 
  // The iterator is advanced manually because qualifiers may be erased inside the loop.
4102  for (TQualVector::iterator cur = fbp->quals.begin(); cur != fbp->quals.end();) {
4103  const string& qual_str = (*cur)->GetQual();
4104 
4105  if ((*cur)->IsSetVal()) {
4106  const string& val_str = (*cur)->GetVal();
  // Work on a NUL-terminated, writable copy of the value.
4107  vector<Char> val_buf(val_str.begin(), val_str.end());
4108  val_buf.push_back(0);
4109 
4110  if (qual_str == "translation") {
4111  DelCharBtwData(&val_buf[0]);
4112  } else if (qual_str == "rpt_unit") {
4113  fta_convert_to_lower_case(&val_buf[0]);
4114  } else if (qual_str == "cons_splice") {
4115  fta_process_con_slice(val_buf);
4116  } else if (qual_str == "note") {
  // Look at every '/' in the note and test the text that follows it with CheckLegalQual.
4117  for (p = &val_buf[0];;) {
4118  p = StringChr(p, '/');
4119  if (! p)
4120  break;
4121  p++;
4122  if (! CheckLegalQual(p, ' ', nullptr))
4123  continue;
4124 
  // Show at most the first 30 characters of the note in the warning; the
  // overwritten character is restored afterwards.
4125  if (val_buf.size() > 30) {
4126  ch = val_buf[30];
4127  val_buf[30] = '\0';
4128  } else
4129  ch = '\0';
4130  ErrPostEx(SEV_WARNING, ERR_QUALIFIER_EmbeddedQual, "/note qualifier value appears to contain other qualifiers : [%s%s].", &val_buf[0], (ch == '\0') ? "" : " ...");
4131  if (ch != '\0')
4132  val_buf[30] = ch;
4133  }
4134  }
4135 
  // Is anything left besides double quotes, blanks and tabs?
4136  for (p = &val_buf[0]; *p == '\"' || *p == ' ' || *p == '\t';)
4137  p++;
4138 
4139  if (*p == '\0') {
4140  if (qual_str == "replace") {
4141  (*cur)->SetVal("");
4142  } else
4143  (*cur)->ResetVal();
4144  } else
  // The whole buffer is stored, including any leading quotes or blanks skipped above.
4145  (*cur)->SetVal(&val_buf[0]);
4146  }
4147 
  // Search the null-terminated EmptyQuals array for this qualifier name;
  // *b is null afterwards when the name is not in it.
4148  for (b = EmptyQuals; *b; b++)
4149  if (qual_str == *b)
4150  break;
4151 
4152  if (! *b) {
  // Not listed in EmptyQuals: a missing value gets the qualifier removed.
4153  if (! (*cur)->IsSetVal()) {
4154  if (qual_str == "old_locus_tag")
4155  ErrPostEx(SEV_ERROR, ERR_FEATURE_EmptyOldLocusTag, "Feature \"%s\" at \"%s\" has an /old_locus_tag qualifier with no value. Qualifier has been dropped.", fbp->key ? fbp->key : "Unknown", fbp->location ? fbp->location : "Empty");
4156  else
4157  ErrPostEx(SEV_WARNING, ERR_QUALIFIER_EmptyQual, "Qualifier /%s ignored because it lacks a data value. Feature \"%s\", location \"%s\".", qual_str.c_str(), fbp->key ? fbp->key : "Unknown", fbp->location ? fbp->location : "Empty");
4158 
  // erase() yields the iterator to the next qualifier, so skip the ++cur at the bottom.
4159  cur = fbp->quals.erase(cur);
4160  continue;
4161  }
4162  } else if ((*cur)->IsSetVal()) {
  // Listed in EmptyQuals but carrying a value: the value is discarded.
4163  ErrPostEx(SEV_WARNING, ERR_QUALIFIER_ShouldNotHaveValue, "Qualifier /%s should not have data value. Qualifier value has been ignored. Feature \"%s\", location \"%s\".", qual_str.c_str(), fbp->key ? fbp->key : "Unknown", fbp->location ? fbp->location : "Empty");
4164 
4165  (*cur)->ResetVal();
4166  }
4167 
  // Double quotes inside a note are turned into single quotes.
4168  if ((*cur)->IsSetVal() && qual_str == "note") {
4169  string val = (*cur)->GetVal();
4170  std::replace(val.begin(), val.end(), '\"', '\'');
4171  (*cur)->SetVal(val);
4172  }
4173 
4174  ++cur;
4175  }
4176 }
4177 
4178 /**********************************************************/
4180 {
  // Validates the feature blocks of an XML entry: fixes obsolete or placeholder feature
  // keys, checks the key and its qualifiers, marks features to be dropped (dbp->mDrop)
  // and returns the most severe GB_FEAT_ERR_* status encountered.
  // NOTE(review): the signature (listing line 4179) is not present in this extract. The
  // body uses deb, dbp and source without declaring them, so they are presumably parameters.
4181  FeatBlkPtr fbp;
4182  char* p;
4183  Int4 num;
4184  Int2 keyindx;
4185  int retval = GB_FEAT_ERR_NONE;
  // NOTE(review): "ret" is initialised once and is not reset per feature; in the
  // qualifier branch below it is not assigned on every path, so a value from an earlier
  // feature can be reused. Confirm this is intended.
4186  int ret = 0;
4187 
4188  for (num = 0; dbp; dbp = dbp->mpNext, num++) {
4189  if (! dbp->mpData)
4190  continue;
4191  fbp = static_cast<FeatBlk*>(dbp->mpData);
4192  fbp->num = num;
  // NOTE(review): listing line 4193 is not present in this extract.
4194 
  // A key consisting of a single '-' stands for misc_feature.
4195  if (fbp->key[0] == '-' && fbp->key[1] == '\0') {
4196  ErrPostStr(SEV_WARNING, ERR_FEATURE_FeatureKeyReplaced, "Featkey '-' is replaced by 'misc_feature'");
4197  MemFree(fbp->key);
4198  fbp->key = StringSave("misc_feature");
4199  }
4200 
4201  if (StringEqu(fbp->key, "allele") ||
4202  StringEqu(fbp->key, "mutation")) {
4203  ErrPostEx(SEV_ERROR, ERR_FEATURE_ObsoleteFeature, "Obsolete feature \"%s\" found. Replaced with \"variation\".", fbp->key);
4204  MemFree(fbp->key);
4205  fbp->key = StringSave("variation");
4206  }
4207 
  // NOTE(review): listing line 4208, which presumably declares and computes "subtype"
  // from the feature key, is not present in this extract.
4209 
4210  /* bsv hack: exclude CONFLICT, REGION, SITE, UNSURE UniProt flatfile
4211  * features from valid GenBank ones: for USPTO only
4212  * Needs better workaround
4213  */
4214  if (source == Parser::ESource::USPTO &&
4215  (subtype == CSeqFeatData::eSubtype_conflict ||
4216  subtype == CSeqFeatData::eSubtype_region ||
4217  subtype == CSeqFeatData::eSubtype_site ||
4218  subtype == CSeqFeatData::eSubtype_unsure))
4219  subtype = CSeqFeatData::eSubtype_bad;
4220  keyindx = -1;
  // Key not recognised as a GenBank feature: try SpFeatKeyNameValid() before giving up.
  // NOTE(review): listing line 4222 (first statement of this branch) is not present in this extract.
4221  if (subtype == CSeqFeatData::eSubtype_bad && ! deb) {
4223  keyindx = SpFeatKeyNameValid(fbp->key);
  // NOTE(review): fbp->key is passed in the format-string position below, with
  // "Feature dropped" as an argument; confirm this is intended.
4224  if (keyindx < 0 && ! deb) {
4225  ErrPostEx(SEV_ERROR, ERR_FEATURE_UnknownFeatKey, fbp->key, "Feature dropped");
4226  dbp->mDrop = true;
4227  retval = GB_FEAT_ERR_DROP;
4228  continue;
4229  }
4230  fbp->spindex = keyindx;
4231  }
4232 
4233  if (! fbp->quals.empty()) {
4234  XMLCheckQualifiers(fbp);
4235  fbp = MergeNoteQual(fbp); /* allow more than one
4236  notes w/i a key */
4237 
4238  if (subtype == CSeqFeatData::eSubtype_bad) {
  // NOTE(review): listing line 4240 (first statement of this branch) is not present in this extract.
4239  if (keyindx < 0) {
4241  ret = GB_FEAT_ERR_REPAIRABLE;
4242  }
4243  } else if (fbp->spindex < 0) {
4244  /* last argument is perform_corrections if debug
4245  * mode is FALSE
4246  */
4247  ret = XGBFeatKeyQualValid(subtype, fbp->quals, true, ((source == Parser::ESource::Flybase) ? false : ! deb));
4248  }
  // Keep the most severe status seen over all features.
4249  if (ret > retval)
4250  retval = ret;
4251 
  // Anything worse than "repairable" drops the feature, except for ncRNA.
4252  if (ret > GB_FEAT_ERR_REPAIRABLE && ! StringEqu(fbp->key, "ncRNA"))
4253  dbp->mDrop = true;
  // No qualifiers at all: report a missing mandatory qualifier.
  // NOTE(review): this branch is taken only when subtype IS eSubtype_bad, yet it then asks
  // for the mandatory qualifiers of that subtype; confirm whether "!=" was intended.
4254  } else if (subtype == CSeqFeatData::eSubtype_bad && ! CSeqFeatData::GetMandatoryQualifiers(subtype).empty()) {
4255  if (! StringEqu(fbp->key, "mobile_element")) {
4256  auto qual_idx = *CSeqFeatData::GetMandatoryQualifiers(subtype).begin();
4257  string str1 = CSeqFeatData::GetQualifierAsString(qual_idx);
4258  const char* str = str1.c_str();
  // old_sequence and conflict are allowed to lack /citation.
4259  if ((! StringEqu(fbp->key, "old_sequence") &&
4260  ! StringEqu(fbp->key, "conflict")) ||
4261  ! StringEqu(str, "citation")) {
4262  ErrPostEx(SEV_ERROR, ERR_FEATURE_RequiredQualifierMissing, "lacks required /%s qualifier : feature has been dropped.", str);
4263  if (! deb) {
4264  dbp->mDrop = true;
4265  retval = GB_FEAT_ERR_DROP;
4266  }
4267  }
4268  }
4269  } else if (StringEqu(fbp->key, "misc_feature") && fbp->quals.empty()) {
  // A misc_feature without qualifiers is dropped, or only flagged in debug mode.
4270  if (! deb) {
4271  dbp->mDrop = true;
4272  retval = GB_FEAT_ERR_DROP;
4273  ErrPostStr(SEV_WARNING, ERR_FEATURE_Dropped, "Empty 'misc_feature' dropped");
4274  } else
4275  retval = GB_FEAT_ERR_REPAIRABLE;
4276  }
4277 
  // Final clean-up of qualifier values: run ShrinkSpaces on each value and reset
  // values that end up empty (except for /replace, which may be empty).
4278  for (auto& cur : fbp->quals) {
4279  if (! cur->IsSetQual() || ! cur->IsSetVal())
4280  continue;
4281 
4282  const string& qual_str = cur->GetQual();
4283  const string& val_str = cur->GetVal();
4284 
  // Work on a NUL-terminated, writable copy of the value.
4285  vector<Char> val_buf(val_str.begin(), val_str.end());
4286  val_buf.push_back(0);
4287 
4288  p = &val_buf[0];
4289  ShrinkSpaces(p);
4290  if (*p == '\0' && qual_str != "replace") {
4291  cur->ResetVal();
4292  val_buf[0] = 0;
4293  } else {
  // NOTE(review): the statement controlled by this "if" (listing line 4295) is not present
  // in this extract.
4294  if (qual_str == "replace")
4296  cur->SetVal(p);
4297  }
4298  }
4299  } /* for, each sub-block, or each feature key */
  // NOTE(review): listing line 4300 is not present in this extract.
4301  return (retval);
4302 }
4303 
4304 /**********************************************************/
4305 static bool fta_check_ncrna(const CSeq_feat& feat)
4306 {
4307  int count = 0;
4308 
4309  bool stop = false;
4310  for (const auto& qual : feat.GetQual()) {
4311  if (! qual->IsSetQual() || qual->GetQual().empty() ||
4312  qual->GetQual() != "ncRNA_class")
4313  continue;
4314 
4315  count++;
4316 
4317  if (! qual->IsSetVal() || qual->GetVal().empty()) {
4318  string loc = location_to_string_or_unknown(feat.GetLocation());
4319 
4320  ErrPostEx(SEV_REJECT, ERR_FEATURE_ncRNA_class, "Feature \"ncRNA\" at location \"%s\" has an empty /ncRNA_class qualifier.", loc.empty() ? "unknown" : loc.c_str());
4321  stop = true;
4322  break;
4323  }
4324 
4325  if (MatchArrayString(ncRNA_class_values, qual->GetVal().c_str()) < 0) {
4326  string loc = location_to_string_or_unknown(feat.GetLocation());
4327 
4328  ErrPostEx(SEV_REJECT, ERR_FEATURE_ncRNA_class, "Feature \"ncRNA\" at location \"%s\" has an invalid /ncRNA_class qualifier: \"%s\".", loc.empty() ? "unknown" : loc.c_str(), qual->GetVal().c_str());
4329  stop = true;
4330  break;
4331  }
4332  }
4333 
4334  if (stop)
4335  return false;
4336 
4337  if (count == 1)
4338  return true;
4339 
4340  string loc = location_to_string_or_unknown(feat.GetLocation());
4341 
4342  ErrPostEx(SEV_REJECT, ERR_FEATURE_ncRNA_class, "Feature \"ncRNA\" at location \"%s\" %s /ncRNA_class qualifier.", loc.empty() ? "unknown" : loc.c_str(), (count == 0) ? "lacks the mandatory" : "has more than one");
4343 
4344  return false;
4345 }
4346 
4347 /**********************************************************/
4349 {
  /* Handles the first /artificial_location qualifier found on `feat`.
   *
   * - A value consisting only of double quotes is treated as no value
   *   (the value is reset).
   * - The two accepted values mark the feature as an exception and are
   *   stored in (or appended, comma-separated, to) its exception text.
   * - Any other value, or no value, is reported with SEV_ERROR; `key` is
   *   used only to name the feature in that message.
   * In every case the qualifier is erased from the feature and the scan
   * stops, so at most one /artificial_location qualifier is processed.
   *
   * NOTE(review): the signature line (listing line 4348) is absent from
   * this copy; `feat` and `key` are the parameters as used below - confirm
   * their exact types against the repository.
   */
4350  for (auto qual = feat.SetQual().begin(); qual != feat.SetQual().end(); ++qual) {
4351  if (! (*qual)->IsSetQual() || (*qual)->GetQual() != "artificial_location")
4352  continue;
4353 
4354  if ((*qual)->IsSetVal()) {
  /* Skip leading double quotes; if nothing else follows, drop the value */
4355  const Char* p_val = (*qual)->GetVal().c_str();
4356  for (; *p_val == '\"';)
4357  ++p_val;
4358 
4359  if (*p_val == '\0')
4360  (*qual)->ResetVal();
4361  }
4362 
4363  string val = (*qual)->IsSetVal() ? (*qual)->GetVal() : "";
4364 
4365  if (val == "heterogenous population sequenced" ||
4366  val == "low-quality sequence region") {
4367  feat.SetExcept(true);
4368 
4369  if (! feat.IsSetExcept_text())
4370  feat.SetExcept_text(val);
4371  else {
4372  string& except_text = feat.SetExcept_text();
4373  except_text += ", ";
4374  except_text += val;
4375  }
4376  } else {
4377  auto loc_str = location_to_string_or_unknown(feat.GetLocation());
4378 
4379  if (val.empty())
4380  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_InvalidArtificialLoc, "Encountered empty /artificial_location qualifier : Feature \"%s\" : Location \"%s\". Qualifier dropped.", (! key || *key == '\0') ? "unknown" : key, loc_str.empty() ? "unknown" : loc_str.c_str());
4381  else
4382  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_InvalidArtificialLoc, "Value \"%s\" is not legal for the /artificial_location qualifier : Feature \"%s\" : Location \"%s\". Qualifier dropped.", val.c_str(), (! key || *key == '\0') ? "unknown" : key, loc_str.empty() ? "unknown" : loc_str.c_str());
4383  }
4384 
  /* The iterator is not used after erase(): the loop is left right away */
4385  feat.SetQual().erase(qual);
4386  break;
4387  }
4388 }
4389 
4390 /**********************************************************/
4391 static bool fta_check_mobile_element(const CSeq_feat& feat)
4392 {
4393  bool found = false;
4394  for (const auto& qual : feat.GetQual()) {
4395  if (qual->IsSetQual() && qual->GetQual() == "mobile_element_type" &&
4396  qual->IsSetVal() && ! qual->GetVal().empty()) {
4397  const Char* p_val = qual->GetVal().c_str();
4398  for (; *p_val == '\"';)
4399  ++p_val;
4400 
4401  if (*p_val != '\0') {
4402  found = true;
4403  break;
4404  }
4405  }
4406  }
4407 
4408  if (found)
4409  return true;
4410 
4411  auto loc_str = location_to_string_or_unknown(feat.GetLocation());
4412  ErrPostEx(SEV_REJECT, ERR_FEATURE_RequiredQualifierMissing, "Mandatory qualifier /mobile_element_type is absent or has no value : Feature \"mobile_element\" : Location \"%s\". Entry dropped.", loc_str.empty() ? "unknown" : loc_str.c_str());
4413 
4414  return false;
4415 }
4416 
4417 /**********************************************************/
4418 static bool SortFeaturesByLoc(const DataBlkPtr& sp1, const DataBlkPtr& sp2)
4419 {
4420  FeatBlkPtr fbp1;
4421  FeatBlkPtr fbp2;
4422  Int4 status;
4423 
4424  fbp1 = static_cast<FeatBlk*>(sp1->mpData);
4425  fbp2 = static_cast<FeatBlk*>(sp2->mpData);
4426 
4427  if (! fbp1->location && fbp2->location)
4428  return false;
4429  if (fbp1->location && ! fbp2->location)
4430  return false;
4431 
4432  if (fbp1->location && fbp2->location) {
4433  status = StringCmp(fbp1->location, fbp2->location);
4434  if (status != 0)
4435  return status < 0;
4436  }
4437 
4438  if (! fbp1->key && fbp2->key)
4439  return false;
4440  if (fbp1->key && ! fbp2->key)
4441  return false;
4442  if (fbp1->key && fbp2->key) {
4443  status = StringCmp(fbp1->key, fbp2->key);
4444  if (status != 0)
4445  return status < 0;
4446  }
4447 
4448  return false;
4449 }
4450 
4451 /**********************************************************/
4452 static bool SortFeaturesByOrder(const DataBlkPtr& sp1, const DataBlkPtr& sp2)
4453 {
4454  FeatBlkPtr fbp1;
4455  FeatBlkPtr fbp2;
4456 
4457  fbp1 = static_cast<FeatBlk*>(sp1->mpData);
4458  fbp2 = static_cast<FeatBlk*>(sp2->mpData);
4459 
4460  return fbp1->num < fbp2->num;
4461 }
4462 
4463 /**********************************************************/
4464 static DataBlkPtr fta_sort_features(DataBlkPtr dbp, bool order)
4465 {
4466  size_t total = 0;
4467  for (DataBlkPtr tdbp = dbp; tdbp; tdbp = tdbp->mpNext)
4468  total++;
4469 
4470  vector<DataBlk*> temp;
4471  temp.reserve(total);
4472  for (DataBlkPtr tdbp = dbp; tdbp; tdbp = tdbp->mpNext)
4473  temp.push_back(tdbp);
4474 
4475  std::sort(temp.begin(), temp.end(), (order ? SortFeaturesByOrder : SortFeaturesByLoc));
4476 
4477  DataBlkPtr tdbp = dbp = temp[0];
4478  for (size_t i = 0; i < total - 1; tdbp = tdbp->mpNext, i++)
4479  tdbp->mpNext = temp[i + 1];
4480 
4481  temp[total - 1]->mpNext = nullptr;
4482  return (dbp);
4483 }
4484 
4485 /**********************************************************/
4486 static void fta_convert_to_regulatory(FeatBlkPtr fbp, const char* rclass)
4487 {
4488  if (! fbp || ! fbp->key || ! rclass)
4489  return;
4490 
4491  if (fbp->key)
4492  MemFree(fbp->key);
4493  fbp->key = StringSave("regulatory");
4494 
4495  CRef<CGb_qual> qual(new CGb_qual);
4496  qual->SetQual("regulatory_class");
4497  qual->SetVal(rclass);
4498  fbp->quals.push_back(qual);
4499 }
4500 
4501 /**********************************************************/
4502 static void fta_check_replace_regulatory(DataBlkPtr dbp, bool* drop)
4503 {
4504  FeatBlkPtr fbp;
4505  const char** b;
4506  char* p;
4507  bool got_note;
4508  bool other_class;
4509  Int4 count;
4510  Char ch;
4511 
4512  for (; dbp; dbp = dbp->mpNext) {
4513  fbp = static_cast<FeatBlk*>(dbp->mpData);
4514  if (! fbp || ! fbp->key)
4515  continue;
4516 
4517  if (StringEqu(fbp->key, "attenuator"))
4518  fta_convert_to_regulatory(fbp, "attenuator");
4519  else if (StringEqu(fbp->key, "CAAT_signal"))
4520  fta_convert_to_regulatory(fbp, "CAAT_signal");
4521  else if (StringEqu(fbp->key, "enhancer"))
4522  fta_convert_to_regulatory(fbp, "enhancer");
4523  else if (StringEqu(fbp->key, "GC_signal"))
4524  fta_convert_to_regulatory(fbp, "GC_signal");
4525  else if (StringEqu(fbp->key, "-35_signal"))
4526  fta_convert_to_regulatory(fbp, "minus_35_signal");
4527  else if (StringEqu(fbp->key, "-10_signal"))
4528  fta_convert_to_regulatory(fbp, "minus_10_signal");
4529  else if (StringEqu(fbp->key, "polyA_signal"))
4530  fta_convert_to_regulatory(fbp, "polyA_signal_sequence");
4531  else if (StringEqu(fbp->key, "promoter"))
4532  fta_convert_to_regulatory(fbp, "promoter");
4533  else if (StringEqu(fbp->key, "RBS"))
4534  fta_convert_to_regulatory(fbp, "ribosome_binding_site");
4535  else if (StringEqu(fbp->key, "TATA_signal"))
4536  fta_convert_to_regulatory(fbp, "TATA_box");
4537  else if (StringEqu(fbp->key, "terminator"))
4538  fta_convert_to_regulatory(fbp, "terminator");
4539  else if (! StringEqu(fbp->key, "regulatory"))
4540  continue;
4541 
4542  got_note = false;
4543  other_class = false;
4544  count = 0;
4545 
4546  for (const auto& cur : fbp->quals) {
4547  if (! cur->IsSetQual() || ! cur->IsSetVal())
4548  continue;
4549 
4550  const string& qual_str = cur->GetQual();
4551 
4552  if (qual_str != "regulatory_class") {
4553  if (qual_str == "note")
4554  got_note = true;
4555  continue;
4556  }
4557 
4558  count++;
4559  if (! cur->IsSetVal() || cur->GetVal().empty()) {
4560  ch = '\0';
4561  if (! fbp->location || *fbp->location == '\0')
4562  p = (char*)"(empty)";
4563  else {
4564  p = fbp->location;
4565  if (StringLen(p) > 50) {
4566  ch = p[50];
4567  p[50] = '\0';
4568  }
4569  }
4570  ErrPostEx(SEV_REJECT, ERR_QUALIFIER_InvalidRegulatoryClass, "Empty /regulatory_class qualifier value in regulatory feature at location %s.", p);
4571  if (ch != '\0')
4572  p[50] = ch;
4573  *drop = true;
4574  continue;
4575  }
4576 
4577  const string& val_str = cur->GetVal();
4578 
4579  for (b = RegulatoryClassValues; *b; b++)
4580  if (val_str == *b)
4581  break;
4582 
4583  if (*b) {
4584  if (val_str == "other")
4585  other_class = true;
4586  continue;
4587  }
4588 
4589  ch = '\0';
4590  if (! fbp->location || *fbp->location == '\0')
4591  p = (char*)"(empty)";
4592  else {
4593  p = fbp->location;
4594  if (StringLen(p) > 50) {
4595  ch = p[50];
4596  p[50] = '\0';
4597  }
4598  }
4599  ErrPostEx(SEV_REJECT, ERR_QUALIFIER_InvalidRegulatoryClass, "Invalid /regulatory_class qualifier value %s provided in regulatory feature at location %s.", val_str.c_str(), p);
4600  if (ch != '\0')
4601  p[50] = ch;
4602  *drop = true;
4603  }
4604 
4605  if (count == 0) {
4606  ch = '\0';
4607  if (! fbp->location || *fbp->location == '\0')
4608  p = (char*)"(empty)";
4609  else {
4610  p = fbp->location;
4611  if (StringLen(p) > 50) {
4612  ch = p[50];
4613  p[50] = '\0';
4614  }
4615  }
4616  ErrPostEx(SEV_REJECT, ERR_QUALIFIER_MissingRegulatoryClass, "The regulatory feature is missing mandatory /regulatory_class qualifier at location %s.", p);
4617  if (ch != '\0')
4618  p[50] = ch;
4619  *drop = true;
4620  } else if (count > 1) {
4621  ch = '\0';
4622  if (! fbp->location || *fbp->location == '\0')
4623  p = (char*)"(empty)";
4624  else {
4625  p = fbp->location;
4626  if (StringLen(p) > 50) {
4627  ch = p[50];
4628  p[50] = '\0';
4629  }
4630  }
4631  ErrPostEx(SEV_REJECT, ERR_QUALIFIER_MultipleRegulatoryClass, "Multiple /regulatory_class qualifiers were encountered in regulatory feature at location %s.", p);
4632  if (ch != '\0')
4633  p[50] = ch;
4634  *drop = true;
4635  }
4636 
4637  if (other_class && ! got_note) {
4638  ch = '\0';
4639  if (! fbp->location || *fbp->location == '\0')
4640  p = (char*)"(empty)";
4641  else {
4642  p = fbp->location;
4643  if (StringLen(p) > 50) {
4644  ch = p[50];
4645  p[50] = '\0';
4646  }
4647  }
4648  ErrPostEx(SEV_REJECT, ERR_QUALIFIER_NoNoteForOtherRegulatory, "The regulatory feature of class other is lacking required /note qualifier at location %s.", p);
4649  if (ch != '\0')
4650  p[50] = ch;
4651  *drop = true;
4652  }
4653  }
4654 }
4655 
4656 /**********************************************************/
4657 static void fta_create_wgs_dbtag(CBioseq& bioseq,
4658  const string& submitter_seqid,
4659  char* prefix,
4660  Int4 seqtype)
4661 {
4662  string dbname;
4663  if (seqtype == 0 || seqtype == 1 || seqtype == 7)
4664  dbname = "WGS:";
4665  else if (seqtype == 4 || seqtype == 5 || seqtype == 8 || seqtype == 9)
4666  dbname = "TSA:";
4667  else
4668  dbname = "TLS:";
4669  dbname += prefix;
4670 
4671  CRef<CSeq_id> gen_id(new CSeq_id);
4672  CDbtag& tag = gen_id->SetGeneral();
4673  tag.SetTag().SetStr(submitter_seqid);
4674  tag.SetDb(dbname);
4675  bioseq.SetId().push_back(gen_id);
4676 }
4677 
4678 /**********************************************************/
/* Adds a general Seq-id built from ibp->submitter_seqid to `bioseq`
 * (through fta_create_wgs_dbtag()), using a project prefix taken, in this
 * order, from:
 *   1. the entry's own accession (ibp->acnum),
 *   2. its secondary accessions, when they all share one prefix,
 *   3. the accessions referenced by the delta-seq locations of `bioseq`,
 *      when they all share one prefix.
 * A prefix is the first 6 characters of an accession when its 5th
 * character is a digit, otherwise the first 8.
 *
 * When no prefix can be determined an error is posted; the SEV_REJECT
 * cases also set ibp->drop. Nothing is done when `ibp` is null or has no
 * submitter_seqid.
 *
 * NOTE(review): this copy of the listing omits source lines 4681, 4761 and
 * 4814 (the third parameter, the declaration of `loc`, and the condition
 * opening the "SubmitterSeqidIgnored" branch, presumably). Consult the
 * repository copy before relying on those parts.
 */
4679 static void fta_create_wgs_seqid(CBioseq& bioseq,
4680  IndexblkPtr ibp,
4682 {
4683  TokenBlkPtr tbp;
4684  char* prefix;
4685  char* p;
4686  Int4 seqtype;
4687  Int4 i;
4688 
4689  if (! ibp || ibp->submitter_seqid.empty())
4690  return;
4691 
4692  prefix = nullptr;
4693 
  /* Classify the primary accession; these type codes are rejected with the
   * "master records" message below */
4694  seqtype = fta_if_wgs_acc(ibp->acnum);
4695  if (seqtype == 0 || seqtype == 3 || seqtype == 4 || seqtype == 6 ||
4696  seqtype == 10 || seqtype == 12) {
4697  ErrPostEx(SEV_REJECT, ERR_SOURCE_SubmitterSeqidNotAllowed, "WGS/TLS/TSA master records are not allowed to have /submitter_seqid qualifiers, only contigs and scaffolds. Entry dropped.");
4698  ibp->drop = true;
4699  return;
4700  }
4701 
  /* For these type codes the prefix comes straight from the accession */
4702  if (seqtype == 1 || seqtype == 5 || seqtype == 7 || seqtype == 8 ||
4703  seqtype == 9 || seqtype == 11) {
4704  prefix = StringSave(ibp->acnum);
4705  if (prefix[4] >= '0' && prefix[4] <= '9')
4706  prefix[6] = '\0';
4707  else
4708  prefix[8] = '\0';
4709  fta_create_wgs_dbtag(bioseq, ibp->submitter_seqid, prefix, seqtype);
4710  MemFree(prefix);
4711  return;
4712  }
4713 
  /* Try the secondary accessions: the first one (tokens starting with '-'
   * are skipped) supplies the candidate prefix, and every later one must
   * start with the same 6 or 8 characters. The loop ends early (tbp stays
   * non-null) on the first mismatch. */
4714  for (tbp = ibp->secaccs; tbp; tbp = tbp->next) {
4715  if (tbp->str[0] == '-')
4716  continue;
4717 
4718  if (! prefix)
4719  prefix = StringSave(tbp->str);
4720  else {
4721  i = (prefix[4] >= '0' && prefix[4] <= '9') ? 6 : 8;
4722  if (! StringEquN(prefix, tbp->str, i))
4723  break;
4724  }
4725  }
4726 
  /* All secondary accessions agreed: use the prefix if its type qualifies */
4727  if (! tbp && prefix) {
4728  seqtype = fta_if_wgs_acc(prefix);
4729  if (seqtype == 0 || seqtype == 1 || seqtype == 4 || seqtype == 5 ||
4730  seqtype == 7 || seqtype == 8 || seqtype == 9 || seqtype == 10 ||
4731  seqtype == 11) {
4732  if (prefix[4] >= '0' && prefix[4] <= '9')
4733  prefix[6] = '\0';
4734  else
4735  prefix[8] = '\0';
4736  fta_create_wgs_dbtag(bioseq, ibp->submitter_seqid, prefix, seqtype);
4737  MemFree(prefix);
4738  return;
4739  }
4740  }
4741 
4742  if (prefix) {
4743  MemFree(prefix);
4744  prefix = nullptr;
4745  }
4746 
  /* Last resort: derive the prefix from the accessions referenced by the
   * delta-seq locations. Any component that does not fit (see the breaks
   * below) leaves `ci` valid and aborts the outer loop as well. */
4747  if (bioseq.GetInst().IsSetExt() && bioseq.GetInst().GetExt().IsDelta()) {
4748  CDelta_ext::Tdata deltas = bioseq.GetInst().GetExt().GetDelta();
4749  CDelta_ext::Tdata::iterator delta;
4750 
4751  for (delta = deltas.begin(); delta != deltas.end(); delta++) {
4752  const CSeq_id* id = nullptr;
4753 
4754  if (! (*delta)->IsLoc())
4755  continue;
4756 
4757  const CSeq_loc& locs = (*delta)->GetLoc();
4758  CSeq_loc_CI ci(locs);
4759 
4760  for (; ci; ++ci) {
4762  if (! loc->IsInt())
4763  continue;
4764  id = &ci.GetSeq_id();
4765  if (! id)
4766  break;
4767  if (! id->IsGenbank() && ! id->IsEmbl() && ! id->IsDdbj() &&
4768  ! id->IsOther() && ! id->IsTpg() && ! id->IsTpe() &&
4769  ! id->IsTpd())
4770  break;
4771 
4772  const CTextseq_id* text_id = id->GetTextseq_Id();
4773  if (! text_id || ! text_id->IsSetAccession() ||
4774  text_id->GetAccession().empty())
4775  break;
4776 
4777  p = (char*)text_id->GetAccession().c_str();
4778  if (! prefix)
4779  prefix = StringSave(p);
4780  else {
4781  i = (prefix[4] >= '0' && prefix[4] <= '9') ? 6 : 8;
4782  if (! StringEquN(prefix, p, i))
4783  break;
4784  }
4785  }
4786  if (ci)
4787  break;
4788  }
4789 
  /* Reached the end without a mismatch: use the prefix if its type qualifies */
4790  if (delta == deltas.end() && prefix) {
4791  seqtype = fta_if_wgs_acc(prefix);
4792  if (seqtype == 0 || seqtype == 1 || seqtype == 4 || seqtype == 5 ||
4793  seqtype == 7 || seqtype == 8 || seqtype == 9 || seqtype == 10 ||
4794  seqtype == 11) {
4795  if (prefix[4] >= '0' && prefix[4] <= '9')
4796  prefix[6] = '\0';
4797  else
4798  prefix[8] = '\0';
4799  fta_create_wgs_dbtag(bioseq, ibp->submitter_seqid, prefix, seqtype);
4800  MemFree(prefix);
4801  return;
4802  }
4803  }
4804 
4805  if (prefix) {
4806  MemFree(prefix);
4807  prefix = nullptr;
4808  }
4809 
4810  ErrPostEx(SEV_ERROR, ERR_SOURCE_SubmitterSeqidDropped, "Could not determine project code for what appears to be a WGS/TLS/TSA scaffold record. /submitter_seqid dropped.");
4811  return;
4812  }
4813 
4815  ErrPostEx(SEV_ERROR, ERR_SOURCE_SubmitterSeqidIgnored, "Submitter sequence identifiers for non-project-based TSA records are not supported. /submitter_seqid \"%s\" has been dropped.", ibp->submitter_seqid.c_str());
4816  return;
4817  }
4818 
  /* Nothing matched: the record is not allowed to carry /submitter_seqid */
4819  ErrPostEx(SEV_REJECT, ERR_SOURCE_SubmitterSeqidNotAllowed, "Only WGS/TLS/TSA related records (contigs and scaffolds) are allowed to have /submitter_seqid qualifier. This \"%s\" is not one of them. Entry dropped.", ibp->acnum);
4820  ibp->drop = true;
4821 }
4822 
4823 /**********************************************************
4824  *
4825  * void LoadFeat(pp, entry, bioseq):
4826  *
4827  * 5-4-93
4828  *
4829  **********************************************************/
/* Builds the feature table of the entry currently being parsed
 * (pp->entrylist[pp->curindx]) and stores the result in `bioseq`.
 *
 * Visible steps:
 *   - parse and check every feature block of `entry`;
 *   - collect gap features and call ParseSourceFeat(), which fills
 *     seq_feats; the BioSource of the first element replaces all existing
 *     Source descriptors of `bioseq`;
 *   - convert the remaining feature blocks to CSeq_feat objects and check
 *     their locations;
 *   - run the operon and ncRNA checks and the publication processing;
 *   - append the surviving features to `bioseq` as a new feature-table
 *     annotation.
 * ibp->drop is set whenever one of the checks decides the entry must be
 * dropped; in that case the function returns without adding an annotation.
 *
 * NOTE(review): this copy of the listing omits source lines 4844, 4852,
 * 4869, 4978, 5023 and 5034-5035, so the statements around those gaps
 * (e.g. the declaration of `pat_id`, the GenBank value of `type`, the tail
 * of the "gap" condition) are incomplete as shown. Consult the repository
 * copy before relying on those parts.
 */
4830 void LoadFeat(ParserPtr pp, const DataBlk& entry, CBioseq& bioseq)
4831 {
4832  DataBlkPtr dab;
4833  DataBlkPtr dabnext;
4834  DataBlkPtr dbp;
4835  DataBlkPtr tdbp;
4836  FeatBlkPtr fbp;
4837 
4838  IndexblkPtr ibp;
4839  Int4 col_data;
4840  Int2 type;
4841  Int4 i = 0;
4842  CRef<CSeq_id> pat_seq_id;
4843 
4845 
4846  ibp = pp->entrylist[pp->curindx];
4847 
4848  CRef<CSeq_id> seq_id =
4849  MakeAccSeqId(ibp->acnum, pp->seqtype, pp->accver, ibp->vernum);
4850  if (pp->source == Parser::ESource::USPTO) {
4851  pat_seq_id = new CSeq_id;
4853  pat_seq_id->SetPatent(*pat_id);
4854  }
4855 
  /* Fall back to a local id when MakeAccSeqId() returned nothing.
   * NOTE(review): seq_id is still empty when acnum is blank and the mode
   * is not Relaxed, yet it is dereferenced further below - confirm that
   * callers rule this case out. */
4856  if (! seq_id) {
4857  if (! NStr::IsBlank(ibp->acnum)) {
4858  seq_id = Ref(new CSeq_id(CSeq_id::e_Local, ibp->acnum));
4859  } else if (pp->mode == Parser::EMode::Relaxed) {
4860  seq_id = Ref(new CSeq_id(CSeq_id::e_Local, ibp->locusname));
4861  }
4862  }
4863 
4864  TSeqIdList ids;
4865  ids.push_back(seq_id);
4866 
  /* Data column and feature-block type depend on the input format */
4867  if (pp->format == Parser::EFormat::GenBank) {
4868  col_data = ParFlat_COL_DATA;
4870  } else if (pp->format == Parser::EFormat::XML) {
4871  col_data = 0;
4872  type = XML_FEATURES;
4873  } else {
4874  col_data = ParFlat_COL_DATA_EMBL;
4875  type = ParFlat_FH;
4876  }
4877 
4878  /* Find feature already isolated in a "block"
4879  * The key, location and qualifiers will be isolated to
4880  * a FeatBlk at the first step of ParseFeatureBlock, which
4881  * parses a single feature at a time.
4882  * -Karl
4883  */
4884  if (pp->format == Parser::EFormat::XML)
4885  dab = XMLLoadFeatBlk(entry.mOffset, ibp->xip);
4886  else
4887  dab = TrackNodeType(entry, type);
4888  for (dbp = dab; dbp; dbp = dbp->mpNext) {
4889  if (dbp->mType != type)
4890  continue;
4891 
4892  /* Parsing each feature subblock to FeatBlkPtr, fbp
4893  * it also checks semantics of qualifiers and keys
4894  */
4895  if (pp->format == Parser::EFormat::XML)
4896  XMLParseFeatureBlock(pp->debug, static_cast<DataBlk*>(dbp->mpData), pp->source);
4897  else
4898  ParseFeatureBlock(ibp, pp->debug, static_cast<DataBlk*>(dbp->mpData), pp->source, pp->format);
4899 
  /* The checks below run on the list sorted by location/key; the original
   * order (by FeatBlk::num) is restored at the end of the iteration */
4900  dbp->mpData = fta_sort_features(static_cast<DataBlk*>(dbp->mpData), false);
4901  fta_check_pseudogene_qual(static_cast<DataBlk*>(dbp->mpData));
4902  fta_check_old_locus_tags(static_cast<DataBlk*>(dbp->mpData), &ibp->drop);
4903  fta_check_compare_qual(static_cast<DataBlk*>(dbp->mpData), ibp->is_tpa);
4904  tdbp = static_cast<DataBlk*>(dbp->mpData);
  /* Besides removing duplicate qualifiers, this loop leaves the number of
   * features of this block in i */
4905  for (i = 0; tdbp; i++, tdbp = tdbp->mpNext)
4906  fta_remove_dup_quals(static_cast<FeatBlk*>(tdbp->mpData));
4907  fta_remove_dup_feats(static_cast<DataBlk*>(dbp->mpData));
4908  for (tdbp = static_cast<DataBlk*>(dbp->mpData); tdbp; tdbp = tdbp->mpNext)
4909  fta_check_rpt_unit_range(static_cast<FeatBlk*>(tdbp->mpData), ibp->bases);
4910  fta_check_multiple_locus_tag(static_cast<DataBlk*>(dbp->mpData), &ibp->drop);
4911  if (ibp->is_tpa || ibp->is_tsa || ibp->is_tls)
4912  fta_check_non_tpa_tsa_tls_locations(static_cast<DataBlk*>(dbp->mpData), ibp);
4913  fta_check_replace_regulatory(static_cast<DataBlk*>(dbp->mpData), &ibp->drop);
4914  dbp->mpData = fta_sort_features(static_cast<DataBlk*>(dbp->mpData), true);
4915  }
4916 
  /* i is the feature count of the last block processed above */
4917  if (i > 1 && ibp->is_mga) {
4918  ErrPostEx(SEV_REJECT, ERR_FEATURE_MoreThanOneCAGEFeat, "CAGE records are allowed to have only one feature, and it must be the \"source\" one. Entry dropped.");
4919  ibp->drop = true;
4920  }
4921 
4922  if (! ibp->drop)
4923  CollectGapFeats(entry, dab, pp, type);
4924 
4925  TSeqFeatList seq_feats;
4926  if (! ibp->drop)
4927  ParseSourceFeat(pp, dab, ids, type, bioseq, seq_feats);
4928 
  /* Nothing in seq_feats (ParseSourceFeat produced nothing, or was not
   * called because the entry is already dropped): mark the entry dropped,
   * free the feature blocks and stop */
4929  if (seq_feats.empty()) {
4930  ibp->drop = true;
4931  for (; dab; dab = dabnext) {
4932  dabnext = dab->mpNext;
4933  FreeFeatBlk(static_cast<DataBlk*>(dab->mpData), pp->format);
4934  if (pp->format == Parser::EFormat::XML)
4935  dab->SimpleDelete();
4936  }
4937  xinstall_gbparse_range_func(nullptr, nullptr);
4938  return;
4939  }
4940 
4941  if (! ibp->submitter_seqid.empty())
4942  fta_create_wgs_seqid(bioseq, ibp, pp->source);
4943 
  /* Remove every existing Source descriptor ... */
4944  CSeq_descr::Tdata& descr_list = bioseq.SetDescr().Set();
4945  for (CSeq_descr::Tdata::iterator descr = descr_list.begin(); descr != descr_list.end();) {
4946  if (! (*descr)->IsSource()) {
4947  ++descr;
4948  continue;
4949  }
4950 
4951  descr = descr_list.erase(descr);
4952  }
4953 
  /* ... and add one built from the BioSource of the first feature in
   * seq_feats; that feature is then removed from the list */
4954  CRef<CSeqdesc> descr_src(new CSeqdesc);
4955  descr_src->SetSource(seq_feats.front()->SetData().SetBiosrc());
4956 
4957  descr_list.push_back(descr_src);
4958  seq_feats.pop_front();
4959 
4960  fta_get_gcode_from_biosource(descr_src->GetSource(), ibp);
4961 
  /* Convert the remaining feature blocks to CSeq_feat objects; each block
   * list is freed once it has been processed */
4962  for (; dab; dab = dabnext) {
4963  dabnext = dab->mpNext;
4964  if (dab->mType != type) {
4965  if (pp->format == Parser::EFormat::XML)
4966  dab->SimpleDelete();
4967  continue;
4968  }
4969 
4970  for (dbp = static_cast<DataBlk*>(dab->mpData); dbp; dbp = dbp->mpNext) {
4971  if (dbp->mDrop == true)
4972  continue;
4973 
4974  fbp = static_cast<FeatBlk*>(dbp->mpData);
4975  if (StringEqu(fbp->key, "source") ||
4976  StringEqu(fbp->key, "assembly_gap") ||
4977  (StringEqu(fbp->key, "gap") &&
4979  continue;
4980 
4981  fta_sort_quals(fbp, pp->qamode);
4982  CRef<CSeq_feat> feat;
4983  if (fbp->spindex < 0)
4984  feat = ProcFeatBlk(pp, fbp, ids);
4985  else
4986  feat = SpProcFeatBlk(pp, fbp, ids);
  /* No feature produced: skipped, but for a CDS the entry is dropped */
4987  if (feat.Empty()) {
4988  if (StringEqu(fbp->key, "CDS")) {
4989  ErrPostEx(SEV_ERROR, ERR_FEATURE_LocationParsing, "CDS feature has unparsable location. Entry dropped. Location = [%s].", fbp->location);
4990  ibp->drop = true;
4991  }
4992  continue;
4993  }
4994 
4995  if (StringEqu(fbp->key, "mobile_element") &&
4996  ! fta_check_mobile_element(*feat)) {
4997  ibp->drop = true;
4998  continue;
4999  }
5000 
5001  fta_check_artificial_location(*feat, fbp->key);
5002 
5003  if (CheckForeignLoc(feat->GetLocation(),
5004  (pp->source == Parser::ESource::USPTO) ? *pat_seq_id : *seq_id)) {
5005  ErrPostEx(SEV_WARNING, ERR_LOCATION_FailedCheck, "Location pointing outside the entry [%s]", fbp->location);
5006 
5007  if (feat->GetData().IsImp()) {
5008  const CImp_feat& imp_feat = feat->GetData().GetImp();
5009  if (imp_feat.GetKey() == "intron" ||
5010  imp_feat.GetKey() == "exon") {
5011  /* foreign introns and exons wouldn't be parsed
5012  */
5013  feat.Reset();
5014  continue;
5015  }
5016  }
5017  }
5018 
5019  FilterDb_xref(*feat, pp->source);
5020 
  /* Result 0: the feature is kept only in debug mode. Any other result
   * keeps the feature; result 1 additionally reports mixed strands. */
5021  i = FTASeqLocCheck(feat->GetLocation(), ibp->acnum);
5022  if (i == 0) {
5024 
5025  if (pp->debug)
5026  seq_feats.push_back(feat);
5027  else {
5028  feat.Reset();
5029  continue;
5030  }
5031  } else {
5032  if (i == 1) {
5033  if (feat->IsSetExcept_text() && feat->GetExcept_text() == "trans-splicing")
5036  "Mixed strands in SeqLoc of /trans_splicing feature: %s",
5037  fbp->location);
5038  else
5039  ErrPostEx(SEV_WARNING, ERR_LOCATION_MixedStrand, "Mixed strands in SeqLoc: %s", fbp->location);
5040  }
5041 
5042  seq_feats.push_back(feat);
5043  }
5044  }
5045  FreeFeatBlk(static_cast<DataBlk*>(dab->mpData), pp->format);
5046  if (pp->format == Parser::EFormat::XML)
5047  dab->SimpleDelete();
5048  }
5049 
5050  if (! fta_perform_operon_checks(seq_feats, ibp)) {
5051  ibp->drop = true;
5052  seq_feats.clear();
5053  xinstall_gbparse_range_func(nullptr, nullptr);
5054  return;
5055  }
5056 
  /* Imp-feats whose key contains "RNA" are passed to GetRnaRef(); an
   * ncRNA that fails fta_check_ncrna() drops the whole entry */
5057  bool stop = false;
5058  for (auto& feat : seq_feats) {
5059  if (! feat->GetData().IsImp())
5060  continue;
5061 
5062  const CImp_feat& imp_feat = feat->GetData().GetImp();
5063 
5064  if (imp_feat.IsSetKey() &&
5065  StringStr(imp_feat.GetKey().c_str(), "RNA")) {
5066  if (imp_feat.GetKey() == "ncRNA" && ! fta_check_ncrna(*feat)) {
5067  stop = true;
5068  break;
5069  }
5070 
5071  GetRnaRef(*feat, bioseq, pp->source, pp->accver);
5072  }
5073  }
5074 
5075  if (stop) {
5076  ibp->drop = true;
5077  seq_feats.clear();
5078  xinstall_gbparse_range_func(nullptr, nullptr);
5079  return;
5080  }
5081 
5082  SeqFeatPub(pp, entry, seq_feats, ids, col_data, ibp);
5083  if (seq_feats.empty() && ibp->drop) {
5084  xinstall_gbparse_range_func(nullptr, nullptr);
5085  return;
5086  }
5087 
5088  /* ImpFeatPub() call will be removed in asn 4.0
5089  */
5090  ImpFeatPub(pp, entry, seq_feats, *seq_id, col_data, ibp);
5091 
5092  xinstall_gbparse_range_func(nullptr, nullptr);
5093  if (seq_feats.empty())
5094  return;
5095 
  /* Hand the remaining features over to a new feature-table annotation */
5096  CRef<CSeq_annot> annot(new CSeq_annot);
5097  annot->SetData().SetFtable().swap(seq_feats);
5098 
5099  bioseq.SetAnnot().push_back(annot);
5100 }
5101 
5102 /**********************************************************/
5103 static CMolInfo::EBiomol GetBiomolFromToks(char* mRNA, char* tRNA, char* rRNA, char* snRNA, char* scRNA, char* uRNA, char* snoRNA)
5104 {
5105  char* p = nullptr;
5106 
5107  if (mRNA)
5108  p = mRNA;
5109  if (! p || (tRNA && tRNA < p))
5110  p = tRNA;
5111  if (! p || (rRNA && rRNA < p))
5112  p = rRNA;
5113  if (! p || (snRNA && snRNA < p))
5114  p = snRNA;
5115  if (! p || (scRNA && scRNA < p))
5116  p = scRNA;
5117  if (! p || (uRNA && uRNA < p))
5118  p = uRNA;
5119  if (! p || (snoRNA && snoRNA < p))
5120  p = snoRNA;
5121 
5122  if (p == mRNA)
5123  return (Seq_descr_GIBB_mol_mRNA);
5124  if (p == tRNA)
5125  return (Seq_descr_GIBB_mol_tRNA);
5126  if (p == rRNA)
5127  return (Seq_descr_GIBB_mol_rRNA);
5128  if (p == snRNA || p == uRNA)
5129  return (Seq_descr_GIBB_mol_snRNA);
5130  if (p == snoRNA)
5131  return (Seq_descr_GIBB_mol_snoRNA);
5132  return (Seq_descr_GIBB_mol_scRNA);
5133 }
5134 
5135 /**********************************************************/
5136 void GetFlatBiomol(CMolInfo::TBiomol& biomol, CMolInfo::TTech tech, char* molstr, ParserPtr pp, const DataBlk& entry, const COrg_ref* org_ref)
5137 {
5138  Int4 genomic;
5139  char* offset;
5140  char c;
5141  DataBlkPtr dbp;
5142 
5143  Int2 count;
5144  Int2 i;
5145  EntryBlkPtr ebp;
5146  IndexblkPtr ibp;
5147  const char* p;
5148 
5149  char* q;
5150  char* r;
5151  char* mRNA = nullptr;
5152  char* tRNA = nullptr;
5153  char* rRNA = nullptr;
5154  char* snRNA = nullptr;
5155  char* scRNA = nullptr;
5156  char* uRNA = nullptr;
5157  char* snoRNA = nullptr;
5158  bool stage;
5159  bool techok;
5160  bool same;
5161  bool is_syn;
5162 
5163  ebp = static_cast<EntryBlk*>(entry.mpData);
5164 
5165  CBioseq& bioseq = ebp->seq_entry->SetSeq();
5166  ibp = pp->entrylist[pp->curindx];
5167 
5168  if (ibp->is_prot) {
5169  bioseq.SetInst().SetMol(CSeq_inst::eMol_aa);
5170  biomol = CMolInfo::eBiomol_peptide;
5171  return;
5172  }
5173 
5174  if (StringEqu(ibp->division, "SYN") ||
5175  (org_ref && org_ref->IsSetOrgname() && org_ref->GetOrgname().IsSetDiv() &&
5176  org_ref->GetOrgname().GetDiv() == "SYN"))
5177  is_syn = true;
5178  else
5179  is_syn = false;
5180 
5181  r = nullptr;
5182  c = '\0';
5183  if (! ibp->moltype.empty()) {
5184  if (pp->source == Parser::ESource::DDBJ && StringEquNI(molstr, "PRT", 3))
5185  return;
5186 
5187  biomol = Seq_descr_GIBB_mol_genomic;
5188  bioseq.SetInst().SetMol(CSeq_inst::eMol_dna);
5189 
5190  if (molstr) {
5191  q = molstr;
5192  r = molstr;
5194  while (*r != ';' && *r != '\n' && *r != '\0')
5195  r++;
5196  else {
5197  while (*r != ';' && *r != ' ' && *r != '\t' && *r != '\n' &&
5198  *r != '\0')
5199  r++;
5200  if (r - molstr > 10)
5201  r = molstr + 10;
5202  }
5203  c = *r;
5204  *r = '\0';
5205  if (q == r)
5206  q = (char*)"???";
5207  } else
5208  q = (char*)"???";
5209 
5210  same = true;
5211  if (ibp->moltype == "genomic DNA") {
5212  biomol = Seq_descr_GIBB_mol_genomic;
5213  bioseq.SetInst().SetMol(CSeq_inst::eMol_dna);
5214 
5215  if (pp->source == Parser::ESource::EMBL) {
5216  if (NStr::CompareNocase(q, "DNA") != 0 &&
5217  NStr::CompareNocase(ibp->moltype, q) != 0)
5218  same = false;
5219  } else if (NStr::CompareNocase(q, "DNA") != 0)
5220  same = false;
5221  } else if (ibp->moltype == "genomic RNA") {
5222  biomol = Seq_descr_GIBB_mol_genomic;
5223  bioseq.SetInst().SetMol(CSeq_inst::eMol_rna);
5224 
5225  if (pp->source == Parser::ESource::EMBL) {
5226  if (NStr::CompareNocase(q, "RNA") != 0 && NStr::CompareNocase(ibp->moltype, q) != 0)
5227  same = false;
5228  } else if (NStr::CompareNocase(q, "RNA") != 0)
5229  same = false;
5230  } else if (ibp->moltype == "mRNA") {
5231  biomol = Seq_descr_GIBB_mol_mRNA;
5232  bioseq.SetInst().SetMol(CSeq_inst::eMol_rna);
5233 
5234  if (pp->source == Parser::ESource::EMBL) {
5235  if (NStr::CompareNocase(q, "RNA") != 0 && NStr::CompareNocase(ibp->moltype, q) != 0)
5236  same = false;
5237  } else if (NStr::CompareNocase(q, "mRNA") != 0)
5238  same = false;
5239  } else if (ibp->moltype == "tRNA") {
5240  biomol = Seq_descr_GIBB_mol_tRNA;
5241  bioseq.SetInst().SetMol(CSeq_inst::eMol_rna);
5242 
5243  if (pp->source == Parser::ESource::EMBL) {
5244  if (NStr::CompareNocase(q, "RNA") != 0 && NStr::CompareNocase(ibp->moltype, q) != 0)
5245  same = false;
5246  } else if (NStr::CompareNocase(q, "tRNA") != 0)
5247  same = false;
5248  } else if (ibp->moltype == "rRNA") {
5249  biomol = Seq_descr_GIBB_mol_rRNA;
5250  bioseq.SetInst().SetMol(CSeq_inst::eMol_rna);
5251 
5252  if (pp->source == Parser::ESource::EMBL) {
5253  if (NStr::CompareNocase(q, "RNA") != 0 && NStr::CompareNocase(ibp->moltype, q) != 0)
5254  same = false;
5255  } else if (NStr::CompareNocase(q, "rRNA") != 0)
5256  same = false;
5257  } else if (ibp->moltype == "snoRNA") {
5258  biomol = Seq_descr_GIBB_mol_snoRNA;
5259  bioseq.SetInst().SetMol(CSeq_inst::eMol_rna);
5260 
5261  if (pp->source == Parser::ESource::EMBL) {
5262  if (NStr::CompareNocase(q, "RNA") != 0 && NStr::CompareNocase(ibp->moltype, q) != 0)
5263  same = false;
5264  } else if (NStr::CompareNocase(q, "snoRNA") != 0)
5265  same = false;
5266  } else if (ibp->moltype == "snRNA") {
5267  biomol =