NCBI C++ ToolKit
loadfeat.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: loadfeat.cpp 102112 2024-04-02 18:07:29Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * File Name: loadfeat.cpp
27  *
28  * Author: Karl Sirotkin, Hsiu-Chuan Chen
29  *
30  * File Description:
31  * Parse features block to subblock.
32  * Process each subblock.
33  * Output each subblock.
34  * Free out subblock.
35  */
36 
37 #include <ncbi_pch.hpp>
38 
39 #include "ftacpp.hpp"
40 
43 #include <objmgr/bioseq_handle.hpp>
44 #include <objmgr/scope.hpp>
57 #include <objects/pub/Pub_set.hpp>
58 #include <objects/pub/Pub.hpp>
59 #include <serial/objostr.hpp>
62 #include <objects/seq/Pubdesc.hpp>
65 #include <objects/seq/MolInfo.hpp>
66 #include <objects/seq/Seq_inst.hpp>
67 #include <objects/seq/Seq_ext.hpp>
70 
71 #include "index.h"
72 #include "embl.h"
73 #include "genbank.h"
74 #include "qual_parse.hpp"
75 
78 
79 #include "ftaerr.hpp"
80 #include "indx_blk.h"
81 #include "asci_blk.h"
82 #include "utilfeat.h"
83 #include "loadfeat.h"
84 #include "add.h"
85 #include "fta_src.h"
86 #include "buf_data_loader.h"
87 #include "utilfun.h"
88 #include "ref.h"
89 #include "xgbfeat.h"
90 #include "xgbparint.h"
91 #include "fta_xml.h"
92 
93 #ifdef THIS_FILE
94 # undef THIS_FILE
95 #endif
96 #define THIS_FILE "loadfeat.cpp"
97 
100 
/* Aliases that spell the Seq_descr_GIBB_mol_* names in terms of the
 * CMolInfo::eBiomol_* enumerators.
 * Note that Seq_descr_GIBB_mol_uRNA and Seq_descr_GIBB_mol_snRNA both
 * expand to CMolInfo::eBiomol_snRNA.
 */
101 #define Seq_descr_GIBB_mol_unknown CMolInfo::eBiomol_unknown
102 #define Seq_descr_GIBB_mol_genomic CMolInfo::eBiomol_genomic
103 #define Seq_descr_GIBB_mol_preRNA CMolInfo::eBiomol_pre_RNA
104 #define Seq_descr_GIBB_mol_mRNA CMolInfo::eBiomol_mRNA
105 #define Seq_descr_GIBB_mol_rRNA CMolInfo::eBiomol_rRNA
106 #define Seq_descr_GIBB_mol_tRNA CMolInfo::eBiomol_tRNA
107 #define Seq_descr_GIBB_mol_uRNA CMolInfo::eBiomol_snRNA
108 #define Seq_descr_GIBB_mol_snRNA CMolInfo::eBiomol_snRNA
109 #define Seq_descr_GIBB_mol_scRNA CMolInfo::eBiomol_scRNA
110 #define Seq_descr_GIBB_mol_other_genetic CMolInfo::eBiomol_other_genetic
111 #define Seq_descr_GIBB_mol_cRNA CMolInfo::eBiomol_cRNA
112 #define Seq_descr_GIBB_mol_snoRNA CMolInfo::eBiomol_snoRNA
113 #define Seq_descr_GIBB_mol_trRNA CMolInfo::eBiomol_transcribed_RNA
114 #define Seq_descr_GIBB_mol_other CMolInfo::eBiomol_other
115 
116 struct TrnaAa {
117  const char* name;
119 };
120 
/* Pairs a C string with an integer value.
 * NOTE(review): the lookup tables further down that end with
 * { nullptr, -1 } look like arrays of this type - confirm.
 */
121 struct StrNum {
122  const char* str; /* text key */
123  int num; /* integer associated with the key */
124 };
125 
126 const TrnaAa taa[] = {
127  { "alanine", 'A' },
128  { "arginine", 'R' },
129  { "asparagine", 'N' },
130  { "aspartic acid", 'D' },
131  { "aspartate", 'D' },
132  { "cysteine", 'C' },
133  { "glutamine", 'Q' },
134  { "glutamic acid", 'E' },
135  { "glutamate", 'E' },
136  { "glycine", 'G' },
137  { "histidine", 'H' },
138  { "isoleucine", 'I' },
139  { "leucine", 'L' },
140  { "lysine", 'K' },
141  { "methionine", 'M' },
142  { "phenylalanine", 'F' },
143  { "proline", 'P' },
144  { "selenocysteine", 'U' },
145  { "serine", 'S' },
146  { "threonine", 'T' },
147  { "tryptophan", 'W' },
148  { "tyrosine", 'Y' },
149  { "valine", 'V' },
150  { nullptr, '\0' }
151 };
152 
153 struct AaCodons {
154  const char* straa;
157  Int4 vals[8];
158 };
159 
/* Amino acid / codon table, terminated by the row whose name is nullptr.
 *
 * Each row holds: the amino acid name, its one-letter symbol, an integer,
 * and up to eight codon indices padded with -1.
 * Judging by the per-row comments the codon index is
 * 16 * b1 + 4 * b2 + b3 with T = 0, C = 1, A = 2, G = 3
 * (TTT = 0, TTC = 1, TTA = 2, ... GGG = 63).
 *
 * NOTE(review): the integer is presumably a genetic code id, with 0 acting
 * as the default row for that amino acid - confirm against the AaCodons
 * declaration and the lookup code.
 */
160 const AaCodons aacodons[] = {
161  { "Ala", 'A', 0, { 52, 53, 54, 55, -1, -1, -1, -1 } }, /* GCT, GCC, GCA, GCG */
162  { "Arg", 'R', 2, { 28, 29, 30, 31, -1, -1, -1, -1 } }, /* CGT, CGC, CGA, CGG */
163  { "Arg", 'R', 5, { 28, 29, 30, 31, -1, -1, -1, -1 } }, /* CGT, CGC, CGA, CGG */
164  { "Arg", 'R', 9, { 28, 29, 30, 31, -1, -1, -1, -1 } }, /* CGT, CGC, CGA, CGG */
165  { "Arg", 'R', 13, { 28, 29, 30, 31, -1, -1, -1, -1 } }, /* CGT, CGC, CGA, CGG */
166  { "Arg", 'R', 14, { 28, 29, 30, 31, -1, -1, -1, -1 } }, /* CGT, CGC, CGA, CGG */
167  { "Arg", 'R', 0, { 28, 29, 30, 31, 46, 47, -1, -1 } }, /* CGT, CGC, CGA, CGG, AGA, AGG */
168  { "Asn", 'N', 9, { 40, 41, 42, -1, -1, -1, -1, -1 } }, /* AAT, AAC, AAA */
169  { "Asn", 'N', 14, { 40, 41, 42, -1, -1, -1, -1, -1 } }, /* AAT, AAC, AAA */
170  { "Asn", 'N', 0, { 40, 41, -1, -1, -1, -1, -1, -1 } }, /* AAT, AAC */
171  { "Asp", 'D', 0, { 56, 57, -1, -1, -1, -1, -1, -1 } }, /* GAT, GAC */
172  { "Asx", 'B', 9, { 40, 41, 42, 56, 57, -1, -1, -1 } }, /* Asn + Asp */
173  { "Asx", 'B', 14, { 40, 41, 42, 56, 57, -1, -1, -1 } }, /* Asn + Asp */
174  { "Asx", 'B', 0, { 40, 41, 56, 57, -1, -1, -1, -1 } }, /* Asn + Asp */
175  { "Cys", 'C', 10, { 12, 13, 14, -1, -1, -1, -1, -1 } }, /* TGT, TGC, TGA */
176  { "Cys", 'C', 0, { 12, 13, -1, -1, -1, -1, -1, -1 } }, /* TGT, TGC */
177  { "Gln", 'Q', 6, { 10, 11, 26, 27, -1, -1, -1, -1 } }, /* TAA, TAG, CAA, CAG */
178  { "Gln", 'Q', 15, { 11, 26, 27, -1, -1, -1, -1, -1 } }, /* TAG, CAA, CAG */
179  { "Gln", 'Q', 0, { 26, 27, -1, -1, -1, -1, -1, -1 } }, /* CAA, CAG */
180  { "Glu", 'E', 0, { 58, 59, -1, -1, -1, -1, -1, -1 } }, /* GAA, GAG */
181  { "Glx", 'Z', 6, { 10, 11, 26, 27, 58, 59, -1, -1 } }, /* Gln + Glu */
182  { "Glx", 'Z', 0, { 11, 26, 27, 58, 59, -1, -1, -1 } }, /* Gln + Glu. NOTE(review): third field is 0 here and on the next row; the matching "Gln" row (TAG, CAA, CAG) uses 15, so this is presumably meant to be 15 - confirm */
183  { "Glx", 'Z', 0, { 26, 27, 58, 59, -1, -1, -1, -1 } }, /* Gln + Glu */
184  { "Gly", 'G', 13, { 46, 47, 60, 61, 62, 63, -1, -1 } }, /* AGA, AGG, GGT, GGC, GGA, GGG */
185  { "Gly", 'G', 0, { 60, 61, 62, 63, -1, -1, -1, -1 } }, /* GGT, GGC, GGA, GGG */
186  { "His", 'H', 0, { 24, 25, -1, -1, -1, -1, -1, -1 } }, /* CAT, CAC */
187  { "Ile", 'I', 2, { 32, 33, -1, -1, -1, -1, -1, -1 } }, /* ATT, ATC */
188  { "Ile", 'I', 3, { 32, 33, -1, -1, -1, -1, -1, -1 } }, /* ATT, ATC */
189  { "Ile", 'I', 5, { 32, 33, -1, -1, -1, -1, -1, -1 } }, /* ATT, ATC */
190  { "Ile", 'I', 13, { 32, 33, -1, -1, -1, -1, -1, -1 } }, /* ATT, ATC */
191  { "Ile", 'I', 0, { 32, 33, 34, -1, -1, -1, -1, -1 } }, /* ATT, ATC, ATA */
192  { "Leu", 'L', 3, { 2, 3, -1, -1, -1, -1, -1, -1 } }, /* TTA, TTG */
193  { "Leu", 'L', 12, { 2, 3, 16, 17, 18, -1, -1, -1 } }, /* TTA, TTG, CTT, CTC, CTA */
194  { "Leu", 'L', 0, { 2, 3, 16, 17, 18, 19, -1, -1 } }, /* TTA, TTG, CTT, CTC, CTA, CTG */
195  { "Lys", 'K', 9, { 43, -1, -1, -1, -1, -1, -1, -1 } }, /* AAG */
196  { "Lys", 'K', 14, { 43, -1, -1, -1, -1, -1, -1, -1 } }, /* AAG */
197  { "Lys", 'K', 0, { 42, 43, -1, -1, -1, -1, -1, -1 } }, /* AAA, AAG */
198  { "Met", 'M', 2, { 34, 35, -1, -1, -1, -1, -1, -1 } }, /* ATA, ATG */
199  { "Met", 'M', 3, { 34, 35, -1, -1, -1, -1, -1, -1 } }, /* ATA, ATG */
200  { "Met", 'M', 5, { 34, 35, -1, -1, -1, -1, -1, -1 } }, /* ATA, ATG */
201  { "Met", 'M', 13, { 34, 35, -1, -1, -1, -1, -1, -1 } }, /* ATA, ATG */
202  { "Met", 'M', 0, { 35, -1, -1, -1, -1, -1, -1, -1 } }, /* ATG */
203  { "fMet", 'M', 2, { 34, 35, -1, -1, -1, -1, -1, -1 } }, /* ATA, ATG */
204  { "fMet", 'M', 3, { 34, 35, -1, -1, -1, -1, -1, -1 } }, /* ATA, ATG */
205  { "fMet", 'M', 5, { 34, 35, -1, -1, -1, -1, -1, -1 } }, /* ATA, ATG */
206  { "fMet", 'M', 13, { 34, 35, -1, -1, -1, -1, -1, -1 } }, /* ATA, ATG */
207  { "fMet", 'M', 0, { 35, -1, -1, -1, -1, -1, -1, -1 } }, /* ATG */
208  { "Phe", 'F', 0, { 0, 1, -1, -1, -1, -1, -1, -1 } }, /* TTT, TTC */
209  { "Pro", 'P', 0, { 20, 21, 22, 23, -1, -1, -1, -1 } }, /* CCT, CCC, CCA, CCG */
210  { "Sec", 'U', 0, { -1, -1, -1, -1, -1, -1, -1, -1 } },
211  { "Ser", 'S', 5, { 4, 5, 6, 7, 44, 45, 46, 47 } }, /* TCT, TCC, TCA, TCG, AGT, AGC, AGA, AGG */
212  { "Ser", 'S', 9, { 4, 5, 6, 7, 44, 45, 46, 47 } }, /* TCT, TCC, TCA, TCG, AGT, AGC, AGA, AGG */
213  { "Ser", 'S', 12, { 4, 5, 6, 7, 19, 44, 45, -1 } }, /* TCT, TCC, TCA, TCG, CTG, AGT, AGC */
214  { "Ser", 'S', 14, { 4, 5, 6, 7, 44, 45, 46, 47 } }, /* TCT, TCC, TCA, TCG, AGT, AGC, AGA, AGG */
215  { "Ser", 'S', 0, { 4, 5, 6, 7, 44, 45, -1, -1 } }, /* TCT, TCC, TCA, TCG, AGT, AGC */
216  { "Thr", 'T', 3, { 16, 17, 18, 19, 36, 37, 38, 39 } }, /* CTT, CTC, CTA, CTG, ACT, ACC, ACA, ACG */
217  { "Thr", 'T', 0, { 36, 37, 38, 39, -1, -1, -1, -1 } }, /* ACT, ACC, ACA, ACG */
218  { "Trp", 'W', 1, { 15, -1, -1, -1, -1, -1, -1, -1 } }, /* TGG */
219  { "Trp", 'W', 6, { 15, -1, -1, -1, -1, -1, -1, -1 } }, /* TGG */
220  { "Trp", 'W', 10, { 15, -1, -1, -1, -1, -1, -1, -1 } }, /* TGG */
221  { "Trp", 'W', 11, { 15, -1, -1, -1, -1, -1, -1, -1 } }, /* TGG */
222  { "Trp", 'W', 12, { 15, -1, -1, -1, -1, -1, -1, -1 } }, /* TGG */
223  { "Trp", 'W', 15, { 15, -1, -1, -1, -1, -1, -1, -1 } }, /* TGG */
224  { "Trp", 'W', 0, { 14, 15, -1, -1, -1, -1, -1, -1 } }, /* TGA, TGG */
225  { "Tyr", 'Y', 14, { 8, 9, 10, -1, -1, -1, -1, -1 } }, /* TAT, TAC, TAA */
226  { "Tyr", 'Y', 0, { 8, 9, -1, -1, -1, -1, -1, -1 } }, /* TAT, TAC */
227  { "Val", 'V', 0, { 48, 49, 50, 51, -1, -1, -1, -1 } }, /* GTT, GTC, GTA, GTG */
228  { "TERM", '*', 1, { 10, 11, 14, -1, -1, -1, -1, -1 } }, /* TAA, TAG, TGA */
229  { "TERM", '*', 2, { 10, 11, 46, 47, -1, -1, -1, -1 } }, /* TAA, TAG, AGA, AGG */
230  { "TERM", '*', 6, { 14, -1, -1, -1, -1, -1, -1, -1 } }, /* TGA */
231  { "TERM", '*', 11, { 10, 11, 14, -1, -1, -1, -1, -1 } }, /* TAA, TAG, TGA */
232  { "TERM", '*', 12, { 10, 11, 14, -1, -1, -1, -1, -1 } }, /* TAA, TAG, TGA */
233  { "TERM", '*', 14, { 11, -1, -1, -1, -1, -1, -1, -1 } }, /* TAG */
234  { "TERM", '*', 15, { 10, 14, -1, -1, -1, -1, -1, -1 } }, /* TAA, TGA */
235  { "TERM", '*', 0, { 10, 11, -1, -1, -1, -1, -1, -1 } }, /* TAA, TAG */
236  { "OTHER", 'X', 0, { -1, -1, -1, -1, -1, -1, -1, -1 } },
237  { nullptr, '\0', 0, { -1, -1, -1, -1, -1, -1, -1, -1 } }
238 };
239 
/* Upper-case spellings of "transfer RNA" / "tRNA", including several
 * misspelled variants ("TRASNFER RNA", "TRANSDER RNA", "TRANFER RNA", ...).
 * The list is nullptr-terminated.
 * NOTE(review): longer variants are listed before their shorter prefixes,
 * so the order may matter to the code that scans this table - confirm
 * before reordering.
 */
240 static const char* trna_tags[] = {
241  "TRANSFERN RNA",
242  "TRANSFER RRNA",
243  "TRANSFER TRNA",
244  "TRANSFER RNA",
245  "TRASNFER RNA",
246  "TRANSDER RNA",
247  "TRANSFERRNA",
248  "TRANFER RNA",
249  "T RNA",
250  "TRNA",
251  nullptr
252 };
253 
254 const char* ParFlat_ESTmod[] = {
255  "EST",
256  "expressed sequence tag",
257  "partial cDNA sequence",
258  "transcribed sequence fragment",
259  "TSR",
260  "putatively transcribed partial sequence",
261  "UK putts",
262  "Plastid",
263  nullptr
264 };
265 
266 static const char* ParFlat_RNA_array[] = {
267  "precursor_RNA",
268  "mRNA",
269  "tRNA",
270  "rRNA",
271  "snRNA",
272  "scRNA",
273  "snoRNA",
274  "ncRNA",
275  "tmRNA",
276  "misc_RNA",
277  nullptr
278 };
279 
280 static const char* DbxrefTagAny[] = {
281  "ASAP",
282  "CDD",
283  "DBEST",
284  "DBSTS",
285  "GDB",
286  "HMP",
287  "MAIZEGDB",
288  nullptr
289 };
290 
/* /db_xref database names that are reported as obsolete (nullptr-terminated).
 * The /db_xref conversion code in this file matches a name against this
 * list with MatchArrayIString(), posts ERR_FEATURE_ObsoleteDbXref and then
 * substitutes a replacement name: BHB and BIOHEALTHBASE become "IRD",
 * GENEW becomes "HGNC", IFO becomes "NBRC", SWISS-PROT becomes
 * "UniProt/Swiss-Prot", and the remaining entries become "UniProt/TrEMBL".
 * Any name added here therefore needs a matching replacement branch there,
 * otherwise it falls into the "UniProt/TrEMBL" default.
 */
291 static const char* DbxrefObsolete[] = {
292  "BHB",
293  "BIOHEALTHBASE",
294  "GENEW",
295  "IFO",
296  "SWISS-PROT",
297  "SPTREMBL",
298  "TREMBL",
299  nullptr
300 };
301 
302 static const char* EMBLDbxrefTagStr[] = {
303  "BIOMUTA",
304  "DEPOD",
305  "ENSEMBLGENOMES-GN",
306  "ENSEMBLGENOMES-TR",
307  "ESTHER",
308  "GENEVISIBLE",
309  "MOONPROT",
310  "PROTEOMES",
311  "UNITE",
312  "WBPARASITE",
313  nullptr
314 };
315 
316 static const char* DbxrefTagStr[] = {
317  "ACEVIEW/WORMGENES",
318  "APHIDBASE",
319  "APIDB",
320  "ARAPORT",
321  "BEEBASE",
322  "BEETLEBASE",
323  "BGD",
324  "BOLD",
325  "CGD",
326  "COLLECTF",
327  "DBSNP",
328  "DICTYBASE",
329  "ECOCYC",
330  "ECOGENE",
331  "ENSEMBL",
332  "ENSEMBLGENOMES",
333  "ERIC",
334  "FANTOM_DB",
335  "FLYBASE",
336  "GABI",
337  "GENEDB",
338  "GOA",
339  "H-INVDB",
340  "HGNC",
341  "HOMD",
342  "HSSP",
343  "I5KNAL",
344  "IMGT/GENE-DB",
345  "IMGT/HLA",
346  "IMGT/LIGM",
347  "INTERPRO",
348  "IRD",
349  "ISD",
350  "ISFINDER",
351  "ISHAM-ITS",
352  "JGIDB",
353  "MARPOLBASE",
354  "MEDGEN",
355  "MGI",
356  "MIRBASE",
357  "NEXTDB",
358  "NIAEST",
359  "NMPDR",
360  "NRESTDB",
361  "OSA1",
362  "PATHEMA",
363  "PDB",
364  "PFAM",
365  "PGN",
366  "PHYTOZOME",
367  "PIR",
368  "POMBASE",
369  "PSEUDO",
370  "PSEUDOCAP",
371  "RAP-DB",
372  "REMTREMBL",
373  "RFAM",
374  "RICEGENES",
375  "RZPD",
376  "SEED",
377  "SGD",
378  "SGN",
379  "SPTREMBL",
380  "SRPDB",
381  "SUBTILIST",
382  "SWISS-PROT",
383  "TAIR",
384  "TIGRFAM",
385  "TREMBL",
386  "TUBERCULIST",
387  "UNIPROT/SWISS-PROT",
388  "UNIPROT/TREMBL",
389  "UNIPROTKB/SWISS-PROT",
390  "UNIPROTKB/TREMBL",
391  "UNITE",
392  "VBASE2",
393  "VECTORBASE",
394  "VGNC",
395  "VIPR",
396  "VISTA",
397  "WORFDB",
398  "WORMBASE",
399  "XENBASE",
400  "ZFIN",
401  nullptr
402 };
403 
404 static const char* DbxrefTagInt[] = {
405  "ATCC",
406  "ATCC(DNA)",
407  "ATCC(IN HOST)",
408  "BDGP_EST",
409  "BDGP_INS",
410  "ESTLIB",
411  "GENEID",
412  "GI",
413  "GO",
414  "GREENGENES",
415  "INTREPIDBIO",
416  "JCM",
417  "LOCUSID",
418  "MIM",
419  "MYCOBANK",
420  "NBRC",
421  "PBMICE",
422  "RATMAP",
423  "RGD",
424  "UNILIB",
425  "UNISTS",
426  nullptr
427 };
428 
/* Qualifier names, nullptr-terminated.
 * NOTE(review): judging by the name and the two "Fake" remarks below, these
 * are presumably the qualifiers that may legitimately appear without a
 * value, plus two entries listed only so that their empty form is
 * detected - confirm against the code that consults this table.
 */
429 static const char* EmptyQuals[] = {
430  "artificial_location", /* Fake. Put here to catch
431  the case when it is empty */
432  "chloroplast",
433  "chromoplast",
434  "cyanelle",
435  "environmental_sample",
436  "focus",
437  "germline",
438  "kinetoplast",
439  "macronuclear",
440  "metagenomic",
441  "mitochondrion",
442  "mobile_element_type", /* Fake. Put here to catch
443  the case when it is empty */
444  "partial",
445  "proviral",
446  "pseudo",
447  "rearranged",
448  "ribosomal_slippage",
449  "trans_splicing",
450  "transgenic",
451  "virion",
452  nullptr
453 };
454 
455 const char* TransSplicingFeats[] = {
456  "3'UTR",
457  "5'UTR",
458  "CDS",
459  "gene",
460  "mRNA",
461  "misc_RNA",
462  "precursor_RNA",
463  "tRNA",
464  nullptr
465 };
466 
467 const char* ncRNA_class_values[] = {
468  "antisense_RNA",
469  "autocatalytically_spliced_intron",
470  "hammerhead_ribozyme",
471  "lncRNA",
472  "RNase_P_RNA",
473  "RNase_MRP_RNA",
474  "telomerase_RNA",
475  "guide_RNA",
476  "rasiRNA",
477  "ribozyme",
478  "scRNA",
479  "siRNA",
480  "miRNA",
481  "piRNA",
482  "pre_miRNA",
483  "snoRNA",
484  "snRNA",
485  "SRP_RNA",
486  "vault_RNA",
487  "Y_RNA",
488  "other",
489  nullptr
490 };
491 
492 const char* SatelliteValues[] = {
493  "satellite",
494  "minisatellite",
495  "microsatellite",
496  nullptr
497 };
498 
499 const char* PseudoGeneValues[] = {
500  "allelic",
501  "processed",
502  "unitary",
503  "unknown",
504  "unprocessed",
505  nullptr
506 };
507 
508 const char* RegulatoryClassValues[] = {
509  "attenuator",
510  "CAAT_signal",
511  "DNase_I_hypersensitive_site",
512  "enhancer",
513  "enhancer_blocking_element",
514  "GC_signal",
515  "imprinting_control_region",
516  "insulator",
517  "locus_control_region",
518  "matrix_attachment_region",
519  "minus_35_signal",
520  "minus_10_signal",
521  "response_element",
522  "polyA_signal_sequence",
523  "promoter",
524  "recoding_stimulatory_region",
525  "replication_regulatory_region",
526  "ribosome_binding_site",
527  "riboswitch",
528  "silencer",
529  "TATA_box",
530  "terminator",
531  "transcriptional_cis_regulatory_region",
532  "other",
533  nullptr
534 };
535 
536 // clang-format off
538  { "between scaffolds", CSeq_gap::eType_contig },
539  { "within scaffold", CSeq_gap::eType_scaffold },
540  { "telomere", CSeq_gap::eType_telomere },
541  { "centromere", CSeq_gap::eType_centromere },
542  { "short arm", CSeq_gap::eType_short_arm },
543  { "heterochromatin", CSeq_gap::eType_heterochromatin },
544  { "repeat within scaffold", CSeq_gap::eType_repeat },
545  { "repeat between scaffolds", CSeq_gap::eType_repeat },
546  { "unknown", CSeq_gap::eType_unknown },
547  { nullptr, -1 }
548 };
549 
551  { "paired-ends", CLinkage_evidence::eType_paired_ends },
552  { "align genus", CLinkage_evidence::eType_align_genus },
553  { "align xgenus", CLinkage_evidence::eType_align_xgenus },
554  { "align trnscpt", CLinkage_evidence::eType_align_trnscpt },
555  { "within clone", CLinkage_evidence::eType_within_clone },
556  { "clone contig", CLinkage_evidence::eType_clone_contig },
557  { "map", CLinkage_evidence::eType_map },
558  { "strobe", CLinkage_evidence::eType_strobe },
559  { "unspecified", CLinkage_evidence::eType_unspecified },
560  { "pcr", CLinkage_evidence::eType_pcr },
561  { "proximity ligation", CLinkage_evidence::eType_proximity_ligation },
562  { nullptr, -1 }
563 };
564 // clang-format on
565 
567 {
568  if (key) {
569  MemFree(key);
570  key = nullptr;
571  }
572  if (location) {
573  MemFree(location);
574  location = nullptr;
575  }
576 }
577 
578 extern Int2 SpFeatKeyNameValid(const Char* keystr);
580 
581 /**********************************************************/
583 {
584  DataBlkPtr dbpnext;
585  FeatBlkPtr fbp;
586 
587  for (; dbp; dbp = dbpnext) {
588  dbpnext = dbp->mpNext;
589  fbp = static_cast<FeatBlk*>(dbp->mpData);
590  if (fbp) {
591  delete fbp;
592  dbp->mpData = nullptr;
593  }
595  dbp->SimpleDelete();
596  }
597 }
598 
/**********************************************************
 *
 *   static void DelCharBtwData(value):
 *
 *      Squeezes every blank (' ') out of "value" in place.
 *   The remaining characters keep their relative order and
 *   the shortened string is re-terminated with '\0'.
 *   Only the space character is removed; tabs and other
 *   whitespace are left alone.
 *
 **********************************************************/
static void DelCharBtwData(char* value)
{
    char*       out = value; /* next position to write a kept character */
    const char* in  = value; /* next character to examine */

    while (*in != '\0') {
        const char ch = *in++;
        if (ch == ' ')
            continue;
        *out++ = ch;
    }
    *out = '\0';
}
615 
616 /**********************************************************
617  *
618  * static Int4 flat2asn_range_func(pp, sip):
619  *
620  * For error handle in gbparint.c routines.
621  * This function has to return the length corresponding
622  * to the SeqId it is passed.
623  *
624  * ks 1/13/94
625  *
626  **********************************************************/
627 static Int4 flat2asn_range_func(void* pp_ptr, const CSeq_id& id)
628 {
629  ParserPtr pp = reinterpret_cast<ParserPtr>(pp_ptr);
630 
631  int use_indx = pp->curindx;
632  char* acnum;
633 
634  Int2 vernum;
635 
636 #ifdef BIOSEQ_FIND_METHOD
637 
638  bsp = BioseqFind(sip);
639  if (bsp)
640  return (bsp->length);
641 
642  // could try ID0 server
643  //
644  return (-1);
645 
646 #else
647 
648  const CTextseq_id* text_id = nullptr;
649  if (id.IsGenbank() || id.IsEmbl() || id.IsDdbj() || id.IsTpg() ||
650  id.IsTpe() || id.IsTpd())
651  text_id = id.GetTextseq_Id();
652 
653  if (text_id) {
654  Int2 text_id_ver = text_id->IsSetVersion() ? text_id->GetVersion() : numeric_limits<short>::min();
655  const string& text_id_acc = text_id->GetAccession();
656  for (use_indx = 0; use_indx < pp->indx; use_indx++) {
657  acnum = pp->entrylist[use_indx]->acnum;
658  vernum = pp->entrylist[use_indx]->vernum;
659  if (text_id_acc == acnum &&
660  (pp->accver == false || vernum == text_id_ver))
661  break;
662  }
663 
664  if (use_indx >= pp->indx) {
665  // entry is not present in this file use remote fetch function
666  // use_indx = pp->curindx;
667  //
668  size_t len = (! pp->ffdb) ? -1 : CheckOutsideEntry(pp, text_id_acc.c_str(), text_id_ver);
669  if (len != static_cast<size_t>(-1))
670  return static_cast<Int4>(len);
671 
672  if (! pp->buf) {
673  if (pp->farseq)
674  return -1;
675 
676  if (pp->accver == false || text_id_ver < 0) {
677  Nlm_ErrSetContext("validatr", __FILE__, __LINE__);
678  Nlm_ErrPostEx(SEV_WARNING, ERR_LOCATION_FailedCheck, "Location points to outside entry %s", text_id_acc.c_str());
679  } else {
680  Nlm_ErrSetContext("validatr", __FILE__, __LINE__);
681  Nlm_ErrPostEx(SEV_WARNING, ERR_LOCATION_FailedCheck, "Location points to outside entry %s.%d", text_id_acc.c_str(), text_id_ver);
682  }
683  return (-1);
684  }
685 
686  if (*pp->buf == '\0')
687  return (-1);
688 
690  ErrPostEx(SEV_WARNING, ERR_LOCATION_NCBIRefersToExternalRecord, "Feature location references an interval on another record : %s", pp->buf);
691  else
692  ErrPostEx(SEV_WARNING, ERR_LOCATION_RefersToExternalRecord, "Feature location references an interval on another record : %s", pp->buf);
693  MemFree(pp->buf);
694  pp->buf = StringNew(0);
695  *pp->buf = '\0';
696  return (-1);
697  }
698  }
699  return static_cast<Int4>(pp->entrylist[use_indx]->bases);
700 
701 #endif
702 }
703 
704 /**********************************************************/
705 static bool CheckForeignLoc(const CSeq_loc& loc, const CSeq_id& sid)
706 {
707  const CSeq_id& pid = *loc.GetId();
708 
709  if (loc.IsMix() || loc.IsEquiv() ||
710  sid.Compare(pid) == CSeq_id::e_YES)
711  return false;
712 
713  return true;
714 }
715 
716 /**********************************************************/
718 {
720 
721  if (! qual.IsSetQual() ||
722  qual.GetQual() != "db_xref")
723  return tag;
724 
725  if (! qual.IsSetVal() || qual.GetVal().empty()) {
726  ErrPostEx(SEV_WARNING, ERR_QUALIFIER_EmptyQual, "Found empty /db_xref qualifier. Qualifier dropped.");
727  return tag;
728  }
729 
730  const string& val = qual.GetVal();
731  if (NStr::CompareNocase(val.c_str(), "taxon") == 0)
732  return tag;
733 
734  string line = val;
735 
736  if (StringEquNI(line.c_str(), "MGD:MGI:", 8))
737  line = line.substr(4);
738 
739  size_t colon = line.find(':');
740  if (colon == string::npos) {
741  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_DbxrefIncorrect, "Badly formatted /db_xref qualifier: \"%s\". Qualifier dropped.", val.c_str());
742  return tag;
743  }
744 
745  string tail = line.substr(colon + 1);
746  line = line.substr(0, colon);
747 
748  if (MatchArrayIString(DbxrefObsolete, line.c_str()) > -1) {
749  ErrPostEx(SEV_WARNING, ERR_FEATURE_ObsoleteDbXref, "/db_xref type \"%s\" is obsolete.", line.c_str());
750 
751  string buf;
752  if (NStr::CompareNocase(line.c_str(), "BHB") == 0)
753  buf = "IRD";
754  else if (NStr::CompareNocase(line.c_str(), "BioHealthBase") == 0)
755  buf = "IRD";
756  else if (NStr::CompareNocase(line.c_str(), "GENEW") == 0)
757  buf = "HGNC";
758  else if (NStr::CompareNocase(line.c_str(), "IFO") == 0)
759  buf = "NBRC";
760  else if (NStr::CompareNocase(line.c_str(), "SWISS-PROT") == 0)
761  buf = "UniProt/Swiss-Prot";
762  else
763  buf = "UniProt/TrEMBL";
764 
765  line = buf;
766  }
767 
768  if (NStr::CompareNocase(line.c_str(), "UNIPROT/SWISS-PROT") == 0 ||
769  NStr::CompareNocase(line.c_str(), "UNIPROT/TREMBL") == 0) {
770  string buf("UniProtKB");
771  buf += line.substr(7);
772 
773  line = buf;
774  }
775 
776  const Char* strid = nullptr;
777  Int4 intid = 0;
778 
779  const Char* p = tail.c_str();
780  if (MatchArrayIString(DbxrefTagAny, line.c_str()) > -1) {
781  for (strid = p; *p >= '0' && *p <= '9';)
782  p++;
783  if (*p == '\0' && *strid != '0') {
784  intid = atoi(strid);
785  strid = nullptr;
786  }
787  } else if (MatchArrayIString(DbxrefTagStr, line.c_str()) > -1 ||
789  MatchArrayIString(EMBLDbxrefTagStr, line.c_str()) > -1)) {
790  for (strid = p; *p >= '0' && *p <= '9';)
791  p++;
792  if (*p == '\0') {
793  ErrPostEx(SEV_WARNING, ERR_QUALIFIER_DbxrefWrongType, "/db_xref qualifier \"%s\" is supposed to be a string, but its value consists of digits only.", val.c_str());
794  if (*strid != '0') {
795  intid = atoi(strid);
796  strid = nullptr;
797  }
798  }
799  } else if (MatchArrayIString(DbxrefTagInt, line.c_str()) > -1) {
800  const Char* q = p;
801  for (; *q == '0';)
802  q++;
803  if (*q == '\0') {
804  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_DbxrefShouldBeNumeric, "/db_xref qual should have numeric value greater than 0: \"%s\". Qualifier dropped.", val.c_str());
805  return tag;
806  }
807 
808  const Char* r = q;
809  for (; *r >= '0' && *r <= '9';)
810  r++;
811  if (*r != '\0') {
812  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_DbxrefWrongType, "/db_xref qualifier \"%s\" is supposed to be a numeric identifier, but its value includes alphabetic characters. Qualifier dropped.", val.c_str());
813  return tag;
814  }
815  if (*r != '\0' || q != p)
816  strid = p;
817  else if (NStr::CompareNocase(line.c_str(), "IntrepidBio") == 0 && fta_number_is_huge(q))
818  strid = q;
819  else
820  intid = atoi(q);
821  } else if (NStr::CompareNocase(line.c_str(), "PID") == 0) {
822  if (*p != 'e' && *p != 'g' && *p != 'd') {
823  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_DbxrefIncorrect, "Badly formatted /db_xref qual \"PID\": \"%s\". Qualifier dropped.", val.c_str());
824  return tag;
825  }
826 
827  const Char* q = p + 1;
828  for (; *q == '0';)
829  q++;
830 
831  const Char* r;
832  for (r = q; *r >= '0' && *r <= '9';)
833  r++;
834  if (*q == '\0' || *r != '\0') {
835  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_DbxrefShouldBeNumeric, "/db_xref qual \"PID\" should contain numeric value greater than 0: \"%s\". Qualifier dropped.", val.c_str());
836  return tag;
837  }
838  strid = p;
839  } else {
840  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_DbxrefUnknownDBName, "Unknown data base name /db_xref = \"%s\". Qualifier dropped.", val.c_str());
841  return tag;
842  }
843 
844 
845  tag.Reset(new CDbtag);
846 
847  tag->SetDb(line);
848 
849  if (strid)
850  tag->SetTag().SetStr(strid);
851  else
852  tag->SetTag().SetId(intid);
853 
854  return tag;
855 }
856 
857 /**********************************************************
858  *
859  * Function:
860  * static void FilterDb_xref(pSeqFeat, source)
861  *
862  * Purpose:
863  * Scans the SeqFeat's qualifiers for those whose qual
864  * field is "db_xref", converts each one into a Dbtag
865  * and removes it from the SeqFeat's qualifier list
866  * (even when the conversion fails). Successfully built
867  * Dbtags are appended to the SeqFeat's dbxref list.
868  *
869  * Parameters:
870  * pSeqFeat - pointer to a SeqFeat for processing
871  *
872  * Return:
873  * None.
874  *
875  **********************************************************/
877 {
878  if (! feat.IsSetQual())
879  return;
880 
881  CSeq_feat::TDbxref& db_refs = feat.SetDbxref();
882 
883  for (CSeq_feat::TQual::iterator qual = feat.SetQual().begin(); qual != feat.SetQual().end();) {
884  if (! (*qual)->IsSetQual() || (*qual)->GetQual() != "db_xref") {
885  /* Just skip this qualifier, it isn't db_xref
886  */
887  ++qual;
888  continue;
889  }
890 
891  /* Current qualifier is db_xref, process it
892  */
893  CRef<CDbtag> dbtag = DbxrefQualToDbtag(*(*qual), source);
894  if (dbtag.NotEmpty()) {
895  db_refs.push_back(dbtag);
896  }
897 
898  /* Remove converted qualifier from chain of qualifiers
899  */
900  qual = feat.SetQual().erase(qual);
901  }
902 
903  if (feat.GetQual().empty())
904  feat.ResetQual();
905 
906  if (db_refs.empty())
907  feat.ResetDbxref();
908 }
909 
910 bool GetSeqLocation(CSeq_feat& feat, char* location, TSeqIdList& ids, bool* hard_err, ParserPtr pp, const char* name)
911 {
912  bool locmap = true;
913  int num_errs;
914 
915  *hard_err = false;
916  num_errs = 0;
917 
918  CRef<CSeq_loc> loc = xgbparseint_ver(location, locmap, num_errs, ids, pp->accver);
919 
920  if (loc.NotEmpty()) {
921  TSeqLocList locs;
922  locs.push_back(loc);
923  fta_fix_seq_loc_id(locs, pp, location, name, false);
924 
925  feat.SetLocation(*loc);
926  }
927 
928  if (num_errs > 0) {
929  feat.ResetLocation();
930  CSeq_loc& cur_loc = feat.SetLocation();
931  cur_loc.SetWhole(*(*ids.begin()));
932  *hard_err = true;
933  } else if (! feat.GetLocation().IsEmpty()) {
934  if (feat.GetLocation().IsMix()) {
935  if (feat.GetLocation().GetMix().Get().size() == 1) {
936  CRef<CSeq_loc> cur_loc(new CSeq_loc);
937 
938  cur_loc->Assign(*feat.GetLocation().GetMix().GetFirstLoc());
939  if (cur_loc->IsInt())
940  feat.SetLocation(*cur_loc);
941  }
942  }
943  }
944 
945  return locmap;
946 }
947 
948 /**********************************************************
949  *
950  * static char* CheckLocStr(str):
951  *
952  * Nlm_gbparseint routine does not parse certain types
953  * of interval correctly, so this routine will save input
954  * form in fbp before passing it:
955  * (bases 100 to 300) ==> 100 to 300;
956  * (bases 1 to 100; 200 to 300) no change.
957  *
958  * 5-20-93
959  *
960  **********************************************************/
961 static char* CheckLocStr(const Char* str)
962 {
963  const Char* ptr;
964  const Char* eptr;
965  char* location;
966 
967  ptr = StringChr(str, ';');
968  if (ptr)
969  return StringSave(str);
970 
971  for (ptr = str; *ptr != ' ' && *ptr != '\0';)
972  ptr++;
973  while (*ptr == ' ')
974  ptr++;
975 
976  eptr = StringChr(str, ')');
977  if (! eptr)
978  return nullptr;
979 
980  while (*eptr == ' ' || *eptr == ')')
981  --eptr;
982 
983  location = StringSave(string(ptr, eptr + 1));
984  return (location);
985 }
986 
987 /*****************************************************************************
988  *
989  * bool SeqIntCheckCpp(loc) is instead of C-toolkit 'bool SeqIntCheck(sip)'
990  * checks that a seq interval is valid
991  *
992  *****************************************************************************/
993 static bool SeqIntCheckCpp(const CSeq_loc& loc)
994 {
996 
997  CBioseq_Handle bio_h = GetScope().GetBioseqHandle(*loc.GetId());
998  if (bio_h.CanGetInst() && bio_h.CanGetInst_Length())
999  len = bio_h.GetBioseqLength();
1000 
1001  return loc.GetInt().GetFrom() <= loc.GetInt().GetTo() && loc.GetInt().GetTo() < len;
1002 }
1003 
1004 /*****************************************************************************
1005  *
1006  * bool SeqPntCheckCpp(loc) is instead of C-toolkit 'Boolean SeqPntCheck(SeqPntPtr spp)'
1007  * checks that a seq point is valid
1008  *
1009  *****************************************************************************/
1010 static bool SeqPntCheckCpp(const CSeq_loc& loc)
1011 {
1013 
1014  CBioseq_Handle bio_h = GetScope().GetBioseqHandle(*loc.GetId());
1015  if (bio_h.CanGetInst() && bio_h.CanGetInst_Length())
1016  len = bio_h.GetBioseqLength();
1017 
1018  return loc.GetPnt().GetPoint() < len;
1019 }
1020 
1021 /*****************************************************************************
1022  *
1023  * bool PackSeqPntCheck(loc) is instead of C-toolkit 'Boolean PackSeqPntCheck (pspp)'
1024  *
1025  *****************************************************************************/
1026 static bool PackSeqPntCheckCpp(const CSeq_loc& loc)
1027 {
1029 
1030  CBioseq_Handle bio_h = GetScope().GetBioseqHandle(*loc.GetId());
1031  if (bio_h.CanGetInst() && bio_h.CanGetInst_Length())
1032  len = bio_h.GetBioseqLength();
1033 
1034  for (TSeqPos point : loc.GetPacked_pnt().GetPoints()) {
1035  if (point >= len)
1036  return false;
1037  }
1038 
1039  return true;
1040 }
1041 
1042 /**********************************************************/
1043 /* returns : 2 = Ok, 1 = mixed strands, 0 = error in location
1044  */
// Walks every range of `locs` and validates it with the per-type checker.
// Returns 0 as soon as one range fails its check, 1 when all ranges pass but
// ranges whose id carries `accession` do not all report the same strand, and
// 2 otherwise. When `accession` is null the strand comparison is skipped.
1045 static Uint1 FTASeqLocCheck(const CSeq_loc& locs, char* accession)
1046 {
 // 99 is the "no strand recorded yet" marker tested in the comparison below.
1047  Uint1 strand = 99;
1048  Uint1 retval = 2;
1049 
1050  CSeq_loc_CI ci(locs);
1051 
1052  bool good = true;
1053  for (; ci; ++ci) {
1054  CConstRef<CSeq_loc> cur_loc = ci.GetRangeAsSeq_loc();
1055 
1056  const CSeq_id* cur_id = nullptr;
1057 
 // cur_id is only filled in for location types handled here and only when
 // the corresponding check succeeded.
1058  switch (cur_loc->Which()) {
1059  case CSeq_loc::e_Int:
1060  good = SeqIntCheckCpp(*cur_loc);
1061  if (good)
1062  cur_id = cur_loc->GetId();
1063  break;
1064 
1065  case CSeq_loc::e_Pnt:
1066  good = SeqPntCheckCpp(*cur_loc);
1067  if (good)
1068  cur_id = cur_loc->GetId();
1069  break;
1070 
 // NOTE(review): a case label appears to be missing from this listing here
 // (listing line 1071 is absent); as shown, the three statements below
 // directly follow the `break` above -- confirm against the original file.
1072  good = PackSeqPntCheckCpp(*cur_loc);
1073  if (good)
1074  cur_id = cur_loc->GetId();
1075  break;
1076 
1077  case CSeq_loc::e_Bond:
1078  if (! cur_loc->GetBond().CanGetA())
1079  good = false;
1080 
1081  if (good)
1082  cur_id = cur_loc->GetId();
1083  break;
1084 
1085  case CSeq_loc::e_Empty:
1086  case CSeq_loc::e_Whole:
1087  cur_id = cur_loc->GetId();
1088  break;
1089 
 // Any other location type is neither checked nor used for the strand test.
1090  default:
1091  continue;
1092  }
1093 
1094  if (! good)
1095  break;
1096 
1097  if (! accession || ! cur_id)
1098  continue;
1099 
 // Only the id types listed here take part in the accession comparison.
1100  if (! cur_id->IsGenbank() && ! cur_id->IsEmbl() && ! cur_id->IsPir() &&
1101  ! cur_id->IsSwissprot() && ! cur_id->IsOther() && ! cur_id->IsDdbj() &&
1102  ! cur_id->IsPrf() && ! cur_id->IsTpg() && ! cur_id->IsTpe() &&
1103  ! cur_id->IsTpd() && ! cur_id->IsGpipe())
1104  continue;
1105 
1106  const CTextseq_id* text_id = cur_id->GetTextseq_Id();
1107 
1108  if (! text_id || ! text_id->CanGetAccession())
1109  continue;
1110 
 // The first matching range records its strand; a later matching range
 // with a different strand downgrades the result to 1 (mixed strands).
1111  if (text_id->GetAccession() == accession) {
1112  if (strand == 99)
1113  strand = cur_loc->GetStrand();
1114  else if (strand != cur_loc->GetStrand())
1115  retval = 1;
1116  }
1117  }
1118 
1119  if (! good)
1120  return 0;
1121 
1122  return retval;
1123 }
1124 
1125 /**********************************************************/
1126 static void fta_strip_aa(char* str)
1127 {
1128  if (! str || *str == '\0')
1129  return;
1130 
1131  while (str) {
1132  str = StringStr(str, "aa");
1133  if (str)
1134  fta_StringCpy(str, str + 2);
1135  }
1136 }
1137 
1138 /**********************************************************
1139  *
1140  * static SeqFeatPtr SeqFeatPub(pp, entry, hsfp, seq_id,
1141  * col_data, ibp):
1142  *
1143  * 5-26-93
1144  *
1145  **********************************************************/
// Builds one "pub" Seq-feat per ParFlat_REF_BTW reference block of `entry` and
// appends it to `feats`.
// For each block the publication comes from DescrRefs(), the location text is
// extracted in a format-specific way (XML, GenBank, EMBL) and turned into the
// feature location by GetSeqLocation(), and the result is validated with
// FTASeqLocCheck().
// If a location is missing, empty or cannot be parsed, the loop stops, the
// entry is marked as dropped (ibp->drop) and `feats` is cleared.
1146 static void SeqFeatPub(ParserPtr pp, const DataBlk& entry, TSeqFeatList& feats, TSeqIdList& seqids, Int4 col_data, IndexblkPtr ibp)
1147 {
1148  DataBlkPtr dbp;
1149  DataBlkPtr subdbp;
1150  char* p;
1151  char* q;
1152  char* location = nullptr;
1153 
1154  bool err = false;
1155  Uint1 i;
1156 
1157  /* REFERENCE, to Seq-feat
1158  */
1159  if (pp->format == Parser::EFormat::XML)
1160  dbp = XMLBuildRefDataBlk(entry.mOffset, ibp->xip, ParFlat_REF_BTW);
1161  else
1162  dbp = TrackNodeType(entry, ParFlat_REF_BTW);
1163  if (! dbp)
1164  return;
1165 
1166 
1167  for (; dbp; dbp = dbp->mpNext) {
1168  if (dbp->mType != ParFlat_REF_BTW)
1169  continue;
1170 
1171  CRef<CPubdesc> pubdesc = DescrRefs(pp, dbp, col_data);
1172  if (pubdesc.Empty())
1173  continue;
1174 
1175  CRef<CSeq_feat> feat(new CSeq_feat);
1176  feat->SetData().SetPub(*pubdesc);
1177 
1178  location = nullptr;
1179  if (pp->format == Parser::EFormat::XML) {
 // XML: prefer the explicit position tag; otherwise fall back to the text
 // that follows the first '(' in the reference tag.
1180  location = XMLFindTagValue(dbp->mOffset, static_cast<XmlIndex*>(dbp->mpData), INSDREFERENCE_POSITION);
1181  if (! location) {
1182  q = XMLFindTagValue(dbp->mOffset, static_cast<XmlIndex*>(dbp->mpData), INSDREFERENCE_REFERENCE);
1183  if (q) {
1184  for (p = q; *p != '\0' && *p != '(';)
1185  p++;
1186  if (*p != '\0')
1187  location = CheckLocStr(p + 1);
1188  MemFree(q);
1189  }
1190  } else {
 // A position value containing ';' is wrapped into "join(...)".
1191  p = StringChr(location, ';');
1192  if (p) {
1193  string s("join(");
1194  s.append(location);
1195  s.append(")");
1196  MemFree(location);
1197  location = StringSave(s);
1198  }
1199  }
1200  } else if (pp->format == Parser::EFormat::GenBank) {
 // GenBank: the location text starts at the first '(' found at or after
 // column `col_data` and runs to the end of the block.
1201  for (p = dbp->mOffset + col_data; *p != '\0' && *p != '(';)
1202  p++;
1203  location = CheckLocStr(string(p, dbp->mOffset + dbp->len - p).c_str());
1204  } else if (pp->format == Parser::EFormat::EMBL) {
 // EMBL: use the first ParFlat_RP sub-block, starting at its first digit;
 // a value containing ',' is wrapped into "join(...)".
1205  subdbp = static_cast<DataBlk*>(dbp->mpData);
1206  for (; subdbp; subdbp = subdbp->mpNext) {
1207  if (subdbp->mType != ParFlat_RP)
1208  continue;
1209 
1210  for (p = subdbp->mOffset; *p != '\0' && isdigit(*p) == 0;)
1211  p++;
1212  if (StringChr(p, ',')) {
1213  string s = "join(";
1214  s += p;
1215  s += ")";
1216  location = StringSave(s);
1217  } else
1218  location = StringSave(p);
1219  break;
1220  }
1221  }
1222  if (! location || *location == '\0') {
1223  ErrPostEx(SEV_REJECT, ERR_REFERENCE_UnparsableLocation, "NULL or empty reference location. Entry dropped.");
1224  err = true;
1225  if (location)
1226  MemFree(location);
1227  break;
1228  }
1229 
 // NOTE(review): the statement governed by this `if` is not visible in this
 // listing (listing line 1231 is absent); as shown, the `if (pp->buf)` below
 // becomes its body -- confirm against the original file.
1230  if (ibp->is_prot)
1232 
1233  if (pp->buf)
1234  MemFree(pp->buf);
1235  pp->buf = nullptr;
1236 
1237  GetSeqLocation(*feat, location, seqids, &err, pp, "pub");
1238 
1239  if (err) {
1240  ErrPostEx(SEV_REJECT, ERR_REFERENCE_UnparsableLocation, "Unparsable reference location. Entry dropped.");
1241  MemFree(location);
1242  break;
1243  }
1244 
 // 0 = error in location, 1 = mixed strands, 2 = ok (see FTASeqLocCheck).
1245  i = FTASeqLocCheck(feat->GetLocation(), ibp->acnum);
1246 
1247  if (i == 0) {
 // NOTE(review): listing line 1248 is absent here -- confirm whether a
 // statement was lost. As shown, a feature with a bad location is kept
 // only when pp->debug is set.
1249  if (pp->debug) {
1250  feats.push_back(feat);
1251  }
1252  } else {
1253  if (i == 1) {
1254  ErrPostEx(SEV_WARNING, ERR_LOCATION_MixedStrand, "Mixed strands in SeqLoc: %s", location);
1255  }
1256  feats.push_back(feat);
1257  }
1258  if (location)
1259  MemFree(location);
1260  }
1261 
1262  if (! err)
1263  return;
1264 
 // Error path: mark the entry as dropped and discard everything collected.
1265  ibp->drop = true;
1266  feats.clear();
1267 }
1268 
1269 /**********************************************************
1270  *
1271  * static SeqFeatPtr ImpFeatPub(pp, entry, hsfp, seq_id,
1272  * col_data, ibp):
1273  *
1274  * 5-26-93
1275  *
1276  **********************************************************/
1277 static void ImpFeatPub(ParserPtr pp, const DataBlk& entry, TSeqFeatList& feats, CSeq_id& seq_id, Int4 col_data, IndexblkPtr ibp)
1278 {
1279  DataBlkPtr dbp;
1280 
1281  bool first;
1282 
1283  /* REFERENCE, Imp-feat
1284  */
1285  if (pp->format == Parser::EFormat::XML)
1286  dbp = XMLBuildRefDataBlk(entry.mOffset, ibp->xip, ParFlat_REF_SITES);
1287  else
1288  dbp = TrackNodeType(entry, ParFlat_REF_SITES);
1289  if (! dbp)
1290  return;
1291 
1292  CRef<CSeq_feat> feat;
1293  for (first = true; dbp; dbp = dbp->mpNext) {
1294  if (dbp->mType != ParFlat_REF_SITES)
1295  continue;
1296 
1297  CRef<CPubdesc> pubdesc = DescrRefs(pp, dbp, col_data);
1298  if (pubdesc.Empty() || ! pubdesc->IsSetPub())
1299  continue;
1300 
1301  if (first) {
1302  feat.Reset(new CSeq_feat);
1303 
1304  CImp_feat& imp_feat = feat->SetData().SetImp();
1305  imp_feat.SetKey("Site-ref");
1306  imp_feat.SetLoc("sites");
1307 
1308  feat->SetLocation(*fta_get_seqloc_int_whole(seq_id, ibp->bases));
1309  first = false;
1310  }
1311 
1312  CRef<CPub> pub(new CPub);
1313  pub->SetEquiv(pubdesc->SetPub());
1314 
1315  feat->SetCit().SetPub().push_back(pub);
1316 
1317  if (pubdesc->IsSetComment())
1318  feat->SetComment(pubdesc->GetComment());
1319  else
1320  feat->ResetComment();
1321  }
1322 
1323  if (! first && feat.NotEmpty())
1324  feats.push_back(feat);
1325 }
1326 
1327 /**********************************************************/
// Handler with an empty body: both string arguments are ignored.
// Presumably meant to be installed as the location parser's error callback so
// that its messages are suppressed -- the installing call is not visible in
// this listing, confirm at the call site.
1328 static void fta_fake_gbparse_err_handler(const Char*, const Char*)
1329 {
1330 }
1331 
1332 /**********************************************************/
1334 {
 // NOTE(review): the signature of this function is not visible in this listing
 // (listing line 1333 is absent); callers below use it as
 // location_to_string_or_unknown(<Seq-loc>) and store the result in a string.
 // Returns the text produced by location_to_string() for `loc`, or the literal
 // "unknown location" when that text is empty.
1335  auto ret = location_to_string(loc);
1336  if (! ret.empty())
1337  return ret;
1338 
1339  return "unknown location";
1340 }
1341 
1342 /**********************************************************/
// Extracts the "pos:" element of an /anticodon qualifier value and parses it
// into a Seq-loc.
//   feat   - the tRNA feature; only its location is read (for messages and
//            for the containment test).
//   qval   - the qualifier value text; may be null.
//   seqids, accver - passed through to xgbparseint_ver().
// Returns an empty CRef when `qval` is null, has no "pos:" element, cannot be
// parsed, or when the parsed range does not lie inside the total range of the
// feature location. A length different from 3 is only reported, not rejected.
1343 static CRef<CSeq_loc> GetTrnaAnticodon(const CSeq_feat& feat, char* qval, const TSeqIdList& seqids, bool accver)
1344 {
1345  char* loc_str;
1346  char* p;
1347  char* q;
1348  bool fake1;
1349  Int4 range;
1350  Int4 pars;
1351  int fake3;
1352 
1353  CRef<CSeq_loc> ret;
1354 
1355  if (! qval)
1356  return ret;
1357 
1358  p = StringStr(qval, "pos:");
1359  if (! p)
1360  return ret;
1361 
 // Skip the blanks that follow "pos:".
1362  for (q = p + 4; *q == ' ';)
1363  q++;
1364 
 // Find the end of the position text: a ',' outside parentheses, or the
 // character after the ')' that brings the parenthesis depth back to zero.
1365  for (pars = 0, p = q; *p != '\0'; p++) {
1366  if (*p == ',' && pars == 0)
1367  break;
1368  if (*p == '(')
1369  pars++;
1370  else if (*p == ')') {
1371  pars--;
1372  if (pars == 0) {
1373  p++;
1374  break;
1375  }
1376  }
1377  }
1378 
1379  loc_str = StringSave(string_view(q, p - q));

 // NOTE(review): listing lines 1380-1381 and 1383 are absent around this
 // call -- confirm whether statements were lost on either side of it.
1382  ret = xgbparseint_ver(loc_str, fake1, fake3, seqids, accver);

1385  if (ret.Empty()) {
1386  string loc = location_to_string_or_unknown(feat.GetLocation());
1387 
1388  ErrPostEx(SEV_ERROR, ERR_FEATURE_InvalidAnticodonPos, "Invalid position element for an /anticodon qualifier : \"%s\" : qualifier dropped : feature location \"%s\".", loc_str, (loc.empty()) ? "unknown" : loc.c_str());
1389 
1390  MemFree(loc_str);
1391 
1392  return ret;
1393  }
1394 
 // An anticodon is expected to span 3 bases; other sizes produce a warning
 // (4 bases) or an error (anything else) but the location is still used.
1395  range = sequence::GetLength(*ret, &GetScope());
1396  if (range != 3) {
1397  string loc = location_to_string_or_unknown(feat.GetLocation());
1398 
1399  if (range == 4)
1400  ErrPostEx(SEV_WARNING, ERR_FEATURE_FourBaseAntiCodon, "tRNA feature at \"%s\" has anticodon with location spanning four bases: \"%s\". Cannot generate corresponding codon value from the DNA sequence.", loc.empty() ? "unknown" : loc.c_str(), loc_str);
1401  else
1402  ErrPostEx(SEV_ERROR, ERR_FEATURE_StrangeAntiCodonSize, "tRNA feature at \"%s\" has anticodon of an unusual size: \"%s\". Cannot generate corresponding codon value from the DNA sequence.", loc.empty() ? "unknown" : loc.c_str(), loc_str);
1403  }
1404 
1405  // Comparing two locations ignoring their IDs
1406  // Anticodon should be inside the original location (may be the same)
1407  CRange<TSeqPos> anticodon_range = ret->GetTotalRange();
1408  CRange<TSeqPos> xrange = feat.GetLocation().GetTotalRange().IntersectionWith(anticodon_range);
1409 
 // The intersection equals the anticodon range only when the anticodon lies
 // entirely within the total range of the feature.
1410  if (xrange != anticodon_range) {
1411  string loc = location_to_string_or_unknown(feat.GetLocation());
1412 
1413  ErrPostEx(SEV_ERROR, ERR_FEATURE_BadAnticodonLoc, "Anticodon location \"%s\" does not fall within tRNA feature at \"%s\".", loc_str, loc.empty() ? "unknown" : loc.c_str());
1414 
1415  MemFree(loc_str);
1416  ret.Reset();
1417  return ret;
1418  }
1419 
1420  MemFree(loc_str);
1421  return ret;
1422 }
1423 
1424 /**********************************************************/
// Derives the name of an rRNA feature and stores it in rna_ref.SetExt().SetName().
// The name comes from the "product" qualifier or, when that is absent, from a
// short feature comment ending in "S ribosomal RNA" / "S rRNA". The text is
// then rewritten through a fixed sequence of in-place edits (see the comments
// on each loop) and truncated to 511 characters.
// Nothing is stored when no candidate text is found.
1425 static void fta_parse_rrna_feat(CSeq_feat& feat, CRNA_ref& rna_ref)
1426 {
1427  char* qval;
1428  char* p;
1429  char* q;
1430 
1431  auto qval2 = GetTheQualValue(feat.SetQual(), "product");
1432  if (feat.GetQual().empty())
1433  feat.ResetQual();
1434 
1435  string qval_str;
1436  if (qval2) {
1437  qval_str = *qval2;
1438  qval2.reset();
1439  }
1440 
 // Fallback: take the whole comment as the name when it is shorter than 20
 // characters and ends with one of the two suffixes; the comment is then
 // removed from the feature. (StringEquNI is presumably a case-insensitive
 // prefix comparison -- confirm.)
1441  size_t len = 0;
1442  if (qval_str.empty() && feat.IsSetComment() && rna_ref.GetType() == CRNA_ref::eType_rRNA) {
1443  string comment = feat.GetComment();
1444  len = comment.size();
1445 
1446  if (len > 15 && len < 20) {
1447  if (StringEquNI(comment.c_str() + len - 15, "S ribosomal RNA", 15)) {
1448  qval_str = comment;
1449  feat.ResetComment();
1450  }
1451  } else if (len > 6 && len < 20) {
1452  if (StringEquNI(comment.c_str() + len - 6, "S rRNA", 6)) {
1453  qval_str = comment;
1454  feat.ResetComment();
1455  }
1456  }
1457  }
1458 
1459  if (qval_str.empty())
1460  return;
1461 
 // Step 1: in every "ribosomal rrna" drop the character at offset 10 (the
 // first 'r' of "rrna"), leaving "ribosomal rna"; continue after the match.
1462  qval = StringSave(qval_str);
1463  for (p = qval; p; p += 13) {
1464  p = StringIStr(p, "ribosomal rrna");
1465  if (! p)
1466  break;
1467  fta_StringCpy(p + 10, p + 11);
1468  }
1469 
 // Step 2: split every "ribosomalrna" into "ribosomal RNA". The buffer is
 // rebuilt on each match, so `len` keeps the offset at which to resume in the
 // new buffer.
1470  for (p = qval; p; p = qval + len) {
1471  p = StringIStr(p, "ribosomalrna");
1472  if (! p)
1473  break;
1474  p[9] = '\0';
1475  string s(qval);
1476  s.append(" RNA");
1477  s.append(p + 12);
1478  len = p - qval + 13;
1479  MemFree(qval);
1480  qval = StringSave(s);
1481  }
1482 
 // Step 3: expand the first " rrna" into " ribosomal RNA".
1483  if (qval) {
1484  p = StringIStr(qval, " rrna");
1485  if (p) {
1486  *p = '\0';
1487  string s(qval);
1488  s.append(" ribosomal RNA");
1489  s.append(p + 5);
1490  MemFree(qval);
1491  qval = StringSave(s);
1492  }
1493  }
1494 
 // Step 4: force the three characters after each "ribosomal " that is
 // followed by "DNA" or "RNA" (any case) to upper-case "RNA".
1495  for (p = qval, q = p; q; q = p + 13) {
1496  p = StringIStr(q, "ribosomal DNA");
1497  if (! p) {
1498  p = StringIStr(q, "ribosomal RNA");
1499  if (! p)
1500  break;
1501  }
1502  p[10] = 'R';
1503  p[11] = 'N';
1504  p[12] = 'A';
1505  }
1506 
 // Step 5: when the text ends with "s ribosomal RNA" preceded by a digit,
 // make that 's' upper-case (e.g. "16s" -> "16S").
1507  p = StringIStr(qval, "s ribosomal RNA");
1508  if (p && p > qval && p[15] == '\0') {
1509  p--;
1510  if (*p >= '0' && *p <= '9')
1511  *++p = 'S';
1512  }
1513 
 // Step 6: append " RNA" after every "ribosomal" that is not at the very
 // start, is followed by a blank or the end of the text, and is not already
 // followed by " RNA".
1514  for (p = qval;;) {
1515  p = StringIStr(p, "ribosomal");
1516  if (! p)
1517  break;
1518  if (p == qval || (p[9] != ' ' && p[9] != '\0')) {
1519  p += 9;
1520  continue;
1521  }
1522  if (StringEquN(p + 9, " RNA", 4)) {
1523  p += 13;
1524  continue;
1525  }
1526  len = p - qval + 14;
1527  p += 9;
1528  string s(qval, p);
1529  s.append(" RNA");
1530  s.append(p);
1531  MemFree(qval);
1532  qval = StringSave(s);
1533  p = qval + len;
1534  }
1535 
 // Step 7: collapse an immediately repeated " ribosomal RNA ribosomal RNA"
 // into a single occurrence.
1536  for (p = qval;;) {
1537  p = StringIStr(p, " ribosomal RNA");
1538  if (! p)
1539  break;
1540  p += 14;
1541  if (StringEquNI(p, " ribosomal RNA", 14))
1542  fta_StringCpy(p, p + 14);
1543  }
1544 
1545  DeleteQual(feat.SetQual(), "product");
1546  if (feat.GetQual().empty())
1547  feat.ResetQual();
1548 
 // Names longer than 511 characters are cut to 511 with '>' as the last
 // character to mark the truncation.
1549  if (StringLen(qval) > 511) {
1550  qval[510] = '>';
1551  qval[511] = '\0';
1552  p = StringSave(qval);
1553  MemFree(qval);
1554  qval = p;
1555  }
1556 
1557  rna_ref.SetExt().SetName(qval);
1558  MemFree(qval);
1559 }
1560 
1561 /**********************************************************/
1563 {
 // NOTE(review): the signature of this function is not visible in this listing
 // (listing line 1562 is absent); a caller below uses it as
 // fta_get_aa_from_symbol(<char>).
 // Looks `ch` up among the `intaa` values of the `aacodons` table (terminated
 // by an entry with a null `straa`). Returns the matching `intaa`, or 0 when
 // the table has no such entry.
1564  const AaCodons* acp;
1565 
1566  for (acp = aacodons; acp->straa; acp++)
1567  if (acp->intaa == ch)
1568  break;
1569  if (acp->straa)
1570  return (acp->intaa);
1571 
1572  return (0);
1573 }
1574 
1575 /**********************************************************/
1577 {
 // NOTE(review): the signature of this function is not visible in this listing
 // (listing line 1576 is absent); callers below pass a C string.
 // Maps an amino-acid name to its code. The `taa` table (terminated by a null
 // `name`) is searched first, then the `straa` column of `aacodons`; both
 // comparisons ignore case. Returns 0 when neither table has the name.
1578  const AaCodons* acp;
1579  const TrnaAa* tap;
1580 
1581  for (tap = taa; tap->name; tap++)
1582  if (NStr::CompareNocase(str, tap->name) == 0)
1583  break;
1584  if (tap->name)
1585  return (tap->aa);
1586 
1587  for (acp = aacodons; acp->straa; acp++)
1588  if (NStr::CompareNocase(acp->straa, str) == 0)
1589  break;
1590  if (acp->straa)
1591  return (acp->intaa);
1592 
1593  return (0);
1594 }
1595 
1596 /**********************************************************/
1597 static int get_aa_from_trna(const CTrna_ext& trna)
1598 {
1599  int ret = 0;
1600  if (trna.IsSetAa() && trna.GetAa().IsNcbieaa())
1601  ret = trna.GetAa().GetNcbieaa();
1602 
1603  return ret;
1604 }
1605 
1606 /**********************************************************/
1607 static CRef<CTrna_ext> fta_get_trna_from_product(CSeq_feat& feat, const string& product, unsigned char* remove)
1608 {
1609  const char** b;
1610 
1611  char* p;
1612  char* q;
1613  char* start;
1614  char* end;
1615  char* first;
1616  char* second;
1617  char* third;
1618  char* fourth;
1619  bool fmet;
1620  char* prod;
1621 
1622  if (remove)
1623  *remove = 0;
1624 
1625  CRef<CTrna_ext> ret(new CTrna_ext);
1626 
1627  if (product.length() < 7)
1628  return ret;
1629 
1630  bool digits = false;
1631  prod = StringSave(product);
1632  for (p = prod; *p != '\0'; p++) {
1633  if (*p >= 'a' && *p <= 'z')
1634  *p &= ~040;
1635  else if ((*p < 'A' || *p > 'Z') && *p != '(' && *p != ')') {
1636  if (*p >= '0' && *p <= '9')
1637  digits = true;
1638  *p = ' ';
1639  }
1640  }
1641  ShrinkSpaces(prod);
1642 
1643  for (b = trna_tags; *b; b++) {
1644  start = StringStr(prod, *b);
1645  if (start)
1646  break;
1647  }
1648  if (! *b) {
1649  MemFree(prod);
1650  return ret;
1651  }
1652 
1653  end = start + StringLen(*b);
1654  for (p = end; *p != '\0'; p++)
1655  if (*p == '(' || *p == ')')
1656  *p = ' ';
1657  ShrinkSpaces(prod);
1658 
1659  if (start == prod && *end == '\0') {
1660  if (remove && ! digits)
1661  *remove = 1;
1662  MemFree(prod);
1663  return ret;
1664  }
1665 
1666  first = nullptr;
1667  second = nullptr;
1668  third = nullptr;
1669  fourth = nullptr;
1670  for (p = end; *p == ' ' || *p == ')' || *p == '(';)
1671  p++;
1672  q = p;
1673  if (StringEquN(p, "F MET", 5))
1674  p += 5;
1675  else if (StringEquN(p, "F MT", 4))
1676  p += 4;
1677  while (*p >= 'A' && *p <= 'Z')
1678  p++;
1679  if (p > q) {
1680  if (*p != '\0')
1681  *p++ = '\0';
1682  second = q;
1683  }
1684  while (*p == ' ' || *p == ')' || *p == '(')
1685  p++;
1686  for (q = p; *p >= 'A' && *p <= 'Z';)
1687  p++;
1688  if (p > q) {
1689  if (*p != '\0')
1690  *p++ = '\0';
1691  if (q[1] == '\0') {
1692  while (*p == ' ' || *p == ')' || *p == '(')
1693  p++;
1694  for (q = p; *p >= 'A' && *p <= 'Z';)
1695  p++;
1696  if (p > q) {
1697  if (*p != '\0')
1698  *p++ = '\0';
1699  third = q;
1700  }
1701  } else
1702  third = q;
1703 
1704  while (*p == ' ' || *p == '(' || *p == ')')
1705  p++;
1706  if (*p != '\0')
1707  fourth = p;
1708  }
1709  if (start > prod) {
1710  for (p = start - 1; *p == ' ' || *p == ')' || *p == '('; p--)
1711  if (p == prod)
1712  break;
1713 
1714  if (p > prod && p[1] == ')') {
1715  for (p--; *p != '('; p--)
1716  if (p == prod)
1717  break;
1718  if (p > prod) {
1719  for (p--; *p == ' ' || *p == '(' || *p == '('; p--)
1720  if (p == prod)
1721  break;
1722  }
1723  }
1724  if (p > prod) {
1725  for (q = p++; *q >= 'A' && *q <= 'Z'; q--)
1726  if (q == prod)
1727  break;
1728  if (*q < 'A' || *q > 'Z')
1729  q++;
1730  if (p > q) {
1731  *p = '\0';
1732  first = q;
1733  }
1734  }
1735  }
1736 
1737  fmet = false;
1738  if (second) {
1739  if (StringEqu(second, "F MET") ||
1740  StringEqu(second, "FMET") ||
1741  StringEqu(second, "F MT")) {
1742  StringCpy(second, "FMET");
1743  fmet = true;
1744  }
1745 
1746  ret->SetAa().SetNcbieaa(fta_get_aa_from_string(second));
1747  if (get_aa_from_trna(*ret) != 0)
1748  second = nullptr;
1749  }
1750 
1751  if (get_aa_from_trna(*ret) == 0 && first) {
1752  ret->SetAa().SetNcbieaa(fta_get_aa_from_string(first));
1753  if (get_aa_from_trna(*ret) != 0 && first == prod)
1754  first = nullptr;
1755  }
1756 
1757  if (! first && ! second && ! third && ! fourth && remove && ! digits)
1758  *remove = 1;
1759  MemFree(prod);
1760 
1761  if (! fmet)
1762  return ret;
1763 
1764  if (! feat.IsSetComment())
1765  feat.SetComment("fMet");
1766  else if (! StringIStr(feat.GetComment().c_str(), "fmet")) {
1767  string& comment = feat.SetComment();
1768  comment += "; fMet";
1769  }
1770 
1771  return ret;
1772 }
1773 
1774 /**********************************************************/
1775 static CRef<CTrna_ext> fta_get_trna_from_comment(const string& comment, unsigned char* remove)
1776 {
1777  char* comm;
1778  char* p;
1779  char* q;
1780 
1781  CRef<CTrna_ext> ret(new CTrna_ext);
1782 
1783  *remove = 0;
1784  if (comment.empty())
1785  return ret;
1786 
1787  comm = StringSave(comment);
1788  for (p = comm; *p != '\0'; p++) {
1789  if (*p >= 'a' && *p <= 'z')
1790  *p &= ~040;
1791  else if (*p < 'A' || *p > 'Z')
1792  *p = ' ';
1793  }
1794  ShrinkSpaces(comm);
1795 
1796  if (StringEquN(comm, "CODON RECOGNIZED ", 17)) {
1797  p = comm + 17;
1798  q = StringChr(p, ' ');
1799  if (q && StringEqu(q + 1, "PUTATIVE"))
1800  *q = '\0';
1801  if (! StringChr(p, ' ') && StringLen(p) == 3) {
1802  MemFree(comm);
1803  *remove = q ? 2 : 1;
1804  return ret;
1805  }
1806  }
1807 
1808  if (StringEquN(comm, "PUTATIVE ", 9) && comm[10] == ' ' &&
1809  comm[14] == ' ' && StringEquN(&comm[15], "TRNA", 4)) {
1810  ret->SetAa().SetNcbieaa(fta_get_aa_from_symbol(comm[9]));
1811  if (get_aa_from_trna(*ret) != 0) {
1812  MemFree(comm);
1813  return ret;
1814  }
1815  }
1816 
1817  for (q = comm, p = q; p;) {
1818  p = StringChr(p, ' ');
1819  if (p)
1820  *p++ = '\0';
1821 
1822  ret->SetAa().SetNcbieaa(fta_get_aa_from_string(q));
1823  if (get_aa_from_trna(*ret) != 0)
1824  break;
1825  q = p;
1826  }
1827 
1828  MemFree(comm);
1829  return ret;
1830 }
1831 
1832 /**********************************************************/
1833 static int get_first_codon_from_trna(const CTrna_ext& trna)
1834 {
1835  int ret = 255;
1836  if (trna.IsSetCodon() && ! trna.GetCodon().empty())
1837  ret = *trna.GetCodon().begin();
1838 
1839  return ret;
1840 }
1841 
1842 /**********************************************************/
// Converts an Imp-feat into an RNA-ref feature.
// The RNA type is the 1-based index of the Imp-feat key in ParFlat_RNA_array,
// or 255 when the key is not in that array. rRNA is delegated to
// fta_parse_rrna_feat(). For the other types the function fills the RNA
// extension from qualifiers and, for tRNA, from the anticodon qualifier, the
// product text and the feature comment.
// Does nothing when `feat` does not hold an Imp-feat.
1843 static void GetRnaRef(CSeq_feat& feat, CBioseq& bioseq, Parser::ESource source, bool accver)
1844 {
1845  optional<string> qval;
1846 
1847  Uint1 remove;
1848 
1849  Int2 type;
1850 
1851  if (! feat.GetData().IsImp())
1852  return;
1853 
1854  const CImp_feat& imp_feat = feat.GetData().GetImp();
1855 
1856  CRef<CRNA_ref> rna_ref(new CRNA_ref);
1857 
1858  type = MatchArrayString(ParFlat_RNA_array, imp_feat.GetKey().c_str());
1859  if (type < 0)
1860  type = 255;
1861  else
1862  ++type;
1863 
1864  rna_ref->SetType(static_cast<CRNA_ref::EType>(type));
1865 
1866  feat.SetData().SetRna(*rna_ref);
1867 
1868  if (type == CRNA_ref::eType_rRNA) {
1869  fta_parse_rrna_feat(feat, *rna_ref);
1870  return;
1871  }
1872 
1873  CRef<CRNA_gen> rna_gen;
1874  CRef<CRNA_qual_set> rna_quals;
1875 
 // ncRNA: /ncRNA_class becomes the RNA-gen class.
 // tmRNA: /tag_peptide becomes an RNA-gen qualifier.
1876  if (type == CRNA_ref::eType_ncRNA) {
1877  auto p = GetTheQualValue(feat.SetQual(), "ncRNA_class");
1878  if (p) {
1879  rna_gen.Reset(new CRNA_gen);
1880  rna_gen->SetClass(*p);
1881  }
1882  } else if (type == CRNA_ref::eType_tmRNA) {
1883  auto p = GetTheQualValue(feat.SetQual(), "tag_peptide");
1884  if (p) {
1885  CRef<CRNA_qual> rna_qual(new CRNA_qual);
1886  rna_qual->SetQual("tag_peptide");
1887  rna_qual->SetVal(*p);
1888 
1889  rna_quals.Reset(new CRNA_qual_set);
1890  rna_quals->Set().push_back(rna_qual);
1891 
1892  rna_gen.Reset(new CRNA_gen);
1893  rna_gen->SetQuals(*rna_quals);
1894  }
1895  }
1896 
1897  if (type != CRNA_ref::eType_premsg && type != CRNA_ref::eType_tRNA) /* mRNA, snRNA, scRNA or other */
1898  {
1899  qval = GetTheQualValue(feat.SetQual(), "product"); // may return newly allocated memory!!!
 // A second "product" value, when present and non-empty, is appended to the
 // feature comment. (Presumably GetTheQualValue removes the qualifier it
 // returns, so this second call sees the next one -- confirm.)
1900  if (qval) {
1901  auto p = GetTheQualValue(feat.SetQual(), "product");
1902  if (p && ! p->empty()) {
1903  if (! feat.IsSetComment())
1904  feat.SetComment(*p);
1905  else {
1906  string& comment = feat.SetComment();
1907  comment += "; ";
1908  comment += *p;
1909  }
1910  }
1911  }
1912 
 // NOTE(review): the rest of this condition is not visible in this listing
 // (listing line 1914 is absent), so the `&&` below is left dangling --
 // confirm the full condition against the original file.
1913  if (! qval && type == CRNA_ref::eType_mRNA &&
1915  qval = GetTheQualValue(feat.SetQual(), "standard_name");
1916 
 // mRNA without a name: use the whole comment as the name when nothing but
 // blanks, tabs, ',' or ';' follows its last " mRNA" / " RNA".
1917  if (! qval && feat.IsSetComment() && type == CRNA_ref::eType_mRNA) {
1918  const Char* c_p = feat.GetComment().c_str();
1919  const Char* c_q = nullptr;
1920  for (;; c_p += 5, c_q = c_p) {
1921  c_p = StringIStr(c_p, " mRNA");
1922  if (! c_p)
1923  break;
1924  }
1925 
1926  const Char* c_r = nullptr;
1927  for (c_p = feat.GetComment().c_str();; c_p += 4, c_r = c_p) {
1928  c_p = StringIStr(c_p, " RNA");
1929  if (! c_p)
1930  break;
1931  }
1932 
1933  if (c_q && c_r) {
1934  c_p = (c_q > c_r) ? c_q : c_r;
1935  } else if (c_q)
1936  c_p = c_q;
1937  else
1938  c_p = c_r;
1939 
1940  if (c_p) {
1941  while (*c_p == ' ' || *c_p == '\t' || *c_p == ',' || *c_p == ';')
1942  ++c_p;
1943 
1944  if (*c_p == '\0') {
1945  qval = feat.GetComment();
1946  feat.ResetComment();
1947  }
1948  }
1949  }
1950 
1951  if (qval) {
 // Names longer than 511 characters are cut to 511 with '>' as the last
 // character.
1952  if (qval->length() > 511) {
1953  qval->resize(511);
1954  qval->back() = '>';
1955  }
1956 
 // NOTE(review): the `if (...) {` that opens the branch closed by the
 // `} else {` below is not visible in this listing (listing line 1957 is
 // absent) -- confirm which RNA types store the name as an RNA-gen product
 // and which as the plain ext name.
1958  if (rna_gen.Empty())
1959  rna_gen.Reset(new CRNA_gen);
1960 
1961  rna_gen->SetProduct(*qval);
1962  } else {
1963  rna_ref->SetExt().SetName(*qval);
1964  }
1965  }
1966  qval.reset();
1967  }
1968 
1969  if (feat.GetQual().empty())
1970  feat.ResetQual();
1971 
1972  if (rna_gen.NotEmpty()) {
1973  rna_ref->SetExt().SetGen(*rna_gen);
1974  }
1975 
1976  if (type != CRNA_ref::eType_tRNA) /* if tRNA and codon value exist */
1977  return;
1978 
 // tRNA only from here on. Three candidate extensions are built:
 //   trnaa - from the /anticodon qualifier,
 //   trnap - from the /product text,
 //   trnac - from the feature comment.
1979  if (qval) {
1980  qval.reset();
1981  }
1982  qval = GetTheQualValue(feat.SetQual(), "anticodon");
1983  CRef<CTrna_ext> trnaa;
1984  if (qval) {
1985  bioseq.SetInst().SetMol(CSeq_inst::eMol_na);
1986 
1987  CRef<CSeq_loc> anticodon = GetTrnaAnticodon(feat, qval->data(), bioseq.GetId(), accver);
1988  if (anticodon.NotEmpty()) {
1989  trnaa.Reset(new CTrna_ext);
1990 
1991  /* value has format: (pos:base_range, aa:amino_acid)
1992  */
1993  trnaa->SetAa().SetNcbieaa(GetQualValueAa(qval->c_str(), true));
1994  trnaa->SetAnticodon(*anticodon);
1995  rna_ref->SetExt().SetTRNA(*trnaa);
1996  }
1997  qval.reset();
1998  }
1999 
2000  string qval2 = CpTheQualValue(feat.SetQual(), "product");
2001 
2002  CRef<CTrna_ext> trnap;
2003  if (! qval2.empty()) {
2004  trnap = fta_get_trna_from_product(feat, qval2, nullptr);
2005  qval2.clear();
2006  }
2007 
2008  if (feat.IsSetComment() && feat.GetComment().empty()) {
2009  feat.ResetComment();
2010  }
2011 
 // `remove` reports whether the comment is fully explained by what was
 // recognised in it (non-zero) and may therefore be dropped.
2012  remove = 0;
2013  CRef<CTrna_ext> trnac;
2014  if (feat.IsSetComment()) {
2015  trnac = fta_get_trna_from_product(feat, feat.GetComment(), &remove);
2016 
2017  if (get_aa_from_trna(*trnac) == 0) {
2018  trnac = fta_get_trna_from_comment(feat.GetComment(), &remove);
2019  }
2020 
2021  if (get_aa_from_trna(*trnac) == 0 && get_first_codon_from_trna(*trnac) == 255) {
2022  trnac.Reset();
2023  }
2024  }
2025 
 // Choose the extension to store: without an anticodon-based one, prefer the
 // product-based one and fall back to the comment-based one.
2026  if (trnaa.Empty()) {
2027  if (trnap.Empty()) {
2028  if (trnac.NotEmpty() && get_aa_from_trna(*trnac) != 0) {
2029  rna_ref->SetExt().SetTRNA(*trnac);
2030  if (remove != 0) {
2031  feat.ResetComment();
2032  }
2033  }
2034  } else {
2035  rna_ref->SetExt().SetTRNA(*trnap);
2036 
2037  if (get_aa_from_trna(*trnap) == 0) {
2038  if (trnac.NotEmpty() && get_aa_from_trna(*trnac) != 0)
2039  rna_ref->SetExt().SetTRNA(*trnac);
2040  } else if (trnac.NotEmpty()) {
2041  if (get_aa_from_trna(*trnac) == 0 && get_first_codon_from_trna(*trnac) != 255 &&
2042  get_first_codon_from_trna(*trnap) == 255 && remove != 0) {
2043  trnap->SetCodon().assign(trnac->GetCodon().begin(), trnac->GetCodon().end());
2044 
2045  feat.ResetComment();
2046  if (remove == 2)
2047  feat.SetComment("putative");
2048  }
2049 
2050  if (get_aa_from_trna(*trnac) == get_aa_from_trna(*trnap) && remove != 0) {
2051  feat.ResetComment();
2052  }
2053  }
2054  }
2055  } else {
 // An anticodon-based extension exists: the product-based one is discarded,
 // and the comment-based one replaces it only when the amino acids agree or
 // the anticodon one is 88 (the character code of 'X').
2056  if (trnap.NotEmpty()) {
2057  trnap.Reset();
2058  }
2059 
2060  if (trnac.NotEmpty() && get_aa_from_trna(*trnac) != 0) {
2061  if (get_aa_from_trna(*trnac) == get_aa_from_trna(*trnaa) || get_aa_from_trna(*trnaa) == 88) {
2062  trnac->SetAnticodon(trnaa->SetAnticodon());
2063  trnaa->ResetAnticodon();
2064 
2065  if (get_first_codon_from_trna(*trnac) == 255) {
2066  trnac->SetCodon().assign(trnaa->GetCodon().begin(), trnaa->GetCodon().end());
2067  }
2068 
2069  rna_ref->SetExt().SetTRNA(*trnac);
2070  if (remove != 0) {
2071  feat.ResetComment();
2072  }
2073  }
2074  }
2075  }
2076 
2077  if (feat.GetQual().empty())
2078  feat.ResetQual();
2079 
 // A tRNA extension with neither an amino acid nor an anticodon carries no
 // information and is removed.
2080  if (rna_ref->IsSetExt() && rna_ref->GetExt().IsTRNA()) {
2081  const CTrna_ext& trna = rna_ref->GetExt().GetTRNA();
2082  if (get_aa_from_trna(trna) == 0 && ! trna.IsSetAnticodon()) {
2083  rna_ref->ResetExt();
2084  }
2085  }
2086 }
2087 
2088 /**********************************************************
2089  *
2090  * static void GetImpFeat(sfp, fbp, locmap):
2091  *
2092  * 'replace' in loc will be changed later
2093  * in SeqEntryToAsn3Ex.
2094  *
2095  * 01/07/97
2096  *
2097  **********************************************************/
2098 static void GetImpFeat(CSeq_feat& feat, FeatBlkPtr fbp, bool locmap)
2099 {
2100  CRef<CImp_feat> imp_feat(new CImp_feat);
2101  imp_feat->SetKey(fbp->key);
2102 
2103  if (locmap)
2104  imp_feat->SetLoc(fbp->location);
2105 
2106  feat.SetData().SetImp(*imp_feat);
2107 }
2108 
2109 /**********************************************************/
2111 {
 // NOTE(review): the signature of this function is not visible in this listing
 // (listing line 2110 is absent); the body operates on an object named `bio`.
 // Reorders three lists of `bio` in place, each with the same pairwise
 // compare-and-swap scheme (every element is compared with all elements after
 // it and swapped when it should come later):
 //   - Org-ref db tags: by db name, then by tag,
 //   - OrgName modifiers: by subtype, then by subname,
 //   - BioSource subtypes: by subtype, then by name.
2112  if (bio.CanGetOrg() && ! bio.GetOrg().GetDb().empty()) {
2113  for (COrg_ref::TDb::iterator db = bio.SetOrg().SetDb().begin(); db != bio.SetOrg().SetDb().end(); ++db) {
2114  if (! (*db)->CanGetDb())
2115  continue;
2116 
2117  COrg_ref::TDb::iterator tdb = db;
2118  for (++tdb; tdb != bio.SetOrg().SetDb().end(); ++tdb) {
2119  if (! (*tdb)->IsSetDb())
2120  continue;
2121 
2122  if ((*db)->GetDb() < (*tdb)->GetDb())
2123  continue;
2124 
 // Same db name: a numeric tag stays before a string tag; tags of the
 // same kind are compared by value.
2125  if ((*db)->GetDb() == (*tdb)->GetDb()) {
2126  const CObject_id& db_id = (*db)->GetTag();
2127  const CObject_id& tdb_id = (*tdb)->GetTag();
2128 
2129  if (! db_id.IsStr() && tdb_id.IsStr())
2130  continue;
2131 
2132  if (db_id.IsStr() && tdb_id.IsStr() &&
2133  db_id.GetStr() <= tdb_id.GetStr())
2134  continue;
2135 
2136  if (! db_id.IsStr() && ! tdb_id.IsStr() &&
2137  db_id.GetId() <= tdb_id.GetId())
2138  continue;
2139  }
2140 
2141  db->Swap(*tdb);
2142  }
2143  }
2144 
2145  if (bio.GetOrg().IsSetOrgname() && bio.GetOrg().GetOrgname().IsSetMod()) {
2146  COrgName::TMod& rmod = bio.SetOrg().SetOrgname().SetMod();
2147  for (COrgName::TMod::iterator mod = rmod.begin(); mod != rmod.end(); ++mod) {
2148  COrgName::TMod::iterator tmod = mod;
2149  for (++tmod; tmod != rmod.end(); ++tmod) {
2150  if ((*mod)->GetSubtype() < (*tmod)->GetSubtype())
2151  continue;
2152 
2153  if ((*mod)->GetSubtype() == (*tmod)->GetSubtype() &&
2154  (*mod)->GetSubname() <= (*tmod)->GetSubname())
2155  continue;
2156 
2157  mod->Swap(*tmod);
2158  }
2159  }
2160  }
2161  }
2162 
 // Note: the OrgName modifiers above are only sorted when the Org-ref has at
 // least one db entry, because that block is nested inside the db check.
2163  if (! bio.IsSetSubtype())
2164  return;
2165 
2166  CBioSource::TSubtype& rsub = bio.SetSubtype();
2167  for (CBioSource::TSubtype::iterator sub = rsub.begin(); sub != rsub.end(); ++sub) {
2168  CBioSource::TSubtype::iterator tsub = sub;
2169  for (++tsub; tsub != rsub.end(); ++tsub) {
2170  if ((*sub)->GetSubtype() < (*tsub)->GetSubtype())
2171  continue;
2172 
2173  if ((*sub)->GetSubtype() == (*tsub)->GetSubtype() &&
2174  (*sub)->GetName() <= (*tsub)->GetName())
2175  continue;
2176 
2177  sub->Swap(*tsub);
2178  }
2179  }
2180 }
2181 
2182 /**********************************************************/
2184 {
 // NOTE(review): the signature of this function is not visible in this listing
 // (listing line 2183 is absent); the body operates on a qualifier reference
 // named `qual`, and a caller below uses it as ConvertQualifierValue(*qual).
 // Replaces every ',' in the qualifier value with ';' and, when at least one
 // replacement was made, stores the new value and posts a warning.
2185  string val = qual->GetVal();
2186  bool has_comma = val.find(',') != string::npos;
2187 
2188  if (has_comma) {
2189  std::replace(val.begin(), val.end(), ',', ';');
2190  qual->SetVal(val);
2191  }
2192 
2193  if (has_comma)
2194  ErrPostEx(SEV_WARNING, ERR_QUALIFIER_MultRptUnitComma, "Converting commas to semi-colons due to format conventions for multiple /rpt_unit qualifiers.");
2195 }
2196 
2197 /**********************************************************/
2199 {
 // NOTE(review): the signature of this function is not visible in this listing
 // (listing line 2198 is absent); the body operates on a feature block `fbp`.
 // Handles /rpt_unit qualifiers of a feature: each one is reported as
 // obsolete, empty ones are deleted, and when more than one non-empty value
 // exists they are combined into a single "(v1,v2,...)" value.
2200  if (! fbp || fbp->quals.empty())
2201  return;
2202 
 // `first` remembers the first non-empty rpt_unit; `len` and `count`
 // accumulate the total value length and the number of such qualifiers.
2203  TQualVector::iterator first = fbp->quals.end();
2204  size_t len = 0, count = 0;
2205 
2206  for (TQualVector::iterator qual = fbp->quals.begin(); qual != fbp->quals.end();) {
2207  if ((*qual)->GetQual() != "rpt_unit") {
2208  ++qual;
2209  continue;
2210  }
2211 
2212  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_ObsoleteRptUnit, "Obsolete /rpt_unit qualifier found on feature \"%s\" at location \"%s\".", fbp->key ? fbp->key : "Unknown", fbp->location ? fbp->location : "unknown");
2213 
2214  if ((*qual)->GetVal().empty()) {
2215  qual = fbp->quals.erase(qual);
2216  continue;
2217  }
2218 
2219  count++;
2220  len += (*qual)->GetVal().size();
2221  if (first == fbp->quals.end())
2222  first = qual;
2223 
2224  if (count == 1) {
2225  ++qual;
2226  continue;
2227  }
2228 
 // NOTE(review): the statement governed by this `if` is not visible in this
 // listing (listing line 2230 is absent); as shown, the call below becomes
 // its body -- confirm against the original file.
2229  if (count == 2)
2231 
2232  ConvertQualifierValue(*qual);
2233  ++qual;
2234  }
2235 
2236  if (count == 0)
2237  return;
2238 
2239  if (count == 1) {
2240  const string& val = (*first)->GetVal();
 // NOTE(review): the body of this `if` is not visible in this listing
 // (listing line 2242 is absent) -- confirm what is done for a single value
 // that is already enclosed in parentheses.
2241  if (*val.begin() == '(' && *val.rbegin() == ')') {
2243  }
2244  return;
2245  }
2246 
2247  string p;
2248  p.reserve(len + count + 1);
2249  p.assign("(");
2250  p.append((*first)->GetVal());
2251 
 // NOTE(review): this loop starts at `first` itself, so as shown the first
 // value is appended a second time and the element `first` refers to is
 // erased before `(*first)->SetVal(p)` below -- confirm against the original
 // file whether the loop is meant to start at the element after `first`.
2252  for (TQualVector::iterator qual = first; qual != fbp->quals.end();) {
2253  if ((*qual)->GetQual() != "rpt_unit") {
2254  ++qual;
2255  continue;
2256  }
2257 
2258  p.append(",");
2259  p.append((*qual)->GetVal());
2260  qual = fbp->quals.erase(qual);
2261  }
2262  p.append(")");
2263  (*first)->SetVal(p);
2264 }
2265 
2266 /**********************************************************/
// Checks the evidence-related qualifiers (/evidence, /experiment, /inference)
// of one feature block.
//
// Walks fbp->quals and tallies:
//   - /experiment and /inference qualifiers: the special
//     "... no additional details recorded" values are counted as "good" and
//     erased from the list, any other value is counted as "bad" and kept;
//   - legacy /evidence qualifiers whose value is "experimental" or
//     "not_experimental" (compared case-insensitively); every /evidence
//     qualifier is erased, and an unrecognized value is reported first.
//
// Returns true when fbp is null or has no qualifiers, and when none of the
// conflicts tested below is found. Returns false after posting a SEV_REJECT
// message for a conflicting combination.
2267 static bool fta_check_evidence(CSeq_feat& feat, FeatBlkPtr fbp)
2268 {
2269  Int4 evi_exp;  // /evidence with value "experimental"
2270  Int4 evi_not;  // /evidence with value "not_experimental"
2271  Int4 exp_good; // /experiment with the special "no additional details" value
2272  Int4 exp_bad;  // /experiment with any other value
2273  Int4 inf_good; // /inference with the special "no additional details" value
2274  Int4 inf_bad;  // /inference with any other value
2275  Char ch;       // character saved while fbp->location is cut short for a message
2276 
2277  if (! fbp || fbp->quals.empty())
2278  return true;
2279 
2280  evi_exp = 0;
2281  evi_not = 0;
2282  exp_good = 0;
2283  exp_bad = 0;
2284  inf_good = 0;
2285  inf_bad = 0;
2286 
      // The iterator is advanced inside the body because some branches erase
      // the current element and continue from the iterator erase() returns.
2287  for (TQualVector::iterator qual = fbp->quals.begin(); qual != fbp->quals.end();) {
2288  const string& qual_str = (*qual)->IsSetQual() ? (*qual)->GetQual() : "";
2289  const string& val_str = (*qual)->IsSetVal() ? (*qual)->GetVal() : "";
2290  if (qual_str == "experiment") {
2291  if (val_str == "experimental evidence, no additional details recorded") {
2292  exp_good++;
2293  qual = fbp->quals.erase(qual);
2294  } else {
2295  exp_bad++;
2296  ++qual;
2297  }
2298  continue;
2299  }
2300 
2301  if (qual_str == "inference") {
2302  if (val_str == "non-experimental evidence, no additional details recorded") {
2303  inf_good++;
2304  qual = fbp->quals.erase(qual);
2305  } else {
2306  inf_bad++;
2307  ++qual;
2308  }
2309  continue;
2310  }
2311 
2312  if (qual_str != "evidence") {
2313  ++qual;
2314  continue;
2315  }
2316 
      // From here on the qualifier is /evidence; it is erased at the end of
      // the iteration whatever its value is.
2317  if (NStr::CompareNocase(val_str.c_str(), "not_experimental") == 0)
2318  evi_not++;
2319  else if (NStr::CompareNocase(val_str.c_str(), "experimental") == 0)
2320  evi_exp++;
2321  else {
      // Cut the location at 50 characters for the message only; the saved
      // character is written back right after the message is posted.
2322  if (fbp->location && StringLen(fbp->location) > 50) {
2323  ch = fbp->location[50];
2324  fbp->location[50] = '\0';
2325  } else
2326  ch = '\0';
2327  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_InvalidEvidence, "Illegal value \"%s\" for /evidence qualifier on the \"%s\" feature at \"%s\". Qualifier dropped.", val_str.empty() ? "Unknown" : val_str.c_str(), fbp->key ? fbp->key : "Unknown", fbp->location ? fbp->location : "unknown location");
2328  if (ch != '\0')
2329  fbp->location[50] = ch;
2330  }
2331 
2332  qual = fbp->quals.erase(qual);
2333  }
2334 
      // Conflict 1: a recognized legacy /evidence value together with any
      // /experiment or /inference qualifier.
2335  if (evi_exp + evi_not > 0 && exp_good + exp_bad + inf_good + inf_bad > 0) {
2336  if (fbp->location && StringLen(fbp->location) > 50) {
2337  ch = fbp->location[50];
2338  fbp->location[50] = '\0';
2339  } else
2340  ch = '\0';
2341  ErrPostEx(SEV_REJECT, ERR_QUALIFIER_Conflict, "Old /evidence and new /experiment or /inference qualifiers both exist on the \"%s\" feature at \"%s\". This is currently unsupported.", fbp->key ? fbp->key : "Unknown", fbp->location ? fbp->location : "unknown location");
2342  if (ch != '\0')
2343  fbp->location[50] = ch;
2344  return false;
2345  }
2346 
      // Conflict 2: an "experimental" indication (legacy or special value)
      // together with a "non-experimental" one.
2347  if (evi_exp + exp_good > 0 && evi_not + inf_good > 0) {
2348  if (fbp->location && StringLen(fbp->location) > 50) {
2349  ch = fbp->location[50];
2350  fbp->location[50] = '\0';
2351  } else
2352  ch = '\0';
2353  ErrPostEx(SEV_REJECT, ERR_QUALIFIER_Conflict, "The special \"no additional details recorded\" values for both /experiment and /inference exist on the \"%s\" feature at \"%s\". This is currently unsupported.", fbp->key ? fbp->key : "Unknown", fbp->location ? fbp->location : "unknown location");
2354  if (ch != '\0')
2355  fbp->location[50] = ch;
2356  return false;
2357  }
2358 
      // Conflict 3: the special value and an ordinary value of the same
      // qualifier kind on one feature.
2359  if ((exp_good > 0 && exp_bad > 0) || (inf_good > 0 && inf_bad > 0)) {
2360  if (fbp->location && StringLen(fbp->location) > 50) {
2361  ch = fbp->location[50];
2362  fbp->location[50] = '\0';
2363  } else
2364  ch = '\0';
2365  ErrPostEx(SEV_REJECT, ERR_QUALIFIER_Conflict, "The special \"no additional details recorded\" value for /experiment or /inference exists in conjunction with other /experiment or /inference qualifiers on the \"%s\" feature at \"%s\". This is currently unsupported.", fbp->key ? fbp->key : "Unknown", fbp->location ? fbp->location : "unknown location");
2366  if (ch != '\0')
2367  fbp->location[50] = ch;
2368  return false;
2369  }
2370 
      // NOTE(review): the statements governed by the two conditions below are
      // not present in this listing (the embedded numbering jumps 2371 -> 2373
      // -> 2375), so what is recorded on `feat` here cannot be confirmed from
      // this view. `feat` is not touched anywhere else in the function.
2371  if (exp_good + evi_exp > 0)
2373  else if (inf_good + evi_not > 0)
2375  return true;
2376 }
2377 
2378 /**********************************************************
2379  *
2380  * static CRef<CSeq_feat> ProcFeatBlk(pp, fbp, seqids):
2381  *
2382  * Process each feature sub-block.
2383  * location, SeqLocPtr by calling Karl's routine,
2384  * Nml_gbparseint which return locmap = TRUE if mapping
2385  * location rules not work, then SeqLocPtr->whole = seqids[0].
2386  * sitesmap = TRUE if found "(sites" string, num_errs > 0
2387  * if any errors occurred.
2388  * If there is an illegal location, then assign
2389  * qualifier to be an Imp-feat.
2390  *
2391  **********************************************************/
// NOTE(review): the function header line is not present in this listing
// (embedded line 2392 is missing). The body below uses `pp`, `fbp` and
// `seqids` and returns a CRef<CSeq_feat>.
2393 {
2394  const char** b;
2395 
2396  char* loc = nullptr;
2397 
2398  bool locmap = false;
2399  bool err = false;
2400 
2401  CRef<CSeq_feat> feat;
2402 
      // Parse the location. While GetSeqLocation runs, pp->buf holds the text
      // "<key> : <location>"; it is freed and reset to null afterwards.
2403  if (fbp->location) {
2404  loc = fbp->location;
2405  DelCharBtwData(loc);
2406  if (pp->buf)
2407  MemFree(pp->buf);
2408  string s(fbp->key);
2409  s.append(" : ");
2410  s.append(loc);
2411  pp->buf = StringSave(s);
2412 
2413  feat.Reset(new CSeq_feat);
2414  locmap = GetSeqLocation(*feat, loc, seqids, &err, pp, fbp->key);
2415 
2416  if (pp->buf)
2417  MemFree(pp->buf);
2418  pp->buf = nullptr;
2419  }
      // A failed location check drops the feature (a null reference is
      // returned) unless pp->debug is set, in which case only a warning is
      // posted and processing continues.
2420  if (err) {
2421  if (pp->debug == false) {
2422  ErrPostEx(SEV_ERROR, ERR_FEATURE_Dropped, "%s|%s| range check detects problems", fbp->key, loc);
2423  feat.Reset();
2424  return feat;
2425  }
2426  ErrPostEx(SEV_WARNING, ERR_LOCATION_FailedCheck, "%s|%s| range check detects problems", fbp->key, loc);
2427  }
2428 
      // NOTE(review): `feat` is only allocated (and `loc` only set) inside the
      // `if (fbp->location)` branch above, yet both are used unconditionally
      // from here on - confirm callers never pass a block without a location.
      // A /partial qualifier is removed from the list and turned into the
      // partial flag.
2429  if (! fbp->quals.empty()) {
2430  if (DeleteQual(fbp->quals, "partial"))
2431  feat->SetPartial(true);
2432  }
2433 
      // A location containing the text "order" also marks the feature partial.
2434  if (StringStr(loc, "order"))
2435  feat->SetPartial(true);
2436 
      // A /pseudo qualifier is removed from the list and turned into the
      // pseudo flag.
2437  if (! fbp->quals.empty()) {
2438  if (DeleteQual(fbp->quals, "pseudo"))
2439  feat->SetPseudo(true);
2440  }
2441 
      // /gsdb_id is deleted without being recorded anywhere.
2442  if (! fbp->quals.empty())
2443  DeleteQual(fbp->quals, "gsdb_id");
2444 
2445  if (! fbp->quals.empty())
2446  fta_parse_rpt_units(fbp);
2447 
      // /trans_splicing is honoured only for feature keys listed in the
      // null-terminated TransSplicingFeats array: the qualifier is removed and
      // "trans-splicing" is set as, or appended to, the exception text.
2448  if (! fbp->quals.empty()) {
2449  for (b = TransSplicingFeats; *b; b++)
2450  if (StringEqu(fbp->key, *b))
2451  break;
2452  if (*b && DeleteQual(fbp->quals, "trans_splicing")) {
2453  feat->SetExcept(true);
2454  if (! feat->IsSetExcept_text())
2455  feat->SetExcept_text("trans-splicing");
2456  else {
2457  string& exc_text = feat->SetExcept_text();
2458  exc_text += ", trans-splicing";
2459  }
2460  }
2461  }
2462 
      // Conflicting evidence qualifiers mark the whole current entry as
      // dropped; the partially built feature is still returned.
2463  if (! fta_check_evidence(*feat, fbp)) {
2464  pp->entrylist[pp->curindx]->drop = true;
2465  return feat;
2466  }
2467 
      // Features other than "gap" that are not yet partial become partial when
      // SeqLocHaveFuzz reports fuzz on the parsed location.
2468  if ((! feat->IsSetPartial() || ! feat->GetPartial()) && ! StringEqu(fbp->key, "gap")) {
2469  if (SeqLocHaveFuzz(feat->GetLocation()))
2470  feat->SetPartial(true);
2471  }
2472 
      // A non-empty /note value becomes the feature comment.
2473  if (! fbp->quals.empty()) {
2474  auto comment = GetTheQualValue(fbp->quals, "note");
2475  if (comment) {
2476  if (! comment->empty()) {
2477  feat->SetComment(*comment);
2478  }
2479  }
2480  }
2481 
2482  /* assume all imp for now
2483  */
      // GetImpFeat is skipped for any key containing the text "source".
2484  if (! StringStr(fbp->key, "source"))
2485  GetImpFeat(*feat, fbp, locmap);
2486 
      // Copy the remaining qualifiers onto the feature.
2487  for (const auto& cur : fbp->quals) {
2488  const string& qual_str = cur->GetQual();
      // /pseudogene sets the pseudo flag and is still copied below.
2489  if (qual_str == "pseudogene")
2490  feat->SetPseudo(true);
2491 
2492  // Do nothing for 'translation' qualifier in case of its value is empty
2493  if (qual_str == "translation" && (! cur->IsSetVal() || cur->GetVal().empty()))
2494  continue;
2495 
2496  if (! qual_str.empty())
2497  feat->SetQual().push_back(cur);
2498  }
2499 
2500  return feat;
2501 }
2502 
2503 /**********************************************************/
2504 static void fta_get_gcode_from_biosource(const CBioSource& bio_src, IndexblkPtr ibp)
2505 {
2506  if (! bio_src.IsSetOrg() || ! bio_src.GetOrg().IsSetOrgname())
2507  return;
2508 
2509  ibp->gc_genomic = bio_src.GetOrg().GetOrgname().IsSetGcode() ? bio_src.GetOrg().GetOrgname().GetGcode() : 0;
2510  ibp->gc_mito = bio_src.GetOrg().GetOrgname().IsSetMgcode() ? bio_src.GetOrg().GetOrgname().GetMgcode() : 0;
2511 }
2512 
2513 /**********************************************************/
2514 static void fta_sort_quals(FeatBlkPtr fbp, bool qamode)
2515 {
2516  if (! fbp)
2517  return;
2518 
2519  for (TQualVector::iterator q = fbp->quals.begin(); q != fbp->quals.end(); ++q) {
2520  if ((*q)->GetQual() == "gene" ||
2521  (! qamode && (*q)->GetQual() == "product"))
2522  continue;
2523 
2524  TQualVector::iterator tq = q;
2525  for (++tq; tq != fbp->quals.end(); ++tq) {
2526  const string& q_qual = (*q)->GetQual();
2527  const string& tq_qual = (*tq)->GetQual();
2528 
2529  if (! tq_qual.empty()) {
2530  if (q_qual == "gene")
2531  continue;
2532 
2533  Int4 i = NStr::CompareNocase(q_qual.c_str(), tq_qual.c_str());
2534  if (i < 0)
2535  continue;
2536  if (i == 0) {
2537  /* Do not sort /gene qualifiers
2538  */
2539  const string q_val = (*q)->GetVal();
2540  const string tq_val = (*tq)->GetVal();
2541 
2542  if (q_val.empty())
2543  continue;
2544 
2545  if (! tq_val.empty()) {
2546  if (q_val[0] >= '0' && q_val[0] <= '9' &&
2547  tq_val[0] >= '0' && tq_val[0] <= '9') {
2548  if (atoi(q_val.c_str()) <= atoi(tq_val.c_str()))
2549  continue;
2550  } else if (q_val <= tq_val)
2551  continue;
2552  }
2553  }
2554  }
2555 
2556  q->Swap(*tq);
2557  }
2558  }
2559 }
2560 
2561 /**********************************************************/
2562 static bool fta_qual_a_in_b(const TQualVector& qual1, const TQualVector& qual2)
2563 {
2564  bool found = false;
2565 
2566  for (const auto& gbqp1 : qual1) {
2567  found = false;
2568  for (const auto& gbqp2 : qual2) {
2569  const Char* qual_a = gbqp1->IsSetQual() ? gbqp1->GetQual().c_str() : nullptr;
2570  const Char* qual_b = gbqp2->IsSetQual() ? gbqp2->GetQual().c_str() : nullptr;
2571 
2572  const Char* val_a = gbqp1->IsSetVal() ? gbqp1->GetVal().c_str() : nullptr;
2573  const Char* val_b = gbqp2->IsSetVal() ? gbqp2->GetVal().c_str() : nullptr;
2574 
2575  if (fta_strings_same(qual_a, qual_b) && fta_strings_same(val_a, val_b)) {
2576  found = true;
2577  break;
2578  }
2579  }
2580  if (! found)
2581  break;
2582  }
2583 
2584  if (! found)
2585  return false;
2586 
2587  return true;
2588 }
2589 
2590 /**********************************************************/
2591 static bool fta_feats_same(const FeatBlk* fbp1, const FeatBlk* fbp2)
2592 {
2593  if (! fbp1 && ! fbp2)
2594  return true;
2595  if (! fbp1 || ! fbp2 ||
2596  fta_strings_same(fbp1->key, fbp2->key) == false ||
2597  fta_strings_same(fbp1->location, fbp2->location) == false)
2598  return false;
2599 
2600  if (fta_qual_a_in_b(fbp1->quals, fbp2->quals) && fta_qual_a_in_b(fbp2->quals, fbp1->quals))
2601  return true;
2602 
2603  return false;
2604 }
2605 
2606 /**********************************************************/
/**********************************************************
 *
 *   static bool fta_rpt_unit_parse_number(cursor, value):
 *
 *      Reads the run of decimal digits starting at "cursor"
 *   into "value" and leaves "cursor" on the first character
 *   that was not consumed. Returns false when there is no
 *   digit at all or when the number does not fit in size_t.
 *
 **********************************************************/
static bool fta_rpt_unit_parse_number(const char*& cursor, size_t& value)
{
    const char* const start = cursor;
    const size_t      limit = static_cast<size_t>(-1);

    value = 0;
    for (; *cursor >= '0' && *cursor <= '9'; ++cursor) {
        const size_t digit = static_cast<size_t>(*cursor - '0');

        /* value * 10 + digit would exceed the range of size_t */
        if (value > (limit - digit) / 10)
            return false;
        value = value * 10 + digit;
    }
    return cursor != start;
}

/**********************************************************
 *
 *   static bool fta_check_rpt_unit_span(val, length):
 *
 *      Returns true when "val" has exactly the form
 *   "<from>..<to>" (decimal digits only, nothing before or
 *   after) with 1 <= from <= to <= length. A null or empty
 *   string, any other layout, and numbers too large to be
 *   represented are all rejected.
 *
 *      The numbers are parsed with an explicit overflow
 *   check and compared as size_t: atoi() has undefined
 *   behavior for out-of-range input, and narrowing "length"
 *   to Int4 would misjudge very long sequences.
 *
 **********************************************************/
static bool fta_check_rpt_unit_span(const char* val, size_t length)
{
    if (! val || *val == '\0')
        return false;

    const char* p    = val;
    size_t      from = 0;
    size_t      to   = 0;

    /* "<from>" followed by the ".." separator */
    if (! fta_rpt_unit_parse_number(p, from) || p[0] != '.' || p[1] != '.')
        return false;
    p += 2;

    /* "<to>" must run up to the end of the string */
    if (! fta_rpt_unit_parse_number(p, to) || *p != '\0')
        return false;

    return from != 0 && from <= to && to <= length;
}
2634 
2635 /**********************************************************/
2636 static void fta_check_rpt_unit_range(FeatBlkPtr fbp, size_t length)
2637 {
2638  Char ch;
2639 
2640  if (! fbp || fbp->quals.empty())
2641  return;
2642 
2643  for (TQualVector::iterator cur = fbp->quals.begin(); cur != fbp->quals.end();) {
2644  if (! (*cur)->IsSetQual() || ! (*cur)->IsSetVal()) {
2645  ++cur;
2646  continue;
2647  }
2648 
2649  const string& qual_str = (*cur)->GetQual();
2650  const string& val_str = (*cur)->GetVal();
2651 
2652  if (qual_str != "rpt_unit_range" || fta_check_rpt_unit_span(val_str.c_str(), length)) {
2653  ++cur;
2654  continue;
2655  }
2656 
2657  if (fbp->location && StringLen(fbp->location) > 20) {
2658  ch = fbp->location[20];
2659  fbp->location[20] = '\0';
2660  } else
2661  ch = '\0';
2662  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_InvalidRptUnitRange, "/rpt_unit_range qualifier \"%s\" on feature \"%s\" at location \"%s%s\" is not a valid basepair range. Qualifier dropped.", val_str.empty() ? "(EMPTY)" : val_str.c_str(), fbp->key ? fbp->key : "Unknown", fbp->location ? fbp->location : "unknown", (ch == '\0') ? "" : "...");
2663  if (ch != '\0')
2664  fbp->location[20] = ch;
2665 
2666  cur = fbp->quals.erase(cur);
2667  }
2668 }
2669 
2670 /**********************************************************/
// NOTE(review): the function header line is not present in this listing
// (embedded line 2671 is missing); the body only uses `dbp`.
//
// Walks the linked list starting at dbp and, for each node, removes the later
// nodes whose feature block is judged identical by fta_feats_same(), posting a
// warning for each removal. Later nodes that carry no data are unlinked and
// deleted as well.
2672 {
2673  DataBlkPtr tdbp;     // node currently compared against dbp
2674  DataBlkPtr tdbpprev; // last node kept in front of tdbp
2675  DataBlkPtr tdbpnext; // successor of tdbp, saved before tdbp may be deleted
2676  const FeatBlk* fbp1;
2677  FeatBlkPtr fbp2;
2678  Char ch;
2679 
      // Nothing to compare in an empty or single-node list.
2680  if (! dbp || ! dbp->mpNext)
2681  return;
2682 
2683  for (; dbp; dbp = dbp->mpNext) {
2684  if (! dbp->mpData)
2685  continue;
2686 
2687  fbp1 = static_cast<const FeatBlk*>(dbp->mpData);
2688  tdbpprev = dbp;
2689  for (tdbp = dbp->mpNext; tdbp; tdbp = tdbpnext) {
2690  tdbpnext = tdbp->mpNext;
2691  if (! tdbp->mpData) {
2692  tdbpprev->mpNext = tdbpnext;
2693  tdbp->SimpleDelete();
2694  continue;
2695  }
2696 
2697  fbp2 = static_cast<FeatBlk*>(tdbp->mpData);
2698 
      // Stop scanning for this dbp as soon as a later location compares
      // greater than its own.
      // NOTE(review): presumably relies on the list being ordered by
      // location text - confirm against the caller.
2699  if (fbp1->location && fbp2->location &&
2700  StringCmp(fbp1->location, fbp2->location) < 0)
2701  break;
2702 
2703  if (! fta_feats_same(fbp1, fbp2)) {
2704  tdbpprev = tdbp;
2705  continue;
2706  }
2707 
      // Show at most 20 characters of the location in the message. The
      // overwritten character is not restored: fbp2 is deleted right below.
2708  if (fbp2->location && StringLen(fbp2->location) > 20) {
2709  ch = fbp2->location[20];
2710  fbp2->location[20] = '\0';
2711  } else
2712  ch = '\0';
2713  ErrPostEx(SEV_WARNING, ERR_FEATURE_DuplicateRemoved, "Duplicated feature \"%s\" at location \"%s%s\" removed.", fbp2->key ? fbp2->key : "???", fbp2->location ? fbp2->location : "???", (ch == '\0') ? "" : "...");
2714 
      // Free the duplicate's feature block, unlink its node and delete it.
2715  delete fbp2;
2716  tdbpprev->mpNext = tdbpnext;
2717  tdbp->SimpleDelete();
2718  }
2719  }
2720 }
2721 
2722 /**********************************************************/
// NOTE(review): the class header line is not present in this listing
// (embedded line 2723 is missing); the constructor shows the class is named
// PredIsGivenQual.
//
// Unary predicate over qualifier references: true when the qualifier's name
// equals the name given at construction.
2724 {
2725 public:
      // Stores a copy of the qualifier name to look for.
2726  PredIsGivenQual(const string& qual) :
2727  qual_(qual) {}
2728 
      // Calls GetQual() directly, without checking IsSetQual() first.
2729  bool operator()(const CRef<CGb_qual>& qual)
2730  {
2731  return qual->GetQual() == qual_;
2732  }
2733 
2734 private:
2735  string qual_; // qualifier name being matched
2736 };
2737 
2738 static void fta_check_multiple_locus_tag(DataBlkPtr dbp, bool* drop)
2739 {
2740  FeatBlkPtr fbp;
2741  Char ch;
2742 
2743  for (; dbp; dbp = dbp->mpNext) {
2744  fbp = static_cast<FeatBlk*>(dbp->mpData);
2745  if (! fbp)
2746  continue;
2747 
2748  size_t i = std::count_if(fbp->quals.begin(), fbp->quals.end(), PredIsGivenQual("locus_tag"));
2749  if (i < 2)
2750  continue;
2751 
2752  if (fbp->location && StringLen(fbp->location) > 50) {
2753  ch = fbp->location[50];
2754  fbp->location[50] = '\0';
2755  } else
2756  ch = '\0';
2757  ErrPostEx(SEV_REJECT, ERR_FEATURE_MultipleLocusTags, "Multiple /locus_tag values for \"%s\" feature at \"%s\".", fbp->key ? fbp->key : "Unknown", fbp->location ? fbp->location : "unknown location");
2758  if (ch != '\0')
2759  fbp->location[50] = ch;
2760  *drop = true;
2761  break;
2762  }
2763 }
2764 
2765 /**********************************************************/
2766 static void fta_check_old_locus_tags(DataBlkPtr dbp, bool* drop)
2767 {
2768  PredIsGivenQual isOldLocusTag("old_locus_tag"),
2769  isLocusTag("locus_tag");
2770 
2771  for (; dbp; dbp = dbp->mpNext) {
2772  FeatBlkPtr fbp = static_cast<FeatBlk*>(dbp->mpData);
2773  if (! fbp)
2774  continue;
2775  size_t olt = std::count_if(fbp->quals.begin(), fbp->quals.end(), isOldLocusTag);
2776  size_t lt = std::count_if(fbp->quals.begin(), fbp->quals.end(), isLocusTag);
2777 
2778  if (olt == 0)
2779  continue;
2780 
2781  if (lt == 0) {
2782  ErrPostEx(SEV_REJECT, ERR_FEATURE_OldLocusTagWithoutNew, "Feature \"%s\" at \"%s\" has an /old_locus_tag qualifier but lacks a /locus_tag qualifier. Entry dropped.", fbp->key ? fbp->key : "Unknown", fbp->location ? fbp->location : "unknown location");
2783  *drop = true;
2784  } else {
2785  for (const auto& gbqp1 : fbp->quals) {
2786  if (! gbqp1->IsSetQual() || ! gbqp1->IsSetVal() || ! isLocusTag(gbqp1))
2787  continue;
2788 
2789  const string& gbqp1_val = gbqp1->GetVal();
2790  if (gbqp1_val.empty())
2791  continue;
2792 
2793  for (const auto& gbqp2 : fbp->quals) {
2794  if (! gbqp2->IsSetQual() || ! gbqp2->IsSetVal())
2795  continue;
2796 
2797  const string& gbqp2_val = gbqp2->GetVal();
2798 
2799  if (! isOldLocusTag(gbqp2) || ! NStr::EqualNocase(gbqp1_val, gbqp2_val))
2800  continue;
2801 
2802  ErrPostEx(SEV_REJECT, ERR_FEATURE_MatchingOldNewLocusTag, "Feature \"%s\" at \"%s\" has an /old_locus_tag qualifier with a value that is identical to that of a /locus_tag qualifier: \"%s\". Entry dropped.", fbp->key ? fbp->key : "Unknown", fbp->location ? fbp->location : "unknown location", gbqp1_val.c_str());
2803  *drop = true;
2804  }
2805  }
2806  }
2807 
2808  if (olt == 1)
2809  continue;
2810 
2811  for (TQualVector::const_iterator gbqp1 = fbp->quals.begin(); gbqp1 != fbp->quals.end(); ++gbqp1) {
2812  const string& gbqp1_val = (*gbqp1)->GetVal();
2813  if (isOldLocusTag(*gbqp1) || gbqp1_val.empty())
2814  continue;
2815 
2816  TQualVector::const_iterator gbqp2 = gbqp1;
2817  for (++gbqp2; gbqp2 != fbp->quals.end(); ++gbqp2) {
2818  const string& gbqp2_val = (*gbqp2)->GetVal();
2819  if (isOldLocusTag(*gbqp2) || gbqp2_val.empty())
2820  continue;
2821 
2822  if (NStr::CompareNocase(gbqp1_val.c_str(), gbqp2_val.c_str()) == 0) {
2823  ErrPostEx(SEV_ERROR, ERR_FEATURE_RedundantOldLocusTag, "Feature \"%s\" at \"%s\" has redundant /old_locus_tag qualifiers. Dropping all but the first.", fbp->key ? fbp->key : "Unknown", fbp->location ? fbp->location : "unknown location");
2824  break;
2825  }
2826  }
2827 
2828  if (gbqp2 != fbp->quals.end())
2829  break;
2830  }
2831  }
2832 }
2833 
2834 /**********************************************************/
// NOTE(review): the function header line is not present in this listing
// (embedded line 2835 is missing); the body only uses `dbp`.
//
// For every feature block in the list, validates its /pseudogene qualifiers:
// every /pseudogene after the first one seen, one with an empty value, or one
// whose value is not found in PseudoGeneValues is erased with an error. When
// a /pseudogene was seen and a legacy /pseudo is also present, /pseudo is
// deleted.
2836 {
2837  FeatBlkPtr fbp;
2838  bool got_pseudogene; // a /pseudogene qualifier has been seen on this feature
2839  bool got_pseudo;     // a /pseudo qualifier has been seen on this feature
2840 
2841  for (; dbp; dbp = dbp->mpNext) {
2842  fbp = static_cast<FeatBlk*>(dbp->mpData);
2843  if (! fbp)
2844  continue;
2845 
2846  got_pseudo = false;
2847  got_pseudogene = false;
2848 
      // The iterator is advanced inside the body because several branches
      // erase the current element.
2849  for (TQualVector::iterator cur = fbp->quals.begin(); cur != fbp->quals.end();) {
2850  const string& qual_str = (*cur)->GetQual();
2851  const string& val_str = (*cur)->IsSetVal() ? (*cur)->GetVal() : "";
2852 
      // Anything but /pseudogene is kept; only note whether it is /pseudo.
2853  if (qual_str != "pseudogene") {
2854  if (! got_pseudo && qual_str == "pseudo")
2855  got_pseudo = true;
2856  ++cur;
2857  continue;
2858  }
2859 
      // Second and further /pseudogene qualifiers are dropped.
2860  if (got_pseudogene) {
2861  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_MultiplePseudoGeneQuals, "Dropping a /pseudogene qualifier because multiple /pseudogene qualifiers are present : <%s> : Feature key <%s> : Feature location <%s>.", val_str.empty() ? "[empty]" : val_str.c_str(), fbp->key, fbp->location);
2862 
2863  cur = fbp->quals.erase(cur);
2864  continue;
2865  }
2866 
      // The flag is raised before the value is validated, so it stays set
      // even if this first /pseudogene is dropped below.
2867  got_pseudogene = true;
2868 
2869  if (val_str.empty()) {
2870  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_InvalidPseudoGeneValue, "Dropping a /pseudogene qualifier because its value is empty : Feature key <%s> : Feature location <%s>.", fbp->key, fbp->location);
2871 
2872  cur = fbp->quals.erase(cur);
2873  continue;
2874  }
2875 
      // A value found in PseudoGeneValues is accepted and kept.
2876  if (MatchArrayString(PseudoGeneValues, val_str.c_str()) >= 0) {
2877  ++cur;
2878  continue;
2879  }
2880 
2881  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_InvalidPseudoGeneValue, "Dropping a /pseudogene qualifier because its value is invalid : <%s> : Feature key <%s> : Feature location <%s>.", val_str.c_str(), fbp->key, fbp->location);
2882 
2883  cur = fbp->quals.erase(cur);
2884  }
2885 
2886  if (! got_pseudogene || ! got_pseudo)
2887  continue;
2888 
2889  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_OldPseudoWithPseudoGene, "A legacy /pseudo qualifier and a /pseudogene qualifier are present on the same feature : Dropping /pseudo : Feature key <%s> : Feature location <%s>.", fbp->key, fbp->location);
2890  DeleteQual(fbp->quals, "pseudo");
2891  }
2892 }
2893 
2894 /**********************************************************/
2895 static void fta_check_compare_qual(DataBlkPtr dbp, bool is_tpa)
2896 {
2897  FeatBlkPtr fbp;
2898  Char ch;
2899  Int4 com_count;
2900  Int4 cit_count;
2901 
2902  for (; dbp; dbp = dbp->mpNext) {
2903  fbp = static_cast<FeatBlk*>(dbp->mpData);
2904  if (! fbp)
2905  continue;
2906 
2907  com_count = 0;
2908  cit_count = 0;
2909 
2910  for (TQualVector::iterator cur = fbp->quals.begin(); cur != fbp->quals.end();) {
2911  const string& qual_str = (*cur)->GetQual();
2912  const string dummy;
2913  const string& val_str = (*cur)->IsSetVal() ? (*cur)->SetVal() : dummy;
2914 
2915  if (qual_str == "compare") {
2916  bool badcom = true;
2917  if (! val_str.empty()) {
2918  const char* q = StringChr(val_str.c_str(), '.');
2919  if (q && q[1] != '\0') {
2920  const char* p;
2921  for (p = q + 1; *p >= '0' && *p <= '9';)
2922  p++;
2923  if (*p == '\0') {
2924  if (GetNucAccOwner(CTempString(val_str, 0, q - val_str.c_str())) > CSeq_id::e_not_set)
2925  badcom = false;
2926  }
2927  }
2928  }
2929 
2930  if (badcom) {
2931  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_IllegalCompareQualifier, "/compare qualifier value is not a legal Accession.Version : feature \"%s\" at \"%s\" : value \"%s\" : qualifier has been dropped.", fbp->key, fbp->location, val_str.empty() ? "[empty]" : val_str.c_str());
2932 
2933  cur = fbp->quals.erase(cur);
2934  continue;
2935  }
2936  com_count++;
2937  } else if (qual_str == "citation")
2938  cit_count++;
2939 
2940  ++cur;
2941  }
2942 
2943  if (com_count > 0 || cit_count > 0 ||
2944  (! StringEqu(fbp->key, "old_sequence") &&
2945  ! StringEqu(fbp->key, "conflict")))
2946  continue;
2947 
2948  ch = '\0';
2949  if (StringLen(fbp->location) > 30) {
2950  ch = fbp->location[30];
2951  fbp->location[30] = '\0';
2952  }
2953  ErrPostEx(SEV_ERROR, ERR_FEATURE_RequiredQualifierMissing, "Feature \"%s\" at \"%s\" lacks required /citation and/or /compare qualifier : feature has been dropped.", fbp->key, fbp->location);
2954  if (ch != '\0')
2955  fbp->location[30] = ch;
2956  dbp->mDrop = true;
2957  }
2958 }
2959 
2960 /**********************************************************/
// NOTE(review): the first line of the function header is not present in this
// listing (embedded line 2961 is missing); the body uses `dbp` and `ibp`.
//
// Scans the locations of the feature blocks in the list for references of the
// form "<accession>[.<version>]:". The first reference that is not acceptable
// under the tests below stops the scan: the entry is marked as dropped
// (ibp->drop) and a reject-level message is posted, worded according to
// ibp->is_tsa / ibp->is_tls. Nothing happens when no such reference is found.
2962  IndexblkPtr ibp)
2963 {
2964  FeatBlkPtr fbp;
2965  char* location; // whitespace-free working copy of the current location
2966  char* p;        // scan position; rests on a ':' while it is examined
2967  char* q;        // moves backwards from p to the start of the accession
2968  char* r;        // first '.' met while moving backwards, if any
2969  Uint1 i;        // result of GetNucAccOwner() for the accession
2970 
2971  location = nullptr;
2972  for (; dbp; dbp = dbp->mpNext) {
2973  fbp = static_cast<FeatBlk*>(dbp->mpData);
2974  if (! fbp || ! fbp->location)
2975  continue;
      // Work on a copy with blanks, tabs and newlines squeezed out.
2976  location = StringSave(fbp->location);
2977  for (p = location, q = p; *p != '\0'; p++)
2978  if (*p != ' ' && *p != '\t' && *p != '\n')
2979  *q++ = *p;
2980  *q = '\0';
2981  if (q == location) {
2982  MemFree(location);
2983  location = nullptr;
2984  continue;
2985  }
2986 
2987  for (p = location + 1; *p != '\0'; p++) {
2988  if (*p != ':')
2989  continue;
      // Walk back from the ':' over letters, digits and '_' to find where the
      // accession starts. One '.' is tolerated and remembered in r; a second
      // '.' or any other character ends the walk just after it.
2990  for (r = nullptr, q = p - 1;; q--) {
2991  if (q == location) {
2992  if (*q != '_' && (*q < '0' || *q > '9') &&
2993  (*q < 'a' || *q > 'z') && (*q < 'A' || *q > 'Z'))
2994  q++;
2995  break;
2996  }
2997  if (*q == '.') {
2998  if (! r) {
2999  r = q;
3000  continue;
3001  }
3002  q++;
3003  break;
3004  }
3005  if (*q != '_' && (*q < '0' || *q > '9') &&
3006  (*q < 'a' || *q > 'z') && (*q < 'A' || *q > 'Z')) {
3007  q++;
3008  break;
3009  }
3010  }
      // Nothing usable in front of this ':'.
3011  if (q == p)
3012  continue;
      // The accession text runs from q up to the remembered '.' (or up to the
      // ':' when there was none).
3013  i = GetNucAccOwner(CTempString(q, (r ? r : p) - q));
      // Accepted: on a non-TPA record, a GenBank-owned accession starting with
      // "EZ" (either case).
3014  if (i == CSeq_id::e_Genbank && (q[0] == 'e' || q[0] == 'E') &&
3015  (q[1] == 'z' || q[1] == 'Z') && ibp->is_tpa == false)
3016  continue;
      // Accepted: on a TPA record, an accession owned by one of the three TPA
      // id types.
3017  if (ibp->is_tpa && (i == CSeq_id::e_Tpg || i == CSeq_id::e_Tpd ||
3018  i == CSeq_id::e_Tpe))
3019  continue;
      // Anything else is an offending reference: stop scanning.
3020  break;
3021  }
      // p stopped before the end of the text only when an offending reference
      // was found; leave the outer loop with dbp, fbp and location still
      // describing that feature.
3022  if (*p != '\0')
3023  break;
3024  if (location) {
3025  MemFree(location);
3026  location = nullptr;
3027  }
3028  }
      // The list was exhausted without finding an offender.
3029  if (! dbp)
3030  return;
3031 
3032  ibp->drop = true;
      // Locations longer than 45 characters are cut to 40 characters plus
      // "..." for the message.
3033  if (location && StringLen(location) > 45) {
3034  location[40] = '\0';
3035  StringCat(location, "...");
3036  }
3037  if (ibp->is_tsa)
3038  ErrPostEx(SEV_REJECT, ERR_LOCATION_AccessionNotTSA, "Feature \"%s\" at \"%s\" on a TSA record cannot point to a non-TSA record.", fbp->key, location ? location : "empty_location");
3039  else if (ibp->is_tls)
3040  ErrPostEx(SEV_REJECT, ERR_LOCATION_AccessionNotTLS, "Feature \"%s\" at \"%s\" on a TLS record cannot point to a non-TLS record.", fbp->key, location ? location : "empty_location");
3041  else
3042  ErrPostEx(SEV_REJECT, ERR_LOCATION_AccessionNotTPA, "Feature \"%s\" at \"%s\" on a TPA record cannot point to a non-TPA record.", fbp->key, location ? location : "empty_location");
3043  if (location)
3044  MemFree(location);
3045 }
3046 
3047 /**********************************************************/
// NOTE(review): the function header line is not present in this listing
// (embedded line 3048 is missing); the body iterates over `feats` and returns
// a bool.
//
// Cross-checks the /operon qualifiers of the Imp features in `feats`.
// Qualifiers are collected into two lists, depending on what
// FTAOperon::IsOperon() reports for the feature they sit on: `operonList`
// when it is true, `residentList` otherwise. Every violation posts a
// reject-level message and makes the function return false; it returns true
// when `feats` is empty or no violation was found.
3049 {
3050  using FTAOperonList = list<FTAOperon*>;
3051  FTAOperonList operonList;
3052  FTAOperonList residentList;
3053  bool success = true;
3054 
3055  if (feats.empty()) {
3056  return true;
3057  }
3058 
3059  for (const auto& pFeat : feats) {
3060  if (! pFeat->GetData().IsImp())
3061  continue;
3062 
3063  const auto& featLocation = pFeat->GetLocation();
3064  const CImp_feat& featImp = pFeat->GetData().GetImp();
3065  FTAOperon* pLatest(nullptr); // record made for the last /operon seen on this feature
3066  int opQualCount(0);          // number of non-empty /operon qualifiers on this feature
3067 
3068  for (const auto& pQual : pFeat->GetQual()) {
3069  const auto& qual = *pQual;
3070  if (! qual.IsSetQual() || qual.GetQual() != "operon" ||
3071  ! qual.IsSetVal() || qual.GetVal().empty()) {
3072  continue;
3073  }
3074  opQualCount++;
3075 
      // Ownership of the new record passes to one of the two lists; both are
      // emptied with delete at the end of the function.
3076  pLatest = new FTAOperon(
3077  featImp.IsSetKey() ? featImp.GetKey().c_str() : "Unknown",
3078  qual.GetVal(),
3079  featLocation);
3080  if (pLatest->IsOperon()) {
3081  operonList.push_back(pLatest);
3082  } else {
3083  residentList.push_back(pLatest);
3084  continue;
3085  }
      // A new operon record must not reuse the /operon value of any operon
      // record collected so far.
3086  for (const auto& operon : operonList) {
3087  if (pLatest == operon) {
3088  continue;
3089  }
3090  if (pLatest->mOperon != operon->mOperon) {
3091  continue;
3092  }
3093  ErrPostEx(SEV_REJECT, ERR_FEATURE_OperonQualsNotUnique, "The operon features at \"%s\" and \"%s\" utilize the same /operon qualifier : \"%s\".", operon->LocationStr().c_str(), pLatest->LocationStr().c_str(), pLatest->mOperon.c_str());
3094  success = false;
3095  }
3096  }
3097 
3098  if (opQualCount > 1) {
3099  ErrPostEx(SEV_REJECT, ERR_FEATURE_MultipleOperonQuals, "Feature \"%s\" at \"%s\" has more than one operon qualifier.", pLatest->mFeatname.c_str(), pLatest->LocationStr().c_str());
3100  success = false;
3101  }
3102 
      // A feature whose key is "operon" needs an /operon qualifier of its own.
3103  if (opQualCount == 0 && featImp.IsSetKey() && featImp.GetKey() == "operon") {
3104  ErrPostEx(SEV_REJECT, ERR_FEATURE_MissingOperonQual, "The operon feature at \"%s\" lacks an /operon qualifier.", location_to_string_or_unknown(featLocation).c_str());
3105  success = false;
3106  }
3107  }
3108 
      // Every record in residentList must name an existing operon and lie
      // within that operon's location.
3109  for (const auto& resident : residentList) {
3110  bool matched = false;
3111  for (const auto& operon : operonList) {
3112  if (resident->mOperon != operon->mOperon) {
3113  continue;
3114  }
3115  matched = true;
      // NOTE(review): the line that declares `compare` and opens the call
      // whose arguments follow is not present in this listing (embedded line
      // 3116 is missing).
3117  *resident->mLocation, *operon->mLocation, nullptr, sequence::fCompareOverlapping);
3118  if (compare != sequence::eContained && compare != sequence::eSame) {
3119  ErrPostEx(SEV_REJECT, ERR_FEATURE_OperonLocationMisMatch, "Feature \"%s\" at \"%s\" with /operon qualifier \"%s\" does not fall within the span of the operon feature at \"%s\".", resident->mFeatname.c_str(), resident->LocationStr().c_str(), resident->mOperon.c_str(), operon->LocationStr().c_str());
3120  success = false;
3121  }
3122  }
3123  if (! matched) {
3124  ErrPostEx(SEV_REJECT, ERR_FEATURE_InvalidOperonQual, "/operon qualifier \"%s\" on feature \"%s\" at \"%s\" has a value that does not match any of the /operon qualifiers on operon features.", resident->mOperon.c_str(), resident->mFeatname.c_str(), resident->LocationStr().c_str());
3125  success = false;
3126  }
3127  }
      // Release every record allocated above.
3128  for (auto& resident : residentList) {
3129  delete resident;
3130  }
3131  for (auto& operon : operonList) {
3132  delete operon;
3133  }
3134  return success;
3135 }
3136 
3137 /**********************************************************/
// NOTE(review): the function header line is not present in this listing
// (embedded line 3138 is missing); the body only uses `fbp`.
//
// Removes repeated qualifiers from a feature block: for each qualifier, every
// later one with the same name and the same value (as judged by
// fta_strings_same(), an unset field being passed as a null pointer) is
// erased and an error is posted for it. The first occurrence is kept.
3139 {
3140  Char ch;
3141 
3142  if (! fbp || fbp->quals.empty())
3143  return;
3144 
3145  for (TQualVector::iterator cur = fbp->quals.begin(); cur != fbp->quals.end(); ++cur) {
3146  const char* cur_qual = (*cur)->IsSetQual() ? (*cur)->GetQual().c_str() : nullptr;
3147  const char* cur_val = (*cur)->IsSetVal() ? (*cur)->GetVal().c_str() : nullptr;
3148 
      // `next` is advanced inside the body because a match erases the element
      // it points at.
3149  TQualVector::iterator next = cur;
3150  for (++next; next != fbp->quals.end();) {
3151  const char* next_qual = (*next)->IsSetQual() ? (*next)->GetQual().c_str() : nullptr;
3152  const char* next_val = (*next)->IsSetVal() ? (*next)->GetVal().c_str() : nullptr;
3153 
3154  if (! fta_strings_same(cur_qual, next_qual) || ! fta_strings_same(cur_val, next_val)) {
3155  ++next;
3156  continue;
3157  }
3158 
      // Show at most 20 characters of the location in the message; the
      // overwritten character is put back right after.
3159  if (fbp->location && StringLen(fbp->location) > 20) {
3160  ch = fbp->location[20];
3161  fbp->location[20] = '\0';
3162  } else
3163  ch = '\0';
3164 
3165  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_DuplicateRemoved, "Duplicated qualifier \"%s\" in feature \"%s\" at location \"%s%s\" removed.", cur_qual ? cur_qual : "???", fbp->key ? fbp->key : "???", fbp->location ? fbp->location : "???", (ch == '\0') ? "" : "...");
3166 
3167  if (ch != '\0')
3168  fbp->location[20] = ch;
3169 
      // NOTE(review): `cur` and its cached c_str() pointers stay in use after
      // this erase - this assumes erasing a later element leaves `cur` valid
      // for the TQualVector container type; confirm.
3170  next = fbp->quals.erase(next);
3171  }
3172  }
3173 }
3174 
3175 /**********************************************************/
3176 static void CollectGapFeats(const DataBlk& entry, DataBlkPtr dbp, ParserPtr pp, Int2 type)
3177 {
3178  IndexblkPtr ibp;
3179  GapFeatsPtr gfp = nullptr;
3180  GapFeatsPtr tgfp;
3181  DataBlkPtr tdbp;
3182  FeatBlkPtr fbp;
3183 
3184  CLinkage_evidence::TLinkage_evidence asn_linkage_evidence;
3185  list<string> linkage_evidence_names;
3186 
3187  StrNum* snp;
3188  char* p;
3189  char* q;
3190  const char* gap_type;
3191  bool finished_gap;
3192  ErrSev sev;
3193  Int4 estimated_length;
3194  Int4 is_htg;
3195  Int4 from;
3196  Int4 to;
3197  Int4 prev_gap; /* 0 - initial, 1 - "gap",
3198  2 - "assembly_gap" */
3199  Int4 curr_gap; /* 0 - initial, 1 - "gap",
3200  2 - "assembly_gap" */
3201  CSeq_gap::TType asn_gap_type;
3202 
3203  ibp = pp->entrylist[pp->curindx];
3204 
3205  if (ibp->keywords.empty()) {
3206  if (pp->format == Parser::EFormat::GenBank)
3208  else if (pp->format == Parser::EFormat::EMBL)
3210  else if (pp->format == Parser::EFormat::XML)
3211  XMLGetKeywords(entry.mOffset, ibp->xip, ibp->keywords);
3212  }
3213 
3214  is_htg = -1;
3215  for (const string& key : ibp->keywords) {
3216  if (is_htg >= 0 && is_htg <= 2)
3217  break;
3218  if (key == "HTG")
3219  is_htg = 3;
3220  else if (key == "HTGS_PHASE0")
3221  is_htg = 0;
3222  else if (key == "HTGS_PHASE1")
3223  is_htg = 1;
3224  else if (key == "HTGS_PHASE2")
3225  is_htg = 2;
3226  else if (key == "HTGS_PHASE3")
3227  is_htg = 3;
3228  }
3229 
3230  // prev_gap = 0;
3231  curr_gap = 0;
3232  finished_gap = false;
3233  for (ibp->gaps = nullptr; dbp; dbp = dbp->mpNext) {
3234  if (ibp->drop)
3235  break;
3236  if (dbp->mType != type)
3237  continue;
3238 
3239  linkage_evidence_names.clear();
3240  asn_linkage_evidence.clear();
3241 
3242  for (tdbp = static_cast<DataBlk*>(dbp->mpData); tdbp; tdbp = tdbp->mpNext) {
3243  if (ibp->drop)
3244  break;
3245  fbp = static_cast<FeatBlk*>(tdbp->mpData);
3246  if (! fbp || ! fbp->key)
3247  continue;
3248  if (StringEqu(fbp->key, "gap")) {
3249  prev_gap = curr_gap;
3250  curr_gap = 1;
3251  } else if (StringEqu(fbp->key, "assembly_gap")) {
3252  prev_gap = curr_gap;
3253  curr_gap = 2;
3254  } else
3255  continue;
3256 
3257  from = 0;
3258  to = 0;
3259  gap_type = nullptr;
3260  linkage_evidence_names.clear();
3261  asn_gap_type = -1;
3262  asn_linkage_evidence.clear();
3263  estimated_length = -1;
3264 
3265  for (const auto& cur : fbp->quals) {
3266  if (! cur->IsSetQual() || ! cur->IsSetVal())
3267  continue;
3268 
3269  const string& cur_qual = cur->GetQual();
3270  const string& cur_val = cur->GetVal();
3271 
3272  if (cur_qual.empty() || cur_val.empty())
3273  continue;
3274 
3275  if (cur_qual == "estimated_length") {
3276  if (cur_val == "unknown")
3277  estimated_length = -100;
3278  else {
3279  const char* cp = cur_val.c_str();
3280  for (; *cp >= '0' && *cp <= '9';)
3281  ++cp;
3282  if (*cp == '\0')
3283  estimated_length = atoi(cur_val.c_str());
3284  }
3285  } else if (cur_qual == "gap_type")
3286  gap_type = cur_val.c_str();
3287  else if (cur_qual == "linkage_evidence") {
3288  linkage_evidence_names.push_back(cur_val);
3289  }
3290  }
3291 
3292  if (fbp->location) {
3293  p = fbp->location;
3294  if (*p == '<')
3295  p++;
3296  for (q = p; *p >= '0' && *p <= '9';)
3297  p++;
3298  if (*p == '\0') {
3299  from = atoi(q);
3300  to = from;
3301  } else if (*p == '.') {
3302  *p = '\0';
3303  from = atoi(q);
3304  *p++ = '.';
3305  if (*fbp->location == '<' && from != 1)
3306  from = 0;
3307  else if (*p == '.') {
3308  if (*++p == '>')
3309  p++;
3310  for (q = p; *p >= '0' && *p <= '9';)
3311  p++;
3312  if (*p == '\0')
3313  to = atoi(q);
3314  if (*(q - 1) == '>' && to != (int)ibp->bases)
3315  to = 0;
3316  }
3317  }
3318  }
3319 
3320  if (from == 0 || to == 0 || from > to) {
3321  if (curr_gap == 1)
3322  ErrPostEx(SEV_REJECT, ERR_FEATURE_InvalidGapLocation, "Invalid gap feature location : \"%s\" : all gap features must have a simple X..Y location on the plus strand.", fbp->location ? fbp->location : "unknown");
3323  else
3324  ErrPostEx(SEV_REJECT, ERR_FEATURE_InvalidAssemblyGapLocation, "Invalid assembly_gap location : \"%s\".", fbp->location ? fbp->location : "unknown");
3325  ibp->drop = true;
3326  break;
3327  }
3328 
3329  if (curr_gap == 2) /* "assembly_gap" feature */
3330  {
3331  if (gap_type && is_htg > -1 &&
3332  ! StringEqu(gap_type, "within scaffold") &&
3333  ! StringEqu(gap_type, "repeat within scaffold"))
3334  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_UnexpectedGapTypeForHTG, "assembly_gap has /gap_type of \"%s\", but clone-based HTG records are only expected to have \"within scaffold\" or \"repeat within scaffold\" gaps. assembly_gap feature located at \"%d..%d\".", gap_type, from, to);
3335 
3336  if (is_htg == 0 || is_htg == 1) {
3337  for (const string& evidence : linkage_evidence_names) {
3339  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_LinkageShouldBeUnspecified, "assembly gap has /linkage_evidence of \"%s\", but unoriented and unordered Phase0/Phase1 HTG records are expected to have \"unspecified\" evidence. assembly_gap feature located at \"%d..%d\".", evidence.c_str(), from, to);
3340  }
3341  }
3342  } else if (is_htg == 2 || is_htg == 3) {
3343  for (const string& evidence : linkage_evidence_names) {
3345  continue;
3346 
3347  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_LinkageShouldNotBeUnspecified, "assembly gap has /linkage_evidence of \"unspecified\", but ordered and oriented HTG records are expected to have some level of linkage for their gaps. assembly_gap feature located at \"%d..%d\".", from, to);
3348  }
3349  }
3350 
3351  if (is_htg == 3 && ! finished_gap) {
3352  ErrPostEx(SEV_ERROR, ERR_FEATURE_FinishedHTGHasAssemblyGap, "Finished Phase-3 HTG records are not expected to have any gaps. First assembly_gap feature encountered at \"%d..%d\".", from, to);
3353  finished_gap = true;
3354  }
3355 
3356  if (! gap_type) {
3357  ErrPostEx(SEV_REJECT, ERR_QUALIFIER_MissingGapType, "assembly_gap feature at \"%d..%d\" lacks the required /gap_type qualifier.", from, to);
3358  ibp->drop = true;
3359  break;
3360  }
3361 
3362  for (snp = GapTypeValues; snp->str; snp++)
3363  if (StringEqu(snp->str, gap_type))
3364  break;
3365  if (! snp->str) {
3366  ErrPostEx(SEV_REJECT, ERR_QUALIFIER_InvalidGapType, "assembly_gap feature at \"%d..%d\" has an invalid gap type : \"%s\".", from, to, gap_type);
3367  ibp->drop = true;
3368  break;
3369  }
3370  asn_gap_type = snp->num;
3371 
3372  if (linkage_evidence_names.empty() &&
3373  (StringEqu(gap_type, "within scaffold") ||
3374  StringEqu(gap_type, "repeat within scaffold"))) {
3375  ErrPostEx(SEV_REJECT, ERR_QUALIFIER_MissingLinkageEvidence, "assembly_gap feature at \"%d..%d\" with gap type \"%s\" lacks a /linkage_evidence qualifier.", from, to, gap_type);
3376  ibp->drop = true;
3377  break;
3378  }
3379  if (! linkage_evidence_names.empty()) {
3380  if (! StringEqu(gap_type, "unknown") &&
3381  ! StringEqu(gap_type, "within scaffold") &&
3382  ! StringEqu(gap_type, "repeat within scaffold")) {
3385  "The /linkage_evidence qualifier is not legal for the assembly_gap feature at \"%d..%d\" with /gap_type \"%s\".",
3386  from,
3387  to,
3388  gap_type);
3389  ibp->drop = true;
3390  break;
3391  }
3392 
3393  for (const string& evidence : linkage_evidence_names) {
3394  for (snp = LinkageEvidenceValues; snp->str; snp++)
3395  if (evidence == snp->str)
3396  break;
3397  if (! snp->str) {
3400  "assembly_gap feature at \"%d..%d\" has an invalid linkage evidence : \"%s\".",
3401  from,
3402  to,
3403  evidence.c_str());
3404  ibp->drop = true;
3405  break;
3406  }
3407 
3408  CRef<CLinkage_evidence> new_evidence(new CLinkage_evidence);
3409  new_evidence->SetType(snp->num);
3410  asn_linkage_evidence.push_back(new_evidence);
3411  }
3412  }
3413  }
3414 
3415  if (prev_gap + curr_gap == 3) {
3416  if (curr_gap == 1)
3417  ErrPostEx(SEV_REJECT, ERR_FEATURE_AssemblyGapAndLegacyGap, "Legacy gap feature at \"%d..%d\" co-exists with a new AGP 2.0 assembly_gap feature at \"%d..%d\".", from, to, gfp->from, gfp->to);
3418  else
3419  ErrPostEx(SEV_REJECT, ERR_FEATURE_AssemblyGapAndLegacyGap, "Legacy gap feature at \"%d..%d\" co-exists with a new AGP 2.0 assembly_gap feature at \"%d..%d\".", gfp->from, gfp->to, from, to);
3420  ibp->drop = true;
3421  break;
3422  }
3423 
3424  if (estimated_length == -1) /* missing qual */
3425  {
3426  ErrPostEx(SEV_REJECT, ERR_FEATURE_RequiredQualifierMissing, "The gap feature at \"%d..%d\" lacks the required /estimated_length qualifier.", from, to);
3427  ibp->drop = true;
3428  } else if (estimated_length == 0) {
3429  ErrPostEx(SEV_REJECT, ERR_FEATURE_IllegalEstimatedLength, "Gap feature at \"%d..%d\" has an illegal /estimated_length qualifier : \"%s\" : should be \"unknown\" or an integer.",
3430  // from, to, gbqp->val); // at this point gbqp is definitely = NULL
3431  from,
3432  to,
3433  "");
3434  ibp->drop = true;
3435  } else if (estimated_length == -100) {
3436  if (is_htg >= 0 && to - from != 99) {
3437  ErrPostEx(SEV_ERROR, ERR_FEATURE_UnknownGapNot100, "Gap feature at \"%d..%d\" has /estimated_length \"unknown\" but the gap size is not 100 bases.", from, to);
3438  }
3439  } else if (estimated_length != to - from + 1) {
3441  sev = SEV_ERROR;
3442  else {
3443  sev = SEV_REJECT;
3444  ibp->drop = true;
3445  }
3446 
3447  ErrPostEx(sev, ERR_FEATURE_GapSizeEstLengthMissMatch, "Gap feature at \"%d..%d\" has a size that does not match the /estimated_length : %d.", from, to, estimated_length);
3448  }
3449 
3450  for (gfp = ibp->gaps; gfp; gfp = gfp->next) {
3451  if ((gfp->from >= from && gfp->from <= to) ||
3452  (gfp->to >= from && gfp->to <= to) ||
3453  (gfp->from <= from && gfp->to >= to)) {
3454  ErrPostEx(SEV_REJECT, ERR_FEATURE_OverlappingGaps, "Gap features at \"%d..%d\" and \"%d..%d\" overlap.", from, to, gfp->from, gfp->to);
3455  ibp->drop = true;
3456  } else if (to + 1 == gfp->from || from - 1 == gfp->to) {
3457  if (pp->source == Parser::ESource::EMBL)
3458  sev = SEV_ERROR;
3459  else {
3460  sev = SEV_REJECT;
3461  ibp->drop = true;
3462  }
3463 
3464  ErrPostEx(sev, ERR_FEATURE_ContiguousGaps, "Gap features at \"%d..%d\" and \"%d..%d\" are contiguous, and should probably be represented by a single gap that spans both.", from, to, gfp->from, gfp->to);
3465  }
3466  }
3467  if (ibp->drop)
3468  break;
3469 
3470  gfp = new GapFeats;
3471  gfp->from = from;
3472  gfp->to = to;
3473  gfp->estimated_length = estimated_length;
3474  if (curr_gap == 2) /* /assembly_gap feature */
3475  gfp->assembly_gap = true;
3476  if (gap_type) {
3477  gfp->gap_type = gap_type;
3478  gfp->asn_gap_type = asn_gap_type;
3479  }
3480  if (! asn_linkage_evidence.empty()) {
3481  gfp->asn_linkage_evidence.swap(asn_linkage_evidence);
3482  asn_linkage_evidence.clear();
3483  }
3484  gfp->next = nullptr;
3485 
3486  if (! ibp->gaps) {
3487  ibp->gaps = gfp;
3488  continue;
3489  }
3490 
3491  if (ibp->gaps->from > from) {
3492  gfp->next = ibp->gaps;
3493  ibp->gaps = gfp;
3494  continue;
3495  }
3496 
3497  if (! ibp->gaps->next) {
3498  ibp->gaps->next = gfp;
3499  continue;
3500  }
3501 
3502  for (tgfp = ibp->gaps; tgfp; tgfp = tgfp->next) {
3503  if (tgfp->next && tgfp->next->from < from)
3504  continue;
3505  gfp->next = tgfp->next;
3506  tgfp->next = gfp;
3507  break;
3508  }
3509  }
3510  if (ibp->drop) {
3511  linkage_evidence_names.clear();
3512  asn_linkage_evidence.clear();
3513  }
3514  }
3515 
3516  if (! ibp->gaps)
3517  return;
3518 
3519  if (ibp->drop) {
3520  GapFeatsFree(ibp->gaps);
3521  ibp->gaps = nullptr;
3522  }
3523 }
3524 
3525 /**********************************************************/
3526 static void XMLGetQuals(char* entry, XmlIndexPtr xip, TQualVector& quals)
3527 {
3528  XmlIndexPtr xipqual;
3529 
3530  if (! entry || ! xip)
3531  return;
3532 
3533  for (; xip; xip = xip->next) {
3534  if (! xip->subtags)
3535  continue;
3536 
3537  CRef<CGb_qual> qual(new CGb_qual);
3538  for (xipqual = xip->subtags; xipqual; xipqual = xipqual->next) {
3539  if (xipqual->tag == INSDQUALIFIER_NAME)
3540  qual->SetQual(XMLGetTagValue(entry, xipqual));
3541  else if (xipqual->tag == INSDQUALIFIER_VALUE)
3542  qual->SetVal(XMLGetTagValue(entry, xipqual));
3543  }
3544 
3545  if (qual->GetQual() == "replace" && ! qual->IsSetVal()) {
3546  qual->SetVal("");
3547  }
3548 
3549  if (qual->IsSetQual() && ! qual->GetQual().empty())
3550  quals.push_back(qual);
3551  }
3552 }
3553 
3554 /**********************************************************/
3555 static DataBlkPtr XMLLoadFeatBlk(char* entry, XmlIndexPtr xip)
3556 {
3557  XmlIndexPtr xipfeat;
3558  DataBlkPtr headdbp;
3559  DataBlkPtr dbp;
3560  DataBlkPtr ret;
3561  FeatBlkPtr fbp;
3562 
3563  if (! entry || ! xip)
3564  return nullptr;
3565 
3566  for (; xip; xip = xip->next)
3567  if (xip->tag == INSDSEQ_FEATURE_TABLE)
3568  break;
3569 
3570  if (! xip || ! xip->subtags)
3571  return nullptr;
3572 
3573  headdbp = nullptr;
3574  for (xip = xip->subtags; xip; xip = xip->next) {
3575  if (! xip->subtags)
3576  continue;
3577  fbp = new FeatBlk;
3578  fbp->spindex = -1;
3579  for (xipfeat = xip->subtags; xipfeat; xipfeat = xipfeat->next) {
3580  if (xipfeat->tag == INSDFEATURE_KEY)
3581  fbp->key = XMLGetTagValue(entry, xipfeat);
3582  else if (xipfeat->tag == INSDFEATURE_LOCATION)
3583  fbp->location = XMLGetTagValue(entry, xipfeat);
3584  else if (xipfeat->tag == INSDFEATURE_QUALS)
3585  XMLGetQuals(entry, xipfeat->subtags, fbp->quals);
3586  }
3587  if (! headdbp) {
3588  headdbp = new DataBlk;
3589  dbp = headdbp;
3590  } else {
3591  dbp->mpNext = new DataBlk;
3592  dbp = dbp->mpNext;
3593  }
3594  dbp->mpData = fbp;
3595  }
3596  ret = new DataBlk;
3597  ret->mType = XML_FEATURES;
3598  ret->mpData = headdbp;
3599  ret->mpNext = nullptr;
3600  return (ret);
3601 }
3602 
3603 /**********************************************************
3604  *
3605  * static FeatBlkPtr MergeNoteQual(fbp):
3606  *
3607  * Only one note on every key feature block,
3608  * not complete.
3609  *
3610  * 5-28-93
3611  *
3612  **********************************************************/
// Body of MergeNoteQual: folds every /note qualifier of fbp into one.
// NOTE(review): the declaration line that should precede this brace (listing
// line 3613) is absent from this copy; the header comment above gives it as
// "static FeatBlkPtr MergeNoteQual(fbp)" - confirm against the repository.
3614 {
3615  char* p;
3616  char* q;
3617 
3618  size_t size = 0;
3619 
// Pass 1: clean each non-empty note value in place and add up the space the
// merged text will need.
3620  for (auto& cur : fbp->quals) {
3621  if (! cur->IsSetQual() || ! cur->IsSetVal())
3622  continue;
3623 
3624  const string& cur_qual = cur->GetQual();
3625  const string& cur_val = cur->GetVal();
3626 
3627  if (cur_qual != "note" || cur_val.empty())
3628  continue;
3629 
// Two bytes per note are reserved for the ";~" separator written in pass 2.
3630  size += 2;
3631  vector<Char> buf(cur_val.size() + 1);
3632 
// Copy the value; after a ';' that is followed by ' ' or ';', the whole run of
// blanks/semicolons is replaced by one blank (or by nothing at end of string).
3633  const char* cp = cur_val.c_str();
3634  for (q = &buf[0]; *cp != '\0'; ++cp) {
3635  *q++ = *cp;
3636  if (*cp == ';' && (cp[1] == ' ' || cp[1] == ';')) {
3637  for (++cp; *cp == ' ' || *cp == ';';)
3638  ++cp;
3639  if (*cp != '\0')
3640  *q++ = ' ';
// Step back so the loop's ++cp lands on the first character after the run.
3641  --cp;
3642  }
3643  }
3644 
3645  *q = '\0';
3646  cur->SetVal(&buf[0]);
3647 
// Cleaned length plus one extra byte per '~', because pass 2 writes each '~'
// twice.
3648  size += cur->GetVal().size();
3649  for (cp = cur->GetVal().c_str(); *cp != '\0'; ++cp)
3650  if (*cp == '~')
3651  ++size;
3652  }
3653 
// No non-empty note was found: leave the qualifier list untouched.
3654  if (size == 0)
3655  return (fbp);
3656 
// size over-counts by one separator (2 bytes); size - 1 is the text length
// plus one.  Presumably StringNew(n) also provides the terminator - TODO confirm.
3657  char* note = StringNew(size - 1);
3658  p = note;
3659 
// Pass 2: append every note value to the buffer and erase the qualifier.
// Qualifiers without a name or value are skipped, not erased.
3660  for (TQualVector::iterator cur = fbp->quals.begin(); cur != fbp->quals.end();) {
3661  if (! (*cur)->IsSetQual() || ! (*cur)->IsSetVal()) {
3662  ++cur;
3663  continue;
3664  }
3665 
3666  const string& cur_qual = (*cur)->GetQual();
3667  const string& cur_val = (*cur)->GetVal();
3668 
3669  if (cur_qual != "note") {
3670  ++cur;
3671  continue;
3672  }
3673 
3674  if (! cur_val.empty()) {
3675  /* sometime we get note qual w/o value
3676  */
// Separate this note from the text already written.
3677  if (p > note) {
3678  *p++ = ';';
3679  *p++ = '~';
3680  }
3681 
// Copy the value; a '~' is emitted twice (once here, once by the loop step).
3682  for (const char* cq = cur_val.c_str(); *cq != '\0'; *p++ = *cq++)
3683  if (*cq == '~')
3684  *p++ = '~';
3685  }
3686 
// Note qualifiers with an empty value are erased as well.
3687  cur = fbp->quals.erase(cur);
3688  }
3689  *p = '\0';
3690 
// Replace the erased notes with a single merged /note at the end of the list.
3691  CRef<CGb_qual> qual_new(new CGb_qual);
3692  qual_new->SetQual("note");
3693  qual_new->SetVal(note);
3694  MemFree(note);
3695 
3696  fbp->quals.push_back(qual_new);
3697 
3698  return (fbp);
3699 }
3700 
3701 /**********************************************************/
// Extracts a candidate qualifier name from the start of `val` and reports
// whether it is acceptable.
//   val   text to scan
//   ch    character that ends the name early
//   qual  optional output; receives the extracted name on success
3702 static bool CheckLegalQual(const Char* val, Char ch, string* qual)
3703 {
3704  string qual_name;
// The name is the leading run of letters and underscores, cut short at `ch`
// or at the end of the string.
3705  for (; *val && *val != ch && (isalpha(*val) || *val == '_'); ++val)
3706  qual_name += *val;
3707 
// NOTE(review): listing lines 3708-3709 are absent from this copy.  They
// presumably hold the condition that guards the "return false" below -
// confirm against the repository before relying on this function's contract.
3710  return false;
3711 
3712  if (qual)
3713  *qual = qual_name;
3714 
3715  return true;
3716 }
3717 
3718 /**********************************************************/
/**
 * Lower-case the ASCII letters 'A'..'Z' of a C string in place.
 *
 * All other bytes are left untouched.  A null pointer or an empty string
 * is accepted and changes nothing.
 *
 * @param str  NUL-terminated buffer to modify; may be null
 */
static void fta_convert_to_lower_case(char* str)
{
    if (! str)
        return;

    for (char* cur = str; *cur != '\0'; ++cur) {
        const bool is_upper = ('A' <= *cur && *cur <= 'Z');
        if (is_upper)
            *cur = static_cast<char>(*cur | 0x20); /* set the ASCII case bit */
    }
}
3730 
3731 /**********************************************************/
/**
 * Make sure every comma in a /cons_splice value is followed by a blank.
 *
 * A blank is inserted after each comma that is followed by neither a blank
 * nor the end of the string.  When nothing has to be inserted the buffer is
 * left untouched; otherwise it is replaced by a new, larger buffer.
 *
 * @param val_buf  NUL-terminated character buffer, rewritten in place
 */
static void fta_process_con_slice(vector<char>& val_buf)
{
    /* True when s points at a comma that needs a blank after it. */
    auto needs_blank = [](const char* s) {
        return s[0] == ',' && s[1] != ' ' && s[1] != '\0';
    };

    size_t missing = 0;
    for (const char* src = &val_buf[0]; *src != '\0'; ++src) {
        if (needs_blank(src))
            ++missing;
    }

    if (missing == 0)
        return;

    /* New size: old size, one slot per inserted blank, and one spare slot. */
    vector<char> padded(missing + 1 + val_buf.size());
    char*        dst = &padded[0];
    for (const char* src = &val_buf[0]; *src != '\0'; ++src) {
        *dst++ = *src;
        if (needs_blank(src))
            *dst++ = ' ';
    }
    *dst = '\0';

    val_buf.swap(padded);
}
3756 
3757 
// Parameter list and body of the line splitter that ParseQualifiers calls as
// xSplitLines(bstr, qualLines).
// NOTE(review): the line carrying the return type and function name (listing
// line 3758) is absent from this copy - confirm against the repository.
//   str    text to split
//   lines  receives the pieces produced by splitting on '\n'
3759  const string& str,
3760  vector<string>& lines)
3761 {
// The last argument is the NStr::Split flags value; 0 is passed here.
3762  NStr::Split(str, "\n", lines, 0);
3763 }
3764 
3765 /**********************************************************
3766  *
3767  * static void ParseQualifiers(fbp, bptr, eptr,
3768  * format):
3769  *
3770  * Parsing qualifier and put into link list fbp->qual.
3771  * Some qualifiers may not have value.
3772  * genbank qualifier format: /qualifier=value
3773  * embl qualifier format: /qualifier= value
3774  *
3775  * 10-12-93
3776  *
3777  **********************************************************/
// Parses the qualifier text [bptr, eptr) of one feature and appends every
// qualifier found to fbp->quals.
3778 static void ParseQualifiers(
3779  FeatBlkPtr fbp,
3780  const char* bptr,
3781  const char* eptr,
// NOTE(review): listing line 3782 is absent from this copy; it presumably
// declares the `format` parameter used below - confirm against the repository.
3783 {
3784  string bstr(bptr, eptr);
// NOTE(review): listing line 3785 is also absent from this copy.
3786  // cerr << "bstr:\n" << bstr.c_str() << "\n\n";
// Work on the qualifier text one line at a time.
3787  vector<string> qualLines;
3788  xSplitLines(bstr, qualLines);
3789 
3790  string qualKey, qualVal;
3791  string featKey(fbp->key);
3792  string featLocation(fbp->location);
3793  CQualParser qualParser(format, featKey, featLocation, qualLines);
// Each successful GetNextQualifier() call yields one name/value pair, stored
// as a new CGb_qual; failed calls add nothing.
3794  while (! qualParser.Done()) {
3795  if (qualParser.GetNextQualifier(qualKey, qualVal)) {
3796  // cerr << "Key: " << qualKey.c_str() << "\n";
3797  // cerr << "Val: " << qualVal.c_str() << "\n";
3798  CRef<CGb_qual> pQual(new CGb_qual);
3799  pQual->SetQual(qualKey);
3800  pQual->SetVal(qualVal);
3801  fbp->quals.push_back(pQual);
3802  }
3803  }
3804 }
3805 
3806 
3807 /**********************************************************/
// Validates a /satellite qualifier value of the form "<type>[:<rest>]".
//   str   qualifier value; temporarily modified, restored before the checks
//   drop  set to true when the value is rejected; never cleared here
3808 static void fta_check_satellite(char* str, bool* drop)
3809 {
3810  char* p;
3811  Int2 i;
3812 
3813  if (! str || *str == '\0')
3814  return;
3815 
// Cut the string at the first ':' so that only the satellite type remains.
3816  p = StringChr(str, ':');
3817  if (p)
3818  *p = '\0';
3819 
// NOTE(review): listing line 3820 is absent from this copy.  It presumably
// assigns `i` from a lookup of the satellite type - confirm against the
// repository.
// Put the ':' back so the messages below show the complete value.
3821  if (p)
3822  *p = ':';
3823  if (i < 0) {
3824  ErrPostEx(SEV_REJECT, ERR_FEATURE_InvalidSatelliteType, "/satellite qualifier \"%s\" does not begin with a valid satellite type.", str);
3825  *drop = true;
// A ':' with nothing after it is rejected too.
3826  } else if (p && p[1] == '\0') {
3827  ErrPostEx(SEV_REJECT, ERR_FEATURE_NoSatelliteClassOrIdentifier, "/satellite qualifier \"%s\" does not include a class or identifier after the satellite type.", str);
3828  *drop = true;
3829  }
3830 }
3831 
3832 /**********************************************************
3833  *
3834  * int ParseFeatureBlock(ibp, deb, dbp, source, format):
3835  *
3836  * Parsing each feature sub-block, dbp, to
3837  * FeatBlkPtr, fbp.
3838  * Put warning message if bad qualifier's value or
3839  * unknown feature key found.
3840  * fdbp->drop = true, if found unknown feature key, or
3841  * do not go through 2nd time of qualifiers semantic
3842  * check (i.e. drop bad qualifier if the value is in illegal
3843  * format in the 1st time)
3844  *
3845  * 11-22-93
3846  *
3847  * The location begins at column 22, and qualifier
3848  * begin on subsequent lines at column 22, they may
3849  * extend from column 22-80.
3850  * Qualifiers take the form of a slash, "/", followed
3851  * by the qualifier name and, if applicable, an equal
3852  * sign, "=", and a value (i.e. some qualifiers only
3853  * have name w/o value, s.t. /pseudo).
3854  *
3855  * 5-4-93
3856  *
3857  **********************************************************/
// Body of the flat-file feature parser described by the header comment above
// (ParseFeatureBlock).  For every sub-block in the dbp chain it builds a
// FeatBlk (key, location, qualifiers), validates it, and returns the highest
// GB_FEAT_ERR_* code seen.
// NOTE(review): the declaration line (listing line 3858) and listing lines
// 3892, 3938, 3946, 3984, 4038 and 4046 are absent from this copy - confirm
// against the repository.
3859 {
3860  char* bptr;
3861  char* eptr;
3862  char* ptr1;
3863  char* ptr2;
3864  char* p;
3865  char* q;
3866  string loc;
3867 
3868  FeatBlkPtr fbp;
3869  Int4 num;
3870  size_t i;
3871  int retval = GB_FEAT_ERR_NONE;
3872  int ret;
3873 
// For is_mga records a whole-sequence location is prepared as a fallback.
3874  if (ibp->is_mga)
3875  loc = "1.." + to_string(ibp->bases);
3876  for (num = 0; dbp; dbp = dbp->mpNext, num++) {
// The new FeatBlk is attached to the sub-block right away.
3877  fbp = new FeatBlk;
3878  fbp->spindex = -1;
3879  fbp->num = num;
3880  dbp->mpData = fbp;
3881 
3882  bptr = dbp->mOffset;
3883  eptr = bptr + dbp->len;
3884 
// Terminate the first line temporarily so it can be passed as the message
// prefix text.  The scan has no bound: it relies on a '\n' being present.
3885  for (p = bptr; *p != '\n';)
3886  p++;
3887  *p = '\0';
3888  FtaInstallPrefix(PREFIX_FEATURE, "Parsing FT line: ", bptr);
3889  *p = '\n';
3890  ptr1 = bptr + ParFlat_COL_FEATKEY;
3891  if (*ptr1 == ' ') {
// NOTE(review): the body of this branch (listing line 3892) is absent here.
3893  }
// The feature key is the first blank-delimited token of the line.
3894  for (ptr1 = bptr; *ptr1 == ' ';)
3895  ptr1++;
3896 
3897  for (ptr2 = ptr1; *ptr2 != ' ' && *ptr2 != '\n';)
3898  ptr2++;
3899 
3900  if (StringEquN(ptr1, "- ", 2)) {
3901  ErrPostStr(SEV_WARNING, ERR_FEATURE_FeatureKeyReplaced, "Featkey '-' is replaced by 'misc_feature'");
3902  fbp->key = StringSave("misc_feature");
3903  } else
3904  fbp->key = StringSave(string_view(ptr1, ptr2 - ptr1));
3905 
// Skip to the location text that follows the key.
3906  for (ptr1 = ptr2; *ptr1 == ' ';)
3907  ptr1++;
3908  if (*ptr1 == '\n') {
// A missing location drops the feature unless the record is is_mga.
3909  if (ibp->is_mga == false) {
3910  ErrPostEx(SEV_WARNING, ERR_FEATURE_LocationParsing, "Location missing");
3911  dbp->mDrop = true;
3912  retval = GB_FEAT_ERR_DROP;
3913  continue;
3914  }
3915  } else {
// Warn when the location does not start at the expected column.
3916  i = ptr1 - bptr;
3917  if (i < ParFlat_COL_FEATDAT)
3918  ErrPostEx(SEV_WARNING, ERR_FEATURE_LocationParsing, "Location data is shifted to the left");
3919  else if (i > ParFlat_COL_FEATDAT)
3920  ErrPostEx(SEV_WARNING, ERR_FEATURE_LocationParsing, "Location data is shifted to the right");
3921  }
3922 
// The location runs up to the first '/' (start of the qualifiers) or to the
// end of the sub-block.
// NOTE(review): *ptr2 is read before ptr2 < eptr is tested - confirm the byte
// at eptr is always readable.
3923  for (ptr2 = ptr1; *ptr2 != '/' && ptr2 < eptr;)
3924  ptr2++;
3925  fbp->location = StringSave(string_view(ptr1, ptr2 - ptr1));
3926  if (ibp->is_prot)
3927  fta_strip_aa(fbp->location);
// Remove blanks and newlines from the location in place.
3928  for (p = fbp->location, q = p; *p != '\0'; p++)
3929  if (*p != ' ' && *p != '\n')
3930  *q++ = *p;
3931  *q = '\0';
3932 
// An empty location on an is_mga record falls back to "1..<bases>".
3933  if (fbp->location[0] == '\0' && ibp->is_mga) {
3934  MemFree(fbp->location);
3935  fbp->location = StringSave(loc);
3936  }
3937 
// Obsolete "allele" and "mutation" keys are rewritten to "variation".
3939  if (StringEqu(fbp->key, "allele") ||
3940  StringEqu(fbp->key, "mutation")) {
3941  ErrPostEx(SEV_ERROR, ERR_FEATURE_ObsoleteFeature, "Obsolete feature \"%s\" found. Replaced with \"variation\".", fbp->key);
3942  MemFree(fbp->key);
3943  fbp->key = StringSave("variation");
3944  }
3945 
// NOTE(review): `subtype` is presumably declared on the absent listing line
// 3946 - confirm.
3947 
// An unknown key drops the feature unless `deb` is set.
// NOTE(review): in the ErrPostEx call below fbp->key sits where the other
// calls in this function pass the format string, and "Feature dropped" follows
// it as an argument - confirm this is intended.
3948  if (subtype == CSeqFeatData::eSubtype_bad && ! deb) {
3949  ErrPostEx(SEV_ERROR, ERR_FEATURE_UnknownFeatKey, fbp->key, "Feature dropped");
3950  dbp->mDrop = true;
3951  retval = GB_FEAT_ERR_DROP;
3952  continue;
3953  }
3954 
3955  if (*ptr2 == '/') /* qualifier start in first "/" */
3956  {
3957  ParseQualifiers(fbp, ptr2, eptr, format);
3958 
// /gap_type and /assembly_evidence are rejected on every feature except
// assembly_gap; the whole entry is marked dropped.
3959  if (! StringEqu(fbp->key, "assembly_gap")) {
3960  for (const auto& cur : fbp->quals) {
3961  const string& cur_qual = cur->GetQual();
3962  if (cur_qual == "gap_type" ||
3963  cur_qual == "assembly_evidence") {
3964  ErrPostEx(SEV_REJECT, ERR_FEATURE_InvalidQualifier, "Qualifier /%s is invalid for the feature \"%s\" at \"%s\".", cur_qual.c_str(), fbp->key, fbp->location ? fbp->location : "Unknown");
3965  ibp->drop = true;
3966  }
3967  }
3968  }
3969 
// /submitter_seqid is rejected on every feature except source.
3970  if (! StringEqu(fbp->key, "source")) {
3971  for (const auto& cur : fbp->quals) {
3972  const string& cur_qual = cur->GetQual();
3973  if (cur_qual == "submitter_seqid") {
3974  ErrPostEx(SEV_REJECT, ERR_FEATURE_InvalidQualifier, "Qualifier /%s is invalid for the feature \"%s\" at \"%s\".", cur_qual.c_str(), fbp->key, fbp->location ? fbp->location : "Unknown");
3975  ibp->drop = true;
3976  }
3977  }
3978  }
3979 
3980  fbp = MergeNoteQual(fbp); /* allow more than one
3981  notes w/i a key */
3982 
3983  if (subtype == CSeqFeatData::eSubtype_bad) {
// NOTE(review): listing line 3984 is absent from this copy.
3985  ret = GB_FEAT_ERR_REPAIRABLE;
3986  } else {
3987  /* last argument is perform_corrections if debug
3988  * mode is FALSE
3989  */
3990  ret = XGBFeatKeyQualValid(subtype, fbp->quals, true, (source == Parser::ESource::Flybase ? false : ! deb));
3991  }
// retval keeps the highest code seen so far.
3992  if (ret > retval)
3993  retval = ret;
3994 
// Anything worse than "repairable" drops the feature, except for ncRNA.
3995  if (ret > GB_FEAT_ERR_REPAIRABLE && ! StringEqu(fbp->key, "ncRNA"))
3996  dbp->mDrop = true;
// NOTE(review): this branch reports a missing required qualifier but tests
// subtype == eSubtype_bad - confirm the comparison is the intended one.
3997  } else if (subtype == CSeqFeatData::eSubtype_bad && ! CSeqFeatData::GetMandatoryQualifiers(subtype).empty()) {
3998  if (! StringEqu(fbp->key, "mobile_element")) {
// Only the first mandatory qualifier is named in the message.
3999  auto qual_idx = *CSeqFeatData::GetMandatoryQualifiers(subtype).begin();
4000  string str1 = CSeqFeatData::GetQualifierAsString(qual_idx);
4001  const char* str = str1.c_str();
// old_sequence and conflict are exempt when the required qualifier is
// /citation.
4002  if ((! StringEqu(fbp->key, "old_sequence") &&
4003  ! StringEqu(fbp->key, "conflict")) ||
4004  ! StringEqu(str, "citation")) {
4005  ErrPostEx(SEV_ERROR, ERR_FEATURE_RequiredQualifierMissing, "lacks required /%s qualifier : feature has been dropped.", str);
4006  if (! deb) {
4007  dbp->mDrop = true;
4008  retval = GB_FEAT_ERR_DROP;
4009  }
4010  }
4011  }
// A misc_feature without qualifiers is dropped, or flagged repairable when
// `deb` is set.
4012  } else if (StringEqu(fbp->key, "misc_feature") && fbp->quals.empty()) {
4013  if (! deb) {
4014  dbp->mDrop = true;
4015  retval = GB_FEAT_ERR_DROP;
4016  ErrPostStr(SEV_WARNING, ERR_FEATURE_Dropped, "Empty 'misc_feature' dropped");
4017  } else
4018  retval = GB_FEAT_ERR_REPAIRABLE;
4019  }
4020 
// Final pass over the qualifier values: each is passed through ShrinkSpaces
// on a private, NUL-terminated copy.
4021  for (auto& cur : fbp->quals) {
4022  if (! cur->IsSetQual() || ! cur->IsSetVal())
4023  continue;
4024 
4025  const string& qual_str = cur->GetQual();
4026  const string& val_str = cur->GetVal();
4027 
4028  vector<Char> val_buf(val_str.begin(), val_str.end());
4029  val_buf.push_back(0);
4030 
4031  p = &val_buf[0];
4032  ShrinkSpaces(p);
// A value that ends up empty is reset, except for /replace, which may
// legitimately be empty.
4033  if (*p == '\0' && qual_str != "replace") {
4034  cur->ResetVal();
4035  val_buf[0] = 0;
4036  } else {
4037  if (qual_str == "replace")
// NOTE(review): the statement governed by the "if" above (listing line 4038)
// is absent from this copy; as printed, the "if" would govern the SetVal
// call - confirm against the repository.
4039  cur->SetVal(p);
4040  }
4041 
4042  if (qual_str == "satellite")
4043  fta_check_satellite(&val_buf[0], &ibp->drop);
4044  }
4045  } /* for, each sub-block, or each feature key */
// NOTE(review): listing line 4046 is absent from this copy.
4047  return (retval);
4048 }
4049 
4050 /**********************************************************/
// Body of the qualifier clean-up routine that the XML feature parser calls as
// XMLCheckQualifiers(fbp).  It normalizes qualifier values, warns about
// qualifier-like text inside notes, and removes qualifiers that lack a
// required value.
// NOTE(review): the declaration line (listing line 4051) is absent from this
// copy - confirm against the repository.
4052 {
4053  const char** b;
4054  char* p;
4055  Char ch;
4056 
4057  if (! fbp || fbp->quals.empty())
4058  return;
4059 
// The iterator is advanced by hand because qualifiers may be erased below.
4060  for (TQualVector::iterator cur = fbp->quals.begin(); cur != fbp->quals.end();) {
// NOTE(review): GetQual() is called here without an IsSetQual() check.
4061  const string& qual_str = (*cur)->GetQual();
4062 
4063  if ((*cur)->IsSetVal()) {
// Edit a private, NUL-terminated copy of the value.
4064  const string& val_str = (*cur)->GetVal();
4065  vector<Char> val_buf(val_str.begin(), val_str.end());
4066  val_buf.push_back(0);
4067 
// Per-qualifier clean-up of the value.
4068  if (qual_str == "translation") {
4069  DelCharBtwData(&val_buf[0]);
4070  } else if (qual_str == "rpt_unit") {
4071  fta_convert_to_lower_case(&val_buf[0]);
4072  } else if (qual_str == "cons_splice") {
4073  fta_process_con_slice(val_buf);
4074  } else if (qual_str == "note") {
// Look at the text after every '/' in the note; when CheckLegalQual accepts
// it, warn that the note seems to contain another qualifier.
4075  for (p = &val_buf[0];;) {
4076  p = StringChr(p, '/');
4077  if (! p)
4078  break;
4079  p++;
4080  if (! CheckLegalQual(p, ' ', nullptr))
4081  continue;
4082 
// Show at most the first 30 characters of the note; the byte overwritten for
// the cut is restored after the message.
4083  if (val_buf.size() > 30) {
4084  ch = val_buf[30];
4085  val_buf[30] = '\0';
4086  } else
4087  ch = '\0';
4088  ErrPostEx(SEV_WARNING, ERR_QUALIFIER_EmbeddedQual, "/note qualifier value appears to contain other qualifiers : [%s%s].", &val_buf[0], (ch == '\0') ? "" : " ...");
4089  if (ch != '\0')
4090  val_buf[30] = ch;
4091  }
4092  }
4093 
// A value made up only of quotes, blanks and tabs counts as empty.
4094  for (p = &val_buf[0]; *p == '\"' || *p == ' ' || *p == '\t';)
4095  p++;
4096 
// /replace keeps an explicit empty value; every other qualifier has its value
// reset.  A non-empty value is stored back from the edited copy.
4097  if (*p == '\0') {
4098  if (qual_str == "replace") {
4099  (*cur)->SetVal("");
4100  } else
4101  (*cur)->ResetVal();
4102  } else
4103  (*cur)->SetVal(&val_buf[0]);
4104  }
4105 
// Look the name up in EmptyQuals; *b is null when it is not listed there.
4106  for (b = EmptyQuals; *b; b++)
4107  if (qual_str == *b)
4108  break;
4109 
4110  if (! *b) {
// Not in EmptyQuals: a qualifier without a value is reported and erased.
4111  if (! (*cur)->IsSetVal()) {
4112  if (qual_str == "old_locus_tag")
4113  ErrPostEx(SEV_ERROR, ERR_FEATURE_EmptyOldLocusTag, "Feature \"%s\" at \"%s\" has an /old_locus_tag qualifier with no value. Qualifier has been dropped.", fbp->key ? fbp->key : "Unknown", fbp->location ? fbp->location : "Empty");
4114  else
4115  ErrPostEx(SEV_WARNING, ERR_QUALIFIER_EmptyQual, "Qualifier /%s ignored because it lacks a data value. Feature \"%s\", location \"%s\".", qual_str.c_str(), fbp->key ? fbp->key : "Unknown", fbp->location ? fbp->location : "Empty");
4116 
4117  cur = fbp->quals.erase(cur);
4118  continue;
4119  }
// In EmptyQuals: any value that is present is reported and discarded.
4120  } else if ((*cur)->IsSetVal()) {
4121  ErrPostEx(SEV_WARNING, ERR_QUALIFIER_ShouldNotHaveValue, "Qualifier /%s should not have data value. Qualifier value has been ignored. Feature \"%s\", location \"%s\".", qual_str.c_str(), fbp->key ? fbp->key : "Unknown", fbp->location ? fbp->location : "Empty");
4122 
4123  (*cur)->ResetVal();
4124  }
4125 
// Double quotes inside a note are replaced by single quotes.
4126  if ((*cur)->IsSetVal() && qual_str == "note") {
4127  string val = (*cur)->GetVal();
4128  std::replace(val.begin(), val.end(), '\"', '\'');
4129  (*cur)->SetVal(val);
4130  }
4131 
4132  ++cur;
4133  }
4134 }
4135 
4136 /**********************************************************/
// Body of a feature validation routine for blocks whose FeatBlk already
// exists in dbp->mpData (presumably the XML counterpart of ParseFeatureBlock;
// the function name is not visible in this copy).  It returns the highest
// GB_FEAT_ERR_* code seen.
// NOTE(review): the declaration line (listing line 4137) and listing lines
// 4151, 4166, 4180, 4198, 4253 and 4258 are absent from this copy - confirm
// against the repository.
4138 {
4139  FeatBlkPtr fbp;
4140  char* p;
4141  Int4 num;
4142  Int2 keyindx;
4143  int retval = GB_FEAT_ERR_NONE;
4144  int ret = 0;
4145 
4146  for (num = 0; dbp; dbp = dbp->mpNext, num++) {
// Blocks without an attached FeatBlk are skipped but still counted in num.
4147  if (! dbp->mpData)
4148  continue;
4149  fbp = static_cast<FeatBlk*>(dbp->mpData);
4150  fbp->num = num;
// NOTE(review): listing line 4151 is absent from this copy.
4152 
// A key consisting of a single '-' is rewritten to misc_feature.
4153  if (fbp->key[0] == '-' && fbp->key[1] == '\0') {
4154  ErrPostStr(SEV_WARNING, ERR_FEATURE_FeatureKeyReplaced, "Featkey '-' is replaced by 'misc_feature'");
4155  MemFree(fbp->key);
4156  fbp->key = StringSave("misc_feature");
4157  }
4158 
// Obsolete "allele" and "mutation" keys are rewritten to "variation".
4159  if (StringEqu(fbp->key, "allele") ||
4160  StringEqu(fbp->key, "mutation")) {
4161  ErrPostEx(SEV_ERROR, ERR_FEATURE_ObsoleteFeature, "Obsolete feature \"%s\" found. Replaced with \"variation\".", fbp->key);
4162  MemFree(fbp->key);
4163  fbp->key = StringSave("variation");
4164  }
4165 
// NOTE(review): `subtype` is presumably declared on the absent listing line
// 4166 - confirm.
4167 
4168  /* bsv hack: exclude CONFLICT, REGION, SITE, UNSURE UniProt flatfile
4169  * features from valid GenBank ones: for USPTO only
4170  * Needs better workaround
4171  */
4172  if (source == Parser::ESource::USPTO &&
4173  (subtype == CSeqFeatData::eSubtype_conflict ||
4174  subtype == CSeqFeatData::eSubtype_region ||
4175  subtype == CSeqFeatData::eSubtype_site ||
4176  subtype == CSeqFeatData::eSubtype_unsure))
4177  subtype = CSeqFeatData::eSubtype_bad;
4178  keyindx = -1;
4179  if (subtype == CSeqFeatData::eSubtype_bad && ! deb) {
// NOTE(review): listing line 4180 is absent; it may guard the lookup below.
4181  keyindx = SpFeatKeyNameValid(fbp->key);
// The key is unknown to SpFeatKeyNameValid as well: drop the feature.
// NOTE(review): in the ErrPostEx call below fbp->key sits where the other
// calls in this function pass the format string - confirm this is intended.
4182  if (keyindx < 0 && ! deb) {
4183  ErrPostEx(SEV_ERROR, ERR_FEATURE_UnknownFeatKey, fbp->key, "Feature dropped");
4184  dbp->mDrop = true;
4185  retval = GB_FEAT_ERR_DROP;
4186  continue;
4187  }
4188  fbp->spindex = keyindx;
4189  }
4190 
4191  if (! fbp->quals.empty()) {
4192  XMLCheckQualifiers(fbp);
4193  fbp = MergeNoteQual(fbp); /* allow more than one
4194  notes w/i a key */
4195 
4196  if (subtype == CSeqFeatData::eSubtype_bad) {
4197  if (keyindx < 0) {
// NOTE(review): listing line 4198 is absent from this copy.
4199  ret = GB_FEAT_ERR_REPAIRABLE;
4200  }
// Known subtype that did not get an spindex above: run the qualifier
// validation.
4201  } else if (fbp->spindex < 0) {
4202  /* last argument is perform_corrections if debug
4203  * mode is FALSE
4204  */
4205  ret = XGBFeatKeyQualValid(subtype, fbp->quals, true, ((source == Parser::ESource::Flybase) ? false : ! deb));
4206  }
// NOTE(review): `ret` is not reset per feature, so a feature that takes
// neither branch above reuses the previous feature's value - confirm intended.
4207  if (ret > retval)
4208  retval = ret;
4209 
// Anything worse than "repairable" drops the feature, except for ncRNA.
4210  if (ret > GB_FEAT_ERR_REPAIRABLE && ! StringEqu(fbp->key, "ncRNA"))
4211  dbp->mDrop = true;
// NOTE(review): this branch reports a missing required qualifier but tests
// subtype == eSubtype_bad - confirm the comparison is the intended one.
4212  } else if (subtype == CSeqFeatData::eSubtype_bad && ! CSeqFeatData::GetMandatoryQualifiers(subtype).empty()) {
4213  if (! StringEqu(fbp->key, "mobile_element")) {
// Only the first mandatory qualifier is named in the message.
4214  auto qual_idx = *CSeqFeatData::GetMandatoryQualifiers(subtype).begin();
4215  string str1 = CSeqFeatData::GetQualifierAsString(qual_idx);
4216  const char* str = str1.c_str();
// old_sequence and conflict are exempt when the required qualifier is
// /citation.
4217  if ((! StringEqu(fbp->key, "old_sequence") &&
4218  ! StringEqu(fbp->key, "conflict")) ||
4219  ! StringEqu(str, "citation")) {
4220  ErrPostEx(SEV_ERROR, ERR_FEATURE_RequiredQualifierMissing, "lacks required /%s qualifier : feature has been dropped.", str);
4221  if (! deb) {
4222  dbp->mDrop = true;
4223  retval = GB_FEAT_ERR_DROP;
4224  }
4225  }
4226  }
// NOTE(review): this chain is only entered when fbp->quals is empty, so the
// quals.empty() test below always holds here.
4227  } else if (StringEqu(fbp->key, "misc_feature") && fbp->quals.empty()) {
4228  if (! deb) {
4229  dbp->mDrop = true;
4230  retval = GB_FEAT_ERR_DROP;
4231  ErrPostStr(SEV_WARNING, ERR_FEATURE_Dropped, "Empty 'misc_feature' dropped");
4232  } else
4233  retval = GB_FEAT_ERR_REPAIRABLE;
4234  }
4235 
// Final pass over the qualifier values: each is passed through ShrinkSpaces
// on a private, NUL-terminated copy.
4236  for (auto& cur : fbp->quals) {
4237  if (! cur->IsSetQual() || ! cur->IsSetVal())
4238  continue;
4239 
4240  const string& qual_str = cur->GetQual();
4241  const string& val_str = cur->GetVal();
4242 
4243  vector<Char> val_buf(val_str.begin(), val_str.end());
4244  val_buf.push_back(0);
4245 
4246  p = &val_buf[0];
4247  ShrinkSpaces(p);
// A value that ends up empty is reset, except for /replace.
4248  if (*p == '\0' && qual_str != "replace") {
4249  cur->ResetVal();
4250  val_buf[0] = 0;
4251  } else {
4252  if (qual_str == "replace")
// NOTE(review): the statement governed by the "if" above (listing line 4253)
// is absent from this copy; as printed, the "if" would govern the SetVal
// call - confirm against the repository.
4254  cur->SetVal(p);
4255  }
4256  }
4257  } /* for, each sub-block, or each feature key */
// NOTE(review): listing line 4258 is absent from this copy.
4259  return (retval);
4260 }
4261 
4262 /**********************************************************/
4263 static bool fta_check_ncrna(const CSeq_feat& feat)
4264 {
4265  int count = 0;
4266 
4267  bool stop = false;
4268  for (const auto& qual : feat.GetQual()) {
4269  if (! qual->IsSetQual() || qual->GetQual().empty() ||
4270  qual->GetQual() != "ncRNA_class")
4271  continue;
4272 
4273  count++;
4274 
4275  if (! qual->IsSetVal() || qual->GetVal().empty()) {
4276  string loc = location_to_string_or_unknown(feat.GetLocation());
4277 
4278  ErrPostEx(SEV_REJECT, ERR_FEATURE_ncRNA_class, "Feature \"ncRNA\" at location \"%s\" has an empty /ncRNA_class qualifier.", loc.empty() ? "unknown" : loc.c_str());
4279  stop = true;
4280  break;
4281  }
4282 
4283  if (MatchArrayString(ncRNA_class_values, qual->GetVal().c_str()) < 0) {
4284  string loc = location_to_string_or_unknown(feat.GetLocation());
4285 
4286  ErrPostEx(SEV_REJECT, ERR_FEATURE_ncRNA_class, "Feature \"ncRNA\" at location \"%s\" has an invalid /ncRNA_class qualifier: \"%s\".", loc.empty() ? "unknown" : loc.c_str(), qual->GetVal().c_str());
4287  stop = true;
4288  break;
4289  }
4290  }
4291 
4292  if (stop)
4293  return false;
4294 
4295  if (count == 1)
4296  return true;
4297 
4298  string loc = location_to_string_or_unknown(feat.GetLocation());
4299 
4300  ErrPostEx(SEV_REJECT, ERR_FEATURE_ncRNA_class, "Feature \"ncRNA\" at location \"%s\" %s /ncRNA_class qualifier.", loc.empty() ? "unknown" : loc.c_str(), (count == 0) ? "lacks the mandatory" : "has more than one");
4301 
4302  return false;
4303 }
4304 
4305 /**********************************************************/
4307 {
/* NOTE(review): the listing line that carries this function's signature
 * (4306) is absent from this view. The body below operates on a feature
 * `feat` (modified through SetQual/SetExcept/SetExcept_text) and on a
 * C string `key` that is only used in the error messages.
 *
 * Purpose: process the first qualifier named "artificial_location" and
 * then remove that qualifier from the feature. */
4308  for (auto qual = feat.SetQual().begin(); qual != feat.SetQual().end(); ++qual) {
4309  if (! (*qual)->IsSetQual() || (*qual)->GetQual() != "artificial_location")
4310  continue;
4311 
/* A value that consists of double quotes only is treated as no value. */
4312  if ((*qual)->IsSetVal()) {
4313  const Char* p_val = (*qual)->GetVal().c_str();
4314  for (; *p_val == '\"';)
4315  ++p_val;
4316 
4317  if (*p_val == '\0')
4318  (*qual)->ResetVal();
4319  }
4320 
4321  string val = (*qual)->IsSetVal() ? (*qual)->GetVal() : "";
4322 
/* The two accepted values mark the feature as an exception; the value is
 * stored as the exception text, or appended to an existing one after ", ". */
4323  if (val == "heterogenous population sequenced" ||
4324  val == "low-quality sequence region") {
4325  feat.SetExcept(true);
4326 
4327  if (! feat.IsSetExcept_text())
4328  feat.SetExcept_text(val);
4329  else {
4330  string& except_text = feat.SetExcept_text();
4331  except_text += ", ";
4332  except_text += val;
4333  }
4334  } else {
/* Any other value (including an empty one) is reported as an error. */
4335  auto loc_str = location_to_string_or_unknown(feat.GetLocation());
4336 
4337  if (val.empty())
4338  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_InvalidArtificialLoc, "Encountered empty /artificial_location qualifier : Feature \"%s\" : Location \"%s\". Qualifier dropped.", (! key || *key == '\0') ? "unknown" : key, loc_str.empty() ? "unknown" : loc_str.c_str());
4339  else
4340  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_InvalidArtificialLoc, "Value \"%s\" is not legal for the /artificial_location qualifier : Feature \"%s\" : Location \"%s\". Qualifier dropped.", val.c_str(), (! key || *key == '\0') ? "unknown" : key, loc_str.empty() ? "unknown" : loc_str.c_str());
4341  }
4342 
/* The qualifier is removed in both cases. Leaving the loop right after the
 * erase means the erased iterator is never used again, and only the first
 * "artificial_location" qualifier is ever handled. */
4343  feat.SetQual().erase(qual);
4344  break;
4345  }
4346 }
4347 
4348 /**********************************************************/
4349 static bool fta_check_mobile_element(const CSeq_feat& feat)
4350 {
4351  bool found = false;
4352  for (const auto& qual : feat.GetQual()) {
4353  if (qual->IsSetQual() && qual->GetQual() == "mobile_element_type" &&
4354  qual->IsSetVal() && ! qual->GetVal().empty()) {
4355  const Char* p_val = qual->GetVal().c_str();
4356  for (; *p_val == '\"';)
4357  ++p_val;
4358 
4359  if (*p_val != '\0') {
4360  found = true;
4361  break;
4362  }
4363  }
4364  }
4365 
4366  if (found)
4367  return true;
4368 
4369  auto loc_str = location_to_string_or_unknown(feat.GetLocation());
4370  ErrPostEx(SEV_REJECT, ERR_FEATURE_RequiredQualifierMissing, "Mandatory qualifier /mobile_element_type is absent or has no value : Feature \"mobile_element\" : Location \"%s\". Entry dropped.", loc_str.empty() ? "unknown" : loc_str.c_str());
4371 
4372  return false;
4373 }
4374 
4375 /**********************************************************/
4376 static bool SortFeaturesByLoc(const DataBlkPtr& sp1, const DataBlkPtr& sp2)
4377 {
4378  FeatBlkPtr fbp1;
4379  FeatBlkPtr fbp2;
4380  Int4 status;
4381 
4382  fbp1 = static_cast<FeatBlk*>(sp1->mpData);
4383  fbp2 = static_cast<FeatBlk*>(sp2->mpData);
4384 
4385  if (! fbp1->location && fbp2->location)
4386  return false;
4387  if (fbp1->location && ! fbp2->location)
4388  return false;
4389 
4390  if (fbp1->location && fbp2->location) {
4391  status = StringCmp(fbp1->location, fbp2->location);
4392  if (status != 0)
4393  return status < 0;
4394  }
4395 
4396  if (! fbp1->key && fbp2->key)
4397  return false;
4398  if (fbp1->key && ! fbp2->key)
4399  return false;
4400  if (fbp1->key && fbp2->key) {
4401  status = StringCmp(fbp1->key, fbp2->key);
4402  if (status != 0)
4403  return status < 0;
4404  }
4405 
4406  return false;
4407 }
4408 
4409 /**********************************************************/
4410 static bool SortFeaturesByOrder(const DataBlkPtr& sp1, const DataBlkPtr& sp2)
4411 {
4412  FeatBlkPtr fbp1;
4413  FeatBlkPtr fbp2;
4414 
4415  fbp1 = static_cast<FeatBlk*>(sp1->mpData);
4416  fbp2 = static_cast<FeatBlk*>(sp2->mpData);
4417 
4418  return fbp1->num < fbp2->num;
4419 }
4420 
4421 /**********************************************************/
4422 static DataBlkPtr fta_sort_features(DataBlkPtr dbp, bool order)
4423 {
4424  size_t total = 0;
4425  for (DataBlkPtr tdbp = dbp; tdbp; tdbp = tdbp->mpNext)
4426  total++;
4427 
4428  vector<DataBlk*> temp;
4429  temp.reserve(total);
4430  for (DataBlkPtr tdbp = dbp; tdbp; tdbp = tdbp->mpNext)
4431  temp.push_back(tdbp);
4432 
4433  std::sort(temp.begin(), temp.end(), (order ? SortFeaturesByOrder : SortFeaturesByLoc));
4434 
4435  DataBlkPtr tdbp = dbp = temp[0];
4436  for (size_t i = 0; i < total - 1; tdbp = tdbp->mpNext, i++)
4437  tdbp->mpNext = temp[i + 1];
4438 
4439  temp[total - 1]->mpNext = nullptr;
4440  return (dbp);
4441 }
4442 
4443 /**********************************************************/
4444 static void fta_convert_to_regulatory(FeatBlkPtr fbp, const char* rclass)
4445 {
4446  if (! fbp || ! fbp->key || ! rclass)
4447  return;
4448 
4449  if (fbp->key)
4450  MemFree(fbp->key);
4451  fbp->key = StringSave("regulatory");
4452 
4453  CRef<CGb_qual> qual(new CGb_qual);
4454  qual->SetQual("regulatory_class");
4455  qual->SetVal(rclass);
4456  fbp->quals.push_back(qual);
4457 }
4458 
4459 /**********************************************************/
4460 static void fta_check_replace_regulatory(DataBlkPtr dbp, bool* drop)
4461 {
4462  FeatBlkPtr fbp;
4463  const char** b;
4464  char* p;
4465  bool got_note;
4466  bool other_class;
4467  Int4 count;
4468  Char ch;
4469 
4470  for (; dbp; dbp = dbp->mpNext) {
4471  fbp = static_cast<FeatBlk*>(dbp->mpData);
4472  if (! fbp || ! fbp->key)
4473  continue;
4474 
4475  if (StringEqu(fbp->key, "attenuator"))
4476  fta_convert_to_regulatory(fbp, "attenuator");
4477  else if (StringEqu(fbp->key, "CAAT_signal"))
4478  fta_convert_to_regulatory(fbp, "CAAT_signal");
4479  else if (StringEqu(fbp->key, "enhancer"))
4480  fta_convert_to_regulatory(fbp, "enhancer");
4481  else if (StringEqu(fbp->key, "GC_signal"))
4482  fta_convert_to_regulatory(fbp, "GC_signal");
4483  else if (StringEqu(fbp->key, "-35_signal"))
4484  fta_convert_to_regulatory(fbp, "minus_35_signal");
4485  else if (StringEqu(fbp->key, "-10_signal"))
4486  fta_convert_to_regulatory(fbp, "minus_10_signal");
4487  else if (StringEqu(fbp->key, "polyA_signal"))
4488  fta_convert_to_regulatory(fbp, "polyA_signal_sequence");
4489  else if (StringEqu(fbp->key, "promoter"))
4490  fta_convert_to_regulatory(fbp, "promoter");
4491  else if (StringEqu(fbp->key, "RBS"))
4492  fta_convert_to_regulatory(fbp, "ribosome_binding_site");
4493  else if (StringEqu(fbp->key, "TATA_signal"))
4494  fta_convert_to_regulatory(fbp, "TATA_box");
4495  else if (StringEqu(fbp->key, "terminator"))
4496  fta_convert_to_regulatory(fbp, "terminator");
4497  else if (! StringEqu(fbp->key, "regulatory"))
4498  continue;
4499 
4500  got_note = false;
4501  other_class = false;
4502  count = 0;
4503 
4504  for (const auto& cur : fbp->quals) {
4505  if (! cur->IsSetQual() || ! cur->IsSetVal())
4506  continue;
4507 
4508  const string& qual_str = cur->GetQual();
4509 
4510  if (qual_str != "regulatory_class") {
4511  if (qual_str == "note")
4512  got_note = true;
4513  continue;
4514  }
4515 
4516  count++;
4517  if (! cur->IsSetVal() || cur->GetVal().empty()) {
4518  ch = '\0';
4519  if (! fbp->location || *fbp->location == '\0')
4520  p = (char*)"(empty)";
4521  else {
4522  p = fbp->location;
4523  if (StringLen(p) > 50) {
4524  ch = p[50];
4525  p[50] = '\0';
4526  }
4527  }
4528  ErrPostEx(SEV_REJECT, ERR_QUALIFIER_InvalidRegulatoryClass, "Empty /regulatory_class qualifier value in regulatory feature at location %s.", p);
4529  if (ch != '\0')
4530  p[50] = ch;
4531  *drop = true;
4532  continue;
4533  }
4534 
4535  const string& val_str = cur->GetVal();
4536 
4537  for (b = RegulatoryClassValues; *b; b++)
4538  if (val_str == *b)
4539  break;
4540 
4541  if (*b) {
4542  if (val_str == "other")
4543  other_class = true;
4544  continue;
4545  }
4546 
4547  ch = '\0';
4548  if (! fbp->location || *fbp->location == '\0')
4549  p = (char*)"(empty)";
4550  else {
4551  p = fbp->location;
4552  if (StringLen(p) > 50) {
4553  ch = p[50];
4554  p[50] = '\0';
4555  }
4556  }
4557  ErrPostEx(SEV_REJECT, ERR_QUALIFIER_InvalidRegulatoryClass, "Invalid /regulatory_class qualifier value %s provided in regulatory feature at location %s.", val_str.c_str(), p);
4558  if (ch != '\0')
4559  p[50] = ch;
4560  *drop = true;
4561  }
4562 
4563  if (count == 0) {
4564  ch = '\0';
4565  if (! fbp->location || *fbp->location == '\0')
4566  p = (char*)"(empty)";
4567  else {
4568  p = fbp->location;
4569  if (StringLen(p) > 50) {
4570  ch = p[50];
4571  p[50] = '\0';
4572  }
4573  }
4574  ErrPostEx(SEV_REJECT, ERR_QUALIFIER_MissingRegulatoryClass, "The regulatory feature is missing mandatory /regulatory_class qualifier at location %s.", p);
4575  if (ch != '\0')
4576  p[50] = ch;
4577  *drop = true;
4578  } else if (count > 1) {
4579  ch = '\0';
4580  if (! fbp->location || *fbp->location == '\0')
4581  p = (char*)"(empty)";
4582  else {
4583  p = fbp->location;
4584  if (StringLen(p) > 50) {
4585  ch = p[50];
4586  p[50] = '\0';
4587  }
4588  }
4589  ErrPostEx(SEV_REJECT, ERR_QUALIFIER_MultipleRegulatoryClass, "Multiple /regulatory_class qualifiers were encountered in regulatory feature at location %s.", p);
4590  if (ch != '\0')
4591  p[50] = ch;
4592  *drop = true;
4593  }
4594 
4595  if (other_class && ! got_note) {
4596  ch = '\0';
4597  if (! fbp->location || *fbp->location == '\0')
4598  p = (char*)"(empty)";
4599  else {
4600  p = fbp->location;
4601  if (StringLen(p) > 50) {
4602  ch = p[50];
4603  p[50] = '\0';
4604  }
4605  }
4606  ErrPostEx(SEV_REJECT, ERR_QUALIFIER_NoNoteForOtherRegulatory, "The regulatory feature of class other is lacking required /note qualifier at location %s.", p);
4607  if (ch != '\0')
4608  p[50] = ch;
4609  *drop = true;
4610  }
4611  }
4612 }
4613 
4614 /**********************************************************/
4615 static void fta_create_wgs_dbtag(CBioseq& bioseq,
4616  const string& submitter_seqid,
4617  char* prefix,
4618  Int4 seqtype)
4619 {
4620  string dbname;
4621  if (seqtype == 0 || seqtype == 1 || seqtype == 7)
4622  dbname = "WGS:";
4623  else if (seqtype == 4 || seqtype == 5 || seqtype == 8 || seqtype == 9)
4624  dbname = "TSA:";
4625  else
4626  dbname = "TLS:";
4627  dbname += prefix;
4628 
4629  CRef<CSeq_id> gen_id(new CSeq_id);
4630  CDbtag& tag = gen_id->SetGeneral();
4631  tag.SetTag().SetStr(submitter_seqid);
4632  tag.SetDb(dbname);
4633  bioseq.SetId().push_back(gen_id);
4634 }
4635 
4636 /**********************************************************/
4637 static void fta_create_wgs_seqid(CBioseq& bioseq,
4638  IndexblkPtr ibp,
4640 {
/* Tries to turn ibp->submitter_seqid into a general Seq-id on `bioseq`
 * (through fta_create_wgs_dbtag). A project prefix is looked for, in order,
 * in the primary accession, the secondary accessions, and the accessions
 * referenced by the delta sequence. When none qualifies, an error is posted
 * and, on the REJECT paths, ibp->drop is set.
 *
 * NOTE(review): listing lines 4639, 4719 and 4772 are absent from this
 * view. 4639 presumably holds the remaining parameter (the caller passes
 * pp->source as third argument), 4719 presumably declares the `loc` used on
 * 4720, and 4772 presumably opens the conditional block closed on 4775.
 * Confirm against the full file. */
4641  TokenBlkPtr tbp;
4642  char* prefix;
4643  char* p;
4644  Int4 seqtype;
4645  Int4 i;
4646 
4647  if (! ibp || ibp->submitter_seqid.empty())
4648  return;
4649 
4650  prefix = nullptr;
4651 
/* The numeric codes returned by fta_if_wgs_acc are defined elsewhere; the
 * message below shows that this first group is treated as master records. */
4652  seqtype = fta_if_wgs_acc(ibp->acnum);
4653  if (seqtype == 0 || seqtype == 3 || seqtype == 4 || seqtype == 6 ||
4654  seqtype == 10 || seqtype == 12) {
4655  ErrPostEx(SEV_REJECT, ERR_SOURCE_SubmitterSeqidNotAllowed, "WGS/TLS/TSA master records are not allowed to have /submitter_seqid qualifiers, only contigs and scaffolds. Entry dropped.");
4656  ibp->drop = true;
4657  return;
4658  }
4659 
/* Primary accession qualifies directly: the prefix is the accession cut to
 * 6 characters when its fifth character is a digit, otherwise to 8. */
4660  if (seqtype == 1 || seqtype == 5 || seqtype == 7 || seqtype == 8 ||
4661  seqtype == 9 || seqtype == 11) {
4662  prefix = StringSave(ibp->acnum);
4663  if (prefix[4] >= '0' && prefix[4] <= '9')
4664  prefix[6] = '\0';
4665  else
4666  prefix[8] = '\0';
4667  fta_create_wgs_dbtag(bioseq, ibp->submitter_seqid, prefix, seqtype);
4668  MemFree(prefix);
4669  return;
4670  }
4671 
/* Secondary accessions: tokens starting with '-' are skipped. The first
 * remaining token becomes the candidate prefix; the loop stops early (tbp
 * stays non-null) as soon as a later token does not share its first 6 or 8
 * characters. */
4672  for (tbp = ibp->secaccs; tbp; tbp = tbp->next) {
4673  if (tbp->str[0] == '-')
4674  continue;
4675 
4676  if (! prefix)
4677  prefix = StringSave(tbp->str);
4678  else {
4679  i = (prefix[4] >= '0' && prefix[4] <= '9') ? 6 : 8;
4680  if (! StringEquN(prefix, tbp->str, i))
4681  break;
4682  }
4683  }
4684 
/* Only used when every secondary accession agreed on the prefix. */
4685  if (! tbp && prefix) {
4686  seqtype = fta_if_wgs_acc(prefix);
4687  if (seqtype == 0 || seqtype == 1 || seqtype == 4 || seqtype == 5 ||
4688  seqtype == 7 || seqtype == 8 || seqtype == 9 || seqtype == 10 ||
4689  seqtype == 11) {
4690  if (prefix[4] >= '0' && prefix[4] <= '9')
4691  prefix[6] = '\0';
4692  else
4693  prefix[8] = '\0';
4694  fta_create_wgs_dbtag(bioseq, ibp->submitter_seqid, prefix, seqtype);
4695  MemFree(prefix);
4696  return;
4697  }
4698  }
4699 
4700  if (prefix) {
4701  MemFree(prefix);
4702  prefix = nullptr;
4703  }
4704 
/* Delta sequences: the same "all accessions share one prefix" test, applied
 * to the accessions of the locations referenced by the delta elements. */
4705  if (bioseq.GetInst().IsSetExt() && bioseq.GetInst().GetExt().IsDelta()) {
4706  CDelta_ext::Tdata deltas = bioseq.GetInst().GetExt().GetDelta();
4707  CDelta_ext::Tdata::iterator delta;
4708 
4709  for (delta = deltas.begin(); delta != deltas.end(); delta++) {
4710  const CSeq_id* id = nullptr;
4711 
4712  if (! (*delta)->IsLoc())
4713  continue;
4714 
4715  const CSeq_loc& locs = (*delta)->GetLoc();
4716  CSeq_loc_CI ci(locs);
4717 
4718  for (; ci; ++ci) {
4720  if (! loc->IsInt())
4721  continue;
/* `id` is the address of a reference, so the null test below cannot fire. */
4722  id = &ci.GetSeq_id();
4723  if (! id)
4724  break;
4725  if (! id->IsGenbank() && ! id->IsEmbl() && ! id->IsDdbj() &&
4726  ! id->IsOther() && ! id->IsTpg() && ! id->IsTpe() &&
4727  ! id->IsTpd())
4728  break;
4729 
4730  const CTextseq_id* text_id = id->GetTextseq_Id();
4731  if (! text_id || ! text_id->IsSetAccession() ||
4732  text_id->GetAccession().empty())
4733  break;
4734 
4735  p = (char*)text_id->GetAccession().c_str();
4736  if (! prefix)
4737  prefix = StringSave(p);
4738  else {
4739  i = (prefix[4] >= '0' && prefix[4] <= '9') ? 6 : 8;
4740  if (! StringEquN(prefix, p, i))
4741  break;
4742  }
4743  }
/* A still-valid `ci` means the inner loop was left through a break, i.e.
 * an unusable id or a prefix mismatch; stop scanning the deltas too. */
4744  if (ci)
4745  break;
4746  }
4747 
/* Reached the end of the deltas without a mismatch. */
4748  if (delta == deltas.end() && prefix) {
4749  seqtype = fta_if_wgs_acc(prefix);
4750  if (seqtype == 0 || seqtype == 1 || seqtype == 4 || seqtype == 5 ||
4751  seqtype == 7 || seqtype == 8 || seqtype == 9 || seqtype == 10 ||
4752  seqtype == 11) {
4753  if (prefix[4] >= '0' && prefix[4] <= '9')
4754  prefix[6] = '\0';
4755  else
4756  prefix[8] = '\0';
4757  fta_create_wgs_dbtag(bioseq, ibp->submitter_seqid, prefix, seqtype);
4758  MemFree(prefix);
4759  return;
4760  }
4761  }
4762 
4763  if (prefix) {
4764  MemFree(prefix);
4765  prefix = nullptr;
4766  }
4767 
4768  ErrPostEx(SEV_ERROR, ERR_SOURCE_SubmitterSeqidDropped, "Could not determine project code for what appears to be a WGS/TLS/TSA scaffold record. /submitter_seqid dropped.");
4769  return;
4770  }
4771 
/* NOTE(review): the condition guarding the next three lines (listing line
 * 4772) is missing from this view. */
4773  ErrPostEx(SEV_ERROR, ERR_SOURCE_SubmitterSeqidIgnored, "Submitter sequence identifiers for non-project-based TSA records are not supported. /submitter_seqid \"%s\" has been dropped.", ibp->submitter_seqid.c_str());
4774  return;
4775  }
4776 
/* Fallback: nothing above applied, so the qualifier is not allowed here. */
4777  ErrPostEx(SEV_REJECT, ERR_SOURCE_SubmitterSeqidNotAllowed, "Only WGS/TLS/TSA related records (contigs and scaffolds) are allowed to have /submitter_seqid qualifier. This \"%s\" is not one of them. Entry dropped.", ibp->acnum);
4778  ibp->drop = true;
4779 }
4780 
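4781 /**********************************************************
4782  *
4783  * void LoadFeat(pp, entry, bioseq):
4784  * Parses the feature blocks of the current entry and attaches the resulting source descriptor and feature table to bioseq.
4785  * 5-4-93
4786  *
4787  **********************************************************/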
4788 void LoadFeat(ParserPtr pp, const DataBlk& entry, CBioseq& bioseq)
4789 {
/* Loads the features of the current entry (pp->entrylist[pp->curindx]):
 * parses and checks the feature blocks, builds the source feature(s),
 * replaces the Source descriptor of `bioseq` with the first one, converts
 * the remaining feature blocks to CSeq_feat objects and finally attaches
 * them to `bioseq` as a feature-table annotation. Failures set ibp->drop.
 *
 * NOTE(review): this listing has gaps; lines 4802, 4810, 4827, 4936, 4981
 * and 4992-4993 are absent from this view. Comments below flag each gap
 * where it affects the visible code. */
4790  DataBlkPtr dab;
4791  DataBlkPtr dabnext;
4792  DataBlkPtr dbp;
4793  DataBlkPtr tdbp;
4794  FeatBlkPtr fbp;
4795 
4796  IndexblkPtr ibp;
4797  Int4 col_data;
4798  Int2 type;
4799  Int4 i = 0;
4800  CRef<CSeq_id> pat_seq_id;
4801 
/* NOTE(review): listing line 4802 is missing here. Every exit path below
 * calls xinstall_gbparse_range_func(nullptr, nullptr), so it presumably
 * holds the matching install call - confirm. */
4803 
4804  ibp = pp->entrylist[pp->curindx];
4805 
4806  CRef<CSeq_id> seq_id =
4807  MakeAccSeqId(ibp->acnum, pp->seqtype, pp->accver, ibp->vernum);
4808  if (pp->source == Parser::ESource::USPTO) {
4809  pat_seq_id = new CSeq_id;
/* NOTE(review): listing line 4810, which must define `pat_id`, is missing. */
4811  pat_seq_id->SetPatent(*pat_id);
4812  }
4813 
/* Fall back to a local id built from the accession, or from the locus name
 * in relaxed mode. NOTE(review): if neither branch applies, seq_id stays
 * empty but is still pushed into `ids` and dereferenced later - confirm
 * that this cannot happen. */
4814  if (! seq_id) {
4815  if (! NStr::IsBlank(ibp->acnum)) {
4816  seq_id = Ref(new CSeq_id(CSeq_id::e_Local, ibp->acnum));
4817  } else if (pp->mode == Parser::EMode::Relaxed) {
4818  seq_id = Ref(new CSeq_id(CSeq_id::e_Local, ibp->locusname));
4819  }
4820  }
4821 
4822  TSeqIdList ids;
4823  ids.push_back(seq_id);
4824 
/* Select the data column and the block type for the input format.
 * NOTE(review): listing line 4827 (presumably the `type` assignment of the
 * GenBank branch) is missing. */
4825  if (pp->format == Parser::EFormat::GenBank) {
4826  col_data = ParFlat_COL_DATA;
4828  } else if (pp->format == Parser::EFormat::XML) {
4829  col_data = 0;
4830  type = XML_FEATURES;
4831  } else {
4832  col_data = ParFlat_COL_DATA_EMBL;
4833  type = ParFlat_FH;
4834  }
4835 
4836  /* Find feature already isolated in a "block"
4837  * The key, location and qualifiers will be isolated to
4838  * a FeatBlk at the first step of ParseFeatureBlock, which
4839  * parses a single feature at a time.
4840  * -Karl
4841  */
4842  if (pp->format == Parser::EFormat::XML)
4843  dab = XMLLoadFeatBlk(entry.mOffset, ibp->xip);
4844  else
4845  dab = TrackNodeType(entry, type);
4846  for (dbp = dab; dbp; dbp = dbp->mpNext) {
4847  if (dbp->mType != type)
4848  continue;
4849 
4850  /* Parsing each feature subblock to FeatBlkPtr, fbp
4851  * it also checks semantics of qualifiers and keys
4852  */
4853  if (pp->format == Parser::EFormat::XML)
4854  XMLParseFeatureBlock(pp->debug, static_cast<DataBlk*>(dbp->mpData), pp->source);
4855  else
4856  ParseFeatureBlock(ibp, pp->debug, static_cast<DataBlk*>(dbp->mpData), pp->source, pp->format);
4857 
/* The checks below run on the list sorted by location/key; the list is put
 * back into `num` order at the end of this loop body. */
4858  dbp->mpData = fta_sort_features(static_cast<DataBlk*>(dbp->mpData), false);
4859  fta_check_pseudogene_qual(static_cast<DataBlk*>(dbp->mpData));
4860  fta_check_old_locus_tags(static_cast<DataBlk*>(dbp->mpData), &ibp->drop);
4861  fta_check_compare_qual(static_cast<DataBlk*>(dbp->mpData), ibp->is_tpa);
4862  tdbp = static_cast<DataBlk*>(dbp->mpData);
/* `i` ends up as the number of features in this block; it is read after the
 * outer loop for the CAGE check. */
4863  for (i = 0; tdbp; i++, tdbp = tdbp->mpNext)
4864  fta_remove_dup_quals(static_cast<FeatBlk*>(tdbp->mpData));
4865  fta_remove_dup_feats(static_cast<DataBlk*>(dbp->mpData));
4866  for (tdbp = static_cast<DataBlk*>(dbp->mpData); tdbp; tdbp = tdbp->mpNext)
4867  fta_check_rpt_unit_range(static_cast<FeatBlk*>(tdbp->mpData), ibp->bases);
4868  fta_check_multiple_locus_tag(static_cast<DataBlk*>(dbp->mpData), &ibp->drop);
4869  if (ibp->is_tpa || ibp->is_tsa || ibp->is_tls)
4870  fta_check_non_tpa_tsa_tls_locations(static_cast<DataBlk*>(dbp->mpData), ibp);
4871  fta_check_replace_regulatory(static_cast<DataBlk*>(dbp->mpData), &ibp->drop);
4872  dbp->mpData = fta_sort_features(static_cast<DataBlk*>(dbp->mpData), true);
4873  }
4874 
4875  if (i > 1 && ibp->is_mga) {
4876  ErrPostEx(SEV_REJECT, ERR_FEATURE_MoreThanOneCAGEFeat, "CAGE records are allowed to have only one feature, and it must be the \"source\" one. Entry dropped.");
4877  ibp->drop = true;
4878  }
4879 
4880  if (! ibp->drop)
4881  CollectGapFeats(entry, dab, pp, type);
4882 
4883  TSeqFeatList seq_feats;
4884  if (! ibp->drop)
4885  ParseSourceFeat(pp, dab, ids, type, bioseq, seq_feats);
4886 
/* No source feature was produced (or the entry was already dropped, in
 * which case ParseSourceFeat was not called): release the blocks and give
 * up on this entry. */
4887  if (seq_feats.empty()) {
4888  ibp->drop = true;
4889  for (; dab; dab = dabnext) {
4890  dabnext = dab->mpNext;
4891  FreeFeatBlk(static_cast<DataBlk*>(dab->mpData), pp->format);
4892  if (pp->format == Parser::EFormat::XML)
4893  dab->SimpleDelete();
4894  }
4895  xinstall_gbparse_range_func(nullptr, nullptr);
4896  return;
4897  }
4898 
4899  if (! ibp->submitter_seqid.empty())
4900  fta_create_wgs_seqid(bioseq, ibp, pp->source);
4901 
/* Remove every existing Source descriptor; a new one is added below. */
4902  CSeq_descr::Tdata& descr_list = bioseq.SetDescr().Set();
4903  for (CSeq_descr::Tdata::iterator descr = descr_list.begin(); descr != descr_list.end();) {
4904  if (! (*descr)->IsSource()) {
4905  ++descr;
4906  continue;
4907  }
4908 
4909  descr = descr_list.erase(descr);
4910  }
4911 
/* The BioSource of the first feature in seq_feats becomes the Source
 * descriptor; that feature is then removed from the list. */
4912  CRef<CSeqdesc> descr_src(new CSeqdesc);
4913  descr_src->SetSource(seq_feats.front()->SetData().SetBiosrc());
4914 
4915  descr_list.push_back(descr_src);
4916  seq_feats.pop_front();
4917 
4918  fta_get_gcode_from_biosource(descr_src->GetSource(), ibp);
4919 
/* Convert the remaining feature blocks, releasing each top-level block once
 * its features have been processed. */
4920  for (; dab; dab = dabnext) {
4921  dabnext = dab->mpNext;
4922  if (dab->mType != type) {
4923  if (pp->format == Parser::EFormat::XML)
4924  dab->SimpleDelete();
4925  continue;
4926  }
4927 
4928  for (dbp = static_cast<DataBlk*>(dab->mpData); dbp; dbp = dbp->mpNext) {
4929  if (dbp->mDrop == true)
4930  continue;
4931 
/* "source" and "assembly_gap" are skipped here, as are some "gap" features.
 * NOTE(review): the rest of the "gap" condition (listing line 4936) is
 * missing from this view. */
4932  fbp = static_cast<FeatBlk*>(dbp->mpData);
4933  if (StringEqu(fbp->key, "source") ||
4934  StringEqu(fbp->key, "assembly_gap") ||
4935  (StringEqu(fbp->key, "gap") &&
4937  continue;
4938 
4939  fta_sort_quals(fbp, pp->qamode);
4940  CRef<CSeq_feat> feat;
4941  if (fbp->spindex < 0)
4942  feat = ProcFeatBlk(pp, fbp, ids);
4943  else
4944  feat = SpProcFeatBlk(pp, fbp, ids);
/* A feature that could not be built is skipped; for a CDS this also drops
 * the whole entry. */
4945  if (feat.Empty()) {
4946  if (StringEqu(fbp->key, "CDS")) {
4947  ErrPostEx(SEV_ERROR, ERR_FEATURE_LocationParsing, "CDS feature has unparsable location. Entry dropped. Location = [%s].", fbp->location);
4948  ibp->drop = true;
4949  }
4950  continue;
4951  }
4952 
4953  if (StringEqu(fbp->key, "mobile_element") &&
4954  ! fta_check_mobile_element(*feat)) {
4955  ibp->drop = true;
4956  continue;
4957  }
4958 
4959  fta_check_artificial_location(*feat, fbp->key);
4960 
/* USPTO records are checked against the patent id, all others against the
 * accession-based id. */
4961  if (CheckForeignLoc(feat->GetLocation(),
4962  (pp->source == Parser::ESource::USPTO) ? *pat_seq_id : *seq_id)) {
4963  ErrPostEx(SEV_WARNING, ERR_LOCATION_FailedCheck, "Location pointing outside the entry [%s]", fbp->location);
4964 
4965  if (feat->GetData().IsImp()) {
4966  const CImp_feat& imp_feat = feat->GetData().GetImp();
4967  if (imp_feat.GetKey() == "intron" ||
4968  imp_feat.GetKey() == "exon") {
4969  /* foreign introns and exons wouldn't be parsed
4970  */
4971  feat.Reset();
4972  continue;
4973  }
4974  }
4975  }
4976 
4977  FilterDb_xref(*feat, pp->source);
4978 
/* Result 0: the feature is kept only in debug mode (listing line 4981,
 * presumably the accompanying message, is missing). Result 1: a
 * mixed-strand message is posted and the feature is kept. Any other
 * result: the feature is kept. */
4979  i = FTASeqLocCheck(feat->GetLocation(), ibp->acnum);
4980  if (i == 0) {
4982 
4983  if (pp->debug)
4984  seq_feats.push_back(feat);
4985  else {
4986  feat.Reset();
4987  continue;
4988  }
4989  } else {
4990  if (i == 1) {
/* NOTE(review): listing lines 4992-4993, which start the message call for
 * the trans-splicing case, are missing from this view. */
4991  if (feat->IsSetExcept_text() && feat->GetExcept_text() == "trans-splicing")
4994  "Mixed strands in SeqLoc of /trans_splicing feature: %s",
4995  fbp->location);
4996  else
4997  ErrPostEx(SEV_WARNING, ERR_LOCATION_MixedStrand, "Mixed strands in SeqLoc: %s", fbp->location);
4998  }
4999 
5000  seq_feats.push_back(feat);
5001  }
5002  }
5003  FreeFeatBlk(static_cast<DataBlk*>(dab->mpData), pp->format);
5004  if (pp->format == Parser::EFormat::XML)
5005  dab->SimpleDelete();
5006  }
5007 
5008  if (! fta_perform_operon_checks(seq_feats, ibp)) {
5009  ibp->drop = true;
5010  seq_feats.clear();
5011  xinstall_gbparse_range_func(nullptr, nullptr);
5012  return;
5013  }
5014 
/* Imp features whose key contains "RNA" are passed to GetRnaRef; an "ncRNA"
 * that fails fta_check_ncrna stops the scan and drops the entry. */
5015  bool stop = false;
5016  for (auto& feat : seq_feats) {
5017  if (! feat->GetData().IsImp())
5018  continue;
5019 
5020  const CImp_feat& imp_feat = feat->GetData().GetImp();
5021 
5022  if (imp_feat.IsSetKey() &&
5023  StringStr(imp_feat.GetKey().c_str(), "RNA")) {
5024  if (imp_feat.GetKey() == "ncRNA" && ! fta_check_ncrna(*feat)) {
5025  stop = true;
5026  break;
5027  }
5028 
5029  GetRnaRef(*feat, bioseq, pp->source, pp->accver);
5030  }
5031  }
5032 
5033  if (stop) {
5034  ibp->drop = true;
5035  seq_feats.clear();
5036  xinstall_gbparse_range_func(nullptr, nullptr);
5037  return;
5038  }
5039 
5040  SeqFeatPub(pp, entry, seq_feats, ids, col_data, ibp);
5041  if (seq_feats.empty() && ibp->drop) {
5042  xinstall_gbparse_range_func(nullptr, nullptr);
5043  return;
5044  }
5045 
5046  /* ImpFeatPub() call will be removed in asn 4.0
5047  */
5048  ImpFeatPub(pp, entry, seq_feats, *seq_id, col_data, ibp);
5049 
5050  xinstall_gbparse_range_func(nullptr, nullptr);
5051  if (seq_feats.empty())
5052  return;
5053 
/* Whatever is left becomes one feature-table annotation on the bioseq. */
5054  CRef<CSeq_annot> annot(new CSeq_annot);
5055  annot->SetData().SetFtable().swap(seq_feats);
5056 
5057  bioseq.SetAnnot().push_back(annot);
5058 }
5059 
5060 /**********************************************************/
5061 static CMolInfo::EBiomol GetBiomolFromToks(char* mRNA, char* tRNA, char* rRNA, char* snRNA, char* scRNA, char* uRNA, char* snoRNA)
5062 {
5063  char* p = nullptr;
5064 
5065  if (mRNA)
5066  p = mRNA;
5067  if (! p || (tRNA && tRNA < p))
5068  p = tRNA;
5069  if (! p || (rRNA && rRNA < p))
5070  p = rRNA;
5071  if (! p || (snRNA && snRNA < p))
5072  p = snRNA;
5073  if (! p || (scRNA && scRNA < p))
5074  p = scRNA;
5075  if (! p || (uRNA && uRNA < p))
5076  p = uRNA;
5077  if (! p || (snoRNA && snoRNA < p))
5078  p = snoRNA;
5079 
5080  if (p == mRNA)
5081  return (Seq_descr_GIBB_mol_mRNA);
5082  if (p == tRNA)
5083  return (Seq_descr_GIBB_mol_tRNA);
5084  if (p == rRNA)
5085  return (Seq_descr_GIBB_mol_rRNA);
5086  if (p == snRNA || p == uRNA)
5087  return (Seq_descr_GIBB_mol_snRNA);
5088  if (p == snoRNA)
5089  return (Seq_descr_GIBB_mol_snoRNA);
5090  return (Seq_descr_GIBB_mol_scRNA);
5091 }
5092 
5093 /**********************************************************/
5094 void GetFlatBiomol(CMolInfo::TBiomol& biomol, CMolInfo::TTech tech, char* molstr, ParserPtr pp, const DataBlk& entry, const COrg_ref* org_ref)
5095 {
5096  Int4 genomic;
5097  char* offset;
5098  char c;
5099  DataBlkPtr dbp;
5100 
5101  Int2 count;
5102  Int2 i;
5103  EntryBlkPtr ebp;
5104  IndexblkPtr ibp;
5105  const char* p;
5106 
5107  char* q;
5108  char* r;
5109  char* mRNA = nullptr;
5110  char* tRNA = nullptr;
5111  char* rRNA = nullptr;
5112  char* snRNA = nullptr;
5113  char* scRNA = nullptr;
5114  char* uRNA = nullptr;
5115  char* snoRNA = nullptr;
5116  bool stage;
5117  bool techok;
5118  bool same;
5119  bool is_syn;
5120 
5121  ebp = static_cast<EntryBlk*>(entry.mpData);
5122 
5123  CBioseq& bioseq = ebp->seq_entry->SetSeq();
5124  ibp = pp->entrylist[pp->curindx];
5125 
5126  if (ibp->is_prot) {
5127  bioseq.SetInst().SetMol(CSeq_inst::eMol_aa);
5128  biomol = CMolInfo::eBiomol_peptide;
5129  return;
5130  }
5131 
5132  if (StringEqu(ibp->division, "SYN") ||
5133  (org_ref && org_ref->IsSetOrgname() && org_ref->GetOrgname().IsSetDiv() &&
5134  org_ref->GetOrgname().GetDiv() == "SYN"))
5135  is_syn = true;
5136  else
5137  is_syn = false;
5138 
5139  r = nullptr;
5140  c = '\0';
5141  if (! ibp->moltype.empty()) {
5142  if (pp->source == Parser::ESource::DDBJ && StringEquNI(molstr, "PRT", 3))
5143  return;
5144 
5145  biomol = Seq_descr_GIBB_mol_genomic;
5146  bioseq.SetInst().SetMol(CSeq_inst::eMol_dna);
5147 
5148  if (molstr) {
5149  q = molstr;
5150  r = molstr;
5152  while (*r != ';' && *r != '\n' && *r != '\0')
5153  r++;
5154  else {
5155  while (*r != ';' && *r != ' ' && *r != '\t' && *r != '\n' &&
5156  *r != '\0')
5157  r++;
5158  if (r - molstr > 10)
5159  r = molstr + 10;
5160  }
5161  c = *r;
5162  *r = '\0';
5163  if (q == r)
5164  q = (char*)"???";
5165  } else
5166  q = (char*)"???";
5167 
5168  same = true;
5169  if (ibp->moltype == "genomic DNA") {
5170  biomol = Seq_descr_GIBB_mol_genomic;
5171  bioseq.SetInst().SetMol(CSeq_inst::eMol_dna);
5172 
5173  if (pp->source == Parser::ESource::EMBL) {
5174  if (NStr::CompareNocase(q, "DNA") != 0 &&
5175  NStr::CompareNocase(ibp->moltype, q) != 0)
5176  same = false;
5177  } else if (NStr::CompareNocase(q, "DNA") != 0)
5178  same = false;
5179  } else if (ibp->moltype == "genomic RNA") {
5180  biomol = Seq_descr_GIBB_mol_genomic;
5181  bioseq.SetInst().SetMol(CSeq_inst::eMol_rna);
5182 
5183  if (pp->source == Parser::ESource::EMBL) {
5184  if (NStr::CompareNocase(q, "RNA") != 0 && NStr::CompareNocase(ibp->moltype, q) != 0)
5185  same = false;
5186  } else if (NStr::CompareNocase(q, "RNA") != 0)
5187  same = false;
5188  } else if (ibp->moltype == "mRNA") {
5189  biomol = Seq_descr_GIBB_mol_mRNA;
5190  bioseq.SetInst().SetMol(CSeq_inst::eMol_rna);
5191 
5192  if (pp->source == Parser::ESource::EMBL) {
5193  if (NStr::CompareNocase(q, "RNA") != 0 && NStr::CompareNocase(ibp->moltype, q) != 0)
5194  same = false;
5195  } else if (NStr::CompareNocase(q, "mRNA") != 0)
5196  same = false;
5197  } else if (ibp->moltype == "tRNA") {
5198  biomol = Seq_descr_GIBB_mol_tRNA;
5199  bioseq.SetInst().SetMol(CSeq_inst::eMol_rna);
5200 
5201  if (pp->source == Parser::ESource::EMBL) {
5202  if (NStr::CompareNocase(q, "RNA") != 0 && NStr::CompareNocase(ibp->moltype, q) != 0)
5203  same = false;
5204  } else if (NStr::CompareNocase(q, "tRNA") != 0)
5205  same = false;
5206  } else if (ibp->moltype == "rRNA") {
5207  biomol = Seq_descr_GIBB_mol_rRNA;
5208  bioseq.SetInst().SetMol(CSeq_inst::eMol_rna);
5209 
5210  if (pp->source == Parser::ESource::EMBL) {
5211  if (NStr::CompareNocase(q, "RNA") != 0 && NStr::CompareNocase(ibp->moltype, q) != 0)
5212  same = false;
5213  } else if (NStr::CompareNocase(q, "rRNA") != 0)
5214  same = false;
5215  } else if (ibp->moltype == "snoRNA") {
5216  biomol = Seq_descr_GIBB_mol_snoRNA;
5217  bioseq.SetInst().SetMol(CSeq_inst::eMol_rna);
5218 
5219  if (pp->source == Parser::ESource::EMBL) {
5220  if (NStr::CompareNocase(q, "RNA") != 0 && NStr::CompareNocase(ibp->moltype, q) != 0)
5221  same = false;
5222  } else if (NStr::CompareNocase(q, "snoRNA") != 0)
5223  same = false;
5224  } else if (ibp->moltype == "snRNA") {
5225  biomol = Seq_descr_GIBB_mol_snRNA;
5226  bioseq.SetInst().SetMol(CSeq_inst::eMol_rna);
5227 
5228  if (pp->source == Parser::ESource::EMBL) {
5229  if (NStr::CompareNocase(q, "RNA") != 0 && NStr::CompareNocase(ibp->moltype, q) != 0)
5230  same = false;
5231  } else if (NStr::CompareNocase(q, "snRNA") != 0)
5232  same = false;
5233  } else if (ibp->