NCBI C++ ToolKit
fta_src.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: fta_src.cpp 102300 2024-04-19 12:55:28Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * File Name: fta_src.cpp
27  *
28  * Author: Sergey Bazhin
29  *
30  * File Description:
31  * Messes about source features.
32  */
33 
34 #include <ncbi_pch.hpp>
35 
36 #include "ftacpp.hpp"
37 
49 
50 #include "index.h"
51 
53 #include "ftanet.h"
54 
55 #include "ftaerr.hpp"
56 #include "asci_blk.h"
57 #include "loadfeat.h"
58 #include "utilfeat.h"
59 #include "add.h"
60 #include "utilfun.h"
61 
62 #ifdef THIS_FILE
63 # undef THIS_FILE
64 #endif
65 #define THIS_FILE "fta_src.cpp"
66 
69 
/* Table entry that starts with a qualifier name.
 * NOTE(review): listing line 72 (a second member) is absent from this view.
 * The initialisers of SourceOrgMods pair every name with a
 * COrgMod::eSubtype_ value, so the missing member presumably stores that
 * subtype - confirm against the complete file. */
70 struct CharUInt1 {
71  const char* name;
73 };
74 
/* Bit flags, written in octal, for the use_what argument of
 * CollectSubNames(): each flag allows the matching qualifier value to be
 * appended to the organism name string. USE_ALL is the bitwise OR of the
 * ten individual flags. */
75 #define USE_CULTIVAR 00001
76 #define USE_ISOLATE 00002
77 #define USE_SEROTYPE 00004
78 #define USE_SEROVAR 00010
79 #define USE_SPECIMEN_VOUCHER 00020
80 #define USE_STRAIN 00040
81 #define USE_SUB_SPECIES 00100
82 #define USE_SUB_STRAIN 00200
83 #define USE_VARIETY 00400
84 #define USE_ECOTYPE 01000
85 #define USE_ALL 01777
86 
/* NOTE(review): not referenced in the visible part of this file; its exact
 * role has to be confirmed where it is used. */
87 #define BIOSOURCES_THRESHOLD 20
88 
/* Singly linked list node carrying the name and sequence strings of a
 * forward and a reverse PCR primer. All members default to nullptr.
 * NOTE(review): ownership of the four strings is not established by the
 * visible code - confirm where nodes are built and released. */
89 struct PcrPrimers {
90  char* fwd_name = nullptr;
91  char* fwd_seq = nullptr;
92  char* rev_name = nullptr;
93  char* rev_seq = nullptr;
 /* next node of the list, nullptr at the tail */
94  PcrPrimers* next = nullptr;
95 };
97 
/* One "source" feature of an entry, kept as a node of a singly linked list.
 * NOTE(review): listing lines 109-111 and 120 are absent from this view;
 * other code in this file also uses the members quals, orgname and genome,
 * which are presumably declared on those lines. */
98 struct SourceFeatBlk {
 /* organism name saved by SourceFeatStructFillIn() */
99  char* name = nullptr;
 /* copies of the first /strain, /organelle and /isolate values */
100  char* strain = nullptr;
101  char* organelle = nullptr;
102  char* isolate = nullptr;
 /* organism name plus " (qualifier value)" suffixes, built by CollectSubNames() */
103  char* namstr = nullptr;
 /* copy of the feature location string */
104  char* location = nullptr;
 /* value of /mol_type */
105  char* moltype = nullptr;
 /* first token of /organism when it matches an entry of source_genomes
  * other than "plasmid" */
106  char* genomename = nullptr;
 /* value of /submitter_seqid; replaced by "" when the qualifier repeats */
107  char* submitter_seqid = nullptr;
108 
112 
 /* set when the location contains the interval 1..<sequence length> */
113  bool full = false;
 /* /focus qualifier present */
114  bool focus = false;
 /* /transgenic qualifier present */
115  bool tg = false;
 /* NOTE(review): lookup and useit are not written in the visible part of the file */
116  bool lookup = false;
 /* set when a qualifier listed in exempt_quals is present */
117  bool skip = false;
118  bool useit = false;
119 
 /* next source feature, nullptr at the tail */
121  SourceFeatBlk* next = nullptr;
122 };
124 
/* List node describing one interval (min..max) taken from a source feature
 * location, together with the organism name of that feature. The list is
 * filled by the coverage check and released with MinMaxFree(). */
125 struct MinMax {
126  const char* orgname = nullptr; /* Do not free! It's just a pointer */
 /* interval bounds parsed from the location text */
127  Int4 min = 0;
128  Int4 max = 0;
 /* copy of the skip flag of the originating source feature */
129  bool skip = false;
130  MinMax* next = nullptr;
131 };
132 using MinMaxPtr = MinMax*;
133 
/* nullptr-terminated lists of db_xref database tags for source features.
 * The letters in the array names are expanded by the comments on the
 * individual tables (D = DDBJ, E = EMBL, N = NCBI, L = LANL, R = RefSeq).
 * NOTE(review): none of these tables is referenced in the visible part of
 * the file; how each one is applied has to be confirmed at the point of use. */
134 static const char* ObsoleteSourceDbxrefTag[] = {
135  "IFO",
136  nullptr
137 };
138 
139 static const char* DENLRSourceDbxrefTag[] = { /* DENL = DDBJ + EMBL + NCBI +
140  LANL + RefSeq */
141  "AFTOL",
142  "ANTWEB",
143  "ATCC",
144  "ATCC(DNA)",
145  "ATCC(IN HOST)",
146  "BEI",
147  "BOLD",
148  "FBOL",
149  "FUNGORUM",
150  "GREENGENES",
151  "GRIN",
152  "HMP",
153  "HOMD",
154  "IKMC",
155  "ISHAM-ITS",
156  "JCM",
157  "NBRC",
158  "RBGE_GARDEN",
159  "RBGE_HERBARIUM",
160  "RZPD",
161  "UNILIB",
162  nullptr
163 };
164 
165 static const char* DESourceDbxrefTag[] = { /* DE = DDBJ + EMBL */
166  "FANTOM_DB",
167  "IMGT/HLA",
168  "IMGT/LIGM",
169  "MGD",
170  "MGI",
171  nullptr
172 };
173 
174 static const char* ESourceDbxrefTag[] = { /* E = EMBL */
175  "UNITE",
176  nullptr
177 };
178 
179 static const char* NLRSourceDbxrefTag[] = { /* N = NCBI + LANL + RefSeq */
180  "FLYBASE",
181  nullptr
182 };
183 
/* Qualifier names whose presence sets the skip flag of a source feature
 * (see the loop over exempt_quals further down). nullptr-terminated. */
184 static const char* exempt_quals[] = {
185  "transposon",
186  "insertion_seq",
187  nullptr
188 };
189 
/* Organism names that fta_if_special_org() and the focus/overlap checks
 * compare against without regard to case. nullptr-terminated. */
190 static const char* special_orgs[] = {
191  "synthetic construct",
192  "artificial sequence",
193  "eukaryotic synthetic construct",
194  nullptr
195 };
196 
/* Location tokens that trigger the "unusual location" error when they
 * appear as a delimited word in a source feature location. */
197 static const char* unusual_toks[] = {
198  "complement",
199  nullptr
200 };
201 
/* Prefixes matched against the first word of an /organism value by
 * SourceFeatStructFillIn(). The position in this table matters: that
 * function branches on the index of the match, and index 8 ("plasmid") is
 * handled differently from the others. */
202 static const char* source_genomes[] = {
203  "mitochondr",
204  "chloroplast",
205  "kinetoplas",
206  "cyanelle",
207  "plastid",
208  "chromoplast",
209  "macronuclear",
210  "extrachrom",
211  "plasmid",
212  nullptr
213 };
214 
/* NOTE(review): not referenced in the visible part of the file; going by the
 * name these are qualifiers considered bad on a source feature - confirm. */
215 static const char* SourceBadQuals[] = {
216  "label",
217  "usedin",
218  "citation",
219  nullptr
220 };
221 
/* Qualifier names ordered by CSubSource subtype number: as the per-line
 * comments state, entry k (0-based) corresponds to subtype value k + 1.
 * Empty strings keep the numbering aligned for subtype values that have no
 * qualifier name in this table. nullptr-terminated. */
222 static const char* SourceSubSources[] = {
223  "chromosome", /* 1 = CSubSource::eSubtype_chromosome, etc. */
224  "map", /* 2 */
225  "clone", /* 3 */
226  "sub_clone", /* 4 */
227  "haplotype", /* 5 */
228  "genotype", /* 6 */
229  "sex", /* 7 */
230  "cell_line", /* 8 */
231  "cell_type", /* 9 */
232  "tissue_type", /* 10 */
233  "clone_lib", /* 11 */
234  "dev_stage", /* 12 */
235  "frequency", /* 13 */
236  "germline", /* 14 */
237  "rearranged", /* 15 */
238  "lab_host", /* 16 */
239  "pop_variant", /* 17 */
240  "tissue_lib", /* 18 */
241  "plasmid", /* 19 */
242  "transposon", /* 20 */
243  "insertion_seq", /* 21 */
244  "plastid", /* 22 */
245  "", /* 23 */
246  "segment", /* 24 */
247  "", /* 25 */
248  "transgenic", /* 26 */
249  "environmental_sample", /* 27 */
250  "isolation_source", /* 28 */
251  "lat_lon", /* 29 */
252  "collection_date", /* 30 */
253  "collected_by", /* 31 */
254  "identified_by", /* 32 */
255  "", /* 33 */
256  "", /* 34 */
257  "", /* 35 */
258  "", /* 36 */
259  "metagenomic", /* 37 */
260  "mating_type", /* 38 */
261  nullptr
262 };
263 
264 // clang-format off
// Pairs each source qualifier name with a COrgMod subtype. "specific_host"
// and "host" both map to eSubtype_nat_host. Unlike the string tables in this
// file, the array carries no terminating sentinel entry.
265 static const CharUInt1 SourceOrgMods[] = {
266  { "strain", COrgMod::eSubtype_strain },
267  { "sub_strain", COrgMod::eSubtype_substrain },
268  { "variety", COrgMod::eSubtype_variety },
269  { "serotype", COrgMod::eSubtype_serotype },
270  { "serovar", COrgMod::eSubtype_serovar },
271  { "cultivar", COrgMod::eSubtype_cultivar },
272  { "isolate", COrgMod::eSubtype_isolate },
273  { "specific_host", COrgMod::eSubtype_nat_host },
274  { "host", COrgMod::eSubtype_nat_host },
275  { "sub_species", COrgMod::eSubtype_sub_species },
276  { "specimen_voucher", COrgMod::eSubtype_specimen_voucher },
277  { "ecotype", COrgMod::eSubtype_ecotype },
278  { "culture_collection", COrgMod::eSubtype_culture_collection },
279  { "bio_material", COrgMod::eSubtype_bio_material },
280  { "metagenome_source", COrgMod::eSubtype_metagenome_source },
281  { "type_material", COrgMod::eSubtype_type_material },
282 };
283 // clang-format on
284 
/* Genome names positioned by CBioSource genome value, starting with
 * CBioSource::eGenome_unknown at index 0 (per the comment on the first
 * entry). Empty strings fill positions 19-21, which have no name here.
 * nullptr-terminated. */
285 static const char* GenomicSourceFeatQual[] = {
286  "unknown", // CBioSource::eGenome_unknown, etc.
287  "unknown", // ?
288  "chloroplast",
289  "chromoplast",
290  "kinetoplast",
291  "mitochondrion",
292  "plastid",
293  "macronuclear",
294  "extrachrom",
295  "plasmid",
296  "transposon",
297  "insertion-seq",
298  "cyanelle",
299  "proviral",
300  "virion",
301  "nucleomorph",
302  "apicoplast",
303  "leucoplast",
304  "proplastid", /* 18 */
305  "", /* 19 */
306  "", /* 20 */
307  "", /* 21 */
308  "chromatophore", /* 22 */
309  nullptr
310 };
311 
/* nullptr-terminated list of organelle words.
 * NOTE(review): not referenced in the visible part of the file; going by the
 * name these are the words accepted as the first token of an /organelle
 * value - confirm at the point of use. */
312 static const char* OrganelleFirstToken[] = {
313  "chromatophore",
314  "hydrogenosome",
315  "mitochondrion",
316  "nucleomorph",
317  "plastid",
318  nullptr
319 };
320 
321 /**********************************************************/
/* NOTE(review): the signature line of this definition (listing line 322) is
 * absent from this view; presumably this is the SourceFeatBlkNew() that
 * other code in the file calls. */
323 {
 /* Heap-allocates a node; every member takes the default initialiser given
  * in the struct definition. */
324  return new SourceFeatBlk;
325 }
326 
327 /**********************************************************/
/* NOTE(review): the signature line (listing line 328) is absent from this
 * view; the body is called elsewhere as SourceFeatBlkFree(node).
 * Releases the strings owned by one node and then the node itself. The
 * next pointer is not followed.
 * NOTE(review): submitter_seqid is filled with StringSave() in
 * SourceFeatStructFillIn() but is not released here - looks like a leak;
 * confirm and add the missing MemFree(). */
329 {
330  if (sfbp->name)
331  MemFree(sfbp->name);
332  if (sfbp->strain)
333  MemFree(sfbp->strain);
334  if (sfbp->organelle)
335  MemFree(sfbp->organelle);
336  if (sfbp->isolate)
337  MemFree(sfbp->isolate);
338  if (sfbp->namstr)
339  MemFree(sfbp->namstr);
340  if (sfbp->location)
341  MemFree(sfbp->location);
342  if (sfbp->moltype)
343  MemFree(sfbp->moltype);
344  if (sfbp->genomename)
345  MemFree(sfbp->genomename);
346 
347  delete sfbp;
348 }
349 
350 /**********************************************************/
/* NOTE(review): the signature line (listing line 351) is absent from this
 * view. Releases a whole list of source feature nodes starting at sfbp. */
352 {
353  SourceFeatBlkPtr tsfbp;
354 
 /* sfbp is advanced to the successor before the current node is freed, so
  * the next pointer is never read from released memory */
355  for (tsfbp = sfbp; tsfbp; tsfbp = sfbp) {
356  sfbp = tsfbp->next;
357  SourceFeatBlkFree(tsfbp);
358  }
359 }
360 
361 /**********************************************************/
/* NOTE(review): the signature line (listing line 362) is absent from this
 * view; the body uses a data block list dbp and a block type value type.
 * Builds a list with one SourceFeatBlk per feature whose key is "source",
 * taken from every data block of the requested type. Returns the head of
 * that list, or nullptr when no source feature was found. */
363 {
364  SourceFeatBlkPtr sfbp;
365  SourceFeatBlkPtr tsfbp;
366  DataBlkPtr tdbp;
367  FeatBlkPtr fbp;
368 
 /* temporary head node so that appending needs no special case */
369  sfbp = SourceFeatBlkNew();
370  tsfbp = sfbp;
371 
372  for (; dbp; dbp = dbp->mpNext) {
373  if (dbp->mType != type)
374  continue;
375  for (tdbp = static_cast<DataBlk*>(dbp->mpData); tdbp; tdbp = tdbp->mpNext) {
376  fbp = static_cast<FeatBlk*>(tdbp->mpData);
377  if (! fbp || ! fbp->key || ! StringEqu(fbp->key, "source"))
378  continue;
379  tsfbp->next = SourceFeatBlkNew();
380  tsfbp = tsfbp->next;
 /* the location is duplicated; the qualifiers are assigned from the
  * feature block (NOTE(review): whether this copies or shares the
  * qualifier objects depends on the type of quals - confirm) */
381  if (fbp->location)
382  tsfbp->location = StringSave(fbp->location);
383  tsfbp->quals = fbp->quals;
384  }
385  }
 /* drop the temporary head; it owns no strings, so plain delete suffices */
386  tsfbp = sfbp->next;
387  delete sfbp;
388  return (tsfbp);
389 }
390 
391 /**********************************************************/
/* Deletes every blank (' ') and tab character from a NUL-terminated string,
 * compacting the remaining characters in place. A null pointer or an empty
 * string is left untouched. */
static void RemoveStringSpaces(char* line)
{
    if (line == nullptr || line[0] == '\0')
        return;

    char* dst = line;
    for (const char* src = line; *src != '\0'; ++src) {
        const char ch = *src;
        if (ch == ' ' || ch == '\t')
            continue;
        *dst++ = ch;
    }
    *dst = '\0';
}
405 
406 /**********************************************************/
/* NOTE(review): the signature line (listing line 407) is absent from this
 * view. Walks the list starting at sfbp and passes the name and the value
 * of every qualifier that has them set to ShrinkSpaces().
 * NOTE(review): ShrinkSpaces() is defined outside this view; presumably it
 * normalises white space in place - confirm. */
408 {
409  for (; sfbp; sfbp = sfbp->next) {
411  for (auto& cur : sfbp->quals) {
412  if (cur->IsSetQual()) {
413  ShrinkSpaces(cur->SetQual());
414  }
415  if (cur->IsSetVal()) {
416  ShrinkSpaces(cur->SetVal());
417  }
418  }
419  }
420 }
421 
422 /**********************************************************/
/* NOTE(review): the signature line (listing line 423) is absent from this
 * view. For every node of the list starting at sfbp, sets skip to true as
 * soon as one of its qualifiers is named exactly like an entry of
 * exempt_quals. Nodes without such a qualifier are left unchanged.
 * NOTE(review): GetQual() is called without an IsSetQual() check here,
 * unlike in SourceFeatStructFillIn() - confirm that this is safe. */
424 {
425  const char** b;
426 
427  for (; sfbp; sfbp = sfbp->next) {
428  for (const auto& cur : sfbp->quals) {
429  for (b = exempt_quals; *b; b++) {
430  if (cur->GetQual() == *b)
431  break;
432  }
 /* *b is non-null only when the inner loop stopped on a match */
433  if (*b) {
434  sfbp->skip = true;
435  break;
436  }
437  }
438  }
439 }
440 
441 /**********************************************************/
/* Appends name, value and a closing ")" to namstr, and pushes an org
 * modifier with the given subtype and value onto the front of mods.
 * Callers pass name in the form " (qualifier " so that the appended text
 * reads " (qualifier value)".
 * NOTE(review): listing line 444 is absent from this view; it presumably
 * declares and allocates mod, which is used below without a visible
 * declaration. */
442 static void PopulateSubNames(string& namstr, const Char* name, const Char* value, COrgMod::ESubtype subtype, TOrgModList& mods)
443 {
445 
446  namstr.append(name);
447  namstr.append(value);
448  namstr.append(")");
449 
450  mod->SetSubtype(subtype);
451  mod->SetSubname(value);
452 
 /* push_front: the modifier list ends up in reverse order of the calls */
453  mods.push_front(mod);
454 }
455 
456 /**********************************************************/
457 static void CollectSubNames(SourceFeatBlkPtr sfbp, Int4 use_what, const Char* name, const Char* cultivar, const Char* isolate, const Char* serotype, const Char* serovar, const Char* specimen_voucher, const Char* strain, const Char* sub_species, const Char* sub_strain, const Char* variety, const Char* ecotype)
458 {
459  if (! sfbp)
460  return;
461 
462  if (sfbp->namstr)
463  MemFree(sfbp->namstr);
464  sfbp->namstr = nullptr;
465 
466  if (sfbp->orgname.NotEmpty())
467  sfbp->orgname.Reset();
468 
469  if (! name)
470  return;
471 
472  size_t i = 0;
473  if ((use_what & USE_CULTIVAR) == USE_CULTIVAR && cultivar)
474  i += (StringLen(cultivar) + StringLen("cultivar") + 5);
475  if ((use_what & USE_ISOLATE) == USE_ISOLATE && isolate)
476  i += (StringLen(isolate) + StringLen("isolate") + 5);
477  if ((use_what & USE_SEROTYPE) == USE_SEROTYPE && serotype)
478  i += (StringLen(serotype) + StringLen("serotype") + 5);
479  if ((use_what & USE_SEROVAR) == USE_SEROVAR && serovar)
480  i += (StringLen(serovar) + StringLen("serovar") + 5);
481  if ((use_what & USE_SPECIMEN_VOUCHER) == USE_SPECIMEN_VOUCHER && specimen_voucher)
482  i += (StringLen(specimen_voucher) + StringLen("specimen_voucher") + 5);
483  if ((use_what & USE_STRAIN) == USE_STRAIN && strain)
484  i += (StringLen(strain) + StringLen("strain") + 5);
485  if ((use_what & USE_SUB_SPECIES) == USE_SUB_SPECIES && sub_species)
486  i += (StringLen(sub_species) + StringLen("sub_species") + 5);
487  if ((use_what & USE_SUB_STRAIN) == USE_SUB_STRAIN && sub_strain)
488  i += (StringLen(sub_strain) + StringLen("sub_strain") + 5);
489  if ((use_what & USE_VARIETY) == USE_VARIETY && variety)
490  i += (StringLen(variety) + StringLen("variety") + 5);
491  if ((use_what & USE_ECOTYPE) == USE_ECOTYPE && ecotype)
492  i += (StringLen(ecotype) + StringLen("ecotype") + 5);
493 
494  if (i == 0) {
495  sfbp->namstr = StringSave(name);
496  return;
497  }
498 
499  sfbp->orgname = new COrgName;
500  TOrgModList& mods = sfbp->orgname->SetMod();
501 
502  string s = name;
503  s.reserve(s.size() + i);
504  if ((use_what & USE_CULTIVAR) == USE_CULTIVAR && cultivar)
505  PopulateSubNames(s, " (cultivar ", cultivar, COrgMod::eSubtype_cultivar, mods);
506  if ((use_what & USE_ISOLATE) == USE_ISOLATE && isolate)
507  PopulateSubNames(s, " (isolate ", isolate, COrgMod::eSubtype_isolate, mods);
508  if ((use_what & USE_SEROTYPE) == USE_SEROTYPE && serotype)
509  PopulateSubNames(s, " (serotype ", serotype, COrgMod::eSubtype_serotype, mods);
510  if ((use_what & USE_SEROVAR) == USE_SEROVAR && serovar)
511  PopulateSubNames(s, " (serovar ", serovar, COrgMod::eSubtype_serovar, mods);
512  if ((use_what & USE_SPECIMEN_VOUCHER) == USE_SPECIMEN_VOUCHER && specimen_voucher)
513  PopulateSubNames(s, " (specimen_voucher ", specimen_voucher, COrgMod::eSubtype_specimen_voucher, mods);
514  if ((use_what & USE_STRAIN) == USE_STRAIN && strain)
515  PopulateSubNames(s, " (strain ", strain, COrgMod::eSubtype_strain, mods);
516  if ((use_what & USE_SUB_SPECIES) == USE_SUB_SPECIES && sub_species)
517  PopulateSubNames(s, " (sub_species ", sub_species, COrgMod::eSubtype_sub_species, mods);
518  if ((use_what & USE_SUB_STRAIN) == USE_SUB_STRAIN && sub_strain)
519  PopulateSubNames(s, " (sub_strain ", sub_strain, COrgMod::eSubtype_substrain, mods);
520  if ((use_what & USE_VARIETY) == USE_VARIETY && variety)
521  PopulateSubNames(s, " (variety ", variety, COrgMod::eSubtype_variety, mods);
522  if ((use_what & USE_ECOTYPE) == USE_ECOTYPE && ecotype)
523  PopulateSubNames(s, " (ecotype ", ecotype, COrgMod::eSubtype_ecotype, mods);
524  sfbp->namstr = StringSave(s);
525 }
526 
527 /**********************************************************/
/* Scans the qualifiers of every source feature in the list and fills in the
 * per-feature fields (name, genomename, strain, isolate, organelle,
 * moltype, submitter_seqid, focus, tg, namstr, orgname) as well as the
 * entry-level taxid, organism and submitter_seqid of ibp when those are
 * still unset.
 *
 * ibp      - index block of the entry being processed
 * sfbp     - head of the source feature list
 * use_what - USE_ bit mask handed on to CollectSubNames()
 *
 * Returns false when some feature carries more than one /mol_type
 * qualifier, true otherwise.
 *
 * NOTE(review): several interior lines of the if/else chain that follows
 * the source_genomes lookup are absent from this view (see below). */
528 static bool SourceFeatStructFillIn(IndexblkPtr ibp, SourceFeatBlkPtr sfbp, Int4 use_what)
529 {
530  const Char** b;
531 
532  const Char* name;
533  const Char* cultivar;
534  const Char* isolate;
535  const Char* organelle;
536  const Char* serotype;
537  const Char* serovar;
538  const Char* ecotype;
539  const Char* specimen_voucher;
540  const Char* strain;
541  const Char* sub_species;
542  const Char* sub_strain;
543  const Char* variety;
544  char* genomename;
545  Char* p;
546  char* q;
547  bool ret;
548  Int4 i;
549 
550  for (ret = true; sfbp; sfbp = sfbp->next) {
 /* per-feature state; the pointers below refer into the qualifier
  * values and are not owned (genomename is the exception) */
551  name = nullptr;
552  cultivar = nullptr;
553  isolate = nullptr;
554  organelle = nullptr;
555  serotype = nullptr;
556  serovar = nullptr;
557  ecotype = nullptr;
558  specimen_voucher = nullptr;
559  strain = nullptr;
560  sub_species = nullptr;
561  sub_strain = nullptr;
562  variety = nullptr;
563  genomename = nullptr;
564 
565  for (auto& cur : sfbp->quals) {
566  if (! cur->IsSetQual())
567  continue;
568 
569  const string& qual_str = cur->GetQual();
 /* nullptr when the qualifier has no value */
570  char* val_ptr = cur->IsSetVal() ? cur->SetVal().data() : nullptr;
571 
572  if (qual_str == "db_xref") {
 /* NOTE(review): val_ptr may be nullptr here and is passed on
  * unchecked - confirm that StringChr() tolerates that */
573  q = StringChr(val_ptr, ':');
574  if (! q || q[1] == '\0')
575  continue;
 /* temporarily cut the value at ':' to compare the database
  * name; the ':' is restored below */
576  *q = '\0';
577  if (NStr::CompareNocase(val_ptr, "taxon") == 0)
578  if (ibp->taxid <= ZERO_TAX_ID)
579  ibp->taxid = TAX_ID_FROM(int, atoi(q + 1));
580  *q = ':';
581  continue;
582  }
583  if (qual_str == "focus") {
584  sfbp->focus = true;
585  continue;
586  }
587  if (qual_str == "transgenic") {
588  sfbp->tg = true;
589  continue;
590  }
 /* for cultivar, serotype, serovar, ecotype, specimen_voucher,
  * sub_species, sub_strain and variety the last occurrence wins;
  * for isolate, organelle and strain the first one is kept */
591  if (qual_str == "cultivar") {
592  cultivar = val_ptr;
593  continue;
594  }
595  if (qual_str == "isolate") {
596  if (! isolate)
597  isolate = val_ptr;
598  continue;
599  }
600  if (qual_str == "mol_type") {
 /* a repeated /mol_type makes the function return false */
601  if (sfbp->moltype)
602  ret = false;
603  else if (val_ptr)
604  sfbp->moltype = StringSave(val_ptr);
605  continue;
606  }
607  if (qual_str == "organelle") {
608  if (! organelle)
609  organelle = val_ptr;
610  continue;
611  }
612  if (qual_str == "serotype") {
613  serotype = val_ptr;
614  continue;
615  }
616  if (qual_str == "serovar") {
617  serovar = val_ptr;
618  continue;
619  }
620  if (qual_str == "ecotype") {
621  ecotype = val_ptr;
622  continue;
623  }
624  if (qual_str == "specimen_voucher") {
625  specimen_voucher = val_ptr;
626  continue;
627  }
628  if (qual_str == "strain") {
629  if (! strain)
630  strain = val_ptr;
631  continue;
632  }
633  if (qual_str == "sub_species") {
634  sub_species = val_ptr;
635  continue;
636  }
637  if (qual_str == "sub_strain") {
638  sub_strain = val_ptr;
639  continue;
640  }
641  if (qual_str == "variety") {
642  variety = val_ptr;
643  continue;
644  }
645  if (qual_str == "submitter_seqid") {
 /* a second occurrence replaces the stored value with "".
  * NOTE(review): val_ptr may be nullptr in both StringSave()
  * calls below. ibp->submitter_seqid supports empty(), so it
  * looks like a string object; if so the buffer returned by
  * the second StringSave() is never released - confirm. */
646  if (sfbp->submitter_seqid) {
647  MemFree(sfbp->submitter_seqid);
648  sfbp->submitter_seqid = StringSave("");
649  } else
650  sfbp->submitter_seqid = StringSave(val_ptr);
651  if (ibp->submitter_seqid.empty())
652  ibp->submitter_seqid = StringSave(val_ptr);
653  continue;
654  }
655 
 /* everything below handles a non-empty /organism value only */
656  if (qual_str != "organism" ||
657  ! val_ptr || val_ptr[0] == '\0')
658  continue;
659 
660  if (ibp->organism.empty())
661  ibp->organism = val_ptr;
662 
663  p = StringChr(val_ptr, ' ');
664 
 /* first word of the organism value (the whole value when it
  * contains no blank) */
665  string str_to_find;
666  if (p)
667  str_to_find.assign(val_ptr, p);
668  else
669  str_to_find.assign(val_ptr);
670 
 /* look the first word up in source_genomes, comparing only as
  * many characters as the table entry has; i is the table index */
671  for (i = 0, b = source_genomes; *b; b++, i++)
672  if (StringEquNI(str_to_find.c_str(), *b, StringLen(*b)))
673  break;
 /* keep the matched word unless the match is "plasmid" (index 8) */
674  if (*b && i != 8) {
675  if (genomename)
676  MemFree(genomename);
677  genomename = StringSave(str_to_find);
678  }
679 
680  if (p)
681  ++p;
682 
683  if (! *b)
684  p = val_ptr;
685  else {
 /* NOTE(review): the statements controlled by the branches
  * below (listing lines 687, 689, ..., 701 and 704) are absent
  * from this view; presumably each one stores the genome
  * value that corresponds to table index i - confirm against
  * the complete file. As shown, the chain is not valid C++. */
686  if (i == 0)
688  else if (i == 1)
690  else if (i == 2)
692  else if (i == 3)
694  else if (i == 4)
696  else if (i == 5)
698  else if (i == 6)
700  else if (i == 7)
702  else if (i == 8) {
703  p = val_ptr;
705  }
706  }
 /* for a match other than "plasmid", p is the text after the first
  * word (nullptr when there is none); otherwise the full value */
707  name = p;
708  }
709 
 /* store owned copies on the feature */
710  if (sfbp->name)
711  MemFree(sfbp->name);
712  sfbp->name = name ? StringSave(name) : nullptr;
713 
714  if (sfbp->genomename)
715  MemFree(sfbp->genomename);
716  sfbp->genomename = genomename;
717 
718  if (strain && ! sfbp->strain)
719  sfbp->strain = StringSave(strain);
720  if (isolate && ! sfbp->isolate)
721  sfbp->isolate = StringSave(isolate);
722  if (organelle && ! sfbp->organelle)
723  sfbp->organelle = StringSave(organelle);
724 
725  CollectSubNames(sfbp, use_what, name, cultivar, isolate, serotype, serovar, specimen_voucher, strain, sub_species, sub_strain, variety, ecotype);
726  }
727  return (ret);
728 }
729 
730 /**********************************************************/
/* NOTE(review): the signature line (listing line 731) is absent from this
 * view. Returns the location string of the first source feature that has
 * both the focus flag and the skip flag set, or nullptr when there is no
 * such feature. The returned pointer refers to the node's own string. */
732 {
733  for (; sfbp; sfbp = sfbp->next) {
734  if (sfbp->focus && sfbp->skip)
735  break;
736  }
737 
738  if (sfbp)
739  return (sfbp->location);
740  return nullptr;
741 }
742 
743 /**********************************************************/
744 static char* CheckSourceFeatOrgs(SourceFeatBlkPtr sfbp, int* status)
745 {
746  *status = 0;
747  for (; sfbp; sfbp = sfbp->next) {
748  /** if (sfbp->namstr) */
749  if (sfbp->name)
750  continue;
751 
752  *status = (sfbp->genome == CBioSource::eGenome_unknown) ? 1 : 2;
753  break;
754  }
755  if (sfbp)
756  return (sfbp->location);
757  return nullptr;
758 }
759 
760 /**********************************************************/
/* NOTE(review): the signature line (listing line 761) is absent from this
 * view. Validates the location text of the source features in the list:
 *  - reports a /partial qualifier (error, qualifier ignored);
 *  - reports a location that contains a token of unusual_toks as a
 *    delimited word (error);
 *  - reports '<' or '>' in the location (error, partiality ignored);
 *  - rejects a location that contains '^', unbalanced parentheses, or a
 *    single '.' that is not followed by digits and ':'.
 * Features with the skip flag are not checked. The scan stops at the first
 * feature whose location is missing or empty (break, not continue).
 * Returns false when at least one location was rejected. */
762 {
763  const char** b;
764  char* p;
765  char* q;
766  Int4 count;
767  bool partial;
768  bool invalid;
769  bool ret;
770 
771  ret = true;
772  for (; sfbp; sfbp = sfbp->next) {
773  if (! sfbp->location || sfbp->location[0] == '\0')
774  break;
775  if (sfbp->skip)
776  continue;
777 
778  for (const auto& cur : sfbp->quals) {
779  if (cur->GetQual() != "partial")
780  continue;
781 
782  ErrPostEx(SEV_ERROR, ERR_SOURCE_PartialQualifier, "Source feature location has /partial qualifier. Qualifier has been ignored: \"%s\".", sfbp->location ? sfbp->location : "?empty?");
783  break;
784  }
785 
 /* only the first occurrence of each token is examined; it counts when
  * the characters around it are delimiters or the string boundaries */
786  for (b = unusual_toks; *b; b++) {
787  p = StringStr(sfbp->location, *b);
788  if (! p)
789  continue;
790  q = p + StringLen(*b);
791  if (p > sfbp->location)
792  p--;
793  if ((p == sfbp->location || *p == '(' || *p == ')' ||
794  *p == ':' || *p == ',' || *p == '.') &&
795  (*q == '\0' || *q == '(' || *q == ')' || *q == ',' ||
796  *q == ':' || *q == '.')) {
797  ErrPostEx(SEV_ERROR, ERR_SOURCE_UnusualLocation, "Source feature has an unusual location: \"%s\".", sfbp->location ? sfbp->location : "?empty?");
798  break;
799  }
800  }
801 
802  partial = false;
803  invalid = false;
 /* count tracks the parenthesis nesting depth */
804  for (count = 0, p = sfbp->location; *p != '\0'; p++) {
805  if (*p == '^')
806  invalid = true;
807  else if (*p == '>' || *p == '<')
808  partial = true;
809  else if (*p == '(')
810  count++;
811  else if (*p == ')')
812  count--;
813  else if (*p == '.' && p[1] == '.')
814  p++;
815  else if (*p == '.' && p[1] != '.') {
 /* a lone '.' is accepted only in the form ".<digits>:" */
816  for (q = p + 1; *q >= '0' && *q <= '9';)
817  q++;
 /* NOTE(review): q starts at p + 1, so q == p can never hold;
  * the intended test was possibly q == p + 1 (no digits after
  * the dot) - confirm */
818  if (q == p || *q != ':')
819  invalid = true;
820  }
821  }
822  if (partial) {
823  ErrPostEx(SEV_ERROR, ERR_SOURCE_PartialLocation, "Source feature location is partial; partiality flags have been ignored: \"%s\".", sfbp->location ? sfbp->location : "?empty?");
824  }
825  if (invalid || count != 0) {
826  ErrPostEx(SEV_REJECT, ERR_SOURCE_InvalidLocation, "Invalid location for source feature at \"%s\". Entry dropped.", sfbp->location ? sfbp->location : "?empty?");
827  ret = false;
828  }
829  }
830  return (ret);
831 }
832 
833 /**********************************************************/
834 static char* CheckSourceFeatLocAccs(SourceFeatBlkPtr sfbp, char* acc)
835 {
836  char* p;
837  char* q;
838  char* r;
839  Int4 i;
840 
841  for (; sfbp; sfbp = sfbp->next) {
842  if (! sfbp->location || sfbp->location[0] == '\0')
843  continue;
844  for (p = sfbp->location + 1; *p != '\0'; p++) {
845  if (*p != ':')
846  continue;
847  for (r = nullptr, q = p - 1;; q--) {
848  if (q == sfbp->location) {
849  if (*q != '_' && (*q < '0' || *q > '9') &&
850  (*q < 'a' || *q > 'z') && (*q < 'A' || *q > 'Z'))
851  q++;
852  break;
853  }
854  if (*q == '.') {
855  if (! r) {
856  r = q;
857  continue;
858  }
859  q++;
860  break;
861  }
862  if (*q != '_' && (*q < '0' || *q > '9') &&
863  (*q < 'a' || *q > 'z') && (*q < 'A' || *q > 'Z')) {
864  q++;
865  break;
866  }
867  }
868  if (q == p)
869  continue;
870  if (r)
871  *r = '\0';
872  else
873  *p = '\0';
874  i = NStr::CompareNocase(q, acc);
875  if (r)
876  *r = '.';
877  else
878  *p = ':';
879  if (i != 0)
880  break;
881  }
882  if (*p != '\0')
883  break;
884  }
885  if (! sfbp)
886  return nullptr;
887  return (sfbp->location);
888 }
889 
890 /**********************************************************/
891 static void MinMaxFree(MinMaxPtr mmp)
892 {
893  MinMaxPtr tmmp;
894 
895  for (; mmp; mmp = tmmp) {
896  tmmp = mmp->next;
897  delete mmp;
898  }
899 }
900 
901 /**********************************************************/
902 bool fta_if_special_org(const Char* name)
903 {
904  const char** b;
905 
906  if (! name || *name == '\0')
907  return false;
908 
909  for (b = special_orgs; *b; b++)
910  if (NStr::CompareNocase(*b, name) == 0)
911  break;
912  if (*b || StringIStr(name, "vector"))
913  return true;
914  return false;
915 }
916 
917 /**********************************************************/
/* NOTE(review): the signature line (listing line 918) is absent from this
 * view; the body uses the source feature list sfbp, an interval list mmp
 * and the sequence length len.
 *
 * Step 1 parses the location of every named source feature into numeric
 * intervals, marks a feature as full when one of its intervals is 1..len,
 * and inserts each interval into the list behind mmp in ascending order of
 * its start position.
 * Step 2 checks that the collected intervals, taken together, cover the
 * sequence from position 1 up to at least len without a gap.
 * Step 3 inspects the full-length features.
 *
 * Returns 1 when the sequence is not completely covered; 2 when there are
 * at least two full-length features and either there are more than two of
 * them, or some feature is not full-length, or neither exactly one of them
 * is transgenic nor exactly one has a special organism name; 0 otherwise.
 *
 * NOTE(review): mmp is dereferenced unconditionally and its first node is
 * skipped in step 2, so the caller apparently passes a placeholder head
 * node - confirm. */
919 {
920  SourceFeatBlkPtr tsfbp;
921  MinMaxPtr tmmp;
922  MinMaxPtr mmpnext;
923  char* p;
924  char* q;
925  char* r;
926  char* loc;
927  Int4 count;
928  Int4 min;
929  Int4 max;
930  Int4 i;
931  Int4 tgs;
932  Int4 sporg;
933 
934  loc = nullptr;
935  for (tsfbp = sfbp; tsfbp; tsfbp = tsfbp->next) {
936  if (! tsfbp->location || tsfbp->location[0] == '\0' ||
937  ! tsfbp->name || tsfbp->name[0] == '\0')
938  continue;
 /* work on a private copy of the location text */
939  if (loc)
940  MemFree(loc);
941  loc = StringSave(tsfbp->location);
 /* turn all delimiters into blanks ... */
942  for (p = loc; *p != '\0'; p++)
943  if (*p == ',' || *p == '(' || *p == ')' || *p == ':' ||
944  *p == ';' || *p == '^')
945  *p = ' ';
 /* ... then drop '<' and '>' and collapse runs of blanks */
946  for (p = loc, q = loc; *p != '\0';) {
947  if (*p == '>' || *p == '<') {
948  p++;
949  continue;
950  }
951  *q++ = *p;
952  if (*p == ' ')
953  while (*p == ' ')
954  p++;
955  else
956  p++;
957  }
 /* strip one trailing blank */
958  if (q > loc && *(q - 1) == ' ')
959  q--;
960  *q = '\0';
961 
 /* walk the blank-separated tokens; q is the current token and p the
  * start of the next one (nullptr after the last token) */
962  q = (*loc == ' ') ? (loc + 1) : loc;
963  for (p = q;;) {
964  min = 0;
965  max = 0;
966  p = StringChr(p, ' ');
967  if (p)
968  *p++ = '\0';
969  for (r = q; *r >= '0' && *r <= '9';)
970  r++;
 /* token of the form "N": a single position */
971  if (*r == '\0') {
972  i = atoi(q);
973  if (i > 0) {
974  min = i;
975  max = i;
976  }
 /* token of the form "N..M": a range; any other token is ignored */
977  } else if (*r == '.' && r[1] == '.') {
978  *r++ = '\0';
979  min = atoi(q);
980  if (min > 0) {
981  for (q = ++r; *r >= '0' && *r <= '9';)
982  r++;
983  if (*r == '\0')
984  max = atoi(q);
985  }
986  }
987  if (min > 0 && max > 0) {
988  if (min == 1 && (size_t)max == len)
989  tsfbp->full = true;
990  for (tmmp = mmp;; tmmp = tmmp->next) {
 /* insert in front of tmmp: a new node takes over the
  * current contents of tmmp, and tmmp receives the new
  * interval */
991  if (min < tmmp->min) {
992  mmpnext = tmmp->next;
993  tmmp->next = new MinMax;
994  tmmp->next->orgname = tmmp->orgname;
995  tmmp->next->min = tmmp->min;
996  tmmp->next->max = tmmp->max;
997  tmmp->next->skip = tmmp->skip;
998  tmmp->next->next = mmpnext;
999  tmmp->orgname = tsfbp->name;
1000  tmmp->min = min;
1001  tmmp->max = max;
1002  tmmp->skip = tsfbp->skip;
1003  break;
1004  }
 /* reached the tail: append */
1005  if (! tmmp->next) {
1006  tmmp->next = new MinMax;
1007  tmmp->next->orgname = tsfbp->name;
1008  tmmp->next->min = min;
1009  tmmp->next->max = max;
1010  tmmp->next->skip = tsfbp->skip;
1011  break;
1012  }
1013  }
1014  }
1015 
1016  if (! p)
1017  break;
1018  q = p;
1019  }
1020  }
1021  if (loc)
1022  MemFree(loc);
1023 
 /* skip the first node; coverage must start at position 1 */
1024  mmp = mmp->next;
1025  if (! mmp || mmp->min != 1)
1026  return (1);
1027 
 /* extend the covered prefix over every interval that overlaps or
  * directly follows it */
1028  for (max = mmp->max; mmp; mmp = mmp->next)
1029  if (mmp->max > max && mmp->min <= max + 1)
1030  max = mmp->max;
1031 
1032  if ((size_t)max < len)
1033  return (1);
1034 
 /* i counts all features; count, tgs and sporg count the full-length
  * ones, the transgenic ones among them, and those with a special
  * organism name */
1035  tgs = 0;
1036  count = 0;
1037  sporg = 0;
1038  for (tsfbp = sfbp, i = 0; tsfbp; tsfbp = tsfbp->next, i++) {
1039  if (! tsfbp->full)
1040  continue;
1041 
1042  if (fta_if_special_org(tsfbp->name))
1043  sporg++;
1044 
1045  count++;
1046  if (tsfbp->tg)
1047  tgs++;
1048  }
1049 
1050  if (count < 2)
1051  return (0);
1052  if (count > 2 || i > count || (tgs != 1 && sporg != 1))
1053  return (2);
1054  return (0);
1055 }
1056 
1057 /**********************************************************/
/* NOTE(review): the signature line (listing line 1058) is absent from this
 * view. Detects a focused source feature that is not full-length while the
 * list also contains a full-length source feature. Returns the location of
 * the last such focused feature in the list, or nullptr when there is no
 * full-length feature or no focused non-full-length feature. */
1059 {
1060  char* p = nullptr;
1061  bool whole = false;
1062 
1063  for (; sfbp; sfbp = sfbp->next) {
 /* a feature that is both full and focused only counts as "whole" */
1064  if (sfbp->full)
1065  whole = true;
1066  else if (sfbp->focus)
1067  p = sfbp->location;
1068  }
1069 
1070  if (whole)
1071  return (p);
1072  return nullptr;
1073 }
1074 
1075 /**********************************************************/
1076 static bool CheckSYNTGNDivision(SourceFeatBlkPtr sfbp, char* div)
1077 {
1078  char* p;
1079  bool got;
1080  bool ret;
1081  Int4 syntgndiv;
1082  Char ch;
1083 
1084  syntgndiv = 0;
1085  if (div && *div != '\0') {
1086  if (StringEqu(div, "SYN"))
1087  syntgndiv = 1;
1088  else if (StringEqu(div, "TGN"))
1089  syntgndiv = 2;
1090  }
1091 
1092  for (ret = true, got = false; sfbp; sfbp = sfbp->next) {
1093  if (! sfbp->tg)
1094  continue;
1095 
1096  if (syntgndiv == 0) {
1097  p = sfbp->location;
1098  if (p && StringLen(p) > 50) {
1099  ch = p[50];
1100  p[50] = '\0';
1101  } else
1102  ch = '\0';
1103  ErrPostEx(SEV_REJECT, ERR_DIVISION_TransgenicNotSYN_TGN, "Source feature located at \"%s\" has a /transgenic qualifier, but this record is not in the SYN or TGN division.", p ? p : "unknown");
1104  if (ch != '\0')
1105  p[50] = ch;
1106  ret = false;
1107  }
1108 
1109  if (sfbp->full)
1110  got = true;
1111  }
1112 
1113  if (syntgndiv == 2 && ! got)
1114  ErrPostEx(SEV_ERROR, ERR_DIVISION_TGNnotTransgenic, "This record uses the TGN division code, but there is no full-length /transgenic source feature.");
1115  return (ret);
1116 }
1117 
1118 /**********************************************************/
/* NOTE(review): the signature line (listing line 1119) is absent from this
 * view. Checks how /transgenic and /focus are used across the source
 * features of one entry. Return values:
 *   0 - nothing to report (also for an empty list)
 *   1 - /transgenic on a feature that is not full-length
 *   2 - /focus and /transgenic are both present
 *   3 - more than one /transgenic feature
 *   4 - the non-skipped features differ in organism name, and there is
 *       neither a full-length transgenic feature nor a focused one
 *   5 - a /transgenic feature exists, yet all features agree on name,
 *       strain, isolate and organelle */
1120 {
1121  SourceFeatBlkPtr tsfbp;
1122  char* taxname;
1123  bool same;
1124  bool tgfull;
1125 
1126  if (! sfbp)
1127  return (0);
1128 
1129  Int4 ret = 0;
1130  bool tgs = false;
1131  bool focus = false;
 /* first pass: stops at the first problem found in list order */
1132  for (tsfbp = sfbp; tsfbp; tsfbp = tsfbp->next) {
1133  if (tsfbp->tg) {
1134  if (! tsfbp->full)
1135  ret = 1; /* /transgenic on not full-length */
1136  else if (tgs)
1137  ret = 3; /* multiple /transgenics */
1138  if (ret != 0)
1139  break;
1140  tgs = true;
1141  }
1142  if (tsfbp->focus)
1143  focus = true;
1144  if (tgs && focus) {
1145  ret = 2; /* /focus and /transgenic */
1146  break;
1147  }
1148  }
1149 
1150  if (ret != 0)
1151  return (ret);
1152 
 /* second pass: features with the skip flag are left out; focus keeps
  * the value computed by the first pass and may only become true here */
1153  same = true;
1154  tgfull = false;
1155  taxname = nullptr;
1156  for (tsfbp = sfbp; tsfbp; tsfbp = tsfbp->next) {
1157  if (tsfbp->skip)
1158  continue;
1159  if (! taxname)
1160  taxname = tsfbp->name;
1161  else if (same && ! fta_strings_same(taxname, tsfbp->name))
1162  same = false;
1163  if (tsfbp->tg && tsfbp->full)
1164  tgfull = true;
1165  if (tsfbp->focus)
1166  focus = true;
1167  }
1168 
1169  if (same == false && tgfull == false && focus == false)
1170  return (4);
1171 
 /* the last check needs at least two features and a /transgenic one */
1172  if (! sfbp->next || ! tgs)
1173  return (0);
1174 
 /* compare every further feature with the first one */
1175  for (tsfbp = sfbp->next; tsfbp; tsfbp = tsfbp->next)
1176  if (fta_strings_same(sfbp->name, tsfbp->name) == false ||
1177  fta_strings_same(sfbp->strain, tsfbp->strain) == false ||
1178  fta_strings_same(sfbp->isolate, tsfbp->isolate) == false ||
1179  fta_strings_same(sfbp->organelle, tsfbp->organelle) == false)
1180  break;
1181 
1182  if (! tsfbp)
1183  return (5); /* all source features have the same
1184  /organism, /strain, /isolate and
1185  /organelle qualifiers */
1186  return (0);
1187 }
1188 
1189 /**********************************************************/
1190 static Int4 CheckFocusInOrgs(SourceFeatBlkPtr sfbp, size_t len, int* status)
1191 {
1192  SourceFeatBlkPtr tsfbp;
1193  const char** b;
1194  char* name;
1195  string pat;
1196  Int4 count;
1197  bool same;
1198 
1199  count = 0;
1200  name = nullptr;
1201  same = true;
1202  for (tsfbp = sfbp; tsfbp; tsfbp = tsfbp->next) {
1203  if (! tsfbp->name)
1204  continue;
1205  if (tsfbp->focus)
1206  count++;
1207  if (! name)
1208  name = tsfbp->name;
1209  else if (NStr::CompareNocase(name, tsfbp->name) != 0)
1210  same = false;
1211  }
1212  if (same && count > 0)
1213  (*status)++;
1214 
1215  name = nullptr;
1216  for (tsfbp = sfbp; tsfbp; tsfbp = tsfbp->next) {
1217  if (! tsfbp->focus || ! tsfbp->name)
1218  continue;
1219  if (! name)
1220  name = tsfbp->name;
1221  else if (NStr::CompareNocase(name, tsfbp->name) != 0)
1222  break;
1223  }
1224  if (tsfbp)
1225  return (2);
1226 
1227  if (same || count != 0)
1228  return (0);
1229 
1230  name = nullptr;
1231  pat = "1.." + to_string(len);
1232  for (tsfbp = sfbp; tsfbp; tsfbp = tsfbp->next) {
1233  if (! tsfbp->name || ! tsfbp->location || tsfbp->skip)
1234  continue;
1235 
1236  for (b = special_orgs; *b; b++) {
1237  if (NStr::CompareNocase(*b, tsfbp->name) == 0 &&
1238  StringEqu(tsfbp->location, pat.c_str()))
1239  break;
1240  }
1241  if (*b)
1242  continue;
1243 
1244  if (! name)
1245  /** name = tsfbp->namstr;*/
1246  name = tsfbp->name;
1247  /** else if(NStr::CompareNocase(name, tsfbp->namstr) != 0)*/
1248  else if (NStr::CompareNocase(name, tsfbp->name) != 0)
1249  break;
1250  }
1251 
1252  if (! tsfbp)
1253  return (0);
1254 
1255  for (tsfbp = sfbp; tsfbp; tsfbp = tsfbp->next) {
1256  if (tsfbp->full && tsfbp->tg && ! tsfbp->skip)
1257  break;
1258  }
1259 
1260  if (tsfbp)
1261  return (0);
1262  return (3);
1263 }
1264 
1265 /**********************************************************/
1266 static bool IfSpecialFeat(MinMaxPtr mmp, size_t len)
1267 {
1268  if ((mmp->min == 1 && (size_t)mmp->max == len) || mmp->skip)
1269  return true;
1270  return false;
1271 }
1272 
1273 /**********************************************************/
1274 static char* CheckSourceOverlap(MinMaxPtr mmp, size_t len)
1275 {
1276  MinMaxPtr tmmp;
1277  char* res;
1278 
1279  for (; mmp; mmp = mmp->next) {
1280  if (IfSpecialFeat(mmp, len))
1281  continue;
1282  for (tmmp = mmp->next; tmmp; tmmp = tmmp->next) {
1283  if (IfSpecialFeat(tmmp, len))
1284  continue;
1285  if (NStr::CompareNocase(mmp->orgname, tmmp->orgname) == 0)
1286  continue;
1287  if (tmmp->min <= mmp->max && tmmp->max >= mmp->min)
1288  break;
1289  }
1290  if (tmmp)
1291  break;
1292  }
1293  if (! mmp)
1294  return nullptr;
1295 
1296  stringstream ss;
1297  ss << "\"" << mmp->orgname << "\" at " << mmp->min << ".." << mmp->max
1298  << " vs \"" << tmmp->orgname << "\" at " << tmmp->min << ".." << tmmp->max;
1299  res = StringSave(ss.str());
1300  return res;
1301 }
1302 
1303 /**********************************************************/
/* NOTE(review): the line that carried this function's signature is absent
 * from this listing; judging by the body it receives a SourceFeatBlkPtr
 * named sfbp and returns an organism-name pointer.
 *
 * Returns nullptr when the list has fewer than two features, when every
 * later feature has the same name as the first (case-insensitive), or when
 * some feature has both `full` and `tg` set. Otherwise returns the name of
 * the first feature that is full-length, not transgenic, not listed in
 * special_orgs and for which StringIStr(name, "vector") finds nothing;
 * nullptr if there is none. */
1305 {
1306  SourceFeatBlkPtr tsfbp;
1307  const char** b;
1308 
1309  if (! sfbp || ! sfbp->next)
1310  return nullptr;
1311 
 /* Stop at the first later feature whose name differs from the head's. */
1312  for (tsfbp = sfbp->next; tsfbp; tsfbp = tsfbp->next)
1313  if (NStr::CompareNocase(sfbp->name, tsfbp->name) != 0)
1314  break;
1315 
1316  if (! tsfbp)
1317  return nullptr;
1318 
 /* A full-length transgenic feature anywhere ends the check. */
1319  for (tsfbp = sfbp; tsfbp; tsfbp = tsfbp->next)
1320  if (tsfbp->full && tsfbp->tg)
1321  break;
1322 
1323  if (tsfbp)
1324  return nullptr;
1325 
1326  for (; sfbp; sfbp = sfbp->next) {
1327  if (! sfbp->full || sfbp->tg)
1328  continue;
1329 
 /* *b is non-null after this loop only when the name is in special_orgs. */
1330  for (b = special_orgs; *b; b++)
1331  if (NStr::CompareNocase(*b, sfbp->name) == 0)
1332  break;
1333 
1334  if (*b)
1335  continue;
1336 
1337  if (! StringIStr(sfbp->name, "vector"))
1338  break;
1339  }
1340  if (! sfbp)
1341  return nullptr;
1342  return (sfbp->name);
1343 }
1344 
1345 /**********************************************************/
/* For every source feature that has no bio_src yet: build a COrg_ref from
 * its name and orgname, pass it through fta_fix_orgref(), and record in
 * `lookup` whether that call changed the Org-ref. When it did, the feature's
 * name is replaced by the returned taxname and CollectSubNames() is called
 * with the OrgMod sub-names found. The resulting CBioSource is then copied
 * to every later feature that has no bio_src and the same original namstr
 * (case-insensitive). */
1346 static void CreateRawBioSources(ParserPtr pp, SourceFeatBlkPtr sfbp, Int4 use_what)
1347 {
1348  SourceFeatBlkPtr tsfbp;
1349  char* namstr;
1350  const Char* cultivar;
1351  const Char* isolate;
1352  const Char* serotype;
1353  const Char* serovar;
1354  const Char* ecotype;
1355  const Char* specimen_voucher;
1356  const Char* strain;
1357  const Char* sub_species;
1358  const Char* sub_strain;
1359  const Char* variety;
1360 
1361  for (; sfbp; sfbp = sfbp->next) {
1362  if (sfbp->bio_src.NotEmpty())
1363  continue;
1364 
 /* Keep a private copy of namstr for the propagation loop below. */
1365  namstr = StringSave(sfbp->namstr);
1366  CRef<COrg_ref> org_ref(new COrg_ref);
1367  org_ref->SetTaxname(sfbp->name);
1368 
1369  if (sfbp->orgname.NotEmpty()) {
1370  org_ref->SetOrgname(*sfbp->orgname);
1371  }
1372 
 /* Snapshot the Org-ref so a change made by fta_fix_orgref() can be detected. */
1373  CRef<COrg_ref> t_org_ref(new COrg_ref);
1374  t_org_ref->Assign(*org_ref);
1375  fta_fix_orgref(pp, *org_ref, &pp->entrylist[pp->curindx]->drop, sfbp->genomename);
1376 
1377  if (t_org_ref->Equals(*org_ref))
1378  sfbp->lookup = false;
1379  else {
1380  sfbp->lookup = true;
1381  MemFree(sfbp->name);
1382  sfbp->name = StringSave(org_ref->GetTaxname());
1383 
1384  sfbp->orgname.Reset();
1385 
1386  cultivar = nullptr;
1387  isolate = nullptr;
1388  serotype = nullptr;
1389  serovar = nullptr;
1390  ecotype = nullptr;
1391  specimen_voucher = nullptr;
1392  strain = nullptr;
1393  sub_species = nullptr;
1394  sub_strain = nullptr;
1395  variety = nullptr;
1396  if (org_ref->IsSetOrgname() && org_ref->IsSetOrgMod()) {
1397  for (const auto& mod : org_ref->GetOrgname().GetMod()) {
 /* NOTE(review): the `case` labels of this switch are missing from this
    listing; presumably each assignment is guarded by the matching OrgMod
    subtype - confirm against the original file. */
1398  switch (mod->GetSubtype()) {
1400  cultivar = mod->GetSubname().c_str();
1401  break;
1403  isolate = mod->GetSubname().c_str();
1404  break;
1406  serotype = mod->GetSubname().c_str();
1407  break;
1409  serovar = mod->GetSubname().c_str();
1410  break;
1412  ecotype = mod->GetSubname().c_str();
1413  break;
1415  specimen_voucher = mod->GetSubname().c_str();
1416  break;
1418  strain = mod->GetSubname().c_str();
1419  break;
1421  sub_species = mod->GetSubname().c_str();
1422  break;
1424  sub_strain = mod->GetSubname().c_str();
1425  break;
1427  variety = mod->GetSubname().c_str();
1428  break;
1429  }
1430  }
1431  }
1432  CollectSubNames(sfbp, use_what, sfbp->name, cultivar, isolate, serotype, serovar, specimen_voucher, strain, sub_species, sub_strain, variety, ecotype);
1433  }
1434 
1435  sfbp->bio_src.Reset(new CBioSource);
1436  sfbp->bio_src->SetOrg(*org_ref);
1437 
 /* Share the result with later features that had the same original namstr. */
1438  for (tsfbp = sfbp->next; tsfbp; tsfbp = tsfbp->next) {
1439  if (tsfbp->bio_src.NotEmpty() || NStr::CompareNocase(namstr, tsfbp->namstr) != 0)
1440  continue;
1441 
1442  tsfbp->lookup = sfbp->lookup;
1443 
1444  tsfbp->bio_src.Reset(new CBioSource);
1445  tsfbp->bio_src->Assign(*sfbp->bio_src);
1446 
 /* name/namstr are copied over only when the lookup changed the Org-ref. */
1447  if (! sfbp->lookup)
1448  continue;
1449 
1450  MemFree(tsfbp->name);
1451  tsfbp->name = StringSave(sfbp->name);
1452 
1453  MemFree(tsfbp->namstr);
1454  tsfbp->namstr = StringSave(sfbp->namstr);
1455  }
1456  MemFree(namstr);
1457  }
1458 }
1459 
1460 /**********************************************************/
1462  SourceFeatBlkPtr what)
1463 {
 /* NOTE(review): the first line of the signature and the declaration of
    `prev` are missing from this listing.
    Unlinks `what` from the list headed by `where` and re-inserts it in
    front of `where`. Returns the new head (`what`), or `where` unchanged
    when `what` is already the head or is not found in the list. */
1465  SourceFeatBlkPtr tsfbp;
1466 
1467  if (what == where)
1468  return (where);
1469 
 /* Walk the list keeping `prev` one node behind `tsfbp`. */
1470  prev = where;
1471  for (tsfbp = where->next; tsfbp; tsfbp = tsfbp->next) {
1472  if (tsfbp == what)
1473  break;
1474  prev = tsfbp;
1475  }
1476  if (! tsfbp)
1477  return (where);
1478 
1479  prev->next = what->next;
1480  what->next = where;
1481  return (what);
1482 }
1483 
1484 /**********************************************************/
/* NOTE(review): the signature line and the declarations of `prev` and
 * `next` are missing from this listing; the function is called elsewhere in
 * this file as SourceFeatRemoveDups(sfbp).
 *
 * Walks the nodes after the head. A node with `useit` set is unlinked and
 * freed when each of its qualifiers other than "focus" has a counterpart in
 * the head's qualifiers with the same name and either no value on both
 * sides or equal values. Returns the head, sfbp. */
1486 {
1487  SourceFeatBlkPtr tsfbp;
1490 
1491  for (prev = sfbp, tsfbp = sfbp->next; tsfbp; tsfbp = next) {
 /* Remember the successor first: tsfbp may be freed below. */
1492  next = tsfbp->next;
1493  if (! tsfbp->useit) {
1494  prev = tsfbp;
1495  continue;
1496  }
1497 
1498  bool different = false;
1499  for (const auto& cur : tsfbp->quals) {
1500  const string& cur_qual = cur->GetQual();
1501  if (cur_qual == "focus")
1502  continue;
1503 
1504  bool found = false;
 /* Inside this loop `next` is a qualifier that shadows the outer node pointer. */
1505  for (const auto& next : sfbp->quals) {
1506  const string& next_qual = next->GetQual();
1507 
1508  if (next_qual == "focus" || next_qual != cur_qual)
1509  continue;
1510 
1511  if (! cur->IsSetVal() && ! next->IsSetVal()) {
1512  found = true;
1513  break;
1514  }
1515 
1516  if (cur->IsSetVal() && next->IsSetVal() &&
1517  cur->GetVal() == next->GetVal()) {
1518  found = true;
1519  break;
1520  }
1521  }
1522 
1523  if (! found) /* Different, leave as is */
1524  {
1525  different = true;
1526  break;
1527  }
1528  }
1529 
1530  if (different) /* Different, leave as is */
1531  {
1532  prev = tsfbp;
1533  continue;
1534  }
 /* Duplicate of the head: unlink it and free it; `prev` stays where it is. */
1535  prev->next = tsfbp->next;
1536  tsfbp->next = nullptr;
1537  SourceFeatBlkFree(tsfbp);
1538  }
1539  return (sfbp);
1540 }
1541 
1542 /**********************************************************/
1544  SourceFeatBlkPtr res)
1545 {
 /* NOTE(review): the first line of the signature is missing from this
    listing; the function is called elsewhere in this file as
    SourceFeatDerive(sfbp, res).
    Returns sfbp untouched when res is null. Otherwise creates a new node
    copied from `res`, puts it in front of the list, drops from its
    qualifiers every non-"focus" qualifier that is not matched (same name,
    and both values unset or equal) in every `useit` node other than `res`,
    and finally returns SourceFeatRemoveDups() of the new list. */
1546  SourceFeatBlkPtr tsfbp;
1547 
1548  if (! res)
1549  return (sfbp);
1550 
1551  tsfbp = SourceFeatBlkNew();
1552  tsfbp->name = res->name ? StringSave(res->name) : nullptr;
1553  tsfbp->namstr = res->namstr ? StringSave(res->namstr) : nullptr;
1554  tsfbp->location = res->location ? StringSave(res->location) : nullptr;
1555  tsfbp->full = res->full;
1556  tsfbp->focus = res->focus;
1557  tsfbp->lookup = res->lookup;
1558  tsfbp->genome = res->genome;
1559  tsfbp->next = nullptr;
1560 
 /* NOTE(review): res->bio_src is dereferenced without an emptiness check;
    presumably callers guarantee it is set - confirm. */
1561  tsfbp->bio_src.Reset(new CBioSource);
1562  tsfbp->bio_src->Assign(*res->bio_src);
1563 
1564  tsfbp->orgname.Reset(new COrgName);
1565  if (res->orgname.NotEmpty())
1566  tsfbp->orgname->Assign(*res->orgname);
1567 
 /* From here on sfbp is the new node and the old list hangs off sfbp->next. */
1568  tsfbp->quals = res->quals;
1569  tsfbp->next = sfbp;
1570  sfbp = tsfbp;
1571 
1572  for (TQualVector::iterator cur = sfbp->quals.begin(); cur != sfbp->quals.end();) {
1573  const string& cur_qual = (*cur)->GetQual();
1574  if (cur_qual == "focus") {
1575  ++cur;
1576  continue;
1577  }
1578 
 /* The loop stops early, leaving tsfbp non-null, at the first node that lacks the qualifier. */
1579  for (tsfbp = sfbp->next; tsfbp; tsfbp = tsfbp->next) {
1580  if (tsfbp == res || ! tsfbp->useit)
1581  continue;
1582 
1583  bool found = false;
1584  for (const auto& next : tsfbp->quals) {
1585  const string& next_qual = next->GetQual();
1586 
1587  if (next_qual == "focus" || next_qual != cur_qual)
1588  continue;
1589 
1590  if (! (*cur)->IsSetVal() && ! next->IsSetVal()) {
1591  found = true;
1592  break;
1593  }
1594 
1595  if ((*cur)->IsSetVal() && next->IsSetVal() &&
1596  (*cur)->GetVal() == next->GetVal()) {
1597  found = true;
1598  break;
1599  }
1600  }
1601 
1602  if (! found) /* Not found */
1603  break;
1604  }
1605 
1606  if (! tsfbp) /* Got the match */
1607  {
1608  ++cur;
1609  continue;
1610  }
1611 
 /* erase() returns the iterator to the following element, so no increment here. */
1612  cur = sfbp->quals.erase(cur);
1613  }
1614 
1615  return (SourceFeatRemoveDups(sfbp));
1616 }
1617 
1618 /**********************************************************/
/* NOTE(review): the signature line is missing from this listing; the body
 * works on a SourceFeatBlkPtr named sfbp and returns a list head or nullptr.
 *
 * Selection order, as implemented below:
 *  1. a single feature is returned as is (with a warning when not `full`);
 *  2. if all features share the first feature's name: when exactly one is
 *     not skipped it is moved to the front and duplicates are removed;
 *     otherwise the candidates are flagged `useit` and a node is derived
 *     from the first candidate;
 *  3. the first transgenic (`tg`) feature is moved to the front;
 *  4. a focused feature (the first non-skipped one, else the last focused
 *     one seen) is used to derive a node from the features with its name;
 *  5. the first `full` feature is moved to the front;
 *  6. otherwise the list is freed, an error is posted and nullptr returned. */
1620 {
1621  SourceFeatBlkPtr res;
1622  SourceFeatBlkPtr tsfbp;
1623 
1624  if (! sfbp->next) {
1625  if (! sfbp->full) {
1626  ErrPostEx(SEV_WARNING, ERR_SOURCE_SingleSourceTooShort, "Source feature does not span the entire length of the sequence.");
1627  }
1628  return (sfbp);
1629  }
1630 
1631  NCBI_UNUSED Int4 count_skip = 0;
1632  Int4 count_noskip = 0;
1633  bool same = true;
1634  for (tsfbp = sfbp; tsfbp; tsfbp = tsfbp->next) {
1635  if (NStr::CompareNocase(tsfbp->name, sfbp->name) != 0) {
1636  same = false;
1637  break;
1638  }
1639 
 /* res ends up as the last non-skipped feature seen; it is read below only when count_noskip == 1. */
1640  if (! tsfbp->skip) {
1641  res = tsfbp;
1642  count_noskip++;
1643  } else
1644  count_skip++;
1645  }
1646 
1647  if (same) {
1648  if (count_noskip == 1) {
1649  sfbp = SourceFeatMoveOneUp(sfbp, res);
1650  return (SourceFeatRemoveDups(sfbp));
1651  }
 /* Flag the non-skipped features, or all of them when every one is skipped. */
1652  for (res = nullptr, tsfbp = sfbp; tsfbp; tsfbp = tsfbp->next) {
1653  if (count_noskip != 0 && tsfbp->skip)
1654  continue;
1655  tsfbp->useit = true;
1656  if (! res)
1657  res = tsfbp;
1658  }
1659  return (SourceFeatDerive(sfbp, res));
1660  }
1661 
1662  for (tsfbp = sfbp; tsfbp; tsfbp = tsfbp->next) {
1663  if (tsfbp->tg)
1664  break;
1665  }
1666  if (tsfbp)
1667  return (SourceFeatMoveOneUp(sfbp, tsfbp));
1668 
 /* Stops at the first focused feature that is not skipped; otherwise res keeps the last focused one. */
1669  for (res = nullptr, tsfbp = sfbp; tsfbp; tsfbp = tsfbp->next) {
1670  if (! tsfbp->focus)
1671  continue;
1672  res = tsfbp;
1673  if (! tsfbp->skip)
1674  break;
1675  }
1676 
1677  if (res) {
1678  count_skip = 0;
1679  count_noskip = 0;
1680  for (tsfbp = sfbp; tsfbp; tsfbp = tsfbp->next) {
1681  if (NStr::CompareNocase(res->name, tsfbp->name) != 0)
1682  continue;
1683  tsfbp->useit = true;
1684  if (tsfbp->skip)
1685  count_skip++;
1686  else
1687  count_noskip++;
1688  }
 /* When a non-skipped feature with this name exists, skipped ones other than res are unflagged again. */
1689  if (count_noskip > 0) {
1690  for (tsfbp = sfbp; tsfbp; tsfbp = tsfbp->next) {
1691  if (NStr::CompareNocase(res->name, tsfbp->name) != 0)
1692  continue;
1693  if (res != tsfbp && tsfbp->skip)
1694  tsfbp->useit = false;
1695  }
1696  }
1697  return (SourceFeatDerive(sfbp, res));
1698  }
1699 
1700  for (tsfbp = sfbp; tsfbp; tsfbp = tsfbp->next) {
1701  if (! tsfbp->full)
1702  continue;
1703  res = tsfbp;
1704  break;
1705  }
1706  if (res) {
1707  sfbp = SourceFeatMoveOneUp(sfbp, res);
1708  return (SourceFeatRemoveDups(sfbp));
1709  }
1710 
1711  SourceFeatBlkSetFree(sfbp);
1712  ErrPostEx(SEV_ERROR, ERR_SOURCE_MissingSourceFeatureForDescr, "Could not select the right source feature among different organisms to create descriptor: no /focus and 1..N one. Entry dropped.");
1713  return nullptr;
1714 }
1715 
1716 /**********************************************************/
1717 static void AddOrgMod(COrg_ref& org_ref, const Char* val, COrgMod::ESubtype type)
1718 {
1719  COrgName& orgname = org_ref.SetOrgname();
1720 
1721  CRef<COrgMod> mod(new COrgMod);
1722  mod->SetSubtype(type);
1723  mod->SetSubname(val ? val : "");
1724 
1725  orgname.SetMod().push_back(mod);
1726 }
1727 
1728 /**********************************************************/
/* NOTE(review): the signature line and the header of the block that is
 * closed on the line numbered 1742 are missing from this listing; the
 * function is called elsewhere in this file as
 * FTASubSourceAdd(bio, val_ptr, subtype).
 *
 * Appends a CSubSource of subtype `type` named `val` ("" when val is null)
 * to `bio`, unless the scan below finds an existing entry of that subtype. */
1730 {
1732  bool found = false;
1733  for (const auto& subtype : bio.GetSubtype()) {
1734  if (subtype->GetSubtype() == type) {
1735  found = true;
1736  break;
1737  }
1738  }
1739 
 /* An entry of this subtype already exists: nothing to add. */
1740  if (found)
1741  return;
1742  }
1743 
1744  CRef<CSubSource> sub(new CSubSource);
1745  sub->SetSubtype(type);
1746  sub->SetName(val ? val : "");
1747  bio.SetSubtype().push_back(sub);
1748 }
1749 
1750 /**********************************************************/
/* Walks the qualifiers of a source feature and updates `bio` accordingly:
 *  - qualifiers listed in SourceBadQuals produce an "unwanted qualifier"
 *    warning (value shown truncated to 50 characters, "???" when empty);
 *  - qualifiers named in SourceSubSources are added as SubSource entries
 *    through FTASubSourceAdd();
 *  - qualifiers named in SourceOrgMods are added as OrgMods through
 *    AddOrgMod(), unless an OrgMod of that kind was already present on
 *    entry, or the qualifier is "type_material" and taxserver is non-zero.
 * "organism" qualifiers are ignored. Does nothing when bio has no Org. */
1751 static void CheckQualsInSourceFeat(CBioSource& bio, TQualVector& quals, Uint1 taxserver)
1752 {
1753  const Char** b;
1754 
1755  char* p;
1756 
1757  if (! bio.CanGetOrg())
1758  return;
1759 
 /* Names of the OrgMod kinds that bio already carries on entry. */
1760  vector<string> modnames;
1761 
1762  if (bio.GetOrg().CanGetOrgname() && bio.GetOrg().GetOrgname().CanGetMod()) {
1763  for (const auto& mod : bio.GetOrg().GetOrgname().GetMod()) {
1764  for (const auto& it : SourceOrgMods) {
1765  if (it.num != mod->GetSubtype())
1766  continue;
1767 
1768  modnames.push_back(it.name);
1769  break;
1770  }
1771  }
1772  }
1773 
1774  for (const auto& cur : quals) {
1775  if (! cur->IsSetQual() || cur->GetQual() == "organism")
1776  continue;
1777 
1778  const string& cur_qual = cur->GetQual();
1779  const Char* val_ptr = cur->IsSetVal() ? cur->GetVal().c_str() : nullptr;
1780 
 /* NOTE(review): the statement that handled "note" qualifiers is missing from this listing. */
1781  if (cur_qual == "note") {
1783  continue;
1784  }
1785 
1786  for (b = SourceBadQuals; *b; b++) {
1787  if (cur_qual != *b)
1788  continue;
1789 
1790  if (! val_ptr || val_ptr[0] == '\0')
1791  p = StringSave("???");
1792  else
1793  p = StringSave(val_ptr);
 /* Limit the value shown in the message to 50 characters. */
1794  if (StringLen(p) > 50)
1795  p[50] = '\0';
1796  ErrPostEx(SEV_WARNING, ERR_SOURCE_UnwantedQualifiers, "Unwanted qualifier on source feature: %s=%s", cur_qual.c_str(), p);
1797  MemFree(p);
1798  }
1799 
 /* The position in SourceSubSources, counted from eSubtype_chromosome, is used as the subtype value. */
1800  b = SourceSubSources;
1801  for (int i = CSubSource::eSubtype_chromosome; *b; i++, b++) {
1802  if (**b != '\0' && cur_qual == *b) {
1803  FTASubSourceAdd(bio, val_ptr, static_cast<CSubSource::ESubtype>(i));
1804  break;
1805  }
1806  }
1807 
1808  if (cur_qual == "organism" ||
1809  (taxserver != 0 && cur_qual == "type_material"))
1810  continue;
1811 
1812  if (find(modnames.begin(), modnames.end(), cur_qual) != modnames.end())
1813  continue;
1814 
1815  for (const auto& it : SourceOrgMods) {
1816  if (cur_qual == it.name) {
1817  AddOrgMod(bio.SetOrg(), val_ptr, it.num);
1818  break;
1819  }
1820  }
1821  }
1822 }
1823 
1824 /**********************************************************/
/* NOTE(review): the signature line and several `if` headers (before the
 * lines numbered 1850, 1867, 1898 and 1909) are missing from this listing;
 * the function is called elsewhere in this file as
 * GetSourceDbtag(qual, source).
 *
 * Turns a "db_xref" qualifier of the form "<db>:<id>" into a CDbtag.
 * Returns an empty CRef when the qualifier is not a db_xref, has no ':' or
 * nothing after it, names the "taxon" database, or fails the database-name
 * checks below (an error is posted in the last case). The obsolete "IFO"
 * database is rewritten to "NBRC", in the qualifier as well. */
1826 {
1827  const char** b;
1828  const char* q;
1829  char* p;
1830 
1831  CRef<CDbtag> tag;
1832 
1833  if (qual->GetQual() != "db_xref")
1834  return tag;
1835 
 /* Work on a mutable, NUL-terminated copy of the value. */
1836  std::vector<Char> val_buf(qual->GetVal().begin(), qual->GetVal().end());
1837  val_buf.push_back(0);
1838 
1839  p = StringChr(&val_buf[0], ':');
1840  if (! p || p[1] == '\0')
1841  return tag;
1842 
 /* Split at the colon so that val_buf holds only the database name. */
1843  *p = '\0';
1844  if (NStr::CompareNocase(&val_buf[0], "taxon") == 0) {
1845  *p = ':';
1846  return tag;
1847  }
1848 
 /* q becomes a printable name of the record source, used in messages only. */
1850  q = "NCBI";
1851  else if (source == Parser::ESource::EMBL)
1852  q = "EMBL";
1853  else if (source == Parser::ESource::DDBJ)
1854  q = "DDBJ";
1855  else if (source == Parser::ESource::SPROT)
1856  q = "SwissProt";
1857  else if (source == Parser::ESource::LANL)
1858  q = "LANL";
1859  else if (source == Parser::ESource::Refseq)
1860  q = "RefSeq";
1861  else
1862  q = "Unknown";
1863 
1867  *p = ':';
1868  ErrPostEx(SEV_ERROR, ERR_SOURCE_InvalidDbXref, "Cannot process source feature's \"/db_xref=%s\" for source \"%s\".", &val_buf[0], q);
1869  return tag;
1870  }
1871 
1872  for (b = ObsoleteSourceDbxrefTag; *b; b++) {
1873  if (NStr::CompareNocase(*b, &val_buf[0]) == 0)
1874  break;
1875  }
1876 
1877  if (*b) {
1878  ErrPostEx(SEV_WARNING, ERR_SOURCE_ObsoleteDbXref, "/db_xref type \"%s\" is obsolete.", &val_buf[0]);
1879  if (NStr::CompareNocase(&val_buf[0], "IFO") == 0) {
1880  string line("NBRC:");
1881  line.append(p + 1);
1882  qual->SetVal(line);
1883 
1884  val_buf.assign(line.begin(), line.end());
1885  val_buf.push_back(0);
1886 
 /* Re-split the rebuilt buffer: the colon of "NBRC:" is at offset 4. */
1887  p = &val_buf[0] + 4;
1888  *p = '\0';
1889  }
1890  }
1891 
 /* Look the database name up in the accepted-tag tables; *b stays null when no table lists it. */
1892  for (b = DENLRSourceDbxrefTag; *b; b++) {
1893  if (NStr::CompareNocase(*b, &val_buf[0]) == 0)
1894  break;
1895  }
1896 
1898  for (b = DESourceDbxrefTag; *b; b++)
1899  if (NStr::CompareNocase(*b, &val_buf[0]) == 0)
1900  break;
1901  }
1902  if (! *b && source == Parser::ESource::EMBL) {
1903  for (b = ESourceDbxrefTag; *b; b++)
1904  if (NStr::CompareNocase(*b, &val_buf[0]) == 0)
1905  break;
1906  }
1909  for (b = NLRSourceDbxrefTag; *b; b++) {
1910  if (NStr::CompareNocase(*b, &val_buf[0]) == 0)
1911  break;
1912  }
1913  }
1914 
1915  if (! *b) {
1916  *p = ':';
1917  ErrPostEx(SEV_ERROR, ERR_SOURCE_InvalidDbXref, "Invalid database name in source feature's \"/db_xref=%s\" for source \"%s\".", &val_buf[0], q);
1918  return tag;
1919  }
1920 
1921  tag.Reset(new CDbtag);
1922  tag->SetDb(&val_buf[0]);
1923 
 /* Restore the colon and scan the identifier part that follows it. */
1924  *p++ = ':';
1925  for (q = p; *p >= '0' && *p <= '9';)
1926  p++;
1927 
 /* All digits and no leading '0': store as an integer id, otherwise as a string. */
1928  if (*p == '\0' && *q != '0')
1929  tag->SetTag().SetId(atoi(q));
1930  else
1931  tag->SetTag().SetStr(q);
1932 
1933  return tag;
1934 }
1935 
1936 /**********************************************************/
/* NOTE(review): the signature line and a few statements (after the lines
 * numbered 1965, 1970, 2026, 2029 and 2077) are missing from this listing;
 * the body uses sfbp, source, ibp and taxserver.
 *
 * For every source feature that has a bio_src: handles db_xref, organelle
 * and country/geo_loc_name qualifiers, sets the genome of the BioSource,
 * then calls CheckQualsInSourceFeat() and fta_sort_biosource().
 * Returns false when an /organelle problem stopped processing ("Entry
 * dropped" was posted), true otherwise. */
1938 {
1939  char* div;
1940  char* tco;
1941  char* p;
1942  char* q;
1943 
1944  CBioSource::TGenome newgen;
1945  CBioSource::TGenome oldgen;
1946  Int2 i;
1947 
1948  bool is_syn = false;
1949  bool is_pat = false;
1950 
1951  div = ibp->division;
1952  if (div) {
1953  if (StringEqu(div, "SYN"))
1954  is_syn = true;
1955  else if (StringEqu(div, "PAT"))
1956  is_pat = true;
1957  }
1958  for (; sfbp; sfbp = sfbp->next) {
1959  if (sfbp->bio_src.Empty())
1960  continue;
1961 
1962  CBioSource& bio = *sfbp->bio_src;
1963 
 /* NOTE(review): the statement controlled by the next `if` is missing from this listing. */
1964  if (! sfbp->lookup) {
1965  if (is_syn && ! sfbp->tg)
1967  } else {
1968  if (bio.CanGetOrg() && bio.GetOrg().CanGetOrgname() &&
1969  bio.GetOrg().GetOrgname().CanGetDiv() &&
1970  bio.GetOrg().GetOrgname().GetDiv() == "SYN") {
1972  if (is_syn == false && is_pat == false) {
1973  const Char* taxname = nullptr;
1974  if (bio.GetOrg().CanGetTaxname() &&
1975  ! bio.GetOrg().GetTaxname().empty())
1976  taxname = bio.GetOrg().GetTaxname().c_str();
1977  ErrPostEx(SEV_ERROR, ERR_ORGANISM_SynOrgNameNotSYNdivision, "The NCBI Taxonomy DB believes that organism name \"%s\" is reserved for synthetic sequences, but this record is not in the SYN division.", taxname ? taxname : "not_specified");
1978  }
1979  }
1980  }
1981 
 /* -1 means "not determined yet"; newgen is set from /organelle, oldgen from a qualifier's own name. */
1982  newgen = -1;
1983  oldgen = -1;
1984 
1985  bool dropped = false;
1986  for (auto& cur : sfbp->quals) {
1987  if (! cur->IsSetQual() || cur->GetQual().empty())
1988  continue;
1989 
1990  const string& cur_qual = cur->GetQual();
 /* "geo_loc_name" is treated as "country" from here on. */
1991  string cq = cur_qual;
1992  if (cq == "geo_loc_name") {
1993  cq = "country";
1994  }
1995  if (cq == "db_xref") {
1996  CRef<CDbtag> dbtag = GetSourceDbtag(cur, source);
1997  if (dbtag.Empty())
1998  continue;
1999 
2000  bio.SetOrg().SetDb().push_back(dbtag);
2001  continue;
2002  }
2003 
2004  const Char* val_ptr = cur->IsSetVal() ? cur->GetVal().c_str() : nullptr;
2005  if (cq == "organelle") {
2006  if (! val_ptr || val_ptr[0] == '\0')
2007  continue;
2008 
 /* This `p` is a block-local const pointer that shadows the function-level `p`. */
2009  const char* p = StringChr(val_ptr, ':');
2010  if (p) {
2011  if (StringChr(p + 1, ':')) {
2012  ErrPostEx(SEV_ERROR, ERR_SOURCE_OrganelleQualMultToks, "More than 2 tokens found in /organelle qualifier: \"%s\". Entry dropped.", val_ptr);
2013  dropped = true;
2014  break;
2015  }
2016 
 /* The text before the colon is the organelle class. */
2017  string val_str(val_ptr, p);
2018  i = StringMatchIcase(OrganelleFirstToken, val_str.c_str());
2019  if (i < 0) {
2020  ErrPostEx(SEV_ERROR, ERR_SOURCE_OrganelleIllegalClass, "Illegal class in /organelle qualifier: \"%s\". Entry dropped.", val_ptr);
2021  dropped = true;
2022  break;
2023  }
2024  if (i == 4)
2025  ibp->got_plastid = true;
 /* NOTE(review): the first half of the statement below is missing from this listing. */
2026  if (newgen < 0)
2028  p + 1);
2029  } else {
 /* NOTE(review): the statement that assigns `i` in this branch is missing from this listing. */
2031  if (i < 0) {
2032  ErrPostEx(SEV_ERROR, ERR_SOURCE_OrganelleIllegalClass, "Illegal class in /organelle qualifier: \"%s\". Entry dropped.", val_ptr);
2033  dropped = true;
2034  break;
2035  }
2036  if (i == 4)
2037  ibp->got_plastid = true;
2038  if (newgen < 0)
2039  newgen = StringMatchIcase(GenomicSourceFeatQual, val_ptr);
2040  }
2041  continue;
2042  }
2043 
2044  if (oldgen < 0)
2045  oldgen = StringMatchIcase(GenomicSourceFeatQual, cq.c_str());
2046 
2047  if (cq != "country" ||
2048  ! val_ptr || val_ptr[0] == '\0')
2049  continue;
2050 
 /* Validate the country name: the text before the first ':', with surrounding blanks and tabs trimmed. */
2051  tco = StringSave(val_ptr);
2052  p = StringChr(tco, ':');
2053  if (p)
2054  *p = '\0';
2055  for (p = tco; *p == ' ' || *p == '\t';)
2056  p++;
2057  if (*p == '\0') {
2058  ErrPostEx(SEV_ERROR, ERR_SOURCE_InvalidCountry, "Empty country name in /country qualifier : \"%s\".", val_ptr);
2059  } else {
2060  for (q = p + 1; *q != '\0';)
2061  q++;
2062  for (q--; *q == ' ' || *q == '\t';)
2063  q--;
2064  *++q = '\0';
2065 
2066  bool valid_country = CCountries::IsValid(p);
2067  if (! valid_country) {
2068  valid_country = CCountries::WasValid(p);
2069 
2070  if (! valid_country)
2071  ErrPostEx(SEV_ERROR, ERR_SOURCE_InvalidCountry, "Country \"%s\" from /country qualifier \"%s\" is not a valid country name.", tco, val_ptr);
2072  else
2073  ErrPostEx(SEV_WARNING, ERR_SOURCE_FormerCountry, "Country \"%s\" from /country qualifier \"%s\" is a former country name which is no longer valid.", tco, val_ptr);
2074  }
2075  }
2076 
2077  MemFree(tco);
2079  }
2080 
2081  if (dropped)
2082  break;
2083 
 /* Genome precedence: /organelle value, then a qualifier's own name, then the value stored on the feature. */
2084  if (newgen > -1)
2085  bio.SetGenome(newgen);
2086  else if (oldgen > -1)
2087  bio.SetGenome(oldgen);
2088  else if (sfbp->genome != CBioSource::eGenome_unknown)
2089  bio.SetGenome(sfbp->genome);
2090 
2091  CheckQualsInSourceFeat(bio, sfbp->quals, taxserver);
2092  fta_sort_biosource(bio);
2093  }
2094 
 /* sfbp is non-null here only when the loop was left through `dropped`. */
2095  if (sfbp)
2096  return false;
2097 
2098  return true;
2099 }
2100 
2101 
2102 /**********************************************************/
2103 static bool is_a_space_char(Char c)
2104 {
2105  return c == ' ' || c == '\t';
2106 }
2107 
2108 /**********************************************************/
2109 static void CompareDescrFeatSources(SourceFeatBlkPtr sfbp, const CBioseq& bioseq)
2110 {
2111  SourceFeatBlkPtr tsfbp;
2112 
2113  if (! sfbp || ! bioseq.IsSetDescr())
2114  return;
2115 
2116  for (const auto& descr : bioseq.GetDescr().Get()) {
2117  if (! descr->IsSource())
2118  continue;
2119 
2120  const CBioSource& bio_src = descr->GetSource();
2121 
2122  if (! bio_src.IsSetOrg() || ! bio_src.GetOrg().IsSetTaxname() ||
2123  bio_src.GetOrg().GetTaxname().empty())
2124  continue;
2125 
2126  const string& taxname = bio_src.GetOrg().GetTaxname();
2127  string orgdescr;
2128  std::remove_copy_if(taxname.begin(), taxname.end(), std::back_inserter(orgdescr), is_a_space_char);
2129 
2130  string commdescr;
2131  if (bio_src.GetOrg().IsSetCommon()) {
2132  const string& common = bio_src.GetOrg().GetCommon();
2133  std::remove_copy_if(common.begin(), common.end(), std::back_inserter(commdescr), is_a_space_char);
2134  }
2135 
2136  for (tsfbp = sfbp; tsfbp; tsfbp = tsfbp->next) {
2137  if (tsfbp->name == nullptr || tsfbp->name[0] == '\0')
2138  continue;
2139 
2140  size_t name_len = strlen(tsfbp->name);
2141  string orgfeat;
2142  std::remove_copy_if(tsfbp->name, tsfbp->name + name_len, std::back_inserter(orgfeat), is_a_space_char);
2143 
2144  if (NStr::CompareNocase(orgdescr.c_str(), "unknown") == 0) {
2145  if (NStr::CompareNocase(orgdescr.c_str(), orgfeat.c_str()) == 0 ||
2146  (! commdescr.empty() && NStr::CompareNocase(commdescr.c_str(), orgfeat.c_str()) == 0)) {
2147  break;
2148  }
2149  } else {
2150  if (orgdescr == orgfeat || commdescr == orgfeat) {
2151  break;
2152  }
2153  }
2154  }
2155 
2156  if (! tsfbp) {
2157  ErrPostEx(SEV_ERROR, ERR_ORGANISM_NoSourceFeatMatch, "Organism name \"%s\" from OS/ORGANISM line does not exist in this record's source features.", taxname.c_str());
2158  }
2159  }
2160 }
2161 
2162 /**********************************************************/
/* NOTE(review): the signature line is missing from this listing; the body
 * uses sfbp, source and is_pat.
 *
 * Checks every feature whose Org-ref came from a lookup (`lookup` set) and
 * has an Org: when its lineage is absent or empty a message is posted, as a
 * warning for DDBJ/EMBL records with is_pat set and as SEV_REJECT
 * otherwise. Returns false as soon as a SEV_REJECT was posted, true when
 * the whole list was processed. */
2164 {
2165  const Char* p;
2166  ErrSev sev;
2167 
2168  for (; sfbp; sfbp = sfbp->next) {
2169  if (! sfbp->lookup || sfbp->bio_src.Empty() || ! sfbp->bio_src->IsSetOrg())
2170  continue;
2171 
2172  p = nullptr;
2173  if (sfbp->bio_src->GetOrg().IsSetOrgname() &&
2174  sfbp->bio_src->GetOrg().GetOrgname().IsSetLineage())
2175  p = sfbp->bio_src->GetOrg().GetOrgname().GetLineage().c_str();
2176 
2177  if (! p || *p == '\0') {
2178  if ((source == Parser::ESource::DDBJ || source == Parser::ESource::EMBL) && is_pat)
2179  sev = SEV_WARNING;
2180  else
2181  sev = SEV_REJECT;
2182  ErrPostEx(sev, ERR_SERVER_NoLineageFromTaxon, "Taxonomy lookup for organism name \"%s\" yielded an Org-ref that has no lineage.", sfbp->name);
2183  if (sev == SEV_REJECT)
2184  break;
2185  }
2186  }
 /* sfbp is null only when the loop ran to the end without a reject. */
2187  if (! sfbp)
2188  return true;
2189  return false;
2190 }
2191 
2192 /**********************************************************/
/* For every source feature whose Org-ref did not come from a lookup
 * (`lookup` not set) and that still has no lineage: take the lineage from
 * the Bioseq source descriptor with the same taxname (case-insensitive), or
 * use "Unclassified" when no such descriptor exists or its lineage is
 * empty. The chosen lineage is also stored on every later non-lookup
 * feature with the same name that has no non-empty lineage.
 * Messages are posted as SEV_INFO when taxserver is 0 and as SEV_WARNING
 * otherwise.
 * NOTE(review): the error-code argument lines of both ErrPostEx calls are
 * missing from this listing. */
2193 static void PropogateSuppliedLineage(CBioseq& bioseq,
2194  SourceFeatBlkPtr sfbp,
2195  Uint1 taxserver)
2196 {
2197  SourceFeatBlkPtr tsfbp;
2198 
2199  const Char* p;
2200 
2201  if (! bioseq.IsSetDescr() || ! sfbp)
2202  return;
2203 
2204  for (; sfbp; sfbp = sfbp->next) {
2205  if (sfbp->lookup || sfbp->bio_src.Empty() ||
2206  ! sfbp->bio_src->IsSetOrg() || ! sfbp->bio_src->GetOrg().IsSetTaxname() ||
2207  ! sfbp->name || *sfbp->name == '\0' ||
2208  sfbp->bio_src->GetOrg().GetTaxname().empty())
2209  continue;
2210 
2211  COrgName& orgname = sfbp->bio_src->SetOrg().SetOrgname();
2212 
 /* A non-empty lineage is kept; a set-but-empty one is cleared and filled in below. */
2213  if (orgname.IsSetLineage()) {
2214  if (! orgname.GetLineage().empty())
2215  continue;
2216 
2217  orgname.ResetLineage();
2218  }
2219 
2220  const string& taxname = sfbp->bio_src->GetOrg().GetTaxname();
2221  string lineage;
2222 
2223  bool found = false;
2224  for (const auto& descr : bioseq.GetDescr().Get()) {
2225  if (! descr->IsSource())
2226  continue;
2227 
2228  const CBioSource& bio_src = descr->GetSource();
2229 
2230  if (! bio_src.IsSetOrg() || ! bio_src.GetOrg().IsSetOrgname() ||
2231  ! bio_src.GetOrg().IsSetTaxname() || bio_src.GetOrg().GetTaxname().empty() ||
2232  ! bio_src.GetOrg().GetOrgname().IsSetLineage())
2233  continue;
2234 
 /* `lineage` is overwritten for every candidate; it is only used when `found` ends up true. */
2235  lineage = bio_src.GetOrg().GetOrgname().GetLineage();
2236  const string& cur_taxname = bio_src.GetOrg().GetTaxname();
2237 
2238  if (NStr::CompareNocase(cur_taxname.c_str(), taxname.c_str()) == 0) {
2239  found = true;
2240  break;
2241  }
2242  }
2243 
2244  if (! found) {
2245  ErrPostEx((taxserver == 0) ? SEV_INFO : SEV_WARNING,
2247  "Taxonomy lookup for organism name \"%s\" failed, and no matching organism exists in OS/ORGANISM lines, so lineage has been set to \"Unclassified\".",
2248  taxname.c_str());
2249  p = "Unclassified";
2250  } else {
2251  if (lineage.empty()) {
2252  ErrPostEx((taxserver == 0) ? SEV_INFO : SEV_WARNING,
2254  "Taxonomy lookup for organism name \"%s\" failed, and the matching organism from OS/ORGANISM lines has no lineage, so lineage has been set to \"Unclassified\".",
2255  taxname.c_str());
2256  p = "Unclassified";
2257  } else
2258  p = lineage.c_str();
2259  }
2260 
2261  orgname.SetLineage(p);
 /* Give the same lineage to later features that share this feature's name. */
2262  for (tsfbp = sfbp->next; tsfbp; tsfbp = tsfbp->next) {
2263  if (tsfbp->lookup || tsfbp->bio_src.Empty() ||
2264  ! tsfbp->bio_src->IsSetOrg() || ! tsfbp->bio_src->GetOrg().IsSetTaxname() ||
2265  ! tsfbp->name || *tsfbp->name == '\0' ||
2266  tsfbp->bio_src->GetOrg().GetTaxname().empty() ||
2267  NStr::CompareNocase(sfbp->name, tsfbp->name) != 0)
2268 
2269  continue;
2270 
2271  COrgName& torgname = tsfbp->bio_src->SetOrg().SetOrgname();
2272 
2273  if (torgname.IsSetLineage()) {
2274  if (! torgname.GetLineage().empty())
2275  continue;
2276  }
2277  torgname.SetLineage(p);
2278  }
2279  }
2280 }
2281 
2282 /**********************************************************/
2283 static bool CheckMoltypeConsistency(SourceFeatBlkPtr sfbp, string& moltype)
2284 {
2285  SourceFeatBlkPtr tsfbp;
2286  char* name;
2287  char* p;
2288  bool ret;
2289  Char ch;
2290 
2291  if (! sfbp)
2292  return true;
2293 
2294  for (tsfbp = sfbp; tsfbp; tsfbp = tsfbp->next)
2295  if (tsfbp->moltype)
2296  break;
2297 
2298  if (! tsfbp)
2299  return true;
2300 
2301  name = tsfbp->moltype;
2302  for (ret = true, tsfbp = sfbp; tsfbp; tsfbp = tsfbp->next) {
2303  if (! tsfbp->moltype) {
2304  ch = '\0';
2305  p = tsfbp->location;
2306  if (p && StringLen(p) > 50) {
2307  ch = p[50];
2308  p[50] = '\0';
2309  }
2310  ErrPostEx(SEV_ERROR, ERR_SOURCE_MissingMolType, "Source feature at \"%s\" lacks a /mol_type qualifier.", p ? p : "<empty>");
2311  if (ch != '\0')
2312  p[50] = ch;
2313  } else if (ret && ! StringEqu(name, tsfbp->moltype))
2314  ret = false;
2315  }
2316 
2317  if (ret)
2318  moltype = name;
2319 
2320  return (ret);
2321 }
2322 
2323 /**********************************************************/
/* NOTE(review): the signature line and the `if` header that belongs before
 * the line numbered 2378 are missing from this listing; the body uses sfbp
 * and ibp.
 *
 * Counts the source features and those carrying /environmental_sample, and
 * records the result in ibp->env_sample_qual. Returns false, after posting
 * SEV_REJECT, when only some features carry the qualifier and no
 * full-length feature names a special_orgs organism; the ENV-division
 * branch at the end can also return false. Returns true otherwise,
 * including when sfbp or ibp is null. */
2325 {
2326  const char** b;
2327 
2328  char* location;
2329  Int4 sources;
2330  Int4 envs;
2331  Char ch;
2332 
2333  if (! sfbp || ! ibp)
2334  return true;
2335 
2336  bool skip = false;
2337  location = nullptr;
2338  ibp->env_sample_qual = false;
2339  for (envs = 0, sources = 0; sfbp; sfbp = sfbp->next, sources++) {
2340  bool env_found = false;
2341  for (const auto& cur : sfbp->quals) {
2342  if (cur->IsSetQual() && cur->GetQual() == "environmental_sample") {
2343  env_found = true;
2344  break;
2345  }
2346  }
 /* `location` ends up pointing at the last feature that lacks the qualifier. */
2347  if (env_found)
2348  envs++;
2349  else
2350  location = sfbp->location;
2351 
2352  if (! sfbp->full || ! sfbp->name || sfbp->name[0] == '\0')
2353  continue;
2354 
 /* A full-length feature naming a special organism switches the consistency check off. */
2355  for (b = special_orgs; *b; b++) {
2356  if (NStr::CompareNocase(*b, sfbp->name) == 0)
2357  break;
2358  }
2359  if (*b)
2360  skip = true;
2361  }
2362 
2363  if (envs > 0) {
2364  ibp->env_sample_qual = true;
2365  if (! skip && envs != sources) {
 /* Temporarily cut the location to 50 characters for the message, then restore it. */
2366  if (location && StringLen(location) > 50) {
2367  ch = location[50];
2368  location[50] = '\0';
2369  } else
2370  ch = '\0';
2371  ErrPostEx(SEV_REJECT, ERR_SOURCE_InconsistentEnvSampQual, "Inconsistent /environmental_sample qualifier usage. Source feature at location \"%s\" lacks the qualifier.", location ? location : "unknown");
2372  if (ch != '\0')
2373  location[50] = ch;
2374  return false;
2375  }
2376  } else if (NStr::CompareNocase(ibp->division, "ENV") == 0) {
2378  ErrPostEx(SEV_ERROR, ERR_SOURCE_MissingEnvSampQual, "This ENV division record has source features that lack the /environmental_sample qualifier. It will not be placed in the ENV division until the qualifier is added.");
2379  else {
2380  ErrPostEx(SEV_REJECT, ERR_SOURCE_MissingEnvSampQual, "This ENV division record has source features that lack the /environmental_sample qualifier.");
2381  return false;
2382  }
2383  }
2384  return true;
2385 }
2386 
2387 /**********************************************************/
2388 static char* CheckPcrPrimersTag(char* str)
2389 {
2390  if (StringEquN(str, "fwd_name", 8) ||
2391  StringEquN(str, "rev_name", 8))
2392  str += 8;
2393  else if (StringEquN(str, "fwd_seq", 7) ||
2394  StringEquN(str, "rev_seq", 7))
2395  str += 7;
2396  else
2397  return nullptr;
2398 
2399  if (*str == ' ')
2400  str++;
2401  if (*str == ':')
2402  return (str + 1);
2403  return nullptr;
2404 }
2405 
2406 /**********************************************************/
/*
 * PopulatePcrPrimers: appends PCR primer CSubSource entries to the subtype
 * list of "bio" (bio.SetSubtype()), built from a linked list of primer
 * records.
 *
 *   bio   - BioSource that receives the new entries.
 *   ppp   - head of the primer record list; nothing is done when it is null.
 *   count - number of records; nothing is done when it is less than 1.
 *
 * With exactly one record its strings are used as they are (the two names
 * only when non-empty). With several records the values of each kind are
 * joined into one "(v1,v2,...)" string.
 *
 * NOTE(review): the embedded listing numbers skip one line after every
 * "sub.Reset(new CSubSource)" (e.g. 2425 -> 2427). Presumably that is the
 * call selecting the subtype of the entry - confirm against the repository.
 */
2407 static void PopulatePcrPrimers(CBioSource& bio, PcrPrimersPtr ppp, Int4 count)
2408 {
2409  PcrPrimersPtr tppp = nullptr;
2410 
2411  string str_fs;
2412  string str_rs;
2413  string str_fn;
2414  string str_rn;
2415  Int4 num_fn;
2416  Int4 num_rn;
2417 
2418  if (! ppp || count < 1)
2419  return;
2420 
2421  CBioSource::TSubtype& subs = bio.SetSubtype();
2422  CRef<CSubSource> sub;
2423 
 /* Single record: one entry per value, no joining needed. */
2424  if (count == 1) {
2425  sub.Reset(new CSubSource);
2427  sub->SetName(ppp->fwd_seq);
2428  subs.push_back(sub);
2429 
2430  sub.Reset(new CSubSource);
2432  sub->SetName(ppp->rev_seq);
2433  subs.push_back(sub);
2434 
2435  if (ppp->fwd_name && ppp->fwd_name[0] != '\0') {
2436  sub.Reset(new CSubSource);
2438  sub->SetName(ppp->fwd_name);
2439  subs.push_back(sub);
2440  }
2441 
2442  if (ppp->rev_name && ppp->rev_name[0] != '\0') {
2443  sub.Reset(new CSubSource);
2445  sub->SetName(ppp->rev_name);
2446  subs.push_back(sub);
2447  }
2448  return;
2449  }
2450 
 /* First pass: add up the lengths so the joined strings can be reserved,
    and count how many records carry a forward / reverse name. */
2451  size_t len_fs = 1,
2452  len_rs = 1,
2453  len_fn = 0,
2454  len_rn = 0;
2455  num_fn = 0;
2456  num_rn = 0;
2457  for (tppp = ppp; tppp; tppp = tppp->next) {
2458  len_fs += (StringLen(tppp->fwd_seq) + 1);
2459  len_rs += (StringLen(tppp->rev_seq) + 1);
2460  if (tppp->fwd_name && tppp->fwd_name[0] != '\0') {
2461  len_fn += (StringLen(tppp->fwd_name) + 1);
2462  num_fn++;
2463  }
2464  if (tppp->rev_name && tppp->rev_name[0] != '\0') {
2465  len_rn += (StringLen(tppp->rev_name) + 1);
2466  num_rn++;
2467  }
2468  }
2469 
2470  str_fs.reserve(len_fs);
2471  str_rs.reserve(len_rs);
2472  if (len_fn > 0)
2473  str_fn.reserve(len_fn + count - num_fn + 1);
2474  if (len_rn > 0)
2475  str_rn.reserve(len_rn + count - num_rn + 1);
2476 
 /* Second pass: every value is prefixed with ','. The name strings are
    built only when at least one record has such a name; records without
    one then still contribute an empty slot (a ',' with no text). */
2477  for (tppp = ppp; tppp; tppp = tppp->next) {
2478  str_fs.append(",");
2479  str_fs.append(tppp->fwd_seq);
2480  str_rs.append(",");
2481  str_rs.append(tppp->rev_seq);
2482  if (len_fn > 0) {
2483  str_fn.append(",");
2484  if (tppp->fwd_name && tppp->fwd_name[0] != '\0')
2485  str_fn.append(tppp->fwd_name);
2486  }
2487  if (len_rn > 0) {
2488  str_rn.append(",");
2489  if (tppp->rev_name && tppp->rev_name[0] != '\0')
2490  str_rn.append(tppp->rev_name);
2491  }
2492  }
2493 
 /* For each joined string the leading ',' is turned into '(' and a
    closing ')' is appended. */
2494  if (! str_fs.empty()) {
2495  str_fs[0] = '(';
2496  str_fs += ')';
2497  }
2498 
2499  sub.Reset(new CSubSource);
2501  sub->SetName(str_fs);
2502  subs.push_back(sub);
2503 
2504  if (! str_rs.empty()) {
2505  str_rs[0] = '(';
2506  str_rs += ')';
2507  }
2508 
2509  sub.Reset(new CSubSource);
2511  sub->SetName(str_rs);
2512  subs.push_back(sub);
2513 
 /* Name entries are added only when a name string was built at all. */
2514  if (! str_fn.empty()) {
2515  str_fn[0] = '(';
2516  str_fn += ')';
2517 
2518  sub.Reset(new CSubSource);
2520  sub->SetName(str_fn);
2521  subs.push_back(sub);
2522  }
2523 
2524  if (! str_rn.empty()) {
2525  str_rn[0] = '(';
2526  str_rn += ')';
2527 
2528  sub.Reset(new CSubSource);
2530  sub->SetName(str_rn);
2531  subs.push_back(sub);
2532  }
2533 }
2534 
2535 /**********************************************************/
/*
 * Body of PcrPrimersFree(PcrPrimersPtr ppp): releases a whole list of primer
 * records. For every node the four strings that are set are released with
 * MemFree and the node itself with delete.
 *
 * NOTE(review): the signature line before this brace and the declaration of
 * "next" (listing lines 2536 and 2538) are not visible in this listing -
 * confirm against the repository.
 */
2537 {
2539 
2540  for (; ppp; ppp = next) {
 /* Remember the successor before the node is deleted. */
2541  next = ppp->next;
2542  if (ppp->fwd_name)
2543  MemFree(ppp->fwd_name);
2544  if (ppp->fwd_seq)
2545  MemFree(ppp->fwd_seq);
2546  if (ppp->rev_name)
2547  MemFree(ppp->rev_name);
2548  if (ppp->rev_seq)
2549  MemFree(ppp->rev_seq);
2550  delete ppp;
2551  }
2552 }
2553 
2554 /**********************************************************/
/*
 * NOTE(review): the signature line before this brace (listing line 2555) is
 * not visible here. Judging by the call in ParseSourceFeat this is
 * presumably the body of a bool-returning ParsePcrPrimers(sfbp) - confirm.
 *
 * For every source feature in the list that has qualifiers and a BioSource,
 * each non-empty "PCR_primers" qualifier is parsed into one PcrPrimers
 * record (fwd_name, fwd_seq, rev_name, rev_seq). The records of a feature
 * are then passed to PopulatePcrPrimers and released.
 *
 * The first malformed qualifier posts a SEV_REJECT error, releases the
 * records collected so far and stops the processing.
 *
 * Returns true when the whole list was processed, false after a problem.
 */
2556 {
2557  PcrPrimersPtr ppp;
2558  PcrPrimersPtr tppp = nullptr;
2559 
2560  char* p;
2561  char* q;
2562  char* r;
2563  bool comma;
2564  bool bad_start;
2565  bool empty;
2566  Char ch;
2567  Int4 count;
 /* Negative values of "prev" are error markers: -1 is reported below as a
    wrong forward/reverse order, -2 as a missing fwd_seq/rev_seq. */
2568  Int4 prev; /* 1 = fwd_name, 2 = fwd_seq,
2569  3 = rev_name, 4 = rev_seq */
2570 
2571  bool got_problem = false;
2572  for (ppp = nullptr; sfbp; sfbp = sfbp->next) {
2573  if (sfbp->quals.empty() || sfbp->bio_src.Empty())
2574  continue;
2575 
2576  count = 0;
2577  for (const auto& cur : sfbp->quals) {
2578  if (cur->GetQual() != "PCR_primers" ||
2579  ! cur->IsSetVal() || cur->GetVal().empty())
2580  continue;
2581 
 /* One new record per qualifier, appended to the list headed by ppp. */
2582  count++;
2583  if (! ppp) {
2584  ppp = new PcrPrimers;
2585  tppp = ppp;
2586  } else {
2587  tppp->next = new PcrPrimers;
2588  tppp = tppp->next;
2589  }
2590 
2591  prev = 0;
 /* Work on a modifiable, NUL-terminated copy of the qualifier value. */
2592  std::vector<Char> val_buf(cur->GetVal().begin(), cur->GetVal().end());
2593  val_buf.push_back(0);
2594 
2595  for (comma = false, bad_start = false, p = &val_buf[0]; *p != '\0';) {
 /* q: text right after "tag:", or nullptr when p is not at a tag. */
2596  q = CheckPcrPrimersTag(p);
2597  if (! q) {
 /* Text that is not a tag is skipped character by character, except at
    the very beginning of the value where it is an error. */
2598  if (p != &val_buf[0]) {
2599  p++;
2600  continue;
2601  }
2602  bad_start = true;
2603  break;
2604  }
2605 
2606  if (*q == ' ')
2607  q++;
 /* Look for a comma that is followed (after one optional blank) by another
    tag; r then points at that tag. r is nullptr when this is the last
    field of the value. */
2608  for (r = q;;) {
2609  r = StringChr(r, ',');
2610  if (! r)
2611  break;
2612  if (*++r == ' ')
2613  r++;
2614  if (CheckPcrPrimersTag(r))
2615  break;
2616  }
 /* Step back onto the separator (the comma, or a blank right before it),
    cut the field value there and keep the overwritten character in ch so
    that it can be restored further down. */
2617  if (r) {
2618  r--;
2619  if (*r == ' ')
2620  r--;
2621  if (r > q && *(r - 1) == ' ')
2622  r--;
2623  ch = *r;
2624  *r = '\0';
2625  }
2626 
 /* Any comma still inside the cut field value is an embedded comma. */
2627  if (StringChr(q, ','))
2628  comma = true;
2629 
 /* Store the value in the member selected by the tag at p. A repeated tag
    of the same kind is appended to the stored string after a ':'. */
2630  empty = false;
2631  if (! q || *q == '\0')
2632  empty = true;
2633  else if (StringEquN(p, "fwd_name", 8)) {
2634  if (prev == 1)
2635  prev = -2;
2636  else if (prev > 2 && prev < 5)
2637  prev = -1;
2638  else {
2639  if (! tppp->fwd_name)
2640  tppp->fwd_name = StringSave(q);
2641  else {
2642  string s(tppp->fwd_name);
2643  s.append(":");
2644  s.append(q);
2645  MemFree(tppp->fwd_name);
2646  tppp->fwd_name = StringSave(s);
2647  }
2648  prev = 1;
2649  }
2650  } else if (StringEquN(p, "fwd_seq", 7)) {
2651  if (prev > 2 && prev < 5)
2652  prev = -1;
2653  else {
2654  if (! tppp->fwd_seq)
2655  tppp->fwd_seq = StringSave(q);
2656  else {
2657  string s(tppp->fwd_seq);
2658  s.append(":");
2659  s.append(q);
2660  MemFree(tppp->fwd_seq);
2661  tppp->fwd_seq = StringSave(s);
 /* An additional fwd_seq that was not directly preceded by a fwd_name
    adds an empty slot (a bare ':') to the stored names. */
2662  if (prev != 1) {
2663  if (! tppp->fwd_name)
2664  tppp->fwd_name = StringSave(":");
2665  else {
2666  string s(tppp->fwd_name);
2667  s.append(":");
2668  MemFree(tppp->fwd_name);
2669  tppp->fwd_name = StringSave(s);
2670  }
2671  }
2672  }
2673  prev = 2;
2674  }
2675  } else if (StringEquN(p, "rev_name", 8)) {
2676  if (prev == 3 || prev == 1)
2677  prev = -2;
2678  else {
2679  if (! tppp->rev_name)
2680  tppp->rev_name = StringSave(q);
2681  else {
2682  string s(tppp->rev_name);
2683  s.append(":");
2684  s.append(q);
2685  MemFree(tppp->rev_name);
2686  tppp->rev_name = StringSave(s);
2687  }
2688  prev = 3;
2689  }
 /* Remaining case: CheckPcrPrimersTag accepted the tag, so it is rev_seq. */
2690  } else {
2691  if (prev == 1)
2692  prev = -2;
2693  else {
2694  if (! tppp->rev_seq)
2695  tppp->rev_seq = StringSave(q);
2696  else {
2697  string s(tppp->rev_seq);
2698  s.append(":");
2699  s.append(q);
2700  MemFree(tppp->rev_seq);
2701  tppp->rev_seq = StringSave(s);
 /* Same empty-slot handling as for fwd_seq, applied to the reverse names. */
2702  if (prev != 3) {
2703  if (! tppp->rev_name)
2704  tppp->rev_name = StringSave(":");
2705  else {
2706  string s(tppp->rev_name);
2707  s.append(":");
2708  MemFree(tppp->rev_name);
2709  tppp->rev_name = StringSave(s);
2710  }
2711  }
2712  }
2713  prev = 4;
2714  }
2715  }
2716 
 /* Last field of the value: nothing was cut, nothing to restore. */
2717  if (! r)
2718  break;
2719 
 /* Put back the character that was overwritten when the field was cut. */
2720  *r++ = ch;
2721 
2722  if (comma || prev < 0 || empty)
2723  break;
2724 
 /* Move on to the start of the next tag. */
2725  if (ch == ' ')
2726  r++;
2727  if (*r == ' ')
2728  r++;
2729  p = r;
2730  }
2731 
 /* The value ended on a name that was not followed by its sequence. */
2732  if (prev == 1 || prev == 3)
2733  prev = -2;
2734 
2735  if (bad_start) {
2736  ErrPostEx(SEV_REJECT, ERR_QUALIFIER_InvalidPCRprimer, "Unknown text found at the beginning of /PCR_primers qualifier: \"%s\". Entry dropped.", &val_buf[0]);
2737  got_problem = true;
2738  break;
2739  }
2740 
2741  if (comma) {
2742  ErrPostEx(SEV_REJECT, ERR_QUALIFIER_PCRprimerEmbeddedComma, "Encountered embedded comma within /PCR_primers qualifier's field value: \"%s\". Entry dropped.", &val_buf[0]);
2743  got_problem = true;
2744  break;
2745  }
2746 
2747  if (prev == -1) {
2748  ErrPostEx(SEV_REJECT, ERR_QUALIFIER_InvalidPCRprimer, "Encountered incorrect order of \"forward\" and \"reversed\" sequences within /PCR_primers qualifier: \"%s\". Entry dropped.", &val_buf[0]);
2749  got_problem = true;
2750  break;
2751  }
2752 
2753  if (prev == -2) {
2754  ErrPostEx(SEV_REJECT, ERR_QUALIFIER_MissingPCRprimerSeq, "/PCR_primers qualifier \"%s\" is missing or has an empty required fwd_seq or rev_seq fields (or both). Entry dropped.", &val_buf[0]);
2755  got_problem = true;
2756  break;
2757  }
2758 
2759  if (empty) {
2760  ErrPostEx(SEV_REJECT, ERR_QUALIFIER_InvalidPCRprimer, "/PCR_primers qualifier \"%s\" has an empty field value. Entry dropped.", &val_buf[0]);
2761  got_problem = true;
2762  break;
2763  }
2764 
2765  if (! tppp->fwd_seq || tppp->fwd_seq[0] == '\0' ||
2766  ! tppp->rev_seq || tppp->rev_seq[0] == '\0') {
2767  ErrPostEx(SEV_REJECT, ERR_QUALIFIER_MissingPCRprimerSeq, "/PCR_primers qualifier \"%s\" is missing or has an empty required fwd_seq or rev_seq fields (or both). Entry dropped.", &val_buf[0]);
2768  got_problem = true;
2769  break;
2770  }
2771  }
2772 
 /* Leaving the outer loop here keeps sfbp non-null, which makes the
    function return false below. */
2773  if (got_problem) {
2774  PcrPrimersFree(ppp);
2775  break;
2776  }
2777 
2778  PopulatePcrPrimers(*sfbp->bio_src, ppp, count);
2779  PcrPrimersFree(ppp);
2780  ppp = nullptr;
2781  }
2782 
 /* sfbp is null only when the loop ran to the end of the list. */
2783  if (! sfbp)
2784  return true;
2785  return false;
2786 }
2787 
2788 /**********************************************************/
2790 {
2791  const char* Mmm[] = { "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", nullptr };
2792  const char** b;
2793  const char* q;
2794 
2795  char* p;
2796  char* r;
2797  char* val;
2798  Int4 year;
2799  Int4 month;
2800  Int4 day;
2801  Int4 bad;
2802  Int4 num_slash;
2803  Int4 num_T;
2804  Int4 num_colon;
2805  Int4 num_Z;
2806  size_t len;
2807 
2808  CTime time(CTime::eCurrent);
2809  CDate_std date(time);
2810 
2811  for (; sfbp; sfbp = sfbp->next) {
2812  if (sfbp->quals.empty() || sfbp->bio_src.Empty())
2813  continue;
2814 
2815  for (const auto& cur : sfbp->quals) {
2816  bad = 0;
2817  if (cur->GetQual() != "collection_date" ||
2818  ! cur->IsSetVal() || cur->GetVal().empty())
2819  continue;
2820 
2821  val = (char*)cur->GetVal().c_str();
2822  for (num_slash = 0, p = val; *p != '\0'; p++)
2823  if (*p == '/')
2824  num_slash++;
2825 
2826  if (num_slash > 1) {
2827  p = StringSave(sfbp->location);
2828  if (p && StringLen(p) > 50)
2829  p[50] = '\0';
2830  ErrPostEx(SEV_ERROR, ERR_SOURCE_InvalidCollectionDate, "/collection_date \"%s\" for source feature at \"%s\" has too many components.", val, p ? p : "unknown location");
2831  if (p)
2832  MemFree(p);
2833  continue;
2834  }
2835 
2836  for (val = (char*)cur->GetVal().c_str();;) {
2837  r = StringChr(val, '/');
2838  if (r)
2839  *r = '\0';
2840 
2841  len = StringLen(val);
2842 
2843  if (len == 4) {
2844  for (q = val; *q == '0';)
2845  q++;
2846  for (p = (char*)q; *p != '\0'; p++)
2847  if (*p < '0' || *p > '9')
2848  break;
2849  if (*p != '\0')
2850  bad = 1;
2851  else if (atoi(q) > date.GetYear())
2852  bad = 3;
2853  } else if (len == 8) {
2854  if (val[3] != '-')
2855  bad = 1;
2856  else {
2857  p = val;
2858  p[3] = '\0';
2859  if (source == Parser::ESource::DDBJ) {
2860  if (p[0] >= 'a' && p[0] <= 'z')
2861  p[0] &= ~040;
2862  if (p[1] >= 'A' && p[1] <= 'Z')
2863  p[1] |= 040;
2864  if (p[2] >= 'A' && p[2] <= 'Z')
2865  p[2] |= 040;
2866  }
2867  for (b = Mmm, month = 1; *b; b++, month++)
2868  if (StringEqu(*b, p))
2869  break;
2870  if (! *b)
2871  bad = 1;
2872  p[3] = '-';
2873  }
2874  if (bad == 0) {
2875  for (q = val + 4; *q == '0';)
2876  q++;
2877  for (p = (char*)q; *p != '\0'; p++)
2878  if (*p < '0' || *p > '9')
2879  break;
2880  if (*p != '\0')
2881  bad = 1;
2882  else {
2883  year = atoi(q);
2884  if (year > date.GetYear() ||
2885  (year == date.GetYear() && month > date.GetMonth()))
2886  bad = 3;
2887  }
2888  }
2889  } else if (len == 11) {
2890  if (val[2] != '-' || val[6] != '-')
2891  bad = 1;
2892  else {
2893  p = val;
2894  val[2] = '\0';
2895  val[6] = '\0';
2896  if (p[0] < '0' || p[0] > '3' || p[1] < '0' || p[1] > '9')
2897  bad = 1;
2898  else {
2899  if (*p == '0')
2900  p++;
2901  day = atoi(p);
2902  p = val + 3;
2903  if (source == Parser::ESource::DDBJ) {
2904  if (p[0] >= 'a' && p[0] <= 'z')
2905  p[0] &= ~040;
2906  if (p[1] >= 'A' && p[1] <= 'Z')
2907  p[1] |= 040;
2908  if (p[2] >= 'A' && p[2] <= 'Z')
2909  p[2] |= 040;
2910  }
2911  for (b = Mmm, month = 1; *b; b++, month++)
2912  if (StringEqu(*b, p))
2913  break;
2914  if (! *b)
2915  bad = 1;
2916  else {
2917  if (day < 1 || day > 31)
2918  bad = 2;
2919  else if (month == 2 && day > 29)
2920  bad = 2;
2921  else if ((month == 4 || month == 6 || month == 9 || month == 11) && day > 30)
2922  bad = 2;
2923  }
2924  }
2925  if (bad == 0) {
2926  for (q = val + 7; *q == '0';)
2927  q++;
2928  for (p = (char*)q; *p != '\0'; p++)
2929  if (*p < '0' || *p > '9')
2930  break;
2931  if (*p != '\0')
2932  bad = 1;
2933  else {
2934  year = atoi(q) - 1900;
2935  if (year > date.GetYear() ||
2936  (year == date.GetYear() && month > date.GetMonth()) ||
2937  (year == date.GetYear() && month == date.GetMonth() && day > date.GetDay()))
2938  bad = 3;
2939  }
2940  }
2941  val[2] = '-';
2942  val[6] = '-';
2943  }
2944  } else if (len == 7 || len == 10 || len == 14 || len == 17 ||
2945  len == 20) {
2946  num_T = 0;
2947  num_Z = 0;
2948  num_colon = 0;
2949  for (p = val; *p != '\0'; p++) {
2950  if ((*p < 'a' || *p > 'z') && (*p < 'A' || *p > 'Z') &&
2951  (*p < '0' || *p > '9') && *p != '-' && *p != '/' &&
2952  *p != ':') {
2953  bad = 3;
2954  break;
2955  }
2956  if (*p == ':')
2957  num_colon++;
2958  else if (*p == 'T')
2959  num_T++;
2960  else if (*p == 'Z')
2961  num_Z++;
2962  }
2963  if (len == 7 || len == 10) {
2964  if (num_T > 0)
2965  bad = 4;
2966  if (num_Z > 0)
2967  bad = 5;
2968  if (num_colon > 0)
2969  bad = 6;
2970  } else {
2971  if (num_Z > 1)
2972  bad = 5;
2973  if (num_T > 1)
2974  bad = 4;
2975  if ((len == 14 && num_colon > 0) ||
2976  (len == 17 && num_colon > 1) ||
2977  (len == 20 && num_colon > 2))
2978  bad = 6;
2979  }
2980  } else
2981  bad = 8;
2982 
2983  if (bad == 0) {
2984  if (! r)
2985  break;
2986 
2987  *r = '/';
2988  val = r + 1;
2989  continue;
2990  }
2991 
2992  p = StringSave(sfbp->location);
2993  if (p && StringLen(p) > 50)
2994  p[50] = '\0';
2995  if (bad == 1)
2996  q = "is not of the format DD-Mmm-YYYY, Mmm-YYYY, or YYYY";
2997  else if (bad == 2)
2998  q = "has an illegal day value for the stated month";
2999  else if (bad == 3)
3000  q = "has invalid characters";
3001  else if (bad == 4)
3002  q = "has too many time values";
3003  else if (bad == 5)
3004  q = "has too many Zulu indicators";
3005  else if (bad == 6)
3006  q = "has too many hour and minute delimiters";
3007  else
3008  q = "has not yet occured";
3009  ErrPostEx(SEV_ERROR, ERR_SOURCE_InvalidCollectionDate, "/collection_date \"%s\" for source feature at \"%s\" %s.", val, p ? p : "unknown location", q);
3010  if (p)
3011  MemFree(p);
3012 
3013  if (! r)
3014  break;
3015 
3016  *r = '/';
3017  val = r + 1;
3018  }
3019  }
3020  }
3021 }
3022 
3023 /**********************************************************/
/*
 * NOTE(review): the signature line before this brace (listing line 3024) is
 * not visible here. Judging by the call in ParseSourceFeat this is
 * presumably the body of a bool-returning CheckNeedSYNFocus(sfbp) - confirm.
 *
 * Returns false when the list holds fewer than two source features, or when
 * some feature flagged "full" has a name equal (ignoring case) to an entry
 * of special_orgs. Returns true otherwise.
 */
3025 {
3026  const char** b;
3027 
3028  if (! sfbp || ! sfbp->next)
3029  return false;
3030 
3031  for (; sfbp; sfbp = sfbp->next) {
3032  if (! sfbp->full)
3033  continue;
3034 
 /* special_orgs is scanned up to its terminating null entry. */
3035  for (b = special_orgs; *b; b++)
3036  if (NStr::CompareNocase(*b, sfbp->name) == 0)
3037  break;
3038 
 /* A non-null *b means a match was found: stop with sfbp still set. */
3039  if (*b)
3040  break;
3041  }
3042 
3043  if (sfbp)
3044  return false;
3045  return true;
3046 }
3047 
3048 /**********************************************************/
/*
 * CheckMetagenome: cross-checks the two places where a BioSource can say
 * that it is a metagenome - the lineage containing "metagenomes" and the
 * taxname containing "metagenome".
 *
 *   - neither is present: nothing is done;
 *   - both are present:   a CSubSource with an empty name is appended to
 *                         the subtype list of "bio";
 *   - only one of them:   a SEV_ERROR message names the missing part.
 *
 * Nothing is done either when "bio" has no Org set.
 *
 * NOTE(review): listing line 3074 (between the creation of "sub" and
 * SetName) is not visible here; presumably it selects the subtype of the
 * new entry - confirm against the repository.
 */
3049 static void CheckMetagenome(CBioSource& bio)
3050 {
3051  if (! bio.IsSetOrg())
3052  return;
3053 
3054  bool metatax = false;
3055  bool metalin = false;
3056 
3057  if (bio.IsSetOrgname() && bio.GetOrgname().IsSetLineage() &&
3058  StringStr(bio.GetOrgname().GetLineage().c_str(), "metagenomes"))
3059  metalin = true;
3060 
3061  if (bio.GetOrg().IsSetTaxname() &&
3062  StringStr(bio.GetOrg().GetTaxname().c_str(), "metagenome"))
3063  metatax = true;
3064 
3065  if (! metalin && ! metatax)
3066  return;
3067 
 /* Name used in the messages; "unknown" stands in for a missing or empty one. */
3068  const Char* taxname = bio.GetOrg().IsSetTaxname() ? bio.GetOrg().GetTaxname().c_str() : nullptr;
3069  if (! taxname || taxname[0] == 0)
3070  taxname = "unknown";
3071 
3072  if (metalin && metatax) {
3073  CRef<CSubSource> sub(new CSubSource);
3075  sub->SetName("");
3076  bio.SetSubtype().push_back(sub);
3077  } else if (! metalin)
3078  ErrPostEx(SEV_ERROR, ERR_ORGANISM_LineageLacksMetagenome, "Organism name \"%s\" contains \"metagenome\" but the lineage lacks the \"metagenomes\" classification.", taxname);
3079  else
3080  ErrPostEx(SEV_ERROR, ERR_ORGANISM_OrgNameLacksMetagenome, "Lineage includes the \"metagenomes\" classification but organism name \"%s\" lacks \"metagenome\".", taxname);
3081 }
3082 
3083 /**********************************************************/
3084 static bool CheckSubmitterSeqidQuals(SourceFeatBlkPtr sfbp, char* acc)
3085 {
3086  SourceFeatBlkPtr tsfbp;
3087  char* ssid;
3088  Int4 count_feat;
3089  Int4 count_qual;
3090 
3091  if (! sfbp)
3092  return (true);
3093 
3094  count_feat = 0;
3095  count_qual = 0;
3096  for (ssid = nullptr, tsfbp = sfbp; tsfbp; tsfbp = tsfbp->next) {
3097  count_feat++;
3098  if (! tsfbp->submitter_seqid)
3099  continue;
3100 
3101  count_qual++;
3102  if (tsfbp->submitter_seqid[0] == '\0') {
3103  ErrPostEx(SEV_REJECT, ERR_SOURCE_MultipleSubmitterSeqids, "Multiple /submitter_seqid qualifiers were encountered within source feature at location \"%s\". Entry dropped.", tsfbp->location ? tsfbp->location : "?empty?");
3104  break;
3105  }
3106 
3107  if (! ssid)
3108  ssid = tsfbp->submitter_seqid;
3109  else if (! StringEqu(ssid, tsfbp->submitter_seqid)) {
3110  ErrPostEx(SEV_REJECT, ERR_SOURCE_DifferentSubmitterSeqids, "Different /submitter_seqid qualifiers were encountered amongst source features: \"%s\" and \"%s\" at least. Entry dropped.", ssid, tsfbp->submitter_seqid);
3111  break;
3112  }
3113  }
3114 
3115  if (tsfbp)
3116  return (false);
3117 
3118  if (count_feat == count_qual)
3119  return (true);
3120 
3121  ErrPostEx(SEV_REJECT, ERR_SOURCE_LackingSubmitterSeqids, "One ore more source features are lacking /submitter_seqid qualifiers provided in others. Entry dropped.");
3122  return (false);
3123 }
3124 
3125 /**********************************************************/
/*
 * ParseSourceFeat: collects the source features of the current entry, runs
 * them through a series of consistency checks and, when all of them pass,
 * appends one CSeq_feat holding a BioSource per source feature to seq_feats.
 *
 *   pp        - parser state; the current entry is pp->entrylist[pp->curindx].
 *               pp->errstat and pp->buf are modified here.
 *   dbp, type - passed to CollectSourceFeats to gather the source features.
 *   seqids    - passed to GetSeqLocation when feature locations are built.
 *   bioseq    - passed to CompareDescrFeatSources and PropogateSuppliedLineage.
 *   seq_feats - receives the resulting features; it is cleared again when a
 *               feature location fails its range check.
 *
 * Every failed check posts its own error, releases what was allocated so
 * far and returns without adding anything to seq_feats.
 *
 * NOTE(review): listing lines 3188, 3250, 3366 and 3368 are not visible
 * here (the embedded numbers skip them); the statements guarded by the
 * neighbouring "if"/"else" lines must be confirmed against the repository.
 */
3126 void ParseSourceFeat(ParserPtr pp, DataBlkPtr dbp, TSeqIdList& seqids, Int2 type, CBioseq& bioseq, TSeqFeatList& seq_feats)
3127 {
3128  SourceFeatBlkPtr sfbp;
3129  SourceFeatBlkPtr tsfbp;
3130 
3131  MinMaxPtr mmp;
3132  IndexblkPtr ibp;
3133  char* res;
3134  char* acc;
3135  Int4 i;
3136  Int4 use_what = USE_ALL;
3137  bool err;
3138  ErrSev sev;
3139  bool need_focus;
3140  bool already;
3141 
3142  ibp = pp->entrylist[pp->curindx];
3143  acc = ibp->acnum;
3144  size_t len = ibp->bases;
3145 
3146  if (ibp->segnum < 2)
3147  pp->errstat = 0;
3148 
3149  sfbp = CollectSourceFeats(dbp, type);
3150  if (! sfbp) {
3151  ErrPostEx(SEV_REJECT, ERR_SOURCE_FeatureMissing, "Required source feature is missing. Entry dropped.");
3152  return;
3153  }
3154 
3155  RemoveSourceFeatSpaces(sfbp);
3156  CheckForExemption(sfbp);
3157 
 /* From here on every early return has to release the list in sfbp. */
3158  if (! CheckSourceFeatLocFuzz(sfbp)) {
3159  SourceFeatBlkSetFree(sfbp);
3160  return;
3161  }
3162 
3163  res = CheckSourceFeatLocAccs(sfbp, acc);
3164  if (res) {
3165  ErrPostEx(SEV_REJECT, ERR_SOURCE_BadLocation, "Source feature location points to another record: \"%s\". Entry dropped.", res);
3166  SourceFeatBlkSetFree(sfbp);
3167  return;
3168  }
3169 
3170  if (! SourceFeatStructFillIn(ibp, sfbp, use_what)) {
3171  ErrPostEx(SEV_REJECT, ERR_SOURCE_MultipleMolTypes, "Multiple /mol_type qualifiers were encountered within source feature. Entry dropped.");
3172  SourceFeatBlkSetFree(sfbp);
3173  return;
3174  }
3175 
 /* The /submitter_seqid check runs only when the entry has such an id; on
    failure the id stored in the index block is dropped as well. */
3176  if (! ibp->submitter_seqid.empty() && ! CheckSubmitterSeqidQuals(sfbp, acc)) {
3177  ibp->submitter_seqid.clear();
3178  SourceFeatBlkSetFree(sfbp);
3179  return;
3180  }
3181 
3182  if (! CheckMoltypeConsistency(sfbp, ibp->moltype)) {
3183  ErrPostEx(SEV_REJECT, ERR_SOURCE_InconsistentMolType, "Inconsistent /mol_type qualifiers were encountered. Entry dropped.");
3184  SourceFeatBlkSetFree(sfbp);
3185  return;
3186  }
3187 
 /* NOTE(review): the assignment to "res" tested below is on the listing
    line that is not visible here. */
3189  if (res) {
3190  ErrPostEx(SEV_REJECT, ERR_SOURCE_FocusAndTransposonNotAllowed, "/transposon (or /insertion_seq) qualifiers should not be used in conjunction with /focus. Source feature at \"%s\". Entry dropped.", res);
3191  SourceFeatBlkSetFree(sfbp);
3192  return;
3193  }
3194 
 /* "i" receives a status from CheckSourceFeatOrgs; it only selects which
    of the two messages is posted. */
3195  res = CheckSourceFeatOrgs(sfbp, &i);
3196  if (res) {
3197  if (i == 1) {
3198  ErrPostEx(SEV_REJECT, ERR_SOURCE_NoOrganismQual, "/organism qualifier contains only organell/genome name. No genus/species present. Source feature at \"%s\". Entry dropped.", res);
3199  } else {
3200  ErrPostEx(SEV_REJECT, ERR_SOURCE_OrganismIncomplete, "Required /organism qualifier is containing genome info only at \"%s\". Entry dropped.", res);
3201  }
3202  SourceFeatBlkSetFree(sfbp);
3203  return;
3204  }
3205 
3206  CompareDescrFeatSources(sfbp, bioseq);
3207 
3208  CreateRawBioSources(pp, sfbp, use_what);
3209 
3210  if (! CheckSourceLineage(sfbp, pp->source, ibp->is_pat)) {
3211  SourceFeatBlkSetFree(sfbp);
3212  return;
3213  }
3214 
3215  PropogateSuppliedLineage(bioseq, sfbp, pp->taxserver);
3216 
 /* mmp is owned by this function until the MinMaxFree call that follows
    CheckSourceOverlap; every early return before that releases it too. */
3217  mmp = new MinMax;
3218  i = CheckSourceFeatCoverage(sfbp, mmp, len);
3219  if (i != 0) {
3220  if (i == 1) {
3221  ErrPostEx(SEV_REJECT, ERR_SOURCE_IncompleteCoverage, "Supplied source features do not span every base of the sequence. Entry dropped.");
3222  } else {
3223  ErrPostEx(SEV_REJECT, ERR_SOURCE_ExcessCoverage, "Sequence is spanned by too many source features. Entry dropped.");
3224  }
3225  SourceFeatBlkSetFree(sfbp);
3226  MinMaxFree(mmp);
3227  return;
3228  }
3229 
3230  if (! CheckForENV(sfbp, ibp, pp->source)) {
3231  SourceFeatBlkSetFree(sfbp);
3232  MinMaxFree(mmp);
3233  return;
3234  }
3235 
3236  if (! CheckSYNTGNDivision(sfbp, ibp->division)) {
3237  SourceFeatBlkSetFree(sfbp);
3238  MinMaxFree(mmp);
3239  return;
3240  }
3241 
 /* A missing /focus is a rejection unless need_focus is false, which can
    only happen for EMBL records. */
3242  if (pp->source == Parser::ESource::EMBL)
3243  need_focus = CheckNeedSYNFocus(sfbp);
3244  else
3245  need_focus = true;
3246 
 /* "already" remembers that the missing /focus message was posted here, so
    that it is not posted a second time after CheckFocusInOrgs. */
3247  already = false;
3248  i = CheckTransgenicSourceFeats(sfbp);
3249  if (i == 5) {
 /* NOTE(review): the condition choosing between the two severities is on
    the listing line that is not visible here. */
3251  sev = SEV_WARNING;
3252  else
3253  sev = SEV_ERROR;
3254  ErrPostEx(sev, ERR_SOURCE_TransSingleOrgName, "Use of /transgenic requires at least two source features with differences among /organism, /strain, /organelle, and /isolate, between the host and foreign organisms.");
3255  } else if (i > 0) {
3256  sev = SEV_REJECT;
3257  if (i == 1) {
3258  ErrPostEx(sev, ERR_SOURCE_TransgenicTooShort, "Source feature with /transgenic qualifier does not span the entire sequence. Entry dropped.");
3259  } else if (i == 2) {
3260  ErrPostEx(sev, ERR_SOURCE_FocusAndTransgenicQuals, "Both /focus and /transgenic qualifiers exist; these quals are mutually exclusive. Entry dropped.");
3261  } else if (i == 3) {
3262  ErrPostEx(sev, ERR_SOURCE_MultipleTransgenicQuals, "Multiple source features have /transgenic qualifiers. Entry dropped.");
3263  } else {
3264  already = true;
3265  if (! need_focus)
3266  sev = SEV_ERROR;
3267  ErrPostEx(sev, ERR_SOURCE_FocusQualMissing, "Multiple organism names exist, but no source feature has a /focus qualifier.%s", (sev == SEV_ERROR) ? "" : " Entry dropped.");
3268  }
3269 
3270  if (sev == SEV_REJECT) {
3271  SourceFeatBlkSetFree(sfbp);
3272  MinMaxFree(mmp);
3273  return;
3274  }
3275  }
3276 
3277  res = CheckWholeSourcesVersusFocused(sfbp);
3278  if (res) {
3279  ErrPostEx(SEV_REJECT, ERR_SOURCE_FocusQualNotFullLength, "/focus qualifier should be used for the full-length source feature, not on source feature at \"%s\".", res);
3280  SourceFeatBlkSetFree(sfbp);
3281  MinMaxFree(mmp);
3282  return;
3283  }
 /* pp->errstat is updated by CheckFocusInOrgs; a non-zero value forces
    result 1 when segnum is 0 or when errstat equals segtotal. */
3284  i = CheckFocusInOrgs(sfbp, len, &pp->errstat);
3285  if (pp->errstat != 0 && (ibp->segnum == 0 || pp->errstat == ibp->segtotal))
3286  i = 1;
3287  if (i > 0) {
3288  sev = SEV_REJECT;
3289  if (i == 1) {
3290  ErrPostEx(sev, ERR_SOURCE_FocusQualNotNeeded, "/focus qualifier present, but only one organism name exists. Entry dropped.");
3291  } else if (i == 2) {
3292  ErrPostEx(sev, ERR_SOURCE_MultipleOrganismWithFocus, "/focus qualifiers exist on source features with differing organism names. Entry dropped.");
3293  } else {
3294  if (! need_focus)
3295  sev = SEV_ERROR;
3296  if (! already)
3297  ErrPostEx(sev, ERR_SOURCE_FocusQualMissing, "Multiple organism names exist, but no source feature has a /focus qualifier.%s", (sev == SEV_ERROR) ? "" : " Entry dropped.");
3298  }
3299 
3300  if (sev == SEV_REJECT) {
3301  SourceFeatBlkSetFree(sfbp);
3302  MinMaxFree(mmp);
3303  return;
3304  }
3305  }
 /* Last use of mmp: it is released right after the overlap check. The
    string returned in "res" is released here with MemFree. */
3306  res = CheckSourceOverlap(mmp->next, len);
3307  MinMaxFree(mmp);
3308  if (res) {
3309  ErrPostEx(SEV_REJECT, ERR_SOURCE_MultiOrgOverlap, "Overlapping source features have different organism names %s. Entry dropped.", res);
3310  SourceFeatBlkSetFree(sfbp);
3311  MemFree(res);
3312  return;
3313  }
3314 
 /* The next two checks only warn; the entry is kept. */
3315  res = CheckForUnusualFullLengthOrgs(sfbp);
3316  if (res) {
3317  ErrPostEx(SEV_WARNING, ERR_SOURCE_UnusualOrgName, "Unusual organism name \"%s\" encountered for full-length source feature.", res);
3318  }
3319 
3320  for (tsfbp = sfbp, i = 0; tsfbp; tsfbp = tsfbp->next)
3321  i++;
3322  if (i > BIOSOURCES_THRESHOLD) {
3323  ErrPostEx(SEV_WARNING, ERR_SOURCE_ManySourceFeats, "This record has more than %d source features.", BIOSOURCES_THRESHOLD);
3324  }
3325 
3326  if (! ParsePcrPrimers(sfbp)) {
3327  SourceFeatBlkSetFree(sfbp);
3328  return;
3329  }
3330 
3331  CheckCollectionDate(sfbp, pp->source);
3332 
 /* PickTheDescrSource returns the list that is used from here on. */
3333  sfbp = PickTheDescrSource(sfbp);
3334  if (! sfbp || ! UpdateRawBioSource(sfbp, pp->source, ibp, pp->taxserver)) {
3335  SourceFeatBlkSetFree(sfbp);
3336  return;
3337  }
3338 
3339  if (sfbp->focus)
3340  sfbp->bio_src->SetIs_focus();
3341  else
3342  sfbp->bio_src->ResetIs_focus();
3343 
3344 
 /* Build one feature per source feature block. */
3345  for (tsfbp = sfbp; tsfbp; tsfbp = tsfbp->next) {
3346  CheckMetagenome(*tsfbp->bio_src);
3347 
3348  CRef<CSeq_feat> feat(new CSeq_feat);
3349  feat->SetData().SetBiosrc(*tsfbp->bio_src);
3350 
3351  if (pp->buf)
3352  MemFree(pp->buf);
3353  pp->buf = nullptr;
3354 
3355  GetSeqLocation(*feat, tsfbp->location, seqids, &err, pp, "source");
3356 
 /* Leaving the loop early keeps tsfbp non-null, which makes the code after
    the loop discard everything collected in seq_feats. */
3357  if (err) {
3358  ErrPostEx(SEV_ERROR, ERR_FEATURE_Dropped, "/source|%s| range check detects problems. Entry dropped.", tsfbp->location);
3359  break;
3360  }
3361 
 /* NOTE(review): the statements selected by the two comparisons below are
    on listing lines that are not visible here. */
3362  if (! tsfbp->quals.empty()) {
3363  auto p = GetTheQualValue(tsfbp->quals, "evidence");
3364  if (p) {
3365  if (NStr::CompareNocase(p->c_str(), "experimental") == 0)
3367  else if (NStr::CompareNocase(p->c_str(), "not_experimental") == 0)
3369  }
3370  }
3371 
3372  seq_feats.push_back(feat);
3373  }
3374 
3375  SourceFeatBlkSetFree(sfbp);
3376 
 /* tsfbp is only compared with null here, never dereferenced. */
3377  if (tsfbp)
3378  seq_feats.clear();
3379 }
3380 
Data storage class.
bool fta_strings_same(const char *s1, const char *s2)
Definition: add.cpp:903
void ShrinkSpaces(char *line)
Definition: asci_blk.cpp:118
void fta_sort_biosource(objects::CBioSource &bio)
const COrgName & GetOrgname(void) const
Definition: BioSource.cpp:410
bool IsSetOrgname(void) const
Definition: BioSource.cpp:405
static bool WasValid(const string &country)
Definition: SubSource.cpp:3377
static bool IsValid(const string &country)
Definition: SubSource.cpp:3304
Definition: Dbtag.hpp:53
@OrgMod.hpp User-defined methods of the data storage class.
Definition: OrgMod.hpp:54
bool IsSetOrgMod(void) const
Definition: Org_ref.cpp:169
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
CTime –.
Definition: ncbitime.hpp:296
CFlatFileData * mpData
Definition: ftablock.h:331
DataBlk * mpNext
Definition: ftablock.h:336
int mType
Definition: ftablock.h:330
#define ERR_SOURCE_InconsistentMolType
Definition: flat2err.h:523
#define ERR_SOURCE_MissingMolType
Definition: flat2err.h:528
#define ERR_SOURCE_FocusAndTransposonNotAllowed
Definition: flat2err.h:507
#define ERR_SOURCE_ExcessCoverage
Definition: flat2err.h:518
#define ERR_SOURCE_IncompleteCoverage
Definition: flat2err.h:496
#define ERR_SOURCE_InvalidDbXref
Definition: flat2err.h:491
#define ERR_SOURCE_TransSingleOrgName
Definition: flat2err.h:519
#define ERR_QUALIFIER_PCRprimerEmbeddedComma
Definition: flat2err.h:123
#define ERR_SOURCE_OrganelleQualMultToks
Definition: flat2err.h:486
#define ERR_SOURCE_UnwantedQualifiers
Definition: flat2err.h:504
#define ERR_SOURCE_PartialLocation
Definition: flat2err.h:520
#define ERR_SOURCE_DifferentSubmitterSeqids
Definition: flat2err.h:545
#define ERR_SOURCE_FormerCountry
Definition: flat2err.h:543
#define ERR_SOURCE_OrganismIncomplete
Definition: flat2err.h:503
#define ERR_SOURCE_MultipleOrganismWithFocus
Definition: flat2err.h:499
#define ERR_SOURCE_UnusualOrgName
Definition: flat2err.h:509
#define ERR_ORGANISM_UnclassifiedLineage
Definition: flat2err.h:190
#define ERR_SERVER_NoLineageFromTaxon
Definition: flat2err.h:472
#define ERR_SOURCE_MissingSourceFeatureForDescr
Definition: flat2err.h:506
#define ERR_SOURCE_MissingEnvSampQual
Definition: flat2err.h:536
#define ERR_DIVISION_TGNnotTransgenic
Definition: flat2err.h:259
#define ERR_SOURCE_MultipleSubmitterSeqids
Definition: flat2err.h:544
#define ERR_ORGANISM_OrgNameLacksMetagenome
Definition: flat2err.h:200
#define ERR_SOURCE_ManySourceFeats
Definition: flat2err.h:505
#define ERR_SOURCE_FocusQualMissing
Definition: flat2err.h:500
#define ERR_SOURCE_InvalidCollectionDate
Definition: flat2err.h:542
#define ERR_SOURCE_UnusualLocation
Definition: flat2err.h:502
#define ERR_DIVISION_TransgenicNotSYN_TGN
Definition: flat2err.h:260
#define ERR_QUALIFIER_InvalidPCRprimer
Definition: flat2err.h:121
#define ERR_SOURCE_LackingSubmitterSeqids
Definition: flat2err.h:546
#define ERR_SOURCE_OrganelleIllegalClass
Definition: flat2err.h:487
#define ERR_SOURCE_MultipleMolTypes
Definition: flat2err.h:524
#define ERR_SOURCE_BadLocation
Definition: flat2err.h:494
#define ERR_SOURCE_FocusQualNotFullLength
Definition: flat2err.h:508
#define ERR_SOURCE_MultiOrgOverlap
Definition: flat2err.h:501
#define ERR_FEATURE_Dropped
Definition: flat2err.h:337
#define ERR_SOURCE_MultipleTransgenicQuals
Definition: flat2err.h:517
#define ERR_SOURCE_InconsistentEnvSampQual
Definition: flat2err.h:535
#define ERR_ORGANISM_SynOrgNameNotSYNdivision
Definition: flat2err.h:198
#define ERR_QUALIFIER_MissingPCRprimerSeq
Definition: flat2err.h:122
#define ERR_SOURCE_FocusQualNotNeeded
Definition: flat2err.h:498
#define ERR_SOURCE_PartialQualifier
Definition: flat2err.h:521
#define ERR_SOURCE_FocusAndTransgenicQuals
Definition: flat2err.h:516
#define ERR_ORGANISM_NoSourceFeatMatch
Definition: flat2err.h:189
#define ERR_SOURCE_TransgenicTooShort
Definition: flat2err.h:515
#define ERR_ORGANISM_LineageLacksMetagenome
Definition: flat2err.h:199
#define ERR_SOURCE_ObsoleteDbXref
Definition: flat2err.h:541
#define ERR_SOURCE_InvalidLocation
Definition: flat2err.h:493
#define ERR_SOURCE_FeatureMissing
Definition: flat2err.h:492
#define ERR_SOURCE_SingleSourceTooShort
Definition: flat2err.h:522
#define ERR_SOURCE_InvalidCountry
Definition: flat2err.h:485
#define ERR_SOURCE_NoOrganismQual
Definition: flat2err.h:495
static void FTASubSourceAdd(CBioSource &bio, const Char *val, CSubSource::ESubtype type)
Definition: fta_src.cpp:1729
static void SourceFeatBlkFree(SourceFeatBlkPtr sfbp)
Definition: fta_src.cpp:328
USING_SCOPE(objects)
#define USE_VARIETY
Definition: fta_src.cpp:83
static void PopulateSubNames(string &namstr, const Char *name, const Char *value, COrgMod::ESubtype subtype, TOrgModList &mods)
Definition: fta_src.cpp:442
static const char * GenomicSourceFeatQual[]
Definition: fta_src.cpp:285
#define USE_SUB_SPECIES
Definition: fta_src.cpp:81
static void PcrPrimersFree(PcrPrimersPtr ppp)
Definition: fta_src.cpp:2536
#define USE_CULTIVAR
Definition: fta_src.cpp:75
static void RemoveSourceFeatSpaces(SourceFeatBlkPtr sfbp)
Definition: fta_src.cpp:407
static char * CheckWholeSourcesVersusFocused(SourceFeatBlkPtr sfbp)
Definition: fta_src.cpp:1058
static char * CheckSourceFeatLocAccs(SourceFeatBlkPtr sfbp, char *acc)
Definition: fta_src.cpp:834
static bool IfSpecialFeat(MinMaxPtr mmp, size_t len)
Definition: fta_src.cpp:1266
static const char * OrganelleFirstToken[]
Definition: fta_src.cpp:312
static void PopulatePcrPrimers(CBioSource &bio, PcrPrimersPtr ppp, Int4 count)
Definition: fta_src.cpp:2407
static const char * unusual_toks[]
Definition: fta_src.cpp:197
#define USE_ISOLATE
Definition: fta_src.cpp:76
static SourceFeatBlkPtr CollectSourceFeats(DataBlkPtr dbp, Int2 type)
Definition: fta_src.cpp:362
#define USE_SEROTYPE
Definition: fta_src.cpp:77
static const char * special_orgs[]
Definition: fta_src.cpp:190
#define USE_STRAIN
Definition: fta_src.cpp:80
static bool SourceFeatStructFillIn(IndexblkPtr ibp, SourceFeatBlkPtr sfbp, Int4 use_what)
Definition: fta_src.cpp:528
static void CheckQualsInSourceFeat(CBioSource &bio, TQualVector &quals, Uint1 taxserver)
Definition: fta_src.cpp:1751
static Int4 CheckFocusInOrgs(SourceFeatBlkPtr sfbp, size_t len, int *status)
Definition: fta_src.cpp:1190
static const char * SourceSubSources[]
Definition: fta_src.cpp:222
static const char * DENLRSourceDbxrefTag[]
Definition: fta_src.cpp:139
static void CompareDescrFeatSources(SourceFeatBlkPtr sfbp, const CBioseq &bioseq)
Definition: fta_src.cpp:2109
#define USE_SPECIMEN_VOUCHER
Definition: fta_src.cpp:79
static bool CheckMoltypeConsistency(SourceFeatBlkPtr sfbp, string &moltype)
Definition: fta_src.cpp:2283
static void CheckCollectionDate(SourceFeatBlkPtr sfbp, Parser::ESource source)
Definition: fta_src.cpp:2789
static Int4 CheckTransgenicSourceFeats(SourceFeatBlkPtr sfbp)
Definition: fta_src.cpp:1119
static void CheckMetagenome(CBioSource &bio)
Definition: fta_src.cpp:3049
#define USE_SUB_STRAIN
Definition: fta_src.cpp:82
static char * CheckSourceFeatOrgs(SourceFeatBlkPtr sfbp, int *status)
Definition: fta_src.cpp:744
static void PropogateSuppliedLineage(CBioseq &bioseq, SourceFeatBlkPtr sfbp, Uint1 taxserver)
Definition: fta_src.cpp:2193
static const char * NLRSourceDbxrefTag[]
Definition: fta_src.cpp:179
static void CreateRawBioSources(ParserPtr pp, SourceFeatBlkPtr sfbp, Int4 use_what)
Definition: fta_src.cpp:1346
#define BIOSOURCES_THRESHOLD
Definition: fta_src.cpp:87
static const char * exempt_quals[]
Definition: fta_src.cpp:184
static void MinMaxFree(MinMaxPtr mmp)
Definition: fta_src.cpp:891
static const char * SourceBadQuals[]
Definition: fta_src.cpp:215
static SourceFeatBlkPtr SourceFeatBlkNew(void)
Definition: fta_src.cpp:322
static const char * source_genomes[]
Definition: fta_src.cpp:202
static Int4 CheckSourceFeatCoverage(SourceFeatBlkPtr sfbp, MinMaxPtr mmp, size_t len)
Definition: fta_src.cpp:918
#define USE_SEROVAR
Definition: fta_src.cpp:78
static bool CheckSourceFeatLocFuzz(SourceFeatBlkPtr sfbp)
Definition: fta_src.cpp:761
static bool CheckSYNTGNDivision(SourceFeatBlkPtr sfbp, char *div)
Definition: fta_src.cpp:1076
static SourceFeatBlkPtr SourceFeatRemoveDups(SourceFeatBlkPtr sfbp)
Definition: fta_src.cpp:1485
static char * CheckSourceOverlap(MinMaxPtr mmp, size_t len)
Definition: fta_src.cpp:1274
static char * CheckSourceFeatFocusAndTransposon(SourceFeatBlkPtr sfbp)
Definition: fta_src.cpp:731
static const char * ObsoleteSourceDbxrefTag[]
Definition: fta_src.cpp:134
static bool CheckForENV(SourceFeatBlkPtr sfbp, IndexblkPtr ibp, Parser::ESource source)
Definition: fta_src.cpp:2324
static SourceFeatBlkPtr PickTheDescrSource(SourceFeatBlkPtr sfbp)
Definition: fta_src.cpp:1619
static const char * DESourceDbxrefTag[]
Definition: fta_src.cpp:165
static const CharUInt1 SourceOrgMods[]
Definition: fta_src.cpp:265
static char * CheckPcrPrimersTag(char *str)
Definition: fta_src.cpp:2388
static bool CheckNeedSYNFocus(SourceFeatBlkPtr sfbp)
Definition: fta_src.cpp:3024
static const char * ESourceDbxrefTag[]
Definition: fta_src.cpp:174
static bool UpdateRawBioSource(SourceFeatBlkPtr sfbp, Parser::ESource source, IndexblkPtr ibp, Uint1 taxserver)
Definition: fta_src.cpp:1937
static char * CheckForUnusualFullLengthOrgs(SourceFeatBlkPtr sfbp)
Definition: fta_src.cpp:1304
bool fta_if_special_org(const Char *name)
Definition: fta_src.cpp:902
static void RemoveStringSpaces(char *line)
Definition: fta_src.cpp:392
static bool is_a_space_char(Char c)
Definition: fta_src.cpp:2103
static void CheckForExemption(SourceFeatBlkPtr sfbp)
Definition: fta_src.cpp:423
static bool ParsePcrPrimers(SourceFeatBlkPtr sfbp)
Definition: fta_src.cpp:2555
static CRef< CDbtag > GetSourceDbtag(CRef< CGb_qual > &qual, Parser::ESource source)
Definition: fta_src.cpp:1825
#define USE_ALL
Definition: fta_src.cpp:85
static void AddOrgMod(COrg_ref &org_ref, const Char *val, COrgMod::ESubtype type)
Definition: fta_src.cpp:1717
#define USE_ECOTYPE
Definition: fta_src.cpp:84
static void CollectSubNames(SourceFeatBlkPtr sfbp, Int4 use_what, const Char *name, const Char *cultivar, const Char *isolate, const Char *serotype, const Char *serovar, const Char *specimen_voucher, const Char *strain, const Char *sub_species, const Char *sub_strain, const Char *variety, const Char *ecotype)
Definition: fta_src.cpp:457
void ParseSourceFeat(ParserPtr pp, DataBlkPtr dbp, TSeqIdList &seqids, Int2 type, CBioseq &bioseq, TSeqFeatList &seq_feats)
Definition: fta_src.cpp:3126
static bool CheckSourceLineage(SourceFeatBlkPtr sfbp, Parser::ESource source, bool is_pat)
Definition: fta_src.cpp:2163
static SourceFeatBlkPtr SourceFeatDerive(SourceFeatBlkPtr sfbp, SourceFeatBlkPtr res)
Definition: fta_src.cpp:1543
static bool CheckSubmitterSeqidQuals(SourceFeatBlkPtr sfbp, char *acc)
Definition: fta_src.cpp:3084
static void SourceFeatBlkSetFree(SourceFeatBlkPtr sfbp)
Definition: fta_src.cpp:351
static SourceFeatBlkPtr SourceFeatMoveOneUp(SourceFeatBlkPtr where, SourceFeatBlkPtr what)
Definition: fta_src.cpp:1461
std::list< CRef< objects::COrgMod > > TOrgModList
Definition: ftablock.h:58
std::list< CRef< objects::CSeq_id > > TSeqIdList
Definition: ftablock.h:57
std::list< CRef< objects::CSeq_feat > > TSeqFeatList
Definition: ftablock.h:55
bool StringEquNI(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:125
bool StringEquN(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:115
bool StringEqu(const char *s1, const char *s2)
Definition: ftacpp.hpp:105
void MemFree(char *p)
Definition: ftacpp.hpp:55
size_t StringLen(const char *s)
Definition: ftacpp.hpp:60
void fta_fix_orgref(ParserPtr pp, COrg_ref &org_ref, bool *drop, char *organelle)
Definition: ftanet.cpp:937
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:61
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:56
static const char * str(char *buf, int n)
Definition: stats.c:84
static const char location[]
Definition: config.c:97
#define SEV_INFO
Definition: gicache.c:89
#define SEV_WARNING
Definition: gicache.c:90
#define SEV_ERROR
Definition: gicache.c:91
#define SEV_REJECT
Definition: gicache.c:92
#define ZERO_TAX_ID
Definition: ncbimisc.hpp:1115
#define TAX_ID_FROM(T, value)
Definition: ncbimisc.hpp:1111
#define StringStr
Definition: ncbistr.hpp:322
#define StringSave
Definition: ncbistr.hpp:326
#define StringChr
Definition: ncbistr.hpp:317
#define ErrPostEx(sev, err_code,...)
Definition: ncbierr.hpp:78
ErrSev
Definition: ncbierr.hpp:63
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
virtual bool Equals(const CSerialObject &object, ESerialRecursionMode how=eRecursive) const
Check if both objects contain the same values.
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and having a non-null value.
Definition: ncbiobj.hpp:726
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
#define NCBI_UNUSED
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
char Char
Alias for char.
Definition: ncbitype.h:93
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
@ eCurrent
Use current time. See also CCurrentTime.
Definition: ncbitime.hpp:300
const TSubtype & GetSubtype(void) const
Get the Subtype member data.
Definition: BioSource_.hpp:539
void SetSubtype(TSubtype value)
Assign a value to Subtype data member.
Definition: SubSource_.hpp:319
bool IsSetOrg(void) const
Check if a value has been assigned to Org data member.
Definition: BioSource_.hpp:497
bool CanGetOrg(void) const
Check if it is safe to call GetOrg method.
Definition: BioSource_.hpp:503
list< CRef< CSubSource > > TSubtype
Definition: BioSource_.hpp:145
void SetIs_focus(void)
Set NULL data member (assign 'NULL' value to Is_focus data member).
Definition: BioSource_.hpp:570
void ResetIs_focus(void)
Reset Is_focus data member.
Definition: BioSource_.hpp:564
void SetOrigin(TOrigin value)
Assign a value to Origin data member.
Definition: BioSource_.hpp:478
const TOrg & GetOrg(void) const
Get the Org member data.
Definition: BioSource_.hpp:509
void SetGenome(TGenome value)
Assign a value to Genome data member.
Definition: BioSource_.hpp:428
void SetOrg(TOrg &value)
Assign a value to Org data member.
Definition: BioSource_.cpp:108
void SetName(const TName &value)
Assign a value to Name data member.
Definition: SubSource_.hpp:359
EGenome
biological context
Definition: BioSource_.hpp:97
TSubtype & SetSubtype(void)
Assign a value to Subtype data member.
Definition: BioSource_.hpp:545
@ eSubtype_fwd_primer_seq
sequence (possibly more than one; semicolon-separated)
Definition: SubSource_.hpp:117
@ eSubtype_rev_primer_seq
sequence (possibly more than one; semicolon-separated)
Definition: SubSource_.hpp:118
@ eOrigin_artificial
artificially engineered
Definition: BioSource_.hpp:133
TYear GetYear(void) const
Get the Year member data.
Definition: Date_std_.hpp:426
TMonth GetMonth(void) const
Get the Month member data.
Definition: Date_std_.hpp:473
TDay GetDay(void) const
Get the Day member data.
Definition: Date_std_.hpp:520
const TMod & GetMod(void) const
Get the Mod member data.
Definition: OrgName_.hpp:839
bool CanGetMod(void) const
Check if it is safe to call GetMod method.
Definition: OrgName_.hpp:833
const TLineage & GetLineage(void) const
Get the Lineage member data.
Definition: OrgName_.hpp:864
bool CanGetDiv(void) const
Check if it is safe to call GetDiv method.
Definition: OrgName_.hpp:999
const TDiv & GetDiv(void) const
Get the Div member data.
Definition: OrgName_.hpp:1005
void ResetLineage(void)
Reset Lineage data member.
Definition: OrgName_.cpp:274
bool IsSetCommon(void) const
common name. Check if a value has been assigned to Common data member.
Definition: Org_ref_.hpp:407
bool IsSetLineage(void) const
lineage with semicolon separators. Check if a value has been assigned to Lineage data member.
Definition: OrgName_.hpp:852
const TTaxname & GetTaxname(void) const
Get the Taxname member data.
Definition: Org_ref_.hpp:372
const TCommon & GetCommon(void) const
Get the Common member data.
Definition: Org_ref_.hpp:419
bool CanGetTaxname(void) const
Check if it is safe to call GetTaxname method.
Definition: Org_ref_.hpp:366
void SetTaxname(const TTaxname &value)
Assign a value to Taxname data member.
Definition: Org_ref_.hpp:381
bool CanGetOrgname(void) const
Check if it is safe to call GetOrgname method.
Definition: Org_ref_.hpp:535
bool IsSetOrgname(void) const
Check if a value has been assigned to Orgname data member.
Definition: Org_ref_.hpp:529
bool IsSetTaxname(void) const
preferred formal name. Check if a value has been assigned to Taxname data member.
Definition: Org_ref_.hpp:360
TMod & SetMod(void)
Assign a value to Mod data member.
Definition: OrgName_.hpp:845
void SetOrgname(TOrgname &value)
Assign a value to Orgname data member.
Definition: Org_ref_.cpp:87
void SetLineage(const TLineage &value)
Assign a value to Lineage data member.
Definition: OrgName_.hpp:873
const TOrgname & GetOrgname(void) const
Get the Orgname member data.
Definition: Org_ref_.hpp:541
@ eSubtype_substrain
Definition: OrgMod_.hpp:86
@ eSubtype_sub_species
Definition: OrgMod_.hpp:105
@ eSubtype_nat_host
natural host of this specimen
Definition: OrgMod_.hpp:104
@ eSubtype_cultivar
Definition: OrgMod_.hpp:93
@ eSubtype_variety
Definition: OrgMod_.hpp:89
@ eSubtype_strain
Definition: OrgMod_.hpp:85
@ eSubtype_metagenome_source
Definition: OrgMod_.hpp:120
@ eSubtype_type_material
Definition: OrgMod_.hpp:121
@ eSubtype_specimen_voucher
Definition: OrgMod_.hpp:106
@ eSubtype_serotype
Definition: OrgMod_.hpp:90
@ eSubtype_serovar
Definition: OrgMod_.hpp:92
@ eSubtype_bio_material
Definition: OrgMod_.hpp:119
@ eSubtype_culture_collection
Definition: OrgMod_.hpp:118
@ eSubtype_ecotype
Definition: OrgMod_.hpp:110
@ eSubtype_isolate
Definition: OrgMod_.hpp:100
const TVal & GetVal(void) const
Get the Val member data.
Definition: Gb_qual_.hpp:259
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_feat_.cpp:94
void SetExp_ev(TExp_ev value)
Assign a value to Exp_ev data member.
Definition: Seq_feat_.hpp:1277
void SetVal(const TVal &value)
Assign a value to Val data member.
Definition: Gb_qual_.hpp:268
const TQual & GetQual(void) const
Get the Qual member data.
Definition: Gb_qual_.hpp:212
@ eExp_ev_experimental
any reasonable experimental check
Definition: Seq_feat_.hpp:102
@ eExp_ev_not_experimental
similarity, pattern, etc
Definition: Seq_feat_.hpp:103
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
bool IsSetDescr(void) const
descriptors. Check if a value has been assigned to Descr data member.
Definition: Bioseq_.hpp:303
const TDescr & GetDescr(void) const
Get the Descr member data.
Definition: Bioseq_.hpp:315
where both are integers. tse (optional, String): TSE option controls what blob is whole
int i
int len
bool GetSeqLocation(CSeq_feat &feat, char *location, TSeqIdList &ids, bool *hard_err, ParserPtr pp, const char *name)
Definition: loadfeat.cpp:910
constexpr bool empty(list< Ts... >) noexcept
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
const CharType(& source)[N]
Definition: pointer.h:1149
const char * tag
T max(T x_, T y_)
T min(T x_, T y_)
Int mod(Int i, Int j)
Definition: njn_integer.hpp:67
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
const char * name
Definition: fta_src.cpp:71
COrgMod::ESubtype num
Definition: fta_src.cpp:72
TQualVector quals
Definition: loadfeat.h:49
char * location
Definition: loadfeat.h:46
char * key
Definition: loadfeat.h:45
Char acnum[200]
Definition: ftablock.h:169
Char division[4]
Definition: ftablock.h:174
string moltype
Definition: ftablock.h:216
TTaxId taxid
Definition: ftablock.h:227
string organism
Definition: ftablock.h:226
string submitter_seqid
Definition: ftablock.h:252
bool is_pat
Definition: ftablock.h:205
size_t bases
Definition: ftablock.h:175
Uint2 segtotal
Definition: ftablock.h:178
bool got_plastid
Definition: ftablock.h:236
Uint2 segnum
Definition: ftablock.h:176
bool env_sample_qual
Definition: ftablock.h:222
Int4 max
Definition: fta_src.cpp:128
const char * orgname
Definition: fta_src.cpp:126
MinMax * next
Definition: fta_src.cpp:130
bool skip
Definition: fta_src.cpp:129
Int4 min
Definition: fta_src.cpp:127
vector< IndexblkPtr > entrylist
char * fwd_name
Definition: fta_src.cpp:90
char * rev_name
Definition: fta_src.cpp:92
char * rev_seq
Definition: fta_src.cpp:93
char * fwd_seq
Definition: fta_src.cpp:91
PcrPrimers * next
Definition: fta_src.cpp:94
char * isolate
Definition: fta_src.cpp:102
char * strain
Definition: fta_src.cpp:100
char * genomename
Definition: fta_src.cpp:106
CBioSource::EGenome genome
Definition: fta_src.cpp:120
char * submitter_seqid
Definition: fta_src.cpp:107
TQualVector quals
Definition: fta_src.cpp:109
CRef< COrgName > orgname
Definition: fta_src.cpp:111
char * location
Definition: fta_src.cpp:104
SourceFeatBlk * next
Definition: fta_src.cpp:121
char * moltype
Definition: fta_src.cpp:105
char * organelle
Definition: fta_src.cpp:101
CRef< CBioSource > bio_src
Definition: fta_src.cpp:110
char * name
Definition: fta_src.cpp:99
char * namstr
Definition: fta_src.cpp:103
Definition: type.c:6
optional< string > GetTheQualValue(TQualVector &qlist, const Char *qual)
Definition: utilfeat.cpp:147
Int2 StringMatchIcase(const Char **array, const Char *text)
Definition: utilfun.cpp:576
Char * StringIStr(const Char *where, const Char *what)
Definition: utilfun.cpp:674
std::vector< CRef< objects::CGb_qual > > TQualVector
Definition: xgbfeat.h:12
Modified on Mon Apr 22 04:04:30 2024 by modify_doxy.py rev. 669887