NCBI C++ ToolKit
nucprot.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: nucprot.cpp 102143 2024-04-09 12:51:41Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * File Name: nucprot.cpp
27  *
28  * Author: Karl Sirotkin, Hsiu-Chuan Chen
29  *
30  * File Description:
31  * -----------------
32  *
33  * Take a Seq-entry or elements of a Bioseq-set, do orglookup,
34  * protein translation lookup, then make a nucleic acid protein
35  * sequence.
36  *
37  * Get the genetic code either from the Taxonomy database or from
38  * guess rules (if the organism is different in the segment set or
39  * the Taxserver is not available)
40  *
41  * orglookup includes
42  * - lookup taxname, common name
43  * - get lineage and division
44  * - get genetic codes
45  *
46  * Protein translation lookup includes
47  * - lookup internal and end stop codon
48  * - compare two sequences, one from CdRegion, one from
49  * translation qualifier
50  *
51  * Take our translation when the only diff is start codon.
52  *
53  * This program only assigns 3 different levels of Bioseq-set:
54  * class = nucprot, assign level = 1
55  * class = segset, assign level = 2
56  * class = parts, assign level = 3
57  *
58  */
59 
60 #include <ncbi_pch.hpp>
61 
62 #include "ftacpp.hpp"
63 
64 #include <objmgr/scope.hpp>
73 #include <objmgr/util/sequence.hpp>
74 #include <objects/seq/Seq_data.hpp>
83 
84 
85 #include "index.h"
86 
89 
90 #include "ftaerr.hpp"
91 #include "asci_blk.h"
92 #include "add.h"
93 #include "utilfeat.h"
94 #include "nucprot.h"
95 #include "fta_src.h"
96 #include "utilfun.h"
97 #include "indx_blk.h"
98 #include "xgbparint.h"
99 
100 #ifdef THIS_FILE
101 # undef THIS_FILE
102 #endif
103 #define THIS_FILE "nucprot.cpp"
104 
107 
 /* List of code-break objects collected for one coding region. */
108 typedef list<CRef<CCode_break>> TCodeBreakList;
109 
 /* nullptr-terminated table of exception qualifier values.
  * NOTE(review): "GB" presumably stands for GenBank - confirm against the users of this table. */
110 const char* GBExceptionQualVals[] = {
111  "RNA editing",
112  "reasons given in citation",
113  "rearrangement required for product",
114  "annotated by transcript or proteomic data",
115  nullptr
116 };
117 
 /* nullptr-terminated table of exception qualifier values; it is a longer
  * list than GBExceptionQualVals and shares three of its entries.
  * NOTE(review): "RS" presumably stands for RefSeq - confirm against the users of this table. */
118 const char* RSExceptionQualVals[] = {
119  "RNA editing",
120  "reasons given in citation",
121  "ribosomal slippage",
122  "trans-splicing",
123  "alternative processing",
124  "artificial frameshift",
125  "nonconsensus splice site",
126  "rearrangement required for product",
127  "modified codon recognition",
128  "alternative start codon",
129  "unclassified transcription discrepancy",
130  "unclassified translation discrepancy",
131  "mismatches in transcription",
132  "mismatches in translation",
133  nullptr
134 };
135 
136 /**********************************************************
137  *
138  * bool FindTheQual(qlist, qual):
139  *
140  * Finds qual in the "qlist" return TRUE.
141  * Otherwise, return FALSE.
142  *
143  **********************************************************/
144 static bool FindTheQual(const CSeq_feat& feat, const Char* qual_to_find)
145 {
146  for (const auto& qual : feat.GetQual()) {
147  if (qual->IsSetQual() && qual->GetQual() == qual_to_find)
148  return true;
149  }
150 
151  return (false);
152 }
153 
154 /**********************************************************
155  *
156  * static char* CpTheQualValueNext(qlist, retq, qual):
157  *
158  * Return qual's value if found the "qual" in the
159  * "qlist", and retq points to next available searching
160  * list; Otherwise, return NULL value and retq points
161  * to NULL.
162  *
163  **********************************************************/
164 static char* CpTheQualValueNext(TQualVector::iterator& cur_qual, const TQualVector::iterator& end_qual, const char* qual)
165 {
166  string qvalue;
167 
168  for (; cur_qual != end_qual; ++cur_qual) {
169  if (! (*cur_qual)->IsSetQual() || (*cur_qual)->GetQual() != qual || ! (*cur_qual)->IsSetVal())
170  continue;
171 
172  qvalue = NStr::Sanitize((*cur_qual)->GetVal());
173 
174  ++cur_qual;
175  break;
176  }
177 
178  char* ret = nullptr;
179  if (! qvalue.empty())
180  ret = StringSave(qvalue);
181 
182  return ret;
183 }
184 
185 /**********************************************************/
187 {
 /* Selects a genetic code id for the current entry from the codes stored
  * on its index block. Returns 0 when the lookup is not possible.
  * NOTE(review): the signature line is not visible in this listing; the
  * call in GuessGeneticCode passes a ParserPtr and stores the result in an Int4. */
188  IndexblkPtr ibp;
189  ProtBlkPtr pbp;
190  Int4 gcode;
191 
 /* the stored codes are only used when taxserver is exactly 1 */
192  if (pp->taxserver != 1)
193  return (0);
194 
195  ibp = pp->entrylist[pp->curindx];
 /* neither a genomic nor a mitochondrial code is recorded for this entry */
196  if (ibp->gc_genomic < 1 && ibp->gc_mito < 1)
197  return (0);
198 
199  pbp = pp->pbp;
200  gcode = ibp->gc_genomic;
 /* genome 4 and 5 switch to the mitochondrial code; GuessGeneticCode labels
  * 5 as "mitochondrion", 4 is presumably kinetoplast - TODO confirm */
201  if (pbp->genome == 4 || pbp->genome == 5)
202  gcode = ibp->gc_mito;
 /* a usable code set exists, so the parser's no_code flag is cleared;
  * note the selected code itself may still be < 1 if only the other one was set */
203  pp->no_code = false;
204 
205  return (gcode);
206 }
207 
208 /**********************************************************/
209 static void GuessGeneticCode(ParserPtr pp, const CSeq_descr& descrs)
210 {
211  ProtBlkPtr pbp;
212  Int4 gcode = 0;
213 
214  pbp = pp->pbp;
215 
216  for (const auto& descr : descrs.Get()) {
217  if (! descr->IsModif())
218  continue;
219 
220  for (EGIBB_mod modif : descr->GetModif()) {
221  if (modif == eGIBB_mod_mitochondrial ||
222  modif == eGIBB_mod_kinetoplast) {
223  pbp->genome = 5; /* mitochondrion */
224  break;
225  }
226  }
227  break;
228  }
229 
230  for (const auto& descr : descrs.Get()) {
231  if (! descr->IsSource())
232  continue;
233 
234  pbp->genome = descr->GetSource().IsSetGenome() ? descr->GetSource().GetGenome() : 0;
235  break;
236  }
237 
238  gcode = fta_get_genetic_code(pp);
239  if (gcode <= 0)
240  return;
241 
242  pbp->orig_gcode = gcode;
243  pbp->gcode.SetId(gcode);
244 }
245 
246 /**********************************************************/
247 static void GetGcode(const TEntryList& seq_entries, ParserPtr pp)
248 {
249  if (pp && pp->pbp && ! pp->pbp->gcode.IsId()) {
250  for (const auto& entry : seq_entries) {
251  GuessGeneticCode(pp, GetDescrPointer(*entry));
252 
253  if (pp->pbp->gcode.IsId())
254  break;
255  }
256  }
257 }
258 
259 /**********************************************************/
260 static void ProtBlkFree(ProtBlkPtr pbp)
261 {
262  pbp->gcode.Reset();
263  pbp->feats.clear();
264 
265  pbp->entries.clear();
266  // delete pbp->ibp;
267  // pbp->ibp = nullptr;
268  pbp->ibp->ids.clear();
269 }
270 
271 /**********************************************************/
272 static void ProtBlkInit(ProtBlkPtr pbp)
273 {
274  pbp->biosep = nullptr;
275 
276  pbp->gcode.Reset();
277  pbp->segset = false;
278  pbp->genome = 0;
279 
280  InfoBioseqPtr ibp = pbp->ibp;
281  if (ibp) {
282  ibp->ids.clear();
283  ibp->mLocus.clear();
284  ibp->mAccNum.clear();
285  }
286 }
287 
288 /**********************************************************/
 /* Visits every Bioseq-set reachable from each entry and assigns its level
  * according to the set class; classes that are not handled are reported
  * at SEV_INFO and keep whatever level they had. */
289 static void AssignBioseqSetLevel(TEntryList& seq_entries)
290 {
291 
292  for (auto& entry : seq_entries) {
293  for (CTypeIterator<CBioseq_set> bio_set(Begin(*entry)); bio_set; ++bio_set) {
294  switch (bio_set->GetClass()) {
 /* NOTE(review): the three case labels are not visible in this listing.
  * The file header says nucprot -> 1, segset -> 2, parts -> 3 - confirm
  * against the real source. */
296  bio_set->SetLevel(1);
297  break;
299  bio_set->SetLevel(2);
300  break;
302  bio_set->SetLevel(3);
303  break;
304  default:
305  ErrPostEx(SEV_INFO, ERR_BIOSEQSETCLASS_NewClass, "BioseqSeq class %d not handled", (int)bio_set->GetClass());
306  }
307  }
308  }
309 }
310 
311 /**********************************************************
312  *
313  * static bool check_short_CDS(pp, sfp, err_msg):
314  *
315  * If CDS location contains one of the sequence ends
316  * return TRUE, e.g. it's short do not create protein
317  * bioseq, copy prot-ref to Xref.
318  *
319  **********************************************************/
320 static bool check_short_CDS(ParserPtr pp, const CSeq_feat& feat, bool err_msg)
321 {
322  const CSeq_interval& interval = feat.GetLocation().GetInt();
323  if (interval.GetFrom() == 0 || interval.GetTo() == (TSeqPos)(pp->entrylist[pp->curindx]->bases) - 1)
324  return true;
325 
326  if (err_msg) {
327  string loc = location_to_string(feat.GetLocation());
328  ErrPostEx(SEV_WARNING, ERR_CDREGION_ShortProtein, "Short CDS (< 6 aa) located in the middle of the sequence: %s", loc.c_str());
329  }
330  return false;
331 }
332 
333 /**********************************************************/
 /* Builds the Seq-id(s) for the protein produced by "cds" and appends them
  * to "ids". Depending on parser mode and source the id comes from the
  * /protein_id qualifier, from a generated id, or is a local id made of the
  * nucleotide accession plus the running counter "*num". On validation
  * failures an error is posted and "ids" is left without a new id. */
334 static void GetProtRefSeqId(CBioseq::TId& ids, InfoBioseqPtr ibp, int* num, ParserPtr pp, CScope& scope, CSeq_feat& cds)
335 {
336  const char* r;
337  string protacc;
338  const char* p;
339  const char* q;
340  ErrSev sev;
341 
342  CSeq_id::E_Choice cho;
343  CSeq_id::E_Choice ncho;
344 
 /* Relaxed mode: parse /protein_id as given, or generate a new protein id
  * and also set it as the product of the CDS */
345  if (pp->mode == Parser::EMode::Relaxed) {
346  protacc = CpTheQualValue(cds.SetQual(), "protein_id");
347  if (protacc.empty()) {
348  int protein_id_counter = 0;
349  string idLabel;
350  auto pProteinId =
351  edit::GetNewProtId(scope.GetBioseqHandle(cds.GetLocation()), protein_id_counter, idLabel, false);
352  cds.SetProduct().SetWhole().Assign(*pProteinId);
353  ids.push_back(pProteinId);
354  return;
355  }
356  CSeq_id::ParseIDs(ids, protacc);
357  return;
358  }
359 
 /* USPTO source: /protein_id is turned into a patent Seq-id */
360  if (pp->source == Parser::ESource::USPTO) {
361  protacc = CpTheQualValue(cds.SetQual(), "protein_id");
362  CRef<CSeq_id> pat_seq_id(new CSeq_id);
363  CRef<CPatent_seq_id> pat_id = MakeUsptoPatSeqId(protacc.c_str());
364  pat_seq_id->SetPatent(*pat_id);
365  ids.push_back(pat_seq_id);
366  return;
367  }
368 
 /* take the text id of the first non-patent nucleotide id */
369  const CTextseq_id* text_id = nullptr;
370  for (const auto& id : ibp->ids) {
371  if (! id->IsPatent()) {
372  text_id = id->GetTextseq_Id();
373  break;
374  }
375  }
376 
377  if (! text_id)
378  return;
379 
 /* NOTE(review): the rest of this condition (listing line 381) is not
  * visible here. The guarded branch builds a local id "<accession>_<num>". */
380  if (pp->accver == false || (pp->source != Parser::ESource::EMBL &&
382  ++(*num);
383  string obj_id_str = text_id->GetAccession();
384  obj_id_str += '_';
385  obj_id_str += to_string(*num);
386 
387  CRef<CSeq_id> seq_id(new CSeq_id);
388  seq_id->SetLocal().SetStr(obj_id_str);
389  ids.push_back(seq_id);
390  return;
391  }
392 
 /* from here on a /protein_id qualifier is mandatory */
393  protacc = CpTheQualValue(cds.SetQual(), "protein_id");
394  if (protacc.empty()) {
395  string loc = location_to_string(cds.GetLocation());
396  ErrPostEx(SEV_FATAL, ERR_CDREGION_MissingProteinId, "/protein_id qualifier is missing for CDS feature: \"%s\".", loc.c_str());
397  return;
398  }
399 
 /* HTGSCON mode: /protein_id must exist but a local id is used anyway */
400  if (pp->mode == Parser::EMode::HTGSCON) {
401  ++(*num);
402  string obj_id_str = text_id->GetAccession();
403  obj_id_str += '_';
404  obj_id_str += to_string(*num);
405 
406  CRef<CSeq_id> seq_id(new CSeq_id);
407  seq_id->SetLocal().SetStr(obj_id_str);
408  ids.push_back(seq_id);
409  return;
410  }
411 
 /* the value must look like "<accession>.<version>": a dot followed by
  * at least one character ... */
412  p = StringChr(protacc.c_str(), '.');
413  if (! p || *(p + 1) == '\0') {
414  string loc = location_to_string(cds.GetLocation());
415  ErrPostEx(SEV_FATAL, ERR_CDREGION_MissingProteinVersion, "/protein_id qualifier has missing version for CDS feature: \"%s\".", loc.c_str());
416  return;
417  }
418 
 /* ... and nothing but decimal digits after the first dot */
419  for (q = p + 1; *q >= '0' && *q <= '9';)
420  q++;
421  if (*q != '\0') {
422  string loc = location_to_string(cds.GetLocation());
423  ErrPostEx(SEV_FATAL, ERR_CDREGION_IncorrectProteinVersion, "/protein_id qualifier \"%s\" has incorrect version for CDS feature: \"%s\".", protacc.c_str(), loc.c_str());
424  return;
425  }
426 
 /* split into accession and version before protacc is cleared; p points
  * into protacc's buffer and must not be used after the clear() */
427  const string protaccStr(protacc.c_str(), p);
428  const int protaccVer(atoi(p + 1));
429  protacc.clear();
430 
431  cho = GetProtAccOwner(protaccStr);
432  if (cho == CSeq_id::e_not_set) {
433  string loc = location_to_string(cds.GetLocation());
434  ErrPostEx(SEV_FATAL, ERR_CDREGION_IncorrectProteinAccession, "/protein_id qualifier has incorrect accession \"%s\" for CDS feature: \"%s\".", protaccStr.c_str(), loc.c_str());
435  return;
436  }
437 
 /* r names the source whose accession type the protein accession fails to
  * match; when the source matches, the nucleotide accession owner is
  * compared instead and a TPE/TPD nucleotide upgrades an EMBL/DDBJ protein */
438  r = nullptr;
439  ncho = cho;
440  if (pp->source == Parser::ESource::EMBL && cho != CSeq_id::e_Embl && cho != CSeq_id::e_Tpe)
441  r = "EMBL";
442  else if (pp->source == Parser::ESource::DDBJ && cho != CSeq_id::e_Ddbj &&
443  cho != CSeq_id::e_Tpd)
444  r = "DDBJ";
445  else if (pp->source == Parser::ESource::NCBI && cho != CSeq_id::e_Genbank &&
446  cho != CSeq_id::e_Tpg)
447  r = "NCBI";
448  else {
449  ncho = GetNucAccOwner(text_id->GetAccession());
450  if ((ncho == CSeq_id::e_Tpe && cho == CSeq_id::e_Embl) ||
451  (ncho == CSeq_id::e_Tpd && cho == CSeq_id::e_Ddbj))
452  cho = ncho;
453  }
454 
 /* a mismatch is fatal and aborts unless ign_prot_src downgrades it to a warning */
455  if (r || ncho != cho) {
456  string loc = location_to_string(cds.GetLocation());
457  if (pp->ign_prot_src == false)
458  sev = SEV_FATAL;
459  else
460  sev = SEV_WARNING;
461  if (r)
462  ErrPostEx(sev, ERR_CDREGION_IncorrectProteinAccession, "/protein_id qualifier has incorrect accession prefix \"%s\" for source %s for CDS feature: \"%s\".", protaccStr.c_str(), r, loc.c_str());
463  else
464  ErrPostEx(sev, ERR_CDREGION_IncorrectProteinAccession, "Found mismatching TPA and non-TPA nucleotides's and protein's accessions in one nuc-prot set. Nuc = \"%s\", prot = \"%s\".", text_id->GetAccession().c_str(), protaccStr.c_str());
465  if (pp->ign_prot_src == false) {
466  return;
467  }
468  }
469 
 /* primary id: text Seq-id of type cho with accession and version */
470  CRef<CSeq_id> seq_id(new CSeq_id);
471 
472  CRef<CTextseq_id> new_text_id(new CTextseq_id);
473  new_text_id->SetAccession(protaccStr);
474  new_text_id->SetVersion(protaccVer);
475  SetTextId(cho, *seq_id, *new_text_id);
476 
477  ids.push_back(seq_id);
478 
 /* a second, general id is added only for DDBJ/EMBL entries flagged is_wgs
  * whose accession is not 8 characters long */
479  if ((pp->source != Parser::ESource::DDBJ && pp->source != Parser::ESource::EMBL) ||
480  pp->entrylist[pp->curindx]->is_wgs == false || ibp->mAccNum.size() == 8) {
481  return;
482  }
483 
484  seq_id.Reset(new CSeq_id);
485  seq_id->SetGeneral().SetTag().SetStr(protaccStr);
486 
 /* db is "TSA:", "TLS:" or "WGS:" followed by the first four characters
  * of the nucleotide accession */
487  string& db = seq_id->SetGeneral().SetDb();
488  if (pp->entrylist[pp->curindx]->is_tsa != false)
489  db = "TSA:";
490  else if (pp->entrylist[pp->curindx]->is_tls != false)
491  db = "TLS:";
492  else
493  db = "WGS:";
494 
495  db.append(ibp->mAccNum.substr(0, 4));
496  ids.push_back(seq_id);
497 }
498 
499 /**********************************************************/
500 static char* stripStr(char* base, const char* str)
501 {
502  char* bptr;
503  char* eptr;
504 
505  if (! base || ! str)
506  return nullptr;
507  bptr = StringStr(base, str);
508  if (bptr) {
509  eptr = bptr + StringLen(str);
510  fta_StringCpy(bptr, eptr);
511  }
512 
513  return (base);
514 }
515 
516 /**********************************************************/
517 static void StripCDSComment(CSeq_feat& feat)
518 {
519  static const char* strA[] = {
520  "Author-given protein sequence is in conflict with the conceptual translation.",
521  "direct peptide sequencing.",
522  "Method: conceptual translation with partial peptide sequencing.",
523  "Method: sequenced peptide, ordered by overlap.",
524  "Method: sequenced peptide, ordered by homology.",
525  "Method: conceptual translation supplied by author.",
526  nullptr
527  };
528 
529  char* comment;
530 
531  if (! feat.IsSetComment())
532  return;
533 
534  string s = tata_save(feat.GetComment());
535 
536  if (! s.empty()) {
537  char* pchComment = StringSave(s);
538  s.clear();
539  for (const char** b = strA; *b; b++) {
540  pchComment = stripStr(pchComment, *b);
541  }
542  s = tata_save(pchComment);
543  MemFree(pchComment);
544  }
545 
546  comment = nullptr;
547  if (! s.empty()) {
548  comment = StringSave(s);
549  s.clear();
550  ShrinkSpaces(comment);
551  }
552 
553  if (comment && *comment != '\0')
554  feat.SetComment(comment);
555  else
556  feat.ResetComment();
557  MemFree(comment);
558 }
559 
 /* Builds a Prot-ref feature for the protein "bioseq" from the qualifiers of
  * the CDS feature "feat" and appends it to the bioseq as a new feature-table
  * annotation. /product values become names; otherwise /gene,
  * /standard_name or /label becomes the description.
  * NOTE(review): "ibp" is not referenced in the lines visible here. */
560 static void GetProtRefAnnot(InfoBioseqPtr ibp, CSeq_feat& feat, CBioseq& bioseq)
561 {
562  optional<string> qval;
563  bool partial5;
564  bool partial3;
565 
566  std::set<string> names;
567 
 /* collect every /product value; the loop only ends when GetTheQualValue
  * returns nothing, so it presumably removes the qualifier it returns - TODO confirm */
568  for (;;) {
569  qval = GetTheQualValue(feat.SetQual(), "product");
570  if (! qval)
571  break;
572 
573  string qval_str = *qval;
574  qval.reset();
575 
 /* a value starting with a single quote loses its first and last character */
576  if (qval_str[0] == '\'')
577  qval_str = qval_str.substr(1, qval_str.size() - 2);
578 
579  names.insert(qval_str);
580  }
581 
 /* fallbacks when there is no /product: gene, then standard_name, then label */
582  string qval2;
583  if (names.empty()) {
584  qval2 = CpTheQualValue(feat.GetQual(), "gene");
585  if (qval2.empty())
586  qval2 = CpTheQualValue(feat.GetQual(), "standard_name");
587  if (qval2.empty())
588  qval2 = CpTheQualValue(feat.GetQual(), "label");
589  }
590 
591  CRef<CProt_ref> prot_ref(new CProt_ref);
592 
593  if (names.empty() && qval2.empty()) {
 /* nothing to name the protein with: warn and use a placeholder description */
594  string prid = CpTheQualValue(feat.GetQual(), "protein_id");
595  string loc = location_to_string(feat.GetLocation());
596  if (! prid.empty()) {
597  ErrPostEx(SEV_WARNING, ERR_PROTREF_NoNameForProtein, "No product, gene, or standard_name qualifier found for protein \"%s\" on CDS:%s", prid.c_str(), loc.c_str());
598  } else
599  ErrPostEx(SEV_WARNING, ERR_PROTREF_NoNameForProtein, "No product, gene, or standard_name qualifier found on CDS:%s", loc.c_str());
600  prot_ref->SetDesc("unnamed protein product");
601  } else {
602  if (! names.empty()) {
 /* names come out of a std::set, i.e. sorted and without duplicates */
603  prot_ref->SetName().clear();
604  std::copy(names.begin(), names.end(), std::back_inserter(prot_ref->SetName()));
605  names.clear();
606  } else
607  prot_ref->SetDesc(qval2);
608  }
609 
610  while ((qval = GetTheQualValue(feat.SetQual(), "EC_number"))) {
611  prot_ref->SetEc().push_back(*qval);
612  }
613 
614  while ((qval = GetTheQualValue(feat.SetQual(), "function"))) {
615  prot_ref->SetActivity().push_back(*qval);
616  }
617 
618  if (feat.GetQual().empty())
619  feat.ResetQual();
620 
 /* the protein feature covers the whole protein sequence */
621  CRef<CSeq_feat> feat_prot(new CSeq_feat);
622  feat_prot->SetData().SetProt(*prot_ref);
623  feat_prot->SetLocation(*fta_get_seqloc_int_whole(*bioseq.SetId().front(), bioseq.GetLength()));
624 
625  if (feat.IsSetPartial())
626  feat_prot->SetPartial(feat.GetPartial());
627 
 /* NOTE(review): the assignment of partial5 (listing line 628) is not
  * visible here; presumably the IsPartialStart counterpart of the next line. */
629  partial3 = feat.GetLocation().IsPartialStop(eExtreme_Biological);
630 
 /* carry CDS partialness over as "<" / ">" fuzz on the protein interval */
631  if (partial5 || partial3) {
632  CSeq_interval& interval = feat_prot->SetLocation().SetInt();
633 
634  if (partial5) {
635  interval.SetFuzz_from().SetLim(CInt_fuzz::eLim_lt);
636  }
637 
638  if (partial3) {
639  interval.SetFuzz_to().SetLim(CInt_fuzz::eLim_gt);
640  }
641  }
642 
643  CRef<CSeq_annot> annot(new CSeq_annot);
644  annot->SetData().SetFtable().push_back(feat_prot);
645 
646  bioseq.SetAnnot().push_back(annot);
647 }
648 
649 /**********************************************************/
 /* Appends two descriptors for the protein to "descrs": a MolInfo (peptide,
  * completeness from the CDS partialness, tech from "method") and a title
  * made of the product/gene/label/standard_name text plus " [organism]".
  * "bioseq" is the nucleotide sequence whose source descriptor and source
  * features supply the organism name. */
650 static void GetProtRefDescr(CSeq_feat& feat, Uint1 method, const CBioseq& bioseq, TSeqdescList& descrs)
651 {
652  char* p;
653  char* q;
654  bool partial5;
655  bool partial3;
656  Int4 diff_lowest;
657  Int4 diff_current;
658  Int4 cdslen;
659  Int4 orglen;
660  Uint1 strand;
661 
662  strand = feat.GetLocation().GetStrand();
663 
664  string organism;
665 
 /* default organism: taxname of the first source descriptor that has one */
666  for (const auto& desc : bioseq.GetDescr().Get()) {
667  if (! desc->IsSource())
668  continue;
669 
670  const CBioSource& source = desc->GetSource();
671  if (source.IsSetOrg() && source.GetOrg().IsSetTaxname()) {
672  organism = source.GetOrg().GetTaxname();
673  break;
674  }
675  }
676 
 /* unless the organism is "special", look for a source feature on the same
  * strand that matches the CDS location better than the descriptor does */
677  if (! fta_if_special_org(organism.c_str())) {
678  diff_lowest = -1;
679  cdslen = sequence::GetLength(feat.GetLocation(), &GetScope());
680 
681  for (const auto& annot : bioseq.GetAnnot()) {
682  if (! annot->IsFtable())
683  continue;
684 
 /* NOTE(review): "found" is never set to true in the visible code, so the
  * "if (found) break;" below never fires and the eSame "break" only leaves
  * the inner loop - confirm whether that is intended. */
685  bool found = false;
686  for (const auto& cur_feat : annot->GetData().GetFtable()) {
687  if (! cur_feat->IsSetData() || ! cur_feat->GetData().IsBiosrc())
688  continue;
689 
690  orglen = sequence::GetLength(cur_feat->GetLocation(), &GetScope());
691 
692  const CBioSource& source = cur_feat->GetData().GetBiosrc();
693  if (! source.IsSetOrg() || ! source.GetOrg().IsSetTaxname() ||
694  strand != cur_feat->GetLocation().GetStrand())
695  continue;
696 
697  sequence::ECompare cmp_res = sequence::Compare(feat.GetLocation(), cur_feat->GetLocation(), nullptr, sequence::fCompareOverlapping);
698  if (cmp_res == sequence::eNoOverlap)
699  continue;
700 
 /* identical location: take this organism and stop scanning this table */
701  if (cmp_res == sequence::eSame) {
702  organism = source.GetOrg().GetTaxname();
703  break;
704  }
705 
 /* eContained: keep the source feature with the smallest length excess over
  * the CDS; a merely overlapping feature is used only while no eContained
  * candidate has been recorded */
706  if (cmp_res == sequence::eContained) {
707  diff_current = orglen - cdslen;
708  if (diff_lowest == -1 || diff_current < diff_lowest) {
709  diff_lowest = diff_current;
710  organism = source.GetOrg().GetTaxname();
711  }
712  } else if (cmp_res == sequence::eOverlap && diff_lowest < 0)
713  organism = source.GetOrg().GetTaxname();
714  }
715 
716  if (found)
717  break;
718  }
719  }
720 
721  CRef<CMolInfo> mol_info(new CMolInfo);
722  mol_info->SetBiomol(CMolInfo::eBiomol_peptide); /* peptide */
723 
 /* NOTE(review): the assignment of partial5 (listing line 724) is not
  * visible here; presumably the IsPartialStart counterpart of the next line. */
725  partial3 = feat.GetLocation().IsPartialStop(eExtreme_Biological);
726 
727  if (partial5 && partial3)
728  mol_info->SetCompleteness(CMolInfo::eCompleteness_no_ends);
729  else if (partial5)
730  mol_info->SetCompleteness(CMolInfo::eCompleteness_no_left);
731  else if (partial3)
732  mol_info->SetCompleteness(CMolInfo::eCompleteness_no_right);
733  else if (feat.IsSetPartial() && feat.GetPartial())
734  mol_info->SetCompleteness(CMolInfo::eCompleteness_partial);
735 
736  if (method == eGIBB_method_concept_trans_a)
737  mol_info->SetTech(CMolInfo::eTech_concept_trans_a);
738  else if (method == eGIBB_method_concept_trans)
739  mol_info->SetTech(CMolInfo::eTech_concept_trans);
740 
741  CRef<CSeqdesc> descr(new CSeqdesc);
742  descr->SetMolinfo(*mol_info);
743  descrs.push_back(descr);
744 
 /* title text: all /product values joined with "; ", else gene, label,
  * standard_name, else a placeholder.
  * NOTE(review): GetQual()/GetVal() are called without IsSetQual()/IsSetVal()
  * checks here, unlike FindTheQual - confirm the qualifiers are always set. */
745  string s;
746  for (const auto& qual : feat.GetQual()) {
747  if (qual->GetQual() != "product")
748  continue;
749 
750  if (! s.empty())
751  s.append("; ");
752  s.append(qual->GetVal());
753  }
754 
755  if (s.empty())
756  s = CpTheQualValue(feat.GetQual(), "gene");
757  if (s.empty())
758  s = CpTheQualValue(feat.GetQual(), "label");
759  if (s.empty())
760  s = CpTheQualValue(feat.GetQual(), "standard_name");
761  if (s.empty())
762  p = StringSave("unnamed protein product");
763  else {
764  p = StringSave(s);
 /* cut trailing spaces and commas off the end of the text */
765  for (q = p; *q != '\0';)
766  q++;
767  if (q > p) {
768  for (q--; *q == ' ' || *q == ','; q--)
769  if (q == p)
770  break;
771  if (*q != ' ' && *q != ',')
772  q++;
773  *q = '\0';
774  }
775  }
776 
 /* append " [organism]" unless the text already mentions the organism */
777  if (StringLen(p) < 511 && ! organism.empty() && ! StringStr(p, organism.c_str())) {
778  string s = p;
779  s.append(" [");
780  s.append(organism);
781  s.append("]");
782  MemFree(p);
783  p = StringSave(s);
784  }
785 
 /* titles longer than 511 characters are cut to 511 and end in '>' */
786  if (StringLen(p) > 511) {
787  p[510] = '>';
788  p[511] = '\0';
789  q = StringSave(p);
790  MemFree(p);
791  } else
792  q = p;
793 
794  descr.Reset(new CSeqdesc);
795  descr->SetTitle(q);
796  descrs.push_back(descr);
797  MemFree(q);
798 }
799 
800 /**********************************************************
801  *
802  * Function:
803  * static SeqIdPtr QualsToSeqID(pSeqFeat, source)
804  *
805  * Purpose:
806  * Searches all /db_xref qualifiers, makes the
807  * corresponding SeqIds from them and removes the found
808  * qualifiers from the given SeqFeature.
809  *
810  * Tatiana: /db_xref qualifiers were already processed
811  * sfp->dbxref in loadfeat.c
812  *
813  * Parameters:
814  * pSeqFeat - pointer to SeqFeat structure which have
815  * to be processed.
816  *
817  * Return:
818  * Nothing; every SeqId built from a qualifier is
819  * appended to the "ids" list passed by the caller.
820  *
821  * Note:
822  * "ids" holds CRef<CSeq_id>, so no manual freeing is needed.
823  *
824  **********************************************************/
826 {
 /* Turns every /db_xref qualifier of "feat" into a Seq-id appended to "ids"
  * and erases the qualifier, whether or not it could be converted.
  * NOTE(review): the signature line is not visible in this listing; the
  * call in ProcessForDbxref passes (feat, source, ids). */
827  char* p;
828 
829  if (! feat.IsSetQual())
830  return;
831 
832  for (TQualVector::iterator qual = feat.SetQual().begin(); qual != feat.SetQual().end();) {
833  if ((*qual)->GetQual() != "db_xref") {
834  ++qual;
835  continue;
836  }
837 
 /* only values containing "pid:" (case-insensitive search) are converted;
  * the id text starts right after that prefix */
838  CRef<CSeq_id> seq_id;
839  p = StringIStr((*qual)->GetVal().c_str(), "pid:");
840  if (p)
841  seq_id = StrToSeqId(p + 4, true);
842 
843  if (seq_id.Empty()) {
844  ErrPostEx(SEV_ERROR, ERR_CDREGION_InvalidDb_xref, "Invalid data format /db_xref = \"%s\", ignore it", (*qual)->GetVal().c_str());
845  } else {
 /* "pid:g..." ids are discarded for DDBJ and EMBL sources */
846  if (p[4] == 'g' && (source == Parser::ESource::DDBJ || source == Parser::ESource::EMBL))
847  seq_id.Reset();
848  else
849  ids.push_back(seq_id);
850  }
851 
 /* erase() returns the next position, so the iterator is not advanced separately */
852  qual = feat.SetQual().erase(qual);
853  }
854 
855  if (feat.GetQual().empty())
856  feat.ResetQual();
857 }
858 
859 /**********************************************************
860  *
861  * Function:
862  * static SeqIdPtr ValidateQualSeqId(pSeqID)
863  *
864  * Purpose:
865  * Validates consistency of SeqId list obtained from
866  * /db_xref in following maner. The number of SeqId
867  * must be not more then 3. Each SeqId must refer to
868  * a different GenBank. If two or more SeqId's refer
869  * to the same GenBank only first is taken in account
870  * and other ones are abandoned.
871  * During validating the function drop corresponding
872  * error messages if something wrong occured.
873  *
874  * Parameters:
875  * pSeqFeat - pointer to SeqFeat structure which have
876  * to be processed;
877  *
878  * Return:
879  * Pointer to resultant SeqId chain. If pointer is
880  * NULL it means that there is no good SeqId.
881  *
882  **********************************************************/
883 static void ValidateQualSeqId(TSeqIdList& ids)
884 {
885  bool abGenBanks[3] = { false, false, false };
886  Int2 num;
887  Char ch;
888 
889  if (ids.empty())
890  return;
891 
892  ch = '\0';
893  for (TSeqIdList::iterator id = ids.begin(); id != ids.end();) {
894  num = -1;
895  const Char* dbtag_str = (*id)->IsGeneral() ? (*id)->GetGeneral().GetTag().GetStr().c_str() : "";
896  if ((*id)->IsGi()) {
897  num = 1;
898  ch = 'g';
899  } else if (*dbtag_str == 'e') {
900  num = 0;
901  ch = 'e';
902  } else if (*dbtag_str == 'd') {
903  num = 2;
904  ch = 'd';
905  }
906  if (num == -1)
907  continue;
908 
909  if (abGenBanks[num]) {
910  /* PID of this type already exist, ignore it */
911  ErrPostEx(SEV_WARNING, ERR_CDREGION_Multiple_PID, "/db_xref=\"pid:%c%i\" refer the same data base", ch, (*id)->GetGeneral().GetTag().GetId());
912 
913  id = ids.erase(id);
914  } else {
915  abGenBanks[num] = true;
916  ++id;
917  }
918  }
919 }
920 
921 /**********************************************************/
923 {
 /* Converts the "PID" entries of the feature's dbxref list into Seq-ids
  * appended to "ids" and erases those entries from the feature.
  * NOTE(review): the signature line is not visible in this listing; the
  * call in ProcessForDbxref passes (feat, source, ids). */
924  if (! feat.IsSetDbxref())
925  return;
926 
927  for (CSeq_feat::TDbxref::iterator xref = feat.SetDbxref().begin(); xref != feat.SetDbxref().end();) {
928  if (! (*xref)->IsSetTag() || ! (*xref)->IsSetDb()) {
929  ++xref;
930  continue;
931  }
932 
933  CRef<CSeq_id> id;
934 
935  if ((*xref)->GetDb() == "PID") {
 /* the first character of the tag selects the id kind: 'g' - GI parsed
  * from the digits that follow, 'd'/'e' - general id copied from the dbtag */
936  const Char* tag_str = (*xref)->GetTag().GetStr().c_str();
937  switch (tag_str[0]) {
938  case 'g':
 /* NOTE(review): the condition opening this block (listing line 939) is
  * not visible here; the closing brace below belongs to it. */
940  id.Reset(new CSeq_id);
941  id->SetGi(GI_FROM(long long, strtoll(tag_str + 1, nullptr, 10)));
942  }
943  break;
944 
945  case 'd':
946  case 'e':
947  id.Reset(new CSeq_id);
948  id->SetGeneral(*(*xref));
949  break;
950 
951  default:
952  break;
953  }
954 
 /* a PID entry is always removed, even when no id was built from it */
955  xref = feat.SetDbxref().erase(xref);
956  } else
957  ++xref;
958 
959  if (id.NotEmpty())
960  ids.push_back(id);
961  }
962 
963  if (feat.GetDbxref().empty())
964  feat.ResetDbxref();
965 }
966 
967 /**********************************************************
968  *
969  * Function:
970  * static void ProcessForDbxref(pBioseq, pSeqFeat,
971  * source)
972  *
973  * Purpose:
974  * Finds all qualifiers corresponding to /db_xref,
975  * makes from them SeqId and remove them from further
976  * processing. Also the function looks for PID which
977  * can be found in /note (pSeqFeat->comment) and
978  * removes PID record from /note.
979  *
980  * Parameters:
981  * pBioseq - pointer or a Bioseq structure which will
982  * hold resultant list of SeqId;
983  * pSeqFeat - pointer to SeqFeat structure which have
984  * to processed.
985  * source - source of sequence.
986  *
987  * Return:
988  * void
989  *
990  **********************************************************/
992 {
 /* Adds protein Seq-ids derived from the feature to "bioseq": first from
  * /db_xref qualifiers, and only if those yield nothing, from the dbxref list.
  * NOTE(review): the signature line is not visible in this listing; the
  * call in BldProtRefSeqEntry passes (bioseq, feat, source). */
993  TSeqIdList ids;
994  QualsToSeqID(feat, source, ids);
995  if (! ids.empty()) {
996  ValidateQualSeqId(ids);
997  if (! ids.empty()) {
 /* qualifier-derived ids win; the dbxref list is not consulted */
998  bioseq.SetId().splice(bioseq.SetId().end(), ids);
999  return;
1000  }
1001  }
1002 
1003  DbxrefToSeqID(feat, source, ids);
 /* a comment that is set but empty is dropped */
1004  if (feat.IsSetComment() && feat.GetComment().empty())
1005  feat.ResetComment();
1006 
1007  if (! ids.empty())
1008  bioseq.SetId().splice(bioseq.SetId().end(), ids);
1009 }
1010 
1011 /**********************************************************/
 /* Creates the protein Bioseq for a CDS: takes over "ids" (left empty
  * afterwards through swap), adds ids from db_xref data, sets the length from
  * "seq_data", builds descriptors and the Prot-ref annotation, and marks the
  * instance as a raw amino-acid sequence. Returns an empty CRef when "ids"
  * is empty. "bioseq" is the nucleotide sequence passed on to GetProtRefDescr. */
1012 static CRef<CBioseq> BldProtRefSeqEntry(ProtBlkPtr pbp, CSeq_feat& feat, const string& seq_data, Uint1 method, ParserPtr pp, const CBioseq& bioseq, CBioseq::TId& ids)
1013 {
1014  CRef<CBioseq> new_bioseq;
1015 
1016  if (ids.empty())
1017  return new_bioseq;
1018 
1019  new_bioseq.Reset(new CBioseq);
1020 
1021  new_bioseq->SetId().swap(ids);
1022 
1023  ProcessForDbxref(*new_bioseq, feat, pp->source);
1024 
 /* the length is set before GetProtRefAnnot, which reads bioseq.GetLength() */
1025  new_bioseq->SetInst().SetLength(static_cast<TSeqPos>(seq_data.size()));
1026 
1027  GetProtRefDescr(feat, method, bioseq, new_bioseq->SetDescr());
1028  GetProtRefAnnot(pbp->ibp, feat, *new_bioseq);
1029 
1030  new_bioseq->SetInst().SetRepr(CSeq_inst::eRepr_raw);
1031  new_bioseq->SetInst().SetMol(CSeq_inst::eMol_aa);
1032 
1033  /* Seq_code always ncbieaa 08.08.96 */
 /* NOTE(review): the declaration of "data" (listing line 1034) is not
  * visible here; presumably a CSeq_data built from seq_data - confirm. */
1035  new_bioseq->SetInst().SetSeq_data(*data);
1036 
1037  return new_bioseq;
1038 }
1039 
1040 /**********************************************************/
1041 static void AddProtRefSeqEntry(ProtBlkPtr pbp, CBioseq& bioseq)
1042 {
1043  CRef<CSeq_entry> entry(new CSeq_entry);
1044  entry->SetSeq(bioseq);
1045  pbp->entries.push_back(entry);
1046 }
1047 
1048 /**********************************************************/
1049 static char* SimpleValuePos(const char* qval)
1050 {
1051  const char* bptr;
1052  const char* eptr;
1053 
1054  bptr = StringStr(qval, "(pos:");
1055  if (! bptr)
1056  return nullptr;
1057 
1058  bptr += 5;
1059  while (*bptr == ' ')
1060  bptr++;
1061  for (eptr = bptr; *eptr != ',' && *eptr != '\0';)
1062  eptr++;
1063 
1064  return StringSave(string(bptr, eptr));
1065 }
1066 
1067 /**********************************************************
1068  *
1069  * static CodeBreakPtr GetCdRegionCB(ibp, sfp, accver):
1070  *
1071  * If protein translation (InternalStopCodon) O.K.,
1072  * then this qualifier will be deleted ==> different from
1073  * Karl's parser.
1074  * For transl_except of type OTHER, use the ncbieaa
1075  * code 'X' for Code-break.aa.
1076  * CodeBreakPtr->aa.choice = 1 ==> for ncbieaa;
1077  * CodeBreakPtr->aa.value.intvalue = (Int4)'X' ==>
1078  * for unknown.
1079  *
1080  **********************************************************/
 /* Converts the /transl_except qualifiers of "feat" into Code-break objects
  * appended to "code_breaks". "*dif" receives 2 minus the last recorded range
  * size that was not 2 (0 when every range had stop - start == 2).
  * Processing stops at the first qualifier whose location fails to parse or
  * whose range is rejected. */
1081 static void GetCdRegionCB(InfoBioseqPtr ibp, CSeq_feat& feat, TCodeBreakList& code_breaks, unsigned char* dif, bool accver)
1082 {
1083  Int4 feat_start = -1;
1084  Int4 feat_stop = -1;
1085 
1086  if (feat.IsSetLocation()) {
1087  feat_start = feat.GetLocation().GetStart(eExtreme_Positional);
1088  feat_stop = feat.GetLocation().GetStop(eExtreme_Positional);
1089  }
1090 
 /* 2 is the stop - start distance of a full three-base codon */
1091  Uint1 res = 2;
1092 
1093  if (feat.IsSetQual()) {
1094  TQualVector::iterator cur_qual = feat.SetQual().begin(),
1095  end_qual = feat.SetQual().end();
1096 
 /* qval and pos are heap copies and are released on every path below */
1097  char* qval = nullptr;
1098  while ((qval = CpTheQualValueNext(cur_qual, end_qual, "transl_except"))) {
1099  CRef<CCode_break> code_break(new CCode_break);
1100 
1101  int ncbieaa_val = GetQualValueAa(qval, false);
1102 
1103  code_break->SetAa().SetNcbieaa(ncbieaa_val);
1104 
 /* NOTE(review): SimpleValuePos returns nullptr when "(pos:" is absent;
  * pos is then passed to xgbparseint_ver and to "%s" below - confirm both
  * tolerate a null pointer. */
1105  char* pos = SimpleValuePos(qval);
1106 
1107  int num_errs = 0;
1108  bool locmap = false;
1109 
1110  CRef<CSeq_loc> location = xgbparseint_ver(pos, locmap, num_errs, ibp->ids, accver);
1111  if (location.NotEmpty())
1112  code_break->SetLoc(*location);
1113 
1114  Int4 start = code_break->IsSetLoc() ? code_break->GetLoc().GetStart(eExtreme_Positional) : -1;
1115  Int4 stop = code_break->IsSetLoc() ? code_break->GetLoc().GetStop(eExtreme_Positional) : -1;
1116 
 /* NOTE(review): stop - start is narrowed to Uint1, so a distance such as
  * 258 would wrap to 2 and pass as a valid codon range - confirm. */
1117  Uint1 i = (start > stop) ? 3 : (stop - start);
1118 
1119  if (i != 2)
1120  res = i;
1121 
 /* 42 is ASCII '*'; a shorter-than-codon range is accepted only for this
  * value when it starts at the feature start or stops at the feature stop */
1122  bool itis = false;
1123  if (ncbieaa_val == 42 &&
1124  (start == feat_start ||
1125  stop == feat_stop))
1126  itis = true;
1127 
1128  bool pos_range = false;
1129  if (i == 2)
1130  pos_range = false;
1131  else if (i > 2)
1132  pos_range = true;
1133  else
1134  pos_range = ! itis;
1135 
 /* a bad location drops this qualifier and all remaining transl_except ones */
1136  if (num_errs > 0 || pos_range) {
1137  ErrPostEx(SEV_WARNING, ERR_FEATURE_LocationParsing, "transl_except range is wrong, %s, drop the transl_except", pos);
1138  MemFree(pos);
1139  MemFree(qval);
1140  break;
1141  }
1142 
1143  if (code_break->IsSetLoc()) {
 /* the code-break inherits the strand of the CDS */
1144  if (feat.GetLocation().IsSetStrand())
1145  code_break->SetLoc().SetStrand(feat.GetLocation().GetStrand());
1146 
 /* a code-break outside the CDS only triggers a warning; it is still kept */
1147  sequence::ECompare cmp_res = sequence::Compare(feat.GetLocation(), code_break->GetLoc(), nullptr, sequence::fCompareOverlapping);
1148  if (cmp_res != sequence::eContains) {
1149  ErrPostEx(SEV_WARNING, ERR_FEATURE_LocationParsing, "/transl_except not in CDS: %s", qval);
1150  }
1151  }
1152 
1153  MemFree(pos);
1154  MemFree(qval);
1155 
 /* code_break was created with new above, so this check always passes */
1156  if (code_break.NotEmpty())
1157  code_breaks.push_back(code_break);
1158  }
1159  }
1160 
 /* unsigned arithmetic: a recorded res of 3 stores 255 in *dif */
1161  *dif = 2 - res;
1162 }
1163 
1164 /**********************************************************/
// Posts ERR_CDREGION_UnevenLocation (warning) when (len - frm + dif) is not
// a multiple of 3 and the feature is not flagged as an exception.
//   feat - CDS feature; its Cdregion frame and its location are read.
//   dif  - correction added to the length before the modulo test
//          (presumably the value GetCdRegionCB produces - confirm at the caller).
1165 static void CkEndStop(const CSeq_feat& feat, Uint1 dif)
1166 {
1167  Int4 len;
1168  Int4 r;
1169  Int4 frm;
1170 
      // NOTE(review): this listing jumps from line 1170 to 1172 and no
      // assignment to `len` is visible here, although `len` is read below.
      // Presumably the missing line sets it to the length of the feature
      // location, as check_end_internal does - confirm against the repository.
1172  const CCdregion& cdregion = feat.GetData().GetCdregion();
1173 
      // frm = frame value minus one; 0 when the frame is unset or zero.
1174  if (! cdregion.IsSetFrame() || cdregion.GetFrame() == 0)
1175  frm = 0;
1176  else
1177  frm = cdregion.GetFrame() - 1;
1178 
      // r is the remainder reported in the warning text.
1179  r = (len - frm + (Int4)dif) % 3;
1180  if (r != 0 && (! feat.IsSetExcept() || feat.GetExcept() == false)) {
1181  string loc = location_to_string(feat.GetLocation());
1182  ErrPostEx(SEV_WARNING, ERR_CDREGION_UnevenLocation, "CDS: %s. Length is not divisable by 3, the remain is: %d", loc.c_str(), r);
1183  }
1184 }
1185 
1186 /**********************************************************/
1187 static void check_end_internal(size_t protlen, const CSeq_feat& feat, Uint1 dif)
1188 {
1189  Int4 frm;
1190 
1191  const CCdregion& cdregion = feat.GetData().GetCdregion();
1192 
1193  if (! cdregion.IsSetFrame() || cdregion.GetFrame() == 0)
1194  frm = 0;
1195  else
1196  frm = cdregion.GetFrame() - 1;
1197 
1198  size_t len = sequence::GetLength(feat.GetLocation(), &GetScope()) - frm + dif;
1199 
1200  if (protlen * 3 != len && (! feat.IsSetExcept() || feat.GetExcept() == false)) {
1201  string loc = location_to_string(feat.GetLocation());
1202  ErrPostEx(SEV_WARNING, ERR_CDREGION_LocationLength, "Length of the CDS: %s (%ld) disagree with the length calculated from the protein: %ld", loc.c_str(), len, protlen * 3);
1203  }
1204 }
1205 
1206 /**********************************************************
1207  *
1208  * static void ErrByteStorePtr(ibp, sfp, bsp):
1209  *
1210  * For debugging, put to error logfile, it needs
1211  * to delete for "buildcds.c program.
1212  *
1213  **********************************************************/
1214 static void ErrByteStorePtr(InfoBioseqPtr ibp, const CSeq_feat& feat, const string& prot)
1215 {
1216  string qval = CpTheQualValue(feat.GetQual(), "translation");
1217  if (qval.empty())
1218  qval = "no translation qualifier";
1219 
1220  if (! feat.IsSetExcept() || feat.GetExcept() == false) {
1221  string loc = location_to_string(feat.GetLocation());
1222  ErrPostEx(SEV_WARNING, ERR_CDREGION_TranslationDiff, "Location: %s, translation: %s", loc.c_str(), qval.c_str());
1223  }
1224 
1225  ErrLogPrintStr(prot.c_str());
1226  ErrLogPrintStr("\n");
1227 }
1228 
1229 /**********************************************************
1230  *
1231  * static ByteStorePtr CkProteinTransl(pp, ibp, bsp, sfp,
1232  * qval, intercodon,
1233  * gcode, method):
1234  *
1235  * If bsp != translation's value, then take
1236  * translation's value and print out warning message.
1237  * If the only diff is start codon and bsp has "M"
1238  * take bsp.
1239  * If intercodon = TRUE, then no comparison.
1240  *
1241  **********************************************************/
1242 static void CkProteinTransl(ParserPtr pp, InfoBioseqPtr ibp, string& prot, CSeq_feat& feat, const char* qval, bool intercodon, const char* gcode, unsigned char* method)
1243 {
1244  const char* ptr;
1245 
1246  string msg2;
1247  Int2 residue;
1248  Int4 num = 0;
1249  size_t aa;
1250  bool msgout = false;
1251  bool first = false;
1252  Int4 difflen;
1253 
1254  CCdregion& cdregion = feat.SetData().SetCdregion();
1255  size_t len = StringLen(qval);
1256  msg2.reserve(1100);
1257 
1258  string loc = location_to_string(feat.GetLocation());
1259 
1260  size_t blen = prot.size();
1261 
1262  if (len != blen && (! feat.IsSetExcept() || feat.GetExcept() == false)) {
1263  ErrPostEx(SEV_ERROR, ERR_CDREGION_ProteinLenDiff, "Lengths of conceptual translation and translation qualifier differ : %d : %d : %s", blen, len, loc.c_str());
1264  }
1265 
1266  difflen = 0;
1267  if (! intercodon) {
1268  if (len != blen)
1269  difflen = 1;
1270  if (len > blen)
1271  difflen = 2;
1272 
1273  size_t residue_idx = 0;
1274  for (ptr = qval, num = 0, aa = 1; residue_idx < blen; ++aa, ++residue_idx) {
1275  residue = prot[residue_idx];
1276 
1277  if (aa > len) {
1278  if (residue == '*')
1279  continue;
1280  difflen = 2;
1281  break;
1282  }
1283 
1284  if (residue == *ptr) {
1285  ptr++;
1286  continue;
1287  }
1288 
1289  if (aa == 1 && residue == 'M')
1290  first = true;
1291 
1292  msgout = true;
1293  num++;
1294  if (num == 1)
1295  msg2 = "at AA # ";
1296 
1297  if (num < 11 && msg2.length() < 1000) {
1298  stringstream aastr;
1299  aastr << aa << '(' << (char)residue << ',' << (char)*ptr << "), ";
1300  msg2 += aastr.str();
1301  } else if (num == 11 && msg2.length() < 1000)
1302  msg2 += ", additional details suppressed, ";
1303  ptr++;
1304  }
1305 
1306  if (num > 0) {
1307  cdregion.SetConflict(true);
1308  stringstream aastr;
1309  aastr << "using genetic code " << gcode << ", total " << num << " difference";
1310  if (num > 1)
1311  aastr << 's'; // plural
1312  msg2 += aastr.str();
1313  if (! feat.IsSetExcept() || feat.GetExcept() == false) {
1314  ErrPostEx(SEV_WARNING, ERR_CDREGION_TranslationDiff, "%s:%s", msg2.c_str(), loc.c_str());
1315  }
1316  }
1317 
1318  if (! msgout) {
1319  cdregion.ResetConflict();
1320  ErrPostEx(SEV_INFO, ERR_CDREGION_TranslationsAgree, "Parser-generated conceptual translation agrees with input translation %s", loc.c_str());
1321  }
1322 
1323  if (difflen == 2) {
1324  cdregion.SetConflict(true);
1325  msgout = true;
1326  }
1327  } /* intercodon */
1328  else {
1329  msgout = true;
1330  if (! feat.IsSetExcept() || feat.GetExcept() == false) {
1331  ErrPostEx(SEV_WARNING, ERR_CDREGION_NoTranslationCompare, "Conceptual translation has internal stop codons, no comparison: %s", loc.c_str());
1332  }
1333  }
1334 
1335  if (msgout) {
1336  if (pp->debug) {
1337  ErrByteStorePtr(ibp, feat, prot);
1338  }
1339  }
1340 
1341  if (pp->accver == false) {
1342  if ((num == 1 && first) ||
1343  (pp->transl && ! prot.empty() && cdregion.IsSetConflict() && cdregion.GetConflict())) {
1344  cdregion.ResetConflict();
1345  ErrPostEx(SEV_WARNING, ERR_CDREGION_TranslationOverride, "Input translation is replaced with conceptual translation: %s", loc.c_str());
1346  *method = eGIBB_method_concept_trans;
1347  }
1348  }
1349 
1350  if ((cdregion.IsSetConflict() && cdregion.GetConflict() == true) || difflen != 0)
1351  *method = eGIBB_method_concept_trans_a;
1352 
1353  if (msgout && pp->transl == false && (! feat.IsSetExcept() || feat.GetExcept() == false)) {
1354  ErrPostEx(SEV_WARNING, ERR_CDREGION_SuppliedProteinUsed, "In spite of previously reported problems, the supplied protein translation will be used : %s", loc.c_str());
1355  }
1356 
1357  prot = qval;
1358 }
1359 
1360 /**********************************************************
1361  *
1362  * static bool check_translation(bsp, qval):
1363  *
1364  * If bsp != translation's value return FALSE.
1365  *
1366  **********************************************************/
1367 static bool check_translation(string& prot, const char* qval)
1368 {
1369  size_t len = 0;
1370  size_t blen = prot.size();
1371 
1372  for (len = StringLen(qval); len != 0; len--) {
1373  if (qval[len - 1] != 'X') /* remove terminal X */
1374  break;
1375  }
1376 
1377  size_t cur_pos = blen;
1378  for (; cur_pos != 0; --cur_pos) {
1379  if (prot[cur_pos - 1] != 'X')
1380  break;
1381  }
1382 
1383  if (cur_pos != len)
1384  return false;
1385 
1386  cur_pos = 0;
1387  for (; *qval != '\0' && cur_pos < blen; qval++, ++cur_pos) {
1388  if (prot[cur_pos] != *qval)
1389  return false;
1390  }
1391 
1392  return true;
1393 }
1394 
1395 // Workaround for the translation function.
1396 // This is needed because feat->location usually does not have a 'version' in its ID,
1397 // but feat->cdregion->code_break->location has.
1398 // In that case the two locations are treated as if they came from different sequences, although they
1399 // definitely belong to the same one.
1400 // Therefore, this function changes all IDs of the code-breaks to feat->location->id before translation,
1401 // and restores the original IDs after translation.
1402 // TODO: it probably should be organized in another way
     //
     // Returns false (after posting SEV_REJECT with the exception message) if
     // CSeqMapException was caught, true otherwise. "prot" is presumably filled
     // by the call inside the try block, which this listing does not show.
1403 static bool Translate(CSeq_feat& feat, string& prot)
1404 {
      // Original code-break IDs, kept in code-break order so they can be put back.
1405  std::list<CRef<CSeq_id>> orig_ids;
1406  const CSeq_id* feat_loc_id = nullptr;
1407  if (feat.IsSetLocation())
1408  feat_loc_id = feat.GetLocation().GetId();
1409 
      // IDs are swapped only when the feature location yields an ID and
      // code-breaks are present.
1410  bool change = feat_loc_id && feat.GetData().GetCdregion().IsSetCode_break();
1411  if (change) {
1412  for (auto& code_break : feat.SetData().SetCdregion().SetCode_break()) {
1413  orig_ids.push_back(CRef<CSeq_id>(new CSeq_id));
      // NOTE(review): the result of GetLoc().GetId() is dereferenced without
      // the null check that the feature-location ID gets above - confirm it
      // cannot be null for a code-break location.
1414  orig_ids.back()->Assign(*code_break->GetLoc().GetId());
1415  code_break->SetLoc().SetId(*feat_loc_id);
1416  }
1417  }
1418 
1419  bool ret = true;
1420  try {
      // NOTE(review): listing line 1421, the body of this try block, is
      // missing here; presumably it is the translation call that writes
      // "prot" - confirm against the repository.
1422  } catch (CSeqMapException& e) {
1423  ErrPostEx(SEV_REJECT, 0, 0, "%s", e.GetMsg().c_str());
1424  ret = false;
1425  }
1426 
      // Put the saved IDs back, in the same order they were collected.
1427  if (change) {
1428  std::list<CRef<CSeq_id>>::iterator it = orig_ids.begin();
1429  for (auto& code_break : feat.SetData().SetCdregion().SetCode_break()) {
1430  code_break->SetLoc().SetId(**it);
1431  ++it;
1432  }
1433  }
1434 
1435  return ret;
1436 }
1437 
1438 /**********************************************************
1439  *
1440  * static Int2 EndAdded(feat, gene_refs):
1441  *
1442  * From EndStopCodonBaseAdded:
1443  * Return 0 if no bases were added: a code-break already
1444  * covers the end, the last interval cannot be extended
1445  * (sequence end or fuzz), or the extended CDS does not
1446  * translate to the /translation value followed by '*'.
1447  * In those cases the last interval is left as it was.
1448  * Otherwise return the number of bases added
1449  * ("remainder"); the last interval stays extended.
1450  * No path visible here returns a negative value.
1451  **********************************************************/
1452 static Int2 EndAdded(CSeq_feat& feat, GeneRefFeats& gene_refs)
1453 {
1454  Int4 pos;
1455  Int4 pos1;
1456  Int4 pos2;
1457  Int4 len;
1458  Int4 len2;
1459 
1460  Uint4 remainder;
1461 
1462  Uint4 oldfrom;
1463  Uint4 oldto;
1464 
1465  size_t i;
1466 
1467  string transl;
1468  string name;
1469 
1470  CCdregion& cdregion = feat.SetData().SetCdregion();
      // NOTE(review): this listing jumps from line 1470 to 1472 and no
      // assignment to `len` is visible, although `len` is read on the next
      // line. Presumably the missing line sets it to the length of the
      // feature location - confirm against the repository.
1472  len2 = len;
1473 
      // Discount the bases skipped by reading frame 2 or 3.
1474  int frame = cdregion.IsSetFrame() ? cdregion.GetFrame() : 0;
1475  if (frame > 1 && frame < 4)
1476  len -= frame - 1;
1477 
      // For a non-negative len this yields 1..3, never 0: a length that is
      // already a multiple of 3 gives remainder 3 (one whole codon), so the
      // test below does not fire in that case.
1478  remainder = 3 - len % 3;
1479  if (remainder == 0)
1480  return (0);
1481 
      // Give up if some code-break spans a whole codon, or is a shorter one
      // whose end offset equals len2 - 1.
1482  if (cdregion.IsSetCode_break()) {
1483  bool ret_condition = false;
1484  for (const auto& code_break : cdregion.GetCode_break()) {
1485  pos1 = numeric_limits<int>::max();
1486  pos2 = -10;
1487 
      // pos1 / pos2: smallest FromStart offset and largest FromEnd offset
      // over the ranges of this code-break.
1488  for (CSeq_loc_CI loc(code_break->GetLoc()); loc; ++loc) {
1489  pos = sequence::LocationOffset(*loc.GetRangeAsSeq_loc(), feat.GetLocation(), sequence::eOffset_FromStart);
1490  if (pos < pos1)
1491  pos1 = pos;
1492  pos = sequence::LocationOffset(*loc.GetRangeAsSeq_loc(), feat.GetLocation(), sequence::eOffset_FromEnd);
1493  if (pos > pos2)
1494  pos2 = pos;
1495  }
1496  pos = pos2 - pos1;
1497  if (pos == 2 || (pos >= 0 && pos <= 1 && pos2 == len2 - 1)) {
1498  ret_condition = true;
1499  break;
1500  }
1501  }
1502 
1503  if (ret_condition)
1504  return (0);
1505  }
1506 
      // The interval to extend is the last CSeq_interval the type iterator
      // visits in the feature location.
1507  CSeq_interval* last_interval = nullptr;
1508  for (CTypeIterator<CSeq_interval> loc(Begin(feat.SetLocation())); loc; ++loc) {
1509  last_interval = &(*loc);
1510  }
1511 
1512  if (! last_interval)
1513  return (0);
1514 
1515  const CBioseq_Handle& bioseq_h = GetScope().GetBioseqHandle(last_interval->GetId());
1516  if (bioseq_h.GetState() != CBioseq_Handle::fState_none)
1517  return (0);
1518 
      // Remember the original bounds so that every failure path can restore them.
1519  oldfrom = last_interval->GetFrom();
1520  oldto = last_interval->GetTo();
1521 
      // Minus strand: extend by lowering "from"; otherwise by raising "to".
      // No extension past the sequence ends or across a fuzzy end.
1522  if (last_interval->IsSetStrand() && last_interval->GetStrand() == eNa_strand_minus) {
1523  if (last_interval->GetFrom() < remainder || last_interval->IsSetFuzz_from())
1524  return (0);
1525  last_interval->SetFrom(oldfrom - remainder);
1526  } else {
1527  if (last_interval->GetTo() >= bioseq_h.GetBioseqLength() - remainder || last_interval->IsSetFuzz_to())
1528  return (0);
1529  last_interval->SetTo(oldto + remainder);
1530  }
1531 
      // Translate()'s return value is not checked here; an empty result is
      // what makes this function back out.
1532  string newprot;
1533  Translate(feat, newprot);
1534 
1535  if (newprot.empty()) {
1536  last_interval->SetFrom(oldfrom);
1537  last_interval->SetTo(oldto);
1538  return (0);
1539  }
1540 
      // The extended CDS must translate to exactly (len + remainder) / 3 residues...
1541  size_t protlen = newprot.size();
1542  if (protlen != (size_t)(len + remainder) / 3) {
1543  last_interval->SetFrom(oldfrom);
1544  last_interval->SetTo(oldto);
1545  return (0);
1546  }
1547 
      // ...and the last of them must be a stop.
1548  if (newprot[protlen - 1] != '*') {
1549  last_interval->SetFrom(oldfrom);
1550  last_interval->SetTo(oldto);
1551  return (0);
1552  }
1553 
1554  transl = CpTheQualValue(feat.GetQual(), "translation");
1555  if (transl.empty()) {
1556  last_interval->SetFrom(oldfrom);
1557  last_interval->SetTo(oldto);
1558  return (0);
1559  }
1560 
      // Drop the terminal stop, then require the new translation to match
      // the leading residues of the /translation value.
1561  protlen--;
1562  newprot = newprot.substr(0, newprot.size() - 1);
1563 
1564  for (i = 0; i < protlen; i++) {
1565  if (transl[i] != newprot[i])
1566  break;
1567  }
1568  transl.clear();
1569 
1570  if (i < protlen) {
1571  last_interval->SetFrom(oldfrom);
1572  last_interval->SetTo(oldto);
1573  return (0);
1574  }
1575 
      // From here on the extension is kept. What remains is to stretch the
      // intervals of the matching gene features by the same amount.
1576  if (! gene_refs.valid)
1577  return (remainder);
1578 
1579  name = CpTheQualValue(feat.GetQual(), "gene");
1580  if (name.empty())
1581  return (remainder);
1582 
1583  for (TSeqFeatList::iterator gene = gene_refs.first; gene != gene_refs.last; ++gene) {
1584  if (! (*gene)->IsSetData() || ! (*gene)->GetData().IsGene())
1585  continue;
1586 
      // Only genes whose location strand equals that of the extended interval.
1587  int cur_strand = (*gene)->GetLocation().IsSetStrand() ? (*gene)->GetLocation().GetStrand() : 0,
1588  last_strand = last_interval->IsSetStrand() ? last_interval->GetStrand() : 0;
1589  if (cur_strand != last_strand)
1590  continue;
1591 
      // The gene matches by locus or by any synonym, compared case-insensitively.
1592  const CGene_ref& cur_gene_ref = (*gene)->GetData().GetGene();
1593  if (NStr::CompareNocase(cur_gene_ref.GetLocus(), name) != 0) {
1594  if (! cur_gene_ref.IsSetSyn())
1595  continue;
1596 
1597  bool found = false;
1598  for (const string& syn : cur_gene_ref.GetSyn()) {
1599  if (NStr::CompareNocase(name, syn) == 0) {
1600  found = true;
1601  break;
1602  }
1603  }
1604 
1605  if (! found)
1606  continue;
1607  }
1608 
      // Move a gene interval end only if it sat exactly on the old CDS end.
1609  for (CTypeIterator<CSeq_interval> loc(Begin((*gene)->SetLocation())); loc; ++loc) {
1610  if (! loc->GetId().Match(last_interval->GetId()))
1611  continue;
1612 
1613  int cur_strand = loc->IsSetStrand() ? loc->GetStrand() : 0;
1614  if (cur_strand == eNa_strand_minus && loc->GetFrom() == oldfrom)
1615  loc->SetFrom(last_interval->GetFrom());
1616  else if (cur_strand != eNa_strand_minus && loc->GetTo() == oldto)
1617  loc->SetTo(last_interval->GetTo());
1618  }
1619  }
1620 
1621  return (remainder);
1622 }
1623 
1624 /**********************************************************/
// NOTE(review): the line carrying this function's signature (listing line
// 1625) is missing here; the body uses a single object named `feat`.
// The body removes every qualifier named "codon" that has a value from the
// feature, posting ERR_CDREGION_CodonQualifierUsed (error) for each one, and
// resets the qualifier list if it ends up empty.
1626 {
1627  if (! feat.IsSetQual())
1628  return;
1629 
      // The iterator advances only when nothing is erased; erase() supplies
      // the next position otherwise.
1630  for (TQualVector::iterator qual = feat.SetQual().begin(); qual != feat.SetQual().end();) {
1631  if (! (*qual)->IsSetQual() || ! (*qual)->IsSetVal() ||
1632  (*qual)->GetQual() != "codon") {
1633  ++qual;
1634  continue;
1635  }
1636 
1637  string loc = location_to_string(feat.GetLocation());
1638  ErrPostEx(SEV_ERROR, ERR_CDREGION_CodonQualifierUsed, "Encountered /codon qualifier for \"CDS\" feature at \"%s\". Code-breaks (/transl_except) should be used instead.", loc.c_str());
1639 
1640  qual = feat.SetQual().erase(qual);
1641  }
1642 
1643  if (feat.GetQual().empty())
1644  feat.ResetQual();
1645 }
1646 
1647 /**********************************************************
1648  *
1649  * static void InternalStopCodon(pp, ibp, feat, method,
1650  * dif, gene_refs, seq_data):
1651  *
1652  * Translates the CDS and hands the protein back in
1653  * "seq_data". "seq_data" and "*method" are left untouched
1654  * if there is no usable protein: no genetic code id, no
1655  * translation qualifier together with internal stop codons
1656  * or an unrecognized start, or an empty translation.
1657  * When a "translation" qualifier is present its value is
1658  * what is returned (see CkProteinTransl); otherwise the
1659  * conceptual translation without the end stop codon is.
1660  **********************************************************/
1661 static void InternalStopCodon(ParserPtr pp, InfoBioseqPtr ibp, CSeq_feat& feat, unsigned char* method, Uint1 dif, GeneRefFeats& gene_refs, string& seq_data)
1662 {
1663  string qval;
1664  bool intercodon = false;
1665  bool again = true;
1666 
1667  ErrSev sev;
1668 
1669  Uint1 m = 0;
1670  string gcode_str;
1671  string stopmsg;
1672  size_t protlen;
1673  Int4 aa;
1674  Int4 num = 0;
1675  Int2 residue;
1676  Int2 r;
1677 
1678  CCdregion& cdregion = feat.SetData().SetCdregion();
1679 
1680  if (! cdregion.IsSetCode())
1681  return;
1682 
      // Work on the first element of the genetic code that holds an id.
1683  CGenetic_code& gen_code = cdregion.SetCode();
1684  CGenetic_code::C_E* cur_code = nullptr;
1685 
1686  for (auto& gcode : gen_code.Set()) {
1687  if (gcode->IsId()) {
1688  cur_code = gcode;
1689  break;
1690  }
1691  }
1692 
1693  if (! cur_code)
1694  return;
1695 
      // pp->no_code: try code ids from the current one up to 13 and keep the
      // first whose translation has no internal stop and agrees with the
      // /translation value (check_translation). If none does, the original
      // id is restored.
1696  if (pp->no_code) {
1697  int orig_code_id = cur_code->GetId();
1698  for (int cur_code_id = cur_code->GetId(); again && cur_code_id < 14; ++cur_code_id) {
1699  cur_code->SetId(cur_code_id);
1700 
1701  string prot;
1702  if (! Translate(feat, prot))
1703  pp->entrylist[pp->curindx]->drop = true;
1704 
1705  if (prot.empty())
1706  continue;
1707 
1708  protlen = prot.size();
1709  residue = prot[protlen - 1];
1710 
      // A terminal stop is not an internal one: strip it before looking.
1711  if (residue == '*')
1712  prot = prot.substr(0, prot.size() - 1);
1713 
1714  intercodon = prot.find('*') != string::npos;
1715 
1716  if (! intercodon) {
1717  qval = CpTheQualValue(feat.GetQual(), "translation");
1718  if (! qval.empty()) /* compare protein sequence */
1719  {
1720  if (check_translation(prot, qval.c_str())) {
1721  sev = (pp->taxserver == 0) ? SEV_INFO : SEV_WARNING;
1722  ErrPostEx(sev, ERR_CDREGION_GeneticCodeAssumed, "No genetic code from TaxArch, trying to guess, code %d assumed", cur_code_id);
1723  again = false;
1724  }
1725  qval.clear();
1726  } else
      // Nothing to compare against: stop guessing. `again` is still true,
      // so the "Can't guess" branch below runs.
1727  break;
1728  }
1729  }
1730 
1731  if (again) {
1732  sev = (pp->taxserver == 0) ? SEV_INFO : SEV_WARNING;
1733  ErrPostEx(sev, ERR_CDREGION_GeneticCodeAssumed, "Can't guess genetic code, code %d assumed", orig_code_id);
1734  cur_code->SetId(orig_code_id);
1735  }
1736  }
1737 
      // The translation that is actually used, with the code id settled above.
1738  string prot;
1739  if (! Translate(feat, prot))
1740  pp->entrylist[pp->curindx]->drop = true;
1741 
      // cur_code is non-null here (see the early return above), so the
      // "unknown" branch is not reached.
1742  if (cur_code)
1743  gcode_str = to_string(cur_code->GetId());
1744  else
1745  gcode_str = "unknown";
1746 
1747  qval = CpTheQualValue(feat.GetQual(), "translation");
1748  intercodon = false;
1749 
1750  string loc = location_to_string(feat.GetLocation());
1751 
1752  if (! prot.empty()) {
1753  protlen = prot.size();
1754  residue = prot[protlen - 1];
      // Length checks apply only to features that are neither partial nor fuzzy.
1755  if ((! feat.IsSetPartial() || feat.GetPartial() == false) && ! SeqLocHaveFuzz(feat.GetLocation())) {
1756  CkEndStop(feat, dif);
1757  }
1758 
      // No terminal stop: EndAdded() may extend the CDS to include one.
1759  if (residue != '*') {
1760  r = EndAdded(feat, gene_refs);
1761  if (r > 0 && (! feat.IsSetExcept() || feat.GetExcept() == false)) {
1762  ErrPostEx(SEV_WARNING, ERR_CDREGION_TerminalStopCodonMissing, "CDS: %s |found end stop codon after %d bases added", loc.c_str(), r);
1763  }
1764 
1765  if ((! feat.IsSetPartial() || feat.GetPartial() == false) && ! SeqLocHaveFuzz(feat.GetLocation())) {
1766  /* if there is no partial qualifier and location
1767  * doesn't have "fuzz" then output message
1768  */
1769  if (! feat.IsSetExcept() || feat.GetExcept() == false) {
1770  ErrPostEx(SEV_ERROR, ERR_CDREGION_TerminalStopCodonMissing, "No end stop codon found for CDS: %s", loc.c_str());
1771  }
1772  }
1773  } else /* remove termination codon from protein */
1774  {
      // protlen still counts the terminal stop at this point.
1775  check_end_internal(protlen, feat, dif);
1776  prot = prot.substr(0, prot.size() - 1);
1777  }
1778 
1779  /* check internal stop codon */
1780  size_t residue_idx = 0;
1781  protlen = prot.size();
1782  for (stopmsg.reserve(550), aa = 1; residue_idx < protlen; ++residue_idx) {
1783  residue = prot[residue_idx];
1784  if (aa == 1 && residue == '-') {
1785  /* if unrecognized start of translation,
1786  * a ncbigap character is inserted
1787  */
1788  if (! feat.IsSetExcept() || feat.GetExcept() == false) {
1789  ErrPostEx(SEV_WARNING, ERR_CDREGION_IllegalStart, "unrecognized initiation codon from CDS: %s", loc.c_str());
1790  }
1791  if (qval.empty()) /* no /translation */
1792  {
      // Early exit: neither seq_data nor *method is written.
1793  return;
1794  }
1795  }
1796 
1797  if (residue == '*') /* only report 10 internal stop codons */
1798  {
1799  intercodon = true;
1800  ++num;
1801 
1802  if (num < 11 && stopmsg.length() < 500) {
1803  stopmsg += to_string(aa);
1804  stopmsg += ' ';
1805  } else if (num == 11 && stopmsg.length() < 500) {
1806  stopmsg += ", only report 10 positions";
1807  }
1808  }
1809 
1810  aa++;
1811  }
1812 
1813  if (intercodon) {
1814  if (! feat.IsSetExcept() || feat.GetExcept() == false) {
1815  ErrPostEx(SEV_ERROR, ERR_CDREGION_InternalStopCodonFound, "Found %d internal stop codon, at AA # %s, on feature key, CDS, frame # %d, genetic code %s:%s", (int)num, stopmsg.c_str(), cdregion.GetFrame(), gcode_str.c_str(), loc.c_str());
1816  }
1817 
1818  if (pp->debug) {
1819  ErrByteStorePtr(ibp, feat, prot);
1820  }
1821  }
1822  } else if (! feat.IsSetExcept() || feat.GetExcept() == false) {
1823  ErrPostEx(SEV_WARNING, ERR_CDREGION_NoProteinSeq, "No protein sequence found:%s", loc.c_str());
1824  }
1825 
      // With a /translation qualifier, CkProteinTransl() decides the method
      // and leaves the qualifier value in prot.
1826  if (! qval.empty()) /* compare protein sequence */
1827  {
1828  CkProteinTransl(pp, ibp, prot, feat, qval.c_str(), intercodon, gcode_str.c_str(), &m);
1829  *method = m;
1830  seq_data.swap(prot);
1831  return;
1832  }
1833 
      // No qualifier, clean conceptual translation: use it as is.
1834  if (! prot.empty() && ! intercodon) {
1835  if (prot.size() > 6 || ! check_short_CDS(pp, feat, false)) {
1836  ErrPostEx(SEV_INFO, ERR_CDREGION_TranslationAdded, "input CDS lacks a translation: %s", loc.c_str());
1837  }
1838  *method = eGIBB_method_concept_trans;
1839  seq_data.swap(prot);
1840  return;
1841  }
1842 
1843  /* no translation qual and internal stop codon */
1844  if (intercodon) {
1845  cdregion.SetStops(num);
1846  if (! feat.IsSetExcept() || feat.GetExcept() == false) {
1847  ErrPostEx(SEV_WARNING, ERR_CDREGION_NoProteinSeq, "internal stop codons, and no translation qualifier CDS:%s", loc.c_str());
1848  }
1849  }
1850 }
1851 
1852 /**********************************************************/
1853 static void check_gen_code(const char* qval, ProtBlkPtr pbp, Uint1 taxserver)
1854 {
1855  ErrSev sev;
1856  Uint1 gcpvalue;
1857  Uint1 genome;
1858  Uint1 value;
1859 
1860  if (! pbp || ! pbp->gcode.IsId())
1861  return;
1862 
1863  gcpvalue = pbp->gcode.GetId();
1864  value = (Uint1)atoi(qval);
1865  genome = pbp->genome;
1866 
1867  if (value == gcpvalue)
1868  return;
1869 
1870  if (value == 7 || value == 8) {
1871  ErrPostEx(SEV_WARNING, ERR_CDREGION_InvalidGcodeTable, "genetic code table is obsolete /transl_table = %d", value);
1872  pbp->gcode.SetId(pbp->orig_gcode);
1873  return;
1874  }
1875 
1876  if (value != 11 || (genome != 2 && genome != 3 && genome != 6 &&
1877  genome != 12 && genome != 16 && genome != 17 &&
1878  genome != 18 && genome != 22)) {
1879  sev = (taxserver == 0) ? SEV_INFO : SEV_ERROR;
1880  ErrPostEx(sev, ERR_CDREGION_GeneticCodeDiff, "Genetic code from Taxonomy server: %d, from /transl_table: %d", gcpvalue, value);
1881  }
1882 
1883  pbp->gcode.SetId(value);
1884 }
1885 
1886 /**********************************************************/
// NOTE(review): the line carrying this function's signature (listing line
// 1887) is missing here; the body uses two objects named `gcode` and `code`.
// The body returns false when `gcode` holds no id; otherwise it appends an
// element carrying gcode's id to `code` and returns true.
1888 {
1889  if (! gcode.IsId())
1890  return false;
1891 
      // NOTE(review): listing line 1892 is missing as well; presumably it
      // declares and allocates `ce`, which is used below - confirm against
      // the repository.
1893  ce->SetId(gcode.GetId());
1894  code.Set().push_back(ce);
1895 
1896  return true;
1897 }
1898 
1899 /**********************************************************/
1900 static Int4 IfOnlyStopCodon(const CBioseq& bioseq, const CSeq_feat& feat, bool transl)
1901 {
1902  Uint1 strand;
1903  Int4 len;
1904  Int4 i;
1905 
1906  if (! feat.IsSetLocation() || transl)
1907  return (0);
1908 
1909  const CSeq_loc& loc = feat.GetLocation();
1910  TSeqPos start = loc.GetStart(eExtreme_Positional),
1911  stop = loc.GetStop(eExtreme_Positional) + 1;
1912 
1913  if (start == kInvalidSeqPos || stop == kInvalidSeqPos)
1914  return (0);
1915 
1916  len = stop - start;
1917  if (len < 1 || len > 5)
1918  return (0);
1919 
1920  strand = loc.IsSetStrand() ? loc.GetStrand() : 0;
1921 
1922  string loc_str = location_to_string(loc);
1923  if (loc_str.empty()) {
1924  loc_str = "???";
1925  }
1926  if ((strand == 2 && stop == bioseq.GetLength()) || (strand != 2 && start == 0)) {
1927  ErrPostEx(SEV_INFO, ERR_CDREGION_StopCodonOnly, "Assuming coding region at \"%s\" annotates the stop codon of an upstream or downstream coding region.", loc_str.c_str());
1928  i = 1;
1929  } else {
1930  ErrPostEx(SEV_REJECT, ERR_CDREGION_StopCodonBadInterval, "Coding region at \"%s\" appears to annotate a stop codon, but its location does not include a sequence endpoint.", loc_str.c_str());
1931  i = -1;
1932  }
1933  return (i);
1934 }
1935 
1936 /**********************************************************/
1937 static void fta_concat_except_text(CSeq_feat& feat, const Char* text)
1938 {
1939  if (! text)
1940  return;
1941 
1942  if (feat.IsSetExcept_text()) {
1943  feat.SetExcept_text() += ", ";
1944  feat.SetExcept_text() += text;
1945  } else
1946  feat.SetExcept_text() = text;
1947 }
1948 
1949 /**********************************************************/
// NOTE(review): the line carrying this function's signature (listing line
// 1950) is missing here, and so are listing lines 1986-1987, 1989, 1998 and
// 2002 inside the body, so the text below is not complete C++. Judging by
// the call in CkCdRegion this is presumably fta_check_exception(feat, source)
// - confirm against the repository.
// What the visible lines do: "ribosomal_slippage" and "trans_splicing"
// qualifiers and valid "exception" qualifiers are removed from the feature,
// which is flagged as an exception with the matching text appended to its
// exception text. Returns true, or false when the scan was stopped at an
// invalid /exception value.
1951 {
1952  const char** b;
1953  ErrSev sev;
1954  char* p;
1955 
1956  if (! feat.IsSetQual())
1957  return true;
1958 
1959  bool stopped = false;
      // The iterator advances only when nothing is erased; erase() supplies
      // the next position otherwise.
1960  for (TQualVector::iterator qual = feat.SetQual().begin(); qual != feat.SetQual().end();) {
1961  if (! (*qual)->IsSetQual()) {
1962  ++qual;
1963  continue;
1964  }
1965 
      // These two qualifiers map to fixed exception texts.
1966  if ((*qual)->GetQual() == "ribosomal_slippage")
1967  p = (char*)"ribosomal slippage";
1968  else if ((*qual)->GetQual() == "trans_splicing")
1969  p = (char*)"trans-splicing";
1970  else
1971  p = nullptr;
1972 
1973  if (p) {
1974  feat.SetExcept(true);
1975 
1976  qual = feat.SetQual().erase(qual);
1977  fta_concat_except_text(feat, p);
1978  continue;
1979  }
1980 
1981  if ((*qual)->GetQual() != "exception" || ! (*qual)->IsSetVal()) {
1982  ++qual;
1983  continue;
1984  }
1985 
      // NOTE(review): listing lines 1986-1987 and 1989 are missing; presumably
      // an if/else that points `b` at one of two null-terminated tables of
      // accepted exception strings - confirm.
1988  else
1990 
      // Case-insensitive search of the table for the qualifier value;
      // `b` ends on the terminating null entry when nothing matches.
1991  const Char* cur_val = (*qual)->GetVal().c_str();
1992  for (; *b; b++) {
1993  if (NStr::CompareNocase(*b, cur_val) == 0)
1994  break;
1995  }
1996 
1997  if (! *b) {
      // NOTE(review): listing line 1998 is missing; presumably it assigns
      // `sev`, which is read below - confirm.
1999 
2000  string loc = location_to_string(feat.GetLocation());
2001  ErrPostEx(sev, ERR_QUALIFIER_InvalidException, "/exception value \"%s\" on feature \"CDS\" at location \"%s\" is invalid.", cur_val, loc.empty() ? "Unknown" : loc.c_str());
      // NOTE(review): listing line 2002 is missing; presumably it opens the
      // block that the brace after `break` closes - confirm.
2003  stopped = true;
2004  break;
2005  }
2006  } else {
2007  feat.SetExcept(true);
2008  fta_concat_except_text(feat, cur_val);
2009  }
2010 
2011  qual = feat.SetQual().erase(qual);
2012  }
2013 
2014  if (feat.GetQual().empty())
2015  feat.ResetQual();
2016 
2017  if (stopped)
2018  return false;
2019 
2020  return true;
2021 }
2022 
2023 /**********************************************************
2024  *
2025  * static Int2 CkCdRegion(pp, sfp, bsq, num, gene):
2026  *
2027  * Routine returns 0, and
2028  * sfp->data.choice = SEQFEAT_IMP :
2029  * - if ambiguous frame number and no "codon_start"
2030  * qualifier;
2031  * - if there is a "pseudo" qualifier;
2032  * - if the range of the "transl_except" qualifier is
2033  * wrong;
2034  * Otherwise return 1, and
2035  * sfp->data.choice = SEQFEAT_CDREGION
2036  *
2037  **********************************************************/
2038 static Int2 CkCdRegion(ParserPtr pp, CScope& scope, CSeq_feat& cds, CBioseq& bioseq, int* num, GeneRefFeats& gene_refs)
2039 {
2040  ProtBlkPtr pbp;
2041  const char* r;
2042 
2043  optional<string> qval;
2044  bool is_pseudo;
2045  bool is_stop;
2046  bool is_transl;
2047 
2048  ErrSev sev;
2049 
2050  Uint1 method = 0;
2051  Uint1 dif;
2052  Uint1 codon_start;
2053  Int2 frame;
2054  Int2 i;
2055 
2056  pbp = pp->pbp;
2057  if (pp->buf)
2058  MemFree(pp->buf);
2059  pp->buf = nullptr;
2060 
2061  TCodeBreakList code_breaks;
2062  GetCdRegionCB(pbp->ibp, cds, code_breaks, &dif, pp->accver);
2063 
2064  is_pseudo = cds.IsSetPseudo() ? cds.GetPseudo() : false;
2065  is_transl = FindTheQual(cds, "translation");
2066 
2067  CCode_break* first_code_break = nullptr;
2068  if (! code_breaks.empty())
2069  first_code_break = *code_breaks.begin();
2070 
2071  if (first_code_break && first_code_break->GetAa().GetNcbieaa() == 42)
2072  is_stop = true;
2073  else if (is_pseudo)
2074  is_stop = false;
2075  else {
2076  i = IfOnlyStopCodon(bioseq, cds, is_transl);
2077  if (i < 0)
2078  return (-1);
2079  is_stop = (i != 0);
2080  }
2081 
2082  if (! is_transl) {
2083  bool found = false;
2084  for (const auto& qual : cds.GetQual()) {
2085  if (! qual->IsSetQual() || ! qual->IsSetVal())
2086  continue;
2087 
2088  if (qual->GetQual() == "product" ||
2089  qual->GetQual() == "function" ||
2090  qual->GetQual() == "EC_number") {
2091  found = true;
2092  break;
2093  }
2094  }
2095 
2096  if (found) {
2097  CRef<CSeqFeatXref> xfer(new CSeqFeatXref);
2098  CProt_ref& prot_ref = xfer->SetData().SetProt();
2099  for (const auto& qual : cds.GetQual()) {
2100  if (! qual->IsSetQual() || ! qual->IsSetVal())
2101  continue;
2102 
2103  if (qual->GetQual() == "product")
2104  prot_ref.SetName().push_back(qual->GetVal());
2105  else if (qual->GetQual() == "EC_number")
2106  prot_ref.SetEc().push_back(qual->GetVal());
2107  else if (qual->GetQual() == "function")
2108  prot_ref.SetActivity().push_back(qual->GetVal());
2109  }
2110 
2111  DeleteQual(cds.SetQual(), "product");
2112  DeleteQual(cds.SetQual(), "EC_number");
2113  DeleteQual(cds.SetQual(), "function");
2114 
2115  if (cds.GetQual().empty())
2116  cds.ResetQual();
2117 
2118  cds.SetXref().push_back(xfer);
2119  }
2120  }
2121 
2122  if (pp->accver && is_transl && is_pseudo) {
2123  string loc = location_to_string(cds.GetLocation());
2124  ErrPostEx(SEV_ERROR, ERR_CDREGION_PseudoWithTranslation, "Coding region flagged as /pseudo has a /translation qualifier : \"%s\".", loc.c_str());
2125  return (-1);
2126  }
2127 
2128  if (pp->mode != Parser::EMode::Relaxed &&
2129  pp->accver && is_transl == false && FindTheQual(cds, "protein_id")) {
2130  string loc = location_to_string(cds.GetLocation());
2131  ErrPostEx(SEV_ERROR, ERR_CDREGION_UnexpectedProteinId, "CDS without /translation should not have a /protein_id qualifier. CDS = \"%s\".", loc.c_str());
2132  return (-1);
2133  }
2134 
2135  if (pp->mode != Parser::EMode::Relaxed &&
2136  is_transl == false && is_pseudo == false && is_stop == false) {
2137  string loc = location_to_string(cds.GetLocation());
2138  if (pp->accver == false) {
2139  r = "Feature and protein bioseq";
2140  i = -2;
2141  } else {
2142  r = "Record";
2143  i = -1;
2144  }
2145  sev = (i == -1) ? SEV_REJECT : SEV_ERROR;
2146  ErrPostEx(sev, ERR_CDREGION_MissingTranslation, "Missing /translation qualifier for CDS \"%s\". %s rejected.", loc.c_str(), r);
2147  return (i);
2148  }
2149 
2150  /* check exception qualifier */
2151  if (! fta_check_exception(cds, pp->source))
2152  return (-1);
2153 
2154  CRef<CImp_feat> imp_feat(new CImp_feat);
2155  if (cds.IsSetData() && cds.GetData().IsImp())
2156  imp_feat->Assign(cds.GetData().GetImp());
2157 
2158  codon_start = 1;
2159  qval = GetTheQualValue(cds.SetQual(), "codon_start");
2160 
2161  if (! qval) {
2162  if (pp->source == Parser::ESource::EMBL)
2163  frame = 1;
2164  else {
2165  frame = 0;
2167  if (CCleanup::SetFrameFromLoc(loc_frame, cds.GetLocation(), scope))
2168  frame = loc_frame;
2169 
2170  if (frame == 0 && is_pseudo == false) {
2171  string loc = location_to_string(cds.GetLocation());
2172  sev = (pp->source == Parser::ESource::DDBJ) ? SEV_INFO : SEV_ERROR;
2173  ErrPostEx(sev, ERR_CDREGION_MissingCodonStart, "CDS feature \"%s\" is lacking /codon_start qualifier; assuming frame = 1.", loc.c_str());
2174  frame = 1;
2175  }
2176  }
2177  } else {
2178  frame = (Uint1)atoi(qval->c_str());
2179  qval.reset();
2180  }
2181 
2182  CRef<CCdregion> cdregion(new CCdregion);
2183 
2184  if (frame > 0)
2185  cdregion->SetFrame(static_cast<CCdregion::EFrame>(frame));
2186 
2187  qval = GetTheQualValue(cds.SetQual(), "transl_table");
2188 
2189  if (qval) {
2190  check_gen_code(qval->c_str(), pbp, pp->taxserver);
2191  pp->no_code = false;
2192  qval.reset();
2193  } else if (pbp && pbp->gcode.IsId())
2194  pbp->gcode.SetId(pbp->orig_gcode);
2195 
2196  if (! code_breaks.empty())
2197  cdregion->SetCode_break().swap(code_breaks);
2198 
2199  if (! CpGeneticCodePtr(cdregion->SetCode(), pbp->gcode))
2200  cdregion->ResetCode();
2201 
2202  cds.SetData().SetCdregion(*cdregion);
2203 
2204  if (cds.GetQual().empty())
2205  cds.ResetQual();
2206 
2207  if (! is_transl) {
2208  imp_feat.Reset();
2209  return (0);
2210  }
2211 
2212  CBioseq::TId ids;
2213  GetProtRefSeqId(ids, pbp->ibp, num, pp, scope, cds);
2214 
2215  if (! ids.empty())
2216  fta_check_codon_quals(cds);
2217 
2218  string sequence_data;
2219  InternalStopCodon(pp, pbp->ibp, cds, &method, dif, gene_refs, sequence_data);
2220 
2221  if (cds.GetQual().empty())
2222  cds.ResetQual();
2223 
2224  if (cdregion->IsSetConflict() && cdregion->GetConflict() && codon_start == 0) {
2225  string loc = location_to_string(cds.GetLocation());
2226  ErrPostEx(SEV_ERROR, ERR_CDREGION_TooBad, "Input translation does not agree with parser generated one, cdregion \"%s\" is lacking /codon_start, frame not set, - so sequence will be rejected.", loc.c_str());
2227  return (-1);
2228  }
2229 
2230  if (! sequence_data.empty()) {
2231  imp_feat.Reset();
2232  CRef<CBioseq> new_bioseq = BldProtRefSeqEntry(pbp, cds, sequence_data, method, pp, bioseq, ids);
2233 
2234  if (new_bioseq.Empty()) {
2235  return (-1);
2236  }
2237 
2238  scope.AddBioseq(*new_bioseq);
2239 
2240  /* remove qualifiers which were processed before */
2241  DeleteQual(cds.SetQual(), "codon_start");
2242  DeleteQual(cds.SetQual(), "transl_except");
2243  DeleteQual(cds.SetQual(), "translation");
2244  DeleteQual(cds.SetQual(), "protein_id");
2245 
2246  if (cds.GetQual().empty())
2247  cds.ResetQual();
2248 
2249  if (sequence_data.size() < 6 && pp->accver == false && check_short_CDS(pp, cds, true)) {
2250  /* make xref from prot-ref for short CDS only */
2251  if (new_bioseq->IsSetAnnot()) {
2252  for (const auto& annot : new_bioseq->GetAnnot()) {
2253  if (! annot->IsFtable())
2254  continue;
2255 
2256  for (const auto& cur_feat : annot->GetData().GetFtable()) {
2257  if (! cur_feat->IsSetData() || ! cur_feat->GetData().IsProt())
2258  continue;
2259 
2260  CRef<CSeqFeatXref> new_xref(new CSeqFeatXref);
2261  new_xref->SetData().SetProt().Assign(cur_feat->GetData().GetProt());
2262 
2263  cds.SetXref().push_back(new_xref);
2264  }
2265  }
2266  }
2267  return (0);
2268  }
2269 
2270  CSeq_id& first_id = *(*new_bioseq->SetId().begin());
2271  cds.SetProduct().SetWhole(first_id);
2272 
2273  AddProtRefSeqEntry(pbp, *new_bioseq);
2274 
2275  return (1);
2276  }
2277 
2278  /* no protein sequence, or there is no translation qualifier
2279  * and protein sequence has internal stop codon
2280  */
2281 
2282  cds.SetExcept(false);
2283  if (cds.IsSetExcept_text())
2284  cds.ResetExcept_text();
2285 
2286  cds.ResetData();
2287  if (imp_feat.NotEmpty())
2288  cds.SetData().SetImp(*imp_feat);
2289 
2290  if (! is_pseudo) {
2291  string loc = location_to_string(cds.GetLocation());
2292  ErrPostEx(SEV_ERROR, ERR_CDREGION_ConvertToImpFeat, "non-pseudo CDS with data problems is converted to ImpFeat%s", loc.c_str());
2293  }
2294  return (0);
2295 }
2296 
2297 /**********************************************************
2298  *
2299  * static SeqFeatPtr SrchCdRegion(pp, bsp, sfp, gene):
2300  *
2301  * Return a link list of SeqFeatPtr of type CDREGION.
2302  *
2303  **********************************************************/
2304 static void SrchCdRegion(ParserPtr pp, CScope& scope, CBioseq& bioseq, CSeq_annot& annot, GeneRefFeats& gene_refs)
2305 {
2306  Int4 num = 0;
2307  Int2 i;
2308 
2309  if (! annot.IsSetData() || ! annot.GetData().IsFtable())
2310  return;
2311 
2312  for (CSeq_annot::C_Data::TFtable::iterator feat = annot.SetData().SetFtable().begin();
2313  feat != annot.SetData().SetFtable().end();) {
2314  if (! (*feat)->IsSetData() || ! (*feat)->GetData().IsImp()) {
2315  ++feat;
2316  continue;
2317  }
2318 
2319  const CImp_feat& imp_feat = (*feat)->GetData().GetImp();
2320  if (! imp_feat.IsSetKey() || imp_feat.GetKey() != "CDS") {
2321  ++feat;
2322  continue;
2323  }
2324 
2325  /* remove asn2ff_generated comments */
2326  StripCDSComment(*(*feat));
2327 
2328  const CSeq_loc& loc = (*feat)->GetLocation();
2329  if (loc.IsEmpty() || loc.IsEquiv() || loc.IsBond()) {
2330  string loc_str = location_to_string(loc);
2331  ErrPostEx(SEV_REJECT, ERR_CDREGION_BadLocForTranslation, "Coding region feature has a location that cannot be processed: \"%s\".", loc_str.c_str());
2332  pp->entrylist[pp->curindx]->drop = true;
2333  break;
2334  }
2335 
2336  i = CkCdRegion(pp, scope, *(*feat), bioseq, &num, gene_refs);
2337 
2338  if (i == -2) {
2339  feat = annot.SetData().SetFtable().erase(feat);
2340  continue;
2341  }
2342 
2343  if (i == -1) {
2344  pp->entrylist[pp->curindx]->drop = true;
2345  break;
2346  }
2347 
2348  if (i != 1) {
2349  ++feat;
2350  continue;
2351  }
2352 
2353  /* prepare cdregion to link list, for nuc-prot level */
2354  pp->pbp->feats.push_back(*feat);
2355 
2356  feat = annot.SetData().SetFtable().erase(feat);
2357  }
2358 }
2359 
2360 /**********************************************************/
2361 static void FindCd(TEntryList& seq_entries, CScope& scope, ParserPtr pp, GeneRefFeats& gene_refs)
2362 {
2363  ProtBlkPtr pbp = pp->pbp;
2364 
2365  for (auto& entry : seq_entries) {
2366  for (CTypeIterator<CBioseq_set> bio_set(Begin(*entry)); bio_set; ++bio_set) {
2367  pbp->segset = true;
2368  pbp->biosep = entry;
2369  break;
2370  }
2371 
2372  if (pbp->segset)
2373  break;
2374  }
2375 
2376  for (auto& entry : seq_entries) {
2377  for (CTypeIterator<CBioseq> bioseq(Begin(*entry)); bioseq; ++bioseq) {
2378  const CSeq_id& first_id = *(*bioseq->GetId().begin());
2379  if (IsSegBioseq(first_id))
2380  continue;
2381 
2382  if (pp->source != Parser::ESource::USPTO)
2383  CpSeqId(pbp->ibp, first_id);
2384 
2385  if (bioseq->IsSetAnnot()) {
2386  for (CBioseq::TAnnot::iterator annot = bioseq->SetAnnot().begin(); annot != bioseq->SetAnnot().end();) {
2387  if (! (*annot)->IsFtable()) {
2388  ++annot;
2389  continue;
2390  }
2391 
2392  SrchCdRegion(pp, scope, *bioseq, *(*annot), gene_refs);
2393  if (! (*annot)->GetData().GetFtable().empty()) {
2394  ++annot;
2395  continue;
2396  }
2397 
2398  annot = bioseq->SetAnnot().erase(annot);
2399  }
2400 
2401  if (bioseq->GetAnnot().empty())
2402  bioseq->ResetAnnot();
2403  }
2404 
2405  if (! pbp->segset) {
2406  pbp->biosep = entry;
2407  }
2408  }
2409  }
2410 }
2411 
2412 /**********************************************************/
2413 static bool check_GIBB(TSeqdescList& descrs)
2414 {
2415  if (descrs.empty())
2416  return false;
2417 
2418  const CSeqdesc* descr_modif = nullptr;
2419  for (const auto& descr : descrs) {
2420  if (descr->IsModif()) {
2421  descr_modif = descr;
2422  break;
2423  }
2424  }
2425  if (! descr_modif)
2426  return true;
2427 
2428  if (! descr_modif->GetModif().empty()) {
2429  EGIBB_mod gmod = *descr_modif->GetModif().begin();
2430  if (gmod == eGIBB_mod_dna || gmod == eGIBB_mod_rna ||
2431  gmod == eGIBB_mod_est)
2432  return false;
2433  }
2434  return true;
2435 }
2436 
2437 /**********************************************************/
2438 static void ValNodeExtractUserObject(TSeqdescList& descrs_from, TSeqdescList& descrs_to, const Char* tag)
2439 {
2440  for (TSeqdescList::iterator descr = descrs_from.begin(); descr != descrs_from.end();) {
2441  if ((*descr)->IsUser() && (*descr)->GetUser().IsSetData() &&
2442  (*descr)->GetUser().IsSetType() && (*descr)->GetUser().GetType().IsStr() &&
2443  (*descr)->GetUser().GetType().GetStr() == tag) {
2444  descrs_to.push_back(*descr);
2445  descr = descrs_from.erase(descr);
2446  break;
2447  } else
2448  ++descr;
2449  }
2450 }
2451 
2452 /**********************************************************/
2453 void ExtractDescrs(TSeqdescList& descrs_from, TSeqdescList& descrs_to, CSeqdesc::E_Choice choice)
2454 {
2455  for (TSeqdescList::iterator descr = descrs_from.begin(); descr != descrs_from.end();) {
2456  if ((*descr)->Which() == choice) {
2457  descrs_to.push_back(*descr);
2458  descr = descrs_from.erase(descr);
2459  } else
2460  ++descr;
2461  }
2462 }
2463 
2464 /**********************************************************/
2465 static void GetBioseqSetDescr(ProtBlkPtr pbp, TSeqdescList& descrs)
2466 {
2467  TSeqdescList* descrs_from = nullptr;
2468  if (pbp->segset) {
2469  if (! pbp->biosep->GetSet().GetDescr().Get().empty())
2470  descrs_from = &pbp->biosep->SetSet().SetDescr().Set();
2471  } else {
2472  if (! pbp->biosep->GetSeq().GetDescr().Get().empty())
2473  descrs_from = &pbp->biosep->SetSeq().SetDescr().Set();
2474  }
2475 
2476  if (! descrs_from)
2477  return;
2478 
2479  ExtractDescrs(*descrs_from, descrs, CSeqdesc::e_Org);
2480 
2481  if (check_GIBB(*descrs_from)) {
2482  ExtractDescrs(*descrs_from, descrs, CSeqdesc::e_Modif);
2483  }
2484 
2485  ExtractDescrs(*descrs_from, descrs, CSeqdesc::e_Comment);
2486  ExtractDescrs(*descrs_from, descrs, CSeqdesc::e_Pub);
2487  ExtractDescrs(*descrs_from, descrs, CSeqdesc::e_Update_date);
2488 
2489  ValNodeExtractUserObject(*descrs_from, descrs, "GenomeProjectsDB");
2490  ValNodeExtractUserObject(*descrs_from, descrs, "DBLink");
2491  ValNodeExtractUserObject(*descrs_from, descrs, "FeatureFetchPolicy");
2492 }
2493 
2494 /**********************************************************/
2496 {
2497  CRef<CSeq_entry> entry(new CSeq_entry);
2498  CBioseq_set& seq_set = entry->SetSet();
2500 
2501  /* add descr if nuc-prot */
2502  GetBioseqSetDescr(pbp, seq_set.SetDescr().Set()); /* get from ASN.1 tree */
2503  if (seq_set.GetDescr().Get().empty())
2504  seq_set.ResetDescr();
2505 
2506  seq_set.SetSeq_set().splice(seq_set.SetSeq_set().end(), entries);
2507 
2508  CRef<CSeq_annot> annot(new CSeq_annot);
2509 
2510  if (! pbp->feats.empty())
2511  annot->SetData().SetFtable().swap(pbp->feats);
2512 
2513  seq_set.SetAnnot().push_back(annot);
2514 
2515  entries.push_back(entry);
2516 }
2517 
2518 /**********************************************************/
2519 void ProcNucProt(ParserPtr pp, TEntryList& seq_entries, GeneRefFeats& gene_refs)
2520 {
2521  ProtBlkPtr pbp;
2522  ErrSev sev;
2523  Int4 gcode = 0;
2524 
2525  pbp = pp->pbp;
2526  ProtBlkInit(pbp);
2527 
2528  GetGcode(seq_entries, pp);
2529 
2530  if (! pbp->gcode.IsId()) {
2531  gcode = (pbp->genome == 4 || pbp->genome == 5) ? 2 : 1;
2532  pp->no_code = true;
2533  sev = (pp->taxserver == 0) ? SEV_INFO : SEV_WARNING;
2534  ErrPostEx(sev, ERR_CDREGION_GeneticCodeAssumed, "No %sgenetic code from TaxArch, code %d assumed", (gcode == 2) ? "mitochondrial " : "", gcode);
2535  pbp->gcode.SetId(gcode);
2536  pbp->orig_gcode = gcode;
2537  }
2538 
2539  FindCd(seq_entries, GetScope(), pp, gene_refs);
2540 
2541  if (pp->entrylist[pp->curindx]->drop) {
2542  ProtBlkFree(pbp);
2543  seq_entries.clear();
2544  return;
2545  }
2546 
2547  if (! pbp->entries.empty()) {
2548  seq_entries.splice(seq_entries.end(), pbp->entries);
2549 
2550  BuildProtBioseqSet(pbp, seq_entries);
2551  AssignBioseqSetLevel(seq_entries);
2552  }
2553 
2554  ProtBlkFree(pbp);
2555 }
2556 
2557 /**********************************************************/
2558 static const CDate* GetDateFromDescrs(const TSeqdescList& descrs, CSeqdesc::E_Choice what)
2559 {
2560  const CDate* set_date = nullptr;
2561  for (const auto& descr : descrs) {
2562  if (descr->Which() == what) {
2563  if (what == CSeqdesc::e_Create_date)
2564  set_date = &descr->GetCreate_date();
2565  else if (what == CSeqdesc::e_Update_date)
2566  set_date = &descr->GetUpdate_date();
2567 
2568  if (set_date)
2569  break;
2570  }
2571  }
2572 
2573  return set_date;
2574 }
2575 
2576 /**********************************************************/
2577 static void FixDupDates(CBioseq_set& bio_set, CSeqdesc::E_Choice what)
2578 {
2579  if (! bio_set.IsSetSeq_set() || ! bio_set.IsSetDescr())
2580  return;
2581 
2582  for (auto& entry : bio_set.SetSeq_set()) {
2583  for (CTypeIterator<CBioseq> bioseq(Begin(*entry)); bioseq; ++bioseq) {
2584  if (! bioseq->IsSetInst() || ! bioseq->GetInst().IsSetMol() || ! bioseq->GetInst().IsNa() || ! bioseq->IsSetDescr())
2585  continue;
2586 
2587  const CDate* set_date = GetDateFromDescrs(bio_set.GetDescr().Get(), what);
2588 
2589  TSeqdescList& cur_descrs = bioseq->SetDescr().Set();
2590  TSeqdescList::iterator cur_descr = cur_descrs.begin();
2591 
2592  for (; cur_descr != cur_descrs.end(); ++cur_descr) {
2593  if ((*cur_descr)->Which() == what)
2594  break;
2595  }
2596 
2597  if (cur_descr == cur_descrs.end())
2598  continue;
2599 
2600  const CDate* seq_date = nullptr;
2601  if (what == CSeqdesc::e_Create_date)
2602  seq_date = &(*cur_descr)->GetCreate_date();
2603  else if (what == CSeqdesc::e_Update_date)
2604  seq_date = &(*cur_descr)->GetUpdate_date();
2605 
2606  if (! seq_date)
2607  continue;
2608 
2609  if (set_date && seq_date->Compare(*set_date) == CDate::eCompare_same)
2610  cur_descrs.erase(cur_descr);
2611 
2612  if (! set_date) {
2613  bio_set.SetDescr().Set().push_back(*cur_descr);
2614  cur_descrs.erase(cur_descr);
2615  }
2616  }
2617  }
2618 }
2619 
2620 /**********************************************************/
2621 static void FixCreateDates(CBioseq_set& bio_set)
2622 {
2624 }
2625 
2626 /**********************************************************/
2627 static void FixUpdateDates(CBioseq_set& bio_set)
2628 {
2630 }
2631 
2632 /**********************************************************/
2633 static void FixEmblUpdateDates(CBioseq_set& bio_set)
2634 {
2635  if (! bio_set.IsSetSeq_set() || ! bio_set.IsSetDescr())
2636  return;
2637 
2638  for (auto& entry : bio_set.SetSeq_set()) {
2639  for (CTypeIterator<CBioseq> bioseq(Begin(*entry)); bioseq; ++bioseq) {
2640  if (! bioseq->IsSetInst() || ! bioseq->GetInst().IsSetMol() || ! bioseq->GetInst().IsNa() || ! bioseq->IsSetDescr())
2641  continue;
2642 
2643  const CDate* set_date = GetDateFromDescrs(bio_set.GetDescr().Get(), CSeqdesc::e_Update_date);
2644 
2645  const CEMBL_block* embl_block = nullptr;
2646  for (const auto& descr : bioseq->GetDescr().Get()) {
2647  if (descr->IsEmbl()) {
2648  embl_block = &descr->GetEmbl();
2649  break;
2650  }
2651  }
2652 
2653  const CDate* seq_date = nullptr;
2654  if (embl_block && embl_block->IsSetUpdate_date())
2655  seq_date = &embl_block->GetUpdate_date();
2656 
2657  if (! seq_date)
2658  continue;
2659 
2660  if (set_date && seq_date->Compare(*set_date) == CDate::eCompare_same)
2661  continue;
2662 
2663  if (! set_date) {
2664  CRef<CSeqdesc> new_descr(new CSeqdesc);
2665  new_descr->SetUpdate_date().Assign(*seq_date);
2666  bio_set.SetDescr().Set().push_back(new_descr);
2667  }
2668  }
2669  }
2670 }
2671 
2672 /**********************************************************/
2673 void CheckDupDates(TEntryList& seq_entries)
2674 {
2675  for (auto& entry : seq_entries) {
2676  for (CTypeIterator<CBioseq_set> bio_set(Begin(*entry)); bio_set; ++bio_set) {
2677  if (bio_set->IsSetClass() && bio_set->GetClass() == CBioseq_set::eClass_nuc_prot) {
2678  FixCreateDates(*bio_set);
2679  FixUpdateDates(*bio_set);
2680  FixEmblUpdateDates(*bio_set);
2681  }
2682  }
2683  }
2684 }
2685 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
@ eExtreme_Positional
numerical value
Definition: Na_strand.hpp:63
@ eExtreme_Biological
5' and 3'
Definition: Na_strand.hpp:62
User-defined methods of the data storage class.
CRef< CSeq_loc > fta_get_seqloc_int_whole(CSeq_id &seq_id, size_t len)
Definition: add.cpp:1453
string tata_save(string_view t)
Definition: add.cpp:148
CRef< CSeq_id > StrToSeqId(const char *pch, bool pid)
Definition: asci_blk.cpp:2660
CRef< CPatent_seq_id > MakeUsptoPatSeqId(const char *acc)
Definition: asci_blk.cpp:830
bool IsSegBioseq(const CSeq_id &id)
Definition: asci_blk.cpp:2506
void ShrinkSpaces(char *line)
Definition: asci_blk.cpp:118
const CSeq_descr & GetDescrPointer(const CSeq_entry &entry)
Definition: asci_blk.cpp:2924
CRef< objects::CSeq_id > GetNewProtId(objects::CBioseq_Handle bsh, int &offset, string &id_label, bool general_only)
CBioseq_Handle –.
TSeqPos GetLength(void) const
Definition: Bioseq.cpp:360
CCdregion –.
Definition: Cdregion.hpp:66
static bool SetFrameFromLoc(CCdregion &cdregion, const CSeq_loc &loc, CScope &scope)
Chooses best frame based on location 1.
Definition: cleanup.cpp:1253
CCode_break –.
Definition: Code_break.hpp:66
Definition: Date.hpp:53
ECompare Compare(const CDate &date) const
Definition: Date.cpp:83
@ eCompare_same
They're equivalent.
Definition: Date.hpp:75
CEMBL_block –.
Definition: EMBL_block.hpp:66
@Imp_feat.hpp User-defined methods of the data storage class.
Definition: Imp_feat.hpp:54
CScope –.
Definition: scope.hpp:92
CSeqFeatXref –.
Definition: SeqFeatXref.hpp:66
SeqMap related exceptions.
@Seq_descr.hpp User-defined methods of the data storage class.
Definition: Seq_descr.hpp:55
Definition: Seq_entry.hpp:56
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
Seq-loc iterator class – iterates all intervals from a seq-loc in the correct order.
Definition: Seq_loc.hpp:453
Template class for iteration on objects of class C.
Definition: iterator.hpp:673
#define ERR_CDREGION_SuppliedProteinUsed
Definition: flat2err.h:448
#define ERR_CDREGION_IllegalStart
Definition: flat2err.h:421
#define ERR_CDREGION_ProteinLenDiff
Definition: flat2err.h:447
#define ERR_CDREGION_MissingProteinVersion
Definition: flat2err.h:437
#define ERR_QUALIFIER_InvalidException
Definition: flat2err.h:118
#define ERR_CDREGION_UnexpectedProteinId
Definition: flat2err.h:443
#define ERR_CDREGION_MissingCodonStart
Definition: flat2err.h:440
#define ERR_CDREGION_PseudoWithTranslation
Definition: flat2err.h:442
#define ERR_CDREGION_ConvertToImpFeat
Definition: flat2err.h:429
#define ERR_FEATURE_LocationParsing
Definition: flat2err.h:335
#define ERR_CDREGION_MissingProteinId
Definition: flat2err.h:436
#define ERR_CDREGION_InvalidDb_xref
Definition: flat2err.h:433
#define ERR_CDREGION_LocationLength
Definition: flat2err.h:431
#define ERR_CDREGION_BadLocForTranslation
Definition: flat2err.h:430
#define ERR_CDREGION_CodonQualifierUsed
Definition: flat2err.h:453
#define ERR_CDREGION_IncorrectProteinVersion
Definition: flat2err.h:438
#define ERR_CDREGION_NoTranslationCompare
Definition: flat2err.h:426
#define ERR_CDREGION_MissingTranslation
Definition: flat2err.h:441
#define ERR_CDREGION_NoProteinSeq
Definition: flat2err.h:417
#define ERR_BIOSEQSETCLASS_NewClass
Definition: flat2err.h:413
#define ERR_CDREGION_GeneticCodeAssumed
Definition: flat2err.h:425
#define ERR_CDREGION_TranslationOverride
Definition: flat2err.h:432
#define ERR_CDREGION_TooBad
Definition: flat2err.h:435
#define ERR_CDREGION_TranslationDiff
Definition: flat2err.h:419
#define ERR_CDREGION_TranslationsAgree
Definition: flat2err.h:420
#define ERR_CDREGION_TerminalStopCodonMissing
Definition: flat2err.h:418
#define ERR_CDREGION_IncorrectProteinAccession
Definition: flat2err.h:439
#define ERR_CDREGION_ShortProtein
Definition: flat2err.h:424
#define ERR_CDREGION_Multiple_PID
Definition: flat2err.h:434
#define ERR_CDREGION_InvalidGcodeTable
Definition: flat2err.h:428
#define ERR_PROTREF_NoNameForProtein
Definition: flat2err.h:464
#define ERR_CDREGION_InternalStopCodonFound
Definition: flat2err.h:416
#define ERR_CDREGION_GeneticCodeDiff
Definition: flat2err.h:422
#define ERR_CDREGION_UnevenLocation
Definition: flat2err.h:423
#define ERR_CDREGION_TranslationAdded
Definition: flat2err.h:427
#define ERR_CDREGION_StopCodonOnly
Definition: flat2err.h:445
#define ERR_CDREGION_StopCodonBadInterval
Definition: flat2err.h:446
list< CRef< objects::CSeq_entry > > TEntryList
bool fta_if_special_org(const Char *name)
Definition: fta_src.cpp:909
std::list< CRef< objects::CSeq_id > > TSeqIdList
Definition: ftablock.h:57
std::list< CRef< objects::CSeqdesc > > TSeqdescList
Definition: ftablock.h:60
void MemFree(char *p)
Definition: ftacpp.hpp:55
size_t StringLen(const char *s)
Definition: ftacpp.hpp:60
void ErrLogPrintStr(const char *str)
Definition: ftaerr.cpp:422
static const struct name_t names[]
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static const char * str(char *buf, int n)
Definition: stats.c:84
static const char location[]
Definition: config.c:97
char data[12]
Definition: iconv.c:80
#define SEV_INFO
Definition: gicache.c:89
#define SEV_WARNING
Definition: gicache.c:90
#define SEV_ERROR
Definition: gicache.c:91
#define SEV_FATAL
Definition: gicache.c:93
#define SEV_REJECT
Definition: gicache.c:92
#define GI_FROM(T, value)
Definition: ncbimisc.hpp:1086
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
const TSeqPos kInvalidSeqPos
Define special value for invalid sequence position.
Definition: ncbimisc.hpp:878
#define StringStr
Definition: ncbistr.hpp:322
#define StringSave
Definition: ncbistr.hpp:326
#define StringChr
Definition: ncbistr.hpp:317
#define ErrPostEx(sev, err_code,...)
Definition: ncbierr.hpp:78
ErrSev
Definition: ncbierr.hpp:63
const string & GetMsg(void) const
Get message string.
Definition: ncbiexpt.cpp:461
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
static SIZE_TYPE ParseIDs(CBioseq::TId &ids, const CTempString &s, TParseFlags flags=fParse_Default)
Parse a string representing one or more Seq-ids, appending the results to IDS.
Definition: Seq_id.cpp:2613
bool IsPartialStart(ESeqLocExtremes ext) const
check start or stop of location for e_Lim fuzz
Definition: Seq_loc.cpp:3222
ENa_strand GetStrand(void) const
Get the location's strand.
Definition: Seq_loc.cpp:882
TSeqPos GetStart(ESeqLocExtremes ext) const
Return start and stop positions of the seq-loc.
Definition: Seq_loc.cpp:915
bool IsSetStrand(EIsSetStrand flag=eIsSetStrand_Any) const
Check if strand is set for any/all part(s) of the seq-loc depending on the flag.
Definition: Seq_loc.cpp:858
const CSeq_id * GetId(void) const
Get the id of the location return NULL if has multiple ids or no id at all.
Definition: Seq_loc.hpp:941
bool IsPartialStop(ESeqLocExtremes ext) const
Definition: Seq_loc.cpp:3251
TSeqPos GetStop(ESeqLocExtremes ext) const
Definition: Seq_loc.cpp:963
CBeginInfo Begin(C &obj)
Get starting point of object hierarchy.
Definition: iterator.hpp:1004
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
TSeqPos LocationOffset(const CSeq_loc &outer, const CSeq_loc &inner, EOffsetType how=eOffset_FromStart, CScope *scope=0)
returns (TSeqPos)-1 if the locations don't overlap
sequence::ECompare Compare(const CSeq_loc &loc1, const CSeq_loc &loc2, CScope *scope)
Returns the sequence::ECompare containment relationship between CSeq_locs.
ECompare
@ fCompareOverlapping
Check if seq-locs are overlapping.
@ eContains
First CSeq_loc contains second.
@ eOverlap
CSeq_locs overlap.
@ eSame
CSeq_locs contain each other.
@ eContained
First CSeq_loc contained by second.
@ eNoOverlap
CSeq_locs do not overlap or abut.
@ eOffset_FromEnd
relative to end of location
@ eOffset_FromStart
For positive-orientation strands, start = left and end = right; for reverse-orientation strands,...
static void Translate(const string &seq, string &prot, const CGenetic_code *code, bool include_stop=true, bool remove_trailing_X=false, bool *alt_start=NULL, bool is_5prime_complete=true, bool is_3prime_complete=true)
Translate a string using a specified genetic code.
Definition: sequence.cpp:4095
CBioseq_Handle AddBioseq(CBioseq &bioseq, TPriority pri=kPriority_Default, EExist action=eExist_Throw)
Add bioseq, return bioseq handle.
Definition: scope.cpp:530
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
TBioseqStateFlags GetState(void) const
Get state of the bioseq.
TSeqPos GetBioseqLength(void) const
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
char Char
Alias for char.
Definition: ncbitype.h:93
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
static string Sanitize(CTempString str, TSS_Flags flags=fSS_print)
Sanitize a string, allowing only specified classes of characters.
Definition: ncbistr.hpp:2876
const TUpdate_date & GetUpdate_date(void) const
Get the Update_date member data.
bool IsSetUpdate_date(void) const
Check if a value has been assigned to Update_date data member.
bool IsSetSyn(void) const
synonyms for locus Check if a value has been assigned to Syn data member.
Definition: Gene_ref_.hpp:756
const TSyn & GetSyn(void) const
Get the Syn member data.
Definition: Gene_ref_.hpp:768
const TLocus & GetLocus(void) const
Get the Locus member data.
Definition: Gene_ref_.hpp:505
void SetTag(TTag &value)
Assign a value to Tag data member.
Definition: Dbtag_.cpp:66
TStr & SetStr(void)
Select the variant.
Definition: Object_id_.hpp:304
void SetDb(const TDb &value)
Assign a value to Db data member.
Definition: Dbtag_.hpp:229
@ eLim_gt
greater than
Definition: Int_fuzz_.hpp:211
@ eLim_lt
less than
Definition: Int_fuzz_.hpp:212
TActivity & SetActivity(void)
Assign a value to Activity data member.
Definition: Prot_ref_.hpp:481
TEc & SetEc(void)
Assign a value to Ec data member.
Definition: Prot_ref_.hpp:456
void SetDesc(const TDesc &value)
Assign a value to Desc data member.
Definition: Prot_ref_.hpp:412
TName & SetName(void)
Assign a value to Name data member.
Definition: Prot_ref_.hpp:384
void SetAa(TAa &value)
Assign a value to Aa data member.
TXref & SetXref(void)
Assign a value to Xref data member.
Definition: Seq_feat_.hpp:1314
const TKey & GetKey(void) const
Get the Key member data.
Definition: Imp_feat_.hpp:259
bool IsSetComment(void) const
Check if a value has been assigned to Comment data member.
Definition: Seq_feat_.hpp:1037
void SetStops(TStops value)
Assign a value to Stops data member.
Definition: Cdregion_.hpp:774
TDbxref & SetDbxref(void)
Assign a value to Dbxref data member.
Definition: Seq_feat_.hpp:1339
bool IsSetData(void) const
the specific data Check if a value has been assigned to Data data member.
Definition: Seq_feat_.hpp:913
bool IsSetQual(void) const
qualifiers Check if a value has been assigned to Qual data member.
Definition: Seq_feat_.hpp:1135
bool IsSetCode(void) const
genetic code used Check if a value has been assigned to Code data member.
Definition: Cdregion_.hpp:700
TId GetId(void) const
Get the variant data.
void SetLocation(TLocation &value)
Assign a value to Location data member.
Definition: Seq_feat_.cpp:131
bool IsImp(void) const
Check if variant Imp is selected.
const TLoc & GetLoc(void) const
Get the Loc member data.
void SetComment(const TComment &value)
Assign a value to Comment data member.
Definition: Seq_feat_.hpp:1058
TId & SetId(void)
Select the variant.
Tdata & Set(void)
Assign a value to data member.
void SetPartial(TPartial value)
Assign a value to Partial data member.
Definition: Seq_feat_.hpp:971
void SetProduct(TProduct &value)
Assign a value to Product data member.
Definition: Seq_feat_.cpp:110
const TQual & GetQual(void) const
Get the Qual member data.
Definition: Seq_feat_.hpp:1147
bool IsSetPartial(void) const
incomplete in some way? Check if a value has been assigned to Partial data member.
Definition: Seq_feat_.hpp:943
bool IsSetKey(void) const
Check if a value has been assigned to Key data member.
Definition: Imp_feat_.hpp:247
void SetCode(TCode &value)
Assign a value to Code data member.
Definition: Cdregion_.cpp:68
void ResetExcept_text(void)
Reset Except_text data member.
Definition: Seq_feat_.cpp:194
const TLocation & GetLocation(void) const
Get the Location member data.
Definition: Seq_feat_.hpp:1117
void SetExcept(TExcept value)
Assign a value to Except data member.
Definition: Seq_feat_.hpp:1018
bool IsSetConflict(void) const
conflict Check if a value has been assigned to Conflict data member.
Definition: Cdregion_.hpp:559
TFrame GetFrame(void) const
Get the Frame member data.
Definition: Cdregion_.hpp:534
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_feat_.hpp:925
bool IsSetExcept(void) const
something funny about this? Check if a value has been assigned to Except data member.
Definition: Seq_feat_.hpp:990
void ResetData(void)
Reset Data data member.
Definition: Seq_feat_.cpp:85
bool IsSetExcept_text(void) const
explain if except=TRUE Check if a value has been assigned to Except_text data member.
Definition: Seq_feat_.hpp:1393
const TDbxref & GetDbxref(void) const
Get the Dbxref member data.
Definition: Seq_feat_.hpp:1333
void ResetConflict(void)
Reset Conflict data member.
Definition: Cdregion_.hpp:571
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_feat_.cpp:94
void ResetComment(void)
Reset Comment data member.
Definition: Seq_feat_.cpp:99
const TCdregion & GetCdregion(void) const
Get the variant data.
const TAa & GetAa(void) const
Get the Aa member data.
TPseudo GetPseudo(void) const
Get the Pseudo member data.
Definition: Seq_feat_.hpp:1365
void SetLoc(TLoc &value)
Assign a value to Loc data member.
TNcbieaa GetNcbieaa(void) const
Get the variant data.
bool IsSetPseudo(void) const
"annotated on pseudogene?" — Check if a value has been assigned to the Pseudo data member.
Definition: Seq_feat_.hpp:1346
const TComment & GetComment(void) const
Get the Comment member data.
Definition: Seq_feat_.hpp:1049
TPartial GetPartial(void) const
Get the Partial member data.
Definition: Seq_feat_.hpp:962
void SetExcept_text(const TExcept_text &value)
Assign a value to Except_text data member.
Definition: Seq_feat_.hpp:1414
TExcept GetExcept(void) const
Get the Except member data.
Definition: Seq_feat_.hpp:1009
void SetConflict(TConflict value)
Assign a value to Conflict data member.
Definition: Cdregion_.hpp:587
void ResetDbxref(void)
Reset Dbxref data member.
Definition: Seq_feat_.cpp:188
bool IsSetDbxref(void) const
"support for xref to other databases" — Check if a value has been assigned to the Dbxref data member.
Definition: Seq_feat_.hpp:1321
TQual & SetQual(void)
Assign a value to Qual data member.
Definition: Seq_feat_.hpp:1153
bool IsId(void) const
Check if variant Id is selected.
const TCode_break & GetCode_break(void) const
Get the Code_break member data.
Definition: Cdregion_.hpp:733
bool IsSetLoc(void) const
"location of exception" — Check if a value has been assigned to the Loc data member.
TConflict GetConflict(void) const
Get the Conflict member data.
Definition: Cdregion_.hpp:578
void ResetQual(void)
Reset Qual data member.
Definition: Seq_feat_.cpp:136
bool IsSetCode_break(void) const
"individual exceptions" — Check if a value has been assigned to the Code_break data member.
Definition: Cdregion_.hpp:721
const TImp & GetImp(void) const
Get the variant data.
bool IsSetFrame(void) const
Check if a value has been assigned to Frame data member.
Definition: Cdregion_.hpp:509
bool IsSetLocation(void) const
"feature made from" — Check if a value has been assigned to the Location data member.
Definition: Seq_feat_.hpp:1105
@ eFrame_not_set
not set, code uses one
Definition: Cdregion_.hpp:95
void SetTo(TTo value)
Assign a value to To data member.
TGeneral & SetGeneral(void)
Select the variant.
Definition: Seq_id_.cpp:375
TPatent & SetPatent(void)
Select the variant.
Definition: Seq_id_.cpp:331
bool IsEmpty(void) const
Check if variant Empty is selected.
Definition: Seq_loc_.hpp:516
const TId & GetId(void) const
Get the Id member data.
TFrom GetFrom(void) const
Get the From member data.
bool IsEquiv(void) const
Check if variant Equiv is selected.
Definition: Seq_loc_.hpp:558
virtual void Reset(void)
Reset the whole object.
Definition: Seq_id_.cpp:56
void SetFrom(TFrom value)
Assign a value to From data member.
bool IsBond(void) const
Check if variant Bond is selected.
Definition: Seq_loc_.hpp:564
E_Choice
Choice variants.
Definition: Seq_id_.hpp:93
void SetFuzz_to(TFuzz_to &value)
Assign a value to Fuzz_to data member.
void SetFuzz_from(TFuzz_from &value)
Assign a value to Fuzz_from data member.
TLocal & SetLocal(void)
Select the variant.
Definition: Seq_id_.cpp:199
bool IsSetStrand(void) const
Check if a value has been assigned to Strand data member.
bool IsSetFuzz_to(void) const
Check if a value has been assigned to Fuzz_to data member.
TStrand GetStrand(void) const
Get the Strand member data.
TTo GetTo(void) const
Get the To member data.
const TInt & GetInt(void) const
Get the variant data.
Definition: Seq_loc_.cpp:194
bool IsSetFuzz_from(void) const
Check if a value has been assigned to Fuzz_from data member.
const TAccession & GetAccession(void) const
Get the Accession member data.
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
@ e_Tpe
Third Party Annot/Seq EMBL.
Definition: Seq_id_.hpp:111
@ e_Tpd
Third Party Annot/Seq DDBJ.
Definition: Seq_id_.hpp:112
@ e_Ddbj
DDBJ.
Definition: Seq_id_.hpp:107
@ e_not_set
No variant selected.
Definition: Seq_id_.hpp:94
@ e_Tpg
Third Party Annot/Seq Genbank.
Definition: Seq_id_.hpp:110
const TDescr & GetDescr(void) const
Get the Descr member data.
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
bool IsSetSeq_set(void) const
Check if a value has been assigned to Seq_set data member.
bool IsSetDescr(void) const
Check if a value has been assigned to Descr data member.
void SetClass(TClass value)
Assign a value to Class data member.
void ResetDescr(void)
Reset Descr data member.
void SetDescr(TDescr &value)
Assign a value to Descr data member.
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
TSeq_set & SetSeq_set(void)
Assign a value to Seq_set data member.
@ eClass_parts
parts for 2 or 3
@ eClass_nuc_prot
nuc acid and coded proteins
Definition: Bioseq_set_.hpp:99
@ eClass_segset
segmented sequence + parts
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_annot_.cpp:244
TId & SetId(void)
Assign a value to Id data member.
Definition: Bioseq_.hpp:296
TTitle & SetTitle(void)
Select the variant.
Definition: Seqdesc_.hpp:1039
bool IsSetAnnot(void) const
Check if a value has been assigned to Annot data member.
Definition: Bioseq_.hpp:354
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
Definition: Bioseq_.hpp:372
const TAnnot & GetAnnot(void) const
Get the Annot member data.
Definition: Bioseq_.hpp:366
bool IsSetData(void) const
Check if a value has been assigned to Data data member.
Definition: Seq_annot_.hpp:861
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
list< CRef< CSeq_id > > TId
Definition: Bioseq_.hpp:94
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
EGIBB_mod
GenInfo Backbone modifiers.
Definition: GIBB_mod_.hpp:64
E_Choice
Choice variants.
Definition: Seqdesc_.hpp:109
void SetDescr(TDescr &value)
Assign a value to Descr data member.
Definition: Bioseq_.cpp:65
const TModif & GetModif(void) const
Get the variant data.
Definition: Seqdesc_.hpp:965
bool IsFtable(void) const
Check if variant Ftable is selected.
Definition: Seq_annot_.hpp:615
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_annot_.hpp:873
const TDescr & GetDescr(void) const
Get the Descr member data.
Definition: Bioseq_.hpp:315
TMolinfo & SetMolinfo(void)
Select the variant.
Definition: Seqdesc_.cpp:594
TUpdate_date & SetUpdate_date(void)
Select the variant.
Definition: Seqdesc_.cpp:500
@ eGIBB_method_concept_trans
conceptual translation
@ eGIBB_method_concept_trans_a
conceptual transl. supplied by author
@ eRepr_raw
continuous sequence
Definition: Seq_inst_.hpp:94
@ eCompleteness_no_left
missing 5' or NH3 end
Definition: MolInfo_.hpp:158
@ eCompleteness_partial
partial but no details given
Definition: MolInfo_.hpp:157
@ eCompleteness_no_right
missing 3' or COOH end
Definition: MolInfo_.hpp:159
@ eCompleteness_no_ends
missing both ends
Definition: MolInfo_.hpp:160
@ eTech_concept_trans
conceptual translation
Definition: MolInfo_.hpp:131
@ eTech_concept_trans_a
conceptual transl. supplied by author
Definition: MolInfo_.hpp:136
@ e_Ncbieaa
extended ASCII 1 letter aa codes
Definition: Seq_data_.hpp:111
@ eGIBB_mod_mitochondrial
Definition: GIBB_mod_.hpp:69
@ eGIBB_mod_kinetoplast
Definition: GIBB_mod_.hpp:71
@ eGIBB_mod_dna
Definition: GIBB_mod_.hpp:65
@ eGIBB_mod_rna
Definition: GIBB_mod_.hpp:66
@ eGIBB_mod_est
expressed sequence tag
Definition: GIBB_mod_.hpp:85
@ e_Org
if all from one organism
Definition: Seqdesc_.hpp:116
@ e_Update_date
date of last update
Definition: Seqdesc_.hpp:129
@ e_Pub
a reference to the publication
Definition: Seqdesc_.hpp:122
@ e_Comment
a more extensive comment
Definition: Seqdesc_.hpp:117
@ e_Modif
modifiers
Definition: Seqdesc_.hpp:112
@ e_Create_date
date entry first created/released
Definition: Seqdesc_.hpp:128
CSeq_id::E_Choice GetNucAccOwner(const CTempString &acc)
Definition: indx_blk.cpp:2271
CSeq_id::E_Choice GetProtAccOwner(const CTempString &acc)
Definition: indx_blk.cpp:2288
int i
int len
static void text(MDB_val *v)
Definition: mdb_dump.c:62
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
const CharType(& source)[N]
Definition: pointer.h:1149
const char * tag
T max(T x_, T y_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
static void FixUpdateDates(CBioseq_set &bio_set)
Definition: nucprot.cpp:2627
USING_SCOPE(objects)
static bool Translate(CSeq_feat &feat, string &prot)
Definition: nucprot.cpp:1403
static bool check_short_CDS(ParserPtr pp, const CSeq_feat &feat, bool err_msg)
Definition: nucprot.cpp:320
void CheckDupDates(TEntryList &seq_entries)
Definition: nucprot.cpp:2673
static Int4 IfOnlyStopCodon(const CBioseq &bioseq, const CSeq_feat &feat, bool transl)
Definition: nucprot.cpp:1900
static void GetGcode(const TEntryList &seq_entries, ParserPtr pp)
Definition: nucprot.cpp:247
void ProcNucProt(ParserPtr pp, TEntryList &seq_entries, GeneRefFeats &gene_refs)
Definition: nucprot.cpp:2519
static bool check_translation(string &prot, const char *qval)
Definition: nucprot.cpp:1367
static Int4 fta_get_genetic_code(ParserPtr pp)
Definition: nucprot.cpp:186
static bool check_GIBB(TSeqdescList &descrs)
Definition: nucprot.cpp:2413
static const CDate * GetDateFromDescrs(const TSeqdescList &descrs, CSeqdesc::E_Choice what)
Definition: nucprot.cpp:2558
static void GetBioseqSetDescr(ProtBlkPtr pbp, TSeqdescList &descrs)
Definition: nucprot.cpp:2465
static void QualsToSeqID(CSeq_feat &feat, Parser::ESource source, TSeqIdList &ids)
Definition: nucprot.cpp:825
static CRef< CBioseq > BldProtRefSeqEntry(ProtBlkPtr pbp, CSeq_feat &feat, const string &seq_data, Uint1 method, ParserPtr pp, const CBioseq &bioseq, CBioseq::TId &ids)
Definition: nucprot.cpp:1012
static Int2 EndAdded(CSeq_feat &feat, GeneRefFeats &gene_refs)
Definition: nucprot.cpp:1452
static bool FindTheQual(const CSeq_feat &feat, const Char *qual_to_find)
Definition: nucprot.cpp:144
void ExtractDescrs(TSeqdescList &descrs_from, TSeqdescList &descrs_to, CSeqdesc::E_Choice choice)
Definition: nucprot.cpp:2453
static bool CpGeneticCodePtr(CGenetic_code &code, const CGenetic_code::C_E &gcode)
Definition: nucprot.cpp:1887
static void fta_check_codon_quals(CSeq_feat &feat)
Definition: nucprot.cpp:1625
static void GuessGeneticCode(ParserPtr pp, const CSeq_descr &descrs)
Definition: nucprot.cpp:209
static void GetProtRefDescr(CSeq_feat &feat, Uint1 method, const CBioseq &bioseq, TSeqdescList &descrs)
Definition: nucprot.cpp:650
static void ProtBlkFree(ProtBlkPtr pbp)
Definition: nucprot.cpp:260
static void check_gen_code(const char *qval, ProtBlkPtr pbp, Uint1 taxserver)
Definition: nucprot.cpp:1853
static void FixCreateDates(CBioseq_set &bio_set)
Definition: nucprot.cpp:2621
static void AddProtRefSeqEntry(ProtBlkPtr pbp, CBioseq &bioseq)
Definition: nucprot.cpp:1041
static void GetProtRefAnnot(InfoBioseqPtr ibp, CSeq_feat &feat, CBioseq &bioseq)
Definition: nucprot.cpp:560
list< CRef< CCode_break > > TCodeBreakList
Definition: nucprot.cpp:108
static void GetProtRefSeqId(CBioseq::TId &ids, InfoBioseqPtr ibp, int *num, ParserPtr pp, CScope &scope, CSeq_feat &cds)
Definition: nucprot.cpp:334
static void ValidateQualSeqId(TSeqIdList &ids)
Definition: nucprot.cpp:883
static void StripCDSComment(CSeq_feat &feat)
Definition: nucprot.cpp:517
static void GetCdRegionCB(InfoBioseqPtr ibp, CSeq_feat &feat, TCodeBreakList &code_breaks, unsigned char *dif, bool accver)
Definition: nucprot.cpp:1081
static void DbxrefToSeqID(CSeq_feat &feat, Parser::ESource source, TSeqIdList &ids)
Definition: nucprot.cpp:922
const char * GBExceptionQualVals[]
Definition: nucprot.cpp:110
static void AssignBioseqSetLevel(TEntryList &seq_entries)
Definition: nucprot.cpp:289
static char * stripStr(char *base, const char *str)
Definition: nucprot.cpp:500
static void ProtBlkInit(ProtBlkPtr pbp)
Definition: nucprot.cpp:272
static char * CpTheQualValueNext(TQualVector::iterator &cur_qual, const TQualVector::iterator &end_qual, const char *qual)
Definition: nucprot.cpp:164
static void CkProteinTransl(ParserPtr pp, InfoBioseqPtr ibp, string &prot, CSeq_feat &feat, const char *qval, bool intercodon, const char *gcode, unsigned char *method)
Definition: nucprot.cpp:1242
static void ValNodeExtractUserObject(TSeqdescList &descrs_from, TSeqdescList &descrs_to, const Char *tag)
Definition: nucprot.cpp:2438
static void FixEmblUpdateDates(CBioseq_set &bio_set)
Definition: nucprot.cpp:2633
static void FindCd(TEntryList &seq_entries, CScope &scope, ParserPtr pp, GeneRefFeats &gene_refs)
Definition: nucprot.cpp:2361
static bool fta_check_exception(CSeq_feat &feat, Parser::ESource source)
Definition: nucprot.cpp:1950
static void fta_concat_except_text(CSeq_feat &feat, const Char *text)
Definition: nucprot.cpp:1937
const char * RSExceptionQualVals[]
Definition: nucprot.cpp:118
static void CkEndStop(const CSeq_feat &feat, Uint1 dif)
Definition: nucprot.cpp:1165
static void check_end_internal(size_t protlen, const CSeq_feat &feat, Uint1 dif)
Definition: nucprot.cpp:1187
static void SrchCdRegion(ParserPtr pp, CScope &scope, CBioseq &bioseq, CSeq_annot &annot, GeneRefFeats &gene_refs)
Definition: nucprot.cpp:2304
static void ProcessForDbxref(CBioseq &bioseq, CSeq_feat &feat, Parser::ESource source)
Definition: nucprot.cpp:991
static char * SimpleValuePos(const char *qval)
Definition: nucprot.cpp:1049
static void BuildProtBioseqSet(ProtBlkPtr pbp, TEntryList &entries)
Definition: nucprot.cpp:2495
static void FixDupDates(CBioseq_set &bio_set, CSeqdesc::E_Choice what)
Definition: nucprot.cpp:2577
static void ErrByteStorePtr(InfoBioseqPtr ibp, const CSeq_feat &feat, const string &prot)
Definition: nucprot.cpp:1214
static Int2 CkCdRegion(ParserPtr pp, CScope &scope, CSeq_feat &cds, CBioseq &bioseq, int *num, GeneRefFeats &gene_refs)
Definition: nucprot.cpp:2038
static void InternalStopCodon(ParserPtr pp, InfoBioseqPtr ibp, CSeq_feat &feat, unsigned char *method, Uint1 dif, GeneRefFeats &gene_refs, string &seq_data)
Definition: nucprot.cpp:1661
TSeqFeatList::iterator first
Definition: nucprot.h:65
TSeqFeatList::iterator last
Definition: nucprot.h:66
bool valid
Definition: nucprot.h:64
int gc_mito
Definition: ftablock.h:242
int gc_genomic
Definition: ftablock.h:241
string mAccNum
Definition: ftablock.h:84
string mLocus
Definition: ftablock.h:83
TSeqIdList ids
Definition: ftablock.h:82
vector< IndexblkPtr > entrylist
ProtBlkPtr pbp
TEntryList entries
Definition: ftablock.h:91
objects::CSeq_entry * biosep
Definition: ftablock.h:89
bool segset
Definition: ftablock.h:90
Int4 orig_gcode
Definition: ftablock.h:100
Uint1 genome
Definition: ftablock.h:99
InfoBioseq * ibp
Definition: ftablock.h:98
objects::CGenetic_code::C_E gcode
Definition: ftablock.h:96
TSeqFeatList feats
Definition: ftablock.h:93
Definition: inftrees.h:24
CScope & GetScope()
bool DeleteQual(TQualVector &qlist, const Char *qual)
Definition: utilfeat.cpp:180
Uint1 GetQualValueAa(const char *qval, bool checkseq)
Definition: utilfeat.cpp:204
string location_to_string(const CSeq_loc &loc)
Definition: utilfeat.cpp:471
string CpTheQualValue(const TQualVector &qlist, const Char *qual)
Definition: utilfeat.cpp:120
bool SeqLocHaveFuzz(const CSeq_loc &loc)
Definition: utilfeat.cpp:97
optional< string > GetTheQualValue(TQualVector &qlist, const Char *qual)
Definition: utilfeat.cpp:147
bool SetTextId(Uint1 seqtype, CSeq_id &seqId, CTextseq_id &textId)
Definition: utilfun.cpp:1596
void fta_StringCpy(char *dst, const char *src)
Definition: utilfun.cpp:1585
Char * StringIStr(const Char *where, const Char *what)
Definition: utilfun.cpp:674
void CpSeqId(InfoBioseqPtr ibp, const CSeq_id &id)
Definition: utilfun.cpp:880
static wxAcceleratorEntry entries[3]
CRef< CSeq_loc > xgbparseint_ver(const char *raw_intervals, bool &keep_rawPt, int &numErrors, const TSeqIdList &seq_ids, bool accver)
Definition: xgbparint.cpp:1466
Modified on Sun Apr 14 05:27:59 2024 by modify_doxy.py rev. 669887