NCBI C++ ToolKit
cleanup.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: cleanup.cpp 102112 2024-04-02 18:07:29Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Robert Smith
27  *
28  * File Description:
29  * Basic Cleanup of CSeq_entries.
30  *
31  */
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbistd.hpp>
34 #include <serial/serialbase.hpp>
35 #include <objects/seq/Bioseq.hpp>
37 // included for GetPubdescLabels and GetCitationList
38 #include <objects/pub/Pub.hpp>
40 #include <objects/seq/Pubdesc.hpp>
46 
57 
59 #include <objmgr/util/sequence.hpp>
60 #include <objmgr/util/feature.hpp>
61 #include <objmgr/util/autodef.hpp>
62 #include <objmgr/seq_annot_ci.hpp>
63 #include <objmgr/seqdesc_ci.hpp>
64 #include <objmgr/seq_vector.hpp>
65 #include <objmgr/seq_vector_ci.hpp>
68 #include "cleanup_utils.hpp"
70 
71 #include <util/strsearch.hpp>
72 
73 #include "newcleanupp.hpp"
74 
76 
78 
81 
84 };
85 
86 // *********************** CCleanup implementation **********************
87 
88 
89 CCleanup::CCleanup(CScope* scope, EScopeOptions scope_handling)
90 {
91  if (scope && scope_handling == eScope_UseInPlace) {
92  m_Scope = scope;
93  }
94  else {
96  if (scope) {
97  m_Scope->AddScope(*scope);
98  }
99  }
100 }
101 
102 
104 {
105 }
106 
107 
109 {
111  if (scope) {
112  m_Scope->AddScope(*scope);
113  }
114 }
115 
116 
117 static
119 {
120  CRef<CCleanupChange> changes;
121  if (! (options & CCleanup::eClean_NoReporting)) {
122  changes.Reset(new CCleanupChange);
123  }
124  return changes;
125 }
126 
127 #define CLEANUP_SETUP \
128  auto changes = makeCleanupChange(options); \
129  CNewCleanup_imp clean_i(changes, options); \
130  clean_i.SetScope(*m_Scope);
131 
133 {
135  clean_i.BasicCleanupSeqEntry(se);
136  return changes;
137 }
138 
139 
141 {
143  clean_i.BasicCleanupSeqSubmit(ss);
144  return changes;
145 }
146 
147 
149 {
151  clean_i.BasicCleanupSubmitblock(block);
152  return changes;
153 }
154 
155 
157 {
159  clean_i.BasicCleanupBioseqSet(bss);
160  return changes;
161 }
162 
163 
165 {
167  clean_i.BasicCleanupSeqAnnot(sa);
168  return changes;
169 }
170 
171 
173 {
175  clean_i.BasicCleanupSeqFeat(sf);
176  return changes;
177 }
178 
179 
181 {
183  clean_i.BasicCleanupBioSource(src);
184  return changes;
185 }
186 
187 
189 {
190  auto changes = makeCleanupChange(options);
191  CNewCleanup_imp clean_i(changes, options);
192  clean_i.SetScope(seh.GetScope());
193  clean_i.BasicCleanupSeqEntryHandle(seh);
194  return changes;
195 }
196 
197 
199 {
200  auto changes = makeCleanupChange(options);
201  CNewCleanup_imp clean_i(changes, options);
202  clean_i.SetScope(bsh.GetScope());
203  clean_i.BasicCleanupBioseqHandle(bsh);
204  return changes;
205 }
206 
207 
209 {
210  auto changes = makeCleanupChange(options);
211  CNewCleanup_imp clean_i(changes, options);
212  clean_i.SetScope(bssh.GetScope());
213  clean_i.BasicCleanupBioseqSetHandle(bssh);
214  return changes;
215 }
216 
217 
219 {
220  auto changes = makeCleanupChange(options);
221  CNewCleanup_imp clean_i(changes, options);
222  clean_i.SetScope(sah.GetScope());
223  clean_i.BasicCleanupSeqAnnotHandle(sah);
224  return changes;
225 }
226 
227 
229 {
230  auto changes = makeCleanupChange(options);
231  CNewCleanup_imp clean_i(changes, options);
232  clean_i.SetScope(sfh.GetScope());
233  clean_i.BasicCleanupSeqFeatHandle(sfh);
234  return changes;
235 }
236 
237 
239 {
241  clean_i.BasicCleanup(desc);
242  return changes;
243 
244 }
245 
246 
248 {
250 
251  for (auto& it : desc.Set()) {
252  clean_i.BasicCleanup(*it);
253  }
254  return changes;
255 }
256 
257 
258 // *********************** Extended Cleanup implementation ********************
260 {
262  clean_i.ExtendedCleanupSeqEntry(se);
263 
264  return changes;
265 }
266 
267 
269 {
271  clean_i.ExtendedCleanupSeqSubmit(ss);
272  return changes;
273 }
274 
275 
277 {
279  clean_i.ExtendedCleanupSeqAnnot(sa); // (m_Scope->GetSeq_annotHandle(sa));
280  return changes;
281 }
282 
284 {
285  auto changes = makeCleanupChange(options);
286  CNewCleanup_imp clean_i(changes, options);
287  clean_i.ExtendedCleanupSeqEntryHandle(seh); // (m_Scope->GetSeq_annotHandle(sa));
288  return changes;
289 }
290 
291 
292 // *********************** CCleanupChange implementation **********************
293 
294 
295 vector<CCleanupChangeCore::EChanges> CCleanupChangeCore::GetAllChanges() const
296 {
297  return m_Changes;
298 }
299 
300 vector<string_view> CCleanupChangeCore::GetDescriptions() const
301 {
302  vector<string_view> result;
303  result.reserve(m_Changes.size());
304  for (auto it : m_Changes) {
305  result.push_back(GetDescription(it));
306  }
307  return result;
308 }
309 
310 // corresponds to the values in CCleanupChange::EChanges.
311 // They must be edited together.
312 static constexpr std::array<string_view, CCleanupChangeCore::eNumberofChangeTypes> sm_ChangeDesc = {
313  "Invalid Change Code",
314  // set when strings are changed.
315  "Trim Spaces",
316  "Clean Double Quotes",
317  "Append To String",
318  // set when lists are sorted or uniqued.
319  "Clean Qualifiers List",
320  "Clean Dbxrefs List",
321  "Clean CitonFeat List",
322  "Clean Keywords List",
323  "Clean Subsource List",
324  "Clean Orgmod List",
325  // Set when fields are moved or have content changes
326  "Repair BioseqMol", //10
327  "Change Feature Key",
328  "Normalize Authors",
329  "Change Publication",
330  "Change Qualifiers",
331  "Change Dbxrefs",
332  "Change Keywords",
333  "Change Subsource",
334  "Change Orgmod",
335  "Change Exception",
336  "Change Comment", //20
337  // Set when fields are rescued
338  "Change tRna",
339  "Change rRna",
340  "Change ITS",
341  "Change Anticodon",
342  "Change Code Break",
343  "Change Genetic Code",
344  "Copy GeneXref",
345  "Copy ProtXref",
346  // set when locations are repaired
347  "Change Seqloc",
348  "Change Strand", //30
349  "Change WholeLocation",
350  // set when MolInfo descriptors are affected
351  "Change MolInfo Descriptor",
352  // set when prot-xref is removed
353  "Remove ProtXref",
354  // set when gene-xref is removed
355  "Remove GeneXref",
356  // set when protein feature is added
357  "Add Protein Feature",
358  // set when feature is removed
359  "Remove Feature",
360  // set when feature is moved
361  "Move Feature",
362  // set when qualifier is removed
363  "Remove Qualifier",
364  // set when Gene Xref is created
365  "Add GeneXref",
366  // set when descriptor is removed
367  "Remove Descriptor", //40
368  "Remove Keyword",
369  "Add Descriptor",
370  "Move Descriptor",
371  "Convert Feature to Descriptor",
372  "Collapse Set",
373  "Change Feature Location",
374  "Remove Annotation",
375  "Convert Feature",
376  "Remove Comment",
377  "Add BioSource OrgMod", //50
378  "Add BioSource SubSource",
379  "Change BioSource Genome",
380  "Change BioSource Origin",
381  "Change BioSource Other",
382  "Change SeqId",
383  "Remove Empty Publication",
384  "Add Qualifier",
385  "Cleanup Date",
386  "Change BioseqInst",
387  "Remove SeqID", // 60
388  "Add ProtXref",
389  "Change Partial",
390  "Change Prot Names",
391  "Change Prot Activities",
392  "Change Site",
393  "Change PCR Primers",
394  "Change RNA-ref",
395  "Move To Prot Xref",
396  "Compress Spaces",
397  "Strip serial", // 70
398  "Remove Orgmod",
399  "Remove SubSource",
400  "Create Gene Nomenclature",
401  "Clean Seq-feat xref",
402  "Clean User-Object Or -Field",
403  "Letter Case Change",
404  "Change Bioseq-set Class",
405  "Unique Without Sort",
406  "Add RNA-ref",
407  "Change Gene-ref", // 80
408  "Clean Dbtag",
409  "Change Biomol",
410  "Change Cdregion",
411  "Clean EC Number",
412  "Remove Exception",
413  "Add NcbiCleanupObject",
414  "Clean Delta-ext",
415  "Trim Flanking Quotes",
416  "Clean Bioseq Title",
417  "Decode XML", // 90
418  "Remove Dup BioSource",
419  "Clean Org-ref",
420  "Trim Internal Semicolons",
421  "Add SeqFeatXref",
422  "Convert Unstructured Org-ref Modifier",
423  "Change taxname",
424  "Move GO term to GeneOntology object",
425 
426  // set when any other change is made.
427  "Change Other",
428 };
429 
431 {
432  if (e <= eNoChange || e >= eNumberofChangeTypes) {
433  return sm_ChangeDesc[eNoChange]; // this is "Invalid Change Code"
434  }
435  return sm_ChangeDesc[e];
436 }
437 
439 {
440  if (NStr::Equal(key, "sig_peptide")) {
442  } else if (NStr::Equal(key, "mat_peptide")) {
444  } else if (NStr::Equal(key, "transit_peptide")) {
446  } else if (NStr::Equal(key, "preprotein") || NStr::Equal(key, "proprotein")) {
448  } else if (NStr::Equal(key, "propeptide")) {
450  } else {
452  }
453 }
454 
456 {
457  switch (processed) {
459  return "mat_peptide";
460  break;
462  return "preprotein";
463  break;
465  return "sig_peptide";
466  break;
468  return "transit_peptide";
469  break;
471  return "propeptide";
472  break;
474  return kEmptyStr;
475  break;
476  }
477  return kEmptyStr;
478 }
479 
480 
482 {
483  if (fh.GetData().IsProt() && fh.GetData().GetProt().IsSetProcessed()) {
485  if (!NStr::IsBlank(key)) {
486  CRef<CSeq_feat> new_feat(new CSeq_feat());
487  new_feat->Assign(*(fh.GetSeq_feat()));
488  if (fh.GetData().GetProt().IsSetName() && !fh.GetData().GetProt().GetName().empty()) {
489  CRef<CGb_qual> q(new CGb_qual());
490  q->SetQual("product");
491  q->SetVal(fh.GetData().GetProt().GetName().front());
492  new_feat->SetQual().push_back(q);
493  }
494  new_feat->SetData().SetImp().SetKey(key);
495  CSeq_feat_EditHandle efh(fh);
496  efh.Replace(*new_feat);
497  return true;
498  }
499  }
500  return false;
501 }
502 
503 
505 {
506  if (!fh.IsSetData()) {
507  return false;
508  } else if (fh.GetData().IsProt() &&
509  fh.GetData().GetProt().IsSetProcessed() &&
511  return true;
512  } else if (fh.GetData().IsImp() &&
513  fh.GetData().GetImp().IsSetKey() &&
515  return true;
516  } else {
517  return false;
518  }
519 }
520 
521 
523 {
524  if (!feat.IsSetQual() ||
525  !feat.IsSetData() ||
526  !feat.GetData().IsProt() ||
527  feat.GetData().GetProt().IsSetName()) {
528  return;
529  }
530  CSeq_feat::TQual::iterator it = feat.SetQual().begin();
531  while (it != feat.SetQual().end()) {
532  if ((*it)->IsSetQual() &&
533  NStr::Equal((*it)->GetQual(), "product")) {
534  if ((*it)->IsSetVal() && !NStr::IsBlank((*it)->GetVal())) {
535  feat.SetData().SetProt().SetName().push_back((*it)->GetVal());
536  }
537  it = feat.SetQual().erase(it);
538  } else {
539  ++it;
540  }
541  }
542 
543  if (feat.SetQual().empty()) {
544  feat.ResetQual();
545  }
546 }
547 
548 
550 {
551  const bool feat_by_product = true;
552  SAnnotSelector sel(CSeqFeatData::e_Cdregion, feat_by_product);
553  CFeat_CI fi(scope, product, sel);
554  if (fi) {
555  return ConstRef(&(fi->GetOriginalFeature()));
556  }
557  return CConstRef<CSeq_feat>();
558 };
559 
561 {
562  sequence::TFeatScores cdsScores;
564  feat_loc,
568  cdsScores,
569  scope);
570 
571  if (cdsScores.empty()) {
572  return CConstRef<CSeq_feat>();
573  }
574 
575  if (!feat_loc.IsPartialStart(eExtreme_Biological)) {
576  for (auto cdsScore : cdsScores) {
577  if (feature::IsLocationInFrame(scope.GetSeq_featHandle(*cdsScore.second), feat_loc)
579  return cdsScore.second;
580  }
581  }
582  }
583 
584  return cdsScores.front().second;
585 }
586 
587 
588 
590 {
592  if (fh.GetData().IsImp()) {
593  if (!fh.GetData().GetImp().IsSetKey()) {
594  return false;
595  }
596  processed = s_ProcessedFromKey(fh.GetData().GetImp().GetKey());
597  if (processed == CProt_ref::eProcessed_not_set || processed == CProt_ref::eProcessed_preprotein) {
598  return false;
599  }
600  } else if (s_IsPreprotein(fh)) {
601  return ConvertProteinToImp(fh);
602  }
603 
604  CBioseq_Handle parent_bsh = fh.GetScope().GetBioseqHandle(fh.GetLocation());
605 
606  if (!parent_bsh) {
607  // feature is mispackaged
608  return false;
609  }
610  if (parent_bsh.IsAa()) {
611  // feature is already on protein sequence
612  return false;
613  }
614 
616  bool matched_by_product = false;
617 
618  if (fh.IsSetProduct() &&
619  fh.GetData().IsProt() &&
620  fh.GetData().GetProt().IsSetProcessed() &&
622  cds = s_GetCdsByProduct(fh.GetScope(), fh.GetProduct());
623  if (cds) {
624  matched_by_product = true;
625  }
626  }
627  if (!matched_by_product) {
628  cds = s_GetCdsByLocation(fh.GetScope(), fh.GetLocation());
629  }
630  if (!cds || !cds->IsSetProduct()) {
631  // there is no overlapping coding region feature, so there is no appropriate
632  // protein sequence to move to
633  return ConvertProteinToImp(fh);
634  }
635 
636  bool require_frame = false;
637  if (!require_frame) {
638  ITERATE(CBioseq::TId, id_it, parent_bsh.GetBioseqCore()->GetId()) {
639  if ((*id_it)->IsEmbl() || (*id_it)->IsDdbj()) {
640  require_frame = true;
641  break;
642  }
643  }
644  }
645 
646  CRef<CSeq_loc> prot_loc = GetProteinLocationFromNucleotideLocation(fh.GetLocation(), *cds, fh.GetScope(), require_frame);
647 
648  if (!prot_loc) {
649  return false;
650  }
651 
652  CConstRef<CSeq_feat> orig_feat = fh.GetSeq_feat();
653  CRef<CSeq_feat> new_feat(new CSeq_feat());
654  new_feat->Assign(*orig_feat);
655  if (new_feat->GetData().Which() == CSeqFeatData::e_Imp) {
656  new_feat->SetData().SetProt().SetProcessed(processed);
657  // if possible, rescue product qual
658  RescueProtProductQual(*new_feat);
659  if (processed == CProt_ref::eProcessed_mature &&
660  !new_feat->GetData().GetProt().IsSetName()) {
661  if (orig_feat->IsSetComment() && !NStr::IsBlank(orig_feat->GetComment())) {
662  new_feat->SetData().SetProt().SetName().push_back(orig_feat->GetComment());
663  new_feat->ResetComment();
664  } else {
665  new_feat->SetData().SetProt().SetName().push_back("unnamed");
666  }
667  }
668  }
669 
670  // change location to protein
671  new_feat->ResetLocation();
672  new_feat->SetLocation(*prot_loc);
673  SetFeaturePartial(*new_feat);
674  if (matched_by_product) {
675  new_feat->ResetProduct();
676  }
677 
678  CSeq_feat_EditHandle edh(fh);
679  edh.Replace(*new_feat);
680  auto changes= makeCleanupChange(0);
681  CNewCleanup_imp clean_i(changes, 0);
682  clean_i.SetScope(fh.GetScope());
683  clean_i.BasicCleanupSeqFeat(*new_feat);
684 
685  CSeq_annot_Handle ah = fh.GetAnnot();
686 
687  CBioseq_Handle target_bsh = fh.GetScope().GetBioseqHandle(new_feat->GetLocation());
688  if (!target_bsh) {
689  return false;
690  }
691 
692  CBioseq_EditHandle eh = target_bsh.GetEditHandle();
693 
694  // Find a feature table on the protein sequence to add the feature to.
696  if (target_bsh.GetCompleteBioseq()->IsSetAnnot()) {
697  ITERATE(CBioseq::TAnnot, annot_it, target_bsh.GetCompleteBioseq()->GetAnnot()) {
698  if ((*annot_it)->IsFtable()) {
699  ftable = fh.GetScope().GetSeq_annotHandle(**annot_it);
700  }
701  }
702  }
703 
704  // If there is no feature table present, make one
705  if (!ftable) {
706  CRef<CSeq_annot> new_annot(new CSeq_annot());
707  ftable = eh.AttachAnnot(*new_annot);
708  }
709 
710  // add feature to the protein bioseq
712  aeh.TakeFeat(edh);
713 
714  // remove old annot if now empty
717  orig.Remove();
718  }
719 
720  return true;
721 }
722 
723 
725 {
726  bool any_change = false;
728  while (bi) {
732  for (CFeat_CI prot_it(*bi, sel); prot_it; ++prot_it) {
733  any_change |= MoveFeatToProtein(*prot_it);
734  }
735  for (CFeat_CI imp_it(*bi, CSeqFeatData::e_Imp); imp_it; ++imp_it) {
736  any_change |= MoveFeatToProtein(*imp_it);
737  }
738  ++bi;
739  }
740  return any_change;
741 }
742 
743 
744 bool CCleanup::IsGeneXrefUnnecessary(const CSeq_feat& sf, CScope& scope, const CGene_ref& gene_xref)
745 {
746  if (gene_xref.IsSuppressed()) {
747  return false;
748  }
749 
751  if (!gene || !gene->IsSetData() || !gene->GetData().IsGene()) {
752  return false;
753  }
754 
755  if (!gene->GetData().GetGene().RefersToSameGene(gene_xref)) {
756  return false;
757  }
758 
759  // see if other gene might also match
760  sequence::TFeatScores scores;
762  sequence::eOverlap_Contained, scores, scope);
763  if (scores.size() == 1) {
764  return true;
765  } else if (scores.size() == 0) {
766  return false;
767  }
768 
769  ITERATE(sequence::TFeatScores, g, scores) {
770  if (g->second.GetPointer() != gene.GetPointer() &&
771  sequence::Compare(g->second->GetLocation(), gene->GetLocation(), &scope, sequence::fCompareOverlapping) == sequence::eSame) {
772  return false;
773  }
774  }
775  return true;
776 }
777 
778 
780 {
781  if (!f.IsSetXref()) {
782  return false;
783  }
784  bool any_removed = false;
785  CSeq_feat::TXref::iterator xit = f.SetXref().begin();
786  while (xit != f.SetXref().end()) {
787  if ((*xit)->IsSetData() && (*xit)->GetData().IsGene() &&
788  IsGeneXrefUnnecessary(f, scope, (*xit)->GetData().GetGene())) {
789  xit = f.SetXref().erase(xit);
790  any_removed = true;
791  } else {
792  ++xit;
793  }
794  }
795  if (any_removed) {
796  if (f.IsSetXref() && f.GetXref().empty()) {
797  f.ResetXref();
798  }
799  }
800  return any_removed;
801 }
802 
803 
805 {
806  bool any_change = false;
807  CScope& scope = seh.GetScope();
808 
809  for (CFeat_CI fi(seh); fi; ++fi) {
810  if (fi->IsSetXref()) {
811  CRef<CSeq_feat> new_feat(new CSeq_feat());
812  new_feat->Assign(*(fi->GetOriginalSeq_feat()));
813  bool any_removed = RemoveUnnecessaryGeneXrefs(*new_feat, scope);
814  if (any_removed) {
815  CSeq_feat_EditHandle edh(*fi);
816  edh.Replace(*new_feat);
817  any_change = true;
818  }
819  }
820  }
821 
822  return any_change;
823 }
824 
825 
826 //LCOV_EXCL_START
827 //not used by asn_cleanup but used by other applications
829 {
830  if (!f.IsSetXref()) {
831  return false;
832  }
833  bool any_removed = false;
834  CSeq_feat::TXref::iterator xit = f.SetXref().begin();
835  while (xit != f.SetXref().end()) {
836  if ((*xit)->IsSetData() && (*xit)->GetData().IsGene() &&
837  !(*xit)->GetData().GetGene().IsSuppressed()) {
838  xit = f.SetXref().erase(xit);
839  any_removed = true;
840  } else {
841  ++xit;
842  }
843  }
844  if (any_removed) {
845  if (f.IsSetXref() && f.GetXref().empty()) {
846  f.ResetXref();
847  }
848  }
849  return any_removed;
850 }
851 //LCOV_EXCL_STOP
852 
853 
855 {
856  if (!src.IsSetId() || !src.GetId().IsLocal()) {
857  // can't create xref if no ID
858  return false;
859  }
861  // only create reciprocal xrefs if permitted
862  return false;
863  }
864  // don't create xref if already have xref or if dst not gene and already has
865  // xref to feature of same type as src
866  bool has_xref = false;
867  if (dst.IsSetXref()) {
868  ITERATE(CSeq_feat::TXref, xit, dst.GetXref()) {
869  if ((*xit)->IsSetId() && (*xit)->GetId().IsLocal()) {
870  if ((*xit)->GetId().Equals(src.GetId())) {
871  // already have xref
872  has_xref = true;
873  break;
874  } else if (!dst.GetData().IsGene()) {
875  const CTSE_Handle::TFeatureId& feat_id = (*xit)->GetId().GetLocal();
877  ITERATE(CTSE_Handle::TSeq_feat_Handles, fit, far_feats) {
878  if (fit->GetData().GetSubtype() == src.GetData().GetSubtype()) {
879  has_xref = true;
880  break;
881  }
882  }
883  if (has_xref) {
884  break;
885  }
886  }
887  }
888  }
889  }
890  bool rval = false;
891  if (!has_xref) {
892  // to put into "editing mode"
893  dst.GetAnnot().GetEditHandle();
894  CSeq_feat_EditHandle eh(dst);
895  CRef<CSeq_feat> cpy(new CSeq_feat());
896  cpy->Assign(*(dst.GetSeq_feat()));
897  cpy->AddSeqFeatXref(src.GetId());
898  eh.Replace(*cpy);
899  rval = true;
900  }
901  return rval;
902 }
903 
904 
906 {
907  bool rval = false;
908 
909  if (!f.IsSetId() || !f.IsSetXref()) {
910  return rval;
911  }
912 
913  ITERATE(CSeq_feat::TXref, xit, f.GetXref()) {
914  if ((*xit)->IsSetId() && (*xit)->GetId().IsLocal()) {
915  const CTSE_Handle::TFeatureId& x_id = (*xit)->GetId().GetLocal();
917  if (far_feats.size() == 1) {
918  rval |= RepairXrefs(f, far_feats[0], tse);
919  }
920  }
921  }
922  return rval;
923 }
924 
925 
927 {
928  bool rval = false;
929  const CTSE_Handle& tse = seh.GetTSE_Handle();
930 
931  CFeat_CI fi(seh);
932  while (fi) {
933  rval |= RepairXrefs(*(fi->GetSeq_feat()), tse);
934  ++fi;
935  }
936  return rval;
937 }
938 
939 
940 //LCOV_EXCL_START
941 //not used by asn_cleanup but used by other applications
943 {
944  bool match = false;
945  string locus1;
946  if (gene_xref.IsSetLocus())
947  locus1 = gene_xref.GetLocus();
948  for (CFeat_CI feat_ci(bsh, SAnnotSelector(CSeqFeatData::eSubtype_gene)); feat_ci; ++feat_ci)
949  {
950  string locus2;
951  if ( !f.Equals(*feat_ci->GetSeq_feat()) && feat_ci->GetSeq_feat()->IsSetData() && feat_ci->GetSeq_feat()->GetData().IsGene()
952  && feat_ci->GetSeq_feat()->GetData().GetGene().IsSetLocus())
953  {
954  locus2 = feat_ci->GetSeq_feat()->GetData().GetGene().GetLocus();
955  }
956  if (!locus1.empty() && !locus2.empty() && locus1 == locus2)
957  {
958  match = true;
959  break;
960  }
961  }
962  return match;
963 }
964 
966 {
967  if (!f.IsSetXref()) {
968  return false;
969  }
970  bool any_removed = false;
971  CSeq_feat::TXref::iterator xit = f.SetXref().begin();
972  while (xit != f.SetXref().end()) {
973  if ((*xit)->IsSetData() && (*xit)->GetData().IsGene() &&
974  !(*xit)->GetData().GetGene().IsSuppressed() && !FindMatchingLocusGene(f, (*xit)->GetData().GetGene(), bsh)) {
975  xit = f.SetXref().erase(xit);
976  any_removed = true;
977  } else {
978  ++xit;
979  }
980  }
981  if (any_removed) {
982  if (f.IsSetXref() && f.GetXref().empty()) {
983  f.ResetXref();
984  }
985  }
986  return any_removed;
987 }
988 
989 
991 {
992  bool match = false;
993  string locus_tag1;
994  if (gene_xref.IsSetLocus_tag())
995  locus_tag1 = gene_xref.GetLocus_tag();
996  for (CFeat_CI feat_ci(bsh, SAnnotSelector(CSeqFeatData::eSubtype_gene)); feat_ci; ++feat_ci)
997  {
998  string locus_tag2;
999  if ( !f.Equals(*feat_ci->GetSeq_feat()) && feat_ci->GetSeq_feat()->IsSetData() && feat_ci->GetSeq_feat()->GetData().IsGene()
1000  && feat_ci->GetSeq_feat()->GetData().GetGene().IsSetLocus_tag())
1001  {
1002  locus_tag2 = feat_ci->GetSeq_feat()->GetData().GetGene().GetLocus_tag();
1003  }
1004  if (!locus_tag1.empty() && !locus_tag2.empty() && locus_tag1 == locus_tag2)
1005  {
1006  match = true;
1007  break;
1008  }
1009  }
1010  return match;
1011 }
1012 
1014 {
1015  if (!f.IsSetXref()) {
1016  return false;
1017  }
1018  bool any_removed = false;
1019  CSeq_feat::TXref::iterator xit = f.SetXref().begin();
1020  while (xit != f.SetXref().end()) {
1021  if ((*xit)->IsSetData() && (*xit)->GetData().IsGene() &&
1022  !(*xit)->GetData().GetGene().IsSuppressed() && !FindMatchingLocus_tagGene(f, (*xit)->GetData().GetGene(), bsh)) {
1023  xit = f.SetXref().erase(xit);
1024  any_removed = true;
1025  } else {
1026  ++xit;
1027  }
1028  }
1029  if (any_removed) {
1030  if (f.IsSetXref() && f.GetXref().empty()) {
1031  f.ResetXref();
1032  }
1033  }
1034  return any_removed;
1035 }
1036 
1037 
1038 bool CCleanup::SeqLocExtend(CSeq_loc& loc, size_t pos_, CScope& scope)
1039 {
1040  TSeqPos pos = static_cast<TSeqPos>(pos_);
1041  TSeqPos loc_start = loc.GetStart(eExtreme_Positional);
1042  TSeqPos loc_stop = loc.GetStop(eExtreme_Positional);
1043  bool partial_start = loc.IsPartialStart(eExtreme_Positional);
1044  bool partial_stop = loc.IsPartialStop(eExtreme_Positional);
1045  ENa_strand strand = loc.GetStrand();
1046  CRef<CSeq_loc> new_loc;
1047  bool changed = false;
1048 
1049  if (pos < loc_start) {
1050  CRef<CSeq_id> id(new CSeq_id());
1051  id->Assign(*(loc.GetId()));
1052  CRef<CSeq_loc> add(new CSeq_loc(*id, pos, loc_start - 1, strand));
1053  add->SetPartialStart(partial_start, eExtreme_Positional);
1055  changed = true;
1056  } else if (pos > loc_stop) {
1057  CRef<CSeq_id> id(new CSeq_id());
1058  id->Assign(*(loc.GetId()));
1059  CRef<CSeq_loc> add(new CSeq_loc(*id, loc_stop + 1, pos, strand));
1060  add->SetPartialStop(partial_stop, eExtreme_Positional);
1062  changed = true;
1063  }
1064  if (changed) {
1065  loc.Assign(*new_loc);
1066  }
1067  return changed;
1068 }
1069 //LCOV_EXCL_STOP
1070 
1071 
1072 bool CCleanup::ExtendStopPosition(CSeq_feat& f, const CSeq_feat* cdregion, size_t extension_)
1073 {
1074  TSeqPos extension = static_cast<TSeqPos>(extension_);
1075  CRef<CSeq_loc> new_loc(&f.SetLocation());
1076 
1077  CRef<CSeq_loc> last_interval;
1078  if (new_loc->IsMix()) {
1079  last_interval = new_loc->SetMix().SetLastLoc();
1080  }
1081  else
1082  {
1083  last_interval = new_loc;
1084  }
1085 
1086  CConstRef<CSeq_id> id(last_interval->GetId());
1087 
1088  TSeqPos new_start;
1089  TSeqPos new_stop;
1090 
1091  // the last element of the mix or the single location MUST be converted into interval
1092  // whethe it's whole or point, etc
1093  if (last_interval->IsSetStrand() && last_interval->GetStrand() == eNa_strand_minus) {
1094  new_start = (cdregion ? cdregion->GetLocation().GetStart(eExtreme_Positional) :
1095  last_interval->GetStart(eExtreme_Positional)) - extension;
1096 
1097  new_stop = last_interval->GetStop(eExtreme_Positional);
1098  }
1099  else {
1100  new_start = last_interval->GetStart(eExtreme_Positional);
1101  new_stop = (cdregion ? cdregion->GetLocation().GetStop(eExtreme_Positional) :
1102  last_interval->GetStop(eExtreme_Positional)) + extension;
1103  }
1104  last_interval->SetInt().SetFrom(new_start);
1105  last_interval->SetInt().SetTo(new_stop);
1106  last_interval->SetInt().SetId().Assign(*id);
1107 
1108  new_loc->SetPartialStop(false, eExtreme_Biological);
1109 
1110  return true;
1111 }
1112 
1114 {
1115  const CSeq_loc& loc = f.GetLocation();
1116 
1118  const CGenetic_code* code = nullptr;
1119  // we need to extract frame and cd_region from linked cd_region
1120  if (f.IsSetData() && f.GetData().IsCdregion())
1121  {
1122  if (f.GetData().GetCdregion().IsSetCode())
1123  code = &(f.GetData().GetCdregion().GetCode());
1124  if (f.GetData().GetCdregion().IsSetFrame())
1125  frame = f.GetData().GetCdregion().GetFrame();
1126  }
1127 
1128  TSeqPos stop = loc.GetStop(eExtreme_Biological);
1129  if (stop < 1 || stop > bsh.GetBioseqLength() - 1) {
1130  // no room to extend
1131  return false;
1132  }
1133  // figure out if we have a partial codon at the end
1134  size_t orig_len = sequence::GetLength(loc, &(bsh.GetScope()));
1135  size_t len = orig_len;
1136 
1137  if (frame == CCdregion::eFrame_two) {
1138  len -= 1;
1139  } else if (frame == CCdregion::eFrame_three) {
1140  len -= 2;
1141  }
1142 
1143  TSeqPos mod = len % 3;
1144  CRef<CSeq_loc> vector_loc(new CSeq_loc());
1145  vector_loc->SetInt().SetId().Assign(*(bsh.GetId().front().GetSeqId()));
1146 
1147  if (loc.IsSetStrand() && loc.GetStrand() == eNa_strand_minus) {
1148  vector_loc->SetInt().SetFrom(0);
1149  vector_loc->SetInt().SetTo(stop + mod - 1);
1150  vector_loc->SetStrand(eNa_strand_minus);
1151  } else {
1152  vector_loc->SetInt().SetFrom(stop - mod + 1);
1153  vector_loc->SetInt().SetTo(bsh.GetInst_Length() - 1);
1154  }
1155 
1156  CSeqVector seq(*vector_loc, bsh.GetScope(), CBioseq_Handle::eCoding_Iupac);
1157  // reserve our space
1158  size_t usable_size = seq.size();
1159 
1160  if (limit > 0 && usable_size > limit) {
1161  usable_size = limit;
1162  }
1163 
1164  // get appropriate translation table
1165  const CTrans_table & tbl =
1168 
1169  // main loop through bases
1170  CSeqVector::const_iterator start = seq.begin();
1171 
1172  size_t i;
1173  size_t k;
1174  int state = 0;
1175  size_t length = usable_size / 3;
1176 
1177  for (i = 0; i < length; ++i) {
1178  // loop through one codon at a time
1179  for (k = 0; k < 3; ++k, ++start) {
1180  state = tbl.NextCodonState(state, *start);
1181  }
1182 
1183  if (tbl.GetCodonResidue(state) == '*') {
1184  TSeqPos extension = static_cast<TSeqPos>(((i + 1) * 3) - mod);
1185  ExtendStopPosition(f, 0, extension);
1186  return true;
1187  }
1188  }
1189 
1190  return false;
1191 }
1192 
1193 
1195 {
1196  bool changed = false;
1198  if (cds.GetData().GetCdregion().IsSetFrame()) {
1199  frame = cds.GetData().GetCdregion().GetFrame();
1200  }
1201 
1202  CCdregion::TFrame new_frame = CSeqTranslator::FindBestFrame(cds, scope);
1203  if (frame != new_frame) {
1204  cds.SetData().SetCdregion().SetFrame(new_frame);
1205  changed = true;
1206  }
1207  return changed;
1208 }
1209 
1210 // like C's function GetFrameFromLoc, but better
1212 {
1213  if (!loc.IsPartialStart(eExtreme_Biological)) {
1214  if (frame != CCdregion::eFrame_one) {
1215  frame = CCdregion::eFrame_one;
1216  return true;
1217  }
1218  return false;
1219  }
1221  // cannot make a determination if both ends are partial
1222  return false;
1223  }
1224 
1225  const TSeqPos seq_len = sequence::GetLength(loc, &scope);
1226 
1228 
1229  // have complete last codon, get frame from length
1230  switch( (seq_len % 3) + 1 ) {
1231  case 1:
1232  desired_frame = CCdregion::eFrame_one;
1233  break;
1234  case 2:
1235  desired_frame = CCdregion::eFrame_two;
1236  break;
1237  case 3:
1238  desired_frame = CCdregion::eFrame_three;
1239  break;
1240  default:
1241  // mathematically impossible
1242  _ASSERT(false);
1243  return false;
1244  }
1245  if (frame != desired_frame) {
1246  frame = desired_frame;
1247  return true;
1248  }
1249  return false;
1250 }
1251 
1252 
1253 bool CCleanup::SetFrameFromLoc(CCdregion &cdregion, const CSeq_loc& loc, CScope& scope)
1254 {
1256  if (cdregion.IsSetFrame()) {
1257  frame = cdregion.GetFrame();
1258  }
1259  if (SetFrameFromLoc(frame, loc, scope)) {
1260  cdregion.SetFrame(frame);
1261  return true;
1262  } else {
1263  return false;
1264  }
1265 }
1266 
1267 
1269 {
1270  size_t loc_end = loc.GetStop(eExtreme_Biological);
1271  CSeq_loc_CI other_int(other_loc);
1272  while (other_int) {
1273  if (other_int.IsSetStrand() &&
1274  other_int.GetStrand() == eNa_strand_minus) {
1275  if (loc.IsSetStrand() && loc.GetStrand() == eNa_strand_minus &&
1276  loc_end == other_int.GetRange().GetFrom()) {
1277  return true;
1278  }
1279  } else {
1280  if ((!loc.IsSetStrand() || loc.GetStrand() != eNa_strand_minus) &&
1281  loc_end == other_int.GetRange().GetTo()) {
1282  return true;
1283  }
1284  }
1285  ++other_int;
1286  }
1287  return false;
1288 }
1289 
1290 
// Decides whether feature `f` is a candidate for extension and, if so, returns the result of
// ExtendToStopCodon(f, bsh, 3). Returns false without extending when `f` is not a coding region,
// is pseudo, is partial at its biological stop, ends on an endpoint of an mRNA interval other than
// the mRNA's own stop, or (when check_for_stop is set) cannot be translated or already ends in "*".
// NOTE(review): the signature (listing line 1291) and the declaration of `mrna` (line 1303) are not
// visible in this view.
1292 {
1293  if (!f.GetData().IsCdregion()) {
1294  // not coding region
1295  return false;
1296  }
1297  if (sequence::IsPseudo(f, bsh.GetScope())) {
1298  return false;
1299  }
1300  if (f.GetLocation().IsPartialStop(eExtreme_Biological)) {
1301  return false;
1302  }
1304  if (mrna) {
1305  if (mrna->GetLocation().GetStop(eExtreme_Biological) == f.GetLocation().GetStop(eExtreme_Biological)) {
1306  //ok
1307  } else if (s_IsLocationEndAtOtherLocationInternalEndpoint(f.GetLocation(), mrna->GetLocation())) {
1308  return false;
1309  }
1310  }
1311 
1312  if (check_for_stop) {
1313  string translation;
1314  try {
1315  CSeqTranslator::Translate(f, bsh.GetScope(), translation, true);
1316  } catch (CSeqMapException&) {
1317  //unable to translate
1318  return false;
1319  } catch (CSeqVectorException&) {
1320  //unable to translate
1321  return false;
1322  }
1323  if (NStr::EndsWith(translation, "*")) {
1324  //already has stop codon
1325  return false;
1326  }
1327  }
1328 
1329  return ExtendToStopCodon(f, bsh, 3);
1330 }
1331 
1332 
// Returns true when the biological stop of `improved` lies beyond that of `orig` in the direction of
// `orig`'s strand: a smaller coordinate on the minus strand, a larger coordinate otherwise.
// NOTE(review): the signature line (listing line 1333) is not visible in this view.
1334 {
1335  if ((orig.GetStrand() == eNa_strand_minus &&
1336  orig.GetStop(eExtreme_Biological) > improved.GetStop(eExtreme_Biological)) ||
1337  (orig.GetStrand() != eNa_strand_minus &&
1338  orig.GetStop(eExtreme_Biological) < improved.GetStop(eExtreme_Biological))) {
1339  return true;
1340  }
1341 
1342  return false;
1343 }
1344 
1345 void CCleanup::SetProteinName(CProt_ref& prot_ref, const string& protein_name, bool append)
1346 {
1347  if (append && prot_ref.IsSetName() && prot_ref.GetName().size() > 0) {
1348  if (!NStr::IsBlank(prot_ref.GetName().front())) {
1349  prot_ref.SetName().front() += "; ";
1350  }
1351  prot_ref.SetName().front() += protein_name;
1352  } else {
1353  prot_ref.SetName().push_back(protein_name);
1354  }
1355 }
1356 
1357 
1358 void CCleanup::SetMrnaName(CSeq_feat& mrna, const string& protein_name)
1359 {
1360  bool used_qual = false;
1361  if (mrna.IsSetQual()) {
1362  for (auto it = mrna.SetQual().begin(); it != mrna.SetQual().end(); it++) {
1363  if ((*it)->IsSetQual() && NStr::EqualNocase((*it)->GetQual(), "product")) {
1364  (*it)->SetVal(protein_name);
1365  used_qual = true;
1366  break;
1367  }
1368  }
1369  }
1370  if (!used_qual || (mrna.IsSetData() && mrna.GetData().IsRna() && mrna.GetData().GetRna().IsSetExt())) {
1371  string remainder;
1372  mrna.SetData().SetRna().SetRnaProductName(protein_name, remainder);
1373  }
1374 }
1375 
1376 
1377 //LCOV_EXCL_START
1378 //seems to be unused
// Returns true when `cds` carries a product name directly on the feature: either an xref whose
// data is a Prot-ref, or a qualifier named "product" (case-insensitive). Returns false otherwise.
// NOTE(review): the signature line (listing line 1379) is not visible in this view.
1380 {
1381  if (cds.IsSetXref()) {
1382  for (auto it = cds.GetXref().begin(); it != cds.GetXref().end(); it++) {
1383  if ((*it)->IsSetData() && (*it)->GetData().IsProt()) {
1384  return true;
1385  }
1386  }
1387  }
1388  if (cds.IsSetQual()) {
1389  for (auto it = cds.GetQual().begin(); it != cds.GetQual().end(); it++) {
1390  if ((*it)->IsSetQual() && NStr::EqualNocase((*it)->GetQual(), "product")) {
1391  return true;
1392  }
1393  }
1394  }
1395  return false;
1396 }
1397 //LCOV_EXCL_STOP
1398 
1399 
1400 void CCleanup::s_SetProductOnFeat(CSeq_feat& feat, const string& protein_name, bool append)
1401 {
1402  if (feat.IsSetXref()) {
1403  // see if this seq-feat already has a prot xref
1404  for (auto it = feat.SetXref().begin(); it != feat.SetXref().end(); it++) {
1405  if ((*it)->IsSetData() && (*it)->GetData().IsProt()) {
1406  SetProteinName((*it)->SetData().SetProt(), protein_name, append);
1407  break;
1408  }
1409  }
1410  }
1411  if (feat.IsSetQual()) {
1412  for (auto it = feat.SetQual().begin(); it != feat.SetQual().end(); it++) {
1413  if ((*it)->IsSetQual() && NStr::EqualNocase((*it)->GetQual(), "product")) {
1414  if ((*it)->IsSetVal() && !NStr::IsBlank((*it)->GetVal()) && append) {
1415  (*it)->SetVal((*it)->GetVal() + "; " + protein_name);
1416  } else {
1417  (*it)->SetVal(protein_name);
1418  }
1419  }
1420  }
1421  }
1422 }
1423 
1424 
// Sets the protein name for a coding region.
// Step 1: existing prot xrefs / "product" qualifiers on the feature are updated via s_SetProductOnFeat.
// Step 2: if the CDS has a product and `prot` is valid, the protein feature found by `feat_ci` is
//         replaced with a renamed copy, or a new protein feature is created when none is found.
// Step 3: if nothing was done in step 2, the name is stored in a prot xref on the CDS (an existing
//         one if present, otherwise a newly created xref).
// NOTE(review): listing lines 1430 (`prot`), 1433 (`feat_ci`) and 1450 (xref loop header) are not
// visible in this view.
// NOTE(review): s_SetProductOnFeat has already applied the name to the first prot xref, and the
// fallback below appears to apply it to a prot xref again -- confirm whether the double update is intended.
1425 void CCleanup::SetProteinName(CSeq_feat& cds, const string& protein_name, bool append, CScope& scope)
1426 {
1427  s_SetProductOnFeat(cds, protein_name, append);
1428  bool added = false;
1429  if (cds.IsSetProduct()) {
1431  if (prot) {
1432  // find main protein feature
1434  if (feat_ci) {
1435  CRef<CSeq_feat> new_prot(new CSeq_feat());
1436  new_prot->Assign(feat_ci->GetOriginalFeature()); // edit a copy, then swap it in through the edit handle
1437  SetProteinName(new_prot->SetData().SetProt(), protein_name, append);
1438  CSeq_feat_EditHandle feh(feat_ci->GetSeq_feat_Handle());
1439  feh.Replace(*new_prot);
1440  } else {
1441  // make new protein feature
1442  feature::AddProteinFeature(*(prot.GetCompleteBioseq()), protein_name, cds, scope);
1443  }
1444  added = true;
1445  }
1446  }
1447  if (!added) {
1448  if (cds.IsSetXref()) {
1449  // see if this seq-feat already has a prot xref
1451  if ((*it)->IsSetData() && (*it)->GetData().IsProt()) {
1452  SetProteinName((*it)->SetData().SetProt(), protein_name, append);
1453  added = true;
1454  break;
1455  }
1456  }
1457  }
1458  if (!added) {
1459  CRef<CSeqFeatXref> xref(new CSeqFeatXref()); // no prot xref yet: create one holding just this name
1460  xref->SetData().SetProt().SetName().push_back(protein_name);
1461  cds.SetXref().push_back(xref);
1462  }
1463  }
1464 }
1465 
1466 
1467 const string& CCleanup::GetProteinName(const CProt_ref& prot)
1468 {
1469  if (prot.IsSetName() && !prot.GetName().empty()) {
1470  return prot.GetName().front();
1471  } else {
1472  return kEmptyStr;
1473  }
1474 }
1475 
1476 
1477 static const string& s_GetProteinNameFromXrefOrQual(const CSeq_feat& cds) {
1478  if (cds.IsSetXref()) {
1479  ITERATE(CSeq_feat::TXref, it, cds.GetXref()) {
1480  if ((*it)->IsSetData() && (*it)->GetData().IsProt()) {
1481  return CCleanup::GetProteinName((*it)->GetData().GetProt());
1482  }
1483  }
1484  }
1485  if (cds.IsSetQual()) {
1486  for (auto it = cds.GetQual().begin(); it != cds.GetQual().end(); it++) {
1487  if ((*it)->IsSetQual() && (*it)->IsSetVal() && NStr::EqualNocase((*it)->GetQual(), "product")) {
1488  return (*it)->GetVal();
1489  }
1490  }
1491  }
1492 
1493  return kEmptyStr;
1494 }
1495 
1496 
// Returns the protein name for a coding region: taken from the protein feature `f` when the CDS
// product resolves to `prot` and such a feature exists, otherwise from a prot xref or "product"
// qualifier on the CDS itself (s_GetProteinNameFromXrefOrQual).
// NOTE(review): the signature (listing line 1497) and the declarations of `prot` (line 1500) and
// `f` (line 1502) are not visible in this view.
1498 {
1499  if (cds.IsSetProduct() && cds.GetProduct().GetId()) {
1501  if (prot) {
1503  if (f) {
1504  return GetProteinName(f->GetData().GetProt());
1505  }
1506  }
1507  }
1508 
1509  return s_GetProteinNameFromXrefOrQual(cds); // fall back to information stored on the CDS
1510 }
1511 
1512 
// Marks the location of coding region `cds` as partial where the frame or the translation says so,
// and returns true if any partial flag was changed by the visible code.
// NOTE(review): several lines are not visible in this view -- the signature (listing line 1513),
// parts of the first condition (1517, 1519, 1520), the condition guarding the translation check
// (1525) and a statement just before the return (1547). Their effect is not documented here.
1514 {
1515  bool any_change = false;
1516 
1518  cds.GetData().GetCdregion().IsSetFrame() &&
1521  cds.SetLocation().SetPartialStart(true, eExtreme_Biological);
1522  any_change = true;
1523  }
1524 
1526  // look for start and stop codon
1527  string transl_prot;
1528  try {
1529  CSeqTranslator::Translate(cds, scope, transl_prot,
1530  true, // include stop codons
1531  false); // do not remove trailing X/B/Z
1532 
1533  } catch (const runtime_error&) {
        // translation failure is deliberately ignored; transl_prot stays blank and is skipped below
1534  }
1535  if (!NStr::IsBlank(transl_prot)) {
1536  if (!cds.GetLocation().IsPartialStart(eExtreme_Biological) && !NStr::StartsWith(transl_prot, "M")) { // no leading "M": start becomes partial
1537  cds.SetLocation().SetPartialStart(true, eExtreme_Biological);
1538  any_change = true;
1539  }
1540  if (!cds.GetLocation().IsPartialStop(eExtreme_Biological) && !NStr::EndsWith(transl_prot, "*")) { // no trailing "*": stop becomes partial
1541  cds.SetLocation().SetPartialStop(true, eExtreme_Biological);
1542  any_change = true;
1543  }
1544  }
1545  }
1546 
1548 
1549  return any_change;
1550 }
1551 
1552 
// Dispatches to the mix or packed-int overload of ClearInternalPartials depending on the kind of
// location; any other location type is left untouched. Returns true if the overload changed anything.
// is_first / is_last are forwarded unchanged to the overload.
// NOTE(review): the case label preceding the SetPacked_int() call (listing line 1560) is not
// visible in this view.
1553 bool CCleanup::ClearInternalPartials(CSeq_loc& loc, bool is_first, bool is_last)
1554 {
1555  bool rval = false;
1556  switch (loc.Which()) {
1557  case CSeq_loc::e_Mix:
1558  rval |= ClearInternalPartials(loc.SetMix(), is_first, is_last);
1559  break;
1561  rval |= ClearInternalPartials(loc.SetPacked_int(), is_first, is_last);
1562  break;
1563  default:
1564  break;
1565  }
1566  return rval;
1567 }
1568 
1569 
// Clears partial-start / partial-stop flags on the parts of a mix location that are not at the
// outer ends. Nested mix / packed-int parts are handled recursively. Returns true if any flag
// was cleared.
//   is_first - true while the current part may still be the first part of the overall location
//   is_last  - true when the end of this mix is also the end of the overall location
// NOTE(review): the loop header declaring `it` (listing line 1573) is not visible in this view.
1570 bool CCleanup::ClearInternalPartials(CSeq_loc_mix& mix, bool is_first, bool is_last)
1571 {
1572  bool rval = false;
1574  bool this_is_last = is_last && (*it == mix.Set().back()); // only the final element can carry the outer stop
1575  if ((*it)->IsMix() || (*it)->IsPacked_int()) {
1576  rval |= ClearInternalPartials(**it, is_first, this_is_last);
1577  } else {
1578  if (!is_first &&
1579  (*it)->IsPartialStart(eExtreme_Biological)) {
1580  (*it)->SetPartialStart(false, eExtreme_Biological);
1581  rval = true;
1582  }
1583  if (!this_is_last &&
1584  (*it)->IsPartialStop(eExtreme_Biological)) {
1585  (*it)->SetPartialStop(false, eExtreme_Biological);
1586  rval = true;
1587  }
1588  }
1589  is_first = false; // every element after the first one is internal at its start
1590  }
1591  return rval;
1592 }
1593 
1594 
// Clears partial-start / partial-stop flags on the intervals of a packed-int location that are not
// at the outer ends. Returns true if any flag was cleared.
//   is_first - true while the current interval may still be the first of the overall location
//   is_last  - true when the end of this packed-int is also the end of the overall location
// NOTE(review): the loop header declaring `it` (listing line 1599) is not visible in this view.
1595 bool CCleanup::ClearInternalPartials(CPacked_seqint& pint, bool is_first, bool is_last)
1596 {
1597  bool rval = false;
1598 
1600  bool this_is_last = is_last && (*it == pint.Set().back()); // only the final interval can carry the outer stop
1601  if (!is_first && (*it)->IsPartialStart(eExtreme_Biological)) {
1602  (*it)->SetPartialStart(false, eExtreme_Biological);
1603  rval = true;
1604  }
1605  if (!this_is_last && (*it)->IsPartialStop(eExtreme_Biological)) {
1606  (*it)->SetPartialStop(false, eExtreme_Biological);
1607  rval = true;
1608  }
1609  is_first = false; // every interval after the first one is internal at its start
1610  }
1611  return rval;
1612 }
1613 
1614 
1616 {
1617  bool rval = false;
1618  CFeat_CI f(seh);
1619  while (f) {
1620  CRef<CSeq_feat> new_feat(new CSeq_feat());
1621  new_feat->Assign(*(f->GetSeq_feat()));
1622  if (ClearInternalPartials(new_feat->SetLocation())) {
1623  CSeq_feat_EditHandle eh(f->GetSeq_feat_Handle());
1624  eh.Replace(*new_feat);
1625  }
1626  ++f;
1627  }
1628 
1629  return rval;
1630 }
1631 
1632 
// Synchronizes the feature-level partial flag of `f` with its location: the flag is set when any
// interval of the location has a from- or to-fuzz, and reset when none does.
// Returns true if the flag was changed; returns false when the location is not set.
// NOTE(review): the signature line (listing line 1633) is not visible in this view.
1634 {
1635  if (!f.IsSetLocation()) {
1636  return false;
1637  }
1638  bool partial = false;
1639  CSeq_loc_CI li(f.GetLocation());
1640  while (li && !partial) {
1641  if (li.GetFuzzFrom() || li.GetFuzzTo()) { // any fuzz on an interval counts as partial here
1642  partial = true;
1643  break;
1644  }
1645  ++li;
1646  }
1647  bool changed = false;
1648  if (f.IsSetPartial() && f.GetPartial()) {
1649  if (!partial) {
1650  f.ResetPartial();
1651  changed = true;
1652  }
1653  } else {
1654  if (partial) {
1655  f.SetPartial(true);
1656  changed = true;
1657  }
1658  }
1659  return changed;
1660 }
1661 
1662 
// Cleans every EC number string in `ec_num_list` with CleanVisStringJunk and, for entries that pass
// the (partly hidden) status condition and are not "split" EC numbers, substitutes the non-blank
// replacement returned by CProt_ref::GetECNumberReplacement. Returns true if any entry changed.
// NOTE(review): the signature (listing line 1663) and the first half of the replacement condition
// (line 1675) are not visible in this view.
1664 {
1665  bool changed = false;
1666  // CProt_ref::TEc is a list, so the iterator stays valid even if we
1667  // add new entries after the current one
1668  NON_CONST_ITERATE(CProt_ref::TEc, ec_num_iter, ec_num_list) {
1669  string & ec_num = *ec_num_iter;
1670  size_t tlen = ec_num.length();
1671  CleanVisStringJunk(ec_num);
1672  if (tlen != ec_num.length()) { // a length difference is how a cleanup change is detected
1673  changed = true;
1674  }
1676  !CProt_ref::IsECNumberSplit(ec_num)) {
1677  string new_val = CProt_ref::GetECNumberReplacement(ec_num);
1678  if (!NStr::IsBlank(new_val)) {
1679  ec_num = new_val;
1680  changed = true;
1681  }
1682  }
1683 
1684  }
1685  return changed;
1686 }
1687 
1688 
// Cleans every EC number string in `ec_num_list` with CleanVisStringJunk and erases entries whose
// status is deleted or unknown, or which are "split" EC numbers. Returns true if any entry was
// cleaned or removed.
// NOTE(review): the signature (listing line 1689) and the declaration of `ec_status` (line 1700)
// are not visible in this view.
1690 {
1691  bool changed = false;
1692  CProt_ref::TEc::iterator ec_num_iter = ec_num_list.begin();
1693  while (ec_num_iter != ec_num_list.end()) {
1694  string & ec_num = *ec_num_iter;
1695  size_t tlen = ec_num.length();
1696  CleanVisStringJunk(ec_num);
1697  if (tlen != ec_num.length()) { // a length difference is how a cleanup change is detected
1698  changed = true;
1699  }
1701  if (ec_status == CProt_ref::eEC_deleted || ec_status == CProt_ref::eEC_unknown || CProt_ref::IsECNumberSplit(ec_num)) {
1702  ec_num_iter = ec_num_list.erase(ec_num_iter); // erase() yields the next element, so no increment here
1703  changed = true;
1704  } else {
1705  ++ec_num_iter;
1706  }
1707 
1708  }
1709  return changed;
1710 }
1711 
1712 
// For every feature visited by `f` whose Prot-ref has EC numbers: works on a copy, runs
// UpdateECNumbers and RemoveBadECNumbers on the copy's EC list, resets the list if it ends up empty,
// and replaces the original feature when anything changed.
// NOTE(review): the signature (listing line 1713) and the declaration of the iterator `f`
// (line 1716) are not visible in this view.
// NOTE(review): `any_change` is never assigned after initialization in the visible code, so this
// returns false even when features were replaced -- looks like `any_change = true` is missing in
// the `this_change` branch; confirm.
1714 {
1715  bool any_change = false;
1717  while (f) {
1718  if (f->GetData().GetProt().IsSetEc()) {
1719  bool this_change = false;
1720  CRef<CSeq_feat> new_feat(new CSeq_feat());
1721  new_feat->Assign(*(f->GetSeq_feat()));
1722  this_change = UpdateECNumbers(new_feat->SetData().SetProt().SetEc());
1723  this_change |= RemoveBadECNumbers(new_feat->SetData().SetProt().SetEc());
1724  if (new_feat->GetData().GetProt().GetEc().empty()) {
1725  new_feat->SetData().SetProt().ResetEc(); // do not leave an empty EC list behind
1726  this_change = true;
1727  }
1728  if (this_change) {
1729  CSeq_feat_EditHandle efh(*f);
1730  efh.Replace(*new_feat);
1731  }
1732  }
1733  ++f;
1734  }
1735  return any_change;
1736 }
1738 
// Copies partialness onto `gene` from a non-gene feature that lies within the gene's location
// (comparison result eSame or eContains), using feature::CopyFeaturePartials. Returns that call's
// result, or false when the gene's bioseq cannot be resolved or no candidate feature is found.
// NOTE(review): the signature (listing line 1739) and the declaration of `loc_cmp` (line 1755) are
// not visible in this view.
// NOTE(review): `longest` is never updated inside the loop, so `len > longest` is effectively
// `len > 0` and the LAST qualifying feature wins rather than the longest -- looks like
// `longest = len;` is missing; confirm.
1740 {
1741  CBioseq_Handle bh = scope.GetBioseqHandle(gene.GetLocation());
1742  if (!bh) {
1743  return false;
1744  }
1745  CFeat_CI under(scope, gene.GetLocation());
1746  size_t longest = 0;
1747  CConstRef<CSeq_feat> longest_feat;
1748 
1749  while (under) {
1750  // ignore genes
1751  if (under->GetData().IsGene()) {
1752 
1753  } else {
1754  // must be contained in gene location
1756 
1757  if (loc_cmp == sequence::eSame || loc_cmp == sequence::eContains) {
1758  size_t len = sequence::GetLength(under->GetLocation(), &scope);
1759  // if longer than longest, record new length and feature
1760  if (len > longest) {
1761  longest_feat.Reset(under->GetSeq_feat());
1762  }
1763  }
1764  }
1765 
1766  ++under;
1767  }
1768  bool changed = false;
1769  if (longest_feat) {
1770  changed = feature::CopyFeaturePartials(gene, *longest_feat);
1771  }
1772  return changed;
1773 }
1774 
1775 
// Ensures the MolInfo descriptor reached through `di` has technique `tech`; when no such descriptor
// is found, a new MolInfo descriptor with that technique is added to the bioseq.
// Returns false only when the existing descriptor already has the requested technique.
// NOTE(review): the signature (listing line 1776), the declaration of `di` (line 1778) and the body
// of the protein-specific branch (line 1792) are not visible in this view.
1777 {
1779  if (di) {
1780  if (di->GetMolinfo().IsSetTech() && di->GetMolinfo().GetTech() == tech) {
1781  // no change necessary
1782  return false;
1783  } else {
1784  CSeqdesc* d = const_cast<CSeqdesc*>(&(*di)); // the descriptor iterator only exposes const access; edited in place
1785  d->SetMolinfo().SetTech(tech);
1786  return true;
1787  }
1788  }
1789  CRef<CSeqdesc> m(new CSeqdesc());
1790  m->SetMolinfo().SetTech(tech);
1791  if (bsh.IsSetInst() && bsh.GetInst().IsSetMol() && bsh.IsAa()) {
1793  }
1794  CBioseq_EditHandle eh = bsh.GetEditHandle();
1795  eh.AddSeqdesc(*m);
1796  return true;
1797 }
1798 
1799 
1800 //LCOV_EXCL_START
1801 //does not appear to be used
// Ensures the MolInfo descriptor reached through `di` has biomol `biomol`; when no such descriptor
// is found, a new MolInfo descriptor with that biomol is added to the bioseq.
// NOTE(review): the signature (listing line 1802) and the declaration of `di` (line 1804) are not
// visible in this view.
// NOTE(review): the "no change" test checks IsSetTech() before comparing GetBiomol(); this looks
// like it should be IsSetBiomol() (compare the tech variant of this function) -- confirm.
1803 {
1805  if (di) {
1806  if (di->GetMolinfo().IsSetTech() && di->GetMolinfo().GetBiomol() == biomol) {
1807  // no change necessary
1808  return false;
1809  } else {
1810  CSeqdesc* d = const_cast<CSeqdesc*>(&(*di)); // the descriptor iterator only exposes const access; edited in place
1811  d->SetMolinfo().SetBiomol(biomol);
1812  return true;
1813  }
1814  }
1815  CRef<CSeqdesc> m(new CSeqdesc());
1816  m->SetMolinfo().SetBiomol(biomol);
1817  CBioseq_EditHandle eh = bsh.GetEditHandle();
1818  eh.AddSeqdesc(*m);
1819  return true;
1820 }
1821 //LCOV_EXCL_STOP
1822 
1823 
// Adds a MolInfo descriptor to `seq` when it has none: always for proteins, and for RNA only when
// `is_product` is true. Returns true only when a new descriptor was added; returns false when the
// sequence has no Inst/Mol, already has a MolInfo, or is neither of those two cases.
// As a side effect, an existing MolInfo on a protein with unset/unknown biomol is set to peptide.
// NOTE(review): that biomol fix-up is not reflected in the return value (needs_molinfo is false on
// that path) -- confirm callers do not rely on the result to detect it.
// NOTE(review): listing lines 1832 (descriptor loop header), 1846, 1848, 1853 and 1854 (the fields
// set on the new descriptors) are not visible in this view.
1824 bool CCleanup::AddMissingMolInfo(CBioseq& seq, bool is_product)
1825 {
1826  if (!seq.IsSetInst() || !seq.GetInst().IsSetMol()) {
1827  return false;
1828  }
1829  bool needs_molinfo = true;
1830 
1831  if (seq.IsSetDescr()) {
1833  if ((*it)->IsMolinfo()) {
1834  needs_molinfo = false;
1835  if (seq.IsAa() &&
1836  (!(*it)->GetMolinfo().IsSetBiomol() ||
1837  (*it)->GetMolinfo().GetBiomol() == CMolInfo::eBiomol_unknown)) {
1838  (*it)->SetMolinfo().SetBiomol(CMolInfo::eBiomol_peptide);
1839  }
1840  }
1841  }
1842  }
1843  if (needs_molinfo) {
1844  if (seq.IsAa()) {
1845  CRef<CSeqdesc> m(new CSeqdesc());
1847  if (is_product) {
1849  }
1850  seq.SetDescr().Set().push_back(m);
1851  } else if (seq.GetInst().GetMol() == CSeq_inst::eMol_rna && is_product) {
1852  CRef<CSeqdesc> m(new CSeqdesc());
1855  seq.SetDescr().Set().push_back(m);
1856  } else {
1857  needs_molinfo = false; // nothing was added, so report "no change"
1858  }
1859  }
1860 
1861  return needs_molinfo;
1862 }
1863 
1864 
// Regenerates the title of a protein bioseq with CDeflineGenerator (ignoring any existing title)
// and stores it through `title_desc` if it differs from the current title.
// Returns true when the title was changed. Returns false for non-protein sequences and for
// sequences carrying a PIR, Swiss-Prot, Patent, PRF or PDB id.
// NOTE(review): the signature (listing line 1865) and the declaration of `title_desc` (line 1889)
// are not visible in this view.
1866 {
1867  if (!bsh.IsSetInst() || !bsh.GetInst().IsSetMol() || !bsh.IsAa()) {
1868  return false;
1869  }
1870  if (bsh.IsSetId()) {
1871  ITERATE(CBioseq_Handle::TId, it, bsh.GetId()) {
1872  // do not add titles for sequences with certain IDs
1873  switch (it->Which()) {
1874  case CSeq_id::e_Pir:
1875  case CSeq_id::e_Swissprot:
1876  case CSeq_id::e_Patent:
1877  case CSeq_id::e_Prf:
1878  case CSeq_id::e_Pdb:
1879  return false;
1880  break;
1881  default:
1882  break;
1883  }
1884  }
1885  }
1886 
1887  string new_defline = sequence::CDeflineGenerator().GenerateDefline(bsh, sequence::CDeflineGenerator::fIgnoreExisting);
1888 
1890 
1891  bool modified = title_desc.Set().SetTitle() != new_defline; // get or create a title
1892  if (modified)
1893  title_desc.Set().SetTitle().swap(new_defline);
1894  return modified;
1895 }
1896 
1897 
// Removes every user-object descriptor of type eObjectType_Cleanup from `seq_entry` and, for sets,
// recursively from the contained entries. An emptied descriptor list is reset on the Bioseq /
// Bioseq-set. Returns true if any descriptor was removed.
// NOTE(review): the signature (listing line 1898) and the loop header over the set's entries
// (line 1922) are not visible in this view.
1899 {
1900  bool rval = false;
1901  if (seq_entry.IsSetDescr()) {
1902  CBioseq::TDescr::Tdata::iterator it = seq_entry.SetDescr().Set().begin();
1903  while (it != seq_entry.SetDescr().Set().end()) {
1904  if ((*it)->IsUser() && (*it)->GetUser().GetObjectType() == CUser_object::eObjectType_Cleanup){
1905  it = seq_entry.SetDescr().Set().erase(it); // erase() yields the next element, so no increment here
1906  rval = true;
1907  }
1908  else {
1909  ++it;
1910  }
1911  }
1912  if (seq_entry.SetDescr().Set().empty()) {
1913  if (seq_entry.IsSeq()) {
1914  seq_entry.SetSeq().ResetDescr();
1915  }
1916  else if (seq_entry.IsSet()) {
1917  seq_entry.SetSet().ResetDescr();
1918  }
1919  }
1920  }
1921  if (seq_entry.IsSet() && seq_entry.GetSet().IsSetSeq_set()) {
1923  rval |= RemoveNcbiCleanupObject(**it);
1924  }
1925  }
1926  return rval;
1927 }
1928 
1929 void CCleanup::AddNcbiCleanupObject(int ncbi_cleanup_version, CSeq_descr& descr)
1930 {
1931  // update existing
1932  if (descr.IsSet()) {
1933  for (auto pDesc : descr.Set()) {
1934  if (pDesc->IsUser() && pDesc->GetUser().GetObjectType() == CUser_object::eObjectType_Cleanup) {
1935  pDesc->SetUser().UpdateNcbiCleanup(ncbi_cleanup_version);
1936  return;
1937  }
1938  }
1939  }
1940 
1941  // create new
1942  auto pCleanupObject = Ref(new CSeqdesc());
1943  auto& user = pCleanupObject->SetUser();
1944  user.UpdateNcbiCleanup(ncbi_cleanup_version);
1945  descr.Set().push_back(pCleanupObject);
1946 }
1947 
1948 
1949 //LCOV_EXCL_START
1950 //not used by asn_cleanup but used by functions used by other applications
// Collects pointers to all source descriptors that have an Org set, from `se` and, recursively,
// from every entry nested in it. Results are appended to `src_descs`; the pointers refer to
// descriptors owned by the entry tree, so they are only valid while that tree is alive.
// NOTE(review): the two loop headers (listing lines 1954 and 1962) are not visible in this view.
1951 void GetSourceDescriptors(const CSeq_entry& se, vector<const CSeqdesc* >& src_descs)
1952 {
1953  if (se.IsSetDescr()) {
1955  if ((*it)->IsSource() && (*it)->GetSource().IsSetOrg()) {
1956  src_descs.push_back(*it);
1957  }
1958  }
1959  }
1960 
1961  if (se.IsSet() && se.GetSet().IsSetSeq_set()) {
1963  GetSourceDescriptors(**it, src_descs);
1964  }
1965  }
1966 }
1967 //LCOV_EXCL_STOP
1968 
1969 
1970 //LCOV_EXCL_START
1971 //not used by asn_cleanup
// Sends the Org-refs of all source descriptors and source features under `seh` to the taxonomy
// service in one request and writes back every reply whose Org-ref differs from the original.
// Returns true if any descriptor or feature was updated.
// The request list is built descriptors-first, then features; replies are consumed in that same
// order, which is why `reply_it` is carried over from the descriptor loop into the feature loop.
// NOTE(review): the signature (listing line 1972), the declaration of the feature iterator `feat`
// (line 1991) and of `taxon3` (line 2005) are not visible in this view.
// NOTE(review): descriptors get CleanForGenBank() after assignment, features do not -- confirm
// whether the difference is intentional.
1973 {
1974  bool any_changes = false;
1975 
1976  vector<CRef<COrg_ref> > rq_list;
1977  vector<const CSeqdesc* > src_descs;
1978  vector<CConstRef<CSeq_feat> > src_feats;
1979 
1980  GetSourceDescriptors(*(seh.GetCompleteSeq_entry()), src_descs);
1981  vector<const CSeqdesc* >::iterator desc_it = src_descs.begin();
1982  while (desc_it != src_descs.end()) {
1983  // add org ref for descriptor to request list
1984  CRef<COrg_ref> org(new COrg_ref());
1985  org->Assign((*desc_it)->GetSource().GetOrg());
1986  rq_list.push_back(org);
1987 
1988  ++desc_it;
1989  }
1990 
1992  while (feat) {
1993  if (feat->GetData().GetBiosrc().IsSetOrg()) {
1994  // add org ref for feature to request list
1995  CRef<COrg_ref> org(new COrg_ref());
1996  org->Assign(feat->GetData().GetBiosrc().GetOrg());
1997  rq_list.push_back(org);
1998  // add feature to list
1999  src_feats.push_back(feat->GetOriginalSeq_feat());
2000  }
2001  ++feat;
2002  }
2003 
2004  if (rq_list.size() > 0) {
2006  CRef<CTaxon3_reply> reply = taxon3.SendOrgRefList(rq_list);
2007  if (reply) {
2008  CTaxon3_reply::TReply::const_iterator reply_it = reply->GetReply().begin();
2009 
2010  // process descriptor responses
2011  desc_it = src_descs.begin();
2012 
2013  while (reply_it != reply->GetReply().end()
2014  && desc_it != src_descs.end()) {
2015  if ((*reply_it)->IsData() &&
2016  !(*desc_it)->GetSource().GetOrg().Equals((*reply_it)->GetData().GetOrg())) {
2017  any_changes = true;
2018  CSeqdesc* desc = const_cast<CSeqdesc*>(*desc_it); // descriptors were collected as const pointers; edited in place
2019  desc->SetSource().SetOrg().Assign((*reply_it)->GetData().GetOrg());
2020  desc->SetSource().SetOrg().CleanForGenBank();
2021  }
2022  ++reply_it;
2023  ++desc_it;
2024  }
2025 
2026  // process feature responses
2027  vector<CConstRef<CSeq_feat> >::iterator feat_it = src_feats.begin();
2028  while (reply_it != reply->GetReply().end()
2029  && feat_it != src_feats.end()) {
2030  if ((*reply_it)->IsData() &&
2031  !(*feat_it)->GetData().GetBiosrc().GetOrg().Equals((*reply_it)->GetData().GetOrg())) {
2032  any_changes = true;
2033  CRef<CSeq_feat> new_feat(new CSeq_feat());
2034  new_feat->Assign(**feat_it); // features are replaced via a modified copy and an edit handle
2035  new_feat->SetData().SetBiosrc().SetOrg().Assign((*reply_it)->GetData().GetOrg());
2036  CSeq_feat_Handle fh = seh.GetScope().GetSeq_featHandle(**feat_it);
2037  CSeq_feat_EditHandle efh(fh);
2038  efh.Replace(*new_feat);
2039  }
2040  ++reply_it;
2041  ++feat_it;
2042  }
2043  }
2044  }
2045 
2046  return any_changes;
2047 }
2048 //LCOV_EXCL_STOP
2049 
2050 
// Translates coding region `cds` into a new protein Bioseq, wraps it in a Seq-entry and attaches it
// next to the nucleotide: to the enclosing nuc-prot set if there is one, otherwise the nucleotide
// entry is first converted into a nuc-prot set. Returns the new protein entry, or an empty CRef
// when the CDS location cannot be resolved or the translation yields nothing.
// NOTE(review): the signature (listing line 2051), the fields set on `molinfo` (lines 2068-2069),
// the declaration of `eh` (line 2080) and of `set` (line 2091) are not visible in this view.
// NOTE(review): cds.GetProduct().GetId() is dereferenced without a null check; elsewhere in this
// file GetId() on a product is tested before use -- confirm it cannot be null here.
2052 {
2053  CBioseq_Handle cds_bsh = scope.GetBioseqHandle(cds.GetLocation());
2054  if (!cds_bsh) {
2055  return CRef<CSeq_entry>();
2056  }
2057  CSeq_entry_Handle seh = cds_bsh.GetSeq_entry_Handle();
2058  if (!seh) {
2059  return CRef<CSeq_entry>();
2060  }
2061 
2062  CRef<CBioseq> new_product = CSeqTranslator::TranslateToProtein(cds, scope);
2063  if (new_product.Empty()) {
2064  return CRef<CSeq_entry>();
2065  }
2066 
2067  CRef<CSeqdesc> molinfo(new CSeqdesc());
2070  new_product->SetDescr().Set().push_back(molinfo);
2071 
2072  if (cds.IsSetProduct()) {
2073  CRef<CSeq_id> prot_id(new CSeq_id());
2074  prot_id->Assign(*(cds.GetProduct().GetId()));
2075  new_product->SetId().push_back(prot_id);
2076  }
2077  CRef<CSeq_entry> prot_entry(new CSeq_entry());
2078  prot_entry->SetSeq(*new_product);
2079 
2081  if (!eh.IsSet()) {
2082  CBioseq_set_Handle nuc_parent = eh.GetParentBioseq_set();
2083  if (nuc_parent && nuc_parent.IsSetClass() && nuc_parent.GetClass() == CBioseq_set::eClass_nuc_prot) {
2084  eh = nuc_parent.GetParentEntry().GetEditHandle(); // reuse the existing nuc-prot set
2085  }
2086  }
2087  if (!eh.IsSet()) {
2088  eh.ConvertSeqToSet();
2089  // move all descriptors on nucleotide sequence except molinfo, title, and create-date to set
2090  eh.SetSet().SetClass(CBioseq_set::eClass_nuc_prot);
2092  if (set && set->IsSetSeq_set()) {
2093  CConstRef<CSeq_entry> nuc = set->GetSeq_set().front();
2094  if (nuc->IsSetDescr()) {
2095  auto neh = eh.GetScope().GetSeq_entryEditHandle(*nuc);
2096  auto it = nuc->GetDescr().Get().begin();
2097  while (it != nuc->GetDescr().Get().end()) {
2098  if (!(*it)->IsMolinfo() && !(*it)->IsTitle() && !(*it)->IsCreate_date()) {
2099  CRef<CSeqdesc> copy(new CSeqdesc());
2100  copy->Assign(**it);
2101  eh.AddSeqdesc(*copy);
2102  neh.RemoveSeqdesc(**it);
2103  if (nuc->IsSetDescr()) {
2104  it = nuc->GetDescr().Get().begin(); // restart from the beginning after a removal
2105  }
2106  else {
2107  break;
2108  }
2109  }
2110  else {
2111  ++it;
2112  }
2113  }
2114  }
2115  }
2116  }
2117 
2118  CSeq_entry_EditHandle added = eh.AttachEntry(*prot_entry);
2119  return prot_entry;
2120 }
2121 
// Makes the genetic code of every feature visited by `feat_ci` on nucleotide bioseq `bsh` match the
// genetic code of the bioseq's source descriptor, skipping features whose exception text contains
// "genetic code exception". Returns true if any feature was replaced.
// Returns false without changes when the handle is invalid, the sequence is not nucleotide, there
// is no source descriptor, or the source has no org / orgname / any of gcode, mgcode, pgcode.
// NOTE(review): the signature (listing line 2122) and the declaration of the selector `sel`
// (line 2148) are not visible in this view; the loop body assumes each visited feature is a Cdregion.
2123 {
2124  if (!bsh) {
2125  return false;
2126  }
2127  if (!bsh.IsNa()) {
2128  return false;
2129  }
2130 
2131  CSeqdesc_CI src(bsh, CSeqdesc::e_Source);
2132  if (!src) {
2133  // no source, don't fix
2134  return false;
2135  }
2136  const auto& bsrc = src->GetSource();
2137  if (!bsrc.IsSetOrg() || !bsrc.IsSetOrgname()) {
2138  return false;
2139  }
2140  const auto& orgname = bsrc.GetOrg().GetOrgname();
2141  if (!orgname.IsSetGcode() && !orgname.IsSetMgcode() && !orgname.IsSetPgcode()) {
2142  return false;
2143  }
2144  int bioseqGenCode = src->GetSource().GetGenCode();
2145 
2146  bool any_changed = false;
2147  // set Cdregion's gcode from BioSource (unless except-text)
2149  CFeat_CI feat_ci(bsh, sel);
2150  for (; feat_ci; ++feat_ci) {
2151  const CSeq_feat& feat = feat_ci->GetOriginalFeature();
2152  const CCdregion& cds = feat.GetData().GetCdregion();
2153  int cdregionGenCode = (cds.IsSetCode() ?
2154  cds.GetCode().GetId() :
2155  0);
2156  if (cdregionGenCode != bioseqGenCode)
2157  {
2158  // make cdregion's gencode match bioseq's gencode,
2159  // if allowed
2160  if (!feat.HasExceptionText("genetic code exception"))
2161  {
2162  CRef<CSeq_feat> new_feat(new CSeq_feat);
2163  new_feat->Assign(feat);
2164  CCdregion& new_cds = new_feat->SetData().SetCdregion();
2165  new_cds.ResetCode();
2166  new_cds.SetCode().SetId(bioseqGenCode);
2167  CSeq_feat_EditHandle edit_handle(*feat_ci);
2168  edit_handle.Replace(*new_feat);
2169  any_changed = true;
2170  }
2171  }
2172  }
2173  return any_changed;
2174 }
2175 
2176 
2177 // return position of " [" + sOrganism + "]", but only if it's
2178 // at the end and there are characters before it.
2179 // Also, returns the position of the organelle prefix in the title.
// Details of the visible code:
//  - matching of the bracketed organism is case-insensitive;
//  - when the title does not END with the pattern, the last occurrence anywhere in the title is
//    still accepted as long as it is not at position 0 (NOTE(review): this is broader than the
//    "only if it's at the end" wording above -- confirm which is intended);
//  - OrganellePos is set to the last occurrence of " (<organelle>)" only when the text in front of
//    the organism pattern ends with that exact organelle string; otherwise it stays NPOS.
// NOTE(review): the signature (listing line 2180), the loop bound (line 2207) and two of the
// excluded genome values (lines 2211, 2214) are not visible in this view.
2181  const string & sTitle,
2182  const string & sOrganism,
2183  SIZE_TYPE& OrganellePos)
2184 {
2185  OrganellePos = NPOS;
2186 
2187  SIZE_TYPE answer = NPOS;
2188 
2189  const string sPattern = " [" + sOrganism + "]";
2190  if (NStr::EndsWith(sTitle, sPattern, NStr::eNocase)) {
2191  answer = sTitle.length() - sPattern.length();
2192  if (answer < 1) {
2193  // title must have something before the pattern
2194  answer = NPOS;
2195  }
2196  } else {
2197  answer = NStr::Find(sTitle, sPattern, NStr::eNocase, NStr::eReverseSearch);
2198  if (answer < 1 || answer == NPOS) {
2199  // pattern not found
2200  answer = NPOS;
2201  }
2202  }
2203 
2204  if (answer != NPOS) {
2205  // find organelle prefix
2206  for (unsigned int genome = CBioSource::eGenome_chloroplast;
2208  genome++) {
2209  if (genome != CBioSource::eGenome_extrachrom &&
2210  genome != CBioSource::eGenome_transposon &&
2212  genome != CBioSource::eGenome_proviral &&
2213  genome != CBioSource::eGenome_virion &&
2215  {
2216  string organelle = " (" + CBioSource::GetOrganelleByGenome(genome) + ")";
2217  SIZE_TYPE possible_organelle_start_pos = NStr::Find(sTitle, organelle, NStr::eNocase, NStr::eReverseSearch);
2218  if (possible_organelle_start_pos != NPOS &&
2219  NStr::EndsWith(CTempString(sTitle, 0, answer), organelle)) {
2220  OrganellePos = possible_organelle_start_pos;
2221  break;
2222  }
2223 
2224  }
2225  }
2226  }
2227  return answer;
2228 }
2229 
2230 
// Overload for an organism name given as an OrgName name choice: when it is a binomial with
// non-blank genus and species, looks for " [<genus> <species>]" in the title via the string
// overload. Returns that overload's result, or NPOS when the name is not a usable binomial.
// organelle_pos is reset to NPOS first and is only filled in by the string overload.
// NOTE(review): the signature line (listing line 2231) is not visible in this view.
2232  const string & sTitle,
2233  const COrgName::TName& orgname,
2234  SIZE_TYPE &organelle_pos)
2235 {
2236  SIZE_TYPE suffixPos = NPOS; // will point to " [${organism name}]" at end
2237  organelle_pos = NPOS;
2238 
2239  if (orgname.IsBinomial() &&
2240  orgname.GetBinomial().IsSetGenus() &&
2241  !NStr::IsBlank(orgname.GetBinomial().GetGenus()) &&
2242  orgname.GetBinomial().IsSetSpecies() &&
2243  !NStr::IsBlank(orgname.GetBinomial().GetSpecies())) {
2244  string binomial = orgname.GetBinomial().GetGenus() + " " + orgname.GetBinomial().GetSpecies();
2245  suffixPos = s_TitleEndsInOrganism(sTitle, binomial, organelle_pos);
2246  }
2247  return suffixPos;
2248 }
2249 
2250 
// Reports whether the partial org name of `org` lists two different superkingdoms.
// Only tax elements with fixed level 0, level "superkingdom" (case-insensitive) and a non-blank
// name are considered. On return first_kingdom holds the first such name (or is empty) and
// second_kingdom holds the first differing name (empty unless the result is true).
// NOTE(review): the loop header over the tax elements (listing line 2259) is not visible in this view.
2251 bool IsCrossKingdom(const COrg_ref& org, string& first_kingdom, string& second_kingdom)
2252 {
2253  bool is_cross_kingdom = false;
2254  first_kingdom = kEmptyStr;
2255  second_kingdom = kEmptyStr;
2256  if (org.IsSetOrgname() && org.GetOrgname().IsSetName() &&
2257  org.GetOrgname().GetName().IsPartial() &&
2258  org.GetOrgname().GetName().GetPartial().IsSet()) {
2260  const CTaxElement& te = **it;
2261  if (te.IsSetFixed_level() && te.GetFixed_level() == 0 &&
2262  te.IsSetLevel() &&
2263  NStr::EqualNocase(te.GetLevel(), "superkingdom") &&
2264  te.IsSetName() && !NStr::IsBlank(te.GetName())) {
2265  if (first_kingdom.empty()) {
2266  first_kingdom = te.GetName();
2267  } else if (!NStr::EqualNocase(first_kingdom, te.GetName())) {
2268  is_cross_kingdom = true;
2269  second_kingdom = te.GetName();
2270  break; // the first differing superkingdom is enough
2271  }
2272  }
2273  }
2274  }
2275  return is_cross_kingdom;
2276 }
2277 
2278 
2279 bool IsCrossKingdom(const COrg_ref& org)
2280 {
2281  string first_kingdom, second_kingdom;
2282  return IsCrossKingdom(org, first_kingdom, second_kingdom);
2283 }
2284 
2285 
// Overload for a full Org-ref: tries a series of organism names against the title and returns the
// position reported by the first one that matches (NPOS if none does). Order of attempts:
//   1. every non-blank "old-name" OrgMod,
//   2. the taxname,
//   3. the binomial genus + species,
//   4. for cross-kingdom organisms, the taxname against the part of the title up to and including
//      the "]" of the first "][" sequence.
// organelle_pos is reset to NPOS first and is filled in by the string overload.
// NOTE(review): the signature line (listing line 2286) is not visible in this view.
2287  const string & sTitle,
2288  const COrg_ref& org,
2289  SIZE_TYPE &organelle_pos)
2290 {
2291  SIZE_TYPE suffixPos = NPOS; // will point to " [${organism name}]" at end
2292  organelle_pos = NPOS;
2293 
2294  // first, check to see if protein title matches old-name
2295  if (org.IsSetOrgMod()) {
2296  ITERATE(COrgName::TMod, it, org.GetOrgname().GetMod()) {
2297  if ((*it)->IsSetSubtype() && (*it)->IsSetSubname() &&
2298  (*it)->GetSubtype() == COrgMod::eSubtype_old_name &&
2299  !NStr::IsBlank((*it)->GetSubname())) {
2300  suffixPos = s_TitleEndsInOrganism(sTitle, (*it)->GetSubname(), organelle_pos);
2301  if (suffixPos != NPOS) {
2302  return suffixPos;
2303  }
2304  }
2305  }
2306  }
2307 
2308  // next, check to see if protein title matches taxname
2309  if (org.IsSetTaxname() && !NStr::IsBlank(org.GetTaxname())) {
2310  suffixPos = s_TitleEndsInOrganism(sTitle, org.GetTaxname(), organelle_pos);
2311  if (suffixPos != NPOS) {
2312  return suffixPos;
2313  }
2314  }
2315 
2316  // try binomial if preset
2317  if (org.IsSetOrgname() && org.GetOrgname().IsSetName() &&
2318  org.GetOrgname().GetName().IsBinomial()) {
2319  suffixPos = s_TitleEndsInOrganism(sTitle, org.GetOrgname().GetName(), organelle_pos);
2320  if (suffixPos != NPOS) {
2321  return suffixPos;
2322  }
2323  }
2324 
2325  // cross-kingdom?
2326  if (IsCrossKingdom(org)) {
2327  SIZE_TYPE sep = NStr::Find(sTitle, "][");
2328  if (sep != string::npos) {
2329  suffixPos = s_TitleEndsInOrganism(sTitle.substr(0, sep + 1), org.GetTaxname(), organelle_pos); // sep + 1 keeps the closing "]"
2330  }
2331  }
2332  return suffixPos;
2333 }
2334 
2335 
// Strips a trailing bracketed organism name from the protein names of every protein feature in the
// feature tables annotated on `seq`. A name is edited only when it is at least 5 characters long,
// ends in ']', and the text after position `cp` is exactly `taxname` followed by one more character
// (the closing bracket); suffixes starting with "NAD" are left alone.
// NOTE(review): the computation of `cp` (listing line 2356) and one statement between the erase and
// the write-back (line 2363) are not visible in this view; `cp` is presumably the position of the
// opening '[' -- confirm.
2336 static void s_RemoveOrgFromEndOfProtein(CBioseq& seq, string taxname)
2337 
2338 {
2339  if (taxname.empty()) return;
2340  SIZE_TYPE taxlen = taxname.length();
2341 
2342  EDIT_EACH_SEQANNOT_ON_BIOSEQ(annot_it, seq) {
2343  CSeq_annot& annot = **annot_it;
2344  if (!annot.IsFtable()) continue;
2345  EDIT_EACH_FEATURE_ON_ANNOT(feat_it, annot) {
2346  CSeq_feat& feat = **feat_it;
2347  CSeqFeatData& data = feat.SetData();
2348  if (!data.IsProt()) continue;
2349  CProt_ref& prot_ref = data.SetProt();
2350  EDIT_EACH_NAME_ON_PROTREF(it, prot_ref) {
2351  string str = *it;
2352  if (str.empty()) continue;
2353  auto len = str.length();
2354  if (len < 5) continue;
2355  if (str[len - 1] != ']') continue;
2357  if (cp == NPOS) continue;
2358  string suffix = str.substr(cp + 1);
2359  if (NStr::StartsWith(suffix, "NAD")) continue;
2360  if (suffix.length() != taxlen + 1) continue; // suffix must be exactly taxname plus one trailing character
2361  if (NStr::StartsWith(suffix, taxname)) {
2362  str.erase(cp);
2364  *it = str; // write the edited copy back into the name list
2365  }
2366  }
2367  }
2368  }
2369 }
2370 
2372 {
2373  // Bail if not protein
2374  if (!bioseq.IsSetInst() || !bioseq.GetInst().IsSetMol() || !bioseq.GetInst().IsAa()) {
2375  return false;
2376  }
2377 
2378  // Bail if record is swissprot
2379  FOR_EACH_SEQID_ON_BIOSEQ(seqid_itr, bioseq) {
2380  if ((*seqid_itr)->IsSwissprot()) {
2381  return false;
2382  }
2383  }
2384 
2385  // gather some info from the Seqdesc's on the bioseq, into
2386  // the following variables
2387  bool bPartial = false;
2388  string organelle;
2389 
2390  CConstRef<CSeqdesc> molinfo_desc;
2391  CConstRef<CSeqdesc> src_desc;
2392  FOR_EACH_SEQDESC_ON_BIOSEQ(descr_iter, bioseq) {
2393  if (!molinfo_desc && (*descr_iter)->IsMolinfo()) {
2394  molinfo_desc = *descr_iter;
2395  }
2396  if (!src_desc && (*descr_iter)->IsSource()) {
2397  src_desc = *descr_iter;
2398  }
2399  if (molinfo_desc && src_desc) {
2400  break;
2401  }
2402  }
2403  if (!molinfo_desc || !src_desc) {
2404  // climb up to get parent Seqdescs
2405  CConstRef<CBioseq_set> bioseq_set(bioseq.GetParentSet());
2406  for (; bioseq_set; bioseq_set = bioseq_set->GetParentSet()) {
2407  FOR_EACH_SEQDESC_ON_SEQSET(descr_iter, *bioseq_set) {
2408  if (!molinfo_desc && (*descr_iter)->IsMolinfo()) {
2409  molinfo_desc = *descr_iter;
2410  }
2411  if (!src_desc && (*descr_iter)->IsSource()) {
2412  src_desc = *descr_iter;
2413  }
2414  if (molinfo_desc && src_desc) {
2415  break;
2416  }
2417  }
2418  if (molinfo_desc && src_desc) {
2419  break;
2420  }
2421  }
2422  }
2423 
2424  if (molinfo_desc && molinfo_desc->GetMolinfo().IsSetCompleteness()) {
2425  switch (molinfo_desc->GetMolinfo().GetCompleteness()) {
2426  case NCBI_COMPLETENESS(partial):
2427  case NCBI_COMPLETENESS(no_left):
2428  case NCBI_COMPLETENESS(no_right):
2429  case NCBI_COMPLETENESS(no_ends):
2430  bPartial = true;
2431  break;
2432  default:
2433  break;
2434  }
2435  }
2436 
2437  CConstRef<COrg_ref> org;
2438  if (src_desc) {
2439  const TBIOSOURCE_GENOME genome = (src_desc->GetSource().IsSetGenome() ?
2441  if (genome >= CBioSource::eGenome_chloroplast &&
2443  genome != CBioSource::eGenome_extrachrom &&
2444  genome != CBioSource::eGenome_transposon &&
2446  genome != CBioSource::eGenome_proviral &&
2447  genome != CBioSource::eGenome_virion &&
2449  {
2450  organelle = CBioSource::GetOrganelleByGenome(genome);
2451  }
2452 
2453  if (src_desc->GetSource().IsSetOrg()) {
2454  org.Reset(&(src_desc->GetSource().GetOrg()));
2455  }
2456  }
2457 
2458  if (!org) {
2459  return false;
2460  }
2461  if (org->IsSetTaxname() && !NStr::IsBlank(org->GetTaxname())) {
2462  s_RemoveOrgFromEndOfProtein(bioseq, org->GetTaxname());
2463  }
2464 
2465  // find the title to edit
2466  if (!bioseq.IsSetDescr()) {
2467  return false;
2468  }
2469  CRef<CSeqdesc> title_desc;
2470  NON_CONST_ITERATE(CBioseq::TDescr::Tdata, d, bioseq.SetDescr().Set()) {
2471  if ((*d)->IsTitle()) {
2472  title_desc = *d;
2473  }
2474  }
2475  if (!title_desc) {
2476  return false;
2477  }
2478  string & sTitle = title_desc->SetTitle();
2479  // remember original so we can see if we changed it
2480  const string sOriginalTitle = sTitle;
2481 
2482  // search for partial, must be just before bracketed organism
2483  SIZE_TYPE partialPos = NStr::Find(sTitle, ", partial [");
2484  if (partialPos == NPOS) {
2485  partialPos = NStr::Find(sTitle, ", partial (");
2486  }
2487 
2488  // find oldname or taxname in brackets at end of protein title
2489  SIZE_TYPE penult = NPOS;
2490  SIZE_TYPE suffixPos = s_TitleEndsInOrganism(sTitle, *org, penult); // will point to " [${organism name}]" at end
2491  // do not change unless [genus species] was at the end
2492  if (suffixPos == NPOS) {
2493  return false;
2494  }
2495 
2496  // truncate bracketed info from end of title, will replace with current taxname
2497  sTitle.resize(suffixPos);
2498  if (penult != NPOS) {
2499  sTitle.resize(penult);
2500  }
2501 
2502  // if ", partial [" was indeed just before the [genus species], it will now be ", partial"
2503  // Note: 9 is length of ", partial"
2504  if (!bPartial &&
2505  partialPos != string::npos &&
2506  (partialPos == (sTitle.length() - 9)))
2507  {
2508  sTitle.resize(partialPos);
2509  }
2511 
2512  //
2513  if (bPartial && partialPos == NPOS) {
2514  sTitle += ", partial";
2515  }
2516  if (!NStr::IsBlank(organelle)) {
2517  sTitle += " (" + string(organelle) + ")";
2518  }
2519  string first_kingdom, second_kingdom;
2520  if (IsCrossKingdom(*org, first_kingdom, second_kingdom)) {
2521  sTitle += " [" + first_kingdom + "][" + second_kingdom + "]";
2522  } else {
2523  sTitle += " [";
2524  if (org->IsSetTaxname()) {
2525  sTitle += org->GetTaxname();
2526  }
2527  sTitle += "]";
2528  }
2529 
2530  if (sTitle != sOriginalTitle) {
2531  return true;
2532  } else {
2533  return false;
2534  }
2535 }
2536 
2538 {
2539  if (!sequence::IsPseudo(cds, scope) ||
2540  !cds.IsSetData() || !cds.GetData().IsCdregion() ||
2541  !cds.IsSetProduct()) {
2542  return false;
2543  }
2544  CBioseq_Handle pseq = scope.GetBioseqHandle(cds.GetProduct());
2545  if (pseq) {
2547  if (prot) {
2548  string label;
2549  if (prot->GetData().GetProt().IsSetName() &&
2550  !prot->GetData().GetProt().GetName().empty()) {
2551  label = prot->GetData().GetProt().GetName().front();
2552  } else if (prot->GetData().GetProt().IsSetDesc()) {
2553  label = prot->GetData().GetProt().GetDesc();
2554  }
2555  if (!NStr::IsBlank(label)) {
2556  if (cds.IsSetComment() && !NStr::IsBlank(cds.GetComment())) {
2557  cds.SetComment(cds.GetComment() + "; " + label);
2558  } else {
2559  cds.SetComment(label);
2560  }
2561  }
2562  }
2563  CBioseq_EditHandle pseq_e(pseq);
2564  pseq_e.Remove();
2565  }
2566  cds.ResetProduct();
2567  return true;
2568 }
2569 
2570 
2572 {
2573  if (!gene.IsSetXref() || !gene.IsSetLocation() || !gene.GetLocation().IsInt()) {
2574  return false;
2575  }
2576  bool any_change = false;
2577  TSeqPos gene_start = gene.GetLocation().GetStart(eExtreme_Positional);
2578  TSeqPos gene_stop = gene.GetLocation().GetStop(eExtreme_Positional);
2579  ITERATE(CSeq_feat::TXref, xit, gene.GetXref()) {
2580  if ((*xit)->IsSetId() && (*xit)->GetId().IsLocal()) {
2581  const CTSE_Handle::TFeatureId& feat_id = (*xit)->GetId().GetLocal();
2584  TSeqPos f_start = f->GetLocation().GetStart(eExtreme_Positional);
2585  TSeqPos f_stop = f->GetLocation().GetStop(eExtreme_Positional);
2586  if (f_start < gene_start) {
2587  gene.SetLocation().SetInt().SetFrom(f_start);
2588  gene_start = f_start;
2589  any_change = true;
2590  }
2591  if (f_stop > gene_stop) {
2592  gene.SetLocation().SetInt().SetTo(f_stop);
2593  gene_stop = f_stop;
2594  any_change = true;
2595  }
2596  }
2597  }
2598  }
2599  return any_change;
2600 }
2601 
2602 
2603 typedef pair<size_t, bool> TRNALength;
2605 
2607  { "16S", { 1000, false } },
2608  { "18S", { 1000, false } },
2609  { "23S", { 2000, false } },
2610  { "25S", { 1000, false } },
2611  { "26S", { 1000, false } },
2612  { "28S", { 3300, false } },
2613  { "small", { 1000, false } },
2614  { "large", { 1000, false } },
2615  { "5.8S", { 130, true } },
2616  { "5S", { 90, true } }
2617  // possible problem: if it matches /25S/ it would also match /5S/
2618  // luckily, if it fails the /5S/ rule it would fail the /25S/ rule
2619 };
2620 
2621 
2622 static bool s_CleanupIsShortrRNA(const CSeq_feat& f, CScope* scope) // used in feature_tests.cpp
2623 {
2624  if (f.GetData().GetSubtype() != CSeqFeatData::eSubtype_rRNA) {
2625  return false;
2626  }
2627  bool is_bad = false;
2628  size_t len = sequence::GetLength(f.GetLocation(), scope);
2629  const CRNA_ref& rrna = f.GetData().GetRna();
2630  string rrna_name = rrna.GetRnaProductName();
2631  if (rrna_name.empty()) {
2632  // RNA name may still be in product GBQual
2633  if (f.IsSetQual()) {
2634  for (auto qit : f.GetQual()) {
2635  const CGb_qual& gbq = *qit;
2636  if ( gbq.IsSetQual() && gbq.GetQual() == "product" ) {
2637  rrna_name = gbq.GetVal();
2638  break;
2639  }
2640  }
2641  }
2642  }
2644  SIZE_TYPE pos = NStr::FindNoCase(rrna_name, it->first);
2645  if (pos != string::npos && len < it->second.first && !(it->second.second && f.IsSetPartial() && f.GetPartial()) ) {
2646  is_bad = true;
2647  break;
2648  }
2649  }
2650  return is_bad;
2651 }
2652 
2653 bool CCleanup::WGSCleanup(CSeq_entry_Handle entry, bool instantiate_missing_proteins, Uint4 options, bool run_extended_cleanup)
2654 {
2655  bool any_changes = false;
2656 
2657  int protein_id_counter = 1;
2658  bool create_general_only = edit::IsGeneralIdProtPresent(entry.GetTopLevelEntry());
2660  for (CFeat_CI cds_it(entry, sel); cds_it; ++cds_it) {
2661  bool change_this_cds = false;
2662  CRef<CSeq_feat> new_cds(new CSeq_feat());
2663  new_cds->Assign(*(cds_it->GetSeq_feat()));
2664  if (sequence::IsPseudo(*(cds_it->GetSeq_feat()), entry.GetScope())) {
2665  change_this_cds = RemovePseudoProduct(*new_cds, entry.GetScope());
2666  } else {
2667  string current_name = GetProteinName(*new_cds, entry);
2668 
2669  change_this_cds |= SetBestFrame(*new_cds, entry.GetScope());
2670 
2671  change_this_cds |= SetCDSPartialsByFrameAndTranslation(*new_cds, entry.GetScope());
2672 
2673  // retranslate
2674  if (new_cds->IsSetProduct() && entry.GetScope().GetBioseqHandleFromTSE(*(new_cds->GetProduct().GetId()), entry)) {
2675  any_changes |= feature::RetranslateCDS(*new_cds, entry.GetScope());
2676  } else {
2677  // need to set product if not set
2678  if (!new_cds->IsSetProduct() && !sequence::IsPseudo(*new_cds, entry.GetScope())) {
2679  string id_label;
2680  CRef<CSeq_id> new_id = edit::GetNewProtId(entry.GetScope().GetBioseqHandle(new_cds->GetLocation()), protein_id_counter, id_label, create_general_only);
2681  if (new_id) {
2682  new_cds->SetProduct().SetWhole().Assign(*new_id);
2683  change_this_cds = true;
2684  }
2685  }
2686  if (new_cds->IsSetProduct() && instantiate_missing_proteins) {
2687  CRef<CSeq_entry> prot = AddProtein(*new_cds, entry.GetScope());
2688  if (prot) {
2689  any_changes = true;
2690  }
2691  }
2692  any_changes |= feature::AdjustForCDSPartials(*new_cds, entry);
2693  }
2694  //prefer ncbieaa
2695  if (new_cds->IsSetProduct()) {
2696  CBioseq_Handle p = entry.GetScope().GetBioseqHandle(new_cds->GetProduct());
2697  if (p && p.IsSetInst() && p.GetInst().IsSetSeq_data() && p.GetInst().GetSeq_data().IsIupacaa()) {
2698  CBioseq_EditHandle peh(p);
2699  string current = p.GetInst().GetSeq_data().GetIupacaa().Get();
2700  CRef<CSeq_inst> new_inst(new CSeq_inst());
2701  new_inst->Assign(p.GetInst());
2702  new_inst->SetSeq_data().SetNcbieaa().Set(current);
2703  peh.SetInst(*new_inst);
2704  any_changes = true;
2705  }
2706  }
2707 
2708  if (NStr::IsBlank(current_name)) {
2709  SetProteinName(*new_cds, "hypothetical protein", false, entry.GetScope());
2710  current_name = "hypothetical protein";
2711  change_this_cds = true;
2712  } else if (new_cds->IsSetProduct()) {
2713  CBioseq_Handle p = entry.GetScope().GetBioseqHandle(new_cds->GetProduct());
2714  if (p) {
2716  if (!feat_ci) {
2717  // make new protein feature
2718  feature::AddProteinFeature(*(p.GetCompleteBioseq()), current_name, *new_cds, entry.GetScope());
2719  }
2720  }
2721  }
2722 
2723  CConstRef<CSeq_feat> mrna = sequence::GetmRNAforCDS(*(cds_it->GetSeq_feat()), entry.GetScope());
2724  if (mrna) {
2725  bool change_mrna = false;
2726  CRef<CSeq_feat> new_mrna(new CSeq_feat());
2727  new_mrna->Assign(*mrna);
2728  // Make mRNA name match coding region protein
2729  string mrna_name = new_mrna->GetData().GetRna().GetRnaProductName();
2730  if (NStr::IsBlank(mrna_name) && new_mrna->IsSetQual()) {
2731  for (auto it = new_mrna->GetQual().begin(); it != new_mrna->GetQual().end(); it++) {
2732  if ((*it)->IsSetQual() && (*it)->IsSetVal() && NStr::EqualNocase((*it)->GetQual(), "product")) {
2733  mrna_name = (*it)->GetVal();
2734  break;
2735  }
2736  }
2737  }
2738  if (NStr::IsBlank(mrna_name)
2739  || (!NStr::Equal(current_name, "hypothetical protein") &&
2740  !NStr::Equal(current_name, mrna_name))) {
2741  SetMrnaName(*new_mrna, current_name);
2742  change_mrna = true;
2743  }
2744  // Adjust mRNA partials to match coding region
2745  change_mrna |= feature::CopyFeaturePartials(*new_mrna, *new_cds);
2746  if (change_mrna) {
2747  CSeq_feat_Handle fh = entry.GetScope().GetSeq_featHandle(*mrna);
2748  CSeq_feat_EditHandle feh(fh);
2749  feh.Replace(*new_mrna);
2750  any_changes = true;
2751  }
2752  }
2753  }
2754 
2755  //any_changes |= feature::RetranslateCDS(*new_cds, entry.GetScope());
2756  if (change_this_cds) {
2757  CSeq_feat_EditHandle cds_h(*cds_it);
2758 
2759  cds_h.Replace(*new_cds);
2760  any_changes = true;
2761 
2762  //also need to redo protein title
2763  }
2764 
2765  }
2766 
2767  CTSE_Handle tse = entry.GetTSE_Handle();
2768 
2769  for (CFeat_CI rna_it(entry, SAnnotSelector(CSeqFeatData::e_Rna)); rna_it; ++rna_it) {
2770 
2771  const CSeq_feat& rna_feat = *(rna_it->GetSeq_feat());
2772  if (rna_feat.IsSetData() &&
2774  s_CleanupIsShortrRNA(rna_feat, &(entry.GetScope()))) {
2775 
2776  bool change_this_rrna = false;
2777  CRef<CSeq_feat> new_rrna(new CSeq_feat());
2778  new_rrna->Assign(*(rna_it->GetSeq_feat()));
2779 
2780  const CSeq_loc& loc = rna_feat.GetLocation();
2781  if (loc.IsSetStrand() && loc.GetStrand() == eNa_strand_minus) {
2782  if (loc.GetStart(eExtreme_Biological) >= sequence::GetLength(rna_feat.GetLocation(), &entry.GetScope())) {
2783  new_rrna->SetLocation().SetPartialStart(true, eExtreme_Biological);
2784  change_this_rrna = true;
2785  }
2786  if (loc.GetStop(eExtreme_Biological) < 1) {
2787  new_rrna->SetLocation().SetPartialStop(true, eExtreme_Biological);
2788  change_this_rrna = true;
2789  }
2790  } else {
2791  if (loc.GetStart(eExtreme_Biological) < 1) {
2792  new_rrna->SetLocation().SetPartialStart(true, eExtreme_Biological);
2793  change_this_rrna = true;
2794  }
2795  if (loc.GetStop(eExtreme_Biological) >= sequence::GetLength(rna_feat.GetLocation(), &entry.GetScope())) {
2796  new_rrna->SetLocation().SetPartialStop(true, eExtreme_Biological);
2797  change_this_rrna = true;
2798  }
2799  }
2800 
2801  if (change_this_rrna) {
2802  CSeq_feat_EditHandle rrna_h(*rna_it);
2803  rrna_h.Replace(*new_rrna);
2804  any_changes = true;
2805  }
2806  }
2807  }
2808 
2809  for (CFeat_CI gene_it(entry, SAnnotSelector(CSeqFeatData::e_Gene)); gene_it; ++gene_it) {
2810  bool change_this_gene;
2811  CRef<CSeq_feat> new_gene(new CSeq_feat());
2812  new_gene->Assign(*(gene_it->GetSeq_feat()));
2813 
2814  change_this_gene = ExpandGeneToIncludeChildren(*new_gene, tse);
2815 
2816  change_this_gene |= SetGenePartialByLongestContainedFeature(*new_gene, entry.GetScope());
2817 
2818  if (change_this_gene) {
2819  CSeq_feat_EditHandle gene_h(*gene_it);
2820  gene_h.Replace(*new_gene);
2821  any_changes = true;
2822  }
2823  }
2824 
2825  NormalizeDescriptorOrder(entry);
2826 
2827  for (CBioseq_CI bi(entry, CSeq_inst::eMol_na); bi; ++bi) {
2828  any_changes |= SetGeneticCodes(*bi);
2829  }
2830 
2831  if (run_extended_cleanup) {
2832  auto pChanged = CCleanup::ExtendedCleanup(entry, options);
2833  if (pChanged->ChangeCount()>0) {
2834  return true;
2835  }
2836  }
2837  return any_changes;
2838 }
2839 
2840 
2841 bool CCleanup::x_HasShortIntron(const CSeq_loc& loc, size_t min_len)
2842 {
2843  CSeq_loc_CI li(loc);
2844  while (li && li.IsEmpty()) {
2845  ++li;
2846  }
2847  if (!li) {
2848  return false;
2849  }
2850  while (li) {
2851  TSeqPos prev_end;
2852  ENa_strand prev_strand;
2853  if (li.IsSetStrand() && li.GetStrand() == eNa_strand_minus) {
2854  prev_end = li.GetRange().GetFrom();
2855  prev_strand = eNa_strand_minus;
2856  } else {
2857  prev_end = li.GetRange().GetTo();
2858  prev_strand = eNa_strand_plus;
2859  }
2860  ++li;
2861  while (li && li.IsEmpty()) {
2862  ++li;
2863  }
2864  if (li) {
2865  TSeqPos this_start;
2866  ENa_strand this_strand;
2867  if (li.IsSetStrand() && li.GetStrand() == eNa_strand_minus) {
2868  this_start = li.GetRange().GetTo();
2869  this_strand = eNa_strand_minus;
2870  } else {
2871  this_start = li.GetRange().GetFrom();
2872  this_strand = eNa_strand_plus;
2873  }
2874  if (this_strand == prev_strand) {
2875  if (abs((long int)this_start - (long int)prev_end) < min_len) {
2876  return true;
2877  }
2878  }
2879  }
2880  }
2881  return false;
2882 }
2883 
2884 //LCOV_EXCL_START
2885 //not used by asn_cleanup but used by table2asn
// Exception text that x_AddLowQualityException sets on, or appends to, a feature's except_text.
2886 const string kLowQualitySequence = "low-quality sequence region";
2887 
2889 {
2890  bool any_change = false;
2891  if (!feat.IsSetExcept()) {
2892  any_change = true;
2893  feat.SetExcept(true);
2894  }
2895  if (!feat.IsSetExcept_text() || NStr::IsBlank(feat.GetExcept_text())) {
2897  any_change = true;
2898  } else if (NStr::Find(feat.GetExcept_text(), kLowQualitySequence) == string::npos) {
2899  feat.SetExcept_text(feat.GetExcept_text() + "; " + kLowQualitySequence);
2900  any_change = true;
2901  }
2902  return any_change;
2903 }
2904 
2905 
2907 {
2908  bool any_changes = false;
2909 
2910  SAnnotSelector sel(subtype);
2911  for (CFeat_CI cds_it(entry, sel); cds_it; ++cds_it) {
2912  bool change_this_cds = false;
2913  CRef<CSeq_feat> new_cds(new CSeq_feat());
2914  new_cds->Assign(*(cds_it->GetSeq_feat()));
2915  if (!sequence::IsPseudo(*(cds_it->GetSeq_feat()), entry.GetScope()) &&
2916  x_HasShortIntron(cds_it->GetLocation())) {
2917  change_this_cds = x_AddLowQualityException(*new_cds);
2918  }
2919 
2920  if (change_this_cds) {
2921  CSeq_feat_EditHandle cds_h(*cds_it);
2922 
2923  cds_h.Replace(*new_cds);
2924  any_changes = true;
2925  }
2926  }
2927  return any_changes;
2928 }
2929 
2930 
2932 {
2933  bool any_changes = x_AddLowQualityException(entry, CSeqFeatData::eSubtype_cdregion);
2935  return any_changes;
2936 }
2937 //LCOV_EXCL_STOP
2938 
2939 
2940 // maps the type of seqdesc to the order it should be in
2941 // (lowest to highest)
2944  // Note that ordering must match ordering
2945  // in CSeqdesc::E_Choice
2946  { CSeqdesc::e_Mol_type, 13 },
2947  { CSeqdesc::e_Modif, 14 },
2948  { CSeqdesc::e_Method, 15 },
2949  { CSeqdesc::e_Name, 7 },
2950  { CSeqdesc::e_Title, 1 },
2951  { CSeqdesc::e_Org, 16 },
2952  { CSeqdesc::e_Comment, 6 },
2953  { CSeqdesc::e_Num, 11 },
2954  { CSeqdesc::e_Maploc, 9 },
2955  { CSeqdesc::e_Pir, 18 },
2956  { CSeqdesc::e_Genbank, 22 },
2957  { CSeqdesc::e_Pub, 5 },
2958  { CSeqdesc::e_Region, 10 },
2959  { CSeqdesc::e_User, 8 },
2960  { CSeqdesc::e_Sp, 17 },
2961  { CSeqdesc::e_Dbxref, 12 },
2962  { CSeqdesc::e_Embl, 21 },
2963  { CSeqdesc::e_Create_date, 24 },
2964  { CSeqdesc::e_Update_date, 25 },
2965  { CSeqdesc::e_Prf, 19 },
2966  { CSeqdesc::e_Pdb, 20 },
2967  { CSeqdesc::e_Het, 4 },
2968 
2969  { CSeqdesc::e_Source, 2 },
2970  { CSeqdesc::e_Molinfo, 3 },
2971  { CSeqdesc::e_Modelev, 23 }
2972 };
2975 
2976 static
2978  // ordering assigned to unknown
2979  const int unknown_seqdesc = static_cast<int>(1 + sc_SeqdescOrderMap.size());
2980 
2981  TSeqdescOrderMap::const_iterator find_iter = sc_SeqdescOrderMap.find(chs);
2982  if (find_iter == sc_SeqdescOrderMap.end()) {
2983  return unknown_seqdesc;
2984  }
2985 
2986  return find_iter->second;
2987 }
2988 
2989 static
2990 bool s_SeqDescLessThan(const CRef<CSeqdesc> &desc1, const CRef<CSeqdesc> &desc2)
2991 {
2992  CSeqdesc::E_Choice chs1, chs2;
2993 
2994  chs1 = desc1->Which();
2995  chs2 = desc2->Which();
2996 
2997  return (s_SeqDescToOrdering(chs1) < s_SeqDescToOrdering(chs2));
2998 }
2999 
3001 {
3002  bool rval = false;
3003  if (!seq_mac_is_sorted(descr.Set().begin(), descr.Set().end(), s_SeqDescLessThan)) {
3004  descr.Set().sort(s_SeqDescLessThan);
3005  rval = true;
3006  }
3007  return rval;
3008 }
3009 
3011 {
3012  bool rval = false;
3013 
3015  while (ci) {
3017  if (edit.IsSetDescr()) {
3018  rval |= NormalizeDescriptorOrder(edit.SetDescr());
3019  }
3020  ++ci;
3021  }
3022 
3023  return rval;
3024 }
3025 
3026 
3028 {
3029  bool removed = false;
3030  if (seq.IsSetDescr()) {
3031  CConstRef<CSeqdesc> last_title;
3033  if ((*d)->IsTitle()) {
3034  if (last_title) {
3035  seq.RemoveSeqdesc(*last_title);
3036  removed = true;
3037  }
3038  last_title.Reset(d->GetPointer());
3039  }
3040  }
3041  }
3042  return removed;
3043 }
3044 
3045 
3047 {
3048  bool removed = false;
3049  if (set.IsSetDescr()) {
3050  CConstRef<CSeqdesc> last_title;
3051  ITERATE(CBioseq::TDescr::Tdata, d, set.GetDescr().Get()) {
3052  if ((*d)->IsTitle()) {
3053  if (last_title) {
3054  set.RemoveSeqdesc(*last_title);
3055  removed = true;
3056  }
3057  last_title.Reset(d->GetPointer());
3058  }
3059  }
3060  }
3061  return removed;
3062 }
3063 
3064 
3066 {
3067  if (seh.IsSet() && seh.GetSet().IsSetClass() &&
3069  return false;
3070  }
3071  CSeq_entry_EditHandle eh(seh);
3073  return true;
3074 }
3075 
3076 
3077 void s_GetAuthorsString(string *out_authors, const CAuth_list& auth_list)
3078 {
3079  string & auth_str = *out_authors;
3080  auth_str.clear();
3081 
3082  if (!auth_list.IsSetNames()) {
3083  return;
3084  }
3085 
3086  vector<string> name_list;
3087 
3088  if (auth_list.GetNames().IsStd()) {
3089  ITERATE(CAuth_list::TNames::TStd, auth_it, auth_list.GetNames().GetStd()) {
3090  if ((*auth_it)->IsSetName()) {
3091  string label;
3092  (*auth_it)->GetName().GetLabel(&label);
3093  name_list.push_back(label);
3094  }
3095  }
3096  } else if (auth_list.GetNames().IsMl()) {
3097  copy(BEGIN_COMMA_END(auth_list.GetNames().GetMl()),
3098  back_inserter(name_list));
3099  } else if (auth_list.GetNames().IsStr()) {
3100  copy(BEGIN_COMMA_END(auth_list.GetNames().GetStr()),
3101  back_inserter(name_list));
3102  }
3103 
3104  if (name_list.size() == 0) {
3105  return;
3106  } else if (name_list.size() == 1) {
3107  auth_str = name_list.back();
3108  return;
3109  }
3110 
3111  // join most of them by commas, but the last one gets an "and"
3112  string last_author;
3113  last_author.swap(name_list.back());
3114  name_list.pop_back();
3115  // swap is faster than assignment
3116  NStr::Join(name_list, ", ").swap(auth_str);
3117  auth_str += "and ";
3118  auth_str += last_author;
3119 
3120  return;
3121 }
3122 
3123 
3125  string *out_authors_string, const CPubdesc& pd)
3126 {
3127  string & authors_string = *out_authors_string;
3128  authors_string.clear();
3129 
3130  FOR_EACH_PUB_ON_PUBDESC(pub, pd) {
3131  if ((*pub)->IsSetAuthors()) {
3132  s_GetAuthorsString(&authors_string, (*pub)->GetAuthors());
3133  break;
3134  }
3135  }
3136 }
3137 
3138 
3140 (const CPubdesc& pd,
3141 vector<TEntrezId>& pmids, vector<TEntrezId>& muids, vector<int>& serials,
3142 vector<string>& published_labels,
3143 vector<string>& unpublished_labels)
3144 {
3145  string label;
3146  bool is_published = false;
3147  bool need_label = false;
3148 
3149  if (!pd.IsSetPub()) {
3150  return;
3151  }
3152  ITERATE(CPubdesc::TPub::Tdata, it, pd.GetPub().Get()) {
3153  if ((*it)->IsPmid()) {
3154  pmids.push_back((*it)->GetPmid());
3155  is_published = true;
3156  } else if ((*it)->IsMuid()) {
3157  muids.push_back((*it)->GetMuid());
3158  is_published = true;
3159  } else if ((*it)->IsGen()) {
3160  if ((*it)->GetGen().IsSetCit()
3161  && NStr::StartsWith((*it)->GetGen().GetCit(), "BackBone id_pub", NStr::eNocase)) {
3162  need_label = true;
3163  }
3164  if ((*it)->GetGen().IsSetSerial_number()) {
3165  serials.push_back((*it)->GetGen().GetSerial_number());
3166  if ((*it)->GetGen().IsSetCit()
3167  || (*it)->GetGen().IsSetJournal()
3168  || (*it)->GetGen().IsSetDate()) {
3169  need_label = true;
3170  }
3171  } else {
3172  need_label = true;
3173  }
3174  } else if ((*it)->IsArticle() && (*it)->GetArticle().IsSetIds()) {
3175  is_published = true;
3176  ITERATE(CArticleIdSet::Tdata, id, (*it)->GetArticle().GetIds().Get()) {
3177  if ((*id)->IsPubmed()) {
3178  pmids.push_back((*id)->GetPubmed());
3179  is_published = true;
3180  } else if ((*id)->IsMedline()) {
3181  muids.push_back((*id)->GetMedline());
3182  }
3183  }
3184  need_label = true;
3185  } else {
3186  need_label = true;
3187  }
3188  if (need_label && NStr::IsBlank(label)) {
3189  // create unique label
3190  (*it)->GetLabel(&label, CPub::eContent, CPub::fLabel_Unique);
3191  string auth_str;
3192  s_GetAuthorsString(&auth_str, pd);
3193  label += "; ";
3194  label += auth_str;
3195  }
3196  }
3197  if (!NStr::IsBlank(label)) {
3198  if (is_published) {
3199  published_labels.push_back(label);
3200  } else {
3201  unpublished_labels.push_back(label);
3202  }
3203  }
3204 }
3205 
3206 
3207 vector<CConstRef<CPub> > CCleanup::GetCitationList(CBioseq_Handle bsh)
3208 {
3209  vector<CConstRef<CPub> > pub_list;
3210 
3211  // first get descriptor pubs
3212  CSeqdesc_CI di(bsh, CSeqdesc::e_Pub);
3213  while (di) {
3214  vector<TEntrezId> pmids;
3215  vector<TEntrezId> muids;
3216  vector<int> serials;
3217  vector<string> published_labels;
3218  vector<string> unpublished_labels;
3219  GetPubdescLabels(di->GetPub(), pmids, muids, serials, published_labels, unpublished_labels);
3220  if (pmids.size() > 0) {
3221  CRef<CPub> pub(new CPub());
3222  pub->SetPmid().Set(pmids[0]);
3223  pub_list.push_back(pub);
3224  } else if (muids.size() > 0) {
3225  CRef<CPub> pub(new CPub());
3226  pub->SetMuid(muids[0]);
3227  pub_list.push_back(pub);
3228  } else if (serials.size() > 0) {
3229  CRef<CPub> pub(new CPub());
3230  pub->SetGen().SetSerial_number(serials[0]);
3231  pub_list.push_back(pub);
3232  } else if (published_labels.size() > 0) {
3233  CRef<CPub> pub(new CPub());
3234  pub->SetGen().SetCit(published_labels[0]);
3235  pub_list.push_back(pub);
3236  } else if (unpublished_labels.size() > 0) {
3237  CRef<CPub> pub(new CPub());
3238  pub->SetGen().SetCit(unpublished_labels[0]);
3239  pub_list.push_back(pub);
3240  }
3241 
3242  ++di;
3243  }
3244  // now get pub features
3246  while (fi) {
3247  vector<TEntrezId> pmids;
3248  vector<TEntrezId> muids;
3249  vector<int> serials;
3250  vector<string> published_labels;
3251  vector<string> unpublished_labels;
3252  GetPubdescLabels(fi->GetData().GetPub(), pmids, muids, serials, published_labels, unpublished_labels);
3253  if (pmids.size() > 0) {
3254  CRef<CPub> pub(new CPub());
3255  pub->SetPmid().Set(pmids[0]);
3256  pub_list.push_back(pub);
3257  } else if (muids.size() > 0) {
3258  CRef<CPub> pub(new CPub());
3259  pub->SetMuid(muids[0]);
3260  pub_list.push_back(pub);
3261  } else if (serials.size() > 0) {
3262  CRef<CPub> pub(new CPub());
3263  pub->SetGen().SetSerial_number(serials[0]);
3264  pub_list.push_back(pub);
3265  } else if (published_labels.size() > 0) {
3266  CRef<CPub> pub(new CPub());
3267  pub->SetGen().SetCit(published_labels[0]);
3268  pub_list.push_back(pub);
3269  } else if (unpublished_labels.size() > 0) {
3270  CRef<CPub> pub(new CPub());
3271  pub->SetGen().SetCit(unpublished_labels[0]);
3272  pub_list.push_back(pub);
3273  }
3274 
3275  ++fi;
3276  }
3277  return pub_list;
3278 }
3279 
3280 
3282 {
3283  bool any_change = false;
3284  CSeq_descr::Tdata::iterator it1 = descr.Set().begin();
3285  while (it1 != descr.Set().end()) {
3286  if ((*it1)->IsPub()) {
3287  CSeq_descr::Tdata::iterator it2 = it1;
3288  ++it2;
3289  while (it2 != descr.Set().end()) {
3290  if ((*it2)->IsPub() && (*it1)->GetPub().Equals((*it2)->GetPub())) {
3291  it2 = descr.Set().erase(it2);
3292  any_change = true;
3293  } else {
3294  ++it2;
3295  }
3296  }
3297  }
3298  ++it1;
3299  }
3300  return any_change;
3301 }
3302 
3303 
3304 bool s_FirstPubMatchesSecond(const CPubdesc& pd1, const CPubdesc& pd2)
3305 {
3306  if (pd1.Equals(pd2)) {
3307  return true;
3308  } else if (pd1.IsSetPub() && pd2.IsSetPub() && pd1.GetPub().Get().size() == 1) {
3309  ITERATE(CPubdesc::TPub::Tdata, it, pd2.GetPub().Get()) {
3310  if (pd1.GetPub().Get().front()->Equals(**it)) {
3311  return true;
3312  }
3313  }
3314  }
3315  return false;
3316 }
3317 
3318 
3319 bool CCleanup::PubAlreadyInSet(const CPubdesc& pd, const CSeq_descr& descr)
3320 {
3321  ITERATE(CSeq_descr::Tdata, d, descr.Get()) {
3322  if ((*d)->IsPub() && s_FirstPubMatchesSecond(pd, (*d)->GetPub())) {
3323  return true;
3324  }
3325  }
3326  return false;
3327 }
3328 
3329 
3331 {
3332  bool is_embl_or_ddbj = false;
3333  ITERATE(CBioseq::TId, id, b.GetId()) {
3334  if ((*id)->IsEmbl() || (*id)->IsDdbj()) {
3335  is_embl_or_ddbj = true;
3336  break;
3337  }
3338  }
3339  return !is_embl_or_ddbj;
3340 }
3341 
3342 
3344 {
3345  if (pd.IsSetNum() || pd.IsSetName() || pd.IsSetFig() || pd.IsSetComment()) {
3346  return false;
3347  } else {
3348  return true;
3349  }
3350 }
3351 
3352 
3354 {
3355  // add descriptor to nuc-prot parent or sequence itself
3356  CBioseq_set_Handle parent = b.GetParentBioseq_set();
3357  if (!CCleanup::OkToPromoteNpPub(*(b.GetCompleteBioseq()))) {
3358  // add to sequence
3359  CBioseq_EditHandle eh(b);
3360  eh.AddSeqdesc(*d);
3363  } else if (parent && parent.IsSetClass() &&
3364  parent.GetClass() == CBioseq_set::eClass_nuc_prot &&
3365  parent.IsSetDescr() && PubAlreadyInSet(d->GetPub(), parent.GetDescr())) {
3366  // don't add descriptor, just delete feature
3367  } else if (OkToPromoteNpPub((d)->GetPub()) &&
3368  parent && parent.IsSetClass() &&
3369  parent.GetClass() == CBioseq_set::eClass_nuc_prot) {
3370  CBioseq_set_EditHandle eh(parent);
3371  eh.AddSeqdesc(*d);
3374  } else {
3375  CBioseq_EditHandle eh(b);
3376  eh.AddSeqdesc(*d);
3379  }
3380  if (remove_feat) {
3381  // remove feature
3382  CSeq_feat_EditHandle feh(feat);
3383  feh.Remove();
3384  }
3385 }
3386 
3387 
3389 {
3390  bool any_change = false;
3391  for (CBioseq_CI b(seh); b; ++b) {
3392  for (CFeat_CI p(*b, CSeqFeatData::e_Pub); p; ++p) {
3393  if (p->GetLocation().IsInt() &&
3394  p->GetLocation().GetStart(eExtreme_Biological) == 0 &&
3395  p->GetLocation().GetStop(eExtreme_Biological) == b->GetBioseqLength() - 1) {
3396  CRef<CSeqdesc> d(new CSeqdesc());
3397  d->SetPub().Assign(p->GetData().GetPub());
3398  if (p->IsSetComment()) {
3399  if (d->GetPub().IsSetComment() && !NStr::IsBlank(d->GetPub().GetComment())) {
3400  d->SetPub().SetComment(d->GetPub().GetComment() + "; " + p->GetComment());
3401  } else {
3402  d->SetPub().SetComment();
3403  }
3404  }
3405  MoveOneFeatToPubdesc(*p, d, *b);
3406  any_change = true;
3407  }
3408  }
3409  }
3410  return any_change;
3411 }
3412 
3413 
3414 bool IsSiteRef(const CSeq_feat& sf)
3415 {
3416  if (sf.GetData().IsImp() &&
3417  sf.GetData().GetImp().IsSetKey() &&
3418  NStr::Equal(sf.GetData().GetImp().GetKey(), "Site-ref")) {
3419  return true;
3420  } else {
3421  return false;
3422  }
3423 }
3424 
3425 
3426 bool CCleanup::IsMinPub(const CPubdesc& pd, bool is_refseq_prot)
3427 {
3428  if (!pd.IsSetPub()) {
3429  return true;
3430  }
3431  bool found_non_minimal = false;
3432  ITERATE(CPubdesc::TPub::Tdata, it, pd.GetPub().Get()) {
3433  if ((*it)->IsMuid() || (*it)->IsPmid()) {
3434  if (is_refseq_prot) {
3435  found_non_minimal = true;
3436  break;
3437  }
3438  } else if ((*it)->IsGen()) {
3439  const CCit_gen& gen = (*it)->GetGen();
3440  if (gen.IsSetCit() && !gen.IsSetJournal() &&
3441  !gen.IsSetAuthors() && !gen.IsSetVolume() &&
3442  !gen.IsSetPages()) {
3443  //minimalish, keep looking
3444  } else {
3445  found_non_minimal = true;
3446  }
3447  } else {
3448  found_non_minimal = true;
3449  break;
3450  }
3451  }
3452 
3453  return !found_non_minimal;
3454 }
3455 
3456 
3458 {
3459  bool found_site_ref = false;
3461  while (f && !found_site_ref) {
3462  if (IsSiteRef(*(f->GetSeq_feat()))) {
3463  found_site_ref = true;
3464  }
3465  ++f;
3466  }
3467  if (!found_site_ref) {
3468  return false;
3469  }
3470 
3471  bool any_change = false;
3472  for (CBioseq_CI b(seh); b; ++b) {
3473  bool is_refseq_prot = false;
3474  if (b->IsAa()) {
3475  ITERATE(CBioseq::TId, id_it, b->GetCompleteBioseq()->GetId()) {
3476  if ((*id_it)->IsOther()) {
3477  is_refseq_prot = true;
3478  break;
3479  }
3480  }
3481  }
3482 
3483  for (CFeat_CI p(*b); p; ++p) {
3484  if (!p->IsSetCit() || p->GetCit().Which() != CPub_set::e_Pub) {
3485  continue;
3486  }
3487 
3488  bool is_site_ref = IsSiteRef(*(p->GetSeq_feat()));
3489  ITERATE(CSeq_feat::TCit::TPub, c, p->GetCit().GetPub()) {
3490  CRef<CSeqdesc> d(new CSeqdesc());
3491  if ((*c)->IsEquiv()) {
3492  ITERATE(CPub_equiv::Tdata, t, (*c)->GetEquiv().Get()) {
3493  CRef<CPub> pub_copy(new CPub());
3494  pub_copy->Assign(**t);
3495  d->SetPub().SetPub().Set().push_back(pub_copy);
3496  }
3497 
3498  } else {
3499  CRef<CPub> pub_copy(new CPub());
3500  pub_copy->Assign(**c);
3501  d->SetPub().SetPub().Set().push_back(pub_copy);
3502  }
3503  if (is_site_ref) {
3505  } else {
3507  }
3508  auto changes = makeCleanupChange(0);
3509  CNewCleanup_imp pubclean(changes, 0);
3510  pubclean.BasicCleanup(d->SetPub(), ShouldStripPubSerial(*(b->GetCompleteBioseq())));
3511  if (!IsMinPub(d->SetPub(), is_refseq_prot)) {
3512  MoveOneFeatToPubdesc(*p, d, *b, false);
3513  }
3514  }
3515  if (is_site_ref) {
3516 
3517  CSeq_feat_EditHandle feh(*p);
3518  CSeq_annot_Handle annot = feh.GetAnnot();
3519 
3520  feh.Remove();
3521 
3522  // remove old annot if now empty
3524  CSeq_annot_EditHandle annot_edit(annot);
3525  annot_edit.Remove();
3526  }
3527 
3528  }
3529  any_change = true;
3530  }
3531  }
3532  return any_change;
3533 }
3534 
3535 
3537 {
3538  if (src1.IsSetOrg() && src1.GetOrg().IsSetTaxname() &&
3539  src2.IsSetOrg() && src2.GetOrg().IsSetTaxname() &&
3540  NStr::Equal(src1.GetOrg().GetTaxname(), src2.GetOrg().GetTaxname())) {
3541  return true;
3542  } else {
3543  return false;
3544  }
3545 }
3546 
3547 
3548 static bool s_SubsourceCompareC (
3549  const CRef<CSubSource>& st1,
3550  const CRef<CSubSource>& st2
3551 )
3552 
3553 {
3554  const CSubSource& sbs1 = *(st1);
3555  const CSubSource& sbs2 = *(st2);
3556 
3557  TSUBSOURCE_SUBTYPE chs1 = GET_FIELD (sbs1, Subtype);
3558  TSUBSOURCE_SUBTYPE chs2 = GET_FIELD (sbs2, Subtype);
3559 
3560  if (chs1 < chs2) return true;
3561  if (chs1 > chs2) return false;
3562 
3563  if (FIELD_IS_SET (sbs2, Name)) {
3564  if (! FIELD_IS_SET (sbs1, Name)) return true;
3565  if (NStr::CompareNocase(GET_FIELD (sbs1, Name), GET_FIELD (sbs2, Name)) < 0) return true;
3566  }
3567 
3568  return false;
3569 }
3570 
3571 static bool s_SameSubtypeC(const CSubSource& s1, const CSubSource& s2)
3572 {
3573  if (!s1.IsSetSubtype() && !s2.IsSetSubtype()) {
3574  return true;
3575  } else if (!s1.IsSetSubtype() || !s2.IsSetSubtype()) {
3576  return false;
3577  } else {
3578  return s1.GetSubtype() == s2.GetSubtype();
3579  }
3580 }
3581 
3582 // close enough if second name contains the first
3583 static bool s_NameCloseEnoughC(const CSubSource& s1, const CSubSource& s2)
3584 {
3585  if (!s1.IsSetName() && !s2.IsSetName()) {
3586  return true;
3587  } else if (!s1.IsSetName() || !s2.IsSetName()) {
3588  return false;
3589  }
3590  const string& n1 = s1.GetName();
3591  const string& n2 = s2.GetName();
3592 
3593  if (NStr::Equal(n1, n2)) {
3594  return true;
3595  } else {
3596  return false;
3597  }
3598 }
3599 
3600 
// NOTE(review): the signature line and two lines inside the first `if`
// (listing lines 3607-3608) are missing from this listing. The call
// s_SubSourceListUniqued(src1) elsewhere in the file suggests this is that
// helper -- confirm against the repository.
//
// Operates on `biosrc` only when its subsource list has more than one
// element. Walks adjacent pairs (s, s_next) and erases the earlier element
// whenever s_SameSubtypeC and s_NameCloseEnoughC both hold for the pair.
// Returns true if at least one subsource was erased.
3602 {
3603  bool res = false;
3604 
3605  // sort and remove duplicates.
3606  if (biosrc.IsSetSubtype() && biosrc.GetSubtype().size() > 1) {
3609  }
3610 
3611  // remove duplicates and subsources that contain previous values
3612  CBioSource::TSubtype::iterator s = biosrc.SetSubtype().begin();
3613  CBioSource::TSubtype::iterator s_next = s;
3614  ++s_next;
3615  while (s_next != biosrc.SetSubtype().end()) {
3616  if (s_SameSubtypeC(**s, **s_next) && s_NameCloseEnoughC(**s, **s_next)) {
  // erase() hands back the element after the removed one, i.e. the
  // element s_next currently refers to; s_next itself is advanced below.
3617  s = biosrc.SetSubtype().erase(s);
3618  res = true;
3619  } else {
3620  ++s;
3621  }
3622  ++s_next;
3623  }
3624  }
3625 
3626  return res;
3627 }
3628 
// NOTE(review): the signature line and several condition/loop lines (listing
// lines 3634, 3640, 3652) are missing from this listing. The body merges the
// contents of `add` into `src1`; the name is presumably MergeDupBioSources --
// confirm against the repository.
//
// Steps visible here:
//  - genome / origin are copied from `add` when src1's value is unset or
//    "unknown" (the remainder of each condition is not visible),
//  - is_focus is set on src1 when only `add` has it,
//  - each subsource of `add` is deep-copied onto src1's subtype list,
//  - the Org-refs are merged with x_MergeDupOrgRefs,
//  - the resulting subsource list is de-duplicated.
// Returns true if any of the tracked steps changed src1.
3630 {
3631  bool any_change = false;
3632  // genome
3633  if ((!src1.IsSetGenome() || src1.GetGenome() == CBioSource::eGenome_unknown) &&
3635  src1.SetGenome(add.GetGenome());
3636  any_change = true;
3637  }
3638  // origin
3639  if ((!src1.IsSetOrigin() || src1.GetOrigin() == CBioSource::eOrigin_unknown) &&
3641  src1.SetOrigin(add.GetOrigin());
3642  any_change = true;
3643  }
3644  // focus
3645  if (!src1.IsSetIs_focus() && add.IsSetIs_focus()) {
3646  src1.SetIs_focus();
3647  any_change = true;
3648  }
3649 
3650  // merge subtypes
3651  if (add.IsSetSubtype()) {
3653  CRef<CSubSource> a(new CSubSource());
3654  a->Assign(**it);
3655  src1.SetSubtype().push_back(a);
3656  }
  // flagged as changed whenever `add` has a subtype list at all
3657  any_change = true;
3658  }
3659 
  // NOTE(review): the return value of this call is not folded into
  // any_change -- confirm whether that is intentional.
3660  x_MergeDupOrgRefs(src1.SetOrg(), add.GetOrg());
3661 
3662  if (s_SubSourceListUniqued(src1)) {
3663  any_change = true;
3664  }
3665 
3666  return any_change;
3667 }
3668 
3669 
// NOTE(review): the signature line is missing from this listing; the call
// x_MergeDupOrgNames(org1.SetOrgname(), add.GetOrgname()) elsewhere in the
// file suggests this is that function -- confirm against the repository.
//
// Merges the org-name `add` into `on1`:
//  - every OrgMod of `add` is deep-copied and appended (no duplicate check),
//  - gcode / mgcode are copied when on1's value is unset or 0 and add's is
//    set and non-zero,
//  - lineage / div are copied only when on1 has none.
// Returns true if any of these steps applied.
3671 {
3672  bool any_change = false;
3673 
3674  // OrgMods
3675  if (add.IsSetMod()) {
3676  ITERATE(COrgName::TMod, it, add.GetMod()) {
3677  CRef<COrgMod> a(new COrgMod());
3678  a->Assign(**it);
3679  on1.SetMod().push_back(a);
3680  }
  // flagged as changed whenever `add` has a mod list, even an empty one
3681  any_change = true;
3682  }
3683 
3684  // gcode
3685  if ((!on1.IsSetGcode() || on1.GetGcode() == 0) && add.IsSetGcode() && add.GetGcode() != 0) {
3686  on1.SetGcode(add.GetGcode());
3687  any_change = true;
3688  }
3689 
3690  // mgcode
3691  if ((!on1.IsSetMgcode() || on1.GetMgcode() == 0) && add.IsSetMgcode() && add.GetMgcode() != 0) {
3692  on1.SetMgcode(add.GetMgcode());
3693  any_change = true;
3694  }
3695 
3696  // lineage
3697  if (!on1.IsSetLineage() && add.IsSetLineage()) {
3698  on1.SetLineage(add.GetLineage());
3699  any_change = true;
3700  }
3701 
3702  // div
3703  if (!on1.IsSetDiv() && add.IsSetDiv()) {
3704  on1.SetDiv(add.GetDiv());
3705  any_change = true;
3706  }
3707 
3708  return any_change;
3709 }
3710 
3711 
3712 bool HasMod(const COrg_ref& org, const string& mod)
3713 {
3714  if (!org.IsSetMod()) {
3715  return false;
3716  }
3717  ITERATE(COrg_ref::TMod, it, org.GetMod()) {
3718  if (NStr::Equal(*it, mod)) {
3719  return true;
3720  }
3721  }
3722  return false;
3723 }
3724 
3725 
// NOTE(review): the signature line is missing from this listing; the call
// x_MergeDupOrgRefs(src1.SetOrg(), add.GetOrg()) elsewhere in the file
// suggests this is that function -- confirm against the repository.
//
// Merges the Org-ref `add` into `org1`:
//  - mods are appended only when HasMod reports they are not yet present,
//  - dbxrefs are deep-copied and appended without a duplicate check,
//  - synonyms are appended without a duplicate check,
//  - org-names are merged through x_MergeDupOrgNames.
// Returns true if any of these steps applied.
3727 {
3728  bool any_change = false;
3729  // mods
3730  if (add.IsSetMod()) {
3731  ITERATE(COrg_ref::TMod, it, add.GetMod()) {
3732  if (!HasMod(org1, *it)) {
3733  org1.SetMod().push_back(*it);
3734  any_change = true;
3735  }
3736  }
3737  }
3738 
3739  // dbxrefs
3740  if (add.IsSetDb()) {
3741  ITERATE(COrg_ref::TDb, it, add.GetDb()) {
3742  CRef<CDbtag> a(new CDbtag());
3743  a->Assign(**it);
3744  org1.SetDb().push_back(a);
3745  }
3746  any_change = true;
3747  }
3748 
3749  // synonyms
3750  if (add.IsSetSyn()) {
3751  ITERATE(COrg_ref::TSyn, it, add.GetSyn()) {
3752  org1.SetSyn().push_back(*it);
3753  }
3754  any_change = true;
3755  }
3756 
  // org-name: delegate field-by-field merging
3757  if (add.IsSetOrgname()) {
3758  any_change |= x_MergeDupOrgNames(org1.SetOrgname(), add.GetOrgname());
3759  }
3760 
3761  return any_change;
3762 }
3763 
3764 
// NOTE(review): the signature line is missing from this listing; the body
// operates on a descriptor list named `seq_descr` -- confirm the exact
// signature against the repository.
//
// For every source descriptor that has a taxname (src1), scans the
// descriptors after it; each later source descriptor that
// AreBioSourcesMergeable accepts is merged into src1, src1 is passed through
// ExtendedCleanup, and the later descriptor is erased from the list.
// Returns true if any descriptor was merged away.
3766 {
3767  bool any_change = false;
3768  CSeq_descr::Tdata::iterator src1 = seq_descr.Set().begin();
3769  while (src1 != seq_descr.Set().end()) {
3770  if ((*src1)->IsSource() && (*src1)->GetSource().IsSetOrg() && (*src1)->GetSource().GetOrg().IsSetTaxname()) {
3771  CSeq_descr::Tdata::iterator src2 = src1;
3772  ++src2;
3773  while (src2 != seq_descr.Set().end()) {
3774  if ((*src2)->IsSource() &&
3775  AreBioSourcesMergeable((*src1)->GetSource(), (*src2)->GetSource())) {
3776  MergeDupBioSources((*src1)->SetSource(), (*src2)->GetSource());
3777 
  // clean the merged source with a throw-away change record
3778  auto changes = makeCleanupChange(0);
3779  CNewCleanup_imp srcclean(changes, 0);
3780  srcclean.ExtendedCleanup((*src1)->SetSource());
  // erase() yields the next descriptor, so src2 is not incremented here
3781  src2 = seq_descr.Set().erase(src2);
3782  any_change = true;
3783  } else {
3784  ++src2;
3785  }
3786  }
3787  }
3788  ++src1;
3789  }
3790  return any_change;
3791 }
3792 
3793 /// Remove duplicate biosource descriptors
// NOTE(review): the signature line is missing from this listing; the body
// operates on a descriptor list named `descr`.
//
// Keeps the first occurrence of each distinct BioSource (compared with
// Equals) and erases later source descriptors that are equal to one already
// seen. Returns true if any descriptor was erased.
3795 {
3796  bool any_change = false;
  // sources already encountered, in order of first appearance
3797  vector<CConstRef<CBioSource> > src_list;
3798  CSeq_descr::Tdata::iterator d = descr.Set().begin();
3799  while (d != descr.Set().end()) {
3800  if ((*d)->IsSource()) {
3801  bool found = false;
3802  ITERATE(vector<CConstRef<CBioSource> >, s, src_list) {
3803  if ((*d)->GetSource().Equals(**s)) {
3804  found = true;
3805  break;
3806  }
3807  }
3808  if (found) {
3809  d = descr.Set().erase(d);
3810  any_change = true;
3811  } else {
3812  CConstRef<CBioSource> src(&((*d)->GetSource()));
3813  src_list.push_back(src);
3814  ++d;
3815  }
3816  } else {
3817  ++d;
3818  }
3819  }
3820  return any_change;
3821 }
3822 
3823 
// NOTE(review): the signature line and one line inside the comment branch
// (listing line 3835) are missing from this listing. The call
// BioSrcFromFeat(*(p->GetSeq_feat())) elsewhere in the file suggests this is
// that function -- confirm against the repository.
//
// Builds a new BioSource from a biosource feature `f`:
//  - returns an empty CRef when `f` has no data or its data is not a biosrc,
//  - otherwise copies the feature's BioSource, turns the feature comment
//    into an extra subsource whose name is the comment text, copies the
//    feature's dbxrefs onto the Org-ref, and runs ExtendedCleanup on the
//    result.
3825 {
3826  if (!f.IsSetData() || !f.GetData().IsBiosrc()) {
3827  return CRef<CBioSource>();
3828  }
3829  CRef<CBioSource> src(new CBioSource());
3830  src->Assign(f.GetData().GetBiosrc());
3831 
3832  // move comment to subsource note
3833  if (f.IsSetComment()) {
3834  CRef<CSubSource> s(new CSubSource());
  // (the line that presumably sets the subsource subtype is not visible)
3836  s->SetName(f.GetComment());
3837  src->SetSubtype().push_back(s);
3838 
3839  }
3840 
3841  // move dbxrefs on feature to source
3842  if (f.IsSetDbxref()) {
3843  ITERATE(CSeq_feat::TDbxref, it, f.GetDbxref()) {
3844  CRef<CDbtag> a(new CDbtag());
3845  a->Assign(**it);
3846  src->SetOrg().SetDb().push_back(a);
3847  }
3848  }
3849  auto changes = makeCleanupChange(0);
3850  CNewCleanup_imp srcclean(changes, 0);
3851  srcclean.ExtendedCleanup(*src);
3852 
3853  return src;
3854 }
3855 
3856 
// NOTE(review): the signature line and several interior lines (listing lines
// 3865, 3886-3888, 3892-3894, 3901) are missing from this listing; the body
// iterates the bioseqs under an entry handle named `seh`.
//
// For each bioseq that has no source descriptor with is_focus set (the
// second half of that test is not visible), converts every biosource feature
// whose location is a single interval spanning the whole sequence into a
// source descriptor. The descriptor is added to the parent nuc-prot set when
// there is one, otherwise to the bioseq itself, and the feature is removed.
// Returns true if any feature was converted.
3858 {
3859  bool any_change = false;
3860  for (CBioseq_CI b(seh); b; ++b) {
3861  bool transgenic_or_focus = false;
3862  CSeqdesc_CI existing_src(*b, CSeqdesc::e_Source);
3863  while (existing_src && !transgenic_or_focus) {
3864  if (existing_src->GetSource().IsSetIs_focus() ||
3866  transgenic_or_focus = true;
3867  }
3868  ++existing_src;
3869  }
3870  if (transgenic_or_focus) {
3871  continue;
3872  }
3873  for (CFeat_CI p(*b, CSeqFeatData::e_Biosrc); p; ++p) {
  // only full-length single-interval features qualify
3874  if (p->GetLocation().IsInt() &&
3875  p->GetLocation().GetStart(eExtreme_Biological) == 0 &&
3876  p->GetLocation().GetStop(eExtreme_Biological) == b->GetBioseqLength() - 1) {
3877  CRef<CSeqdesc> d(new CSeqdesc());
3878  d->SetSource().Assign(*(BioSrcFromFeat(*(p->GetSeq_feat()))));
3879 
3880  // add descriptor to nuc-prot parent or sequence itself
3881  CBioseq_set_Handle parent = b->GetParentBioseq_set();
3882  if (parent && parent.IsSetClass() &&
3883  parent.GetClass() == CBioseq_set::eClass_nuc_prot) {
3884  CBioseq_set_EditHandle eh(parent);
3885  eh.AddSeqdesc(*d);
3889  } else {
3890  CBioseq_EditHandle eh(*b);
3891  eh.AddSeqdesc(*d);
3895  }
3896 
3897  // remove feature
3898  CSeq_feat_EditHandle feh(*p);
3899  CSeq_annot_Handle ah = feh.GetAnnot();
3900  feh.Remove();
  // (the condition guarding the annot removal below is not visible)
3902  CSeq_annot_EditHandle aeh(ah);
3903  aeh.Remove();
3904  }
3905 
3906  any_change = true;
3907  }
3908  }
3909  }
3910  return any_change;
3911 }
3912 
3913 
3914 
// NOTE(review): the signature line is missing from this listing; the body
// iterates all features under an entry handle named `seh`.
//
// Pass 1 counts, over all features, how many gene features use locus vs.
// locus_tag and how many gene xrefs on other features use locus vs.
// locus_tag. It returns false as soon as the counts show a mix that this
// function does not handle (genes using both fields, or gene xrefs already
// using the same field as the genes).
//
// Pass 2 rewrites gene xrefs so they use the same field as the genes:
//  - genes use only locus_tag and xrefs use only locus: move each xref's
//    locus into locus_tag;
//  - genes use only locus and xrefs use only locus_tag: move each xref's
//    locus_tag into locus.
// Features are edited on a copy and swapped in through an edit handle.
3916 {
3917  CFeat_CI fi(seh);
3918  size_t num_gene_locus = 0;
3919  size_t num_gene_locus_tag = 0;
3920  size_t num_gene_xref_locus = 0;
3921  size_t num_gene_xref_locus_tag = 0;
3922 
3923  while (fi) {
3924  if (fi->GetData().IsGene()) {
3925  if (fi->GetData().GetGene().IsSetLocus()) {
3926  num_gene_locus++;
3927  }
3928  if (fi->GetData().GetGene().IsSetLocus_tag()) {
3929  num_gene_locus_tag++;
3930  }
3931  } else if (fi->IsSetXref()) {
3932  const CGene_ref* g = fi->GetGeneXref();
3933  if (g) {
3934  if (g->IsSetLocus()) {
3935  num_gene_xref_locus++;
3936  }
3937  if (g->IsSetLocus_tag()) {
3938  num_gene_xref_locus_tag++;
3939  }
3940  }
3941  }
  // bail out early on combinations that are not a simple skew
3942  if (num_gene_locus > 0) {
3943  if (num_gene_locus_tag > 0) {
3944  return false;
3945  }
3946  if (num_gene_xref_locus > 0) {
3947  return false;
3948  }
3949  }
3950  if (num_gene_locus_tag > 0) {
3951  if (num_gene_locus > 0) {
3952  return false;
3953  }
3954  if (num_gene_xref_locus_tag > 0) {
3955  return false;
3956  }
3957  }
3958  ++fi;
3959  }
3960 
3961  bool any_change = false;
3962  if (num_gene_locus == 0 && num_gene_locus_tag > 0) {
3963  if (num_gene_xref_locus > 0 && num_gene_xref_locus_tag == 0) {
  // genes use locus_tag, xrefs use locus: convert xrefs to locus_tag
3964  fi.Rewind();
3965  while (fi) {
3966  if (!fi->GetData().IsGene() && fi->GetGeneXref()) {
3967  bool this_change = false;
3968  CRef<CSeq_feat> new_f(new CSeq_feat());
3969  new_f->Assign(*(fi->GetSeq_feat()));
3970  NON_CONST_ITERATE(CSeq_feat::TXref, it, new_f->SetXref()) {
3971  if ((*it)->IsSetData() && (*it)->GetData().IsGene()
3972  && (*it)->GetData().GetGene().IsSetLocus()) {
3973  (*it)->SetData().SetGene().SetLocus_tag((*it)->GetData().GetGene().GetLocus());
3974  (*it)->SetData().SetGene().ResetLocus();
3975  this_change = true;
3976  }
3977  }
3978  if (this_change) {
3979  CSeq_feat_EditHandle eh(*fi);
3980  eh.Replace(*new_f);
  // NOTE(review): unlike the mirror-image branch below, any_change
  // is not set here, so this rewrite is never reported to the
  // caller -- looks like an omission; confirm and fix upstream.
3981  }
3982  }
3983  ++fi;
3984  }
3985  }
3986  } else if (num_gene_locus > 0 && num_gene_locus_tag == 0) {
3987  if (num_gene_xref_locus == 0 && num_gene_xref_locus_tag > 0) {
  // genes use locus, xrefs use locus_tag: convert xrefs to locus
3988  fi.Rewind();
3989  while (fi) {
3990  if (!fi->GetData().IsGene() && fi->GetGeneXref()) {
3991  bool this_change = false;
3992  CRef<CSeq_feat> new_f(new CSeq_feat());
3993  new_f->Assign(*(fi->GetSeq_feat()));
3994  NON_CONST_ITERATE(CSeq_feat::TXref, it, new_f->SetXref()) {
3995  if ((*it)->IsSetData() && (*it)->GetData().IsGene()
3996  && (*it)->GetData().GetGene().IsSetLocus_tag()) {
3997  (*it)->SetData().SetGene().SetLocus((*it)->GetData().GetGene().GetLocus_tag());
3998  (*it)->SetData().SetGene().ResetLocus_tag();
3999  this_change = true;
4000  }
4001  }
4002  if (this_change) {
4003  CSeq_feat_EditHandle eh(*fi);
4004  eh.Replace(*new_f);
4005  any_change = true;
4006  }
4007  }
4008  ++fi;
4009  }
4010  }
4011  }
4012  return any_change;
4013 }
4014 
4015 
// NOTE(review): the signature line is missing from this listing; the call
// ShouldStripPubSerial(*(b->GetCompleteBioseq())) elsewhere in the file
// suggests this is that function, with the bioseq parameter named `bs`.
//
// Decides from the bioseq's Seq-ids whether publication serial numbers
// should be stripped. Starts from true and switches to false when any id is:
//  - Genbank or Tpg with an accession of exactly 6 characters,
//  - Embl or Ddbj,
//  - Gibbsq, Gibbmt, Pir, Swissprot, Patent, Prf, Pdb, Gpipe, Tpe or Tpd.
// not_set, Local, Other, General and any unlisted id type leave the answer
// unchanged.
4017 {
4018  bool strip_serial = true;
4019  ITERATE(CBioseq::TId, id, bs.GetId()) {
4020  const CSeq_id& sid = **id;
4021  switch (sid.Which()) {
4022  case NCBI_SEQID(Genbank):
4023  case NCBI_SEQID(Tpg):
4024  {
4025  const CTextseq_id& tsid = *GET_FIELD(sid, Textseq_Id);
4026  if (FIELD_IS_SET(tsid, Accession)) {
4027  const string& acc = GET_FIELD(tsid, Accession);
4028  if (acc.length() == 6) {
4029  strip_serial = false;
4030  }
4031  }
4032  }
4033  break;
4034  case NCBI_SEQID(Embl):
4035  case NCBI_SEQID(Ddbj):
4036  strip_serial = false;
4037  break;
4038  case NCBI_SEQID(not_set):
4039  case NCBI_SEQID(Local):
4040  case NCBI_SEQID(Other):
4041  case NCBI_SEQID(General):
4042  break;
4043  case NCBI_SEQID(Gibbsq):
4044  case NCBI_SEQID(Gibbmt):
4045  case NCBI_SEQID(Pir):
4046  case NCBI_SEQID(Swissprot):
4047  case NCBI_SEQID(Patent):
4048  case NCBI_SEQID(Prf):
4049  case NCBI_SEQID(Pdb):
4050  case NCBI_SEQID(Gpipe):
4051  case NCBI_SEQID(Tpe):
4052  case NCBI_SEQID(Tpd):
4053  strip_serial = false;
4054  break;
4055  default:
4056  break;
4057  }
4058  }
4059  return strip_serial;
4060 }
4061 
4062 
// NOTE(review): the signature line and several interior lines (listing lines
// 4066, 4073, 4089-4090) are missing from this listing, including the
// declarations of `entry`, `eh` and `ch`. The recursive call
// RenormalizeNucProtSets(ch) suggests the function name -- confirm against
// the repository.
//
// For a nuc-prot set that contains exactly one member which is a sequence,
// the set is converted to that sequence (ConvertSetToSeq); if the sequence
// has descriptors, RemoveUnseenTitles and NormalizeDescriptorOrder are then
// applied. For the container-like set classes listed below, the function
// recurses (the loop producing `ch` is not visible).
// Returns true if any set was converted.
4064 {
4065  bool change_made = false;
4067  if (seh.IsSet() && seh.GetSet().IsSetClass() &&
4068  entry->GetSet().IsSetSeq_set()) {
4069  CBioseq_set::TClass set_class = seh.GetSet().GetClass();
4070  if (set_class == CBioseq_set::eClass_nuc_prot) {
4071  if (entry->GetSet().GetSeq_set().size() == 1 &&
4072  entry->GetSet().GetSeq_set().front()->IsSeq()) {
4074  eh.ConvertSetToSeq();
4075  if (eh.GetSeq().IsSetDescr()) {
4076  RemoveUnseenTitles(eh.SetSeq());
4077  NormalizeDescriptorOrder(eh.SetSeq().SetDescr());
4078  }
4079  change_made = true;
4080  }
4081  } else if (set_class == CBioseq_set::eClass_genbank ||
4082  set_class == CBioseq_set::eClass_mut_set ||
4083  set_class == CBioseq_set::eClass_pop_set ||
4084  set_class == CBioseq_set::eClass_phy_set ||
4085  set_class == CBioseq_set::eClass_eco_set ||
4086  set_class == CBioseq_set::eClass_wgs_set ||
4087  set_class == CBioseq_set::eClass_gen_prod_set ||
4088  set_class == CBioseq_set::eClass_small_genome_set) {
4091  change_made |= RenormalizeNucProtSets(ch);
4092  }
4093  }
4094  }
4095  return change_made;
4096 }
4097 
4098 
// NOTE(review): the signature line is missing from this listing; the body
// edits a string named `str` in place and returns a bool.
//
// Replaces the "&name;" / "&#NNN;" codes listed in `transformations` with
// their plain-text equivalents. `str` is only overwritten when at least one
// replacement was made; the return value reports whether that happened.
//
// Outline:
//  1. return immediately when `str` contains no '&';
//  2. lazily build a shared CTextFsm over the code words (mutex-guarded);
//  3. scan from each '&', feeding characters to the FSM until ';', the end
//     of the string, or another '&' seen while the FSM is in state 0;
//  4. append either the translation or the untouched text to `result`.
4100 {
4101 // return false;
4102  bool change_made = false;
4103 
4104  // This is more complex than you might initially think is necessary
4105  // because this needs to be as efficient as possible since it's
4106  // called on every single string in an object.
4107 
4108  SIZE_TYPE amp = str.find('&');
4109  if( NPOS == amp ) {
4110  // Check for the common case of no replacements required
4111  return change_made;
4112  }
4113 
4114  // transformations done by this function:
4115  const static struct {
4116  string src_word;
4117  string result_word;
4118  } transformations[] = {
4119  // all start with an implicit ampersand
4120  // and end with an implicit semi-colon
4121  { "amp", "&" },
4122  { "apos", "\'" },
4123  { "gt", ">" },
4124  { "lt", "<" },
4125  { "quot", "\"" },
4126  { "#13&#10", "" },
4127  { "#13;&#10", "" },
4128  { "#916", "Delta" },
4129  { "#945", "alpha" },
4130  { "#946", "beta" },
4131  { "#947", "gamma" },
4132  { "#952", "theta" },
4133  { "#955", "lambda" },
4134  { "#956", "mu" },
4135  { "#957", "nu" },
4136  { "#8201", "" },
4137  { "#8206", "" },
4138  { "#8242", "'" },
4139  { "#8594", "->" },
4140  { "#8722", "-" },
4141  { "#8710", "delta" },
4142  { "#64257", "fi" },
4143  { "#64258", "fl" },
4144  { "#65292", "," },
4145  };
4146 
4147  // Collisions should be rare enough that the CFastMutex is
4148  // faster than recreating the searcher each time this function is called
4149  static CTextFsm<int> searcher;
4150  // set searcher's state, if not already done
4151  {
4152  // just in case of the tiny chance that two threads try to prime
4153  // the searcher at the same time.
4154  static CFastMutex searcher_mtx;
4155  CFastMutexGuard searcher_mtx_guard( searcher_mtx );
4156  if (! searcher.IsPrimed()) {
4157  for (unsigned idx = 0; idx < ArraySize(transformations); ++idx) {
4158  // match type is index into transformations array
4159  searcher.AddWord(transformations[idx].src_word, idx);
4160  }
4161  searcher.Prime();
4162  }
4163  }
4164 
4165  // a smart compiler probably won't need this manual optimization,
4166  // but just in case.
4167  const SIZE_TYPE str_len = str.length();
4168 
4169  // fill result up to the first '&'
4170  string result;
4171  result.reserve( str_len );
4172  copy( str.begin(), str.begin() + amp,
4173  back_inserter(result) );
4174 
4175  // at the start of each loop, the result is filled in
4176  // up to the ampersand (amp)
4177  while( amp != NPOS && amp < str_len ) {
4178 
4179  // find out what the ampersand code represents
4180  // (if it represents anything)
4181  int state = searcher.GetInitialState();
4182  SIZE_TYPE search_pos = (amp + 1);
  // NOTE(review): an '&' followed by a space ends processing here without
  // copying the remainder of `str` into `result`; if an earlier code was
  // already replaced, the tail would be dropped from the output -- confirm.
4183  if (str[search_pos] == ' ') {
4184  break;
4185  }
4186  for( ; search_pos < str_len ; ++search_pos ) {
4187  const char ch = str[search_pos];
4188  if( ch == ';' ) {
4189  break;
4190  }
4191  if( ch == '&' && state == 0 ) {
4192  --search_pos; // so we don't skip over the '&'
4193  state = searcher.GetInitialState(); // force "no-match"
4194  break;
4195  }
4196  state = searcher.GetNextState(state, ch);
4197  }
4198 
  // a code that matches but runs to the end of the string without ';'
  // is still translated
4199  if( search_pos == str_len && searcher.IsMatchFound(state) ) {
4200  // copy the translation of the XML code:
4201  _ASSERT( searcher.GetMatches(state).size() == 1 );
4202  const int match_idx = searcher.GetMatches(state)[0];
4203  const string & result_word = transformations[match_idx].result_word;
4204  copy( result_word.begin(), result_word.end(),
4205  back_inserter(result) );
4206  change_made = true;
4207  break;
4208  }
4209 
4210  if( search_pos >= str_len ) {
4211  // we reached the end without finding anything, so
4212  // copy the rest and break
4213  copy( str.begin() + amp, str.end(),
4214  back_inserter(result) );
4215  break;
4216  }
4217 
4218  if( searcher.IsMatchFound(state) ) {
4219  // copy the translation of the XML code:
4220  _ASSERT( searcher.GetMatches(state).size() == 1 );
4221  const int match_idx = searcher.GetMatches(state)[0];
4222  const string & result_word = transformations[match_idx].result_word;
4223  copy( result_word.begin(), result_word.end(),
4224  back_inserter(result) );
4225  change_made = true;
4226  } else {
4227  // no match found, so copy the text we looked at
4228  // as-is
4229  copy( str.begin() + amp, str.begin() + search_pos + 1,
4230  back_inserter(result) );
4231  }
4232 
4233  // find next_amp
  // NOTE(review): for "&&" the first '&' was already copied just above and
  // a second one is appended here, yet the next iteration starts at that
  // second '&' and appears to copy it again -- confirm against unit tests.
4234  if( str[search_pos] == '&' ) {
4235  // special case that occurs when there are multiple '&' together
4236  ++search_pos;
4237  result += '&';
4238  }
4239  SIZE_TYPE next_amp = str.find('&', search_pos );
4240  if( NPOS == next_amp ) {
4241  // no more amps; copy the rest and break
4242  copy( str.begin() + search_pos + 1, str.end(),
4243  back_inserter(result) );
4244  break;
4245  }
4246 
4247  // copy up to the next amp
4248  if( (search_pos + 1) < next_amp ) {
4249  copy( str.begin() + search_pos + 1, str.begin() + next_amp,
4250  back_inserter(result) );
4251  }
4252  amp = next_amp;
4253  }
4254 
  // leave the caller's string untouched unless something was replaced
4255  if (change_made) {
4256  str = result;
4257  }
4258 
4259  return change_made;
4260 }
4261 
4262 
// NOTE(review): several lines are missing from this listing (the case labels
// of the switch, the mapper construction, and parts of the later conditions
// -- listing lines 4269, 4272-4273, 4277-4278, 4282-4284, 4288, 4297, 4326,
// 4331, 4337).
//
// Maps a nucleotide location onto the protein product of coding region
// `cds`.
//
// @param nuc_loc          nucleotide location to map
// @param cds              coding region feature used for the mapping
// @param scope            scope used to resolve features and sequences
// @param require_inframe  when true, the location is first classified with
//                         feature::IsLocationInFrame and an empty CRef is
//                         returned unless the classification is accepted
// @return the mapped protein location, or an empty CRef when the in-frame
//         check fails, the mapper yields nothing, or the mapped location has
//         no id / still has the id of nuc_loc.
//
// After mapping, the strand is reset, any portion at or beyond the protein
// length is subtracted, and partial flags are cleared where the location
// starts at 0 or ends at the last protein residue.
4263 CRef<CSeq_loc> CCleanup::GetProteinLocationFromNucleotideLocation(const CSeq_loc& nuc_loc, const CSeq_feat& cds, CScope& scope, bool require_inframe)
4264 {
4265  if (require_inframe) {
4266  feature::ELocationInFrame is_in_frame = feature::IsLocationInFrame(scope.GetSeq_featHandle(cds), nuc_loc);
4267  bool is_ok = false;
4268  switch (is_in_frame) {
4270  is_ok = true;
4271  break;
4274  is_ok = true;
4275  }
4276  break;
4279  is_ok = true;
4280  }
4281  break;
4285  is_ok = true;
4286  }
4287  break;
4289  break;
4290  }
4291  if (!is_ok) {
4292  return CRef<CSeq_loc>();
4293  }
4294  }
4295  CRef<CSeq_loc> new_loc;
4296  CRef<CSeq_loc_Mapper> nuc2prot_mapper(
4298  new_loc = nuc2prot_mapper->Map(nuc_loc);
4299  if (!new_loc) {
4300  return CRef<CSeq_loc>();
4301  }
4302 
4303  const CSeq_id* sid = new_loc->GetId();
4304  const CSeq_id* orig_id = nuc_loc.GetId();
4305  if (!sid || (orig_id && sid->Equals(*orig_id))) {
4306  // unable to map to protein location
4307  return CRef<CSeq_loc>();
4308  }
4309 
4310  new_loc->ResetStrand();
4311 
4312  // if location includes stop codon, remove it
4313  CBioseq_Handle prot = scope.GetBioseqHandle(*sid);
4314  if (prot && new_loc->GetStop(eExtreme_Positional) >= prot.GetBioseqLength())
4315  {
4316  CRef<CSeq_id> sub_id(new CSeq_id());
4317  sub_id->Assign(*sid);
  // interval from the protein length through the mapped stop
4318  CSeq_loc sub(*sub_id, prot.GetBioseqLength(), new_loc->GetStop(eExtreme_Positional), new_loc->GetStrand());
4319  new_loc = sequence::Seq_loc_Subtract(*new_loc, sub, CSeq_loc::fMerge_All | CSeq_loc::fSort, &scope);
4320  if (nuc_loc.IsPartialStop(eExtreme_Biological)) {
4321  new_loc->SetPartialStop(true, eExtreme_Biological);
4322  }
4323  }
4324 
4325  if (!new_loc->IsInt() && !new_loc->IsPnt()) {
4327  new_loc = tmp;
4328  }
4329 
4330  // fix partials if protein feature starts or ends at beginning or end of protein sequence
4332  new_loc->GetStart(eExtreme_Biological) == 0) {
4333  if (new_loc->IsPartialStart(eExtreme_Biological)) {
4334  new_loc->SetPartialStart(false, eExtreme_Biological);
4335  }
4336  }
4338  new_loc->GetStop(eExtreme_Biological) == prot.GetBioseqLength() - 1) {
4339  if (new_loc->IsPartialStop(eExtreme_Biological)) {
4340  new_loc->SetPartialStop(false, eExtreme_Biological);
4341  }
4342  }
4343 
4344  return new_loc;
4345 }
4346 
4347 
// NOTE(review): the signature line is missing from this listing; the body
// uses `nuc_loc` and `scope` and forwards to
// GetProteinLocationFromNucleotideLocation(nuc_loc, *cds, scope).
//
// Convenience form that first looks up the coding region overlapping
// `nuc_loc` with sequence::GetOverlappingCDS. Returns an empty CRef when no
// such coding region exists or it has no product; otherwise returns the
// result of the forwarded call.
4349 {
4350  CConstRef<CSeq_feat> cds = sequence::GetOverlappingCDS(nuc_loc, scope);
4351  if (!cds || !cds->IsSetProduct()) {
4352  // there is no overlapping coding region feature, so there is no appropriate
4353  // protein sequence to move to
4354  return CRef<CSeq_loc>();
4355  }
4356 
4357  return GetProteinLocationFromNucleotideLocation(nuc_loc, *cds, scope);
4358 }
4359 
4360 
4361 
// NOTE(review): the signature line and the line that obtains `protein`
// (listing line 4368) are missing from this listing. The call
// RepackageProteins(**feat_it, set) elsewhere in the file suggests this is
// that overload, taking a coding region `cds` and a nuc-prot set `np`.
//
// Moves the protein product of `cds` into the set `np`. Returns false when
// the product is unset or not a whole-sequence location, when the protein
// handle is empty, or when the protein already sits in `np`; returns true
// after the protein's entry has been taken into `np`.
4363 {
4364  if (!cds.IsSetProduct() || !cds.GetProduct().IsWhole()) {
4365  // no product, or product is specified weirdly
4366  return false;
4367  }
4369  if (!protein) {
4370  // protein is not in the same TSE
4371  return false;
4372  }
4373  if (protein.GetParentBioseq_set() == np) {
4374  // already in the right set
4375  return false;
4376  }
4377  CBioseq_set_EditHandle eh(np);
4378  CSeq_entry_Handle ph = protein.GetSeq_entry_Handle();
4379  CSeq_entry_EditHandle peh(ph);
4380  eh.TakeEntry(peh);
4381  return true;
4382 }
4383 
4384 
// NOTE(review): the signature line and the declaration of the iterator `si`
// (listing line 4388) are missing from this listing.
//
// Visits each set produced by `si`. For every nuc-prot set that has
// annotations, walks its feature tables and calls
// RepackageProteins(feature, set) for each coding region feature.
// Returns true if any of those calls reported a change.
4386 {
4387  bool changed = false;
4389  while (si) {
4390  CBioseq_set_Handle set = si->GetSet();
4391  if (set.IsSetClass() && set.GetClass() == CBioseq_set::eClass_nuc_prot && set.HasAnnots()) {
4392  ITERATE(CBioseq_set::TAnnot, annot_it, set.GetCompleteBioseq_set()->GetAnnot()) {
4393  if ((*annot_it)->IsSetData() && (*annot_it)->IsFtable()) {
4394  ITERATE(CSeq_annot::TData::TFtable, feat_it, (*annot_it)->GetData().GetFtable()) {
4395  if ((*feat_it)->IsSetData() && (*feat_it)->GetData().IsCdregion()) {
4396  changed |= RepackageProteins(**feat_it, set);
4397  }
4398  }
4399  }
4400  }
4401  }
4402  ++si;
4403  }
4404  return changed;
4405 }
4406 
4407 
// NOTE(review): the signature line is missing from this listing; the body
// uses an entry handle `seh` and a molecule `filter` for CBioseq_CI.
//
// For each bioseq selected by `filter`, copies its Seq-inst and calls
// ConvertDeltaToRaw on the copy; when that reports success, the converted
// copy is installed on the bioseq through an edit handle.
// Returns true if any bioseq was converted.
4409 {
4410  bool any_change = false;
4411  for (CBioseq_CI bi(seh, filter); bi; ++bi) {
4412  CBioseq_Handle bsh = *bi;
  // work on a copy so the bioseq is only touched on success
4413  CRef<CSeq_inst> inst(new CSeq_inst());
4414  inst->Assign(bsh.GetInst());
4415  if (inst->ConvertDeltaToRaw()) {
4416  CBioseq_EditHandle beh(bsh);
4417  beh.SetInst(*inst);
4418  any_change = true;
4419  }
4420  }
4421  return any_change;
4422 }
4423 
4424 
// NOTE(review): the first line of the signature (listing line 4425) and two
// interior lines (4476, 4524) are missing from this listing. The call
// ParseCodeBreak(feat, cdregion, val, scope) elsewhere in the file suggests
// the function name and that the first parameter is the feature `feat`.
//
// Parses a code-break description in `str` -- text containing "(pos:" and
// normally "aa:" -- and appends a new CCode_break to `cds`.
//
// Returns false (optionally posting a message through pMessageListener)
// when `str` is empty, `feat` has no location or location id, no "(pos:" is
// found, the position text cannot be read as a location, an interval
// location is longer than 3 bases, or the location check at listing line
// 4523-4525 fails. Returns true after the code break has been added.
//
// The amino acid defaults to 'X' when no alphabetic token follows "aa:".
4426  CCdregion& cds,
4427  const CTempString& str,
4428  CScope& scope,
4429  IObjtoolsListener* pMessageListener)
4430 {
4431  if (str.empty() || !feat.IsSetLocation()) {
4432  return false;
4433  }
4434 
4435  const CSeq_id* feat_loc_seq_id = feat.GetLocation().GetId();
4436  if (!feat_loc_seq_id) {
4437  return false;
4438  }
4439 
4440  string::size_type aa_pos = NStr::Find(str, "aa:");
4441  string::size_type len = 0;
4442  string::size_type loc_pos, end_pos;
4443  char protein_letter = 'X';
4444  CRef<CSeq_loc> break_loc;
4445 
  // fall back to "first ':' after the first ','" when "aa:" is absent
4446  if (aa_pos == string::npos) {
4447  aa_pos = NStr::Find(str, ",");
4448  if (aa_pos != string::npos) {
4449  aa_pos = NStr::Find(str, ":", aa_pos);
4450  }
4451  if (aa_pos != string::npos) {
4452  aa_pos++;
4453  }
4454  } else {
4455  aa_pos += 3;
4456  }
4457 
  // skip blanks, then take the run of letters as the amino acid name
4458  if (aa_pos != string::npos) {
4459  while (aa_pos < str.length() && isspace(str[aa_pos])) {
4460  aa_pos++;
4461  }
4462  while (aa_pos + len < str.length() && isalpha(str[aa_pos + len])) {
4463  len++;
4464  }
4465  if (len != 0) {
4466  protein_letter = x_ValidAminoAcid(str.substr(aa_pos, len));
4467  }
4468  }
4469 
4470  loc_pos = NStr::Find(str, "(pos:");
4471 
  // postMessage dereferences pMessageListener unconditionally; every call
  // below is guarded by a null check on the listener.
4472  using TSubcode = CCleanupMessage::ESubcode;
4473  auto postMessage =
4474  [pMessageListener](string msg, TSubcode subcode) {
4475  pMessageListener->PutMessage(
4477  };
4478 
4479  if (loc_pos == string::npos) {
4480  if (pMessageListener) {
4481  string msg = "Unable to identify code-break location in '" + str + "'";
4482  postMessage(msg, TSubcode::eParseError);
4483  }
4484  return false;
4485  }
  // step over "(pos:" and any following blanks
4486  loc_pos += 5;
4487  while (loc_pos < str.length() && isspace(str[loc_pos])) {
4488  loc_pos++;
4489  }
4490 
  // the position text ends at ",aa:", else at the first ',', else at the
  // end of the string
4491  end_pos = NStr::Find(str, ",aa:", loc_pos);
4492  if (end_pos == NPOS) {
4493  end_pos = NStr::Find(str, ",", loc_pos);
4494  if (end_pos == NPOS) {
4495  end_pos = str.length();
4496  }
4497  }
4498 
4499  string pos = NStr::TruncateSpaces_Unsafe(str.substr(loc_pos, end_pos - loc_pos));
4500 
4501  // handle multi-interval positions by adding a join() around them
4502  if (pos.find_first_of(",") != string::npos) {
4503  pos = "join(" + pos + ")";
4504  }
4505 
4506  break_loc = ReadLocFromText(pos, feat_loc_seq_id, &scope);
4507 
4508  if (!break_loc) {
4509  if (pMessageListener) {
4510  string msg = "Unable to extract code-break location from '" + str + "'";
4511  postMessage(msg, TSubcode::eParseError);
4512  }
4513  return false;
4514  }
4515 
4516  if (break_loc->IsInt() && sequence::GetLength(*break_loc, &scope) > 3) {
4517  if (pMessageListener) {
4518  string msg = "code-break location exceeds 3 bases";
4519  postMessage(msg, TSubcode::eBadLocation);
4520  }
4521  return false;
4522  }
4523  if ((break_loc->IsInt() || break_loc->IsPnt()) &&
4525  if (pMessageListener) {
4526  string msg = "code-break location lies outside of coding region";
4527  postMessage(msg, TSubcode::eBadLocation);
4528  }
4529  return false;
4530  }
4531 
  // make the break location follow the strand of the feature location
4532  if (FIELD_IS_SET(feat.GetLocation(), Strand)) {
4533  if (GET_FIELD(feat.GetLocation(), Strand) == eNa_strand_minus) {
4534  break_loc->SetStrand(eNa_strand_minus);
4535  }
4536  else if (GET_FIELD(feat.GetLocation(), Strand) == eNa_strand_plus) {
4537  break_loc->SetStrand(eNa_strand_plus);
4538  }
4539  } else {
4540  RESET_FIELD(*break_loc, Strand);
4541  }
4542 
4543  // need to build code break object and add it to coding region
4544  CRef<CCode_break> newCodeBreak(new CCode_break());
4545  CCode_break::TAa& aa = newCodeBreak->SetAa();
4546  aa.SetNcbieaa(protein_letter);
4547  newCodeBreak->SetLoc(*break_loc);
4548 
4549  CCdregion::TCode_break& orig_list = cds.SetCode_break();
4550  orig_list.push_back(newCodeBreak);
4551 
4552  return true;
4553 }
4554 
4555 
// NOTE(review): the signature line is missing from this listing; the body
// uses a mutable feature `feat` and a `scope`.
//
// Converts "transl_except" qualifiers on a coding region feature into code
// breaks. Each qualifier named transl_except (case-insensitive) that has a
// value and that ParseCodeBreak accepts is erased from the qualifier list;
// qualifiers that fail to parse are left in place. If the list ends up
// empty it is reset.
// Returns true if at least one qualifier was converted and removed; returns
// false immediately when `feat` is not a coding region with qualifiers and
// a location.
4557 {
4558  if (!feat.IsSetData() || !feat.GetData().IsCdregion() ||
4559  !feat.IsSetQual() || !feat.IsSetLocation()) {
4560  return false;
4561  }
4562 
4563  bool any_removed = false;
4564  CSeq_feat::TQual::iterator it = feat.SetQual().begin();
4565  while (it != feat.SetQual().end()) {
4566  if ((*it)->IsSetQual() &&
4567  NStr::EqualNocase((*it)->GetQual(), "transl_except") &&
4568  (*it)->IsSetVal() &&
4569  ParseCodeBreak(feat, feat.SetData().SetCdregion(), (*it)->GetVal(), scope)) {
4570  it = feat.SetQual().erase(it);
4571  any_removed = true;
4572  } else {
4573  ++it;
4574  }
4575  }
4576  if (feat.GetQual().size() == 0) {
4577  feat.ResetQual();
4578  }
4579  return any_removed;
4580 }
4581 
4582 
// NOTE(review): the signature line and the declaration of `flu_map`
// (listing lines 4583, 4585) are missing from this listing; the function
// name is not visible here. The body uses an entry handle named `entry`.
//
// Groups the nucleotide bioseqs under `entry` by the key that
// CInfluenzaSet::GetKey derives from the Org-ref of the first source
// descriptor found for each bioseq (bioseqs with a blank key are ignored),
// then calls MakeSet on every group for which OkToMakeSet is true.
// Returns the number of groups for which MakeSet was called.
4584 {
4586 
4587  CBioseq_CI bi(entry, CSeq_inst::eMol_na);
4588  while (bi) {
4589  CSeqdesc_CI src(*bi, CSeqdesc::e_Source);
4590  if (src && src->GetSource().IsSetOrg()) {
4591  string key = CInfluenzaSet::GetKey(src->GetSource().GetOrg());
4592  if (!NStr::IsBlank(key)) {
4593  // add to set
4594  auto it = flu_map.find(key);
4595  if (it == flu_map.end()) {
4596  CRef<CInfluenzaSet> new_set(new CInfluenzaSet(key));
4597  new_set->AddBioseq(*bi);
4598  flu_map[key] = new_set;
4599  } else {
4600  it->second->AddBioseq(*bi);
4601  }
4602  }
4603  }
4604  ++bi;
4605  }
4606  // now create sets
4607  size_t added = 0;
  // note: this loop variable shadows the `entry` used above
4608  for (auto& entry : flu_map) {
4609  if (entry.second->OkToMakeSet()) {
4610  entry.second->MakeSet();
4611  added++;
4612  }
4613  }
4614 
4615  return added;
4616 }
4617 
4618 
// NOTE(review): the signature line and the declarations of `ftable` and
// `aeh` (listing lines 4619, 4621, 4637) are missing from this listing. The
// call AddIRDMiscFeature(*bi, **db) elsewhere in the file suggests this is
// that function, taking a bioseq handle `bh` and a dbtag `tag`.
//
// Adds a full-length "misc_feature" import feature to the bioseq, carrying a
// copy of `tag` as its dbxref and an empty gene xref. The feature goes into
// the first feature-table annotation found on the bioseq; when there is
// none, a new annotation is attached first.
4620 {
4622 
  // look for an existing feature table on the bioseq
4623  CSeq_annot_CI annot_ci(bh);
4624  for (; annot_ci; ++annot_ci) {
4625  if ((*annot_ci).IsFtable()) {
4626  ftable = *annot_ci;
4627  break;
4628  }
4629  }
4630 
4631  if (!ftable) {
4632  CBioseq_EditHandle beh = bh.GetEditHandle();
4633  CRef<CSeq_annot> new_annot(new CSeq_annot());
4634  ftable = beh.AttachAnnot(*new_annot);
4635  }
4636 
4638 
4639  CRef<CSeq_feat> f(new CSeq_feat());
4640  f->SetData().SetImp().SetKey("misc_feature");
  // interval covering the whole sequence: [0, length - 1]
4641  f->SetLocation().SetInt().SetFrom(0);
4642  f->SetLocation().SetInt().SetTo(bh.GetBioseqLength() - 1);
4643  f->SetLocation().SetInt().SetId().Assign(*(bh.GetSeqId()));
4644  CRef<CDbtag> xref(new CDbtag());
4645  xref->Assign(tag);
4646  f->SetDbxref().push_back(xref);
  // gene xref with no content (named "suppress" -- presumably a
  // gene-suppressing xref; confirm)
4647  CRef<CSeqFeatXref> suppress(new CSeqFeatXref());
4648  suppress->SetData().SetGene();
4649  f->SetXref().push_back(suppress);
4650  aeh.AddFeat(*f);
4651 }
4652 
4653 
// NOTE(review): the signature line is missing from this listing; the body
// uses an entry handle named `entry`.
//
// For every nucleotide bioseq under `entry` and every source descriptor on
// it, removes each Org-ref dbxref whose database name is exactly "IRD" and
// records it instead as a misc_feature via AddIRDMiscFeature. The Db list is
// reset when it becomes empty.
// Returns true if at least one dbxref was moved.
4655 {
4656  bool any = false;
4657  CBioseq_CI bi(entry, CSeq_inst::eMol_na);
4658  while (bi) {
4659  CSeqdesc_CI src(*bi, CSeqdesc::e_Source);
4660  while (src) {
4661  if (src->GetSource().IsSetOrg() && src->GetSource().GetOrg().IsSetDb()) {
  // the descriptor iterator only exposes a const Org-ref; constness is
  // cast away to edit the Db list in place
4662  CRef<COrg_ref> org(const_cast<COrg_ref *>(&(src->GetSource().GetOrg())));
4663  COrg_ref::TDb::iterator db = org->SetDb().begin();
4664  while (db != org->SetDb().end()) {
4665  if ((*db)->IsSetDb() && NStr::Equal((*db)->GetDb(), "IRD")) {
4666  AddIRDMiscFeature(*bi, **db);
4667  db = org->SetDb().erase(db);
4668  any = true;
4669  } else {
4670  ++db;
4671  }
4672  }
4673  if (org->GetDb().size() == 0) {
4674  org->ResetDb();
4675  }
4676  }
4677  ++src;
4678  }
4679  ++bi;
4680  }
4681  return any;
4682 }
4683 
4684 //LCOV_EXCL_START
4685 //not used by asn_cleanup but used by other applications
// Zero-based offset of 'M' from 'A' (12); compared below against the
// Ncbi8aa and Ncbistdaa values of a code break.
// NOTE(review): confirm that these two encodings really number residues as
// letter offsets from 'A'.
4686 const unsigned int methionine_encoded = 'M' - 'A';
4687 
// NOTE(review): the signature line and the three case labels of the switch
// (listing lines 4688, 4695, 4700, 4705) are missing from this listing; the
// body takes a code break `cb`.
//
// Returns true when the amino acid of `cb` denotes methionine: 'M' for the
// Ncbieaa form, methionine_encoded for the Ncbi8aa and Ncbistdaa forms.
// Returns false when no amino acid is set or it uses another representation.
4689 {
4690  if (!cb.IsSetAa()) {
4691  return false;
4692  }
4693  bool rval = false;
4694  switch (cb.GetAa().Which()) {
4696  if (cb.GetAa().GetNcbi8aa() == methionine_encoded) {
4697  rval = true;
4698  }
4699  break;
4701  if (cb.GetAa().GetNcbieaa() == 'M') {
4702  rval = true;
4703  }
4704  break;
4706  if (cb.GetAa().GetNcbistdaa() == methionine_encoded) {
4707  rval = true;
4708  }
4709  break;
4710  default:
4711  break;
4712  }
4713  return rval;
4714 }
4715 //LCOV_EXCL_STOP
4716 
4717 
4718 //LCOV_EXCL_START
4719 //not used by asn_cleanup but used by other applications
// NOTE(review): the signature line, one switch label (listing line 4733) and
// the line that computes `offset` (4743) are missing from this listing; the
// body takes a 1-based position `pos` and a coding region feature `cds`.
//
// Looks for the code break of `cds` that applies to codon number `pos`.
// Each code break's offset (computed on the missing line from the break's
// location) is converted to a codon index with
// ((offset - frame) / 3) + 1, where frame is 0, 1 or 2 according to the
// coding region's frame. Returns the first matching code break, or an empty
// CConstRef when `cds` is not a coding region with a location and code
// breaks, or when nothing matches.
4721 {
4722  if (!cds.IsSetData() || !cds.GetData().IsCdregion() ||
4723  !cds.IsSetLocation() ||
4724  !cds.GetData().GetCdregion().IsSetCode_break()) {
4725  return CConstRef<CCode_break>();
4726  }
4727 
  // number of leading bases to skip before the first full codon
4728  TSeqPos frame = 0;
4729  if (cds.IsSetData() && cds.GetData().IsCdregion() && cds.GetData().GetCdregion().IsSetFrame())
4730  {
4731  switch(cds.GetData().GetCdregion().GetFrame())
4732  {
4734  case CCdregion::eFrame_one : frame = 0; break;
4735  case CCdregion::eFrame_two : frame = 1; break;
4736  case CCdregion::eFrame_three : frame = 2; break;
4737  default : frame = 0; break;
4738  }
4739  }
4740 
4741  for (auto cb : cds.GetData().GetCdregion().GetCode_break()) {
4742  if (cb->IsSetLoc()) {
4744  cb->GetLoc());
  // offset >= frame keeps the unsigned subtraction from wrapping
4745  if (offset >= frame &&
4746  ((offset - frame) / 3 ) + 1 == pos) {
4747  return cb;
4748  }
4749  }
4750  }
4751  return CConstRef<CCode_break>();
4752 }
4753 //LCOV_EXCL_STOP
4754 
4755 //LCOV_EXCL_START
4756 //appears not to be used
4757 void CCleanup::SetCodeBreakLocation(CCode_break& cb, size_t pos, const CSeq_feat& cds)
4758 {
4759  int start = static_cast<int>((pos-1)*3);
4760  //start -= 1;
4761  //start *= 3;
4762  int frame = 0;
4763  if (cds.IsSetData() && cds.GetData().IsCdregion() && cds.GetData().GetCdregion().IsSetFrame())
4764  {
4765  switch(cds.GetData().GetCdregion().GetFrame())
4766  {
4768  case CCdregion::eFrame_one : frame = 0; break;
4769  case CCdregion::eFrame_two : frame = 1; break;
4770  case CCdregion::eFrame_three : frame = 2; break;
4771  default : frame = 0; break;
4772  }
4773  }
4774  int frame_shift = (start - frame) % 3;
4775  if (frame_shift < 0) {
4776  frame_shift += 3;
4777  }
4778  if (frame_shift == 1)
4779  start += 2;
4780  else if (frame_shift == 2)
4781  start += 1;
4782 
4783  int offset = 0;
4784  CRef<CSeq_loc> packed (new CSeq_loc());
4785  for (CSeq_loc_CI loc_iter(cds.GetLocation()); loc_iter; ++loc_iter) {
4786  int len = loc_iter.GetRange().GetLength();
4787  if (offset <= start && offset + len > start) {
4789  tmp->SetId().Assign(loc_iter.GetSeq_id());
4790  if (loc_iter.IsSetStrand() && loc_iter.GetStrand() == eNa_strand_minus) {
4791  tmp->SetStrand(eNa_strand_minus);
4792  tmp->SetTo(loc_iter.GetRange().GetTo() - (start - offset) );
4793  } else {
4794  tmp->SetFrom(loc_iter.GetRange().GetFrom() + start - offset);
4795  }
4796  if (offset <= start + 2 && offset + len > start + 2) {
4797  if (loc_iter.IsSetStrand() && loc_iter.GetStrand() == eNa_strand_minus) {
4798  tmp->SetFrom(loc_iter.GetRange().GetTo() - (start - offset + 2) );
4799  } else {
4800  tmp->SetTo(loc_iter.GetRange().GetFrom() + start - offset + 2);
4801  }
4802  } else {
4803  if (loc_iter.IsSetStrand() && loc_iter.GetStrand() == eNa_strand_minus) {
4804  tmp->SetFrom(loc_iter.GetRange().GetFrom());
4805  } else {
4806  tmp->SetTo(loc_iter.GetRange().GetTo());
4807  }
4808  }
4809  packed->SetPacked_int().Set().push_back(tmp);
4810  } else if (offset > start && offset <= start + 2) {
4811  // add new interval
4813  tmp->SetId().Assign(loc_iter.GetSeq_id());
4814  if (loc_iter.IsSetStrand() && loc_iter.GetStrand() == eNa_strand_minus) {
4815  tmp->SetStrand(eNa_strand_minus);
4816  tmp->SetTo(loc_iter.GetRange().GetTo());
4817  if (offset + len >= start + 2) {
4818  tmp->SetFrom(loc_iter.GetRange().GetTo() - (start - offset + 2) );
4819  } else {
4820  tmp->SetFrom(loc_iter.GetRange().GetFrom());
4821  }
4822  } else {
4823  tmp->SetFrom(loc_iter.GetRange().GetFrom());
4824  if (offset + len >= start + 2) {
4825  tmp->SetTo(loc_iter.GetRange().GetFrom() + start - offset + 2);
4826  } else {
4827  tmp->SetTo(loc_iter.GetRange().GetTo());
4828  }
4829  }
4830 
4831  packed->SetPacked_int().Set().push_back(tmp);
4832  }
4833  offset += len;
4834  }
4835  if (packed->Which() != CSeq_loc::e_Packed_int || packed->GetPacked_int().Get().size() == 0) {
4836  cb.ResetLoc();
4837  }
4838  if (packed->GetPacked_int().Get().size() == 1) {
4839  cb.SetLoc().SetInt().Assign(*(packed->GetPacked_int().Get().front()));
4840  } else {
4841  cb.SetLoc(*packed);
4842  }
4843 }
4844 //LCOV_EXCL_STOP
4845 
4846 
4847 //LCOV_EXCL_START
4848 //not used by asn_cleanup but used by other applications
// Flags coding region `cds` as an "RNA editing" exception: makes sure the
// exception text mentions "RNA editing" and that the exception flag is true.
// Returns true when either of them was changed. Returns false without
// touching `cds` when it is not a located Cdregion, when the (partly missing)
// location test below fails, or when `cbstart` is set and is not methionine.
// NOTE(review): this listing omits original lines 4849 (the function
// signature), 4855 (the rest of the condition opened on line 4854) and 4858
// (the definition of `cbstart`). The cross-reference text further down gives
// the signature as
//   bool FixRNAEditingCodingRegion(CSeq_feat& cds)
// -- confirm against the repository before editing the code itself.
4850 {
4851  if (!cds.IsSetData() || !cds.GetData().IsCdregion()) {
4852  return false;
4853  }
4854  if (!cds.IsSetLocation() ||
4856  return false;
4857  }
4859  if (cbstart && !CCleanup::IsMethionine(*cbstart)) {
4860  // already have a start translation exception AND it is not methionine
4861  return false;
4862  }
4863 
4864  bool any_change = false;
// Missing or blank text is replaced; text that lacks the phrase gets it
// appended after "; " so the existing exception text is kept.
4865  if (!cds.IsSetExcept_text() || NStr::IsBlank(cds.GetExcept_text())) {
4866  cds.SetExcept_text("RNA editing");
4867  any_change = true;
4868  } else if (NStr::Find(cds.GetExcept_text(), "RNA editing") == string::npos) {
4869  cds.SetExcept_text(cds.GetExcept_text() + "; RNA editing");
4870  any_change = true;
4871  }
// Set the exception flag if it is unset or false.
4872  if (!cds.IsSetExcept() || !cds.GetExcept()) {
4873  cds.SetExcept(true);
4874  any_change = true;
4875  }
4876  return any_change;
4877 }
4878 //LCOV_EXCL_STOP
4879 
4880 
4881 //LCOV_EXCL_START
4882 //not used by asn_cleanup but used by other applications
4884 {
4885  bool any_changes = false;
4886 
4887  vector<CRef<COrg_ref> > rq_list;
4888  vector<const CSeqdesc* > src_descs;
4889  vector<CConstRef<CSeq_feat> > src_feats;
4890 
4891  GetSourceDescriptors(*(seh.GetCompleteSeq_entry()), src_descs);
4892  vector<const CSeqdesc* >::iterator desc_it = src_descs.begin();
4893  while (desc_it != src_descs.end()) {
4894  if ((*desc_it)->GetSource().IsSetSubtype()) {
4895  CSeqdesc* desc = const_cast<CSeqdesc*>(*desc_it);
4896  for (auto s : desc->SetSource().SetSubtype()) {
4897  if (s->IsSetSubtype() && s->GetSubtype() == CSubSource::eSubtype_collection_date
4898  && s->IsSetName()) {
4899  bool month_ambiguous = false;
4900  string new_date = CSubSource::FixDateFormat(s->GetName(), month_first, month_ambiguous);
4901  if (!NStr::Equal(new_date, s->GetName())) {
4902  s->SetName(new_date);
4903  any_changes = true;
4904  }
4905  }
4906  }
4907  }
4908  ++desc_it;
4909  }
4910 
4912  while (feat) {
4913  if (feat->GetData().GetBiosrc().IsSetSubtype()) {
4914  CRef<CSeq_feat> new_feat(new CSeq_feat());
4915  new_feat->Assign(*(feat->GetOriginalSeq_feat()));
4916  bool local_change = false;
4917  for (auto s : new_feat->SetData().SetBiosrc().SetSubtype()) {
4918  if (s->IsSetSubtype() && s->GetSubtype() == CSubSource::eSubtype_collection_date
4919  && s->IsSetName()) {
4920  bool month_ambiguous = false;
4921  string new_date = CSubSource::FixDateFormat(s->GetName(), month_first, month_ambiguous);
4922  if (!NStr::Equal(new_date, s->GetName())) {
4923  s->SetName(new_date);
4924  local_change = true;
4925  }
4926  }
4927  }
4928  if (local_change) {
4929  any_changes = true;
4930  CSeq_feat_EditHandle efh(*feat);
4931  efh.Replace(*new_feat);
4932  }
4933  ++feat;
4934  }
4935  }
4936 
4937  return any_changes;
4938 }
4939 //LCOV_EXCL_STOP
4940 
4941 
// Replaces the autodef-options user descriptors in the entry: first removes
// every existing descriptor whose user object reports IsAutodefOptions(),
// then adds one new user descriptor built from `auto_user`.
// NOTE(review): this listing omits original lines 4942 (the function
// signature), 4949 (declaration of `ud`), 4953 (declaration of `se`), 4964
// (definition of `auto_user`), 4967 (declaration of `eh`) and 4970 (a final
// statement). The cross-reference text further down lists
//   void AutodefId(CSeq_entry_Handle seh)
// together with CAutoDef::CreateIDOptions and
// CAutoDef::RegenerateSequenceDefLines, which are presumably what the missing
// lines call -- confirm against the repository before editing the code itself.
4943 {
4944  // remove existing options (TODO)
4945  for (CBioseq_CI b(seh); b; ++b) {
// Rescan this Bioseq from the beginning after each removal (the inner loop
// breaks right after RemoveSeqdesc) until a full pass removes nothing.
4946  bool removed = true;
4947  while (removed) {
4948  removed = false;
4950  while (ud) {
4951  if (ud->GetUser().IsAutodefOptions()) {
4952  CSeq_entry_Handle s = ud.GetSeq_entry_Handle();
4954  se.RemoveSeqdesc(*ud);
4955  removed = true;
4956  break;
4957  }
4958  ++ud;
4959  }
4960  }
4961  }
4962 
4963  // create new options
// The new descriptor holds its own copy of the options (Assign), then is
// added through `eh`.
4965  CRef<CSeqdesc> d(new CSeqdesc());
4966  d->SetUser().Assign(*auto_user);
4968  eh.AddSeqdesc(*d);
4969 
4971 }
4972 
4973 
4974 char CCleanup::ValidAminoAcid(string_view abbrev)
4975 {
4976  return x_ValidAminoAcid(abbrev);
4977 }
4978 
@ eExtreme_Positional
numerical value
Definition: Na_strand.hpp:63
@ eExtreme_Biological
5' and 3'
Definition: Na_strand.hpp:62
User-defined methods of the data storage class.
User-defined methods of the data storage class.
CRef< objects::CSeq_id > GetNewProtId(objects::CBioseq_Handle bsh, int &offset, string &id_label, bool general_only)
bool IsGeneralIdProtPresent(objects::CSeq_entry_Handle tse)
@Auth_list.hpp User-defined methods of the data storage class.
Definition: Auth_list.hpp:57
CSeqdesc & Set(bool skip_lookup=false)
Definition: Seq_descr.cpp:93
static CRef< CUser_object > CreateIDOptions(CSeq_entry_Handle seh)
Definition: autodef.cpp:1442
static bool RegenerateSequenceDefLines(CSeq_entry_Handle se)
Definition: autodef.cpp:1248
static string GetOrganelleByGenome(unsigned int genome)
Definition: BioSource.cpp:216
int GetGenCode(int def=1) const
Definition: BioSource.cpp:73
bool HasSubtype(CSubSource::TSubtype subtype) const
Definition: BioSource.cpp:2040
CBioseq_CI –.
Definition: bioseq_ci.hpp:69
CBioseq_EditHandle –.
CBioseq_Handle –.
CBioseq_set_EditHandle –.
CBioseq_set_Handle –.
CConstRef< CBioseq_set > GetParentSet(void) const
Definition: Bioseq_set.cpp:294
CConstRef< CBioseq_set > GetParentSet(void) const
Definition: Bioseq_set.cpp:312
bool IsAa(void) const
Definition: Bioseq.cpp:350
CCdregion –.
Definition: Cdregion.hpp:66
vector< EChanges > GetAllChanges() const
Definition: cleanup.cpp:295
vector< string_view > GetDescriptions() const
Definition: cleanup.cpp:300
static string_view GetDescription(EChanges e)
Definition: cleanup.cpp:430
static bool RescueSiteRefPubs(CSeq_entry_Handle seh)
Rescue pubs from Site-ref features.
Definition: cleanup.cpp:3457
static bool ConvertDeltaSeqToRaw(CSeq_entry_Handle seh, CSeq_inst::EMol filter=CSeq_inst::eMol_not_set)
Definition: cleanup.cpp:4408
static bool RenormalizeNucProtSets(CSeq_entry_Handle seh)
Convert nuc-prot sets with just one sequence to just the sequence can't be done during the explore ph...
Definition: cleanup.cpp:4063
static bool ShouldStripPubSerial(const CBioseq &bs)
Definition: cleanup.cpp:4016
static char ValidAminoAcid(string_view abbrev)
Definition: cleanup.cpp:4974
static bool RemoveOrphanLocus_tagGeneXrefs(CSeq_feat &f, CBioseq_Handle bsh)
Removes orphaned locus_tag Gene-xrefs.
Definition: cleanup.cpp:1013
static bool FixGeneXrefSkew(CSeq_entry_Handle seh)
Examine all genes and gene xrefs in the Seq-entry.
Definition: cleanup.cpp:3915
static void MoveOneFeatToPubdesc(CSeq_feat_Handle feat, CRef< CSeqdesc > d, CBioseq_Handle b, bool remove_feat=true)
Definition: cleanup.cpp:3353
static bool AddGenBankWrapper(CSeq_entry_Handle seh)
Add GenBank Wrapper Set.
Definition: cleanup.cpp:3065
static bool ConvertPubFeatsToPubDescs(CSeq_entry_Handle seh)
Convert full-length publication features to publication descriptors.
Definition: cleanup.cpp:3388
static void SetProteinName(CProt_ref &prot, const string &protein_name, bool append)
Definition: cleanup.cpp:1345
static void s_SetProductOnFeat(CSeq_feat &feat, const string &protein_name, bool append)
Definition: cleanup.cpp:1400
static bool AddPartialToProteinTitle(CBioseq &bioseq)
Adjusts protein title to reflect partialness.
Definition: cleanup.cpp:2371
static bool RemovePseudoProduct(CSeq_feat &cds, CScope &scope)
Removes protein product from pseudo coding region.
Definition: cleanup.cpp:2537
static bool FixECNumbers(CSeq_entry_Handle entry)
Fix EC numbers.
Definition: cleanup.cpp:1713
static bool AddMissingMolInfo(CBioseq &seq, bool is_product)
Adds missing MolInfo descriptor to sequence.
Definition: cleanup.cpp:1824
static void SetMrnaName(CSeq_feat &mrna, const string &protein_name)
Definition: cleanup.cpp:1358
static CRef< CSeq_entry > AddProtein(const CSeq_feat &cds, CScope &scope)
Definition: cleanup.cpp:2051
static bool OkToPromoteNpPub(const CPubdesc &pd)
Some pubs should not be promoted to nuc-prot set from sequence.
Definition: cleanup.cpp:3343
static void GetPubdescLabels(const CPubdesc &pd, vector< TEntrezId > &pmids, vector< TEntrezId > &muids, vector< int > &serials, vector< string > &published_labels, vector< string > &unpublished_labels)
For Publication Citations Get labels for a pubdesc.
Definition: cleanup.cpp:3140
static bool DecodeXMLMarkChanged(std::string &str)
decodes various tags, including carriage-return-line-feed constructs
Definition: cleanup.cpp:4099
static bool SetFeaturePartial(CSeq_feat &f)
Set feature partial based on feature location.
Definition: cleanup.cpp:1633
static bool AddProteinTitle(CBioseq_Handle bsh)
Creates missing protein title descriptor.
Definition: cleanup.cpp:1865
static size_t MakeSmallGenomeSet(CSeq_entry_Handle entry)
Definition: cleanup.cpp:4583
static bool ExtendToStopIfShortAndNotPartial(CSeq_feat &f, CBioseq_Handle bsh, bool check_for_stop=true)
Extends a coding region up to 50 nt.
Definition: cleanup.cpp:1291
static bool IsGeneXrefUnnecessary(const CSeq_feat &sf, CScope &scope, const CGene_ref &gene_xref)
Calculates whether a Gene-xref is unnecessary (because it refers to the same gene as would be calcula...
Definition: cleanup.cpp:744
static bool RemoveNcbiCleanupObject(CSeq_entry &seq_entry)
Removes NcbiCleanup User Objects in the Seq-entry.
Definition: cleanup.cpp:1898
static bool ClearInternalPartials(CSeq_loc &loc, bool is_first=true, bool is_last=true)
Clear internal partials.
Definition: cleanup.cpp:1553
static bool RepackageProteins(CSeq_entry_Handle seh)
Find proteins that are not packaged in the same nuc-prot set as the coding region for which they are ...
Definition: cleanup.cpp:4385
CCleanup(CScope *scope=nullptr, EScopeOptions scope_handling=eScope_Copy)
Definition: cleanup.cpp:89
static bool ParseCodeBreaks(CSeq_feat &feat, CScope &scope)
Parses all valid transl_except Gb-quals into code-breaks for cdregion, then removes the transl_except...
Definition: cleanup.cpp:4556
static bool SetMolinfoTech(CBioseq_Handle seq, CMolInfo::ETech tech)
Sets MolInfo::tech for a sequence.
Definition: cleanup.cpp:1776
static bool AddLowQualityException(CSeq_entry_Handle entry)
For table2asn -c s Adds an exception of "low-quality sequence region" to coding regions and mRNAs tha...
Definition: cleanup.cpp:2931
static bool RemoveUnseenTitles(CSeq_entry_EditHandle::TSeq seq)
Remove all titles in Seqdescr except the last, because it is the only one that would be displayed in ...
Definition: cleanup.cpp:3027
static bool RemoveDupBioSource(CSeq_descr &descr)
Remove duplicate biosource descriptors.
Definition: cleanup.cpp:3794
EScopeOptions
Definition: cleanup.hpp:83
@ eScope_UseInPlace
Definition: cleanup.hpp:85
static bool ExtendStopPosition(CSeq_feat &f, const CSeq_feat *cdregion, size_t extension=0)
Definition: cleanup.cpp:1072
TChanges ExtendedCleanup(CSeq_entry &se, Uint4 options=0)
Cleanup a Seq-entry.
Definition: cleanup.cpp:259
@ eClean_NoReporting
Definition: cleanup.hpp:73
static bool s_IsProductOnFeat(const CSeq_feat &cds)
Definition: cleanup.cpp:1379
static bool SetGenePartialByLongestContainedFeature(CSeq_feat &gene, CScope &scope)
Set partialness of gene to match longest feature contained in gene.
Definition: cleanup.cpp:1739
static CConstRef< CCode_break > GetCodeBreakForLocation(size_t pos, const CSeq_feat &cds)
utility function for finding the code break for a given amino acid position pos is the position of th...
Definition: cleanup.cpp:4720
TChanges BasicCleanup(CSeq_entry &se, Uint4 options=0)
Definition: cleanup.cpp:132
static bool SetCDSPartialsByFrameAndTranslation(CSeq_feat &cds, CScope &scope)
1.
Definition: cleanup.cpp:1513
static bool RemoveBadECNumbers(CProt_ref::TEc &ec_num_list)
Delete EC numbers.
Definition: cleanup.cpp:1689
void SetScope(CScope *scope)
Definition: cleanup.cpp:108
static bool RepairXrefs(const CSeq_feat &f, const CTSE_Handle &tse)
Repairs non-reciprocal xref pairs for specified feature if xrefs between subtypes are permitted and f...
Definition: cleanup.cpp:905
static bool ExtendToStopCodon(CSeq_feat &f, CBioseq_Handle bsh, size_t limit)
Extends a feature up to limit nt to a stop codon, or to the end of the sequence if limit == 0 (partia...
Definition: cleanup.cpp:1113
static CRef< CSeq_loc > GetProteinLocationFromNucleotideLocation(const CSeq_loc &nuc_loc, CScope &scope)
Definition: cleanup.cpp:4348
static bool ParseCodeBreak(const CSeq_feat &feat, CCdregion &cds, const CTempString &str, CScope &scope, IObjtoolsListener *pMessageListener=nullptr)
Parse string into code break and add to coding region.
Definition: cleanup.cpp:4425
static void SetCodeBreakLocation(CCode_break &cb, size_t pos, const CSeq_feat &cds)
utility function for setting code break location given offset pos is the position of the amino acid w...
Definition: cleanup.cpp:4757
static bool x_AddLowQualityException(CSeq_feat &feat)
Definition: cleanup.cpp:2888
static const string & GetProteinName(const CProt_ref &prot)
Definition: cleanup.cpp:1467
static CRef< CBioSource > BioSrcFromFeat(const CSeq_feat &f)
Get BioSource from feature to use for source descriptor.
Definition: cleanup.cpp:3824
static bool MergeDupBioSources(CSeq_descr &descr)
Definition: cleanup.cpp:3765
static bool SetFrameFromLoc(CCdregion &cdregion, const CSeq_loc &loc, CScope &scope)
Chooses best frame based on location 1.
Definition: cleanup.cpp:1253
static bool SetMolinfoBiomol(CBioseq_Handle seq, CMolInfo::EBiomol biomol)
Sets MolInfo::biomol for a sequence.
Definition: cleanup.cpp:1802
static bool MoveProteinSpecificFeats(CSeq_entry_Handle seh)
Moves protein-specific features from nucleotide sequences in the Seq-entry to the appropriate protein...
Definition: cleanup.cpp:724
static bool TaxonomyLookup(CSeq_entry_Handle seh)
Looks up Org-refs in the Seq-entry.
Definition: cleanup.cpp:1972
static bool PubAlreadyInSet(const CPubdesc &pd, const CSeq_descr &descr)
Definition: cleanup.cpp:3319
static bool NormalizeDescriptorOrder(CSeq_descr &descr)
Normalize Descriptor Order on a specific Seq-entry.
Definition: cleanup.cpp:3000
static bool ConvertSrcFeatsToSrcDescs(CSeq_entry_Handle seh)
Convert full-length source features to source descriptors.
Definition: cleanup.cpp:3857
static bool SetBestFrame(CSeq_feat &cds, CScope &scope)
Translates coding region and selects best frame (without stops, or longest)
Definition: cleanup.cpp:1194
static bool x_MergeDupOrgNames(COrgName &on1, const COrgName &add)
Definition: cleanup.cpp:3670
static bool FindMatchingLocusGene(CSeq_feat &f, const CGene_ref &gene_xref, CBioseq_Handle bsh)
Detects gene features with matching locus.
Definition: cleanup.cpp:942
static bool MoveFeatToProtein(CSeq_feat_Handle fh)
Moves one feature from nucleotide bioseq to the appropriate protein sequence.
Definition: cleanup.cpp:589
static bool RemoveOrphanLocusGeneXrefs(CSeq_feat &f, CBioseq_Handle bsh)
Removes orphaned locus Gene-xrefs.
Definition: cleanup.cpp:965
static void AddNcbiCleanupObject(int ncbi_cleanup_version, CSeq_descr &descr)
Adds NcbiCleanup User Object to Seq-descr.
Definition: cleanup.cpp:1929
CRef< CScope > m_Scope
Definition: cleanup.hpp:613
static bool AreBioSourcesMergeable(const CBioSource &src1, const CBioSource &src2)
Definition: cleanup.cpp:3536
static bool ExpandGeneToIncludeChildren(CSeq_feat &gene, CTSE_Handle &tse)
Expands gene to include features it cross-references.
Definition: cleanup.cpp:2571
static bool WGSCleanup(CSeq_entry_Handle entry, bool instantiate_missing_proteins=true, Uint4 options=0, bool run_extended_cleanup=true)
Performs WGS specific cleanup.
Definition: cleanup.cpp:2653
static vector< CConstRef< CPub > > GetCitationList(CBioseq_Handle bsh)
Get list of pubs that can be used for citations for Seq-feat on a Bioseq-handle.
Definition: cleanup.cpp:3207
static bool LocationMayBeExtendedToMatch(const CSeq_loc &orig, const CSeq_loc &improved)
Checks whether it is possible to extend the original location up to improved one.
Definition: cleanup.cpp:1333
static bool UpdateECNumbers(CProt_ref::TEc &ec_num_list)
Update EC numbers.
Definition: cleanup.cpp:1663
static bool FixRNAEditingCodingRegion(CSeq_feat &cds)
From GB-7563 An action has been requested that will do the following: 1.
Definition: cleanup.cpp:4849
static bool x_HasShortIntron(const CSeq_loc &loc, size_t min_len=11)
Definition: cleanup.cpp:2841
static bool SetGeneticCodes(CBioseq_Handle bsh)
Sets genetic codes for coding regions on Bioseq-Handle.
Definition: cleanup.cpp:2122
static bool RemoveUnnecessaryGeneXrefs(CSeq_feat &f, CScope &scope)
Removes unnecessary Gene-xrefs.
Definition: cleanup.cpp:779
static bool CleanupCollectionDates(CSeq_entry_Handle seh, bool month_first)
Definition: cleanup.cpp:4883
~CCleanup()
Definition: cleanup.cpp:103
static bool MakeIRDFeatsFromSourceXrefs(CSeq_entry_Handle entry)
From SQD-4329 For each sequence with a source that has an IRD db_xref, create a misc_feature across t...
Definition: cleanup.cpp:4654
static bool IsMinPub(const CPubdesc &pd, bool is_refseq_prot)
Is this a "minimal" pub? (If yes, do not rescue from a Seq-feat.cit)
Definition: cleanup.cpp:3426
static bool FindMatchingLocus_tagGene(CSeq_feat &f, const CGene_ref &gene_xref, CBioseq_Handle bsh)
Detects gene features with matching locus_tag.
Definition: cleanup.cpp:990
static bool x_MergeDupOrgRefs(COrg_ref &org1, const COrg_ref &add)
Definition: cleanup.cpp:3726
static void AutodefId(CSeq_entry_Handle seh)
Definition: