NCBI C++ ToolKit
cleanup.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: cleanup.cpp 100584 2023-08-14 12:35:51Z foleyjp $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Robert Smith
27  *
28  * File Description:
29  * Basic Cleanup of CSeq_entries.
30  *
31  */
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbistd.hpp>
34 #include <serial/serialbase.hpp>
35 #include <objects/seq/Bioseq.hpp>
37 // included for GetPubdescLabels and GetCitationList
38 #include <objects/pub/Pub.hpp>
40 #include <objects/seq/Pubdesc.hpp>
46 
57 
59 #include <objmgr/util/sequence.hpp>
60 #include <objmgr/util/feature.hpp>
61 #include <objmgr/util/autodef.hpp>
62 #include <objmgr/seq_annot_ci.hpp>
63 #include <objmgr/seqdesc_ci.hpp>
64 #include <objmgr/seq_vector.hpp>
65 #include <objmgr/seq_vector_ci.hpp>
68 #include "cleanup_utils.hpp"
70 
71 #include <util/strsearch.hpp>
72 
73 #include "newcleanupp.hpp"
74 
76 
77 #include "influenza_set.hpp"
78 
81 
84 };
85 
86 // *********************** CCleanup implementation **********************
87 
88 
89 CCleanup::CCleanup(CScope* scope, EScopeOptions scope_handling)
90 {
91  if (scope && scope_handling == eScope_UseInPlace) {
92  m_Scope = scope;
93  }
94  else {
96  if (scope) {
97  m_Scope->AddScope(*scope);
98  }
99  }
100 }
101 
102 
104 {
105 }
106 
107 
109 {
111  if (scope) {
112  m_Scope->AddScope(*scope);
113  }
114 }
115 
116 
117 static
119 {
120  CRef<CCleanupChange> changes;
121  if (! (options & CCleanup::eClean_NoReporting)) {
122  changes.Reset(new CCleanupChange);
123  }
124  return changes;
125 }
126 
// Shared preamble for cleanup entry points that work through this object's scope.
// Expands to three statements that:
//   - create the local `changes` via makeCleanupChange(options),
//   - construct the local `clean_i` (a CNewCleanup_imp) from `changes` and `options`,
//   - point `clean_i` at *m_Scope.
// The expansion site must therefore have `options` and `m_Scope` in scope, and
// must not already declare locals named `changes` or `clean_i`.
127 #define CLEANUP_SETUP \
128  auto changes = makeCleanupChange(options); \
129  CNewCleanup_imp clean_i(changes, options); \
130  clean_i.SetScope(*m_Scope);
131 
133 {
135  clean_i.BasicCleanupSeqEntry(se);
136  return changes;
137 }
138 
139 
141 {
143  clean_i.BasicCleanupSeqSubmit(ss);
144  return changes;
145 }
146 
147 
149 {
151  clean_i.BasicCleanupSubmitblock(block);
152  return changes;
153 }
154 
155 
157 {
159  clean_i.BasicCleanupBioseqSet(bss);
160  return changes;
161 }
162 
163 
165 {
167  clean_i.BasicCleanupSeqAnnot(sa);
168  return changes;
169 }
170 
171 
173 {
175  clean_i.BasicCleanupSeqFeat(sf);
176  return changes;
177 }
178 
179 
181 {
183  clean_i.BasicCleanupBioSource(src);
184  return changes;
185 }
186 
187 
189 {
190  auto changes = makeCleanupChange(options);
191  CNewCleanup_imp clean_i(changes, options);
192  clean_i.SetScope(seh.GetScope());
193  clean_i.BasicCleanupSeqEntryHandle(seh);
194  return changes;
195 }
196 
197 
199 {
200  auto changes = makeCleanupChange(options);
201  CNewCleanup_imp clean_i(changes, options);
202  clean_i.SetScope(bsh.GetScope());
203  clean_i.BasicCleanupBioseqHandle(bsh);
204  return changes;
205 }
206 
207 
209 {
210  auto changes = makeCleanupChange(options);
211  CNewCleanup_imp clean_i(changes, options);
212  clean_i.SetScope(bssh.GetScope());
213  clean_i.BasicCleanupBioseqSetHandle(bssh);
214  return changes;
215 }
216 
217 
219 {
220  auto changes = makeCleanupChange(options);
221  CNewCleanup_imp clean_i(changes, options);
222  clean_i.SetScope(sah.GetScope());
223  clean_i.BasicCleanupSeqAnnotHandle(sah);
224  return changes;
225 }
226 
227 
229 {
230  auto changes = makeCleanupChange(options);
231  CNewCleanup_imp clean_i(changes, options);
232  clean_i.SetScope(sfh.GetScope());
233  clean_i.BasicCleanupSeqFeatHandle(sfh);
234  return changes;
235 }
236 
237 
239 {
241  clean_i.BasicCleanup(desc);
242  return changes;
243 
244 }
245 
246 
248 {
250 
251  for (auto& it : desc.Set()) {
252  clean_i.BasicCleanup(*it);
253  }
254  return changes;
255 }
256 
257 
258 // *********************** Extended Cleanup implementation ********************
260 {
262  clean_i.ExtendedCleanupSeqEntry(se);
263 
264  return changes;
265 }
266 
267 
269 {
271  clean_i.ExtendedCleanupSeqSubmit(ss);
272  return changes;
273 }
274 
275 
277 {
279  clean_i.ExtendedCleanupSeqAnnot(sa); // (m_Scope->GetSeq_annotHandle(sa));
280  return changes;
281 }
282 
284 {
285  auto changes = makeCleanupChange(options);
286  CNewCleanup_imp clean_i(changes, options);
287  clean_i.ExtendedCleanupSeqEntryHandle(seh); // (m_Scope->GetSeq_annotHandle(sa));
288  return changes;
289 }
290 
291 
292 // *********************** CCleanupChange implementation **********************
293 
294 
// Returns, by value, the change codes currently held in m_Changes.
// The caller receives its own vector; modifying it does not affect this object.
295 vector<CCleanupChangeCore::EChanges> CCleanupChangeCore::GetAllChanges() const
296 {
297  return m_Changes;
298 }
299 
300 
302 {
303  vector<string> result;
304  result.reserve(m_Changes.size());
305  for (auto it: m_Changes) {
306  result.push_back( string( GetDescription(it) ) );
307  }
308  return result;
309 }
310 
311 vector<string_view> CCleanupChangeCore::GetDescriptions() const
312 {
313  vector<string_view> result;
314  result.reserve(m_Changes.size());
315  for (auto it: m_Changes) {
316  result.push_back( GetDescription(it) );
317  }
318  return result;
319 }
320 
321 // Human-readable descriptions of the cleanup change codes, one entry per value of
322 // CCleanupChangeCore::EChanges, in enum order. The enum and this table must be edited together.
// The array is sized by CCleanupChangeCore::eNumberofChangeTypes.
// Entry 0 is the "Invalid Change Code" text.
// The trailing //NN markers give the array index of the entry they follow.
323 static constexpr std::array<string_view, CCleanupChangeCore::eNumberofChangeTypes> sm_ChangeDesc = {
324  "Invalid Change Code",
325  // set when strings are changed.
326  "Trim Spaces",
327  "Clean Double Quotes",
328  "Append To String",
329  // set when lists are sorted or uniqued.
330  "Clean Qualifiers List",
331  "Clean Dbxrefs List",
332  "Clean CitonFeat List",
333  "Clean Keywords List",
334  "Clean Subsource List",
335  "Clean Orgmod List",
336  // Set when fields are moved or have content changes
337  "Repair BioseqMol", //10
338  "Change Feature Key",
339  "Normalize Authors",
340  "Change Publication",
341  "Change Qualifiers",
342  "Change Dbxrefs",
343  "Change Keywords",
344  "Change Subsource",
345  "Change Orgmod",
346  "Change Exception",
347  "Change Comment", //20
348  // Set when fields are rescued
349  "Change tRna",
350  "Change rRna",
351  "Change ITS",
352  "Change Anticodon",
353  "Change Code Break",
354  "Change Genetic Code",
355  "Copy GeneXref",
356  "Copy ProtXref",
357  // set when locations are repaired
358  "Change Seqloc",
359  "Change Strand", //30
360  "Change WholeLocation",
361  // set when MolInfo descriptors are affected
362  "Change MolInfo Descriptor",
363  // set when prot-xref is removed
364  "Remove ProtXref",
365  // set when gene-xref is removed
366  "Remove GeneXref",
367  // set when protein feature is added
368  "Add Protein Feature",
369  // set when feature is removed
370  "Remove Feature",
371  // set when feature is moved
372  "Move Feature",
373  // set when qualifier is removed
374  "Remove Qualifier",
375  // set when Gene Xref is created
376  "Add GeneXref",
377  // set when descriptor is removed
378  "Remove Descriptor", //40
379  "Remove Keyword",
380  "Add Descriptor",
381  "Move Descriptor",
382  "Convert Feature to Descriptor",
383  "Collapse Set",
384  "Change Feature Location",
385  "Remove Annotation",
386  "Convert Feature",
387  "Remove Comment",
388  "Add BioSource OrgMod", //50
389  "Add BioSource SubSource",
390  "Change BioSource Genome",
391  "Change BioSource Origin",
392  "Change BioSource Other",
393  "Change SeqId",
394  "Remove Empty Publication",
395  "Add Qualifier",
396  "Cleanup Date",
397  "Change BioseqInst",
398  "Remove SeqID", // 60
399  "Add ProtXref",
400  "Change Partial",
401  "Change Prot Names",
402  "Change Prot Activities",
403  "Change Site",
404  "Change PCR Primers",
405  "Change RNA-ref",
406  "Move To Prot Xref",
407  "Compress Spaces",
408  "Strip serial", // 70
409  "Remove Orgmod",
410  "Remove SubSource",
411  "Create Gene Nomenclature",
412  "Clean Seq-feat xref",
413  "Clean User-Object Or -Field",
414  "Letter Case Change",
415  "Change Bioseq-set Class",
416  "Unique Without Sort",
417  "Add RNA-ref",
418  "Change Gene-ref", // 80
419  "Clean Dbtag",
420  "Change Biomol",
421  "Change Cdregion",
422  "Clean EC Number",
423  "Remove Exception",
424  "Add NcbiCleanupObject",
425  "Clean Delta-ext",
426  "Trim Flanking Quotes",
427  "Clean Bioseq Title",
428  "Decode XML", // 90
429  "Remove Dup BioSource",
430  "Clean Org-ref",
431  "Trim Internal Semicolons",
432  "Add SeqFeatXref",
433  "Convert Unstructured Org-ref Modifier",
434  "Change taxname",
435  "Move GO term to GeneOntology object",
436 
437  // set when any other change is made.
438  "Change Other",
439 };
440 
442 {
443  if (e <= eNoChange || e >= eNumberofChangeTypes) {
444  return sm_ChangeDesc[eNoChange]; // this is "Invalid Change Code"
445  }
446  return sm_ChangeDesc[e];
447 }
448 
450 {
451  if (NStr::Equal(key, "sig_peptide")) {
453  } else if (NStr::Equal(key, "mat_peptide")) {
455  } else if (NStr::Equal(key, "transit_peptide")) {
457  } else if (NStr::Equal(key, "preprotein") || NStr::Equal(key, "proprotein")) {
459  } else if (NStr::Equal(key, "propeptide")) {
461  } else {
463  }
464 }
465 
467 {
468  switch (processed) {
470  return "mat_peptide";
471  break;
473  return "preprotein";
474  break;
476  return "sig_peptide";
477  break;
479  return "transit_peptide";
480  break;
482  return "propeptide";
483  break;
485  return kEmptyStr;
486  break;
487  }
488  return kEmptyStr;
489 }
490 
491 
493 {
494  if (fh.GetData().IsProt() && fh.GetData().GetProt().IsSetProcessed()) {
496  if (!NStr::IsBlank(key)) {
497  CRef<CSeq_feat> new_feat(new CSeq_feat());
498  new_feat->Assign(*(fh.GetSeq_feat()));
499  if (fh.GetData().GetProt().IsSetName() && !fh.GetData().GetProt().GetName().empty()) {
500  CRef<CGb_qual> q(new CGb_qual());
501  q->SetQual("product");
502  q->SetVal(fh.GetData().GetProt().GetName().front());
503  new_feat->SetQual().push_back(q);
504  }
505  new_feat->SetData().SetImp().SetKey(key);
506  CSeq_feat_EditHandle efh(fh);
507  efh.Replace(*new_feat);
508  return true;
509  }
510  }
511  return false;
512 }
513 
514 
516 {
517  if (!fh.IsSetData()) {
518  return false;
519  } else if (fh.GetData().IsProt() &&
520  fh.GetData().GetProt().IsSetProcessed() &&
522  return true;
523  } else if (fh.GetData().IsImp() &&
524  fh.GetData().GetImp().IsSetKey() &&
526  return true;
527  } else {
528  return false;
529  }
530 }
531 
532 
534 {
535  if (!feat.IsSetQual() ||
536  !feat.IsSetData() ||
537  !feat.GetData().IsProt() ||
538  feat.GetData().GetProt().IsSetName()) {
539  return;
540  }
541  CSeq_feat::TQual::iterator it = feat.SetQual().begin();
542  while (it != feat.SetQual().end()) {
543  if ((*it)->IsSetQual() &&
544  NStr::Equal((*it)->GetQual(), "product")) {
545  if ((*it)->IsSetVal() && !NStr::IsBlank((*it)->GetVal())) {
546  feat.SetData().SetProt().SetName().push_back((*it)->GetVal());
547  }
548  it = feat.SetQual().erase(it);
549  } else {
550  ++it;
551  }
552  }
553 
554  if (feat.SetQual().empty()) {
555  feat.ResetQual();
556  }
557 }
558 
559 
561 {
562  const bool feat_by_product = true;
563  SAnnotSelector sel(CSeqFeatData::e_Cdregion, feat_by_product);
564  CFeat_CI fi(scope, product, sel);
565  if (fi) {
566  return ConstRef(&(fi->GetOriginalFeature()));
567  }
568  return CConstRef<CSeq_feat>();
569 };
570 
572 {
573  sequence::TFeatScores cdsScores;
575  feat_loc,
579  cdsScores,
580  scope);
581 
582  if (cdsScores.empty()) {
583  return CConstRef<CSeq_feat>();
584  }
585 
586  if (!feat_loc.IsPartialStart(eExtreme_Biological)) {
587  for (auto cdsScore : cdsScores) {
588  if (feature::IsLocationInFrame(scope.GetSeq_featHandle(*cdsScore.second), feat_loc)
590  return cdsScore.second;
591  }
592  }
593  }
594 
595  return cdsScores.front().second;
596 }
597 
598 
599 
601 {
603  if (fh.GetData().IsImp()) {
604  if (!fh.GetData().GetImp().IsSetKey()) {
605  return false;
606  }
607  processed = s_ProcessedFromKey(fh.GetData().GetImp().GetKey());
608  if (processed == CProt_ref::eProcessed_not_set || processed == CProt_ref::eProcessed_preprotein) {
609  return false;
610  }
611  } else if (s_IsPreprotein(fh)) {
612  return ConvertProteinToImp(fh);
613  }
614 
615  CBioseq_Handle parent_bsh = fh.GetScope().GetBioseqHandle(fh.GetLocation());
616 
617  if (!parent_bsh) {
618  // feature is mispackaged
619  return false;
620  }
621  if (parent_bsh.IsAa()) {
622  // feature is already on protein sequence
623  return false;
624  }
625 
627  bool matched_by_product = false;
628 
629  if (fh.IsSetProduct() &&
630  fh.GetData().IsProt() &&
631  fh.GetData().GetProt().IsSetProcessed() &&
633  cds = s_GetCdsByProduct(fh.GetScope(), fh.GetProduct());
634  if (cds) {
635  matched_by_product = true;
636  }
637  }
638  if (!matched_by_product) {
639  cds = s_GetCdsByLocation(fh.GetScope(), fh.GetLocation());
640  }
641  if (!cds || !cds->IsSetProduct()) {
642  // there is no overlapping coding region feature, so there is no appropriate
643  // protein sequence to move to
644  return ConvertProteinToImp(fh);
645  }
646 
647  bool require_frame = false;
648  if (!require_frame) {
649  ITERATE(CBioseq::TId, id_it, parent_bsh.GetBioseqCore()->GetId()) {
650  if ((*id_it)->IsEmbl() || (*id_it)->IsDdbj()) {
651  require_frame = true;
652  break;
653  }
654  }
655  }
656 
657  CRef<CSeq_loc> prot_loc = GetProteinLocationFromNucleotideLocation(fh.GetLocation(), *cds, fh.GetScope(), require_frame);
658 
659  if (!prot_loc) {
660  return false;
661  }
662 
663  CConstRef<CSeq_feat> orig_feat = fh.GetSeq_feat();
664  CRef<CSeq_feat> new_feat(new CSeq_feat());
665  new_feat->Assign(*orig_feat);
666  if (new_feat->GetData().Which() == CSeqFeatData::e_Imp) {
667  new_feat->SetData().SetProt().SetProcessed(processed);
668  // if possible, rescue product qual
669  RescueProtProductQual(*new_feat);
670  if (processed == CProt_ref::eProcessed_mature &&
671  !new_feat->GetData().GetProt().IsSetName()) {
672  if (orig_feat->IsSetComment() && !NStr::IsBlank(orig_feat->GetComment())) {
673  new_feat->SetData().SetProt().SetName().push_back(orig_feat->GetComment());
674  new_feat->ResetComment();
675  } else {
676  new_feat->SetData().SetProt().SetName().push_back("unnamed");
677  }
678  }
679  }
680 
681  // change location to protein
682  new_feat->ResetLocation();
683  new_feat->SetLocation(*prot_loc);
684  SetFeaturePartial(*new_feat);
685  if (matched_by_product) {
686  new_feat->ResetProduct();
687  }
688 
689  CSeq_feat_EditHandle edh(fh);
690  edh.Replace(*new_feat);
691  auto changes= makeCleanupChange(0);
692  CNewCleanup_imp clean_i(changes, 0);
693  clean_i.SetScope(fh.GetScope());
694  clean_i.BasicCleanupSeqFeat(*new_feat);
695 
696  CSeq_annot_Handle ah = fh.GetAnnot();
697 
698  CBioseq_Handle target_bsh = fh.GetScope().GetBioseqHandle(new_feat->GetLocation());
699  if (!target_bsh) {
700  return false;
701  }
702 
703  CBioseq_EditHandle eh = target_bsh.GetEditHandle();
704 
705  // Find a feature table on the protein sequence to add the feature to.
707  if (target_bsh.GetCompleteBioseq()->IsSetAnnot()) {
708  ITERATE(CBioseq::TAnnot, annot_it, target_bsh.GetCompleteBioseq()->GetAnnot()) {
709  if ((*annot_it)->IsFtable()) {
710  ftable = fh.GetScope().GetSeq_annotHandle(**annot_it);
711  }
712  }
713  }
714 
715  // If there is no feature table present, make one
716  if (!ftable) {
717  CRef<CSeq_annot> new_annot(new CSeq_annot());
718  ftable = eh.AttachAnnot(*new_annot);
719  }
720 
721  // add feature to the protein bioseq
723  aeh.TakeFeat(edh);
724 
725  // remove old annot if now empty
728  orig.Remove();
729  }
730 
731  return true;
732 }
733 
734 
736 {
737  bool any_change = false;
739  while (bi) {
743  for (CFeat_CI prot_it(*bi, sel); prot_it; ++prot_it) {
744  any_change |= MoveFeatToProtein(*prot_it);
745  }
746  for (CFeat_CI imp_it(*bi, CSeqFeatData::e_Imp); imp_it; ++imp_it) {
747  any_change |= MoveFeatToProtein(*imp_it);
748  }
749  ++bi;
750  }
751  return any_change;
752 }
753 
754 
755 bool CCleanup::IsGeneXrefUnnecessary(const CSeq_feat& sf, CScope& scope, const CGene_ref& gene_xref)
756 {
757  if (gene_xref.IsSuppressed()) {
758  return false;
759  }
760 
762  if (!gene || !gene->IsSetData() || !gene->GetData().IsGene()) {
763  return false;
764  }
765 
766  if (!gene->GetData().GetGene().RefersToSameGene(gene_xref)) {
767  return false;
768  }
769 
770  // see if other gene might also match
771  sequence::TFeatScores scores;
773  sequence::eOverlap_Contained, scores, scope);
774  if (scores.size() == 1) {
775  return true;
776  } else if (scores.size() == 0) {
777  return false;
778  }
779 
780  ITERATE(sequence::TFeatScores, g, scores) {
781  if (g->second.GetPointer() != gene.GetPointer() &&
782  sequence::Compare(g->second->GetLocation(), gene->GetLocation(), &scope, sequence::fCompareOverlapping) == sequence::eSame) {
783  return false;
784  }
785  }
786  return true;
787 }
788 
789 
791 {
792  if (!f.IsSetXref()) {
793  return false;
794  }
795  bool any_removed = false;
796  CSeq_feat::TXref::iterator xit = f.SetXref().begin();
797  while (xit != f.SetXref().end()) {
798  if ((*xit)->IsSetData() && (*xit)->GetData().IsGene() &&
799  IsGeneXrefUnnecessary(f, scope, (*xit)->GetData().GetGene())) {
800  xit = f.SetXref().erase(xit);
801  any_removed = true;
802  } else {
803  ++xit;
804  }
805  }
806  if (any_removed) {
807  if (f.IsSetXref() && f.GetXref().empty()) {
808  f.ResetXref();
809  }
810  }
811  return any_removed;
812 }
813 
814 
816 {
817  bool any_change = false;
818  CScope& scope = seh.GetScope();
819 
820  for (CFeat_CI fi(seh); fi; ++fi) {
821  if (fi->IsSetXref()) {
822  CRef<CSeq_feat> new_feat(new CSeq_feat());
823  new_feat->Assign(*(fi->GetOriginalSeq_feat()));
824  bool any_removed = RemoveUnnecessaryGeneXrefs(*new_feat, scope);
825  if (any_removed) {
826  CSeq_feat_EditHandle edh(*fi);
827  edh.Replace(*new_feat);
828  any_change = true;
829  }
830  }
831  }
832 
833  return any_change;
834 }
835 
836 
837 //LCOV_EXCL_START
838 //not used by asn_cleanup but used by other applications
840 {
841  if (!f.IsSetXref()) {
842  return false;
843  }
844  bool any_removed = false;
845  CSeq_feat::TXref::iterator xit = f.SetXref().begin();
846  while (xit != f.SetXref().end()) {
847  if ((*xit)->IsSetData() && (*xit)->GetData().IsGene() &&
848  !(*xit)->GetData().GetGene().IsSuppressed()) {
849  xit = f.SetXref().erase(xit);
850  any_removed = true;
851  } else {
852  ++xit;
853  }
854  }
855  if (any_removed) {
856  if (f.IsSetXref() && f.GetXref().empty()) {
857  f.ResetXref();
858  }
859  }
860  return any_removed;
861 }
862 //LCOV_EXCL_STOP
863 
864 
866 {
867  if (!src.IsSetId() || !src.GetId().IsLocal()) {
868  // can't create xref if no ID
869  return false;
870  }
872  // only create reciprocal xrefs if permitted
873  return false;
874  }
875  // don't create xref if already have xref or if dst not gene and already has
876  // xref to feature of same type as src
877  bool has_xref = false;
878  if (dst.IsSetXref()) {
879  ITERATE(CSeq_feat::TXref, xit, dst.GetXref()) {
880  if ((*xit)->IsSetId() && (*xit)->GetId().IsLocal()) {
881  if ((*xit)->GetId().Equals(src.GetId())) {
882  // already have xref
883  has_xref = true;
884  break;
885  } else if (!dst.GetData().IsGene()) {
886  const CTSE_Handle::TFeatureId& feat_id = (*xit)->GetId().GetLocal();
888  ITERATE(CTSE_Handle::TSeq_feat_Handles, fit, far_feats) {
889  if (fit->GetData().GetSubtype() == src.GetData().GetSubtype()) {
890  has_xref = true;
891  break;
892  }
893  }
894  if (has_xref) {
895  break;
896  }
897  }
898  }
899  }
900  }
901  bool rval = false;
902  if (!has_xref) {
903  // to put into "editing mode"
904  dst.GetAnnot().GetEditHandle();
905  CSeq_feat_EditHandle eh(dst);
906  CRef<CSeq_feat> cpy(new CSeq_feat());
907  cpy->Assign(*(dst.GetSeq_feat()));
908  cpy->AddSeqFeatXref(src.GetId());
909  eh.Replace(*cpy);
910  rval = true;
911  }
912  return rval;
913 }
914 
915 
917 {
918  bool rval = false;
919 
920  if (!f.IsSetId() || !f.IsSetXref()) {
921  return rval;
922  }
923 
924  ITERATE(CSeq_feat::TXref, xit, f.GetXref()) {
925  if ((*xit)->IsSetId() && (*xit)->GetId().IsLocal()) {
926  const CTSE_Handle::TFeatureId& x_id = (*xit)->GetId().GetLocal();
928  if (far_feats.size() == 1) {
929  rval |= RepairXrefs(f, far_feats[0], tse);
930  }
931  }
932  }
933  return rval;
934 }
935 
936 
938 {
939  bool rval = false;
940  const CTSE_Handle& tse = seh.GetTSE_Handle();
941 
942  CFeat_CI fi(seh);
943  while (fi) {
944  rval |= RepairXrefs(*(fi->GetSeq_feat()), tse);
945  ++fi;
946  }
947  return rval;
948 }
949 
950 
951 //LCOV_EXCL_START
952 //not used by asn_cleanup but used by other applications
954 {
955  bool match = false;
956  string locus1;
957  if (gene_xref.IsSetLocus())
958  locus1 = gene_xref.GetLocus();
959  for (CFeat_CI feat_ci(bsh, SAnnotSelector(CSeqFeatData::eSubtype_gene)); feat_ci; ++feat_ci)
960  {
961  string locus2;
962  if ( !f.Equals(*feat_ci->GetSeq_feat()) && feat_ci->GetSeq_feat()->IsSetData() && feat_ci->GetSeq_feat()->GetData().IsGene()
963  && feat_ci->GetSeq_feat()->GetData().GetGene().IsSetLocus())
964  {
965  locus2 = feat_ci->GetSeq_feat()->GetData().GetGene().GetLocus();
966  }
967  if (!locus1.empty() && !locus2.empty() && locus1 == locus2)
968  {
969  match = true;
970  break;
971  }
972  }
973  return match;
974 }
975 
977 {
978  if (!f.IsSetXref()) {
979  return false;
980  }
981  bool any_removed = false;
982  CSeq_feat::TXref::iterator xit = f.SetXref().begin();
983  while (xit != f.SetXref().end()) {
984  if ((*xit)->IsSetData() && (*xit)->GetData().IsGene() &&
985  !(*xit)->GetData().GetGene().IsSuppressed() && !FindMatchingLocusGene(f, (*xit)->GetData().GetGene(), bsh)) {
986  xit = f.SetXref().erase(xit);
987  any_removed = true;
988  } else {
989  ++xit;
990  }
991  }
992  if (any_removed) {
993  if (f.IsSetXref() && f.GetXref().empty()) {
994  f.ResetXref();
995  }
996  }
997  return any_removed;
998 }
999 
1000 
1002 {
1003  bool match = false;
1004  string locus_tag1;
1005  if (gene_xref.IsSetLocus_tag())
1006  locus_tag1 = gene_xref.GetLocus_tag();
1007  for (CFeat_CI feat_ci(bsh, SAnnotSelector(CSeqFeatData::eSubtype_gene)); feat_ci; ++feat_ci)
1008  {
1009  string locus_tag2;
1010  if ( !f.Equals(*feat_ci->GetSeq_feat()) && feat_ci->GetSeq_feat()->IsSetData() && feat_ci->GetSeq_feat()->GetData().IsGene()
1011  && feat_ci->GetSeq_feat()->GetData().GetGene().IsSetLocus_tag())
1012  {
1013  locus_tag2 = feat_ci->GetSeq_feat()->GetData().GetGene().GetLocus_tag();
1014  }
1015  if (!locus_tag1.empty() && !locus_tag2.empty() && locus_tag1 == locus_tag2)
1016  {
1017  match = true;
1018  break;
1019  }
1020  }
1021  return match;
1022 }
1023 
1025 {
1026  if (!f.IsSetXref()) {
1027  return false;
1028  }
1029  bool any_removed = false;
1030  CSeq_feat::TXref::iterator xit = f.SetXref().begin();
1031  while (xit != f.SetXref().end()) {
1032  if ((*xit)->IsSetData() && (*xit)->GetData().IsGene() &&
1033  !(*xit)->GetData().GetGene().IsSuppressed() && !FindMatchingLocus_tagGene(f, (*xit)->GetData().GetGene(), bsh)) {
1034  xit = f.SetXref().erase(xit);
1035  any_removed = true;
1036  } else {
1037  ++xit;
1038  }
1039  }
1040  if (any_removed) {
1041  if (f.IsSetXref() && f.GetXref().empty()) {
1042  f.ResetXref();
1043  }
1044  }
1045  return any_removed;
1046 }
1047 
1048 
1049 bool CCleanup::SeqLocExtend(CSeq_loc& loc, size_t pos_, CScope& scope)
1050 {
1051  TSeqPos pos = static_cast<TSeqPos>(pos_);
1052  TSeqPos loc_start = loc.GetStart(eExtreme_Positional);
1053  TSeqPos loc_stop = loc.GetStop(eExtreme_Positional);
1054  bool partial_start = loc.IsPartialStart(eExtreme_Positional);
1055  bool partial_stop = loc.IsPartialStop(eExtreme_Positional);
1056  ENa_strand strand = loc.GetStrand();
1057  CRef<CSeq_loc> new_loc;
1058  bool changed = false;
1059 
1060  if (pos < loc_start) {
1061  CRef<CSeq_id> id(new CSeq_id());
1062  id->Assign(*(loc.GetId()));
1063  CRef<CSeq_loc> add(new CSeq_loc(*id, pos, loc_start - 1, strand));
1064  add->SetPartialStart(partial_start, eExtreme_Positional);
1066  changed = true;
1067  } else if (pos > loc_stop) {
1068  CRef<CSeq_id> id(new CSeq_id());
1069  id->Assign(*(loc.GetId()));
1070  CRef<CSeq_loc> add(new CSeq_loc(*id, loc_stop + 1, pos, strand));
1071  add->SetPartialStop(partial_stop, eExtreme_Positional);
1073  changed = true;
1074  }
1075  if (changed) {
1076  loc.Assign(*new_loc);
1077  }
1078  return changed;
1079 }
1080 //LCOV_EXCL_STOP
1081 
1082 
1083 bool CCleanup::ExtendStopPosition(CSeq_feat& f, const CSeq_feat* cdregion, size_t extension_)
1084 {
1085  TSeqPos extension = static_cast<TSeqPos>(extension_);
1086  CRef<CSeq_loc> new_loc(&f.SetLocation());
1087 
1088  CRef<CSeq_loc> last_interval;
1089  if (new_loc->IsMix()) {
1090  last_interval = new_loc->SetMix().SetLastLoc();
1091  }
1092  else
1093  {
1094  last_interval = new_loc;
1095  }
1096 
1097  CConstRef<CSeq_id> id(last_interval->GetId());
1098 
1099  TSeqPos new_start;
1100  TSeqPos new_stop;
1101 
1102  // the last element of the mix or the single location MUST be converted into interval
1103  // whethe it's whole or point, etc
1104  if (last_interval->IsSetStrand() && last_interval->GetStrand() == eNa_strand_minus) {
1105  new_start = (cdregion ? cdregion->GetLocation().GetStart(eExtreme_Positional) :
1106  last_interval->GetStart(eExtreme_Positional)) - extension;
1107 
1108  new_stop = last_interval->GetStop(eExtreme_Positional);
1109  }
1110  else {
1111  new_start = last_interval->GetStart(eExtreme_Positional);
1112  new_stop = (cdregion ? cdregion->GetLocation().GetStop(eExtreme_Positional) :
1113  last_interval->GetStop(eExtreme_Positional)) + extension;
1114  }
1115  last_interval->SetInt().SetFrom(new_start);
1116  last_interval->SetInt().SetTo(new_stop);
1117  last_interval->SetInt().SetId().Assign(*id);
1118 
1119  new_loc->SetPartialStop(false, eExtreme_Biological);
1120 
1121  return true;
1122 }
1123 
1125 {
1126  const CSeq_loc& loc = f.GetLocation();
1127 
1129  const CGenetic_code* code = nullptr;
1130  // we need to extract frame and cd_region from linked cd_region
1131  if (f.IsSetData() && f.GetData().IsCdregion())
1132  {
1133  if (f.GetData().GetCdregion().IsSetCode())
1134  code = &(f.GetData().GetCdregion().GetCode());
1135  if (f.GetData().GetCdregion().IsSetFrame())
1136  frame = f.GetData().GetCdregion().GetFrame();
1137  }
1138 
1139  TSeqPos stop = loc.GetStop(eExtreme_Biological);
1140  if (stop < 1 || stop > bsh.GetBioseqLength() - 1) {
1141  // no room to extend
1142  return false;
1143  }
1144  // figure out if we have a partial codon at the end
1145  size_t orig_len = sequence::GetLength(loc, &(bsh.GetScope()));
1146  size_t len = orig_len;
1147 
1148  if (frame == CCdregion::eFrame_two) {
1149  len -= 1;
1150  } else if (frame == CCdregion::eFrame_three) {
1151  len -= 2;
1152  }
1153 
1154  TSeqPos mod = len % 3;
1155  CRef<CSeq_loc> vector_loc(new CSeq_loc());
1156  vector_loc->SetInt().SetId().Assign(*(bsh.GetId().front().GetSeqId()));
1157 
1158  if (loc.IsSetStrand() && loc.GetStrand() == eNa_strand_minus) {
1159  vector_loc->SetInt().SetFrom(0);
1160  vector_loc->SetInt().SetTo(stop + mod - 1);
1161  vector_loc->SetStrand(eNa_strand_minus);
1162  } else {
1163  vector_loc->SetInt().SetFrom(stop - mod + 1);
1164  vector_loc->SetInt().SetTo(bsh.GetInst_Length() - 1);
1165  }
1166 
1167  CSeqVector seq(*vector_loc, bsh.GetScope(), CBioseq_Handle::eCoding_Iupac);
1168  // reserve our space
1169  size_t usable_size = seq.size();
1170 
1171  if (limit > 0 && usable_size > limit) {
1172  usable_size = limit;
1173  }
1174 
1175  // get appropriate translation table
1176  const CTrans_table & tbl =
1179 
1180  // main loop through bases
1181  CSeqVector::const_iterator start = seq.begin();
1182 
1183  size_t i;
1184  size_t k;
1185  int state = 0;
1186  size_t length = usable_size / 3;
1187 
1188  for (i = 0; i < length; ++i) {
1189  // loop through one codon at a time
1190  for (k = 0; k < 3; ++k, ++start) {
1191  state = tbl.NextCodonState(state, *start);
1192  }
1193 
1194  if (tbl.GetCodonResidue(state) == '*') {
1195  TSeqPos extension = static_cast<TSeqPos>(((i + 1) * 3) - mod);
1196  ExtendStopPosition(f, 0, extension);
1197  return true;
1198  }
1199  }
1200 
1201  return false;
1202 }
1203 
1204 
1206 {
1207  bool changed = false;
1209  if (cds.GetData().GetCdregion().IsSetFrame()) {
1210  frame = cds.GetData().GetCdregion().GetFrame();
1211  }
1212 
1213  CCdregion::TFrame new_frame = CSeqTranslator::FindBestFrame(cds, scope);
1214  if (frame != new_frame) {
1215  cds.SetData().SetCdregion().SetFrame(new_frame);
1216  changed = true;
1217  }
1218  return changed;
1219 }
1220 
1221 // like C's function GetFrameFromLoc, but better
1223 {
1224  if (!loc.IsPartialStart(eExtreme_Biological)) {
1225  if (frame != CCdregion::eFrame_one) {
1226  frame = CCdregion::eFrame_one;
1227  return true;
1228  }
1229  return false;
1230  }
1232  // cannot make a determination if both ends are partial
1233  return false;
1234  }
1235 
1236  const TSeqPos seq_len = sequence::GetLength(loc, &scope);
1237 
1239 
1240  // have complete last codon, get frame from length
1241  switch( (seq_len % 3) + 1 ) {
1242  case 1:
1243  desired_frame = CCdregion::eFrame_one;
1244  break;
1245  case 2:
1246  desired_frame = CCdregion::eFrame_two;
1247  break;
1248  case 3:
1249  desired_frame = CCdregion::eFrame_three;
1250  break;
1251  default:
1252  // mathematically impossible
1253  _ASSERT(false);
1254  return false;
1255  }
1256  if (frame != desired_frame) {
1257  frame = desired_frame;
1258  return true;
1259  }
1260  return false;
1261 }
1262 
1263 
1264 bool CCleanup::SetFrameFromLoc(CCdregion &cdregion, const CSeq_loc& loc, CScope& scope)
1265 {
1267  if (cdregion.IsSetFrame()) {
1268  frame = cdregion.GetFrame();
1269  }
1270  if (SetFrameFromLoc(frame, loc, scope)) {
1271  cdregion.SetFrame(frame);
1272  return true;
1273  } else {
1274  return false;
1275  }
1276 }
1277 
1278 
1280 {
1281  size_t loc_end = loc.GetStop(eExtreme_Biological);
1282  CSeq_loc_CI other_int(other_loc);
1283  while (other_int) {
1284  if (other_int.IsSetStrand() &&
1285  other_int.GetStrand() == eNa_strand_minus) {
1286  if (loc.IsSetStrand() && loc.GetStrand() == eNa_strand_minus &&
1287  loc_end == other_int.GetRange().GetFrom()) {
1288  return true;
1289  }
1290  } else {
1291  if ((!loc.IsSetStrand() || loc.GetStrand() != eNa_strand_minus) &&
1292  loc_end == other_int.GetRange().GetTo()) {
1293  return true;
1294  }
1295  }
1296  ++other_int;
1297  }
1298  return false;
1299 }
1300 
1301 
1303 {
 // NOTE(review): the signature line (1302) is not visible in this listing;
 // the body reads `f`, `bsh` and `check_for_stop`.
 // Decides whether feature `f` qualifies for extension and, if so, returns
 // the result of ExtendToStopCodon(f, bsh, 3). Every disqualifying condition
 // below returns false without modifying anything.
1304  if (!f.GetData().IsCdregion()) {
1305  // not coding region
1306  return false;
1307  }
1308  if (sequence::IsPseudo(f, bsh.GetScope())) {
1309  return false;
1310  }
1311  if (f.GetLocation().IsPartialStop(eExtreme_Biological)) {
1312  return false;
1313  }
 // NOTE(review): the declaration of `mrna` (line 1314) is not visible here.
 // When an mRNA is available, the coding region is left alone if its end
 // falls on an internal interval endpoint of the mRNA location; identical
 // biological stops are explicitly accepted.
1315  if (mrna) {
1316  if (mrna->GetLocation().GetStop(eExtreme_Biological) == f.GetLocation().GetStop(eExtreme_Biological)) {
1317  //ok
1318  } else if (s_IsLocationEndAtOtherLocationInternalEndpoint(f.GetLocation(), mrna->GetLocation())) {
1319  return false;
1320  }
1321  }
1322 
 // Optional check: skip the extension when the feature cannot be translated
 // or when its translation already ends in a stop ('*').
1323  if (check_for_stop) {
1324  string translation;
1325  try {
1326  CSeqTranslator::Translate(f, bsh.GetScope(), translation, true);
1327  } catch (CSeqMapException&) {
1328  //unable to translate
1329  return false;
1330  } catch (CSeqVectorException&) {
1331  //unable to translate
1332  return false;
1333  }
1334  if (NStr::EndsWith(translation, "*")) {
1335  //already has stop codon
1336  return false;
1337  }
1338  }
1339 
1340  return ExtendToStopCodon(f, bsh, 3);
1341 }
1342 
1343 
1345 {
 // NOTE(review): the signature line (1344) is not visible in this listing;
 // the body reads `orig` and `improved`.
 // Returns true when the biological stop of `improved` lies past the
 // biological stop of `orig` in the direction of `orig`'s strand: a smaller
 // coordinate when `orig` is on the minus strand, a larger one otherwise.
1346  if ((orig.GetStrand() == eNa_strand_minus &&
1347  orig.GetStop(eExtreme_Biological) > improved.GetStop(eExtreme_Biological)) ||
1348  (orig.GetStrand() != eNa_strand_minus &&
1349  orig.GetStop(eExtreme_Biological) < improved.GetStop(eExtreme_Biological))) {
1350  return true;
1351  }
1352 
1353  return false;
1354 }
1355 
1356 void CCleanup::SetProteinName(CProt_ref& prot_ref, const string& protein_name, bool append)
1357 {
1358  if (append && prot_ref.IsSetName() && prot_ref.GetName().size() > 0) {
1359  if (!NStr::IsBlank(prot_ref.GetName().front())) {
1360  prot_ref.SetName().front() += "; ";
1361  }
1362  prot_ref.SetName().front() += protein_name;
1363  } else {
1364  prot_ref.SetName().push_back(protein_name);
1365  }
1366 }
1367 
1368 
1369 void CCleanup::SetMrnaName(CSeq_feat& mrna, const string& protein_name)
1370 {
1371  bool used_qual = false;
1372  if (mrna.IsSetQual()) {
1373  for (auto it = mrna.SetQual().begin(); it != mrna.SetQual().end(); it++) {
1374  if ((*it)->IsSetQual() && NStr::EqualNocase((*it)->GetQual(), "product")) {
1375  (*it)->SetVal(protein_name);
1376  used_qual = true;
1377  break;
1378  }
1379  }
1380  }
1381  if (!used_qual || (mrna.IsSetData() && mrna.GetData().IsRna() && mrna.GetData().GetRna().IsSetExt())) {
1382  string remainder;
1383  mrna.SetData().SetRna().SetRnaProductName(protein_name, remainder);
1384  }
1385 }
1386 
1387 
1388 //LCOV_EXCL_START
1389 //seems to be unused
1391 {
 // NOTE(review): the signature line (1390) is not visible in this listing;
 // the body reads `cds`.
 // Returns true when `cds` carries a protein xref, or a qualifier whose name
 // equals "product" (case-insensitive); false otherwise.
1392  if (cds.IsSetXref()) {
1393  for (auto it = cds.GetXref().begin(); it != cds.GetXref().end(); it++) {
1394  if ((*it)->IsSetData() && (*it)->GetData().IsProt()) {
1395  return true;
1396  }
1397  }
1398  }
1399  if (cds.IsSetQual()) {
1400  for (auto it = cds.GetQual().begin(); it != cds.GetQual().end(); it++) {
1401  if ((*it)->IsSetQual() && NStr::EqualNocase((*it)->GetQual(), "product")) {
1402  return true;
1403  }
1404  }
1405  }
1406  return false;
1407 }
1408 //LCOV_EXCL_STOP
1409 
1410 
1411 void CCleanup::s_SetProductOnFeat(CSeq_feat& feat, const string& protein_name, bool append)
1412 {
1413  if (feat.IsSetXref()) {
1414  // see if this seq-feat already has a prot xref
1415  for (auto it = feat.SetXref().begin(); it != feat.SetXref().end(); it++) {
1416  if ((*it)->IsSetData() && (*it)->GetData().IsProt()) {
1417  SetProteinName((*it)->SetData().SetProt(), protein_name, append);
1418  break;
1419  }
1420  }
1421  }
1422  if (feat.IsSetQual()) {
1423  for (auto it = feat.SetQual().begin(); it != feat.SetQual().end(); it++) {
1424  if ((*it)->IsSetQual() && NStr::EqualNocase((*it)->GetQual(), "product")) {
1425  if ((*it)->IsSetVal() && !NStr::IsBlank((*it)->GetVal()) && append) {
1426  (*it)->SetVal((*it)->GetVal() + "; " + protein_name);
1427  } else {
1428  (*it)->SetVal(protein_name);
1429  }
1430  }
1431  }
1432  }
1433 }
1434 
1435 
// Sets (or appends) `protein_name` for coding region `cds`.
// Steps visible here:
//  1. s_SetProductOnFeat() updates the xref/qualifiers on `cds` itself.
//  2. If `cds` has a product and a protein is found for it, the protein
//     feature is replaced with an updated copy, or a new protein feature is
//     created via feature::AddProteinFeature().
//  3. Otherwise the name is stored in a protein xref on `cds`, creating the
//     xref when none exists.
// NOTE(review): several lines are not visible in this listing (1441, 1444,
// 1461): the declarations of `prot` and `feat_ci` and the header of the xref
// loop.
1436 void CCleanup::SetProteinName(CSeq_feat& cds, const string& protein_name, bool append, CScope& scope)
1437 {
1438  s_SetProductOnFeat(cds, protein_name, append);
1439  bool added = false;
1440  if (cds.IsSetProduct()) {
1442  if (prot) {
1443  // find main protein feature
1445  if (feat_ci) {
 // edit a copy and swap it in through the edit handle
1446  CRef<CSeq_feat> new_prot(new CSeq_feat());
1447  new_prot->Assign(feat_ci->GetOriginalFeature());
1448  SetProteinName(new_prot->SetData().SetProt(), protein_name, append);
1449  CSeq_feat_EditHandle feh(feat_ci->GetSeq_feat_Handle());
1450  feh.Replace(*new_prot);
1451  } else {
1452  // make new protein feature
1453  feature::AddProteinFeature(*(prot.GetCompleteBioseq()), protein_name, cds, scope);
1454  }
1455  added = true;
1456  }
1457  }
1458  if (!added) {
 // NOTE(review): s_SetProductOnFeat() above already applied the name to the
 // first protein xref on `cds`; this branch appears to apply it to a protein
 // xref a second time -- confirm whether that is intended.
1459  if (cds.IsSetXref()) {
1460  // see if this seq-feat already has a prot xref
1462  if ((*it)->IsSetData() && (*it)->GetData().IsProt()) {
1463  SetProteinName((*it)->SetData().SetProt(), protein_name, append);
1464  added = true;
1465  break;
1466  }
1467  }
1468  }
1469  if (!added) {
 // no protein xref yet: create one holding just this name
1470  CRef<CSeqFeatXref> xref(new CSeqFeatXref());
1471  xref->SetData().SetProt().SetName().push_back(protein_name);
1472  cds.SetXref().push_back(xref);
1473  }
1474  }
1475 }
1476 
1477 
1478 const string& CCleanup::GetProteinName(const CProt_ref& prot)
1479 {
1480  if (prot.IsSetName() && !prot.GetName().empty()) {
1481  return prot.GetName().front();
1482  } else {
1483  return kEmptyStr;
1484  }
1485 }
1486 
1487 
1488 static const string& s_GetProteinNameFromXrefOrQual(const CSeq_feat& cds) {
1489  if (cds.IsSetXref()) {
1490  ITERATE(CSeq_feat::TXref, it, cds.GetXref()) {
1491  if ((*it)->IsSetData() && (*it)->GetData().IsProt()) {
1492  return CCleanup::GetProteinName((*it)->GetData().GetProt());
1493  }
1494  }
1495  }
1496  if (cds.IsSetQual()) {
1497  for (auto it = cds.GetQual().begin(); it != cds.GetQual().end(); it++) {
1498  if ((*it)->IsSetQual() && (*it)->IsSetVal() && NStr::EqualNocase((*it)->GetQual(), "product")) {
1499  return (*it)->GetVal();
1500  }
1501  }
1502  }
1503 
1504  return kEmptyStr;
1505 }
1506 
1507 
1509 {
 // NOTE(review): the signature line (1508) and the declarations of `prot`
 // and `f` (1511, 1513) are not visible in this listing.
 // When `cds` has a product with a single id and both the protein and its
 // feature `f` are found, the name comes from that feature's Prot-ref;
 // otherwise it falls back to the xref/qualifier stored on `cds`.
1510  if (cds.IsSetProduct() && cds.GetProduct().GetId()) {
1512  if (prot) {
1514  if (f) {
1515  return GetProteinName(f->GetData().GetProt());
1516  }
1517  }
1518  }
1519 
1520  return s_GetProteinNameFromXrefOrQual(cds);
1521 }
1522 
1523 
// Returns the protein name for coding region `cds`.
// When `cds` has a product and both the protein and its feature `f` are
// found, the name comes from that feature's Prot-ref; otherwise it falls back
// to the xref/qualifier stored on `cds` itself.
// NOTE(review): the declarations of `prot` and `f` (lines 1527, 1529) are not
// visible in this listing.
1524 const string& CCleanup::GetProteinName(const CSeq_feat& cds, CScope& scope)
1525 {
1526  if (cds.IsSetProduct()) {
1528  if (prot) {
1530  if (f) {
1531  return GetProteinName(f->GetData().GetProt());
1532  }
1533  }
1534  }
1535 
1536  return s_GetProteinNameFromXrefOrQual(cds);
1537 }
1538 
1539 
1541 {
 // NOTE(review): the signature line (1540) and several condition lines
 // (1544, 1546-1547, 1552, 1574) are not visible in this listing; the body
 // reads `cds` and `scope`.
 // Marks the location of `cds` as 5'/3' partial where the frame or the
 // translation indicates it, and returns true if any partial flag was set.
1542  bool any_change = false;
1543 
 // frame-based check: sets the partial start (full condition not visible)
1545  cds.GetData().GetCdregion().IsSetFrame() &&
1548  cds.SetLocation().SetPartialStart(true, eExtreme_Biological);
1549  any_change = true;
1550  }
1551 
1553  // look for start and stop codon
1554  string transl_prot;
1555  try {
1556  CSeqTranslator::Translate(cds, scope, transl_prot,
1557  true, // include stop codons
1558  false); // do not remove trailing X/B/Z
1559 
1560  } catch (const runtime_error&) {
 // deliberately ignored; presumably transl_prot stays blank on failure so
 // the checks below are skipped -- TODO confirm
1561  }
1562  if (!NStr::IsBlank(transl_prot)) {
 // translation not starting with 'M' => start is marked partial
1563  if (!cds.GetLocation().IsPartialStart(eExtreme_Biological) && !NStr::StartsWith(transl_prot, "M")) {
1564  cds.SetLocation().SetPartialStart(true, eExtreme_Biological);
1565  any_change = true;
1566  }
 // translation not ending with '*' => stop is marked partial
1567  if (!cds.GetLocation().IsPartialStop(eExtreme_Biological) && !NStr::EndsWith(transl_prot, "*")) {
1568  cds.SetLocation().SetPartialStop(true, eExtreme_Biological);
1569  any_change = true;
1570  }
1571  }
1572  }
1573 
1575 
1576  return any_change;
1577 }
1578 
1579 
// Clears partial markers on the internal intervals of `loc`.
// Dispatches to the CSeq_loc_mix or CPacked_seqint overload depending on the
// location type; every other location type is left untouched.
// `is_first` / `is_last` are forwarded unchanged to the overload.
// Returns true if the called overload reported a change.
1580 bool CCleanup::ClearInternalPartials(CSeq_loc& loc, bool is_first, bool is_last)
1581 {
1582  bool rval = false;
1583  switch (loc.Which()) {
1584  case CSeq_loc::e_Mix:
1585  rval |= ClearInternalPartials(loc.SetMix(), is_first, is_last);
1586  break;
 // NOTE(review): the case label for this branch (line 1587) is not visible
 // in this listing; presumably it is the packed-int case.
1588  rval |= ClearInternalPartials(loc.SetPacked_int(), is_first, is_last);
1589  break;
1590  default:
1591  break;
1592  }
1593  return rval;
1594 }
1595 
1596 
// Clears internal partial markers within a mixed location.
// Nested mix / packed-int members are handled recursively; for any other
// member the partial start is cleared unless it is the first piece, and the
// partial stop is cleared unless it is the last piece.
// `is_first` / `is_last` say whether `mix` itself sits at the start / end of
// the enclosing location. Returns true if any flag was cleared.
// NOTE(review): the loop header (line 1600) is not visible in this listing;
// `it` presumably iterates over mix.Set().
1597 bool CCleanup::ClearInternalPartials(CSeq_loc_mix& mix, bool is_first, bool is_last)
1598 {
1599  bool rval = false;
 // a member is "last" only if the whole mix is last and it is the final element
1601  bool this_is_last = is_last && (*it == mix.Set().back());
1602  if ((*it)->IsMix() || (*it)->IsPacked_int()) {
1603  rval |= ClearInternalPartials(**it, is_first, this_is_last);
1604  } else {
1605  if (!is_first &&
1606  (*it)->IsPartialStart(eExtreme_Biological)) {
1607  (*it)->SetPartialStart(false, eExtreme_Biological);
1608  rval = true;
1609  }
1610  if (!this_is_last &&
1611  (*it)->IsPartialStop(eExtreme_Biological)) {
1612  (*it)->SetPartialStop(false, eExtreme_Biological);
1613  rval = true;
1614  }
1615  }
 // after the first member has been processed nothing else counts as first
1616  is_first = false;
1617  }
1618  return rval;
1619 }
1620 
1621 
// Clears internal partial markers within a packed-interval location.
// The partial start is cleared on every interval except the first one (when
// `is_first` is true on entry) and the partial stop on every interval except
// the last one (when `is_last` is true). Returns true if any flag was cleared.
// NOTE(review): the loop header (line 1626) is not visible in this listing;
// `it` presumably iterates over pint.Set().
1622 bool CCleanup::ClearInternalPartials(CPacked_seqint& pint, bool is_first, bool is_last)
1623 {
1624  bool rval = false;
1625 
1627  bool this_is_last = is_last && (*it == pint.Set().back());
1628  if (!is_first && (*it)->IsPartialStart(eExtreme_Biological)) {
1629  (*it)->SetPartialStart(false, eExtreme_Biological);
1630  rval = true;
1631  }
1632  if (!this_is_last && (*it)->IsPartialStop(eExtreme_Biological)) {
1633  (*it)->SetPartialStop(false, eExtreme_Biological);
1634  rval = true;
1635  }
 // only the very first interval may keep its partial start
1636  is_first = false;
1637  }
1638  return rval;
1639 }
1640 
1641 
1643 {
1644  bool rval = false;
1645  CFeat_CI f(seh);
1646  while (f) {
1647  CRef<CSeq_feat> new_feat(new CSeq_feat());
1648  new_feat->Assign(*(f->GetSeq_feat()));
1649  if (ClearInternalPartials(new_feat->SetLocation())) {
1650  CSeq_feat_EditHandle eh(f->GetSeq_feat_Handle());
1651  eh.Replace(*new_feat);
1652  }
1653  ++f;
1654  }
1655 
1656  return rval;
1657 }
1658 
1659 
1661 {
 // NOTE(review): the signature line (1660) is not visible in this listing;
 // the body reads and modifies `f`.
 // Makes the feature's partial flag agree with its location: the location
 // counts as partial when any of its intervals reports a from- or to-fuzz.
 // Returns true if the flag was set or reset, false if it already agreed or
 // the feature has no location.
1662  if (!f.IsSetLocation()) {
1663  return false;
1664  }
1665  bool partial = false;
1666  CSeq_loc_CI li(f.GetLocation());
1667  while (li && !partial) {
1668  if (li.GetFuzzFrom() || li.GetFuzzTo()) {
1669  partial = true;
1670  break;
1671  }
1672  ++li;
1673  }
1674  bool changed = false;
1675  if (f.IsSetPartial() && f.GetPartial()) {
 // flag is on but the location is not partial: drop the flag
1676  if (!partial) {
1677  f.ResetPartial();
1678  changed = true;
1679  }
1680  } else {
 // flag is off but the location is partial: raise the flag
1681  if (partial) {
1682  f.SetPartial(true);
1683  changed = true;
1684  }
1685  }
1686  return changed;
1687 }
1688 
1689 
1691 {
 // NOTE(review): the signature line (1690) and the first half of the
 // replacement condition (line 1702) are not visible in this listing; the
 // body modifies `ec_num_list` in place.
 // Cleans each EC number with CleanVisStringJunk() and, when the (partly
 // hidden) condition holds and the number is not a split one, substitutes
 // the non-blank value from CProt_ref::GetECNumberReplacement().
 // Returns true if any entry changed.
1692  bool changed = false;
1693  // CProt_ref::TEc is a list, so the iterator stays valid even if we
1694  // add new entries after the current one
1695  NON_CONST_ITERATE(CProt_ref::TEc, ec_num_iter, ec_num_list) {
1696  string & ec_num = *ec_num_iter;
 // a cleanup is detected by comparing lengths before and after
1697  size_t tlen = ec_num.length();
1698  CleanVisStringJunk(ec_num);
1699  if (tlen != ec_num.length()) {
1700  changed = true;
1701  }
1703  !CProt_ref::IsECNumberSplit(ec_num)) {
1704  string new_val = CProt_ref::GetECNumberReplacement(ec_num);
1705  if (!NStr::IsBlank(new_val)) {
1706  ec_num = new_val;
1707  changed = true;
1708  }
1709  }
1710 
1711  }
1712  return changed;
1713 }
1714 
1715 
1717 {
 // NOTE(review): the signature line (1716) and the declaration of
 // `ec_status` (line 1727) are not visible in this listing; the body
 // modifies `ec_num_list` in place.
 // Cleans each EC number with CleanVisStringJunk() and erases entries whose
 // status is deleted or unknown, or which are split EC numbers.
 // Returns true if any entry was cleaned or removed.
1718  bool changed = false;
1719  CProt_ref::TEc::iterator ec_num_iter = ec_num_list.begin();
1720  while (ec_num_iter != ec_num_list.end()) {
1721  string & ec_num = *ec_num_iter;
1722  size_t tlen = ec_num.length();
1723  CleanVisStringJunk(ec_num);
1724  if (tlen != ec_num.length()) {
1725  changed = true;
1726  }
1728  if (ec_status == CProt_ref::eEC_deleted || ec_status == CProt_ref::eEC_unknown || CProt_ref::IsECNumberSplit(ec_num)) {
 // erase() returns the iterator to the next element, so no ++ here
1729  ec_num_iter = ec_num_list.erase(ec_num_iter);
1730  changed = true;
1731  } else {
1732  ++ec_num_iter;
1733  }
1734 
1735  }
1736  return changed;
1737 }
1738 
1739 
1741 {
 // NOTE(review): the signature line (1740) and the declaration of the
 // feature iterator `f` (line 1743) are not visible in this listing.
 // For every protein feature with EC numbers: update and prune the numbers
 // on a copy, drop the EC list entirely if it ends up empty, and replace the
 // original feature when anything changed.
 // NOTE(review): `any_change` is never assigned after initialization, so
 // this function returns false even when features were replaced; it looks
 // like `this_change` was meant to be accumulated into it.
1742  bool any_change = false;
1744  while (f) {
1745  if (f->GetData().GetProt().IsSetEc()) {
1746  bool this_change = false;
1747  CRef<CSeq_feat> new_feat(new CSeq_feat());
1748  new_feat->Assign(*(f->GetSeq_feat()));
1749  this_change = UpdateECNumbers(new_feat->SetData().SetProt().SetEc());
1750  this_change |= RemoveBadECNumbers(new_feat->SetData().SetProt().SetEc());
1751  if (new_feat->GetData().GetProt().GetEc().empty()) {
1752  new_feat->SetData().SetProt().ResetEc();
1753  this_change = true;
1754  }
1755  if (this_change) {
1756  CSeq_feat_EditHandle efh(*f);
1757  efh.Replace(*new_feat);
1758  }
1759  }
1760  ++f;
1761  }
1762  return any_change;
1763 }
1764 
1765 
1767 {
 // NOTE(review): the signature line (1766) and the declaration of `loc_cmp`
 // (line 1782) are not visible in this listing; the body reads `gene` and
 // `scope`.
 // Scans the non-gene features found at the gene's location, keeps one that
 // is the same as or contained in the gene location, and copies its partial
 // flags onto `gene` via feature::CopyFeaturePartials().
 // Returns that call's result, or false when no bioseq or feature is found.
1768  CBioseq_Handle bh = scope.GetBioseqHandle(gene.GetLocation());
1769  if (!bh) {
1770  return false;
1771  }
1772  CFeat_CI under(scope, gene.GetLocation());
1773  size_t longest = 0;
1774  CConstRef<CSeq_feat> longest_feat;
1775 
1776  while (under) {
1777  // ignore genes
1778  if (under->GetData().IsGene()) {
1779 
1780  } else {
1781  // must be contained in gene location
1783 
1784  if (loc_cmp == sequence::eSame || loc_cmp == sequence::eContains) {
1785  size_t len = sequence::GetLength(under->GetLocation(), &scope);
1786  // if longer than longest, record new length and feature
 // NOTE(review): `longest` is never updated to `len`, so every contained
 // feature with non-zero length replaces the previous pick and the last
 // one wins, contrary to the comment above.
1787  if (len > longest) {
1788  longest_feat.Reset(under->GetSeq_feat());
1789  }
1790  }
1791  }
1792 
1793  ++under;
1794  }
1795  bool changed = false;
1796  if (longest_feat) {
1797  changed = feature::CopyFeaturePartials(gene, *longest_feat);
1798  }
1799  return changed;
1800 }
1801 
1802 
1804 {
 // NOTE(review): the signature line (1803), the declaration of `di`
 // (line 1805) and the body of the protein branch (line 1819) are not
 // visible in this listing; the body reads `bsh` and `tech`.
 // Ensures a MolInfo descriptor carries technique `tech`: an existing
 // descriptor is updated in place, otherwise a new one is added to the
 // bioseq. Returns false only when the existing value already matched.
1806  if (di) {
1807  if (di->GetMolinfo().IsSetTech() && di->GetMolinfo().GetTech() == tech) {
1808  // no change necessary
1809  return false;
1810  } else {
 // the iterator exposes a const descriptor; constness is cast away to
 // edit it in place
1811  CSeqdesc* d = const_cast<CSeqdesc*>(&(*di));
1812  d->SetMolinfo().SetTech(tech);
1813  return true;
1814  }
1815  }
 // no MolInfo descriptor found: build one and attach it to the bioseq
1816  CRef<CSeqdesc> m(new CSeqdesc());
1817  m->SetMolinfo().SetTech(tech);
1818  if (bsh.IsSetInst() && bsh.GetInst().IsSetMol() && bsh.IsAa()) {
1820  }
1821  CBioseq_EditHandle eh = bsh.GetEditHandle();
1822  eh.AddSeqdesc(*m);
1823  return true;
1824 }
1825 
1826 
1827 //LCOV_EXCL_START
1828 //does not appear to be used
1830 {
 // NOTE(review): the signature line (1829) and the declaration of `di`
 // (line 1831) are not visible in this listing; the body reads `bsh` and
 // `biomol`.
 // Ensures a MolInfo descriptor carries biomol `biomol`: an existing
 // descriptor is updated in place, otherwise a new one is added to the
 // bioseq. Returns false only when the "already matches" test succeeds.
1832  if (di) {
 // NOTE(review): this tests IsSetTech() but then reads GetBiomol(); it
 // looks like IsSetBiomol() was intended -- confirm.
1833  if (di->GetMolinfo().IsSetTech() && di->GetMolinfo().GetBiomol() == biomol) {
1834  // no change necessary
1835  return false;
1836  } else {
 // the iterator exposes a const descriptor; constness is cast away to
 // edit it in place
1837  CSeqdesc* d = const_cast<CSeqdesc*>(&(*di));
1838  d->SetMolinfo().SetBiomol(biomol);
1839  return true;
1840  }
1841  }
1842  CRef<CSeqdesc> m(new CSeqdesc());
1843  m->SetMolinfo().SetBiomol(biomol);
1844  CBioseq_EditHandle eh = bsh.GetEditHandle();
1845  eh.AddSeqdesc(*m);
1846  return true;
1847 }
1848 //LCOV_EXCL_STOP
1849 
1850 
// Adds a MolInfo descriptor to `seq` when it has none.
// Sequences without Inst/Mol are ignored (returns false). An existing MolInfo
// on a protein gets biomol "peptide" when its biomol is unset or unknown.
// A new descriptor is created for proteins, and for RNA sequences when
// `is_product` is true; other sequences are left unchanged.
// Returns true only when a new descriptor was added.
// NOTE(review): the descriptor loop header (line 1859) and the lines that
// fill in the new descriptors (1873, 1875, 1880, 1881) are not visible in
// this listing.
1851 bool CCleanup::AddMissingMolInfo(CBioseq& seq, bool is_product)
1852 {
1853  if (!seq.IsSetInst() || !seq.GetInst().IsSetMol()) {
1854  return false;
1855  }
1856  bool needs_molinfo = true;
1857 
1858  if (seq.IsSetDescr()) {
1860  if ((*it)->IsMolinfo()) {
1861  needs_molinfo = false;
 // NOTE(review): this in-place biomol fix is not reflected in the return
 // value -- confirm callers do not rely on it for change tracking.
1862  if (seq.IsAa() &&
1863  (!(*it)->GetMolinfo().IsSetBiomol() ||
1864  (*it)->GetMolinfo().GetBiomol() == CMolInfo::eBiomol_unknown)) {
1865  (*it)->SetMolinfo().SetBiomol(CMolInfo::eBiomol_peptide);
1866  }
1867  }
1868  }
1869  }
1870  if (needs_molinfo) {
1871  if (seq.IsAa()) {
1872  CRef<CSeqdesc> m(new CSeqdesc());
1874  if (is_product) {
1876  }
1877  seq.SetDescr().Set().push_back(m);
1878  } else if (seq.GetInst().GetMol() == CSeq_inst::eMol_rna && is_product) {
1879  CRef<CSeqdesc> m(new CSeqdesc());
1882  seq.SetDescr().Set().push_back(m);
1883  } else {
 // neither protein nor RNA product: nothing was added
1884  needs_molinfo = false;
1885  }
1886  }
1887 
1888  return needs_molinfo;
1889 }
1890 
1891 
1893 {
 // NOTE(review): the signature line (1892) and the declaration of
 // `title_desc` (line 1916) are not visible in this listing; the body reads
 // `bsh`.
 // Regenerates the title of a protein bioseq with CDeflineGenerator
 // (ignoring any existing title) and stores it when it differs.
 // Returns true if the title was changed; false for non-proteins, for
 // sequences with one of the excluded id types, or when already up to date.
1894  if (!bsh.IsSetInst() || !bsh.GetInst().IsSetMol() || !bsh.IsAa()) {
1895  return false;
1896  }
1897  if (bsh.IsSetId()) {
1898  ITERATE(CBioseq_Handle::TId, it, bsh.GetId()) {
1899  // do not add titles for sequences with certain IDs
1900  switch (it->Which()) {
1901  case CSeq_id::e_Pir:
1902  case CSeq_id::e_Swissprot:
1903  case CSeq_id::e_Patent:
1904  case CSeq_id::e_Prf:
1905  case CSeq_id::e_Pdb:
1906  return false;
1907  break;
1908  default:
1909  break;
1910  }
1911  }
1912  }
1913 
1914  string new_defline = sequence::CDeflineGenerator().GenerateDefline(bsh, sequence::CDeflineGenerator::fIgnoreExisting);
1915 
1917 
1918  bool modified = title_desc.Set().SetTitle() != new_defline; // get or create a title
1919  if (modified)
1920  title_desc.Set().SetTitle().swap(new_defline);
1921  return modified;
1922 }
1923 
1924 
1926 {
 // NOTE(review): the signature line (1925) and the header of the loop over
 // the set's members (line 1949) are not visible in this listing; the body
 // modifies `seq_entry`.
 // Removes every user-object descriptor of type eObjectType_Cleanup from
 // this entry and, recursively, from the members of a set.
 // Returns true if at least one descriptor was removed.
1927  bool rval = false;
1928  if (seq_entry.IsSetDescr()) {
1929  CBioseq::TDescr::Tdata::iterator it = seq_entry.SetDescr().Set().begin();
1930  while (it != seq_entry.SetDescr().Set().end()) {
1931  if ((*it)->IsUser() && (*it)->GetUser().GetObjectType() == CUser_object::eObjectType_Cleanup){
 // erase() yields the next valid iterator, so it is not advanced here
1932  it = seq_entry.SetDescr().Set().erase(it);
1933  rval = true;
1934  }
1935  else {
1936  ++it;
1937  }
1938  }
 // do not leave an empty descriptor container behind
1939  if (seq_entry.SetDescr().Set().empty()) {
1940  if (seq_entry.IsSeq()) {
1941  seq_entry.SetSeq().ResetDescr();
1942  }
1943  else if (seq_entry.IsSet()) {
1944  seq_entry.SetSet().ResetDescr();
1945  }
1946  }
1947  }
1948  if (seq_entry.IsSet() && seq_entry.GetSet().IsSetSeq_set()) {
1950  rval |= RemoveNcbiCleanupObject(**it);
1951  }
1952  }
1953  return rval;
1954 }
1955 
1956 void CCleanup::AddNcbiCleanupObject(int ncbi_cleanup_version, CSeq_descr& descr)
1957 {
1958  // update existing
1959  if (descr.IsSet()) {
1960  for (auto pDesc : descr.Set()) {
1961  if (pDesc->IsUser() && pDesc->GetUser().GetObjectType() == CUser_object::eObjectType_Cleanup) {
1962  pDesc->SetUser().UpdateNcbiCleanup(ncbi_cleanup_version);
1963  return;
1964  }
1965  }
1966  }
1967 
1968  // create new
1969  auto pCleanupObject = Ref(new CSeqdesc());
1970  auto& user = pCleanupObject->SetUser();
1971  user.UpdateNcbiCleanup(ncbi_cleanup_version);
1972  descr.Set().push_back(pCleanupObject);
1973 }
1974 
1975 
1976 //LCOV_EXCL_START
1977 //not used by asn_cleanup but used by functions used by other applications
// Collects pointers to every source descriptor that has an Org set, from `se`
// and (recursively) from all members of a set, appending them to `src_descs`.
// The pointers refer to descriptors owned by `se`; nothing is copied.
// NOTE(review): both loop headers (lines 1981, 1989) are not visible in this
// listing.
1978 void GetSourceDescriptors(const CSeq_entry& se, vector<const CSeqdesc* >& src_descs)
1979 {
1980  if (se.IsSetDescr()) {
1982  if ((*it)->IsSource() && (*it)->GetSource().IsSetOrg()) {
1983  src_descs.push_back(*it);
1984  }
1985  }
1986  }
1987 
1988  if (se.IsSet() && se.GetSet().IsSetSeq_set()) {
1990  GetSourceDescriptors(**it, src_descs);
1991  }
1992  }
1993 }
1994 //LCOV_EXCL_STOP
1995 
1996 
1997 //LCOV_EXCL_START
1998 //not used by asn_cleanup
2000 {
 // NOTE(review): the signature line (1999) and the declarations of `feat`
 // (line 2018) and `taxon3` (line 2032) are not visible in this listing; the
 // body reads `seh`.
 // Sends the Org-refs of all source descriptors and source features to the
 // taxonomy service in one request and writes back every reply that differs
 // from the stored Org-ref. Returns true if anything was updated.
 // The request list is built descriptors-first, then features, and the
 // replies are consumed in that same order below.
2001  bool any_changes = false;
2002 
2003  vector<CRef<COrg_ref> > rq_list;
2004  vector<const CSeqdesc* > src_descs;
2005  vector<CConstRef<CSeq_feat> > src_feats;
2006 
2007  GetSourceDescriptors(*(seh.GetCompleteSeq_entry()), src_descs);
2008  vector<const CSeqdesc* >::iterator desc_it = src_descs.begin();
2009  while (desc_it != src_descs.end()) {
2010  // add org ref for descriptor to request list
2011  CRef<COrg_ref> org(new COrg_ref());
2012  org->Assign((*desc_it)->GetSource().GetOrg());
2013  rq_list.push_back(org);
2014 
2015  ++desc_it;
2016  }
2017 
2019  while (feat) {
2020  if (feat->GetData().GetBiosrc().IsSetOrg()) {
2021  // add org ref for feature to request list
2022  CRef<COrg_ref> org(new COrg_ref());
2023  org->Assign(feat->GetData().GetBiosrc().GetOrg());
2024  rq_list.push_back(org);
2025  // add feature to list
2026  src_feats.push_back(feat->GetOriginalSeq_feat());
2027  }
2028  ++feat;
2029  }
2030 
2031  if (rq_list.size() > 0) {
2033  CRef<CTaxon3_reply> reply = taxon3.SendOrgRefList(rq_list);
2034  if (reply) {
2035  CTaxon3_reply::TReply::const_iterator reply_it = reply->GetReply().begin();
2036 
2037  // process descriptor responses
2038  desc_it = src_descs.begin();
2039 
2040  while (reply_it != reply->GetReply().end()
2041  && desc_it != src_descs.end()) {
2042  if ((*reply_it)->IsData() &&
2043  !(*desc_it)->GetSource().GetOrg().Equals((*reply_it)->GetData().GetOrg())) {
2044  any_changes = true;
 // descriptors were collected as const pointers; constness is cast away
 // to update them in place
2045  CSeqdesc* desc = const_cast<CSeqdesc*>(*desc_it);
2046  desc->SetSource().SetOrg().Assign((*reply_it)->GetData().GetOrg());
2047  desc->SetSource().SetOrg().CleanForGenBank();
2048  }
2049  ++reply_it;
2050  ++desc_it;
2051  }
2052 
2053  // process feature responses
 // reply_it continues from where the descriptor loop stopped
2054  vector<CConstRef<CSeq_feat> >::iterator feat_it = src_feats.begin();
2055  while (reply_it != reply->GetReply().end()
2056  && feat_it != src_feats.end()) {
2057  if ((*reply_it)->IsData() &&
2058  !(*feat_it)->GetData().GetBiosrc().GetOrg().Equals((*reply_it)->GetData().GetOrg())) {
2059  any_changes = true;
 // features are replaced through an edit handle rather than edited in
 // place; NOTE(review): unlike the descriptor path, CleanForGenBank() is
 // not applied here -- confirm this is intended.
2060  CRef<CSeq_feat> new_feat(new CSeq_feat());
2061  new_feat->Assign(**feat_it);
2062  new_feat->SetData().SetBiosrc().SetOrg().Assign((*reply_it)->GetData().GetOrg());
2063  CSeq_feat_Handle fh = seh.GetScope().GetSeq_featHandle(**feat_it);
2064  CSeq_feat_EditHandle efh(fh);
2065  efh.Replace(*new_feat);
2066  }
2067  ++reply_it;
2068  ++feat_it;
2069  }
2070  }
2071  }
2072 
2073  return any_changes;
2074 }
2075 //LCOV_EXCL_STOP
2076 
2077 
2079 {
 // NOTE(review): the signature line (2078) and several interior lines (2095,
 // 2096, 2107, 2118: the molinfo setup and the declarations of `eh` and
 // `set`) are not visible in this listing; the body reads `cds` and `scope`.
 // Translates `cds` into a new protein bioseq, wraps it in a Seq-entry and
 // attaches it next to the nucleotide sequence, converting the nucleotide
 // entry into a nuc-prot set first when necessary.
 // Returns the new protein entry, or an empty CRef when the nucleotide
 // sequence cannot be located or the translation fails.
2080  CBioseq_Handle cds_bsh = scope.GetBioseqHandle(cds.GetLocation());
2081  if (!cds_bsh) {
2082  return CRef<CSeq_entry>();
2083  }
2084  CSeq_entry_Handle seh = cds_bsh.GetSeq_entry_Handle();
2085  if (!seh) {
2086  return CRef<CSeq_entry>();
2087  }
2088 
2089  CRef<CBioseq> new_product = CSeqTranslator::TranslateToProtein(cds, scope);
2090  if (new_product.Empty()) {
2091  return CRef<CSeq_entry>();
2092  }
2093 
2094  CRef<CSeqdesc> molinfo(new CSeqdesc());
2097  new_product->SetDescr().Set().push_back(molinfo);
2098 
2099  if (cds.IsSetProduct()) {
 // NOTE(review): GetId() is dereferenced without a null check -- confirm
 // the product location always resolves to a single id here.
2100  CRef<CSeq_id> prot_id(new CSeq_id());
2101  prot_id->Assign(*(cds.GetProduct().GetId()));
2102  new_product->SetId().push_back(prot_id);
2103  }
2104  CRef<CSeq_entry> prot_entry(new CSeq_entry());
2105  prot_entry->SetSeq(*new_product);
2106 
 // prefer an enclosing nuc-prot set as the attachment point
2108  if (!eh.IsSet()) {
2109  CBioseq_set_Handle nuc_parent = eh.GetParentBioseq_set();
2110  if (nuc_parent && nuc_parent.IsSetClass() && nuc_parent.GetClass() == CBioseq_set::eClass_nuc_prot) {
2111  eh = nuc_parent.GetParentEntry().GetEditHandle();
2112  }
2113  }
2114  if (!eh.IsSet()) {
2115  eh.ConvertSeqToSet();
2116  // move all descriptors on nucleotide sequence except molinfo, title, and create-date to set
2117  eh.SetSet().SetClass(CBioseq_set::eClass_nuc_prot);
2119  if (set && set->IsSetSeq_set()) {
2120  CConstRef<CSeq_entry> nuc = set->GetSeq_set().front();
2121  if (nuc->IsSetDescr()) {
2122  auto neh = eh.GetScope().GetSeq_entryEditHandle(*nuc);
2123  auto it = nuc->GetDescr().Get().begin();
2124  while (it != nuc->GetDescr().Get().end()) {
2125  if (!(*it)->IsMolinfo() && !(*it)->IsTitle() && !(*it)->IsCreate_date()) {
2126  CRef<CSeqdesc> copy(new CSeqdesc());
2127  copy->Assign(**it);
2128  eh.AddSeqdesc(*copy);
2129  neh.RemoveSeqdesc(**it);
 // after a removal the scan restarts from the beginning (or stops when
 // no descriptors remain)
2130  if (nuc->IsSetDescr()) {
2131  it = nuc->GetDescr().Get().begin();
2132  }
2133  else {
2134  break;
2135  }
2136  }
2137  else {
2138  ++it;
2139  }
2140  }
2141  }
2142  }
2143  }
2144 
2145  CSeq_entry_EditHandle added = eh.AttachEntry(*prot_entry);
2146  return prot_entry;
2147 }
2148 
2150 {
 // NOTE(review): the signature line (2149) and the declaration of the
 // feature selector `sel` (line 2175) are not visible in this listing; the
 // body reads `bsh`.
 // Makes the genetic code of each coding region visited through `feat_ci`
 // match the genetic code reported by the bioseq's BioSource, skipping
 // features whose exception text is "genetic code exception".
 // Returns true if any feature was replaced. Returns false early for an
 // invalid or non-nucleotide handle, a missing source descriptor, or a
 // source without Org/Orgname or without any of gcode/mgcode/pgcode.
2151  if (!bsh) {
2152  return false;
2153  }
2154  if (!bsh.IsNa()) {
2155  return false;
2156  }
2157 
2158  CSeqdesc_CI src(bsh, CSeqdesc::e_Source);
2159  if (!src) {
2160  // no source, don't fix
2161  return false;
2162  }
2163  const auto& bsrc = src->GetSource();
2164  if (!bsrc.IsSetOrg() || !bsrc.IsSetOrgname()) {
2165  return false;
2166  }
2167  const auto& orgname = bsrc.GetOrg().GetOrgname();
2168  if (!orgname.IsSetGcode() && !orgname.IsSetMgcode() && !orgname.IsSetPgcode()) {
2169  return false;
2170  }
2171  int bioseqGenCode = src->GetSource().GetGenCode();
2172 
2173  bool any_changed = false;
2174  // set Cdregion's gcode from BioSource (unless except-text)
2176  CFeat_CI feat_ci(bsh, sel);
2177  for (; feat_ci; ++feat_ci) {
2178  const CSeq_feat& feat = feat_ci->GetOriginalFeature();
2179  const CCdregion& cds = feat.GetData().GetCdregion();
 // an unset code is treated as id 0 for the comparison
2180  int cdregionGenCode = (cds.IsSetCode() ?
2181  cds.GetCode().GetId() :
2182  0);
2183  if (cdregionGenCode != bioseqGenCode)
2184  {
2185  // make cdregion's gencode match bioseq's gencode,
2186  // if allowed
2187  if (!feat.HasExceptionText("genetic code exception"))
2188  {
 // edit a copy and swap it in through the edit handle
2189  CRef<CSeq_feat> new_feat(new CSeq_feat);
2190  new_feat->Assign(feat);
2191  CCdregion& new_cds = new_feat->SetData().SetCdregion();
2192  new_cds.ResetCode();
2193  new_cds.SetCode().SetId(bioseqGenCode);
2194  CSeq_feat_EditHandle edit_handle(*feat_ci);
2195  edit_handle.Replace(*new_feat);
2196  any_changed = true;
2197  }
2198  }
2199  }
2200  return any_changed;
2201 }
2202 
2203 
2204 // return position of " [" + sOrganism + "]", but only if it's
2205 // at the end and there are characters before it.
2206 // Also, returns the position of the organelle prefix in the title.
2208  const string & sTitle,
2209  const string & sOrganism,
2210  SIZE_TYPE& OrganellePos)
2211 {
 // NOTE(review): the first signature line (2207) and parts of the organelle
 // loop (lines 2234, 2238, 2241) are not visible in this listing.
 // Returns the position of " [<sOrganism>]" (case-insensitive) in sTitle, or
 // NPOS when it is absent or starts at position 0.
 // `OrganellePos` is set to NPOS, or to the position of an " (<organelle>)"
 // prefix that ends exactly where the organism pattern begins.
2212  OrganellePos = NPOS;
2213 
2214  SIZE_TYPE answer = NPOS;
2215 
2216  const string sPattern = " [" + sOrganism + "]";
2217  if (NStr::EndsWith(sTitle, sPattern, NStr::eNocase)) {
2218  answer = sTitle.length() - sPattern.length();
2219  if (answer < 1) {
2220  // title must have something before the pattern
2221  answer = NPOS;
2222  }
2223  } else {
 // NOTE(review): this branch accepts the last occurrence of the pattern
 // anywhere in the title, not only at the end -- confirm this is intended.
2224  answer = NStr::Find(sTitle, sPattern, NStr::eNocase, NStr::eReverseSearch);
2225  if (answer < 1 || answer == NPOS) {
2226  // pattern not found
2227  answer = NPOS;
2228  }
2229  }
2230 
2231  if (answer != NPOS) {
2232  // find organelle prefix
2233  for (unsigned int genome = CBioSource::eGenome_chloroplast;
2235  genome++) {
2236  if (genome != CBioSource::eGenome_extrachrom &&
2237  genome != CBioSource::eGenome_transposon &&
2239  genome != CBioSource::eGenome_proviral &&
2240  genome != CBioSource::eGenome_virion &&
2242  {
2243  string organelle = " (" + CBioSource::GetOrganelleByGenome(genome) + ")";
2244  SIZE_TYPE possible_organelle_start_pos = NStr::Find(sTitle, organelle, NStr::eNocase, NStr::eReverseSearch);
 // the organelle text must sit immediately before the organism pattern
2245  if (possible_organelle_start_pos != NPOS &&
2246  NStr::EndsWith(CTempString(sTitle, 0, answer), organelle)) {
2247  OrganellePos = possible_organelle_start_pos;
2248  break;
2249  }
2250 
2251  }
2252  }
2253  }
2254  return answer;
2255 }
2256 
2257 
2259  const string & sTitle,
2260  const COrgName::TName& orgname,
2261  SIZE_TYPE &organelle_pos)
2262 {
 // NOTE(review): the first signature line (2258) is not visible in this
 // listing.
 // Binomial variant: when `orgname` is a binomial with non-blank genus and
 // species, searches sTitle for "<genus> <species>" through the string
 // overload of s_TitleEndsInOrganism and returns its result.
 // Returns NPOS (with organelle_pos == NPOS) in every other case.
2263  SIZE_TYPE suffixPos = NPOS; // will point to " [${organism name}]" at end
2264  organelle_pos = NPOS;
2265 
2266  if (orgname.IsBinomial() &&
2267  orgname.GetBinomial().IsSetGenus() &&
2268  !NStr::IsBlank(orgname.GetBinomial().GetGenus()) &&
2269  orgname.GetBinomial().IsSetSpecies() &&
2270  !NStr::IsBlank(orgname.GetBinomial().GetSpecies())) {
2271  string binomial = orgname.GetBinomial().GetGenus() + " " + orgname.GetBinomial().GetSpecies();
2272  suffixPos = s_TitleEndsInOrganism(sTitle, binomial, organelle_pos);
2273  }
2274  return suffixPos;
2275 }
2276 
2277 
// Reports whether `org` names two different superkingdoms.
// Only a "partial" Orgname is examined: its tax elements with fixed level 0,
// level text "superkingdom" (case-insensitive) and a non-blank name are
// scanned; the first such name is stored in `first_kingdom`, and the first
// later name that differs (case-insensitive) is stored in `second_kingdom`.
// Both output strings are cleared on entry.
// Returns true only when a second, different superkingdom was found.
// NOTE(review): the loop header (line 2286) is not visible in this listing.
2278 bool IsCrossKingdom(const COrg_ref& org, string& first_kingdom, string& second_kingdom)
2279 {
2280  bool is_cross_kingdom = false;
2281  first_kingdom = kEmptyStr;
2282  second_kingdom = kEmptyStr;
2283  if (org.IsSetOrgname() && org.GetOrgname().IsSetName() &&
2284  org.GetOrgname().GetName().IsPartial() &&
2285  org.GetOrgname().GetName().GetPartial().IsSet()) {
2287  const CTaxElement& te = **it;
2288  if (te.IsSetFixed_level() && te.GetFixed_level() == 0 &&
2289  te.IsSetLevel() &&
2290  NStr::EqualNocase(te.GetLevel(), "superkingdom") &&
2291  te.IsSetName() && !NStr::IsBlank(te.GetName())) {
2292  if (first_kingdom.empty()) {
2293  first_kingdom = te.GetName();
2294  } else if (!NStr::EqualNocase(first_kingdom, te.GetName())) {
2295  is_cross_kingdom = true;
2296  second_kingdom = te.GetName();
2297  break;
2298  }
2299  }
2300  }
2301  }
2302  return is_cross_kingdom;
2303 }
2304 
2305 
2306 bool IsCrossKingdom(const COrg_ref& org)
2307 {
2308  string first_kingdom, second_kingdom;
2309  return IsCrossKingdom(org, first_kingdom, second_kingdom);
2310 }
2311 
2312 
2314  const string & sTitle,
2315  const COrg_ref& org,
2316  SIZE_TYPE &organelle_pos)
2317 {
 // NOTE(review): the first signature line (2313) is not visible in this
 // listing.
 // Org-ref variant: tries, in order, each non-blank old-name modifier, the
 // taxname, the binomial name, and finally (for cross-kingdom organisms) the
 // taxname against the part of the title up to the first "][".
 // Returns the first position found, or NPOS when nothing matches;
 // `organelle_pos` is filled in by the overload that produced the match.
2318  SIZE_TYPE suffixPos = NPOS; // will point to " [${organism name}]" at end
2319  organelle_pos = NPOS;
2320 
2321  // first, check to see if protein title matches old-name
2322  if (org.IsSetOrgMod()) {
2323  ITERATE(COrgName::TMod, it, org.GetOrgname().GetMod()) {
2324  if ((*it)->IsSetSubtype() && (*it)->IsSetSubname() &&
2325  (*it)->GetSubtype() == COrgMod::eSubtype_old_name &&
2326  !NStr::IsBlank((*it)->GetSubname())) {
2327  suffixPos = s_TitleEndsInOrganism(sTitle, (*it)->GetSubname(), organelle_pos);
2328  if (suffixPos != NPOS) {
2329  return suffixPos;
2330  }
2331  }
2332  }
2333  }
2334 
2335  // next, check to see if protein title matches taxname
2336  if (org.IsSetTaxname() && !NStr::IsBlank(org.GetTaxname())) {
2337  suffixPos = s_TitleEndsInOrganism(sTitle, org.GetTaxname(), organelle_pos);
2338  if (suffixPos != NPOS) {
2339  return suffixPos;
2340  }
2341  }
2342 
2343  // try binomial if preset
2344  if (org.IsSetOrgname() && org.GetOrgname().IsSetName() &&
2345  org.GetOrgname().GetName().IsBinomial()) {
2346  suffixPos = s_TitleEndsInOrganism(sTitle, org.GetOrgname().GetName(), organelle_pos);
2347  if (suffixPos != NPOS) {
2348  return suffixPos;
2349  }
2350  }
2351 
2352  // cross-kingdom?
2353  if (IsCrossKingdom(org)) {
 // the title is cut just after the first ']' of "][" so that the first
 // bracketed organism becomes the end of the string being searched
 // NOTE(review): GetTaxname() is called here without an IsSetTaxname()
 // check -- confirm it is safe when the taxname is unset.
2354  SIZE_TYPE sep = NStr::Find(sTitle, "][");
2355  if (sep != string::npos) {
2356  suffixPos = s_TitleEndsInOrganism(sTitle.substr(0, sep + 1), org.GetTaxname(), organelle_pos);
2357  }
2358  }
2359  return suffixPos;
2360 }
2361 
2362 
// Strips a trailing bracketed organism name from protein names.
// Visits every name on every protein feature in the feature tables of `seq`
// and, when a name ends in "[<taxname>]", erases that bracketed suffix.
// Names shorter than 5 characters, names not ending in ']', and suffixes
// starting with "NAD" are left untouched. Does nothing for an empty taxname.
// NOTE(review): two lines are not visible in this listing (2383, the
// computation of `cp`, presumably the position of the opening '['; and 2390,
// a step between erase() and the write-back).
2363 static void s_RemoveOrgFromEndOfProtein(CBioseq& seq, string taxname)
2364 
2365 {
2366  if (taxname.empty()) return;
2367  SIZE_TYPE taxlen = taxname.length();
2368 
2369  EDIT_EACH_SEQANNOT_ON_BIOSEQ(annot_it, seq) {
2370  CSeq_annot& annot = **annot_it;
2371  if (!annot.IsFtable()) continue;
2372  EDIT_EACH_FEATURE_ON_ANNOT(feat_it, annot) {
2373  CSeq_feat& feat = **feat_it;
2374  CSeqFeatData& data = feat.SetData();
2375  if (!data.IsProt()) continue;
2376  CProt_ref& prot_ref = data.SetProt();
2377  EDIT_EACH_NAME_ON_PROTREF(it, prot_ref) {
 // work on a copy; the stored name is overwritten only on a match
2378  string str = *it;
2379  if (str.empty()) continue;
2380  auto len = str.length();
2381  if (len < 5) continue;
2382  if (str[len - 1] != ']') continue;
2384  if (cp == NPOS) continue;
2385  string suffix = str.substr(cp + 1);
2386  if (NStr::StartsWith(suffix, "NAD")) continue;
 // suffix is "<text>]", so an exact match is taxname plus the closing ']'
2387  if (suffix.length() != taxlen + 1) continue;
2388  if (NStr::StartsWith(suffix, taxname)) {
2389  str.erase(cp);
2391  *it = str;
2392  }
2393  }
2394  }
2395  }
2396 }
2397 
2399 {
2400  // Bail if not protein
2401  if (!bioseq.IsSetInst() || !bioseq.GetInst().IsSetMol() || !bioseq.GetInst().IsAa()) {
2402  return false;
2403  }
2404 
2405  // Bail if record is swissprot
2406  FOR_EACH_SEQID_ON_BIOSEQ(seqid_itr, bioseq) {
2407  if ((*seqid_itr)->IsSwissprot()) {
2408  return false;
2409  }
2410  }
2411 
2412  // gather some info from the Seqdesc's on the bioseq, into
2413  // the following variables
2414  bool bPartial = false;
2415  string organelle;
2416 
2417  CConstRef<CSeqdesc> molinfo_desc;
2418  CConstRef<CSeqdesc> src_desc;
2419  FOR_EACH_SEQDESC_ON_BIOSEQ(descr_iter, bioseq) {
2420  if (!molinfo_desc && (*descr_iter)->IsMolinfo()) {
2421  molinfo_desc = *descr_iter;
2422  }
2423  if (!src_desc && (*descr_iter)->IsSource()) {
2424  src_desc = *descr_iter;
2425  }
2426  if (molinfo_desc && src_desc) {
2427  break;
2428  }
2429  }
2430  if (!molinfo_desc || !src_desc) {
2431  // climb up to get parent Seqdescs
2432  CConstRef<CBioseq_set> bioseq_set(bioseq.GetParentSet());
2433  for (; bioseq_set; bioseq_set = bioseq_set->GetParentSet()) {
2434  FOR_EACH_SEQDESC_ON_SEQSET(descr_iter, *bioseq_set) {
2435  if (!molinfo_desc && (*descr_iter)->IsMolinfo()) {
2436  molinfo_desc = *descr_iter;
2437  }
2438  if (!src_desc && (*descr_iter)->IsSource()) {
2439  src_desc = *descr_iter;
2440  }
2441  if (molinfo_desc && src_desc) {
2442  break;
2443  }
2444  }
2445  if (molinfo_desc && src_desc) {
2446  break;
2447  }
2448  }
2449  }
2450 
2451  if (molinfo_desc && molinfo_desc->GetMolinfo().IsSetCompleteness()) {
2452  switch (molinfo_desc->GetMolinfo().GetCompleteness()) {
2453  case NCBI_COMPLETENESS(partial):
2454  case NCBI_COMPLETENESS(no_left):
2455  case NCBI_COMPLETENESS(no_right):
2456  case NCBI_COMPLETENESS(no_ends):
2457  bPartial = true;
2458  break;
2459  default:
2460  break;
2461  }
2462  }
2463 
2464  CConstRef<COrg_ref> org;
2465  if (src_desc) {
2466  const TBIOSOURCE_GENOME genome = (src_desc->GetSource().IsSetGenome() ?
2468  if (genome >= CBioSource::eGenome_chloroplast &&
2470  genome != CBioSource::eGenome_extrachrom &&
2471  genome != CBioSource::eGenome_transposon &&
2473  genome != CBioSource::eGenome_proviral &&
2474  genome != CBioSource::eGenome_virion &&
2476  {
2477  organelle = CBioSource::GetOrganelleByGenome(genome);
2478  }
2479 
2480  if (src_desc->GetSource().IsSetOrg()) {
2481  org.Reset(&(src_desc->GetSource().GetOrg()));
2482  }
2483  }
2484 
2485  if (!org) {
2486  return false;
2487  }
2488  if (org->IsSetTaxname() && !NStr::IsBlank(org->GetTaxname())) {
2489  s_RemoveOrgFromEndOfProtein(bioseq, org->GetTaxname());
2490  }
2491 
2492  // find the title to edit
2493  if (!bioseq.IsSetDescr()) {
2494  return false;
2495  }
2496  CRef<CSeqdesc> title_desc;
2497  NON_CONST_ITERATE(CBioseq::TDescr::Tdata, d, bioseq.SetDescr().Set()) {
2498  if ((*d)->IsTitle()) {
2499  title_desc = *d;
2500  }
2501  }
2502  if (!title_desc) {
2503  return false;
2504  }
2505  string & sTitle = title_desc->SetTitle();
2506  // remember original so we can see if we changed it
2507  const string sOriginalTitle = sTitle;
2508 
2509  // search for partial, must be just before bracketed organism
2510  SIZE_TYPE partialPos = NStr::Find(sTitle, ", partial [");
2511  if (partialPos == NPOS) {
2512  partialPos = NStr::Find(sTitle, ", partial (");
2513  }
2514 
2515  // find oldname or taxname in brackets at end of protein title
2516  SIZE_TYPE penult = NPOS;
2517  SIZE_TYPE suffixPos = s_TitleEndsInOrganism(sTitle, *org, penult); // will point to " [${organism name}]" at end
2518  // do not change unless [genus species] was at the end
2519  if (suffixPos == NPOS) {
2520  return false;
2521  }
2522 
2523  // truncate bracketed info from end of title, will replace with current taxname
2524  sTitle.resize(suffixPos);
2525  if (penult != NPOS) {
2526  sTitle.resize(penult);
2527  }
2528 
2529  // if ", partial [" was indeed just before the [genus species], it will now be ", partial"
2530  // Note: 9 is length of ", partial"
2531  if (!bPartial &&
2532  partialPos != string::npos &&
2533  (partialPos == (sTitle.length() - 9)))
2534  {
2535  sTitle.resize(partialPos);
2536  }
2538 
2539  //
2540  if (bPartial && partialPos == NPOS) {
2541  sTitle += ", partial";
2542  }
2543  if (!NStr::IsBlank(organelle)) {
2544  sTitle += " (" + string(organelle) + ")";
2545  }
2546  string first_kingdom, second_kingdom;
2547  if (IsCrossKingdom(*org, first_kingdom, second_kingdom)) {
2548  sTitle += " [" + first_kingdom + "][" + second_kingdom + "]";
2549  } else {
2550  sTitle += " [";
2551  if (org->IsSetTaxname()) {
2552  sTitle += org->GetTaxname();
2553  }
2554  sTitle += "]";
2555  }
2556 
2557  if (sTitle != sOriginalTitle) {
2558  return true;
2559  } else {
2560  return false;
2561  }
2562 }
2563 
2565 {
2566  if (!sequence::IsPseudo(cds, scope) ||
2567  !cds.IsSetData() || !cds.GetData().IsCdregion() ||
2568  !cds.IsSetProduct()) {
2569  return false;
2570  }
2571  CBioseq_Handle pseq = scope.GetBioseqHandle(cds.GetProduct());
2572  if (pseq) {
2574  if (prot) {
2575  string label;
2576  if (prot->GetData().GetProt().IsSetName() &&
2577  !prot->GetData().GetProt().GetName().empty()) {
2578  label = prot->GetData().GetProt().GetName().front();
2579  } else if (prot->GetData().GetProt().IsSetDesc()) {
2580  label = prot->GetData().GetProt().GetDesc();
2581  }
2582  if (!NStr::IsBlank(label)) {
2583  if (cds.IsSetComment() && !NStr::IsBlank(cds.GetComment())) {
2584  cds.SetComment(cds.GetComment() + "; " + label);
2585  } else {
2586  cds.SetComment(label);
2587  }
2588  }
2589  }
2590  CBioseq_EditHandle pseq_e(pseq);
2591  pseq_e.Remove();
2592  }
2593  cds.ResetProduct();
2594  return true;
2595 }
2596 
2597 
2599 {
2600  if (!gene.IsSetXref() || !gene.IsSetLocation() || !gene.GetLocation().IsInt()) {
2601  return false;
2602  }
2603  bool any_change = false;
2604  TSeqPos gene_start = gene.GetLocation().GetStart(eExtreme_Positional);
2605  TSeqPos gene_stop = gene.GetLocation().GetStop(eExtreme_Positional);
2606  ITERATE(CSeq_feat::TXref, xit, gene.GetXref()) {
2607  if ((*xit)->IsSetId() && (*xit)->GetId().IsLocal()) {
2608  const CTSE_Handle::TFeatureId& feat_id = (*xit)->GetId().GetLocal();
2611  TSeqPos f_start = f->GetLocation().GetStart(eExtreme_Positional);
2612  TSeqPos f_stop = f->GetLocation().GetStop(eExtreme_Positional);
2613  if (f_start < gene_start) {
2614  gene.SetLocation().SetInt().SetFrom(f_start);
2615  gene_start = f_start;
2616  any_change = true;
2617  }
2618  if (f_stop > gene_stop) {
2619  gene.SetLocation().SetInt().SetTo(f_stop);
2620  gene_stop = f_stop;
2621  any_change = true;
2622  }
2623  }
2624  }
2625  }
2626  return any_change;
2627 }
2628 
2629 
2630 typedef pair<size_t, bool> TRNALength;
2632 
2634  { "16S", { 1000, false } },
2635  { "18S", { 1000, false } },
2636  { "23S", { 2000, false } },
2637  { "25S", { 1000, false } },
2638  { "26S", { 1000, false } },
2639  { "28S", { 3300, false } },
2640  { "small", { 1000, false } },
2641  { "large", { 1000, false } },
2642  { "5.8S", { 130, true } },
2643  { "5S", { 90, true } }
2644  // possible problem: if it matches /25S/ it would also match /5S/
2645  // luckily, if it fails the /5S/ rule it would fail the /25S/ rule
2646 };
2647 
2648 
// Report whether an rRNA feature is shorter than the minimum expected for
// its product name.
//
// f      feature to test; anything whose subtype is not rRNA yields false
// scope  passed to sequence::GetLength() to measure the feature location
//
// The product name comes from the RNA-ref; when that is empty, the first
// qualifier named "product" is used instead. The name is then matched
// case-insensitively against the key of each entry `it` refers to.
2649 static bool s_CleanupIsShortrRNA(const CSeq_feat& f, CScope* scope) // used in feature_tests.cpp
2650 {
2651  if (f.GetData().GetSubtype() != CSeqFeatData::eSubtype_rRNA) {
2652  return false;
2653  }
2654  bool is_bad = false;
2655  size_t len = sequence::GetLength(f.GetLocation(), scope);
2656  const CRNA_ref& rrna = f.GetData().GetRna();
2657  string rrna_name = rrna.GetRnaProductName();
2658  if (rrna_name.empty()) {
2659  // RNA name may still be in product GBQual
2660  if (f.IsSetQual()) {
2661  for (auto qit : f.GetQual()) {
2662  const CGb_qual& gbq = *qit;
// NOTE(review): GetVal() is read without an IsSetVal() check -- confirm that
// a "product" qualifier always carries a value here.
2663  if ( gbq.IsSetQual() && gbq.GetQual() == "product" ) {
2664  rrna_name = gbq.GetVal();
2665  break;
2666  }
2667  }
2668  }
2669  }
// NOTE(review): the loop header declaring `it` (listing line 2670) is not
// visible in this listing; presumably it walks the name -> (minimum length,
// flag) table defined above -- confirm.
2671  SIZE_TYPE pos = NStr::FindNoCase(rrna_name, it->first);
// Too short for a matching name, unless the entry's flag is set and the
// feature itself is marked partial. The first such hit ends the search.
2672  if (pos != string::npos && len < it->second.first && !(it->second.second && f.IsSetPartial() && f.GetPartial()) ) {
2673  is_bad = true;
2674  break;
2675  }
2676  }
2677  return is_bad;
2678 }
2679 
// Run the WGS cleanup pass over an entry and report whether anything changed.
//
// Stages, in the order they appear below:
//   1. features selected by `sel` (coding regions, judging by the variable
//      names): pseudo ones lose their product; the others get frame and
//      partials fixed, are retranslated or given a product, have IUPAC
//      protein data re-stored as ncbieaa, get a protein name, and have the
//      matching mRNA's name and partials brought in line;
//   2. RNA features accepted by s_CleanupIsShortrRNA(): location ends may be
//      marked partial;
//   3. genes: expanded to include children, partials set from the longest
//      contained feature;
//   4. descriptor order is normalized and genetic codes are set on
//      nucleotide Bioseqs;
//   5. optionally, ExtendedCleanup().
//
// entry                         entry to clean (edited in place via handles)
// instantiate_missing_proteins  when true, AddProtein() is called for a CDS
//                               whose product is set but was not found in
//                               this entry
// options                       forwarded to ExtendedCleanup()
// run_extended_cleanup          when true, ExtendedCleanup() runs last
//
// Returns true if ExtendedCleanup() reported changes, otherwise any_changes.
2680 bool CCleanup::WGSCleanup(CSeq_entry_Handle entry, bool instantiate_missing_proteins, Uint4 options, bool run_extended_cleanup)
2681 {
2682  bool any_changes = false;
2683 
2684  int protein_id_counter = 1;
2685  bool create_general_only = edit::IsGeneralIdProtPresent(entry.GetTopLevelEntry());
// NOTE(review): the declaration of `sel` (listing line 2686) is not visible
// in this listing.
2687  for (CFeat_CI cds_it(entry, sel); cds_it; ++cds_it) {
2688  bool change_this_cds = false;
// All edits go to a private copy; the original feature is replaced only at
// the end of the iteration, and only if change_this_cds was set.
2689  CRef<CSeq_feat> new_cds(new CSeq_feat());
2690  new_cds->Assign(*(cds_it->GetSeq_feat()));
2691  if (sequence::IsPseudo(*(cds_it->GetSeq_feat()), entry.GetScope())) {
2692  change_this_cds = RemovePseudoProduct(*new_cds, entry.GetScope());
2693  } else {
// Name is captured before frame/partial fixes and retranslation.
2694  string current_name = GetProteinName(*new_cds, entry);
2695 
2696  change_this_cds |= SetBestFrame(*new_cds, entry.GetScope());
2697 
2698  change_this_cds |= SetCDSPartialsByFrameAndTranslation(*new_cds, entry.GetScope());
2699 
2700  // retranslate
2701  if (new_cds->IsSetProduct() && entry.GetScope().GetBioseqHandleFromTSE(*(new_cds->GetProduct().GetId()), entry)) {
2702  any_changes |= feature::RetranslateCDS(*new_cds, entry.GetScope());
2703  } else {
2704  // need to set product if not set
2705  if (!new_cds->IsSetProduct() && !sequence::IsPseudo(*new_cds, entry.GetScope())) {
2706  string id_label;
2707  CRef<CSeq_id> new_id = edit::GetNewProtId(entry.GetScope().GetBioseqHandle(new_cds->GetLocation()), protein_id_counter, id_label, create_general_only);
2708  if (new_id) {
2709  new_cds->SetProduct().SetWhole().Assign(*new_id);
2710  change_this_cds = true;
2711  }
2712  }
2713  if (new_cds->IsSetProduct() && instantiate_missing_proteins) {
2714  CRef<CSeq_entry> prot = AddProtein(*new_cds, entry.GetScope());
2715  if (prot) {
2716  any_changes = true;
2717  }
2718  }
2719  any_changes |= feature::AdjustForCDSPartials(*new_cds, entry);
2720  }
2721  //prefer ncbieaa
2722  if (new_cds->IsSetProduct()) {
2723  CBioseq_Handle p = entry.GetScope().GetBioseqHandle(new_cds->GetProduct());
2724  if (p && p.IsSetInst() && p.GetInst().IsSetSeq_data() && p.GetInst().GetSeq_data().IsIupacaa()) {
2725  CBioseq_EditHandle peh(p);
// Same residue string, stored under the ncbieaa choice instead of iupacaa.
2726  string current = p.GetInst().GetSeq_data().GetIupacaa().Get();
2727  CRef<CSeq_inst> new_inst(new CSeq_inst());
2728  new_inst->Assign(p.GetInst());
2729  new_inst->SetSeq_data().SetNcbieaa().Set(current);
2730  peh.SetInst(*new_inst);
2731  any_changes = true;
2732  }
2733  }
2734 
// A blank protein name becomes "hypothetical protein"; otherwise a protein
// feature is added to the product when `feat_ci` finds none.
2735  if (NStr::IsBlank(current_name)) {
2736  SetProteinName(*new_cds, "hypothetical protein", false, entry.GetScope());
2737  current_name = "hypothetical protein";
2738  change_this_cds = true;
2739  } else if (new_cds->IsSetProduct()) {
2740  CBioseq_Handle p = entry.GetScope().GetBioseqHandle(new_cds->GetProduct());
2741  if (p) {
// NOTE(review): the declaration of `feat_ci` (listing line 2742) is not
// visible in this listing.
2743  if (!feat_ci) {
2744  // make new protein feature
2745  feature::AddProteinFeature(*(p.GetCompleteBioseq()), current_name, *new_cds, entry.GetScope());
2746  }
2747  }
2748  }
2749 
2750  CConstRef<CSeq_feat> mrna = sequence::GetmRNAforCDS(*(cds_it->GetSeq_feat()), entry.GetScope());
2751  if (mrna) {
2752  bool change_mrna = false;
2753  CRef<CSeq_feat> new_mrna(new CSeq_feat());
2754  new_mrna->Assign(*mrna);
2755  // Make mRNA name match coding region protein
2756  string mrna_name = new_mrna->GetData().GetRna().GetRnaProductName();
2757  if (NStr::IsBlank(mrna_name) && new_mrna->IsSetQual()) {
2758  for (auto it = new_mrna->GetQual().begin(); it != new_mrna->GetQual().end(); it++) {
2759  if ((*it)->IsSetQual() && (*it)->IsSetVal() && NStr::EqualNocase((*it)->GetQual(), "product")) {
2760  mrna_name = (*it)->GetVal();
2761  break;
2762  }
2763  }
2764  }
// Rename when the mRNA has no name, or when the protein name is something
// other than "hypothetical protein" and differs from the mRNA name.
2765  if (NStr::IsBlank(mrna_name)
2766  || (!NStr::Equal(current_name, "hypothetical protein") &&
2767  !NStr::Equal(current_name, mrna_name))) {
2768  SetMrnaName(*new_mrna, current_name);
2769  change_mrna = true;
2770  }
2771  // Adjust mRNA partials to match coding region
2772  change_mrna |= feature::CopyFeaturePartials(*new_mrna, *new_cds);
2773  if (change_mrna) {
2774  CSeq_feat_Handle fh = entry.GetScope().GetSeq_featHandle(*mrna);
2775  CSeq_feat_EditHandle feh(fh);
2776  feh.Replace(*new_mrna);
2777  any_changes = true;
2778  }
2779  }
2780  }
2781 
2782  //any_changes |= feature::RetranslateCDS(*new_cds, entry.GetScope());
2783  if (change_this_cds) {
2784  CSeq_feat_EditHandle cds_h(*cds_it);
2785 
2786  cds_h.Replace(*new_cds);
2787  any_changes = true;
2788 
2789  //also need to redo protein title
2790  }
2791 
2792  }
2793 
2794  CTSE_Handle tse = entry.GetTSE_Handle();
2795 
2796  for (CFeat_CI rna_it(entry, SAnnotSelector(CSeqFeatData::e_Rna)); rna_it; ++rna_it) {
2797 
2798  const CSeq_feat& rna_feat = *(rna_it->GetSeq_feat());
// NOTE(review): one operand of this condition (listing line 2800) is not
// visible in this listing.
2799  if (rna_feat.IsSetData() &&
2801  s_CleanupIsShortrRNA(rna_feat, &(entry.GetScope()))) {
2802 
2803  bool change_this_rrna = false;
2804  CRef<CSeq_feat> new_rrna(new CSeq_feat());
2805  new_rrna->Assign(*(rna_it->GetSeq_feat()));
2806 
// NOTE(review): the upper bound used below is the length of the feature's own
// location (sequence::GetLength(rna_feat.GetLocation(), ...)), not visibly the
// length of the sequence it sits on -- confirm this is the intended limit.
2807  const CSeq_loc& loc = rna_feat.GetLocation();
2808  if (loc.IsSetStrand() && loc.GetStrand() == eNa_strand_minus) {
2809  if (loc.GetStart(eExtreme_Biological) >= sequence::GetLength(rna_feat.GetLocation(), &entry.GetScope())) {
2810  new_rrna->SetLocation().SetPartialStart(true, eExtreme_Biological);
2811  change_this_rrna = true;
2812  }
2813  if (loc.GetStop(eExtreme_Biological) < 1) {
2814  new_rrna->SetLocation().SetPartialStop(true, eExtreme_Biological);
2815  change_this_rrna = true;
2816  }
2817  } else {
2818  if (loc.GetStart(eExtreme_Biological) < 1) {
2819  new_rrna->SetLocation().SetPartialStart(true, eExtreme_Biological);
2820  change_this_rrna = true;
2821  }
2822  if (loc.GetStop(eExtreme_Biological) >= sequence::GetLength(rna_feat.GetLocation(), &entry.GetScope())) {
2823  new_rrna->SetLocation().SetPartialStop(true, eExtreme_Biological);
2824  change_this_rrna = true;
2825  }
2826  }
2827 
2828  if (change_this_rrna) {
2829  CSeq_feat_EditHandle rrna_h(*rna_it);
2830  rrna_h.Replace(*new_rrna);
2831  any_changes = true;
2832  }
2833  }
2834  }
2835 
2836  for (CFeat_CI gene_it(entry, SAnnotSelector(CSeqFeatData::e_Gene)); gene_it; ++gene_it) {
2837  bool change_this_gene;
2838  CRef<CSeq_feat> new_gene(new CSeq_feat());
2839  new_gene->Assign(*(gene_it->GetSeq_feat()));
2840 
2841  change_this_gene = ExpandGeneToIncludeChildren(*new_gene, tse);
2842 
2843  change_this_gene |= SetGenePartialByLongestContainedFeature(*new_gene, entry.GetScope());
2844 
2845  if (change_this_gene) {
2846  CSeq_feat_EditHandle gene_h(*gene_it);
2847  gene_h.Replace(*new_gene);
2848  any_changes = true;
2849  }
2850  }
2851 
// The result of NormalizeDescriptorOrder() is not folded into any_changes.
2852  NormalizeDescriptorOrder(entry);
2853 
2854  for (CBioseq_CI bi(entry, CSeq_inst::eMol_na); bi; ++bi) {
2855  any_changes |= SetGeneticCodes(*bi);
2856  }
2857 
2858  if (run_extended_cleanup) {
2859  auto pChanged = CCleanup::ExtendedCleanup(entry, options);
2860  if (pChanged->ChangeCount()>0) {
2861  return true;
2862  }
2863  }
2864  return any_changes;
2865 }
2866 
2867 
2868 bool CCleanup::x_HasShortIntron(const CSeq_loc& loc, size_t min_len)
2869 {
2870  CSeq_loc_CI li(loc);
2871  while (li && li.IsEmpty()) {
2872  ++li;
2873  }
2874  if (!li) {
2875  return false;
2876  }
2877  while (li) {
2878  TSeqPos prev_end;
2879  ENa_strand prev_strand;
2880  if (li.IsSetStrand() && li.GetStrand() == eNa_strand_minus) {
2881  prev_end = li.GetRange().GetFrom();
2882  prev_strand = eNa_strand_minus;
2883  } else {
2884  prev_end = li.GetRange().GetTo();
2885  prev_strand = eNa_strand_plus;
2886  }
2887  ++li;
2888  while (li && li.IsEmpty()) {
2889  ++li;
2890  }
2891  if (li) {
2892  TSeqPos this_start;
2893  ENa_strand this_strand;
2894  if (li.IsSetStrand() && li.GetStrand() == eNa_strand_minus) {
2895  this_start = li.GetRange().GetTo();
2896  this_strand = eNa_strand_minus;
2897  } else {
2898  this_start = li.GetRange().GetFrom();
2899  this_strand = eNa_strand_plus;
2900  }
2901  if (this_strand == prev_strand) {
2902  if (abs((long int)this_start - (long int)prev_end) < min_len) {
2903  return true;
2904  }
2905  }
2906  }
2907  }
2908  return false;
2909 }
2910 
2911 //LCOV_EXCL_START
2912 //not used by asn_cleanup but used by table2asn
// Exception text attached to features that span a low-quality region.
const string kLowQualitySequence("low-quality sequence region");
2914 
2916 {
2917  bool any_change = false;
2918  if (!feat.IsSetExcept()) {
2919  any_change = true;
2920  feat.SetExcept(true);
2921  }
2922  if (!feat.IsSetExcept_text() || NStr::IsBlank(feat.GetExcept_text())) {
2924  any_change = true;
2925  } else if (NStr::Find(feat.GetExcept_text(), kLowQualitySequence) == string::npos) {
2926  feat.SetExcept_text(feat.GetExcept_text() + "; " + kLowQualitySequence);
2927  any_change = true;
2928  }
2929  return any_change;
2930 }
2931 
2932 
2934 {
2935  bool any_changes = false;
2936 
2937  SAnnotSelector sel(subtype);
2938  for (CFeat_CI cds_it(entry, sel); cds_it; ++cds_it) {
2939  bool change_this_cds = false;
2940  CRef<CSeq_feat> new_cds(new CSeq_feat());
2941  new_cds->Assign(*(cds_it->GetSeq_feat()));
2942  if (!sequence::IsPseudo(*(cds_it->GetSeq_feat()), entry.GetScope()) &&
2943  x_HasShortIntron(cds_it->GetLocation())) {
2944  change_this_cds = x_AddLowQualityException(*new_cds);
2945  }
2946 
2947  if (change_this_cds) {
2948  CSeq_feat_EditHandle cds_h(*cds_it);
2949 
2950  cds_h.Replace(*new_cds);
2951  any_changes = true;
2952  }
2953  }
2954  return any_changes;
2955 }
2956 
2957 
2959 {
2960  bool any_changes = x_AddLowQualityException(entry, CSeqFeatData::eSubtype_cdregion);
2962  return any_changes;
2963 }
2964 //LCOV_EXCL_STOP
2965 
2966 
2967 // maps the type of seqdesc to the order it should be in
2968 // (lowest to highest)
2971  // Note that ordering must match ordering
2972  // in CSeqdesc::E_Choice
2973  { CSeqdesc::e_Mol_type, 13 },
2974  { CSeqdesc::e_Modif, 14 },
2975  { CSeqdesc::e_Method, 15 },
2976  { CSeqdesc::e_Name, 7 },
2977  { CSeqdesc::e_Title, 1 },
2978  { CSeqdesc::e_Org, 16 },
2979  { CSeqdesc::e_Comment, 6 },
2980  { CSeqdesc::e_Num, 11 },
2981  { CSeqdesc::e_Maploc, 9 },
2982  { CSeqdesc::e_Pir, 18 },
2983  { CSeqdesc::e_Genbank, 22 },
2984  { CSeqdesc::e_Pub, 5 },
2985  { CSeqdesc::e_Region, 10 },
2986  { CSeqdesc::e_User, 8 },
2987  { CSeqdesc::e_Sp, 17 },
2988  { CSeqdesc::e_Dbxref, 12 },
2989  { CSeqdesc::e_Embl, 21 },
2990  { CSeqdesc::e_Create_date, 24 },
2991  { CSeqdesc::e_Update_date, 25 },
2992  { CSeqdesc::e_Prf, 19 },
2993  { CSeqdesc::e_Pdb, 20 },
2994  { CSeqdesc::e_Het, 4 },
2995 
2996  { CSeqdesc::e_Source, 2 },
2997  { CSeqdesc::e_Molinfo, 3 },
2998  { CSeqdesc::e_Modelev, 23 }
2999 };
3002 
3003 static
3005  // ordering assigned to unknown
3006  const int unknown_seqdesc = static_cast<int>(1 + sc_SeqdescOrderMap.size());
3007 
3008  TSeqdescOrderMap::const_iterator find_iter = sc_SeqdescOrderMap.find(chs);
3009  if (find_iter == sc_SeqdescOrderMap.end()) {
3010  return unknown_seqdesc;
3011  }
3012 
3013  return find_iter->second;
3014 }
3015 
3016 static
3017 bool s_SeqDescLessThan(const CRef<CSeqdesc> &desc1, const CRef<CSeqdesc> &desc2)
3018 {
3019  CSeqdesc::E_Choice chs1, chs2;
3020 
3021  chs1 = desc1->Which();
3022  chs2 = desc2->Which();
3023 
3024  return (s_SeqDescToOrdering(chs1) < s_SeqDescToOrdering(chs2));
3025 }
3026 
3028 {
3029  bool rval = false;
3030  if (!seq_mac_is_sorted(descr.Set().begin(), descr.Set().end(), s_SeqDescLessThan)) {
3031  descr.Set().sort(s_SeqDescLessThan);
3032  rval = true;
3033  }
3034  return rval;
3035 }
3036 
3038 {
3039  bool rval = false;
3040 
3042  while (ci) {
3044  if (edit.IsSetDescr()) {
3045  rval |= NormalizeDescriptorOrder(edit.SetDescr());
3046  }
3047  ++ci;
3048  }
3049 
3050  return rval;
3051 }
3052 
3053 
3055 {
3056  bool removed = false;
3057  if (seq.IsSetDescr()) {
3058  CConstRef<CSeqdesc> last_title;
3060  if ((*d)->IsTitle()) {
3061  if (last_title) {
3062  seq.RemoveSeqdesc(*last_title);
3063  removed = true;
3064  }
3065  last_title.Reset(d->GetPointer());
3066  }
3067  }
3068  }
3069  return removed;
3070 }
3071 
3072 
3074 {
3075  bool removed = false;
3076  if (set.IsSetDescr()) {
3077  CConstRef<CSeqdesc> last_title;
3078  ITERATE(CBioseq::TDescr::Tdata, d, set.GetDescr().Get()) {
3079  if ((*d)->IsTitle()) {
3080  if (last_title) {
3081  set.RemoveSeqdesc(*last_title);
3082  removed = true;
3083  }
3084  last_title.Reset(d->GetPointer());
3085  }
3086  }
3087  }
3088  return removed;
3089 }
3090 
3091 
3093 {
3094  if (seh.IsSet() && seh.GetSet().IsSetClass() &&
3096  return false;
3097  }
3098  CSeq_entry_EditHandle eh(seh);
3100  return true;
3101 }
3102 
3103 
3104 void s_GetAuthorsString(string *out_authors, const CAuth_list& auth_list)
3105 {
3106  string & auth_str = *out_authors;
3107  auth_str.clear();
3108 
3109  if (!auth_list.IsSetNames()) {
3110  return;
3111  }
3112 
3113  vector<string> name_list;
3114 
3115  if (auth_list.GetNames().IsStd()) {
3116  ITERATE(CAuth_list::TNames::TStd, auth_it, auth_list.GetNames().GetStd()) {
3117  if ((*auth_it)->IsSetName()) {
3118  string label;
3119  (*auth_it)->GetName().GetLabel(&label);
3120  name_list.push_back(label);
3121  }
3122  }
3123  } else if (auth_list.GetNames().IsMl()) {
3124  copy(BEGIN_COMMA_END(auth_list.GetNames().GetMl()),
3125  back_inserter(name_list));
3126  } else if (auth_list.GetNames().IsStr()) {
3127  copy(BEGIN_COMMA_END(auth_list.GetNames().GetStr()),
3128  back_inserter(name_list));
3129  }
3130 
3131  if (name_list.size() == 0) {
3132  return;
3133  } else if (name_list.size() == 1) {
3134  auth_str = name_list.back();
3135  return;
3136  }
3137 
3138  // join most of them by commas, but the last one gets an "and"
3139  string last_author;
3140  last_author.swap(name_list.back());
3141  name_list.pop_back();
3142  // swap is faster than assignment
3143  NStr::Join(name_list, ", ").swap(auth_str);
3144  auth_str += "and ";
3145  auth_str += last_author;
3146 
3147  return;
3148 }
3149 
3150 
3152  string *out_authors_string, const CPubdesc& pd)
3153 {
3154  string & authors_string = *out_authors_string;
3155  authors_string.clear();
3156 
3157  FOR_EACH_PUB_ON_PUBDESC(pub, pd) {
3158  if ((*pub)->IsSetAuthors()) {
3159  s_GetAuthorsString(&authors_string, (*pub)->GetAuthors());
3160  break;
3161  }
3162  }
3163 }
3164 
3165 
3167 (const CPubdesc& pd,
3168 vector<TEntrezId>& pmids, vector<TEntrezId>& muids, vector<int>& serials,
3169 vector<string>& published_labels,
3170 vector<string>& unpublished_labels)
3171 {
3172  string label;
3173  bool is_published = false;
3174  bool need_label = false;
3175 
3176  if (!pd.IsSetPub()) {
3177  return;
3178  }
3179  ITERATE(CPubdesc::TPub::Tdata, it, pd.GetPub().Get()) {
3180  if ((*it)->IsPmid()) {
3181  pmids.push_back((*it)->GetPmid());
3182  is_published = true;
3183  } else if ((*it)->IsMuid()) {
3184  muids.push_back((*it)->GetMuid());
3185  is_published = true;
3186  } else if ((*it)->IsGen()) {
3187  if ((*it)->GetGen().IsSetCit()
3188  && NStr::StartsWith((*it)->GetGen().GetCit(), "BackBone id_pub", NStr::eNocase)) {
3189  need_label = true;
3190  }
3191  if ((*it)->GetGen().IsSetSerial_number()) {
3192  serials.push_back((*it)->GetGen().GetSerial_number());
3193  if ((*it)->GetGen().IsSetCit()
3194  || (*it)->GetGen().IsSetJournal()
3195  || (*it)->GetGen().IsSetDate()) {
3196  need_label = true;
3197  }
3198  } else {
3199  need_label = true;
3200  }
3201  } else if ((*it)->IsArticle() && (*it)->GetArticle().IsSetIds()) {
3202  is_published = true;
3203  ITERATE(CArticleIdSet::Tdata, id, (*it)->GetArticle().GetIds().Get()) {
3204  if ((*id)->IsPubmed()) {
3205  pmids.push_back((*id)->GetPubmed());
3206  is_published = true;
3207  } else if ((*id)->IsMedline()) {
3208  muids.push_back((*id)->GetMedline());
3209  }
3210  }
3211  need_label = true;
3212  } else {
3213  need_label = true;
3214  }
3215  if (need_label && NStr::IsBlank(label)) {
3216  // create unique label
3217  (*it)->GetLabel(&label, CPub::eContent, true);
3218  string auth_str;
3219  s_GetAuthorsString(&auth_str, pd);
3220  label += "; ";
3221  label += auth_str;
3222  }
3223  }
3224  if (!NStr::IsBlank(label)) {
3225  if (is_published) {
3226  published_labels.push_back(label);
3227  } else {
3228  unpublished_labels.push_back(label);
3229  }
3230  }
3231 }
3232 
3233 
// Build a list with at most one representative CPub per publication seen on
// a Bioseq: first from its pub descriptors, then from the items `fi` yields.
//
// For each publication GetPubdescLabels() fills five scratch vectors, and a
// single new CPub is appended using the first element of the first non-empty
// one, in this priority: PMID, MUID, serial number (as a Cit-gen),
// published label (as a Cit-gen cit), unpublished label (as a Cit-gen cit).
// Nothing is appended when all five are empty.
3234 vector<CConstRef<CPub> > CCleanup::GetCitationList(CBioseq_Handle bsh)
3235 {
3236  vector<CConstRef<CPub> > pub_list;
3237 
3238  // first get descriptor pubs
3239  CSeqdesc_CI di(bsh, CSeqdesc::e_Pub);
3240  while (di) {
// Scratch vectors are local to the iteration, so every publication starts
// from empty lists.
3241  vector<TEntrezId> pmids;
3242  vector<TEntrezId> muids;
3243  vector<int> serials;
3244  vector<string> published_labels;
3245  vector<string> unpublished_labels;
3246  GetPubdescLabels(di->GetPub(), pmids, muids, serials, published_labels, unpublished_labels);
3247  if (pmids.size() > 0) {
3248  CRef<CPub> pub(new CPub());
3249  pub->SetPmid().Set(pmids[0]);
3250  pub_list.push_back(pub);
3251  } else if (muids.size() > 0) {
3252  CRef<CPub> pub(new CPub());
3253  pub->SetMuid(muids[0]);
3254  pub_list.push_back(pub);
3255  } else if (serials.size() > 0) {
3256  CRef<CPub> pub(new CPub());
3257  pub->SetGen().SetSerial_number(serials[0]);
3258  pub_list.push_back(pub);
3259  } else if (published_labels.size() > 0) {
3260  CRef<CPub> pub(new CPub());
3261  pub->SetGen().SetCit(published_labels[0]);
3262  pub_list.push_back(pub);
3263  } else if (unpublished_labels.size() > 0) {
3264  CRef<CPub> pub(new CPub());
3265  pub->SetGen().SetCit(unpublished_labels[0]);
3266  pub_list.push_back(pub);
3267  }
3268 
3269  ++di;
3270  }
3271  // now get pub features
// NOTE(review): the declaration of `fi` (listing line 3272) is not visible in
// this listing. The loop below repeats the descriptor logic verbatim on
// fi->GetData().GetPub().
3273  while (fi) {
3274  vector<TEntrezId> pmids;
3275  vector<TEntrezId> muids;
3276  vector<int> serials;
3277  vector<string> published_labels;
3278  vector<string> unpublished_labels;
3279  GetPubdescLabels(fi->GetData().GetPub(), pmids, muids, serials, published_labels, unpublished_labels);
3280  if (pmids.size() > 0) {
3281  CRef<CPub> pub(new CPub());
3282  pub->SetPmid().Set(pmids[0]);
3283  pub_list.push_back(pub);
3284  } else if (muids.size() > 0) {
3285  CRef<CPub> pub(new CPub());
3286  pub->SetMuid(muids[0]);
3287  pub_list.push_back(pub);
3288  } else if (serials.size() > 0) {
3289  CRef<CPub> pub(new CPub());
3290  pub->SetGen().SetSerial_number(serials[0]);
3291  pub_list.push_back(pub);
3292  } else if (published_labels.size() > 0) {
3293  CRef<CPub> pub(new CPub());
3294  pub->SetGen().SetCit(published_labels[0]);
3295  pub_list.push_back(pub);
3296  } else if (unpublished_labels.size() > 0) {
3297  CRef<CPub> pub(new CPub());
3298  pub->SetGen().SetCit(unpublished_labels[0]);
3299  pub_list.push_back(pub);
3300  }
3301 
3302  ++fi;
3303  }
3304  return pub_list;
3305 }
3306 
3307 
3309 {
3310  bool any_change = false;
3311  CSeq_descr::Tdata::iterator it1 = descr.Set().begin();
3312  while (it1 != descr.Set().end()) {
3313  if ((*it1)->IsPub()) {
3314  CSeq_descr::Tdata::iterator it2 = it1;
3315  ++it2;
3316  while (it2 != descr.Set().end()) {
3317  if ((*it2)->IsPub() && (*it1)->GetPub().Equals((*it2)->GetPub())) {
3318  it2 = descr.Set().erase(it2);
3319  any_change = true;
3320  } else {
3321  ++it2;
3322  }
3323  }
3324  }
3325  ++it1;
3326  }
3327  return any_change;
3328 }
3329 
3330 
3331 bool s_FirstPubMatchesSecond(const CPubdesc& pd1, const CPubdesc& pd2)
3332 {
3333  if (pd1.Equals(pd2)) {
3334  return true;
3335  } else if (pd1.IsSetPub() && pd2.IsSetPub() && pd1.GetPub().Get().size() == 1) {
3336  ITERATE(CPubdesc::TPub::Tdata, it, pd2.GetPub().Get()) {
3337  if (pd1.GetPub().Get().front()->Equals(**it)) {
3338  return true;
3339  }
3340  }
3341  }
3342  return false;
3343 }
3344 
3345 
3346 bool CCleanup::PubAlreadyInSet(const CPubdesc& pd, const CSeq_descr& descr)
3347 {
3348  ITERATE(CSeq_descr::Tdata, d, descr.Get()) {
3349  if ((*d)->IsPub() && s_FirstPubMatchesSecond(pd, (*d)->GetPub())) {
3350  return true;
3351  }
3352  }
3353  return false;
3354 }
3355 
3356 
3358 {
3359  bool is_embl_or_ddbj = false;
3360  ITERATE(CBioseq::TId, id, b.GetId()) {
3361  if ((*id)->IsEmbl() || (*id)->IsDdbj()) {
3362  is_embl_or_ddbj = true;
3363  break;
3364  }
3365  }
3366  return !is_embl_or_ddbj;
3367 }
3368 
3369 
3371 {
3372  if (pd.IsSetNum() || pd.IsSetName() || pd.IsSetFig() || pd.IsSetComment()) {
3373  return false;
3374  } else {
3375  return true;
3376  }
3377 }
3378 
3379 
3381 {
3382  // add descriptor to nuc-prot parent or sequence itself
3383  CBioseq_set_Handle parent = b.GetParentBioseq_set();
3384  if (!CCleanup::OkToPromoteNpPub(*(b.GetCompleteBioseq()))) {
3385  // add to sequence
3386  CBioseq_EditHandle eh(b);
3387  eh.AddSeqdesc(*d);
3390  } else if (parent && parent.IsSetClass() &&
3391  parent.GetClass() == CBioseq_set::eClass_nuc_prot &&
3392  parent.IsSetDescr() && PubAlreadyInSet(d->GetPub(), parent.GetDescr())) {
3393  // don't add descriptor, just delete feature
3394  } else if (OkToPromoteNpPub((d)->GetPub()) &&
3395  parent && parent.IsSetClass() &&
3396  parent.GetClass() == CBioseq_set::eClass_nuc_prot) {
3397  CBioseq_set_EditHandle eh(parent);
3398  eh.AddSeqdesc(*d);
3401  } else {
3402  CBioseq_EditHandle eh(b);
3403  eh.AddSeqdesc(*d);
3406  }
3407  if (remove_feat) {
3408  // remove feature
3409  CSeq_feat_EditHandle feh(feat);
3410  feh.Remove();
3411  }
3412 }
3413 
3414 
3416 {
3417  bool any_change = false;
3418  for (CBioseq_CI b(seh); b; ++b) {
3419  for (CFeat_CI p(*b, CSeqFeatData::e_Pub); p; ++p) {
3420  if (p->GetLocation().IsInt() &&
3421  p->GetLocation().GetStart(eExtreme_Biological) == 0 &&
3422  p->GetLocation().GetStop(eExtreme_Biological) == b->GetBioseqLength() - 1) {
3423  CRef<CSeqdesc> d(new CSeqdesc());
3424  d->SetPub().Assign(p->GetData().GetPub());
3425  if (p->IsSetComment()) {
3426  if (d->GetPub().IsSetComment() && !NStr::IsBlank(d->GetPub().GetComment())) {
3427  d->SetPub().SetComment(d->GetPub().GetComment() + "; " + p->GetComment());
3428  } else {
3429  d->SetPub().SetComment();
3430  }
3431  }
3432  MoveOneFeatToPubdesc(*p, d, *b);
3433  any_change = true;
3434  }
3435  }
3436  }
3437  return any_change;
3438 }
3439 
3440 
3441 bool IsSiteRef(const CSeq_feat& sf)
3442 {
3443  if (sf.GetData().IsImp() &&
3444  sf.GetData().GetImp().IsSetKey() &&
3445  NStr::Equal(sf.GetData().GetImp().GetKey(), "Site-ref")) {
3446  return true;
3447  } else {
3448  return false;
3449  }
3450 }
3451 
3452 
3453 bool CCleanup::IsMinPub(const CPubdesc& pd, bool is_refseq_prot)
3454 {
3455  if (!pd.IsSetPub()) {
3456  return true;
3457  }
3458  bool found_non_minimal = false;
3459  ITERATE(CPubdesc::TPub::Tdata, it, pd.GetPub().Get()) {
3460  if ((*it)->IsMuid() || (*it)->IsPmid()) {
3461  if (is_refseq_prot) {
3462  found_non_minimal = true;
3463  break;
3464  }
3465  } else if ((*it)->IsGen()) {
3466  const CCit_gen& gen = (*it)->GetGen();
3467  if (gen.IsSetCit() && !gen.IsSetJournal() &&
3468  !gen.IsSetAuthors() && !gen.IsSetVolume() &&
3469  !gen.IsSetPages()) {
3470  //minimalish, keep looking
3471  } else {
3472  found_non_minimal = true;
3473  }
3474  } else {
3475  found_non_minimal = true;
3476  break;
3477  }
3478  }
3479 
3480  return !found_non_minimal;
3481 }
3482 
3483 
3485 {
3486  bool found_site_ref = false;
3488  while (f && !found_site_ref) {
3489  if (IsSiteRef(*(f->GetSeq_feat()))) {
3490  found_site_ref = true;
3491  }
3492  ++f;
3493  }
3494  if (!found_site_ref) {
3495  return false;
3496  }
3497 
3498  bool any_change = false;
3499  for (CBioseq_CI b(seh); b; ++b) {
3500  bool is_refseq_prot = false;
3501  if (b->IsAa()) {
3502  ITERATE(CBioseq::TId, id_it, b->GetCompleteBioseq()->GetId()) {
3503  if ((*id_it)->IsOther()) {
3504  is_refseq_prot = true;
3505  break;
3506  }
3507  }
3508  }
3509 
3510  for (CFeat_CI p(*b); p; ++p) {
3511  if (!p->IsSetCit() || p->GetCit().Which() != CPub_set::e_Pub) {
3512  continue;
3513  }
3514 
3515  bool is_site_ref = IsSiteRef(*(p->GetSeq_feat()));
3516  ITERATE(CSeq_feat::TCit::TPub, c, p->GetCit().GetPub()) {
3517  CRef<CSeqdesc> d(new CSeqdesc());
3518  if ((*c)->IsEquiv()) {
3519  ITERATE(CPub_equiv::Tdata, t, (*c)->GetEquiv().Get()) {
3520  CRef<CPub> pub_copy(new CPub());
3521  pub_copy->Assign(**t);
3522  d->SetPub().SetPub().Set().push_back(pub_copy);
3523  }
3524 
3525  } else {
3526  CRef<CPub> pub_copy(new CPub());
3527  pub_copy->Assign(**c);
3528  d->SetPub().SetPub().Set().push_back(pub_copy);
3529  }
3530  if (is_site_ref) {
3532  } else {
3534  }
3535  auto changes = makeCleanupChange(0);
3536  CNewCleanup_imp pubclean(changes, 0);
3537  pubclean.BasicCleanup(d->SetPub(), ShouldStripPubSerial(*(b->GetCompleteBioseq())));
3538  if (!IsMinPub(d->SetPub(), is_refseq_prot)) {
3539  MoveOneFeatToPubdesc(*p, d, *b, false);
3540  }
3541  }
3542  if (is_site_ref) {
3543 
3544  CSeq_feat_EditHandle feh(*p);
3545  CSeq_annot_Handle annot = feh.GetAnnot();
3546 
3547  feh.Remove();
3548 
3549  // remove old annot if now empty
3551  CSeq_annot_EditHandle annot_edit(annot);
3552  annot_edit.Remove();
3553  }
3554 
3555  }
3556  any_change = true;
3557  }
3558  }
3559  return any_change;
3560 }
3561 
3562 
3564 {
3565  if (src1.IsSetOrg() && src1.GetOrg().IsSetTaxname() &&
3566  src2.IsSetOrg() && src2.GetOrg().IsSetTaxname() &&
3567  NStr::Equal(src1.GetOrg().GetTaxname(), src2.GetOrg().GetTaxname())) {
3568  return true;
3569  } else {
3570  return false;
3571  }
3572 }
3573 
3574 
3575 static bool s_SubsourceCompareC (
3576  const CRef<CSubSource>& st1,
3577  const CRef<CSubSource>& st2
3578 )
3579 
3580 {
3581  const CSubSource& sbs1 = *(st1);
3582  const CSubSource& sbs2 = *(st2);
3583 
3584  TSUBSOURCE_SUBTYPE chs1 = GET_FIELD (sbs1, Subtype);
3585  TSUBSOURCE_SUBTYPE chs2 = GET_FIELD (sbs2, Subtype);
3586 
3587  if (chs1 < chs2) return true;
3588  if (chs1 > chs2) return false;
3589 
3590  if (FIELD_IS_SET (sbs2, Name)) {
3591  if (! FIELD_IS_SET (sbs1, Name)) return true;
3592  if (NStr::CompareNocase(GET_FIELD (sbs1, Name), GET_FIELD (sbs2, Name)) < 0) return true;
3593  }
3594 
3595  return false;
3596 }
3597 
3598 static bool s_SameSubtypeC(const CSubSource& s1, const CSubSource& s2)
3599 {
3600  if (!s1.IsSetSubtype() && !s2.IsSetSubtype()) {
3601  return true;
3602  } else if (!s1.IsSetSubtype() || !s2.IsSetSubtype()) {
3603  return false;
3604  } else {
3605  return s1.GetSubtype() == s2.GetSubtype();
3606  }
3607 }
3608 
3609 // close enough if second name contains the first
3610 static bool s_NameCloseEnoughC(const CSubSource& s1, const CSubSource& s2)
3611 {
3612  if (!s1.IsSetName() && !s2.IsSetName()) {
3613  return true;
3614  } else if (!s1.IsSetName() || !s2.IsSetName()) {
3615  return false;
3616  }
3617  const string& n1 = s1.GetName();
3618  const string& n2 = s2.GetName();
3619 
3620  if (NStr::Equal(n1, n2)) {
3621  return true;
3622  } else {
3623  return false;
3624  }
3625 }
3626 
3627 
// NOTE(review): the declaration line (listing line 3628) and listing lines
// 3634-3635 are absent from this extract; the body operates on an object
// named biosrc.
// Walks adjacent pairs of biosrc's subtype list and erases the earlier
// element of a pair when s_SameSubtypeC and s_NameCloseEnoughC both hold.
// Returns true if at least one element was erased.
3629 {
3630  bool res = false;
3631 
3632  // sort and remove duplicates.
3633  if (biosrc.IsSetSubtype() && biosrc.GetSubtype().size() > 1) {
3636  }
3637 
3638  // remove duplicates and subsources that contain previous values
3639  CBioSource::TSubtype::iterator s = biosrc.SetSubtype().begin();
3640  CBioSource::TSubtype::iterator s_next = s;
3641  ++s_next;
3642  while (s_next != biosrc.SetSubtype().end()) {
3643  if (s_SameSubtypeC(**s, **s_next) && s_NameCloseEnoughC(**s, **s_next)) {
  // erase() hands back the element that followed s, so s now refers to
  // the element s_next pointed at before the increment below
3644  s = biosrc.SetSubtype().erase(s);
3645  res = true;
3646  } else {
3647  ++s;
3648  }
3649  ++s_next;
3650  }
3651  }
3652 
3653  return res;
3654 }
3655 
// NOTE(review): the declaration line (listing line 3656) and listing lines
// 3661, 3667 and 3679 are absent from this extract; the body merges data from
// an object named add into an object named src1.
// Copies genome and origin from add when src1's value is unset or "unknown",
// sets the focus flag when add has it and src1 does not, appends copies of
// add's subtypes, merges the org-refs, and finally de-duplicates src1's
// subsource list. Returns true if any of the tracked steps changed src1.
3657 {
3658  bool any_change = false;
3659  // genome
3660  if ((!src1.IsSetGenome() || src1.GetGenome() == CBioSource::eGenome_unknown) &&
3662  src1.SetGenome(add.GetGenome());
3663  any_change = true;
3664  }
3665  // origin
3666  if ((!src1.IsSetOrigin() || src1.GetOrigin() == CBioSource::eOrigin_unknown) &&
3668  src1.SetOrigin(add.GetOrigin());
3669  any_change = true;
3670  }
3671  // focus
3672  if (!src1.IsSetIs_focus() && add.IsSetIs_focus()) {
3673  src1.SetIs_focus();
3674  any_change = true;
3675  }
3676 
3677  // merge subtypes
3678  if (add.IsSetSubtype()) {
3680  CRef<CSubSource> a(new CSubSource());
3681  a->Assign(**it);
3682  src1.SetSubtype().push_back(a);
3683  }
3684  any_change = true;
3685  }
3686 
  // NOTE(review): the result of this call is not folded into any_change
3687  x_MergeDupOrgRefs(src1.SetOrg(), add.GetOrg());
3688 
3689  if (s_SubSourceListUniqued(src1)) {
3690  any_change = true;
3691  }
3692 
3693  return any_change;
3694 }
3695 
3696 
// NOTE(review): the declaration line (listing line 3697) is absent from this
// extract; the body merges data from an object named add into one named on1.
// Appends a copy of every OrgMod in add, copies gcode and mgcode when on1's
// value is unset or zero and add's value is set and non-zero, and copies
// lineage and div when on1 has none. Returns true if anything was copied.
3698 {
3699  bool any_change = false;
3700 
3701  // OrgMods
3702  if (add.IsSetMod()) {
3703  ITERATE(COrgName::TMod, it, add.GetMod()) {
3704  CRef<COrgMod> a(new COrgMod());
3705  a->Assign(**it);
3706  on1.SetMod().push_back(a);
3707  }
  // reported as a change whenever add has its Mod field set,
  // without checking whether the list held any element
3708  any_change = true;
3709  }
3710 
3711  // gcode
3712  if ((!on1.IsSetGcode() || on1.GetGcode() == 0) && add.IsSetGcode() && add.GetGcode() != 0) {
3713  on1.SetGcode(add.GetGcode());
3714  any_change = true;
3715  }
3716 
3717  // mgcode
3718  if ((!on1.IsSetMgcode() || on1.GetMgcode() == 0) && add.IsSetMgcode() && add.GetMgcode() != 0) {
3719  on1.SetMgcode(add.GetMgcode());
3720  any_change = true;
3721  }
3722 
3723  // lineage
3724  if (!on1.IsSetLineage() && add.IsSetLineage()) {
3725  on1.SetLineage(add.GetLineage());
3726  any_change = true;
3727  }
3728 
3729  // div
3730  if (!on1.IsSetDiv() && add.IsSetDiv()) {
3731  on1.SetDiv(add.GetDiv());
3732  any_change = true;
3733  }
3734 
3735  return any_change;
3736 }
3737 
3738 
3739 bool HasMod(const COrg_ref& org, const string& mod)
3740 {
3741  if (!org.IsSetMod()) {
3742  return false;
3743  }
3744  ITERATE(COrg_ref::TMod, it, org.GetMod()) {
3745  if (NStr::Equal(*it, mod)) {
3746  return true;
3747  }
3748  }
3749  return false;
3750 }
3751 
3752 
// NOTE(review): the declaration line (listing line 3753) is absent from this
// extract; the body merges data from an object named add into one named org1.
// Adds modifiers that org1 does not already have (checked with HasMod),
// appends copies of all dbxrefs, appends all synonyms, and merges the orgname
// through x_MergeDupOrgNames. Returns true if any step reported a change.
3754 {
3755  bool any_change = false;
3756  // mods
3757  if (add.IsSetMod()) {
3758  ITERATE(COrg_ref::TMod, it, add.GetMod()) {
3759  if (!HasMod(org1, *it)) {
3760  org1.SetMod().push_back(*it);
3761  any_change = true;
3762  }
3763  }
3764  }
3765 
3766  // dbxrefs
  // unlike the modifiers above, dbxrefs are appended without a check
  // for an entry already present in org1
3767  if (add.IsSetDb()) {
3768  ITERATE(COrg_ref::TDb, it, add.GetDb()) {
3769  CRef<CDbtag> a(new CDbtag());
3770  a->Assign(**it);
3771  org1.SetDb().push_back(a);
3772  }
3773  any_change = true;
3774  }
3775 
3776  // synonyms
  // synonyms are likewise appended without a duplicate check
3777  if (add.IsSetSyn()) {
3778  ITERATE(COrg_ref::TSyn, it, add.GetSyn()) {
3779  org1.SetSyn().push_back(*it);
3780  }
3781  any_change = true;
3782  }
3783 
3784  if (add.IsSetOrgname()) {
3785  any_change |= x_MergeDupOrgNames(org1.SetOrgname(), add.GetOrgname());
3786  }
3787 
3788  return any_change;
3789 }
3790 
3791 
// NOTE(review): the declaration line (listing line 3792) is absent from this
// extract; the body operates on a descriptor container named seq_descr.
// For every source descriptor that has a taxname, scans the descriptors after
// it; each later source descriptor accepted by AreBioSourcesMergeable is
// merged into the earlier one, the merged source is run through
// ExtendedCleanup, and the later descriptor is erased.
// Returns true if at least one descriptor was merged away.
3793 {
3794  bool any_change = false;
3795  CSeq_descr::Tdata::iterator src1 = seq_descr.Set().begin();
3796  while (src1 != seq_descr.Set().end()) {
3797  if ((*src1)->IsSource() && (*src1)->GetSource().IsSetOrg() && (*src1)->GetSource().GetOrg().IsSetTaxname()) {
3798  CSeq_descr::Tdata::iterator src2 = src1;
3799  ++src2;
3800  while (src2 != seq_descr.Set().end()) {
3801  if ((*src2)->IsSource() &&
3802  AreBioSourcesMergeable((*src1)->GetSource(), (*src2)->GetSource())) {
3803  MergeDupBioSources((*src1)->SetSource(), (*src2)->GetSource());
3804 
3805  auto changes = makeCleanupChange(0);
3806  CNewCleanup_imp srcclean(changes, 0);
3807  srcclean.ExtendedCleanup((*src1)->SetSource());
  // erase() returns the following element, so src2 is not advanced here
3808  src2 = seq_descr.Set().erase(src2);
3809  any_change = true;
3810  } else {
3811  ++src2;
3812  }
3813  }
3814  }
3815  ++src1;
3816  }
3817  return any_change;
3818 }
3819 
3820 /// Remove duplicate biosource descriptors
// NOTE(review): the declaration line (listing line 3821) is absent from this
// extract; the body operates on a descriptor container named descr.
// Keeps the first occurrence of each distinct biosource (compared with
// Equals) and erases every later source descriptor equal to one already kept.
// Returns true if any descriptor was erased.
3822 {
3823  bool any_change = false;
  // biosources kept so far, in order of first appearance
3824  vector<CConstRef<CBioSource> > src_list;
3825  CSeq_descr::Tdata::iterator d = descr.Set().begin();
3826  while (d != descr.Set().end()) {
3827  if ((*d)->IsSource()) {
3828  bool found = false;
3829  ITERATE(vector<CConstRef<CBioSource> >, s, src_list) {
3830  if ((*d)->GetSource().Equals(**s)) {
3831  found = true;
3832  break;
3833  }
3834  }
3835  if (found) {
3836  d = descr.Set().erase(d);
3837  any_change = true;
3838  } else {
3839  CConstRef<CBioSource> src(&((*d)->GetSource()));
3840  src_list.push_back(src);
3841  ++d;
3842  }
3843  } else {
3844  ++d;
3845  }
3846  }
3847  return any_change;
3848 }
3849 
3850 
// NOTE(review): the declaration line (listing line 3851) and listing line
// 3862 are absent from this extract; the body reads a feature named f.
// Returns an empty CRef when f has no data or its data is not a biosource.
// Otherwise returns a copy of the feature's biosource to which the feature
// comment (as a new subsource) and the feature dbxrefs (as org dbxrefs) have
// been added, after running ExtendedCleanup on the copy.
3852 {
3853  if (!f.IsSetData() || !f.GetData().IsBiosrc()) {
3854  return CRef<CBioSource>();
3855  }
3856  CRef<CBioSource> src(new CBioSource());
3857  src->Assign(f.GetData().GetBiosrc());
3858 
3859  // move comment to subsource note
3860  if (f.IsSetComment()) {
3861  CRef<CSubSource> s(new CSubSource());
3863  s->SetName(f.GetComment());
3864  src->SetSubtype().push_back(s);
3865 
3866  }
3867 
3868  // move dbxrefs on feature to source
3869  if (f.IsSetDbxref()) {
3870  ITERATE(CSeq_feat::TDbxref, it, f.GetDbxref()) {
3871  CRef<CDbtag> a(new CDbtag());
3872  a->Assign(**it);
3873  src->SetOrg().SetDb().push_back(a);
3874  }
3875  }
3876  auto changes = makeCleanupChange(0);
3877  CNewCleanup_imp srcclean(changes, 0);
3878  srcclean.ExtendedCleanup(*src);
3879 
3880  return src;
3881 }
3882 
3883 
// NOTE(review): the declaration line (listing line 3884) and listing lines
// 3892, 3913-3915, 3919-3921 and 3928 are absent from this extract; the body
// iterates the bioseqs under a handle named seh.
// For each bioseq that has no source descriptor flagged by the check below,
// every biosource feature whose location is a single interval spanning the
// whole sequence is turned into a source descriptor (built by BioSrcFromFeat)
// and the feature is removed. Returns true if any feature was converted.
3885 {
3886  bool any_change = false;
3887  for (CBioseq_CI b(seh); b; ++b) {
3888  bool transgenic_or_focus = false;
3889  CSeqdesc_CI existing_src(*b, CSeqdesc::e_Source);
3890  while (existing_src && !transgenic_or_focus) {
3891  if (existing_src->GetSource().IsSetIs_focus() ||
3893  transgenic_or_focus = true;
3894  }
3895  ++existing_src;
3896  }
  // bioseqs with such a source descriptor are left untouched
3897  if (transgenic_or_focus) {
3898  continue;
3899  }
3900  for (CFeat_CI p(*b, CSeqFeatData::e_Biosrc); p; ++p) {
  // only features covering positions 0 .. length-1 qualify
3901  if (p->GetLocation().IsInt() &&
3902  p->GetLocation().GetStart(eExtreme_Biological) == 0 &&
3903  p->GetLocation().GetStop(eExtreme_Biological) == b->GetBioseqLength() - 1) {
3904  CRef<CSeqdesc> d(new CSeqdesc());
3905  d->SetSource().Assign(*(BioSrcFromFeat(*(p->GetSeq_feat()))));
3906 
3907  // add descriptor to nuc-prot parent or sequence itself
3908  CBioseq_set_Handle parent = b->GetParentBioseq_set();
3909  if (parent && parent.IsSetClass() &&
3910  parent.GetClass() == CBioseq_set::eClass_nuc_prot) {
3911  CBioseq_set_EditHandle eh(parent);
3912  eh.AddSeqdesc(*d);
3916  } else {
3917  CBioseq_EditHandle eh(*b);
3918  eh.AddSeqdesc(*d);
3922  }
3923 
3924  // remove feature
3925  CSeq_feat_EditHandle feh(*p);
3926  CSeq_annot_Handle ah = feh.GetAnnot();
3927  feh.Remove();
3929  CSeq_annot_EditHandle aeh(ah);
3930  aeh.Remove();
3931  }
3932 
3933  any_change = true;
3934  }
3935  }
3936  }
3937  return any_change;
3938 }
3939 
3940 
3941 
// NOTE(review): the declaration line (listing line 3942) is absent from this
// extract; the body iterates the features under a handle named seh.
// Pass 1 counts gene features that carry a locus / a locus_tag and gene xrefs
// (on non-gene features) that carry a locus / a locus_tag. It returns false
// as soon as gene features use both fields, or gene xrefs are seen to use the
// same field that gene features use.
// Pass 2 runs only when gene features use exactly one of the two fields and
// the gene xrefs use exactly the other one: each gene xref is rewritten so
// that its value moves into the field the gene features use.
3943 {
3944  CFeat_CI fi(seh);
3945  size_t num_gene_locus = 0;
3946  size_t num_gene_locus_tag = 0;
3947  size_t num_gene_xref_locus = 0;
3948  size_t num_gene_xref_locus_tag = 0;
3949 
3950  while (fi) {
3951  if (fi->GetData().IsGene()) {
3952  if (fi->GetData().GetGene().IsSetLocus()) {
3953  num_gene_locus++;
3954  }
3955  if (fi->GetData().GetGene().IsSetLocus_tag()) {
3956  num_gene_locus_tag++;
3957  }
3958  } else if (fi->IsSetXref()) {
3959  const CGene_ref* g = fi->GetGeneXref();
3960  if (g) {
3961  if (g->IsSetLocus()) {
3962  num_gene_xref_locus++;
3963  }
3964  if (g->IsSetLocus_tag()) {
3965  num_gene_xref_locus_tag++;
3966  }
3967  }
3968  }
  // the checks below run after every feature, so the scan stops at the
  // first feature that makes the counts inconsistent
3969  if (num_gene_locus > 0) {
3970  if (num_gene_locus_tag > 0) {
3971  return false;
3972  }
3973  if (num_gene_xref_locus > 0) {
3974  return false;
3975  }
3976  }
3977  if (num_gene_locus_tag > 0) {
3978  if (num_gene_locus > 0) {
3979  return false;
3980  }
3981  if (num_gene_xref_locus_tag > 0) {
3982  return false;
3983  }
3984  }
3985  ++fi;
3986  }
3987 
3988  bool any_change = false;
  // genes use locus_tag only, xrefs use locus only: move xref locus -> locus_tag
3989  if (num_gene_locus == 0 && num_gene_locus_tag > 0) {
3990  if (num_gene_xref_locus > 0 && num_gene_xref_locus_tag == 0) {
3991  fi.Rewind();
3992  while (fi) {
3993  if (!fi->GetData().IsGene() && fi->GetGeneXref()) {
3994  bool this_change = false;
  // edit a copy and swap it in through the edit handle
3995  CRef<CSeq_feat> new_f(new CSeq_feat());
3996  new_f->Assign(*(fi->GetSeq_feat()));
3997  NON_CONST_ITERATE(CSeq_feat::TXref, it, new_f->SetXref()) {
3998  if ((*it)->IsSetData() && (*it)->GetData().IsGene()
3999  && (*it)->GetData().GetGene().IsSetLocus()) {
4000  (*it)->SetData().SetGene().SetLocus_tag((*it)->GetData().GetGene().GetLocus());
4001  (*it)->SetData().SetGene().ResetLocus();
4002  this_change = true;
4003  }
4004  }
  // NOTE(review): unlike the mirrored branch further down, this branch
  // does not set any_change after Replace(), so the function can return
  // false although features were rewritten - confirm whether intended
4005  if (this_change) {
4006  CSeq_feat_EditHandle eh(*fi);
4007  eh.Replace(*new_f);
4008  }
4009  }
4010  ++fi;
4011  }
4012  }
  // genes use locus only, xrefs use locus_tag only: move xref locus_tag -> locus
4013  } else if (num_gene_locus > 0 && num_gene_locus_tag == 0) {
4014  if (num_gene_xref_locus == 0 && num_gene_xref_locus_tag > 0) {
4015  fi.Rewind();
4016  while (fi) {
4017  if (!fi->GetData().IsGene() && fi->GetGeneXref()) {
4018  bool this_change = false;
4019  CRef<CSeq_feat> new_f(new CSeq_feat());
4020  new_f->Assign(*(fi->GetSeq_feat()));
4021  NON_CONST_ITERATE(CSeq_feat::TXref, it, new_f->SetXref()) {
4022  if ((*it)->IsSetData() && (*it)->GetData().IsGene()
4023  && (*it)->GetData().GetGene().IsSetLocus_tag()) {
4024  (*it)->SetData().SetGene().SetLocus((*it)->GetData().GetGene().GetLocus_tag());
4025  (*it)->SetData().SetGene().ResetLocus_tag();
4026  this_change = true;
4027  }
4028  }
4029  if (this_change) {
4030  CSeq_feat_EditHandle eh(*fi);
4031  eh.Replace(*new_f);
4032  any_change = true;
4033  }
4034  }
4035  ++fi;
4036  }
4037  }
4038  }
4039  return any_change;
4040 }
4041 
4042 
// NOTE(review): the declaration line (listing line 4043) is absent from this
// extract; the body inspects the Seq-ids of a bioseq named bs.
// Starts from "strip" (true) and switches to false when any id is of type
// Embl, Ddbj, Gibbsq, Gibbmt, Pir, Swissprot, Patent, Prf, Pdb, Gpipe, Tpe or
// Tpd, or is a Genbank/Tpg id whose accession is exactly 6 characters long.
// Ids of type not_set, Local, Other, General and any unlisted type leave the
// result unchanged.
4044 {
4045  bool strip_serial = true;
4046  ITERATE(CBioseq::TId, id, bs.GetId()) {
4047  const CSeq_id& sid = **id;
4048  switch (sid.Which()) {
4049  case NCBI_SEQID(Genbank):
4050  case NCBI_SEQID(Tpg):
4051  {
4052  const CTextseq_id& tsid = *GET_FIELD(sid, Textseq_Id);
4053  if (FIELD_IS_SET(tsid, Accession)) {
4054  const string& acc = GET_FIELD(tsid, Accession);
  // only 6-character accessions keep the serial number
4055  if (acc.length() == 6) {
4056  strip_serial = false;
4057  }
4058  }
4059  }
4060  break;
4061  case NCBI_SEQID(Embl):
4062  case NCBI_SEQID(Ddbj):
4063  strip_serial = false;
4064  break;
4065  case NCBI_SEQID(not_set):
4066  case NCBI_SEQID(Local):
4067  case NCBI_SEQID(Other):
4068  case NCBI_SEQID(General):
4069  break;
4070  case NCBI_SEQID(Gibbsq):
4071  case NCBI_SEQID(Gibbmt):
4072  case NCBI_SEQID(Pir):
4073  case NCBI_SEQID(Swissprot):
4074  case NCBI_SEQID(Patent):
4075  case NCBI_SEQID(Prf):
4076  case NCBI_SEQID(Pdb):
4077  case NCBI_SEQID(Gpipe):
4078  case NCBI_SEQID(Tpe):
4079  case NCBI_SEQID(Tpd):
4080  strip_serial = false;
4081  break;
4082  default:
4083  break;
4084  }
4085  }
4086  return strip_serial;
4087 }
4088 
4089 
// NOTE(review): the declaration line (listing line 4090) and listing lines
// 4093, 4100 and 4116-4117 are absent from this extract, including the
// definitions of `entry`, `eh` and `ch` used below.
// A nuc-prot set that contains exactly one member, and that member is a
// sequence, is converted to a plain sequence; if the result has descriptors,
// RemoveUnseenTitles and NormalizeDescriptorOrder are applied.
// For the listed container set classes the function calls itself on `ch`.
// Returns true if any conversion took place.
4091 {
4092  bool change_made = false;
4094  if (seh.IsSet() && seh.GetSet().IsSetClass() &&
4095  entry->GetSet().IsSetSeq_set()) {
4096  CBioseq_set::TClass set_class = seh.GetSet().GetClass();
4097  if (set_class == CBioseq_set::eClass_nuc_prot) {
4098  if (entry->GetSet().GetSeq_set().size() == 1 &&
4099  entry->GetSet().GetSeq_set().front()->IsSeq()) {
4101  eh.ConvertSetToSeq();
4102  if (eh.GetSeq().IsSetDescr()) {
4103  RemoveUnseenTitles(eh.SetSeq());
4104  NormalizeDescriptorOrder(eh.SetSeq().SetDescr());
4105  }
4106  change_made = true;
4107  }
4108  } else if (set_class == CBioseq_set::eClass_genbank ||
4109  set_class == CBioseq_set::eClass_mut_set ||
4110  set_class == CBioseq_set::eClass_pop_set ||
4111  set_class == CBioseq_set::eClass_phy_set ||
4112  set_class == CBioseq_set::eClass_eco_set ||
4113  set_class == CBioseq_set::eClass_wgs_set ||
4114  set_class == CBioseq_set::eClass_gen_prod_set ||
4115  set_class == CBioseq_set::eClass_small_genome_set) {
4118  change_made |= RenormalizeNucProtSets(ch);
4119  }
4120  }
4121  }
4122  return change_made;
4123 }
4124 
4125 
// NOTE(review): the declaration line (listing line 4126) is absent from this
// extract; the body rewrites a string named str in place.
// Replaces the "&...;" codes listed in the transformations table with their
// replacement text. Codes that are not in the table are copied unchanged.
// str is only assigned when at least one replacement happened.
// Returns true if str was changed.
4127 {
4128 // return false;
4129  bool change_made = false;
4130 
4131  // This is more complex than you might initially think is necessary
4132  // because this needs to be as efficient as possible since it's
4133  // called on every single string in an object.
4134 
4135  SIZE_TYPE amp = str.find('&');
4136  if( NPOS == amp ) {
4137  // Check for the common case of no replacements required
4138  return change_made;
4139  }
4140 
4141  // transformations done by this function:
4142  const static struct {
4143  string src_word;
4144  string result_word;
4145  } transformations[] = {
4146  // all start with an implicit ampersand
4147  // and end with an implicit semi-colon
4148  { "amp", "&" },
4149  { "apos", "\'" },
4150  { "gt", ">" },
4151  { "lt", "<" },
4152  { "quot", "\"" },
4153  { "#13&#10", "" },
4154  { "#13;&#10", "" },
4155  { "#916", "Delta" },
4156  { "#945", "alpha" },
4157  { "#946", "beta" },
4158  { "#947", "gamma" },
4159  { "#952", "theta" },
4160  { "#955", "lambda" },
4161  { "#956", "mu" },
4162  { "#957", "nu" },
4163  { "#8201", "" },
4164  { "#8206", "" },
4165  { "#8242", "'" },
4166  { "#8594", "->" },
4167  { "#8722", "-" },
4168  { "#8710", "delta" },
4169  { "#64257", "fi" },
4170  { "#64258", "fl" },
4171  { "#65292", "," }
4172  };
4173 
4174  // Collisions should be rare enough that the CFastMutex is
4175  // faster than recreating the searcher each time this function is called
4176  static CTextFsm<int> searcher;
4177  // set searcher's state, if not already done
4178  {
4179  // just in case of the tiny chance that two threads try to prime
4180  // the searcher at the same time.
4181  static CFastMutex searcher_mtx;
4182  CFastMutexGuard searcher_mtx_guard( searcher_mtx );
4183  if( ! searcher.IsPrimed() ) {
  // NOTE(review): idx is a signed int compared against an unsigned
  // sizeof expression
4184  for( int idx = 0;
4185  idx < sizeof(transformations)/sizeof(transformations[0]);
4186  ++idx )
4187  {
4188  // match type is index into transformations array
4189  searcher.AddWord( transformations[idx].src_word, idx );
4190  }
4191  searcher.Prime();
4192  }
4193  }
4194 
4195  // a smart compiler probably won't need this manual optimization,
4196  // but just in case.
4197  const SIZE_TYPE str_len = str.length();
4198 
4199  // fill result up to the first '&'
4200  string result;
4201  result.reserve( str_len );
4202  copy( str.begin(), str.begin() + amp,
4203  back_inserter(result) );
4204 
4205  // at the start of each loop, the result is filled in
4206  // up to the ampersand (amp)
4207  while( amp != NPOS && amp < str_len ) {
4208 
4209  // find out what the ampersand code represents
4210  // (if it represents anything)
4211  int state = searcher.GetInitialState();
4212  SIZE_TYPE search_pos = (amp + 1);
  // NOTE(review): this break leaves the loop without copying the text
  // from amp onward into result; harmless when no earlier replacement
  // was made (result is then discarded), otherwise the tail would be
  // missing from the returned string - confirm
4213  if (str[search_pos] == ' ') {
4214  break;
4215  }
  // feed characters after the '&' into the matcher until ';' or the end
4216  for( ; search_pos < str_len ; ++search_pos ) {
4217  const char ch = str[search_pos];
4218  if( ch == ';' ) {
4219  break;
4220  }
4221  if( ch == '&' && state == 0 ) {
4222  --search_pos; // so we don't skip over the '&'
4223  state = searcher.GetInitialState(); // force "no-match"
4224  break;
4225  }
4226  state = searcher.GetNextState(state, ch);
4227  }
4228 
  // a known code that runs to the end of the string without a closing
  // ';' is still translated
4229  if( search_pos == str_len && searcher.IsMatchFound(state) ) {
4230  // copy the translation of the XML code:
4231  _ASSERT( searcher.GetMatches(state).size() == 1 );
4232  const int match_idx = searcher.GetMatches(state)[0];
4233  const string & result_word = transformations[match_idx].result_word;
4234  copy( result_word.begin(), result_word.end(),
4235  back_inserter(result) );
4236  change_made = true;
4237  break;
4238  }
4239 
4240  if( search_pos >= str_len ) {
4241  // we reached the end without finding anything, so
4242  // copy the rest and break
4243  copy( str.begin() + amp, str.end(),
4244  back_inserter(result) );
4245  break;
4246  }
4247 
4248  if( searcher.IsMatchFound(state) ) {
4249  // copy the translation of the XML code:
4250  _ASSERT( searcher.GetMatches(state).size() == 1 );
4251  const int match_idx = searcher.GetMatches(state)[0];
4252  const string & result_word = transformations[match_idx].result_word;
4253  copy( result_word.begin(), result_word.end(),
4254  back_inserter(result) );
4255  change_made = true;
4256  } else {
4257  // no match found, so copy the text we looked at
4258  // as-is
4259  copy( str.begin() + amp, str.begin() + search_pos + 1,
4260  back_inserter(result) );
4261  }
4262 
4263  // find next_amp
4264  if( str[search_pos] == '&' ) {
4265  // special case that occurs when there are multiple '&' together
4266  ++search_pos;
4267  result += '&';
4268  }
4269  SIZE_TYPE next_amp = str.find('&', search_pos );
4270  if( NPOS == next_amp ) {
4271  // no more amps; copy the rest and break
4272  copy( str.begin() + search_pos + 1, str.end(),
4273  back_inserter(result) );
4274  break;
4275  }
4276 
4277  // copy up to the next amp
4278  if( (search_pos + 1) < next_amp ) {
4279  copy( str.begin() + search_pos + 1, str.begin() + next_amp,
4280  back_inserter(result) );
4281  }
4282  amp = next_amp;
4283  }
4284 
4285  if (change_made) {
4286  str = result;
4287  }
4288 
4289  return change_made;
4290 }
4291 
4292 
// Maps a nucleotide location onto the protein product of the coding region
// `cds`, using `scope` for lookups.
// Returns an empty CRef when require_inframe is true and the in-frame check
// does not accept the location, when the mapper yields no location, or when
// the mapped location has no id or still has the same id as nuc_loc.
// The strand is reset on the mapped location, any part lying at or beyond the
// protein length is subtracted, and the partial flags are cleared where the
// location starts at position 0 or stops at the last residue.
// NOTE(review): listing lines 4299, 4302-4303, 4307-4308, 4312-4314, 4318,
// 4327, 4356, 4361 and 4367 are absent from this extract (switch labels, the
// mapper construction and several conditions), so those parts are not
// described here.
4293 CRef<CSeq_loc> CCleanup::GetProteinLocationFromNucleotideLocation(const CSeq_loc& nuc_loc, const CSeq_feat& cds, CScope& scope, bool require_inframe)
4294 {
4295  if (require_inframe) {
4296  feature::ELocationInFrame is_in_frame = feature::IsLocationInFrame(scope.GetSeq_featHandle(cds), nuc_loc);
4297  bool is_ok = false;
4298  switch (is_in_frame) {
4300  is_ok = true;
4301  break;
4304  is_ok = true;
4305  }
4306  break;
4309  is_ok = true;
4310  }
4311  break;
4315  is_ok = true;
4316  }
4317  break;
4319  break;
4320  }
4321  if (!is_ok) {
4322  return CRef<CSeq_loc>();
4323  }
4324  }
4325  CRef<CSeq_loc> new_loc;
4326  CRef<CSeq_loc_Mapper> nuc2prot_mapper(
4328  new_loc = nuc2prot_mapper->Map(nuc_loc);
4329  if (!new_loc) {
4330  return CRef<CSeq_loc>();
4331  }
4332 
4333  const CSeq_id* sid = new_loc->GetId();
4334  const CSeq_id* orig_id = nuc_loc.GetId();
4335  if (!sid || (orig_id && sid->Equals(*orig_id))) {
4336  // unable to map to protein location
4337  return CRef<CSeq_loc>();
4338  }
4339 
4340  new_loc->ResetStrand();
4341 
4342  // if location includes stop codon, remove it
4343  CBioseq_Handle prot = scope.GetBioseqHandle(*sid);
4344  if (prot && new_loc->GetStop(eExtreme_Positional) >= prot.GetBioseqLength())
4345  {
4346  CRef<CSeq_id> sub_id(new CSeq_id());
4347  sub_id->Assign(*sid);
  // `sub` covers everything from the protein length up to the mapped stop
4348  CSeq_loc sub(*sub_id, prot.GetBioseqLength(), new_loc->GetStop(eExtreme_Positional), new_loc->GetStrand());
4349  new_loc = sequence::Seq_loc_Subtract(*new_loc, sub, CSeq_loc::fMerge_All | CSeq_loc::fSort, &scope);
4350  if (nuc_loc.IsPartialStop(eExtreme_Biological)) {
4351  new_loc->SetPartialStop(true, eExtreme_Biological);
4352  }
4353  }
4354 
4355  if (!new_loc->IsInt() && !new_loc->IsPnt()) {
4357  new_loc = tmp;
4358  }
4359 
4360  // fix partials if protein feature starts or ends at beginning or end of protein sequence
4362  new_loc->GetStart(eExtreme_Biological) == 0) {
4363  if (new_loc->IsPartialStart(eExtreme_Biological)) {
4364  new_loc->SetPartialStart(false, eExtreme_Biological);
4365  }
4366  }
4368  new_loc->GetStop(eExtreme_Biological) == prot.GetBioseqLength() - 1) {
4369  if (new_loc->IsPartialStop(eExtreme_Biological)) {
4370  new_loc->SetPartialStop(false, eExtreme_Biological);
4371  }
4372  }
4373 
4374  return new_loc;
4375 }
4376 
4377 
// NOTE(review): the declaration line (listing line 4378) is absent from this
// extract; the body uses a location named nuc_loc and a scope named scope.
// Looks up the coding region overlapping nuc_loc and delegates to the
// three-argument form of GetProteinLocationFromNucleotideLocation.
// Returns an empty CRef when no overlapping coding region with a product
// exists.
4379 {
4380  CConstRef<CSeq_feat> cds = sequence::GetOverlappingCDS(nuc_loc, scope);
4381  if (!cds || !cds->IsSetProduct()) {
4382  // there is no overlapping coding region feature, so there is no appropriate
4383  // protein sequence to move to
4384  return CRef<CSeq_loc>();
4385  }
4386 
4387  return GetProteinLocationFromNucleotideLocation(nuc_loc, *cds, scope);
4388 }
4389 
4390 
4391 
// NOTE(review): the declaration line (listing line 4392) and listing line
// 4398 (which defines `protein`) are absent from this extract; the body uses
// a feature named cds and a set handle named np.
// Moves the protein entry of the coding region's product into the set np.
// Returns false, without changing anything, when the product is missing or
// not a whole-sequence location, when the protein handle is not valid, or
// when the protein's parent set already is np. Returns true after the move.
4393 {
4394  if (!cds.IsSetProduct() || !cds.GetProduct().IsWhole()) {
4395  // no product, or product is specified weirdly
4396  return false;
4397  }
4399  if (!protein) {
4400  // protein is not in the same TSE
4401  return false;
4402  }
4403  if (protein.GetParentBioseq_set() == np) {
4404  // already in the right set
4405  return false;
4406  }
4407  CBioseq_set_EditHandle eh(np);
4408  CSeq_entry_Handle ph = protein.GetSeq_entry_Handle();
4409  CSeq_entry_EditHandle peh(ph);
4410  eh.TakeEntry(peh);
4411  return true;
4412 }
4413 
4414 
// NOTE(review): the declaration line (listing line 4415) and listing line
// 4418 (which defines the iterator `si`) are absent from this extract.
// For every entry visited by `si` whose set is a nuc-prot set with
// annotations, calls the per-feature RepackageProteins on each coding region
// found in the set's feature tables.
// Returns true if any of those calls returned true.
4416 {
4417  bool changed = false;
4419  while (si) {
4420  CBioseq_set_Handle set = si->GetSet();
4421  if (set.IsSetClass() && set.GetClass() == CBioseq_set::eClass_nuc_prot && set.HasAnnots()) {
4422  ITERATE(CBioseq_set::TAnnot, annot_it, set.GetCompleteBioseq_set()->GetAnnot()) {
4423  if ((*annot_it)->IsSetData() && (*annot_it)->IsFtable()) {
4424  ITERATE(CSeq_annot::TData::TFtable, feat_it, (*annot_it)->GetData().GetFtable()) {
4425  if ((*feat_it)->IsSetData() && (*feat_it)->GetData().IsCdregion()) {
4426  changed |= RepackageProteins(**feat_it, set);
4427  }
4428  }
4429  }
4430  }
4431  }
4432  ++si;
4433  }
4434  return changed;
4435 }
4436 
4437 
// NOTE(review): the declaration line (listing line 4438) is absent from this
// extract; the body uses a handle named seh and a value named filter that is
// passed to CBioseq_CI.
// For each bioseq selected by the iterator, works on a copy of its Seq-inst;
// when ConvertDeltaToRaw() on the copy returns true, the copy is installed
// on the bioseq. Returns true if any bioseq was updated.
4439 {
4440  bool any_change = false;
4441  for (CBioseq_CI bi(seh, filter); bi; ++bi) {
4442  CBioseq_Handle bsh = *bi;
4443  CRef<CSeq_inst> inst(new CSeq_inst());
4444  inst->Assign(bsh.GetInst());
4445  if (inst->ConvertDeltaToRaw()) {
4446  CBioseq_EditHandle beh(bsh);
4447  beh.SetInst(*inst);
4448  any_change = true;
4449  }
4450  }
4451  return any_change;
4452 }
4453 
4454 
// NOTE(review): the first declaration line (listing line 4455) and listing
// lines 4506 and 4554 are absent from this extract; the visible parameters
// are cds, str, scope and pMessageListener, and the body also reads a
// feature named feat.
// Parses a code-break description from str: the amino acid after "aa:" (or
// after the first ':' following a ','), and the location after "(pos:".
// On success appends a new CCode_break (amino acid + location) to cds and
// returns true. Returns false when str is empty, feat has no location or no
// single location id, the location text is missing or cannot be read, an
// interval location is longer than 3 bases, or the check whose condition
// continues on missing listing line 4554 fails.
// Messages are posted only when pMessageListener is non-null.
4456  CCdregion& cds,
4457  const CTempString& str,
4458  CScope& scope,
4459  IObjtoolsListener* pMessageListener)
4460 {
4461  if (str.empty() || !feat.IsSetLocation()) {
4462  return false;
4463  }
4464 
4465  const CSeq_id* feat_loc_seq_id = feat.GetLocation().GetId();
4466  if (!feat_loc_seq_id) {
4467  return false;
4468  }
4469 
4470  string::size_type aa_pos = NStr::Find(str, "aa:");
4471  string::size_type len = 0;
4472  string::size_type loc_pos, end_pos;
  // 'X' is used when no amino acid name can be found in str
4473  char protein_letter = 'X';
4474  CRef<CSeq_loc> break_loc;
4475 
4476  if (aa_pos == string::npos) {
  // no "aa:" marker: fall back to the text after the first ':' that
  // follows a ','
4477  aa_pos = NStr::Find(str, ",");
4478  if (aa_pos != string::npos) {
4479  aa_pos = NStr::Find(str, ":", aa_pos);
4480  }
4481  if (aa_pos != string::npos) {
4482  aa_pos++;
4483  }
4484  } else {
4485  aa_pos += 3;
4486  }
4487 
4488  if (aa_pos != string::npos) {
4489  while (aa_pos < str.length() && isspace(str[aa_pos])) {
4490  aa_pos++;
4491  }
  // len = length of the run of letters naming the amino acid
4492  while (aa_pos + len < str.length() && isalpha(str[aa_pos + len])) {
4493  len++;
4494  }
4495  if (len != 0) {
4496  protein_letter = ValidAminoAcid(str.substr(aa_pos, len));
4497  }
4498  }
4499 
4500  loc_pos = NStr::Find(str, "(pos:");
4501 
4502  using TSubcode = CCleanupMessage::ESubcode;
  // the lambda dereferences pMessageListener without a null check; every
  // call below is guarded by `if (pMessageListener)`
4503  auto postMessage =
4504  [pMessageListener](string msg, TSubcode subcode) {
4505  pMessageListener->PutMessage(
4507  };
4508 
4509  if (loc_pos == string::npos) {
4510  if (pMessageListener) {
4511  string msg = "Unable to identify code-break location in '" + str + "'";
4512  postMessage(msg, TSubcode::eParseError);
4513  }
4514  return false;
4515  }
  // skip past "(pos:" and any whitespace after it
4516  loc_pos += 5;
4517  while (loc_pos < str.length() && isspace(str[loc_pos])) {
4518  loc_pos++;
4519  }
4520 
  // the location text ends at ",aa:", else at the first ',', else at the
  // end of str
4521  end_pos = NStr::Find(str, ",aa:", loc_pos);
4522  if (end_pos == NPOS) {
4523  end_pos = NStr::Find(str, ",", loc_pos);
4524  if (end_pos == NPOS) {
4525  end_pos = str.length();
4526  }
4527  }
4528 
4529  string pos = NStr::TruncateSpaces_Unsafe(str.substr(loc_pos, end_pos - loc_pos));
4530 
4531  // handle multi-interval positions by adding a join() around them
4532  if (pos.find_first_of(",") != string::npos) {
4533  pos = "join(" + pos + ")";
4534  }
4535 
4536  break_loc = ReadLocFromText(pos, feat_loc_seq_id, &scope);
4537 
4538  if (!break_loc) {
4539  if (pMessageListener) {
4540  string msg = "Unable to extract code-break location from '" + str + "'";
4541  postMessage(msg, TSubcode::eParseError);
4542  }
4543  return false;
4544  }
4545 
4546  if (break_loc->IsInt() && sequence::GetLength(*break_loc, &scope) > 3) {
4547  if (pMessageListener) {
4548  string msg = "code-break location exceeds 3 bases";
4549  postMessage(msg, TSubcode::eBadLocation);
4550  }
4551  return false;
4552  }
4553  if ((break_loc->IsInt() || break_loc->IsPnt()) &&
4555  if (pMessageListener) {
4556  string msg = "code-break location lies outside of coding region";
4557  postMessage(msg, TSubcode::eBadLocation);
4558  }
4559  return false;
4560  }
4561 
  // the break location takes an explicit minus or plus strand from the
  // feature location; with no strand set, its strand field is reset
4562  if (FIELD_IS_SET(feat.GetLocation(), Strand)) {
4563  if (GET_FIELD(feat.GetLocation(), Strand) == eNa_strand_minus) {
4564  break_loc->SetStrand(eNa_strand_minus);
4565  }
4566  else if (GET_FIELD(feat.GetLocation(), Strand) == eNa_strand_plus) {
4567  break_loc->SetStrand(eNa_strand_plus);
4568  }
4569  } else {
4570  RESET_FIELD(*break_loc, Strand);
4571  }
4572 
4573  // need to build code break object and add it to coding region
4574  CRef<CCode_break> newCodeBreak(new CCode_break());
4575  CCode_break::TAa& aa = newCodeBreak->SetAa();
4576  aa.SetNcbieaa(protein_letter);
4577  newCodeBreak->SetLoc(*break_loc);
4578 
4579  CCdregion::TCode_break& orig_list = cds.SetCode_break();
4580  orig_list.push_back(newCodeBreak);
4581 
4582  return true;
4583 }
4584 
4585 
// NOTE(review): the declaration line (listing line 4586) is absent from this
// extract; the body uses a feature named feat and a scope named scope.
// For a coding-region feature with qualifiers and a location, passes the
// value of every "transl_except" qualifier (name compared case-insensitively)
// to ParseCodeBreak and erases the qualifiers that were parsed successfully.
// The qualifier list is reset when it ends up empty.
// Returns true if at least one qualifier was removed.
4587 {
4588  if (!feat.IsSetData() || !feat.GetData().IsCdregion() ||
4589  !feat.IsSetQual() || !feat.IsSetLocation()) {
4590  return false;
4591  }
4592 
4593  bool any_removed = false;
4594  CSeq_feat::TQual::iterator it = feat.SetQual().begin();
4595  while (it != feat.SetQual().end()) {
  // ParseCodeBreak is the last operand, so it is only called for
  // transl_except qualifiers that have a value
4596  if ((*it)->IsSetQual() &&
4597  NStr::EqualNocase((*it)->GetQual(), "transl_except") &&
4598  (*it)->IsSetVal() &&
4599  ParseCodeBreak(feat, feat.SetData().SetCdregion(), (*it)->GetVal(), scope)) {
4600  it = feat.SetQual().erase(it);
4601  any_removed = true;
4602  } else {
4603  ++it;
4604  }
4605  }
4606  if (feat.GetQual().size() == 0) {
4607  feat.ResetQual();
4608  }
4609  return any_removed;
4610 }
4611 
4612 
// NOTE(review): the declaration line (listing line 4613) and listing line
// 4615 (presumably the declaration of flu_map) are absent from this extract,
// so the function name is not visible here.
// Groups the nucleotide bioseqs under `entry` by the key that
// CInfluenzaSet::GetKey derives from their first source descriptor's
// organism, skipping blank keys, then calls MakeSet() on every group for
// which OkToMakeSet() is true. Returns the number of sets made.
4614 {
4616 
4617  CBioseq_CI bi(entry, CSeq_inst::eMol_na);
4618  while (bi) {
4619  CSeqdesc_CI src(*bi, CSeqdesc::e_Source);
4620  if (src && src->GetSource().IsSetOrg()) {
4621  string key = CInfluenzaSet::GetKey(src->GetSource().GetOrg());
4622  if (!NStr::IsBlank(key)) {
4623  // add to set
4624  auto it = flu_map.find(key);
4625  if (it == flu_map.end()) {
4626  CRef<CInfluenzaSet> new_set(new CInfluenzaSet(key));
4627  new_set->AddBioseq(*bi);
4628  flu_map[key] = new_set;
4629  } else {
4630  it->second->AddBioseq(*bi);
4631  }
4632  }
4633  }
4634  ++bi;
4635  }
4636  // now create sets
4637  size_t added = 0;
  // note: this loop variable reuses the name `entry`, hiding the outer
  // `entry` used above for the rest of the loop
4638  for (auto& entry : flu_map) {
4639  if (entry.second->OkToMakeSet()) {
4640  entry.second->MakeSet();
4641  added++;
4642  }
4643  }
4644 
4645  return added;
4646 }
4647 
4648 
// NOTE(review): the declaration line (listing line 4649) and listing lines
// 4651 and 4667 (which declare `ftable` and `aeh`) are absent from this
// extract; the body uses a bioseq handle named bh and a dbtag named tag.
// Adds a "misc_feature" spanning the whole of bh (0 .. length-1) that carries
// a copy of tag as dbxref and an empty gene xref. The feature goes into the
// first feature-table annotation found on bh; a new annotation is attached
// when none exists.
4650 {
4652 
4653  CSeq_annot_CI annot_ci(bh);
4654  for (; annot_ci; ++annot_ci) {
4655  if ((*annot_ci).IsFtable()) {
4656  ftable = *annot_ci;
4657  break;
4658  }
4659  }
4660 
4661  if (!ftable) {
4662  CBioseq_EditHandle beh = bh.GetEditHandle();
4663  CRef<CSeq_annot> new_annot(new CSeq_annot());
4664  ftable = beh.AttachAnnot(*new_annot);
4665  }
4666 
4668 
4669  CRef<CSeq_feat> f(new CSeq_feat());
4670  f->SetData().SetImp().SetKey("misc_feature");
4671  f->SetLocation().SetInt().SetFrom(0);
4672  f->SetLocation().SetInt().SetTo(bh.GetBioseqLength() - 1);
4673  f->SetLocation().SetInt().SetId().Assign(*(bh.GetSeqId()));
4674  CRef<CDbtag> xref(new CDbtag());
4675  xref->Assign(tag);
4676  f->SetDbxref().push_back(xref);
  // a gene xref with no fields set (the variable name suggests it is meant
  // to suppress gene association - TODO confirm)
4677  CRef<CSeqFeatXref> suppress(new CSeqFeatXref());
4678  suppress->SetData().SetGene();
4679  f->SetXref().push_back(suppress);
4680  aeh.AddFeat(*f);
4681 }
4682 
4683 
// NOTE(review): the declaration line (listing line 4684) is absent from this
// extract, so the function name is not visible here; the body iterates the
// nucleotide bioseqs under `entry`.
// For every source descriptor with org dbxrefs, each dbxref whose database
// name equals "IRD" is passed to AddIRDMiscFeature for that bioseq and then
// erased from the org-ref; the dbxref list is reset when it ends up empty.
// Returns true if any dbxref was converted.
4685 {
4686  bool any = false;
4687  CBioseq_CI bi(entry, CSeq_inst::eMol_na);
4688  while (bi) {
4689  CSeqdesc_CI src(*bi, CSeqdesc::e_Source);
4690  while (src) {
4691  if (src->GetSource().IsSetOrg() && src->GetSource().GetOrg().IsSetDb()) {
  // the org-ref is reached through a const accessor; constness is cast
  // away so the dbxref list can be edited in place
4692  CRef<COrg_ref> org(const_cast<COrg_ref *>(&(src->GetSource().GetOrg())));
4693  COrg_ref::TDb::iterator db = org->SetDb().begin();
4694  while (db != org->SetDb().end()) {
4695  if ((*db)->IsSetDb() && NStr::Equal((*db)->GetDb(), "IRD")) {
4696  AddIRDMiscFeature(*bi, **db);
4697  db = org->SetDb().erase(db);
4698  any = true;
4699  } else {
4700  ++db;
4701  }
4702  }
4703  if (org->GetDb().size() == 0) {
4704  org->ResetDb();
4705  }
4706  }
4707  ++src;
4708  }
4709  ++bi;
4710  }
4711  return any;
4712 }
4713 
4714 //LCOV_EXCL_START
4715 //not used by asn_cleanup but used by other applications
// Offset of the letter 'M' from 'A' (evaluates to 12); compared against the
// Ncbi8aa and Ncbistdaa values of a code break's amino acid in the function
// that follows.
4716 const unsigned int methionine_encoded = 'M' - 'A';
4717 
// NOTE(review): the declaration line (listing line 4718) and the three case
// labels (listing lines 4725, 4730 and 4735) are absent from this extract,
// so the function name is not visible here; the body inspects a code break
// named cb.
// Returns false when cb has no amino acid. Otherwise returns true when the
// amino acid value is methionine in the encoding selected by Which(): equal
// to methionine_encoded for the Ncbi8aa and Ncbistdaa getters, or the letter
// 'M' for the Ncbieaa getter. Any other encoding yields false.
4719 {
4720  if (!cb.IsSetAa()) {
4721  return false;
4722  }
4723  bool rval = false;
4724  switch (cb.GetAa().Which()) {
4726  if (cb.GetAa().GetNcbi8aa() == methionine_encoded) {
4727  rval = true;
4728  }
4729  break;
4731  if (cb.GetAa().GetNcbieaa() == 'M') {
4732  rval = true;
4733  }
4734  break;
4736  if (cb.GetAa().GetNcbistdaa() == methionine_encoded) {
4737  rval = true;
4738  }
4739  break;
4740  default:
4741  break;
4742  }
4743  return rval;
4744 }
4745 //LCOV_EXCL_STOP
4746 
4747 
4748 //LCOV_EXCL_START
4749 //not used by asn_cleanup but used by other applications
// NOTE(review): the declaration line (listing line 4750) and listing lines
// 4763 and 4773 (the latter defines `offset`) are absent from this extract,
// so the function name is not visible here; the body uses a value named pos
// and a feature named cds.
// Returns the first code break of the coding region for which
// ((offset - frame) / 3) + 1 equals pos, where frame is 0, 1 or 2 depending
// on the coding region's frame. Returns an empty CConstRef when cds is not a
// coding region with a location and code breaks, or when no code break
// matches.
4751 {
4752  if (!cds.IsSetData() || !cds.GetData().IsCdregion() ||
4753  !cds.IsSetLocation() ||
4754  !cds.GetData().GetCdregion().IsSetCode_break()) {
4755  return CConstRef<CCode_break>();
4756  }
4757 
  // frame = number of leading bases skipped before the first codon
4758  TSeqPos frame = 0;
4759  if (cds.IsSetData() && cds.GetData().IsCdregion() && cds.GetData().GetCdregion().IsSetFrame())
4760  {
4761  switch(cds.GetData().GetCdregion().GetFrame())
4762  {
4764  case CCdregion::eFrame_one : frame = 0; break;
4765  case CCdregion::eFrame_two : frame = 1; break;
4766  case CCdregion::eFrame_three : frame = 2; break;
4767  default : frame = 0; break;
4768  }
4769  }
4770 
4771  for (auto cb : cds.GetData().GetCdregion().GetCode_break()) {
4772  if (cb->IsSetLoc()) {
4774  cb->GetLoc());
  // the comparison with pos is 1-based (note the + 1)
4775  if (offset >= frame &&
4776  ((offset - frame) / 3 ) + 1 == pos) {
4777  return cb;
4778  }
4779  }
4780  }
4781  return CConstRef<CCode_break>();
4782 }
4783 //LCOV_EXCL_STOP
4784 
4785 //LCOV_EXCL_START
4786 //appears not to be used
4787 void CCleanup::SetCodeBreakLocation(CCode_break& cb, size_t pos, const CSeq_feat& cds)
4788 {
4789  int start = static_cast<int>((pos-1)*3);
4790  //start -= 1;
4791  //start *= 3;
4792  int frame = 0;
4793  if (cds.IsSetData() && cds.GetData().IsCdregion() && cds.GetData().GetCdregion().IsSetFrame())
4794  {
4795  switch(cds.GetData().GetCdregion().GetFrame())
4796  {
4798  case CCdregion::eFrame_one : frame = 0; break;
4799  case CCdregion::eFrame_two : frame = 1; break;
4800  case CCdregion::eFrame_three : frame = 2; break;
4801  default : frame = 0; break;
4802  }
4803  }
4804  int frame_shift = (start - frame) % 3;
4805  if (frame_shift < 0) {
4806  frame_shift += 3;
4807  }
4808  if (frame_shift == 1)
4809  start += 2;
4810  else if (frame_shift == 2)
4811  start += 1;
4812 
4813  int offset = 0;
4814  CRef<CSeq_loc> packed (new CSeq_loc());
4815  for (CSeq_loc_CI loc_iter(cds.GetLocation()); loc_iter; ++loc_iter) {
4816  int len = loc_iter.GetRange().GetLength();
4817  if (offset <= start && offset + len > start) {
4819  tmp->SetId().Assign(loc_iter.GetSeq_id());
4820  if (loc_iter.IsSetStrand() && loc_iter.GetStrand() == eNa_strand_minus) {
4821  tmp->SetStrand(eNa_strand_minus);
4822  tmp->SetTo(loc_iter.GetRange().GetTo() - (start - offset) );
4823  } else {
4824  tmp->SetFrom(loc_iter.GetRange().GetFrom() + start - offset);
4825  }
4826  if (offset <= start + 2 && offset + len > start + 2) {
4827  if (loc_iter.IsSetStrand() && loc_iter.GetStrand() == eNa_strand_minus) {
4828  tmp->SetFrom(loc_iter.GetRange().GetTo() - (start - offset + 2) );
4829  } else {
4830  tmp->SetTo(loc_iter.GetRange().GetFrom() + start - offset + 2);
4831  }
4832  } else {
4833  if (loc_iter.IsSetStrand() && loc_iter.GetStrand() == eNa_strand_minus) {
4834  tmp->SetFrom(loc_iter.GetRange().GetFrom());
4835  } else {
4836  tmp->SetTo(loc_iter.GetRange().GetTo());
4837  }
4838  }
4839  packed->SetPacked_int().Set().push_back(tmp);
4840  } else if (offset > start && offset <= start + 2) {
4841  // add new interval
4843  tmp->SetId().Assign(loc_iter.GetSeq_id());
4844  if (loc_iter.IsSetStrand() && loc_iter.GetStrand() == eNa_strand_minus) {
4845  tmp->SetStrand(eNa_strand_minus);
4846  tmp->SetTo(loc_iter.GetRange().GetTo());
4847  if (offset + len >= start + 2) {
4848  tmp->SetFrom(loc_iter.GetRange().GetTo() - (start - offset + 2) );
4849  } else {
4850  tmp->SetFrom(loc_iter.GetRange().GetFrom());
4851  }
4852  } else {
4853  tmp->SetFrom(loc_iter.GetRange().GetFrom());
4854  if (offset + len >= start + 2) {
4855  tmp->SetTo(loc_iter.GetRange().GetFrom() + start - offset + 2);
4856  } else {
4857  tmp->SetTo(loc_iter.GetRange().GetTo());
4858  }
4859  }
4860 
4861  packed->SetPacked_int().Set().push_back(tmp);
4862  }
4863  offset += len;
4864  }
4865  if (packed->Which() != CSeq_loc::e_Packed_int || packed->GetPacked_int().Get().size() == 0) {
4866  cb.ResetLoc();
4867  }
4868  if (packed->GetPacked_int().Get().size() == 1) {
4869  cb.SetLoc().SetInt().Assign(*(packed->GetPacked_int().Get().front()));
4870  } else {
4871  cb.SetLoc(*packed);
4872  }
4873 }
4874 //LCOV_EXCL_STOP
4875 
4876 
4877 //LCOV_EXCL_START
4878 //not used by asn_cleanup but used by other applications
// Body of the RNA-editing fix-up for the coding region `cds`. The signature
// line is not part of this listing (presumably
// CCleanup::FixRNAEditingCodingRegion(CSeq_feat& cds) -- confirm).
// When the coding region qualifies, makes sure its exception text mentions
// "RNA editing" and its exception flag is set.
// Returns true if cds was modified, false otherwise.
4880 {
// Only coding regions are handled.
4881  if (!cds.IsSetData() || !cds.GetData().IsCdregion()) {
4882  return false;
4883  }
// NOTE(review): the second half of this condition is missing from the listing
// (the embedded numbering skips 4885), so only the "no location" part of the
// test is visible here.
4884  if (!cds.IsSetLocation() ||
4886  return false;
4887  }
// NOTE(review): `cbstart` is declared on a line missing from the listing
// (4888); presumably the code break for the first amino acid -- confirm.
4889  if (cbstart && !CCleanup::IsMethionine(*cbstart)) {
4890  // already have a start translation exception AND it is not methionine
4891  return false;
4892  }
4893 
4894  bool any_change = false;
// Blank or missing exception text is replaced; otherwise "RNA editing" is
// appended after "; " unless the text already contains it.
4895  if (!cds.IsSetExcept_text() || NStr::IsBlank(cds.GetExcept_text())) {
4896  cds.SetExcept_text("RNA editing");
4897  any_change = true;
4898  } else if (NStr::Find(cds.GetExcept_text(), "RNA editing") == string::npos) {
4899  cds.SetExcept_text(cds.GetExcept_text() + "; RNA editing");
4900  any_change = true;
4901  }
// Turn the exception flag on if it is unset or false.
4902  if (!cds.IsSetExcept() || !cds.GetExcept()) {
4903  cds.SetExcept(true);
4904  any_change = true;
4905  }
4906  return any_change;
4907 }
4908 //LCOV_EXCL_STOP
4909 
4910 
4911 //LCOV_EXCL_START
4912 //not used by asn_cleanup but used by other applications
4914 {
4915  bool any_changes = false;
4916 
4917  vector<CRef<COrg_ref> > rq_list;
4918  vector<const CSeqdesc* > src_descs;
4919  vector<CConstRef<CSeq_feat> > src_feats;
4920 
4921  GetSourceDescriptors(*(seh.GetCompleteSeq_entry()), src_descs);
4922  vector<const CSeqdesc* >::iterator desc_it = src_descs.begin();
4923  while (desc_it != src_descs.end()) {
4924  if ((*desc_it)->GetSource().IsSetSubtype()) {
4925  CSeqdesc* desc = const_cast<CSeqdesc*>(*desc_it);
4926  for (auto s : desc->SetSource().SetSubtype()) {
4927  if (s->IsSetSubtype() && s->GetSubtype() == CSubSource::eSubtype_collection_date
4928  && s->IsSetName()) {
4929  bool month_ambiguous = false;
4930  string new_date = CSubSource::FixDateFormat(s->GetName(), month_first, month_ambiguous);
4931  if (!NStr::Equal(new_date, s->GetName())) {
4932  s->SetName(new_date);
4933  any_changes = true;
4934  }
4935  }
4936  }
4937  }
4938  ++desc_it;
4939  }
4940 
4942  while (feat) {
4943  if (feat->GetData().GetBiosrc().IsSetSubtype()) {
4944  CRef<CSeq_feat> new_feat(new CSeq_feat());
4945  new_feat->Assign(*(feat->GetOriginalSeq_feat()));
4946  bool local_change = false;
4947  for (auto s : new_feat->SetData().SetBiosrc().SetSubtype()) {
4948  if (s->IsSetSubtype() && s->GetSubtype() == CSubSource::eSubtype_collection_date
4949  && s->IsSetName()) {
4950  bool month_ambiguous = false;
4951  string new_date = CSubSource::FixDateFormat(s->GetName(), month_first, month_ambiguous);
4952  if (!NStr::Equal(new_date, s->GetName())) {
4953  s->SetName(new_date);
4954  local_change = true;
4955  }
4956  }
4957  }
4958  if (local_change) {
4959  any_changes = true;
4960  CSeq_feat_EditHandle efh(*feat);
4961  efh.Replace(*new_feat);
4962  }
4963  ++feat;
4964  }
4965  }
4966 
4967  return any_changes;
4968 }
4969 //LCOV_EXCL_STOP
4970 
4971 
// Body of a routine that refreshes the autodef-options user object of the
// Seq-entry `seh`. The signature line and several statements are not part of
// this listing; the gaps are marked below.
// Visible behaviour: remove every user descriptor that reports
// IsAutodefOptions() from the bioseqs under seh, then add one new user
// descriptor built from `auto_user`.
4973 {
4974  // remove existing options (TODO)
4975  for (CBioseq_CI b(seh); b; ++b) {
// Rescan this bioseq from the beginning after every removal, until a full
// pass removes nothing.
4976  bool removed = true;
4977  while (removed) {
4978  removed = false;
// NOTE(review): `ud` is declared on a line missing from the listing (4979);
// presumably a descriptor iterator over b's user descriptors -- confirm.
4980  while (ud) {
4981  if (ud->GetUser().IsAutodefOptions()) {
4982  CSeq_entry_Handle s = ud.GetSeq_entry_Handle();
// NOTE(review): `se` is declared on a line missing from the listing (4983);
// presumably the edit handle obtained from `s` -- confirm.
4984  se.RemoveSeqdesc(*ud);
4985  removed = true;
// Stop using `ud` once its descriptor has been removed; the outer loop starts
// a fresh scan.
4986  break;
4987  }
4988  ++ud;
4989  }
4990  }
4991  }
4992 
4993  // create new options
// NOTE(review): `auto_user` is declared on a line missing from the listing
// (4994); it is dereferenced as the user object copied into the new
// descriptor.
4995  CRef<CSeqdesc> d(new CSeqdesc());
4996  d->SetUser().Assign(*auto_user);
// NOTE(review): `eh` is declared on a line missing from the listing (4997);
// presumably the edit handle of seh -- confirm.
4998  eh.AddSeqdesc(*d);
4999 
// NOTE(review): one more statement (5000) is missing from the listing before
// the closing brace.
5001 }
5002 
@ eExtreme_Positional
numerical value
Definition: Na_strand.hpp:63
@ eExtreme_Biological
5' and 3'
Definition: Na_strand.hpp:62
User-defined methods of the data storage class.
User-defined methods of the data storage class.
CRef< objects::CSeq_id > GetNewProtId(objects::CBioseq_Handle bsh, int &offset, string &id_label, bool general_only)
bool IsGeneralIdProtPresent(objects::CSeq_entry_Handle tse)
@Auth_list.hpp User-defined methods of the data storage class.
Definition: Auth_list.hpp:57
CSeqdesc & Set(bool skip_lookup=false)
Definition: Seq_descr.cpp:93
static CRef< CUser_object > CreateIDOptions(CSeq_entry_Handle seh)
Definition: autodef.cpp:1442
static bool RegenerateSequenceDefLines(CSeq_entry_Handle se)
Definition: autodef.cpp:1248
static string GetOrganelleByGenome(unsigned int genome)
Definition: BioSource.cpp:216
int GetGenCode(int def=1) const
Definition: BioSource.cpp:73
bool HasSubtype(CSubSource::TSubtype subtype) const
Definition: BioSource.cpp:2040
CBioseq_CI –.
Definition: bioseq_ci.hpp:69
CBioseq_EditHandle –.
CBioseq_Handle –.
CBioseq_set_EditHandle –.
CBioseq_set_Handle –.
CConstRef< CBioseq_set > GetParentSet(void) const
Definition: Bioseq_set.cpp:294
CConstRef< CBioseq_set > GetParentSet(void) const
Definition: Bioseq_set.cpp:312
bool IsAa(void) const
Definition: Bioseq.cpp:350
CCdregion –.
Definition: Cdregion.hpp:66
vector< string > GetAllDescriptions() const
Definition: cleanup.cpp:301
vector< EChanges > GetAllChanges() const
Definition: cleanup.cpp:295
vector< string_view > GetDescriptions() const
Definition: cleanup.cpp:311
static string_view GetDescription(EChanges e)
Definition: cleanup.cpp:441
static bool RescueSiteRefPubs(CSeq_entry_Handle seh)
Rescue pubs from Site-ref features.
Definition: cleanup.cpp:3484
static bool ConvertDeltaSeqToRaw(CSeq_entry_Handle seh, CSeq_inst::EMol filter=CSeq_inst::eMol_not_set)
Definition: cleanup.cpp:4438
static bool RenormalizeNucProtSets(CSeq_entry_Handle seh)
Convert nuc-prot sets with just one sequence to just the sequence can't be done during the explore ph...
Definition: cleanup.cpp:4090
static bool ShouldStripPubSerial(const CBioseq &bs)
Definition: cleanup.cpp:4043
static bool RemoveOrphanLocus_tagGeneXrefs(CSeq_feat &f, CBioseq_Handle bsh)
Removes orphaned locus_tag Gene-xrefs.
Definition: cleanup.cpp:1024
static bool FixGeneXrefSkew(CSeq_entry_Handle seh)
Examine all genes and gene xrefs in the Seq-entry.
Definition: cleanup.cpp:3942
static void MoveOneFeatToPubdesc(CSeq_feat_Handle feat, CRef< CSeqdesc > d, CBioseq_Handle b, bool remove_feat=true)
Definition: cleanup.cpp:3380
static bool AddGenBankWrapper(CSeq_entry_Handle seh)
Add GenBank Wrapper Set.
Definition: cleanup.cpp:3092
static bool ConvertPubFeatsToPubDescs(CSeq_entry_Handle seh)
Convert full-length publication features to publication descriptors.
Definition: cleanup.cpp:3415
static void SetProteinName(CProt_ref &prot, const string &protein_name, bool append)
Definition: cleanup.cpp:1356
static void s_SetProductOnFeat(CSeq_feat &feat, const string &protein_name, bool append)
Definition: cleanup.cpp:1411
static bool AddPartialToProteinTitle(CBioseq &bioseq)
Adjusts protein title to reflect partialness.
Definition: cleanup.cpp:2398
static bool RemovePseudoProduct(CSeq_feat &cds, CScope &scope)
Removes protein product from pseudo coding region.
Definition: cleanup.cpp:2564
static bool FixECNumbers(CSeq_entry_Handle entry)
Fix EC numbers.
Definition: cleanup.cpp:1740
static bool AddMissingMolInfo(CBioseq &seq, bool is_product)
Adds missing MolInfo descriptor to sequence.
Definition: cleanup.cpp:1851
static void SetMrnaName(CSeq_feat &mrna, const string &protein_name)
Definition: cleanup.cpp:1369
static CRef< CSeq_entry > AddProtein(const CSeq_feat &cds, CScope &scope)
Definition: cleanup.cpp:2078
static bool OkToPromoteNpPub(const CPubdesc &pd)
Some pubs should not be promoted to nuc-prot set from sequence.
Definition: cleanup.cpp:3370
static void GetPubdescLabels(const CPubdesc &pd, vector< TEntrezId > &pmids, vector< TEntrezId > &muids, vector< int > &serials, vector< string > &published_labels, vector< string > &unpublished_labels)
For Publication Citations Get labels for a pubdesc.
Definition: cleanup.cpp:3167
static bool DecodeXMLMarkChanged(std::string &str)
decodes various tags, including carriage-return-line-feed constructs
Definition: cleanup.cpp:4126
static bool SetFeaturePartial(CSeq_feat &f)
Set feature partial based on feature location.
Definition: cleanup.cpp:1660
static bool AddProteinTitle(CBioseq_Handle bsh)
Creates missing protein title descriptor.
Definition: cleanup.cpp:1892
static size_t MakeSmallGenomeSet(CSeq_entry_Handle entry)
Definition: cleanup.cpp:4613
static bool ExtendToStopIfShortAndNotPartial(CSeq_feat &f, CBioseq_Handle bsh, bool check_for_stop=true)
Extends a coding region up to 50 nt.
Definition: cleanup.cpp:1302
static bool IsGeneXrefUnnecessary(const CSeq_feat &sf, CScope &scope, const CGene_ref &gene_xref)
Calculates whether a Gene-xref is unnecessary (because it refers to the same gene as would be calcula...
Definition: cleanup.cpp:755
static bool RemoveNcbiCleanupObject(CSeq_entry &seq_entry)
Removes NcbiCleanup User Objects in the Seq-entry.
Definition: cleanup.cpp:1925
static bool ClearInternalPartials(CSeq_loc &loc, bool is_first=true, bool is_last=true)
Clear internal partials.
Definition: cleanup.cpp:1580
static bool RepackageProteins(CSeq_entry_Handle seh)
Find proteins that are not packaged in the same nuc-prot set as the coding region for which they are ...
Definition: cleanup.cpp:4415
CCleanup(CScope *scope=nullptr, EScopeOptions scope_handling=eScope_Copy)
Definition: cleanup.cpp:89
static bool ParseCodeBreaks(CSeq_feat &feat, CScope &scope)
Parses all valid transl_except Gb-quals into code-breaks for cdregion, then removes the transl_except...
Definition: cleanup.cpp:4586
static bool SetMolinfoTech(CBioseq_Handle seq, CMolInfo::ETech tech)
Sets MolInfo::tech for a sequence.
Definition: cleanup.cpp:1803
static bool AddLowQualityException(CSeq_entry_Handle entry)
For table2asn -c s Adds an exception of "low-quality sequence region" to coding regions and mRNAs tha...
Definition: cleanup.cpp:2958
static bool RemoveUnseenTitles(CSeq_entry_EditHandle::TSeq seq)
Remove all titles in Seqdescr except the last, because it is the only one that would be displayed in ...
Definition: cleanup.cpp:3054
static bool RemoveDupBioSource(CSeq_descr &descr)
Remove duplicate biosource descriptors.
Definition: cleanup.cpp:3821
EScopeOptions
Definition: cleanup.hpp:83
@ eScope_UseInPlace
Definition: cleanup.hpp:85
static bool ExtendStopPosition(CSeq_feat &f, const CSeq_feat *cdregion, size_t extension=0)
Definition: cleanup.cpp:1083
TChanges ExtendedCleanup(CSeq_entry &se, Uint4 options=0)
Cleanup a Seq-entry.
Definition: cleanup.cpp:259
@ eClean_NoReporting
Definition: cleanup.hpp:73
static bool s_IsProductOnFeat(const CSeq_feat &cds)
Definition: cleanup.cpp:1390
static bool SetGenePartialByLongestContainedFeature(CSeq_feat &gene, CScope &scope)
Set partialness of gene to match longest feature contained in gene.
Definition: cleanup.cpp:1766
static CConstRef< CCode_break > GetCodeBreakForLocation(size_t pos, const CSeq_feat &cds)
utility function for finding the code break for a given amino acid position pos is the position of th...
Definition: cleanup.cpp:4750
TChanges BasicCleanup(CSeq_entry &se, Uint4 options=0)
Definition: cleanup.cpp:132
static bool SetCDSPartialsByFrameAndTranslation(CSeq_feat &cds, CScope &scope)
1.
Definition: cleanup.cpp:1540
static bool RemoveBadECNumbers(CProt_ref::TEc &ec_num_list)
Delete EC numbers.
Definition: cleanup.cpp:1716
void SetScope(CScope *scope)
Definition: cleanup.cpp:108
static bool RepairXrefs(const CSeq_feat &f, const CTSE_Handle &tse)
Repairs non-reciprocal xref pairs for specified feature if xrefs between subtypes are permitted and f...
Definition: cleanup.cpp:916
static bool ExtendToStopCodon(CSeq_feat &f, CBioseq_Handle bsh, size_t limit)
Extends a feature up to limit nt to a stop codon, or to the end of the sequence if limit == 0 (partia...
Definition: cleanup.cpp:1124
static CRef< CSeq_loc > GetProteinLocationFromNucleotideLocation(const CSeq_loc &nuc_loc, CScope &scope)
Definition: cleanup.cpp:4378
static bool ParseCodeBreak(const CSeq_feat &feat, CCdregion &cds, const CTempString &str, CScope &scope, IObjtoolsListener *pMessageListener=nullptr)
Parse string into code break and add to coding region.
Definition: cleanup.cpp:4455
static void SetCodeBreakLocation(CCode_break &cb, size_t pos, const CSeq_feat &cds)
utility function for setting code break location given offset pos is the position of the amino acid w...
Definition: cleanup.cpp:4787
static bool x_AddLowQualityException(CSeq_feat &feat)
Definition: cleanup.cpp:2915
static const string & GetProteinName(const CProt_ref &prot)
Definition: cleanup.cpp:1478
static CRef< CBioSource > BioSrcFromFeat(const CSeq_feat &f)
Get BioSource from feature to use for source descriptor.
Definition: cleanup.cpp:3851
static bool MergeDupBioSources(CSeq_descr &descr)
Definition: cleanup.cpp:3792
static bool SetFrameFromLoc(CCdregion &cdregion, const CSeq_loc &loc, CScope &scope)
Chooses best frame based on location 1.
Definition: cleanup.cpp:1264
static bool SetMolinfoBiomol(CBioseq_Handle seq, CMolInfo::EBiomol biomol)
Sets MolInfo::biomol for a sequence.
Definition: cleanup.cpp:1829
static bool MoveProteinSpecificFeats(CSeq_entry_Handle seh)
Moves protein-specific features from nucleotide sequences in the Seq-entry to the appropriate protein...
Definition: cleanup.cpp:735
static bool TaxonomyLookup(CSeq_entry_Handle seh)
Looks up Org-refs in the Seq-entry.
Definition: cleanup.cpp:1999
static bool PubAlreadyInSet(const CPubdesc &pd, const CSeq_descr &descr)
Definition: cleanup.cpp:3346
static bool NormalizeDescriptorOrder(CSeq_descr &descr)
Normalize Descriptor Order on a specific Seq-entry.
Definition: cleanup.cpp:3027
static bool ConvertSrcFeatsToSrcDescs(CSeq_entry_Handle seh)
Convert full-length source features to source descriptors.
Definition: cleanup.cpp:3884
static bool SetBestFrame(CSeq_feat &cds, CScope &scope)
Translates coding region and selects best frame (without stops, or longest)
Definition: cleanup.cpp:1205
static bool x_MergeDupOrgNames(COrgName &on1, const COrgName &add)
Definition: cleanup.cpp:3697
static bool FindMatchingLocusGene(CSeq_feat &f, const CGene_ref &gene_xref, CBioseq_Handle bsh)
Detects gene features with matching locus.
Definition: cleanup.cpp:953
static bool MoveFeatToProtein(CSeq_feat_Handle fh)
Moves one feature from nucleotide bioseq to the appropriate protein sequence.
Definition: cleanup.cpp:600
static bool RemoveOrphanLocusGeneXrefs(CSeq_feat &f, CBioseq_Handle bsh)
Removes orphaned locus Gene-xrefs.
Definition: cleanup.cpp:976
static void AddNcbiCleanupObject(int ncbi_cleanup_version, CSeq_descr &descr)
Adds NcbiCleanup User Object to Seq-descr.
Definition: cleanup.cpp:1956
CRef< CScope > m_Scope
Definition: cleanup.hpp:611
static bool AreBioSourcesMergeable(const CBioSource &src1, const CBioSource &src2)
Definition: cleanup.cpp:3563
static bool ExpandGeneToIncludeChildren(CSeq_feat &gene, CTSE_Handle &tse)
Expands gene to include features it cross-references.
Definition: cleanup.cpp:2598
static bool WGSCleanup(CSeq_entry_Handle entry, bool instantiate_missing_proteins=true, Uint4 options=0, bool run_extended_cleanup=true)
Performs WGS specific cleanup.
Definition: cleanup.cpp:2680
static vector< CConstRef< CPub > > GetCitationList(CBioseq_Handle bsh)
Get list of pubs that can be used for citations for Seq-feat on a Bioseq-handle.
Definition: cleanup.cpp:3234
static bool LocationMayBeExtendedToMatch(const CSeq_loc &orig, const CSeq_loc &improved)
Checks whether it is possible to extend the original location up to improved one.
Definition: cleanup.cpp:1344
static bool UpdateECNumbers(CProt_ref::TEc &ec_num_list)
Update EC numbers.
Definition: cleanup.cpp:1690
static bool FixRNAEditingCodingRegion(CSeq_feat &cds)
From GB-7563 An action has been requested that will do the following: 1.
Definition: cleanup.cpp:4879
static bool x_HasShortIntron(const CSeq_loc &loc, size_t min_len=11)
Definition: cleanup.cpp:2868
static bool SetGeneticCodes(CBioseq_Handle bsh)
Sets genetic codes for coding regions on Bioseq-Handle.
Definition: cleanup.cpp:2149
static bool RemoveUnnecessaryGeneXrefs(CSeq_feat &f, CScope &scope)
Removes unnecessary Gene-xrefs.
Definition: cleanup.cpp:790
static bool CleanupCollectionDates(CSeq_entry_Handle seh, bool month_first)
Definition: cleanup.cpp:4913