NCBI C++ ToolKit
newcleanupp.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /*
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Robert Smith, Jonathan Kans, Michael Kornbluh
27 *
28 * File Description:
29 * Basic and Extended Cleanup of CSeq_entries, etc.
30 *
31 * ===========================================================================
32 */
33 
34 // All this functionality is packed into this one file for ease of
35 // searching. If it gets big enough, it will be broken up in the future.
36 
37 #include <ncbi_pch.hpp>
38 
40 
42 #include <objmgr/annot_ci.hpp>
43 #include <objmgr/feat_ci.hpp>
44 #include <objmgr/seqdesc_ci.hpp>
45 #include <objmgr/scope.hpp>
46 #include <objmgr/seq_vector.hpp>
48 #include <objmgr/util/feature.hpp>
49 #include <objmgr/util/sequence.hpp>
51 
53 #include "newcleanupp.hpp"
54 
55 #include "cleanup_utils.hpp"
56 
60 
61 #include <objmgr/bioseq_ci.hpp>
63 #include <objmgr/scope.hpp>
64 
68 
69 #include <util/ncbi_cache.hpp>
72 #include <util/xregexp/regexp.hpp>
73 #include <util/strsearch.hpp>
74 
75 #include <objmgr/util/objutil.hpp>
76 
79 
82 
83 //const int CNewCleanup_imp::NCBI_CLEANUP_VERSION = 1;
84 
85 // We don't want to use CompressSpaces inside the likes of COMPRESS_STRING_MEMBER
86 // we prefer our own version
87 #define CompressSpaces x_CompressSpaces
88 
89 namespace {
90 
91  // a CRegexp that has lock and unlock methods,
92  // and also inherits from CObject
93  class CRegexpWithLock : public CObject, public CRegexp {
94  public:
95  CRegexpWithLock( const CTempStringEx & pattern,
96  CRegexp::TCompile flags ) : CRegexp(pattern, flags) { }
97 
98  void Lock(void) { m_mutex.Lock(); }
99  void Unlock(void) { m_mutex.Unlock(); }
100 
101  private:
102  CMutex m_mutex;
103  };
104  typedef CRef<CRegexpWithLock> TRegexpWithLockRef;
105 
106  // this protects its inner object by locking
107  // it as soon as it's created and unlocking it when destroyed.
108  // this way, there's only one working CLockingRef on the object at a time
109  template<typename TLockableObj>
110  class CLockingRef {
111  public:
112  explicit
113  CLockingRef(TLockableObj *pLockableObj) :
114  m_pLockableObj(pLockableObj)
115  {
116  m_pLockableObj->Lock();
117  }
118 
119  ~CLockingRef(void) {
120  m_pLockableObj->Unlock();
121  }
122 
123  TLockableObj * operator->(void) { return m_pLockableObj.GetPointer(); }
124 
125  private:
126  CRef<TLockableObj> m_pLockableObj;
127  };
128  typedef CLockingRef<CRegexpWithLock> CCachedRegexp;
129 
130  // careful! the key is compared as a *pointer*, NOT via
131  // strcmp or anything like that. For safety, just use
132  // string literals.
133  typedef pair<const char *, CRegexp::TCompile> TRegexpKey;
134  typedef TRegexpWithLockRef TRegexpValue;
135 
136  class CRegexpCacheHandler :
137  public CCacheElement_Handler<TRegexpKey, TRegexpValue>
138  {
139  public:
140  TRegexpValue CreateValue(const TRegexpKey & regexp_key )
141  {
142  return Ref(new CRegexpWithLock(
143  regexp_key.first, regexp_key.second));
144  }
145  };
146 
147  class CRegexpCache {
148  public:
149 
150  CRegexpCache(void)
151  : m_Cache(100) { }
152 
153  CCachedRegexp Get( const char * pattern,
155  {
156  TRegexpKey regexpKey(pattern, flags);
157  TRegexpWithLockRef regexpLockRef = m_Cache[regexpKey];
158  return CCachedRegexp(regexpLockRef.GetPointer());
159  }
160 
161  private:
162  typedef CCache<TRegexpKey, TRegexpValue,
163  CRegexpCacheHandler> TUnderlyingCache;
164  TUnderlyingCache m_Cache;
165  };
166 
167  // the actual cache
168  CRegexpCache regexpCache;
169 }
170 
171 // Constructor
173  : m_Changes(changes),
174  m_Options(options)
175 {
176  if (options & CCleanup::eClean_GpipeMode) {
177  m_IsGpipe = true;
178  }
179 
180  if (options & CCleanup::eClean_SyncGenCodes) {
181  m_SyncGenCodes = true;
182  }
183 
184  if (options & CCleanup::eClean_KeepTopSet) {
185  m_KeepTopNestedSet = true;
186  }
187 
188  if (options & CCleanup::eClean_KeepSingleSeqSet) {
189  m_KeepSingleSeqSet = true;
190  }
191 
193  m_Scope.Reset (new CScope (*m_Objmgr));
194 
195 }
196 
197 // Destructor
199 
200 {
201 }
202 
203 // Main methods
204 
206  CSeq_entry& se
207 )
208 
209 {
210  // The class CAutogeneratedCleanup is actually auto-generated code
211  // created by datatool from autogenerated_cleanup.txt
212  // It traverses into the CSeq_entry object we have here and
213  // calls our functions here.
214  // The idea is that we don't have to hand-write the
215  // error-prone traversal code.
216  SetGlobalFlags(se);
217  CAutogeneratedCleanup auto_cleanup( *m_Scope, *this );
218  auto_cleanup.BasicCleanupSeqEntry( se );
220 
222  CBioseq& bs = *bit;
223  SetGeneticCode (bs);
224  }
225 }
226 
227 //LCOV_EXCL_START
228 //not used by asn_cleanup because we clean the submit block separately
229 //and use read hooks for the seq-entries
231  CSeq_submit& ss
232 )
233 
234 {
235  SetGlobalFlags(ss);
236  CAutogeneratedCleanup auto_cleanup( *m_Scope, *this );
237  auto_cleanup.BasicCleanupSeqSubmit( ss );
239 
240  CRef<CSeq_entry> se (ss.SetData().SetEntrys().front());
241  if (se.NotEmpty()) {
243  CBioseq& bs = *bit;
244  SetGeneticCode (bs);
245  }
246  }
247 }
248 //LCOV_EXCL_STOP
249 
250 
252 
253 {
254  SubmitblockBC(block);
255 }
256 
257 
259  CSeq_annot& sa
260 )
261 
262 {
263  // no Seq-entry context, so skip setup function
265  CAutogeneratedCleanup auto_cleanup( *m_Scope, *this );
266  auto_cleanup.BasicCleanupSeqAnnot( sa );
268 }
269 
271  CBioseq& bs
272 )
273 
274 {
275  // no Seq-entry context, so skip setup function
276  SetGlobalFlags(bs);
277  CAutogeneratedCleanup auto_cleanup( *m_Scope, *this );
278  auto_cleanup.BasicCleanupBioseq( bs );
279 
281 
282  SetGeneticCode (bs);
283 }
284 
286  CBioseq_set& bss
287 )
288 
289 {
290  // no Seq-entry context, so skip setup function
291  SetGlobalFlags(bss);
292  CAutogeneratedCleanup auto_cleanup( *m_Scope, *this );
293  auto_cleanup.BasicCleanupBioseqSet( bss );
295 
297  CBioseq& bs = *bit;
298  SetGeneticCode (bs);
299  }
300 }
301 
303  CSeq_feat& sf
304 )
305 
306 {
307  // no Seq-entry context, so skip setup function
309  CAutogeneratedCleanup auto_cleanup( *m_Scope, *this );
310  auto_cleanup.BasicCleanupSeqFeat( sf );
312 }
313 
314 
316  CBioSource& src
317 )
318 
319 {
320  // no Seq-entry context, so skip setup function
322  CAutogeneratedCleanup auto_cleanup( *m_Scope, *this );
323  CRef<CSeq_feat> f(new CSeq_feat());
324  f->SetData().SetBiosrc().Assign(src);
325  auto_cleanup.BasicCleanupSeqFeat(*f);
327  src.Assign(f->GetData().GetBiosrc());
328 }
329 
330 
332 {
333  BiosourceBC(src);
334  BioSourceEC(src);
335 }
336 
337 
339  CSeq_entry_Handle& seh
340 )
341 {
343  CSeq_entry* se = const_cast<CSeq_entry*>(seq_entry.GetPointer());
345 }
346 
348  CBioseq_Handle& bsh
349 )
350 {
351  BasicCleanupBioseq(*const_cast<CBioseq*>(bsh.GetCompleteBioseq().GetPointer()));
352 }
353 
355  CBioseq_set_Handle& bssh
356 )
357 {
359 }
360 
362  CSeq_annot_Handle& sah
363 )
364 {
365  // clean a copy, and then update via the edit handle
366 
367  CRef<CSeq_annot> new_seq_annot( new CSeq_annot );
368  new_seq_annot->Assign( *sah.GetCompleteSeq_annot() );
369 
370  CSeq_annot_EditHandle edit_handle = sah.GetEditHandle();
371 
372  BasicCleanupSeqAnnot( *new_seq_annot );
373 
374  // Since CSeq_annot_EditHandle doesn't have ".Set[Fld]()" methods or
375  // a Replace() method, it's a little more tricky than the others.
376  CSeq_entry_EditHandle annot_parent = edit_handle.GetParentEntry();
377  if( annot_parent ) {
378  edit_handle.Remove();
379  sah = annot_parent.AttachAnnot( *new_seq_annot );
380  } else {
381  // if not part of anything else, a simple swap will do
382  CSeq_annot_Handle new_sah = m_Scope->AddSeq_annot( *new_seq_annot );
383  edit_handle.Swap( new_sah );
384  }
385 }
386 
388  CSeq_feat_Handle& sfh
389 )
390 {
391  // clean a copy, and then update via the edit handle
392 
393  CRef<CSeq_feat> new_seq_feat( new CSeq_feat );
394  new_seq_feat->Assign( *sfh.GetOriginalSeq_feat() );
395 
396  CSeq_feat_EditHandle edit_handle( sfh );
397 
398  BasicCleanupSeqFeat( *new_seq_feat );
399 
400  edit_handle.Replace( *new_seq_feat );
401 }
402 
403 
404 void CNewCleanup_imp::BasicCleanup(CPubdesc& pd, bool strip_serial)
405 {
406  bool was_strip_serial = m_StripSerial;
407  m_StripSerial = strip_serial;
408  PubdescBC(pd);
409  m_StripSerial = was_strip_serial;
410 }
411 
412 
414 {
416  CAutogeneratedCleanup auto_cleanup(*m_Scope, *this);
417  auto_cleanup.BasicCleanupSeqdesc(desc);
419 }
420 
421 
422 
423 
424 // Implementation methods
425 
427  CBioseq& bs
428 )
429 
430 {
431  if ( ! m_SyncGenCodes ) return;
432 
434  if (!bsh) return;
435 
436  if (CCleanup::SetGeneticCodes(bsh)) {
438  }
439 }
440 
442 {
443  if (m_Changes) {
444  m_Changes->SetChanged (e);
445  }
446 }
447 
449  CSeq_entry& se
450 )
451 
452 {
453 #if 0
454  SSeqEntryInfo seqEntryInfo;
455  if( ! m_SeqEntryInfoStack.empty() ) {
456  // inherit from parent by default
457  seqEntryInfo = m_SeqEntryInfoStack.top();
458  } else {
459  seqEntryInfo.m_IsEmblOrDdbj = false;
460  seqEntryInfo.m_StripSerial = true;
461  }
462 #endif
463 
464  // for cleanup Seq-entry and Seq-submit, set scope and parentize.
465  // We use exceptions for AddTopLevelSeqEntry because we need to detect
466  // if we've already processed the given Seq-entry.
467  {{
468  CSeq_entry_Handle seh =
470  if (seh) {
471 #if 0
472  // all code paths in this function must result
473  // in m_SeqEntryInfoStack getting a "push"
474  m_SeqEntryInfoStack.push( m_SeqEntryInfoStack.top() );
475 #endif
476  return;
477  }
478 
480  se.Parentize();
481  }}
482 
483 #if 0
484  // a few differences based on sequence identifier type
485  // (some values are reset here because they shouldn't inherit
486  // from higher seq-entry's)
487  VISIT_ALL_BIOSEQS_WITHIN_SEQENTRY (bs_itr, se) {
488  const CBioseq& bs = *bs_itr;
489  FOR_EACH_SEQID_ON_BIOSEQ (sid_itr, bs) {
490  const CSeq_id& sid = **sid_itr;
491  SWITCH_ON_SEQID_CHOICE (sid) {
492  case NCBI_SEQID(Genbank):
493  case NCBI_SEQID(Tpg):
494  {
495  const CTextseq_id& tsid = *GET_FIELD (sid, Textseq_Id);
496  if (FIELD_IS_SET (tsid, Accession)) {
497  const string& acc = GET_FIELD (tsid, Accession);
498  if (acc.length() == 6) {
499  seqEntryInfo.m_StripSerial = false;
500  }
501  }
502  }
503  break;
504  case NCBI_SEQID(Embl):
505  case NCBI_SEQID(Ddbj):
506  seqEntryInfo.m_StripSerial = false;
507  seqEntryInfo.m_IsEmblOrDdbj = true;
508  break;
509  case NCBI_SEQID(not_set):
510  case NCBI_SEQID(Local):
511  case NCBI_SEQID(Other):
512  case NCBI_SEQID(General):
513  break;
514  case NCBI_SEQID(Gibbsq):
515  case NCBI_SEQID(Gibbmt):
516  case NCBI_SEQID(Pir):
517  case NCBI_SEQID(Swissprot):
518  case NCBI_SEQID(Patent):
519  case NCBI_SEQID(Prf):
520  case NCBI_SEQID(Pdb):
521  case NCBI_SEQID(Gpipe):
522  case NCBI_SEQID(Tpe):
523  case NCBI_SEQID(Tpd):
524  seqEntryInfo.m_StripSerial = false;
525  break;
526  default:
527  break;
528  }
529  }
530  }
531 
532  m_SeqEntryInfoStack.push(seqEntryInfo);
533 #endif
534 }
535 
537  CSeq_entry& se
538 )
539 
540 {
541 #if 0
542  m_SeqEntryInfoStack.pop();
543 #endif
544 }
545 
546 // Strips all spaces in string in following manner. If the function
547 // meets several spaces (spaces and tabs) in succession it replaces them
548 // with one space. Strips all spaces after '(' and before ( ')' or ',' ).
550 {
551  if (StripSpaces(str)) {
553  }
554 }
555 
557 {
560  }
561 }
562 
564 {
565 /*
566 #ifndef NCBI_OS_MSWIN
567  string orig = str;
568  NStr::ReplaceInPlace(str, "based on SOLiD3 (Applied Biosystems)~~", "based on SOLiD3 (Applied Biosystems)", false, false);
569  NStr::ReplaceInPlace(str, "Biological resourse center, NITE (NRBC)~~", "Biological resourse center, NITE (NRBC)", false, false);
570  NStr::ReplaceInPlace(str, "developmental01.html~~", "developmental01.html", false, false);
571  NStr::ReplaceInPlace(str, "http://bionano.toyo.ac.jp/~~", "http://bionano.toyo.ac.jp/", false, false);
572  NStr::ReplaceInPlace(str, "http://dictycdb1.biol.tsukuba.ac.jp/acytodb/~~", "http://dictycdb1.biol.tsukuba.ac.jp/acytodb/", false, false);
573  NStr::ReplaceInPlace(str, "http://egg.umh.es~~", "http://egg.umh.es", false, false);
574  NStr::ReplaceInPlace(str, "http://www.aist.go.jp/~~", "http://www.aist.go.jp/", false, false);
575  NStr::ReplaceInPlace(str, "http://www.bio.nite.go.jp/~~", "http://www.bio.nite.go.jp/", false, false);
576  NStr::ReplaceInPlace(str, "http://www.bio.nite.go.jp/ngac/e/~~", "http://www.bio.nite.go.jp/ngac/e/", false, false);
577  NStr::ReplaceInPlace(str, "http://www.brs.kyushu-u.ac.jp/~fcmic/~~", "http://www.brs.kyushu-u.ac.jp/~fcmic/", false, false);
578  NStr::ReplaceInPlace(str, "http://www.miyazaki-u.ac.jp/ir/english/index.html~~", "http://www.miyazaki-u.ac.jp/ir/english/index.html", false, false);
579  NStr::ReplaceInPlace(str, "URL:http://www.bio.nite.go.jp/ngac/e/~~", "URL:http://www.bio.nite.go.jp/ngac/e/", false, false);
580  if (!NStr::Equal(orig, str)) {
581  ChangeMade(CCleanupChange::eTrimSpaces);
582  }
583 #endif //NCBI_OS_MSWIN
584 */
585 }
586 
588 {
589  const size_t old_str_size = str.length();
591  if( old_str_size != str.length() ) {
593  }
594 }
595 
597 {
598  const size_t old_str_size = str.length();
600  if( old_str_size != str.length() ) {
602  }
603 }
604 
605 
606 
607 static void s_IncrementSeqCount(const CBioseq& bioseq,
608  int& num_nucs, int& num_prots)
609 {
610  if (!bioseq.IsSetInst()) {
611  return;
612  }
613 
614  const auto& inst = bioseq.GetInst();
615  if (inst.IsNa()) {
616  ++num_nucs;
617  } else if (inst.IsAa()) {
618  ++num_prots;
619  }
620 }
621 
622 
623 static bool s_IsValidNPSubset(const CBioseq_set& bioseqSet)
624 {
625  return (bioseqSet.IsSetClass() &&
626  (bioseqSet.GetClass() == CBioseq_set::eClass_parts ||
627  bioseqSet.GetClass() == CBioseq_set::eClass_segset));
628 
629 }
630 
631 
632 static void s_ScanWhilePossibleNPSet(const CBioseq_set& bioseqSet,
633  int& num_nucs, int& num_prots, bool& hasInvalidSubset)
634 {
635  if (bioseqSet.IsSetSeq_set()) {
636  for (const auto& pSubEntry : bioseqSet.GetSeq_set()) {
637  if (pSubEntry) {
638  if (pSubEntry->IsSeq()) {
639  s_IncrementSeqCount(pSubEntry->GetSeq(), num_nucs, num_prots);
640  } else {
641  const auto& bioseqSet = pSubEntry->GetSet();
642  if (!s_IsValidNPSubset(bioseqSet)) {
643  hasInvalidSubset = true;
644  return;
645  }
646  s_ScanWhilePossibleNPSet(bioseqSet, num_nucs, num_prots, hasInvalidSubset);
647  }
648  if (num_nucs > 1) {
649  return;
650  }
651  }
652  }
653  }
654 }
655 
656 
657 static bool s_LooksLikeNucProtSet(const CBioseq_set& bioseqSet)
658 {
659  int numNucs{0};
660  int numProts{0};
661  bool hasInvalidSubset{false};
662  s_ScanWhilePossibleNPSet(bioseqSet, numNucs, numProts, hasInvalidSubset);
663 
664  return (!hasInvalidSubset && numNucs == 1 && numProts > 0);
665 }
666 
667 
669 {
670  if (bss.IsSetClass() &&
673  return;
674  }
675 
676  if( s_LooksLikeNucProtSet(bss) ) {
679  } else {
682  }
683 }
684 
685 static CMolInfo::TCompleteness GetCompletenessFromFlags(bool partial5, bool partial3, bool partial)
686 {
688  if (partial5 && partial3) {
690  } else if (partial5) {
692  } else if (partial3) {
694  } else if (partial) {
696  }
697  return comp;
698 }
699 
701 {
702  // Bail if not protein
703  if (!bs.IsSetInst()) {
704  return;
705  }
706  CSeq_inst& inst = bs.SetInst();
707  if (!inst.IsSetMol() || inst.GetMol() != CSeq_inst::eMol_aa) {
708  return;
709  }
710 
712  bs.SetInst().ResetTopology();
714  }
715 
716  // Bail if no GIBBSQ ID
717  if (!bs.IsSetId()) {
718  return;
719  }
720  bool has_gibbsq = false;
721  ITERATE(CBioseq::TId, id, bs.GetId()) {
722  if ((*id)->IsGibbsq()) {
723  has_gibbsq = true;
724  break;
725  }
726  }
727  if (!has_gibbsq) {
728  return;
729  }
730 
731  // Bail if no title or no partialness clues in title
732  if (!bs.IsSetDescr()) {
733  return;
734  }
735  bool make_partial5 = false;
736  bool make_partial3 = false;
737  for (auto dit : bs.GetDescr().Get()) {
738  if (dit->IsTitle()) {
739  if (NStr::Find(dit->GetTitle(), "{C-terminal}") != string::npos) {
740  make_partial5 = true;
741  }
742  if (NStr::Find(dit->GetTitle(), "{N-terminal}") != string::npos) {
743  make_partial3 = true;
744  }
745  break;
746  }
747  }
748 
749  if (!make_partial5 && !make_partial3) {
750  return;
751  }
752 
753  // Bail if no protein feature with missing partials
754  if (!bs.IsSetAnnot()) {
755  return;
756  }
757  for (auto ait : bs.SetAnnot()) {
758  if (ait->IsSetData() && ait->GetData().IsFtable()) {
759  for (auto fi : ait->SetData().SetFtable()) {
760  if (fi->IsSetData() &&
761  fi->GetData().GetSubtype() == CSeqFeatData::eSubtype_prot &&
762  fi->IsSetPartial() && fi->GetPartial() &&
763  fi->IsSetLocation() &&
764  !fi->GetLocation().IsPartialStart(eExtreme_Biological) &&
765  !fi->GetLocation().IsPartialStop(eExtreme_Biological)) {
766  // note - we are only fixing partials if *both*
767  // ends were left as complete. One end being
768  // set as partial means that someone was doing this
769  // deliberately.
770  if (make_partial5) {
771  fi->SetLocation().SetPartialStart(true, eExtreme_Biological);
772  }
773  if (make_partial3) {
774  fi->SetLocation().SetPartialStop(true, eExtreme_Biological);
775  }
777 
778  CMolInfo::TCompleteness wanted = GetCompletenessFromFlags(make_partial5, make_partial3, true);
779  for (auto ds : bs.SetDescr().Set()) {
780  if (ds->IsMolinfo() &&
781  (!ds->GetMolinfo().IsSetCompleteness() ||
782  ds->GetMolinfo().GetCompleteness() != wanted)) {
783  ds->SetMolinfo().SetCompleteness(wanted);
785  break;
786  }
787  }
788  }
789  }
790  }
791  }
792 }
793 
794 
796 {
797  // try to find CObject_id in Seq-id for certain types
798  CRef<CObject_id> pObjectId;
799  if( seq_id.IsLocal() ) {
800  pObjectId.Reset( & GET_MUTABLE(seq_id, Local) );
801  }
802 
803  // currently, we only process the Str ones
804  if( ! pObjectId || ! FIELD_IS(*pObjectId, Str) ) {
805  return;
806  }
807 
808  x_TruncateSpacesMarkChanged( GET_MUTABLE(*pObjectId, Str) );
809 }
810 
811 // change the target string by searching for the given search_pattern
812 // and replacing it with replacement up to max_replace times (0 means unlimited)
813 //
814 // Example:
815 // string foo = "Test: FOO BAR :BAZ."
816 // s_RegexpReplace( foo, ":[ ]+", ": " );
817 // This turns foo into "Test: FOO BAR :BAZ."
818 // Returns "true" if a replacement was done
819 
821 
822 static
823 bool s_RegexpReplace( string &target,
824  const char *search_pattern,
825  const char *replacement,
826  int max_replace = s_RegexpReplace_UnlimitedReplacements,
828 {
829  CRegexpUtil replacer( target );
830  size_t num_replacements = replacer.Replace( search_pattern, replacement,
831  compile_flags, CRegexp::fMatch_default, max_replace );
832  // swap is faster than assignment
833  replacer.GetResult().swap( target );
834 
835  return ( num_replacements > 0 );
836 }
837 
838 // This is similar to lexicographical_compare_3way,
839 // but we have to implement it ourselves because
840 // it's an SGI extension, not in the standard.
841 template <class Iter1, class Iter2, class Compare>
843  Iter1 first1, Iter1 last1,
844  Iter2 first2, Iter2 last2,
845  Compare compare )
846 {
847  for( ; first1 != last1 && first2 != last2 ; ++first1, ++first2 ) {
848  int comparison = compare( *first1, *first2 );
849  if( comparison != 0 ) {
850  return comparison;
851  }
852  }
853 
854  if( first1 == last1 ) {
855  if( first2 == last2 ) {
856  return 0; // they're equal
857  } else {
858  // second is longer
859  return -1;
860  }
861  } else {
862  // first is longer
863  return 1;
864  }
865 }
866 
868 {
869 public:
870  bool operator()( const char ch1, const char ch2 ) const {
871  return toupper(ch1) == toupper(ch2);
872  }
873 };
874 
876 {
877 public:
878  bool operator()( const char ch1, const char ch2 ) const {
879  return toupper(ch1) < toupper(ch2);
880  }
881 };
882 
884 {
885 public:
886  int operator()( const char ch1, const char ch2 ) const {
887  return ( (int)toupper(ch1) - (int)toupper(ch2) );
888  }
889 };
890 
891 // C compares using toupper, as opposed to the built-in
892 // stuff which seems to use tolower, thus producing
893 // some differences in sorting order in some places.
894 // Once we've fully moved away from C there's probably
895 // no harm in replacing all calls to s_CompareNoCaseCStyle with
896 // normal functions like NStr::CompareNocase()
897 static
898 int s_CompareNoCaseCStyle( const string &s1, const string &s2 )
899 {
901  s1.begin(), s1.end(),
902  s2.begin(), s2.end(),
904 }
905 
906 static
907 const string &s_GenomeToPlastidName( const CBioSource& biosrc )
908 {
909  SWITCH_ON_BIOSOURCE_GENOME (biosrc) {
910  case NCBI_GENOME(apicoplast):
911  {
912  const static string apicoplast("apicoplast");
913  return apicoplast;
914  }
915  break;
916  case NCBI_GENOME(chloroplast):
917  {
918  const static string chloroplast("chloroplast");
919  return chloroplast;
920  }
921  break;
922  case NCBI_GENOME(chromoplast):
923  {
924  const static string chromoplast("chromoplast");
925  return chromoplast;
926  }
927  break;
928  case NCBI_GENOME(kinetoplast):
929  {
930  const static string kinetoplast("kinetoplast");
931  return kinetoplast;
932  }
933  break;
934  case NCBI_GENOME(leucoplast):
935  {
936  const static string leucoplast("leucoplast");
937  return leucoplast;
938  }
939  break;
940  case NCBI_GENOME(plastid):
941  {
942  const static string plastid("plastid");
943  return plastid;
944  }
945  break;
946  case NCBI_GENOME(proplastid):
947  {
948  const static string proplastid("proplastid");
949  return proplastid;
950  }
951  break;
952  default:
953  return kEmptyStr;
954  break;
955  }
956  return kEmptyStr;
957 }
958 
959 // If str starts with prefix, the prefix is removed from the string.
960 static
961 bool s_RemoveInitial( string &str, const string &prefix, NStr::ECase case_to_use )
962 {
963  if( NStr::StartsWith( str, prefix, case_to_use ) ) {
964  str.erase( 0, prefix.length() );
965  return true;
966  }
967  return false;
968 }
969 
970 // Given the position of the opening paren in a string, this returns
971 // the position of the closing paren (keeping track of any nested parens
972 // in the middle.
973 // It returns NPOS if the paren is not closed.
974 // This function is not currently smart; it doesn't know about quotes
975 // or anything
976 static
977 SIZE_TYPE s_MatchingParenPos( const string &str, SIZE_TYPE open_paren_pos )
978 {
979  _ASSERT( str[open_paren_pos] == '(' );
980  _ASSERT( open_paren_pos < str.length() );
981 
982  // nesting level. start at 1 since we know there's an open paren
983  int level = 1;
984 
985  SIZE_TYPE pos = open_paren_pos + 1;
986  for( ; pos < str.length(); ++pos ) {
987  switch( str[pos] ) {
988  case '(':
989  // nesting deeper
990  ++level;
991  break;
992  case ')':
993  // closed a level of nesting
994  --level;
995  if( 0 == level ) {
996  // reached the top: we're closing the initial paren,
997  // so we return our position
998  return pos;
999  }
1000  break;
1001  default:
1002  // ignore other characters.
1003  // maybe in the future we'll handle ignoring parens in quotes or
1004  // things like that.
1005  break;
1006  }
1007  }
1008  return NPOS;
1009 }
1010 
1011 static bool s_AccessionCompare (
1012  const string& str1,
1013  const string& str2
1014 )
1015 
1016 {
1017  return ( NStr::CompareNocase( str1, str2 ) < 0 );
1018 }
1019 
1020 static bool s_AccessionEqual (
1021  const string& str1,
1022  const string& str2
1023 )
1024 
1025 {
1026  if (NStr::EqualNocase (str1, str2)) return true;
1027 
1028  return false;
1029 }
1030 
1031 
1033 {
1034  if (CleanVisStringJunk(str)) {
1036  }
1037 }
1038 
1039 
1041  CGB_block& gbk
1042 )
1043 
1044 {
1045  CLEAN_STRING_LIST (gbk, Extra_accessions);
1046 
1050  }
1051 
1055  }
1056 
1057  // split keywords at semicolons
1058  if (gbk.IsSetKeywords()) {
1059  string one_string = NStr::Join(gbk.GetKeywords(), ";");
1060  gbk.ResetKeywords();
1061  NStr::Split(one_string, ";", gbk.SetKeywords());
1062  }
1063 
1064  CLEAN_STRING_LIST (gbk, Keywords);
1065 
1066  CCachedRegexp reassembly_regex
1067  = regexpCache.Get("^tpa(?:_|[_:]re)assembly$",
1069  EDIT_EACH_KEYWORD_ON_EMBLBLOCK(keyword_it, gbk) {
1070  string & sKeyword = *keyword_it;
1071  if( reassembly_regex->IsMatch(sKeyword) ) {
1072  sKeyword = "TPA:assembly";
1074  }
1075  }
1076 
1077  if( m_IsEmblOrDdbj ) {
1079  } else {
1081  }
1082 
1083  CLEAN_STRING_MEMBER_JUNK (gbk, Source);
1084  if( FIELD_EQUALS(gbk, Source, ".") ) {
1085  RESET_FIELD(gbk, Source);
1087  }
1088  if( FIELD_EQUALS(gbk, Origin, ".") ) {
1089  RESET_FIELD(gbk, Origin);
1091  }
1092 
1093  CLEAN_STRING_MEMBER (gbk, Date);
1094  CLEAN_STRING_MEMBER (gbk, Div);
1095  CLEAN_STRING_MEMBER (gbk, Taxonomy);
1096 }
1097 
1099  CEMBL_block& emb
1100 )
1101 
1102 {
1103  CLEAN_STRING_LIST (emb, Extra_acc);
1104 
1108  }
1109 
1113  }
1114 
1115  CLEAN_STRING_LIST (emb, Keywords);
1116 
1118 }
1119 
1120 
1121 // Give it a map that maps case-insensitive string to some other type,
1122 // and it will return any matches that are a prefix for str.
1123 // For example, if you have a mapping that includes ("foo" to 7), then passing
1124 // str as "Foo something", will return the ("foo" to 7) mapping.
1125 template< typename TMapType >
1126 typename TMapType::const_iterator s_FindInMapAsPrefix( const string &str_arg, const TMapType &the_map )
1127 {
1128  // holds the str we're looking at, which might be str_arg, or
1129  // might be another string constructed from it
1130  const string *str = &str_arg;
1131 
1132  // use this to delete strings created in this function, if any.
1133  // we don't read from it directly
1134  unique_ptr<string> temp_str;
1135 
1136  // chop off characters that can't be in the map, so they don't count
1137  SIZE_TYPE first_bad_char = 0;
1138  for( ; first_bad_char < str_arg.length(); ++first_bad_char ) {
1139  const char ch = str_arg[first_bad_char];
1140  if( ! isalnum(ch) && ch != '-' && ch != '_' && ch != ' ' ) {
1141  temp_str.reset( new string(str_arg, 0, first_bad_char) );
1142  str = temp_str.get();
1143  break;
1144  }
1145  }
1146 
1147  typename TMapType::const_iterator it = the_map.lower_bound( *str );
1148  if( it != the_map.begin() && ( it == the_map.end() || ! NStr::EqualNocase(*str, it->first) ) ) {
1149  --it;
1150  }
1151  if ( it != the_map.end() && NStr::StartsWith(*str, it->first, NStr::eNocase)) {
1152  return it;
1153  }
1154  return the_map.end();
1155 }
1156 
1157 // s_FindInMapAsPrefix, but for data structures like sets.
1158 template< typename TSetType >
1159 typename TSetType::const_iterator s_FindInSetAsPrefix( const string &str, const TSetType &the_set )
1160 {
1161  typename TSetType::const_iterator it = the_set.lower_bound( str );
1162  if( it != the_set.begin() && ( it == the_set.end() || ! NStr::EqualNocase(str, *it) ) ) {
1163  --it;
1164  }
1165  if ( it != the_set.end() && NStr::StartsWith(str, *it, NStr::eNocase)) {
1166  return it;
1167  }
1168  return the_set.end();
1169 }
1170 
1171 
1172 // "str" is taken by const reference and is not modified
1173 // returns true if we found anything
1174 static
1175 bool s_StringHasOrgModPrefix(const string &str, string::size_type &out_val_start_pos, TORGMOD_SUBTYPE &out_subtype)
1176 {
1177  SIZE_TYPE pos = str.find_first_of(": ="), pos2;
1178  if (pos != 0 && pos != NPOS
1179  && (pos2 = str.find_first_not_of(": =", pos)) != NPOS) {
1180  try {
1181  string val = str.substr(0, pos);
1183  if ( !COrgMod::IsDiscouraged(subtype) ) {
1184  out_subtype = subtype;
1185  out_val_start_pos = pos2;
1186  return true;
1187  }
1188  } catch (CSerialException&) {
1189  }
1190  }
1191  return false;
1192 }
1193 
1194 // returns true if we found anything
1195 static
1196 bool s_StringHasSubSourcePrefix(const string &str, string::size_type &out_val_start_pos, TSUBSOURCE_SUBTYPE &out_subtype)
1197 {
1198  SIZE_TYPE pos = str.find_first_of(": ="), pos2;
1199  if (pos != 0 && pos != NPOS
1200  && (pos2 = str.find_first_not_of(": =", pos)) != NPOS) {
1201  try {
1202  string val = str.substr(0, pos);
1203  CSubSource::TSubtype subtype;
1204  if (NStr::EqualNocase(val, "Lat-long") || NStr::EqualNocase(val, "Latitude-Longitude")) {
1205  subtype = CSubSource::eSubtype_lat_lon;
1206  } else {
1208  }
1209  if ( subtype == CSubSource::eSubtype_fwd_primer_name ||
1213  !CSubSource::IsDiscouraged(subtype) ) {
1214  out_subtype = subtype;
1215  out_val_start_pos = pos2;
1216  return true;
1217  }
1218  } catch (CSerialException&) {
1219  }
1220  } else {
1221  // did not find delimiters
1222  try {
1224  if ( !CSubSource::IsDiscouraged(subtype) && CSubSource::NeedsNoText(subtype)) {
1225  out_subtype = subtype;
1226  out_val_start_pos = str.length();
1227  return true;
1228  }
1229  } catch (CSerialException&) {
1230  }
1231  }
1232  return false;
1233 }
1234 
1235 
1236 // is st1 < st2
1237 
1238 static bool s_SubsourceCompare (
1239  const CRef<CSubSource>& st1,
1240  const CRef<CSubSource>& st2
1241 )
1242 
1243 {
1244  const CSubSource& sbs1 = *(st1);
1245  const CSubSource& sbs2 = *(st2);
1246 
1247  TSUBSOURCE_SUBTYPE chs1 = GET_FIELD (sbs1, Subtype);
1248  TSUBSOURCE_SUBTYPE chs2 = GET_FIELD (sbs2, Subtype);
1249 
1250  if (chs1 < chs2) return true;
1251  if (chs1 > chs2) return false;
1252 
1253  if (FIELD_IS_SET (sbs2, Name)) {
1254  if (! FIELD_IS_SET (sbs1, Name)) return true;
1255  if (s_CompareNoCaseCStyle(GET_FIELD (sbs1, Name), GET_FIELD (sbs2, Name)) < 0) return true;
1256  }
1257 
1258  return false;
1259 }
1260 
1261 // Two SubSource's are equal and duplicates if:
1262 // they have the same subtype
1263 // and the same name (or don't require a name).
1264 
1265 static bool s_SubsourceEqual (
1266  const CRef<CSubSource>& st1,
1267  const CRef<CSubSource>& st2
1268 )
1269 
1270 {
1271  const CSubSource& sbs1 = *(st1);
1272  const CSubSource& sbs2 = *(st2);
1273 
1274  TSUBSOURCE_SUBTYPE chs1 = GET_FIELD (sbs1, Subtype);
1275  TSUBSOURCE_SUBTYPE chs2 = GET_FIELD (sbs2, Subtype);
1276 
1277  if (chs1 != chs2) return false;
1278  if (CSubSource::NeedsNoText (chs2)) return true;
1279 
1280  if (FIELD_IS_SET (sbs1, Name) && FIELD_IS_SET (sbs2, Name)) {
1281  if (NStr::EqualNocase (GET_FIELD (sbs1, Name), GET_FIELD (sbs2, Name))) return true;
1282  }
1283  if (! FIELD_IS_SET (sbs1, Name) && ! FIELD_IS_SET (sbs2, Name)) return true;
1284 
1285  return false;
1286 }
1287 
1289  CBioSource& biosrc,
1290  CSeq_feat & seqfeat
1291 )
1292 {
1293  // consolidate all orgmods of subtype "other" into one
1294  CRef<COrgMod> pFirstOtherOrgMod;
1295  if (biosrc.IsSetOrg() && biosrc.GetOrg().IsSetOrgname() && biosrc.GetOrg().GetOrgname().IsSetMod()) {
1296  auto& mod_set = biosrc.SetOrg().SetOrgname().SetMod();
1297  auto mod_it = mod_set.begin();
1298  while (mod_it != mod_set.end()) {
1299  COrgMod & orgmod = **mod_it;
1300 
1301  // we're only cleaning the ones of type "other"
1302  if (!FIELD_EQUALS(orgmod, Subtype, NCBI_ORGMOD(other)) ||
1303  !FIELD_IS_SET(orgmod, Subname))
1304  {
1305  ++mod_it;
1306  continue;
1307  }
1308 
1309  if (pFirstOtherOrgMod) {
1310  STRING_FIELD_APPEND(*pFirstOtherOrgMod, Subname, "; ", GET_STRING_FLD_OR_BLANK(orgmod, Subname));
1312  mod_it = mod_set.erase(mod_it);
1314  } else {
1315  pFirstOtherOrgMod.Reset(&orgmod);
1316  ++mod_it;
1317  }
1318  }
1319  }
1320 
1321  // consolidate all subsources of subtype "other" into one
1322  CRef<CSubSource> pFirstOtherSubSource;
1323  EDIT_EACH_SUBSOURCE_ON_BIOSOURCE( subsrc_iter, biosrc ) {
1324  CSubSource &subsrc = **subsrc_iter;
1325 
1326  // we're only cleaning the ones of type "other"
1327  if( ! FIELD_EQUALS(subsrc, Subtype, NCBI_SUBSOURCE(other) ) ||
1328  ! FIELD_IS_SET(subsrc, Name) )
1329  {
1330  continue;
1331  }
1332 
1333  if( pFirstOtherSubSource ) {
1334  STRING_FIELD_APPEND(*pFirstOtherSubSource, Name, "; ", GET_STRING_FLD_OR_BLANK(subsrc, Name) );
1336  ERASE_SUBSOURCE_ON_BIOSOURCE(subsrc_iter, biosrc);
1338  } else {
1339  pFirstOtherSubSource.Reset( &subsrc );
1340  }
1341  }
1342 
1343  // transfer feat comment (if any) to the end of the last other subsource note
1344  if( FIELD_IS_SET(seqfeat, Comment) ) {
1345 
1346  if( ! pFirstOtherSubSource ) {
1347  // create an empty subsource note if none found
1348  pFirstOtherSubSource.Reset( new CSubSource );
1349  SET_FIELD(*pFirstOtherSubSource, Subtype, NCBI_SUBSOURCE(other) );
1350  ADD_SUBSOURCE_TO_BIOSOURCE(biosrc, pFirstOtherSubSource);
1351  }
1352 
1353  STRING_FIELD_APPEND(*pFirstOtherSubSource, Name, "; ", GET_FIELD(seqfeat, Comment));
1355  RESET_FIELD(seqfeat, Comment);
1357  }
1358 
1359  // special orgmod cleanup just for features (yes, is stupid, but is what C toolkit does)
1360  if (biosrc.IsSetOrg() && biosrc.GetOrg().IsSetMod()) {
1361  EDIT_EACH_MOD_ON_ORGREF (it, biosrc.SetOrg()) {
1362  if (x_CompressSpaces(*it)) {
1364  }
1365  }
1366  }
1367 }
1368 
1369 static void s_CorrectTildes (
1370  string& str
1371 )
1372 
1373 {
1374 #ifndef NCBI_OS_MSWIN
1375  NStr::ReplaceInPlace (str, "were ~25 cm in height (~3 weeks)", "were ~~25 cm in height (~~3 weeks)");
1376  NStr::ReplaceInPlace (str, "generally ~3 weeks", "generally ~~3 weeks");
1377  NStr::ReplaceInPlace (str, "sequencing (~4 96-well plates)", "sequencing (~~4 96-well plates)");
1378  NStr::ReplaceInPlace (str, "size distribution (~2 kb)", "size distribution (~~2 kb)");
1379  NStr::ReplaceInPlace (str, "sequencing (~3 96-well plates)", "sequencing (~~3 96-well plates)");
1380  NStr::ReplaceInPlace (str, "vector. 1~2 ul of ligated", "vector. 1~~2 ul of ligated");
1381  /*
1382  NStr::ReplaceInPlace (str, "Lambda FLC I.~Islet cells were provided", "Lambda FLC I.~~Islet cells were provided");
1383  */
1384  NStr::ReplaceInPlace (str, "different strains~of mice", "different strains of mice");
1385  NStr::ReplaceInPlace (str, "oligo-dT-NotI primer~(5'-biotin", "oligo-dT-NotI primer (5'-biotin");
1386  NStr::ReplaceInPlace (str, "sizes of 200~800 bp were purified", "sizes of 200~~800 bp were purified");
1387  NStr::ReplaceInPlace (str, "Tween 20 (~50 ml per tree)", "Tween 20 (~~50 ml per tree)");
1388  NStr::ReplaceInPlace (str, "the SMART approach (~http://www.evrogen.com", "the SMART approach (http://www.evrogen.com");
1389  NStr::ReplaceInPlace (str, "the morning (~10 am) with", "the morning (~~10 am) with");
1390  NStr::ReplaceInPlace (str, "(host) sequences (~10%)", "(host) sequences (~~10%)");
1391  /*
1392  NStr::ReplaceInPlace (str, "unidirectionally.~ High quality", "unidirectionally. High quality");
1393  NStr::ReplaceInPlace (str, "onlysubmitted.~ Average", "onlysubmitted. Average");
1394  */
1395  NStr::ReplaceInPlace (str, "Plasmid; ~The F03-1270", "Plasmid; The F03-1270");
1396  NStr::ReplaceInPlace (str, "using STS-PCR~from Eb", "using STS-PCR from Eb");
1397  NStr::ReplaceInPlace (str, "specific to~the Eb", "specific to the Eb");
1398  NStr::ReplaceInPlace (str, "side of insert); , M.F., Lennon", "side of insert); Bonaldo, M.F., Lennon");
1399  NStr::ReplaceInPlace (str, "Uni-ZAP XR vector. 1~2 ul of", "Uni-ZAP XR vector. 1~~2 ul of");
1400  NStr::ReplaceInPlace (str, "from diploid~Secale montanum", "from diploid Secale montanum");
1401  NStr::ReplaceInPlace (str, "homology with~U43516,", "homology with U43516,");
1402  /*
1403  NStr::ReplaceInPlace (str, "from http//www.biobase.dk/~ddbase", "from http//www.biobase.dk/~~ddbase");
1404  */
1405  NStr::ReplaceInPlace (str, "plasmid; ~Assembled EST", "plasmid; Assembled EST");
1406  NStr::ReplaceInPlace (str, "databases.~Different cDNA", "databases. Different cDNA");
1407  NStr::ReplaceInPlace (str, "enzyme PstI.~DH5-alpha", "enzyme PstI. DH5-alpha");
1408  NStr::ReplaceInPlace (str, "as they~were prepared", "as they were prepared");
1409  NStr::ReplaceInPlace (str, "loci in~the genome", "loci in the genome");
1410  NStr::ReplaceInPlace (str, "P{CaSpeR}Cp1~50C (FBti0004219)", "P{CaSpeR}Cp1~~50C (FBti0004219)");
1411  NStr::ReplaceInPlace (str, "seedlings with 2~4 leaves", "seedlings with 2~~4 leaves");
1412  NStr::ReplaceInPlace (str, "tween 20 (~50mLs per tree)", "tween 20 (~~50mLs per tree)");
1413 #endif //NCBI_OS_MSWIN
1414 }
1415 
1416 
1417 bool s_SameSubtype(const CSubSource& s1, const CSubSource& s2)
1418 {
1419  if (!s1.IsSetSubtype() && !s2.IsSetSubtype()) {
1420  return true;
1421  } else if (!s1.IsSetSubtype() || !s2.IsSetSubtype()) {
1422  return false;
1423  } else {
1424  return s1.GetSubtype() == s2.GetSubtype();
1425  }
1426 }
1427 
1428 
1429 // close enough if second name contains the first
1430 bool s_NameCloseEnough(const CSubSource& s1, const CSubSource& s2)
1431 {
1432  if (!s1.IsSetName() && !s2.IsSetName()) {
1433  return true;
1434  } else if (!s1.IsSetName() || !s2.IsSetName()) {
1435  return false;
1436  }
1437  const string& n1 = s1.GetName();
1438  const string& n2 = s2.GetName();
1439 
1440  if (NStr::Equal(n1, n2)) {
1441  return true;
1442  } else {
1443  return false;
1444  }
1445 }
1446 
1447 
1449 {
1450  if (!biosrc.IsSetSubtype()) {
1451  return;
1452  }
1453 
1454  // sort and remove duplicates.
1455  if (biosrc.IsSetSubtype() && biosrc.GetSubtype().size() > 1) {
1459  }
1460 
1461  // remove duplicates and subsources that contain previous values
1462  CBioSource::TSubtype::iterator s = biosrc.SetSubtype().begin();
1463  CBioSource::TSubtype::iterator s_next = s;
1464  ++s_next;
1465  while (s_next != biosrc.SetSubtype().end()) {
1466  if (s_SameSubtype(**s, **s_next) && s_NameCloseEnough(**s, **s_next)) {
1467  s = biosrc.SetSubtype().erase(s);
1469  } else {
1470  ++s;
1471  }
1472  ++s_next;
1473  }
1474  }
1475 }
1476 
1477 static string s_RepairISOCollDateTimeString (string& date_string)
1478 {
1479  vector<string> components;
1480  NStr::Split(date_string, "T", components);
1481 
1482  if (components.size() == 1) {
1483  return date_string;
1484  }
1485 
1486  if (components.size() == 2) {
1487  string dat = components[0];
1488  string tim = components[1];
1489  size_t zee = tim.length();
1490  if (zee > 4 && tim[zee-1] == 'Z' && tim[1] == ':') {
1491  return dat + "T" + "0" + tim;
1492  }
1493  }
1494 
1495  return date_string;
1496 }
1497 
1498 static string s_RepairISOCollDateTimePair (string& coll_date)
1499 {
1500  vector<string> pieces;
1501  NStr::Split(coll_date, "/", pieces);
1502 
1503  if (pieces.size() == 1) {
1504  string newdate = s_RepairISOCollDateTimeString(pieces[0]);
1505  }
1506 
1507  if (pieces.size() == 2) {
1508  string fstdate = s_RepairISOCollDateTimeString(pieces[0]);
1509  string scddate = s_RepairISOCollDateTimeString(pieces[1]);
1510  return fstdate + "/" + scddate;
1511  }
1512 
1513  return coll_date;
1514 }
1515 
1516 string s_CleanupLatLon( string &subname ) {
1517  string lat;
1518  string north_or_south;
1519  string lon;
1520  string east_or_west;
1521 
1522  if (subname.length() < 1) {
1523  return subname;
1524  }
1525  char ch = subname[0];
1526  if (ch < '0' || ch > '9') {
1527  return subname;
1528  }
1529 
1530  // extract the pieces
1531  CNcbiIstrstream lat_lon_stream( subname );
1532  lat_lon_stream >> lat;
1533  lat_lon_stream >> north_or_south;
1534  lat_lon_stream >> lon;
1535  lat_lon_stream >> east_or_west;
1536  if( lat_lon_stream.bad() ) {
1537  return subname;
1538  }
1539 
1540  if( north_or_south != "N" && north_or_south != "S" ) {
1541  return subname;
1542  }
1543 
1544  if( east_or_west != "E" && east_or_west != "W" ) {
1545  return subname;
1546  }
1547 
1548  size_t pos = NStr::Find(lat, ".");
1549  if (pos > 0) {
1550  size_t len = lat.length();
1551  if (pos + 9 < len) {
1552  lat.erase(pos + 9);
1553  }
1554  }
1555 
1556  pos = NStr::Find(lon, ".");
1557  if (pos > 0) {
1558  size_t len = lon.length();
1559  if (pos + 9 < len) {
1560  lon.erase(pos + 9);
1561  }
1562  }
1563 
1564  return lat + " " + north_or_south + " " + lon + " " + east_or_west;
1565 }
1566 
1568  CBioSource& biosrc
1569 )
1570 {
1571  if( FIELD_EQUALS( biosrc, Genome, CBioSource::eGenome_virion ) ) {
1572  RESET_FIELD( biosrc, Genome );
1574  }
1575 
1576  if( FIELD_EQUALS( biosrc, Origin, NCBI_ORIGIN(unknown) ) ) {
1577  RESET_FIELD(biosrc, Origin);
1579  }
1580 
1581  // remove spaces and convert to lowercase in fwd_primer_seq and rev_primer_seq.
1582  if( FIELD_IS_SET(biosrc, Subtype) ) {
1583  SUBSOURCE_ON_BIOSOURCE_Type::iterator prev =
1584  SUBSOURCE_ON_BIOSOURCE_Set(biosrc).end();
1585  EDIT_EACH_SUBSOURCE_ON_BIOSOURCE (it, biosrc) {
1586  CSubSource& sbs = **it;
1587 
1588  TSUBSOURCE_SUBTYPE chs = GET_FIELD (sbs, Subtype);
1589  if (CSubSource::NeedsNoText (chs)) {
1590  // name is required - set it to empty string
1591  if( ! FIELD_IS_SET(sbs, Name) || ! GET_FIELD(sbs, Name).empty() ) {
1592  SET_FIELD (sbs, Name, "");
1594  }
1595  CLEAN_STRING_MEMBER(sbs, Attrib);
1596  } else {
1598  if( ! FIELD_IS_SET(sbs, Name) ) {
1599  // name must be set
1600  SET_FIELD (sbs, Name, "");
1602  }
1603  x_RemoveFlankingQuotes( GET_MUTABLE(sbs, Name) );
1604  CLEAN_STRING_MEMBER(sbs, Attrib);
1605  }
1606 
1607  if( chs == NCBI_SUBSOURCE(country) ) {
1608  string &country = GET_MUTABLE(sbs, Name);
1609  static const string kUSPrefix( "United States:" );
1610  if( NStr::EqualNocase(country, "United States") ||
1611  NStr::EqualNocase(country, "United States of America") ||
1612  NStr::EqualNocase(country, "U.S.A.") )
1613  {
1614  country = "USA";
1616  } else if( NStr::StartsWith(country, kUSPrefix, NStr::eNocase) ) {
1617  country.replace( 0, kUSPrefix.length(), "USA:" );
1619  }
1620 
1621  }
1622 
1623  if( chs == NCBI_SUBSOURCE(altitude) ) {
1624  string &altitude = GET_MUTABLE(sbs, Name);
1625 
1626  // normalize units part (that is, the ending) if possible
1627  // (e.g. "meters", etc. to "m.")
1628  // Note that we do NOT count a match if it's just a number because
1629  // we can't be sure that the submitter wasn't thinking "feet" or whatever.
1630  CCachedRegexp altitude_regex = regexpCache.Get(
1631  "^([+-]?[0-9]+(\\.[0-9]+)?) ?(m|meter[s]?|metre[s]?)\\.?$",
1633 
1634  if( altitude_regex->IsMatch(altitude) ) {
1635  string new_altitude = altitude_regex->GetSub(altitude, 1);
1636  new_altitude += " m";
1637  if( altitude != new_altitude ) {
1638  altitude = new_altitude;
1640  }
1641  }
1642  }
1643 
1644  if( chs == NCBI_SUBSOURCE(lat_lon) ) {
1645  string &lat_lon = GET_MUTABLE(sbs, Name);
1646 
1647  string subname = s_CleanupLatLon(lat_lon);
1648  if ( lat_lon != subname ) {
1649  lat_lon = subname;
1651  }
1652  }
1653 
1654  /*
1655  if( chs == NCBI_SUBSOURCE(lat_lon) ) {
1656  string &lat_lon = GET_MUTABLE(sbs, Name);
1657 
1658  CCachedRegexp lat_lon_with_comma = regexpCache.Get(
1659  "^[-.0-9]+ ., [-.0-9]+ .$");
1660  if( lat_lon_with_comma->IsMatch(lat_lon) ) {
1661  // remove the comma
1662  SIZE_TYPE comma_pos = lat_lon.find(',');
1663  _ASSERT(comma_pos != NPOS );
1664  lat_lon.erase(comma_pos, 1);
1665  ChangeMade(CCleanupChange::eCleanSubsource);
1666  }
1667  }
1668  */
1669 
1670  if ( chs == NCBI_SUBSOURCE(collection_date) ) {
1671  string &coll_date = GET_MUTABLE(sbs, Name);
1672  string new_date = s_RepairISOCollDateTimePair(coll_date);
1673  if (!NStr::Equal(new_date, coll_date)) {
1674  coll_date = new_date;
1676  }
1677  }
1678 
1679  if ( chs == NCBI_SUBSOURCE(fwd_primer_seq) ||
1680  chs == NCBI_SUBSOURCE(rev_primer_seq) )
1681  {
1682  const string before = GET_FIELD (sbs, Name);
1683  CPCRPrimerSeq::Clean( GET_MUTABLE(sbs, Name) );
1684  const string& after = GET_FIELD (sbs, Name);
1685  if ( before != after ) {
1687  }
1688  }
1689 
1690  // determine whether we should remove this subsource:
1691  if( (! FIELD_IS_SET(sbs, Name) || GET_FIELD(sbs, Name).empty()) &&
1692  ! CSubSource::NeedsNoText( chs ) )
1693  {
1694  ERASE_SUBSOURCE_ON_BIOSOURCE(it, biosrc);
1696  continue;
1697  } else if( chs == NCBI_SUBSOURCE(plastid_name) &&
1698  STRING_FIELD_MATCH(sbs, Name, s_GenomeToPlastidName(biosrc) ) )
1699  {
1700  ERASE_SUBSOURCE_ON_BIOSOURCE(it, biosrc);
1702  continue;
1703  } else if( prev != SUBSOURCE_ON_BIOSOURCE_Set(biosrc).end() ) {
1704  TSUBSOURCE_SUBTYPE prev_chs = GET_FIELD (**prev, Subtype);
1705  const string &name = GET_FIELD(sbs, Name);
1706  const string &prev_name = GET_FIELD(**prev, Name);
1707 
1708  if ( (chs == prev_chs) &&
1709  ( CSubSource::NeedsNoText(chs) ||
1710  NStr::EqualNocase(prev_name, name) ||
1711  (prev_chs == NCBI_SUBSOURCE(other) &&
1712  NStr::Find(prev_name, name) != NPOS)))
1713  {
1714  ERASE_SUBSOURCE_ON_BIOSOURCE(it, biosrc);
1716  continue;
1717  } else if ( (chs == prev_chs) &&
1718  prev_chs == NCBI_SUBSOURCE(other) &&
1719  NStr::Find (name, prev_name) != NPOS )
1720  {
1721  (**prev).Assign( sbs );
1722  ERASE_SUBSOURCE_ON_BIOSOURCE(it, biosrc);
1724  continue;
1725  }
1726  }
1727 
1728  prev = it;
1729  }
1730  }
1731 
1732  // sort and remove duplicates.
1733  SubSourceListBC(biosrc);
1734 
1735  // PCR Primers
1736  if( FIELD_IS_SET(biosrc, Pcr_primers) ) {
1737  PCRReactionSetBC( GET_MUTABLE(biosrc, Pcr_primers) );
1738  if( GET_FIELD(biosrc, Pcr_primers).Get().empty() ) {
1739  RESET_FIELD(biosrc, Pcr_primers);
1741  }
1742  }
1743 
1744  // correct specific cases of inconsistently applied tildes
1745  if (biosrc.IsSetOrg() && biosrc.GetOrg().IsSetOrgname()) {
1746  auto& orgname = biosrc.SetOrg().SetOrgname();
1747  if (orgname.IsSetMod()) {
1748  auto& mod_set = orgname.SetMod();
1749  for (auto& orgmod_it : mod_set) {
1750  COrgMod & orgmod = *orgmod_it;
1751 
1752  // we're only correcting tildes for the ones of type "other"
1753  if (!FIELD_EQUALS(orgmod, Subtype, NCBI_ORGMOD(other)) ||
1754  !FIELD_IS_SET(orgmod, Subname))
1755  {
1756  continue;
1757  }
1758 
1759  string &subname = GET_MUTABLE(orgmod, Subname);
1761  }
1762  }
1763  }
1764 
1765  EDIT_EACH_SUBSOURCE_ON_BIOSOURCE( subsrc_iter, biosrc ) {
1766  CSubSource &subsrc = **subsrc_iter;
1767 
1768  // we're only correcting tildes for the ones of type "other"
1769  if( ! FIELD_EQUALS(subsrc, Subtype, NCBI_SUBSOURCE(other) ) ||
1770  ! FIELD_IS_SET(subsrc, Name) )
1771  {
1772  continue;
1773  }
1774 
1775  string &name = GET_MUTABLE(subsrc, Name);
1776  s_CorrectTildes(name);
1777  }
1778 
1779  if (biosrc.IsSetOrg()) {
1780  if (biosrc.GetOrg().IsSetOrgname()) {
1781  OrgnameBC(biosrc.SetOrg().SetOrgname(), biosrc.SetOrg());
1782  }
1783  }
1784 
1785  if (biosrc.FixEnvironmentalSample()) {
1787  }
1788  if (biosrc.RemoveNullTerms()) {
1790  }
1791  if (biosrc.FixGenomeForQualifiers()) {
1793  }
1794 
1795  x_PostBiosource(biosrc);
1796  if (biosrc.IsSetOrg()) {
1797  x_PostOrgRef(biosrc.SetOrg());
1798  }
1799 }
1800 
1802 {
1803  if( FIELD_EQUALS(biosrc, Genome, NCBI_GENOME(unknown) ) ) {
1804  RESET_FIELD(biosrc, Genome);
1806  }
1807 
1808  if (BIOSOURCE_HAS_SUBSOURCE (biosrc)) {
1809 
1810  // remove plastid-name subsource if the value is the same as the biosource location
1811  const string &plastid_name = s_GenomeToPlastidName( biosrc );
1812 
1813  bool plasmid_subsource_found = false;
1814  EDIT_EACH_SUBSOURCE_ON_BIOSOURCE (it, biosrc) {
1815  CSubSource& sbs = **it;
1816  TSUBSOURCE_SUBTYPE chs = GET_FIELD (sbs, Subtype);
1817  if (CSubSource::NeedsNoText (chs)) {
1818  if (sbs.IsSetName() && !NStr::IsBlank(sbs.GetName())) {
1819  RESET_FIELD (sbs, Name);
1820  SET_FIELD (sbs, Name, "");
1822  }
1823  } else if (chs == NCBI_SUBSOURCE(plastid_name)) {
1824  // plasTid
1825  if (NStr::EqualNocase (GET_FIELD (sbs, Name), plastid_name)) {
1826  ERASE_SUBSOURCE_ON_BIOSOURCE (it, biosrc);
1828  }
1829  } else if ( chs == NCBI_SUBSOURCE(plasmid_name) ) {
1830  // plasMid
1831  plasmid_subsource_found = true;
1832  }
1833  }
1834 
1835  // set genome to "plasmid" under some conditions
1836  if( plasmid_subsource_found ) {
1837  if( ! FIELD_IS_SET(biosrc, Genome) ||
1838  GET_FIELD(biosrc, Genome) == NCBI_GENOME(unknown) ||
1839  GET_FIELD(biosrc, Genome) == NCBI_GENOME(genomic) )
1840  {
1841  biosrc.SetGenome( NCBI_GENOME(plasmid) );
1843  }
1844  }
1845 
1846  // remove those with no name unless it has a subtype that doesn't need a name.
1847  EDIT_EACH_SUBSOURCE_ON_BIOSOURCE (it, biosrc) {
1848  CSubSource& sbs = **it;
1849  if (FIELD_IS_SET (sbs, Name) && ! GET_FIELD(sbs, Name).empty() ) continue;
1850  TSUBSOURCE_SUBTYPE chs = GET_FIELD (sbs, Subtype);
1851  if (CSubSource::NeedsNoText (chs)) continue;
1852  ERASE_SUBSOURCE_ON_BIOSOURCE (it, biosrc);
1854  }
1855 
1856  // sort and remove duplicates.
1860  }
1861 
1865  }
1866 
1868  }
1869 }
1870 
1871 
1872 
1873 
1874 static bool s_DbtagIsBad (
1875  CDbtag& dbt
1876 )
1877 
1878 {
1879  if (! FIELD_IS_SET (dbt, Db)) return true;
1880  const string& db = GET_FIELD(dbt, Db);
1881  if (NStr::IsBlank (db)) return true;
1882  if( NStr::EqualNocase(db, "PID") ||
1883  NStr::EqualNocase(db, "PIDg") ||
1884  NStr::EqualNocase(db, "NID") ) {
1885  return true;
1886  }
1887 
1888  if (! FIELD_IS_SET( dbt, Tag)) return true;
1889  const CObject_id& oid = GET_FIELD(dbt, Tag);
1890 
1891  if (FIELD_IS (oid, Id)) {
1892  if (GET_FIELD (oid, Id) == 0) return true;
1893  } else if (FIELD_IS (oid, Str)) {
1894  const string& str = GET_FIELD (oid, Str);
1895  if (NStr::IsBlank (str)) return true;
1896  } else return true;
1897 
1898  return false;
1899 }
1900 
1902 {
1905  }
1906 }
1907 
1909 
1910 {
1911  CLEAN_STRING_MEMBER (org, Taxname);
1912  CLEAN_STRING_MEMBER (org, Common);
1913  CLEAN_STRING_LIST (org, Syn);
1914 
1915  if (FIELD_IS_SET (org, Orgname)) {
1916  COrgName& onm = GET_MUTABLE (org, Orgname);
1917  OrgnameBC (onm, org);
1918  }
1919 
1920 
1921  if (ORGREF_HAS_DBXREF (org)) {
1922  vector< CRef< CDbtag > > new_dbtags;
1923  EDIT_EACH_DBXREF_ON_ORGREF (it, org) {
1924  CDbtag& dbt = **it;
1925  DbtagBC(dbt);
1926  x_SplitDbtag(dbt, new_dbtags );
1927  }
1928  if( ! new_dbtags.empty() ) {
1929  copy( new_dbtags.begin(), new_dbtags.end(), back_inserter( org.SetDb() ) );
1931  }
1932  }
1933 }
1934 
1936 {
1937  EDIT_EACH_DBXREF_ON_ORGREF (it, org) {
1938  CDbtag& dbt = **it;
1939  if (s_DbtagIsBad (dbt)) {
1940  ERASE_DBXREF_ON_ORGREF (it, org);
1942  }
1943  }
1944 
1945  // sort/unique db_xrefs
1949  }
1953  }
1954 
1955  // sort/unique syns
1959  }
1963  }
1964 
1965 }
1966 
1967 // is om1 < om2
1968 // to sort subtypes together.
1969 
1970 static bool s_OrgModCompare (
1971  const CRef<COrgMod>& om1,
1972  const CRef<COrgMod>& om2
1973 )
1974 
1975 {
1976  const COrgMod& omd1 = *(om1);
1977  const COrgMod& omd2 = *(om2);
1978 
1979  // subtype comparison
1980  TORGMOD_SUBTYPE subtype1 = GET_FIELD (omd1, Subtype);
1981  TORGMOD_SUBTYPE subtype2 = GET_FIELD (omd2, Subtype);
1982  if (subtype1 < subtype2) return true;
1983  if (subtype1 > subtype2) return false;
1984 
1985  // subname comparison
1986  const string& subname1 = GET_FIELD (omd1, Subname);
1987  const string& subname2 = GET_FIELD (omd2, Subname);
1988  const int subname_comparison = NStr::CompareNocase( subname1, subname2 );
1989  if( subname_comparison < 0 ) {
1990  return true;
1991  } else if( subname_comparison > 0 ) {
1992  return false;
1993  }
1994 
1995  // attrib comparison (realistically, we don't expect to fall back to this)
1996  const string& attrib1 = ( FIELD_IS_SET(omd1, Attrib) ? GET_FIELD (omd1, Attrib) : kEmptyStr );
1997  const string& attrib2 = ( FIELD_IS_SET(omd2, Attrib) ? GET_FIELD (omd2, Attrib) : kEmptyStr );
1998 
1999  const int attrib_comparison = NStr::CompareNocase( attrib1, attrib2 );
2000  if (attrib_comparison < 0) {
2001  return true;
2002  } else {
2003  return false;
2004  }
2005 }
2006 
2007 // Two OrgMod's are equal and duplicates if:
2008 // they have the same subname and same subtype
2009 
2010 static bool s_OrgModEqual (
2011  const CRef<COrgMod>& om1,
2012  const CRef<COrgMod>& om2
2013 )
2014 
2015 {
2016  const COrgMod& omd1 = *(om1);
2017  const COrgMod& omd2 = *(om2);
2018 
2019  const string& subname1 = GET_FIELD (omd1, Subname);
2020  const string& subname2 = GET_FIELD (omd2, Subname);
2021  if (! NStr::EqualNocase (subname1, subname2)) return false;
2022 
2023  const string& attrib1 = ( FIELD_IS_SET(omd1, Attrib) ? GET_FIELD (omd1, Attrib) : kEmptyStr );
2024  const string& attrib2 = ( FIELD_IS_SET(omd2, Attrib) ? GET_FIELD (omd2, Attrib) : kEmptyStr );
2025  if (! NStr::EqualNocase (attrib1, attrib2)) return false;
2026 
2027  TORGMOD_SUBTYPE chs1 = GET_FIELD (omd1, Subtype);
2028  TORGMOD_SUBTYPE chs2 = GET_FIELD (omd2, Subtype);
2029  if (chs1 == chs2) return true;
2030 
2031  return false;
2032 }
2033 
2035  COrgName& onm, COrg_ref &org_ref
2036 )
2037 
2038 {
2039  CLEAN_STRING_MEMBER (onm, Attrib);
2040  CLEAN_STRING_MEMBER (onm, Lineage);
2041  CLEAN_STRING_MEMBER_JUNK (onm, Div);
2042 
2043  EDIT_EACH_ORGMOD_ON_ORGNAME (it, onm) {
2044  COrgMod& omd = **it;
2045  OrgmodBC (omd);
2046  if (! FIELD_IS_SET (omd, Subname) || NStr::IsBlank (GET_FIELD (omd, Subname))) {
2047  ERASE_ORGMOD_ON_ORGNAME (it, onm);
2049  }
2050  }
2051 
2052  // erase structured notes that already match value
2053  // (Note: This is O(N^2). Maybe worth converting to a faster algo?)
2054  EDIT_EACH_ORGMOD_ON_ORGNAME (it, onm) {
2055  COrgMod& omd = **it;
2056  if (omd.GetSubtype() == NCBI_ORGMOD(other)) {
2057  bool do_erase = false;
2058  string val_name, otherval;
2059  NStr::SplitInTwo( omd.GetSubname(), " =:", val_name, otherval );
2060  try {
2061  COrgMod::TSubtype subtype = COrgMod::GetSubtypeValue(val_name);
2062  NStr::TruncateSpacesInPlace(otherval);
2063  FOR_EACH_ORGMOD_ON_ORGNAME (match_it, onm) {
2064  if ((*match_it)->GetSubtype() == subtype
2065  && NStr::EqualCase((*match_it)->GetSubname(), otherval)) {
2066  do_erase = true;
2067  break;
2068  }
2069  }
2070  } catch (CSerialException& ) {
2071  }
2072 
2073  if (do_erase) {
2074  ERASE_ORGMOD_ON_ORGNAME (it, onm);
2076  }
2077  }
2078  }
2079 
2083  }
2084 
2085  // clean Orgmod list
2086  x_OrgnameModBC( onm, GET_STRING_FLD_OR_BLANK(org_ref, Common) );
2087 
2091  }
2092 
2096  }
2097 
2099 }
2100 
2102  string& str
2103 )
2104 
2105 {
2106  // May need to create a custom implementation if this
2107  // regex becomes a bottleneck
2108  return s_RegexpReplace( str, "[ ]*:[ ]*", ":");
2109 }
2110 
2112  COrgMod& omd
2113 )
2114 {
2115  CLEAN_AND_COMPRESS_STRING_MEMBER (omd, Subname);
2116  if (FIELD_IS_SET (omd, Subname)) {
2118  x_RemoveFlankingQuotes( GET_MUTABLE(omd, Subname) );
2119  }
2120 
2121  CLEAN_AND_COMPRESS_STRING_MEMBER (omd, Attrib);
2122 
2123  TORGMOD_SUBTYPE subtype = GET_FIELD (omd, Subtype);
2124 
2125  if( subtype == NCBI_ORGMOD(specimen_voucher) ||
2126  subtype == NCBI_ORGMOD(culture_collection) ||
2127  subtype == NCBI_ORGMOD(bio_material) )
2128  {
2129  if (FIELD_IS_SET (omd, Subname)) {
2130  string &subname = GET_MUTABLE (omd, Subname);
2131  const string::size_type old_len = subname.length();
2133  NStr::ReplaceInPlace( subname, "::", ":", 0, 1 );
2134  if( old_len != subname.length() ) {
2136  }
2137  }
2138  }
2139 
2140  if (omd.RemoveAbbreviation()) {
2142  }
2143 }
2144 
// Returns true when str is non-empty and every character is a decimal digit
// according to isdigit().  The empty string yields false.
bool s_IsAllDigits(const std::string& str)
{
    if (str.empty()) {
        return false;
    }
    for (const char ch : str) {
        // isdigit() has undefined behaviour for negative arguments other
        // than EOF, which a plain char holding a non-ASCII byte can be;
        // convert to unsigned char first.
        if (!isdigit(static_cast<unsigned char>(ch))) {
            return false;
        }
    }
    return true;
}
2159 
2161  CDbtag& dbtag
2162 )
2163 
2164 {
2165  if (! FIELD_IS_SET (dbtag, Db)) return;
2166  if (! FIELD_IS_SET (dbtag, Tag)) return;
2167 
2168  string& db = GET_MUTABLE (dbtag, Db);
2169  if (NStr::IsBlank (db)) return;
2170 
2171  size_t len = db.length();
2173  if (len != db.length()) {
2175  }
2176 
2177  if (dbtag.GetTag().IsStr()) {
2178  if (TrimSpacesSemicolonsAndCommas(dbtag.SetTag().SetStr())) {
2180  }
2181  }
2182 
2183  if (NStr::EqualNocase(db, "Swiss-Prot")
2184  || NStr::EqualNocase (db, "SWISSPROT")
2185  || NStr::EqualNocase (db, "UniProt/Swiss-Prot")) {
2186  db = "UniProtKB/Swiss-Prot";
2188  } else if (NStr::EqualNocase(db, "SPTREMBL") ||
2189  NStr::EqualNocase(db, "TrEMBL") ||
2190  NStr::EqualNocase(db, "UniProt/TrEMBL") ) {
2191  db = "UniProtKB/TrEMBL";
2193  } else if (NStr::EqualNocase(db, "SUBTILIS")) {
2194  db = "SubtiList";
2196  } else if (NStr::EqualNocase(db, "LocusID")) {
2197  db = "GeneID";
2199  } else if (NStr::EqualNocase(db, "MaizeDB")) {
2200  db = "MaizeGDB";
2202  } else if (NStr::EqualNocase(db, "GeneW")) {
2203  db = "HGNC";
2205  } else if (NStr::EqualNocase(db, "MGD")) {
2206  db = "MGI";
2208  } else if (NStr::EqualNocase(db, "IFO")) {
2209  db = "NBRC";
2211  } else if (NStr::EqualNocase(db, "BHB") ||
2212  NStr::EqualNocase(db, "BioHealthBase")) {
2213  db = "IRD";
2215  } else if (NStr::Equal(db, "GENEDB")) {
2216  db = "GeneDB";
2218  } else if (NStr::Equal(db, "cdd")) {
2219  db = "CDD";
2221  } else if (NStr::Equal(db, "FlyBase")) {
2222  db = "FLYBASE";
2224  } else if (NStr::Equal(db, "GreengenesID")) {
2225  db = "Greengenes";
2227  } else if (NStr::Equal(db, "HMPID")) {
2228  db = "HMP";
2230  } else if (NStr::Equal(db, "ATCC (inhost)")) {
2231  db = "ATCC(in host)";
2233  } else if (NStr::Equal(db, "ATCC (dna)")) {
2234  db = "ATCC(dna)";
2236  }
2237 
2238  CObject_id& oid = GET_MUTABLE (dbtag, Tag);
2239 
2240  if (FIELD_IS (oid, Id)) {
2241  const string& db = dbtag.GetDb();
2242  if (NStr::EqualNocase (db, "HGNC") || NStr::EqualNocase (db, "VGNC") || NStr::EqualNocase (db, "MGI") ) {
2243  int val = dbtag.GetTag().GetId();
2244  string str = db + ":" + NStr::IntToString(val);
2245  dbtag.SetTag().SetStr(str);
2247  }
2248  }
2249 
2250  if (! FIELD_IS (oid, Str)) return;
2251 
2252  string& str = GET_MUTABLE(oid, Str);
2253  if (NStr::IsBlank (str)) return;
2254 
2255  db = dbtag.GetDb();
2256  str = dbtag.GetTag().GetStr();
2257  if (NStr::EqualNocase(db, "HPRD") && NStr::StartsWith (str, "HPRD_")) {
2258  dbtag.SetTag().SetStr (str.substr (5));
2260  } else if (NStr::EqualNocase (db, "MGI") ) {
2261  if (!NStr::Equal(db, "MGI")) {
2262  dbtag.SetDb("MGI");
2264  }
2265  if(NStr::StartsWith (str, "MGI:")) {
2266  /*
2267  dbtag.SetTag().SetStr (dbtag.GetTag().GetStr().substr (4));
2268  ChangeMade(CCleanupChange::eChangeDbxrefs);
2269  */
2270  }
2271  else if (NStr::StartsWith(str, "MGD:")) {
2272  dbtag.SetTag().SetStr("MGI:" + dbtag.GetTag().GetStr().substr(4));
2274  } else if (NStr::StartsWith(str, "J:")) {
2275  if (s_IsAllDigits(str.substr(2))) {
2276  dbtag.SetTag().SetStr("MGI:");
2277  }
2278  } else {
2279  string newstr = "MGI:" + str;
2280  dbtag.SetTag().SetStr(newstr);
2282  }
2283  } else if (NStr::EqualNocase (db, "HGNC") ) {
2284  if(! NStr::StartsWith (str, "HGNC:")) {
2285  string newstr = "HGNC:" + str;
2286  dbtag.SetTag().SetStr(newstr);
2288  /*
2289  dbtag.SetTag().SetStr (dbtag.GetTag().GetStr().substr (5));
2290  ChangeMade(CCleanupChange::eChangeDbxrefs);
2291  */
2292  }
2293  } else if (NStr::EqualNocase (db, "VGNC") ) {
2294  if(! NStr::StartsWith (str, "VGNC:")) {
2295  string newstr = "VGNC:" + str;
2296  dbtag.SetTag().SetStr(newstr);
2298  /*
2299  dbtag.SetTag().SetStr (dbtag.GetTag().GetStr().substr (5));
2300  ChangeMade(CCleanupChange::eChangeDbxrefs);
2301  */
2302  }
2303  } else if (NStr::EqualNocase (db, "RGD") ) {
2304  if(NStr::StartsWith (str, "RGD:")) {
2305  dbtag.SetTag().SetStr (str.substr (4));
2307  }
2308  }
2309 
2310  /*
2311  // convert to number if all digits
2312  if (s_IsAllDigits(str) && !NStr::StartsWith(str, "0")) {
2313  try {
2314  // extract the part before the first space for conversion
2315  string::size_type pos_of_first_space = 0;
2316  while (pos_of_first_space < str.length() && !isspace(str[pos_of_first_space])) {
2317  ++pos_of_first_space;
2318  }
2319  CTempString sStrOfNum(str, 0, pos_of_first_space);
2320 
2321  // only convert str to int if it fits into the non-negative side
2322  // of an int.
2323  int value = NStr::StringToInt(sStrOfNum, NStr::fConvErr_NoThrow);
2324  if (value > 0) {
2325  dbtag.SetTag().SetId(NStr::StringToUInt(sStrOfNum));
2326  ChangeMade(CCleanupChange::eChangeDbxrefs);
2327  }
2328  } catch (CStringException&) {
2329  // just leave things as are
2330  }
2331  }
2332  */
2333 }
2334 
2335 
2337  CPubdesc& pubdesc
2338 )
2339 {
2340  if (CCleanupPub::CleanPubdesc(pubdesc, m_StripSerial)) {
2342  }
2343  // need to construct m_PubToNewPubLabelMap separately
2344  if (pubdesc.IsSetPub()) {
2345  for (auto p : pubdesc.SetPub().Set()) {
2346  string new_label;
2347  p->GetLabel(&new_label, CPub::eContent, true);
2348  m_PubToNewPubLabelMap[p] = new_label;
2349  }
2350  }
2351 }
2352 
2353 
2354 typedef pair<string, CRef<CPub> > TCit;
2355 struct TSortCit {
2356  bool operator ()(const TCit& c1, const TCit& c2) const {
2357 
2358  // First, try to compare case-insensitively
2359  // (We compare as if it were all-caps to match C's behavior )
2360  const int label_compare_no_case = s_CompareNoCaseCStyle(c1.first, c2.first);
2361  if( label_compare_no_case != 0 ) {
2362  return (label_compare_no_case < 0);
2363  }
2364 
2365  // if they're the same, try to compare case-sensitively
2366  const int label_compare_case = NStr::CompareCase( c1.first, c2.first );
2367  if( label_compare_case != 0 ) {
2368  return (label_compare_case < 0);
2369  }
2370 
2371  // if they're still the same, fall back on cit-gen titles, if possible
2372  return CitGenTitlesLess(*c1.second, *c2.second);
2373  }
2374  bool CitGenTitlesLess(const CPub& p1, const CPub& p2) const {
2375  if ( ! p1.IsGen() || ! p2.IsGen() ) {
2376  return false;
2377  }
2378  const CCit_gen& g1 = p1.GetGen();
2379  const CCit_gen& g2 = p2.GetGen();
2380  if ( g1.IsSetTitle() != g2.IsSetTitle() ) {
2381  return (g1.IsSetTitle() || g2.IsSetTitle());
2382  } else if( ! g1.IsSetTitle() && ! g2.IsSetTitle() ) {
2383  return false;
2384  }
2385  return g1.GetTitle() < g2.GetTitle();
2386  }
2387 };
2388 
2389 static
2390 bool cmpSortedvsOld(const TCit& e1, const CRef<CPub>& e2) {
2391  return e1.second == e2;
2392 }
2393 
// NOTE(review): the signature line of this definition (listing line 2394) is
// missing from this extract; the body below operates on `pub_set`.
//
// Sorts and de-duplicates the pubs of `pub_set` by inserting each one into a
// set ordered by TSortCit, then rewrites the pub list only when the set
// differs from the list in size or order.
2395 {
2396  // The Pub-set should always be pub. Ignore if not.
2397  if( ! pub_set.IsPub() ) {
2398  return;
2399  }
2400 
2401  // sort and unique by putting everything into a set
2402  // indexed by a label generated for each CPub.
2403  typedef set<TCit, TSortCit> TCitSet;
2404  TCitSet cit_set;
2405  for (auto cit_it : pub_set.GetPub()) {
2406  string label;
    // NOTE(review): listing line 2407 is missing here; presumably it fills
    // `label` from the pub. As shown, `label` is still empty at the insert.
2408  // the following line may fail due to dups
2409  // (that's okay; it lets us automatically remove dups)
2410  cit_set.insert( TCit(label, cit_it) );
2411  }
2412  auto& publist = pub_set.SetPub();
2413  // Has anything been deleted, or has the order changed?
    // (the size test runs first, so equal() is only reached when both
    // sequences have the same length)
2414  if ( cit_set.size() != publist.size() ||
2415  ! equal(cit_set.begin(), cit_set.end(), publist.begin(), cmpSortedvsOld) )
2416  {
2417  // put everything left back into the feature's citation list.
2418  publist.clear();
2419  ITERATE (TCitSet, citset_it, cit_set) {
2420  publist.push_back(citset_it->second);
2421  }
    // NOTE(review): listing line 2422 is missing from this extract.
2423  }
2424 }
2425 
2426 
// NOTE(review): the signature line of this definition (listing line 2427) is
// missing from this extract, as are several single lines inside the body
// (visible as gaps in the listing numbers); the body operates on `feat`.
//
// Cleanup for a feature whose data is an Imp-feat. It runs the string-cleaning
// macros on Key/Loc/Descr, rewrites the "satellite" and "LTR" keys to
// "repeat_region" plus a qualifier, rewrites regulatory subtypes to the
// "regulatory" key plus a regulatory_class qualifier, and converts certain
// keys into Cdregion, RNA or protein feature data.
2428 {
    // Nothing to do unless the feature data is set and is an Imp-feat.
2429  if( ! FIELD_IS_SET_AND_IS(feat, Data, Imp) ) {
2430  return;
2431  }
2432 
2433  CImp_feat &imf = GET_MUTABLE( feat.SetData(), Imp );
2434 
2435  CLEAN_STRING_MEMBER_JUNK(imf, Key);
2436  CLEAN_STRING_MEMBER(imf, Loc);
2437  CLEAN_STRING_MEMBER(imf, Descr);
2438 
2439  if (imf.IsSetKey() && CSeqFeatData::FixImportKey(imf.SetKey())) {
2441  }
2442 
2443  if ( FIELD_IS_SET(imf, Key) ) {
    // NOTE(review): `key` is bound by reference. If GET_FIELD yields a
    // reference to the stored key, the comparisons further down observe
    // any SET_FIELD(imf, Key, ...) done in the branches above them -
    // confirm this is intended.
2444  const CImp_feat::TKey& key = GET_FIELD(imf, Key);
    // "satellite" becomes repeat_region with a /satellite qualifier whose
    // value is taken from the comment, or "satellite" if nothing is found.
2445  if ( key == "satellite" && ! m_IsEmblOrDdbj ) {
2446  SET_FIELD(imf, Key, "repeat_region");
2448 
2449  CRef<CGb_qual> satellite_qual( new CGb_qual );
2450  satellite_qual->SetQual("satellite");
2451  string val;
2452  if( FIELD_IS_SET(feat, Comment) ) {
2453  val = x_ExtractSatelliteFromComment( GET_MUTABLE(feat, Comment) );
2454  }
2455  if( val.empty() ) {
2456  val = "satellite";
2457  }
2458  satellite_qual->SetVal( val );
2459 
2460  feat.SetQual().push_back( satellite_qual );
    // "LTR" becomes repeat_region with /rpt_type=long_terminal_repeat.
2461  } else if ( key == "LTR" ) {
2462  SET_FIELD(imf, Key, "repeat_region");
2464 
2465  CRef<CGb_qual> rpt_type_qual( new CGb_qual );
2466  rpt_type_qual->SetQual( "rpt_type" );
2467  rpt_type_qual->SetVal( "long_terminal_repeat" );
2468 
2469  feat.SetQual().push_back( rpt_type_qual );
2470  }
2471 
    // Regulatory subtypes get the "regulatory" key and a regulatory_class
    // qualifier; a blank class is recorded as "other".
2472  CSeqFeatData::ESubtype subtype = feat.GetData().GetSubtype();
2473  if (CSeqFeatData::IsRegulatory(subtype)) {
2474  string regulatory_class = CSeqFeatData::GetRegulatoryClass(subtype);
2475  SET_FIELD(imf, Key, "regulatory");
2477  CRef<CGb_qual> regulatory_class_qual( new CGb_qual );
2478  regulatory_class_qual->SetQual("regulatory_class");
2479  if (NStr::IsBlank(regulatory_class)) {
2480  regulatory_class_qual->SetVal( "other" );
2481  } else {
2482  regulatory_class_qual->SetVal( regulatory_class );
2483  }
2484  feat.SetQual().push_back( regulatory_class_qual );
2485  }
2486 
    // For repeat_region, add a /satellite qualifier only when a value can
    // be extracted from the comment.
2487  if( key == "repeat_region" && ! m_IsEmblOrDdbj ) {
2488  string val;
2489  if( FIELD_IS_SET(feat, Comment) ) {
2490  val = x_ExtractSatelliteFromComment( GET_MUTABLE(feat, Comment) );
2491  }
2492  if( ! val.empty() ) {
2493  CRef<CGb_qual> satellite_qual( new CGb_qual );
2494  satellite_qual->SetQual("satellite");
2495  satellite_qual->SetVal( val );
2496 
2497  feat.SetQual().push_back( satellite_qual );
2499  }
2500  }
2501 
    // An Imp-feat keyed "CDS" is replaced by real Cdregion data. The early
    // return matters: SetCdregion() replaces the data that `imf` and `key`
    // refer to.
2502  if( key == "CDS" ) {
2503  if( ! m_IsEmblOrDdbj ) {
2504  CRef<CCdregion> new_cdregion( new CCdregion );
2505  // get frame from location
2506  if( ! FIELD_EQUALS( feat, Pseudo, true ) &&
2507  feat.IsSetLocation() &&
2508  CCleanup::SetFrameFromLoc(*new_cdregion, feat.GetLocation(), *m_Scope)) {
2510  }
2512 
2513  CdregionFeatBC( *new_cdregion, feat );
2514  feat.SetData().SetCdregion(*new_cdregion);
2515  return;
2516  }
2517  }
2518  }
2519 
    // A Loc string containing "replace" is turned into a replace qualifier
    // and the Loc field is reset. The key-based RNA/protein conversion
    // below runs only when Loc is not set at all.
2520  if( FIELD_IS_SET(imf, Loc) ) {
2521  if ( NStr::Find(imf.GetLoc(), "replace") != NPOS ) {
2522  x_AddReplaceQual(feat, imf.GetLoc());
2523  RESET_FIELD(imf, Loc);
2525  }
2526  } else if( FIELD_IS_SET(imf, Key) ) {
2527  const string &key = GET_FIELD(imf, Key);
2528 
    // Map RNA-like keys to an RNA-ref type; `unknown` means "not an RNA key".
2529  TRNAREF_TYPE rna_ref_type = NCBI_RNAREF(unknown);
2530  if ( key == "precursor_RNA" ) {
2531  rna_ref_type = NCBI_RNAREF(premsg);
2532  } else if ( key == "mRNA" ) {
2533  rna_ref_type = NCBI_RNAREF(mRNA);
2534  } else if ( key == "tRNA" ) {
2535  rna_ref_type = NCBI_RNAREF(tRNA);
2536  } else if ( key == "rRNA" ) {
2537  rna_ref_type = NCBI_RNAREF(rRNA);
2538  } else if ( key == "snRNA" ) {
2539  rna_ref_type = NCBI_RNAREF(snRNA);
2540  } else if ( key == "scRNA" ) {
2541  rna_ref_type = NCBI_RNAREF(scRNA);
2542  } else if ( key == "snoRNA" ) {
2543  rna_ref_type = NCBI_RNAREF(snoRNA);
2544  } else if ( key == "misc_RNA" ) {
2545  rna_ref_type = NCBI_RNAREF(other);
2546  }
2547  if (rna_ref_type != NCBI_RNAREF(unknown) ) {
    // `key` is not used after SetRna() replaces the feature data.
2548  CRef<CRNA_ref> new_rna_ref( new CRNA_ref );
2549  new_rna_ref->SetType( rna_ref_type );
2550  feat.SetData().SetRna( *new_rna_ref );
2552  x_CleanSeqFeatQuals(feat);
2553  RnaFeatBC(feat.SetData().SetRna(), feat);
2554  } else {
    // Map peptide-like keys to a Prot-ref processing state.
2555  TPROTREF_PROCESSED processed = NCBI_PROTREF(not_set);
2556  if ( key == "proprotein" || key == "preprotein" ) {
2557  processed = NCBI_PROTREF(preprotein);
2558  } else if ( key == "mat_peptide" ) {
2559  processed = NCBI_PROTREF(mature);
2560  } else if ( key == "sig_peptide" ) {
2561  processed = NCBI_PROTREF(signal_peptide);
2562  } else if ( key == "transit_peptide" ) {
2563  processed = NCBI_PROTREF(transit_peptide);
2564  } else if ( key == "propeptide" ) {
2565  processed = NCBI_PROTREF(propeptide);
2566  }
2567  if (processed != NCBI_PROTREF(not_set) || key == "Protein" ) {
    // Convert only when the location's Seq-id resolves to an
    // amino-acid Bioseq in the scope.
2568  const CSeq_id* location_seq_id = ( feat.IsSetLocation() ? feat.GetLocation().GetId() : nullptr );
2569  if( location_seq_id ) {
2570  CBioseq_Handle bioseq_handle = m_Scope->GetBioseqHandle(*location_seq_id);
2571  if ( bioseq_handle && bioseq_handle.IsAa() ) {
2572  CRef<CProt_ref> new_prot_ref( new CProt_ref );
2573  new_prot_ref->SetProcessed( processed );
    // A non-blank comment is moved into the protein name list.
2574  if (feat.IsSetComment() && !NStr::IsBlank(feat.GetComment())) {
2575  new_prot_ref->SetName().push_back(feat.GetComment());
2576  feat.ResetComment();
2577  }
2578  feat.SetData().SetProt( *new_prot_ref );
2580  x_CleanSeqFeatQuals(feat);
2581  }
2582  }
2583  }
2584  }
2585  }
2586 }
2587 
2588 
// Table pairing lower-case site descriptions with CSeqFeatData site
// enumerators. Multi-word names are listed twice, once with spaces and once
// with hyphens.
// NOTE(review): the TSiteElem typedef (listing line 2589) and the lines after
// the table that build the lookup map from it (listing lines 2628-2629) are
// missing from this extract. The entries are in ASCII order, presumably
// because that map requires it - confirm before adding entries.
2590 static const TSiteElem sc_site_map[] = {
2591  { "acetylation", CSeqFeatData::eSite_acetylation },
2592  { "active", CSeqFeatData::eSite_active },
2593  { "amidation", CSeqFeatData::eSite_amidation },
2594  { "binding", CSeqFeatData::eSite_binding },
2595  { "blocked", CSeqFeatData::eSite_blocked },
2596  { "cleavage", CSeqFeatData::eSite_cleavage },
2597  { "dna binding", CSeqFeatData::eSite_dna_binding },
2598  { "dna-binding", CSeqFeatData::eSite_dna_binding },
2599  { "gamma carboxyglutamic acid", CSeqFeatData::eSite_gamma_carboxyglutamic_acid },
2600  { "gamma-carboxyglutamic-acid", CSeqFeatData::eSite_gamma_carboxyglutamic_acid },
2601  { "glycosylation", CSeqFeatData::eSite_glycosylation },
2602  { "hydroxylation", CSeqFeatData::eSite_hydroxylation },
2603  { "inhibit", CSeqFeatData::eSite_inhibit },
2604  { "lipid binding", CSeqFeatData::eSite_lipid_binding },
2605  { "lipid-binding", CSeqFeatData::eSite_lipid_binding },
2606  { "metal binding", CSeqFeatData::eSite_metal_binding },
2607  { "metal-binding", CSeqFeatData::eSite_metal_binding },
2608  { "methylation", CSeqFeatData::eSite_methylation },
    // "modifi" is a deliberate stem rather than a whole word (see the
    // prefix lookup done with s_FindInMapAsPrefix).
2609  { "modifi", CSeqFeatData::eSite_modified },
2610  { "mutagenized", CSeqFeatData::eSite_mutagenized },
2611  { "myristoylation", CSeqFeatData::eSite_myristoylation },
2612  { "nitrosylation", CSeqFeatData::eSite_nitrosylation },
2613  { "np binding", CSeqFeatData::eSite_np_binding },
2614  { "np-binding", CSeqFeatData::eSite_np_binding },
2615  { "oxidative deamination", CSeqFeatData::eSite_oxidative_deamination },
2616  { "oxidative-deamination", CSeqFeatData::eSite_oxidative_deamination },
2617  { "phosphorylation", CSeqFeatData::eSite_phosphorylation },
2618  { "pyrrolidone carboxylic acid", CSeqFeatData::eSite_pyrrolidone_carboxylic_acid },
2619  { "pyrrolidone-carboxylic-acid", CSeqFeatData::eSite_pyrrolidone_carboxylic_acid },
2620  { "signal peptide", CSeqFeatData::eSite_signal_peptide },
2621  { "signal-peptide", CSeqFeatData::eSite_signal_peptide },
2622  { "sulfatation", CSeqFeatData::eSite_sulfatation },
2623  { "transit peptide", CSeqFeatData::eSite_transit_peptide },
2624  { "transit-peptide", CSeqFeatData::eSite_transit_peptide },
2625  { "transmembrane region", CSeqFeatData::eSite_transmembrane_region },
2626  { "transmembrane-region", CSeqFeatData::eSite_transmembrane_region }
2627 };
2630 
// NOTE(review): the signature line of this definition (listing line 2631) is
// missing from this extract; the body operates on `feat`.
//
// When the feature comment starts with one of the site names in sc_SiteMap,
// sets the feature's site to the mapped value and drops the comment if it
// carries nothing beyond the site name.
2632 {
2633  // If site set to "other", try to extract it from the comment
2634  if ( FIELD_IS_SET(feat, Comment) &&
    // NOTE(review): the second half of this condition (listing line 2635,
    // presumably the test that the site is "other") is missing here.
2636  {
2637  // extract if comment starts with any informative possibilities listed in sc_SiteMap
2638  const string& comment = GET_FIELD(feat, Comment);
2639  TSiteMap::const_iterator it = s_FindInMapAsPrefix<TSiteMap>( comment, sc_SiteMap );
2640  if ( it != sc_SiteMap.end() ) {
2641  feat.SetData().SetSite(it->second);
2643  // erase the comment if it contains no further useful info aside from the site
    // i.e. the remainder after the matched name is blank or is " site"
    // (compared case-insensitively)
2644  if (NStr::IsBlank(comment, it->first.length()) ||
2645  NStr::EqualNocase(comment, it->first.length(), NPOS, " site")) {
2646  feat.ResetComment();
2648  }
2649  }
2650  }
2651 }
2652 
// NOTE(review): the signature line of this definition (listing line 2653) is
// missing from this extract, as are several single lines inside the body
// (visible as gaps in the listing numbers); the body operates on `loc`.
//
// Normalizes a Seq-loc according to its variant: cleans intervals, collapses
// a one-element packed-int to a plain interval, adjusts `tl` point fuzz,
// trims Null pieces from both ends of a mix, and finally removes strand
// information from locations on protein sequences.
2654 {
2655  switch (loc.Which()) {
2656  case CSeq_loc::e_Int :
2657  x_SeqIntervalBC( GET_MUTABLE(loc, Int) );
2658  break;
2659  case CSeq_loc::e_Packed_int :
2660  {
2661  CSeq_loc::TPacked_int::Tdata& ints = loc.SetPacked_int().Set();
2662  NON_CONST_ITERATE(CSeq_loc::TPacked_int::Tdata, interval_it, ints) {
2663  x_SeqIntervalBC(**interval_it);
2664  }
    // A packed-int holding a single interval becomes a plain Int. The
    // CRef copy keeps the interval alive while the variant is switched.
2665  if (ints.size() == 1) {
2666  CRef<CSeq_interval> int_ref = ints.front();
2667  loc.SetInt(*int_ref);
2669  }
2670  }
2671  break;
2672  case CSeq_loc::e_Pnt :
2673  {
2674  CSeq_loc::TPnt& pnt = loc.SetPnt();
2675 
    // NOTE(review): the statements inside the strand branches below
    // (listing lines 2678-2679 and 2683-2684) are missing from this
    // extract.
2676  if (pnt.IsSetStrand()) {
2677  if (pnt.GetStrand() == eNa_strand_unknown) {
2680  }
2681  }
2682  else {
2685  }
2686 
2687  // normalize Seq-point fuzz tl to tr and decrement position
    // (skipped at position 0, where the position cannot be decremented)
2688  if (pnt.IsSetFuzz() && pnt.GetFuzz().IsLim() &&
2689  pnt.GetFuzz().GetLim() == CInt_fuzz::eLim_tl) {
2690  TSeqPos pos = pnt.GetPoint();
2691  if (pos > 0) {
2692  pnt.SetFuzz().SetLim(CInt_fuzz::eLim_tr);
2693  pnt.SetPoint(pos - 1);
2695  }
2696  }
2697  }
2698  break;
2699  case CSeq_loc::e_Mix :
2700  {
2701  typedef CSeq_loc::TMix::Tdata TMixList;
2702  // delete Null type Seq-locs from beginning and end of Mix list.
2703 
2704  // deleting from beginning:
2705  TMixList& sl_list = loc.SetMix().Set();
2706  TMixList::iterator sl_it = sl_list.begin();
2707  while (sl_it != sl_list.end()) {
2708  if ((*sl_it)->IsNull()) {
2709  sl_it = sl_list.erase(sl_it);
2711  } else {
2712  break;
2713  }
2714  }
2715 
2716  // deleting from end:
    // Walk backwards to the last non-Null piece, then erase everything
    // after it. The leading Nulls were removed above, so a non-empty
    // list starts with a non-Null piece.
2717  if( sl_list.size() > 0 ) {
2718  sl_it = sl_list.end();
2719  while (sl_it != sl_list.begin()) {
2720  --sl_it;
2721  if ( ! (*sl_it)->IsNull()) {
2722  break;
2723  }
2724  }
2725  ++sl_it;
2726  if (sl_it != sl_list.end()) {
2727  sl_list.erase(sl_it, sl_list.end());
2729  }
2730  }
2731 
    // An empty mix becomes Null; a one-piece mix is replaced by that
    // piece. The CRef copy keeps the piece alive during Assign().
2732  if (sl_list.size() == 0) {
2733  loc.SetNull();
2735  } else if (sl_list.size() == 1) {
2736  CRef<CSeq_loc> only_sl = sl_list.front();
2737  loc.Assign(*only_sl);
2739  }
2740  }
2741  break;
2742  default:
2743  break;
2744  }
2745 
2746  // don't allow strandedness on protein sequences
    // The decision uses the first Seq-id in the location that resolves to
    // a Bioseq in m_Scope; without a scope nothing is changed.
2747  {
2748  CBioseq_Handle bsh;
2749  if (m_Scope) {
2750  ITERATE( CSeq_loc, loc_ci, loc ) {
2751  bsh = m_Scope->GetBioseqHandle(loc_ci.GetSeq_id());
2752  if( bsh ) {
2753  break;
2754  }
2755  }
2756  }
2757  if ( bsh && bsh.IsProtein() && FIELD_IS_SET(loc, Strand) ) {
2758  RESET_FIELD(loc, Strand);
2760  }
2761  }
2762 
2763 }
2764 
// NOTE(review): the signature line of this definition (listing line 2765) is
// missing from this extract; the body operates on `loc`.
//
// Replaces a "whole" Seq-loc by an interval from 0 to length-1 on the same
// Seq-id. Nothing happens without a scope or when the id does not resolve to
// a Bioseq in it.
2766 {
2767  if (loc.IsWhole() && m_Scope) {
2768 
2769  // change the Seq-loc/whole to a Seq-loc/interval which covers the whole sequence.
    // The id is copied first because SetInt() below replaces the
    // variant that currently holds it.
2770  CRef<CSeq_id> id(new CSeq_id());
2771  id->Assign(loc.GetWhole());
2772  CBioseq_Handle bsh;
2773 
    // `id` was allocated just above, so this test looks always true.
2774  if( id ) {
2775  bsh = m_Scope->GetBioseqHandle(*id);
2776  }
2777  if (bsh) {
2778  TSeqPos bs_len = bsh.GetBioseqLength();
2779  auto& interval = loc.SetInt();
2780  interval.SetId(*id);
2781  interval.SetFrom(0);
    // NOTE(review): if TSeqPos is unsigned and the Bioseq length can be
    // 0, `bs_len - 1` wraps around - confirm a zero length is impossible.
2782  interval.SetTo(bs_len - 1);
2784  }
2785  }
2786 }
2787 
// Recursive helper that flattens nested mixes: every piece of `mix_pieces`
// that is neither Null nor a mix is appended to `new_mix_pieces`, and nested
// mixes are expanded in place. Null pieces from the input are dropped; when
// `any_nulls_seen` is true a fresh Null piece is inserted between consecutive
// appended pieces instead.
// NOTE(review): the line carrying the function name and first parameter
// (listing line 2789) is missing from this extract; the recursive call below
// shows the name s_AddSeqLocMix and the first argument `new_mix_pieces`.
2788 static void
2790  CSeq_loc_mix::Tdata & mix_pieces,
2791  bool any_nulls_seen )
2792 {
2793  NON_CONST_ITERATE( CSeq_loc_mix::Tdata, old_mix_iter, mix_pieces ) {
2794  CRef<CSeq_loc> old_piece( *old_mix_iter );
2795  if( old_piece->IsNull() ) {
2796  // ignore
2797  } else if( old_piece->IsMix() ) {
2798  s_AddSeqLocMix( new_mix_pieces, old_piece->SetMix(),
2799  any_nulls_seen );
2800  } else {
    // Separate from the previous piece with a Null, but never start
    // the output with one.
2801  if( any_nulls_seen && ! new_mix_pieces.empty() ) {
2802  CRef<CSeq_loc> null_piece( new CSeq_loc );
2803  null_piece->SetNull();
2804  new_mix_pieces.push_back( null_piece );
2805  }
2806  new_mix_pieces.push_back( old_piece );
2807  }
2808  }
2809 }
2810 
// NOTE(review): the signature line of this definition (listing line 2811) is
// missing from this extract; the body operates on `loc_mix`.
//
// Scans the pieces of a Seq-loc mix once and rebuilds the mix through
// s_AddSeqLocMix when it contains a nested mix, or when it contains Null
// pieces that do not strictly alternate as notnull-null-...-notnull.
2812 {
2813  if( ! loc_mix.IsSet() || loc_mix.Set().empty() ) {
2814  return;
2815  }
2816 
2817  // This function does two things simultaneously:
2818  // It checks for mix-inside-mix and also checks if
2819  // we need to do "NULL-normalization"
2820  bool have_seen_inner_mix = false;
2821  bool any_nulls_seen = false;
2822  bool alternates_not_null_then_null = true;
2823 
2824  CSeq_loc_mix::Tdata & mix_pieces = loc_mix.Set();
2825  if( (mix_pieces.size() % 2) == 0 ) {
2826  // can't do notnull-null-notnull-null-notnull-....-null-notnull
2827  // if we have an even number of items
2828  alternates_not_null_then_null = false;
2829  }
2830 
    // Starting from `true` makes a Null first piece count as a break of
    // the alternation.
2831  bool last_piece_was_null = true;
2832  ITERATE( CSeq_loc_mix::Tdata, outer_mix_iter, mix_pieces ) {
2833  const CSeq_loc &this_piece = **outer_mix_iter;
2834  const bool this_piece_is_null = this_piece.IsNull();
2835 
2836  // see if we've found any NULLs in this loc
2837  if( this_piece_is_null ) {
2838  any_nulls_seen = true;
2839  }
2840 
2841  // see if we break alternation of notnull and null
2842  if( alternates_not_null_then_null ) {
2843  if( this_piece_is_null == last_piece_was_null ) {
2844  // two of the same kind in a row: does not alternate
2845  alternates_not_null_then_null = false;
2846  }
2847  }
2848 
2849  // see if there's a nested mix in here
2850  if( this_piece.IsMix() ) {
2851  have_seen_inner_mix = true;
2852  alternates_not_null_then_null = false; // mix breaks alternation
2853  // We have to check if the inner-mix contains any NULLs
2854  if( ! any_nulls_seen ) {
2855  CSeq_loc_CI inner_ci( this_piece, CSeq_loc_CI::eEmpty_Allow );
2856  for( ; inner_ci; ++inner_ci ) {
2857  if( inner_ci.IsEmpty() ) {
2858  any_nulls_seen = true;
2859  }
2860  }
2861  }
2862  }
2863 
2864  // for next iteration
2865  last_piece_was_null = this_piece_is_null;
2866  }
2867 
2868  // we've examined the location, so if there are any problems, we have
2869  // to rebuild it.
2870  if( have_seen_inner_mix ||
2871  (any_nulls_seen && ! alternates_not_null_then_null) )
2872  {
2873  CSeq_loc_mix new_mix;
2874  CSeq_loc_mix::Tdata & new_mix_pieces = new_mix.Set();
2875 
2876  // has to be in a separate function because it's recursive
2877  s_AddSeqLocMix( new_mix_pieces, mix_pieces, any_nulls_seen );
2878 
2879  // swap is faster than assignment
2880  loc_mix.Set().swap( new_mix_pieces );
2881  }
2882 }
2883 
// Returns true when `str` holds nothing but quote characters (" or '),
// blanks and control characters; an empty string therefore also yields true.
static bool s_IsJustQuotes (const string& str)
{
    for (const char ch : str) {
        // Compare as unsigned char: where plain char is signed, bytes
        // >= 0x80 (e.g. UTF-8 text) are negative and would otherwise be
        // mistaken for control characters, making a value that consists
        // only of non-ASCII text look like it is "just quotes".
        const unsigned char uch = static_cast<unsigned char>(ch);
        if (uch > ' ' && ch != '"' && ch != '\'') {
            return false;
        }
    }
    return true;
}
2893 
// NOTE(review): the first line of this definition (listing line 2894, with
// the return type and function name) is missing from this extract, and so
// are many single lines of the body (visible as gaps in the listing
// numbers). As extracted, the braces of the else-if chain near listing lines
// 2954-2958 do not balance.
//
// Cleanup of one GenBank qualifier `gbq`: makes sure both Qual and Val are
// set (possibly to the empty string), blanks values that consist only of
// quotes, and applies per-qualifier normalization chosen by the qualifier
// name (compared case-insensitively).
2895  CGb_qual& gbq
2896 )
2897 
2898 {
2899  CLEAN_STRING_MEMBER (gbq, Qual);
2900  if (! FIELD_IS_SET (gbq, Qual)) {
2901  SET_FIELD (gbq, Qual, kEmptyStr);
2903  }
2904 
2905  if (FIELD_IS_SET (gbq, Val)) {
    // The length is remembered to detect whether the cleaning below
    // changed the value.
2906  const string::size_type old_length = gbq.GetVal().length();
2907  CleanVisString (gbq.SetVal());
2909  x_CompressSpaces( gbq.SetVal() );
2910  if (gbq.GetVal().length() != old_length) {
2912  }
2913  }
2914  if (FIELD_IS_SET (gbq, Val) && s_IsJustQuotes (GET_FIELD (gbq, Val))) {
2915  SET_FIELD (gbq, Val, kEmptyStr);
2917  }
2918  if (! FIELD_IS_SET (gbq, Val)) {
2919  SET_FIELD (gbq, Val, kEmptyStr);
2921  }
2922 
2923  _ASSERT (FIELD_IS_SET (gbq, Qual) && FIELD_IS_SET (gbq, Val));
2924 
    // rpt_unit_seq and rpt_unit_range are swapped for one another when
    // the value does not match the qualifier name.
2925  if (NStr::EqualNocase(gbq.GetQual(), "rpt_unit_seq")) {
2926  if (x_IsBaseRange(gbq.GetVal())) {
2927  gbq.SetQual("rpt_unit_range");
2930  } else if (CGb_qual::CleanupRptUnitSeq(gbq.SetVal())) {
2932  }
2933  x_CleanupRptUnit(gbq);
2934  } else if (NStr::EqualNocase(gbq.GetQual(), "rpt_unit_range")) {
2935  if (! x_IsBaseRange(gbq.GetVal())) {
2936  gbq.SetQual("rpt_unit_seq");
2939  } else if (CGb_qual::CleanupRptUnitRange(gbq.SetVal())) {
2941  }
2942  } else if (NStr::EqualNocase(gbq.GetQual(), "rpt_unit")) {
2943  if (x_CleanupRptUnit(gbq)) {
2945  }
2946  } else if (NStr::EqualNocase(gbq.GetQual(), "replace")) {
2947  if (CGb_qual::CleanupReplace(gbq.SetVal())) {
2949  }
2950  } else if (NStr::EqualNocase(gbq.GetQual(), "repeat_type")) {
2951  if (CGb_qual::FixRptTypeValue(gbq.SetVal())) {
2953  }
2954  } else if (NStr::EqualNocase(gbq.GetQual(), "regulatory_class")) {
2957  }
2958  } else if (NStr::EqualNocase(gbq.GetQual(), "pseudogene")) {
2959  if (CGb_qual::FixPseudogeneValue(gbq.SetVal())) {
2961  }
2962  }
2963 
2964 
2967 
    // "mobile_element" is renamed to "mobile_element_type".
2968  if (NStr::EqualNocase(GET_FIELD(gbq, Qual), "mobile_element")) {
2969  SET_FIELD( gbq, Qual, "mobile_element_type" );
2971  }
2972  if (NStr::EqualNocase(gbq.GetQual(), "mobile_element_type") &&
2973  gbq.IsSetVal() &&
2976  }
2977 }
2978 
2979 static
2980 const char *s_FindImpFeatType( const CImp_feat &imp )
2981 {
2982  // keep sorted in ASCII-betical order
2983  static const char *allowed_types[] = {
2984  "-10_signal", "-35_signal", "3'UTR", "3'clip", "5'UTR",
2985  "5'clip", "CAAT_signal", "CDS", "C_region", "D-loop",
2986  "D_segment", "GC_signal", "Import", "J_segment", "LTR",
2987  "N_region", "RBS", "STS", "S_region", "Site-ref",
2988  "TATA_signal", "V_region", "V_segment", "allele", "attenuator",
2989  "centromere", "conflict", "enhancer", "exon", "gap",
2990  "iDNA", "intron", "mat_peptide", "misc_RNA", "misc_binding",
2991  "misc_difference","misc_feature", "misc_recomb", "misc_signal", "misc_structure",
2992  "mobile_element", "modified_base","mutation", "old_sequence", "operon",
2993  "oriT", "polyA_signal", "polyA_site", "precursor_RNA", "prim_transcript",
2994  "primer_bind", "promoter", "protein_bind", "regulatory", "rep_origin",
2995  "repeat_region", "repeat_unit", "satellite", "sig_peptide", "source",
2996  "stem_loop", "telomere", "terminator", "transit_peptide","unsure",
2997  "variation", "virion"
2998  };
2999  static const int kAllowedTypesNumElems = ( sizeof(allowed_types) / sizeof(allowed_types[0]));
3000 
3001  static const char *kFeatBad = "???";
3002 
3003  if( ! RAW_FIELD_IS_EMPTY_OR_UNSET(imp, Key) ) {
3004  // the C logic is more complex than this
3005  const char *key = GET_FIELD(imp, Key).c_str();
3006  if( binary_search( allowed_types, allowed_types + kAllowedTypesNumElems,
3007  key, PCase_CStr() ) )
3008  {
3009  return key;
3010  }
3011  }
3012 
3013  return kFeatBad;
3014 }
3015 
3016 static
3017 const char *s_FindKeyFromFeatDefType( const CSeq_feat &feat )
3018 {
3019  static const char *kFeatBad = "???";
3020  const CSeqFeatData& fdata = feat.GetData();
3021 
3022  switch (fdata.Which()) {
3023  case NCBI_SEQFEAT(Gene):
3024  return "Gene";
3025  case NCBI_SEQFEAT(Org):
3026  return "Org";
3027  case NCBI_SEQFEAT(Cdregion):
3028  return "CDS";
3029  case NCBI_SEQFEAT(Prot):
3030  if(fdata.GetProt().IsSetProcessed() ) {
3031  switch( feat.GetData().GetProt().GetProcessed() ) {
3032  case NCBI_PROTREF(not_set):
3033  return "Protein";
3034  case NCBI_PROTREF(preprotein):
3035  return "proprotein";
3036  case NCBI_PROTREF(mature):
3037  return "mat_peptide";
3038  case NCBI_PROTREF(signal_peptide):
3039  return "sig_peptide";
3040  case NCBI_PROTREF(transit_peptide):
3041  return "transit_peptide";
3042  case NCBI_PROTREF(propeptide):
3043  return "propeptide";
3044  default:
3045  return kFeatBad;
3046  }
3047  }
3048  return "Protein";
3049  case NCBI_SEQFEAT(Rna):
3050  if(fdata.GetRna().IsSetType() ) {
3051  const auto& rna = fdata.GetRna();
3052  switch (rna.GetType() )
3053  {
3054  case NCBI_RNAREF(unknown):
3055  return "misc_RNA"; // unknownrna mapped to otherrna
3056  case NCBI_RNAREF(premsg):
3057  return "precursor_RNA";
3058  case NCBI_RNAREF(mRNA):
3059  return "mRNA";
3060  case NCBI_RNAREF(tRNA):
3061  return "tRNA";
3062  case NCBI_RNAREF(rRNA):
3063  return "rRNA";
3064  case NCBI_RNAREF(snRNA):
3065  return "snRNA";
3066  case NCBI_RNAREF(scRNA):
3067  return "scRNA";
3068  case NCBI_RNAREF(snoRNA):
3069  return "snoRNA";
3070  case NCBI_RNAREF(ncRNA):
3071  return "ncRNA";
3072  case NCBI_RNAREF(tmRNA):
3073  return "tmRNA";
3074  case NCBI_RNAREF(miscRNA):
3075  return "misc_RNA";
3076  case NCBI_RNAREF(other):
3077  if ( FIELD_IS_SET_AND_IS(rna, Ext, Name) ) {
3078  const string &name = rna.GetExt().GetName();
3079  if ( NStr::EqualNocase(name, "misc_RNA")) return "misc_RNA";
3080  if ( NStr::EqualNocase(name, "ncRNA") ) return "ncRNA";
3081  if ( NStr::EqualNocase(name, "tmRNA") ) return "tmRNA";
3082  }
3083  return "misc_RNA";
3084  default:
3085  return kFeatBad;
3086  }
3087  }
3088  return kFeatBad;
3089  case NCBI_SEQFEAT(Pub):
3090  return "Cit";
3091  case NCBI_SEQFEAT(Seq):
3092  return "Xref";
3093  case NCBI_SEQFEAT(Imp):
3094  return s_FindImpFeatType( fdata.GetImp() );
3095  case NCBI_SEQFEAT(Region):
3096  return "Region";
3097  case NCBI_SEQFEAT(Comment):
3098  return "Comment";
3099  case NCBI_SEQFEAT(Bond):
3100  return "Bond";
3101  case NCBI_SEQFEAT(Site):
3102  return "Site";
3103  case NCBI_SEQFEAT(Rsite):
3104  return "Rsite";
3105  case NCBI_SEQFEAT(User):
3106  return "User";
3107  case NCBI_SEQFEAT(Txinit):
3108  return "TxInit";
3109  case NCBI_SEQFEAT(Num):
3110  return "Num";
3111  case NCBI_SEQFEAT(Psec_str):
3112  return "SecStr";
3113  case NCBI_SEQFEAT(Non_std_residue):
3114  return "NonStdRes";
3115  case NCBI_SEQFEAT(Het):
3116  return "Het";
3117  case NCBI_SEQFEAT(Biosrc):
3118  return "Src";
3119  case NCBI_SEQFEAT(Clone):
3120  return "CloneRef";
3121  case NCBI_SEQFEAT(Variation):
3122  return "VariationRef";
3123  default:
3124  return kFeatBad;
3125  }
3126  return kFeatBad;
3127 }
3128 
3129 
3130 static bool SetExceptFromGbqual(const CGb_qual& gb_qual, CSeq_feat& feat)
3131 {
3132  bool rval = false;
3133  if (!feat.IsSetExcept() || !feat.GetExcept()) {
3134  feat.SetExcept(true);
3135  rval = true;
3136  }
3137 
3138  if (!gb_qual.IsSetQual()) {
3139  return rval;
3140  }
3141  if (feat.IsSetExcept_text() && !NStr::IsBlank(feat.GetExcept_text())) {
3142  return rval;
3143  }
3144  // for whatever reason, C Toolkit only sets text if Gbqual was blank
3145  if (gb_qual.IsSetVal() && !NStr::IsBlank(gb_qual.GetVal())) {
3146  return rval;
3147  }
3148  string exc = gb_qual.GetQual();
3149  NStr::ReplaceInPlace (exc, "-", " ");
3150  NStr::ReplaceInPlace (exc, "_", " ");
3151  feat.SetExcept_text(exc);
3152  return true;
3153 }
3154 
3155 
3156 static bool s_StringsAreEquivalent(const string& str1, const string& str2)
3157 {
3158  string s1 = NStr::Replace(str1, " ", "_");
3159  NStr::ReplaceInPlace(s1, "-", "_");
3160  string s2 = NStr::Replace(str2, " ", "_");
3161  NStr::ReplaceInPlace(s2, "-", "_");
3162  return NStr::EqualNocase(s1, s2);
3163 }
3164 
3165 
// NOTE(review): the signature line of this definition (listing line 3166) is
// missing from this extract, as are many single lines inside the body
// (visible as gaps in the listing numbers); the body operates on `gb_qual`
// and `feat`.
//
// Examines one GenBank qualifier of a feature. Qualifiers that duplicate a
// proper feature field (partial, exception, note, db_xref, pseudo, gene, ...)
// are folded into that field and eAction_Erase is returned so the qualifier
// gets deleted; otherwise the qualifier may be normalized in place and
// eAction_Nothing is returned. Qualifier names are compared
// case-insensitively, except "satellite".
3167 {
3168  if( ! FIELD_IS_SET(feat, Data) ) {
3169  return eAction_Nothing;
3170  }
3171  CSeqFeatData &data = GET_MUTABLE(feat, Data);
3172 
    // Mutable references: several branches below rewrite the qualifier's
    // name or value in place.
3173  string& qual = GET_MUTABLE(gb_qual, Qual);
3174  string& val = GET_MUTABLE(gb_qual, Val);
3175 
    // Pseudo and Partial set to false are reset, independently of the
    // qualifier being examined.
3176  if( FIELD_EQUALS(feat, Pseudo, false) ) {
3177  RESET_FIELD(feat, Pseudo);
3179  }
3180 
3181  if( FIELD_EQUALS(feat, Partial, false) ) {
3182  RESET_FIELD(feat, Partial);
3184  }
3185 
3186  if (NStr::EqualNocase(qual, "cons_splice")) {
3187  return eAction_Erase;
3188  } else if (s_StringsAreEquivalent(qual, "ribosomal-slippage") ||
3189  s_StringsAreEquivalent(qual, "trans-splicing") ||
3190  s_StringsAreEquivalent(qual, "artificial-location")) {
3191  if (SetExceptFromGbqual(gb_qual, feat)) {
3193  }
3194  return eAction_Erase;
3195  } else if (NStr::EqualNocase(qual, "partial")) {
3196  feat.SetPartial(true);
3198  return eAction_Erase; // mark qual for deletion
3199  } else if (NStr::EqualNocase(qual, "evidence")) {
3200  return eAction_Erase; // mark qual for deletion
3201  } else if (NStr::EqualNocase(qual, "exception")) {
3202  if( ! FIELD_EQUALS(feat, Except, true ) ) {
3203  SET_FIELD(feat, Except, true);
3205  }
    // A value other than blank or "true" becomes the exception text,
    // unless the feature already has one.
3206  if (!NStr::IsBlank(val) && !NStr::EqualNocase(val, "true")) {
3207  if (!feat.IsSetExcept_text()) {
3208  feat.SetExcept_text(val);
3210  }
3211  }
3212  return eAction_Erase; // mark qual for deletion
3213  } else if (NStr::EqualNocase(qual, "experiment")) {
3214  if (NStr::EqualNocase(val, "experimental evidence, no additional details recorded")) {
3216  return eAction_Erase; // mark qual for deletion
3217  }
3218  } else if (NStr::EqualNocase(qual, "inference")) {
3219  if (NStr::EqualNocase(val, "non-experimental evidence, no additional details recorded")) {
3221  return eAction_Erase; // mark qual for deletion
3222  } else {
3224  }
3225  } else if (NStr::EqualNocase(qual, "note") ||
3226  NStr::EqualNocase(qual, "notes") ||
3227  NStr::EqualNocase(qual, "comment")) {
    // Note-like qualifiers are appended to the feature comment,
    // separated by "; ".
3228  if (!feat.IsSetComment()) {
3229  feat.SetComment(val);
3230  } else {
3231  (feat.SetComment() += "; ") += val;
3232  }
3235  return eAction_Erase; // mark qual for deletion
3236  } else if( NStr::EqualNocase(qual, "label") ) {
    // NOTE(review): listing line 3237 (the `if` opening the chain that
    // the comment and `else if` below belong to) is missing here.
3238  // skip label that is simply the feature key
3239  } else if ( ! FIELD_IS_SET(feat, Comment) || NStr::FindNoCase(GET_FIELD(feat, Comment), "label") == NPOS) {
3240  // if label is not already in comment, append
3241  if( GET_STRING_FLD_OR_BLANK(feat, Comment).empty() ) {
3242  SET_FIELD(feat, Comment, "label: " + val );
3243  } else {
3244  GET_MUTABLE(feat, Comment) += "; label: " + val;
3245  }
3247  }
3248  return eAction_Erase;
3249  } else if (NStr::EqualNocase(qual, "regulatory_class")) {
    // Unless the value starts with "other:", the text after the first
    // colon is moved to the feature comment and the value is cut at the
    // colon.
3250  string::size_type colon_pos = val.find_first_of(":");
3251  if (colon_pos != string::npos && ! NStr::StartsWith (val, "other:")) {
3252  string comment = val.substr( colon_pos + 1 );
3253  val.resize( colon_pos );
3254  if( GET_STRING_FLD_OR_BLANK(feat, Comment).empty() ) {
3255  SET_FIELD(feat, Comment, comment );
3256  } else {
3257  GET_MUTABLE(feat, Comment) += "; " + comment;
3258  }
3260  }
3261  } else if (NStr::EqualNocase(qual, "db_xref")) {
    // "db:tag" values become a Dbtag on the feature; values that
    // SplitInTwo cannot split are left as a qualifier.
3262  string tag, db;
3263  if (NStr::SplitInTwo(val, ":", db, tag)) {
3264  CRef<CDbtag> dbp(new CDbtag);
3265  dbp->SetDb(db);
3266  dbp->SetTag().SetStr(tag);
3267 
3268  feat.SetDbxref().push_back(dbp);
3270  return eAction_Erase; // mark qual for deletion
3271  }
3272  } else if (NStr::EqualNocase(qual, "gdb_xref")) {
3273  CRef<CDbtag> dbp(new CDbtag);
3274  dbp->SetDb("GDB");
3275  dbp->SetTag().SetStr(val);
3276  feat.SetDbxref().push_back(dbp);
3278  return eAction_Erase; // mark qual for deletion
3279  } else if ( NStr::EqualNocase(qual, "pseudo") ) {
3280  feat.SetPseudo(true);
3282  return eAction_Erase; // mark qual for deletion
3283  } else if ( NStr::EqualNocase(qual, "pseudogene") )
3284  {
    // pseudogene marks the feature pseudo; the qualifier itself is
    // kept, with its value lower-cased.
3285  if( ! FIELD_EQUALS(feat, Pseudo, true) ) {
3286  feat.SetPseudo(true);
3288  }
3289 
3290  // lowercase pseudogene qual
3291  string new_val = val;
3292  NStr::ToLower(new_val);
3293  if( new_val != val ) {
3294  val = new_val;
3296  }
    // The next four branches hand the qualifier to the cleanup routine
    // for the feature's data type; each is called only if no earlier
    // branch matched.
3297  } else if ( FIELD_IS(data, Gene) && x_GeneGBQualBC( GET_MUTABLE(data, Gene), gb_qual) == eAction_Erase) {
3298  return eAction_Erase; // mark qual for deletion
3299  } else if ( FIELD_IS(data, Cdregion) && x_SeqFeatCDSGBQualBC(feat, GET_MUTABLE(data, Cdregion), gb_qual) == eAction_Erase ) {
3300  return eAction_Erase; // mark qual for deletion
3301  } else if (data.IsRna() && x_SeqFeatRnaGBQualBC(feat, data.SetRna(), gb_qual) == eAction_Erase) {
3302  return eAction_Erase; // mark qual for deletion
3303  } else if (data.IsProt() && x_ProtGBQualBC(data.SetProt(), gb_qual, eGBQualOpt_normal) == eAction_Erase) {
3304  return eAction_Erase; // mark qual for deletion
3305  } else if (NStr::EqualNocase(qual, "gene")) {
    // A non-blank gene qualifier becomes a gene xref, inserted at the
    // front of the feature's xref list.
3306  if (!NStr::IsBlank(val)) {
3307  CRef<CSeqFeatXref> xref(new CSeqFeatXref);
3308  xref->SetData().SetGene().SetLocus(val);
3309  feat.SetXref().insert(feat.SetXref().begin(), xref);
3311  return eAction_Erase; // mark qual for deletion
3312  }
3313  } else if (NStr::EqualNocase(qual, "codon_start")) {
3314  if (!data.IsCdregion()) {
3315  // not legal on anything but CDS, so remove it
3316  return eAction_Erase; // mark qual for deletion
3317  }
3318  } else if ( NStr::EqualNocase(qual, "EC_number") ) {
    // NOTE(review): this is the only case-sensitive name comparison in
    // the chain - confirm it is intended.
3320  } else if( qual == "satellite" ) {
3322  } else if ( NStr::EqualNocase(qual, "replace") && data.GetSubtype() == CSeqFeatData::eSubtype_variation) {
3323  string orig = val;
3324  NStr::ToLower(val);
3325  if (!NStr::Equal(orig, val)) {
3327  }
3328  }
3329  else if (NStr::EqualNocase(qual, "recombination_class")) {
3332  }
3333  }
3334 
3335 
3336  if( NStr::EqualNocase( qual, "mobile_element_type" ) ) {
3337  // trim spaces around first colon but only if there are no colons
3338  // with spaces before and after
3339  if( NPOS != NStr::Find(val, " :") || NPOS != NStr::Find(val, ": ") ) {
3340  if( s_RegexpReplace( val, "[ ]*:[ ]*", ":", 1 ) ) {
3342  }
3343  }
3344 
    // A repeat_region import feature carrying a non-empty
    // mobile_element_type is re-keyed as mobile_element.
3345  if( data.IsImp() && STRING_FIELD_MATCH( data.GetImp(), Key, "repeat_region" ) && ! val.empty() ) {
3346  qual = "mobile_element_type";
3347  data.SetImp().SetKey( "mobile_element" );
3349  }
3350  }
3351 
3352  // estimated_length must be a number or "unknown"
3353  if( NStr::EqualNocase( qual, "estimated_length" ) ) {
3354  if( ! s_IsAllDigits(val) && ! NStr::EqualNocase(val, "unknown") ) {
3355  val = "unknown";
3357  }
3358  }
3359 
3360  // conflict is obsolete. Make it misc_difference, but add a note
3361  // to the feature comment as to what it used to be.
3362  if( data.IsImp() && STRING_FIELD_MATCH( data.GetImp(), Key, "conflict" ) ) {
3363  data.SetImp().SetKey( "misc_difference");
3364  if( feat.IsSetComment() ) {
3365  GET_MUTABLE(feat, Comment) = "conflict; " + GET_FIELD(feat, Comment);
3366  } else {
3367  SET_FIELD(feat, Comment, "conflict");
3368  }
3370  }
3371 
    // A qualifier with neither a name nor a value is dropped.
3372  if( qual.empty() && val.empty() ) {
3373  return eAction_Erase;
3374  }
3375 
3376  return eAction_Nothing;
3377 }
3378 
3379 bool CNewCleanup_imp::x_IsDotBaseRange(const string& val)
3380 {
3381  size_t pos = NStr::Find(val, "..");
3382  if (string::npos == pos) {
3383  return false;
3384  }
3385  try {
3386  long start = NStr::StringToLong(val.substr(0, pos));
3387  long stop = NStr::StringToLong(val.substr(pos + 2));
3388  if (start < 1 || stop < 1) {
3389  return false;
3390  }
3391  } catch (...) {
3392  return false;
3393  }
3394  return true;
3395 }
3396 
3397 
// NOTE(review): the signature line of this definition (listing line 3398) is
// missing from this extract; the body operates on the string `val`.
//
// Returns true when `val` has the form "<start>-<stop>" in which both parts
// (split at the first '-') parse as integers that are at least 1.
3399 {
3400  size_t pos = NStr::Find(val, "-");
3401  if (string::npos == pos) {
3402  return false;
3403  }
    // Any exception raised while parsing either part is treated as
    // "not a range".
3404  try {
3405  long start = NStr::StringToLong(val.substr(0, pos));
3406  long stop = NStr::StringToLong(val.substr(pos + 1));
3407  if (start < 1 || stop < 1) {
3408  return false;
3409  }
3410  } catch (...) {
3411  return false;
3412  }
3413  return true;
3414 }
3415 
3416 
3417 bool CNewCleanup_imp::x_IsBaseRange(const string& val)
3418 {
3419  if (val.length() > 25) {
3420  return false;
3421  }
3422  if (x_IsDotBaseRange(val)) {
3423  return true;
3424  } else if (x_IsHyphenBaseRange(val)) {
3425  return true;
3426  } else {
3427  return false;
3428  }
3429 }
3430 
3431 
// NOTE(review): the signature line of this definition (listing line 3432) and
// two body lines (listing lines 3434 and 3442) are missing from this extract;
// the body operates on the qualifier `gbq`.
//
// Renames the qualifier according to its value: a value that is a base range
// makes it "rpt_unit_range" (a hyphen range is rewritten with ".."), anything
// else makes it "rpt_unit_seq". As extracted, every path returns true.
3433 {
3435  if (x_IsBaseRange(gbq.GetVal())) {
3436  gbq.SetQual("rpt_unit_range");
3437  if (x_IsHyphenBaseRange(gbq.GetVal())) {
    // ReplaceInPlace rewrites every '-' in the value, not just one.
3438  NStr::ReplaceInPlace(gbq.SetVal(), "-", "..");
3439  }
3440  } else {
3441  gbq.SetQual("rpt_unit_seq");
3443  }
3444  return true;
3445 }
3446 
// NOTE(review): the signature line of this definition (listing line 3447) is
// missing from this extract; the body operates on the qualifier `gbq`.
3448 //
3449 // As of Dec 2006, "transposon" is no longer legal as a qualifier. The replacement
3450 // qualifier is "mobile_element". In addition, the value has to be massaged to
3451 // indicate "integron" or "transposon".
3452 //
// Example taken from the code below: the value "class I integron" becomes
// "integron: class I"; any value not in the integron list gets the prefix
// "transposon: ".
3453 {
    // Values recognised as integrons; matched exactly (case-sensitively)
    // by std::find below.
3454  static const string integronValues[] = {
3455  "class I integron",
3456  "class II integron",
3457  "class III integron",
3458  "class 1 integron",
3459  "class 2 integron",
3460  "class 3 integron"
3461  };
    // One-past-the-end pointer of the table above.
3462  static const string* endIntegronValues
3463  = integronValues + sizeof(integronValues)/sizeof(*integronValues);
3464 
3465  if (NStr::EqualNocase( GET_FIELD(gbq, Qual), "transposon")) {
3466  SET_FIELD( gbq, Qual, "mobile_element");
3467 
3468  // If the value is one of the IntegronValues, change it to "integron: class XXX":
3469  const string* pValue = std::find(integronValues, endIntegronValues, GET_FIELD(gbq, Val) );
3470  if ( pValue != endIntegronValues ) {
3471  string::size_type cutoff = pValue->find( " integron" );
3472  _ASSERT( cutoff != string::npos ); // typo in IntegronValues?
3473  SET_FIELD( gbq, Val, string("integron: ") + pValue->substr(0, cutoff) );
3474  }
3475  // Otherwise, just prefix it with "transposon: ":
3476  else {
3477  SET_FIELD( gbq, Val, string("transposon: ") + GET_FIELD(gbq, Val) );
3478  }
3479 
3481  }
3482 }
3483 
3485 //
3486 // As of Dec 2006, "insertion_seq" is no longer legal as a qualifier. The replacement
3487 // qualifier is "mobile_element". In addition, the value has to be massaged to
3488 // reflect the "insertion_seq".
3489 //
// Only acts when the qualifier name equals "insertion_seq" (case-insensitive).
// NOTE(review): the signature line and one interior line are absent from this
// listing (gaps in the embedded numbering).
3490 {
3491  if (NStr::EqualNocase( GET_FIELD(gbq, Qual), "insertion_seq")) {
3492  gbq.SetQual("mobile_element");
// NOTE(review): the prefix has no space after the colon, unlike the
// "transposon: " / "integron: " prefixes used for transposon quals - confirm intended.
3493  gbq.SetVal( string("insertion sequence:") + GET_FIELD(gbq, Val) );
3495  }
3496 }
3497 
// NOTE(review): the first line of the signature is absent from this listing;
// other code in this file calls this function as s_IsCompoundRptTypeValue(val).
3499  const string& value )
3500 //
3501 // Format of compound rpt_type values: (value[,value]*)
3502 //
3503 // These are internal to sequin and are in theory cleaned up before the material
3504 // is released. However, some compound values have escaped into the wild and have
3505 // not been retro-fixed yet (as of 2006-03-17).
3506 //
// Returns true when value is at least 3 characters, starts with "(", ends
// with ")", contains no further "(" and has no character after its first ")".
3507 {
3508  if (NStr::IsBlank(value) || value.length() < 3 ||
3509  !NStr::StartsWith(value, "(") || !NStr::EndsWith(value, ")")) {
3510  return false;
3511  }
3512 
3513  bool last_char_was_close_paren = false;
3514  string::const_iterator s = value.begin();
// skip the opening paren already verified above
3515  ++s;
3516  while (s != value.end()) {
3517  if (*s == '(') {
// nested / second opening paren is not allowed
3518  return false;
3519  } else if (last_char_was_close_paren) {
// something follows a ")" - the close paren was not the final character
3520  return false;
3521  } else if (*s == ')') {
3522  last_char_was_close_paren = true;
3523  }
3524  ++s;
3525  }
3526  return true;
3527 }
3528 
// NOTE(review): the line carrying the return type and function name is absent
// from this listing; callers in this file invoke it as
// s_ExpandThisQual( quals, it, new_quals ).
3529 static
3531  CSeq_feat::TQual& quals, // the list of CGb_qual's.
3532  CSeq_feat::TQual::iterator& it, // points to the one qual we might expand.
3533  CSeq_feat::TQual& new_quals ) // new quals that will need to be inserted
3534 //
3535 // Rules for "rpt_type" qualifiers (as of 2006-03-07):
3536 //
3537 // There can be multiple occurrences of this qualifier, and we need to keep them
3538 // all.
3539 // The value of this qualifier can also be a *list of values* which is *not*
3540 // conforming to the ASN.1 and thus needs to be cleaned up.
3541 //
3542 // The cleanup entails turning the list of values into multiple occurrences of the
3543 // given qualifier, each occurrence taking one of the values in the original
3544 // list.
3545 //
3546 {
3547  CGb_qual& qual = **it;
3548  string qual_type = qual.GetQual();
3549  string& val = qual.SetVal();
// an empty list "()" simply becomes an empty value
3550  if (NStr::Equal(val, "()")) {
3551  val.clear();
3552  return;
3553  }
3554  if ( ! s_IsCompoundRptTypeValue( val ) ) {
3555  //
3556  // nothing to do ...
3557  //
3558  return;
3559  }
3560 
3561  //
3562  // Generate list of cleaned up values. Fix original qualifier and generate
3563  // list of new qualifiers to be added to the original list:
3564  //
3565  vector< string > newValues;
// drop the surrounding parentheses, then split on commas
3566  string valueList = val.substr(1, val.length() - 2);
3567  NStr::Split(valueList, ",", newValues, NStr::fSplit_Tokenize);
3568 
// NOTE(review): newValues[0] is read without checking that Split produced any
// token; a value such as "(,)" may leave newValues empty - confirm and guard.
3569  qual.SetVal( newValues[0] );
3570 
// remaining values become new qualifiers of the same type, collected in new_quals
3571  for ( size_t i=1; i < newValues.size(); ++i ) {
3572  CRef< CGb_qual > newQual( new CGb_qual() );
3573  newQual->SetQual( qual_type );
3574  newQual->SetVal( newValues[i] );
3575  new_quals.push_back( newQual );
3576  }
3577 }
3578 
3579 
// Walks quals, converts "{...}" values to "(...)", and for a fixed set of
// qualifier names expands compound "(a,b,...)" values into separate
// qualifiers via s_ExpandThisQual. Newly created qualifiers are appended to
// quals afterwards and GBQualBC is then run on every qualifier.
// NOTE(review): the signature line and some interior lines are absent from
// this listing (gaps in the embedded numbering).
3581 {
3582  CSeq_feat::TQual new_quals;
3583  NON_CONST_ITERATE (CSeq_feat::TQual, it, quals) {
3584  CGb_qual& gb_qual = **it;
3585 
3586  string& qual = GET_MUTABLE(gb_qual, Qual);
3587  string& val = GET_MUTABLE(gb_qual, Val);
3588 
3589  // convert curly braces to parens for some quals
// NOTE(review): this conversion runs before the qualifier name is inspected,
// so as visible here it applies to every qualifier - confirm intended.
3590  if( (val.length() > 1) && (val[0] == '{') &&
3591  (val[val.length()-1] == '}') )
3592  {
3593  val[0] = '(';
3594  val[val.length()-1] = ')';
3596  }
3597 
// every branch below performs the same expansion; only the name differs
3598  if (NStr::EqualNocase(qual, "rpt_type")) {
3599  s_ExpandThisQual( quals, it, new_quals );
3600  } else if (NStr::EqualNocase(qual, "rpt_unit")) {
3601  s_ExpandThisQual( quals, it, new_quals );
3602  } else if (NStr::EqualNocase(qual, "rpt_unit_range")) {
3603  s_ExpandThisQual( quals, it, new_quals );
3604  } else if (NStr::EqualNocase(qual, "rpt_unit_seq")) {
3605  s_ExpandThisQual( quals, it, new_quals );
3606  } else if (NStr::EqualNocase(qual, "usedin")) {
3607  s_ExpandThisQual( quals, it, new_quals );
3608  } else if (NStr::EqualNocase(qual, "old_locus_tag")) {
3609  s_ExpandThisQual( quals, it, new_quals );
3610  } else if (NStr::EqualNocase(qual, "compare")) {
3611  s_ExpandThisQual( quals, it, new_quals );
3612  } else if (NStr::EqualNocase(qual, "replace")) {
3613  s_ExpandThisQual( quals, it, new_quals );
3614  }
3615  }
3616 
// insertion is deferred until after the loop so quals is not modified while iterated
3617  if ( ! new_quals.empty() ) {
3618  quals.insert(quals.end(), new_quals.begin(), new_quals.end());
3620  NON_CONST_ITERATE (CSeq_feat::TQual, it, quals) {
3621  GBQualBC(**it);
3622  }
3623  }
3624 }
3625 
// Moves the value of a gene-related qualifier (gb_qual) into the matching
// field of gene:
//   "map"          -> Maploc     (only if not already set)
//   "allele"       -> Allele     (only if not already set)
//   "locus_tag"    -> Locus_tag  (only if not already set)
//   "gene_synonym" -> appended to Syn
// Returns eAction_Erase when the value was transferred, eAction_Nothing
// otherwise (including when the value is blank).
// NOTE(review): the signature line and some interior lines are absent from
// this listing (gaps in the embedded numbering), e.g. the body of the
// "allele already set" branch.
3628 {
3629  const string& qual = GET_FIELD(gb_qual, Qual);
3630  const string& val = GET_FIELD(gb_qual, Val);
3631 
3632  if( NStr::IsBlank(val) ) {
3633  return eAction_Nothing;
3634  }
3635 
3636  bool change_made = false;
3637  if (NStr::EqualNocase(qual, "map")) {
3638  if (! gene.IsSetMaploc() ) {
3639  change_made = true;
3640  gene.SetMaploc(val);
3641  }
3642  } else if (NStr::EqualNocase(qual, "allele")) {
3643  if ( gene.IsSetAllele() ) {
3645  } else {
3646  change_made = true;
3647  gene.SetAllele(val);
3648  }
3649  } else if (NStr::EqualNocase(qual, "locus_tag")) {
3650  if ( ! gene.IsSetLocus_tag() ) {
3651  change_made = true;
3652  gene.SetLocus_tag(val);
3653  }
3654  } else if (NStr::EqualNocase(qual, "gene_synonym")) {
// synonyms accumulate; no "already set" check here
3655  change_made = true;
3656  gene.SetSyn().push_back(val);
3657  }
3658  if (change_made) {
3660  }
3661 
3662  return ( change_made ? eAction_Erase : eAction_Nothing );
3663 }
3664 
// Handles a GenBank qualifier (gb_qual) found on a coding-region feature
// (feat / cds). Depending on the qualifier name it transfers the value into
// the Cdregion (frame, genetic code) or into protein data, and returns
// eAction_Erase when the qualifier should be removed, eAction_Nothing when
// it should be kept.
// NOTE(review): the signature line and several interior lines are absent
// from this listing (gaps in the embedded numbering), including the
// declarations of new_frame and gc.
3667 {
3668  const string& qual = gb_qual.GetQual();
3669  const string& val = gb_qual.GetVal();
3670 
3671  // transl_except qual -> Cdregion.code_break
3672  if (NStr::EqualNocase(qual, "transl_except")) {
3673  // could not be parsed earlier
3674  return eAction_Nothing;
3675  }
3676 
3677  // codon_start qual -> Cdregion.frame
3678  if (NStr::EqualNocase(qual, "codon_start")) {
3679  CCdregion::TFrame frame = GET_FIELD(cds, Frame);
// the qual is erased for any valid frame value, but the frame is only
// overwritten when unset, or when the feature is pseudo and has no product
3681  if (new_frame == CCdregion::eFrame_one ||
3682  new_frame == CCdregion::eFrame_two ||
3683  new_frame == CCdregion::eFrame_three) {
3684  if (frame == CCdregion::eFrame_not_set ||
3685  ( FIELD_EQUALS( feat, Pseudo, true ) && ! FIELD_IS_SET(feat, Product) )) {
3686  cds.SetFrame(new_frame);
3688  }
3689  return eAction_Erase;
3690  }
3691  }
3692 
3693  // transl_table qual -> Cdregion.code
3694  if (NStr::EqualNocase(qual, "transl_table")) {
3695  if ( FIELD_IS_SET(cds, Code) ) {
3696  const CCdregion::TCode& code = GET_FIELD(cds, Code);
// default to table 1 when the existing code has no non-zero id
3697  int transl_table = 1;
3698  ITERATE (CCdregion::TCode::Tdata, it, code.Get()) {
3699  if ( FIELD_IS(**it, Id) && GET_FIELD(**it, Id) != 0) {
3700  transl_table = GET_FIELD(**it, Id);
3701  break;
3702  }
3703  }
3704 
// qual is redundant only when it matches the existing genetic code
3705  if (NStr::EqualNocase(NStr::UIntToString(transl_table), val)) {
3706  return eAction_Erase;
3707  }
3708  } else {
3709  int new_val = NStr::StringToNonNegativeInt(val);
3710  if (new_val > 0) {
3712  SET_FIELD(*gc, Id, new_val);
3713  cds.SetCode().Set().push_back(gc);
3714 
3715  // we don't have to check except-text because we're
3716  // setting an unset genetic_code, not changing an existing one
3717  // (the except-text would be: "genetic code exception")
3719  return eAction_Erase;
3720  }
3721  }
3722  }
3723 
3724  // look for qualifiers that should be applied to protein feature
3725  // note - this should be moved to the "indexed" portion of basic cleanup,
3726  // because it needs to locate another sequence and feature
// NOTE(review): "EC_number" is compared case-insensitively while the other
// three names are case-sensitive - confirm intended.
3727  if (NStr::Equal(qual, "product") || NStr::Equal (qual, "function") || NStr::EqualNocase (qual, "EC_number")
3728  || NStr::Equal (qual, "prot_note"))
3729  {
3730  // get protein sequence for product
3731  CRef<CSeq_feat> prot_feat;
3732  CRef<CProt_ref> prot_ref;
3733 
3734  // try to get existing prot_feat
3735  CBioseq_Handle prot_handle;
3736  if ( FIELD_IS_SET(feat, Product) ) {
3737  const CSeq_id *prod_seq_id = feat.GetProduct().GetId();
3738  if( prod_seq_id ) {
3739  prot_handle = m_Scope->GetBioseqHandle(*prod_seq_id);
3740  }
3741  }
3742  if (prot_handle) {
3743  // find main protein feature
3744  CConstRef<CBioseq> pseq = prot_handle.GetCompleteBioseq();
3745  if (pseq && pseq->IsSetAnnot()) {
3746  for (auto ait : pseq->GetAnnot()) {
3747  if (ait->IsFtable()) {
3748  for (auto fit : ait->GetData().GetFtable()) {
// no break: if several prot features exist, the last one visited wins
3749  if (fit->IsSetData() && fit->GetData().GetSubtype() == CSeqFeatData::eSubtype_prot) {
// const is cast away so the protein feature can be edited in place
3750  prot_feat.Reset(const_cast<CSeq_feat*>(fit.GetPointer()));
3751  prot_ref.Reset(&(prot_feat->SetData().SetProt()));
3752  }
3753  }
3754  }
3755  }
3756  }
3757  }
3758 
3759  bool push_back_xref_on_success = false;
3760  CRef<CSeqFeatXref> xref;
3761  if ( ! prot_ref ) {
3762  // otherwise make cross reference
3763  prot_ref.Reset( new CProt_ref );
3764 
3765  // see if this seq-feat already has a prot xref
3766  EDIT_EACH_SEQFEATXREF_ON_SEQFEAT( xref_iter, feat ) {
3767  if( (*xref_iter)->IsSetData() && (*xref_iter)->GetData().IsProt() ) {
3768  xref = *xref_iter;
3769  }
3770  }
3771  // seq-feat has no prot xref. We make our own.
3772  if ( ! xref ) {
3773  xref.Reset( new CSeqFeatXref );
3774  xref->SetData().SetProt( *prot_ref );
3775  // we will push the xref onto the feat if the add was successful
3776  push_back_xref_on_success = true;
3777  }
// from here on prot_ref refers to the Prot-ref held by the xref
3778  prot_ref.Reset( &xref->SetData().SetProt() );
3779  }
3780 
3781  // replacement prot feature
3782  EAction action = eAction_Nothing;
3783 
3784  if (NStr::Equal(qual, "prot_note") ) {
// prot_note is only applied when a real protein feature was found;
// it is appended to any existing non-blank comment with "; "
3785  if( prot_feat ) {
3786  if (!prot_feat->IsSetComment() || NStr::IsBlank (prot_feat->GetComment())) {
3787  SET_FIELD( *prot_feat, Comment, val);
3788  } else {
3789  SET_FIELD( *prot_feat, Comment, (prot_feat->GetComment() + "; " + val) );
3790  }
3792  action = eAction_Erase;
3793  }
3794  } else {
3795  action = x_ProtGBQualBC( *prot_ref, gb_qual, eGBQualOpt_CDSMode );
3796  }
3797 
// NOTE(review): as visible here the xref is pushed whenever it was newly
// created, without testing `action` - confirm against the missing lines.
3798  if( push_back_xref_on_success ) {
3799  feat.SetXref().push_back( xref );
3801  }
3802 
3803  return action;
3804  }
3805 
// a translation qualifier is always dropped
3806  if (NStr::EqualNocase(qual, "translation")) {
3807  return eAction_Erase;
3808  }
3809 
3810  return eAction_Nothing;
3811 }
3812 
3814 
// Table pairing amino-acid names and abbreviations (key) with a single
// character code (value). Several keys share one code, e.g. "Ala" and
// "Alanine" both give 'A'; "Ter", "TERM" and "Termination" give '*'.
// Entries are listed in case-insensitive alphabetical order by key.
// NOTE(review): presumably that order is required by the static map built
// from this array - confirm before inserting or reordering entries.
3815 static const TTrnaKey trna_key_to_subtype [] = {
3816  { "Ala", 'A' },
3817  { "Alanine", 'A' },
3818  { "Arg", 'R' },
3819  { "Arginine", 'R' },
3820  { "Asn", 'N' },
3821  { "Asp", 'D' },
3822  { "Asp or Asn", 'B' },
3823  { "Asparagine", 'N' },
3824  { "Aspartate", 'D' },
3825  { "Aspartic", 'D' },
3826  { "Aspartic Acid", 'D' },
3827  { "Asx", 'B' },
3828  { "Cys", 'C' },
3829  { "Cysteine", 'C' },
3830  { "fMet", 'M' },
3831  { "Gln", 'Q' },
3832  { "Glu", 'E' },
3833  { "Glu or Gln", 'Z' },
3834  { "Glutamate", 'E' },
3835  { "Glutamic", 'E' },
3836  { "Glutamic Acid", 'E' },
3837  { "Glutamine", 'Q' },
3838  { "Glx", 'Z' },
3839  { "Gly", 'G' },
3840  { "Glycine", 'G' },
3841  { "His", 'H' },
3842  { "Histidine", 'H' },
3843  { "Ile", 'I' },
3844  { "Ile2", 'I' },
3845  { "iMet", 'M' },
3846  { "Isoleucine", 'I' },
3847  { "Leu", 'L' },
3848  { "Leu or Ile", 'J' },
3849  { "Leucine", 'L' },
3850  { "Lys", 'K' },
3851  { "Lysine", 'K' },
3852  { "Met", 'M' },
3853  { "Methionine", 'M' },
3854  { "OTHER", 'X' },
3855  { "Phe", 'F' },
3856  { "Phenylalanine", 'F' },
3857  { "Pro", 'P' },
3858  { "Proline", 'P' },
3859  { "Pyl", 'O' },
3860  { "Pyrrolysine", 'O' },
3861  { "Sec", 'U' },
3862  { "Selenocysteine", 'U' },
3863  { "Ser", 'S' },
3864  { "Serine", 'S' },
3865  { "Ter", '*' },
3866  { "TERM", '*' },
3867  { "Termination", '*' },
3868  { "Thr", 'T' },
3869  { "Threonine", 'T' },
3870  { "Trp", 'W' },
3871  { "Tryptophan", 'W' },
3872  { "Tyr", 'Y' },
3873  { "Tyrosine", 'Y' },
3874  { "Val", 'V' },
3875  { "Valine", 'V' },
3876  { "Xle", 'J' },
3877  { "Xxx", 'X' }
3878 };
3879 
3882 
3883 // This maps in the opposite direction of sm_TrnaKeys
3884 class CAminoAcidCharToSymbol : public multimap<char, const char*, PNocase_LessChar>
3885 {
3886 public:
3887  CAminoAcidCharToSymbol( const TTrnaKey keys[], int num_keys )
3888  {
3889  int ii = 0;
3890  for( ; ii < num_keys; ++ii ) {
3891  insert(value_type( keys[ii].second, keys[ii].first ));
3892  }
3893  }
3894 };
3897  (sizeof(trna_key_to_subtype) / sizeof(trna_key_to_subtype[0])) );
3898 
// Parses an anticodon qualifier value of the form "(pos:<location>[,aa:<name>])"
// into a new CTrna_ext.
//   str   - the qualifier text
//   feat  - feature whose location supplies the Seq-id and strand
//   scope - used to resolve the location text and to fetch the bioseq
// Returns a null CRef when str is blank, does not start with "(pos:", or has
// no matching close paren; also when the bioseq cannot be found or the parsed
// anticodon extends past the end of the sequence.
// NOTE(review): an unrecognized "aa:" abbreviation returns the freshly
// allocated (empty, non-null) trna rather than a null CRef - confirm callers
// expect that.
// NOTE(review): one interior line (the declaration of aa) is absent from
// this listing.
3899 static CRef<CTrna_ext> s_ParseTRnaFromAnticodonString (const string &str, const CSeq_feat& feat, CScope *scope)
3900 {
3901  CRef<CTrna_ext> trna;
3902 
3903  if (NStr::IsBlank (str)) return trna;
3904 
3905  if (NStr::StartsWith (str, "(pos:")) {
3906  // find position of closing paren
3907  string::size_type pos_end = s_MatchingParenPos( str, 0 );
3908  if (pos_end != string::npos) {
3909  trna.Reset( new CTrna_ext );
// text between "(pos:" and the matching ")"
3910  string pos_str = str.substr (5, pos_end - 5);
3911  string::size_type aa_start = NStr::FindNoCase (pos_str, "aa:");
3912  if (aa_start != string::npos) {
3913  string abbrev = pos_str.substr (aa_start + 3);
3914  TTrnaMap::const_iterator t_iter = sm_TrnaKeys.find (abbrev.c_str ());
3915  if (t_iter == sm_TrnaKeys.end ()) {
3916  // unable to parse
3917  return trna;
3918  }
3920  aa->SetIupacaa (t_iter->second);
3921  trna->SetAa(*aa);
// keep only the location part: drop "aa:...", surrounding spaces and a trailing comma
3922  pos_str = pos_str.substr (0, aa_start);
3923  NStr::TruncateSpacesInPlace (pos_str);
3924  if (NStr::EndsWith (pos_str, ",")) {
3925  pos_str = pos_str.substr (0, pos_str.length() - 1);
3926  }
3927  }
3928  const CSeq_loc& loc = feat.GetLocation();
3929  CRef<CSeq_loc> anticodon = ReadLocFromText (pos_str, loc.GetId(), scope);
3930  if( anticodon ) {
3931  CBioseq_Handle bsh = scope->GetBioseqHandle(*(loc.GetId()));
3932  if (!bsh) {
3933  trna.Reset();
3934  return trna;
3935  }
// reject anticodon locations that run off the end of the sequence
3936  if (anticodon->GetStop(eExtreme_Positional) >= bsh.GetInst_Length()) {
3937  trna.Reset();
3938  return trna;
3939  }
3940  if (feat.GetLocation().IsSetStrand()) {
3941  anticodon->SetStrand(loc.GetStrand());
3942  } else {
3943  anticodon->SetStrand(eNa_strand_plus); // anticodon is always on plus strand
3944  }
3945  }
// without a usable location the amino acid is discarded as well
3946  if (!anticodon) {
3947  trna->ResetAa();
3948  } else {
3949  trna->SetAnticodon(*anticodon);
3950  }
3951  }
3952  }
3953  return trna;
3954 }
3955 
// Maps an amino-acid token to its one-character code.
// A one-character token is upper-cased and returned if it is a known code in
// sm_TrnaInverseKeys; a longer token is looked up in sm_TrnaKeys.
// Returns '\0' for an empty string or when nothing matches.
// NOTE(review): one interior line (between the copy into tmp and the length
// test) is absent from this listing.
3956 static
3957 char s_FindTrnaAA( const string &str )
3958 {
3959  if ( str.empty() ) return '\0';
3960  string tmp = str;
3962 
3963  if( tmp.length() == 1 ) {
3964  // if the string is a valid one-letter code, just return that
3965  const char aminoAcidLetter = toupper(tmp[0]);
3966  if( sm_TrnaInverseKeys.find(aminoAcidLetter) != sm_TrnaInverseKeys.end() ) {
3967  return aminoAcidLetter;
3968  }
3969  } else {
3970  // translate 3-letter codes and full-names to one-letter codes
3971  TTrnaMap::const_iterator trna_iter = sm_TrnaKeys.find (tmp.c_str ());
3972  if( trna_iter != sm_TrnaKeys.end() ) {
3973  return trna_iter->second;
3974  }
3975  }
3976 
3977  return '\0';
3978 }
3979 
// Predicate functor: operator() returns true when the given character is one
// of the characters supplied to the constructor.
// NOTE(review): the declaration of the char_set member is absent from this
// listing (it would sit in the private section).
3980 class CCharInSet {
3981 public:
// copies every character of list_of_characters into char_set
3982  CCharInSet( const string &list_of_characters ) {
3983  copy( list_of_characters.begin(), list_of_characters.end(),
3984  inserter( char_set, char_set.begin() ) );
3985  }
3986 
3987  bool operator()( const char ch ) const {
3988  return ( char_set.find(ch) != char_set.end() );
3989  }
3990 
3991 private:
3993 };
3994 
// Splits a tRNA description into tokens, written to out_string_list (which
// is cleared first).
// Two paths:
//  - SGD-style names matching Tx(NNN)c[#][#] yield two tokens: a
//    parenthesized codon string and the one-letter amino acid x.
//  - Otherwise punctuation is replaced by spaces, the text is split on
//    whitespace, a leading "tRNA" is stripped from each token, and tokens of
//    three letters plus digits are cut to the three letters.
// NOTE(review): the line that fills reverse_complement is absent from this
// listing.
3995 static
3996 void s_TokenizeTRnaString (const string &tRNA_string, list<string> &out_string_list )
3997 {
3998  out_string_list.clear();
3999  if ( tRNA_string.empty() ) return;
4000 
4001  // SGD Tx(NNN)c or Tx(NNN)c#, where x is the amino acid, c is the chromosome (A-P, Q for mito),
4002  // and optional # is presumably for individual tRNAs with different anticodons and the same
4003  // amino acid.
4004  CCachedRegexp valid_sgd_regex = regexpCache.Get(
4005  "^[Tt][A-Za-z]\\(...\\)[A-Za-z]\\d?\\d?$");
4006  if ( valid_sgd_regex->IsMatch(tRNA_string) ) {
4007  // parse SGD tRNA anticodon
4008  out_string_list.push_back(kEmptyStr);
4009  string &new_SGD_tRNA_anticodon = out_string_list.back();
// the three characters inside the parentheses
4010  string raw_codon_part = tRNA_string.substr(3,3);
4011  NStr::ToUpper( raw_codon_part );
4012  string reverse_complement;
4014  new_SGD_tRNA_anticodon = string("(") + reverse_complement + ')';
4015 
4016  // parse SGD tRNA amino acid
4017  out_string_list.push_back(tRNA_string.substr(1,1));
4018  return;
4019  }
4020 
4021  string tRNA_string_copy = tRNA_string;
4022  // Note that we do NOT remove "*", since it might be a terminator tRNA symbol
4023  replace_if( tRNA_string_copy.begin(), tRNA_string_copy.end(),
4024  CCharInSet("-,;:()=\'_~"), ' ' );
4025 
4026  vector<string> tRNA_tokens;
4027  // " \t\n\v\f\r" are the standard whitespace chars
4028  // ( source: http://www.cplusplus.com/reference/clibrary/cctype/isspace/ )
4029  NStr::Split(tRNA_string_copy, " \t\n\v\f\r", tRNA_tokens, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
4030 
4031  EDIT_EACH_STRING_IN_VECTOR( tRNA_token_iter, tRNA_tokens ) {
4032  string &tRNA_token = *tRNA_token_iter;
4033  // remove initial "tRNA", if any
4034  if ( NStr::StartsWith(tRNA_token, "tRNA", NStr::eNocase) ) {
4035  tRNA_token = tRNA_token.substr(4);
4036  }
4037  CCachedRegexp threeLettersPlusDigits = regexpCache.Get(
4038  "^[A-Za-z][A-Za-z][A-Za-z]\\d*$");
// tokens left empty by the "tRNA" strip are dropped
4039  if (! tRNA_token.empty() ) {
4040  if ( threeLettersPlusDigits->IsMatch(tRNA_token) ) {
4041  tRNA_token = tRNA_token.substr(0, 3);
4042  }
4043  out_string_list.push_back(tRNA_token);
4044  }
4045  }
4046 }
4047 
4048 
4049 // based on C's ParseTRnaString
4050 static
4051 char s_ParseSeqFeatTRnaString( const string &comment, bool *out_justTrnaText, string &tRNA_codon, bool noSingleLetter )
4052 {
4053  if (out_justTrnaText) {
4054  *out_justTrnaText = false;
4055  }
4056  tRNA_codon.clear();
4057 
4058  if ( comment.empty() ) return '\0';
4059 
4060  CRef<CTrna_ext> tr( new CTrna_ext );
4061 
4062  char aa = '\0';
4063  list<string> head;
4064  s_TokenizeTRnaString (comment, head);
4065  bool justt = true;
4066  list<string>::const_iterator head_iter = head.begin();
4067  bool is_ambig = false;
4068  for( ; head_iter != head.end(); ++head_iter ) {
4069  const string &str = *head_iter;
4070  if( str.empty() ) continue;
4071  char curraa = '\0';
4072  if (noSingleLetter && str.length() == 1) {
4073  curraa = '\0';
4074  } else {
4075  curraa = s_FindTrnaAA (str);
4076  }
4077  if(curraa != '\0') {
4078  if (aa == '\0') {
4079  aa = curraa;
4080  } else if( curraa != aa) {
4081  is_ambig = true;
4082  }
4083  } else if ( ! NStr::EqualNocase ("tRNA", str) &&
4084  ! NStr::EqualNocase ("transfer", str) &&
4085  ! NStr::EqualNocase ("RNA", str) &&
4086  ! NStr::EqualNocase ("product", str) )
4087  {
4088  justt = false;
4089  }
4090  }
4091  if( is_ambig ) {
4092  aa = 0;
4093  }
4094 
4095  if (justt) {
4096  if( comment.find_first_of("0123456789") != string::npos ) {
4097  justt = false;
4098  }
4099  }
4100  if (out_justTrnaText) {
4101  *out_justTrnaText = justt;
4102  }
4103  return aa;
4104 }
4105 
4106 
// Adds comment to feat's comment: sets it when no comment exists, otherwise
// appends "; " followed by comment.
// NOTE(review): one line before the closing brace is absent from this listing.
4107 void CNewCleanup_imp::x_AddToComment(CSeq_feat& feat, const string& comment)
4108 {
4109  if (!feat.IsSetComment()) {
4110  feat.SetComment(comment);
4111  }
4112  else {
4113  feat.SetComment() += "; " + comment;
4114  }
4116 }
4117 
// Applies a /product qualifier value (product) to an RNA-ref (rna) on feat,
// with special handling for tRNA. Returns eAction_Erase when the qualifier
// has been absorbed and may be removed, eAction_Nothing when it must stay.
// Only tRNA, "other" and "unknown" RNA types are considered.
// NOTE(review): the signature line and several interior lines are absent
// from this listing (gaps in the embedded numbering).
4120 {
4121  CRNA_ref::TType& rna_type = rna.SetType();
4122 
4123  if (rna_type != CRNA_ref::eType_tRNA &&
4124  rna_type != CRNA_ref::eType_other &&
4125  rna_type != CRNA_ref::eType_unknown) {
4126  return eAction_Nothing;
4127  }
4128 
// Case 1: tRNA whose ext is a plain name - try to turn the name into a
// structured tRNA ext, keeping fMet / iMet / Ile2 markers in the comment.
4129  if (rna_type == NCBI_RNAREF(tRNA) && rna.IsSetExt() && rna.GetExt().IsName()) {
4130  string name = rna.GetExt().GetName();
4131  bool justTrnaText = false;
4132  string codon;
4133  char aa = s_ParseSeqFeatTRnaString(name, &justTrnaText, codon, false);
4134  if (aa != '\0') {
4135  const bool is_fMet = (NStr::Find(name, "fMet") != NPOS);
4136  const bool is_iMet = (NStr::Find(name, "iMet") != NPOS);
4137  const bool is_Ile2 = (NStr::Find(name, "Ile2") != NPOS);
4138  CRNA_ref_Base::C_Ext::TTRNA &trp = rna.SetExt().SetTRNA();
4139  trp.SetAa().SetNcbieaa(aa);
4140  if (aa == 'M') {
4141  if (is_fMet) {
4142  x_AddToComment(feat, "fMet");
4143  } else if (is_iMet) {
4144  x_AddToComment(feat, "iMet");
4145  }
4146  } else if (aa == 'I') {
4147  if (is_Ile2) {
4148  x_AddToComment(feat, "Ile2");
4149  }
4150  }
4151  x_SeqFeatTRNABC(feat, trp);
4153  }
4154  }
// Case 2: tRNA with no ext at all - build one from the product text, or
// fall back to moving the product into the comment.
4155  if (rna_type == NCBI_RNAREF(tRNA) && !rna.IsSetExt()) {
4156  // this part inserted from: AddQualifierToFeature (sfp, "product", gb_qual_val);
4157  bool justTrnaText = false;
4158  string codon;
4159  char aa = s_ParseSeqFeatTRnaString(product, &justTrnaText, codon, false);
4160  if (aa != '\0') {
4161 
4162  CRNA_ref_Base::C_Ext::TTRNA& trna = rna.SetExt().SetTRNA();
4163  trna.SetAa().SetNcbieaa(aa);
4164 
4165  if (!justTrnaText || !NStr::IsBlank(codon)) {
4166  x_AddToComment(feat, product);
4167  }
4168 
// products naming fMet / iMet / Ile2 that the comment does not already
// mention are kept as qualifiers (eAction_Nothing)
4169  if (aa == 'M') {
4170  if (NStr::Find(product, "fMet") != NPOS &&
4171  (!feat.IsSetComment() || NStr::Find(feat.GetComment(), "fMet") == NPOS)) {
4172  // x_AddToComment(feat, "fMet");
4174  return eAction_Nothing;
4175  } else if (NStr::Find(product, "iMet") != NPOS &&
4176  (!feat.IsSetComment() || NStr::Find(feat.GetComment(), "iMet") == NPOS)) {
4177  // x_AddToComment(feat, "iMet");
4179  return eAction_Nothing;
4180  }
4181  } else if (aa == 'I') {
4182  if (NStr::Find(product, "Ile2") != NPOS &&
4183  (!feat.IsSetComment() || NStr::Find(feat.GetComment(), "Ile2") == NPOS)) {
4184  // x_AddToComment(feat, "Ile2");
4186  return eAction_Nothing;
4187  }
4188  }
4189 
4191  }
4192  else {
4193  x_AddToComment(feat, product);
4194  }
4195  return eAction_Erase;
4196  }
// Case 3: tRNA that already has a structured tRNA ext.
4197  if (rna_type == NCBI_RNAREF(tRNA) && rna.IsSetExt() && rna.GetExt().IsTRNA()) {
4198  CRNA_ref_Base::C_Ext::TTRNA& trp = rna.SetExt().SetTRNA();
4199  if (trp.IsSetAa() && trp.GetAa().IsNcbieaa()) {
4200  string ignored;
// the empty "then" branch means: product agrees with the existing aa, carry on
4201  if (trp.GetAa().GetNcbieaa() == s_ParseSeqFeatTRnaString(product, nullptr, ignored, false) &&
4202  NStr::IsBlank(ignored)) {
4203  } else {
4204  // don't remove product qual because it conflicts with existing aa value
4205  return eAction_Nothing;
4206  }
// NOTE(review): the "iRNA-..." spellings below are reproduced as found - confirm
// they are intentional and not a typo for "tRNA-...".
4207  if (NStr::CompareNocase (product, "tRNA-fMet") == 0 || NStr::CompareNocase (product, "iRNA-fMet") == 0) {
4208  return eAction_Nothing;
4209  }
4210  if (NStr::CompareNocase (product, "tRNA-iMet") == 0 || NStr::CompareNocase (product, "iRNA-iMet") == 0) {
4211  return eAction_Nothing;
4212  }
4213  if (NStr::CompareNocase (product, "tRNA-Ile2") == 0 || NStr::CompareNocase (product, "iRNA-Ile2") == 0) {
4214  return eAction_Nothing;
4215  }
4216  return eAction_Erase;
4217  } else if (!trp.IsSetAa()) {
4218  string ignored;
4219  bool justTrnaText = false;
4220  char aa = s_ParseSeqFeatTRnaString(product, &justTrnaText, ignored, false);
4221  if (aa != '\0') {
4222  trp.SetAa().SetNcbieaa(aa);
4223  if (!justTrnaText || !NStr::IsBlank(ignored)) {
4224  x_AddToComment(feat, product);
4225  }
4226  if (NStr::CompareNocase(product, "tRNA-fMet") == 0 ||
4227  NStr::CompareNocase(product, "iRNA-fMet") == 0 ||
4228  NStr::CompareNocase(product, "tRNA-iMet") == 0 ||
4229  NStr::CompareNocase(product, "iRNA-iMet") == 0 ||
4230  NStr::CompareNocase(product, "tRNA-Ile2") == 0 ||
4231  NStr::CompareNocase(product, "iRNA-Ile2") == 0) {
4232  return eAction_Nothing;
4233  }
4234  return eAction_Erase;
4235  }
4236  }
4237  }
4238 
// a product identical to the existing ext name is redundant
4239  if (rna.IsSetExt() && rna.GetExt().IsName() && NStr::Equal(rna.GetExt().GetName(), product)) {
4240  return eAction_Erase;
4241  }
4242 
4243  return eAction_Nothing;
4244 }
4245 
4246 
// Handler for a "standard_name" qualifier on an RNA feature; as visible
// here it does nothing and always returns eAction_Nothing.
// NOTE(review): the signature line is absent from this listing.
4248 {
4249  return eAction_Nothing;
4250 }
4251 
4252 
4253 // homologous to C's HandledGBQualOnRNA.
4254 // That func was copy-pasted, then translated into C++.
4255 // Later we can go back and actually refactor the code
4256 // to make it more efficient or cleaner.
// Handles "standard_name", "product" and "anticodon" qualifiers on an RNA
// feature. Returns eAction_Erase when the qualifier has been absorbed into
// rna / feat, eAction_Nothing otherwise.
// NOTE(review): the signature lines and several interior lines are absent
// from this listing (gaps in the embedded numbering), including the
// declaration of trna in the anticodon branch.
4259 {
4260  if( ! gb_qual.IsSetVal()) {
4261  return eAction_Nothing;
4262  }
4263  const string &gb_qual_qual = gb_qual.GetQual();
4264  string &gb_qual_val = gb_qual.SetVal();
4265  TRNAREF_TYPE& rna_type = rna.SetType();
4266 
4267  if (NStr::EqualNocase(gb_qual_qual, "standard_name")) {
4268  return x_HandleStandardNameRnaGBQual(feat, rna, gb_qual_val);
4269  }
4270  if (NStr::IsBlank(gb_qual_val)) {
4271  return eAction_Nothing;
4272  }
4273 
4274  if (NStr::EqualNocase( gb_qual_qual, "product" ))
4275  {
// an unknown RNA type is promoted to "other" before the product is applied
4276  if (rna_type == NCBI_RNAREF(unknown)) {
4277  rna_type = NCBI_RNAREF(other);
4279  }
// an empty ext name is treated as no ext at all
4280  if ( rna.IsSetExt() && rna.GetExt().IsName() ) {
4281  const string &name = rna.SetExt().SetName();
4282  if ( name.empty() ) {
4283  rna.ResetExt();
4285  }
4286  }
4287  if (x_HandleTrnaProductGBQual(feat, rna, gb_qual_val) == eAction_Erase) {
4288  return eAction_Erase;
4289  }
4290 
4291  if (!rna.IsSetExt()) {
4292  string remainder;
4293  rna.SetRnaProductName(gb_qual_val, remainder);
4295  if (NStr::IsBlank(remainder)) {
4296  return eAction_Erase;
4297  } else {
// NOTE(review): this stores the leftover product text as the qualifier NAME
// (SetQual), not its value - SetVal looks like what was meant; confirm.
4298  gb_qual.SetQual(remainder);
4299  return eAction_Nothing;
4300  }
4301  }
4302  if( rna.GetExt().IsGen() ) {
4303  CRNA_gen & rna_gen = rna.SetExt().SetGen();
4304  if( RAW_FIELD_IS_EMPTY_OR_UNSET(rna_gen, Product) ) {
4305  rna_gen.SetProduct(gb_qual_val);
4307  return eAction_Erase;
4308  }
4309  return eAction_Nothing;
4310  }
4311  if (rna.GetExt().IsName() && NStr::Equal(rna.GetExt().GetName(), gb_qual_val)) {
4312  return eAction_Erase;
4313  }
4314  if ( rna.IsSetExt() && ! rna.GetExt().IsName() ) return eAction_Nothing;
4315  const string &name = ( rna.IsSetExt() ? rna.GetExt().GetName() : kEmptyStr );
4316  if (! name.empty() ) {
// rewrite the first "rDNA" in the value as "rRNA" before comparing
4317  SIZE_TYPE rDNA_pos = NStr::Find( gb_qual_val, "rDNA");
4318  if (rDNA_pos != NPOS) {
4319  gb_qual_val[rDNA_pos+1] = 'R';
4321  }
4322  if ( NStr::EqualNocase(name, gb_qual_val) ) {
4323  return eAction_Erase;
4324  }
4325  if (rna_type == NCBI_RNAREF(other) || rna_type == NCBI_RNAREF(ncRNA) ||
4326  rna_type == NCBI_RNAREF(tmRNA) || rna_type == NCBI_RNAREF(miscRNA) )
4327  {
4328  // new convention follows ASN.1 spec comments, allows new RNA types
4329  return eAction_Nothing;
4330  }
4331  // subsequent /product now added to comment
4332  x_AddToComment(feat, gb_qual_val);
4334  return eAction_Erase;
4335  }
4336  if (rna_type == NCBI_RNAREF(ncRNA) ||
4337  rna_type == NCBI_RNAREF(tmRNA) || rna_type == NCBI_RNAREF(miscRNA) )
4338  {
4339  // new convention follows ASN.1 spec comments, allows new RNA types
4340  return eAction_Nothing;
4341  }
4342  if ( ! FIELD_CHOICE_EQUALS( rna, Ext, Name, gb_qual_val) ) {
4343  rna.SetExt().SetName( gb_qual_val );
4345  return eAction_Erase;
4346  }
4347  } else if (NStr::EqualNocase(gb_qual_qual, "anticodon") ) {
4348  if (!rna.IsSetType() || rna.GetType() == CRNA_ref::eType_unknown) {
4349  rna.SetType(CRNA_ref::eType_other);
4351  }
// anticodon qualifiers are only meaningful on tRNA
4352  if (rna.GetType() != CRNA_ref::eType_tRNA) {
4353  return eAction_Nothing;
4354  }
4355 
4357  if (!trna) {
4358  return eAction_Nothing;
4359  }
4360 
4361  x_SeqFeatTRNABC( feat, *trna );
4362  if (trna->IsSetAa() || trna->IsSetAnticodon()) {
4363  // don't apply at all if there are conflicts
4364  bool apply_aa = false;
4365  bool apply_anticodon = false;
4366  bool ok_to_apply = true;
4367 
4368  // look for conflict with aa
4369  if (!rna.IsSetExt() || !rna.GetExt().IsTRNA()) {
// no existing tRNA ext: nothing can conflict, apply whatever was parsed
4370  if (trna->IsSetAa()) {
4371  apply_aa = true;
4372  }
4373  if (trna->IsSetAnticodon()) {
4374  apply_anticodon = true;
4375  }
4376  }
4377  else {
4378  if (trna->IsSetAa()) {
4379  if (rna.GetExt().GetTRNA().IsSetAa()) {
4380  if (rna.GetExt().GetTRNA().GetAa().IsIupacaa()) {
4381  if (trna->GetAa().GetIupacaa() != rna.GetExt().GetTRNA().GetAa().GetIupacaa()) {
4382  ok_to_apply = false;
4383  }
4384  }
4385  }
4386  else {
4387  apply_aa = true;
4388  }
4389  }
4390  // look for conflict with anticodon
4391  if (trna->IsSetAnticodon()) {
4392  if (rna.GetExt().GetTRNA().IsSetAnticodon()) {
4393  if (sequence::Compare(rna.GetExt().GetTRNA().GetAnticodon(),
4394  trna->GetAnticodon(), m_Scope, sequence::fCompareOverlapping) != sequence::eSame) {
4395  ok_to_apply = false;
4396  }
4397  } else {
4398  apply_anticodon = true;
4399  }
4400  }
4401  }
4402  if (ok_to_apply) {
4403  if (apply_aa) {
// NOTE(review): reads GetNcbieaa() from the parsed trna while the conflict
// check above reads GetIupacaa() from it; confirm which encoding trna holds
// after x_SeqFeatTRNABC and that both accessors are valid.
4404  rna.SetExt().SetTRNA().SetAa().SetIupacaa(trna->GetAa().GetNcbieaa());
4406  }
4407  if (apply_anticodon) {
4408  CRef<CSeq_loc> anticodon(new CSeq_loc());
4409  anticodon->Add(trna->GetAnticodon());
4410  rna.SetExt().SetTRNA().SetAnticodon(*anticodon);
4412  }
4413  return eAction_Erase;
4414  }
4415  }
4416  }
4417  return eAction_Nothing;
4418 }
4419 
4420 
// Handles a GenBank qualifier (gb_qual) in the context of a Prot-ref (prot).
// "function" values are added to Activity and "EC_number" values to Ec.
// A fixed list of qualifier names is left alone (eAction_Nothing); any other
// qualifier that reaches the end of the function yields eAction_Erase.
// NOTE(review): the signature lines and several interior lines are absent
// from this listing (gaps in the embedded numbering), including the body of
// the product / standard_name branch and whatever follows the two
// ADD_STRING_TO_LIST calls.
4423 {
4424  const string& qual = gb_qual.GetQual();
4425  const string& val = gb_qual.GetVal();
4426 
4427  if (NStr::EqualNocase(qual, "product") || NStr::EqualNocase(qual, "standard_name")) {
// NOTE(review): front() is called whenever IsSetName() is true; presumably
// the name list is never set-but-empty - confirm.
4428  if (opt == eGBQualOpt_CDSMode || !prot.IsSetName() || NStr::IsBlank(prot.GetName().front())) {
4431  } else {
4432  return eAction_Nothing;
4433  }
4434  } else if (NStr::EqualNocase(qual, "function")) {
4435  ADD_STRING_TO_LIST(prot.SetActivity(), val);
4437  } else if (NStr::EqualNocase(qual, "EC_number")) {
4438  ADD_STRING_TO_LIST(prot.SetEc(), val);
4440  }
4441 
4442  // labels to leave alone
4443  static const char * const ignored_quals[] =
4444  { "label", "allele", "experiment", "inference", "UniProtKB_evidence",
4445  "dbxref", "replace", "rpt_unit_seq", "rpt_unit_range" };
// case-insensitive set built once from the array above
4446  static set<string, PNocase> ignored_quals_raw;
4447 
4448  // the mutex is just there in the unlikely event that two separate
4449  // threads both try to initialize ignored_quals_raw. It's NOT
4450  // needed for reading
4451  static CMutex ignored_quals_raw_initialization_mutex;
4452  {
4453  CMutexGuard guard(ignored_quals_raw_initialization_mutex);
4454  if (ignored_quals_raw.empty()) {
4455  copy(ignored_quals, ignored_quals + sizeof(ignored_quals) / sizeof(ignored_quals[0]),
4456  inserter(ignored_quals_raw, ignored_quals_raw.begin()));
4457  }
4458  }
4459 
4460  if (ignored_quals_raw.find(qual) != ignored_quals_raw.end()) {
4461  return eAction_Nothing;
4462  }
4463 
4464  // all other gbquals not appropriate on protein features
4465  return eAction_Erase;
4466 }
4467 
4468 
// Cleanup step for a BioSource (biosrc): when it has an Org, runs
// x_CleanupOldName and x_CleanupOrgModNoteEC on it.
// NOTE(review): the signature line and the first body line are absent from
// this listing (gaps in the embedded numbering).
4470 {
4472  if (biosrc.IsSetOrg()) {
4473  x_CleanupOldName(biosrc.SetOrg());
4474  x_CleanupOrgModNoteEC(biosrc.SetOrg());
4475  }
4476 }
4477 
4478 
// Adds environmental_sample and/or metagenomic subsource entries to biosrc
// when its organism's lineage or division calls for them and they are not
// already present:
//   lineage contains "environmental sample" (any case) or div == "ENV"
//       -> environmental_sample needed
//   lineage contains "metagenomes" (any case)
//       -> metagenomic needed
// Does nothing when biosrc has no Org or the Org has no Orgname.
// NOTE(review): the signature line and the lines that create the subsource
// object `s` are absent from this listing (gaps in the embedded numbering).
4480 {
4481  if (!biosrc.IsSetOrg()) {
4482  return;
4483  }
4484  auto& org = biosrc.SetOrg();
4485  // add environmental_sample or metagenomic based on lineage or div
4486 
4487  if ( org.IsSetOrgname()) {
4488  const auto& orgname = org.GetOrgname();
4489  bool needs_env_sample = false;
4490  bool needs_metagenomic = false;
4491  if (orgname.IsSetLineage()) {
4492  string lineage = orgname.GetLineage();
4493  if (NStr::FindNoCase(lineage, "environmental sample") != string::npos) {
4494  needs_env_sample = true;
4495  }
4496  if (NStr::FindNoCase(lineage, "metagenomes") != string::npos) {
4497  needs_metagenomic = true;
4498  }
4499  }
4500  if (orgname.IsSetDiv()
4501  && NStr::Equal(biosrc.GetOrg().GetOrgname().GetDiv(), "ENV")) {
4502  needs_env_sample = true;
4503  }
4504 
4505  if (needs_env_sample || needs_metagenomic) {
// scan existing subsources so nothing is added twice
4506  bool has_env_sample = false;
4507  bool has_metagenomic = false;
4508  if ( biosrc.IsSetSubtype()) {
4509  ITERATE(CBioSource::TSubtype, it, biosrc.GetSubtype()) {
4510  if ((*it)->IsSetSubtype()) {
4511  if ((*it)->GetSubtype() == CSubSource::eSubtype_environmental_sample) {
4512  has_env_sample = true;
4513  }
4514  if ((*it)->GetSubtype() == CSubSource::eSubtype_metagenomic) {
4515  has_metagenomic = true;
4516  }
4517  }
4518  }
4519  }
4520  if (needs_env_sample && !has_env_sample) {
4522  biosrc.SetSubtype().push_back(s);
4524  }
4525  if (needs_metagenomic && !has_metagenomic) {
4527  biosrc.SetSubtype().push_back(s);
4529  }
4530  }
4531  }
4532 }
4533 
4534 
// NOTE(review): the line introducing this type and the line declaring the
// call operator (which receives `mod`) are absent from this listing; the type
// is used elsewhere in this file as `SRemovableOldname matcher{ taxname }`.
// Predicate: true for an org modifier of subtype old_name whose subname
// equals m_Taxname and whose attrib is unset or blank.
4536 {
// Taxname to compare against; held by reference, so the referenced string
// must outlive this object.
4537  const string& m_Taxname;
4539  {
4540  return (mod->IsSetSubtype() &&
4541  mod->GetSubtype() == COrgMod::eSubtype_old_name &&
4542  mod->IsSetSubname() &&
4543  NStr::Equal(mod->GetSubname(), m_Taxname) &&
4544  (!mod->IsSetAttrib() || NStr::IsBlank(mod->GetAttrib())));
4545  }
4546 };
4547 
4548 
// NOTE(review): the signature line is absent from this listing; the body
// operates on an Org-ref named `org` and is presumably x_CleanupOldName.
// Removes every org modifier accepted by SRemovableOldname (an old_name
// modifier that merely repeats the taxname), then resets the modifier list
// altogether if that leaves it empty.
4550 {
4551  if (org.IsSetTaxname() && org.IsSetOrgname() && org.GetOrgname().IsSetMod()) {
4552  SRemovableOldname matcher{ org.GetTaxname() };
4553  auto& modset = org.SetOrgname().SetMod();
4554  size_t before = modset.size();
4555  modset.erase(std::remove_if(modset.begin(), modset.end(), matcher), modset.end());
// NOTE(review): the bodies of the two branches below are partly absent from
// this listing (lines 4557 and 4561); the first reacts to a size change.
4556  if (before != modset.size()) {
4558  }
4559  if (modset.empty()) {
4560  org.SetOrgname().ResetMod();
4562  }
4563  }
4564 }
4565 
4566 
4567 
4568 bool s_HasMatchingGBMod(const COrgName& org, const string& val)
4569 {
4570  if (!org.IsSetMod()) {
4571  return false;
4572  }
4573  ITERATE(COrgName::TMod, it, org.GetMod()) {
4574  if ((*it)->IsSetSubtype() &&
4575  ((*it)->GetSubtype() == COrgMod::eSubtype_gb_acronym ||
4576  (*it)->GetSubtype() == COrgMod::eSubtype_gb_anamorph ||
4577  (*it)->GetSubtype() == COrgMod::eSubtype_gb_synonym) &&
4578  (*it)->IsSetSubname() &&
4579  NStr::Equal((*it)->GetSubname(), val)) {
4580  return true;
4581  }
4582  }
4583  return false;
4584 }
4585 
4586 
// NOTE(review): the line introducing this type and the line declaring the
// call operator (which receives `mod`) are absent from this listing; the type
// is used elsewhere in this file as `SRemovableOrgModNote matcher{ org }`.
// Predicate: true for an org modifier of subtype `other` whose subname
// duplicates either a GenBank modifier found by s_HasMatchingGBMod or the
// taxname of `org`.
// `org` is held by reference, so the referenced Org-ref must outlive this
// object. The call operator reads org.GetOrgname() without checking that the
// Orgname is set.
4588  const COrg_ref& org;
4590  return (mod->IsSetSubtype() &&
4591  mod->GetSubtype() == COrgMod::eSubtype_other &&
4592  mod->IsSetSubname() &&
4593  (s_HasMatchingGBMod(org.GetOrgname(), mod->GetSubname()) ||
4594  (org.IsSetTaxname() && NStr::Equal(org.GetTaxname(), mod->GetSubname()))));
4595 
4596  }
4597 };
4598 
// NOTE(review): the signature line is absent from this listing; the body
// operates on an Org-ref named `org` and is presumably x_CleanupOrgModNoteEC.
// Removes every org modifier accepted by SRemovableOrgModNote (a note that
// only repeats a GenBank modifier or the taxname), then resets the modifier
// list altogether if that leaves it empty.
// Returns immediately when there is no Orgname or no modifier list.
4600 {
4601  if (!org.IsSetOrgname() || !org.GetOrgname().IsSetMod()) {
4602  return;
4603  }
4604  auto& modset = org.SetOrgname().SetMod();
4605  SRemovableOrgModNote matcher{ org };
4606  size_t before = modset.size();
4607  modset.erase(std::remove_if(modset.begin(), modset.end(), matcher), modset.end());
// NOTE(review): the bodies of the two branches below are partly absent from
// this listing (lines 4609 and 4613); the first reacts to a size change.
4608  if (before != modset.size()) {
4610  }
4611  if (modset.empty()) {
4612  org.SetOrgname().ResetMod();
4614  }
4615 }
4616 
4617 
4618 #if 0
// Flattens nested Pub-equivs: each element of `pub_equiv` that is itself a
// Pub-equiv is flattened recursively, its members are appended to the outer
// list, and the nested element is erased.
// This definition sits between `#if 0` and `#endif`, so it is not compiled.
4619 void CNewCleanup_imp::x_FlattenPubEquiv(CPub_equiv& pub_equiv)
4620 {
4621  CPub_equiv::Tdata& data = pub_equiv.Set();
4622 
4623  EDIT_EACH_PUB_ON_PUBEQUIV(pub_iter, pub_equiv ) {
4624  if( FIELD_IS(**pub_iter, Equiv) ) {
4625  CPub_equiv& equiv = GET_MUTABLE(**pub_iter, Equiv);
4626  x_FlattenPubEquiv(equiv);
4627  copy(equiv.Set().begin(), equiv.Set().end(), back_inserter(data));
4628  ERASE_PUB_ON_PUBEQUIV( pub_iter, pub_equiv );
// NOTE(review): listing line 4629 is absent here.
4630  }
4631  }
4632 }
4633 #endif
4634 
4635 
// NOTE(review): the signature line is absent from this listing; the body
// operates on a date object named `date`.
// Resets date/time fields that hold impossible values and keeps the time
// fields consistent: a finer field is dropped when the coarser one it
// depends on is invalid or missing.
// NOTE(review): one line is absent from this listing inside every branch
// that resets a field (e.g. 4640, 4647, 4652, 4659).
4637 {
// Month, Day and Second are range-checked through FIELD_OUT_OF_RANGE
// (presumably true only when the field is set and outside the bounds given).
4638  if ( FIELD_OUT_OF_RANGE(date, Month, 1, 12) ) {
4639  RESET_FIELD(date, Month);
4641  }
4642 
4643  // Maybe we should have the max range set on a per-month basis? (e.g. 30 days for April).
4644  // ( This could get complex with leap years and such. )
4645  if ( FIELD_OUT_OF_RANGE(date, Day, 1, 31) ) {
4646  RESET_FIELD(date, Day);
4648  }
4649 
4650  if ( FIELD_OUT_OF_RANGE(date, Second, 0, 59) ) {
4651  RESET_FIELD(date, Second);
4653  }
4654 
// An invalid minute takes the second with it; a second without any minute
// is dropped as well.
4655  if (date.IsSetMinute()) {
4656  if (date.GetMinute() < 0 || date.GetMinute() > 59) {
4657  date.ResetMinute();
4658  date.ResetSecond();
4660  }
4661  } else if (date.IsSetSecond()) {
4662  date.ResetSecond();
4664  }
4665 
// An invalid hour takes minute and second with it; minute or second without
// any hour are dropped as well.
4666  if (date.IsSetHour()) {
4667  if (date.GetHour() < 0 || date.GetHour() > 23) {
4668  date.ResetHour();
4669  date.ResetMinute();
4670  date.ResetSecond();
4672  }
4673  } else if (date.IsSetMinute() || date.IsSetSecond()) {
4674  date.ResetMinute();
4675  date.ResetSecond();
4677  }
4678 
4679 }
4680 
4681 
// NOTE(review): the signature line is absent from this listing; the body
// uses a feature named `feat` and a string named `str`.
// If `str` ends with ')' and contains a pair of double quotes, the text
// between the first two quotes is lower-cased and added to `feat` as a
// "replace" qualifier. Otherwise nothing happens.
4683 {
4684  if (!NStr::EndsWith(str, ')')) {
4685  return;
4686  }
4687 
4688  SIZE_TYPE start = str.find_first_of('\"');
4689  if (start != NPOS) {
4690  SIZE_TYPE end = str.find_first_of('\"', start + 1);
4691  if (end != NPOS) {
// Take only the characters strictly between the two quotes.
4692  string replace_val = str.substr(start + 1, (end - start) - 1);
4693  NStr::ToLower(replace_val);
4694  feat.AddQualifier("replace", replace_val );
// NOTE(review): listing line 4695 is absent here.
4696  }
4697  }
4698 }
4699 
// NOTE(review): the signature line is absent from this listing; the body
// operates on an interval named `seq_interval` and reads the member m_Scope.
// Normalizes one interval: swaps From/To when they are reversed, and, when
// the sequence type can be obtained from the scope, fixes the strand
// (none on protein; plus instead of unset/unknown otherwise).
// NOTE(review): one line is absent from this listing inside each branch that
// modifies the interval (4705, 4714, 4719, 4723).
4701 {
4702  // Fix backwards intervals
4703  if ( seq_interval.CanGetFrom() && seq_interval.CanGetTo() && seq_interval.GetFrom() > seq_interval.GetTo()) {
4704  swap(seq_interval.SetFrom(), seq_interval.SetTo());
4706  }
4707  // change bad strand values.
4708  if (m_Scope && seq_interval.IsSetId()) {
4709  auto seq_type = m_Scope->GetSequenceType(seq_interval.GetId(), CScope::fDoNotRecalculate);
// Strand is left untouched when the scope reports the type as not set.
4710  if (seq_type != CSeq_inst::eMol_not_set) {
4711  if (CSeq_inst::IsAa(seq_type)) {
// Amino-acid sequence: any strand value is removed.
4712  if (seq_interval.IsSetStrand()) {
4713  seq_interval.ResetStrand();
4715  }
4716  } else if (seq_interval.IsSetStrand()) {
// Non-protein with an explicit strand: only "unknown" is rewritten to plus.
4717  if (seq_interval.GetStrand() == eNa_strand_unknown) {
4718  seq_interval.SetStrand(eNa_strand_plus);
4720  }
4721  } else {
// Non-protein with no strand at all: default to plus.
4722  seq_interval.SetStrand(eNa_strand_plus);
4724  }
4725  }
4726  }
4727 }
4728 
// NOTE(review): the signature line is absent from this listing; the body
// operates on a location named `loc` and is presumably the Seq-loc overload
// of x_BothStrandBC.
// Applies the both/both-rev strand cleanup to a location: a single interval
// and each interval of a packed-int are passed to x_BothStrandBC; a point is
// handled inline; every other location type is left alone.
4730 {
4731  switch (loc.Which()) {
4732  case CSeq_loc::e_Int :
4733  x_BothStrandBC( GET_MUTABLE(loc, Int) );
4734  break;
4735  case CSeq_loc::e_Packed_int :
4736  {
4737  CSeq_loc::TPacked_int::Tdata& ints = loc.SetPacked_int().Set();
4738  NON_CONST_ITERATE(CSeq_loc::TPacked_int::Tdata, interval_it, ints) {
4739  x_BothStrandBC(**interval_it);
4740  }
4741  }
4742  break;
4743  case CSeq_loc::e_Pnt :
4744  {
4745  CSeq_loc::TPnt& pnt = loc.SetPnt();
4746 
4747  // change both and both-rev to plus and minus, respectively
4748  if (pnt.CanGetStrand()) {
4749  ENa_strand strand = pnt.GetStrand();
// NOTE(review): the statements inside both branches below (listing lines
// 4751-4752 and 4754-4755) are absent from this listing.
4750  if (strand == eNa_strand_both) {
4753  } else if (strand == eNa_strand_both_rev) {
4756  }
4757  }
4758  }
4759  break;
4760 
4761  default:
4762  break;
4763  }
4764 }
4765 
// NOTE(review): the signature line is absent from this listing; the body
// operates on an interval named `seq_interval` and is presumably the
// Seq-interval overload of x_BothStrandBC.
// Rewrites strand `both` to plus and `both_rev` to minus; any other strand
// value is left unchanged.
// NOTE(review): one line is absent after each SetStrand call (4772, 4775).
4767 {
4768  if (seq_interval.CanGetStrand()) {
4769  ENa_strand strand = seq_interval.GetStrand();
4770  if (strand == eNa_strand_both) {
4771  seq_interval.SetStrand(eNa_strand_plus);
4773  } else if (strand == eNa_strand_both_rev) {
4774  seq_interval.SetStrand(eNa_strand_minus);
4776  }
4777  }
4778 }
4779 
// Splits a Dbtag whose string tag contains ':' into several Dbtags, one per
// colon-separated piece.
//   dbt             - tag to split; on a split it keeps the first piece.
//   out_new_dbtags  - receives one new Dbtag (a copy of `dbt` with its tag
//                     string replaced) for each remaining piece; existing
//                     contents are kept and new entries are appended.
// Nothing is changed when the tag is unset, is not a string, has no ':',
// belongs to one of the exempt databases listed below, or when
// m_IsEmblOrDdbj is set.
4780 void CNewCleanup_imp::x_SplitDbtag( CDbtag &dbt, vector< CRef< CDbtag > > & out_new_dbtags )
4781 {
4782  // check the common case of nothing to split
4783  if (!dbt.IsSetTag()) {
4784  return;
4785  }
4786  auto& tag = dbt.SetTag();
4787  if (!tag.IsStr()) {
4788  return;
4789  }
4790  if( tag.GetStr().find(":") == string::npos ) {
4791  return;
4792  }
4793 
4794  // check if we're trying to split something we shouldn't
4795  if (dbt.IsSetDb()) {
4796  string db = dbt.GetDb();
4797  if (NStr::Equal(db, "MGD") || NStr::Equal(db, "MGI") || NStr::Equal(db, "HGNC") || NStr::Equal(db, "VGNC") || NStr::Equal(db, "AllianceGenome")) {
4798  return;
4799  }
4800  }
4801 
4802  if ( m_IsEmblOrDdbj) {
4803  return;
4804  }
4805 
4806  // split by colon and generate new tags
4807  vector<string> tags;
4808  NStr::Split(tag.GetStr(), ":", tags, NStr::fSplit_Tokenize);
// NOTE(review): this is a debug-only assertion; with fSplit_Tokenize a value
// such as ":" or "a:" may yield fewer than two pieces, and tags.front()
// below would then be invalid on an empty vector — confirm.
4809  _ASSERT( tags.size() >= 2 );
4810 
4811  // treat the CDbtag argument as the first of the new CDbtags
4812  tag.SetStr( tags.front() );
4813  vector<string>::const_iterator str_iter = tags.begin() + 1;
4814  for( ; str_iter != tags.end(); ++str_iter ) {
4815  CRef<CDbtag> new_tag( new CDbtag );
4816  new_tag->Assign( dbt );
4817  new_tag->SetTag().SetStr( *str_iter );
4818  out_new_dbtags.push_back( new_tag );
4819  }
4820 
// NOTE(review): listing line 4821 is absent here.
4822 }
4823 
// Ordering predicate for codon values: true when codon1 is strictly less
// than codon2.
inline
static
bool s_CodonCompare( const int &codon1, const int &codon2 ) {
    return (codon1 < codon2);
}
4829 
// Equality predicate for codon values: true when both codons are the same.
inline
static
bool s_CodonEqual( int codon1, int codon2 ) {
    return (codon1 == codon2);
}
4835 
4836 static
4837 char s_ConvertTrnaAaToLetter( const CTrna_ext::C_Aa &trna_aa, CSeqUtil::ECoding coding, char *out_aa_char = nullptr )
4838 {
4839  char temp_aa = '\0';
4840 
4841  size_t num_converted = 0;
4842  char new_aa = '\0';
4843  switch( trna_aa.Which() ) {
4845  temp_aa = trna_aa.GetIupacaa();
4846  num_converted = CSeqConvert::Convert( &temp_aa, CSeqUtil::e_Iupacaa, 0, 1, &new_aa, coding );
4847  break;
4849  temp_aa = trna_aa.GetNcbieaa();
4850  num_converted = CSeqConvert::Convert( &temp_aa, CSeqUtil::e_Ncbieaa, 0, 1, &new_aa, coding );
4851  break;
4853  temp_aa = trna_aa.GetNcbi8aa();
4854  num_converted = CSeqConvert::Convert( &temp_aa, CSeqUtil::e_Ncbi8aa, 0, 1, &new_aa, coding );
4855  break;
4857  temp_aa = trna_aa.GetNcbistdaa();
4858  num_converted = CSeqConvert::Convert( &temp_aa, CSeqUtil::e_Ncbistdaa, 0, 1, &new_aa, coding );
4859  break;
4860  default:
4861  break;
4862  }
4863  if( out_aa_char ) {
4864  *out_aa_char = temp_aa;
4865  }
4866  if( num_converted > 0 ) {
4867  return new_aa;
4868  } else {
4869  return '\0';
4870  }
4871 }
4872 
// NOTE(review): the signature line and several interior lines (4878,
// 4881-4883, 4887-4888, 4891) are absent from this listing, so only part of
// this function is visible. The body operates on a tRNA object named `tRNA`.
// Visible behavior: an amino acid stored as iupacaa is re-stored as ncbieaa
// with the same numeric value, and a uniqueness check on the codons is made
// using s_CodonEqual.
4874 {
4875  if( tRNA.IsSetAa() && tRNA.GetAa().IsIupacaa() ) {
4876  const int old_value = tRNA.GetAa().GetIupacaa();
4877  tRNA.SetAa().SetNcbieaa( old_value );
4879  }
4880 
// NOTE(review): the statement that opens the block closed on the next line
// is absent from this listing.
4884  }
4885 
4886  if( ! CODON_ON_TRNAEXT_IS_UNIQUE(tRNA, s_CodonEqual) ) {
4889  }
4890 
4892 }
4893 
4894 static
4895 void s_ParsePCRComponent(vector<string> &out_list, const string *component)
4896 {
4897  out_list.clear();
4898 
4899  if( !component ) return;
4900  if ( component->empty() ) return;
4901 
4902  string component_copy = *component; //copy so we can modify it
4903  // Remove enclosing parens, if any
4904  const string::size_type len = component_copy.length();
4905  if ( len > 1 && component_copy[0] == '(' && component_copy[len - 1] == ')' && component_copy.find('(', 1) == string::npos ) {
4906  component_copy = component_copy.substr( 1, component_copy.length() - 2 );
4907  }
4908 
4909  NStr::Split(component_copy, string(","), out_list, NStr::fSplit_Tokenize);
4910  EDIT_EACH_STRING_IN_VECTOR( str_iter, out_list ) {
4911  NStr::TruncateSpacesInPlace( *str_iter );
4912  }
4913 }
4914 
// NOTE(review): the class header line, the constructor name line, the end of
// the constructor's initializer list, and the declarations involving
// m_Original_order (listing lines 4915, 4917, 4926, 4951, 4953) are absent
// from this listing. The type is referred to as CPCRParsedSet in operator<.
// Value type holding one parsed PCR primer set: forward/reverse sequence and
// forward/reverse name.
4916 public:
// Each argument may be null, in which case the corresponding member is
// initialized from kEmptyStr.
4918  const string* fwd_seq,
4919  const string* rev_seq,
4920  const string* fwd_name,
4921  const string* rev_name) :
4922  m_Fwd_seq( fwd_seq ? *fwd_seq : kEmptyStr),
4923  m_Rev_seq( rev_seq ? *rev_seq : kEmptyStr),
4924  m_Fwd_name(fwd_name ? *fwd_name : kEmptyStr),
4925  m_Rev_name(rev_name ? *rev_name : kEmptyStr),
4927 
4928  const string &GetFwdSeq() const { return m_Fwd_seq; }
4929  const string &GetRevSeq() const { return m_Rev_seq; }
4930  const string &GetFwdName() const { return m_Fwd_name; }
4931  const string &GetRevName() const { return m_Rev_name; }
4932 
// Orders sets by forward seq, reverse seq, forward name, reverse name
// (each compared case-insensitively, first difference wins); sets equal in
// all four fall back to m_Original_order.
4933  bool operator <( const CPCRParsedSet &rhs ) const {
4934  if ( int diff = NStr::CompareNocase( m_Fwd_seq, rhs.m_Fwd_seq ) )
4935  return diff < 0;
4936  if ( int diff = NStr::CompareNocase( m_Rev_seq, rhs.m_Rev_seq ) )
4937  return diff < 0;
4938  if ( int diff = NStr::CompareNocase( m_Fwd_name, rhs.m_Fwd_name ) )
4939  return diff < 0;
4940  if ( int diff = NStr::CompareNocase( m_Rev_name, rhs.m_Rev_name ) )
4941  return diff < 0;
4942  // last resort
4943  return m_Original_order < rhs.m_Original_order;
4944  }
4945 
4946 private:
4947  string m_Fwd_seq;
4948  string m_Rev_seq;
4949  string m_Fwd_name;
4950  string m_Rev_name;
4952 
4954 };
4955 
4957 
4958 static
4959 void s_ParsePCRSet( const CBioSource &biosrc, list<CPCRParsedSet> &out_pcr_set )
4960 {
4961  out_pcr_set.clear();
4962 
4963  const string* fwd_primer_seq = nullptr;
4964  const string* rev_primer_seq = nullptr;
4965  const string* fwd_primer_name = nullptr;
4966  const string* rev_primer_name = nullptr;
4967 
4968 // convenience macro
4969 #define PARSEPCRSET_CASE(Subtype) \
4970  case NCBI_SUBSOURCE(Subtype): \
4971  if( (*subsrc_iter)->IsSetName() ) { \
4972  Subtype = &((*subsrc_iter)->GetName()); \
4973  } \
4974  break;
4975 
4976 
4977  FOR_EACH_SUBSOURCE_ON_BIOSOURCE( subsrc_iter, biosrc ) {
4978  SWITCH_ON_SUBSOURCE_CHOICE( **subsrc_iter ) {
4979  PARSEPCRSET_CASE(fwd_primer_seq)
4980  PARSEPCRSET_CASE(rev_primer_seq)
4981  PARSEPCRSET_CASE(fwd_primer_name)
4982  PARSEPCRSET_CASE(rev_primer_name)
4983  default:
4984  // ignore
4985  break;
4986  }
4987  }
4988 #undef PARSEPCRSET_CASE
4989 
4990  // ParsePCRStrings
4991  vector<string> fwd_seq_list;
4992  s_ParsePCRComponent(fwd_seq_list, fwd_primer_seq);
4993  vector<string> rev_seq_list;
4994  s_ParsePCRComponent(rev_seq_list, rev_primer_seq);
4995  vector<string> fwd_name_list;
4996  s_ParsePCRComponent(fwd_name_list, fwd_primer_name);
4997  vector<string> rev_name_list;
4998  s_ParsePCRComponent(rev_name_list, rev_primer_name);
4999 
5000  vector<string>::iterator curr_fwd_seq = fwd_seq_list.begin();
5001  vector<string>::iterator curr_rev_seq = rev_seq_list.begin();
5002  vector<string>::iterator curr_fwd_name = fwd_name_list.begin();
5003  vector<string>::iterator curr_rev_name = rev_name_list.begin();
5004 
5005  while (curr_fwd_seq != fwd_seq_list.end() ||
5006  curr_rev_seq != rev_seq_list.end() ||
5007  curr_fwd_name != fwd_name_list.end() ||
5008  curr_rev_name != rev_name_list.end() )
5009  {
5010  const string* fwd_seq = ( curr_fwd_seq != fwd_seq_list.end() ? &*curr_fwd_seq++ : nullptr );
5011  const string* rev_seq = ( curr_rev_seq != rev_seq_list.end() ? &*curr_rev_seq++ : nullptr );
5012  const string* fwd_name = ( curr_fwd_name != fwd_name_list.end() ? &*curr_fwd_name++ : nullptr );
5013  const string* rev_name = ( curr_rev_name != rev_name_list.end() ? &*curr_rev_name++ : nullptr );
5014 
5015  out_pcr_set.push_back( CPCRParsedSet(fwd_seq, rev_seq, fwd_name, rev_name) );
5016  }
5017 }
5018 
5019 // split by colon and trim spaces off the pieces
5020 static
5021 void s_ParsePCRColonString( vector<string> &out_list, const string &str )
5022 {
5023  NStr::Split(str, ":", out_list, NStr::fSplit_Tokenize);
5024  EDIT_EACH_STRING_IN_VECTOR(str_iter, out_list ) {
5025  NStr::TruncateSpacesInPlace( *str_iter );
5026  if( str_iter->empty() ) {
5027  ERASE_STRING_IN_VECTOR(str_iter, out_list);
5028  }
5029  }
5030 }
5031 
5032 static
5033 CRef<CPCRPrimerSet> s_ModernizePCRPrimerHalf (const string &seq, const string &name)
5034 {
5035  // Construct the value we will return
5036  // ( and extract its primer set for easy access )
5038  list< CRef< CPCRPrimer > > &primer_list = return_value->Set();
5039 
5040  vector<string> seq_list;
5041  s_ParsePCRColonString (seq_list, seq);
5042  vector<string> name_list;
5043  s_ParsePCRColonString (name_list, name);
5044 
5045  vector<string>::const_iterator name_iter = name_list.begin();
5046 
5047  CRef<CPCRPrimer> last_primer;
5048 
5049  // create a PCRPrimer for each seq (and attach its name, if possible)
5050  FOR_EACH_STRING_IN_VECTOR( seq_iter, seq_list ) {
5051 
5052  const string* curr_name = nullptr;
5053  if ( name_iter != name_list.end() ) {
5054  curr_name = &*name_iter;
5055  ++name_iter;
5056  }
5057 
5058  CRef<CPCRPrimer> curr_primer( new CPCRPrimer );
5059  curr_primer->SetSeq().Set( *seq_iter );
5060  if( curr_name ) {
5061  curr_primer->SetName().Set( *curr_name );
5062  }
5063  primer_list.push_back( curr_primer );
5064  last_primer = curr_primer;
5065  }
5066 
5067  if( last_primer ) {
5068  // attach any leftover names to the end of the name of the last seq
5069  for ( ; name_iter != name_list.end() ; ++name_iter ) {
5070  last_primer->SetName().Set() += ":" + *name_iter;
5071  }
5072  } else {
5073  // This differs from C. C breaks as soon as it's looked at the
5074  // first name, but this version will create CPCRPrimer for all names.
5075  for ( ; name_iter != name_list.end() ; ++name_iter ) {
5076  CRef<CPCRPrimer> curr_primer( new CPCRPrimer );
5077  curr_primer->SetName().Set( *name_iter );
5078  primer_list.push_back( curr_primer );
5079  }
5080  }
5081 
5082  // If the CPCRPrimerSet contains nothing inside, return a null ref
5083  if( primer_list.empty() ) {
5084  return CRef<CPCRPrimerSet>();
5085  } else {
5086  return return_value;
5087  }
5088 }
5089 
5090 cla