NCBI C++ ToolKit
restriction.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: restriction.cpp 92272 2020-12-29 20:09:48Z grichenk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Josh Cherry
27  *
28  * File Description: Classes for representing and finding restriction sites
29  *
30  */
31 
32 
33 #include <ncbi_pch.hpp>
34 #include <algorithm>
35 #include <corelib/ncbistd.hpp>
36 
43 
44 #include <objmgr/seq_vector.hpp>
45 #include <objmgr/util/sequence.hpp>
46 
48 
49 
52 
53 NCBI_PARAM_DECL(string, RESTRICTION_SITES, REBASE);
54 NCBI_PARAM_DEF (string, RESTRICTION_SITES, REBASE, "");
55 typedef NCBI_PARAM_TYPE(RESTRICTION_SITES, REBASE) TRebaseData;
56 
57 
58 bool CRSpec::operator<(const CRSpec& rhs) const
59 {
60  if (GetSeq() != rhs.GetSeq()) {
61  return GetSeq() < rhs.GetSeq();
62  }
63  // otherwise sequences identical
64  if (GetPlusCuts() != rhs.GetPlusCuts()) {
65  return GetPlusCuts() < rhs.GetPlusCuts();
66  }
67  if (GetMinusCuts() != rhs.GetMinusCuts()) {
68  return GetMinusCuts() < rhs.GetMinusCuts();
69  }
70  // otherwise arguments are equal, and < is false
71  return false;
72 }
73 
74 
75 void CREnzyme::Reset(void)
76 {
77  m_Name.erase();
78  m_Specs.clear();
79 }
80 
81 
82 // helper functor for sorting enzymes by specificity
84 {
85  bool operator()(const CREnzyme& lhs, const CREnzyme& rhs)
86  {
87  return lhs.GetSpecs() < rhs.GetSpecs();
88  }
89 };
90 
91 
93 {
94  stable_sort(enzymes.begin(), enzymes.end(), SCompareSpecs());
96  ITERATE (TEnzymes, enzyme, enzymes) {
97  if (enzyme != enzymes.begin() &&
98  enzyme->GetSpecs() == result.back().GetSpecs()) {
99  result.back().SetName() += "/";
100  result.back().SetName() += enzyme->GetName();
101  } else {
102  result.push_back(*enzyme);
103  result.back().SetPrototype();
104  }
105  }
106  swap(enzymes, result);
107 }
108 
109 
110 void CRSpec::Reset(void)
111 {
112  m_Seq.erase();
113  m_PlusCuts.clear();
114  m_MinusCuts.clear();
115 }
116 
117 
118 ostream& operator<<(ostream& os, const CRSite& site)
119 {
120  os << "Recog. site: " << site.GetStart() << '-'
121  << site.GetEnd() << endl;
122  os << "Plus strand cuts: ";
123  string s;
124  ITERATE (vector<int>, cut, site.GetPlusCuts()) {
125  if (!s.empty()) {
126  s += " ,";
127  }
128  s += NStr::IntToString(*cut);
129  }
130  os << s << endl;
131 
132  os << "Minus strand cuts: ";
133  s.erase();
134  ITERATE (vector<int>, cut, site.GetMinusCuts()) {
135  if (!s.empty()) {
136  s += " ,";
137  }
138  s += NStr::IntToString(*cut);
139  }
140  os << s << endl;
141 
142  return os;
143 }
144 
145 
147 {
148  // site contains a string such as
149  // GACGTC, GACGT^C, GCAGC(8/12), or (8/13)GAGNNNNNCTC(13/8)
150 
151  CRSpec spec;
152 
153  int plus_cut, minus_cut;
154 
155  string s = site;
156  if (s[0] == '(') {
157  string::size_type idx = s.find_first_of(")");
158  if (idx == std::string::npos) {
159  throw runtime_error(string("Error parsing site ")
160  + site);
161  }
162  x_ParseCutPair(s.substr(0, idx + 1), plus_cut, minus_cut);
163  s.erase(0, idx + 1);
164  spec.SetPlusCuts().push_back(-plus_cut);
165  spec.SetMinusCuts().push_back(-minus_cut);
166  }
167  if (s[s.length() - 1] == ')') {
168  string::size_type idx = s.find_last_of("(");
169  if (idx == std::string::npos) {
170  throw runtime_error(string("Error parsing site ")
171  + site);
172  }
173  x_ParseCutPair(s.substr(idx), plus_cut, minus_cut);
174  s.erase(idx);
175  spec.SetPlusCuts().push_back(plus_cut + s.length());
176  spec.SetMinusCuts().push_back(minus_cut + s.length());
177  }
178  for (unsigned int i = 0; i < s.length(); i++) {
179  if (s[i] == '^') {
180 
181  // This should be simple. If we have
182  // G^GATCC, the cut on the plus strand is
183  // at 1, and on the minus it's at the
184  // symmetric position, 5.
185  // The complication is that TspRI
186  // is given as CASTGNN^, not NNCASTGNN^.
187  // Someone someday might write NNCASTGNN^,
188  // and something like ^NNGATC might arise,
189  // so code defensively.
190 
191  s.erase(i, 1);
192  int plus_cut = i;
193 
194  // this is the slightly complicated bit
195  // trim any leading and trailing N's;
196  // in case leading N's removed, adjust plus_cut accordingly
197  string::size_type idx = s.find_first_not_of("N");
198  if (idx == string::npos) {
199  // bizarre situation; all N's (possibly empty)
200  s.erase();
201  plus_cut = 0;
202  } else {
203  s.erase(0, idx);
204  plus_cut -= idx;
205  }
206  idx = s.find_last_not_of("N");
207  s.erase(idx + 1);
208 
209  // plus strand cut
210  spec.SetPlusCuts().push_back(plus_cut);
211  // symmetric cut on minus strand
212  spec.SetMinusCuts().push_back(s.length() - plus_cut);
213  break; // there better be just one '^'
214  }
215  }
216  spec.SetSeq(s);
217  return spec;
218 }
219 
220 
221 CREnzyme CRebase::MakeREnzyme(const string& name, const string& sites)
222 {
223 
224  CREnzyme re;
225 
226  re.SetName(name);
227 
228  vector<string> site_vec;
229  NStr::Split(sites, ",", site_vec);
230  ITERATE (vector<string>, iter, site_vec) {
231  CRSpec spec = CRebase::MakeRSpec(*iter);
232  re.SetSpecs().push_back(spec);
233  }
234 
235  return re;
236 }
237 
238 
240 {
241  return TRebaseData::GetDefault();
242 }
243 
244 
245 void CRebase::x_ParseCutPair(const string& s, int& plus_cut, int& minus_cut)
246 {
247  // s should look like "(8/13)"; plus_cut gets set to 8 and minus_cut to 13
248  list<string> l;
249  NStr::Split(s.substr(1, s.length() - 2), "/", l, NStr::fSplit_Tokenize);
250  if (l.size() != 2) {
251  throw runtime_error(string("Couldn't parse cut locations ")
252  + s);
253  }
254  plus_cut = NStr::StringToInt(l.front());
255  minus_cut = NStr::StringToInt(l.back());
256 }
257 
258 
260  TEnzymes& enzymes,
261  enum EEnzymesToLoad which)
262 {
263  string line;
264  CREnzyme enzyme;
265  string name;
266  TEnzymes::size_type prototype_idx(0);
267 
268  while (getline(input, line)) {
269  vector<string> fields;
270  NStr::Split(line, "\t", fields);
271  // the lines we're interested in have more than one field
272  if (fields.size() < 2) {
273  continue;
274  }
275  // name of enzyme is in field one or two (field zero is empty)
276  name = fields[1];
277  bool is_prototype(true);
278  if (name == " ") {
279  if (which == ePrototype) {
280  continue; // skip this non-protype
281  }
282  is_prototype = false;
283  name = fields[2];
284  }
285  if (which == eCommercial && fields[5].empty()) {
286  continue; // skip this commercially unavailable enzyme
287  }
288  string sites = fields[3]; // this contains a comma-separted list of sites,
289  // usually just one (but TaqII has two)
290  enzyme = MakeREnzyme(name, sites);
291  enzymes.push_back(enzyme);
292 
293  // Link isoschizomers with their prototypes.
294  if (is_prototype) {
295  prototype_idx = enzymes.size();
296  } else if (prototype_idx) {
297  CREnzyme& prototype = enzymes[prototype_idx - 1];
298  enzyme.SetPrototype(prototype.GetName());
299  prototype.SetIsoschizomers().push_back(name);
300  }
301  }
302 }
303 
304 
305 ostream& operator<<(ostream& os, const CREnzResult& er)
306 {
307  os << "Enzyme: " << er.GetEnzymeName() << endl;
308  os << er.GetDefiniteSites().size() << " definite sites:" << endl;
309  ITERATE (vector<CRSite>, site, er.GetDefiniteSites()) {
310  os << *site;
311  }
312  os << er.GetPossibleSites().size() << " possible sites:" << endl;
313  ITERATE (vector<CRSite>, site, er.GetPossibleSites()) {
314  os << *site;
315  }
316  return os;
317 }
318 
319 
321 {
322  bool operator() (const CRSite& lhs, const CRSite& rhs) const
323  {
324  return lhs.GetStart() < rhs.GetStart();
325  }
326 };
327 
328 
329 // Class for internal use by restriction site finding.
330 // Holds a pattern in ncbi8na, integers that specify
331 // which enzyme (of a sequential collection)
332 // and which specifity of that
333 // enzyme it represents, a field indicating
334 // whether it represents the complement of the specificity,
335 // and the length of the initial subpattern that was
336 // put into the fsm.
338 {
339 public:
340  CPatternRec(string pattern, int enzyme_index, int spec_index,
341  ENa_strand strand,
342  unsigned int fsm_pat_size) : m_Pattern(pattern),
343  m_EnzymeIndex(enzyme_index),
344  m_SpecIndex(spec_index),
345  m_Strand(strand),
346  m_FsmPatSize(fsm_pat_size) {}
347  // member access
348  const string& GetPattern(void) const {return m_Pattern;}
349  int GetEnzymeIndex(void) const {return m_EnzymeIndex;}
350  int GetSpecIndex(void) const {return m_SpecIndex;}
351  ENa_strand GetStrand(void) const {return m_Strand;}
352  unsigned int GetFsmPatSize(void) const {return m_FsmPatSize;}
353 private:
354  // pattern in ncbi8na
355  string m_Pattern;
356  // which enzyme and specificity we represent
359  // whether we represent the complement of the specificity
361  unsigned int m_FsmPatSize;
362 };
363 
364 
365 // helper functor for sorting CRef<CREnzResult> by the enzyme name
367 {
368  bool operator()
369  (const CRef<CREnzResult>& lhs, const CRef<CREnzResult>& rhs) const
370  {
371  return lhs->GetEnzymeName() < rhs->GetEnzymeName();
372  }
373 };
374 
375 
376 // helper functor for sorting CREnzyme by the enzyme name
378 {
379  bool operator()
380  (const CREnzyme& lhs, const CREnzyme& rhs) const
381  {
382  return lhs.GetName() < rhs.GetName();
383  }
384 };
385 
386 
387 // helper functor for sorting by the number of definite sites
389 {
390  bool operator()
391  (const CRef<CREnzResult>& lhs, const CRef<CREnzResult>& rhs) const
392  {
393  return lhs->GetDefiniteSites().size() < rhs->GetDefiniteSites().size();
394  }
395 };
396 
397 
398 // helper functor for sorting CRef<CSeq_loc>s by location
400 {
401  bool operator()
402  (const CRef<CSeq_loc>& lhs, const CRef<CSeq_loc>& rhs) const
403  {
404  return (lhs->Compare(*rhs, CSeq_loc::fCompare_Default) < 0);
405  }
406 };
407 
408 
409 
410 
411 void CFindRSites::x_ExpandRecursion(string& s, unsigned int pos,
412  CTextFsm<int>& fsm, int match_value)
413 {
414  if (pos == s.size()) {
415  // this is the place
416  fsm.AddWord(s, match_value);
417  return;
418  }
419 
420  char orig_ch = s[pos];
421  for (char x = 1; x <= 8; x <<= 1) {
422  if (orig_ch & x) {
423  s[pos] = x;
424  x_ExpandRecursion(s, pos + 1, fsm, match_value);
425  }
426  }
427  // restore original character
428  s[pos] = orig_ch;
429 }
430 
431 
432 CFindRSites::CFindRSites(const string& refile,
433  CRebase::EEnzymesToLoad which_enzymes,
434  TFlags flags)
435  : m_Flags(flags)
436 {
437  x_LoadREnzymeData(refile.empty()
439  : refile,
440  which_enzymes);
441 }
442 
443 
445 {
446  return m_Enzymes;
447 }
448 
449 
451 {
452  return m_Enzymes;
453 }
454 
455 
456 void CFindRSites::x_LoadREnzymeData(const string& refile,
457  CRebase::EEnzymesToLoad which_enzymes)
458 {
459  if (! refile.empty()) {
460  ifstream istr(refile.c_str());
461  CRebase::ReadNARFormat(istr, m_Enzymes, which_enzymes);
462 
464  // first sort alphabetically by enzyme name
465  sort(m_Enzymes.begin(), m_Enzymes.end(), SNameCompare());
466  // now combine isoschizomers
468  }
469  }
470 }
471 
472 
473 // remap a child location to a parent
475  const CSeq_loc& child,
476  CScope* scope)
477 {
478  CSeq_loc dummy_parent;
479  dummy_parent.SetWhole(const_cast<CSeq_id&>(sequence::GetId(parent, 0)));
480  SRelLoc converter(dummy_parent, child, scope, SRelLoc::fNoMerge);
481  converter.m_ParentLoc = &parent;
482  return converter.Resolve(scope);
483 }
484 
485 static
486 void s_AddSitesToAnnot(const vector<CRSite>& sites,
487  const CREnzResult& result,
488  CSeq_annot& annot,
489  CScope& scope,
490  const CSeq_loc& parent_loc,
491  bool definite = true)
492 {
493  const CSeq_id& id = sequence::GetId(parent_loc, 0);
494 
495  ITERATE (vector<CRSite>, site, sites) {
496  // create feature
497  CRef<CSeq_feat> feat(new CSeq_feat());
498 
499  // start to set up Rsite
500  feat->SetData().SetRsite().SetDb().SetDb("REBASE");
501  feat->SetData().SetRsite().SetDb()
502  .SetTag().SetStr("REBASE");
503 
504  string str(result.GetEnzymeName());
505  if ( !definite ) {
506  str = "Possible " + str;
507  }
508  feat->SetData().SetRsite().SetStr(str);
509 
510  //
511  // build our location
512  //
513 
514  vector< CRef<CSeq_loc> > locs;
515 
516  // a loc for the recognition site
517  CRef<CSeq_loc> recog_site(new CSeq_loc);
518  recog_site->SetInt().SetFrom(site->GetStart());
519  recog_site->SetInt().SetTo (site->GetEnd());
520  recog_site->SetInt().SetStrand(site->GetStrand());
521  recog_site->SetId(id);
522  locs.push_back(recog_site);
523 
524  ENa_strand cut_strand =
525  IsReverse(site->GetStrand()) ? eNa_strand_minus
526  : eNa_strand_plus;
527 
528  // locs for the cleavage sites
529  int negative_cut_locs = 0; // count these exceptions
530  ITERATE (vector<int>, cut, site->GetPlusCuts()) {
531  if (*cut >= 0 ) {
532  CRef<CSeq_loc> cut_site(new CSeq_loc);
533  cut_site->SetPnt().SetPoint(*cut);
534  // indicate that the cut is to the "left"
535  cut_site->SetPnt()
536  .SetFuzz().SetLim(CInt_fuzz::eLim_tl);
537  cut_site->SetPnt().SetStrand(cut_strand);
538  cut_site->SetId(id);
539  locs.push_back(cut_site);
540  } else {
541  negative_cut_locs++;
542  }
543  }
544 
545  ITERATE (vector<int>, cut, site->GetMinusCuts()) {
546  if (*cut >= 0 ) {
547  CRef<CSeq_loc> cut_site(new CSeq_loc);
548  cut_site->SetPnt().SetPoint(*cut);
549  // indicate that the cut is to the "left"
550  cut_site->SetPnt()
551  .SetFuzz().SetLim(CInt_fuzz::eLim_tl);
552  cut_site->SetPnt().SetStrand(Reverse(cut_strand));
553  cut_site->SetId(id);
554  locs.push_back(cut_site);
555  } else {
556  negative_cut_locs++;
557  }
558  }
559 
560  // comment for those few cases where there are
561  // cuts before the sequence begins
562  if (negative_cut_locs > 0) {
563  string a_comm = NStr::IntToString(negative_cut_locs)
564  + " cleavage sites are located before the"
565  " beginning of the sequence and are not reported";
566  feat->SetComment(a_comm);
567  }
568 
569  sort(locs.begin(), locs.end(), SLessSeq_loc());
570  copy(locs.begin(), locs.end(),
571  back_inserter(feat->SetLocation().SetMix().Set()));
572 
573  CRef<CSeq_loc> mapped =
574  s_RemapChildToParent(parent_loc, feat->GetLocation(), &scope);
575  feat->SetLocation(*mapped);
576 
577  // save in annot
578  annot.SetData().SetFtable().push_back(feat);
579  }
580 }
581 
582 
584 CFindRSites::GetAnnot(CScope& scope, const CSeq_loc& loc) const
585 {
586  CTime now(CTime::eCurrent);
587  // get sequence in binary (8na) form
588  CSeqVector vec(loc, scope, CBioseq_Handle::eCoding_Ncbi);
589 
590  // a place to store results (one per enzyme)
591  typedef vector<CRef<CREnzResult> > TResults;
592  TResults results;
593 
594  // do the big search
595  Find(vec, m_Enzymes, results, m_Flags);
596 
597  // deal with stored enzyme results
598 
599  sort(results.begin(), results.end(), SEnzymeNameCompare());
600 
602  // do a stablesort so alphabetical order preserved
603  // within sets with the same number of sites
604  stable_sort(results.begin(), results.end(), SLessDefSites());
605  }
606 
607  TAnnot annot;
608  int total_definite_sites = 0, total_possible_sites = 0;
609  int total_non_cutters = 0;
610 
611  ITERATE (TResults, result, results) {
612  const vector<CRSite>& definite_sites =
613  (*result)->GetDefiniteSites();
614  const vector<CRSite>& possible_sites =
615  (*result)->GetPossibleSites();
616 
617  int count_definite_sites = definite_sites.size();
618  int count_possible_sites = possible_sites.size();
619 
620  if (count_definite_sites || count_possible_sites) {
621  total_definite_sites += count_definite_sites;
622  total_possible_sites += count_possible_sites;
623 
624  if (m_Flags & fSplitAnnotsByEnzyme || annot.empty()) {
625  CRef<CSeq_annot> new_annot(new CSeq_annot);
626  // add description to annot
627  const string title("Restriction sites");
628  new_annot->SetNameDesc(title);
629  new_annot->SetTitleDesc(title);
630  new_annot->SetCreateDate(now);
631  CRef<CAnnotdesc> region(new CAnnotdesc);
632  region->SetRegion().Assign(loc);
633  new_annot->SetDesc().Set().push_back(region);
634  annot.push_back(new_annot);
635  }
636  CSeq_annot& curr_annot(*annot.back());
637 
638  //
639  // add features to annot
640  //
641  s_AddSitesToAnnot(definite_sites,
642  **result, curr_annot, scope, loc);
643  s_AddSitesToAnnot(possible_sites,
644  **result, curr_annot, scope, loc, false);
645  } else {
646  ++total_non_cutters;
647  }
648  }
649 
650  /**
651  // in order to build dialog efficiently,
652  // pre-allocate rows
653  int total_rows = total_definite_sites + total_possible_sites
654  + total_non_cutters;
655  **/
656 
657  _TRACE("Found " << total_definite_sites << " definite and "
658  << total_possible_sites << " possible sites");
659 
660  return annot;
661 }
662 
663 
666 {
668  CRef<CSeq_loc> loc(new CSeq_loc);
669  loc->SetWhole().Assign(*idh.GetSeqId());
670  return GetAnnot(bsh.GetScope(), *loc);
671 }
672 
673 
674 void CFindRSites::x_AddPattern(const string& pat, CTextFsm<int>& fsm,
675  int match_value)
676 {
677  string s = pat;
678  x_ExpandRecursion(s, 0, fsm, match_value);
679 }
680 
681 
682 bool CFindRSites::x_IsAmbig (char nuc)
683 {
684  static const bool ambig_table[16] = {
685  0, 0, 0, 1, 0, 1, 1, 1,
686  0, 1, 1, 1, 1, 1, 1, 1
687  };
688  return ambig_table[(size_t) nuc];
689 }
690 
691 
692 /// Find all definite and possible sites in a sequence
693 /// for a vector of enzymes, using a finite state machine.
694 /// Templatized so that seq can be various containers
695 /// (e.g., string, vector<char>, CSeqVector),
696 /// but it must yield ncbi8na.
697 template<class Seq>
698 void x_FindRSite(const Seq& seq, const CFindRSites::TEnzymes& enzymes,
699  vector<CRef<CREnzResult> >& results,
701 {
702 
703  results.clear();
704  results.reserve(enzymes.size());
705 
706  // vector of patterns for internal use
707  vector<CPatternRec> patterns;
708  patterns.reserve(enzymes.size()); // an underestimate
709 
710  // the finite state machine for the search
711  CTextFsm<int> fsm;
712 
713  // iterate over enzymes
714  ITERATE (CFindRSites::TEnzymes, enzyme, enzymes) {
715  // Unless isoschizomers are requested, skip them.
717  enzyme->IsPrototype())) {
718  continue;
719  }
720 
721  results.push_back(CRef<CREnzResult>
722  (new CREnzResult(enzyme->GetName())));
723 
724  string pat;
725  const vector<CRSpec>& specs = enzyme->GetSpecs();
726 
727  // iterate over specificities for this enzyme
728  ITERATE (vector<CRSpec>, spec, specs) {
729  // one or two patterns for this specificity
730  // (one for pallindrome, two otherwise)
731 
732  // to avoid combinatorial explosion,
733  // if there are more than two Ns only use
734  // part of pattern before first N
735  CSeqMatch::IupacToNcbi8na(spec->GetSeq(), pat);
736  // Check if the pattern is pallindromic.
737  string comp = pat;
739  ENa_strand strand =
740  (comp == pat) ? eNa_strand_both : eNa_strand_plus;
741 
742  SIZE_TYPE fsm_pat_size = pat.find_first_of(0x0f);
743  {{
744  SIZE_TYPE pos = pat.find_first_of(0x0f, fsm_pat_size + 1);
745  if (pos == NPOS
746  || pat.find_first_of(0x0f, pos + 1) == NPOS) {
747  fsm_pat_size = pat.size();
748  }
749  }}
750 
751  patterns.push_back(CPatternRec(pat, enzyme - enzymes.begin(),
752  spec - specs.begin(),
753  strand,
754  fsm_pat_size));
755  // add pattern to fsm
756  // (add only fsm_pat_size of it)
757  CFindRSites::x_AddPattern(pat.substr(0, fsm_pat_size), fsm,
758  patterns.size() - 1);
759 
760  // If not palidromic do a search for its complement too
761  if (strand != eNa_strand_both) {
762  {{
763  fsm_pat_size = comp.find_first_of(0x0f);
764  SIZE_TYPE pos = comp.find_first_of(0x0f, fsm_pat_size + 1);
765  if (pos == NPOS
766  || comp.find_first_of(0x0f, pos + 1) == NPOS) {
767  fsm_pat_size = comp.size();
768  }
769  }}
770  patterns.push_back(CPatternRec(comp, enzyme
771  - enzymes.begin(),
772  spec - specs.begin(),
773  Reverse(strand),
774  fsm_pat_size));
775  // add pattern to fsm
776  // (add only fsm_pat_size of it)
777  CFindRSites::x_AddPattern(comp.substr(0, fsm_pat_size), fsm,
778  patterns.size() - 1);
779  }
780  }
781  }
782 
783  // The fsm is set up.
784  // Now do the search.
785 
786  fsm.Prime();
787  vector<int> ambig_nucs; // for dealing with ambiguities later
788 
789  int state = fsm.GetInitialState();
790  for (unsigned int i = 0; i < seq.size(); i++) {
791  if (CFindRSites::x_IsAmbig(seq[i])) {
792  ambig_nucs.push_back(i);
793  }
794  state = fsm.GetNextState(state, seq[i]);
795  if (fsm.IsMatchFound(state)) {
796  const vector<int>& matches = fsm.GetMatches(state);
797  ITERATE (vector<int>, match, matches) {
798  const CPatternRec& pattern = patterns[*match];
799  TSeqPos begin_pos = i + 1 - pattern.GetFsmPatSize();
800  TSeqPos end_pos = begin_pos
801  + pattern.GetPattern().size() - 1;
802 
803  // check for a full match to sequence
804  if (pattern.GetFsmPatSize()
805  != pattern.GetPattern().size()) {
806  // Pattern put into fsm was less than the full pattern.
807  // Must check that the sequence really matches.
808  if (end_pos >= seq.size()) {
809  // full pattern goes off the end of sequence;
810  // not a match
811  continue;
812  }
813  // could really test match for just
814  // right end of pattern here
815  if (CSeqMatch::MatchNcbi8na(seq, pattern.GetPattern(),
816  begin_pos)
817  == CSeqMatch::eYes) {
818  // There must be a site here. However, because
819  // we'll later check all stretches containing
820  // ambiguities, we must ignore them here to keep
821  // from recording them twice.
822  bool ambig = false;
823  for (unsigned int n = begin_pos; n <= end_pos;
824  n++) {
825  if (CFindRSites::x_IsAmbig(seq[n])) {
826  ambig = true;
827  break;
828  }
829  }
830  if (ambig) {
831  continue; // ignore matches with ambig. seq.
832  }
833  } else {
834  continue; // not a real match
835  }
836  }
837  const CREnzyme& enzyme =
838  enzymes[pattern.GetEnzymeIndex()];
839  const CRSpec& spec =
840  enzyme.GetSpecs()[pattern.GetSpecIndex()];
841 
842  CRSite site(begin_pos, end_pos);
843  site.SetStrand(pattern.GetStrand());
844  const vector<int>& plus_cuts = spec.GetPlusCuts();
845  ITERATE (vector<int>, cut, plus_cuts) {
846  if (IsReverse(pattern.GetStrand())) {
847  site.SetPlusCuts()
848  .push_back(begin_pos
849  + pattern.GetPattern().size()
850  - *cut);
851  } else {
852  site.SetPlusCuts().push_back(begin_pos + *cut);
853  }
854  }
855  const vector<int>& minus_cuts = spec.GetMinusCuts();
856  ITERATE (vector<int>, cut, minus_cuts) {
857  if (IsReverse(pattern.GetStrand())) {
858  site.SetMinusCuts()
859  .push_back(begin_pos
860  + pattern.GetPattern().size()
861  - *cut);
862  } else {
863  site.SetMinusCuts().push_back(begin_pos + *cut);
864  }
865  }
866 
867  results[pattern.GetEnzymeIndex()]->SetDefiniteSites()
868  .push_back(site);
869  }
870  }
871  } // end iteration over the sequence
872 
873  // We've found all the matches involving unambiguous characters
874  // in sequence. Now go back and examine any ambiguous places.
875  if (!ambig_nucs.empty()) {
876  ITERATE (vector<CPatternRec>, pattern, patterns) {
877  const string& pat = pattern->GetPattern();
878 
879  // for reordering later
880  int ds_pos = results[pattern->GetEnzymeIndex()]
881  ->GetDefiniteSites().size();
882  int ps_pos = results[pattern->GetEnzymeIndex()]
883  ->GetPossibleSites().size();
884 
885  // the next possible (starting) position to check
886  int next_pos = 0;
887 
888  // iterate over ambiguous positions
889  ITERATE (vector<int>, pos, ambig_nucs) {
890  int begin_check = *pos - pat.size() + 1;
891  // don't try to check a negative position
892  begin_check = max(begin_check, 0);
893  // to avoid recording a site multiple times
894  // when there are two nearby ambiguities
895  begin_check = max(begin_check, next_pos);
896  int end_check = min(*pos, (int) (seq.size() - pat.size()));
897  int i;
898  for (i = begin_check; i <= end_check; i++) {
900  = CSeqMatch::MatchNcbi8na(seq, pat, i);
901  if (match == CSeqMatch::eNo) {
902  continue; // no match of any sort
903  }
904  // otherwise, a definite or possible site
905 
906  CRSite site(i, i + pat.size() -1);
907  site.SetStrand(pattern->GetStrand());
908 
909  // figure out cleavage locations
910  const vector<int>& plus_cuts
911  = enzymes[pattern->GetEnzymeIndex()]
912  .GetSpecs()[pattern->GetSpecIndex()].GetPlusCuts();
913  ITERATE (vector<int>, cut, plus_cuts) {
914  if (IsReverse(pattern->GetStrand())) {
915  site.SetPlusCuts()
916  .push_back(i + pattern->GetPattern().size()
917  - *cut);
918  } else {
919  site.SetPlusCuts().push_back(i + *cut);
920  }
921  }
922 
923  const vector<int>& minus_cuts
924  = enzymes[pattern->GetEnzymeIndex()]
925  .GetSpecs()[pattern->GetSpecIndex()]
926  .GetMinusCuts();
927  ITERATE (vector<int>, cut, minus_cuts) {
928  if (IsReverse(pattern->GetStrand())) {
929  site.SetMinusCuts()
930  .push_back(i + pattern->GetPattern().size()
931  - *cut);
932  } else {
933  site.SetMinusCuts().push_back(i + *cut);
934  }
935  }
936 
937 
938  if (match == CSeqMatch::eYes) {
939  results[pattern->GetEnzymeIndex()]
940  ->SetDefiniteSites().push_back(site);
941  } else if (flags & CFindRSites::fFindPossibleSites) {
942  results[pattern->GetEnzymeIndex()]
943  ->SetPossibleSites().push_back(site);
944  }
945  }
946  next_pos = i;
947  } // end ITERATE over ambiguous positions
948 
949  // definite_sites and possible_sites may be out of order
950  vector<CRSite>& def_sites = results[pattern->GetEnzymeIndex()]
951  ->SetDefiniteSites();
952  inplace_merge(def_sites.begin(),
953  def_sites.begin() + ds_pos,
954  def_sites.end(),
955  SCompareLocation());
956 
957  vector<CRSite>& pos_sites = results[pattern->GetEnzymeIndex()]
958  ->SetPossibleSites();
959  inplace_merge(pos_sites.begin(),
960  pos_sites.begin() + ps_pos,
961  pos_sites.end(),
962  SCompareLocation());
963 
964  } // end ITERATE over patterns
965  } // end if (!ambig_nucs.empty())
966 }
967 
968 
969 // CFindRSites::Find for various sequence containers
970 
971 void CFindRSites::Find(const string& seq,
972  const TEnzymes& enzymes,
973  vector<CRef<CREnzResult> >& results,
974  TFlags flags)
975 {
976  x_FindRSite(seq, enzymes, results, flags);
977 }
978 
979 
980 void CFindRSites::Find(const vector<char>& seq,
981  const TEnzymes& enzymes,
982  vector<CRef<CREnzResult> >& results,
983  TFlags flags)
984 {
985  x_FindRSite(seq, enzymes, results, flags);
986 }
987 
988 
990  const TEnzymes& enzymes,
991  vector<CRef<CREnzResult> >& results,
992  TFlags flags)
993 {
994  string seq_ncbi8na;
995  CSeqVector vec(seq);
996  vec.SetNcbiCoding();
997  vec.GetSeqData(0, vec.size(), seq_ncbi8na);
998  x_FindRSite(seq_ncbi8na, enzymes, results, flags);
999 }
1000 
1001 
1002 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
bool IsReverse(ENa_strand s)
Definition: Na_strand.hpp:75
ENa_strand Reverse(ENa_strand s)
Definition: Na_strand.hpp:90
User-defined methods of the data storage class.
CAnnotdesc –.
Definition: Annotdesc.hpp:66
CBioseq_Handle –.
TEnzymes & SetEnzymes()
CBioseq::TAnnot TAnnot
static bool x_IsAmbig(char nuc)
friend void x_FindRSite(const Seq &seq, const TEnzymes &enzymes, vector< CRef< CREnzResult > > &results, CFindRSites::TFlags)
Find all definite and possible sites in a sequence for a vector of enzymes, using a finite state mach...
const TEnzymes & GetEnzymes()
void x_LoadREnzymeData(const string &refile, CRebase::EEnzymesToLoad which_enzymes)
CREnzyme::TEnzymes TEnzymes
TFlags m_Flags
CFindRSites(const string &refile=kEmptyStr, CRebase::EEnzymesToLoad which_enzymes=CRebase::eAll, TFlags flags=fDefault)
unsigned int TFlags
@ fFindIsoschizomers
Lump together all enzymes with identical specificities.
static void Find(const string &seq, const TEnzymes &enzymes, vector< CRef< CREnzResult > > &results, TFlags flags=0)
TEnzymes m_Enzymes
static void x_AddPattern(const string &pat, CTextFsm< int > &fsm, int match_value)
static void x_ExpandRecursion(string &s, unsigned int pos, CTextFsm< int > &fsm, int match_value)
TAnnot GetAnnot(CScope &scope, const CSeq_loc &loc) const
string m_Pattern
unsigned int GetFsmPatSize(void) const
int GetSpecIndex(void) const
unsigned int m_FsmPatSize
const string & GetPattern(void) const
int GetEnzymeIndex(void) const
CPatternRec(string pattern, int enzyme_index, int spec_index, ENa_strand strand, unsigned int fsm_pat_size)
ENa_strand m_Strand
ENa_strand GetStrand(void) const
This class represents the results of a search for sites of a particular enzyme.
const vector< CRSite > & GetPossibleSites(void) const
const vector< CRSite > & GetDefiniteSites(void) const
const string & GetEnzymeName(void) const
This class represents a restriction enzyme (an enzyme name and a vector of cleavage specificities)
void SetName(const string &s)
const string & GetName(void) const
static void CombineIsoschizomers(TEnzymes &enzymes)
Definition: restriction.cpp:92
vector< string > & SetIsoschizomers(void)
vector< CREnzyme > TEnzymes
vector< CRSpec > m_Specs
vector< CRSpec > & SetSpecs(void)
const vector< CRSpec > & GetSpecs(void) const
void SetPrototype(const string &s=kEmptyStr)
string m_Name
void Reset(void)
Definition: restriction.cpp:75
This class represents a particular occurrence of a restriction site on a sequence (not to be confused...
Definition: restriction.hpp:55
int GetStart(void) const
This class represents a restriction enzyme specificity, i.e., a sequence recognition pattern and vect...
void Reset(void)
string m_Seq
bool operator<(const CRSpec &rhs) const
Definition: restriction.cpp:58
const string & GetSeq(void) const
void SetSeq(const string &s)
vector< int > m_MinusCuts
const vector< int > & GetPlusCuts(void) const
const vector< int > & GetMinusCuts(void) const
vector< int > & SetPlusCuts(void)
vector< int > & SetMinusCuts(void)
vector< int > m_PlusCuts
static CRSpec MakeRSpec(const string &site)
CREnzyme::TEnzymes TEnzymes
static string GetDefaultDataPath()
static void x_ParseCutPair(const string &s, int &plus_cut, int &minus_cut)
static CREnzyme MakeREnzyme(const string &name, const string &sites)
static void ReadNARFormat(istream &input, TEnzymes &enzymes, enum EEnzymesToLoad which)
CRef –.
Definition: ncbiobj.hpp:618
CScope –.
Definition: scope.hpp:92
static char IupacToNcbi8na(char in)
stuff for dealing with ncbi8na.
Definition: seq_match.cpp:43
static EMatch MatchNcbi8na(const Seq &seq, const Pat &pat, TSeqPos pos)
Definition: seq_match.hpp:93
static void CompNcbi8na(string &seq8na)
complement an ncbi8na sequence in place
Definition: seq_match.cpp:121
CSeqVector –.
Definition: seq_vector.hpp:65
void SetNameDesc(const string &name)
Definition: Seq_annot.cpp:66
void SetCreateDate(const CTime &dt)
Definition: Seq_annot.cpp:121
void SetTitleDesc(const string &title)
Definition: Seq_annot.cpp:96
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
CTime –.
Definition: ncbitime.hpp:296
Include a standard set of the NCBI C++ Toolkit most basic headers.
static uch flags
static const char * str(char *buf, int n)
Definition: stats.c:84
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
Definition: ncbimisc.hpp:1508
#define _TRACE(message)
Definition: ncbidbg.hpp:122
CConstRef< CSeq_id > GetSeqId(void) const
void SetWhole(TWhole &v)
Definition: Seq_loc.hpp:982
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Override Assign() to incorporate cache invalidation.
Definition: Seq_loc.cpp:337
void SetId(CSeq_id &id)
set the 'id' field in all parts of this location
Definition: Seq_loc.cpp:3474
void SetPnt(TPnt &v)
Definition: Seq_loc.hpp:985
void SetInt(TInt &v)
Definition: Seq_loc.hpp:983
@ fCompare_Default
Definition: Seq_loc.hpp:245
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
CConstRef< CSeq_loc > m_ParentLoc
Definition: sequence.hpp:1148
CRef< CSeq_loc > Resolve(CScope *scope=0, TFlags flags=0) const
Definition: sequence.hpp:1143
@ eGetId_Canonical
Definition: sequence.hpp:114
@ fNoMerge
don't merge adjacent intervals
Definition: sequence.hpp:1126
CScope & GetScope(void) const
Get scope this handle belongs to.
@ eCoding_Ncbi
Set coding to binary coding (Ncbi4na or Ncbistdaa)
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
Definition: seq_vector.cpp:304
TSeqPos size(void) const
Definition: seq_vector.hpp:291
void SetNcbiCoding(void)
Set coding to either Ncbi8aa or Ncbi8na depending on molecule type.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
void AddWord(const string &word, const MatchType &match)
Definition: strsearch.hpp:343
int GetNextState(int state, char letter) const
Definition: strsearch.hpp:404
const vector< MatchType > & GetMatches(int state) const
Definition: strsearch.hpp:515
bool IsMatchFound(int state) const
Definition: strsearch.hpp:521
void Prime(void)
Definition: strsearch.hpp:376
int GetInitialState(void) const
Definition: strsearch.hpp:235
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3452
#define NPOS
Definition: ncbistr.hpp:133
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5078
@ fSplit_Tokenize
All delimiters are merged and trimmed, to get non-empty tokens only.
Definition: ncbistr.hpp:2510
@ eCurrent
Use current time. See also CCurrentTime.
Definition: ncbitime.hpp:300
@ eLim_tl
space to left of position
Definition: Int_fuzz_.hpp:214
void SetLocation(TLocation &value)
Assign a value to Location data member.
Definition: Seq_feat_.cpp:131
void SetComment(const TComment &value)
Assign a value to Comment data member.
Definition: Seq_feat_.hpp:1058
const TLocation & GetLocation(void) const
Get the Location member data.
Definition: Seq_feat_.hpp:1117
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_feat_.cpp:94
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
@ eNa_strand_both
in forward orientation
Definition: Na_strand_.hpp:68
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_annot_.cpp:244
void SetDesc(TDesc &value)
Assign a value to Desc data member.
Definition: Seq_annot_.cpp:223
TRegion & SetRegion(void)
Select the variant.
Definition: Annotdesc_.cpp:300
static int input()
int i
yy_size_t n
constexpr auto sort(_Init &&init)
constexpr bool empty(list< Ts... >) noexcept
ESERV_Site site
T max(T x_, T y_)
T min(T x_, T y_)
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
static int match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, uint16_t top_bracket, PCRE2_SIZE frame_size, pcre2_match_data *match_data, match_block *mb)
Definition: pcre2_match.c:594
static patstr * patterns
Definition: pcre2grep.c:356
static int * results[]
USING_SCOPE(objects)
NCBI_PARAM_DECL(string, RESTRICTION_SITES, REBASE)
ostream & operator<<(ostream &os, const CRSite &site)
typedef NCBI_PARAM_TYPE(RESTRICTION_SITES, REBASE) TRebaseData
NCBI_PARAM_DEF(string, RESTRICTION_SITES, REBASE, "")
void x_FindRSite(const Seq &seq, const CFindRSites::TEnzymes &enzymes, vector< CRef< CREnzResult > > &results, CFindRSites::TFlags flags)
Find all definite and possible sites in a sequence for a vector of enzymes, using a finite state mach...
static CRef< CSeq_loc > s_RemapChildToParent(const CSeq_loc &parent, const CSeq_loc &child, CScope *scope)
static void s_AddSitesToAnnot(const vector< CRSite > &sites, const CREnzResult &result, CSeq_annot &annot, CScope &scope, const CSeq_loc &parent_loc, bool definite=true)
static SLJIT_INLINE sljit_ins l(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
bool operator()(const CRSite &lhs, const CRSite &rhs) const
bool operator()(const CREnzyme &lhs, const CREnzyme &rhs)
Definition: restriction.cpp:85
Location relative to a base Seq-loc: one (usually) or more ranges of offsets.
Definition: sequence.hpp:1124
else result
Definition: token2.c:20
static bool ambig(char c)
Modified on Fri Sep 20 14:57:19 2024 by modify_doxy.py rev. 669887