97 if (enzyme != enzymes.begin() &&
98 enzyme->GetSpecs() ==
result.back().GetSpecs()) {
99 result.back().SetName() +=
"/";
100 result.back().SetName() += enzyme->GetName();
102 result.push_back(*enzyme);
103 result.back().SetPrototype();
120 os <<
"Recog. site: " <<
site.GetStart() <<
'-'
121 <<
site.GetEnd() << endl;
122 os <<
"Plus strand cuts: ";
132 os <<
"Minus strand cuts: ";
153 int plus_cut, minus_cut;
157 string::size_type idx = s.find_first_of(
")");
158 if (idx == std::string::npos) {
159 throw runtime_error(
string(
"Error parsing site ")
167 if (s[s.length() - 1] ==
')') {
168 string::size_type idx = s.find_last_of(
"(");
169 if (idx == std::string::npos) {
170 throw runtime_error(
string(
"Error parsing site ")
175 spec.
SetPlusCuts().push_back(plus_cut + s.length());
178 for (
unsigned int i = 0;
i < s.length();
i++) {
197 string::size_type idx = s.find_first_not_of(
"N");
198 if (idx == string::npos) {
206 idx = s.find_last_not_of(
"N");
228 vector<string> site_vec;
230 ITERATE (vector<string>, iter, site_vec) {
241 return TRebaseData::GetDefault();
251 throw runtime_error(
string(
"Couldn't parse cut locations ")
266 TEnzymes::size_type prototype_idx(0);
268 while (getline(
input, line)) {
269 vector<string> fields;
272 if (fields.size() < 2) {
277 bool is_prototype(
true);
282 is_prototype =
false;
288 string sites = fields[3];
291 enzymes.push_back(enzyme);
295 prototype_idx = enzymes.size();
296 }
else if (prototype_idx) {
297 CREnzyme& prototype = enzymes[prototype_idx - 1];
342 unsigned int fsm_pat_size) :
m_Pattern(pattern),
371 return lhs->GetEnzymeName() < rhs->GetEnzymeName();
382 return lhs.GetName() < rhs.GetName();
393 return lhs->GetDefiniteSites().size() < rhs->GetDefiniteSites().size();
414 if (pos == s.size()) {
420 char orig_ch = s[pos];
421 for (
char x = 1; x <= 8; x <<= 1) {
459 if (! refile.empty()) {
460 ifstream istr(refile.c_str());
482 return converter.
Resolve(scope);
491 bool definite =
true)
500 feat->
SetData().SetRsite().SetDb().SetDb(
"REBASE");
501 feat->
SetData().SetRsite().SetDb()
502 .SetTag().SetStr(
"REBASE");
514 vector< CRef<CSeq_loc> > locs;
518 recog_site->
SetInt().SetFrom(
site->GetStart());
520 recog_site->
SetInt().SetStrand(
site->GetStrand());
521 recog_site->
SetId(
id);
522 locs.push_back(recog_site);
529 int negative_cut_locs = 0;
533 cut_site->
SetPnt().SetPoint(*cut);
537 cut_site->
SetPnt().SetStrand(cut_strand);
539 locs.push_back(cut_site);
548 cut_site->
SetPnt().SetPoint(*cut);
554 locs.push_back(cut_site);
562 if (negative_cut_locs > 0) {
564 +
" cleavage sites are located before the"
565 " beginning of the sequence and are not reported";
570 copy(locs.begin(), locs.end(),
571 back_inserter(feat->
SetLocation().SetMix().Set()));
578 annot.
SetData().SetFtable().push_back(feat);
591 typedef vector<CRef<CREnzResult> > TResults;
608 int total_definite_sites = 0, total_possible_sites = 0;
609 int total_non_cutters = 0;
612 const vector<CRSite>& definite_sites =
613 (*result)->GetDefiniteSites();
614 const vector<CRSite>& possible_sites =
615 (*result)->GetPossibleSites();
617 int count_definite_sites = definite_sites.size();
618 int count_possible_sites = possible_sites.size();
620 if (count_definite_sites || count_possible_sites) {
621 total_definite_sites += count_definite_sites;
622 total_possible_sites += count_possible_sites;
627 const string title(
"Restriction sites");
633 new_annot->
SetDesc().Set().push_back(region);
634 annot.push_back(new_annot);
642 **
result, curr_annot, scope, loc);
644 **
result, curr_annot, scope, loc,
false);
657 _TRACE(
"Found " << total_definite_sites <<
" definite and "
658 << total_possible_sites <<
" possible sites");
684 static const bool ambig_table[16] = {
685 0, 0, 0, 1, 0, 1, 1, 1,
686 0, 1, 1, 1, 1, 1, 1, 1
688 return ambig_table[(size_t)
nuc];
704 results.reserve(enzymes.size());
717 enzyme->IsPrototype())) {
725 const vector<CRSpec>& specs = enzyme->GetSpecs();
728 ITERATE (vector<CRSpec>, spec, specs) {
742 SIZE_TYPE fsm_pat_size = pat.find_first_of(0x0f);
744 SIZE_TYPE pos = pat.find_first_of(0x0f, fsm_pat_size + 1);
746 || pat.find_first_of(0x0f, pos + 1) ==
NPOS) {
747 fsm_pat_size = pat.size();
752 spec - specs.begin(),
763 fsm_pat_size = comp.find_first_of(0x0f);
764 SIZE_TYPE pos = comp.find_first_of(0x0f, fsm_pat_size + 1);
766 || comp.find_first_of(0x0f, pos + 1) ==
NPOS) {
767 fsm_pat_size = comp.size();
772 spec - specs.begin(),
787 vector<int> ambig_nucs;
790 for (
unsigned int i = 0;
i < seq.size();
i++) {
792 ambig_nucs.push_back(
i);
808 if (end_pos >= seq.size()) {
823 for (
unsigned int n = begin_pos;
n <= end_pos;
845 ITERATE (vector<int>, cut, plus_cuts) {
852 site.SetPlusCuts().push_back(begin_pos + *cut);
856 ITERATE (vector<int>, cut, minus_cuts) {
863 site.SetMinusCuts().push_back(begin_pos + *cut);
875 if (!ambig_nucs.empty()) {
877 const string& pat = pattern->GetPattern();
880 int ds_pos =
results[pattern->GetEnzymeIndex()]
881 ->GetDefiniteSites().size();
882 int ps_pos =
results[pattern->GetEnzymeIndex()]
883 ->GetPossibleSites().size();
889 ITERATE (vector<int>, pos, ambig_nucs) {
890 int begin_check = *pos - pat.size() + 1;
892 begin_check =
max(begin_check, 0);
895 begin_check =
max(begin_check, next_pos);
896 int end_check =
min(*pos, (
int) (seq.size() - pat.size()));
898 for (
i = begin_check;
i <= end_check;
i++) {
907 site.SetStrand(pattern->GetStrand());
910 const vector<int>& plus_cuts
911 = enzymes[pattern->GetEnzymeIndex()]
912 .GetSpecs()[pattern->GetSpecIndex()].GetPlusCuts();
913 ITERATE (vector<int>, cut, plus_cuts) {
916 .push_back(
i + pattern->GetPattern().size()
919 site.SetPlusCuts().push_back(
i + *cut);
923 const vector<int>& minus_cuts
924 = enzymes[pattern->GetEnzymeIndex()]
925 .GetSpecs()[pattern->GetSpecIndex()]
927 ITERATE (vector<int>, cut, minus_cuts) {
930 .push_back(
i + pattern->GetPattern().size()
933 site.SetMinusCuts().push_back(
i + *cut);
939 results[pattern->GetEnzymeIndex()]
940 ->SetDefiniteSites().push_back(
site);
942 results[pattern->GetEnzymeIndex()]
943 ->SetPossibleSites().push_back(
site);
950 vector<CRSite>& def_sites =
results[pattern->GetEnzymeIndex()]
951 ->SetDefiniteSites();
952 inplace_merge(def_sites.begin(),
953 def_sites.begin() + ds_pos,
957 vector<CRSite>& pos_sites =
results[pattern->GetEnzymeIndex()]
958 ->SetPossibleSites();
959 inplace_merge(pos_sites.begin(),
960 pos_sites.begin() + ps_pos,
User-defined methods of the data storage class.
User-defined methods of the data storage class.
bool IsReverse(ENa_strand s)
ENa_strand Reverse(ENa_strand s)
User-defined methods of the data storage class.
static bool x_IsAmbig(char nuc)
friend void x_FindRSite(const Seq &seq, const TEnzymes &enzymes, vector< CRef< CREnzResult > > &results, CFindRSites::TFlags)
Find all definite and possible sites in a sequence for a vector of enzymes, using a finite state mach...
const TEnzymes & GetEnzymes()
void x_LoadREnzymeData(const string &refile, CRebase::EEnzymesToLoad which_enzymes)
CREnzyme::TEnzymes TEnzymes
CFindRSites(const string &refile=kEmptyStr, CRebase::EEnzymesToLoad which_enzymes=CRebase::eAll, TFlags flags=fDefault)
@ fFindIsoschizomers
Lump together all enzymes with identical specificities.
static void Find(const string &seq, const TEnzymes &enzymes, vector< CRef< CREnzResult > > &results, TFlags flags=0)
static void x_AddPattern(const string &pat, CTextFsm< int > &fsm, int match_value)
static void x_ExpandRecursion(string &s, unsigned int pos, CTextFsm< int > &fsm, int match_value)
TAnnot GetAnnot(CScope &scope, const CSeq_loc &loc) const
unsigned int GetFsmPatSize(void) const
int GetSpecIndex(void) const
unsigned int m_FsmPatSize
const string & GetPattern(void) const
int GetEnzymeIndex(void) const
CPatternRec(string pattern, int enzyme_index, int spec_index, ENa_strand strand, unsigned int fsm_pat_size)
ENa_strand GetStrand(void) const
This class represents the results of a search for sites of a particular enzyme.
const vector< CRSite > & GetPossibleSites(void) const
const vector< CRSite > & GetDefiniteSites(void) const
const string & GetEnzymeName(void) const
This class represents a restriction enzyme (an enzyme name and a vector of cleavage specificities)
void SetName(const string &s)
const string & GetName(void) const
static void CombineIsoschizomers(TEnzymes &enzymes)
vector< string > & SetIsoschizomers(void)
vector< CREnzyme > TEnzymes
vector< CRSpec > & SetSpecs(void)
const vector< CRSpec > & GetSpecs(void) const
void SetPrototype(const string &s=kEmptyStr)
This class represents a particular occurrence of a restriction site on a sequence (not to be confused...
This class represents a restriction enzyme specificity, i.e., a sequence recognition pattern and vect...
bool operator<(const CRSpec &rhs) const
const string & GetSeq(void) const
void SetSeq(const string &s)
vector< int > m_MinusCuts
const vector< int > & GetPlusCuts(void) const
const vector< int > & GetMinusCuts(void) const
vector< int > & SetPlusCuts(void)
vector< int > & SetMinusCuts(void)
static CRSpec MakeRSpec(const string &site)
CREnzyme::TEnzymes TEnzymes
static string GetDefaultDataPath()
static void x_ParseCutPair(const string &s, int &plus_cut, int &minus_cut)
static CREnzyme MakeREnzyme(const string &name, const string &sites)
static void ReadNARFormat(istream &input, TEnzymes &enzymes, enum EEnzymesToLoad which)
static char IupacToNcbi8na(char in)
stuff for dealing with ncbi8na.
static EMatch MatchNcbi8na(const Seq &seq, const Pat &pat, TSeqPos pos)
static void CompNcbi8na(string &seq8na)
complement an ncbi8na sequence in place
void SetNameDesc(const string &name)
void SetCreateDate(const CTime &dt)
void SetTitleDesc(const string &title)
namespace ncbi::objects::
Include a standard set of the NCBI C++ Toolkit most basic headers.
static const char * str(char *buf, int n)
unsigned int TSeqPos
Type for sequence locations and lengths.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
CConstRef< CSeq_id > GetSeqId(void) const
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Override Assign() to incorporate cache invalidation.
void SetId(CSeq_id &id)
set the 'id' field in all parts of this location
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
CConstRef< CSeq_loc > m_ParentLoc
CRef< CSeq_loc > Resolve(CScope *scope=0, TFlags flags=0) const
@ fNoMerge
don't merge adjacent intervals
CScope & GetScope(void) const
Get scope this handle belongs to.
@ eCoding_Ncbi
Set coding to binary coding (Ncbi4na or Ncbistdaa)
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
void SetNcbiCoding(void)
Set coding to either Ncbi8aa or Ncbi8na depending on molecule type.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
void AddWord(const string &word, const MatchType &match)
int GetNextState(int state, char letter) const
const vector< MatchType > & GetMatches(int state) const
bool IsMatchFound(int state) const
int GetInitialState(void) const
NCBI_NS_STD::string::size_type SIZE_TYPE
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
@ fSplit_Tokenize
All delimiters are merged and trimmed, to get non-empty tokens only.
@ eCurrent
Use current time. See also CCurrentTime.
@ eLim_tl
space to left of position
void SetLocation(TLocation &value)
Assign a value to Location data member.
void SetComment(const TComment &value)
Assign a value to Comment data member.
const TLocation & GetLocation(void) const
Get the Location member data.
void SetData(TData &value)
Assign a value to Data data member.
ENa_strand
strand of nucleic acid
@ eNa_strand_both
in forward orientation
void SetData(TData &value)
Assign a value to Data data member.
void SetDesc(TDesc &value)
Assign a value to Desc data member.
TRegion & SetRegion(void)
Select the variant.
constexpr auto sort(_Init &&init)
constexpr bool empty(list< Ts... >) noexcept
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
static int match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, uint16_t top_bracket, PCRE2_SIZE frame_size, pcre2_match_data *match_data, match_block *mb)
NCBI_PARAM_DECL(string, RESTRICTION_SITES, REBASE)
ostream & operator<<(ostream &os, const CRSite &site)
typedef NCBI_PARAM_TYPE(RESTRICTION_SITES, REBASE) TRebaseData
NCBI_PARAM_DEF(string, RESTRICTION_SITES, REBASE, "")
void x_FindRSite(const Seq &seq, const CFindRSites::TEnzymes &enzymes, vector< CRef< CREnzResult > > &results, CFindRSites::TFlags flags)
Find all definite and possible sites in a sequence for a vector of enzymes, using a finite state mach...
static CRef< CSeq_loc > s_RemapChildToParent(const CSeq_loc &parent, const CSeq_loc &child, CScope *scope)
static void s_AddSitesToAnnot(const vector< CRSite > &sites, const CREnzResult &result, CSeq_annot &annot, CScope &scope, const CSeq_loc &parent_loc, bool definite=true)
static SLJIT_INLINE sljit_ins l(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
bool operator()(const CRSite &lhs, const CRSite &rhs) const
bool operator()(const CREnzyme &lhs, const CREnzyme &rhs)
Location relative to a base Seq-loc: one (usually) or more ranges of offsets.
static bool ambig(char c)