1 /* $Id: seedtop.cpp 77822 2017-05-09 14:42:25Z madden $
2  * ===========================================================================
3  *
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Ning Ma
27  *
28  */
30 /// @file seedtop.cpp
31 /// Implements the CSeedTop class.
33 #include <ncbi_pch.hpp>
43 #include "blast_setup.hpp"
45 /** @addtogroup AlgoBlast
46  *
47  * @{
48  */
52 BEGIN_SCOPE(blast)
54 CSeedTop::CSeedTop(const string & pattern)
55  : m_Pattern(pattern)
56 {
57  x_ParsePattern();
58  x_MakeScoreBlk();
59  x_MakeLookupTable();
60 }
63 {
 // Breaks m_Pattern into its '-'-separated pieces and appends one
 // SPatternUnit per pattern position to m_Units.
64  vector <string> units;
 // NOTE(review): NStr::ToUpper is applied to m_Pattern itself; presumably
 // it upper-cases the member in place before the split -- confirm.
66  NStr::Split(NStr::ToUpper(m_Pattern), "-", units, 0);
67  ITERATE(vector<string>, unit, units){
 // Empty pieces are skipped.
68  if (*unit != "") {
69  char ch = (*unit)[0];
 // A piece is kept as one unit when it starts with '[', '{' or 'X', when
 // it is a single character, or when its second character is '('.
 // The length()==1 test comes before the [1] access, so short-circuit
 // evaluation keeps that access off one-character pieces.
70  if (ch=='[' || ch=='{' || ch=='X' || (*unit).length()==1 || (*unit)[1]=='(') {
71  m_Units.push_back(SPatternUnit(*unit));
72  } else {
 // Any other multi-character piece becomes one unit per character.
73  for (SIZE_TYPE i=0; i<(*unit).length(); ++i) {
74  m_Units.push_back(SPatternUnit(string(*unit, i, 1)));
75  }
76  }
77  }
78  }
79 }
82 {
 // Creates lookup-table options for m_Program, attaches a copy of the
 // pattern text to them, and hands them to LookupTableWrapInit.
83  CLookupTableOptions lookup_options;
84  LookupTableOptionsNew(m_Program, &lookup_options);
 // strdup() makes a separately allocated C string from m_Pattern.
 // NOTE(review): presumably that copy is released together with
 // lookup_options -- confirm against the options' free routine.
85  lookup_options->phi_pattern = strdup(m_Pattern.c_str());
86  // Lookup segments, scoreblk, and rps info arguments are irrelevant
87  // and passed as NULL.
88  LookupTableWrapInit(NULL, lookup_options, NULL, NULL,
90 }
93 {
 // Creates scoring options for m_Program and passes them, together with
 // query_info and m_Program, to BlastSetup_ScoreBlkInit. The first argument
 // of that call is NULL.
94  CBlastScoringOptions score_options;
95  BlastScoringOptionsNew(m_Program, &score_options);
98  BlastSetup_ScoreBlkInit(NULL, query_info, score_options, m_Program,
100 }
103 {
 // Iterates over the sequences of the sequence source obtained from db,
 // scans each one with PHIBlastScanSubject, and returns in retv one CSeq_loc
 // per layout of the pattern units that x_GetPatternRanges reports for a hit.
104  BlastOffsetPair* offset_pairs = (BlastOffsetPair*)
107  CRef<CSeq_id> sid;
108  TSeqPos slen;
109  TSeedTopResults retv;
111  BlastSeqSrcGetSeqArg seq_arg;
 // Start from an all-zero fetch argument.
112  memset((void*) &seq_arg, 0, sizeof(seq_arg));
 // NOTE(review): seq_src and seq_info_src are not released in this
 // function; presumably db keeps ownership of both -- confirm.
115  BlastSeqSrc *seq_src = db->MakeSeqSrc();
116  IBlastSeqInfoSrc *seq_info_src = db->MakeSeqInfoSrc();
118  (MAX(BlastSeqSrcGetNumSeqs(seq_src)/100, 1));
 // One pass of this loop per sequence id delivered by the iterator.
120  while( (seq_arg.oid = BlastSeqSrcIteratorNext(seq_src, itr))
 // An iterator error ends the scan; a sequence that cannot be fetched is
 // skipped.
122  if (seq_arg.oid == BLAST_SEQSRC_ERROR) break;
123  if (BlastSeqSrcGetSequence(seq_src, &seq_arg) < 0) continue;
125  Int4 start_offset = 0;
126  GetSequenceLengthAndId(seq_info_src, seq_arg.oid, sid, &slen);
 // start_offset is passed by address to the scanner.
 // NOTE(review): presumably the scanner advances it so that repeated calls
 // walk through the sequence -- confirm.
128  while (start_offset < seq_arg.seq->length) {
129  // Query block and array size arguments are not used when scanning
130  // subject for pattern hits, so pass NULL and 0 for respective
131  // arguments.
132  Int4 hit_count =
133  PHIBlastScanSubject(m_Lookup, NULL, seq_arg.seq, &start_offset,
134  offset_pairs, 0);
 // No hits in this call: stop scanning this sequence.
136  if (hit_count == 0) break;
138  for (int index = 0; index < hit_count; ++index) {
 // pos_list receives every unit layout for this hit; pos is the scratch
 // vector (one slot per pattern unit) used while building them.
139  vector<vector<int> > pos_list;
140  vector<int> pos(m_Units.size());
 // [start, end) is the hit on the subject; end is one past s_end.
141  unsigned int start = offset_pairs[index].phi_offsets.s_start;
142  unsigned int end = offset_pairs[index].phi_offsets.s_end + 1;
143  x_GetPatternRanges(pos, 0, seq_arg.seq->sequence + start, end-start, pos_list);
 // Turn each layout into a list of ranges. Entries are walked in unit
 // order, and uid tracks the unit the current entry belongs to.
144  ITERATE(vector<vector<int> >, it_pos, pos_list) {
145  CSeq_loc::TRanges ranges;
146  int r_start(start);
147  int r_end(r_start);
148  int uid(0);
149  ITERATE(vector<int>, q, *it_pos) {
 // An is_x unit closes the range collected so far and starts the next
 // one *q positions further on; any other unit extends the current
 // range by *q positions.
150  if (m_Units[uid].is_x) {
151  ranges.push_back(CRange<TSeqPos>(r_start, r_end-1));
152  r_start = r_end + *q;
153  r_end = r_start;
154  } else {
155  r_end += (*q);
156  }
157  ++uid;
158  }
 // Close the final range and store the location for this layout.
159  ranges.push_back(CRange<TSeqPos>(r_start, r_end-1));
160  CRef<CSeq_loc> hit(new CSeq_loc(*sid, ranges));
161  retv.push_back(hit);
162  }
163  // skip the next pos_list.size()-1 hits
 // The assertions below check that the skipped hits have the same start
 // and end as the current one.
 // NOTE(review): pos_list.size() is unsigned, so if pos_list is empty
 // size() - 1 wraps around and index is moved backwards instead of staying
 // put -- confirm that an empty pos_list cannot occur here.
164  _ASSERT(index + (Int4)(pos_list.size()) - 1 < hit_count);
165  for (unsigned int i = 1; i< pos_list.size(); ++i) {
166  _ASSERT(offset_pairs[index + i].phi_offsets.s_start == start);
167  _ASSERT(offset_pairs[index + i].phi_offsets.s_end + 1 == end);
168  }
169  index += pos_list.size() - 1;
170  }
171  }
 // Hand the fetched sequence back to the source before the next iteration.
173  BlastSeqSrcReleaseSequence(seq_src, &seq_arg);
174  }
 // Clean-up of the sequence block, the iterator and the hit buffer.
176  BlastSequenceBlkFree(seq_arg.seq);
177  itr = BlastSeqSrcIteratorFree(itr);
178  sfree(offset_pairs);
179  return retv;
180 }
183 {
 // Builds a whole-sequence location for the id of bhl, wraps it as the only
 // subject of a query factory, constructs a CLocalDbAdapter from that factory
 // and opt_handle, and returns the result of Run(db) on that adapter.
184  CConstRef<CSeq_id> sid = bhl.GetSeqId();
 // Location covering the whole sequence identified by sid.
185  CSeq_loc sl;
186  sl.SetWhole();
187  sl.SetId(*sid);
 // The subject pairs that location with the scope of bhl.
188  SSeqLoc subject(sl, bhl.GetScope());
189  TSeqLocVector subjects;
190  subjects.push_back(subject);
191  CRef<IQueryFactory> qf(new CObjMgr_QueryFactory(subjects));
192  CRef<CBlastOptionsHandle> opt_handle
194  CRef<CLocalDbAdapter> db(new CLocalDbAdapter(qf, opt_handle));
195  return Run(db);
196 }
198 void
199 CSeedTop::x_GetPatternRanges(vector<int> &pos, Uint4 off, Uint1 *seq, Uint4 len,
200  vector<vector<int> > &ranges)
201 {
202  // Not enough sequence letters
203  if (len + off + m_Units[off].at_least < m_Units.size() + 1) return;
204  // at least test
205  unsigned int rep;
206  for (rep =0; rep < m_Units[off].at_least; ++rep) {
207  if (!m_Units[off].test(NCBISTDAA_TO_AMINOACID[seq[rep]])) return;
208  }
209  // at most test
210  while(off < m_Units.size() - 1) {
211  pos[off] = rep;
212  x_GetPatternRanges(pos, off+1, seq+rep, len-rep, ranges);
213  ++rep;
214  if (rep >= m_Units[off].at_most) return;
215  if (len + off + 1 < m_Units.size() + rep) return;
216  if (!m_Units[off].test(NCBISTDAA_TO_AMINOACID[seq[rep]])) return;
217  }
218  // the last unit of the pattern
219  if (m_Units[off].at_most <= len) return;
220  for (; rep < len; ++rep) {
221  if (!m_Units[off].test(NCBISTDAA_TO_AMINOACID[seq[rep]])) return;
222  }
223  pos[off] = rep;
224  ranges.push_back(pos);
225  return;
226 }
228 END_SCOPE(blast)
232 /* @} */
