NCBI C++ ToolKit
fit_blast.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /*
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Azat Badretdin
27 *
28 * File Description:
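29 * Frameshift detection for a pair of protein sequences: compares the bounds of BLAST hits that both sequences share against a common subject.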
30 * ===========================================================================
31 */
32 #include <ncbi_pch.hpp>
33 #include "read_blast_result.hpp"
34 #include <strstream>
35 
37  (
38  const CBioseq& left,
39  const CBioseq& right,
40  string& common_subject
41  )
42 {
43  bool result=false;
44  // check if prot
45  if(PrintDetails()) NcbiCerr << "fit_blast, seq level " << NcbiEndl;
46 // string qname = CSeq_id::GetStringDescr (left, CSeq_id::eFormat_FastA);
47 // string qrname = CSeq_id::GetStringDescr (right, CSeq_id::eFormat_FastA);
48  string qname = GetStringDescr (left);
49  string qrname = GetStringDescr (right);
50  // if(PrintDetails()) NcbiCerr << "left " << CSeq_id::GetStringDescr (left, CSeq_id::eFormat_FastA) << NcbiEndl;
51  // if(PrintDetails()) NcbiCerr << "right " << CSeq_id::GetStringDescr (right, CSeq_id::eFormat_FastA) << NcbiEndl;
52  if(PrintDetails()) NcbiCerr << "left " << GetStringDescr (left) << NcbiEndl;
53  if(PrintDetails()) NcbiCerr << "right " << GetStringDescr (right) << NcbiEndl;
54 // assuming that protein sequences come in one piece of seq-set
55  if(left.GetInst().GetMol()!=CSeq_inst::eMol_aa) return result;
56  if(PrintDetails()) NcbiCerr << "left is aa" << NcbiEndl;
57  if(right.GetInst().GetMol()!=CSeq_inst::eMol_aa) return result;
58  if(PrintDetails()) NcbiCerr << "right is aa" << NcbiEndl;
59 /*
60  if(!hasGenomicInterval(left)) return result;
61  if(!hasGenomicInterval(right)) return result;
62 
63  const CSeq_interval& left_genomic_int = getGenomicInterval(left);
64  const CSeq_interval& right_genomic_int = getGenomicInterval(right);
65 */
66  if(!hasGenomicLocation(left)) return result;
67  if(!hasGenomicLocation(right)) return result;
68 
69  const CSeq_loc& left_genomic_int = getGenomicLocation(left);
70  const CSeq_loc& right_genomic_int = getGenomicLocation(right);
71  TSeqPos from1, to1, from2, to2;
72 
73  ENa_strand left_strand;
74  ENa_strand right_strand;
75  getFromTo(left_genomic_int, from1, to1, left_strand);
76  getFromTo(right_genomic_int, from2, to2, right_strand);
77 
78  int left_frame = (from1-1)%3+1; // might be not making sense of location is composite
79  int right_frame = (from2-1)%3+1;
80  if(left_strand != eNa_strand_plus ) left_frame=-left_frame;
81  if(right_strand != eNa_strand_plus ) right_frame=-right_frame;
82  if(left_strand != right_strand)
83  {
84  if(PrintDetails())
85  {
86  NcbiCerr << "Strands mismatch for " << NcbiEndl;
87  NcbiCerr << "left " << GetStringDescr (left) << NcbiEndl;
88  NcbiCerr << "right " << GetStringDescr (right) << NcbiEndl;
89  }
90  return result;
91  }
92  if(PrintDetails()) NcbiCerr << "Matching strands" << NcbiEndl;
93  bool reverse = left_strand == eNa_strand_minus;
94  int space = ((int)from2-(int)to1)/3+1; // +1 for stop codon, /3 to convert n to aa
95  if(PrintDetails()) NcbiCerr << "space: " << space << NcbiEndl;
96  int left_qLen = getQueryLen(left);
97  int right_qLen = getQueryLen(right);
98  int num=0, numFit=0, numGI=0;
99  int numLeft=0, numRight=0;
100  vector<perfectHitStr> left_perfect;
101  vector<perfectHitStr> right_perfect;
102  collectPerfectHits(left_perfect, left);
103  collectPerfectHits(right_perfect, right);
104 
105  if(PrintDetails()) NcbiCerr << "perfect hits are : " << left_perfect.size() << " " << right_perfect.size() << NcbiEndl;
106  int n_frameshift_pairs=0;
107  list<problemStr> problemsl;
108  list<problemStr> problemsr;
110  common_subject="";
111  string any_common_subject="";
112  ITERATE(CBioseq::TAnnot, left_annot, left.GetAnnot())
113  {
114  if(PrintDetails()) NcbiCerr << "Next left annot: numLeft: " << numLeft << NcbiEndl;
115  if(!(*left_annot)->GetData().IsAlign()) continue;
116  numLeft++;
117  if(PrintDetails()) NcbiCerr << "Left annot is alignment: numLeft: " << numLeft << NcbiEndl;
118  vector<long> left_gis = getGIs(left_annot);
119  if(PrintDetails()) NcbiCerr << "Left annot is alignment: left_gis: " << left_gis.size() << NcbiEndl;
120 
122  ITERATE(CBioseq::TAnnot, right_annot, right.GetAnnot())
123  {
124  if(PrintDetails()) NcbiCerr << "Next right annot: numRight: " << numRight << NcbiEndl;
125  if(!(*right_annot)->GetData().IsAlign()) continue;
126  numRight++;
127  if(PrintDetails()) NcbiCerr << "Right annot is alignment: numRight: " << numRight << NcbiEndl;
128  vector<long> right_gis = getGIs(right_annot);
129  if(PrintDetails()) NcbiCerr << "Right annot is alignment: right_gis: " << right_gis.size() << NcbiEndl;
130  PushVerbosity();
131  if(!giMatch(left_gis, right_gis))
132  {
133  PopVerbosity();
134  continue;
135  }
136  PopVerbosity();
137  if(PrintDetails()) NcbiCerr << "!!!!!!!!! GI Match !!!!!!!!!!" << NcbiEndl;
138  numGI++;
139  distanceReportStr *report = new distanceReportStr;
140  report->left_strand = left_strand;
141  report->right_strand = right_strand;
142  report->s_id = left_gis[0];
143  report->q_loc_left_from = reverse ? from2: from1;
144  report->q_loc_right_from = reverse ? from1 : from2;
145  report->q_loc_left_to = reverse ? to2 : to1;
146  report->q_loc_right_to = reverse ? to1 : to2;
147  int frame_from=report->q_loc_left_from;
148  int frame_to =frame_from;
149  frame_from = min(frame_from, report->q_loc_left_from);
150  frame_to = max(frame_to , report->q_loc_left_from);
151  frame_from = min(frame_from, report->q_loc_right_from);
152  frame_to = max(frame_to , report->q_loc_right_from);
153  frame_from = min(frame_from, report->q_loc_left_to );
154  frame_to = max(frame_to , report->q_loc_left_to );
155  frame_from = min(frame_from, report->q_loc_right_to );
156  frame_to = max(frame_to , report->q_loc_right_to );
157  report->left_frame = left_frame;
158  report->right_frame = right_frame;
159  if( (!reverse && fit_blast(left, right, left_annot, right_annot, left_qLen, right_qLen, space, report)) ||
160  ( reverse && fit_blast(right, left, right_annot, left_annot, right_qLen, left_qLen, space, report))
161  )
162  {
163  numFit++;
164  string this_subject = getAnnotName(left_annot);
165  if(numFit==1) any_common_subject = this_subject;
166  if(common_subject.size()==0 && this_subject.find("hypothetical")==string::npos)
167  {
168 // best subject
169  common_subject = this_subject;
170  if(PrintDetails()) NcbiCerr << "zero common_subject changed to non-hypothetical this_subject " << this_subject
171  << NcbiEndl;
172  }
173  if(PrintDetails()) NcbiCerr << "!!!! Blast bounds match !!!!! numFit: " << numFit << NcbiEndl;
174 
175  char bufferchar[20480]; memset(bufferchar, 0, 20480);
176  strstream buffer(bufferchar, 20480);
177  printReport(report, buffer);
178  CNcbiStrstream misc_feat;
179  if(common_subject.size())
180  {
181  misc_feat << "potential frameshift: common BLAST hit: "
182  << common_subject << '\0';
183  }
184  else
185  {
186  misc_feat << '\0';
187  }
188  {
189  if(PrintDetails()) NcbiCerr << "added a problem: " << misc_feat.str() << NcbiEndl;
190  if(PrintDetails()) NcbiCerr << "added a problem(left): " << qname << NcbiEndl;
191  if(PrintDetails()) NcbiCerr << "added a problem(right): " << qrname << NcbiEndl;
192  problemStr problem = { eFrameShift, buffer.str(), misc_feat.str(),
193  qname, qrname,
194  frame_from,
195  frame_to ,
196  report->left_strand};
197  problemsl.push_back(problem);
198  }
199  {
200  problemStr problem = { eFrameShift, "", "", "", "", -1, -1, eNa_strand_unknown };
201  problemsr.push_back(problem);
202  }
203  n_frameshift_pairs++;
204  }
205  else
206  delete report;
207  }
209  }
211  num=numLeft*numRight;
212  if(common_subject.size()==0) common_subject = any_common_subject;
213 
214 
215  if(numFit) // this block determines whether it is frameshift or not
216  {
217 // one of the queries is hypothetical - frameshift!
218  if(qname.find("hypothetical") != string::npos || qrname.find("hypothetical") != string::npos)
219  {
220  if(PrintDetails()) NcbiCerr << "Frameshift or not? (" << qname << ", " << qrname << "): "
221  << "one of those is hypo: FRAMESHIFT" << NcbiEndl;
222  result = true;
223  }
224  else
225  {
226 // not enough frameshift evidence - not frameshift!
227  if(numFit<2)
228  {
229  if(PrintDetails()) NcbiCerr << "Frameshift or not? (" << qname << ", " << qrname << "): "
230  << "numFit < 2: NOT A FRAMESHIFT" << NcbiEndl;
231  result = false;
232  }
233  else
234  {
235 // there are truely exhonerating hits - not frameshift!
236  if(left_perfect.size() || right_perfect.size())
237  {
238  if(PrintDetails()) NcbiCerr << "Frameshift or not? (" << qname << ", " << qrname << "): "
239  << "there are exhonerating hits: NOT A FRAMESHIFT" << NcbiEndl;
240  result = false;
241  }
242 // the rest are frameshifts
243  else
244  {
245  if(PrintDetails()) NcbiCerr << "Frameshift or not? (" << qname << ", " << qrname << "): "
246  << "there are no exhonerating hits: FRAMESHIFT" << NcbiEndl;
247  result = true;
248  }
249  }
250  }
251  }
252  else
253  {
254  if(PrintDetails()) NcbiCerr << "Frameshift or not? (" << qname << ", " << qrname << "): "
255  << "numFit = 0: NOT A FRAMESHIFT" << NcbiEndl;
256  result = false;
257  }
258  if(PrintDetails()) NcbiCerr << "after, left " << GetStringDescr (left) << NcbiEndl;
259  if(PrintDetails()) NcbiCerr << "after, right " << GetStringDescr (right) << NcbiEndl;
260  if(PrintDetails()) NcbiCerr << "perfect hits are : " << left_perfect.size() << " " << right_perfect.size() << NcbiEndl;
261  if(PrintDetails()) NcbiCerr << "final numGI, numFit : " << numGI << " " << numFit << NcbiEndl;
262  if( !result && numFit )
263 // print exhonerating hits to stderr if detailed printing
264  {
265  char bufferchar[2048]; memset(bufferchar, 0, 2048);
266  strstream buffer(bufferchar, 2048);
267  buffer << "Left sequence has " << left_perfect.size() << " perfect hits." << NcbiEndl;
268  ITERATE(vector<perfectHitStr>, hit, left_perfect)
269  {
270  printPerfectHit(*hit, buffer);
271  }
272  buffer << "Right sequence has " << right_perfect.size() << " perfect hits." << NcbiEndl;
273  ITERATE(vector<perfectHitStr>, hit, right_perfect)
274  {
275  printPerfectHit(*hit, buffer);
276  }
277  buffer << "In total, " << numFit << " pairs of hits (out of " << num << ") match for these two sequences." << NcbiEndl;
278  if(PrintDetails()) NcbiCerr << buffer.str();
279  if(PrintDetails()) NcbiCerr << "that is, left sequence (" << qname << ")." << NcbiEndl;
280  }
281  if(result)
282  {
283  addProblems(m_diag[qname].problems, problemsl);
284  addProblems(m_diag[qrname].problems, problemsr);
285  }
286 
287  return result;
288 }
289 
291  (
292  const CBioseq& left,
293  const CBioseq& right,
294  CBioseq::TAnnot::const_iterator& left_annot,
295  CBioseq::TAnnot::const_iterator& right_annot,
296  int left_qLen, int right_qLen,
297  int space, distanceReportStr* report
298  )
299 {
300  bool result=false;
301  // report->q_id_left = CSeq_id::GetStringDescr (left, CSeq_id::eFormat_FastA);
302  // report->q_id_right = CSeq_id::GetStringDescr (right, CSeq_id::eFormat_FastA);
303  report->q_id_left = GetStringDescr (left);
304  report->q_id_right = GetStringDescr (right);
305  report->q_name_left = GetProtName(left);
306  report->q_name_right = GetProtName(right);
307 
308  int left_sLen = getLenScore(left_annot);
309  int right_sLen = getLenScore(right_annot);
310 
311  int left_qFrom, left_qTo, right_qFrom, right_qTo;
312  int left_sFrom, left_sTo, right_sFrom, right_sTo;
313  getBounds(left_annot, &left_qFrom, &left_qTo, &left_sFrom, &left_sTo);
314  getBounds(right_annot, &right_qFrom, &right_qTo, &right_sFrom, &right_sTo);
315 
316  report->q_left_left = left_qFrom-1;
317  report->q_left_middle = left_qTo-left_qFrom+1;
318  report->q_left_right = left_qLen-left_qTo;
319 
320  report->space = space;
321  report->s_name ="cannot get subject name";
322  report->s_name = getAnnotName(left_annot);
323  report->alignment_left = getAnnotComment(left_annot);
324  report->alignment_right = getAnnotComment(right_annot);
325 
326  if(PrintDetails()) NcbiCerr << "fit_blast annot level: " << report->s_name.c_str() << NcbiEndl;
327 
328  report->q_right_left = right_qFrom-1;
329  report->q_right_middle = right_qTo-right_qFrom+1;
330  report->q_right_right = right_qLen-right_qTo;
331 
332  report->s_left_left = left_sFrom-1;
333  report->s_left_middle = left_sTo-left_sFrom+1;
334  report->s_left_right = left_sLen-left_sTo;
335 
336  report->s_right_left = right_sFrom-1;
337  report->s_right_middle = right_sTo-right_sFrom+1;
338  report->s_right_right = right_sLen-right_sTo;
339 
340 
341  int result_left = report->diff_left =
342  (report->s_left_right - report->s_right_right) -
343  (report->q_left_right + report->space + report->q_right_left)
344  ;
345  int result_right = report->diff_right =
346  (report->s_right_left - report->s_left_left ) -
347  (report->q_left_right + report->space + report->q_right_left)
348  ;
349 
350  report->diff_left -= report->q_right_middle;
351  report->diff_right -= report->q_left_middle;
352 
353  report->diff_edge_left = report->s_right_left -
354  (report->s_left_left + report->s_left_middle +
355  report->space + report->q_left_right + report->q_right_left );
356  report->diff_edge_right = report->s_left_right -
357  (report->s_right_right+ report->s_right_middle +
358  report->space + report->q_left_right + report->q_right_left );
359  if(PrintDetails())
360  printReport(report, NcbiCerr);
361 
362 
363 
364  // result = diff<150;
365 // subject's hypothetical or not does not play a role
366  result = result_left > 0 && result_right > 0; // && report->s_name.find("hypothetical")==string::npos;
367  return result;
368 }
370 {
371  bool result;
372  if(PrintDetails()) NcbiCerr << "has_blast_hits?" << NcbiEndl;
373  result=seq.GetAnnot().size()>1;
374  if(PrintDetails()) NcbiCerr << result << NcbiEndl;
375  return result;
376 }
377 
378 
379 
static bool PrintDetails(int current_verbosity=m_current_verbosity)
static string GetProtName(const CBioseq &seq)
Definition: shortcuts.cpp:82
static bool giMatch(const vector< long > &left, const vector< long > &right)
static int getQueryLen(const CBioseq &seq)
Definition: shortcuts.cpp:96
static vector< long > getGIs(CBioseq::TAnnot::const_iterator &annot)
static bool has_blast_hits(const CBioseq &seq)
Definition: fit_blast.cpp:369
static void IncreaseVerbosity(void)
static int collectPerfectHits(vector< perfectHitStr > &perfect, const CBioseq &seq)
static string getAnnotComment(CBioseq::TAnnot::const_iterator &annot)
Definition: shortcuts.cpp:471
static string getAnnotName(CBioseq::TAnnot::const_iterator &annot)
Definition: shortcuts.cpp:459
static bool hasGenomicLocation(const CBioseq &seq)
Definition: locations.cpp:100
static void DecreaseVerbosity(void)
static void getBounds(CBioseq::TAnnot::const_iterator &annot, int *qFrom, int *qTo, int *sFrom, int *sTo)
Definition: shortcuts.cpp:435
static void printPerfectHit(const perfectHitStr &hit, ostream &out=NcbiCout)
static void PopVerbosity(void)
static const CSeq_loc & getGenomicLocation(const CBioseq &seq)
Definition: locations.cpp:120
static void printReport(distanceReportStr *report, ostream &out=NcbiCout)
static int getLenScore(CBioseq::TAnnot::const_iterator &annot)
Definition: shortcuts.cpp:420
static void getFromTo(const CSeq_loc &loc, TSeqPos &from, TSeqPos &to, ENa_strand &strand)
Definition: locations.cpp:34
bool fit_blast(const CBioseq &left, const CBioseq &right, string &common_subject)
Definition: fit_blast.cpp:37
static void PushVerbosity(void)
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NcbiEndl
Definition: ncbistre.hpp:548
#define NcbiCerr
Definition: ncbistre.hpp:544
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
@ eNa_strand_unknown
Definition: Na_strand_.hpp:65
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
const TAnnot & GetAnnot(void) const
Get the Annot member data.
Definition: Bioseq_.hpp:366
TMol GetMol(void) const
Get the Mol member data.
Definition: Seq_inst_.hpp:612
list< CRef< CSeq_annot > > TAnnot
Definition: Bioseq_.hpp:97
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
T max(T x_, T y_)
T min(T x_, T y_)
static pcre_uint8 * buffer
Definition: pcretest.c:1051
int addProblems(list< problemStr > &dest, const list< problemStr > &src)
Definition: problems.cpp:844
@ eFrameShift
string GetStringDescr(const CBioseq &bioseq)
Definition: shortcuts.cpp:309
else result
Definition: token2.c:20
Modified on Mon May 27 04:37:12 2024 by modify_doxy.py rev. 669887