1 /* $Id: cuBlast2Seq.cpp 51593 2011-10-17 14:21:18Z lanczyck $
2  * ===========================================================================
3  *
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Charlie Liu
27  *
28  * File Description:
29  *
30  * Functions to call C++ BLAST
31  *
32  * ===========================================================================
33  */
35 #include <ncbi_pch.hpp>
36 #include <vector>
37 #include <string>
38 #include <list>
40 #include <objects/seq/Bioseq.hpp>
46 //#include <algo/blast/api/bl2seq.hpp>
60 //#include <objtools/simple/simple_om.hpp>
65 #include <objects/seq/NCBIeaa.hpp>
70 BEGIN_SCOPE(cd_utils)
72 const int CdBlaster::CDD_DATABASE_SIZE = 1000000; // database length passed to SetDbLength() for the pairwise BLAST options
74 const double CdBlaster::E_VAL_WHEN_NO_SEQ_ALIGN = 1000000; // e-value reported when BLAST doesn't return a seq-align
75 const double CdBlaster::SCORE_WHEN_NO_SEQ_ALIGN = -1; // score reported when there is no seq-align to read a score from
// NOTE(review): presumably the default database length / sequence count behind
// m_dbSize and m_dbSeqNum; neither constant is referenced in the visible code -- confirm.
77 const long CdBlaster::DEFAULT_NR_SIZE = 1196146007;
78 const int CdBlaster::DEFAULT_NR_SEQNUM = 3479934;
81 : m_ac(&source), m_queryRows(0), m_subjectRows(0), m_seqs(0), m_scoringMatrix(matrixName), m_useWhole(false),
82 m_scoreType(CSeq_align::eScore_Score), m_psiTargetCd(0)
83 {
  // Constructor body for the form that works on 'source': only the address of
  // 'source' is stored (m_ac), so the caller must keep it alive. No query/subject
  // row lists and no explicit sequence list are set, whole-sequence mode is off,
  // and the reported score type defaults to CSeq_align::eScore_Score.
84  //m_offsets.assign(m_ac->GetNumRows(), 0);
  // One batch-size slot per row of the source, all starting at zero.
85  m_batchSizes.assign(m_ac->GetNumRows(), 0);
  // No footprint extension by default.
86  m_nExt = m_cExt = 0;
89 }
91 CdBlaster::CdBlaster(vector< CRef<CBioseq> >& seqs, string matrixName)
92 : m_ac(0), m_seqs(&seqs), m_scoringMatrix(matrixName), m_useWhole(false), m_psiTargetCd(0)
93 {
94  //m_offsets.assign(seqs.size(), 0);
95  m_batchSizes.assign(seqs.size(), 0);
96  m_nExt = m_cExt = 0;
99 }
102 {
  // Stores the caller's choice in m_useWhole. When the flag is set, the BLAST
  // set-up code and the truncation routine use each row's complete Bioseq rather
  // than a truncated copy.
103  m_useWhole = whole;
104 }
106 void CdBlaster::setFootprintExtension(int nExt, int cExt)
107 {
108  m_nExt = nExt;
109  m_cExt = cExt;
110  m_useWhole = false;
111 }
113 // PSI-BLAST support
115 {
  // Keeps the caller-supplied PSSM in m_psiTargetPssm; the PSI-BLAST routine
  // passes that member to CPsiBl2Seq.
116  m_psiTargetPssm = pssm;
117 }
120 {
  // Remembers targetCD, builds a PSSM from it with PssmMaker using the current
  // scoring matrix, stores the result in m_psiTargetPssm and returns it.
121  m_psiTargetCd = targetCD;
122  cd_utils::PssmMaker pm(targetCD,true,true); // 2nd param is useConsensus. generally "true".
123  cd_utils::PssmMakerOptions config;
  // Frequency ratios are requested explicitly; the matrix follows this blaster's setting.
124  config.requestFrequencyRatios = true;
125  config.matrixName = m_scoringMatrix;
126  pm.setOptions(config);
127  m_psiTargetPssm = pm.make();
128  return(m_psiTargetPssm);
129 }
132 {
  // Collects the Bioseqs to compare into m_truncatedBioseqs, then, for every query,
  // BLASTs it against a batch of subjects and hands the hits to processBlastHits().
  // With query and subject row lists set, every query is run against every subject;
  // otherwise row qr is run against rows qr+1 .. nrows-1.
  // Returns false if a Seq-loc could not be built for at least one query.
133  int nrows = 0;
134  int numBlastsDone = 0;
135  int totalBlasts = 0;
136  CRef< CBioseq > bioseqRef;
137  if (m_queryRows && m_subjectRows)
138  {
  // Queries are appended first, then subjects, so subject k ends up at index
  // m_queryRows->size() + k in m_truncatedBioseqs.
139  nrows = m_queryRows->size() + m_subjectRows->size();
140  totalBlasts = m_queryRows->size() * m_subjectRows->size();
141  for (unsigned int q = 0; q < m_queryRows->size(); q++)
142  {
143  if (m_useWhole)
144  {
145  m_ac->GetBioseqForRow((*m_queryRows)[q], bioseqRef);
146  m_truncatedBioseqs.push_back(bioseqRef);
147  }
148  else
150  }
151  for (unsigned int s = 0; s < m_subjectRows->size(); s++)
152  {
153  if (m_useWhole)
154  {
155  m_ac->GetBioseqForRow((*m_subjectRows)[s], bioseqRef);
156  m_truncatedBioseqs.push_back(bioseqRef);
157  }
158  else
160  }
161  }
162  else
163  {
164  nrows = m_ac->GetNumRows();
165  for (int i = 0; i < nrows; i++)
166  {
  // NOTE(review): unlike the two loops above, there is no 'else' before the
  // truncateBioseq() push below, so in whole-sequence mode every row is appended
  // twice. That leaves 2*nrows entries and breaks the qr/sr indexing used further
  // down -- confirm and restore the 'else'.
167  if (m_useWhole)
168  {
169  m_ac->GetBioseqForRow(i, bioseqRef);
170  m_truncatedBioseqs.push_back(bioseqRef);
171  }
172  m_truncatedBioseqs.push_back(truncateBioseq(i));
173  }
  // Number of unordered row pairs: nrows * (nrows - 1) / 2.
174  totalBlasts = (int)((double)nrows * (((double)nrows-1)/2));
175  }
178  options->SetEvalueThreshold(10.0);
179  //options->SetPercentIdentity(10.0);
180  options->SetMatrixName(m_scoringMatrix.c_str());
181  options->SetSegFiltering(false);
182  options->SetDbLength(CDD_DATABASE_SIZE);
183  options->SetHitlistSize(nrows);
184  options->SetDbSeqNum(1);
188  CRef< CSeq_align > nullRef;
189  m_scores.reserve(totalBlasts);
191  // use objmgr interface
194  CScope scope(*objmgr);
196  int numQueries = 0;
197  if (m_queryRows)
198  numQueries = m_queryRows->size();
199  else
  // In all-against-all mode the last row has no later row to be compared with.
200  numQueries = nrows-1;
202  bool result = true;
203  CSeq_loc querySeqLoc, subjectSeqLoc;
204  for (int qr = 0; qr < numQueries; qr++)
205  {
206  CBlastQueryVector queryVector, subjectVector;
  // Start each query with an empty scope; only this batch's Bioseqs are added to it.
207  scope.ResetDataAndHistory();
209  // Set up the QueryFactory for the query sequence
210  scope.AddBioseq(*m_truncatedBioseqs[qr]);
211  if (FillOutSeqLoc(m_truncatedBioseqs[qr], querySeqLoc)) {
212  CRef<CBlastSearchQuery> bsqQuery(new CBlastSearchQuery(querySeqLoc, scope));
213  queryVector.AddQuery(bsqQuery);
214  } else {
  // NOTE(review): skipping the query means nothing is appended to m_scores for
  // this batch, while the composite-index routine computes positions as if every
  // batch were present -- later lookups would then be shifted. Confirm.
215  result = false;
216  m_batchSizes[qr] = 0;
217  continue;
218  }
221 /*
222  CRef< CBioseq > queryBioseq = m_truncatedBioseqs[qr];
223  CRef<IQueryFactory> query(new CObjMgrFree_QueryFactory(queryBioseq));
225  CRef< CBioseq_set > bioseqset(new CBioseq_set);
226  list< CRef< CSeq_entry > >& seqEntryList = bioseqset->SetSeq_set();
227 */
  // Default (all-against-all): subjects are the rows after the query.
228  int subStart = qr +1;
229  int batchSize = (nrows -1) - (qr + 1) + 1;
230  if (m_queryRows)
231  {
  // Query/subject mode: subjects are the entries stored after all the queries.
232  subStart = m_queryRows->size();
233  batchSize = m_subjectRows->size();
234  }
235  //loop for subject rows
236  for (int sr = subStart; sr < nrows; sr++)
237  {
238 /*
239  CRef< CSeq_entry > seqEntry(new CSeq_entry);
240  seqEntry->SetSeq(*m_truncatedBioseqs[sr]);
241  seqEntryList.push_back(seqEntry);
242  comIndex++;
243 */
245  scope.AddBioseq(*m_truncatedBioseqs[sr]);
246  // Set up the QueryFactory for the subject sequences
247  if (FillOutSeqLoc(m_truncatedBioseqs[sr], subjectSeqLoc)) {
248  CRef<CBlastSearchQuery> bsqSubject(new CBlastSearchQuery(subjectSeqLoc, scope));
249  subjectVector.AddQuery(bsqSubject);
250  }
251  }
  // NOTE(review): this assert fires in debug builds in exactly the situation the
  // next line's comment allows for (a subject for which FillOutSeqLoc failed).
253  assert((unsigned)batchSize == subjectVector.Size());
254  m_batchSizes[qr] = subjectVector.Size(); // in case there was a problem w/ FillOutSeqLoc above, use the actual size submitted instead of batchSize
256 // CRef<IQueryFactory> subject(new CObjMgrFree_QueryFactory(bioseqset));
  // NOTE(review): the PSI-BLAST routine in this file keeps the result of Run() in a
  // CRef, whereas here the returned temporary is dereferenced and only a reference
  // is kept. Unless 'blaster' holds its own reference to the result set, 'hits'
  // dangles once this statement ends -- confirm.
258  CPsiBl2Seq blaster(query,subject,blastOptions);
259  CSearchResultSet& hits = *blaster.Run();
260  numBlastsDone += batchSize; // don't use subjectVector.Size() so notifier(...) works normally even if FillOutSeqLoc failed above
  // Progress callback is optional.
261  if (notifier)
262  notifier(numBlastsDone, totalBlasts);
263  processBlastHits(qr, hits);
264  }
265  return result;
266 }
269 {
  // Runs CPsiBl2Seq with the stored PSSM (m_psiTargetPssm) as query against every
  // row/sequence as subject, appends the first seq-align of each result (or an
  // empty CRef when a result has none) to m_alignments, and returns
  // m_alignments.size().
  // NOTE(review): m_alignments is only appended to in the visible code, so a second
  // call would accumulate entries unless it is cleared elsewhere -- confirm.
270  //debug
272  int nrows = 0;
  // Row count comes from the alignment collection when there is one, otherwise
  // from the explicit sequence list.
273  if (m_ac)
274  nrows = m_ac->GetNumRows();
275  else
276  nrows= m_seqs->size();
279  options->SetMatrixName(m_scoringMatrix.c_str());
280  options->SetDbLength(m_dbSize);
281  options->SetDbSeqNum(m_dbSeqNum);
282  options->SetHitlistSize(nrows);
284  options->SetSegFiltering(false);
286  //options->SetEffectiveSearchSpace(27608309120);
287  // debugging
288 // options->SetCompositionBasedStatsMode(true);
290  // use objmgr interface
293  CScope scope(*objmgr);
294  CBlastQueryVector subjectVector;
296 /*
297  CRef< CBioseq_set > bioseqset(new CBioseq_set);
298  list< CRef< CSeq_entry > >& seqEntryList = bioseqset->SetSeq_set();
299 */
300  CSeq_loc subjectSeqLoc;
301  for (int sr = 0; sr < nrows; sr++)
302  {
  // Every subject Bioseq is registered in the scope; it is added to the subject
  // vector only if a Seq-loc could be built for it.
304  scope.AddBioseq(*bs);
305  // Set up the QueryFactory for the subject sequences
306  if (FillOutSeqLoc(bs, subjectSeqLoc)) {
307  CRef<CBlastSearchQuery> bsqSubject(new CBlastSearchQuery(subjectSeqLoc, scope));
308  subjectVector.AddQuery(bsqSubject);
309  }
310 /*
311  // use footprint
313  CRef< CSeq_entry > seqEntry(new CSeq_entry);
314  seqEntry->SetSeq(*truncateBioseq(sr));
315  seqEntryList.push_back(seqEntry);
316 */
317  // use whole sequence
318  /*
319  CRef< CSeq_entry > seqEntry;
320  if (m_ac->GetSeqEntryForRow(sr, seqEntry))
321  seqEntryList.push_back(seqEntry);
322  else
323  bool wrong = true;
324  */
325  }
327 // CRef<IQueryFactory> subject(new CObjMgrFree_QueryFactory(bioseqset));
329  CPsiBl2Seq blaster(m_psiTargetPssm, subject, options);
  // The result set is held in a CRef for as long as it is read below.
330  CRef<CSearchResultSet> hits = blaster.Run();
331  unsigned int index, total = hits->GetNumResults();
332  for (index = 0; index < total; ++index)
333  {
334  const list< CRef< CSeq_align > >& seqAlignList = (*hits)[index].GetSeqAlign()->Get();
  // Keep only the first alignment of each result; an empty CRef marks "no alignment".
335  if (seqAlignList.empty())
336  m_alignments.push_back(CRef< CSeq_align>());
337  else
338  m_alignments.push_back(*(seqAlignList.begin()));
339  }
  // NOTE(review): a subject skipped above because FillOutSeqLoc failed presumably
  // yields fewer results than nrows, which would trip this assert -- confirm.
340  assert (m_alignments.size() == (unsigned) nrows);
341  return m_alignments.size();
342 }
345 {
  // Looks up the stored alignment for the pair (row1, row2); the position in
  // m_alignments comes from getCompositeIndex(). The index is not range-checked.
346  return m_alignments[getCompositeIndex(row1, row2)];
347 }
349 double CdBlaster::getPairwiseScore(int row1, int row2)
350 {
351  return(m_scores[getCompositeIndex(row1, row2)]);
352 }
354 double CdBlaster::getPairwiseEValue(int row1, int row2)
355 {
356  return(m_evals[getCompositeIndex(row1, row2)]);
357 }
360 {
  // Reads the named score "score" from the alignment 'sa'.
  // Returns SCORE_WHEN_NO_SEQ_ALIGN when 'sa' is an empty reference.
361  double score = SCORE_WHEN_NO_SEQ_ALIGN;
363  if (!sa.Empty())
364  {
  // The return value of GetNamedScore() is ignored; 'score' keeps its current
  // value unless the call writes to it.
365  sa->GetNamedScore("score", score);
366  }
367  return score;
368 }
371 {
  // Reads the named score "e_value" from the alignment 'sa'.
  // Returns E_VAL_WHEN_NO_SEQ_ALIGN when 'sa' is an empty reference.
372  double evalue = E_VAL_WHEN_NO_SEQ_ALIGN;
374  if (!sa.Empty())
375  {
  // The return value of GetNamedScore() is ignored; 'evalue' keeps its current
  // value unless the call writes to it.
376  sa->GetNamedScore("e_value", evalue);
377  }
378  return evalue;
379 }
382 {
  // Returns the entry of m_alignments stored at position 'row' (the PSI-BLAST
  // routine appends one entry per result). The index is not range-checked.
383  return m_alignments[row];
384 }
387 {
  // Returns the Bioseq for 'row': the original object in whole-sequence mode,
  // otherwise a copy whose sequence data is cut down to the row's footprint
  // [from, to] after the N-/C-terminal extensions have been applied.
  // An empty CRef is returned when the row's Bioseq cannot be obtained.
388  CRef<CBioseq> bioseq;
389  int len = 0;
390  int from = 0;
391  int to = 0;
393  if (m_ac)
394  {
  // Alignment-collection source: footprint is the row's aligned range.
395  if (!m_ac->GetBioseqForRow(row, bioseq))
396  return bioseq;
397  from = m_ac->GetLowerBound(row);
398  to = m_ac->GetUpperBound(row);
399  }
400  else if (m_seqs)
401  {
402  bioseq = (*m_seqs)[row];
403  if (bioseq.Empty())
404  return bioseq;
  // Default footprint is the whole sequence ...
405  from = 0;
406  to = bioseq->GetInst().GetLength() - 1;
  // ... unless an annotation carries locations whose first Seq-loc is an interval.
  // Every annotation is visited, so the last matching one determines from/to.
407  if (bioseq->IsSetAnnot())
408  {
409  const list< CRef< CSeq_annot > >& annots = bioseq->GetAnnot();
410  for (list< CRef< CSeq_annot > >::const_iterator cit = annots.begin(); cit != annots.end(); cit++)
411  {
412  if ((*cit)->IsSetData())
413  {
414  if ((*cit)->GetData().IsLocs())
415  {
416  const list< CRef< CSeq_loc > >& locs = (*cit)->GetData().GetLocs();
417  if (locs.size() > 0)
418  {
419  CRef< CSeq_loc > seqLoc = *locs.begin();
420  if (seqLoc->IsInt())
421  {
422  from = seqLoc->GetInt().GetFrom();
423  to = seqLoc->GetInt().GetTo();
424  }
425  }
426  }
427  }
428  }
429  }
431  }
  // NOTE(review): if neither m_ac nor m_seqs is set, 'bioseq' is still empty here
  // and is dereferenced on the next line -- confirm this cannot happen.
432  len = bioseq->GetInst().GetLength();
433  if(m_useWhole)
434  return bioseq;
  // Work on a copy so the source Bioseq is left untouched.
435  CRef<CBioseq> tbioseq(new CBioseq);
436  tbioseq->Assign(*bioseq);
438  string seqData;
439  GetNcbieaaString(*bioseq, seqData);
440  ApplyEndShiftToRange(from, m_nExt, to, m_cExt, len);
441  //m_offsets[row] = from; //keep this for using to remap seq-align later.
  // NOTE(review): from/to are not validated after the shift; a crossed range gives
  // a non-positive length, and string::substr throws if 'from' is past the end of
  // seqData -- confirm the inputs rule this out.
442  tbioseq->SetInst().SetLength(to - from + 1);
443  CNCBIeaa tr(seqData.substr(from, to - from + 1));
444  tbioseq->SetInst().SetSeq_data().SetNcbieaa(tr);
445  return tbioseq;
446 }
449 {
  // Appends one score to m_scores for every result in 'hits' (the results of one
  // query, 'queryRow'). A result with no alignment contributes 0.0.
450  double score, idScore;
  // Length of the query sequence as submitted, used to turn idScore into a percentage.
451  int seqLen = m_truncatedBioseqs[queryRow]->GetInst().GetLength();
452  int nhits = hits.GetNumResults();
  // One result is expected per subject submitted for this query.
453  assert (nhits == m_batchSizes[queryRow]);
454  for (int i = 0; i < nhits; i++)
455  {
456  score = 0.0;
457  const list< CRef< CSeq_align > >& seqAlignList = hits[i].GetSeqAlign()->Get();
458  if (seqAlignList.size() > 0)
459  {
  // Only the first alignment of each result is scored.
460  CRef< CSeq_align > sa = ExtractFirstSeqAlign(*(seqAlignList.begin()));
461  if (!sa.Empty()) {
463  {
464  idScore = 0.0;
  // Percentage of the query length; guarded against a zero-length query.
466  if (seqLen != 0) {
467  score = 100*idScore/seqLen;
468  }
469  }
470  else
  // Otherwise the score of the configured type is read straight from the alignment.
471  sa->GetNamedScore(m_scoreType, score);
472  }
473  }
474  m_scores.push_back(score);
475  }
476 }
479 {
  // Maps a (query, subject) pair to its position in the flat per-pair vectors.
  //  - No query rows set: the pairs are laid out as the upper triangle of an
  //    nrows x nrows matrix, row by row, so the arguments are ordered first.
  //  - Query rows set: row-major layout, one row of m_subjectRows->size() entries
  //    per query.
  // NOTE(review): query == subject gives an offset of -1 in the triangular layout
  // (the slot before that query's first pair); presumably callers never ask for
  // it -- confirm.
480  //make sure query < subject, otherwise swap
481  int comp = -1;
482  if (m_queryRows == 0)
483  {
484  int realQuery = query;
485  int realSubject = subject;
486  if (query > subject)
487  {
488  realQuery = subject;
489  realSubject = query;
490  }
  // NOTE(review): m_ac is dereferenced unconditionally, although one constructor
  // sets it to 0 -- confirm this path is not used in that configuration.
491  int nrows = m_ac->GetNumRows();
  // Pairs belonging to rows 0 .. realQuery-1: the sum of (nrows - 1 - i) over those rows.
492  int totalBeforeQuery = (nrows - 1 + nrows - realQuery)*realQuery/2;
493  comp = totalBeforeQuery + (realSubject - realQuery - 1);
494  }
495  else
496  {
497  comp = query * m_subjectRows->size() + subject;
498  }
499  return comp;
500 }
502 // A couple functions to manage extensions at end of aligned range
503 bool CdBlaster::IsFootprintValid(int from, int to, int len) {
505  // The positions on a sequence are offsets from start of sequence and hence
506  // run from [0, length-1]. See Bioseq section of C Toolkit docs.
507  bool result = false;
508  if (from < 0 || to < 0 || len <=0) return result;
509  if (from <= to && to < len && (to - from + 1 >= 0)) {
510  result = true;
511  }
512  return result;
513 }
515 void CdBlaster::ApplyEndShiftToRange(int& from, int nTermShift, int& to, int cTermShift, int len)
516 {
518  // Shift ends of range by indicated amounts: positive values extend, and negative
519  // values shorten.
520  // A full sequence is indicated if from = to = 0, or from = 0 and to = len - 1,
521  // and in this case it should be negative. For a footprint, the positive shift extends
522  // the footprint defined by [from, to] by the shift or to the end, whichever is closer.
523  // If on shortening, the shifts cause a crossing of from/to values, revert to using
524  // the zero shifts.
526  if (nTermShift == 0 && cTermShift == 0) {
527  return;
528  }
529 // nTermShift = (nTermShift < 0) ? -nTermShift : nTermShift;
530 // cTermShift = (cTermShift < 0) ? -cTermShift : cTermShift;
532  // truncate full sequence; if shifts aren't negative, set them to zero.
533  if (from == 0 && (to == 0 || to == len - 1)) {
534  nTermShift = (nTermShift < 0) ? nTermShift : 0;
535  cTermShift = (cTermShift < 0) ? cTermShift : 0;
536  if (-nTermShift < len - 1 + cTermShift ) {
537  from = -nTermShift;
538  to += cTermShift;
539  } else {
540  from = 0;
541  to = len - 1;
542  }
544  } else { // extend or shrink footprint
545  if (nTermShift >= 0) {
546  from = (nTermShift <= from) ? from - nTermShift : 0;
547  } else {
548  from = (-nTermShift < to - 1 + cTermShift) ? from - nTermShift : from;
549  }
550  if (cTermShift >= 0) {
551  to = (cTermShift <= len - 1 - to) ? to + cTermShift : len - 1;
552  } else {
553  to = (-cTermShift <= to - from - 1 + nTermShift) ? to + cTermShift : to;
554  }
555  }
557 }
560 {
  // Turns 'seqLoc' into an interval covering the whole of Bioseq 'bs', identified
  // by the Bioseq's first Seq-id. Returns false when 'bs' is empty or has no first id.
  // Note that seqLoc is switched to an interval starting at 0 before 'bs' is
  // checked, so it is modified even when false is returned.
561  bool result = true;
562  CSeq_interval& seqInt = seqLoc.SetInt();
563  CSeq_id& seqId = seqInt.SetId();
564  seqInt.SetFrom(0);
566  // Assign the first identifier from the bioseq
567  if (bs.NotEmpty() && bs->GetFirstId() != 0) {
  // Inclusive end position of a sequence of GetLength() residues.
568  seqInt.SetTo(bs->GetLength() - 1);
569  seqId.Assign(*(bs->GetFirstId()));
570  } else {
571  result = false;
572  }
574  return result;
575 }
578  int i = 1;
  // Asks the object manager 'om' for the names of all registered data loaders and
  // revokes each of them in turn.
  // NOTE(review): 'i' is incremented once per loader but never read in the visible
  // code -- presumably a leftover debugging counter; confirm before removing.
581  om->GetRegisteredNames(loader_names);
582  ITERATE(CObjectManager::TRegisteredNames, itr, loader_names) {
583  om->RevokeDataLoader(*itr);
584  ++i;
585  }
586 }
589 END_SCOPE(cd_utils)
