$Id: cn3d_blast.cpp 92483 2021-01-26 18:35:06Z grichenk $
National Center for Biotechnology Information
Authors: Paul Thiessen
File Description:
module for aligning with BLAST and related algorithms
34 #include <ncbi_pch.hpp>
35 #include <corelib/ncbistd.hpp>
36 #include <corelib/ncbistr.hpp>
42 #include <objects/seq/Bioseq.hpp>
43 #include <objects/seq/Seq_inst.hpp>
44 #include <objects/seq/Seq_data.hpp>
51 #include "cn3d_blast.hpp"
53 #include "cn3d_pssm.hpp"
54 #include "sequence_set.hpp"
55 #include "cn3d_tools.hpp"
56 #include "structure_set.hpp"
57 #include "molecule_identifier.hpp"
58 #include "asn_reader.hpp"
// Reference-counted record describing the region of a sequence that is handed to BLAST.
// NOTE(review): original lines 69-70 are absent from this listing; other code in this file
// reads ts->originalFullSequence and ts->truncatedSequence, so those members are presumably
// declared there - confirm against the full file.
66 class TruncatedSequence : public CObject
67 {
68 public:
// inclusive, zero-based bounds of the selected region within the full sequence
71  int fromIndex, toIndex;
72 };
// list of reference-counted TruncatedSequence objects
74 typedef vector < CRef < TruncatedSequence > > TruncatedSequences;
// Builds a truncated sequence record (`ts`) for one sequence of `pair` and returns it.
//   pair      - pairwise alignment supplying the sequence (its master, or its row 1)
//   alnNum    - integer stored as the local Seq-id of the generated Bioseq
//   isMaster  - true: truncate pair's master; false: truncate the sequence of row 1
//   extension - residues added on each side of the aligned footprint of `multiple`
//               (only consulted in the master branch)
// The chosen inclusive range [fromIndex, toIndex] is copied, converted to NCBIstdaa
// codes, into a new raw amino-acid Bioseq held in ts->truncatedSequence.
// NOTE(review): this listing omits the first line of the signature (original line 76) and
// original lines 79-80, 83, 85-86 and 116; `ts`, `multiple` and `uaBlocks` are presumably
// declared there - confirm against the full file.
77  const BlockMultipleAlignment *pair, int alnNum, bool isMaster, int extension)
78 {
81  // master sequence (only used for blast-two-sequences)
82  if (isMaster) {
84  ts->originalFullSequence = pair->GetMaster();
87  // use alignMasterTo/From if present and reasonable
// "reasonable" here means: both bounds lie inside the sequence and from <= to
88  if (pair->alignMasterFrom >= 0 && pair->alignMasterFrom < (int)ts->originalFullSequence->Length() &&
89  pair->alignMasterTo >= 0 && pair->alignMasterTo < (int)ts->originalFullSequence->Length() &&
90  pair->alignMasterFrom <= pair->alignMasterTo)
91  {
92  ts->fromIndex = pair->alignMasterFrom;
93  ts->toIndex = pair->alignMasterTo;
94  }
96  // use aligned footprint + extension if multiple has any aligned blocks
97  else if (multiple && multiple->GetUngappedAlignedBlocks(&uaBlocks) > 0)
98  {
// footprint runs from the start of the first block to the end of the last block on row 0,
// widened by `extension` and clamped to the bounds of the sequence
99  ts->fromIndex = uaBlocks.front()->GetRangeOfRow(0)->from - extension;
100  if (ts->fromIndex < 0)
101  ts->fromIndex = 0;
102  ts->toIndex = uaBlocks.back()->GetRangeOfRow(0)->to + extension;
103  if (ts->toIndex >= (int)ts->originalFullSequence->Length())
104  ts->toIndex = ts->originalFullSequence->Length() - 1;
105  }
107  // otherwise, just use the whole sequence
108  else {
109  ts->fromIndex = 0;
110  ts->toIndex = ts->originalFullSequence->Length() - 1;
111  }
112  }
114  // dependent sequence
115  else {
117  ts->originalFullSequence = pair->GetSequenceOfRow(1);
119  // use alignDependentTo/From if present and reasonable
120  if (pair->alignDependentFrom >= 0 && pair->alignDependentFrom < (int)ts->originalFullSequence->Length() &&
121  pair->alignDependentTo >= 0 && pair->alignDependentTo < (int)ts->originalFullSequence->Length() &&
122  pair->alignDependentFrom <= pair->alignDependentTo)
123  {
124  ts->fromIndex = pair->alignDependentFrom;
125  ts->toIndex = pair->alignDependentTo;
126  }
128  // otherwise, just use the whole sequence
129  else {
130  ts->fromIndex = 0;
131  ts->toIndex = ts->originalFullSequence->Length() - 1;
132  }
133  }
135  // create new Bioseq (contained in a Seq-entry) with the truncated sequence
136  ts->truncatedSequence.Reset(new CSeq_entry);
137  CBioseq& bioseq = ts->truncatedSequence->SetSeq();
138  CRef < CSeq_id > id(new CSeq_id);
139  id->SetLocal().SetId(alnNum);
140  bioseq.SetId().push_back(id);
141  bioseq.SetInst().SetRepr(CSeq_inst::eRepr_raw);
142  bioseq.SetInst().SetMol(CSeq_inst::eMol_aa);
143  bioseq.SetInst().SetLength(ts->toIndex - ts->fromIndex + 1);
// the trace message reports the range one-based, hence the +1 on both bounds
144  TRACEMSG("truncated " << ts->originalFullSequence->identifier->ToString()
145  << " from " << (ts->fromIndex+1) << " to " << (ts->toIndex+1) << "; length " << bioseq.GetInst().GetLength());
146  bioseq.SetInst().SetSeq_data().SetNcbistdaa().Set().resize(ts->toIndex - ts->fromIndex + 1);
// copy the selected residues, translating each character to its NCBIstdaa number
147  for (int j=ts->fromIndex; j<=ts->toIndex; ++j)
148  bioseq.SetInst().SetSeq_data().SetNcbistdaa().Set()[j - ts->fromIndex] =
149  LookupNCBIStdaaNumberFromCharacter(ts->originalFullSequence->sequenceString[j]);
151  return ts;
152 }
154 static inline bool IsLocalID(const CSeq_id& sid, int localID)
155 {
156  return (sid.IsLocal() && (
157  (sid.GetLocal().IsStr() && sid.GetLocal().GetStr() == NStr::IntToString(localID)) ||
158  (sid.GetLocal().IsId() && sid.GetLocal().GetId() == localID)));
159 }
161 /*
162 static inline bool GetLocalID(const CSeq_id& sid, int *localID)
163 {
164  *localID = kMin_Int;
165  if (!sid.IsLocal())
166  return false;
167  if (sid.GetLocal().IsId())
168  *localID = sid.GetLocal().GetId();
169  else try {
170  *localID = NStr::StringToInt(sid.GetLocal().GetStr());
171  } catch (...) {
172  return false;
173  }
174  return true;
175 }
176 */
178 static inline bool SeqIdMatchesMaster(const CSeq_id& sid, bool usePSSM)
179 {
180  // if blast-sequence-vs-pssm, master will be consensus
181  if (usePSSM)
182  return (sid.IsLocal() && sid.GetLocal().IsStr() && sid.GetLocal().GetStr() == "consensus");
184  // if blast-two-sequences, master will be local id -1
185  else
186  return IsLocalID(sid, -1);
187 }
189 static void MapBlockFromConsensusToMaster(int consensusStart, int dependentStart, int length,
190  BlockMultipleAlignment *newAlignment, const BlockMultipleAlignment *multiple)
191 {
192  // get mapping of each position of consensus -> master on this block
193  vector < int > masterLoc(length);
194  int i;
195  for (i=0; i<length; ++i)
196  masterLoc[i] = multiple->GetPSSM().MapConsensusToMaster(consensusStart + i);
198  UngappedAlignedBlock *subBlock = NULL;
199  for (i=0; i<length; ++i) {
201  // is this the start of a sub-block?
202  if (!subBlock && masterLoc[i] >= 0) {
203  subBlock = new UngappedAlignedBlock(newAlignment);
204  subBlock->SetRangeOfRow(0, masterLoc[i], masterLoc[i]);
205  subBlock->SetRangeOfRow(1, dependentStart + i, dependentStart + i);
206  subBlock->width = 1;
207  }
209  // continue existing sub-block
210  if (subBlock) {
212  // is this the end of a sub-block?
213  if (i == length - 1 || // last position of block
214  masterLoc[i + 1] < 0 || // next position is unmapped
215  masterLoc[i + 1] != masterLoc[i] + 1) // next position is discontinuous
216  {
217  newAlignment->AddAlignedBlockAtEnd(subBlock);
218  subBlock = NULL;
219  }
221  // extend block by one
222  else {
223  const Block::Range *range = subBlock->GetRangeOfRow(0);
224  subBlock->SetRangeOfRow(0, range->from, range->to + 1);
225  range = subBlock->GetRangeOfRow(1);
226  subBlock->SetRangeOfRow(1, range->from, range->to + 1);
227  ++(subBlock->width);
228  }
229  }
230  }
232  if (subBlock)
233  ERRORMSG("MapBlockFromConsensusToMaster() - unterminated sub-block");
234 }
// Revokes every data loader whose name GetRegisteredNames() reports for the object manager `om`.
// NOTE(review): original lines 237-238 are absent from this listing; `om` and `loader_names`
// are presumably declared there - confirm against the full file.
236 static void RemoveAllDataLoaders() {
239  om->GetRegisteredNames(loader_names);
240  ITERATE(CObjectManager::TRegisteredNames, itr, loader_names) {
241  om->RevokeDataLoader(*itr);
242  }
243 }
245 static bool SimpleSeqLocFromBioseq(const CRef< CBioseq>& bs, CSeq_loc& seqLoc)
246 {
247  bool result = true;
248  CSeq_interval& seqInt = seqLoc.SetInt();
249  CSeq_id& seqId = seqInt.SetId();
250  seqInt.SetFrom(0);
252  // Assign the first identifier from the bioseq
253  if (bs.NotEmpty() && bs->GetFirstId() != 0) {
254  seqInt.SetTo(bs->GetLength() - 1);
255  seqId.Assign(*(bs->GetFirstId()));
256  } else {
257  result = false;
258  }
260  return result;
261 }
// Realigns each pairwise alignment in toRealign with BLAST and appends the resulting
// alignments to newAlignments (which is cleared first).
//   toRealign     - pairwise (two-row) alignments that must all share one master
//   newAlignments - output list; receives raw pointers released from a unique_ptr
//   usePSSM       - true: search each subject against the PSSM of `multiple`;
//                   false: blast-two-sequences of the master against a single subject
// Returns early, after an ERRORMSG, when usePSSM is set but `multiple` is NULL or has no
// aligned blocks, when !usePSSM and more than one alignment is given, when the masters
// differ, when an input does not have exactly two rows, or when BLAST does not return one
// result per input. Returns silently when toRealign is empty. Exceptions derived from
// `exception` are caught and reported through ERRORMSG.
// NOTE(review): this listing omits the first line of the signature (original line 263) and
// a number of interior lines (e.g. 282, 286-287, 327-328, 346, 350, 352, 372, 392, 397);
// `multiple`, `objmgr`, `pssmQuery`, `pssmOptions`, `sequenceOptions`, `masterTS` and `seqs`
// are presumably declared there - confirm against the full file.
264  const AlignmentList& toRealign, AlignmentList *newAlignments, bool usePSSM)
265 {
266  newAlignments->clear();
267  if (usePSSM && (!multiple || multiple->HasNoAlignedBlocks())) {
268  ERRORMSG("usePSSM true, but NULL or zero-aligned block multiple alignment");
269  return;
270  }
271  if (!usePSSM && toRealign.size() > 1) {
272  ERRORMSG("CreateNewPairwiseAlignmentsByBlast() - currently can only do single blast-2-sequences at a time");
273  return;
274  }
275  if (toRealign.size() == 0)
276  return;
278  try {
279  const Sequence *master = (multiple ? multiple->GetMaster() : NULL);
281  int extension = 0;
283  WARNINGMSG("Can't get footprint residue extension from registry");
285  // Make sure object manager loads only data from our alignment object.
288  CScope scope(*objmgr);
289  CRef< CBioseq > queryBioseq, subjectBioseq;
290  CRef<CSeq_loc> querySeqLoc(new CSeq_loc);
291  blast::CBlastQueryVector queryVector, subjectVector;
292  scope.ResetDataAndHistory();
294  // collect subject(s) - second sequence of each realignment
295  TruncatedSequences subjectTSs;
// localID is passed to CreateTruncatedSequence as alnNum for each subject and is later
// compared, via IsLocalID, with the subject id found in the returned Dense-seg
296  int localID = 0;
297  AlignmentList::const_iterator a, ae = toRealign.end();
298  for (a=toRealign.begin(); a!=ae; ++a, ++localID) {
// when no multiple alignment was given, the first input's master becomes the reference
299  if (!master)
300  master = (*a)->GetMaster();
301  if ((*a)->GetMaster() != master) {
302  ERRORMSG("CreateNewPairwiseAlignmentsByBlast() - all masters must be the same");
303  return;
304  }
305  if ((*a)->NRows() != 2) {
306  ERRORMSG("CreateNewPairwiseAlignmentsByBlast() - can only realign pairwise alignments");
307  return;
308  }
309  subjectTSs.push_back(CreateTruncatedSequence(multiple, *a, localID, false, extension));
311  CRef< CSeq_loc > subjectSeqLoc(new CSeq_loc);
312  subjectBioseq = &(subjectTSs.back()->truncatedSequence->SetSeq());
313  scope.AddBioseq(*subjectBioseq);
314  // Set up the QueryFactory for the subject sequences
315  if (SimpleSeqLocFromBioseq(subjectBioseq, *subjectSeqLoc)) {
316  CRef< blast::CBlastSearchQuery > bsqSubject(new blast::CBlastSearchQuery(*subjectSeqLoc, scope));
317  subjectVector.AddQuery(bsqSubject);
318  }
320  }
321  CRef < blast::IQueryFactory > sequenceSubjects(new blast::CObjMgr_QueryFactory(subjectVector));
323  // main blast engine
324  CRef < blast::CPsiBl2Seq > blastEngine;
326  // setup searches: blast-sequence-vs-pssm
329  if (usePSSM) {
330  pssmQuery.Reset(new CPssmWithParameters);
331  pssmQuery->Assign(multiple->GetPSSM().GetPSSM());
332  pssmOptions.Reset(new blast::CPSIBlastOptionsHandle);
334  // NR stats at 3/21/2006
335  pssmOptions->SetDbLength(1196146007);
336  pssmOptions->SetDbSeqNum(3479934);
337  pssmOptions->SetHitlistSize(subjectTSs.size());
338  pssmOptions->SetMatrixName("BLOSUM62");
339  pssmOptions->SetCompositionBasedStats(eCompositionBasedStats);
340  pssmOptions->SetSegFiltering(false);
342  blastEngine.Reset(new
343  blast::CPsiBl2Seq(
344  pssmQuery,
345  sequenceSubjects,
347  }
349  // setup searches: blast-two-sequences
351  CRef < blast::IQueryFactory > sequenceQuery;
353  if (!usePSSM) {
// the master is truncated with local id -1, the id SeqIdMatchesMaster() checks for
354  masterTS = CreateTruncatedSequence(multiple, toRealign.front(), -1, true, extension);
356  // Set up a QueryFactory for the query sequence
357  queryBioseq = &(masterTS->truncatedSequence->SetSeq());
358  scope.AddBioseq(*queryBioseq);
359  if (SimpleSeqLocFromBioseq(queryBioseq, *querySeqLoc)) {
360  CRef< blast::CBlastSearchQuery> bsqQuery(new blast::CBlastSearchQuery(*querySeqLoc, scope));
361  queryVector.AddQuery(bsqQuery);
362  }
363  sequenceQuery.Reset(new blast::CObjMgr_QueryFactory(queryVector));
365  sequenceOptions.Reset(new blast::CBlastProteinOptionsHandle);
366  sequenceOptions->SetMatrixName("BLOSUM62");
367  sequenceOptions->SetHitlistSize(subjectTSs.size());
368  blastEngine.Reset(new
369  blast::CPsiBl2Seq(
370  sequenceQuery,
371  sequenceSubjects,
373  }
375  // actually do the alignment(s)
376  CRef < blast::CSearchResultSet > results(blastEngine->Run());
378  // parse the alignments
379  if (results->size() != toRealign.size())
380  {
381  ERRORMSG("CreateNewPairwiseAlignmentsByBlast() - did not get one result alignment per input sequence");
382  return;
383  }
// results are matched to subjects by position: result i belongs to subjectTSs[localID]
385  localID = 0;
386  for (unsigned int i=0; i<results->size(); ++i, ++localID) {
388 // string err;
389 // WriteASNToFile("Seq-align-set.txt", (*results)[i].GetSeqAlign().GetObject(), false, &err);
391  // create new alignment structure
393  (*seqs)[0] = master;
394  (*seqs)[1] = subjectTSs[localID]->originalFullSequence;
395  string dependentTitle = subjectTSs[localID]->originalFullSequence->identifier->ToString();
396  unique_ptr < BlockMultipleAlignment > newAlignment(
// kMax_Double stays in both rows unless an E-value is found in the scores below
398  newAlignment->SetRowDouble(0, kMax_Double);
399  newAlignment->SetRowDouble(1, kMax_Double);
401  // check for valid or empty alignment
402  if (!((*results)[i].HasAlignments())) {
403  WARNINGMSG("BLAST did not find a significant alignment for "
404  << dependentTitle << " with " << (usePSSM ? string("PSSM") : master->identifier->ToString()));
405  } else {
407  // get Seq-align; use first one for this result, which assumes blast returns the highest scoring alignment first
408  const CSeq_align& sa = (*results)[i].GetSeqAlign()->Get().front().GetObject();
410  if (!sa.IsSetDim() || sa.GetDim() != 2 || sa.GetType() != CSeq_align::eType_partial) {
411  ERRORMSG("CreateNewPairwiseAlignmentsByBlast() - returned alignment not in expected format (dim 2, partial)");
412  } else if (sa.GetSegs().IsDenseg()) {
414  // unpack Dense-seg
415  const CDense_seg& ds = sa.GetSegs().GetDenseg();
// sanity checks: two rows, one length per segment, two starts per segment
416  if (!ds.IsSetDim() || ds.GetDim() != 2 || ds.GetIds().size() != 2 ||
417  (int)ds.GetLens().size() != ds.GetNumseg() || (int)ds.GetStarts().size() != 2 * ds.GetNumseg()) {
418  ERRORMSG("CreateNewPairwiseAlignmentsByBlast() - returned alignment format error (denseg dims)");
419  } else if (!SeqIdMatchesMaster(ds.GetIds().front().GetObject(), usePSSM) ||
420  !IsLocalID(ds.GetIds().back().GetObject(), localID)) {
421  ERRORMSG("CreateNewPairwiseAlignmentsByBlast() - returned alignment format error (ids)");
422  } else {
424  // unpack segs
425  CDense_seg::TStarts::const_iterator s = ds.GetStarts().begin();
426  CDense_seg::TLens::const_iterator l, le = ds.GetLens().end();
427  for (l=ds.GetLens().begin(); l!=le; ++l) {
// starts are read in pairs per segment: master (query) first, then dependent (subject)
428  int masterStart = *(s++), dependentStart = *(s++);
429  if (masterStart >= 0 && dependentStart >= 0) { // skip gaps
// BLAST saw only the truncated subject, so shift back by the truncation offset
430  dependentStart += subjectTSs[localID]->fromIndex;
432  if (usePSSM) {
433  MapBlockFromConsensusToMaster(masterStart, dependentStart, *l, newAlignment.get(), multiple);
434  } else {
435  masterStart += masterTS->fromIndex;
436  UngappedAlignedBlock *newBlock = new UngappedAlignedBlock(newAlignment.get());
437  newBlock->SetRangeOfRow(0, masterStart, masterStart + (*l) - 1);
438  newBlock->SetRangeOfRow(1, dependentStart, dependentStart + (*l) - 1);
439  newBlock->width = *l;
440  newAlignment->AddAlignedBlockAtEnd(newBlock);
441  }
442  }
443  }
444  }
446  } else {
447  ERRORMSG("CreateNewPairwiseAlignmentsByBlast() - returned alignment in unrecognized format");
448  }
450  // unpack score
451  if (!sa.IsSetScore() || sa.GetScore().size() == 0) {
452  WARNINGMSG("BLAST did not return an alignment score for " << dependentTitle);
453  } else {
454  CNcbiOstrstream oss;
455  oss << "BLAST result scores for " << dependentTitle << " vs. "
456  << (usePSSM ? string("PSSM") : master->identifier->ToString()) << ':';
458  bool haveE = false;
459  CSeq_align::TScore::const_iterator sc, sce = sa.GetScore().end();
460  for (sc=sa.GetScore().begin(); sc!=sce; ++sc) {
461  if ((*sc)->IsSetId() && (*sc)->GetId().IsStr()) {
463  // E-value (put in status line and double values)
464  if ((*sc)->GetValue().IsReal() && (*sc)->GetId().GetStr() == "e_value") {
465  haveE = true;
466  newAlignment->SetRowDouble(0, (*sc)->GetValue().GetReal());
467  newAlignment->SetRowDouble(1, (*sc)->GetValue().GetReal());
468  string status = string("E-value: ") + NStr::DoubleToString((*sc)->GetValue().GetReal());
469  newAlignment->SetRowStatusLine(0, status);
470  newAlignment->SetRowStatusLine(1, status);
471  oss << ' ' << status;
472  }
474  // raw score
475  else if ((*sc)->GetValue().IsInt() && (*sc)->GetId().GetStr() == "score") {
476  oss << " raw: " << (*sc)->GetValue().GetInt();
477  }
479  // bit score
480  else if ((*sc)->GetValue().IsReal() && (*sc)->GetId().GetStr() == "bit_score") {
481  oss << " bit score: " << (*sc)->GetValue().GetReal();
482  }
483  }
484  }
486  INFOMSG((string) CNcbiOstrstreamToString(oss));
487  if (!haveE)
488  WARNINGMSG("BLAST did not return an E-value for " << dependentTitle);
489  }
490  }
492  // finalize and add new alignment to list
// release() hands the raw pointer to newAlignments; on failure the unique_ptr still owns it
493  if (newAlignment->AddUnalignedBlocks() && newAlignment->UpdateBlockMapAndColors(false))
494  newAlignments->push_back(newAlignment.release());
495  else
496  ERRORMSG("error finalizing alignment");
497  }
499  } catch (exception& e) {
500  ERRORMSG("CreateNewPairwiseAlignmentsByBlast() failed with exception: " << e.what());
501  }
502 }
// Scores every row of `multiple` against the alignment's own PSSM ("self hit").
// For each row, a pairwise (master, row) alignment restricted to the row's aligned footprint
// plus `extension` residues on each side is built, and all of them are passed to
// CreateNewPairwiseAlignmentsByBlast() with usePSSM = true. RowDouble(1) of each result is
// then stored as the score of the corresponding row of `multiple`, together with a status
// line, and the proportion of rows at or below the E-value threshold is logged.
// NOTE(review): this listing omits the function header (original line 504) and original
// lines 512, 514-515, 526, 540, 542 and 544; `multiple`, `uaBlocks`, `seqs` and `results`
// are presumably declared there, and any cleanup of rowPairs/results is not visible here -
// confirm against the full file.
505 {
506  if (!multiple) {
507  ERRORMSG("NULL multiple alignment");
508  return;
509  }
511  int extension = 0;
513  WARNINGMSG("Can't get footprint residue extension from registry");
516  multiple->GetUngappedAlignedBlocks(&uaBlocks);
517  if (uaBlocks.size() == 0) {
518  ERRORMSG("Can't calculate self-hits with no aligned blocks");
519  return;
520  }
522  // do BLAST-vs-pssm on all rows, using footprint for each row
523  AlignmentList rowPairs;
524  unsigned int row;
525  for (row=0; row<multiple->NRows(); ++row) {
527  (*seqs)[0] = multiple->GetMaster();
528  (*seqs)[1] = multiple->GetSequenceOfRow(row);
529  BlockMultipleAlignment *newAlignment = new BlockMultipleAlignment(seqs, multiple->GetMaster()->parentSet->alignmentManager);
// footprint: start of the first aligned block to end of the last one on this row,
// widened by `extension` and clamped to the bounds of the row's sequence
530  const Block::Range *range = uaBlocks.front()->GetRangeOfRow(row);
531  newAlignment->alignDependentFrom = range->from - extension;
532  if (newAlignment->alignDependentFrom < 0)
533  newAlignment->alignDependentFrom = 0;
534  range = uaBlocks.back()->GetRangeOfRow(row);
535  newAlignment->alignDependentTo = range->to + extension;
536  if (newAlignment->alignDependentTo >= (int)multiple->GetSequenceOfRow(row)->Length())
537  newAlignment->alignDependentTo = multiple->GetSequenceOfRow(row)->Length() - 1;
538  rowPairs.push_back(newAlignment);
539  }
541  CreateNewPairwiseAlignmentsByBlast(multiple, rowPairs, &results, true);
// results are consumed positionally below, so there must be exactly one per row
543  if (results.size() != multiple->NRows()) {
545  ERRORMSG("CalculateSelfHitScores() - CreateNewPairwiseAlignmentsByBlast() didn't return right # alignments");
546  return;
547  }
549  // extract scores, assumes E-value is in RowDouble
550  AlignmentList::const_iterator r = results.begin();
551  for (row=0; row<multiple->NRows(); ++row, ++r) {
552  double score = (*r)->GetRowDouble(1);
553  multiple->SetRowDouble(row, score);
554  string status;
// a negative score or kMax_Double is reported as "no self hit"
555  if (score >= 0.0 && score < kMax_Double)
556  status = string("Self hit E-value: ") + NStr::DoubleToString(score);
557  else
558  status = "No detectable self hit";
559  multiple->SetRowStatusLine(row, status);
560  }
563  // print out overall self-hit rate
564  static const double threshold = 0.01;
565  unsigned int nSelfHits = 0;
566  for (row=0; row<multiple->NRows(); ++row) {
567  if (multiple->GetRowDouble(row) >= 0.0 && multiple->GetRowDouble(row) <= threshold)
568  ++nSelfHits;
569  }
// precision is set to 3 for this message and then set to 6 at the end of it
570  INFOMSG("Self hits with E-value <= " << setprecision(3) << threshold << ": "
571  << (100.0*nSelfHits/multiple->NRows()) << "% ("
572  << nSelfHits << '/' << multiple->NRows() << ')' << setprecision(6));
573 }
575 double GetStandardProbability(char ch)
576 {
577  typedef map < char, double > CharDoubleMap;
578  static CharDoubleMap standardProbabilities;
580  if (standardProbabilities.size() == 0) { // initialize static stuff
581  if (BLASTAA_SIZE != 28) {
582  ERRORMSG("GetStandardProbability() - confused by BLASTAA_SIZE != 28");
583  return 0.0;
584  }
585  double *probs = BLAST_GetStandardAaProbabilities();
586  for (unsigned int i=0; i<28; ++i) {
587  standardProbabilities[LookupCharacterFromNCBIStdaaNumber(i)] = probs[i];
588 // TRACEMSG("standard probability " << LookupCharacterFromNCBIStdaaNumber(i) << " : " << probs[i]);
589  }
590  sfree(probs);
591  }
593  CharDoubleMap::const_iterator f = standardProbabilities.find(toupper((unsigned char) ch));
594  if (f != standardProbabilities.end())
595  return f->second;
596  WARNINGMSG("GetStandardProbability() - unknown residue character " << ch);
597  return 0.0;
598 }
