36 #include <ncbi_pch.hpp>
43 #include <algo/cobalt/cobalt.hpp>
51 #include <serial/iterator.hpp>
53 #include <algorithm>
55 /// @file rps.cpp
56 /// Use RPS blast to find domain hits
59 BEGIN_SCOPE(cobalt)
61 USING_SCOPE(blast);
64 /// Given an RPS blast database, load a list of block offsets
65 /// for each database sequence. The list is resident in a text
66 /// file, where each line is as follows
67 /// <pre>
68 /// [seq ID] [oid of block] [start block offset] [end block offset]
69 /// </pre>
70 /// Note that block offsets are zero-based
71 /// @param blockfile Name of file containing list of offsets [in]
72 /// @param blocklist the list of offsets read from file [out]
73 ///
74 void
75 CMultiAligner::x_LoadBlockBoundaries(string blockfile,
76  vector<SSegmentLoc>& blocklist)
77 {
78  CNcbiIfstream blockstream(blockfile.c_str());
79  if (blockstream.bad() ||
80  NCBI_THROW(CBlastException, eInvalidArgument,
81  "Cannot open RPS blockfile");
83  char buf[64];
85  int oid = 0;
86  int block_idx;
87  int start, end;
89  blockstream >> buf;
90  blockstream >> block_idx;
91  blockstream >> start;
92  blockstream >> end;
93  blocklist.push_back(SSegmentLoc(oid, start, end));
95  while (!blockstream.eof()) {
96  blockstream >> buf;
97  // This allows for new line at the end of block file
98  if (blockstream.eof()) {
99  break;
100  }
101  blockstream >> block_idx;
102  blockstream >> start;
103  blockstream >> end;
105  if (block_idx == 0)
106  oid++;
108  blocklist.push_back(SSegmentLoc(oid, start, end));
109  }
110 }
/// Realign the blocks of each RPS hit to its query sequence and attach
/// the resulting block alignments to the hit as sub-hits. Hits that are
/// rejected by the length test below, or that end up without any
/// sub-hit, are flagged with SetKeepHit(i, false) and removed by
/// PurgeUnwantedHits() at the end.
/// NOTE(review): several lines of this function are missing from this
/// listing, including the line carrying its name and first parameter
/// (used in the body as `rps_hits`), the declaration of `query`, part of
/// the GetScore() call and the interrupt check. The call site elsewhere
/// in this file invokes x_RealignBlocks(hit list, blocklist, profile_data).
/// @param blocklist block locations; looked up with lower_bound() and
///   compare_sseg_db_idx(), so presumably sorted by database sequence
///   index -- TODO confirm [in]
/// @param profile_data supplies the PSSM rows and the per-sequence
///   offsets into them [in]
113 void
115  vector<SSegmentLoc>& blocklist,
116  CProfileData& profile_data)
117 {
118  // scale up the gap penalties used by the aligner, to match
119  // the scaling used by the RPS PSSMs
121  /// @todo FIXME the scale factor should be chosen dynamically
129  m_Aligner.SetEndSpaceFree(false, false, false, false);
131  // for each RPS hit
133  for (int i = 0; i < rps_hits.Size(); i++) {
135  CHit *hit = rps_hits.GetHit(i);
 // pssm points at the first PSSM row of the hit's database sequence;
 // its length is the distance to the next sequence's offset
137  int db_seq = hit->m_SeqIndex2;
138  int *db_seq_offsets = profile_data.GetSeqOffsets();
139  int **pssm = profile_data.GetPssm() + db_seq_offsets[db_seq];
140  int db_seq_length = db_seq_offsets[db_seq + 1] - db_seq_offsets[db_seq];
141  int last_fudge = 0;
143  _ASSERT(!(hit->HasSubHits()));
145  // ignore this alignment if its extent is less than
146  // 60% of the extent of query and DB sequence
 // (both conditions must hold for the hit to be dropped)
148  if ((hit->m_SeqRange1.GetLength() < 0.6 * query.GetLength()) &&
149  (hit->m_SeqRange2.GetLength() < 0.6 * db_seq_length)) {
150  rps_hits.SetKeepHit(i, false);
151  continue;
152  }
154  SSegmentLoc target(db_seq, hit->m_SeqRange2.GetFrom(),
155  hit->m_SeqRange2.GetTo());
157  // locate the first block in the subject sequence
158  // that contains a piece of the HSP
160  vector<SSegmentLoc>::iterator
161  itr = lower_bound(blocklist.begin(), blocklist.end(),
162  target, compare_sseg_db_idx());
164  _ASSERT(itr != blocklist.end() &&
165  target.seq_index == itr->seq_index);
167  // walk up to the first block that is not
168  // in front of the alignment
170  while (itr != blocklist.end() &&
171  itr->seq_index == target.seq_index &&
172  itr->GetTo() < target.GetFrom()) {
173  itr++;
174  }
 // prev_itr / next_itr track the blocks on either side of itr
176  vector<SSegmentLoc>::iterator prev_itr(itr);
177  vector<SSegmentLoc>::iterator next_itr(itr);
178  if (itr != blocklist.begin()) {
179  prev_itr--;
180  }
 // NOTE(review): the walk above can leave itr at blocklist.end(), in
 // which case this increments an end iterator -- confirm that cannot
 // happen for valid block files
181  next_itr++;
183  // for each block that contains a portion of the
184  // original alignment
186  while (itr != blocklist.end() && itr->seq_index == db_seq
187  && itr->GetFrom() < target.GetTo()) {
189  const int kMaxFudge = 6;
190  TRange q_range, new_s_range;
191  TRange tback_range;
193  // calculate the offsets into the subject sequence
194  // that correspond to the current block
196  TRange s_range(itr->range.IntersectionWith(target.range));
197  _ASSERT(!s_range.Empty() && itr->range.Contains(s_range));
199  int left_fudge, right_fudge;
201  // calculate how much extra room on the query sequence
202  // to allow for realignment. The size of the 'fudge'
203  // to the left is the difference between the length of
204  // the loop region to the left and the length of the
205  // previous fudge, up to a limit of kMaxFudge
207  if (itr == blocklist.begin() ||
208  prev_itr == blocklist.begin() ||
209  prev_itr->seq_index != db_seq) {
210  left_fudge = 0;
211  }
212  else {
213  left_fudge = s_range.GetFrom() -
214  prev_itr->GetTo() - last_fudge - 1;
215  left_fudge = min(left_fudge, kMaxFudge);
216  }
218  // The extra room on the right is half the length
219  // of the loop region to the right, up to the same limit
221  if (itr == blocklist.end() ||
222  next_itr == blocklist.end() ||
223  next_itr->seq_index != db_seq) {
224  right_fudge = 0;
225  }
226  else {
227  right_fudge = (next_itr->GetFrom() - s_range.GetTo() - 1) / 2;
228  right_fudge = min(right_fudge, kMaxFudge);
229  }
231  last_fudge = right_fudge;
233  // compute the start and stop offsets into the
234  // query sequence that correspond to the subject range
235  // specified by the current block.
237  hit->GetRangeFromSeq2(s_range, q_range, new_s_range, tback_range);
239  // pre-advance the iterators
 // (done here because the 'continue' statements below would
 // otherwise skip the advance)
241  if (prev_itr != itr) {
242  prev_itr++;
243  }
244  itr++;
245  if (next_itr != blocklist.end()) {
246  next_itr++;
247  }
249  // Throw away alignments whose query range is too small
251  if (q_range.GetLength() <= CHit::kMinHitSize)
252  continue;
254  // or for which the difference between query and database
255  // regions is too large (i.e. query sequence has a big gap)
257  if (s_range.GetLength() > 3 * q_range.GetLength() / 2) {
258  if (m_Options->GetVerbose()) {
259  printf("ignore aligning query %d %d-%d db %d block %d-%d\n",
260  hit->m_SeqIndex1, q_range.GetFrom(), q_range.GetTo(),
261  db_seq, s_range.GetFrom(), s_range.GetTo());
262  }
263  continue;
264  }
 // widen the query range by the fudge amounts, but never beyond
 // the query extent of the original hit
266  q_range.SetFrom(max(hit->m_SeqRange1.GetFrom(),
267  q_range.GetFrom() - left_fudge));
268  q_range.SetTo(min(hit->m_SeqRange1.GetTo(),
269  q_range.GetTo() + right_fudge));
271  // Now realign the block to the query sequence
273  m_Aligner.SetSequences((const int **)(pssm + s_range.GetFrom()),
274  s_range.GetLength(),
275  (const char *)query.GetSequence() + q_range.GetFrom(),
276  q_range.GetLength());
278  int score = m_Aligner.Run();
279  const CNWAligner::TTranscript tback(m_Aligner.GetTranscript(false));
280  int tback_size = tback.size();
281  CEditScript final_script;
 // a transcript that starts with one kind of gap and ends with the
 // other kind is treated as a failed realignment
283  if ((tback[0] == CNWAligner::eTS_Delete &&
284  tback[tback_size-1] == CNWAligner::eTS_Insert) ||
285  (tback[0] == CNWAligner::eTS_Insert &&
286  tback[tback_size-1] == CNWAligner::eTS_Delete)) {
288  // The query region falls outside the DB region.
289  // Throw away the alignment and reuse the original one.
291  hit->GetRangeFromSeq2(s_range, q_range, s_range, tback_range);
293  // throw away alignments that are too small
295  if (q_range.GetLength() <= CHit::kMinHitSize ||
296  s_range.GetLength() <= CHit::kMinHitSize)
297  continue;
 // NOTE(review): parts of the following GetScore() call are
 // missing from this listing
298  score = hit->GetEditScript().GetScore(
299  tback_range,
301  hit->m_SeqRange2.GetFrom()),
302  query, pssm,
304  final_script = hit->GetEditScript().MakeEditScript(tback_range);
305  }
306  else {
308  // strip off leading and trailing gaps in the
309  // database sequence. Modify the alignment score
310  // accordingly
312  int first_tback = 0;
313  int last_tback = tback_size - 1;
314  int q_start = q_range.GetFrom();
315  int q_stop = q_range.GetTo();
316  int s_start = s_range.GetFrom();
317  int s_stop = s_range.GetTo();
 // leading non-match operations: advance the start offsets and
 // subtract the gap extension / gap open values from the score
319  for (int k = 0; k < tback_size &&
320  tback[k] != CNWAligner::eTS_Match; k++) {
321  first_tback++;
322  if (tback[k] == CNWAligner::eTS_Delete)
323  s_start++;
324  else if (tback[k] == CNWAligner::eTS_Insert)
325  q_start++;
327  score -= m_Aligner.GetWs();
328  if (k == 0)
329  score -= m_Aligner.GetEndWg();
330  else if (tback[k] != tback[k-1])
331  score -= m_Aligner.GetWg();
332  }
 // trailing non-match operations, handled symmetrically
334  for (int k = tback_size - 1; k >= 0 &&
335  tback[k] != CNWAligner::eTS_Match; k--) {
336  last_tback--;
337  if (tback[k] == CNWAligner::eTS_Delete)
338  s_stop--;
339  else if (tback[k] == CNWAligner::eTS_Insert)
340  q_stop--;
342  score -= m_Aligner.GetWs();
343  if (k == tback_size - 1)
344  score -= m_Aligner.GetEndWg();
345  else if (tback[k] != tback[k+1])
346  score -= m_Aligner.GetWg();
347  }
349  // throw away alignments that are too small
351  q_range.Set(q_start, q_stop);
352  s_range.Set(s_start, s_stop);
353  if (q_range.GetLength() <= CHit::kMinHitSize ||
354  s_range.GetLength() <= CHit::kMinHitSize)
355  continue;
357  _ASSERT(tback[first_tback] == CNWAligner::eTS_Match);
358  _ASSERT(tback[last_tback] == CNWAligner::eTS_Match);
360  final_script = CEditScript::MakeEditScript(tback,
361  TRange(first_tback, last_tback));
362  }
364  // save the new block alignment if the rounded-down
365  // version of its score is positive
367  if (score > kRpsScaleFactor / 2) {
368  hit->InsertSubHit(new CHit(hit->m_SeqIndex1,
369  hit->m_SeqIndex2,
370  q_range, s_range,
371  score, final_script));
372  }
373  }
375  // finish processing hit i
 // a hit survives only if at least one block alignment was saved
377  if (hit->HasSubHits()) {
378  hit->ResolveSubHitConflicts(query, pssm,
379  m_Aligner.GetWg(),
380  m_Aligner.GetWs());
381  hit->AddUpSubHits();
382  }
383  else {
384  rps_hits.SetKeepHit(i, false);
385  }
387  // check for interrupt
390  "Alignment interrupted");
391  }
392  }
394  // remove RPS hits that do not have block alignments,
395  // or were deleted for some other reason
397  rps_hits.PurgeUnwantedHits();
399  // restore the original gap penalties
407 }
/// Run RPS blast on a set of query sequences and append every alignment
/// that passes the configured RPS e-value cutoff to rps_hits.
/// NOTE(review): the line carrying the function name and first parameter
/// (used in the body as `queries`) is missing from this listing, as are
/// the creation of `opts` and `seqdb` and the interrupt checks. The call
/// sites elsewhere in this file invoke
/// x_FindRPSHits(queries, indices, hit list).
/// @param indices one entry per query (same size as queries); indices[i]
///   is passed as the first CHit constructor argument for hits of
///   query i [in]
/// @param rps_hits hit list that receives the accepted hits [out]
410 void
412  const vector<int>& indices,
413  CHitList& rps_hits)
414 {
415  _ASSERT(queries.size() == indices.size());
417  int num_queries = queries.size();
421  // deliberately set the cutoff e-value too high, to
422  // account for alignments where the gapped score is
423  // very different from the ungapped score
 // (the search threshold is never below 10.0; the configured cutoff
 // is applied per hit further down)
425  opts->SetEvalueThreshold(max(m_Options->GetRpsEvalue(), 10.0));
426  opts->SetFilterString("F");
428  (dynamic_cast<CBlastRPSOptionsHandle*>(opts.GetNonNullPointer()))
429  ->SetCompositionBasedStats(false);
431  // run RPS blast
433  CSearchDatabase search_database(m_Options->GetRpsDb(),
435  CRef<IQueryFactory> query_factory(new CObjMgr_QueryFactory(queries));
437  CLocalBlast blaster(query_factory, opts, search_database);
438  CSearchResultSet results = *blaster.Run();
440  // convert the results to the internal format used by
441  // the rest of CMultiAligner
445  // iterate over queries
447  for (int i = 0; i < num_queries; i++) {
449  // iterate over hitlists
451  ITERATE(CSeq_align_set::Tdata, itr, results[i].GetSeqAlign()->Get()) {
453  // iterate over hits
455  const CSeq_align& s = **itr;
456  const CDense_seg& denseg = s.GetSegs().GetDenseg();
457  int align_score = 0;
458  double evalue = 0;
460  // compute the score of the hit
 // (both values stay 0 if the alignment carries no "score" /
 // "e_value" entry)
462  ITERATE(CSeq_align::TScore, score_itr, s.GetScore()) {
463  const CScore& curr_score = **score_itr;
464  if (curr_score.GetId().GetStr() == "score")
465  align_score = curr_score.GetValue().GetInt();
466  else if (curr_score.GetId().GetStr() == "e_value")
467  evalue = curr_score.GetValue().GetReal();
468  }
470  // check if the hit is worth saving
471  if (evalue > m_Options->GetRpsEvalue())
472  continue;
474  // locate the ID of the database sequence that
475  // produced the hit, and save the hit
 // NOTE(review): db_oid is neither initialised nor validated after
 // SeqidToOid(); the pre-computed-hits code elsewhere in this file
 // throws when it is negative -- confirm a failed lookup is
 // impossible here
477  int db_oid;
478  seqdb.SeqidToOid(*denseg.GetIds()[1], db_oid);
479  rps_hits.AddToHitList(new CHit(indices[i], db_oid,
480  align_score, denseg));
481  }
483  // check for interrupt
486  "Alignment interrupted");
487  }
488  }
491  //-------------------------------------------------------
 // verbose mode: dump every hit currently in rps_hits
492  if (m_Options->GetVerbose()) {
493  printf("RPS hits:\n");
494  for (int i = 0; i < rps_hits.Size(); i++) {
495  CHit *hit = rps_hits.GetHit(i);
496  printf("query %d %4d - %4d db %d %4d - %4d score %d\n",
497  hit->m_SeqIndex1,
498  hit->m_SeqRange1.GetFrom(),
499  hit->m_SeqRange1.GetTo(),
500  hit->m_SeqIndex2,
501  hit->m_SeqRange2.GetFrom(),
502  hit->m_SeqRange2.GetTo(),
503  hit->m_Score);
504  }
505  printf("\n\n");
506  }
507  //-------------------------------------------------------
508 }
/// Copy residue frequencies of RPS hits onto the query sequences they
/// align to, and record the covered query ranges in m_RPSLocs. Hits are
/// visited in SortByScore() order; a hit is skipped when its query range
/// intersects that of an earlier hit on the same query that is still
/// marked as kept.
/// NOTE(review): the line carrying the function name and first parameter
/// (used in the body as `rps_hits`), the declaration of `query`, part of
/// the sub_list initialiser and the interrupt check are missing from this
/// listing. The call site elsewhere in this file invokes
/// x_AssignRPSResFreqs(hit list, profile_data).
/// @param profile_data supplies the residue frequencies and the
///   per-sequence offsets into them [in]
511 void
513  CProfileData& profile_data)
514 {
515  if (rps_hits.Empty()) {
516  return;
517  }
519  rps_hits.SortByScore();
521  // for each hit
523  for (int i = 0; i < rps_hits.Size(); i++) {
524  CHit *hit = rps_hits.GetHit(i);
526  _ASSERT(hit->HasSubHits());
528  // skip hit i if it overlaps on the query sequence
529  // with a higher-scoring HSP.
531  int j;
532  for (j = 0; j < i; j++) {
533  CHit *better_hit = rps_hits.GetHit(j);
535  if (better_hit->m_SeqIndex1 != hit->m_SeqIndex1)
536  continue;
538  if (rps_hits.GetKeepHit(j) == true &&
539  better_hit->m_SeqRange1.IntersectingWith(hit->m_SeqRange1))
540  break;
541  }
 // j < i means the loop above stopped early on a conflicting hit
542  if (j < i) {
543  continue;
544  }
546  // The hit does not conflict; use the traceback of each block
547  // to locate each position where a substitution occurs,
548  // and assign the appropriate column of residue frequencies
549  // at that position
552  CSequence::TFreqMatrix& matrix = query.GetFreqs();
553  _ASSERT(hit->m_SeqIndex1 < (int)m_RPSLocs.size());
 // NOTE(review): this also discards ranges recorded for any earlier
 // accepted hit on the same query -- confirm that is intended
554  m_RPSLocs[hit->m_SeqIndex1].clear();
 // first frequency row of the hit's database sequence
556  Int4** ref_freqs = profile_data.GetResFreqs() +
557  (profile_data.GetSeqOffsets())[hit->m_SeqIndex2];
559  double domain_res_freq_boost = m_Options->GetDomainResFreqBoost();
561  NON_CONST_ITERATE(vector<CHit *>, itr, hit->GetSubHit()) {
562  CHit *subhit = *itr;
563  vector<TOffsetPair> sub_list(
565  TOffsetPair(subhit->m_SeqRange1.GetFrom(),
566  subhit->m_SeqRange2.GetFrom()) ));
 // sub_list is consumed two entries at a time: a start pair and a
 // stop pair of (query, subject) offsets spanning equal lengths
568  for (j = 0; j < (int)sub_list.size(); j += 2) {
569  TOffsetPair& start_pair(sub_list[j]);
570  TOffsetPair& stop_pair(sub_list[j+1]);
571  int q = start_pair.first;
572  int s = start_pair.second;
574  _ASSERT(stop_pair.second - stop_pair.first ==
575  start_pair.second - start_pair.first);
576  _ASSERT(stop_pair.first-1 < query.GetLength());
 // each column becomes the scaled-down reference frequencies,
 // weighted by (1 - boost), plus the boost on the residue
 // actually present in the query
578  for (int k = 0; k < stop_pair.first - start_pair.first; k++) {
579  for (int m = 0; m < kAlphabetSize; m++) {
580  matrix(q+k, m) =
581  (1 - domain_res_freq_boost) *
582  ((double)ref_freqs[s+k][m] / FREQ_RATIO_SCALE);
584  }
585  matrix(q+k, query.GetLetter(q+k)) += domain_res_freq_boost;
586  }
587  // mark range as RPS-identified conserved domain
 // NOTE(review): per the loop bound above stop_pair.first looks like
 // one past the last assigned column, yet it is stored as the range
 // end -- confirm against TRange semantics
588  m_RPSLocs[hit->m_SeqIndex1].push_back(TRange(start_pair.first,
589  stop_pair.first));
590  }
591  }
593  // check for interrupt
596  "Alignment interrupted");
597  }
598  }
599 }
/// Fill the residue-frequency matrix of every sequence in m_QueryData
/// (and, in a second loop, m_AllQueryData) with standard background
/// frequencies scaled by (1 - boost), then add the boost to the residue
/// actually present at each position.
/// NOTE(review): the line carrying the function name, the creation of
/// `sbp`, the declarations of `query`, the interrupt check and the
/// opening of the block that encloses the second loop are all missing
/// from this listing.
602 void
604 {
605  // Assign background residue frequencies to otherwise
606  // unassigned columns. The actual residue at a given
607  // position is upweighted by a specified amount, and
608  // all other frequencies are downweighted
611  Blast_ResFreq *std_freqs = Blast_ResFreqNew(sbp);
612  Blast_ResFreqStdComp(sbp, std_freqs);
613  double local_res_freq_boost = m_Options->GetLocalResFreqBoost();
615  for (size_t i = 0; i < m_QueryData.size(); i++) {
617  CSequence::TFreqMatrix& matrix = query.GetFreqs();
619  for (int j = 0; j < query.GetLength(); j++) {
620  for (int k = 0; k < kAlphabetSize; k++) {
621  matrix(j, k) = (1 - local_res_freq_boost) *
622  std_freqs->prob[k];
623  }
624  matrix(j, query.GetLetter(j)) += local_res_freq_boost;
625  }
627  // check for interrupt
630  "Alignment interrupted");
631  }
632  }
 // same assignment for m_AllQueryData; the closing brace after this
 // loop suggests it sits inside a conditional whose opening line is
 // not visible here
636  for (size_t i = 0; i < m_AllQueryData.size(); i++) {
638  CSequence::TFreqMatrix& matrix = query.GetFreqs();
639  for (int j = 0; j < query.GetLength(); j++) {
640  for (int k = 0; k < kAlphabetSize; k++) {
641  matrix(j, k) = (1 - local_res_freq_boost) *
642  std_freqs->prob[k];
643  }
644  matrix(j, query.GetLetter(j)) += local_res_freq_boost;
645  }
646  }
649  }
 // release the frequency table and the score block used to build it
651  Blast_ResFreqFree(std_freqs);
652  BlastScoreBlkFree(sbp);
653 }
656 bool compare_seqids(const pair<const CSeq_id*, int>& a,
657  const pair<const CSeq_id*, int>& b)
658 {
659  _ASSERT(a.first && b.first);
660  return a.first->CompareOrdered(*b.first) > 0;
661 }
/// Import pre-computed conserved-domain search results from a BLAST 4
/// archive. Marks in m_IsDomainSearched which of the sequences to align
/// appear among the archive's queries, and appends to m_DomainHits every
/// archived alignment whose query is one of those sequences and whose
/// e-value passes the configured RPS cutoff. m_IsDomainSearched is left
/// empty when no archive query matches or no hit is accepted.
/// NOTE(review): the line carrying the function name and first parameter
/// (used in the body as `pre_queries`), the comparator arguments of the
/// lower_bound() calls, the construction of the bioseq iterator and the
/// creation of `seqdb` are missing from this listing. The call site
/// elsewhere in this file invokes
/// x_SetDomainHits(queries, indices, archive).
/// @param indices one entry per element of pre_queries; the value used to
///   index m_IsDomainSearched and passed as the first CHit constructor
///   argument [in]
/// @param archive BLAST 4 archive holding the domain queries and
///   alignments [in]
/// @throws CMultiAlignerException (eInvalidInput) if the archive queries
///   are neither a Seq-loc list nor a Bioseq-set, or if a subject of an
///   accepted hit resolves to a negative OID
663 void
665  const vector<int>& indices,
666  const CBlast4_archive& archive)
667 {
668  // This function sets pre-computed alignments of queries
669  // with conserved domains. Note that the results need not include all
670  // cobalt queries and not all domain queries need to be cobalt sequences
672  _ASSERT(!pre_queries.empty());
673  _ASSERT(pre_queries.size() == indices.size());
675  // initialize all queries as not searched for conserved domains
676  m_IsDomainSearched.resize(m_tQueries.size(), false);
679  // create a sorted list of query seq_ids
 // (sorted with compare_seqids so that lower_bound can be used below)
680  vector< pair<const CSeq_id*, int> > queries;
681  queries.reserve(pre_queries.size());
682  for (size_t i=0;i < pre_queries.size();i++) {
683  _ASSERT(pre_queries[i].seqloc->GetId());
684  queries.push_back(make_pair(pre_queries[i].seqloc->GetId(), indices[i]));
685  }
686  sort(queries.begin(), queries.end(), compare_seqids);
688  // mark queries for which domain search was done,
689  // we use query list here in case the search returned no results
690  const CBlast4_queries& b4_queries
691  = archive.GetRequest().GetBody().GetQueue_search().GetQueries();
693  if (!b4_queries.IsSeq_loc_list() && !b4_queries.IsBioseq_set()) {
694  NCBI_THROW(CMultiAlignerException, eInvalidInput, "Unsupported BLAST"
695  " 4 archive format");
696  }
698  // if domain queries are seq_locs
699  if (b4_queries.IsSeq_loc_list()) {
700  ITERATE (list< CRef<CSeq_loc> >, it, b4_queries.GetSeq_loc_list()) {
702  // iterate over domain queries
704  // search for the query in the list of sequence to align
 // (the -1 index is a placeholder; only the Seq-id is compared)
705  pair<const CSeq_id*, int> p((*it)->GetId(), -1);
706  vector< pair<const CSeq_id*, int> >::iterator id_itr
707  = lower_bound(queries.begin(), queries.end(), p,
710  // if query was found, then mark it as searched
711  if (id_itr != queries.end()
712  && id_itr->first->CompareOrdered(*p.first) == 0) {
713  m_IsDomainSearched[id_itr->second] = true;
714  }
715  }
716  }
718  // if domain queries are bioseqs
719  if (b4_queries.IsBioseq_set()) {
721  eDetectLoops));
722  // iterate over domain queries
723  for (; itr; ++itr) {
725  // search for the query in the list of sequences to align
726  pair<const CSeq_id*, int> p(itr->GetFirstId(), -1);
727  vector< pair<const CSeq_id*, int> >::iterator id_itr
728  = lower_bound(queries.begin(), queries.end(), p,
731  // if query was found, then mark it as searched
732  if (id_itr != queries.end()
733  && id_itr->first->CompareOrdered(*p.first) == 0) {
734  m_IsDomainSearched[id_itr->second] = true;
735  }
736  }
737  }
738  //-------------------------------------------------------
 // verbose mode: list the queries found in the archive
739  if (m_Options->GetVerbose()) {
740  printf("Pre-computed RPS queries:\n");
741  for (size_t i=0;i < pre_queries.size();i++) {
742  _ASSERT(indices[i] < (int)m_IsDomainSearched.size());
743  if (m_IsDomainSearched[indices[i]]) {
744  printf("query: %d\n", indices[i]);
745  }
746  }
747  printf("\n");
748  }
749  //-------------------------------------------------------
751  // check if at least one domain query matched cobalt query
752  bool is_presearched = false;
753  ITERATE (vector<bool>, it, m_IsDomainSearched) {
754  if (*it) {
755  is_presearched = true;
756  }
757  }
758  // if not, there is no need to analyze domain hits
759  if (!is_presearched) {
760  // empty array indicates that pre-computed domain hits are not set
761  m_IsDomainSearched.clear();
762  return;
763  }
 // from here on the flag records whether at least one hit was saved
766  is_presearched = false;
768  // get domain hits
769  const CSeq_align_set& aligns = archive.GetResults().GetAlignments();
770  int query_idx = -1;
771  const CSeq_id* last_query_id = NULL;
772  ITERATE (CSeq_align_set::Tdata, it, aligns.Get()) {
774  // iterate over hits
776  const CSeq_align& s = **it;
777  const CDense_seg& denseg = s.GetSegs().GetDenseg();
778  int align_score = 0;
779  double evalue = 0;
781  // find query index in m_tQueries
782  const CSeq_id& query_id = s.GetSeq_id(0);
784  // search for query in sequences to align only if the current hit
785  // query is different from the previous one
 // NOTE(review): when a lookup fails, query_idx is reset to -1 but
 // last_query_id keeps the previously found id; a later hit for that
 // earlier query would then skip the lookup and be dropped. Harmless
 // if hits are grouped by query -- confirm
786  if (!last_query_id || query_id.CompareOrdered(*last_query_id) != 0) {
788  // find query seq id
789  pair<const CSeq_id*, int> p(&query_id, -1);
790  vector< pair<const CSeq_id*, int> >::iterator id_itr
791  = lower_bound(queries.begin(), queries.end(), p,
794  // if the hit query is not to be aligned, then skip processing
795  // this Seq_align
796  if (id_itr == queries.end()
797  || id_itr->first->CompareOrdered(*p.first) != 0) {
799  query_idx = -1;
800  continue;
801  }
802  query_idx = id_itr->second;
803  last_query_id = id_itr->first;
804  }
805  if (query_idx < 0) {
806  continue;
807  }
809  // compute the score of the hit
811  ITERATE(CSeq_align::TScore, score_itr, s.GetScore()) {
812  const CScore& curr_score = **score_itr;
813  if (curr_score.GetId().GetStr() == "score")
814  align_score = curr_score.GetValue().GetInt();
815  else if (curr_score.GetId().GetStr() == "e_value")
816  evalue = curr_score.GetValue().GetReal();
817  }
819  // check if the hit is worth saving
820  if (evalue > m_Options->GetRpsEvalue())
821  continue;
823  // locate the ID of the database sequence that
824  // produced the hit, and save the hit
826  int db_oid;
827  seqdb.SeqidToOid(*denseg.GetIds()[1], db_oid);
828  if (db_oid < 0) {
829  NCBI_THROW(CMultiAlignerException, eInvalidInput, "The pre-computed"
830  " subject domain " + denseg.GetIds()[1]->AsFastaString()
831  + " does not exist in the domain database "
832  + m_Options->GetRpsDb());
833  }
834  m_DomainHits.AddToHitList(new CHit(query_idx, db_oid,
835  align_score, denseg));
837  is_presearched = true;
838  }
 // no usable hit: report "no pre-computed domain hits" to the caller
 // by emptying the array
840  if (!is_presearched) {
841  m_IsDomainSearched.clear();
842  }
844  //-------------------------------------------------------
 // verbose mode: dump every hit currently in m_DomainHits
845  if (m_Options->GetVerbose()) {
846  printf("Pre-computed RPS hits:\n");
847  for (int i = 0; i < m_DomainHits.Size(); i++) {
848  CHit *hit = m_DomainHits.GetHit(i);
849  printf("query %d %4d - %4d db %d %4d - %4d score %d\n",
850  hit->m_SeqIndex1,
851  hit->m_SeqRange1.GetFrom(),
852  hit->m_SeqRange1.GetTo(),
853  hit->m_SeqIndex2,
854  hit->m_SeqRange2.GetFrom(),
855  hit->m_SeqRange2.GetTo(),
856  hit->m_Score);
857  }
858  printf("\n\n");
859  }
860  //-------------------------------------------------------
863 }
/// Driver for the conserved-domain stage: obtain domain hits (from
/// pre-computed results when available, otherwise or additionally by
/// running RPS blast), realign them block by block, propagate residue
/// frequencies of the hits onto the queries, and remove the RPS scaling
/// from the scores of m_CombinedHits.
/// Returns immediately when no RPS database is configured, and before the
/// frequency step when no domain hit remains after realignment.
/// NOTE(review): the line carrying the function name and first parameter
/// (used in the body as `queries`), the reset of previously found hits,
/// the interrupt check and the step that fills m_CombinedHits are missing
/// from this listing.
/// @param indices one entry per element of queries; used to index
///   m_IsDomainSearched and forwarded to the search helpers [in]
865 void
867  const vector<int>& indices)
868 {
 // the block and frequency files are expected next to the database,
 // named by appending ".blocks" / ".freq" to its name
869  string rps_db = m_Options->GetRpsDb();
870  string blockfile = rps_db + ".blocks";
871  string freqfile = rps_db + ".freq";
873  if (rps_db.empty()) {
874  return;
875  }
877  // set pre-computed domain hits if available
878  if (m_Options->CanGetDomainHits()) {
879  x_SetDomainHits(queries, indices, *m_Options->GetDomainHits());
880  }
884  // empty previously found hits
888  // if there are no pre-computed domain hits search for domain in all
889  // queries
890  if (m_IsDomainSearched.empty()) {
893  // run RPS blast
895  x_FindRPSHits(queries, indices, m_DomainHits);
896  }
897  else {
898  // otherwise, search only queries that were not searched for
899  // pre-computed results
901  _ASSERT(m_IsDomainSearched.size() == m_tQueries.size());
903  // find if there is at least one query that was not pre-searched
904  bool do_search = false;
905  for (size_t i=0;i < indices.size();i++) {
906  _ASSERT(indices[i] < (int)m_IsDomainSearched.size());
907  if (!m_IsDomainSearched[indices[i]]) {
908  do_search = true;
909  break;
910  }
911  }
913  // search for domains
914  if (do_search) {
 // collect the not-yet-searched queries with their indices,
 // keeping the two vectors parallel
915  TSeqLocVector queries_not_searched;
916  vector<int> indices_not_searched;
917  for (size_t i=0;i < queries.size();i++) {
918  if (!m_IsDomainSearched[indices[i]]) {
919  queries_not_searched.push_back(queries[i]);
920  indices_not_searched.push_back(indices[i]);
921  }
922  }
923  // run RPS blast
924  x_FindRPSHits(queries_not_searched, indices_not_searched,
925  m_DomainHits);
926  }
927  }
929  // check for interrupt
932  "Alignment interrupted");
933  }
935  vector<SSegmentLoc> blocklist;
936  x_LoadBlockBoundaries(blockfile, blocklist);
938  // Load the RPS PSSMs and perform block realignment
940  CProfileData profile_data;
941  profile_data.Load(CProfileData::eGetPssm, rps_db);
942  x_RealignBlocks(m_DomainHits, blocklist, profile_data);
943  blocklist.clear();
944  profile_data.Clear();
946  //-------------------------------------------------------
 // verbose mode: dump the sub-hits of every remaining domain hit
947  if (m_Options->GetVerbose()) {
948  printf("\n\nBlock alignments with conflicts resolved:\n");
949  for (int i = 0; i < m_DomainHits.Size(); i++) {
950  CHit *hit = m_DomainHits.GetHit(i);
951  NON_CONST_ITERATE(vector<CHit *>, itr, hit->GetSubHit()) {
952  CHit *subhit = *itr;
953  printf("query %d %4d - %4d db %d %4d - %4d score %d ",
954  subhit->m_SeqIndex1,
955  subhit->m_SeqRange1.GetFrom(),
956  subhit->m_SeqRange1.GetTo(),
957  subhit->m_SeqIndex2,
958  subhit->m_SeqRange2.GetFrom(),
959  subhit->m_SeqRange2.GetTo(),
960  subhit->m_Score);
962  printf("\n");
963  }
964  }
965  printf("\n\n");
966  }
967  //-------------------------------------------------------
969  if (m_DomainHits.Empty())
970  return;
974  // propagate the residue frequencies of the best
975  // RPS hits onto the query sequences
977  m_RPSLocs.resize(m_tQueries.size());
978  profile_data.Load(CProfileData::eGetResFreqs, rps_db, freqfile);
979  x_AssignRPSResFreqs(m_DomainHits, profile_data);
980  profile_data.Clear();
982  // Connect together RPS hits to the same region of the
983  // same database sequence
987  // Remove the scaling on the scores
 // half the scale factor is added before the integer division so that
 // non-negative scores round to the nearest unit instead of truncating
989  const int kRpsScale = CMultiAligner::kRpsScaleFactor;
990  for (int i = 0; i < m_CombinedHits.Size(); i++) {
991  CHit *hit = m_CombinedHits.GetHit(i);
992  hit->m_Score = (hit->m_Score + kRpsScale/2) / kRpsScale;
993  NON_CONST_ITERATE(CHit::TSubHit, subitr, hit->GetSubHit()) {
994  CHit *subhit = *subitr;
995  subhit->m_Score = (subhit->m_Score + kRpsScale/2) / kRpsScale;
996  }
997  }
999  //-------------------------------------------------------
 // verbose mode: dump the sub-hits of every combined hit; here both
 // sides of a hit are printed as "query"
1000  if (m_Options->GetVerbose()) {
1001  printf("\n\nMatched block alignments:\n");
1002  for (int i = 0; i < m_CombinedHits.Size(); i++) {
1003  CHit *hit = m_CombinedHits.GetHit(i);
1004  NON_CONST_ITERATE(vector<CHit *>, itr, hit->GetSubHit()) {
1005  CHit *subhit = *itr;
1006  printf("query %d %4d - %4d query %d %4d - %4d score %d\n",
1007  subhit->m_SeqIndex1,
1008  subhit->m_SeqRange1.GetFrom(),
1009  subhit->m_SeqRange1.GetTo(),
1010  subhit->m_SeqIndex2,
1011  subhit->m_SeqRange2.GetFrom(),
1012  subhit->m_SeqRange2.GetTo(),
1013  subhit->m_Score);
1014  }
1015  }
1016  printf("\n\n");
1017  }
1018  //-------------------------------------------------------
1019 }
1021 END_SCOPE(cobalt)
