NCBI C++ ToolKit
cobalt.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /*
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's offical duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================*/
25 
26 /*****************************************************************************
27 
28 File name: cobalt.cpp
29 
30 Author: Jason Papadopoulos
31 
32 Contents: Implementation of CMultiAligner class
33 
34 ******************************************************************************/
35 
36 
37 #include <ncbi_pch.hpp>
42 #include <algo/cobalt/cobalt.hpp>
43 
44 /// @file cobalt.cpp
45 /// Implementation of the CMultiAligner class
46 
48 BEGIN_SCOPE(cobalt)
49 
50 
52  : m_Options(new CMultiAlignerOptions(CMultiAlignerOptions::kDefaultMode
53  | CMultiAlignerOptions::fNoRpsBlast))
54 {
55  x_Init();
56  x_InitParams();
57  x_InitAligner();
58 }
59 
60 
61 CMultiAligner::CMultiAligner(const string& rps_db)
62  : m_Options(new CMultiAlignerOptions(CMultiAlignerOptions::kDefaultMode))
63 {
64  x_Init();
65  x_InitParams();
66  x_InitAligner();
67 }
68 
69 
71  : m_Options(options)
72 {
73  x_Init();
74  x_InitParams();
75  x_InitAligner();
76 }
77 
78 
80 {
82 
85 
86  int score = m_Options->GetUserConstraintsScore();
90 
91  m_UserHits.AddToHitList(new CHit(it->seq1_index, it->seq2_index,
92  TRange(it->seq1_start, it->seq1_stop),
93  TRange(it->seq2_start, it->seq2_stop),
94  score, CEditScript()));
95 
96  }
97 
98  //Note: Patterns are kept in m_Options
99 }
100 
101 
103 {
111 }
112 
113 
115 {
116  ITERATE (vector<CSequence>, it, m_QueryData) {
117  const unsigned char* sequence = it->GetSequence();
118  for (int i=0;i < it->GetLength();i++) {
119  if (sequence[i] == CSequence::kGapChar) {
120  NCBI_THROW(CMultiAlignerException, eInvalidInput, "Gaps in "
121  "input sequences are not allowed");
122  }
123  }
124  }
125 
126  return true;
127 }
128 
130 {
131  // check that both input alignments are set
132  if (m_InMSA1.empty() || m_InMSA2.empty()) {
133  NCBI_THROW(CMultiAlignerException, eInvalidInput,
134  "Empty input alignment");
135  }
136 
137  // check that indices of representative sequences are not out of bounds
138  // validate indices for MSA1
139  ITERATE (vector<int>, it, m_Msa1Repr) {
140  if (*it >= (int)m_InMSA1.size() || *it < 0) {
141  NCBI_THROW(CMultiAlignerException, eInvalidInput,
142  (string)"Sequence index " + NStr::IntToString(*it) +
143  " for MSA 1 out of bounds");
144  }
145  }
146 
147  // validate indices for MSA2
148  ITERATE (vector<int>, it, m_Msa2Repr) {
149  if (*it >= (int)m_InMSA2.size() || *it < 0) {
150  NCBI_THROW(CMultiAlignerException, eInvalidInput,
151  (string)"Sequence index " + NStr::IntToString(*it) +
152  " for MSA 2 out of bounds");
153  }
154  }
155 
156  return true;
157 }
158 
160 {
161  for (int i = 0; i < m_UserHits.Size(); i++) {
162  CHit* hit = m_UserHits.GetHit(i);
163  if (hit->m_SeqIndex1 < 0 || hit->m_SeqIndex2 < 0 ||
164  hit->m_SeqIndex1 >= (int)m_QueryData.size() ||
165  hit->m_SeqIndex2 >= (int)m_QueryData.size()) {
166  NCBI_THROW(CMultiAlignerException, eInvalidInput,
167  "Sequence specified by constraint is out of range");
168  }
169  int from1 = hit->m_SeqRange1.GetFrom();
170  int from2 = hit->m_SeqRange2.GetFrom();
171  int to1 = hit->m_SeqRange1.GetTo();
172  int to2 = hit->m_SeqRange2.GetTo();
173  int index1 = hit->m_SeqIndex1;
174  int index2 = hit->m_SeqIndex2;
175 
176  if (from1 > to1 || from2 > to2) {
177  NCBI_THROW(CMultiAlignerException, eInvalidInput,
178  "Range specified by constraint is invalid");
179  }
180  if (from1 >= m_QueryData[index1].GetLength() ||
181  to1 >= m_QueryData[index1].GetLength() ||
182  from2 >= m_QueryData[index2].GetLength() ||
183  to2 >= m_QueryData[index2].GetLength()) {
184  NCBI_THROW(CMultiAlignerException, eInvalidInput,
185  "Constraint is out of range");
186  }
187  }
188 
189  return true;
190 }
191 
192 
193 void
195  CRef<objects::CScope> scope)
196 {
197  if (queries.size() < 2) {
198  NCBI_THROW(CMultiAlignerException, eInvalidInput,
199  "Aligner requires at least two input sequences");
200  }
201 
202  m_Scope = scope;
203 
204  m_tQueries.resize(queries.size());
205  copy(queries.begin(), queries.end(), m_tQueries.begin());
206 
207  m_QueryData.clear();
208  ITERATE(vector< CRef<objects::CSeq_loc> >, itr, m_tQueries) {
209  m_QueryData.push_back(CSequence(**itr, *m_Scope));
210  }
211 
214  Reset();
215 }
216 
217 
218 void
220 {
221  if (queries.size() < 2) {
222  NCBI_THROW(CMultiAlignerException, eInvalidInput,
223  "Aligner requires at least two input sequences");
224  }
225 
227  = objects::CObjectManager::GetInstance();
228 
229  m_Scope.Reset(new objects::CScope(*objmgr));
230  m_Scope->AddDefaults();
231 
232  vector<objects::CBioseq_Handle> bioseq_handles;
233  ITERATE(vector< CRef<objects::CBioseq> >, it, queries) {
234  bioseq_handles.push_back(m_Scope->AddBioseq(**it));
235  }
236 
237  m_tQueries.clear();
238  ITERATE(vector<objects::CBioseq_Handle>, it, bioseq_handles) {
240  seq_loc(new objects::CSeq_loc(objects::CSeq_loc::e_Whole));
241 
242  try {
243  seq_loc->SetId(*it->GetSeqId());
244  }
245  catch (objects::CObjMgrException e) {
246  NCBI_THROW(CMultiAlignerException, eInvalidInput,
247  (string)"Missing seq-id in bioseq. " + e.GetMsg());
248  }
249  m_tQueries.push_back(seq_loc);
250  }
251 
252  m_QueryData.clear();
253  ITERATE(vector< CRef<objects::CSeq_loc> >, itr, m_tQueries) {
254  m_QueryData.push_back(CSequence(**itr, *m_Scope));
255  }
256 
259  Reset();
260 }
261 
262 
264 {
265  if (queries.size() < 2) {
266  NCBI_THROW(CMultiAlignerException, eInvalidInput,
267  "Aligner requires at least two input sequences");
268  }
269 
270  m_Scope = queries[0].scope;
271 
272  m_tQueries.resize(queries.size());
273  for (size_t i=0;i < queries.size();i++) {
274  m_tQueries[i].Reset(new objects::CSeq_loc());
275  try {
276  m_tQueries[i]->Assign(*queries[i].seqloc);
277  }
278  catch (...) {
279  NCBI_THROW(CMultiAlignerException, eInvalidInput, "Bad SSeqLoc");
280  }
281  if (i > 0) {
282  m_Scope->AddScope(*queries[i].scope);
283  }
284  }
285 
286  m_QueryData.clear();
287  ITERATE(vector< CRef<objects::CSeq_loc> >, itr, m_tQueries) {
288  m_QueryData.push_back(CSequence(**itr, *m_Scope));
289  }
290 
293  Reset();
294 }
295 
296 void
297 CMultiAligner::SetInputMSAs(const objects::CSeq_align& msa1,
298  const objects::CSeq_align& msa2,
299  const set<int>& repr1,
300  const set<int>& repr2,
301  CRef<objects::CScope> scope)
302 {
303  m_Scope = scope;
304 
305  m_InMSA1.clear();
306  m_InMSA2.clear();
307  m_tQueries.clear();
308 
309  // get MSA data
310  CSequence::CreateMsa(msa1, *scope, m_InMSA1);
311  CSequence::CreateMsa(msa2, *scope, m_InMSA2);
312 
313  // get MSA sequences as Seq_locs
314  ITERATE (objects::CDense_seg::TIds, it,
315  msa1.GetSegs().GetDenseg().GetIds()) {
316 
318  new objects::CSeq_loc(objects::CSeq_loc::e_Whole)));
319  m_tQueries.back()->SetId(**it);
320  }
321  ITERATE (objects::CDense_seg::TIds, it,
322  msa2.GetSegs().GetDenseg().GetIds()) {
323 
325  new objects::CSeq_loc(objects::CSeq_loc::e_Whole)));
326  m_tQueries.back()->SetId(**it);
327  }
328 
329  // process indices of representative sequences
330  if (!repr1.empty()) {
331  m_Msa1Repr.resize(repr1.size());
332  copy(repr1.begin(), repr1.end(), m_Msa1Repr.begin());
333  }
334  else {
335  m_Msa1Repr.reserve(m_InMSA1.size());
336  for (int i=0;i < (int)m_InMSA1.size();i++) {
337  m_Msa1Repr.push_back(i);
338  }
339  }
340 
341  if (!repr2.empty()) {
342  m_Msa2Repr.resize(repr2.size());
343  copy(repr2.begin(), repr2.end(), m_Msa2Repr.begin());
344  }
345  else {
346  m_Msa1Repr.reserve(m_InMSA2.size());
347  for (int i=0;i < (int)m_InMSA2.size();i++) {
348  m_Msa2Repr.push_back(i);
349  }
350  }
351 
354  Reset();
355 }
356 
358 {
359  if (!m_Tree.GetTree()) {
360  NCBI_THROW(CMultiAlignerException, eInvalidInput,
361  "No tree to return");
362  }
363 
366 
367  return btc;
368 }
369 
372  void* user_data)
373 {
374  FInterruptFn prev_fun = m_Interrupt;
375  m_Interrupt = fnptr;
376  m_ProgressMonitor.user_data = user_data;
377  return prev_fun;
378 }
379 
380 void
381 CMultiAligner::x_SetScoreMatrix(const char *matrix_name)
382 {
383  if (strcmp(matrix_name, "BLOSUM62") == 0)
385  else if (strcmp(matrix_name, "BLOSUM45") == 0)
387  else if (strcmp(matrix_name, "BLOSUM80") == 0)
389  else if (strcmp(matrix_name, "PAM30") == 0)
391  else if (strcmp(matrix_name, "PAM70") == 0)
393  else if (strcmp(matrix_name, "PAM250") == 0)
395  else
396  NCBI_THROW(CMultiAlignerException, eInvalidScoreMatrix,
397  "Unsupported score matrix. Valid matrix names: BLOSUM45, "\
398  "BLOSUM62, BLOSUM80, PAM30, PAM70 and PAM250");
399 }
400 
401 
402 void
404 {
405  m_Results.clear();
410 }
411 
412 
413 void
415 {
416  // Convert the current collection of pairwise
417  // hits into a distance matrix. This is the only
418  // nonlinear operation that involves the scores of
419  // alignments, so the raw scores should be converted
420  // into bit scores before becoming distances.
421  //
422  // To do that, we need Karlin parameters for the
423  // score matrix and gap penalties chosen. Since both
424  // of those can change independently, calculate the
425  // Karlin parameters right now
426 
428 
429  Blast_KarlinBlk karlin_blk;
430  const Int4 kGapOpen = 11;
431  const Int4 kGapExtend = 1;
432  if (Blast_KarlinBlkGappedLoadFromTables(&karlin_blk, kGapOpen, kGapExtend,
433  m_Options->GetScoreMatrixName().c_str(), true) != 0) {
434 
435  NCBI_THROW(blast::CBlastException, eInvalidArgument,
436  "Cannot generate Karlin block");
437  }
438 
439  CDistances distances(m_QueryData,
442  karlin_blk);
443 
445 
447  const CDistMethods::TMatrix& bigmat = distances.GetMatrix();
448 
449  const CClusterer::TClusters& clusters = m_Clusterer.GetClusters();
450  dmat.Resize(clusters.size(), clusters.size(), 0.0);
451  for (size_t i=0;i < clusters.size() - 1;i++) {
452  for (size_t j=i+1;j < clusters.size();j++) {
453  dmat(i, j) = bigmat(clusters[i].GetPrototype(),
454  clusters[j].GetPrototype());
455  dmat(j, i) = dmat(i, j);
456  }
457  }
458  }
459  else {
460  dmat = distances.GetMatrix();
461  }
462 
463 
464  //--------------------------------
465  if (m_Options->GetVerbose()) {
466  const CDistMethods::TMatrix& matrix = dmat;
467  printf("distance matrix:\n");
468  printf(" ");
469  for (int i = matrix.GetCols() - 1; i > 0; i--)
470  printf("%5d ", i);
471  printf("\n");
472 
473  for (int i = 0; i < (int)matrix.GetRows() - 1; i++) {
474  printf("%2d: ", i);
475  for (int j = matrix.GetCols() - 1; j > i; j--) {
476  printf("%5.3f ", matrix(i, j));
477  }
478  printf("\n");
479  }
480  printf("\n\n");
481  }
482  //--------------------------------
483 
484  // build the guide tree associated with the matrix
486 
487  CClusterer clusterer(dmat);
488  // max in-cluster distance ensures one cluster hence one tree
489  clusterer.ComputeClusters(DBL_MAX, CClusterer::eCompleteLinkage, true,
490  1.0);
491  _ASSERT(clusterer.GetClusters().size() == 1);
492  m_Tree.SetTree(clusterer.ReleaseTree());
493  }
494  else {
495  m_Tree.ComputeTree(dmat,
497  }
498 
499  //--------------------------------
500  if (m_Options->GetVerbose()) {
502  }
503  //--------------------------------
504 
505  // check for interrupt
508  "Alignment interrupted");
509  }
510 }
511 
512 
514 {
515  // put sequences with gaps in structures used by the alignment methods
516  ITERATE (vector<CSequence>, it, m_InMSA1) {
517  m_QueryData.push_back(*it);
518  }
519  ITERATE (vector<CSequence>, it, m_InMSA2) {
520  m_QueryData.push_back(*it);
521  }
522 
523  // create a dummy progressive alignment tree
524  vector<CTree::STreeLeaf> node_list1;
525  vector<CTree::STreeLeaf> node_list2;
526  int i = 0;
527  for (;i < (int)m_InMSA1.size();i++) {
528  node_list1.push_back(CTree::STreeLeaf(i, 1.0));
529  }
530  _ASSERT(node_list1.back().query_idx == (int)m_InMSA1.size() - 1);
531  for (;i < (int)(m_InMSA1.size() + m_InMSA2.size());i++) {
532  node_list2.push_back(CTree::STreeLeaf(i, 1.0));
533  }
534  _ASSERT(node_list2.back().query_idx == (int)(m_InMSA1.size()
535  + m_InMSA2.size() - 1));
536 
537  // remove gap-only columns in the input alignments
538  vector<int> compress_inds;
539  for (i=0;i < (int)m_InMSA1.size();i++) {
540  compress_inds.push_back(i);
541  }
543  compress_inds.clear();
544  for (;i < (int)m_QueryData.size();i++) {
545  compress_inds.push_back(i);
546  }
548 
549  // make sure that no clustering of input sequences is done
551 
552  // find constraints
553  blast::TSeqLocVector blast_queries;
554  vector<int> indices;
555  x_CreateBlastQueries(blast_queries, indices);
556  x_FindDomainHits(blast_queries, indices);
557  x_FindLocalHits(blast_queries, indices);
558 
559  vector<const CSequence*> pattern_queries;
560  x_CreatePatternQueries(pattern_queries, indices);
561  x_FindPatternHits(pattern_queries, indices);
563 
564  // index constraints by sequence pairs
565  CNcbiMatrix<CHitList> pair_info(m_QueryData.size(), m_QueryData.size(),
566  CHitList());
567 
568  for (int i = 0; i < m_CombinedHits.Size(); i++) {
569  CHit *hit = m_CombinedHits.GetHit(i);
570  pair_info(hit->m_SeqIndex1, hit->m_SeqIndex2).AddToHitList(hit);
571  pair_info(hit->m_SeqIndex2, hit->m_SeqIndex1).AddToHitList(hit);
572  }
573 
574  // do alignment
575  int iteration = 0;
576  x_AlignProfileProfile(node_list1, node_list2, m_QueryData, pair_info,
577  iteration);
578 
579  // clear pair-wise constraints to aoid double memory release
580  for (unsigned int i = 0; i < pair_info.GetRows(); i++) {
581  for (unsigned int j = 0; j < pair_info.GetCols(); j++) {
582  pair_info(i, j).ResetList();
583  }
584  }
585 
586  // save result
587  m_QueryData.swap(m_Results);
588 }
589 
591 {
592  // if aligning MSAs
593  if (!m_InMSA1.empty()) {
594  x_AlignMSAs();
595  return;
596  }
597 
598  // otherwise aligning sequences
599 
600  if ((int)m_tQueries.size() >= kClusterNodeId) {
601  NCBI_THROW(CMultiAlignerException, eInvalidInput,
602  (string)"Number of queries exceeds maximum of "
604  }
605 
606  bool is_cluster_found = false;
607  vector<TPhyTreeNode*> cluster_trees;
608 
609  switch (m_ClustAlnMethod) {
611 
612  break;
613 
615  if ((is_cluster_found = x_FindQueryClusters())) {
617 
618  // No multiple alignment is done for one cluster
619  if (m_Clusterer.GetClusters().size() == 1) {
620  m_tQueries.swap(m_AllQueries);
621  return;
622  }
623  }
624  break;
625 
627  if ((is_cluster_found = x_FindQueryClusters())) {
628  x_ComputeClusterTrees(cluster_trees);
629  x_FindLocalInClusterHits(cluster_trees);
630  }
631  break;
632 
633  default:
634  NCBI_THROW(CMultiAlignerException, eInvalidOptions,
635  "Invalid clustering option");
636  }
637 
638  blast::TSeqLocVector blast_queries;
639  vector<int> indices;
640  x_CreateBlastQueries(blast_queries, indices);
641  x_FindDomainHits(blast_queries, indices);
642  x_FindLocalHits(blast_queries, indices);
643 
644  vector<const CSequence*> pattern_queries;
645  x_CreatePatternQueries(pattern_queries, indices);
646  x_FindPatternHits(pattern_queries, indices);
648 
649  switch (m_ClustAlnMethod) {
651  x_ComputeTree();
653  break;
654 
656  x_ComputeTree();
658  if (is_cluster_found) {
660  }
661  break;
662 
664  if (m_Clusterer.GetClusters().size() == 1) {
665  // node id >= kClusterNodeId denotes root of cluster tree
666  cluster_trees[0]->GetValue().SetId(kClusterNodeId);
667  m_Tree.SetTree(cluster_trees[0]);
668  }
669  else {
670  x_ComputeTree();
671  x_BuildFullTree(cluster_trees);
672  }
674  break;
675 
676  default:
677  NCBI_THROW(CMultiAlignerException, eInvalidOptions,
678  "Invalid clustering option");
679  }
680 
681 }
682 
684 {
685  EStatus status = eSuccess;
686 
687  try {
688  x_Run();
689  }
690  catch (CMultiAlignerException e) {
693 
694  switch (err_code) {
697  status = eOptionsError;
698  break;
699 
701  status = eQueriesError;
702  break;
703 
705  status = eInterrupt;
706  break;
707 
709  status = eOutOfMemory;
710  break;
711 
712  default:
713  status = eInternalError;
714  }
715 
716  m_Messages.push_back(e.GetMsg());
717  }
718  catch (blast::CBlastException e) {
719  blast::CBlastException::EErrCode err_code
720  = (blast::CBlastException::EErrCode)e.GetErrCode();
721 
722  status = (err_code == blast::CBlastException::eInvalidArgument
724 
725  m_Messages.push_back(e.GetMsg());
726  }
727  catch (CException e) {
728  status = eInternalError;
729  m_Messages.push_back(e.GetMsg());
730  }
731  catch (std::exception e) {
732  status = eInternalError;
733  m_Messages.push_back((string)e.what());
734  }
735  catch (...) {
736  status = eInternalError;
737  }
738 
739  return (TStatus)status;
740 }
741 
742 
743 bool
745 {
747 
748  // Compute k-mer counts and distances between query sequences
749  vector<TKmerCounts> kmer_counts;
753 
754  // TO DO: Remove distance matrix, currently it is needed for finding
755  // cluster representatives
756 
757  // distance matrix is need for fining cluster representatives
758  shared_ptr<CClusterer::TDistMatrix> dmat
759  = TKMethods::ComputeDistMatrix(kmer_counts,
761 
762  // If the central sequence is set, make the distance between this sequence
763  // and all others zero. This will result in a progressive alignment tree
764  // that will align the central sequence early so that its constraints
765  // will have more influence on the multiple alignment. This is used for
766  // fast alignment built based mostly on constraints from local hits.
767  if (m_Options->GetCentralSeq() >= 0) {
768  int center = m_Options->GetCentralSeq();
769  for (size_t i=0;i < dmat->GetRows();i++) {
770  if (i == center) {
771  continue;
772  }
773  (*dmat)(center, i) = 0.0;
774  (*dmat)(i, center) = 0.0;
775  }
776  }
777 
778  // find sequences with user constraints
779  set<int> constr_q;
780  for (int i=0;i < m_UserHits.Size();i++) {
781  CHit* hit = m_UserHits.GetHit(i);
782  constr_q.insert(hit->m_SeqIndex1);
783  constr_q.insert(hit->m_SeqIndex2);
784  }
785 
786  // find a set of graph edges between sequences (will be used for clustering)
787  CRef<CLinks> links(new CLinks((Uint4)kmer_counts.size()));
788  for (int i=0;i < (int)dmat->GetCols() - 1;i++) {
789 
790  // do no create links for sequences with user constraints as
791  // they must not be clustered together with other sequences
792  if (!constr_q.empty() && constr_q.find(i) != constr_q.end()) {
793  continue;
794  }
795 
796  for (int j=i+1;j < (int)dmat->GetCols();j++) {
797 
798  if (!constr_q.empty() && constr_q.find(j) != constr_q.end()) {
799  continue;
800  }
801 
802  if ((*dmat)(i, j) < m_Options->GetMaxInClusterDist()) {
803  links->AddLink(i, j, (*dmat)(i, j));
804  }
805  }
806  }
807  links->Sort();
808 
809  // Set distances between queries that appear in user constraints and all
810  // other queries to maximum, so that they form one-element clusters
811  const double kMaxDistance = 1.5;
812 
813  // set distances to maximum
814  ITERATE(set<int>, it, constr_q) {
815  for (int i=0;i < (int)dmat->GetRows();i++) {
816  if (i != *it) {
817  (*dmat)(i, *it) = kMaxDistance;
818  (*dmat)(*it, i) = kMaxDistance;
819  }
820  }
821  }
822 
823  //-------------------------------------------------------
824  if (m_Options->GetVerbose()) {
825  printf("K-mer counts distance matrix:\n");
826  printf(" ");
827  for (size_t i=dmat->GetCols() - 1;i > 0;i--) {
828  printf("%6d", (int)i);
829  }
830  printf("\n");
831  for (size_t i=0;i < dmat->GetRows() - 1;i++) {
832  printf("%3d:", (int)i);
833  for (size_t j=dmat->GetCols() - 1;j > i;j--) {
834  printf("%6.3f", (*dmat)(i, j));
835  }
836  printf("\n");
837  }
838  printf("\n");
839  }
840  //-------------------------------------------------------
841 
842  // Compute query clusters
843  m_Clusterer.SetLinks(links);
847  m_Clusterer.Run();
848 
849  // save distance matrix in clusterer
851 
852  // links are not needed any more
853  links.Reset();
854 
855  const CClusterer::TClusters& clusters = m_Clusterer.GetClusters();
856 
857  // If there are only single-element clusters
858  // COBALT will be run without clustering informartion
859  if (clusters.size() == m_QueryData.size()) {
860  //-----------------------------------------------------------------
861  if (m_Options->GetVerbose()) {
862  printf("\nNumber of queries in clusters: 0 (0%%)\n");
863  printf("Number of domain searches reduced by: 0 (0%%)\n\n");
864  printf("Only single-element clusters were found."
865  " No clustering information will be used.\n");
866  }
867  //-----------------------------------------------------------------
868 
869  m_Clusterer.Reset();
871 
872  return false;
873  }
874 
875  // Select cluster prototypes
877 
878  // For one-element clusters same element
879  if (it->size() == 1) {
880  it->SetPrototype(*it->begin());
881 
882  } else if (it->size() == 2) {
883 
884  // For two-element clusters - the longer sequence
885  int len1 = m_QueryData[(*it)[0]].GetLength();
886  int len2 = m_QueryData[(*it)[1]].GetLength();
887  int prot = (len1 > len2) ? (*it)[0] : (*it)[1];
888  it->SetPrototype(prot);
889 
890  } else {
891 
892  // For more than two elements - cluster center
893  it->SetPrototype(it->FindCenterElement(m_Clusterer.GetDistMatrix()));
894  }
895  }
896 
897  // Rearrenging input sequences to consider cluster information
898  vector< CRef<objects::CSeq_loc> > cluster_prototypes;
901  cluster_prototypes.push_back(m_tQueries[cluster_it->GetPrototype()]);
902  m_AllQueryData.push_back(m_QueryData[cluster_it->GetPrototype()]);
903  }
904 
905  m_tQueries.swap(cluster_prototypes);
906  cluster_prototypes.swap(m_AllQueries);
908  }
909 
910  //-------------------------------------------------------
911  if (m_Options->GetVerbose()) {
912  const vector<CSequence>& q =
915  printf("Query clusters:\n");
916  int cluster_idx = 0;
917  size_t num_in_clusters = 0;
918  ITERATE(CClusterer::TClusters, it_cl, clusters) {
919  printf("Cluster %3d: ", cluster_idx++);
920  printf("(prototype: %3d) ", it_cl->GetPrototype());
921 
922  ITERATE(CClusterer::TSingleCluster, it_el, *it_cl) {
923  printf("%d (%d), ", *it_el, q[*it_el].GetLength());
924  }
925  printf("\n");
926  if (it_cl->size() > 1) {
927  num_in_clusters += it_cl->size();
928  }
929  }
930 
931  size_t gain = m_QueryData.size() - clusters.size();
932  printf("\nNumber of queries in clusters: %lu (%.0f%%)\n",
933  num_in_clusters,
934  (double)num_in_clusters / m_QueryData.size() * 100.0);
935  printf("Number of domain searches reduced by: %lu (%.0f%%)\n\n", gain,
936  (double) gain / m_QueryData.size() * 100.0);
937 
939  printf("Distances in clusters:\n");
940  for (size_t cluster_idx=0;cluster_idx < clusters.size();
941  cluster_idx++) {
942 
943  const CClusterer::TSingleCluster& cluster = clusters[cluster_idx];
944  if (cluster.size() == 1) {
945  continue;
946  }
947 
948  printf("Cluster %d:\n", (int)cluster_idx);
949  if (cluster.size() == 2) {
950  printf(" %6.3f\n\n", d(cluster[0], cluster[1]));
951  continue;
952  }
953 
954  printf(" ");
955  for (size_t i= cluster.size() - 1;i > 0;i--) {
956  printf("%6d", (int)cluster[i]);
957  }
958  printf("\n");
959  for (size_t i=0;i < cluster.size() - 1;i++) {
960  printf("%3d:", (int)cluster[i]);
961  for (size_t j=cluster.size() - 1;j > i;j--) {
962  printf("%6.3f", d(cluster[i], cluster[j]));
963  }
964  printf("\n");
965  }
966  printf("\n\n");
967  }
968 
969  printf("Sequences that belong to different clusters with distance"
970  " smaller than threshold (exludes prototypes):\n");
971  ITERATE(CClusterer::TClusters, it, clusters) {
972  if (it->size() == 1) {
973  continue;
974  }
975  ITERATE(CClusterer::TSingleCluster, elem, *it) {
976  ITERATE(CClusterer::TClusters, cl, clusters) {
977  if (it == cl) {
978  continue;
979  }
980 
982 
983  if (*el == cl->GetPrototype()) {
984  continue;
985  }
986 
987  if (d(*elem, *el) < m_Options->GetMaxInClusterDist()) {
988  printf("%3d, %3d: %f\n", *elem, *el, d(*elem, *el));
989  }
990  }
991  }
992  }
993  }
994  printf("\n\n");
995 
996 
997  }
998  //-------------------------------------------------------
999 
1000  // Distance matrix is not needed any more, release memory
1002 
1004  }
1005 
1006  return true;
1007 }
1008 
1010 {
1011  const CClusterer::TClusters& clusters = m_Clusterer.GetClusters();
1012  m_ClusterGapPositions.clear();
1013  m_ClusterGapPositions.resize(clusters.size());
1014 
1021 
1022  // For each cluster
1023  for (size_t cluster_idx=0;cluster_idx < clusters.size();cluster_idx++) {
1024 
1025  const CClusterer::TSingleCluster& cluster = clusters[cluster_idx];
1026  CSequence& cluster_prot = m_AllQueryData[cluster.GetPrototype()];
1027 
1028  // One-element clusters contain only prototype sequence hence
1029  // nothing to align
1030  if (clusters[cluster_idx].size() > 1) {
1031 
1032  // Iterating over cluster elements
1033  ITERATE(CClusterer::TSingleCluster, seq_idx, cluster) {
1034  ASSERT((size_t)*seq_idx < m_AllQueryData.size());
1035 
1036  bool is_gap_in_prototype = false;
1037 
1038  // Skipping prototype sequence
1039  if (*seq_idx == cluster.GetPrototype()) {
1040  continue;
1041  }
1042  CSequence& cluster_seq = m_AllQueryData[*seq_idx];
1043 
1044  // Aligning cluster sequence to cluster prototype
1045  m_Aligner.SetSequences((const char*)cluster_seq.GetSequence(),
1046  cluster_seq.GetLength(),
1047  (const char*)cluster_prot.GetSequence(),
1048  cluster_prot.GetLength());
1049 
1050  m_Aligner.SetEndSpaceFree(false, false, false, false);
1051 
1052  // If there is a large length disparity between the two
1053  // sequences, reduce or eliminate gap penalties.
1054  int len1 = cluster_seq.GetLength();
1055  int len2 = cluster_prot.GetLength();
1056  if (len1 > 1.2 * len2 || len2 > 1.2 * len1) {
1059  }
1060 
1061  // Run aligner
1062  m_Aligner.Run();
1063 
1064  // Reset gap penalties
1071 
1073  cluster_seq.PropagateGaps(t, CNWAligner::eTS_Insert);
1074 
1075  // Saving gap positions with respect to non-gap letters only
1076  for (size_t j=0;j < t.size();j++) {
1077  if (t[j] == CNWAligner::eTS_Delete) {
1078  is_gap_in_prototype = true;
1079  break;
1080  }
1081  }
1082 
1083  if (!is_gap_in_prototype) {
1084  continue;
1085  }
1086 
1087  cluster_prot.PropagateGaps(t, CNWAligner::eTS_Delete);
1088 
1089  // If gaps are added to cluster prototype they also need to
1090  // be added to sequences that were already aligned
1092  for (it=clusters[cluster_idx].begin();it != seq_idx;++it) {
1093  if (*it == cluster.GetPrototype()) {
1094  continue;
1095  }
1096 
1097  m_AllQueryData[*it].PropagateGaps(t,
1099  }
1100  }
1101 
1102  Uint4 pos = 0;
1103  for (Uint4 i=0;i < (Uint4)cluster_prot.GetLength();i++) {
1104  if (cluster_prot.GetLetter(i) == CSequence::kGapChar) {
1105  m_ClusterGapPositions[cluster_idx].push_back(pos);
1106  }
1107  else {
1108  pos++;
1109  }
1110  }
1111 
1112  //------------------------------------------------------------
1113  if (m_Options->GetVerbose()) {
1114  printf("Aligning in cluster %d:\n", (int)cluster_idx);
1115  ITERATE(CClusterer::TSingleCluster, elem, cluster) {
1116  const CSequence& seq = m_AllQueryData[*elem];
1117  printf("%3d: ", *elem);
1118  for (int i=0;i < seq.GetLength();i++) {
1119  printf("%c", seq.GetPrintableLetter(i));
1120  }
1121  printf("\n");
1122  }
1123  printf("\n\n");
1124  }
1125  //------------------------------------------------------------
1126  }
1127 
1128  // check for interrupt
1131  "Alignement Interrupted");
1132  }
1133  }
1134  //--------------------------------------------------------------------
1135  if (m_Options->GetVerbose()) {
1136  for (size_t i=0;i < m_ClusterGapPositions.size();i++) {
1137  if (m_ClusterGapPositions[i].empty()) {
1138  continue;
1139  }
1140 
1141  printf("Gaps in cluster %d: ", (int)i);
1142  size_t j;
1143  for (j=0;j < m_ClusterGapPositions[i].size() - 1;j++) {
1144  printf("%3d, ", m_ClusterGapPositions[i][j]);
1145  }
1146  printf("%3d\n", m_ClusterGapPositions[i][j]);
1147  }
1148  printf("\n\n");
1149  }
1150  //--------------------------------------------------------------------
1151 
1152  // If there is only one cluster no multiple alignment is done
1153  if (clusters.size() == 1) {
1154  m_AllQueryData.swap(m_Results);
1155  }
1156 
1157 }
1158 
1159 // Initiate regular column of multiple alignment
1160 void CMultiAligner::x_InitColumn(vector<CMultiAligner::SColumn>::iterator& it,
1161  size_t len)
1162 {
1163  it->insert = false;
1164  it->letters.resize(len);
1165  for (size_t i=0;i < len;i++) {
1166  it->letters[i] = -1;
1167  }
1168  it->number = 1;
1169  it->cluster = -1;
1170 }
1171 
1172 
1173 // Initiate a range from in-cluster alignment for insertion into multiple
1174 // alignment
1176  vector<CMultiAligner::SColumn>::iterator& it,
1177  size_t len, int num, int cluster)
1178 {
1179  it->insert = true;
1180  it->letters.resize(len);
1181  for (size_t i=0;i < len;i++) {
1182  it->letters[i] = -1;
1183  }
1184  it->number = num;
1185  it->cluster = cluster;
1186 }
1187 
1189 {
1190  const CClusterer::TClusters& clusters = m_Clusterer.GetClusters();
1191 
1192  int seq_length = m_Results[0].GetLength();
1193  size_t num_seqs = m_AllQueryData.size();
1194 
1195  vector<int> letter_inds(clusters.size());
1196  vector<SColumn> columns(seq_length);
1197 
1198  // Represent multiple alignment as list of columns of positions in input
1199  // sequences (n-th letter)
1200  int col = 0;
1201  NON_CONST_ITERATE(vector<SColumn>, it, columns) {
1202  x_InitColumn(it, num_seqs);
1203  for (size_t j=0;j < clusters.size();j++) {
1204  if (m_Results[j].GetLetter(col) != CSequence::kGapChar) {
1205  it->letters[clusters[j].GetPrototype()] = letter_inds[j]++;
1206  }
1207  }
1208  col++;
1209  }
1210 
1211  // Insert in-cluster ranges to columns
1212  int new_length = seq_length;
1213 
1214  // for each cluster
1215  for (size_t cluster_idx=0;cluster_idx < clusters.size();cluster_idx++) {
1216 
1217  // for each gap position
1218  for (size_t i=0;i < m_ClusterGapPositions[cluster_idx].size();i++) {
1219 
1220  // get letter before which the gap needs to be placed
1221  Uint4 letter = m_ClusterGapPositions[cluster_idx][i];
1222  Uint4 offset = (Uint4)i;
1223  int num = 1;
1224 
1225  // combine all gaps before the same letter as one range
1226  while (i < m_ClusterGapPositions[cluster_idx].size() - 1
1227  && m_ClusterGapPositions[cluster_idx][i + 1] == letter) {
1228  i++;
1229  num++;
1230  }
1231 
1232  // find that column that contains this letter
1233  vector<SColumn>::iterator it = columns.begin();
1234  size_t prototype_idx = clusters[cluster_idx].GetPrototype();
1235  while (it != columns.end()
1236  && (it->insert || it->letters[prototype_idx] < (int)letter)) {
1237  ++it;
1238  }
1239 
1240  // insert the range in all cluster sequences
1241  it = columns.insert(it, SColumn());
1242  x_InitInsertColumn(it, num_seqs, num, (int)cluster_idx);
1243  ITERATE(CClusterer::TSingleCluster, elem, clusters[cluster_idx]) {
1244 
1245  // for insert ranges leter index is absolute index in
1246  // in-cluster alignment
1247  it->letters[*elem] = letter + offset;
1248  }
1249 
1250  // extend the length of the sequences by added ranges
1251  new_length += num;
1252 
1253  // check for interrupt
1256  "Alignment interrupted");
1257  }
1258  }
1259  }
1260 
1261 
1262  // Convert columns to array of CSequence
1263  vector<CSequence> results(m_AllQueryData.size());
1264 
1265  // Initialize all sequences to gaps
1266  NON_CONST_ITERATE(vector<CSequence>, it, results) {
1267  it->Reset(new_length);
1268  }
1269 
1270 
1271  // offsets caused by in-cluster gaps in cluster prototypes
1272  vector<int> gap_offsets(clusters.size());
1273  col = 0;
1274 
1275  // for each column
1276  ITERATE(vector<SColumn>, it, columns) {
1277  if (!it->insert) {
1278 
1279  // for regular column
1280  // for each cluster
1281  for (size_t i=0;i < clusters.size();i++) {
1282 
1283  // find letter index in input sequnece (n-th letter)
1284  size_t prototype_idx = clusters[i].GetPrototype();
1285  int letter = it->letters[prototype_idx];
1286 
1287  // if gap in this position, do nothing
1288  if (letter < 0) {
1289  continue;
1290  }
1291 
1292  // find correct location by skipping gaps in in-cluster
1293  // alingment
1294  // NOTE: This index juggling could be simplified if we
1295  // kept an array of unmodified input sequences
1296  while (m_AllQueryData[prototype_idx].GetLetter(letter
1297  + gap_offsets[i]) == CSequence::kGapChar) {
1298  gap_offsets[i]++;
1299  }
1300 
1301  // insert letter in all cluster sequences
1302  ITERATE(CClusterer::TSingleCluster, elem, clusters[i]) {
1303  results[*elem].SetLetter(col,
1304  m_AllQueryData[*elem].GetLetter(letter + gap_offsets[i]));
1305  }
1306  }
1307  }
1308  else {
1309 
1310  // for insetr columns
1311  // for each cluster sequence copy letters from in-cluster alignment
1312  ITERATE(CClusterer::TSingleCluster, elem, clusters[it->cluster]) {
1313  for (int i=0; i < it->number;i++) {
1314  results[*elem].SetLetter(col + i,
1315  m_AllQueryData[*elem].GetLetter(it->letters[*elem] + i));
1316  }
1317  }
1318 
1319  }
1320  col += it->number;
1321 
1322  // check for interrupt
1325  "Alignment interrupted");
1326  }
1327  }
1328 
1329 
1330  //----------------------------------------------------------------------
1331  if (m_Options->GetVerbose()) {
1332  printf("Cluster prototypes:\n");
1333  ITERATE(CClusterer::TClusters, it, clusters) {
1334  const CSequence& seq = results[it->GetPrototype()];
1335  for (int i=0;i < seq.GetLength();i++) {
1336  printf("%c", (char)seq.GetPrintableLetter(i));
1337  }
1338  printf("\n");
1339  }
1340  printf("\n\n");
1341 
1342  printf("Individual clusters:\n");
1343  for (int i=0;i < (int)clusters.size();i++) {
1344  if (clusters[i].size() > 1) {
1345  printf("Cluster %d:\n", i);
1346  ITERATE(CClusterer::TSingleCluster, elem, clusters[i]) {
1347  const CSequence& seq = results[*elem];
1348  for (int j=0;j < seq.GetLength();j++) {
1349  printf("%c", (char)seq.GetPrintableLetter(j));
1350  }
1351  printf("\n");
1352  }
1353  printf("\n");
1354  }
1355  }
1356  printf("\n\n");
1357 
1358 
1359  printf("All queries:\n");
1360  ITERATE(vector<CSequence>, it, results) {
1361  const CSequence& seq = *it;
1362  for (int i=0;i < seq.GetLength();i++) {
1363  printf("%c", (char)seq.GetPrintableLetter(i));
1364  }
1365  printf("\n");
1366  }
1367  printf("\n\n");
1368  }
1369  //----------------------------------------------------------------------
1370 
1371  m_Results.swap(results);
1372  m_tQueries.swap(m_AllQueries);
1373 }
1374 
1375 
1376 // Frequencies are not normalized
1378 {
1379  // Iterate over all clusters
1380  const CClusterer::TClusters& clusters = m_Clusterer.GetClusters();
1381  for (size_t cluster_idx=0;cluster_idx < clusters.size();cluster_idx++) {
1382 
1383  const CClusterer::TSingleCluster& cluster = clusters[cluster_idx];
1384  _ASSERT(cluster.size() >= 1);
1385 
1386  // Skip one-element clusters
1387  if (cluster.size() == 1) {
1388  continue;
1389  }
1390 
1391  CSequence& prototype = m_QueryData[cluster_idx];
1392  CSequence::TFreqMatrix& freqs = prototype.GetFreqs();
1393  int size = prototype.GetLength();
1394 
1395  // For all cluster elements
1396  ITERATE(CClusterer::TSingleCluster, elem, cluster) {
1397 
1398  // Values from the prototype are already in the matrix
1399  if (*elem == cluster.GetPrototype()) {
1400  continue;
1401  }
1402 
1403  // Add frequencies from current sequence
1404  CSequence::TFreqMatrix& matrix = m_AllQueryData[*elem].GetFreqs();
1405  _ASSERT(matrix.GetRows() == freqs.GetRows()
1406  + m_ClusterGapPositions[cluster_idx].size());
1407  Uint4 gap_idx = 0, offset = 0;
1408  for (Uint4 i=0;i < (Uint4)size;i++) {
1409  while (gap_idx < m_ClusterGapPositions[cluster_idx].size()
1410  && i == m_ClusterGapPositions[cluster_idx][gap_idx]) {
1411  offset++;
1412  gap_idx++;
1413  }
1414  for (int k=0;k < kAlphabetSize;k++) {
1415  freqs(i, k) += matrix(i + offset, k);
1416  }
1417  }
1418  _ASSERT(offset == m_ClusterGapPositions[cluster_idx].size()
1419  || m_ClusterGapPositions[cluster_idx][gap_idx]
1420  == (Uint4)prototype.GetLength());
1421  }
1422 
1423  // check for interrupt
1426  "Alignment interrupted");
1427  }
1428 
1429  }
1430 
1431 }
1432 
1434  vector<int>& indices)
1435 {
1436  queries.clear();
1437  indices.clear();
1438 
1439  // if aligning MSAs
1440  if (!m_InMSA1.empty()) {
1441  ITERATE (vector<int>, it, m_Msa1Repr) {
1442  _ASSERT(*it < (int)m_InMSA1.size());
1443 
1444  blast::SSeqLoc sl(*m_tQueries[*it], *m_Scope);
1445  queries.push_back(sl);
1446  indices.push_back(*it);
1447  }
1448  ITERATE (vector<int>, it, m_Msa2Repr) {
1449  _ASSERT(*it < (int)m_InMSA2.size());
1450  _ASSERT(m_InMSA1.size() + *it < m_tQueries.size());
1451 
1452  blast::SSeqLoc sl(*m_tQueries[m_InMSA1.size() + *it], *m_Scope);
1453  queries.push_back(sl);
1454  indices.push_back((int)m_InMSA1.size() + *it);
1455  }
1456 
1457  return;
1458  }
1459 
1460  // if aligning sequences
1461 
1462  switch (m_ClustAlnMethod) {
1463 
1466  ITERATE(vector< CRef<objects::CSeq_loc> >, it, m_tQueries) {
1467  blast::SSeqLoc sl(**it, *m_Scope);
1468  queries.push_back(sl);
1469  }
1470  indices.resize(m_tQueries.size());
1471  for (int i=0;i < (int)m_tQueries.size();i++) {
1472  indices[i] = i;
1473  }
1474  break;
1475 
1478  int index = it->GetPrototype();
1479  blast::SSeqLoc sl(*m_tQueries[index], *m_Scope);
1480  queries.push_back(sl);
1481  indices.push_back(index);
1482  }
1483  break;
1484 
1485  default:
1486  NCBI_THROW(CMultiAlignerException, eInvalidOptions,
1487  "Invalid in-cluster alignment method");
1488  }
1489 
1490 }
1491 
1492 
1493 void CMultiAligner::x_CreatePatternQueries(vector<const CSequence*>& queries,
1494  vector<int>& indices)
1495 {
1496 
1497  switch (m_ClustAlnMethod) {
1498 
1501  queries.resize(m_QueryData.size());
1502  indices.resize(m_QueryData.size());
1503  for (size_t i=0;i < m_QueryData.size();i++) {
1504  queries[i] = &m_QueryData[i];
1505  indices[i] = (int)i;
1506  }
1507  break;
1508 
1510  {
1511  const CClusterer::TClusters& clusters = m_Clusterer.GetClusters();
1512  queries.resize(clusters.size());
1513  indices.resize(clusters.size());
1514  for (size_t i=0;i < clusters.size();i++) {
1515  int index = clusters[i].GetPrototype();
1516  queries[i] = &m_QueryData[index];
1517  indices[i] = index;
1518  }
1519  }
1520  break;
1521 
1522  default:
1523  NCBI_THROW(CMultiAlignerException, eInvalidOptions,
1524  "Invalid in-cluster alignment method");
1525  }
1526 
1527 }
1528 
1529 /// Create phylogenetic tree for two sequences. This will be root and two
1530 /// children.
1531 /// @param ids Indices of the sequences in the array of queries [in]
1532 /// @param distance Distance between the sequences [in]
1533 /// @return Root of computed phylogenetic tree
1535  double distance)
1536 {
1537  _ASSERT(ids.size() == 2);
1538 
1539  TPhyTreeNode *root = new TPhyTreeNode();
1540  root->GetValue().SetDist(0.0);
1541  double node_dist = distance / 2.0;
1542 
1543  // so that edges can be scaled later
1544  if (node_dist <= 0.0) {
1545  node_dist = 1.0;
1546  }
1547 
1548  TPhyTreeNode* node = new TPhyTreeNode();
1549  node->GetValue().SetId(ids[0]);
1550  // Label is set so that serialized tree can be used in external programs
1551  node->GetValue().SetLabel(NStr::IntToString(ids[0]));
1552  node->GetValue().SetDist(node_dist);
1553  root->AddNode(node);
1554 
1555  node = new TPhyTreeNode();
1556  node->GetValue().SetId(ids[1]);
1557  node->GetValue().SetLabel(NStr::IntToString(ids[1]));
1558  node->GetValue().SetDist(node_dist);
1559  root->AddNode(node);
1560 
1561  return root;
1562 }
1563 
1564 /// Change ids of leaf nodes in a given tree to desired values (recursive).
1565 /// Function assumes that current leaf ids are 0,...,number of lefs. Each id i
1566 /// will be changed to i-th element of the given array.
1567 /// @param node Tree root [in|out]
1568 /// @param ids List of desired ids [in]
1569 static void s_SetLeafIds(TPhyTreeNode* node,
1570  const CClusterer::CSingleCluster& ids)
1571 {
1572  if (node->IsLeaf()) {
1573  _ASSERT(node->GetValue().GetId() < (int)ids.size());
1574 
1575  int id = ids[node->GetValue().GetId()];
1576  node->GetValue().SetId(id);
1577 
1578  // Labels are used to identify sequence in serialized tree.
1579  // They are set so that the tree can be used by external programs
1580  node->GetValue().SetLabel(NStr::IntToString(id));
1581 
1582  return;
1583  }
1584 
1586  while (child != node->SubNodeEnd()) {
1587 
1588  s_SetLeafIds(*child, ids);
1589  child++;
1590  }
1591 }
1592 
1593 void CMultiAligner::x_ComputeClusterTrees(vector<TPhyTreeNode*>& trees)
1594 {
1595  const CClusterer::TClusters& clusters = m_Clusterer.GetClusters();
1596 
1598  m_Clusterer.ReleaseTrees(trees);
1599  _ASSERT(trees.size() == clusters.size());
1600 
1601  // Trees for one-element clusters are not needed
1602  // Tree root == NULL indicates one-elemet cluster
1603  for (size_t i=0;i < trees.size();i++) {
1604  _ASSERT(trees[i]);
1605 
1606  if (clusters[i].size() == 1) {
1607  delete trees[i];
1608  trees[i] = NULL;
1609  }
1610  }
1611  }
1612  else {
1613 
1614  trees.resize(clusters.size());
1615  for (int clust_idx=0;clust_idx < (int)clusters.size();clust_idx++) {
1616  const CClusterer::CSingleCluster& cluster = clusters[clust_idx];
1617 
1618  // Tree root == NULL indicates one-elemet cluster
1619  if (cluster.size() == 1) {
1620  trees[clust_idx] = NULL;
1621  continue;
1622  }
1623 
1624  if (cluster.size() == 2) {
1625  trees[clust_idx] = s_MakeTwoLeafTree(cluster,
1626  (m_Clusterer.GetDistMatrix())(cluster[0], cluster[1]));
1627 
1628  continue;
1629  }
1630 
1632  m_Clusterer.GetClusterDistMatrix(clust_idx, mat);
1633  CTree single_tree(mat,
1635  TPhyTreeNode* root = single_tree.ReleaseTree();
1636 
1637  // Set node id's that correspod to cluster sequences
1638  s_SetLeafIds(root, cluster);
1639 
1640  trees[clust_idx] = root;
1641  }
1642  }
1643 
1644  //----------------------------------------------------------------
1645  if (m_Options->GetVerbose()) {
1646  for (size_t i=0;i < trees.size();i++) {
1647  if (trees[i]) {
1648  printf("Tree for cluster %d:\n", (int)i);
1649  CTree::PrintTree(trees[i]);
1650  printf("\n");
1651  }
1652  }
1653  }
1654  //----------------------------------------------------------------
1655 }
1656 
1657 
1658 /// Compute length of the edge or distance from root for each leaf (recursive).
1659 /// @param tree Tree root [in]
1660 /// @param dist_from_root Current distance from root, used for recurence [in]
1661 /// @param leaf_dists Leaf distances, vector must be allocated [out]
1662 /// @param leaf_nodes Pointers to leaf nodes. Need to be initialized to NULLs
1663 /// [out]
1664 /// @param last_edge_only If true, length of last edge is returned for each
1665 /// leaf, distance from root otherwise [in]
1666 static void s_FindLeafDistances(TPhyTreeNode* tree, double dist_from_root,
1667  vector<double>& leaf_dists,
1668  vector<TPhyTreeNode*>& leaf_nodes,
1669  bool last_edge_only = false)
1670 {
1671  _ASSERT(!tree->GetParent() || tree->GetValue().IsSetDist());
1672 
1673  if (tree->IsLeaf()) {
1674 
1675  int id = tree->GetValue().GetId();
1676  double dist = tree->GetValue().GetDist();
1677  if (!last_edge_only) {
1678  dist += dist_from_root;
1679  }
1680 
1681  _ASSERT(id < (int)leaf_dists.size());
1682  leaf_dists[id] = dist;
1683 
1684 
1685  _ASSERT(id < (int)leaf_nodes.size() && !leaf_nodes[id]);
1686  leaf_nodes[id] = tree;
1687 
1688  return;
1689  }
1690 
1691  double dist;
1692  if (tree->GetParent() && tree->GetValue().IsSetDist() && !last_edge_only) {
1693  dist = tree->GetValue().GetDist();
1694  }
1695  else {
1696  dist = 0.0;
1697  }
1698 
1699  TPhyTreeNode::TNodeList_CI it = tree->SubNodeBegin();
1700  while (it != tree->SubNodeEnd()) {
1701  s_FindLeafDistances(*it, dist_from_root + dist, leaf_dists, leaf_nodes,
1702  last_edge_only);
1703  it++;
1704  }
1705 }
1706 
1707 /// Find distance from root for selected node (recursive).
1708 /// @param node Tree root [in]
1709 /// @param id Node id for selected node [in]
1710 /// @param dist_from_root Current distance from root, for recurrence [in]
1711 /// @return Distance from root for node with given id, or -1.0 if node not
1712 /// found
1713 static double s_FindNodeDistance(const TPhyTreeNode* node, int id,
1714  double dist_from_root)
1715 {
1716  _ASSERT(!node->GetParent() || node->GetValue().IsSetDist());
1717 
1718  if (node->GetValue().GetId() == id) {
1719  return dist_from_root + node->GetValue().GetDist();
1720  }
1721 
1722  if (node->IsLeaf()) {
1723  return -1.0;
1724  }
1725 
1726  double dist;
1727  if (!node->GetParent()) {
1728  dist = 0.0;
1729  }
1730  else {
1731  _ASSERT(node->GetValue().IsSetDist());
1732  dist = node->GetValue().GetDist();
1733  }
1734 
1736 
1737  double result = -1.0;
1738  while (it != node->SubNodeEnd() && result <= -1.0) {
1739  result = s_FindNodeDistance(*it, id, dist_from_root + dist);
1740  it++;
1741  }
1742 
1743  return result;
1744 }
1745 
1746 /// Scale all tree edges by given factor (recursive).
1747 /// @param node Tree root [in|out]
1748 /// @param scale Scaling factor [in]
1749 static void s_ScaleTreeEdges(TPhyTreeNode* node, double scale)
1750 {
1751  _ASSERT(!node->GetParent() || node->GetValue().IsSetDist());
1752 
1753  node->GetValue().SetDist(node->GetValue().GetDist() * scale);
1754 
1755  if (node->IsLeaf()) {
1756  return;
1757  }
1758 
1760  while (it != node->SubNodeEnd()) {
1761  s_ScaleTreeEdges(*it, scale);
1762  it++;
1763  }
1764 }
1765 
1766 /// Rescale tree so that node with given id has desired distance from root
1767 /// @param tree Tree root [in|out]
1768 /// @param id Id of node that is to have desired distance from root [in]
1769 /// @param dist Desired distance from root for desired node [in]
1770 static void s_RescaleTree(TPhyTreeNode* tree, int id, double dist)
1771 {
1772  // Find current distance from root
1773  double curr_dist = s_FindNodeDistance(tree, id, 0.0);
1774 
1775  _ASSERT(dist > 0.0);
1776 
1777  // Find scale
1778  double scale;
1779  if (curr_dist > 0.0) {
1780  scale = dist / curr_dist;
1781  }
1782  else {
1783  scale = dist;
1784  }
1785 
1786  // Scale all edges of the tree
1787  s_ScaleTreeEdges(tree, scale);
1788 }
1789 
1791  const vector<TPhyTreeNode*>& cluster_trees,
1792  const vector<TPhyTreeNode*>& cluster_leaves)
1793 {
1794  ITERATE(vector<TPhyTreeNode*>, it, cluster_leaves) {
1795  // For each leaf here
1796  TPhyTreeNode* node = *it;
1797 
1798  _ASSERT(node && node->IsLeaf());
1799  _ASSERT(node->GetValue().IsSetDist());
1800 
1801  // find query cluster it represents and get cluster subtree
1802  int cluster_id = node->GetValue().GetId();
1803  TPhyTreeNode* subtree = cluster_trees[cluster_id];
1804 
1805  // NULL pointer indicates one-element cluster
1806  // there is no subtree to attach, but node id must be changed from
1807  // cluster id to sequence id
1808  if (!subtree) {
1809  const CClusterer::CSingleCluster& cluster
1810  = m_Clusterer.GetClusters()[cluster_id];
1811  int seq_id = cluster[0];
1812 
1813  node->GetValue().SetId(seq_id);
1814  node->GetValue().SetLabel(NStr::IntToString(seq_id));
1815 
1816  continue;
1817  }
1818 
1819  // id >= kClusterNodeId denotes root of cluster subtree
1820  node->GetValue().SetId(kClusterNodeId + cluster_id);
1821  node->GetValue().SetLabel("");
1822 
1823  // Detach subtree children and attach them to the leaf node.
1824  // This prevents problems in recursion
1825  vector<TPhyTreeNode*> children;
1826  TPhyTreeNode::TNodeList_I child(subtree->SubNodeBegin());
1827  while (child != subtree->SubNodeEnd()) {
1828  children.push_back(*child);
1829  child++;
1830  }
1831  ITERATE(vector<TPhyTreeNode*>, it, children) {
1832  subtree->DetachNode(*it);
1833  node->AddNode(*it);
1834  }
1835  delete subtree;
1836  subtree = NULL;
1837 
1838  // node replaces root of the subtree
1839  node->GetValue().SetDist(0.0);
1840 
1841  }
1842 }
1843 
1844 void CMultiAligner::x_BuildFullTree(const vector<TPhyTreeNode*>& cluster_trees)
1845 {
1846  _ASSERT(m_Tree.GetTree());
1847 
1848  const CClusterer::TClusters& clusters = m_Clusterer.GetClusters();
1849  _ASSERT(cluster_trees.size() == clusters.size());
1850 
1851  // Find leaf nodes and lengths of leaf edges in the tree of cluster
1852  // prototypes
1853  vector<double> cluster_dists(clusters.size(), 0.0);
1854  vector<TPhyTreeNode*> cluster_leaves(clusters.size(), NULL);
1855  s_FindLeafDistances(m_Tree.GetTree(), 0.0, cluster_dists, cluster_leaves,
1856  true);
1857  //------------------------------------------------------------
1858  if (m_Options->GetVerbose()) {
1859  vector<TPhyTreeNode*> dummy_vect(clusters.size(), NULL);
1860  vector<double>d(cluster_dists.size());
1861  s_FindLeafDistances(m_Tree.GetTree(), 0.0, d, dummy_vect, false);
1862  for (size_t i=0;i < d.size();i++) {
1863  printf("%d:%f ", (int)i, d[i]);
1864  }
1865  printf("\n");
1866  }
1867  //------------------------------------------------------------
1868 
1869  // For each cluster tree
1870  for (size_t i=0;i < cluster_trees.size();i++) {
1871 
1872  // skip one-element clusters
1873  if (!cluster_trees[i]) {
1874  continue;
1875  }
1876 
1877  const CClusterer::CSingleCluster& cluster = clusters[i];
1878 
1879  // if the length of leaf edge is non-positive, set it to a small value
1880  if (cluster_dists[i] <= 0.0) {
1881  cluster_dists[i] = 1e-5;
1882  }
1883 
1884  // rescale cluster tree so that distance from root to cluster
1885  // representative is the same as leaf edge in the tree of cluster
1886  // prototypes
1887  s_RescaleTree(cluster_trees[i], cluster.GetPrototype(),
1888  cluster_dists[i]);
1889  }
1890 
1891  // Attach cluster trees to the guide tree
1892  x_AttachClusterTrees(cluster_trees, cluster_leaves);
1893 
1894  //------------------------------------------------------------
1895  if (m_Options->GetVerbose()) {
1896  vector<TPhyTreeNode*> dummy_vect(m_QueryData.size(), NULL);
1897  cluster_dists.resize(m_QueryData.size(), 0.0);
1898  s_FindLeafDistances(m_Tree.GetTree(), 0.0, cluster_dists, dummy_vect,
1899  false);
1900  for (size_t i=0;i < cluster_dists.size();i++) {
1901  printf("%d:%f ", (int)i, cluster_dists[i]);
1902  }
1903  printf("\n");
1904  }
1905 
1906  if (m_Options->GetVerbose()) {
1907  printf("Full tree:\n");
1909  printf("\n");
1910  }
1911  //------------------------------------------------------------
1912 
1913 
1914 }
1915 
1916 
1917 END_SCOPE(cobalt)
static const int kAlphabetSize
The aligner internally works only with the ncbistdaa alphabet.
Definition: base.hpp:119
Declares the BLAST exception class.
Int2 Blast_KarlinBlkGappedLoadFromTables(Blast_KarlinBlk *kbp, Int4 gap_open, Int4 gap_extend, const char *matrix_name, Boolean standard_only)
Attempts to fill KarlinBlk for given gap opening, extensions etc.
Definition: blast_stat.c:3576
int GetPrototype(void) const
Get cluster prototype.
Definition: clusterer.hpp:103
size_t size(void) const
Get cluster size.
Definition: clusterer.hpp:118
vector< int >::const_iterator const_iterator
Definition: clusterer.hpp:77
Interface for CClusterer class used for clustering any type of data based on distance matrix.
Definition: clusterer.hpp:53
void ReleaseTrees(vector< TPhyTreeNode * > &trees)
Get list of trees for clusters and release ownership to caller.
Definition: clusterer.cpp:1073
@ eClique
Clusters can be joined if there is a link between all pairs of their elements.
Definition: clusterer.hpp:67
@ eCompleteLinkage
Maximum distance between elements.
Definition: clusterer.hpp:61
TPhyTreeNode * ReleaseTree(int index=0)
Get cluster tree and release ownership to caller.
Definition: clusterer.cpp:1101
void Reset(void)
Clear clusters and distance matrix.
Definition: clusterer.cpp:1148
void SetMakeTrees(bool trees)
Set make cluster tree/dendrogram option.
Definition: clusterer.hpp:230
void SetDistMatrix(const TDistMatrix &dmat)
Set new distance matrix.
Definition: clusterer.cpp:97
const TDistMatrix & GetDistMatrix(void) const
Get distance matrix.
Definition: clusterer.cpp:118
void SetLinks(CRef< CLinks > links)
Set distance links.
Definition: clusterer.cpp:113
void Run(void)
Cluster elements.
Definition: clusterer.cpp:1157
const TClusters & GetClusters(void) const
Get clusters.
Definition: clusterer.hpp:272
void ComputeClusters(double max_diam, EDistMethod dist_method=eCompleteLinkage, bool do_trees=true, double infinity=-1.0)
Compute clusters.
Definition: clusterer.cpp:272
void SetClustMethod(EClustMethod method)
Set clustering method for links.
Definition: clusterer.hpp:220
vector< TSingleCluster > TClusters
Definition: clusterer.hpp:160
TClusters & SetClusters(void)
Set clusters.
Definition: clusterer.hpp:277
void PurgeDistMatrix(void)
Delete distance matrix.
Definition: clusterer.hpp:323
void GetClusterDistMatrix(int index, TDistMatrix &mat) const
Get distance matrix for elements of a selected cluster.
Definition: clusterer.cpp:1123
Representation of pairwise distances, intended for use in multiple sequence alignment applications.
Definition: dist.hpp:56
const CDistMethods::TMatrix & GetMatrix() const
Access the current distance matrix.
Definition: dist.hpp:103
Interface for the traceback from blast hits.
Definition: traceback.hpp:55
An ordered collection of CHit objects.
Definition: hitlist.hpp:50
int Size() const
Retrieve number of hits in list.
Definition: hitlist.hpp:75
void PurgeAllHits()
Delete all hits unconditionally.
Definition: hitlist.hpp:148
CHit * GetHit(int index)
Retrieve a hit from the hitlist.
Definition: hitlist.hpp:93
void AddToHitList(CHit *hit)
Append a hit to the hitlist.
Definition: hitlist.hpp:84
A generalized representation of a pairwise alignment.
Definition: hit.hpp:86
int m_SeqIndex1
Numerical identifier for first sequence in alignment.
Definition: hit.hpp:97
int m_SeqIndex2
Numerical identifier for second sequence in alignment.
Definition: hit.hpp:101
TRange m_SeqRange1
The range of offsets on the first sequence.
Definition: hit.hpp:107
TRange m_SeqRange2
The range of offsets on the second sequence.
Definition: hit.hpp:110
Options and parameters for multiple alignement.
Definition: options.hpp:95
@ eFastME
Fast Minimum Evolution.
Definition: options.hpp:260
@ eClusters
Clustering dendrogram.
Definition: options.hpp:261
TScore GetEndGapExtendPenalty(void) const
Get gap extension penalty for end gaps in pairwise global alignment of profiles.
Definition: options.hpp:658
double GetMaxInClusterDist(void) const
Get maximum allowed distance between sequences in a cluster.
Definition: options.hpp:476
string GetScoreMatrixName(void) const
Get alignment score matrix name.
Definition: options.hpp:606
TKMethods::EDistMeasures GetKmerDistMeasure(void) const
Get method for computing distance between word count vectors.
Definition: options.hpp:463
int GetCentralSeq(void) const
Get central sequence.
Definition: options.hpp:740
EInClustAlnMethod GetInClustAlnMethod(void) const
Definition: options.hpp:696
TScore GetGapExtendPenalty(void) const
Get gap extension penlaty for middle gaps in pairwise global alignment of profiles.
Definition: options.hpp:632
const TConstraints & GetUserConstraints(void) const
Get user constraints.
Definition: options.hpp:410
TScore GetGapOpenPenalty(void) const
Get gap opening penalty for middle gaps in pairwise global alignment of profiles.
Definition: options.hpp:619
TScore GetEndGapOpenPenalty(void) const
Get gap opening penalty for end gaps in pairwise global alignment of profiles.
Definition: options.hpp:645
TKMethods::ECompressedAlphabet GetKmerAlphabet(void) const
Get alphabet used for creating word count vectors.
Definition: options.hpp:450
ETreeMethod GetTreeMethod(void) const
Get method for creating tree that guides progressive alignment.
Definition: options.hpp:579
vector< SConstraint > TConstraints
Definition: options.hpp:222
bool GetUseQueryClusters(void) const
Check if query clustering option is on.
Definition: options.hpp:311
@ eToPrototype
All cluster elements are aligner to cluster prototype.
Definition: options.hpp:266
@ eMulti
Alignment guide tree for each cluster is attached to the main alignment guide tree.
Definition: options.hpp:269
@ eNone
No clustering.
Definition: options.hpp:265
int GetKmerLength(void) const
Get word size for creating word count vectors.
Definition: options.hpp:439
int GetUserConstraintsScore(void) const
Get score for user alignment constraints.
Definition: options.hpp:424
bool GetVerbose(void) const
Get verbose mode.
Definition: options.hpp:691
Simultaneously align multiple protein sequences.
Definition: cobalt.hpp:69
vector< CSequence > m_AllQueryData
Definition: cobalt.hpp:728
vector< vector< Uint4 > > m_ClusterGapPositions
Definition: cobalt.hpp:726
CMultiAlignerOptions::EInClustAlnMethod m_ClustAlnMethod
Definition: cobalt.hpp:744
void x_SetScoreMatrix(const char *matrix_name)
Set the score matrix the aligner will use.
Definition: cobalt.cpp:381
SProgress m_ProgressMonitor
Definition: cobalt.hpp:737
CRef< objects::CScope > m_Scope
Definition: cobalt.hpp:689
void x_ComputeClusterTrees(vector< TPhyTreeNode * > &trees)
Compute independent phylogenetic trees each cluster.
Definition: cobalt.cpp:1593
void x_FindLocalInClusterHits(const vector< TPhyTreeNode * > &cluster_trees)
Run blast on sequences from each cluster subtree.
Definition: blast.cpp:481
static void x_InitInsertColumn(vector< SColumn >::iterator &it, size_t len, int num, int cluster)
Definition: cobalt.cpp:1175
vector< int > m_Msa2Repr
Indices of sequence representatives in input alignment 2.
Definition: cobalt.hpp:701
vector< CRef< objects::CSeq_loc > > m_tQueries
Definition: cobalt.hpp:688
void x_MakeClusterResidueFrequencies()
Compute profile residue frequencies for clusters.
Definition: cobalt.cpp:1377
vector< CRef< objects::CSeq_loc > > m_AllQueries
Definition: cobalt.hpp:727
CHitList m_CombinedHits
Definition: cobalt.hpp:718
TStatus Run(void)
Align the current set of input sequences (reset any existing alignment information).
Definition: cobalt.cpp:683
struct CMultiAligner::SColumn SColumn
Column in an alignment used for combining result from multiple alignment and pair-wise in-cluster ali...
void x_ComputeTree()
Given the current list of domain and local hits, generate a phylogenetic tree that clusters the curre...
Definition: cobalt.cpp:414
void x_FindLocalHits(const blast::TSeqLocVector &queries, const vector< int > &indices)
Run blast on selected input sequences and postprocess the results.
Definition: blast.cpp:306
EStatus
Return status.
Definition: cobalt.hpp:77
@ eOutOfMemory
Out of memory error.
Definition: cobalt.hpp:84
@ eInternalError
Unexpected error occured.
Definition: cobalt.hpp:82
@ eSuccess
Alignment successfully completed.
Definition: cobalt.hpp:78
@ eInterrupt
Alignment interruped through callback function.
Definition: cobalt.hpp:83
@ eOptionsError
Error related to options occured.
Definition: cobalt.hpp:79
@ eDatabaseError
Error related to RPS database occured.
Definition: cobalt.hpp:81
@ eQueriesError
Error related to query sequences occured.
Definition: cobalt.hpp:80
void x_BuildFullTree(const vector< TPhyTreeNode * > &cluster_trees)
Combine alignment guide tree computed for clusters with guide trees computed for each cluster.
Definition: cobalt.cpp:1844
vector< CSequence > m_QueryData
Definition: cobalt.hpp:691
bool x_ValidateInputMSAs(void) const
Validate input alignments.
Definition: cobalt.cpp:129
bool x_ValidateQueries(void) const
Validate query sequences.
Definition: cobalt.cpp:114
CTree m_Tree
Definition: cobalt.hpp:711
void x_CreatePatternQueries(vector< const CSequence * > &queries, vector< int > &indices)
Create query set for PROSITE pattern search along with indices in multiple alignment queries array.
Definition: cobalt.cpp:1493
CHitList m_UserHits
Definition: cobalt.hpp:721
void SetInputMSAs(const objects::CSeq_align &msa1, const objects::CSeq_align &msa2, const set< int > &representatives1, const set< int > &representatives2, CRef< objects::CScope > scope)
Set input alignments.
Definition: cobalt.cpp:297
void x_AlignMSAs(void)
Align multiple sequence alignments.
Definition: cobalt.cpp:513
CClusterer m_Clusterer
Definition: cobalt.hpp:712
void x_MultiAlignClusters()
Combine pair-wise in-cluster alignements with multiple alignments of cluster prototypes.
Definition: cobalt.cpp:1188
vector< CSequence > m_InMSA1
Input alignment.
Definition: cobalt.hpp:694
vector< CSequence > m_Results
Definition: cobalt.hpp:703
CHitList m_DomainHits
Definition: cobalt.hpp:716
void x_FindConsistentHitSubset(void)
Find consistent subset of pair-wise hits that can be used as alignment constraints.
Definition: seg.cpp:175
CConstRef< CMultiAlignerOptions > m_Options
Definition: cobalt.hpp:686
void x_InitParams(void)
Initiate parameters using m_Options.
Definition: cobalt.cpp:79
void x_AlignProfileProfile(vector< CTree::STreeLeaf > &node_list1, vector< CTree::STreeLeaf > &node_list2, vector< CSequence > &alignment, CNcbiMatrix< CHitList > &pair_info, int iteration)
Align two collections of sequences.
Definition: prog.cpp:793
void SetQueries(const vector< CRef< objects::CSeq_loc > > &queries, CRef< objects::CScope > scope)
Set query sequences.
Definition: cobalt.cpp:194
void Reset(void)
Clear out the state left by the previous alignment operation.
Definition: cobalt.cpp:403
void x_AlignInClusters()
Pair-wise align each cluster sequence to cluster representative.
Definition: cobalt.cpp:1009
vector< string > m_Messages
Definition: cobalt.hpp:739
static void x_InitColumn(vector< SColumn >::iterator &it, size_t len)
Definition: cobalt.cpp:1160
virtual void x_Run(void)
Align the current set of input sequences (reset any existing alignment information).
Definition: cobalt.cpp:590
bool(* FInterruptFn)(SProgress *progress)
Prototype for function pointer to dertermine whether alignment should proceed of be interrupted.
Definition: cobalt.hpp:112
void x_FindDomainHits(blast::TSeqLocVector &queries, const vector< int > &indices)
Run RPS blast on seletced input sequences and postprocess the results.
Definition: rps.cpp:866
CMultiAligner(void)
Create mutli aligner with default options.
Definition: cobalt.cpp:51
CHitList m_PatternHits
Definition: cobalt.hpp:719
void x_BuildAlignment()
Given the current domain, local, pattern and user hits, along with the current tree,...
Definition: prog.cpp:2010
void x_Init(void)
Initiate class attributes that are not alignment parameters.
Definition: cobalt.hpp:579
void x_FindPatternHits(const vector< const CSequence * > &queries, const vector< int > &indices)
Find PROSITE pattern hits on selected input sequences.
Definition: phi.cpp:62
CRef< objects::CBioTreeContainer > GetTreeContainer(void) const
Get serializable tree used as guide in progressive alignment.
Definition: cobalt.cpp:357
CPSSMAligner m_Aligner
Definition: cobalt.hpp:710
@ eQueryClustering
Definition: cobalt.hpp:92
@ eTreeComputation
Definition: cobalt.hpp:96
vector< int > m_Msa1Repr
Indices of sequence representatives in input alignment 1.
Definition: cobalt.hpp:699
void x_AttachClusterTrees(const vector< TPhyTreeNode * > &cluster_trees, const vector< TPhyTreeNode * > &cluster_leaves)
Replace leaves in the alignment guide tree of clusters with cluster trees.
Definition: cobalt.cpp:1790
bool x_FindQueryClusters()
Find clusters of similar queries, select cluster representative sequences, and prepare input to multi...
Definition: cobalt.cpp:744
void x_CreateBlastQueries(blast::TSeqLocVector &queries, vector< int > &indices)
Create query set for RPS Blast and Blastp searches along with indices in multiple alignment queries a...
Definition: cobalt.cpp:1433
bool x_ValidateUserHits(void)
Validate user constraints with queries.
Definition: cobalt.cpp:159
void x_InitAligner(void)
Initiate PSSM aligner parameters.
Definition: cobalt.cpp:102
static const int kClusterNodeId
Definition: cobalt.hpp:747
FInterruptFn SetInterruptCallback(FInterruptFn fnptr, void *user_data=NULL)
Set a function callback to be invoked by multi aligner to allow interrupting alignment in progress.
Definition: cobalt.cpp:370
FInterruptFn m_Interrupt
Definition: cobalt.hpp:736
vector< CSequence > m_InMSA2
Input alignment.
Definition: cobalt.hpp:696
CHitList m_LocalHits
Definition: cobalt.hpp:717
void Resize(size_t i, size_t j, T val=T())
resize this matrix, filling the empty cells with a known value
Definition: matrix.hpp:390
size_t GetRows() const
get the number of rows in this matrix
Definition: matrix.hpp:298
size_t GetCols() const
get the number of columns in this matrix
Definition: matrix.hpp:305
Class for representing protein sequences.
Definition: seq.hpp:54
int GetLength() const
Get the length of the current sequence.
Definition: seq.hpp:125
unsigned char GetLetter(int pos) const
Access the sequence letter at a specified position.
Definition: seq.hpp:107
TFreqMatrix & GetFreqs()
Access the list of position frequencies associated with a sequence.
Definition: seq.hpp:89
static void CompressSequences(vector< CSequence > &seq, vector< int > index_list)
Given a collection of sequences, remove all sequence positions where a subset of the sequences all co...
Definition: seq.cpp:232
void PropagateGaps(const CNWAligner::TTranscript &transcript, CNWAligner::ETranscriptSymbol gap_choice)
Given an edit script, insert gaps into a sequence.
Definition: seq.cpp:133
static const unsigned char kGapChar
The ncbistdaa code for a gap.
Definition: seq.hpp:58
unsigned char * GetSequence()
Access the raw sequence data, in ncbistdaa format.
Definition: seq.hpp:96
static void CreateMsa(const objects::CSeq_align &seq_align, objects::CScope &scope, vector< CSequence > &msa)
Create a vector of CSequence objects that represents the alignment in given Seq_align.
Definition: seq.cpp:274
unsigned char GetPrintableLetter(int pos) const
Access the sequence letter at a specified position, and return an ASCII representation of that letter...
Definition: seq.cpp:48
definition of a Culling tree
Definition: ncbi_tree.hpp:100
A wrapper for controlling access to the phylogenetic tree generated by CDistMethods.
Definition: tree.hpp:54
void SetTree(TPhyTreeNode *tree)
Set tree.
Definition: tree.hpp:134
static void PrintTree(const TPhyTreeNode *node, int level=0)
Debug routine to recursively print out a tree.
Definition: tree.cpp:46
const TPhyTreeNode * GetTree() const
Access the current tree.
Definition: tree.hpp:124
TPhyTreeNode * ReleaseTree()
Get the current tree and release ownership.
Definition: tree.hpp:139
void ComputeTree(const CDistMethods::TMatrix &distances, bool use_fastme=false)
Compute a new tree.
Definition: tree.cpp:136
static void ComputeDistMatrix(const vector< TKmerCounts > &counts, double(*fsim)(const TKmerCounts &, const TKmerCounts &), TDistMatrix &dmat)
Compute matrix of distances between given counts vectors.
Definition: kmercounts.hpp:559
static void ComputeCounts(const vector< CRef< objects::CSeq_loc > > &seqs, objects::CScope &scope, vector< TKmerCounts > &counts)
Create k-mer counts vectors for given sequences.
Definition: kmercounts.hpp:534
static void SetParams(unsigned kmer_len, unsigned alphabet_size)
Set default counts vector parameters.
Definition: kmercounts.hpp:446
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
const_iterator begin() const
Definition: set.hpp:135
size_type size() const
Definition: set.hpp:132
bool empty() const
Definition: set.hpp:133
const_iterator find(const key_type &key) const
Definition: set.hpp:137
const_iterator end() const
Definition: set.hpp:136
static TPhyTreeNode * s_MakeTwoLeafTree(const CClusterer::CSingleCluster &ids, double distance)
Create phylogenetic tree for two sequences.
Definition: cobalt.cpp:1534
static void s_ScaleTreeEdges(TPhyTreeNode *node, double scale)
Scale all tree edges by given factor (recursive).
Definition: cobalt.cpp:1749
static void s_SetLeafIds(TPhyTreeNode *node, const CClusterer::CSingleCluster &ids)
Change ids of leaf nodes in a given tree to desired values (recursive).
Definition: cobalt.cpp:1569
static double s_FindNodeDistance(const TPhyTreeNode *node, int id, double dist_from_root)
Find distance from root for selected node (recursive).
Definition: cobalt.cpp:1713
static void s_RescaleTree(TPhyTreeNode *tree, int id, double dist)
Rescale tree so that node with given id has desired distance from root.
Definition: cobalt.cpp:1770
static void s_FindLeafDistances(TPhyTreeNode *tree, double dist_from_root, vector< double > &leaf_dists, vector< TPhyTreeNode * > &leaf_nodes, bool last_edge_only=false)
Compute length of the edge or distance from root for each leaf (recursive).
Definition: cobalt.cpp:1666
Interface for CMultiAligner.
CRef< objects::CBioTreeContainer > MakeBioTreeContainer(const TPhyTreeNode *tree)
Conversion from TPhyTreeNode to CBioTreeContainer.
void SetStartWg(TScore value)
TTranscript GetTranscript(bool reversed=true) const
Definition: nw_aligner.cpp:909
void SetEndWs(TScore value)
virtual CNWAligner::TScore Run(void)
void SetScoreMatrix(const SNCBIPackedScoreMatrix *scoremat)
SNCBIFullScoreMatrix & GetMatrix()
void SetEndWg(TScore value)
vector< ETranscriptSymbol > TTranscript
Definition: nw_aligner.hpp:199
void SetWs(TScore value)
void SetSequences(const char *seq1, size_t len1, const char *seq2, size_t len2, bool verify=true)
void SetWg(TScore value)
void SetEndSpaceFree(bool Left1, bool Right1, bool Left2, bool Right2)
Definition: nw_aligner.cpp:192
void SetStartWs(TScore value)
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define NULL
Definition: ncbistd.hpp:225
TErrCode GetErrCode(void) const
Get error code.
Definition: ncbiexpt.cpp:453
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
const string & GetMsg(void) const
Get message string.
Definition: ncbiexpt.cpp:461
virtual const char * what(void) const noexcept
Standard report (includes full backlog).
Definition: ncbiexpt.cpp:342
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
bool Empty(void) const THROWS_NONE
Check if CConstRef is empty – not pointing to any object which means having a null value.
Definition: ncbiobj.hpp:1385
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5083
TNodeList::iterator TNodeList_I
Definition: ncbi_tree.hpp:109
TTreeType * DetachNode(TTreeType *subnode)
Remove the subtree from the tree without destroying it.
Definition: ncbi_tree.hpp:720
TNodeList_CI SubNodeBegin(void) const
Return first const iterator on subnode list.
Definition: ncbi_tree.hpp:160
TNodeList::const_iterator TNodeList_CI
Definition: ncbi_tree.hpp:110
void AddNode(TTreeType *subnode)
Add new subnode.
Definition: ncbi_tree.hpp:743
bool IsLeaf() const
Report whether this is a leaf node.
Definition: ncbi_tree.hpp:296
TNodeList_CI SubNodeEnd(void) const
Return last const iterator on subnode list.
Definition: ncbi_tree.hpp:166
const TValue & GetValue(void) const
Return node's value.
Definition: ncbi_tree.hpp:184
const TTreeType * GetParent(void) const
Get node's parent.
Definition: ncbi_tree.hpp:139
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
int len
constexpr bool empty(list< Ts... >) noexcept
const struct ncbi::grid::netcache::search::fields::SIZE size
CSequnceHelper< CObject > CSequence
int strcmp(const char *str1, const char *str2)
Definition: odbc_utils.hpp:160
EIPRangeType t
Definition: ncbi_localip.c:101
#define ASSERT
macro for assert.
Definition: ncbi_std.h:107
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
The Object manager core.
CTreeNode< CPhyNodeData > TPhyTreeNode
Definition: phy_node.hpp:81
const SNCBIPackedScoreMatrix NCBISM_Pam30
Definition: sm_pam30.c:92
const SNCBIPackedScoreMatrix NCBISM_Blosum62
Definition: sm_blosum62.c:92
const SNCBIPackedScoreMatrix NCBISM_Pam250
Definition: sm_pam250.c:92
const SNCBIPackedScoreMatrix NCBISM_Blosum80
Definition: sm_blosum80.c:92
const SNCBIPackedScoreMatrix NCBISM_Pam70
Definition: sm_pam70.c:92
const SNCBIPackedScoreMatrix NCBISM_Blosum45
The standard matrices.
Definition: sm_blosum45.c:92
int offset
Definition: replacements.h:160
vector< SSeqLoc > TSeqLocVector
Vector of sequence locations.
Definition: sseqloc.hpp:129
Structure to hold the Karlin-Altschul parameters.
Definition: blast_stat.h:66
EAlignmentStage stage
Definition: cobalt.hpp:103
Structure for listing tree leaves.
Definition: tree.hpp:72
#define _ASSERT
else result
Definition: token2.c:20
static Uint4 letter(char c)
Modified on Fri Dec 08 08:23:13 2023 by modify_doxy.py rev. 669887