CMultiAligner Class Reference

Simultaneously align multiple protein sequences. More...

#include <algo/cobalt/cobalt.hpp>

class  compare_sseg_db_idx
struct  SColumn
 Column in an alignment used for combining result from multiple alignment and pair-wise in-cluster alignments. More...
struct  SGraphNode
struct  SProgress
 Structure for reporting alignment progress. More...
struct  SSegmentLoc

Public Types

enum  EStatus {
  eSuccess = 0 , eOptionsError , eQueriesError , eDatabaseError ,
  eInternalError , eInterrupt , eOutOfMemory , eLastStatus = eOutOfMemory }
 Return status. More...
enum  EAlignmentStage {
  eBegin , eQueryClustering , eDomainHitsSearch , eLocalHitsSearch ,
  ePatternHitsSearch , eTreeComputation , eProgressiveAlignment , eIterativeAlignment }
typedef int TStatus
typedef bool(* FInterruptFn) (SProgress *progress)
 Prototype for function pointer to determine whether alignment should proceed or be interrupted. More...
typedef CSparseKmerCounts TKmerCounts
typedef TKmerMethods< TKmerCounts > TKMethods
typedef pair< TRange, TRange > TRangePair
Public Member Functions

 CMultiAligner (void)
 Create multi aligner with default options. More...
 CMultiAligner (const string &rps_db)
 Create multi aligner with selected RPS data base and default options. More...
 CMultiAligner (const CConstRef< CMultiAlignerOptions > &options)
 Create multi aligner with given options. More...
void SetQueries (const vector< CRef< objects::CSeq_loc > > &queries, CRef< objects::CScope > scope)
 Set query sequences. More...
void SetQueries (const vector< CRef< objects::CBioseq > > &queries)
 Set query sequences. More...
void SetQueries (const blast::TSeqLocVector &queries)
 Set query sequences. More...
void SetInputMSAs (const objects::CSeq_align &msa1, const objects::CSeq_align &msa2, const set< int > &representatives1, const set< int > &representatives2, CRef< objects::CScope > scope)
 Set input alignments. More...
const vector< CRef< objects::CSeq_loc > > & GetQueries (void) const
 Get query sequences. More...
CRef< objects::CScope > GetScope (void)
 Get scope. More...
CConstRef< CMultiAlignerOptions > GetOptions (void) const
 Get multi aligner parameters. More...
TStatus Run (void)
 Align the current set of input sequences (reset any existing alignment information). More...
void Reset (void)
 Clear out the state left by the previous alignment operation. More...
CRef< objects::CSeq_align > GetResults (void) const
 Retrieve the current aligned results in Seq-align format. More...
CRef< objects::CSeq_align > GetResults (vector< int > &indices) const
 Retrieve a selection of the current aligned results, in Seq-align format. More...
const vector< CSequence > & GetSeqResults (void) const
 Retrieve the current aligned results in CSequence format. More...
const TPhyTreeNode * GetTree (void) const
 Get tree used as guide in progressive alignment. More...
CRef< objects::CBioTreeContainer > GetTreeContainer (void) const
 Get serializable tree used as guide in progressive alignment. More...
const CClusterer::TClusters & GetQueryClusters (void) const
 Get clusters of query sequences. More...
int GetScore (void) const
 Get alignment score. More...
FInterruptFn SetInterruptCallback (FInterruptFn fnptr, void *user_data=NULL)
 Set a function callback to be invoked by multi aligner to allow interrupting alignment in progress. More...
const vector< string > & GetMessages (void) const
 Get Error/Warning messages. More...
bool IsMessage (void) const
 Check whether there are any error/warning messages. More...
Static Public Attributes

static const int kMajorVersion = 3
 Version information. More...
static const int kMinorVersion = 0
static const int kPatchVersion = 0
static const CNWAligner::TScore kDefaultGapOpen = -11
 Default gap open penalty. More...
static const CNWAligner::TScore kDefaultGapExtend = -1
 Default gap extension penalty. More...
Protected Types

typedef struct CMultiAligner::SSegmentLoc SSegmentLoc

Protected Member Functions

bool x_ValidateQueries (void) const
 Validate query sequences. More...
bool x_ValidateInputMSAs (void) const
 Validate input alignments. More...
bool x_ValidateUserHits (void)
 Validate user constraints with queries. More...
void x_CreateBlastQueries (blast::TSeqLocVector &queries, vector< int > &indices)
 Create query set for RPS Blast and Blastp searches along with indices in multiple alignment queries array. More...
void x_CreatePatternQueries (vector< const CSequence * > &queries, vector< int > &indices)
 Create query set for PROSITE pattern search along with indices in multiple alignment queries array. More...
void x_FindDomainHits (blast::TSeqLocVector &queries, const vector< int > &indices)
 Run RPS blast on selected input sequences and postprocess the results. More...
void x_FindLocalHits (const blast::TSeqLocVector &queries, const vector< int > &indices)
 Run blast on selected input sequences and postprocess the results. More...
void x_FindLocalInClusterHits (const vector< TPhyTreeNode * > &cluster_trees)
 Run blast on sequences from each cluster subtree. More...
void x_FindPatternHits (const vector< const CSequence * > &queries, const vector< int > &indices)
 Find PROSITE pattern hits on selected input sequences. More...
void x_FindConsistentHitSubset (void)
 Find consistent subset of pair-wise hits that can be used as alignment constraints. More...
void x_ComputeTree ()
 Given the current list of domain and local hits, generate a phylogenetic tree that clusters the current input sequences. More...
void x_BuildAlignment ()
 Given the current domain, local, pattern and user hits, along with the current tree, compute a multiple alignment of the input sequences. More...
bool x_FindQueryClusters ()
 Find clusters of similar queries, select cluster representative sequences, and prepare input to multiple alignment composed of only representatives. More...
void x_AlignInClusters ()
 Pair-wise align each cluster sequence to cluster representative. More...
void x_MakeClusterResidueFrequencies ()
 Compute profile residue frequencies for clusters. More...
void x_MultiAlignClusters ()
 Combine pair-wise in-cluster alignments with multiple alignments of cluster prototypes. More...
void x_ComputeClusterTrees (vector< TPhyTreeNode * > &trees)
 Compute independent phylogenetic trees for each cluster. More...
void x_AttachClusterTrees (const vector< TPhyTreeNode * > &cluster_trees, const vector< TPhyTreeNode * > &cluster_leaves)
 Replace leaves in the alignment guide tree of clusters with cluster trees. More...
void x_BuildFullTree (const vector< TPhyTreeNode * > &cluster_trees)
 Combine alignment guide tree computed for clusters with guide trees computed for each cluster. More...
virtual void x_Run (void)
 Align the current set of input sequences (reset any existing alignment information). More...
void x_AlignMSAs (void)
 Align multiple sequence alignments. More...
void x_SetDomainHits (const blast::TSeqLocVector &queruies, const vector< int > &indices, const objects::CBlast4_archive &archive)
 Set pre-computed domain hits using BLAST archive format. More...
Protected Attributes

CConstRef< CMultiAlignerOptions > m_Options
vector< CRef< objects::CSeq_loc > > m_tQueries
CRef< objects::CScope > m_Scope
vector< CSequence > m_QueryData
vector< CSequence > m_InMSA1
 Input alignment. More...
vector< CSequence > m_InMSA2
 Input alignment. More...
vector< int > m_Msa1Repr
 Indices of sequence representatives in input alignment 1. More...
vector< int > m_Msa2Repr
 Indices of sequence representatives in input alignment 2. More...
vector< CSequence > m_Results
int m_Score
 Alignment score. More...
CPSSMAligner m_Aligner
CTree m_Tree
CClusterer m_Clusterer
CHitList m_DomainHits
CHitList m_LocalHits
CHitList m_CombinedHits
CHitList m_PatternHits
CHitList m_LocalInClusterHits
CHitList m_UserHits
vector< bool > m_IsDomainSearched
 Marks sequences with pre-computed domain hits. More...
vector< vector< Uint4 > > m_ClusterGapPositions
vector< CRef< objects::CSeq_loc > > m_AllQueries
vector< CSequence > m_AllQueryData
vector< vector< TRange > > m_RPSLocs
FInterruptFn m_Interrupt
SProgress m_ProgressMonitor
vector< string > m_Messages
CMultiAlignerOptions::EInClustAlnMethod m_ClustAlnMethod

Static Protected Attributes

static const int kClusterNodeId = 16000

Private Types

enum  EEndGapCostStrategy { fReduceLeft = 1 , fReduceRight = 2 , fReduceBoth = fReduceLeft | fReduceRight }
 Strategy for reducing end gap penalties for profile-profile alignment. More...
typedef struct CMultiAligner::SGraphNode SGraphNode
typedef struct CMultiAligner::SColumn SColumn
 Column in an alignment used for combining result from multiple alignment and pair-wise in-cluster alignments. More...

Private Member Functions

void x_InitParams (void)
 Initiate parameters using m_Options. More...
void x_InitAligner (void)
 Initiate PSSM aligner parameters. More...
void x_SetScoreMatrix (const char *matrix_name)
 Set the score matrix the aligner will use. More...
void x_Init (void)
 Initiate class attributes that are not alignment parameters. More...
void x_LoadBlockBoundaries (string blockfile, vector< SSegmentLoc > &blocklist)
 Given an RPS blast database, load a list of block offsets for each database sequence. More...
void x_FindRPSHits (blast::TSeqLocVector &queries, const vector< int > &indices, CHitList &rps_hits)
void x_RealignBlocks (CHitList &rps_hits, vector< SSegmentLoc > &blocklist, CProfileData &profile_data)
void x_AssignRPSResFreqs (CHitList &rps_hits, CProfileData &profile_data)
void x_AssignDefaultResFreqs ()
void x_AddNewSegment (vector< CRef< objects::CSeq_loc > > &loc_list, const CRef< objects::CSeq_loc > &query, TOffset from, TOffset to, vector< SSegmentLoc > &seg_list, int query_index)
 Create a new query sequence that is a subset of a previous query sequence. More...
void x_MakeFillerBlocks (const vector< int > &indices, vector< CRef< objects::CSeq_loc > > &filler_locs, vector< SSegmentLoc > &filler_segs)
 Turn all fragments of selected query sequence not already covered by a domain hit into a separate query sequence, used as input to a blast search. More...
void x_AlignFillerBlocks (const blast::TSeqLocVector &queries, const vector< int > &indices, vector< CRef< objects::CSeq_loc > > &filler_locs, vector< SSegmentLoc > &filler_segs)
 Run blastp, aligning the collection of filler fragments against the entire input dataset. More...
void x_FindAlignmentSubsets ()
SGraphNode * x_FindBestPath (vector< SGraphNode > &nodes)
 Find a maximum weight path in a directed acyclic graph. More...
void x_BuildAlignmentIterative (vector< CTree::STreeEdge > &edges, double cluster_cutoff)
 Main driver for the progressive alignment process. More...
void x_FindConservedColumns (vector< CSequence > &new_alignment, CHitList &conserved)
 Create a list of constraints that reflect conserved columns in a multiple alignment. More...
void x_AlignProgressive (const TPhyTreeNode *tree, vector< CSequence > &query_data, CNcbiMatrix< CHitList > &pair_info, int iteration, bool is_cluster)
 Main driver for progressive alignment. More...
double x_RealignSequences (const TPhyTreeNode *input_cluster, vector< CSequence > &alignment, CNcbiMatrix< CHitList > &pair_info, double score, int iteration)
 Perform a single bipartition on a multiple alignment. More...
void x_AlignProfileProfile (vector< CTree::STreeLeaf > &node_list1, vector< CTree::STreeLeaf > &node_list2, vector< CSequence > &alignment, CNcbiMatrix< CHitList > &pair_info, int iteration)
 Align two collections of sequences. More...
void x_AlignProfileProfileUsingHit (vector< CTree::STreeLeaf > &node_list1, vector< CTree::STreeLeaf > &node_list2, vector< CSequence > &alignment, CNcbiMatrix< CHitList > &pair_info, int iteration)
 Align two profiles with all sequences that belong to the same cluster. More...
void x_FindConstraints (vector< size_t > &constraint, vector< CSequence > &alignment, vector< CTree::STreeLeaf > &node_list1, vector< CTree::STreeLeaf > &node_list2, CNcbiMatrix< CHitList > &pair_info, int iteration)
 Find the set of constraints to use for a profile-profile alignment. More...
void x_FindInClusterConstraints (vector< CSequence > &alignment, vector< CTree::STreeLeaf > &node_list1, vector< CTree::STreeLeaf > &node_list2, CNcbiMatrix< CHitList > &pair_info, vector< TRangePair > &match_ranges) const
 Find constraint to use for profile to profile alignment in clusters. More...
double x_GetScoreOneCol (vector< CSequence > &align, int col)
 Calculate the entropy score of one column of a multiple alignment (see the COBALT paper for details). More...
double x_GetScore (vector< CSequence > &align)
 Compute the entropy score of a multiple alignment. More...
CRef< objects::CSeq_align > x_GetSeqalign (const vector< CSequence > &align, vector< int > &indices) const
 Produce a seqalign representing the specified alignment, using a subset of the sequences. More...
void x_AddRpsFreqsToCluster (const CClusterer::CSingleCluster &cluster, vector< CSequence > &query_data, const vector< TRange > &gaps)
unique_ptr< vector< int > > x_AlignClusterQueries (const TPhyTreeNode *node)
void x_ComputeProfileRangeAlignment (vector< CTree::STreeLeaf > &node_list1, vector< CTree::STreeLeaf > &node_list2, vector< CSequence > &alignment, vector< size_t > &constraints, const TRange &range1, const TRange &range2, int full_prof_len1, int full_prof_len2, EEndGapCostStrategy strat, CNWAligner::TTranscript &t)
 Compute profile-profile alignment for ranges of given profiles. More...

Static Private Member Functions

static void x_InitColumn (vector< SColumn >::iterator &it, size_t len)
static void x_InitInsertColumn (vector< SColumn >::iterator &it, size_t len, int num, int cluster)

Static Private Attributes

static const int kRpsScaleFactor = 100


class compare_sseg_db_idx
class ::CMultiAlignerTest

Detailed Description

Simultaneously align multiple protein sequences.

Definition at line 68 of file cobalt.hpp.

Member Typedef Documentation

◆ FInterruptFn

typedef bool(* CMultiAligner::FInterruptFn) (SProgress *progress)

Prototype for function pointer to determine whether alignment should proceed or be interrupted.

If this function returns true, all processing stops

Definition at line 112 of file cobalt.hpp.

◆ SColumn

Column in an alignment used for combining result from multiple alignment and pair-wise in-cluster alignments.

◆ SGraphNode

◆ SSegmentLoc

◆ TKmerCounts

Definition at line 114 of file cobalt.hpp.

◆ TKMethods

Definition at line 115 of file cobalt.hpp.

◆ TRangePair

Definition at line 116 of file cobalt.hpp.

◆ TStatus

Definition at line 88 of file cobalt.hpp.

Member Enumeration Documentation

◆ EAlignmentStage


Definition at line 90 of file cobalt.hpp.

◆ EEndGapCostStrategy

Strategy for reducing end gap penalties for profile-profile alignment.


Reduce penalty only for left end gaps.


Reduce penalty only for right end gaps.


Reduce penalty for both end gaps.

Definition at line 553 of file cobalt.hpp.

◆ EStatus

Return status.


Alignment successfully completed.


Error related to options occurred.


Error related to query sequences occurred.


Error related to RPS database occurred.


Unexpected error occurred.


Alignment interrupted through callback function.


Out of memory error.


Definition at line 77 of file cobalt.hpp.

Constructor & Destructor Documentation

◆ CMultiAligner() [1/3]

CMultiAligner::CMultiAligner ( void  )

Create multi aligner with default options.

Definition at line 51 of file cobalt.cpp.

◆ CMultiAligner() [2/3]

CMultiAligner::CMultiAligner ( const string &  rps_db)

Create multi aligner with selected RPS data base and default options.

rps_db: RPS data base path [in]

Definition at line 61 of file cobalt.cpp.

References x_Init(), x_InitAligner(), and x_InitParams().

◆ CMultiAligner() [3/3]

CMultiAligner::CMultiAligner ( const CConstRef< CMultiAlignerOptions > &  options)

Create multi aligner with given options.

options: Parameters [in]

Definition at line 70 of file cobalt.cpp.

References x_Init(), x_InitAligner(), and x_InitParams().

Member Function Documentation

◆ GetMessages()

const vector<string>& CMultiAligner::GetMessages ( void  ) const

Get Error/Warning messages.

Errors are reported by exceptions, hence the messages will be mostly warnings.


Definition at line 284 of file cobalt.hpp.

Referenced by CMultiApplication::Run().

◆ GetOptions()

CConstRef<CMultiAlignerOptions> CMultiAligner::GetOptions ( void  ) const

Get multi aligner parameters.


Definition at line 196 of file cobalt.hpp.

Referenced by s_TestResults().

◆ GetQueries()

const vector< CRef<objects::CSeq_loc> >& CMultiAligner::GetQueries ( void  ) const

Get query sequences.

List of seq-ids and locations [in]

Definition at line 182 of file cobalt.hpp.

Referenced by CMultiAlignerTest::CompareDomainHits(), s_TestQueriesAsBioseqs(), s_TestQueriesAsSeq_locs(), and s_TestResults().

◆ GetQueryClusters()

const CClusterer::TClusters& CMultiAligner::GetQueryClusters ( void  ) const

Get clusters of query sequences.

Query clusters

Definition at line 255 of file cobalt.hpp.

Referenced by s_TestResults().

◆ GetResults() [1/2]

CRef< CSeq_align > CMultiAligner::GetResults ( vector< int > &  indices) const

Retrieve a selection of the current aligned results, in Seq-align format.

The Seq-align is of global type, with a single denseg. Columns that have gaps in all the selected sequences are removed

indices: List of ordinal IDs of sequences that the Seq-align will contain. Indices may appear in any order and may be repeated
The results

Definition at line 177 of file seqalign.cpp.

References CSequence::CompressSequences(), i, m_Results, NCBI_THROW, and x_GetSeqalign().

◆ GetResults() [2/2]

CRef< CSeq_align > CMultiAligner::GetResults ( void  ) const

Retrieve the current aligned results in Seq-align format.

The Seq-align is of global type, with a single denseg

The results

Definition at line 157 of file seqalign.cpp.

References i, m_Results, NCBI_THROW, and x_GetSeqalign().

Referenced by CMultiApplication::Run(), and s_TestResults().

◆ GetScope()

CRef<objects::CScope> CMultiAligner::GetScope ( void  )

Get scope.


Definition at line 188 of file cobalt.hpp.

References m_Scope.

Referenced by CMultiApplication::Run(), and s_TestResults().

◆ GetScore()

int CMultiAligner::GetScore ( void  ) const

Get alignment score.

Alignment score

Definition at line 261 of file cobalt.hpp.

◆ GetSeqResults()

const vector<CSequence>& CMultiAligner::GetSeqResults ( void  ) const

Retrieve the current aligned results in CSequence format.

Included for backward compatibility with previous API.

The results, on CSequence for each input sequence

Definition at line 240 of file cobalt.hpp.

Referenced by CMultiApplication::Run(), and s_TestResults().

◆ GetTree()

const TPhyTreeNode* CMultiAligner::GetTree ( void  ) const

Get tree used as guide in progressive alignment.


Definition at line 245 of file cobalt.hpp.

Referenced by s_TestResults(), x_BuildAlignment(), x_BuildAlignmentIterative(), and x_RealignSequences().

◆ GetTreeContainer()

CRef< objects::CBioTreeContainer > CMultiAligner::GetTreeContainer ( void  ) const

Get serializable tree used as guide in progressive alignment.


Definition at line 357 of file cobalt.cpp.

References CTree::GetTree(), m_Tree, MakeBioTreeContainer(), and NCBI_THROW.

Referenced by s_TestResults().

◆ IsMessage()

bool CMultiAligner::IsMessage ( void  ) const

Check whether there are any error/warning messages.

True if there are messages, false otherwise

Definition at line 289 of file cobalt.hpp.

◆ Reset()

void CMultiAligner::Reset ( void  )

Clear out the state left by the previous alignment operation.

Definition at line 403 of file cobalt.cpp.

References m_CombinedHits, m_DomainHits, m_LocalHits, m_PatternHits, m_Results, and CHitList::PurgeAllHits().

Referenced by SetInputMSAs(), and SetQueries().

◆ Run()

CMultiAligner::TStatus CMultiAligner::Run ( void  )

Align the current set of input sequences (reset any existing alignment information).

This function handles the generation of all internal state in the correct order. It is sufficient for 'black box' applications that only want a final answer without tweaking internal state. x_Run() is called for computing alignment.

Computation status: success (0), warnings (1), error (>1)

Definition at line 683 of file cobalt.cpp.

References eDatabaseError, eInternalError, eInterrupt, CMultiAlignerException::eInterrupt, CMultiAlignerException::eInvalidInput, CMultiAlignerException::eInvalidOptions, CMultiAlignerException::eInvalidScoreMatrix, eOptionsError, eOutOfMemory, CMultiAlignerException::eOutOfMemory, eQueriesError, eSuccess, CException::GetErrCode(), CException::GetMsg(), m_Messages, CException::what(), and x_Run().

Referenced by BOOST_AUTO_TEST_CASE(), and CMultiApplication::Run().

◆ SetInputMSAs()

void CMultiAligner::SetInputMSAs ( const objects::CSeq_align &  msa1,
const objects::CSeq_align &  msa2,
const set< int > &  representatives1,
const set< int > &  representatives2,
CRef< objects::CScope >  scope 

Set input alignments.

msa1: The first input alignment [in]
msa2: The second input alignment [in]
representatives1: List of sequence indices in msa1 to be used for computing constraints [in]
representatives2: List of sequence indices in msa2 to be used for computing constraints [in]
scope: Scope [in]

Definition at line 297 of file cobalt.cpp.

References set< Key, Compare >::begin(), copy(), CSequence::CreateMsa(), set< Key, Compare >::empty(), set< Key, Compare >::end(), i, int, ITERATE, m_InMSA1, m_InMSA2, m_Msa1Repr, m_Msa2Repr, m_Scope, m_tQueries, Reset(), set< Key, Compare >::size(), x_ValidateInputMSAs(), and x_ValidateUserHits().

Referenced by CMultiApplication::Run().

◆ SetInterruptCallback()

CMultiAligner::FInterruptFn CMultiAligner::SetInterruptCallback ( CMultiAligner::FInterruptFn  fnptr,
void *  user_data = NULL 

Set a function callback to be invoked by multi aligner to allow interrupting alignment in progress.

fnptr: Pointer to callback function [in]
user_data: User data to be attached to progress structure [in]
Previously set callback function

Definition at line 370 of file cobalt.cpp.

References m_Interrupt, m_ProgressMonitor, and CMultiAligner::SProgress::user_data.

◆ SetQueries() [1/3]

void CMultiAligner::SetQueries ( const blast::TSeqLocVector &  queries)

Set query sequences.

This automatically clears out the intermediate state of the last alignment.

queries: List of query sequences [in]

Definition at line 263 of file cobalt.cpp.

References i, ITERATE, m_QueryData, m_Scope, m_tQueries, NCBI_THROW, Reset(), x_ValidateQueries(), and x_ValidateUserHits().

◆ SetQueries() [2/3]

void CMultiAligner::SetQueries ( const vector< CRef< objects::CBioseq > > &  queries)

Set query sequences.

This automatically clears out the intermediate state of the last alignment.

queries: List of query sequences [in]

Definition at line 219 of file cobalt.cpp.

References CException::GetMsg(), ITERATE, m_QueryData, m_Scope, m_tQueries, NCBI_THROW, Reset(), CRef< C, Locker >::Reset(), x_ValidateQueries(), and x_ValidateUserHits().

◆ SetQueries() [3/3]

void CMultiAligner::SetQueries ( const vector< CRef< objects::CSeq_loc > > &  queries,
CRef< objects::CScope >  scope 

Set query sequences.

This automatically clears out the intermediate state of the last alignment.

queries: List of query sequences or seq-ids [in]
scope: Scope object [in]

Definition at line 194 of file cobalt.cpp.

References copy(), ITERATE, m_QueryData, m_Scope, m_tQueries, NCBI_THROW, Reset(), x_ValidateQueries(), and x_ValidateUserHits().

Referenced by BOOST_AUTO_TEST_CASE(), CMultiApplication::Run(), s_TestQueriesAsBioseqs(), and s_TestQueriesAsSeq_locs().

◆ x_AddNewSegment()

void CMultiAligner::x_AddNewSegment ( vector< CRef< objects::CSeq_loc > > &  loc_list,
const CRef< objects::CSeq_loc > &  query,
TOffset  from,
TOffset  to,
vector< SSegmentLoc > &  seg_list,
int  query_index 

Create a new query sequence that is a subset of a previous query sequence.

loc_list: List of previously generated sequence fragments [in/out]
query: Sequence that contains the current fragment [in]
from: Start offset of fragment [in]
to: End offset of fragment [in]
seg_list: List of simplified representations of previous fragments [in/out]
query_index: Ordinal ID of 'query'

Definition at line 68 of file blast.cpp.

References eNa_strand_unknown, GetId(), m_Scope, query, and CSeq_loc::SetInt().

Referenced by x_MakeFillerBlocks().

◆ x_AddRpsFreqsToCluster()

void CMultiAligner::x_AddRpsFreqsToCluster ( const CClusterer::CSingleCluster &  cluster,
vector< CSequence > &  query_data,
const vector< TRange > &  gaps 

◆ x_AlignClusterQueries()

unique_ptr< vector< int > > CMultiAligner::x_AlignClusterQueries ( const TPhyTreeNode *  node)

◆ x_AlignFillerBlocks()

void CMultiAligner::x_AlignFillerBlocks ( const blast::TSeqLocVector &  queries,
const vector< int > &  indices,
vector< CRef< objects::CSeq_loc > > &  filler_locs,
vector< SSegmentLoc > &  filler_segs 

Run blastp, aligning the collection of filler fragments against the entire input dataset.

queries: List of queries selected for blastp alignment [in]
indices: List of indices of each selected query in the queries array [in]
filler_locs: List of generated sequences [in]
filler_segs: Simplified representation of filler_locs [in]

Definition at line 169 of file blast.cpp.

References CHitList::AddToHitList(), CSeq_align_Base::C_Segs::e_Dendiag, CSeq_align_Base::C_Segs::e_Denseg, eInterrupt, Get(), CMultiAlignerOptions::GetBlastpEvalue(), CSeq_align_Base::C_Segs::GetDendiag(), CSeq_align_Base::C_Segs::GetDenseg(), CSeq_interval_Base::GetFrom(), CScore_Base::GetId(), CScore_Base::C_Value::GetInt(), CSeq_loc_Base::GetInt(), CScore_Base::C_Value::GetReal(), CSeq_align_Base::GetScore(), CDense_diag_Base::GetScores(), CSeq_align_Base::GetSegs(), CObject_id_Base::GetStr(), CSeq_interval_Base::GetTo(), CScore_Base::GetValue(), i, int, ITERATE, m_Interrupt, m_LocalHits, m_Options, m_ProgressMonitor, m_Scope, max(), NCBI_THROW, CBl2Seq::Run(), and CSeq_align_Base::C_Segs::Which().

Referenced by x_FindLocalHits().

◆ x_AlignInClusters()

void CMultiAligner::x_AlignInClusters ( void  )

◆ x_AlignMSAs()

void CMultiAligner::x_AlignMSAs ( void  )

◆ x_AlignProfileProfile()

void CMultiAligner::x_AlignProfileProfile ( vector< CTree::STreeLeaf > &  node_list1,
vector< CTree::STreeLeaf > &  node_list2,
vector< CSequence > &  alignment,
CNcbiMatrix< CHitList > &  pair_info,
int  iteration 

Align two collections of sequences.

All sequences within a single collection begin with the same size

node_list1: List of sequence number in first collection [in]
node_list2: List of sequence number in second collection [in]
alignment: Complete list of aligned sequences (may contain sequences that will not be aligned). On output, new gaps will be propagated to all affected sequences [in][out]
pair_info: Constraints that may be used in the alignment [in]
iteration: The iteration number [in]

Definition at line 793 of file prog.cpp.

References CNWAligner::eTS_Delete, CNWAligner::eTS_Insert, CMultiAlignerOptions::GetEndGapExtendPenalty(), CMultiAlignerOptions::GetEndGapOpenPenalty(), CMultiAlignerOptions::GetGapExtendPenalty(), CMultiAlignerOptions::GetGapOpenPenalty(), CNWAligner::GetTranscript(), CMultiAlignerOptions::GetVerbose(), i, int, kAlphabetSize, kRpsScaleFactor, kScale, m_Aligner, m_Options, query, CPSSMAligner::Run(), CNWAligner::SetEndSpaceFree(), CPSSMAligner::SetEndWg(), CPSSMAligner::SetEndWs(), CNWAligner::SetPattern(), CPSSMAligner::SetSequences(), CPSSMAligner::SetStartWg(), CPSSMAligner::SetStartWs(), CPSSMAligner::SetWg(), CPSSMAligner::SetWs(), t, x_FillResidueFrequencies(), x_FindConstraints(), and x_NormalizeResidueFrequencies().

Referenced by x_AlignMSAs(), x_AlignProfileProfileUsingHit(), x_AlignProgressive(), and x_RealignSequences().

◆ x_AlignProfileProfileUsingHit()

void CMultiAligner::x_AlignProfileProfileUsingHit ( vector< CTree::STreeLeaf > &  node_list1,
vector< CTree::STreeLeaf > &  node_list2,
vector< CSequence > &  alignment,
CNcbiMatrix< CHitList > &  pair_info,
int  iteration 

Align two profiles with all sequences that belong to the same cluster.

A pair-wise constraint alignment between the most similar sequences is used for aligning the profiles. Sequence positions that match in the pair-wise alignment will also match in the profile alignment. Ranges between the matching positions are aligned with CPSSMAligner.

Definition at line 1056 of file prog.cpp.

References _ASSERT, COpenRange< Position >::Empty(), CNWAligner::eTS_Delete, CNWAligner::eTS_Insert, CNWAligner::eTS_Match, fReduceBoth, fReduceLeft, fReduceRight, CMultiAlignerOptions::GetFastAlign(), GetLength(), COpenRange< Position >::GetLength(), CMultiAlignerOptions::GetVerbose(), i, int, NStr::IntToString(), ITERATE, m_Messages, m_Options, COpenRange< Position >::NotEmpty(), query, t, x_AlignProfileProfile(), x_ComputeProfileRangeAlignment(), and x_FindInClusterConstraints().

Referenced by x_AlignProgressive().

◆ x_AlignProgressive()

void CMultiAligner::x_AlignProgressive ( const TPhyTreeNode *  tree,
vector< CSequence > &  query_data,
CNcbiMatrix< CHitList > &  pair_info,
int  iteration,
bool  is_cluster 

Main driver for progressive alignment.

tree: Alignment guide tree [in]
query_data: The sequences to align. The first call assumes this list contains the unaligned sequences, and intermediate alignment stages will update the list as alignment progresses. On output query_data contains the aligned version of all sequences [in][out]
pair_info: List of alignment constraints [in]
iteration: The iteration number [in]
is_cluster: Is the currently traversed node inside a cluster subtree [in]

Definition at line 1534 of file prog.cpp.

References eInterrupt, CHitList::Empty(), CClusterer::GetClusters(), CTreeNode< TValue, TKeyGetterP >::GetValue(), CTreeNode< TValue, TKeyGetterP >::IsLeaf(), kClusterNodeId, CTree::ListTreeLeaves(), m_Clusterer, m_DomainHits, m_Interrupt, m_ProgressMonitor, NCBI_THROW, x_AddRpsFreqsToCluster(), x_AlignProfileProfile(), x_AlignProfileProfileUsingHit(), and x_GetClusterGapLocations().

Referenced by x_BuildAlignmentIterative().

◆ x_AssignDefaultResFreqs()

void CMultiAligner::x_AssignDefaultResFreqs ( )

◆ x_AssignRPSResFreqs()

void CMultiAligner::x_AssignRPSResFreqs ( CHitList &  rps_hits,
CProfileData &  profile_data 

◆ x_AttachClusterTrees()

void CMultiAligner::x_AttachClusterTrees ( const vector< TPhyTreeNode * > &  cluster_trees,
const vector< TPhyTreeNode * > &  cluster_leaves 

Replace leaves in the alignment guide tree of clusters with cluster trees.

cluster_trees: List of phylogenetic trees computed for each cluster of input sequences [in]
cluster_leaves: List of pointers to leaves of the alignment guide tree computed for clusters [in]

Definition at line 1790 of file cobalt.cpp.

References _ASSERT, CTreeNode< TValue, TKeyGetterP >::AddNode(), CTreeNode< TValue, TKeyGetterP >::DetachNode(), CClusterer::GetClusters(), CTreeNode< TValue, TKeyGetterP >::GetValue(), NStr::IntToString(), CTreeNode< TValue, TKeyGetterP >::IsLeaf(), ITERATE, kClusterNodeId, m_Clusterer, NULL, CTreeNode< TValue, TKeyGetterP >::SubNodeBegin(), and CTreeNode< TValue, TKeyGetterP >::SubNodeEnd().

Referenced by x_BuildFullTree().

◆ x_BuildAlignment()

void CMultiAligner::x_BuildAlignment ( )

Given the current domain, local, pattern and user hits, along with the current tree, compute a multiple alignment of the input sequences.

Intended for applications that want fine-grained control of the alignment process

Definition at line 2010 of file prog.cpp.

References _ASSERT, eProgressiveAlignment, GetTree(), CMultiAlignerOptions::GetVerbose(), i, INT4_MAX, kClusterNodeId, CTree::ListTreeEdges(), m_Options, m_ProgressMonitor, m_QueryData, min(), ct::sort(), CMultiAligner::SProgress::stage, and x_BuildAlignmentIterative().

Referenced by x_Run().

◆ x_BuildAlignmentIterative()

void CMultiAligner::x_BuildAlignmentIterative ( vector< CTree::STreeEdge > &  edges,
double  cluster_cutoff 

◆ x_BuildFullTree()

void CMultiAligner::x_BuildFullTree ( const vector< TPhyTreeNode * > &  cluster_trees)

Combine alignment guide tree computed for clusters with guide trees computed for each cluster.

Leaves of the guide tree computed for cluster prototypes are replaced with cluster trees. Cluster trees are rescaled so that distance from root to cluster prototype in cluster tree is the same as length of the leaf edge in the alignment guide tree (for cluster prototypes).

cluster_treesList of cluster trees [in]

Definition at line 1844 of file cobalt.cpp.

References _ASSERT, CClusterer::GetClusters(), CClusterer::CSingleCluster::GetPrototype(), CTree::GetTree(), CMultiAlignerOptions::GetVerbose(), i, m_Clusterer, m_Options, m_QueryData, m_Tree, NULL, CTree::PrintTree(), s_FindLeafDistances(), s_RescaleTree(), and x_AttachClusterTrees().

Referenced by x_Run().

◆ x_ComputeClusterTrees()

void CMultiAligner::x_ComputeClusterTrees ( vector< TPhyTreeNode * > &  trees)

◆ x_ComputeProfileRangeAlignment()

void CMultiAligner::x_ComputeProfileRangeAlignment ( vector< CTree::STreeLeaf > &  node_list1,
vector< CTree::STreeLeaf > &  node_list2,
vector< CSequence > &  alignment,
vector< size_t > &  constraints,
const TRange range1,
const TRange range2,
int  full_prof_len1,
int  full_prof_len2,
CMultiAligner::EEndGapCostStrategy  strat,
CNWAligner::TTranscript t 

Compute profile-profile alignment for ranges of given profiles.

Resembles x_AlignProfileProfile, but works on sequence ranges, does not allow end-space-free alignment for large differences in sequence length, returns a transcript, and does not update the vector of input sequences.

node_list1List of sequence numbers in first collection [in]
node_list2List of sequence numbers in second collection [in]
alignmentList of sequences [in]
constraintsConstraints for alignment [in]
range1Range for alignment of the first profile [in]
range2Range for alignment of the second profile [in]
tAlignment transcript [out]

Definition at line 964 of file prog.cpp.

References fReduceLeft, fReduceRight, CMultiAlignerOptions::GetEndGapExtendPenalty(), CMultiAlignerOptions::GetEndGapOpenPenalty(), CRange_Base::GetFrom(), CMultiAlignerOptions::GetGapExtendPenalty(), CMultiAlignerOptions::GetGapOpenPenalty(), CRange_Base::GetTo(), CNWAligner::GetTranscript(), i, kAlphabetSize, kRpsScaleFactor, kScale, m_Aligner, m_Options, CPSSMAligner::Run(), CNWAligner::SetEndSpaceFree(), CPSSMAligner::SetEndWg(), CPSSMAligner::SetEndWs(), CNWAligner::SetPattern(), CPSSMAligner::SetSequences(), CPSSMAligner::SetStartWg(), CPSSMAligner::SetStartWs(), CPSSMAligner::SetWg(), CPSSMAligner::SetWs(), t, x_FillResidueFrequencies(), and x_NormalizeResidueFrequencies().

Referenced by x_AlignProfileProfileUsingHit().

◆ x_ComputeTree()

void CMultiAligner::x_ComputeTree ( void  )

◆ x_CreateBlastQueries()

void CMultiAligner::x_CreateBlastQueries ( blast::TSeqLocVector queries,
vector< int > &  indices 

Create query set for RPS Blast and Blastp searches along with indices in multiple alignment queries array.

Searches for conserved regions can be performed for a subset of multiple alignment queries (typically to reduce computation time).

queriesQueries for RPS Blast and Blastp searches [out]
indicesIndexes of each query in m_tQueries [out]

Definition at line 1433 of file cobalt.cpp.

References _ASSERT, CMultiAlignerOptions::eMulti, CMultiAlignerOptions::eNone, CMultiAlignerOptions::eToPrototype, CClusterer::GetClusters(), i, int, ITERATE, m_ClustAlnMethod, m_Clusterer, m_InMSA1, m_InMSA2, m_Msa1Repr, m_Msa2Repr, m_Scope, m_tQueries, and NCBI_THROW.

Referenced by CMultiAlignerTest::SetDomainHits(), x_AlignMSAs(), and x_Run().

◆ x_CreatePatternQueries()

void CMultiAligner::x_CreatePatternQueries ( vector< const CSequence * > &  queries,
vector< int > &  indices 

Create query set for PROSITE pattern search along with indices in multiple alignment queries array.

Searches for conserved regions can be performed for a subset of multiple alignment queries (typically to reduce computation time).

queriesQueries for PROSITE patterns searches [out]
indicesIndexes of each query in m_tQueries [out]

Definition at line 1493 of file cobalt.cpp.

References CMultiAlignerOptions::eMulti, CMultiAlignerOptions::eNone, CMultiAlignerOptions::eToPrototype, CClusterer::GetClusters(), i, int, m_ClustAlnMethod, m_Clusterer, m_QueryData, and NCBI_THROW.

Referenced by x_AlignMSAs(), and x_Run().

◆ x_FindAlignmentSubsets()

void CMultiAligner::x_FindAlignmentSubsets ( )

◆ x_FindBestPath()

CMultiAligner::SGraphNode * CMultiAligner::x_FindBestPath ( vector< SGraphNode > &  nodes)

Find a maximum weight path in a directed acyclic graph.

nodesThe graph [in/modified]
Pointer to the first node in the optimal path

Definition at line 52 of file seg.cpp.

References i, INT4_MIN, CHit::m_SeqRange1, CHit::m_SeqRange2, NULL, and CLocalRange< Position >::StrictlyBelow().

Referenced by x_FindAlignmentSubsets(), and x_FindConstraints().

◆ x_FindConservedColumns()

void CMultiAligner::x_FindConservedColumns ( vector< CSequence > &  new_alignment,
CHitList conserved 

Create a list of constraints that reflect conserved columns in a multiple alignment.

new_alignmentThe multiple alignment to analyze [in]
conservedThe list of pairwise constraints [out]

Definition at line 1677 of file prog.cpp.

References CHitList::AddToHitList(), Empty(), CMultiAlignerOptions::GetConservedCutoffScore(), CRange_Base::GetFrom(), GetLength(), COpenRange< Position >::GetLength(), CRange_Base::GetTo(), CMultiAlignerOptions::GetVerbose(), i, int, CSequence::kGapChar, m_Options, and x_GetScoreOneCol().

Referenced by x_BuildAlignmentIterative().

◆ x_FindConsistentHitSubset()

void CMultiAligner::x_FindConsistentHitSubset ( void  )

◆ x_FindConstraints()

void CMultiAligner::x_FindConstraints ( vector< size_t > &  constraint,
vector< CSequence > &  alignment,
vector< CTree::STreeLeaf > &  node_list1,
vector< CTree::STreeLeaf > &  node_list2,
CNcbiMatrix< CHitList > &  pair_info,
int  iteration 

Find the set of constraints to use for a profile-profile alignment.

This routine considers only constraints between one collection of equal-size sequences and another collection of equal-size sequences

constraintList of computed constraints, in the format expected by CPSSMAligner [out]
alignmentCurrent multiple alignment of sequences [in]
node_list1List of sequences in first collection [in]
node_list2List of sequences in second collection [in]
pair_infoList of pairwise constraints (between sequences) [in]
iterationThe iteration number [in]

Definition at line 382 of file prog.cpp.

References _ASSERT, abs, CHitList::AddToHitList(), eInterrupt, CHitList::Empty(), CRange_Base::GetFrom(), CHitList::GetHit(), COpenRange< Position >::GetLength(), CHit::GetSubHit(), CRange_Base::GetTo(), CMultiAlignerOptions::GetVerbose(), CHit::HasSubHits(), CMultiAligner::SGraphNode::hit, i, CHit::InsertSubHit(), int, m_Interrupt, m_Options, m_ProgressMonitor, CHit::m_Score, CHit::m_SeqIndex1, CHit::m_SeqIndex2, CHit::m_SeqRange1, CHit::m_SeqRange2, NCBI_THROW, NON_CONST_ITERATE, CMultiAligner::SGraphNode::path_next, CHitList::Size(), CHitList::SortByStartOffset(), CLocalRange< Position >::StrictlyBelow(), x_ExpandRange(), x_FindBestPath(), and x_HitToConstraints().

Referenced by x_AlignProfileProfile().

◆ x_FindDomainHits()

void CMultiAligner::x_FindDomainHits ( blast::TSeqLocVector queries,
const vector< int > &  indices 

◆ x_FindInClusterConstraints()

void CMultiAligner::x_FindInClusterConstraints ( vector< CSequence > &  alignment,
vector< CTree::STreeLeaf > &  node_list1,
vector< CTree::STreeLeaf > &  node_list2,
CNcbiMatrix< CHitList > &  pair_info,
vector< TRangePair > &  match_ranges 
) const

Find constraint to use for profile to profile alignment in clusters.

Finds in-cluster constraint (blastp hit) between two most similar sequences from each profile.

alignmentCurrent multiple alignment of sequences [in]
node_list1List of sequences in first collection [in]
node_list2List of sequences in second collection [in]
pair_infoList of pairwise constraints (between sequences) [in]
match_rangesA list of pairs of ungapped blastp alignment ranges on the profiles [out]

Definition at line 700 of file prog.cpp.

References _ASSERT, first(), CClusterer::GetDistMatrix(), CHit::GetEditScript(), CRange_Base::GetFrom(), CMultiAlignerOptions::GetVerbose(), i, CEditScript::ListMatchRegions(), m_Clusterer, m_Options, CHit::m_Score, CHit::m_SeqIndex1, CHit::m_SeqIndex2, CHit::m_SeqRange1, CHit::m_SeqRange2, and x_GetProfileMatchRanges().

Referenced by x_AlignProfileProfileUsingHit().

◆ x_FindLocalHits()

void CMultiAligner::x_FindLocalHits ( const blast::TSeqLocVector queries,
const vector< int > &  indices 

Run blast on selected input sequences and postprocess the results.

Intended for applications that want fine-grained control of the alignment process

queriesQueries for Blast alignment [in]
indicesIndex of each Blast query in the array of input sequences [in]

Definition at line 306 of file blast.cpp.

References CHitList::Append(), eLocalHitsSearch, CHitList::Empty(), CRange_Base::GetFrom(), CHitList::GetHit(), CRange_Base::GetTo(), CMultiAlignerOptions::GetVerbose(), i, m_CombinedHits, m_DomainHits, m_LocalHits, m_Options, m_ProgressMonitor, CHit::m_Score, CHit::m_SeqIndex1, CHit::m_SeqIndex2, CHit::m_SeqRange1, CHit::m_SeqRange2, CHitList::PurgeAllHits(), CHitList::Size(), CMultiAligner::SProgress::stage, x_AlignFillerBlocks(), x_AssignDefaultResFreqs(), and x_MakeFillerBlocks().

Referenced by x_AlignMSAs(), and x_Run().

◆ x_FindLocalInClusterHits()

void CMultiAligner::x_FindLocalInClusterHits ( const vector< TPhyTreeNode * > &  cluster_trees)

Run blast on sequences from each cluster subtree.

Each tree is traversed. Blast is run for pairs of most similar sequences such that each belongs to a different subtree. Intended for use with query clustering for faster in-cluster pair-wise alignments.

cluster_treesList of query cluster trees [in]

Definition at line 481 of file blast.cpp.

References CRange_Base::GetFrom(), CHitList::GetHit(), CRange_Base::GetTo(), CMultiAlignerOptions::GetVerbose(), i, ITERATE, m_LocalInClusterHits, m_Options, CHit::m_Score, CHit::m_SeqIndex1, CHit::m_SeqIndex2, CHit::m_SeqRange1, CHit::m_SeqRange2, CHitList::PurgeAllHits(), CHitList::Size(), and x_AlignClusterQueries().

Referenced by x_Run().

◆ x_FindPatternHits()

void CMultiAligner::x_FindPatternHits ( const vector< const CSequence * > &  queries,
const vector< int > &  indices 

Find PROSITE pattern hits on selected input sequences.

Intended for applications that want fine-grained control of the alignment process

queriesQueries for PROSITE pattern search [in]
indicesIndex of each PROSITE pattern search query in the array of input sequences [in]

Definition at line 62 of file phi.cpp.

References _ASSERT, CHitList::AddToHitList(), BLASTAA_SEQ_CODE, BlastScoreBlkFree(), BlastScoreBlkNew(), eInterrupt, ePatternHitsSearch, FALSE, FindPatternHits(), CMultiAlignerOptions::GetCddPatterns(), CRange_Base::GetFrom(), CHitList::GetHit(), GetLength(), GetSequence(), CRange_Base::GetTo(), CMultiAlignerOptions::GetVerbose(), i, int, m_Interrupt, m_Options, m_PatternHits, m_ProgressMonitor, CHit::m_Score, CHit::m_SeqIndex1, CHit::m_SeqIndex2, CHit::m_SeqRange1, CHit::m_SeqRange2, NCBI_THROW, NULL, patterns, PHI_MAX_HIT, CHitList::PurgeAllHits(), CHitList::Size(), SPHIPatternSearchBlkFree(), SPHIPatternSearchBlkNew(), and CMultiAligner::SProgress::stage.

Referenced by x_AlignMSAs(), and x_Run().

◆ x_FindQueryClusters()

bool CMultiAligner::x_FindQueryClusters ( )

Find clusters of similar queries, select cluster representative sequences, and prepare input to multiple alignment composed of only representatives.

True if at least one cluster was found, false otherwise

Definition at line 744 of file cobalt.cpp.

References CLinks::AddLink(), TKmerMethods< TKmerCounts >::ComputeCounts(), TKmerMethods< TKmerCounts >::ComputeDistMatrix(), CClusterer::eClique, CMultiAlignerOptions::eClusters, set< Key, Compare >::empty(), set< Key, Compare >::end(), CMultiAlignerOptions::eNone, eQueryClustering, CMultiAlignerOptions::eToPrototype, set< Key, Compare >::find(), CMultiAlignerOptions::GetCentralSeq(), CClusterer::GetClusters(), CClusterer::GetDistMatrix(), CHitList::GetHit(), CMultiAlignerOptions::GetKmerAlphabet(), CMultiAlignerOptions::GetKmerDistMeasure(), CMultiAlignerOptions::GetKmerLength(), GetLength(), CMultiAlignerOptions::GetMaxInClusterDist(), CMultiAlignerOptions::GetTreeMethod(), CMultiAlignerOptions::GetVerbose(), i, set< Key, Compare >::insert(), int, ITERATE, m_AllQueries, m_AllQueryData, m_ClustAlnMethod, m_Clusterer, m_Options, m_ProgressMonitor, m_QueryData, m_Scope, CHit::m_SeqIndex1, CHit::m_SeqIndex2, m_tQueries, m_UserHits, NON_CONST_ITERATE, prot, CClusterer::PurgeDistMatrix(), CClusterer::Reset(), CRef< C, Locker >::Reset(), CClusterer::Run(), CClusterer::SetClusters(), CClusterer::SetClustMethod(), CClusterer::SetDistMatrix(), CClusterer::SetLinks(), CClusterer::SetMakeTrees(), TKmerMethods< TKmerCounts >::SetParams(), CHitList::Size(), CClusterer::CSingleCluster::size(), CLinks::Sort(), and CMultiAligner::SProgress::stage.

Referenced by x_Run().

◆ x_FindRPSHits()

void CMultiAligner::x_FindRPSHits ( blast::TSeqLocVector queries,
const vector< int > &  indices,
CHitList rps_hits 

◆ x_GetScore()

double CMultiAligner::x_GetScore ( vector< CSequence > &  align)

Compute the entropy score of a multiple alignment.

aligncomplete multiple alignment [in]
the alignment score

Definition at line 1392 of file prog.cpp.

References H, i, and x_GetScoreOneCol().

Referenced by x_BuildAlignmentIterative(), and x_RealignSequences().

◆ x_GetScoreOneCol()

double CMultiAligner::x_GetScoreOneCol ( vector< CSequence > &  align,
int  col 

Calculate the entropy score of one column of a multiple alignment (see the COBALT paper for details).

alignThe alignment [in]
colthe column number to score
the computed score

Definition at line 1351 of file prog.cpp.

References count, CMultiAlignerOptions::GetPseudocount(), kDerivedFreqs, kNumClasses, kRes2Class, log, m_Options, and r().

Referenced by x_FindConservedColumns(), and x_GetScore().

◆ x_GetSeqalign()

CRef< CSeq_align > CMultiAligner::x_GetSeqalign ( const vector< CSequence > &  align,
vector< int > &  indices 
) const

Produce a seqalign representing the specified alignment, using a subset of the sequences.

alignThe alignment to convert
indices0-based list of sequence numbers that will participate in the resulting Seq-align. indices may appear out of order and may be repeated
The generated seqalign

Definition at line 58 of file seqalign.cpp.

References CSeq_align_Base::eType_global, CSeq_interval_Base::GetFrom(), CSeq_interval_Base::GetId(), CSeq_loc_Base::GetInt(), CSeq_loc_Base::GetWhole(), i, CSeq_loc_Base::IsInt(), CSeq_loc_Base::IsWhole(), CSequence::kGapChar, CDense_seg_Base::SetDim(), CSeq_align_Base::SetDim(), CDense_seg_Base::SetIds(), CDense_seg_Base::SetLens(), CDense_seg_Base::SetNumseg(), CSeq_align_Base::SetSegs(), CDense_seg_Base::SetStarts(), and CSeq_align_Base::SetType().

Referenced by GetResults().

◆ x_Init()

void CMultiAligner::x_Init ( void  )

Initialize class attributes that are not alignment parameters.

Definition at line 579 of file cobalt.hpp.

References NULL.

Referenced by CMultiAligner().

◆ x_InitAligner()

void CMultiAligner::x_InitAligner ( void  )

◆ x_InitColumn()

void CMultiAligner::x_InitColumn ( vector< SColumn >::iterator &  it,
size_t  len 

Definition at line 1160 of file cobalt.cpp.

References i, and len.

Referenced by x_MultiAlignClusters().

◆ x_InitInsertColumn()

void CMultiAligner::x_InitInsertColumn ( vector< SColumn >::iterator &  it,
size_t  len,
int  num,
int  cluster 

Definition at line 1175 of file cobalt.cpp.

References i, and len.

Referenced by x_MultiAlignClusters().

◆ x_InitParams()

void CMultiAligner::x_InitParams ( void  )

◆ x_LoadBlockBoundaries()

void CMultiAligner::x_LoadBlockBoundaries ( string  blockfile,
vector< SSegmentLoc > &  blocklist 

Given an RPS blast database, load a list of block offsets for each database sequence.

The list is resident in a text file, where each line is as follows

[seq ID] [oid of block] [start block offset] [end block offset]

Note that block offsets are zero-based

blockfileName of file containing list of offsets [in]
blocklistthe list of offsets read from file [out]

Definition at line 75 of file rps.cpp.

References buf, NCBI_THROW, and tmp.

Referenced by x_FindDomainHits().

◆ x_MakeClusterResidueFrequencies()

void CMultiAligner::x_MakeClusterResidueFrequencies ( void  )

◆ x_MakeFillerBlocks()

void CMultiAligner::x_MakeFillerBlocks ( const vector< int > &  blastp_indices,
vector< CRef< objects::CSeq_loc > > &  filler_locs,
vector< SSegmentLoc > &  filler_segs 

Turn all fragments of selected query sequence not already covered by a domain hit into a separate query sequence, used as input to a blast search.

blastp_indicesIndices of query sequences selected for blastp search [in]
filler_locsList of generated sequences [out]
filler_segsSimplified representation of filler_locs [out]

Definition at line 95 of file blast.cpp.

References _ASSERT, CHitList::GetHit(), GetLength(), CHit::GetSubHit(), CMultiAlignerOptions::GetVerbose(), CHit::HasSubHits(), i, int, ITERATE, CHit::kMinHitSize, m_CombinedHits, m_Options, m_QueryData, m_Scope, CHit::m_SeqIndex1, CHit::m_SeqIndex2, CHit::m_SeqRange1, CHit::m_SeqRange2, m_tQueries, CHitList::Size(), and x_AddNewSegment().

Referenced by x_FindLocalHits().

◆ x_MultiAlignClusters()

void CMultiAligner::x_MultiAlignClusters ( void  )

◆ x_RealignBlocks()

void CMultiAligner::x_RealignBlocks ( CHitList rps_hits,
vector< SSegmentLoc > &  blocklist,
CProfileData profile_data 

Definition at line 114 of file rps.cpp.

References _ASSERT, CHit::AddUpSubHits(), eInterrupt, COpenRange< Position >::Empty(), CNWAligner::eTS_Delete, CNWAligner::eTS_Insert, CNWAligner::eTS_Match, CHit::GetEditScript(), CMultiAlignerOptions::GetEndGapExtendPenalty(), CMultiAlignerOptions::GetEndGapOpenPenalty(), CPSSMAligner::GetEndWg(), CMultiAligner::SSegmentLoc::GetFrom(), CRange_Base::GetFrom(), CMultiAlignerOptions::GetGapExtendPenalty(), CMultiAlignerOptions::GetGapOpenPenalty(), CHitList::GetHit(), COpenRange< Position >::GetLength(), CProfileData::GetPssm(), CHit::GetRangeFromSeq2(), CEditScript::GetScore(), CProfileData::GetSeqOffsets(), CMultiAligner::SSegmentLoc::GetTo(), CRange_Base::GetTo(), CNWAligner::GetTranscript(), CMultiAlignerOptions::GetVerbose(), CNWAligner::GetWg(), CNWAligner::GetWs(), CHit::HasSubHits(), i, CHit::InsertSubHit(), CHit::kMinHitSize, kRpsScaleFactor, m_Aligner, m_Interrupt, m_Options, m_ProgressMonitor, m_QueryData, CHit::m_SeqIndex1, CHit::m_SeqIndex2, CHit::m_SeqRange1, CHit::m_SeqRange2, CEditScript::MakeEditScript(), max(), min(), NCBI_THROW, CHitList::PurgeUnwantedHits(), query, CMultiAligner::SSegmentLoc::range, CHit::ResolveSubHitConflicts(), CPSSMAligner::Run(), CMultiAligner::SSegmentLoc::seq_index, COpenRange< Position >::Set(), CNWAligner::SetEndSpaceFree(), CPSSMAligner::SetEndWg(), CPSSMAligner::SetEndWs(), CRange_Base::SetFrom(), CHitList::SetKeepHit(), CPSSMAligner::SetSequences(), CPSSMAligner::SetStartWg(), CPSSMAligner::SetStartWs(), CRange_Base::SetTo(), CPSSMAligner::SetWg(), CPSSMAligner::SetWs(), and CHitList::Size().

Referenced by x_FindDomainHits().

◆ x_RealignSequences()

double CMultiAligner::x_RealignSequences ( const TPhyTreeNode input_cluster,
vector< CSequence > &  alignment,
CNcbiMatrix< CHitList > &  pair_info,
double  score,
int  iteration 

Perform a single bipartition on a multiple alignment.

input_clusterA tree describing sequences that form one half of the bipartition [in]
alignmentA complete multiple alignment, updated with the bipartition results if they have a higher score [in][out]
pair_infoPairwise constraints on the alignment [in]
scoreStarting alignment score [in]
iterationThe iteration number [in]
The score of the new multiple alignment

Definition at line 1415 of file prog.cpp.

References CSequence::CompressSequences(), GetTree(), CMultiAlignerOptions::GetVerbose(), i, int, CTree::ListTreeLeaves(), m_Options, m_QueryData, x_AlignProfileProfile(), and x_GetScore().

Referenced by x_BuildAlignmentIterative().

◆ x_Run()

void CMultiAligner::x_Run ( void  )

Align the current set of input sequences (reset any existing alignment information).

This function handles the generation of all internal state in the correct order. It is sufficient for 'black box' applications that only want a final answer without tweaking internal state.

Definition at line 590 of file cobalt.cpp.

References CMultiAlignerOptions::eMulti, CMultiAlignerOptions::eNone, CMultiAlignerOptions::eToPrototype, CClusterer::GetClusters(), NStr::IntToString(), kClusterNodeId, m_AllQueries, m_ClustAlnMethod, m_Clusterer, m_InMSA1, m_tQueries, m_Tree, NCBI_THROW, CTree::SetTree(), x_AlignInClusters(), x_AlignMSAs(), x_BuildAlignment(), x_BuildFullTree(), x_ComputeClusterTrees(), x_ComputeTree(), x_CreateBlastQueries(), x_CreatePatternQueries(), x_FindConsistentHitSubset(), x_FindDomainHits(), x_FindLocalHits(), x_FindLocalInClusterHits(), x_FindPatternHits(), x_FindQueryClusters(), and x_MultiAlignClusters().

Referenced by Run().

◆ x_SetDomainHits()

void CMultiAligner::x_SetDomainHits ( const blast::TSeqLocVector queruies,
const vector< int > &  indices,
const objects::CBlast4_archive &  archive 

Set pre-computed domain hits using BLAST archive format.

queriesSequences for which alignment hits will be searched for in the pre-computed set [in]
indicesIndex of each query in the array of sequences to align [in]
archiveBLAST archive format [in]

Queries from the archive are matched to COBALT queries by Seq_ids. The two sets of queries do not need to be the same. Domain hits for the archive queries that do not match any of the COBALT queries are ignored. COBALT will do RPS-BLAST search for all of its sequences that were not matched to the archive queries. It is the responsibility of the user to ensure that the pre-computed hits and COBALT refer to the same domain database.

Definition at line 664 of file rps.cpp.

References _ASSERT, CHitList::AddToHitList(), compare_seqids(), CSeq_id::CompareOrdered(), ConstBegin(), eDetectLoops, CSeqDB::eProtein, CSeq_align_set_Base::Get(), CBlast4_get_search_results_reply_Base::GetAlignments(), CBlast4_queries_Base::GetBioseq_set(), CBlast4_request_Base::GetBody(), CSeq_align_Base::C_Segs::GetDenseg(), CRange_Base::GetFrom(), CHitList::GetHit(), CScore_Base::GetId(), CDense_seg_Base::GetIds(), CScore_Base::C_Value::GetInt(), CBlast4_queue_search_request_Base::GetQueries(), CBlast4_request_body_Base::GetQueue_search(), CScore_Base::C_Value::GetReal(), CBlast4_archive_Base::GetRequest(), CBlast4_archive_Base::GetResults(), CMultiAlignerOptions::GetRpsDb(), CMultiAlignerOptions::GetRpsEvalue(), CSeq_align_Base::GetScore(), CSeq_align_Base::GetSegs(), CSeq_align::GetSeq_id(), CBlast4_queries_Base::GetSeq_loc_list(), CObject_id_Base::GetStr(), CRange_Base::GetTo(), CScore_Base::GetValue(), CMultiAlignerOptions::GetVerbose(), i, CBlast4_queries_Base::IsBioseq_set(), CBlast4_queries_Base::IsSeq_loc_list(), ITERATE, m_DomainHits, m_IsDomainSearched, m_Options, CHit::m_Score, CHit::m_SeqIndex1, CHit::m_SeqIndex2, CHit::m_SeqRange1, CHit::m_SeqRange2, m_tQueries, NCBI_THROW, NULL, CHitList::PurgeAllHits(), CSeqDB::SeqidToOid(), CHitList::Size(), and ct::sort().

Referenced by CMultiAlignerTest::SetDomainHits(), and x_FindDomainHits().

◆ x_SetScoreMatrix()

void CMultiAligner::x_SetScoreMatrix ( const char *  matrix_name)

Set the score matrix the aligner will use.

NOTE that at present any hits between sequences will always be scored using BLOSUM62; the matrix chosen here is only used when forming the complete alignment

matrix_nameThe score matrix to use; limited to the same list of matrices as blast [in]

Definition at line 381 of file cobalt.cpp.

References m_Aligner, NCBI_THROW, NCBISM_Blosum45, NCBISM_Blosum62, NCBISM_Blosum80, NCBISM_Pam250, NCBISM_Pam30, NCBISM_Pam70, CPSSMAligner::SetScoreMatrix(), and util::strcmp().

Referenced by x_InitAligner().

◆ x_ValidateInputMSAs()

bool CMultiAligner::x_ValidateInputMSAs ( void  ) const

Validate input alignments.

Throws if alignments do not pass validation.

True if validation passed

Definition at line 129 of file cobalt.cpp.

References NStr::IntToString(), ITERATE, m_InMSA1, m_InMSA2, m_Msa1Repr, m_Msa2Repr, and NCBI_THROW.

Referenced by SetInputMSAs().

◆ x_ValidateQueries()

bool CMultiAligner::x_ValidateQueries ( void  ) const

Validate query sequences.

Check for gaps in sequences. Throws if queries do not pass validation.

True if validation passed

Definition at line 114 of file cobalt.cpp.

References i, ITERATE, CSequence::kGapChar, m_QueryData, and NCBI_THROW.

Referenced by SetQueries().

◆ x_ValidateUserHits()

bool CMultiAligner::x_ValidateUserHits ( void  )

Validate user constraints with queries.

Throws if constraints do not pass validation.

True if validation passed

Definition at line 159 of file cobalt.cpp.

References CRange_Base::GetFrom(), CHitList::GetHit(), GetLength(), CRange_Base::GetTo(), i, m_QueryData, CHit::m_SeqIndex1, CHit::m_SeqIndex2, CHit::m_SeqRange1, CHit::m_SeqRange2, m_UserHits, NCBI_THROW, and CHitList::Size().

Referenced by SetInputMSAs(), and SetQueries().

Friends And Related Function Documentation

◆ ::CMultiAlignerTest

friend class ::CMultiAlignerTest

Definition at line 749 of file cobalt.hpp.

◆ compare_sseg_db_idx

friend class compare_sseg_db_idx

Definition at line 519 of file cobalt.hpp.

Member Data Documentation

◆ kClusterNodeId

const int CMultiAligner::kClusterNodeId = 16000

Definition at line 747 of file cobalt.hpp.

Referenced by x_AlignProgressive(), x_AttachClusterTrees(), x_BuildAlignment(), and x_Run().

◆ kDefaultGapExtend

const CNWAligner::TScore CMultiAligner::kDefaultGapExtend = -1

Default gap extension penalty.

Definition at line 300 of file cobalt.hpp.

◆ kDefaultGapOpen

const CNWAligner::TScore CMultiAligner::kDefaultGapOpen = -11

Default gap open penalty.

Definition at line 297 of file cobalt.hpp.

◆ kMajorVersion

const int CMultiAligner::kMajorVersion = 3

Version information.

Definition at line 72 of file cobalt.hpp.

◆ kMinorVersion

const int CMultiAligner::kMinorVersion = 0

Definition at line 73 of file cobalt.hpp.

◆ kPatchVersion

const int CMultiAligner::kPatchVersion = 0

Definition at line 74 of file cobalt.hpp.

◆ kRpsScaleFactor

const int CMultiAligner::kRpsScaleFactor = 100

◆ m_Aligner

CPSSMAligner CMultiAligner::m_Aligner

◆ m_AllQueries

vector< CRef<objects::CSeq_loc> > CMultiAligner::m_AllQueries

Definition at line 727 of file cobalt.hpp.

Referenced by x_FindQueryClusters(), x_MultiAlignClusters(), and x_Run().

◆ m_AllQueryData

vector<CSequence> CMultiAligner::m_AllQueryData

◆ m_ClustAlnMethod

CMultiAlignerOptions::EInClustAlnMethod CMultiAligner::m_ClustAlnMethod

◆ m_Clusterer

CClusterer CMultiAligner::m_Clusterer

◆ m_ClusterGapPositions

vector< vector<Uint4> > CMultiAligner::m_ClusterGapPositions

◆ m_CombinedHits

CHitList CMultiAligner::m_CombinedHits

◆ m_DomainHits

CHitList CMultiAligner::m_DomainHits

◆ m_InMSA1

vector<CSequence> CMultiAligner::m_InMSA1

Input alignment.

Definition at line 694 of file cobalt.hpp.

Referenced by SetInputMSAs(), x_AlignMSAs(), x_CreateBlastQueries(), x_Run(), and x_ValidateInputMSAs().

◆ m_InMSA2

vector<CSequence> CMultiAligner::m_InMSA2

Input alignment.

Definition at line 696 of file cobalt.hpp.

Referenced by SetInputMSAs(), x_AlignMSAs(), x_CreateBlastQueries(), and x_ValidateInputMSAs().

◆ m_Interrupt

FInterruptFn CMultiAligner::m_Interrupt

◆ m_IsDomainSearched

vector<bool> CMultiAligner::m_IsDomainSearched

Marks sequences with pre-computed domain hits.

Definition at line 724 of file cobalt.hpp.

Referenced by CMultiAlignerTest::GetIsDomainSearched(), x_FindDomainHits(), and x_SetDomainHits().

◆ m_LocalHits

CHitList CMultiAligner::m_LocalHits

Definition at line 717 of file cobalt.hpp.

Referenced by Reset(), x_AlignFillerBlocks(), and x_FindLocalHits().

◆ m_LocalInClusterHits

CHitList CMultiAligner::m_LocalInClusterHits

◆ m_Messages

vector<string> CMultiAligner::m_Messages

Definition at line 739 of file cobalt.hpp.

Referenced by Run(), and x_AlignProfileProfileUsingHit().

◆ m_Msa1Repr

vector<int> CMultiAligner::m_Msa1Repr

Indices of sequence representatives in input alignment 1.

Definition at line 699 of file cobalt.hpp.

Referenced by SetInputMSAs(), x_CreateBlastQueries(), and x_ValidateInputMSAs().

◆ m_Msa2Repr

vector<int> CMultiAligner::m_Msa2Repr

Indices of sequence representatives in input alignment 2.

Definition at line 701 of file cobalt.hpp.

Referenced by SetInputMSAs(), x_CreateBlastQueries(), and x_ValidateInputMSAs().

◆ m_Options

CConstRef<CMultiAlignerOptions> CMultiAligner::m_Options

◆ m_PatternHits

CHitList CMultiAligner::m_PatternHits

Definition at line 719 of file cobalt.hpp.

Referenced by Reset(), x_BuildAlignmentIterative(), and x_FindPatternHits().

◆ m_ProgressMonitor

SProgress CMultiAligner::m_ProgressMonitor

◆ m_QueryData

vector<CSequence> CMultiAligner::m_QueryData

◆ m_Results

vector<CSequence> CMultiAligner::m_Results

◆ m_RPSLocs

vector< vector<TRange> > CMultiAligner::m_RPSLocs

Definition at line 731 of file cobalt.hpp.

Referenced by x_AddRpsFreqsToCluster(), x_AssignRPSResFreqs(), and x_FindDomainHits().

◆ m_Scope

CRef<objects::CScope> CMultiAligner::m_Scope

◆ m_Score

int CMultiAligner::m_Score

Alignment score.

Definition at line 706 of file cobalt.hpp.

Referenced by x_BuildAlignmentIterative().

◆ m_tQueries

vector< CRef<objects::CSeq_loc> > CMultiAligner::m_tQueries

◆ m_Tree

CTree CMultiAligner::m_Tree

Definition at line 711 of file cobalt.hpp.

Referenced by GetTreeContainer(), x_BuildFullTree(), x_ComputeTree(), and x_Run().

◆ m_UserHits

CHitList CMultiAligner::m_UserHits

