40 #include "../core/blast_psi_priv.h"
70 #ifndef GAP_IN_ALIGNMENT
72 # define GAP_IN_ALIGNMENT ((Uint4)-1)
78 unsigned int query_length,
82 const char* matrix_name,
86 const string& query_title)
87 : m_GapExistence(gap_existence), m_GapExtension(gap_extension)
93 if ( !sset || sset->Get().front()->GetDim() != 2) {
95 "Only 2-dimensional alignments are supported");
135 "Multiple alignment data structure");
183 return hit_ids.
size();
200 inline unsigned char*
236 CPsiBlastInputData::x_ExtractAlignmentDataUseBestAlign()
247 const CSeq_align::C_Segs::TDisc::Tdata& hsp_list =
248 (*itr)->GetSegs().GetDisc().Get();
249 CSeq_align::C_Segs::TDisc::Tdata::const_iterator best_alignment;
254 ITERATE(CSeq_align::C_Segs::TDisc::Tdata, hsp_itr, hsp_list) {
258 if ( !(*hsp_itr)->GetSegs().IsDenseg() ) {
260 "Segment type not supported");
263 double evalue = s_GetLowestEvalue((*hsp_itr)->GetScore());
264 if (evalue < min_evalue) {
265 best_alignment = hsp_itr;
269 _ASSERT(best_alignment != hsp_list.end());
272 seq_index, min_evalue);
307 CSeq_id* current_sid =
const_cast<CSeq_id*
> (&(*itr)->GetSeq_id(1));
311 if (last_sid && !current_sid->
Match(*last_sid)) {
318 const CDense_seg& seg = (*itr)->GetSegs().GetDenseg();
321 last_sid = current_sid;
327 unsigned int msa_index,
336 const int kNumSegments = denseg.GetNumseg();
337 const TSeqPos kDimensions = denseg.GetDim();
350 if (seq.size() == 0) {
358 #ifdef DEBUG_PSSM_ENGINE
359 _ASSERT(denseg.CanGetIds() && denseg.GetIds().size() == 2);
360 if (denseg.GetIds().back()->IsGi()) {
361 m_Msa->seqinfo[msa_index].gi = denseg.GetIds().back()->GetGi();
363 m_Msa->seqinfo[msa_index].evalue = evalue;
364 m_Msa->seqinfo[msa_index].bit_score = bit_score;
368 for (
int segmt_idx = 0; segmt_idx < kNumSegments; segmt_idx++) {
370 TSeqPos query_offset = starts[query_index];
371 TSeqPos subject_offset = starts[subj_index];
374 query_index += kDimensions;
375 subj_index += kDimensions;
380 subj_seq_idx += lengths[segmt_idx];
386 for (
TSeqPos i = 0;
i < lengths[segmt_idx];
i++) {
397 for (
TSeqPos i = 0;
i < lengths[segmt_idx];
i++, subj_seq_idx++) {
401 msa_cell.
letter =
static_cast<Uint1>(seq[subj_seq_idx]);
413 objects::CScope& scope,
414 string& sequence_data)
419 bool subj_start_found =
false;
420 const int kNumSegments = ds.GetNumseg();
421 const TSeqPos kDimensions = ds.GetDim();
427 for (
int i = 0;
i < kNumSegments;
i++) {
430 if ( !subj_start_found ) {
431 subj_start = starts[subj_index];
432 subj_start_found =
true;
434 subjlen += lengths[
i];
437 subj_index += kDimensions;
442 subj_start+subjlen-1);
449 sequence_data.erase();
User-defined methods of the data storage class.
Declares the BLAST exception class.
PSIMsa * PSIMsaFree(PSIMsa *msa)
Deallocates the PSIMsa structure.
PSIMsa * PSIMsaNew(const PSIMsaDimensions *dimensions)
Allocates and initializes the multiple sequence alignment data structure for use as input to the PSSM...
const unsigned int kQueryIndex
Index into multiple sequence alignment structure for the query sequence.
Defines BLAST error codes (user errors included)
Defines system exceptions occurred while running BLAST.
Auxiliary class to retrieve sequence identifiers and their positions in the alignment which are below the in...
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
void Process()
The work to process the alignment is done here.
unsigned int GetNumAlignedSequences() const
Returns the number of sequences that make up the multiple sequence alignment.
#define GAP_IN_ALIGNMENT
Representation of GAP in Seq-align.
CConstRef< objects::CSeq_align_set > m_SeqAlignSet
Pairwise alignment result of a BLAST search.
void x_CopyQueryToMsa()
Copies query sequence data to multiple alignment data structure.
PSIMsa * m_Msa
Structure representing the multiple sequence alignment.
PSIDiagnosticsRequest * m_DiagnosticsRequest
Diagnostics request structure.
virtual const char * GetMatrixName()
Obtain the name of the underlying matrix to use when building the PSSM.
const PSIBlastOptions * GetOptions()
Obtain the options for the PSSM engine.
PSIMsaDimensions m_MsaDimensions
Multiple sequence alignment dimensions.
PSIBlastOptions m_Opts
Algorithm options.
virtual ~CPsiBlastInputData()
virtual destructor
CRef< objects::CScope > m_Scope
Scope from which to retrieve the sequences in the alignment.
unsigned int x_CountAndSelectQualifyingAlignments()
Examines the sequence alignment and keeps track of those hits which have an HSP with an e-value below...
unsigned int GetQueryLength()
Get the query's length.
CRef< objects::CBioseq > m_QueryBioseq
Query as CBioseq for PSSM.
const Uint1 AMINOACID_TO_NCBISTDAA[]
Translates between ncbieaa and ncbistdaa.
void x_ProcessDenseg(const objects::CDense_seg &denseg, unsigned int msa_index, double evalue, double bit_score)
Iterates over the Dense-seg passed in and extracts alignment information to multiple alignment data s...
const PSIDiagnosticsRequest * GetDiagnosticsRequest()
Obtain the diagnostics data that is requested from the PSSM engine.
const char * GetMatrixName()
Obtain the name of the underlying matrix to use when building the PSSM.
PSIMsa * GetData()
Obtain the multiple sequence alignment structure.
double GetLowestEvalue(const objects::CDense_seg::TScores &scores, double *bit_score)
Returns the lowest score from the list of scores in CDense_seg::TScores.
void x_ExtractAlignmentData()
Populates the multiple alignment data structure.
unsigned char * m_Query
Pointer to query sequence.
CPsiBlastInputData(const unsigned char *query, unsigned int query_length, CConstRef< objects::CSeq_align_set > sset, CRef< objects::CScope > scope, const PSIBlastOptions &opts, const char *matrix_name=NULL, int gap_existence=0, int gap_opening=0, const PSIDiagnosticsRequest *diags=NULL, const string &query_title="")
Construct a concrete strategy, used to configure the CPssmEngine object.
string m_MatrixName
Underlying matrix to use.
string m_QueryTitle
Title of query.
unsigned char * GetQuery()
Get the query sequence used as master for the multiple sequence alignment in ncbistdaa encoding.
static void x_GetSubjectSequence(const objects::CDense_seg &ds, objects::CScope &scope, string &sequence_data)
Tries to fetch the sequence data for the subject for the segments specified in the Dense-seg.
void x_ExtractQueryForPssm()
Extracts the query bioseq from m_SeqAlignSet.
unsigned int TSeqPos
Type for sequence locations and lengths.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
int TSignedSeqPos
Type for signed sequence position.
const TSeqPos kInvalidSeqPos
Define special value for invalid sequence position.
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
void Warning(CExceptionArgs_Base &args)
const string AsFastaString(void) const
bool Match(const CSeq_id &sid2) const
Match() - TRUE if SeqIds are equivalent.
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
void SetCoding(TCoding coding)
void Reset(void)
Reset reference object.
void Reset(void)
Reset reference object.
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
bool NotEmpty(void) const THROWS_NONE
Check if CConstRef is not empty – pointing to an object and has a non-null value.
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
uint8_t Uint1
1-byte (8-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define USING_SCOPE(ns)
Use the specified namespace.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define BEGIN_SCOPE(ns)
Define a new scope.
vector< TSignedSeqPos > TStarts
list< CRef< CSeq_align > > Tdata
const TId & GetId(void) const
Get the Id member data.
const TInt & GetInt(void) const
Get the variant data.
TTitle & SetTitle(void)
Select the variant.
@ eRepr_raw
continuous sequence
@ e_Ncbistdaa
consecutive codes for std aas
Declarations of auxiliary functions/classes for PSI-BLAST.
Options used in protein BLAST only (PSI, PHI, RPS and translated BLAST). Some of these possibly should...
double inclusion_ethresh
Minimum evalue for inclusion in PSSM calculation.
Structure to allow requesting various diagnostics data to be collected by PSSM engine.
Structure to describe the characteristics of a position in the multiple sequence alignment data struc...
Boolean is_aligned
Is this letter part of the alignment?
Uint1 letter
Preferred letter at this position, in ncbistdaa encoding.
Uint4 num_seqs
Number of distinct sequences aligned with the query (does not include the query)
Uint4 query_length
Length of the query.
Multiple sequence alignment (msa) data structure containing the raw data needed by the PSSM engine to...
PSIMsaCell ** data
actual data, dimensions are (dimensions->num_seqs+1) by (dimensions->query_length)