40 #include "../core/blast_psi_priv.h"
63 #ifndef GAP_IN_ALIGNMENT
65 # define GAP_IN_ALIGNMENT ((Uint4)-1)
75 const string& matrix_name ,
79 const string& query_title )
80 : m_QueryTitle(query_title),
82 m_SeqalignSet(seqaligns),
85 m_MatrixName(matrix_name),
86 m_DiagnosticsRequest(diags),
88 m_GapExistence(gap_existence),
89 m_GapExtension(gap_extension)
95 if (seqaligns.
Empty()) {
106 for (
unsigned int i=0;
i <
m_Hits.size();
i++) {
118 "Minimum RPS-BLAST e-value is larger than the maximum one");
158 "Evalue not found in Seq-align");
161 if (evalue >= min_evalue && evalue < max_evalue) {
162 m_Hits.push_back(
new CHit((*it)->GetSegs().GetDenseg(), evalue));
177 vector<CHit*> new_hits;
178 new_hits.reserve(
m_Hits.size());
180 new_hits.push_back(
m_Hits[0]);
182 vector<CHit*>::iterator it(
m_Hits.begin());
186 for (;it !=
m_Hits.end();++it) {
190 for (
int i=
static_cast<int>(new_hits.size()) - 1;
i >= 0
191 && (*it)->m_SubjectId->Match(*new_hits[
i]->m_SubjectId);
i--) {
193 const CHit* kept_hit = new_hits[
i];
198 CHit intersection(*kept_hit);
204 (*it)->Subtract(intersection);
206 if ((*it)->IsEmpty()) {
213 new_hits.push_back(*it);
235 (*it)->FillData(seqdb, *profile_data);
242 const int kQueryLength =
static_cast<int>(
m_QueryData.size());
243 const int kNumCds =
static_cast<int>(
m_Hits.size());
250 m_MsaData.resize(kQueryLength * (kNumCds), cell);
254 "Multiple alignment data structure");
256 for (
int i=0;
i < kNumCds;
i++) {
261 for (
size_t hit_idx=0;hit_idx <
m_Hits.size();hit_idx++) {
265 m_Hits[hit_idx]->GetSegments()) {
267 const int kNumQueryColumns
268 = (*it)->m_QueryRange.GetTo() - (*it)->m_QueryRange.GetFrom();
270 int q_from = (*it)->m_QueryRange.GetFrom();
273 for (
int i=0;
i < kNumQueryColumns;
i++) {
276 m_Msa[hit_idx][q_from +
i].
data = &(*it)->m_MsaData[
i];
279 m_Hits[hit_idx]->m_MsaIdx =
static_cast<int>(hit_idx);
289 const int kQueryLength =
static_cast<int>(
m_QueryData.size());
290 const int kNumCds =
static_cast<int>(
m_Hits.size());
292 for (
int i=0;
i < kNumCds;
i++) {
296 for (
int i=0;
i < kNumCds;
i++) {
297 for (
int j=0;j < kQueryLength;j++) {
301 "Query sequence cannot contain gaps");
312 if (data->iobsr <= 0.0) {
314 "Zero independent observations in domain model");
320 if (data->wfreqs[k] < 0.0) {
322 "Negative residue frequency in a domain "
325 s += data->wfreqs[k];
331 if (
fabs(s - 1.0) > 1e-5) {
333 "Domain residue frequencies do not sum to 1");
344 : m_Evalue(evalue), m_MsaIdx(-1)
346 const int kNumDims = denseg.
GetDim();
347 const int kNumSegments = denseg.
GetNumseg();
351 m_SubjectId.Reset(denseg.
GetIds()[1].GetNonNullPointer());
353 const vector<TSignedSeqPos>& starts = denseg.
GetStarts();
354 const vector<TSeqPos>& lens = denseg.
GetLens();
359 for (
int seg=0;seg < kNumSegments;seg++) {
360 TSeqPos query_offset = starts[query_index];
361 TSeqPos subject_offset = starts[subject_index];
363 query_index += kNumDims;
364 subject_index += kNumDims;
370 m_SegmentList.push_back(
new CHitSegment(
371 TRange(query_offset, query_offset + lens[seg]),
372 TRange(subject_offset, subject_offset
375 query_offset += lens[seg];
376 subject_offset += lens[seg];
383 : m_SubjectId(hit.m_SubjectId),
384 m_Evalue(hit.m_Evalue),
385 m_MsaIdx(hit.m_MsaIdx)
396 ITERATE (vector<CHitSegment*>, it, m_SegmentList) {
409 ITERATE (vector<CHitSegment*>, it, m_SegmentList) {
410 result += (*it)->GetLength();
426 (*it)->FillData(db_oid, profile_data);
484 ITERATE (vector<CHitSegment*>, it, m_SegmentList) {
495 if (m_SegmentList.empty()) {
499 ITERATE (vector<CHitSegment*>, it, m_SegmentList) {
500 if (!(*it)->IsEmpty()) {
515 vector<TRange>::const_iterator r_itr = ranges.begin();
516 vector<CHitSegment*>::iterator seg_it = m_SegmentList.begin();
517 vector<CHitSegment*> new_segs;
518 while (seg_it != m_SegmentList.end() && r_itr != ranges.end()) {
522 = (app == eSubject ? (*seg_it)->m_SubjectRange
523 : (*seg_it)->m_QueryRange);
526 while (r_itr != ranges.end() && r_itr->GetTo() < seg_range.
GetFrom()) {
530 if (r_itr == ranges.end()) {
538 if (intersection == seg_range) {
544 if (intersection.
Empty()) {
554 while (r_itr != ranges.end() && r_itr->GetFrom() < seg_range.
GetTo()) {
558 r_itr->GetFrom()) - seg_range.
GetFrom();
560 r_itr->GetTo()) - seg_range.
GetTo();
565 new_segs.push_back(new_seg);
580 while (seg_it != m_SegmentList.end()) {
587 ITERATE (vector<CHitSegment*>, it, m_SegmentList) {
589 new_segs.push_back(*it);
594 m_SegmentList.swap(new_segs);
601 vector<TRange> ranges;
604 ranges.push_back(app == eQuery ? (*it)->m_QueryRange
605 : (*it)->m_SubjectRange);
610 IntersectWith(ranges, app);
618 if (IsEmpty() || hit.
IsEmpty()) {
626 int from = hit.
GetSegments().front()->m_QueryRange.GetFrom();
627 int to = hit.
GetSegments().back()->m_QueryRange.GetTo();
630 if (m_SegmentList.front()->m_QueryRange.GetFrom() >= to
631 || m_SegmentList.back()->m_QueryRange.GetTo() <= from) {
637 vector<CHitSegment*>::iterator it = m_SegmentList.begin();
639 vector<CHitSegment*> new_segments;
640 new_segments.reserve(m_SegmentList.size());
644 while (it != m_SegmentList.end() && (*it)->m_QueryRange.GetTo() <= from) {
646 new_segments.push_back(*it);
653 if (it == m_SegmentList.end() || (*it)->m_QueryRange.GetFrom() > to) {
658 if ((*it)->m_QueryRange.GetTo() > to) {
665 if ((*it)->m_QueryRange.GetFrom() < from) {
670 int d_to = from - (*it)->m_QueryRange.GetTo();
672 (*it)->AdjustRanges(0, d_to);
673 _ASSERT((*it)->m_QueryRange.GetFrom() < (*it)->m_QueryRange.GetTo());
674 new_segments.push_back(*it);
684 _ASSERT((*it)->m_QueryRange.GetFrom() < (*it)->m_QueryRange.GetTo());
685 new_segments.push_back(new_seg);
689 for (;it != m_SegmentList.end();++it) {
690 new_segments.push_back(*it);
697 if ((*it)->m_QueryRange.GetFrom() >= from) {
704 int d_to = from - (*it)->m_QueryRange.GetTo();
707 (*it)->AdjustRanges(0, d_to);
708 _ASSERT((*it)->m_QueryRange.GetFrom() < (*it)->m_QueryRange.GetTo());
709 new_segments.push_back(*it);
714 while (it != m_SegmentList.end()
715 && (*it)->m_QueryRange.GetTo() <= to) {
723 if (it != m_SegmentList.end()) {
725 if ((*it)->m_QueryRange.GetFrom() < to) {
726 int d_from = to - (*it)->m_QueryRange.GetFrom();
729 (*it)->AdjustRanges(d_from, 0);
730 _ASSERT((*it)->m_QueryRange.GetFrom()
731 < (*it)->m_QueryRange.GetTo());
733 new_segments.push_back(*it);
742 while (it != m_SegmentList.end()) {
743 new_segments.push_back(*it);
749 m_SegmentList.swap(new_segments);
759 m_MsaData.resize(m_QueryRange.GetTo() - m_QueryRange.GetFrom(), d);
761 x_FillResidueCounts(db_oid, profile_data);
762 x_FillObservations(db_oid, profile_data);
768 _ASSERT(m_QueryRange.GetFrom() >= 0 && m_QueryRange.GetTo() >= 0);
769 _ASSERT(m_SubjectRange.GetFrom() >= 0 && m_SubjectRange.GetTo() >= 0);
771 const int kQueryLength = m_QueryRange.GetTo() - m_QueryRange.GetFrom();
772 const int kSubjectLength = m_SubjectRange.GetTo() - m_SubjectRange.GetFrom();
774 if (kQueryLength != kSubjectLength) {
790 m_QueryRange.SetFrom(m_QueryRange.GetFrom() + d_from);
791 m_QueryRange.SetTo(m_QueryRange.GetTo() + d_to);
793 m_SubjectRange.SetFrom(m_SubjectRange.GetFrom() + d_from);
794 m_SubjectRange.SetTo(m_SubjectRange.GetTo() + d_to);
800 return m_QueryRange.GetFrom() > m_QueryRange.GetTo()
801 || m_SubjectRange.GetFrom() > m_SubjectRange.GetTo();
807 _ASSERT(profile_data()->freq_header);
812 _ASSERT(db_oid < num_profiles);
821 int db_seq_length = db_seq_offsets[db_oid + 1] - db_seq_offsets[db_oid];
826 _ASSERT(m_SubjectRange.GetTo() <= db_seq_length);
831 for (
int i=0;
i < num_columns;
i++) {
845 (double)counts[(m_SubjectRange.GetFrom() +
i) *
kAlphabetSize + j]
856 _ASSERT(profile_data()->obsr_header);
861 _ASSERT(db_oid < num_profiles);
865 const TObsr* data_start
868 const TObsr* data = data_start + offsets[db_oid];
869 int data_size = offsets[db_oid + 1] - offsets[db_oid];
873 for (
int i=0;
i < data_size;
i+=2) {
878 for (
int j=0;j < num;j++) {
883 int num_columns = m_SubjectRange.GetTo() - m_SubjectRange.GetFrom();
884 for (
int i=0;
i < num_columns;
i++) {
User-defined methods of the data storage class.
Declares the BLAST exception class.
Defines BLAST error codes (user errors included)
Wrapper class to manage the BlastRPSInfo structure, as currently there aren't any allocation or deall...
Defines system exceptions occurred while running BLAST.
bool SeqidToOid(const CSeq_id &seqid, int &oid) const
Translate a Seq-id to any matching OID.
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
#define GAP_IN_ALIGNMENT
Representation of GAP in Seq-align.
virtual ~CCddInputData()
Virtual destructor.
CRef< objects::CBioseq > m_QueryBioseq
Query as Bioseq.
static const char kGapChar('-')
The representation of a gap in ASCII format.
void AdjustRanges(int d_from, int d_to)
Change ranges on query and subject by given values.
TRange m_QueryRange
Segment range on query.
vector< CHit * > m_Hits
RPS-BLAST hits in internal representation.
bool IsEmpty(void) const
Is hit empty.
string m_DbName
CDD database name.
CHit(const objects::CDense_seg &denseg, double evalue)
Constructor.
vector< CHitSegment * > m_SegmentList
List of hit segments.
void x_CreateMsa(void)
Create multiple alignment of CDs.
void x_ProcessAlignments(double min_evalue, double max_evalue)
Process RPS-BLAST hits.
void x_RemoveMultipleCdHits(void)
Remove multiple hits to the same CD.
vector< CHitSegment * > & GetSegments(void)
Get hit segments.
void Subtract(const CHit &hit)
Subtract from another hit from this hit using query ranges.
PSIBlastOptions m_Opts
Delta BLAST options for PSSM Engine.
int GetLength(void) const
Get hit length in residues, counts number of matching residues, gaps are not counted.
void Process(void)
Pre-process CD matches to query.
double m_MinEvalue
Min e-value threshold for all hits to be included in PSSM computation.
PSICdMsaCell ** m_Msa
Pointer to MSA.
CConstRef< objects::CSeq_align_set > m_SeqalignSet
RPS-BLAST hits for the query.
CCddInputData(const Uint1 *query, unsigned int query_length, CConstRef< objects::CSeq_align_set > seqaligns, const PSIBlastOptions &opts, const string &dbname, const string &matrix_name="BLOSUM62", int gap_existence=0, int gap_extension=0, PSIDiagnosticsRequest *diags=NULL, const string &query_title="")
Constructor.
bool x_ValidateHits(void) const
Validate internal representation of RPS-BLAST hits.
PSICdMsa m_CddData
MSA of CDs and CD data.
const Uint1 AMINOACID_TO_NCBISTDAA[]
Translates between ncbieaa and ncbistdaa.
vector< Uint1 > m_QueryData
Query sequence.
PSIMsaDimensions m_MsaDimensions
MSA dimensions, used by PSSM engine.
void x_FillResidueCounts(int db_oid, const CBlastRPSInfo &profile_data)
Populate arrays of weighted residue counts.
void x_FillObservations(int db_oid, const CBlastRPSInfo &profile_data)
Populate arrays of effective numbers of observations.
void FillData(int db_oid, const CBlastRPSInfo &profile_data)
Allocate and populate arrays for MSA data (weighted residue counts and effective observations used fo...
static const int kAlphabetSize
bool Validate(void) const
Validate hit.
vector< PSICdMsaCell > m_MsaData
MSA data.
Uint4 TFreqs
Type used for residue frequencies stored in CDD.
void IntersectWith(const vector< TRange > &segments, EApplyTo app)
Intersect hit segments with list of ranges and store result in hit segments.
bool Validate(void) const
Validate hit segment.
unsigned int GetQueryLength(void)
Get query length.
string m_QueryTitle
Query title (for PSSM)
static const int kRpsScaleFactor
Scale of residue frequencies and number of independent observations stored in CDD.
bool x_ValidateMsa(void) const
Validate multiple alignment of CDs.
void x_ExtractQueryForPssm(void)
Create query as Bioseq.
bool IsEmpty(void) const
Does the hit segment represent an empty range.
void FillData(const CSeqDB &seqdb, const CBlastRPSInfo &profile_data)
Allocate and populate arrays of data for PSSM computation.
Uint4 TObsr
Type used for number of independent observations stored in CDD.
void x_FillHitsData(void)
Read data needed for PSSM computation from CDD and populate arrays.
EApplyTo
Master selection for operations involving ranges.
@ fDeltaBlast
Flags set for DELTA-BLAST.
unsigned int TSeqPos
Type for sequence locations and lengths.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
bool Empty(void) const THROWS_NONE
Check if CConstRef is empty – not pointing to any object which means having a null value.
void Reset(void)
Reset reference object.
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
bool NotEmpty(void) const THROWS_NONE
Check if CConstRef is not empty – pointing to an object and has a non-null value.
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
uint8_t Uint1
1-byte (8-bit) unsigned integer
int32_t Int4
4-byte (32-bit) signed integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
TThisType IntersectionWith(const TThisType &r) const
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define USING_SCOPE(ns)
Use the specified namespace.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define BEGIN_SCOPE(ns)
Define a new scope.
TTo GetTo(void) const
Get the To member data.
TFrom GetFrom(void) const
Get the From member data.
const TStarts & GetStarts(void) const
Get the Starts member data.
const TLens & GetLens(void) const
Get the Lens member data.
TDim GetDim(void) const
Get the Dim member data.
const TIds & GetIds(void) const
Get the Ids member data.
TNumseg GetNumseg(void) const
Get the Numseg member data.
list< CRef< CSeq_align > > Tdata
TTitle & SetTitle(void)
Select the variant.
@ eRepr_raw
continuous sequence
char * dbname(DBPROCESS *dbproc)
Get name of current database.
unsigned int
A callback function used to compare two keys in a database.
bool is_aligned(T *p) noexcept
Check pointer alignment.
constexpr auto sort(_Init &&init)
Options used in protein BLAST only (PSI, PHI, RPS and translated BLAST) Some of these possibly should...
double inclusion_ethresh
Minimum evalue for inclusion in PSSM calculation.
Data needed for PSSM computation stored in MSA cell for single column in CD aligned to a position in ...
double iobsr
Effective number of independent observations in a CD column.
double * wfreqs
Frequencies for each residue in CD column.
Alignment cell that represents one column of CD aligned to a position in the query.
Uint1 is_aligned
Does this cell represent column aligned to a CD.
PSICdMsaCellData * data
Data needed for PSSM computation.
PSIMsaDimensions * dimensions
Query length and number of aligned cds.
unsigned char * query
Query sequence as Ncbistdaa.
PSICdMsaCell ** msa
Multiple alignment of CDs.
Structure to allow requesting various diagnostics data to be collected by PSSM engine.
Uint4 num_seqs
Number of distinct sequences aligned with the query (does not include the query)
Uint4 query_length
Length of the query.