36 #define BLAST_SECONDARY_SOURCE 1
111 static int GetAlphabetSize(
void);
115 static int GetProfilesDataScale(
void);
139 int qfrom,
int sfrom,
int len);
168 virtual unsigned char*
GetQuery(
void) {
// Hand back the address of the first element of m_Query; no copy is made,
// so the caller sees (and may modify) the member's own storage.
return &m_Query[0];}
181 static const int kQueryLength = 6;
212 _ASSERT(m_CdMsa.dimensions->num_seqs > 0);
213 _ASSERT(m_CdMsa.msa[0][0].data);
214 _ASSERT(m_CdMsa.msa[0][0].data->wfreqs);
225 _ASSERT(m_CdMsa.dimensions->num_seqs > 0);
226 _ASSERT(m_CdMsa.msa[0][0].data);
227 _ASSERT(m_CdMsa.msa[0][0].data->wfreqs);
232 m_CdMsa.msa[0][0].data->wfreqs[kResidueA] = -0.001;
233 m_CdMsa.msa[0][0].data->wfreqs[kResidueC] += 0.001;
243 _ASSERT(m_CdMsa.dimensions->num_seqs > 0);
244 _ASSERT(m_CdMsa.msa[0][0].data);
245 _ASSERT(m_CdMsa.msa[0][0].data->wfreqs);
249 m_CdMsa.msa[0][0].data->wfreqs[kResidueA] += 0.01;
258 _ASSERT(m_CdMsa.dimensions->num_seqs > 0);
259 _ASSERT(m_CdMsa.msa[0][0].data);
260 _ASSERT(m_CdMsa.msa[0][0].data->wfreqs);
262 m_CdMsa.msa[0][0].data->iobsr = 0.0;
306 seq_weights.
get()) == 0);
312 BOOST_REQUIRE_CLOSE(seq_weights->independent_observations[
i],
317 BOOST_REQUIRE_CLOSE(seq_weights->match_weights[
i][j],
350 seq_weights.
get()) == 0);
360 BOOST_REQUIRE_CLOSE(seq_weights->independent_observations[
i],
368 sum += seq_weights->match_weights[
i][j];
370 BOOST_REQUIRE_CLOSE(sum, 1.0, 1e-5);
400 seq_weights.
get()) == 0);
473 Int4 pseudo_count = 0;
479 sbp.
Get(), pseudo_count,
480 internal_pssm.
get()));
484 internal_pssm.
get()));
488 internal_pssm.
get()));
492 internal_pssm.
get()));
495 sbp.
Get(), pseudo_count,
NULL));
509 const string seqalign(
"data/cdd-129295.asn");
510 const string rpsdb(
"data/deltatest");
517 unique_ptr<CObjectIStream>
in
522 BOOST_REQUIRE(sas->
Get().size() != 0);
539 pssm_input->Process();
546 BOOST_REQUIRE(profile_data()->freq_header);
547 BOOST_REQUIRE(profile_data()->obsr_header);
548 BOOST_REQUIRE_EQUAL(profile_data()->freq_header->num_profiles,
549 profile_data()->obsr_header->num_profiles);
579 const PSICdMsa* cd_msa = pssm_input->GetData();
606 const CDense_seg& ds = (*hsp)->GetSegs().GetDenseg();
607 BOOST_REQUIRE_EQUAL(ds.
GetDim(), 2);
609 const vector<TSignedSeqPos>& starts = ds.
GetStarts();
610 const vector<TSeqPos>& lengths = ds.
GetLens();
614 seqdb.SeqidToOid(
subject, db_oid);
615 BOOST_REQUIRE(db_oid >= 0 && db_oid < kNumDomains);
623 obsr_start + obsr_offsets[db_oid];
627 obsr_offsets[db_oid + 1] - obsr_offsets[db_oid];
628 vector<CCddInputData::TObsr> obsr;
629 for (
int i=0;
i < obsr_size;
i+=2) {
633 for (
int j=0;j < num;j++) {
644 BOOST_REQUIRE(msa_index >= 0
645 && msa_index < (
int)sas->
Get().size());
656 while (k < q_index) {
662 if (s_index == kGap) {
666 for (
TSeqPos pos = 0; pos < lengths[
i]; pos++) {
672 }
else if (q_index == kGap) {
673 s_index += lengths[
i];
676 for (
TSeqPos pos = 0; pos < lengths[
i]; pos++) {
686 cd_msa->
msa[msa_index][q_index + pos].
data;
695 - (
Int4)obsr[s_index + pos]) < 2);
706 (
int)freqs[(s_index + pos)
712 k = q_index + lengths[
i];
718 }
catch (
const exception& e) {
719 cerr << e.what() << endl;
720 BOOST_REQUIRE(
false);
722 cerr <<
"Unknown exception" << endl;
723 BOOST_REQUIRE(
false);
737 BOOST_REQUIRE_EQUAL(pre_num_hits, 2);
744 BOOST_REQUIRE_EQUAL(post_num_hits, 1);
753 BOOST_REQUIRE_EQUAL(pre_num_hits, 2);
760 BOOST_REQUIRE_EQUAL(post_num_hits, 2);
773 const string seqalign(
"data/cdd-129295.asn");
774 const string rpsdb(
"data/deltatest");
776 unique_ptr<CObjectIStream>
in
791 memset((
void*) &request, 0,
sizeof(request));
797 const string kTitle(
"Test defline");
824 if((*iter)->IsTitle()) {
825 query_descr += (*iter)->GetTitle();
829 BOOST_REQUIRE_EQUAL(query_descr,
kTitle);
832 const size_t kNumElements =
839 BOOST_REQUIRE_EQUAL(kNumElements, wres_freqs.size());
844 BOOST_REQUIRE_EQUAL(kNumElements, freq_ratios.size());
849 BOOST_REQUIRE_EQUAL(seq.
length-2, obsr.size());
854 BOOST_REQUIRE_EQUAL(kNumElements, scores.size());
878 string rpsdb =
"data/deltatest";
879 const string kMatrix =
"BLOSUM62";
882 const int kGapStart = 2;
883 const int kGapLen = 160;
895 ids.push_back(query_id);
896 ids.push_back(subject_id);
901 starts.push_back(kGapStart);
902 starts.push_back(-1);
903 starts.push_back(kGapStart + kGapLen + 1);
904 starts.push_back(kGapStart + kGapLen + 1);
906 lens.push_back(kGapLen);
915 seq_align_set->
Set().push_back(seq_align);
929 seq_align_set, *opts,
936 unique_ptr< CNcbiMatrix<int> > pssm_scores(
943 (
size_t)pssm_scores->GetCols());
945 (
size_t)pssm_scores->GetRows());
947 BOOST_REQUIRE(kGapStart + kGapLen < pssm->GetPssm().GetNumColumns());
950 const int kResiduesUOJstar = 24;
955 for (
int i=kGapStart;
i < kGapStart + kGapLen;
i++) {
956 for (
int j = 0; j < kResiduesUOJstar; j++) {
959 if (j == kGapResidue) {
961 ss <<
"Position " <<
i <<
" residue "
970 pssm_input->GetQuery()[
i], j);
973 ss <<
"Position " <<
i <<
" residue "
978 BOOST_REQUIRE_MESSAGE (bl_score - (*pssm_scores)(j,
i) <= 1
979 && bl_score - (*pssm_scores)(j,
i) >= -1,
990 string rpsdb =
"data/deltatest";
991 const string kMatrix =
"BLOSUM62";
1010 seq_align_set, *opts,
1017 unique_ptr< CNcbiMatrix<int> > pssm_scores(
1025 (
size_t)pssm_scores->GetCols());
1027 (
size_t)pssm_scores->GetRows());
1030 const int kResiduesUOJstar = 24;
1035 for (
int i=0;
i < pssm->
GetPssm().GetNumColumns();
i++) {
1036 for (
int j = 0; j < kResiduesUOJstar; j++) {
1039 if (j == kGapResidue) {
1041 ss <<
"Position " <<
i <<
" residue "
1049 pssm_input->GetQuery()[
i], j);
1052 ss <<
"Position " <<
i <<
" residue "
1057 BOOST_REQUIRE_MESSAGE (bl_score - (*pssm_scores)(j,
i) <= 1
1058 && bl_score - (*pssm_scores)(j,
i) >= -1,
1075 int qfrom,
int sfrom,
int len)
1084 ids.push_back(query_id);
1085 ids.push_back(subject_id);
1088 starts.push_back(qfrom);
1089 starts.push_back(sfrom);
1090 lens.push_back(
len);
1107 seq_align_set->
Set().push_back(x_CreateAlignment(query_id, subject_id,
1110 seq_align_set->
Set().push_back(x_CreateAlignment(query_id, subject_id,
1112 return seq_align_set;
1122 seq_align_set->
Set().push_back(x_CreateAlignment(query_id, subject_id,
1125 seq_align_set->
Set().push_back(x_CreateAlignment(query_id, subject_id,
1127 return seq_align_set;
1134 const string rpsdb =
"data/deltatest";
1135 const string kMatrix =
"BLOSUM62";
1141 case eDuplicateOverlappingHit:
1142 seq_align_set = x_CreateDuplicateOverlappingHit(query_id);
1145 case eDuplicateNonOverlappingHit:
1146 seq_align_set = x_CreateDuplicateNonOverlappingHit(query_id);
1162 seq_align_set, *opts,
1165 pssm_input->x_ProcessAlignments(-1.0, 10.0);
1173 return input.m_MsaData.size();
1203 return input.m_Hits.size();
1208 input.x_RemoveMultipleCdHits();
1215 m_Query.resize(kQueryLength, 1);
1217 m_Dimensions.query_length = kQueryLength;
1218 m_Dimensions.num_seqs = 0;
1221 m_CdMsa.dimensions = &m_Dimensions;
1222 m_CdMsa.query = &m_Query[0];
1231 int freqs[] = {0, 2, 2, 1, 5, 0, 0, 0, 0, 0,
1232 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1233 0, 0, 0, 0, 0, 0, 0, 0};
1236 int obsr[] = {21, 34, 56, 21, 21, 21};
1240 for (
int i=0;
i < 28;
i++) {
1241 m_Freqs[
i] = (double)freqs[
i] / 10.0;
1244 BOOST_REQUIRE_CLOSE(sum, 1.0, 1e-5);
1250 m_MsaData.resize(kQueryLength, data);
1255 m_CdMsa.msa = &m_Msa[0];
1257 for (
int i=0;
i < kQueryLength;
i++) {
1258 m_Msa[0][
i].is_aligned =
true;
1259 m_Msa[0][
i].data = &m_MsaData[
i];
1260 m_Msa[0][
i].data->wfreqs = &m_Freqs[0];
1261 m_Msa[0][
i].data->iobsr = (double)obsr[
i] / 10.0;
1264 m_Dimensions.num_seqs = 1;
1269 ITERATE (vector<PSICdMsaCell*>, it, m_Msa) {
1279 int obsr[] = {22, 41, 76, 21, 200, 21};
1281 for (
int i=0;
i < kQueryLength;
i++) {
1283 m_Msa[1][
i].data = &m_MsaData[
i];
1284 m_Msa[1][
i].data->wfreqs = &m_Freqs[0];
1285 m_Msa[1][
i].data->iobsr = (double)obsr[
i] / 10.0;
1288 m_Dimensions.num_seqs = 2;
1289 m_CdMsa.msa = &m_Msa[0];
1308 BOOST_REQUIRE_EQUAL(internal_pssm->nrows, (
unsigned int)sbp->
alphabet_size);
1314 internal_pssm.
get()) == 0);
1323 BOOST_REQUIRE_EQUAL(internal_pssm->freq_ratios[
i][j] < 1e-5,
1324 seq_weights->std_prob[j] < 1e-5);
1334 internal_pssm.
get(), cd_msa->
query,
1335 sbp.
Get(), seq_weights->std_prob) == 0);
1346 if (j == kXResidue || j == kStarResidue) {
1351 double q_over_p_estimate = internal_pssm->freq_ratios[
i][j]
1352 / seq_weights->std_prob[j];
1356 BOOST_REQUIRE_EQUAL(q_over_p_estimate > 1e-5,
1357 internal_pssm->scaled_pssm[
i][j]
1361 BOOST_REQUIRE_EQUAL(q_over_p_estimate > 1.0,
1362 internal_pssm->scaled_pssm[
i][j] >= 0);
1365 BOOST_REQUIRE_EQUAL(q_over_p_estimate < 1.0
1366 && q_over_p_estimate > 1e-5,
1367 internal_pssm->scaled_pssm[
i][j] <= 0
1368 && internal_pssm->scaled_pssm[
i][j]
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
static const string kScale
static const int kAlphabetSize
The aligner internally works only with the ncbistdaa alphabet.
Definitions which are dependent on the NCBI C++ Object Manager.
Int2 PSIBlastOptionsNew(PSIBlastOptions **psi_options)
Initialize default options for PSI BLAST.
int _PSIConvertFreqRatiosToPSSM(_PSIInternalPssmData *internal_pssm, const Uint1 *query, const BlastScoreBlk *sbp, const double *std_probs)
Converts the PSSM's frequency ratios obtained in the previous stage to a PSSM of scores.
int _PSIComputeFreqRatiosFromCDs(const PSICdMsa *cd_msa, const _PSISequenceWeights *seq_weights, const BlastScoreBlk *sbp, Int4 pseudo_count, _PSIInternalPssmData *internal_pssm)
Main function to compute CD-based PSSM's frequency ratios.
_PSISequenceWeights * _PSISequenceWeightsNew(const PSIMsaDimensions *dimensions, const BlastScoreBlk *sbp)
Allocates and initializes the _PSISequenceWeights structure.
_PSIInternalPssmData * _PSIInternalPssmDataNew(Uint4 query_length, Uint4 alphabet_size)
Allocates a new _PSIInternalPssmData structure.
int _PSIComputeFrequenciesFromCDs(const PSICdMsa *cd_msa, BlastScoreBlk *sbp, const PSIBlastOptions *options, _PSISequenceWeights *seq_weights)
Main function to calculate CD weights and combine weighted residue counts from matched CDs.
Private interface for Position Iterated BLAST API, contains the PSSM generation engine.
#define BLAST_SCORE_MIN
minimum allowed score (for one letter comparison).
TSeqPos GetLength(void) const
Defines BLAST error codes (user errors included)
Wrapper class to manage the BlastRPSInfo structure, as currently there aren't any allocation or deall...
Wrapper class for BlastScoreBlk .
const CSeq_id & GetSeq_id(TDim row) const
void Validate(bool full_test=false) const
Wrapper class for PSIBlastOptions .
static unsigned char * x_GuardProteinQuery(const unsigned char *query, unsigned int query_length)
Accesses CPssmEngine private method.
Computes a PSSM as specified in PSI-BLAST.
void SetNamedScore(const string &id, int score)
static CTestObjMgr & Instance()
TSeqPos length
Length of the buffer above (not necessarily sequence length!)
CRef< objects::CPssmWithParameters > Run()
Runs the PSSM engine to compute the PSSM.
static CNcbiMatrix< int > * GetScores(const objects::CPssmWithParameters &pssm)
Returns matrix of BLASTAA_SIZE by query size (dimensions are opposite of what is stored in the BlastS...
#define BLASTAA_SIZE
Size of aminoacid alphabet.
BlastScoreBlk * Get() const
TAutoUint1Ptr data
Sequence data.
const Uint1 AMINOACID_TO_NCBISTDAA[]
Translates between ncbieaa and ncbistdaa.
static const int kAlphabetSize
void Reset(BlastScoreBlk *p=NULL)
Uint4 TFreqs
Type used for residue frequencies stored in CDD.
static const int kRpsScaleFactor
Scale of residue frequencies and number of independent observations stored in CDD.
AutoPtr< Uint1, CDeleter< Uint1 > > TAutoUint1Ptr
Declares TAutoUint1Ptr (for Uint1 arrays allocated with malloc/calloc)
Uint4 TObsr
Type used for number of independent observations stored in CDD.
SBlastSequence GetSequence(const objects::CSeq_loc &sl, EBlastEncoding encoding, objects::CScope *scope, objects::ENa_strand strand=objects::eNa_strand_plus, ESentinelType sentinel=eSentinels, std::string *warnings=NULL)
Retrieves a sequence using the object manager.
@ eBlastEncodingProtein
NCBIstdaa.
@ fDeltaBlast
Flags set for DELTA-BLAST.
unsigned int TSeqPos
Type for sequence locations and lengths.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
int TSignedSeqPos
Type for signed sequence position.
element_type * get(void) const
Get pointer.
@ eSerial_AsnText
ASN.1 text.
static CObjectIStream * Open(ESerialDataFormat format, CNcbiIstream &inStream, bool deleteInStream)
Create serial object reader and attach it to an input stream.
uint8_t Uint1
1-byte (8-bit) unsigned integer
int32_t Int4
4-byte (32-bit) signed integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
const TFreqRatios & GetFreqRatios(void) const
Get the FreqRatios member data.
const TNumIndeptObsr & GetNumIndeptObsr(void) const
Get the NumIndeptObsr member data.
TNumRows GetNumRows(void) const
Get the NumRows member data.
const TScores & GetScores(void) const
Get the Scores member data.
const TWeightedResFreqsPerPos & GetWeightedResFreqsPerPos(void) const
Get the WeightedResFreqsPerPos member data.
const TFinalData & GetFinalData(void) const
Get the FinalData member data.
TNumColumns GetNumColumns(void) const
Get the NumColumns member data.
list< double > TWeightedResFreqsPerPos
const TIntermediateData & GetIntermediateData(void) const
Get the IntermediateData member data.
list< double > TNumIndeptObsr
list< double > TFreqRatios
const TPssm & GetPssm(void) const
Get the Pssm member data.
Tdata & Set(void)
Assign a value to data member.
TLens & SetLens(void)
Assign a value to Lens data member.
const TStarts & GetStarts(void) const
Get the Starts member data.
void SetSegs(TSegs &value)
Assign a value to Segs data member.
const TLens & GetLens(void) const
Get the Lens member data.
void SetDim(TDim value)
Assign a value to Dim data member.
vector< TSignedSeqPos > TStarts
void SetDim(TDim value)
Assign a value to Dim data member.
vector< CRef< CSeq_id > > TIds
TDim GetDim(void) const
Get the Dim member data.
TStarts & SetStarts(void)
Assign a value to Starts data member.
void SetNumseg(TNumseg value)
Assign a value to Numseg data member.
TNumseg GetNumseg(void) const
Get the Numseg member data.
list< CRef< CSeq_align > > Tdata
TIds & SetIds(void)
Assign a value to Ids data member.
const Tdata & Get(void) const
Get the member data.
list< CRef< CSeqdesc > > Tdata
const Tdata & Get(void) const
Get the member data.
bool IsSetDescr(void) const
descriptors Check if a value has been assigned to Descr data member.
const TDescr & GetDescr(void) const
Get the Descr member data.
unsigned int
A callback function used to compare two keys in a database.
char GetResidue(unsigned int res)
Returns character representation of a residue from ncbistdaa.
Magic spell ;-) needed for some weird compilers... very empiric.
std::istream & in(std::istream &in_, double &x_)
Declarations of auxiliary functions/classes for PSI-BLAST.
BOOST_AUTO_TEST_SUITE(psiblast_iteration)
C++ API for the PSI-BLAST PSSM engine.
BlastScoreBlk * InitializeBlastScoreBlk(const unsigned char *query, Uint4 query_size)
Utilities to develop and debug unit tests that deal with PSSM computation.
static const string kSubjectId
Domain subject id used for tests (present in test CDD)
static const string kQueryId
Query id used for tests.
static void s_TestCreatePssmFromFreqs(const PSICdMsa *cd_msa, CBlastScoreBlk &sbp, const PSIBlastOptions *opts, AutoPtr< _PSISequenceWeights > &seq_weights)
BOOST_AUTO_TEST_CASE(TestCreatePssmFromSingleCd)
const SNCBIPackedScoreMatrix NCBISM_Blosum62
TNCBIScore NCBISM_GetScore(const SNCBIPackedScoreMatrix *sm, int aa1, int aa2)
Look up an entry in a packed score matrix.
Declares auxiliary classes to manage RPS-BLAST related C-structures.
Int2 alphabet_size
size of alphabet.
Options used in protein BLAST only (PSI, PHI, RPS and translated BLAST) Some of these possibly should...
Int4 pseudo_count
Pseudocount constant.
Data needed for PSSM computation stored in MSA cell for single column in CD aligned to a position in ...
double iobsr
Effective number of independent observations in a CD column.
double * wfreqs
Frequencies for each residue in CD column.
Alignment cell that represents one column of CD aligned to a position in the query.
Uint1 is_aligned
Does this cell represent column aligned to a CD.
PSICdMsaCellData * data
Data needed for PSSM computation.
Data structure representing multiple alignment of CDs and query sequence along with data needed for P...
PSIMsaDimensions * dimensions
Query length and number of aligned cds.
unsigned char * query
Query sequence as Ncbistdaa.
PSICdMsaCell ** msa
Multiple alignment of CDs.
Structure to allow requesting various diagnostics data to be collected by PSSM engine.
Boolean information_content
request information content
Boolean frequency_ratios
request frequency ratios
Boolean independent_observations
request number of independent observations
Boolean weighted_residue_frequencies
request observed weighted residue frequencies
Structure representing the dimensions of the multiple sequence alignment data structure.
Uint4 num_seqs
Number of distinct sequences aligned with the query (does not include the query)
Uint4 query_length
Length of the query.
Structure to store sequence data and its length for use in the CORE of BLAST (it's a malloc'ed array ...
Utility stuff for more convenient using of Boost.Test library.
static const string kTitle
CTraceGlyph inline method implementation.