79 if (blockstream.bad() || blockstream.fail())
81 "Cannot open RPS blockfile");
90 blockstream >> block_idx;
95 while (!blockstream.eof()) {
98 if (blockstream.eof()) {
101 blockstream >> block_idx;
102 blockstream >> start;
115 vector<SSegmentLoc>& blocklist,
133 for (
int i = 0;
i < rps_hits.
Size();
i++) {
139 int **pssm = profile_data.
GetPssm() + db_seq_offsets[db_seq];
140 int db_seq_length = db_seq_offsets[db_seq + 1] - db_seq_offsets[db_seq];
160 vector<SSegmentLoc>::iterator
161 itr = lower_bound(blocklist.begin(), blocklist.end(),
164 _ASSERT(itr != blocklist.end() &&
170 while (itr != blocklist.end() &&
172 itr->GetTo() < target.
GetFrom()) {
176 vector<SSegmentLoc>::iterator prev_itr(itr);
177 vector<SSegmentLoc>::iterator next_itr(itr);
178 if (itr != blocklist.begin()) {
186 while (itr != blocklist.end() && itr->seq_index == db_seq
187 && itr->GetFrom() < target.
GetTo()) {
189 const int kMaxFudge = 6;
190 TRange q_range, new_s_range;
196 TRange s_range(itr->range.IntersectionWith(target.
range));
197 _ASSERT(!s_range.
Empty() && itr->range.Contains(s_range));
199 int left_fudge, right_fudge;
207 if (itr == blocklist.begin() ||
208 prev_itr == blocklist.begin() ||
209 prev_itr->seq_index != db_seq) {
213 left_fudge = s_range.
GetFrom() -
214 prev_itr->GetTo() - last_fudge - 1;
215 left_fudge =
min(left_fudge, kMaxFudge);
221 if (itr == blocklist.end() ||
222 next_itr == blocklist.end() ||
223 next_itr->seq_index != db_seq) {
227 right_fudge = (next_itr->GetFrom() - s_range.
GetTo() - 1) / 2;
228 right_fudge =
min(right_fudge, kMaxFudge);
231 last_fudge = right_fudge;
241 if (prev_itr != itr) {
245 if (next_itr != blocklist.end()) {
259 printf(
"ignore aligning query %d %d-%d db %d block %d-%d\n",
267 q_range.
GetFrom() - left_fudge));
269 q_range.
GetTo() + right_fudge));
280 int tback_size = tback.size();
313 int last_tback = tback_size - 1;
314 int q_start = q_range.
GetFrom();
315 int q_stop = q_range.
GetTo();
316 int s_start = s_range.
GetFrom();
317 int s_stop = s_range.
GetTo();
319 for (
int k = 0; k < tback_size &&
330 else if (tback[k] != tback[k-1])
334 for (
int k = tback_size - 1; k >= 0 &&
343 if (k == tback_size - 1)
345 else if (tback[k] != tback[k+1])
351 q_range.
Set(q_start, q_stop);
352 s_range.
Set(s_start, s_stop);
361 TRange(first_tback, last_tback));
371 score, final_script));
390 "Alignment interrupted");
412 const vector<int>& indices,
415 _ASSERT(queries.size() == indices.size());
417 int num_queries = queries.size();
429 ->SetCompositionBasedStats(
false);
437 CLocalBlast blaster(query_factory, opts, search_database);
447 for (
int i = 0;
i < num_queries;
i++) {
463 const CScore& curr_score = **score_itr;
480 align_score, denseg));
486 "Alignment interrupted");
493 printf(
"RPS hits:\n");
494 for (
int i = 0;
i < rps_hits.
Size();
i++) {
496 printf(
"query %d %4d - %4d db %d %4d - %4d score %d\n",
515 if (rps_hits.
Empty()) {
523 for (
int i = 0;
i < rps_hits.
Size();
i++) {
532 for (j = 0; j <
i; j++) {
563 vector<TOffsetPair> sub_list(
568 for (j = 0; j < (
int)sub_list.size(); j += 2) {
571 int q = start_pair.first;
572 int s = start_pair.second;
574 _ASSERT(stop_pair.second - stop_pair.first ==
575 start_pair.second - start_pair.first);
578 for (
int k = 0; k < stop_pair.first - start_pair.first; k++) {
581 (1 - domain_res_freq_boost) *
585 matrix(q+k,
query.GetLetter(q+k)) += domain_res_freq_boost;
596 "Alignment interrupted");
619 for (
int j = 0; j <
query.GetLength(); j++) {
621 matrix(j, k) = (1 - local_res_freq_boost) *
624 matrix(j,
query.GetLetter(j)) += local_res_freq_boost;
630 "Alignment interrupted");
639 for (
int j = 0; j <
query.GetLength(); j++) {
641 matrix(j, k) = (1 - local_res_freq_boost) *
644 matrix(j,
query.GetLetter(j)) += local_res_freq_boost;
657 const pair<const CSeq_id*, int>&
b)
660 return a.first->CompareOrdered(*
b.first) > 0;
665 const vector<int>& indices,
673 _ASSERT(pre_queries.size() == indices.size());
680 vector< pair<const CSeq_id*, int> > queries;
681 queries.reserve(pre_queries.size());
682 for (
size_t i=0;
i < pre_queries.size();
i++) {
683 _ASSERT(pre_queries[
i].seqloc->GetId());
684 queries.push_back(make_pair(pre_queries[
i].seqloc->GetId(), indices[
i]));
695 " 4 archive format");
705 pair<const CSeq_id*, int> p((*it)->GetId(), -1);
706 vector< pair<const CSeq_id*, int> >::iterator id_itr
707 = lower_bound(queries.begin(), queries.end(), p,
711 if (id_itr != queries.end()
712 && id_itr->first->CompareOrdered(*p.first) == 0) {
726 pair<const CSeq_id*, int> p(itr->GetFirstId(), -1);
727 vector< pair<const CSeq_id*, int> >::iterator id_itr
728 = lower_bound(queries.begin(), queries.end(), p,
732 if (id_itr != queries.end()
733 && id_itr->first->CompareOrdered(*p.first) == 0) {
740 printf(
"Pre-computed RPS queries:\n");
741 for (
size_t i=0;
i < pre_queries.size();
i++) {
744 printf(
"query: %d\n", indices[
i]);
752 bool is_presearched =
false;
755 is_presearched =
true;
759 if (!is_presearched) {
766 is_presearched =
false;
786 if (!last_query_id || query_id.
CompareOrdered(*last_query_id) != 0) {
789 pair<const CSeq_id*, int> p(&query_id, -1);
790 vector< pair<const CSeq_id*, int> >::iterator id_itr
791 = lower_bound(queries.begin(), queries.end(), p,
796 if (id_itr == queries.end()
797 || id_itr->first->CompareOrdered(*p.first) != 0) {
802 query_idx = id_itr->second;
803 last_query_id = id_itr->first;
812 const CScore& curr_score = **score_itr;
830 " subject domain " + denseg.
GetIds()[1]->AsFastaString()
831 +
" does not exist in the domain database "
835 align_score, denseg));
837 is_presearched =
true;
840 if (!is_presearched) {
846 printf(
"Pre-computed RPS hits:\n");
849 printf(
"query %d %4d - %4d db %d %4d - %4d score %d\n",
867 const vector<int>& indices)
870 string blockfile = rps_db +
".blocks";
871 string freqfile = rps_db +
".freq";
873 if (rps_db.empty()) {
904 bool do_search =
false;
905 for (
size_t i=0;
i < indices.size();
i++) {
916 vector<int> indices_not_searched;
917 for (
size_t i=0;
i < queries.size();
i++) {
919 queries_not_searched.push_back(queries[
i]);
920 indices_not_searched.push_back(indices[
i]);
932 "Alignment interrupted");
935 vector<SSegmentLoc> blocklist;
944 profile_data.
Clear();
948 printf(
"\n\nBlock alignments with conflicts resolved:\n");
953 printf(
"query %d %4d - %4d db %d %4d - %4d score %d ",
980 profile_data.
Clear();
994 CHit *subhit = *subitr;
1001 printf(
"\n\nMatched block alignments:\n");
1005 CHit *subhit = *itr;
1006 printf(
"query %d %4d - %4d query %d %4d - %4d score %d\n",
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
static const int kAlphabetSize
The aligner internally works only with the ncbistdaa alphabet.
CLocalRange< TOffset > TRange
define for the fundamental building block of sequence ranges
pair< TOffset, TOffset > TOffsetPair
Basic type specifying a range on a sequence.
RPS BLAST structure definitions.
#define FREQ_RATIO_SCALE
header for RPS blast frequency ratios ('.freq') file
Declares the CBlastRPSOptionsHandle class.
BlastScoreBlk * BlastScoreBlkFree(BlastScoreBlk *sbp)
Deallocates BlastScoreBlk as well as all associated structures.
Blast_ResFreq * Blast_ResFreqFree(Blast_ResFreq *rfp)
Deallocates Blast_ResFreq and prob0 element.
Int2 Blast_ResFreqStdComp(const BlastScoreBlk *sbp, Blast_ResFreq *rfp)
Calculates residues frequencies given a standard distribution.
Blast_ResFreq * Blast_ResFreqNew(const BlastScoreBlk *sbp)
Allocates a new Blast_ResFreq structure and fills in the prob element based upon the contents of sbp.
BlastScoreBlk * BlastScoreBlkNew(Uint1 alphabet, Int4 number_of_contexts)
Allocates and initializes BlastScoreBlk.
@ eRPSBlast
protein-pssm (reverse-position-specific BLAST)
Defines BLAST error codes (user errors included)
Handle to the rpsblast options to the BLAST algorithm.
Interface for the traceback from blast hits.
int GetScore(TRange tback_range, TOffsetPair start_offsets, CSequence &seq1, int **seq2_pssm, int gap_open, int gap_extend)
Compute the score associated with (a portion of) an alignment Assumes that seq1 is a sequence and tha...
CEditScript MakeEditScript(TRange tback_range)
Return an edit script corresponding to a subset of the complete traceback available.
vector< TOffsetPair > ListMatchRegions(TOffsetPair start_offsets)
Compile a list of regions in the current edit script that contain substitutions.
An ordered collection of CHit objects.
int Size() const
Retrieve number of hits in list.
void SetKeepHit(int index, bool keep)
Set whether a hit in the hitlist will be scheduled for deletion.
void PurgeAllHits()
Delete all hits unconditionally.
bool Empty()
Determine whether a list contains no hits.
CHit * GetHit(int index)
Retrieve a hit from the hitlist.
void PurgeUnwantedHits()
Delete all hits scheduled to be deleted.
bool GetKeepHit(int index)
Determine whether a hit in the hitlist has been scheduled for deletion.
void SortByScore()
Sort the hits in the hitlist in order of decreasing score.
void MatchOverlappingSubHits(CHitList &matched_list)
For each pair of hits with the same sequence2, produce a list of hits between sequence1 of the first ...
void AddToHitList(CHit *hit)
Append a hit to the hitlist.
A generalized representation of a pairwise alignment.
TSubHit & GetSubHit()
Retrieve a list of subhits.
void ResolveSubHitConflicts(CSequence &seq1, int **seq2_pssm, CNWAligner::TScore gap_open, CNWAligner::TScore gap_extend)
If pairs of subhits have overlapping ranges, either delete one or change one so that the overlap is a...
void InsertSubHit(CHit *hit)
Add a hit to a CHit's list of subhits.
void GetRangeFromSeq2(TRange seq_range2, TRange &seq_range1, TRange &new_seq_range2, TRange &traceback_range)
Retrieve the seq1 range corresponding to a specified seq2 range.
void AddUpSubHits()
Sum the score of all subhits, and make the sequence ranges the union of the ranges of all subhits.
int m_Score
Score of alignment.
CEditScript & GetEditScript()
Retrieve the traceback associated with a CHit.
int m_SeqIndex1
Numerical identifier for first sequence in alignment.
int m_SeqIndex2
Numerical identifier for second sequence in alignment.
TRange m_SeqRange1
The range of offsets on the first sequence.
static const int kMinHitSize
Not always used, but useful to avoid extremely small hits.
TRange m_SeqRange2
The range of offsets on the second sequence.
bool HasSubHits()
Query if a CHit has a hierarchy of subhits available.
vector< CHit * > TSubHit
Hits can be grouped hierarchically.
Class to perform a BLAST search on local BLAST databases Note that PHI-BLAST can be run using this cl...
TScore GetEndGapExtendPenalty(void) const
Get gap extension penalty for end gaps in pairwise global alignment of profiles.
string GetRpsDb(void) const
Get RPS Blast data base name.
double GetLocalResFreqBoost(void) const
Get frequency boost for a letter that appears in query sequence in given position.
int GetDomainHitlistSize(void) const
Get hitlist size (per sequence) for domain searches.
CConstRef< objects::CBlast4_archive > GetDomainHits(void) const
Get pre-computed domain hits.
TScore GetGapExtendPenalty(void) const
Get gap extension penalty for middle gaps in pairwise global alignment of profiles.
bool CanGetDomainHits(void) const
Are pre-computed domain hits set.
TScore GetGapOpenPenalty(void) const
Get gap opening penalty for middle gaps in pairwise global alignment of profiles.
TScore GetEndGapOpenPenalty(void) const
Get gap opening penalty for end gaps in pairwise global alignment of profiles.
double GetRpsEvalue(void) const
Get e-value threshold for accepting RPS Blast hits.
@ eToPrototype
All cluster elements are aligned to cluster prototype.
bool GetVerbose(void) const
Get verbose mode.
double GetDomainResFreqBoost(void) const
Get boost for residue frequencies in conserved domains from RPS data base.
Simultaneously align multiple protein sequences.
vector< CSequence > m_AllQueryData
CMultiAlignerOptions::EInClustAlnMethod m_ClustAlnMethod
SProgress m_ProgressMonitor
vector< CRef< objects::CSeq_loc > > m_tQueries
void x_MakeClusterResidueFrequencies()
Compute profile residue frequencies for clusters.
void x_AssignRPSResFreqs(CHitList &rps_hits, CProfileData &profile_data)
@ eInterrupt
Alignment interrupted through callback function.
vector< CSequence > m_QueryData
vector< bool > m_IsDomainSearched
Marks sequences with pre-computed domain hits.
void x_AssignDefaultResFreqs()
vector< vector< TRange > > m_RPSLocs
void x_LoadBlockBoundaries(string blockfile, vector< SSegmentLoc > &blocklist)
Given an RPS blast database, load a list of block offsets for each database sequence.
void x_FindRPSHits(blast::TSeqLocVector &queries, const vector< int > &indices, CHitList &rps_hits)
CConstRef< CMultiAlignerOptions > m_Options
void x_SetDomainHits(const blast::TSeqLocVector &queruies, const vector< int > &indices, const objects::CBlast4_archive &archive)
Set pre-computed domain hits using BLAST archive format.
void x_FindDomainHits(blast::TSeqLocVector &queries, const vector< int > &indices)
Run RPS blast on selected input sequences and postprocess the results.
void x_RealignBlocks(CHitList &rps_hits, vector< SSegmentLoc > &blocklist, CProfileData &profile_data)
static const int kRpsScaleFactor
NCBI C++ Object Manager dependant implementation of IQueryFactory.
Represent databases of PSSM data and residue frequencies.
Int4 ** GetResFreqs() const
Assuming the database is a list of columns of profiles, frequencies, retrieve a list of all of the pr...
@ eGetResFreqs
Retrieve residue frequencies.
@ eGetPssm
Retrieve PSSMs.
void Clear()
Free previously loaded PSSM or profile data.
void Load(EMapChoice choice, string dbname, string resfreq_file="")
Load information from a given database.
Int4 * GetSeqOffsets() const
Retrieve a list of offsets where database sequences begin.
Int4 ** GetPssm() const
Assuming the database is a list of PSSM columns, retrieve a list of all of the PSSMs in the database ...
Search Results for All Queries.
bool SeqidToOid(const CSeq_id &seqid, int &oid) const
Translate a Seq-id to any matching OID.
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
Class for representing protein sequences.
Template class for iteration on objects of class C (non-modifiable version)
Interface for CMultiAligner.
void SetStartWg(TScore value)
TTranscript GetTranscript(bool reversed=true) const
void SetEndWs(TScore value)
virtual CNWAligner::TScore Run(void)
void SetEndWg(TScore value)
vector< ETranscriptSymbol > TTranscript
void SetSequences(const char *seq1, size_t len1, const char *seq2, size_t len2, bool verify=true)
void SetEndSpaceFree(bool Left1, bool Right1, bool Left2, bool Right2)
void SetStartWs(TScore value)
void SetEvalueThreshold(double eval)
Sets EvalueThreshold.
CRef< CSearchResultSet > Run()
Executes the search.
static CBlastOptionsHandle * Create(EProgram program, EAPILocality locality=CBlastOptions::eLocal)
Creates an options handle object configured with default options for the requested program,...
#define BLASTAA_SEQ_CODE
== Seq_code_ncbistdaa
void SetHitlistSize(int s)
Sets HitlistSize.
void SetFilterString(const char *f, bool clear=true)
Sets FilterString.
@ eBlastDbIsProtein
protein
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
int CompareOrdered(const CSeq_id &sid2) const
CConstBeginInfo ConstBegin(const C &obj)
Get starting point of non-modifiable object hierarchy.
TObjectType * GetNonNullPointer(void)
Get pointer value and throw a null pointer exception if pointer is null.
int32_t Int4
4-byte (32-bit) signed integer
position_type GetLength(void) const
bool IntersectingWith(const TThisType &r) const
TThisType & Set(position_type from, position_type to)
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define BEGIN_SCOPE(ns)
Define a new scope.
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
bool IsSeq_loc_list(void) const
Check if variant Seq_loc_list is selected.
const TRequest & GetRequest(void) const
Get the Request member data.
const TQueue_search & GetQueue_search(void) const
Get the variant data.
const TBioseq_set & GetBioseq_set(void) const
Get the variant data.
bool IsBioseq_set(void) const
Check if variant Bioseq_set is selected.
const TSeq_loc_list & GetSeq_loc_list(void) const
Get the variant data.
const TQueries & GetQueries(void) const
Get the Queries member data.
const TAlignments & GetAlignments(void) const
Get the Alignments member data.
const TResults & GetResults(void) const
Get the Results member data.
const TBody & GetBody(void) const
Get the Body member data.
void SetFrom(TFrom value)
Assign a value to From data member.
TTo GetTo(void) const
Get the To member data.
TFrom GetFrom(void) const
Get the From member data.
void SetTo(TTo value)
Assign a value to To data member.
const TStr & GetStr(void) const
Get the variant data.
const TDenseg & GetDenseg(void) const
Get the variant data.
vector< CRef< CScore > > TScore
TInt GetInt(void) const
Get the variant data.
const TValue & GetValue(void) const
Get the Value member data.
const TIds & GetIds(void) const
Get the Ids member data.
list< CRef< CSeq_align > > Tdata
const TScore & GetScore(void) const
Get the Score member data.
TReal GetReal(void) const
Get the variant data.
const TId & GetId(void) const
Get the Id member data.
const Tdata & Get(void) const
Get the member data.
const TSegs & GetSegs(void) const
Get the Segs member data.
unsigned int
A callback function used to compare two keys in a database.
Main class to perform a BLAST search on the local machine.
const TYPE & Get(const CNamedParameterList *param)
constexpr auto sort(_Init &&init)
NOTE: This file contains work in progress and the APIs are likely to change, please do not rely on th...
bool compare_seqids(const pair< const CSeq_id *, int > &a, const pair< const CSeq_id *, int > &b)
vector< SSeqLoc > TSeqLocVector
Vector of sequence locations.
Structure used for scoring calculations.
Stores the letter frequency of a sequence or database.
double * prob
letter probs, (possible) non-zero offset.