84 ts->originalFullSequence = pair->
GetMaster();
99 ts->fromIndex = uaBlocks.front()->GetRangeOfRow(0)->from - extension;
100 if (ts->fromIndex < 0)
102 ts->toIndex = uaBlocks.back()->GetRangeOfRow(0)->to + extension;
103 if (ts->toIndex >= (
int)ts->originalFullSequence->Length())
104 ts->toIndex = ts->originalFullSequence->Length() - 1;
110 ts->toIndex = ts->originalFullSequence->Length() - 1;
131 ts->toIndex = ts->originalFullSequence->Length() - 1;
137 CBioseq& bioseq = ts->truncatedSequence->SetSeq();
139 id->SetLocal().SetId(alnNum);
140 bioseq.
SetId().push_back(
id);
143 bioseq.
SetInst().SetLength(ts->toIndex - ts->fromIndex + 1);
144 TRACEMSG(
"truncated " << ts->originalFullSequence->identifier->ToString()
145 <<
" from " << (ts->fromIndex+1) <<
" to " << (ts->toIndex+1) <<
"; length " << bioseq.
GetInst().
GetLength());
146 bioseq.
SetInst().SetSeq_data().SetNcbistdaa().Set().resize(ts->toIndex - ts->fromIndex + 1);
147 for (
int j=ts->fromIndex; j<=ts->toIndex; ++j)
148 bioseq.
SetInst().SetSeq_data().SetNcbistdaa().Set()[j - ts->fromIndex] =
193 vector < int > masterLoc(length);
195 for (
i=0;
i<length; ++
i)
196 masterLoc[
i] = multiple->
GetPSSM().MapConsensusToMaster(consensusStart +
i);
199 for (
i=0;
i<length; ++
i) {
202 if (!subBlock && masterLoc[
i] >= 0) {
213 if (
i == length - 1 ||
214 masterLoc[
i + 1] < 0 ||
215 masterLoc[
i + 1] != masterLoc[
i] + 1)
233 ERRORMSG(
"MapBlockFromConsensusToMaster() - unterminated sub-block");
239 om->GetRegisteredNames(loader_names);
241 om->RevokeDataLoader(*itr);
266 newAlignments->clear();
268 ERRORMSG(
"usePSSM true, but NULL or zero-aligned block multiple alignment");
271 if (!usePSSM && toRealign.size() > 1) {
272 ERRORMSG(
"CreateNewPairwiseAlignmentsByBlast() - currently can only do single blast-2-sequences at a time");
275 if (toRealign.size() == 0)
283 WARNINGMSG(
"Can't get footprint residue extension from registry");
291 blast::CBlastQueryVector queryVector, subjectVector;
297 AlignmentList::const_iterator
a, ae = toRealign.end();
298 for (
a=toRealign.begin();
a!=ae; ++
a, ++localID) {
300 master = (*a)->GetMaster();
301 if ((*a)->GetMaster() != master) {
302 ERRORMSG(
"CreateNewPairwiseAlignmentsByBlast() - all masters must be the same");
305 if ((*a)->NRows() != 2) {
306 ERRORMSG(
"CreateNewPairwiseAlignmentsByBlast() - can only realign pairwise alignments");
312 subjectBioseq = &(subjectTSs.back()->truncatedSequence->SetSeq());
317 subjectVector.AddQuery(bsqSubject);
332 pssmOptions.
Reset(
new blast::CPSIBlastOptionsHandle);
335 pssmOptions->SetDbLength(1196146007);
336 pssmOptions->SetDbSeqNum(3479934);
337 pssmOptions->SetHitlistSize(subjectTSs.size());
338 pssmOptions->SetMatrixName(
"BLOSUM62");
340 pssmOptions->SetSegFiltering(
false);
342 blastEngine.
Reset(
new
357 queryBioseq = &(masterTS->truncatedSequence->SetSeq());
361 queryVector.AddQuery(bsqQuery);
363 sequenceQuery.
Reset(
new blast::CObjMgr_QueryFactory(queryVector));
365 sequenceOptions.
Reset(
new blast::CBlastProteinOptionsHandle);
366 sequenceOptions->SetMatrixName(
"BLOSUM62");
367 sequenceOptions->SetHitlistSize(subjectTSs.size());
368 blastEngine.
Reset(
new
379 if (
results->size() != toRealign.size())
381 ERRORMSG(
"CreateNewPairwiseAlignmentsByBlast() - did not get one result alignment per input sequence");
386 for (
unsigned int i=0;
i<
results->size(); ++
i, ++localID) {
394 (*seqs)[1] = subjectTSs[localID]->originalFullSequence;
395 string dependentTitle = subjectTSs[localID]->originalFullSequence->
identifier->
ToString();
396 unique_ptr < BlockMultipleAlignment > newAlignment(
402 if (!((*
results)[
i].HasAlignments())) {
403 WARNINGMSG(
"BLAST did not find a significant alignment for "
404 << dependentTitle <<
" with " << (usePSSM ?
string(
"PSSM") : master->
identifier->
ToString()));
408 const CSeq_align& sa = (*results)[
i].GetSeqAlign()->Get().front().GetObject();
411 ERRORMSG(
"CreateNewPairwiseAlignmentsByBlast() - returned alignment not in expected format (dim 2, partial)");
418 ERRORMSG(
"CreateNewPairwiseAlignmentsByBlast() - returned alignment format error (denseg dims)");
421 ERRORMSG(
"CreateNewPairwiseAlignmentsByBlast() - returned alignment format error (ids)");
425 CDense_seg::TStarts::const_iterator s = ds.
GetStarts().begin();
426 CDense_seg::TLens::const_iterator
l,
le = ds.
GetLens().end();
428 int masterStart = *(s++), dependentStart = *(s++);
429 if (masterStart >= 0 && dependentStart >= 0) {
430 dependentStart += subjectTSs[localID]->fromIndex;
435 masterStart += masterTS->fromIndex;
438 newBlock->
SetRangeOfRow(1, dependentStart, dependentStart + (*
l) - 1);
440 newAlignment->AddAlignedBlockAtEnd(newBlock);
447 ERRORMSG(
"CreateNewPairwiseAlignmentsByBlast() - returned alignment in unrecognized format");
452 WARNINGMSG(
"BLAST did not return an alignment score for " << dependentTitle);
455 oss <<
"BLAST result scores for " << dependentTitle <<
" vs. "
459 CSeq_align::TScore::const_iterator sc, sce = sa.
GetScore().end();
460 for (sc=sa.
GetScore().begin(); sc!=sce; ++sc) {
461 if ((*sc)->IsSetId() && (*sc)->GetId().IsStr()) {
464 if ((*sc)->GetValue().IsReal() && (*sc)->GetId().GetStr() ==
"e_value") {
466 newAlignment->SetRowDouble(0, (*sc)->GetValue().GetReal());
467 newAlignment->SetRowDouble(1, (*sc)->GetValue().GetReal());
469 newAlignment->SetRowStatusLine(0, status);
470 newAlignment->SetRowStatusLine(1, status);
471 oss <<
' ' << status;
475 else if ((*sc)->GetValue().IsInt() && (*sc)->GetId().GetStr() ==
"score") {
476 oss <<
" raw: " << (*sc)->GetValue().GetInt();
480 else if ((*sc)->GetValue().IsReal() && (*sc)->GetId().GetStr() ==
"bit_score") {
481 oss <<
" bit score: " << (*sc)->GetValue().GetReal();
488 WARNINGMSG(
"BLAST did not return an E-value for " << dependentTitle);
493 if (newAlignment->AddUnalignedBlocks() && newAlignment->UpdateBlockMapAndColors(
false))
494 newAlignments->push_back(newAlignment.release());
496 ERRORMSG(
"error finalizing alignment");
499 }
catch (exception& e) {
500 ERRORMSG(
"CreateNewPairwiseAlignmentsByBlast() failed with exception: " << e.what());
507 ERRORMSG(
"NULL multiple alignment");
513 WARNINGMSG(
"Can't get footprint residue extension from registry");
517 if (uaBlocks.size() == 0) {
518 ERRORMSG(
"Can't calculate self-hits with no aligned blocks");
534 range = uaBlocks.back()->GetRangeOfRow(
row);
538 rowPairs.push_back(newAlignment);
545 ERRORMSG(
"CalculateSelfHitScores() - CreateNewPairwiseAlignmentsByBlast() didn't return right # alignments");
550 AlignmentList::const_iterator
r =
results.begin();
552 double score = (*r)->GetRowDouble(1);
558 status =
"No detectable self hit";
564 static const double threshold = 0.01;
565 unsigned int nSelfHits = 0;
570 INFOMSG(
"Self hits with E-value <= " << setprecision(3) << threshold <<
": "
571 << (100.0*nSelfHits/multiple->
NRows()) <<
"% ("
572 << nSelfHits <<
'/' << multiple->
NRows() <<
')' << setprecision(6));
578 static CharDoubleMap standardProbabilities;
580 if (standardProbabilities.size() == 0) {
582 ERRORMSG(
"GetStandardProbability() - confused by BLASTAA_SIZE != 28");
586 for (
unsigned int i=0;
i<28; ++
i) {
593 CharDoubleMap::const_iterator
f = standardProbabilities.find(
toupper((
unsigned char) ch));
594 if (
f != standardProbabilities.end())
596 WARNINGMSG(
"GetStandardProbability() - unknown residue character " << ch);
User-defined methods of the data storage class.
#define sfree(x)
Safe free a pointer: belongs to a higher level header.
double * BLAST_GetStandardAaProbabilities(void)
Get the standard amino acid probabilities.
std::list< BlockMultipleAlignment * > AlignmentList
void CalculateSelfHitScores(const BlockMultipleAlignment *multiple)
void CreateNewPairwiseAlignmentsByBlast(const BlockMultipleAlignment *multiple, const AlignmentList &toRealign, AlignmentList *newAlignments, bool usePSSM)
std::vector< const Sequence * > SequenceList
void SetRowDouble(unsigned int row, double value) const
const BLAST_Matrix * GetPSSM(void) const
void SetRowStatusLine(unsigned int row, const std::string &value) const
const Sequence * GetMaster(void) const
std::vector< const UngappedAlignedBlock * > UngappedAlignedBlockList
const Sequence * GetSequenceOfRow(unsigned int row) const
void GetUngappedAlignedBlocks(UngappedAlignedBlockList *blocks) const
unsigned int NRows(void) const
bool HasNoAlignedBlocks(void) const
double GetRowDouble(unsigned int row) const
bool AddAlignedBlockAtEnd(UngappedAlignedBlock *newBlock)
void SetRangeOfRow(unsigned int row, int from, int to)
const Range * GetRangeOfRow(int row) const
const CSeq_id * GetFirstId() const
TSeqPos GetLength(void) const
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
std::string ToString(void) const
const MoleculeIdentifier * identifier
unsigned int Length(void) const
AlignmentManager * alignmentManager
const Sequence * originalFullSequence
CRef< CSeq_entry > truncatedSequence
double GetStandardProbability(char ch)
static void RemoveAllDataLoaders()
static CRef< TruncatedSequence > CreateTruncatedSequence(const BlockMultipleAlignment *multiple, const BlockMultipleAlignment *pair, int alnNum, bool isMaster, int extension)
static bool SeqIdMatchesMaster(const CSeq_id &sid, bool usePSSM)
static void MapBlockFromConsensusToMaster(int consensusStart, int dependentStart, int length, BlockMultipleAlignment *newAlignment, const BlockMultipleAlignment *multiple)
static bool SimpleSeqLocFromBioseq(const CRef< CBioseq > &bs, CSeq_loc &seqLoc)
vector< CRef< TruncatedSequence > > TruncatedSequences
static bool IsLocalID(const CSeq_id &sid, int localID)
@ eCompositionBasedStats
Composition-based statistics as in NAR 29:2994-3005, 2001.
Include a standard set of the NCBI C++ Toolkit most basic headers.
The NCBI C++ standard methods for dealing with std::string.
#define BLASTAA_SIZE
Size of aminoacid alphabet.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
CBioseq_Handle AddBioseq(CBioseq &bioseq, TPriority pri=kPriority_Default, EExist action=eExist_Throw)
Add bioseq, return bioseq handle.
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
vector< string > TRegisteredNames
void ResetDataAndHistory(void)
Clear all information in the scope except added data loaders.
TObjectType * GetPointer(void) THROWS_NONE
Get pointer,.
void Reset(void)
Reset reference object.
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_SCOPE(ns)
Define a new scope.
static string DoubleToString(double value, int precision=-1, TNumToStringFlags flags=0)
Convert double to string.
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
bool IsStr(void) const
Check if variant Str is selected.
bool IsId(void) const
Check if variant Id is selected.
const TStr & GetStr(void) const
Get the variant data.
TId GetId(void) const
Get the variant data.
const TDenseg & GetDenseg(void) const
Get the variant data.
bool IsSetDim(void) const
dimensionality Check if a value has been assigned to Dim data member.
const TStarts & GetStarts(void) const
Get the Starts member data.
TDim GetDim(void) const
Get the Dim member data.
const TLens & GetLens(void) const
Get the Lens member data.
TDim GetDim(void) const
Get the Dim member data.
TType GetType(void) const
Get the Type member data.
bool IsSetDim(void) const
dimensionality Check if a value has been assigned to Dim data member.
const TIds & GetIds(void) const
Get the Ids member data.
bool IsSetScore(void) const
for whole alignment Check if a value has been assigned to Score data member.
TNumseg GetNumseg(void) const
Get the Numseg member data.
const TScore & GetScore(void) const
Get the Score member data.
const TSegs & GetSegs(void) const
Get the Segs member data.
bool IsDenseg(void) const
Check if variant Denseg is selected.
@ eType_partial
mapping pieces together
void SetTo(TTo value)
Assign a value to To data member.
void SetId(TId &value)
Assign a value to Id data member.
void SetFrom(TFrom value)
Assign a value to From data member.
const TLocal & GetLocal(void) const
Get the variant data.
bool IsLocal(void) const
Check if variant Local is selected.
TId & SetId(void)
Assign a value to Id data member.
const TInst & GetInst(void) const
Get the Inst member data.
TLength GetLength(void) const
Get the Length member data.
void SetInst(TInst &value)
Assign a value to Inst data member.
@ eRepr_raw
continuous sequence
range(_Ty, _Ty) -> range< _Ty >
bool le(T x_, T y_, T round_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
NOTE: This file contains work in progress and the APIs are likely to change, please do not rely on th...
Declares CPsiBl2Seq, the C++ API for the PSI-BLAST 2 Sequences engine.
static SLJIT_INLINE sljit_ins l(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
CRef< objects::CObjectManager > om
#define row(bind, expected)
#define DELETE_ALL_AND_CLEAR(container, ContainerType)
unsigned char LookupNCBIStdaaNumberFromCharacter(char r)
char LookupCharacterFromNCBIStdaaNumber(unsigned char n)