64 bool in_quotes =
false;
71 for (
unsigned int i = 0;
i < s.size(); ++
i) {
73 if ((c ==
' ' || c ==
'\t') && !in_quotes) {
85 if (c ==
'\'' || c ==
'"') {
87 if (c == quote_char) {
104 throw runtime_error(
"Unbalanced quotes (')");
110 vector<CContigAssembly::SAlignStats::STails>& tails)
121 tls.
right = seq_len - stop - 1;
124 tls.
left = seq_len - stop - 1;
126 tails.push_back(tls);
133 const string& param_string,
137 query_loc.
SetWhole().Assign(query_id);
139 subject_loc.
SetWhole().Assign(subject_id);
140 return Blastn(query_loc, subject_loc, param_string, scope);
146 const string& param_string,
149 SSeqLoc query_sl(query_loc, scope);
150 SSeqLoc subject_sl(subject_loc, scope);
158 for (
unsigned int i = 0;
i < args.size();
i += 2) {
159 const string& name = args[
i];
160 if (
i + 1 >= args.size()) {
161 throw runtime_error(
"no value given for " + name);
163 const string&
value = args[
i + 1];
166 }
else if (name ==
"-r") {
168 }
else if (name ==
"-q") {
170 }
else if (name ==
"-e") {
172 }
else if (name ==
"-Z") {
174 }
else if (name ==
"-F") {
176 }
else if (name ==
"-G") {
178 }
else if (name ==
"-E") {
181 throw runtime_error(
"invalid option: " + name);
191 vector<unsigned int>& plus_vec,
192 vector<unsigned int>& minus_vec)
195 *align_set.
Get().front()->GetSegs().GetDenseg().GetIds()[0];
197 *align_set.
Get().front()->GetSegs().GetDenseg().GetIds()[1];
203 plus_vec.resize(len0 + len1);
206 minus_vec.resize(len0 + len1);
210 const CDense_seg& ds = (*aln)->GetSegs().GetDenseg();
214 if (start0 == -1 || start1 == -1) {
219 TSeqPos diag = (start0 + seg_len - 1) + start1;
220 minus_vec[diag] += seg_len;
223 const CDense_seg& ds = (*aln)->GetSegs().GetDenseg();
227 if (start0 == -1 || start1 == -1) {
232 TSeqPos diag = start1 - start0 + len0 - 1 ;
233 plus_vec[diag] += seg_len;
243 vector<TRange>& max_range)
245 unsigned int running_sum = 0;
247 for (
i = 0;
i < window; ++
i) {
248 running_sum += vec[
i];
252 max_range.push_back(
TRange(window - 1, window - 1));
254 for (
i = window;
i < vec.size(); ++
i) {
255 running_sum -= vec[
i - window];
256 running_sum += vec[
i];
257 if (running_sum >=
max) {
258 if (running_sum >
max) {
262 if (max_range.size() && max_range.back().GetFrom() ==
i - 1) {
263 max_range.back().SetFrom(
i);
279 vector<unsigned int> plus_vec, minus_vec;
280 DiagCounts(align_set, scope, plus_vec, minus_vec);
282 unsigned int plus_count;
283 vector<TRange> plus_range;
286 unsigned int minus_count;
287 vector<TRange> minus_range;
291 vector<TRange>*
r =
NULL;
292 if (plus_count > minus_count) {
304 (
r->front().GetFrom() +
r->front().GetTo() + 1) / 2 -
window_size / 2;
314 unsigned int half_width,
CScope& scope)
345 if (diag <= seq0.size()) {
347 shift = seq0.size() - 1 - diag;
350 shift = diag - (seq0.size() - 1);
363 ds->
SetIds().push_back(cr_id0);
364 ds->
SetIds().push_back(cr_id1);
398 unsigned int slop,
CScope& scope)
404 if( (
stats.tails[0].right <= slop &&
stats.tails[1].left <= slop) ||
405 (
stats.tails[0].left <= slop &&
stats.tails[1].right <= slop) ) {
446 unsigned int slop,
CScope& scope)
462 unsigned int slop,
CScope& scope)
478 bool FirstContainsSecond;
479 bool SecondContainsFirst;
481 FirstContainsSecond = ((((long)
stats.tails[0].left -
stats.tails[1].left) >= -
int(slop)) &&
482 (((
long)
stats.tails[0].right -
stats.tails[1].right) >= -
int(slop)));
483 SecondContainsFirst = ((((long)
stats.tails[1].left -
stats.tails[0].left) >= -
int(slop)) &&
484 (((
long)
stats.tails[1].right -
stats.tails[0].right) >= -
int(slop)));
486 return (FirstContainsSecond | SecondContainsFirst);
501 Ident =
stats.pct_identity/100.0;
511 int Wg = -5, Wm = 1, Wms = -2, Ws = -2;
521 vector<int> scores(sz);
523 for (
unsigned int i = 0;
i < scores.size(); ++
i) {
527 previous_score = scores[
i - 1];
533 scores[
i] = previous_score + Ws;
535 scores[
i] = previous_score + Wg + Ws;
537 }
else if (res1 ==
'-') {
539 scores[
i] = previous_score + Ws;
541 scores[
i] = previous_score + Wg + Ws;
543 }
else if (res0 == res1) {
545 scores[
i] = previous_score + Wm;
548 scores[
i] = previous_score + Wms;
559 unsigned int right_end = 0;
560 unsigned int left_end;
561 for (
unsigned int i = 0;
i < scores.size(); ++
i) {
562 if (scores[
i] > max_score) {
563 max_score = scores[
i];
570 if (right_end == 0) {
577 for (
i = right_end - 1;
i > 0; --
i) {
578 if (scores[
i] == 0) {
585 if (scores[0] == 0) {
601 vector<CRef<CSeq_align> >
603 const string& blast_params,
double min_ident,
604 unsigned int max_end_slop,
CScope& scope,
606 const vector<unsigned int>& band_halfwidths,
607 unsigned int diag_finding_window,
608 unsigned int min_align_length,
611 if (min_ident > 1 || min_ident < 0) {
612 throw runtime_error(
"min_ident must be between zero and one (got "
623 *ostr <<
"Filtering on " << min_ident <<
"%, slop " << max_end_slop
624 <<
"bp, min length " << min_align_length <<
"bp"
625 <<
" and strands " << strandmap[strand0] <<
", " << strandmap[strand1] << endl;
629 alns =
Blastn(id0, id1, blast_params, scope);
631 catch (exception& e) {
633 *ostr <<
"blast failed:\n" << e.what() << endl;
635 return vector<CRef<CSeq_align> >();
638 vector<CRef<CSeq_align> > good_alns;
644 if (
IsDovetail((*aln)->GetSegs().GetDenseg(), max_end_slop, scope)
645 &&
FracIdent((*aln)->GetSegs().GetDenseg(), scope) >= min_ident
647 &&
x_DensegLength((*aln)->GetSegs().GetDenseg()) >= min_align_length ) {
649 good_alns.push_back(*aln);
652 if (!good_alns.empty()) {
654 *ostr <<
"Found "<< good_alns.size() <<
655 " acceptable dovetail alignment(s) by blast "
656 << blast_params << endl;
661 *ostr <<
"Found no acceptable dovetail alignments by blast "
662 << blast_params << endl;
664 if (alns->
Get().empty()) {
666 *ostr <<
"No alignments found by blast; "
667 "can't do banded alignment" << endl;
669 return vector<CRef<CSeq_align> >();
676 ITERATE(vector<unsigned int>, band_halfwidth, band_halfwidths) {
678 *ostr <<
"Trying banded global alignment with bandwidth = "
679 << 2 * *band_halfwidth + 1 << endl;
685 diag, *band_halfwidth, scope);
689 *ostr <<
"banded alignment failed:\n" << e.
what() << endl;
696 *ostr <<
"banded alignment failed: num segs == 0\n" << endl;
702 double frac_ident =
FracIdent(*local_ds, scope);
704 *ostr <<
"Fraction identity: " << frac_ident << endl;
706 if (
IsDovetail(*local_ds, max_end_slop, scope)
707 &&
FracIdent(*local_ds, scope) >= min_ident
711 *ostr <<
"Alignment acceptable (full dovetail)" << endl;
715 aln->
SetSegs().SetDenseg(*local_ds);
717 return vector<CRef<CSeq_align> >(1, aln);
722 *ostr <<
"No acceptable alignments from banded alignment algorithm"
731 &&
FracIdent((*aln)->GetSegs().GetDenseg(), scope) >= min_ident
733 &&
x_DensegLength((*aln)->GetSegs().GetDenseg()) >= min_align_length
736 good_alns.push_back(*aln);
740 *ostr <<
"Found " << good_alns.size() <<
741 " acceptable half-dovetail "
742 "or contained alignment(s) by blast" << endl;
744 if (!good_alns.empty()) {
751 &&
FracIdent(*local_ds, scope) >= min_ident
754 string dovetail_string;
756 dovetail_string =
"contained";
758 dovetail_string =
"half-dovetail";
761 *ostr <<
"Banded alignment acceptable ("
762 << dovetail_string <<
")" << endl;
766 aln->
SetSegs().SetDenseg(*local_ds);
768 return vector<CRef<CSeq_align> >(1, aln);
772 *ostr <<
"Banded alignment not an acceptable "
773 "half-dovetail or contained" << endl;
775 return vector<CRef<CSeq_align> >();
783 objects::CScope& scope)
793 _ASSERT(row1.size() == row2.size());
796 for (
unsigned int i = 0;
i < row1.size(); ++
i) {
797 if (row1[
i] !=
'N' && row2[
i] !=
'N') {
800 if (row1[
i] != row2[
i]) {
801 if (row1[
i] ==
'-') {
803 while (
i+1 < row1.size() && row1[
i+1] ==
'-') ++
i;
804 }
else if (row2[
i] ==
'-') {
806 while (
i+1 < row1.size() && row2[
i+1] ==
'-') ++
i;
829 align_stats.
gaps.clear();
832 vector<CRef<CSeq_loc> > dust_locs;
843 loc->SetPacked_int(*res);
844 dust_locs.push_back(loc);
857 unsigned int other_row = (j + 1) % 2;
867 gap_loc.
SetInt().SetId().Set(
"lcl|dummy");
879 }
else if (gap_simple == -1) {
911 SAlignStats& align_stats)
920 SAlignStats& align_stats)
933 if(
stats.tails[0].left <
stats.tails[1].left) {
944 bool matches[2] = {
false,
false};
957 if(!(matches[0] & matches[1])) {
964 return (matches[0] & matches[1]);
973 int Dim = ds.GetDim();
975 for(
unsigned int Seg = 0; Seg < Lens.size(); Seg++) {
977 if(Starts[(Seg*Dim)] == -1 || Starts[(Seg*Dim)+1] == -1)
991 for (
int i = 0;
i < vec.GetNumSegs(); ++
i) {
993 for (
int j = 0; j < vec.GetNumRows(); ++j) {
994 if (vec.GetStart(j,
i) == -1) {
1000 AlignedLength += vec.GetLen(
i);
1006 unsigned int identities = 0;
1007 for (
int i = 0;
i < vec.GetNumSegs(); ++
i) {
1009 vec.GetSegSeqString(s1, 0,
i);
1010 for (
int j = 1; j < vec.GetNumRows(); ++j) {
1012 vec.GetSegSeqString(s2, j,
i);
1014 for (
unsigned int k = 0; k <
min(s1.size(), s2.size()); ++k) {
1015 identities += (s1[k] == s2[k]);
1020 align_stats.
mismatches = AlignedLength - identities;
1022 100.0 * double(identities) / double(AlignedLength);
Declares the CBl2Seq (BLAST 2 Sequences) class.
Declares the CBlastNucleotideOptionsHandle class.
Declares the CBlastOptionsHandle and CBlastOptionsFactory classes.
Definitions of special type used in BLAST.
vector< CRef< objects::CSeq_align_set > > TSeqAlignVector
Vector of Seq-align-sets.
@ eBlastn
Nucl-Nucl (traditional blastn)
TSignedSeqPos GetStop(TNumrow row, TNumseg seg, int offset=0) const
TSignedSeqPos GetStart(TNumrow row, TNumseg seg, int offset=0) const
bool IsPositiveStrand(TNumrow row) const
TSignedSeqPos GetSeqPosFromAlnPos(TNumrow for_row, TSeqPos aln_pos, ESearchDirection dir=eNone, bool try_reverse_dir=true) const
TDim GetNumRows(void) const
TSeqPos GetAlnStop(TNumseg seg) const
TSeqPos GetLen(TNumseg seg, int offset=0) const
TSeqPos GetSeqStop(TNumrow row) const
TNumseg GetNumSegs(void) const
TSeqPos GetSeqStart(TNumrow row) const
const CBioseq_Handle & GetBioseqHandle(TNumrow row) const
void SetEndChar(TResidue gap_char)
void SetGapChar(TResidue gap_char)
CScope & GetScope(void) const
string & GetAlnSeqString(string &buffer, TNumrow row, const CAlnMap::TSignedRange &aln_rng) const
TResidue GetResidue(TNumrow row, TSeqPos aln_pos) const
Runs the BLAST algorithm between 2 sequences.
Handle to the nucleotide-nucleotide options to the BLAST algorithm.
unsigned int m_AdjustedLen
CAlnStats(unsigned int adjusted_len, unsigned int mm, unsigned int gaps)
static void FindMaxRange(const vector< unsigned int > &vec, unsigned int window, unsigned int &max, vector< TRange > &max_range)
static void x_GatherIdentStats(const objects::CAlnVec &vec, SAlignStats &align_stats)
static CRef< objects::CDense_seg > BestLocalSubAlignment(const objects::CDense_seg &ds_in, objects::CScope &scope)
Find the highest-scoring local subalignment.
static bool IsAtLeastHalfDovetail(const objects::CDense_seg &ds, unsigned int slop, objects::CScope &scope)
static CRef< objects::CSeq_align_set > Blastn(const objects::CSeq_id &query_id, const objects::CSeq_id &subject_id, const string ¶m_string, objects::CScope &scope)
Utility for running blastn.
static void GatherAlignStats(const objects::CAlnVec &vec, SAlignStats &align_stats)
static bool x_IsAllowedStrands(const objects::CDense_seg &ds, objects::ENa_strand strand0, objects::ENa_strand strand1)
static void FindDiagFromAlignSet(const objects::CSeq_align_set &align_set, objects::CScope &scope, unsigned int window_size, objects::ENa_strand &strand, unsigned int &diag)
Given a set of alignments, pick out a diagonal to use as the center of a band in a banded alignment.
CRange< unsigned int > TRange
Find the range (or more than one tied range) containing the maximal diagonal count,...
static void DiagCounts(const objects::CSeq_align_set &align_set, objects::CScope &scope, vector< unsigned int > &plus_vec, vector< unsigned int > &minus_vec)
Count the cells with "ink" along each diagonal in a dot-matrix-type plot of some set of alignments (e...
static vector< CRef< objects::CSeq_align > > Align(const objects::CSeq_id &id0, const objects::CSeq_id &id1, const string &blast_params, double min_ident, unsigned int max_end_slop, objects::CScope &scope, CNcbiOstream *ostr=0, const vector< unsigned int > &band_halfwidths=vector< unsigned int >(1, 200), unsigned int diag_finding_window=200, unsigned int min_align_length=50, objects::ENa_strand strand0=objects::eNa_strand_unknown, objects::ENa_strand strand1=objects::eNa_strand_unknown)
Most users of the class need only to call this function.
static CRef< objects::CDense_seg > BandedGlobalAlignment(const objects::CSeq_id &id0, const objects::CSeq_id &id1, objects::ENa_strand strand, unsigned int diag, unsigned int half_width, objects::CScope &scope)
Do a banded global alignment using an arbitrary band.
static void x_OrientAlign(objects::CDense_seg &ds, objects::CScope &scope)
static TSeqPos x_DensegLength(const objects::CDense_seg &ds)
static bool IsDovetail(const objects::CDense_seg &ds, unsigned int slop, objects::CScope &scope)
static bool IsContained(const objects::CDense_seg &ds, unsigned int slop, objects::CScope &scope)
static double FracIdent(const objects::CDense_seg &ds, objects::CScope &scope)
ENa_strand GetSeqStrand(TDim row) const
TSeqPos GetSeqStop(TDim row) const
void Reverse(void)
Reverse the segments' orientation.
TSeqPos GetSeqStart(TDim row) const
void FromTranscript(TSeqPos query_start, ENa_strand query_strand, TSeqPos subj_start, ENa_strand subj_strand, const string &transcript)
Initialize from pairwise alignment transcript (a string representation produced by CNWAligner)
CRef< CDense_seg > ExtractSlice(TDim row, TSeqPos from, TSeqPos to) const
Extract a slice of the alignment that includes the specified range.
Looks for low complexity parts of sequences according to the symmetric version of DUST algorithm.
CRef< objects::CPacked_seqint > GetMaskedInts(objects::CSeq_id &seq_id, const sequence_type &seq)
Mask a sequence and return result as a CPacked_seqint instance.
static void s_SplitCommandLine(string s, vector< string > &result)
static void s_GetTails(const CAlnVec &vec, vector< CContigAssembly::SAlignStats::STails > &tails)
void SetSpaceLimit(const size_t &maxmem)
void SetShift(Uint1 where, size_t offset)
string GetTranscriptString(void) const
void SetEndSpaceFree(bool Left1, bool Right1, bool Left2, bool Right2)
void SetEvalueThreshold(double eval)
Sets EvalueThreshold.
void SetMatchReward(int r)
Sets MatchReward.
void SetTraditionalBlastnDefaults()
Sets TraditionalBlastnDefaults.
void SetMismatchPenalty(int p)
Sets MismatchPenalty.
void SetGapXDropoffFinal(double x)
Sets GapXDropoffFinal.
void SetGapExtensionCost(int e)
Sets GapExtensionCost.
void SetFilterString(const char *f, bool clear=true)
Sets FilterString.
void SetWordSize(int ws)
Sets WordSize.
void SetGapOpeningCost(int g)
Sets GapOpeningCost.
unsigned int TSeqPos
Type for sequence locations and lengths.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
int TSignedSeqPos
Type for signed sequence position.
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
virtual const char * what(void) const noexcept
Standard report (includes full backlog).
string GetSeqIdString(bool with_version=false) const
Return seqid string with optional version for text seqid type.
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
sequence::ECompare Compare(const CSeq_loc &loc1, const CSeq_loc &loc2, CScope *scope)
Returns the sequence::ECompare containment relationship between CSeq_locs.
@ fCompareOverlapping
Check if seq-locs are overlapping.
@ eSame
CSeq_locs contain each other.
@ eContained
First CSeq_loc contained by second.
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
TSeqPos GetBioseqLength(void) const
CSeqVector GetSeqVector(EVectorCoding coding, ENa_strand strand=eNa_strand_plus) const
Get sequence: Iupacna or Iupacaa if use_iupac_coding is true.
@ eStrand_Plus
Plus strand.
@ eStrand_Minus
Minus strand.
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
void SetIupacCoding(void)
Set coding to either Iupacaa or Iupacna depending on molecule type.
uint8_t Uint1
1-byte (8-bit) unsigned integer
uint64_t Uint8
8-byte (64-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
static string DoubleToString(double value, int precision=-1, TNumToStringFlags flags=0)
Convert double to string.
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
static double StringToDouble(const CTempStringEx str, TStringToNumFlags flags=0)
Convert string to double.
const TDenseg & GetDenseg(void) const
Get the variant data.
Tdata & Set(void)
Assign a value to data member.
TLens & SetLens(void)
Assign a value to Lens data member.
const TStarts & GetStarts(void) const
Get the Starts member data.
void SetSegs(TSegs &value)
Assign a value to Segs data member.
const TLens & GetLens(void) const
Get the Lens member data.
vector< TSignedSeqPos > TStarts
void SetType(TType value)
Assign a value to Type data member.
TStarts & SetStarts(void)
Assign a value to Starts data member.
TStrands & SetStrands(void)
Assign a value to Strands data member.
void SetNumseg(TNumseg value)
Assign a value to Numseg data member.
const TIds & GetIds(void) const
Get the Ids member data.
bool CanGetStrands(void) const
Check if it is safe to call GetStrands method.
TNumseg GetNumseg(void) const
Get the Numseg member data.
list< CRef< CSeq_align > > Tdata
TIds & SetIds(void)
Assign a value to Ids data member.
const TStrands & GetStrands(void) const
Get the Strands member data.
const Tdata & Get(void) const
Get the member data.
const TSegs & GetSegs(void) const
Get the Segs member data.
@ eType_partial
mapping pieces together
ENa_strand
strand of nucleic acid
unsigned int
A callback function used to compare two keys in a database.
const struct ncbi::grid::netcache::search::fields::SIZE size
const GenericPointer< typename T::ValueType > T2 value
Uint8 GetPhysicalMemorySize(void)
Return the amount of physical memory available in the system.
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
#define row(bind, expected)
Alignment characterization.
vector< STails > tails
unaligned tails
double pct_identity
% identity (varies from 0 to 100)
TSeqPos mismatches
number of mismatched bases
TSeqPos total_length
total covered length of the alignment, including gaps
TSeqPos gap_count
count of total number of gaps
vector< bool > is_simple
for each gap, whether is consists of "simple sequence"
vector< TSeqPos > gaps
the set of gap lengths for this alignment
TSeqPos aligned_length
total number of bases included in the alignment
Structure to represent a single sequence to be fed to BLAST.