64 if (!constraint.empty()) {
65 int last = constraint.size();
66 if (constraint[
last-4] >= (
size_t)seq1_start ||
67 constraint[
last-2] >= (
size_t)seq2_start) {
72 constraint.push_back(seq1_start);
73 constraint.push_back(seq1_start);
74 constraint.push_back(seq2_start);
75 constraint.push_back(seq2_start);
80 if (seq1_start >= seq1_stop ||
81 seq2_start >= seq2_stop)
84 constraint.push_back(seq1_stop);
85 constraint.push_back(seq1_stop);
86 constraint.push_back(seq2_stop);
87 constraint.push_back(seq2_stop);
122 vector<CSequence>& query_data,
123 vector<CTree::STreeLeaf>& node_list,
133 for (
size_t i = 0;
i < node_list.size();
i++) {
134 sum += node_list[
i].distance;
138 for (
size_t i = 0;
i < node_list.size();
i++) {
143 int index = node_list[
i].query_idx;
144 double weight = node_list[
i].distance / sum;
151 if (node_list[
i].distance == 0 && sum == 0)
154 weight = node_list[
i].distance / sum;
156 int start =
range.GetFrom() >= 0 ?
range.GetFrom() : 0;
158 : query_data[index].GetLength();
165 for (
int j = 0; j <
size; j++) {
167 freq_data[j][0] +=
weight;
171 freq_data[j][k] +=
weight * matrix(start + j, k);
187 for (
int i = 0;
i < freq_size;
i++) {
193 sum += freq_data[
i][j];
200 freq_data[
i][j] *= sum;
249 {
return first < p.first && second < p.second;}
261 vector<CMultiAligner::TRangePair>& prof_ranges)
277 _ASSERT(p.first >= 0 && p.second >= 0);
278 if (p.first < 0 || p.second < 0) {
283 r.first.SetFrom(p.first);
284 r.second.SetFrom(p.second);
288 while (p <
len && s < end_seq) {
292 while (p <
len && s < end_seq
298 if ((p.first >=
len.first && s.first < end_seq.first)
299 || (p.second >=
len.second && s.second < end_seq.second)) {
304 r.first.SetTo(p.first - 1);
305 r.second.SetTo(p.second - 1);
306 _ASSERT(
r.first.GetLength() ==
r.second.GetLength());
307 prof_ranges.push_back(
r);
310 while (p.first <
len.first
316 while (p.second <
len.second
323 r.first.SetFrom(p.first);
324 r.second.SetFrom(p.second);
343 for (
i = 0;
i <
len;
i++) {
383 vector<CSequence>& alignment,
384 vector<CTree::STreeLeaf>& node_list1,
385 vector<CTree::STreeLeaf>& node_list2,
389 const int kMinAlignLength = 11;
395 for (
int i = 0;
i < (
int)node_list1.size();
i++) {
397 for (
int j = 0; j < (
int)node_list2.size(); j++) {
399 int seq1 = node_list1[
i].query_idx;
400 int seq2 = node_list2[j].query_idx;
405 for (
int k = 0; k < pair_info(seq1, seq2).Size(); k++) {
406 CHit *hit = pair_info(seq1, seq2).GetHit(k);
407 CHit *new_hit =
new CHit(seq1, seq2);
427 CHit *subhit = *subitr;
428 CHit *new_subhit =
new CHit(seq1, seq2);
445 "Alignment interrupted");
450 if (profile_hitlist.
Empty())
458 for (
int i = 0;
i < profile_hitlist.
Size();
i++) {
464 CHit *subhit = *subitr;
472 "Alignment interrupted");
482 printf(
"possible constraints (offsets wrt input profiles):\n");
483 for (
int i = 0;
i < profile_hitlist.
Size();
i++) {
486 printf(
"query %3d %4d - %4d query %3d %4d - %4d score %d %c\n",
500 vector<SGraphNode> graph;
501 for (
int j = 0; j < profile_hitlist.
Size(); j++) {
510 if (iteration == 0 && hit->
m_Score < 1000 &&
519 for (
i = 0;
i < (
int)graph.size();
i++) {
520 CHit *ghit = graph[
i].hit;
560 if (
i == (
int)graph.size())
566 "Alignment interrupted");
577 if (iteration == 0) {
584 int num_hits = profile_hitlist.
Size();
585 vector<int> list1, list2;
586 list1.reserve(num_hits);
587 list2.reserve(num_hits);
591 for (
int i = 0;
i < (
int)graph.size();
i++) {
592 CHit *ghit = graph[
i].hit;
603 for (
int j = 0; j < num_hits; j++) {
625 for (k = 0; k < (
int)list1.size(); k++) {
629 if (k == (
int)list1.size())
632 for (k = 0; k < (
int)list2.size(); k++) {
636 if (k == (
int)list2.size())
645 graph[
i].best_score = graph[
i].hit->
m_Score *
646 list1.size() * list2.size();
650 "Alignment interrupted");
661 itr->best_score = itr->hit->m_Score;
668 while (best_path != 0) {
673 printf(
"pick query %3d %4d - %4d query %3d %4d - %4d\n",
701 vector<CTree::STreeLeaf>& node_list1,
702 vector<CTree::STreeLeaf>& node_list2,
704 vector<TRangePair>& match_ranges)
const
709 int seq1 = -1, seq2 = -1;
711 for (
size_t i=0;
i < node_list1.size();
i++) {
712 for (
size_t j=0;j < node_list2.size();j++) {
713 if (dist > dmat(node_list1[
i].query_idx, node_list2[j].query_idx)
716 seq1 = node_list1[
i].query_idx;
717 seq2 = node_list2[j].query_idx;
718 dist = dmat(seq1, seq2);
725 if (pair_info(seq1, seq2).Size() == 0) {
729 CHit* hit = pair_info(seq1, seq2).GetHit(0);
730 for (
int k=1;k < pair_info(seq1, seq2).Size();k++) {
731 if (pair_info(seq1, seq2).GetHit(k)->m_Score > hit->
m_Score) {
732 hit = pair_info(seq1, seq2).GetHit(k);
738 vector<TOffsetPair> seq_match_regions(
747 _ASSERT(seq_match_regions.size() >= 2);
748 for (
size_t i=0;
i < seq_match_regions.size();
i+=2) {
749 _ASSERT(
i < seq_match_regions.size() - 1);
751 == seq_match_regions[
i + 1].second - seq_match_regions[
i].second);
755 seq_match_regions[
i + 1].
first);
756 TRange seq_range2(seq_match_regions[
i].second,
757 seq_match_regions[
i + 1].second);
768 printf(
"possible constraints (offsets wrt input profiles):\n");
769 printf(
"query %3d %4d - %4d query %3d %4d - %4d score %d\n",
771 match_ranges.front().first.GetFrom(),
772 match_ranges.back().first.GetTo(),
774 match_ranges.front().second.GetFrom(),
775 match_ranges.back().second.GetTo(),
794 vector<CTree::STreeLeaf>& node_list1,
795 vector<CTree::STreeLeaf>& node_list2,
796 vector<CSequence>& alignment,
804 int freq1_size = alignment[node_list1[0].query_idx].GetLength();
805 int freq2_size = alignment[node_list2[0].query_idx].GetLength();
809 printf(
"\nalign profile (size %d) with profile (size %d)\n",
810 freq1_size, freq2_size);
817 freq1_data =
new double* [freq1_size];
820 for (
int i = 1;
i < freq1_size;
i++)
823 memset(freq1_data[0], 0,
kAlphabetSize * freq1_size *
sizeof(
double));
830 freq2_data =
new double* [freq2_size];
833 for (
int i = 1;
i < freq2_size;
i++)
836 memset(freq2_data[0], 0,
kAlphabetSize * freq2_size *
sizeof(
double));
843 (
const double**)freq2_data, freq2_size,
kScale);
846 vector<size_t> constraint;
851 node_list2, pair_info, iteration);
859 printf(
"constraints: ");
860 for (
int i = 0;
i < (
int)constraint.size();
i+=4) {
861 printf(
"(seq1 %d seq2 %d)->", (
int)constraint[
i],
862 (
int)constraint[
i+2]);
880 if (freq1_size > 1.2 * freq2_size ||
881 freq2_size > 1.2 * freq1_size) {
885 if ((freq1_size > 1.5 * freq2_size ||
886 freq2_size > 1.5 * freq1_size) &&
887 !constraint.empty()) {
901 delete [] freq1_data[0];
902 delete [] freq1_data;
903 delete [] freq2_data[0];
904 delete [] freq2_data;
910 for (
int i = 0;
i < (
int)node_list1.size();
i++) {
911 alignment[node_list1[
i].query_idx].PropagateGaps(
t,
914 for (
int i = 0;
i < (
int)node_list2.size();
i++) {
915 alignment[node_list2[
i].query_idx].PropagateGaps(
t,
922 for (
int i = 0;
i < (
int)
t.size() / 10;
i++)
923 printf(
"%10d",
i + 1);
925 for (
int i = 0;
i < (
int)
t.size();
i++)
926 printf(
"%d",
i % 10);
929 for (
int i = 0;
i < (
int)node_list1.size();
i++) {
930 int index = node_list1[
i].query_idx;
932 printf(
"%3d: ", index);
933 for (
int j = 0; j <
query.GetLength(); j++) {
934 printf(
"%c",
query.GetPrintableLetter(j));
939 for (
int i = 0;
i < (
int)node_list2.size();
i++) {
940 int index = node_list2[
i].query_idx;
942 printf(
"%3d: ", index);
943 for (
int j = 0; j <
query.GetLength(); j++) {
944 printf(
"%c",
query.GetPrintableLetter(j));
965 vector<CTree::STreeLeaf>& node_list1,
966 vector<CTree::STreeLeaf>& node_list2,
967 vector<CSequence>& alignment,
968 vector<size_t>& constraints,
970 int full_prof_len1,
int full_prof_len2,
983 freq1_data =
new double* [freq1_size];
986 for (
int i = 1;
i < freq1_size;
i++)
989 memset(freq1_data[0], 0,
kAlphabetSize * freq1_size *
sizeof(
double));
995 freq2_data =
new double* [freq2_size];
998 for (
int i = 1;
i < freq2_size;
i++) {
1002 memset(freq2_data[0], 0,
kAlphabetSize * freq2_size *
sizeof(
double));
1008 (
const double**)freq2_data, freq2_size,
kScale);
1025 if (full_prof_len1 > 1.2 * full_prof_len2 ||
1026 full_prof_len2 > 1.2 * full_prof_len1) {
1048 delete [] freq1_data[0];
1049 delete [] freq1_data;
1050 delete [] freq2_data[0];
1051 delete [] freq2_data;
1057 vector<CTree::STreeLeaf>& node_list1,
1058 vector<CTree::STreeLeaf>& node_list2,
1059 vector<CSequence>& alignment,
1070 printf(
"\nalign profile (size %d) with profile (size %d)\n",
1071 alignment[node_list1[0].query_idx].
GetLength(),
1072 alignment[node_list2[0].query_idx].
GetLength());
1078 vector<TRangePair> match_ranges;
1080 pair_info, match_ranges);
1083 if (match_ranges.empty()) {
1086 string message =
"No significant alignments were found for cluster"
1087 " containing sequences: ";
1088 ITERATE(vector<CTree::STreeLeaf>, it, node_list1) {
1092 ITERATE(vector<CTree::STreeLeaf>, it, node_list2) {
1095 message +=
". Decreasing maximum in-cluster distance or turing off"
1096 " clustering option may improve results.";
1104 ITERATE (vector<TRangePair>, rit, match_ranges) {
1105 printf(
"in-cluster constraints: "
1106 "(seq1 %d seq2 %d)->(seq1 %d seq2 %d)\n",
1107 rit->first.GetFrom(), rit->second.GetFrom(),
1108 rit->first.GetTo(), rit->second.GetTo());
1120 int prof1_length = alignment[node_list1.front().query_idx].GetLength();
1121 int prof2_length = alignment[node_list2.front().query_idx].GetLength();
1126 if (match_ranges.front().first.GetFrom() > 0
1127 && match_ranges.front().second.GetFrom() > 0) {
1129 TRange range1(0, match_ranges.front().first.GetFrom() - 1);
1130 TRange range2(0, match_ranges.front().second.GetFrom() - 1);
1144 vector<size_t> constr;
1147 constr, range1, range2,
1154 int len1 = match_ranges.front().first.GetFrom();
1155 int len2 = match_ranges.front().second.GetFrom();
1157 for (
int i=0;
i < len1;
i++) {
1160 for (
int i=0;
i < len2;
i++) {
1167 vector<TRangePair>::const_iterator it(match_ranges.begin());
1170 for (
int i=0;
i < it->first.GetLength();
i++) {
1174 vector<TRangePair>::const_iterator prev_it(it);
1178 for (; it != match_ranges.end(); ++it, ++prev_it) {
1179 _ASSERT(it->first.GetLength() == it->second.GetLength());
1182 TRange space1(prev_it->first.GetToOpen(), it->first.GetFrom() - 1);
1183 TRange space2(prev_it->second.GetToOpen(), it->second.GetFrom() - 1);
1187 if (space1.
Empty()) {
1194 else if (space2.
Empty()) {
1213 vector<size_t> constr;
1223 transcr.push_back(*
t);
1229 for (
int i=0;
i < it->first.GetLength();
i++) {
1236 if (prof1_length - match_ranges.back().first.GetTo() - 1 > 0
1237 && prof2_length - match_ranges.back().second.GetTo() - 1 > 0) {
1239 TRange seq1_range(match_ranges.back().first.GetTo() + 1,
1241 TRange seq2_range(match_ranges.back().second.GetTo() + 1,
1255 vector<size_t> constr;
1258 constr, seq1_range, seq2_range,
1259 prof1_length, prof2_length,
1263 transcr.push_back(*it);
1270 int len1 = prof1_length - match_ranges.back().first.GetTo() - 1;
1271 int len2 = prof2_length - match_ranges.back().second.GetTo() - 1;
1273 for (
int i=0;
i < len1;
i++) {
1276 for (
int i=0;
i < len2;
i++) {
1282 for (
int i=0;
i < (
int)node_list1.size();
i++) {
1283 alignment[node_list1[
i].query_idx].PropagateGaps(transcr,
1286 for (
int i=0;
i < (
int)node_list2.size();
i++) {
1287 alignment[node_list2[
i].query_idx].PropagateGaps(transcr,
1295 for (
int i = 0;
i < (
int)
t.size() / 10;
i++)
1296 printf(
"%10d",
i + 1);
1298 for (
int i = 0;
i < (
int)
t.size();
i++)
1299 printf(
"%d",
i % 10);
1302 for (
int i = 0;
i < (
int)node_list1.size();
i++) {
1303 int index = node_list1[
i].query_idx;
1305 printf(
"%3d: ", index);
1306 for (
int j = 0; j <
query.GetLength(); j++) {
1307 printf(
"%c",
query.GetPrintableLetter(j));
1312 for (
int i = 0;
i < (
int)node_list2.size();
i++) {
1313 int index = node_list2[
i].query_idx;
1315 printf(
"%3d: ", index);
1316 for (
int j = 0; j <
query.GetLength(); j++) {
1317 printf(
"%c",
query.GetPrintableLetter(j));
1335 0.000000, 0.019250, 0.073770, 0.052030, 0.228450,
1336 0.084020, 0.021990, 0.207660, 0.108730, 0.204100
1341 0, 7, 9, 1, 9, 9, 5, 2, 6, 4, 8, 4, 4,
1342 9, 3, 9, 8, 7, 7, 4, 5, 0, 5, 9, 0, 0
1355 int num_queries = align.size();
1357 double r = 1.0 / num_queries;
1362 for (
int j = 0; j < num_queries; j++)
1363 count[
kRes2Class[align[j].GetLetter(col)]]++;
1366 freq[j] = count[j] *
r;
1374 H1 += freq[j] *
log((num_queries - 1) / pseudocount +
1382 return H1 - H2 *
log((num_queries - 1) *
1383 (pseudocount + H3) / pseudocount);
1394 int align_len = align[0].GetLength();
1397 for (
int i = 0;
i < align_len;
i++)
1417 vector<CSequence>& alignment,
1426 vector<CTree::STreeLeaf> full_seq_list;
1431 vector<CTree::STreeLeaf> cluster_seq_list;
1436 printf(
"cluster: ");
1437 for (
int i = 0;
i < (
int)cluster_seq_list.size();
i++) {
1438 printf(
"%d ", cluster_seq_list[
i].query_idx);
1444 vector<int> cluster_idx;
1445 vector<int> other_idx;
1446 vector<CTree::STreeLeaf> other_seq_list;
1447 int cluster_size = cluster_seq_list.size();
1452 for (
int i = 0;
i < num_queries;
i++) {
1454 for (j = 0; j < cluster_size; j++) {
1455 if (full_seq_list[
i].query_idx ==
1456 cluster_seq_list[j].query_idx) {
1460 if (j == cluster_size) {
1461 other_seq_list.push_back(full_seq_list[
i]);
1462 other_idx.push_back(full_seq_list[
i].query_idx);
1465 for (
int i = 0;
i < cluster_size;
i++) {
1466 cluster_idx.push_back(cluster_seq_list[
i].query_idx);
1469 vector<CSequence> tmp_align = alignment;
1477 tmp_align, pair_info, iteration);
1484 printf(
"realigned score = %f\n\n", new_score);
1486 if (new_score > score) {
1488 alignment.swap(tmp_align);
1500 const vector<CSequence>& query_data,
1501 vector<TRange>& gaps)
1517 gaps.push_back(
range);
1536 vector<CSequence>& query_data,
1538 int iteration,
bool is_cluster)
1551 if (!left_child->
IsLeaf())
1553 pair_info, iteration, is_cluster);
1556 if (!right_child->
IsLeaf())
1558 pair_info, iteration, is_cluster);
1562 vector<CTree::STreeLeaf> node_list1, node_list2;
1566 right_child->
GetValue().GetDist());
1569 if (is_cluster && iteration == 0) {
1571 query_data, pair_info, iteration);
1575 query_data, pair_info, iteration);
1581 "Alignment interrupted");
1592 vector<TRange> gaps;
1601 vector<CSequence>& query_data,
1602 const vector<TRange>& gaps)
1615 vector<TRange>::const_iterator gap_it(gaps.begin());
1623 for (
int i=it->GetFrom();i < it->GetTo();
i++) {
1627 while (gap_it != gaps.end() && gap_it->GetFrom() <
i +
offset) {
1628 offset += gap_it->GetTo() - gap_it->GetFrom() + 1;
1643 for (
int j=0;j < (
int)matrix.
GetCols();j++) {
1648 matrix(
i +
offset, j) = rps_freqs(
i, j);
1657 -= domain_res_freq_boost;
1662 += domain_res_freq_boost;
1678 vector<CSequence>& new_alignment,
1681 int num_queries = new_alignment.size();
1682 int align_length = new_alignment[0].GetLength();
1683 vector<double> hvec(align_length);
1684 vector<TRange> chosen_cols;
1689 for (
i = 0;
i < align_length;
i++) {
1696 for (
i = 0;
i < align_length;
i++) {
1698 if (!chosen_cols.empty() &&
1699 chosen_cols.back().GetTo() ==
i-1) {
1700 chosen_cols.back().SetTo(
i);
1703 chosen_cols.push_back(
TRange(
i,
i));
1710 for (
i = j = 0; j < (
int)chosen_cols.size(); j++) {
1712 chosen_cols[
i++] = chosen_cols[j];
1715 chosen_cols.resize(
i);
1719 for (
i = 0;
i < (
int)chosen_cols.size();
i++) {
1720 printf(
"constraint at position %3d - %3d\n",
1721 chosen_cols[
i].GetFrom(), chosen_cols[
i].GetTo());
1726 vector<TRange> range_nogap(num_queries);
1728 for (
i = 0;
i < (
int)chosen_cols.size();
i++) {
1730 TRange& curr_range = chosen_cols[
i];
1731 int start = curr_range.
GetFrom();
1737 for (j = 0; j < num_queries; j++) {
1742 for (k = 0; k < length; k++) {
1747 range_nogap[j].SetEmpty();
1754 for (m = 0; m <= curr_range.
GetFrom(); m++) {
1759 int offset2 = offset1;
1760 for (; m <= curr_range.
GetTo(); m++) {
1764 range_nogap[j].Set(offset1, offset2);
1770 for (k = 0; k < num_queries - 1; k++) {
1771 for (m = k + 1; m < num_queries; m++) {
1772 if (!range_nogap[k].
Empty() && !range_nogap[m].Empty()) {
1774 range_nogap[m], 1000,
1783 printf(
"Per-column score\n");
1784 for (
i = 0;
i < align_length;
i++) {
1785 printf(
"%6.2f ", hvec[
i]);
1799 for (
int i = 0;
i < num_queries;
i++) {
1800 for (
int j = 0; j < num_queries; j++) {
1801 pair_info(
i, j).ResetList();
1813 vector<CTree::STreeEdge>& edges,
1814 double cluster_cutoff)
1819 int new_conserved_cols;
1820 double new_score = 0.0;
1853 pair_info, iteration,
false);
1859 double realign_score =
x_GetScore(tmp_aligned);
1861 printf(
"start score: %f\n", realign_score);
1868 for (
int i = 0;
i < num_tries;
i++) {
1869 new_score = realign_score;
1876 for (
int j = 0; j < (
int)edges.size() &&
1877 edges[j].distance >= cluster_cutoff; j++) {
1889 if (new_score - realign_score <= 0.02 *
fabs(realign_score)) {
1892 realign_score = new_score;
1895 realign_score =
max(realign_score, new_score);
1900 for (
int i = 0;
i < num_queries;
i++) {
1902 printf(
"%c", tmp_aligned[
i].GetPrintableLetter(j));
1906 printf(
"score = %f\n", realign_score);
1910 if (realign_score > best_score) {
1915 printf(
"REPLACE ALIGNMENT\n\n");
1917 best_score = realign_score;
1927 && iteration >= 1) {
1935 for (
int i = 0;
i < num_queries;
i++) {
1936 for (
int j = 0; j < num_queries; j++) {
1937 pair_info(
i, j).ResetList();
1948 new_conserved_cols = 0;
1961 for (
int i = 0;
i < conserved_regions.
Size();
i++) {
1971 if (new_conserved_cols <= conserved_cols)
1975 conserved_cols = new_conserved_cols;
1981 pair_info, iteration,
false);
1986 catch (std::bad_alloc ex) {
2004 return a.distance >
b.distance;
2021 vector<CTree::STreeEdge> edges;
2024 int num_edges = edges.size();
2030 int num_internal_edges =
min(11, (
int)(0.3 * num_edges + 0.5));
2039 for (
i = 0;
i < num_internal_edges - 1;
i++) {
2040 if (edges[
i].distance > 2 * edges[
i+1].distance) {
2041 cluster_cutoff = edges[
i].distance;
2045 if (
i == num_internal_edges - 1) {
2046 cluster_cutoff = edges[
i].distance;
2051 for (
i = 0;
i < num_edges;
i++)
2052 printf(
"%f ", edges[
i].distance);
2053 printf(
"cutoff = %f\n", cluster_cutoff);
static const string kScale
static const int kAlphabetSize
The aligner internally works only with the ncbistdaa alphabet.
CLocalRange< TOffset > TRange
define for the fundamental building block of sequence ranges
pair< TOffset, TOffset > TOffsetPair
Basic type specifying a range on a sequence.
int GetPrototype(void) const
Get cluster prototype.
const TDistMatrix & GetDistMatrix(void) const
Get distance matrix.
const TClusters & GetClusters(void) const
Get clusters.
Interface for the traceback from blast hits.
vector< TOffsetPair > ListMatchRegions(TOffsetPair start_offsets)
Compile a list of regions in the current edit script that contain substitutions.
An ordered collection of CHit objects.
int Size() const
Retrieve number of hits in list.
void PurgeAllHits()
Delete all hits unconditionally.
bool Empty()
Determine whether a list contains no hits.
CHit * GetHit(int index)
Retrieve a hit from the hitlist.
void SortByStartOffset()
Sort the hits in the hitlist by increasing sequence1 index, then by increasing sequence1 start offset...
void AddToHitList(CHit *hit)
Append a hit to the hitlist.
A generalized representation of a pairwise alignment.
TSubHit & GetSubHit()
Retrieve a list of subhits.
void InsertSubHit(CHit *hit)
Add a to a CHit's list of subhits.
int m_Score
Score of alignment.
CEditScript & GetEditScript()
Retrieve the traceback associated with a CHit.
int m_SeqIndex1
Numerical identifier for first sequence in alignment.
int m_SeqIndex2
Numerical identifier for second sequence in alignment.
TRange m_SeqRange1
The range of offsets on the first sequence.
TRange m_SeqRange2
The range of offsets on the second sequence.
bool HasSubHits()
Query if a CHit has a hierarchy of subhits available.
vector< CHit * > TSubHit
Hits can be grouped hierarchically.
A pair of integers, defined to privide increment and less-then operators.
bool operator<(const CIntPair &p) const
True if each element of this is smaller than the corresponding element of p.
CIntPair & operator++(void)
Increment both elements.
CIntPair operator++(int)
Increment both elements.
bool StrictlyBelow(const TThisType &r)
Test whether 'this' represents a sequence range strictly less than a given sequence range.
bool GetFastAlign(void) const
Check if fast alignment is to be used.
TScore GetEndGapExtendPenalty(void) const
Get gap extension penalty for end gaps in pairwise global alignment of profiles.
double GetConservedCutoffScore(void) const
Get cutoff score for conserved aligned columns.
bool GetIterate(void) const
Check if iterative alignmnet option is used.
double GetPseudocount(void) const
Get pseudocount for calculating column entropy.
TScore GetGapExtendPenalty(void) const
Get gap extension penlaty for middle gaps in pairwise global alignment of profiles.
TScore GetGapOpenPenalty(void) const
Get gap opening penalty for middle gaps in pairwise global alignment of profiles.
TScore GetEndGapOpenPenalty(void) const
Get gap opening penalty for end gaps in pairwise global alignment of profiles.
bool GetRealign(void) const
Check if MSA is to be realigned for different rooting of progressive alignment tree.
@ eMulti
Alignment guide tree for each cluster is attached to the main alignment guide tree.
bool GetVerbose(void) const
Get verbose mode.
double GetDomainResFreqBoost(void) const
Get boost for residue frequencies in conserved domains from RPS data base.
EEndGapCostStrategy
Strategy for reducing end gap penalties for profile-profile alignment.
@ fReduceLeft
Reduce penalty only for left end gaps.
@ fReduceBoth
Reduce penalty for both end gaps.
@ fReduceRight
Reduce penalty only for right end gaps.
CMultiAlignerOptions::EInClustAlnMethod m_ClustAlnMethod
SProgress m_ProgressMonitor
SGraphNode * x_FindBestPath(vector< SGraphNode > &nodes)
Find a maximum weight path in a directed acyclic graph.
void x_AlignProfileProfileUsingHit(vector< CTree::STreeLeaf > &node_list1, vector< CTree::STreeLeaf > &node_list2, vector< CSequence > &alignment, CNcbiMatrix< CHitList > &pair_info, int iteration)
Align two profiles with all sequences that belong to the same cluster.
void x_AddRpsFreqsToCluster(const CClusterer::CSingleCluster &cluster, vector< CSequence > &query_data, const vector< TRange > &gaps)
void x_BuildAlignmentIterative(vector< CTree::STreeEdge > &edges, double cluster_cutoff)
Main driver for the progressive alignment process.
CHitList m_LocalInClusterHits
void x_FindConservedColumns(vector< CSequence > &new_alignment, CHitList &conserved)
Create a list of constraints that reflect conserved columns in a multiple alignment.
const TPhyTreeNode * GetTree(void) const
Get ree used guide in progressive alignment.
@ eOutOfMemory
Out of memory error.
@ eInterrupt
Alignment interruped through callback function.
vector< CSequence > m_QueryData
void x_FindInClusterConstraints(vector< CSequence > &alignment, vector< CTree::STreeLeaf > &node_list1, vector< CTree::STreeLeaf > &node_list2, CNcbiMatrix< CHitList > &pair_info, vector< TRangePair > &match_ranges) const
Find constraint to use for profile to profile alignment in clusters.
vector< vector< TRange > > m_RPSLocs
void x_ComputeProfileRangeAlignment(vector< CTree::STreeLeaf > &node_list1, vector< CTree::STreeLeaf > &node_list2, vector< CSequence > &alignment, vector< size_t > &constraints, const TRange &range1, const TRange &range2, int full_prof_len1, int full_prof_len2, EEndGapCostStrategy strat, CNWAligner::TTranscript &t)
Compute profile profile alignmnet for a ranges of given profiles.
vector< CSequence > m_Results
CConstRef< CMultiAlignerOptions > m_Options
void x_AlignProfileProfile(vector< CTree::STreeLeaf > &node_list1, vector< CTree::STreeLeaf > &node_list2, vector< CSequence > &alignment, CNcbiMatrix< CHitList > &pair_info, int iteration)
Align two collections of sequences.
double x_GetScoreOneCol(vector< CSequence > &align, int col)
Calculate the entropy score of one column of a multiple alignment (see the COBALT papaer for details)
vector< string > m_Messages
void x_AlignProgressive(const TPhyTreeNode *tree, vector< CSequence > &query_data, CNcbiMatrix< CHitList > &pair_info, int iteration, bool is_cluster)
Main driver for progressive alignment.
int m_Score
Alignment score.
void x_BuildAlignment()
Given the current domain, local, pattern and user hits, along with the current tree,...
void x_FindConstraints(vector< size_t > &constraint, vector< CSequence > &alignment, vector< CTree::STreeLeaf > &node_list1, vector< CTree::STreeLeaf > &node_list2, CNcbiMatrix< CHitList > &pair_info, int iteration)
Find the set of constraints to use for a profile-profile alignment.
pair< TRange, TRange > TRangePair
double x_GetScore(vector< CSequence > &align)
Compute the entropy score of a multiple alignment.
static const int kRpsScaleFactor
static const int kClusterNodeId
double x_RealignSequences(const TPhyTreeNode *input_cluster, vector< CSequence > &alignment, CNcbiMatrix< CHitList > &pair_info, double score, int iteration)
Perform a single bipartition on a multiple alignment.
size_t GetRows() const
get the number of rows in this matrix
size_t GetCols() const
get the number of columns in this matrix
Class for representing protein sequences.
int GetLength() const
Get the length of the current sequence.
unsigned char GetLetter(int pos) const
Access the sequence letter at a specified position.
TFreqMatrix & GetFreqs()
Access the list of position frequencies associated with a sequence.
static void CompressSequences(vector< CSequence > &seq, vector< int > index_list)
Given a collection of sequences, remove all sequence positions where a subset of the sequences all co...
static const unsigned char kGapChar
The ncbistdaa code for a gap.
definition of a Culling tree
static void ListTreeEdges(const TPhyTreeNode *node, vector< STreeEdge > &edge_list, int max_id=-1)
Traverse a tree below a given starting point, listing all edges encountered along the way.
static void ListTreeLeaves(const TPhyTreeNode *node, vector< STreeLeaf > &node_list, double curr_dist=0)
Traverse a tree below a given starting point, listing all leaves encountered along the way.
Callback for sorting tree edges in order of decreasing edge weight.
bool operator()(const CTree::STreeEdge &a, const CTree::STreeEdge &b) const
Interface for CMultiAligner.
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
bool Empty(const CNcbiOstrstream &src)
void SetStartWg(TScore value)
TTranscript GetTranscript(bool reversed=true) const
void SetEndWs(TScore value)
virtual CNWAligner::TScore Run(void)
void SetEndWg(TScore value)
vector< ETranscriptSymbol > TTranscript
void SetSequences(const char *seq1, size_t len1, const char *seq2, size_t len2, bool verify=true)
void SetPattern(const vector< size_t > &pattern)
void SetEndSpaceFree(bool Left1, bool Right1, bool Left2, bool Right2)
void SetStartWs(TScore value)
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
position_type GetLength(void) const
bool NotEmpty(void) const
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define BEGIN_SCOPE(ns)
Define a new scope.
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
TNodeList::const_iterator TNodeList_CI
bool IsLeaf() const
Report whether this is a leaf node.
const TValue & GetValue(void) const
Return node's value.
TTo GetTo(void) const
Get the To member data.
TFrom GetFrom(void) const
Get the From member data.
unsigned int
A callback function used to compare two keys in a database.
range(_Ty, _Ty) -> range< _Ty >
constexpr auto sort(_Init &&init)
const struct ncbi::grid::netcache::search::fields::SIZE size
#define INT4_MAX
largest nubmer represented by signed int
#define INT4_MIN
Smallest (most negative) number represented by signed int.
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
double f(double x_, const double &y_)
static const int kNumClasses
Number of 'letters' in the reduced alphabet.
static void x_NormalizeResidueFrequencies(double **freq_data, int freq_size)
Normalize the residue frequencies so that each column sums to 1.0.
static void x_HitToConstraints(vector< size_t > &constraint, CHit *hit)
Convert a constraint into a form usable by CPSSMAligner.
static void x_AddConstraint(vector< size_t > &constraint, CHit *hit)
Convert one hit into a constraint usable by CPSSMAligner.
static void x_ExpandRange(TRange &range, CSequence &seq)
Convert a sequence range to reflect gaps added to the underlying sequence.
static void s_CleanUpConstraints(CNcbiMatrix< CHitList > &pair_info, int num_queries)
static const int kRes2Class[kAlphabetSize]
Mapping from protein letters to reduced alphabet letters.
static void x_GetProfileMatchRanges(const TRange &seq_range1, const TRange seq_range2, CSequence &seq1, CSequence &seq2, vector< CMultiAligner::TRangePair > &prof_ranges)
Translate ungapped alignment range for pair-wise sequence alignment in input sequence coordinates int...
static void x_FillResidueFrequencies(double **freq_data, vector< CSequence > &query_data, vector< CTree::STreeLeaf > &node_list, TRange range=TRange(-1, -1))
Compute the residue frequencies for a specified range of a collection of sequences.
static const double kDerivedFreqs[kNumClasses]
Background frequencies of the letters in the reduced alphabet (each is the sum of the Robinson-Robins...
static int s_SeqToProfilePosition(int seq_pos, CSequence &seq)
Find a profile position for a input sequence position.
static void x_GetClusterGapLocations(const CClusterer::CSingleCluster &cluster, const vector< CSequence > &query_data, vector< TRange > &gaps)
Find locations of gaps in cluster prototype.
struct SGraphNode * path_next
the optimal path node belongs to
CHit * hit
the alignment represented by this node
Structure for listing tree edges.