50 typedef list<pair<int, string>>
TMDTag;
51 typedef list<pair<int, char>>
TCigar;
77 while(!
m_md.empty() &&
m_md.front().first < 0)
83 if(
m_cigar.front().second ==
'D')
94 while(!
m_md.empty() &&
m_md.back().first < 0)
100 if(
m_cigar.back().second ==
'D')
132 Exon GetNextExon(
string& cigar_string,
string& MD_tag,
int qfrom,
int sfrom);
169 while(!cigar_string.empty()) {
172 char c = cigar_string[
letter];
175 if(c !=
'S' && c !=
'N')
179 cigar_string.erase(0,
letter+1);
186 }
else if(c ==
'N') {
194 }
else if(c ==
'M' || c ==
'=' || c ==
'X') {
202 int m = stoi(MD_tag, &idx);
203 if(e.
m_md.empty() || e.
m_md.back().first <= 0)
207 MD_tag.erase(0, idx);
209 MD_tag = to_string(m-
len)+MD_tag;
219 if(e.
m_md.empty() || e.
m_md.back().first != 0)
220 e.
m_md.emplace_back(0, MD_tag.substr(0, 1));
222 e.
m_md.back().second += MD_tag.front();
229 }
else if(c ==
'I') {
235 }
else if(c ==
'D') {
241 if(MD_tag.front() ==
'^') {
242 del = MD_tag.substr(1,
len);
243 MD_tag.erase(0,
len+1);
245 del = MD_tag.substr(0,
len);
246 MD_tag.erase(0,
len);
248 if(e.
m_md.empty() || e.
m_md.back().first >= 0)
249 e.
m_md.emplace_back(-1, del);
251 e.
m_md.back().second += del;
254 cerr <<
"Unexpected symbol in CIGAR" << endl;
263 int length = seq.size();
272 case 'A': tA += 1;
break;
273 case 'C': tC += 1;
break;
274 case 'G': tG += 1;
break;
275 case 'T': tT += 1;
break;
279 double entropy = -(tA*
log(tA/length)+tC*
log(tC/length)+tG*
log(tG/length)+tT*
log(tT/length))/(length*
log(4.));
288 for(
int i = 11;
i < (
int)tags.size() && (MD_tag.empty() || tstrand.empty()); ++
i) {
290 vector<string> fields;
292 if(fields.size() != 3)
294 if(fields[0] ==
"MD" && fields[1] ==
"Z")
296 if((fields[0] ==
"ts" || fields[0] ==
"TS" || fields[0] ==
"XS") && fields[1] ==
"A")
300 cerr <<
"MD:Z tag in SAM file is expected" << endl;
305 string cigar_string = tags[5];
307 int sfrom = std::stoi(tags[3])-1;
308 while(!cigar_string.empty()) {
309 align_exons.push_back(
GetNextExon(cigar_string, MD_tag, qfrom, sfrom));
310 align_exons.back().m_tstrand = tstrand;
311 qfrom = align_exons.back().m_qto+1;
312 sfrom = align_exons.back().m_sto+1;
316 const string& seq = tags[9];
317 while(!align_exons.empty() &&
Entropy(seq.substr(align_exons.front().m_qfrom, align_exons.front().m_qto-align_exons.front().m_qfrom+1)) <
m_min_entropy)
318 align_exons.erase(align_exons.begin());
319 while(!align_exons.empty() &&
Entropy(seq.substr(align_exons.back().m_qfrom, align_exons.back().m_qto-align_exons.back().m_qfrom+1)) <
m_min_entropy)
320 align_exons.pop_back();
322 exons.insert(exons.end(), align_exons.begin(), align_exons.end());
326 return name+
":i:"+to_string(
value);
328 string Tag(
const string& name,
const string&
value) {
329 return name+
":Z:"+
value;
332 return name+
":A:"+
value;
337 fields[0] = compart.
m_read;
338 int flag = 1+2+strand*16+(1-strand)*32+(mate+1)*64;
339 fields[1] = to_string(flag);
341 fields[3] = to_string(compart.
m_range.first+1);
349 fields[8] = to_string(span);
350 fields[9] = compart.
m_rseq;
352 fields.push_back(
Tag(
"NM", compart.
m_dist));
356 fields.push_back(
Tag(
"MD", compart.
m_md));
357 fields.push_back(
Tag(
"NH", count));
358 fields.push_back(
Tag(
"HI", index));
362 for(
unsigned i = 1;
i < fields.size(); ++
i)
363 cout <<
"\t" << fields[
i];
369 fields[0] = compart.
m_read;
370 int flag = strand*16;
372 flag += 1+8+(mate+1)*64;
373 fields[1] = to_string(flag);
375 fields[3] = to_string(compart.
m_range.first+1);
381 fields[9] = compart.
m_rseq;
384 fields.push_back(
Tag(
"NM", compart.
m_dist));
388 fields.push_back(
Tag(
"MD", compart.
m_md));
389 fields.push_back(
Tag(
"NH", count));
390 fields.push_back(
Tag(
"HI", index));
393 for(
unsigned i = 1;
i < fields.size(); ++
i)
394 cout <<
"\t" << fields[
i];
402 for(
int mate = 0; mate < 2; ++mate) {
403 for(
int strand = 0; strand < 2; ++strand) {
405 for(
auto& compart : compact_aligns) {
407 if(compart.m_matep !=
nullptr)
417 auto& contig = contig_aligns.first;
418 for(
int mate = 0; mate < 2; ++mate) {
420 auto& left_aligns = contig_aligns.second[mate][strand];
421 for(
auto& compart : left_aligns) {
422 FormatPaired(compart, contig, mate, strand, mate_count[0], ++index,
true);
423 FormatPaired(*compart.m_matep, contig, 1-mate, 1-strand, mate_count[0], index,
false);
430 auto& contig = contig_aligns.first;
431 for(
int mate = 0; mate < 2; ++mate) {
432 for(
int strand = 0; strand < 2; ++strand) {
434 for(
auto& compart : compact_aligns)
435 FormatSingle(compart, contig, mate, strand, mate_count[mate], ++mate_index[mate]);
446 for(
int mate = 0; mate < 2; ++mate) {
447 for(
int strand = 0; strand < 2; ++strand) {
449 for(
auto& align : compact_aligns) {
450 mate_score[mate] =
max(mate_score[mate], align.m_score);
451 if(align.m_matep !=
nullptr)
452 pair_score =
max(pair_score, align.m_score+align.m_matep->m_score);
454 compact_aligns.remove_if([](
const AlignCompartment&
a){
return !
a.m_above_thresholds; });
459 bool paired = pair_score >=
max(mate_score[0], mate_score[1]);
461 for(
int mate = 0; mate < 2; ++mate) {
462 for(
int strand = 0; strand < 2; ++strand) {
466 if(
a.m_matep ==
nullptr)
return true;
467 if(
a.m_score+
a.m_matep->m_score == pair_score)
return false;
468 a.m_matep->m_matep =
nullptr;
return true; });
470 compact_aligns.remove_if([&](
AlignCompartment&
a){
a.m_matep =
nullptr;
return a.m_score != mate_score[mate]; });
488 vector<pair<int,int>> left_introns;
489 vector<pair<int,int>> right_introns;
491 for(
unsigned i = 1;
i < left.
m_exons.size(); ++
i) {
493 left_introns.emplace_back(left.
m_exons[
i-1].m_sto+1, left.
m_exons[
i].m_sfrom-1);
495 for(
unsigned i = 1;
i < right.
m_exons.size(); ++
i) {
497 right_introns.emplace_back(right.
m_exons[
i-1].m_sto+1, right.
m_exons[
i].m_sfrom-1);
499 return left_introns == right_introns;
504 for(
int left_mate = 0; left_mate < 2; ++left_mate) {
505 int right_mate = 1-left_mate;
507 int right_strand = 1;
508 auto& left_aligns = contig_aligns.second[left_mate][left_strand];
509 auto& right_aligns = contig_aligns.second[right_mate][right_strand];
511 auto iright = right_aligns.begin();
512 for(
auto ileft = left_aligns.begin(); ileft != left_aligns.end() && iright != right_aligns.end(); ++ileft) {
513 while(iright != right_aligns.end() && iright->m_range.second < ileft->m_range.second)
515 if(iright == right_aligns.end())
517 auto inext =
next(ileft);
518 if(inext != left_aligns.end() && inext->m_range.second < iright->m_range.second)
523 ileft->m_matep = &(*iright);
524 iright->m_matep = &(*ileft);
536 auto& contig = contig_aligns.first;
537 for(
int mate = 0; mate < 2; ++mate) {
538 for(
int strand = 0; strand < 2; ++strand) {
539 TSplitAligns& orig_aligns = contig_aligns.second[mate][strand];
540 if(orig_aligns.empty())
542 string& read = orig_aligns.front()[0];
543 bool paired_read = stoi(orig_aligns.front()[1])&1;
544 string& rseq = orig_aligns.front()[9];
545 int read_len = rseq.size();
549 for(
auto& oa : orig_aligns)
555 sort(exons.begin(), exons.end(), [](
const Exon& e1,
const Exon& e2){
556 if(e1.m_sto != e2.m_sto)
557 return e1.m_sto > e2.m_sto;
558 if(e1.m_sfrom != e2.m_sfrom)
559 return e1.m_sfrom < e2.m_sfrom;
560 if(e1.m_qto != e2.m_qto)
561 return e1.m_qto > e2.m_qto;
562 if(e1.m_qfrom != e2.m_qfrom)
563 return e1.m_qfrom < e2.m_qfrom;
564 if(e1.m_score != e2.m_score)
565 return e1.m_score > e2.m_score;
566 return e1.m_cigar < e2.m_cigar; });
569 auto tail = unique(exons.begin(), exons.end(), [](
const Exon& e1,
const Exon& e2){
570 return e1.m_sfrom == e2.m_sfrom && e1.m_sto == e2.m_sto && e1.m_qfrom == e2.m_qfrom && e1.m_qto == e2.m_qto; });
571 exons.erase(tail, exons.end());
574 for(
auto i = exons.begin();
i != exons.end(); ++
i) {
575 if(
i->m_type ==
eBogus ||
i->m_qfrom > 10)
577 for(
auto j =
next(
i); j != exons.end() && j->m_sto ==
i->m_sto && j->m_qto ==
i->m_qto; ++j)
580 exons.erase(
remove_if(exons.begin(), exons.end(), [](
Exon& e){ return e.m_type == eBogus; }), exons.end());
582 sort(exons.begin(), exons.end(), [](
const Exon& e1,
const Exon& e2){
583 if(e1.m_sfrom != e2.m_sfrom)
584 return e1.m_sfrom < e2.m_sfrom;
585 if(e1.m_sto != e2.m_sto)
586 return e1.m_sto > e2.m_sto;
587 if(e1.m_qfrom != e2.m_qfrom)
588 return e1.m_qfrom < e2.m_qfrom;
589 return e1.m_qto > e2.m_qto; });
592 for(
auto i = exons.begin();
i != exons.end(); ++
i) {
593 if(
i->m_type ==
eBogus || read_len-
i->m_qto-1 > 10)
595 for(
auto j =
next(
i); j != exons.end() && j->m_sfrom ==
i->m_sfrom && j->m_qfrom ==
i->m_qfrom; ++j)
598 exons.erase(
remove_if(exons.begin(), exons.end(), [](
Exon& e){ return e.m_type == eBogus; }), exons.end());
604 int soverlap =
i->m_sto-j->m_sfrom+1;
605 int slen =
min(
i->m_sto-
i->m_sfrom+1, j->m_sto-j->m_sfrom+1);
606 int qoverlap =
i->m_qto-j->m_qfrom+1;
607 int qlen =
min(
i->m_qto-
i->m_qfrom+1, j->m_qto-j->m_qfrom+1);
608 if(soverlap > 0.5*slen && qoverlap < 0.1*qlen)
618 for(
auto& e : exons) {
624 double ident = double(matches)/align_len;
625 double cov = double(qmax-qmin+1)/read_len;
627 double log2factor = 0.25;
628 int compart_penalty = ident*cov*
m_penalty*read_len + 0.5;
629 int exon_num = exons.size();
630 int best_index = exon_num-1;
631 for(
int i = exon_num-1;
i >= 0; --
i) {
633 int iscore = ei.m_matches;
635 ei.m_chain_score = iscore;
637 for(
int j =
i+1; j < exon_num; ++j) {
639 int ijscore = ej.m_chain_score+iscore;
643 if(ej.m_sfrom <= ei.m_sto)
646 int overlap = ei.m_qto+1-ej.m_qfrom;
647 int intron = ej.m_sfrom-ei.m_sto-1;
648 if(overlap == 0 && intron > 20 && intron <=
m_max_intron) {
650 double intron_len = ej.m_intron_len+intron;
651 double deltas = score-ei.m_chain_score-log2factor*
log2((intron_len+1)/(ei.m_intron_len+1));
653 ei.m_chain_score = score;
656 ei.m_intron_len = intron_len;
659 int overlap_penalty = 0;
660 overlap_penalty =
max(2,
int(ident*
abs(overlap)+0.5));
661 int score = ijscore-overlap_penalty;
662 double intron_len = ej.m_intron_len;
663 double deltas = score-ei.m_chain_score-log2factor*
log2((intron_len+1)/(ei.m_intron_len+1));
665 ei.m_chain_score = score;
668 ei.m_intron_len = intron_len+intron;
670 }
else if(overlap > 0 && overlap < 10 && intron <=
m_max_intron) {
671 int overlap_penalty = 0;
672 int s = ei.m_script.size()-1;
674 while(s >= 0 && l > 0) {
675 if(ei.m_script[s] ==
'=')
677 if(ei.m_script[s] !=
'D')
683 while(s < (
int)ej.m_script.size() && l > 0) {
684 if(ej.m_script[s] ==
'=')
686 if(ej.m_script[s] !=
'D')
690 overlap_penalty =
max(2, overlap_penalty);
691 int score = ijscore-overlap_penalty;
692 double intron_len = ej.m_intron_len+intron;
693 double deltas = score-ei.m_chain_score-log2factor*
log2((intron_len+1)/(ei.m_intron_len+1));
695 ei.m_chain_score = score;
698 ei.m_intron_len = intron_len;
701 int score = ijscore-compart_penalty;
702 double intron_len = ej.m_intron_len;
703 double deltas = score-ei.m_chain_score-log2factor*
log2((intron_len+1)/(ei.m_intron_len+1));
705 ei.m_chain_score = score;
708 ei.m_intron_len = intron_len;
712 double deltas = ei.m_chain_score-exons[best_index].m_chain_score-log2factor*
log2((
double)(ei.m_intron_len+1)/(exons[best_index].m_intron_len+1));
728 for(
Exon* p = &exons[best_index]; p !=
nullptr; p = p->
m_prevp) {
729 if(compartments.empty() || compartments.back().m_exons.back().m_type !=
eExtension)
730 compartments.emplace_back();
731 compartments.back().m_exons.push_back(*p);
735 for(
auto iloop = compartments.begin(); iloop != compartments.end(); ) {
737 while(!it->m_exons.empty()) {
739 if(it->m_exons.front().m_align_len == 0)
740 it->m_exons.erase(it->m_exons.begin());
744 while(!it->m_exons.empty()) {
746 if(it->m_exons.back().m_align_len == 0)
747 it->m_exons.pop_back();
751 if(it->m_exons.empty())
752 compartments.erase(it);
756 for(
auto& compartment : compartments) {
759 string& cigar = compartment.m_cigar;
760 if(compartment.m_exons.front().m_qfrom > 0)
761 cigar += to_string(compartment.m_exons.front().m_qfrom)+
"S";
762 string& tstrand = compartment.m_tstrand;
763 tstrand = compartment.m_exons.front().m_tstrand;
765 for(
int i = 0;
i < (
int)compartment.m_exons.size(); ++
i) {
766 auto& e = compartment.m_exons[
i];
771 int intron = compartment.m_exons[
i].m_sfrom-compartment.m_exons[
i-1].m_sto-1;
772 cigar += to_string(intron)+
"N";
774 for(
auto& elem : compartment.m_exons[
i].m_cigar)
775 cigar += to_string(elem.first)+elem.second;
777 if(!tstrand.empty() && e.
m_tstrand != tstrand)
781 int tail = rseq.size()-compartment.m_exons.back().m_qto-1;
783 cigar += to_string(tail)+
"S";
785 if(compartment.m_exons.size() == 1)
788 string& md_tag = compartment.m_md;
789 for(
auto i =
md.begin();
i !=
md.end(); ++
i) {
791 if(
i->first > 0 && j->first > 0)
792 i->first += j->first;
793 else if(
i->first == j->first)
794 i->second += j->second;
800 md_tag += to_string(
i->first);
801 else if(
i->first == 0)
804 md_tag +=
"^"+
i->second;
806 compartment.m_range.first = compartment.m_exons.front().m_sfrom;
807 compartment.m_range.second = compartment.m_exons.back().m_sto;
808 int read_span = compartment.m_exons.back().m_qto-compartment.m_exons.front().m_qfrom+1;
810 compartment.m_read = read;
811 compartment.m_rseq = rseq;
812 compartment.m_paired_read = paired_read;
827 list<TAlignCompartments::iterator> erased;
828 TAlignCompartments::reverse_iterator keeper = compartments.rbegin();
829 for(
auto i =
next(keeper);
i != compartments.rend(); ++
i) {
834 if(
i->m_score > keeper->m_score) {
835 erased.push_back(
prev(keeper.base()));
838 erased.push_back(
prev(
i.base()));
842 compartments.erase(
i);
866 argdescr->SetUsageContext(
GetArguments().GetProgramBasename(),
"exon_selector expects SAM alignments at stdin collated by query, e.g. with 'sort -k 1,1'");
868 argdescr->AddDefaultKey (
"max_intron",
"max_intron",
"Maximum intron length (in base pairs)",
CArgDescriptions::eInteger,
"1200000");
869 argdescr->AddDefaultKey(
"min_query_len",
"min_query_len",
"Minimum length for individual cDNA sequences.",
CArgDescriptions::eInteger,
"50");
870 argdescr->AddFlag(
"star",
"STAR aligner settings: -match 1 -mismatch 1 -gapopen1 2 -gapextend1 2 -gapopen2 2 -gapextend2 2 -min_output_identity 0.9 -min_output_coverage 0.9");
871 argdescr->AddFlag(
"minimap2",
"minimap2 aligner settings: -match 1 -mismatch 2 -gapopen1 2 -gapextend1 1 -gapopen2 32 -gapextend2 0 -min_output_identity 0.8 -min_output_coverage 0.3");
872 argdescr->AddOptionalKey(
"min_output_identity",
"min_output_identity",
"Minimal identity for output alignments",
CArgDescriptions::eDouble);
873 argdescr->AddOptionalKey(
"min_output_coverage",
"min_output_coverage",
"Minimal coverage for output alignments",
CArgDescriptions::eDouble);
876 argdescr->AddOptionalKey(
"gapopen1",
"gapopen1",
"Gap open penalty. A gap of length k costs min{gapopen1+k*gapextend1,gapopen2+k*gapextend2}",
CArgDescriptions::eInteger);
880 argdescr->AddFlag(
"nocheck",
"Don't check if reads are collated (saves memory)");
881 argdescr->AddFlag(
"remove_repeats",
"Remove alignments which have repeats in the read");
884 argdescr->SetConstraint(
"penalty", constrain01);
885 argdescr->SetConstraint(
"min_output_identity", constrain01);
886 argdescr->SetConstraint(
"min_output_coverage", constrain01);
889 argdescr->SetConstraint(
"min_query_len", constrain_minqlen);
892 argdescr->SetConstraint(
"match", constrain_score);
893 argdescr->SetConstraint(
"mismatch", constrain_score);
894 argdescr->SetConstraint(
"gapopen1", constrain_score);
895 argdescr->SetConstraint(
"gapextend1", constrain_score);
896 argdescr->SetConstraint(
"gapopen2", constrain_score);
897 argdescr->SetConstraint(
"gapextend2", constrain_score);
909 bool check = !args[
"nocheck"];
910 if(args[
"remove_repeats"])
913 if(args[
"star"] && args[
"minimap2"]) {
914 cerr <<
"-star and -minimap2 can't be used together" << endl;
918 if(args[
"minimap2"] || !args[
"star"]) {
940 if(args[
"min_output_identity"])
942 if(args[
"min_output_coverage"])
951 if(args[
"gapextend1"])
955 if(args[
"gapextend2"])
960 bool paired_read =
false;
962 while(getline(cin, line)) {
966 cout << line << endl;
969 vector<string> fields;
971 string read = fields[0];
973 if(
check && !previous_reads.
insert(read_old).second) {
974 cerr <<
"Input collated by reads is expected" << endl;
987 paired_read = stoi(fields[1])&1;
993 string contig = fields[2];
994 int flag = std::stoi(fields[1]);
995 int strand = (flag&16) ? 1 : 0;
996 int mate = (flag&128) ? 1 : 0;
1001 if(
check && !previous_reads.
insert(read_old).second) {
1002 cerr <<
"Input collated by reads is expected" << endl;
1017 int main(
int argc,
const char* argv[])
void remove_if(Container &c, Predicate *__pred)
virtual void Init()
Initialize the application.
bool CompatiblePair(const AlignCompartment &left, const AlignCompartment &right)
map< string, TMatrix< TAlignCompartments > > m_compartments
list< pair< int, char > > TCigar
void FormatSingle(AlignCompartment &compart, const string &contig, int mate, int strand, int count, int index)
list< TSamFields > TSplitAligns
map< string, TMatrix< TSplitAligns > > m_read_aligns
void ExtractExonsFromSAM(const TSamFields &tags, TExons &exons)
Exon GetNextExon(string &cigar_string, string &MD_tag, int qfrom, int sfrom)
virtual int Run()
Run the application.
list< pair< int, string > > TMDTag
vector< string > TSamFields
void SelectBestLocations()
list< AlignCompartment > TAlignCompartments
void FormatPaired(AlignCompartment &compart, const string &contig, int mate, int strand, int count, int index, bool left)
iterator_bool insert(const value_type &val)
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
double Entropy(const string &seq)
string Tag(const string &name, int value)
int main(int argc, const char *argv[])
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
@ eDouble
Convertible into a floating point number (double)
@ eInteger
Convertible into an integer number (int or Int8)
EDiagSev SetDiagPostLevel(EDiagSev post_sev=eDiag_Error)
Set the threshold severity for posting the messages.
@ eDS_ToStderr
To standard error stream.
@ eDiag_Info
Informational message.
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
unsigned int
A callback function used to compare two keys in a database.
constexpr auto sort(_Init &&init)
const struct ncbi::grid::netcache::search::fields::SIZE size
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Defines command line argument related classes.
Defines unified interface to application:
AlignCompartment * m_matep
void RemoveLeftIndels(int gapopen1, int gapextend1, int gapopen2, int gapextend2)
void RemoveRightIndels(int gapopen1, int gapextend1, int gapopen2, int gapextend2)
static Uint4 letter(char c)