106 istringstream icigar(cigar_string);
110 while(icigar >>
len >> c) {
116 }
else if(c ==
'N') {
121 cigar_string = cigar_string.substr(clip_pos);
124 }
else if(c ==
'M' || c ==
'=' || c ==
'X') {
132 int m = stoi(MD_tag, &idx);
133 MD_tag.erase(0, idx);
135 MD_tag = to_string(m-
len)+MD_tag;
148 }
else if(c ==
'I') {
152 }
else if(c ==
'D') {
156 if(MD_tag.front() ==
'^')
157 MD_tag.erase(0,
len+1);
159 MD_tag.erase(0,
len);
161 cerr <<
"Unexpected symbol in CIGAR" << endl;
164 clip_pos = icigar.tellg();
166 cigar_string.clear();
171 int ProcessSAM(
const vector<string>& tags, vector<CCompactSAMApplication::Exon>& exons) {
175 int mismatchpenalty = 1;
178 double log2factor = 0.25;
179 int intron_penalty = 0;
180 int noncanonical_penalty = 8;
181 int gcag_penalty = 4;
182 int atac_penalty = 8;
186 for(
int i = 11;
i < (
int)tags.size() && (MD_tag.empty() || jM_tag.empty()); ++
i) {
188 vector<string> fields;
190 if(fields.size() != 3)
192 if(fields[0] ==
"MD" && fields[1] ==
"Z")
194 if(fields[0] ==
"jM" && fields[1] ==
"B")
198 cerr <<
"MD:Z tag in SAM file is expected" << endl;
202 cerr <<
"jM:B tag in SAM file is expected" << endl;
207 string cigar_string = tags[5];
209 int sfrom = std::stoi(tags[3])-1;
210 while(!cigar_string.empty()) {
211 exons.push_back(
GetNextExon(cigar_string, MD_tag, qfrom, sfrom));
212 qfrom = exons.back().m_qto+1;
213 sfrom = exons.back().m_sto+1;
215 pair<int, int>
range(exons.front().m_sfrom, exons.back().m_sto);
219 for(
auto& e : exons) {
220 int gap_len = e.m_align_len-e.m_matches-e.m_mismatches;
221 e.m_score = e.m_matches*matchscore-e.m_mismatches*mismatchpenalty-e.m_indels*gapopen-gap_len*gapextend;
222 align_score += e.m_score;
224 if(
next(exons.begin()) != exons.end()) {
225 vector<string> splices;
227 for(
auto& spl : splices) {
229 align_score -= intron_penalty;
231 align_score -= noncanonical_penalty;
232 else if(spl ==
"3" || spl ==
"4")
233 align_score -= gcag_penalty;
234 else if(spl ==
"5" || spl ==
"6")
235 align_score -= atac_penalty;
250 return name+
":i:"+to_string(
value);
252 string Tag(
const string& name,
const string&
value) {
253 return name+
":Z:"+
value;
257 auto itag = find_if(fields.begin()+11, fields.end(), [&](
const string&
tag){ return tag.substr(0,2) == name; });
258 if(itag != fields.end())
263 void RemoveTag(vector<string>& fields,
const string& name) {
264 auto itag = find_if(fields.begin()+11, fields.end(), [&](
const string&
tag){ return tag.substr(0,2) == name; });
265 if(itag != fields.end())
274 int flag = 1+2+strand*16+(1-strand)*32+(mate+1)*64;
275 fields[1] = to_string(flag);
283 fields[8] = to_string(span);
291 mismatches += e.m_mismatches;
293 mismatches += e.m_mismatches;
305 int flag = stoi(fields[1])&1;
315 fields[1] = to_string(flag);
328 mismatches += e.m_mismatches;
339 for(
int mate = 0; mate < 2; ++mate) {
340 for(
int strand = 0; strand < 2; ++strand) {
342 for(
auto& ai : compact_aligns) {
344 if(ai.m_matep !=
nullptr)
354 for(
int mate = 0; mate < 2; ++mate) {
356 auto& left_aligns = contig_aligns.second[mate][strand];
357 for(
auto& ai : left_aligns) {
358 FormatPaired(ai, mate, strand, mate_count[0], ++index,
true);
359 FormatPaired(*ai.m_matep, 1-mate, 1-strand, mate_count[0], index,
false);
365 bool all_aligned = mate_count[0] > 0 && mate_count[1] > 0;
368 for(
int mate = 0; mate < 2; ++mate) {
369 for(
int strand = 0; strand < 2; ++strand) {
371 for(
auto& ai : compact_aligns)
372 FormatSingle(ai, mate, strand, mate_count[mate], ++mate_index[mate], all_aligned);
383 for(
int mate = 0; mate < 2; ++mate) {
384 for(
int strand = 0; strand < 2; ++strand) {
386 for(
auto& align : compact_aligns) {
387 mate_score[mate] =
max(mate_score[mate], align.m_score);
388 if(align.m_matep !=
nullptr)
389 pair_score =
max(pair_score, align.m_score+align.m_matep->m_score);
391 compact_aligns.remove_if([](
const AlignInfo&
a){
return !
a.m_above_thresholds; });
396 bool paired = pair_score >=
max(mate_score[0], mate_score[1]);
398 for(
int mate = 0; mate < 2; ++mate) {
399 for(
int strand = 0; strand < 2; ++strand) {
402 compact_aligns.remove_if([&](
const AlignInfo&
a){
403 if(
a.m_matep ==
nullptr)
return true;
404 if(
a.m_score+
a.m_matep->m_score == pair_score)
return false;
405 a.m_matep->m_matep =
nullptr;
return true; });
407 compact_aligns.remove_if([&](
AlignInfo&
a){
a.m_matep =
nullptr;
return a.m_score != mate_score[mate]; });
425 vector<pair<int,int>> left_introns;
426 vector<pair<int,int>> right_introns;
428 for(
unsigned i = 1;
i < left.
m_exons.size(); ++
i) {
430 left_introns.emplace_back(left.
m_exons[
i-1].m_sto+1, left.
m_exons[
i].m_sfrom-1);
432 for(
unsigned i = 1;
i < right.
m_exons.size(); ++
i) {
434 right_introns.emplace_back(right.
m_exons[
i-1].m_sto+1, right.
m_exons[
i].m_sfrom-1);
436 return left_introns == right_introns;
441 for(
int left_mate = 0; left_mate < 2; ++left_mate) {
442 int right_mate = 1-left_mate;
444 int right_strand = 1;
445 auto& left_aligns = contig_aligns.second[left_mate][left_strand];
446 auto& right_aligns = contig_aligns.second[right_mate][right_strand];
448 auto iright = right_aligns.begin();
449 for(
auto ileft = left_aligns.begin(); ileft != left_aligns.end() && iright != right_aligns.end(); ++ileft) {
450 while(iright != right_aligns.end() && iright->m_range.second < ileft->m_range.second)
452 if(iright == right_aligns.end())
454 auto inext =
next(ileft);
455 if(inext != left_aligns.end() && inext->m_range.second < iright->m_range.second)
460 ileft->m_matep = &(*iright);
461 iright->m_matep = &(*ileft);
471 for(
int mate = 0; mate < 2; ++mate) {
472 for(
int strand = 0; strand < 2; ++strand) {
473 TAlignInfoList& compact_aligns = contig_aligns.second[mate][strand];
474 if(compact_aligns.empty())
477 size_t read_len = compact_aligns.front().m_fields[9].size();
480 for(
auto& ai : compact_aligns) {
481 auto& fields = ai.m_fields;
482 vector<Exon> align_exons;
483 int align_score =
ProcessSAM(fields, align_exons);
484 pair<int, int> align_range(align_exons.front().m_sfrom, align_exons.back().m_sto);
485 int read_span = align_exons.back().m_qto-align_exons.front().m_qfrom+1;
486 exons.insert(exons.end(), align_exons.begin(), align_exons.end());
490 for(
auto& e : align_exons) {
491 matches += e.m_matches;
492 align_len += e.m_align_len;
495 ai.m_range = align_range;
496 ai.m_score = align_score;
497 ai.m_above_thresholds = above_hresholds;
498 swap(align_exons, ai.m_exons);
504 sort(exons.begin(), exons.end(), [](
const Exon& e1,
const Exon& e2){
505 if(e1.m_score == e2.m_score)
506 return e1.m_matches > e2.m_matches;
508 return e1.m_score > e2.m_score; });
511 auto tail =
remove_if(exons.begin(), exons.end(), [&](
const Exon& e){
512 bool r1 = !lefts.insert(e.m_sfrom).second; bool r2 = !rights.insert(e.m_sto).second;
514 exons.erase(tail, exons.end());
522 for(
auto& e : exons) {
524 hit->SetQueryId(readid);
525 hit->SetSubjId(contigid);
526 hit->SetQueryStart(e.m_qfrom);
527 hit->SetQueryStop(e.m_qto);
528 hit->SetSubjStart(e.m_sfrom);
529 hit->SetSubjStop(e.m_sto);
530 hit->SetLength(e.m_align_len);
531 hit->SetMismatches(e.m_mismatches);
532 hit->SetGaps(e.m_indels);
534 hit->SetScore(e.m_score);
535 hit->SetRawScore(e.m_score);
536 hit->SetIdentity(
float(e.m_matches)/e.m_align_len);
537 hitrefs.push_back(hit);
541 typedef TAccessor::TCoord TCoord;
543 const TCoord penalty_bps (TCoord(
m_penalty*read_len + 0.5));
544 const TCoord min_matches (TCoord(
m_min_idty*read_len + 0.5));
547 const TCoord min_singleton_matches (
min(msm1, msm2));
549 TAccessor ca (penalty_bps, min_matches, min_singleton_matches,
false);
551 ca.Run(hitrefs.begin(), hitrefs.end());
556 for(
bool b0 (ca.GetFirst(comp)); b0 ; b0 = ca.GetNext(comp)) {
559 pair<int, int>
range(span[2], span[3]);
564 compact_aligns.remove_if([&](
const AlignInfo&
a){
return selected_ranges.count(
a.m_range) == 0; });
572 for(
auto it = compact_aligns.begin(); it != compact_aligns.end(); ++it) {
573 for(
auto inext =
next(it); inext != compact_aligns.end() && inext->m_range == it->m_range; )
574 inext = compact_aligns.erase(inext);
586 argdescr->SetUsageContext(
GetArguments().GetProgramBasename(),
587 "compact_sam expects SAM alignments at stdin collated by query, e.g. with 'sort -k 1,1'");
589 argdescr->AddDefaultKey(
"min_output_identity",
"min_output_identity",
"Minimal identity for output alignments",
592 argdescr->AddDefaultKey(
"min_output_coverage",
"min_output_coverage",
"Minimal coverage for output alignments",
595 argdescr->AddDefaultKey(
"penalty",
"penalty",
"Per-compartment penalty",
598 argdescr->AddDefaultKey (
"max_intron",
"max_intron",
599 "Maximum intron length (in base pairs)",
603 argdescr->AddDefaultKey(
"min_query_len",
"min_query_len",
604 "Minimum length for individual cDNA sequences.",
607 argdescr->AddFlag(
"nocheck",
"Don't check if reads are collated (saves memory)");
610 argdescr->SetConstraint(
"penalty", constrain01);
611 argdescr->SetConstraint(
"min_output_identity", constrain01);
612 argdescr->SetConstraint(
"min_output_coverage", constrain01);
615 argdescr->SetConstraint(
"min_query_len", constrain_minqlen);
632 bool check = !args[
"nocheck"];
637 while(getline(cin, line)) {
641 cout << line << endl;
644 vector<string> fields;
646 string read = fields[0];
647 if(read != read_old) {
648 if(
check && !previous_reads.
insert(read_old).second) {
649 cerr <<
"Input collated by reads is expected" << endl;
664 string contig = fields[2];
665 int flag = std::stoi(fields[1]);
666 int strand = (flag&16) ? 1 : 0;
667 int mate = (flag&128) ? 1 : 0;
670 swap(ais.back().m_fields, fields);
674 if(
check && !previous_reads.
insert(read_old).second) {
675 cerr <<
"Input collated by reads is expected" << endl;
687 int main(
int argc,
const char* argv[])
void remove_if(Container &c, Predicate *__pred)
void SelectBestLocations()
bool CompatiblePair(const AlignInfo &left, const AlignInfo &right)
vector< THitRef > THitRefs
double m_min_singleton_idty
virtual int Run()
Run the application.
int m_min_singleton_idty_bps
virtual void Init()
Initialize the application.
list< AlignInfo > TAlignInfoList
map< string, TMatrix< TAlignInfoList > > m_compact_aligns
list< TSamFields > TSplitAligns
vector< string > TSamFields
iterator_bool insert(const value_type &val)
void FormatSingle(CCompactSAMApplication::AlignInfo &ai, int mate, int strand, int count, int index, bool all_aligned)
void FormatPaired(CCompactSAMApplication::AlignInfo &ai, int mate, int strand, int count, int index, bool left)
string Tag(const string &name, int value)
int ProcessSAM(const vector< string > &tags, vector< CCompactSAMApplication::Exon > &exons)
void Print(const CCompactSAMApplication::AlignInfo &ai)
void ReplaceTag(vector< string > &fields, const string &name, const T &value)
CCompactSAMApplication::Exon GetNextExon(string &cigar_string, string &MD_tag, int qfrom, int sfrom)
int main(int argc, const char *argv[])
void RemoveTag(vector< string > &fields, const string &name)
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
static void s_GetSpan(const THitRefs &hitrefs, TCoord span[4])
Get sequence span for a set of alignments (hits).
unsigned int TSeqPos
Type for sequence locations and lengths.
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
@ eDouble
Convertible into a floating point number (double)
@ eInteger
Convertible into an integer number (int or Int8)
EDiagSev SetDiagPostLevel(EDiagSev post_sev=eDiag_Error)
Set the threshold severity for posting the messages.
@ eDS_ToStderr
To standard error stream.
@ eDiag_Info
Informational message.
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
unsigned int
A callback function used to compare two keys in a database.
range(_Ty, _Ty) -> range< _Ty >
constexpr auto sort(_Init &&init)
const struct ncbi::grid::netcache::search::fields::SIZE size
const GenericPointer< typename T::ValueType > T2 value
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Defines command line argument related classes.
Defines unified interface to application: