NCBI C++ ToolKit
magicblast_util.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: magicblast_util.cpp 97664 2022-08-10 13:45:37Z boratyng $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Greg Boratyn
27  * Implements utils for MagicBLAST application
28  *
29  */
30 
31 
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbiapp.hpp>
43 #include "../blast/blast_app_util.hpp"
44 
46 
54 
58 
60 
66 
67 #include "magicblast_util.hpp"
68 
69 #include <unordered_set>
70 #include <unordered_map>
71 #include <memory>
72 
73 #ifndef SKIP_DOXYGEN_PROCESSING
75 BEGIN_SCOPE(blast);
77 #endif
78 
79 
// Lookup table for the queries of one batch. s_CreateQueryMap() fills it
// using CSeq_id::GetSeqIdString() of each entry's first id as the key.
80 typedef unordered_map<string, CRef<CSeq_entry> > TQueryMap;
81 
82 
// Forward declaration of a file-local function.
// NOTE(review): listing line 84, which carries the return type, the function
// name and the first parameter, is absent from this listing. The parameter
// list matches the call PrintTabularUnaligned(ostr, results, queries,
// first_seg, user_tag) made in this file -- confirm against the real source.
83 static
85  const CMagicBlastResults& results,
86  const TQueryMap& queries,
87  bool first_seg,
88  const string& user_tag);
89 
// Forward declaration of a file-local function.
// NOTE(review): listing line 91, which carries the return type, the function
// name and the first parameter, is absent from this listing. The parameter
// list matches the call PrintSAMUnaligned(ostr, results, queries, first_seg,
// trim_read_ids, user_tag) made in this file -- confirm against the real
// source.
90 static
92  const CMagicBlastResults& results,
93  const TQueryMap& queries,
94  bool first_seg,
95  bool trim_read_ids,
96  const string& user_tag);
97 
98 
/// Complement a single nucleotide character.
///
/// Upper- and lower-case A, C, G and T are mapped to their complementary
/// base with the case preserved. 'N', 'n' and the gap character '-' are
/// returned unchanged.
///
/// @param c
///   Character to complement
/// @return
///   Complemented character, or 'x' for any character not listed above
static char s_Complement(char c)
{
    switch (c) {
    case 'A': return 'T';
    case 'a': return 't';

    case 'C': return 'G';
    case 'c': return 'g';

    case 'G': return 'C';
    case 'g': return 'c';

    case 'T': return 'A';
    case 't': return 'a';

    // unknown base and gap have no complement and stay as they are
    case 'N': return 'N';
    case 'n': return 'n';
    case '-': return '-';

    // anything else is reported with a placeholder
    default:  return 'x';
    }
}
154 
155 
156 static string s_GetBareId(const CSeq_id& id)
157 {
158  string retval;
159  // Gis are printed with the bar
160  if (id.IsGi()) {
161  retval = id.AsFastaString();
162  }
163  else if (id.IsGeneral()) {
164  const CDbtag& dbt = id.GetGeneral();
165  if (dbt.GetTag().IsStr()) {
166  retval = dbt.GetTag().GetStr();
167  }
168  else if (dbt.GetTag().IsId()) {
169  retval = NStr::IntToString(dbt.GetTag().GetId());
170  }
171  }
172  else {
173  retval = id.GetSeqIdString(true);
174  }
175 
176  return retval;
177 }
178 
179 
180 static string s_GetSequenceId(const CBioseq& bioseq)
181 {
182  string retval;
183  if (bioseq.IsSetDescr()) {
184  for (auto it: bioseq.GetDescr().Get()) {
185  if (it->IsTitle()) {
186  vector<string> tokens;
187  NStr::Split(it->GetTitle(), " ", tokens);
188  retval = tokens[0];
189  }
190  }
191  }
192 
193  if (retval.empty()) {
194  retval = s_GetBareId(*bioseq.GetFirstId());
195  }
196 
197  return retval;
198 }
199 
200 
201 static string s_GetFastaDefline(const CBioseq& bioseq)
202 {
203  string retval;
204  if (bioseq.IsSetDescr()) {
205  for (auto it: bioseq.GetDescr().Get()) {
206  if (it->IsTitle()) {
207  retval = it->GetTitle();
208  }
209  }
210  }
211 
212  if (retval.empty()) {
213  retval = s_GetBareId(*bioseq.GetFirstId());
214  }
215 
216  return retval;
217 }
218 
219 
220 static void s_CreateQueryMap(const CBioseq_set& query_batch,
221  TQueryMap& query_map)
222 {
223  query_map.clear();
224  for (auto it: query_batch.GetSeq_set()) {
225 
226  CRef<CSeq_entry> seq_entry(it);
227  const CSeq_id* seq_id = seq_entry->GetSeq().GetFirstId();
228  if (!seq_id) {
229  NCBI_THROW(CException, eInvalid, "Missing Sequence Id");
230  }
231  string id = seq_id->GetSeqIdString();
232  query_map[id] = seq_entry;
233  }
234 }
235 
236 
237 static const CBioseq& s_GetQueryBioseq(const TQueryMap& queries,
238  const CSeq_id& seqid)
239 {
240  TQueryMap::const_iterator it = queries.find(seqid.GetSeqIdString());
241  _ASSERT(it != queries.end());
242  if (it == queries.end()) {
243  NCBI_THROW(CException, eInvalid, (string)"Query Bioseq not found for "
244  "id: " + s_GetBareId(seqid));
245  }
246 
247  return it->second->GetSeq();
248 }
249 
// Get a query sequence, or a part of it, as text.
//
// bioseq: query whose Seq-data is read
// range: part of the sequence to extract
// reverse_complement: if true, the extracted text is post-processed in the
//     block at the end of the function
// sequence: receives the result
// Returns 0 on every path that returns; an unexpected encoding throws
// CException (eInvalid).
//
// NOTE(review): listing lines 257, 264, 270, 277 and 292 are absent from this
// listing. Judging by the accessors used, the first four are presumably the
// case labels for the Iupacna, Ncbi2na, Ncbi4na and Ncbi8na encodings, and
// line 292 the first half of a reverse-complement call that writes into
// 'sequence' -- confirm against the real source.
250 static int s_GetQuerySequence(const CBioseq& bioseq,
251  const CRange<TSeqPos>& range,
252  bool reverse_complement,
253  string& sequence)
254 {
255  const CSeq_data& seq_data = bioseq.GetInst().GetSeq_data();
256  switch (seq_data.Which()) {
// text stored as is: copy it and cut out the range, unless the range is
// empty or whole, in which case the full sequence is kept
258  sequence = seq_data.GetIupacna().Get();
259  if (range.NotEmpty() && !range.IsWhole()) {
260  sequence = sequence.substr(range.GetFrom(), range.GetLength());
261  }
262  break;
263 
// the remaining encodings are converted to Iupacna; here the range start and
// length are always passed to the converter, without the empty/whole check
265  CSeqConvert::Convert(seq_data.GetNcbi2na().Get(),
266  CSeqUtil::e_Ncbi2na, range.GetFrom(),
267  range.GetLength(),
268  sequence, CSeqUtil::e_Iupacna);
269  break;
271  CSeqConvert::Convert(seq_data.GetNcbi4na().Get(),
272  CSeqUtil::e_Ncbi4na, range.GetFrom(),
273  range.GetLength(),
274  sequence, CSeqUtil::e_Iupacna);
275  break;
276 
278  CSeqConvert::Convert(seq_data.GetNcbi8na().Get(),
279  CSeqUtil::e_Ncbi8na, range.GetFrom(),
280  range.GetLength(),
281  sequence, CSeqUtil::e_Iupacna);
282  break;
283 
284  default:
285  NCBI_THROW(CException, eInvalid, "Unexpected query sequence "
286  "encoding");
287  };
288 
289 
290  if (reverse_complement) {
// work from a copy so that the result can be written back into 'sequence'
291  string tmp(sequence);
293  sequence);
294  }
295 
296  return 0;
297 }
298 
299 // Get FASTQ quality string, return true if the bioseq has qualities,
300 // otheriwse false.
301 static bool s_GetQualityString(const CBioseq& bioseq, string& quality_str)
302 {
303  // First check UserObject in SeqDescr. These are ASCII encoded quality
304  // scores copied from a FASTQ file.
305  if (bioseq.IsSetDescr()) {
306  for (const auto& it: bioseq.GetDescr().Get()) {
307  if (it->IsUser() && it->GetUser().GetType().GetStr() == "Mapping") {
308  if (it->GetUser().HasField("quality")) {
309  quality_str.assign(it->GetUser().GetField("quality").GetString());
310  return true;
311  }
312  }
313  }
314  }
315 
316  // If not present, check Seq-annot. These are Phred quality values as
317  // integers (usually from SRA).
318  if (bioseq.IsSetAnnot()) {
319  for (const auto& ann_it: bioseq.GetAnnot()) {
320  if (ann_it->IsGraph()) {
321  for (const auto& it: ann_it->GetData().GetGraph()) {
322  if (it->IsSetTitle() && it->GetTitle().find("Phred Quality") != string::npos) {
323  if (it->GetGraph().IsByte()) {
324  const vector<char>& values = it->GetGraph().GetByte().GetValues();
325  // produce 33-base ASCII encoding
326  quality_str.reserve(values.size());
327  for (const auto& ch: values) {
328  quality_str.push_back(ch + 33);
329  }
330 
331  return true;
332  }
333  }
334  }
335  }
336  }
337  }
338 
339  return false;
340 }
341 
342 
// Print an unaligned read in FASTA format: a '>' defline followed by the
// sequence. No newline is written after the sequence.
//
// results: results for a single query or a pair
// queries: lookup table of query Bioseqs
// first_seg: for paired results selects the first read (GetQueryId()) when
//     true and the last read (GetLastId()) when false; non-paired results
//     always use GetQueryId()
// Returns the output stream.
//
// NOTE(review): listing line 344 (return type, function name and the stream
// parameter 'ostr') and line 364 (the declaration of 'range') are absent from
// this listing. The name is taken from the call
// PrintFastaUnaligned(ostr, results, queries, first_seg) in this file --
// confirm against the real source.
343 static
345  const CMagicBlastResults& results,
346  const TQueryMap& queries,
347  bool first_seg)
348 {
// rebuild the id from its FASTA string so that it can be looked up
349  CSeq_id id;
350  if (!results.IsPaired() || first_seg) {
351  id.Set(results.GetQueryId().AsFastaString());
352  }
353  else {
354  id.Set(results.GetLastId().AsFastaString());
355  }
356 
357  const CBioseq& bioseq = s_GetQueryBioseq(queries, id);
358 
359  // defline
360  ostr << ">" << s_GetFastaDefline(bioseq) << endl;
361 
362  // sequence
363  string sequence;
365  s_GetQuerySequence(bioseq, range, false, sequence);
366  ostr << sequence;
367 
368  return ostr;
369 }
370 
// Print an unaligned read in the requested format by forwarding to
// PrintTabularUnaligned, PrintFastaUnaligned or (by default)
// PrintSAMUnaligned. trim_read_ids is only passed on to the SAM printer and
// user_tag only to the tabular and SAM printers.
//
// NOTE(review): listing lines 372-373 (return type, function name and the
// parameters 'ostr' and 'fmt') and lines 384 and 388 (the two case labels)
// are absent from this listing. The name is taken from the calls
// PrintUnaligned(unaligned_ostr, unaligned_fmt, ...) in this file -- confirm
// against the real source.
371 static
374  const CMagicBlastResults& results,
375  const TQueryMap& queries,
376  bool first_seg,
377  bool trim_read_ids,
378  const string& user_tag)
379 {
380 
381 
382  switch (fmt) {
383 
385  return PrintTabularUnaligned(ostr, results, queries, first_seg,
386  user_tag);
387 
389  return PrintFastaUnaligned(ostr, results, queries, first_seg);
390 
391  default:
392  return PrintSAMUnaligned(ostr, results, queries, first_seg, trim_read_ids, user_tag);
393  };
394 }
395 
// Print the comment header of the tabular report: a "# MAGICBLAST <version>"
// line, a line with the command line arguments and a "# Fields:" line with
// the tab-separated column names. The "user tag" column name is added only if
// user_tag is true. Returns the output stream.
//
// NOTE(review): listing line 396 (return type, function name and the
// parameters 'ostr' and 'version') is absent from this listing; the name
// PrintTabularHeader is presumed from the content -- confirm against the real
// source.
397  const string& cmd_line_args, bool user_tag)
398 {
399  string sep = "\t";
400 
401  ostr << "# MAGICBLAST " << version << endl;
402  ostr << "# " << cmd_line_args << endl;
403 
// the column order below has to agree with what the tabular printers write
404  ostr << "# Fields: ";
405  ostr << "query acc." << sep;
406  ostr << "reference acc." << sep;
407  ostr << "% identity" << sep;
408  ostr << "not used" << sep;
409  ostr << "not used" << sep;
410  ostr << "not used" << sep;
411  ostr << "query start" << sep;
412  ostr << "query end" << sep;
413  ostr << "reference start" << sep;
414  ostr << "reference end" << sep;
415  ostr << "not used" << sep;
416  ostr << "not used" << sep;
417  ostr << "score" << sep;
418  ostr << "query strand" << sep;
419  ostr << "reference strand" << sep;
420  ostr << "query length" << sep;
421  ostr << "BTOP" << sep;
422  ostr << "num placements" << sep;
423  ostr << "not used" << sep;
424  ostr << "compartment" << sep;
425  ostr << "left overhang" << sep;
426  ostr << "right overhang" << sep;
427  ostr << "mate reference" << sep;
428  ostr << "mate ref. start" << sep;
429  ostr << "composite score";
430  if (user_tag) {
431  ostr << sep << "user tag";
432  }
433 
434  ostr << endl;
435 
436  return ostr;
437 }
438 
439 
// Print one alignment as a row of the tabular report (no trailing newline).
//
// A discontinuous (Disc) alignment is treated as a pair: the function calls
// itself for each of the two mates, passing the other one as 'mate', and puts
// a newline between the two rows.
//
// queries: lookup table of query Bioseqs
// is_paired: whether mate columns may be reported
// batch_number, compartment: printed as "<batch_number>:<compartment>"
// user_tag: appended as the last column if not empty
// mate: the other alignment of a pair, or NULL
// Returns the output stream.
//
// NOTE(review): listing line 441 (return type, function name and the
// parameters 'ostr' and 'align') and lines 477-478 (the start of the
// statement that ends with "perc_identity);") are absent from this listing.
// 'score' and 'perc_identity' are presumably filled in by that statement --
// confirm against the real source.
440 static
442  const TQueryMap& queries,
443  bool is_paired, int batch_number, int compartment,
444  const string& user_tag,
445  const CSeq_align* mate = NULL)
446 {
447  // if paired alignment
448  if (align.GetSegs().IsDisc()) {
449 
450  const CSeq_align_set& disc = align.GetSegs().GetDisc();
451  _ASSERT(disc.Get().size() == 2u);
452 
453  CSeq_align_set::Tdata::const_iterator first = disc.Get().begin();
454  _ASSERT(first != disc.Get().end());
455  CSeq_align_set::Tdata::const_iterator second(first);
456  ++second;
457  _ASSERT(second != disc.Get().end());
458 
459  PrintTabular(ostr, **first, queries, is_paired, batch_number,
460  compartment, user_tag, second->GetNonNullPointer());
461  ostr << endl;
462 
463  PrintTabular(ostr, **second, queries, is_paired, batch_number,
464  compartment, user_tag, first->GetNonNullPointer());
465 
466  return ostr;
467  }
468 
// columns: query id, reference id
469  string sep = "\t";
470  const CBioseq& bioseq = s_GetQueryBioseq(queries, align.GetSeq_id(0));
471  ostr << s_GetSequenceId(bioseq) << sep;
472 
473  ostr << s_GetBareId(align.GetSeq_id(1)) << sep;
474 
475  int score;
476  double perc_identity;
479  perc_identity);
480 
481  ostr << perc_identity << sep;
482 
// placeholders for the columns labelled "not used" in the header
483  ostr << 0 << sep; // length
484  ostr << 0 << sep; // mismatch
485  ostr << 0 << sep; // gapopen
486 
487 
488  int query_len = 0;
489 
// columns: query start/end and reference start/end, printed 1-based. Nothing
// is written here for segment types other than Denseg and Spliced.
490  if (align.GetSegs().IsDenseg()) {
491  CRange<TSeqPos> range = align.GetSeqRange(0);
492  ostr << range.GetFrom() + 1 << sep << range.GetTo() + 1 << sep;
493  range = align.GetSeqRange(1);
// for a minus strand query the reference coordinates are printed end first
494  if (align.GetSeqStrand(0) == eNa_strand_minus) {
495  ostr << range.GetTo() + 1 << sep << range.GetFrom() + 1 << sep;
496  }
497  else {
498  ostr << range.GetFrom() + 1 << sep << range.GetTo() + 1 << sep;
499  }
500  }
501  else if (align.GetSegs().IsSpliced()) {
502  CRange<TSeqPos> range = align.GetSeqRange(0);
503  if (align.GetSegs().GetSpliced().IsSetProduct_length()) {
504  query_len = align.GetSegs().GetSpliced().GetProduct_length();
505  }
506  else {
507  _ASSERT(0);
508  }
// for a minus strand query the query coordinates are counted from the other
// end of the read and the reference coordinates are printed end first
509  if (align.GetSeqStrand(0) == eNa_strand_minus) {
510  ostr << query_len - range.GetTo() << sep
511  << query_len - range.GetFrom() << sep;
512 
513  range = align.GetSeqRange(1);
514  ostr << range.GetTo() + 1 << sep << range.GetFrom() + 1 << sep;
515  }
516  else {
517  ostr << range.GetFrom() + 1 << sep << range.GetTo() + 1 << sep;
518  range = align.GetSeqRange(1);
519  ostr << range.GetFrom() + 1 << sep << range.GetTo() + 1 << sep;
520  }
521 
522  }
523 
// two more constant placeholder columns
524  ostr << 0.0 << sep; // e-value
525  ostr << 99 << sep; // bit score
526 
527  ostr << score << sep;
528 
529  // query is always a plus strand
// the second strand column is derived from the query strand of the alignment
530  ostr << "plus" << sep
531  << (align.GetSeqStrand(0) == 1 ? "plus" : "minus");
532 
533  string btop_string;
534  Int4 num_hits = 0;
535  Int4 pair_start = 0;
536  Int4 fragment_score = 0;
537 
// read the BTOP string and the number of hits from the "Mapper Info" user
// object attached to the alignment, if there is one
538  CConstRef<CUser_object> ext = align.FindExt("Mapper Info");
539  if (ext.NotEmpty()) {
540 
541  ITERATE (CUser_object::TData, it, ext->GetData()) {
542  if (!(*it)->GetLabel().IsStr()) {
543  continue;
544  }
545 
546  if ((*it)->GetLabel().GetStr() == "btop" &&
547  (*it)->GetData().IsStr()) {
548 
549  btop_string = (*it)->GetString();
550  }
551  else if ((*it)->GetLabel().GetStr() == "num_hits" &&
552  (*it)->GetData().IsInt()) {
553 
554  num_hits = (*it)->GetInt();
555  }
556  }
557  }
558 
559  // for alignments on the minus strand
560  if (align.GetSeqStrand(0) == eNa_strand_minus) {
561 
562  // reverse btop string
// The string is scanned from its end. Runs of digits (and ')') are copied in
// their original order, letter pairs are complemented, and every other
// character is copied as is.
563  string new_btop;
564  int i = btop_string.length() - 1;
565  bool intron = false;
566  while (i >= 0) {
567  int to = i;
568  while (i >= 0 && (isdigit(btop_string[i]) ||
569  btop_string[i] == ')')) {
570  i--;
571  }
572 
573  new_btop += btop_string.substr(i + 1, to - i);
574 
// each '^' toggles the intron state
575  if (i >= 0 && btop_string[i] == '^') {
576  intron = !intron;
577  }
578 
579  if (i > 0) {
580  if (isalpha(btop_string[i]) || btop_string[i] == '-') {
581 
582  if (intron) {
583  // if intron, reverse complement splice signals
584  new_btop += s_Complement(btop_string[i]);
585  new_btop += s_Complement(btop_string[i - 1]);
586  }
587  else {
588  // otherwise, complement bases in place
589  new_btop += s_Complement(btop_string[i - 1]);
590  new_btop += s_Complement(btop_string[i]);
591  }
// two characters were consumed
592  i--;
593  }
594  else {
595  new_btop += btop_string[i];
596  }
597  }
598 
599  i--;
600  }
601  btop_string.swap(new_btop);
602  }
603 
// composite score: this alignment's score plus the mate's score, if any
604  fragment_score = score;
605  if (mate) {
606  int mate_score = 0;
607  mate->GetNamedScore(CSeq_align::eScore_Score, mate_score);
608  fragment_score += mate_score;
609  }
610 
611  // report unaligned part of the query: 3' end and reverse complemented
612  // 5' end
613  string left_overhang = "-";
614  string right_overhang = "-";
// query_len is still 0 unless the Spliced branch above set it
615  if (query_len <= 0) {
616  query_len = bioseq.GetInst().GetLength();
617  }
618 
619  CRange<TSeqPos> range = align.GetSeqRange(0);
620  int from = range.GetFrom();
621  int to = range.GetToOpen();
622  if (align.GetSeqStrand(0) == eNa_strand_minus) {
623  from = query_len - range.GetToOpen();
624  to = query_len - range.GetFrom();
625  }
626 
// at most 30 bases are reported on each side
627  // reverse complemented 5' end
628  if (from > 0) {
629  CRange<TSeqPos> r(MAX(0, from - 30), from - 1);
630  left_overhang.clear();
631  s_GetQuerySequence(bioseq, r, true, left_overhang);
632  }
633 
634  // 3' end
635  if (to < query_len) {
636  CRange<TSeqPos> r(to, MIN(to + 30 - 1, query_len - 1));
637  right_overhang.clear();
638  s_GetQuerySequence(bioseq, r, false, right_overhang);
639  }
640 
641  ostr << sep << query_len
642  << sep << btop_string
643  << sep << num_hits
644  << sep << /*splice*/ "-"
645  << sep << batch_number << ":" << compartment
646  << sep << left_overhang
647  << sep << right_overhang;
648 
// mate columns: "-" is printed for the mate reference when both mates align
// to the same reference sequence
649  if (is_paired && mate) {
650  if (align.GetSeq_id(1).Match(mate->GetSeq_id(1))) {
651  ostr << sep << "-";
652  }
653  else {
654  ostr << sep << mate->GetSeq_id(1).AsFastaString();
655  }
656 
657  pair_start = mate->GetSeqStart(1) + 1;
658  //FIXME: for tests
659  if (mate->GetSeqStrand(0) == eNa_strand_minus) {
660  pair_start = mate->GetSeqStop(1) + 1;
661  }
// the mate start is printed negated when the mate that starts first on the
// reference is the one on the minus strand
662  if ((align.GetSeqStart(1) < mate->GetSeqStart(1) &&
663  align.GetSeqStrand(0) == eNa_strand_minus) ||
664  (mate->GetSeqStart(1) < align.GetSeqStart(1) &&
665  mate->GetSeqStrand(0) == eNa_strand_minus)) {
666 
667  pair_start = -pair_start;
668  }
669  ostr << sep << pair_start;
670 
671  }
672  else {
673  ostr << sep << "-" << sep << "-";
674  }
675 
676  ostr << sep << fragment_score;
677 
678  if (!user_tag.empty()) {
679  ostr << sep << user_tag;
680  }
681 
682  return ostr;
683 }
684 
685 
// Print a row of the tabular report for a read that has no alignment. Only
// the query id, the query length, the compartment column and the optional
// user tag carry information; all other columns are constant placeholders.
// No trailing newline is written. Returns the output stream.
//
// first_seg: for paired results selects the first read (GetQueryId()) when
//     true and the last read (GetLastId()) when false; non-paired results
//     always use GetQueryId()
//
// NOTE(review): listing line 686 (return type, function name and the stream
// parameter 'ostr') and line 746 (the declaration of 'info') are absent from
// this listing. The name is taken from the call in this file with the same
// argument list -- confirm against the real source.
687  const CMagicBlastResults& results,
688  const TQueryMap& queries,
689  bool first_seg,
690  const string& user_tag)
691 {
692  string sep = "\t";
693  CSeq_id id;
694  if (!results.IsPaired() || first_seg) {
695  id.Set(results.GetQueryId().AsFastaString());
696  }
697  else {
698  id.Set(results.GetLastId().AsFastaString());
699  }
700  const CBioseq& bioseq = s_GetQueryBioseq(queries, id);
701 
702  // query
703  ostr << s_GetSequenceId(bioseq) << sep;
704 
705  // subject
706  ostr << "-" << sep;
707 
708  // percent identity
709  ostr << 0.0 << sep;
710 
711  ostr << 0 << sep; // length
712  ostr << 0 << sep; // mismatch
713  ostr << 0 << sep; // gapopen
714 
715  // query start and stop
716  ostr << 0 << sep << 0 << sep;
717 
718  // subject start and stop
719  ostr << 0 << sep << 0 << sep;
720 
721  ostr << 0 << sep; // e-value
722  ostr << 99 << sep; // bit score
723 
// score
724  ostr << 0 << sep;
725 
726  // query and subject strand
727  ostr << "-" << sep << "-" << sep;
728 
729  // query length
730  int query_len = bioseq.GetInst().GetLength();
731 
732  ostr << query_len << sep;
733 
734  // btop string
735  ostr << "-" << sep;
736 
737  // number of placements
738  ostr << 0 << sep;
739 
740  // splice
741  ostr << "-" << sep;
742 
743  // compartment
744  string compart = "-";
745  // if a read did not pass filtering
// "F" is printed in the compartment column for such reads
747  first_seg ? results.GetFirstInfo() : results.GetLastInfo();
748  if ((info & CMagicBlastResults::fFiltered) != 0) {
749  compart = "F";
750  }
751  ostr << compart << sep;
752 
753  // left overhang
754  ostr << "-" << sep;
755 
756  // right overhang
757  ostr << "-" << sep;
758 
759  // mate reference
760  ostr << "-" << sep;
761 
762  // mate start position
763  ostr << "-" << sep;
764 
765  // composite score
766  ostr << 0;
767 
768  if (!user_tag.empty()) {
769  ostr << sep << user_tag;
770  }
771 
772  return ostr;
773 }
774 
// Print all alignments found for one query (or one pair) in tabular format
// and, if requested, report its unaligned reads.
//
// unaligned_ostr, unaligned_fmt: stream and format for unaligned reads
// compartment: running counter, incremented once per printed alignment
// print_unaligned: if false, nothing is written to unaligned_ostr
// no_discordant: if true, alignments of non-concordant results are not
//     printed and both reads are reported as unaligned instead
// Returns 'ostr'.
//
// NOTE(review): listing line 776 (return type, function name and the
// parameter 'ostr') is absent from this listing; the name PrintTabular is
// taken from the call in the result-set overload -- confirm against the real
// source.
775 static
777  CNcbiOstream& unaligned_ostr,
778  CFormattingArgs::EOutputFormat unaligned_fmt,
779  const CMagicBlastResults& results,
780  const TQueryMap& queries,
781  bool is_paired, int batch_number,
782  int& compartment,
783  bool trim_read_id,
784  bool print_unaligned,
785  bool no_discordant,
786  const string& user_tag)
787 {
788  bool is_concordant = results.IsConcordant();
789 
// alignments are printed unless discordant ones were excluded and this
// result is not concordant
790  if (!no_discordant || (no_discordant && is_concordant)) {
791  for (auto it: results.GetSeqAlign()->Get()) {
792  PrintTabular(ostr, *it, queries, is_paired, batch_number,
793  compartment++, user_tag);
794  ostr << endl;
795  }
796  }
797 
798  if (!print_unaligned) {
799  return ostr;
800  }
801 
// first read: unaligned, or its alignments were suppressed above
802  if ((results.GetFirstInfo() & CMagicBlastResults::fUnaligned) != 0 ||
803  (no_discordant && !is_concordant)) {
804 
805  PrintUnaligned(unaligned_ostr, unaligned_fmt, results, queries, true,
806  trim_read_id, user_tag);
807  unaligned_ostr << endl;
808  }
809 
// last read of a pair, same conditions
810  if (results.IsPaired() &&
811  ((results.GetLastInfo() & CMagicBlastResults::fUnaligned) != 0 ||
812  (no_discordant && !is_concordant))) {
813 
814  PrintUnaligned(unaligned_ostr, unaligned_fmt, results, queries, false,
815  trim_read_id, user_tag);
816  unaligned_ostr << endl;
817  }
818 
819  return ostr;
820 }
821 
822 
// Print the results for a whole batch of queries in tabular format. The
// query lookup table is built from query_batch, then the per-result overload
// is called for each element of 'results'. The compartment counter starts at
// 0 for the batch and is shared by all results. Returns 'ostr'.
//
// NOTE(review): listing line 823 (return type, function name and the
// parameter 'ostr') is absent from this listing; the name PrintTabular is
// presumed from the overload called in the body -- confirm against the real
// source.
824  CNcbiOstream& unaligned_ostr,
825  CFormattingArgs::EOutputFormat unaligned_fmt,
826  const CMagicBlastResultSet& results,
827  const CBioseq_set& query_batch,
828  bool is_paired, int batch_number,
829  bool trim_read_id,
830  bool print_unaligned,
831  bool no_discordant,
832  const string& user_tag)
833 {
834  TQueryMap queries;
835  s_CreateQueryMap(query_batch, queries);
836 
837  int compartment = 0;
838  for (auto it: results) {
839  PrintTabular(ostr, unaligned_ostr, unaligned_fmt, *it, queries,
840  is_paired, batch_number, compartment, trim_read_id,
841  print_unaligned, no_discordant, user_tag);
842  }
843 
844  return ostr;
845 }
846 
847 
// Print the SAM header: an @HD line, one @SQ line (name, length and, when
// available, taxonomy ids) for every sequence returned by the sequence
// source iterator, and a @PG line with the command line. Returns the output
// stream.
//
// NOTE(review): listing line 848 (return type, function name and the stream
// parameter 'ostr'), lines 863-864 (presumably the creation of the iterator
// 'it' used in the loop) and line 891 (presumably its release) are absent
// from this listing; the name PrintSAMHeader is presumed from the content --
// confirm against the real source.
849  CRef<CLocalDbAdapter> db_adapter,
850  const string& cmd_line_args)
851 {
852  BlastSeqSrc* seq_src = db_adapter->MakeSeqSrc();
853  IBlastSeqInfoSrc* seqinfo_src = db_adapter->MakeSeqInfoSrc();
854  _ASSERT(seq_src && seqinfo_src);
855 
// taxonomy ids are looked up only when the subject is a BLAST database
856  CRef<CSeqDB> seqdb;
857  if (db_adapter->IsBlastDb()) {
858  seqdb.Reset(db_adapter->GetSearchDatabase()->GetSeqDb());
859  }
860 
861  ostr << "@HD\t" << "VN:1.0\t" << "GO:query" << endl;
862 
865  CRef<CSeq_id> seqid(new CSeq_id);
866  Uint4 length;
867  Int4 oid;
868  while ((oid = BlastSeqSrcIteratorNext(seq_src, it)) != BLAST_SEQSRC_EOF) {
869  GetSequenceLengthAndId(seqinfo_src, oid, CSeq_id::BlastRank, seqid,
870  &length);
871 
872  ostr << "@SQ\t" << "SN:" << s_GetBareId(*seqid) << "\tLN:" << length;
873 
874  vector<TTaxId> taxids;
875  if (seqdb.NotEmpty()) {
876  seqdb->GetTaxIDs(oid, taxids);
877  }
878 
// SP: comma-separated list of taxonomy ids, skipped if the first id is 0
879  if (!taxids.empty() && taxids[0] != 0) {
880  ostr << "\tSP:";
881  for (vector<TTaxId>::iterator it = taxids.begin();
882  it != taxids.end(); ++it) {
883  if (it != taxids.begin()) {
884  ostr << ",";
885  }
886  ostr << *it;
887  }
888  }
889  ostr << endl;
890  }
892 
893  ostr << "@PG\tID:magicblast\tPN:magicblast\tCL:" << cmd_line_args << endl;
894 
895  return ostr;
896 }
897 
898 
899 // hash function for pointers to Seq_id
// The hash is computed from the FASTA string of the id the pointer refers
// to, not from the pointer value.
// NOTE(review): listing line 900 is absent from this listing; it is
// presumably "struct hash_seqid", the name used by the TSeq_idHashSet typedef
// in this file -- confirm against the real source.
901 {
902  size_t operator()(const CSeq_id* s) const {
903  std::hash<string> h;
904  return h(s->AsFastaString());
905  }
906 };
907 
908 
909 // equal_to function for pointers to Seq_id
910 struct eq_seqid
911 {
912  bool operator()(const CSeq_id* a, const CSeq_id* b) const {
913  return a->Match(*b);
914  }
915 };
916 
917 // hash_set of pointers to Seq_ids
// Elements are hashed with hash_seqid and compared with eq_seqid, i.e. by the
// ids they point to. The set stores plain pointers.
918 typedef unordered_set<const CSeq_id*, hash_seqid, eq_seqid> TSeq_idHashSet;
919 
920 
// Determine the orientation of the splice site between two exons from the
// splice signals recorded on them.
//
// exon: iterator to the exon before the intron (its donor signal is used)
// next_exon: iterator to the exon after the intron (its acceptor signal is
//     used)
// Returns eNa_strand_unknown when the exons are on different genomic
// strands, the genomic strand is unknown or a signal is not set; the genomic
// strand of 'exon' when the signals are recognised as they are; otherwise
// the value of 'result' set in the reverse-complement branch.
//
// NOTE(review): listing lines 925 (presumably the declaration of 'result'),
// 960-961 and 965-966 (presumably the starts of the two reverse-complement
// calls that fill rc_donor and rc_acceptor) and 976, 979, 982 and 988 (the
// statements inside the four branches near the end, presumably assignments
// to 'result') are absent from this listing -- confirm against the real
// source.
921 static ENa_strand
922 s_GetSpliceSiteOrientation(const CSpliced_seg::TExons::const_iterator& exon,
923  const CSpliced_seg::TExons::const_iterator& next_exon)
924 {
926 
927  // orientation is unknown if exons align on different strands or a exon's
928  // genomic strand is unknown
929  if ((*exon)->GetGenomic_strand() !=
930  (*next_exon)->GetGenomic_strand() ||
931  (*exon)->GetGenomic_strand() == eNa_strand_unknown) {
932 
933  return eNa_strand_unknown;
934  }
935 
936  // orientation is unknown if splice signal is not set
937  if (!(*exon)->IsSetDonor_after_exon() ||
938  !(*next_exon)->IsSetAcceptor_before_exon()) {
939 
940  return eNa_strand_unknown;
941  }
942 
943  // get splice signal
944  string donor = (*exon)->GetDonor_after_exon().GetBases();
945  string acceptor = (*next_exon)->GetAcceptor_before_exon().GetBases();
946 
947  // if the signal is recognised then the splice orientation is the same as
948  // genomic strand
949  if (IsConsensusSplice(donor, acceptor) ||
950  IsKnownNonConsensusSplice(donor, acceptor)) {
951 
952  result = (*exon)->GetGenomic_strand();
953  }
954  else {
955  // otherwise try to recognise reverse complemented splice signals
956 
957  string rc_donor;
958  string rc_acceptor;
959 
962  0, donor.length(),
963  rc_donor);
964 
967  0, acceptor.length(),
968  rc_acceptor);
969 
970  // if reverse complemented signals are recognised then splice
971  // orientation is opposite to genomic strand
// note the swapped roles: the reverse complemented acceptor is passed as the
// donor and the reverse complemented donor as the acceptor
972  if (IsConsensusSplice(rc_acceptor, rc_donor) ||
973  IsKnownNonConsensusSplice(rc_acceptor, rc_donor)) {
974 
975  if ((*exon)->GetGenomic_strand() == eNa_strand_plus) {
977  }
978  else if ((*exon)->GetGenomic_strand() == eNa_strand_minus) {
980  }
981  else {
983  }
984  }
985  else {
986  // if neither signals are recognised then splice orientation is
987  // unknown
989  }
990 
991  }
992 
993  return result;
994 }
995 
996 
// Bits of the FLAG column of a SAM record, combined with bitwise OR into the
// flag value printed by PrintSAM.
// NOTE(review): the values are presumably those defined by the SAM format
// specification -- confirm.
997 #define SAM_FLAG_MULTI_SEGMENTS 0x1
998 #define SAM_FLAG_SEGS_ALIGNED 0x2
999 #define SAM_FLAG_SEG_UNMAPPED 0x4
1000 #define SAM_FLAG_NEXT_SEG_UNMAPPED 0x8
1001 #define SAM_FLAG_SEQ_REVCOMP 0x10
1002 #define SAM_FLAG_NEXT_REVCOMP 0x20
1003 #define SAM_FLAG_FIRST_SEGMENT 0x40
1004 #define SAM_FLAG_LAST_SEGMENT 0x80
1005 #define SAM_FLAG_SECONDARY 0x100
1006 
1007 static
1009  const TQueryMap& queries,
1010  const BlastQueryInfo* query_info,
1011  bool is_spliced,
1012  int batch_number, bool& first_secondary,
1013  bool& last_secondary, bool trim_read_ids,
1014  E_StrandSpecificity strand_specific,
1015  bool only_specific,
1016  bool print_md_tag,
1017  bool other = false,
1018  const string& user_tag = "",
1019  const CSeq_align* mate = NULL)
1020 {
1021  string sep = "\t";
1022 
1023  string btop_string;
1024  string md_tag;
1025  int query_len = 0;
1026  int num_hits = 0;
1027  int context = -1;
1028  int sam_flags = 0;
1029  const int kMaxInsertSize = is_spliced ?
1032 
1033  // if paired alignment
1034  if (align.GetSegs().IsDisc()) {
1035 
1036  _ASSERT(align.GetSegs().GetDisc().Get().size() == 2);
1037 
1038  const CSeq_align_set& disc = align.GetSegs().GetDisc();
1039  CSeq_align_set::Tdata::const_iterator first = disc.Get().begin();
1040  _ASSERT(first != disc.Get().end());
1041  CSeq_align_set::Tdata::const_iterator second(first);
1042  ++second;
1043  _ASSERT(second != disc.Get().end());
1044 
1045  PrintSAM(ostr, **first, queries, query_info, is_spliced,
1046  batch_number, first_secondary, last_secondary,
1047  trim_read_ids, strand_specific, only_specific,
1048  print_md_tag, false, user_tag,
1049  second->GetNonNullPointer());
1050  ostr << endl;
1051 
1052  PrintSAM(ostr, **second, queries, query_info, is_spliced,
1053  batch_number, first_secondary, last_secondary,
1054  trim_read_ids, strand_specific, only_specific,
1055  print_md_tag, true, user_tag,
1056  first->GetNonNullPointer());
1057 
1058  return ostr;
1059  }
1060 
1061  // get align data saved in the user object
1062  CConstRef<CUser_object> ext = align.FindExt("Mapper Info");
1063  if (ext.NotEmpty()) {
1064 
1065  ITERATE (CUser_object::TData, it, ext->GetData()) {
1066  if (!(*it)->GetLabel().IsStr()) {
1067  continue;
1068  }
1069 
1070  if ((*it)->GetLabel().GetStr() == "btop" &&
1071  (*it)->GetData().IsStr()) {
1072 
1073  btop_string = (*it)->GetString();
1074  }
1075  else if ((*it)->GetLabel().GetStr() == "num_hits" &&
1076  (*it)->GetData().IsInt()) {
1077 
1078  num_hits = (*it)->GetInt();
1079  }
1080  else if ((*it)->GetLabel().GetStr() == "context" &&
1081  (*it)->GetData().IsInt()) {
1082 
1083  context = (*it)->GetInt();
1084  }
1085  else if ((*it)->GetLabel().GetStr() == "md_tag" &&
1086  (*it)->GetData().IsStr()) {
1087 
1088  md_tag = (*it)->GetString();
1089  }
1090  }
1091 
1092  }
1093 
1094  vector<ENa_strand> orientation;
1095  if (align.GetSegs().Which() == CSeq_align::TSegs::e_Spliced) {
1096  const CSpliced_seg& spliced = align.GetSegs().GetSpliced();
1097 
1098  query_len = spliced.GetProduct_length();
1099  }
1100 
1101  // observed template length
1102  int template_length = 0;
1103  CRange<TSeqPos> range = align.GetSeqRange(1);
1104  if (mate && align.GetSeq_id(1).Match(mate->GetSeq_id(1))) {
1105  CRange<TSeqPos> mate_range = mate->GetSeqRange(1);
1106  if (align.GetSeqStrand(0) == eNa_strand_plus &&
1107  align.GetSeqStrand(1) == eNa_strand_plus) {
1108 
1109  template_length = (int)mate_range.GetTo() - (int)range.GetFrom() + 1;
1110  }
1111  else {
1112  template_length =
1113  -((int)range.GetTo() - (int)mate_range.GetFrom() + 1);
1114  }
1115  }
1116 
1117 
1118  // FIXME: if subject is on a minus strand we need to reverse
1119  // complement both
1120  if (align.GetSeqStrand(0) == eNa_strand_minus) {
1121  sam_flags |= SAM_FLAG_SEQ_REVCOMP;
1122  }
1123 
1124  if (context >= 0 && query_info->contexts[context].segment_flags != 0) {
1125  sam_flags |= SAM_FLAG_MULTI_SEGMENTS;
1126 
1127  if ((query_info->contexts[context].segment_flags & fFirstSegmentFlag)
1128  != 0) {
1129  sam_flags |= SAM_FLAG_FIRST_SEGMENT;
1130  }
1131 
1132  if ((query_info->contexts[context].segment_flags & fLastSegmentFlag)
1133  != 0) {
1134  sam_flags |= SAM_FLAG_LAST_SEGMENT;
1135  }
1136 
1137  if ((query_info->contexts[context].segment_flags & fPartialFlag) != 0
1138  || !mate) {
1139 
1140  sam_flags |= SAM_FLAG_NEXT_SEG_UNMAPPED;
1141  }
1142 
1143  if (mate) {
1144  // FIXME: it is assumed that subject is always in plus strand
1145  // (BLAST way)
1146  ENa_strand a_strand = align.GetSeqStrand(0);
1147  ENa_strand m_strand = mate->GetSeqStrand(0);
1148  bool plus_minus =
1149  a_strand == eNa_strand_plus && m_strand == eNa_strand_minus;
1150  bool minus_plus =
1151  a_strand == eNa_strand_minus && m_strand == eNa_strand_plus;
1152  TSeqPos a_start = align.GetSeqStart(1);
1153  TSeqPos m_start = mate->GetSeqStart(1);
1154 
1155  // For strand specific output we reset SAM_FLAG_SEGS_ALIGNED
1156  // for paired alignments with the wrong configuration
1157  if (strand_specific != eNonSpecific) {
1158  // In this statement <bool1> != <bool2> is equivalent to
1159  // EXCLUSIVE-OR.
1160  // If <bool2> is false, conditional returns <bool1>.
1161  // If <bool2> is true, conditional returns <bool1> inverted.
1162  // So if "other" is true, actions based on "plus_minus"
1163  // and "minus_plus" are reversed.
1164  if (((strand_specific == eFwdRev && plus_minus != other)
1165  || (strand_specific == eRevFwd && minus_plus != other))
1166  && template_length < kMaxInsertSize) {
1167 
1168  sam_flags |= SAM_FLAG_SEGS_ALIGNED;
1169  }
1170  } else {
1171  if (((a_start <= m_start && plus_minus)
1172  || (m_start <= a_start && minus_plus))
1173  && abs(template_length) < kMaxInsertSize) {
1174  sam_flags |= SAM_FLAG_SEGS_ALIGNED;
1175  }
1176  }
1177 
1178  if (mate->GetSeqStrand(0) == eNa_strand_minus) {
1179  sam_flags |= SAM_FLAG_NEXT_REVCOMP;
1180  }
1181  }
1182  }
1183 
1184  // set secondary alignment bit
1185  if ((sam_flags & SAM_FLAG_FIRST_SEGMENT) != 0) {
1186  if (first_secondary) {
1187  sam_flags |= SAM_FLAG_SECONDARY;
1188  }
1189  else {
1190  first_secondary = true;
1191  }
1192  }
1193  else {
1194  if (last_secondary) {
1195  sam_flags |= SAM_FLAG_SECONDARY;
1196  }
1197  else {
1198  last_secondary = true;
1199  }
1200  }
1201 
1202  // read id
1203  const CBioseq& bioseq = s_GetQueryBioseq(queries, align.GetSeq_id(0));
1204  string read_id = s_GetSequenceId(bioseq);
1205  if (trim_read_ids &&
1206  (NStr::EndsWith(read_id, ".1") || NStr::EndsWith(read_id, ".2") ||
1207  NStr::EndsWith(read_id, "/1") || NStr::EndsWith(read_id, "/2"))) {
1208 
1209  read_id.resize(read_id.length() - 2);
1210  }
1211  ostr << read_id << sep;
1212 
1213  // flag
1214  ostr << sam_flags << sep;
1215 
1216  // reference sequence id
1217  ostr << s_GetBareId(align.GetSeq_id(1)) << sep;
1218 
1219  // mapping position
1220  ostr << range.GetFrom() + 1 << sep;
1221 
1222  // mapping quality
1223  // 255 means MAPQ value unavailable
1224  int mapq = 255;
1225  // for single alignements, report 60 (like HISAT2)
1226  if (num_hits == 1) {
1227  mapq = 60;
1228  }
1229  else if (num_hits > 1) {
1230  // MAPQ value for more than one alignment (like TopHat2 and STAR)
1231  mapq = (int)((-10.0 * log10(1.0 - 1.0 / (double) num_hits)) + 0.5);
1232  }
1233  ostr << mapq << sep;
1234 
1235  // CIGAR string
1236  string cigar;
1237  int edit_distance = 0;
1238  if (align.GetSegs().Which() == CSeq_align::TSegs::e_Denseg) {
1239  const CDense_seg& denseg = align.GetSegs().GetDenseg();
1240  const CDense_seg::TStarts& starts = denseg.GetStarts();
1241  const CDense_seg::TLens& lens = denseg.GetLens();
1242  CRange<TSeqPos> qrange = align.GetSeqRange(0);
1243 
1244  if (align.GetSeqStrand(0) == eNa_strand_plus) {
1245  if (qrange.GetFrom() > 0) {
1246  cigar += NStr::IntToString(qrange.GetFrom());
1247  cigar += "S";
1248  }
1249  }
1250  else {
1251  if ((int)qrange.GetToOpen() < query_len) {
1252  cigar += NStr::IntToString(query_len - qrange.GetToOpen());
1253  cigar += "S";
1254  }
1255  }
1256  for (size_t i=0;i < starts.size();i+=2) {
1257  cigar += NStr::IntToString(lens[i/2]);
1258  if (starts[i] >= 0 && starts[i + 1] >= 0) {
1259  cigar += "M";
1260  }
1261  else if (starts[i] < 0) {
1262  if (lens[i/2] < 10) {
1263  cigar += "D";
1264  }
1265  else {
1266  cigar += "N";
1267  }
1268  }
1269  else {
1270  cigar += "I";
1271  }
1272  }
1273  if (align.GetSeqStrand(0) == eNa_strand_plus) {
1274  if ((int)qrange.GetToOpen() < query_len) {
1275  cigar += NStr::IntToString(query_len - qrange.GetToOpen());
1276  cigar += "S";
1277  }
1278  }
1279  else {
1280  if (qrange.GetFrom() > 0) {
1281  cigar += NStr::IntToString(qrange.GetFrom());
1282  cigar += "S";
1283  }
1284  }
1285  }
1286  else if (align.GetSegs().Which() == CSeq_align::TSegs::e_Spliced) {
1287  const CSpliced_seg& spliced = align.GetSegs().GetSpliced();
1288  CRange<TSeqPos> qrange = align.GetSeqRange(0);
1289 
1290  if (qrange.GetFrom() > 0) {
1291  cigar += NStr::IntToString(qrange.GetFrom());
1292  cigar += "S";
1293  }
1294 
1295  ITERATE (CSpliced_seg::TExons, exon, spliced.GetExons()) {
1296  int num = 0;
1297  char op = 0;
1298  ITERATE(CSpliced_exon::TParts, it, (*exon)->GetParts()) {
1299  switch ((*it)->Which()) {
1301  if (op && op != 'M') {
1302  cigar += NStr::IntToString(num);
1303  cigar += op;
1304  num = 0;
1305  }
1306  num += (*it)->GetMatch();
1307  op = 'M';
1308  break;
1309 
1311  if (op && op != 'M') {
1312  cigar += NStr::IntToString(num);
1313  cigar += op;
1314  num = 0;
1315  }
1316  edit_distance += (*it)->GetMismatch();
1317  num += (*it)->GetMismatch();
1318  op = 'M';
1319  break;
1320 
1322  if (op && op != 'I') {
1323  cigar += NStr::IntToString(num);
1324  cigar += op;
1325  num = 0;
1326  }
1327  edit_distance += (*it)->GetProduct_ins();
1328  num += (*it)->GetProduct_ins();
1329  op = 'I';
1330  break;
1331 
1333  if (op && op != 'D') {
1334  cigar += NStr::IntToString(num);
1335  cigar += op;
1336  num = 0;
1337  }
1338  edit_distance += (*it)->GetGenomic_ins();
1339  num += (*it)->GetGenomic_ins();
1340  op = 'D';
1341  break;
1342 
1343  default:
1344  NCBI_THROW(CException, eInvalid, "Unsupported "
1345  "CSpliced_exon_chunk::TPart value");
1346  }
1347  }
1348  if (num > 0) {
1349  cigar += NStr::IntToString(num);
1350  cigar += op;
1351 
1352  }
1353 
1354  CSpliced_seg::TExons::const_iterator next_exon(exon);
1355  ++next_exon;
1356  if (next_exon != spliced.GetExons().end()) {
1357  int query_gap = (*next_exon)->GetProduct_start().GetNucpos() -
1358  (*exon)->GetProduct_end().GetNucpos() - 1;
1359  if (query_gap > 0) {
1360  cigar += NStr::IntToString(query_gap);
1361  cigar += "I";
1362  }
1363  edit_distance += query_gap;
1364 
1365  int intron = (*next_exon)->GetGenomic_start() -
1366  (*exon)->GetGenomic_end() - 1;
1367  if (intron > 0) {
1368  cigar += NStr::IntToString(intron);
1369  cigar += "N";
1370  }
1371 
1372  // get intron orientation
1373  orientation.push_back(
1374  s_GetSpliceSiteOrientation(exon, next_exon));
1375  }
1376  }
1377 
1378  if ((int)qrange.GetToOpen() < query_len) {
1379  cigar += NStr::IntToString(query_len - qrange.GetToOpen());
1380  cigar += "S";
1381  }
1382  }
1383  else {
1384  NCBI_THROW(CSeqalignException, eUnsupported, "The SAM formatter does "
1385  "does not support this alignment structure");
1386  }
1387 
1388  ostr << cigar << sep;
1389 
1390  // reference name of the mate
1391  if (mate) {
1392  if (align.GetSeq_id(1).Match(mate->GetSeq_id(1))) {
1393  ostr << "=";
1394  }
1395  else {
1396  ostr << s_GetBareId(mate->GetSeq_id(1));
1397  }
1398  }
1399  else {
1400  ostr << "*";
1401  }
1402  ostr << sep;
1403 
1404  // position of the mate
1405  if (mate) {
1406  ostr << MIN(mate->GetSeqStart(1), mate->GetSeqStop(1)) + 1;
1407  }
1408  else {
1409  ostr << "0";
1410  }
1411  ostr << sep;
1412 
1413  // observed template length
1414  ostr << template_length;
1415  ostr << sep;
1416 
1417  // read sequence
1418  string sequence;
1420  int status = s_GetQuerySequence(bioseq, r,
1421  (sam_flags & SAM_FLAG_SEQ_REVCOMP) != 0, sequence);
1422 
1423  if (!status && sequence.length() > 0) {
1424  ostr << sequence << sep;
1425  }
1426  else {
1427  ostr << "*" << sep;
1428  }
1429 
1430  // quality string
1431  string quality_str;
1432  bool has_qualities = s_GetQualityString(bioseq, quality_str);
1433  ostr << (has_qualities ? quality_str : "*") ;
1434 
1435  // optional fields
1436  // number of hits reported for the query
1437  ostr << sep << "NH:i:" << num_hits;
1438 
1439  // score
1440  int score = 0;
1442  ostr << sep << "AS:i:" << score;
1443 
1444  // edit distance
1445  ostr << sep << "NM:i:" << edit_distance;
1446 
1447  // splice site orientation
1448  // The final splice orientation is positive or negative, if all introns in
1449  // the alignment have the same orientation, or unknown if orientation
1450  // changes.
1451  if (!orientation.empty()) {
1452  char ori;
1453 
1454  switch (orientation[0]) {
1455  case eNa_strand_plus:
1456  ori = '+';
1457  break;
1458 
1459  case eNa_strand_minus:
1460  ori = '-';
1461  break;
1462 
1463  default:
1464  ori = '?';
1465  }
1466 
1467  for (size_t i=1;i < orientation.size();i++) {
1468  if (orientation[i] != orientation[0]) {
1469  ori = '?';
1470  }
1471  }
1472 
1473  ostr << sep << "XS:A:" << ori;
1474  }
1475 
1476  // MD tag in Seq-align has long subject gaps (deletions) encoded as
1477  // !<gap length>!. 'x' is printed for each deleted base, because we do not
1478  // have access to subject sequence.
1479  if (print_md_tag && !md_tag.empty()) {
1480  vector<string> tokens;
1481  NStr::Split(md_tag, "!", tokens);
1482 
1483  ostr << sep << "MD:Z:";
1484  size_t i = 0;
1485  for (;i < tokens.size();i+=2) {
1486  ostr << tokens[i];
1487 
1488  if (i < tokens.size() - 1) {
1489  int num = NStr::StringToInt(tokens[i + 1]);
1490  _ASSERT(num > 0);
1491  ostr << "^";
1492  for (int k=0;k < num;k++) {
1493  ostr << "x";
1494  }
1495  }
1496  }
1497  }
1498 
1499  if (!user_tag.empty()) {
1500  ostr << sep << "XU:Z:" << user_tag;
1501  }
1502 
1503  return ostr;
1504 }
1505 
1506 
// PrintSAMUnaligned: writes one SAM record for a read without a reported
// alignment and returns the output stream.
// NOTE(review): the line that opens this definition (return type, function
// name and the leading output stream parameter `ostr`) is not present in this
// listing; the parameters below continue that signature.
//   results       - Magic-BLAST results for a single read or a pair of reads
//   queries       - map from which s_GetQueryBioseq() retrieves the read
//   first_seg     - true to report the first read, false to report the mate
//   trim_read_ids - if true, a trailing ".1", ".2", "/1" or "/2" is removed
//                   from the printed read id
//   user_tag      - if non-empty, appended as the optional XU:Z: field
1508  const CMagicBlastResults& results,
1509  const TQueryMap& queries,
1510  bool first_seg,
1511  bool trim_read_ids,
1512  const string& user_tag)
1513 {
// SAM fields are separated by tabs
1514  string sep = "\t";
1515 
// Select the id of the read to report: the query id for unpaired results or
// for the first segment, otherwise the id of the last read of the pair.
1516  CSeq_id id;
1517  if (!results.IsPaired() || first_seg) {
1518  id.Set(results.GetQueryId().AsFastaString());
1519  }
1520  else {
1521  id.Set(results.GetLastId().AsFastaString());
1522  }
1523 
1524  // read id (optionally without the two-character mate suffix)
1525  const CBioseq& bioseq = s_GetQueryBioseq(queries, id);
1526  string read_id = s_GetSequenceId(bioseq);
1527  if (trim_read_ids &&
1528  (NStr::EndsWith(read_id, ".1") || NStr::EndsWith(read_id, ".2") ||
1529  NStr::EndsWith(read_id, "/1") || NStr::EndsWith(read_id, "/2"))) {
1530 
1531  read_id.resize(read_id.length() - 2);
1532  }
1533  ostr << read_id << sep;
1534 
1535  // SAM flags
// NOTE(review): the statements that declare and update `flags` are missing
// from this listing; only the conditions that guard them are visible. The
// conditions distinguish paired results, an unaligned mate, and first versus
// last segment.
1537  if (results.IsPaired()) {
1539  if ((first_seg && !results.LastAligned()) ||
1540  (!first_seg && !results.FirstAligned())) {
1541 
1543  }
1544 
1545  if (first_seg) {
1547  }
1548  else {
1550  }
1551  }
1552  ostr << flags << sep;
1553 
// The next seven fields are written as fixed placeholders ("*" or "0").
1554  // reference sequence id
1555  ostr << "*" << sep;
1556 
1557  // mapping position
1558  ostr << "0" << sep;
1559 
1560  // mapping quality
1561  ostr << "0" << sep;
1562 
1563  // CIGAR
1564  ostr << "*" << sep;
1565 
1566  // mate reference sequence id
1567  ostr << "*" << sep;
1568 
1569  // mate position
1570  ostr << "0" << sep;
1571 
1572  // template length
1573  ostr << "0" << sep;
1574 
1575  // sequence
// NOTE(review): the declaration of `range` is missing from this listing.
// The read is requested without reverse complementing (third argument is
// false); "*" is printed when the call reports a non-zero status or returns
// an empty string.
1576  string sequence;
1578  int status = s_GetQuerySequence(bioseq, range, false, sequence);
1579  if (status || sequence.empty()) {
1580  ostr << "*" << sep;
1581  }
1582  else {
1583  ostr << sequence << sep;
1584  }
1585 
1586  // quality string ("*" when s_GetQualityString() returns false)
1587  string quality_str;
1588  bool has_qualities = s_GetQualityString(bioseq, quality_str);
1589  ostr << (has_qualities ? quality_str : "*");
1590 
1591  // read did not pass filtering
// NOTE(review): the first line of the `info` declaration is missing from this
// listing; the value comes from GetFirstInfo() or GetLastInfo() depending on
// first_seg. The YF:Z:F field is added when the fFiltered bit is set.
1593  first_seg ? results.GetFirstInfo() : results.GetLastInfo();
1594  if ((info & CMagicBlastResults::fFiltered) != 0) {
1595  ostr << sep << "YF:Z:F";
1596  }
1597 
// optional user tag
1598  if (!user_tag.empty()) {
1599  ostr << sep << "XU:Z:" << user_tag;
1600  }
1601 
1602  return ostr;
1603 }
1604 
// Per-read SAM printer: writes one SAM record per alignment stored in
// `results` to `ostr` and, when requested, reports unaligned reads to
// `unaligned_ostr`. Returns `ostr`.
// NOTE(review): the line carrying the return type, the function name and the
// first parameter `ostr` is missing from this listing; judging by the call in
// the batch-level printer this is presumably a PrintSAM overload - confirm.
//   unaligned_ostr  - stream that receives unaligned reads
//   unaligned_fmt   - output format handed to PrintUnaligned()
//   results         - Magic-BLAST results for a single read or a read pair
//   queries         - map used to look up query sequences
//   query_info, is_spliced, batch_number, only_specific, print_md_tag
//                   - forwarded unchanged to the per-alignment PrintSAM()
//   trim_read_id    - forwarded to the per-alignment and unaligned printers
//   print_unaligned - if false, the function returns after the alignments
//   no_discordant   - if true, alignments of a pair that is not concordant
//                     are not printed and both reads become candidates for
//                     the unaligned report
//   strand_specific - forwarded to the per-alignment PrintSAM()
//   user_tag        - forwarded to the per-alignment and unaligned printers
1605 static
1607  CNcbiOstream& unaligned_ostr,
1608  CFormattingArgs::EOutputFormat unaligned_fmt,
1609  CMagicBlastResults& results,
1610  const TQueryMap& queries,
1611  const BlastQueryInfo* query_info,
1612  bool is_spliced, int batch_number,
1613  bool trim_read_id, bool print_unaligned,
1614  bool no_discordant, E_StrandSpecificity strand_specific,
1615  bool only_specific,
1616  bool print_md_tag,
1617  const string& user_tag)
1618 {
// State shared by all alignments printed below; the per-alignment PrintSAM()
// takes both by non-const reference.
1619  bool first_secondary = false;
1620  bool last_secondary = false;
1621 
// NOTE(review): the bodies of both strand-specific branches are missing from
// this listing, so their effect cannot be described here.
1622  if (strand_specific == eFwdRev) {
1624  }
1625  else if (strand_specific == eRevFwd) {
1627  }
1628 
1629  // Is the pair aligned concordantly? (Unpaired are treated as concordant.)
1630  bool is_concordant = results.IsConcordant();
1631 
// Print alignments unless discordant pairs are suppressed and this pair is
// not concordant (the condition reduces to !no_discordant || is_concordant).
// NOTE(review): endl flushes the stream after every record.
1632  if (!no_discordant || (no_discordant && is_concordant)) {
1633  for (auto it: results.GetSeqAlign()->Get()) {
1634  PrintSAM(ostr, *it, queries, query_info, is_spliced, batch_number,
1635  first_secondary, last_secondary, trim_read_id,
1636  strand_specific, only_specific, print_md_tag, false,
1637  user_tag);
1638  ostr << endl;
1639  }
1640  }
1641 
1642  if (!print_unaligned) {
1643  return ostr;
1644  }
1645 
// Report the first read as unaligned when it is flagged fUnaligned, or when
// its alignments were suppressed above because the pair is not concordant.
1646  if ((results.GetFirstInfo() & CMagicBlastResults::fUnaligned) != 0 ||
1647  (no_discordant && !is_concordant)) {
1648 
1649  PrintUnaligned(unaligned_ostr, unaligned_fmt, results, queries, true,
1650  trim_read_id, user_tag);
1651  unaligned_ostr << endl;
1652  }
1653 
// Same test for the last read; it only applies to paired results.
1654  if (results.IsPaired() &&
1655  ((results.GetLastInfo() & CMagicBlastResults::fUnaligned) != 0 ||
1656  (no_discordant && !is_concordant))) {
1657  PrintUnaligned(unaligned_ostr, unaligned_fmt, results, queries, false,
1658  trim_read_id, user_tag);
1659  unaligned_ostr << endl;
1660  }
1661 
1662  return ostr;
1663 }
1664 
1665 
// Batch-level SAM printer: builds a query map from `query_batch` and calls
// the per-read PrintSAM() for every element of `results`. Returns `ostr`.
// NOTE(review): the line carrying the return type, the function name and the
// first parameter `ostr` is missing from this listing.
// Every parameter other than `results` and `query_batch` is forwarded
// unchanged to the per-read printer.
1667  CNcbiOstream& unaligned_ostr,
1668  CFormattingArgs::EOutputFormat unaligned_fmt,
1669  const CMagicBlastResultSet& results,
1670  const CBioseq_set& query_batch,
1671  const BlastQueryInfo* query_info,
1672  bool is_spliced,
1673  int batch_number,
1674  bool trim_read_id,
1675  bool print_unaligned,
1676  bool no_discordant,
1677  E_StrandSpecificity strand_specific,
1678  bool only_specific,
1679  bool print_md_tag,
1680  const string& user_tag)
1681 {
// Query map built once per batch and shared by all per-read calls below
1682  TQueryMap bioseqs;
1683  s_CreateQueryMap(query_batch, bioseqs);
1684 
// One call per element of the result set
1685  for (auto it: results) {
1686  PrintSAM(ostr, unaligned_ostr, unaligned_fmt, *it, bioseqs, query_info,
1687  is_spliced, batch_number, trim_read_id, print_unaligned,
1688  no_discordant, strand_specific, only_specific, print_md_tag,
1689  user_tag);
1690  }
1691 
1692  return ostr;
1693 }
1694 
1695 
1696 CNcbiOstream& PrintASN1(CNcbiOstream& ostr, const CBioseq_set& query_batch,
1697  CSeq_align_set& aligns)
1698 {
1699  TQueryMap queries;
1700  s_CreateQueryMap(query_batch, queries);
1701 
1702  for (auto it: aligns.Set()) {
1703  if (it->GetSegs().Which() != CSeq_align::TSegs::e_Spliced) {
1704  continue;
1705  }
1706 
1707  const CBioseq& bioseq = s_GetQueryBioseq(queries, it->GetSeq_id(0));
1708  CRef<CSeq_id> seqid;
1709  if (bioseq.IsSetDescr()) {
1710  for (auto it: bioseq.GetDescr().Get()) {
1711  if (it->IsTitle()) {
1712  vector<string> tokens;
1713  NStr::Split(it->GetTitle(), " ", tokens);
1714  seqid.Reset(new CSeq_id(CSeq_id::e_Local, tokens[0]));
1715  }
1716  }
1717  }
1718 
1719  if (seqid.NotEmpty()) {
1720  it->SetSegs().SetSpliced().SetProduct_id(*seqid);
1721  }
1722  }
1723 
1724  ostr << MSerial_AsnText << aligns;
1725 
1726  return ostr;
1727 }
1728 
1729 
1730 END_SCOPE(blast)
1732 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
Produce formatted blast output for command line applications.
Interface for reading SRA sequences into blast input.
Routines for creating nucleotide BLAST lookup tables.
#define MAGICBLAST_MAX_INSERT_SIZE_NONSPLICED
#define MAGICBLAST_MAX_INSERT_SIZE_SPLICED
Default maximum insert size: distance on the subject between reads that belong to a pair,...
@ fPartialFlag
The other segment is not present (did not pass quality filtering).
@ fFirstSegmentFlag
The first sequence of a pair.
@ fLastSegmentFlag
The last sequence of a pair.
Declarations of auxiliary functions using IBlastSeqInfoSrc to retrieve ids and related sequence infor...
Int4 BlastSeqSrcIteratorNext(const BlastSeqSrc *seq_src, BlastSeqSrcIterator *itr)
Increments the BlastSeqSrcIterator.
Definition: blast_seqsrc.c:425
BlastSeqSrcIterator * BlastSeqSrcIteratorFree(BlastSeqSrcIterator *itr)
Frees the BlastSeqSrcIterator structure.
Definition: blast_seqsrc.c:412
BlastSeqSrcIterator * BlastSeqSrcIteratorNew(void)
Allocate and initialize an iterator over a BlastSeqSrc with a default chunk size for MT-safe iteratio...
Definition: blast_seqsrc.c:380
#define BLAST_SEQSRC_EOF
No more sequences available.
Definition: blast_seqsrc.h:292
void BlastSeqSrcResetChunkIterator(BlastSeqSrc *seq_src)
Reset the internal "bookmark" of the last chunk for iteration provided by this object.
Definition: blast_seqsrc.c:436
const CSeq_id * GetFirstId() const
Definition: Bioseq.cpp:271
Definition: Dbtag.hpp:53
EOutputFormat
Defines the output formats supported by our command line formatter.
@ eTabular
Tabular output.
@ eFasta
unaligned reads in magicblast
Results of Magic-BLAST mapping.
Definition: magicblast.hpp:241
Magic-BLAST results for a single query/read or a pair of reads.
Definition: magicblast.hpp:137
static SIZE_TYPE Convert(const CTempString &src, TCoding src_coding, TSeqPos pos, TSeqPos length, string &dst, TCoding dst_coding)
void GetTaxIDs(int oid, map< TGi, TTaxId > &gi_to_taxid, bool persist=false) const
Get taxid for an OID.
Definition: seqdb.cpp:441
static SIZE_TYPE ReverseComplement(const string &src, TCoding src_coding, TSeqPos pos, TSeqPos length, string &dst)
@ e_Ncbi8na
Definition: sequtil.hpp:52
@ e_Iupacna
Definition: sequtil.hpp:47
@ e_Ncbi4na
Definition: sequtil.hpp:50
@ e_Ncbi2na
Definition: sequtil.hpp:48
@ eScore_PercentIdentity_Gapped
Definition: Seq_align.hpp:163
CRange< TSeqPos > GetSeqRange(TDim row) const
GetSeqRange NB: On a Spliced-seg, in case the product-type is protein, these only return the amin par...
Definition: Seq_align.cpp:153
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
Definition: Seq_align.cpp:317
bool GetNamedScore(const string &id, int &score) const
Get score.
Definition: Seq_align.cpp:563
TSeqPos GetSeqStart(TDim row) const
Definition: Seq_align.cpp:252
ENa_strand GetSeqStrand(TDim row) const
Get strand (the first one if segments have different strands).
Definition: Seq_align.cpp:294
CConstRef< CUser_object > FindExt(const string &ext_type) const
Find extension by type in ext container.
Definition: Seq_align.cpp:2274
Abstract base class to encapsulate retrieval of sequence identifiers.
bool IsConsensusSplice(const string &splice5, const string &splice3)
Consensus splice is GY..AG or AT..AC.
bool IsKnownNonConsensusSplice(const string &splice5, const string &splice3)
GT-{TG,GG,AT,AA} or {GA,TT,AT,GG}-AG or AT-{AT,AA}.
static uch flags
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static char tmp[3200]
Definition: utf8.c:42
bool LastAligned(void) const
Is the mate aligned.
Definition: magicblast.hpp:204
CRef< CSeqDB > GetSeqDb() const
Obtain a reference to the database.
CConstRef< CSeq_align_set > GetSeqAlign(void) const
Get alignments.
Definition: magicblast.hpp:174
void SortAlignments(EOrdering order)
Sort alignments by selected criteria (pair configuration)
Definition: magicblast.cpp:640
BlastSeqSrc * MakeSeqSrc()
Retrieves or constructs the BlastSeqSrc.
bool IsConcordant(void) const
Are an aligned pair concordant?
Definition: magicblast.hpp:183
bool IsBlastDb() const
Returns true if this object represents a BLAST database.
bool FirstAligned(void) const
Is the query aligned.
Definition: magicblast.hpp:201
bool IsPaired(void) const
Are alignments computed for paired reads.
Definition: magicblast.hpp:180
IBlastSeqInfoSrc * MakeSeqInfoSrc()
Retrieves or constructs the IBlastSeqInfoSrc.
TResultsInfo GetLastInfo(void) const
Get alignment flags for the mate.
Definition: magicblast.hpp:189
TResultsInfo GetFirstInfo(void) const
Get alignment flags for the query.
Definition: magicblast.hpp:186
const CSeq_id & GetQueryId(void) const
Get query sequence id.
Definition: magicblast.hpp:192
CRef< CSearchDatabase > GetSearchDatabase()
void GetSequenceLengthAndId(const IBlastSeqInfoSrc *seqinfo_src, int oid, CRef< objects::CSeq_id > &seqid, TSeqPos *length)
Retrieves subject sequence Seq-id and length.
const CSeq_id & GetLastId(void) const
Get sequence id of the last sequence of a paired read.
Definition: magicblast.hpp:198
@ fUnaligned
Read is unaligned.
Definition: magicblast.hpp:143
@ fFiltered
Read did not pass quality filtering.
Definition: magicblast.hpp:146
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NULL
Definition: ncbistd.hpp:225
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
const TPrim & Get(void) const
Definition: serialbase.hpp:347
#define MSerial_AsnText
I/O stream manipulators –.
Definition: serialbase.hpp:696
const string AsFastaString(void) const
Definition: Seq_id.cpp:2266
string GetSeqIdString(bool with_version=false) const
Return seqid string with optional version for text seqid type.
Definition: Seq_id.cpp:2145
CSeq_id & Set(const CTempString &the_id, TParseFlags flags=fParse_AnyRaw)
Reassign based on flat specifications; arguments interpreted as with constructors.
Definition: Seq_id.cpp:2457
bool Match(const CSeq_id &sid2) const
Match() - TRUE if SeqIds are equivalent.
Definition: Seq_id.hpp:1065
static int BlastRank(const CRef< CSeq_id > &id)
Definition: Seq_id.hpp:782
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool NotEmpty(void) const THROWS_NONE
Check if CConstRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:1392
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
position_type GetToOpen(void) const
Definition: range.hpp:138
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3461
static bool EndsWith(const CTempString str, const CTempString end, ECase use_case=eCase)
Check if a string ends with a specified suffix value.
Definition: ncbistr.hpp:5430
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
bool IsStr(void) const
Check if variant Str is selected.
Definition: Object_id_.hpp:291
const TTag & GetTag(void) const
Get the Tag member data.
Definition: Dbtag_.hpp:267
bool IsId(void) const
Check if variant Id is selected.
Definition: Object_id_.hpp:264
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
const TData & GetData(void) const
Get the Data member data.
vector< CRef< CUser_field > > TData
TId GetId(void) const
Get the variant data.
Definition: Object_id_.hpp:270
const TDenseg & GetDenseg(void) const
Get the variant data.
Definition: Seq_align_.cpp:153
Tdata & Set(void)
Assign a value to data member.
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_align_.hpp:691
vector< TSeqPos > TLens
Definition: Dense_seg_.hpp:108
const TStarts & GetStarts(void) const
Get the Starts member data.
Definition: Dense_seg_.hpp:530
const TLens & GetLens(void) const
Get the Lens member data.
Definition: Dense_seg_.hpp:555
TProduct_length GetProduct_length(void) const
Get the Product_length member data.
vector< TSignedSeqPos > TStarts
Definition: Dense_seg_.hpp:107
const TSpliced & GetSpliced(void) const
Get the variant data.
Definition: Seq_align_.cpp:219
list< CRef< CSpliced_exon > > TExons
const TExons & GetExons(void) const
Get the Exons member data.
bool IsDisc(void) const
Check if variant Disc is selected.
Definition: Seq_align_.hpp:772
list< CRef< CSpliced_exon_chunk > > TParts
bool IsSetProduct_length(void) const
length of the product, in bases/residues from this (or from poly-a if present), a 3' unaligned length...
bool IsSpliced(void) const
Check if variant Spliced is selected.
Definition: Seq_align_.hpp:778
const TDisc & GetDisc(void) const
Get the variant data.
Definition: Seq_align_.cpp:197
const Tdata & Get(void) const
Get the member data.
const TSegs & GetSegs(void) const
Get the Segs member data.
Definition: Seq_align_.hpp:921
bool IsDenseg(void) const
Check if variant Denseg is selected.
Definition: Seq_align_.hpp:740
@ e_Product_ins
insertion in product sequence (i.e. gap in the genomic sequence)
@ e_Genomic_ins
insertion in genomic sequence (i.e. gap in the product sequence)
@ e_Match
both sequences represented, product and genomic sequences match
@ e_Mismatch
both sequences represented, product and genomic sequences do not match
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
@ eNa_strand_unknown
Definition: Na_strand_.hpp:65
@ e_Local
local use
Definition: Seq_id_.hpp:95
const TSeq & GetSeq(void) const
Get the variant data.
Definition: Seq_entry_.cpp:102
const TSeq_set & GetSeq_set(void) const
Get the Seq_set member data.
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
const TIupacna & GetIupacna(void) const
Get the variant data.
Definition: Seq_data_.hpp:510
bool IsSetAnnot(void) const
Check if a value has been assigned to Annot data member.
Definition: Bioseq_.hpp:354
const TAnnot & GetAnnot(void) const
Get the Annot member data.
Definition: Bioseq_.hpp:366
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
TLength GetLength(void) const
Get the Length member data.
Definition: Seq_inst_.hpp:659
bool IsSetDescr(void) const
descriptors Check if a value has been assigned to Descr data member.
Definition: Bioseq_.hpp:303
const TNcbi4na & GetNcbi4na(void) const
Get the variant data.
Definition: Seq_data_.hpp:570
const TNcbi2na & GetNcbi2na(void) const
Get the variant data.
Definition: Seq_data_.hpp:550
const TSeq_data & GetSeq_data(void) const
Get the Seq_data member data.
Definition: Seq_inst_.hpp:817
const TNcbi8na & GetNcbi8na(void) const
Get the variant data.
Definition: Seq_data_.hpp:590
const TDescr & GetDescr(void) const
Get the Descr member data.
Definition: Bioseq_.hpp:315
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_data_.hpp:475
@ e_Ncbi2na
2 bit nucleic acid code
Definition: Seq_data_.hpp:106
@ e_Iupacna
IUPAC 1 letter nuc acid code.
Definition: Seq_data_.hpp:104
@ e_Ncbi8na
8 bit extended nucleic acid code
Definition: Seq_data_.hpp:108
@ e_Ncbi4na
4 bit nucleic acid code
Definition: Seq_data_.hpp:107
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
Declares CMagicBlast, the C++ API for the BLAST RNA-Seq mapping engine.
USING_SCOPE(objects)
#define SAM_FLAG_SECONDARY
#define SAM_FLAG_FIRST_SEGMENT
static CNcbiOstream & PrintTabularUnaligned(CNcbiOstream &ostr, const CMagicBlastResults &results, const TQueryMap &queries, bool first_seg, const string &user_tag)
unordered_map< string, CRef< CSeq_entry > > TQueryMap
#define SAM_FLAG_SEGS_ALIGNED
static const CBioseq & s_GetQueryBioseq(const TQueryMap &queries, const CSeq_id &seqid)
static string s_GetSequenceId(const CBioseq &bioseq)
CNcbiOstream & PrintSAMHeader(CNcbiOstream &ostr, CRef< CLocalDbAdapter > db_adapter, const string &cmd_line_args)
CNcbiOstream & PrintASN1(CNcbiOstream &ostr, const CBioseq_set &query_batch, CSeq_align_set &aligns)
static CNcbiOstream & PrintSAMUnaligned(CNcbiOstream &ostr, const CMagicBlastResults &results, const TQueryMap &queries, bool first_seg, bool trim_read_ids, const string &user_tag)
unordered_set< const CSeq_id *, hash_seqid, eq_seqid > TSeq_idHashSet
#define SAM_FLAG_SEQ_REVCOMP
#define SAM_FLAG_NEXT_REVCOMP
#define SAM_FLAG_LAST_SEGMENT
static bool s_GetQualityString(const CBioseq &bioseq, string &quality_str)
CNcbiOstream & PrintTabularHeader(CNcbiOstream &ostr, const string &version, const string &cmd_line_args, bool user_tag)
static int s_GetQuerySequence(const CBioseq &bioseq, const CRange< TSeqPos > &range, bool reverse_complement, string &sequence)
static CNcbiOstream & PrintTabular(CNcbiOstream &ostr, const CSeq_align &align, const TQueryMap &queries, bool is_paired, int batch_number, int compartment, const string &user_tag, const CSeq_align *mate=NULL)
static CNcbiOstream & PrintUnaligned(CNcbiOstream &ostr, CFormattingArgs::EOutputFormat fmt, const CMagicBlastResults &results, const TQueryMap &queries, bool first_seg, bool trim_read_ids, const string &user_tag)
static string s_GetFastaDefline(const CBioseq &bioseq)
static CNcbiOstream & PrintFastaUnaligned(CNcbiOstream &ostr, const CMagicBlastResults &results, const TQueryMap &queries, bool first_seg)
static ENa_strand s_GetSpliceSiteOrientation(const CSpliced_seg::TExons::const_iterator &exon, const CSpliced_seg::TExons::const_iterator &next_exon)
static CNcbiOstream & PrintSAM(CNcbiOstream &ostr, const CSeq_align &align, const TQueryMap &queries, const BlastQueryInfo *query_info, bool is_spliced, int batch_number, bool &first_secondary, bool &last_secondary, bool trim_read_ids, E_StrandSpecificity strand_specific, bool only_specific, bool print_md_tag, bool other=false, const string &user_tag="", const CSeq_align *mate=NULL)
#define SAM_FLAG_SEG_UNMAPPED
static string s_GetBareId(const CSeq_id &id)
static void s_CreateQueryMap(const CBioseq_set &query_batch, TQueryMap &query_map)
static char s_Complement(char c)
#define SAM_FLAG_MULTI_SEGMENTS
#define SAM_FLAG_NEXT_SEG_UNMAPPED
E_StrandSpecificity
@ eFwdRev
@ eNonSpecific
@ eRevFwd
static MDB_envinfo info
Definition: mdb_load.c:37
static int version
Definition: mdb_load.c:29
range(_Ty, _Ty) -> range< _Ty >
#define abs(a)
Definition: ncbi_heapmgr.c:130
unsigned int a
Definition: ncbi_localip.c:102
#define MIN(a, b)
returns smaller of a and b.
Definition: ncbi_std.h:112
#define MAX(a, b)
returns larger of a and b.
Definition: ncbi_std.h:117
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
int isalpha(Uchar c)
Definition: ncbictype.hpp:61
int isdigit(Uchar c)
Definition: ncbictype.hpp:64
T log10(T x_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
NOTE: This file contains work in progress and the APIs are likely to change, please do not rely on th...
NOTE: This file contains work in progress and the APIs are likely to change, please do not rely on th...
Declares the CRemoteBlast class.
Flat formatter for Sequence Alignment/Map (SAM).
Int4 segment_flags
Flags describing segments for paired reads.
The query related information.
BlastContextInfo * contexts
Information per context.
Complete type definition of Blast Sequence Source Iterator.
Complete type definition of Blast Sequence Source ADT.
Definition: blast_seqsrc.c:43
bool operator()(const CSeq_id *a, const CSeq_id *b) const
size_t operator()(const CSeq_id *s) const
#define _ASSERT
else result
Definition: token2.c:20
static CS_CONTEXT * context
Definition: will_convert.c:21
static Uint4 reverse_complement(Uint4 seq, Uint1 size)
Modified on Sun Apr 21 03:40:39 2024 by modify_doxy.py rev. 669887