NCBI C++ ToolKit
alnvec_multi_ds.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: alnvec_multi_ds.cpp 47479 2023-05-02 13:24:02Z ucko $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Andrey Yazhuk
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 
35 
36 #include <gui/objutils/utils.hpp>
38 
41 #include <objects/seq/Bioseq.hpp>
42 
43 #include <objmgr/align_ci.hpp>
44 
46 #include <objmgr/util/sequence.hpp>
47 #include <objmgr/seqdesc_ci.hpp>
48 #include <objmgr/util/sequence.hpp>
53 
54 
55 
58 
59 // Copy of scope singleton from sviewer_data.cpp
61 
63 
// TLS cleanup callback: destroys the value previously stored in a TLS slot.
//
// oldVal - heap-allocated object to destroy; it is released with `delete`,
//          so it must have been created with `new` (a null pointer is fine).
// The second argument (user data) is not used.
template <typename T>
void TlsValCleanup(T* oldVal, void* /*data*/)
{
    delete oldVal;
}
69 
70 // Get the scope without any seq-entries loaded into it.
71 // Seq-entries can interfere with id resulution and mask information
72 // otherwise available for well-known ids, e.g. tax id.
73 // See MSA-75 for examples.
74 static
75 objects::CScope& s_GetPristineScope(const objects::CBioseq_Handle& bio_handle)
76 {
77  TScopeRef* scope = s_ScopeTls.GetValue();
78  if (!scope) {
79  TScopeRef* s(new TScopeRef(
80  new CScope(bio_handle.GetScope().GetObjectManager())
81  ));
82  (*s)->AddDefaults();
83  s_ScopeTls.SetValue(s, TlsValCleanup<TScopeRef>, 0);
84  scope = s_ScopeTls.GetValue();
85  }
86  return **scope;
87 }
88 
89 
90 const string& IAlignRowHandle::GetOrgLabel(bool fExtOrgLookup) const
91 {
92  if (m_OrgLabel.empty()) {
94  const CBioseq_Handle& bio_handle = GetBioseqHandle();
95  if (fExtOrgLookup) {
96  const CBioSource* biosrc = sequence::GetBioSource(bio_handle);
97  if (biosrc) {
98  biosrc->GetOrg().GetLabel(&m_OrgLabel);
99  }
100  else {
101  auto TaxId = GetTaxId(fExtOrgLookup);
102  if (!TaxId) {
103  // There can be conflicting seq-entries in the scope which
104  // prevent us from getting the tax id (see MSA-75).
105  // Use a fresh CScope to get tax id for well-known ids
106  CConstRef<CSeq_id> seq_id = bio_handle.GetSeqId();
107  TaxId = s_GetPristineScope(bio_handle).GetTaxId(*seq_id);
108  }
109  if (TaxId) {
111  m_OrgLabel = tc.GetLabel(TaxId);
112  }
113  }
114  }
115  else {
116  // Try descr.title
117  if (bio_handle.CanGetDescr() && bio_handle.GetDescr().CanGet()) {
118  list<CRef<CSeqdesc> > seqdesclist = bio_handle.GetDescr().Get();
119  for (auto seqdesc : seqdesclist) {
120  if (seqdesc->IsTitle()) {
121  m_OrgLabel = seqdesc->GetTitle();
122  break;
123  }
124  }
125  }
126  }
127  }
128  }
129  return m_OrgLabel;
130 }
131 int IAlignRowHandle::GetTaxId(bool fExtOrgLookup) const
132 {
133  if (m_TaxId == 0 && CanGetBioseqHandle()) {
134  const CBioseq_Handle& bio_handle = GetBioseqHandle();
135  m_TaxId = sequence::GetTaxId(bio_handle);
136  if (!m_TaxId && fExtOrgLookup) {
137  // There can be conflicting seq-entries in the scope which
138  // prevent us to get the tax id. MSA-75
139  // Use a fresh CScope to get tax id for well-known ids
140  CConstRef<CSeq_id> seq_id = bio_handle.GetSeqId();
141  m_TaxId = s_GetPristineScope(bio_handle).GetTaxId(*seq_id);
142  }
143  }
144  return m_TaxId;
145 }
146 
147 
148 static
149 void s_GetTaxonomyInfoFromService(int tax_id, string &taxonomy)
150 {
151  taxonomy.clear();
152  if (tax_id <= 0)
153  return;
155  taxonomy = tc.GetTaxname(tax_id);
156  string temp = tc.GetCommon(tax_id);
157  if (!temp.empty()) {
158  taxonomy += " (";
159  taxonomy += temp;
160  taxonomy += ')';
161  }
162  temp = tc.GetBlastName(tax_id);
163  if (!temp.empty()) {
164  taxonomy += " [";
165  taxonomy += temp;
166  taxonomy += ']';
167  }
168 }
169 
170 
171 static
172 void s_GetTaxonomyInfoFromBiosrc(const objects::CBioSource &biosrc, std::string &taxonomy)
173 {
174  taxonomy.clear();
175  if (biosrc.IsSetTaxname()) {
176  taxonomy = biosrc.GetTaxname();
177  const COrg_ref &org = biosrc.GetOrg();
178  if (org.CanGetCommon()) {
179  taxonomy += " (";
180  taxonomy += org.GetCommon();
181  taxonomy += ')';
182  }
183  }
184 }
185 
186 void IAlignRowHandle::GetTaxonomy(string& taxonomy, bool fExtOrgLookup) const
187 {
188  taxonomy.clear();
189  if (CanGetBioseqHandle()) {
190  const CBioseq_Handle &bsh = GetBioseqHandle();
191 
192  const CBioSource* biosrc = objects::sequence::GetBioSource(bsh);
193  if (biosrc) {
194  s_GetTaxonomyInfoFromBiosrc(*biosrc, taxonomy);
195  }
196  else {
198  }
199 
200  if (taxonomy.empty())
201  taxonomy = GetOrgLabel(fExtOrgLookup);
202  }
203 }
204 
205 
206 
208 : m_Scope(&scope),
209  m_ConsRowIndex(-1),
210  m_CreateConsensus(false),
211  m_isDataReadSync(false)
212 {
213 }
214 
215 
217 {
218  x_ClearHandles();
219 
220  try {
221  if(x_IsJobRunning()) {
222  x_DeleteJob();
223  }
224  }
225  catch (CException& e) {
226  LOG_POST(Error << "CAlnVecMultiDataSource::~CAlnVecMultiDataSource(): "
227  << "failed to delete job: " << e.GetMsg());
228  }
229 }
230 
231 
234  &CAlnVecMultiDataSource::OnAppJobNotification)
236  &CAlnVecMultiDataSource::OnAppJobNotification)
238 
239 
240 int CAlnVecMultiDataSource::GetConsensusRow() const
241 {
242  return m_ConsRowIndex;
243 }
244 
245 
246 void CAlnVecMultiDataSource::Init(const objects::CSeq_annot& annot, bool sync, bool select_anchor)
247 {
248  vector< CConstRef<CSeq_align> > aligns;
249 
250  if(annot.GetData().IsAlign() ) {
251  ITERATE (CSeq_annot::TData::TAlign, it, annot.GetData().GetAlign()) {
252  aligns.emplace_back(*it);
253  }
254  }
255  Init(aligns, sync, select_anchor);
256 }
257 
258 
259 void CAlnVecMultiDataSource::Init(const objects::CBioseq_Handle& handle, bool sync, bool select_anchor)
260 {
261  vector< CConstRef<CSeq_align> > aligns;
262 
263  SAnnotSelector sel =
265  CAlign_CI it(handle, sel);
266  for ( ; it; ++it) {
267  aligns.emplace_back(&*it);
268  }
269  Init(aligns, sync, select_anchor);
270 }
271 
272 
273 void CAlnVecMultiDataSource::Init(const vector< CConstRef<CSeq_align> >& aligns,
274  bool sync, bool select_anchor)
275 {
276  _ASSERT( ! m_Job);
278 
279  m_Job.Reset(new CBuildAlnVecJob(aligns, *m_Scope, select_anchor));
280  m_isDataReadSync = sync;
281 
282  if(sync) {
283  // do everything synchronously
284  disp.RunSync(*m_Job, m_JobID, *this);
285  m_JobID = -1;
286  /*
287  int type = CDataChangeNotifier::eError;
288 
289  if(m_Job->Run()) {
290  type = CDataChangeNotifier::eChanged;
291  CBuildAlnVecResult* res = dynamic_cast<CBuildAlnVecResult*>(m_Job->GetResult().GetPointer());
292  if (!res) {
293  NCBI_THROW(CCoreException, eNullPtr, "Cannot get merge results: AlnVec is empty");
294  }
295  _ASSERT(res);
296  x_Assign(*res->m_AlnVec);
297  }
298 
299  //m_HasScores = x_TransferResults(m_Job->GetResult().GetPointer());
300  m_Job.Reset();
301 
302  CUpdate up(type, "");
303  NotifyListener(up);
304  */
305  } else {
306  // use CAppJobDispatcher to execute background jobs
307  m_JobID = disp.StartJob(*m_Job, "ThreadPool", *this, 1, true);
308  }
309 }
310 
311 
313 {
314  return m_AlnVec.GetPointer() == NULL;
315 }
316 
317 
319 {
320  CAppJobNotification* notn = dynamic_cast<CAppJobNotification*>(evt);
321  _ASSERT(notn);
322 
323  if(notn) {
324  int job_id = notn->GetJobID();
325  if(m_JobID != job_id) {
326  ERR_POST("CAlnVecMultiDataSource::OnAppJobNotification() - unknown Job ID " << job_id);
327  } else {
328  switch(notn->GetState()) {
329  case IAppJob::eCompleted: {
330  CBuildAlnVecResult* res = dynamic_cast<CBuildAlnVecResult*>(notn->GetResult().GetPointer());
331  _ASSERT(res);
332  x_Assign(*res->m_AlnVec);
334  NotifyListener(up);
335  m_Job.Reset();
336  break;
337  }
338  case IAppJob::eFailed: {
340  CUpdate up(CDataChangeNotifier::eError, error->GetText());
341  NotifyListener(up);
342  break;
343  }
344  case IAppJob::eCanceled: {
345  CUpdate up(CDataChangeNotifier::eError, "Canceled");
346  NotifyListener(up);
347  break;
348  }
349  case IAppJob::eRunning: {
350  CConstIRef<IAppJobProgress> progressSoFar = notn->GetProgress();
351  if (progressSoFar) {
353  progressSoFar->GetText(), progressSoFar->GetNormDone());
354  NotifyListener(up);
355  }
356  break;
357  }
358  default:
359  _ASSERT(false);
360  }
361  }
362  }
363 }
364 
365 
366 void CAlnVecMultiDataSource::x_Assign(objects::CAlnVec& aln_vec)
367 {
368  x_ClearHandles();
369 
370  m_AlnVec.Reset(&aln_vec);
371  SetGapChar('-');
372 
373  x_CreateHandles();
374  for (const auto& row : m_Handles) {
375  if (!row->IsConsensus())
376  continue;
377  m_ConsRowIndex = row->GetRowNum();
378  m_CreateConsensus = false;
379  break;
380  }
381  CreateConsensus();
382 }
383 
384 
386 {
387  return m_Job.GetPointer() != NULL;
388 }
389 
390 
392 {
393 // This assertion is invalid, see Init with sync == true
394 // _ASSERT(m_Job && m_JobID != -1);
395  _ASSERT(m_Job);
396 
397  if (m_JobID != -1) {
399  disp.DeleteJob(m_JobID);
400 
401  m_JobID = -1;
402  }
403  m_Job.Reset();
404 
405  //TODO update listener ?
406 }
407 
408 
410 {
412  if(prg) {
413  string text = prg->GetText();
414 
415  if(x_IsJobRunning()) {
417  NotifyListener(up);
418  }
419  }
420 }
421 
422 
424 {
425  m_AlnVec->SetGapChar(gap_char);
426 }
427 
429 {
430  if( ! m_AlnVec) {
431  return IAlnExplorer::fInvalid;
432  }
433  if(m_AlnVec->GetDenseg().IsSetWidths()) {
434  return IAlnExplorer::fMixed;
435  } else {
436  try {
437  const CBioseq& bioseq = *GetBioseqHandle(0).GetBioseqCore();
438  if (bioseq.GetInst().GetMol() == CSeq_inst::eMol_aa) {
439  return IAlnExplorer::fProtein;
440  } else {
441  return IAlnExplorer::fDNA;
442  }
443  } catch (exception& e) {
444  ERR_POST(Error << e.what());
445  return IAlnExplorer::fInvalid;
446  }
447  }
448 }
449 
450 
452 {
453  return (TNumrow) m_AlnVec->GetNumRows();
454 }
455 
456 
458 {
459  return m_AlnVec->GetAlnStart();
460 }
461 
462 
464 {
465  return m_AlnVec->GetAlnStop();
466 }
467 
468 
470 {
471  return m_AlnVec->GetSeqStart(row);
472 }
473 
474 
476 {
477  return m_AlnVec->GetSeqStop(row);
478 }
479 
480 
482 {
483  return m_AlnVec->IsSetAnchor();
484 }
485 
486 
488 {
489  return m_AlnVec->GetAnchor();
490 }
491 
492 
494 {
495  return true;
496 }
497 
498 
500 {
501  m_AlnVec->SetAnchor(anchor);
502  return true;
503 }
504 
505 
507 {
508  m_AlnVec->UnsetAnchor();
509  return true;
510 }
511 
512 
514 {
515  return (row >= 0 && row < m_AlnVec->GetNumRows());
516 }
517 
518 
520 {
521  return m_AlnVec->GetSeqId(row);
522 }
523 
524 
526 {
527  return m_AlnVec->GetBioseqHandle(row);
528 }
529 
531 {
532  return m_AlnVec->GetWidth(row);
533 }
534 
536 {
537  return GetRowHandle(row)->GetGenCode();
538 }
539 
541 {
542  return m_AlnVec->IsPositiveStrand(row);
543 }
544 
545 
547 {
548  return m_AlnVec->IsNegativeStrand(row);
549 }
550 
551 
554  bool try_reverse_dir) const
555 {
556  return m_AlnVec->GetAlnPosFromSeqPos(for_row, aln_pos,
557  (CAlnVec::ESearchDirection) dir, try_reverse_dir);
558 }
559 
560 
563  bool try_reverse_dir) const
564 {
565  return m_AlnVec->GetSeqPosFromAlnPos(row, seq_pos, (CAlnVec::ESearchDirection) dir, try_reverse_dir);
566 }
567 
568 
569 
571  const IAlnExplorer::TSignedRange& aln_range) const
572 {
573  return m_AlnVec->GetAlnSeqString(buffer, row, aln_range);
574 }
575 
576 
578  const TRangeColl& seq_coll,
579  TRangeColl& aln_coll) const
580 {
581  aln_coll.empty();
582 
583  TSeqPos seq_start = m_AlnVec->GetSeqStart(row);
584  TSeqPos seq_stop = m_AlnVec->GetSeqStop(row);
585  if(seq_start > seq_stop) {
586  swap(seq_start, seq_stop);
587  }
588 
589  ITERATE(TRangeColl, it, seq_coll) {
590  // clip collection by alignment
591  TSeqPos from = max(seq_start, it->GetFrom());
592  TSeqPos to = min(seq_stop, it->GetTo());
593  // translate
594  TSeqPos aln_from = m_AlnVec->GetAlnPosFromSeqPos(row, from, CAlnVec::eRight);
595  TSeqPos aln_to = m_AlnVec->GetAlnPosFromSeqPos(row, to, CAlnVec::eLeft);
596  if(aln_from > aln_to) {
597  swap(aln_from, aln_to);
598  }
599  aln_coll.CombineWith(TSeqRange(aln_from, aln_to));
600  }
601 }
602 
603 
605  const TRangeColl& aln_coll,
606  TRangeColl& seq_coll) const
607 {
608  seq_coll.empty();
609 
610  TSeqPos aln_start = m_AlnVec->GetSeqAlnStart(row);
611  TSeqPos aln_stop = m_AlnVec->GetSeqAlnStop(row);
612 
613  ITERATE(TRangeColl, it, aln_coll) {
614  // clip collection by alignment
615  TSeqPos aln_from = max(aln_start, it->GetFrom());
616  TSeqPos aln_to = min(aln_stop, it->GetTo());
617 
618  // translate
619  TSeqPos from = m_AlnVec->GetSeqPosFromAlnPos(row, aln_from, CAlnVec::eRight);
620  TSeqPos to = m_AlnVec->GetSeqPosFromAlnPos(row, aln_to, CAlnVec::eLeft);
621 
622  if(from > to) {
623  swap(from, to);
624  }
625  seq_coll.CombineWith(TSeqRange(from, to));
626  }
627 }
628 
629 
631 {
632  return static_cast<const IAlignRowHandle*>(m_Handles[row]);
633 }
634 
635 
639 {
640  const IAlignRowHandle* handle = GetRowHandle(row);
641  return handle->CreateSegmentIterator(range, flags);
642 }
643 
644 
646 {
648  delete *it;
649  }
650  m_Handles.clear();
651 }
652 
653 
655 {
656  TNumrow row_n = m_AlnVec->GetNumRows();
657  m_Handles.resize(row_n);
658  for( TNumrow r = 0; r < row_n; r++ ) {
660  }
661 }
662 
663 
665 {
666  return true;
667 }
668 
// True when ch is the gap character '-'.
bool s_IsGap(char ch)
{
    const char kGapChar = '-';
    return kGapChar == ch;
}
673 
// Count how often each of the four bases occurs in one alignment column.
//
// rows       - sequences of the segment, one string per row.
// col        - column (character index) to examine; rows that do not reach
//              this column are skipped.
// base_count - output counters, zeroed for the first numBases entries and
//              then filled as [0] = A, [1] = C, [2] = G, [3] = T.
// numBases   - number of leading entries of base_count to reset.
//
// Ambiguity letters contribute to every base they stand for (e.g. 'M' adds
// one to both A and C, 'N' adds one to all four). Any other character,
// including lowercase letters and gaps, is ignored.
void CollectNucleotideFrequences(const std::vector<std::string>& rows, int col, int base_count[], int numBases)
{
    std::fill_n(base_count, numBases, 0);

    // Bit mask in NCBI4na layout for an uppercase IUPACna letter:
    // bit 0 = A, bit 1 = C, bit 2 = G, bit 3 = T; 0 for anything else.
    const auto residue_mask = [](unsigned char residue) -> unsigned {
        switch (residue) {
        case 'A': return 0x1;
        case 'C': return 0x2;
        case 'M': return 0x3;
        case 'G': return 0x4;
        case 'R': return 0x5;
        case 'S': return 0x6;
        case 'V': return 0x7;
        case 'T': return 0x8;
        case 'W': return 0x9;
        case 'Y': return 0xA;
        case 'H': return 0xB;
        case 'K': return 0xC;
        case 'D': return 0xD;
        case 'B': return 0xE;
        case 'N': return 0xF;
        default:  return 0x0;
        }
    };

    const size_t column = static_cast<size_t>(col);
    for (const std::string& row : rows) {
        if (column >= row.size()) {
            continue;
        }
        const unsigned mask = residue_mask(static_cast<unsigned char>(row[column]));
        for (int base = 0; base < 4; ++base) {
            if (mask & (1u << base)) {
                ++base_count[base];
            }
        }
    }
}
753 
754 //void CollectProteinFrequences(const string& col, int base_count[], int numBases)
// Count how often each residue letter occurs in one alignment column.
//
// rows       - sequences of the segment, one string per row.
// col        - column (character index) to examine. Rows that do not reach
//              this column are skipped; a negative column counts nothing.
// base_count - output counters, zeroed for the first numBases entries.
//              Entry k counts the letter 'A' + k.
// numBases   - number of entries in base_count; letters whose offset from
//              'A' falls outside [0, numBases) are ignored.
void CollectProteinFrequences(const std::vector<std::string>& rows, int col, int base_count[], int numBases)
{
    std::fill_n(base_count, numBases, 0);

    if (col < 0) {
        return;
    }
    const size_t column = static_cast<size_t>(col);

    for (const std::string& row : rows) {
        // Rows can be empty or shorter than the column being examined;
        // indexing past the end would be an out-of-bounds read.
        if (column >= row.size()) {
            continue;
        }
        const int pos = row[column] - 'A';
        if (0 <= pos && pos < numBases) {
            ++base_count[pos];
        }
    }
}
771 
772 void CAlnVecMultiDataSource::CreateConsensus(vector<string>& consens) const
773 {
774 
775  bool isNucleotide = m_AlnVec->GetBioseqHandle(0).IsNucleotide();
776 
777  const int numBases = isNucleotide ? 4 : 26;
778  const auto num_rows = m_AlnVec->GetNumRows();
779  const auto num_segs = m_AlnVec->GetNumSegs();
780 
781  int base_count[26]; // must be a compile-time constant for some compilers
782 
783  // determine what the number of segments required for a gapped consensus
784  // segment is. this must be rounded to be at least 50%.
785  int gap_seg_thresh = num_rows - num_rows / 2;
786 
787  for (auto j = 0; j < num_segs; ++j) {
788  // evaluate for gap / no gap
789  int gap_count = 0;
790  auto seg_len = m_AlnVec->GetLen(j, 0);
791  for (auto i = 0; i < num_rows; ++i) {
792  if (m_AlnVec->GetStart(i, j, 0) == -1) {
793  //auto seg_type = m_AlnVec->GetSegType(i, j, 0);
794  //if (seg_type != 0)
795  ++gap_count;
796  }
797  }
798 
799  // check to make sure that this seg is not a consensus
800  // gap seg
801  if (gap_count > gap_seg_thresh) {
802  consens[j].resize(seg_len);
803  for (size_t i = 0; i < seg_len; ++i)
804  consens[j][i] = m_AlnVec->GetGapChar(0);
805  continue;
806  }
807 
808 
809  // the base threshold for being considered unique is at least
810  // 70% of the available sequences
811  int base_thresh =
812  ((num_rows - gap_count) * 7 + 5) / 10;
813 
814  {
815  // we will build a segment with enough bases to match
816  consens[j].resize(seg_len);
817 
818  // retrieve all sequences for this segment
819  vector<string> segs(num_rows);
820  m_AlnVec->RetrieveSegmentSequences(j, segs);
821 
823  //
824  // evaluate for a consensus
825  //
826  for (auto i = 0; i < seg_len; ++i) {
827  if (isNucleotide) {
828  CollectNucleotideFrequences(segs, i, base_count, numBases);
829  }
830  else {
831  CollectProteinFrequences(segs, i, base_count, numBases);
832  }
833 
834 
835  // we create a sorted list (in descending order) of
836  // frequencies of appearance to base
837  // the frequency is "global" for this position: that is,
838  // if 40% of the sequences are gapped, the highest frequency
839  // any base can have is 0.6
840  TRevMap rev_map;
841 
842  for (int k = 0; k < numBases; ++k) {
843  // this gets around a potentially tricky idiosyncrasy
844  // in some implementations of multimap. depending on
845  // the library, the key may be const (or not)
846  TRevMap::value_type p(base_count[k], isNucleotide ? (1 << k) : k);
847  rev_map.insert(p);
848  }
849 
850  // now, the first element here contains the best frequency
851  // we scan for the appropriate bases
852  if (rev_map.count(rev_map.begin()->first) == 1 &&
853  rev_map.begin()->first >= base_thresh) {
854  consens[j][i] = isNucleotide ?
855  m_AlnVec->ToIupac(rev_map.begin()->second) :
856  (rev_map.begin()->second + 'A');
857  }
858  else {
859  // now we need to make some guesses based on IUPACna
860  // notation
861  int count;
862  unsigned char c = 0x00;
863  int freq = 0;
864  TRevMap::iterator curr = rev_map.begin();
865  TRevMap::iterator prev = rev_map.begin();
866  for (count = 0;
867  curr != rev_map.end() &&
868  (freq < base_thresh || prev->first == curr->first);
869  ++curr, ++count) {
870  prev = curr;
871  freq += curr->first;
872  if (isNucleotide) {
873  c |= curr->second;
874  }
875  else {
876  unsigned char cur_char = curr->second + 'A';
877  switch (c) {
878  case 0x00:
879  c = cur_char;
880  break;
881  case 'N': case 'D':
882  c = (cur_char == 'N' || cur_char == 'D') ? 'B' : 'X';
883  break;
884  case 'Q': case 'E':
885  c = (cur_char == 'Q' || cur_char == 'E') ? 'Z' : 'X';
886  break;
887  case 'I': case 'L':
888  c = (cur_char == 'I' || cur_char == 'L') ? 'J' : 'X';
889  break;
890  default:
891  c = 'X';
892  }
893  }
894  }
895 
896  //
897  // catchall
898  //
899  if (count > 2) {
900  consens[j][i] = isNucleotide ? 'N' : 'X';
901  }
902  else {
903  consens[j][i] = isNucleotide ? m_AlnVec->ToIupac(c) : c;
904  }
905  }
906  }
907  }
908  }
909 }
910 
911 /*
912 // Create consensus using CAlnVec function
913 void CAlnVecMultiDataSource::CreateConsensus()
914 {
915  if (m_CreateConsensus) {
916  if (m_AlnVec.NotEmpty() && m_ConsRowIndex == -1) {
917  x_ClearHandles();
918 
919  CRef<CDense_seg> ds = m_AlnVec->CreateConsensus(m_ConsRowIndex);
920  m_AlnVec.Reset(new CAlnVec(*ds, m_AlnVec->GetScope()));
921 
922  x_CreateHandles();
923  }
924  }
925 }
926 */
927 
928 // Create alignment using local (CAlnVecMultiDataSource) function
930 {
931  if (m_CreateConsensus) {
932  if (m_AlnVec.NotEmpty() && m_ConsRowIndex == -1) {
933  x_ClearHandles();
934 
935  unique_ptr<vector<string>> consens(new vector<string>);
936  consens->resize(m_AlnVec->GetNumSegs());
937  CreateConsensus(*consens);
938 
939  CSeq_id consensus_id("lcl|consensus");
940  CRef<CBioseq> bioseq(new CBioseq);
941  CRef<CDense_seg> ds = m_AlnVec->CreateConsensus(m_ConsRowIndex,
942  *bioseq, consensus_id, consens.get());
943 
944  // add bioseq to the scope
945  CRef<CSeq_entry> entry(new CSeq_entry());
946  entry->SetSeq(*bioseq);
947  m_AlnVec->GetScope().AddTopLevelSeqEntry(*entry);
948 
949  m_AlnVec.Reset(new CAlnVec(*ds, m_AlnVec->GetScope()));
950  x_CreateHandles();
951  }
952  }
953 }
954 
955 static void s_TranslateAndExpand(string& seq, int gen_code)
956 {
957  if (seq.empty())
958  return;
959  string new_seq;
960  CAlnVec::TranslateNAToAA(seq, new_seq);
961  auto seq_len = new_seq.size();
962  seq.resize(seq_len * 3);
963 
964  for (size_t i = 0; i < seq_len; ++i) {
965  size_t n_i = 3 * i;
966  seq[n_i] = seq[n_i + 1] = seq[n_i + 2] = new_seq[i];
967  }
968 }
969 
970 #ifdef _SHOW_CONSENSUS_IN_PANORAMA_
971 
972 static set<char> ss_NonAmbiguousAA = {
973  'A','C','D','E','F',
974  'G','H','I','K','L',
975  'M','N','O','P','Q',
976  'R','S','T','U','V',
977  'W','Y','X', ' ', '-'
978 };
979 
980 static set<char> ss_NonAmbiguousDNA = {
981  'A','C','G','T', 'N', ' ', '-'
982 };
983 #endif
984 
// Add one hit for key `b` at position `curr_pos`. The per-key counter vector
// is sized to `ref_len` zeros the first time the key is used. Relies on
// `graphs`, `ref_len` and `curr_pos` being in scope at the point of use.
// The expansion is a brace block and is used without a trailing semicolon.
#define ADD_BASE_TO_GRAPH(b) \
    { if (graphs[b].empty()) graphs[b].resize(ref_len, 0); graphs[b][curr_pos] += 1; }
987 
988 
989 void IAlnMultiDataSource::CollectAlignStats(const TSignedSeqRange& aln_range, TStatGraphs& graphs, bool translate_sequence) const
990 {
991  _ASSERT(graphs.empty());
992  graphs.clear();
993  auto align_type = GetAlignType();
994 
995  if (align_type == IAlnExplorer::fMixed)
996  NCBI_THROW(CException, eUnknown, "Alignment of mixed types are not supported");
997  if (GetNumRows() == 0)
998  return;
999  char ambiguous_residue = ' ';
1000  if (align_type == IAlnExplorer::EAlignType::fDNA)
1001  ambiguous_residue = 'N';
1002  else if (align_type == IAlnExplorer::EAlignType::fProtein)
1003  ambiguous_residue = 'X';
1004 
1005  // initialize the top sequence to generate statistics for every base
1006  string ref_str;
1007  int anchor = -1;
1008  auto consensus_idx = GetConsensusRow();
1009  if (IsSetAnchor()) {
1010  anchor = GetAnchor();
1011  GetAlnSeqString(anchor, ref_str, aln_range);
1012  if (translate_sequence)
1013  s_TranslateAndExpand(ref_str, GetGenCode(anchor));
1014  }
1015  bool ref_is_consensus = anchor != -1 && anchor == consensus_idx;
1016 
1017 #ifdef _SHOW_CONSENSUS_IN_PANORAMA_
1018  const set<char>* non_ambiguous_set = nullptr;
1019  if (consensus_idx >= 0) {
1020  if (translate_sequence || GetAlignType() == IAlnExplorer::fProtein)
1021  non_ambiguous_set = &ss_NonAmbiguousAA;
1022  else
1023  non_ambiguous_set = &ss_NonAmbiguousDNA;
1024  }
1025 #endif
1026 
1027  TSeqPos ref_len = aln_range.GetLength();
1028  for (TNumrow row = 0; row < GetNumRows(); ++row) {
1029  if (row == consensus_idx) // skip consensus
1030  continue;
1031 
1032  unique_ptr<IAlnSegmentIterator> p_it
1034 
1035  for (IAlnSegmentIterator& it = *p_it; it; ++it) {
1036  const IAlnSegment& seg = *it;
1037 
1038  if ((seg.GetType() & IAlnSegment::fIndel &&
1039  !seg.GetRange().Empty()) ||
1040  // seg.GetType() & IAlnSegment::fUnaligned ||
1041  seg.GetType() & IAlnSegment::fGap) {
1042  // ignore the inserts
1043  continue;
1044  }
1045  // auto row_str = GetSeqId(row).GetSeqIdString();
1046  const IAlnSegment::TSignedRange& curr_aln_r = seg.GetAlnRange();
1047  TSeqPos seg_len = (TSeqPos)curr_aln_r.GetLength();
1048  if (seg_len == 0)
1049  continue;
1050 
1051  TSignedSeqPos curr_aln_start = curr_aln_r.GetFrom();
1052 
1053  TSeqPos off = 0;
1054  size_t pos = 0;
1055  if (curr_aln_start < aln_range.GetFrom()) {
1056  pos = aln_range.GetFrom() - curr_aln_start;
1057  } else {
1058  off = curr_aln_start - aln_range.GetFrom();
1059  }
1060  size_t curr_pos = off;
1061  string aln_seq;
1062  if (seg.GetType() & IAlnSegment::fAligned) {
1063  GetAlnSeqString(row, aln_seq, curr_aln_r);
1064  if (translate_sequence)
1065  s_TranslateAndExpand(aln_seq, GetGenCode(row));
1066  }
1067 
1068  while (pos < seg_len && curr_pos < ref_len) {
1069  if (seg.GetType() & IAlnSegment::fAligned && pos < aln_seq.size()) {
1070  char base = aln_seq[pos];
1071  if (s_IsGap(base)) {
1072  base = '-';
1073  } else if (anchor >=0 && curr_pos < ref_len) {
1074 #ifdef _SHOW_CONSENSUS_IN_PANORAMA_
1075 
1076  if (non_ambiguous_set && non_ambiguous_set->count(ref_str[curr_pos]) == 0)
1077  ADD_BASE_TO_GRAPH('w')
1078  else
1079 #endif
1080  if (base != ref_str[curr_pos] && base != ambiguous_residue) {
1081  if (ref_str[curr_pos] != ambiguous_residue || ref_is_consensus) {
1082  ADD_BASE_TO_GRAPH('m')
1083  }
1084  }
1085 
1086  /*
1087  if (base != ref_str[curr_pos]
1088  && base != ambiguous_residue
1089  && ref_str[curr_pos] != ambiguous_residue)
1090  ADD_BASE_TO_GRAPH('m')
1091  */
1092  }
1093  ADD_BASE_TO_GRAPH(base)
1094 
1095  } else {
1096  ADD_BASE_TO_GRAPH('-')
1097  }
1098  ++pos;
1099  ++curr_pos;
1100  }
1101  }
1102  }
1103 }
1104 
1105 
static CRef< CScope > m_Scope
static void s_GetTaxonomyInfoFromService(int tax_id, string &taxonomy)
CRef< CScope > TScopeRef
USING_SCOPE(ncbi::objects)
static void s_TranslateAndExpand(string &seq, int gen_code)
static objects::CScope & s_GetPristineScope(const objects::CBioseq_Handle &bio_handle)
static void s_GetTaxonomyInfoFromBiosrc(const objects::CBioSource &biosrc, std::string &taxonomy)
void CollectNucleotideFrequences(const vector< string > &rows, int col, int base_count[], int numBases)
ON_EVENT(CAppJobNotification, CAppJobNotification::eStateChanged, &CAlnVecMultiDataSource::OnAppJobNotification) ON_EVENT(CAppJobNotification
void CollectProteinFrequences(const vector< string > &rows, int col, int base_count[], int numBases)
void TlsValCleanup(T *oldVal, void *)
bool s_IsGap(char ch)
#define ADD_BASE_TO_GRAPH(b)
static CStaticTls< TScopeRef > s_ScopeTls
CAlign_CI –.
Definition: align_ci.hpp:63
CAlnVecMultiDataSource - implementation of IAlnMultiDataSource for CAlnVec-based alignments.
virtual const IAlignRowHandle * GetRowHandle(TNumrow row) const
virtual void x_OnJobProgress(CAppJobNotification &notn)
virtual bool IsPositiveStrand(TNumrow row) const
virtual TSeqPos GetSeqStop(TNumrow row) const
virtual void GetAlnFromSeq(TNumrow row, const TRangeColl &seq_coll, TRangeColl &aln_coll) const
virtual TNumrow GetNumRows(void) const
number of rows in alignment
virtual void CreateConsensus()
virtual TSignedSeqPos GetAlnPosFromSeqPos(TNumrow row, TSeqPos seq_pos, TSearchDirection dir=IAlnExplorer::eNone, bool try_reverse_dir=true) const
virtual void GetSeqFromAln(TNumrow row, const TRangeColl &aln_coll, TRangeColl &seq_coll) const
virtual void Init(const objects::CSeq_annot &annot, bool sync=false, bool select_anchor=false)
virtual bool SetAnchor(TNumrow anchor)
virtual TSeqPos GetSeqStart(TNumrow row) const
virtual void SetGapChar(TResidue gap_char)
vector< CAlnVecRowHandle * > THandleVector
virtual TSignedSeqPos GetSeqPosFromAlnPos(TNumrow for_row, TSeqPos aln_pos, TSearchDirection dir=IAlnExplorer::eNone, bool try_reverse_dir=true) const
virtual TSeqPos GetBaseWidth(TNumrow row) const
virtual bool IsNegativeStrand(TNumrow row) const
virtual IAlnSegmentIterator * CreateSegmentIterator(TNumrow row, const IAlnExplorer::TSignedRange &range, IAlnSegmentIterator::EFlags flags) const
CAlnVecMultiDataSource(objects::CScope &scope)
virtual string & GetAlnSeqString(TNumrow row, string &buffer, const IAlnExplorer::TSignedRange &aln_range) const
virtual TSeqPos GetAlnStart(void) const
virtual bool IsSetAnchor(void) const
Anchoring methods.
virtual IAlnExplorer::EAlignType GetAlignType() const
virtual void x_CreateHandles()
void OnAppJobNotification(CEvent *evt)
virtual TSeqPos GetAlnStop(void) const
virtual bool UnsetAnchor(void)
CRef< objects::CScope > m_Scope
virtual const objects::CSeq_id & GetSeqId(TNumrow row) const
CRef< objects::CAlnVec > m_AlnVec
virtual bool x_IsJobRunning()
virtual void x_ClearHandles()
virtual bool CanGetId(TNumrow row) const
virtual void x_Assign(objects::CAlnVec &aln_vec)
virtual bool CanCreateConsensus()
virtual const objects::CBioseq_Handle & GetBioseqHandle(TNumrow row) const
CRef< CBuildAlnVecJob > m_Job
virtual bool IsEmpty() const
virtual TNumrow GetAnchor(void) const
virtual bool CanChangeAnchor(void) const
virtual int GetGenCode(IAlnExplorer::TNumrow row) const
CAlnVecRowHandle is inherited from IAlignRowHandle and represents a row of CAlnVec-based alignment.
static void TranslateNAToAA(const string &na, string &aa, int gen_code=kDefaultGenCode)
Definition: alnvec.cpp:893
CAppJobDispatcher.
CAppJobNotification Notification send by CAppJobEventTranslator.
CBioseq_Handle –.
CBuildAlnVecJob.
CBuildAlnVecResult.
CRef< objects::CAlnVec > m_AlnVec
CUpdate - notification send by CUIDataSource to the listener.
virtual void NotifyListener(CUpdate &update)
CEventHandler.
CEvent - generic event implementation TODO TODO - Attachments.
Definition: event.hpp:86
void GetLabel(string *label) const
Definition: Org_ref.cpp:57
TThisType & CombineWith(const TRange &r)
Definition: range_coll.hpp:195
bool empty() const
Definition: range_coll.hpp:102
CScope –.
Definition: scope.hpp:92
Definition: Seq_entry.hpp:56
string GetTaxname(int tax_id)
static CTaxonCache & GetInstance()
string GetCommon(int tax_id)
string GetLabel(int tax_id)
string GetBlastName(int tax_id)
IAlignRowHandle provides an abstract way to access alignment row data.
Definition: alnmulti_ds.hpp:59
virtual bool CanGetBioseqHandle() const =0
virtual const string & GetOrgLabel(bool fExtOrgLookup=false) const
virtual int GetGenCode() const =0
virtual const objects::CBioseq_Handle & GetBioseqHandle() const =0
virtual void GetTaxonomy(string &taxonomy, bool fExtOrgLookup=false) const
virtual int GetTaxId(bool fExtOrgLookup=false) const
virtual IAlnSegmentIterator * CreateSegmentIterator(const IAlnExplorer::TSignedRange &range, IAlnSegmentIterator::EFlags flags) const =0
virtual const string & GetText() const =0
ESearchDirection
Position search options.
@ eRight
Towards higher aln coord (always to the right)
@ eLeft
Towards lower aln coord (always to the left)
virtual IAlnExplorer::EAlignType GetAlignType() const =0
virtual int GetConsensusRow() const =0
returns index of the Consensus row or -1 if it doesn't exist
void CollectAlignStats(const TSignedSeqRange &range, TStatGraphs &graphs, bool translate_sequence=false) const
IAlnExplorer::TResidue TResidue
virtual IAlnSegmentIterator * CreateSegmentIterator(TNumrow row, const IAlnExplorer::TSignedRange &range, IAlnSegmentIterator::EFlags flags) const =0
IAlnExplorer::TNumrow TNumrow
virtual TNumrow GetNumRows(void) const =0
number of rows in alignment
virtual bool IsSetAnchor(void) const =0
Anchoring methods.
virtual TNumrow GetAnchor(void) const =0
virtual string & GetAlnSeqString(TNumrow row, string &buffer, const IAlnExplorer::TSignedRange &aln_range) const =0
Alignment segment iterator interface.
EFlags
Iterator options.
@ eAllSegments
Iterate all segments.
Alignment segment interface.
virtual const TSignedRange & GetRange(void) const =0
Get the selected row range.
@ fAligned
Aligned segment.
@ fIndel
Either anchor or the selected row is not present in the segment.
@ fGap
Both anchor row and the selected row are not included in the segment (some other row is present and t...
virtual TSegTypeFlags GetType(void) const =0
Get current segment type.
virtual const TSignedRange & GetAlnRange(void) const =0
Get alignment range for the segment.
virtual int GetGenCode(IAlnExplorer::TNumrow row) const =0
bool empty() const
Definition: map.hpp:149
void clear()
Definition: map.hpp:169
Definition: map.hpp:338
static uch flags
#define T(s)
Definition: common.h:230
#define false
Definition: bool.h:36
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:61
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
int TSignedSeqPos
Type for signed sequence position.
Definition: ncbimisc.hpp:887
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
Definition: ncbimisc.hpp:1508
string
Definition: cgiapp.hpp:687
#define NULL
Definition: ncbistd.hpp:225
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
const string & GetMsg(void) const
Get message string.
Definition: ncbiexpt.cpp:461
static objects::SAnnotSelector GetAnnotSelector(TAnnotFlags flags=0)
request an annotation selector for a given type
Definition: utils.cpp:167
static bool IsVDBAccession(const string &acc)
Check if string starts with ("SRA", "SRR", "DRR", "ERR")
Definition: utils.cpp:887
CConstIRef< IAppJobError > GetError() const
returns non-null pointer only if job Failed
void RunSync(IAppJob &job, TJobID &jobId, CEventHandler &listener)
Runs job synchronously, sending job notifications synchronously. Returns when job is finished.
static CAppJobDispatcher & GetInstance()
CRef< CObject > GetResult() const
returns non-null pointer only if Completed or Running and has temporary results available
bool DeleteJob(TJobID job_id)
when a Job is deleted the listener is not notified
TJobID StartJob(IAppJob &job, const string &engine_name, IEngineParams *params=NULL)
Starts a Job on the specified engine in "passive mode" - no notifications or progress reports will be...
CConstIRef< IAppJobProgress > GetProgress() const
returns non-null pointer only if notification type is eProgress
#define END_EVENT_MAP()
Ends definition of Command Map.
#define BEGIN_EVENT_MAP(thisClass, baseClass)
Begins definition of Command Map for CEventHandler-derived class.
TJobState GetState() const
@ eCanceled
Definition: app_job.hpp:91
@ eCompleted
Definition: app_job.hpp:89
@ eRunning
Definition: app_job.hpp:88
@ eFailed
Definition: app_job.hpp:90
const CBioSource * GetBioSource(const CBioseq &bioseq)
Retrieve the BioSource object for a given bioseq handle.
Definition: sequence.cpp:104
TTaxId GetTaxId(const CBioseq_Handle &handle)
return the tax-id associated with a given sequence.
Definition: sequence.cpp:274
CConstRef< CSeq_id > GetSeqId(void) const
Get id which can be used to access this bioseq handle. Throws an exception if none is available.
const TDescr & GetDescr(void) const
bool CanGetDescr(void) const
TObjectType * GetPointer(void) THROWS_NONE
Get pointer.
Definition: ncbiobj.hpp:998
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
position_type GetLength(void) const
Definition: range.hpp:158
bool Empty(void) const
Definition: range.hpp:148
CRange< TSeqPos > TSeqRange
typedefs for sequence ranges
Definition: range.hpp:419
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
const TOrg & GetOrg(void) const
Get the Org member data.
Definition: BioSource_.hpp:509
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
bool CanGetCommon(void) const
Check if it is safe to call GetCommon method.
Definition: Org_ref_.hpp:413
const TCommon & GetCommon(void) const
Get the Common member data.
Definition: Org_ref_.hpp:419
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
bool CanGet(void) const
Check if it is safe to call Get method.
Definition: Seq_descr_.hpp:160
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
list< CRef< CSeq_align > > TAlign
Definition: Seq_annot_.hpp:194
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
TMol GetMol(void) const
Get the Mol member data.
Definition: Seq_inst_.hpp:612
int i
static void text(MDB_val *v)
Definition: mdb_dump.c:62
range(_Ty, _Ty) -> range< _Ty >
constexpr bool empty(list< Ts... >) noexcept
double value_type
The numeric datatype used by the parser.
Definition: muParserDef.h:228
const struct ncbi::grid::netcache::search::fields::SIZE size
T max(T x_, T y_)
T min(T x_, T y_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
static pcre_uint8 * buffer
Definition: pcretest.c:1051
#define row(bind, expected)
Definition: string_bind.c:73
SAnnotSelector –.
#define _ASSERT
Modified on Wed Apr 24 14:17:19 2024 by modify_doxy.py rev. 669887