NCBI C++ ToolKit
cdd_pssm_input.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: cdd_pssm_input.cpp 100101 2023-06-15 14:10:29Z merezhuk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Greg Boratyn
27  *
28  */
29 
30 /** @file cdd_pssm_input.cpp
31  * Implementation of the concrete strategy to obtain PSSM input data for
32  * PSI-BLAST.
33  */
34 #include <ncbi_pch.hpp>
35 
36 // BLAST includes
37 //#include <algo/blast/api/psi_pssm_input.hpp>
40 #include "../core/blast_psi_priv.h"
41 
42 // Object includes
48 
49 // Object manager includes
50 #include <objmgr/scope.hpp>
51 #include <objmgr/seq_vector.hpp>
52 
53 
54 /** @addtogroup AlgoBlast
55  *
56  * @{
57  */
58 
61 BEGIN_SCOPE(blast)
62 
63 #ifndef GAP_IN_ALIGNMENT
64  /// Representation of GAP in Seq-align
65 # define GAP_IN_ALIGNMENT ((Uint4)-1)
66 #endif
67 
68 //////////////////////////////////////////////////////////////////////////////
69 
70 
/// Store the query sequence, the RPS-BLAST alignments and the PSSM engine
/// settings; no alignment processing happens here.
/// @throws CBlastException (eInvalidArgument) when query is NULL or when
///         seqaligns is an empty reference
71 CCddInputData::CCddInputData(const Uint1* query, unsigned int query_length,
73  const PSIBlastOptions& opts,
74  const string& dbname,
75  const string& matrix_name /* = "BLOSUM62" */,
76  int gap_existence /* = 0 */,
77  int gap_extension /* = 0 */,
78  PSIDiagnosticsRequest* diags /* = NULL */,
79  const string& query_title /* = "" */)
80  : m_QueryTitle(query_title),
81  m_DbName(dbname),
82  m_SeqalignSet(seqaligns),
83  m_Msa(NULL),
84  m_Opts(opts),
85  m_MatrixName(matrix_name),
86  m_DiagnosticsRequest(diags),
87  m_MinEvalue(-1.0),
88  m_GapExistence(gap_existence),
89  m_GapExtension(gap_extension)
90 {
91  if (!query) {
92  NCBI_THROW(CBlastException, eInvalidArgument, "NULL query");
93  }
94 
95  if (seqaligns.Empty()) {
96  NCBI_THROW(CBlastException, eInvalidArgument, "NULL alignments");
97  }
98 
 // keep a private copy of the query residues
 // NOTE(review): when query_length is 0 the vector is empty and
 // &m_QueryData[0] does not refer to an element - confirm callers never
 // pass an empty query, or guard the copy
99  m_QueryData.resize(query_length);
100  memcpy(&m_QueryData[0], query, query_length * sizeof(Uint1));
101 }
102 
103 
105 {
106  for (unsigned int i=0;i < m_Hits.size();i++) {
107  delete m_Hits[i];
108  }
109 
110  delete [] m_Msa;
111 }
112 
114 {
116 
117  NCBI_THROW(CBlastException, eInvalidOptions,
118  "Minimum RPS-BLAST e-value is larger than the maximum one");
119  }
120 
122 
123  // process primary alignments
125 
126  // remove overlapping multiple hits to the same CD
128 
129  // this is required by PSSM engine code
130  m_MsaDimensions.query_length = static_cast<Uint4>(m_QueryData.size());
131  m_MsaDimensions.num_seqs = static_cast<Uint4>(m_Hits.size());
133 
134  x_FillHitsData();
135  // this validation has only assertions, no use calling it
136  // for non-debug builds
138 
139  x_CreateMsa();
140  // the same validation is done on the core level
142 
143  // extract query as Bioseq, needed so that query information can be stored
144  // in PssmWithParameters
146 
149 }
150 
151 
152 void CCddInputData::x_ProcessAlignments(double min_evalue, double max_evalue)
153 {
155  double evalue;
156  if (!(*it)->GetNamedScore(CSeq_align::eScore_EValue, evalue)) {
157  NCBI_THROW(CBlastException, eInvalidArgument,
158  "Evalue not found in Seq-align");
159  }
160 
161  if (evalue >= min_evalue && evalue < max_evalue) {
162  m_Hits.push_back(new CHit((*it)->GetSegs().GetDenseg(), evalue));
163  }
164  }
165 }
166 
167 
169 {
170  // if less than 2 hits, do nothing
171  if (m_Hits.size() < 2) {
172  return;
173  }
174 
175  // sort by accession and e-value
176  sort(m_Hits.begin(), m_Hits.end(), compare_hits_by_seqid_eval());
177  vector<CHit*> new_hits;
178  new_hits.reserve(m_Hits.size());
179 
180  new_hits.push_back(m_Hits[0]);
181 
182  vector<CHit*>::iterator it(m_Hits.begin());
183  ++it;
184 
185  // for each hit
186  for (;it != m_Hits.end();++it) {
187 
188  // for each kept hit with the same subject accession as it and better
189  // e-value
190  for (int i=static_cast<int>(new_hits.size()) - 1;i >= 0
191  && (*it)->m_SubjectId->Match(*new_hits[i]->m_SubjectId);i--) {
192 
193  const CHit* kept_hit = new_hits[i];
194 
195  // find intersection between hits on subjects,
196  // intersection needs to have query range from kept_hit for
197  // later subtraction
198  CHit intersection(*kept_hit);
199  intersection.IntersectWith(**it, CHit::eSubject);
200 
201  // subtract the subject intersection using query ranges,
202  // hits to different ranges of the same CD are treated as
203  // different hits
204  (*it)->Subtract(intersection);
205 
206  if ((*it)->IsEmpty()) {
207  delete *it;
208  *it = NULL;
209  break;
210  }
211  }
212  if (*it) {
213  new_hits.push_back(*it);
214  }
215 
216  }
217  m_Hits.swap(new_hits);
218 }
219 
220 
222 {
223  // initialize seqdb
225 
226  // load residue counts from file
227  CRef<CBlastRPSInfo> profile_data(
229 
230  // Set data for each hit
231  NON_CONST_ITERATE (vector<CHit*>, it, m_Hits) {
232 
233  _ASSERT(*it);
234 
235  (*it)->FillData(seqdb, *profile_data);
236  }
237 }
238 
239 
241 {
242  const int kQueryLength = static_cast<int>(m_QueryData.size());
243  const int kNumCds = static_cast<int>(m_Hits.size());
244 
245  // initialize msa map
246  PSICdMsaCell cell;
247  cell.is_aligned = (Uint1)false;
248  cell.data = NULL;
249  // allocate memory for num cdds + query
250  m_MsaData.resize(kQueryLength * (kNumCds), cell);
251  m_Msa = new PSICdMsaCell*[kNumCds];
252  if (!m_Msa) {
253  NCBI_THROW(CBlastSystemException, eOutOfMemory,
254  "Multiple alignment data structure");
255  }
256  for (int i=0;i < kNumCds;i++) {
257  m_Msa[i] = &m_MsaData[i * (int)kQueryLength];
258  }
259 
260  // fot each hit
261  for (size_t hit_idx=0;hit_idx < m_Hits.size();hit_idx++) {
262 
263  // for each hit segment
264  NON_CONST_ITERATE(vector<CHitSegment*>, it,
265  m_Hits[hit_idx]->GetSegments()) {
266 
267  const int kNumQueryColumns
268  = (*it)->m_QueryRange.GetTo() - (*it)->m_QueryRange.GetFrom();
269 
270  int q_from = (*it)->m_QueryRange.GetFrom();
271 
272  // for each position in the hit segemnt
273  for (int i=0;i < kNumQueryColumns; i++) {
274  // set as aligned and point to data
275  m_Msa[hit_idx][q_from + i].is_aligned = (Uint1)true;
276  m_Msa[hit_idx][q_from + i].data = &(*it)->m_MsaData[i];
277  }
278  }
279  m_Hits[hit_idx]->m_MsaIdx = static_cast<int>(hit_idx);
280  }
281 
282  m_CddData.msa = m_Msa;
283 }
284 
285 
287 {
288  _ASSERT(m_Msa);
289  const int kQueryLength = static_cast<int>(m_QueryData.size());
290  const int kNumCds = static_cast<int>(m_Hits.size());
292  for (int i=0;i < kNumCds;i++) {
293  _ASSERT(m_Msa[i]);
294  }
295 
296  for (int i=0;i < kNumCds;i++) {
297  for (int j=0;j < kQueryLength;j++) {
298 
299  if (m_QueryData[i] == kGapChar) {
300  NCBI_THROW(CBlastException, eInvalidArgument,
301  "Query sequence cannot contain gaps");
302  }
303 
304  if (m_Msa[i][j].is_aligned) {
305  _ASSERT(m_Msa[i][j].data);
306  const PSICdMsaCellData* data = m_Msa[i][j].data;
307 
308  // some domain models have incomplete data and are supposed to
309  // be removed from the database or search results,
310  // this exception checks whether one of these domains
311  // has slipped in
312  if (data->iobsr <= 0.0) {
313  NCBI_THROW(CBlastException, eInvalidArgument,
314  "Zero independent observations in domain model");
315  }
316 
317  _ASSERT(data->wfreqs);
318  double s = 0;
319  for (int k=0;k < kAlphabetSize;k++) {
320  if (data->wfreqs[k] < 0.0) {
321  NCBI_THROW(CBlastException, eInvalidArgument,
322  "Negative residue frequency in a domain "
323  "model");
324  }
325  s += data->wfreqs[k];
326  }
327  // some domain models have incomplete data and are supposed to
328  // be removed from the database or search results,
329  // this exception checks whether one of these domains
330  // has slipped in
331  if (fabs(s - 1.0) > 1e-5) {
332  NCBI_THROW(CBlastException, eInvalidArgument,
333  "Domain residue frequencies do not sum to 1");
334  }
335  }
336  }
337  }
338 
339  return true;
340 }
341 
342 
343 CCddInputData::CHit::CHit(const CDense_seg& denseg, double evalue)
344  : m_Evalue(evalue), m_MsaIdx(-1)
345 {
346  const int kNumDims = denseg.GetDim();
347  const int kNumSegments = denseg.GetNumseg();
348 
349  _ASSERT(kNumDims == 2);
350 
351  m_SubjectId.Reset(denseg.GetIds()[1].GetNonNullPointer());
352 
353  const vector<TSignedSeqPos>& starts = denseg.GetStarts();
354  const vector<TSeqPos>& lens = denseg.GetLens();
355 
356  TSeqPos query_index = 0;
357  TSeqPos subject_index = 1;
358 
359  for (int seg=0;seg < kNumSegments;seg++) {
360  TSeqPos query_offset = starts[query_index];
361  TSeqPos subject_offset = starts[subject_index];
362 
363  query_index += kNumDims;
364  subject_index += kNumDims;
365 
366  // segments of gaps in query or subject are ignored
367  if (query_offset != GAP_IN_ALIGNMENT
368  && subject_offset != GAP_IN_ALIGNMENT) {
369 
370  m_SegmentList.push_back(new CHitSegment(
371  TRange(query_offset, query_offset + lens[seg]),
372  TRange(subject_offset, subject_offset
373  + lens[seg])));
374 
375  query_offset += lens[seg];
376  subject_offset += lens[seg];
377  }
378  }
379 }
380 
381 
383  : m_SubjectId(hit.m_SubjectId),
384  m_Evalue(hit.m_Evalue),
385  m_MsaIdx(hit.m_MsaIdx)
386 {
387  m_SegmentList.reserve(hit.m_SegmentList.size());
388  ITERATE (vector<CHitSegment*>, it, hit.m_SegmentList) {
389  m_SegmentList.push_back(new CHitSegment(**it));
390  }
391 }
392 
393 
395 {
396  ITERATE (vector<CHitSegment*>, it, m_SegmentList) {
397  delete *it;
398  }
399 }
400 
401 
403 {
404  if (IsEmpty()) {
405  return 0;
406  }
407 
408  unsigned int result = 0;
409  ITERATE (vector<CHitSegment*>, it, m_SegmentList) {
410  result += (*it)->GetLength();
411  }
412 
413  return result;
414 }
415 
416 
418  const CBlastRPSInfo& profile_data)
419 {
420  // get record index of the CD in the database
421  int db_oid;
422  seqdb.SeqidToOid(*m_SubjectId, db_oid);
423 
424  // fill segment data
425  NON_CONST_ITERATE(vector<CHitSegment*>, it, m_SegmentList) {
426  (*it)->FillData(db_oid, profile_data);
427  }
428 }
429 
430 
432 {
433  ITERATE (vector<CHit*>, it, m_Hits) {
434  _ASSERT(*it);
435  (*it)->Validate();
436  }
437  return true;
438 }
439 
440 
442 {
443  // Test our pre-conditions
446 
448 
449  // set the sequence id
450  if (!m_SeqalignSet->Get().empty()) {
451  CRef<CSeq_align> aln =
452  const_cast<CSeq_align_set*>(&*m_SeqalignSet)->Set().front();
453  CRef<CSeq_id> query_id(const_cast<CSeq_id*>(&aln->GetSeq_id(0)));
454  m_QueryBioseq->SetId().push_back(query_id);
455  }
456 
457  // set required Seq-inst fields
458  m_QueryBioseq->SetInst().SetRepr(CSeq_inst::eRepr_raw);
459  m_QueryBioseq->SetInst().SetMol(CSeq_inst::eMol_aa);
460  m_QueryBioseq->SetInst().SetLength(GetQueryLength());
461 
462  // set the sequence data in ncbistdaa format
463  CNCBIstdaa& seq = m_QueryBioseq->SetInst().SetSeq_data().SetNcbistdaa();
464  seq.Set().reserve(GetQueryLength());
465  for (TSeqPos i = 0; i < GetQueryLength(); i++) {
466  seq.Set().push_back(m_QueryData[i]);
467  }
468 
469  if (!m_QueryTitle.empty()) {
470  CRef<CSeqdesc> desc(new CSeqdesc());
471  desc->SetTitle(m_QueryTitle);
472  m_QueryBioseq->SetDescr().Set().push_back(desc);
473  }
474 
475  // Test our post-condition
477 }
478 
479 
481 {
482  _ASSERT(!m_SubjectId.Empty());
483 
484  ITERATE (vector<CHitSegment*>, it, m_SegmentList) {
485  _ASSERT(*it);
486  (*it)->Validate();
487  }
488 
489  return true;
490 }
491 
492 
494 {
495  if (m_SegmentList.empty()) {
496  return true;
497  }
498 
499  ITERATE (vector<CHitSegment*>, it, m_SegmentList) {
500  if (!(*it)->IsEmpty()) {
501  return false;
502  }
503  }
504 
505  return true;
506 }
507 
508 
// Replace the hit's segments with their intersections with the given ranges.
// Ranges are compared with the subject range of each segment when app is
// eSubject and with the query range otherwise. Segments fully inside a range
// are kept, segments without any intersection are deleted, and partially
// covered segments are replaced by trimmed copies.
509 void CCddInputData::CHit::IntersectWith(const vector<TRange>& ranges,
511 {
512  // This function assumes that input ranges and hit segments are sorted
513  // by range and mutually exclusive
514 
 // r_itr only moves forward: it is shared by all segments and never rewinds
515  vector<TRange>::const_iterator r_itr = ranges.begin();
516  vector<CHitSegment*>::iterator seg_it = m_SegmentList.begin();
 // trimmed copies that replace partially covered segments
517  vector<CHitSegment*> new_segs;
518  while (seg_it != m_SegmentList.end() && r_itr != ranges.end()) {
519 
520  // get current hit segment range
521  const TRange seg_range
522  = (app == eSubject ? (*seg_it)->m_SubjectRange
523  : (*seg_it)->m_QueryRange);
524 
525  // skip all ranges strictly below current hit segment
526  while (r_itr != ranges.end() && r_itr->GetTo() < seg_range.GetFrom()) {
527  r_itr++;
528  }
529 
530  if (r_itr == ranges.end()) {
531  break;
532  }
533 
534  // find intersection with current hit segment
535  TRange intersection(seg_range.IntersectionWith(*r_itr));
536 
537  // if intersection is the same as hit segment, do nothing
538  if (intersection == seg_range) {
539  seg_it++;
540  continue;
541  }
542 
543  // if intersection is empty, delete current hit segment
 // (the slot is set to NULL and filtered out at the end)
544  if (intersection.Empty()) {
545  delete *seg_it;
546  *seg_it = NULL;
547 
548  seg_it++;
549  continue;
550  }
551 
552  // otherwise find intersections with current hit segment
553  // for each range that intersects with current hit segment
 // NOTE(review): every range used here is stepped over, so a range that
 // also reaches into the following segment is not looked at again for
 // that segment - confirm that inputs cannot contain such a range
554  while (r_itr != ranges.end() && r_itr->GetFrom() < seg_range.GetTo()) {
555 
556  // get and save intersection
 // d_from >= 0 moves the segment start up, d_to <= 0 moves its end down
557  int d_from = max(seg_range.GetFrom(),
558  r_itr->GetFrom()) - seg_range.GetFrom();
559  int d_to = min(seg_range.GetTo(),
560  r_itr->GetTo()) - seg_range.GetTo();
561 
562  CHitSegment* new_seg = new CHitSegment(**seg_it);
563  new_seg->AdjustRanges(d_from, d_to);
564  _ASSERT(!new_seg->IsEmpty());
565  new_segs.push_back(new_seg);
566 
567  // move to the next range
568  r_itr++;
569  }
570 
571  // current hit segment will be replaced with intersection, hence it
572  // is deleted
573  delete *seg_it;
574  *seg_it = NULL;
575  seg_it++;
576  }
577 
578  // each hit segment behind the last input range will have an empty
579  // intersection hence it is removed
580  while (seg_it != m_SegmentList.end()) {
581  delete *seg_it;
582  *seg_it = NULL;
583  seg_it++;
584  }
585 
586  // remove empty hit segments, add new intersections and sort the list
 // (surviving original segments are the non-NULL slots of m_SegmentList)
587  ITERATE (vector<CHitSegment*>, it, m_SegmentList) {
588  if (*it) {
589  new_segs.push_back(*it);
590  }
591  }
592  sort(new_segs.begin(), new_segs.end(), compare_hitseg_range());
593 
594  m_SegmentList.swap(new_segs);
595 }
596 
597 
600 {
601  vector<TRange> ranges;
602  ranges.reserve(hit.GetSegments().size());
603  ITERATE (vector<CHitSegment*>, it, hit.GetSegments()) {
604  ranges.push_back(app == eQuery ? (*it)->m_QueryRange
605  : (*it)->m_SubjectRange);
606  }
607 
608  sort(ranges.begin(), ranges.end(), compare_range());
609 
610  IntersectWith(ranges, app);
611 }
612 
613 
615 {
 // Remove from this hit the part of the query covered by the given hit.
 // The subtracted hit is treated as the single query interval that spans
 // from the start of its first segment to the end of its last segment, so
 // any space between its segments is subtracted as well.
616  // if either hit is empty then the result is the same as current
617  // object
618  if (IsEmpty() || hit.IsEmpty()) {
619  return;
620  }
621 
622  // This function assumes that input ranges and hit segments are sorted
623  // by range and mutually exclusive
624 
625  // find alignment start and stop of the hit to be subtracted
626  int from = hit.GetSegments().front()->m_QueryRange.GetFrom();
627  int to = hit.GetSegments().back()->m_QueryRange.GetTo();
628 
629  // if there is no overlap between hits, then do nothing
630  if (m_SegmentList.front()->m_QueryRange.GetFrom() >= to
631  || m_SegmentList.back()->m_QueryRange.GetTo() <= from) {
632 
633  return;
634  }
635 
636  // iterate over segments
637  vector<CHitSegment*>::iterator it = m_SegmentList.begin();
638 
 // segments that survive the subtraction, in their original order
639  vector<CHitSegment*> new_segments;
640  new_segments.reserve(m_SegmentList.size());
641 
642  // keep all segments that end before the subtracted hit starts
643  // unchanged
644  while (it != m_SegmentList.end() && (*it)->m_QueryRange.GetTo() <= from) {
645 
646  new_segments.push_back(*it);
647  ++it;
648  }
649 
650  // if all segments end before the subtracted hit starts
651  // or none of the segments overlaps with the subtracted hit,
652  // there is nothing to subtract, exit
 // (m_SegmentList has not been modified at this point, so returning
 // without the swap leaves the hit unchanged)
653  if (it == m_SegmentList.end() || (*it)->m_QueryRange.GetFrom() > to) {
654  return;
655  }
656 
657  // if the current segment covers the whole subtracted hit
658  if ((*it)->m_QueryRange.GetTo() > to) {
659 
660  // make two segments for what is to the left and right of
661  // the subtracted hit
662 
663  CHitSegment* new_seg;
664 
665  if ((*it)->m_QueryRange.GetFrom() < from) {
666 
 // the copy becomes the right part, the original the left part
667  new_seg = new CHitSegment(**it);
668 
669  // left part
670  int d_to = from - (*it)->m_QueryRange.GetTo();
671  _ASSERT(d_to < 0);
672  (*it)->AdjustRanges(0, d_to);
673  _ASSERT((*it)->m_QueryRange.GetFrom() < (*it)->m_QueryRange.GetTo());
674  new_segments.push_back(*it);
675  }
676  else {
 // nothing remains on the left, the segment itself is trimmed
677  new_seg = *it;
678  }
679 
680  // right part
681  int d_from = to - new_seg->m_QueryRange.GetFrom();
682  _ASSERT(d_from >= 0);
683  new_seg->AdjustRanges(d_from, 0);
 // NOTE(review): this assertion inspects *it; when the segment was split
 // above, the range adjusted just before it belongs to new_seg - confirm
 // which object was meant
684  _ASSERT((*it)->m_QueryRange.GetFrom() < (*it)->m_QueryRange.GetTo());
685  new_segments.push_back(new_seg);
686 
687  // the following segments do not intersect with subtracted hit
688  ++it;
689  for (;it != m_SegmentList.end();++it) {
690  new_segments.push_back(*it);
691  }
692  }
693  else {
694 
695  // if the segment overlaps completely with the subtracted hit,
696  // delete it
697  if ((*it)->m_QueryRange.GetFrom() >= from) {
698  delete *it;
699  *it = NULL;
700  }
701  else {
702 
703  // otherwise adjust segment end
704  int d_to = from - (*it)->m_QueryRange.GetTo();
705  _ASSERT(d_to < 0);
706 
707  (*it)->AdjustRanges(0, d_to);
708  _ASSERT((*it)->m_QueryRange.GetFrom() < (*it)->m_QueryRange.GetTo());
709  new_segments.push_back(*it);
710  }
711 
712  // delete all segments that completely overlap with subtracted hit
713  ++it;
714  while (it != m_SegmentList.end()
715  && (*it)->m_QueryRange.GetTo() <= to) {
716 
717  delete *it;
718  *it = NULL;
719 
720  ++it;
721  }
722 
723  if (it != m_SegmentList.end()) {
724 
 // first segment that ends above the subtracted hit: trim its start
 // when it begins inside the subtracted interval
725  if ((*it)->m_QueryRange.GetFrom() < to) {
726  int d_from = to - (*it)->m_QueryRange.GetFrom();
727  _ASSERT(d_from > 0);
728 
729  (*it)->AdjustRanges(d_from, 0);
730  _ASSERT((*it)->m_QueryRange.GetFrom()
731  < (*it)->m_QueryRange.GetTo());
732 
733  new_segments.push_back(*it);
734  }
735  else {
 // NOTE(review): this branch is reached when the segment starts at or
 // after `to`, i.e. it lies entirely above the subtracted interval, yet
 // it is deleted while the segments after it are kept - confirm that
 // dropping it is intended
736  delete *it;
737  *it = NULL;
738  }
739 
740  // keep all segments above subtracted hit
741  ++it;
742  while (it != m_SegmentList.end()) {
743  new_segments.push_back(*it);
744  ++it;
745  }
746  }
747  }
748 
749  m_SegmentList.swap(new_segments);
750 }
751 
752 
754  const CBlastRPSInfo& profile_data)
755 {
757  d.wfreqs = NULL;
758  d.iobsr = -1.0;
759  m_MsaData.resize(m_QueryRange.GetTo() - m_QueryRange.GetFrom(), d);
760 
761  x_FillResidueCounts(db_oid, profile_data);
762  x_FillObservations(db_oid, profile_data);
763 }
764 
765 
767 {
768  _ASSERT(m_QueryRange.GetFrom() >= 0 && m_QueryRange.GetTo() >= 0);
769  _ASSERT(m_SubjectRange.GetFrom() >= 0 && m_SubjectRange.GetTo() >= 0);
770 
771  const int kQueryLength = m_QueryRange.GetTo() - m_QueryRange.GetFrom();
772  const int kSubjectLength = m_SubjectRange.GetTo() - m_SubjectRange.GetFrom();
773 
774  if (kQueryLength != kSubjectLength) {
775  return false;
776  }
777 
778  _ASSERT((int)m_WFreqsData.size() == kSubjectLength * kAlphabetSize);
779  _ASSERT((int)m_MsaData.size() == kSubjectLength);
780 
781  ITERATE (vector<PSICdMsaCellData>, it, m_MsaData) {
782  _ASSERT(it->wfreqs);
783  }
784 
785  return true;
786 }
787 
789 {
790  m_QueryRange.SetFrom(m_QueryRange.GetFrom() + d_from);
791  m_QueryRange.SetTo(m_QueryRange.GetTo() + d_to);
792 
793  m_SubjectRange.SetFrom(m_SubjectRange.GetFrom() + d_from);
794  m_SubjectRange.SetTo(m_SubjectRange.GetTo() + d_to);
795 }
796 
797 
799 {
800  return m_QueryRange.GetFrom() > m_QueryRange.GetTo()
801  || m_SubjectRange.GetFrom() > m_SubjectRange.GetTo();
802 }
803 
805  const CBlastRPSInfo& profile_data)
806 {
807  _ASSERT(profile_data()->freq_header);
808 
809  BlastRPSProfileHeader* header = profile_data()->freq_header;
810  int num_profiles = header->num_profiles;
811 
812  _ASSERT(db_oid < num_profiles);
813 
814  // Get weighted residue counts for CD
815  const Int4* db_seq_offsets = header->start_offsets;
816  const TFreqs* db_counts =
817  (TFreqs*)(header->start_offsets + num_profiles + 1);
818 
819  // extract residue counts
820  const TFreqs* counts = db_counts + db_seq_offsets[db_oid] * kAlphabetSize;
821  int db_seq_length = db_seq_offsets[db_oid + 1] - db_seq_offsets[db_oid];
822 
823  // correct seq length for column of zero counts in cdd counts file
824  db_seq_length--;
825  _ASSERT(db_seq_length > 0);
826  _ASSERT(m_SubjectRange.GetTo() <= db_seq_length);
827 
828 
829  int num_columns = (int)m_MsaData.size();
830  m_WFreqsData.resize(num_columns * kAlphabetSize);
831  for (int i=0;i < num_columns;i++) {
832  m_MsaData[i].wfreqs = &m_WFreqsData[i * kAlphabetSize];
833 
834  // column frequencies for a column must sum to 1, but they may not due
835  // to storing in CDD as integers, the difference is distributed equally
836  // among all the non-zero frequencies
837  TFreqs sum_freqs = 0;
838  for (int j=0;j < kAlphabetSize;j++) {
839  sum_freqs +=
840  counts[(m_SubjectRange.GetFrom() + i) * kAlphabetSize + j];
841  }
842 
843  for (int j=0;j < kAlphabetSize;j++) {
844  m_MsaData[i].wfreqs[j] =
845  (double)counts[(m_SubjectRange.GetFrom() + i) * kAlphabetSize + j]
846  / (double)sum_freqs;
847  }
848  }
849 }
850 
852  const CBlastRPSInfo& profile_data)
853 {
854  // Get effective numbers of independent observations
855 
856  _ASSERT(profile_data()->obsr_header);
857 
858  BlastRPSProfileHeader* header = profile_data()->obsr_header;
859  int num_profiles = header->num_profiles;
860 
861  _ASSERT(db_oid < num_profiles);
862 
863  // find poiter to eff number of observations
864  const Int4* offsets = header->start_offsets;
865  const TObsr* data_start
866  = (TObsr*)(header->start_offsets + num_profiles + 1);
867 
868  const TObsr* data = data_start + offsets[db_oid];
869  int data_size = offsets[db_oid + 1] - offsets[db_oid];
870 
871  // extract effective numbers of obaservations
872  vector<TObsr> obsr;
873  for (int i=0;i < data_size;i+=2) {
874  TObsr val = data[i];
875  Int4 num = (Int4)data[i + 1];
876  _ASSERT(fabs((double)num - data[i + 1]) < 1e-05);
877 
878  for (int j=0;j < num;j++) {
879  obsr.push_back(val);
880  }
881  }
882 
883  int num_columns = m_SubjectRange.GetTo() - m_SubjectRange.GetFrom();
884  for (int i=0;i < num_columns;i++) {
885  m_MsaData[i].iobsr =
886  (double)obsr[m_SubjectRange.GetFrom() + i] / kRpsScaleFactor;
887  }
888 }
889 
890 
891 END_SCOPE(blast)
893 
894 /* @} */
User-defined methods of the data storage class.
Declares the BLAST exception class.
Defines a concrete strategy to obtain PSSM input data for PSI-BLAST.
Defines BLAST error codes (user errors included)
Wrapper class to manage the BlastRPSInfo structure, as currently there aren't any allocation or deall...
Definition: rps_aux.hpp:68
Defines system exceptions occurred while running BLAST.
Represents one alignment segment of a RPS-BLAST hit.
Single RPS-BLAST hit.
Class used for sorting hits by subject seq-id and e-value.
Class used for sorting hit segments by range.
Class used for sorting ranges.
CNCBIstdaa –.
Definition: NCBIstdaa.hpp:66
CSeqDB.
Definition: seqdb.hpp:161
@ eProtein
Definition: seqdb.hpp:174
bool SeqidToOid(const CSeq_id &seqid, int &oid) const
Translate a Seq-id to any matching OID.
Definition: seqdb.cpp:903
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
Definition: Seq_align.cpp:317
char data[12]
Definition: iconv.c:80
#define GAP_IN_ALIGNMENT
Representation of GAP in Seq-align.
virtual ~CCddInputData()
Virtual destructor.
CRef< objects::CBioseq > m_QueryBioseq
Query as Bioseq.
static const char kGapChar('-')
The representation of a gap in ASCII format.
void AdjustRanges(int d_from, int d_to)
Change ranges on query and subject by given values.
TRange m_QueryRange
Segment range on query.
vector< CHit * > m_Hits
RPS-BLAST hits in internal representation.
bool IsEmpty(void) const
Is hit empty.
string m_DbName
CDD database name.
CHit(const objects::CDense_seg &denseg, double evalue)
Constructor.
vector< CHitSegment * > m_SegmentList
List of hit segments.
void x_CreateMsa(void)
Create multiple alignment of CDs.
void x_ProcessAlignments(double min_evalue, double max_evalue)
Process RPS-BLAST hits.
void x_RemoveMultipleCdHits(void)
Remove multiple hits to the same CD.
vector< CHitSegment * > & GetSegments(void)
Get hit segments.
void Subtract(const CHit &hit)
Subtract from another hit from this hit using query ranges.
CRange< int > TRange
PSIBlastOptions m_Opts
Delta BLAST options for PSSM Engine.
int GetLength(void) const
Get hit length in residues, counts number of matching residues, gaps are not counted.
void Process(void)
Pre-process CD matches to query.
double m_MinEvalue
Min e-value threshold for all hits to be included in PSSM computation.
PSICdMsaCell ** m_Msa
Pointer to MSA.
CConstRef< objects::CSeq_align_set > m_SeqalignSet
RPS-BLAST hits for the query.
CCddInputData(const Uint1 *query, unsigned int query_length, CConstRef< objects::CSeq_align_set > seqaligns, const PSIBlastOptions &opts, const string &dbname, const string &matrix_name="BLOSUM62", int gap_existence=0, int gap_extension=0, PSIDiagnosticsRequest *diags=NULL, const string &query_title="")
Constructor.
bool x_ValidateHits(void) const
Validate internal representation of RPS-BLAST hits.
PSICdMsa m_CddData
MSA of CDs and CD data.
const Uint1 AMINOACID_TO_NCBISTDAA[]
Translates between ncbieaa and ncbistdaa.
vector< Uint1 > m_QueryData
Query sequence.
PSIMsaDimensions m_MsaDimensions
MSA dimensions, used by PSSM engine.
void x_FillResidueCounts(int db_oid, const CBlastRPSInfo &profile_data)
Populate arrays of weighted residue counts.
void x_FillObservations(int db_oid, const CBlastRPSInfo &profile_data)
Populate arrays of effective numbers of observations.
void FillData(int db_oid, const CBlastRPSInfo &profile_data)
Allocate and populate arrays for MSA data (weighted residue counts and effective observations used fo...
static const int kAlphabetSize
bool Validate(void) const
Validate hit.
vector< PSICdMsaCell > m_MsaData
MSA data.
Uint4 TFreqs
Type used for residue frequencies stored in CDD.
void IntersectWith(const vector< TRange > &segments, EApplyTo app)
Intersect hit segments with list of ranges and store result in hit segments.
bool Validate(void) const
Validate hit segment.
unsigned int GetQueryLength(void)
Get query length.
string m_QueryTitle
Query title (for PSSM)
static const int kRpsScaleFactor
Scale of residue frequencies and number of independent observations stored in CDD.
bool x_ValidateMsa(void) const
Validate multiple alignment of CDs.
void x_ExtractQueryForPssm(void)
Create query as Bioseq.
bool IsEmpty(void) const
Does the hit segment represent an empty range.
void FillData(const CSeqDB &seqdb, const CBlastRPSInfo &profile_data)
Allocate and populate arrays of data for PSSM computation.
Uint4 TObsr
Type used for number of independent observations stored in CDD.
void x_FillHitsData(void)
Read data needed for PSSM computation from CDD and populate arrays.
EApplyTo
Master selection for operations involving ranges.
@ fDeltaBlast
Flags set for DELTA-BLAST.
Definition: rps_aux.hpp:93
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define NULL
Definition: ncbistd.hpp:225
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
TPrim & Set(void)
Definition: serialbase.hpp:351
bool Empty(void) const THROWS_NONE
Check if CConstRef is empty – not pointing to any object which means having a null value.
Definition: ncbiobj.hpp:1385
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool NotEmpty(void) const THROWS_NONE
Check if CConstRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:1392
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
TThisType IntersectionWith(const TThisType &r) const
Definition: range.hpp:312
bool Empty(void) const
Definition: range.hpp:148
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define USING_SCOPE(ns)
Use the specified namespace.
Definition: ncbistl.hpp:78
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
const TStarts & GetStarts(void) const
Get the Starts member data.
Definition: Dense_seg_.hpp:530
const TLens & GetLens(void) const
Get the Lens member data.
Definition: Dense_seg_.hpp:555
TDim GetDim(void) const
Get the Dim member data.
Definition: Dense_seg_.hpp:421
const TIds & GetIds(void) const
Get the Ids member data.
Definition: Dense_seg_.hpp:505
TNumseg GetNumseg(void) const
Get the Numseg member data.
Definition: Dense_seg_.hpp:465
list< CRef< CSeq_align > > Tdata
TTitle & SetTitle(void)
Select the variant.
Definition: Seqdesc_.hpp:1039
@ eRepr_raw
continuous sequence
Definition: Seq_inst_.hpp:94
char * dbname(DBPROCESS *dbproc)
Get name of current database.
Definition: dblib.c:6929
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
bool is_aligned(T *p) noexcept
Check pointer alignment.
Definition: bmutil.h:637
constexpr auto sort(_Init &&init)
#define fabs(v)
Definition: ncbi_dispd.c:46
T max(T x_, T y_)
T min(T x_, T y_)
header of RPS blast '.rps' file
Definition: blast_rps.h:62
Int4 num_profiles
number of PSSMs in the file
Definition: blast_rps.h:64
Int4 start_offsets[1]
start of an Int4 array that gives the starting byte offset of each RPS DB sequence.
Definition: blast_rps.h:65
Options used in protein BLAST only (PSI, PHI, RPS and translated BLAST) Some of these possibly should...
double inclusion_ethresh
Minimum evalue for inclusion in PSSM calculation.
Data needed for PSSM computation stored in MSA cell for single column in CD aligned to a position in ...
Definition: blast_psi.h:113
double iobsr
Effective number of independent observations in a CD column.
Definition: blast_psi.h:118
double * wfreqs
Frequencies for each residue in CD column.
Definition: blast_psi.h:115
Alignment cell that represents one column of CD aligned to a position in the query.
Definition: blast_psi.h:124
Uint1 is_aligned
Does this cell represent column aligned to a CD.
Definition: blast_psi.h:125
PSICdMsaCellData * data
Data needed for PSSM computation.
Definition: blast_psi.h:128
PSIMsaDimensions * dimensions
Query length and number of aligned cds.
Definition: blast_psi.h:136
unsigned char * query
Query sequence as Ncbistdaa.
Definition: blast_psi.h:135
PSICdMsaCell ** msa
Multiple alignment of CDs.
Definition: blast_psi.h:138
Structure to allow requesting various diagnostics data to be collected by PSSM engine.
Definition: blast_psi.h:181
Uint4 num_seqs
Number of distinct sequences aligned with the query (does not include the query)
Definition: blast_psi.h:59
Uint4 query_length
Length of the query.
Definition: blast_psi.h:58
static string query
#define _ASSERT
else result
Definition: token2.c:20
Modified on Tue May 21 11:00:52 2024 by modify_doxy.py rev. 669887