NCBI C++ ToolKit
psl_record.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 
2 /* $Id: psl_record.cpp 93574 2021-04-30 16:19:19Z stakhovv $
3  * ===========================================================================
4  *
5  * PUBLIC DOMAIN NOTICE
6  * National Center for Biotechnology Information
7  *
8  * This software/database is a "United States Government Work" under the
9  * terms of the United States Copyright Act. It was written as part of
10  * the author's official duties as a United States Government employee and
11  * thus cannot be copyrighted. This software/database is freely available
12  * to the public for use. The National Library of Medicine and the U.S.
13  * Government have not placed any restriction on its use or reproduction.
14  *
15  * Although all reasonable efforts have been taken to ensure the accuracy
16  * and reliability of the software and data, the NLM and the U.S.
17  * Government do not and cannot warrant the performance or results that
18  * may be obtained by using this software or data. The NLM and the U.S.
19  * Government disclaim all warranties, express or implied, including
20  * warranties of performance, merchantability or fitness for any particular
21  * purpose.
22  *
23  * Please cite the author in any work or product based on this material.
24  *
25  * ===========================================================================
26  *
27  * Authors: Frank Ludwig
28  *
29  * File Description: Write alignment
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 
47 
48 #include <objmgr/scope.hpp>
49 #include <objmgr/bioseq_handle.hpp>
50 #include <objmgr/seq_vector.hpp>
51 #include <objmgr/util/sequence.hpp>
52 
56 #include "psl_record.hpp"
57 
59 
62 
63 // ----------------------------------------------------------------------------
// Adds the size carried by one exon chunk to one of four running counters,
// selected by the chunk's variant as reported by chunk.Which().
//
//   chunk                 chunk to inspect
//   matchSize             incremented by chunk.GetMatch()
//   misMatchSize          incremented by chunk.GetMismatch()
//   productInsertionSize  incremented by chunk.GetProduct_ins()
//   genomicInsertionSize  incremented by chunk.GetGenomic_ins()
//
// Any variant not handled explicitly leaves all four counters untouched.
//
// NOTE(review): this listing omits the function-name line and the four `case`
// labels; which label guards which increment is presumed from the getter each
// branch calls -- confirm against the repository copy.
64 static void
66  const CSpliced_exon_chunk& chunk,
67  int& matchSize,
68  int& misMatchSize,
69  int& productInsertionSize,
70  int& genomicInsertionSize)
71 // ----------------------------------------------------------------------------
72 {
73  switch(chunk.Which()) {
75  matchSize += chunk.GetMatch();
76  break;
78  misMatchSize += chunk.GetMismatch();
79  break;
81  productInsertionSize += chunk.GetProduct_ins();
82  break;
84  genomicInsertionSize += chunk.GetGenomic_ins();
85  break;
86  default:
87  break; //for now, not interested
88  }
89 }
90 
91 // ----------------------------------------------------------------------------
// Records the strands of a spliced segment: the product strand goes to
// mStrandQ and the genomic strand to mStrandT. Each member is written only
// when the segment reports that the corresponding strand can be read;
// otherwise it keeps its previous value.
//
// `scope` is not used by the visible body.
// NOTE(review): the function-name line is missing from this listing.
92 void
94  CScope& scope,
95  const CSpliced_seg& splicedSeg)
96 // ----------------------------------------------------------------------------
97 {
98  if (splicedSeg.CanGetProduct_strand()) {
99  mStrandQ = splicedSeg.GetProduct_strand();
100  }
101  if (splicedSeg.CanGetGenomic_strand()) {
102  mStrandT = splicedSeg.GetGenomic_strand();
103  }
104 }
105 
106 // ----------------------------------------------------------------------------
// Derives match and insert statistics for a spliced segment.
//
// Does nothing when both mNumInsertT and mBaseInsertT already hold a value
// other than -1. Otherwise it makes sure the strands are known, walks every
// exon part to count insertions, then walks the exons again to add the gaps
// between consecutive exons on the target to mBaseInsertT.
//
// NOTE(review): the function-name line and four body lines (after the
// `mMatches = mMisMatches = 0;` statement and at the top of the inner parts
// loop) are missing from this listing, so any counter resets or per-part
// accumulation done there are not visible here.
107 void
109  CScope& scope,
110  const CSpliced_seg& splicedSeg)
111 // ----------------------------------------------------------------------------
112 {
113  if (mNumInsertT != -1 && mBaseInsertT != -1) {
114  return;
115  }
// Strand information is needed below to pick the exon traversal direction.
116  if (mStrandT == eNa_strand_unknown) {
117  xInitializeStrands(scope, splicedSeg);
118  }
119 
120  // get inserts that are part of exons:
121  mMatches = mMisMatches = 0;
124  const auto& exonList = splicedSeg.GetExons();
125  for (auto pExon: exonList) {
126  for (auto pPart: pExon->GetParts()) {
// One insert event per insertion part, on the side that has the extra bases.
129  if (pPart->IsProduct_ins()) {
130  mNumInsertQ++;
131  }
132  else if (pPart->IsGenomic_ins()) {
133  mNumInsertT++;
134  }
135  }
136  }
137 
138  // for the target, introns count as inserts:
// lastExonBoundT == -1 marks "no exon seen yet"; the first exon only seeds it.
139  int lastExonBoundT = -1;
140  if ( mStrandT == eNa_strand_plus) {
141  for (auto pExon: exonList) {
142  if (lastExonBoundT == -1) {
143  lastExonBoundT = pExon->GetGenomic_end() + 1;
144  continue;
145  }
// Gap = this exon's start minus one-past-the-end of the previous exon.
146  int exonStart = pExon->GetGenomic_start();
147  if (exonStart > lastExonBoundT) {
148  mBaseInsertT += (exonStart - lastExonBoundT);
149  }
150  lastExonBoundT = pExon->GetGenomic_end() + 1;
151  }
152  }
// This branch is taken for every strand value other than eNa_strand_plus.
153  else { // eNa_strand_minus
154  for (auto pExon: exonList) {
155  if (lastExonBoundT == -1) {
156  lastExonBoundT = pExon->GetGenomic_start();
157  continue;
158  }
// Gap = previous exon's start minus one-past-the-end of this exon.
159  int exonEnd = pExon->GetGenomic_end() + 1;
160  if (exonEnd < lastExonBoundT) {
161  mBaseInsertT += (lastExonBoundT - exonEnd);
162  }
163  lastExonBoundT = pExon->GetGenomic_start();
164  }
165  }
// Every exon boundary is counted as one target insert, whether or not the
// gap measured above was non-empty.
// NOTE(review): if size() is unsigned (as for standard containers), an empty
// exon list makes `size() - 1` wrap before the cast -- confirm callers never
// pass a segment without exons.
166  mNumInsertT += static_cast<int>((exonList.size() - 1));
167 }
168 
169 // ----------------------------------------------------------------------------
170 void
172  CScope& scope,
173  const CSpliced_seg& splicedSeg)
174 // ----------------------------------------------------------------------------
175 {
176  const auto& queryId = splicedSeg.GetProduct_id();
177  auto querySeqHandle = scope.GetBioseqHandle(queryId);
178  CGenbankIdResolve::Get().GetBestId(querySeqHandle.GetSeq_id_Handle(), scope, mNameQ);
179  if (!querySeqHandle) {
180  throw CWriterMessage(
181  "Unable to resolve given query id", eDiag_Error);
182  }
183  mSizeQ = querySeqHandle.GetInst_Length();
184  mStartQ = splicedSeg.GetSeqStart(0);
185  mEndQ = splicedSeg.GetSeqStop(0) + 1;
186 
187  const auto& targetId = splicedSeg.GetGenomic_id();
188  auto targetSeqHandle = scope.GetBioseqHandle(targetId);
189  CGenbankIdResolve::Get().GetBestId(targetSeqHandle.GetSeq_id_Handle(), scope, mNameT);
190  if (!targetSeqHandle) {
191  throw CWriterMessage(
192  "Unable to resolve given target id", eDiag_Error);
193  }
194  mSizeT = targetSeqHandle.GetInst_Length();
195  mStartT = splicedSeg.GetSeqStart(1);
196  mEndT = splicedSeg.GetSeqStop(1) + 1;
197 }
198 
199 // ----------------------------------------------------------------------------
// Builds the block tables (mBlockSizes, mBlockStartsQ, mBlockStartsT) for a
// spliced segment by walking the exons in list order.
//
// Every exon opens one block at its product/genomic start. Inside an exon,
// an insertion seen on the previous part closes the current block and opens a
// new one whose starts are shifted past the block just closed plus the
// inserted bases on the respective side; mBlockCount is bumped for each such
// split.
//
// `scope` is not used by the visible body.
// NOTE(review): the function-name line and the callee line of the call whose
// argument list starts with `*pPart` are missing from this listing.
200 void
202  CScope& scope,
203  const CSpliced_seg& splicedSeg)
204 // ----------------------------------------------------------------------------
205 {
206  const auto& exonList = splicedSeg.GetExons();
207  mBlockCount = static_cast<int>(exonList.size());
208 
209  for (auto pExon: exonList) {
210  //auto partCount = pExon->GetParts().size();
211  int exonStartQ = static_cast<int>(pExon->GetProduct_start().AsSeqPos());
212  int exonStartT = static_cast<int>(pExon->GetGenomic_start());
213  mBlockStartsQ.push_back(exonStartQ);
214  mBlockStartsT.push_back(exonStartT);
215  int blockSize = 0;
216  int productInsertionPending = 0;
217  int genomicInsertionPending = 0;
218  for (auto pPart: pExon->GetParts()) {
// A non-zero pending insertion was left by the previous part: flush the
// current block and start the next one after the inserted bases.
219  if (productInsertionPending || genomicInsertionPending) {
220  mBlockCount++;
221  mBlockSizes.push_back(blockSize);
222  mBlockStartsQ.push_back(exonStartQ + blockSize + productInsertionPending);
223  mBlockStartsT.push_back(
224  exonStartT + blockSize + genomicInsertionPending);
225  exonStartQ += blockSize + productInsertionPending;
226  exonStartT += blockSize + genomicInsertionPending;
227  blockSize = 0;
228  productInsertionPending = 0;
229  genomicInsertionPending = 0;
230  }
// blockSize is passed for both of the first two reference arguments, so
// whatever the callee adds to either of them lands in the same block length.
232  *pPart, blockSize, blockSize, productInsertionPending, genomicInsertionPending);
233  }
// Close the last (or only) block of this exon.
234  mBlockSizes.push_back(blockSize);
// NOTE(review): exonStartQ/exonStartT are locals of this iteration and are
// not read after these two updates.
235  exonStartQ += blockSize;
236  exonStartT += blockSize;
237  }
238 }
239 
240 // ----------------------------------------------------------------------------
// Builds the block tables (mBlockSizes, mBlockStartsQ, mBlockStartsT) for a
// spliced segment by walking each exon from its end backwards, then reverses
// all three tables once every exon has been processed.
//
// Target positions start at one past the exon's genomic end and move down by
// each block's size. Query positions start at mSizeQ minus one past the
// exon's product end, so mSizeQ must already be set when this runs.
//
// `scope` is not used by the visible body.
// NOTE(review): the function-name line and the callee line of the call whose
// argument list starts with `*pPart` are missing from this listing.
241 void
243  CScope& scope,
244  const CSpliced_seg& splicedSeg)
245 // ----------------------------------------------------------------------------
246 {
247  const auto& exonList = splicedSeg.GetExons();
248  mBlockCount = static_cast<int>(exonList.size());
249 
250  for (auto pExon: exonList) {
251  int exonEndT = static_cast<int>(pExon->GetGenomic_end() + 1);
252  int exonEndQ = mSizeQ - static_cast<int>(pExon->GetProduct_end().AsSeqPos() + 1);
253  int blockSize = 0;
254  int productInsertionPending = 0;
255  int genomicInsertionPending = 0;
256  for (auto pPart: pExon->GetParts()) {
// A non-zero pending insertion was left by the previous part: emit the block
// collected so far and step both cursors past it and the inserted bases.
257  if (productInsertionPending || genomicInsertionPending) {
258  mBlockCount++;
259  mBlockSizes.push_back(blockSize);
260  mBlockStartsT.push_back(exonEndT - blockSize);
261  mBlockStartsQ.push_back(exonEndQ);
262  exonEndQ -= (blockSize + productInsertionPending);
263  exonEndT -= (blockSize + genomicInsertionPending);
264  blockSize = 0;
265  productInsertionPending = 0;
266  genomicInsertionPending = 0;
267  }
// blockSize is passed for both of the first two reference arguments, so
// whatever the callee adds to either of them lands in the same block length.
269  *pPart, blockSize, blockSize, productInsertionPending, genomicInsertionPending);
270  }
// Emit the final (or only) block of this exon.
271  exonEndT -= blockSize;
272  mBlockStartsT.push_back(exonEndT);
273  mBlockStartsQ.push_back(exonEndQ);
274  exonEndQ -= blockSize;
275  mBlockSizes.push_back(blockSize);
276  }
// Blocks were collected in exon-list order; flip all three tables together so
// their entries stay paired.
277  std::reverse(mBlockStartsT.begin(), mBlockStartsT.end());
278  std::reverse(mBlockSizes.begin(), mBlockSizes.end());
279  std::reverse(mBlockStartsQ.begin(), mBlockStartsQ.end());
280 }
281 
282 // ----------------------------------------------------------------------------
// Populates the block tables for a spliced segment, once.
//
// Returns immediately when mBlockCount is no longer -1. Otherwise it resolves
// the strands if the target strand is still unknown and delegates to
// xInitializeBlocksStrandPositive() for a plus target strand, or to
// xInitializeBlocksStrandNegative() for any other strand value.
//
// NOTE(review): the function-name line is missing from this listing.
283 void
285  CScope& scope,
286  const CSpliced_seg& splicedSeg)
287 // ----------------------------------------------------------------------------
288 {
289  if (mBlockCount != -1) {
290  return;
291  }
292 
293  if (mStrandT == eNa_strand_unknown) {
294  xInitializeStrands(scope, splicedSeg);
295  }
296 
// NOTE(review): both strand-specific helpers assign mBlockCount from the same
// exon list again as their first step, so this assignment looks redundant.
297  const auto& exonList = splicedSeg.GetExons();
298  mBlockCount = static_cast<int>(exonList.size());
299 
300  if (mStrandT == eNa_strand_plus) {
301  xInitializeBlocksStrandPositive(scope, splicedSeg);
302  }
303  else {
304  xInitializeBlocksStrandNegative(scope, splicedSeg);
305  }
306 }
307 
308 // ----------------------------------------------------------------------------
// Checks that a spliced segment carries what the rest of the initialization
// needs, throwing CWriterMessage (eDiag_Error) otherwise:
//   - "Unsupported alignment product type \"protein\"" when the product-type
//     test at the top fails;
//   - "Mandatory product information missing" when an exon lacks a product
//     start or end;
//   - "Mandatory target information missing" when an exon lacks a genomic
//     start or end.
//
// `scope` is not used by the visible body.
// NOTE(review): the function-name line and the second operand of the `||`
// condition are missing from this listing; only the CanGetProduct_type() half
// of the product-type test is visible.
309 void
311  CScope& scope,
312  const CSpliced_seg& splicedSeg)
313 // ----------------------------------------------------------------------------
314 {
315  if (!splicedSeg.CanGetProduct_type() ||
317  // would love to support but need proper sample data first!
318  throw CWriterMessage(
319  "Unsupported alignment product type \"protein\"", eDiag_Error);
320  }
321 
// Every exon must expose both ends on both sequences.
322  const auto& exonList = splicedSeg.GetExons();
323  for (auto pExon: exonList) {
324  if (!pExon->CanGetProduct_start() || !pExon->CanGetProduct_end()) {
325  throw CWriterMessage(
326  "Mandatory product information missing", eDiag_Error);
327  }
328  if (!pExon->CanGetGenomic_start() || !pExon->CanGetGenomic_end()) {
329  throw CWriterMessage(
330  "Mandatory target information missing", eDiag_Error);
331  }
332  }
333 }
334 
335 // ----------------------------------------------------------------------------
// Initializes the record from a spliced segment: validates the segment first,
// then fills in strands, sequence info, statistics and blocks, in that order.
// Exceptions thrown by any of the steps propagate to the caller.
//
// NOTE(review): the function-name line is missing from this listing.
336 void
338  CScope& scope,
339  const CSpliced_seg& splicedSeg)
340 // ----------------------------------------------------------------------------
341 {
342  xValidateSegment(scope, splicedSeg);
343 
344  xInitializeStrands(scope, splicedSeg);
345  xInitializeSequenceInfo(scope, splicedSeg);
346  xInitializeStats(scope, splicedSeg);
347  xInitializeBlocks(scope, splicedSeg);
348 }
349 
350 // ----------------------------------------------------------------------------
// Picks up statistics from a list of alignment scores.
//
// Scores without a string id, or without a value, are skipped. The only key
// acted on is "num_mismatch": when its value is an int and mMisMatches is
// still -1, the value is stored in mMisMatches.
//
// `scope` is not used by the visible body.
// NOTE(review): the function-name line is missing from this listing and this
// overload does not appear in the page's cross-reference index -- presumably
// an Initialize() overload; confirm against the header.
351 void
353  CScope& scope,
354  const CSeq_align::TScore& scores)
355 // ----------------------------------------------------------------------------
356 {
357  for (const auto& pScore: scores) {
358  if (!pScore->CanGetId() || !pScore->GetId().IsStr()) {
359  continue;
360  }
361  if (!pScore->CanGetValue()) {
362  continue;
363  }
364  const auto& key = pScore->GetId().GetStr();
365  const auto& value = pScore->GetValue();
// An already-set mMisMatches (anything but -1) is never overwritten.
366  if (key == "num_mismatch" && value.IsInt() && mMisMatches == -1) {
367  mMisMatches = value.GetInt();
368  continue;
369  }
370  }
371 }
372 
373 // ----------------------------------------------------------------------------
// Replaces the -1 placeholder with 0 in mRepMatches, mMisMatches and mCountN.
// Values other than -1 are left as they are.
//
// NOTE(review): the function-name line is missing from this listing.
374 void
376 // ----------------------------------------------------------------------------
377 {
378  if (mRepMatches == -1) {
379  mRepMatches = 0;
380  }
381  if (mMisMatches == -1) {
382  mMisMatches = 0;
383  }
384  if (mCountN == -1) {
385  mCountN = 0;
386  }
387 }
388 
389 // ----------------------------------------------------------------------------
390 void
392  CScope& scope,
393  const CDense_seg& denseSeg)
394 // ----------------------------------------------------------------------------
395 {
396  if (denseSeg.GetDim() != 2) {
397  throw CWriterMessage(
398  "PSL supports only pairwaise alignments", eDiag_Error);
399  }
400 }
401 
402 // ----------------------------------------------------------------------------
// Records the strands of a dense segment: row 0 goes to mStrandQ and row 1 to
// mStrandT, both read through GetSeqStrand().
//
// `scope` is not used by the visible body.
// NOTE(review): the function-name line is missing from this listing.
403 void
405  CScope& scope,
406  const CDense_seg& denseSeg)
407 // ----------------------------------------------------------------------------
408 {
409  mStrandQ = denseSeg.GetSeqStrand(0);
410  mStrandT = denseSeg.GetSeqStrand(1);
411 }
412 
413 // ----------------------------------------------------------------------------
414 void
416  CScope& scope,
417  const CDense_seg& denseSeg)
418 // ----------------------------------------------------------------------------
419 {
420  const CSeq_id& idQ = denseSeg.GetSeq_id(0);
421  auto seqHandleQ = scope.GetBioseqHandle(idQ);
422  CGenbankIdResolve::Get().GetBestId(seqHandleQ.GetSeq_id_Handle(), scope, mNameQ);
423  mSizeQ = seqHandleQ.GetInst_Length();
424  mStartQ = denseSeg.GetSeqStart(0);
425  mEndQ = denseSeg.GetSeqStop(0) + 1;
426 
427  const CSeq_id& idT = denseSeg.GetSeq_id(1);
428  auto seqHandleT = scope.GetBioseqHandle(idT);
429  CGenbankIdResolve::Get().GetBestId(seqHandleT.GetSeq_id_Handle(), scope, mNameT);
430  mSizeT = seqHandleT.GetInst_Length();
431  mStartT = denseSeg.GetSeqStart(1);
432  mEndT = denseSeg.GetSeqStop(1) + 1;
433 }
434 
435 // ----------------------------------------------------------------------------
// Derives match count, block tables and insert statistics from a dense
// segment.
//
// mMatches becomes the sum of every entry of GetLens(). A segment contributes
// a block only when both of its starts (query at index 2*i, target at 2*i+1)
// differ from -1. Inserts are then derived from the distance between the end
// of one block and the start of the next on each side.
//
// `scope` is not used by the visible body.
// NOTE(review): the function-name line and six body lines (two ahead of the
// first `[0]` test and one inside each `== -1` branch) are missing from this
// listing, so any counter resets or base-count updates done there are not
// visible here.
436 void
438  CScope& scope,
439  const CDense_seg& denseSeg)
440 // ----------------------------------------------------------------------------
441 {
442  mMatches = 0;
443  for (auto length: denseSeg.GetLens()) {
444  mMatches += length;
445  }
446  mBlockCount = static_cast<int>(denseSeg.GetLens().size());
// NOTE(review): `auto` here produces copies of both containers; binding with
// `const auto&` would avoid that.
447  auto starts = denseSeg.GetStarts();
448  auto lens = denseSeg.GetLens();
449  for (int i=0; i < mBlockCount; ++i) {
// Keep only segments present on both rows.
450  if (starts[2*i] != -1 && starts[2*i+1] != -1) {
451  mBlockStartsQ.push_back(starts[2*i]);
452  mBlockStartsT.push_back(starts[2*i + 1]);
453  mBlockSizes.push_back(lens[i]);
454  }
455  }
// NOTE(review): mBlockSizes is reversed in both branches below, so when both
// rows are on the minus strand it ends up in its original order while both
// start tables are reversed -- confirm this is intended.
456  if (eNa_strand_minus == denseSeg.GetSeqStrand(0)) {
457  std::reverse(mBlockStartsQ.begin(), mBlockStartsQ.end());
458  std::reverse(mBlockSizes.begin(), mBlockSizes.end());
459  }
460  if (eNa_strand_minus == denseSeg.GetSeqStrand(1)) {
461  std::reverse(mBlockStartsT.begin(), mBlockStartsT.end());
462  std::reverse(mBlockSizes.begin(), mBlockSizes.end());
463  }
// Recount: segments dropped by the filter above no longer count as blocks.
464  mBlockCount = static_cast<int>(mBlockSizes.size());
465 
// NOTE(review): no emptiness check is visible before these `[0]` accesses;
// a segment whose rows are all filtered out would index an empty table.
// NOTE(review): the loop above only stores starts that differ from -1, so
// unless the tables held entries beforehand the `== -1` tests in this
// function presumably never succeed -- verify.
468  if (mBlockStartsT[0] == -1) {
469  mNumInsertQ++;
471  }
472  if (mBlockStartsQ[0] == -1) {
473  mNumInsertT++;
475  }
476  for (int i=1; i < mBlockCount; ++i) {
// Query side: any distance between the previous block's end and this block's
// start counts as one insert of that many bases.
477  auto endOfLastQ = mBlockStartsQ[i-1] + mBlockSizes[i-1];
478  auto startOfThisQ = mBlockStartsQ[i];
479  if (startOfThisQ - endOfLastQ != 0) {
480  mNumInsertQ++;
481  mBaseInsertQ += (startOfThisQ - endOfLastQ);
482  }
483  if (mBlockStartsT[i] == -1) {
484  mNumInsertQ++;
486  }
// Target side: same rule as for the query.
487  auto endOfLastT = mBlockStartsT[i-1] + mBlockSizes[i-1];
488  auto startOfThisT = mBlockStartsT[i];
489  if (startOfThisT - endOfLastT != 0) {
490  mNumInsertT++;
491  mBaseInsertT += (startOfThisT - endOfLastT);
492  }
493  if (mBlockStartsQ[i] == -1) {
494  mNumInsertT++;
496  }
497  }
498 }
499 
500 // ----------------------------------------------------------------------------
// Initializes the record from a dense segment: validates the segment first,
// then fills in strands, sequence info, and the combined statistics and block
// tables, in that order. Exceptions thrown by any of the steps propagate to
// the caller.
//
// NOTE(review): the function-name line is missing from this listing --
// presumably the CDense_seg overload of Initialize(); confirm against the
// header.
501 void
503  CScope& scope,
504  const CDense_seg& denseSeg)
505 // ----------------------------------------------------------------------------
506 {
507  xValidateSegment(scope, denseSeg);
508  xInitializeStrands(scope, denseSeg);
509  xInitializeSequenceInfo(scope, denseSeg);
510  xInitializeStatsAndBlocks(scope, denseSeg);
511 }
512 
513 // ----------------------------------------------------------------------------
514 void
516  const string& message,
517  EDiagSev severity)
518 // ----------------------------------------------------------------------------
519 {
520  if (mpMessageListener) {
521  mpMessageListener->PutMessage(CWriterMessage(message, severity));
522  return;
523  }
524  NCBI_THROW(CObjWriterException, eBadInput, message);
525 };
526 
528 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
const CSeq_id & GetSeq_id(TDim row) const
Definition: Dense_seg.cpp:154
ENa_strand GetSeqStrand(TDim row) const
Definition: Dense_seg.cpp:241
TSeqPos GetSeqStop(TDim row) const
Definition: Dense_seg.cpp:203
TSeqPos GetSeqStart(TDim row) const
Definition: Dense_seg.cpp:165
static CGenbankIdResolve & Get()
bool GetBestId(CSeq_id_Handle, CScope &, string &)
virtual bool PutMessage(const IObjtoolsMessage &message)
Definition: listener.cpp:53
vector< int > mBlockSizes
Definition: psl_record.hpp:158
int mBaseInsertT
Definition: psl_record.hpp:146
void xInitializeSequenceInfo(CScope &, const CSpliced_seg &)
Definition: psl_record.cpp:171
void xInitializeBlocksStrandNegative(CScope &, const CSpliced_seg &)
Definition: psl_record.cpp:242
int mBaseInsertQ
Definition: psl_record.hpp:144
void xInitializeStatsAndBlocks(CScope &, const CDense_seg &)
Definition: psl_record.cpp:437
void Initialize(CScope &scope, const CSpliced_seg &splicedSeg)
Definition: psl_record.cpp:337
CWriterListener * mpMessageListener
Definition: psl_record.hpp:138
vector< int > mBlockStartsQ
Definition: psl_record.hpp:159
void Finalize()
Definition: psl_record.cpp:375
void xInitializeStrands(CScope &, const CSpliced_seg &)
Definition: psl_record.cpp:93
void xInitializeStats(CScope &, const CSpliced_seg &)
Definition: psl_record.cpp:108
void xValidateSegment(CScope &, const CSpliced_seg &)
Definition: psl_record.cpp:310
void xInitializeBlocksStrandPositive(CScope &, const CSpliced_seg &)
Definition: psl_record.cpp:201
vector< int > mBlockStartsT
Definition: psl_record.hpp:160
ENa_strand mStrandQ
Definition: psl_record.hpp:147
void xPutMessage(const string &message, EDiagSev severity)
Definition: psl_record.cpp:515
ENa_strand mStrandT
Definition: psl_record.hpp:148
string mNameQ
Definition: psl_record.hpp:149
string mNameT
Definition: psl_record.hpp:153
void xInitializeBlocks(CScope &, const CSpliced_seg &)
Definition: psl_record.cpp:284
CScope –.
Definition: scope.hpp:92
CSpliced_exon_chunk –.
TSeqPos GetSeqStop(TDim row) const
TSeqPos GetSeqStart(TDim row) const
EDiagSev
Severity level for the posted diagnostics.
Definition: ncbidiag.hpp:650
@ eDiag_Error
Error message.
Definition: ncbidiag.hpp:653
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
const TGenomic_id & GetGenomic_id(void) const
Get the Genomic_id member data.
vector< CRef< CScore > > TScore
Definition: Seq_align_.hpp:398
TMatch GetMatch(void) const
Get the variant data.
const TStarts & GetStarts(void) const
Get the Starts member data.
Definition: Dense_seg_.hpp:530
const TProduct_id & GetProduct_id(void) const
Get the Product_id member data.
const TLens & GetLens(void) const
Get the Lens member data.
Definition: Dense_seg_.hpp:555
bool CanGetProduct_type(void) const
Check if it is safe to call GetProduct_type method.
TProduct_type GetProduct_type(void) const
Get the Product_type member data.
TMismatch GetMismatch(void) const
Get the variant data.
TGenomic_strand GetGenomic_strand(void) const
Get the Genomic_strand member data.
bool CanGetProduct_strand(void) const
Check if it is safe to call GetProduct_strand method.
TDim GetDim(void) const
Get the Dim member data.
Definition: Dense_seg_.hpp:421
TGenomic_ins GetGenomic_ins(void) const
Get the variant data.
const TExons & GetExons(void) const
Get the Exons member data.
TProduct_strand GetProduct_strand(void) const
Get the Product_strand member data.
TProduct_ins GetProduct_ins(void) const
Get the variant data.
bool CanGetGenomic_strand(void) const
Check if it is safe to call GetGenomic_strand method.
E_Choice Which(void) const
Which variant is currently selected.
@ e_Product_ins
insertion in product sequence (i.e. gap in the genomic sequence)
@ e_Genomic_ins
insertion in genomic sequence (i.e. gap in the product sequence)
@ e_Match
both sequences represented, product and genomic sequences match
@ e_Mismatch
both sequences represented, product and genomic sequences do not match
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
@ eNa_strand_unknown
Definition: Na_strand_.hpp:65
int i
const struct ncbi::grid::netcache::search::fields::KEY key
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
USING_SCOPE(objects)
static void sExonChunkAppendStats(const CSpliced_exon_chunk &chunk, int &matchSize, int &misMatchSize, int &productInsertionSize, int &genomicInsertionSize)
Definition: psl_record.cpp:65
Modified on Tue Apr 23 07:37:18 2024 by modify_doxy.py rev. 669887