NCBI C++ ToolKit
seq_align_util.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: seq_align_util.cpp 79163 2017-08-16 19:27:32Z grichenk $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Aleksey Grichenko
27 *
28 * File Description:
29 * Seq-align utilities
30 */
31 
32 #include <ncbi_pch.hpp>
41 #include <objmgr/seq_vector.hpp>
42 #include <objmgr/util/sequence.hpp>
45 
46 
49 BEGIN_SCOPE(sequence)
50 
51 
54  const CSeq_loc& loc,
55  CScope* scope)
56 {
57  if ( loc.IsWhole() ) {
59  copy->Assign(align);
60  return copy;
61  }
62  const CSeq_id* orig_id = loc.GetId();
63  if ( !orig_id ) {
64  NCBI_THROW(CAnnotMapperException, eBadLocation,
65  "Location with multiple ids can not be used to "
66  "remap seq-aligns.");
67  }
68  CRef<CSeq_id> id(new CSeq_id);
69  id->Assign(*orig_id);
70 
71  // Create source seq-loc
72  CSeq_loc src_loc(*id, 0, GetLength(loc, scope) - 1);
73  ENa_strand strand = loc.GetStrand();
74  if (strand != eNa_strand_unknown) {
75  src_loc.SetStrand(strand);
76  }
77  CSeq_loc_Mapper mapper(src_loc, loc, scope);
78  return mapper.Map(align, row);
79 }
80 
81 
83 {
84 public:
85  CProductStringBuilder(const CSeq_align& align, CScope& scope);
86  const string& GetProductString(void);
87 
88 private:
89  bool x_AddExon(const CSpliced_exon& ex);
90  bool x_AddExonPart(const CSpliced_exon_chunk& ch, TSeqPos& gen_offset);
91  void x_Match(TSeqPos gen_from, TSeqPos gen_to_open);
92  bool x_Mismatch(TSeqPos mismatch_len);
93 
97  bool m_GenRev = false;
98  bool m_ProdRev = false;
100  string m_ExonData;
101  string m_Result;
103  size_t m_MismatchPos = 0;
104 };
105 
106 
108  : m_Align(align), m_Scope(scope)
109 {
110 }
111 
112 
114 {
115  m_Result.clear();
116  // Only spliced-segs are supported.
117  if (!m_Align.GetSegs().IsSpliced()) {
118  NCBI_THROW(CObjmgrUtilException, eBadAlignment,
119  "Only splised-seg alignments are supported");
120  }
121 
122  const CSpliced_seg& spliced_seg = m_Align.GetSegs().GetSpliced();
123  // Only genomic alignments support MismatchedBases.
125  // ERROR: Non-transcript alignment.
126  NCBI_THROW(CObjmgrUtilException, eBadAlignment,
127  "Only transcript spliced-segs are supported");
128  }
129 
130  const CSeq_id& gen_id = m_Align.GetSeq_id(1);
131 
132  CBioseq_Handle gen_handle = m_Scope.GetBioseqHandle(gen_id);
133  if ( !gen_handle ) {
134  NCBI_THROW(CObjmgrUtilException, eBadAlignment,
135  "Failed to fetch genomic sequence data");
136  }
137 
139 
140  if ( spliced_seg.IsSetProduct_length() ) {
141  m_Result.reserve(spliced_seg.GetProduct_length());
142  }
145 
146  // NOTE: Even if ext is not set or does not contain MismatchedBases entry it may
147  // still be possible to generate product sequence if the alignment is a perfect
148  // match (no indels, mismatches or unaligned ranges on product).
149 
150  if ( m_Align.IsSetExt() ) {
151  // Find MismatchedBases entry in ext. If several entries are present, use
152  // the first one.
153  ITERATE(CSeq_align::TExt, ext_it, m_Align.GetExt()) {
154  const CUser_object& obj = **ext_it;
155  if (obj.GetType().IsStr() && obj.GetType().GetStr() == "MismatchedBases") {
156  ITERATE(CUser_object::TData, data_it, obj.GetData()) {
157  const CUser_field& field = **data_it;
158  if (field.GetLabel().IsStr() && field.GetLabel().GetStr() == "Bases" &&
159  field.GetData().IsStr()) {
160  m_MismatchedBases = field.GetData().GetStr();
161  break;
162  }
163  }
164  if ( !m_MismatchedBases.empty() ) break;
165  }
166  }
167  }
168 
169  if ((m_GenRev != m_ProdRev) && !m_MismatchedBases.empty()) {
171  }
172 
173  const CSpliced_seg::TExons& exons = spliced_seg.GetExons();
174 
175  if ( m_ProdRev ) {
176  REVERSE_ITERATE(CSpliced_seg::TExons, ex_it, exons) {
177  if ( !x_AddExon(**ex_it) ) return kEmptyStr;
178  }
179  }
180  else {
181  ITERATE(CSpliced_seg::TExons, ex_it, exons) {
182  if ( !x_AddExon(**ex_it) ) return kEmptyStr;
183  }
184  }
185  if (m_MismatchPos < m_MismatchedBases.size()) {
187  }
188 
189  return m_Result;
190 }
191 
192 
194 {
195  TSeqPos gen_from = ex.GetGenomic_start();
196  TSeqPos gen_to = ex.GetGenomic_end() + 1; // open range
199 
200  // The whole exon must be reverse-complemented.
201  m_GenVector.GetSeqData(gen_from, gen_to, m_ExonData);
202  if (m_GenRev != m_ProdRev) {
204  }
205 
206  TSeqPos prod_from = ex.GetProduct_start().GetNucpos();
207  if (prod_from > m_ProdPos) {
208  if ( !x_Mismatch(prod_from - m_ProdPos) ) return false;
209  }
210  _ASSERT(prod_from == m_ProdPos);
211 
212  if ( ex.IsSetParts() ) {
213  // Iterate parts
214  TSeqPos gen_offset = 0;
215  if (m_ProdRev) {
217  if ( !x_AddExonPart(**part_it, gen_offset) ) return false;
218  }
219  }
220  else {
221  ITERATE(CSpliced_exon::TParts, part_it, ex.GetParts()) {
222  if ( !x_AddExonPart(**part_it, gen_offset) ) return false;
223  }
224  }
225  }
226  else {
227  // Use whole exon
228  x_Match(0, gen_to - gen_from);
229  }
230  _ASSERT(m_ProdPos == ex.GetProduct_end().GetNucpos() + 1);
231  return true;
232 }
233 
234 
236 {
237  switch ( ch.Which() ) {
239  x_Match(gen_offset, gen_offset + ch.GetMatch());
240  gen_offset += ch.GetMatch();
241  break;
243  if ( !x_Mismatch(ch.GetMismatch()) ) return false;
244  gen_offset += ch.GetMismatch();
245  break;
248  break;
250  gen_offset += ch.GetGenomic_ins();
251  break;
253  // ERROR: It's not clear if diag is a match or a mismatch.
254  default:
255  // ERROR: Unexpected chunk type.
256  NCBI_THROW(CObjmgrUtilException, eBadAlignment,
257  "Unsupported chunk type");
258  }
259  return true;
260 }
261 
262 
263 inline
264 void CProductStringBuilder::x_Match(TSeqPos gen_from, TSeqPos gen_to_open)
265 {
266  m_Result.append(m_ExonData.substr(gen_from, gen_to_open - gen_from));
267  m_ProdPos += gen_to_open - gen_from;
268 }
269 
270 
271 inline
273 {
274  if (m_MismatchedBases.size() < mismatch_len) return false;
275  m_Result.append(m_MismatchedBases.substr(m_MismatchPos, mismatch_len));
276  m_MismatchPos += mismatch_len;
277  m_ProdPos += mismatch_len;
278  return true;
279 }
280 
281 
282 string GetProductString(const CSeq_align& align, CScope& scope)
283 {
284  CProductStringBuilder builder(align, scope);
285  return builder.GetProductString();
286 }
287 
288 
289 END_SCOPE(sequence)
static CRef< CScope > m_Scope
bool IsReverse(ENa_strand s)
Definition: Na_strand.hpp:75
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
CAnchoredAln::TDim TDim
Seq-loc and seq-align mapper exceptions.
CBioseq_Handle –.
Exceptions for objmgr/util library.
const string & GetProductString(void)
bool x_Mismatch(TSeqPos mismatch_len)
bool x_AddExonPart(const CSpliced_exon_chunk &ch, TSeqPos &gen_offset)
void x_Match(TSeqPos gen_from, TSeqPos gen_to_open)
const CSeq_align & m_Align
bool x_AddExon(const CSpliced_exon &ex)
CProductStringBuilder(const CSeq_align &align, CScope &scope)
CRef –.
Definition: ncbiobj.hpp:618
CScope –.
Definition: scope.hpp:92
static SIZE_TYPE ReverseComplement(const string &src, TCoding src_coding, TSeqPos pos, TSeqPos length, string &dst)
@ e_Iupacna
Definition: sequtil.hpp:47
CSeqVector –.
Definition: seq_vector.hpp:65
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
Definition: Seq_align.cpp:317
ENa_strand GetSeqStrand(TDim row) const
Get strand (the first one if segments have different strands).
Definition: Seq_align.cpp:294
CSeq_loc_Mapper –.
CSpliced_exon_chunk –.
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define REVERSE_ITERATE(Type, Var, Cont)
ITERATE macro to reverse sequence through container elements.
Definition: ncbimisc.hpp:827
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
void SetStrand(ENa_strand strand)
Set the strand for all of the location's ranges.
Definition: Seq_loc.cpp:5196
string GetProductString(const CSeq_align &align, CScope &scope)
Given a spliced-seg alignment with MismatchedBases user object, compose product sequence data.
CRef< CSeq_align > RemapAlignToLoc(const CSeq_align &align, CSeq_align::TDim row, const CSeq_loc &loc, CScope *scope)
Remap seq-align row to the seq-loc.
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
CRef< CSeq_loc > Map(const CSeq_loc &src_loc)
Map seq-loc.
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
Definition: seq_vector.cpp:304
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
#define kEmptyStr
Definition: ncbistr.hpp:123
const TStr & GetStr(void) const
Get the variant data.
bool IsStr(void) const
Check if variant Str is selected.
Definition: Object_id_.hpp:291
const TData & GetData(void) const
Get the Data member data.
bool IsStr(void) const
Check if variant Str is selected.
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
const TData & GetData(void) const
Get the Data member data.
const TLabel & GetLabel(void) const
Get the Label member data.
const TType & GetType(void) const
Get the Type member data.
vector< CRef< CUser_field > > TData
bool IsSetParts(void) const
Basic segments are always in biologic order. Check if a value has been assigned to Parts data member.
TMatch GetMatch(void) const
Get the variant data.
bool IsSetExt(void) const
Extra info. Check if a value has been assigned to Ext data member.
Definition: Seq_align_.hpp:989
TGenomic_start GetGenomic_start(void) const
Get the Genomic_start member data.
TProduct_length GetProduct_length(void) const
Get the Product_length member data.
TProduct_type GetProduct_type(void) const
Get the Product_type member data.
TMismatch GetMismatch(void) const
Get the variant data.
list< CRef< CUser_object > > TExt
Definition: Seq_align_.hpp:402
const TParts & GetParts(void) const
Get the Parts member data.
const TProduct_start & GetProduct_start(void) const
Get the Product_start member data.
const TProduct_end & GetProduct_end(void) const
Get the Product_end member data.
const TSpliced & GetSpliced(void) const
Get the variant data.
Definition: Seq_align_.cpp:219
TGenomic_ins GetGenomic_ins(void) const
Get the variant data.
list< CRef< CSpliced_exon > > TExons
const TExons & GetExons(void) const
Get the Exons member data.
const TExt & GetExt(void) const
Get the Ext member data.
list< CRef< CSpliced_exon_chunk > > TParts
bool IsSetProduct_length(void) const
length of the product, in bases/residues from this (or from poly-a if present), a 3' unaligned length...
TGenomic_end GetGenomic_end(void) const
Get the Genomic_end member data.
bool IsSpliced(void) const
Check if variant Spliced is selected.
Definition: Seq_align_.hpp:778
bool IsNucpos(void) const
Check if variant Nucpos is selected.
TProduct_ins GetProduct_ins(void) const
Get the variant data.
TNucpos GetNucpos(void) const
Get the variant data.
const TSegs & GetSegs(void) const
Get the Segs member data.
Definition: Seq_align_.hpp:921
E_Choice Which(void) const
Which variant is currently selected.
@ e_Product_ins
insertion in product sequence (i.e. gap in the genomic sequence)
@ e_Diag
both sequences are represented, there is sufficient similarity between product and genomic sequences....
@ e_Genomic_ins
insertion in genomic sequence (i.e. gap in the product sequence)
@ e_Match
both sequences represented, product and genomic sequences match
@ e_Mismatch
both sequences represented, product and genomic sequences do not match
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
@ eNa_strand_unknown
Definition: Na_strand_.hpp:65
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
#define row(bind, expected)
Definition: string_bind.c:73
#define _ASSERT
#define const
Definition: zconf.h:232
Modified on Wed Apr 17 13:08:11 2024 by modify_doxy.py rev. 669887