NCBI C++ ToolKit
variation_util2.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: variation_util2.cpp 415910 2013-10-22 16:06:39Z astashya
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * File Description:
27  *
28  */
29 
30 #include <ncbi_pch.hpp>
34 
41 #include <objmgr/feat_ci.hpp>
42 #include <objmgr/align_ci.hpp>
43 
44 
47 
59 
60 #include <objects/pub/Pub_set.hpp>
61 #include <objects/pub/Pub.hpp>
65 
66 
69 
71 #include <objects/seq/Seq_data.hpp>
75 #include <objects/seq/Seq_inst.hpp>
76 #include <objects/seq/Seqdesc.hpp>
77 
80 
82 
83 #include <serial/iterator.hpp>
84 #include <objmgr/util/sequence.hpp>
85 #include <objmgr/bioseq_handle.hpp>
86 #include <objmgr/seq_vector.hpp>
89 
91 
93 
94 namespace variation
95 {
96 
98  const string& message,
100  static_cast<CVariationException::ECode>(0))
101  {
103  e->SetMessage(message);
104  if(code) {
105  e->SetCode(code);
106  }
107  return e;
108  }
109 
110 
111  template<typename T>
112  static void ChangeIdsInPlace(
113  T& container,
114  sequence::EGetIdType id_type,
115  CScope& scope)
116  {
117  for(CTypeIterator<CSeq_id> it = Begin(container); it; ++it) {
118  CSeq_id& id = *it;
119  if((id_type == sequence::eGetId_ForceAcc && id.GetTextseq_Id())
120  || (id_type == sequence::eGetId_ForceGi && id.IsGi())) {
121  continue;
122  }
123 
124  const CSeq_id_Handle idh = sequence::GetId(id, scope, id_type);
125 
126  if(idh) {
127  id.Assign(*idh.GetSeqId());
128  } else if(id.IsGeneral() || id.IsLocal()) {
129  ; // e.g. ENST
130  } else {
131  NCBI_USER_THROW("Can't convert seq-id for " + it->AsFastaString());
132  }
133  }
134  }
135 
137  const CVariantPlacement& p,
138  CScope* scope)
139  {
140  int start_offset = p.IsSetStart_offset() ? p.GetStart_offset() : 0;
141 
142  int stop_offset = p.IsSetStop_offset() ? p.GetStop_offset()
143  : p.GetLoc().IsPnt() ? start_offset
144  : 0;
145  return ncbi::sequence::GetLength(p.GetLoc(), scope)
146  + stop_offset
147  - start_offset;
148  }
149 
150  // verify that exon_anchor_pos is represented in biostarts or biostops
151  // depending on the sign of offset_pos
152  static bool ValidExonTerminal(
153  const set<TSeqPos>& exon_biostarts,
154  const set<TSeqPos>& exon_biostops,
155  TSeqPos exon_anchor_pos,
156  int offset_pos)
157  {
158  // VAR-1379, VAR-1500
159  // Note: offset=0 may appear in cases like NM_000535.5:c.1145-?_2174+?del
160  // We don't know here whether we're dealing with lt or gt, so will optimistically
161  // allow if we find the anchor in either starts or stops.
162  bool found_in_starts = exon_biostarts.find(exon_anchor_pos)
163  != exon_biostarts.end();
164 
165  bool found_in_stops = exon_biostops.find(exon_anchor_pos)
166  != exon_biostops.end();
167 
168  return (offset_pos < 0 && found_in_starts)
169  || (offset_pos > 0 && found_in_stops)
170  || (offset_pos == 0 && (found_in_starts || found_in_stops));
171  }
172 
173 
174  static int GetFuzzSign(const CInt_fuzz& fuzz, int loc_sign)
175  {
176  return !fuzz.IsLim() ? 0
177  : fuzz.GetLim() == CInt_fuzz::eLim_lt ? -1
178  : fuzz.GetLim() == CInt_fuzz::eLim_tl ? -1 * loc_sign
179  : fuzz.GetLim() == CInt_fuzz::eLim_gt ? 1
180  : fuzz.GetLim() == CInt_fuzz::eLim_tr ? 1 * loc_sign
181  : 0;
182  }
183 
184 
185  static bool ValidExonTerminals(
186  const set<TSeqPos>& exon_biostarts,
187  const set<TSeqPos>& exon_biostops,
188  const CVariantPlacement& p)
189  {
190  int sign = sequence::GetStrand(p.GetLoc(), NULL) == eNa_strand_minus ? -1 : 1;
191 
192  const int start_offset_sign = //VAR-1500
194  && p.GetStart_offset() > 0 ? 1
195 
196  : p.IsSetStart_offset()
197  && p.GetStart_offset() < 0 ? -1
198 
200 
201  : 0;
202 
203  const int stop_offset_sign =
204  p.IsSetStop_offset()
205  && p.GetStop_offset() > 0 ? 1
206 
207  : p.IsSetStop_offset()
208  && p.GetStop_offset() < 0 ? -1
209 
211 
212  : 0;
213 
214  // if offset's sign is consistent with location's strand, we expect to find
215  // the anchor in exon-stops, otherwise in exon-starts. //VAR-1379
216  const bool start_ok =
219  exon_biostarts,
220  exon_biostops,
222  start_offset_sign * sign);
223 
224  const bool stop_ok =
227  exon_biostarts,
228  exon_biostops,
230  stop_offset_sign * sign);
231 
232  return start_ok && stop_ok;
233  }
234 
235 
237  {
238  const CSeq_id* id = p.GetLoc().GetId();
239  if(!id
240  || !(p.IsSetStart_offset() || p.IsSetStop_offset())
241  || !(p.IsSetMol() && (p.GetMol() == CVariantPlacement::eMol_cdna
242  || p.GetMol() == CVariantPlacement::eMol_rna))) {
243  return eNotApplicable;
244  }
245 
246  SAnnotSelector sel;
249 
250  set<TSeqPos> exon_biostarts, exon_biostops;
251  for(CFeat_CI ci(bsh, sel); ci; ++ci) {
252  const CMappedFeat& mf = *ci;
253  exon_biostarts.insert(sequence::GetStart(mf.GetLocation(), NULL));
254  exon_biostops.insert(sequence::GetStop(mf.GetLocation(), NULL));
255  }
256 
257  return exon_biostarts.empty() && exon_biostops.empty() ? eNotApplicable
258  : ValidExonTerminals(exon_biostarts, exon_biostops, p) ? ePass
259  : eFail;
260  }
261 
263  const CVariantPlacement& p,
264  const CSeq_align& aln)
265  {
266  const CSeq_id* id = p.GetLoc().GetId();
267  if(!aln.GetSegs().IsSpliced()
268  || !id
269  || !sequence::IsSameBioseq(aln.GetSeq_id(0), *id, m_scope)
270  || !(p.IsSetStart_offset() || p.IsSetStop_offset())
272  ) {
273  return eNotApplicable;
274  }
275 
276  set<TSeqPos> exon_biostarts, exon_biostops;
278  {
279  const CSpliced_exon& exon = **it;
280  exon_biostarts.insert(exon.GetProduct_start().IsNucpos() ?
281  exon.GetProduct_start().GetNucpos() :
282  exon.GetProduct_start().GetProtpos().GetAmin());
283 
284  exon_biostops.insert(exon.GetProduct_end().IsNucpos() ?
285  exon.GetProduct_end().GetNucpos() :
286  exon.GetProduct_end().GetProtpos().GetAmin());
287  }
288 
289  return ValidExonTerminals(exon_biostarts, exon_biostops, p) ? ePass : eFail;
290  }
291 
292  static void SwapLtGtFuzz(CInt_fuzz& fuzz)
293  {
294  if(fuzz.IsLim() && fuzz.GetLim() == CInt_fuzz::eLim_gt) {
296  } else if(fuzz.IsLim() && fuzz.GetLim() == CInt_fuzz::eLim_lt) {
298  }
299  }
300 
301 
302  //loc is presumed Int-loc
303  static void ApplyOffsetFuzz(
304  CSeq_loc& loc,
305  const CInt_fuzz& offset_fuzz,
306  bool is_start)
307  {
308  bool minus_strand = loc.GetStrand() == eNa_strand_minus;
309  CInt_fuzz& fuzz = (minus_strand == is_start) ? loc.SetInt().SetFuzz_to()
310  : loc.SetInt().SetFuzz_from();
311  if(!fuzz.Which()) { //will not apply this fuzz if mapping itself was fuzzy
312  fuzz.Assign(offset_fuzz);
313  if(minus_strand) {
314  SwapLtGtFuzz(fuzz);
315  }
316  if(fuzz.IsRange()) {
317  //fuzz range coordinates are invalid after remapping
319  }
320  }
321  }
322 
323 
325  {
326  if(!p.IsSetStart_offset()
327  && !p.IsSetStop_offset()) {
328  return;
329  }
330 
331  if(p.GetLoc().IsPnt()
332  && !p.IsSetStop_offset()
333  && p.IsSetStart_offset()) {
334  //In case of point, only start-offset may be specified. In this case
335  //temporarily set stop-offset to be the same (it will be reset below),
336  //such that start and stop of point-as-interval are adjusted consistently.
338  }
339 
340  //Convert to interval.
341  //Need to do for a point so that we can deal with start and stop independently.
342  //Another scenario is when start/stop in CDNA coords lie in different exons, so
343  //the mapping is across one or more intron - we want to collapse the genomic loc
344  //(do we want to preselve the gaps caused by indels rather than introns?)
345  CRef<CSeq_loc> loc =
347  p.GetLoc(),
349  NULL);
350 
351  if(!loc->IsInt()
352  && (p.IsSetStart_offset() || p.IsSetStop_offset())) {
353  NcbiCerr << MSerial_AsnText << p;
354  NCBI_THROW(CException, eUnknown, "Complex location");
355  }
356 
357  bool minus_strand = loc->GetStrand() == eNa_strand_minus;
358 
359  if(p.IsSetStart_offset()) {
360  TSeqPos& bio_start_ref = minus_strand ? loc->SetInt().SetTo()
361  : loc->SetInt().SetFrom();
362 
363  bio_start_ref += p.GetStart_offset() * (minus_strand ? -1 : 1);
364  p.ResetStart_offset();
365  }
366 
367  if(p.IsSetStop_offset()) {
368  TSeqPos& bio_stop_ref =
369  loc->GetStrand() == eNa_strand_minus ? loc->SetInt().SetFrom()
370  : loc->SetInt().SetTo();
371 
372  bio_stop_ref += p.GetStop_offset() * (minus_strand ? -1 : 1);
373  p.ResetStop_offset();
374  }
375 
376  if(p.IsSetStart_offset_fuzz()) {
377  ApplyOffsetFuzz(*loc, p.GetStart_offset_fuzz(), true);
379  }
380 
381  if(p.IsSetStop_offset_fuzz()) {
382  ApplyOffsetFuzz(*loc, p.GetStop_offset_fuzz(), false);
384  }
385 
386  if(loc->GetTotalRange().GetLength() == 1) {
388  }
389 
390  p.SetLoc().Assign(*loc);
391  }
392 
393 
396  const CSpliced_seg& ss,
397  CScope* scope)
398  {
399 #if 0
400  if(!p.GetLoc().IsPnt() && !p.GetLoc().IsInt()) {
401  NCBI_THROW(CException, eUnknown, "Expected simple loc (int or pnt)");
402  }
403 #endif
404 
405  if(p.IsSetStart_offset()
406  || p.IsSetStop_offset()
408  || p.IsSetStop_offset_fuzz()) {
409  NCBI_THROW(CException, eUnknown, "Expected offset-free placement");
410  }
411 
412  if(!p.GetLoc().GetId()
414  *p.GetLoc().GetId(),
415  ss.GetGenomic_id(),
416  scope)) {
418  "Expected genomic_id in the variation to be the same as in spliced-seg");
419  }
420 
421  long start = p.GetLoc().GetStart(eExtreme_Positional);
422  long stop = p.GetLoc().GetStop(eExtreme_Positional);
423 
424  long closest_start = ss.GetExons().front()->GetGenomic_start();
425  // closest-exon-boundary for bio-start of variation location
426 
427  long closest_stop = ss.GetExons().front()->GetGenomic_start();
428  // closest-exon-boundary for bio-stop of variation location
429 
431  {
432  const CSpliced_exon& se = **it;
433 
434  if(se.GetGenomic_end() >= start && se.GetGenomic_start() <= start) {
435  closest_start = start; //start is within exon - use itself.
436  } else {
437  if(abs((long)se.GetGenomic_end() - start) < abs(closest_start - start)) {
438  closest_start = (long)se.GetGenomic_end();
439  }
440  if(abs((long)se.GetGenomic_start() - start) < abs(closest_start - start)) {
441  closest_start = (long)se.GetGenomic_start();
442  }
443  }
444 
445  if(se.GetGenomic_end() >= stop && se.GetGenomic_start() <= stop) {
446  closest_stop = stop; //end is within exon - use itself.
447  } else {
448  if(abs((long)se.GetGenomic_end() - stop) < abs(closest_stop - stop)) {
449  closest_stop = (long)se.GetGenomic_end();
450  }
451  if(abs((long)se.GetGenomic_start() - stop) < abs(closest_stop - stop)) {
452  closest_stop = (long)se.GetGenomic_start();
453  }
454  }
455  }
456 
457  //convert to int, so we can set start and stop
459 
460  //adjust location
461  if(start != closest_start || stop != closest_stop) {
462  int_loc->SetInt().SetFrom(closest_start);
463  int_loc->SetInt().SetTo(closest_stop);
464  }
465  p.SetLoc(*int_loc);
466 
467  //Add offsets. Note that start-offset and stop-offset are always biological.
468  //(e.g. start-offset applies to bio-start; value > 0 means downstream, and value <0 means upstream)
469 
470  //Note: qry/sbj strandedness in the alignment is irrelevent - we're simply adjusting the
471  //location in the genomic coordinate system, conserving the strand
472  if(start != closest_start) {
473  long offset = (start - closest_start);
474  if(p.GetLoc().GetStrand() == eNa_strand_minus) {
475  p.SetStop_offset(offset * -1);
476  } else {
478  }
479  }
480 
481  if(stop != closest_stop) {
482  long offset = (stop - closest_stop);
483  if(p.GetLoc().GetStrand() == eNa_strand_minus) {
484  p.SetStart_offset(offset * -1);
485  } else {
487  }
488  }
489  }
490 
491 
492  CRef<CVariantPlacement> CVariationUtil::Remap(const CVariantPlacement& p, const CSeq_align& aln, bool check_placements)
493  {
494  if(!p.GetLoc().GetId()) {
495  NCBI_THROW(CException, eUnknown, "Can't get seq-id");
496  }
497 
498  //note: the operations here are scope-free (i.e. seq-id types in aln must be consistent with p,
499  //or else will throw; using a scope will impose major slowdown, which is problematic if we're
500  //dealing with tens or hundreds of thousands of remappings.
501 
503  p2->Assign(p);
504 
505  if(aln.GetSegs().IsSpliced()
507  *p2->GetLoc().GetId(), NULL)) {
509  }
510 
511  CSeq_align::TDim target_row = -1;
512  for(int i = 0; i < 2; i++) {
513  if(sequence::IsSameBioseq(*p2->GetLoc().GetId(), aln.GetSeq_id(i), NULL)) {
514  target_row = 1 - i;
515  }
516  }
517 
518  if(target_row == -1) {
520  eUnknown,
521  "The alignment has no row for seq-id "
522  + p2->GetLoc().GetId()->AsFastaString());
523  }
524 
525  CRef<CSeq_loc_Mapper> mapper(new CSeq_loc_Mapper(aln, target_row, NULL));
526  mapper->SetMergeAll();
527 
528  CRef<CVariantPlacement> p3 = x_Remap(*p2, *mapper);
529 
530  if(p3->GetLoc().GetId()
531  && aln.GetSegs().IsSpliced()
533  *p3->GetLoc().GetId(), NULL)) {
534  if(CheckExonBoundary(p, aln) == eFail) {
535 
536  bool source_loc_is_projected =
539  // checkig exon-boundary only for non-projected cases, since an exon-boundary
540  // may become non-exon-boundary (e.g. transcript extended or exon boundary shifted).
541  // VAR-1309
542 
546 
547  const string msg =
548  "HGVS exon-boundary position not found in alignment of "
549  + aln.GetSeq_id(0).AsFastaString()
550  + "-vs-"
551  + aln.GetSeq_id(1).AsFastaString();
552 
553  p3->SetExceptions().push_back(CreateException(msg, code));
554  }
555 
557  }
558 
559  //note: AttachSeq happens only after the intronic offsets are resolved.
560  AttachSeq(*p3);
561 
562  // Note: as per SNP-5641 - will check for mismatches
563  // even in the presence of other more severe exceptions
564  // NcbiCerr << "Checking for mismatches-in-mapping"
565  // << MSerial_AsnText << p << MSerial_AsnText << *p3 << "\n\n";
566  if(p.IsSetSeq() && p3->IsSetSeq()
567  && p.GetSeq().GetLength() == p3->GetSeq().GetLength()
568  && p.GetSeq().IsSetSeq_data() && p3->GetSeq().IsSetSeq_data()
569  && p.GetSeq().GetSeq_data().Which() == p3->GetSeq().GetSeq_data().Which()
570  && !p.GetSeq().GetSeq_data().Equals(p3->GetSeq().GetSeq_data())) {
571  p3->SetExceptions().push_back(CreateException(
572  "Mismatches in mapping",
574  }
575 
576  //VAR-1307
577  {{
578  static const long thr = 5000;
579  long start_offset = !p2->IsSetStart_offset() ? 0 : p2->GetStart_offset();
580  long stop_offset = !p2->IsSetStop_offset() ? 0 : p2->GetStop_offset();
581  bool far_start = start_offset + thr < 0
582  && (p2->GetLoc().GetStart(eExtreme_Biological) == aln.GetSeqStart(1)
583  || p2->GetLoc().GetStart(eExtreme_Biological) == aln.GetSeqStop(1));
584 
585  bool far_stop = stop_offset > thr
586  && (p2->GetLoc().GetStop(eExtreme_Biological) == aln.GetSeqStart(1)
587  || p2->GetLoc().GetStop(eExtreme_Biological) == aln.GetSeqStop(1));
588 
589  if(far_start || far_stop) {
590  p3->SetExceptions().push_back(CreateException(
591  "Source location overhangs the alignment by at least 5kb ",
593  }
594  }}
595 
596  if(check_placements) {
597  CheckPlacement(*p3);
598  }
599  return p3;
600  }
601 
602 
604  {
605  CRef<CSeq_align> aln(new CSeq_align);
607  CSpliced_seg& ss = aln->SetSegs().SetSpliced();
608  ss.SetProduct_id().Assign(sequence::GetId(rna_feat.GetProduct(), NULL));
609  ss.SetGenomic_id().Assign(sequence::GetId(rna_feat.GetLocation(), NULL));
610  ss.SetExons();
614 
615  TSignedSeqPos product_pos(0);
618  se->SetGenomic_start(ci.GetRange().GetFrom());
619  se->SetGenomic_end(ci.GetRange().GetTo());
620  se->SetProduct_start().SetNucpos(product_pos);
621  se->SetProduct_end().SetNucpos(product_pos + ci.GetRange().GetLength() - 1);
622  product_pos += ci.GetRange().GetLength();
623  ss.SetExons().push_back(se);
624  }
625 
626  return aln;
627  }
628 
629 
630  //todo: 1. what if we don't have the requested version of the product annotated?
631  // 2. do we need to support NGs, etc (will need to search all alignments on sequnece, as we can't find NG by product-id)
632  // 3. Do we want to return VariantPlacement instead, such that we can communicate auxiliary info?
634  const CVariation& v,
635  const CSeq_id& target)
636  {
638  result->SetLoc().SetNull();
640  CBioseq_Handle bsh = m_scope->GetBioseqHandle(target);
641  CRef<CSeq_loc_Mapper> upmapper;
642 
643  if(v.IsSetPlacements()) {
644  //First see if we already have location on target in one of the placements
646  {
647  const CVariantPlacement& p = **it;
648  if(p.GetLoc().GetId() && sequence::IsSameBioseq(target, *p.GetLoc().GetId(), m_scope)) {
649  result->Assign(p);
650  break;
651  }
652  }
653 
654  if(result->GetLoc().IsNull()) {
655  //did not find an existing placement for the target; will try to remap
657  {
658  const CVariantPlacement& placement = **it;
659 
660  if(!placement.GetLoc().GetId()) {
661  continue;
662  }
663  CSeq_id_Handle p_idh = sequence::GetId(*placement.GetLoc().GetId(),
664  *m_scope,
666 
667  bool is_scaffold = p_idh.IdentifyAccession() == CSeq_id::eAcc_refseq_contig
671 
672  if(placement.GetMol() == CVariantPlacement::eMol_protein) {
673 
674  CRef<CVariation> precursor = InferNAfromAA(v);
675  result = RemapToAnnotatedTarget(*precursor, target);
676 
677  } else if(placement.GetMol() == CVariantPlacement::eMol_genomic
678  && is_scaffold) {
679  // location is possibly a component of the target
680  if(!upmapper) {
681  upmapper.Reset(new CSeq_loc_Mapper(1, bsh, CSeq_loc_Mapper::eSeqMap_Up));
682  }
683  result = Remap(placement, *upmapper);
685 
686  } else {
687 
688  // the annotation is gi-based, so we'll make sure the target
689  // is gi-based so we won't need to do conversion for all products while searching.
690  CSeq_id_Handle product_idh = sequence::GetId(*placement.GetLoc().GetId(),
691  *m_scope,
693  if(!product_idh) {
694  continue;
695  }
696 
697  // note that here we are looking for annotated (packaged) alignment at the
698  // feature's location first (we could have searched the whole sequence instead,
699  // but it is quite slow for NCs) If we have the feature but no alignment,
700  // it is assumed that the alignment is perfect, and a synthetic seq-align
701  // is created based on RNA feature.
702  CRef<CSeq_align> aln;
703  {{
704  CRef<CSeq_loc> target_loc = bsh.GetRangeSeq_loc(0, 0); //this returns "whole"
705  //will try to constrain target_loc from whole to product location only
706 
707  SAnnotSelector sel;
708  sel.SetResolveAll();
709  sel.SetAdaptiveDepth(true);
711 
712  for(CFeat_CI ci(bsh, sel); ci; ++ci) {
713  const CMappedFeat& mf = *ci;
714  if(!mf.IsSetProduct()) {
715  continue;
716  }
717  const CSeq_id& product_id = sequence::GetId(mf.GetProduct(), NULL);
718  if(product_id.Equals(*product_idh.GetSeqId())) {
719  target_loc->Assign(mf.GetLocation());
721  break;
722  }
723  }
724 
725  //try to find explicit seq_align at the target-loc (if not found, will use the synthetic one)
726  for(CAlign_CI ci2(*m_scope, *target_loc, sel); ci2; ++ci2) {
727  const CSeq_align& current_aln = *ci2;
728  if(current_aln.GetSeq_id(0).Equals(*product_idh.GetSeqId())) {
729  aln = SerialClone<CSeq_align>(current_aln);
730  }
731  }
732  }}
733 
734  if(aln) {
735  CRef<CVariantPlacement> p2(SerialClone<CVariantPlacement>(placement));
738  result = Remap(*p2, *aln);
739  }
740 
741  }
742  if(!result->GetLoc().IsNull()) {
743  break;
744  }
745  }
746  }
747  } else if(v.GetData().IsSet()) { //no placements at this level - try in sub-variations
749  {
750  const CVariation& v2 = **it;
751  CRef<CVariantPlacement> mapped_placement = RemapToAnnotatedTarget(v2, target);
752  if(!result->GetLoc().IsNull()) { //already have somthing - append
753  result->SetLoc().Add(mapped_placement->GetLoc());
754  } else {
755  result->Assign(*mapped_placement);
756  }
757  }
758  }
759 
761  result->SetLoc().Assign(*loc);
762  return result;
763  }
764 
765 
766 
767  template<typename T>
768  static bool ContainsIupacNaAmbiguities(const T& obj)
769  {
770  for(CTypeConstIterator<CSeq_data> it(Begin(obj)); it; ++it) {
771  const CSeq_data& sd = *it;
772  if(!sd.IsIupacna()) {
773  continue;
774  }
775  ITERATE(string, it2, sd.GetIupacna().Get())
776  {
777  char c = *it2;
778  if(c != 'A' && c != 'C' && c != 'G' && c != 'T') {
779  return true;
780  }
781  }
782  }
783  return false;
784  }
785 
787  {
788  bool had_ambiguities = false;
789 
790  if(v.IsSetPlacements() && v.GetPlacements().size() > 0) {
792  had_ambiguities = true;
793  v.SetPlacements().front()->SetExceptions().push_back(CreateException("Ambiguous residues in variation", CVariationException::eCode_ambiguous_sequence));
794  }
795  }
796 
797  if(v.GetData().IsSet()) {
798  NON_CONST_ITERATE(CVariation::TData::TSet::TVariations, it, v.SetData().SetSet().SetVariations())
799  {
800  CVariation& v2 = **it;
801  had_ambiguities = had_ambiguities || CheckAmbiguitiesInLiterals(v2);
802  }
803  }
804  return had_ambiguities;
805  }
806 
808  {
809  bool invalid_location = false;
810  bool out_of_order = false;
811 
813  invalid_location = true;
814  }
815 
816  if(p.GetLoc().GetId() && !p.GetLoc().IsEmpty()) {
818  out_of_order = true;
819  }
820 
821 #if 0
822  //Note: not comparing offsets, as it may be legitimately out-of-order. JIRA:SNP-5238
823  if(p.IsSetStart_offset() && p.IsSetStop_offset() && sequence::GetLength(p.GetLoc(), NULL) == 1) {
824  if(p.GetStart_offset() > p.GetStop_offset()) {
825  //note: not checking for strand, as offsets are always in the order of transcription
826  out_of_order = true;
827  invalid_location = true;
828  }
829  }
830 #endif
831  }
832 
833  if(invalid_location) {
834  p.SetExceptions().push_back(CreateException(
835  out_of_order ? "Invalid location - start and stop are out of order"
836  : "Invalid location",
838  }
839 
840  if(p.GetLoc().GetId()) {
841 
842  CBioseq_Handle bsh =
844  *p.GetLoc().GetId());
845 
846  if(bsh && bsh.GetState() != 0
847  && bsh.GetState() != CBioseq_Handle::fState_dead) { //dead is OK, beacuse supporting old versions
848  p.SetExceptions().push_back(
850  "Bioseq is suppressed or withdrawn",
852  }
853  }
854 
855  return invalid_location;
856  }
857 
858 
860  const CVariantPlacement& p,
861  CSeq_loc_Mapper& mapper)
862  {
863  CRef<CVariantPlacement> p2 = x_Remap(p, mapper);
864 
865  if(((p.IsSetStart_offset()
866  || p.IsSetStop_offset())
870  && p2->GetMol() != CVariantPlacement::eMol_unknown) /*in case remapped to nothing*/) {
871  NcbiCerr << "Mapped: " << MSerial_AsnText << *p2;
872  //When mapping an offset-placement to a genomic placement, may need to resolve offsets.
873  //or, when mapping from genomic to a product coordinates, may need to add offsets. In above cases
874  //we need to use the seq-align-based mapping.
876  "Cannot use Mapper-based method to remap intronic cases;"
877  "must remap via spliced-seg alignment instead.");
878  }
879 
880  AttachSeq(*p2);
881 
882  if(p.IsSetSeq() && p2->IsSetSeq()
883  && p.GetSeq().GetLength() == p2->GetSeq().GetLength()
884  && p.GetSeq().IsSetSeq_data() && p2->GetSeq().IsSetSeq_data()
885  && p.GetSeq().GetSeq_data().Which() == p2->GetSeq().GetSeq_data().Which()
886  && !p.GetSeq().GetSeq_data().Equals(p2->GetSeq().GetSeq_data())) {
887  p2->SetExceptions().push_back(
889  "Mismatches in mapping",
891  }
892 
893  CheckPlacement(*p2);
894  return p2;
895  }
896 
897 
899  const CVariantPlacement& p,
900  CSeq_loc_Mapper& mapper)
901  {
903 
904 
905  if(p.IsSetStart_offset()) {
906  p2->SetStart_offset() = p.GetStart_offset();
907  }
908  if(p.IsSetStop_offset()) {
909  p2->SetStop_offset() = p.GetStop_offset();
910  }
911  if(p.IsSetStart_offset_fuzz()) {
912  p2->SetStart_offset_fuzz().Assign(p.GetStart_offset_fuzz());
913  }
914  if(p.IsSetStop_offset_fuzz()) {
915  p2->SetStop_offset_fuzz().Assign(p.GetStop_offset_fuzz());
916  }
917 
918  CRef<CSeq_loc> mapped_loc = mapper.Map(p.GetLoc());
919 
920  {{
921  // If we have offsets, then the distinction
922  // between point and one-point interval is important, e.g.
923  // NM_000155.3:c.-116-3_-116 - the location is an interval,
924  // but the anchor point is the same; we need to
925  // keep it as interval, as if we represent it as a point,
926  // then the corresponding HGVS is also a point: NM_000155.3:c.-116-3
927  const bool equal_offsets = (
928  !p2->IsSetStart_offset()
929  && !p2->IsSetStop_offset())
930 
931  || (p2->IsSetStart_offset()
932  && p2->IsSetStop_offset()
933 
934  && p2->GetStart_offset()
935  == p2->GetStop_offset());
936 
937  const bool merge_single_range =
938  p.GetLoc().IsInt()
939  && mapped_loc->IsPnt()
940  && !equal_offsets;
941 
942  if(mapped_loc->IsInt()
943  && mapped_loc->GetInt().IsSetFuzz_from()
944  && mapped_loc->GetInt().IsSetFuzz_to()
945  && sequence::GetLength(*mapped_loc, NULL) == 1) {
946  // workaround for CXX-4376. single-nt locations will
947  // not be collapsed to a point if both From and To fuzz is present
948  mapped_loc->SetInt().ResetFuzz_to();
949  }
950 
951  mapped_loc = sequence::Seq_loc_Merge(
952  *mapped_loc,
953  merge_single_range ? CSeq_loc::fMerge_SingleRange
955  NULL);
956  }}
957 
958 #if 1
959  //SNP-5148
960  if(mapped_loc->IsNull() && p.GetLoc().GetId() && !p.GetLoc().IsEmpty()) {
961  //If mapped to nothing, expand the loc and try again. The purpose is to know the seq-id of the
962  //neighborhood that we couldn't map to, so we can produce and Empty seq-loc that contains
963  //a seq-id, reporting that we tried to map to this sequence, but there's no mapping
965  loc1->SetInt().SetFrom() = loc1->GetInt().GetFrom() < 500 ? 0 : loc1->SetInt().GetFrom() - 500;
966  loc1->SetInt().SetTo() += 500;
967  CRef<CSeq_loc> tmp_mapped_loc = mapper.Map(*loc1);
968 
969  if(tmp_mapped_loc->GetId()) {
970  mapped_loc->SetEmpty().Assign(*tmp_mapped_loc->GetId());
971  }
972 
973  }
974  if(p2->GetLoc().IsEmpty()) {
976  }
977 #endif
978 
979 
980  p2->SetLoc(*mapped_loc);
982 
983  {{
984  TSeqPos orig_len = p.GetLoc().Which() ? sequence::GetLength(p.GetLoc(), NULL) : 0;
985  TSeqPos mapped_len = mapped_loc->Which() ? sequence::GetLength(*mapped_loc, NULL) : 0;
986  bool orig_is_compound = p.GetLoc().IsMix() || p.GetLoc().IsPacked_int();
987  bool mapped_is_compound = mapped_loc->IsMix() || mapped_loc->IsPacked_int();
988  CRef<CVariationException> exception;
989  if(mapped_len == 0) {
990  exception.Reset(new CVariationException);
991  exception->SetCode(CVariationException::eCode_no_mapping);
992  } else if(mapped_len < orig_len) {
993  exception.Reset(new CVariationException);
994  exception->SetCode(CVariationException::eCode_partial_mapping);
995  } else if(!orig_is_compound && mapped_is_compound) {
996  exception.Reset(new CVariationException);
997  exception->SetCode(CVariationException::eCode_split_mapping);
998  }
999  if(exception) {
1000  exception->SetMessage("");
1001  p2->SetExceptions().push_back(exception);
1002  }
1003  }}
1004 
1005  if(p2->GetLoc().GetId()) {
1007  } else {
1009  }
1010 
1011  //VAR-1307
1012  if(p2->GetLoc().GetId()
1016 
1020 
1021  && s_GetLength(p, m_scope)
1022  > s_GetLength(*p2, m_scope) + 5000) {
1023  p2->SetExceptions().push_back(CreateException(
1024  "Source location overhangs the alignment by at least 5kb",
1026  }
1027 
1028 
1030  return p2;
1031  }
1032 
1034  {
1035  p.ResetSeq();
1036  bool ret = false;
1037  TSeqPos length = s_GetLength(p, m_scope);
1038 
1039  p.SetSeq().SetLength(length);
1040 
1041  if((p.IsSetStart_offset() && p.GetStart_offset() != 0)
1042  || (p.IsSetStop_offset() && p.GetStop_offset() != 0)) {
1043  p.SetExceptions().push_back(CreateException(
1044  "Can't get sequence for an offset-based location",
1046  ret = false;
1047  } else if(length > max_len) {
1048  p.SetExceptions().push_back(CreateException(
1049  "Sequence is longer than the cutoff threshold",
1051  ret = false;
1052  } else {
1053  try {
1055  p.SetSeq(*literal);
1056 
1058  p.SetExceptions().push_back(CreateException(
1059  "Ambiguous residues in reference",
1061  }
1062  ret = true;
1063  }
1064  catch(CException&) {
1065  //location can be invalid - SNP-5510
1066  p.SetExceptions().push_back(CreateException(
1067  "Cannot fetch sequence at location",
1069  ret = false;
1070  }
1071  }
1072  return ret;
1073  }
1074 
1076  {
1077  v.Index();
1078  bool had_exceptions = false;
1079  if(v.IsSetPlacements()) {
1080 
1081  CConstRef<CSeq_literal> asserted_literal;
1082  CConstRef<CSeq_literal> variant_literal;
1083  // will only fetch for snp/mnp cases,
1084  // as ref_same_as_variant test is only applicable for these
1085 
1086  // Find the asserted and variant literal for this variation
1087  // (don't look into consequence-variations that may also contain
1088  // protein-specific asserted and variant sequence)
1089  for(CTypeConstIterator<CVariation> it(Begin(v)); it; ++it) {
1090  const CVariation& v2 = *it;
1091  if(!v2.GetData().IsInstance() || (v2.GetConsequenceParent() && &v != &v2)) {
1092  continue;
1093  }
1094 
1095  const CVariation_inst& inst = v2.GetData().GetInstance();
1096  if(inst.GetDelta().size() == 1
1097  && inst.GetDelta().front()->IsSetSeq()
1098  && inst.GetDelta().front()->GetSeq().IsLiteral()) {
1099  CConstRef<CSeq_literal> literal(&inst.GetDelta().front()->GetSeq().GetLiteral());
1100  if(!asserted_literal
1101  && inst.IsSetObservation()
1103  asserted_literal = literal;
1104  } else if(!variant_literal //note: finding the very first one; will that work always?
1106  && (!inst.GetDelta().front()->IsSetAction() || inst.GetDelta().front()->GetAction() == CDelta_item::eAction_morph)
1107  && (!inst.GetDelta().front()->IsSetMultiplier() && !inst.GetDelta().front()->IsSetMultiplier_fuzz())
1108  ) {
1109  variant_literal = literal;
1110  }
1111  }
1112  }
1113 
1114 #if 0
1115  if(variant_literal) {
1116  LOG_POST("Found variant-literal");
1117  NcbiCout << MSerial_AsnText << *variant_literal;
1118  } else {
1119  LOG_POST("Did not find variant-literal");
1120  }
1121 #endif
1122 
1124  {
1125  CVariantPlacement& p = **it;
1126  bool attached = AttachSeq(p, max_len);
1127 
1128  if(attached
1129  && asserted_literal
1130  && (p.GetSeq().GetLength() != asserted_literal->GetLength()
1131  || (p.GetSeq().IsSetSeq_data() && asserted_literal->IsSetSeq_data()
1132  && p.GetSeq().GetSeq_data().Which() == asserted_literal->GetSeq_data().Which()
1133  && !p.GetSeq().GetSeq_data().Equals(asserted_literal->GetSeq_data())))) {
1134  p.SetExceptions().push_back(CreateException(
1135  "Asserted sequence is inconsistent with reference",
1137  had_exceptions = true;
1138  }
1139 
1140  if(attached
1141  && variant_literal
1142  && variant_literal->Equals(p.GetSeq())) {
1143  p.SetExceptions().push_back(CreateException(
1144  "Reference sequence is the same as variant",
1146  had_exceptions = true;
1147  }
1148  }
1149  }
1150 
1151  if(v.GetData().IsSet()) {
1153  v.SetData().SetSet().SetVariations())
1154  {
1155  CVariation& v2 = **it;
1156  had_exceptions = had_exceptions || AttachSeq(v2, max_len);
1157  }
1158  }
1159  return !had_exceptions;
1160  }
1161 
1162 
1164  {
1165  //Most of the time can figure out from seq-id. Must be careful not to
1166  //automatically treat NC as "genomic", as we need to differentiate from
1167  //mitochondrion.
1168  CSeq_id::EAccessionInfo acinf = id.IdentifyAccession();
1170 
1173  } else if(acinf == CSeq_id::eAcc_refseq_mrna || acinf == CSeq_id::eAcc_refseq_mrna_predicted) {
1174  moltype = CVariantPlacement::eMol_cdna;
1175  } else if(acinf == CSeq_id::eAcc_refseq_ncrna || acinf == CSeq_id::eAcc_refseq_ncrna_predicted) {
1176  moltype = CVariantPlacement::eMol_rna;
1177  } else if(acinf == CSeq_id::eAcc_refseq_genomic
1178  || acinf == CSeq_id::eAcc_refseq_contig
1179  || acinf == CSeq_id::eAcc_refseq_wgs_intermed) {
1181  } else if(id.IdentifyAccession() == CSeq_id::eAcc_refseq_chromosome
1182  && NStr::StartsWith(id.GetTextseq_Id()->GetAccession(), "AC_")) {
1184  } else if(id.IdentifyAccession() == CSeq_id::eAcc_refseq_chromosome
1185  && ((id.GetTextseq_Id()->GetAccession() >= "NC_000001" && id.GetTextseq_Id()->GetAccession() <= "NC_000024")
1186  || (id.GetTextseq_Id()->GetAccession() >= "NC_000067" && id.GetTextseq_Id()->GetAccession() <= "NC_000087"))) {
1187  //hardcode most "popular" chromosomes that are not mitochondrions.
1189  } else {
1191  if(!bsh) {
1193  } else {
1194  //check if mitochondrion
1195  if(bsh.GetTopLevelEntry().IsSetDescr()) {
1197  {
1198  const CSeqdesc& desc = **it;
1199  if(!desc.IsSource()) {
1200  continue;
1201  }
1204  }
1205  }
1206  }
1207 
1208  //check molinfo
1209  if(moltype == CVariantPlacement::eMol_unknown) {
1210  const CMolInfo* m = sequence::GetMolInfo(bsh);
1211  if(!m) {
1213  } else if(m->GetBiomol() == CMolInfo::eBiomol_genomic
1216  } else if(m->GetBiomol() == CMolInfo::eBiomol_mRNA
1218  moltype = CVariantPlacement::eMol_cdna;
1219  } else if(m->GetBiomol() == CMolInfo::eBiomol_ncRNA
1229  moltype = CVariantPlacement::eMol_rna;
1230  } else if(m->GetBiomol() == CMolInfo::eBiomol_peptide) {
1232  } else {
1234  }
1235  }
1236  }
1237  }
1238  return moltype;
1239  }
1240 
1241 
1243  {
1244  SFlankLocs flanks;
1245  flanks.upstream.Reset(new CSeq_loc(CSeq_loc::e_Null));
1247  if(!loc.GetId()) {
1248  return flanks;
1249  }
1250 
1253 
1254  const CSeq_id& seq_id = sequence::GetId(loc, NULL);
1255 
1256  CBioseq_Handle bsh = m_scope->GetBioseqHandle(seq_id);
1257 
1258  if(!bsh) {
1259  NCBI_USER_THROW_FMT("Unable to retrieve bioseq from scope : id " <<
1260  seq_id.GetSeqIdString(true /*with version*/));
1261  }
1262 
1263  TSignedSeqPos max_pos = bsh.GetInst_Length() - 1;
1264 
1265  flanks.upstream->SetInt().SetId().Assign(sequence::GetId(loc, NULL));
1266  flanks.upstream->SetInt().SetStrand(sequence::GetStrand(loc, NULL));
1267  flanks.upstream->SetInt().SetTo(min(max_pos, stop + (TSignedSeqPos)len));
1268  flanks.upstream->SetInt().SetFrom(max((TSignedSeqPos)0, start - (TSignedSeqPos)len));
1269 
1270  flanks.downstream->Assign(*flanks.upstream);
1271 
1272  CSeq_loc& second = sequence::GetStrand(loc, NULL) == eNa_strand_minus ? *flanks.upstream : *flanks.downstream;
1273  CSeq_loc& first = sequence::GetStrand(loc, NULL) == eNa_strand_minus ? *flanks.downstream : *flanks.upstream;
1274 
1275  if(start == 0) {
1276  first.SetNull();
1277  } else {
1278  first.SetInt().SetTo(start - 1);
1279  }
1280 
1281  if(stop == max_pos) {
1282  second.SetNull();
1283  } else {
1284  second.SetInt().SetFrom(stop + 1);
1285  }
1286 
1287  return flanks;
1288  }
1289 
1290 
1291 
1292  ///////////////////////////////////////////////////////////////////////////////
1293  //
1294  // Methods and functions pertaining to converting protein variation in precursor coords
1295  //
1296  ///////////////////////////////////////////////////////////////////////////////
1297  void CVariationUtil::s_UntranslateProt(const string& prot_str, vector<string>& codons)
1298  {
1299  if(prot_str.size() != 1) {
1300  NCBI_THROW(CException, eUnknown, "Expected prot_str of length 1");
1301  }
1302 
1303  static const char* alphabet = "ACGT";
1304  string codon = "AAA";
1305  for(size_t i0 = 0; i0 < 4; i0++) {
1306  codon[0] = alphabet[i0];
1307  for(size_t i1 = 0; i1 < 4; i1++) {
1308  codon[1] = alphabet[i1];
1309  for(size_t i2 = 0; i2 < 4; i2++) {
1310  codon[2] = alphabet[i2];
1311  string prot("");
1313  NStr::ReplaceInPlace(prot, "*", "X"); //Conversion to IUPAC produces "X", but Translate produces "*"
1314 
1315  //LOG_POST(">>>" << codon << " " << prot << " " << prot_str);
1316  if(prot == prot_str) {
1317  codons.push_back(codon);
1318  }
1319  }
1320  }
1321  }
1322  }
1323 
1324  size_t CVariationUtil::s_CountMatches(const string& a, const string& b)
1325  {
1326  size_t count(0);
1327  for(size_t i = 0; i < min(a.size(), b.size()); i++) {
1328  if(a[i] == b[i]) {
1329  count++;
1330  }
1331  }
1332  return count;
1333  }
1334 
1336  const string& codon_from, //codon on cDNA
1337  const string& prot_to, //missense/nonsense AA
1338  vector<string>& codons_to) //calculated variation-codons
1339  {
1340  vector<string> candidates1;
1341  size_t max_matches(0);
1342  s_UntranslateProt(prot_to, candidates1);
1343  codons_to.clear();
1344  bool have_silent = false;
1345 
1346  ITERATE(vector<string>, it1, candidates1)
1347  {
1348  size_t matches = s_CountMatches(codon_from, *it1);
1349 
1350  // LOG_POST("CalcPrecursorVariationCodon:" << codon_from << " " << prot_to << " " << *it1 << " " << matches);
1351  if(matches == 3) {
1352  //all three bases in a codon matched - we must be processing a silent mutation.
1353  //in this case we want to consider candidate codons other than itself.
1354  have_silent = true;
1355  continue;
1356  }
1357 
1358  if(matches >= max_matches) {
1359  if(matches > max_matches) {
1360  codons_to.clear();
1361  }
1362  codons_to.push_back(*it1);
1363  max_matches = matches;
1364  }
1365  }
1366 
1367  // didn't find polymorphic candidate - must be dealing with no-change at nucleotide level
1368  if(codons_to.empty() && have_silent) {
1369  codons_to.push_back(codon_from);
1370  }
1371  }
1372 
1373  string CVariationUtil::s_CollapseAmbiguities(const vector<string>& seqs)
1374  {
1375  string collapsed_seq;
1376 
1377  vector<int> bits; //4-bit bitmask denoting whether a nucleotide occurs at this pos at any seq in seqs
1378 
1379  typedef const vector<string> TConstStrs;
1380  ITERATE(TConstStrs, it, seqs)
1381  {
1382  const string& seq = *it;
1383  if(seq.size() > bits.size()) {
1384  bits.resize(seq.size());
1385  }
1386 
1387  for(size_t i = 0; i < seq.size(); i++) {
1388  char nt = seq[i];
1389  int m = (nt == 'T' ? 1
1390  : nt == 'G' ? 2
1391  : nt == 'C' ? 4
1392  : nt == 'A' ? 8 : 0);
1393  if(!m) {
1394  NCBI_THROW(CException, eUnknown, "Expected [ACGT] alphabet");
1395  }
1396 
1397  bits[i] |= m;
1398  }
1399  }
1400 
1401  static const char* iupac_nuc_ambiguity_codes = "NTGKCYSBAWRDMHVN";
1402  collapsed_seq.resize(bits.size());
1403  for(size_t i = 0; i < collapsed_seq.size(); i++) {
1404  collapsed_seq[i] = iupac_nuc_ambiguity_codes[bits[i]];
1405  }
1406  return collapsed_seq;
1407  }
1408 
1409 
1411  {
1412  if(v.GetData().IsSet()) {
1413  CVariation::TData::TSet::TVariations variations = v.SetData().SetSet().SetVariations();
1414  v.SetData().SetSet().SetVariations().clear();
1415 
1417  {
1418  CRef<CVariation> v2 = *it;
1419 
1420  if(v2->GetData().IsInstance()
1421  && v2->GetData().GetInstance().IsSetObservation()
1422  && !(v2->GetData().GetInstance().GetObservation() & CVariation_inst::eObservation_variant)) {
1423  //only variant observations are propagated to the nuc level
1424  continue;
1425  }
1427  v.SetData().SetSet().SetVariations().push_back(v2);
1428  }
1429  v.ResetPlacements(); //nucleotide placement will be computed for each instance and attached at its level
1430 
1431  if(v.GetData().GetSet().GetVariations().size() == 1) {
1432  v.Assign(*v.GetData().GetSet().GetVariations().front());
1433  }
1434  return;
1435  }
1436 
1437  //Remap the placement to nucleotide
1438 
1439  const CVariation::TPlacements* placements = s_GetPlacements(v);
1440  if(!placements || placements->size() == 0) {
1441  NCBI_THROW(CException, eUnknown, "Expected a placement");
1442  }
1443 
1444  const CVariantPlacement& placement = *placements->front();
1445 
1446  if(placement.IsSetStart_offset() || placement.IsSetStop_offset()) {
1447  NCBI_THROW(CException, eUnknown, "Expected offset-free placement");
1448  }
1449 
1450  if(placement.IsSetMol() && placement.GetMol() != CVariantPlacement::eMol_protein) {
1451  NCBI_THROW(CException, eUnknown, "Expected a protein-type placement");
1452  }
1453 
1455  //note: sel by product; location is prot; found feature is mrna having this prot as product
1456  CRef<CSeq_loc_Mapper> prot2precursor_mapper;
1457  for(CFeat_CI ci(*m_scope, placement.GetLoc(), sel); ci; ++ci) {
1458  try {
1459  prot2precursor_mapper.Reset(new CSeq_loc_Mapper(ci->GetMappedFeature(), CSeq_loc_Mapper::eProductToLocation, m_scope));
1460  }
1461  catch(CException&) {
1462  ;// may legitimately throw if feature is not good for mapping
1463  }
1464  break;
1465  }
1466 
1467  if(!prot2precursor_mapper) {
1468  NCBI_THROW(CException, eUnknown, "Can't create prot2precursor mapper. Is this a prot?");
1469  }
1470 
1471  CRef<CSeq_loc> nuc_loc = prot2precursor_mapper->Map(placement.GetLoc());
1473 
1476  : v.GetData().GetInstance().GetDelta().size() != 1 ? CConstRef<CDelta_item>(NULL)
1477  : v.GetData().GetInstance().GetDelta().front();
1478 
1479 
1480  if(!v.GetData().IsInstance()) {
1481  // variation has "special" payload - nothing to do other than remap location
1483  p->SetLoc(*nuc_loc);
1484  p->SetMol(nuc_loc->GetId() ? GetMolType(*nuc_loc->GetId()) : CVariantPlacement::eMol_unknown);
1486  v2->SetData().Assign(v.GetData());
1487  v2->SetPlacements().push_back(p);
1488  v.Assign(*v2);
1489  return;
1490  }
1491 
1492  //See if the variation is simple-enough to process; if too complex, return "unknown-variation"
1493  if(!nuc_loc->IsInt() //complex nucleotide location
1494  || sequence::GetLength(*nuc_loc, NULL) != 3 //does not remap to single codon
1495  || !delta //delta is not a single-element item
1496  || (delta->IsSetAction() && delta->GetAction() != CDelta_item::eAction_morph) //not a replace-at-location delta-type
1497  || !delta->IsSetSeq() //don't know what's the variant allele
1498  || !delta->GetSeq().IsLiteral()
1499  || delta->GetSeq().GetLiteral().GetLength() != 1) //not single-AA missense
1500  {
1502  p->SetLoc(*nuc_loc);
1503  p->SetMol(nuc_loc->GetId() ? GetMolType(*nuc_loc->GetId()) : CVariantPlacement::eMol_unknown);
1505  v2->SetData().SetUnknown();
1506  v2->SetPlacements().push_back(p);
1507  v.Assign(*v2);
1508  return;
1509  }
1510 
1511 
1513  string original_allele_codon; //nucleotide allele on the sequence
1514  seqv.GetSeqData(seqv.begin(), seqv.end(), original_allele_codon);
1515 
1516  string variant_codon;
1517  {{
1518  CSeq_data variant_prot_seq;
1520  delta->GetSeq().GetLiteral().GetSeq_data(),
1521  &variant_prot_seq,
1523 
1524  vector<string> variant_codons;
1529  variant_prot_seq.GetIupacaa(),
1530  variant_codons);
1531  } else {
1533  original_allele_codon,
1534  variant_prot_seq.GetIupacaa(),
1535  variant_codons);
1536  }
1537  variant_codon = s_CollapseAmbiguities(variant_codons);
1538  }}
1539 
1541  && !v.IsSetFrameshift()
1542  && variant_codon != original_allele_codon) {
1543  while(variant_codon.length() > 0
1544  && original_allele_codon.length() > 0
1545  && variant_codon.at(0) == original_allele_codon.at(0)) {
1546  variant_codon = variant_codon.substr(1);
1547  original_allele_codon = original_allele_codon.substr(1);
1548  if(nuc_loc->GetStrand() == eNa_strand_minus) {
1549  nuc_loc->SetInt().SetTo()--;
1550  } else {
1551  nuc_loc->SetInt().SetFrom()++;
1552  }
1553  }
1554 
1555  while(variant_codon.length() > 0
1556  && original_allele_codon.length() > 0
1557  && variant_codon.at(variant_codon.length() - 1)
1558  == original_allele_codon.at(original_allele_codon.length() - 1)) {
1559  variant_codon.resize(variant_codon.length() - 1);
1560  original_allele_codon.resize(original_allele_codon.length() - 1);
1561  //Note: normally given a protein, the parent will be a mRNA and the CDS location
1562  //will have plus strand; however, the parent could be MT, so we can't assume plus strand
1563  if(nuc_loc->GetStrand() == eNa_strand_minus) {
1564  nuc_loc->SetInt().SetFrom()++;
1565  } else {
1566  nuc_loc->SetInt().SetTo()--;
1567  }
1568  }
1569  }
1570 
1571  CRef<CDelta_item> delta2(new CDelta_item);
1572  delta2->SetSeq().SetLiteral().SetLength(variant_codon.length());
1573  delta2->SetSeq().SetLiteral().SetSeq_data().SetIupacna().Set(variant_codon);
1574 
1576 
1577  //set placement
1579  //merge loc to convert int of length 1 to a pnt as necessary
1581  p2->SetMol(GetMolType(sequence::GetId(*nuc_loc, NULL)));
1582  AttachSeq(*p2);
1583  v2->SetPlacements().push_back(p2);
1584 
1585 
1586  if(v.IsSetFrameshift()) {
1587  v2->SetData().SetUnknown(); //don't know what is happening at nucleotide level
1588  } else {
1589  CVariation_inst& inst2 = v2->SetData().SetInstance();
1590  inst2.SetType(variant_codon.length() == 1 ? CVariation_inst::eType_snv : CVariation_inst::eType_mnp);
1591  inst2.SetDelta().push_back(delta2);
1592 
1593  if(v.GetData().GetInstance().IsSetObservation()) {
1595  }
1596  }
1597 
1598  v.Assign(*v2);
1599  }
1600 
1602  {
1604  v2->Assign(v);
1605  v2->Index();
1608  v2->Index();
1609 
1611 
1612  //Note: The result describes whole codons, even for point mutations, i.e. common suffx/prefix are not truncated.
1613  return v2;
1614  }
1615 
1616 
1618  {
1619  return x_GetLiteralAtLoc(loc);
1620  }
1621 
1622 
1624  {
1625  ITERATE(CSeq_descr::Tdata, it, bsh.GetDescr().Get())
1626  {
1627  const CSeqdesc& desc = **it;
1628  if(!desc.IsMolinfo()
1629  || !desc.GetMolinfo().IsSetCompleteness()) {
1630  continue;
1631  }
1632 
1635  }
1636 
1637  return false;
1638  }
1639 
1640 
1642  {
1644  if(sequence::GetLength(loc, NULL) == 0) {
1645  literal->SetLength(0);
1646  } else {
1648  if(cached_literal) {
1649  literal = cached_literal;
1650  } else {
1651  string seq;
1652  try {
1654 
1655  if(v.IsProtein()) {
1656  // if loc extends by 1 past the end protein - we'll need to
1657  // truncate the loc to the boundaries of prot, and then add "*" or "X" manually,
1658  // as otherwise fetching seq will throw.
1660  if(bsh.GetInst_Length() == loc.GetStop(eExtreme_Positional)) {
1661 
1663  range_loc->SetInt().SetTo(bsh.GetInst_Length() - 1);
1664 
1666  *range_loc->Intersect(loc, 0, NULL));
1667 
1668  literal->SetLength() += 1;
1669  literal->SetSeq_data().SetNcbieaa().Set().push_back(
1670  IsRightPartial(bsh) ? 'X' : '*');
1671 
1672  return literal;
1673  }
1674  }
1675 
1676  v.GetSeqData(v.begin(), v.end(), seq);
1677  literal->SetLength(seq.size());
1678  if(v.IsProtein()) {
1679  literal->SetSeq_data().SetNcbieaa().Set(seq);
1680  } else if(v.IsNucleotide()) {
1681  literal->SetSeq_data().SetIupacna().Set(seq);
1682  }
1683  }
1684  catch(CException& e) {
1685  string loc_label;
1686  loc.GetLabel(&loc_label);
1687  NCBI_RETHROW_SAME(e, "Can't get literal for " + loc_label);
1688  }
1689  }
1690  }
1691  return literal;
1692  }
1693 
1694 
1696  {
1698 
1699  if(b.GetLength() == 0) {
1700  c->Assign(a);
1701  } else if(a.GetLength() == 0) {
1702  c->Assign(b);
1703  } else {
1704  c->SetLength(a.GetLength() + b.GetLength());
1705 
1706  if(a.IsSetFuzz() || b.IsSetFuzz()) {
1707  c->SetFuzz().SetLim(CInt_fuzz::eLim_unk);
1708  }
1709 
1710  if(a.IsSetSeq_data() && b.IsSetSeq_data()) {
1712  a.GetSeq_data(), 0, a.GetLength(),
1713  b.GetSeq_data(), 0, b.GetLength());
1714  }
1715  }
1716  return c;
1717  }
1718 
1720  const CSeq_literal& payload,
1721  const CSeq_literal& ref, TSeqPos pos)
1722  {
1724  if(pos == 0) {
1725  result = s_CatLiterals(payload, ref);
1726  } else if(pos == ref.GetLength()) {
1727  result = s_CatLiterals(ref, payload);
1728  } else {
1730  prefix->Assign(ref);
1731  prefix->SetLength(pos);
1732  CSeqportUtil::Keep(&prefix->SetSeq_data(), 0, prefix->GetLength());
1733 
1735  suffix->Assign(ref);
1736  suffix->SetLength(ref.GetLength() - pos);
1737  CSeqportUtil::Keep(&suffix->SetSeq_data(), pos, suffix->GetLength());
1738 
1739  CRef<CSeq_literal> prefix_and_payload = s_CatLiterals(*prefix, payload);
1740  result = s_CatLiterals(*prefix_and_payload, *suffix);
1741  }
1742  return result;
1743  }
1744 
1745 
1746 
1747 
1749  {
1751  if(literal) {
1752  return literal;
1753  }
1754 
1755  //instantiate a literal from a placement (make sure there are no offsets)
1756  const CVariation::TPlacements* placements = s_GetPlacements(v);
1757 
1758  if(!placements) {
1759  return literal;
1760  }
1761 
1762  ITERATE(CVariation::TPlacements, it, *placements)
1763  {
1764  const CVariantPlacement& p = **it;
1765  if(p.IsSetStart_offset() || p.IsSetStop_offset()) {
1766  continue;
1767  }
1768 
1770  break;
1771  }
1772 
1773  return literal;
1774  }
1775 
1776 
1777  /*!
1778  * Convert any simple nucleotide variation to delins form, if possible; throw if not.
1779  * Precondition: location must be set.
1780  */
1782  {
1783  v.Index();
1784 
1785  if(v.GetData().IsSet()) {
1787  v.SetData().SetSet().SetVariations())
1788  {
1789  ChangeToDelins(**it);
1790  }
1791  } else if(v.GetData().IsInstance()
1793  && v.GetData().GetInstance().GetDelta().empty()) {
1794  CVariation_inst& inst = v.SetData().SetInstance();
1796 
1798  if(!this_literal) {
1799  NcbiCerr << MSerial_AsnText << v;
1801  "Could not find literal for 'this' location in placements");
1802  }
1803 
1805  di->SetSeq().SetLiteral().Assign(*this_literal);
1806  this->FlipStrand(*di);
1807  inst.SetDelta().push_back(di);
1808 
1809  } else if(v.GetData().IsInstance()) {
1810  CVariation_inst& inst = v.SetData().SetInstance();
1812 
1813  if(inst.GetDelta().size() == 0) {
1815  di->SetSeq().SetLiteral().SetLength(0);
1816  di->SetSeq().SetLiteral().SetSeq_data().SetIupacna().Set("");
1817  inst.SetDelta().push_back(di);
1818  } else if(inst.GetDelta().size() > 1) {
1819  NCBI_THROW(CException, eUnknown, "Deltas of length >1 are not supported");
1820  } else {
1821  CDelta_item& di = *inst.SetDelta().front();
1822 
1823  //convert 'del' to 'replace-with-empty-literal'
1825  di.ResetAction();
1826  di.SetSeq().SetLiteral().SetLength(0);
1827  di.SetSeq().SetLiteral().SetSeq_data().SetIupacna().Set("");
1828  }
1829 
1830  //convert 'loc' or 'this'-based deltas to literals
1831  if(di.GetSeq().IsLoc()) {
1833  di.SetSeq().SetLiteral(*literal);
1834  } else if(di.GetSeq().IsThis()) {
1836  if(!this_literal) {
1837  NcbiCerr << MSerial_AsnText << v;
1838  NCBI_THROW(CException, eUnknown, "Could not find literal for 'this' location in placements");
1839  } else {
1840  di.SetSeq().SetLiteral().Assign(*this_literal);
1841  }
1842  }
1843 
1844  //expand multipliers.
1845  if(di.IsSetMultiplier()) {
1846  if(di.GetMultiplier() < 0) {
1847  NCBI_THROW(CException, eUnknown, "Encountered negative multiplier");
1848  } else {
1849  CSeq_literal& literal = di.SetSeq().SetLiteral();
1850 
1851  if(!literal.IsSetSeq_data() || !literal.GetSeq_data().IsIupacna()) {
1852  NCBI_THROW(CException, eUnknown, "Expected IUPACNA-type seq-literal");
1853  }
1854  string str_kernel = literal.GetSeq_data().GetIupacna().Get();
1855  literal.SetSeq_data().SetIupacna().Set("");
1856  for(int i = 0; i < di.GetMultiplier(); i++) {
1857  literal.SetSeq_data().SetIupacna().Set() += str_kernel;
1858  }
1859  literal.SetLength(literal.GetSeq_data().GetIupacna().Get().size());
1860  if(literal.IsSetFuzz()) {
1861  literal.SetFuzz().SetLim(CInt_fuzz::eLim_unk);
1862  }
1863 
1864  di.ResetMultiplier();
1865  if(di.IsSetMultiplier_fuzz()) {
1867  }
1868  }
1869  }
1870 
1871  //Convert ins-X-before-loc to 'replace seq@loc with X + seq@loc'
1872  if(!di.IsSetAction() || di.GetAction() == CDelta_item::eAction_morph) {
1873  ; //already done
1874  } else if(di.GetAction() == CDelta_item::eAction_ins_before) {
1875  di.ResetAction();
1877 
1878 
1879  if(!this_literal) {
1880  NCBI_THROW(CException, eUnknown, "Could not find literal for 'this' location in placements");
1881  } else if(this_literal->GetLength() == 0) {
1882  NCBI_THROW(CException, eUnknown, "Encountered empty literal");
1883  } else {
1884  //Note: need to be careful here: if dinucleotide or multinucleotide, "before" really means "before last pos"
1885  // (as dinucleotide placement is used for insertions in order to be strand-flippable)
1886 
1888  *this_literal,
1889  this_literal->GetLength() - 1);
1890  di.SetSeq().SetLiteral().Assign(*literal);
1891  }
1892  }
1893  }
1894  }
1895  }
1896 
1897 
1898 
1900  CVariation& v,
1901  const CSeq_id* target_id,
1902  bool ignore_genomic)
1903  {
1904  v.Index();
1905 
1906  if(v.GetData().IsSet()) {
1907 
1909  v.SetData().SetSet().SetVariations())
1910  {
1911  AttachProteinConsequences(**it, target_id, ignore_genomic);
1912  }
1913  return;
1914  }
1915 
1916  if(!v.GetData().IsInstance()) {
1917  return;
1918  }
1919 
1920 
1921  const CVariation::TPlacements* placements = s_GetPlacements(v);
1922 
1923  if(!placements || placements->size() == 0) {
1924  return;
1925  }
1926 
1927 
1929  && !(v.GetData().GetInstance().GetObservation()
1931  //only compute placements for variant instances; (as for asserted/reference there's no change)
1932  return;
1933  }
1934 
1935  ITERATE(CVariation::TPlacements, it, *placements)
1936  {
1937  const CVariantPlacement& placement = **it;
1938 
1939  if(!placement.GetLoc().GetId()
1940  || (target_id && !sequence::IsSameBioseq(*target_id,
1941  sequence::GetId(placement.GetLoc(), NULL),
1942  m_scope))) {
1943  continue;
1944  }
1945 
1946  if(placement.IsSetStart_offset() && (placement.IsSetStop_offset() || placement.GetLoc().IsPnt())) {
1947  continue; //intronic.
1948  }
1949 
1950  if(placement.GetMol() != CVariantPlacement::eMol_cdna
1952  && (placement.GetMol() != CVariantPlacement::eMol_genomic || ignore_genomic)) {
1953  continue;
1954  }
1955 
1956 
1957  CCdregionIndex::TCdregions cdregions;
1958  m_cdregion_index.Get(placement.GetLoc(), cdregions);
1959 
1960  ITERATE(CCdregionIndex::TCdregions, it, cdregions)
1961  {
1962  CRef<CVariation> prot_variation = TranslateNAtoAA(v.GetData().GetInstance(), placement, *it->cdregion_feat);
1963  CVariation::TConsequence::value_type consequence(new CVariation::TConsequence::value_type::TObjectType);
1964  consequence->SetVariation(*prot_variation);
1965  v.SetConsequence().push_back(consequence);
1966  }
1967  }
1968  }
1969 
1970  //translate IUPACNA literal; do not translate last partial codon.
1971  static string Translate(const string& nuc_str, bool is_mito)
1972  {
1973  string prot_str;
1974 
1976  code.SetId(is_mito ? 2 : 1);
1977 
1979  nuc_str,
1980  prot_str,
1982  &code);
1983 
1984  if(prot_str.size() * 3 < nuc_str.size()) {
1985  prot_str.push_back('X');
1986  }
1987 
1988  //truncate everything past the first stop
1989  size_t stop_pos = prot_str.find('*');
1990  if(stop_pos != NPOS) {
1991  prot_str.resize(stop_pos + 1);
1992  }
1993 
1994  return prot_str;
1995  }
1996 
1998  const string& prot_ref_str,
1999  const string& prot_delta_str)
2000  {
2001  CVariation_inst::EType inst_type =
2002  prot_delta_str.size() == 0 && prot_ref_str.size() > 0 ? CVariation_inst::eType_del
2003  : prot_delta_str.size() > 0 && prot_ref_str.size() == 0 ? CVariation_inst::eType_ins
2004  : prot_delta_str.size() != prot_ref_str.size() ? CVariation_inst::eType_prot_other
2005  : prot_ref_str == prot_delta_str ? CVariation_inst::eType_prot_silent
2006  : prot_ref_str.size() > 1 ? CVariation_inst::eType_prot_other
2007  : NStr::Find(prot_delta_str, "*") != NPOS ? CVariation_inst::eType_prot_nonsense
2009  return inst_type;
2010  }
2011 
2013  const string& prot_ref_str,
2014  const string& prot_delta_str)
2015  {
2016  CVariantProperties::TEffect effect = 0;
2017  for(size_t i = 0; i < prot_ref_str.size() && i < prot_delta_str.size(); i++) {
2018  if(prot_ref_str[i] == prot_delta_str[i]) {
2020  } else if(prot_ref_str[i] == '*') {
2022  } else if(prot_delta_str[i] == '*') {
2024  } else {
2026  }
2027  }
2028  return effect;
2029  }
2030 
2032  TSignedSeqPos nuc_delta_len,
2033  const string& prot_ref_str,
2034  const string& prot_variant_str)
2035  {
2037 
2038  bool stop_gain = false;
2039  bool stop_loss = false;
2040  for(size_t i = 0; i < max(prot_ref_str.size(), prot_variant_str.size()); i++) {
2041  char r = i >= prot_ref_str.size() ? '-' : prot_ref_str[i];
2042  char v = i >= prot_variant_str.size() ? '-' : prot_variant_str[i];
2043 
2044  if(r == '*' && v != '*') {
2045  stop_loss = true;
2046  }
2047 
2048  if(r != '*' && v == '*') {
2049  stop_gain = true;
2050  }
2051  }
2052  if(stop_gain) {
2053  terms.push_back(CVariationUtil::eSO_stop_gained);
2054  }
2055  if(stop_loss) {
2056  terms.push_back(CVariationUtil::eSO_stop_lost);
2057  }
2058 
2059  if(nuc_delta_len == 0) {
2060  if(!stop_gain && !stop_loss) {
2061  terms.push_back(prot_ref_str == prot_variant_str ? CVariationUtil::eSO_synonymous_variant
2063  }
2064  } else if(nuc_delta_len % 3 == 0) {
2065  terms.push_back(CVariationUtil::eSO_inframe_indel);
2066  } else {
2067  terms.push_back(CVariationUtil::eSO_frameshift_variant);
2068  }
2069 
2070  return terms;
2071  }
2072 
2073 
2075  {
2076  vp.SetLoc().FlipStrand();
2077  if(vp.IsSetSeq() && vp.GetSeq().IsSetSeq_data()) {
2079  vp.SetSeq().SetSeq_data(),
2080  &vp.SetSeq().SetSeq_data(),
2081  0,
2082  vp.GetSeq().GetLength());
2083  }
2084 
2086  tmp->Assign(vp);
2087 
2088  if(tmp->IsSetStart_offset()) {
2089  vp.SetStop_offset(tmp->GetStart_offset() * -1);
2090  } else {
2091  vp.ResetStop_offset();
2092  }
2093 
2094  if(tmp->IsSetStop_offset()) {
2095  vp.SetStart_offset(tmp->GetStop_offset() * -1);
2096  } else {
2097  vp.ResetStart_offset();
2098  }
2099 
2100  if(tmp->IsSetStart_offset_fuzz()) {
2101  vp.SetStop_offset_fuzz().Assign(tmp->GetStart_offset_fuzz());
2102  } else {
2103  vp.ResetStop_offset_fuzz();
2104  }
2105 
2106  if(tmp->IsSetStop_offset_fuzz()) {
2107  vp.SetStart_offset_fuzz().Assign(tmp->GetStop_offset_fuzz());
2108  } else {
2110  }
2111  }
2112 
2113 
2115  const CVariation& v,
2116  const CVariation_inst& inst)
2117  {
2118  bool ret = true;
2120  {
2121  const CDelta_item& di = **it;
2124  ret = false; //will set back to true if found acceptable placement
2125  if(p) {
2127  {
2128  if(s_GetLength(**it, NULL) >= 2) {
2129  ret = true;
2130  }
2131  }
2132  }
2133  }
2134  }
2135  return ret;
2136  }
2137 
2139  {
2140  v.Index(); //required so that can get factored placements from sub-variations.
2141  if(v.IsSetPlacements()) {
2143  {
2144  FlipStrand(**it);
2145  }
2146  }
2147 
2148  if(v.GetData().IsSet()) {
2150  it,
2151  v.SetData().SetSet().SetVariations())
2152  {
2153  FlipStrand(**it);
2154  }
2155  } else if(v.GetData().IsInstance()) {
2156 
2158  NCBI_THROW(CException, eUnknown, "Variation is not strand-flippable");
2159  } else {
2160  FlipStrand(v.SetData().SetInstance());
2161  }
2162  }
2163  }
2164 
2165 
2167  {
2168  if(!di.IsSetSeq()) {
2169  return;
2170  }
2171  if(di.GetSeq().IsLoc()) {
2172  di.SetSeq().SetLoc().FlipStrand();
2173  } else if(di.GetSeq().IsLiteral() && di.GetSeq().GetLiteral().IsSetSeq_data()) {
2175  di.SetSeq().GetLiteral().GetSeq_data(),
2176  &di.SetSeq().SetLiteral().SetSeq_data(),
2177  0, di.GetSeq().GetLiteral().GetLength());
2178  }
2179  }
2180 
2181 
2183  {
2184  if(!inst.IsSetDelta()) {
2185  return;
2186  }
2187 
2189  {
2190  FlipStrand(**it);
2191  }
2192  reverse(inst.SetDelta().begin(), inst.SetDelta().end());
2193  }
2194 
2195 
2197  {
2198  //note: loc may be a split loc when part of a codon is in another exon
2199 
2200  if(!v.IsSetPlacements() || v.GetPlacements().size() != 1) {
2201  NCBI_THROW(CException, eUnknown, "Expected single placement");
2202  }
2203 
2205 
2206  const CSeq_loc& orig_loc = v.GetPlacements().front()->GetLoc();
2207  CRef<CSeq_loc> sub_loc = orig_loc.Subtract(loc, 0, NULL, NULL);
2208  if(sub_loc->Which() && sequence::GetLength(*sub_loc, NULL) > 0) {
2209  //NcbiCerr << MSerial_AsnText << v;
2210  //NcbiCerr << MSerial_AsnText << loc;
2211  NCBI_THROW(CException, eUnknown, "Location must be a superset of the variation's loc");
2212  }
2213 
2214  sub_loc->Assign(*range_loc);
2215  sub_loc->SetInt().SetTo(sequence::GetStop(orig_loc, NULL, eExtreme_Positional));
2217  if(!suffix_loc->Which()) {
2218  suffix_loc->SetNull(); //bug in subtract
2219  }
2220 
2221  sub_loc->Assign(*range_loc);
2222  sub_loc->SetInt().SetFrom(sequence::GetStart(orig_loc, NULL, eExtreme_Positional));
2224  if(!prefix_loc->Which()) {
2225  prefix_loc->SetNull();
2226  }
2227 
2229  swap(prefix_loc, suffix_loc);
2230  }
2231 
2232  //NcbiCerr << "prefix loc" << MSerial_AsnText << *prefix_loc;
2233  //NcbiCerr << "suffix loc" << MSerial_AsnText << *suffix_loc;
2234 
2235 
2236  CRef<CSeq_literal> prefix_literal = x_GetLiteralAtLoc(*prefix_loc);
2237  CRef<CSeq_literal> suffix_literal = x_GetLiteralAtLoc(*suffix_loc);
2238 
2239  for(CTypeIterator<CVariation_inst> it(Begin(v)); it; ++it) {
2240  CVariation_inst& inst = *it;
2241  if(inst.GetDelta().size() != 1) {
2242  NCBI_THROW(CException, eUnknown, "Expected single-element delta");
2243  }
2244 
2245  CDelta_item& delta = *inst.SetDelta().front();
2246 
2247  if(!delta.IsSetSeq() || !delta.GetSeq().IsLiteral()) {
2248  NCBI_THROW(CException, eUnknown, "Expected literal");
2249  }
2250 
2251  CRef<CSeq_literal> tmp_literal1 = s_CatLiterals(*prefix_literal, delta.SetSeq().SetLiteral());
2252  CRef<CSeq_literal> tmp_literal2 = s_CatLiterals(*tmp_literal1, *suffix_literal);
2253  delta.SetSeq().SetLiteral(*tmp_literal2);
2254  }
2255 
2256  v.SetPlacements().front()->SetLoc().Assign(loc);
2257  }
2258 
2259  static bool Contains(const CSeq_loc& a, const CSeq_loc& b, CScope* scope)
2260  {
2261  CRef<CSeq_loc> a1(new CSeq_loc);
2262  a1->Assign(a);
2263  a1->ResetStrand();
2264 
2265  CRef<CSeq_loc> b1(new CSeq_loc);
2266  b1->Assign(b);
2267  b1->ResetStrand();
2268 
2270  }
2271 
2273  const CSeq_id& id,
2275  {
2277  p->SetLoc().SetWhole().Assign(id);
2278  p->SetMol(mol);
2280  v->SetData().SetUnknown();
2281  v->SetPlacements().push_back(p);
2283  return v;
2284  }
2285 
2287  const CVariantPlacement& nuc_p,
2288  const CSeq_feat& cds_feat,
2289  bool is_frameshifting,
2290  CScope& scope)
2291  {
2292  CRef<CSeq_loc> prot_loc;
2293  CRef<CSeq_loc> codons_loc;
2294  try {
2295  CRef<CSeq_loc_Mapper> nuc2prot_mapper;
2296  CRef<CSeq_loc_Mapper> prot2nuc_mapper;
2297  nuc2prot_mapper.Reset(new CSeq_loc_Mapper(
2298  cds_feat,
2300  &scope));
2301  prot2nuc_mapper.Reset(new CSeq_loc_Mapper(
2302  cds_feat,
2304  &scope));
2305 
2306  prot_loc = nuc2prot_mapper->Map(nuc_p.GetLoc());
2307  codons_loc = prot2nuc_mapper->Map(*prot_loc);
2308 
2309 
2310 
2311  if(codons_loc->IsNull()) {
2312  // normally shouldn't happen, but may happen with
2313  // dubious annotation, e.g. BC149603.1:c.1A>G.
2314  // Mapping to protein coordinates returns NULL,
2315  // probably because protein and cds lengths are inconsistent.
2317  }
2318  }
2319  catch(CException&) {
2320  // may legitimately throw if feature is not good for mapping (e.g. partial).
2321  prot_loc.Reset(new CSeq_loc());
2322  prot_loc->SetWhole().Assign(sequence::GetId(cds_feat.GetProduct(), NULL));
2323  codons_loc.Reset(SerialClone(nuc_p.GetLoc()));
2324  }
2325 
2327  v->SetData().SetUnknown();
2328 
2330  prot_p->SetLoc(*prot_loc);
2332  prot_p->SetExceptions().push_back(CreateException(
2333  "Cannot infer consequence; projecting location only",
2335  v->SetPlacements().push_back(prot_p);
2336 
2338  codons_p->SetLoc(*codons_loc);
2339  codons_p->SetMol(nuc_p.GetMol());
2340  v->SetPlacements().push_back(codons_p);
2341 
2343 
2344  CVariationMethod& m = v->SetMethod();
2346 
2347  if(is_frameshifting) {
2348  v->SetSo_terms().push_back(CVariationUtil::eSO_frameshift_variant);
2349  }
2350 
2351  return v;
2352  }
2353 
2354 
2355  static size_t GetCommonPrefixLen(const string& a, const string& b)
2356  {
2357  size_t i(0);
2358  while(i < a.size() && i < b.size() && a[i] == b[i]) {
2359  i++;
2360  }
2361  return i;
2362  }
2363 
2364  static size_t GetCommonSuffixLen(const string& a, const string& b)
2365  {
2366  size_t i(0);
2367  while(i < a.size() && i < b.size() && a[a.size() - 1 - i] == b[b.size() - 1 - i]) {
2368  i++;
2369  }
2370  return i;
2371  }
2372 
2373 
2374  // Return true iff can't use the CDS for mapping between prot and mRNA;
2375  static bool HasProblematicExceptions(const CSeq_feat& cds_feat)
2376  {
2377  return !cds_feat.IsSetExcept() ? false
2378  : !cds_feat.GetExcept() ? false
2379  : !cds_feat.IsSetExcept_text() ? true
2380  : (int)cds_feat.GetExcept_text().size() > ( //not all of except-text is explained by benign exceptions
2381  49 * (NStr::Find(cds_feat.GetExcept_text(), "translation initiation by" /* tRNA-Leu at CUG codon"*/) != NPOS)
2382  + 27 * (NStr::Find(cds_feat.GetExcept_text(), "mismatches in translation") != NPOS)
2383  + 20 * (NStr::Find(cds_feat.GetExcept_text(), "ribosomal slippage") != NPOS));
2384  }
2385 
2386 
2388  const CVariation_inst& nuc_inst,
2389  const CVariantPlacement& nuc_p,
2390  const CSeq_feat& cds_feat)
2391  {
2392  const bool verbose = false;
2393 
2394  if(verbose) NcbiCerr << "TranslateNAtoAA Inputs:\n" << MSerial_AsnText
2395  << nuc_inst << "\n\n" << MSerial_AsnText << nuc_p
2396  << "\n\n" << MSerial_AsnText << cds_feat << endl;
2397 
2399  sequence::GetId(cds_feat.GetLocation(), NULL),
2400  m_scope)) {
2401  NCBI_THROW(CException, eUnknown, "Placement and CDS are on different seqs");
2402  }
2403 
2404  //create an inst-variation from the provided inst with the single specified placement
2406  v->SetData().SetInstance().Assign(nuc_inst);
2407  v->ResetPlacements();
2408 
2410  p->Assign(nuc_p);
2411  p->ResetSeq(); //Will recalculate after adjusting to codon boundary
2412  v->SetPlacements().push_back(p);
2413 
2414  //normalize to delins form so we can deal with it uniformly
2415  try {
2416  ChangeToDelins(*v);
2417  }
2418  catch(...) {
2419  //e.g. NM_000384.2:c.905-1_905dupGG - can't convert t odelins
2420 
2422  nuc_p,
2423  cds_feat,
2424  false,
2425  *m_scope);
2426  }
2427 
2428  const CDelta_item& nuc_delta = *v->GetData().GetInstance().GetDelta().front();
2429 
2430  // note: using type long instead of TSignedSeqPos is a bug on 64-bit systems:
2431  // the result of the subtraction that's expected to be negative would be a
2432  // BIG_NUMBER that is cast to type long without the wrap-around into negatives.
2433  TSignedSeqPos nuc_delta_len = nuc_delta.GetSeq().GetLiteral().GetLength()
2434  - sequence::GetLength(p->GetLoc(), NULL);
2435 
2436 
2438  sequence::GetStrand(p->GetLoc(), NULL))) {
2439  //need the variation and cds to be in the same orientation, such that
2440  //sequence represents the actual codons and AAs when mapped to protein
2441  FlipStrand(*v);
2442  }
2443 
2444  if(verbose) NcbiCerr << "Normalized variation: " << MSerial_AsnText << *v;
2445 
2446 
2447  //Map to protein and back to get the affected codons.
2448  //Note: need the scope because the cds_feat is probably gi-based,
2449  //while our locs are probably accver-based
2450  CRef<CSeq_loc_Mapper> nuc2prot_mapper, prot2nuc_mapper;
2451  CRef<CSeq_loc> prot_loc, codons_loc;
2452  try {
2453  nuc2prot_mapper.Reset(new CSeq_loc_Mapper(
2454  cds_feat,
2456  m_scope));
2457  prot2nuc_mapper.Reset(new CSeq_loc_Mapper(cds_feat,
2459  m_scope));
2460 
2461  prot_loc = nuc2prot_mapper->Map(p->GetLoc());
2462  codons_loc = prot2nuc_mapper->Map(*prot_loc);
2463  }
2464  catch(CException&) {
2465  // may legitimately throw if feature
2466  // is not good for mapping (e.g. partial).
2467  }
2468 
2469  const bool is_within_cds = Contains(cds_feat.GetLocation(), nuc_p.GetLoc(), m_scope);
2470 
2471  bool is_ok = !nuc_p.IsSetStart_offset()
2472  && !nuc_p.IsSetStop_offset()
2473  && is_within_cds
2474  && !codons_loc.IsNull()
2475  && !codons_loc->IsNull()
2476  && nuc_delta.GetSeq().IsLiteral()
2477  && nuc_delta.GetSeq().GetLiteral().IsSetSeq_data();
2478 
2479  int frameshift_phase = nuc_delta_len % 3;
2480  if(frameshift_phase < 0) {
2481  frameshift_phase += 3;
2482  }
2483 
2484  if(!is_ok) {
2486  nuc_p,
2487  cds_feat,
2488  frameshift_phase != 0,
2489  *m_scope);
2490  }
2491 
2492 
2493  string downstream_cds_suffix_seq_str; //cds sequence downstream of affected codons
2494  {{
2495  //extend cds-loc downstream by a little bit to allow inferring changes in the stop-codon
2496  //(e.g. base in the stop-codon is deleted, and the sequence downstream of cds is now involved)
2497  CRef<CSeq_loc> ext_cds_loc;
2498  {{
2499  SFlankLocs flocs = CreateFlankLocs(cds_feat.GetLocation(), 6);
2500  ext_cds_loc = sequence::Seq_loc_Add(
2501  cds_feat.GetLocation(),
2502  *flocs.downstream,
2504  }}
2505 
2506  CRef<CSeq_loc> downstream_cds_loc;
2507  {{
2508  SFlankLocs flocs = CreateFlankLocs(*codons_loc, 1000);
2509  downstream_cds_loc = ext_cds_loc->Intersect(
2510  *flocs.downstream,
2512  }}
2513 
2514  CRef<CSeq_literal> literal = x_GetLiteralAtLoc(*downstream_cds_loc);
2515  if(verbose) NcbiCerr << "Downstream-cds: " << MSerial_AsnText << *literal;
2516 
2517  if(literal->GetLength() > 0) {
2518  downstream_cds_suffix_seq_str = literal->GetSeq_data().GetIupacna().Get();
2519  }
2520  }}
2521 
2522  TSignedSeqPos frame = abs(
2524  - (TSignedSeqPos)codons_loc->GetStart(eExtreme_Biological));
2525 
2526  codons_loc->SetId(sequence::GetId(p->GetLoc(), NULL));
2527  // restore the original id, as mapping forward and back may have changed the type
2528 
2530  // not strictly required, but generally prefer accvers in the output for readability
2531  // as we're dealing with various types of mols
2532 
2533  x_AdjustDelinsToInterval(*v, *codons_loc);
2534  AttachSeq(*v, 100000); //need enough sequnece to get create reference and variant translations downstream
2535 
2536  if(!v->GetPlacements().front()->GetSeq().IsSetSeq_data()) {
2538  nuc_p,
2539  cds_feat,
2540  frameshift_phase != 0,
2541  *m_scope);
2542  }
2543 
2544  if(verbose) NcbiCerr << "Adjusted-for-codons " << MSerial_AsnText << *v;
2545 
2546  string nuc_ref_prefix = v->GetPlacements().front()->GetSeq().GetSeq_data().GetIupacna().Get();
2547 
2548  const CSeq_literal& nuc_var_literal = v->GetData().GetInstance().GetDelta().front()->GetSeq().GetLiteral();
2549  string nuc_var_prefix = nuc_var_literal.GetLength() == 0 ? "" : nuc_var_literal.GetSeq_data().GetIupacna().Get();
2550 
2551  string nuc_ref_str = nuc_ref_prefix + downstream_cds_suffix_seq_str;
2552  string nuc_var_str = nuc_var_prefix + downstream_cds_suffix_seq_str;
2553 
2554  string prot_ref_str = Translate(nuc_ref_str, p->GetMol() == CVariantPlacement::eMol_mitochondrion);
2555  string prot_var_str = Translate(nuc_var_str, p->GetMol() == CVariantPlacement::eMol_mitochondrion);
2556  int num_ref_codons = (nuc_ref_prefix.size() + 2) / 3;
2557  int num_var_codons = (nuc_var_prefix.size() + 2) / 3;
2558 
2559 
2560  if(verbose) NcbiCerr << "nuc_ref_str: " << nuc_ref_str << "\n"
2561  << "nuc_var_str: " << nuc_var_str << "\n";
2562 
2563 
2564  if(verbose) NcbiCerr << "prot_ref_str: " << prot_ref_str << "\n"
2565  << "prot_var_str: " << prot_var_str << "\n";
2566 
2567 
2568  int common_prot_prefix_len(0); //will calculate length of unchanged leading AAs in translations (starting with codons of interest)
2569 
2570 
2571  if(prot_ref_str == prot_var_str) {
2572  //No-change variation. Will truncate to original counts of affected codons, such that the
2573  //no-change variation describes the original codons that did not affect the translation.
2574  prot_ref_str.resize(min(static_cast<int>(prot_ref_str.size()), num_ref_codons));
2575  prot_var_str.resize(prot_ref_str.size());
2576 
2577  if(prot_ref_str.size() > 0 && *prot_ref_str.rbegin() == '*') {
2578  //If translated all the way to stop with no-changes, this is no longer a frameshift, even though the indel is not mod%3
2579  frameshift_phase = 0;
2580  }
2581 
2582  } else {
2583 
2584  //Justify the variation to first affected AA.
2585  //
2586  //Truncate common prefix of translations; truncate the 5'-end of the prot-loc and recalculate codons-loc accordingly.
2587  //Rationale: nucleotide level might not affect translation at the affected codons' location, but change the translation
2588  //downstream. That's why we have to skip unchanged translation to find the first affected AA.
2589  //
2590  //This does not apply to synonymous changes where there's no change at all rather than a downstream change.
2591 
2592  common_prot_prefix_len = GetCommonPrefixLen(prot_ref_str, prot_var_str);
2593  if(common_prot_prefix_len > 0
2594  && common_prot_prefix_len == static_cast<int>(prot_ref_str.size())) {
2595  common_prot_prefix_len -= 1; // when truncating common prefx, don't want to truncate all the way
2596  // will leave last position as necessary
2597  }
2598 
2599  prot_ref_str = prot_ref_str.substr(common_prot_prefix_len);
2600  prot_var_str = prot_var_str.substr(common_prot_prefix_len);
2601 
2602  if(verbose) NcbiCerr << "prot_ref_str: " << prot_ref_str << ":" << prot_ref_str.size() << "\n"
2603  << "prot_var_str: " << prot_var_str << ":" << prot_var_str.size() << "\n";
2604 
2605  if(frameshift_phase == 0) {
2606 
2607  size_t suffix_len = GetCommonSuffixLen(prot_ref_str, prot_var_str);
2608  size_t min_len = min(prot_ref_str.size(), prot_var_str.size());
2609  size_t ref_stop_pos = prot_ref_str.find('*');
2610  size_t var_stop_pos = prot_var_str.find('*');
2611  size_t min_stop_pos = min(ref_stop_pos, var_stop_pos);
2612 
2613  // VAR-699, VAR-872
2614  bool truncate_at_stop = min_stop_pos < min_len
2615  && ref_stop_pos != var_stop_pos
2616  && nuc_delta_len == 0;
2617 
2618  if(truncate_at_stop) {
2619  prot_ref_str.resize(min_stop_pos + 1);
2620  prot_var_str.resize(min_stop_pos + 1);
2621  } else {
2622  prot_ref_str.resize(prot_ref_str.size() - suffix_len);
2623  prot_var_str.resize(prot_var_str.size() - suffix_len);
2624  }
2625  } else {
2626  //Keep the frst AA in case of frameshifts
2627  //NM_000492.3:c.3528delC -> NP_000483.3:p.Lys1177Serfs instead of NP_000483.3:p.Lys1177delfs
2628  prot_ref_str.resize(min(static_cast<size_t>(1), prot_ref_str.size()));
2629  prot_var_str.resize(min(static_cast<size_t>(1), prot_var_str.size()));
2630  }
2631 
2632  //adjust the protein location.
2633  prot_loc = sequence::Seq_loc_Merge(*prot_loc, CSeq_loc::fMerge_SingleRange, NULL); //to convert point to interval
2634  if(prot_ref_str.size() == 0) {
2635  //After truncating common prefix and suffix the reference is reduced to nothing - we have pure insertion.
2636  //We need 2-AA location describing the point of insertion
2637  prot_loc->SetInt().SetFrom() += common_prot_prefix_len - 1;
2638  prot_loc->SetInt().SetTo(prot_loc->SetInt().SetFrom() + 1);
2639  } else {
2640  //the location describes the sequence that's being changed
2641  prot_loc->SetInt().SetFrom() += common_prot_prefix_len;
2642  prot_loc->SetInt().SetTo(prot_loc->SetInt().SetFrom() + prot_ref_str.size() - 1);
2643  }
2644  prot_loc = sequence::Seq_loc_Merge(*prot_loc, 0, NULL); //to convert single-pos int to point, as appropriate
2645 
2646  codons_loc = prot2nuc_mapper->Map(*prot_loc);
2647 
2648  if(codons_loc->IsNull()) {
2649  // may have gone past the end of the CDS, e.g. NG_011646.1:g.1508delT
2651  nuc_p,
2652  cds_feat,
2653  frameshift_phase != 0,
2654  *m_scope);
2655  }
2656 
2657 
2658  codons_loc->SetId(sequence::GetId(p->GetLoc(), NULL));
2659 
2660  if(verbose) NcbiCerr << "prot_ref_str: " << prot_ref_str << ":" << prot_ref_str.size() << "\n"
2661  << "prot_var_str: " << prot_var_str << ":" << prot_var_str.size() << "\n";
2662  }
2663 
2664 
2665  if(verbose) NcbiCerr << "reference codons: " << num_ref_codons
2666  << "; variant codons: " << num_var_codons
2667  << "; common prefix: " << common_prot_prefix_len << "\n";
2668 
2669 
2670  //if(verbose) NcbiCerr << "prot_ref : " << prot_ref_str << "\n" << "prot_delta: " << prot_var_str << "\ntruncated prefix: " << common_prot_prefix_len << "\n Adjusted codons loc: " << MSerial_AsnText << *codons_loc;
2671 
2672  //Constructing protein-variation
2673 
2674  CRef<CVariation> prot_v(new CVariation);
2675 
2676  //will have two placements: one describing the AA, and the other describing codons
2678 
2679  if(HasProblematicExceptions(cds_feat)) {
2680  //mapped protein position is bogus, as cds is either partial or there are indels.
2681  prot_p->SetLoc().SetEmpty().Assign(sequence::GetId(*prot_loc, NULL));
2682  prot_p->SetMol(GetMolType(sequence::GetId(prot_p->GetLoc(), NULL)));
2683  prot_p->SetSeq().SetLength(prot_ref_str.size());
2684  prot_p->SetSeq().SetSeq_data().SetNcbieaa().Set(prot_ref_str);
2685  } else {
2686  prot_p->SetLoc(*prot_loc);
2687  prot_p->SetMol(GetMolType(sequence::GetId(prot_p->GetLoc(), NULL)));
2688  AttachSeq(*prot_p);
2689  }
2690  prot_v->SetPlacements().push_back(prot_p);
2691 
2692 
2694  codons_p->SetLoc(*codons_loc);
2695  codons_p->SetMol(GetMolType(sequence::GetId(codons_p->GetLoc(), NULL)));
2696  codons_p->SetFrame(frame);
2697  AttachSeq(*codons_p);
2698  prot_v->SetPlacements().push_back(codons_p);
2699 
2700 
2701  //calculate properties
2702  {{
2703  if(frameshift_phase == 0 && prot_ref_str.size() == prot_var_str.size()) {
2704  //VAR-267 - calculate missense/synonymous/stop-gain/loss for non-frameshifting and non-length-changing cases only
2705  CVariantProperties::TEffect prop = CalcEffectForProt(prot_ref_str, prot_var_str);
2706  if(prop != 0) {
2707  prot_v->SetVariant_prop().SetEffect(prop);
2708  }
2709  }
2710 
2711  TSOTerms so_terms = CalcSOTermsForProt(nuc_delta_len,
2712  prot_ref_str,
2713  prot_var_str);
2714  copy(so_terms.begin(), so_terms.end(), back_inserter(prot_v->SetSo_terms()));
2715  }}
2716 
2717 
2718  prot_v->SetData().SetInstance().SetType(CalcInstTypeForAA(prot_ref_str, prot_var_str));
2719 
2720  //set inst data
2722  prot_v->SetData().SetInstance().SetDelta().push_back(di);
2723 
2724  if(prot_var_str.size() > 0) {
2725 
2726  // Use NA alphabet instead of AA, as the dbSNP requested original codons.
2727  // The downstream process can always translate this to AAs.
2728  //
2729  // Note, however, that we cannot always use the original variant codons sequence, because
2730  // the location may have been adjusted downstream (see comments at TruncateCommonPrefixAndSuffix).
2731  // So we'll have to get this from nuc_var_str.
2732  if(false && common_prot_prefix_len == 0) {
2733  di->SetSeq().Assign(v->GetData().GetInstance().GetDelta().front()->GetSeq());
2734  } else {
2735 
2736  if(verbose) NcbiCerr
2737  << "inst-type: " << prot_v->GetData().GetInstance().GetType()
2738  << "; nuc_var_len: " << nuc_var_str.size()
2739  << "; nuc_var_str: " << nuc_var_str
2740  << "; prefix_len: " << common_prot_prefix_len * 3
2741  << "; var_codons:" << prot_var_str.size() * 3 << "\n";
2742 
2743  // Note: need to take min here, because common_prot_prefix_len * 3 may exceed
2744  // nuc_var_str.size() due to translation of a partial codon.
2745  string adjusted_codons_str = nuc_var_str.substr(
2746  min<int>(nuc_var_str.size(), common_prot_prefix_len * 3),
2747  prot_var_str.size() * 3);
2748 
2749  if(adjusted_codons_str.size() > 0) {
2750  di->SetSeq().SetLiteral().SetLength(adjusted_codons_str.size());
2751  di->SetSeq().SetLiteral().SetSeq_data().SetIupacna().Set() = adjusted_codons_str;
2752  } else {
2753  di->SetSeq().SetThis();
2754  di->SetAction(CDelta_item::eAction_del_at);
2755  }
2756  }
2757 
2758  if(prot_ref_str.size() == 0) {
2759  di->SetAction(CDelta_item::eAction_ins_before);
2760  }
2761  } else {
2762  di->SetSeq().SetThis();
2763  di->SetAction(CDelta_item::eAction_del_at);
2764  }
2765 
2766  //set frameshift
2767  if(frameshift_phase != 0) {
2768  //note: SetEffect() contains magic-value in debug mode by default, not 0.
2769  //prot_v->SetVariant_prop().SetEffect() |= CVariantProperties::eEffect_frameshift;
2770  prot_v->SetVariant_prop().SetEffect(
2772  | (prot_v->IsSetVariant_prop()
2773  && prot_v->GetVariant_prop().IsSetEffect()
2774  ? prot_v->GetVariant_prop().GetEffect() : 0));
2775 
2776  prot_v->SetFrameshift().SetPhase(frameshift_phase);
2777  }
2778 
2779  if(verbose) NcbiCerr << "protein variation:" << MSerial_AsnText << *prot_v;
2780  if(verbose) NcbiCerr << "Done with protein variation\n";
2781 
2782  return prot_v;
2783  }
2784 
2785 
2786 
2787  ////////////////////////////////////////////////////////////////////////////////
2788  //
2789  // SO-terms calculations
2790  //
2791  ////////////////////////////////////////////////////////////////////////////////
2792 
2793 
2795  {
2797  terms.push_back(eSO_intergenic_variant);
2798  }
2800  terms.push_back(eSO_2KB_upstream_variant);
2801  }
2803  terms.push_back(eSO_500B_downstream_variant);
2804  }
2806  terms.push_back(eSO_splice_donor_variant);
2807  }
2809  terms.push_back(eSO_splice_acceptor_variant);
2810  }
2812  terms.push_back(eSO_intron_variant);
2813  }
2815  terms.push_back(eSO_5_prime_UTR_variant);
2816  }
2818  terms.push_back(eSO_3_prime_UTR_variant);
2819  }
2821  terms.push_back(eSO_nc_transcript_variant);
2822  }
2824  terms.push_back(eSO_initiator_codon_variant);
2825  }
2827  terms.push_back(eSO_terminator_codon_variant);
2828  }
2829 
2831  terms.push_back(eSO_frameshift_variant);
2832  }
2834  terms.push_back(eSO_missense_variant);
2835  }
2836 
2838  terms.push_back(eSO_stop_gained);
2839  }
2841  terms.push_back(eSO_stop_lost);
2842  }
2844  terms.push_back(eSO_synonymous_variant);
2845  }
2846  }
2847 
2849  {
2850  return
2851  term == eSO_intergenic_variant ? "intergenic_variant"
2852  : term == eSO_2KB_upstream_variant ? "2KB_upstream_variant"
2853  : term == eSO_500B_downstream_variant ? "500B_downstream_variant"
2854  : term == eSO_splice_donor_variant ? "splice_donor_variant"
2855  : term == eSO_splice_acceptor_variant ? "splice_acceptor_varian"
2856  : term == eSO_intron_variant ? "intron_variant"
2857  : term == eSO_5_prime_UTR_variant ? "5_prime_UTR_variant"
2858  : term == eSO_3_prime_UTR_variant ? "3_prime_UTR_variant"
2859  : term == eSO_coding_sequence_variant ? "coding_sequence_variant"
2860  : term == eSO_nc_transcript_variant ? "nc_transcript_variant"
2861  : term == eSO_initiator_codon_variant ? "initiator_codon_variant"
2862  : term == eSO_terminator_codon_variant ? "terminator_codon_variant"
2863 
2864  : term == eSO_synonymous_variant ? "synonymous_variant"
2865  : term == eSO_missense_variant ? "missense_variant"
2866  : term == eSO_frameshift_variant ? "frameshift_variant"
2867  : term == eSO_inframe_indel ? "inframe_indel"
2868  : term == eSO_stop_gained ? "stop_gained"
2869  : term == eSO_stop_lost ? "stop_lost"
2870  : "other_variant";
2871  };
2872 
2873 
2874 
2876  {
2877  if(v.IsSetPlacements()) {
2878  return &v.GetPlacements();
2879  } else if(v.GetParent()) {
2880  return s_GetPlacements(*v.GetParent());
2881  } else {
2882  return NULL;
2883  }
2884  }
2885 
2887  {
2888  const CVariation::TPlacements* placements = s_GetPlacements(v);
2889  ITERATE(CVariation::TPlacements, it, *placements)
2890  {
2891  const CVariantPlacement& p = **it;
2892  if(p.IsSetSeq()) {
2893  return CConstRef<CSeq_literal>(&p.GetSeq());
2894  }
2895  }
2896  return CConstRef<CSeq_literal>(NULL);
2897  }
2898 
2900  {
2901  const CVariation* parent = v.GetParent();
2902  if(parent == NULL) {
2903  return CConstRef<CSeq_literal>(NULL);
2904  } else {
2905  if(parent->GetData().IsSet()) {
2907  {
2908  const CVariation& sibling = **it;
2909  if(sibling.GetData().IsInstance()
2910  && sibling.GetData().GetInstance().IsSetObservation()
2912  && sibling.GetData().GetInstance().GetDelta().size() == 1
2913  && sibling.GetData().GetInstance().GetDelta().front()->IsSetSeq()
2914  && sibling.GetData().GetInstance().GetDelta().front()->GetSeq().IsLiteral()) {
2915  return CConstRef<CSeq_literal>(&sibling.GetData().GetInstance().GetDelta().front()->GetSeq().GetLiteral());
2916  }
2917  }
2918  }
2919 
2920  return s_FindAssertedLiteral(*parent);
2921  }
2922  }
2923 
2924 
2925  static bool Equals(
2926  const CVariation::TPlacements& p1,
2927  const CVariation::TPlacements& p2)
2928  {
2929  if(p1.size() != p2.size()) {
2930  return false;
2931  }
2932  CVariation::TPlacements::const_iterator it1 = p1.begin();
2933  CVariation::TPlacements::const_iterator it2 = p2.begin();
2934 
2935  for(; it1 != p1.end() && it2 != p2.end(); ++it1, ++it2) {
2936  const CVariantPlacement& p1 = **it1;
2937  const CVariantPlacement& p2 = **it2;
2938  if(!p1.Equals(p2)) {
2939  return false;
2940  }
2941  }
2942  return true;
2943  }
2944 
2946  {
2947  if(!v.GetData().IsSet()) {
2948  return;
2949  }
2950 
2953  v.SetData().SetSet().SetVariations())
2954  {
2955  s_FactorOutPlacements(**it);
2956  }
2957 
2959 
2960  bool ok = true;
2963  v.SetData().SetSet().SetVariations())
2964  {
2965  CVariation& v2 = **it;
2966  if(!v2.IsSetPlacements()) {
2967  ok = false;
2968  break;
2969  } else if(!p1) {
2970  p1 = &v2.SetPlacements();
2971  continue;
2972  } else {
2973  if(!Equals(*p1, v2.GetPlacements())) {
2974  ok = false;
2975  break;
2976  }
2977  }
2978  }
2979 
2980  if(ok && p1) {
2981  //transfer p1 placements to this level
2983  {
2984  v.SetPlacements().push_back(*it);
2985  }
2986 
2987  //reset placements at the children level
2989  v.SetData().SetSet().SetVariations())
2990  {
2991  CVariation& v2 = **it;
2992  v2.ResetPlacements();
2993  }
2994  }
2995  }
2996 
2997 
2999  const CVariation& v,
3000  const CVariantPlacement& p)
3001  {
3002  CConstRef<CVariation> cons_v(NULL);
3003  if(v.IsSetConsequence() && p.GetLoc().GetId()) {
3005  {
3006  const CVariation::TConsequence::value_type::TObjectType& cons = **it;
3007  if(cons.IsVariation()
3008  && cons.GetVariation().IsSetPlacements()) {
3009  ITERATE(CVariation::TPlacements, it2, cons.GetVariation().GetPlacements())
3010  {
3011  const CVariantPlacement& vp = **it2;
3012  if(vp.GetLoc().GetId()
3013  && p.GetLoc().GetId()
3014  && vp.GetLoc().GetId()->Equals(*p.GetLoc().GetId())) {
3015  cons_v.Reset(&cons.GetVariation());
3016  break;
3017  }
3018  }
3019  }
3020  }
3021  }
3022 
3023  if(!cons_v && v.GetData().IsSet()) {
3025  v.GetData().GetSet().GetVariations())
3026  {
3028  if(cons_v1) {
3029  cons_v = cons_v1;
3030  break;
3031  }
3032  }
3033  }
3034 
3035  return cons_v;
3036  }
3037 
3038 
3040  {
3042  {
3043  const CDbtag& dbtag = **it;
3044  if(dbtag.GetDb() == "GeneID"
3045  && dbtag.GetTag().IsId()
3046  && dbtag.GetTag().GetId() == gene_id) {
3047  return;
3048  }
3049  }
3050 
3051  CRef<CDbtag> dbtag(new CDbtag());
3052  dbtag->SetDb("GeneID");
3053  dbtag->SetTag().SetId(gene_id);
3054  p.SetDbxrefs().push_back(dbtag);
3055  }
3056 
3058  {
3059  if(!placement.IsSetGene_location()) {
3060  // need to zero-out the bitmask,
3061  // otherwise in debug mode it will be preset to a magic value,
3062  // and then modifying it with "|=" will produce garbage.
3063  placement.SetGene_location(0);
3064  }
3065 
3066 
3067  // for offset-style intronic locations (not genomic and have offset),
3068  // can infer where we are based on offset
3069  if(!placement.IsSetMol()
3070  || placement.GetMol() != CVariantPlacement::eMol_genomic) {
3072  sequence::GetId(placement.GetLoc(), NULL));
3073 
3074  if(placement.IsSetStart_offset()
3075  && placement.GetStart_offset() != 0) {
3077  placement,
3078  placement.GetStart_offset(),
3079  placement.GetLoc(),
3080  bsh);
3081  }
3082 
3083  if(placement.IsSetStop_offset()
3084  && placement.GetStop_offset() != 0) {
3086  placement,
3087  placement.GetStop_offset(),
3088  placement.GetLoc(),
3089  bsh);
3090  }
3091  }
3092 
3095 
3096  // note: this assumes that the offsets are HGVS-spec compliant:
3097  // anchor locations are at the exon terminals, and the
3098  // offset values point into the intron.
3099  bool is_completely_intronic = false;
3100  {{
3101  bool is_start_offset = placement.IsSetStart_offset()
3102  && placement.GetStart_offset() != 0;
3103 
3104  bool is_stop_offset = placement.IsSetStop_offset()
3105  && placement.GetStop_offset() != 0;
3106 
3107  // Single anchor point, and have any offset.
3108  bool is_case1 = sequence::GetLength(placement.GetLoc(), NULL) == 1
3109  && (is_start_offset || is_stop_offset);
3110 
3111  // Other possibility is when start and stop are addressed from different exons:
3112  // The location length must be 2 (end of one exon and start of the other),
3113  // and the offsets point inwards.
3114  bool is_case2 = sequence::GetLength(placement.GetLoc(), NULL) == 2
3115  && is_start_offset && placement.GetStart_offset() > 0
3116  && is_stop_offset && placement.GetStop_offset() < 0;
3117 
3118  is_completely_intronic = is_case1 || is_case2;
3119  }}
3120 
3121  // collapse all gene-specific location properties into prop.
3122  // Will do it in three rounds so that more important properties
3123  // will have their gene-ids on top of the list: VAR-1297
3124  // First: ignore near-gene-or-intron.
3125  // Second: ignore near-gene.
3126  // Third: don't ignore any property.
3127  for(size_t i = 0; i < 3; i++) {
3128  static const CVariantProperties::TGene_location flags[3] = {
3132  ,
3135  ,
3136  0 };
3137 
3139  {
3140  int gene_id = it->first;
3141  CVariantProperties::TGene_location loc_prop = it->second;
3142 
3143  if(loc_prop & flags[i]) {
3144  continue;
3145  }
3146 
3147  if(gene_id != 0) {
3148  s_AttachGeneIDdbxref(placement, gene_id);
3149  }
3150 
3151  if(!is_completely_intronic) {
3152  //we don't want to compute properties for intronic anchor points. VAR-149
3153  placement.SetGene_location() |= loc_prop;
3154  }
3155  }
3156  }
3157 
3158  }
3159 
3160 
3162  const CSeq_align& transcript_aln,
3163  const CSeq_loc& query_loc,
3164  TSOTerms& terms)
3165  {
3166  //note: initializing mapper with scope because the annotation from scope is gi-based,
3167  //while the parameters are normaly accver-based
3168  CRef<CSeq_loc_Mapper> mapper(new CSeq_loc_Mapper(transcript_aln, 1, m_scope));
3169 
3170  CConstRef<CSeq_loc> genomic_query_loc;
3171  if(query_loc.GetId() && query_loc.GetId()->Equals(transcript_aln.GetSeq_id(0))) {
3172  genomic_query_loc = mapper->Map(query_loc);
3173  } else {
3174  genomic_query_loc.Reset(&query_loc);
3175  }
3176 
3177  CRef<CSeq_loc> rna_loc = transcript_aln.CreateRowSeq_loc(1);
3178 
3179  CRef<CSeq_loc> cds_loc;
3180  {{
3181  CBioseq_Handle bsh = m_scope->GetBioseqHandle(transcript_aln.GetSeq_id(0));
3182  for(CFeat_CI ci(bsh, SAnnotSelector(CSeqFeatData::e_Cdregion)); ci; ++ci) {
3183  const CMappedFeat& mf = *ci;
3184  cds_loc = mapper->Map(mf.GetLocation());
3185 
3186  //remove indels from the mapped cds loc
3188  cds_loc = rna_loc->Intersect(*cds_loc, 0, NULL);
3189  break;
3190  }
3191  }}
3192 
3193  s_FindLocationProperties(rna_loc, cds_loc, *genomic_query_loc, terms);
3194  }
3195 
3196 
3198  CConstRef<CSeq_loc> rna_loc,
3199  CConstRef<CSeq_loc> cds_loc,
3200  const CSeq_loc& query_loc,
3201  CVariationUtil::TSOTerms& terms)
3202  {
3203  struct SPropsMap
3204  {
3206  typedef map<CSeq_id_Handle, TRangeMap> TIdRangeMap;
3207  TIdRangeMap loc_map;
3208 
3209  void Add(CVariationUtil::ESOTerm term, const CSeq_loc& loc)
3210  {
3211  for(CSeq_loc_CI ci(loc); ci; ++ci) {
3212  loc_map[ci.GetSeq_id_Handle()][ci.GetRange()] = term;
3213  }
3214  }
3215  } props_map;
3216 
3217  typedef pair<CRef<CSeq_loc>, CRef<CSeq_loc> > TLocsPair;
3218 
3219  if(!rna_loc && !cds_loc) {
3220  return;
3221  }
3222 
3223  const CSeq_loc& main_loc = rna_loc ? *rna_loc : *cds_loc;
3224 
3225  {{
3227  main_loc,
3228  main_loc.GetStop(eExtreme_Positional) + 100000);
3229 
3230  props_map.Add(eSO_2KB_upstream_variant, *p.first);
3231  props_map.Add(eSO_500B_downstream_variant, *p.second);
3232  }}
3233 
3234  {{
3236  props_map.Add(eSO_intron_variant, *p.first);
3237  size_t i(0);
3238  for(CSeq_loc_CI ci(*p.second,
3240  CSeq_loc_CI::eOrder_Biological); ci; ++ci) {
3241  props_map.Add((i % 2 ? eSO_splice_acceptor_variant
3243  *ci.GetRangeAsSeq_loc());
3244  i++;
3245  }
3246  }}
3247 
3248  if(!cds_loc) {
3249  props_map.Add(eSO_nc_transcript_variant, *rna_loc);
3250  } else {
3251  props_map.Add(eSO_coding_sequence_variant, *cds_loc);
3252 
3253  {{
3255  props_map.Add(eSO_initiator_codon_variant, *p.first);
3256  props_map.Add(eSO_terminator_codon_variant, *p.second);
3257  }}
3258 
3259  if(rna_loc) {
3260  TLocsPair p = CVariantPropertiesIndex::s_GetUTRLocs(*cds_loc, *rna_loc);
3261  props_map.Add(eSO_5_prime_UTR_variant, *p.first);
3262  props_map.Add(eSO_3_prime_UTR_variant, *p.second);
3263  }
3264  }
3265 
3266  set<CVariationUtil::ESOTerm> terms_set;
3267  for(CSeq_loc_CI ci(query_loc, CSeq_loc_CI::eEmpty_Skip); ci; ++ci) {
3268  const SPropsMap::TRangeMap& rm = props_map.loc_map[ci.GetSeq_id_Handle()];
3269  for(SPropsMap::TRangeMap::const_iterator it2 = rm.begin(ci.GetRange()); it2.Valid(); ++it2) {
3270  terms_set.insert(it2->second);
3271  }
3272  }
3273  copy(terms_set.begin(), terms_set.end(), back_inserter(terms));
3274  }
3275 
3276 
3277 
3278 
3279 
3280  //transcript length less polyA
3282  {
3285  return bsh.GetInst_Length();
3286  }
3287 
3288  SAnnotSelector sel;
3291 
3292  TSeqPos last_exon_pos(0);
3293  TSeqPos last_polyA_pos(0);
3294  for(CFeat_CI ci(bsh, sel); ci; ++ci) {
3295  const CMappedFeat& mf = *ci;
3297  last_exon_pos = max(last_exon_pos, sequence::GetStop(mf.GetLocation(), NULL));
3298  } else {
3299  last_polyA_pos = max(last_polyA_pos, sequence::GetStop(mf.GetLocation(), NULL));
3300  }
3301  }
3302 
3303  return last_exon_pos ? last_exon_pos + 1
3304  : last_polyA_pos ? last_polyA_pos + 1
3305  : bsh.GetInst_Length();
3306  }
3307 
3308 
3310  CVariantPlacement& p,
3311  int offset,
3312  const CSeq_loc& loc,
3313  CBioseq_Handle& bsh)
3314  {
3315  if(!p.IsSetGene_location()) {
3316  //need to zero-out the bitmask, otherwise in debug mode it will be preset to a magic value,
3317  //and then modifying it with "|=" will produce garbage.
3318  p.SetGene_location(0);
3319  }
3320 
3322  offset *= -1;
3323  }
3324 
3325  if(loc.GetStop(eExtreme_Positional) + 1 >= GetEffectiveTranscriptLength(bsh) && offset > 0) {
3326  //at the 3'-end; check if near-gene or intergenic
3327  if(offset <= 500) {
3329  } else {
3331  }
3332  } else if(loc.GetStart(eExtreme_Positional) == 0 && offset < 0) {
3333  //at the 5'-end; check if near-gene or intergenic
3334  if(offset >= -2000) {
3336  } else {
3338  }
3339  } else {
3340  //intronic or splice
3341  if(offset < 0 && offset >= -2) {
3343  } else if(offset > 0 && offset <= 2) {
3345  } else {
3347  }
3348  }
3349 
3350  if(p.GetGene_location() == 0) {
3351  p.ResetGene_location();
3352  }
3353  }
3354 
3355 
3357  {
3358  v.Index();
3359 
3360  for(CTypeIterator<CVariation> it(Begin(v)); it; ++it) {
3361  CVariation& v2 = *it;
3362  if(!v2.IsSetPlacements()) {
3363  continue;
3364  }
3365 
3366  //will collapse placement-specific gene-location properties onto variation properties
3367  if(!v2.SetVariant_prop().IsSetGene_location()) {
3368  v2.SetVariant_prop().SetGene_location(0); //clear out the magic-value
3369  }
3370 
3371  NON_CONST_ITERATE(CVariation::TPlacements, it2, v2.SetPlacements())
3372  {
3373  CVariantPlacement& p = **it2;
3375 
3376  if(v2.GetConsequenceParent()) {
3377  //If this variation is a consequence of a parent variation, we are only interested
3378  //in the product-specific properties (as the consequence variation will have nucleotide-level placement
3379  //but we're not interested in recomputing multiple-genes-specific properties here.
3380  break;
3381  }
3382 
3383  v2.SetVariant_prop().SetGene_location() |= p.GetGene_location();
3384  }
3385  }
3386  }
3387 
3388 
3390  const CSeq_loc& loc,
3392  {
3394  TMap m; //will collapse properties per gene-id (TGene_location is a bitmask)
3395 
3396  CRef<CSeq_loc> loc2(new CSeq_loc);
3397  loc2->Assign(loc);
3399 
3400  for(CSeq_loc_CI ci(*loc2); ci; ++ci) {
3401  CSeq_id_Handle idh = ci.GetSeq_id_Handle();
3402 
3403  if(m_loc2prop.find(idh) == m_loc2prop.end()) {
3404  //first time seeing this seq-id - index it
3405  try {
3406  x_Index(idh);
3407  }
3408  catch(CException& e) {
3409  NCBI_RETHROW_SAME(e, "Can't Index " + idh.AsString());
3410  }
3411  }
3412 
3414  if(it == m_loc2prop.end()) {
3415  return;
3416  }
3417  const TRangeMap& rm = it->second;
3418 
3419  for(TRangeMap::const_iterator it2 = rm.begin(ci.GetRange()); it2.Valid(); ++it2) {
3420  ITERATE(TGeneIDAndPropVector, it3, it2->second)
3421  {
3422  TGeneIDAndProp gene_id_and_prop = *it3;
3423  int gene_id = gene_id_and_prop.first;
3424  CVariantProperties::TGene_location properties = gene_id_and_prop.second;
3425 
3426  if(m.find(gene_id) == m.end()) {
3427  m[gene_id] = 0; //need to zero-out the bitmask in debug mode
3428  }
3429  m[gene_id] |= properties;
3430  }
3431  }
3432  }
3433 
3434  ITERATE(TMap, it, m)
3435  {
3436  v.push_back(TGeneIDAndProp(it->first, it->second));
3437  }
3438  }
3439 
3441  const CSeq_loc& loc,
3442  int gene_id,
3444  {
3445  for(CSeq_loc_CI ci(loc); ci; ++ci) {
3446  try {
3447  m_loc2prop[ci.GetSeq_id_Handle()][ci.GetRange()].push_back(TGeneIDAndProp(gene_id, prop));
3448  }
3449  catch(CException& e) {
3450  NcbiCerr << MSerial_AsnText << loc << gene_id << " " << prop;
3451  NCBI_RETHROW_SAME(e, "Can't index location");
3452  }
3453  }
3454  }
3455 
3456  ///Note: this is strand-agnostic
3458  {
3459  public:
3461  {
3462  for(CSeq_loc_CI ci(loc); ci; ++ci) {
3463  m_rangemap[ci.GetSeq_id_Handle()][ci.GetRange()] = ci.GetRangeAsSeq_loc();
3464  }
3465  }
3466 
3467  void operator()(CSeq_loc& container_loc) const
3468  {
3469  for(CTypeIterator<CSeq_loc> it(Begin(container_loc)); it; ++it) {
3470  CSeq_loc& loc = *it;
3471  if(!loc.IsInt() && !loc.IsPnt()) {
3472  continue;
3473  }
3475  TIdRangeMap::const_iterator it2 = m_rangemap.find(idh);
3476  if(it2 == m_rangemap.end()) {
3477  continue;
3478  }
3479  const TRangeMap& rm = it2->second;
3480 
3481  for(TRangeMap::const_iterator it3 = rm.begin(loc.GetTotalRange()); it3.Valid(); ++it3) {
3482  CConstRef<CSeq_loc> loc2 = it3->second;
3483  CRef<CSeq_loc> loc3 = sequence::Seq_loc_Subtract(loc, *loc2, 0, NULL);
3484  loc.Assign(*loc3);
3485  }
3486  }
3487  }
3488  private:
3492  };
3493 
3494 
3495  static bool IsRefSeqGene(const CBioseq_Handle& bsh)
3496  {
3497  if(!bsh.CanGetDescr()) {
3498  return false;
3499  }
3500  ITERATE(CSeq_descr::Tdata, it, bsh.GetDescr().Get())
3501  {
3502  const CSeqdesc& desc = **it;
3503  if(desc.IsGenbank()) {
3504  const CGB_block::TKeywords& k = desc.GetGenbank().GetKeywords();
3505  if(std::find(k.begin(), k.end(), "RefSeqGene") != k.end()) {
3506  return true;
3507  }
3508  }
3509  }
3510  return false;
3511  }
3512 
3513  // VAR-239
3514  // If RefSeqGene NG, Return GeneIDs for loci having alignments to transcripts.
3515  // Otherwise, return empty set
3517  {
3518  set<int> gene_ids;
3519  if(!IsRefSeqGene(bsh)) {
3520  return gene_ids;
3521  }
3522 
3523  set<CSeq_id_Handle> transcript_seq_ids;
3524  for(CAlign_CI ci(bsh); ci; ++ci) {
3525  const CSeq_align& aln = *ci;
3526  if(aln.GetSegs().IsSpliced()) {
3527  transcript_seq_ids.insert(CSeq_id_Handle::GetHandle(aln.GetSeq_id(0)));
3528  }
3529  }
3530 
3531  SAnnotSelector sel;
3535  for(CFeat_CI ci(bsh, sel); ci; ++ci) {
3536  const CMappedFeat& mf = *ci;
3537  if(!mf.IsSetProduct() || !mf.IsSetDbxref()) {
3538  continue;
3539  }
3540 
3543 
3544  if(transcript_seq_ids.find(product_id) == transcript_seq_ids.end()) {
3545  continue;
3546  }
3547 
3548  //note: using temporary reference "dbxrefs" is necessary because
3549  //"ITERATE(CSeq_feat::TDbxref, it, mf.GetDbxref())", assert-fails in stl-safe-iter mode
3550  const CSeq_feat::TDbxref& dbxrefs = mf.GetDbxref();
3551  ITERATE(CSeq_feat::TDbxref, it, dbxrefs)
3552  {
3553  const CDbtag& dbtag = **it;
3554  if(dbtag.GetDb() == "GeneID"
3555  || dbtag.GetDb() == "LocusID") {
3556  gene_ids.insert(dbtag.GetTag().GetId());
3557  }
3558  }
3559  }
3560  return gene_ids;
3561  }
3562 
3563 
3564  // There's no GeneID on a protein bioseq.
3565  // Instead, need to find the corresponding cdregion (which may or may not have GeneID dbxref),
3566  // and get GeneID based on parent gene feature.
3568  CBioseq_Handle orig_bsh)
3569  {
3570  CBioseq_Handle bsh = orig_bsh.GetInst_Mol() == CSeq_inst::eMol_aa ?
3571  sequence::GetNucleotideParent(orig_bsh) : orig_bsh;
3572 
3573  SAnnotSelector sel;
3574  sel.SetResolveTSE();
3575  sel.SetAdaptiveDepth();
3579  CFeat_CI ci(bsh, sel);
3580  feature::CFeatTree ft(ci);
3581 
3582  for(ci.Rewind(); ci; ++ci) {
3583  const CMappedFeat& mf = *ci;
3584  if(mf.IsSetProduct()
3585  && mf.GetProduct().GetId()
3587  *orig_bsh.GetSeqId(),
3588  *mf.GetProduct().GetId(),
3589  &orig_bsh.GetScope())) {
3590  return s_GetGeneID(mf, ft);
3591  }
3592  }
3593  return 0;
3594  }
3595 
3596 
3598  {
3599  SAnnotSelector sel;
3600  sel.SetResolveAll(); //needs to work on NCs
3601  sel.SetAdaptiveDepth();
3605 
3607  CFeat_CI ci(bsh, sel);
3608  feature::CFeatTree ft(ci);
3609 
3610  m_loc2prop[idh].size(); //in case there's no annotation, simply add the entry to the map
3611  //se we know it has been indexed.
3612 
3613  set<int> focus_loci = GetFocusLocusIDs(bsh);
3614 
3615  //Note: A location can have in-neighborhood == true only if it is NOT in
3616  //range of another locus. However, if we're dealing with in-focus/out-of-focus
3617  //genes on NG, then for the purpose of calculating in-neighborhood flags we need to
3618  //put the non-focus genes out-of-scope. On the other hand, when calculating is-intergenic
3619  //flag, we have to assume that the non-focus genes are in scope (VAR-239).
3620  //That's why we need to collect focus_gene_ranges and non_focus_gene_ranges separately.
3621  //(See the code below that sets CVariantProperties::eGene_location_intergenic)
3622 
3623  CRef<CSeq_loc> focus_gene_ranges(new CSeq_loc(CSeq_loc::e_Mix));
3624  CRef<CSeq_loc> non_focus_gene_ranges(new CSeq_loc(CSeq_loc::e_Mix));
3625  for(ci.Rewind(); ci; ++ci) {
3626  const CMappedFeat& mf = *ci;
3627  if(!mf.GetData().IsGene()) {
3628  continue;
3629  }
3630 
3631  const int gene_id = s_GetGeneID(mf, ft);
3632  const bool is_focus_locus = focus_loci.empty()
3633  || focus_loci.count(gene_id);
3634 
3635  (is_focus_locus ? focus_gene_ranges
3636  : non_focus_gene_ranges)
3637  ->SetMix().Set().push_back(
3639  mf.GetLocation(),
3641  NULL));
3642  }
3643  focus_gene_ranges->ResetStrand();
3644  non_focus_gene_ranges->ResetStrand();
3645  SFastLocSubtract subtract_gene_ranges_from(*focus_gene_ranges);
3646 
3647  CRef<CSeq_loc> all_gene_neighborhoods(new CSeq_loc(CSeq_loc::e_Mix));
3648 
3649  bool found_some_gene_ids = false;
3650 
3651  for(ci.Rewind(); ci; ++ci) {
3652  const CMappedFeat& mf = *ci;
3653  CMappedFeat parent_mf = ft.GetParent(mf);
3654 
3655  if(!mf.GetLocation().GetId()) {
3656  continue;
3657  }
3658 
3659  const int gene_id = s_GetGeneID(mf, ft);
3660  if(!focus_loci.empty()
3661  && focus_loci.find(gene_id) == focus_loci.end()) {
3662  continue; //VAR-239
3663  }
3664 
3665  if(!parent_mf && gene_id) {
3666  // Some locations currently may not be covered by
3667  // any variant-properties values (e.g. in-cds)
3668  // and yet we need to index gene_ids at those locations.
3669  // Since variant-properties is a bitmask,
3670  // we can simply use the value 0 for that.
3671  // Only need to do that for parent locs.
3672  x_Add(mf.GetLocation(), gene_id, 0);
3673  found_some_gene_ids = true;
3674  }
3675 
3676  // compute neighbborhood locations.
3677  // Normally for gene features, or rna features lacking a parent gene feature.
3678  // Applicable for dna context only.
3679  if((mf.GetData().IsGene() || (!parent_mf && mf.GetData().IsRna()))
3680  && bsh.GetBioseqMolType() == CSeq_inst::eMol_dna) {
3681  TLocsPair p = s_GetNeighborhoodLocs(mf.GetLocation(), bsh.GetInst_Length() - 1);
3682 
3683  // exclude any gene overlap from neighborhood
3684  // (i.e. we don't care if location is in
3685  // neighborhood of gene-A if it is within gene-B.
3686 
3687  // First, reset strands (as we did with gene_ranges),
3688  // as we need to do this in strand-agnostic manner.
3689  // Note: a reasonable thing to do would be to set
3690  // all-gene-ranges strand to "both", with an expectation
3691  // that subtracting such a loc from either plus or minus
3692  // loc would work, but unfortunately it doesn't.
3693  // Resetting strand, however, is fine, because our
3694  // indexing is strand-agnostic.
3695  p.first->ResetStrand();
3696  p.second->ResetStrand();
3697 
3698 #if 0
3699  // Note: disabling subtraction of gene ranges because want to
3700  // report neargene-ness regardless of other loci SNP-5000
3701 #if
3702  //all_gene_ranges is a big complex loc, and subtracting with Seq_loc_Subtract
3703  //for every neighborhood is slow (takes almost 10 seconds for NC_000001),
3704  //so we'll use fast map-based subtractor instead
3705  p.first = sequence::Seq_loc_Subtract(
3706  *p.first,
3707  *all_gene_ranges,
3709  NULL);
3710 
3711  p.second = sequence::Seq_loc_Subtract(
3712  *p.second,
3713  *all_gene_ranges,
3715  NULL);
3716 #else
3717  subtract_gene_ranges_from(*p.first);
3718  subtract_gene_ranges_from(*p.second);
3719 #endif
3720 #endif
3721  x_Add(*p.first, gene_id, CVariantProperties::eGene_location_near_gene_5);
3722  x_Add(*p.second, gene_id, CVariantProperties::eGene_location_near_gene_3);
3723 
3724  all_gene_neighborhoods->SetMix().Set().push_back(p.first);
3725  all_gene_neighborhoods->SetMix().Set().push_back(p.second);
3726  }
3727 
3728  //VAR-703: need to process both RNAs and CDSes to include IGs.
3729  if(mf.GetData().IsRna() || mf.GetData().IsCdregion()) {
3730  if(mf.GetData().IsRna()
3732  x_Add(mf.GetLocation(),
3733  gene_id,
3735  }
3736 
3737  TLocsPair p = s_GetIntronsAndSpliceSiteLocs(mf.GetLocation());
3738  x_Add(*p.first,
3739  gene_id,
3741 
3742  size_t i(0);
3743  for(CSeq_loc_CI ci(*p.second,
3746  ci; ++ci) {
3747  x_Add(*ci.GetRangeAsSeq_loc(),
3748  gene_id,
3751  ++i;
3752  }
3753  }
3754 
3755  if(mf.GetData().IsCdregion()) {
3756  TLocsPair p = s_GetStartAndStopCodonsLocs(mf.GetLocation());
3757 
3758  x_Add(*p.first,
3759  gene_id,
3761 
3762  x_Add(*p.second,
3763  gene_id,
3765 
3767  {{
3768  if(parent_mf) {
3769  rna_loc->Assign(parent_mf.GetLocation());
3770  } else if(bsh.GetBioseqMolType() == CSeq_inst::eMol_rna) {
3771  //refseqs have gene feature annotated on rnas spanning the whole sequence,
3772  //but in general there may be free-floating CDS on an rna, in which case
3773  //parent loc is the whole seq.
3774  rna_loc = bsh.GetRangeSeq_loc(
3775  0,
3776  bsh.GetInst_Length() - 1,
3777  mf.GetLocation().GetStrand());
3778  }
3779  }}
3780 
3781  p = s_GetUTRLocs(mf.GetLocation(), *rna_loc);
3782 
3783  x_Add(*p.first,
3784  gene_id,
3786 
3787  x_Add(*p.second,
3788  gene_id,
3790 
3791  } else if(mf.GetData().IsGene()) {
3792  if(ft.GetChildren(mf).size() == 0) {
3793  x_Add(mf.GetLocation(),
3794  gene_id,
3798  }
3799  }
3800  }
3801 
3802  if(bsh.GetBioseqMolType() == CSeq_inst::eMol_dna) {
3803 
3804  CRef<CSeq_loc> range_loc =
3805  bsh.GetRangeSeq_loc(0, bsh.GetInst_Length() - 1, eNa_strand_plus);
3806 
3807  CRef<CSeq_loc> genes_and_neighborhoods_loc =
3808  sequence::Seq_loc_Add(*all_gene_neighborhoods,
3809  *focus_gene_ranges,
3810  0, NULL);
3811 
3812  genes_and_neighborhoods_loc =
3814  *genes_and_neighborhoods_loc,
3815  *non_focus_gene_ranges,
3816  0, NULL);
3817 
3818  genes_and_neighborhoods_loc->ResetStrand();
3819 
3820  CRef<CSeq_loc> intergenic_loc =
3822  *range_loc,
3823  *genes_and_neighborhoods_loc,
3825  NULL);
3826 
3827  x_Add(*intergenic_loc,
3828  0,
3830  }
3831 
3833  && !found_some_gene_ids) {
3834  // JIRA:SNP-5390
3835  // in it's normal configuration the iterator won't
3836  // find a feature with GeneID on a protein, and so
3837  // it would not be reported in the protein placement.
3838  // If a CDS is annotated directly on a DNA molecule,
3839  // there would be no product-specific GeneID at all (
3840  CRef<CSeq_loc> whole_range_loc =
3841  bsh.GetRangeSeq_loc(0, bsh.GetInst_Length() - 1, eNa_strand_plus);
3842 
3843  int gene_id = s_GetGeneIdForProduct(bsh);
3844  if(gene_id) {
3845  x_Add(*whole_range_loc, gene_id, 0);
3846  }
3847  }
3848  }
3849 
3851  const CMappedFeat& mf,
3852  feature::CFeatTree& ft)
3853  {
3854  int gene_id = 0;
3855  if(mf.IsSetDbxref()) {
3856  //note: using temporary reference "dbxrefs" is necessary because
3857  //"ITERATE(CSeq_feat::TDbxref, it, mf.GetDbxref())", assert-fails in stl-safe-iter mode
3858  const CSeq_feat::TDbxref& dbxrefs = mf.GetDbxref();
3859  ITERATE(CSeq_feat::TDbxref, it, dbxrefs)
3860  {
3861  const CDbtag& dbtag = **it;
3862  if(dbtag.GetDb() == "GeneID"
3863  || dbtag.GetDb() == "LocusID") {
3864  gene_id = dbtag.GetTag().GetId();
3865  }
3866  }
3867  }
3868 
3869  // try legacy representation, e.g. NM_000155.1:
3870  if(!gene_id
3871  && mf.GetData().IsGene()
3872  && mf.GetData().GetGene().IsSetDb()) {
3873  const CGene_ref::TDb& dbxrefs = mf.GetData().GetGene().GetDb();
3874  ITERATE(CSeq_feat::TDbxref, it, dbxrefs)
3875  {
3876  const CDbtag& dbtag = **it;
3877  if(dbtag.GetDb() == "GeneID"
3878  || dbtag.GetDb() == "LocusID") {
3879  gene_id = dbtag.GetTag().GetId();
3880  }
3881  }
3882  }
3883 
3884  if(gene_id == 0) {
3885  CMappedFeat parent = ft.GetParent(mf);
3886  return parent ? s_GetGeneID(parent, ft) : gene_id;
3887  } else {
3888  return gene_id;
3889  }
3890  }
3891 
3892 
3895  {
3896  TLocsPair p;
3898  p.second.Reset(new CSeq_loc);
3899  p.second->Assign(*p.first);
3900  p.first->SetInt().SetTo(p.first->GetInt().GetFrom() + 2);
3901  p.second->SetInt().SetFrom(p.second->GetInt().GetTo() - 2);
3902 
3903  if(IsReverse(cds_loc.GetStrand())) {
3904  swap(p.first, p.second);
3905  }
3906 
3908  p.first->SetNull();
3909  }
3911  p.second->SetNull();
3912  }
3913 
3914  return p;
3915  }
3916 
3919  {
3920  TLocsPair p;
3922  CRef<CSeq_loc> sub_loc2(new CSeq_loc);
3923  sub_loc2->Assign(*sub_loc1);
3924 
3925  sub_loc1->SetInt().SetTo(parent_loc.GetTotalRange().GetTo());
3926  sub_loc2->SetInt().SetFrom(parent_loc.GetTotalRange().GetFrom());
3927 
3928  p.first = sequence::Seq_loc_Subtract(parent_loc, *sub_loc1, CSeq_loc::fSortAndMerge_All, NULL);
3929  p.second = sequence::Seq_loc_Subtract(parent_loc, *sub_loc2, CSeq_loc::fSortAndMerge_All, NULL);
3930 
3931  if(IsReverse(cds_loc.GetStrand())) {
3932  swap(p.first, p.second);
3933  }
3934  return p;
3935  }
3936 
3939  {
3940  TSeqPos flank1_len(2000), flank2_len(500);
3941  if(IsReverse(gene_loc.GetStrand())) {
3942  swap(flank1_len, flank2_len);
3943  }
3944 
3945  TLocsPair p;
3947  p.second.Reset(new CSeq_loc);
3948  p.second->Assign(*p.first);
3949 
3950  if(p.first->GetTotalRange().GetFrom() == 0) {
3951  p.first->SetNull();
3952  } else {
3953  p.first->SetInt().SetTo(p.first->GetTotalRange().GetFrom() - 1);
3954  p.first->SetInt().SetFrom(p.first->GetTotalRange().GetFrom() < flank1_len ? 0 : p.first->GetTotalRange().GetFrom() - flank1_len);
3955  }
3956 
3957  if(p.second->GetTotalRange().GetTo() == max_pos) {
3958  p.second->SetNull();
3959  } else {
3960  p.second->SetInt().SetFrom(p.second->GetTotalRange().GetTo() + 1);
3961  p.second->SetInt().SetTo(p.second->GetTotalRange().GetTo() > max_pos ? max_pos : p.second->GetTotalRange().GetTo() + flank2_len);
3962  }
3963 
3964  if(IsReverse(gene_loc.GetStrand())) {
3965  swap(p.first, p.second);
3966  }
3967  return p;
3968  }
3969 
3970 
3973  {
3975 
3976  CRef<CSeq_loc> introns_loc_with_splice_sites = sequence::Seq_loc_Subtract(*range_loc, rna_loc, 0, NULL);
3977  CRef<CSeq_loc> introns_loc_without_splice_sites(new CSeq_loc);
3978  introns_loc_without_splice_sites->Assign(*introns_loc_with_splice_sites);
3979 
3980  //truncate each terminal by two bases from each end to exclude splice sites.
3981  for(CTypeIterator<CSeq_interval> it(Begin(*introns_loc_without_splice_sites)); it; ++it) {
3982  CSeq_interval& seqint = *it;
3983  if(seqint.GetLength() >= 5) {
3984  seqint.SetFrom() += 2;
3985  seqint.SetTo() -= 2;
3986  }
3987  }
3988 
3989  TLocsPair p;
3990  p.first = introns_loc_without_splice_sites;
3991  p.second = sequence::Seq_loc_Subtract(*introns_loc_with_splice_sites,
3992  *introns_loc_without_splice_sites,
3994  NULL);
3995  return p;
3996  }
3997 
3998 
3999 
4000 
4001 
4002 
4004  {
4005  SAnnotSelector sel;
4006  sel.SetResolveAll(); //needs to work on NCs
4007  sel.SetAdaptiveDepth();
4011 
4012  m_data[idh].size(); //this will create m_data[idh] entry so that next time we don't try to reindex it if there are no cdregions
4013  m_seq_data_map[idh].mapper.Reset();
4014 
4015  CRef<CSeq_loc> all_rna_loc(new CSeq_loc(CSeq_loc::e_Null));
4016 
4017  for(CFeat_CI ci(bsh, sel); ci; ++ci) {
4018  const CMappedFeat& mf = *ci;
4019  if(!mf.IsSetProduct() || !mf.GetProduct().GetId()) {
4020  continue; //could be segment
4021  }
4022 
4023  if(mf.GetData().IsRna()) {
4024  all_rna_loc->Add(mf.GetLocation());
4025  } else {
4026  all_rna_loc->Add(mf.GetLocation()); //if on mRNA seq, there's no annotated mRNA feature - will just index cds
4027 
4028  SCdregion s;
4030  for(CSeq_loc_CI ci(mf.GetLocation()); ci; ++ci) {
4031  m_data[ci.GetSeq_id_Handle()][ci.GetRange()].push_back(s);
4032  }
4033 
4034 #if 0
4035  //could possibly cache product sequence as well
4036  if(m_options && CVariationUtil::fOpt_cache_exon_sequence) {
4037  x_CacheSeqData(mf.GetProduct(), CSeq_id_Handle::GetHandle(*mf.GetProduct().GetId()));
4038  }
4039 #endif
4040  }
4041  }
4042 
4043  all_rna_loc = sequence::Seq_loc_Merge(*all_rna_loc, CSeq_loc::fSortAndMerge_All, NULL);
4044 
4046  x_CacheSeqData(*all_rna_loc, idh);
4047  }
4048  }
4049 
4051  {
4053  SSeqData& d = m_seq_data_map[idh2];
4054 
4055  if(loc.IsNull() || loc.IsEmpty()) {
4056  return;
4057  }
4058 
4059  CRef<CSeq_loc> loc2(new CSeq_loc);
4060  loc2->Assign(loc);
4061 
4062  CSeqVector seqv;
4063  if(loc.IsWhole()) {
4065  loc2 = bsh.GetRangeSeq_loc(0, bsh.GetInst_Length() - 1, eNa_strand_plus);
4067  } else {
4069  }
4070 
4071  seqv.GetSeqData(seqv.begin(), seqv.end(), d.seq_data);
4072 
4073  if(d.seq_data.size() != sequence::GetLength(*loc2, m_scope) /*could be whole, so need scope*/) {
4074  //expecting the length of returned sequence to be exactly the same as location;
4075  //otherwise can't associate sub-location with position in returned sequence
4076  d.seq_data = "";
4077  d.mapper.Reset();
4078  return;
4079  }
4080 
4081  CRef<CSeq_loc> target_loc(new CSeq_loc);
4082  target_loc->SetInt().SetId().SetLocal().SetStr("all_cds");
4083  target_loc->SetInt().SetFrom(0);
4084  target_loc->SetInt().SetTo(d.seq_data.size() - 1);
4085  target_loc->SetInt().SetStrand(eNa_strand_plus);
4086 
4087  d.mapper.Reset(new CSeq_loc_Mapper(*loc2, *target_loc, NULL));
4088  }
4089 
4091  {
4093  literal->SetSeq_data().SetIupacna().Set("");
4094 
4095 
4096  CRef<CSeq_loc> loc2(new CSeq_loc);
4097  loc2->Assign(loc);
4099 
4100  for(CSeq_loc_CI ci(*loc2); ci; ++ci) {
4101  if(m_seq_data_map.find(ci.GetSeq_id_Handle()) == m_seq_data_map.end()) {
4102  return CRef<CSeq_literal>(NULL);
4103  }
4104  const SSeqData& d = m_seq_data_map.find(ci.GetSeq_id_Handle())->second;
4105  if(!d.mapper) {
4106  return CRef<CSeq_literal>(NULL);
4107  }
4108 
4109  CRef<CSeq_loc> mapped_loc = d.mapper->Map(*ci.GetRangeAsSeq_loc());
4110 
4111  string s = "";
4112  mapped_loc->GetLabel(&s);
4113  if((!mapped_loc->IsInt() && !mapped_loc->IsPnt()) //split mapping
4114  || mapped_loc->GetTotalRange().GetLength() != ci.GetRange().GetLength()
4115  || mapped_loc->GetStrand() != eNa_strand_plus) //todo: can accomodate minus by reverse-complementing the chunk
4116  {
4117  return CRef<CSeq_literal>(NULL);
4118  }
4119 
4120  string seq_chunk = d.seq_data.substr(mapped_loc->GetStart(eExtreme_Positional), mapped_loc->GetTotalRange().GetLength());
4121  literal->SetSeq_data().SetIupacna().Set() += seq_chunk;
4122  }
4123 
4124  literal->SetLength(literal->GetSeq_data().GetIupacna().Get().size());
4125  return literal;
4126  }
4127 
4129  {
4130  CRef<CSeq_loc> loc2(new CSeq_loc);
4131  loc2->Assign(loc);
4133 
4134  for(CSeq_loc_CI ci(*loc2); ci; ++ci) {
4135  CSeq_id_Handle idh = ci.GetSeq_id_Handle();
4136 
4137  if(m_data.find(idh) == m_data.end()) {
4138  //first time seeing this seq-id - index it
4139  x_Index(idh);
4140  }
4141 
4142  TIdRangeMap::const_iterator it = m_data.find(idh);
4143  if(it == m_data.end()) {
4144  return;
4145  }
4146  const TRangeMap& rm = it->second;
4147 
4148  set<SCdregion> results;
4149  for(TRangeMap::const_iterator it2 = rm.begin(ci.GetRange()); it2.Valid(); ++it2) {
4150  ITERATE(TCdregions, it3, it2->second)
4151  {
4152  results.insert(*it3);
4153  }
4154  }
4155  ITERATE(set<SCdregion>, it, results)
4156  {
4157  cdregions.push_back(*it);
4158  }
4159  }
4160  }
4161 
4162 
4163 
4164  //
4165  //
4166  // Methods to suuport Variation / Variation_ref conversions.
4167  //
4168  //
4169 
4170  //Make a copy of the child and inherit relevant (SNP-5716) fields from parent that are not set in the child
4172  {
4173  CRef<CVariation> v(SerialClone(child));
4174 
4175  if(!v->IsSetId() && parent.IsSetId()) {
4176  v->SetId().Assign(parent.GetId());
4177  }
4178 
4179  if(!v->IsSetParent_id() && parent.IsSetParent_id()) {
4180  v->SetParent_id().Assign(parent.GetParent_id());
4181  }
4182 
4183  if(!v->IsSetSample_id() && parent.IsSetSample_id()) {
4185  {
4186  v->SetSample_id().push_back(CRef<CObject_id>(SerialClone(**it)));
4187  }
4188  }
4189 
4190  if(!v->IsSetOther_ids() && parent.IsSetOther_ids()) {
4192  {
4193  v->SetOther_ids().push_back(CRef<CDbtag>(SerialClone(**it)));
4194  }
4195  }
4196 
4197  return v;
4198  }
4199 
4201  {
4203  if(v.IsSetPlacements()) {
4205  {
4206  const CVariantPlacement& p = **it;
4208  CRef<CSeq_feat> feat(new CSeq_feat);
4209  feat->SetLocation().Assign(p.GetLoc());
4210  feat->SetData().SetVariation(*vr);
4211  feats.push_back(feat);
4212  }
4213  } else if(v.GetData().IsSet()) {
4215  {
4216  AsVariation_feats(*InheritParentAttributes(**it, v), feats);
4217  }
4218  }
4219 
4221  {
4222  CSeq_feat& feat = **it;
4223  if(v.IsSetPub() && !feat.IsSetCit()) {
4224  feat.SetCit().Assign(v.GetPub());
4225  }
4226  if(v.IsSetExt()) {
4227  ITERATE(CVariation::TExt, it2, v.GetExt())
4228  {
4230  uo->Assign(**it2);
4231  feat.SetExts().push_back(uo);
4232  }
4233  }
4234  }
4235 
4236  //note: we can't attach user-objects to out_feats directly because it will result in duplicates
4237  //when out_feats recurses into next subvariation. Hence we are working with our own ftable of
4238  //feats, and attach the results in the end.
4239  out_feats.insert(out_feats.end(), feats.begin(), feats.end());
4240  }
4241 
4243  {
4245 
4246  if(v.IsSetId()) {
4247  vr->SetId().Assign(v.GetId());
4248  }
4249 
4250  if(v.IsSetParent_id()) {
4251  vr->SetParent_id().Assign(v.GetParent_id());
4252  }
4253 
4254  if(v.IsSetSample_id() && v.GetSample_id().size() > 0) {
4255  vr->SetSample_id().Assign(*v.GetSample_id().front());
4256  }
4257 
4258  if(v.IsSetOther_ids()) {
4260  {
4261  vr->SetOther_ids().push_back(CRef<CDbtag>(SerialClone(**it)));
4262  }
4263  }
4264 
4265  if(v.IsSetName()) {
4266  vr->SetName(v.GetName());
4267  }
4268 
4269  if(v.IsSetSynonyms()) {
4270  vr->SetSynonyms() = v.GetSynonyms();
4271  }
4272 
4273  if(v.IsSetDescription()) {
4274  vr->SetName(v.GetDescription());
4275  }
4276 
4277  if(v.IsSetPhenotype()) {
4279  {
4281  p->Assign(**it);
4282  vr->SetPhenotype().push_back(p);
4283  }
4284  }
4285 
4286  if(v.IsSetMethod()) {
4287  vr->SetMethod() = v.GetMethod().GetMethod();
4288  }
4289 
4290  if(v.IsSetVariant_prop()) {
4291  vr->SetVariant_prop().Assign(v.GetVariant_prop());
4292  }
4293 
4294  //Note: Variation.frameshift <-> Variation-ref.consequence[].frameshift
4295  if(v.IsSetFrameshift()) {
4297  new CVariation_ref::TConsequence::value_type::TObjectType);
4298  vr->SetConsequence().push_back(fr_cons);
4299  fr_cons->SetFrameshift();
4300  if(v.GetFrameshift().IsSetPhase()) {
4301  fr_cons->SetFrameshift().SetPhase(v.GetFrameshift().GetPhase());
4302  }
4303  if(v.GetFrameshift().IsSetX_length()) {
4304  fr_cons->SetFrameshift().SetX_length(v.GetFrameshift().GetX_length());
4305  }
4306  }
4307 
4308  if(v.GetData().IsComplex()) {
4309  vr->SetData().SetComplex();
4310  } else if(v.GetData().IsInstance()) {
4311  vr->SetData().SetInstance().Assign(v.GetData().GetInstance());
4312  s_AddInstOffsetsFromPlacementOffsets(vr->SetData().SetInstance(), p);
4313  } else if(v.GetData().IsNote()) {
4314  vr->SetData().SetNote() = v.GetData().GetNote();
4315  } else if(v.GetData().IsUniparental_disomy()) {
4316  vr->SetData().SetUniparental_disomy();
4317  } else if(v.GetData().IsUnknown()) {
4318  vr->SetData().SetUnknown();
4319  } else if(v.GetData().IsSet()) {
4320  const CVariation::TData::TSet& v_set = v.GetData().GetSet();
4321  CVariation_ref::TData::TSet& vr_set = vr->SetData().SetSet();
4322  vr_set.SetType(v_set.GetType());
4323  if(v_set.IsSetName()) {
4324  vr_set.SetName(v_set.GetName());
4325  }
4327  {
4328  vr_set.SetVariations().push_back(x_AsVariation_ref(**it, p));
4329  }
4330  } else {
4331  NCBI_THROW(CException, eUnknown, "Unhandled Variation_ref::TData::E_CChoice");
4332  }
4333 
4334  if(v.IsSetConsequence()) {
4335  vr->SetConsequence();
4337  {
4338  const CVariation::TConsequence::value_type::TObjectType& v_cons = **it;
4340  new CVariation_ref::TConsequence::value_type::TObjectType);
4341  vr->SetConsequence().push_back(vr_cons);
4342  vr_cons->SetUnknown();
4343 
4344  if(v_cons.IsSplicing()) {
4345  vr_cons->SetSplicing();
4346  } else if(v_cons.IsNote()) {
4347  vr_cons->SetNote(v_cons.GetNote());
4348  } else if(v_cons.IsVariation()) {
4349  CRef<CVariation_ref> cons_variation = x_AsVariation_ref(v_cons.GetVariation(), p);
4350  vr_cons->SetVariation(*cons_variation);
4351  } else if(v_cons.IsLoss_of_heterozygosity()) {
4352  vr_cons->SetLoss_of_heterozygosity();
4353  if(v_cons.GetLoss_of_heterozygosity().IsSetReference()) {
4354  vr_cons->SetLoss_of_heterozygosity().SetReference(
4355  v_cons.GetLoss_of_heterozygosity().GetReference());
4356  }
4357  if(v_cons.GetLoss_of_heterozygosity().IsSetTest()) {
4358  vr_cons->SetLoss_of_heterozygosity().SetTest(
4359  v_cons.GetLoss_of_heterozygosity().GetTest());
4360  }
4361  }
4362  }
4363  }
4364 
4365  if(v.IsSetSomatic_origin()) {
4366  vr->SetSomatic_origin();
4368  {
4369  const CVariation::TSomatic_origin::value_type::TObjectType& v_so = **it;
4370 
4372  new CVariation_ref::TSomatic_origin::value_type::TObjectType);
4373 
4374  if(v_so.IsSetSource()) {
4375  vr_so->SetSource().Assign(v_so.GetSource());
4376  }
4377 
4378  if(v_so.IsSetCondition()) {
4379  vr_so->SetCondition();
4380  if(v_so.GetCondition().IsSetDescription()) {
4381  vr_so->SetCondition().SetDescription(
4382  v_so.GetCondition().GetDescription());
4383  }
4384  if(v_so.GetCondition().IsSetObject_id()) {
4385  vr_so->SetCondition().SetObject_id();
4386  ITERATE(CVariation::TSomatic_origin::value_type::TObjectType::TCondition::TObject_id,
4387  it,
4388  v_so.GetCondition().GetObject_id())
4389  {
4390  CRef<CDbtag> dbtag(new CDbtag);
4391  dbtag->Assign(**it);
4392  vr_so->SetCondition().SetObject_id().push_back(dbtag);
4393  }
4394  }
4395  }
4396 
4397  vr->SetSomatic_origin().push_back(vr_so);
4398  }
4399  }
4400 
4401 
4402  return vr;
4403  }
4404 
4406  {
4408  delta->SetAction(CDelta_item::eAction_offset);
4409  delta->SetSeq().SetLiteral().SetLength(abs(offset));
4410  if(offset < 0) {
4411  delta->SetMultiplier(-1);
4412  }
4413  if(fuzz) {
4414  delta->SetSeq().SetLiteral().SetFuzz().Assign(*fuzz);
4415  }
4416  return delta;
4417  }
4418 
4420  CVariation_inst& vi,
4421  const CVariantPlacement& p)
4422  {
4423  if(p.IsSetStart_offset()) {
4424  vi.SetDelta().push_front(CreateDeltaForOffset(
4425  p.GetStart_offset(),
4427  }
4428  if(p.IsSetStop_offset()) {
4429  vi.SetDelta().push_back(CreateDeltaForOffset(
4430  p.GetStop_offset(),
4432  }
4433  }
4434 
4435 
4437  {
4438  if(!variation_feat.GetData().IsVariation()) {
4439  NCBI_THROW(CException, eUnknown, "Expected variation-ref feature");
4440  }
4441 
4442  CRef<CVariation> v = x_AsVariation(variation_feat.GetData().GetVariation());
4443 
4445  p->SetLoc().Assign(variation_feat.GetLocation());
4446  if(p->GetLoc().GetId()) {
4448  } else {
4450  }
4451  v->SetPlacements().push_back(p);
4452 
4453 
4454  if(variation_feat.IsSetCit()) {
4455  v->SetPub().Assign(variation_feat.GetCit());
4456  }
4457  if(variation_feat.IsSetExt()) {
4458  v->SetExt();
4460  uo->Assign(variation_feat.GetExt());
4461  v->SetExt().push_back(uo);
4462  }
4463  if(variation_feat.IsSetExts()) {
4464  v->SetExt();
4465  ITERATE(CSeq_feat::TExts, it, variation_feat.GetExts())
4466  {
4468  uo->Assign(**it);
4469  v->SetExt().push_back(uo);
4470  }
4471  }
4472 
4474  return v;
4475  }
4476 
4478  {
4480  if(vr.IsSetId()) {
4481  v->SetId().Assign(vr.GetId());
4482  }
4483 
4484  if(vr.IsSetParent_id()) {
4485  v->SetParent_id().Assign(vr.GetParent_id());
4486  }
4487 
4488  if(vr.IsSetSample_id()) {
4489  v->SetSample_id().push_back(CRef<CObject_id>(SerialClone(vr.GetSample_id())));
4490  }
4491 
4492  if(vr.IsSetOther_ids()) {
4494  {
4495  v->SetOther_ids().push_back(CRef<CDbtag>(SerialClone(**it)));
4496  }
4497  }
4498 
4499  if(vr.IsSetName()) {
4500  v->SetName(vr.GetName());
4501  }
4502 
4503  if(vr.IsSetSynonyms()) {
4504  v->SetSynonyms() = vr.GetSynonyms();
4505  }
4506 
4507  if(vr.IsSetDescription()) {
4508  v->SetName(vr.GetDescription());
4509  }
4510 
4511  if(vr.IsSetPhenotype()) {
4513  {
4515  p->Assign(**it);
4516  v->SetPhenotype().push_back(p);
4517  }
4518  }
4519 
4520  if(vr.IsSetMethod()) {
4521  v->SetMethod().SetMethod() = vr.GetMethod();
4522  }
4523 
4524  if(vr.IsSetVariant_prop()) {
4525  v->SetVariant_prop().Assign(vr.GetVariant_prop());
4526  }
4527 
4528  if(vr.GetData().IsComplex()) {
4529  v->SetData().SetComplex();
4530  } else if(vr.GetData().IsInstance()) {
4531  v->SetData().SetInstance().Assign(vr.GetData().GetInstance());
4532  } else if(vr.GetData().IsNote()) {
4533  v->SetData().SetNote() = vr.GetData().GetNote();
4534  } else if(vr.GetData().IsUniparental_disomy()) {
4535  v->SetData().SetUniparental_disomy();
4536  } else if(vr.GetData().IsUnknown()) {
4537  v->SetData().SetUnknown();
4538  } else if(vr.GetData().IsSet()) {
4539  const CVariation_ref::TData::TSet& vr_set = vr.GetData().GetSet();
4540  CVariation::TData::TSet& v_set = v->SetData().SetSet();
4541  v_set.SetType(vr_set.GetType());
4542  if(vr_set.IsSetName()) {
4543  v_set.SetName(vr_set.GetName());
4544  }
4546  {
4547  v_set.SetVariations().push_back(x_AsVariation(**it));
4548  }
4549  } else {
4550  NCBI_THROW(CException, eUnknown, "Unhandled Variation_ref::TData::E_CChoice");
4551  }
4552 
4553  if(vr.IsSetConsequence()) {
4554  v->SetConsequence();
4556  {
4557  const CVariation_ref::TConsequence::value_type::TObjectType& vr_cons = **it;
4558 
4559  if(vr_cons.IsFrameshift()) {
4560  // frameshift is represented in consequence in Variation-ref, and
4561  // directly as an attribute in Variation.
4562  CVariation& cons_variation = *v;
4563  cons_variation.SetFrameshift();
4564  if(vr_cons.GetFrameshift().IsSetPhase()) {
4565  cons_variation.SetFrameshift().SetPhase(vr_cons.GetFrameshift().GetPhase());
4566  }
4567  if(vr_cons.GetFrameshift().IsSetX_length()) {
4568  cons_variation.SetFrameshift().SetX_length(vr_cons.GetFrameshift().GetX_length());
4569  }
4570  continue;
4571  }
4572 
4573  CVariation::TConsequence::value_type v_cons(new CVariation::TConsequence::value_type::TObjectType);
4574 
4575  if(vr_cons.IsUnknown()) {
4576  v_cons->SetUnknown();
4577  } else if(vr_cons.IsSplicing()) {
4578  v_cons->SetSplicing();
4579  } else if(vr_cons.IsNote()) {
4580  v_cons->SetNote(vr_cons.GetNote());
4581  } else if(vr_cons.IsVariation()) {
4582  CRef<CVariation> cons_variation = x_AsVariation(vr_cons.GetVariation());
4583  v_cons->SetVariation(*cons_variation);
4584  } else if(vr_cons.IsLoss_of_heterozygosity()) {
4585  v_cons->SetLoss_of_heterozygosity();
4586  if(vr_cons.GetLoss_of_heterozygosity().IsSetReference()) {
4587  v_cons->SetLoss_of_heterozygosity().SetReference(vr_cons.GetLoss_of_heterozygosity().GetReference());
4588  }
4589  if(vr_cons.GetLoss_of_heterozygosity().IsSetTest()) {
4590  v_cons->SetLoss_of_heterozygosity().SetTest(vr_cons.GetLoss_of_heterozygosity().GetTest());
4591  }
4592  }
4593 
4594  v->SetConsequence().push_back(v_cons);
4595  }
4596  if(v->GetConsequence().empty()) {
4597  v->ResetConsequence();
4598  }
4599  }
4600 
4601  if(vr.IsSetSomatic_origin()) {
4602  v->SetSomatic_origin();
4604  {
4605  const CVariation_ref::TSomatic_origin::value_type::TObjectType& vr_so = **it;
4606  CVariation::TSomatic_origin::value_type v_so(new CVariation::TSomatic_origin::value_type::TObjectType);
4607 
4608  if(vr_so.IsSetSource()) {
4609  v_so->SetSource().Assign(vr_so.GetSource());
4610  }
4611 
4612  if(vr_so.IsSetCondition()) {
4613  v_so->SetCondition();
4614  if(vr_so.GetCondition().IsSetDescription()) {
4615  v_so->SetCondition().SetDescription(vr_so.GetCondition().GetDescription());
4616  }
4617  if(vr_so.GetCondition().IsSetObject_id()) {
4618  v_so->SetCondition().SetObject_id();
4619  ITERATE(CVariation_ref::TSomatic_origin::value_type::TObjectType::TCondition::TObject_id,
4620  it,
4621  vr_so.GetCondition().GetObject_id())
4622  {
4623  CRef<CDbtag> dbtag(new CDbtag);
4624  dbtag->Assign(**it);
4625  v_so->SetCondition().SetObject_id().push_back(dbtag);
4626  }
4627  }
4628  }
4629 
4630  v->SetSomatic_origin().push_back(v_so);
4631  }
4632  }
4633 
4634  return v;
4635  }
4636 
4637 
4638  static int GetSignedOffset(const CDelta_item& delta)
4639  {
4640  return delta.GetSeq().GetLiteral().GetLength()
4641  * (delta.IsSetMultiplier() ? delta.GetMultiplier() : 1);
4642  }
4643 
4644  static const CInt_fuzz* GetFuzz(const CDelta_item& delta)
4645  {
4646  return delta.GetSeq().GetLiteral().IsSetFuzz() ?
4647  &delta.GetSeq().GetLiteral().GetFuzz() : NULL;
4648  }
4649 
4651  {
4652  if(v.GetData().IsSet()) {
4654  it,
4655  v.SetData().SetSet().SetVariations())
4656  {
4658  }
4659  } else if(v.GetData().IsInstance() && v.GetData().GetInstance().GetDelta().size() > 0) {
4660  CConstRef<CDelta_item> delta_first = v.GetData().GetInstance().GetDelta().front();
4661  CConstRef<CDelta_item> delta_last = v.GetData().GetInstance().GetDelta().back();
4662 
4663  if(delta_first->IsSetAction()
4664  && delta_first->GetAction() == CDelta_item::eAction_offset) {
4665  p.SetStart_offset(GetSignedOffset(*delta_first));
4666  const CInt_fuzz* fuzz = GetFuzz(*delta_first);
4667  if(fuzz) {
4668  p.SetStart_offset_fuzz().Assign(*fuzz);
4669  }
4670 
4671  v.SetData().SetInstance().SetDelta().pop_front();
4672  }
4673 
4674  if(delta_last != delta_first
4675  && delta_last->IsSetAction()
4676  && delta_last->GetAction() == CDelta_item::eAction_offset) {
4677  p.SetStop_offset(GetSignedOffset(*delta_last));
4678  const CInt_fuzz* fuzz = GetFuzz(*delta_last);
4679  if(fuzz) {
4680  p.SetStop_offset_fuzz().Assign(*fuzz);
4681  }
4682 
4683  v.SetData().SetInstance().SetDelta().pop_back();
4684  }
4685  }
4686  }
4687 
4688 
4689 
4690 #if 0
4691  CVariationUtil::ETestStatus CVariationUtil::CheckAssertedAllele(
4692  const CSeq_feat& variation_feat,
4693  string* asserted_out,
4694  string* actual_out)
4695  {
4696  return eNotApplicable;
4697  if(!variation_feat.GetData().IsVariation()) {
4698  return eNotApplicable;
4699  }
4700 
4701  CVariation_ref vr;
4702  vr.Assign(variation_feat.GetData().GetVariation());
4703  if(!vr.IsSetLocation()) {
4704  vr.SetLocation().Assign(variation_feat.GetLocation());
4705  }
4706  s_PropagateLocsInPlace(vr);
4707 
4708 
4709  bool have_asserted_seq = false;
4710  bool is_ok = true;
4711  for(CTypeIterator<CVariation_ref> it1(Begin(vr)); it1; ++it1) {
4712  const CVariation_ref& vr1 = *it1;
4713  if(vr1.GetData().IsInstance()
4714  && vr1.GetData().GetInstance().IsSetObservation()
4716  {
4717  string asserted_seq;
4718  const CSeq_literal& literal = vr1.GetData().GetInstance().GetDelta().front()->GetSeq().GetLiteral();
4719  if(literal.GetSeq_data().IsIupacna()) {
4720  asserted_seq = literal.GetSeq_data().GetIupacna();
4721  have_asserted_seq = true;
4722  } else if(literal.GetSeq_data().IsNcbieaa()) {
4723  asserted_seq = literal.GetSeq_data().GetNcbieaa();
4724  have_asserted_seq = true;
4725  }
4726 
4727  //an asserted sequnece may be of the form "A..BC", where ".." is to be interpreted as a
4728  //gap of arbitrary length - we need to match prefix and suffix separately
4729  string prefix, suffix;
4730  string str_tmp = NStr::Replace(asserted_seq, "..", "\t"); //SplitInTwo's delimiter must be single-character
4731  NStr::SplitInTwo(str_tmp, "\t", prefix, suffix);
4732 
4734  string actual_seq;
4735  v.GetSeqData(v.begin(), v.end(), actual_seq);
4736 
4737  if( prefix.size() > 0 && !NStr::StartsWith(actual_seq, prefix)
4738  || suffix.size() > 0 && !NStr::EndsWith(actual_seq, suffix))
4739  {
4740  is_ok = false;
4741  if(asserted_out) {
4742  *asserted_out = asserted_seq;
4743  }
4744  if(actual_out) {
4745  *actual_out = actual_seq;
4746  }
4747  break;
4748  }
4749  }
4750  }
4751 
4752  return !have_asserted_seq ? eNotApplicable : is_ok ? ePass : eFail;
4753  }
4754 #endif
4755 
4756 
4757 };
4758 
4760 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
bool IsReverse(ENa_strand s)
Definition: Na_strand.hpp:75
@ eExtreme_Positional
numerical value
Definition: Na_strand.hpp:63
@ eExtreme_Biological
5' and 3'
Definition: Na_strand.hpp:62
bool SameOrientation(ENa_strand a, ENa_strand b)
Definition: Na_strand.hpp:83
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
CAlign_CI –.
Definition: align_ci.hpp:63
CBioseq_Handle –.
CConstRef –.
Definition: ncbiobj.hpp:1266
Definition: Dbtag.hpp:53
CFeat_CI –.
Definition: feat_ci.hpp:64
CMappedFeat –.
Definition: mapped_feat.hpp:59
CPhenotype –.
Definition: Phenotype.hpp:66
CRef –.
Definition: ncbiobj.hpp:618
CScope –.
Definition: scope.hpp:92
ESubtype GetSubtype(void) const
CSeqVector –.
Definition: seq_vector.hpp:65
TSeqPos GetSeqStop(TDim row) const
Definition: Seq_align.cpp:273
CRef< CSeq_loc > CreateRowSeq_loc(TDim row) const
Definition: Seq_align.cpp:2028
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
Definition: Seq_align.cpp:317
TSeqPos GetSeqStart(TDim row) const
Definition: Seq_align.cpp:252
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
TSeqPos GetLength(void) const
Seq-loc iterator class – iterates all intervals from a seq-loc in the correct order.
Definition: Seq_loc.hpp:453
CSeq_loc_Mapper –.
static TSeqPos Convert(const CSeq_data &in_seq, CSeq_data *out_seq, CSeq_data::E_Choice to_code, TSeqPos uBeginIdx=0, TSeqPos uLength=0, bool bAmbig=false, Uint4 seed=17734276)
static TSeqPos Keep(CSeq_data *in_seq, TSeqPos uBeginIdx=0, TSeqPos uLength=0)
static TSeqPos ReverseComplement(CSeq_data *in_seq, TSeqPos uBeginIdx=0, TSeqPos uLength=0)
static TSeqPos Append(CSeq_data *out_seq, const CSeq_data &in_seq1, TSeqPos uBeginIdx1, TSeqPos uLength1, const CSeq_data &in_seq2, TSeqPos uBeginIdx2, TSeqPos uLength2)
Template class for iteration on objects of class C (non-medifiable version)
Definition: iterator.hpp:767
Template class for iteration on objects of class C.
Definition: iterator.hpp:673
CVariantPlacement –.
CVariationException –.
CVariationMethod –.
Set of related Variations.
Definition: Variation_.hpp:127
CVariation_inst –.
Set of related Variations.
const TLocation & GetLocation(void) const