NCBI C++ ToolKit
sequence.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: sequence.cpp 100589 2023-08-14 14:23:37Z grichenk $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Clifford Clausen
27 *
28 * File Description:
29 * Sequence utilities requiring CScope
30 */
31 
32 #include <ncbi_pch.hpp>
33 #include <serial/iterator.hpp>
34 #include <util/static_map.hpp>
35 
37 #include <objmgr/scope.hpp>
38 #include <objmgr/seq_vector.hpp>
39 #include <objmgr/seq_vector_ci.hpp>
40 #include <objmgr/seqdesc_ci.hpp>
41 #include <objmgr/feat_ci.hpp>
42 #include <objmgr/bioseq_ci.hpp>
45 #include <objmgr/impl/synonyms.hpp>
48 
54 #include <objects/general/Date.hpp>
56 
58 
59 #include <objects/seq/Bioseq.hpp>
63 #include <objects/seq/MolInfo.hpp>
64 #include <objects/seq/Seg_ext.hpp>
65 #include <objects/seq/Seq_ext.hpp>
66 #include <objects/seq/Seq_gap.hpp>
67 #include <objects/seq/Seq_inst.hpp>
70 #include <objects/seq/Seq_hist.hpp>
73 
82 
84 
86 
88 #include <objmgr/seq_entry_ci.hpp>
89 #include <objmgr/util/sequence.hpp>
90 #include <objmgr/error_codes.hpp>
91 #include <util/strsearch.hpp>
92 
93 #include <list>
94 #include <algorithm>
95 
96 
97 #define NCBI_USE_ERRCODE_X ObjMgr_SeqUtil
98 
101 BEGIN_SCOPE(sequence)
102 
103 
105 {
106  ITERATE(CBioseq::TDescr::Tdata, it, bioseq.GetDescr().Get())
107  {
108  if ((**it).IsSource())
109  return &(**it).GetSource();
110  }
111 
112  return NULL;
113 }
114 
116 {
117  {{
118  CSeqdesc_CI desc(handle, CSeqdesc::e_Source);
119  if (desc) {
120  return &desc->GetSource();
121  }
122  }}
123  {{
125  if (desc) {
126  return &desc->GetSource();
127  }
128  }}
129 
130  return NULL;
131 }
132 
134 {
135  CConstRef<CSeq_feat> cds_feat;
136  CConstRef<CSeq_loc> cds_loc;
137  CConstRef<CBioSource> src_ref;
138 
139  CScope& scope = bsh.GetScope();
140 
141  cds_feat = sequence::GetCDSForProduct(bsh);
142 
143  if (cds_feat) {
144  cds_loc = &cds_feat->GetLocation();
145  if (cds_loc) {
146  CRef<CSeq_loc> cleaned_location(new CSeq_loc);
147  cleaned_location->Assign(*cds_loc);
149  if (src_feat) {
150  const CSeq_feat& feat = *src_feat;
151  if (feat.IsSetData()) {
152  return src_feat;
153  }
154  } else {
155  CRef<CSeq_loc> rev_loc(sequence::SeqLocRevCmpl(*cleaned_location, &scope));
156  cleaned_location->Assign(*rev_loc);
158  if (src_feat) {
159  const CSeq_feat& feat = *src_feat;
160  if (feat.IsSetData()) {
161  return src_feat;
162  }
163  }
164  }
165  }
166  }
167 
168  return CConstRef<CSeq_feat>();
169 }
170 
172 {
173  if (!bsh.IsAa()) {
174  return ZERO_TAX_ID;
175  }
176  auto pSourceFeat = GetSourceFeatForProduct(bsh);
177  if (!pSourceFeat) {
178  auto& scope = bsh.GetScope();
179  const auto& idh = bsh.GetAccessSeq_id_Handle();
180  if (idh) {
181  return scope.GetTaxId(idh);
182  }
183  else {
184  return ZERO_TAX_ID;
185  }
186  }
187  const auto& bioSource = pSourceFeat->GetData().GetBiosrc();
188  if (!bioSource.CanGetOrg()) {
189  return ZERO_TAX_ID;
190  }
191  return bioSource.GetOrg().GetTaxId();
192 }
193 
194 void GetOrg_refForProduct(const CBioseq_Handle& bsh, const COrg_ref* pOrgRef)
195 {
196  pOrgRef = nullptr;
197 
198  if (bsh.IsAa()) {
199  auto pSourceFeat = GetSourceFeatForProduct(bsh);
200  if (pSourceFeat) {
201  const auto& bioSource = pSourceFeat->GetData().GetBiosrc();
202  if (bioSource.CanGetOrg()) {
203  pOrgRef = &bioSource.GetOrg();
204  return;
205  }
206  }
207  }
208 }
209 
210 
212 {
213  const auto* pSource = GetBioSourceForBioseq(bsh);
214  if (!pSource || !pSource->CanGetOrg()) {
215  return nullptr;
216  }
217  return &pSource->GetOrg();
218 }
219 
221 {
222  if (bsh.IsAa()) {
223  auto pSourceFeat = GetSourceFeatForProduct(bsh);
224  if (pSourceFeat) {
225  return &pSourceFeat->GetData().GetBiosrc();
226  }
227  }
228 
229  // find a biosource descriptor
230  CSeqdesc_CI dsrcIt(bsh, CSeqdesc::e_Source);
231  if (dsrcIt) {
232  return &dsrcIt->GetSource();
233  }
234 
235  // if no descriptor was found, try a source feature
236  CFeat_CI fsrcIt(bsh, CSeqFeatData::e_Biosrc);
237  if (fsrcIt) {
238  const CSeq_feat& src_feat = fsrcIt->GetOriginalFeature();
239  return &src_feat.GetData().GetBiosrc();
240  }
241 
242  return nullptr;
243 }
244 
246 {
247  vector<CSeqdesc::E_Choice> types;
248  types.push_back(CSeqdesc::e_Source);
249  types.push_back(CSeqdesc::e_Org);
250  CSeqdesc_CI desc_it(handle, types);
251  if ( desc_it ) {
252  const CSeqdesc& desc = *desc_it;
253  if ( desc.IsSource() ) {
254  return &desc.GetSource().GetOrg();
255  }
256  if ( desc.IsOrg() ) {
257  return &desc.GetOrg();
258  }
259  }
260  return 0;
261 }
262 
263 
264 const COrg_ref& GetOrg_ref(const CBioseq_Handle& handle)
265 {
266  const COrg_ref* org_ref = GetOrg_refOrNull(handle);
267  if ( org_ref ) {
268  return *org_ref;
269  }
270  NCBI_THROW(CException, eUnknown, "No organism set");
271 }
272 
273 
275 {
276  const COrg_ref* org_ref = GetOrg_refOrNull(handle);
277  if ( org_ref ) {
278  return org_ref->GetTaxId();
279  }
280  return ZERO_TAX_ID;
281 }
282 
283 
284 const CMolInfo* GetMolInfo(const CBioseq& bioseq)
285 {
286  ITERATE(CBioseq::TDescr::Tdata, it, bioseq.GetDescr().Get())
287  {
288  if ((**it).IsMolinfo())
289  return &(**it).GetMolinfo();
290  }
291  return NULL;
292 }
293 
294 
295 const CMolInfo* GetMolInfo(const CBioseq_Handle& handle)
296 {
297  CSeqdesc_CI desc_iter(handle, CSeqdesc::e_Molinfo);
298  for ( ; desc_iter; ++desc_iter) {
299  return &desc_iter->GetMolinfo();
300  }
301 
302  return NULL;
303 }
304 
305 
306 
308 (const CSeq_loc& loc,
309  CScope& scope,
311 {
312  CBioseq_Handle retval;
313 
314  try {
315  if (IsOneBioseq(loc, &scope)) {
316  return scope.GetBioseqHandle(GetId(loc, &scope), flag);
317  }
318 
319  // assuming location is annotated on parts of a segmented bioseq
320  for (CSeq_loc_CI it(loc); it; ++it) {
321  CBioseq_Handle part = scope.GetBioseqHandle(it.GetSeq_id(), flag);
322  if (part) {
323  retval = GetParentForPart(part);
324  }
325  break; // check only the first part
326  }
327 
328  // if multiple intervals and not parts, look for the first loaded bioseq
329  if (!retval) {
330  for (CSeq_loc_CI it(loc); it; ++it) {
331  retval =
332  scope.GetBioseqHandle(it.GetSeq_id_Handle(), CScope::eGetBioseq_Loaded);
333  if (retval) {
334  break;
335  }
336  }
337  }
338 
339  if (!retval && flag == CScope::eGetBioseq_All) {
340  for (CSeq_loc_CI it(loc); it; ++it) {
341  retval =
342  scope.GetBioseqHandle(it.GetSeq_id_Handle(), flag);
343  if (retval) {
344  break;
345  }
346  }
347  }
348  } catch (exception&) {
349  retval.Reset();
350  }
351 
352  return retval;
353 }
354 
355 
356 string GetProteinName(const CBioseq_Handle& seq)
357 {
358  if ( !seq ) {
359  NCBI_THROW(CObjMgrException, eInvalidHandle,
360  "GetProteinName: "
361  "null handle");
362  }
363  if ( !seq.IsProtein() ) {
364  NCBI_THROW_FMT(CObjmgrUtilException, eBadSequenceType,
365  "GetProteinName("<<GetId(seq, eGetId_Best)<<"): "
366  "the sequence is not a protein");
367  }
368  TSeqPos seq_length = seq.GetBioseqLength();
369  TSeqPos best_length = 0;
370  vector<CMappedFeat> best_feats;
371  for ( CFeat_CI it(seq, CSeqFeatData::e_Prot); it; ++it ) {
372  COpenRange<TSeqPos> range = it->GetRange();
373  if ( range.GetToOpen() > seq_length ) {
374  range.SetToOpen(seq_length);
375  }
376  TSeqPos length = range.GetLength();
377  if ( length > best_length ) {
378  best_length = length;
379  best_feats.clear();
380  }
381  if ( length == best_length ) {
382  best_feats.push_back(*it);
383  }
384  }
385  if ( best_feats.empty() ) {
386  NCBI_THROW_FMT(CObjMgrException, eFindFailed,
387  "GetProteinName("<<GetId(seq, eGetId_Best)<<"): "
388  "the sequence does't have prot feature");
389  }
390  if ( best_feats.size() > 1 ) {
391  NCBI_THROW_FMT(CObjMgrException, eFindConflict,
392  "GetProteinName("<<GetId(seq, eGetId_Best)<<"): "
393  "the sequence have ambiguous prot feature");
394  }
395  string ret;
396  best_feats[0].GetData().GetProt().GetLabel(&ret);
397  if ( ret.empty() ) {
399  "GetProteinName("<<GetId(seq, eGetId_Best)<<"): "
400  "the prot feature doesn't return name");
401  }
402  return ret;
403 }
404 
405 
407 {
408  switch (GetErrCode()) {
409  case eNoSynonyms: return "eNoSynonyms";
410  case eRequestedIdNotFound: return "eRequestedIdNotFound";
411  default: return CException::GetErrCodeString();
412  }
413 }
414 
415 
417 {
418  CConstRef<CSeq_id> id = idh.GetSeqId();
419  CRef<CSeq_id> id_non_const
420  (const_cast<CSeq_id*>(id.GetPointer()));
421  return CSeq_id::Score(id_non_const);
422 }
423 
424 
426 {
427  CConstRef<CSeq_id> id = idh.GetSeqId();
428  CRef<CSeq_id> id_non_const
429  (const_cast<CSeq_id*>(id.GetPointer()));
430  return CSeq_id::BestRank(id_non_const);
431 }
432 
433 
435 {
436  CConstRef<CSeq_id> id = idh.GetSeqId();
437  CRef<CSeq_id> id_non_const
438  (const_cast<CSeq_id*>(id.GetPointer()));
439  return CSeq_id::WorstRank(id_non_const);
440 }
441 
442 
444 {
445  CConstRef<CSeq_id> id = idh.GetSeqId();
446  CRef<CSeq_id> id_non_const
447  (const_cast<CSeq_id*>(id.GetPointer()));
448  return CSeq_id::FastaAARank(id_non_const);
449 }
450 
451 
453 {
454  CConstRef<CSeq_id> id = idh.GetSeqId();
455  CRef<CSeq_id> id_non_const
456  (const_cast<CSeq_id*>(id.GetPointer()));
457  return CSeq_id::FastaNARank(id_non_const);
458 }
459 
460 
461 
463 {
464  if ( ids.empty() ) {
465  return CSeq_id_Handle();
466  }
467 
468  switch ( (type & eGetId_TypeMask) ) {
469  case eGetId_ForceGi:
470  if ( !CSeq_id::AvoidGi() ) {
471  ITERATE (CScope::TIds, iter, ids) {
472  if (iter->IsGi()) {
473  return *iter;
474  }
475  }
476  }
477  if ((type & eGetId_ThrowOnError) != 0) {
478  NCBI_THROW(CSeqIdFromHandleException, eRequestedIdNotFound,
479  "sequence::GetId(): gi seq-id not found in the list");
480  }
481  break;
482 
483  case eGetId_ForceAcc:
484  {{
485  CSeq_id_Handle best = x_GetId(ids, eGetId_Best);
486  if (best &&
487  best.GetSeqId()->GetTextseq_Id() != NULL &&
488  best.GetSeqId()->GetTextseq_Id()->IsSetAccession()) {
489  return best;
490  }
491  }}
492  if ((type & eGetId_ThrowOnError) != 0) {
493  NCBI_THROW(CSeqIdFromHandleException, eRequestedIdNotFound,
494  "sequence::GetId(): text seq-id not found in the list");
495  }
496  break;
497 
498  case eGetId_Best:
499  {{
500  return FindBestChoice(ids, Score_SeqIdHandle);
501  }}
502 
503  case eGetId_Seq_id_Score:
504  {{
505  return FindBestChoice(ids, Score_SeqIdHandle);
506  }}
507 
509  {{
511  }}
512 
514  {{
516  }}
517 
519  {{
521  }}
522 
524  {{
526  }}
527 
528  default:
529  break;
530  }
531  return CSeq_id_Handle();
532 }
533 
534 
536 {
537  return GetId(seq.GetId(), type);
538 }
539 
540 
542 {
543  CScope::TIds ids;
544  ITERATE (CBioseq::TId, it, ids_in) {
545  ids.push_back(CSeq_id_Handle::GetHandle(**it));
546  }
547 
548  return x_GetId(ids, type);
549 }
550 
551 
553 {
554  return GetId(CSeq_id_Handle::GetHandle(id), scope, type);
555 }
556 
557 
560 {
561  CSeq_id_Handle ret;
562  if (!idh) return ret;
563  try {
564  if ( (type & eGetId_TypeMask) == eGetId_ForceGi ) {
565  if ( idh.IsGi() && (type & eGetId_VerifyId) == 0 ) {
566  return idh;
567  }
568  TGi gi = scope.GetGi(idh);
569  if (gi != ZERO_GI) {
570  ret = CSeq_id_Handle::GetGiHandle(gi);
571  }
572  }
573  else if ( (type & eGetId_TypeMask) == eGetId_Canonical) {
574  /// Short-cuts for commonly used IDs that are
575  /// known unambiguously to be canonical:
576  /// - ID/GenBank: GI
577  /// - Trace: gnl|ti|<tid> in the C++ Toolkit;
578  /// note that in the C Toolkit, the
579  /// canonical ID appears to be gnl|TRACE|<tid>.
580  /// - Short Read Archive: gnl|SRA|...
582  idh.IsGi()) return idh;
583  if (idh.Which() == CSeq_id::e_General) {
584  CConstRef<CSeq_id> id = idh.GetSeqId();
585  _ASSERT(id && id->IsGeneral());
586  const CSeq_id::TGeneral::TDb& db = id->GetGeneral().GetDb();
587  if (db == "ti" || db == "SRA") return idh;
588  }
589 
590  /// Fallback to retrieve IDs.
591  ret = x_GetId(scope.GetIds(idh), type);
592  if ( !ret ) {
593  /// failed to retrieve IDs
594  /// assume input is the best that we can do
595  ret = idh;
596  }
597  }
598  else if ( (type & eGetId_TypeMask) == eGetId_ForceAcc ) {
599  ret = scope.GetAccVer(idh);
600  }
601  else {
602  ret = x_GetId(scope.GetIds(idh), type);
603  }
604  }
605  catch (exception& e) {
606  ERR_POST("sequence::GetId(): exception: "<<e.what());
607  if ( (type & eGetId_ThrowOnError) != 0 ) {
608  throw;
609  }
610  ret.Reset();
611  return ret;
612  }
613  if ( !ret && (type & eGetId_ThrowOnError) != 0 ) {
614  NCBI_THROW(CSeqIdFromHandleException, eRequestedIdNotFound,
615  "sequence::GetId(): seq-id not found in the scope");
616  }
617  return ret;
618 }
619 
620 
623 {
624  _ASSERT(handle);
625 
626  const CScope::TIds& ids = handle.GetId();
627  CSeq_id_Handle idh = x_GetId(ids, type);
628 
629  if ( !idh && (type & eGetId_ThrowOnError) != 0 ) {
630  NCBI_THROW(CSeqIdFromHandleException, eRequestedIdNotFound,
631  "Unable to get Seq-id from handle");
632  }
633 
634  return idh;
635 }
636 
637 
638 TGi GetGiForAccession(const string& acc, CScope& scope, EGetIdType flags)
639 {
640  if ( CSeq_id::AvoidGi() ) return ZERO_GI;
641 
642  // Clear throw-on-error flag
643  EGetIdType get_id_flags = (flags & eGetId_VerifyId) | eGetId_ForceGi;
644  try {
645  CSeq_id acc_id(acc);
646  // Get gi only if acc a real accession.
647  if ( acc_id.GetTextseq_Id() ) {
648  CSeq_id_Handle idh = GetId(acc_id, scope, get_id_flags);
649  if ( idh.IsGi() ) {
650  return idh.GetGi();
651  }
652  }
653  }
654  catch (exception& e) {
655  if ( (flags & eGetId_ThrowOnError) != 0 ) {
656  throw e;
657  }
658  return ZERO_GI;
659  }
660  if ( (flags & eGetId_ThrowOnError) != 0 ) {
661  NCBI_THROW(CSeqIdFromHandleException, eRequestedIdNotFound,
662  "sequence::GetGiForAccession(): invalid seq-id type");
663  }
664  return ZERO_GI;
665 }
666 
667 
668 TGi GetGiForId(const objects::CSeq_id& id, CScope& scope, EGetIdType flags)
669 {
670  if ( CSeq_id::AvoidGi() ) return ZERO_GI;
671 
672  // Clear throw-on-error flag
673  EGetIdType get_id_flags = (flags & eGetId_VerifyId) | eGetId_ForceGi;
674  CSeq_id_Handle idh = GetId(id, scope, get_id_flags);
675  if ( idh.IsGi() ) {
676  return idh.GetGi();
677  }
678  if ( (flags & eGetId_ThrowOnError) != 0 ) {
679  NCBI_THROW(CSeqIdFromHandleException, eRequestedIdNotFound,
680  "sequence::GetGiForId(): seq-id not found in the scope");
681  }
682  return ZERO_GI;
683 }
684 
685 
687  CScope& scope,
688  EAccessionVersion use_version,
690 {
691  // Clear throw-on-error flag
692  EGetIdType get_id_flags = (flags & eGetId_VerifyId) | eGetId_ForceAcc;
693  bool with_version = (use_version == eWithAccessionVersion);
694 
695  CSeq_id gi_id(CSeq_id::e_Gi, gi);
696  CSeq_id_Handle idh = GetId(gi_id, scope, get_id_flags);
697  if ( idh ) {
698  return idh.GetSeqId()->GetSeqIdString(with_version);
699  }
700  if ( (flags & eGetId_ThrowOnError) != 0 ) {
701  NCBI_THROW(CSeqIdFromHandleException, eRequestedIdNotFound,
702  "sequence::GetAccessionForGi(): seq-id not found in the scope");
703  }
704  return kEmptyStr;
705 }
706 
707 
708 string GetAccessionForId(const objects::CSeq_id& id,
709  CScope& scope,
710  EAccessionVersion use_version,
712 {
713  // Clear throw-on-error flag
714  EGetIdType get_id_flags = (flags & eGetId_VerifyId) | eGetId_ForceAcc;
715  bool with_version = (use_version == eWithAccessionVersion);
716 
717  CSeq_id_Handle idh = GetId(id, scope, get_id_flags);
718  if ( idh ) {
719  return idh.GetSeqId()->GetSeqIdString(with_version);
720  }
721  if ( (flags & eGetId_ThrowOnError) != 0 ) {
722  NCBI_THROW(CSeqIdFromHandleException, eRequestedIdNotFound,
723  "sequence::GetAccessionForId(): seq-id not found in the scope");
724  }
725  return kEmptyStr;
726 }
727 
728 
730  CScope& scope,
731  const CTime* tlim)
732 {
733  CBioseq_Handle h = scope.GetBioseqHandle(idh);
734  set<CSeq_id_Handle> visited;
735  CSeq_id_Handle next = idh;
736  while (h && h.IsSetInst() && h.GetInst().IsSetHist()
737  && h.GetInst().GetHist().IsSetReplaced_by()) {
738  const CSeq_hist_rec& rec = h.GetInst().GetHist().GetReplaced_by();
739 
740  // Check if the next bioseq is newer than the limit.
741  if (tlim && rec.IsSetDate() &&
742  rec.GetDate().AsCTime().DiffTimeSpan(*tlim).GetSign() == ePositive) {
743  break;
744  }
745  // Make sure the list of ids is not empty
746  if ( rec.GetIds().empty() ) {
747  return CSeq_id_Handle();
748  }
749  visited.insert(next);
750  // If there are several replaced-by entries, use the first one
752  *h.GetInst().GetHist().GetReplaced_by().GetIds().front());
753  if (visited.find(next) != visited.end()) {
754  // Infinite recursion detected
755  return CSeq_id_Handle();
756  }
757  h = scope.GetBioseqHandle(next);
758  }
759  return h ? next : CSeq_id_Handle();
760 }
761 
762 
764 {
766  scope, NULL).GetSeqId();
767 }
768 
770 {
771  return x_FindLatestSequence(idh, scope, NULL);
772 }
773 
775  CScope& scope,
776  const CTime& tlim)
777 {
779  scope, &tlim).GetSeqId();
780 }
781 
783  CScope& scope,
784  const CTime& tlim)
785 {
786  return x_FindLatestSequence(idh, scope, &tlim);
787 }
788 
789 
791  const CSeq_loc& source_loc, TS2PFlags flags,
792  CScope* scope, int* frame)
793 {
794  SRelLoc::TFlags rl_flags = 0;
795  if (flags & fS2P_NoMerge) {
796  rl_flags |= SRelLoc::fNoMerge;
797  }
798  SRelLoc rl(feat.GetLocation(), source_loc, scope, rl_flags);
799  _ASSERT(!rl.m_Ranges.empty());
800  rl.m_ParentLoc.Reset(&feat.GetProduct());
801  if (feat.GetData().IsCdregion()) {
802  // 3:1 ratio
803  const CCdregion& cds = feat.GetData().GetCdregion();
804  int base_frame = cds.GetFrame();
805  if (base_frame > 0) {
806  --base_frame;
807  }
808  if (frame) {
809  *frame = (3 + rl.m_Ranges.front()->GetFrom() - base_frame) % 3 + 1;
810  }
811  TSeqPos prot_length;
812  try {
813  prot_length = GetLength(feat.GetProduct(), scope);
814  } catch (CObjmgrUtilException&) {
815  prot_length = numeric_limits<TSeqPos>::max();
816  }
818  if (IsReverse((*it)->GetStrand())) {
820  << "SourceToProduct:"
821  " parent and child have opposite orientations");
822  }
823  TSeqPos fr = (*it)->GetFrom();
824  TSeqPos to = (*it)->GetTo();
825  (*it)->SetFrom(((*it)->GetFrom() - base_frame) / 3);
826  (*it)->SetTo (((*it)->GetTo() - base_frame) / 3);
827  if ((flags & fS2P_AllowTer) && to == prot_length * 3 && fr < to ) {
828  --(*it)->SetTo();
829  }
830  }
831  } else {
832  if (frame) {
833  *frame = 0; // not applicable; explicitly zero
834  }
835  }
836 
837  return rl.Resolve(scope, rl_flags);
838 }
839 
840 
841 CRef<CSeq_loc> ProductToSource(const CSeq_feat& feat, const CSeq_loc& prod_loc,
842  TP2SFlags flags, CScope* scope)
843 {
844  SRelLoc rl(feat.GetProduct(), prod_loc, scope);
845  _ASSERT(!rl.m_Ranges.empty());
846  rl.m_ParentLoc.Reset(&feat.GetLocation());
847  if (feat.GetData().IsCdregion()) {
848  // 3:1 ratio
849  const CCdregion& cds = feat.GetData().GetCdregion();
850  int base_frame = cds.GetFrame();
851  if (base_frame > 0) {
852  --base_frame;
853  }
854  TSeqPos nuc_length, prot_length;
855  try {
856  nuc_length = GetLength(feat.GetLocation(), scope);
857  } catch (CObjmgrUtilException&) {
858  nuc_length = numeric_limits<TSeqPos>::max();
859  }
860  try {
861  prot_length = GetLength(feat.GetProduct(), scope);
862  } catch (CObjmgrUtilException&) {
863  prot_length = numeric_limits<TSeqPos>::max();
864  }
866  _ASSERT( !IsReverse((*it)->GetStrand()) );
867  TSeqPos from, to;
868  if ((flags & fP2S_Extend) && (*it)->GetFrom() == 0) {
869  from = 0;
870  } else {
871  from = (*it)->GetFrom() * 3 + base_frame;
872  }
873  if ((flags & fP2S_Extend) && (*it)->GetTo() == prot_length - 1) {
874  to = nuc_length - 1;
875  } else {
876  to = (*it)->GetTo() * 3 + base_frame + 2;
877  }
878  (*it)->SetFrom(from);
879  (*it)->SetTo (to);
880  }
881  }
882 
883  return rl.Resolve(scope);
884 }
885 
886 
887 typedef pair<Int8, CConstRef<CSeq_feat> > TFeatScore;
888 typedef vector<TFeatScore> TFeatScores;
889 
890 template <class T, class U>
892 {
893  bool operator()(const pair<T,U>& p1, const pair<T,U>& p2) const
894  {
895  return p1.first < p2.first;
896  }
897 };
898 
899 template <class T, class U>
901 {
902  bool operator()(const pair<T,U>& p1, const pair<T,U>& p2) const
903  {
904  return p1.second < p2.second;
905  }
906 };
907 
909 {
910 public:
911  COverlapPairLess( CScope *scope_arg ) : scope(scope_arg) { }
912 
913  bool operator()( const pair<Int8,CConstRef<CSeq_feat> >& gene1,
914  const pair<Int8, CConstRef<CSeq_feat> >& gene2 )
915  {
916  // First, compare by overlap amount
917  if( gene1.first != gene2.first ) {
918  return gene1.first < gene2.first;
919  }
920 
921  const CSeq_loc &loc1 = gene1.second->GetLocation();
922  const CSeq_loc &loc2 = gene2.second->GetLocation();
923 
924  // If genes are at identical positions, we fall back on the label
926  sequence::eSame) {
927  if( gene1.second->IsSetData() && gene1.second->GetData().IsGene() &&
928  gene2.second->IsSetData() && gene2.second->GetData().IsGene() )
929  {
930  string gene1_label;
931  string gene2_label;
932 
933  gene1.second->GetData().GetGene().GetLabel( &gene1_label );
934  gene2.second->GetData().GetGene().GetLabel( &gene2_label );
935  return gene1_label < gene2_label;
936  }
937  }
938 
939  return false;
940  }
941 private:
943 };
944 
946  CSeqFeatData::E_Choice feat_type,
947  CSeqFeatData::ESubtype feat_subtype,
948  EOverlapType overlap_type,
949  TFeatScores& feats,
950  CScope& scope,
951  const TBestFeatOpts opts,
953 {
954  bool revert_locations = false;
955  SAnnotSelector::EOverlapType annot_overlap_type;
956  switch (overlap_type) {
957  case eOverlap_Simple:
958  case eOverlap_Contained:
959  case eOverlap_Contains:
960  // Require total range overlap
961  annot_overlap_type = SAnnotSelector::eOverlap_TotalRange;
962  break;
963  case eOverlap_Subset:
964  case eOverlap_SubsetRev:
966  case eOverlap_Interval:
968  revert_locations = true;
969  // there's no break here - proceed to "default"
970  default:
971  // Require intervals overlap
972  annot_overlap_type = SAnnotSelector::eOverlap_Intervals;
973  break;
974  }
975 
976  CConstRef<CSeq_feat> feat_ref;
977  TOverlapFlags overlap_flags = fOverlap_Default;
978 
979  CBioseq_Handle bioseq_handle;
982  if ( loc.IsWhole() ) {
983  bioseq_handle = scope.GetBioseqHandle(loc.GetWhole());
984  range = range.GetWhole();
985  }
986  else if ( loc.IsInt() || loc.IsPnt() || loc.IsPacked_int() || loc.IsMix() || loc.IsPacked_pnt() ) {
987  const CSeq_id* id = loc.GetId();
988  if( NULL != id ) {
989  bioseq_handle = scope.GetBioseqHandle(*id);
990  range.SetFrom(loc.GetStart(eExtreme_Positional));
991  range.SetTo(loc.GetStop(eExtreme_Positional));
992  if ( loc.IsSetStrand() ) {
993  strand = loc.GetStrand();
994  }
995  }
996  }
997  else {
998  range = range.GetEmpty();
999  }
1000 
1001  // Check if the sequence is circular
1002  TSeqPos circular_length = kInvalidSeqPos;
1003  CConstRef<CSeq_id> circular_id;
1004  if ( bioseq_handle ) {
1005  if ( bioseq_handle.IsSetInst_Topology() &&
1006  bioseq_handle.GetInst_Topology() == CSeq_inst::eTopology_circular ) {
1007  circular_length = bioseq_handle.GetBioseqLength();
1008  circular_id = bioseq_handle.GetSeqId();
1009  }
1010  }
1011  else {
1012  try {
1013  const CSeq_id* loc_id = nullptr;
1014  try {
1015  loc.CheckId(loc_id);
1016  }
1017  catch (exception&) {
1018  loc_id = 0;
1019  }
1020  if ( loc_id ) {
1021  circular_id.Reset(loc_id);
1022  CBioseq_Handle bseq_handle = scope.GetBioseqHandle(*circular_id);
1023  if ( bseq_handle && bseq_handle.IsSetInst_Topology() &&
1024  bseq_handle.GetInst_Topology() == CSeq_inst::eTopology_circular ) {
1025  circular_length = bseq_handle.GetBioseqLength();
1026  }
1027  }
1028  }
1029  catch (exception& _DEBUG_ARG(e)) {
1030  _TRACE("test for circularity failed: " << e.what()) ;
1031  }
1032  }
1033 
1034  CRef<CSeq_loc> circular_loc;
1035  if (circular_id && range.GetFrom() > range.GetTo()) {
1036  // Circular bioseq, the location crosses zero. Can't use a single
1037  // total range.
1038  circular_loc.Reset(new CSeq_loc);
1039  CRef<CSeq_interval> sub_loc(new CSeq_interval);
1040  sub_loc->SetId().Assign(*circular_id);
1041  sub_loc->SetFrom(0);
1042  sub_loc->SetTo(range.GetTo());
1043  if ( loc.IsSetStrand() ) {
1044  sub_loc->SetStrand(loc.GetStrand());
1045  }
1046  // First interval - no matter front or back
1047  circular_loc->SetPacked_int().Set().push_back(sub_loc);
1048  sub_loc.Reset(new CSeq_interval);
1049  sub_loc->SetId().Assign(*circular_id);
1050  sub_loc->SetFrom(range.GetFrom());
1051  sub_loc->SetTo(circular_length == kInvalidSeqPos
1052  ? kInvalidSeqPos : circular_length - 1);
1053  if ( loc.IsSetStrand() ) {
1054  sub_loc->SetStrand(loc.GetStrand());
1055  }
1056  if ( IsReverse(strand) ) {
1057  circular_loc->SetPacked_int().Set().push_front(sub_loc);
1058  }
1059  else {
1060  circular_loc->SetPacked_int().Set().push_back(sub_loc);
1061  }
1062  }
1063  try {
1064  SAnnotSelector sel;
1065  sel.SetFeatType(feat_type)
1066  .SetFeatSubtype(feat_subtype)
1067  .SetOverlapType(annot_overlap_type)
1068  .SetResolveTSE();
1069  if( opts & fBestFeat_IgnoreStrand ) {
1070  sel.SetIgnoreStrand();
1071  if( ! circular_id && range.GetFrom() > range.GetTo() ) {
1072  // switch from and to
1073  range = CRange<TSeqPos>( range.GetTo(), range.GetFrom() );
1074  }
1075  }
1076  if( plugin ) {
1077  plugin->processSAnnotSelector( sel );
1078  }
1079 
1080  unique_ptr<CFeat_CI> feat_it_ptr;
1081  if( plugin ) {
1082  plugin->setUpFeatureIterator( bioseq_handle, feat_it_ptr,
1083  circular_length, range, loc, sel, scope, strand);
1084  } else {
1085  if ( circular_loc ) {
1086  if ( !bioseq_handle ) {
1087  sel.SetSearchUnresolved();
1088  }
1089  feat_it_ptr.reset( new CFeat_CI(scope, *circular_loc, sel) );
1090  }
1091  else if ( bioseq_handle ) {
1092  feat_it_ptr.reset( new CFeat_CI(bioseq_handle, range, strand, sel) );
1093  }
1094  else {
1095  sel.SetSearchUnresolved();
1096  feat_it_ptr.reset( new CFeat_CI(scope, loc, sel) );
1097  }
1098  }
1099  // convenience variable so we don't have to keep dereferencing the unique_ptr
1100  CFeat_CI &feat_it = *feat_it_ptr;
1101 
1102  CRef<CSeq_loc> cleaned_loc( new CSeq_loc );
1103  cleaned_loc->Assign( loc );
1104  if( opts & fBestFeat_IgnoreStrand ) {
1105  cleaned_loc->SetStrand(eNa_strand_plus);
1106  overlap_flags |= fOverlap_IgnoreTopology;
1107  }
1108  if( plugin ) {
1109  plugin->processLoc( bioseq_handle, cleaned_loc, circular_length );
1110  }
1111 
1112  for ( ; feat_it; ++feat_it) {
1113  CRef<CSeq_loc> cleaned_loc_this_iteration = cleaned_loc;
1114  CRef<CSeq_loc> candidate_feat_loc( new CSeq_loc );
1115  candidate_feat_loc->Assign( feat_it->GetOriginalFeature().GetLocation() );
1116  if( opts & fBestFeat_IgnoreStrand ) {
1117  candidate_feat_loc->SetStrand(eNa_strand_plus);
1118  }
1119  EOverlapType overlap_type_this_iteration = overlap_type;
1120  bool revert_locations_this_iteration = revert_locations;
1121 
1122  if( plugin ) {
1123  bool shouldContinueToNextIteration = false;
1124  plugin->processMainLoop(
1125  shouldContinueToNextIteration,
1126  cleaned_loc_this_iteration,
1127  candidate_feat_loc,
1128  overlap_type_this_iteration,
1129  revert_locations_this_iteration,
1130  bioseq_handle,
1131  *feat_it,
1132  circular_length,
1133  annot_overlap_type);
1134  if( shouldContinueToNextIteration ) {
1135  continue;
1136  }
1137  }
1138 
1139  try {
1140  // treat subset as a special case
1141  Int8 cur_diff = -1;
1142  if ( !revert_locations_this_iteration ) {
1143  if (overlap_flags == fOverlap_Default) {
1144  cur_diff = TestForOverlap64(*candidate_feat_loc,
1145  *cleaned_loc_this_iteration,
1146  overlap_type_this_iteration,
1147  circular_length,
1148  &scope);
1149  }
1150  else {
1151  cur_diff = TestForOverlapEx(*candidate_feat_loc,
1152  *cleaned_loc_this_iteration,
1153  overlap_type_this_iteration,
1154  &scope,
1155  overlap_flags);
1156  }
1157  }
1158  else {
1159  if (overlap_flags == fOverlap_Default) {
1160  cur_diff = TestForOverlap64(*cleaned_loc_this_iteration,
1161  *candidate_feat_loc,
1162  overlap_type_this_iteration,
1163  circular_length,
1164  &scope);
1165  }
1166  else {
1167  cur_diff = TestForOverlapEx(*cleaned_loc_this_iteration,
1168  *candidate_feat_loc,
1169  overlap_type_this_iteration,
1170  &scope,
1171  overlap_flags);
1172  }
1173  }
1174 
1175  if( plugin ) {
1176  plugin->postProcessDiffAmount( cur_diff, cleaned_loc_this_iteration,
1177  candidate_feat_loc, scope, sel, circular_length );
1178  }
1179  if (cur_diff < 0) {
1180  continue;
1181  }
1182 
1183  // quick fix for CFeat_CI returning wrong additional features
1184  if (overlap_type == eOverlap_Contained) {
1185  ECompare cmp = Compare(feat_it->GetLocation(), loc, &scope, fCompareOverlapping);
1186  if (cmp != eContains && cmp != eSame) {
1187  continue;
1188  }
1189  }
1190  TFeatScore sc(cur_diff, ConstRef(&feat_it->GetMappedFeature()));
1191  feats.push_back(sc);
1192  }
1193  catch (CObjmgrUtilException&) {
1194  // On TestForOverlap64 error proceed to the next feature.
1195  continue;
1196  }
1197  }
1198  }
1199  catch (exception&) {
1200  _TRACE("GetOverlappingFeatures(): error: feature iterator failed");
1201  }
1202 
1203  std::stable_sort(feats.begin(), feats.end(),
1204  COverlapPairLess( &scope ) );
1205 }
1206 
1207 
1209  CSeqFeatData::E_Choice feat_type,
1210  EOverlapType overlap_type,
1211  CScope& scope,
1212  TBestFeatOpts opts,
1214 {
1215  TFeatScores scores;
1217  feat_type, CSeqFeatData::eSubtype_any,
1218  overlap_type, scores, scope, opts, plugin );
1219  if (scores.size()) {
1220  if (opts & fBestFeat_FavorLonger) {
1221  return scores.back().second;
1222  } else {
1223  return scores.front().second;
1224  }
1225  }
1226  return CConstRef<CSeq_feat>();
1227 }
1228 
1229 
1231  CSeqFeatData::ESubtype feat_type,
1232  EOverlapType overlap_type,
1233  CScope& scope,
1234  TBestFeatOpts opts,
1236 {
1237  TFeatScores scores;
1239  CSeqFeatData::GetTypeFromSubtype(feat_type), feat_type,
1240  overlap_type, scores, scope, opts, plugin );
1241 
1242  if (scores.size()) {
1243  if (opts & fBestFeat_FavorLonger) {
1244  return scores.back().second;
1245  } else {
1246  return scores.front().second;
1247  }
1248  }
1249  return CConstRef<CSeq_feat>();
1250 }
1251 
1252 
1253 /// GetmRNAforCDS
1254 /// A function to find a CSeq_feat representing the
1255 /// appropriate mRNA for a given CDS.
1256 /// @param cds The CDS feature for which the mRNA is to be found
1257 /// @param scope The scope
1258 ///
1259 /// @return CConstRef<CSeq_feat> for the mRNA found (will be NULL if none is found)
1260 
1262 {
1263  CConstRef<CSeq_feat> mrna;
1264 
1265  bool has_xref = false;
1266  if (cds.IsSetXref()) {
1267  /* using FeatID from feature cross-references:
1268  * if CDS refers to an mRNA by feature ID, use that feature
1269  */
1270  CBioseq_Handle bsh;
1271  try {
1272  bsh = scope.GetBioseqHandle(cds.GetLocation());
1273  } catch (CException& ) {
1274  // multi-accession location, can't do this check
1275  return CConstRef<CSeq_feat>(NULL);
1276  }
1277  if (!bsh)
1278  {
1279  return CConstRef<CSeq_feat>(NULL);
1280  }
1281 
1282  CTSE_Handle tse = bsh.GetTSE_Handle();
1283  ITERATE(CSeq_feat::TXref, it, cds.GetXref()) {
1284  if ((*it)->IsSetId() && (*it)->GetId().IsLocal()) {
1285  CSeq_feat_Handle mrna_h = tse.GetFeatureWithId(CSeqFeatData::eSubtype_mRNA, (*it)->GetId().GetLocal());
1286  if (mrna_h) {
1287  mrna = mrna_h.GetSeq_feat();
1288  }
1289  has_xref = true;
1290  }
1291  }
1292  }
1293  if (!has_xref) {
1294  /* using original location to find mRNA:
1295  * mRNA must include the CDS location and the internal interval boundaries need to be identical
1296  */
1298  }
1299  return mrna;
1300 }
1301 
1302 
1303 static
1306  CSeqFeatData::ESubtype subtype,
1307  CScope& scope,
1308  bool search_both_strands = true)
1309 {
      // First pass: collect candidates with eOverlap_Contained and keep the
      // first scored one, if any.
1310  TFeatScores scores;
1311  CConstRef<CSeq_feat> overlap;
1313  type, subtype,
1314  eOverlap_Contained, scores,
1315  scope);
1316  if (scores.size()) {
1317  overlap = scores.front().second;
1318  }
1319 
      // Second pass, only when requested and the first pass found nothing:
      // retry on a private copy of the SNP location with its strand changed.
1320  if (search_both_strands && !overlap) {
1321  CRef<CSeq_loc> loc(new CSeq_loc);
1322  loc->Assign(snp_feat.GetLocation());
1323 
      // A plus or minus strand is flipped; an unknown strand is handled by
      // its own branch below.
1324  ENa_strand strand = GetStrand(*loc, &scope);
1325  if (strand == eNa_strand_plus || strand == eNa_strand_minus) {
1326  loc->FlipStrand();
1327  } else if (strand == eNa_strand_unknown) {
1329  }
1330 
      // Discard first-pass results before collecting again.
1331  scores.clear();
1333  type, subtype,
1334  eOverlap_Contained, scores,
1335  scope);
1336  if (scores.size()) {
1337  overlap = scores.front().second;
1338  }
1339  }
1340 
      // Empty when neither pass produced a candidate.
1341  return overlap;
1342 }
1343 
1344 
1347  CScope& scope,
1348  bool search_both_strands)
1349 {
      // 'scope' and 'search_both_strands' are passed through unchanged.
1351  scope, search_both_strands);
1352 }
1353 
1354 
1356  CSeqFeatData::ESubtype subtype,
1357  CScope& scope,
1358  bool search_both_strands)
1359 {
      // Thin wrapper: the feature type is derived from the requested subtype
      // and everything is forwarded to x_GetBestOverlapForSNP().
1360  return x_GetBestOverlapForSNP(snp_feat,
1361  CSeqFeatData::GetTypeFromSubtype(subtype), subtype, scope,
1362  search_both_strands);
1363 }
1364 
1365 
1367  const CSeq_loc& loc, CScope& scope,
1368  ETransSplicing eTransSplicing )
1369 {
1370  switch ( eTransSplicing ) {
1371  case eTransSplicing_Auto:
1372  {
1373  ENa_strand strand = loc.GetStrand();
1374  if (strand == eNa_strand_both || strand == eNa_strand_other) {
1375  // Mixed strand indicates trans-splicing must be on.
1376  return GetOverlappingGene(loc, scope, eTransSplicing_Yes);
1377  }
1378  // Try with trans-splicing on first. If it finds nothing, try
1379  // to turn it off.
1381  return ret ? ret : GetOverlappingGene(loc, scope, eTransSplicing_No);
1382  }
1383  case eTransSplicing_Yes:
1384  {
1385  // If trans-splicing is on, the result must be a multi-range gene.
1389  if ( ret ) {
      // Step past the first range: an exhausted iterator means the gene
      // location has only one range, so the candidate is discarded.
1390  CSeq_loc_CI it(ret->GetLocation());
1391  ++it;
1392  if ( !it ) ret.Reset();
1393  }
1394  return ret;
1395  }
1396  case eTransSplicing_No:
1397  {
1398  // Multi-range genes assume trans-splicing=on and should not be included
1399  // when it's off.
1402  eOverlap_Contained, scope, 0);
1403  if ( ret ) {
      // Step past the first range: a still-valid iterator means the gene
      // location has more than one range, so the candidate is discarded.
1404  CSeq_loc_CI it(ret->GetLocation());
1405  ++it;
1406  if ( it ) ret.Reset();
1407  }
1408  return ret;
1409  }
1410  }
      // Reached only when eTransSplicing matches none of the cases above.
1411  return null;
1412 }
1413 
1414 
1415 bool IsTransSpliced(const CSeq_feat& feat)
1416 {
1417  // note - even if the exception says "trans-splicing", it isn't really trans-splicing if
1418  // it's a single interval
1419  if (feat.IsSetExcept_text() && NStr::Find(feat.GetExcept_text(), "trans-splicing") != string::npos
1420  && !feat.GetLocation().IsInt()) {
1421  return true;
1422  } else {
1423  return false;
1424  }
1425 }
1426 
1427 
1428 bool IsPseudo(const CSeq_feat& feat, CScope& scope)
1429 {
1430  if (feat.IsSetPseudo() && feat.GetPseudo()) {
1431  return true;
1432  }
1433  if (feat.IsSetQual()) {
1434  ITERATE(CSeq_feat::TQual, it, feat.GetQual()) {
1435  if ((*it)->IsSetQual() && NStr::EqualNocase((*it)->GetQual(), "pseudogene")) {
1436  return true;
1437  }
1438  }
1439  }
1440  if (feat.GetData().IsGene()) {
1441  if (feat.GetData().GetGene().IsSetPseudo() && feat.GetData().GetGene().GetPseudo()) {
1442  return true;
1443  }
1444  } else {
1445  if (feat.IsSetXref()) {
1446  ITERATE(CSeq_feat::TXref, it, feat.GetXref()) {
1447  if ((*it)->IsSetData() && (*it)->GetData().IsGene() &&
1448  (*it)->GetData().GetGene().IsSetPseudo() &&
1449  (*it)->GetData().GetGene().GetPseudo()) {
1450  return true;
1451  }
1452  }
1453  }
1454  CConstRef<CSeq_feat> gene = GetGeneForFeature(feat, scope);
1455  if (gene && IsPseudo(*gene, scope)) {
1456  return true;
1457  }
1458  }
1459  return false;
1460 }
1461 
1462 CConstRef<CSeq_feat> GetLocalGeneByLocus(const string& locus, bool use_tag, CBioseq_Handle bsh)
1463 {
1464  CTSE_Handle tse = bsh.GetTSE_Handle();
1465  const CBioseq& b = *(bsh.GetCompleteBioseq());
1466 
1467  CTSE_Handle::TSeq_feat_Handles potentials = tse.GetGenesWithLocus(locus, use_tag);
1468  //if (potentials.size() == 1) { // it may return wrong gene!
1469  // return potentials.front().GetSeq_feat();
1470  //}
1471  ITERATE(CTSE_Handle::TSeq_feat_Handles, p, potentials) {
1472  try {
1473  CSeq_id_Handle id_h = p->GetLocationId();
1474  if (id_h) {
1475  CConstRef<CSeq_id> p_id = id_h.GetSeqId();
1476  if (p_id) {
1477  ITERATE(CBioseq::TId, id, b.GetId()) {
1478  CSeq_id::E_SIC cmp = p_id->Compare(**id);
1479  if (cmp == CSeq_id::e_YES) {
1480  return p->GetSeq_feat();
1481  } else if (cmp == CSeq_id::e_NO) {
1482  break;
1483  }
1484  }
1485  }
1486  }
1487  } catch (CException&) {
1488  CSeq_loc_CI li(p->GetLocation());
1489  while (li) {
1490  try {
1491  const CSeq_id& this_id = li.GetSeq_id();
1492  ITERATE(CBioseq::TId, id, b.GetId()) {
1493  CSeq_id::E_SIC cmp = this_id.Compare(**id);
1494  if (cmp == CSeq_id::e_YES) {
1495  return p->GetSeq_feat();
1496  } else if (cmp == CSeq_id::e_NO) {
1497  break;
1498  }
1499  }
1500  } catch (CException& ) {
1501  // no Seq-id for this sublocation, keep trying
1502  }
1503  ++li;
1504  }
1505  }
1506  }
1507  return CConstRef<CSeq_feat>(NULL);
1508 }
1509 
1510 
1512 {
      // The locus tag is tried before the locus; each is used only when set
      // and non-empty, and the first lookup that yields a gene wins.
1513  if (gene.IsSetLocus_tag() && !(gene.GetLocus_tag().empty())) {
1515  if (f) {
1516  return f;
1517  }
1518  }
1519  if (gene.IsSetLocus() && !(gene.GetLocus().empty())) {
1520  CConstRef<CSeq_feat> f = GetLocalGeneByLocus(gene.GetLocus(), false, bsh);
1521  if (f) {
1522  return f;
1523  }
1524  }
      // Neither lookup produced a gene.
1525  return CConstRef<CSeq_feat>(NULL);
1526 }
1527 
1528 
1530 {
      // Cross-references are examined first, in order; the first decisive
      // one ends the search.
1531  if (feat.IsSetXref()) {
1532  CBioseq_Handle bsh = GetBioseqFromSeqLoc(feat.GetLocation(), scope);
1533  if (!bsh) {
1534  return CConstRef<CSeq_feat>();
1535  }
1536  CTSE_Handle tse = bsh.GetTSE_Handle();
1537  ITERATE(CSeq_feat::TXref, xit, feat.GetXref()) {
      // A suppressed gene xref means "no gene": return an empty reference.
1538  if ((*xit)->IsSetData() && (*xit)->GetData().IsGene() && (*xit)->GetData().GetGene().IsSuppressed()) {
1539  return (CConstRef <CSeq_feat>());
1540  }
      // An xref with a local feature id that carries either no data or gene
      // data: resolve the gene by feature id.
1541  if ((*xit)->IsSetId() && (*xit)->GetId().IsLocal() &&
1542  (!(*xit)->IsSetData() || (*xit)->GetData().IsGene())) {
1543  const CTSE_Handle::TFeatureId& feat_id = (*xit)->GetId().GetLocal();
1545  if (far_feats.size() > 0) {
1546  return far_feats.front().GetSeq_feat();
1547  }
1548  // if xref claims to point to gene feature but gene feature does not exist,
1549  // return NULL
1550  if ((*xit)->IsSetData() && (*xit)->GetData().IsGene()) {
1551  return CConstRef<CSeq_feat>();
1552  }
1553  } else if ((*xit)->IsSetData() && (*xit)->GetData().IsGene()) {
      // A gene xref without a local id: resolve by locus tag / locus.
1554  const CGene_ref& gene = (*xit)->GetData().GetGene();
1555  return GetLocalGeneByXref(gene, bsh);
1556  }
1557  }
1558  }
1559 
      // No xref was decisive: the candidate 'gf' is accepted only when its
      // location contains or equals the feature's location.
1561  if (gf) {
1562  ECompare cmp = Compare(gf->GetLocation(), feat.GetLocation(), &scope, fCompareOverlapping);
1563  if (cmp == eContains || cmp == eSame) {
1564  return gf;
1565  }
1566  }
1567 
1568  return CConstRef <CSeq_feat>();
1569 }
1570 
1571 
1573 {
1575  eOverlap_Contained, scope);
1576 }
1577 
1578 
1580 {
1582  eOverlap_Contained, scope);
1583 }
1584 
1585 
1587 {
1589  eOverlap_Contained, scope);
1590 }
1591 
1592 
1594 {
1596  eOverlap_Contained, scope);
1597 }
1598 
1599 
1601 {
1603  eOverlap_Contained, scope);
1604 }
1605 
1606 
// Exception text that marks a feature as subject to ribosomal slippage.
// In this file a CDS whose except_text equals this string exactly is matched
// against mRNAs with a different overlap type.
1607 const char* kRibosomalSlippageText = "ribosomal slippage";
1608 
1610  CScope& scope,
1611  TBestFeatOpts opts,
1613 {
      // Selection runs in stages, and each stage returns as soon as it
      // identifies one mRNA:
      //   1. zero or one overlapping candidate;
      //   2. an "MrnaProteinLink" user object naming the CDS product;
      //   3. a CDS annotated on the mRNA product that points to the same
      //      protein (skipped with fBestFeat_NoExpensive);
      //   4. an identical "transcript_id" qualifier;
      //   5. position in the score list (skipped with fBestFeat_StrictMatch).
1615  CConstRef<CSeq_feat> mrna_feat;
1616 
1617  // search for a best overlapping mRNA
1618  // we start with a scan through the product accessions because we need
1619  // to ensure that the chosen transcript does indeed match what we want
1620  TFeatScores feats;
      // eOverlap_CheckIntRev is the default. A CDS flagged as an exception
      // whose text is exactly kRibosomalSlippageText uses eOverlap_SubsetRev.
1621  EOverlapType overlap_type = eOverlap_CheckIntRev;
1622  if (cds_feat.IsSetExcept() && cds_feat.GetExcept() &&
1623  cds_feat.IsSetExcept_text() &&
1624  cds_feat.GetExcept_text() == kRibosomalSlippageText) {
1625  overlap_type = eOverlap_SubsetRev;
1626  }
1630  overlap_type,
1631  feats, scope, opts, plugin );
1632  /// easy out: 0 or 1 possible features
1633  if (feats.size() < 2) {
1634  if (feats.size() == 1) {
1635  mrna_feat = feats.front().second;
1636  }
1637  return mrna_feat;
1638  }
1639 
1640  if (cds_feat.IsSetProduct()) {
1641  try {
1642  // this may throw, if the product spans multiple sequences
1643  // this would be extremely unlikely, but we catch anyway
1644  const CSeq_id& product_id =
1645  sequence::GetId(cds_feat.GetProduct(), &scope);
1646 
1647  ITERATE (TFeatScores, feat_iter, feats) {
1648  const CSeq_feat& feat = *feat_iter->second;
1649  if ( !feat.IsSetExt() ) {
1650  continue;
1651  }
1652 
1653  /// scan the user object in the ext field
1654  /// we look for a user object of type MrnaProteinLink
1655  /// this should contain a seq-id string that we can match
1656  CTypeConstIterator<CUser_object> obj_iter(feat);
1657  for ( ; obj_iter; ++obj_iter) {
1658  if (obj_iter->IsSetType() &&
1659  obj_iter->GetType().IsStr() &&
1660  obj_iter->GetType().GetStr() == "MrnaProteinLink") {
1661  string prot_id_str = obj_iter->GetField("protein seqID")
1662  .GetData().GetStr();
      // The linked id is compared both through the ids the scope reports for
      // it and as written.
1663  CSeq_id prot_id(prot_id_str);
1664  vector<CSeq_id_Handle> ids = scope.GetIds(prot_id);
1665  ids.push_back(CSeq_id_Handle::GetHandle(prot_id));
1666  ITERATE (vector<CSeq_id_Handle>, id_iter, ids) {
1667  if (product_id.Match(*id_iter->GetSeqId())) {
1668  mrna_feat.Reset(&feat);
1669  return mrna_feat;
1670  }
1671  }
1672  }
1673  }
1674  }
1675  }
      // Best effort: any failure in this stage falls through to the next.
1676  catch (exception&) {
1677  }
1678  }
1679 
1680  if (cds_feat.IsSetProduct() && !(opts & fBestFeat_NoExpensive) ) {
1681  try {
1682  // this may throw, if the product spans multiple sequences
1683  // this would be extremely unlikely, but we catch anyway
1684  const CSeq_id& product_id =
1685  sequence::GetId(cds_feat.GetProduct(), &scope);
1686 
1687  TFeatScores matching_feats;
1688  ITERATE (TFeatScores, feat_iter, feats) {
1689 
1690  // we grab the mRNA product, if available, and scan it for
1691  // a CDS feature. the CDS feature should point to the same
1692  // product as our current feature.
1693  const CSeq_feat& mrna = *feat_iter->second;
1694  if ( !mrna.IsSetProduct() ) {
1695  continue;
1696  }
1697 
1698  CBioseq_Handle handle =
1699  scope.GetBioseqHandle(mrna.GetProduct());
1700  if ( !handle ) {
1701  continue;
1702  }
1703 
1704  SAnnotSelector cds_sel;
1705  cds_sel.SetOverlapIntervals()
1706  .ExcludeNamedAnnots("SNP")
1707  .SetResolveTSE()
1709  CFeat_CI other_iter(scope, mrna.GetProduct(), cds_sel);
      // NOTE(review): mrna_feat is still empty at this point (every earlier
      // assignment is followed by a return), so the '!mrna_feat' part of the
      // condition does not end this loop early.
1710  for ( ; other_iter && !mrna_feat; ++other_iter) {
1711  const CSeq_feat& cds = other_iter->GetOriginalFeature();
1712  if ( !cds.IsSetProduct() ) {
1713  continue;
1714  }
1715 
1716  CBioseq_Handle prot_handle =
1717  scope.GetBioseqHandle(cds.GetProduct());
1718  if ( !prot_handle ) {
1719  continue;
1720  }
1721 
1722  if (prot_handle.IsSynonym(product_id)) {
1723  // got it!
1724  matching_feats.push_back(*feat_iter);
1725  break;
1726  }
1727  }
1728  }
      // When at least one candidate matched, the later stages only consider
      // the matching candidates. A single match is returned immediately.
1729  if ( !matching_feats.empty() ) {
1730  // keep only matching features
1731  feats.swap(matching_feats);
1732  if ( feats.size() == 1 ) {
1733  mrna_feat = feats.front().second;
1734  return mrna_feat;
1735  }
1736  }
1737  }
      // Best effort: any failure in this stage falls through to the next.
1738  catch (exception&) {
1739  }
1740  }
1741 
1742  // check for transcript_id; this is a fast check
1743  string transcript_id = cds_feat.GetNamedQual("transcript_id");
1744  if ( !transcript_id.empty() ) {
1745  ITERATE (vector<TFeatScore>, feat_iter, feats) {
1746  const CSeq_feat& feat = *feat_iter->second;
1747  string other_transcript_id =
1748  feat.GetNamedQual("transcript_id");
1749  if (transcript_id == other_transcript_id) {
1750  mrna_feat.Reset(&feat);
1751  return mrna_feat;
1752  }
1753  }
1754  }
1755 
1756  //
1757  // try to find the best by overlaps alone
1758  //
1759 
      // With fBestFeat_StrictMatch no positional fallback is made and the
      // result stays empty. Otherwise the last entry is taken when
      // fBestFeat_FavorLonger is set, the first entry if not.
1760  if ( !mrna_feat && !(opts & fBestFeat_StrictMatch) ) {
1761  if (opts & fBestFeat_FavorLonger) {
1762  mrna_feat = feats.back().second;
1763  } else {
1764  mrna_feat = feats.front().second;
1765  }
1766  }
1767 
1768  return mrna_feat;
1769 }
1770 
1771 
1772 // Plugin for GetOverlappingFeatures - uses eOverlap_CheckIntervals
1773 // or eOverlap_Subset depending on the "ribosomal slippage" flag
1774 // in the current feature.
      // Every hook also forwards to the previously installed plugin
      // (m_PrevPlugin) when there is one, so the two plugins are chained.
1775 
1777 {
1778 public:
1780  : m_PrevPlugin(prev_plugin) {}
1781  virtual ~CCdsForMrnaPlugin() {}
1782 
1784  SAnnotSelector &sel)
1785  {
1786  if ( m_PrevPlugin ) {
1788  }
1789  }
1790 
      // Builds the feature iterator. A previous plugin, when present, takes
      // over completely. Otherwise the iterator is created from the bioseq
      // handle, range and strand when the handle is valid, and from the
      // scope and location when it is not.
1791  virtual void setUpFeatureIterator(
1792  CBioseq_Handle &bioseq_handle,
1793  unique_ptr<CFeat_CI> &feat_ci,
1794  TSeqPos circular_length ,
1796  const CSeq_loc& loc,
1797  SAnnotSelector &sel,
1798  CScope &scope,
1799  ENa_strand &strand)
1800  {
1801  if ( m_PrevPlugin ) {
1802  m_PrevPlugin->setUpFeatureIterator(bioseq_handle,
1803  feat_ci, circular_length, range, loc, sel, scope, strand);
1804  return;
1805  }
1806  if ( bioseq_handle ) {
1807  feat_ci.reset(new CFeat_CI(bioseq_handle, range, strand, sel));
1808  } else {
1809  feat_ci.reset(new CFeat_CI(scope, loc, sel));
1810  }
1811  }
1812 
      // No processing of its own; only forwards to the previous plugin.
1813  virtual void processLoc(
1814  CBioseq_Handle &bioseq_handle,
1815  CRef<CSeq_loc> &loc,
1816  TSeqPos circular_length)
1817  {
1818  if ( m_PrevPlugin ) {
1819  m_PrevPlugin->processLoc(bioseq_handle, loc, circular_length);
1820  }
1821  }
1822 
      // Called per candidate feature. It may change the overlap type used
      // for this candidate, then forwards to the previous plugin, which
      // receives the possibly modified overlap type.
1823  virtual void processMainLoop(
1824  bool &shouldContinueToNextIteration,
1825  CRef<CSeq_loc> &cleaned_loc_this_iteration,
1826  CRef<CSeq_loc> &candidate_feat_loc,
1827  EOverlapType &overlap_type_this_iteration,
1828  bool &revert_locations_this_iteration,
1829  CBioseq_Handle &bioseq_handle,
1830  const CMappedFeat &feat,
1831  TSeqPos circular_length,
1832  SAnnotSelector::EOverlapType annot_overlap_type)
1833  {
1834  const CSeq_feat& cds = feat.GetOriginalFeature();
1835  _ASSERT(cds.GetData().GetSubtype() ==
1837  // If the feature has "ribosomal slippage" flag set, use
1838  // eOverlap_Subset. Otherwise use more strict eOverlap_CheckIntervals.
1839  if (cds.IsSetExcept() && cds.GetExcept() &&
1840  cds.IsSetExcept_text() &&
1842  overlap_type_this_iteration = eOverlap_Subset;
1843  }
1844  if ( m_PrevPlugin ) {
1845  m_PrevPlugin->processMainLoop(shouldContinueToNextIteration,
1846  cleaned_loc_this_iteration, candidate_feat_loc,
1847  overlap_type_this_iteration,
1848  revert_locations_this_iteration,
1849  bioseq_handle, feat, circular_length, annot_overlap_type);
1850  }
1851  }
1852 
      // No processing of its own; only forwards to the previous plugin.
1853  virtual void postProcessDiffAmount(
1854  Int8 &cur_diff,
1855  CRef<CSeq_loc> &cleaned_loc,
1856  CRef<CSeq_loc> &candidate_feat_loc,
1857  CScope &scope,
1858  SAnnotSelector &sel,
1859  TSeqPos circular_length )
1860  {
1861  if ( m_PrevPlugin ) {
1863  cleaned_loc, candidate_feat_loc,
1864  scope, sel, circular_length);
1865  }
1866  }
1867 
1868 private:
1870 };
1871 
1872 
1874 GetBestCdsForMrna(const CSeq_feat& mrna_feat,
1875  CScope& scope,
1876  TBestFeatOpts opts,
1878 {
      // Selection runs in stages, and each stage returns as soon as it
      // identifies one CDS:
      //   1. zero or one overlapping candidate;
      //   2. an "MrnaProteinLink" user object on the mRNA;
      //   3. the protein product of a feature annotated on the mRNA product
      //      (skipped with fBestFeat_NoExpensive);
      //   4. an identical "transcript_id" qualifier;
      //   5. position in the score list (skipped with fBestFeat_StrictMatch).
1880  CConstRef<CSeq_feat> cds_feat;
1881 
      // The caller's plugin is wrapped in a CCdsForMrnaPlugin. The wrapper
      // is owned here and destroyed when this function returns.
1882  unique_ptr<CGetOverlappingFeaturesPlugin> cds_plugin(
1883  new CCdsForMrnaPlugin(plugin));
1884  // search for a best overlapping CDS
1885  // we start with a scan through the product accessions because we need
1886  // to ensure that the chosen transcript does indeed match what we want
1887  TFeatScores feats;
1888  GetOverlappingFeatures(mrna_feat.GetLocation(),
1892  feats, scope, opts, cds_plugin.get());
1893 
1894  /// easy out: 0 or 1 possible features
1895  if (feats.size() < 2) {
1896  if (feats.size() == 1) {
1897  cds_feat = feats.front().second;
1898  }
1899  return cds_feat;
1900  }
1901 
1902  if (mrna_feat.IsSetExt()) {
1903  /// scan the user object in the ext field
1904  /// we look for a user object of type MrnaProteinLink
1905  /// this should contain a seq-id string that we can match
1906  string prot_id_str;
1907  CTypeConstIterator<CUser_object> obj_iter(mrna_feat);
1908  for ( ; obj_iter; ++obj_iter) {
1909  if (obj_iter->IsSetType() &&
1910  obj_iter->GetType().IsStr() &&
1911  obj_iter->GetType().GetStr() == "MrnaProteinLink") {
1912  prot_id_str = obj_iter->GetField("protein seqID").GetData().GetStr();
1913  break;
1914  }
1915  }
1916  if ( !prot_id_str.empty() ) {
      // The linked id is compared both through the ids the scope reports for
      // it and as written.
1917  CSeq_id prot_id(prot_id_str);
1918  vector<CSeq_id_Handle> ids = scope.GetIds(prot_id);
1919  ids.push_back(CSeq_id_Handle::GetHandle(prot_id));
1920 
1921  try {
1922  /// look for a CDS feature that matches this expected ID
1923  ITERATE (TFeatScores, feat_iter, feats) {
1924  const CSeq_feat& feat = *feat_iter->second;
1925  if ( !feat.IsSetProduct() ) {
1926  continue;
1927  }
      // NOTE(review): the guard above tests the CDS product, but the id
      // compared with the protein ids below is taken from the CDS location.
      // Confirm whether feat.GetProduct() was intended here.
1928  const CSeq_id& id =
1929  sequence::GetId(feat.GetLocation(), &scope);
1930  ITERATE (vector<CSeq_id_Handle>, id_iter, ids) {
1931  if (id.Match(*id_iter->GetSeqId())) {
1932  cds_feat.Reset(&feat);
1933  return cds_feat;
1934  }
1935  }
1936  }
1937  }
      // Best effort: any failure in this stage falls through to the next.
1938  catch (exception&) {
1939  }
1940  }
1941  }
1942 
1943  // scan through the product accessions because we need to ensure that the
1944  // chosen transcript does indeed match what we want
1945  if (mrna_feat.IsSetProduct() && !(opts & fBestFeat_NoExpensive) ) {
      // Single-pass do/while(false): it only exists so that 'break' can
      // leave this stage early.
1946  do {
1947  try {
1948  // this may throw, if the product spans multiple sequences
1949  // this would be extremely unlikely, but we catch anyway
1950  const CSeq_id& mrna_product =
1951  sequence::GetId(mrna_feat.GetProduct(), &scope);
1952  CBioseq_Handle mrna_handle =
1953  scope.GetBioseqHandle(mrna_product);
1954 
1955  // find the ID of the protein accession we're looking for
1956  CConstRef<CSeq_id> protein_id;
1957  {{
1958  SAnnotSelector sel;
1959  sel.SetOverlapIntervals()
1960  .ExcludeNamedAnnots("SNP")
1961  .SetResolveTSE()
1963 
      // The first feature on the mRNA product that has a product of its own
      // supplies the expected protein id.
1964  CFeat_CI iter(mrna_handle, sel);
1965  for ( ; iter; ++iter) {
1966  if (iter->IsSetProduct()) {
1967  protein_id.Reset
1968  (&sequence::GetId(iter->GetProduct(),
1969  &scope));
1970  break;
1971  }
1972  }
1973  }}
1974 
      // No protein id could be determined: give up on this stage.
1975  if ( !protein_id ) {
1976  break;
1977  }
1978 
1979  TFeatScores::const_iterator feat_iter = feats.begin();
1980  TFeatScores::const_iterator feat_end = feats.end();
1981  for ( ; feat_iter != feat_end && !cds_feat; ++feat_iter) {
1982  /// look for all contained CDS features; for each, check
1983  /// to see if the protein product is the expected protein
1984  /// product
1985  const CSeq_feat& cds = *feat_iter->second;
1986  if ( !cds.IsSetProduct() ) {
1987  continue;
1988  }
1989 
1990  CBioseq_Handle prot_handle =
1991  scope.GetBioseqHandle(cds.GetProduct());
1992  if ( !prot_handle ) {
1993  continue;
1994  }
1995 
1996  if (prot_handle.IsSynonym(*protein_id)) {
1997  // got it!
1998  cds_feat.Reset(&cds);
1999  return cds_feat;
2000  }
2001  }
2002  }
      // Best effort: any failure in this stage falls through to the next.
2003  catch ( exception& ) {
2004  }
2005  }
2006  while (false);
2007  }
2008 
2009  // check for transcript_id
2010  // this is generally only available in GTF/GFF-imported features
2011  string transcript_id = mrna_feat.GetNamedQual("transcript_id");
2012  if ( !transcript_id.empty() ) {
2013  ITERATE (TFeatScores, feat_iter, feats) {
2014  const CSeq_feat& feat = *feat_iter->second;
2015  string other_transcript_id =
2016  feat.GetNamedQual("transcript_id");
2017  if (transcript_id == other_transcript_id) {
2018  cds_feat.Reset(&feat);
2019  return cds_feat;
2020  }
2021  }
2022  }
2023 
2024  //
2025  // try to find the best by overlaps alone
2026  //
2027 
      // With fBestFeat_StrictMatch no positional fallback is made and the
      // result stays empty. Otherwise the last entry is taken when
      // fBestFeat_FavorLonger is set, the first entry if not.
2028  if ( !cds_feat && !(opts & fBestFeat_StrictMatch) ) {
2029  if (opts & fBestFeat_FavorLonger) {
2030  cds_feat = feats.back().second;
2031  } else {
2032  cds_feat = feats.front().second;
2033  }
2034  }
2035 
2036  return cds_feat;
2037 }
2038 
2039 
2041  CScope& scope,
2042  TBestFeatOpts opts,
2044 {
      // Selection order: zero or one overlapping candidate; a gene whose
      // label equals the mRNA's gene xref label; a gene sharing a
      // GeneID/LocusID dbxref with the mRNA; finally position in the score
      // list (skipped with fBestFeat_StrictMatch).
2046  CConstRef<CSeq_feat> gene_feat;
2047 
2048  // search for a best overlapping gene
2049  TFeatScores feats;
2050  GetOverlappingFeatures(mrna_feat.GetLocation(),
2054  feats, scope, opts, plugin );
2055  /// easy out: 0 or 1 possible features
2056  if (feats.size() < 2) {
2057  if (feats.size() == 1) {
2058  gene_feat = feats.front().second;
2059  }
2060  return gene_feat;
2061  }
2062 
2063  ///
2064  /// compare gene xrefs to see if we can find a match
2065  ///
2066  const CGene_ref* ref = mrna_feat.GetGeneXref();
2067  if (ref) {
2068  if (ref->IsSuppressed()) {
2069  /// 'suppress' case
      // gene_feat is still empty here, so a suppressed xref yields no gene.
2070  return gene_feat;
2071  }
2072 
2073  string ref_str;
2074  ref->GetLabel(&ref_str);
2075 
2076  ITERATE (TFeatScores, feat_it, feats) {
2077  const CSeq_feat& feat = *feat_it->second;
2078  const CGene_ref& other_ref = feat.GetData().GetGene();
2079  string other_ref_str;
2080  other_ref.GetLabel(&other_ref_str);
2081  if (ref_str == other_ref_str) {
2082  gene_feat = &feat;
2083  return gene_feat;
2084  }
2085  }
2086  }
2087 
2088  ///
2089  /// compare by dbxrefs
2090  ///
2091  if (mrna_feat.IsSetDbxref()) {
      // The first GeneID/LocusID dbxref of the mRNA supplies the gene id;
      // 0 means none was found.
2092  int gene_id = 0;
2093  ITERATE (CSeq_feat::TDbxref, dbxref, mrna_feat.GetDbxref()) {
2094  if ((*dbxref)->GetDb() == "GeneID" ||
2095  (*dbxref)->GetDb() == "LocusID") {
2096  gene_id = (*dbxref)->GetTag().GetId();
2097  break;
2098  }
2099  }
2100 
2101  if (gene_id != 0) {
2102  ITERATE (TFeatScores, feat_it, feats) {
2103  const CSeq_feat& feat = *feat_it->second;
2104  ITERATE (CSeq_feat::TDbxref, dbxref, feat.GetDbxref()) {
2105  const string& db = (*dbxref)->GetDb();
2106  if ((db == "GeneID" || db == "LocusID") &&
2107  (*dbxref)->GetTag().GetId() == gene_id) {
2108  gene_feat = &feat;
2109  return gene_feat;
2110  }
2111  }
2112  }
2113  }
2114  }
2115 
      // Positional fallback: the last entry when fBestFeat_FavorLonger is
      // set, the first entry if not.
2116  if ( !gene_feat && !(opts & fBestFeat_StrictMatch) ) {
2117  if (opts & fBestFeat_FavorLonger) {
2118  gene_feat = feats.back().second;
2119  } else {
2120  gene_feat = feats.front().second;
2121  }
2122  }
2123 
2124  return gene_feat;
2125 }
2126 
2127 
2129  CScope& scope,
2130  TBestFeatOpts opts,
2132 {
      // Selection order: zero or one overlapping candidate; a gene whose
      // label equals the CDS's gene xref label; the gene of the strictly
      // matched mRNA (skipped with fBestFeat_NoExpensive); finally the first
      // entry of the score list (skipped with fBestFeat_StrictMatch).
2134 
2135  CConstRef<CSeq_feat> feat_ref;
2136 
2137  // search for a best overlapping gene
2138  TFeatScores feats;
2143  feats, scope, opts, plugin );
2144  /// easy out: 0 or 1 possible features
2145  if (feats.size() < 2) {
2146  if (feats.size() == 1) {
2147  feat_ref = feats.front().second;
2148  }
2149  return feat_ref;
2150  }
2151 
2152  // next: see if we can match based on gene xref
2153  const CGene_ref* ref = cds_feat.GetGeneXref();
2154  if (ref) {
2155  if (ref->IsSuppressed()) {
2156  /// 'suppress' case
      // feat_ref is still empty here, so a suppressed xref yields no gene.
2157  return feat_ref;
2158  }
2159 
2160  ITERATE (TFeatScores, feat_it, feats) {
2161  const CSeq_feat& feat = *feat_it->second;
2162 
2163  string ref_str;
2164  ref->GetLabel(&ref_str);
2165 
2166  const CGene_ref& other_ref = feat.GetData().GetGene();
2167  string other_ref_str;
2168  other_ref.GetLabel(&other_ref_str);
2169  if (ref_str == other_ref_str) {
2170  feat_ref = &feat;
2171  return feat_ref;
2172  }
2173  }
2174  }
2175 
2176  /// last check: expensive: need to proxy through mRNA match
      // The mRNA lookup is forced to strict matching. feat_ref holds the
      // mRNA first and is then overwritten with that mRNA's gene, which may
      // be empty.
2177  if ( !feat_ref && !(opts & fBestFeat_NoExpensive) ) {
2178  feat_ref = GetBestMrnaForCds(cds_feat, scope,
2179  opts | fBestFeat_StrictMatch);
2180  if (feat_ref) {
2181  feat_ref = GetBestGeneForMrna(*feat_ref, scope, opts);
2182  if (feat_ref) {
2183  return feat_ref;
2184  }
2185  }
2186  }
2187 
      // NOTE(review): this fallback always takes the first entry.
      // fBestFeat_FavorLonger is not consulted here, unlike the positional
      // fallbacks elsewhere in this file. Confirm this is intended.
2188  if ( !feat_ref && !(opts & fBestFeat_StrictMatch) ) {
2189  feat_ref = feats.front().second;
2190  }
2191  return feat_ref;
2192 }
2193 
2194 
2195 void GetMrnasForGene(const CSeq_feat& gene_feat, CScope& scope,
2196  list< CConstRef<CSeq_feat> >& mrna_feats,
2197  TBestFeatOpts opts,
2199 {
      // Results are appended to mrna_feats; the list is not cleared first.
      // Three attempts are made and the first that yields anything wins:
      // match by gene xref label, match by GeneID/LocusID dbxref, and a
      // single best-overlap lookup.
2201  SAnnotSelector sel;
2202  sel.SetResolveTSE()
2203  .SetAdaptiveDepth()
2205  CFeat_CI feat_it(scope, gene_feat.GetLocation(), sel);
      // Nothing selected on the gene location: leave mrna_feats untouched.
2206  if (feat_it.GetSize() == 0) {
2207  return;
2208  }
2209 
2210  ///
2211  /// pass 1: compare by gene xref
2212  ///
2213  {{
2214  const CGene_ref& ref = gene_feat.GetData().GetGene();
2215  string ref_str;
2216  ref.GetLabel(&ref_str);
2217  size_t count = 0;
2218  for ( ; feat_it; ++feat_it) {
2219 
      // Features without a gene xref, or with a suppressed one, are not
      // considered in this pass.
2220  const CGene_ref* other_ref =
2221  feat_it->GetOriginalFeature().GetGeneXref();
2222  if ( !other_ref || other_ref->IsSuppressed() ) {
2223  continue;
2224  }
2225 
2226  string other_ref_str;
2227  other_ref->GetLabel(&other_ref_str);
2228  if (other_ref_str != ref_str) {
2229  continue;
2230  }
2231 
      // The gene location must equal or contain the feature location.
2232  ECompare comp = sequence::Compare(gene_feat.GetLocation(),
2233  feat_it->GetLocation(),
2234  &scope,
2236  if (comp != eSame && comp != eContains) {
2237  continue;
2238  }
2239 
2240  CConstRef<CSeq_feat> feat_ref(&feat_it->GetOriginalFeature());
2241  mrna_feats.push_back(feat_ref);
2242  ++count;
2243  }
2244 
2245  if (count) {
2246  return;
2247  }
2248  }}
2249 
2250  ///
2251  /// pass 2: compare by gene id
2252  ///
2253  {{
      // The first GeneID/LocusID dbxref of the gene supplies the id; 0 means
      // none, and this pass is then skipped.
2254  int gene_id = 0;
2255  if (gene_feat.IsSetDbxref()) {
2256  ITERATE (CSeq_feat::TDbxref, dbxref, gene_feat.GetDbxref()) {
2257  if ((*dbxref)->GetDb() == "GeneID" ||
2258  (*dbxref)->GetDb() == "LocusID") {
2259  gene_id = (*dbxref)->GetTag().GetId();
2260  break;
2261  }
2262  }
2263  }
2264 
2265  if (gene_id) {
2266  size_t count = 0;
      // Pass 1 consumed the iterator; start over from the first feature.
2267  feat_it.Rewind();
2268  for ( ; feat_it; ++feat_it) {
2269  /// check the suppress case
2270  /// regardless of the gene-id binding, we always ignore these
2271  const CGene_ref* other_ref =
2272  feat_it->GetOriginalFeature().GetGeneXref();
2273  if ( other_ref && other_ref->IsSuppressed() ) {
2274  continue;
2275  }
2276 
2277  CConstRef<CSeq_feat> ref(&feat_it->GetOriginalFeature());
2278 
2279  ECompare comp = sequence::Compare(gene_feat.GetLocation(),
2280  feat_it->GetLocation(),
2281  &scope,
2283  if (comp != eSame && comp != eContains) {
2284  continue;
2285  }
2286 
      // A feature is added at most once, on its first matching dbxref.
2287  if (feat_it->IsSetDbxref()) {
2288  ITERATE (CSeq_feat::TDbxref, dbxref, feat_it->GetDbxref()) {
2289  if (((*dbxref)->GetDb() == "GeneID" ||
2290  (*dbxref)->GetDb() == "LocusID") &&
2291  (*dbxref)->GetTag().GetId() == gene_id) {
2292  mrna_feats.push_back(ref);
2293  ++count;
2294  break;
2295  }
2296  }
2297  }
2298  }
2299 
2300  if (count) {
2301  return;
2302  }
2303  }
2304  }}
2305 
2306  // gene doesn't have a gene_id or a gene ref
      // Last resort: at most one feature is added.
2307  CConstRef<CSeq_feat> feat =
2311  scope, opts, plugin );
2312  if (feat) {
2313  mrna_feats.push_back(feat);
2314  }
2315 }
2316 
2317 
2318 void GetCdssForGene(const CSeq_feat& gene_feat, CScope& scope,
2319  list< CConstRef<CSeq_feat> >& cds_feats,
2320  TBestFeatOpts opts,
2322 {
      // Results are appended to cds_feats. When the gene has mRNAs, the best
      // CDS of each mRNA is collected. Otherwise a single lookup is made
      // directly and at most one feature is added.
      // NOTE(review): 'plugin' is passed only to the no-mRNA branch. The
      // GetMrnasForGene() and GetBestCdsForMrna() calls below do not receive
      // it. Confirm this is intended.
2324  list< CConstRef<CSeq_feat> > mrna_feats;
2325  GetMrnasForGene(gene_feat, scope, mrna_feats, opts);
2326  if (mrna_feats.size()) {
2327  ITERATE (list< CConstRef<CSeq_feat> >, iter, mrna_feats) {
2328  CConstRef<CSeq_feat> cds = GetBestCdsForMrna(**iter, scope, opts);
2329  if (cds) {
2330  cds_feats.push_back(cds);
2331  }
2332  }
2333  } else {
2334  CConstRef<CSeq_feat> feat =
2338  scope, opts, plugin );
2339  if (feat) {
2340  cds_feats.push_back(feat);
2341  }
2342  }
2343 }
2344 
2345 
2348  CSeqFeatData::E_Choice feat_type,
2349  sequence::EOverlapType overlap_type,
2350  CScope& scope,
2351  TBestFeatOpts opts,
2353 {
      // Dispatches on the requested feature type. The gene case and one
      // further case return the subtype-based lookup directly. The RNA case
      // keeps its result in feat_ref and falls back to the location-based
      // lookup at the end when that result is empty, as does every type
      // without a dedicated case.
2354  CConstRef<CSeq_feat> feat_ref;
2355  switch (feat_type) {
2356  case CSeqFeatData::e_Gene:
2357  return GetBestOverlappingFeat(feat,
2359  overlap_type, scope, opts, plugin );
2360 
2361  case CSeqFeatData::e_Rna:
2362  feat_ref = GetBestOverlappingFeat(feat,
2364  overlap_type, scope, opts, plugin );
2365  break;
2366 
2368  return GetBestOverlappingFeat(feat,
2370  overlap_type, scope, opts, plugin );
2371 
2372  default:
2373  break;
2374  }
2375 
2376  if ( !feat_ref ) {
2378  (feat.GetLocation(), feat_type, overlap_type, scope, opts, plugin );
2379  }
2380 
2381  return feat_ref;
2382 }
2383 
2384 
2387  CSeqFeatData::ESubtype subtype,
2388  sequence::EOverlapType overlap_type,
2389  CScope& scope,
2390  TBestFeatOpts opts,
2392 {
      // Dispatches first on the subtype of the input feature, then on the
      // requested subtype. Known pairings go to the dedicated
      // GetBest...For...() functions, which are called with feat, scope and
      // opts only (overlap_type and plugin are not passed to them).
      // feat_ref is never assigned inside the switch, so every path that
      // does not return there ends in the location-based lookup below.
2393  CConstRef<CSeq_feat> feat_ref;
2394  switch (feat.GetData().GetSubtype()) {
2396  switch (subtype) {
2398  return GetBestGeneForMrna(feat, scope, opts);
2399 
2401  return GetBestCdsForMrna(feat, scope, opts);
2402 
2403  default:
2404  break;
2405  }
2406  break;
2407 
2409  switch (subtype) {
2411  return GetBestMrnaForCds(feat, scope, opts);
2412 
2414  return GetBestGeneForCds(feat, scope, opts);
2415 
2416  default:
2417  break;
2418  }
2419  break;
2420 
      // This case always searches both strands (last argument is true).
2422  return GetBestOverlapForSNP(feat, subtype, scope, true);
2423 
2424  default:
2425  break;
2426  }
2427 
2428  if ( !feat_ref ) {
2429  feat_ref = GetBestOverlappingFeat
2430  (feat.GetLocation(), subtype, overlap_type, scope, opts, plugin );
2431  }
2432 
2433  return feat_ref;
2434 }
2435 
2436 
2437 namespace {
2438 
2439 CConstRef<CSeq_feat> x_GetFeatById(CSeqFeatData::ESubtype subtype,
2440  const CSeq_feat& feat,
2441  const CTSE_Handle& tse)
2442 {
2443  if ( feat.IsSetXref() ) {
2444  ITERATE ( CSeq_feat::TXref, it, feat.GetXref() ) {
2445  const CSeqFeatXref& xref = **it;
2446  if ( xref.IsSetId() ) {
2447  const CFeat_id& id = xref.GetId();
2448  if ( id.IsLocal() ) {
2449  const CObject_id& obj_id = id.GetLocal();
2450  if ( obj_id.IsId() ) {
2451  int local_id = obj_id.GetId();
2452  CSeq_feat_Handle feat_handle =
2453  tse.GetFeatureWithId(subtype, local_id);
2454  if ( feat_handle ) {
2455  return feat_handle.GetSeq_feat();
2456  }
2457  }
2458  }
2459  }
2460  }
2461  }
2462  return null;
2463 }
2464 
2465 }
2466 
2467 
2470  const CTSE_Handle& tse,
2471  TBestFeatOpts opts,
2473 {
      // A gene referenced by a local feature-id xref of the mRNA is
      // preferred. Without one, the scope-based search is used with the
      // TSE's scope.
2475  CConstRef<CSeq_feat> ret =
2476  x_GetFeatById(CSeqFeatData::eSubtype_gene, mrna_feat, tse);
2477  if ( !ret ) {
2478  ret = GetBestGeneForMrna(mrna_feat, tse.GetScope(), opts);
2479  }
2480  return ret;
2481 }
2482 
2485  const CTSE_Handle& tse,
2486  TBestFeatOpts opts,
2488 {
      // A gene referenced by a local feature-id xref of the CDS is
      // preferred. Without one, the scope-based search is used with the
      // TSE's scope.
2490  CConstRef<CSeq_feat> ret =
2491  x_GetFeatById(CSeqFeatData::eSubtype_gene, cds_feat, tse);
2492  if ( !ret ) {
2493  ret = GetBestGeneForCds(cds_feat, tse.GetScope(), opts);
2494  }
2495  return ret;
2496 }
2497 
2500  const CTSE_Handle& tse,
2501  TBestFeatOpts opts,
2503 {
      // An mRNA referenced by a local feature-id xref of the CDS is
      // preferred. Without one, the scope-based search is used with the
      // TSE's scope.
2505  CConstRef<CSeq_feat> ret =
2506  x_GetFeatById(CSeqFeatData::eSubtype_mRNA, cds_feat, tse);
2507  if ( !ret ) {
2508  ret = GetBestMrnaForCds(cds_feat, tse.GetScope(), opts);
2509  }
2510  return ret;
2511 }
2512 
2514 GetBestCdsForMrna(const CSeq_feat& mrna_feat,
2515  const CTSE_Handle& tse,
2516  TBestFeatOpts opts,
2518 {
      // A CDS referenced by a local feature-id xref of the mRNA is
      // preferred. Without one, the scope-based search is used with the
      // TSE's scope.
2520  CConstRef<CSeq_feat> ret =
2521  x_GetFeatById(CSeqFeatData::eSubtype_cdregion, mrna_feat, tse);
2522  if ( !ret ) {
2523  ret = GetBestCdsForMrna(mrna_feat, tse.GetScope(), opts);
2524  }
2525  return ret;
2526 }
2527 
2528 void GetMrnasForGene(const CSeq_feat& gene_feat,
2529  const CTSE_Handle& tse,
2530  list< CConstRef<CSeq_feat> >& mrna_feats,
2531  TBestFeatOpts opts,
2533 {
      // TSE-based convenience overload: runs the scope-based version with
      // the TSE's scope. Only gene_feat, mrna_feats and opts are forwarded.
2535  GetMrnasForGene(gene_feat, tse.GetScope(), mrna_feats, opts);
2536 }
2537 
2538 void GetCdssForGene(const CSeq_feat& gene_feat,
2539  const CTSE_Handle& tse,
2540  list< CConstRef<CSeq_feat> >& cds_feats,
2541  TBestFeatOpts opts,
2543 {
      // TSE-based convenience overload: runs the scope-based version with
      // the TSE's scope. Only gene_feat, cds_feats and opts are forwarded.
2545  GetCdssForGene(gene_feat, tse.GetScope(), cds_feats, opts);
2546 }
2547 
2548 // Get the encoding CDS feature of a given protein sequence.
2549 const CSeq_feat* GetCDSForProduct(const CBioseq& product, CScope* scope)
2550 {
2551  if ( scope == 0 ) {
2552  return 0;
2553  }
2554 
2555  return GetCDSForProduct(scope->GetBioseqHandle(product));
2556 }
2557 
2559 {
 // Returns the address of the original feature behind `f` when `f` is
 // valid, otherwise a null pointer.
 // NOTE(review): embedded lines 2558 and 2560 (the signature and the
 // statement that defines `f`) are missing from this listing.
2561  if ( f ) {
2562  return &f.GetOriginalFeature();
2563  }
2564 
2565  return 0;
2566 }
2567 
2569 {
 // Looks up a feature by product for `bsh` and returns it as a
 // CMappedFeat; an empty CMappedFeat is returned when `bsh` is invalid or
 // nothing is found.  The search is first limited to the Bioseq's own
 // TSE, then repeated with that TSE excluded.
 // NOTE(review): embedded lines 2568, 2573 and 2578 (the signature and
 // the selector arguments of both CFeat_CI constructions) are missing
 // from this listing, so the feature type searched for is not visible.
2570  if ( bsh ) {
2571  // try first in-TSE CDS
2572  CFeat_CI fi(bsh,
2574  .SetByProduct().SetLimitTSE(bsh.GetTSE_Handle()));
2575  if ( !fi ) {
2576  // then any other CDS
2577  fi = CFeat_CI(bsh,
2579  .SetByProduct().ExcludeTSE(bsh.GetTSE_Handle()));
2580  }
2581  if ( fi ) {
2582  // return the first one (should be the one packaged on the
2583  // nuc-prot set).
2584  return *fi;
2585  }
2586  }
2587 
2588  return CMappedFeat();
2589 }
2590 
2591 
2592 // Get the mature peptide feature of a protein
2593 const CSeq_feat* GetPROTForProduct(const CBioseq& product, CScope* scope)
2594 {
2595  if ( scope == 0 ) {
2596  return 0;
2597  }
2598 
2599  return GetPROTForProduct(scope->GetBioseqHandle(product));
2600 }
2601 
2603 {
 // Searches by product for a Prot feature on `bsh` and returns the
 // address of the first hit's original feature; returns a null pointer
 // when `bsh` is invalid or no such feature exists.
 // NOTE(review): embedded line 2602 (the signature) is missing from this
 // listing.
2604  if ( bsh ) {
2605  CFeat_CI fi(bsh, SAnnotSelector(CSeqFeatData::e_Prot).SetByProduct());
2606  if ( fi ) {
2607  return &(fi->GetOriginalFeature());
2608  }
2609  }
2610 
2611  return 0;
2612 }
2613 
2614 
2615 
2616 // Get the encoding mRNA feature of a given mRNA (cDNA) bioseq.
2617 const CSeq_feat* GetmRNAForProduct(const CBioseq& product, CScope* scope)
2618 {
2619  if ( scope == 0 ) {
2620  return 0;
2621  }
2622 
2623  return GetmRNAForProduct(scope->GetBioseqHandle(product));
2624 }
2625 
2627 {
 // Searches by product on `bsh` using the selector `as` and returns the
 // address of the first hit's original feature; returns a null pointer
 // when `bsh` is invalid or nothing is found.
 // NOTE(review): embedded lines 2626 and 2629 (the signature and the
 // declaration of `as`) are missing from this listing, so the feature
 // type selected is not visible here.
2628  if ( bsh ) {
2630  as.SetByProduct();
2631 
2632  CFeat_CI fi(bsh, as);
2633  if ( fi ) {
2634  return &(fi->GetOriginalFeature());
2635  }
2636  }
2637 
2638  return 0;
2639 }
2640 
2641 
2643 {
 // Searches by product on `bsh` and returns the first hit as a
 // CMappedFeat; an empty CMappedFeat is returned when `bsh` is invalid or
 // nothing is found.
 // NOTE(review): embedded lines 2642 and 2646 (the signature and the
 // selector argument) are missing from this listing, so the feature type
 // searched for is not visible here.
2644  if ( bsh ) {
2645  CFeat_CI fi(bsh,
2647  .SetByProduct());
2648  if ( fi ) {
2649  // return the first one (should be the one packaged on the
2650  // nuc-prot set).
2651  return *fi;
2652  }
2653  }
2654 
2655  return CMappedFeat();
2656 }
2657 
2658 
2659 // Get the encoding sequence of a protein
2660 const CBioseq* GetNucleotideParent(const CBioseq& product, CScope* scope)
2661 {
2662  if ( scope == 0 ) {
2663  return 0;
2664  }
2665  CBioseq_Handle bsh = GetNucleotideParent(scope->GetBioseqHandle(product));
2666  return bsh ? bsh.GetCompleteBioseq() : reinterpret_cast<const CBioseq*>(0);
2667 }
2668 
2670 {
 // Picks the encoding feature of `bsh` (CDS for amino-acid sequences,
 // mRNA otherwise) and returns a handle for that feature's location.
 // An empty handle is returned when no feature is found or when resolving
 // the location throws.
 // NOTE(review): embedded line 2669 (the signature) is missing from this
 // listing.
2671  // If protein use CDS to get to the encoding Nucleotide.
2672  // if nucleotide (cDNA) use mRNA feature.
2673  const CSeq_feat* sfp = bsh.GetInst().IsAa() ?
2674  GetCDSForProduct(bsh) : GetmRNAForProduct(bsh);
2675 
2676  CBioseq_Handle ret;
2677  if ( sfp ) {
2678  try {
2679  ret = bsh.GetScope().GetBioseqHandle(sfp->GetLocation());
2680  } catch(...) {
2681  // may fail due to trans-splicing, e.g., on small-genome set
 // Deliberately swallowed: `ret` stays empty in that case.
2682  }
2683  }
2684  return ret;
2685 }
2686 
2687 
2689 {
 // For a valid `part`, walks the entries of `segset` and returns the
 // first one that is a Bioseq; returns an empty handle when `part` is
 // invalid, `segset` is invalid, or no Bioseq entry is found.
 // NOTE(review): embedded lines 2688 and 2694 (the signature and the
 // expression that initialises `segset`) are missing from this listing.
2690  CBioseq_Handle seg;
2691 
2692  if (part) {
2693  CSeq_entry_Handle segset =
2695  if (segset) {
2696  for (CSeq_entry_CI it(segset); it; ++it) {
2697  if (it->IsSeq()) {
2698  seg = it->GetSeq();
2699  break;
2700  }
2701  }
2702  }
2703  }
2704 
2705  return seg;
2706 }
2707 
2708 
2709 END_SCOPE(sequence)
2710 
2711 
2712 
 // Constructor: binds the output stream, enables the
 // fInstantiateGaps | fAssembleParts | fEnableGI flags, selects
 // eGM_letters as gap mode, creates the defline generator and sets the
 // line width to 70.
 // NOTE(review): embedded line 2713 (the constructor signature) is missing
 // from this listing.
2714  : m_Out(out),
2715  m_Flags(fInstantiateGaps | fAssembleParts | fEnableGI),
2716  m_GapMode(eGM_letters)
2717 {
2718  m_Gen.reset(new sequence::CDeflineGenerator);
2719  SetWidth(70);
2720 }
2721 
2723 {
 // Flushes the output stream.
 // NOTE(review): embedded line 2722 (the signature, presumably the
 // destructor) is missing from this listing.
2724  m_Out << flush;
2725 }
2726 
2728  const CSeq_loc* location)
2729 {
 // Iterates over every Bioseq under `handle` and writes each one that
 // SkipBioseq() does not reject.  When `location` is given, a whole-
 // sequence location is built for the Bioseq and passed to
 // sequence::TestForOverlap(); Bioseqs with a negative result are
 // skipped.
 // NOTE(review): embedded lines 2727 and 2736 (the signature start and
 // the leading TestForOverlap arguments) are missing from this listing.
2730  for (CBioseq_CI it(handle); it; ++it) {
2731  if ( !SkipBioseq(*it) ) {
2732  if (location) {
2733  CSeq_loc loc2;
2734  loc2.SetWhole().Assign(*it->GetSeqId());
2735  int d = sequence::TestForOverlap
2737  kInvalidSeqPos, &handle.GetScope());
2738  if (d < 0) {
2739  continue;
2740  }
2741  }
2742  Write(*it, location);
2743  }
2744  }
2745 }
2746 
2747 
2749  const CSeq_loc* location,
2750  const string& custom_title)
2751 {
 // Writes one record: the title line first, then the sequence data.
 // NOTE(review): embedded line 2748 (the start of the signature) is
 // missing from this listing.
2752  WriteTitle(handle, location, custom_title);
2753  WriteSequence(handle, location);
2754 }
2755 
2756 
2757 static string s_FastaGetOriginalID (const CBioseq& seq)
2758 
2759 {
2760  FOR_EACH_SEQDESC_ON_BIOSEQ (it, seq) {
2761  const CSeqdesc& desc = **it;
2762  if (! desc.IsUser()) continue;
2763  if (! desc.GetUser().IsSetType()) continue;
2764  const CUser_object& usr = desc.GetUser();
2765  const CObject_id& oi = usr.GetType();
2766  if (! oi.IsStr()) continue;
2767  const string& type = oi.GetStr();
2768  if (! NStr::EqualNocase(type, "OrginalID") && ! NStr::EqualNocase(type, "OriginalID")) continue;
2769  FOR_EACH_USERFIELD_ON_USEROBJECT (uitr, usr) {
2770  const CUser_field& fld = **uitr;
2771  if (FIELD_IS_SET_AND_IS(fld, Label, Str)) {
2772  const string &label_str = GET_FIELD(fld.GetLabel(), Str);
2773  if (! NStr::EqualNocase(label_str, "LocalId")) continue;
2774  if (fld.IsSetData() && fld.GetData().IsStr()) {
2775  return fld.GetData().GetStr();
2776  }
2777  }
2778  }
2779  }
2780 
2781  return "";
2782 }
2783 
2784 static bool s_ShouldUseOriginalID (const CBioseq& seq)
2785 {
2786  FOR_EACH_SEQID_ON_BIOSEQ (id_itr, seq) {
2787  const CSeq_id& sid = **id_itr;
2788  switch (sid.Which()) {
2789  case CSeq_id::e_Local:
2790  break;
2791  case CSeq_id::e_General:
2792  {
2793  const CDbtag& dbtag = sid.GetGeneral();
2794  if (dbtag.IsSetDb()) {
2795  const string& db = dbtag.GetDb();
2796  if (! NStr::EqualNocase(db, "TMSMART") &&
2797  ! NStr::EqualNocase(db, "BankIt") &&
2798  ! NStr::EqualNocase(db, "NCBIFILE")) {
2799  return false;
2800  }
2801  }
2802  }
2803  break;
2804  default:
2805  return false;
2806  }
2807  }
2808 
2809  return true;
2810 }
2811 
2812 void CFastaOstream::x_GetBestId(CConstRef<CSeq_id>& gi_id, CConstRef<CSeq_id>& best_id, bool& hide_prefix, const CBioseq& bioseq)
2813 {
2814  bool is_na = bioseq.GetInst().GetMol() != CSeq_inst::eMol_aa;
2815  best_id = FindBestChoice(bioseq.GetId(), is_na ? CSeq_id::FastaNARank : CSeq_id::FastaAARank);
2816 
2817  ITERATE(CBioseq::TId, id, bioseq.GetId()) {
2818  if ((*id)->IsGi()) {
2819  gi_id = *id;
2820  break;
2821  }
2822  }
2823 
2824  // see SQD-4144, only Accession.Version should be shown, without prefixes and suffixes
2825  if (best_id.NotEmpty() &&
2826  (m_Flags & fEnableGI) == 0 &&
2827  (m_Flags & fHideGenBankPrefix) != 0)
2828  {
2829  switch (best_id->Which())
2830  {
2831  case CSeq_id::e_Genbank:
2832  case CSeq_id::e_Embl:
2833  case CSeq_id::e_Other:
2834  case CSeq_id::e_Ddbj:
2835  case CSeq_id::e_Tpg:
2836  case CSeq_id::e_Tpe:
2837  case CSeq_id::e_Tpd:
2838  hide_prefix = true;
2839  break;
2840  default:
2841  break;
2842  }
2843  }
2844 }
2845 
2846 static bool s_WriteGnlAndAcc(const CBioseq& bioseq, CNcbiOstream& ostr)
2847 {
2848  CRef<CSeq_id> pGnlId;
2849  CRef<CSeq_id> pAccession;
2850 
2851  for (const auto& pId : bioseq.GetId()) {
2852  if (pId->IsGeneral()) {
2853  pGnlId = pId;
2854  continue;
2855  }
2856  if (pId->IsGenbank()) {
2857  pAccession = pId;
2858  }
2859  }
2860 
2861  if (pGnlId) {
2862  pGnlId->WriteAsFasta(ostr);
2863  }
2864 
2865  if (pAccession) {
2866  if (pGnlId) {
2867  ostr << '|';
2868  }
2869  pAccession->WriteAsFasta(ostr);
2870  }
2871 
2872  return (pAccession || pGnlId);
2873 }
2874 
2876 {
 // Writes the ID portion of a defline for `bioseq` to m_Out.
 //  - With fShowGnlAndAcc set, the general/accession pair is written by
 //    s_WriteGnlAndAcc() and, if that produced anything, nothing else is.
 //  - Otherwise the IDs chosen by x_GetBestId() are written: an optional
 //    "gi|...|" prefix followed by either the bare accession[.version]
 //    (when the prefix is hidden and the ID is a text ID) or the full
 //    FASTA form of the best ID.
 // NOTE(review): embedded line 2875 (the signature) is missing from this
 // listing.
2877 
2878  if ((m_Flags & fShowGnlAndAcc) &&
2879  s_WriteGnlAndAcc(bioseq, m_Out)) {
2880  return;
2881  }
2882 
2883  CConstRef<CSeq_id> best_id;
2884  CConstRef<CSeq_id> gi_id;
2885  bool hide_prefix = false;
2886 
2887  // override this method and provide application specific 'best id' policy
2888  x_GetBestId(gi_id, best_id, hide_prefix, bioseq);
2889 
2890  if (best_id.NotEmpty())
2891  {
2892  // RW-139, no GI in FASTA output
2893  if (gi_id.NotEmpty() && (m_Flags & fEnableGI) && !best_id->IsGi())
2894  {
2895  // FastA format
2896  // Here we have something like:
2897  // gi|###|SOME_ACCESSION|title
2898 
2899  gi_id->WriteAsFasta(m_Out);
2900  m_Out << '|';
2901  }
2902 
2903  const CTextseq_id* text_id = 0;
2904  if (hide_prefix)
2905  {
2906  text_id = best_id->GetTextseq_Id();
2907  }
2908 
 // text_id stays null unless the prefix is to be hidden; in that case
 // only accession and version are printed, and nothing at all is printed
 // when the accession is unset.
2909  if (text_id != 0)
2910  {
2911  if (text_id->IsSetAccession())
2912  {
2913  m_Out << text_id->GetAccession();
2914  if (text_id->IsSetVersion())
2915  {
2916  m_Out << "." << text_id->GetVersion();
2917  }
2918  }
2919  }
2920  else
2921  {
2922  best_id->WriteAsFasta(m_Out);
2923  }
2924  }
2925 }
2926 
2928  const CSeq_loc* location)
2929 {
 // Writes the '>' marker and the ID part of a defline, followed by the
 // range suffix when a non-whole location is given and fSuppressRange is
 // not set.  For whole-sequence output (unless fNoDupCheck is set) every
 // Seq-id is recorded in m_PreviousWholeIds and a repeat raises
 // CObjmgrUtilException (eBadLocation).
 // NOTE(review): embedded lines 2927 and 2935 (the signature start and
 // the declaration of `idh`) are missing from this listing.
2930  bool have_range = (location != NULL && !location->IsWhole()
2931  && !(m_Flags & fSuppressRange) );
2932 
2933  if ( !have_range && (m_Flags & fNoDupCheck) == 0) {
2934  ITERATE (CBioseq::TId, id, bioseq.GetId()) {
2936  pair<TSeq_id_HandleSet::iterator, bool> p
2937  = m_PreviousWholeIds.insert(idh);
2938  if ( !p.second ) {
2939  NCBI_THROW(CObjmgrUtilException, eBadLocation,
2940  "Duplicate Seq-id " + (*id)->AsFastaString()
2941  + " in FASTA output");
2942  }
2943  }
2944  }
2945 
2946  m_Out << '>';
 // Prefer the recorded original local ID when allowed and present;
 // otherwise fall back to the regular ID selection.
2947  if (!(m_Flags & fIgnoreOriginalID) &&
2948  s_ShouldUseOriginalID(bioseq)) {
2949  string origID = s_FastaGetOriginalID(bioseq);
2950  if (! NStr::IsBlank(origID)) {
2951  m_Out << "lcl|" << origID;
2952  } else {
2953  x_WriteAsFasta(bioseq);
2954  }
2955  } else {
2956  x_WriteAsFasta(bioseq);
2957  }
2958 
2959  if (have_range) {
 // Ranges are printed 1-based as ":from-to[,from-to...]"; reverse-strand
 // intervals are printed as "c<to>-<from>".
2960  char delim = ':';
2961  for (CSeq_loc_CI it(*location); it; ++it) {
2962  CSeq_loc::TRange range = it.GetRange();
2963  TSeqPos from = range.GetFrom() + 1, to = range.GetTo() + 1;
2964  _ASSERT(from <= to);
2965  m_Out << delim;
2966  if (it.IsSetStrand() && IsReverse(it.GetStrand())) {
2967  m_Out << 'c' << to << '-' << from;
2968  } else {
2969  m_Out << from << '-' << to;
2970  }
2971  delim = ',';
2972  }
2973  }
2974 }
2975 
2976 inline
2977 sequence::CDeflineGenerator::TUserFlags
2978 CFastaOstream::x_GetTitleFlags(void) const
2979 {
2980  sequence::TGetTitleFlags title_flags = 0;
2981  title_flags |= sequence::CDeflineGenerator::fFastaFormat;
2982 
2983  if ((m_Flags & fNoExpensiveOps) != 0) {
2984  title_flags |= sequence::CDeflineGenerator::fNoExpensiveOps;
2985  }
2986  if ((m_Flags & fShowModifiers) != 0) {
2987  title_flags |= sequence::CDeflineGenerator::fShowModifiers;
2988  }
2989  if ((m_Flags & fDoNotUseAutoDef) != 0) {
2990  title_flags |= sequence::CDeflineGenerator::fDoNotUseAutoDef;
2991  }
2992  /*
2993  if ((m_Flags & fDoNotUseAutoDef) == 0) {
2994  title_flags |= sequence::CDeflineGenerator::fUseAutoDef;
2995  }
2996  */
2997  return title_flags;
2998 }
2999 
3000 void CFastaOstream::x_WriteSeqTitle(const CBioseq_Handle & bioseq_handle,
3001  const string& custom_title)
3002 {
3003  string safe_title = (!custom_title.empty()) ? custom_title
3004  : m_Gen->GenerateDefline(bioseq_handle, x_GetTitleFlags());
3005 
3006  if ( !safe_title.empty() ) {
3007  if ( !(m_Flags & fKeepGTSigns) ) {
3008  NStr::ReplaceInPlace(safe_title, ">", "_");
3009  }
3010  if (safe_title[0] != ' ') {
3011  m_Out << ' ';
3012  }
3013 
3014  if ((m_Flags & fHTMLEncode) != 0) {
3015  safe_title = NStr::HtmlEncode(safe_title);
3016  }
3017  m_Out << safe_title;
3018  }
3019  m_Out << '\n';
3020 }
3021 
3022 void CFastaOstream::WriteTitle(const CBioseq& bioseq,
3023  const CSeq_loc* location,
3024  bool no_scope, // not used
3025  const string& custom_title)
3026 {
3027  x_WriteSeqIds(bioseq, location);
3028  CScope scope(*CObjectManager::GetInstance());
3029  CBioseq_Handle bioseq_handle = scope.AddBioseq(bioseq);
3030  x_WriteSeqTitle(bioseq_handle, custom_title);
3031 }
3032 
3033 void CFastaOstream::WriteTitle(const CBioseq_Handle& bioseq_handle,
3034  const CSeq_loc* location,
3035  const string& custom_title)
3036 {
3037  const CBioseq& bioseq = *bioseq_handle.GetBioseqCore();
3038  x_WriteSeqIds(bioseq, location);
3039  x_WriteSeqTitle(bioseq_handle, custom_title);
3040 }
3041 
3042 
3043 CConstRef<CSeq_loc> CFastaOstream::x_MapMask(CSeq_loc_Mapper& mapper,
3044  const CSeq_loc& mask,
3045  const CSeq_id* base_seq_id,
3046  CScope* scope)
3047 {
3048  CConstRef<CSeq_loc> mapped_mask(&mask);
3049 
3050  // Mapping down requires the higher-level ID as a reference, even
3051  // when given a scope, and as such should precede mapping up to
3052  // keep sequence::GetId from bombing out.
3053  if ((m_Flags & fMapMasksDown) != 0 && scope) {
3054  try {
3055  CSeq_loc_Mapper mapper_down
3056  (scope->GetBioseqHandle(sequence::GetId(*mapped_mask, scope)),
3057  CSeq_loc_Mapper::eSeqMap_Down);
3058  mapped_mask = mapped_mask->Add(*mapper_down.Map(*mapped_mask),
3059  CSeq_loc::fSortAndMerge_All, 0);
3060  } catch (CObjmgrUtilException&) {
3061  }
3062  }
3063  if ((m_Flags & fMapMasksUp) != 0 && scope && base_seq_id) {
3064  CSeq_loc_Mapper mapper_up(scope->GetBioseqHandle(*base_seq_id),
3065  CSeq_loc_Mapper::eSeqMap_Up);
3066  mapped_mask = mapped_mask->Add(*mapper_up.Map(*mapped_mask),
3067  CSeq_loc::fSortAndMerge_All, 0);
3068  }
3069  mapped_mask = mapper.Map(*mapped_mask);
3070  return mapped_mask;
3071 }
3072 
3073 
3074 void CFastaOstream::x_GetMaskingStates(TMSMap& masking_state,
3075  const CSeq_id* base_seq_id,
3076  const CSeq_loc* location,
3077  CScope* scope)
3078 {
3079  CRef<CSeq_loc_Mapper> mapper;
3080  CBioseq_Handle bsh;
3081 
3082  if (m_SoftMask.NotEmpty() || m_HardMask.NotEmpty()) {
3083  _ASSERT(base_seq_id);
3084  if (location) {
3085  CSeq_loc loc2;
3086  try {
3087  TSeqPos length = sequence::GetLength(*location, scope);
3088  loc2.SetInt().SetId().Assign(*base_seq_id);
3089  loc2.SetInt().SetFrom(0);
3090  loc2.SetInt().SetTo(length - 1);
3091  } catch (exception&) {
3092  loc2.SetWhole().Assign(*base_seq_id);
3093  }
3094  mapper.Reset(new CSeq_loc_Mapper(*location, loc2, scope));
3095  } else {
3096  // still useful for filtering out locations on other sequences
3097  CSeq_loc whole;
3098  whole.SetWhole().Assign(*base_seq_id);
3099  mapper.Reset(new CSeq_loc_Mapper(whole, whole, scope));
3100  }
3101  mapper->SetMergeAll();
3102  mapper->TruncateNonmappingRanges();
3103 
3104  if (scope && (m_Flags & (fMapMasksUp | fMapMasksDown))) {
3105  bsh = scope->GetBioseqHandle(*base_seq_id);
3106  }
3107 
3108  const CSeq_loc& mask = m_SoftMask ? *m_SoftMask : *m_HardMask;
3109  int type = m_SoftMask ? eSoftMask : eHardMask;
3110  CConstRef<CSeq_loc> mapped_mask = x_MapMask(*mapper, mask, base_seq_id,
3111  scope);
3112 
3113  masking_state[0] = 0;
3114  for (CSeq_loc_CI it(*mapped_mask); it; ++it) {
3115  CSeq_loc_CI::TRange loc_range = it.GetRange();
3116  masking_state[loc_range.GetFrom()] = type;
3117  masking_state[loc_range.GetToOpen()] = 0;
3118  }
3119  }
3120 
3121  if (m_SoftMask.NotEmpty() && m_HardMask.NotEmpty()) {
3122  CConstRef<CSeq_loc> mapped_mask = x_MapMask(*mapper, *m_HardMask,
3123  base_seq_id, scope);
3124  for (CSeq_loc_CI it(*mapped_mask); it; ++it) {
3125  CSeq_loc_CI::TRange loc_range = it.GetRange();
3126  TSeqPos from = loc_range.GetFrom();
3127  TSeqPos to = loc_range.GetToOpen();
3128  TMSMap::iterator ms_it = masking_state.lower_bound(from);
3129  int prev_state;
3130 
3131  if (ms_it == masking_state.end()) {
3132  masking_state[loc_range.GetFrom()] = eHardMask;
3133  masking_state[loc_range.GetToOpen()] = 0;
3134  continue;
3135  } else if (ms_it->first == from) {
3136  prev_state = ms_it->second;
3137  ms_it->second |= eHardMask;
3138  } else {
3139  // NB: lower_bound's name is misleading, as it actually
3140  // returns the least element whose key >= from.
3141  _ASSERT(ms_it != masking_state.begin());
3142  TMSMap::iterator prev_it = ms_it;
3143  --prev_it;
3144  prev_state = prev_it->second;
3145  TMSMap::value_type value(from, prev_state | eHardMask);
3146 
3147  // Add the new element (using ms_it as a position hint),
3148  // and repoint ms_it at it so that the below loop will
3149  // start at the correct position.
3150  ms_it = masking_state.insert(ms_it, value);
3151  }
3152  while (++ms_it != masking_state.end() && ms_it->first < to) {
3153  prev_state = ms_it->second;
3154  ms_it->second |= eHardMask;
3155  }
3156  if (ms_it == masking_state.end() || ms_it->first != to) {
3157  masking_state.insert(ms_it, TMSMap::value_type(to, prev_state));
3158  }
3159  }
3160  }
3161 }
3162 
3163 
3165  const TMSMap& masking_state)
3166 {
 // Writes the residues of `vec` to m_Out in lines of at most m_Width
 // characters, applying the masking states from `masking_state` and
 // rendering gaps according to m_GapMode / fInstantiateGaps.
 //   rem_line      - characters still free on the current output line
 //   rem_state     - residues left before the next masking transition
 //   current_state - masking state in effect (eSoftMask/eHardMask bits)
 // NOTE(review): embedded lines 3164, 3220 and 3236 (the signature start,
 // one statement in the eGM_count branch and the initialiser of
 // pGapLiteral) are missing from this listing.
3167  TSeqPos rem_line = m_Width;
3168  CSeqVector_CI it(vec);
3169  TMSMap::const_iterator ms_it = masking_state.begin();
3170  TSeqPos rem_state
3171  = (ms_it == masking_state.end() ? numeric_limits<TSeqPos>::max()
3172  : ms_it->first);
3173  int current_state = 0;
 // Pre-filled filler strings of m_Width characters used for hard-masked
 // stretches and for gaps rendered in the non-native style.
3174  CTempString uc_hard_mask_str
3175  (vec.IsProtein() ? m_UC_Xs.get() : m_UC_Ns.get(), m_Width);
3176  CTempString lc_hard_mask_str
3177  (vec.IsProtein() ? m_LC_Xs.get() : m_LC_Ns.get(), m_Width);
3178  EGapMode native_gap_mode
3179  = ((vec.GetGapChar() == '-') ? eGM_dashes : eGM_letters);
3180  CTempString alt_gap_str;
3181 
3182  if (native_gap_mode == eGM_dashes) {
3183  alt_gap_str = uc_hard_mask_str;
3184  } else {
3185  alt_gap_str.assign(m_Dashes.get(), m_Width);
3186  }
3187 
3188  if ((m_Flags & fReverseStrand) != 0) {
3189  it.SetStrand(Reverse(it.GetStrand()));
3190  }
3191 
3192  while ( it ) {
3193  if (rem_state == 0) {
 // Reached a masking transition: pick up the new state and compute the
 // distance to the following one.
3194  _ASSERT(ms_it->first == it.GetPos());
3195  current_state = ms_it->second;
3196  if (++ms_it == masking_state.end()) {
3197  rem_state = numeric_limits<TSeqPos>::max();
3198  } else {
3199  rem_state = ms_it->first - it.GetPos();
3200  }
3201  }
3202  if( (m_Flags & fShowGapsOfSizeZero) != 0 &&
3203  it.HasZeroGapBefore() )
3204  {
3205  m_Out << "-\n";
3206  rem_line = m_Width;
3207  }
 // Gaps get special treatment only when they are not to be written in
 // the vector's native gap character (or not instantiated at all).
3208  if ((m_GapMode != native_gap_mode || (m_Flags & fInstantiateGaps) == 0)
3209  && it.GetGapSizeForward())
3210  {
3211  TSeqPos gap_size = it.GetGapSizeForward();
3212  if (m_GapMode == eGM_one_dash
3213  || (m_Flags & fInstantiateGaps) == 0) {
3214  m_Out << "-\n";
3215  rem_line = m_Width;
3216  } else if (m_GapMode == eGM_count) {
3217  if (rem_line < m_Width) {
3218  m_Out << '\n';
3219  }
3221  if (it.GetCurrentSeqMap_CI().IsUnknownLength()) {
3222  // conventional designation, regardless of nominal length
3223  if( gap_size > 0 && (m_Flags & fKeepUnknGapNomLen) != 0 )
3224  {
3225  m_Out << ">?unk" << gap_size;
3226  } else {
3227  m_Out << ">?unk100";
3228  }
3229  } else {
3230  m_Out << ">?" << gap_size;
3231  }
3232  // print gap mods, if requested
3233  if( (m_Flags & fShowGapModifiers) != 0 )
3234  {
3235  CConstRef<CSeq_literal> pGapLiteral =
3237  if( pGapLiteral &&
3238  FIELD_IS_SET_AND_IS(*pGapLiteral, Seq_data, Gap) )
3239  {
3240  const CSeq_gap & seq_gap =
3241  pGapLiteral->GetSeq_data().GetGap();
3242  SGapModText gap_mod_text;
3243  GetGapModText(seq_gap, gap_mod_text);
3244 
3245  CNcbiOstrstream gap_mod_strm;
3246  gap_mod_text.WriteAllModsAsFasta(gap_mod_strm);
3247  const string sGapModText =
3248  CNcbiOstrstreamToString(gap_mod_strm);
3249  if( ! sGapModText.empty() ) {
3250  m_Out << ' ' << sGapModText;
3251  }
3252  }
3253  }
3254  m_Out << '\n';
3255  rem_line = m_Width;
3256  } else {
 // Write the gap as filler characters, wrapping at the line width.
3257  TSeqPos rem_gap = gap_size;
3258  while (rem_gap >= rem_line) {
3259  x_WriteBuffer(alt_gap_str.data(), rem_line);
3260  m_Out << '\n';
3261  rem_gap -= rem_line;
3262  rem_line = m_Width;
3263  }
3264  if (rem_gap > 0) {
3265  x_WriteBuffer(alt_gap_str.data(), rem_gap);
3266  rem_line -= rem_gap;
3267  }
3268  }
3269  it.SkipGap();
 // Resynchronise the masking cursor with the position after the gap.
3270  if (rem_state >= gap_size) {
3271  rem_state -= gap_size;
3272  } else {
3273  while (++ms_it != masking_state.end()
3274  && ms_it->first < it.GetPos()) {
3275  current_state = ms_it->second;
3276  }
3277  if (ms_it == masking_state.end()) {
3278  rem_state = numeric_limits<TSeqPos>::max();
3279  } else {
3280  rem_state = ms_it->first - it.GetPos();
3281  }
3282  }
3283  } else {
 // Ordinary residues: take as much of the iterator's buffer as lies
 // before the next masking transition.
3284  TSeqPos count = min(TSeqPos(it.GetBufferSize()), rem_state);
3285  TSeqPos new_pos = it.GetPos() + count;
3286  const char* ptr = it.GetBufferPtr();
3287  string lc_buffer;
3288 
3289  rem_state -= count;
3290  if (current_state & eHardMask) {
3291  ptr = (current_state & eSoftMask) ? lc_hard_mask_str.data()
3292  : uc_hard_mask_str.data();
3293  } else if (current_state & eSoftMask) {
3294  // ToLower() always operates in place. :-/
3295  lc_buffer.assign(ptr, count);
3296  NStr::ToLower(lc_buffer);
3297  ptr = lc_buffer.data();
3298  }
3299  while ( count >= rem_line ) {
3300  x_WriteBuffer(ptr, rem_line);
 // The hard-mask filler is only m_Width long, so its pointer is never
 // advanced; real sequence data is.
3301  if ( !(current_state & eHardMask) ) {
3302  ptr += rem_line;
3303  }
3304  count -= rem_line;
3305  m_Out << '\n';
3306  rem_line = m_Width;
3307  }
3308  if ( count > 0 ) {
3309  x_WriteBuffer(ptr, count);
3310  rem_line -= count;
3311  }
3312  it.SetPos(new_pos);
3313  }
3314  }
 // Terminate a partially filled last line.
3315  if ( rem_line < m_Width ) {
3316  m_Out << '\n';
3317  }
3318  // m_Out << NcbiFlush;
3319 }
3320 
3321 
3323  const CSeq_loc* location,
3324  const CSeq_loc::EOpFlags merge_flags)
3325 
3326 {
 // Writes the sequence data (no defline) for `handle`, optionally
 // restricted to `location`.
 //  - Without fAssembleParts, a sequence with no own Seq-data is written
 //    only if its whole seq-map can be resolved; otherwise nothing is
 //    written.
 //  - A location that fails sequence::SeqLocCheck() raises
 //    CObjmgrUtilException (eBadLocation).
 // NOTE(review): embedded lines 3322, 3341, 3351 and 3354 (signature
 // start, the rest of the SeqLocCheck condition, the whole-sequence vector
 // initialisation and the protein-coding statement) are missing from this
 // listing.
3327  vector<CTSE_Handle> used_tses;
3328  if ( !(m_Flags & fAssembleParts) && !handle.IsSetInst_Seq_data() ) {
3329  SSeqMapSelector sel(CSeqMap::fFindInnerRef, (size_t)-1);
3330  sel.SetLinkUsedTSE(handle.GetTSE_Handle());
3331  sel.SetLinkUsedTSE(used_tses);
3332  if ( !handle.GetSeqMap().CanResolveRange(&handle.GetScope(), sel) ) {
3333  return;
3334  }
3335  }
3336 
3337  CScope& scope = handle.GetScope();
3338  CSeqVector v;
3339  if (location) {
3340  if (sequence::SeqLocCheck(*location, &scope)
3342  string label;
3343  location->GetLabel(&label);
3344  NCBI_THROW(CObjmgrUtilException, eBadLocation,
3345  "CFastaOstream: location out of range: " + label);
3346  }
3347  CRef<CSeq_loc> merged
3348  = sequence::Seq_loc_Merge(*location, merge_flags, &scope);
3349  v = CSeqVector(*merged, scope, CBioseq_Handle::eCoding_Iupac);
3350  } else {
3352  }
3353  if (v.IsProtein()) { // allow extensions
3355  }
3356 
3357  TMSMap masking_state;
3358  if (m_SoftMask.NotEmpty() || m_HardMask.NotEmpty()) {
3359  x_GetMaskingStates(masking_state, handle.GetSeqId(), location, &scope);
3360  }
3361  x_WriteSequence(v, masking_state);
3362 }
3363 
3364 
3366  bool no_scope)
3367 {
 // Writes every Bioseq of a Seq-entry.  With a location, or when
 // `no_scope` is false, the entry is added to `scope` and written through
 // the handle-based Write(); otherwise the entry is walked directly and
 // each Bioseq / nested entry is written without a scope.  An entry that
 // is neither a Bioseq nor a set is silently ignored.
 // NOTE(review): embedded lines 3365 and 3369 (the signature start and
 // the declaration of `scope`) are missing from this listing.
3368  if (location || !no_scope) {
3370  Write(scope.AddTopLevelSeqEntry(entry), location);
3371  } else {
3372  switch (entry.Which()) {
3373  case CSeq_entry::e_Seq:
3374  Write(entry.GetSeq(), location, no_scope);
3375  break;
3376  case CSeq_entry::e_Set:
3377  ITERATE (CBioseq_set::TSeq_set, it, entry.GetSet().GetSeq_set()) {
3378  Write(**it, location, no_scope);
3379  }
3380  break;
3381  default:
3382  // throw
3383  break;
3384  }
3385  }
3386 }
3387 
3388 
3390  bool no_scope, const string& custom_title )
3391 {
 // Writes one bare Bioseq (defline plus sequence).  With a location, or
 // when `no_scope` is false, the work is delegated to the handle-based
 // Write(); otherwise the defline is written directly and the residues
 // are obtained from a sequence vector `vec`.
 // NOTE(review): embedded lines 3389, 3392, 3410-3411, 3425, 3427, 3432,
 // 3434 and 3436 are missing from this listing (signature start, the
 // declarations of `scope` and of both `vec` objects, a case label with
 // its loop header, and the protein-coding statements).
3393  CBioseq_Handle bioseq_handle = scope.AddBioseq(seq);
3394  if (location || !no_scope) {
3395  Write(bioseq_handle, location, custom_title);
3396  } else {
3397  /// write our title
3398  x_WriteSeqIds(seq, NULL);
3399  x_WriteSeqTitle(bioseq_handle, custom_title);
3400 
3401  /// write the sequence
3402  TMSMap masking_state;
 // NOTE(review): base_seq_id is passed as NULL here, while
 // x_GetMaskingStates() asserts it is non-null whenever a mask is set —
 // confirm masks are never combined with this code path.
3403  x_GetMaskingStates(masking_state, NULL, NULL, NULL);
3404 
3405  /// check to see if all of our segments are resolvable
3406  bool is_raw = true;
3407  switch (seq.GetInst().GetRepr()) {
3408  case CSeq_inst::eRepr_raw:
3409  break;
3412  seq.GetInst().GetExt().GetDelta().Get()) {
3413  if ((*iter)->Which() == CDelta_seq::e_Loc) {
3414  is_raw = false;
3415  break;
3416  }
3417  }
3418  break;
3419  default:
3420  is_raw = false;
3421  break;
3422  }
3423 
3424  if (is_raw) {
3426  if (vec.IsProtein()) { // allow extensions
3428  }
3429  x_WriteSequence(vec, masking_state);
3430  } else {
3431  /// we require far-pointer resolution
3433  CBioseq_Handle bsh = scope.AddBioseq(seq);
3435  if (vec.IsProtein()) {
3437  }
3438  x_WriteSequence(vec, masking_state);
3439  }
3440  }
3441 }
3442 
3443 
3445 {
 // Returns the soft mask when `type` is eSoftMask, otherwise the hard
 // mask.
 // NOTE(review): embedded line 3444 (the signature) is missing from this
 // listing.
3446  return (type == eSoftMask) ? m_SoftMask : m_HardMask;
3447 }
3448 
3449 
3451 {
 // NOTE(review): embedded lines 3450 and 3452 (the signature and the only
 // statement of this function) are missing from this listing; restore
 // them from the original source before relying on this definition.
3453 }
3454 
3455 
3457 {
 // Stores the new line width and rebuilds the five `width`-character
 // filler buffers (dashes, lower/upper-case N and X).  The buffers are not
 // NUL-terminated.
 // NOTE(review): embedded line 3456 (the signature) is missing from this
 // listing.
3458  m_Width = width;
3459  m_Dashes.reset(new char[width]); memset(m_Dashes.get(), '-', width);
3460  m_LC_Ns .reset(new char[width]); memset(m_LC_Ns .get(), 'n', width);
3461  m_LC_Xs .reset(new char[width]); memset(m_LC_Xs .get(), 'x', width);
3462  m_UC_Ns .reset(new char[width]); memset(m_UC_Ns .get(), 'N', width);
3463  m_UC_Xs .reset(new char[width]); memset(m_UC_Xs .get(), 'X', width);
3464 }
3465 
// Writes the collected gap modifiers to `out` in FASTA modifier syntax:
// "[gap-type=...]" and/or "[linkage-evidence=a;b;...]", separated by one
// space when both are present.  Nothing is written when both are empty.
// NOTE(review): embedded line 3467 (the qualified function name and the
// start of the parameter list) is missing from this listing.
3466 void
3468  CNcbiOstream & out ) const
3469 {
3470  string sPrefix;
3471  if( ! gap_type.empty() ) {
3472  out << sPrefix << "[gap-type=" << gap_type << ']';
3473  sPrefix = " ";
3474  }
3475  if( ! gap_linkage_evidences.empty() ) {
3476  out << sPrefix << "[linkage-evidence=" << NStr::Join(gap_linkage_evidences, ";") << ']';
3477  sPrefix = " ";
3478  }
3479 }
3480 
3481 // static
// Fills `out_gap_info` with the textual gap type and the list of linkage
// evidence strings derived from `seq_gap`.  Both output fields are cleared
// first.  Evidence is forced to "unspecified" when the gap type requires
// evidence but none is present, and discarded when the gap type does not
// allow it.
// NOTE(review): this listing is missing embedded line 3483 (the function
// name), line 3505 (the rest of the is_linkage initialiser), line 3568
// (the evidence loop header) and most `case` labels in both switch
// statements; the mapping from enum value to text below is therefore only
// partly visible.
3481 // static
3482 void
3484  const CSeq_gap & seq_gap,
3485  SGapModText & out_gap_info )
3486 {
3487  // convenience references
3488  string & gap_type = out_gap_info.gap_type;
3489  vector<string> & gap_linkage_evidences =
3490  out_gap_info.gap_linkage_evidences;
3491 
3492  // make sure initialized
3493  gap_type.clear();
3494  gap_linkage_evidences.clear();
3495 
3496  // true if we need to have a /linkage-evidence tag.
3497  // Also, if this is false, we should *not* have any
3498  // linkage-evidence tag
3499  bool need_evidence = false;
3500 
3501  // determine if we're linked, and also determine if
3502  // we need linkage-evidence
3503  bool is_linkage =
3504  seq_gap.CanGetLinkage() &&
3506 
3507  if ( seq_gap.IsSetLinkage_evidence() ) {
3508  is_linkage = true; /* do not rely solely on Seq-gap.linkage, which is not always set correctly */
3509  }
3510 
3511  // For /gap_type qual
3512  if( seq_gap.CanGetType() ) {
3513  switch( seq_gap.GetType() ) {
3515  // don't show /gap_type - policy changed at SQD-1801
 // NOTE(review): the comment above says the gap type is not shown, yet
 // the code below sets it to "unknown" — confirm which is intended.
3516  gap_type = "unknown";
3517  need_evidence = is_linkage;
3518  break;
3520  gap_type = "within scaffold";
3521  need_evidence = true;
3522  break;
3523  case CSeq_gap::eType_clone:
3524  gap_type = ( is_linkage ? "within scaffold" : "between scaffolds" );
3525  need_evidence = is_linkage;
3526  break;
3528  gap_type = "short arm";
3529  break;
3531  gap_type = "heterochromatin";
3532  break;
3534  gap_type = "centromere";
3535  break;
3537  gap_type = "telomere";
3538  break;
3540  gap_type = ( is_linkage ?
3541  "repeat within scaffold" :
3542  "repeat between scaffolds" );
3543  need_evidence = is_linkage;
3544  break;
3546  gap_type = "between scaffolds";
3547  break;
3549  gap_type = "within scaffold";
3550  need_evidence = is_linkage;
3551  break;
3553  gap_type = "contamination";
3554  need_evidence = is_linkage;
3555  break;
3556  case CSeq_gap::eType_other:
3557  gap_type = "other";
3558  break;
3559  default:
3560  gap_type = "(ERROR: UNRECOGNIZED_GAP_TYPE:" +
3561  NStr::IntToString(seq_gap.GetType()) + ")";
3562  break;
3563  }
3564  }
3565 
3566  // For linkage evidence
3567  if( seq_gap.CanGetLinkage_evidence() ) {
3569  evidence_iter,
3570  seq_gap.GetLinkage_evidence() )
3571  {
3572  const CLinkage_evidence & evidence = **evidence_iter;
3573  if( evidence.CanGetType() ) {
3574  switch( evidence.GetType() ) {
3576  gap_linkage_evidences.push_back("paired-ends");
3577  break;
3579  gap_linkage_evidences.push_back("align genus");
3580  break;
3582  gap_linkage_evidences.push_back("align xgenus");
3583  break;
3585  gap_linkage_evidences.push_back("align trnscpt");
3586  break;
3588  gap_linkage_evidences.push_back("within clone");
3589  break;
3591  gap_linkage_evidences.push_back("clone contig");
3592  break;
3594  gap_linkage_evidences.push_back("map");
3595  break;
3597  gap_linkage_evidences.push_back("strobe");
3598  break;
3600  gap_linkage_evidences.push_back("unspecified");
3601  break;
3603  gap_linkage_evidences.push_back("pcr");
3604  break;
3606  gap_linkage_evidences.push_back("proximity ligation");
3607  break;
3609  gap_linkage_evidences.push_back("other");
3610  break;
3611  default:
3612  gap_linkage_evidences.push_back("(UNRECOGNIZED LINKAGE EVIDENCE:" +
3613  NStr::IntToString( evidence.GetType() ) + ")");
3614  break;
3615  }
3616  }
3617  }
3618  }
3619 
 // Reconcile the evidence list with what the gap type calls for.
3620  if( need_evidence && gap_linkage_evidences.empty() ) {
3621  gap_linkage_evidences.push_back("unspecified");
3622  } else if( ! need_evidence && ! gap_linkage_evidences.empty() ) {
3623  // This case shouldn't happen if the validator is checking
3624  // records first.
3625  gap_linkage_evidences.clear();
3626  }
3627 }
3628 
3629 /////////////////////////////////////////////////////////////////////////////
3630 //
3631 // sequence translation
3632 //
3633 
3634 
// Translates the nucleotide container `seq` into the protein string
// `prot`.
//   frame               number of leading bases to skip before the first
//                       codon
//   code                genetic code (its use is on lines not visible
//                       here)
//   is_5prime_complete  with frame 0, translate the first codon as a
//                       start codon; also enables the alt_start report
//   is_3prime_complete  together with include_stop, lets a final codon
//                       that can act as a stop be shown as '*'
//   include_stop        when false, `prot` is cut at the first '*'
//   remove_trailing_X   strip trailing 'X' residues
//   alt_start           optional out-flag: set only when
//                       is_5prime_complete, to whether the first codon is
//                       an alternative start
// A trailing partial codon is padded with 'N' and its residue kept only
// if it is unambiguous (not 'X'), except when it is the very first codon
// of a start-checked translation.
// NOTE(review): embedded lines 3654-3655 (the initialiser of `tbl`) are
// missing from this listing.
// NOTE(review): `frame` is a signed int compared against seq.size();
// presumably callers never pass a negative frame — confirm.
3635 template <class Container>
3636 void x_Translate(const Container& seq,
3637  string& prot,
3638  int frame,
3639  const CGenetic_code* code,
3640  bool is_5prime_complete,
3641  bool is_3prime_complete,
3642  bool include_stop,
3643  bool remove_trailing_X,
3644  bool* alt_start)
3645 {
3646  // reserve our space
3647  const size_t usable_size = seq.size() > frame ? seq.size() - frame : 0;
3648  const size_t mod = usable_size % 3;
3649  prot.erase();
3650  prot.reserve((usable_size + 2) / 3);
3651 
3652  // get appropriate translation table
3653  const CTrans_table & tbl =
3656 
3657  char aa = '\0';
3658  int state = 0;
3659  int start_state = 0;
3660  try {
3661  // main loop through bases
3662  typename Container::const_iterator start = seq.begin();
3663  {{
3664  for (int i = 0; i < frame; ++i) {
3665  ++start;
3666  }
3667  }}
3668 
3669  size_t i;
3670  size_t k;
3671  size_t length = usable_size / 3;
3672  bool check_start = (is_5prime_complete && frame == 0);
3673  bool first_time = true;
3674 
3675  for (i = 0; i < length; ++i) {
3676 
3677  // loop through one codon at a time
3678  for (k = 0; k < 3; ++k, ++start) {
3679  state = tbl.NextCodonState(state, *start);
3680  }
3681 
 // Remember the state of the first codon for the alt-start check below.
3682  if (first_time) {
3683  start_state = state;
3684  }
3685 
3686  // save translated amino acid
3687  if (first_time && check_start) {
3688  aa = tbl.GetStartResidue(state);
3689  prot.append(1, aa);
3690  } else {
3691  aa = tbl.GetCodonResidue(state);
3692  prot.append(1, aa);
3693  }
3694 
3695  first_time = false;
3696  }
3697 
3698  if (mod) {
 // Partial last codon: feed the remaining bases, then pad with 'N'.
3699  for (k = 0; k < mod; ++k, ++start) {
3700  state = tbl.NextCodonState(state, *start);
3701  }
3702 
3703  for (; k < 3; ++k) {
3704  state = tbl.NextCodonState(state, 'N');
3705  }
3706 
3707  if (first_time) {
3708  start_state = state;
3709  }
3710 
3711  // save translated amino acid
3712  char c = tbl.GetCodonResidue(state);
3713  if (first_time && check_start) {
3714  aa = tbl.GetStartResidue(state);
3715  prot.append(1, aa);
3716  } else if (c != 'X') {
3717  // if padding was needed, trim ambiguous last residue
3718  aa = tbl.GetCodonResidue(state);
3719  prot.append(1, aa);
3720  }
3721  }
3722  } catch (CSeqVectorException& /*ex*/) {
3723  // ran out of sequence
3724  }
3725 
3726  if ( aa != '*' && include_stop && (! mod) && prot.size() > 0 && is_3prime_complete ) {
3727  // check for stop codon that normally encodes an amino acid
3728  aa = tbl.GetStopResidue(state);
3729  if (aa == '*') {
3730  prot[prot.size()-1] = aa;
3731  }
3732  }
3733 
3734  // check for alternative start codon
3735  if (alt_start && is_5prime_complete) {
3736  if ( tbl.IsAltStart(start_state) ) {
3737  *alt_start = true;
3738  } else {
3739  *alt_start = false;
3740  }
3741  }
3742 
3743  if ( !include_stop ) {
3744  SIZE_TYPE sz = prot.find_first_of("*");
3745  if (sz != string::npos) {
3746  prot.resize(sz);
3747  }
3748  }
3749 
3750  if (remove_trailing_X) {
3751  SIZE_TYPE sz;
 // Empty loop body: only locates the end of the non-'X' prefix.
3752  for (sz = prot.size(); sz > 0 && prot[sz - 1] == 'X'; --sz) {
3753  }
3754  prot.resize(sz);
3755  }
3756 
3757  /**
3758  cerr << "source: ";
3759  ITERATE (typename Container, it, seq) {
3760  cerr << *it;
3761  }
3762  cerr << endl;
3763  cerr << "xlate: ";
3764  ITERATE (string, it, prot) {
3765  cerr << *it;
3766  }
3767  cerr << endl;
3768  **/
3769 }
3771 
3772 static void AddAAToDeltaSeq (CRef<CBioseq> prot, char residue)
3773 {
3774  if (prot->SetInst().SetExt().SetDelta().Set().empty()
3775  || prot->GetInst().GetExt().GetDelta().Get().back()->GetLiteral().GetSeq_data().IsGap()) {
3776  // either first seg or transitioning from gap, need new seg
3777  CRef<CDelta_seq> seg(new CDelta_seq());
3778  seg->SetLiteral().SetLength(0);
3779  prot->SetInst().SetExt().SetDelta().Set().push_back(seg);
3780  }
3781 
3782  CRef<CDelta_seq> last = prot->SetInst().SetExt().SetDelta().Set().back();
3783 
3784  if (residue == '*' || residue == '-') {
3785  // found a residue that is not part of the IUPACAA alphabet, must convert to NCBIEAA
3786  if (last->IsLiteral() && last->GetLiteral().IsSetSeq_data() && last->GetLiteral().GetSeq_data().IsIupacaa()) {
3787  // convert to ncbieaa
3788  string current = last->GetLiteral().GetSeq_data().GetIupacaa().Get();
3789  last->SetLiteral().SetSeq_data().SetNcbieaa().Set(current);
3790  }
3791  // add *
3792  last->SetLiteral().SetSeq_data().SetNcbieaa().Set().append(1, residue);
3793  } else if (last->IsLiteral() && last->GetLiteral().IsSetSeq_data() && last->GetLiteral().GetSeq_data().IsNcbieaa()) {
3794  // already using NCBIEAA, must continue to do so
3795  last->SetLiteral().SetSeq_data().SetNcbieaa().Set().append(1, residue);
3796  } else {
3797  // so far, have not found residues that are not part of IUPACAA, can continue to use IUPACAA
3798  last->SetLiteral().SetSeq_data().SetIupacaa().Set().append(1, residue);
3799  }
3800 
3801  TSeqPos len = last->GetLiteral().GetLength();
3802  last->SetLiteral().SetLength(len + 1);
3803 }
3804 
3805 
3806 static void AddGapToDeltaSeq (CRef<CBioseq>prot, bool unknown_length, TSeqPos add_len)
3807 {
3808  if (prot->SetInst().SetExt().SetDelta().Set().empty()) {
3809  // create new segment for gap
3810  CRef<CDelta_seq> new_seg(new CDelta_seq());
3811  new_seg->SetLiteral().SetSeq_data().SetGap().SetType(CSeq_gap::eType_unknown);
3812  new_seg->SetLiteral().SetLength(add_len);
3813  if (unknown_length) {
3814  new_seg->SetLiteral().SetFuzz().SetLim(CInt_fuzz::eLim_unk);
3815  }
3816  prot->SetInst().SetExt().SetDelta().Set().push_back(new_seg);
3817  } else {
3818  CRef<CDelta_seq> last = prot->SetInst().SetExt().SetDelta().Set().back();
3819  if (last->SetLiteral().GetSeq_data().IsGap()
3820  && ((unknown_length && last->SetLiteral().IsSetFuzz())
3821  || (!unknown_length && !last->SetLiteral().IsSetFuzz()))) {
3822  // ok, already creating gap segment with correct fuzz
3823  TSeqPos len = prot->GetInst().GetExt().GetDelta().Get().back()->GetLiteral().GetLength();
3824  prot->SetInst().SetExt().SetDelta().Set().back()->SetLiteral().SetLength(len + add_len);
3825  } else {
3826  // create new segment for gap
3827  CRef<CDelta_seq> new_seg(new CDelta_seq());
3828  new_seg->SetLiteral().SetSeq_data().SetGap().SetType(CSeq_gap::eType_unknown);
3829  new_seg->SetLiteral().SetLength(add_len);
3830  if (unknown_length) {
3831  new_seg->SetLiteral().SetFuzz().SetLim(CInt_fuzz::eLim_unk);
3832  }
3833  prot->SetInst().SetExt().SetDelta().Set().push_back(new_seg);
3834  }
3835  }
3836 }
3837 
3838 
3840  CScope& scope)
3841 {
      // Builds a protein CBioseq for the coding region `cds`: the product is
      // assembled as a delta sequence (residue literals interleaved with gap
      // segments), code-break exceptions are applied, a trailing '*' is removed,
      // and a single-segment result is converted to a raw sequence.  A null
      // reference is returned when the resulting length is 0.
      // NOTE(review): the first line of the signature and a few statements are
      // not visible in this listing; comments below describe only what is shown.
3842  const CGenetic_code* code = NULL;
3843  int frame = 0;
3844  if (cds.GetData().IsCdregion()) {
3845  const CCdregion& cdr = cds.GetData().GetCdregion();
3846  if (cdr.IsSetFrame()) {
3847  switch (cdr.GetFrame()) {
3848  case CCdregion::eFrame_two:
3849  frame = 1;
3850  break;
3852  frame = 2;
3853  break;
3854  default:
3855  break;
3856  }
3857  }
3858  if (cdr.IsSetCode()) {
3859  code = &cdr.GetCode();
3860  }
3861  }
3862  bool is_5prime_complete = !cds.GetLocation().IsPartialStart(eExtreme_Biological);
3863 
3866  map.Reset(&seq.GetSeqMap());
3867 
3868  CRef<CBioseq> prot(new CBioseq());
3869 
3870  prot->SetInst().SetRepr(CSeq_inst::eRepr_delta);
3871  prot->SetInst().SetMol(CSeq_inst::eMol_aa);
3872  prot->SetInst().SetLength(0);
3873 
3874  // reserve our space
3875  const TSeqPos usable_size = TSeqPos(seq.size()) - frame;
3876  const TSeqPos mod = usable_size % 3;
3877 
3878  // get appropriate translation table
3879  const CTrans_table & tbl =
3882 
3883  try {
3884  // main loop through bases
3885  CSeqVector::const_iterator start = seq.begin();
      // skip the bases that precede the reading frame
3886  for (int i = 0; i < frame; ++i) {
3887  ++start;
3888  }
3889 
3890  TSeqPos i;
3891  TSeqPos k;
3892  int state = 0;
3893  TSeqPos length = usable_size / 3;
      // the start-codon branch below is only taken for a 5'-complete CDS in frame 0
3894  bool check_start = (is_5prime_complete && frame == 0);
3895  bool first_time = true;
3896 
3897  for (i = 0; i < length; ++i) {
      // is_gap stays true only if all three bases of the codon lie in a gap
3898  bool is_gap = true;
3899  bool unknown_length = false;
3900  TSeqPos pos = (i * 3) + frame;
3901 
3902  if (start.HasZeroGapBefore()) {
3903  AddGapToDeltaSeq(prot, true, 0);
3904  }
3905 
3906  // loop through one codon at a time
3907  for (k = 0; k < 3; ++k, ++start) {
3908  state = tbl.NextCodonState(state, *start);
3909  if (seq.IsInGap(pos + k)) {
3910  if (is_gap && !unknown_length) {
3911  CSeqMap_CI map_iter(map, &scope, SSeqMapSelector(), pos + k);
3912  if (map_iter.GetType() == CSeqMap::eSeqGap
3913  && map_iter.IsUnknownLength()) {
3914  unknown_length = true;
3915  }
3916  }
3917  } else {
3918  is_gap = false;
3919  }
3920  }
3921 
3922  if (is_gap) {
3923  AddGapToDeltaSeq(prot, unknown_length, 1);
3924  } else {
3925  // save translated amino acid
      // NOTE(review): the statements inside both branches are not visible here
3926  if (first_time && check_start) {
3928  } else {
3930  }
3931 
3932  }
3933 
3934  first_time = false;
3935  }
3936 
      // trailing partial codon (1 or 2 leftover bases)
3937  if (mod) {
3938  bool is_gap = true;
3939  bool unknown_length = false;
3940  TSeqPos pos = (length * 3) + frame;
3941  for (k = 0; k < mod; ++k, ++start) {
3942  state = tbl.NextCodonState(state, *start);
3943  if (seq.IsInGap(pos + k)) {
3944  if (is_gap && !unknown_length) {
3945  CSeqMap_CI map_iter(map, &scope, SSeqMapSelector(), pos + k);
3946  if (map_iter.GetType() == CSeqMap::eSeqGap) {
3947  if (map_iter.IsUnknownLength()) {
3948  unknown_length = true;
3949  }
3950  }
3951  }
3952  } else {
3953  is_gap = false;
3954  }
3955  }
3956 
3957  if (is_gap) {
3958  AddGapToDeltaSeq(prot, unknown_length, 1);
3959  } else {
      // pad the incomplete codon with 'N' up to three bases
3960  for (; k < 3; ++k) {
3961  state = tbl.NextCodonState(state, 'N');
3962  }
3963 
3964  // save translated amino acid
      // a padded codon that translates to 'X' is not added
3965  char c = tbl.GetCodonResidue(state);
3966  if (c != 'X') {
3967  if (first_time && check_start) {
3969  } else {
3971  }
3972  }
3973  }
3974  }
3975  } catch (CSeqVectorException& /*ex*/) {
3976  // ran out of sequence
3977  }
3978 
      // total residues + gap positions accumulated so far
3979  TSeqPos prot_len = 0;
3980  ITERATE(CDelta_ext::Tdata, seg_it, prot->SetInst().SetExt().SetDelta().Set()) {
3981  prot_len += (*seg_it)->GetLiteral().GetLength();
3982  }
3983 
3984  // code break substitution
3985  if (cds.GetData().IsCdregion() &&
3986  cds.GetData().GetCdregion().IsSetCode_break()) {
3987  const CCdregion& cdr = cds.GetData().GetCdregion();
3988  ITERATE(CCdregion::TCode_break, code_break, cdr.GetCode_break()) {
3989  const CRef <CCode_break> brk = *code_break;
3990  const CSeq_loc& cbk_loc = brk->GetLoc();
3991  TSeqPos seq_pos =
3992  sequence::LocationOffset(cds.GetLocation(), cbk_loc,
3994  &scope);
      // NOTE(review): seq_pos is unsigned, so this wraps if the offset is
      // smaller than frame; the j < prot_len / j == prot_len tests below then
      // both fail and the code break is ignored -- confirm this is intended.
3995  seq_pos -= frame;
3996  string::size_type j = seq_pos / 3;
3997  if (j < prot_len) {
3998  const CCode_break::C_Aa& c_aa = brk->GetAa();
3999  if (c_aa.IsNcbieaa()) {
      // walk the segments to find the one containing protein position j
4000  CDelta_ext::Tdata::iterator seg_it = prot->SetInst().SetExt().SetDelta().Set().begin();
4001  string::size_type offset = 0;
      // NOTE(review): with '<' the walk stops one segment early when
      // j == offset + segment length, making j - offset equal to that
      // segment's length below; '<=' may have been intended -- confirm.
4002  while (seg_it != prot->SetInst().SetExt().SetDelta().Set().end()
4003  && offset + (*seg_it)->GetLiteral().GetLength() < j) {
4004  offset += (*seg_it)->GetLiteral().GetLength();
4005  ++seg_it;
4006  }
      // gap segments are left untouched
4007  if (seg_it != prot->SetInst().SetExt().SetDelta().Set().end()
4008  && !(*seg_it)->GetLiteral().GetSeq_data().IsGap()) {
4009  if ((*seg_it)->GetLiteral().GetSeq_data().IsIupacaa()) {
4010  (*seg_it)->SetLiteral().SetSeq_data().SetIupacaa().Set()[j - offset] = c_aa.GetNcbieaa();
4011  } else {
4012  (*seg_it)->SetLiteral().SetSeq_data().SetNcbieaa().Set()[j - offset] = c_aa.GetNcbieaa();
4013  }
4014  }
4015  }
4016  } else if (j == prot_len) {
4017  // add terminal exception
      // 42 is the character code of '*'
4018  const CCode_break::C_Aa& c_aa = brk->GetAa();
4019  if (c_aa.IsNcbieaa() && c_aa.GetNcbieaa() == 42) {
4020  AddAAToDeltaSeq(prot, c_aa.GetNcbieaa());
4021  }
4022  }
4023  }
4024  }
4025 
4026  // remove stop codon from end
4027  CRef<CDelta_seq> end;
4028  if (!prot->SetInst().SetExt().SetDelta().Set().empty())
4029  {
4030  end = prot->SetInst().SetExt().SetDelta().Set().back();
4031  }
4032 
      // strip one trailing '*' from the last segment, whichever encoding it uses
4033  if (end && end->IsLiteral() && end->GetLiteral().IsSetSeq_data()) {
4034  if (end->GetLiteral().GetSeq_data().IsIupacaa()) {
4035  string& last_seg = end->SetLiteral().SetSeq_data().SetIupacaa().Set();
4036  if (NStr::EndsWith(last_seg, "*")) {
4037  last_seg = last_seg.substr(0, last_seg.length() - 1);
4038  end->SetLiteral().SetLength(TSeqPos(last_seg.length()));
4039  }
4040  } else if (end->GetLiteral().GetSeq_data().IsNcbieaa()) {
4041  string& last_seg = end->SetLiteral().SetSeq_data().SetNcbieaa().Set();
4042  if (NStr::EndsWith(last_seg, "*")) {
4043  last_seg = last_seg.substr(0, last_seg.length() - 1);
4044  end->SetLiteral().SetLength(TSeqPos(last_seg.length()));
4045  }
4046  }
4047  }
4048 
4049  // recalculate protein length, check need for ncbieaa - may have been altered by removal of stop codon/transl_except
4050  prot_len = 0;
4051  NON_CONST_ITERATE(CDelta_ext::Tdata, seg_it, prot->SetInst().SetExt().SetDelta().Set()) {
4052  prot_len += (*seg_it)->GetLiteral().GetLength();
4053  if ((*seg_it)->GetLiteral().IsSetSeq_data()
4054  && (*seg_it)->GetLiteral().GetSeq_data().IsNcbieaa()) {
      // an ncbieaa segment without '*' or '-' is stored back as iupacaa
4055  string current = (*seg_it)->GetLiteral().GetSeq_data().GetNcbieaa();
4056  if (NStr::Find(current, "*") == string::npos && NStr::Find(current, "-") == string::npos) {
4057  (*seg_it)->SetLiteral().SetSeq_data().SetIupacaa().Set(current);
4058  }
4059  }
4060  }
4061  prot->SetInst().SetLength(prot_len);
4062 
4063  if (prot->GetInst().GetLength() == 0) {
      // nothing was translated: return a null reference
4064  prot.Reset(NULL);
4065  } else if (prot->SetInst().SetExt().SetDelta().Set().size() == 1
4066  && prot->SetInst().SetExt().SetDelta().Set().front()->IsLiteral()
4067  && prot->SetInst().SetExt().SetDelta().Set().front()->GetLiteral().IsSetSeq_data()) {
4068  // only one segment, should be raw rather than delta
      // the text is copied out before ResetExt() discards the delta
4069  if (prot->SetInst().SetExt().SetDelta().Set().front()->GetLiteral().GetSeq_data().IsIupacaa()) {
4070  string data = prot->SetInst().SetExt().SetDelta().Set().front()->GetLiteral().GetSeq_data().GetIupacaa().Get();
4071  prot->SetInst().ResetExt();
4072  prot->SetInst().SetSeq_data().SetIupacaa().Set(data);
4073  prot->SetInst().SetRepr(CSeq_inst::eRepr_raw);
4074  } else if (prot->SetInst().SetExt().SetDelta().Set().front()->GetLiteral().GetSeq_data().IsNcbieaa()) {
4075  string data = prot->SetInst().SetExt().SetDelta().Set().front()->GetLiteral().GetSeq_data().GetNcbieaa().Get();
4076  prot->SetInst().ResetExt();
4077  prot->SetInst().SetSeq_data().SetNcbieaa().Set(data);
4078  prot->SetInst().SetRepr(CSeq_inst::eRepr_raw);
4079  }
4080  }
4081 
4082  return prot;
4083 }
4084 
4085 
4087 {
      // Returns false when `protein` is null/empty, is not an amino-acid
      // sequence, or has no Inst set; otherwise forwards to
      // SetInst().ConvertDeltaToRaw() and returns its result.
4088  if (!protein || !protein->IsAa() || !protein->IsSetInst()) {
4089  return false;
4090  }
4091  return protein->SetInst().ConvertDeltaToRaw();
4092 }
4093 
4094 
4095 void CSeqTranslator::Translate(const string& seq, string& prot,
4096  const CGenetic_code* code,
4097  bool include_stop,
4098  bool remove_trailing_X,
4099  bool* alt_start,
4100  bool is_5prime_complete,
4101  bool is_3prime_complete)
4102 {
4103  x_Translate(seq, prot, 0, code,
4104  is_5prime_complete, is_3prime_complete, include_stop, remove_trailing_X, alt_start);
4105 }
4106 
4107 
// Flag-driven variant of string translation: each boolean option passed to
// x_Translate is derived from a bit in `flags` (the "partial" and "no stop"
// bits are inverted).  Translation starts at frame offset 0.
// NOTE(review): the declaration line of the `flags` parameter is not visible
// in this listing.
4108 void CSeqTranslator::Translate(const string& seq,
4109  string& prot,
4111  const CGenetic_code* code,
4112  bool* alt_start)
4113 {
4114  x_Translate(seq, prot, 0, code,
4115  !(flags & fIs5PrimePartial),
4116  !(flags & fIs3PrimePartial),
4117  !(flags & fNoStop),
4118  flags & fRemoveTrailingX,
4119  alt_start);
4120 }
4121 
4122 
4123 void CSeqTranslator::Translate(const CSeqVector& seq, string& prot,
4124  const CGenetic_code* code,
4125  bool include_stop,
4126  bool remove_trailing_X,
4127  bool* alt_start,
4128  bool is_5prime_complete,
4129  bool is_3prime_complete)
4130 {
4131  x_Translate(seq, prot, 0, code,
4132  is_5prime_complete, is_3prime_complete, include_stop, remove_trailing_X, alt_start);
4133 }
4134 
4135 
// Flag-driven variant of sequence-vector translation: each boolean option
// passed to x_Translate is derived from a bit in `flags` (the "partial" and
// "no stop" bits are inverted).  Translation starts at frame offset 0.
// NOTE(review): the declaration line of the `flags` parameter is not visible
// in this listing.
4136 void CSeqTranslator::Translate(const CSeqVector& seq, string& prot,
4138  const CGenetic_code* code,
4139  bool* alt_start)
4140 {
4141  x_Translate(seq, prot, 0, code,
4142  !(flags & fIs5PrimePartial),
4143  !(flags & fIs3PrimePartial),
4144  !(flags & fNoStop),
4145  flags & fRemoveTrailingX,
4146  alt_start);
4147 }
4148 
4149 
4151  const CBioseq_Handle& handle,
4152  string& prot,
4153  const CGenetic_code* code,
4154  bool include_stop,
4155  bool remove_trailing_X,
4156  bool* alt_start)
4157 {
      // Translates via x_Translate at frame offset 0, writing the result to `prot`.
      // NOTE(review): the line that builds `seq` (presumably from `handle`) and
      // the arguments between `code` and `include_stop` are not visible here.
4159  x_Translate(seq, prot, 0, code,
4162  include_stop, remove_trailing_X, alt_start);
4163 }
4164 
4165 
4166 
4168  CScope& scope,
4169  string& prot,
4170  const CGenetic_code* code,
4171  bool include_stop,
4172  bool remove_trailing_X,
4173  bool* alt_start)
4174 {
      // Reads the location `loc` through a CSeqVector with IUPAC coding and
      // translates it via x_Translate at frame offset 0.
      // NOTE(review): the first signature line and the arguments between `code`
      // and `include_stop` are not visible in this listing.
4175  CSeqVector seq(loc, scope, CBioseq_Handle::eCoding_Iupac);
4176  x_Translate(seq, prot, 0, code,
4179  include_stop, remove_trailing_X, alt_start);
4180 }
4181 
4182 
4184  CScope& scope,
4185  string& prot,
4186  bool include_stop,
4187  bool remove_trailing_X,
4188  bool* alt_start)
4189 {
      // Translates the feature `feat` into `prot`, taking reading frame and
      // genetic code from its Cdregion when present, then applies the
      // Cdregion's code-break substitutions to the resulting string.
      // NOTE(review): the first signature line, the construction of `seq` and
      // some call arguments are not visible in this listing.
4190  const CGenetic_code* code = NULL;
4191  int frame = 0;
4192  if (feat.GetData().IsCdregion()) {
4193  const CCdregion& cdr = feat.GetData().GetCdregion();
4194  if (cdr.IsSetFrame ()) {
4195  switch (cdr.GetFrame ()) {
4196  case CCdregion::eFrame_two :
4197  frame = 1;
4198  break;
4200  frame = 2;
4201  break;
4202  default :
4203  break;
4204  }
4205  }
4206  if (cdr.IsSetCode()) {
4207  code = &cdr.GetCode();
4208  }
4209  }
4210 
      // With code breaks present, stops are always kept during translation;
      // they are trimmed again after substitution if the caller did not ask
      // for them (see the end of the function).
4211  bool code_break_include_stop = include_stop;
4212  if (feat.GetData().IsCdregion() &&
4213  feat.GetData().GetCdregion().IsSetCode_break()) {
4214  code_break_include_stop = true;
4215  }
4216 
4218  x_Translate(seq, prot, frame, code,
4221  code_break_include_stop, remove_trailing_X, alt_start);
4222 
4223 
4224  // code break substitution
4225  if (feat.GetData().IsCdregion() &&
4226  feat.GetData().GetCdregion().IsSetCode_break()) {
4227  const CCdregion& cdr = feat.GetData().GetCdregion();
      // protlen is captured once, before any terminal '*' is appended below
4228  string::size_type protlen = prot.size();
4229  ITERATE (CCdregion::TCode_break, code_break, cdr.GetCode_break()) {
4230  const CRef <CCode_break> brk = *code_break;
4231  const CSeq_loc& cbk_loc = brk->GetLoc();
4232  TSeqPos seq_pos =
4233  sequence::LocationOffset(feat.GetLocation(), cbk_loc,
4235  &scope);
      // NOTE(review): seq_pos is unsigned, so this wraps if the offset is
      // smaller than frame; neither branch below then applies -- confirm.
4236  seq_pos -= frame;
4237  string::size_type i = seq_pos / 3;
4238  if (i < protlen) {
4239  const CCode_break::C_Aa& c_aa = brk->GetAa ();
4240  if (c_aa.IsNcbieaa ()) {
4241  prot [i] = c_aa.GetNcbieaa ();
4242  }
4243  } else if (i == protlen) {
4244  // add terminal exception
      // 42 is the character code of '*'
4245  const CCode_break::C_Aa& c_aa = brk->GetAa ();
4246  if (c_aa.IsNcbieaa () && c_aa.GetNcbieaa () == 42) {
4247  prot += c_aa.GetNcbieaa ();
4248  }
4249  }
4250  }
4251 
      // cut the protein at the first '*' when stops were not requested
4252  if ( !include_stop ) {
4253  SIZE_TYPE sz = prot.find_first_of("*");
4254  if (sz != string::npos) {
4255  prot.resize(sz);
4256  }
4257  }
4258  }
4259 }
4260 
4261 
// Per-reading-frame record.
// NOTE(review): only the `len` member is visible in this listing; the other
// member declarations are on lines that are not shown.
4262 typedef struct {
4266  size_t len;
4268 } SFrameInfo;
4269 
4271 
4273 {
      // Translates a copy of `cds` in each of the three reading frames and
      // picks a frame from the stop/start codon pattern of the translations.
      // `ambiguous` is set to true only when the final fallback finds more
      // than one frame without an internal stop.
      // NOTE(review): the signature, the early-return statement of the first
      // guard and the declaration of `best_frame` are not visible here.
4274  ambiguous = false;
4275  if (!cds.IsSetLocation() || !cds.IsSetData() || !cds.GetData().IsCdregion()) {
4277  }
4278  const CCdregion& cdr = cds.GetData().GetCdregion();
4279 
      // an unset or eFrame_not_set frame is treated as frame one
4280  CCdregion::EFrame orig_frame = cdr.IsSetFrame() ? cdr.GetFrame() : CCdregion::eFrame_one;
4281  if (orig_frame == CCdregion::eFrame_not_set) {
4282  orig_frame = CCdregion::eFrame_one;
4283  }
4284 
      // work on a copy so the caller's feature is not modified
4285  CRef<CSeq_feat> tmp_cds(new CSeq_feat());
4286  tmp_cds->Assign(cds);
4287  TFrameInfoMap frame_map;
4288  frame_map[CCdregion::eFrame_one] = { false, false, false, NPOS, 0 };
4289  frame_map[CCdregion::eFrame_two] = { false, false, false, NPOS, 1 };
4290  frame_map[CCdregion::eFrame_three] = { false, false, false, NPOS, 2 };
4291 
4292  bool is_3complete = !tmp_cds->GetLocation().IsPartialStop(eExtreme_Biological);
4293  bool is_5complete = !tmp_cds->GetLocation().IsPartialStart(eExtreme_Biological);
4294 
      // number of bases left over after whole codons, counted from frame one
4295  size_t leftover = sequence::GetLength(tmp_cds->GetLocation(), &scope) % 3;
4296 
4297  for (auto it = frame_map.begin(); it != frame_map.end(); it++) {
4298  tmp_cds->SetData().SetCdregion().SetFrame(it->first);
4299  string prot;
4300  CSeqTranslator::Translate(*tmp_cds, scope, prot, true, false, NULL);
4301  size_t pos = NStr::Find(prot, "*");
4302  it->second.len = prot.length();
4303 
      // a '*' counts as the final stop only if it is the last residue and the
      // frame offset consumes exactly the leftover bases
      // NOTE(review): for an empty `prot`, prot.length() - 1 wraps around; if
      // NStr::Find returns NPOS equal to that value, an empty translation
      // would satisfy the first comparison -- confirm.
4304  if ((pos == prot.length() - 1) && (leftover == it->second.frame_offset)) {
4305  it->second.has_final_stop = true;
4306  } else if (pos != NPOS) {
4307  it->second.has_internal_stop = true;
4308  }
4309 
4310  if (NStr::StartsWith(prot, "M") && it->second.frame_offset == 0) {
4311  it->second.has_start_m = true;
4312  }
4313  }
4314 
4315  // if the original frame has no internal stop codons and has a final
4316  // stop codon, keep the original frame
4317  if (frame_map[orig_frame].has_final_stop) {
4318  return orig_frame;
4319  }
4320 
4321  if (is_3complete && !is_5complete) {
4322  // find a frame that has a stop codon
4323  for (auto it = frame_map.begin(); it != frame_map.end(); it++) {
4324  if (it->second.has_final_stop) {
4325  return it->first;
4326  }
4327  }
4328  }
4329 
      // NOTE(review): this branch tests the same frame-one condition as the
      // plain `is_5complete` branch that follows, so it looks redundant.
4330  if (is_5complete && !is_3complete) {
4331  // find a frame that has a start codon (could only be first frame)
4332  if (frame_map[CCdregion::eFrame_one].has_start_m && !frame_map[CCdregion::eFrame_one].has_internal_stop) {
4333  return CCdregion::eFrame_one;
4334  }
4335  }
4336 
4337  if (is_5complete) {
4338  // find a frame that has a start codon (could only be first frame)
4339  if (frame_map[CCdregion::eFrame_one].has_start_m && !frame_map[CCdregion::eFrame_one].has_internal_stop) {
4340  return CCdregion::eFrame_one;
4341  }
4342  }
4343 
4344  if (is_3complete) {
4345  // find a frame that has a stop codon
4346  for (auto it = frame_map.begin(); it != frame_map.end(); it++) {
4347  if (it->second.has_final_stop) {
4348  return it->first;
4349  }
4350  }
4351  }
4352 
4353  // otherwise, just looking for no internal stop codon
4354  if (!frame_map[orig_frame].has_internal_stop) {
4355  return orig_frame;
4356  }
4357 
      // first frame without an internal stop wins; any further such frame
      // only marks the result as ambiguous
4359  for (auto it = frame_map.begin(); it != frame_map.end(); it++) {
4360  if (!it->second.has_internal_stop) {
4361  if (best_frame == CCdregion::eFrame_not_set) {
4362  best_frame = it->first;
4363  } else {
4364  ambiguous = true;
4365  }
4366  }
4367  }
4368  if (best_frame != CCdregion::eFrame_not_set) {
4369  return best_frame;
4370  } else {
4371  return orig_frame;
4372  }
4373 }
4374 
4375 
4377 {
      // Convenience form: calls the three-argument FindBestFrame and discards
      // the `ambiguous` output.
4378  bool ambiguous = false;
4379 
4380  return FindBestFrame(cds, scope, ambiguous);
4381 }
4382 
4383 
4385  const CBioseq_Handle& bsh,
4386  const CSeq_loc& loc,
4387  const CCdregion& cdr,
4388  bool include_stop,
4389  bool remove_trailing_X,
4390  bool* alt_start,
4392 {
      // Wraps `loc` and `cdr` in a temporary CSeq_feat so that a
      // feature-based translation routine can be used.
      // NOTE(review): the const_casts hand the caller's objects to the
      // temporary feature; presumably they are only read -- confirm.  The
      // first signature line, the last parameter and the start of the call
      // below are not visible in this listing.
4393  CSeq_feat feat;
4394  feat.SetLocation(const_cast<CSeq_loc&>(loc));
4395  feat.SetData().SetCdregion(const_cast<CCdregion&>(cdr));
4397  include_stop, remove_trailing_X, alt_start);
4398 }
4399 
4400 
4402  string& prot,
4403  const CSeq_feat& cds,
4404  CScope& scope,
4405  bool include_stop,
4406  bool remove_trailing_X,
4407  bool* alt_start,
4409 {
      // Clears `prot`, then translates `cds`; `prot` is left empty when no
      // bioseq handle can be obtained for the feature's location.
      // NOTE(review): the first signature line, the last parameter and the
      // start of the final call are not visible in this listing.
4410  _ASSERT(cds.GetData().IsCdregion());
4411  prot.erase();
4412  CBioseq_Handle bsh = scope.GetBioseqHandle(cds.GetLocation());
4413  if ( !bsh ) {
4414  return;
4415  }
4417  include_stop, remove_trailing_X, alt_start);
4418 }
4419 
4420 
// Expresses `child` in coordinates relative to `parent`.
// For every child interval, each parent interval on the same bioseq is
// intersected with it; the overlap is translated into an offset along the
// parent (running total `pos`) and stored in m_Ranges, merged with the
// previous range when adjacent unless fNoMerge is set.
// NOTE(review): the `flags` parameter line and the declarations of the local
// fuzz object `f` are not visible in this listing.
4421 SRelLoc::SRelLoc(const CSeq_loc& parent, const CSeq_loc& child, CScope* scope,
4423  : m_ParentLoc(&parent)
4424 {
4425  typedef CSeq_loc::TRange TRange0;
4426  for (CSeq_loc_CI cit(child); cit; ++cit) {
4427  const CSeq_id& cseqid = cit.GetSeq_id();
4428  TRange0 crange = cit.GetRange();
4429  if (crange.IsWholeTo() && scope) {
4430  // determine actual end
4431  crange.SetToOpen(sequence::GetLength(cit.GetSeq_id(), scope));
4432  }
4433  ENa_strand cstrand = cit.GetStrand();
      // pos = summed length of the parent intervals already passed
4434  TSeqPos pos = 0;
4435  for (CSeq_loc_CI pit(parent); pit; ++pit) {
4436  ENa_strand pstrand = pit.GetStrand();
4437  TRange0 prange = pit.GetRange();
4438  if (prange.IsWholeTo() && scope) {
4439  // determine actual end
4440  prange.SetToOpen(sequence::GetLength(pit.GetSeq_id(), scope));
4441  }
      // parent piece on a different bioseq: only advance the offset
4442  if ( !sequence::IsSameBioseq(cseqid, pit.GetSeq_id(), scope) ) {
4443  pos += prange.GetLength();
4444  continue;
4445  }
4446  CRef<TRange> intersection(new TRange);
4447  TSeqPos abs_from, abs_to;
4448  CConstRef<CInt_fuzz> fuzz_from, fuzz_to;
      // lower bound of the overlap, with its fuzz
4449  if (crange.GetFrom() >= prange.GetFrom()) {
4450  abs_from = crange.GetFrom();
4451  fuzz_from = cit.GetFuzzFrom();
4452  if (abs_from == prange.GetFrom()) {
4453  // subtract out parent fuzz, if any
4454  const CInt_fuzz* pfuzz = pit.GetFuzzFrom();
4455  if (pfuzz) {
4456  if (fuzz_from) {
4458  f->Assign(*fuzz_from);
4459  f->Subtract(*pfuzz, abs_from, abs_from);
4460  if (f->IsP_m() && !f->GetP_m() ) {
4461  fuzz_from.Reset(); // cancelled
4462  } else {
4463  fuzz_from = f;
4464  }
4465  } else {
4466  fuzz_from = pfuzz->Negative(abs_from);
4467  }
4468  }
4469  }
4470  } else {
      // child starts before the parent piece: clip and mark with eLim_lt
4471  abs_from = prange.GetFrom();
4472  // fuzz_from = pit.GetFuzzFrom();
4474  f->SetLim(CInt_fuzz::eLim_lt);
4475  fuzz_from = f;
4476  }
      // upper bound of the overlap, with its fuzz
4477  if (crange.GetTo() <= prange.GetTo()) {
4478  abs_to = crange.GetTo();
4479  fuzz_to = cit.GetFuzzTo();
4480  if (abs_to == prange.GetTo()) {
4481  // subtract out parent fuzz, if any
4482  const CInt_fuzz* pfuzz = pit.GetFuzzTo();
4483  if (pfuzz) {
4484  if (fuzz_to) {
4486  f->Assign(*fuzz_to);
4487  f->Subtract(*pfuzz, abs_to, abs_to);
4488  if (f->IsP_m() && !f->GetP_m() ) {
4489  fuzz_to.Reset(); // cancelled
4490  } else {
4491  fuzz_to = f;
4492  }
4493  } else {
4494  fuzz_to = pfuzz->Negative(abs_to);
4495  }
4496  }
4497  }
4498  } else {
      // child ends after the parent piece: clip and mark with eLim_gt
4499  abs_to = prange.GetTo();
4500  // fuzz_to = pit.GetFuzzTo();
4502  f->SetLim(CInt_fuzz::eLim_gt);
4503  fuzz_to = f;
4504  }
      // a non-empty overlap exists only when abs_from <= abs_to
4505  if (abs_from <= abs_to) {
4506  if (IsReverse(pstrand)) {
      // reverse-strand parent: coordinates are mirrored around sigma, so the
      // from/to ends (and their fuzz) swap roles
4507  TSeqPos sigma = pos + prange.GetTo();
4508  intersection->SetFrom(sigma - abs_to);
4509  intersection->SetTo (sigma - abs_from);
4510  if (fuzz_from) {
4511  intersection->SetFuzz_to().AssignTranslated
4512  (*fuzz_from, intersection->GetTo(), abs_from);
4513  intersection->SetFuzz_to().Negate
4514  (intersection->GetTo());
4515  }
4516  if (fuzz_to) {
4517  intersection->SetFuzz_from().AssignTranslated
4518  (*fuzz_to, intersection->GetFrom(), abs_to);
4519  intersection->SetFuzz_from().Negate
4520  (intersection->GetFrom());
4521  }
4522  if (cstrand == eNa_strand_unknown) {
4523  intersection->SetStrand(pstrand);
4524  } else {
4525  intersection->SetStrand(Reverse(cstrand));
4526  }
4527  } else {
      // forward-strand parent: a plain shift by delta
4528  TSignedSeqPos delta = pos - prange.GetFrom();
4529  intersection->SetFrom(abs_from + delta);
4530  intersection->SetTo (abs_to + delta);
4531  if (fuzz_from) {
4532  intersection->SetFuzz_from().AssignTranslated
4533  (*fuzz_from, intersection->GetFrom(), abs_from);
4534  }
4535  if (fuzz_to) {
4536  intersection->SetFuzz_to().AssignTranslated
4537  (*fuzz_to, intersection->GetTo(), abs_to);
4538  }
4539  if (cstrand == eNa_strand_unknown) {
4540  intersection->SetStrand(pstrand);
4541  } else {
4542  intersection->SetStrand(cstrand);
4543  }
4544  }
4545  // add to m_Ranges, combining with the previous
4546  // interval if possible
4547  if ( !(flags & fNoMerge) && !m_Ranges.empty()
4548  && SameOrientation(intersection->GetStrand(),
4549  m_Ranges.back()->GetStrand()) ) {
      // forward strand: new range directly follows the previous one
4550  if (m_Ranges.back()->GetTo() == intersection->GetFrom() - 1
4551  && !IsReverse(intersection->GetStrand()) ) {
4552  m_Ranges.back()->SetTo(intersection->GetTo());
4553  if (intersection->IsSetFuzz_to()) {
4554  m_Ranges.back()->SetFuzz_to
4555  (intersection->SetFuzz_to());
4556  } else {
4557  m_Ranges.back()->ResetFuzz_to();
4558  }
      // reverse strand: new range directly precedes the previous one
4559  } else if (m_Ranges.back()->GetFrom()
4560  == intersection->GetTo() + 1
4561  && IsReverse(intersection->GetStrand())) {
4562  m_Ranges.back()->SetFrom(intersection->GetFrom());
4563  if (intersection->IsSetFuzz_from()) {
4564  m_Ranges.back()->SetFuzz_from
4565  (intersection->SetFuzz_from());
4566  } else {
4567  m_Ranges.back()->ResetFuzz_from();
4568  }
4569  } else {
4570  m_Ranges.push_back(intersection);
4571  }
4572  } else {
4573  m_Ranges.push_back(intersection);
4574  }
4575  }
4576  pos += prange.GetLength();
4577  }
4578  }
4579 }
4580 
4581 
4582 // Bother trying to merge?
4584  SRelLoc::TFlags /* flags */)
4585  const
4586 {
      // Maps every stored relative range back onto `new_parent` and collects
      // the resulting points/intervals in a mix.  A range that runs past the
      // end of one parent piece continues in the next piece (keep_going).
      // The flags argument is unnamed and unused.
      // NOTE(review): the first signature line, the creation of `result` and
      // the declarations of the local fuzz object `f` are not visible here.
4587  typedef CSeq_loc::TRange TRange0;
4589  CSeq_loc_mix& mix = result->SetMix();
4590  ITERATE (TRanges, it, m_Ranges) {
4591  _ASSERT((*it)->GetFrom() <= (*it)->GetTo());
      // pos = relative offset at which the current parent piece begins;
      // start = relative position still to be placed
4592  TSeqPos pos = 0, start = (*it)->GetFrom();
4593  bool keep_going = true;
4594  for (CSeq_loc_CI pit(new_parent); pit; ++pit) {
4595  TRange0 prange = pit.GetRange();
4596  if (prange.IsWholeTo() && scope) {
4597  // determine actual end
4598  prange.SetToOpen(sequence::GetLength(pit.GetSeq_id(), scope));
4599  }
4600  TSeqPos length = prange.GetLength();
      // does this parent piece contain `start`?
4601  if (start >= pos && start < pos + length) {
4602  TSeqPos from, to;
4603  CConstRef<CInt_fuzz> fuzz_from, fuzz_to;
4604  ENa_strand strand;
4605  if (IsReverse(pit.GetStrand())) {
4606  TSeqPos sigma = pos + prange.GetTo();
4607  from = sigma - (*it)->GetTo();
4608  to = sigma - start;
      // NOTE(review): the `from > sigma` test presumably detects unsigned
      // wrap-around of the subtraction above -- confirm.
4609  if (from < prange.GetFrom() || from > sigma) {
4610  from = prange.GetFrom();
4611  keep_going = true;
4612  } else {
4613  keep_going = false;
4614  }
4615  if ( !(*it)->IsSetStrand()
4616  || (*it)->GetStrand() == eNa_strand_unknown) {
4617  strand = pit.GetStrand();
4618  } else {
4619  strand = Reverse((*it)->GetStrand());
4620  }
4621  if (from == prange.GetFrom()) {
4622  fuzz_from = pit.GetFuzzFrom();
4623  }
      // on the reverse strand the range's "to" fuzz applies to `from`
4624  if ( !keep_going && (*it)->IsSetFuzz_to() ) {
4626  if (fuzz_from) {
4627  f->Assign(*fuzz_from);
4628  } else {
4629  f->SetP_m(0);
4630  }
4631  f->Subtract((*it)->GetFuzz_to(), from, (*it)->GetTo(),
4633  if (f->IsP_m() && !f->GetP_m() ) {
4634  fuzz_from.Reset(); // cancelled
4635  } else {
4636  fuzz_from = f;
4637  }
4638  }
4639  if (to == prange.GetTo()) {
4640  fuzz_to = pit.GetFuzzTo();
4641  }
      // ... and the range's "from" fuzz applies to `to`
4642  if (start == (*it)->GetFrom()
4643  && (*it)->IsSetFuzz_from()) {
4645  if (fuzz_to) {
4646  f->Assign(*fuzz_to);
4647  } else {
4648  f->SetP_m(0);
4649  }
4650  f->Subtract((*it)->GetFuzz_from(), to,
4651  (*it)->GetFrom(), CInt_fuzz::eAmplify);
4652  if (f->IsP_m() && !f->GetP_m() ) {
4653  fuzz_to.Reset(); // cancelled
4654  } else {
4655  fuzz_to = f;
4656  }
4657  }
4658  } else {
      // forward strand: shift by delta and clip at the end of this piece
4659  TSignedSeqPos delta = prange.GetFrom() - pos;
4660  from = start + delta;
4661  to = (*it)->GetTo() + delta;
4662  if (to > prange.GetTo()) {
4663  to = prange.GetTo();
4664  keep_going = true;
4665  } else {
4666  keep_going = false;
4667  }
4668  if ( !(*it)->IsSetStrand()
4669  || (*it)->GetStrand() == eNa_strand_unknown) {
4670  strand = pit.GetStrand();
4671  } else {
4672  strand = (*it)->GetStrand();
4673  }
4674  if (from == prange.GetFrom()) {
4675  fuzz_from = pit.GetFuzzFrom();
4676  }
4677  if (start == (*it)->GetFrom()
4678  && (*it)->IsSetFuzz_from()) {
4680  if (fuzz_from) {
4681  f->Assign(*fuzz_from);
4682  f->Add((*it)->GetFuzz_from(), from,
4683  (*it)->GetFrom());
4684  } else {
4685  f->AssignTranslated((*it)->GetFuzz_from(), from,
4686  (*it)->GetFrom());
4687  }
4688  if (f->IsP_m() && !f->GetP_m() ) {
4689  fuzz_from.Reset(); // cancelled
4690  } else {
4691  fuzz_from = f;
4692  }
4693  }
4694  if (to == prange.GetTo()) {
4695  fuzz_to = pit.GetFuzzTo();
4696  }
4697  if ( !keep_going && (*it)->IsSetFuzz_to() ) {
4699  if (fuzz_to) {
4700  f->Assign(*fuzz_to);
4701  f->Add((*it)->GetFuzz_to(), to, (*it)->GetTo());
4702  } else {
4703  f->AssignTranslated((*it)->GetFuzz_to(), to,
4704  (*it)->GetTo());
4705  }
4706  if (f->IsP_m() && !f->GetP_m() ) {
4707  fuzz_to.Reset(); // cancelled
4708  } else {
4709  fuzz_to = f;
4710  }
4711  }
4712  }
      // emit a point when both ends coincide and carry the same fuzz
      // (both absent, the same object, or equal); otherwise an interval
4713  if (from == to
4714  && (fuzz_from == fuzz_to
4715  || (fuzz_from.GetPointer() && fuzz_to.GetPointer()
4716  && fuzz_from->Equals(*fuzz_to)))) {
4717  // just a point
4718  CRef<CSeq_loc> loc(new CSeq_loc);
4719  CSeq_point& point = loc->SetPnt();
4720  point.SetPoint(from);
4721  if (strand != eNa_strand_unknown) {
4722  point.SetStrand(strand);
4723  }
4724  if (fuzz_from) {
4725  point.SetFuzz().Assign(*fuzz_from);
4726  }
4727  point.SetId().Assign(pit.GetSeq_id());
4728  mix.Set().push_back(loc);
4729  } else {
4730  CRef<CSeq_loc> loc(new CSeq_loc);
4731  CSeq_interval& ival = loc->SetInt();
4732  ival.SetFrom(from);
4733  ival.SetTo(to);
4734  if (strand != eNa_strand_unknown) {
4735  ival.SetStrand(strand);
4736  }
4737  if (fuzz_from) {
4738  ival.SetFuzz_from().Assign(*fuzz_from);
4739  }
4740  if (fuzz_to) {
4741  ival.SetFuzz_to().Assign(*fuzz_to);
4742  }
4743  ival.SetId().Assign(pit.GetSeq_id());
4744  mix.Set().push_back(loc);
4745  }
      // the remainder of the range continues at the start of the next piece
4746  if (keep_going) {
4747  start = pos + length;
4748  } else {
4749  break;
4750  }
4751  }
4752  pos += length;
4753  }
      // still unresolved after the last parent piece: the relative range does
      // not fit into new_parent; only a warning is posted
4754  if (keep_going) {
4755  TSeqPos total_length;
4756  string label;
4757  new_parent.GetLabel(&label);
4758  try {
4759  total_length = sequence::GetLength(new_parent, scope);
4760  ERR_POST_X(8, Warning << "SRelLoc::Resolve: Relative position "
4761  << start << " exceeds length (" << total_length
4762  << ") of parent location " << label);
4763  } catch (CObjmgrUtilException&) {
4764  ERR_POST_X(9, Warning << "SRelLoc::Resolve: Relative position "
4765  << start
4766  << " exceeds length (?\?\?) of parent location "
4767  << label);
4768  }
4769  }
4770  }
4771  // clean up output
      // empty mix -> null location; single element -> that element itself
4772  switch (mix.Get().size()) {
4773  case 0:
4774  result->SetNull();
4775  break;
4776  case 1:
4777  {{
4778  CRef<CSeq_loc> first = mix.Set().front();
4779  result = first;
4780  break;
4781  }}
4782  default:
4783  break;
4784  }
4785  return result;
4786 }
4787 
4788 
4789 //============================================================================//
4790 // SeqSearch //
4791 //============================================================================//
4792 
4793 // Public:
4794 // =======
4795 
4796 // Constructors and Destructors:
4798  m_Client(client), m_Flags(flags), m_LongestPattern(0), m_Fsa(true)
4799 {
      // All state is set up in the initializer list: the client and flags are
      // stored, the longest-pattern length starts at 0, and m_Fsa is
      // constructed with `true`.
      // NOTE(review): the signature line is not visible in this listing.
4800 }
4801 
4802 
4804 {
      // Intentionally empty.
      // NOTE(review): signature line not visible; presumably the CSeqSearch
      // destructor, given the section comment above -- confirm.
4805 }
4806 
4807 
// Nucleotide complement table: each entry maps an IUPAC nucleotide letter
// (first) to its complement (second).  Lowercase keys map to uppercase
// complements, and both 'U'/'u' and 'T'/'t' map to 'A'.  Keys are listed in
// ascending character order.
// NOTE(review): presumably this order is required by the lookup map built
// from the table (its definition is not visible here) -- keep it sorted when
// editing.
4809 static const TCharPair sc_comp_tbl[32] = {
4810  // uppercase
4811  { 'A', 'T' },
4812  { 'B', 'V' },
4813  { 'C', 'G' },
4814  { 'D', 'H' },
4815  { 'G', 'C' },
4816  { 'H', 'D' },
4817  { 'K', 'M' },
4818  { 'M', 'K' },
4819  { 'N', 'N' },
4820  { 'R', 'Y' },
4821  { 'S', 'S' },
4822  { 'T', 'A' },
4823  { 'U', 'A' },
4824  { 'V', 'B' },
4825  { 'W', 'W' },
4826  { 'Y', 'R' },
4827  // lowercase
4828  { 'a', 'T' },
4829  { 'b', 'V' },
4830  { 'c', 'G' },
4831  { 'd', 'H' },
4832  { 'g', 'C' },
4833  { 'h', 'D' },
4834  { 'k', 'M' },
4835  { 'm', 'K' },
4836  { 'n', 'N' },
4837  { 'r', 'Y' },
4838  { 's', 'S' },
4839  { 't', 'A' },
4840  { 'u', 'A' },
4841  { 'v', 'B' },
4842  { 'w', 'W' },
4843  { 'y', 'R' },
4844 };
4847 
4848 
4849 inline
4850 static char s_GetComplement(char c)
4851 {
4852  TComplement::const_iterator comp_it = sc_Complement.find(c);
4853  return (comp_it != sc_Complement.end()) ? comp_it->second : '\0';
4854 }
4855 
4856 
4857 static string s_GetReverseComplement(const string& sequence)
4858 {
4859  string revcomp;
4860  revcomp.reserve(sequence.length());
4861  string::const_reverse_iterator rend = sequence.rend();
4862 
4863  for (string::const_reverse_iterator rit = sequence.rbegin(); rit != rend; ++rit) {
4864  revcomp += s_GetComplement(*rit);
4865  }
4866 
4867  return revcomp;
4868 }
4869 
4870 
4872 (const string& name,
4873  const string& sequence,
4874  Int2 cut_site,
4876 {
      // Registers a named nucleotide pattern.  The pattern is upper-cased and
      // recorded; when it differs from its own reverse complement, the reverse
      // complement is recorded as well unless x_IsJustTopStrand(flags) holds.
      // Throws CUtilException (eNoInput) when `name` or `sequence` is blank.
      // NOTE(review): the first signature line, the `flags` parameter and the
      // tail of the second x_AddNucleotidePattern call are not visible here.
4877  if (NStr::IsBlank(name) || NStr::IsBlank(sequence)) {
4878  NCBI_THROW(CUtilException, eNoInput, "Empty input value");
4879  }
4880 
4881  // cleanup pattern
4882  string pattern = sequence;
      // NOTE(review): the return value is discarded; if NStr::TruncateSpaces
      // returns a trimmed copy rather than trimming in place, this call has
      // no effect on `pattern` -- confirm (TruncateSpacesInPlace?).
4883  NStr::TruncateSpaces(pattern);
4884  NStr::ToUpper(pattern);
4885 
      // a pattern equal to its reverse complement matches both strands at once
4886  string revcomp = s_GetReverseComplement(pattern);
4887  bool symmetric = (pattern == revcomp);
4888  ENa_strand strand = symmetric ? eNa_strand_both : eNa_strand_plus;
4889 
4890  // record expansion of entered pattern
4891  x_AddNucleotidePattern(name, pattern, cut_site, strand, flags);
4892 
4893  // record expansion of reverse complement of asymmetric pattern
4894  if (!symmetric && (!x_IsJustTopStrand(flags))) {
      // the cut site is mirrored to the other end of the pattern
4895  TSeqPos revcomp_cut_site = TSeqPos(pattern.length()) - cut_site;
4896  x_AddNucleotidePattern(name, revcomp, revcomp_cut_site,
4898  }
4899 }
4900 
4901 
4902 // Program passes each character in turn to finite state machine.
//
// current_state  state returned by the previous call (or the initial state)
// ch             next character of the sequence
// position       index of ch in the sequence being searched
// length         sequence length; matches starting at or beyond it are dropped
//
// Returns the state to pass to the next call, or 0 when no client is set.
4904 (int current_state,
4905  char ch,
4906  int position,
4907  int length)
4908 {
4909  if (m_Client == NULL) {
4910  return 0;
4911  }
4912 
4913  // on first character, populate state transition table
4914  if (!m_Fsa.IsPrimed()) {
4915  m_Fsa.Prime();
4916  }
4917 
4918  int next_state = m_Fsa.GetNextState(current_state, ch);
4919 
4920  // report matches (if any)
4921  if (m_Fsa.IsMatchFound(next_state)) {
4922  ITERATE(vector<TPatternInfo>, it, m_Fsa.GetMatches(next_state)) {
// `position` is the index of the last matched character, so the match
// begins (pattern length - 1) characters earlier.
4923  int start = position - int(it->GetSequence().length()) + 1;
4924 
4925  // prevent multiple reports of patterns for circular sequences.
4926  if (start < length) {
4927  bool keep_going = m_Client->OnPatternFound(*it, start);
// A false return from the client only stops the reporting of the remaining
// matches for this state; next_state is still returned below.
4928  if (!keep_going) {
4929  break;
4930  }
4931  }
4932  }
4933  }
4934 
4935  return next_state;
4936 }
4937 
4938 
4939 // Search entire bioseq.
// Feeds every residue of the sequence to the per-character Search() overload.
// Does nothing when the handle is not valid or no client is set.
4941 {
4942  if (!bsh || m_Client == NULL) {
4943  return;
4944  }
4945 
4947  TSeqPos seq_len = seq_vec.size();
4948  TSeqPos search_len = seq_len;
4949 
4950  // handle circular bioseqs
// For a circular molecule the scan continues past the end and wraps around
// (see the modulo below) so that matches spanning the origin are found.
// NOTE(review): if no pattern has been added, m_LongestPattern is 0 and
// `m_LongestPattern - 1` may wrap if that member is unsigned - confirm its
// type and whether this case can occur.
4951  CSeq_inst::ETopology topology = bsh.GetInst_Topology();
4952  if (topology == CSeq_inst::eTopology_circular) {
4953  search_len += TSeqPos(m_LongestPattern - 1);
4954  }
4955 
4956  int state = m_Fsa.GetInitialState();
4957 
// NOTE(review): `i % seq_len` divides by zero when seq_len is 0 and
// search_len is not (an empty circular sequence with patterns registered) -
// confirm that an empty sequence cannot reach this loop.
4958  for (TSeqPos i = 0; i < search_len; ++i) {
4959  state = Search(state, seq_vec[i % seq_len], i, seq_len);
4960  }
4961 }
4962 
4963 
4964 // Private:
4965 // ========
4966 
4967 /// translation finite state machine base codes - ncbi4na
/// The values form a 4-bit set: A = 1, C = 2, G = 4, T = 8. Every other
/// enumerator is the bitwise OR of the bases named in its comment, e.g.
/// eBase_M = A|C = 3 and eBase_N = A|C|G|T = 15.
4969  eBase_A = 1, ///< A
4970  eBase_C, ///< C
4971  eBase_M, ///< AC
4972  eBase_G, ///< G
4973  eBase_R, ///< AG
4974  eBase_S, ///< CG
4975  eBase_V, ///< ACG
4976  eBase_T, ///< T
4977  eBase_W, ///< AT
4978  eBase_Y, ///< CT
4979  eBase_H, ///< ACT
4980  eBase_K, ///< GT
4981  eBase_D, ///< AGT
4982  eBase_B, ///< CGT
4983  eBase_N ///< ACGT
4984 };
4985 
4986 /// conversion table from Ncbi4na / Iupacna to EBaseCode
/// The table has 256 entries and is indexed with the character converted to
/// an unsigned 8-bit value, so any byte is a valid index.
4987 static const EBaseCode sc_CharToEnum[256] = {
4988  // Ncbi4na
4993 
5006  // Iupacna (uppercase)
5015  // Iupacna (lowercase)
5023 
5057 };
5058 
/// Conversion table from a 4-bit base code back to its nucleotide letter.
///
/// The index is a set of bases encoded as bits (A = 1, C = 2, G = 4, T = 8),
/// so index 3 (A|C) is 'M' and index 15 (all four bases) is 'N'.
/// Index 0 (no base set) yields '\0'.
static const char sc_EnumToChar[16] = {
    '\0', 'A', 'C', 'M', 'G', 'R', 'S', 'V',
    'T',  'W', 'Y', 'H', 'K', 'D', 'B', 'N'
};
5062 
5063 
// Records one strand-specific pattern, either as entered or expanded into
// its unambiguous variants, depending on the flags.
//
// name      label stored in the pattern descriptor
// pattern   pattern text for this strand
// cut_site  cut position stored in the pattern descriptor
// strand    strand stored in the pattern descriptor
5065 (const string& name,
5066  string& pattern,
5067  Int2 cut_site,
5068  ENa_strand strand,
5070 {
// Keep track of the longest pattern length passed to this function.
5071  if (pattern.length() > m_LongestPattern) {
5072  m_LongestPattern = pattern.length();
5073  }
5074 
5075  TPatternInfo pat_info(name, kEmptyStr, cut_site);
5076  pat_info.m_Strand = strand;
5077 
5078  if (!x_IsExpandPattern(flags)) {
5079  pat_info.m_Sequence = pattern;
5080  x_AddPattern(pat_info, pattern, flags);
5081  } else {
// `buffer` is the scratch string in which x_ExpandPattern builds each
// expanded variant, starting from position 0.
5082  string buffer;
5083  buffer.reserve(pattern.length());
5084 
5085  x_ExpandPattern(pattern, buffer, 0, pat_info, flags);
5086  }
5087 }
5088 
5089 
// Recursively expands a pattern containing ambiguity codes into strings
// made only of A, C, G and T, and stores each complete string.
//
// sequence  pattern being expanded
// buf       scratch string holding the expansion of positions [0, pos)
// pos       position in `sequence` to expand next
// pat_info  descriptor passed on to x_AddPattern for every stored string
5091 (string& sequence,
5092  string& buf,
5093  size_t pos,
5094  TPatternInfo& pat_info,
5096 {
5097  static const EBaseCode expansion[] = { eBase_A, eBase_C, eBase_G, eBase_T };
5098 
5099  if (pos < sequence.length()) {
5100  Uint4 code = static_cast<Uint4>(sc_CharToEnum[static_cast<Uint1>(sequence[pos])]);
5101 
// Branch once per base whose bit is set in the code at this position; the
// number of stored strings is the product of these counts over all positions.
5102  for (int i = 0; i < 4; ++i) {
5103  if ((code & expansion[i]) != 0) {
5104  buf += sc_EnumToChar[expansion[i]];
5105  x_ExpandPattern(sequence, buf, pos + 1, pat_info, flags);
// Cut the buffer back to `pos` characters before trying the next base.
5106  buf.erase(pos);
5107  }
5108  }
5109  } else {
5110  // when position reaches pattern length, store one expanded string.
5111  x_AddPattern(pat_info, buf, flags);
5112  }
5113 }
5114 
5115 
5116 void CSeqSearch::x_AddPattern(TPatternInfo& pat_info, string& sequence, TSearchFlags flags)
5117 {
5118  x_StorePattern(pat_info, sequence);
5119 
5120  if (x_IsAllowMismatch(flags)) {
5121  // put 'N' at every position if a single mismatch is allowed.
5122  char ch = 'N';
5123  NON_CONST_ITERATE (string, it, sequence) {
5124  swap(*it, ch);
5125 
5126  x_StorePattern(pat_info, sequence);
5127 
5128  // restore proper character, go on to put N in next position.
5129  swap(*it, ch);
5130  }
5131  }
5132 }
5133 
5134 
5135 void CSeqSearch::x_StorePattern(TPatternInfo& pat_info, string& sequence)
5136 {
5137  pat_info.m_Sequence = sequence;
5138  m_Fsa.AddWord(sequence, pat_info);
5139 }
5140 
5141 
// Reverse-complements the sequence instance `inst` in place, dispatching on
// its representation.
//
// Throws CObjmgrUtilException (eBadSequenceType) when the delta case is
// reached without a delta extension set, and for every representation that
// falls through to the default case.
5143 {
5144  switch (inst.GetRepr()) {
5145  case CSeq_inst::eRepr_raw:
5147  break;
5149  if (!inst.IsSetExt() || !inst.GetExt().IsDelta()) {
5150  NCBI_THROW(CObjmgrUtilException, eBadSequenceType,
5151  "Sequence of this type cannot be reverse-complemented.");
5152  }
5153  // reverse order of segments
5154  inst.SetExt().SetDelta().Set().reverse();
5155  // reverse-complement individual segments
5156  NON_CONST_ITERATE(CSeq_inst::TExt::TDelta::Tdata, it, inst.SetExt().SetDelta().Set()) {
5157  switch ((*it)->Which()) {
5158  case CDelta_seq::e_Literal:
// A literal is only touched when it carries sequence data that is not a gap.
5159  if ((*it)->GetLiteral().IsSetSeq_data()) {
5160  CSeq_literal& lit = (*it)->SetLiteral();
5161  if (!lit.GetSeq_data().IsGap()) {
5163  }
5164  }
5165  break;
5166  case CDelta_seq::e_Loc:
// A location segment is replaced by the location that
// sequence::SeqLocRevCmpl returns for it.
5167  {{
5168  CRef<CSeq_loc> flip(sequence::SeqLocRevCmpl((*it)->SetLoc(), scope));
5169  (*it)->SetLoc(*flip);
5170  }}
5171  break;
5172  default:
5173  // do nothing
5174  break;
5175  }
5176  }
5177  break;
5178  default:
5179  NCBI_THROW(CObjmgrUtilException, eBadSequenceType,
5180  "Sequence of this type cannot be reverse-complemented.");
5181  break;
5182  }
5183 }
5184 
5185 
5188 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
LargeInt< 1 > revcomp(const LargeInt< 1 > &x, size_t sizeKmer)
Definition: LargeInt1.hpp:148
User-defined methods of the data storage class.
bool IsReverse(ENa_strand s)
Definition: Na_strand.hpp:75
ENa_strand Reverse(ENa_strand s)
Definition: Na_strand.hpp:90
@ eExtreme_Positional
numerical value
Definition: Na_strand.hpp:63
@ eExtreme_Biological
5' and 3'
Definition: Na_strand.hpp:62
bool SameOrientation(ENa_strand a, ENa_strand b)
Definition: Na_strand.hpp:83
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
#define true
Definition: bool.h:35
size_t GetSize(void) const
CBioseq_CI –.
Definition: bioseq_ci.hpp:69
CBioseq_Handle –.
bool IsAa(void) const
Definition: Bioseq.cpp:350
CCdregion –.
Definition: Cdregion.hpp:66
virtual void setUpFeatureIterator(CBioseq_Handle &bioseq_handle, unique_ptr< CFeat_CI > &feat_ci, TSeqPos circular_length, CRange< TSeqPos > &range, const CSeq_loc &loc, SAnnotSelector &sel, CScope &scope, ENa_strand &strand)
Definition: sequence.cpp:1791
virtual void processMainLoop(bool &shouldContinueToNextIteration, CRef< CSeq_loc > &cleaned_loc_this_iteration, CRef< CSeq_loc > &candidate_feat_loc, EOverlapType &overlap_type_this_iteration, bool &revert_locations_this_iteration, CBioseq_Handle &bioseq_handle, const CMappedFeat &feat, TSeqPos circular_length, SAnnotSelector::EOverlapType annot_overlap_type)
Definition: sequence.cpp:1823
CGetOverlappingFeaturesPlugin * m_PrevPlugin
Definition: sequence.cpp:1869
virtual void processSAnnotSelector(SAnnotSelector &sel)
Definition: sequence.cpp:1783
virtual void postProcessDiffAmount(Int8 &cur_diff, CRef< CSeq_loc > &cleaned_loc, CRef< CSeq_loc > &candidate_feat_loc, CScope &scope, SAnnotSelector &sel, TSeqPos circular_length)
Definition: sequence.cpp:1853
virtual ~CCdsForMrnaPlugin()
Definition: sequence.cpp:1781
CCdsForMrnaPlugin(CGetOverlappingFeaturesPlugin *prev_plugin)
Definition: sequence.cpp:1779
virtual void processLoc(CBioseq_Handle &bioseq_handle, CRef< CSeq_loc > &loc, TSeqPos circular_length)
Definition: sequence.cpp:1813
CTime AsCTime(CTime::ETimeZone tz=CTime::eLocal) const
Definition: Date.cpp:70
Definition: Dbtag.hpp:53
CDelta_seq –.
Definition: Delta_seq.hpp:66
FASTA-format output; see also ReadFasta in <objtools/readers/fasta.hpp>
Definition: sequence.hpp:770
CFeat_CI –.
Definition: feat_ci.hpp:64
CFeat_id –.
Definition: Feat_id.hpp:66
static const CTrans_table & GetTransTable(int id)
void GetLabel(string *label) const
Definition: Gene_ref.cpp:57
bool IsSuppressed(void) const
Definition: Gene_ref.cpp:75
CRef< CInt_fuzz > Negative(TSeqPos n) const
Definition: Int_fuzz.hpp:106
@ eAmplify
go for the largest possible range
Definition: Int_fuzz.hpp:69
CMappedFeat –.
Definition: mapped_feat.hpp:59