1 /* $Id: context.cpp 99483 2023-04-04 17:43:43Z stakhovv $
2 * ===========================================================================
3 *
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Aaron Ucko, NCBI
27 * Mati Shomrat
28 *
29 * File Description:
30 * new (early 2003) flat-file generator -- context needed when (pre)formatting
31 *
32 * ===========================================================================
33 */
34 #include <ncbi_pch.hpp>
35 #include <corelib/ncbistd.hpp>
38 #include <objects/seq/Bioseq.hpp>
39 #include <objects/seq/Seq_ext.hpp>
40 #include <objects/seq/Seg_ext.hpp>
51 #include <objmgr/scope.hpp>
52 #include <objmgr/bioseq_handle.hpp>
54 #include <objmgr/seq_entry_ci.hpp>
55 #include <objmgr/seqdesc_ci.hpp>
56 #include <objmgr/util/sequence.hpp>
58 #include <objmgr/seq_map.hpp>
59 #include <objmgr/seq_map_ci.hpp>
60 #include <objmgr/feat_ci.hpp>
61 #include <objmgr/bioseq_ci.hpp>
62 #include <objmgr/annot_ci.hpp>
69 USING_SCOPE(sequence);
72 /////////////////////////////////////////////////////////////////////////////
73 //
74 // CBioseqContext
76 // constructor: gives every member a neutral default in the initializer list, then calls x_Init()
78 (const CBioseq_Handle& seq,
79  CFlatFileContext& ffctx,
80  CMasterContext* mctx,
81  CTopLevelSeqEntryContext *tlsec) :
82  m_Handle(seq),
83  m_pOpticalMapPoints(nullptr),
84  m_Repr(CSeq_inst::eRepr_not_set), // overwritten by x_Init()
85  m_Mol(CSeq_inst::eMol_not_set), // overwritten by x_Init()
86  m_HasParts(false),
87  m_IsPart(false),
88  m_PartNumber(0),
89  m_IsDeltaLitOnly(false),
90  m_IsProt(false),
91  m_IsInSGS(false),
92  m_IsInGPS(false),
93  m_IsInNucProt(false),
94  m_IsGED(false),
95  m_IsGenbank(false),
96  m_IsEMBL(false),
97  m_IsDDBJ(false),
98  m_IsPDB(false),
99  m_IsSP(false),
100  m_IsTPA(false),
101  m_IsJournalScan(false),
102  m_IsRefSeq(false),
103  m_RefseqInfo(0),
104  m_IsGbGenomeProject(false), // GenBank Genome project data (AE)
105  m_IsNcbiCONDiv(false), // NCBI CON division (CH)
106  m_IsNcbiGenomes(false),
107  m_IsPatent(false),
108  m_IsGI(false),
109  m_IsWGS(false),
110  m_IsWGSMaster(false),
111  m_IsTSA(false),
112  m_IsTSAMaster(false),
113  m_IsTLS(false),
114  m_IsTLSMaster(false),
115  m_IsHup(false),
116  m_Gi(ZERO_GI),
117  m_ShowGBBSource(false),
118  m_PatSeqid(0),
119  m_HasOperon(false),
120  m_HasMultiIntervalGenes(true), // true is the safe choice if we're not sure
121  m_IsGenomeAssembly(false),
122  m_IsCrossKingdom(false),
123  m_UsePDBCompoundForComment(false),
124  m_fUnverified(fUnverified_None),
125  m_fUnreviewed(fUnreviewed_None),
126  m_ShowAnnotCommentAsCOMMENT(false),
127  m_ShowAnnotCommentAsCOMMENT_checked(false),
128  m_FFCtx(ffctx),
129  m_RefCache(nullptr),
130  m_Master(mctx),
131  m_TLSeqEntryCtx(tlsec)
132 {
133  x_Init(seq, m_FFCtx.GetLocation()); // the location supplied by the flat-file context becomes x_Init's user_loc
134 }
138 (const CBioseq_Handle& prev_seq, // overload that additionally stores prev_seq / next_seq in m_PrevHandle / m_NextHandle
139  const CBioseq_Handle& seq,
140  const CBioseq_Handle& next_seq,
141  CFlatFileContext& ffctx,
142  CMasterContext* mctx,
143  CTopLevelSeqEntryContext *tlsec) :
144  m_PrevHandle(prev_seq),
145  m_Handle(seq),
146  m_NextHandle(next_seq),
147  m_pOpticalMapPoints(nullptr),
148  m_Repr(CSeq_inst::eRepr_not_set), // overwritten by x_Init()
149  m_Mol(CSeq_inst::eMol_not_set), // overwritten by x_Init()
150  m_HasParts(false),
151  m_IsPart(false),
152  m_PartNumber(0),
153  m_IsDeltaLitOnly(false),
154  m_IsProt(false),
155  m_IsInSGS(false),
156  m_IsInGPS(false),
157  m_IsInNucProt(false),
158  m_IsGED(false),
159  m_IsGenbank(false),
160  m_IsEMBL(false),
161  m_IsDDBJ(false),
162  m_IsPDB(false),
163  m_IsSP(false),
164  m_IsTPA(false),
165  m_IsJournalScan(false),
166  m_IsRefSeq(false),
167  m_RefseqInfo(0),
168  m_IsGbGenomeProject(false), // GenBank Genome project data (AE)
169  m_IsNcbiCONDiv(false), // NCBI CON division (CH)
170  m_IsNcbiGenomes(false),
171  m_IsPatent(false),
172  m_IsGI(false),
173  m_IsWGS(false),
174  m_IsWGSMaster(false),
175  m_IsTSA(false),
176  m_IsTSAMaster(false),
177  m_IsTLS(false),
178  m_IsTLSMaster(false),
179  m_IsHup(false),
180  m_Gi(ZERO_GI),
181  m_ShowGBBSource(false),
182  m_PatSeqid(0),
183  m_HasOperon(false),
184  m_HasMultiIntervalGenes(true), // true is the safe choice if we're not sure
185  m_IsGenomeAssembly(false),
186  m_IsCrossKingdom(false),
187  m_UsePDBCompoundForComment(false),
188  m_fUnverified(fUnverified_None),
189  m_fUnreviewed(fUnreviewed_None),
190  m_ShowAnnotCommentAsCOMMENT(false),
191  m_ShowAnnotCommentAsCOMMENT_checked(false),
192  m_FFCtx(ffctx),
193  m_RefCache(nullptr),
194  m_Master(mctx),
195  m_TLSeqEntryCtx(tlsec)
196 {
197  x_Init(seq, m_FFCtx.GetLocation()); // the location supplied by the flat-file context becomes x_Init's user_loc
198 }
201 // destructor
203 {
204  if (m_Virtual) {
206  }
207 }
211 {
212  if ( id.IsGi() && id.GetGi() == m_Gi ) {
214  }
216  CSeq_id_Handle idh =
218  return idh;
219 }
223 // initialization
224 void CBioseqContext::x_Init(const CBioseq_Handle& seq, const CSeq_loc* user_loc)
225 {
226  _ASSERT(seq);
227  _ASSERT(seq.IsSetInst());
229  // NB: order of execution is important
230  m_Repr = x_GetRepr();
231  m_Mol = seq.GetInst_Mol();
233  x_SetId();
235  if ( IsSegmented() ) {
237  }
238  m_IsPart = x_IsPart();
239  if ( m_IsPart ) {
240  _ASSERT(m_Master);
242  }
243  if ( IsDelta() ) {
245  }
249  m_IsInSGS = x_IsInSGS();
250  m_IsInGPS = x_IsInGPS();
253  x_SetLocation(user_loc);
259  // m_HasOperon = x_HasOperon();
261  if (IsRefSeq()) {
263  }
266  sel.SetResolveAll();
268  // x_SetHasMultiIntervalGenes();
270  // x_SetTaxname();
273 }
277 {
278  CRef<CSeq_loc> loc;
280  if (user_loc) {
281  // map the user location to the current bioseq
284  if ( !sequence::IsSameBioseq(idh1, idh2, &m_Handle.GetScope()) ) {
286  loc.Reset(mapper.Map(*user_loc));
287  } else {
288  loc.Reset(new CSeq_loc);
289  loc->Assign(*user_loc);
290  }
292  if (loc) {
293  if (loc->IsWhole()) {
294  loc.Reset();
295  } else if (loc->IsInt()) {
297  if (!IsReverse(loc->GetStrand()) && range.GetFrom() == 0 && range.GetTo() == m_Handle.GetInst_Length() - 1) {
298  loc.Reset();
299  }
300  }
301  }
302  }
304  // if no partial location specified do the entire bioseq
305  if (!loc) {
306  loc.Reset(new CSeq_loc);
307  loc->SetWhole(*m_PrimaryId);
308  } else {
309  x_SetMapper(*loc);
310  }
312  m_Location = loc;
313 }
317 {
320  // not covering the entire bioseq (may be multiple ranges)
321  CRef<CBioseq> vseq(new CBioseq(loc, GetAccession()));
322  vseq->SetInst().SetRepr(CSeq_inst::eRepr_virtual);
323  CBioseq_Handle vseqh = GetScope().AddBioseq(*vseq);
325  if (vseqh) {
329  //m_Mapper->KeepNonmappingRanges();
330  }
331 }
334 {
335  m_HasMultiIntervalGenes = false;
339  CFeat_CI gene_ci( m_Handle, sel );
340  for( ; gene_ci ; ++gene_ci ) {
341  switch( gene_ci->GetLocation().Which() ) {
344  case CSeq_loc::e_Mix:
345  case CSeq_loc::e_Equiv:
347  break;
348  default:
349  // do nothing
350  break;
351  }
353  break;
354  }
355  }
356 }
359 {
360  if (UsingSeqEntryIndex()) {
362  if (! idx) return false;
364  if (! bsx) return false;
365  return bsx->HasMultiIntervalGenes();
366  }
370 }
373 {
374  // look for taxname in Seqdescs
375  int num_super_kingdom = 0;
376  bool super_kingdoms_different = false;
377  string super_kingdom_name;
379  for( ; desc_ci; ++desc_ci ) {
380  if( desc_ci->IsSource() ) {
381  const CBioSource &bsrc = desc_ci->GetSource();
382  if (bsrc.IsSetOrgname()) {
383  const COrgName& onp = bsrc.GetOrgname();
384  if (onp.IsSetName()) {
385  const COrgName::TName& nam = onp.GetName();
386  if (nam.IsPartial()) {
387  const CPartialOrgName& pon = nam.GetPartial();
388  if (pon.IsSet()) {
389  const CPartialOrgName::Tdata& tx = pon.Get();
390  ITERATE (CPartialOrgName::Tdata, itr, tx) {
391  const CTaxElement& te = **itr;
392  if (te.IsSetFixed_level()) {
393  if (te.GetFixed_level() == 0 && te.IsSetLevel()) {
394  const string& lvl = te.GetLevel();
395  if (NStr::EqualNocase (lvl, "superkingdom")) {
396  num_super_kingdom++;
397  if (super_kingdom_name.empty() && te.IsSetName()) {
398  super_kingdom_name = te.GetName();
399  } else if (te.IsSetName() && ! NStr::EqualNocase (super_kingdom_name, te.GetName())) {
400  super_kingdoms_different = true;
401  }
402  if (num_super_kingdom > 1 && super_kingdoms_different) {
403  m_IsCrossKingdom = true;
404  }
405  }
406  }
407  }
408  }
409  }
410  }
411  }
412  }
413  if( bsrc.IsSetTaxname() && ! bsrc.GetTaxname().empty() ) {
414  // we found a taxname; but need to look at all descriptors to set m_IsCrossKingdom, so keep going
415  m_Taxname = bsrc.GetTaxname();
416  // return;
417  }
418  }
419  }
421  if (! m_Taxname.empty()) {
422  return;
423  }
425  // fall back on the Seq-feats
427  SAnnotSelector sel;
430  CFeat_CI biosrc_ci( m_Handle, sel );
431  for( ; biosrc_ci ; ++biosrc_ci ) {
432  CConstRef<CSeq_feat> seq_feat = biosrc_ci->GetSeq_feat();
433  if( seq_feat && seq_feat->IsSetData() ) {
434  const CSeqFeatData & seq_feat_data = seq_feat->GetData();
435  if( seq_feat_data.IsBiosrc() ) {
436  const CBioSource & bsrc = seq_feat_data.GetBiosrc();
437  if( bsrc.IsSetTaxname() && ! bsrc.GetTaxname().empty() ) {
438  // we found a taxname; we're done
439  m_Taxname = bsrc.GetTaxname();
440  return;
441  }
442  }
443  }
444  }
445 }
447 const string& CBioseqContext::GetTaxname(void) const
448 {
449  // check for indexed version first
450  if (UsingSeqEntryIndex()) {
452  if (idx) {
454  if (bsx) {
455  m_Taxname = bsx->GetTaxname();
456  }
457  }
458  return m_Taxname;
459  }
461  x_SetTaxname();
462  return m_Taxname;
463 }
467 {
468  // check for indexed version first
469  if (UsingSeqEntryIndex()) {
471  if (idx) {
473  if (bsx) {
475  }
476  }
477  return m_IsCrossKingdom;
478  }
480  x_SetTaxname();
481  return m_IsCrossKingdom;
482 }
486 {
488 }
492 {
494 }
498 {
499  if ( ! FIELD_IS_SET_AND_IS(uo, Type, Str) ||
500  ! NStr::EqualNocase(uo.GetType().GetStr(), "FileTrack"))
501  {
502  return;
503  }
505  CConstRef<CUser_field> pFileTrackURLField = uo.GetFieldRef("FileTrackURL");
506  if( ! pFileTrackURLField ) {
507  pFileTrackURLField = uo.GetFieldRef("Map-FileTrackURL");
508  }
509  if ( pFileTrackURLField) {
510  if ( FIELD_IS_SET_AND_IS(*pFileTrackURLField, Data, Str) ) {
511  if ( ! pFileTrackURLField->GetData().GetStr().empty() ) {
512  m_FiletrackURL = pFileTrackURLField->GetData().GetStr();
513  }
514  } else if ( FIELD_IS_SET_AND_IS(*pFileTrackURLField, Data, Strs) ) {
515  const vector< string > & strs = pFileTrackURLField->GetData().GetStrs();
516  FOR_EACH_STRING_IN_VECTOR (itr, strs) {
517  string str = *itr;
518  if ( ! str.empty() ) {
520  }
521  }
522  }
523  }
525  CConstRef<CUser_field> pBaseModURLField = uo.GetFieldRef("BaseModification-FileTrackURL");
526  if ( pBaseModURLField) {
527  if ( FIELD_IS_SET_AND_IS(*pBaseModURLField, Data, Str) ) {
528  if ( ! pBaseModURLField->GetData().GetStr().empty() ) {
529  m_BasemodURLs.push_back(pBaseModURLField->GetData().GetStr());
530  }
531  } else if ( FIELD_IS_SET_AND_IS(*pBaseModURLField, Data, Strs) ) {
532  m_BasemodURLs = pBaseModURLField->GetData().GetStrs();
533  }
534  }
535 }
538 {
539  if ( ! FIELD_IS_SET_AND_IS(uo, Type, Str) ||
540  ! NStr::EqualNocase(uo.GetType().GetStr(), "AuthorizedAccess"))
541  {
542  return;
543  }
544  CConstRef<CUser_field> pAuthorizedAccessField =
545  uo.GetFieldRef("Study");
546  if( ! pAuthorizedAccessField ||
547  ! FIELD_IS_SET_AND_IS(*pAuthorizedAccessField, Data, Str) ||
548  pAuthorizedAccessField->GetData().GetStr().empty() )
549  {
550  return;
551  }
552  m_AuthorizedAccess = pAuthorizedAccessField->GetData().GetStr();
553 }
556 {
557  if( GetRepr() != CSeq_inst::eRepr_map ||
558  ! FIELD_IS_SET_AND_IS(m_Handle, Inst_Ext, Map) )
559  {
560  return;
561  }
563  const CMap_ext & map_ext = m_Handle.GetInst_Ext().GetMap();
564  FOR_EACH_SEQFEAT_ON_MAPEXT(feat_it, map_ext ) {
565  const CSeq_feat & feat = **feat_it;
566  if( ! FIELD_IS_SET_AND_IS(feat, Data, Rsite) ||
567  ! feat.IsSetLocation() )
568  {
569  continue;
570  }
571  const CSeq_loc & feat_loc = feat.GetLocation();
572  switch( feat_loc.Which() ) {
573  case CSeq_loc::e_Pnt: {
574  const CSeq_point & seq_point = feat_loc.GetPnt();
576  if( seq_point.IsSetPoint() ) {
579  seq_point, Fuzz);
581  seq_point, Id);
583  seq_point, Strand);
584  m_pOpticalMapPointsDestroyer->AddPoint( seq_point.GetPoint() );
587  }
588  break;
589  }
591  m_pOpticalMapPoints = & feat_loc.GetPacked_pnt();
592  // in case a previous iteration set this
594  break;
595  default:
596  // ignore other types
597  break;
598  }
599  }
600 }
603 {
604  // translate finishing status
605  typedef SStaticPair<const char *, const char *> TFinStatElem;
606  static const TFinStatElem sc_finstat_map[] = {
607  { "Annotation-directed-improvement", "ANNOTATION_DIRECTED_IMPROVEMENT" },
608  { "High-quality-draft", "HIGH_QUALITY_DRAFT" },
609  { "Improved-high-quality-draft", "IMPROVED_HIGH_QUALITY_DRAFT" },
610  { "Noncontiguous-finished", "NONCONTIGUOUS_FINISHED" },
611  { "Standard-draft", "STANDARD_DRAFT" }
612  };
614  DEFINE_STATIC_ARRAY_MAP(TFinStatMap, sc_FinStatMap, sc_finstat_map);
616  for (CSeqdesc_CI it(m_Handle, CSeqdesc::e_User); it; ++it) {
617  const CUser_object& uo = it->GetUser();
619  if (uo.IsSetType() && uo.GetType().IsStr()) {
621  if( uo.IsSetData() ) {
622  ITERATE( CUser_object::TData, field_iter, uo.GetData() ) {
623  const CUser_field &field = **field_iter;
624  if( ! field.IsSetData() || ! field.GetData().IsStr() ||
625  ! field.IsSetLabel() || ! field.GetLabel().IsStr() ) {
626  continue;
627  }
628  if( field.GetLabel().GetStr() == "StructuredCommentPrefix" &&
629  field.GetData().GetStr() == "##Genome-Assembly-Data-START##" )
630  {
631  m_IsGenomeAssembly = true;
632  }
633  if( field.GetLabel().GetStr() == "Current Finishing Status" )
634  {
635  string asn_fin_stat = field.GetData().GetStr();
636  replace( asn_fin_stat.begin(), asn_fin_stat.end(), ' ', '-' );
637  TFinStatMap::const_iterator new_fin_stat_iter = sc_FinStatMap.find(asn_fin_stat.c_str());
638  if( new_fin_stat_iter != sc_FinStatMap.end() ) {
639  m_FinishingStatus = new_fin_stat_iter->second;
640  }
641  }
642  }
643  }
644  } else if (utype == CUser_object::eObjectType_Unverified) {
645  if (uo.IsUnverifiedOrganism()) {
647  }
648  if (uo.IsUnverifiedFeature()) {
650  }
651  if (uo.IsUnverifiedMisassembled()) {
653  }
654  if (uo.IsUnverifiedContaminant()) {
656  }
657  // default in the past was to use feature
660  }
661  } else if (utype == CUser_object::eObjectType_Unreviewed) {
662  if (uo.IsUnreviewedUnannotated()) {
664  }
665  } else if ( utype == CUser_object::eObjectType_FileTrack ) {
666  x_SetFiletrackURL(uo);
667  } else if ( NStr::EqualNocase(uo.GetType().GetStr(), "AuthorizedAccess") ) {
669  } else if ( NStr::EqualNocase(uo.GetType().GetStr(), "ENCODE") ) {
670  x_SetEncode(uo);
671  }
672  }
673  }
674 }
677 {
679  {
681  }
683 }
686 {
689  if (GetRepr() == CSeq_inst::eRepr_map) {
690  // TODO: is this right? Maybe handle it differently once
691  // CAnnot_CI is able to handle CSeq_inst::eRepr_map.
692  return;
693  }
695  // JIRA SQD-4444 : copy annot selector from the one saved in this context structure
696  // SAnnotSelector sel = m_FFCtx.SetAnnotSelector();
697  SAnnotSelector sel;
699  CAnnot_CI annot_ci(m_Handle, sel);
700  for( ; annot_ci; ++annot_ci ) {
701  if( ! annot_ci->Seq_annot_IsSetDesc() ) {
702  continue;
703  }
705  const CSeq_annot::TDesc & desc = annot_ci->Seq_annot_GetDesc();
706  ITERATE( CSeq_annot::TDesc::Tdata, one_desc_iter, desc.Get() ) {
707  const CAnnotdesc & one_desc = **one_desc_iter;
708  if( ! one_desc.IsUser() ) {
709  continue;
710  }
712  // we finally got down to an annot desc user object. See if it indicates any
713  // relevant information
714  const CUser_object & user_obj = one_desc.GetUser();
715  if( ! user_obj.IsSetType() || ! user_obj.GetType().IsStr() ||
716  ! user_obj.IsSetData() ||
717  user_obj.GetType().GetStr() != "AnnotDescCommentPolicy" )
718  {
719  continue;
720  }
722  // check policy flags
723  ITERATE( CUser_object::TData, policy_field_iter, user_obj.GetData() ) {
724  const CUser_field & policy_field = **policy_field_iter;
725  if( ! policy_field.IsSetLabel() || ! policy_field.GetLabel().IsStr() ||
726  ! policy_field.IsSetData() ||
727  policy_field.GetLabel().GetStr() != "Policy" )
728  {
729  continue;
730  }
732  if( policy_field.GetData().IsStr() ) {
733  const string & policy_str = policy_field.GetData().GetStr();
734  if( policy_str == "ShowInComment" ) {
736  }
737  }
738  }
739  }
740  }
741 }
745 {
746  return CFeat_CI(m_Handle.GetScope(),
747  *m_Location,
749 }
752 {
753  // check for indexed version first
754  if (UsingSeqEntryIndex()) {
756  if (! idx) return false;
758  if (! bsx) return false;
759  return bsx->HasOperon();
760  }
763  return m_HasOperon;
764 }
768 {
773  m_Accession.erase();
776  // -----------------------------------------------------------------------
777  // Look for TPA assembly:
778  // -----------------------------------------------------------------------
779  bool bTpaAssemblyPresent = false;
780  for (CSeqdesc_CI it(m_Handle, CSeqdesc::e_User); it; ++it) {
781  const CUser_object& obj = it->GetUser();
782  if ( !obj.GetType().IsStr() ) {
783  continue;
784  }
785  if ( obj.GetType().GetStr() == "TpaAssembly" ) {
786  bTpaAssemblyPresent = true;
787  continue;
788  }
789  if ( obj.GetType().GetStr() == "GenomeProjectsDB" ) {
790  m_IsGbGenomeProject = true;
791  continue;
792  }
793  }
795  ITERATE (CBioseq::TId, id_iter, m_Handle.GetBioseqCore()->GetId()) {
796  const CSeq_id& id = **id_iter;
797  const CTextseq_id* tsip = id.GetTextseq_Id();
798  const string& acc = (tsip && tsip->CanGetAccession()) ?
799  tsip->GetAccession() : kEmptyStr;
801  CSeq_id::EAccessionInfo acc_info = id.IdentifyAccession();
802  unsigned int acc_div = acc_info & CSeq_id::eAcc_division_mask;
804  switch ( id.Which() ) {
805  // Genbank, Embl or Ddbj
806  case CSeq_id::e_Embl:
807  m_IsEMBL = true;
808  break;
809  case CSeq_id::e_Ddbj:
810  m_IsDDBJ = true;
811  break;
812  case CSeq_id::e_Genbank:
813  m_IsGenbank = true;
814  switch (acc_info) {
816  m_IsGbGenomeProject = true;
817  break;
819  m_IsNcbiCONDiv = true;
820  break;
821  default:
822  break;
823  }
824  break;
825  // Patent
826  case CSeq_id::e_Patent:
827  m_IsPatent = true;
828  if (id.GetPatent().IsSetSeqid()) {
829  m_PatSeqid = id.GetPatent().GetSeqid();
830  }
831  break;
832  // RefSeq
833  case CSeq_id::e_Other:
834  m_IsRefSeq = true;
835  m_RefseqInfo = acc_info;
836  break;
837  // Gi
838  case CSeq_id::e_Gi:
839  m_IsGI = true;
840  m_Gi = id.GetGi();
841  break;
842  // PDB
843  case CSeq_id::e_Pdb:
844  m_IsPDB = true;
845  break;
846  // TPA
847  case CSeq_id::e_Tpg:
848  m_IsTPA = true;
849  m_IsGenbank = true;
850  break;
851  case CSeq_id::e_Tpe:
852  m_IsTPA = true;
853  m_IsEMBL = true;
854  break;
855  case CSeq_id::e_Tpd:
856  m_IsTPA = true;
857  m_IsDDBJ = true;
858  break;
859  case CSeq_id::e_General:
860  if ( id.GetGeneral().CanGetDb() ) {
861  if ( !NStr::CompareCase(id.GetGeneral().GetDb(), "BankIt") ) {
862  m_IsTPA = bTpaAssemblyPresent;
863  }
864  if( NStr::Equal(id.GetGeneral().GetDb(), "NCBI_GENOMES") ) {
865  m_IsNcbiGenomes = true;
866  }
867  }
868  break;
869  case CSeq_id::e_Gibbsq:
870  case CSeq_id::e_Gibbmt:
871  case CSeq_id::e_Giim:
872  m_IsJournalScan = true;
873  break;
875  m_IsSP = true;
876  break;
877  // nothing special
878  case CSeq_id::e_Pir:
879  case CSeq_id::e_not_set:
880  case CSeq_id::e_Local:
881  case CSeq_id::e_Prf:
882  default:
883  break;
884  }
886  // WGS
887  m_IsWGS = m_IsWGS || (acc_div == CSeq_id::eAcc_wgs);
889  if ( m_IsWGS && !acc.empty() ) {
890  /*
891  size_t len = acc.length();
892  m_IsWGSMaster =
893  ((len == 12 || len == 15) && NStr::EndsWith(acc, "000000")) ||
894  ((len == 14) && NStr::EndsWith(acc, "00000000")) ||
895  ((len == 13 || len == 16 || len == 17) && NStr::EndsWith(acc, "0000000"));
896  */
897  m_IsWGSMaster = (acc_info & CSeq_id::fAcc_master) != 0;
898  if ( m_IsWGSMaster ) {
899  m_WGSMasterAccn = acc;
900  m_WGSMasterName = tsip->CanGetName() ? tsip->GetName() : kEmptyStr;
901  }
902  }
904  // TSA
907  if ( m_IsTSA && !acc.empty() ) {
909  m_IsTSAMaster = true;
910  }
911  if ( m_IsTSAMaster ) {
912  m_TSAMasterAccn = acc;
913  m_TSAMasterName = tsip->CanGetName() ? tsip->GetName() : kEmptyStr;
914  }
915  }
917  // TLS
920  if ( m_IsTLS && !acc.empty() ) {
922  m_IsTLSMaster = true;
923  }
924  if ( m_IsTLSMaster ) {
925  m_TLSMasterAccn = acc;
926  m_TLSMasterName = tsip->CanGetName() ? tsip->GetName() : kEmptyStr;
927  }
928  }
931  // GBB source
934  if (m_IsGenbank || m_IsEMBL || m_IsDDBJ) {
935  if (acc.length() == 6) {
936  char ch = acc[0];
937  if (ch == 'J' || ch == 'K' || ch == 'L' || ch == 'M') {
938  m_ShowGBBSource = true;
939  }
940  }
941  }
942  }
944  // Genbank/Embl/Ddbj (GED)
946 }
950 {
951  return m_Handle.IsSetInst_Repr() ?
953 }
957 {
959  return desc ? &desc->GetMolinfo() : nullptr;
960 }
964 {
965  if ( m_Repr == CSeq_inst::eRepr_raw ||
969  const CSeq_entry_Handle& fftse = GetTopLevelEntry();
971  _ASSERT(eh && eh.IsSeq());
973  if (fftse != eh) {
974  eh = eh.GetParentEntry();
975  if ( eh && eh.IsSet() ) {
976  CBioseq_set_Handle bsst = eh.GetSet();
977  if ( bsst.IsSetClass() &&
978  bsst.GetClass() == CBioseq_set::eClass_parts ) {
979  return true;
980  }
981  }
982  }
983  }
984  return false;
985 }
989 {
990  _ASSERT(IsSegmented());
994  if ( !h ) {
995  return false;
996  }
998  // make sure the segmented set contains our bioseq
999  {{
1000  bool has_seq = false;
1001  for ( CSeq_entry_CI it(h); it; ++it ) {
1002  if ( it->IsSeq() && it->GetSeq() == m_Handle ) {
1003  has_seq = true;
1004  break;
1005  }
1006  }
1007  if ( !has_seq ) {
1008  return false;
1009  }
1010  }}
1012  // find the parts set
1013  {{
1014  for ( CSeq_entry_CI it(h); it; ++it ) {
1015  if ( it->IsSet() && it->GetSet().IsSetClass() &&
1016  it->GetSet().GetClass() == CBioseq_set::eClass_parts ) {
1017  return true;
1018  }
1019  }
1020  }}
1022  return false;
1023 }
1027 {
1028  _ASSERT(IsDelta());
1030  if ( m_Handle.IsSetInst_Ext() ) {
1032  if ( ext.IsDelta() ) {
1033  ITERATE (CDelta_ext::Tdata, it, ext.GetDelta().Get()) {
1034  if ( (*it)->IsLoc() ) {
1035  const CSeq_loc& loc = (*it)->GetLoc();
1036  if (loc.IsNull()) continue;
1037  return false;
1038  }
1039  }
1040  }
1041  }
1042  return true;
1043 }
1047 {
1048  return m_Master ? m_Master->GetPartNumber(m_Handle) : 0;
1049 }
1053 {
1054  CSeq_entry_Handle e =
1056  return e;
1057 }
1061 {
1062  CSeq_entry_Handle e =
1064  return e;
1065 }
1069 {
1070  CSeq_entry_Handle e =
1072  return e;
1073 }
1077 {
1078  const CFlatFileConfig& cfg = Config();
1079  if ( cfg.IsStyleContig() ) {
1080  return true;
1081  } else if ( cfg.IsStyleNormal() ) {
1082  if ( (IsSegmented() && !HasParts()) ||
1083  (IsDelta() && !IsDeltaLitOnly()) ) {
1084  return true;
1085  }
1086  }
1088  return false;
1089 }
1093 {
1094  if (uo.IsSetType() && uo.GetType().IsStr()) {
1095  if (NStr::EqualNocase(uo.GetType().GetStr(), "ENCODE")) {
1096  m_Encode.Reset(&uo);
1097  }
1098  }
1099 }
1102 /////////////////////////////////////////////////////////////////////////////
1103 //
1104 // CMasterContext
1108  m_Handle(seq)
1109 {
1110  _ASSERT(seq);
1111  _ASSERT(seq.GetInst_Ext().IsSeg());
1113  x_SetNumParts();
1114  x_SetBaseName();
1115 }
1119 {
1120 }
1124 {
1125  if ( !part ) {
1126  return 0;
1127  }
1128  CScope& scope = m_Handle.GetScope();
1130  SIZE_TYPE serial = 1;
1132  if ((*it)->IsNull()) {
1133  continue;
1134  }
1135  const CSeq_id& id = GetId(**it, &m_Handle.GetScope());
1137  if (bsh &&
1138  bsh.IsSetInst_Repr() &&
1140  if (bsh == part) {
1141  return serial;
1142  }
1143  ++serial;
1144  }
1145  }
1147  return 0;
1148 }
1152 {
1153  CScope& scope = m_Handle.GetScope();
1154  SIZE_TYPE count = 0;
1156  // count only non-gap and non-virtual parts
1158  const CSeq_loc& loc = **it;
1159  if (loc.IsNull()) { // skip gaps
1160  continue;
1161  }
1162  // count only non-virtual
1163  const CSeq_id_Handle id = CSeq_id_Handle::GetHandle(GetId(loc, &scope));
1165  if (part &&
1166  part.IsSetInst_Repr() &&
1168  ++count;
1169  }
1170  }
1171  m_NumParts = count;
1172 }
1175 static void s_GetNameForBioseq(const CBioseq_Handle& seq, string& name)
1176 {
1177  name.erase();
1179  CConstRef<CSeq_id> sip;
1180  ITERATE (CBioseq_Handle::TId, it, seq.GetId()) {
1181  CConstRef<CSeq_id> id = it->GetSeqId();
1182  if (id->IsGenbank() || id->IsEmbl() || id->IsDdbj() ||
1183  id->IsTpg() || id->IsTpe() || id->IsTpd()) {
1184  sip = id;
1185  break;
1186  }
1187  }
1189  if (sip) {
1190  const CTextseq_id* tsip = sip->GetTextseq_Id();
1191  if (tsip && tsip->CanGetName()) {
1192  name = tsip->GetName();
1193  }
1194  }
1195 }
1199 {
1200  string parent_name;
1201  s_GetNameForBioseq(m_Handle, parent_name);
1203  // if there's no "SEG_" prefix just use the master's name
1204  if (!NStr::StartsWith(parent_name, "SEG_")) {
1205  m_BaseName = parent_name;
1206  return;
1207  }
1209  // otherwise, eliminate the prefix ...
1210  parent_name = parent_name.substr(4);
1212  // ... and calculate a base name
1214  // find the first segment
1215  CScope* scope = &m_Handle.GetScope();
1216  CBioseq_Handle segment;
1217  const CSeqMap& seqmap = m_Handle.GetSeqMap();
1218  CSeqMap_CI it = seqmap.BeginResolved(scope,
1219  SSeqMapSelector()
1220  .SetResolveCount(1)
1221  .SetFlags(CSeqMap::fFindRef));
1222  while (it) {
1223  CSeq_id_Handle id = it.GetRefSeqid();
1224  segment = scope->GetBioseqHandleFromTSE(id, m_Handle);
1225  if (segment) {
1226  break;
1227  }
1228  }
1229  string seg_name;
1230  if (segment) {
1231  s_GetNameForBioseq(segment, seg_name);
1232  }
1234  if (!seg_name.empty() && NStr::EndsWith(seg_name, '1') &&
1235  parent_name.length() == seg_name.length() &&
1236  NStr::EndsWith(parent_name, '1')) {
1237  size_t pos = parent_name.length() - 2;
1238  for ( /*noop*/; pos > 0; --pos) {
1239  if (parent_name[pos] != '0') {
1240  break;
1241  }
1242  }
1243  parent_name.erase(pos + 1);
1244  }
1246  m_BaseName = parent_name;
1247 }
1249 /////////////////////////////////////////////////////////////////////////////
1250 //
1251 // CTopLevelSeqEntryContext
1254 {
1255  if (sep.IsSeq()) {
1256  // Is Bioseq
1257  const CBioseq& bsp = sep.GetSeq();
1258  for (auto& sid : bsp.GetId()) {
1259  TSEQID_CHOICE chs = sid->Which();
1260  switch (chs) {
1262  case CSeq_id_Base::e_Tpg:
1263  // Genbank allows merging only if it's the old-style 1 + 5 accessions
1264  {
1265  const CTextseq_id* tsid = sid->GetTextseq_Id ();
1266  if (tsid && tsid->IsSetAccession() && tsid->GetAccession().length() == 6) {
1267  m_CanSourcePubsBeFused = true;
1268  }
1269  }
1270  break;
1271  case CSeq_id_Base::e_Embl:
1272  case CSeq_id_Base::e_Ddbj:
1273  case CSeq_id_Base::e_Pir:
1275  case CSeq_id_Base::e_Prf:
1276  case CSeq_id_Base::e_Pdb:
1277  case CSeq_id_Base::e_Tpe:
1278  case CSeq_id_Base::e_Tpd:
1279  case CSeq_id_Base::e_Gpipe:
1283  // with some types, it's okay to merge
1284  m_CanSourcePubsBeFused = true;
1285  break;
1286  default:
1287  break;
1288  }
1289  }
1290  } else if (sep.IsSet()) {
1291  // Is Bioseq-set
1292  const CBioseq_set& bssp = sep.GetSet();
1293  if (bssp.CanGetClass() && bssp.GetClass() == CBioseq_set::eClass_small_genome_set) {
1294  m_HasSmallGenomeSet = true;
1295  }
1296  for (auto& seqentry : bssp.GetSeq_set()) {
1297  // recursively explore current Bioseq-set
1298  x_InitSeqs(*seqentry);
1299  }
1300  }
1301 }
1303 CTopLevelSeqEntryContext::CTopLevelSeqEntryContext( const CSeq_entry_Handle &entry_handle, bool useIndexedFasterSets )
1304 {
1305  m_CanSourcePubsBeFused = false;
1306  m_HasSmallGenomeSet = false;
1308  if (useIndexedFasterSets) {
1309  CSeq_entry_Handle tseh = entry_handle.GetTopLevelEntry();
1311  CSeq_entry& topsep = const_cast<CSeq_entry&>(*tcsep);
1312  x_InitSeqs( topsep );
1313  return;
1314  }
1316  CBioseq_CI bioseq_iter( entry_handle.GetScope(), *entry_handle.GetSeq_entryCore() );
1317  for( ; bioseq_iter; ++bioseq_iter ) {
1318  ITERATE( CBioseq_Handle::TId, it, bioseq_iter->GetId() ) {
1319  CConstRef<CSeq_id> seqId = (*it).GetSeqIdOrNull();
1320  if( ! seqId.IsNull() ) {
1321  switch( seqId->Which() ) {
1324  case CSeq_id_Base::e_Embl:
1325  case CSeq_id_Base::e_Pir:
1328  case CSeq_id_Base::e_Ddbj:
1329  case CSeq_id_Base::e_Prf:
1330  case CSeq_id_Base::e_Pdb:
1331  case CSeq_id_Base::e_Tpe:
1332  case CSeq_id_Base::e_Tpd:
1333  case CSeq_id_Base::e_Gpipe:
1334  // with some types, it's okay to merge
1335  m_CanSourcePubsBeFused = true;
1336  break;
1338  case CSeq_id_Base::e_Tpg:
1339  // Genbank allows merging only if it's the old-style 1 + 5 accessions
1340  if( seqId->GetTextseq_Id() &&
1341  seqId->GetTextseq_Id()->IsSetAccession() &&
1342  seqId->GetTextseq_Id()->GetAccession().length() == 6 ) {
1343  m_CanSourcePubsBeFused = true;
1344  }
1345  break;
1347  case CSeq_id_Base::e_Local:
1348  case CSeq_id_Base::e_Other:
1350  case CSeq_id_Base::e_Giim:
1351  case CSeq_id_Base::e_Gi:
1352  break;
1353  default:
1354  break;
1355  }
1356  }
1357  }
1358  }
1360  // check all Bioseq-sets, if any
1361  if( entry_handle.IsSet() ) {
1362  if( entry_handle.GetSet().CanGetClass() &&
1364  {
1365  m_HasSmallGenomeSet = true;
1366  } else {
1367  CSeq_entry_CI seq_entry_ci( entry_handle, CSeq_entry_CI::eRecursive );
1368  for( ; seq_entry_ci && ! m_HasSmallGenomeSet; ++seq_entry_ci ) {
1369  if( seq_entry_ci->IsSet() && seq_entry_ci->GetSet().CanGetClass() &&
1371  {
1372  m_HasSmallGenomeSet = true;
1373  break;
1374  }
1375  }
1376  }
1377  }
1378 }
