NCBI C++ ToolKit
indexer.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /*
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Jonathan Kans
27 *
28 */
29 
30 #include <ncbi_pch.hpp>
31 
32 #include <util/unicode.hpp>
33 #include <util/static_set.hpp>
34 #include <util/static_map.hpp>
35 
37 
38 #include <objmgr/feat_ci.hpp>
39 #include <objmgr/seqdesc_ci.hpp>
40 #include <objmgr/seq_map_ci.hpp>
41 #include <objmgr/error_codes.hpp>
42 
43 #include <objmgr/util/indexer.hpp>
44 #include <objmgr/util/sequence.hpp>
46 
47 #define NCBI_USE_ERRCODE_X ObjMgr_Indexer
48 
52 
53 
54 // CSeqEntryIndex
55 
56 // Constructors take top-level sequence object, create a CRef<CSeqMasterIndex>, and call its initializer
58 
59 {
60  m_Idx.Reset(new CSeqMasterIndex);
61  m_Idx->x_Initialize(topseh, policy, flags);
62 }
63 
65 
66 {
68  m_Idx->x_Initialize(bsh, policy, flags);
69 }
70 
72 
73 {
75  m_Idx->x_Initialize(topsep, policy, flags);
76 }
77 
79 
80 {
82  m_Idx->x_Initialize(seqset, policy, flags);
83 }
84 
86 
87 {
89  m_Idx->x_Initialize(bioseq, policy, flags);
90 }
91 
93 
94 {
96  m_Idx->x_Initialize(submit, policy, flags);
97 }
98 
100 
101 {
103  m_Idx->x_Initialize(topsep, sblock, policy, flags);
104 }
105 
107 
108 {
110  m_Idx->x_Initialize(topsep, descr, policy, flags);
111 }
112 
113 // Get first Bioseq index
115 
116 {
117  return m_Idx->GetBioseqIndex();
118 }
119 
120 // Get Nth Bioseq index
122 
123 {
124  return m_Idx->GetBioseqIndex(n);
125 }
126 
127 // Get Bioseq index by accession
129 
130 {
131  return m_Idx->GetBioseqIndex(accn);
132 }
133 
134 // Get Bioseq index by handle (via best Seq-id string)
136 
137 {
138  return m_Idx->GetBioseqIndex(bsh);
139 }
140 
141 // Get Bioseq index by feature
143 
144 {
145  return m_Idx->GetBioseqIndex(mf);
146 }
147 
148 // Get Bioseq index by sublocation
150 
151 {
152  return m_Idx->GetBioseqIndex(loc);
153 }
154 
155 const vector<CRef<CBioseqIndex>>& CSeqEntryIndex::GetBioseqIndices(void)
156 
157 {
158  return m_Idx->GetBioseqIndices();
159 }
160 
161 const vector<CRef<CSeqsetIndex>>& CSeqEntryIndex::GetSeqsetIndices(void)
162 
163 {
164  return m_Idx->GetSeqsetIndices();
165 }
166 
168 
169 {
170  return m_Idx->DistributedReferences();
171 }
172 
174 
175 {
176  m_Idx->SetSnpFunc (snp);
177 }
178 
180 
181 {
182  return m_Idx->GetSnpFunc();
183 }
184 
185 void CSeqEntryIndex::SetFeatDepth(int featDepth)
186 
187 {
188  m_Idx->SetFeatDepth (featDepth);
189 }
190 
192 
193 {
194  return m_Idx->GetFeatDepth();
195 }
196 
197 void CSeqEntryIndex::SetGapDepth(int featDepth)
198 
199 {
200  m_Idx->SetGapDepth (featDepth);
201 }
202 
204 
205 {
206  return m_Idx->GetGapDepth();
207 }
208 
210 
211 {
212  return m_Idx->IsFetchFailure();
213 }
214 
216 
217 {
218  return m_Idx->IsIndexFailure();
219 }
220 
221 
222 // CSeqMasterIndex
223 
224 // Initializers take top-level sequence object, create Seq-entry wrapper if necessary
226 {
227  m_Policy = policy;
228  m_Flags = flags;
229 
230  m_Tseh = topseh.GetTopLevelEntry();
232  CSeq_entry& topsep = const_cast<CSeq_entry&>(*tcsep);
233  topsep.Parentize();
234  m_Tsep.Reset(&topsep);
235 
236  m_FeatTree = new feature::CFeatTree;
237 
238  m_HasOperon = false;
239  m_IsSmallGenomeSet = false;
240  m_DistributedReferences = false;
241  m_SnpFunc = 0;
242  m_FeatDepth = 0;
243  m_GapDepth = 0;
244  m_IndexFailure = false;
245 
246  try {
247  // Code copied from x_Init, then modified to reuse existing scope from CSeq_entry_Handle
249  if ( !m_Objmgr ) {
250  // raise hell
251  m_IndexFailure = true;
252  }
253 
255  if ( !m_Scope ) {
256  // raise hell
257  m_IndexFailure = true;
258  }
259 
260  m_Counter.Set(0);
261 
262  // Populate vector of CBioseqIndex objects representing local Bioseqs in blob
263  CRef<CSeqsetIndex> noparent;
264  x_InitSeqs( *m_Tsep, noparent );
265  }
266  catch (CException& e) {
267  m_IndexFailure = true;
268  ERR_POST_X(1, Error << "Error in CSeqMasterIndex::x_Init: " << e.what());
269  }
270 }
271 
273 {
274  m_Policy = policy;
275  m_Flags = flags;
276 
277  m_Tseh = bsh.GetTopLevelEntry();
279  CSeq_entry& topsep = const_cast<CSeq_entry&>(*tcsep);
280  topsep.Parentize();
281  m_Tsep.Reset(&topsep);
282 
283  m_FeatTree = new feature::CFeatTree;
284 
285  m_HasOperon = false;
286  m_IsSmallGenomeSet = false;
287  m_DistributedReferences = false;
288  m_SnpFunc = 0;
289  m_FeatDepth = 0;
290  m_GapDepth = 0;
291  m_IndexFailure = false;
292 
293  try {
294  // Code copied from x_Init, then modified to reuse existing scope from CSeq_entry_Handle
296  if ( !m_Objmgr ) {
297  // raise hell
298  m_IndexFailure = true;
299  }
300 
302  if ( !m_Scope ) {
303  // raise hell
304  m_IndexFailure = true;
305  }
306 
307  m_Counter.Set(0);
308 
309  // Populate vector of CBioseqIndex objects representing local Bioseqs in blob
310  CRef<CSeqsetIndex> noparent;
311  x_InitSeqs( *m_Tsep, noparent );
312  }
313  catch (CException& e) {
314  m_IndexFailure = true;
315  ERR_POST_X(1, Error << "Error in CSeqMasterIndex::x_Init: " << e.what());
316  }
317 }
318 
320 {
321  m_Policy = policy;
322  m_Flags = flags;
323 
324  topsep.Parentize();
325  m_Tsep.Reset(&topsep);
326 
327  x_Init();
328 }
329 
331 {
332  m_Policy = policy;
333  m_Flags = flags;
334 
335  CSeq_entry* parent = seqset.GetParentEntry();
336  if (parent) {
337  parent->Parentize();
338  m_Tsep.Reset(parent);
339  } else {
340  CRef<CSeq_entry> sep(new CSeq_entry);
341  sep->SetSet(seqset);
342  sep->Parentize();
343  m_Tsep.Reset(sep);
344  }
345 
346  x_Init();
347 }
348 
350 {
351  m_Policy = policy;
352  m_Flags = flags;
353 
354  CSeq_entry* parent = bioseq.GetParentEntry();
355  if (parent) {
356  parent->Parentize();
357  m_Tsep.Reset(parent);
358  } else {
359  CRef<CSeq_entry> sep(new CSeq_entry);
360  sep->SetSeq(bioseq);
361  sep->Parentize();
362  m_Tsep.Reset(sep);
363  }
364 
365  x_Init();
366 }
367 
369 {
370  m_Policy = policy;
371  m_Flags = flags;
372 
373  _ASSERT(submit.CanGetData());
374  _ASSERT(submit.CanGetSub());
375  _ASSERT(submit.GetData().IsEntrys());
376  _ASSERT(!submit.GetData().GetEntrys().empty());
377 
378  CRef<CSeq_entry> sep = submit.GetData().GetEntrys().front();
379  sep->Parentize();
380  m_Tsep.Reset(sep);
381  m_SbtBlk.Reset(&submit.GetSub());
382 
383  x_Init();
384 }
385 
387 {
388  m_Policy = policy;
389  m_Flags = flags;
390 
391  topsep.Parentize();
392  m_Tsep.Reset(&topsep);
393  m_SbtBlk.Reset(&sblock);
394 
395  x_Init();
396 }
397 
399 {
400  m_Policy = policy;
401  m_Flags = flags;
402 
403  topsep.Parentize();
404  m_Tsep.Reset(&topsep);
405  m_TopDescr.Reset(&descr);
406 
407  x_Init();
408 }
409 
411 
412 {
413  m_SnpFunc = snp;
414 }
415 
417 
418 {
419  return m_SnpFunc;
420 }
421 
422 void CSeqMasterIndex::SetFeatDepth (int featDepth)
423 
424 {
425  m_FeatDepth = featDepth;
426 }
427 
429 
430 {
431  return m_FeatDepth;
432 }
433 
434 void CSeqMasterIndex::SetGapDepth (int gapDepth)
435 
436 {
437  m_GapDepth = gapDepth;
438 }
439 
441 
442 {
443  return m_GapDepth;
444 }
445 
446 
447 // At end of program, poll all Bioseqs to check for far fetch failure flag
449 
450 {
451  for (auto& bsx : m_BsxList) {
452  if (bsx->IsFetchFailure()) {
453  return true;
454  }
455  }
456  return false;
457 }
458 
459 // FindBestIdChoice modified from feature_item.cpp
460 static int s_IdxSeqIdHandle(const CSeq_id_Handle& idh)
461 {
462  CConstRef<CSeq_id> id = idh.GetSeqId();
463  CRef<CSeq_id> id_non_const
464  (const_cast<CSeq_id*>(id.GetPointer()));
465  return CSeq_id::Score(id_non_const);
466 }
467 
469 {
472 
473  ITERATE( CBioseq_Handle::TId, it, ids ) {
474  switch( (*it).Which() ) {
475  case CSeq_id::e_Local:
476  case CSeq_id::e_Genbank:
477  case CSeq_id::e_Embl:
478  case CSeq_id::e_Ddbj:
480  case CSeq_id::e_Pir:
481  case CSeq_id::e_Gi:
482  case CSeq_id::e_Other:
483  case CSeq_id::e_General:
484  case CSeq_id::e_Tpg:
485  case CSeq_id::e_Tpe:
486  case CSeq_id::e_Tpd:
487  case CSeq_id::e_Gpipe:
488  tracker(*it);
489  break;
490  default:
491  break;
492  }
493  }
494  return tracker.GetBestChoice();
495 }
496 
498 
499 {
500  if (bsh) {
501  const CBioseq_Handle::TId& ids = bsh.GetId();
502  if (! ids.empty()) {
504  if (best) {
505  return best.AsString();
506  }
507  }
508  }
509 
510  return "";
511 }
512 
513 // Recursively explores from top-level Seq-entry to make flattened vector of CBioseqIndex objects
515 
516 {
517  if (sep.IsSeq()) {
518  // Is Bioseq
519  const CBioseq& bsp = sep.GetSeq();
521  if (bsh) {
522  // create CBioseqIndex object for current Bioseq
523  CRef<CBioseqIndex> bsx(new CBioseqIndex(bsh, bsp, bsh, prnt, m_Tseh, m_Scope, *this, m_Policy, m_Flags));
524 
525  // record CBioseqIndex in vector for IterateBioseqs or GetBioseqIndex
526  m_BsxList.push_back(bsx);
527 
528  // map from accession string to CBioseqIndex object
529  const string& accn = bsx->GetAccession();
530  m_AccnIndexMap[accn] = bsx;
531 
532  const CBioseq_Handle::TId& ids = bsh.GetId();
533  if (! ids.empty()) {
534  ITERATE( CBioseq_Handle::TId, it, ids ) {
535  TSEQID_CHOICE chs = (*it).Which();
536  switch( chs ) {
537  case CSeq_id::e_Local:
538  case CSeq_id::e_Genbank:
539  case CSeq_id::e_Embl:
540  case CSeq_id::e_Ddbj:
542  case CSeq_id::e_Pir:
543  case CSeq_id::e_Gi:
544  case CSeq_id::e_Other:
545  case CSeq_id::e_General:
546  case CSeq_id::e_Tpg:
547  case CSeq_id::e_Tpe:
548  case CSeq_id::e_Tpd:
549  case CSeq_id::e_Gpipe:
550  {
551  // map from handle to Seq-id string to CBioseqIndex object
552  string str = (*it).AsString();
553  m_BestIdIndexMap[str] = bsx;
554  break;
555  }
556  default:
557  break;
558  }
559  }
560  }
561 
562  if (bsp.IsSetDescr()) {
563  for (auto& desc : bsp.GetDescr().Get()) {
564  if (desc->Which() == CSeqdesc::e_Pub) {
566  }
567  }
568  }
569 
570  if (bsp.IsSetAnnot()) {
571  for (auto& annt : bsp.GetAnnot()) {
572  if (annt->IsFtable()) {
573  for (auto& feat : annt->GetData().GetFtable()) {
574  if (feat->IsSetData() && feat->GetData().Which() == CSeqFeatData::e_Pub) {
576  } else if (feat->IsSetCit()) {
578  }
579  }
580  }
581  }
582  }
583  }
584  } else if (sep.IsSet()) {
585  // Is Bioseq-set
586  const CBioseq_set& bssp = sep.GetSet();
588  if (ssh) {
589  // create CSeqsetIndex object for current Bioseq-set
590  CRef<CSeqsetIndex> ssx(new CSeqsetIndex(ssh, bssp, prnt));
591 
593  m_IsSmallGenomeSet = true;
594  }
595 
596  if (level > 0 && bssp.IsSetDescr()) {
597  for (auto& desc : bssp.GetDescr().Get()) {
598  if (desc->Which() == CSeqdesc::e_Pub) {
600  }
601  }
602  }
603 
604  // record CSeqsetIndex in vector
605  m_SsxList.push_back(ssx);
606 
607  for (auto& seqentry : bssp.GetSeq_set()) {
608  // recursively explore current Bioseq-set
609  x_InitSeqs(*seqentry, ssx, level + 1);
610  }
611 
612  if (bssp.IsSetAnnot()) {
613  for (auto& annt : bssp.GetAnnot()) {
614  if (annt->IsFtable()) {
615  for (auto& feat : annt->GetData().GetFtable()) {
616  if (feat->IsSetData() && feat->GetData().Which() == CSeqFeatData::e_Pub) {
618  } else if (feat->IsSetCit()) {
620  }
621  }
622  }
623  }
624  }
625  }
626  }
627 }
628 
629 // Common initialization function creates local default CScope
631 
632 {
633  m_FeatTree = new feature::CFeatTree;
634 
635  m_HasOperon = false;
636  m_IsSmallGenomeSet = false;
637  m_DistributedReferences = false;
638  m_SnpFunc = 0;
639  m_FeatDepth = 0;
640  m_GapDepth = 0;
641  m_IndexFailure = false;
642 
643  try {
645  if ( !m_Objmgr ) {
646  // raise hell
647  m_IndexFailure = true;
648  }
649 
650  m_Scope.Reset( new CScope( *m_Objmgr ) );
651  if ( !m_Scope ) {
652  // raise hell
653  m_IndexFailure = true;
654  }
655 
656  m_Counter.Set(0);
657 
658  m_Scope->AddDefaults();
659 
661 
662  // Populate vector of CBioseqIndex objects representing local Bioseqs in blob
663  CRef<CSeqsetIndex> noparent;
664  x_InitSeqs( *m_Tsep, noparent );
665  }
666  catch (CException& e) {
667  m_IndexFailure = true;
668  ERR_POST_X(1, Error << "Error in CSeqMasterIndex::x_Init: " << e.what());
669  }
670 }
671 
672 // Get first Bioseq index
674 
675 {
676  for (auto& bsx : m_BsxList) {
677  return bsx;
678  }
679  return CRef<CBioseqIndex> ();
680 }
681 
682 // Get Nth Bioseq index
684 
685 {
686  for (auto& bsx : m_BsxList) {
687  n--;
688  if (n > 0) continue;
689  return bsx;
690  }
691  return CRef<CBioseqIndex> ();
692 }
693 
694 // Get Bioseq index by accession
696 
697 {
699  if (it != m_AccnIndexMap.end()) {
700  CRef<CBioseqIndex> bsx = it->second;
701  return bsx;
702  }
703  return CRef<CBioseqIndex> ();
704 }
705 
706 // Get Bioseq index by handle (via best Seq-id string)
708 
709 {
710  string bestid = s_IdxGetBestIdString(bsh);
712  if (it != m_BestIdIndexMap.end()) {
713  CRef<CBioseqIndex> bsx = it->second;
714  return bsx;
715  }
716  return CRef<CBioseqIndex> ();
717 }
718 
719 // Get Bioseq index by string
721 
722 {
724  if (it != m_BestIdIndexMap.end()) {
725  CRef<CBioseqIndex> bsx = it->second;
726  return bsx;
727  }
728  return CRef<CBioseqIndex> ();
729 }
730 
731 // Get Bioseq index by feature
733 
734 {
735  CSeq_id_Handle idh = mf.GetLocationId();
737  return GetBioseqIndex(bsh);
738 }
739 
740 // Get Bioseq index by sublocation
742 
743 {
745  return GetBioseqIndex(bsh);
746 }
747 
748 // Allow access to internal vectors for application to use in iterators
749 const vector<CRef<CBioseqIndex>>& CSeqMasterIndex::GetBioseqIndices(void)
750 
751 {
752  return m_BsxList;
753 }
754 
755 const vector<CRef<CSeqsetIndex>>& CSeqMasterIndex::GetSeqsetIndices(void)
756 
757 {
758  return m_SsxList;
759 }
760 
761 
762 // CSeqsetIndex
763 
764 // Constructor
766  const CBioseq_set& bssp,
767  CRef<CSeqsetIndex> prnt)
768  : m_Ssh(ssh),
769  m_Bssp(bssp),
770  m_Prnt(prnt)
771 {
773 
774  if (ssh.IsSetClass()) {
775  m_Class = ssh.GetClass();
776  }
777 }
778 
779 
780 // CBioseqIndex
781 
782 // Constructor
784  const CBioseq& bsp,
785  CBioseq_Handle obsh,
786  CRef<CSeqsetIndex> prnt,
787  CSeq_entry_Handle tseh,
788  CRef<CScope> scope,
789  CSeqMasterIndex& idx,
792  : m_Bsh(bsh),
793  m_Bsp(bsp),
794  m_OrigBsh(obsh),
795  m_Prnt(prnt),
796  m_Tseh(tseh),
797  m_Scope(scope),
798  m_Idx(&idx),
799  m_Policy(policy),
800  m_Flags(flags)
801 {
802  m_FetchFailure = false;
803 
804  m_GapsInitialized = false;
805  m_DescsInitialized = false;
806  m_FeatsInitialized = false;
807  m_SourcesInitialized = false;
808  m_FeatForProdInitialized = false;
810 
811  m_ForceOnlyNearFeats = false;
812 
813  // reset member variables to cleared state
814  m_IsNA = false;
815  m_IsAA = false;
816  m_Topology = NCBI_SEQTOPOLOGY(not_set);
817 
818  m_IsDelta = false;
819  m_IsDeltaLitOnly = false;
820  m_IsVirtual = false;
821  m_IsMap = false;
822 
823  m_Title.clear();
824 
825  m_MolInfo.Reset();
829 
830  m_Accession.clear();
831 
832  m_IsRefSeq = false;
833  m_IsNC = false;
834  m_IsNM = false;
835  m_IsNR = false;
836  m_IsNZ = false;
837  m_IsPatent = false;
838  m_IsPDB = false;
839  m_IsWP = false;
840  m_ThirdParty = false;
841  m_WGSMaster = false;
842  m_TSAMaster = false;
843  m_TLSMaster = false;
844 
845  m_GeneralStr.clear();
846  m_GeneralId = 0;
847  m_PatentCountry.clear();
848  m_PatentNumber.clear();
849 
850  m_PatentSequence = 0;
851 
852  m_PDBChain = 0;
853  m_PDBChainID.clear();
854 
855  m_HTGTech = false;
856  m_HTGSUnfinished = false;
857  m_IsTLS = false;
858  m_IsTSA = false;
859  m_IsWGS = false;
860  m_IsEST_STS_GSS = false;
861 
862  m_UseBiosrc = false;
863 
864  m_HTGSCancelled = false;
865  m_HTGSDraft = false;
866  m_HTGSPooled = false;
867  m_TPAExp = false;
868  m_TPAInf = false;
869  m_TPAReasm = false;
870  m_Unordered = false;
871 
873 
875  m_DescTaxname.clear();
876 
877  m_BioSource.Reset();
878  m_Taxname.clear();
879  m_Common.clear();
880  m_Lineage.clear();
882  m_UsingAnamorph = false;
883  m_Genus.clear();
884  m_Species.clear();
885  m_Multispecies = false;
886  m_Genome = NCBI_GENOME(unknown);
887  m_IsPlasmid = false;
888  m_IsChromosome = false;
889 
890  m_Organelle.clear();
891 
892  m_FirstSuperKingdom.clear();
893  m_SecondSuperKingdom.clear();
894  m_IsCrossKingdom = false;
895 
898  m_Clone.clear();
899  m_has_clone = false;
900  m_Map.clear();
901  m_Plasmid.clear();
902  m_Segment.clear();
903 
904  m_Breed.clear();
905  m_Cultivar.clear();
907  m_Isolate.clear();
908  m_Strain.clear();
909  m_Substrain.clear();
911 
912  m_IsUnverified = false;
913  m_IsUnverifiedFeature = false;
914  m_IsUnverifiedOrganism = false;
917 
918  m_IsUnreviewed = false;
920 
922 
923  m_Comment.clear();
924  m_IsPseudogene = false;
925 
926  m_HasGene = false;
927  m_HasMultiIntervalGenes = false;
928  m_HasSource = false;
929 
930  m_rEnzyme.clear();
931 
932  // now start setting member variables from Bioseq
933  m_IsNA = m_Bsh.IsNa();
934  m_IsAA = m_Bsh.IsAa();
936  m_Length = 0;
937 
938  if (m_Bsh.IsSetInst()) {
939  if (m_Bsh.IsSetInst_Topology()) {
941  }
942 
943  if (m_Bsh.IsSetInst_Length()) {
945  } else {
947  }
948 
949  if (m_Bsh.IsSetInst_Repr()) {
951  m_IsDelta = (repr == CSeq_inst::eRepr_delta);
953  m_IsMap = (repr == CSeq_inst::eRepr_map);
954  }
955  if (m_IsDelta && m_Bsh.IsSetInst_Ext()) {
957  bool hasLoc = false;
958  if ( ext.IsDelta() ) {
959  ITERATE (CDelta_ext::Tdata, it, ext.GetDelta().Get()) {
960  if ( (*it)->IsLoc() ) {
961  const CSeq_loc& loc = (*it)->GetLoc();
962  if (loc.IsNull()) continue;
963  hasLoc = true;
964  }
965  }
966  }
967  if (! hasLoc) {
968  m_IsDeltaLitOnly = true;
969  }
970  }
971  }
972 
973  // process Seq-ids
974  for (CSeq_id_Handle sid : obsh.GetId()) {
975  // first switch to set RefSeq and ThirdParty flags
976  switch (sid.Which()) {
977  case NCBI_SEQID(Other):
978  m_IsRefSeq = true;
979  break;
980  case NCBI_SEQID(Tpg):
981  case NCBI_SEQID(Tpe):
982  case NCBI_SEQID(Tpd):
983  m_ThirdParty = true;
984  break;
985  default:
986  break;
987  }
988  // second switch now avoids complicated flag setting logic
989  switch (sid.Which()) {
990  case NCBI_SEQID(Tpg):
991  case NCBI_SEQID(Tpe):
992  case NCBI_SEQID(Tpd):
993  case NCBI_SEQID(Other):
994  case NCBI_SEQID(Genbank):
995  case NCBI_SEQID(Embl):
996  case NCBI_SEQID(Ddbj):
997  {
998  CConstRef<CSeq_id> id = sid.GetSeqId();
999  const CTextseq_id& tsid = *id->GetTextseq_Id ();
1000  if (tsid.IsSetAccession()) {
1001  m_Accession = tsid.GetAccession ();
1003  TACCN_CHOICE div = (TACCN_CHOICE) (type & NCBI_ACCN(division_mask));
1004  if ( div == NCBI_ACCN(wgs) )
1005  {
1006  if( (type & CSeq_id::fAcc_master) != 0 ) {
1007  m_WGSMaster = true;
1008  }
1009  } else if ( div == NCBI_ACCN(tsa) )
1010  {
1011  if( (type & CSeq_id::fAcc_master) != 0 && m_IsVirtual ) {
1012  m_TSAMaster = true;
1013  }
1014  } else if (type == NCBI_ACCN(refseq_chromosome)) {
1015  m_IsNC = true;
1016  } else if (type == NCBI_ACCN(refseq_mrna)) {
1017  m_IsNM = true;
1018  } else if (type == NCBI_ACCN(refseq_mrna_predicted)) {
1019  m_IsNM = true;
1020  } else if (type == NCBI_ACCN(refseq_ncrna)) {
1021  m_IsNR = true;
1022  } else if (type == NCBI_ACCN(refseq_wgs_nuc)) {
1023  m_IsNZ = true;
1024  } else if (type == NCBI_ACCN(refseq_unique_prot)) {
1025  m_IsWP = true;
1026  }
1027  }
1028  break;
1029  }
1030  case NCBI_SEQID(General):
1031  {
1032  CConstRef<CSeq_id> id = sid.GetSeqId();
1033  const CDbtag& gen_id = id->GetGeneral ();
1034  if (! gen_id.IsSkippable ()) {
1035  if (gen_id.IsSetTag ()) {
1036  const CObject_id& oid = gen_id.GetTag();
1037  if (oid.IsStr()) {
1038  m_GeneralStr = oid.GetStr();
1039  } else if (oid.IsId()) {
1040  m_GeneralId = oid.GetId();
1041  }
1042  }
1043  }
1044  break;
1045  }
1046  case NCBI_SEQID(Pdb):
1047  {
1048  m_IsPDB = true;
1049  CConstRef<CSeq_id> id = sid.GetSeqId();
1050  const CPDB_seq_id& pdb_id = id->GetPdb ();
1051  if (pdb_id.IsSetChain_id()) {
1052  m_PDBChainID = pdb_id.GetChain_id();
1053  } else if (pdb_id.IsSetChain()) {
1054  m_PDBChain = pdb_id.GetChain();
1055  }
1056  break;
1057  }
1058  case NCBI_SEQID(Patent):
1059  {
1060  m_IsPatent = true;
1061  CConstRef<CSeq_id> id = sid.GetSeqId();
1062  const CPatent_seq_id& pat_id = id->GetPatent();
1063  if (pat_id.IsSetSeqid()) {
1064  m_PatentSequence = pat_id.GetSeqid();
1065  }
1066  if (pat_id.IsSetCit()) {
1067  const CId_pat& cit = pat_id.GetCit();
1068  m_PatentCountry = cit.GetCountry();
1069  m_PatentNumber = cit.GetSomeNumber();
1070  }
1071  break;
1072  }
1073  case NCBI_SEQID(Gpipe):
1074  break;
1075  default:
1076  break;
1077  }
1078  }
1079 
1080  // process restriction map
1081  if (m_IsMap) {
1082  if (bsh.IsSetInst_Ext() && bsh.GetInst_Ext().IsMap()) {
1083  const CMap_ext& mp = bsh.GetInst_Ext().GetMap();
1084  if (mp.IsSet()) {
1085  const CMap_ext::Tdata& ft = mp.Get();
1086  ITERATE (CMap_ext::Tdata, itr, ft) {
1087  const CSeq_feat& feat = **itr;
1088  const CSeqFeatData& data = feat.GetData();
1089  if (! data.IsRsite()) continue;
1090  const CRsite_ref& rsite = data.GetRsite();
1091  if (rsite.IsStr()) {
1092  m_rEnzyme = rsite.GetStr();
1093  }
1094  }
1095  }
1096  }
1097  }
1098 }
1099 
1100 // Destructor
1102 
1103 {
1104 }
1105 
1106 // Gap collection (delayed until needed)
1108 
1109 {
1110  try {
1111  if (m_GapsInitialized) {
1112  return;
1113  }
1114 
1115  m_GapsInitialized = true;
1116 
1117  if (! m_IsDelta) {
1118  return;
1119  }
1120 
1121  SSeqMapSelector sel;
1122 
1123  size_t resolveCount = 0;
1124 
1126  auto idxl = idx.Lock();
1127  if (idxl) {
1128  resolveCount = idxl->GetGapDepth();
1129  }
1130 
1132  .SetResolveCount(resolveCount);
1133 
1134  // explore gaps, pass original target BioseqHandle if using Bioseq sublocation
1135  for (CSeqMap_CI gap_it(m_OrigBsh, sel); gap_it; ++gap_it) {
1136 
1137  TSeqPos start = gap_it.GetPosition();
1138  TSeqPos end = gap_it.GetEndPosition();
1139  TSeqPos length = gap_it.GetLength();
1140 
1141  // attempt to find CSeq_gap info
1142  const CSeq_gap * pGap = NULL;
1143  if( gap_it.IsSetData() && gap_it.GetData().IsGap() ) {
1144  pGap = &gap_it.GetData().GetGap();
1145  } else {
1146  CConstRef<CSeq_literal> pSeqLiteral = gap_it.GetRefGapLiteral();
1147  if( pSeqLiteral && pSeqLiteral->IsSetSeq_data() ) {
1148  const CSeq_data & seq_data = pSeqLiteral->GetSeq_data();
1149  if( seq_data.IsGap() ) {
1150  pGap = &seq_data.GetGap();
1151  }
1152  }
1153  }
1154 
1155  CFastaOstream::SGapModText gap_mod_text;
1156  if( pGap ) {
1157  CFastaOstream::GetGapModText(*pGap, gap_mod_text);
1158  }
1159  string type = gap_mod_text.gap_type;
1160  vector<string>& evidence = gap_mod_text.gap_linkage_evidences;
1161 
1162  bool isUnknownLength = gap_it.IsUnknownLength();
1163 
1164  // feature name depends on what quals we use
1165  bool isAssemblyGap = ( ! type.empty() || ! evidence.empty() );
1166 
1167  CRef<CGapIndex> sgx(new CGapIndex(start, end, length, type, evidence, isUnknownLength, isAssemblyGap, *this));
1168  m_GapList.push_back(sgx);
1169  }
1170  }
1171  catch (CException& e) {
1172  ERR_POST_X(3, Error << "Error in CBioseqIndex::x_InitGaps: " << e.what());
1173  }
1174 }
1175 
1176 static const char* x_OrganelleName (
1177  TBIOSOURCE_GENOME genome,
1178  bool has_plasmid,
1179  bool virus_or_phage,
1180  bool wgs_suffix
1181 )
1182 
1183 {
1184  const char* result = kEmptyCStr;
1185 
1186  switch (genome) {
1187  case NCBI_GENOME(chloroplast):
1188  result = "chloroplast";
1189  break;
1190  case NCBI_GENOME(chromoplast):
1191  result = "chromoplast";
1192  break;
1193  case NCBI_GENOME(kinetoplast):
1194  result = "kinetoplast";
1195  break;
1196  case NCBI_GENOME(mitochondrion):
1197  {
1198  if (has_plasmid || wgs_suffix) {
1199  result = "mitochondrial";
1200  } else {
1201  result = "mitochondrion";
1202  }
1203  break;
1204  }
1205  case NCBI_GENOME(plastid):
1206  result = "plastid";
1207  break;
1208  case NCBI_GENOME(macronuclear):
1209  {
1210  result = "macronuclear";
1211  break;
1212  }
1213  case NCBI_GENOME(extrachrom):
1214  {
1215  if (! wgs_suffix) {
1216  result = "extrachromosomal";
1217  }
1218  break;
1219  }
1220  case NCBI_GENOME(plasmid):
1221  {
1222  if (! wgs_suffix) {
1223  result = "plasmid";
1224  }
1225  break;
1226  }
1227  // transposon and insertion-seq are obsolete
1228  case NCBI_GENOME(cyanelle):
1229  result = "cyanelle";
1230  break;
1231  case NCBI_GENOME(proviral):
1232  {
1233  if (! virus_or_phage) {
1234  if (has_plasmid || wgs_suffix) {
1235  result = "proviral";
1236  } else {
1237  result = "provirus";
1238  }
1239  }
1240  break;
1241  }
1242  case NCBI_GENOME(virion):
1243  {
1244  if (! virus_or_phage) {
1245  result = "virus";
1246  }
1247  break;
1248  }
1249  case NCBI_GENOME(nucleomorph):
1250  {
1251  if (! wgs_suffix) {
1252  result = "nucleomorph";
1253  }
1254  break;
1255  }
1256  case NCBI_GENOME(apicoplast):
1257  result = "apicoplast";
1258  break;
1259  case NCBI_GENOME(leucoplast):
1260  result = "leucoplast";
1261  break;
1262  case NCBI_GENOME(proplastid):
1263  result = "proplastid";
1264  break;
1265  case NCBI_GENOME(endogenous_virus):
1266  result = "endogenous virus";
1267  break;
1268  case NCBI_GENOME(hydrogenosome):
1269  result = "hydrogenosome";
1270  break;
1271  case NCBI_GENOME(chromosome):
1272  result = "chromosome";
1273  break;
1274  case NCBI_GENOME(chromatophore):
1275  result = "chromatophore";
1276  break;
1277  }
1278 
1279  return result;
1280 }
1281 
1282 static bool s_BlankOrNotSpecialTaxname (string taxname)
1283 
1284 {
1285  if (taxname.empty()) {
1286  return true;
1287  }
1288 
1289  if (NStr::EqualNocase (taxname, "synthetic construct")) {
1290  return false;
1291  }
1292  if (NStr::EqualNocase (taxname, "artificial sequence")) {
1293  return false;
1294  }
1295  if (NStr::EqualNocase (taxname, "vector")) {
1296  return false;
1297  }
1298  if (NStr::EqualNocase (taxname, "Vector")) {
1299  return false;
1300  }
1301 
1302  return true;
1303 }
1304 
1306 
1307 {
1308  try {
1309  if (m_SourcesInitialized) {
1310  return;
1311  }
1312 
1313  m_SourcesInitialized = true;
1314 
1315  if (! m_DescsInitialized) {
1316  x_InitDescs();
1317  }
1318 
1323  if (sfxp) {
1325  if (bsrx) {
1326  CMappedFeat src_feat = bsrx->GetMappedFeat();
1327  if (src_feat) {
1328  const CBioSource& bsrc = src_feat.GetData().GetBiosrc();
1329  m_BioSource.Reset (&bsrc);
1330  }
1331  }
1332  }
1333  }
1334  }
1335 
1336  if (m_DescBioSource && ! m_BioSource) {
1338  }
1339 
1340  if (m_BioSource.NotEmpty()) {
1341  const string *common = 0;
1342 
1343  // get organism name
1344  if (m_BioSource->IsSetTaxname()) {
1346  }
1347  if (m_BioSource->IsSetCommon()) {
1348  common = &m_BioSource->GetCommon();
1349  }
1350  if (m_BioSource->IsSetOrgname()) {
1351  const COrgName& onp = m_BioSource->GetOrgname();
1352  if (onp.CanGetLineage()) {
1353  m_Lineage = onp.GetLineage();
1354  }
1355  }
1356  if (m_BioSource->CanGetOrg()) {
1357  const COrg_ref& org = m_BioSource->GetOrg();
1358  m_Taxid = org.GetTaxId();
1359  }
1360  if (m_BioSource->IsSetGenome()) {
1362  m_IsPlasmid = (m_Genome == NCBI_GENOME(plasmid));
1363  m_IsChromosome = (m_Genome == NCBI_GENOME(chromosome));
1364  }
1365 
1366  // process SubSource
1368  const CSubSource& sbs = **sbs_itr;
1369  if (! sbs.IsSetName()) continue;
1370  const string& str = sbs.GetName();
1372  case NCBI_SUBSOURCE(chromosome):
1373  m_Chromosome = str;
1374  break;
1375  case NCBI_SUBSOURCE(clone):
1376  m_Clone = str;
1377  m_has_clone = true;
1378  break;
1379  case NCBI_SUBSOURCE(map):
1380  m_Map = str;
1381  break;
1382  case NCBI_SUBSOURCE(plasmid_name):
1383  m_Plasmid = str;
1384  break;
1385  case NCBI_SUBSOURCE(segment):
1386  m_Segment = str;
1387  break;
1388  case NCBI_SUBSOURCE(linkage_group):
1389  m_LinkageGroup = str;
1390  break;
1391  default:
1392  break;
1393  }
1394  }
1395 
1396  if (m_BioSource->IsSetOrgname()) {
1397  const COrgName& onp = m_BioSource->GetOrgname();
1398  if (onp.IsSetName()) {
1399  const COrgName::TName& nam = onp.GetName();
1400  if (nam.IsBinomial()) {
1401  const CBinomialOrgName& bon = nam.GetBinomial();
1402  if (bon.IsSetGenus()) {
1403  m_Genus = bon.GetGenus();
1404  }
1405  if (bon.IsSetSpecies()) {
1406  m_Species = bon.GetSpecies();
1407  }
1408  } else if (nam.IsPartial()) {
1409  const CPartialOrgName& pon = nam.GetPartial();
1410  if (pon.IsSet()) {
1411  const CPartialOrgName::Tdata& tx = pon.Get();
1412  ITERATE (CPartialOrgName::Tdata, itr, tx) {
1413  const CTaxElement& te = **itr;
1414  if (te.IsSetFixed_level()) {
1415  int fl = te.GetFixed_level();
1416  if (fl > 0) {
1417  m_Multispecies = true;
1418  } else if (te.IsSetLevel()) {
1419  const string& lvl = te.GetLevel();
1420  if (! NStr::EqualNocase (lvl, "species")) {
1421  m_Multispecies = true;
1422  }
1423  }
1424  }
1425  }
1426  }
1427  }
1428  }
1429  }
1430 
1431  // process OrgMod
1432  const string *com = 0, *acr = 0, *syn = 0, *ana = 0,
1433  *gbacr = 0, *gbana = 0, *gbsyn = 0, *met = 0;
1434  int numcom = 0, numacr = 0, numsyn = 0, numana = 0,
1435  numgbacr = 0, numgbana = 0, numgbsyn = 0, nummet = 0;
1436 
1438  const COrgMod& omd = **omd_itr;
1439  if (! omd.IsSetSubname()) continue;
1440  const string& str = omd.GetSubname();
1441  SWITCH_ON_ORGMOD_CHOICE (omd) {
1442  case NCBI_ORGMOD(strain):
1443  if (m_Strain.empty()) {
1444  m_Strain = str;
1445  }
1446  break;
1447  case NCBI_ORGMOD(substrain):
1448  if (m_Substrain.empty()) {
1449  m_Substrain = str;
1450  }
1451  break;
1452  case NCBI_ORGMOD(cultivar):
1453  if (m_Cultivar.empty()) {
1454  m_Cultivar = str;
1455  }
1456  break;
1457  case NCBI_ORGMOD(specimen_voucher):
1458  if (m_SpecimenVoucher.empty()) {
1460  }
1461  break;
1462  case NCBI_ORGMOD(isolate):
1463  if (m_Isolate.empty()) {
1464  m_Isolate = str;
1465  }
1466  break;
// breed: m_Breed is assigned only while it is still empty, so the first
// breed OrgMod subname seen is the one that is kept
1467  case NCBI_ORGMOD(breed):
1468  if (m_Breed.empty()) {
1469  m_Breed = str;
1470  }
// NOTE(review): there is no break after the breed case, unlike every other
// case in this switch. Control falls through into the common case below, so
// a breed subname is also recorded in com and counted in numcom. This looks
// unintentional - confirm and add the missing break.
// common: remember the subname and count occurrences (com is discarded
// later when more than one was seen)
1471  case NCBI_ORGMOD(common):
1472  com = &str;
1473  numcom++;
1474  break;
1475  case NCBI_ORGMOD(acronym):
1476  acr = &str;
1477  numacr++;
1478  break;
1479  case NCBI_ORGMOD(synonym):
1480  syn = &str;
1481  numsyn++;
1482  break;
1483  case NCBI_ORGMOD(anamorph):
1484  ana = &str;
1485  numana++;
1486  break;
1487  case NCBI_ORGMOD(gb_acronym):
1488  gbacr = &str;
1489  numgbacr++;
1490  break;
1491  case NCBI_ORGMOD(gb_synonym):
1492  gbsyn = &str;
1493  numgbsyn++;
1494  break;
1495  case NCBI_ORGMOD(gb_anamorph):
1496  gbana = &str;
1497  numgbana++;
1498  break;
1499  case NCBI_ORGMOD(metagenome_source):
1500  if (m_MetaGenomeSource.empty()) {
1502  }
1503  met = &str;
1504  nummet++;
1505  break;
1506  default:
1507  break;
1508  }
1509  }
1510 
1511  if (numacr > 1) {
1512  acr = NULL;
1513  }
1514  if (numana > 1) {
1515  ana = NULL;
1516  }
1517  if (numcom > 1) {
1518  com = NULL;
1519  }
1520  if (numsyn > 1) {
1521  syn = NULL;
1522  }
1523  if (numgbacr > 1) {
1524  gbacr = NULL;
1525  }
1526  if (numgbana > 1) {
1527  gbana = NULL;
1528  }
1529  if (numgbsyn > 1) {
1530  gbsyn = NULL;
1531  }
1532  if( nummet > 1 ) {
1533  met = NULL;
1534  }
1535 
1536  if( met != 0 ) {
1537  m_Common = *met;
1538  } else if ( syn != 0 ) {
1539  m_Common = *syn;
1540  } else if ( acr != 0 ) {
1541  m_Common = *acr;
1542  } else if ( ana != 0 ) {
1543  m_Common = *ana;
1544  m_UsingAnamorph = true;
1545  } else if ( com != 0 ) {
1546  m_Common = *com;
1547  } else if ( gbsyn != 0 ) {
1548  m_Common = *gbsyn;
1549  } else if ( gbacr != 0 ) {
1550  m_Common = *gbacr;
1551  } else if ( gbana != 0 ) {
1552  m_Common = *gbana;
1553  m_UsingAnamorph = true;
1554  } else if ( common != 0 ) {
1555  m_Common = *common;
1556  }
1557  }
1558 
1559  bool virus_or_phage = false;
1560  bool has_plasmid = false;
1561  bool wgs_suffix = false;
1562 
1563  if (NStr::FindNoCase(m_Taxname, "virus") != NPOS ||
1564  NStr::FindNoCase(m_Taxname, "phage") != NPOS) {
1565  virus_or_phage = true;
1566  }
1567 
1568  if (! m_Plasmid.empty()) {
1569  has_plasmid = true;
1570  /*
1571  if (NStr::FindNoCase(m_Plasmid, "plasmid") == NPOS &&
1572  NStr::FindNoCase(m_Plasmid, "element") == NPOS) {
1573  pls_pfx = " plasmid ";
1574  }
1575  */
1576  }
1577 
1578  if (m_IsWGS) {
1579  wgs_suffix = true;
1580  }
1581 
1582  m_Organelle = x_OrganelleName (m_Genome, has_plasmid, virus_or_phage, wgs_suffix);
1583  }
1584  catch (CException& e) {
1585  ERR_POST_X(4, Error << "Error in CBioseqIndex::x_InitSource: " << e.what());
1586  }
1587 }
1588 
1589 // Descriptor collection (delayed until needed)
      // x_InitDescs (name per the error message at the bottom; the signature line is
      // elided in this listing). One-time scan of the descriptors reachable from
      // m_OrigBsh: each one is wrapped in a CDescriptorIndex and appended to
      // m_SdxList, and source / molinfo / title / user-object / comment / keyword /
      // PDB facts are cached in member fields. A CException raised during the scan
      // is logged through ERR_POST_X and is not rethrown.
1591
1592 {
1593  try {
1594  if (m_DescsInitialized) {
1595  return;
1596  }
1597
      // the flag is raised before the scan, so a scan that throws part-way is not repeated
1598  m_DescsInitialized = true;
1599
      // ends up pointing at the keyword list of the last GenBank/EMBL block that has one
1600  const list <string> *keywords = NULL;
1601
1602  int num_super_kingdom = 0;
1603  bool super_kingdoms_different = false;
1604
1605  // explore descriptors, pass original target BioseqHandle if using Bioseq sublocation
1606  for (CSeqdesc_CI desc_it(m_OrigBsh); desc_it; ++desc_it) {
1607  const CSeqdesc& sd = *desc_it;
1608  CRef<CDescriptorIndex> sdx(new CDescriptorIndex(sd, *this));
1609  m_SdxList.push_back(sdx);
1610
1611  switch (sd.Which()) {
1612  case CSeqdesc::e_Source:
1613  {
      // only the first source descriptor is remembered in m_DescBioSource
1614  if (! m_DescBioSource) {
1615  const CBioSource& biosrc = sd.GetSource();
1616  m_DescBioSource.Reset (&biosrc);
      // NOTE(review): listing line 1618 (the body of this if) is elided
1617  if (m_IsNA && ! m_BioSource) {
1619  }
1620  }
      // when m_IsWP is set, scan the partial org name for "superkingdom" elements;
      // each `break` below leaves the descriptor switch, each `continue` skips one element
1621  if (m_IsWP) {
1622  const CBioSource &bsrc = sd.GetSource();
1623  if (! bsrc.IsSetOrgname()) break;
1624  const COrgName& onp = bsrc.GetOrgname();
1625  if (! onp.IsSetName()) break;
1626  const COrgName::TName& nam = onp.GetName();
1627  if (! nam.IsPartial()) break;
1628  const CPartialOrgName& pon = nam.GetPartial();
1629  if (! pon.IsSet()) break;
1630  const CPartialOrgName::Tdata& tx = pon.Get();
1631  ITERATE (CPartialOrgName::Tdata, itr, tx) {
1632  const CTaxElement& te = **itr;
1633  if (! te.IsSetFixed_level()) continue;
1634  if (te.GetFixed_level() != 0) continue;
1635  if (! te.IsSetLevel()) continue;
1636  const string& lvl = te.GetLevel();
1637  if (! NStr::EqualNocase (lvl, "superkingdom")) continue;
1638  num_super_kingdom++;
      // NOTE(review): listing lines 1640 and 1644 (the statements inside the next
      // two branches) are elided; presumably they record the first / second name
1639  if (m_FirstSuperKingdom.empty() && te.IsSetName()) {
1641  } else if (te.IsSetName() && ! NStr::EqualNocase (m_FirstSuperKingdom, te.GetName())) {
1642  if (m_SecondSuperKingdom.empty()) {
1643  super_kingdoms_different = true;
1645  }
1646  }
      // cross-kingdom once more than one superkingdom element was seen and two names differed
1647  if (num_super_kingdom > 1 && super_kingdoms_different) {
1648  m_IsCrossKingdom = true;
1649  }
1650  }
1651  }
1652  break;
1653  }
1654  case CSeqdesc::e_Molinfo:
1655  {
      // only the first molinfo descriptor is used
1656  if (! m_MolInfo) {
1657  const CMolInfo& molinf = sd.GetMolinfo();
1658  m_MolInfo.Reset (&molinf);
1659  m_Biomol = molinf.GetBiomol();
1660  m_Tech = molinf.GetTech();
1661  m_Completeness = molinf.GetCompleteness();
1662
      // derive technique flags; every recognised technique also sets m_UseBiosrc
1663  switch (m_Tech) {
1664  case NCBI_TECH(htgs_0):
1665  case NCBI_TECH(htgs_1):
1666  case NCBI_TECH(htgs_2):
1667  m_HTGSUnfinished = true;
1668  // manufacture all titles for unfinished HTG sequences
1669  // m_Reconstruct = true;
1670  m_Title.clear();
1671  // fall through
1672  case NCBI_TECH(htgs_3):
1673  m_HTGTech = true;
1674  m_UseBiosrc = true;
1675  break;
1676  case NCBI_TECH(est):
1677  case NCBI_TECH(sts):
1678  case NCBI_TECH(survey):
1679  m_IsEST_STS_GSS = true;
1680  m_UseBiosrc = true;
1681  break;
1682  case NCBI_TECH(wgs):
1683  m_IsWGS = true;
1684  m_UseBiosrc = true;
1685  break;
1686  case NCBI_TECH(tsa):
1687  m_IsTSA = true;
1688  m_UseBiosrc = true;
1689  if (m_IsVirtual) {
1690  m_TSAMaster = true;
1691  }
1692  break;
1693  case NCBI_TECH(targeted):
1694  m_IsTLS = true;
1695  m_UseBiosrc = true;
1696  if (m_IsVirtual) {
1697  m_TLSMaster = true;
1698  }
1699  break;
1700  default:
1701  break;
1702  }
1703  }
1704  break;
1705  }
1706  case CSeqdesc::e_Title:
1707  {
      // the first accepted title wins; later title descriptors are ignored while m_Title is non-empty
1708  if (m_Title.empty()) {
1709  // for everything other than PDB proteins, title must be packaged on Bioseq - RW-2005
1710  if ( m_IsPDB || desc_it.GetSeq_entry_Handle().IsSeq() ) {
1711  m_Title = sd.GetTitle();
1712  }
1713  }
1714  break;
1715  }
1716  case CSeqdesc::e_User:
1717  {
      // user objects are dispatched on their string type name (case-insensitive)
1718  const CUser_object& usr = sd.GetUser();
1719  if (usr.IsSetType()) {
1720  const CObject_id& oi = usr.GetType();
1721  if (oi.IsStr()) {
1722  const string& type = oi.GetStr();
1723  if (NStr::EqualNocase(type, "FeatureFetchPolicy")) {
1724  FOR_EACH_USERFIELD_ON_USEROBJECT (uitr, usr) {
1725  const CUser_field& fld = **uitr;
1726  if (fld.IsSetLabel() && fld.GetLabel().IsStr()) {
1727  const string &label_str = GET_FIELD(fld.GetLabel(), Str);
1728  if (! NStr::EqualNocase(label_str, "Policy")) continue;
1729  if (fld.IsSetData() && fld.GetData().IsStr()) {
1730  const string& str = fld.GetData().GetStr();
1731  if (NStr::EqualNocase(str, "OnlyNearFeatures")) {
1732  m_ForceOnlyNearFeats = true;
1733  }
1734  }
1735  }
1736  }
1737  } else if (NStr::EqualNocase(type, "Unverified")) {
1738  m_IsUnverified = true;
1739  if (usr.IsUnverifiedOrganism()) {
1740  m_IsUnverifiedOrganism = true;
1741  }
      // NOTE(review): listing lines 1743 and 1746 (bodies of the next two ifs) are elided
1742  if (usr.IsUnverifiedMisassembled()) {
1744  }
1745  if (usr.IsUnverifiedContaminant()) {
1747  }
1748  if (usr.IsUnverifiedFeature()) {
1749  m_IsUnverifiedFeature = true;
1750  }
1751  } else if (NStr::EqualNocase(type, "Unreviewed")) {
1752  m_IsUnreviewed = true;
      // NOTE(review): listing line 1754 (body of the next if) is elided
1753  if (usr.IsUnreviewedUnannotated()) {
1755  }
1756  } else if (NStr::EqualNocase(type, "AutodefOptions")) {
1757  FOR_EACH_USERFIELD_ON_USEROBJECT (uitr, usr) {
1758  const CUser_field& fld = **uitr;
1759  if (! FIELD_IS_SET_AND_IS(fld, Label, Str)) continue;
1760  const string &label_str = GET_FIELD(fld.GetLabel(), Str);
1761  if (! NStr::EqualNocase(label_str, "Targeted Locus Name")) continue;
1762  if (fld.IsSetData() && fld.GetData().IsStr()) {
1763  m_TargetedLocus = fld.GetData().GetStr();
1764  }
1765  }
1766  }
1767  }
1768  }
1769  break;
1770  }
1771  case CSeqdesc::e_Comment:
1772  {
      // m_Comment is overwritten on every comment descriptor, so the last one wins;
      // m_IsPseudogene, once set by any comment, is never cleared here
1773  m_Comment = sd.GetComment();
1774  if (NStr::Find (m_Comment, "[CAUTION] Could be the product of a pseudogene") != string::npos) {
1775  m_IsPseudogene = true;
1776  }
1777  break;
1778  }
1779  case CSeqdesc::e_Genbank:
1780  {
1781  const CGB_block& gbk = desc_it->GetGenbank();
1782  if (gbk.IsSetKeywords()) {
1783  keywords = &gbk.GetKeywords();
1784  }
1785  break;
1786  }
1787  case CSeqdesc::e_Embl:
1788  {
1789  const CEMBL_block& ebk = desc_it->GetEmbl();
1790  if (ebk.IsSetKeywords()) {
1791  keywords = &ebk.GetKeywords();
1792  }
1793  break;
1794  }
1795  case CSeqdesc::e_Pdb:
1796  {
      // keep only the first compound string of the first PDB block that supplies one
1797  if (m_PDBCompound.empty()) {
1798  _ASSERT(m_IsPDB);
1799  const CPDB_block& pbk = desc_it->GetPdb();
1800  FOR_EACH_COMPOUND_ON_PDBBLOCK (cp_itr, pbk) {
1801  if (m_PDBCompound.empty()) {
1802  m_PDBCompound = *cp_itr;
1803  break;
1804  }
1805  }
1806  }
1807  break;
1808  }
1809  default:
1810  break;
1811  }
1812  }
1813
      // keyword pass: every keyword entry is split again on ';' and each piece is
      // compared case-insensitively against the known markers
1814  if (keywords != NULL) {
1815  FOR_EACH_STRING_IN_LIST (kw_itr, *keywords) {
1816  const string& clause = *kw_itr;
1817  list<string> kywds;
1818  NStr::Split( clause, ";", kywds, NStr::fSplit_Tokenize );
1819  FOR_EACH_STRING_IN_LIST ( k_itr, kywds ) {
1820  const string& str = *k_itr;
1821  if (NStr::EqualNocase (str, "UNORDERED")) {
1822  m_Unordered = true;
1823  }
      // the HTGS_* / TPA:* markers below are honoured only when m_HTGTech or m_ThirdParty is set
1824  if ((! m_HTGTech) && (! m_ThirdParty)) continue;
1825  if (NStr::EqualNocase (str, "HTGS_DRAFT")) {
1826  m_HTGSDraft = true;
1827  } else if (NStr::EqualNocase (str, "HTGS_CANCELLED")) {
1828  m_HTGSCancelled = true;
1829  } else if (NStr::EqualNocase (str, "HTGS_POOLED_MULTICLONE")) {
1830  m_HTGSPooled = true;
1831  } else if (NStr::EqualNocase (str, "TPA:experimental")) {
1832  m_TPAExp = true;
1833  } else if (NStr::EqualNocase (str, "TPA:inferential")) {
1834  m_TPAInf = true;
1835  } else if (NStr::EqualNocase (str, "TPA:reassembly")) {
1836  m_TPAReasm = true;
      // NOTE(review): "TPA:assembly" sets the same flag as "TPA:reassembly" - confirm this is intended
1837  } else if (NStr::EqualNocase (str, "TPA:assembly")) {
1838  m_TPAReasm = true;
1839  }
1840  }
1841  }
1842  }
1843  }
1844  catch (CException& e) {
1845  ERR_POST_X(5, Error << "Error in CBioseqIndex::x_InitDescs: " << e.what());
1846  }
1847 }
1848 
1850 
1851 {
      // Configures the annotation selector `sel` from `policy`, `flags` and `onlyNear`.
      // NOTE(review): the signature line (listing line 1849) is elided, so these four
      // names are presumably parameters and the function name is not visible here.
      // Flow: (1) pick resolve depth / adaptive depth / external-annot exclusion per
      // policy, (2) decide whether SNP and CDD named annotations are allowed,
      // (3) apply an optional feature-depth override, (4) apply per-flag exclusions,
      // (5) apply settings common to every policy.
1852  bool snpOK = false;
1853  bool cddOK = false;
1854
1855  if (policy == CSeqEntryIndex::eExhaustive) {
1856
1857  // experimental policy forces collection of features from all sequence levels
1858  sel.SetResolveAll();
      // NOTE(review): listing line 1859 is elided
1860  // ignores RefSeq/INSD barrier, overrides far fetch policy user object
1861  // for now, always excludes external annots, ignores custom enable bits
1862
1863  } else if (policy == CSeqEntryIndex::eInternal || onlyNear) {
1864
1865  // do not fetch features from underlying sequence component records
1866  sel.SetResolveDepth(0);
1867  sel.SetExcludeExternal(true);
1868  // always excludes external annots, ignores custom enable bits
1869
1870  } else if (policy == CSeqEntryIndex::eAdaptive) {
1871
1872  sel.SetResolveAll();
1873  // normal situation uses adaptive depth for feature collection,
1874  // includes barrier between RefSeq and INSD accession types
1875  sel.SetAdaptiveDepth(true);
1876
1877  // conditionally allows external annots, based on custom enable bits
1878  if ((flags & CSeqEntryIndex::fShowSNPFeats) != 0) {
1879  snpOK = true;
1880  }
1881  if ((flags & CSeqEntryIndex::fShowCDDFeats) != 0) {
1882  cddOK = true;
1883  }
1884
1885  } else if (policy == CSeqEntryIndex::eExternal) {
1886
1887  // same as eAdaptive
1888  sel.SetResolveAll();
1889  sel.SetAdaptiveDepth(true);
1890
1891  // but always allows external annots without need for custom enable bits
1892  snpOK = true;
1893  cddOK = true;
1894
1895  } else if (policy == CSeqEntryIndex::eFtp) {
1896
1897  // for public ftp releases
      // all three branches below currently apply the same settings
      // (resolve depth 0, external annotations excluded)
1898  if (m_IsRefSeq) {
1899  // For genomes FTP, we're running with a local ASN cache. Fetching from ID has already
1900  // happened, and we specifically want to restrict to using annotation from the cache.
1901  sel.SetResolveDepth(0);
1902  sel.SetExcludeExternal(true);
1903  } else if (m_IsDeltaLitOnly) {
1904  sel.SetResolveDepth(0);
1905  sel.SetExcludeExternal(true);
1906  } else {
1907  sel.SetResolveDepth(0);
1908  sel.SetExcludeExternal(true);
1909  }
1910
1911  } else if (policy == CSeqEntryIndex::eGenomes) {
1912
1913  // for public ftp releases
1914
1915  // Original comment was:
1916  // For genomes FTP, we're running with a local ASN cache. Fetching from ID has already
1917  // happened, and we specifically want to restrict to using annotation from the cache.
1918  // but later advice was to always use adaptive depth.
1919
      // all three branches below currently apply the same settings (resolve all, adaptive depth)
1920  if (m_IsRefSeq) {
1921  sel.SetResolveAll();
1922  sel.SetAdaptiveDepth(true);
1923  } else if (m_IsDeltaLitOnly) {
1924  sel.SetResolveAll();
1925  sel.SetAdaptiveDepth(true);
1926  } else {
1927  sel.SetResolveAll();
1928  sel.SetAdaptiveDepth(true);
1929  }
1930
1931  } else if (policy == CSeqEntryIndex::eWeb) {
1932
1933  // for public web pages
      // all three branches below currently apply the same settings (resolve all, adaptive depth)
1934  if (m_IsRefSeq) {
1935  sel.SetResolveAll();
1936  sel.SetAdaptiveDepth(true);
1937  } else if (m_IsDeltaLitOnly) {
1938  sel.SetResolveAll();
1939  sel.SetAdaptiveDepth(true);
1940  } else {
1941  sel.SetResolveAll();
1942  sel.SetAdaptiveDepth(true);
1943  }
1944
1945  // ID-6366 additional tests for -policy web to prevent gridlock caused by loading huge numbers of SNPs
1946  if (GetLength() <= 1000000) {
1947  // conditionally allows external annots, based on custom enable bits
1948  if ((flags & CSeqEntryIndex::fShowSNPFeats) != 0) {
1949  snpOK = true;
1950  }
1951  if ((flags & CSeqEntryIndex::fShowCDDFeats) != 0) {
1952  cddOK = true;
1953  }
1954  }
1955  }
1956
1957  // fHideSNPFeats and fHideCDDFeats flags override any earlier settings
1958  if ((flags & CSeqEntryIndex::fHideSNPFeats) != 0) {
1959  snpOK = false;
1960  }
1961  if ((flags & CSeqEntryIndex::fHideCDDFeats) != 0) {
1962  cddOK = false;
1963  }
1964
1965  // configure remote annot settings in selector
1966  if ( snpOK ) {
1967
      // NOTE(review): listing line 1968 (declaration of `idx`) is elided.
      // If Lock() yields a null reference, SNP is neither included nor excluded here.
1969  auto idxl = idx.Lock();
1970  if (idxl) {
1971  FAddSnpFunc* func = idxl->GetSnpFunc();
1972  if (func) {
1973  // under PubSeq Gateway, need to get exact accession for SNP retrieval
      // NOTE(review): listing line 1974 (declaration of `bsh`) is elided
1975  string na_acc;
1976  (*func) (bsh, na_acc);
1977  if (na_acc.length() > 0) {
1978  sel.IncludeNamedAnnotAccession(na_acc);
1979  }
1980  } else {
1981  // otherwise just give SNP name
1982  sel.IncludeNamedAnnotAccession("SNP");
1983  }
1984  }
1985
1986  } else {
1987  sel.ExcludeNamedAnnotAccession("SNP");
1988  }
1989
1990  if ( cddOK ) {
1991  sel.IncludeNamedAnnotAccession("CDD");
1992  } else {
1993  sel.ExcludeNamedAnnotAccession("CDD");
1994  }
1995
      // NOTE(review): listing line 1996 (declaration of `idx`) is elided.
      // A positive feature depth from the master index replaces whatever
      // resolve depth the policy branch selected above.
1997  auto idxl = idx.Lock();
1998  if (idxl) {
1999  int featDepth = idxl->GetFeatDepth();
2000  if (featDepth > 0) {
2001  sel.SetResolveDepth(featDepth);
2002  }
2003  }
2004
2005  // bit flags exclude specific features
2006  // source features are collected elsewhere
2008  // pub features are used in the REFERENCES section
2010  // some feature types are always excluded (deprecated?)
2011  // sel.ExcludeFeatSubtype(CSeqFeatData::eSubtype_non_std_residue)
2014  // exclude other types based on user flags
      // NOTE(review): the statements inside most of the flag tests below are elided
      // in this listing (lines 2016, 2019, 2023, 2026, 2029-2034, 2037-2038)
2015  if ((flags & CSeqEntryIndex::fHideImpFeats) != 0) {
2017  }
2018  if ((flags & CSeqEntryIndex::fHideSTSFeats) != 0) {
2020  }
2021  if ((flags & CSeqEntryIndex::fHideExonFeats) != 0) {
2022  sel.ExcludeNamedAnnots("Exon");
2024  }
2025  if ((flags & CSeqEntryIndex::fHideIntronFeats) != 0) {
2027  }
2028  if ((flags & CSeqEntryIndex::fHideMiscFeats) != 0) {
2035  }
2036  if ((flags & CSeqEntryIndex::fHideGapFeats) != 0) {
2039  }
2040
2041  // additional common settings
2042  sel.SetFeatComparator(new feature::CFeatComparatorByLabel);
2043
2044  // limit exploration of far deltas with no features to avoid timeout
2045  sel.SetMaxSearchSegments(500);
      // NOTE(review): units of the time limit are not visible here - presumably seconds; confirm
2047  sel.SetMaxSearchTime(25);
2048
2049  // request exception to capture fetch failure
2050  sel.SetFailUnresolved();
2051 }
2052 
2053 // GetSelector is public access to selector populated by command-line arguments
2055 
2056 {
2058 }
2059 
2060 // Feature collection common implementation method (delayed until needed)
      // x_InitFeats (name per the error message at the bottom; the signature line is
      // elided in this listing). `slpp` is used as a nullable pointer to a location:
      // null iterates features on the whole Bioseq (m_Bsh), non-null iterates features
      // on that sublocation and maps each feature location through a slice mapper.
      // Every collected feature gets a CFeatureIndex in m_SfxList and m_FeatIndexMap
      // and is added to the master index's feature tree. Along the way the function
      // sets m_HasSource / m_HasGene / m_HasMultiIntervalGenes, selects
      // m_BestProteinFeature, and links CDS / RNA / Prot features to the index of
      // their product Bioseq. A CException sets m_FetchFailure and is logged.
2062
2063 {
2064  try {
2065  // Do not bail on m_FeatsInitialized flag
2066
2067  if (! m_DescsInitialized) {
2068  // initialize descriptors first to get m_ForceOnlyNearFeats flag
2069  x_InitDescs();
2070  }
2071
2072  m_FeatsInitialized = true;
2073
2074  SAnnotSelector sel;
2075
      // NOTE(review): listing lines 2076 and 2079 are elided (presumably the selector
      // setup call and the condition guarding onlyGeneRNACDS)
2077
2078  bool onlyGeneRNACDS = false;
2080  onlyGeneRNACDS = true;
2081  }
2082
2083  // variables for setting m_BestProteinFeature
2084  TSeqPos longest = 0;
      // NOTE(review): the declaration of `bestprocessed` (listing line 2085) is elided
2086  CProt_ref::EProcessed processed;
2087
      // NOTE(review): the declaration of `idx` (listing line 2088) is elided
2089  auto idxl = idx.Lock();
2090  if (idxl) {
2091  /*
2092  if (! idxl->IsSmallGenomeSet()) {
2093  // limit feature collection to immediate Bioseq-set parent
2094  CRef<CSeqsetIndex> prnt = GetParent();
2095  if (prnt) {
2096  CBioseq_set_Handle bssh = prnt->GetSeqsetHandle();
2097  if (bssh) {
2098  CSeq_entry_Handle pseh = bssh.GetParentEntry();
2099  if (pseh) {
2100  sel.SetLimitSeqEntry(pseh);
2101  }
2102  }
2103  }
2104  }
2105  */
2106
2107  CRef<feature::CFeatTree> ft = idxl->GetFeatTree();
2108
2109  // start collection over on each segment
2110  m_SfxList.clear();
2111
2112  // iterate features on Bioseq or sublocation
2113  CFeat_CI feat_it;
2114  CRef<CSeq_loc_Mapper> slice_mapper;
2115  if (slpp == 0) {
2116  feat_it = CFeat_CI(m_Bsh, sel);
2117  } else {
2118  SAnnotSelector sel_cpy = sel;
2119  sel_cpy.SetIgnoreStrand();
2120  /*
2121  if (selp->IsSetStrand() && selp->GetStrand() == eNa_strand_minus) {
2122  sel_cpy.SetSortOrder(SAnnotSelector::eSortOrder_Reverse);
2123  }
2124  */
      // NOTE(review): the declaration of `bsid` (listing line 2125) is elided.
      // slice_mapper is only created when bsid is non-null, yet it is dereferenced
      // further down for every feature whenever slpp is non-null - confirm that
      // bsid cannot be null on this path.
2126  if (bsid) {
2127  SetDiagFilter(eDiagFilter_All, "!(1305.28,31)");
2128  CSeq_id seq_id;
2129  seq_id.Assign( *bsid );
      // build a whole-sequence interval [0, m_Length - 1] as the mapping target
2130  CSeq_loc old_loc;
2131  old_loc.SetInt().SetId( seq_id );
2132  old_loc.SetInt().SetFrom( 0 );
2133  old_loc.SetInt().SetTo( m_Length - 1 );
2134  slice_mapper = new CSeq_loc_Mapper( *slpp, old_loc, m_Scope );
2136  slice_mapper->TruncateNonmappingRanges();
2138  }
2139  feat_it = CFeat_CI(*m_Scope, *slpp, sel_cpy);
2140  }
2141
2142  CConstRef<CSeq_loc> prev_loc;
2143
2144  // iterate features on Bioseq
2145  for (; feat_it; ++feat_it) {
2146  const CMappedFeat mf = *feat_it;
2147
2148  const CSeqFeatData& data = mf.GetData();
2149  CSeqFeatData::E_Choice typ = data.Which();
2150  if (onlyGeneRNACDS) {
2151  if (typ != CSeqFeatData::e_Gene &&
2152  typ != CSeqFeatData::e_Rna &&
2153  typ != CSeqFeatData::e_Cdregion) {
2154  continue;
2155  }
2156  }
2157
      // NOTE(review): the declaration of `hdl` (listing line 2158) is elided
2159
2160  CConstRef<CSeq_loc> feat_loc(&mf.GetLocation());
2161  if (slpp) {
2162  feat_loc.Reset( slice_mapper->Map( mf.GetLocation() ) );
2163  }
2164
2165  CRef<CFeatureIndex> sfx(new CFeatureIndex(hdl, mf, feat_loc, *this));
2166
      // NOTE(review): the declaration of `type` (listing line 2167) is elided
2168  CSeqFeatData::ESubtype subtype = sfx->GetSubtype();
2169
2170  // For RW-1215, accession JB818822, insert instantiated gap feature before preceding misc_feature with the same location
2171
      // only the start positions are compared; the stop comparison is commented out
2172  bool gapIsSame = false;
2173  if ( subtype == CSeqFeatData::eSubtype_gap && prev_loc && !m_SfxList.empty() ) {
2174  if ( feat_loc->GetStart(eExtreme_Positional) == prev_loc->GetStart(eExtreme_Positional) /* &&
2175  feat_loc->GetStop(eExtreme_Positional) == prev_loc->GetStop(eExtreme_Positional) */ ) {
2176  gapIsSame = true;
2177  }
2178  }
2179
      // a matching gap is placed immediately before the last stored feature
2180  if ( gapIsSame ) {
2181  m_SfxList.insert(m_SfxList.end() - 1, sfx);
2182  } else {
2183  m_SfxList.push_back(sfx);
2184  }
2185
2186  prev_loc = feat_loc;
2187
2188  // end of RW-1215 changes
2189
2190  ft->AddFeature(mf);
2191
2192  // CFeatureIndex from CMappedFeat for use with GetBestGene
2193  m_FeatIndexMap[mf] = sfx;
2194
2195  // set specific flags for various feature types
2196  if (type == CSeqFeatData::e_Biosrc) {
2197  m_HasSource = true;
      // the first source feature supplies m_BioSource when nothing has set it yet
2198  if (! m_BioSource) {
2199  if (! mf.IsSetData ()) continue;
2200  const CSeqFeatData& sfdata = mf.GetData();
2201  const CBioSource& biosrc = sfdata.GetBiosrc();
2202  m_BioSource.Reset (&biosrc);
2203  }
2204  continue;
2205  }
2206
2207  if (type == CSeqFeatData::e_Gene) {
2208  m_HasGene = true;
      // NOTE(review): listing line 2209 (the condition that guards the next
      // `continue`) and lines 2214-2215 (additional case labels) are elided
2210  continue;
2211  }
2212  const CSeq_loc& loc = mf.GetLocation ();
2213  switch (loc.Which()) {
2216  case CSeq_loc::e_Mix:
2217  case CSeq_loc::e_Equiv:
2218  m_HasMultiIntervalGenes = true;
2219  break;
2220  default:
2221  break;
2222  }
2223  continue;
2224  }
2225
2226  if (subtype == CSeqFeatData::eSubtype_operon) {
2227  idxl->SetHasOperon(true);
2228  continue;
2229  }
2230
      // best protein feature: longest location wins; on a length tie the lower
      // EProcessed value wins
2231  if (type == CSeqFeatData::e_Prot && IsAA()) {
2232  if (! mf.IsSetData ()) continue;
2233  const CSeqFeatData& sfdata = mf.GetData();
2234  const CProt_ref& prp = sfdata.GetProt();
2235  processed = CProt_ref::eProcessed_not_set;
2236  if (prp.IsSetProcessed()) {
2237  processed = prp.GetProcessed();
2238  }
2239  const CSeq_loc& loc = mf.GetLocation ();
2240  TSeqPos prot_length = sequence::GetLength(loc, m_Scope);
2241  if (prot_length > longest) {
2243  m_BestProteinFeature = sfx;
2244  longest = prot_length;
2245  bestprocessed = processed;
2246  } else if (prot_length == longest) {
2247  // unprocessed 0 > preprotein 1 > mat peptide 2
2248  if (processed < bestprocessed) {
2250  m_BestProteinFeature = sfx;
2251  longest = prot_length;
2252  bestprocessed = processed;
2253  }
2254  }
2255  continue;
2256  }
2257
      // NOTE(review): the Prot-on-protein arm below cannot be reached, because the
      // block above always ends in `continue` for that combination
2258  if (type == CSeqFeatData::e_Cdregion && IsNA()) {
2259  } else if (type == CSeqFeatData::e_Rna && IsNA()) {
2260  } else if (type == CSeqFeatData::e_Prot && IsAA()) {
2261  } else {
2262  continue;
2263  }
2264
2265  // index feature for (local) product Bioseq (CDS -> protein, mRNA -> cDNA, or Prot -> peptide)
2266  CSeq_id_Handle idh = mf.GetProductId();
2267  if (idh) {
2268  string str = idh.AsString();
2269  CRef<CBioseqIndex> bsxp = idxl->GetBioseqIndex(str);
2270  if (bsxp) {
2271  bsxp->m_FeatForProdInitialized = true;
2272  bsxp->m_FeatureForProduct = sfx;
2273  }
2274  }
2275  }
2276  }
2277  }
2278  catch (CException& e) {
2279  m_FetchFailure = true;
2280  ERR_POST_X(6, Error << "Error in CBioseqIndex::x_InitFeats: " << e.what());
2281  }
2282 }
2283 
2284 // Feature collection methods (delayed until needed)
2286 
2287 {
2288  x_InitFeats(0);
2289 }
2290 
2292 
2293 {
2294  x_InitFeats(&slp);
2295 }
2296 
2297 // GetFeatureForProduct allows hypothetical protein defline generator to obtain gene locus tag
      // Returns m_FeatureForProduct. While m_FeatForProdInitialized is unset, it looks
      // for a feature whose product is this Bioseq (by-product search limited to this
      // Bioseq's TSE, trying up to three selectors in turn) and, on a hit, runs
      // x_InitFeats() on the index of the Bioseq that carries the feature location.
      // x_InitFeats is what assigns m_FeatureForProduct / m_FeatForProdInitialized on
      // the product's index; this function does not set them itself, so the search is
      // repeated on later calls when nothing is found.
      // NOTE(review): the signature line (2298) and the selector arguments
      // (lines 2304, 2308, 2313) are elided in this listing.
2299
2300 {
2301  if (! m_FeatForProdInitialized) {
2302  if (m_Bsh) {
2303  CFeat_CI fi(m_Bsh,
2305  .SetByProduct().SetLimitTSE(m_Bsh.GetTSE_Handle()));
2306  if (! fi) {
2307  fi = CFeat_CI(m_Bsh,
2309  .SetByProduct().SetLimitTSE(m_Bsh.GetTSE_Handle()));
2310  }
2311  if (! fi) {
2312  fi = CFeat_CI(m_Bsh,
2314  .SetByProduct().SetLimitTSE(m_Bsh.GetTSE_Handle()));
2315  }
2316  if (fi) {
      // only the first feature returned by the iterator is considered
2317  CMappedFeat mf = *fi;
2318  CSeq_id_Handle idh = mf.GetLocationId();
2319  CBioseq_Handle nbsh = m_Scope->GetBioseqHandle(idh);
2320  if (nbsh) {
      // NOTE(review): the declaration of `idx` (listing line 2321) is elided
2322  auto idxl = idx.Lock();
2323  if (idxl) {
2324  CRef<CBioseqIndex> bsxn = idxl->GetBioseqIndex(nbsh);
2325  if (bsxn) {
2326  if (! bsxn->m_FeatsInitialized) {
2327  bsxn->x_InitFeats();
2328  }
2329  }
2330  }
2331  }
2332  }
2333  }
2334  }
2335
2336  return m_FeatureForProduct;
2337 }
2338 
2339 // Get Bioseq index containing feature with product pointing to this Bioseq
      // Returns sfxp->GetBioseqIndex() when `sfxp` is non-null, otherwise a
      // default-constructed (empty) CWeakRef<CBioseqIndex>.
      // NOTE(review): the signature (listing line 2340) and the declaration of
      // `sfxp` (line 2343) are elided; presumably sfxp comes from GetFeatureForProduct().
2341
2342 {
2344  if (sfxp) {
2345  return sfxp->GetBioseqIndex();
2346  }
2347
2348  return CWeakRef<CBioseqIndex> ();
2349 }
2350 
2351 // GetBestProteinFeature indexes longest protein feature on protein Bioseq
      // Returns m_BestProteinFeature, running x_InitFeats() first when features have
      // not been indexed yet.
      // NOTE(review): m_BestProtFeatInitialized is tested here but is not assigned
      // anywhere in the visible part of this file, so in practice the inner
      // m_FeatsInitialized test decides - confirm. Signature line (2352) is elided.
2353
2354 {
2355  if (! m_BestProtFeatInitialized) {
2356  if (! m_FeatsInitialized) {
2357  x_InitFeats();
2358  }
2359  }
2360
2361  return m_BestProteinFeature;
2362 }
2363 
2364 // Common descriptor field getters
// Returns the cached title (m_Title); runs x_InitDescs() first when descriptors
// have not been indexed yet.
2365 const string& CBioseqIndex::GetTitle (void)
2366
2367 {
2368  if (! m_DescsInitialized) {
2369  x_InitDescs();
2370  }
2371
2372  return m_Title;
2373 }
2374 
2376 
2377 {
2378  if (! m_DescsInitialized) {
2379  x_InitDescs();
2380  }
2381 
2382  return m_MolInfo;
2383 }
2384 
2386 
2387 {
2388  if (! m_DescsInitialized) {
2389  x_InitDescs();
2390  }
2391 
2392  return m_Biomol;
2393 }
2394 
2396 
2397 {
2398  if (! m_DescsInitialized) {
2399  x_InitDescs();
2400  }
2401 
2402  return m_Tech;
2403 }
2404 
2406 
2407 {
2408  if (! m_DescsInitialized) {
2409  x_InitDescs();
2410  }
2411 
2412  return m_Completeness;
2413 }
2414 
2416 
2417 {
2418  if (! m_DescsInitialized) {
2419  x_InitDescs();
2420  }
2421 
2422  return m_HTGTech;
2423 }
2424 
2426 
2427 {
2428  if (! m_DescsInitialized) {
2429  x_InitDescs();
2430  }
2431 
2432  return m_HTGSUnfinished;
2433 }
2434 
2436 
2437 {
2438  if (! m_DescsInitialized) {
2439  x_InitDescs();
2440  }
2441 
2442  return m_IsTLS;
2443 }
2444 
2446 
2447 {
2448  if (! m_DescsInitialized) {
2449  x_InitDescs();
2450  }
2451 
2452  return m_IsTSA;
2453 }
2454 
2456 
2457 {
2458  if (! m_DescsInitialized) {
2459  x_InitDescs();
2460  }
2461 
2462  return m_IsWGS;
2463 }
2464 
2466 
2467 {
2468  if (! m_DescsInitialized) {
2469  x_InitDescs();
2470  }
2471 
2472  return m_IsEST_STS_GSS;
2473 }
2474 
2476 
2477 {
2478  if (! m_DescsInitialized) {
2479  x_InitDescs();
2480  }
2481 
2482  return m_UseBiosrc;
2483 }
2484 
2486 
2487 {
2488  if (! m_SourcesInitialized) {
2489  x_InitSource();
2490  }
2491 
2492  return m_BioSource;
2493 }
2494 
// Returns the cached taxonomy name (m_Taxname); runs x_InitSource() first when
// m_SourcesInitialized is not yet set.
2495 const string& CBioseqIndex::GetTaxname (void)
2496
2497 {
2498  if (! m_SourcesInitialized) {
2499  x_InitSource();
2500  }
2501
2502  return m_Taxname;
2503 }
2504 
// Returns m_DescTaxname; runs x_InitSource() first when m_SourcesInitialized is
// not yet set.
2505 const string& CBioseqIndex::GetDescTaxname (void)
2506
2507 {
2508  if (! m_SourcesInitialized) {
2509  x_InitSource();
2510  }
2511
2512  return m_DescTaxname;
2513 }
2514 
// Returns the cached common name (m_Common); runs x_InitSource() first when
// m_SourcesInitialized is not yet set.
2515 const string& CBioseqIndex::GetCommon (void)
2516
2517 {
2518  if (! m_SourcesInitialized) {
2519  x_InitSource();
2520  }
2521
2522  return m_Common;
2523 }
2524 
// Returns m_Lineage; runs x_InitSource() first when m_SourcesInitialized is not
// yet set.
2525 const string& CBioseqIndex::GetLineage (void)
2526
2527 {
2528  if (! m_SourcesInitialized) {
2529  x_InitSource();
2530  }
2531
2532  return m_Lineage;
2533 }
2534 
2536 
2537 {
2538  if (! m_SourcesInitialized) {
2539  x_InitSource();
2540  }
2541 
2542  return m_Taxid;
2543 }
2544 
2546 
2547 {
2548  if (! m_SourcesInitialized) {
2549  x_InitSource();
2550  }
2551 
2552  return m_UsingAnamorph;
2553 }
2554 
2556 
2557 {
2558  if (! m_SourcesInitialized) {
2559  x_InitSource();
2560  }
2561 
2562  return m_Genus;
2563 }
2564 
2566 
2567 {
2568  if (! m_SourcesInitialized) {
2569  x_InitSource();
2570  }
2571 
2572  return m_Species;
2573 }
2574 
2576 
2577 {
2578  if (! m_SourcesInitialized) {
2579  x_InitSource();
2580  }
2581 
2582  return m_Multispecies;
2583 }
2584 
2586 
2587 {
2588  if (! m_SourcesInitialized) {
2589  x_InitSource();
2590  }
2591 
2592  return m_Genome;
2593 }
2594 
2596 
2597 {
2598  if (! m_SourcesInitialized) {
2599  x_InitSource();
2600  }
2601 
2602  return m_IsPlasmid;
2603 }
2604 
2606 
2607 {
2608  if (! m_SourcesInitialized) {
2609  x_InitSource();
2610  }
2611 
2612  return m_IsChromosome;
2613 }
2614 
// Returns m_Organelle (assigned by x_InitSource from x_OrganelleName); runs
// x_InitSource() first when m_SourcesInitialized is not yet set.
2615 const string& CBioseqIndex::GetOrganelle (void)
2616
2617 {
2618  if (! m_SourcesInitialized) {
2619  x_InitSource();
2620  }
2621
2622  return m_Organelle;
2623 }
2624 
2626 
2627 {
2628  if (! m_SourcesInitialized) {
2629  x_InitSource();
2630  }
2631 
2632  return m_FirstSuperKingdom;
2633 }
2634 
2636 
2637 {
2638  if (! m_SourcesInitialized) {
2639  x_InitSource();
2640  }
2641 
2642  return m_SecondSuperKingdom;
2643 }
2644 
2646 
2647 {
2648  if (! m_SourcesInitialized) {
2649  x_InitSource();
2650  }
2651 
2652  return m_IsCrossKingdom;
2653 }
2654 
2656 
2657 {
2658  if (! m_SourcesInitialized) {
2659  x_InitSource();
2660  }
2661 
2662  return m_Chromosome;
2663 }
2664 
2666 
2667 {
2668  if (! m_SourcesInitialized) {
2669  x_InitSource();
2670  }
2671 
2672  return m_LinkageGroup;
2673 }
2674 
2676 
2677 {
2678  if (! m_SourcesInitialized) {
2679  x_InitSource();
2680  }
2681 
2682  return m_Clone;
2683 }
2684 
2686 
2687 {
2688  if (! m_SourcesInitialized) {
2689  x_InitSource();
2690  }
2691 
2692  return m_has_clone;
2693 }
2694 
2696 
2697 {
2698  if (! m_SourcesInitialized) {
2699  x_InitSource();
2700  }
2701 
2702  return m_Map;
2703 }
2704 
2706 
2707 {
2708  if (! m_SourcesInitialized) {
2709  x_InitSource();
2710  }
2711 
2712  return m_Plasmid;
2713 }
2714 
2716 
2717 {
2718  if (! m_SourcesInitialized) {
2719  x_InitSource();
2720  }
2721 
2722  return m_Segment;
2723 }
2724 
2726 
2727 {
2728  if (! m_SourcesInitialized) {
2729  x_InitSource();
2730  }
2731 
2732  return m_Breed;
2733 }
2734 
2736 
2737 {
2738  if (! m_SourcesInitialized) {
2739  x_InitSource();
2740  }
2741 
2742  return m_Cultivar;
2743 }
2744 
2745 
2747 
2748 {
2749  if (! m_SourcesInitialized) {
2750  x_InitSource();
2751  }
2752 
2753  return m_SpecimenVoucher;
2754 }
2755 
2756 
2758 
2759 {
2760  if (! m_SourcesInitialized) {
2761  x_InitSource();
2762  }
2763 
2764  return m_Isolate;
2765 }
2766 
2768 
2769 {
2770  if (! m_SourcesInitialized) {
2771  x_InitSource();
2772  }
2773 
2774  return m_Strain;
2775 }
2776 
2778 
2779 {
2780  if (! m_SourcesInitialized) {
2781  x_InitSource();
2782  }
2783 
2784  return m_Substrain;
2785 }
2786 
2788 
2789 {
2790  if (! m_SourcesInitialized) {
2791  x_InitSource();
2792  }
2793 
2794  return m_MetaGenomeSource;
2795 }
2796 
2798 
2799 {
2800  if (! m_DescsInitialized) {
2801  x_InitDescs();
2802  }
2803 
2804  return m_HTGSCancelled;
2805 }
2806 
2808 
2809 {
2810  if (! m_DescsInitialized) {
2811  x_InitDescs();
2812  }
2813 
2814  return m_HTGSDraft;
2815 }
2816 
2818 
2819 {
2820  if (! m_DescsInitialized) {
2821  x_InitDescs();
2822  }
2823 
2824  return m_HTGSPooled;
2825 }
2826 
2828 
2829 {
2830  if (! m_DescsInitialized) {
2831  x_InitDescs();
2832  }
2833 
2834  return m_TPAExp;
2835 }
2836 
2838 
2839 {
2840  if (! m_DescsInitialized) {
2841  x_InitDescs();
2842  }
2843 
2844  return m_TPAInf;
2845 }
2846 
2848 
2849 {
2850  if (! m_DescsInitialized) {
2851  x_InitDescs();
2852  }
2853 
2854  return m_TPAReasm;
2855 }
2856 
2858 
2859 {
2860  if (! m_DescsInitialized) {
2861  x_InitDescs();
2862  }
2863 
2864  return m_Unordered;
2865 }
2866 
2868 
2869 {
2870  if (! m_DescsInitialized) {
2871  x_InitDescs();
2872  }
2873 
2874  return m_PDBCompound;
2875 }
2876 
2878 
2879 {
2880  if (! m_DescsInitialized) {
2881  x_InitDescs();
2882  }
2883 
2884  return m_ForceOnlyNearFeats;
2885 }
2886 
2888 
2889 {
2890  if (! m_DescsInitialized) {
2891  x_InitDescs();
2892  }
2893 
2894  return m_IsUnverified;
2895 }
2896 
2898 
2899 {
2900  if (! m_DescsInitialized) {
2901  x_InitDescs();
2902  }
2903 
2904  return m_IsUnverifiedFeature;
2905 }
2906 
2908 
2909 {
2910  if (! m_DescsInitialized) {
2911  x_InitDescs();
2912  }
2913 
2914  return m_IsUnverifiedOrganism;
2915 }
2916 
2918 
2919 {
2920  if (! m_DescsInitialized) {
2921  x_InitDescs();
2922  }
2923 
2925 }
2926 
2928 
2929 {
2930  if (! m_DescsInitialized) {
2931  x_InitDescs();
2932  }
2933 
2935 }
2936 
2938 
2939 {
2940  if (! m_DescsInitialized) {
2941  x_InitDescs();
2942  }
2943 
2944  return m_IsUnreviewed;
2945 }
2946 
2948 
2949 {
2950  if (! m_DescsInitialized) {
2951  x_InitDescs();
2952  }
2953 
2955 }
2956 
2958 
2959 {
2960  if (! m_DescsInitialized) {
2961  x_InitDescs();
2962  }
2963 
2964  return m_TargetedLocus;
2965 }
2966 
// Returns m_Comment; runs x_InitDescs() first when descriptors have not been
// indexed yet. x_InitDescs overwrites m_Comment for every comment descriptor,
// so this is the text of the last one visited.
2967 const string& CBioseqIndex::GetComment (void)
2968
2969 {
2970  if (! m_DescsInitialized) {
2971  x_InitDescs();
2972  }
2973
2974  return m_Comment;
2975 }
2976 
2978 
2979 {
2980  if (! m_DescsInitialized) {
2981  x_InitDescs();
2982  }
2983 
2984  return m_IsPseudogene;
2985 }
2986 
2988 
2989 {
2990  if (! m_FeatsInitialized) {
2991  x_InitFeats();
2992  }
2993 
2995  auto idxl = idx.Lock();
2996  if (idxl) {
2997  return idxl->HasOperon();
2998  }
2999 
3000  return false;
3001 }
3002 
3004 
3005 {
3006  if (! m_FeatsInitialized) {
3007  x_InitFeats();
3008  }
3009 
3010  return m_HasGene;
3011 }
3012 
3014 
3015 {
3016  if (! m_FeatsInitialized) {
3017  x_InitFeats();
3018  }
3019 
3020  return m_HasMultiIntervalGenes;
3021 }
3022 
3024 
3025 {
3026  if (! m_FeatsInitialized) {
3027  x_InitFeats();
3028  }
3029 
3030  return m_HasSource;
3031 }
3032 
3034 
3035 {
3036  if (! m_DescsInitialized) {
3037  x_InitDescs();
3038  }
3039 
3040  return m_rEnzyme;
3041 }
3042 
3044 
3045 {
3046  CRef<CFeatureIndex> sfx;
3047 
3049  if (it != m_FeatIndexMap.end()) {
3050  sfx = it->second;
3051  }
3052 
3053  return sfx;
3054 }
3055 
// Copies sequence data for the range (from, to) into `buffer`.
// The CSeqVector over m_Bsh is created on first use and kept in m_SeqVec.
// Range handling: a negative `from` becomes 0; a negative `to`, or a `to` at or
// beyond the vector size, becomes the vector size. If CanGetRange() rejects the
// range, m_FetchFailure is set and `buffer` is left untouched.
// A CException is logged via ERR_POST_X and not rethrown.
3056 void CBioseqIndex::GetSequence (int from, int to, string& buffer)
3057
3058 {
3059  try {
3060  if (! m_SeqVec) {
3061  m_SeqVec = new CSeqVector(m_Bsh);
3062  if (m_SeqVec) {
      // NOTE(review): listing lines 3064 and 3066 (the statement in each branch,
      // presumably selecting the residue coding) are elided
3063  if (IsAA()) {
3065  } else {
3067  }
3068  }
3069  }
3070
3071  if (m_SeqVec) {
3072  CSeqVector& vec = *m_SeqVec;
3073  if (from < 0) {
3074  from = 0;
3075  }
      // NOTE(review): `to` is clamped to size(), not size() - 1, which suggests
      // GetSeqData treats the stop position as exclusive - confirm against CSeqVector
3076  if (to < 0 || to >= (int) vec.size()) {
3077  to = vec.size();
3078  }
3079  if (vec.CanGetRange(from, to)) {
3080  vec.GetSeqData(from, to, buffer);
3081  } else {
3082  m_FetchFailure = true;
3083  }
3084  }
3085  }
3086  catch (CException& e) {
3087  ERR_POST_X(7, Error << "Error in CBioseqIndex::GetSequence: " << e.what());
3088  }
3089 }
3090 
3091 string CBioseqIndex::GetSequence (int from, int to)
3092 
3093 {
3094  string buffer;
3095 
3096  GetSequence(from, to, buffer);
3097 
3098  return buffer;
3099 }
3100 
3102 
3103 {
3104  GetSequence(0, -1, buffer);
3105 }
3106 
3108 
3109 {
3110  string buffer;
3111 
3112  GetSequence(0, -1, buffer);
3113 
3114  return buffer;
3115 }
3116 
3117 const vector<CRef<CGapIndex>>& CBioseqIndex::GetGapIndices(void)
3118 
3119 {
3120  if (! m_GapsInitialized) {
3121  x_InitGaps();
3122  }
3123 
3124  return m_GapList;
3125 }
3126 
3127 const vector<CRef<CDescriptorIndex>>& CBioseqIndex::GetDescriptorIndices(void)
3128 
3129 {
3130  if (! m_DescsInitialized) {
3131  x_InitDescs();
3132  }
3133 
3134  return m_SdxList;
3135 }
3136 
3137 const vector<CRef<CFeatureIndex>>& CBioseqIndex::GetFeatureIndices(void)
3138 
3139 {
3140  if (! m_FeatsInitialized) {
3141  x_InitFeats();
3142  }
3143 
3144  return m_SfxList;
3145 }
3146 
3147 
3148 // CGapIndex
3149 
3150 // Constructor
3152  TSeqPos end,
3153  TSeqPos length,
3154  const string& type,
3155  const vector<string>& evidence,
3156  bool isUnknownLength,
3157  bool isAssemblyGap,
3158  CBioseqIndex& bsx)
3159  : m_Bsx(&bsx),
3160  m_Start(start),
3161  m_End(end),
3162  m_Length(length),
3163  m_GapType(type),
3164  m_GapEvidence(evidence),
3165  m_IsUnknownLength(isUnknownLength),
3166  m_IsAssemblyGap(isAssemblyGap)
3167 {
3168 }
3169 
3170 
3171 // CDescriptorIndex
3172 
3173 // Constructor
3175  CBioseqIndex& bsx)
3176  : m_Sd(sd),
3177  m_Bsx(&bsx)
3178 {
3179  m_Type = m_Sd.Which();
3180 }
3181 
3182 
3183 // CFeatureIndex
3184 
3185 // Constructor
3187  const CMappedFeat mf,
3188  CConstRef<CSeq_loc> feat_loc,
3189  CBioseqIndex& bsx)
3190  : m_Sfh(sfh),
3191  m_Mf(mf),
3192  m_Bsx(&bsx)
3193 {
3194  const CSeqFeatData& data = m_Mf.GetData();
3195  m_Type = data.Which();
3196  m_Subtype = data.GetSubtype();
3197  m_Fl = feat_loc;
3200 }
3201 
3202 // Find CFeatureIndex object for best gene using internal CFeatTree
3204 
3205 {
3206  try {
3207  CMappedFeat best;
3209  auto bsxl = bsx.Lock();
3210  if (bsxl) {
3211  CWeakRef<CSeqMasterIndex> idx = bsxl->GetSeqMasterIndex();
3212  auto idxl = idx.Lock();
3213  if (idxl) {
3214  best = feature::GetBestGeneForFeat(m_Mf, idxl->GetFeatTree(), 0,
3215  /* feature::CFeatTree::eBestGene_AllowOverlapped */
3216  feature::CFeatTree::eBestGene_TreeOnly);
3217  }
3218  if (best) {
3219  return bsxl->GetFeatIndex(best);
3220  }
3221  }
3222  } catch (CException& e) {
3223  ERR_POST_X(8, Error << "Error in CFeatureIndex::GetBestGene: " << e.what());
3224  }
3225  return CRef<CFeatureIndex> ();
3226 }
3227 
3228 
3229 // Find CFeatureIndex object for best parent using internal CFeatTree
3231 
3232 {
3233  try {
3234  CMappedFeat best;
3236  auto bsxl = bsx.Lock();
3237  if (bsxl) {
3238  CWeakRef<CSeqMasterIndex> idx = bsxl->GetSeqMasterIndex();
3239  auto idxl = idx.Lock();
3240  if (idxl) {
3241  static const CSeqFeatData::ESubtype sm_SpecialVDJTypes[] = {
3247  };
3248  for ( const CSeqFeatData::ESubtype* type_ptr = sm_SpecialVDJTypes;
3249  *type_ptr != CSeqFeatData::eSubtype_bad; ++type_ptr ) {
3250  best = feature::GetBestParentForFeat(m_Mf, *type_ptr, idxl->GetFeatTree(), 0);
3251  if (best) {
3252  return bsxl->GetFeatIndex(best);
3253  }
3254  }
3255  }
3256  }
3257  } catch (CException& e) {
3258  ERR_POST_X(8, Error << "Error in CFeatureIndex::GetBestParent: " << e.what());
3259  }
3260  return CRef<CFeatureIndex> ();
3261 }
3262 
3264 
3265 {
3267  auto bsxl = bsx.Lock();
3268  if (bsxl) {
3269  bsxl->SetFetchFailure(fails);
3270  }
3271 }
3272 
3273 // Find CFeatureIndex object for overlapping source feature using internal CFeatTree
3275 
3276 {
3277  try {
3278  CMappedFeat best;
3280  auto bsxl = bsx.Lock();
3281  if (bsxl) {
3282  if (bsxl->HasSource()) {
3283  CWeakRef<CSeqMasterIndex> idx = bsxl->GetSeqMasterIndex();
3284  auto idxl = idx.Lock();
3285  if (idxl) {
3286  CRef<feature::CFeatTree> ft = idxl->GetFeatTree();
3287  try {
3288  best = ft->GetParent(m_Mf, CSeqFeatData::eSubtype_biosrc);
3289  } catch (CException& e) {
3290  ERR_POST_X(9, Error << "Error in CFeatureIndex::GetOverlappingSource: " << e.what());
3291  }
3292  }
3293  if (best) {
3294  return bsxl->GetFeatIndex(best);
3295  }
3296  }
3297  }
3298  } catch (CException& e) {
3299  ERR_POST_X(10, Error << "Error in CFeatureIndex::GetOverlappingSource: " << e.what());
3300  }
3301  return CRef<CFeatureIndex> ();
3302 }
3303 
3304 void CFeatureIndex::GetSequence (int from, int to, string& buffer)
3305 
3306 {
3307  try {
3308  if (! m_SeqVec) {
3310  auto bsxl = bsx.Lock();
3311  if (bsxl) {
3313  if (lc) {
3314  m_SeqVec = new CSeqVector(*lc, *bsxl->GetScope());
3315  if (m_SeqVec) {
3316  if (bsxl->IsAA()) {
3318  } else {
3320  }
3321  }
3322  }
3323  }
3324  }
3325 
3326  if (m_SeqVec) {
3327  CSeqVector& vec = *m_SeqVec;
3328  if (from < 0) {
3329  from = 0;
3330  }
3331  if (to < 0 || to >= (int) vec.size()) {
3332  to = vec.size();
3333  }
3334  if (vec.CanGetRange(from, to)) {
3335  vec.GetSeqData(from, to, buffer);
3336  } else {
3337  SetFetchFailure(true);
3338  }
3339  }
3340  }
3341  catch (CException& e) {
3342  SetFetchFailure(true);
3343  ERR_POST_X(11, Error << "Error in CFeatureIndex::GetSequence: " << e.what());
3344  }
3345 }
3346 
3347 string CFeatureIndex::GetSequence (int from, int to)
3348 
3349 {
3350  string buffer;
3351 
3352  GetSequence(from, to, buffer);
3353 
3354  return buffer;
3355 }
3356 
3358 
3359 {
3360  GetSequence(0, -1, buffer);
3361 }
3362 
3364 
3365 {
3366  string buffer;
3367 
3368  GetSequence(0, -1, buffer);
3369 
3370  return buffer;
3371 }
3372 
3373 
3374 // CWordPairIndexer
3375 
3376 // superscript and subscript code points not handled by UTF8ToAsciiString
3380  { 0x00B2, '2' },
3381  { 0x00B3, '3' },
3382  { 0x00B9, '1' },
3383  { 0x2070, '0' },
3384  { 0x2071, '1' },
3385  { 0x2074, '4' },
3386  { 0x2075, '5' },
3387  { 0x2076, '6' },
3388  { 0x2077, '7' },
3389  { 0x2078, '8' },
3390  { 0x2079, '9' },
3391  { 0x207A, '+' },
3392  { 0x207B, '-' },
3393  { 0x207C, '=' },
3394  { 0x207D, '(' },
3395  { 0x207E, ')' },
3396  { 0x207F, 'n' },
3397  { 0x2080, '0' },
3398  { 0x2081, '1' },
3399  { 0x2082, '2' },
3400  { 0x2083, '3' },
3401  { 0x2084, '4' },
3402  { 0x2085, '5' },
3403  { 0x2086, '6' },
3404  { 0x2087, '7' },
3405  { 0x2088, '8' },
3406  { 0x2089, '9' },
3407  { 0x208A, '+' },
3408  { 0x208B, '-' },
3409  { 0x208C, '=' },
3410  { 0x208D, '(' },
3411  { 0x208E, ')' }
3412 };
3415 
3417 
3418 {
3419  const char* src = str.c_str();
3420  string dst;
3421  while (*src) {
3422  if (static_cast<unsigned char>(*src) < 128) { // no translation needed
3423  dst += *src++;
3424  } else {
3425  utf8::TUnicode character;
3426  size_t n = utf8::UTF8ToUnicode(src, &character);
3427  src += n;
3429  = sc_ExtraTranslations.find(character);
3430  if (it != sc_ExtraTranslations.end()) {
3431  dst += it->second;
3432  } else {
3433  const utf8::SUnicodeTranslation* translation =
3434  utf8::UnicodeToAscii(character);
3435  if (translation != NULL && translation->Type != utf8::eSkip) {
3436  _ASSERT(translation->Type == utf8::eString);
3437  if (translation->Subst != NULL) {
3438  dst += translation->Subst;
3439  }
3440  }
3441  }
3442  }
3443  }
3444  return dst;
3445 }
3446 
// Lower-case words (plus the bare "+" and "-" tokens) that are treated as
// stop words by the word-pair indexer.
// NOTE(review): the entries are in strictly ascending byte order; presumably
// the static lookup set built from this table depends on that ordering --
// confirm before inserting or reordering entries.
static const char* const idxStopWords[] = {
    "+", "-",
    "a", "about", "again", "all", "almost", "also", "although", "always",
    "among", "an", "and", "another", "any", "are", "as", "at",
    "be", "because", "been", "before", "being", "between", "both", "but", "by",
    "can", "could",
    "did", "do", "does", "done", "due", "during",
    "each", "either", "enough", "especially", "etc",
    "for", "found", "from", "further",
    "had", "has", "have", "having", "here", "how", "however",
    "i", "if", "in", "into", "is", "it", "its", "itself",
    "just",
    "kg", "km",
    "made", "mainly", "make", "may", "mg", "might", "ml", "mm", "most",
    "mostly", "must",
    "nearly", "neither", "no", "nor",
    "obtained", "of", "often", "on", "our", "overall",
    "perhaps", "pmid",
    "quite",
    "rather", "really", "regarding",
    "seem", "seen", "several", "should", "show", "showed", "shown", "shows",
    "significantly", "since", "so", "some", "such",
    "than", "that", "the", "their", "theirs", "them", "then", "there",
    "therefore", "these", "they", "this", "those", "through", "thus", "to",
    "upon", "use", "used", "using",
    "various", "very",
    "was", "we", "were", "what", "when", "which", "while", "with", "within",
    "without", "would",
};
3586 
3588 
3589 {
3590  TStopWords::const_iterator iter = sc_StopWords.find(str.c_str());
3591  return (iter != sc_StopWords.end());
3592 }
3593 
3595 
3596 {
3597  string dst = str;
3598 
3599  int max = (int) dst.length();
3600 
3601  for (; max > 0; max--) {
3602  char ch = dst[0];
3603  if (ch != '.' && ch != ',' && ch != ':' && ch != ';') {
3604  break;
3605  }
3606  // trim leading period, comma, colon, and semicolon
3607  dst.erase(0, 1);
3608  }
3609 
3610  for (; max > 0; max--) {
3611  char ch = dst[max-1];
3612  if (ch != '.' && ch != ',' && ch != ':' && ch != ';') {
3613  break;
3614  }
3615  // // trim trailing period, comma, colon, and semicolon
3616  dst.erase(max-1, 1);
3617  }
3618 
3619  if (max > 1) {
3620  if (dst[0] == '(' && dst[max-1] == ')') {
3621  // trim flanking parentheses
3622  dst.erase(max-1, 1);
3623  dst.erase(0, 1);
3624  max -= 2;
3625  }
3626  }
3627 
3628  if (max > 0) {
3629  if (dst[0] == '(' && NStr::Find (dst, ")") == NPOS) {
3630  // trim isolated left parentheses
3631  dst.erase(0, 1);
3632  max--;
3633  }
3634  }
3635 
3636  if (max > 1) {
3637  if (dst[max-1] == ')' && NStr::Find (dst, "(") == NPOS) {
3638  // trim isolated right parentheses
3639  dst.erase(max-1, 1);
3640  // max--;
3641  }
3642  }
3643 
3644  return dst;
3645 }
3646 
// Inline formatting markup recognized by SkipMixedContent: bold, italic,
// underline, superscript and subscript tags (open, close and self-closing),
// each in raw form, entity-escaped once (&lt;...&gt;) and entity-escaped
// twice (&amp;lt;...&amp;gt;).
static const char* const mixedTags[] = {
    "<b>",
    "<i>",
    "<u>",
    "<sup>",
    "<sub>",
    "</b>",
    "</i>",
    "</u>",
    "</sup>",
    "</sub>",
    "<b/>",
    "<i/>",
    "<u/>",
    "<sup/>",
    "<sub/>",
    "&lt;i&gt;",
    "&lt;/i&gt;",
    "&lt;i/&gt;",
    "&lt;b&gt;",
    "&lt;/b&gt;",
    "&lt;b/&gt;",
    "&lt;u&gt;",
    "&lt;/u&gt;",
    "&lt;u/&gt;",
    "&lt;sub&gt;",
    "&lt;/sub&gt;",
    "&lt;sub/&gt;",
    "&lt;sup&gt;",
    "&lt;/sup&gt;",
    "&lt;sup/&gt;",
    "&amp;lt;i&amp;gt;",
    "&amp;lt;/i&amp;gt;",
    "&amp;lt;i/&amp;gt;",
    "&amp;lt;b&amp;gt;",
    "&amp;lt;/b&amp;gt;",
    "&amp;lt;b/&amp;gt;",
    "&amp;lt;u&amp;gt;",
    "&amp;lt;/u&amp;gt;",
    "&amp;lt;u/&amp;gt;",
    "&amp;lt;sub&amp;gt;",
    "&amp;lt;/sub&amp;gt;",
    "&amp;lt;sub/&amp;gt;",
    "&amp;lt;sup&amp;gt;",
    "&amp;lt;/sup&amp;gt;",
    "&amp;lt;sup/&amp;gt;",
};

// Returns the length, in characters, of the mixedTags entry that the text at
// ptr begins with, or 0 when the text does not begin with any entry (or ptr
// is null). Entries are tried in table order and the first match wins.
static int SkipMixedContent ( const char* ptr )

{
    if (ptr == NULL) {
        return 0;
    }

    // Element count of the table. sizeof (mixedTags) by itself is the size in
    // bytes, and using it as the loop bound runs far past the last entry.
    const size_t numTags = sizeof (mixedTags) / sizeof (mixedTags[0]);

    for (size_t i = 0; i < numTags; i++) {
        const char* tag = mixedTags[i];
        const char* tmp = ptr;
        int len = 0;
        // advance through tag and text while they agree
        while (*tag && *tmp && *tag == *tmp) {
            tag++;
            tmp++;
            len++;
        }
        if (! *tag) {
            // reached the end of the tag, so all of it matched
            return len;
        }
    }
    return 0;
}
3713 
3714 string CWordPairIndexer::TrimMixedContent ( const string& str )
3715 
3716 {
3717  const char* src = str.c_str();
3718  string dst;
3719  while (*src) {
3720  if (*src == '<' || *src == '&') {
3721  int skip = SkipMixedContent (src);
3722  if (skip > 0) {
3723  src += skip;
3724  } else {
3725  dst += *src++;
3726  }
3727  } else {
3728  dst += *src++;
3729  }
3730  }
3731  return dst;
3732 }
3733 
3734 string CWordPairIndexer::x_AddToWordPairIndex (string item, string prev)
3735 
3736 {
3737  if (IsStopWord(item)) {
3738  return "";
3739  }
3740  // append item
3741  m_Norm.push_back(item);
3742  if (! prev.empty()) {
3743  // append prev+" "+item
3744  string pair = prev + " " + item;
3745  m_Pair.push_back(pair);
3746  }
3747  return item;
3748 }
3749 
3751 
3752 {
3753  m_Norm.clear();
3754  m_Pair.clear();
3755 
3757  NStr::ToLower(str);
3758 
3759  if (NStr::Find(str, "<") != NPOS || NStr::Find(str, "&") != NPOS) {
3761  }
3762 
3763  // split terms at spaces
3764  list<string> terms;
3765  NStr::Split( str, " ", terms, NStr::fSplit_Tokenize );
3766  string prev;
3767  ITERATE( list<string>, it, terms ) {
3768  string curr = NStr::TruncateSpaces( *it );
3769  // allow parentheses in chemical formula
3770  curr = TrimPunctuation(curr);
3771  prev = x_AddToWordPairIndex (curr, prev);
3772  }
3773 
3774  // convert non-alphanumeric punctuation to space
3775  for (int i = 0; i < str.length(); i++) {
3776  char ch = str[i];
3777  if (ch >= 'A' && ch <= 'Z') {
3778  } else if (ch >= 'a' && ch <= 'z') {
3779  } else if (ch >= '0' && ch <= '9') {
3780  } else {
3781  str[i] = ' ';
3782  }
3783  }
3784  // now splitting at all punctuation
3785  list<string> words;
3786  NStr::Split( str, " ", words, NStr::fSplit_Tokenize );
3787  prev = "";
3788  ITERATE( list<string>, it, words ) {
3789  string curr = NStr::TruncateSpaces( *it );
3790  prev = x_AddToWordPairIndex (curr, prev);
3791  }
3792 
3793  std::sort(m_Norm.begin(), m_Norm.end());
3794  auto nit = std::unique(m_Norm.begin(), m_Norm.end());
3795  m_Norm.erase(nit, m_Norm.end());
3796 
3797  std::sort(m_Pair.begin(), m_Pair.end());
3798  auto pit = std::unique(m_Pair.begin(), m_Pair.end());
3799  m_Pair.erase(pit, m_Pair.end());
3800 }
3801 
3802 
static CRef< CScope > m_Scope
@ eExtreme_Positional
numerical value
Definition: Na_strand.hpp:63
Tracks the best score (lowest value).
Definition: ncbiutil.hpp:219
CBinomialOrgName –.
bool IsSetCommon(void) const
Definition: BioSource.cpp:345
const string & GetTaxname(void) const
Definition: BioSource.cpp:340
const string & GetCommon(void) const
Definition: BioSource.cpp:350
const COrgName & GetOrgname(void) const
Definition: BioSource.cpp:410
bool IsSetOrgname(void) const
Definition: BioSource.cpp:405
bool IsSetTaxname(void) const
Definition: BioSource.cpp:335
CTempString GetSubstrain(void)
Definition: indexer.cpp:2777
CBioSource::TGenome m_Genome
Definition: indexer.hpp:725
CRef< CScope > m_Scope
Definition: indexer.hpp:622
bool m_FeatForProdInitialized
Definition: indexer.hpp:637
bool m_HTGSPooled
Definition: indexer.hpp:756
bool m_IsTLS
Definition: indexer.hpp:703
bool m_IsUnverifiedMisassembled
Definition: indexer.hpp:771
bool m_BestProtFeatInitialized
Definition: indexer.hpp:640
bool m_has_clone
Definition: indexer.hpp:739
bool m_ThirdParty
Definition: indexer.hpp:677
vector< CRef< CFeatureIndex > > m_SfxList
Definition: indexer.hpp:633
TFeatIndexMap m_FeatIndexMap
Definition: indexer.hpp:645
bool IsHTGTech(void)
Definition: indexer.cpp:2415
bool m_HasGene
Definition: indexer.hpp:785
CTempString m_Species
Definition: indexer.hpp:723
CTempString m_SpecimenVoucher
Definition: indexer.hpp:747
bool HasSource(void)
Definition: indexer.cpp:3023
~CBioseqIndex(void)
Definition: indexer.cpp:1101
CSeq_inst::TLength GetLength(void) const
Definition: indexer.hpp:474
bool m_IsDeltaLitOnly
Definition: indexer.hpp:662
CSeq_inst::TTopology m_Topology
Definition: indexer.hpp:658
string m_SecondSuperKingdom
Definition: indexer.hpp:732
CRef< CSeqVector > m_SeqVec
Definition: indexer.hpp:647
CTempString GetCultivar(void)
Definition: indexer.cpp:2735
bool IsForceOnlyNearFeats(void)
Definition: indexer.cpp:2877
bool HasClone(void)
Definition: indexer.cpp:2685
CTempString GetMetaGenomeSource(void)
Definition: indexer.cpp:2787
CTempString m_Clone
Definition: indexer.hpp:738
bool IsTPAReasm(void)
Definition: indexer.cpp:2847
string m_PDBChainID
Definition: indexer.hpp:690
bool IsTPAInf(void)
Definition: indexer.cpp:2837
bool m_IsPseudogene
Definition: indexer.hpp:782
void x_InitGaps(void)
Definition: indexer.cpp:1107
bool m_IsPatent
Definition: indexer.hpp:674
bool m_IsUnreviewed
Definition: indexer.hpp:775
CTempString GetBreed(void)
Definition: indexer.cpp:2725
const vector< CRef< CGapIndex > > & GetGapIndices(void)
Definition: indexer.cpp:3117
bool IsChromosome(void)
Definition: indexer.cpp:2605
CBioseqIndex(CBioseq_Handle bsh, const CBioseq &bsp, CBioseq_Handle obsh, CRef< CSeqsetIndex > prnt, CSeq_entry_Handle tseh, CRef< CScope > scope, CSeqMasterIndex &idx, CSeqEntryIndex::EPolicy policy, CSeqEntryIndex::TFlags flags)
Definition: indexer.cpp:783
string m_Lineage
Definition: indexer.hpp:718
string m_PatentCountry
Definition: indexer.hpp:685
bool IsWGS(void)
Definition: indexer.cpp:2455
bool m_ForceOnlyNearFeats
Definition: indexer.hpp:766
bool m_IsUnreviewedUnannotated
Definition: indexer.hpp:776
CConstRef< CBioSource > m_BioSource
Definition: indexer.hpp:714
bool m_UsingAnamorph
Definition: indexer.hpp:720
CTempString GetGenus(void)
Definition: indexer.cpp:2555
CMolInfo::TTech GetTech(void)
Definition: indexer.cpp:2395
CRef< CFeatureIndex > GetFeatureForProduct(void)
Definition: indexer.cpp:2298
const vector< CRef< CFeatureIndex > > & GetFeatureIndices(void)
Definition: indexer.cpp:3137
bool IsEST_STS_GSS(void)
Definition: indexer.cpp:2465
string m_Title
Definition: indexer.hpp:693
bool m_FetchFailure
Definition: indexer.hpp:652
const string & GetLineage(void)
Definition: indexer.cpp:2525
CConstRef< CMolInfo > m_MolInfo
Definition: indexer.hpp:696
bool m_IsUnverifiedFeature
Definition: indexer.hpp:769
CTempString GetStrain(void)
Definition: indexer.cpp:2767
CTempString m_Chromosome
Definition: indexer.hpp:736
CTempString m_Isolate
Definition: indexer.hpp:748
bool m_WGSMaster
Definition: indexer.hpp:678
bool IsUnreviewed(void)
Definition: indexer.cpp:2937
bool m_UseBiosrc
Definition: indexer.hpp:708
bool m_IsMap
Definition: indexer.hpp:664
bool m_HTGTech
Definition: indexer.hpp:701
bool m_TSAMaster
Definition: indexer.hpp:679
bool m_HTGSUnfinished
Definition: indexer.hpp:702
bool IsTPAExp(void)
Definition: indexer.cpp:2827
bool m_IsUnverifiedContaminant
Definition: indexer.hpp:772
bool IsHTGSPooled(void)
Definition: indexer.cpp:2817
bool m_IsPDB
Definition: indexer.hpp:675
bool IsHTGSCancelled(void)
Definition: indexer.cpp:2797
bool m_IsVirtual
Definition: indexer.hpp:663
bool m_IsDelta
Definition: indexer.hpp:661
void x_InitFeats(void)
Definition: indexer.cpp:2285
bool IsAA(void) const
Definition: indexer.hpp:472
CConstRef< CMolInfo > GetMolInfo(void)
Definition: indexer.cpp:2375
TTaxId m_Taxid
Definition: indexer.hpp:719
CBioseq_Handle GetBioseqHandle(void) const
Definition: indexer.hpp:428
bool IsUnverifiedFeature(void)
Definition: indexer.cpp:2897
bool m_IsRefSeq
Definition: indexer.hpp:669
CTempString GetSpecies(void)
Definition: indexer.cpp:2565
string m_PatentNumber
Definition: indexer.hpp:686
CRef< CFeatureIndex > GetFeatIndex(const CMappedFeat &mf)
Definition: indexer.cpp:3043
CConstRef< CBioSource > GetBioSource(void)
Definition: indexer.cpp:2485
string m_FirstSuperKingdom
Definition: indexer.hpp:731
CTempString m_Cultivar
Definition: indexer.hpp:746
bool HasOperon(void)
Definition: indexer.cpp:2987
string GetrEnzyme(void)
Definition: indexer.cpp:3033
string GetSequence(void)
Definition: indexer.cpp:3107
string m_GeneralStr
Definition: indexer.hpp:682
string GetSecondSuperKingdom(void)
Definition: indexer.cpp:2635
bool m_HTGSCancelled
Definition: indexer.hpp:754
bool IsUsingAnamorph(void)
Definition: indexer.cpp:2545
const string & GetAccession(void) const
Definition: indexer.hpp:482
CMolInfo::TCompleteness GetCompleteness(void)
Definition: indexer.cpp:2405
bool m_Multispecies
Definition: indexer.hpp:724
CWeakRef< CSeqMasterIndex > GetSeqMasterIndex(void) const
Definition: indexer.hpp:436
bool IsPlasmid(void)
Definition: indexer.cpp:2595
bool IsCrossKingdom(void)
Definition: indexer.cpp:2645
CTempString m_Substrain
Definition: indexer.hpp:750
const vector< CRef< CDescriptorIndex > > & GetDescriptorIndices(void)
Definition: indexer.cpp:3127
CSeqEntryIndex::TFlags m_Flags
Definition: indexer.hpp:650
bool IsUnordered(void)
Definition: indexer.cpp:2857
bool m_IsUnverifiedOrganism
Definition: indexer.hpp:770
CConstRef< CBioSource > m_DescBioSource
Definition: indexer.hpp:711
CSeq_inst::TLength m_Length
Definition: indexer.hpp:659
bool IsNA(void) const
Definition: indexer.hpp:471
string m_Organelle
Definition: indexer.hpp:729
CTempString GetIsolate(void)
Definition: indexer.cpp:2757
bool m_IsEST_STS_GSS
Definition: indexer.hpp:706
CTempString GetPDBCompound(void)
Definition: indexer.cpp:2867
CTempString m_Breed
Definition: indexer.hpp:745
bool HasGene(void)
Definition: indexer.cpp:3003
const string & GetTitle(void)
Definition: indexer.cpp:2365
CTempString GetMap(void)
Definition: indexer.cpp:2695
CBioseq_Handle m_Bsh
Definition: indexer.hpp:617
bool m_HasMultiIntervalGenes
Definition: indexer.hpp:786
bool m_TLSMaster
Definition: indexer.hpp:680
bool m_HasSource
Definition: indexer.hpp:787
bool IsUnreviewedUnannotated(void)
Definition: indexer.cpp:2947
bool m_IsTSA
Definition: indexer.hpp:704
void GetSelector(SAnnotSelector &sel)
Definition: indexer.cpp:2054
bool IsPseudogene(void)
Definition: indexer.cpp:2977
CTempString GetSegment(void)
Definition: indexer.cpp:2715
bool m_IsWGS
Definition: indexer.hpp:705
CTempString GetLinkageGroup(void)
Definition: indexer.cpp:2665
CBioseq_Handle m_OrigBsh
Definition: indexer.hpp:619
bool m_FeatsInitialized
Definition: indexer.hpp:632
vector< CRef< CDescriptorIndex > > m_SdxList
Definition: indexer.hpp:630
bool HasMultiIntervalGenes(void)
Definition: indexer.cpp:3013
bool IsHTGSDraft(void)
Definition: indexer.cpp:2807
CMolInfo::TTech m_Tech
Definition: indexer.hpp:698
string m_Accession
Definition: indexer.hpp:667
CTempString m_Segment
Definition: indexer.hpp:742
CMolInfo::TBiomol m_Biomol
Definition: indexer.hpp:697
const string & GetTaxname(void)
Definition: indexer.cpp:2495
bool m_SourcesInitialized
Definition: indexer.hpp:635
bool m_IsCrossKingdom
Definition: indexer.hpp:733
CWeakRef< CBioseqIndex > GetBioseqForProduct(void)
Definition: indexer.cpp:2340
bool m_TPAExp
Definition: indexer.hpp:757
const string & GetOrganelle(void)
Definition: indexer.cpp:2615
CRef< CFeatureIndex > GetBestProteinFeature(void)
Definition: indexer.cpp:2352
CTempString GetSpecimenVoucher(void)
Definition: indexer.cpp:2746
int m_GeneralId
Definition: indexer.hpp:683
string m_DescTaxname
Definition: indexer.hpp:712
vector< CRef< CGapIndex > > m_GapList
Definition: indexer.hpp:627
int m_PDBChain
Definition: indexer.hpp:689
CSeqEntryIndex::EPolicy m_Policy
Definition: indexer.hpp:649
CTempString GetClone(void)
Definition: indexer.cpp:2675
int m_PatentSequence
Definition: indexer.hpp:687
CRef< CFeatureIndex > m_BestProteinFeature
Definition: indexer.hpp:641
string m_Comment
Definition: indexer.hpp:781
CBioSource::TGenome GetGenome(void)
Definition: indexer.cpp:2585
bool m_Unordered
Definition: indexer.hpp:760
CTempString GetChromosome(void)
Definition: indexer.cpp:2655
void x_InitSource(void)
Definition: indexer.cpp:1305
bool IsUnverifiedMisassembled(void)
Definition: indexer.cpp:2917
CTempString m_TargetedLocus
Definition: indexer.hpp:778
CMolInfo::TBiomol GetBiomol(void)
Definition: indexer.cpp:2385
bool IsHTGSUnfinished(void)
Definition: indexer.cpp:2425
CTempString GetPlasmid(void)
Definition: indexer.cpp:2705
string GetFirstSuperKingdom(void)
Definition: indexer.cpp:2625
bool IsUseBiosrc(void)
Definition: indexer.cpp:2475
CTempString m_MetaGenomeSource
Definition: indexer.hpp:751
CTempString m_Plasmid
Definition: indexer.hpp:741
bool m_IsUnverified
Definition: indexer.hpp:768
void x_DefaultSelector(SAnnotSelector &sel, CSeqEntryIndex::EPolicy policy, CSeqEntryIndex::TFlags flags, bool onlyNear, CScope &scope)
Definition: indexer.cpp:1849
CTempString m_PDBCompound
Definition: indexer.hpp:763
bool m_HTGSDraft
Definition: indexer.hpp:755
bool IsTSA(void)
Definition: indexer.cpp:2445
bool IsUnverifiedContaminant(void)
Definition: indexer.cpp:2927
CTempString m_Map
Definition: indexer.hpp:740
string m_Taxname
Definition: indexer.hpp:715
string m_rEnzyme
Definition: indexer.hpp:790
const string & GetCommon(void)
Definition: indexer.cpp:2515
CMolInfo::TCompleteness m_Completeness
Definition: indexer.hpp:699
CTempString m_Strain
Definition: indexer.hpp:749
bool IsMultispecies(void)
Definition: indexer.cpp:2575
bool m_IsPlasmid
Definition: indexer.hpp:726
bool IsTLS(void)
Definition: indexer.cpp:2435
CRef< CFeatureIndex > m_FeatureForProduct
Definition: indexer.hpp:638
bool m_TPAReasm
Definition: indexer.hpp:759
void x_InitDescs(void)
Definition: indexer.cpp:1590
const string & GetComment(void)
Definition: indexer.cpp:2967
TTaxId GetTaxid(void)
Definition: indexer.cpp:2535
bool m_TPAInf
Definition: indexer.hpp:758
bool IsUnverifiedOrganism(void)
Definition: indexer.cpp:2907
CTempString GetTargetedLocus(void)
Definition: indexer.cpp:2957
CTempString m_LinkageGroup
Definition: indexer.hpp:737
bool m_DescsInitialized
Definition: indexer.hpp:629
const string & GetDescTaxname(void)
Definition: indexer.cpp:2505
bool m_GapsInitialized
Definition: indexer.hpp:626
string m_Common
Definition: indexer.hpp:717
bool IsUnverified(void)
Definition: indexer.cpp:2887
CTempString m_Genus
Definition: indexer.hpp:722
bool m_IsChromosome
Definition: indexer.hpp:727
CBioseq_Handle –.
CBioseq_set_Handle –.
CSeq_entry * GetParentEntry(void) const
Definition: Bioseq_set.hpp:122
CSeq_entry * GetParentEntry(void) const
Definition: Bioseq.hpp:174
Definition: Dbtag.hpp:53
bool IsSkippable(void) const
Definition: Dbtag.cpp:281
const CSeqdesc & m_Sd
Definition: indexer.hpp:870
CSeqdesc::E_Choice m_Type
Definition: indexer.hpp:873
CDescriptorIndex(const CSeqdesc &sd, CBioseqIndex &bsx)
Definition: indexer.cpp:3174
CEMBL_block –.
Definition: EMBL_block.hpp:66
CFeat_CI –.
Definition: feat_ci.hpp:64
CFeatureIndex(CSeq_feat_Handle sfh, const CMappedFeat mf, CConstRef< CSeq_loc > feat_loc, CBioseqIndex &bsx)
Definition: indexer.cpp:3186
const CMappedFeat m_Mf
Definition: indexer.hpp:935
CSeqFeatData::ESubtype m_Subtype
Definition: indexer.hpp:941
CSeqFeatData::ESubtype GetSubtype(void) const
Definition: indexer.hpp:909
CRef< CFeatureIndex > GetOverlappingSource(void)
Definition: indexer.cpp:3274
CSeqFeatData::E_Choice GetType(void) const
Definition: indexer.hpp:906
TSeqPos m_End
Definition: indexer.hpp:944
CRef< CSeqVector > m_SeqVec
Definition: indexer.hpp:937
CRef< CFeatureIndex > GetBestParent(void)
Definition: indexer.cpp:3230
CRef< CFeatureIndex > GetBestGene(void)
Definition: indexer.cpp:3203
CSeqFeatData::E_Choice m_Type
Definition: indexer.hpp:940
const CMappedFeat GetMappedFeat(void) const
Definition: indexer.hpp:897
CConstRef< CSeq_loc > m_Fl
Definition: indexer.hpp:936
void SetFetchFailure(bool fails)
Definition: indexer.cpp:3263
TSeqPos m_Start
Definition: indexer.hpp:943
CConstRef< CSeq_loc > GetMappedLocation(void) const
Definition: indexer.hpp:900
CWeakRef< CBioseqIndex > GetBioseqIndex(void) const
Definition: indexer.hpp:903
string GetSequence(void)
Definition: indexer.cpp:3363
CGapIndex(TSeqPos start, TSeqPos end, TSeqPos length, const string &type, const vector< string > &evidence, bool isUnknownLength, bool isAssemblyGap, CBioseqIndex &bsx)
Definition: indexer.cpp:3151
const string & GetSomeNumber(void) const
Definition: Id_pat.cpp:96
CMap_ext –.
Definition: Map_ext.hpp:66
CMappedFeat –.
Definition: mapped_feat.hpp:59
@OrgMod.hpp User-defined methods of the data storage class.
Definition: OrgMod.hpp:54
TTaxId GetTaxId() const
Definition: Org_ref.cpp:72
CPDB_block –.
Definition: PDB_block.hpp:66
CPartialOrgName –.
CRsite_ref –.
Definition: Rsite_ref.hpp:66
CScope –.
Definition: scope.hpp:92
CRef< CSeqMasterIndex > m_Idx
Definition: indexer.hpp:193
bool IsFetchFailure(void)
Definition: indexer.cpp:209
const vector< CRef< CBioseqIndex > > & GetBioseqIndices(void)
Definition: indexer.cpp:155
bool DistributedReferences(void)
Definition: indexer.cpp:167
void SetFeatDepth(int featDepth)
Definition: indexer.cpp:185
const vector< CRef< CSeqsetIndex > > & GetSeqsetIndices(void)
Definition: indexer.cpp:161
CSeqEntryIndex(CSeq_entry_Handle &topseh, EPolicy policy=eAdaptive, TFlags flags=fDefault)
Definition: indexer.cpp:57
int GetGapDepth(void)
Definition: indexer.cpp:203
bool IsIndexFailure(void)
Definition: indexer.cpp:215
FAddSnpFunc * GetSnpFunc(void)
Definition: indexer.cpp:179
void SetGapDepth(int gapDepth)
Definition: indexer.cpp:197
void SetSnpFunc(FAddSnpFunc *snp)
Definition: indexer.cpp:173
CRef< CBioseqIndex > GetBioseqIndex(void)
Definition: indexer.cpp:114
int GetFeatDepth(void)
Definition: indexer.cpp:191
ESubtype GetSubtype(void) const
@ eSubtype_bad
These no longer need to match the FEATDEF values in the C toolkit's objfdef.h.
Iterator over CSeqMap.
Definition: seq_map_ci.hpp:252
bool m_DistributedReferences
Definition: indexer.hpp:321
CSeqEntryIndex::TFlags m_Flags
Definition: indexer.hpp:304
const vector< CRef< CSeqsetIndex > > & GetSeqsetIndices(void)
Definition: indexer.cpp:755
CConstRef< CSeq_descr > m_TopDescr
Definition: indexer.hpp:300
FAddSnpFunc * m_SnpFunc
Definition: indexer.hpp:323
const vector< CRef< CBioseqIndex > > & GetBioseqIndices(void)
Definition: indexer.cpp:749
bool DistributedReferences(void) const
Definition: indexer.hpp:265
void x_Initialize(CSeq_entry_Handle &topseh, CSeqEntryIndex::EPolicy policy, CSeqEntryIndex::TFlags flags)
Definition: indexer.cpp:225
CSeqEntryIndex::EPolicy m_Policy
Definition: indexer.hpp:303
bool m_IndexFailure
Definition: indexer.hpp:330
TAccnIndexMap m_AccnIndexMap
Definition: indexer.hpp:310
void x_InitSeqs(const CSeq_entry &sep, CRef< CSeqsetIndex > prnt, int level=0)
Definition: indexer.cpp:514
CConstRef< CSubmit_block > m_SbtBlk
Definition: indexer.hpp:299
FAddSnpFunc * GetSnpFunc(void)
Definition: indexer.cpp:416
bool IsFetchFailure(void)
Definition: indexer.cpp:448
void SetGapDepth(int gapDepth)
Definition: indexer.cpp:434
CRef< CObjectManager > m_Objmgr
Definition: indexer.hpp:294
TBestIdIndexMap m_BestIdIndexMap
Definition: indexer.hpp:314
CAtomicCounter m_Counter
Definition: indexer.hpp:328
int GetGapDepth(void)
Definition: indexer.cpp:440
void SetFeatDepth(int featDepth)
Definition: indexer.cpp:422
int GetFeatDepth(void)
Definition: indexer.cpp:428
vector< CRef< CBioseqIndex > > m_BsxList
Definition: indexer.hpp:306
void SetSnpFunc(FAddSnpFunc *snp)
Definition: indexer.cpp:410
CConstRef< CSeq_entry > m_Tsep
Definition: indexer.hpp:298
void x_Init(void)
Definition: indexer.cpp:630
CRef< CBioseqIndex > GetBioseqIndex(void)
Definition: indexer.cpp:673
CRef< CScope > m_Scope
Definition: indexer.hpp:295
vector< CRef< CSeqsetIndex > > m_SsxList
Definition: indexer.hpp:316
bool m_IsSmallGenomeSet
Definition: indexer.hpp:319
bool IsIndexFailure(void) const
Definition: indexer.hpp:283
CSeq_entry_Handle m_Tseh
Definition: indexer.hpp:296
CRef< feature::CFeatTree > m_FeatTree
Definition: indexer.hpp:301
CSeqVector –.
Definition: seq_vector.hpp:65
@Seq_descr.hpp User-defined methods of the data storage class.
Definition: Seq_descr.hpp:55
CSeq_entry_Handle –.
Definition: Seq_entry.hpp:56
void Parentize(void)
Definition: Seq_entry.cpp:71
CSeq_ext –.
Definition: Seq_ext.hpp:66
CSeq_feat_Handle –.
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
CSeq_loc_Mapper –.
CSeqdesc_CI –.
Definition: seqdesc_ci.hpp:65
CBioseq_set::TClass m_Class
Definition: indexer.hpp:363
CBioseq_set::TClass GetClass(void) const
Definition: indexer.hpp:356
CSeqsetIndex(CBioseq_set_Handle ssh, const CBioseq_set &bssp, CRef< CSeqsetIndex > prnt)
Definition: indexer.cpp:765
TBase::const_iterator const_iterator
Definition: static_set.hpp:828
class CStaticArrayMap<> is an array adaptor that provides an STLish interface to statically-defined a...
Definition: static_map.hpp:105
TBase::const_iterator const_iterator
Definition: static_map.hpp:109
CSubmit_block –.
CTaxElement –.
Definition: TaxElement.hpp:66
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
bool IsUnverifiedMisassembled() const
bool IsUnverifiedOrganism() const
bool IsUnverifiedContaminant() const
bool IsUnverifiedFeature() const
bool IsUnreviewedUnannotated() const
static string TrimMixedContent(const string &str)
Definition: indexer.cpp:3714
vector< string > m_Pair
Definition: indexer.hpp:981
static bool IsStopWord(const string &str)
Definition: indexer.cpp:3587
void PopulateWordPairIndex(string str)
Definition: indexer.cpp:3750
vector< string > m_Norm
Definition: indexer.hpp:980
string x_AddToWordPairIndex(string item, string prev)
Definition: indexer.cpp:3734
static string TrimPunctuation(const string &str)
Definition: indexer.cpp:3594
static string ConvertUTF8ToAscii(const string &str)
Definition: indexer.cpp:3416
const_iterator end() const
Definition: map.hpp:152
const_iterator find(const key_type &key) const
Definition: map.hpp:153
Definition: map.hpp:338
static uch flags
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:61
#define FOR_EACH_USERFIELD_ON_USEROBJECT(Itr, Var)
FOR_EACH_USERFIELD_ON_USEROBJECT EDIT_EACH_USERFIELD_ON_USEROBJECT.
static int lc
Definition: getdata.c:30
#define ZERO_TAX_ID
Definition: ncbimisc.hpp:1115
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
#define NULL
Definition: ncbistd.hpp:225
void Set(TValue new_value) THROWS_NONE
Set atomic counter value.
Definition: ncbicntr.hpp:185
void SetDiagFilter(EDiagFilter what, const char *filter_str)
Set diagnostic filter.
Definition: ncbidiag.cpp:7670
#define ERR_POST_X(err_subcode, message)
Error posting with default error code and given error subcode.
Definition: ncbidiag.hpp:550
@ eDiagFilter_All
for all non-FATAL
Definition: ncbidiag.hpp:2531
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
virtual const char * what(void) const noexcept
Standard report (includes full backlog).
Definition: ncbiexpt.cpp:342
static EAccessionInfo IdentifyAccession(const CTempString &accession, TParseFlags flags=fParse_AnyRaw)
Deduces information from a bare accession a la WHICH_db_accession; may report false negatives on prop...
Definition: Seq_id.cpp:1634
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
Definition: Seq_id.cpp:318
CConstRef< CSeq_id > GetSeqId(void) const
EAccessionInfo
For IdentifyAccession (below)
Definition: Seq_id.hpp:220
string AsString(void) const
static int Score(const CRef< CSeq_id > &id)
Wrappers for use with FindBestChoice from <corelib/ncbiutil.hpp>
Definition: Seq_id.hpp:740
@ fAcc_master
Definition: Seq_id.hpp:230
TSeqPos GetStart(ESeqLocExtremes ext) const
Return start and stop positions of the seq-loc.
Definition: Seq_loc.cpp:915
void SetInt(TInt &v)
Definition: Seq_loc.hpp:983
TSeqPos GetStop(ESeqLocExtremes ext) const
Definition: Seq_loc.cpp:963
CMappedFeat GetBestParentForFeat(const CMappedFeat &feat, CSeqFeatData::ESubtype parent_subtype, CFeatTree *feat_tree=0, const SAnnotSelector *base_sel=0)
Definition: feature.cpp:3462
CMappedFeat GetBestGeneForFeat(const CMappedFeat &feat, CFeatTree *feat_tree=0, const SAnnotSelector *base_sel=0, CFeatTree::EBestGeneType lookup_type=CFeatTree::eBestGene_TreeOnly)
Definition: feature.cpp:3443
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
vector< string > gap_linkage_evidences
A vector representing the linkage-evidences of the gap.
Definition: sequence.hpp:871
string gap_type
String representing the gap type.
Definition: sequence.hpp:868
static void GetGapModText(const CSeq_gap &seq_gap, SGapModText &out_gap_mod_text)
Given a CSeq_gap object, this outputs the Gap information.
Definition: sequence.cpp:3483
CRef< CSeq_loc > Map(const CSeq_loc &src_loc)
Map seq-loc.
CSeq_loc_Mapper_Base & TruncateNonmappingRanges(void)
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CSeq_entry_Handle AddTopLevelSeqEntry(CSeq_entry &top_entry, TPriority pri=kPriority_Default, EExist action=eExist_Default)
Add seq_entry, default priority is higher than for defaults or loaders. Add object to the scope with p...
Definition: scope.cpp:522
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
void AddDefaults(TPriority pri=kPriority_Default)
Add default data loaders from object manager.
Definition: scope.cpp:504
CBioseq_set_Handle GetBioseq_setHandle(const CBioseq_set &seqset, EMissing action=eMissing_Default)
Definition: scope.cpp:176
void SetFuzzOption(TFuzzOption newOption)
vector< CSeq_id_Handle > TId
const CTSE_Handle & GetTSE_Handle(void) const
Get CTSE_Handle of containing TSE.
TClass GetClass(void) const
const TInst_Ext & GetInst_Ext(void) const
bool IsSetInst_Ext(void) const
const CSeqFeatData & GetData(void) const
TSeqPos GetBioseqLength(void) const
bool IsAa(void) const
CConstRef< CSeq_id > GetSeqId(void) const
Get id which can be used to access this bioseq handle Throws an exception if none is available.
bool IsSetInst_Length(void) const
TInst_Topology GetInst_Topology(void) const
CSeq_entry_Handle GetTopLevelEntry(void) const
Get top level Seq-entry handle.
TInst_Length GetInst_Length(void) const
bool IsSetInst(void) const
bool IsSetInst_Repr(void) const
bool IsSetClass(void) const
CConstRef< CSeq_entry > GetCompleteSeq_entry(void) const
Complete and get const reference to the seq-entry.
TInst_Repr GetInst_Repr(void) const
CScope & GetScope(void) const
Get scope this handle belongs to.
bool IsNa(void) const
bool IsSetInst_Topology(void) const
CSeq_entry_Handle GetTopLevelEntry(void) const
Get top level Seq-entry handle.
const TId & GetId(void) const
bool IsSetData(void) const
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
SSeqMapSelector & SetResolveCount(size_t res_cnt)
Set max depth of resolving seq-map.
Definition: seq_map_ci.hpp:151
SAnnotSelector & ExcludeFeatSubtype(TFeatSubtype subtype)
Exclude feature subtype from the search.
CSeq_id_Handle GetProductId(void) const
SAnnotSelector & SetResolveAll(void)
SetResolveAll() is equivalent to SetResolveMethod(eResolve_All).
SAnnotSelector & ExcludeFeatType(TFeatType type)
Exclude feature type from the search.
SAnnotSelector & SetMaxSearchSegmentsAction(EMaxSearchSegmentsAction action)
const CSeq_loc & GetLocation(void) const
SSeqMapSelector & SetFlags(TFlags flags)
Select segment type(s)
Definition: seq_map_ci.hpp:179
SAnnotSelector & SetAdaptiveDepth(bool value=true)
SetAdaptiveDepth() requests to restrict subsegment resolution depending on annotations found on lower...
SAnnotSelector & SetResolveDepth(int depth)
SetResolveDepth sets the limit of subsegment resolution in searching annotations.
SAnnotSelector & SetFeatComparator(IFeatComparator *comparator)
SAnnotSelector & IncludeNamedAnnotAccession(const string &acc, int zoom_level=0)
const CSeq_feat_Handle & GetSeq_feat_Handle(void) const
Get original feature handle.
Definition: mapped_feat.hpp:71
SAnnotSelector & SetExcludeExternal(bool exclude=true)
External annotations for the Object Manager are annotations located in top level Seq-entry different f...
SAnnotSelector & SetMaxSearchTime(TMaxSearchTime max_time)
Set maximum time (in seconds) to search before giving up.
SAnnotSelector & SetFailUnresolved(void)
SAnnotSelector & ExcludeNamedAnnots(const CAnnotName &name)
Add named annot to set of annots names to exclude.
SAnnotSelector & SetIgnoreStrand(bool value=true)
Ignore strand when testing for range overlap.
SAnnotSelector & SetMaxSearchSegments(TMaxSearchSegments max_segments)
Set maximum number of empty segments to search before giving up.
SAnnotSelector & ExcludeNamedAnnotAccession(const string &acc)
CSeq_id_Handle GetLocationId(void) const
bool CanGetRange(TSeqPos start, TSeqPos stop) const
Check if the sequence data is available for the interval [start, stop).
Definition: seq_vector.cpp:292
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
Definition: seq_vector.cpp:304
TSeqPos size(void) const
Definition: seq_vector.hpp:291
void SetCoding(TCoding coding)
@ fFindGap
Definition: seq_map.hpp:130
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:1439
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
TRefType Lock(void) const
Lock the object and return reference to it.
Definition: ncbiobj.hpp:2713
bool NotEmpty(void) const THROWS_NONE
Check if CConstRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:1392
#define kMax_Int
Definition: ncbi_limits.h:184
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3457
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2989
#define NPOS
Definition: ncbistr.hpp:133
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2887
bool empty(void) const
Return true if the represented string is empty (i.e., the length is zero)
Definition: tempstr.hpp:334
void clear(void)
Clears the string.
Definition: tempstr.hpp:351
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5352
static string TruncateSpaces(const string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string.
Definition: ncbistr.cpp:3182
const char *const kEmptyCStr
Empty "C" string (points to a '\0').
Definition: ncbistr.cpp:68
static string & ToLower(string &str)
Convert string to lower case – string& version.
Definition: ncbistr.cpp:405
@ fSplit_Tokenize
All delimiters are merged and trimmed, to get non-empty tokens only.
Definition: ncbistr.hpp:2508
const TKeywords & GetKeywords(void) const
Get the Keywords member data.
bool IsSetKeywords(void) const
Check if a value has been assigned to Keywords data member.
const TKeywords & GetKeywords(void) const
Get the Keywords member data.
Definition: GB_block_.hpp:526
bool IsSetKeywords(void) const
Check if a value has been assigned to Keywords data member.
Definition: GB_block_.hpp:514
const TCountry & GetCountry(void) const
Get the Country member data.
Definition: Id_pat_.hpp:478
TGenome GetGenome(void) const
Get the Genome member data.
Definition: BioSource_.hpp:422
bool CanGetOrg(void) const
Check if it is safe to call GetOrg method.
Definition: BioSource_.hpp:503
const TOrg & GetOrg(void) const
Get the Org member data.
Definition: BioSource_.hpp:509
bool IsSetGenome(void) const
Check if a value has been assigned to Genome data member.
Definition: BioSource_.hpp:397
const TName & GetName(void) const
Get the Name member data.
Definition: SubSource_.hpp:350
bool IsSetName(void) const
Check if a value has been assigned to Name data member.
Definition: SubSource_.hpp:338
const TStr & GetStr(void) const
Get the variant data.
bool IsStr(void) const
Check if variant Str is selected.
Definition: Object_id_.hpp:291
bool IsSetType(void) const
type of object within class Check if a value has been assigned to Type data member.
const TTag & GetTag(void) const
Get the Tag member data.
Definition: Dbtag_.hpp:267
bool IsId(void) const
Check if variant Id is selected.
Definition: Object_id_.hpp:264
const TData & GetData(void) const
Get the Data member data.
bool IsSetTag(void) const
appropriate tag Check if a value has been assigned to Tag data member.
Definition: Dbtag_.hpp:255
bool IsStr(void) const
Check if variant Str is selected.
bool IsSetLabel(void) const
field label Check if a value has been assigned to Label data member.
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
const TLabel & GetLabel(void) const
Get the Label member data.
const TType & GetType(void) const
Get the Type member data.
bool IsSetData(void) const
Check if a value has been assigned to Data data member.
TId GetId(void) const
Get the variant data.
Definition: Object_id_.hpp:270
const TLineage & GetLineage(void) const
Get the Lineage member data.
Definition: OrgName_.hpp:864
TFixed_level GetFixed_level(void) const
Get the Fixed_level member data.
const TSubname & GetSubname(void) const
Get the Subname member data.
Definition: OrgMod_.hpp:347
bool IsSetFixed_level(void) const
Check if a value has been assigned to Fixed_level data member.
bool IsPartial(void) const
Check if variant Partial is selected.
Definition: OrgName_.hpp:753
const TName & GetName(void) const
Get the Name member data.
Definition: OrgName_.hpp:771
const TLevel & GetLevel(void) const
Get the Level member data.
bool IsSetName(void) const
Check if a value has been assigned to Name data member.
const TBinomial & GetBinomial(void) const
Get the variant data.
Definition: OrgName_.cpp:121
bool CanGetLineage(void) const
Check if it is safe to call GetLineage method.
Definition: OrgName_.hpp:858
const Tdata & Get(void) const
Get the member data.
const TName & GetName(void) const
Get the Name member data.
bool IsSetGenus(void) const
required Check if a value has been assigned to Genus data member.
const TSpecies & GetSpecies(void) const
Get the Species member data.
list< CRef< CTaxElement > > Tdata
bool IsSetSubname(void) const
Check if a value has been assigned to Subname data member.
Definition: OrgMod_.hpp:335
bool IsSetLevel(void) const
Check if a value has been assigned to Level data member.
const TGenus & GetGenus(void) const
Get the Genus member data.
const TPartial & GetPartial(void) const
Get the variant data.
Definition: OrgName_.cpp:193
bool IsSet(void) const
Check if a value has been assigned to data member.
bool IsSetSpecies(void) const
species required if subspecies used Check if a value has been assigned to Species data member.
bool IsSetName(void) const
Check if a value has been assigned to Name data member.
Definition: OrgName_.hpp:759
bool IsBinomial(void) const
Check if variant Binomial is selected.
Definition: OrgName_.hpp:715
EProcessed
processing status
Definition: Prot_ref_.hpp:95
TProcessed GetProcessed(void) const
Get the Processed member data.
Definition: Prot_ref_.hpp:538
bool IsSetProcessed(void) const
Check if a value has been assigned to Processed data member.
Definition: Prot_ref_.hpp:513
bool IsStr(void) const
Check if variant Str is selected.
Definition: Rsite_ref_.hpp:264
const TStr & GetStr(void) const
Get the variant data.
Definition: Rsite_ref_.hpp:270
E_Choice Which(void) const
Which variant is currently selected.
bool IsRsite(void) const
Check if variant Rsite is selected.
E_Choice
Choice variants.
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_feat_.hpp:925
const TBiosrc & GetBiosrc(void) const
Get the variant data.
const TRsite & GetRsite(void) const
Get the variant data.
const TProt & GetProt(void) const
Get the variant data.
@ e_Region
named region (globin locus)
@ e_Pub
publication applies to this seq
@ e_Comment
just a comment
TChain GetChain(void) const
Get the Chain member data.
bool IsSetChain_id(void) const
chain identifier; length-independent generalization of 'chain' Check if a value has been assigned to ...
bool IsSetChain(void) const
Deprecated: 'chain' can't support multiple character PDB chain identifiers (introduced in 2015).
bool IsSetAccession(void) const
Check if a value has been assigned to Accession data member.
bool IsSetCit(void) const
patent citation Check if a value has been assigned to Cit data member.
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_loc_.hpp:475
TSeqid GetSeqid(void) const
Get the Seqid member data.
E_Choice
Choice variants.
Definition: Seq_id_.hpp:93
const TChain_id & GetChain_id(void) const
Get the Chain_id member data.
bool IsNull(void) const
Check if variant Null is selected.
Definition: Seq_loc_.hpp:504
bool IsSetSeqid(void) const
number of sequence in patent Check if a value has been assigned to Seqid data member.
const TCit & GetCit(void) const
Get the Cit member data.
const TAccession & GetAccession(void) const
Get the Accession member data.
@ e_Other
for historical reasons, 'other' = 'refseq'
Definition: Seq_id_.hpp:104
@ e_Gpipe
Internal NCBI genome pipeline processing ID.
Definition: Seq_id_.hpp:113
@ e_Tpe
Third Party Annot/Seq EMBL.
Definition: Seq_id_.hpp:111
@ e_Tpd
Third Party Annot/Seq DDBJ.
Definition: Seq_id_.hpp:112
@ e_General
for other databases
Definition: Seq_id_.hpp:105
@ e_Ddbj
DDBJ.
Definition: Seq_id_.hpp:107
@ e_Gi
GenInfo Integrated Database.
Definition: Seq_id_.hpp:106
@ e_Tpg
Third Party Annot/Seq Genbank.
Definition: Seq_id_.hpp:110
@ e_Local
local use
Definition: Seq_id_.hpp:95
@ e_Equiv
equivalent sets of locations
Definition: Seq_loc_.hpp:106
const TSeq & GetSeq(void) const
Get the variant data.
Definition: Seq_entry_.cpp:102
const TDescr & GetDescr(void) const
Get the Descr member data.
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
const TSet & GetSet(void) const
Get the variant data.
Definition: Seq_entry_.cpp:124
bool IsSeq(void) const
Check if variant Seq is selected.
Definition: Seq_entry_.hpp:257
bool IsSetAnnot(void) const
Check if a value has been assigned to Annot data member.
bool IsSetDescr(void) const
Check if a value has been assigned to Descr data member.
bool IsSet(void) const
Check if variant Set is selected.
Definition: Seq_entry_.hpp:263
const TSeq_set & GetSeq_set(void) const
Get the Seq_set member data.
const TAnnot & GetAnnot(void) const
Get the Annot member data.
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
@ eClass_small_genome_set
viral segments or mitochondrial minicircles
bool IsMap(void) const
Check if variant Map is selected.
Definition: Seq_ext_.hpp:330
const TUser & GetUser(void) const
Get the variant data.
Definition: Seqdesc_.cpp:384
ERepr
representation class
Definition: Seq_inst_.hpp:91
const TGap & GetGap(void) const
Get the variant data.
Definition: Seq_data_.cpp:184
bool IsSetAnnot(void) const
Check if a value has been assigned to Annot data member.
Definition: Bioseq_.hpp:354
const Tdata & Get(void) const
Get the member data.
Definition: Map_ext_.hpp:164
const TTitle & GetTitle(void) const
Get the variant data.
Definition: Seqdesc_.hpp:1032
const TSource & GetSource(void) const
Get the variant data.
Definition: Seqdesc_.cpp:566
const TMap & GetMap(void) const
Get the variant data.
Definition: Seq_ext_.cpp:158
const TAnnot & GetAnnot(void) const
Get the Annot member data.
Definition: Bioseq_.hpp:366
TTech GetTech(void) const
Get the Tech member data.
Definition: MolInfo_.hpp:497
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
list< CRef< CSeq_feat > > Tdata
Definition: Map_ext_.hpp:89
bool IsDelta(void) const
Check if variant Delta is selected.
Definition: Seq_ext_.hpp:336
bool IsSetDescr(void) const
descriptors Check if a value has been assigned to Descr data member.
Definition: Bioseq_.hpp:303
TBiomol GetBiomol(void) const
Get the Biomol member data.
Definition: MolInfo_.hpp:447
const TDelta & GetDelta(void) const
Get the variant data.
Definition: Seq_ext_.cpp:180
TCompleteness GetCompleteness(void) const
Get the Completeness member data.
Definition: MolInfo_.hpp:594
const Tdata & Get(void) const
Get the member data.
Definition: Delta_ext_.hpp:164
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seqdesc_.hpp:903
list< CRef< CDelta_seq > > Tdata
Definition: Delta_ext_.hpp:89
bool IsGap(void) const
Check if variant Gap is selected.
Definition: Seq_data_.hpp:704
const TDescr & GetDescr(void) const
Get the Descr member data.
Definition: Bioseq_.hpp:315
bool IsSet(void) const
Check if a value has been assigned to data member.
Definition: Map_ext_.hpp:152
const TComment & GetComment(void) const
Get the variant data.
Definition: Seqdesc_.hpp:1058
const TMolinfo & GetMolinfo(void) const
Get the variant data.
Definition: Seqdesc_.cpp:588
@ eRepr_delta
sequence made by changes (delta) to others
Definition: Seq_inst_.hpp:100
@ eRepr_map
ordered map of any kind
Definition: Seq_inst_.hpp:99
@ eRepr_virtual
no seq data
Definition: Seq_inst_.hpp:93
@ eCompleteness_unknown
Definition: MolInfo_.hpp:155
@ e_Ncbieaa
extended ASCII 1 letter aa codes
Definition: Seq_data_.hpp:111
@ e_Embl
EMBL specific information.
Definition: Seqdesc_.hpp:127
@ e_User
user defined object
Definition: Seqdesc_.hpp:124
@ e_Pub
a reference to the publication
Definition: Seqdesc_.hpp:122
@ e_Genbank
GenBank specific info.
Definition: Seqdesc_.hpp:121
@ e_Comment
a more extensive comment
Definition: Seqdesc_.hpp:117
@ e_Molinfo
info on the molecule and techniques
Definition: Seqdesc_.hpp:134
@ e_Title
a title for this sequence
Definition: Seqdesc_.hpp:115
@ e_Pdb
PDB specific information.
Definition: Seqdesc_.hpp:131
@ e_Source
source of materials, includes Org-ref
Definition: Seqdesc_.hpp:133
const TEntrys & GetEntrys(void) const
Get the variant data.
const TData & GetData(void) const
Get the Data member data.
const TSub & GetSub(void) const
Get the Sub member data.
bool IsEntrys(void) const
Check if variant Entrys is selected.
bool CanGetSub(void) const
Check if it is safe to call GetSub method.
bool CanGetData(void) const
Check if it is safe to call GetData method.
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
const SUnicodeTranslation * UnicodeToAscii(TUnicode character, const TUnicodeTable *table=NULL, const SUnicodeTranslation *default_translation=NULL)
Convert Unicode character into ASCII string.
Definition: unicode.cpp:324
size_t UTF8ToUnicode(const char *utf, TUnicode *unicode)
Convert UTF8 into Unicode character.
Definition: unicode.cpp:382
unsigned int TUnicode
Definition: unicode.hpp:77
@ eSkip
Unicode to be skipped in translation. Usually it is a combining mark.
Definition: unicode.hpp:52
Definition of all error codes used in objmgr libraries (xobjmgr.lib, xobjutil.lib and others).
CStaticArraySet< const char *, PCase_CStr > TStopWords
Definition: indexer.cpp:3584
static const char *const mixedTags[]
Definition: indexer.cpp:3647
NCBI_DEFINE_ERR_SUBCODE_X(11)
static bool s_BlankOrNotSpecialTaxname(string taxname)
Definition: indexer.cpp:1282
CStaticPairArrayMap< utf8::TUnicode, char > TExtraTranslations
Definition: indexer.cpp:3378
static int SkipMixedContent(const char *ptr)
Definition: indexer.cpp:3695
static const TExtraTranslationPair kExtraTranslations[]
Definition: indexer.cpp:3379
static CSeq_id_Handle s_IdxFindBestIdChoice(const CBioseq_Handle::TId &ids)
Definition: indexer.cpp:468
static const char * x_OrganelleName(TBIOSOURCE_GENOME genome, bool has_plasmid, bool virus_or_phage, bool wgs_suffix)
Definition: indexer.cpp:1176
static string s_IdxGetBestIdString(CBioseq_Handle bsh)
Definition: indexer.cpp:497
DEFINE_STATIC_ARRAY_MAP(TExtraTranslations, sc_ExtraTranslations, kExtraTranslations)
SStaticPair< utf8::TUnicode, char > TExtraTranslationPair
Definition: indexer.cpp:3377
static int s_IdxSeqIdHandle(const CSeq_id_Handle &idh)
Definition: indexer.cpp:460
static const char *const idxStopWords[]
Definition: indexer.cpp:3447
void(* FAddSnpFunc)(CBioseq_Handle bsh, string &na_acc)
Definition: indexer.hpp:61
int i
yy_size_t n
int len
constexpr auto sort(_Init &&init)
const char * tag
T max(T x_, T y_)
static char tmp[2048]
Definition: utf8.c:42
#define fi
static pcre_uint8 * buffer
Definition: pcretest.c:1051
#define NCBI_TECH(Type)
Definition: seq_macros.hpp:118
#define NCBI_SEQTOPOLOGY(Type)
Definition: seq_macros.hpp:66
#define FOR_EACH_COMPOUND_ON_PDBBLOCK(Itr, Var)
FOR_EACH_COMPOUND_ON_PDBBLOCK EDIT_EACH_COMPOUND_ON_PDBBLOCK.
#define NCBI_GENOME(Type)
@NAME Convenience macros for NCBI objects
#define SWITCH_ON_SUBSOURCE_CHOICE(Var)
SWITCH_ON_SUBSOURCE_CHOICE.
#define NCBI_ORGMOD(Type)
COrgMod definitions.
#define FOR_EACH_ORGMOD_ON_BIOSOURCE(Itr, Var)
FOR_EACH_ORGMOD_ON_BIOSOURCE EDIT_EACH_ORGMOD_ON_BIOSOURCE.
#define SWITCH_ON_ORGMOD_CHOICE(Var)
SWITCH_ON_ORGMOD_CHOICE.
#define FOR_EACH_SUBSOURCE_ON_BIOSOURCE(Itr, Var)
FOR_EACH_SUBSOURCE_ON_BIOSOURCE EDIT_EACH_SUBSOURCE_ON_BIOSOURCE.
#define NCBI_SUBSOURCE(Type)
CSubSource definitions.
CBioSource::TGenome TBIOSOURCE_GENOME
CSeq_id::EAccessionInfo TACCN_CHOICE
#define NCBI_SEQID(Type)
@NAME Convenience macros for NCBI objects
#define NCBI_ACCN(Type)
#define FIELD_IS_SET_AND_IS(Var, Fld, Chs)
FIELD_IS_SET_AND_IS base macro.
#define FOR_EACH_STRING_IN_LIST(Itr, Var)
FOR_EACH_STRING_IN_LIST EDIT_EACH_STRING_IN_LIST.
#define GET_FIELD(Var, Fld)
GET_FIELD base macro.
static const char * str(char *buf, int n)
Definition: stats.c:84
This indicates the text of the modifiers of a gap.
Definition: sequence.hpp:865
SAnnotSelector –.
Selector used in CSeqMap methods returning iterators.
Definition: seq_map_ci.hpp:113
Template structure SStaticPair is a simplified replacement of STL pair<>. Main reason of introducing this...
Definition: static_set.hpp:60
Definition: type.c:6
#define _ASSERT
else result
Definition: token2.c:20
Modified on Fri Sep 29 07:32:01 2023 by modify_doxy.py rev. 669887