NCBI C++ ToolKit
indexer.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /*
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Jonathan Kans
27 *
28 */
29 
30 #include <ncbi_pch.hpp>
31 
32 #include <util/unicode.hpp>
33 #include <util/static_set.hpp>
34 #include <util/static_map.hpp>
35 
37 
38 #include <objmgr/feat_ci.hpp>
39 #include <objmgr/seqdesc_ci.hpp>
40 #include <objmgr/seq_map_ci.hpp>
41 #include <objmgr/error_codes.hpp>
42 
43 #include <objmgr/util/indexer.hpp>
44 #include <objmgr/util/sequence.hpp>
46 
47 #define NCBI_USE_ERRCODE_X ObjMgr_Indexer
48 
52 
53 
54 // CSeqEntryIndex
55 
56 // Constructors take top-level sequence object, create a CRef<CSeqMasterIndex>, and call its initializer
58 
59 {
60  m_Idx.Reset(new CSeqMasterIndex);
61  m_Idx->x_Initialize(topseh, policy, flags);
62 }
63 
65 
66 {
68  m_Idx->x_Initialize(bsh, policy, flags);
69 }
70 
72 
73 {
75  m_Idx->x_Initialize(topsep, policy, flags);
76 }
77 
79 
80 {
82  m_Idx->x_Initialize(seqset, policy, flags);
83 }
84 
86 
87 {
89  m_Idx->x_Initialize(bioseq, policy, flags);
90 }
91 
93 
94 {
96  m_Idx->x_Initialize(submit, policy, flags);
97 }
98 
100 
101 {
103  m_Idx->x_Initialize(topsep, sblock, policy, flags);
104 }
105 
107 
108 {
110  m_Idx->x_Initialize(topsep, descr, policy, flags);
111 }
112 
113 // Get first Bioseq index
115 
116 {
117  return m_Idx->GetBioseqIndex();
118 }
119 
120 // Get Nth Bioseq index
122 
123 {
124  return m_Idx->GetBioseqIndex(n);
125 }
126 
127 // Get Bioseq index by accession
129 
130 {
131  return m_Idx->GetBioseqIndex(accn);
132 }
133 
134 // Get Bioseq index by handle (via best Seq-id string)
136 
137 {
138  return m_Idx->GetBioseqIndex(bsh);
139 }
140 
141 // Get Bioseq index by feature
143 
144 {
145  return m_Idx->GetBioseqIndex(mf);
146 }
147 
148 // Get Bioseq index by sublocation
150 
151 {
152  return m_Idx->GetBioseqIndex(loc);
153 }
154 
155 const vector<CRef<CBioseqIndex>>& CSeqEntryIndex::GetBioseqIndices(void)
156 
157 {
158  return m_Idx->GetBioseqIndices();
159 }
160 
161 const vector<CRef<CSeqsetIndex>>& CSeqEntryIndex::GetSeqsetIndices(void)
162 
163 {
164  return m_Idx->GetSeqsetIndices();
165 }
166 
168 
169 {
170  return m_Idx->DistributedReferences();
171 }
172 
174 
175 {
176  m_Idx->SetSnpFunc (snp);
177 }
178 
180 
181 {
182  return m_Idx->GetSnpFunc();
183 }
184 
185 void CSeqEntryIndex::SetFeatDepth(int featDepth)
186 
187 {
188  m_Idx->SetFeatDepth (featDepth);
189 }
190 
192 
193 {
194  return m_Idx->GetFeatDepth();
195 }
196 
197 void CSeqEntryIndex::SetGapDepth(int featDepth)
198 
199 {
200  m_Idx->SetGapDepth (featDepth);
201 }
202 
204 
205 {
206  return m_Idx->GetGapDepth();
207 }
208 
210 
211 {
212  return m_Idx->IsFetchFailure();
213 }
214 
216 
217 {
218  return m_Idx->IsIndexFailure();
219 }
220 
221 
222 // CSeqMasterIndex
223 
224 // Initializers take top-level sequence object, create Seq-entry wrapper if necessary
226 {
227  m_Policy = policy;
228  m_Flags = flags;
229 
230  m_Tseh = topseh.GetTopLevelEntry();
232  CSeq_entry& topsep = const_cast<CSeq_entry&>(*tcsep);
233  topsep.Parentize();
234  m_Tsep.Reset(&topsep);
235 
236  m_FeatTree = new feature::CFeatTree;
237 
238  m_HasOperon = false;
239  m_IsSmallGenomeSet = false;
240  m_DistributedReferences = false;
241  m_SnpFunc = 0;
242  m_FeatDepth = 0;
243  m_GapDepth = 0;
244  m_IndexFailure = false;
245 
246  try {
247  // Code copied from x_Init, then modified to reuse existing scope from CSeq_entry_Handle
249  if ( !m_Objmgr ) {
250  // raise hell
251  m_IndexFailure = true;
252  }
253 
255  if ( !m_Scope ) {
256  // raise hell
257  m_IndexFailure = true;
258  }
259 
260  m_Counter.Set(0);
261 
262  // Populate vector of CBioseqIndex objects representing local Bioseqs in blob
263  CRef<CSeqsetIndex> noparent;
264  x_InitSeqs( *m_Tsep, noparent );
265  }
266  catch (CException& e) {
267  m_IndexFailure = true;
268  ERR_POST_X(1, Error << "Error in CSeqMasterIndex::x_Init: " << e.what());
269  }
270 }
271 
273 {
274  m_Policy = policy;
275  m_Flags = flags;
276 
277  m_Tseh = bsh.GetTopLevelEntry();
279  CSeq_entry& topsep = const_cast<CSeq_entry&>(*tcsep);
280  topsep.Parentize();
281  m_Tsep.Reset(&topsep);
282 
283  m_FeatTree = new feature::CFeatTree;
284 
285  m_HasOperon = false;
286  m_IsSmallGenomeSet = false;
287  m_DistributedReferences = false;
288  m_SnpFunc = 0;
289  m_FeatDepth = 0;
290  m_GapDepth = 0;
291  m_IndexFailure = false;
292 
293  try {
294  // Code copied from x_Init, then modified to reuse existing scope from CSeq_entry_Handle
296  if ( !m_Objmgr ) {
297  // raise hell
298  m_IndexFailure = true;
299  }
300 
302  if ( !m_Scope ) {
303  // raise hell
304  m_IndexFailure = true;
305  }
306 
307  m_Counter.Set(0);
308 
309  // Populate vector of CBioseqIndex objects representing local Bioseqs in blob
310  CRef<CSeqsetIndex> noparent;
311  x_InitSeqs( *m_Tsep, noparent );
312  }
313  catch (CException& e) {
314  m_IndexFailure = true;
315  ERR_POST_X(1, Error << "Error in CSeqMasterIndex::x_Init: " << e.what());
316  }
317 }
318 
320 {
321  m_Policy = policy;
322  m_Flags = flags;
323 
324  topsep.Parentize();
325  m_Tsep.Reset(&topsep);
326 
327  x_Init();
328 }
329 
331 {
332  m_Policy = policy;
333  m_Flags = flags;
334 
335  CSeq_entry* parent = seqset.GetParentEntry();
336  if (parent) {
337  parent->Parentize();
338  m_Tsep.Reset(parent);
339  } else {
340  CRef<CSeq_entry> sep(new CSeq_entry);
341  sep->SetSet(seqset);
342  sep->Parentize();
343  m_Tsep.Reset(sep);
344  }
345 
346  x_Init();
347 }
348 
350 {
351  m_Policy = policy;
352  m_Flags = flags;
353 
354  CSeq_entry* parent = bioseq.GetParentEntry();
355  if (parent) {
356  parent->Parentize();
357  m_Tsep.Reset(parent);
358  } else {
359  CRef<CSeq_entry> sep(new CSeq_entry);
360  sep->SetSeq(bioseq);
361  sep->Parentize();
362  m_Tsep.Reset(sep);
363  }
364 
365  x_Init();
366 }
367 
369 {
370  m_Policy = policy;
371  m_Flags = flags;
372 
373  _ASSERT(submit.CanGetData());
374  _ASSERT(submit.CanGetSub());
375  _ASSERT(submit.GetData().IsEntrys());
376  _ASSERT(!submit.GetData().GetEntrys().empty());
377 
378  CRef<CSeq_entry> sep = submit.GetData().GetEntrys().front();
379  sep->Parentize();
380  m_Tsep.Reset(sep);
381  m_SbtBlk.Reset(&submit.GetSub());
382 
383  x_Init();
384 }
385 
387 {
388  m_Policy = policy;
389  m_Flags = flags;
390 
391  topsep.Parentize();
392  m_Tsep.Reset(&topsep);
393  m_SbtBlk.Reset(&sblock);
394 
395  x_Init();
396 }
397 
399 {
400  m_Policy = policy;
401  m_Flags = flags;
402 
403  topsep.Parentize();
404  m_Tsep.Reset(&topsep);
405  m_TopDescr.Reset(&descr);
406 
407  x_Init();
408 }
409 
411 
412 {
413  m_SnpFunc = snp;
414 }
415 
417 
418 {
419  return m_SnpFunc;
420 }
421 
422 void CSeqMasterIndex::SetFeatDepth (int featDepth)
423 
424 {
425  m_FeatDepth = featDepth;
426 }
427 
429 
430 {
431  return m_FeatDepth;
432 }
433 
434 void CSeqMasterIndex::SetGapDepth (int gapDepth)
435 
436 {
437  m_GapDepth = gapDepth;
438 }
439 
441 
442 {
443  return m_GapDepth;
444 }
445 
446 
447 // At end of program, poll all Bioseqs to check for far fetch failure flag
449 
450 {
451  for (auto& bsx : m_BsxList) {
452  if (bsx->IsFetchFailure()) {
453  return true;
454  }
455  }
456  return false;
457 }
458 
459 // FindBestIdChoice modified from feature_item.cpp
460 static int s_IdxSeqIdHandle(const CSeq_id_Handle& idh)
461 {
462  CConstRef<CSeq_id> id = idh.GetSeqId();
463  CRef<CSeq_id> id_non_const
464  (const_cast<CSeq_id*>(id.GetPointer()));
465  return CSeq_id::Score(id_non_const);
466 }
467 
469 {
472 
473  ITERATE( CBioseq_Handle::TId, it, ids ) {
474  switch( (*it).Which() ) {
475  case CSeq_id::e_Local:
476  case CSeq_id::e_Genbank:
477  case CSeq_id::e_Embl:
478  case CSeq_id::e_Ddbj:
480  case CSeq_id::e_Pir:
481  case CSeq_id::e_Gi:
482  case CSeq_id::e_Other:
483  case CSeq_id::e_General:
484  case CSeq_id::e_Tpg:
485  case CSeq_id::e_Tpe:
486  case CSeq_id::e_Tpd:
487  case CSeq_id::e_Gpipe:
488  tracker(*it);
489  break;
490  default:
491  break;
492  }
493  }
494  return tracker.GetBestChoice();
495 }
496 
498 
499 {
500  if (bsh) {
501  const CBioseq_Handle::TId& ids = bsh.GetId();
502  if (! ids.empty()) {
504  if (best) {
505  return best.AsString();
506  }
507  }
508  }
509 
510  return "";
511 }
512 
513 // Recursively explores from top-level Seq-entry to make flattened vector of CBioseqIndex objects
515 
516 {
517  if (sep.IsSeq()) {
518  // Is Bioseq
519  const CBioseq& bsp = sep.GetSeq();
521  if (bsh) {
522  // create CBioseqIndex object for current Bioseq
523  CRef<CBioseqIndex> bsx(new CBioseqIndex(bsh, bsp, bsh, prnt, m_Tseh, m_Scope, *this, m_Policy, m_Flags));
524 
525  // record CBioseqIndex in vector for IterateBioseqs or GetBioseqIndex
526  m_BsxList.push_back(bsx);
527 
528  // map from accession string to CBioseqIndex object
529  const string& accn = bsx->GetAccession();
530  m_AccnIndexMap[accn] = bsx;
531 
532  const CBioseq_Handle::TId& ids = bsh.GetId();
533  if (! ids.empty()) {
534  ITERATE( CBioseq_Handle::TId, it, ids ) {
535  TSEQID_CHOICE chs = (*it).Which();
536  switch( chs ) {
537  case CSeq_id::e_Local:
538  case CSeq_id::e_Genbank:
539  case CSeq_id::e_Embl:
540  case CSeq_id::e_Ddbj:
542  case CSeq_id::e_Pir:
543  case CSeq_id::e_Gi:
544  case CSeq_id::e_Other:
545  case CSeq_id::e_General:
546  case CSeq_id::e_Tpg:
547  case CSeq_id::e_Tpe:
548  case CSeq_id::e_Tpd:
549  case CSeq_id::e_Gpipe:
550  {
551  // map from handle to Seq-id string to CBioseqIndex object
552  string str = (*it).AsString();
553  m_BestIdIndexMap[str] = bsx;
554  break;
555  }
556  default:
557  break;
558  }
559  }
560  }
561 
562  if (bsp.IsSetDescr()) {
563  for (auto& desc : bsp.GetDescr().Get()) {
564  if (desc->Which() == CSeqdesc::e_Pub) {
566  }
567  }
568  }
569 
570  if (bsp.IsSetAnnot()) {
571  for (auto& annt : bsp.GetAnnot()) {
572  if (annt->IsFtable()) {
573  for (auto& feat : annt->GetData().GetFtable()) {
574  if (feat->IsSetData() && feat->GetData().Which() == CSeqFeatData::e_Pub) {
576  } else if (feat->IsSetCit()) {
578  }
579  }
580  }
581  }
582  }
583  }
584  } else if (sep.IsSet()) {
585  // Is Bioseq-set
586  const CBioseq_set& bssp = sep.GetSet();
588  if (ssh) {
589  // create CSeqsetIndex object for current Bioseq-set
590  CRef<CSeqsetIndex> ssx(new CSeqsetIndex(ssh, bssp, prnt));
591 
593  m_IsSmallGenomeSet = true;
594  }
595 
596  if (level > 0 && bssp.IsSetDescr()) {
597  for (auto& desc : bssp.GetDescr().Get()) {
598  if (desc->Which() == CSeqdesc::e_Pub) {
600  }
601  }
602  }
603 
604  // record CSeqsetIndex in vector
605  m_SsxList.push_back(ssx);
606 
607  for (auto& seqentry : bssp.GetSeq_set()) {
608  // recursively explore current Bioseq-set
609  x_InitSeqs(*seqentry, ssx, level + 1);
610  }
611 
612  if (bssp.IsSetAnnot()) {
613  for (auto& annt : bssp.GetAnnot()) {
614  if (annt->IsFtable()) {
615  for (auto& feat : annt->GetData().GetFtable()) {
616  if (feat->IsSetData() && feat->GetData().Which() == CSeqFeatData::e_Pub) {
618  } else if (feat->IsSetCit()) {
620  }
621  }
622  }
623  }
624  }
625  }
626  }
627 }
628 
629 // Common initialization function creates local default CScope
631 
632 {
633  m_FeatTree = new feature::CFeatTree;
634 
635  m_HasOperon = false;
636  m_IsSmallGenomeSet = false;
637  m_DistributedReferences = false;
638  m_SnpFunc = 0;
639  m_FeatDepth = 0;
640  m_GapDepth = 0;
641  m_IndexFailure = false;
642 
643  try {
645  if ( !m_Objmgr ) {
646  // raise hell
647  m_IndexFailure = true;
648  }
649 
650  m_Scope.Reset( new CScope( *m_Objmgr ) );
651  if ( !m_Scope ) {
652  // raise hell
653  m_IndexFailure = true;
654  }
655 
656  m_Counter.Set(0);
657 
658  m_Scope->AddDefaults();
659 
661 
662  // Populate vector of CBioseqIndex objects representing local Bioseqs in blob
663  CRef<CSeqsetIndex> noparent;
664  x_InitSeqs( *m_Tsep, noparent );
665  }
666  catch (CException& e) {
667  m_IndexFailure = true;
668  ERR_POST_X(1, Error << "Error in CSeqMasterIndex::x_Init: " << e.what());
669  }
670 }
671 
672 // Get first Bioseq index
674 
675 {
676  for (auto& bsx : m_BsxList) {
677  return bsx;
678  }
679  return CRef<CBioseqIndex> ();
680 }
681 
682 // Get Nth Bioseq index
684 
685 {
686  for (auto& bsx : m_BsxList) {
687  n--;
688  if (n > 0) continue;
689  return bsx;
690  }
691  return CRef<CBioseqIndex> ();
692 }
693 
694 // Get Bioseq index by accession
696 
697 {
699  if (it != m_AccnIndexMap.end()) {
700  CRef<CBioseqIndex> bsx = it->second;
701  return bsx;
702  }
703  return CRef<CBioseqIndex> ();
704 }
705 
706 // Get Bioseq index by handle (via best Seq-id string)
708 
709 {
710  string bestid = s_IdxGetBestIdString(bsh);
712  if (it != m_BestIdIndexMap.end()) {
713  CRef<CBioseqIndex> bsx = it->second;
714  return bsx;
715  }
716  return CRef<CBioseqIndex> ();
717 }
718 
719 // Get Bioseq index by string
721 
722 {
724  if (it != m_BestIdIndexMap.end()) {
725  CRef<CBioseqIndex> bsx = it->second;
726  return bsx;
727  }
728  return CRef<CBioseqIndex> ();
729 }
730 
731 // Get Bioseq index by feature
733 
734 {
735  CSeq_id_Handle idh = mf.GetLocationId();
737  return GetBioseqIndex(bsh);
738 }
739 
740 // Get Bioseq index by sublocation
742 
743 {
745  return GetBioseqIndex(bsh);
746 }
747 
748 // Allow access to internal vectors for application to use in iterators
749 const vector<CRef<CBioseqIndex>>& CSeqMasterIndex::GetBioseqIndices(void)
750 
751 {
752  return m_BsxList;
753 }
754 
755 const vector<CRef<CSeqsetIndex>>& CSeqMasterIndex::GetSeqsetIndices(void)
756 
757 {
758  return m_SsxList;
759 }
760 
761 
762 // CSeqsetIndex
763 
764 // Constructor
766  const CBioseq_set& bssp,
767  CRef<CSeqsetIndex> prnt)
768  : m_Ssh(ssh),
769  m_Bssp(bssp),
770  m_Prnt(prnt)
771 {
773 
774  if (ssh.IsSetClass()) {
775  m_Class = ssh.GetClass();
776  }
777 }
778 
779 
780 // CBioseqIndex
781 
782 // Constructor
784  const CBioseq& bsp,
785  CBioseq_Handle obsh,
786  CRef<CSeqsetIndex> prnt,
787  CSeq_entry_Handle tseh,
788  CRef<CScope> scope,
789  CSeqMasterIndex& idx,
792  : m_Bsh(bsh),
793  m_Bsp(bsp),
794  m_OrigBsh(obsh),
795  m_Prnt(prnt),
796  m_Tseh(tseh),
797  m_Scope(scope),
798  m_Idx(&idx),
799  m_Policy(policy),
800  m_Flags(flags)
801 {
802  m_FetchFailure = false;
803 
804  m_GapsInitialized = false;
805  m_DescsInitialized = false;
806  m_FeatsInitialized = false;
807  m_SourcesInitialized = false;
808  m_FeatForProdInitialized = false;
810 
811  m_ForceOnlyNearFeats = false;
812 
813  // reset member variables to cleared state
814  m_IsNA = false;
815  m_IsAA = false;
816  m_Topology = NCBI_SEQTOPOLOGY(not_set);
817 
818  m_IsDelta = false;
819  m_IsDeltaLitOnly = false;
820  m_IsVirtual = false;
821  m_IsMap = false;
822 
823  m_Title.clear();
824 
825  m_MolInfo.Reset();
829 
830  m_Accession.clear();
831 
832  m_IsRefSeq = false;
833  m_IsNC = false;
834  m_IsNM = false;
835  m_IsNR = false;
836  m_IsNZ = false;
837  m_IsPatent = false;
838  m_IsPDB = false;
839  m_IsWP = false;
840  m_ThirdParty = false;
841  m_WGSMaster = false;
842  m_TSAMaster = false;
843  m_TLSMaster = false;
844 
845  m_GeneralStr.clear();
846  m_GeneralId = 0;
847  m_PatentCountry.clear();
848  m_PatentNumber.clear();
849 
850  m_PatentSequence = 0;
851 
852  m_PDBChain = 0;
853  m_PDBChainID.clear();
854 
855  m_HTGTech = false;
856  m_HTGSUnfinished = false;
857  m_IsTLS = false;
858  m_IsTSA = false;
859  m_IsWGS = false;
860  m_IsEST_STS_GSS = false;
861 
862  m_UseBiosrc = false;
863 
864  m_HTGSCancelled = false;
865  m_HTGSDraft = false;
866  m_HTGSPooled = false;
867  m_TPAExp = false;
868  m_TPAInf = false;
869  m_TPAReasm = false;
870  m_Unordered = false;
871 
873 
875  m_DescTaxname.clear();
876 
877  m_BioSource.Reset();
878  m_Taxname.clear();
879  m_Common.clear();
880  m_Lineage.clear();
882  m_UsingAnamorph = false;
883  m_Genus.clear();
884  m_Species.clear();
885  m_Multispecies = false;
886  m_Genome = NCBI_GENOME(unknown);
887  m_IsPlasmid = false;
888  m_IsChromosome = false;
889 
890  m_Organelle.clear();
891 
892  m_FirstSuperKingdom.clear();
893  m_SecondSuperKingdom.clear();
894  m_IsCrossKingdom = false;
895 
898  m_Clone.clear();
899  m_has_clone = false;
900  m_Map.clear();
901  m_Plasmid.clear();
902  m_Segment.clear();
903 
904  m_Breed.clear();
905  m_Cultivar.clear();
907  m_Isolate.clear();
908  m_Strain.clear();
909  m_Substrain.clear();
911 
912  m_IsUnverified = false;
913  m_IsUnverifiedFeature = false;
914  m_IsUnverifiedOrganism = false;
917 
918  m_IsUnreviewed = false;
920 
922 
923  m_Comment.clear();
924  m_IsPseudogene = false;
925 
926  m_HasGene = false;
927  m_HasMultiIntervalGenes = false;
928  m_HasSource = false;
929 
930  m_rEnzyme.clear();
931 
932  // now start setting member variables from Bioseq
933  m_IsNA = m_Bsh.IsNa();
934  m_IsAA = m_Bsh.IsAa();
936  m_Length = 0;
937 
938  if (m_Bsh.IsSetInst()) {
939  if (m_Bsh.IsSetInst_Topology()) {
941  }
942 
943  if (m_Bsh.IsSetInst_Length()) {
945  } else {
947  }
948 
949  if (m_Bsh.IsSetInst_Repr()) {
951  m_IsDelta = (repr == CSeq_inst::eRepr_delta);
953  m_IsMap = (repr == CSeq_inst::eRepr_map);
954  }
955  if (m_IsDelta && m_Bsh.IsSetInst_Ext()) {
957  bool hasLoc = false;
958  if ( ext.IsDelta() ) {
959  ITERATE (CDelta_ext::Tdata, it, ext.GetDelta().Get()) {
960  if ( (*it)->IsLoc() ) {
961  const CSeq_loc& loc = (*it)->GetLoc();
962  if (loc.IsNull()) continue;
963  hasLoc = true;
964  }
965  }
966  }
967  if (! hasLoc) {
968  m_IsDeltaLitOnly = true;
969  }
970  }
971  }
972 
973  // process Seq-ids
974  for (CSeq_id_Handle sid : obsh.GetId()) {
975  // first switch to set RefSeq and ThirdParty flags
976  switch (sid.Which()) {
977  case NCBI_SEQID(Other):
978  m_IsRefSeq = true;
979  break;
980  case NCBI_SEQID(Tpg):
981  case NCBI_SEQID(Tpe):
982  case NCBI_SEQID(Tpd):
983  m_ThirdParty = true;
984  break;
985  default:
986  break;
987  }
988  // second switch now avoids complicated flag setting logic
989  switch (sid.Which()) {
990  case NCBI_SEQID(Tpg):
991  case NCBI_SEQID(Tpe):
992  case NCBI_SEQID(Tpd):
993  case NCBI_SEQID(Other):
994  case NCBI_SEQID(Genbank):
995  case NCBI_SEQID(Embl):
996  case NCBI_SEQID(Ddbj):
997  {
998  CConstRef<CSeq_id> id = sid.GetSeqId();
999  const CTextseq_id& tsid = *id->GetTextseq_Id ();
1000  if (tsid.IsSetAccession()) {
1001  m_Accession = tsid.GetAccession ();
1003  TACCN_CHOICE div = (TACCN_CHOICE) (type & NCBI_ACCN(division_mask));
1004  if ( div == NCBI_ACCN(wgs) )
1005  {
1006  if( (type & CSeq_id::fAcc_master) != 0 ) {
1007  m_WGSMaster = true;
1008  }
1009  } else if ( div == NCBI_ACCN(tsa) )
1010  {
1011  if( (type & CSeq_id::fAcc_master) != 0 && m_IsVirtual ) {
1012  m_TSAMaster = true;
1013  }
1014  } else if (type == NCBI_ACCN(refseq_chromosome)) {
1015  m_IsNC = true;
1016  } else if (type == NCBI_ACCN(refseq_mrna)) {
1017  m_IsNM = true;
1018  } else if (type == NCBI_ACCN(refseq_mrna_predicted)) {
1019  m_IsNM = true;
1020  } else if (type == NCBI_ACCN(refseq_ncrna)) {
1021  m_IsNR = true;
1022  } else if (type == NCBI_ACCN(refseq_wgs_nuc)) {
1023  m_IsNZ = true;
1024  } else if (type == NCBI_ACCN(refseq_unique_prot)) {
1025  m_IsWP = true;
1026  }
1027  }
1028  break;
1029  }
1030  case NCBI_SEQID(General):
1031  {
1032  CConstRef<CSeq_id> id = sid.GetSeqId();
1033  const CDbtag& gen_id = id->GetGeneral ();
1034  if (! gen_id.IsSkippable ()) {
1035  if (gen_id.IsSetTag ()) {
1036  const CObject_id& oid = gen_id.GetTag();
1037  if (oid.IsStr()) {
1038  m_GeneralStr = oid.GetStr();
1039  } else if (oid.IsId()) {
1040  m_GeneralId = oid.GetId();
1041  }
1042  }
1043  }
1044  break;
1045  }
1046  case NCBI_SEQID(Pdb):
1047  {
1048  m_IsPDB = true;
1049  CConstRef<CSeq_id> id = sid.GetSeqId();
1050  const CPDB_seq_id& pdb_id = id->GetPdb ();
1051  if (pdb_id.IsSetChain_id()) {
1052  m_PDBChainID = pdb_id.GetChain_id();
1053  } else if (pdb_id.IsSetChain()) {
1054  m_PDBChain = pdb_id.GetChain();
1055  }
1056  break;
1057  }
1058  case NCBI_SEQID(Patent):
1059  {
1060  m_IsPatent = true;
1061  CConstRef<CSeq_id> id = sid.GetSeqId();
1062  const CPatent_seq_id& pat_id = id->GetPatent();
1063  if (pat_id.IsSetSeqid()) {
1064  m_PatentSequence = pat_id.GetSeqid();
1065  }
1066  if (pat_id.IsSetCit()) {
1067  const CId_pat& cit = pat_id.GetCit();
1068  m_PatentCountry = cit.GetCountry();
1069  m_PatentNumber = cit.GetSomeNumber();
1070  }
1071  break;
1072  }
1073  case NCBI_SEQID(Gpipe):
1074  break;
1075  default:
1076  break;
1077  }
1078  }
1079 
1080  // process restriction map
1081  if (m_IsMap) {
1082  if (bsh.IsSetInst_Ext() && bsh.GetInst_Ext().IsMap()) {
1083  const CMap_ext& mp = bsh.GetInst_Ext().GetMap();
1084  if (mp.IsSet()) {
1085  const CMap_ext::Tdata& ft = mp.Get();
1086  ITERATE (CMap_ext::Tdata, itr, ft) {
1087  const CSeq_feat& feat = **itr;
1088  const CSeqFeatData& data = feat.GetData();
1089  if (! data.IsRsite()) continue;
1090  const CRsite_ref& rsite = data.GetRsite();
1091  if (rsite.IsStr()) {
1092  m_rEnzyme = rsite.GetStr();
1093  }
1094  }
1095  }
1096  }
1097  }
1098 }
1099 
1100 // Destructor
1102 
1103 {
1104 }
1105 
1106 // Gap collection (delayed until needed)
1108 
1109 {
1110  try {
1111  if (m_GapsInitialized) {
1112  return;
1113  }
1114 
1115  m_GapsInitialized = true;
1116 
1117  if (! m_IsDelta) {
1118  return;
1119  }
1120 
1121  SSeqMapSelector sel;
1122 
1123  size_t resolveCount = 0;
1124 
1126  auto idxl = idx.Lock();
1127  if (idxl) {
1128  resolveCount = idxl->GetGapDepth();
1129  }
1130 
1132  .SetResolveCount(resolveCount);
1133 
1134  // explore gaps, pass original target BioseqHandle if using Bioseq sublocation
1135  for (CSeqMap_CI gap_it(m_OrigBsh, sel); gap_it; ++gap_it) {
1136 
1137  TSeqPos start = gap_it.GetPosition();
1138  TSeqPos end = gap_it.GetEndPosition();
1139  TSeqPos length = gap_it.GetLength();
1140 
1141  // attempt to find CSeq_gap info
1142  const CSeq_gap * pGap = NULL;
1143  if( gap_it.IsSetData() && gap_it.GetData().IsGap() ) {
1144  pGap = &gap_it.GetData().GetGap();
1145  } else {
1146  CConstRef<CSeq_literal> pSeqLiteral = gap_it.GetRefGapLiteral();
1147  if( pSeqLiteral && pSeqLiteral->IsSetSeq_data() ) {
1148  const CSeq_data & seq_data = pSeqLiteral->GetSeq_data();
1149  if( seq_data.IsGap() ) {
1150  pGap = &seq_data.GetGap();
1151  }
1152  }
1153  }
1154 
1155  CFastaOstream::SGapModText gap_mod_text;
1156  if( pGap ) {
1157  CFastaOstream::GetGapModText(*pGap, gap_mod_text);
1158  }
1159  string type = gap_mod_text.gap_type;
1160  vector<string>& evidence = gap_mod_text.gap_linkage_evidences;
1161 
1162  bool isUnknownLength = gap_it.IsUnknownLength();
1163 
1164  // feature name depends on what quals we use
1165  bool isAssemblyGap = ( ! type.empty() || ! evidence.empty() );
1166 
1167  CRef<CGapIndex> sgx(new CGapIndex(start, end, length, type, evidence, isUnknownLength, isAssemblyGap, *this));
1168  m_GapList.push_back(sgx);
1169  }
1170  }
1171  catch (CException& e) {
1172  ERR_POST_X(3, Error << "Error in CBioseqIndex::x_InitGaps: " << e.what());
1173  }
1174 }
1175 
1176 static const char* x_OrganelleName (
1177  TBIOSOURCE_GENOME genome,
1178  bool has_plasmid,
1179  bool virus_or_phage,
1180  bool wgs_suffix
1181 )
1182 
1183 {
1184  const char* result = kEmptyCStr;
1185 
1186  switch (genome) {
1187  case NCBI_GENOME(chloroplast):
1188  result = "chloroplast";
1189  break;
1190  case NCBI_GENOME(chromoplast):
1191  result = "chromoplast";
1192  break;
1193  case NCBI_GENOME(kinetoplast):
1194  result = "kinetoplast";
1195  break;
1196  case NCBI_GENOME(mitochondrion):
1197  {
1198  if (has_plasmid || wgs_suffix) {
1199  result = "mitochondrial";
1200  } else {
1201  result = "mitochondrion";
1202  }
1203  break;
1204  }
1205  case NCBI_GENOME(plastid):
1206  result = "plastid";
1207  break;
1208  case NCBI_GENOME(macronuclear):
1209  {
1210  result = "macronuclear";
1211  break;
1212  }
1213  case NCBI_GENOME(extrachrom):
1214  {
1215  if (! wgs_suffix) {
1216  result = "extrachromosomal";
1217  }
1218  break;
1219  }
1220  case NCBI_GENOME(plasmid):
1221  {
1222  if (! wgs_suffix) {
1223  result = "plasmid";
1224  }
1225  break;
1226  }
1227  // transposon and insertion-seq are obsolete
1228  case NCBI_GENOME(cyanelle):
1229  result = "cyanelle";
1230  break;
1231  case NCBI_GENOME(proviral):
1232  {
1233  if (! virus_or_phage) {
1234  if (has_plasmid || wgs_suffix) {
1235  result = "proviral";
1236  } else {
1237  result = "provirus";
1238  }
1239  }
1240  break;
1241  }
1242  case NCBI_GENOME(virion):
1243  {
1244  if (! virus_or_phage) {
1245  result = "virus";
1246  }
1247  break;
1248  }
1249  case NCBI_GENOME(nucleomorph):
1250  {
1251  if (! wgs_suffix) {
1252  result = "nucleomorph";
1253  }
1254  break;
1255  }
1256  case NCBI_GENOME(apicoplast):
1257  result = "apicoplast";
1258  break;
1259  case NCBI_GENOME(leucoplast):
1260  result = "leucoplast";
1261  break;
1262  case NCBI_GENOME(proplastid):
1263  result = "proplastid";
1264  break;
1265  case NCBI_GENOME(endogenous_virus):
1266  result = "endogenous virus";
1267  break;
1268  case NCBI_GENOME(hydrogenosome):
1269  result = "hydrogenosome";
1270  break;
1271  case NCBI_GENOME(chromosome):
1272  result = "chromosome";
1273  break;
1274  case NCBI_GENOME(chromatophore):
1275  result = "chromatophore";
1276  break;
1277  }
1278 
1279  return result;
1280 }
1281 
1282 static bool s_BlankOrNotSpecialTaxname (string taxname)
1283 
1284 {
1285  if (taxname.empty()) {
1286  return true;
1287  }
1288 
1289  if (NStr::EqualNocase (taxname, "synthetic construct")) {
1290  return false;
1291  }
1292  if (NStr::EqualNocase (taxname, "artificial sequence")) {
1293  return false;
1294  }
1295  if (NStr::EqualNocase (taxname, "vector")) {
1296  return false;
1297  }
1298  if (NStr::EqualNocase (taxname, "Vector")) {
1299  return false;
1300  }
1301 
1302  return true;
1303 }
1304 
1306 
1307 {
1308  try {
1309  if (m_SourcesInitialized) {
1310  return;
1311  }
1312 
1313  m_SourcesInitialized = true;
1314 
1315  if (! m_DescsInitialized) {
1316  x_InitDescs();
1317  }
1318 
1323  if (sfxp) {
1325  if (bsrx) {
1326  CMappedFeat src_feat = bsrx->GetMappedFeat();
1327  if (src_feat) {
1328  const CBioSource& bsrc = src_feat.GetData().GetBiosrc();
1329  m_BioSource.Reset (&bsrc);
1330  }
1331  }
1332  }
1333  }
1334  }
1335 
1336  if (m_DescBioSource && ! m_BioSource) {
1338  }
1339 
1340  if (m_BioSource.NotEmpty()) {
1341  const string *common = 0;
1342 
1343  // get organism name
1344  if (m_BioSource->IsSetTaxname()) {
1346  }
1347  if (m_BioSource->IsSetCommon()) {
1348  common = &m_BioSource->GetCommon();
1349  }
1350  if (m_BioSource->IsSetOrgname()) {
1351  const COrgName& onp = m_BioSource->GetOrgname();
1352  if (onp.CanGetLineage()) {
1353  m_Lineage = onp.GetLineage();
1354  }
1355  }
1356  if (m_BioSource->CanGetOrg()) {
1357  const COrg_ref& org = m_BioSource->GetOrg();
1358  m_Taxid = org.GetTaxId();
1359  }
1360  if (m_BioSource->IsSetGenome()) {
1362  m_IsPlasmid = (m_Genome == NCBI_GENOME(plasmid));
1363  m_IsChromosome = (m_Genome == NCBI_GENOME(chromosome));
1364  }
1365 
1366  // process SubSource
1368  const CSubSource& sbs = **sbs_itr;
1369  if (! sbs.IsSetName()) continue;
1370  const string& str = sbs.GetName();
1372  case NCBI_SUBSOURCE(chromosome):
1373  m_Chromosome = str;
1374  break;
1375  case NCBI_SUBSOURCE(clone):
1376  m_Clone = str;
1377  m_has_clone = true;
1378  break;
1379  case NCBI_SUBSOURCE(map):
1380  m_Map = str;
1381  break;
1382  case NCBI_SUBSOURCE(plasmid_name):
1383  m_Plasmid = str;
1384  break;
1385  case NCBI_SUBSOURCE(segment):
1386  m_Segment = str;
1387  break;
1388  case NCBI_SUBSOURCE(linkage_group):
1389  m_LinkageGroup = str;
1390  break;
1391  default:
1392  break;
1393  }
1394  }
1395 
1396  if (m_BioSource->IsSetOrgname()) {
1397  const COrgName& onp = m_BioSource->GetOrgname();
1398  if (onp.IsSetName()) {
1399  const COrgName::TName& nam = onp.GetName();
1400  if (nam.IsBinomial()) {
1401  const CBinomialOrgName& bon = nam.GetBinomial();
1402  if (bon.IsSetGenus()) {
1403  m_Genus = bon.GetGenus();
1404  }
1405  if (bon.IsSetSpecies()) {
1406  m_Species = bon.GetSpecies();
1407  }
1408  } else if (nam.IsPartial()) {
1409  const CPartialOrgName& pon = nam.GetPartial();
1410  if (pon.IsSet()) {
1411  const CPartialOrgName::Tdata& tx = pon.Get();
1412  ITERATE (CPartialOrgName::Tdata, itr, tx) {
1413  const CTaxElement& te = **itr;
1414  if (te.IsSetFixed_level()) {
1415  int fl = te.GetFixed_level();
1416  if (fl > 0) {
1417  m_Multispecies = true;
1418  } else if (te.IsSetLevel()) {
1419  const string& lvl = te.GetLevel();
1420  if (! NStr::EqualNocase (lvl, "species")) {
1421  m_Multispecies = true;
1422  }
1423  }
1424  }
1425  }
1426  }
1427  }
1428  }
1429  }
1430 
1431  // process OrgMod
1432  const string *com = 0, *acr = 0, *syn = 0, *ana = 0,
1433  *gbacr = 0, *gbana = 0, *gbsyn = 0, *met = 0;
1434  int numcom = 0, numacr = 0, numsyn = 0, numana = 0,
1435  numgbacr = 0, numgbana = 0, numgbsyn = 0, nummet = 0;
1436 
1438  const COrgMod& omd = **omd_itr;
1439  if (! omd.IsSetSubname()) continue;
1440  const string& str = omd.GetSubname();
1441  SWITCH_ON_ORGMOD_CHOICE (omd) {
1442  case NCBI_ORGMOD(strain):
1443  if (m_Strain.empty()) {
1444  m_Strain = str;
1445  }
1446  break;
1447  case NCBI_ORGMOD(substrain):
1448  if (m_Substrain.empty()) {
1449  m_Substrain = str;
1450  }
1451  break;
1452  case NCBI_ORGMOD(cultivar):
1453  if (m_Cultivar.empty()) {
1454  m_Cultivar = str;
1455  }
1456  break;
1457  case NCBI_ORGMOD(specimen_voucher):
1458  if (m_SpecimenVoucher.empty()) {
1460  }
1461  break;
1462  case NCBI_ORGMOD(isolate):
1463  if (m_Isolate.empty()) {
1464  m_Isolate = str;
1465  }
1466  break;
1467  case NCBI_ORGMOD(breed):
1468  if (m_Breed.empty()) {
1469  m_Breed = str;
1470  }
1471  case NCBI_ORGMOD(common):
1472  com = &str;
1473  numcom++;
1474  break;
1475  case NCBI_ORGMOD(acronym):
1476  acr = &str;
1477  numacr++;
1478  break;
1479  case NCBI_ORGMOD(synonym):
1480  syn = &str;
1481  numsyn++;
1482  break;
1483  case NCBI_ORGMOD(anamorph):
1484  ana = &str;
1485  numana++;
1486  break;
1487  case NCBI_ORGMOD(gb_acronym):
1488  gbacr = &str;
1489  numgbacr++;
1490  break;
1491  case NCBI_ORGMOD(gb_synonym):
1492  gbsyn = &str;
1493  numgbsyn++;
1494  break;
1495  case NCBI_ORGMOD(gb_anamorph):
1496  gbana = &str;
1497  numgbana++;
1498  break;
1499  case NCBI_ORGMOD(metagenome_source):
1500  if (m_MetaGenomeSource.empty()) {
1502  }
1503  met = &str;
1504  nummet++;
1505  break;
1506  default:
1507  break;
1508  }
1509  }
1510 
1511  if (numacr > 1) {
1512  acr = NULL;
1513  }
1514  if (numana > 1) {
1515  ana = NULL;
1516  }
1517  if (numcom > 1) {
1518  com = NULL;
1519  }
1520  if (numsyn > 1) {
1521  syn = NULL;
1522  }
1523  if (numgbacr > 1) {
1524  gbacr = NULL;
1525  }
1526  if (numgbana > 1) {
1527  gbana = NULL;
1528  }
1529  if (numgbsyn > 1) {
1530  gbsyn = NULL;
1531  }
1532  if( nummet > 1 ) {
1533  met = NULL;
1534  }
1535 
1536  if( met != 0 ) {
1537  m_Common = *met;
1538  } else if ( syn != 0 ) {
1539  m_Common = *syn;
1540  } else if ( acr != 0 ) {
1541  m_Common = *acr;
1542  } else if ( ana != 0 ) {
1543  m_Common = *ana;
1544  m_UsingAnamorph = true;
1545  } else if ( com != 0 ) {
1546  m_Common = *com;
1547  } else if ( gbsyn != 0 ) {
1548  m_Common = *gbsyn;
1549  } else if ( gbacr != 0 ) {
1550  m_Common = *gbacr;
1551  } else if ( gbana != 0 ) {
1552  m_Common = *gbana;
1553  m_UsingAnamorph = true;
1554  } else if ( common != 0 ) {
1555  m_Common = *common;
1556  }
1557  }
1558 
1559  bool virus_or_phage = false;
1560  bool has_plasmid = false;
1561  bool wgs_suffix = false;
1562 
1563  if (NStr::FindNoCase(m_Taxname, "virus") != NPOS ||
1564  NStr::FindNoCase(m_Taxname, "phage") != NPOS) {
1565  virus_or_phage = true;
1566  }
1567 
1568  if (! m_Plasmid.empty()) {
1569  has_plasmid = true;
1570  /*
1571  if (NStr::FindNoCase(m_Plasmid, "plasmid") == NPOS &&
1572  NStr::FindNoCase(m_Plasmid, "element") == NPOS) {
1573  pls_pfx = " plasmid ";
1574  }
1575  */
1576  }
1577 
1578  if (m_IsWGS) {
1579  wgs_suffix = true;
1580  }
1581 
1582  m_Organelle = x_OrganelleName (m_Genome, has_plasmid, virus_or_phage, wgs_suffix);
1583  }
1584  catch (CException& e) {
1585  ERR_POST_X(4, Error << "Error in CBioseqIndex::x_InitSource: " << e.what());
1586  }
1587 }
1588 
1589 // Descriptor collection (delayed until needed)
1591 
1592 {
1593  try {
1594  if (m_DescsInitialized) {
1595  return;
1596  }
1597 
1598  m_DescsInitialized = true;
1599 
1600  const list <string> *keywords = NULL;
1601 
1602  int num_super_kingdom = 0;
1603  bool super_kingdoms_different = false;
1604 
1605  // explore descriptors, pass original target BioseqHandle if using Bioseq sublocation
1606  for (CSeqdesc_CI desc_it(m_OrigBsh); desc_it; ++desc_it) {
1607  const CSeqdesc& sd = *desc_it;
1608  CRef<CDescriptorIndex> sdx(new CDescriptorIndex(sd, *this));
1609  m_SdxList.push_back(sdx);
1610 
1611  switch (sd.Which()) {
1612  case CSeqdesc::e_Source:
1613  {
1614  if (! m_DescBioSource) {
1615  const CBioSource& biosrc = sd.GetSource();
1616  m_DescBioSource.Reset (&biosrc);
1617  if (m_IsNA && ! m_BioSource) {
1619  }
1620  }
1621  if (m_IsWP) {
1622  const CBioSource &bsrc = sd.GetSource();
1623  if (! bsrc.IsSetOrgname()) break;
1624  const COrgName& onp = bsrc.GetOrgname();
1625  if (! onp.IsSetName()) break;
1626  const COrgName::TName& nam = onp.GetName();
1627  if (! nam.IsPartial()) break;
1628  const CPartialOrgName& pon = nam.GetPartial();
1629  if (! pon.IsSet()) break;
1630  const CPartialOrgName::Tdata& tx = pon.Get();
1631  ITERATE (CPartialOrgName::Tdata, itr, tx) {
1632  const CTaxElement& te = **itr;
1633  if (! te.IsSetFixed_level()) continue;
1634  if (te.GetFixed_level() != 0) continue;
1635  if (! te.IsSetLevel()) continue;
1636  const string& lvl = te.GetLevel();
1637  if (! NStr::EqualNocase (lvl, "superkingdom")) continue;
1638  num_super_kingdom++;
1639  if (m_FirstSuperKingdom.empty() && te.IsSetName()) {
1641  } else if (te.IsSetName() && ! NStr::EqualNocase (m_FirstSuperKingdom, te.GetName())) {
1642  if (m_SecondSuperKingdom.empty()) {
1643  super_kingdoms_different = true;
1645  }
1646  }
1647  if (num_super_kingdom > 1 && super_kingdoms_different) {
1648  m_IsCrossKingdom = true;
1649  }
1650  }
1651  }
1652  break;
1653  }
1654  case CSeqdesc::e_Molinfo:
1655  {
1656  if (! m_MolInfo) {
1657  const CMolInfo& molinf = sd.GetMolinfo();
1658  m_MolInfo.Reset (&molinf);
1659  m_Biomol = molinf.GetBiomol();
1660  m_Tech = molinf.GetTech();
1661  m_Completeness = molinf.GetCompleteness();
1662 
1663  switch (m_Tech) {
1664  case NCBI_TECH(htgs_0):
1665  case NCBI_TECH(htgs_1):
1666  case NCBI_TECH(htgs_2):
1667  m_HTGSUnfinished = true;
1668  // manufacture all titles for unfinished HTG sequences
1669  // m_Reconstruct = true;
1670  m_Title.clear();
1671  // fall through
1672  case NCBI_TECH(htgs_3):
1673  m_HTGTech = true;
1674  m_UseBiosrc = true;
1675  break;
1676  case NCBI_TECH(est):
1677  case NCBI_TECH(sts):
1678  case NCBI_TECH(survey):
1679  m_IsEST_STS_GSS = true;
1680  m_UseBiosrc = true;
1681  break;
1682  case NCBI_TECH(wgs):
1683  m_IsWGS = true;
1684  m_UseBiosrc = true;
1685  break;
1686  case NCBI_TECH(tsa):
1687  m_IsTSA = true;
1688  m_UseBiosrc = true;
1689  if (m_IsVirtual) {
1690  m_TSAMaster = true;
1691  }
1692  break;
1693  case NCBI_TECH(targeted):
1694  m_IsTLS = true;
1695  m_UseBiosrc = true;
1696  if (m_IsVirtual) {
1697  m_TLSMaster = true;
1698  }
1699  break;
1700  default:
1701  break;
1702  }
1703  }
1704  break;
1705  }
1706  case CSeqdesc::e_Title:
1707  {
1708  if (m_Title.empty()) {
1709  // for everything other than PDB proteins, title must be packaged on Bioseq - RW-2005
1710  if ( m_IsPDB || desc_it.GetSeq_entry_Handle().IsSeq() ) {
1711  m_Title = sd.GetTitle();
1712  }
1713  }
1714  break;
1715  }
1716  case CSeqdesc::e_User:
1717  {
1718  const CUser_object& usr = sd.GetUser();
1719  if (usr.IsSetType()) {
1720  const CObject_id& oi = usr.GetType();
1721  if (oi.IsStr()) {
1722  const string& type = oi.GetStr();
1723  if (NStr::EqualNocase(type, "FeatureFetchPolicy")) {
1724  FOR_EACH_USERFIELD_ON_USEROBJECT (uitr, usr) {
1725  const CUser_field& fld = **uitr;
1726  if (fld.IsSetLabel() && fld.GetLabel().IsStr()) {
1727  const string &label_str = GET_FIELD(fld.GetLabel(), Str);
1728  if (! NStr::EqualNocase(label_str, "Policy")) continue;
1729  if (fld.IsSetData() && fld.GetData().IsStr()) {
1730  const string& str = fld.GetData().GetStr();
1731  if (NStr::EqualNocase(str, "OnlyNearFeatures")) {
1732  m_ForceOnlyNearFeats = true;
1733  }
1734  }
1735  }
1736  }
1737  } else if (NStr::EqualNocase(type, "Unverified")) {
1738  m_IsUnverified = true;
1739  if (usr.IsUnverifiedOrganism()) {
1740  m_IsUnverifiedOrganism = true;
1741  }
1742  if (usr.IsUnverifiedMisassembled()) {
1744  }
1745  if (usr.IsUnverifiedContaminant()) {
1747  }
1748  if (usr.IsUnverifiedFeature()) {
1749  m_IsUnverifiedFeature = true;
1750  }
1751  } else if (NStr::EqualNocase(type, "Unreviewed")) {
1752  m_IsUnreviewed = true;
1753  if (usr.IsUnreviewedUnannotated()) {
1755  }
1756  } else if (NStr::EqualNocase(type, "AutodefOptions")) {
1757  FOR_EACH_USERFIELD_ON_USEROBJECT (uitr, usr) {
1758  const CUser_field& fld = **uitr;
1759  if (! FIELD_IS_SET_AND_IS(fld, Label, Str)) continue;
1760  const string &label_str = GET_FIELD(fld.GetLabel(), Str);
1761  if (! NStr::EqualNocase(label_str, "Targeted Locus Name")) continue;
1762  if (fld.IsSetData() && fld.GetData().IsStr()) {
1763  m_TargetedLocus = fld.GetData().GetStr();
1764  }
1765  }
1766  }
1767  }
1768  }
1769  break;
1770  }
1771  case CSeqdesc::e_Comment:
1772  {
1773  m_Comment = sd.GetComment();
1774  if (NStr::Find (m_Comment, "[CAUTION] Could be the product of a pseudogene") != string::npos) {
1775  m_IsPseudogene = true;
1776  }
1777  break;
1778  }
1779  case CSeqdesc::e_Genbank:
1780  {
1781  const CGB_block& gbk = desc_it->GetGenbank();
1782  if (gbk.IsSetKeywords()) {
1783  keywords = &gbk.GetKeywords();
1784  }
1785  break;
1786  }
1787  case CSeqdesc::e_Embl:
1788  {
1789  const CEMBL_block& ebk = desc_it->GetEmbl();
1790  if (ebk.IsSetKeywords()) {
1791  keywords = &ebk.GetKeywords();
1792  }
1793  break;
1794  }
1795  case CSeqdesc::e_Pdb:
1796  {
1797  if (m_PDBCompound.empty()) {
1798  _ASSERT(m_IsPDB);
1799  const CPDB_block& pbk = desc_it->GetPdb();
1800  FOR_EACH_COMPOUND_ON_PDBBLOCK (cp_itr, pbk) {
1801  if (m_PDBCompound.empty()) {
1802  m_PDBCompound = *cp_itr;
1803  break;
1804  }
1805  }
1806  }
1807  break;
1808  }
1809  default:
1810  break;
1811  }
1812  }
1813 
1814  if (keywords != NULL) {
1815  FOR_EACH_STRING_IN_LIST (kw_itr, *keywords) {
1816  const string& clause = *kw_itr;
1817  list<string> kywds;
1818  NStr::Split( clause, ";", kywds, NStr::fSplit_Tokenize );
1819  FOR_EACH_STRING_IN_LIST ( k_itr, kywds ) {
1820  const string& str = *k_itr;
1821  if (NStr::EqualNocase (str, "UNORDERED")) {
1822  m_Unordered = true;
1823  }
1824  if ((! m_HTGTech) && (! m_ThirdParty)) continue;
1825  if (NStr::EqualNocase (str, "HTGS_DRAFT")) {
1826  m_HTGSDraft = true;
1827  } else if (NStr::EqualNocase (str, "HTGS_CANCELLED")) {
1828  m_HTGSCancelled = true;
1829  } else if (NStr::EqualNocase (str, "HTGS_POOLED_MULTICLONE")) {
1830  m_HTGSPooled = true;
1831  } else if (NStr::EqualNocase (str, "TPA:experimental")) {
1832  m_TPAExp = true;
1833  } else if (NStr::EqualNocase (str, "TPA:inferential")) {
1834  m_TPAInf = true;
1835  } else if (NStr::EqualNocase (str, "TPA:reassembly")) {
1836  m_TPAReasm = true;
1837  } else if (NStr::EqualNocase (str, "TPA:assembly")) {
1838  m_TPAReasm = true;
1839  }
1840  }
1841  }
1842  }
1843  }
1844  catch (CException& e) {
1845  ERR_POST_X(5, Error << "Error in CBioseqIndex::x_InitDescs: " << e.what());
1846  }
1847 }
1848 
1850 
1851 {
1852  bool snpOK = false;
1853  bool cddOK = false;
1854 
1855  if (policy == CSeqEntryIndex::eExhaustive) {
1856 
1857  // experimental policy forces collection of features from all sequence levels
1858  sel.SetResolveAll();
1860  // ignores RefSeq/INSD barrier, overrides far fetch policy user object
1861  // for now, always excludes external annots, ignores custom enable bits
1862 
1863  } else if (policy == CSeqEntryIndex::eInternal || onlyNear) {
1864 
1865  // do not fetch features from underlying sequence component records
1866  sel.SetResolveDepth(0);
1867  sel.SetExcludeExternal(true);
1868  // always excludes external annots, ignores custom enable bits
1869 
1870  } else if (policy == CSeqEntryIndex::eAdaptive) {
1871 
1872  sel.SetResolveAll();
1873  // normal situation uses adaptive depth for feature collection,
1874  // includes barrier between RefSeq and INSD accession types
1875  sel.SetAdaptiveDepth(true);
1876 
1877  // conditionally allows external annots, based on custom enable bits
1878  if ((flags & CSeqEntryIndex::fShowSNPFeats) != 0) {
1879  snpOK = true;
1880  }
1881  if ((flags & CSeqEntryIndex::fShowCDDFeats) != 0) {
1882  cddOK = true;
1883  }
1884 
1885  } else if (policy == CSeqEntryIndex::eExternal) {
1886 
1887  // same as eAdaptive
1888  sel.SetResolveAll();
1889  sel.SetAdaptiveDepth(true);
1890 
1891  // but always allows external annots without need for custom enable bits
1892  snpOK = true;
1893  cddOK = true;
1894 
1895  } else if (policy == CSeqEntryIndex::eFtp) {
1896 
1897  // for public ftp releases
1898  if (m_IsRefSeq) {
1899  // For genomes FTP, we're running with a local ASN cache. Fetching from ID has already
1900  // happened, and we specifically want to restrict to using annotation from the cache.
1901  sel.SetResolveDepth(0);
1902  sel.SetExcludeExternal(true);
1903  } else if (m_IsDeltaLitOnly) {
1904  sel.SetResolveDepth(0);
1905  sel.SetExcludeExternal(true);
1906  } else {
1907  sel.SetResolveDepth(0);
1908  sel.SetExcludeExternal(true);
1909  }
1910 
1911  } else if (policy == CSeqEntryIndex::eGenomes) {
1912 
1913  // for public ftp releases
1914 
1915  // Original comment was:
1916  // For genomes FTP, we're running with a local ASN cache. Fetching from ID has already
1917  // happened, and we specifically want to restrict to using annotation from the cache.
1918  // but later advice was to always use adaptive depth.
1919 
1920  if (m_IsRefSeq) {
1921  sel.SetResolveAll();
1922  sel.SetAdaptiveDepth(true);
1923  } else if (m_IsDeltaLitOnly) {
1924  sel.SetResolveAll();
1925  sel.SetAdaptiveDepth(true);
1926  } else {
1927  sel.SetResolveAll();
1928  sel.SetAdaptiveDepth(true);
1929  }
1930 
1931  } else if (policy == CSeqEntryIndex::eWeb) {
1932 
1933  // for public web pages
1934  if (m_IsRefSeq) {
1935  sel.SetResolveAll();
1936  sel.SetAdaptiveDepth(true);
1937  } else if (m_IsDeltaLitOnly) {
1938  sel.SetResolveAll();
1939  sel.SetAdaptiveDepth(true);
1940  } else {
1941  sel.SetResolveAll();
1942  sel.SetAdaptiveDepth(true);
1943  }
1944 
1945  // ID-6366 additional tests for -policy web to prevent gridlock caused by loading huge numbers of SNPs
1946  if (GetLength() <= 1000000) {
1947  // conditionally allows external annots, based on custom enable bits
1948  if ((flags & CSeqEntryIndex::fShowSNPFeats) != 0) {
1949  snpOK = true;
1950  }
1951  if ((flags & CSeqEntryIndex::fShowCDDFeats) != 0) {
1952  cddOK = true;
1953  }
1954  }
1955  }
1956 
1957  // fHideSNPFeats and fHideCDDFeats flags override any earlier settings
1958  if ((flags & CSeqEntryIndex::fHideSNPFeats) != 0) {
1959  snpOK = false;
1960  }
1961  if ((flags & CSeqEntryIndex::fHideCDDFeats) != 0) {
1962  cddOK = false;
1963  }
1964 
1965  // configure remote annot settings in selector
1966  if ( snpOK ) {
1967 
1969  auto idxl = idx.Lock();
1970  if (idxl) {
1971  FAddSnpFunc* func = idxl->GetSnpFunc();
1972  if (func) {
1973  // under PubSeq Gateway, need to get exact accession for SNP retrieval
1975  string na_acc;
1976  (*func) (bsh, na_acc);
1977  if (na_acc.length() > 0) {
1978  sel.IncludeNamedAnnotAccession(na_acc);
1979  }
1980  } else {
1981  // otherwise just give SNP name
1982  sel.IncludeNamedAnnotAccession("SNP");
1983  }
1984  }
1985 
1986  } else {
1987  sel.ExcludeNamedAnnotAccession("SNP");
1988  }
1989 
1990  if ( cddOK ) {
1991  sel.IncludeNamedAnnotAccession("CDD");
1992  } else {
1993  sel.ExcludeNamedAnnotAccession("CDD"); // This does not actually help to stop loading CDDs.
1994  sel.ExcludeNamedAnnots("CDD"); // This prevents annot-iterator from loading CDDs.
1995  }
1996 
1998  auto idxl = idx.Lock();
1999  if (idxl) {
2000  int featDepth = idxl->GetFeatDepth();
2001  if (featDepth > 0) {
2002  sel.SetResolveDepth(featDepth);
2003  }
2004  }
2005 
2006  // bit flags exclude specific features
2007  // source features are collected elsewhere
2009  // pub features are used in the REFERENCES section
2011  // some feature types are always excluded (deprecated?)
2012  // sel.ExcludeFeatSubtype(CSeqFeatData::eSubtype_non_std_residue)
2015  // exclude other types based on user flags
2016  if ((flags & CSeqEntryIndex::fHideImpFeats) != 0) {
2018  }
2019  if ((flags & CSeqEntryIndex::fHideSTSFeats) != 0) {
2021  }
2022  if ((flags & CSeqEntryIndex::fHideExonFeats) != 0) {
2023  sel.ExcludeNamedAnnots("Exon");
2025  }
2026  if ((flags & CSeqEntryIndex::fHideIntronFeats) != 0) {
2028  }
2029  if ((flags & CSeqEntryIndex::fHideMiscFeats) != 0) {
2036  }
2037  if ((flags & CSeqEntryIndex::fHideGapFeats) != 0) {
2040  }
2041 
2042  // additional common settings
2043  sel.SetFeatComparator(new feature::CFeatComparatorByLabel);
2044 
2045  // limit exploration of far deltas with no features to avoid timeout
2046  sel.SetMaxSearchSegments(500);
2048  sel.SetMaxSearchTime(25);
2049 
2050  // request exception to capture fetch failure
2051  sel.SetFailUnresolved();
2052 }
2053 
2054 // GetSelector is public access to selector populated by command-line arguments
2056 
2057 {
2059 }
2060 
2061 // Feature collection common implementation method (delayed until needed)
2063 
2064 {
2065  try {
2066  // Do not bail on m_FeatsInitialized flag
2067 
2068  if (! m_DescsInitialized) {
2069  // initialize descriptors first to get m_ForceOnlyNearFeats flag
2070  x_InitDescs();
2071  }
2072 
2073  m_FeatsInitialized = true;
2074 
2075  SAnnotSelector sel;
2076 
2078 
2079  bool onlyGeneRNACDS = false;
2081  onlyGeneRNACDS = true;
2082  }
2083 
2084  // variables for setting m_BestProteinFeature
2085  TSeqPos longest = 0;
2087  CProt_ref::EProcessed processed;
2088 
2090  auto idxl = idx.Lock();
2091  if (idxl) {
2092  /*
2093  if (! idxl->IsSmallGenomeSet()) {
2094  // limit feature collection to immediate Bioseq-set parent
2095  CRef<CSeqsetIndex> prnt = GetParent();
2096  if (prnt) {
2097  CBioseq_set_Handle bssh = prnt->GetSeqsetHandle();
2098  if (bssh) {
2099  CSeq_entry_Handle pseh = bssh.GetParentEntry();
2100  if (pseh) {
2101  sel.SetLimitSeqEntry(pseh);
2102  }
2103  }
2104  }
2105  }
2106  */
2107 
2108  CRef<feature::CFeatTree> ft = idxl->GetFeatTree();
2109 
2110  // start collection over on each segment
2111  m_SfxList.clear();
2112 
2113  // iterate features on Bioseq or sublocation
2114  CFeat_CI feat_it;
2115  CRef<CSeq_loc_Mapper> slice_mapper;
2116  if (slpp == 0) {
2117  feat_it = CFeat_CI(m_Bsh, sel);
2118  } else {
2119  SAnnotSelector sel_cpy = sel;
2120  sel_cpy.SetIgnoreStrand();
2121  /*
2122  if (selp->IsSetStrand() && selp->GetStrand() == eNa_strand_minus) {
2123  sel_cpy.SetSortOrder(SAnnotSelector::eSortOrder_Reverse);
2124  }
2125  */
2127  if (bsid) {
2128  SetDiagFilter(eDiagFilter_All, "!(1305.28,31)");
2129  CSeq_id seq_id;
2130  seq_id.Assign( *bsid );
2131  CSeq_loc old_loc;
2132  old_loc.SetInt().SetId( seq_id );
2133  old_loc.SetInt().SetFrom( 0 );
2134  old_loc.SetInt().SetTo( m_Length - 1 );
2135  slice_mapper = new CSeq_loc_Mapper( *slpp, old_loc, m_Scope );
2137  slice_mapper->TruncateNonmappingRanges();
2139  }
2140  feat_it = CFeat_CI(*m_Scope, *slpp, sel_cpy);
2141  }
2142 
2143  CConstRef<CSeq_loc> prev_loc;
2144 
2145  // iterate features on Bioseq
2146  for (; feat_it; ++feat_it) {
2147  const CMappedFeat mf = *feat_it;
2148 
2149  const CSeqFeatData& data = mf.GetData();
2150  CSeqFeatData::E_Choice typ = data.Which();
2151  if (onlyGeneRNACDS) {
2152  if (typ != CSeqFeatData::e_Gene &&
2153  typ != CSeqFeatData::e_Rna &&
2154  typ != CSeqFeatData::e_Cdregion) {
2155  continue;
2156  }
2157  }
2158 
2160 
2161  CConstRef<CSeq_loc> feat_loc(&mf.GetLocation());
2162  if (slpp) {
2163  feat_loc.Reset( slice_mapper->Map( mf.GetLocation() ) );
2164  }
2165 
2166  CRef<CFeatureIndex> sfx(new CFeatureIndex(hdl, mf, feat_loc, *this));
2167 
2169  CSeqFeatData::ESubtype subtype = sfx->GetSubtype();
2170 
2171  // For RW-1215, accession JB818822, insert instantiated gap feature before preceding misc_feature with the same location
2172 
2173  bool gapIsSame = false;
2174  if ( subtype == CSeqFeatData::eSubtype_gap && prev_loc && !m_SfxList.empty() ) {
2175  if ( feat_loc->GetStart(eExtreme_Positional) == prev_loc->GetStart(eExtreme_Positional) /* &&
2176  feat_loc->GetStop(eExtreme_Positional) == prev_loc->GetStop(eExtreme_Positional) */ ) {
2177  gapIsSame = true;
2178  }
2179  }
2180 
2181  if ( gapIsSame ) {
2182  m_SfxList.insert(m_SfxList.end() - 1, sfx);
2183  } else {
2184  m_SfxList.push_back(sfx);
2185  }
2186 
2187  prev_loc = feat_loc;
2188 
2189  // end of RW-1215 changes
2190 
2191  ft->AddFeature(mf);
2192 
2193  // CFeatureIndex from CMappedFeat for use with GetBestGene
2194  m_FeatIndexMap[mf] = sfx;
2195 
2196  // set specific flags for various feature types
2197  if (type == CSeqFeatData::e_Biosrc) {
2198  m_HasSource = true;
2199  if (! m_BioSource) {
2200  if (! mf.IsSetData ()) continue;
2201  const CSeqFeatData& sfdata = mf.GetData();
2202  const CBioSource& biosrc = sfdata.GetBiosrc();
2203  m_BioSource.Reset (&biosrc);
2204  }
2205  continue;
2206  }
2207 
2208  if (type == CSeqFeatData::e_Gene) {
2209  m_HasGene = true;
2211  continue;
2212  }
2213  const CSeq_loc& loc = mf.GetLocation ();
2214  switch (loc.Which()) {
2217  case CSeq_loc::e_Mix:
2218  case CSeq_loc::e_Equiv:
2219  m_HasMultiIntervalGenes = true;
2220  break;
2221  default:
2222  break;
2223  }
2224  continue;
2225  }
2226 
2227  if (subtype == CSeqFeatData::eSubtype_operon) {
2228  idxl->SetHasOperon(true);
2229  continue;
2230  }
2231 
2232  if (type == CSeqFeatData::e_Prot && IsAA()) {
2233  if (! mf.IsSetData ()) continue;
2234  const CSeqFeatData& sfdata = mf.GetData();
2235  const CProt_ref& prp = sfdata.GetProt();
2236  processed = CProt_ref::eProcessed_not_set;
2237  if (prp.IsSetProcessed()) {
2238  processed = prp.GetProcessed();
2239  }
2240  const CSeq_loc& loc = mf.GetLocation ();
2241  TSeqPos prot_length = sequence::GetLength(loc, m_Scope);
2242  if (prot_length > longest) {
2244  m_BestProteinFeature = sfx;
2245  longest = prot_length;
2246  bestprocessed = processed;
2247  } else if (prot_length == longest) {
2248  // unprocessed 0 > preprotein 1 > mat peptide 2
2249  if (processed < bestprocessed) {
2251  m_BestProteinFeature = sfx;
2252  longest = prot_length;
2253  bestprocessed = processed;
2254  }
2255  }
2256  continue;
2257  }
2258 
2259  if (type == CSeqFeatData::e_Cdregion && IsNA()) {
2260  } else if (type == CSeqFeatData::e_Rna && IsNA()) {
2261  } else if (type == CSeqFeatData::e_Prot && IsAA()) {
2262  } else {
2263  continue;
2264  }
2265 
2266  // index feature for (local) product Bioseq (CDS -> protein, mRNA -> cDNA, or Prot -> peptide)
2267  CSeq_id_Handle idh = mf.GetProductId();
2268  if (idh) {
2269  string str = idh.AsString();
2270  CRef<CBioseqIndex> bsxp = idxl->GetBioseqIndex(str);
2271  if (bsxp) {
2272  bsxp->m_FeatForProdInitialized = true;
2273  bsxp->m_FeatureForProduct = sfx;
2274  }
2275  }
2276  }
2277  }
2278  }
2279  catch (CException& e) {
2280  m_FetchFailure = true;
2281  ERR_POST_X(6, Error << "Error in CBioseqIndex::x_InitFeats: " << e.what());
2282  }
2283 }
2284 
2285 // Feature collection methods (delayed until needed)
2287 
2288 {
2289  x_InitFeats(0);
2290 }
2291 
2293 
2294 {
2295  x_InitFeats(&slp);
2296 }
2297 
2298 // GetFeatureForProduct allows hypothetical protein defline generator to obtain gene locus tag
2300 
2301 {
2302  if (! m_FeatForProdInitialized) {
2303  if (m_Bsh) {
2304  CFeat_CI fi(m_Bsh,
2306  .SetByProduct().SetLimitTSE(m_Bsh.GetTSE_Handle()));
2307  if (! fi) {
2308  fi = CFeat_CI(m_Bsh,
2310  .SetByProduct().SetLimitTSE(m_Bsh.GetTSE_Handle()));
2311  }
2312  if (! fi) {
2313  fi = CFeat_CI(m_Bsh,
2315  .SetByProduct().SetLimitTSE(m_Bsh.GetTSE_Handle()));
2316  }
2317  if (fi) {
2318  CMappedFeat mf = *fi;
2319  CSeq_id_Handle idh = mf.GetLocationId();
2320  CBioseq_Handle nbsh = m_Scope->GetBioseqHandle(idh);
2321  if (nbsh) {
2323  auto idxl = idx.Lock();
2324  if (idxl) {
2325  CRef<CBioseqIndex> bsxn = idxl->GetBioseqIndex(nbsh);
2326  if (bsxn) {
2327  if (! bsxn->m_FeatsInitialized) {
2328  bsxn->x_InitFeats();
2329  }
2330  }
2331  }
2332  }
2333  }
2334  }
2335  }
2336 
2337  return m_FeatureForProduct;
2338 }
2339 
2340 // Get Bioseq index containing feature with product pointing to this Bioseq
2342 
2343 {
2345  if (sfxp) {
2346  return sfxp->GetBioseqIndex();
2347  }
2348 
2349  return CWeakRef<CBioseqIndex> ();
2350 }
2351 
2352 // GetBestProteinFeature indexes longest protein feature on protein Bioseq
2354 
2355 {
2356  if (! m_BestProtFeatInitialized) {
2357  if (! m_FeatsInitialized) {
2358  x_InitFeats();
2359  }
2360  }
2361 
2362  return m_BestProteinFeature;
2363 }
2364 
2365 // Common descriptor field getters
2366 const string& CBioseqIndex::GetTitle (void)
2367 
2368 {
2369  if (! m_DescsInitialized) {
2370  x_InitDescs();
2371  }
2372 
2373  return m_Title;
2374 }
2375 
2377 
2378 {
2379  if (! m_DescsInitialized) {
2380  x_InitDescs();
2381  }
2382 
2383  return m_MolInfo;
2384 }
2385 
2387 
2388 {
2389  if (! m_DescsInitialized) {
2390  x_InitDescs();
2391  }
2392 
2393  return m_Biomol;
2394 }
2395 
2397 
2398 {
2399  if (! m_DescsInitialized) {
2400  x_InitDescs();
2401  }
2402 
2403  return m_Tech;
2404 }
2405 
2407 
2408 {
2409  if (! m_DescsInitialized) {
2410  x_InitDescs();
2411  }
2412 
2413  return m_Completeness;
2414 }
2415 
2417 
2418 {
2419  if (! m_DescsInitialized) {
2420  x_InitDescs();
2421  }
2422 
2423  return m_HTGTech;
2424 }
2425 
2427 
2428 {
2429  if (! m_DescsInitialized) {
2430  x_InitDescs();
2431  }
2432 
2433  return m_HTGSUnfinished;
2434 }
2435 
2437 
2438 {
2439  if (! m_DescsInitialized) {
2440  x_InitDescs();
2441  }
2442 
2443  return m_IsTLS;
2444 }
2445 
2447 
2448 {
2449  if (! m_DescsInitialized) {
2450  x_InitDescs();
2451  }
2452 
2453  return m_IsTSA;
2454 }
2455 
2457 
2458 {
2459  if (! m_DescsInitialized) {
2460  x_InitDescs();
2461  }
2462 
2463  return m_IsWGS;
2464 }
2465 
2467 
2468 {
2469  if (! m_DescsInitialized) {
2470  x_InitDescs();
2471  }
2472 
2473  return m_IsEST_STS_GSS;
2474 }
2475 
2477 
2478 {
2479  if (! m_DescsInitialized) {
2480  x_InitDescs();
2481  }
2482 
2483  return m_UseBiosrc;
2484 }
2485 
2487 
2488 {
2489  if (! m_SourcesInitialized) {
2490  x_InitSource();
2491  }
2492 
2493  return m_BioSource;
2494 }
2495 
2496 const string& CBioseqIndex::GetTaxname (void)
2497 
2498 {
2499  if (! m_SourcesInitialized) {
2500  x_InitSource();
2501  }
2502 
2503  return m_Taxname;
2504 }
2505 
2506 const string& CBioseqIndex::GetDescTaxname (void)
2507 
2508 {
2509  if (! m_SourcesInitialized) {
2510  x_InitSource();
2511  }
2512 
2513  return m_DescTaxname;
2514 }
2515 
2516 const string& CBioseqIndex::GetCommon (void)
2517 
2518 {
2519  if (! m_SourcesInitialized) {
2520  x_InitSource();
2521  }
2522 
2523  return m_Common;
2524 }
2525 
2526 const string& CBioseqIndex::GetLineage (void)
2527 
2528 {
2529  if (! m_SourcesInitialized) {
2530  x_InitSource();
2531  }
2532 
2533  return m_Lineage;
2534 }
2535 
2537 
2538 {
2539  if (! m_SourcesInitialized) {
2540  x_InitSource();
2541  }
2542 
2543  return m_Taxid;
2544 }
2545 
2547 
2548 {
2549  if (! m_SourcesInitialized) {
2550  x_InitSource();
2551  }
2552 
2553  return m_UsingAnamorph;
2554 }
2555 
2557 
2558 {
2559  if (! m_SourcesInitialized) {
2560  x_InitSource();
2561  }
2562 
2563  return m_Genus;
2564 }
2565 
2567 
2568 {
2569  if (! m_SourcesInitialized) {
2570  x_InitSource();
2571  }
2572 
2573  return m_Species;
2574 }
2575 
2577 
2578 {
2579  if (! m_SourcesInitialized) {
2580  x_InitSource();
2581  }
2582 
2583  return m_Multispecies;
2584 }
2585 
2587 
2588 {
2589  if (! m_SourcesInitialized) {
2590  x_InitSource();
2591  }
2592 
2593  return m_Genome;
2594 }
2595 
2597 
2598 {
2599  if (! m_SourcesInitialized) {
2600  x_InitSource();
2601  }
2602 
2603  return m_IsPlasmid;
2604 }
2605 
2607 
2608 {
2609  if (! m_SourcesInitialized) {
2610  x_InitSource();
2611  }
2612 
2613  return m_IsChromosome;
2614 }
2615 
2616 const string& CBioseqIndex::GetOrganelle (void)
2617 
2618 {
2619  if (! m_SourcesInitialized) {
2620  x_InitSource();
2621  }
2622 
2623  return m_Organelle;
2624 }
2625 
2627 
2628 {
2629  if (! m_SourcesInitialized) {
2630  x_InitSource();
2631  }
2632 
2633  return m_FirstSuperKingdom;
2634 }
2635 
2637 
2638 {
2639  if (! m_SourcesInitialized) {
2640  x_InitSource();
2641  }
2642 
2643  return m_SecondSuperKingdom;
2644 }
2645 
2647 
2648 {
2649  if (! m_SourcesInitialized) {
2650  x_InitSource();
2651  }
2652 
2653  return m_IsCrossKingdom;
2654 }
2655 
2657 
2658 {
2659  if (! m_SourcesInitialized) {
2660  x_InitSource();
2661  }
2662 
2663  return m_Chromosome;
2664 }
2665 
2667 
2668 {
2669  if (! m_SourcesInitialized) {
2670  x_InitSource();
2671  }
2672 
2673  return m_LinkageGroup;
2674 }
2675 
2677 
2678 {
2679  if (! m_SourcesInitialized) {
2680  x_InitSource();
2681  }
2682 
2683  return m_Clone;
2684 }
2685 
2687 
2688 {
2689  if (! m_SourcesInitialized) {
2690  x_InitSource();
2691  }
2692 
2693  return m_has_clone;
2694 }
2695 
2697 
2698 {
2699  if (! m_SourcesInitialized) {
2700  x_InitSource();
2701  }
2702 
2703  return m_Map;
2704 }
2705 
2707 
2708 {
2709  if (! m_SourcesInitialized) {
2710  x_InitSource();
2711  }
2712 
2713  return m_Plasmid;
2714 }
2715 
2717 
2718 {
2719  if (! m_SourcesInitialized) {
2720  x_InitSource();
2721  }
2722 
2723  return m_Segment;
2724 }
2725 
2727 
2728 {
2729  if (! m_SourcesInitialized) {
2730  x_InitSource();
2731  }
2732 
2733  return m_Breed;
2734 }
2735 
2737 
2738 {
2739  if (! m_SourcesInitialized) {
2740  x_InitSource();
2741  }
2742 
2743  return m_Cultivar;
2744 }
2745 
2746 
2748 
2749 {
2750  if (! m_SourcesInitialized) {
2751  x_InitSource();
2752  }
2753 
2754  return m_SpecimenVoucher;
2755 }
2756 
2757 
2759 
2760 {
2761  if (! m_SourcesInitialized) {
2762  x_InitSource();
2763  }
2764 
2765  return m_Isolate;
2766 }
2767 
2769 
2770 {
2771  if (! m_SourcesInitialized) {
2772  x_InitSource();
2773  }
2774 
2775  return m_Strain;
2776 }
2777 
2779 
2780 {
2781  if (! m_SourcesInitialized) {
2782  x_InitSource();
2783  }
2784 
2785  return m_Substrain;
2786 }
2787 
2789 
2790 {
2791  if (! m_SourcesInitialized) {
2792  x_InitSource();
2793  }
2794 
2795  return m_MetaGenomeSource;
2796 }
2797 
2799 
2800 {
2801  if (! m_DescsInitialized) {
2802  x_InitDescs();
2803  }
2804 
2805  return m_HTGSCancelled;
2806 }
2807 
2809 
2810 {
2811  if (! m_DescsInitialized) {
2812  x_InitDescs();
2813  }
2814 
2815  return m_HTGSDraft;
2816 }
2817 
2819 
2820 {
2821  if (! m_DescsInitialized) {
2822  x_InitDescs();
2823  }
2824 
2825  return m_HTGSPooled;
2826 }
2827 
2829 
2830 {
2831  if (! m_DescsInitialized) {
2832  x_InitDescs();
2833  }
2834 
2835  return m_TPAExp;
2836 }
2837 
2839 
2840 {
2841  if (! m_DescsInitialized) {
2842  x_InitDescs();
2843  }
2844 
2845  return m_TPAInf;
2846 }
2847 
2849 
2850 {
2851  if (! m_DescsInitialized) {
2852  x_InitDescs();
2853  }
2854 
2855  return m_TPAReasm;
2856 }
2857 
2859 
2860 {
2861  if (! m_DescsInitialized) {
2862  x_InitDescs();
2863  }
2864 
2865  return m_Unordered;
2866 }
2867 
2869 
2870 {
2871  if (! m_DescsInitialized) {
2872  x_InitDescs();
2873  }
2874 
2875  return m_PDBCompound;
2876 }
2877 
2879 
2880 {
2881  if (! m_DescsInitialized) {
2882  x_InitDescs();
2883  }
2884 
2885  return m_ForceOnlyNearFeats;
2886 }
2887 
2889 
2890 {
2891  if (! m_DescsInitialized) {
2892  x_InitDescs();
2893  }
2894 
2895  return m_IsUnverified;
2896 }
2897 
2899 
2900 {
2901  if (! m_DescsInitialized) {
2902  x_InitDescs();
2903  }
2904 
2905  return m_IsUnverifiedFeature;
2906 }
2907 
2909 
2910 {
2911  if (! m_DescsInitialized) {
2912  x_InitDescs();
2913  }
2914 
2915  return m_IsUnverifiedOrganism;
2916 }
2917 
2919 
2920 {
2921  if (! m_DescsInitialized) {
2922  x_InitDescs();
2923  }
2924 
2926 }
2927 
2929 
2930 {
2931  if (! m_DescsInitialized) {
2932  x_InitDescs();
2933  }
2934 
2936 }
2937 
2939 
2940 {
2941  if (! m_DescsInitialized) {
2942  x_InitDescs();
2943  }
2944 
2945  return m_IsUnreviewed;
2946 }
2947 
2949 
2950 {
2951  if (! m_DescsInitialized) {
2952  x_InitDescs();
2953  }
2954 
2956 }
2957 
2959 
2960 {
2961  if (! m_DescsInitialized) {
2962  x_InitDescs();
2963  }
2964 
2965  return m_TargetedLocus;
2966 }
2967 
2968 const string& CBioseqIndex::GetComment (void)
2969 
2970 {
2971  if (! m_DescsInitialized) {
2972  x_InitDescs();
2973  }
2974 
2975  return m_Comment;
2976 }
2977 
2979 
2980 {
2981  if (! m_DescsInitialized) {
2982  x_InitDescs();
2983  }
2984 
2985  return m_IsPseudogene;
2986 }
2987 
2989 
2990 {
2991  if (! m_FeatsInitialized) {
2992  x_InitFeats();
2993  }
2994 
2996  auto idxl = idx.Lock();
2997  if (idxl) {
2998  return idxl->HasOperon();
2999  }
3000 
3001  return false;
3002 }
3003 
3005 
3006 {
3007  if (! m_FeatsInitialized) {
3008  x_InitFeats();
3009  }
3010 
3011  return m_HasGene;
3012 }
3013 
3015 
3016 {
3017  if (! m_FeatsInitialized) {
3018  x_InitFeats();
3019  }
3020 
3021  return m_HasMultiIntervalGenes;
3022 }
3023 
3025 
3026 {
3027  if (! m_FeatsInitialized) {
3028  x_InitFeats();
3029  }
3030 
3031  return m_HasSource;
3032 }
3033 
3035 
3036 {
3037  if (! m_DescsInitialized) {
3038  x_InitDescs();
3039  }
3040 
3041  return m_rEnzyme;
3042 }
3043 
3045 
3046 {
3047  CRef<CFeatureIndex> sfx;
3048 
3050  if (it != m_FeatIndexMap.end()) {
3051  sfx = it->second;
3052  }
3053 
3054  return sfx;
3055 }
3056 
// Copies sequence data for the requested range into buffer.
//   from   - start position; negative values are clamped to 0
//   to     - end position; a negative value, or one at/after the vector size,
//            is replaced by the vector size
//   buffer - receives the data written by CSeqVector::GetSeqData
// The CSeqVector over m_Bsh is created on the first call and kept in m_SeqVec.
// If CanGetRange() rejects the range, m_FetchFailure is set and buffer is not
// written by this function. A CException is reported via ERR_POST_X and not rethrown.
3057 void CBioseqIndex::GetSequence (int from, int to, string& buffer)
3058 
3059 {
3060  try {
// create the sequence vector only once; later calls reuse m_SeqVec
3061  if (! m_SeqVec) {
3062  m_SeqVec = new CSeqVector(m_Bsh);
3063  if (m_SeqVec) {
// protein and nucleotide Bioseqs take different branches here
// NOTE(review): the statements inside both branches are not visible in this listing - confirm against the repository
3064  if (IsAA()) {
3066  } else {
3068  }
3069  }
3070  }
3071 
3072  if (m_SeqVec) {
3073  CSeqVector& vec = *m_SeqVec;
// normalize the caller's range before asking the vector for data
3074  if (from < 0) {
3075  from = 0;
3076  }
3077  if (to < 0 || to >= (int) vec.size()) {
3078  to = vec.size();
3079  }
3080  if (vec.CanGetRange(from, to)) {
3081  vec.GetSeqData(from, to, buffer);
3082  } else {
// range could not be obtained; remember the failure for the caller
3083  m_FetchFailure = true;
3084  }
3085  }
3086  }
3087  catch (CException& e) {
3088  ERR_POST_X(7, Error << "Error in CBioseqIndex::GetSequence: " << e.what());
3089  }
3090 }
3091 
3092 string CBioseqIndex::GetSequence (int from, int to)
3093 
3094 {
3095  string buffer;
3096 
3097  GetSequence(from, to, buffer);
3098 
3099  return buffer;
3100 }
3101 
3103 
3104 {
3105  GetSequence(0, -1, buffer);
3106 }
3107 
3109 
3110 {
3111  string buffer;
3112 
3113  GetSequence(0, -1, buffer);
3114 
3115  return buffer;
3116 }
3117 
3118 const vector<CRef<CGapIndex>>& CBioseqIndex::GetGapIndices(void)
3119 
3120 {
3121  if (! m_GapsInitialized) {
3122  x_InitGaps();
3123  }
3124 
3125  return m_GapList;
3126 }
3127 
3128 const vector<CRef<CDescriptorIndex>>& CBioseqIndex::GetDescriptorIndices(void)
3129 
3130 {
3131  if (! m_DescsInitialized) {
3132  x_InitDescs();
3133  }
3134 
3135  return m_SdxList;
3136 }
3137 
3138 const vector<CRef<CFeatureIndex>>& CBioseqIndex::GetFeatureIndices(void)
3139 
3140 {
3141  if (! m_FeatsInitialized) {
3142  x_InitFeats();
3143  }
3144 
3145  return m_SfxList;
3146 }
3147 
3148 
3149 // CGapIndex
3150 
3151 // Constructor
3153  TSeqPos end,
3154  TSeqPos length,
3155  const string& type,
3156  const vector<string>& evidence,
3157  bool isUnknownLength,
3158  bool isAssemblyGap,
3159  CBioseqIndex& bsx)
3160  : m_Bsx(&bsx),
3161  m_Start(start),
3162  m_End(end),
3163  m_Length(length),
3164  m_GapType(type),
3165  m_GapEvidence(evidence),
3166  m_IsUnknownLength(isUnknownLength),
3167  m_IsAssemblyGap(isAssemblyGap)
3168 {
3169 }
3170 
3171 
3172 // CDescriptorIndex
3173 
3174 // Constructor
3176  CBioseqIndex& bsx)
3177  : m_Sd(sd),
3178  m_Bsx(&bsx)
3179 {
3180  m_Type = m_Sd.Which();
3181 }
3182 
3183 
3184 // CFeatureIndex
3185 
3186 // Constructor
3188  const CMappedFeat mf,
3189  CConstRef<CSeq_loc> feat_loc,
3190  CBioseqIndex& bsx)
3191  : m_Sfh(sfh),
3192  m_Mf(mf),
3193  m_Bsx(&bsx)
3194 {
3195  const CSeqFeatData& data = m_Mf.GetData();
3196  m_Type = data.Which();
3197  m_Subtype = data.GetSubtype();
3198  m_Fl = feat_loc;
3201 }
3202 
3203 // Find CFeatureIndex object for best gene using internal CFeatTree
3205 
3206 {
3207  try {
3208  CMappedFeat best;
3210  auto bsxl = bsx.Lock();
3211  if (bsxl) {
3212  CWeakRef<CSeqMasterIndex> idx = bsxl->GetSeqMasterIndex();
3213  auto idxl = idx.Lock();
3214  if (idxl) {
3215  best = feature::GetBestGeneForFeat(m_Mf, idxl->GetFeatTree(), 0,
3216  /* feature::CFeatTree::eBestGene_AllowOverlapped */
3217  feature::CFeatTree::eBestGene_TreeOnly);
3218  }
3219  if (best) {
3220  return bsxl->GetFeatIndex(best);
3221  }
3222  }
3223  } catch (CException& e) {
3224  ERR_POST_X(8, Error << "Error in CFeatureIndex::GetBestGene: " << e.what());
3225  }
3226  return CRef<CFeatureIndex> ();
3227 }
3228 
3229 
3230 // Find CFeatureIndex object for best parent using internal CFeatTree
3232 
3233 {
3234  try {
3235  CMappedFeat best;
3237  auto bsxl = bsx.Lock();
3238  if (bsxl) {
3239  CWeakRef<CSeqMasterIndex> idx = bsxl->GetSeqMasterIndex();
3240  auto idxl = idx.Lock();
3241  if (idxl) {
3242  static const CSeqFeatData::ESubtype sm_SpecialVDJTypes[] = {
3248  };
3249  for ( const CSeqFeatData::ESubtype* type_ptr = sm_SpecialVDJTypes;
3250  *type_ptr != CSeqFeatData::eSubtype_bad; ++type_ptr ) {
3251  best = feature::GetBestParentForFeat(m_Mf, *type_ptr, idxl->GetFeatTree(), 0);
3252  if (best) {
3253  return bsxl->GetFeatIndex(best);
3254  }
3255  }
3256  }
3257  }
3258  } catch (CException& e) {
3259  ERR_POST_X(8, Error << "Error in CFeatureIndex::GetBestParent: " << e.what());
3260  }
3261  return CRef<CFeatureIndex> ();
3262 }
3263 
3265 
3266 {
3268  auto bsxl = bsx.Lock();
3269  if (bsxl) {
3270  bsxl->SetFetchFailure(fails);
3271  }
3272 }
3273 
3274 // Find CFeatureIndex object for overlapping source feature using internal CFeatTree
3276 
3277 {
3278  try {
3279  CMappedFeat best;
3281  auto bsxl = bsx.Lock();
3282  if (bsxl) {
3283  if (bsxl->HasSource()) {
3284  CWeakRef<CSeqMasterIndex> idx = bsxl->GetSeqMasterIndex();
3285  auto idxl = idx.Lock();
3286  if (idxl) {
3287  CRef<feature::CFeatTree> ft = idxl->GetFeatTree();
3288  try {
3289  best = ft->GetParent(m_Mf, CSeqFeatData::eSubtype_biosrc);
3290  } catch (CException& e) {
3291  ERR_POST_X(9, Error << "Error in CFeatureIndex::GetOverlappingSource: " << e.what());
3292  }
3293  }
3294  if (best) {
3295  return bsxl->GetFeatIndex(best);
3296  }
3297  }
3298  }
3299  } catch (CException& e) {
3300  ERR_POST_X(10, Error << "Error in CFeatureIndex::GetOverlappingSource: " << e.what());
3301  }
3302  return CRef<CFeatureIndex> ();
3303 }
3304 
// Fetch the residues in the range [from, to) of this feature's sequence
// vector into buffer.
//
// The CSeqVector (m_SeqVec) is created on the first call from a location
// (lc) and the scope of the owning CBioseqIndex, and reused afterwards.
// A negative from is clamped to 0; a negative to, or a to at or beyond the
// vector size, is replaced by the vector size.  When CanGetRange() rejects
// the range, buffer is not filled and SetFetchFailure(true) is called.
// A CException raised anywhere in the body also calls SetFetchFailure(true),
// is logged with ERR_POST_X(11), and is not propagated.
//
// NOTE(review): the declarations of bsx and lc, and the statements inside
// the IsAA() if/else branches, are not visible here; presumably bsx is the
// weak reference to the owning CBioseqIndex, lc the feature location, and
// the branches select the sequence coding - confirm.
3305 void CFeatureIndex::GetSequence (int from, int to, string& buffer)
3306 
3307 {
3308  try {
3309  if (! m_SeqVec) {
3311  auto bsxl = bsx.Lock();
3312  if (bsxl) {
3314  if (lc) {
3315  m_SeqVec = new CSeqVector(*lc, *bsxl->GetScope());
3316  if (m_SeqVec) {
3317  if (bsxl->IsAA()) {
3319  } else {
3321  }
3322  }
3323  }
3324  }
3325  }
3326 
3327  if (m_SeqVec) {
3328  CSeqVector& vec = *m_SeqVec;
3329  if (from < 0) {
3330  from = 0;
3331  }
         // negative or too-large end position means "through the end of the vector"
3332  if (to < 0 || to >= (int) vec.size()) {
3333  to = vec.size();
3334  }
3335  if (vec.CanGetRange(from, to)) {
3336  vec.GetSeqData(from, to, buffer);
3337  } else {
3338  SetFetchFailure(true);
3339  }
3340  }
3341  }
3342  catch (CException& e) {
3343  SetFetchFailure(true);
3344  ERR_POST_X(11, Error << "Error in CFeatureIndex::GetSequence: " << e.what());
3345  }
3346 }
3347 
3348 string CFeatureIndex::GetSequence (int from, int to)
3349 
3350 {
3351  string buffer;
3352 
3353  GetSequence(from, to, buffer);
3354 
3355  return buffer;
3356 }
3357 
3359 
3360 {
3361  GetSequence(0, -1, buffer);
3362 }
3363 
3365 
3366 {
3367  string buffer;
3368 
3369  GetSequence(0, -1, buffer);
3370 
3371  return buffer;
3372 }
3373 
3374 
3375 // CWordPairIndexer
3376 
3377 // superscript and subscript code points not handled by UTF8ToAsciiString
3381  { 0x00B2, '2' },
3382  { 0x00B3, '3' },
3383  { 0x00B9, '1' },
3384  { 0x2070, '0' },
3385  { 0x2071, '1' },
3386  { 0x2074, '4' },
3387  { 0x2075, '5' },
3388  { 0x2076, '6' },
3389  { 0x2077, '7' },
3390  { 0x2078, '8' },
3391  { 0x2079, '9' },
3392  { 0x207A, '+' },
3393  { 0x207B, '-' },
3394  { 0x207C, '=' },
3395  { 0x207D, '(' },
3396  { 0x207E, ')' },
3397  { 0x207F, 'n' },
3398  { 0x2080, '0' },
3399  { 0x2081, '1' },
3400  { 0x2082, '2' },
3401  { 0x2083, '3' },
3402  { 0x2084, '4' },
3403  { 0x2085, '5' },
3404  { 0x2086, '6' },
3405  { 0x2087, '7' },
3406  { 0x2088, '8' },
3407  { 0x2089, '9' },
3408  { 0x208A, '+' },
3409  { 0x208B, '-' },
3410  { 0x208C, '=' },
3411  { 0x208D, '(' },
3412  { 0x208E, ')' }
3413 };
3416 
3418 
3419 {
3420  const char* src = str.c_str();
3421  string dst;
3422  while (*src) {
3423  if (static_cast<unsigned char>(*src) < 128) { // no translation needed
3424  dst += *src++;
3425  } else {
3426  utf8::TUnicode character;
3427  size_t n = utf8::UTF8ToUnicode(src, &character);
3428  src += n;
3430  = sc_ExtraTranslations.find(character);
3431  if (it != sc_ExtraTranslations.end()) {
3432  dst += it->second;
3433  } else {
3434  const utf8::SUnicodeTranslation* translation =
3435  utf8::UnicodeToAscii(character);
3436  if (translation != NULL && translation->Type != utf8::eSkip) {
3437  _ASSERT(translation->Type == utf8::eString);
3438  if (translation->Subst != NULL) {
3439  dst += translation->Subst;
3440  }
3441  }
3442  }
3443  }
3444  }
3445  return dst;
3446 }
3447 
// Lower-case words (plus the bare "+" and "-" tokens) that the word-pair
// indexer treats as stop words.
// Entries are listed in ascending byte (strcmp) order.
// NOTE(review): presumably this array backs the sc_StopWords set searched by
// IsStopWord, and that lookup relies on the ordering - confirm before
// inserting or reordering entries.
3448 static const char* const idxStopWords[] = {
3449  "+",
3450  "-",
3451  "a",
3452  "about",
3453  "again",
3454  "all",
3455  "almost",
3456  "also",
3457  "although",
3458  "always",
3459  "among",
3460  "an",
3461  "and",
3462  "another",
3463  "any",
3464  "are",
3465  "as",
3466  "at",
3467  "be",
3468  "because",
3469  "been",
3470  "before",
3471  "being",
3472  "between",
3473  "both",
3474  "but",
3475  "by",
3476  "can",
3477  "could",
3478  "did",
3479  "do",
3480  "does",
3481  "done",
3482  "due",
3483  "during",
3484  "each",
3485  "either",
3486  "enough",
3487  "especially",
3488  "etc",
3489  "for",
3490  "found",
3491  "from",
3492  "further",
3493  "had",
3494  "has",
3495  "have",
3496  "having",
3497  "here",
3498  "how",
3499  "however",
3500  "i",
3501  "if",
3502  "in",
3503  "into",
3504  "is",
3505  "it",
3506  "its",
3507  "itself",
3508  "just",
3509  "kg",
3510  "km",
3511  "made",
3512  "mainly",
3513  "make",
3514  "may",
3515  "mg",
3516  "might",
3517  "ml",
3518  "mm",
3519  "most",
3520  "mostly",
3521  "must",
3522  "nearly",
3523  "neither",
3524  "no",
3525  "nor",
3526  "obtained",
3527  "of",
3528  "often",
3529  "on",
3530  "our",
3531  "overall",
3532  "perhaps",
3533  "pmid",
3534  "quite",
3535  "rather",
3536  "really",
3537  "regarding",
3538  "seem",
3539  "seen",
3540  "several",
3541  "should",
3542  "show",
3543  "showed",
3544  "shown",
3545  "shows",
3546  "significantly",
3547  "since",
3548  "so",
3549  "some",
3550  "such",
3551  "than",
3552  "that",
3553  "the",
3554  "their",
3555  "theirs",
3556  "them",
3557  "then",
3558  "there",
3559  "therefore",
3560  "these",
3561  "they",
3562  "this",
3563  "those",
3564  "through",
3565  "thus",
3566  "to",
3567  "upon",
3568  "use",
3569  "used",
3570  "using",
3571  "various",
3572  "very",
3573  "was",
3574  "we",
3575  "were",
3576  "what",
3577  "when",
3578  "which",
3579  "while",
3580  "with",
3581  "within",
3582  "without",
3583  "would",
3584 };
3587 
3589 
3590 {
3591  TStopWords::const_iterator iter = sc_StopWords.find(str.c_str());
3592  return (iter != sc_StopWords.end());
3593 }
3594 
3596 
3597 {
3598  string dst = str;
3599 
3600  int max = (int) dst.length();
3601 
3602  for (; max > 0; max--) {
3603  char ch = dst[0];
3604  if (ch != '.' && ch != ',' && ch != ':' && ch != ';') {
3605  break;
3606  }
3607  // trim leading period, comma, colon, and semicolon
3608  dst.erase(0, 1);
3609  }
3610 
3611  for (; max > 0; max--) {
3612  char ch = dst[max-1];
3613  if (ch != '.' && ch != ',' && ch != ':' && ch != ';') {
3614  break;
3615  }
3616  // trim trailing period, comma, colon, and semicolon
3617  dst.erase(max-1, 1);
3618  }
3619 
3620  if (max > 1) {
3621  if (dst[0] == '(' && dst[max-1] == ')') {
3622  // trim flanking parentheses
3623  dst.erase(max-1, 1);
3624  dst.erase(0, 1);
3625  max -= 2;
3626  }
3627  }
3628 
3629  if (max > 0) {
3630  if (dst[0] == '(' && NStr::Find (dst, ")") == NPOS) {
3631  // trim isolated left parentheses
3632  dst.erase(0, 1);
3633  max--;
3634  }
3635  }
3636 
3637  if (max > 1) {
3638  if (dst[max-1] == ')' && NStr::Find (dst, "(") == NPOS) {
3639  // trim isolated right parentheses
3640  dst.erase(max-1, 1);
3641  // max--;
3642  }
3643  }
3644 
3645  return dst;
3646 }
3647 
// Inline formatting tags (bold, italic, underline, superscript, subscript)
// that are dropped from text before indexing.  Each tag appears in three
// spellings: literal markup ("<i>"), singly escaped ("&lt;i&gt;"), and
// doubly escaped ("&amp;lt;i&amp;gt;").
static const char* const mixedTags[] = {
    "<b>",
    "<i>",
    "<u>",
    "<sup>",
    "<sub>",
    "</b>",
    "</i>",
    "</u>",
    "</sup>",
    "</sub>",
    "<b/>",
    "<i/>",
    "<u/>",
    "<sup/>",
    "<sub/>",
    "&lt;i&gt;",
    "&lt;/i&gt;",
    "&lt;i/&gt;",
    "&lt;b&gt;",
    "&lt;/b&gt;",
    "&lt;b/&gt;",
    "&lt;u&gt;",
    "&lt;/u&gt;",
    "&lt;u/&gt;",
    "&lt;sub&gt;",
    "&lt;/sub&gt;",
    "&lt;sub/&gt;",
    "&lt;sup&gt;",
    "&lt;/sup&gt;",
    "&lt;sup/&gt;",
    "&amp;lt;i&amp;gt;",
    "&amp;lt;/i&amp;gt;",
    "&amp;lt;i/&amp;gt;",
    "&amp;lt;b&amp;gt;",
    "&amp;lt;/b&amp;gt;",
    "&amp;lt;b/&amp;gt;",
    "&amp;lt;u&amp;gt;",
    "&amp;lt;/u&amp;gt;",
    "&amp;lt;u/&amp;gt;",
    "&amp;lt;sub&amp;gt;",
    "&amp;lt;/sub&amp;gt;",
    "&amp;lt;sub/&amp;gt;",
    "&amp;lt;sup&amp;gt;",
    "&amp;lt;/sup&amp;gt;",
    "&amp;lt;sup/&amp;gt;",
};

// Returns the length of the mixedTags entry that the text at ptr starts
// with, or 0 if it starts with none of them.  ptr must point to a
// NUL-terminated string.
static int SkipMixedContent ( const char* ptr )

{
    // Range-for visits exactly the elements of the array.  The previous
    // bound, sizeof (mixedTags), is the array size in bytes, so a
    // non-matching input made the loop read pointers past the end of it.
    for (const char* const candidate : mixedTags) {
        const char* tag = candidate;
        const char* tmp = ptr;
        int len = 0;
        // advance while both strings agree; stops at the end of either one
        while (*tag && *tmp && *tag == *tmp) {
            ++tag;
            ++tmp;
            ++len;
        }
        if (! *tag) {
            // the whole tag was consumed, so ptr begins with it
            return len;
        }
    }
    return 0;
}
3714 
3715 string CWordPairIndexer::TrimMixedContent ( const string& str )
3716 
3717 {
3718  const char* src = str.c_str();
3719  string dst;
3720  while (*src) {
3721  if (*src == '<' || *src == '&') {
3722  int skip = SkipMixedContent (src);
3723  if (skip > 0) {
3724  src += skip;
3725  } else {
3726  dst += *src++;
3727  }
3728  } else {
3729  dst += *src++;
3730  }
3731  }
3732  return dst;
3733 }
3734 
3735 string CWordPairIndexer::x_AddToWordPairIndex (string item, string prev)
3736 
3737 {
3738  if (IsStopWord(item)) {
3739  return "";
3740  }
3741  // append item
3742  m_Norm.push_back(item);
3743  if (! prev.empty()) {
3744  // append prev+" "+item
3745  string pair = prev + " " + item;
3746  m_Pair.push_back(pair);
3747  }
3748  return item;
3749 }
3750 
3752 
3753 {
3754  m_Norm.clear();
3755  m_Pair.clear();
3756 
3758  NStr::ToLower(str);
3759 
3760  if (NStr::Find(str, "<") != NPOS || NStr::Find(str, "&") != NPOS) {
3762  }
3763 
3764  // split terms at spaces
3765  list<string> terms;
3766  NStr::Split( str, " ", terms, NStr::fSplit_Tokenize );
3767  string prev;
3768  ITERATE( list<string>, it, terms ) {
3769  string curr = NStr::TruncateSpaces( *it );
3770  // allow parentheses in chemical formula
3771  curr = TrimPunctuation(curr);
3772  prev = x_AddToWordPairIndex (curr, prev);
3773  }
3774 
3775  // convert non-alphanumeric punctuation to space
3776  for (size_t i = 0; i < str.length(); i++) {
3777  char ch = str[i];
3778  if (ch >= 'A' && ch <= 'Z') {
3779  } else if (ch >= 'a' && ch <= 'z') {
3780  } else if (ch >= '0' && ch <= '9') {
3781  } else {
3782  str[i] = ' ';
3783  }
3784  }
3785  // now splitting at all punctuation
3786  list<string> words;
3787  NStr::Split( str, " ", words, NStr::fSplit_Tokenize );
3788  prev = "";
3789  ITERATE( list<string>, it, words ) {
3790  string curr = NStr::TruncateSpaces( *it );
3791  prev = x_AddToWordPairIndex (curr, prev);
3792  }
3793 
3794  std::sort(m_Norm.begin(), m_Norm.end());
3795  auto nit = std::unique(m_Norm.begin(), m_Norm.end());
3796  m_Norm.erase(nit, m_Norm.end());
3797 
3798  std::sort(m_Pair.begin(), m_Pair.end());
3799  auto pit = std::unique(m_Pair.begin(), m_Pair.end());
3800  m_Pair.erase(pit, m_Pair.end());
3801 }
3802 
3803 
static CRef< CScope > m_Scope
@ eExtreme_Positional
numerical value
Definition: Na_strand.hpp:63
Tracks the best score (lowest value).
Definition: ncbiutil.hpp:219
CBinomialOrgName –.
bool IsSetCommon(void) const
Definition: BioSource.cpp:345
const string & GetTaxname(void) const
Definition: BioSource.cpp:340
const string & GetCommon(void) const
Definition: BioSource.cpp:350
const COrgName & GetOrgname(void) const
Definition: BioSource.cpp:410
bool IsSetOrgname(void) const
Definition: BioSource.cpp:405
bool IsSetTaxname(void) const
Definition: BioSource.cpp:335
CTempString GetSubstrain(void)
Definition: indexer.cpp:2778
CBioSource::TGenome m_Genome
Definition: indexer.hpp:725
CRef< CScope > m_Scope
Definition: indexer.hpp:622
bool m_FeatForProdInitialized
Definition: indexer.hpp:637
bool m_HTGSPooled
Definition: indexer.hpp:756
bool m_IsTLS
Definition: indexer.hpp:703
bool m_IsUnverifiedMisassembled
Definition: indexer.hpp:771
bool m_BestProtFeatInitialized
Definition: indexer.hpp:640
bool m_has_clone
Definition: indexer.hpp:739
bool m_ThirdParty
Definition: indexer.hpp:677
vector< CRef< CFeatureIndex > > m_SfxList
Definition: indexer.hpp:633
TFeatIndexMap m_FeatIndexMap
Definition: indexer.hpp:645
bool IsHTGTech(void)
Definition: indexer.cpp:2416
bool m_HasGene
Definition: indexer.hpp:785
CTempString m_Species
Definition: indexer.hpp:723
CTempString m_SpecimenVoucher
Definition: indexer.hpp:747
bool HasSource(void)
Definition: indexer.cpp:3024
~CBioseqIndex(void)
Definition: indexer.cpp:1101
CSeq_inst::TLength GetLength(void) const
Definition: indexer.hpp:474
bool m_IsDeltaLitOnly
Definition: indexer.hpp:662
CSeq_inst::TTopology m_Topology
Definition: indexer.hpp:658
string m_SecondSuperKingdom
Definition: indexer.hpp:732
CRef< CSeqVector > m_SeqVec
Definition: indexer.hpp:647
CTempString GetCultivar(void)
Definition: indexer.cpp:2736
bool IsForceOnlyNearFeats(void)
Definition: indexer.cpp:2878
bool HasClone(void)
Definition: indexer.cpp:2686
CTempString GetMetaGenomeSource(void)
Definition: indexer.cpp:2788
CTempString m_Clone
Definition: indexer.hpp:738
bool IsTPAReasm(void)
Definition: indexer.cpp:2848
string m_PDBChainID
Definition: indexer.hpp:690
bool IsTPAInf(void)
Definition: indexer.cpp:2838
bool m_IsPseudogene
Definition: indexer.hpp:782
void x_InitGaps(void)
Definition: indexer.cpp:1107
bool m_IsPatent
Definition: indexer.hpp:674
bool m_IsUnreviewed
Definition: indexer.hpp:775
CTempString GetBreed(void)
Definition: indexer.cpp:2726
const vector< CRef< CGapIndex > > & GetGapIndices(void)
Definition: indexer.cpp:3118
bool IsChromosome(void)
Definition: indexer.cpp:2606
CBioseqIndex(CBioseq_Handle bsh, const CBioseq &bsp, CBioseq_Handle obsh, CRef< CSeqsetIndex > prnt, CSeq_entry_Handle tseh, CRef< CScope > scope, CSeqMasterIndex &idx, CSeqEntryIndex::EPolicy policy, CSeqEntryIndex::TFlags flags)
Definition: indexer.cpp:783
string m_Lineage
Definition: indexer.hpp:718
string m_PatentCountry
Definition: indexer.hpp:685
bool IsWGS(void)
Definition: indexer.cpp:2456
bool m_ForceOnlyNearFeats
Definition: indexer.hpp:766
bool m_IsUnreviewedUnannotated
Definition: indexer.hpp:776
CConstRef< CBioSource > m_BioSource
Definition: indexer.hpp:714
bool m_UsingAnamorph
Definition: indexer.hpp:720
CTempString GetGenus(void)
Definition: indexer.cpp:2556
CMolInfo::TTech GetTech(void)
Definition: indexer.cpp:2396
CRef< CFeatureIndex > GetFeatureForProduct(void)
Definition: indexer.cpp:2299
const vector< CRef< CFeatureIndex > > & GetFeatureIndices(void)
Definition: indexer.cpp:3138
bool IsEST_STS_GSS(void)
Definition: indexer.cpp:2466
string m_Title
Definition: indexer.hpp:693
bool m_FetchFailure
Definition: indexer.hpp:652
const string & GetLineage(void)
Definition: indexer.cpp:2526
CConstRef< CMolInfo > m_MolInfo
Definition: indexer.hpp:696
bool m_IsUnverifiedFeature
Definition: indexer.hpp:769
CTempString GetStrain(void)
Definition: indexer.cpp:2768
CTempString m_Chromosome
Definition: indexer.hpp:736
CTempString m_Isolate
Definition: indexer.hpp:748
bool m_WGSMaster
Definition: indexer.hpp:678
bool IsUnreviewed(void)
Definition: indexer.cpp:2938
bool m_UseBiosrc
Definition: indexer.hpp:708
bool m_IsMap
Definition: indexer.hpp:664
bool m_HTGTech
Definition: indexer.hpp:701
bool m_TSAMaster
Definition: indexer.hpp:679
bool m_HTGSUnfinished
Definition: indexer.hpp:702
bool IsTPAExp(void)
Definition: indexer.cpp:2828
bool m_IsUnverifiedContaminant
Definition: indexer.hpp:772
bool IsHTGSPooled(void)
Definition: indexer.cpp:2818
bool m_IsPDB
Definition: indexer.hpp:675
bool IsHTGSCancelled(void)
Definition: indexer.cpp:2798
bool m_IsVirtual
Definition: indexer.hpp:663
bool m_IsDelta
Definition: indexer.hpp:661
void x_InitFeats(void)
Definition: indexer.cpp:2286
bool IsAA(void) const
Definition: indexer.hpp:472
CConstRef< CMolInfo > GetMolInfo(void)
Definition: indexer.cpp:2376
TTaxId m_Taxid
Definition: indexer.hpp:719
CBioseq_Handle GetBioseqHandle(void) const
Definition: indexer.hpp:428
bool IsUnverifiedFeature(void)
Definition: indexer.cpp:2898
bool m_IsRefSeq
Definition: indexer.hpp:669
CTempString GetSpecies(void)
Definition: indexer.cpp:2566
string m_PatentNumber
Definition: indexer.hpp:686
CRef< CFeatureIndex > GetFeatIndex(const CMappedFeat &mf)
Definition: indexer.cpp:3044
CConstRef< CBioSource > GetBioSource(void)
Definition: indexer.cpp:2486
string m_FirstSuperKingdom
Definition: indexer.hpp:731
CTempString m_Cultivar
Definition: indexer.hpp:746
bool HasOperon(void)
Definition: indexer.cpp:2988
string GetrEnzyme(void)
Definition: indexer.cpp:3034
string GetSequence(void)
Definition: indexer.cpp:3108
string m_GeneralStr
Definition: indexer.hpp:682
string GetSecondSuperKingdom(void)
Definition: indexer.cpp:2636
bool m_HTGSCancelled
Definition: indexer.hpp:754
bool IsUsingAnamorph(void)
Definition: indexer.cpp:2546
const string & GetAccession(void) const
Definition: indexer.hpp:482
CMolInfo::TCompleteness GetCompleteness(void)
Definition: indexer.cpp:2406
bool m_Multispecies
Definition: indexer.hpp:724
CWeakRef< CSeqMasterIndex > GetSeqMasterIndex(void) const
Definition: indexer.hpp:436
bool IsPlasmid(void)
Definition: indexer.cpp:2596
bool IsCrossKingdom(void)
Definition: indexer.cpp:2646
CTempString m_Substrain
Definition: indexer.hpp:750
const vector< CRef< CDescriptorIndex > > & GetDescriptorIndices(void)
Definition: indexer.cpp:3128
CSeqEntryIndex::TFlags m_Flags
Definition: indexer.hpp:650
bool IsUnordered(void)
Definition: indexer.cpp:2858
bool m_IsUnverifiedOrganism
Definition: indexer.hpp:770
CConstRef< CBioSource > m_DescBioSource
Definition: indexer.hpp:711
CSeq_inst::TLength m_Length
Definition: indexer.hpp:659
bool IsNA(void) const
Definition: indexer.hpp:471
string m_Organelle
Definition: indexer.hpp:729
CTempString GetIsolate(void)
Definition: indexer.cpp:2758
bool m_IsEST_STS_GSS
Definition: indexer.hpp:706
CTempString GetPDBCompound(void)
Definition: indexer.cpp:2868
CTempString m_Breed
Definition: indexer.hpp:745
bool HasGene(void)
Definition: indexer.cpp:3004
const string & GetTitle(void)
Definition: indexer.cpp:2366
CTempString GetMap(void)
Definition: indexer.cpp:2696
CBioseq_Handle m_Bsh
Definition: indexer.hpp:617
bool m_HasMultiIntervalGenes
Definition: indexer.hpp:786
bool m_TLSMaster
Definition: indexer.hpp:680
bool m_HasSource
Definition: indexer.hpp:787
bool IsUnreviewedUnannotated(void)
Definition: indexer.cpp:2948
bool m_IsTSA
Definition: indexer.hpp:704
void GetSelector(SAnnotSelector &sel)
Definition: indexer.cpp:2055
bool IsPseudogene(void)
Definition: indexer.cpp:2978
CTempString GetSegment(void)
Definition: indexer.cpp:2716
bool m_IsWGS
Definition: indexer.hpp:705
CTempString GetLinkageGroup(void)
Definition: indexer.cpp:2666
CBioseq_Handle m_OrigBsh
Definition: indexer.hpp:619
bool m_FeatsInitialized
Definition: indexer.hpp:632
vector< CRef< CDescriptorIndex > > m_SdxList
Definition: indexer.hpp:630
bool HasMultiIntervalGenes(void)
Definition: indexer.cpp:3014
bool IsHTGSDraft(void)
Definition: indexer.cpp:2808
CMolInfo::TTech m_Tech
Definition: indexer.hpp:698
string m_Accession
Definition: indexer.hpp:667
CTempString m_Segment
Definition: indexer.hpp:742
CMolInfo::TBiomol m_Biomol
Definition: indexer.hpp:697
const string & GetTaxname(void)
Definition: indexer.cpp:2496
bool m_SourcesInitialized
Definition: indexer.hpp:635
bool m_IsCrossKingdom
Definition: indexer.hpp:733
CWeakRef< CBioseqIndex > GetBioseqForProduct(void)
Definition: indexer.cpp:2341
bool m_TPAExp
Definition: indexer.hpp:757
const string & GetOrganelle(void)
Definition: indexer.cpp:2616
CRef< CFeatureIndex > GetBestProteinFeature(void)
Definition: indexer.cpp:2353
CTempString GetSpecimenVoucher(void)
Definition: indexer.cpp:2747
int m_GeneralId
Definition: indexer.hpp:683
string m_DescTaxname
Definition: indexer.hpp:712
vector< CRef< CGapIndex > > m_GapList
Definition: indexer.hpp:627
int m_PDBChain
Definition: indexer.hpp:689
CSeqEntryIndex::EPolicy m_Policy
Definition: indexer.hpp:649
CTempString GetClone(void)
Definition: indexer.cpp:2676
int m_PatentSequence
Definition: indexer.hpp:687
CRef< CFeatureIndex > m_BestProteinFeature
Definition: indexer.hpp:641
string m_Comment
Definition: indexer.hpp:781
CBioSource::TGenome GetGenome(void)
Definition: indexer.cpp:2586
bool m_Unordered
Definition: indexer.hpp:760
CTempString GetChromosome(void)
Definition: indexer.cpp:2656
void x_InitSource(void)
Definition: indexer.cpp:1305
bool IsUnverifiedMisassembled(void)
Definition: indexer.cpp:2918
CTempString m_TargetedLocus
Definition: indexer.hpp:778
CMolInfo::TBiomol GetBiomol(void)
Definition: indexer.cpp:2386
bool IsHTGSUnfinished(void)
Definition: indexer.cpp:2426
CTempString GetPlasmid(void)
Definition: indexer.cpp:2706
string GetFirstSuperKingdom(void)
Definition: indexer.cpp:2626
bool IsUseBiosrc(void)
Definition: indexer.cpp:2476
CTempString m_MetaGenomeSource
Definition: indexer.hpp:751
CTempString m_Plasmid
Definition: indexer.hpp:741
bool m_IsUnverified
Definition: indexer.hpp:768
void x_DefaultSelector(SAnnotSelector &sel, CSeqEntryIndex::EPolicy policy, CSeqEntryIndex::TFlags flags, bool onlyNear, CScope &scope)
Definition: indexer.cpp:1849
CTempString m_PDBCompound
Definition: indexer.hpp:763
bool m_HTGSDraft
Definition: indexer.hpp:755
bool IsTSA(void)
Definition: indexer.cpp:2446
bool IsUnverifiedContaminant(void)
Definition: indexer.cpp:2928
CTempString m_Map
Definition: indexer.hpp:740
string m_Taxname
Definition: indexer.hpp:715
string m_rEnzyme
Definition: indexer.hpp:790
const string & GetCommon(void)
Definition: indexer.cpp:2516
CMolInfo::TCompleteness m_Completeness
Definition: indexer.hpp:699
CTempString m_Strain
Definition: indexer.hpp:749
bool IsMultispecies(void)
Definition: indexer.cpp:2576
bool m_IsPlasmid
Definition: indexer.hpp:726
bool IsTLS(void)
Definition: indexer.cpp:2436
CRef< CFeatureIndex > m_FeatureForProduct
Definition: indexer.hpp:638
bool m_TPAReasm
Definition: indexer.hpp:759
void x_InitDescs(void)
Definition: indexer.cpp:1590
const string & GetComment(void)
Definition: indexer.cpp:2968
TTaxId GetTaxid(void)
Definition: indexer.cpp:2536
bool m_TPAInf
Definition: indexer.hpp:758
bool IsUnverifiedOrganism(void)
Definition: indexer.cpp:2908
CTempString GetTargetedLocus(void)
Definition: indexer.cpp:2958
CTempString m_LinkageGroup
Definition: indexer.hpp:737
bool m_DescsInitialized
Definition: indexer.hpp:629
const string & GetDescTaxname(void)
Definition: indexer.cpp:2506
bool m_GapsInitialized
Definition: indexer.hpp:626
string m_Common
Definition: indexer.hpp:717
bool IsUnverified(void)
Definition: indexer.cpp:2888
CTempString m_Genus
Definition: indexer.hpp:722
bool m_IsChromosome
Definition: indexer.hpp:727
CBioseq_Handle –.
CBioseq_set_Handle –.
CSeq_entry * GetParentEntry(void) const
Definition: Bioseq_set.hpp:122
CSeq_entry * GetParentEntry(void) const
Definition: Bioseq.hpp:174
Definition: Dbtag.hpp:53
bool IsSkippable(void) const
Definition: Dbtag.cpp:281
const CSeqdesc & m_Sd
Definition: indexer.hpp:870
CSeqdesc::E_Choice m_Type
Definition: indexer.hpp:873
CDescriptorIndex(const CSeqdesc &sd, CBioseqIndex &bsx)
Definition: indexer.cpp:3175
CEMBL_block –.
Definition: EMBL_block.hpp:66
CFeat_CI –.
Definition: feat_ci.hpp:64
CFeatureIndex(CSeq_feat_Handle sfh, const CMappedFeat mf, CConstRef< CSeq_loc > feat_loc, CBioseqIndex &bsx)
Definition: indexer.cpp:3187
const CMappedFeat m_Mf
Definition: indexer.hpp:935
CSeqFeatData::ESubtype m_Subtype
Definition: indexer.hpp:941
CSeqFeatData::ESubtype GetSubtype(void) const
Definition: indexer.hpp:909
CRef< CFeatureIndex > GetOverlappingSource(void)
Definition: indexer.cpp:3275
CSeqFeatData::E_Choice GetType(void) const
Definition: indexer.hpp:906
TSeqPos m_End
Definition: indexer.hpp:944
CRef< CSeqVector > m_SeqVec
Definition: indexer.hpp:937
CRef< CFeatureIndex > GetBestParent(void)
Definition: indexer.cpp:3231
CRef< CFeatureIndex > GetBestGene(void)
Definition: indexer.cpp:3204
CSeqFeatData::E_Choice m_Type
Definition: indexer.hpp:940
const CMappedFeat GetMappedFeat(void) const
Definition: indexer.hpp:897
CConstRef< CSeq_loc > m_Fl
Definition: indexer.hpp:936
void SetFetchFailure(bool fails)
Definition: indexer.cpp:3264
TSeqPos m_Start
Definition: indexer.hpp:943
CConstRef< CSeq_loc > GetMappedLocation(void) const
Definition: indexer.hpp:900
CWeakRef< CBioseqIndex > GetBioseqIndex(void) const
Definition: indexer.hpp:903
string GetSequence(void)
Definition: indexer.cpp:3364
CGapIndex(TSeqPos start, TSeqPos end, TSeqPos length, const string &type, const vector< string > &evidence, bool isUnknownLength, bool isAssemblyGap, CBioseqIndex &bsx)
Definition: indexer.cpp:3152
const string & GetSomeNumber(void) const
Definition: Id_pat.cpp:96
CMap_ext –.
Definition: Map_ext.hpp:66
CMappedFeat –.
Definition: mapped_feat.hpp:59
@OrgMod.hpp User-defined methods of the data storage class.
Definition: OrgMod.hpp:54
TTaxId GetTaxId() const
Definition: Org_ref.cpp:72
CPDB_block –.
Definition: PDB_block.hpp:66
CPartialOrgName –.
CRsite_ref –.
Definition: Rsite_ref.hpp:66
CScope –.
Definition: scope.hpp:92
CRef< CSeqMasterIndex > m_Idx
Definition: indexer.hpp:193
bool IsFetchFailure(void)
Definition: indexer.cpp:209
const vector< CRef< CBioseqIndex > > & GetBioseqIndices(void)
Definition: indexer.cpp:155
bool DistributedReferences(void)
Definition: indexer.cpp:167
void SetFeatDepth(int featDepth)
Definition: indexer.cpp:185
const vector< CRef< CSeqsetIndex > > & GetSeqsetIndices(void)
Definition: indexer.cpp:161
CSeqEntryIndex(CSeq_entry_Handle &topseh, EPolicy policy=eAdaptive, TFlags flags=fDefault)
Definition: indexer.cpp:57
int GetGapDepth(void)
Definition: indexer.cpp:203
bool IsIndexFailure(void)
Definition: indexer.cpp:215
FAddSnpFunc * GetSnpFunc(void)
Definition: indexer.cpp:179
void SetGapDepth(int gapDepth)
Definition: indexer.cpp:197
void SetSnpFunc(FAddSnpFunc *snp)
Definition: indexer.cpp:173
CRef< CBioseqIndex > GetBioseqIndex(void)
Definition: indexer.cpp:114
int GetFeatDepth(void)
Definition: indexer.cpp:191
@ eSubtype_bad
These no longer need to match the FEATDEF values in the C toolkit's objfdef.h.
Iterator over CSeqMap.
Definition: seq_map_ci.hpp:252
bool m_DistributedReferences
Definition: indexer.hpp:321
CSeqEntryIndex::TFlags m_Flags
Definition: indexer.hpp:304
const vector< CRef< CSeqsetIndex > > & GetSeqsetIndices(void)
Definition: indexer.cpp:755
CConstRef< CSeq_descr > m_TopDescr
Definition: indexer.hpp:300
FAddSnpFunc * m_SnpFunc
Definition: indexer.hpp:323
const vector< CRef< CBioseqIndex > > & GetBioseqIndices(void)
Definition: indexer.cpp:749
bool DistributedReferences(void) const
Definition: indexer.hpp:265
void x_Initialize(CSeq_entry_Handle &topseh, CSeqEntryIndex::EPolicy policy, CSeqEntryIndex::TFlags flags)
Definition: indexer.cpp:225
CSeqEntryIndex::EPolicy m_Policy
Definition: indexer.hpp:303
bool m_IndexFailure
Definition: indexer.hpp:330
TAccnIndexMap m_AccnIndexMap
Definition: indexer.hpp:310
void x_InitSeqs(const CSeq_entry &sep, CRef< CSeqsetIndex > prnt, int level=0)
Definition: indexer.cpp:514
CConstRef< CSubmit_block > m_SbtBlk
Definition: indexer.hpp:299
FAddSnpFunc * GetSnpFunc(void)
Definition: indexer.cpp:416
bool IsFetchFailure(void)
Definition: indexer.cpp:448
void SetGapDepth(int gapDepth)
Definition: indexer.cpp:434
CRef< CObjectManager > m_Objmgr
Definition: indexer.hpp:294
TBestIdIndexMap m_BestIdIndexMap
Definition: indexer.hpp:314
CAtomicCounter m_Counter
Definition: indexer.hpp:328
int GetGapDepth(void)
Definition: indexer.cpp:440
void SetFeatDepth(int featDepth)
Definition: indexer.cpp:422
int GetFeatDepth(void)
Definition: indexer.cpp:428
vector< CRef< CBioseqIndex > > m_BsxList
Definition: indexer.hpp:306
void SetSnpFunc(FAddSnpFunc *snp)
Definition: indexer.cpp:410
CConstRef< CSeq_entry > m_Tsep
Definition: indexer.hpp:298
void x_Init(void)
Definition: indexer.cpp:630
CRef< CBioseqIndex > GetBioseqIndex(void)
Definition: indexer.cpp:673
CRef< CScope > m_Scope
Definition: indexer.hpp:295
vector< CRef< CSeqsetIndex > > m_SsxList
Definition: indexer.hpp:316
bool m_IsSmallGenomeSet
Definition: indexer.hpp:319
bool IsIndexFailure(void) const
Definition: indexer.hpp:283
CSeq_entry_Handle m_Tseh
Definition: indexer.hpp:296
CRef< feature::CFeatTree > m_FeatTree
Definition: indexer.hpp:301
CSeqVector –.
Definition: seq_vector.hpp:65
@Seq_descr.hpp User-defined methods of the data storage class.
Definition: Seq_descr.hpp:55
CSeq_entry_Handle –.
Definition: Seq_entry.hpp:56
void Parentize(void)
Definition: Seq_entry.cpp:71
CSeq_ext –.
Definition: Seq_ext.hpp:66
CSeq_feat_Handle –.
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
CSeq_loc_Mapper –.
CSeqdesc_CI –.
Definition: seqdesc_ci.hpp:65
CBioseq_set::TClass m_Class
Definition: indexer.hpp:363
CBioseq_set::TClass GetClass(void) const
Definition: indexer.hpp:356
CSeqsetIndex(CBioseq_set_Handle ssh, const CBioseq_set &bssp, CRef< CSeqsetIndex > prnt)
Definition: indexer.cpp:765
TBase::const_iterator const_iterator
Definition: static_set.hpp:828
class CStaticArrayMap<> is an array adaptor that provides an STLish interface to statically-defined a...
Definition: static_map.hpp:105
TBase::const_iterator const_iterator
Definition: static_map.hpp:109
CSubmit_block –.
CTaxElement –.
Definition: TaxElement.hpp:66
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
bool IsUnverifiedMisassembled() const
bool IsUnverifiedOrganism() const
bool IsUnverifiedContaminant() const
bool IsUnverifiedFeature() const
bool IsUnreviewedUnannotated() const
static string TrimMixedContent(const string &str)
Definition: indexer.cpp:3715
vector< string > m_Pair
Definition: indexer.hpp:981
static bool IsStopWord(const string &str)
Definition: indexer.cpp:3588
void PopulateWordPairIndex(string str)
Definition: indexer.cpp:3751
vector< string > m_Norm
Definition: indexer.hpp:980
string x_AddToWordPairIndex(string item, string prev)
Definition: indexer.cpp:3735
static string TrimPunctuation(const string &str)
Definition: indexer.cpp:3595
static string ConvertUTF8ToAscii(const string &str)
Definition: indexer.cpp:3417
const_iterator end() const
Definition: map.hpp:152
const_iterator find(const key_type &key) const
Definition: map.hpp:153
Definition: map.hpp:338
static uch flags
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:61
static int lc
Definition: getdata.c:30
static const char * str(char *buf, int n)
Definition: stats.c:84
static char tmp[3200]
Definition: utf8.c:42
char data[12]
Definition: iconv.c:80
#define FOR_EACH_USERFIELD_ON_USEROBJECT(Itr, Var)
FOR_EACH_USERFIELD_ON_USEROBJECT EDIT_EACH_USERFIELD_ON_USEROBJECT.
#define ZERO_TAX_ID
Definition: ncbimisc.hpp:1115
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
#define NULL
Definition: ncbistd.hpp:225
void Set(TValue new_value) THROWS_NONE
Set atomic counter value.
Definition: ncbicntr.hpp:185
void SetDiagFilter(EDiagFilter what, const char *filter_str)
Set diagnostic filter.
Definition: ncbidiag.cpp:7673
#define ERR_POST_X(err_subcode, message)
Error posting with default error code and given error subcode.
Definition: ncbidiag.hpp:550
@ eDiagFilter_All
for all non-FATAL
Definition: ncbidiag.hpp:2531
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
virtual const char * what(void) const noexcept
Standard report (includes full backlog).
Definition: ncbiexpt.cpp:342
static EAccessionInfo IdentifyAccession(const CTempString &accession, TParseFlags flags=fParse_AnyRaw)
Deduces information from a bare accession a la WHICH_db_accession; may report false negatives on prop...
Definition: Seq_id.cpp:1634
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
Definition: Seq_id.cpp:318
CConstRef< CSeq_id > GetSeqId(void) const
EAccessionInfo
For IdentifyAccession (below)
Definition: Seq_id.hpp:220
string AsString(void) const
static int Score(const CRef< CSeq_id > &id)
Wrappers for use with FindBestChoice from <corelib/ncbiutil.hpp>
Definition: Seq_id.hpp:772
@ fAcc_master
Definition: Seq_id.hpp:256
TSeqPos GetStart(ESeqLocExtremes ext) const
Return start and stop positions of the seq-loc.
Definition: Seq_loc.cpp:915
void SetInt(TInt &v)
Definition: Seq_loc.hpp:983
TSeqPos GetStop(ESeqLocExtremes ext) const
Definition: Seq_loc.cpp:963
CMappedFeat GetBestParentForFeat(const CMappedFeat &feat, CSeqFeatData::ESubtype parent_subtype, CFeatTree *feat_tree=0, const SAnnotSelector *base_sel=0)
Definition: feature.cpp:3462
CMappedFeat GetBestGeneForFeat(const CMappedFeat &feat, CFeatTree *feat_tree=0, const SAnnotSelector *base_sel=0, CFeatTree::EBestGeneType lookup_type=CFeatTree::eBestGene_TreeOnly)
Definition: feature.cpp:3443
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
vector< string > gap_linkage_evidences
A vector representing the linkage-evidences of the gap.
Definition: sequence.hpp:871
string gap_type
String representing the gap type.
Definition: sequence.hpp:868
static void GetGapModText(const CSeq_gap &seq_gap, SGapModText &out_gap_mod_text)
Given a CSeq_gap object, this outputs the Gap information.
Definition: sequence.cpp:3483
CRef< CSeq_loc > Map(const CSeq_loc &src_loc)
Map seq-loc.
CSeq_loc_Mapper_Base & TruncateNonmappingRanges(void)
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CSeq_entry_Handle AddTopLevelSeqEntry(CSeq_entry &top_entry, TPriority pri=kPriority_Default, EExist action=eExist_Default)
Add seq_entry, default priority is higher than for defaults or loaders. Add object to the scope with p...
Definition: scope.cpp:522
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
void AddDefaults(TPriority pri=kPriority_Default)
Add default data loaders from object manager.
Definition: scope.cpp:504
CBioseq_set_Handle GetBioseq_setHandle(const CBioseq_set &seqset, EMissing action=eMissing_Default)
Definition: scope.cpp:176
void SetFuzzOption(TFuzzOption newOption)
vector< CSeq_id_Handle > TId
const CTSE_Handle & GetTSE_Handle(void) const
Get CTSE_Handle of containing TSE.
TClass GetClass(void) const
const TInst_Ext & GetInst_Ext(void) const
bool IsSetInst_Ext(void) const
const CSeqFeatData & GetData(void) const
TSeqPos GetBioseqLength(void) const
bool IsAa(void) const
CConstRef< CSeq_id > GetSeqId(void) const
Get id which can be used to access this bioseq handle Throws an exception if none is available.
bool IsSetInst_Length(void) const
TInst_Topology GetInst_Topology(void) const
CSeq_entry_Handle GetTopLevelEntry(void) const
Get top level Seq-entry handle.
TInst_Length GetInst_Length(void) const
bool IsSetInst(void) const
bool IsSetInst_Repr(void) const
bool IsSetClass(void) const
CConstRef< CSeq_entry > GetCompleteSeq_entry(void) const
Complete and get const reference to the seq-entry.
TInst_Repr GetInst_Repr(void) const
CScope & GetScope(void) const
Get scope this handle belongs to.
bool IsNa(void) const
bool IsSetInst_Topology(void) const
CSeq_entry_Handle GetTopLevelEntry(void) const
Get top level Seq-entry handle.
const TId & GetId(void) const
bool IsSetData(void) const
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
SSeqMapSelector & SetResolveCount(size_t res_cnt)
Set max depth of resolving seq-map.
Definition: seq_map_ci.hpp:151
SAnnotSelector & ExcludeFeatSubtype(TFeatSubtype subtype)
Exclude feature subtype from the search.
CSeq_id_Handle GetProductId(void) const
SAnnotSelector & SetResolveAll(void)
SetResolveAll() is equivalent to SetResolveMethod(eResolve_All).
SAnnotSelector & ExcludeFeatType(TFeatType type)
Exclude feature type from the search.
SAnnotSelector & SetMaxSearchSegmentsAction(EMaxSearchSegmentsAction action)
const CSeq_loc & GetLocation(void) const
SSeqMapSelector & SetFlags(TFlags flags)
Select segment type(s)
Definition: seq_map_ci.hpp:179
SAnnotSelector & SetAdaptiveDepth(bool value=true)
SetAdaptiveDepth() requests to restrict subsegment resolution depending on annotations found on lower...
SAnnotSelector & SetResolveDepth(int depth)
SetResolveDepth sets the limit of subsegment resolution in searching annotations.
SAnnotSelector & SetFeatComparator(IFeatComparator *comparator)
SAnnotSelector & IncludeNamedAnnotAccession(const string &acc, int zoom_level=0)
const CSeq_feat_Handle & GetSeq_feat_Handle(void) const
Get original feature handle.
Definition: mapped_feat.hpp:71
SAnnotSelector & SetExcludeExternal(bool exclude=true)
External annotations for the Object Manager are annotations located in top level Seq-entry different f...
SAnnotSelector & SetMaxSearchTime(TMaxSearchTime max_time)
Set maximum time (in seconds) to search before giving up.
SAnnotSelector & SetFailUnresolved(void)
SAnnotSelector & ExcludeNamedAnnots(const CAnnotName &name)
Add named annot to set of annots names to exclude.
SAnnotSelector & SetIgnoreStrand(bool value=true)
Ignore strand when testing for range overlap.
SAnnotSelector & SetMaxSearchSegments(TMaxSearchSegments max_segments)
Set maximum number of empty segments to search before giving up.
SAnnotSelector & ExcludeNamedAnnotAccession(const string &acc)
CSeq_id_Handle GetLocationId(void) const
bool CanGetRange(TSeqPos start, TSeqPos stop) const
Check if the sequence data is available for the interval [start, stop).
Definition: seq_vector.cpp:292
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
Definition: seq_vector.cpp:304
TSeqPos size(void) const
Definition: seq_vector.hpp:291
void SetCoding(TCoding coding)
@ fFindGap
Definition: seq_map.hpp:130
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:1439
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
TRefType Lock(void) const
Lock the object and return reference to it.
Definition: ncbiobj.hpp:2713
bool NotEmpty(void) const THROWS_NONE
Check if CConstRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:1392
#define kMax_Int
Definition: ncbi_limits.h:184
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3452
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2984
#define NPOS
Definition: ncbistr.hpp:133
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2882
bool empty(void) const
Return true if the represented string is empty (i.e., the length is zero)
Definition: tempstr.hpp:334
void clear(void)
Clears the string.
Definition: tempstr.hpp:351
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5355
static string TruncateSpaces(const string &str, ETrunc where=eTrunc_Both)
Truncate whitespace in a string.
Definition: ncbistr.cpp:3177
const char *const kEmptyCStr
Empty "C" string (points to a '\0').
Definition: ncbistr.cpp:68
static string & ToLower(string &str)
Convert string to lower case – string& version.
Definition: ncbistr.cpp:405
@ fSplit_Tokenize
All delimiters are merged and trimmed, to get non-empty tokens only.
Definition: ncbistr.hpp:2510
const TKeywords & GetKeywords(void) const
Get the Keywords member data.
bool IsSetKeywords(void) const
Check if a value has been assigned to Keywords data member.
const TKeywords & GetKeywords(void) const
Get the Keywords member data.
Definition: GB_block_.hpp:526
bool IsSetKeywords(void) const
Check if a value has been assigned to Keywords data member.
Definition: GB_block_.hpp:514
const TCountry & GetCountry(void) const
Get the Country member data.
Definition: Id_pat_.hpp:478
TGenome GetGenome(void) const
Get the Genome member data.
Definition: BioSource_.hpp:422
bool CanGetOrg(void) const
Check if it is safe to call GetOrg method.
Definition: BioSource_.hpp:503
const TOrg & GetOrg(void) const
Get the Org member data.
Definition: BioSource_.hpp:509
bool IsSetGenome(void) const
Check if a value has been assigned to Genome data member.
Definition: BioSource_.hpp:397
const TName & GetName(void) const
Get the Name member data.
Definition: SubSource_.hpp:350
bool IsSetName(void) const
Check if a value has been assigned to Name data member.
Definition: SubSource_.hpp:338
const TStr & GetStr(void) const
Get the variant data.
bool IsStr(void) const
Check if variant Str is selected.
Definition: Object_id_.hpp:291
bool IsSetType(void) const
type of object within class Check if a value has been assigned to Type data member.
const TTag & GetTag(void) const
Get the Tag member data.
Definition: Dbtag_.hpp:267
bool IsId(void) const
Check if variant Id is selected.
Definition: Object_id_.hpp:264
const TData & GetData(void) const
Get the Data member data.
bool IsSetTag(void) const
appropriate tag Check if a value has been assigned to Tag data member.
Definition: Dbtag_.hpp:255
bool IsStr(void) const
Check if variant Str is selected.
bool IsSetLabel(void) const
field label Check if a value has been assigned to Label data member.
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
const TLabel & GetLabel(void) const
Get the Label member data.
const TType & GetType(void) const
Get the Type member data.
bool IsSetData(void) const
Check if a value has been assigned to Data data member.
TId GetId(void) const
Get the variant data.
Definition: Object_id_.hpp:270
const TLineage & GetLineage(void) const
Get the Lineage member data.
Definition: OrgName_.hpp:864
TFixed_level GetFixed_level(void) const
Get the Fixed_level member data.
const TSubname & GetSubname(void) const
Get the Subname member data.
Definition: OrgMod_.hpp:347
bool IsSetFixed_level(void) const
Check if a value has been assigned to Fixed_level data member.
bool IsPartial(void) const
Check if variant Partial is selected.
Definition: OrgName_.hpp:753
const TName & GetName(void) const
Get the Name member data.
Definition: OrgName_.hpp:771
const TLevel & GetLevel(void) const
Get the Level member data.
bool IsSetName(void) const
Check if a value has been assigned to Name data member.
const TBinomial & GetBinomial(void) const
Get the variant data.
Definition: OrgName_.cpp:121
bool CanGetLineage(void) const
Check if it is safe to call GetLineage method.
Definition: OrgName_.hpp:858
const Tdata & Get(void) const
Get the member data.
const TName & GetName(void) const
Get the Name member data.
bool IsSetGenus(void) const
required Check if a value has been assigned to Genus data member.
const TSpecies & GetSpecies(void) const
Get the Species member data.
list< CRef< CTaxElement > > Tdata
bool IsSetSubname(void) const
Check if a value has been assigned to Subname data member.
Definition: OrgMod_.hpp:335
bool IsSetLevel(void) const
Check if a value has been assigned to Level data member.
const TGenus & GetGenus(void) const
Get the Genus member data.
const TPartial & GetPartial(void) const
Get the variant data.
Definition: OrgName_.cpp:193
bool IsSet(void) const
Check if a value has been assigned to data member.
bool IsSetSpecies(void) const
species required if subspecies used Check if a value has been assigned to Species data member.
bool IsSetName(void) const
Check if a value has been assigned to Name data member.
Definition: OrgName_.hpp:759
bool IsBinomial(void) const
Check if variant Binomial is selected.
Definition: OrgName_.hpp:715
EProcessed
processing status
Definition: Prot_ref_.hpp:95
TProcessed GetProcessed(void) const
Get the Processed member data.
Definition: Prot_ref_.hpp:538
bool IsSetProcessed(void) const
Check if a value has been assigned to Processed data member.
Definition: Prot_ref_.hpp:513
bool IsStr(void) const
Check if variant Str is selected.
Definition: Rsite_ref_.hpp:264
const TStr & GetStr(void) const
Get the variant data.
Definition: Rsite_ref_.hpp:270
E_Choice
Choice variants.
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_feat_.hpp:925
const TBiosrc & GetBiosrc(void) const
Get the variant data.
const TProt & GetProt(void) const
Get the variant data.
@ e_Region
named region (globin locus)
@ e_Pub
publication applies to this seq
@ e_Comment
just a comment
TChain GetChain(void) const
Get the Chain member data.
bool IsSetChain_id(void) const
chain identifier; length-independent generalization of 'chain' Check if a value has been assigned to ...
bool IsSetChain(void) const
Deprecated: 'chain' can't support multiple character PDB chain identifiers (introduced in 2015).
bool IsSetAccession(void) const
Check if a value has been assigned to Accession data member.
bool IsSetCit(void) const
patent citation Check if a value has been assigned to Cit data member.
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_loc_.hpp:475
TSeqid GetSeqid(void) const
Get the Seqid member data.
E_Choice
Choice variants.
Definition: Seq_id_.hpp:93
const TChain_id & GetChain_id(void) const
Get the Chain_id member data.
bool IsNull(void) const
Check if variant Null is selected.
Definition: Seq_loc_.hpp:504
bool IsSetSeqid(void) const
number of sequence in patent Check if a value has been assigned to Seqid data member.
const TCit & GetCit(void) const
Get the Cit member data.
const TAccession & GetAccession(void) const
Get the Accession member data.
@ e_Other
for historical reasons, 'other' = 'refseq'
Definition: Seq_id_.hpp:104
@ e_Gpipe
Internal NCBI genome pipeline processing ID.
Definition: Seq_id_.hpp:113
@ e_Tpe
Third Party Annot/Seq EMBL.
Definition: Seq_id_.hpp:111
@ e_Tpd
Third Party Annot/Seq DDBJ.
Definition: Seq_id_.hpp:112
@ e_General
for other databases
Definition: Seq_id_.hpp:105
@ e_Ddbj
DDBJ.
Definition: Seq_id_.hpp:107
@ e_Gi
GenInfo Integrated Database.
Definition: Seq_id_.hpp:106
@ e_Tpg
Third Party Annot/Seq Genbank.
Definition: Seq_id_.hpp:110
@ e_Local
local use
Definition: Seq_id_.hpp:95
@ e_Equiv
equivalent sets of locations
Definition: Seq_loc_.hpp:106
const TSeq & GetSeq(void) const
Get the variant data.
Definition: Seq_entry_.cpp:102
const TDescr & GetDescr(void) const
Get the Descr member data.
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
const TSet & GetSet(void) const
Get the variant data.
Definition: Seq_entry_.cpp:124
bool IsSeq(void) const
Check if variant Seq is selected.
Definition: Seq_entry_.hpp:257
bool IsSetAnnot(void) const
Check if a value has been assigned to Annot data member.
bool IsSetDescr(void) const
Check if a value has been assigned to Descr data member.
bool IsSet(void) const
Check if variant Set is selected.
Definition: Seq_entry_.hpp:263
const TSeq_set & GetSeq_set(void) const
Get the Seq_set member data.
const TAnnot & GetAnnot(void) const
Get the Annot member data.
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
@ eClass_small_genome_set
viral segments or mitochondrial minicircles
bool IsMap(void) const
Check if variant Map is selected.
Definition: Seq_ext_.hpp:330
const TUser & GetUser(void) const
Get the variant data.
Definition: Seqdesc_.cpp:384
ERepr
representation class
Definition: Seq_inst_.hpp:91
const TGap & GetGap(void) const
Get the variant data.
Definition: Seq_data_.cpp:184
bool IsSetAnnot(void) const
Check if a value has been assigned to Annot data member.
Definition: Bioseq_.hpp:354
const Tdata & Get(void) const
Get the member data.
Definition: Map_ext_.hpp:164
const TTitle & GetTitle(void) const
Get the variant data.
Definition: Seqdesc_.hpp:1032
const TSource & GetSource(void) const
Get the variant data.
Definition: Seqdesc_.cpp:566
const TMap & GetMap(void) const
Get the variant data.
Definition: Seq_ext_.cpp:158
const TAnnot & GetAnnot(void) const
Get the Annot member data.
Definition: Bioseq_.hpp:366
TTech GetTech(void) const
Get the Tech member data.
Definition: MolInfo_.hpp:497
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
list< CRef< CSeq_feat > > Tdata
Definition: Map_ext_.hpp:89
bool IsDelta(void) const
Check if variant Delta is selected.
Definition: Seq_ext_.hpp:336
bool IsSetDescr(void) const
descriptors Check if a value has been assigned to Descr data member.
Definition: Bioseq_.hpp:303
TBiomol GetBiomol(void) const
Get the Biomol member data.
Definition: MolInfo_.hpp:447
const TDelta & GetDelta(void) const
Get the variant data.
Definition: Seq_ext_.cpp:180
TCompleteness GetCompleteness(void) const
Get the Completeness member data.
Definition: MolInfo_.hpp:594
const Tdata & Get(void) const
Get the member data.
Definition: Delta_ext_.hpp:164
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seqdesc_.hpp:903
list< CRef< CDelta_seq > > Tdata
Definition: Delta_ext_.hpp:89
bool IsGap(void) const
Check if variant Gap is selected.
Definition: Seq_data_.hpp:704
const TDescr & GetDescr(void) const
Get the Descr member data.
Definition: Bioseq_.hpp:315
bool IsSet(void) const
Check if a value has been assigned to data member.
Definition: Map_ext_.hpp:152
const TComment & GetComment(void) const
Get the variant data.
Definition: Seqdesc_.hpp:1058
const TMolinfo & GetMolinfo(void) const
Get the variant data.
Definition: Seqdesc_.cpp:588
@ eRepr_delta
sequence made by changes (delta) to others
Definition: Seq_inst_.hpp:100
@ eRepr_map
ordered map of any kind
Definition: Seq_inst_.hpp:99
@ eRepr_virtual
no seq data
Definition: Seq_inst_.hpp:93
@ eCompleteness_unknown
Definition: MolInfo_.hpp:155
@ e_Ncbieaa
extended ASCII 1 letter aa codes
Definition: Seq_data_.hpp:111
@ e_Embl
EMBL specific information.
Definition: Seqdesc_.hpp:127
@ e_User
user defined object
Definition: Seqdesc_.hpp:124
@ e_Pub
a reference to the publication
Definition: Seqdesc_.hpp:122
@ e_Genbank
GenBank specific info.
Definition: Seqdesc_.hpp:121
@ e_Comment
a more extensive comment
Definition: Seqdesc_.hpp:117
@ e_Molinfo
info on the molecule and techniques
Definition: Seqdesc_.hpp:134
@ e_Title
a title for this sequence
Definition: Seqdesc_.hpp:115
@ e_Pdb
PDB specific information.
Definition: Seqdesc_.hpp:131
@ e_Source
source of materials, includes Org-ref
Definition: Seqdesc_.hpp:133
const TEntrys & GetEntrys(void) const
Get the variant data.
const TData & GetData(void) const
Get the Data member data.
const TSub & GetSub(void) const
Get the Sub member data.
bool IsEntrys(void) const
Check if variant Entrys is selected.
bool CanGetSub(void) const
Check if it is safe to call GetSub method.
bool CanGetData(void) const
Check if it is safe to call GetData method.
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
const SUnicodeTranslation * UnicodeToAscii(TUnicode character, const TUnicodeTable *table=NULL, const SUnicodeTranslation *default_translation=NULL)
Convert Unicode character into ASCII string.
Definition: unicode.cpp:324
size_t UTF8ToUnicode(const char *utf, TUnicode *unicode)
Convert UTF8 into Unicode character.
Definition: unicode.cpp:382
unsigned int TUnicode
Definition: unicode.hpp:77
@ eSkip
Unicode to be skipped in translation. Usually it is a combining mark.
Definition: unicode.hpp:52
Definition of all error codes used in objmgr libraries (xobjmgr.lib, xobjutil.lib and others).
CStaticArraySet< const char *, PCase_CStr > TStopWords
Definition: indexer.cpp:3585
static const char *const mixedTags[]
Definition: indexer.cpp:3648
NCBI_DEFINE_ERR_SUBCODE_X(11)
static bool s_BlankOrNotSpecialTaxname(string taxname)
Definition: indexer.cpp:1282
CStaticPairArrayMap< utf8::TUnicode, char > TExtraTranslations
Definition: indexer.cpp:3379
static int SkipMixedContent(const char *ptr)
Definition: indexer.cpp:3696
static const TExtraTranslationPair kExtraTranslations[]
Definition: indexer.cpp:3380
static CSeq_id_Handle s_IdxFindBestIdChoice(const CBioseq_Handle::TId &ids)
Definition: indexer.cpp:468
static const char * x_OrganelleName(TBIOSOURCE_GENOME genome, bool has_plasmid, bool virus_or_phage, bool wgs_suffix)
Definition: indexer.cpp:1176
static string s_IdxGetBestIdString(CBioseq_Handle bsh)
Definition: indexer.cpp:497
DEFINE_STATIC_ARRAY_MAP(TExtraTranslations, sc_ExtraTranslations, kExtraTranslations)
SStaticPair< utf8::TUnicode, char > TExtraTranslationPair
Definition: indexer.cpp:3378
static int s_IdxSeqIdHandle(const CSeq_id_Handle &idh)
Definition: indexer.cpp:460
static const char *const idxStopWords[]
Definition: indexer.cpp:3448
void(* FAddSnpFunc)(CBioseq_Handle bsh, string &na_acc)
Definition: indexer.hpp:61
int i
yy_size_t n
int len
constexpr auto sort(_Init &&init)
const char * tag
T max(T x_, T y_)
static uint8_t * buffer
Definition: pcre2test.c:1016
#define NCBI_TECH(Type)
Definition: seq_macros.hpp:118
#define NCBI_SEQTOPOLOGY(Type)
Definition: seq_macros.hpp:66
#define FOR_EACH_COMPOUND_ON_PDBBLOCK(Itr, Var)
FOR_EACH_COMPOUND_ON_PDBBLOCK EDIT_EACH_COMPOUND_ON_PDBBLOCK.
#define NCBI_GENOME(Type)
@NAME Convenience macros for NCBI objects
#define SWITCH_ON_SUBSOURCE_CHOICE(Var)
SWITCH_ON_SUBSOURCE_CHOICE.
#define NCBI_ORGMOD(Type)
COrgMod definitions.
#define FOR_EACH_ORGMOD_ON_BIOSOURCE(Itr, Var)
FOR_EACH_ORGMOD_ON_BIOSOURCE EDIT_EACH_ORGMOD_ON_BIOSOURCE.
#define SWITCH_ON_ORGMOD_CHOICE(Var)
SWITCH_ON_ORGMOD_CHOICE.
#define FOR_EACH_SUBSOURCE_ON_BIOSOURCE(Itr, Var)
FOR_EACH_SUBSOURCE_ON_BIOSOURCE EDIT_EACH_SUBSOURCE_ON_BIOSOURCE.
#define NCBI_SUBSOURCE(Type)
CSubSource definitions.
CBioSource::TGenome TBIOSOURCE_GENOME
CSeq_id::EAccessionInfo TACCN_CHOICE
#define NCBI_SEQID(Type)
@NAME Convenience macros for NCBI objects
#define NCBI_ACCN(Type)
#define FIELD_IS_SET_AND_IS(Var, Fld, Chs)
FIELD_IS_SET_AND_IS base macro.
#define FOR_EACH_STRING_IN_LIST(Itr, Var)
FOR_EACH_STRING_IN_LIST EDIT_EACH_STRING_IN_LIST.
#define GET_FIELD(Var, Fld)
GET_FIELD base macro.
This indicates the text of the modifiers of a gap.
Definition: sequence.hpp:865
SAnnotSelector –.
Selector used in CSeqMap methods returning iterators.
Definition: seq_map_ci.hpp:113
Template structure SStaticPair is a simplified replacement of STL pair<>. Main reason of introducing this...
Definition: static_set.hpp:60
Definition: type.c:6
#define _ASSERT
else result
Definition: token2.c:20
Modified on Wed Sep 04 15:05:40 2024 by modify_doxy.py rev. 669887