NCBI C++ ToolKit
seqdboidlist.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: seqdboidlist.cpp 100101 2023-06-15 14:10:29Z merezhuk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Kevin Bealer
27  *
28  */
29 
30 /// @file seqdboidlist.cpp
31 /// Implementation for the CSeqDBOIDList class, an array of bits
32 /// describing a subset of the virtual oid space.
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistr.hpp>
35 #include "seqdboidlist.hpp"
36 #include "seqdbfilter.hpp"
38 #include "seqdbgilistset.hpp"
39 #include <algorithm>
40 
42 
// CSeqDBOIDList constructor (parameter list, continued).
// NOTE(review): the line that opens this signature is absent from this
// listing; it must declare the `atlas` argument used in the initializers.
44  const CSeqDBVolSet & volset,
45  CSeqDB_FilterTree & filters,
46  CRef<CSeqDBGiList> & gi_list,
47  CRef<CSeqDBNegativeList> & neg_list,
48  CSeqDBLockHold & locked,
49  const CSeqDBLMDBSet & lmdb_set)
50  : m_Atlas (atlas),
51  m_Lease (atlas),
52  m_NumOIDs (0)
53 {
// Precondition: at least one of a positive list, a negative list, or an
// alias-file filter must be present.
54  _ASSERT(gi_list.NotEmpty() || neg_list.NotEmpty() || filters.HasFilter());
// All of the bitmap construction is delegated to x_Setup().
55  x_Setup( volset, filters, gi_list, neg_list, locked, lmdb_set);
56 }
57 
59 {
60 }
61 
62 // The general rule I am following in these methods is to use byte
63 // computations except during actual looping.
64 
// x_Setup (parameter list, continued): build the OID mask in memory.
// Sets m_NumOIDs from the volume set, unions the per-volume filter bitmaps
// into m_AllBits, applies the LMDB-based filters for version-5 databases,
// then the user positive and negative lists, and finally trims trailing
// unset OIDs from m_NumOIDs.
// NOTE(review): the first signature line (declaring `volset`) and listing
// line 82 are absent here; m_AllBits is presumably allocated on line 82 --
// confirm against the real source.
66  CSeqDB_FilterTree & filters,
67  CRef<CSeqDBGiList> & gi_list,
68  CRef<CSeqDBNegativeList> & neg_list,
69  CSeqDBLockHold & locked,
70  const CSeqDBLMDBSet & lmdb_set)
71 {
72  // First, get the memory space for the OID bitmap and clear it.
73 
74  // Pad memory space to word boundary, add 8 bytes for "insurance". Some
75  // of the algorithms here need to do bit shifting and OR half of a source
76  // element into this destination element, and the other half into this
77  // other destination element. Rather than sprinkle this code with range
78  // checks, padding is used.
79 
80  m_NumOIDs = volset.GetNumOIDs();
81 
83 
84  CSeqDBGiListSet gi_list_set(m_Atlas,
85  volset,
86  gi_list,
87  neg_list,
88  locked,
89  lmdb_set);
90  // Then get the list of filenames and offsets to overlay onto it.
91 
// Union each volume's filter bitmap into the bitmap covering all volumes.
92  for(int i = 0; i < volset.GetNumVols(); i++) {
93  const CSeqDBVolEntry * v1 = volset.GetVolEntry(i);
94 
95  CRef<CSeqDB_BitSet> vol_bits =
96  x_ComputeFilters(filters, *v1, gi_list_set, locked, lmdb_set.IsBlastDBVersion5());
97 
98  m_AllBits->UnionWith(*vol_bits, true);
99  }
100 
// Version-5 databases with alias filters: start from a bitmap with every
// bit in [0, m_NumOIDs) set, let the LMDB-based x_ComputeFilters overload
// adjust it, and intersect only when that overload reports (returns true)
// that it found at least one seq-id or tax-id filter file.
101  if (lmdb_set.IsBlastDBVersion5() && filters.HasFilter()) {
102  CSeqDB_BitSet f_bits(0, m_NumOIDs);
103  f_bits.AssignBitRange(0, m_NumOIDs, true);
104  if(x_ComputeFilters(volset, filters, lmdb_set, f_bits, gi_list, neg_list)) {
105  m_AllBits->IntersectWith(f_bits, true);
106  }
107  }
108 
// User-supplied positive and negative lists are applied after the alias
// filters.
109  if (gi_list.NotEmpty()) {
110  x_ApplyUserGiList(*gi_list);
111  }
112  if (neg_list.NotEmpty()) {
113  x_ApplyNegativeList(*neg_list, lmdb_set.IsBlastDBVersion5());
114  }
115 
// Shrink m_NumOIDs until its last OID is set (or it reaches zero).
116  while(m_NumOIDs && (! x_IsSet(m_NumOIDs - 1))) {
117  -- m_NumOIDs;
118  }
119  LOG_POST(Info << "Num Of Oids: " << m_NumOIDs);
120 }
121 
// x_ComputeFilters (per-volume overload; parameter list, continued):
// compute the OID mask bitset for one database volume.
// The filter tree is specialized for the volume's name, a bitmap for the
// node's children (or the volume itself) is built, and the result is
// intersected with this node's own filters before being returned.
// NOTE(review): several lines of this function are absent from this listing
// (the signature opening that declares `filters`, the last constructor
// argument of each CSeqDB_BitSet below, the declaration of `f`, the `case`
// labels of the switch, and listing line 277). Comments here describe only
// what the visible lines show.
124  const CSeqDBVolEntry & vol,
125  CSeqDBGiListSet & gis,
126  CSeqDBLockHold & locked,
127  bool isBlastDBv5)
128 
129 {
130  const string & vn = vol.Vol()->GetVolName();
131  CRef<CSeqDB_FilterTree> ft = filters.Specialize(vn);
132 
133  int vol_start = vol.OIDStart();
134  int vol_end = vol.OIDEnd();
135 
136  CRef<CSeqDB_BitSet> volume_map;
137 
138  // Step 1: Compute the bitmap representing the filtering done by
139  // all subnodes. This is a "union".
140 
141  int vols = static_cast<int>(ft->GetVolumes().size());
142 
143  _ASSERT(vols || ft->GetNodes().size());
144 
145  if (vols > 0) {
146  // This filter tree is filtered by volume name, so all nodes
147  // below this point can be ignored if this node contains a
148  // volume. This volume will be ORred with those nodes,
149  // flushing them to all "1"s anyway (at least until this
150  // node's filtering is applied.)
151 
152  // This loop really just verifies that specialization was done
153  // properly in the case where there are multiple volume names
154  // (which must be the same).
155 
156  for(int j = 1; j < vols; j++) {
157  _ASSERT(ft->GetVolumes()[j] == ft->GetVolumes()[0]);
158  }
159 
160  volume_map.Reset(new CSeqDB_BitSet(vol_start,
161  vol_end,
163  } else {
164  // Since this node did not have a volume, we OR together all
165  // of its subnodes.
166 
167  volume_map.Reset(new CSeqDB_BitSet(vol_start,
168  vol_end,
170 
// Recurse into every child node and union its bitmap into volume_map.
171  ITERATE(vector< CRef< CSeqDB_FilterTree > >, sub, ft->GetNodes()) {
172  CRef<CSeqDB_BitSet> sub_bits =
173  x_ComputeFilters(**sub, vol, gis, locked, isBlastDBv5);
174 
175  volume_map->UnionWith(*sub_bits, true);
176  }
177  }
178 
179  // Now we apply this level's filtering. The first question is, is
180  // it appropriate for a node to use multiple filtering mechanisms
181  // (GI list, OID list, or OID range), either of the same or
182  // different types? The second question is how are multiply
183  // filtered nodes interpreted?
184 
185  // The SeqDB unit tests assume that multiple filters at a given
186  // level are ANDed together. The unit tests assume this for the
187  // case of combining OID masks and OID ranges, but in the absence
188  // of another motivating example, I'll assume it means ANDing of
189  // all such mechanisms.
190 
191  CRef<CSeqDB_BitSet> filter(new CSeqDB_BitSet(vol_start,
192  vol_end,
194 
195  // First, apply any 'range' filters, because they can be combined
196  // very efficiently.
197 
198  typedef CSeqDB_FilterTree::TFilters TFilters;
199 
200  ITERATE(TFilters, range, ft->GetFilters()) {
201  const CSeqDB_AliasMask & mask = **range;
202 
203  if (mask.GetType() == CSeqDB_AliasMask::eOidRange) {
204  CSeqDB_BitSet r2(mask.GetBegin(),
205  mask.GetEnd(),
207  filter->IntersectWith(r2, true);
208  } else if (mask.GetType() == CSeqDB_AliasMask::eMemBit) {
209  // TODO, adding vol-specific OR and AND
210  vol.Vol()->SetMemBit(mask.GetMemBit());
211  // No filter->IntersectWith here since
212  // MEMBIT can not be done at OID level, therefore,
213  // we delegate to seqdbvol (in x_GetFilteredHeader())
214  // for further process.
215  }
216  }
217 
// Second pass: the remaining mask types. OID ranges and MEMBIT were handled
// in the loop above; tax-id lists, and seq-id lists on version-5 databases,
// are skipped by this loop.
218  ITERATE(TFilters, filt, ft->GetFilters()) {
219  const CSeqDB_AliasMask & mask = **filt;
220 
221  if ((mask.GetType() == CSeqDB_AliasMask::eOidRange)
222  || (mask.GetType() == CSeqDB_AliasMask::eMemBit)
223  || (isBlastDBv5 && (mask.GetType() == CSeqDB_AliasMask::eSiList))
224  || (mask.GetType() == CSeqDB_AliasMask::eTaxIdList)) {
225  continue;
226  }
227 
229  CRef<CSeqDBGiList> idlist;
// Each visible branch produces a bitset `f` for this mask, either from an
// OID mask file (x_GetOidMask) or from an ID list (x_IdsToBitSet).
230  switch(mask.GetType()) {
232  f = x_GetOidMask(mask.GetPath(), vol_start, vol_end);
233  vol.Vol()->SetOidMaskType(mask.GetOidMaskType());
234  break;
235 
237  idlist = gis.GetNodeIdList(mask.GetPath(),
238  vol.Vol(),
240  locked);
241  f = x_IdsToBitSet(*idlist, vol_start, vol_end);
242  break;
243 
245  idlist = gis.GetNodeIdList(mask.GetPath(),
246  vol.Vol(),
248  locked);
249  f = x_IdsToBitSet(*idlist, vol_start, vol_end);
250  break;
251 
253  idlist = gis.GetNodeIdList(mask.GetPath(),
254  vol.Vol(),
256  locked);
257  f = x_IdsToBitSet(*idlist, vol_start, vol_end);
258  break;
259 
263 
264  // these should have been handled in the previous loop.
265  break;
266  }
267 
268  filter->IntersectWith(*f, true);
269  }
270 
// When user mask options are set, the volume's "exclude model" OID mask
// file is also intersected into the filter; a missing file is an error.
// NOTE(review): listing line 277 is absent and the extra closing brace below
// suggests it opened a nested condition -- confirm.
271  if (gis.GetUserMaskOpts()) {
272 
273  const CSeqDBVol * v = vol.Vol();
274  bool is_protein= v->GetSeqType() == 'p' ? true : false;
275  string p_str = v->GetVolName()+ ".";
276 
278  string path_str(p_str + SeqDB_GetOidMaskFileExt(is_protein, EOidMaskType::fExcludeModel));
279  CSeqDB_Path mask_path(path_str);
280  CFile check_file(path_str);
281  if (!check_file.Exists()) {
282  NCBI_THROW(CSeqDBException, eArgErr, "Oid Mask file not found");
283  }
284 
285  CRef<CSeqDB_BitSet> m = x_GetOidMask( mask_path,vol_start,vol_end);
286  filter->IntersectWith (*m,true);
287  }
288  }
289 
// The node's own filter restricts whatever the children / volume provided.
290  volume_map->IntersectWith(*filter, true);
291  return volume_map;
292 }
293 
295 
// x_ApplyUserGiList body: apply a user ID list (`gis`) to the OID bitmap.
// OIDs resolved from the list's GIs, Seq-ids, TIs and PIGs are collected
// into one bitset that is intersected with m_AllBits; OIDs obtained from the
// list's tax-id lookup are intersected separately.
// NOTE(review): the signature line and listing line 300 are absent from this
// listing.
296 {
297  //m_Atlas.Lock(locked);
298 
// An empty list selects nothing, so the OID count drops to zero.
// NOTE(review): the absent line 300 presumably clears the bitmap -- confirm.
299  if (gis.Empty()) {
301  m_NumOIDs = 0;
302  return;
303  }
304 
305  // This is the trivial way to 'sort' OIDs; build a bit vector
306  // spanning the OID range, turn on the bit indexed by each
307  // included OID, and then scan the vector sequentially. This
308  // technique also uniqifies the set, which is desirable here.
309 
310 
311  int j = 0;
312 
313  if (gis.GetNumGis() || gis.GetNumSis() || gis.GetNumTis() || gis.GetNumPigs()){
314  CRef<CSeqDB_BitSet> gilist_oids(new CSeqDB_BitSet(0, m_NumOIDs));
// In each loop below, an OID of -1 (unknown) or one at or beyond m_NumOIDs
// is ignored.
315  if (gis.GetNumGis()) {
316  for(j = 0; j < gis.GetNumGis(); j++) {
317  int oid = gis.GetGiOid(j).oid;
318  if ((oid != -1) && (oid < m_NumOIDs)) {
319  gilist_oids->SetBit(oid);
320  }
321  }
322  }
323 
324  if(gis.GetNumSis()) {
325  for(j = 0; j < gis.GetNumSis(); j++) {
326  int oid = gis.GetSiOid(j).oid;
327  if ((oid != -1) && (oid < m_NumOIDs)) {
328  gilist_oids->SetBit(oid);
329  }
330  }
331  }
332 
333  if(gis.GetNumTis()) {
334  for(j = 0; j < gis.GetNumTis(); j++) {
335  int oid = gis.GetTiOid(j).oid;
336  if ((oid != -1) && (oid < m_NumOIDs)) {
337  gilist_oids->SetBit(oid);
338  }
339  }
340  }
341 
342  if(gis.GetNumPigs()) {
343  for(j = 0; j < gis.GetNumPigs(); j++) {
344  int oid = gis.GetPigOid(j).oid;
345  if ((oid != -1) && (oid < m_NumOIDs)) {
346  gilist_oids->SetBit(oid);
347  }
348  }
349  }
350  m_AllBits->IntersectWith(*gilist_oids, true);
351  }
// OIDs from the tax-id list form a second, independent intersection. Only
// the upper bound is checked for these OIDs.
352  const vector<blastdb::TOid> & oids_tax = gis.GetOidsForTaxIdsList();
353  if(oids_tax.size()) {
354  CRef<CSeqDB_BitSet> taxlist_oids(new CSeqDB_BitSet(0, m_NumOIDs));
355  for(unsigned int k = 0; k < oids_tax.size(); k++) {
356  if (oids_tax[k] < m_NumOIDs) {
357  taxlist_oids->SetBit(oids_tax[k]);
358  }
359  }
360  m_AllBits->IntersectWith(*taxlist_oids, true);
361  }
362 
363 }
364 
366 
// x_ApplyNegativeList body: apply a negative user list (`nlist`) to the OID
// bitmap. Explicitly excluded OIDs are cleared first; then, if the list
// holds GIs, TIs, or (for non-version-5 databases only) Seq-ids, every OID
// below the list's OID count whose status is false is cleared as well.
// NOTE(review): the signature line (declaring `nlist` and `is_v5`) and
// listing line 389 (the declaration of `new_range`) are absent from this
// listing.
367 {
368  // We require a normalized list in order to turn bits off.
369  m_AllBits->Normalize();
370  const vector<blastdb::TOid> & excluded_oids = nlist.GetExcludedOids();
371  for(unsigned int i=0; i < excluded_oids.size(); i++) {
372  m_AllBits->ClearBit(excluded_oids[i]);
373  }
374 
375  if((!is_v5 && nlist.GetNumSis() > 0) || nlist.GetNumGis() > 0 || nlist.GetNumTis() > 0) {
376 
377  // Intersect the user GI list with the OID bit map.
378 
379  // Iterate over the bitmap, clearing bits we find there but not in
380  // the bool vector. For very dense OID bit maps, it might be
381  // faster to use two similarly implemented bitmaps and AND them
382  // together word-by-word.
383 
384  int max = nlist.GetNumOids();
385 
386  // Clear any OIDs after the included range.
387 
388  if (max < m_NumOIDs) {
390  m_AllBits->IntersectWith(new_range, true);
391  }
392 
393  // If a 'get next included oid' method was added to the negative
394  // list, the following loop could be made a bit faster.
395 
396  for(int oid = 0; oid < max; oid++) {
397  if (! nlist.GetOidStatus(oid)) {
398  m_AllBits->ClearBit(oid);
399  }
400  }
401  }
402 
403 
404 }
405 
// x_IdsToBitSet (parameter list, continued): turn the OIDs resolved for an
// ID list into a bitset covering [oid_start, oid_end).
// Returns the new bitset; OIDs outside that half-open range are ignored.
// NOTE(review): the opening signature lines (declaring `gilist`) and the
// line declaring `bits` are absent from this listing.
408  int oid_start,
409  int oid_end)
410 {
412  (new CSeqDB_BitSet(oid_start, oid_end, CSeqDB_BitSet::eNone));
413 
414  CSeqDB_BitSet & bitset = *bits;
415 
416  int num_gis = gilist.GetNumGis();
417  int num_tis = gilist.GetNumTis();
418  int num_sis = gilist.GetNumSis();
// prev_oid is shared by the three loops below; it only suppresses repeated
// SetBit calls for consecutive entries that carry the same OID.
419  int prev_oid = -1;
420 
421  for(int i = 0; i < num_gis; i++) {
422  int oid = gilist.GetGiOid(i).oid;
423 
424  if (oid != prev_oid) {
425  if ((oid >= oid_start) && (oid < oid_end)) {
426  bitset.SetBit(oid);
427  }
428  prev_oid = oid;
429  }
430  }
431 
432  for(int i = 0; i < num_tis; i++) {
433  int oid = gilist.GetTiOid(i).oid;
434 
435  if (oid != prev_oid) {
436  if ((oid >= oid_start) && (oid < oid_end)) {
437  bitset.SetBit(oid);
438  }
439  prev_oid = oid;
440  }
441  }
442 
443  for(int i = 0; i < num_sis; i++) {
444  int oid = gilist.GetSiOid(i).oid;
445 
446  if (oid != prev_oid) {
447  if ((oid >= oid_start) && (oid < oid_end)) {
448  bitset.SetBit(oid);
449  }
450  prev_oid = oid;
451  }
452  }
453 
454  return bits;
455 }
456 
// x_ClearBitRange (parameter list, continued): store `false` into the bits
// of m_AllBits described by oid_start and oid_end.
// NOTE(review): the signature line declaring `oid_start` is absent from this
// listing.
458  int oid_end)
459 {
460  m_AllBits->AssignBitRange(oid_start, oid_end, false);
461 }
462 
// x_GetOidMask (parameter list, continued): load the named OID mask file
// into a bitset object spanning [vol_start, vol_end).
// The file's first four-byte value is read as the index of the last OID;
// the bitmap bytes start right after it.
// NOTE(review): the opening signature lines (return type and the `fn`
// parameter) are absent from this listing.
465  int vol_start,
466  int vol_end)
467 
468 {
469 
470  // Open file and get pointers
471 
472  TCUC* bitmap = 0;
473  TCUC* bitend = 0;
474 
475  CSeqDBRawFile volmask(m_Atlas);
476  CSeqDBFileMemMap lease(m_Atlas);
477 
478  Uint4 num_oids = 0;
479 
480  {
481  volmask.Open(fn);
482  lease.Init(fn.GetPathS());
483  volmask.ReadSwapped(lease, 0, & num_oids);
484 
485  // This is the index of the last oid, not the count of oids...
486  num_oids++;
487 
488  size_t file_length = (size_t) volmask.GetFileLength();
489 
490  // Cast forces signed/unsigned conversion.
491 
492  volmask.GetFileDataPtr(lease, sizeof(Int4), file_length);
493 
494  bitmap = (TCUC*) lease.GetFileDataPtr(sizeof(Int4));
495 
// The bitmap length is num_oids bits rounded up to whole 32-bit words,
// expressed in bytes.
// NOTE(review): bitend is not compared with file_length, so a truncated or
// corrupt mask file could describe a range larger than the file -- confirm
// whether CSeqDB_BitSet or the atlas guards against this.
496  bitend = bitmap + (((num_oids + 31) / 32) * 4);
497  }
498  CRef<CSeqDB_BitSet> bitset(new CSeqDB_BitSet(vol_start, vol_end, bitmap, bitend));
499 
500  // Disable any enabled bits occurring after the volume end point
501  // [this should not normally occur.]
502 
// CheckOrFindBit() takes `oid` by reference, so each iteration may advance
// it to the next set bit before that bit is cleared.
503  for(size_t oid = vol_end; bitset->CheckOrFindBit(oid); oid++) {
504  bitset->ClearBit(oid);
505  }
506 
507  return bitset;
508 }
509 
// DebugDump: dump debug information for this object under the frame name
// "CSeqDBOIDList", logging m_NumOIDs and m_AllBits.
// NOTE(review): the signature line (declaring `ddc` and `depth`) and listing
// line 514 are absent from this listing.
510 void
512 {
513  ddc.SetFrame("CSeqDBOIDList");
515  ddc.Log("m_NumOIDs", m_NumOIDs);
516  ddc.Log("m_AllBits", m_AllBits, depth);
517 }
518 
519 void
520 s_GetFilteredOidRange(const CSeqDBVolSet & volset, const vector<string> & vol_basenames,
521  vector<const CSeqDBVolEntry * > & excluded_vols,
522  CRef<CSeqDBGiList> & si_list)
523 {
524  unsigned int num_vol = volset.GetNumVols();
525  vector<bool> vol_included(num_vol, false);
526  excluded_vols.clear();
527  for(unsigned int i=0; i < num_vol; i++) {
528  const CSeqDBVol * vol = volset.GetVol(i);
529  if(std::find(vol_basenames.begin(), vol_basenames.end(), vol->GetVolName()) != vol_basenames.end()) {
530  vol->AttachVolumeGiList(si_list);
531  continue;
532  }
533  excluded_vols.push_back(volset.GetVolEntry(i));
534  }
535 }
536 
537 bool
538 s_IsOidInFilteredVol(blastdb::TOid oid, vector<const CSeqDBVolEntry * > & excluded_vols)
539 {
540  for(unsigned int i = 0; i < excluded_vols.size(); i++) {
541  const CSeqDBVolEntry & entry = *(excluded_vols[i]);
542  if ((entry.OIDStart() <= oid) && (entry.OIDEnd() > oid)) {
543  return true;
544  }
545  }
546  return false;
547 }
548 
/// Record that a filter file applies to a volume.
///
/// @a fnames and @a fnames_vols are parallel: fnames_vols[i] lists the
/// volume names associated with fnames[i].
///
/// @param name         Filter file name.
/// @param vn           Volume name to associate with @a name.
/// @param fnames       File names seen so far; @a name is appended if new.
/// @param fnames_vols  Per-file volume lists; @a vn is appended to the list
///                     of the first entry matching @a name, or a new
///                     one-element list is added for a new file.
void s_AddFilterFile(string & name, const string & vn, vector<string> & fnames, vector<vector<string> > & fnames_vols)
{
    const vector<string>::iterator known = std::find(fnames.begin(), fnames.end(), name);

    if (known != fnames.end()) {
        // Already registered: extend the volume list at the same index.
        fnames_vols[known - fnames.begin()].push_back(vn);
        return;
    }

    fnames.push_back(name);
    fnames_vols.push_back(vector<string>(1, vn));
}
564 
// Ordering predicate for accession strings, used with sort(),
// set_intersection() and set_difference() in s_ProcessSeqIdFilters.
// Returns false when the strings are identical or when seq_id1 matches
// seq_id2; otherwise returns the plain string comparison id1 < id2.
// NOTE(review): listing lines 570-571 are absent; seq_id1 and seq_id2 are
// presumably Seq-id objects parsed from id1 and id2 -- confirm.
// NOTE(review): if Match() can treat two differently spelled ids as
// equivalent, this predicate may fail to be transitive, which the standard
// sorted-range algorithms require -- confirm.
565 bool s_CompareSeqId(const string & id1, const string & id2)
566 {
567  if (id1 == id2){
568  return false;
569  }
572  if (seq_id1.Match(seq_id2)) {
573  return false;
574  }
575  return (id1 < id2);
576 }
577 
// Apply seq-id list filter files to filter_bit.
// For each file in fnames: take its accessions, optionally narrow them by
// the user's positive list (intersection) and negative list (difference),
// translate the survivors to OIDs through lmdb_set, and set those bits in
// filter_bit -- except OIDs that fall in a volume the file does not apply
// to. fnames_vols[k] names the volumes that fnames[k] applies to.
// NOTE(review): listing line 603, which declares `list`, is absent here;
// presumably it loads the ID list from fnames[k] -- confirm.
578 void s_ProcessSeqIdFilters(const vector<string> & fnames,
579  vector<vector<string> > & fnames_vols,
580  CRef<CSeqDBGiList> user_list,
581  CRef<CSeqDBNegativeList> neg_user_list,
582  const CSeqDBLMDBSet & lmdb_set,
583  const CSeqDBVolSet & volset,
584  CSeqDB_BitSet & filter_bit)
585 {
586  if (fnames.size() == 0) {
587  return;
588  }
589  vector<string> user_accs;
590  if ((!user_list.Empty()) && (user_list->GetNumSis() > 0)) {
591  user_list->GetSiList(user_accs);
592  sort(user_accs.begin(), user_accs.end(), s_CompareSeqId);
593  }
594  vector<string> neg_user_accs;
595  if ((!neg_user_list.Empty()) && (neg_user_list->GetNumSis() > 0)) {
596  neg_user_accs = neg_user_list->GetSiList();
// NOTE(review): this range is sorted with the default string order, while
// set_difference() below consumes it with s_CompareSeqId -- confirm the two
// orderings are compatible for the ids expected here.
597  sort(neg_user_accs.begin(), neg_user_accs.end());
598  }
599 
600  for(unsigned int k=0; k < fnames.size(); k++) {
601  vector<const CSeqDBVolEntry * > excluded_vols;
602  vector<blastdb::TOid> oids;
// Attaches `list` to the volumes named in fnames_vols[k] and collects every
// other volume in excluded_vols.
604  s_GetFilteredOidRange(volset, fnames_vols[k], excluded_vols, list);
605  vector<string> accs;
606  list->GetSiList(accs);
607  if(accs.size() == 0){
608  continue;
609  }
610  if((user_accs.size() > 0) || (neg_user_accs.size() > 0)){
611  sort(accs.begin(), accs.end(), s_CompareSeqId);
// Keep only accessions also present in the user's positive list; a file
// with nothing in common contributes no bits.
612  if (user_accs.size() > 0) {
613  vector<string> common;
614  common.resize(accs.size());
615  vector<string>::iterator itr = set_intersection(accs.begin(), accs.end(),
616  user_accs.begin(), user_accs.end(), common.begin(), s_CompareSeqId);
617  common.resize(itr-common.begin());
618  if(common.size() == 0){
619  continue;
620  }
621  swap(accs, common);
622  }
// Drop accessions that appear in the user's negative list.
623  if(neg_user_accs.size() > 0) {
624  vector<string> difference;
625  difference.resize(accs.size());
626  vector<string>::iterator itr = set_difference(accs.begin(), accs.end(),
627  neg_user_accs.begin(), neg_user_accs.end(), difference.begin(), s_CompareSeqId);
628  difference.resize(itr-difference.begin());
629  if(difference.size() == 0){
630  continue;
631  }
632  swap(accs, difference);
633  }
634  }
635 
// NOTE(review): the loop below indexes oids[i] for every accession, so it
// assumes AccessionsToOids() yields one entry per accession (with
// kSeqDBEntryNotFound marking misses) -- confirm.
636  lmdb_set.AccessionsToOids(accs, oids);
637  for(unsigned int i=0; i < accs.size(); i++) {
638  if(oids[i] == kSeqDBEntryNotFound) {
639  continue;
640  }
641  if(excluded_vols.size() != 0) {
642  if (s_IsOidInFilteredVol(oids[i], excluded_vols)) {
643  continue;
644  }
645  }
646  filter_bit.SetBit(oids[i]);
647  }
648  }
649 }
650 
// Apply tax-id list filter files to filter_bit.
// For each file in fnames: take its tax ids, optionally narrow them by the
// user's positive list (intersection) and negative list (difference),
// translate the survivors to OIDs through lmdb_set, and set those bits in
// filter_bit -- except OIDs that fall in a volume the file does not apply
// to. fnames_vols[k] names the volumes that fnames[k] applies to.
// NOTE(review): listing line 675, which declares `list`, is absent here;
// presumably it loads the ID list from fnames[k] -- confirm.
651 void s_ProcessTaxIdFilters(const vector<string> & fnames,
652  vector<vector<string> > & fnames_vols,
653  CRef<CSeqDBGiList> user_list,
654  CRef<CSeqDBNegativeList> neg_user_list,
655  const CSeqDBLMDBSet & lmdb_set,
656  const CSeqDBVolSet & volset,
657  CSeqDB_BitSet & filter_bit)
658 {
659  if (fnames.size() == 0) {
660  return;
661  }
662 
663  set<TTaxId> user_taxids;
664  if(!user_list.Empty() && (user_list->GetNumTaxIds() > 0)) {
665  user_taxids = user_list->GetTaxIdsList();
666  }
667  set<TTaxId> neg_user_taxids;
668  if(!neg_user_list.Empty() && (neg_user_list->GetNumTaxIds() > 0)) {
669  neg_user_taxids = neg_user_list->GetTaxIdsList();
670  }
671 
672  for(unsigned int k=0; k < fnames.size(); k++) {
673  vector<const CSeqDBVolEntry * > excluded_vols;
674  vector<blastdb::TOid> oids;
// Attaches `list` to the volumes named in fnames_vols[k] and collects every
// other volume in excluded_vols.
676  s_GetFilteredOidRange(volset, fnames_vols[k], excluded_vols, list);
677  set<TTaxId> taxids;
678  taxids = list->GetTaxIdsList();
679  if(taxids.size() == 0){
680  continue;
681  }
// Keep only tax ids also present in the user's positive list; a file with
// nothing in common contributes no bits.
682  if(user_taxids.size() > 0){
683  vector<TTaxId> common;
684  common.resize(taxids.size());
685  vector<TTaxId>::iterator itr = set_intersection(taxids.begin(), taxids.end(),
686  user_taxids.begin(), user_taxids.end(), common.begin());
687  common.resize(itr-common.begin());
688  if( common.size() == 0) {
689  continue;
690  }
691  taxids.clear();
692  taxids.insert(common.begin(), common.end());
693  }
// Drop tax ids that appear in the user's negative list.
694  if(neg_user_taxids.size() > 0) {
695  vector<TTaxId> difference;
696  difference.resize(taxids.size());
697  vector<TTaxId>::iterator itr = set_difference(taxids.begin(), taxids.end(),
698  neg_user_taxids.begin(), neg_user_taxids.end(), difference.begin());
699  difference.resize(itr-difference.begin());
700  if(difference.size() == 0){
701  continue;
702  }
703  taxids.clear();
704  taxids.insert(difference.begin(), difference.end());
705  }
706 
// Every OID returned for the remaining tax ids is set, unless it belongs to
// a volume this file does not apply to.
707  lmdb_set.TaxIdsToOids(taxids, oids);
708  for(unsigned int i=0; i < oids.size(); i++) {
709  if(excluded_vols.size() != 0) {
710  if (s_IsOidInFilteredVol(oids[i], excluded_vols)) {
711  continue;
712  }
713  }
714  filter_bit.SetBit(oids[i]);
715  }
716  }
717 }
718 
// x_ComputeFilters (LMDB-based overload): gather the seq-id and tax-id list
// filter files attached to each volume, then let s_ProcessSeqIdFilters and
// s_ProcessTaxIdFilters set the matching OIDs in filter_bit.
// Returns true when at least one such filter file was found, false
// otherwise.
// NOTE(review): listing line 720 (function name and the `volset` parameter)
// and line 736 (the loop that binds `itr`, presumably iterating the
// specialized tree's filters) are absent from this listing.
719 bool
721  const CSeqDB_FilterTree & filters,
722  const CSeqDBLMDBSet & lmdb_set,
723  CSeqDB_BitSet & filter_bit,
724  CRef<CSeqDBGiList> user_list,
725  CRef<CSeqDBNegativeList> neg_user_list)
726 {
// fnames[i] is a filter file; fnames_vols[i] lists the volumes that use it.
727  vector<string> seqid_fnames;
728  vector<string> taxid_fnames;
729  vector< vector<string> > seqid_fnames_vols;
730  vector< vector<string> > taxid_fnames_vols;
731 
732  for(int i = 0; i < volset.GetNumVols(); i++) {
733  const CSeqDBVolEntry & vol = *(volset.GetVolEntry(i));
734  const string & vn = vol.Vol()->GetVolName();
735  CRef<CSeqDB_FilterTree> ft = filters.Specialize(vn);
737  if(((*itr)->GetType() == CSeqDB_AliasMask::eSiList) ||
738  ((*itr)->GetType() == CSeqDB_AliasMask::eTaxIdList)) {
739  string name = (*itr)->GetPath().GetPathS();
740  if((*itr)->GetType() == CSeqDB_AliasMask::eSiList) {
741  s_AddFilterFile(name, vn, seqid_fnames, seqid_fnames_vols);
742  }
743  else {
744  s_AddFilterFile(name, vn, taxid_fnames, taxid_fnames_vols);
745  }
// A volume carrying such a filter has its whole OID range cleared here; the
// processing helpers below set bits in filter_bit afterwards.
746  filter_bit.AssignBitRange(vol.OIDStart(), vol.OIDEnd(), false);
747  }
748  }
749  }
750 
751  if (seqid_fnames.size() > 0) {
752  s_ProcessSeqIdFilters(seqid_fnames, seqid_fnames_vols, user_list, neg_user_list,
753  lmdb_set, volset, filter_bit);
754  }
755  if (taxid_fnames.size() > 0) {
756  s_ProcessTaxIdFilters(taxid_fnames, taxid_fnames_vols, user_list, neg_user_list,
757  lmdb_set, volset, filter_bit);
758  }
759 
760  return ((seqid_fnames.size() + taxid_fnames.size()) > 0 ? true:false);
761 }
762 
763 
765 
ncbi::TMaskedQueryRegions mask
void SetFrame(const string &frame)
Definition: ddumpable.cpp:137
void Log(const string &name, const char *value, CDebugDumpFormatter::EValueType type=CDebugDumpFormatter::eValue, const string &comment=kEmptyStr)
Definition: ddumpable.cpp:151
CFile –.
Definition: ncbifile.hpp:1605
CSeqDBAtlas class.
Definition: seqdbatlas.hpp:297
CSeqDBException.
Definition: seqdbcommon.hpp:73
CSeqDBFileGiList.
const char * GetFileDataPtr(const string &fname, TIndx offset)
Get a pointer to the specified offset.
Definition: seqdbatlas.hpp:755
void Init(const string &filename)
Initializes a memory map object.
Definition: seqdbatlas.hpp:702
CSeqDBGiListSet class.
TListRef GetNodeIdList(const CSeqDB_Path &filename, const CSeqDBVol *volp, EGiListType list_type, CSeqDBLockHold &locked)
Get a reference to a named GI list.
CSeqDBGiList.
int GetNumGis() const
Get the number of GIs in the array.
const SGiOid & GetGiOid(int index) const
Access an element of the array.
int GetNumTaxIds() const
int GetNumSis() const
Get the number of Seq-ids in the array.
const SPigOid & GetPigOid(int index) const
int GetNumTis() const
Get the number of TIs in the array.
const SSiOid & GetSiOid(int index) const
Access an element of the array.
int GetNumPigs() const
void GetSiList(vector< string > &sis) const
TODO Get the seqid list?
const vector< blastdb::TOid > & GetOidsForTaxIdsList()
set< TTaxId > & GetTaxIdsList()
const STiOid & GetTiOid(int index) const
Access an element of the array.
bool Empty() const
Return false if there are elements present.
CSeqDBLMDBSet.
void AccessionsToOids(const vector< string > &accs, vector< TOid > &oids) const
bool IsBlastDBVersion5() const
void TaxIdsToOids(set< TTaxId > &tax_ids, vector< blastdb::TOid > &rv) const
CSeqDBLockHold.
Definition: seqdbatlas.hpp:166
CSeqDBNegativeList.
int GetNumTaxIds() const
const vector< string > & GetSiList()
int GetNumTis() const
Get the number of TIs in the array.
bool GetOidStatus(int oid)
Get the inclusion status of an OID.
int GetNumGis() const
Get the number of GIs in the array.
const vector< blastdb::TOid > & GetExcludedOids()
int GetNumSis() const
Get the number of SeqIds in the array.
set< TTaxId > & GetTaxIdsList()
int GetNumOids()
Get the size of the OID array.
CSeqDBOIDList(CSeqDBAtlas &atlas, const CSeqDBVolSet &volumes, CSeqDB_FilterTree &filters, CRef< CSeqDBGiList > &gi_list, CRef< CSeqDBNegativeList > &neg_list, CSeqDBLockHold &locked, const CSeqDBLMDBSet &lmdb_set)
Constructor.
CRef< CSeqDB_BitSet > x_IdsToBitSet(const CSeqDBGiList &ids, int vol_start, int vol_end)
Load an ID (GI or TI) list file into a bitset object.
bool x_IsSet(TOID oid) const
Check if a bit is set.
void DebugDump(CDebugDumpContext ddc, unsigned int depth) const
Dump debug information for this object.
void x_Setup(const CSeqDBVolSet &volset, CSeqDB_FilterTree &filters, CRef< CSeqDBGiList > &gi_list, CRef< CSeqDBNegativeList > &neg_list, CSeqDBLockHold &locked, const CSeqDBLMDBSet &lmdb_set)
Build an oid mask in memory.
void x_ClearBitRange(int oid_start, int oid_end)
Clear all bits in a range.
CRef< CSeqDB_BitSet > x_GetOidMask(const CSeqDB_Path &fn, int vol_start, int vol_end)
Load the named OID mask file into a bitset object.
void x_ApplyUserGiList(CSeqDBGiList &gis)
Apply a user GI list to a volume.
void x_ApplyNegativeList(CSeqDBNegativeList &neg, bool is_v5)
Apply a negative user GI list to a volume.
~CSeqDBOIDList()
Destructor.
CSeqDBAtlas & m_Atlas
The memory management layer object.
CRef< CSeqDB_BitSet > x_ComputeFilters(const CSeqDB_FilterTree &ft, const CSeqDBVolEntry &vol, CSeqDBGiListSet &gis, CSeqDBLockHold &locked, bool isBlastDBv5)
Compute the oid mask bitset for a database volume.
const unsigned char TCUC
Shorthand type to clarify code that iterates over memory.
CRef< CSeqDB_BitSet > m_AllBits
An OID bit set covering all volumes.
int m_NumOIDs
The total number of OIDs represented in the bit set.
Raw file.
Definition: seqdbfile.hpp:64
TIndx GetFileLength() const
Get the length of the file.
Definition: seqdbfile.hpp:143
TIndx ReadSwapped(CSeqDBFileMemMap &lease, TIndx offset, Uint4 *value) const
Read a four byte numerical object from the file.
Definition: seqdbfile.cpp:71
bool Open(const CSeqDB_Path &name)
MMap or Open a file.
Definition: seqdbfile.hpp:93
const char * GetFileDataPtr(CSeqDBFileMemMap &lease, TIndx start, TIndx end) const
Get a pointer to a section of the file.
Definition: seqdbfile.hpp:123
CSeqDBVolEntry.
Definition: seqdbvolset.hpp:59
int OIDStart() const
Get the starting OID in this volume's range.
int OIDEnd() const
Get the ending OID in this volume's range.
CSeqDBVol * Vol()
Get a pointer to the underlying volume object.
CSeqDBVolSet.
const CSeqDBVolEntry * GetVolEntry(int i) const
Find a volume entry by index.
const CSeqDBVol * GetVol(int i) const
Find a volume by index.
int GetNumVols() const
Get the number of volumes.
int GetNumOIDs() const
Get the size of the OID range.
CSeqDBVol class.
Definition: seqdbvol.hpp:169
const string & GetVolName() const
Get the volume name.
Definition: seqdbvol.hpp:452
void SetOidMaskType(int oid_masks) const
Definition: seqdbvol.hpp:887
void AttachVolumeGiList(CRef< CSeqDBGiList > gilist) const
Filter this volume using the specified GI list.
Definition: seqdbvol.hpp:639
char GetSeqType() const
Get the sequence type stored in this database.
Definition: seqdbvol.cpp:265
void SetMemBit(int mbit) const
Set the MEMB_BIT filtering for this volume.
Definition: seqdbvol.hpp:879
Something else yet again etc.
Definition: seqdbfilter.hpp:51
@ eGiList
GI list.
Definition: seqdbfilter.hpp:55
@ eSiList
SI list.
Definition: seqdbfilter.hpp:57
@ eTaxIdList
Taxonomy Id List.
Definition: seqdbfilter.hpp:61
@ eOidRange
OID Range [start, end).
Definition: seqdbfilter.hpp:59
@ eTiList
TI list.
Definition: seqdbfilter.hpp:56
@ eMemBit
MEMBIT filter.
Definition: seqdbfilter.hpp:60
@ eOidList
OID list.
Definition: seqdbfilter.hpp:58
Bit set class.
Definition: seqdbbitset.hpp:49
@ eAllSet
All OIDs are set.
Definition: seqdbbitset.hpp:76
@ eNone
Normal OID list.
Definition: seqdbbitset.hpp:75
@ eAllClear
All OIDs are clear.
Definition: seqdbbitset.hpp:77
void IntersectWith(CSeqDB_BitSet &other, bool consume)
This bitset is assigned to the intersection of it and another.
void UnionWith(CSeqDB_BitSet &other, bool consume)
This bitset is assigned to the union of it and another.
void Normalize()
If this is a special case bitset, convert it to a normal one.
void ClearBit(size_t index)
Clear the specified bit (to false).
Definition: seqdbbitset.cpp:75
bool CheckOrFindBit(size_t &index) const
Check if a bit is true or find the next bit that is.
Definition: seqdbbitset.cpp:90
void AssignBitRange(size_t start, size_t end, bool value)
Store the provided value in a range of bits.
void SetBit(size_t index)
Set the specified bit (to true).
Definition: seqdbbitset.cpp:60
Tree of nodes describing filtering of database sequences.
bool HasFilter() const
Check whether this tree represents any volume filtering.
Definition: seqdbfilter.cpp:81
const vector< CRef< CSeqDB_FilterTree > > & GetNodes() const
Get child nodes attached to this node.
vector< CRef< CSeqDB_AliasMask > > TFilters
Type used to store lists of filters found here.
const TFilters & GetFilters() const
Get filters from this node.
const vector< CSeqDB_BasePath > & GetVolumes() const
Get volumes attached to this node.
CRef< CSeqDB_FilterTree > Specialize(string volname) const
Specialize this tree for the indicated volume.
Definition: seqdbfilter.cpp:38
CSeqDB_Path.
const string & GetPathS() const
Get the path as a string.
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
const_iterator begin() const
Definition: set.hpp:135
void clear()
Definition: set.hpp:153
size_type size() const
Definition: set.hpp:132
const_iterator end() const
Definition: set.hpp:136
The NCBI C++ standard methods for dealing with std::string.
static unsigned char depth[2 *(256+1+29)+1]
#define true
Definition: bool.h:35
static FILE * f
Definition: readconf.c:23
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
Definition: ncbimisc.hpp:1508
#define LOG_POST(message)
This macro is deprecated, and it is strongly recommended that all projects (except tests) move to macro...
Definition: ncbidiag.hpp:226
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
void Info(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1185
virtual bool Exists(void) const
Check existence of file.
Definition: ncbifile.hpp:4039
bool Match(const CSeq_id &sid2) const
Match() - TRUE if SeqIds are equivalent.
Definition: Seq_id.hpp:1065
@ fParse_AnyRaw
Definition: Seq_id.hpp:83
@ fParse_ValidLocal
Treat otherwise unidentified strings as raw accessions, provided that they pass rudimentary validatio...
Definition: Seq_id.hpp:87
virtual void DebugDump(CDebugDumpContext ddc, unsigned int depth) const
Define method for dumping debug information.
Definition: ncbiobj.cpp:988
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
int i
range(_Ty, _Ty) -> range< _Ty >
constexpr auto sort(_Init &&init)
T max(T x_, T y_)
const blastdb::TOid kSeqDBEntryNotFound
Int4 TOid
Ordinal ID in BLAST databases.
Definition: seqdbcommon.hpp:58
const string SeqDB_GetOidMaskFileExt(bool db_is_protein, EOidMaskType t)
@ fExcludeModel
File access objects for CSeqDB.
Implementation for some assorted ID list filtering code.
Defines set of GI lists.
void s_AddFilterFile(string &name, const string &vn, vector< string > &fnames, vector< vector< string > > &fnames_vols)
void s_GetFilteredOidRange(const CSeqDBVolSet &volset, const vector< string > &vol_basenames, vector< const CSeqDBVolEntry * > &excluded_vols, CRef< CSeqDBGiList > &si_list)
void s_ProcessTaxIdFilters(const vector< string > &fnames, vector< vector< string > > &fnames_vols, CRef< CSeqDBGiList > user_list, CRef< CSeqDBNegativeList > neg_user_list, const CSeqDBLMDBSet &lmdb_set, const CSeqDBVolSet &volset, CSeqDB_BitSet &filter_bit)
void s_ProcessSeqIdFilters(const vector< string > &fnames, vector< vector< string > > &fnames_vols, CRef< CSeqDBGiList > user_list, CRef< CSeqDBNegativeList > neg_user_list, const CSeqDBLMDBSet &lmdb_set, const CSeqDBVolSet &volset, CSeqDB_BitSet &filter_bit)
bool s_IsOidInFilteredVol(blastdb::TOid oid, vector< const CSeqDBVolEntry * > &excluded_vols)
bool s_CompareSeqId(const string &id1, const string &id2)
The SeqDB oid filtering layer.
static const sljit_gpr r2
int oid
The OID or -1 if unknown.
int oid
The OID or -1 if unknown.
int oid
The OID or -1 if unknown.
int oid
The OID or -1 if unknown.
#define _ASSERT
Modified on Fri Sep 20 14:58:32 2024 by modify_doxy.py rev. 669887