NCBI C++ ToolKit
wgs_client.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: wgs_client.cpp 103123 2024-09-11 18:57:02Z satskyse $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Aleksey Grichenko, Eugene Vasilchenko
27  *
28  * File Description: client for loading data from WGS
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 
34 #include "wgs_client.hpp"
35 #include "pubseq_gateway.hpp"
40 //#include "osg_getblob_base.hpp"
41 //#include "osg_resolve_base.hpp"
44 
45 
49 
51 
52 /////////////////////////////////////////////////////////////////////////////
53 // Processor settings
54 /////////////////////////////////////////////////////////////////////////////
55 
// Declarations and default values of the "WGS" configuration parameters
// read by the helper functions in this file.
// FILTER_ALL is the only one defined through NCBI_PARAM_DEF_EX, i.e. with
// explicit flags (eParam_NoThread) and an extra WGS_FILTER_ALL argument.
56 NCBI_PARAM_DECL(bool, WGS, FILTER_ALL);
57 NCBI_PARAM_DEF_EX(bool, WGS, FILTER_ALL, false, eParam_NoThread, WGS_FILTER_ALL);
58 
// Default: true. Read through s_SplitFeatures().
59 NCBI_PARAM_DECL(bool, WGS, SPLIT_FEATURES);
60 NCBI_PARAM_DEF(bool, WGS, SPLIT_FEATURES, true);
61 
// Default: true. When false, GetWGSDb() returns an empty CWGSDb for
// databases that report IsReplaced().
62 NCBI_PARAM_DECL(bool, WGS, KEEP_REPLACED);
63 NCBI_PARAM_DEF(bool, WGS, KEEP_REPLACED, true);
64 
// Default: false. When false, sequences for which HasMigrated() is true
// are treated as not found by ResolveSeqId()/GetSeqInfoBySeqId().
65 NCBI_PARAM_DECL(bool, WGS, KEEP_MIGRATED);
66 NCBI_PARAM_DEF(bool, WGS, KEEP_MIGRATED, false);
67 
// Default: true.
68 NCBI_PARAM_DECL(bool, WGS, KEEP_EXTERNAL);
69 NCBI_PARAM_DEF(bool, WGS, KEEP_EXTERNAL, true);
70 
// String-valued setting, default "detached"; compared case-insensitively
// when it is converted to an eAddMasterDescr_* value.
71 NCBI_PARAM_DECL(string, WGS, ADD_MASTER_DESCR);
72 NCBI_PARAM_DEF(string, WGS, ADD_MASTER_DESCR, "detached");
73 
// Default: false. Read through s_MarkMasterDescr().
74 NCBI_PARAM_DECL(bool, WGS, MARK_MASTER_DESCR);
75 NCBI_PARAM_DEF(bool, WGS, MARK_MASTER_DESCR, false);
76 
77 
78 static bool s_SplitFeatures(void)
79 {
80  static bool value = NCBI_PARAM_TYPE(WGS, SPLIT_FEATURES)::GetDefault();
81  return value;
82 }
83 
84 static bool s_KeepReplaced(void)
85 {
86  static bool value = NCBI_PARAM_TYPE(WGS, KEEP_REPLACED)::GetDefault();
87  return value;
88 }
89 
90 static bool s_KeepMigrated(void)
91 {
92  static bool value = NCBI_PARAM_TYPE(WGS, KEEP_MIGRATED)::GetDefault();
93  return value;
94 }
95 
97 {
101 };
102 
104 {
105  auto value = NCBI_PARAM_TYPE(WGS, ADD_MASTER_DESCR)::GetDefault();
106  return (NStr::EqualNocase(value, "detached")? eAddMasterDescr_detached:
109 }
110 
112 {
114  return value;
115 }
116 
118 {
119  // master descr on contig should be added only with "all" setting
121 }
122 
124 {
125  // master descr on scaffold should be added with any setting except "none"
127 }
128 
130 {
131  // master descr on protein should be added with any setting except "none"
133 }
134 
135 static bool s_MarkMasterDescr(void)
136 {
137  static bool value = NCBI_PARAM_TYPE(WGS, MARK_MASTER_DESCR)::GetDefault();
138  return value;
139 }
140 
141 
146 };
148 
// Character written between the sat and sub-sat numbers of a textual blob id
// (see s_FormatBlobId(); the sat-key follows after a '.').
149 static const char kSubSatSeparator = '/';
// Inclusive sat number ranges tested by s_IsEnabledOSGSat(); each range is
// paired there with its own CWGSClient::fEnabled* flag (WGS, SNP, CDD).
150 static const int kOSG_Sat_WGS_min = 1000;
151 static const int kOSG_Sat_WGS_max = 1130;
152 static const int kOSG_Sat_SNP_min = 2001;
153 static const int kOSG_Sat_SNP_max = 3999;
154 static const int kOSG_Sat_CDD_min = 8087;
155 static const int kOSG_Sat_CDD_max = 8088;
156 
157 static inline bool s_IsEnabledOSGSat(CWGSClient::TEnabledFlags enabled_flags, Int4 sat)
158 {
159  if ( sat >= kOSG_Sat_WGS_min &&
160  sat <= kOSG_Sat_WGS_max &&
161  (enabled_flags & CWGSClient::fEnabledWGS) ) {
162  return true;
163  }
164  if ( sat >= kOSG_Sat_SNP_min &&
165  sat <= kOSG_Sat_SNP_max &&
166  (enabled_flags & CWGSClient::fEnabledSNP) ) {
167  return true;
168  }
169  if ( sat >= kOSG_Sat_CDD_min &&
170  sat <= kOSG_Sat_CDD_max &&
171  (enabled_flags & CWGSClient::fEnabledCDD) ) {
172  return true;
173  }
174  /*
175  if ( sat >= kOSG_Sat_NAGraph_min &&
176  sat <= kOSG_Sat_NAGraph_max &&
177  (enabled_flags & CWGSClient::fEnabledNAGraph) ) {
178  return true;
179  }
180  */
181  return false;
182 }
183 
184 
185 static bool s_IsOSGSat(Int4 sat)
186 {
188 }
189 
190 
191 static bool s_Skip(CTempString& str, char c)
192 {
193  if ( str.empty() || str[0] != c ) {
194  return false;
195  }
196  str = str.substr(1);
197  return true;
198 }
199 
200 
/// Character class accepted while scanning a (possibly negative) decimal
/// integer: the minus sign and the ASCII digits '0'..'9'.
static inline bool s_IsValidIntChar(char c)
{
    if ( c == '-' ) {
        return true;
    }
    return '0' <= c && c <= '9';
}
205 
206 
207 template<class Int>
208 static bool s_ParseInt(CTempString& str, Int& v)
209 {
210  size_t int_size = 0;
211  while ( int_size < str.size() && s_IsValidIntChar(str[int_size]) ) {
212  ++int_size;
213  }
214  if ( !NStr::StringToNumeric(str.substr(0, int_size), &v,
216  return false;
217  }
218  str = str.substr(int_size);
219  return true;
220 }
221 
222 
223 static bool s_IsOSGBlob(Int4 sat, Int4 /*subsat*/, Int4 /*satkey*/)
224 {
225  return s_IsOSGSat(sat);
226 }
227 
228 
230  Int4& sat, Int4& subsat, Int4& satkey)
231 {
232  if ( s.find(kSubSatSeparator) == NPOS ) {
233  return false;
234  }
235  if ( !s_ParseInt(s, sat) ) {
236  return false;
237  }
238  if ( !s_IsOSGSat(sat) ) {
239  return false;
240  }
241  if ( !s_Skip(s, kSubSatSeparator) ) {
242  return false;
243  }
244  if ( !s_ParseInt(s, subsat) ) {
245  return false;
246  }
247  if ( !s_Skip(s, '.') ) {
248  return false;
249  }
250  if ( !s_ParseInt(s, satkey) ) {
251  return false;
252  }
253  return s_IsOSGBlob(sat, subsat, satkey);
254 }
255 
256 static void s_FormatBlobId(ostream& s, const CID2_Blob_Id& blob_id)
257 {
258  s << blob_id.GetSat()
259  << kSubSatSeparator << blob_id.GetSub_sat()
260  << '.' << blob_id.GetSat_key();
261 }
262 
263 
264 /////////////////////////////////////////////////////////////////////////////
265 // WGS seq-ids
266 /////////////////////////////////////////////////////////////////////////////
267 
268 
269 // WGS accession parameters
// Lengths of the pieces of a WGS accession. The "V1" constants describe
// accessions with a 4-letter prefix, the "V2" constants those with a
// 6-letter prefix; both are followed by kVersionDigits digits and then by
// the row digits (see IsWGSAccession() for how the limits are applied).
270 static const size_t kTypePrefixLen = 4; // "WGS:" or "TSA:"
271 static const size_t kNumLettersV1 = 4;
272 static const size_t kNumLettersV2 = 6;
273 static const size_t kVersionDigits = 2;
// letters + version digits
274 static const size_t kPrefixLenV1 = kNumLettersV1 + kVersionDigits;
275 static const size_t kPrefixLenV2 = kNumLettersV2 + kVersionDigits;
// inclusive bounds on the number of row digits
276 static const size_t kMinRowDigitsV1 = 6;
277 static const size_t kMaxRowDigitsV1 = 8;
278 static const size_t kMinRowDigitsV2 = 7;
279 static const size_t kMaxRowDigitsV2 = 9;
280 
// inclusive bounds on the total length of a protein accession
281 static const size_t kMinProtAccLen = 8; // 3+5
282 static const size_t kMaxProtAccLen = 10; // 3+7
283 
284 static bool IsWGSGeneral(const CDbtag& dbtag)
285 {
286  const string& db = dbtag.GetDb();
287  if ( db.size() != kTypePrefixLen+kNumLettersV1 /* WGS:AAAA */ &&
288  db.size() != kTypePrefixLen+kPrefixLenV1 /* WGS:AAAA01 */ &&
289  db.size() != kTypePrefixLen+kNumLettersV2 /* WGS:AAAAAA */ &&
290  db.size() != kTypePrefixLen+kPrefixLenV2 /* WGS:AAAAAA01 */ ) {
291  return false;
292  }
293  if ( !NStr::StartsWith(db, "WGS:", NStr::eNocase) &&
294  !NStr::StartsWith(db, "TSA:", NStr::eNocase) ) {
295  return false;
296  }
297  return true;
298 }
299 
300 
304  fAllow_protein = 4
305 };
306 typedef int TAllowSeqType;
307 
308 static bool IsWGSAccession(const string& acc,
309  const CTextseq_id& id,
310  TAllowSeqType allow_seq_type)
311 {
312  if ( acc.size() < kPrefixLenV1 + kMinRowDigitsV1 ||
313  acc.size() > kPrefixLenV2 + kMaxRowDigitsV2 + 1 ) { // one for type letter
314  return false;
315  }
316  size_t num_letters;
317  for ( num_letters = 0; num_letters < kNumLettersV2; ++num_letters ) {
318  if ( !isalpha(acc[num_letters]&0xff) ) {
319  break;
320  }
321  }
322  if ( num_letters != kNumLettersV1 && num_letters != kNumLettersV2 ) {
323  return false;
324  }
325  size_t prefix_len = num_letters + kVersionDigits;
326  for ( size_t i = num_letters; i < prefix_len; ++i ) {
327  if ( !isdigit(acc[i]&0xff) ) {
328  return false;
329  }
330  }
331  SIZE_TYPE row_pos = prefix_len;
332  switch ( acc[row_pos] ) { // optional type letter
333  case 's':
334  case 'S':
335  // scaffold
336  if ( !(allow_seq_type & fAllow_scaffold) ) {
337  return false;
338  }
339  ++row_pos;
340  break;
341  case 'p':
342  case 'P':
343  // protein
344  if ( !(allow_seq_type & fAllow_protein) ) {
345  return false;
346  }
347  ++row_pos;
348  break;
349  default:
350  // contig
351  if ( !(allow_seq_type & fAllow_contig) ) {
352  return false;
353  }
354  break;
355  }
356  size_t row_digits = acc.size() - row_pos;
357  if ( num_letters == kNumLettersV1 ) {
358  if ( row_digits < kMinRowDigitsV1 || row_digits > kMaxRowDigitsV1 ) {
359  return false;
360  }
361  }
362  else {
363  if ( row_digits < kMinRowDigitsV2 || row_digits > kMaxRowDigitsV2 ) {
364  return false;
365  }
366  }
367  Uint8 row = 0;
368  for ( size_t i = row_pos; i < acc.size(); ++i ) {
369  char c = acc[i];
370  if ( c < '0' || c > '9' ) {
371  return false;
372  }
373  row = row*10+(c-'0');
374  }
375  if ( !row ) {
376  return false;
377  }
378  return true;
379 }
380 
381 
382 static bool IsWGSProtAccession(const CTextseq_id& id)
383 {
384  const string& acc = id.GetAccession();
385  if ( acc.size() < kMinProtAccLen || acc.size() > kMaxProtAccLen ) {
386  return false;
387  }
388  return true;
389 }
390 
391 
392 static bool IsWGSAccession(const CTextseq_id& id)
393 {
394  if ( id.IsSetName() ) {
395  // first try name reference if it has WGS format like AAAA01P000001
396  // as it directly contains WGS accession
397  return IsWGSAccession(id.GetName(), id, fAllow_protein);
398  }
399  if ( !id.IsSetAccession() ) {
400  return false;
401  }
402  const string& acc = id.GetAccession();
404  switch ( type & CSeq_id::eAcc_division_mask ) {
405  // accepted accession types
406  case CSeq_id::eAcc_wgs:
408  case CSeq_id::eAcc_tsa:
410  if ( type & CSeq_id::fAcc_prot ) {
411  return IsWGSProtAccession(id);
412  }
413  else {
415  }
416  case CSeq_id::eAcc_other:
417  // Some EMBL WGS accessions aren't identified as WGS, so we'll try lookup anyway
418  if ( type == CSeq_id::eAcc_embl_prot ||
419  (type == CSeq_id::eAcc_gb_prot && acc.size() == 10) ) { // TODO: remove
420  return IsWGSProtAccession(id);
421  }
422  return false;
423  default:
424  // non-WGS accessions
425  return false;
426  }
427 }
428 
429 
430 
431 /////////////////////////////////////////////////////////////////////////////
432 // WGS blob-ids
433 /////////////////////////////////////////////////////////////////////////////
434 
435 // satkey: row-id
// Sat value that ResolveBlobId() treats as the old 4-letter WGS accession
// blob-id format.
436 static const int kBlobIdV1Sat = 1000;
// NOTE(review): presumably the inclusive sat range of the newer blob-id
// format - their use is not visible in this part of the file; confirm.
437 static const int kBlobIdV2SatMin = 1001;
438 static const int kBlobIdV2SatMax = 1129;
// NOTE(review): presumably per-sequence-type codes stored in the blob id of
// the newer format - their use is not visible in this part of the file; confirm.
439 static const int kBlobIdV2VersionScaffold = 0;
440 static const int kBlobIdV2VersionProtein = 1;
441 static const int kBlobIdV2VersionContig = 2;
442 enum EBlobType {
446 };
451 };
452 
453 
454 /////////////////////////////////////////////////////////////////////////////
455 // Helper classes
456 /////////////////////////////////////////////////////////////////////////////
457 
459 
461 {
462 public:
463  CIndexUpdateThread(unsigned update_delay, CRef<CWGSResolver> resolver)
464  : CThreadNonStop(update_delay),
465  m_FirstRun(true),
466  m_Resolver(resolver)
467  {
468  }
469 
470 protected:
471  virtual void DoJob(void) {
472  if ( m_FirstRun ) {
473  // CThreadNonStop runs first iteration immediately, ignore it
474  m_FirstRun = false;
475  return;
476  }
477  try {
478  if ( m_Resolver->Update() ) {
479  PSG_INFO("PSGS_WGS: updated WGS index");
480  }
481  }
482  catch ( CException& exc ) {
483  PSG_ERROR("PSGS_WGS: Exception while updating WGS index: " << exc);
484  }
485  catch ( exception& exc ) {
486  PSG_ERROR("PSGS_WGS: Exception while updating WGS index: " << exc.what());
487  }
488  }
489 
490 private:
493 };
494 
496 
497 
498 /////////////////////////////////////////////////////////////////////////////
499 // CWGSClient
500 /////////////////////////////////////////////////////////////////////////////
501 
502 
504  : m_Config(config),
505  m_WGSDbCache(config.m_CacheSize, config.m_FileReopenTime, config.m_FileRecheckTime)
506 {
507 }
508 
509 
511 {
512  if ( m_IndexUpdateThread ) {
515  }
516 }
517 
518 
520 {
521  if ( !m_Resolver ) {
523  if ( !m_Resolver ) {
525  }
526  if ( m_Resolver && !m_IndexUpdateThread ) {
529  }
530  }
531  return m_Resolver;
532 }
533 
534 
536 {
537  auto req_type = request.GetRequestType();
538  string seq_id;
539  int seq_id_type = -1;
540  CRef<CID2_Blob_Id> blob_id;
541 
542  switch ( req_type ) {
544  {
545  auto& resolve_request = request.GetRequest<SPSGS_ResolveRequest>();
546  seq_id = resolve_request.m_SeqId;
547  seq_id_type = resolve_request.m_SeqIdType;
548  break;
549  }
551  {
552  auto& blob_sid_request = request.GetRequest<SPSGS_BlobBySeqIdRequest>();
553  seq_id = blob_sid_request.m_SeqId;
554  seq_id_type = blob_sid_request.m_SeqIdType;
555  break;
556  }
559  break;
561  {
562  auto& chunk_request = request.GetRequest<SPSGS_TSEChunkRequest>();
563  blob_id = ParsePSGId2Info(chunk_request.m_Id2Info).tse_id;
564  break;
565  }
566  default:
567  return false;
568  }
569 
570  if ( !seq_id.empty() ) {
571  return CanBeWGS(seq_id_type, seq_id);
572  }
573  if ( blob_id ) {
574  return ResolveBlobId(*blob_id, true).m_ValidWGS;
575  }
576  return false;
577 }
578 
579 
580 shared_ptr<SWGSData> CWGSClient::ResolveSeqId(const CSeq_id& seq_id)
581 {
582  shared_ptr<SWGSData> ret;
583  SWGSSeqInfo seq = Resolve(seq_id);
584  if (seq && HasMigrated(seq) && !s_KeepMigrated() ) {
585  seq = SWGSSeqInfo();
586  }
587  if ( !seq ) return ret;
588 
589  GetBioseqInfo(ret, seq);
590  _ASSERT(ret);
591  return ret;
592 }
593 
594 
595 shared_ptr<SWGSData> CWGSClient::GetSeqInfoBySeqId(const CSeq_id& seq_id,
596  SWGSSeqInfo& seq,
597  const TBlobIds& excluded)
598 {
599  shared_ptr<SWGSData> ret;
600  seq = Resolve(seq_id);
601  if (seq && HasMigrated(seq) && !s_KeepMigrated() ) {
602  seq = SWGSSeqInfo();
603  }
604  if ( !seq ) return ret;
605 
606  GetBioseqInfo(ret, seq);
607  _ASSERT(ret);
608  if ( find(excluded.begin(), excluded.end(), ret->m_BlobId) != excluded.end() ) {
609  ret->m_GetResult = SWGSData::eResult_Excluded;
610  }
611 
612  return ret;
613 }
614 
615 
616 shared_ptr<SWGSData> CWGSClient::GetBlobByBlobId(const string& blob_id)
617 {
618  shared_ptr<SWGSData> ret;
619  CRef<CID2_Blob_Id> id2_blob_id(ParsePSGBlobId(blob_id));
620  if ( !id2_blob_id ) return ret;
621 
622  SWGSSeqInfo seq = ResolveBlobId(*id2_blob_id);
623  if ( !seq ) return ret;
624 
625  GetWGSData(ret, seq);
626  return ret;
627 }
628 
629 
630 shared_ptr<SWGSData> CWGSClient::GetChunk(const string& id2info, int64_t chunk_id)
631 {
632  shared_ptr<SWGSData> ret;
633  SParsedId2Info parsed_id2info = ParsePSGId2Info(id2info);
634  if ( !parsed_id2info.tse_id ) return ret;
635 
636  SWGSSeqInfo seq0 = ResolveBlobId(*parsed_id2info.tse_id);
637  if ( !seq0 ) return ret;
638 
639  auto id2_blob_state = GetID2BlobState(seq0);
640  if ( SWGSData::IsForbidden(id2_blob_state) ) {
641  ret = make_shared<SWGSData>();
642  ret->m_GetResult = SWGSData::eResult_Found;
643  ret->m_Id2BlobId.Reset(&GetBlobId(seq0));
644  ret->m_BlobId = GetPSGBlobId(*ret->m_Id2BlobId);
645  ret->m_Id2BlobState = id2_blob_state;
646  return ret;
647  }
648 
649  SWGSSeqInfo& seq = GetRootSeq(seq0);
650  if ( seq.IsContig() ) {
652  // master descr shouldn't be added to proteins in chunks
653  //CWGSSeqIterator::TFlags flags = it.fDefaultFlags & ~it.fMasterDescr;
654  ret = make_shared<SWGSData>();
655  ret->m_GetResult = SWGSData::eResult_Found;
656  ret->m_Id2BlobId.Reset(&GetBlobId(seq0));
657  ret->m_BlobId = GetPSGBlobId(*ret->m_Id2BlobId);
658  ret->m_SplitVersion = parsed_id2info.split_version;
659  ret->m_Id2BlobState = id2_blob_state;
660  ret->m_Data = it.GetChunkDataForVersion(chunk_id, parsed_id2info.split_version);
661  if ( !ret->m_Data ) {
662  ret->m_Data = new CAsnBinData(*it.GetChunkForVersion(chunk_id, parsed_id2info.split_version));
663  }
664  ret->m_Compress = GetCompress(m_Config.m_CompressData, seq, *ret->m_Data);
665  }
666  return ret;
667 }
668 
669 
672  EPSGOperationStatus status)
673 {
675  GetTiming().Register(nullptr, operation, status, start, 0);
676 }
677 
678 
679 CWGSDb CWGSClient::GetWGSDb(const string& prefix)
680 {
681  CWGSDb wgs_db;
682  {{
683  CRef<CWGSDbInfo> delete_info; // delete stale file info after releasing mutex
684  auto slot = m_WGSDbCache.GetSlot(prefix);
685  TWGSDbCache::CSlot::TSlotMutex::TWriteLockGuard guard(slot->GetSlotMutex());
686  CRef<CWGSDbInfo> info = slot->GetObject<CWGSDbInfo>();
687  if ( info && slot->IsExpired(m_WGSDbCache, prefix) ) {
688  PSG_INFO("PSGS_WGS: GetWGSDb: opened " << prefix << " has expired");
689  slot->ResetObject();
690  delete_info.Swap(info);
691  }
692  if ( !info ) {
693  slot->UpdateExpiration(m_WGSDbCache, prefix);
694  try {
695  psg_time_point_t start = psg_clock_t::now();
696  wgs_db = CWGSDb(m_Mgr, prefix);
698  wgs_db.LoadMasterDescr();
699  }
701  }
702  catch ( CSraException& exc ) {
703  if ( exc.GetErrCode() == exc.eNotFoundDb ||
704  exc.GetErrCode() == exc.eProtectedDb ) {
705  // no such WGS table
706  }
707  else {
708  // problem in VDB or WGS reader
709  PSG_ERROR("PSGS_WGS: Exception while opening WGS DB " << prefix << ": " << exc);
710  throw;
711  }
712  return CWGSDb();
713  }
714  catch ( CException& exc ) {
715  // problem in VDB or WGS reader
716  PSG_ERROR("PSGS_WGS: Exception while opening WGS DB " << prefix << ": " << exc);
717  throw;
718  }
719  catch ( exception& exc ) {
720  // problem in VDB or WGS reader
721  PSG_ERROR("PSGS_WGS: Exception while opening WGS DB " << prefix << ": " << exc.what());
722  throw;
723  }
724  info = new CWGSDbInfo;
725  info->m_WGSDb = wgs_db;
726  slot->SetObject(info);
727  }
728  wgs_db = info->m_WGSDb;
729  }}
730  if ( wgs_db->IsReplaced() && !s_KeepReplaced() ) {
731  // replaced
732  PSG_INFO("PSGS_WGS: GetWGSDb: " << prefix << " is replaced");
733  return CWGSDb();
734  }
735  else {
736  // found
737  PSG_INFO("PSGS_WGS: GetWGSDb: " << prefix);
738  return wgs_db;
739  }
740 }
741 
742 
744 {
745  if ( !seq.m_WGSDb ) {
746  seq.m_WGSDb = GetWGSDb(seq.m_WGSAcc);
747  if ( seq.m_WGSDb ) {
748  seq.m_IsWGS = true;
749  seq.m_RowDigits = Uint1(seq.m_WGSDb->GetIdRowDigits());
750  }
751  }
752  return seq.m_WGSDb;
753 }
754 
755 
757 {
758  seq.m_ContigIter.Reset();
759  seq.m_ScaffoldIter.Reset();
760  seq.m_ProteinIter.Reset();
761  seq.m_BlobId.Reset();
762 }
763 
764 
766 {
767  if ( !seq.m_ContigIter ) {
770  seq.m_ContigIter.SelectAccVersion(seq.m_Version);
771  }
772  return seq.m_ContigIter;
773 }
774 
775 
777 {
778  if ( !seq.m_ScaffoldIter ) {
780  }
781  return seq.m_ScaffoldIter;
782 }
783 
784 
786 {
787  if ( !seq.m_ProteinIter ) {
789  }
790  return seq.m_ProteinIter;
791 }
792 
793 
796 {
797  if ( seq0.m_RootSeq.get() ) {
798  return *seq0.m_RootSeq;
799  }
800  if ( seq0.m_NoRootSeq ) {
801  return seq0;
802  }
803  if ( !seq0.IsProtein() ) {
804  seq0.m_NoRootSeq = true;
805  return seq0;
806  }
807  // proteins can be located in nuc-prot set
808  TVDBRowId cds_row_id = GetProteinIterator(seq0).GetBestProductFeatRowId();
809  if ( !cds_row_id ) {
810  seq0.m_NoRootSeq = true;
811  return seq0;
812  }
813  CWGSFeatureIterator cds_it(GetWGSDb(seq0), cds_row_id);
814  if ( !cds_it ) {
815  seq0.m_NoRootSeq = true;
816  return seq0;
817  }
818  switch ( cds_it.GetLocSeqType() ) {
820  {
821  // switch to contig
822  seq0.m_RootSeq.reset(new SWGSSeqInfo(seq0));
823  SWGSSeqInfo& seq = *seq0.m_RootSeq;
824  seq.SetContig();
825  seq.m_RowId = cds_it.GetLocRowId();
826  ResetIteratorCache(seq);
827  return seq;
828  }
830  {
831  // switch to scaffold
832  seq0.m_RootSeq.reset(new SWGSSeqInfo(seq0));
833  SWGSSeqInfo& seq = *seq0.m_RootSeq;
834  seq.SetScaffold();
835  seq.m_RowId = cds_it.GetLocRowId();
836  ResetIteratorCache(seq);
837  return seq;
838  }
839  default:
840  seq0.m_NoRootSeq = true;
841  return seq0;
842  }
843 }
844 
845 
847 {
848  if ( seq.IsContig() ) {
849  return GetContigIterator(seq);
850  }
851  if ( seq.IsScaffold() ) {
852  return GetScaffoldIterator(seq);
853  }
854  if ( seq.IsProtein() ) {
855  return GetProteinIterator(seq);
856  }
857  // master
858  return true;
859 }
860 
861 
863 {
864  if ( !seq ) {
865  return false;
866  }
867  if ( seq.IsContig() ) {
869  return it && it.HasAccVersion(version);
870  }
871  else if ( seq.IsProtein() ) {
873  return it && it.GetAccVersion() == version;
874  }
875  else if ( seq.IsMaster() ) {
876  // master version is already checked
877  return true;
878  }
879  else {
880  // scaffolds can have only version 1
881  return version == 1;
882  }
883 }
884 
885 
887 {
888  if ( !seq.IsProtein() ) {
889  return false;
890  }
892  if ( !it.HasGi() ) {
893  return false;
894  }
895  const auto project_state = seq.m_WGSDb->GetProjectGBState();
896  switch (project_state) {
899  return it.GetGBState() == special_state;
900  default:
901  return project_state == special_state;
902  }
903 }
904 
905 
907 {
908  if ( !seq.IsProtein() ) {
909  return false;
910  }
912  if ( !it.HasGi() ) {
913  return false;
914  }
915  const auto project_state = seq.m_WGSDb->GetProjectGBState();
916  switch (project_state) {
920  default:
921  return false;
922  }
923 }
924 
925 
927 CWGSClient::Resolve(const CSeq_id& id, bool skip_lookup)
928 {
929  switch ( id.Which() ) {
930  case CSeq_id::e_Gi:
931  return ResolveGi(id.GetGi(), skip_lookup);
932  case CSeq_id::e_General:
933  return ResolveGeneral(id.GetGeneral(), skip_lookup);
934  case CSeq_id::e_not_set:
935  case CSeq_id::e_Local:
936  case CSeq_id::e_Gibbsq:
937  case CSeq_id::e_Gibbmt:
938  case CSeq_id::e_Giim:
939  case CSeq_id::e_Patent:
940  case CSeq_id::e_Pdb:
941  return SWGSSeqInfo();
942  default:
943  break;
944  }
945  const CTextseq_id* text_id = id.GetTextseq_Id();
946  if ( !text_id ) {
947  return SWGSSeqInfo();
948  }
949  SWGSSeqInfo seq = ResolveAcc(*text_id, skip_lookup);
950  if ( !seq ) {
951  return seq;
952  }
953  if ( text_id->IsSetVersion() ) {
954  int version = text_id->GetVersion();
955  if ( !IsCorrectVersion(seq, version) ) {
956  seq.m_ValidWGS = false;
957  return seq;
958  }
959  if ( seq.IsContig() ) {
960  GetContigIterator(seq).SelectAccVersion(version);
961  seq.m_Version = version;
962  }
963  }
964  seq.m_ValidWGS = true;
965  return seq;
966 }
967 
968 
970 CWGSClient::ResolveGeneral(const CDbtag& dbtag, bool skip_lookup)
971 {
972  const CObject_id& object_id = dbtag.GetTag();
973  const string& db = dbtag.GetDb();
974  if ( db.size() != kTypePrefixLen+kNumLettersV1 /* WGS:AAAA */ &&
975  db.size() != kTypePrefixLen+kPrefixLenV1 /* WGS:AAAA01 */ &&
976  db.size() != kTypePrefixLen+kNumLettersV2 /* WGS:AAAAAA */ &&
977  db.size() != kTypePrefixLen+kPrefixLenV2 /* WGS:AAAAAA01 */ ) {
978  return SWGSSeqInfo();
979  }
980  bool is_tsa = false;
981  if ( NStr::StartsWith(db, "WGS:", NStr::eNocase) ) {
982  }
983  else if ( NStr::StartsWith(db, "TSA:", NStr::eNocase) ) {
984  is_tsa = true;
985  }
986  else {
987  return SWGSSeqInfo();
988  }
989  string wgs_acc = db.substr(kTypePrefixLen); // remove "WGS:" or "TSA:"
990 
991  NStr::ToUpper(wgs_acc);
992  if ( isalpha(wgs_acc.back()&0xff) ) {
993  wgs_acc += "01"; // add default version digits
994  }
995  SWGSSeqInfo seq;
996  seq.m_WGSAcc = wgs_acc;
997  seq.m_IsWGS = true;
998  if (skip_lookup) {
999  seq.m_ValidWGS = true;
1000  return seq;
1001  }
1002  CWGSDb wgs_db = GetWGSDb(seq);
1003  if ( !wgs_db || wgs_db->IsTSA() != is_tsa ) {
1004  // TSA or WGS type must match
1005  return seq;
1006  }
1007  string tag;
1008  if ( object_id.IsStr() ) {
1009  tag = object_id.GetStr();
1010  NStr::ToUpper(tag);
1011  }
1012  else {
1013  tag = NStr::NumericToString(object_id.GetId());
1014  }
1015  if ( TVDBRowId row = wgs_db.GetContigNameRowId(tag) ) {
1016  seq.m_ValidWGS = true;
1017  seq.SetContig();
1018  seq.m_RowId = row;
1019  }
1020  if ( TVDBRowId row = wgs_db.GetScaffoldNameRowId(tag) ) {
1021  seq.m_ValidWGS = true;
1022  seq.SetScaffold();
1023  seq.m_RowId = row;
1024  }
1025  if ( TVDBRowId row = wgs_db.GetProteinNameRowId(tag) ) {
1026  seq.m_ValidWGS = true;
1027  seq.SetProtein();
1028  seq.m_RowId = row;
1029  }
1030  return seq;
1031 }
1032 
1033 
1035 CWGSClient::ResolveGi(TGi gi, bool skip_lookup)
1036 {
1037  CRef<CWGSResolver> wgs_resolver = GetWGSResolver();
1038  psg_time_point_t start = psg_clock_t::now();
1039  CWGSResolver::TWGSPrefixes prefixes = wgs_resolver->GetPrefixes(gi);
1041  prefixes.empty()? eOpStatusNotFound: eOpStatusFound);
1042  ITERATE ( CWGSResolver::TWGSPrefixes, it, prefixes ) {
1043  if (skip_lookup) {
1044  SWGSSeqInfo fake_info;
1045  fake_info.m_IsWGS = fake_info.m_ValidWGS = true;
1046  return fake_info;
1047  } else if ( CWGSDb wgs_db = GetWGSDb(*it) ) {
1049  gi == wgs_db->GetMasterGi() ) {
1050  // resolve master sequence with GI from VDB
1051  wgs_resolver->SetWGSPrefix(gi, prefixes, *it);
1052  SWGSSeqInfo seq;
1053  seq.m_WGSAcc = *it;
1054  seq.m_IsWGS = true;
1055  seq.m_ValidWGS = true;
1056  seq.m_WGSDb = wgs_db;
1057  seq.m_RowDigits = Uint1(wgs_db->GetIdRowDigits());
1058  seq.SetMaster();
1059  return seq;
1060  }
1061  CWGSGiIterator gi_it(wgs_db, gi);
1062  if ( gi_it ) {
1063  wgs_resolver->SetWGSPrefix(gi, prefixes, *it);
1064  SWGSSeqInfo seq;
1065  seq.m_WGSAcc = *it;
1066  seq.m_IsWGS = true;
1067  seq.m_ValidWGS = true;
1068  seq.m_WGSDb = wgs_db;
1069  seq.m_RowDigits = Uint1(wgs_db->GetIdRowDigits());
1070  seq.m_RowId = gi_it.GetRowId();
1071  if ( gi_it.GetSeqType() == gi_it.eProt ) {
1072  seq.SetProtein();
1073  if ( !GetProteinIterator(seq) ) {
1074  return SWGSSeqInfo();
1075  }
1076  }
1077  else {
1078  seq.SetContig();
1079  if ( !GetContigIterator(seq) ) {
1080  return SWGSSeqInfo();
1081  }
1082  }
1083  return seq;
1084  }
1085  }
1086  }
1087  if ( !prefixes.empty() ) {
1088  wgs_resolver->SetNonWGS(gi, prefixes);
1089  }
1090  return SWGSSeqInfo();
1091 }
1092 
1093 
1095 CWGSClient::ResolveAcc(const CTextseq_id& id, bool skip_lookup)
1096 {
1097  if ( id.IsSetName() ) {
1098  // first try name reference if it has WGS format like AAAA01P000001
1099  // as it directly contains WGS accession
1100  if ( SWGSSeqInfo seq = ResolveWGSAcc(id.GetName(), id, fAllow_aa,
1101  skip_lookup) ) {
1102  _ASSERT(seq.IsProtein());
1103  if ( !id.IsSetAccession() ||
1105  id.GetAccession()) ) {
1106  return seq;
1107  }
1108  }
1109  }
1110  if ( !id.IsSetAccession() ) {
1111  return SWGSSeqInfo();
1112  }
1113  const string& acc = id.GetAccession();
1115  switch ( type & CSeq_id::eAcc_division_mask ) {
1116  // accepted accession types
1117  case CSeq_id::eAcc_wgs:
1119  case CSeq_id::eAcc_tsa:
1121  if ( type & CSeq_id::fAcc_prot ) {
1122  return ResolveProtAcc(id, skip_lookup);
1123  }
1124  else {
1125  return ResolveWGSAcc(acc, id, fAllow_master|fAllow_na,
1126  skip_lookup);
1127  }
1128  case CSeq_id::eAcc_other:
1129  // Some EMBL WGS accessions aren't identified as WGS, so we'll try lookup anyway
1130  if ( type == CSeq_id::eAcc_embl_prot ||
1131  (type == CSeq_id::eAcc_gb_prot && acc.size() == 10) ) { // TODO: remove
1132  return ResolveProtAcc(id, skip_lookup);
1133  }
1134  return SWGSSeqInfo();
1135  default:
1136  // non-WGS accessions
1137  return SWGSSeqInfo();
1138  }
1139 }
1140 
1141 
1143 CWGSClient::ResolveProtAcc(const CTextseq_id& id, bool skip_lookup)
1144 {
1145  const string& acc = id.GetAccession();
1146  if ( acc.size() < kMinProtAccLen || acc.size() > kMaxProtAccLen ) {
1147  return SWGSSeqInfo();
1148  }
1149  int ask_version = id.IsSetVersion()? id.GetVersion(): -1;
1150 
1151  CRef<CWGSResolver> wgs_resolver = GetWGSResolver();
1152  psg_time_point_t start = psg_clock_t::now();
1153  CWGSResolver::TWGSPrefixes prefixes = wgs_resolver->GetPrefixes(acc);
1155  prefixes.empty()? eOpStatusNotFound: eOpStatusFound);
1156  ITERATE ( CWGSResolver::TWGSPrefixes, it, prefixes ) {
1157  if (skip_lookup) {
1158  SWGSSeqInfo fake_info;
1159  fake_info.m_IsWGS = fake_info.m_ValidWGS = true;
1160  return fake_info;
1161  } else if ( CWGSDb wgs_db = GetWGSDb(*it) ) {
1162  if ( TVDBRowId row = wgs_db.GetProtAccRowId(acc, ask_version) ) {
1163  wgs_resolver->SetWGSPrefix(acc, prefixes, *it);
1164  SWGSSeqInfo seq;
1165  seq.m_WGSAcc = *it;
1166  seq.m_IsWGS = true;
1167  seq.m_ValidWGS = true;
1168  seq.m_WGSDb = wgs_db;
1169  seq.SetProtein();
1170  seq.m_RowDigits = Uint1(wgs_db->GetIdRowDigits());
1171  seq.m_RowId = row;
1172  return seq;
1173  }
1174  }
1175  }
1176  if ( !prefixes.empty() ) {
1177  wgs_resolver->SetNonWGS(acc, prefixes);
1178  }
1179  return SWGSSeqInfo();
1180 }
1181 
1182 
1184 CWGSClient::ResolveWGSAcc(const string& acc,
1185  const CTextseq_id& id,
1186  TAllowSeqType allow_seq_type,
1187  bool skip_lookup)
1188 {
1189  if ( acc.size() < kPrefixLenV1 + kMinRowDigitsV1 ||
1190  acc.size() > kPrefixLenV2 + kMaxRowDigitsV2 + 1 ) { // one for type letter
1191  return SWGSSeqInfo();
1192  }
1193  size_t num_letters;
1194  for ( num_letters = 0; num_letters < kNumLettersV2; ++num_letters ) {
1195  if ( !isalpha(acc[num_letters]&0xff) ) {
1196  break;
1197  }
1198  }
1199  if ( num_letters != kNumLettersV1 && num_letters != kNumLettersV2 ) {
1200  return SWGSSeqInfo();
1201  }
1202  size_t prefix_len = num_letters + kVersionDigits;
1203  for ( size_t i = num_letters; i < prefix_len; ++i ) {
1204  if ( !isdigit(acc[i]&0xff) ) {
1205  return SWGSSeqInfo();
1206  }
1207  }
1208  SWGSSeqInfo seq;
1209  seq.m_WGSAcc = acc.substr(0, prefix_len);
1210  NStr::ToUpper(seq.m_WGSAcc);
1211  seq.m_IsWGS = true;
1212  SIZE_TYPE row_pos = prefix_len;
1213  switch ( acc[row_pos] ) { // optional type letter
1214  case 'S':
1215  seq.SetScaffold();
1216  if ( !(allow_seq_type & fAllow_scaffold) ) {
1217  return seq;
1218  }
1219  ++row_pos;
1220  break;
1221  case 'P':
1222  seq.SetProtein();
1223  if ( !(allow_seq_type & fAllow_protein) ) {
1224  return seq;
1225  }
1226  ++row_pos;
1227  break;
1228  default:
1229  // it can be either contig or master sequence
1230  if ( !(allow_seq_type & (fAllow_master|fAllow_contig)) ) {
1231  return seq;
1232  }
1233  break;
1234  }
1235  size_t row_digits = acc.size() - row_pos;
1236  if ( num_letters == kNumLettersV1 ) {
1237  if ( row_digits < kMinRowDigitsV1 || row_digits > kMaxRowDigitsV1 ) {
1238  return SWGSSeqInfo();
1239  }
1240  }
1241  else {
1242  if ( row_digits < kMinRowDigitsV2 || row_digits > kMaxRowDigitsV2 ) {
1243  return SWGSSeqInfo();
1244  }
1245  }
1246  Uint8 row = 0;
1247  for ( size_t i = row_pos; i < acc.size(); ++i ) {
1248  char c = acc[i];
1249  if ( c < '0' || c > '9' ) {
1250  return SWGSSeqInfo();
1251  }
1252  row = row*10+(c-'0');
1253  }
1254  seq.m_RowId = row;
1255  if ( !row ) {
1256  // zero row might be master WGS sequence
1257  // it mustn't have type letter, version digits and row must be zero
1258  // version must be positive
1259  if ( !seq.IsMaster() ) {
1260  return SWGSSeqInfo();
1261  }
1262  if ( !(allow_seq_type & fAllow_master) ) {
1263  return seq;
1264  }
1265  // now, move version into version digits of the accession
1266  int version = id.IsSetVersion()? id.GetVersion(): 1;
1267  if ( version <= 0 ) {
1268  return SWGSSeqInfo();
1269  }
1270  for ( size_t i = kVersionDigits; i--; version /= 10) {
1271  if ( acc[num_letters+i] != '0' ) {
1272  return SWGSSeqInfo();
1273  }
1274  seq.m_WGSAcc[num_letters+i] = char('0'+version%10);
1275  }
1276  if ( version ) {
1277  // doesn't fit
1278  return SWGSSeqInfo();
1279  }
1280  }
1281  else if ( seq.IsContig() ) {
1282  if ( !(allow_seq_type & fAllow_contig) ) {
1283  return seq;
1284  }
1285  }
1286  if (skip_lookup) {
1287  seq.m_ValidWGS = true;
1288  return seq;
1289  }
1290  if ( !GetWGSDb(seq) ) {
1291  return seq;
1292  }
1293  if ( seq.m_WGSDb->GetIdRowDigits() != row_digits ) {
1294  return seq;
1295  }
1296  if ( !row ) {
1298  // no master resolution
1299  seq.m_IsWGS = false;
1300  return seq;
1301  }
1302  else if ( kResolveMaster == eResolveMaster_without_gi ) {
1303  // only master sequences w/o GI are resolved
1304  if ( GetWGSDb(seq)->GetMasterGi() != ZERO_GI ) {
1305  // Let master sequences with GI to be processed by ID
1306  seq.m_IsWGS = false;
1307  return seq;
1308  }
1309  }
1310  }
1311  else if ( !IsValidRowId(seq) ) {
1312  return seq;
1313  }
1314  seq.m_ValidWGS = true;
1315  return seq;
1316 }
1317 
1318 
1320 CWGSClient::ResolveBlobId(const CID2_Blob_Id& id, bool skip_lookup)
1321 {
      // Decode an ID2 blob id (sat / sub-sat / sat-key) into a SWGSSeqInfo.
      // Two encodings are recognized, selected by the 'sat' value:
      //  - sat == kBlobIdV1Sat: prefix of kNumLettersV1 letters, packed into sub-sat;
      //  - sat in [kBlobIdV2SatMin, kBlobIdV2SatMax]: 6 letters + 2 digits,
      //    packed into a 64-bit value spread over sat and sub-sat.
      // Any other 'sat' returns a default-constructed SWGSSeqInfo.
      // m_ValidWGS and m_RowId are set only when 'skip_lookup' is true or
      // GetWGSDb(seq) succeeds.
      // NOTE(review): the return-type line of this definition is not visible
      // in this extract.
1322  SWGSSeqInfo seq;
1323  CID2_Blob_Id::TSat sat = id.GetSat();
1324  if ( sat == kBlobIdV1Sat ) {
1325  // old 4-letter WGS accession format
1326  seq.m_IsWGS = true;
1327  unsigned subsat = unsigned(id.GetSub_sat());
      // non-zero low eBlobIdBits_type bits carry an explicit sequence type code
1328  if ( unsigned seq_type = (subsat & ((1 << eBlobIdBits_type)-1)) ) {
1329  bool bad = false;
1330  // old blob-id subsat format
1331  switch ( seq_type ) {
1332  case eBlobType_contig: seq.SetContig(); break;
1333  case eBlobType_scaffold: seq.SetScaffold(); break;
1334  case eBlobType_protein: seq.SetProtein(); break;
1335  }
      // prefix characters are stored as consecutive fixed-width bit fields:
      // first the letters (eBlobIdBits_letter bits each), then the digits
      // (eBlobIdBits_digit bits each)
1336  int bit = eBlobIdBits_type;
1337  for ( size_t i = 0; i < kPrefixLenV1; ++i ) {
1338  if ( i < kNumLettersV1 ) {
1339  int v = (subsat >> bit)&((1 << eBlobIdBits_letter)-1);
1340  if ( v < 26 ) {
1341  seq.m_WGSAcc += char('A'+v);
1342  }
1343  else {
1344  bad = true;
1345  break;
1346  }
1347  bit += eBlobIdBits_letter;
1348  }
1349  else {
1350  int v = (subsat >> bit)&((1 << eBlobIdBits_digit)-1);
1351  if ( v < 10 ) {
1352  seq.m_WGSAcc += char('0'+v);
1353  }
1354  else {
1355  bad = true;
1356  break;
1357  }
1358  bit += eBlobIdBits_digit;
1359  }
1360  }
1361  if ( seq.IsContig() ) {
1362  // old format means version is 1
1363  seq.m_Version = 1;
1364  }
      // on a bad field the result keeps m_IsWGS == true but m_ValidWGS is never set
1365  if ( bad ) { // bad format - illegal letters or digits
1366  return seq;
1367  }
1368  }
1369  else {
      // type bits are zero: contig whose prefix and version are packed in
      // mixed radix (base 26 per letter, base 10 per digit) above a factor of 4
1370  seq.SetContig();
1371  subsat /= 4;
1372  for ( size_t i = 0; i < kPrefixLenV1; ++i ) {
1373  if ( i < kNumLettersV1 ) {
1374  seq.m_WGSAcc += char('A'+subsat%26);
1375  subsat /= 26;
1376  }
1377  else {
1378  seq.m_WGSAcc += char('0'+subsat%10);
1379  subsat /= 10;
1380  }
1381  }
1382  seq.m_Version = subsat + 2; // remaining value is version
1383  }
1384  // verify if the WGS accession actually exists in VDB
1385  if (skip_lookup || GetWGSDb(seq)) {
1386  seq.m_ValidWGS = true;
1387  seq.m_RowId = id.GetSat_key();
1388  }
1389  }
1390  else if ( sat >= kBlobIdV2SatMin && sat <= kBlobIdV2SatMax ) {
1391  seq.m_IsWGS = true;
      // 64-bit packed value: high part is the offset of sat from kBlobIdV2SatMin,
      // low 32 bits are sub-sat
1392  Uint8 v = (Uint8(sat-kBlobIdV2SatMin) << 32)|Uint4(id.GetSub_sat());
1393  for ( size_t i = 0; i < 6; ++i ) {
1394  seq.m_WGSAcc += char('A'+v%26);
1395  v /= 26;
1396  }
1397  for ( size_t i = 0; i < 2; ++i ) {
1398  seq.m_WGSAcc += char('0'+v%10);
1399  v /= 10;
1400  }
      // what is left of v selects scaffold, protein, or a contig version
1401  if ( v == kBlobIdV2VersionScaffold ) {
1402  seq.SetScaffold();
1403  }
1404  else if ( v == kBlobIdV2VersionProtein ) {
1405  seq.SetProtein();
1406  }
1407  else {
1408  seq.SetContig();
1409  seq.m_Version = int(v - kBlobIdV2VersionContig + 1);
1410  }
1411  // verify if the WGS accession actually exists in VDB
1412  if (skip_lookup || GetWGSDb(seq)) {
1413  seq.m_ValidWGS = true;
1414  seq.m_RowId = id.GetSat_key();
1415  }
1416  }
1417  return seq;
1418 }
1419 
1420 
1422 {
      // NOTE(review): the signature line is missing from this extract; this
      // looks like the body of CWGSClient::GetBlobId(SWGSSeqInfo&) — confirm.
      // Builds (and caches in seq0.m_BlobId) the ID2 blob id of the root
      // sequence of 'seq0'. A previously cached blob id is returned as is.
1423  if ( seq0.m_BlobId ) {
1424  return *seq0.m_BlobId;
1425  }
      // NOTE(review): the line that declares 'id' (used below) is not visible here.
1427  SWGSSeqInfo& seq = GetRootSeq(seq0);
1428  if ( seq.m_WGSAcc.size() == kPrefixLenV2 ) {
      // pack the prefix in mixed radix: base 26 for each of the first
      // kNumLettersV2 characters, base 10 for the remaining ones
1429  Uint8 mul = 1;
1430  Uint8 value = 0;
1431  for ( size_t i = 0; i < seq.m_WGSAcc.size(); ++i ) {
1432  if ( i < kNumLettersV2 ) {
1433  value += (seq.m_WGSAcc[i]-'A')*mul;
1434  mul *= 26;
1435  }
1436  else {
1437  value += (seq.m_WGSAcc[i]-'0')*mul;
1438  mul *= 10;
1439  }
1440  }
      // NOTE(review): the assignments to 'version' in the three branches
      // below are not visible in this extract.
1441  unsigned version;
1442  if ( seq.IsScaffold() ) {
1444  }
1445  else if ( seq.IsProtein() ) {
1447  }
1448  else {
1449  _ASSERT(seq.IsContig());
1450  if ( seq.m_Version == -1 ) {
1451  // need contig version to choose appropriate blob-id format
1452  seq.m_Version = GetContigIterator(seq).GetLatestAccVersion();
1453  }
1454  _ASSERT(seq.m_Version >= 1);
1455  _ASSERT(seq.m_Version <= 16);
1457  }
      // the type/version code goes above the packed prefix
1458  value += mul*version;
1459  CID2_Blob_Id::TSat sat = kBlobIdV2SatMin + int(value >> 32); // high 32 bits
1460  CID2_Blob_Id::TSub_sat subsat = int(value & 0xFFFFFFFF); // low 32 bits
1461  _ASSERT(sat >= kBlobIdV2SatMin && sat <= kBlobIdV2SatMax);
1462  id->SetSat(sat);
1463  id->SetSub_sat(subsat);
1464  id->SetSat_key(int(seq.m_RowId));
1465  }
1466  else {
1467  _ASSERT(seq.m_WGSAcc.size() == kPrefixLenV1);
1468  unsigned subsat;
1469  if ( seq.IsContig() && seq.m_Version == -1 ) {
1470  // need contig version to choose appropriate blob-id format
1471  seq.m_Version = GetContigIterator(seq).GetLatestAccVersion();
1472  }
1473  if ( !seq.IsContig() || seq.m_Version <= 1 ) {
1474  // old blob-id subsat format, version is 1
      // the low bits hold the sequence type code
1475  if ( seq.IsScaffold() ) {
1476  subsat = eBlobType_scaffold;
1477  }
1478  else if ( seq.IsProtein() ) {
1479  subsat = eBlobType_protein;
1480  }
1481  else { // contig or master
1482  subsat = eBlobType_contig;
1483  }
      // each prefix character occupies its own bit field above the type bits
1484  int bit = eBlobIdBits_type;
1485  for ( size_t i = 0; i < seq.m_WGSAcc.size(); ++i ) {
1486  if ( i < kNumLettersV1 ) {
1487  subsat |= (seq.m_WGSAcc[i]-'A') << bit;
1488  bit += eBlobIdBits_letter;
1489  }
1490  else {
1491  subsat |= (seq.m_WGSAcc[i]-'0') << bit;
1492  bit += eBlobIdBits_digit;
1493  }
1494  }
1495  }
1496  else {
1497  // new blob-id subsat format that includes contig version > 1
1498  _ASSERT(seq.IsContig());
1499  _ASSERT(seq.m_Version >= 2);
1500  _ASSERT(seq.m_Version <= 24);
      // starting the multiplier at 4 leaves the two low bits of subsat zero
1501  subsat = 0;
1502  unsigned mul = 4;
1503  for ( size_t i = 0; i < seq.m_WGSAcc.size(); ++i ) {
1504  if ( i < kNumLettersV1 ) {
1505  subsat += (seq.m_WGSAcc[i]-'A')*mul;
1506  mul *= 26;
1507  }
1508  else {
1509  subsat += (seq.m_WGSAcc[i]-'0')*mul;
1510  mul *= 10;
1511  }
1512  }
      // version is stored as an offset from 2 above the packed prefix
1513  subsat += (seq.m_Version - 2)*mul;
1514  }
1515  id->SetSat(kBlobIdV1Sat);
1516  id->SetSub_sat(int(subsat));
1517  id->SetSat_key(int(seq.m_RowId));
1518  }
      // remember the result on the sequence passed by the caller
1519  seq0.m_BlobId = id;
1520  return *id;
1521 }
1522 
1523 
// Translate a GenBank state value into a bit mask of ID2 blob state flags
// (bit N set means ID2 blob state N). Unhandled values yield 0.
// NOTE(review): the 'case' labels and some of the flag assignments of this
// switch are not visible in this extract; only the 'dead' flag line survived.
1524 static int s_GBStateToID2(NCBI_gb_state gb_state)
1525 {
1526  int state = 0;
1527  switch ( gb_state ) {
1530  break;
1533  state |= 1 << eID2_Blob_State_dead;
1534  break;
1537  break;
1538  default:
1539  break;
1540  }
1541  return state;
1542 }
1543 
1544 
1546 {
      // NOTE(review): signature line missing from this extract; presumably
      // CWGSClient::GetID2BlobState(SWGSSeqInfo& seq) — confirm.
      // ID2 state mask = flags from the sequence's own GB state OR-ed with
      // flags from the GB state of the whole project.
1547  return s_GBStateToID2(GetGBState(seq)) | s_GBStateToID2(seq.m_WGSDb->GetProjectGBState());
1548 }
1549 
1550 
1552 {
      // NOTE(review): signature line missing from this extract; presumably
      // CWGSClient::GetGBState(SWGSSeqInfo& seq0) — confirm.
      // The state is taken from the root sequence of 'seq0': contigs and
      // proteins ask their iterator, scaffolds and master always report 0.
1553  SWGSSeqInfo& seq = GetRootSeq(seq0);
1554  if ( seq.IsContig() ) {
1555  return GetContigIterator(seq).GetGBState();
1556  }
1557  if ( seq.IsScaffold() ) {
1558  return 0;
1559  }
1560  if ( seq.IsProtein() ) {
1561  return GetProteinIterator(seq).GetGBState();
1562  }
1563  // master
1564  return 0;
1565 }
1566 
1567 
1569 {
      // NOTE(review): signature line missing from this extract; presumably
      // CWGSClient::GetAccVer(SWGSSeqInfo& seq) — confirm.
      // Returns the accession Seq-id from the iterator matching the sequence
      // type; for a master sequence the id comes from the WGS database object.
1570  if ( seq.IsContig() ) {
1571  return GetContigIterator(seq).GetAccSeq_id();
1572  }
1573  if ( seq.IsScaffold() ) {
1574  return GetScaffoldIterator(seq).GetAccSeq_id();
1575  }
1576  if ( seq.IsProtein() ) {
1577  return GetProteinIterator(seq).GetAccSeq_id();
1578  }
1579  // master
1580  return GetWGSDb(seq)->GetMasterSeq_id();
1581 }
1582 
1583 
1585 {
      // NOTE(review): signature line missing from this extract; presumably
      // CWGSClient::GetGeneral(SWGSSeqInfo& seq) — confirm.
      // Returns the general-or-patent Seq-id from the iterator matching the
      // sequence type; a master sequence yields a null reference.
1586  if ( seq.IsContig() ) {
1587  return GetContigIterator(seq).GetGeneralOrPatentSeq_id();
1588  }
1589  if ( seq.IsScaffold() ) {
1590  return GetScaffoldIterator(seq).GetGeneralOrPatentSeq_id();
1591  }
1592  if ( seq.IsProtein() ) {
1593  return GetProteinIterator(seq).GetGeneralOrPatentSeq_id();
1594  }
1595  // master
1596  return null;
1597 }
1598 
1599 
1601 {
      // NOTE(review): signature line missing from this extract; presumably
      // CWGSClient::GetGi(SWGSSeqInfo& seq) — confirm.
      // Returns the GI of a contig or protein when its iterator has one,
      // ZERO_GI in every other case.
      // NOTE(review): the lines declaring 'it' in the contig and protein
      // branches are not visible in this extract.
1602  if ( seq.IsContig() ) {
1604  return it.HasGi()? it.GetGi(): ZERO_GI;
1605  }
1606  if ( seq.IsScaffold() ) {
1607  // scaffolds have no GIs
1608  return ZERO_GI;
1609  }
1610  if ( seq.IsProtein() ) {
1612  return it.HasGi()? it.GetGi(): ZERO_GI;
1613  }
1614  // master
1615  return ZERO_GI;
1616 }
1617 
1618 
1620 {
      // NOTE(review): signature line missing from this extract; presumably
      // CWGSClient::GetSeqIds(SWGSSeqInfo& seq, list< CRef<CSeq_id> >& ids) — confirm.
      // Appends to 'ids', in this order: the accession id, the general id
      // (only if non-null), and a GI id (only if the GI is not ZERO_GI).
1621  ids.push_back(GetAccVer(seq));
1622  if ( CRef<CSeq_id> id = GetGeneral(seq) ) {
1623  ids.push_back(id);
1624  }
1625  TGi gi = GetGi(seq);
1626  if ( gi != ZERO_GI ) {
1627  CRef<CSeq_id> gi_id(new CSeq_id);
1628  gi_id->SetGi(gi);
1629  ids.push_back(gi_id);
1630  }
1631 }
1632 
1633 
1636  const SWGSSeqInfo& seq,
1637  const CAsnBinData& data) const
1638 {
      // NOTE(review): the beginning of the signature and one 'case' line of
      // the switch are missing from this extract; presumably this is
      // CWGSClient::GetCompress(ECompressData comp, ...) — confirm.
      // eCompressData_never disables compression; the default branch asks
      // for compression only when the main object is a CSeq_entry and the
      // sequence is a master.
1639  switch ( comp ) {
1641  case SWGSProcessor_Config::eCompressData_never: return false;
1642  default: return dynamic_cast<const CSeq_entry*>(&data.GetMainObject()) && seq.IsMaster();
1643  }
1644 }
1645 
1646 
1647 void CWGSClient::SetSeqId(CSeq_id& id, int seq_id_type, const string& seq_id)
1648 {
1649  if (seq_id_type <= 0) {
1650  // no type check
1651  id.Set(seq_id);
1652  }
1653  else {
1654  id.Set(CSeq_id::eFasta_AsTypeAndContent, CSeq_id::E_Choice(seq_id_type), seq_id);
1655  }
1656 }
1657 
1658 
// Build a fresh SWGSData for 'seq' and fill its CBioseqInfoRecord: ids,
// per-type attributes, blob id, date changed and state.
// 'data' is left untouched when 'seq' evaluates to false; otherwise it is
// replaced with a new object whose m_GetResult is eResult_Found.
// Each filled field is reported by OR-ing flags into data->m_BioseqInfoFlags.
// NOTE(review): several lines of this function (the declaration of 'psg_ids',
// the iterator declarations named 'it', and the operands of some flag
// assignments) are not visible in this extract.
1659 void CWGSClient::GetBioseqInfo(shared_ptr<SWGSData>& data, SWGSSeqInfo& seq)
1660 {
1661  if ( !seq ) return;
1662 
1663  data = make_shared<SWGSData>();
1664  data->m_GetResult = SWGSData::eResult_Found;
1665  data->m_BioseqInfo = make_shared<CBioseqInfoRecord>();
1666  CBioseqInfoRecord& info = *data->m_BioseqInfo;
1667 
1668  list< CRef<CSeq_id> > wgs_ids;
1669  GetSeqIds(seq, wgs_ids);
1671  TGi gi = ZERO_GI;
      // distribute the ids: GI is remembered separately, the first versioned
      // text accession becomes the canonical id, everything else goes to psg_ids
1672  for ( auto& id : wgs_ids ) {
1673  if ( id->IsGi() ) {
1674  gi = id->GetGi();
1675  info.SetGI(GI_TO(CBioseqInfoRecord::TGI, gi));
1676  data->m_BioseqInfoFlags |= SPSGS_ResolveRequest::fPSGS_Gi;
1677  continue;
1678  }
1679  else if ( auto text_id = id->GetTextseq_Id() ) {
1680  // only versioned accession goes to canonical id
1681  if ( !(data->m_BioseqInfoFlags & SPSGS_ResolveRequest::fPSGS_CanonicalId) &&
1682  text_id->IsSetAccession() && text_id->IsSetVersion() ) {
1683  info.SetSeqIdType(id->Which());
1684  info.SetAccession(text_id->GetAccession());
1685  info.SetVersion(text_id->GetVersion());
1686  if ( text_id->IsSetName() ) {
1687  info.SetName(text_id->GetName());
1688  }
1689  data->m_BioseqInfoFlags |=
1692  continue;
1693  }
1694  }
1695  string content;
1696  id->GetLabel(&content, CSeq_id::eFastaContent);
1697  psg_ids.insert(make_tuple(id->Which(), std::move(content)));
1698  }
1699  if ( gi != ZERO_GI ) {
1700  // gi goes either to canonical id or to other ids
1701  CSeq_id gi_id(CSeq_id::e_Gi, gi);
1702  string content;
1703  gi_id.GetLabel(&content, CSeq_id::eFastaContent);
1704  if ( !(data->m_BioseqInfoFlags & SPSGS_ResolveRequest::fPSGS_CanonicalId) ) {
1705  // set canonical id from gi
1706  info.SetAccession(content);
1707  info.SetVersion(0);
1708  info.SetSeqIdType(gi_id.Which());
1709  data->m_BioseqInfoFlags |=
1712  }
1713  else {
1714  // to other ids
1715  psg_ids.insert(make_tuple(gi_id.Which(), std::move(content)));
1716  }
1717  }
1718  if ( (data->m_BioseqInfoFlags & SPSGS_ResolveRequest::fPSGS_CanonicalId) || !psg_ids.empty() ) {
1719  info.SetSeqIds(std::move(psg_ids));
1720  // all ids are requested, so we should get GI and acc.ver too if they exist
1721  info.SetGI(GI_TO(CBioseqInfoRecord::TGI, gi)); // even if it's zero
1722  data->m_BioseqInfoFlags |=
1726  }
1727 
      // per-type attributes: hash / length / molecule type / tax id
1728  if ( seq.IsContig() ) {
1730  info.SetHash(it.GetSeqHash());
1731  info.SetLength(it.GetSeqLength());
1732  info.SetMol(GetWGSDb(seq)->GetContigMolType());
1733  data->m_BioseqInfoFlags |=
1737  if ( it.HasTaxId() ) {
1738  info.SetTaxId(it.GetTaxId());
1739  data->m_BioseqInfoFlags |= SPSGS_ResolveRequest::fPSGS_TaxId;
1740  }
1741  }
1742  if ( seq.IsScaffold() ) {
1744  info.SetLength(it.GetSeqLength());
1745  info.SetMol(GetWGSDb(seq)->GetScaffoldMolType());
1746  data->m_BioseqInfoFlags |=
1749  }
1750  if ( seq.IsProtein() ) {
1752  info.SetLength(it.GetSeqLength());
1753  info.SetMol(GetWGSDb(seq)->GetProteinMolType());
1754  data->m_BioseqInfoFlags |=
1757  if ( it.HasSeqHash() ) {
1758  info.SetHash(it.GetSeqHash());
1759  data->m_BioseqInfoFlags |=
1761  }
1762  {{
1763  // set taxid
1764  auto wgs = GetWGSDb(seq);
1765  // faster common taxid retrieval if possible
1766  if ( wgs->HasCommonTaxId() ) {
1767  info.SetTaxId(wgs->GetCommonTaxId());
1768  data->m_BioseqInfoFlags |= SPSGS_ResolveRequest::fPSGS_TaxId;
1769  }
1770  else {
1771  // otherwise get taxid from root sequence (contig or protein itself)
1772  auto& root_seq = GetRootSeq(seq);
1773  if ( root_seq.IsContig() ) {
1774  CWGSSeqIterator root_it = GetContigIterator(root_seq);
1775  if ( root_it.HasTaxId() ) {
1776  info.SetTaxId(root_it.GetTaxId());
1777  data->m_BioseqInfoFlags |= SPSGS_ResolveRequest::fPSGS_TaxId;
1778  }
1779  }
1780  if ( root_seq.IsProtein() ) {
1781  if ( it.HasTaxId() ) {
1782  info.SetTaxId(it.GetTaxId());
1783  data->m_BioseqInfoFlags |= SPSGS_ResolveRequest::fPSGS_TaxId;
1784  }
1785  }
1786  }
1787  }}
1788  }
1789 
      // blob id in both ID2 and PSG forms
1790  data->m_Id2BlobId.Reset(&GetBlobId(seq));
1791  data->m_BlobId = GetPSGBlobId(*data->m_Id2BlobId);
1792  data->m_BioseqInfoFlags |= SPSGS_ResolveRequest::fPSGS_BlobId;
1793  if ( data->m_Id2BlobId->IsSetVersion() ) {
1794  // ID2 version is minutes since UNIX epoch
1795  // PSG date_changed is ms since UNIX epoch
1796  info.SetDateChanged(data->m_Id2BlobId->GetVersion()*60000);
1797  data->m_BioseqInfoFlags |= SPSGS_ResolveRequest::fPSGS_DateChanged;
1798  }
1799 
      // state: ID2 bit mask first, then its PSG equivalent
1800  data->m_Id2BlobState = GetID2BlobState(seq);
1801  info.SetState(data->GetPSGBioseqState());
1802  data->m_BioseqInfoFlags |= SPSGS_ResolveRequest::fPSGS_State;
1803 }
1804 
1805 
// Load the blob data of the root sequence of 'seq0' into 'data'.
// 'data' is created (with m_GetResult == eResult_Found) if it is empty;
// blob id fields already present in it are kept. Nothing is loaded when
// data->IsForbidden() is true. If no data object could be produced, 'data'
// is reset to an empty pointer.
// NOTE(review): the lines declaring the iterators named 'it' in the contig,
// scaffold and protein branches are not visible in this extract.
1806 void CWGSClient::GetWGSData(shared_ptr<SWGSData>& data, SWGSSeqInfo& seq0)
1807 {
1808  if (!data) {
1809  data = make_shared<SWGSData>();
1810  data->m_GetResult = SWGSData::eResult_Found;
1811  }
1812  SWGSSeqInfo& seq = GetRootSeq(seq0);
1813 
1814  if ( !data->m_Id2BlobId ) data->m_Id2BlobId.Reset(&GetBlobId(seq0));
1815  if ( data->m_BlobId.empty() ) data->m_BlobId = GetPSGBlobId(*data->m_Id2BlobId);
1816  data->m_Id2BlobState = GetID2BlobState(seq0);
1817  if ( data->IsForbidden() ) return;
1818 
1819  if ( seq.IsMaster() ) {
1820  data->m_Data = new CAsnBinData(*GetWGSDb(seq)->GetMasterSeq_entry());
1821  }
1822  else if ( seq.IsContig() ) {
1824  CWGSSeqIterator::TFlags flags = it.fDefaultFlags ;
1825  if ( !s_AddMasterDescrContig() ) {
1826  flags &= ~it.fMasterDescr;
1827  }
1828  else if ( s_MarkMasterDescr() ) {
1829  flags |= it.fMasterDescrMark;
1830  }
1831  if ( !s_SplitFeatures() ) {
1832  flags &= ~it.fSplitFeatures;
1833  }
      // try the data sources in order: split info, ready Seq-entry data,
      // and finally a freshly built Seq-entry
1834  auto asn_data = it.GetSplitInfoDataAndVersion(flags);
1835  if ( asn_data.first ) {
1836  data->m_SplitVersion = asn_data.second;
1837  }
1838  if ( !asn_data.first ) {
1839  asn_data.first = it.GetSeq_entryData(flags);
1840  }
1841  if ( !asn_data.first ) {
1842  asn_data.first = new CAsnBinData(*it.GetSeq_entry(flags));
1843  }
1844  data->m_Data = asn_data.first;
1845  }
1846  else if ( seq.IsScaffold() ) {
1848  CWGSScaffoldIterator::TFlags flags = it.fDefaultFlags;
1849  if ( !s_AddMasterDescrScaffold() ) {
1850  flags &= ~it.fMasterDescr;
1851  }
1852  else if ( s_MarkMasterDescr() ) {
1853  flags |= it.fMasterDescrMark;
1854  }
1855  data->m_Data = new CAsnBinData(*it.GetSeq_entry(flags));
1856  }
1857  else if ( seq.IsProtein() ) {
1859  CWGSProteinIterator::TFlags flags = it.fDefaultFlags;
1860  if ( !s_AddMasterDescrProtein() ) {
1861  flags &= ~it.fMasterDescr;
1862  }
1863  else if ( s_MarkMasterDescr() ) {
1864  flags |= it.fMasterDescrMark;
1865  }
1866  data->m_Data = new CAsnBinData(*it.GetSeq_entry(flags));
1867  }
1868  if ( data->m_Data ) {
1869  data->m_Compress = GetCompress(m_Config.m_CompressData, seq, *data->m_Data);
1870  }
1871  else {
      // nothing was loaded: hand an empty pointer back to the caller
1872  data.reset();
1873  }
1874 }
1875 
1876 
1878 {
      // NOTE(review): signature line missing from this extract; presumably
      // CWGSClient::ParsePSGBlobId(const SPSGS_BlobId& blob_id) — confirm.
      // Parses the textual PSG blob id into sat / sub-sat / sat-key.
      // Returns null if parsing fails or if unparsed text remains.
      // NOTE(review): the line that creates 'id' is not visible in this extract.
1879  Int4 sat;
1880  Int4 subsat;
1881  Int4 satkey;
      // keep the string alive in a local: 's' below is only a view onto it
1882  auto id_str = blob_id.GetId();
1883  CTempString s = id_str;
1884  if ( !s_ParseOSGBlob(s, sat, subsat, satkey) || !s.empty() ) {
1885  return null;
1886  }
1888  id->SetSat(sat);
1889  id->SetSub_sat(subsat);
1890  id->SetSat_key(satkey);
1891  return id;
1892 }
1893 
1894 
1896 {
      // NOTE(review): signature line missing from this extract; presumably
      // CWGSClient::ParsePSGId2Info(const string& id2_info) — confirm.
      // Expected layout: <blob id> '.' <tse version> '.' <split version>,
      // with nothing left over. Any deviation returns an empty SParsedId2Info.
      // NOTE(review): the line that creates 'id' is not visible in this extract.
1897  Int4 sat;
1898  Int4 subsat;
1899  Int4 satkey;
1900  TID2BlobVersion tse_version;
1901  TID2SplitVersion split_version;
1902 
1903  CTempString s = id2_info;
1904  if ( !s_ParseOSGBlob(s, sat, subsat, satkey) ||
1905  !s_Skip(s, '.') ||
1906  !s_ParseInt(s, tse_version) ||
1907  !s_Skip(s, '.') ||
1908  !s_ParseInt(s, split_version) ||
1909  !s.empty() ) {
1910  return SParsedId2Info{};
1911  }
1912 
1914  id->SetSat(sat);
1915  id->SetSub_sat(subsat);
1916  id->SetSat_key(satkey);
1917  id->SetVersion(tse_version);
1918  return SParsedId2Info{id, split_version};
1919 }
1920 
1921 
1922 bool CWGSClient::CanBeWGS(int seq_id_type, const string& seq_id)
1923 {
1924  try {
1925  CSeq_id id;
1926  SetSeqId(id, seq_id_type, seq_id);
1927  if ( id.IsGi() ) {
1928  return true;
1929  }
1930  else if ( id.IsGeneral() ) {
1931  return IsWGSGeneral(id.GetGeneral());
1932  }
1933  else if ( auto text_id = id.GetTextseq_Id() ) {
1934  return IsWGSAccession(*text_id);
1935  }
1936  return false;
1937  }
1938  catch ( exception& /*ignored*/ ) {
1939  return false;
1940  }
1941 }
1942 
1943 
1945 {
      // NOTE(review): signature line missing from this extract; presumably
      // CWGSClient::GetPSGBlobId(const CID2_Blob_Id& blob_id) — confirm.
      // Formats the blob id as text; the result is an empty string when
      // IsOSGBlob() rejects the id.
1946  ostringstream s;
1947  if ( IsOSGBlob(blob_id) ) {
1948  s_FormatBlobId(s, blob_id);
1949  }
1950  return s.str();
1951 }
1952 
1953 
1955 {
      // NOTE(review): signature line missing from this extract; presumably
      // CWGSClient::IsOSGBlob(const CID2_Blob_Id& blob_id) — confirm.
      // Unpacks the three id components and defers to s_IsOSGBlob().
1956  return s_IsOSGBlob(blob_id.GetSat(), blob_id.GetSub_sat(), blob_id.GetSat_key());
1957 }
1958 
1959 
1960 
1962 {
      // NOTE(review): signature line and the second half of the first
      // condition are missing from this extract; presumably this is
      // SWGSData::GetPSGBioseqState() — confirm.
      // Maps the ID2 blob state bit mask in m_Id2BlobState to a single state
      // value. The checks are ordered, so 'suppressed' wins over 'dead',
      // which wins over 'withdrawn' and 'protected'; every non-live,
      // non-suppressed combination ends up as eDead.
1963  if ( m_Id2BlobState == 0 ||
1965  return eLive;
1966  }
1967  else if ( m_Id2BlobState & (1<<eID2_Blob_State_suppressed) ) {
1968  return eReserved;
1969  }
1970  else if ( m_Id2BlobState & (1<<eID2_Blob_State_dead) ) {
1971  return eDead;
1972  }
1973  else if ( m_Id2BlobState & (1<<eID2_Blob_State_withdrawn) ) {
1974  return eDead; // assume withdrawn as dead ???
1975  }
1976  else if ( m_Id2BlobState & (1<<eID2_Blob_State_protected) ) {
1977  return eDead; // assume protected (unauthorized) as dead ???
1978  }
1979  else {
1980  return eDead;
1981  }
1982 }
1983 
1984 
1985 bool SWGSData::IsForbidden(int id2_blob_state)
1986 {
1987  if ( id2_blob_state & (1<<eID2_Blob_State_withdrawn) ) {
1988  return true;
1989  }
1990  else if ( id2_blob_state & (1<<eID2_Blob_State_protected) ) {
1991  return true;
1992  }
1993  return false;
1994 }
1995 
1996 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
Definition: Dbtag.hpp:53
CID2_Blob_Id –.
Definition: ID2_Blob_Id.hpp:66
virtual void DoJob(void)
Payload function.
Definition: wgs_client.cpp:471
CIndexUpdateThread(unsigned update_delay, CRef< CWGSResolver > resolver)
Definition: wgs_client.cpp:463
CRef< CWGSResolver > m_Resolver
Definition: wgs_client.cpp:492
EPSGS_Type GetRequestType(void) const
@ ePSGS_BlobBySatSatKeyRequest
TRequest & GetRequest(void)
static CPubseqGatewayApp * GetInstance(void)
Definition: Seq_entry.hpp:56
@ eProtectedDb
DB is protected.
Definition: exception.hpp:98
@ eNotFoundDb
DB main file not found.
Definition: exception.hpp:92
virtual TErrCode GetErrCode(void) const
Definition: sraread.cpp:164
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
Adaptation of CThread class repeatedly running some job.
void RequestStop()
Schedule thread Stop.
CRef< CSlot > GetSlot(const string &acc_or_path)
Definition: vdbcache.cpp:187
static void SetSeqId(CSeq_id &id, int seq_id_type, const string &seq_id)
shared_ptr< SWGSData > GetChunk(const string &id2info, int64_t chunk_id)
Definition: wgs_client.cpp:630
bool GetCompress(SWGSProcessor_Config::ECompressData comp, const SWGSSeqInfo &seq, const objects::CAsnBinData &data) const
objects::CVDBMgr m_Mgr
Definition: wgs_client.hpp:267
SWGSSeqInfo ResolveBlobId(const objects::CID2_Blob_Id &id, bool skip_lookup=false)
int TID2BlobVersion
Definition: wgs_client.hpp:193
SWGSSeqInfo ResolveGi(TGi gi, bool skip_lookup=false)
static bool IsOSGBlob(const CID2_Blob_Id &blob_id)
shared_ptr< SWGSData > GetBlobByBlobId(const string &blob_id)
Definition: wgs_client.cpp:616
bool IsCorrectVersion(SWGSSeqInfo &seq, int version)
Definition: wgs_client.cpp:862
void ResetIteratorCache(SWGSSeqInfo &seq)
Definition: wgs_client.cpp:756
void GetWGSData(shared_ptr< SWGSData > &data, SWGSSeqInfo &seq0)
static string GetPSGBlobId(const CID2_Blob_Id &blob_id)
CRef< objects::CSeq_id > GetAccVer(SWGSSeqInfo &seq)
CWGSClient(const SWGSProcessor_Config &config)
Definition: wgs_client.cpp:503
CFastMutex m_ResolverMutex
Definition: wgs_client.hpp:268
int TAllowSeqType
Definition: wgs_client.hpp:217
SWGSSeqInfo & GetRootSeq(SWGSSeqInfo &seq0)
Definition: wgs_client.cpp:795
static bool CanBeWGS(int seq_id_type, const string &seq_id)
objects::CWGSScaffoldIterator & GetScaffoldIterator(SWGSSeqInfo &seq)
Definition: wgs_client.cpp:776
shared_ptr< SWGSData > ResolveSeqId(const objects::CSeq_id &seq_id)
Definition: wgs_client.cpp:580
TGi GetGi(SWGSSeqInfo &seq)
shared_ptr< SWGSData > GetSeqInfoBySeqId(const objects::CSeq_id &seq_id, SWGSSeqInfo &seq, const TBlobIds &excluded)
Definition: wgs_client.cpp:595
CRef< objects::CWGSResolver > m_Resolver
Definition: wgs_client.hpp:269
~CWGSClient(void)
Definition: wgs_client.cpp:510
bool IsValidRowId(SWGSSeqInfo &seq)
Definition: wgs_client.cpp:846
CRef< objects::CWGSResolver > GetWGSResolver(void)
Definition: wgs_client.cpp:519
SWGSSeqInfo ResolveAcc(const objects::CTextseq_id &id, bool skip_lookup=false)
SWGSSeqInfo Resolve(const objects::CSeq_id &id, bool skip_lookup=false)
Definition: wgs_client.cpp:927
bool HasMigrated(SWGSSeqInfo &seq)
Definition: wgs_client.cpp:906
objects::CID2_Blob_Id & GetBlobId(SWGSSeqInfo &id)
static SParsedId2Info ParsePSGId2Info(const string &idsss2_info)
objects::CWGSDb GetWGSDb(const string &prefix)
Definition: wgs_client.cpp:679
CRef< objects::CSeq_id > GetGeneral(SWGSSeqInfo &seq)
bool CanProcessRequest(CPSGS_Request &request)
Definition: wgs_client.cpp:535
int GetID2BlobState(SWGSSeqInfo &seq)
objects::CWGSSeqIterator & GetContigIterator(SWGSSeqInfo &seq)
Definition: wgs_client.cpp:765
vector< string > TBlobIds
Definition: wgs_client.hpp:122
NCBI_gb_state GetGBState(SWGSSeqInfo &seq0)
TWGSDbCache m_WGSDbCache
Definition: wgs_client.hpp:270
void GetBioseqInfo(shared_ptr< SWGSData > &data, SWGSSeqInfo &seq)
objects::CWGSProteinIterator & GetProteinIterator(SWGSSeqInfo &seq)
Definition: wgs_client.cpp:785
static CRef< CID2_Blob_Id > ParsePSGBlobId(const SPSGS_BlobId &blob_id)
void x_RegisterTiming(psg_time_point_t start, EPSGOperation operation, EPSGOperationStatus status)
Definition: wgs_client.cpp:670
int TID2SplitVersion
Definition: wgs_client.hpp:192
SWGSSeqInfo ResolveWGSAcc(const string &acc, const objects::CTextseq_id &id, TAllowSeqType allow_seq_type, bool skip_lookup=false)
bool HasSpecialState(SWGSSeqInfo &seq, NCBI_gb_state special_state)
Definition: wgs_client.cpp:886
SWGSSeqInfo ResolveGeneral(const objects::CDbtag &dbtag, bool skip_lookup=false)
Definition: wgs_client.cpp:970
SWGSSeqInfo ResolveProtAcc(const objects::CTextseq_id &id, bool skip_lookup=false)
SWGSProcessor_Config m_Config
Definition: wgs_client.hpp:266
void GetSeqIds(SWGSSeqInfo &seq, list< CRef< objects::CSeq_id > > &ids)
CRef< CThreadNonStop > m_IndexUpdateThread
Definition: wgs_client.hpp:271
int TEnabledFlags
Definition: wgs_client.hpp:191
TVDBRowId GetContigNameRowId(const string &name) const
Definition: wgsread.hpp:744
TVDBRowId GetProteinNameRowId(const string &name) const
Definition: wgsread.hpp:756
bool LoadMasterDescr(EDescrFilter filter=eDescrDefaultFilter) const
Definition: wgsread.hpp:779
TVDBRowId GetScaffoldNameRowId(const string &name) const
Definition: wgsread.hpp:750
TVDBRowId GetLocRowId(void) const
Definition: wgsread.cpp:7860
NCBI_WGS_seqtype GetLocSeqType(void) const
Definition: wgsread.cpp:7846
ESeqType GetSeqType(void) const
Definition: wgsread.hpp:1379
TVDBRowId GetRowId(void) const
Definition: wgsread.hpp:1384
bool HasGi(void) const
Definition: wgsread.cpp:7149
NCBI_gb_state GetGBState(void) const
Definition: wgsread.cpp:7361
bool HasTaxId(void) const
Definition: wgsread.cpp:7302
TSeqPos GetSeqLength(void) const
Definition: wgsread.cpp:7337
CSeq_id::TGi GetGi(void) const
Definition: wgsread.cpp:7155
THash GetSeqHash(void) const
Definition: wgsread.cpp:7331
CRef< CSeq_entry > GetSeq_entry(TFlags flags=fDefaultFlags) const
Definition: wgsread.cpp:7703
int GetAccVersion(void) const
Definition: wgsread.cpp:7181
bool HasSeqHash(void) const
Definition: wgsread.cpp:7323
TTaxId GetTaxId(void) const
Definition: wgsread.cpp:7312
virtual TWGSPrefixes GetPrefixes(TGi gi)=0
virtual void SetNonWGS(TGi gi, const TWGSPrefixes &prefixes)
vector< string > TWGSPrefixes
Definition: wgsresolver.hpp:51
virtual bool Update(void)
static CRef< CWGSResolver > CreateResolver(const CVDBMgr &mgr)
Definition: wgsresolver.cpp:75
virtual void SetWGSPrefix(TGi gi, const TWGSPrefixes &prefixes, const string &prefix)
Definition: wgsresolver.cpp:96
CRef< CSeq_entry > GetSeq_entry(TFlags flags=fDefaultFlags) const
Definition: wgsread.cpp:6863
TSeqPos GetSeqLength(void) const
Definition: wgsread.cpp:6674
CRef< CAsnBinData > GetSeq_entryData(TFlags flags=fDefaultFlags) const
Definition: wgsread.cpp:6275
CRef< CAsnBinData > GetChunkDataForVersion(TChunkId chunk_id, TSplitVersion split_version) const
Definition: wgsread.cpp:6362
CRef< CID2S_Chunk > GetChunkForVersion(TChunkId chunk_id, TSplitVersion split_version) const
Definition: wgsread.cpp:6339
TTaxId GetTaxId(void) const
Definition: wgsread.cpp:4405
bool HasGi(void) const
Definition: wgsread.cpp:4256
CSeq_id::TGi GetGi(void) const
Definition: wgsread.cpp:4262
THash GetSeqHash(void) const
Definition: wgsread.cpp:4422
pair< CRef< CAsnBinData >, TSplitVersion > GetSplitInfoDataAndVersion(TFlags flags=fDefaultFlags) const
Definition: wgsread.cpp:6314
bool HasTaxId(void) const
Definition: wgsread.cpp:4399
TSeqPos GetSeqLength(EClipType clip_type=eDefaultClip) const
Definition: wgsread.cpp:4478
bool HasAccVersion(int version) const
Definition: wgsread.cpp:4301
CRef< CSeq_entry > GetSeq_entry(TFlags flags=fDefaultFlags) const
Definition: wgsread.cpp:6264
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
bool empty() const
Definition: set.hpp:133
static uch flags
#define true
Definition: bool.h:35
static const char * str(char *buf, int n)
Definition: stats.c:84
char data[12]
Definition: iconv.c:80
Int8 int64_t
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define ZERO_GI
Definition: ncbimisc.hpp:1088
#define GI_TO(T, gi)
Definition: ncbimisc.hpp:1085
EBlobType
Definition: types.hpp:78
@ eBlobType_contig
Definition: wgs_client.cpp:443
@ eBlobType_scaffold
Definition: wgs_client.cpp:444
@ eBlobType_protein
Definition: wgs_client.cpp:445
virtual const char * what(void) const noexcept
Standard report (includes full backlog).
Definition: ncbiexpt.cpp:342
static EAccessionInfo IdentifyAccession(const CTempString &accession, TParseFlags flags=fParse_AnyRaw)
Deduces information from a bare accession a la WHICH_db_accession; may report false negatives on prop...
Definition: Seq_id.cpp:1634
void GetLabel(string *label, ELabelType type=eDefault, TLabelFlags flags=fLabel_Default) const
Append a label for this Seq-id to the supplied string.
Definition: Seq_id.cpp:2040
EAccessionInfo
For IdentifyAccession (below)
Definition: Seq_id.hpp:220
@ eAcc_wgs
Definition: Seq_id.hpp:290
@ fAcc_prot
Definition: Seq_id.hpp:252
@ eAcc_targeted
Definition: Seq_id.hpp:298
@ eAcc_embl_prot
Definition: Seq_id.hpp:383
@ eAcc_wgs_intermed
Definition: Seq_id.hpp:294
@ eAcc_gb_prot
Definition: Seq_id.hpp:345
@ eAcc_tsa
Definition: Seq_id.hpp:273
@ eAcc_other
Definition: Seq_id.hpp:264
@ eAcc_division_mask
Definition: Seq_id.hpp:299
@ eFasta_AsTypeAndContent
Definition: Seq_id.hpp:117
@ eFastaContent
Like eFasta, but without any tag.
Definition: Seq_id.hpp:608
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
void Swap(TThisType &ref)
Swaps the pointer with another reference.
Definition: ncbiobj.hpp:754
#define NCBI_PARAM_TYPE(section, name)
Generate typename for a parameter from its {section, name} attributes.
Definition: ncbi_param.hpp:149
@ eParam_NoThread
Do not use per-thread values.
Definition: ncbi_param.hpp:418
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
#define NPOS
Definition: ncbistr.hpp:133
static TNumeric StringToNumeric(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to a numeric value.
Definition: ncbistr.hpp:330
bool empty(void) const
Return true if the represented string is empty (i.e., the length is zero)
Definition: tempstr.hpp:334
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5406
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5347
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
size_type find(const CTempString match, size_type pos=0) const
Find the first instance of the entire matching string within the current string, beginning at an opti...
Definition: tempstr.hpp:655
static string & ToUpper(string &str)
Convert string to upper case – string& version.
Definition: ncbistr.cpp:424
@ fConvErr_NoErrMessage
Set errno, but do not set CNcbiError message on error.
Definition: ncbistr.hpp:291
@ fConvErr_NoThrow
Do not throw an exception on error.
Definition: ncbistr.hpp:285
@ eNocase
Case insensitive compare.
Definition: ncbistr.hpp:1206
bool Run(TRunMode flags=fRunDefault)
Run the thread.
Definition: ncbithr.cpp:724
CGuard< CRWLock, SSimpleWriteLock< CRWLock > > TWriteLockGuard
Definition: ncbimtx.hpp:934
void Join(void **exit_data=0)
Wait for the thread termination.
Definition: ncbithr.cpp:863
operation
Bit operations.
Definition: bmconst.h:191
bool IsStr(void) const
Check if variant Str is selected.
Definition: Object_id_.hpp:291
const TTag & GetTag(void) const
Get the Tag member data.
Definition: Dbtag_.hpp:267
const TDb & GetDb(void) const
Get the Db member data.
Definition: Dbtag_.hpp:220
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
TId GetId(void) const
Get the variant data.
Definition: Object_id_.hpp:270
TSat_key GetSat_key(void) const
Get the Sat_key member data.
TSub_sat GetSub_sat(void) const
Get the Sub_sat member data.
TSat GetSat(void) const
Get the Sat member data.
@ eID2_Blob_State_dead
@ eID2_Blob_State_suppressed
@ eID2_Blob_State_protected
@ eID2_Blob_State_live
@ eID2_Blob_State_withdrawn
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_id_.hpp:746
TGi & SetGi(void)
Select the variant.
Definition: Seq_id_.hpp:896
TVersion GetVersion(void) const
Get the Version member data.
E_Choice
Choice variants.
Definition: Seq_id_.hpp:93
bool IsSetVersion(void) const
Check if a value has been assigned to Version data member.
@ e_Gibbmt
Geninfo backbone moltype.
Definition: Seq_id_.hpp:97
@ e_Giim
Geninfo import id.
Definition: Seq_id_.hpp:98
@ e_Gibbsq
Geninfo backbone seqid.
Definition: Seq_id_.hpp:96
@ e_General
for other databases
Definition: Seq_id_.hpp:105
@ e_Gi
GenInfo Integrated Database.
Definition: Seq_id_.hpp:106
@ e_not_set
No variant selected.
Definition: Seq_id_.hpp:94
@ e_Local
local use
Definition: Seq_id_.hpp:95
@ e_Pdb
PDB sequence.
Definition: Seq_id_.hpp:109
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
static MDB_envinfo info
Definition: mdb_load.c:37
const string version
version string
Definition: variables.hpp:66
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
const char * tag
int isalpha(Uchar c)
Definition: ncbictype.hpp:61
int isdigit(Uchar c)
Definition: ncbictype.hpp:64
const CConstRef< CSeq_id > GetAccession(const CSeq_id_Handle &id_handle)
#define PSG_ERROR(message)
#define PSG_INFO(message)
psg_clock_t::time_point psg_time_point_t
#define row(bind, expected)
Definition: string_bind.c:73
TID2SplitVersion split_version
Definition: wgs_client.hpp:199
CRef< CID2_Blob_Id > tse_id
Definition: wgs_client.hpp:198
AutoPtr< SWGSSeqInfo > m_RootSeq
Definition: wgs_client.hpp:163
objects::CWGSProteinIterator m_ProteinIter
Definition: wgs_client.hpp:161
bool IsProtein(void) const
Definition: wgs_client.hpp:141
objects::CWGSDb m_WGSDb
Definition: wgs_client.hpp:158
bool IsMaster(void) const
Definition: wgs_client.hpp:139
objects::CWGSScaffoldIterator m_ScaffoldIter
Definition: wgs_client.hpp:160
objects::CWGSSeqIterator m_ContigIter
Definition: wgs_client.hpp:159
objects::TVDBRowId m_RowId
Definition: wgs_client.hpp:155
CRef< objects::CID2_Blob_Id > m_BlobId
Definition: wgs_client.hpp:162
bool IsContig(void) const
Definition: wgs_client.hpp:138
bool IsScaffold(void) const
Definition: wgs_client.hpp:140
string GetId(void) const
int GetPSGBioseqState() const
int m_Id2BlobState
Definition: wgs_client.hpp:103
bool IsForbidden() const
Definition: wgs_client.hpp:99
@ eResult_Excluded
Definition: wgs_client.hpp:91
@ eResult_Found
Definition: wgs_client.hpp:89
@ fMasterDescrMark
Definition: wgsread.hpp:157
@ fSplitFeatures
Definition: wgsread.hpp:170
unsigned m_IndexUpdateDelay
Definition: wgs_client.hpp:69
ECompressData m_CompressData
Definition: wgs_client.hpp:72
Definition: type.c:6
#define _ASSERT
EPSGOperationStatus
Definition: timing.hpp:60
@ eOpStatusFound
Definition: timing.hpp:61
@ eOpStatusNotFound
Definition: timing.hpp:62
EPSGOperation
Definition: timing.hpp:65
@ eWGS_VDBLookup
Definition: timing.hpp:90
@ eVDBOpen
Definition: timing.hpp:87
int64_t TVDBRowId
Definition: vdbread.hpp:80
static EAddMasterDescr s_AddMasterDescrLevel(void)
Definition: wgs_client.cpp:111
static bool s_AddMasterDescrScaffold()
Definition: wgs_client.cpp:123
static bool s_ParseInt(CTempString &str, Int &v)
Definition: wgs_client.cpp:208
static const size_t kNumLettersV1
Definition: wgs_client.cpp:271
USING_SCOPE(objects)
static bool s_Skip(CTempString &str, char c)
Definition: wgs_client.cpp:191
static const size_t kMaxRowDigitsV2
Definition: wgs_client.cpp:279
static const int kOSG_Sat_CDD_max
Definition: wgs_client.cpp:155
static const EResolveMaster kResolveMaster
Definition: wgs_client.cpp:147
static bool s_IsValidIntChar(char c)
Definition: wgs_client.cpp:201
END_NAMESPACE(wgs)
static int s_GBStateToID2(NCBI_gb_state gb_state)
static const size_t kMinRowDigitsV2
Definition: wgs_client.cpp:278
EBlobIdBits
Definition: wgs_client.cpp:447
@ eBlobIdBits_type
Definition: wgs_client.cpp:448
@ eBlobIdBits_letter
Definition: wgs_client.cpp:449
@ eBlobIdBits_digit
Definition: wgs_client.cpp:450
static bool s_MarkMasterDescr(void)
Definition: wgs_client.cpp:135
static bool IsWGSProtAccession(const CTextseq_id &id)
Definition: wgs_client.cpp:382
BEGIN_LOCAL_NAMESPACE
Definition: wgs_client.cpp:458
static bool s_IsEnabledOSGSat(CWGSClient::TEnabledFlags enabled_flags, Int4 sat)
Definition: wgs_client.cpp:157
static const size_t kNumLettersV2
Definition: wgs_client.cpp:272
NCBI_PARAM_DEF_EX(bool, WGS, FILTER_ALL, false, eParam_NoThread, WGS_FILTER_ALL)
NCBI_PARAM_DEF(bool, WGS, SPLIT_FEATURES, true)
static const int kOSG_Sat_WGS_max
Definition: wgs_client.cpp:151
static bool s_KeepReplaced(void)
Definition: wgs_client.cpp:84
static bool s_ParseOSGBlob(CTempString &s, Int4 &sat, Int4 &subsat, Int4 &satkey)
Definition: wgs_client.cpp:229
static const size_t kVersionDigits
Definition: wgs_client.cpp:273
EAlligSeqType
Definition: wgs_client.cpp:301
@ fAllow_protein
Definition: wgs_client.cpp:304
@ fAllow_contig
Definition: wgs_client.cpp:302
@ fAllow_scaffold
Definition: wgs_client.cpp:303
static const int kBlobIdV2VersionContig
Definition: wgs_client.cpp:441
static const int kBlobIdV2SatMax
Definition: wgs_client.cpp:438
static const int kBlobIdV2SatMin
Definition: wgs_client.cpp:437
static const size_t kTypePrefixLen
Definition: wgs_client.cpp:270
END_LOCAL_NAMESPACE
Definition: wgs_client.cpp:495
static bool s_AddMasterDescrContig()
Definition: wgs_client.cpp:117
static bool s_SplitFeatures(void)
Definition: wgs_client.cpp:78
static const size_t kMaxRowDigitsV1
Definition: wgs_client.cpp:277
static const size_t kPrefixLenV2
Definition: wgs_client.cpp:275
static bool IsWGSGeneral(const CDbtag &dbtag)
Definition: wgs_client.cpp:284
static const int kBlobIdV2VersionScaffold
Definition: wgs_client.cpp:439
EAddMasterDescr
Definition: wgs_client.cpp:97
@ eAddMasterDescr_none
Definition: wgs_client.cpp:98
@ eAddMasterDescr_all
Definition: wgs_client.cpp:100
@ eAddMasterDescr_detached
Definition: wgs_client.cpp:99
int TAllowSeqType
Definition: wgs_client.cpp:306
NCBI_PARAM_DECL(bool, WGS, FILTER_ALL)
static EAddMasterDescr s_ProcessAddMasterDescr(void)
Definition: wgs_client.cpp:103
static bool s_IsOSGSat(Int4 sat)
Definition: wgs_client.cpp:185
static const int kBlobIdV1Sat
Definition: wgs_client.cpp:436
END_NCBI_NAMESPACE
static bool s_KeepMigrated(void)
Definition: wgs_client.cpp:90
static const size_t kMaxProtAccLen
Definition: wgs_client.cpp:282
static const int kBlobIdV2VersionProtein
Definition: wgs_client.cpp:440
static const int kOSG_Sat_CDD_min
Definition: wgs_client.cpp:154
static const int kOSG_Sat_WGS_min
Definition: wgs_client.cpp:150
static bool s_IsOSGBlob(Int4 sat, Int4, Int4)
Definition: wgs_client.cpp:223
static const char kSubSatSeparator
Definition: wgs_client.cpp:149
static const size_t kMinRowDigitsV1
Definition: wgs_client.cpp:276
BEGIN_NCBI_NAMESPACE
Definition: wgs_client.cpp:46
static bool s_AddMasterDescrProtein()
Definition: wgs_client.cpp:129
static const int kOSG_Sat_SNP_min
Definition: wgs_client.cpp:152
static const size_t kMinProtAccLen
Definition: wgs_client.cpp:281
BEGIN_NAMESPACE(psg)
EResolveMaster
Definition: wgs_client.cpp:142
@ eResolveMaster_never
Definition: wgs_client.cpp:143
@ eResolveMaster_always
Definition: wgs_client.cpp:145
@ eResolveMaster_without_gi
Definition: wgs_client.cpp:144
static const int kOSG_Sat_SNP_max
Definition: wgs_client.cpp:153
static bool IsWGSAccession(const string &acc, const CTextseq_id &id, TAllowSeqType allow_seq_type)
Definition: wgs_client.cpp:308
static void s_FormatBlobId(ostream &s, const CID2_Blob_Id &blob_id)
Definition: wgs_client.cpp:256
static const size_t kPrefixLenV1
Definition: wgs_client.cpp:274
@ NCBI_gb_state_eWGSGenBankUnverified
Definition: wgsread.hpp:90
@ NCBI_gb_state_eWGSGenBankReplaced
Definition: wgsread.hpp:88
@ NCBI_gb_state_eWGSGenBankSuppressed
Definition: wgsread.hpp:87
@ NCBI_gb_state_eWGSGenBankMigrated
Definition: wgsread.hpp:92
@ NCBI_gb_state_eWGSGenBankLive
Definition: wgsread.hpp:86
@ NCBI_gb_state_eWGSGenBankWithdrawn
Definition: wgsread.hpp:89
@ NCBI_WGS_seqtype_scaffold
Definition: wgsread.hpp:68
@ NCBI_WGS_seqtype_contig
Definition: wgsread.hpp:67
uint32_t NCBI_gb_state
Definition: wgsread.hpp:53
Modified on Fri Sep 20 14:58:09 2024 by modify_doxy.py rev. 669887