NCBI C++ ToolKit
osg_resolve_base.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: osg_resolve_base.cpp 98033 2022-09-21 12:30:07Z grichenk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Eugene Vasilchenko
27  *
28  * File Description: processor for data from OSG
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 
34 #include "osg_resolve.hpp"
35 
40 #include <objects/id2/id2__.hpp>
43 #include "osg_connection.hpp"
44 #include "osg_getblob_base.hpp"
45 #include "insdc_utils.hpp"
47 
51 
52 
// Member-initializer list and empty body of a constructor whose signature
// line is absent from this listing (presumably the CPSGS_OSGResolveBase
// constructor -- confirm against the real file).
// Initial state: no bioseq-info flags set, and neither the "withdrawn" nor
// the "confidential" marker raised.
54  : m_BioseqInfoFlags(0),
55  m_Withdrawn(false),
56  m_Confidential(false)
57 {
58 }
59 
60 
// Empty body of a definition whose signature line is absent from this
// listing (presumably the CPSGS_OSGResolveBase destructor -- confirm).
62 {
63 }
64 
65 
66 void CPSGS_OSGResolveBase::SetSeqId(CSeq_id& id, int seq_id_type, const string& seq_id)
67 {
68  if (seq_id_type <= 0) {
69  // no type check
70  id.Set(seq_id);
71  }
72  else {
73  id.Set(CSeq_id::eFasta_AsTypeAndContent, CSeq_id::E_Choice(seq_id_type), seq_id);
74  }
75 }
76 
77 
// Db names of "general" Seq-ids that are compared against dbtag.GetDb()
// when a Get_seq_id reply is processed in this file. A general id whose db
// matches one of these is consumed as a bioseq attribute (tax id, hash,
// length, mol) instead of being stored as a sequence id; the LABEL one is
// recognized but currently ignored.
78 static const char kSpecialId_label[] = "LABEL";
79 static const char kSpecialId_taxid[] = "TAXID";
80 static const char kSpecialId_hash[] = "HASH";
81 static const char kSpecialId_length[] = "Seq-inst.length";
82 static const char kSpecialId_type[] = "Seq-inst.mol";
83 
84 
// Body of the reply-processing method; its signature line is absent from
// this listing (presumably
// CPSGS_OSGResolveBase::ProcessResolveReply(const CID2_Reply& reply) --
// confirm). Several statements are also missing from the listing (for
// example the declarations of `seq_ids` and `id2_state`, and the updates
// of m_BioseqInfoFlags), so the text below is not compilable as shown.
//
// Visible behavior:
//  - Get_seq_id reply: walks the returned Seq-ids. Special "general" ids
//    fill tax id / hash / length / mol of m_BioseqInfo, a GI is remembered
//    in `gi`, a text id with both accession and version becomes the
//    canonical id, and every remaining id is added to `seq_ids` as a
//    (type, FASTA content) tuple.
//  - Get_blob_id reply: for an enabled OSG blob, stores the change date and
//    translates the ID2 blob-state bits into a PSG state value plus the
//    m_Withdrawn / m_Confidential markers.
//  - Anything else is reported through PSG_ERROR.
86 {
88  if ( reply.GetReply().IsGet_seq_id() ) {
89  auto& reply_ids = reply.GetReply().GetGet_seq_id();
90  auto& req_id = reply_ids.GetRequest();
91  TGi gi = ZERO_GI;
92  for ( auto& id : reply_ids.GetSeq_id() ) {
93  if ( id->IsGeneral() ) {
94  const CDbtag& dbtag = id->GetGeneral();
95  const CObject_id& obj_id = dbtag.GetTag();
// special general ids carry attributes; each one is consumed and skipped
96  if ( dbtag.GetDb() == kSpecialId_label ) {
97  //m_BioseqInfo.SetLabel(obj_id.GetStr());
98  //m_BioseqInfoFlags |= ;
99  continue;
100  }
101  if ( dbtag.GetDb() == kSpecialId_taxid ) {
102  m_BioseqInfo.SetTaxId(obj_id.GetId());
104  continue;
105  }
106  if ( dbtag.GetDb() == kSpecialId_hash ) {
107  m_BioseqInfo.SetHash(obj_id.GetId());
109  continue;
110  }
111  if ( dbtag.GetDb() == kSpecialId_length ) {
112  m_BioseqInfo.SetLength(obj_id.GetId());
114  continue;
115  }
116  if ( dbtag.GetDb() == kSpecialId_type ) {
117  m_BioseqInfo.SetMol(obj_id.GetId());
119  continue;
120  }
// any other general id falls through and is stored in seq_ids below
121  }
122  else if ( id->IsGi() ) {
// GI is only remembered here; it is placed after the loop
123  gi = id->GetGi();
126  continue;
127  }
128  else if ( auto text_id = id->GetTextseq_Id() ) {
129  // only versioned accession goes to canonical id
// NOTE(review): the first line of the following condition is absent from
// this listing; only its continuation is visible.
131  text_id->IsSetAccession() && text_id->IsSetVersion() ) {
132  m_BioseqInfo.SetSeqIdType(id->Which());
133  m_BioseqInfo.SetAccession(text_id->GetAccession());
134  m_BioseqInfo.SetVersion(text_id->GetVersion());
135  if ( text_id->IsSetName() ) {
136  m_BioseqInfo.SetName(text_id->GetName());
137  }
140  continue;
141  }
142  }
// every id not consumed above is recorded as (choice, FASTA content)
143  string content;
144  id->GetLabel(&content, CSeq_id::eFastaContent);
145  seq_ids.insert(make_tuple(id->Which(), move(content)));
146  }
147  if ( gi != ZERO_GI ) {
148  // gi goes either to canonical id or to other ids
149  CSeq_id gi_id(CSeq_id::e_Gi, gi);
150  string content;
151  gi_id.GetLabel(&content, CSeq_id::eFastaContent);
// NOTE(review): the `if` that opens the "canonical id from gi" branch is
// absent from this listing; only its body and the `else` are visible.
153  // set canonical id from gi
154  m_BioseqInfo.SetAccession(content);
159  }
160  else {
161  // to other ids
162  seq_ids.insert(make_tuple(gi_id.Which(), move(content)));
163  }
164  }
// NOTE(review): part of the following condition is absent from this
// listing (the parentheses on the visible lines do not balance).
165  if ( req_id.GetSeq_id_type() == req_id.eSeq_id_type_all &&
167  !seq_ids.empty()) ) {
168  m_BioseqInfo.SetSeqIds(move(seq_ids));
169  // all ids are requested, so we should get GI and acc.ver too if they exist
170  m_BioseqInfo.SetGI(GI_TO(CBioseqInfoRecord::TGI,gi)); // even if it's zero
174  }
175  else if ( req_id.GetSeq_id_type() == req_id.eSeq_id_type_any ) {
176  // TODO?
177  }
178  }
179  else if ( reply.GetReply().IsGet_blob_id()) {
180  auto& reply_ids = reply.GetReply().GetGet_blob_id();
// only blobs accepted by IsEnabledOSGBlob() for the current flags are used
181  if ( reply_ids.IsSetBlob_id() &&
182  CPSGS_OSGGetBlobBase::IsEnabledOSGBlob(GetEnabledFlags(), reply_ids.GetBlob_id()) ) {
183  const CID2_Blob_Id& blob_id = reply_ids.GetBlob_id();
186  if ( blob_id.IsSetVersion() ) {
187  // ID2 version is minutes since UNIX epoch
188  // PSG date_changed is ms since UNIX epoch
// NOTE(review): if GetVersion() returns a 32-bit int, this product is
// computed in int and would overflow for present-day values (about 2.8e7
// minutes * 60000 exceeds INT_MAX). Confirm the type of TVersion and widen
// one operand to a 64-bit type before multiplying if so.
189  m_BioseqInfo.SetDateChanged(blob_id.GetVersion()*60000);
191  }
// NOTE(review): the declaration of id2_state is absent from this listing;
// the code below treats the value 0 the same as "live".
193  if ( reply_ids.IsSetBlob_state() ) {
194  id2_state = reply_ids.GetBlob_state();
195  }
// numeric PSG bioseq state values passed to SetState()
196  enum EPSGBioseqState {
197  eDead = 0,
198  eSought = 1,
199  eReserved = 5,
200  eMerged = 7,
201  eLive = 10
202  };
203  int psg_state = eDead;
// The chain is evaluated in priority order: live, suppressed, dead,
// withdrawn, protected. Only the first matching bit has an effect.
204  if ( id2_state == 0 ||
205  (id2_state & (1<<eID2_Blob_State_live)) ) {
206  psg_state = eLive;
207  }
208  else if ( id2_state & (1<<eID2_Blob_State_suppressed) ) {
209  psg_state = eReserved;
210  }
211  else if ( id2_state & (1<<eID2_Blob_State_dead) ) {
212  psg_state = eDead;
213  }
// withdrawn / protected only raise a marker; psg_state keeps its initial
// value eDead in these two branches
214  else if ( id2_state & (1<<eID2_Blob_State_withdrawn) ) {
215  m_Withdrawn = true;
216  }
217  else if ( id2_state & (1<<eID2_Blob_State_protected) ) {
218  m_Confidential = true;
219  }
220  m_BioseqInfo.SetState(psg_state);
222  }
223  }
224  else {
// neither a Get_seq_id nor a Get_blob_id reply: log it in ASN.1 text form
225  PSG_ERROR(GetName()<<": "
226  "Unknown reply "<<MSerial_AsnText<<reply);
227  }
228 }
229 
230 
// Body fragment of a function whose signature and first statements are
// absent from this listing (presumably
// CPSGS_OSGResolveBase::GetOutputFormat(EOutputFormat format) -- confirm).
// The visible part closes a conditional block and returns `format`.
232 {
236  }
237  return format;
238 }
239 
240 
241 void CPSGS_OSGResolveBase::SendResult(const string& data_to_send,
242  EOutputFormat output_format)
243 {
244  size_t item_id = GetReply()->GetItemId();
245  GetReply()->PrepareBioseqData(item_id, GetName(), data_to_send, output_format);
246  GetReply()->PrepareBioseqCompletion(item_id, GetName(), 2);
247 }
248 
249 
// Body of a method whose signature line is absent from this listing
// (presumably CPSGS_OSGResolveBase::SendBioseqInfo(EOutputFormat
// output_format) -- confirm).
// Normalizes the requested output format, serializes m_BioseqInfo
// accordingly and hands the result to SendResult().
251 {
252  output_format = GetOutputFormat(output_format);
253 
254  /*
255  if (bioseq_resolution.m_ResolutionResult == ePSGS_BioseqDB ||
256  bioseq_resolution.m_ResolutionResult == ePSGS_BioseqCache)
257  AdjustBioseqAccession(bioseq_resolution);
258  */
259 
260  string data_to_send;
261  if ( output_format == SPSGS_ResolveRequest::ePSGS_JsonFormat ) {
// NOTE(review): the statement that fills data_to_send for the JSON case is
// absent from this listing (presumably a ToJsonString() call -- confirm).
// Only the optional debug trace of the serialized reply is visible.
263  if ( GetDebugLevel() >= eDebug_exchange ) {
264  LOG_POST(GetDiagSeverity() << "OSG: "
265  "Sending reply "<<data_to_send);
266  }
267  } else {
// every non-JSON format is sent as protobuf
268  data_to_send = ToBioseqProtobuf(m_BioseqInfo);
269  }
270 
271  SendResult(data_to_send, output_format);
272 }
273 
274 
275 /////////////////////////////////////////////////////////////////////////////
276 // Common Seq-id identification code
277 /////////////////////////////////////////////////////////////////////////////
278 
279 // WGS accession parameters
// Two spellings are described: V1 has 4 letters and V2 has 6 letters, each
// followed by kVersionDigits version digits and then the row digits.
280 static const size_t kTypePrefixLen = 4; // "WGS:" or "TSA:"
281 static const size_t kNumLettersV1 = 4;
282 static const size_t kNumLettersV2 = 6;
283 static const size_t kVersionDigits = 2;
// prefix = letters + version digits
284 static const size_t kPrefixLenV1 = kNumLettersV1 + kVersionDigits;
285 static const size_t kPrefixLenV2 = kNumLettersV2 + kVersionDigits;
// accepted number of row digits after the prefix (and optional type letter)
286 static const size_t kMinRowDigitsV1 = 6;
287 static const size_t kMaxRowDigitsV1 = 8;
288 static const size_t kMinRowDigitsV2 = 7;
289 static const size_t kMaxRowDigitsV2 = 9;
290 
// accepted total length of a protein accession checked by IsWGSProtAccession()
291 static const size_t kMinProtAccLen = 8; // 3+5
292 static const size_t kMaxProtAccLen = 10; // 3+7
293 
294 
295 static bool IsWGSGeneral(const CDbtag& dbtag)
296 {
297  const string& db = dbtag.GetDb();
298  if ( db.size() != kTypePrefixLen+kNumLettersV1 /* WGS:AAAA */ &&
299  db.size() != kTypePrefixLen+kPrefixLenV1 /* WGS:AAAA01 */ &&
300  db.size() != kTypePrefixLen+kNumLettersV2 /* WGS:AAAAAA */ &&
301  db.size() != kTypePrefixLen+kPrefixLenV2 /* WGS:AAAAAA01 */ ) {
302  return false;
303  }
304  if ( !NStr::StartsWith(db, "WGS:", NStr::eNocase) &&
305  !NStr::StartsWith(db, "TSA:", NStr::eNocase) ) {
306  return false;
307  }
308  return true;
309 }
310 
311 
// Tail of an enum whose opening lines are absent from this listing.
// IsWGSAccession() in this file tests fAllow_contig, fAllow_scaffold and
// fAllow_protein as bit flags combined in a TAllowSeqType mask.
315  fAllow_protein = 4
316 };
// integer type holding a combination of the fAllow_* flags
317 typedef int TAllowSeqType;
318 
319 static bool IsWGSAccession(const string& acc,
320  const CTextseq_id& id,
321  TAllowSeqType allow_seq_type)
322 {
323  if ( acc.size() < kPrefixLenV1 + kMinRowDigitsV1 ||
324  acc.size() > kPrefixLenV2 + kMaxRowDigitsV2 + 1 ) { // one for type letter
325  return false;
326  }
327  size_t num_letters;
328  for ( num_letters = 0; num_letters < kNumLettersV2; ++num_letters ) {
329  if ( !isalpha(acc[num_letters]&0xff) ) {
330  break;
331  }
332  }
333  if ( num_letters != kNumLettersV1 && num_letters != kNumLettersV2 ) {
334  return false;
335  }
336  size_t prefix_len = num_letters + kVersionDigits;
337  for ( size_t i = num_letters; i < prefix_len; ++i ) {
338  if ( !isdigit(acc[i]&0xff) ) {
339  return false;
340  }
341  }
342  SIZE_TYPE row_pos = prefix_len;
343  switch ( acc[row_pos] ) { // optional type letter
344  case 's':
345  case 'S':
346  // scaffold
347  if ( !(allow_seq_type & fAllow_scaffold) ) {
348  return false;
349  }
350  ++row_pos;
351  break;
352  case 'p':
353  case 'P':
354  // protein
355  if ( !(allow_seq_type & fAllow_protein) ) {
356  return false;
357  }
358  ++row_pos;
359  break;
360  default:
361  // contig
362  if ( !(allow_seq_type & fAllow_contig) ) {
363  return false;
364  }
365  break;
366  }
367  size_t row_digits = acc.size() - row_pos;
368  if ( num_letters == kNumLettersV1 ) {
369  if ( row_digits < kMinRowDigitsV1 || row_digits > kMaxRowDigitsV1 ) {
370  return false;
371  }
372  }
373  else {
374  if ( row_digits < kMinRowDigitsV2 || row_digits > kMaxRowDigitsV2 ) {
375  return false;
376  }
377  }
378  Uint8 row = 0;
379  for ( size_t i = row_pos; i < acc.size(); ++i ) {
380  char c = acc[i];
381  if ( c < '0' || c > '9' ) {
382  return false;
383  }
384  row = row*10+(c-'0');
385  }
386  if ( !row ) {
387  return false;
388  }
389  return true;
390 }
391 
392 
393 static bool IsWGSProtAccession(const CTextseq_id& id)
394 {
395  const string& acc = id.GetAccession();
396  if ( acc.size() < kMinProtAccLen || acc.size() > kMaxProtAccLen ) {
397  return false;
398  }
399  return true;
400 }
401 
402 
// Decide whether a text Seq-id could refer to a WGS sequence.
// Some lines of this function are absent from the listing: the declaration
// of `type`, possibly further `case` labels, and the body of one `else`
// branch. The text below is therefore not compilable as shown.
403 static bool IsWGSAccession(const CTextseq_id& id)
404 {
405  if ( id.IsSetName() ) {
406  // first try name reference if it has WGS format like AAAA01P000001
407  // as it directly contains WGS accession
// NOTE(review): this returns the verdict for the name directly, so the
// accession is never examined when a name is set -- confirm this is intended.
408  return IsWGSAccession(id.GetName(), id, fAllow_protein);
409  }
410  if ( !id.IsSetAccession() ) {
411  return false;
412  }
413  const string& acc = id.GetAccession();
// NOTE(review): `type` is declared on a line absent from this listing
// (presumably the result of CSeq_id::IdentifyAccession(acc) -- confirm).
415  switch ( type & CSeq_id::eAcc_division_mask ) {
416  // accepted accession types
417  case CSeq_id::eAcc_wgs:
419  case CSeq_id::eAcc_tsa:
// protein accessions get only the length check
421  if ( type & CSeq_id::fAcc_prot ) {
422  return IsWGSProtAccession(id);
423  }
424  else {
// NOTE(review): the statement inside this else is absent from the listing.
426  }
427  case CSeq_id::eAcc_other:
428  // Some EMBL WGS accessions aren't identified as WGS, so we'll try lookup anyway
429  if ( type == CSeq_id::eAcc_embl_prot ||
430  (type == CSeq_id::eAcc_gb_prot && acc.size() == 10) ) { // TODO: remove
431  return IsWGSProtAccession(id);
432  }
433  return false;
434  default:
435  // non-WGS accessions
436  return false;
437  }
438 }
439 
440 
441 bool CPSGS_OSGResolveBase::CanBeWGS(int seq_id_type, const string& seq_id)
442 {
443  try {
444  CSeq_id id;
445  SetSeqId(id, seq_id_type, seq_id);
446  if ( id.IsGi() ) {
447  return true;
448  }
449  else if ( id.IsGeneral() ) {
450  return IsWGSGeneral(id.GetGeneral());
451  }
452  else if ( auto text_id = id.GetTextseq_Id() ) {
453  return IsWGSAccession(*text_id);
454  }
455  return false;
456  }
457  catch ( exception& /*ignored*/ ) {
458  return false;
459  }
460 }
461 
462 
CBioseqInfoRecord & SetDateChanged(TDateChanged value)
Definition: record.hpp:112
CBioseqInfoRecord & SetName(TName value)
Definition: record.hpp:184
CBioseqInfoRecord & SetLength(TLength value)
Definition: record.hpp:130
CBioseqInfoRecord & SetTaxId(TTaxId value)
Definition: record.hpp:178
CBioseqInfoRecord & SetSeqIdType(TSeqIdType value)
Definition: record.hpp:106
CBioseqInfoRecord & SetSeqIds(TSeqIds const &value)
Definition: record.hpp:154
CBioseqInfoRecord & SetState(TState value)
Definition: record.hpp:172
CBioseqInfoRecord & SetGI(TGI value)
Definition: record.hpp:124
CBioseqInfoRecord & SetMol(TMol value)
Definition: record.hpp:136
CBioseqInfoRecord & SetVersion(TVersion value)
Definition: record.hpp:100
CBioseqInfoRecord & SetHash(THash value)
Definition: record.hpp:118
CBioseqInfoRecord & SetAccession(const TAccession &value)
Definition: record.hpp:94
Definition: Dbtag.hpp:53
CID2_Blob_Id –.
Definition: ID2_Blob_Id.hpp:66
CID2_Reply –.
Definition: ID2_Reply.hpp:66
static string GetPSGBlobId(const CID2_Blob_Id &blob_id)
static bool IsEnabledOSGBlob(TEnabledFlags enabled_flags, const CID2_Blob_Id &blob_id)
TEnabledFlags GetEnabledFlags() const
void SendBioseqInfo(EOutputFormat output_format)
void SendResult(const string &data_to_send, EOutputFormat format)
TBioseqInfoFlags m_BioseqInfoFlags
static bool CanBeWGS(int seq_id_type, const string &seq_id)
EOutputFormat GetOutputFormat(EOutputFormat format)
static void SetSeqId(CSeq_id &id, int seq_id_type, const string &seq_id)
CBioseqInfoRecord m_BioseqInfo
void ProcessResolveReply(const CID2_Reply &reply)
virtual string GetName(void) const =0
Tells the processor name (used in logging and tracing)
shared_ptr< CPSGS_Reply > GetReply(void) const
Provides the reply wrapper.
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
bool empty() const
Definition: set.hpp:133
#define false
Definition: bool.h:36
#define ZERO_GI
Definition: ncbimisc.hpp:1088
#define GI_TO(T, gi)
Definition: ncbimisc.hpp:1085
#define LOG_POST(message)
This macro is deprecated and it's strongly recomended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
#define MSerial_AsnText
I/O stream manipulators –.
Definition: serialbase.hpp:696
static EAccessionInfo IdentifyAccession(const CTempString &accession, TParseFlags flags=fParse_AnyRaw)
Deduces information from a bare accession a la WHICH_db_accession; may report false negatives on prop...
Definition: Seq_id.cpp:1634
void GetLabel(string *label, ELabelType type=eDefault, TLabelFlags flags=fLabel_Default) const
Append a label for this Seq-id to the supplied string.
Definition: Seq_id.cpp:2040
EAccessionInfo
For IdentifyAccession (below)
Definition: Seq_id.hpp:220
@ eAcc_wgs
Definition: Seq_id.hpp:290
@ fAcc_prot
Definition: Seq_id.hpp:252
@ eAcc_targeted
Definition: Seq_id.hpp:298
@ eAcc_embl_prot
Definition: Seq_id.hpp:383
@ eAcc_wgs_intermed
Definition: Seq_id.hpp:294
@ eAcc_gb_prot
Definition: Seq_id.hpp:345
@ eAcc_tsa
Definition: Seq_id.hpp:273
@ eAcc_other
Definition: Seq_id.hpp:264
@ eAcc_division_mask
Definition: Seq_id.hpp:299
@ eFasta_AsTypeAndContent
Definition: Seq_id.hpp:117
@ eFastaContent
Like eFasta, but without any tag.
Definition: Seq_id.hpp:608
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
@ eNocase
Case insensitive compare.
Definition: ncbistr.hpp:1206
const TTag & GetTag(void) const
Get the Tag member data.
Definition: Dbtag_.hpp:267
const TDb & GetDb(void) const
Get the Db member data.
Definition: Dbtag_.hpp:220
TId GetId(void) const
Get the variant data.
Definition: Object_id_.hpp:270
TVersion GetVersion(void) const
Get the Version member data.
const TGet_blob_id & GetGet_blob_id(void) const
Get the variant data.
Definition: ID2_Reply_.cpp:186
bool IsGet_seq_id(void) const
Check if variant Get_seq_id is selected.
Definition: ID2_Reply_.hpp:769
bool IsSetVersion(void) const
version of blob, optional in some requests Check if a value has been assigned to Version data member.
const TRequest & GetRequest(void) const
Get the Request member data.
const TReply & GetReply(void) const
Get the Reply member data.
Definition: ID2_Reply_.hpp:940
const TGet_seq_id & GetGet_seq_id(void) const
Get the variant data.
Definition: ID2_Reply_.cpp:164
bool IsGet_blob_id(void) const
Check if variant Get_blob_id is selected.
Definition: ID2_Reply_.hpp:775
@ eID2_Blob_State_dead
@ eID2_Blob_State_suppressed
@ eID2_Blob_State_protected
@ eID2_Blob_State_live
@ eID2_Blob_State_withdrawn
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_id_.hpp:746
E_Choice
Choice variants.
Definition: Seq_id_.hpp:93
@ e_Gi
GenInfo Integrated Database.
Definition: Seq_id_.hpp:106
int i
int isalpha(Uchar c)
Definition: ncbictype.hpp:61
int isdigit(Uchar c)
Definition: ncbictype.hpp:64
static Format format
Definition: njn_ioutil.cpp:53
Severity GetDiagSeverity()
int GetDebugLevel()
@ eDebug_exchange
static const size_t kNumLettersV1
static const size_t kMaxRowDigitsV2
static const char kSpecialId_type[]
static const size_t kMinRowDigitsV2
static const char kSpecialId_taxid[]
static bool IsWGSProtAccession(const CTextseq_id &id)
static const size_t kNumLettersV2
static const size_t kVersionDigits
EAlligSeqType
@ fAllow_protein
@ fAllow_contig
@ fAllow_scaffold
static const char kSpecialId_length[]
static const char kSpecialId_label[]
static const size_t kTypePrefixLen
static const size_t kMaxRowDigitsV1
static const size_t kPrefixLenV2
static bool IsWGSGeneral(const CDbtag &dbtag)
int TAllowSeqType
END_NCBI_NAMESPACE
static const size_t kMaxProtAccLen
static const size_t kMinRowDigitsV1
BEGIN_NCBI_NAMESPACE
static const size_t kMinProtAccLen
BEGIN_NAMESPACE(psg)
static const char kSpecialId_hash[]
static bool IsWGSAccession(const string &acc, const CTextseq_id &id, TAllowSeqType allow_seq_type)
END_NAMESPACE(osg)
static const size_t kPrefixLenV1
string ToJsonString(const CBioseqInfoRecord &bioseq_info, SPSGS_ResolveRequest::TPSGS_BioseqIncludeData include_data_flags, const string &custom_blob_id)
string ToBioseqProtobuf(const CBioseqInfoRecord &bioseq_info)
#define PSG_ERROR(message)
#define row(bind, expected)
Definition: string_bind.c:73
Definition: type.c:6
Modified on Fri Apr 12 17:14:38 2024 by modify_doxy.py rev. 669887