NCBI C++ ToolKit
gencoll_svc.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: gencoll_svc.cpp 47447 2023-04-07 14:45:38Z asztalos $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Liangshou Wu
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbiapp.hpp>
34 
40 #include <serial/objistrasnb.hpp>
41 
42 #include <objmgr/util/sequence.hpp>
43 
47 
50 
51 DEFINE_CLASS_STATIC_FAST_MUTEX(CGencollSvc::sm_SeqAccToAssmAccsCacheMutex);
53 
55 {
57  return Ref(&s_Instance.Get());
58 }
59 
61 {
62 }
63 
/// Compose a cache key from an assembly accession and a mode string.
///
/// The key has the form "<assm_acc>#<mode>".
///
/// @param assm_acc  assembly accession
/// @param mode      mode string
/// @return the accession and the mode joined by a single '#'
static string s_MakeCacheKey(const string& assm_acc, const string& mode)
{
    string key;
    // accession + separator + mode
    key.reserve(assm_acc.size() + 1 + mode.size());
    key.append(assm_acc);
    key.append("#");
    key.append(mode);
    return key;
}
68 
70  bool use_caching,
71  const string& mode,
72  IGencollSvcErrors *errors,
73  bool isUseSeqconfigService)
74 {
75  CLogPerformance perflog("CGencollSvc::GetGCAssembly()");
76  string sCacheKey{s_MakeCacheKey(assm_acc, mode)};
77  if (use_caching) {
79  auto it = m_GC_Assembly_cache.find(sCacheKey);
80  if (it != m_GC_Assembly_cache.end()) {
81  perflog.AddParameter("Assembly source", "cache");
82  perflog.Post();
83  return it->second;
84  }
85  }
86  CRef<CGC_Assembly> gc_assembly;
87  if(isUseSeqconfigService) {
88  perflog.AddParameter("Assembly source", "seqconfig");
89  perflog.Post();
90  string host = "www.ncbi.nlm.nih.gov";
91  string path = "/projects/sviewer/seqconfig.cgi";
92  string params = "&assmaccs=" + NStr::URLEncode(assm_acc) +
93  "&mode=" + mode +
94  "&req=getassminfos" +
95 // "&delaytest=30" +
96  "&ofmt=asnb";
97  string url = "https://" + host + path + "?" + params;
98 
99  try {
100  CGuiHttpSessionRequest httpRequest(url);
101  unique_ptr<CObjectIStream> obj_strm(new CObjectIStreamAsnBinary(httpRequest.GetResponseStream()));
103  *obj_strm >> *res;
104 
105  if(res && res->CanGetAssemblies() && !res->GetAssemblies().empty()) {
106  gc_assembly = res->GetAssemblies().front();
107  perflog.AddParameter("Assembly source", "seqconfig");
108  }
109  } NCBI_CATCH("Could not retrieve assembly information");
110  } else {
112  bool isError(true);
113  try {
114  gc_assembly = gencoll_service->GetAssembly(assm_acc, mode);
115  perflog.AddParameter("Assembly source", "GenColl");
116  isError = false;
117  }
118  catch(const CException& e) {
119  ERR_POST(Error << "GenColl request failed: " << e.GetMsg());
120  if(nullptr != errors)
121  errors->ReportError(assm_acc, e.GetMsg());
122  }
123  catch(const exception& e) {
124  ERR_POST(Error << "GenColl request failed: " << e.what());
125  if(nullptr != errors)
126  errors->ReportError(assm_acc, e.what());
127  }
128  if(isError) {
129  perflog.AddParameter("Assembly source", "GenColl error");
130  perflog.Post();
131  return CRef<CGC_Assembly>();
132  }
133  }
134  // should we cache if gc_assembly is null?
135  if(use_caching) {
136  CFastMutexGuard lock(m_Mutex);
137  auto it = m_GC_Assembly_cache.find(sCacheKey);
138  if(it != m_GC_Assembly_cache.end()) {
139  perflog.Post();
140  return it->second;
141  }
142  m_GC_Assembly_cache[sCacheKey] = gc_assembly;
143  if(gc_assembly && gc_assembly->GetAccession() != assm_acc) {
144  m_GC_Assembly_cache[s_MakeCacheKey(gc_assembly->GetAccession(), mode)] = gc_assembly;
145  }
146  }
147  perflog.Post();
148  return gc_assembly;
149 }
150 
151 // get an instance of CGenomicCollectionsService with a given timeout in seconds
152 // this will never return a null (unless new fails)
153 // if the timeout is -1, will try to read it from registry (section [gencoll], key timeout, default value 5 seconds)
154 // default number of retries is 1, unless otherwise specified (section [gencoll], key retries)
156 {
158 
159  const auto& reg = CNcbiApplication::Instance()->GetConfig();
160  string sGuessTimeout(reg.GetString("gencoll", "timeout", "5"));
161  unsigned timeout_sec_reg = NStr::StringToUInt(sGuessTimeout, NStr::fConvErr_NoThrow);
162  STimeout timeout;
163  timeout.sec = timeout_sec >= 0 ? (unsigned)timeout_sec : (timeout_sec_reg ? timeout_sec_reg : 5);
164  timeout.usec = 0;
165  gencoll_service->SetTimeout(&timeout);
166 
167  int retries = reg.GetInt("gencoll", "retries", 1);
168  gencoll_service->SetTryLimit((unsigned)retries);
169 
170  return gencoll_service;
171 }
172 
173 // checks whether the bioseq handle is potentially in GenColl
174 // should be used to avoid unnecessary calls to GenColl (SV-2822, SV-3997)
175 bool CGencollSvc::isGenCollSequence(const objects::CBioseq_Handle& handle)
176 {
177  try {
178  // first: check that this is an NC_ (SV-4678)
179  string text_id;
180  objects::CSeq_id_Handle best_idh(sequence::GetId(handle, sequence::eGetId_Best));
181  if(!best_idh) {
182  text_id = handle.GetInitialSeqIdOrNull().IsNull() ? "" : handle.GetInitialSeqIdOrNull()->GetSeqIdString();
183  } else {
184  best_idh.GetSeqId()->GetLabel(&text_id, CSeq_id::eContent);
185  }
186  if(NStr::StartsWith(text_id, "NC_", NStr::eNocase)) {
187  return true;
188  }
189  // second: filter out remaining RNAs and all proteins
190  CBioseq_Handle::TMol tmol(handle.GetSequenceType());
191  if(tmol == CBioseq_Handle::TMol::eMol_rna || tmol == CBioseq_Handle::TMol::eMol_aa) {
192  return false;
193  }
194  // third: filter out local ids
195  CConstRef<CSeq_id> seqid(handle.GetSeqId());
198  return false;
199  }
200  // all others are good
201  return true;
202  } catch(...) {
203  // if an accession is so bad that it can't be recognized, then it's definitely not useful for getting an assembly
204  return false;
205  }
206 }
207 
208 // get assemblies for a given sequence
209 CGencollSvc::EGencollSvcStatus CGencollSvc::GetAssmAccs(const objects::CBioseq_Handle& handle,
211  bool isOne,
212  TAssmAccs& AssmAccs)
213 {
214  CLogPerformance perflog("CGencollSvc::GetAssmAccs()");
215  AssmAccs.clear();
216 
217  // do not attempt guessing for sequences that are not ever supposed to be in assemblies
218  if(!isGenCollSequence(handle)) {
219  perflog.AddParameter("AssmAccs source", "no guess attempt");
220  perflog.Post();
221  return eGSS_no_attempt;
222  }
223  string seqAcc;
224  handle.GetSeqId()->GetLabel(&seqAcc, CSeq_id::eContent);
225  string sCacheKey{seqAcc + "#" + NStr::NumericToString((int)filter) + "#" + NStr::NumericToString((int)isOne)};
226 
227  // check for presence of a given cache key in cache
228  {
229  CFastMutexGuard lock(sm_SeqAccToAssmAccsCacheMutex);
230  auto iCacheHit(sm_SeqAccToAssmAccsCache.find(sCacheKey));
231  if(iCacheHit != sm_SeqAccToAssmAccsCache.end()) {
232  AssmAccs = iCacheHit->second;
233  perflog.AddParameter("AssmAccs source", "cache");
234  perflog.Post();
235  return eGSS_from_cache;
236  }
237  }
238  // if not cached, read from GenColl and record in cache
239  {
241  try {
242  if(isOne) {
243  CRef<CGCClient_AssemblyInfo> pAssemblyInfo(gencoll_service->FindOneAssemblyBySequences(seqAcc,
244  filter,
246  if(pAssemblyInfo.NotNull() && pAssemblyInfo->CanGetAccession()) {
247  AssmAccs.push_back(pAssemblyInfo->GetAccession());
248  }
249  } else {
250  CRef<CGCClient_AssembliesForSequences> reply(gencoll_service->FindAssembliesBySequences(seqAcc,
251  filter));
252  if (reply && reply->CanGetAssemblies()) {
253  for (auto i: reply->GetAssemblies()) {
254  if (i->CanGetAssembly()) {
255  AssmAccs.push_back(i->GetAssembly().GetAccession());
256  }
257  }
258  }
259  }
260  {
261  CFastMutexGuard lock(sm_SeqAccToAssmAccsCacheMutex);
262  sm_SeqAccToAssmAccsCache[sCacheKey] = AssmAccs;
263  }
264  } catch(...) {
265  LOG_POST(Error << "Call to GenColl timed out when guessing assembly for ID: " << seqAcc);
266  perflog.AddParameter("AssmAccs source", "failed");
267  perflog.Post();
268  return eGSS_failed;
269  }
270  perflog.AddParameter("AssmAccs source", "GenColl");
271  perflog.Post();
272  return eGSS_from_gencoll;
273  }
274 }
275 
276 
277 void CGencollSvc::GetAssmsInfo(const TAssmAccs& assm_accs, const string& mode, TAssemblies& assemblies, IGencollSvcErrors *errors)
278 {
279  string sAssmAcc;
280  try {
281  for(auto &iAssmAccs: assm_accs) {
282  sAssmAcc = iAssmAccs;
283  CRef<objects::CGC_Assembly> assembly{ GetInstance()->GetGCAssembly(sAssmAcc, true, mode, errors) };
284  if (assembly.NotNull()) {
285  assemblies.push_back(assembly);
286  }
287  else {
288  LOG_POST(Warning << "Call to GenColl failed for assembly " << sAssmAcc);
289  }
290  }
291  }
292  catch (const CException& e) {
293  LOG_POST(Warning << "Call to GenColl failed for assembly " << sAssmAcc << " :" << e.what());
294  }
295 }
296 
297 
298 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
CGC_Assemblies –.
string GetAccession() const
Retrieve the accession for this assembly.
Definition: GC_Assembly.cpp:99
static void GetAssmsInfo(const TAssmAccs &assm_accs, const string &mode, TAssemblies &assemblies, IGencollSvcErrors *errors=nullptr)
query GenColl for assembly information, return list of assemblies; mode is as described above for GetG...
CGencollSvc(void)
Definition: gencoll_svc.cpp:60
map< string, CRef< objects::CGC_Assembly > > m_GC_Assembly_cache
mutex to sync our internals
CRef< objects::CGC_Assembly > GetGCAssembly(const string &assm_acc, bool use_caching, const string &mode, IGencollSvcErrors *errors=nullptr, bool isUseSeqconfigService=false)
Definition: gencoll_svc.cpp:69
static bool isGenCollSequence(const objects::CBioseq_Handle &handle)
list< string > TAssmAccs
Definition: gencoll_svc.hpp:57
CFastMutex m_Mutex
static CRef< CGencollSvc > GetInstance(void)
Definition: gencoll_svc.cpp:54
static CRef< objects::CGenomicCollectionsService > GetGenCollService(int timeout_sec=-1)
static TSeqAccToAssmAccsCache sm_SeqAccToAssmAccsCache
EGencollSvcStatus GetAssmAccs(const objects::CBioseq_Handle &handle, objects::EGCClient_GetAssemblyBySequenceFilter filter, bool isOne, TAssmAccs &AssmAccs)
get all assembly accessions for a given sequence and filter; if isOne == 1, return only the best one a...
list< CRef< objects::CGC_Assembly > > TAssemblies
Definition: gencoll_svc.hpp:58
void AddParameter(const std::string &name, const std::string &value)
void Post(CRequestStatus::ECode status=CRequestStatus::e200_Ok)
static CNcbiApplication * Instance(void)
Singleton method.
Definition: ncbiapp.cpp:264
CObjectIStreamAsnBinary –.
Definition: objistrasnb.hpp:59
CRef –.
Definition: ncbiobj.hpp:618
CSafeStatic<>::
virtual void ReportError(const string &accession, const string &error)=0
const_iterator end() const
Definition: map.hpp:152
const_iterator find(const key_type &key) const
Definition: map.hpp:153
Definition: map.hpp:338
USING_SCOPE(objects)
static string s_MakeCacheKey(const string &assm_acc, const string &mode)
Definition: gencoll_svc.cpp:64
DEFINE_CLASS_STATIC_FAST_MUTEX(CGencollSvc::sm_SeqAccToAssmAccsCacheMutex)
const CNcbiRegistry & GetConfig(void) const
Get the application's cached configuration parameters (read-only).
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
#define NCBI_CATCH(message)
Catch CExceptions as well. This macro is deprecated - use *_X or *_XX variant instead of it.
Definition: ncbiexpt.hpp:580
const string & GetMsg(void) const
Get message string.
Definition: ncbiexpt.cpp:461
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1191
virtual const char * what(void) const noexcept
Standard report (includes full backlog).
Definition: ncbiexpt.cpp:342
static EAccessionInfo IdentifyAccession(const CTempString &accession, TParseFlags flags=fParse_AnyRaw)
Deduces information from a bare accession a la WHICH_db_accession; may report false negatives on prop...
Definition: Seq_id.cpp:1634
EAccessionInfo
For IdentifyAccession (below)
Definition: Seq_id.hpp:220
@ eAcc_type_mask
Definition: Seq_id.hpp:247
@ eContent
Untagged human-readable accession or the like.
Definition: Seq_id.hpp:605
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
@ eGetId_Best
return the "best" gi (uses FindBestScore(), with CSeq_id::CalculateScore() as the score function)
Definition: sequence.hpp:101
bool NotNull(void) const THROWS_NONE
Check if pointer is not null – same effect as NotEmpty().
Definition: ncbiobj.hpp:744
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
Definition: ncbiobj.hpp:2015
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
static unsigned int StringToUInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to unsigned int.
Definition: ncbistr.cpp:642
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
static string URLEncode(const CTempString str, EUrlEncode flag=eUrlEnc_SkipMarkChars)
URL-encode string.
Definition: ncbistr.cpp:6062
@ fConvErr_NoThrow
Do not throw an exception on error.
Definition: ncbistr.hpp:285
@ eNocase
Case insensitive compare.
Definition: ncbistr.hpp:1206
unsigned int usec
microseconds (modulo 1,000,000)
Definition: ncbi_types.h:78
unsigned int sec
seconds
Definition: ncbi_types.h:77
const TAccession & GetAccession(void) const
Get the Accession member data.
bool CanGetAccession(void) const
Check if it is safe to call GetAccession method.
EGCClient_GetAssemblyBySequenceFilter
This is a bitfield. All values are powers of two.
@ e_Local
local use
Definition: Seq_id_.hpp:95
EMol
molecule class in living organism
Definition: Seq_inst_.hpp:108
int i
static MDB_envinfo info
Definition: mdb_load.c:37
static CSafeStatic< CMetaRegistry > s_Instance
Definition: metareg.cpp:59
mdb_mode_t mode
Definition: lmdb++.h:38
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Timeout structure.
Definition: ncbi_types.h:76
Modified on Tue Apr 23 07:38:14 2024 by modify_doxy.py rev. 669887