NCBI C++ ToolKit
vdb2blast_util.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: vdb2blast_util.cpp 101102 2023-10-30 13:07:22Z fongah2 $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Vahram Avagyan
27  *
28  */
29 
30 #include <ncbi_pch.hpp>
31 #include <common/ncbi_export.h>
35 #include "vdb_priv.h"
36 #include "vdbsequtil.h"
37 
39 USING_SCOPE(blast);
41 
42 // ==========================================================================//
43 // Constants
44 
45 // ==========================================================================//
46 
47 /// CVDBSeqInfoSrc
48 ///
49 /// Implementation of the IBlastSeqInfoSrc interface for SRA databases.
50 ///
51 /// This class can be used internally by the Blast API classes to generate
52 /// SeqIDs and other high-level sequence information from ordinal numbers
53 /// (OIDs), which represent the implementation-level numbering of sequences.
54 /// This class communicates with the CVDBBlastUtil class to convert
55 /// OIDs to SRA-specific SeqIDs.
56 
57 class CVDBSeqInfoSrc : public blast::IBlastSeqInfoSrc
58 {
59 public:
60  /// Constructor taking a CVDBBlastUtil object.
61  /// @param sraBlastUtil Properly initialized CVDBBlastUtil object [in]
62  CVDBSeqInfoSrc(CRef<CVDBBlastUtil> sraBlastUtil);
63 
64  /// Destructor.
65  virtual ~CVDBSeqInfoSrc();
66 
67  /// Method to retrieve a sequence identifier given its ordinal number.
68  /// @param oid the ordinal number to retrieve [in]
69  /// @return list of SeqIDs identifying this sequence.
70  virtual list< CRef<objects::CSeq_id> > GetId(Uint4 oid) const;
71 
72  /// Method to retrieve the sequence location given its ordinal number.
73  /// @param oid the ordinal number to retrieve [in]
74  /// @return a SeqLoc identifying this sequence.
75  virtual CConstRef<objects::CSeq_loc> GetSeqLoc(Uint4 oid) const;
76 
77  /// Method to retrieve a sequence length given its ordinal number.
78  /// @param oid the ordinal number to retrieve [in]
79  /// @return length of the sequence.
80  virtual Uint4 GetLength(Uint4 oid) const;
81 
82  /// Returns the size of the underlying container of sequences
83  /// @return total size of all the sequences.
84  virtual size_t Size() const;
85 
86  /// Returns true if the subject is restricted by a GI list,
87  /// always returns false in this implementation.
88  virtual bool HasGiList() const;
89 
90  /// Retrieves the subject masks for the corresponding oid,
91  /// always returns false in this implementation.
92  virtual bool GetMasks(Uint4 oid,
93  const vector<TSeqRange>& target_ranges,
94  TMaskedSubjRegions& retval) const;
95 
96  /// Retrieves the subject masks for the corresponding oid,
97  /// always returns false in this implementation.
98  virtual bool GetMasks(Uint4 oid,
99  const TSeqRange& target_range,
100  TMaskedSubjRegions& retval) const;
101 
102  /// Return true if the implementation can return anything besides a seq-loc
103  /// for the entire sequence. If in doubt, the implementation must
104  /// return true.
105  virtual bool CanReturnPartialSequence() const {return true;}
106 
107 private:
108  /// The CVDBBlastUtil object that takes care of various conversions.
110 };
111 
112 static const char kDigits[] = "0123456789";
113 
114 // ==========================================================================//
115 // CVDBSeqInfoSrc implementation
116 
118 : m_sraBlastUtil(sraBlastUtil)
119 {
121 }
122 
124 {
126 }
127 
128 list< CRef<CSeq_id> > CVDBSeqInfoSrc::GetId(Uint4 oid) const
129 {
130  CRef<CSeq_id> seqIdVDB =
132  ASSERT(seqIdVDB.NotEmpty());
133 
134  list< CRef<CSeq_id> > listIds;
135  listIds.push_back(seqIdVDB);
136 
137  return listIds;
138 }
139 
141 {
142  list< CRef<CSeq_id> > listIds = GetId(oid);
143  ASSERT(!listIds.empty());
144 
145  CRef<CSeq_loc> seqLoc(new CSeq_loc);
146  seqLoc->SetWhole().Assign(**listIds.begin());
147 
148  return seqLoc;
149 }
150 
152 {
154  ASSERT(seqSrc);
155 
156  return BlastSeqSrcGetSeqLen(seqSrc, (void*) &oid);
157 
158 }
159 
160 size_t CVDBSeqInfoSrc::Size() const
161 {
163  ASSERT(seqSrc);
164  return BlastSeqSrcGetNumSeqs(seqSrc);
165 }
166 
168 {
169  return false;
170 }
171 
173  const vector<TSeqRange>& target_ranges,
174  TMaskedSubjRegions& retval) const
175 {
176  return false;
177 }
178 
180  const TSeqRange& target_range,
181  TMaskedSubjRegions& retval) const
182 {
183  return false;
184 }
185 
186 // ==========================================================================//
187 /// CVDBBlastUtil implementation
188 
189 void
190 CVDBBlastUtil::x_GetSRARunAccessions(vector<string>& vecSRARunAccessions)
191 {
192  try
193  {
194  NStr::Split(m_strAllRuns, " ", vecSRARunAccessions, NStr::fSplit_Tokenize);
195  // remove any redundancy
196  set<string> string_set;
197  copy(vecSRARunAccessions.begin(),
198  vecSRARunAccessions.end(),
199  inserter(string_set, string_set.begin()));
200  vecSRARunAccessions.clear();
201  copy(string_set.begin(), string_set.end(),
202  back_inserter(vecSRARunAccessions));
203  }
204  catch (...)
205  {
207  "Failed to process SRA accession list: " + m_strAllRuns);
208  }
209 
210  if (vecSRARunAccessions.empty())
211  {
213  "Invalid SRA accession list: " + m_strAllRuns);
214  }
215 }
216 
217 
220 {
221  // Parse the SRA run accessions
222  vector<string> vecSRARunAccessions;
223  x_GetSRARunAccessions(vecSRARunAccessions);
224 
225  // Prepare the input data for SRA BlastSeqSrc construction
226  Uint4 numRuns = vecSRARunAccessions.size();
227  char** vdbRunAccessions = new char*[numRuns];
228  for (Uint4 iRun = 0; iRun < numRuns; iRun++) {
229  if (!vecSRARunAccessions[iRun].empty()) {
230  vdbRunAccessions[iRun] =
231  strdup(vecSRARunAccessions[iRun].c_str());
232  }
233  }
234 
235  Boolean * isRunExcluded = new Boolean[numRuns];
236  Uint4 rc;
237  // Construct the BlastSeqSrc object
238  BlastSeqSrc* seqSrc =
239  SRABlastSeqSrcInit((const char**)vdbRunAccessions, numRuns, false, isRunExcluded, &rc, m_isCSRAUtil, m_IncludeFilteredReads);
240 
241  string excluded_runs= kEmptyStr;
242  // Clean up
243  for (Uint4 iRun = 0; iRun < numRuns; iRun++)
244  {
245  if(isRunExcluded[iRun]) {
246  excluded_runs += vdbRunAccessions[iRun];
247  excluded_runs += " ";
248  }
249  }
250  delete [] vdbRunAccessions;
251  delete [] isRunExcluded;
252 
253  if(rc > 0 && excluded_runs != kEmptyStr) {
254  if(seqSrc)
255  BlastSeqSrcFree(seqSrc);
257  "Error opening the following db(s): " + excluded_runs);
258  }
259 
260 
261  if (!seqSrc) {
263  "Failed to construct the VDB BlastSeqSrc object");
264  }
265 
266  // Check for errors
267  char* errMsg = BlastSeqSrcGetInitError(seqSrc);
268  if (errMsg) {
269  string strErrMsg(errMsg);
270  free(errMsg);
271  BlastSeqSrcFree(seqSrc);
273  "VDB BlastSeqSrc construction failed: " + strErrMsg);
274  }
275 
276 
277  return seqSrc;
278 }
279 
280 CVDBBlastUtil::CVDBBlastUtil(const string& strAllRuns,
281  bool bOwnSeqSrc,
282  bool bCSRA,
283  bool bIncludeFilteredReads):
284  m_bOwnSeqSrc(bOwnSeqSrc), m_strAllRuns(strAllRuns), m_isCSRAUtil(bCSRA),
285  m_IncludeFilteredReads(bIncludeFilteredReads)
286 {
289  NCBI_THROW(CException, eUnknown, "VDB Num of seqs overflow");
290  }
291 }
292 
294 {
295  if (m_bOwnSeqSrc)
296  {
298  }
299 }
300 
303 {
304  if (!m_seqSrc)
305  {
306  NCBI_THROW(CException, eUnknown, "VDB BlastSeqSrc is not available");
307  }
308 
309  return m_seqSrc;
310 }
311 
314 {
315  if (!m_seqSrc)
316  {
317  NCBI_THROW(CException, eUnknown, "VDB BlastSeqSrc is not available");
318  }
319 
320  CRef<CVDBBlastUtil> refThis(this);
321  CRef<IBlastSeqInfoSrc> infoSrc(new CVDBSeqInfoSrc(refThis));
322  return infoSrc;
323 }
324 
325 static bool
326 s_IsWGSId(const string & id)
327 {
328  size_t first_digit_pos = id.find_first_of(kDigits);
329  if((first_digit_pos > 3) && (first_digit_pos <= 6)) {
330  if(id.find_first_not_of(kDigits, first_digit_pos) == std::string::npos) {
331  return true;
332  }
333  }
334  return false;
335 }
336 
337 Uint4
339 {
340  if (!m_seqSrc) {
341  NCBI_THROW(CException, eUnknown, "VDB BlastSeqSrc is not available");
342  }
343 
344  // Decode the tag to collect the SRA-specific indices
345  const char * readName = NULL;
346  string nameStr= kEmptyStr;
347  if(seqId->IsGeneral()) {
348  if (!seqId->GetGeneral().CanGetDb() ||
349  !seqId->GetGeneral().CanGetTag() ||
350  !seqId->GetGeneral().GetTag().IsStr()) {
352  "Incomplete SeqID for SRA sequence");
353  }
354 
355  // Decode the tag to collect the SRA-specific indices
356  nameStr = seqId->GetGeneral().GetTag().GetStr();
357  }
358  else {
359  nameStr = seqId->GetSeqIdString(true);
360  }
361 
362  if (nameStr == kEmptyStr) {
364  "Empty VDB tag in SeqID");
365  }
366  readName = nameStr.c_str();
367 
368  // Get the VDB Data
370  if (!vdbData) {
371  NCBI_THROW(CException, eUnknown, "Invalid VDB BlastSeqSrc");
372  }
373 
374  // Get the OID
375  Int4 oid = 0;
376  if (!VDBSRC_GetOIDFromReadName(vdbData, readName, &oid)) {
378  "Failed to get the OID for the VDB tag: " + string(readName));
379  }
380  return Uint4(oid);
381 }
382 
383 
384 bool
385 CVDBBlastUtil::IsSRA(const string & db_name)
386 {
387  if (!db_name.empty())
388  {
389  size_t last_pos = db_name.find_last_of(CDirEntry::GetPathSeparator());
390  string tmp = db_name;
391  if(last_pos != string::npos) {
392  tmp = db_name.substr(last_pos +1);
393  }
394 
395  if(tmp.find_first_of(kDigits) == 3) {
396  if(tmp.find_first_not_of(kDigits, 4) == std::string::npos)
397  return true;
398  }
399  }
400  return false;
401 }
404 {
405  if (!m_seqSrc)
406  {
407  NCBI_THROW(CException, eUnknown, "SRA BlastSeqSrc is not available");
408  }
409 
410  CRef<CSeq_id> seqId;
411  // Get the SRA Data
412  TVDBData* vdbData =
414  if (!vdbData)
415  {
416  NCBI_THROW(CException, eUnknown, "Invalid SRA BlastSeqSrc");
417  }
418 
419  char nameRun[100];
420  if(!VDBSRC_GetReadNameForOID(vdbData, oid,nameRun, 100)) {
421  return seqId;
422  }
423 
424  const string gnl_tag("gnl|SRA|");
425  string strId = string(nameRun);
426  if(m_isCSRAUtil) {
427  vector<string> tmp;
428  NStr::Split(strId, "/", tmp, NStr::fSplit_Tokenize);
429  if (NStr::Find(tmp.back(), "|") != NPOS) {
430  list<CRef<CSeq_id> > ids;
431  CSeq_id::ParseFastaIds(ids, tmp.back(), true);
434  strId = gnl_tag + strId;
435  }
436  else {
437  seqId.Reset(bs_id);
438  return seqId;
439  }
440  }
442  strId = gnl_tag + strId;
443  }
444  else {
445  strId =tmp.back();
446  }
447  }
448  else {
449  if (!s_IsWGSId(strId)) {
450  strId = gnl_tag + strId;
451  }
452  }
453  seqId.Reset(new CSeq_id(strId));
454 
455  return seqId;
456 }
457 
460 {
461  if (!m_seqSrc)
462  {
463  NCBI_THROW(CException, eUnknown, "VDB BlastSeqSrc is not available");
464  }
465 
466  CRef<CBioseq> bioseqResult;
467 
468  // Get the VDB Data
469  TVDBData* vdbData =
471  if (!vdbData)
472  {
473  NCBI_THROW(CException, eUnknown, "Invalid VDB BlastSeqSrc");
474  }
475 
476  uint64_t oid = (uint64_t) GetOIDFromVDBSeqId(seqId);
477 
478  // Read the sequence as string
479  char* cstrSeq = 0;
480  bool rc = false;
481  TVDBErrMsg errMsg;
482  VDBSRC_InitEmptyErrorMsg(&errMsg);
483  rc = VDBSRC_Get4naSequenceAsString(vdbData, oid, &cstrSeq, &errMsg);
484 
485  if(!rc)
486  {
487  char * errString;
488  VDBSRC_FormatErrorMsg(&errString,&errMsg);
489  VDBSRC_ReleaseErrorMsg(&errMsg);
490  //ERR_POST(Error << errString);
492  "Failed to read the VDB sequence string for OID=" +
493  NStr::UInt8ToString(oid));
494  }
495  VDBSRC_ReleaseErrorMsg(&errMsg);
496 
497  if (!cstrSeq || strlen(cstrSeq) == 0)
498  {
500  "Got an empty VDB sequence string for OID=" +
501  NStr::UInt8ToString(oid));
502  }
503 
504  // Store the sequence in the Bioseq
505  CRef<CSeq_data> seqData(new CSeq_data(cstrSeq, CSeq_data::e_Iupacna));
506  CRef<CSeq_inst> seqInst(new CSeq_inst);
507  seqInst->SetRepr(CSeq_inst::eRepr_raw);
508  seqInst->SetMol(CSeq_inst::eMol_dna);
509  seqInst->SetLength(strlen(cstrSeq));
510  seqInst->SetSeq_data(*seqData);
511 
512  bioseqResult.Reset(new CBioseq);
513  bioseqResult->SetInst(*seqInst);
514 
515  // Store the Seq ID in the Bioseq
516  bioseqResult->SetId().push_back(seqId);
517 
518  // Store the spot name as a title in the Bioseq
519  CRef<CSeqdesc> descTitle(new CSeqdesc);
520  string title = "Length: " + NStr::UIntToString(seqInst->GetLength());
521  descTitle->SetTitle(title);
522  bioseqResult->SetDescr().Set().push_back(descTitle);
523 
524  free(cstrSeq);
525  return bioseqResult;
526 }
527 
530 {
531  if (!m_seqSrc)
532  {
533  NCBI_THROW(CException, eUnknown, "VDB BlastSeqSrc is not available");
534  }
535 
536  CRef<CBioseq> bioseqResult;
537  TVDBErrMsg errMsg;
538  VDBSRC_InitEmptyErrorMsg(&errMsg);
539 
540  // Get the VDB Data
541  TVDBData* vdbData =
543  if (!vdbData)
544  {
545  NCBI_THROW(CException, eUnknown, "Invalid VDB BlastSeqSrc");
546  }
547 
548  if(vdbData->reader_2na == NULL)
549  {
550  if(errMsg.isError)
551  {
552  char * errString;
553  VDBSRC_FormatErrorMsg(&errString,&errMsg);
554  VDBSRC_ReleaseErrorMsg(&errMsg);
555  ERR_POST(Error << errString);
557  "2na reader has not been initialized");
558  }
559  }
560 
561  // Read the sequence as string
562  char* cstrSeq = 0;
563  bool rc = false;
564 
565  rc = VDBSRC_Get2naSequenceAsString(vdbData, oid, &cstrSeq, &errMsg);
566 
567  if((rc == FALSE) && (errMsg.isError))
568  {
569  char * errString;
570  VDBSRC_FormatErrorMsg(&errString,&errMsg);
571  VDBSRC_ReleaseErrorMsg(&errMsg);
572  ERR_POST(Error << errString);
574  "Failed to read the VDB sequence string for OID=" +
575  NStr::UInt8ToString(oid));
576  }
577  else if((rc == FALSE) && (!errMsg.isError))
578  {
579  ERR_POST(Warning << "All sequences in the set has been read");
580  VDBSRC_ReleaseErrorMsg(&errMsg);
581  return bioseqResult;
582  }
583 
584  VDBSRC_ReleaseErrorMsg(&errMsg);
585  if (!cstrSeq || strlen(cstrSeq) == 0)
586  {
588  "Got an empty VDB sequence string for OID=" +
589  NStr::UInt8ToString(oid));
590  }
591 
593  if (id.Empty())
594  {
596  "Failed to seq id for OID=" +
597  NStr::UInt8ToString(oid));
598  }
599 
600  // Store the sequence in the Bioseq
601  CRef<CSeq_data> seqData(new CSeq_data(cstrSeq, CSeq_data::e_Iupacna));
602  CRef<CSeq_inst> seqInst(new CSeq_inst);
603  seqInst->SetRepr(CSeq_inst::eRepr_raw);
604  seqInst->SetMol(CSeq_inst::eMol_dna);
605  seqInst->SetLength(strlen(cstrSeq));
606  seqInst->SetSeq_data(*seqData);
607 
608  bioseqResult.Reset(new CBioseq);
609  bioseqResult->SetInst(*seqInst);
610 
611  // Store the Seq ID in the Bioseq
612  bioseqResult->SetId().push_back(id);
613 
614  // Store the spot name as a title in the Bioseq
615  CRef<CSeqdesc> descTitle(new CSeqdesc);
616  string title = "Length: " + NStr::UIntToString(seqInst->GetLength());
617  descTitle->SetTitle(title);
618  bioseqResult->SetDescr().Set().push_back(descTitle);
619 
620  return bioseqResult;
621 }
622 
625 {
626  if (alnSet.Empty())
627  return;
628 
629  CSeq_align_set::Tdata::const_iterator itAln;
630  for (itAln = alnSet->Get().begin(); itAln != alnSet->Get().end(); itAln++)
631  {
632  CRef<CSeq_align> alnCur = *itAln;
633  const CSeq_id& subjIdFromAln = alnCur->GetSeq_id(1);
634  CRef<CSeq_id> subjId(new CSeq_id);
635  subjId->Assign(subjIdFromAln);
636 
637  CRef<CBioseq> bioseq = CreateBioseqFromVDBSeqId(subjId);
638  scope->AddBioseq(*bioseq, CScope::kPriority_Default,
640  }
641 }
642 
643 void
644 CVDBBlastUtil::FillVDBInfo(vector< CBlastFormatUtil::SDbInfo >& vecDbInfo)
645 {
646  if (!m_seqSrc)
647  {
648  NCBI_THROW(CException, eUnknown, "VDB BlastSeqSrc is not available");
649  }
650 
651  // Create a DB info structure describing the list of open SRA runs
652  CBlastFormatUtil::SDbInfo dbInfo;
653  dbInfo.is_protein = false;
654  dbInfo.name = NStr::Replace(BlastSeqSrcGetName(m_seqSrc), "|", " ") ;
655  dbInfo.definition = dbInfo.name;
656  dbInfo.total_length = BlastSeqSrcGetTotLen(m_seqSrc);
657  dbInfo.number_seqs = BlastSeqSrcGetNumSeqs(m_seqSrc);
658  vecDbInfo.push_back(dbInfo);
659 }
660 
661 CVDBBlastUtil::CVDBBlastUtil(bool bCSRA, const string& strAllRuns):
662  m_bOwnSeqSrc(true), m_strAllRuns(strAllRuns), m_isCSRAUtil(bCSRA)
663 {
665 }
666 
667 void CVDBBlastUtil::GetVDBStats(const string & strAllRuns, Uint8 & num_seqs, Uint8 & length, bool getRefStats)
668 {
669  CVDBBlastUtil util(getRefStats, strAllRuns);
670  BlastSeqSrc* seq_src = util.GetSRASeqSrc();
671  TVDBData* vdbData =
673  if (!vdbData)
674  {
675  NCBI_THROW(CException, eUnknown, "Invalid SRA BlastSeqSrc");
676  }
677 
678  num_seqs = vdbData->numSeqs;
679  length = VDBSRC_GetTotSeqLen(vdbData);
680 }
681 
682 void CVDBBlastUtil::GetVDBStats(const string & strAllRuns, Uint8 & num_seqs, Uint8 & length,
683  Uint8 & max_seq_length, Uint8 & av_seq_length, bool getRefStats)
684 {
685  CVDBBlastUtil util(getRefStats, strAllRuns);
686  BlastSeqSrc* seq_src = util.GetSRASeqSrc();
687  TVDBData* vdbData =
689  if (!vdbData)
690  {
691  NCBI_THROW(CException, eUnknown, "Invalid SRA BlastSeqSrc");
692  }
693 
694  num_seqs = vdbData->numSeqs;
695  length = VDBSRC_GetTotSeqLen(vdbData);
696  max_seq_length = VDBSRC_GetMaxSeqLen(vdbData);
697  av_seq_length = VDBSRC_GetAvgSeqLen(vdbData);
698 }
699 
700 void CVDBBlastUtil::CheckVDBs(const vector<string> & vdbs)
701 {
702  unsigned int numRuns = vdbs.size();
703 
704  char** vdbRunAccessions = new char*[numRuns];
705  for (Uint4 iRun = 0; iRun < numRuns; iRun++) {
706  if (!vdbs[iRun].empty()) {
707  vdbRunAccessions[iRun] = strdup(vdbs[iRun].c_str());
708  }
709  }
710 
711  AutoPtr<Boolean, ArrayDeleter<Boolean> > isRunExcluded(new Boolean[numRuns]);
712  Uint4 rc;
713  // Construct the BlastSeqSrc object
714  SRABlastSeqSrcInit((const char**)vdbRunAccessions, numRuns, false, isRunExcluded.get(), &rc, false, false);
715 
716  // Clean up
717  string cannot_open= kEmptyStr;
718  for (Uint4 iRun = 0; iRun < numRuns; iRun++)
719  {
720  if(isRunExcluded.get()[iRun]) {
721  cannot_open += vdbs[iRun]+ " ";
722  }
723  free(vdbRunAccessions[iRun]);
724  }
725  delete [] vdbRunAccessions;
726 
727  if(cannot_open != kEmptyStr)
728  {
729  NCBI_THROW(CException, eInvalid, "Invalid vdbs: " + cannot_open);
730  }
731 }
732 
734 {
735  Uint4 status = 0;
736  VdbBlastMgr* mgr = VDBSRC_GetVDBManager(&status);
737  if(status != 0 || mgr == NULL) {
738  NCBI_THROW(CException, eInvalid, "Fail to setup VDB manager");
739  }
740  status = VdbBlastMgrKLogHandlerSetStdErr(mgr);
741  if (status == 0) {
742  status = VdbBlastMgrKLogLibHandlerSetStdErr(mgr);
743  }
744  if (status == 0) {
745  status = VdbBlastMgrKLogLevelSetWarn(mgr);
746  }
747  return status;
748 }
749 
751 {
753 }
754 
755 bool CVDBBlastUtil::IsCSRA(const string & db_name)
756 {
757  if(CVDBBlastUtil::IsSRA(db_name)) {
758  int rv = VDBSRC_IsCSRA(db_name.c_str());
759  if(rv == 1)
760  return true;
761 
762  if(rv == -1)
763  NCBI_THROW(CException, eInvalid, "Check for csra retruns error");
764  }
765 
766  return false;
767 }
768 
769 void CVDBBlastUtil::GetAllStats(const string & strAllRuns, Uint8 & num_seqs, Uint8 & length,
770  Uint8 & ref_num_seqs, Uint8 & ref_length)
771 {
772  CVDBBlastUtil util(false, strAllRuns);
773  BlastSeqSrc* seq_src = util.GetSRASeqSrc();
774  TVDBData* vdbData = (TVDBData*)(_BlastSeqSrcImpl_GetDataStructure(seq_src));
775  if (!vdbData) {
776  NCBI_THROW(CException, eUnknown, "Invalid SRA BlastSeqSrc");
777  }
778 
779  num_seqs = vdbData->numSeqs;
780  length = VDBSRC_GetTotSeqLen(vdbData);
781 
782  TVDBErrMsg vdbErrMsg;
783  VDBSRC_InitEmptyErrorMsg(&vdbErrMsg);
784  VDBSRC_MakeCSRASeqSrcFromSRASeqSrc(vdbData, &vdbErrMsg, true);
785  if(vdbErrMsg.isError) {
786  ref_num_seqs = 0;
787  ref_length = 0;
788  }
789  else {
790  ref_num_seqs = vdbData->numSeqs;
791  ref_length = VDBSRC_GetTotSeqLen(vdbData);
792  }
793 }
794 
796 {
797  if ( seq_id.Which() == CSeq_id::e_General ) {
798  const CDbtag& dbtag = seq_id.GetGeneral();
799  if ( NStr::EqualNocase(dbtag.GetDb(), "SRA")) {
800  if ( dbtag.GetTag().IsStr() ) {
801  const string& str = dbtag.GetTag().GetStr();
802  SIZE_TYPE srr_len = str.find('/');
803  if ( srr_len != NPOS ) {
804  return eCSRALocalRefId;
805  }
806  else {
807  return eSRAId;
808  }
809  }
810  }
811  }
812  if (s_IsWGSId(seq_id.GetSeqIdString())) {
813  return eWGSId;
814  }
815 
817  return eCSRARefId;
818  }
819  return eUnknownId;
820 }
821 
823 {
825  if(mem_size == 0)
826  return 0;
827  Uint4 num_thread = (mem_size * 0.5/VDB_2NA_CHUNK_BUF_SIZE);
828 
829  return (num_thread == 0? 1: num_thread);
830 }
831 
832 void CVDBBlastUtil::GetOidsFromSeqIds_WGS(const vector<string> & ids , vector<int> & oids)
833 {
834  int num_ids = ids.size();
835  int i = 0;
836  BlastSeqSrc* seqsrc = GetSRASeqSrc();
837  TVDBData* vdbData =
839  if (!vdbData) {
840  NCBI_THROW(CException, eUnknown, "Invalid SRA BlastSeqSrc");
841  }
842 
843  Uint8 num_seqs = vdbData->numSeqs;
844  for(Uint8 j=0; j < num_seqs; j++) {
846  if(id.Empty()){
847  continue;
848  }
849  if(ids[i] == id->GetSeqIdString(true)) {
850  oids[i] = j;
851  i++;
852  if(i >= num_ids) {
853  break;
854  }
855  }
856  }
857 
858  if (i != num_ids) {
859  NCBI_THROW(CException, eInvalid, "Not all oids have been found");
860  }
861 }
862 
864 {
865  if (m_isCSRAUtil) {
866  return false;
867  }
868 
869  BlastSeqSrc* seqsrc = GetSRASeqSrc();
870  TVDBData* vdbData = (TVDBData*)(_BlastSeqSrcImpl_GetDataStructure(seqsrc));
871  if (!vdbData) {
872  NCBI_THROW(CException, eUnknown, "Invalid SRA BlastSeqSrc");
873  }
874  unsigned int num_wgs = 0;
875  vector <string> dbs;
876  NStr::Split(vdbData->names, " ", dbs, NStr::fSplit_Tokenize);
877 
878  for (unsigned int i=0; i < dbs.size(); i++) {
879  size_t last_pos = dbs[i].find_last_of(CDirEntry::GetPathSeparator());
880  string tmp = dbs[i];
881  if(last_pos != string::npos) {
882  tmp = dbs[i].substr(last_pos +1);
883  }
884 
885  if(s_IsWGSId(tmp)) {
886  num_wgs ++;
887  }
888  }
889 
890  if (num_wgs == dbs.size()) {
891  return true;
892  }
893  return false;
894 }
895 
896 // ==========================================================================//
897 
Int4 BlastSeqSrcGetSeqLen(const BlastSeqSrc *seq_src, void *oid)
Retrieve sequence length (number of residues/bases)
Definition: blast_seqsrc.c:281
char * BlastSeqSrcGetInitError(const BlastSeqSrc *seq_src)
Function to retrieve NULL terminated string containing the description of an initialization error or ...
Definition: blast_seqsrc.c:159
Int4 BlastSeqSrcGetNumSeqs(const BlastSeqSrc *seq_src)
Get the number of sequences contained in the sequence source.
Definition: blast_seqsrc.c:177
Int8 BlastSeqSrcGetTotLen(const BlastSeqSrc *seq_src)
Get the total length of all sequences in the sequence source.
Definition: blast_seqsrc.c:219
BlastSeqSrc * BlastSeqSrcFree(BlastSeqSrc *seq_src)
Frees the BlastSeqSrc structure by invoking the destructor function set by the user-defined construct...
Definition: blast_seqsrc.c:112
const char * BlastSeqSrcGetName(const BlastSeqSrc *seq_src)
Get the Blast Sequence source name (e.g.
Definition: blast_seqsrc.c:235
NCBI_XBLAST_EXPORT void * _BlastSeqSrcImpl_GetDataStructure(const BlastSeqSrc *var)
Definition: blast_seqsrc.c:555
AutoPtr –.
Definition: ncbimisc.hpp:401
Definition: Dbtag.hpp:53
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
Definition: Seq_align.cpp:317
static Uint8 GetTotalPhysicalMemorySize(void)
Return the amount of actual/total physical memory, in bytes.
CVDBBlastUtil.
BlastSeqSrc * GetSRASeqSrc()
Return the stored SRA BlastSeqSrc object.
static Uint4 GetMaxNumCSRAThread(void)
void FillVDBInfo(vector< CBlastFormatUtil::SDbInfo > &vecDbInfo)
Populate the DB info list with information on open SRA runs.
string m_strAllRuns
Space-delimited list of opened SRA run accessions.
Uint4 GetOIDFromVDBSeqId(CRef< objects::CSeq_id > seqId)
Get the ordinal number (OID) for the given SRA sequence.
void x_GetSRARunAccessions(vector< string > &vecSRARunAccessions)
Tokenize the stored whitespace-delimited string of SRA runs.
bool m_bOwnSeqSrc
Release the BlastSeqSrc object in destructor.
CRef< objects::CSeq_id > GetVDBSeqIdFromOID(Uint4 oid)
Get the SRA sequence SeqID given its ordinal number (OID).
static void GetVDBStats(const string &strAllRuns, Uint8 &num_seqs, Uint8 &length, bool getRefStats=false)
Function to get around the OID (BlastSeqSrc) limit, so that a number of sequences larger than an Int4 can be returned.
static Uint4 SetupVDBManager()
*Note* Call this in main thread first, if you are going to instantiate this object or use any of the ...
BlastSeqSrc * m_seqSrc
Pointer to a properly initialized SRA BlastSeqSrc.
CRef< objects::CBioseq > CreateBioseqFromVDBSeqId(CRef< objects::CSeq_id > seqId)
Construct a Bioseq object for the given SRA sequence.
static bool IsCSRA(const string &db_name)
static void CheckVDBs(const vector< string > &vdbs)
Function to check a list of dbs if they can be opened Throw an exception if any of the db cannot be o...
BlastSeqSrc * x_MakeVDBSeqSrc()
Construct an SRA BlastSeqSrc object from the given strings.
void GetOidsFromSeqIds_WGS(const vector< string > &ids, vector< int > &oids)
CRef< blast::IBlastSeqInfoSrc > GetSRASeqInfoSrc()
Return the SRA BlastSeqInfoSrc object (create if none exists).
virtual ~CVDBBlastUtil()
Destructor.
CVDBBlastUtil(const string &strAllRuns, bool bOwnSeqSrc=false, bool bCSRA=false, bool bIncludeFilteredReads=false)
Constructor that creates and stores the SRA BlastSeqSrc object.
static bool IsSRA(const string &db_name)
static void GetAllStats(const string &strAllRuns, Uint8 &num_seqs, Uint8 &length, Uint8 &ref_num_seqs, Uint8 &ref_length)
static void ReleaseVDBManager()
Call this to release the VDB manager if SetupVDBManager has been explicitly called in the main thread.
void AddSubjectsToScope(CRef< CScope > scope, CConstRef< CSeq_align_set > alnSet)
Populate the CScope object with subject sequence Bioseqs.
CRef< objects::CBioseq > CreateBioseqFromOid(Uint8 oid)
static IDType VDBIdType(const CSeq_id &id)
CVDBSeqInfoSrc.
virtual bool HasGiList() const
Returns true if the subject is restricted by a GI list, always returns false in this implementation.
virtual CConstRef< objects::CSeq_loc > GetSeqLoc(Uint4 oid) const
Method to retrieve the sequence location given its ordinal number.
CVDBSeqInfoSrc(CRef< CVDBBlastUtil > sraBlastUtil)
Constructor taking a CVDBBlastUtil object.
virtual bool GetMasks(Uint4 oid, const vector< TSeqRange > &target_ranges, TMaskedSubjRegions &retval) const
Retrieves the subject masks for the corresponding oid, always returns false in this implementation.
virtual size_t Size() const
Returns the size of the underlying container of sequences.
virtual list< CRef< objects::CSeq_id > > GetId(Uint4 oid) const
Method to retrieve a sequence identifier given its ordinal number.
virtual bool CanReturnPartialSequence() const
Return true if the implementation can return anything besides a seq-loc for the entire sequence.
virtual ~CVDBSeqInfoSrc()
Destructor.
virtual Uint4 GetLength(Uint4 oid) const
Method to retrieve a sequence length given its ordinal number.
CRef< CVDBBlastUtil > m_sraBlastUtil
The CVDBBlastUtil object that takes care of various conversions.
Collection of masked regions for a single query sequence.
Definition: seqlocinfo.hpp:113
const_iterator begin() const
Definition: set.hpp:135
const_iterator end() const
Definition: set.hpp:136
#define VDBSRC_OVERFLOW_RV
Definition: common_priv.h:109
void VDBSRC_ReleaseErrorMsg(TVDBErrMsg *vdbErrMsg)
Release the Error message.
Definition: error_priv.c:142
void VDBSRC_InitEmptyErrorMsg(TVDBErrMsg *vdbErrMsg)
Initialize an empty Error message (No Error).
Definition: error_priv.c:131
void VDBSRC_FormatErrorMsg(char **errMsg, const TVDBErrMsg *vdbErrMsg)
Format the error message as a single human-readable string.
Definition: error_priv.c:154
bool Empty(const CNcbiOstrstream &src)
Definition: fileutil.cpp:523
#define true
Definition: bool.h:35
static const char * str(char *buf, int n)
Definition: stats.c:84
static char tmp[3200]
Definition: utf8.c:42
Uint8 uint64_t
element_type * get(void) const
Get pointer.
Definition: ncbimisc.hpp:469
string
Definition: cgiapp.hpp:687
#define NULL
Definition: ncbistd.hpp:225
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1191
static char GetPathSeparator(void)
Get path separator symbol specific for the current platform.
Definition: ncbifile.cpp:433
static EAccessionInfo IdentifyAccession(const CTempString &accession, TParseFlags flags=fParse_AnyRaw)
Deduces information from a bare accession a la WHICH_db_accession; may report false negatives on prop...
Definition: Seq_id.cpp:1634
string GetSeqIdString(bool with_version=false) const
Return seqid string with optional version for text seqid type.
Definition: Seq_id.cpp:2145
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
Definition: Seq_id.cpp:318
static SIZE_TYPE ParseFastaIds(CBioseq::TId &ids, const CTempString &s, bool allow_partial_failure=false)
Parse an entire set of |-delimited FASTA-style IDs, appending the results to IDS.
Definition: Seq_id.cpp:2603
static int Score(const CRef< CSeq_id > &id)
Wrappers for use with FindBestChoice from <corelib/ncbiutil.hpp>
Definition: Seq_id.hpp:772
@ eAcc_unknown
Definition: Seq_id.hpp:322
@ fParse_RawText
Try to ID raw non-numeric accessions.
Definition: Seq_id.hpp:81
@ fParse_AnyRaw
Definition: Seq_id.hpp:83
void SetWhole(TWhole &v)
Definition: Seq_loc.hpp:982
CBioseq_Handle AddBioseq(CBioseq &bioseq, TPriority pri=kPriority_Default, EExist action=eExist_Throw)
Add bioseq, return bioseq handle.
Definition: scope.cpp:530
@ eExist_Get
Definition: scope.hpp:260
@ kPriority_Default
Use default priority for added data.
Definition: scope.hpp:100
bool Empty(void) const THROWS_NONE
Check if CConstRef is empty – not pointing to any object which means having a null value.
Definition: ncbiobj.hpp:1385
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
#define kEmptyStr
Definition: ncbistr.hpp:123
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3461
#define NPOS
Definition: ncbistr.hpp:133
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2891
static string & Replace(const string &src, const string &search, const string &replace, string &dst, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3314
static string UIntToString(unsigned int value, TNumToStringFlags flags=0, int base=10)
Convert UInt to string.
Definition: ncbistr.hpp:5109
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5353
static string UInt8ToString(Uint8 value, TNumToStringFlags flags=0, int base=10)
Convert UInt8 to string.
Definition: ncbistr.hpp:5168
@ fSplit_Tokenize
All delimiters are merged and trimmed, to get non-empty tokens only.
Definition: ncbistr.hpp:2508
C::value_type FindBestChoice(const C &container, F score_func)
Find the best choice (lowest score) for values in a container.
Definition: ncbiutil.hpp:250
bool IsStr(void) const
Check if variant Str is selected.
Definition: Object_id_.hpp:291
const TTag & GetTag(void) const
Get the Tag member data.
Definition: Dbtag_.hpp:267
bool CanGetDb(void) const
Check if it is safe to call GetDb method.
Definition: Dbtag_.hpp:214
bool CanGetTag(void) const
Check if it is safe to call GetTag method.
Definition: Dbtag_.hpp:261
const TDb & GetDb(void) const
Get the Db member data.
Definition: Dbtag_.hpp:220
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
const Tdata & Get(void) const
Get the member data.
bool IsGeneral(void) const
Check if variant General is selected.
Definition: Seq_id_.hpp:877
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_id_.hpp:746
const TGeneral & GetGeneral(void) const
Get the variant data.
Definition: Seq_id_.cpp:369
@ e_General
for other databases
Definition: Seq_id_.hpp:105
TId & SetId(void)
Assign a value to Id data member.
Definition: Bioseq_.hpp:296
TTitle & SetTitle(void)
Select the variant.
Definition: Seqdesc_.hpp:1039
TLength GetLength(void) const
Get the Length member data.
Definition: Seq_inst_.hpp:659
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
void SetDescr(TDescr &value)
Assign a value to Descr data member.
Definition: Bioseq_.cpp:65
void SetRepr(TRepr value)
Assign a value to Repr data member.
Definition: Seq_inst_.hpp:574
void SetLength(TLength value)
Assign a value to Length data member.
Definition: Seq_inst_.hpp:668
void SetSeq_data(TSeq_data &value)
Assign a value to Seq_data data member.
Definition: Seq_inst_.cpp:130
void SetMol(TMol value)
Assign a value to Mol data member.
Definition: Seq_inst_.hpp:621
@ eRepr_raw
continuous sequence
Definition: Seq_inst_.hpp:94
@ e_Iupacna
IUPAC 1 letter nuc acid code.
Definition: Seq_data_.hpp:104
int i
constexpr bool empty(list< Ts... >) noexcept
#define strdup
Definition: ncbi_ansi_ext.h:70
Defines to provide correct exporting from DLLs in some configurations.
Uint1 Boolean
bool replacement for C
Definition: ncbi_std.h:94
#define FALSE
bool replacement for C indicating false.
Definition: ncbi_std.h:101
#define ASSERT
macro for assert.
Definition: ncbi_std.h:107
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
BlastSeqSrc * SRABlastSeqSrcInit(const char **sraRunAccessions, Uint4 numRuns, Boolean isProtein, Boolean *excluded_runs, Uint4 *status, Boolean isCSRA, Boolean include_filtered_reads)
Allocate and initialize the SRA BlastSeqSrc object.
Definition: seqsrc_vdb.c:546
#define uint64_t
Definition: config.h:48
Complete type definition of Blast Sequence Source ADT.
Definition: blast_seqsrc.c:43
Structure providing top-level VDB data access.
Definition: vdb_priv.h:78
TVDB2naICReader * reader_2na
Definition: vdb_priv.h:94
uint64_t numSeqs
Definition: vdb_priv.h:85
char * names
Names of the VDB data represented by this object (usually this will include all the SVDB run accessio...
Definition: vdb_priv.h:92
Structure describing the error messages the library can generate.
Definition: error_priv.h:89
Boolean isError
True if the object describes an error.
Definition: error_priv.h:90
USING_SCOPE(blast)
static const char kDigits[]
static bool s_IsWGSId(const string &id)
Boolean VDBSRC_GetOIDFromReadName(TVDBData *vdbData, const char *nameRun, Int4 *oid)
Get the sequence OID given its SRA-specific sequence information.
Definition: vdb_priv.c:665
uint64_t VDBSRC_GetAvgSeqLen(TVDBData *vdbData)
Get the average sequence length in the open SRA data.
Definition: vdb_priv.c:67
int VDBSRC_IsCSRA(const char *run)
Return 1 if run is csra, 0 if not and -1 for error.
Definition: vdb_priv.c:745
VdbBlastMgr * VDBSRC_GetVDBManager(uint32_t *status)
This will call VdbBlastInit and initialize a singleton for VDBBlastMgr. This needs to be called in the...
Definition: vdb_priv.c:732
void VDBSRC_ReleaseVDBManager()
This needs to be called if VDBSRC_GetVDBManager has been called in the main thread.
Definition: vdb_priv.c:738
uint64_t VDBSRC_GetTotSeqLen(TVDBData *vdbData)
Get the total sequence length in the open SRA data.
Definition: vdb_priv.c:84
void VDBSRC_MakeCSRASeqSrcFromSRASeqSrc(TVDBData *vdbData, TVDBErrMsg *vdbErrMsg, Boolean getStats)
Definition: vdb_priv.c:788
uint64_t VDBSRC_GetMaxSeqLen(TVDBData *vdbData)
Get the maximum sequence length in the open SRA data.
Definition: vdb_priv.c:50
Boolean VDBSRC_GetReadNameForOID(TVDBData *vdbData, Int4 oid, char *name_buffer, size_t buf_size)
Get the SRA-specific sequence information for the given OID.
Definition: vdb_priv.c:368
File contains internal structures and functions for reading VDB databases.
#define VDB_2NA_CHUNK_BUF_SIZE
Definition: vdb_priv.h:275
Boolean VDBSRC_Get4naSequenceAsString(TVDBData *vdbData, uint64_t oid, char **seqIupacna, TVDBErrMsg *vdbErrMsg)
Access and convert the selected sequence to a human-readable string.
Definition: vdbsequtil.c:534
Boolean VDBSRC_Get2naSequenceAsString(TVDBData *vdbData, uint64_t oid, char **seqIupacna, TVDBErrMsg *vdbErrMsg)
Definition: vdbsequtil.c:584
void free(voidpf ptr)
Modified on Thu Apr 25 08:16:55 2024 by modify_doxy.py rev. 669887