NCBI C++ ToolKit
bioseq_extract_data_priv.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 
2 /* ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Christiam Camacho
27  *
28  */
29 
30 /** @file bioseq_extract_data_priv.cpp
31  * Implementations of CBlastQuerySourceBioseqSet and
32  * CBlastSeqVectorFromCSeq_data classes.
33  */
34 
35 #include <ncbi_pch.hpp>
36 
37 // BLAST API includes
40 
41 // Sequence utilities includes
44 
45 // Serial includes
46 #include <serial/iterator.hpp>
47 #include <serial/enumvalues.hpp>
48 
49 // Object includes
52 #include <objects/seq/Seqdesc.hpp>
54 #include <objects/seq/Seq_inst.hpp>
55 #include <objects/general/User_object.hpp> // for has_pair
58 
59 // Private BLAST API headers
60 #include "blast_setup.hpp"
62 
63 /** @addtogroup AlgoBlast
64  *
65  * @{
66  */
67 
70 BEGIN_SCOPE(blast)
71 
72 /////////////////////////////////////////////////////////////////////////////
73 //
74 // CBlastSeqVectorFromCSeq_data
75 //
76 /////////////////////////////////////////////////////////////////////////////
77 
79  (const objects::CSeq_data& seq_data, TSeqPos length)
80 {
81  m_SequenceData.reserve(length);
82  m_Strand = eNa_strand_plus;
83 
84  switch (seq_data.Which()) {
85  // Nucleotide encodings
87  CSeqConvert::Convert(seq_data.GetNcbi2na().Get(),
88  CSeqUtil::e_Ncbi2na, 0, length,
89  m_SequenceData, CSeqUtil::e_Ncbi2na_expand);
90  m_Encoding = CSeqUtil::e_Ncbi2na_expand;
91  break;
93  CSeqConvert::Convert(seq_data.GetNcbi4na().Get(),
94  CSeqUtil::e_Ncbi4na, 0, length,
95  m_SequenceData, CSeqUtil::e_Ncbi4na_expand);
96  m_Encoding = CSeqUtil::e_Ncbi4na_expand;
97  break;
99  CSeqConvert::Convert(seq_data.GetIupacna().Get(),
100  CSeqUtil::e_Iupacna, 0, length,
101  m_SequenceData, CSeqUtil::e_Ncbi4na_expand);
102  m_Encoding = CSeqUtil::e_Ncbi4na_expand;
103  break;
104 
105  // Protein encodings
107  m_SequenceData = const_cast< vector<char>& >
108  (seq_data.GetNcbistdaa().Get());
109  m_Encoding = CSeqUtil::e_Ncbistdaa;
110  break;
111  case CSeq_data::e_Ncbieaa:
112  CSeqConvert::Convert(seq_data.GetNcbieaa().Get(),
113  CSeqUtil::e_Ncbieaa, 0, length,
114  m_SequenceData, CSeqUtil::e_Ncbistdaa);
115  m_Encoding = CSeqUtil::e_Ncbistdaa;
116  break;
118  CSeqConvert::Convert(seq_data.GetIupacaa().Get(),
119  CSeqUtil::e_Iupacaa, 0, length,
120  m_SequenceData, CSeqUtil::e_Ncbistdaa);
121  m_Encoding = CSeqUtil::e_Ncbistdaa;
122  break;
123  default:
124  NCBI_THROW(CBlastException, eNotSupported, "Encoding not handled in " +
125  string(NCBI_CURRENT_FUNCTION) + " " +
126  NStr::IntToString((int) seq_data.Which()));
127  }
128 }
129 
130 void
131 CBlastSeqVectorFromCSeq_data::SetCoding(objects::CSeq_data::E_Choice c)
132 {
133  if (c != CSeq_data::e_Ncbi2na && c != CSeq_data::e_Ncbi4na &&
134  c != CSeq_data::e_Ncbistdaa) {
135  NCBI_THROW(CBlastException, eInvalidArgument,
136  "Requesting invalid encoding, only Ncbistdaa, Ncbi4na, "
137  "and Ncbi2na are supported");
138  }
139 
141  // FIXME: are ambiguities randomized if the encoding requested is
142  // ncbi2na?
143  vector<char> tmp;
145  0, size(),
146  tmp,
148  _ASSERT(nconv == tmp.size());
149  nconv += 0; // to eliminate compiler warning
152  }
153 }
154 
155 inline TSeqPos
157 {
158  return static_cast<TSeqPos>(m_SequenceData.size());
159 }
160 
161 inline Uint1
163 {
164  // N.B.: we're not using the at() method for compatibility with GCC 2.95
165  if (pos >= x_Size()) {
166  NCBI_THROW(CCoreException, eInvalidArg,
167  "CBlastSeqVectorFromCSeq_data: position out of range");
168  }
169  return m_SequenceData[pos];
170 }
171 
174 {
176  SBlastSequence retval(size());
177  int i = 0;
178  ITERATE(vector<char>, itr, m_SequenceData) {
179  retval.data.get()[i++] = *itr;
180  }
181  return retval;
182 }
183 
184 void
186 {
187  if (GetStrand() != eNa_strand_plus) {
189  }
190 }
191 
192 void
194 {
195  if (GetStrand() != eNa_strand_minus) {
197  }
198 }
199 
200 void
202 {
204  m_Encoding, 0, size()));
205  _ASSERT(nconv == size());
206  nconv += 0; // eliminate compiler warning
207 }
208 
211 (objects::CSeq_data::E_Choice c)
212 {
213  switch (c) {
217  default: NCBI_THROW(CBlastException, eNotSupported,
218  "Encoding not handled in " +
219  string(NCBI_CURRENT_FUNCTION));
220 
221  }
222 }
223 
224 /////////////////////////////////////////////////////////////////////////////
225 //
226 // CBlastQuerySourceBioseqSet
227 //
228 /////////////////////////////////////////////////////////////////////////////
229 
231  (const objects::CBioseq_set& bss, bool is_prot)
232  : m_IsProt(is_prot)
233 {
234  // sacrifice speed for protection against infinite loops
236  for (; itr; ++itr) {
237  x_BioseqSanityCheck(*itr);
238  m_Bioseqs.push_back(CConstRef<objects::CBioseq>(&*itr));
239  }
240 }
241 
243  (const objects::CBioseq& bioseq, bool is_prot)
244  : m_IsProt(is_prot)
245 {
246  x_BioseqSanityCheck(bioseq);
247  m_Bioseqs.push_back(CConstRef<objects::CBioseq>(&bioseq));
248 }
249 
252 {
253  // Although the strand represented in the Bioseq is always the plus
254  // strand, the default for searching BLAST is both strands in the
255  // query, unless specified otherwise in the BLAST options
257 }
258 
259 TSeqPos
261 {
262  return static_cast<TSeqPos>(m_Bioseqs.size());
263 }
264 
267 {
269 }
270 
273 {
274  return TMaskedQueryRegions();
275 }
276 
279 {
280  CRef<objects::CSeq_loc> retval(new objects::CSeq_loc);
281  retval->SetWhole().Assign(*m_Bioseqs[index]->GetFirstId());
282  // FIXME: make sure this works (perhaps we need to build our own
283  // Seq-interval
284  return retval;
285 }
286 
287 const CSeq_id*
289 {
290  return m_Bioseqs[index]->GetFirstId();
291 }
292 
293 Uint4
295 {
296  Uint4 retval = numeric_limits<Uint4>::max(); // i.e.: not applicable
297  if (m_IsProt) {
298  return retval;
299  }
300 
301  ITERATE(CSeq_descr::Tdata, itr, m_Bioseqs[index]->GetDescr().Get()) {
302  if ((*itr)->IsSource()) {
303  retval = (*itr)->GetSource().GetGenCode();
304  break;
305  }
306  }
307  return retval;
308 }
309 
312  EBlastEncoding encoding,
313  objects::ENa_strand strand,
314  ESentinelType sentinel,
315  string* warnings) const
316 {
317  const objects::CSeq_inst& inst = m_Bioseqs[index]->GetInst();
318  if ( !inst.CanGetLength()) {
319  NCBI_THROW(CBlastException, eInvalidArgument,
320  "Cannot get sequence length");
321  }
322  if ( !inst.CanGetSeq_data() ) {
323  NCBI_THROW(CBlastException, eInvalidArgument,
324  "Cannot get sequence data");
325  }
326 
327  CBlastSeqVectorFromCSeq_data seq_data(inst.GetSeq_data(), inst.GetLength());
328  return GetSequence_OMF(seq_data, encoding, strand, sentinel, warnings);
329 }
330 
331 TSeqPos
333 {
334  if ( !m_Bioseqs[index]->GetInst().CanGetLength() ) {
335  NCBI_THROW(CBlastException, eInvalidArgument,
336  "Bioseq " + NStr::IntToString(index) + " does not "
337  "have its length field set");
338  }
339  return m_Bioseqs[index]->GetInst().GetLength();
340 }
341 
342 // Lifted from s_GetFastaTitle in objmgr/util/sequence.cpp as this needs to be
343 // object manager free :(
344 string
346 {
347  string retval(kEmptyStr);
348  CConstRef<CBioseq> bioseq = m_Bioseqs[index];
349  if ( !bioseq->CanGetDescr() ) {
350  return retval;
351  }
352  const CSeq_descr::Tdata& descr = bioseq->GetDescr().Get();
353  string title(kEmptyStr);
354  bool has_molinfo = false;
355  ITERATE(CSeq_descr::Tdata, desc, descr) {
356  if ((*desc)->Which() == CSeqdesc::e_Title && title == kEmptyStr) {
357  title = (*desc)->GetTitle();
358  }
359  if ((*desc)->Which() == CSeqdesc::e_Molinfo) {
360  has_molinfo = true;
361  }
362  }
363 
364  if (title != kEmptyStr && !has_molinfo) {
365  while (NStr::EndsWith(title, ".") || NStr::EndsWith(title, " ")) {
366  title.erase(title.end() - 1);
367  }
368  retval.assign(title);
369  }
370 
371  return retval;
372 }
373 
374 bool
376 {
377  return GetSegmentInfo(index) == eFirstSegment;
378 }
379 
380 
381 int
383 {
384  // FIXME: this is a hack, a better field in Bioseq may be needed to store
385  // this information
386  CConstRef<CBioseq> bioseq = m_Bioseqs[index];
387  int retval = 0;
388  if (!bioseq->CanGetDescr()) {
389  return retval;
390  }
391  const CSeq_descr::Tdata& descr = bioseq->GetDescr().Get();
392  ITERATE(CSeq_descr::Tdata, desc, descr) {
393  if ((*desc)->Which() == CSeqdesc::e_User) {
394 
395  if (!(*desc)->GetUser().IsSetType() ||
396  !(*desc)->GetUser().GetType().IsStr() ||
397  (*desc)->GetUser().GetType().GetStr() != "Mapping") {
398  continue;
399  }
400 
401  if (!(*desc)->GetUser().HasField("has_pair")) {
402  break;
403  }
404 
405  const CUser_field& field = (*desc)->GetUser().GetField("has_pair");
406  if (!field.GetData().IsInt()) {
407  break;
408  }
409 
410  retval = field.GetData().GetInt();
411  }
412  }
413 
414  return retval;
415 }
416 
417 
418 void
420 {
421  // Verify that the correct representation is used
422  switch (objects::CSeq_inst::ERepr repr = bs.GetInst().GetRepr()) {
423  case objects::CSeq_inst::eRepr_raw: break;
424  default:
425  {
426  const CEnumeratedTypeValues* p =
428  string msg = p->FindName(repr, false) + " is not supported for "
429  "BLAST query sequence data - Use object manager "
430  "interface or provide " +
431  p->FindName(CSeq_inst::eRepr_raw, false) +
432  " representation";
433  NCBI_THROW(CBlastException, eNotSupported, msg);
434  }
435  }
436 
437  // Verify that the molecule of the data is the same as the one
438  // specified by the program requested
439 
440  if ( bs.GetInst().IsAa() && !m_IsProt ) {
441  NCBI_THROW(CBlastException, eInvalidArgument,
442  "Protein Bioseq specified in program which expects "
443  "nucleotide query");
444  }
445 
446  if ( bs.GetInst().IsNa() && m_IsProt ) {
447  NCBI_THROW(CBlastException, eInvalidArgument,
448  "Nucleotide Bioseq specified in program which expects "
449  "protein query");
450  }
451 }
452 
453 
454 END_SCOPE(blast)
456 
457 /* @} */
Internal auxiliary setup classes/functions for extracting sequence data from Bioseqs.
Declares the BLAST exception class.
Declares class to encapsulate all BLAST options.
@ eFirstSegment
The first sequence of a pair with both sequences read and accepted.
Internal auxiliary setup classes/functions for C++ BLAST APIs.
Defines BLAST error codes (user errors included)
Implementation of the IBlastSeqVector interface which obtains data from a CSeq_data object.
CCoreException –.
Definition: ncbiexpt.hpp:1476
static SIZE_TYPE Convert(const CTempString &src, TCoding src_coding, TSeqPos pos, TSeqPos length, string &dst, TCoding dst_coding)
static SIZE_TYPE ReverseComplement(const string &src, TCoding src_coding, TSeqPos pos, TSeqPos length, string &dst)
@ e_Ncbi4na_expand
Definition: sequtil.hpp:51
@ e_Iupacna
Definition: sequtil.hpp:47
@ e_Ncbieaa
Definition: sequtil.hpp:57
@ e_Ncbi4na
Definition: sequtil.hpp:50
@ e_Ncbi2na_expand
Definition: sequtil.hpp:49
@ e_Ncbi2na
Definition: sequtil.hpp:48
@ e_Ncbistdaa
Definition: sequtil.hpp:58
@ e_Iupacaa
Definition: sequtil.hpp:55
Template class for iteration on objects of class C (non-modifiable version)
Definition: iterator.hpp:767
const CUser_field & GetField(const string &str, const string &delim=".", NStr::ECase use_case=NStr::eCase) const
Access a named field in this user field.
Definition: User_field.cpp:211
Collection of masked regions for a single query sequence.
Definition: seqlocinfo.hpp:113
static char tmp[3200]
Definition: utf8.c:42
void x_BioseqSanityCheck(const objects::CBioseq &bs)
Auxiliary method to do some sanity checks on the input Bioseqs.
USING_SCOPE(objects)
vector< CConstRef< objects::CBioseq > > m_Bioseqs
Contains pointers to the input data.
virtual CConstRef< objects::CSeq_loc > GetSeqLoc(int index) const
Return the CSeq_loc associated with a sequence.
EBlastEncoding
Different types of sequence encodings for sequence retrieval from the BLAST database.
virtual bool IsFirstOfAPair(int index) const
Is this sequence followed by a mate (for mapping short reads)
SBlastSequence GetSequence_OMF(IBlastSeqVector &sv, EBlastEncoding encoding, objects::ENa_strand strand, ESentinelType sentinel, std::string *warnings=0)
Object manager free version of GetSequence.
virtual TMaskedQueryRegions GetMaskedRegions(int index)
Return the filtered (masked) regions for a sequence.
virtual TSeqPos x_Size() const
@inheritDoc
virtual void x_SetMinusStrand()
@inheritDoc
objects::ENa_strand GetStrand() const
Accessor for the strand currently set.
virtual objects::ENa_strand GetStrand(int index) const
Return strand for a sequence.
virtual Uint4 GetGeneticCodeId(int index) const
Retrieve the genetic code associated with a sequence.
virtual SBlastSequence GetBlastSequence(int index, EBlastEncoding encoding, objects::ENa_strand strand, ESentinelType sentinel, string *warnings=0) const
Return the sequence data for a sequence.
virtual TSeqPos GetLength(int index) const
Return the length of a sequence.
virtual int GetSegmentInfo(int index) const
Get segment information (for mapping paired short reads)
bool m_IsProt
True if the data contained in this object is protein.
TAutoUint1Ptr data
Sequence data.
Definition: blast_setup.hpp:64
ESentinelType
Allows specification of whether sentinel bytes should be used or not.
Definition: blast_setup.hpp:93
virtual const objects::CSeq_id * GetSeqId(int index) const
Return the sequence identifier associated with a sequence.
CSeqUtil::ECoding m_Encoding
Encoding used in the data above.
CBlastQuerySourceBioseqSet(const objects::CBioseq_set &bss, bool is_prot)
Parametrized constructor for a Bioseq-set.
virtual void x_SetPlusStrand()
@inheritDoc
void x_ComplementData()
Complements the data in m_SequenceData in place.
virtual void SetCoding(objects::CSeq_data::E_Choice c)
@inheritDoc
virtual Uint1 operator[](TSeqPos pos) const
@inheritDoc
CSeqUtil::ECoding x_Encoding_CSeq_data2CSeqUtil(objects::CSeq_data::E_Choice c)
Auxiliary function to map the description of the encoding in CSeq_data::EChoice to CSeqUtil::ECoding.
vector< char > m_SequenceData
Container for the actual sequence data.
virtual SBlastSequence GetCompressedPlusStrand()
@inheritDoc
TSeqPos size() const
Returns the length of the sequence data (in the case of nucleotides, only one strand)
virtual string GetTitle(int index) const
Return the title of a sequence.
virtual TSeqPos Size() const
Return the number of elements in the sequence container.
virtual CConstRef< objects::CSeq_loc > GetMask(int index)
Return the filtered (masked) regions for a sequence.
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
element_type * get(void) const
Get pointer.
Definition: ncbimisc.hpp:469
#define NCBI_CURRENT_FUNCTION
Get current function name.
Definition: ncbidiag.hpp:142
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
const string & FindName(TEnumValueType value, bool allowBadValue) const
Find name of the enum by its numeric value.
Definition: enumerated.cpp:146
#define ENUM_METHOD_NAME(EnumName)
Definition: serialbase.hpp:994
CConstBeginInfo ConstBegin(const C &obj)
Get starting point of non-modifiable object hierarchy.
Definition: iterator.hpp:1012
@ eDetectLoops
Definition: iterator.hpp:998
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
#define kEmptyStr
Definition: ncbistr.hpp:123
static bool EndsWith(const CTempString str, const CTempString end, ECase use_case=eCase)
Check if a string ends with a specified suffix value.
Definition: ncbistr.hpp:5430
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
const TData & GetData(void) const
Get the Data member data.
bool IsInt(void) const
Check if variant Int is selected.
TInt GetInt(void) const
Get the variant data.
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
@ eNa_strand_unknown
Definition: Na_strand_.hpp:65
@ eNa_strand_both
in forward orientation
Definition: Na_strand_.hpp:68
list< CRef< CSeqdesc > > Tdata
Definition: Seq_descr_.hpp:91
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
bool CanGetDescr(void) const
Check if it is safe to call GetDescr method.
Definition: Bioseq_.hpp:309
const TDescr & GetDescr(void) const
Get the Descr member data.
Definition: Bioseq_.hpp:315
@ eRepr_raw
continuous sequence
Definition: Seq_inst_.hpp:94
@ e_Ncbieaa
extended ASCII 1 letter aa codes
Definition: Seq_data_.hpp:111
@ e_Ncbistdaa
consecutive codes for std aas
Definition: Seq_data_.hpp:113
@ e_Ncbi2na
2 bit nucleic acid code
Definition: Seq_data_.hpp:106
@ e_Iupacna
IUPAC 1 letter nuc acid code.
Definition: Seq_data_.hpp:104
@ e_Ncbi4na
4 bit nucleic acid code
Definition: Seq_data_.hpp:107
@ e_Iupacaa
IUPAC 1 letter amino acid code.
Definition: Seq_data_.hpp:105
@ e_User
user defined object
Definition: Seqdesc_.hpp:124
@ e_Molinfo
info on the molecule and techniques
Definition: Seqdesc_.hpp:134
@ e_Title
a title for this sequence
Definition: Seqdesc_.hpp:115
int i
const TYPE & Get(const CNamedParameterList *param)
T max(T x_, T y_)
Structure to store sequence data and its length for use in the CORE of BLAST (it's a malloc'ed array ...
Definition: blast_setup.hpp:62
#define _ASSERT
#define const
Definition: zconf.h:232
Modified on Wed Jun 19 17:07:32 2024 by modify_doxy.py rev. 669887