NCBI C++ ToolKit
msa_pssm_input.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* ===========================================================================
2  *
3  * PUBLIC DOMAIN NOTICE
4  * National Center for Biotechnology Information
5  *
6  * This software/database is a "United States Government Work" under the
7  * terms of the United States Copyright Act. It was written as part of
8  * the author's official duties as a United States Government employee and
9  * thus cannot be copyrighted. This software/database is freely available
10  * to the public for use. The National Library of Medicine and the U.S.
11  * Government have not placed any restriction on its use or reproduction.
12  *
13  * Although all reasonable efforts have been taken to ensure the accuracy
14  * and reliability of the software and data, the NLM and the U.S.
15  * Government do not and cannot warrant the performance or results that
16  * may be obtained by using this software or data. The NLM and the U.S.
17  * Government disclaim all warranties, express or implied, including
18  * warranties of performance, merchantability or fitness for any particular
19  * purpose.
20  *
21  * Please cite the author in any work or product based on this material.
22  *
23  * ===========================================================================
24  *
25  * Author: Christiam Camacho
26  *
27  */
28 
29 /** @file msa_pssm_input.cpp
30  * Implementation of the concrete strategy to obtain PSSM input data for
31  * PSI-BLAST from a multiple sequence alignment.
32  */
33 
34 #include <ncbi_pch.hpp>
35 
36 // BLAST includes
39 #include "../core/blast_psi_priv.h" // for kQueryIndex
40 
41 // Objtools includes
42 #include <objtools/readers/aln_reader.hpp> // for CAlnReader
43 #include <objtools/readers/reader_exception.hpp> // for CObjReaderParseException
44 
45 // Object includes
46 #include <objects/seq/Bioseq.hpp>
47 #include <objects/seq/Seq_inst.hpp>
48 #include <objects/seq/Seq_data.hpp>
50 
51 // Serial includes
52 #include <serial/iterator.hpp> // for CTypeIterator, Begin
53 
54 /** @addtogroup AlgoBlast
55  *
56  * @{
57  */
58 
61 BEGIN_SCOPE(blast)
62 
63 /// The representation of a gap in ASCII format
64 static const char kGapChar('-');
65 
66 //////////////////////////////////////////////////////////////////////////////
67 
70  const PSIBlastOptions& opts,
71  const char* matrix_name /* = NULL */,
72  const PSIDiagnosticsRequest* diags /* = NULL */,
73  const unsigned char* query /* = NULL */,
74  unsigned int query_length /* = 0 */,
75  int gap_existence /* = 0 */,
76  int gap_extension /* = 0 */,
77  unsigned int msa_master_idx /* = 0 */)
78  : m_Query(0), m_GapExistence(gap_existence), m_GapExtension(gap_extension)
79 {
80  if (query) {
81  _ASSERT(query_length);
82  m_MsaDimensions.query_length = query_length;
83  m_Query.reset(new Uint1[query_length]);
84  memcpy((void*) m_Query.get(), (void*) query, query_length);
85  }
86 
87  m_Opts = opts;
88  m_Opts.ignore_unaligned_positions = true;
89 
90  x_ReadAsciiMsa(input_file);
91  if ( !m_Query || msa_master_idx != 0) {
92  x_ExtractQueryFromMsa(msa_master_idx);
93  }
94  x_ValidateQueryInMsa();
95  _ASSERT(m_Query);
96  _ASSERT(m_MsaDimensions.query_length);
97  // query is included in m_AsciiMsa, so decrement it by 1
98  m_MsaDimensions.num_seqs = static_cast<Uint4>(m_AsciiMsa.size() - 1);
99 
100  m_Msa = NULL;
101 
102  // Default value provided by base class
103  m_MatrixName = string(matrix_name ? matrix_name : "");
104  if (diags) {
105  m_DiagnosticsRequest = PSIDiagnosticsRequestNew();
106  *m_DiagnosticsRequest = *diags;
107  } else {
108  m_DiagnosticsRequest = NULL;
109  }
110 }
111 
113 {
114  PSIMsaFree(m_Msa);
116 }
117 
118 void
120 {
121  _ASSERT(m_AsciiMsa.empty());
122  CAlnReader reader(input_file);
124  try {
125  reader.Read(false, true);
126  } catch (const CObjReaderParseException& e) {
127  // Workaround to provide a more useful error message when repeated
128  // Seq-IDs are encountered
130  (NStr::Find(e.GetMsg(), "Not all sequences have same length") != NPOS)) {
131  string msg("Repeated Seq-IDs detected in multiple sequence ");
132  msg += "alignment file, please ensure all Seq-IDs are unique ";
133  msg += "before proceeding.";
134  NCBI_THROW(CBlastException, eInvalidOptions, msg);
135  }
136  }
137  m_AsciiMsa = reader.GetSeqs();
138  m_SeqEntry = reader.GetSeqEntry();
139  // Test our post-condition
140  _ASSERT( !m_AsciiMsa.empty() );
141  _ASSERT( !m_SeqEntry.Empty() );
142 }
143 
144 /// Auxiliary function to retrieve the sequence data in NCBI-stdaa format from
145 /// the bioseq.
146 /// @param bioseq Bioseq to extract the data from [in]
147 /// @param query_length size of the query [in]
148 /// @param retval return value of this function [in|out]
149 /// @sa CPssm::GetQuerySequenceData
150 static void
151 s_GetQuerySequenceData(const CBioseq& bioseq, size_t query_length, CNCBIstdaa& retval)
152 {
153  const CSeq_data& seq_data = bioseq.GetInst().GetSeq_data();
154  retval.Set().reserve(query_length);
155  if ( !seq_data.IsNcbistdaa() ) {
156  CSeq_data ncbistdaa;
157  CSeqportUtil::Convert(seq_data, &ncbistdaa, CSeq_data::e_Ncbistdaa);
158  copy(ncbistdaa.GetNcbistdaa().Get().begin(),
159  ncbistdaa.GetNcbistdaa().Get().end(),
160  back_inserter(retval.Set()));
161  } else {
162  copy(seq_data.GetNcbistdaa().Get().begin(),
163  seq_data.GetNcbistdaa().Get().end(),
164  back_inserter(retval.Set()));
165  }
166 }
167 
168 /// Returns true iff sequence is identical to query
169 static bool
171 {
172  bool retval = true;
173  for (TSeqPos i = 0; i < sequence.Get().size(); i++) {
174  if (sequence.Get()[i] != query[i]) {
175  retval = false;
176  break;
177  }
178  }
179  return retval;
180 }
181 
182 void
184 {
185  // Test our pre-conditions
188 
189  for (CTypeIterator<CBioseq> itr(Begin(*m_SeqEntry)); itr; ++itr) {
190  _ASSERT(itr->IsAa());
191  if (itr->GetLength() != GetQueryLength()) {
192  continue;
193  }
194  // let's check the sequence data
195  CNCBIstdaa sequence;
196  s_GetQuerySequenceData(*itr, GetQueryLength(), sequence);
197  if (s_AreSequencesEqual(sequence, m_Query.get())) {
198  m_QueryBioseq.Reset(&*itr);
199  break;
200  }
201  }
202  // note that the title cannot be set because we're getting the query
203  // sequence from the multiple sequence alignment file via CAlnReader
204 
205  // Test our post-condition
207 }
208 
209 void
211 {
212  // Create multiple alignment data structure and populate with query
213  // sequence
215  if ( !m_Msa ) {
216  NCBI_THROW(CBlastSystemException, eOutOfMemory,
217  "Multiple alignment data structure");
218  }
219 
223 }
224 
225 void
227 {
228  const size_t kAligmentLength = m_AsciiMsa.front().size();
229  const char kMaskingRes = NCBISTDAA_TO_AMINOACID[kProtMask];
230  _ASSERT( !m_AsciiMsa.empty() );
231 
232  size_t seq_idx = 0;
233  for (; seq_idx < m_AsciiMsa.size(); seq_idx++) {
234  size_t query_idx = 0;
235  for (size_t align_idx = 0;
236  align_idx < kAligmentLength && query_idx < GetQueryLength();
237  align_idx++) {
238  if (m_AsciiMsa[seq_idx][align_idx] == kGapChar) {
239  continue;
240  }
241  char query_res = NCBISTDAA_TO_AMINOACID[m_Query.get()[query_idx]];
242  const char kCurrentRes = toupper(m_AsciiMsa[seq_idx][align_idx]);
243  /* Selenocysteines are replaced by X's in query; test for this
244  * possibility */
245  if (query_res == kMaskingRes && kCurrentRes == 'U') {
246  query_res = kCurrentRes;
247  }
248  if (query_res != kCurrentRes) {
249  break; // character mismatch
250  } else {
251  query_idx++;
252  }
253  }
254 
255  if (query_idx == GetQueryLength()) {
256  break;
257  }
258  }
259 
260  if (seq_idx < m_AsciiMsa.size()) {
261  // If the query was found at position seq_idx, swap it with the first
262  // element in the m_AsciiMsa vector
263  for (size_t align_idx = 0; align_idx < kAligmentLength; align_idx++) {
264  swap(m_AsciiMsa[seq_idx][align_idx], m_AsciiMsa.front()[align_idx]);
265  }
266  } else {
267  string msg("No sequence in the multiple sequence alignment provided ");
268  msg += "matches the query sequence";
269  NCBI_THROW(CBlastException, eInvalidOptions, msg);
270  }
271 }
272 
273 void
274 CPsiBlastInputClustalW::x_ExtractQueryFromMsa(unsigned int msa_master_idx/*=0*/)
275 {
276  if (msa_master_idx >= m_AsciiMsa.size()) {
277  CNcbiOstrstream oss;
278  oss << "Invalid master sequence index, please use a value between 1 "
279  << "and " << m_AsciiMsa.size();
280  NCBI_THROW(CBlastException, eInvalidOptions,
282  }
283  const string& kQuery = m_AsciiMsa.at(msa_master_idx);
284  size_t kNumGaps = 0;
285  ITERATE(string, residue, kQuery) {
286  if (*residue == kGapChar) {
287  kNumGaps++;
288  }
289  }
290  const unsigned int kQueryLength = static_cast<unsigned int>(kQuery.size() - kNumGaps);
291 
292  m_MsaDimensions.query_length = kQueryLength;
293  m_Query.reset(new Uint1[kQueryLength]);
294  unsigned int query_idx = 0;
295  ITERATE(string, residue, kQuery) {
296  _ASSERT(isalpha(*residue) || *residue == kGapChar);
297  if (*residue == kGapChar) {
298  continue;
299  }
300  m_Query.get()[query_idx] = AMINOACID_TO_NCBISTDAA[toupper(*residue)];
301  query_idx++;
302  }
303  _ASSERT(query_idx == kQueryLength);
304 
305  // Test our post-conditions
306  _ASSERT(m_Query.get() != NULL);
308 }
309 
310 void
312 {
313  _ASSERT(m_Msa);
314  const string& ascii_query = m_AsciiMsa.front();
315 
316  unsigned int query_idx = 0;
317  ITERATE(string, residue, ascii_query) {
318  if (*residue == kGapChar) {
319  continue;
320  }
321  m_Msa->data[kQueryIndex][query_idx].letter = m_Query.get()[query_idx];
322  m_Msa->data[kQueryIndex][query_idx].is_aligned =
323  (isupper(*residue) ? true : false);
324  query_idx++;
325  }
326  _ASSERT(query_idx == GetQueryLength());
327 }
328 
329 void
331 {
332  const size_t kAlignmentLength = m_AsciiMsa.front().size();
333  _ASSERT( !m_AsciiMsa.empty() );
334 
335  size_t seq_index = kQueryIndex + 1;
336  for (; seq_index < m_AsciiMsa.size(); seq_index++) {
337  size_t query_idx = 0;
338  for (size_t align_idx = 0; align_idx < kAlignmentLength; align_idx++) {
339  if (m_AsciiMsa.front()[align_idx] == kGapChar) {
340  continue;
341  }
342  _ASSERT(toupper(m_AsciiMsa.front()[align_idx]) ==
343  NCBISTDAA_TO_AMINOACID[m_Query.get()[query_idx]]);
344  const char kCurrentRes = m_AsciiMsa[seq_index][align_idx];
345  _ASSERT(isalpha(kCurrentRes) || kCurrentRes == kGapChar);
346  m_Msa->data[seq_index][query_idx].letter =
347  AMINOACID_TO_NCBISTDAA[(int) toupper(kCurrentRes)];
348  m_Msa->data[seq_index][query_idx].is_aligned = true;
349  query_idx++;
350  }
351  }
352 
353  // set flanking gaps and long internal gaps as unaligned
354  seq_index = kQueryIndex + 1;
355  const int kGapResidue = 0;
356  const int kLongGapLen = 10;
357  for (; seq_index < m_Msa->dimensions->num_seqs + 1; seq_index++) {
358  size_t i = 0;
359 
360  // find left flanking gaps
361  while (i < m_Msa->dimensions->query_length &&
362  m_Msa->data[seq_index][i].letter == kGapResidue) {
363 
364  m_Msa->data[seq_index][i].is_aligned = false;
365  i++;
366  }
367 
368  // find long internal gaps
369  while (i < m_Msa->dimensions->query_length) {
370  while (i < m_Msa->dimensions->query_length &&
371  m_Msa->data[seq_index][i].letter != kGapResidue) {
372  i++;
373  }
374 
375  int k = static_cast<int>(i) + 1;
376  while (k < m_Msa->dimensions->query_length &&
377  m_Msa->data[seq_index][k].letter == kGapResidue) {
378  k++;
379  }
380 
381  if (k - i >= kLongGapLen) {
382  for (int j=static_cast<int>(i);j < k;j++) {
383  m_Msa->data[seq_index][j].is_aligned = false;
384  }
385  }
386 
387  i = k;
388  }
389 
390  // find right flanking gaps
391  int k = m_Msa->dimensions->query_length - 1;
392  while (k >= 0 && m_Msa->data[seq_index][k].letter == kGapResidue) {
393  m_Msa->data[seq_index][k].is_aligned = false;
394  k--;
395  }
396  }
397 }
398 
399 END_SCOPE(blast)
401 
402 /* @} */
#define static
Declares the BLAST exception class.
const Uint1 kProtMask
NCBISTDAA element used to mask residues in BLAST.
Definition: blast_filter.c:39
PSIMsa * PSIMsaFree(PSIMsa *msa)
Deallocates the PSIMsa structure.
Definition: blast_psi.c:513
PSIDiagnosticsRequest * PSIDiagnosticsRequestNew(void)
Allocates a PSIDiagnosticsRequest structure, setting all fields to false.
Definition: blast_psi.c:585
PSIMsa * PSIMsaNew(const PSIMsaDimensions *dimensions)
Allocates and initializes the multiple sequence alignment data structure for use as input to the PSSM...
Definition: blast_psi.c:462
PSIDiagnosticsRequest * PSIDiagnosticsRequestFree(PSIDiagnosticsRequest *diags_request)
Deallocates the PSIDiagnosticsRequest structure passed in.
Definition: blast_psi.c:611
const unsigned int kQueryIndex
Index into multiple sequence alignment structure for the query sequence.
class CAlnReader supports importing a large variety of text-based alignment formats into standard dat...
Definition: aln_reader.hpp:100
const vector< string > & GetSeqs(void) const
Definition: aln_reader.hpp:226
void Read(bool guess, bool generate_local_ids=false, objects::ILineErrorListener *pErrorListener=nullptr)
void SetClustal(EAlphabet alpha)
Definition: aln_reader.cpp:244
CRef< objects::CSeq_entry > GetSeqEntry(TFastaFlags fasta_flags=objects::CFastaReader::fAddMods, objects::ILineErrorListener *pErrorListener=nullptr)
Definition: aln_reader.cpp:722
Defines BLAST error codes (user errors included)
Defines system exceptions occurred while running BLAST.
CNCBIstdaa –.
Definition: NCBIstdaa.hpp:66
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
Definition: ncbistre.hpp:802
@ eFormat
Some of these are pretty specialized.
This class is a concrete strategy for IPssmInputData which converts the CLUSTALW-style output contain...
static TSeqPos Convert(const CSeq_data &in_seq, CSeq_data *out_seq, CSeq_data::E_Choice to_code, TSeqPos uBeginIdx=0, TSeqPos uLength=0, bool bAmbig=false, Uint4 seed=17734276)
static FILE * input_file
Definition: common.c:35
#define true
Definition: bool.h:35
static const char * kQuery
static void s_GetQuerySequenceData(const CBioseq &bioseq, size_t query_length, CNCBIstdaa &retval)
Auxiliary function to retrieve the sequence data in NCBI-stdaa format from the bioseq.
static const char kGapChar('-')
The representation of a gap in ASCII format.
PSIMsaDimensions m_MsaDimensions
Multiple sequence alignment dimensions.
void x_ValidateQueryInMsa()
Searches the query sequence (m_Query) in the aligned sequences (m_AsciiMsa) and moves the first insta...
unsigned int GetQueryLength()
Get the query's length.
void Process()
The work to process the alignment is done here.
CRef< objects::CSeq_entry > m_SeqEntry
CSeq_entry obtained from the multiple sequence alignment.
void x_CopyQueryToMsa()
Copies query sequence data to multiple alignment data structure.
TAutoUint1ArrayPtr m_Query
Pointer to query sequence.
void x_ExtractQueryForPssm()
Extracts the query bioseq from m_SeqEntry.
static bool s_AreSequencesEqual(const CNCBIstdaa &sequence, Uint1 *query)
Returns true iff sequence is identical to query.
CRef< objects::CBioseq > m_QueryBioseq
Query as CBioseq for PSSM.
const Uint1 AMINOACID_TO_NCBISTDAA[]
Translates between ncbieaa and ncbistdaa.
void x_ReadAsciiMsa(CNcbiIstream &input_file)
Reads the multiple sequence alignment from the input file.
PSIDiagnosticsRequest * m_DiagnosticsRequest
Diagnostics request structure.
void x_ExtractAlignmentData()
Populates the multiple alignment data structure.
PSIMsa * m_Msa
Structure representing the multiple sequence alignment.
vector< string > m_AsciiMsa
The raw multiple sequence alignment in ASCII read from the input file.
const char NCBISTDAA_TO_AMINOACID[]
Translates between ncbieaa and ncbistdaa.
virtual ~CPsiBlastInputClustalW()
virtual destructor
void x_ExtractQueryFromMsa(unsigned int msa_master_idx=0)
Extracts the query sequence from the multiple sequence alignment, assuming it's the first one,...
void reset(element_type *p=0, EOwnership ownership=eTakeOwnership)
Reset will delete the old pointer (if owned), set content to the new value, and assume the ownership ...
Definition: ncbimisc.hpp:480
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
element_type * get(void) const
Get pointer.
Definition: ncbimisc.hpp:469
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
Definition: ncbimisc.hpp:1508
string
Definition: cgiapp.hpp:687
#define NULL
Definition: ncbistd.hpp:225
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
TPrim & Set(void)
Definition: serialbase.hpp:351
const TPrim & Get(void) const
Definition: serialbase.hpp:347
CBeginInfo Begin(C &obj)
Get starting point of object hierarchy.
Definition: iterator.hpp:1004
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define USING_SCOPE(ns)
Use the specified namespace.
Definition: ncbistl.hpp:78
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
#define NPOS
Definition: ncbistr.hpp:133
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2891
TErrCode GetErrCode(void) const
Get error code.
Definition: ncbistr.hpp:4455
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
bool IsNcbistdaa(void) const
Check if variant Ncbistdaa is selected.
Definition: Seq_data_.hpp:684
const TNcbistdaa & GetNcbistdaa(void) const
Get the variant data.
Definition: Seq_data_.hpp:690
const TSeq_data & GetSeq_data(void) const
Get the Seq_data member data.
Definition: Seq_inst_.hpp:817
@ e_Ncbistdaa
consecutive codes for std aas
Definition: Seq_data_.hpp:113
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
Defines a concrete strategy to obtain PSSM input data for PSI-BLAST from a multiple sequence alignmen...
const struct ncbi::grid::netcache::search::fields::SIZE size
int isalpha(Uchar c)
Definition: ncbictype.hpp:61
int toupper(Uchar c)
Definition: ncbictype.hpp:73
int isupper(Uchar c)
Definition: ncbictype.hpp:70
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
Options used in protein BLAST only (PSI, PHI, RPS and translated BLAST) Some of these possibly should...
Structure to allow requesting various diagnostics data to be collected by PSSM engine.
Definition: blast_psi.h:181
Boolean is_aligned
Is this letter part of the alignment?
Definition: blast_psi.h:52
Uint1 letter
Preferred letter at this position, in ncbistdaa encoding.
Definition: blast_psi.h:50
Uint4 num_seqs
Number of distinct sequences aligned with the query (does not include the query)
Definition: blast_psi.h:59
Uint4 query_length
Length of the query.
Definition: blast_psi.h:58
PSIMsaCell ** data
actual data, dimensions are (dimensions->num_seqs+1) by (dimensions->query_length)
Definition: blast_psi.h:77
PSIMsaDimensions * dimensions
dimensions of the msa
Definition: blast_psi.h:76
static string query
#define _ASSERT
#define const
Definition: zconf.h:232
Modified on Wed Apr 17 13:09:20 2024 by modify_doxy.py rev. 669887