1 /* ===========================================================================
2  *
4  * National Center for Biotechnology Information
5  *
6  * This software/database is a "United States Government Work" under the
7  * terms of the United States Copyright Act. It was written as part of
8  * the author's official duties as a United States Government employee and
9  * thus cannot be copyrighted. This software/database is freely available
10  * to the public for use. The National Library of Medicine and the U.S.
11  * Government have not placed any restriction on its use or reproduction.
12  *
13  * Although all reasonable efforts have been taken to ensure the accuracy
14  * and reliability of the software and data, the NLM and the U.S.
15  * Government do not and cannot warrant the performance or results that
16  * may be obtained by using this software or data. The NLM and the U.S.
17  * Government disclaim all warranties, express or implied, including
18  * warranties of performance, merchantability or fitness for any particular
19  * purpose.
20  *
21  * Please cite the author in any work or product based on this material.
22  *
23  * ===========================================================================
24  *
25  * Author: Christiam Camacho
26  *
27  */
29 /** @file msa_pssm_input.cpp
30  * Implementation of the concrete strategy to obtain PSSM input data for
31  * PSI-BLAST from a multiple sequence alignment.
32  */
34 #include <ncbi_pch.hpp>
36 // BLAST includes
39 #include "../core/blast_psi_priv.h" // for kQueryIndex
41 // Objtools includes
42 #include <objtools/readers/aln_reader.hpp> // for CAlnReader
43 #include <objtools/readers/reader_exception.hpp> // for CObjReaderParseException
45 // Object includes
46 #include <objects/seq/Bioseq.hpp>
47 #include <objects/seq/Seq_inst.hpp>
48 #include <objects/seq/Seq_data.hpp>
51 // Serial includes
52 #include <serial/iterator.hpp> // for CTypeIterator, Begin
54 /** @addtogroup AlgoBlast
55  *
56  * @{
57  */
61 BEGIN_SCOPE(blast)
63 /// The representation of a gap in ASCII format
// The functions in this file compare alignment characters against this value
// to skip gap columns when counting or copying residues.
64 static const char kGapChar('-');
66 //////////////////////////////////////////////////////////////////////////////
// NOTE(review): the start of this definition (its name and first parameter)
// is not part of this listing. The member-initializer list below shows it is
// a constructor; `input_file`, used in the body, is presumably the missing
// first parameter -- confirm against the class declaration.
//
// What the visible code does: optionally copies a caller-supplied query,
// copies the PSI-BLAST options, reads the ASCII alignment, derives the query
// from the alignment when needed, validates it, and records the matrix name
// and an optional copy of the diagnostics request.
70  const PSIBlastOptions& opts,
71  const char* matrix_name /* = NULL */,
72  const PSIDiagnosticsRequest* diags /* = NULL */,
73  const unsigned char* query /* = NULL */,
74  unsigned int query_length /* = 0 */,
75  int gap_existence /* = 0 */,
76  int gap_extension /* = 0 */,
77  unsigned int msa_master_idx /* = 0 */)
78  : m_Query(0), m_GapExistence(gap_existence), m_GapExtension(gap_extension)
79 {
    // A caller-supplied query is copied byte-for-byte into a buffer owned by
    // m_Query. A non-NULL query with query_length == 0 is guarded only by the
    // assertion below.
80  if (query) {
81  _ASSERT(query_length);
82  m_MsaDimensions.query_length = query_length;
83  m_Query.reset(new Uint1[query_length]);
84  memcpy((void*) m_Query.get(), (void*) query, query_length);
85  }
    // Take a copy of the caller's options, then force
    // ignore_unaligned_positions on regardless of the value passed in.
87  m_Opts = opts;
88  m_Opts.ignore_unaligned_positions = true;
    // Load the alignment rows. The query is then taken from the alignment
    // when none was supplied, or when a non-zero master index was requested;
    // in the latter case x_ExtractQueryFromMsa resets m_Query, replacing any
    // caller-supplied query copied above.
90  x_ReadAsciiMsa(input_file);
91  if ( !m_Query || msa_master_idx != 0) {
92  x_ExtractQueryFromMsa(msa_master_idx);
93  }
94  x_ValidateQueryInMsa();
95  _ASSERT(m_Query);
96  _ASSERT(m_MsaDimensions.query_length);
97  // query is included in m_AsciiMsa, so decrement it by 1
98  m_MsaDimensions.num_seqs = static_cast<Uint4>(m_AsciiMsa.size() - 1);
    // The PSIMsa structure is not built in the constructor; it starts NULL.
100  m_Msa = NULL;
102  // Default value provided by base class
    // A NULL matrix_name is stored as an empty string.
103  m_MatrixName = string(matrix_name ? matrix_name : "");
    // When diagnostics are requested, a new request object is allocated and
    // the caller's struct is copied into it by plain struct assignment.
    // NOTE(review): the result of PSIDiagnosticsRequestNew() is dereferenced
    // without a NULL check -- confirm whether it can fail.
104  if (diags) {
105  m_DiagnosticsRequest = PSIDiagnosticsRequestNew();
106  *m_DiagnosticsRequest = *diags;
107  } else {
108  m_DiagnosticsRequest = NULL;
109  }
110 }
// NOTE(review): the header of this definition is not part of this listing.
// The visible body releases m_Msa through PSIMsaFree, so this is presumably
// the destructor -- confirm. The embedded numbering skips from 114 to 116, so
// a further statement may be absent from this view.
113 {
114  PSIMsaFree(m_Msa);
116 }
// Reads a multiple sequence alignment from `input_file` with CAlnReader and
// stores the aligned rows in m_AsciiMsa and the reader's Seq-entry in
// m_SeqEntry.
// NOTE(review): the line carrying the function name and parameter list is not
// part of this listing; the constructor calls x_ReadAsciiMsa(input_file),
// which is presumably this function -- confirm.
118 void
120 {
    // Pre-condition: no alignment has been loaded yet.
121  _ASSERT(m_AsciiMsa.empty());
122  CAlnReader reader(input_file);
124  try {
125  reader.Read(false, true);
126  } catch (const CObjReaderParseException& e) {
127  // Workaround to provide a more useful error message when repeated
128  // Seq-IDs are encountered
    // NOTE(review): the first half of the condition is missing from this
    // listing. The visible half matches on the reader's message text, so the
    // workaround depends on that exact wording.
130  (NStr::Find(e.GetMsg(), "Not all sequences have same length") != NPOS)) {
131  string msg("Repeated Seq-IDs detected in multiple sequence ");
132  msg += "alignment file, please ensure all Seq-IDs are unique ";
133  msg += "before proceeding.";
134  NCBI_THROW(CBlastException, eInvalidOptions, msg);
135  }
    // NOTE(review): no rethrow is visible for parse errors that do not match
    // the condition above; they appear to be swallowed here, leaving only the
    // assertions below to catch an empty result -- confirm this is intended.
136  }
137  m_AsciiMsa = reader.GetSeqs();
138  m_SeqEntry = reader.GetSeqEntry();
139  // Test our post-condition
140  _ASSERT( !m_AsciiMsa.empty() );
141  _ASSERT( !m_SeqEntry.Empty() );
142 }
144 /// Auxiliary function to retrieve the sequence data in NCBI-stdaa format from
145 /// the bioseq.
146 /// @param bioseq Bioseq to extract the data from [in]
147 /// @param query_length size of the query [in]
148 /// @param retval return value of this function [in|out]
149 /// @sa CPssm::GetQuerySequenceData
///
/// @note query_length is used only as a capacity hint for retval; the number
/// of residues appended is determined by the bioseq's sequence data. Residues
/// are appended to retval, which is not cleared first.
150 static void
151 s_GetQuerySequenceData(const CBioseq& bioseq, size_t query_length, CNCBIstdaa& retval)
152 {
153  const CSeq_data& seq_data = bioseq.GetInst().GetSeq_data();
    // Pre-size the output buffer; this does not change its length.
154  retval.Set().reserve(query_length);
155  if ( !seq_data.IsNcbistdaa() ) {
    // Data is held in another encoding: convert into a temporary CSeq_data
    // and append the converted residues.
156  CSeq_data ncbistdaa;
157  CSeqportUtil::Convert(seq_data, &ncbistdaa, CSeq_data::e_Ncbistdaa);
158  copy(ncbistdaa.GetNcbistdaa().Get().begin(),
159  ncbistdaa.GetNcbistdaa().Get().end(),
160  back_inserter(retval.Set()));
161  } else {
    // Data is already NCBI-stdaa: append it unchanged.
162  copy(seq_data.GetNcbistdaa().Get().begin(),
163  seq_data.GetNcbistdaa().Get().end(),
164  back_inserter(retval.Set()));
165  }
166 }
168 /// Returns true iff sequence is identical to query
// NOTE(review): the line carrying the function name and parameter list is not
// part of this listing. The body reads a container-like `sequence` and an
// indexable `query`; a later call s_AreSequencesEqual(sequence, m_Query.get())
// is presumably to this function -- confirm.
// Only the first sequence.Get().size() elements of `query` are compared and
// there is no bound check on `query`, so the caller must make sure `query`
// holds at least that many elements. An empty `sequence` compares equal.
169 static bool
171 {
172  bool retval = true;
173  for (TSeqPos i = 0; i < sequence.Get().size(); i++) {
    // Stop at the first differing residue.
174  if (sequence.Get()[i] != query[i]) {
175  retval = false;
176  break;
177  }
178  }
179  return retval;
180 }
// Locates, among the Bioseqs in m_SeqEntry, the one whose sequence equals the
// query and stores it in m_QueryBioseq.
// NOTE(review): the line carrying the function name is not part of this
// listing, and the pre-/post-condition statements announced by the comments
// below are also absent from this view.
182 void
184 {
185  // Test our pre-conditions
    // Visit every CBioseq contained in the Seq-entry.
189  for (CTypeIterator<CBioseq> itr(Begin(*m_SeqEntry)); itr; ++itr) {
190  _ASSERT(itr->IsAa());
    // Cheap filter: a Bioseq of a different length cannot be the query.
191  if (itr->GetLength() != GetQueryLength()) {
192  continue;
193  }
194  // let's check the sequence data
195  CNCBIstdaa sequence;
196  s_GetQuerySequenceData(*itr, GetQueryLength(), sequence);
    // The first Bioseq whose residues match m_Query is kept; the scan stops.
197  if (s_AreSequencesEqual(sequence, m_Query.get())) {
198  m_QueryBioseq.Reset(&*itr);
199  break;
200  }
201  }
    // If no Bioseq matched, the loop above leaves m_QueryBioseq unmodified.
202  // note that the title cannot be set because we're getting the query
203  // sequence from the multiple sequence alignment file via CAlnReader
205  // Test our post-condition
207 }
// NOTE(review): the line carrying the function name is not part of this
// listing, and the embedded numbering shows further gaps: 214, which
// presumably assigns m_Msa, and 219-222 after the check. Only the failure
// check on m_Msa is visible here.
209 void
211 {
212  // Create multiple alignment data structure and populate with query
213  // sequence
    // A NULL m_Msa at this point is reported as an out-of-memory condition.
215  if ( !m_Msa ) {
216  NCBI_THROW(CBlastSystemException, eOutOfMemory,
217  "Multiple alignment data structure");
218  }
223 }
// Searches the rows of m_AsciiMsa for one whose ungapped residues spell the
// query held in m_Query, and moves that row to the front of m_AsciiMsa.
// Throws CBlastException(eInvalidOptions) when no row matches.
// NOTE(review): the line carrying the function name is not part of this
// listing; the constructor calls x_ValidateQueryInMsa(), which is presumably
// this function -- confirm.
225 void
227 {
    // NOTE(review): front() is evaluated before the non-empty assertion two
    // lines below, so the assertion cannot protect this call.
228  const size_t kAligmentLength = m_AsciiMsa.front().size();
229  const char kMaskingRes = NCBISTDAA_TO_AMINOACID[kProtMask];
230  _ASSERT( !m_AsciiMsa.empty() );
232  size_t seq_idx = 0;
233  for (; seq_idx < m_AsciiMsa.size(); seq_idx++) {
    // query_idx counts how many query residues this row has matched so far.
234  size_t query_idx = 0;
    // The column scan stops as soon as the whole query has been matched, so
    // residues in later columns of the row are not examined.
    // NOTE(review): a row that merely starts with the query would therefore
    // also be accepted -- confirm this is intended.
235  for (size_t align_idx = 0;
236  align_idx < kAligmentLength && query_idx < GetQueryLength();
237  align_idx++) {
    // Gap columns do not consume a query residue.
238  if (m_AsciiMsa[seq_idx][align_idx] == kGapChar) {
239  continue;
240  }
    // Compare in ASCII, case-insensitively on the alignment side.
    // NOTE(review): toupper() receives a plain char; confirm the input is
    // restricted to 7-bit characters.
241  char query_res = NCBISTDAA_TO_AMINOACID[m_Query.get()[query_idx]];
242  const char kCurrentRes = toupper(m_AsciiMsa[seq_idx][align_idx]);
243  /* Selenocysteines are replaced by X's in query; test for this
244  * possibility */
245  if (query_res == kMaskingRes && kCurrentRes == 'U') {
246  query_res = kCurrentRes;
247  }
248  if (query_res != kCurrentRes) {
249  break; // character mismatch
250  } else {
251  query_idx++;
252  }
253  }
    // Every query residue was matched: this row is the query.
255  if (query_idx == GetQueryLength()) {
256  break;
257  }
258  }
260  if (seq_idx < m_AsciiMsa.size()) {
261  // If the query was found at position seq_idx, swap it with the first
262  // element in the m_AsciiMsa vector
    // The swap is done character by character over the length of the front
    // row; when seq_idx is 0 each character is swapped with itself.
263  for (size_t align_idx = 0; align_idx < kAligmentLength; align_idx++) {
264  swap(m_AsciiMsa[seq_idx][align_idx], m_AsciiMsa.front()[align_idx]);
265  }
266  } else {
267  string msg("No sequence in the multiple sequence alignment provided ");
268  msg += "matches the query sequence";
269  NCBI_THROW(CBlastException, eInvalidOptions, msg);
270  }
271 }
// Builds the query (m_Query and m_MsaDimensions.query_length) from one row of
// the ASCII alignment: gap characters are dropped and every remaining
// character is upper-cased and mapped through AMINOACID_TO_NCBISTDAA.
// Throws CBlastException(eInvalidOptions) when msa_master_idx is not a valid
// row index.
273 void
274 CPsiBlastInputClustalW::x_ExtractQueryFromMsa(unsigned int msa_master_idx/*=0*/)
275 {
    // NOTE(review): the check is 0-based (valid indices are 0 .. size-1) while
    // the message is worded as 1 .. size; presumably the caller converts from a
    // 1-based user value -- confirm.
276  if (msa_master_idx >= m_AsciiMsa.size()) {
277  CNcbiOstrstream oss;
278  oss << "Invalid master sequence index, please use a value between 1 "
279  << "and " << m_AsciiMsa.size();
    // NOTE(review): the final argument of this throw is missing from this
    // listing.
280  NCBI_THROW(CBlastException, eInvalidOptions,
282  }
    // NOTE(review): the initializer of kQuery is truncated in this listing;
    // presumably it selects the row of m_AsciiMsa at msa_master_idx -- confirm.
283  const string& kQuery =;
    // First pass: count gaps so the ungapped query length is known up front.
284  size_t kNumGaps = 0;
285  ITERATE(string, residue, kQuery) {
286  if (*residue == kGapChar) {
287  kNumGaps++;
288  }
289  }
290  const unsigned int kQueryLength = static_cast<unsigned int>(kQuery.size() - kNumGaps);
    // Any previously held query buffer is replaced here.
292  m_MsaDimensions.query_length = kQueryLength;
293  m_Query.reset(new Uint1[kQueryLength]);
    // Second pass: encode every non-gap character.
294  unsigned int query_idx = 0;
295  ITERATE(string, residue, kQuery) {
296  _ASSERT(isalpha(*residue) || *residue == kGapChar);
297  if (*residue == kGapChar) {
298  continue;
299  }
    // NOTE(review): the table is indexed with toupper() of a plain char;
    // non-alphabetic input is guarded only by the assertion above.
300  m_Query.get()[query_idx] = AMINOACID_TO_NCBISTDAA[toupper(*residue)];
301  query_idx++;
302  }
303  _ASSERT(query_idx == kQueryLength);
305  // Test our post-conditions
306  _ASSERT(m_Query.get() != NULL);
308 }
// Fills the query row of m_Msa (row kQueryIndex) from m_Query, using the
// first row of the ASCII alignment to decide the is_aligned flag of each
// position.
// NOTE(review): the line carrying the function name is not part of this
// listing.
310 void
312 {
313  _ASSERT(m_Msa);
314  const string& ascii_query = m_AsciiMsa.front();
    // query_idx advances only on non-gap characters, so it indexes the
    // ungapped query.
316  unsigned int query_idx = 0;
317  ITERATE(string, residue, ascii_query) {
318  if (*residue == kGapChar) {
319  continue;
320  }
    // The residue code comes from m_Query; the ASCII character supplies only
    // its case. Upper-case characters are flagged as aligned, anything else
    // as not aligned.
321  m_Msa->data[kQueryIndex][query_idx].letter = m_Query.get()[query_idx];
322  m_Msa->data[kQueryIndex][query_idx].is_aligned =
323  (isupper(*residue) ? true : false);
324  query_idx++;
325  }
326  _ASSERT(query_idx == GetQueryLength());
327 }
// Populates the non-query rows of m_Msa from the ASCII alignment, then marks
// flanking gaps and long internal gaps as unaligned.
// Only alignment columns where the first (query) row has a residue are
// copied, so each row of m_Msa is indexed by query position.
// NOTE(review): the line carrying the function name is not part of this
// listing.
329 void
331 {
    // NOTE(review): front() is evaluated before the non-empty assertion on the
    // next line, so the assertion cannot protect this call.
332  const size_t kAlignmentLength = m_AsciiMsa.front().size();
333  _ASSERT( !m_AsciiMsa.empty() );
    // Pass 1: copy residues for every row after the query row.
335  size_t seq_index = kQueryIndex + 1;
336  for (; seq_index < m_AsciiMsa.size(); seq_index++) {
337  size_t query_idx = 0;
338  for (size_t align_idx = 0; align_idx < kAlignmentLength; align_idx++) {
    // Columns that are gaps in the query row are dropped entirely.
339  if (m_AsciiMsa.front()[align_idx] == kGapChar) {
340  continue;
341  }
342  _ASSERT(toupper(m_AsciiMsa.front()[align_idx]) ==
343  NCBISTDAA_TO_AMINOACID[m_Query.get()[query_idx]]);
344  const char kCurrentRes = m_AsciiMsa[seq_index][align_idx];
345  _ASSERT(isalpha(kCurrentRes) || kCurrentRes == kGapChar);
    // Every copied cell starts out as aligned, gaps included; pass 2 below
    // clears the flag where appropriate.
346  m_Msa->data[seq_index][query_idx].letter =
347  AMINOACID_TO_NCBISTDAA[(int) toupper(kCurrentRes)];
348  m_Msa->data[seq_index][query_idx].is_aligned = true;
349  query_idx++;
350  }
351  }
353  // set flanking gaps and long internal gaps as unaligned
    // Pass 2 treats letter value 0 as a gap; presumably this is what
    // AMINOACID_TO_NCBISTDAA yields for kGapChar -- confirm.
    // An internal gap run counts as long when it spans at least kLongGapLen
    // positions.
354  seq_index = kQueryIndex + 1;
355  const int kGapResidue = 0;
356  const int kLongGapLen = 10;
357  for (; seq_index < m_Msa->dimensions->num_seqs + 1; seq_index++) {
358  size_t i = 0;
360  // find left flanking gaps
361  while (i < m_Msa->dimensions->query_length &&
362  m_Msa->data[seq_index][i].letter == kGapResidue) {
364  m_Msa->data[seq_index][i].is_aligned = false;
365  i++;
366  }
368  // find long internal gaps
369  while (i < m_Msa->dimensions->query_length) {
    // Skip the run of residues; i stops at the next gap or at the end.
370  while (i < m_Msa->dimensions->query_length &&
371  m_Msa->data[seq_index][i].letter != kGapResidue) {
372  i++;
373  }
    // k scans to the first position past the gap run that starts at i.
    // NOTE(review): k is a signed int compared with, and subtracted from,
    // unsigned values here; this is fine while k > i and lengths fit in int.
375  int k = static_cast<int>(i) + 1;
376  while (k < m_Msa->dimensions->query_length &&
377  m_Msa->data[seq_index][k].letter == kGapResidue) {
378  k++;
379  }
    // Only runs of at least kLongGapLen positions are marked unaligned.
381  if (k - i >= kLongGapLen) {
382  for (int j=static_cast<int>(i);j < k;j++) {
383  m_Msa->data[seq_index][j].is_aligned = false;
384  }
385  }
387  i = k;
388  }
390  // find right flanking gaps
    // Walk backwards from the last query position while gaps are seen.
391  int k = m_Msa->dimensions->query_length - 1;
392  while (k >= 0 && m_Msa->data[seq_index][k].letter == kGapResidue) {
393  m_Msa->data[seq_index][k].is_aligned = false;
394  k--;
395  }
396  }
397 }
399 END_SCOPE(blast)
402 /* @} */
