NCBI C++ ToolKit
aln_scanner.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /*
2  * $Id: aln_scanner.cpp 93579 2021-05-01 20:54:52Z stakhovv $
3  *
4  * ===========================================================================
5  *
6  * PUBLIC DOMAIN NOTICE
7  * National Center for Biotechnology Information
8  *
9  * This software/database is a "United States Government Work" under the
10  * terms of the United States Copyright Act. It was written as part of
11  * the author's official duties as a United States Government employee and
12  * thus cannot be copyrighted. This software/database is freely available
13  * to the public for use. The National Library of Medicine and the U.S.
14  * Government have not placed any restriction on its use or reproduction.
15  *
16  * Although all reasonable efforts have been taken to ensure the accuracy
17  * and reliability of the software and data, the NLM and the U.S.
18  * Government do not and cannot warrant the performance or results that
19  * may be obtained by using this software or data. The NLM and the U.S.
20  * Government disclaim all warranties, express or implied, including
21  * warranties of performance, merchantability or fitness for any particular
22  * purpose.
23  *
24  * Please cite the author in any work or product based on this material.
25  *
26  * ===========================================================================
27  *
28  * Authors: Frank Ludwig
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbistr.hpp>
38 #include "aln_errors.hpp"
39 #include "aln_peek_ahead.hpp"
40 #include "aln_scanner_fastagap.hpp"
41 
42 
45 
46 // ----------------------------------------------------------------------------
47 void
49  CSequenceInfo& sequenceInfo,
50  CLineInput& iStr,
51  SAlignmentFile& alignInfo)
52 // ----------------------------------------------------------------------------
53 {
54  xImportAlignmentData(sequenceInfo, iStr);
55  xAdjustSequenceInfo(sequenceInfo);
56  xVerifyAlignmentData(sequenceInfo);
57  xExportAlignmentData(alignInfo);
58 }
59 
60 // ----------------------------------------------------------------------------
61 void
63  CSequenceInfo& sequenceInfo,
64  CLineInput& iStr)
65 // ----------------------------------------------------------------------------
66 {
67  throw SShowStopper(
68  -1,
70  "Input file format not recognized.");
71 }
72 
73 // ----------------------------------------------------------------------------
74 void
76  CSequenceInfo& sequenceInfo)
77 // ----------------------------------------------------------------------------
78 {
79 }
80 
81 // ----------------------------------------------------------------------------
82 void
84  const CSequenceInfo& sequenceInfo)
85 // ----------------------------------------------------------------------------
86 {
87  // make sure all sequence are of the same length(once we no longer enforce
88  // harmonized data sizes):
89 
90  // make sure all sequence characters are legal, and legal in the places where
91  // they show up:
92  for (auto i=0; i < mSequences.size(); ++i) {
93  const auto& seqData = mSequences[i];
94  const auto& seqId = mSeqIds[i];
95  xVerifySingleSequenceData(sequenceInfo, seqId, seqData);
96  }
97 }
98 
99 // ----------------------------------------------------------------------------
100 void
102  const CSequenceInfo& sequenceInfo,
103  const TLineInfo& seqIdInfo,
104  const vector<TLineInfo> lineInfos)
105 // -----------------------------------------------------------------------------
106 {
107  const char* errTempl("Bad character [%c] found at data position %d.");
108 
109  enum ESeqPart {
110  HEAD, BODY, TAIL
111  };
112  const string& alphabet = sequenceInfo.Alphabet();
113  const string legalInHead =
114  sequenceInfo.BeginningGap() + sequenceInfo.Missing();
115  const string legalInBody =
116  alphabet
117  + sequenceInfo.MiddleGap()
118  + sequenceInfo.Missing()
119  + sequenceInfo.Match();
120  const string legalInTail =
121  sequenceInfo.EndGap() + sequenceInfo.Missing();
122 
123  ESeqPart seqPart = ESeqPart::HEAD;
124 
125  for (auto lineInfo: lineInfos) {
126  if (lineInfo.mData.empty()) {
127  continue;
128  }
129  string seqData(lineInfo.mData);
130 
131  if (seqPart == ESeqPart::HEAD) {
132  auto startBody = seqData.find_first_not_of(legalInHead);
133  if (startBody == string::npos) {
134  continue;
135  }
136  seqPart = ESeqPart::BODY;
137  seqData = seqData.substr(startBody);
138  if (alphabet.find(seqData[0]) == string::npos) {
139  auto linePos = lineInfo.mData.size() - seqData.size();
140  string description = ErrorPrintf(errTempl, seqData[0], linePos);
141  throw SShowStopper(
142  lineInfo.mNumLine,
144  description,
145  seqIdInfo.mData);
146  }
147  }
148  if (seqPart == ESeqPart::BODY) {
149  auto startTail = seqData.find_first_not_of(legalInBody);
150  if (startTail == string::npos) {
151  continue;
152  }
153  seqPart = ESeqPart::TAIL;
154  seqData = seqData.substr(startTail);
155  }
156  if (seqPart == ESeqPart::TAIL) {
157  auto startBad = seqData.find_first_not_of(legalInTail);
158  if (startBad == string::npos) {
159  continue;
160  }
161  auto linePos = lineInfo.mData.size() - seqData.size() + startBad;
162  string description = ErrorPrintf(
163  errTempl, seqData[startBad], linePos);
164  throw SShowStopper(
165  lineInfo.mNumLine,
167  description,
168  seqIdInfo.mData);
169  }
170  }
171 }
172 
173 // ----------------------------------------------------------------------------
174 void
176  SAlignmentFile& alignInfo)
177 // ----------------------------------------------------------------------------
178 {
179  alignInfo.mIds.reserve(mSeqIds.size());
180  for (auto seqId: mSeqIds) {
181  alignInfo.mIds.push_back(seqId);
182  }
183 
184  alignInfo.mDeflines.assign(mDeflines.begin(), mDeflines.end());
185 
186  auto numSequences = mSequences.size();
187  alignInfo.mSequences.resize(numSequences);
188  auto index = 0;
189  for (auto sequence: mSequences) {
190  for (auto seqPart: sequence) {
191  alignInfo.mSequences[index] += seqPart.mData;
192  }
193  ++index;
194  }
195 }
196 
197 // ----------------------------------------------------------------------------
// Looks for seqId among the IDs already stored in mSeqIds, in two passes:
//   1. exact comparison against each stored mData;
//   2. comparison of lower-cased copies (NStr::ToLower), i.e. ignoring case.
// On a hit, the matching entry is copied into existingInfo.
//
// NOTE(review): this listing skips its own lines 198-199, 208, 219 and 223.
// Presumably those hold the return type and qualified function name, and the
// three return statements (one per match outcome) - the cross-reference index
// gives the return type as ESeqIdComparison. Without lines 208/219 the loops
// below would not stop at the first hit. Restore from upstream before
// building.
200  const string& seqId,
201  TLineInfo& existingInfo)
202 // ----------------------------------------------------------------------------
203 {
204 
    // Pass 1: exact, case-sensitive match.
205  for (const auto& idInfo : mSeqIds) {
206  if (seqId == idInfo.mData) {
207  existingInfo = idInfo;
209  }
210  }
211 
    // Pass 2: compare lower-cased copies, so IDs differing only by case match.
212  string seqIdLower(seqId);
213  NStr::ToLower(seqIdLower);
214  for (const auto& idInfo : mSeqIds) {
215  string idLower(idInfo.mData);
216  NStr::ToLower(idLower);
217  if (seqIdLower == idLower) {
218  existingInfo = idInfo;
220  }
221  }
222 
224 };
225 
226 // ----------------------------------------------------------------------------
227 bool
228 // ----------------------------------------------------------------------------
230  const string& seqId,
231  int index)
232 {
233  if (index >= mSeqIds.size()) {
234  return false;
235  }
236  auto seqIdCompare(mSeqIds[index].mData);
237  if (seqId.size() != seqIdCompare.size()) {
238  return false;
239  }
240 
241  return (seqId == seqIdCompare);
242 }
243 
END_ENUM_INFO string ErrorPrintf(const char *format,...)
Definition: aln_errors.cpp:99
bool xSeqIdIsEqualToInfoAt(const string &seqId, int index)
virtual void xImportAlignmentData(CSequenceInfo &, CLineInput &)
Definition: aln_scanner.cpp:62
virtual void xExportAlignmentData(SAlignmentFile &alignmentInfo)
vector< TLineInfo > mSeqIds
virtual void ProcessAlignmentFile(CSequenceInfo &, CLineInput &, SAlignmentFile &)
Definition: aln_scanner.cpp:48
vector< vector< TLineInfo > > mSequences
virtual void xAdjustSequenceInfo(CSequenceInfo &)
Definition: aln_scanner.cpp:75
ESeqIdComparison xGetExistingSeqIdInfo(const string &seqId, TLineInfo &existingInfo)
virtual void xVerifyAlignmentData(const CSequenceInfo &)
Definition: aln_scanner.cpp:83
vector< TLineInfo > mDeflines
virtual void xVerifySingleSequenceData(const CSequenceInfo &, const TLineInfo &seqId, const vector< TLineInfo > seqData)
const string & Missing() const
Definition: alnread.hpp:94
const string & BeginningGap() const
Definition: alnread.hpp:101
const string & Alphabet() const
Definition: alnread.hpp:80
const string & MiddleGap() const
Definition: alnread.hpp:108
const string & EndGap() const
Definition: alnread.hpp:115
const string & Match() const
Definition: alnread.hpp:87
vector< string > mSequences
Definition: alnread.hpp:145
vector< TLineInfo > mIds
Definition: alnread.hpp:144
vector< TLineInfo > mDeflines
Definition: alnread.hpp:146
The NCBI C++ standard methods for dealing with std::string.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
static string & ToLower(string &str)
Convert string to lower case – string& version.
Definition: ncbistr.cpp:405
#define TAIL()
int i
@ eAlnSubcode_BadDataChars
@ eAlnSubcode_UnsupportedFileFormat
string mData
Definition: alnread.hpp:52
SShowStopper.
@ HEAD
Definition: inflate.h:21
Modified on Fri Sep 20 14:58:27 2024 by modify_doxy.py rev. 669887