NCBI C++ ToolKit
blast_input_aux.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: blast_input_aux.hpp 100875 2023-09-22 12:46:36Z madden $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Christiam Camacho
27  *
28  */
29 
30 /** @file blast_input_aux.hpp
31  * Auxiliary classes/functions for BLAST input library
32  */
33 
34 #ifndef ALGO_BLAST_BLASTINPUT__BLAST_INPUT_AUX__HPP
35 #define ALGO_BLAST_BLASTINPUT__BLAST_INPUT_AUX__HPP
36 
37 #include <algo/blast/api/sseqloc.hpp> /* for CBlastQueryVector */
38 #include <corelib/ncbiargs.hpp>
40 
42 BEGIN_SCOPE(blast)
43 
44 /// Auxiliary class to store the name of an output file, which is reset every
45 /// time its GetStream method is invoked. This is needed to handle files which
46 /// are over-written at every PSI-BLAST iteration (e.g.: PSSM and ASCII PSSM)
48 {
49 public:
50  /// Constructor
51  /// @param file_name name of the output file [in]
52  /// @param use_version if true, new files will be saved with versions
53  /// added to the file name
54  CAutoOutputFileReset(const string& file_name, bool use_versions = false)
55  : m_FileName(file_name), m_Version(use_versions ? 1 : 0) {}
56 
57  /// Retrieve the newly opened stream, caller doesn't own the return value
58  CNcbiOstream* GetStream();
59 
60 private:
61  /// The file's name
62  string m_FileName;
63  /// The output stream
64  unique_ptr<CNcbiOstream> m_FileStream;
65 
66  /// File version if larger than zero
67  int m_Version;
68 
69  /// Prohibit copy constructor
71  /// Prohibit assignment operator
73 };
74 
75 /// Class to constrain the length of the file name passed to a given CArgDescriptions key
77 {
78 public:
79  static constexpr Uint4 kDfltMaxLength = 256;
80 
81  CArgAllowMaximumFileNameLength(Uint4 max = kDfltMaxLength) : m_MaxLength(max) {}
82 
83 protected:
84  /// Overloaded method from CArgAllow
85  virtual bool Verify(const string& value) const {
86  CFile fname(value);
87  return fname.GetName().size() < m_MaxLength;
88  }
89 
90  /// Overloaded method from CArgAllow
91  virtual string GetUsage(void) const {
92  return "file name length < " + NStr::IntToString(m_MaxLength);
93  }
94 
95 private:
96  Uint4 m_MaxLength; /**< Maximum string length value for this object */
97 };
98 
99 /// Class to constrain the values of an argument to those greater than or equal
100 /// to the value specified in the constructor
102 {
103 public:
104  /// Constructor taking an integer
106  /// Constructor taking a double
107  CArgAllowValuesGreaterThanOrEqual(double min) : m_MinValue(min) {}
108 
109 protected:
110  /// Overloaded method from CArgAllow
111  virtual bool Verify(const string& value) const {
112  return NStr::StringToDouble(value) >= m_MinValue;
113  }
114 
115  /// Overloaded method from CArgAllow
116  virtual string GetUsage(void) const {
117  return ">=" + NStr::DoubleToString(m_MinValue);
118  }
119 
120 private:
121  double m_MinValue; /**< Minimum value for this object */
122 };
123 
124 /// Class to constrain the values of an argument to those less than or equal
125 /// to the value specified in the constructor
127 {
128 public:
129  /// Constructor taking an integer
130  CArgAllowValuesLessThanOrEqual(int max) : m_MaxValue(max) {}
131  /// Constructor taking a double
132  CArgAllowValuesLessThanOrEqual(double max) : m_MaxValue(max) {}
133 
134 protected:
135  /// Overloaded method from CArgAllow
136  virtual bool Verify(const string& value) const {
137  return NStr::StringToDouble(value) <= m_MaxValue;
138  }
139 
140  /// Overloaded method from CArgAllow
141  virtual string GetUsage(void) const {
142  return "<=" + NStr::DoubleToString(m_MaxValue);
143  }
144 
145 private:
146  double m_MaxValue; /**< Maximum value for this object */
147 };
148 
149 /// Class to constrain the values of an argument to those in between the values
150 /// specified in the constructor
152 {
153 public:
154  /// Constructor taking an integer
155  CArgAllowValuesBetween(int min, int max, bool inclusive = false)
156  : m_MinValue(min), m_MaxValue(max), m_Inclusive(inclusive) {}
157  /// Constructor taking a double
158  CArgAllowValuesBetween(double min, double max, bool inclusive = false)
159  : m_MinValue(min), m_MaxValue(max), m_Inclusive(inclusive) {}
160 
161 protected:
162  /// Overloaded method from CArgAllow
163  virtual bool Verify(const string& value) const {
164  double val = NStr::StringToDouble(value);
165  bool retval = false;
166  if ( !m_Inclusive ) {
167  retval = (val > m_MinValue && val < m_MaxValue);
168  } else {
169  retval = (val >= m_MinValue && val <= m_MaxValue);
170  }
171  return retval;
172  }
173 
174  /// Overloaded method from CArgAllow
175  virtual string GetUsage(void) const {
176  string retval;
177  if ( !m_Inclusive ) {
178  retval = "(>" + NStr::DoubleToString(m_MinValue) + " and <"
179  + NStr::DoubleToString(m_MaxValue) + ")";
180  } else {
181  retval = "(>=" + NStr::DoubleToString(m_MinValue) + " and =<"
182  + NStr::DoubleToString(m_MaxValue) + ")";
183  }
184  return retval;
185  }
186 
187 private:
188  double m_MinValue; /**< Minimum value for this object */
189  double m_MaxValue; /**< Maximum value for this object */
190  bool m_Inclusive; /**< Whether the values above should be included or not */
191 };
192 
/**
 * @brief Macro to create a subclass of CArgAllow that allows the specification
 * of sets of data
 *
 * The generated class throws std::runtime_error from its constructor when the
 * set of permissible values is empty.
 *
 * @param ClassName Name of the class to be created [in]
 * @param DataType data type of the allowed arguments [in]
 * @param String2DataTypeFn Conversion function from a string to DataType [in]
 */
#define DEFINE_CARGALLOW_SET_CLASS(ClassName, DataType, String2DataTypeFn)    \
class NCBI_BLASTINPUT_EXPORT ClassName : public CArgAllow                     \
{                                                                             \
public:                                                                       \
    ClassName(const set<DataType>& values)                                    \
        : m_AllowedValues(values)                                             \
    {                                                                         \
        if (values.empty()) {                                                 \
            throw runtime_error("Allowed values set must not be empty");      \
        }                                                                     \
    }                                                                         \
                                                                              \
protected:                                                                    \
    virtual bool Verify(const string& value) const {                          \
        DataType value2check = String2DataTypeFn(value);                      \
        /* Use the set's own lookup rather than scanning every element */     \
        return m_AllowedValues.find(value2check) != m_AllowedValues.end();    \
    }                                                                         \
                                                                              \
    virtual string GetUsage(void) const {                                     \
        CNcbiOstrstream os;                                                   \
        os << "Permissible values: ";                                         \
        for (const DataType& allowed : m_AllowedValues) {                     \
            os << "'" << allowed << "' ";                                     \
        }                                                                     \
        return CNcbiOstrstreamToString(os);                                   \
    }                                                                         \
                                                                              \
private:                                                                      \
    /* Set containing the permissible values */                               \
    set<DataType> m_AllowedValues;                                            \
}
237 
238 #ifndef SKIP_DOXYGEN_PROCESSING
241 #endif /* SKIP_DOXYGEN_PROCESSING */
242 
243 /** Parse and extract a sequence range from argument provided to this function.
244  * The format is N-M, where N and M are positive integers in 1-based offsets and
245  * N < M.
246  * @param range_str string to extract the range from [in]
247  * @param error_prefix error message prefix which will be encoded in the
248  * exception thrown in case of error (if NULL a default message will be used) [in]
249  * @return properly constructed range if parsing succeeded in 0-based offsets.
250  * @throw CStringException or CBlastException with error code eInvalidArgument
251  * if parsing fails or the range is invalid (i.e.: empty, negative, N>M,
252  * in 0-based offsets)
253  */
255 TSeqRange
256 ParseSequenceRange(const string& range_str, const char* error_prefix = NULL);
257 
258 /** Parse and extract a sequence range from argument provided to this function.
259  * The format is N-M, where N and M are positive integers in 1-based offsets and
260  * N < M. Open end range N- and single range N-N formats are supported.
261  * @param range_str string to extract the range from [in]
262  * @param error_prefix error message prefix which will be encoded in the
263  * exception thrown in case of error (if NULL a default message will be used) [in]
264  * @return properly constructed range if parsing succeeded in 0-based offsets.
265  * @throw CStringException or CBlastException with error code eInvalidArgument
266  * if parsing fails or the range is invalid (i.e.: empty, negative, N>M,
267  * in 0-based offsets)
268  */
270 TSeqRange
271 ParseSequenceRangeOpenEnd(const string& range_str, const char* error_prefix = NULL);
272 
273 /** Retrieve the appropriate batch size for the specified task
274  * @param program BLAST task [in]
275  * @param is_ungapped true if ungapped BLAST search is requested [in]
276  * @param remote true if remote BLAST search is requested [in]
277  * @param use_default true if a defaut value should be returned [in]
278  * @param task task (e.g., blastx-fast). Auto-set if empty [in]
279  * @param mt_mode thread by queries (true) or by database (false) [in]
280  */
282 int
283 GetQueryBatchSize(EProgram program, bool is_ungapped = false, bool remote = false,
284  bool use_default = true, string task = "", bool mt_mode = false);
285 
286 /** Read sequence input for BLAST
287  * @param in input stream from which to read [in]
288  * @param read_proteins expect proteins or nucleotides as input [in]
289  * @param range range restriction to apply to sequences read [in]
290  * @param parse_deflines true if the subject deflines should be parsed [in]
291  * @param use_lcase_masking true if the subject lowercase sequence characters
292  * should be interpreted as masked regions [in]
293  * @param sequences output will be placed here [in|out]
294  * @praram gaps_to_Ns convert all gaps in the sequences to Ns (only for
295  * nucleotide sequences) [in]
296  * @return CScope object which contains all the sequences read
297  */
301  bool read_proteins,
302  const TSeqRange& range,
303  bool parse_deflines,
304  bool use_lcase_masking,
305  CRef<CBlastQueryVector>& sequences,
306  bool gaps_to_Ns = false);
307 
308 /// Calculates the formatting parameters based on the maximum number of target
309 /// sequences selected (a.k.a.: hitlist size).
310 /// @param max_target_seqs the hitlist size [in]
311 /// @param num_descriptions the number of one-line descriptions to show [out]
312 /// @param num_alignments the number of alignments to show [out]
313 /// @param num_overview the number of sequences to show in the overview image
314 /// displayed in the BLAST report on the web [out]
315 /// @return string containing warnings (if any)
317 string
318 CalculateFormattingParams(TSeqPos max_target_seqs,
319  TSeqPos* num_descriptions,
320  TSeqPos* num_alignments,
321  TSeqPos* num_overview = NULL);
322 
323 /// Returns true if the Bioseq passed as argument has the full, raw sequence
324 /// data in its Seq-inst field
325 /// @param bioseq Bioseq to examine [in]
327 bool HasRawSequenceData(const objects::CBioseq& bioseq);
328 
329 /// Inspect the sequences parameter for empty sequences.
330 /// Returns a non-empty string in the warnings parameter
331 /// if there are empty sequence(s) in its first parameter.
332 /// @param sequences sequence set to inspect [in]
333 /// @param warnings populated if empty sequence(s) are found
334 /// among non-empty sequences [in|out]
335 /// @throw CInputException if there is only 1 empty sequence
337 CheckForEmptySequences(const TSeqLocVector& sequences, string& warnings);
338 
339 /// Inspect the sequences parameter for empty sequences.
340 /// Returns a non-empty string in the warnings parameter
341 /// if there are empty sequence(s) in its first parameter.
342 /// @param sequences sequence set to inspect [in]
343 /// @param warnings populated if empty sequence(s) are found
344 /// among non-empty sequences [in|out]
345 /// @throw CInputException if there is only 1 empty sequence
347 CheckForEmptySequences(CRef<CBlastQueryVector> sequences, string& warnings);
348 
349 /// Inspect the sequences parameter for empty sequences.
350 /// Returns a non-empty string in the warnings parameter
351 /// if there are empty sequence(s) in its first parameter.
352 /// @param sequences sequence set to inspect [in]
353 /// @param warnings populated if empty sequence(s) are found
354 /// among non-empty sequences [in|out]
355 /// @throw CInputException if there is only 1 empty sequence
358 
359 
360 END_SCOPE(blast)
362 
363 #endif /* ALGO_BLAST_BLASTINPUT__BLAST_INPUT_AUX__HPP */
364 
void CheckForEmptySequences(const TSeqLocVector &sequences, string &warnings)
Inspect the sequences parameter for empty sequences.
TSeqRange ParseSequenceRangeOpenEnd(const string &range_str, const char *error_prefix=NULL)
Parse and extract a sequence range from argument provided to this function.
bool HasRawSequenceData(const objects::CBioseq &bioseq)
Returns true if the Bioseq passed as argument has the full, raw sequence data in its Seq-inst field.
TSeqRange ParseSequenceRange(const string &range_str, const char *error_prefix=NULL)
Parse and extract a sequence range from argument provided to this function.
string CalculateFormattingParams(TSeqPos max_target_seqs, TSeqPos *num_descriptions, TSeqPos *num_alignments, TSeqPos *num_overview=NULL)
Calculates the formatting parameters based on the maximum number of target sequences selected (a....
#define DEFINE_CARGALLOW_SET_CLASS(ClassName, DataType, String2DataTypeFn)
Macro to create a subclass of CArgAllow that allows the specification of sets of data.
int GetQueryBatchSize(EProgram program, bool is_ungapped=false, bool remote=false, bool use_default=true, string task="", bool mt_mode=false)
Retrieve the appropriate batch size for the specified task.
CRef< objects::CScope > ReadSequencesToBlast(CNcbiIstream &in, bool read_proteins, const TSeqRange &range, bool parse_deflines, bool use_lcase_masking, CRef< CBlastQueryVector > &sequences, bool gaps_to_Ns=false)
Read sequence input for BLAST.
EProgram
This enumeration is to evolve into a task/program specific list that specifies sets of default parame...
Definition: blast_types.hpp:56
Class to constrain the length of the file name passed to a given CArgDescriptions key.
virtual bool Verify(const string &value) const
Overloaded method from CArgAllow.
virtual string GetUsage(void) const
Overloaded method from CArgAllow.
Uint4 m_MaxLength
Maximum string length value for this object.
CArgAllowMaximumFileNameLength(Uint4 max=kDfltMaxLength)
Class to constrain the values of an argument to those in between the values specified in the construc...
double m_MaxValue
Maximum value for this object.
CArgAllowValuesBetween(int min, int max, bool inclusive=false)
Constructor taking an integer.
bool m_Inclusive
Whether the values above should be included or not.
CArgAllowValuesBetween(double min, double max, bool inclusive=false)
Constructor taking a double.
double m_MinValue
Minimum value for this object.
virtual string GetUsage(void) const
Overloaded method from CArgAllow.
virtual bool Verify(const string &value) const
Overloaded method from CArgAllow.
Class to constrain the values of an argument to those greater than or equal to the value specified in...
virtual bool Verify(const string &value) const
Overloaded method from CArgAllow.
double m_MinValue
Minimum value for this object.
virtual string GetUsage(void) const
Overloaded method from CArgAllow.
CArgAllowValuesGreaterThanOrEqual(int min)
Constructor taking an integer.
CArgAllowValuesGreaterThanOrEqual(double min)
Constructor taking a double.
Class to constrain the values of an argument to those less than or equal to the value specified in th...
double m_MaxValue
Maximum value for this object.
CArgAllowValuesLessThanOrEqual(double max)
Constructor taking a double.
virtual string GetUsage(void) const
Overloaded method from CArgAllow.
CArgAllowValuesLessThanOrEqual(int max)
Constructor taking an integer.
virtual bool Verify(const string &value) const
Overloaded method from CArgAllow.
CArgAllow –.
Definition: ncbiargs.hpp:1488
Auxiliary class to store the name of an output file, which is reset every time its GetStream method i...
CAutoOutputFileReset & operator=(const CAutoOutputFileReset &rhs)
Prohibit assignment operator.
CAutoOutputFileReset(const CAutoOutputFileReset &rhs)
Prohibit copy constructor.
unique_ptr< CNcbiOstream > m_FileStream
The output stream.
string m_FileName
The file's name.
int m_Version
File version if larger than zero.
CAutoOutputFileReset(const string &file_name, bool use_versions=false)
Constructor.
CFile –.
Definition: ncbifile.hpp:1605
CObject –.
Definition: ncbiobj.hpp:180
const char * file_name[]
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define NULL
Definition: ncbistd.hpp:225
string GetName(void) const
Get the base entry name with extension (if any).
Definition: ncbifile.hpp:3917
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
static string DoubleToString(double value, int precision=-1, TNumToStringFlags flags=0)
Convert double to string.
Definition: ncbistr.hpp:5181
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static double StringToDouble(const CTempStringEx str, TStringToNumFlags flags=0)
Convert string to double.
Definition: ncbistr.cpp:1381
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5078
#define NCBI_BLASTINPUT_EXPORT
Definition: ncbi_export.h:336
range(_Ty, _Ty) -> range< _Ty >
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
Defines command line argument related classes.
T max(T x_, T y_)
T min(T x_, T y_)
std::istream & in(std::istream &in_, double &x_)
Definition of SSeqLoc structure.
vector< SSeqLoc > TSeqLocVector
Vector of sequence locations.
Definition: sseqloc.hpp:129
Modified on Fri Sep 20 14:57:09 2024 by modify_doxy.py rev. 669887