NCBI C++ ToolKit
snp_filter.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: snp_filter.cpp 36594 2016-10-12 20:17:36Z evgeniev $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Melvin Quintos
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 
34 #include <sstream>
35 #include <vector>
36 
38 #include <stdio.h>
39 
41 
42 /* The Structure of the serialized filter is as follows:
43 Three parts separated by a pipe '|'
44 <version>|<name>|<fields>
45 
46 <version> is the version of the filter format. It is an integer starting at 1
47 
48 <name> is the name of the filter. It can contain any alphanumeric character.
49 
50 <fields> are encoded properties grouped into 'words' separated by spaces.
51 A detailed explanation follows:
52 
53 The first 'word' is 9 characters representing the boolean 'check' variables.
54 The order is:
55 1 checkLinks;
56 2 checkGeneFxn;
57 3 checkMapping;
58 4 checkWeight;
59 5 checkAlleleFreqList;
60 6 checkAlleleFreqChoice;
61 7 checkGaP_Hapmap;
62 8 checkVarClass;
63 9 checkQualityCheck;
64 
65 The next 3 words are integer values of the choices
66 The order is:
67 1 choiceWeight;
68 2 choiceAlleleFreq;
69 3 choiceVarClass;
70 
71 The next 6 words are lists of ids of the specific properties. Values are separated
72 by the ':' character.
73 
74 The special character '_' denotes an empty list
75 The order is:
76 1 listLinks;
77 2 listGeneFxns;
78 3 listMappings;
79 4 listAlleleFreqs;
80 5 listGaP_Hapmaps;
81 6 listQualityChecks;
82 
83 example:
84 1|My Custom Filter|100000011 -1 -1 0 12:14:43:48 _ _ _ _ 2:3
85 */
86 
// Separator between the individual ids inside one serialized list word.
87 static const string kIdDelimiter = ":";
// Separator between the <version>, <name> and <fields> parts of a serialized filter.
88 static const string kPartDelimiter = "|";
// Backslash-escaped spelling of the part delimiter (a backslash followed by '|').
89 static const string kEscapedPartDelimiter = "\\|";
90 
// NOTE(review): the signature line for this body is not present in this
// listing; presumably this is the SSnpFilter constructor - confirm.
// The body only calls Clear() to put the filter into its cleared state.
92 {
93  Clear(); // zeroize filter
94 }
95 
// NOTE(review): the signature line for this body is not present in this
// listing; presumably this is the SSnpFilter destructor - confirm.
// The visible body is empty.
97 {
99 }
100 
// Clear(): resets the filter (signature per the cross-reference index: void Clear()).
// NOTE(review): several statements at the top of this body are missing from
// this listing; presumably they reset the check flags and the choice values -
// confirm against the repository.
102 {
106 
107 
109 
// Empty every id list so that no ids from a previous state remain.
110  listLinks.clear();
111  listGeneFxns.clear();
112  listMappings.clear();
113  listAlleleFreqs.clear();
114  listGaP_Hapmaps.clear();
115  listQualityChecks.clear();
116 }
117 
119 {
120  Clear();
121 
122  // there is a possibility that the pipe "|" may come escaped depending on the source of the filter,
123  // so it must be replaced with the canonic form
125 
126  typedef vector<string> TListString;
127  TListString tokens;
128  string strFields;
129 
130  // parse data
131  NStr::Split(input, kPartDelimiter, tokens);
132 
133  // determine version
134  if (tokens.size() < 2) {
135  return;
136  }
137  else if (tokens.size() == 2) {
138  // No version number
139  name = tokens.at(0);
140  strFields = tokens.at(1);
141  }
142  else {
143  // We have a version.
144  // For now, there is only 1 possible version number
145  // later version formats will need to handle the data differently
146  // version = tokens.at(0); // will be '1', so skip for now
147  name = tokens.at(1);
148  strFields = tokens.at(2);
149  }
150 
151  // Parse the specific properties
152  stringstream s(strFields);
153 
154  try {{
155 
156  // first word
157  string checks;
158  s >> checks;
159 
160  checkLinks = (checks.at(0)=='0') ? false : true;
161  checkGeneFxn = (checks.at(1)=='0') ? false : true;
162  checkMapping = (checks.at(2)=='0') ? false : true;
163  checkWeight = (checks.at(3)=='0') ? false : true;
164  checkAlleleFreqList = (checks.at(4)=='0') ? false : true;
165  checkAlleleFreqChoice = (checks.at(5)=='0') ? false : true;
166  checkGaP_Hapmap = (checks.at(6)=='0') ? false : true;
167  checkVarClass = (checks.at(7)=='0') ? false : true;
168  checkQualityCheck = (checks.at(8)=='0') ? false : true;
169 
170  // next 3 words for choices
172 
173  // next 6 words for lists
174  string word;
175 
176  // List Links
177  s >> word;
178  if (word.at(0) != '_') {
179  tokens.clear();
180  NStr::Split(word, kIdDelimiter, tokens);
181 
182  ITERATE(TListString, iter, tokens) {
183  int val = NStr::StringToInt(*iter);
184  listLinks.push_back(val);
185  }
186  }
187 
188  // List Gene Functions
189  s >> word;
190  if (word.at(0) != '_') {
191  tokens.clear();
192  NStr::Split(word, kIdDelimiter, tokens);
193 
194  ITERATE(TListString, iter, tokens) {
195  int val = NStr::StringToInt(*iter);
196  listGeneFxns.push_back(val);
197  }
198  }
199 
200  // List Mappings
201  s >> word;
202  if (word.at(0) != '_') {
203  tokens.clear();
204  NStr::Split(word, kIdDelimiter, tokens);
205 
206  ITERATE(TListString, iter, tokens) {
207  int val = NStr::StringToInt(*iter);
208  listMappings.push_back(val);
209  }
210  }
211 
212  // List AlleleFreqs
213  s >> word;
214  if (word.at(0) != '_') {
215  tokens.clear();
216  NStr::Split(word, kIdDelimiter, tokens);
217 
218  ITERATE(TListString, iter, tokens) {
219  int val = NStr::StringToInt(*iter);
220  listAlleleFreqs.push_back(val);
221  }
222  }
223 
224  // List Gap_Hapmaps
225  s >> word;
226  if (word.at(0) != '_') {
227  tokens.clear();
228  NStr::Split(word, kIdDelimiter, tokens);
229 
230  ITERATE(TListString, iter, tokens) {
231  int val = NStr::StringToInt(*iter);
232  listGaP_Hapmaps.push_back(val);
233  }
234  }
235 
236  // List Quality Checks
237  s >> word;
238  if (word.at(0) != '_') {
239  tokens.clear();
240  NStr::Split(word, kIdDelimiter, tokens);
241 
242  ITERATE(TListString, iter, tokens) {
243  int val = NStr::StringToInt(*iter);
244  listQualityChecks.push_back(val);
245  }
246  }
247 
248  }}
249  catch (std::out_of_range &) { /*ignore out of range */ }
250  catch (CException &) { /*ignore exceptions from NStr operations */ }
251 
252 }
253 
254 void SSnpFilter::SerializeTo(string &output) const
255 {
256  output.clear();
257 
258  // version
259  {{
260  output = "1|";
261  }}
262 
263  // name
264  {{
265  output += name + "|";
266  }}
267 
268  // first word
269  {{
270  char checks[11]; // 11 chars for word, space, and end of string characters
271  sprintf(checks, "%1d%1d%1d%1d%1d%1d%1d%1d%1d ",
272  checkLinks,
273  checkGeneFxn,
274  checkMapping,
275  checkWeight,
281  output += checks;
282  }}
283 
284  // Next 3 words
285  {{
289  }}
290 
291  // Next 6 words
292  {{
293  output += " ";
294 
295  // links
296  if (listLinks.empty()) {
297  output += "_ ";
298  }
299  else {
301  string val = NStr::IntToString(*iter);
302  output += val + kIdDelimiter;
303  }
304  output.replace(output.length()-1, 1, " "); // replace the last delimitter with space
305  }
306 
307  // gene fxn
308  if (listGeneFxns.empty()) {
309  output += "_ ";
310  }
311  else {
313  string val = NStr::IntToString(*iter);
314  output += val + kIdDelimiter;
315  }
316  output.replace(output.length()-1, 1, " "); // replace the last delimitter with space
317  }
318 
319  // mapping
320  if (listMappings.empty()) {
321  output += "_ ";
322  }
323  else {
325  string val = NStr::IntToString(*iter);
326  output += val + kIdDelimiter;
327  }
328  output.replace(output.length()-1, 1, " "); // replace the last delimitter with space
329  }
330 
331  // allelefreq
332  if (listAlleleFreqs.empty()) {
333  output += "_ ";
334  }
335  else {
337  string val = NStr::IntToString(*iter);
338  output += val + kIdDelimiter;
339  }
340  output.replace(output.length()-1, 1, " "); // replace the last delimitter with space
341  }
342 
343  // GaP Hapmap
344  if (listGaP_Hapmaps.empty()) {
345  output += "_ ";
346  }
347  else {
349  string val = NStr::IntToString(*iter);
350  output += val + kIdDelimiter;
351  }
352  output.replace(output.length()-1, 1, " "); // replace the last delimitter with space
353  }
354 
355  // Quality checks
356  if (listQualityChecks.empty()) {
357  output += "_ ";
358  }
359  else {
361  string val = NStr::IntToString(*iter);
362  output += val + kIdDelimiter;
363  }
364  output.erase(output.length()-1, 1); // erase the last delimitter
365  }
366  }}
367 }
368 
// Passes(): tests the bitfield 'b' against every enabled criterion of this
// filter (signature per the cross-reference index:
// virtual bool Passes(const CSnpBitfield &b) const).
// Returns true only when every enabled check succeeds; once 'pass' becomes
// false all remaining checks are skipped.
// NOTE(review): the loop headers and the declarations of 'prop' are missing
// from this listing, so which list each loop walks is not visible here -
// confirm against the repository.
370 {
371  bool pass = true;
372 
// Exact-match checks against the stored choice values.
373  if (pass && checkWeight) {
374  pass = (choiceWeight == b.GetWeight());
375  }
376  if (pass && checkVarClass) {
377  pass = (choiceVarClass == b.GetVariationClass());
378  }
// Mapping: every iterated id must be true in 'b'; stop at the first miss.
379  if (pass && checkMapping) {
382  prop = (CSnpBitfield::EProperty)(*iter);
383  pass = b.IsTrue(prop);
384  if (pass == false)
385  break;
386  }
387  }
// Allele frequency choice: a single property is tested.
// NOTE(review): the line that sets 'prop' is missing from this listing.
388  if (pass && checkAlleleFreqChoice) {
391  pass = b.IsTrue(prop);
392  }
// Allele frequency list: every iterated id must be true in 'b'.
393  if (pass && checkAlleleFreqList) {
396  prop = (CSnpBitfield::EProperty)(*iter);
397  pass = b.IsTrue(prop);
398  if (pass == false)
399  break;
400  }
401  }
// GaP / Hapmap: every iterated id must be true in 'b'.
402  if (pass && checkGaP_Hapmap) {
405  prop = (CSnpBitfield::EProperty)(*iter);
406  pass = b.IsTrue(prop);
407  if (pass == false)
408  break;
409  }
410  }
// Gene function: ids are cast to CSnpBitfield::EFunctionClass rather than
// EProperty, and one match is enough.
411  if (pass && checkGeneFxn) {
412  // for this property, we do a logical 'OR'.
413  // As long as one property is true, the test passes
414  bool has_prop = false;
417  prop = (CSnpBitfield::EFunctionClass)(*iter);
418  has_prop = b.IsTrue(prop);
419  if (has_prop)
420  break;
421  }
422  pass = has_prop;
423  }
// Links: every iterated id must be true in 'b'.
424  if (pass && checkLinks) {
427  prop = (CSnpBitfield::EProperty)(*iter);
428  pass = b.IsTrue(prop);
429  if (pass == false)
430  break;
431  }
432  }
// Quality checks: every iterated id must be true in 'b'.
433  if (pass && checkQualityCheck) {
436  prop = (CSnpBitfield::EProperty)(*iter);
437  pass = b.IsTrue(prop);
438  if (pass == false)
439  break;
440  }
441  }
442 
443  return pass;
444 }
445 
447 
CSnpBitfield is a facade for representing any version of the SNP bitfield.
static SQLCHAR output[256]
Definition: print.c:5
static const char * str(char *buf, int n)
Definition: stats.c:84
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3461
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
static string & ReplaceInPlace(string &src, const string &search, const string &replace, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3405
static int input()
static const string kIdDelimiter
Definition: snp_filter.cpp:87
static const string kEscapedPartDelimiter
Definition: snp_filter.cpp:89
static const string kPartDelimiter
Definition: snp_filter.cpp:88
bool checkQualityCheck
Definition: snp_filter.hpp:58
TList listGaP_Hapmaps
Definition: snp_filter.hpp:69
bool checkAlleleFreqList
Definition: snp_filter.hpp:54
TList listLinks
Definition: snp_filter.hpp:65
bool checkVarClass
Definition: snp_filter.hpp:57
bool checkLinks
Definition: snp_filter.hpp:50
bool checkWeight
Definition: snp_filter.hpp:53
bool checkGaP_Hapmap
Definition: snp_filter.hpp:56
int choiceWeight
Definition: snp_filter.hpp:60
list< int > TList
Definition: snp_filter.hpp:64
int choiceVarClass
Definition: snp_filter.hpp:62
TList listGeneFxns
Definition: snp_filter.hpp:66
TList listAlleleFreqs
Definition: snp_filter.hpp:68
bool checkAlleleFreqChoice
Definition: snp_filter.hpp:55
void SerializeTo(string &output) const
Definition: snp_filter.cpp:254
void Clear()
Definition: snp_filter.cpp:101
bool checkGeneFxn
Definition: snp_filter.hpp:51
void SerializeFrom(string input)
Definition: snp_filter.cpp:118
string name
Definition: snp_filter.hpp:48
int choiceAlleleFreq
Definition: snp_filter.hpp:61
TList listQualityChecks
Definition: snp_filter.hpp:70
virtual bool Passes(const CSnpBitfield &b) const
Definition: snp_filter.cpp:369
TList listMappings
Definition: snp_filter.hpp:67
bool checkMapping
Definition: snp_filter.hpp:52
Modified on Mon Apr 22 04:00:39 2024 by modify_doxy.py rev. 669887