NCBI C++ ToolKit
sp_index.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: sp_index.cpp 99335 2023-03-13 13:48:10Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * File Name: sp_index.cpp
27  *
28  * Author: Karl Sirotkin, Hsiu-Chuan Chen
29  *
30  * File Description:
31  * Build SWISS-PROT format index block. Parsing SP to memory blocks.
32  *
33  */
34 
35 #include <ncbi_pch.hpp>
36 
37 #include "ftacpp.hpp"
38 
39 #include "index.h"
40 #include "sprot.h"
41 #include "ftaerr.hpp"
42 #include "indx_blk.h"
43 #include "indx_def.h"
44 #include "utilfun.h"
45 #include "entry.h"
46 
47 #ifdef THIS_FILE
48 # undef THIS_FILE
49 #endif
50 #define THIS_FILE "sp_index.cpp"
51 
53 
/* Two-character Swiss-Prot line-type codes, terminated by the "//"
 * end-of-entry marker.  The table is indexed with ParFlatSP_* enumerators
 * (e.g. swissProtKeywords[ParFlatSP_ID]), so the order of the elements
 * must not change.
 * NOTE(review): presumably the order mirrors the ParFlatSP_* enum declared
 * in sprot.h one-to-one -- confirm before adding or removing codes.
 */
vector<string> swissProtKeywords{
    "ID", "AC", "DT", "DE", "GN", "OS", "RN",
    "CC", "PE", "DR", "KW", "FT", "SQ", "//"
};
70 
71 /**********************************************************/
72 static bool sp_err_field(const char* name)
73 {
74  ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingField, "Missing %s line, entry dropped", name);
75  return true;
76 }
77 
78 /**********************************************************/
79 static void SPGetVerNum(char* str, IndexblkPtr ibp)
80 {
81  char* p;
82  char* q;
83 
84  if (! str || ! ibp)
85  return;
86 
87  p = StringIStr(str, "sequence version");
88  if (! p)
89  return;
90 
91  for (p += 16; *p == ' ';)
92  p++;
93  for (q = p; *p >= '0' && *p <= '9';)
94  p++;
95  if (*p == '.' && (p[1] == '\0' || p[1] == '\n')) {
96  *p = '\0';
97  ibp->vernum = atoi(q);
98  *p = '.';
99  }
100 }
101 
102 /**********************************************************
103  *
104  * bool SprotIndex(pp, (*fun)()):
105  *
106  * 3-26-93
107  *
108  **********************************************************/
/* Builds the index of a Swiss-Prot flat file: walks the input buffer entry
 * by entry, creates one Indexblk per entry, records whether the mandatory
 * line types were seen, and finally stores the blocks in pp->entrylist.
 *
 * pp:  parser state; the input is read through pp->ffbuf.  On success
 *      pp->indx and pp->entrylist are filled in; pp->curindx and
 *      pp->num_drop are updated while scanning.
 * fun: optional callback.  When non-null it is called once per entry with
 *      the index block and the data returned by LoadEntry() for that entry.
 *
 * Returns false when SkipTitleBuf() reports end of file before the first
 * "ID" line; otherwise returns the final value of end_of_file.
 *
 * NOTE(review): this listing has lost several statements from the body
 * (the declaration of `data`, the assignment of `p`, part of the inner
 * loop condition and some `if` guards).  The gaps are flagged below --
 * restore them from the repository before building.
 */
109 bool SprotIndex(ParserPtr pp, void (*fun)(IndexblkPtr entry, char* offset, Int4 len))
110 {
111  TokenStatBlkPtr stoken;
112
 /* Per-entry flags: set once the corresponding line type has been seen. */
113  bool after_AC;
114  bool after_OS;
115  bool after_OC;
116  bool after_RN;
117  bool after_SQ;
118  bool end_of_file;
119
120  IndexblkPtr entry;
 /* NOTE(review): `data` is used further down but its declaration is not
  * visible in this listing. */
122  Int4 i;
123  Int4 indx = 0;
124  IndBlkNextPtr ibnp;
125  IndBlkNextPtr tibnp;
126  char* p;
127
128  bool reviewed;
129
130  FinfoBlk finfo;
131
 /* Skip everything before the first "ID" line; give up if there is none. */
132  end_of_file = SkipTitleBuf(pp->ffbuf, finfo, swissProtKeywords[ParFlatSP_ID]);
133  if (end_of_file) {
134  MsgSkipTitleFail("Swiss-Prot", finfo);
135  return false;
136  }
137
 /* Temporary singly linked list of index blocks.  The first node holds no
  * entry (nullptr) and only serves as the list head; tibnp tracks the tail. */
138  ibnp = new IndBlkNode(nullptr);
139  tibnp = ibnp;
140
141  while (! end_of_file) {
142  entry = InitialEntry(pp, finfo);
143  if (entry) {
 /* Append the new entry to the list and count it. */
144  pp->curindx = indx;
145  tibnp->next = new IndBlkNode(entry);
146  tibnp = tibnp->next;
147
148  indx++;
149
150  after_AC = false;
151  after_OS = false;
152  after_OC = false;
153  after_RN = false;
154  after_SQ = false;
155
 /* NOTE(review): the statement that assigns `p` is missing from this
  * listing; presumably it points into the ID line -- confirm.  `reviewed`
  * is true when the text at `p` starts with "reviewed" (compared over 8
  * characters by StringEquNI). */
157  reviewed = StringEquNI(p, "reviewed", 8);
158
 /* Scan the lines of one entry.
  * NOTE(review): the remainder of this loop condition is missing from the
  * listing. */
159  while (! end_of_file &&
 /* "RM" lines are reported as an obsolete line type. */
161  if (StringEquN(finfo.str, "RM", 2)) {
162  ErrPostEx(SEV_ERROR, ERR_ENTRY_InvalidLineType, "RM line type has been replaced by RX, skipped %s", finfo.str);
163  }
 /* A line starting with a letter after the SQ line has been seen means the
  * entry terminator is missing: drop the entry and stop scanning it. */
164  if (after_SQ && isalpha(finfo.str[0]) != 0) {
165  ErrPostStr(SEV_ERROR, ERR_FORMAT_MissingEnd, "Missing end of the entry, entry dropped");
166  entry->drop = true;
167  break;
168  }
 /* NOTE(review): the `if` guarding the next assignment is missing from the
  * listing; presumably it tests for an SQ line -- confirm. */
170  after_SQ = true;
171
 /* NOTE(review): guard missing from the listing; presumably an OS-line
  * test -- confirm. */
173  after_OS = true;
174
175  if (StringEquN(finfo.str, "OC", 2))
176  after_OC = true;
177
 /* NOTE(review): guard missing from the listing; presumably an RN-line
  * test -- confirm. */
179  after_RN = true;
180
 /* NOTE(review): the opening `if (...) {` of this branch is missing from the
  * listing; presumably it tests for an AC line -- confirm.
  * The first such line calls GetAccession() with 2 as the last argument,
  * later ones (only while the entry is not dropped) with 1.  A false result
  * increments pp->num_drop. */
182  if (after_AC == false) {
183  after_AC = true;
184  if (! GetAccession(pp, finfo.str, entry, 2))
185  pp->num_drop++;
186  } else if (! entry->drop && ! GetAccession(pp, finfo.str, entry, 1))
187  pp->num_drop++;
188  } else if (StringEquN(finfo.str, swissProtKeywords[ParFlatSP_DT].c_str(), swissProtKeywords[ParFlatSP_DT].size())) {
 /* DT line: pick up the sequence version (only for "reviewed" entries, when
  * pp->sp_dt_seq_ver is set and no version >= 1 has been stored yet) ... */
189  if (reviewed && pp->sp_dt_seq_ver && entry->vernum < 1)
190  SPGetVerNum(finfo.str, entry);
 /* ... and, when the line splits into more than two space-separated tokens,
  * take the date from the second token. */
191  stoken = TokenString(finfo.str, ' ');
192  if (stoken->num > 2) {
193  entry->date = GetUpdateDate(stoken->list->next->str,
194  pp->source);
195  }
196  FreeTokenstatblk(stoken);
197  }
198
 /* Advance to the next line of the file. */
199  end_of_file = XReadFileBuf(pp->ffbuf, finfo);
200
201  } /* while, end of one entry */
202
 /* Unless the entry was already dropped, drop it if any mandatory line type
  * was not seen.  sp_err_field() always returns true, so each assignment
  * below sets entry->drop. */
203  if (! entry->drop) {
204  if (after_AC == false) {
205  ErrPostStr(SEV_ERROR, ERR_ACCESSION_NoAccessNum, "Missing AC (accession #) line, entry dropped");
206  entry->drop = true;
207  }
208
209  if (after_OS == false)
210  entry->drop = sp_err_field("OS (organism)");
211
212  if (after_OC == false)
213  entry->drop = sp_err_field("OC (organism classification)");
214
215  if (after_RN == false)
216  entry->drop = sp_err_field("RN (reference data)");
217
218  if (after_SQ == false)
219  entry->drop = sp_err_field("SQ (sequence data)");
220  }
221
 /* Entry length = current buffer offset minus the offset stored in the
  * entry's index block. */
222  entry->len = pp->ffbuf.get_offs() - entry->offset;
223
 /* Optional per-entry callback: load the entry, hand it over, release it. */
224  if (fun) {
225  data = LoadEntry(pp, entry->offset, entry->len);
226  (*fun)(entry, data->mOffset, static_cast<Int4>(data->len));
227  delete data;
228  }
229  } /* if, entry */
230  else {
 /* InitialEntry() returned null: skip ahead to the "//" keyword. */
231  end_of_file = FindNextEntryBuf(
232  end_of_file, pp->ffbuf, finfo, swissProtKeywords[ParFlatSP_END]);
233  }
 /* In either case, position the reader on the next "ID" keyword. */
234  end_of_file = FindNextEntryBuf(
235  end_of_file, pp->ffbuf, finfo, swissProtKeywords[ParFlatSP_ID]);
236
237  } /* while, end_of_file */
238
239  pp->indx = indx;
240
 /* Move the index blocks from the temporary list into pp->entrylist,
  * deleting the list nodes (head node first) as we go. */
241  pp->entrylist.resize(indx);
242  tibnp = ibnp->next;
243  delete ibnp;
244  for (i = 0; i < indx && tibnp; i++, tibnp = ibnp) {
245  pp->entrylist[i] = tibnp->ibp;
246  ibnp = tibnp->next;
247  delete tibnp;
248  }
249
 /* NOTE(review): as far as this listing shows, the outer loop is only left
  * once end_of_file is true, so this presumably returns true -- confirm. */
250  return end_of_file;
251 }
252 
DataBlkPtr LoadEntry(ParserPtr pp, size_t offset, size_t len)
Definition: entry.cpp:300
#define ERR_FORMAT_MissingEnd
Definition: flat2err.h:39
bool StringEquNI(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:125
bool StringEquN(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:115
static const char * str(char *buf, int n)
Definition: stats.c:84
int offset
Definition: replacements.h:160
char data[12]
Definition: iconv.c:80
#define SEV_ERROR
Definition: gicache.c:91
#define ErrPostStr
Definition: ncbierr.hpp:68
#define ErrPostEx(sev, err_code,...)
Definition: ncbierr.hpp:78
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
CRef< CDate_std > GetUpdateDate(const char *ptr, Parser::ESource source)
Definition: indx_blk.cpp:611
bool FindNextEntryBuf(bool end_of_file, FileBuf &fbuf, FinfoBlk &finfo, const CTempString &keyword)
Definition: indx_blk.cpp:2189
IndexblkPtr InitialEntry(ParserPtr pp, FinfoBlk &finfo)
Definition: indx_blk.cpp:788
bool SkipTitleBuf(FileBuf &fbuf, FinfoBlk &finfo, const CTempString &keyword)
Definition: indx_blk.cpp:358
bool XReadFileBuf(FileBuf &fbuf, FinfoBlk &finfo)
Definition: indx_blk.cpp:314
void MsgSkipTitleFail(const char *flatfile, FinfoBlk &finfo)
Definition: indx_blk.cpp:2181
#define ERR_ACCESSION_NoAccessNum
Definition: indx_err.h:68
#define ERR_FORMAT_MissingField
Definition: indx_err.h:42
#define ERR_ENTRY_InvalidLineType
Definition: indx_err.h:64
int i
int len
int isalpha(Uchar c)
Definition: ncbictype.hpp:61
const CConstRef< CSeq_id > GetAccession(const CSeq_id_Handle &id_handle)
static bool sp_err_field(const char *name)
Definition: sp_index.cpp:72
vector< string > swissProtKeywords
Definition: sp_index.cpp:54
bool SprotIndex(ParserPtr pp, void(*fun)(IndexblkPtr entry, char *offset, Int4 len))
Definition: sp_index.cpp:109
static void SPGetVerNum(char *str, IndexblkPtr ibp)
Definition: sp_index.cpp:79
@ ParFlatSP_SQ
Definition: sprot.h:54
@ ParFlatSP_END
Definition: sprot.h:55
@ ParFlatSP_ID
Definition: sprot.h:42
@ ParFlatSP_AC
Definition: sprot.h:43
@ ParFlatSP_RN
Definition: sprot.h:48
@ ParFlatSP_DT
Definition: sprot.h:44
@ ParFlatSP_OS
Definition: sprot.h:47
#define ParFlat_COL_DATA_SP
Definition: sprot.h:38
size_t get_offs() const
Char str[256]
Definition: indx_blk.h:42
Indexblk * ibp
Definition: indx_blk.h:56
IndBlkNode * next
Definition: indx_blk.h:57
CRef< objects::CDate_std > date
Definition: ftablock.h:190
Int2 vernum
Definition: ftablock.h:170
bool drop
Definition: ftablock.h:185
size_t len
Definition: ftablock.h:187
size_t offset
Definition: ftablock.h:171
vector< IndexblkPtr > entrylist
TokenBlk * next
Definition: ftablock.h:135
char * str
Definition: ftablock.h:134
TokenBlk * list
Definition: ftablock.h:140
Char * StringIStr(const Char *where, const Char *what)
Definition: utilfun.cpp:674
void FreeTokenstatblk(TokenStatBlkPtr tsbp)
Definition: utilfun.cpp:534
char * PointToNextToken(char *ptr)
Definition: utilfun.cpp:795
TokenStatBlkPtr TokenString(const char *str, Char delimiter)
Definition: utilfun.cpp:489
Modified on Sun Apr 14 05:24:34 2024 by modify_doxy.py rev. 669887