NCBI C++ ToolKit
em_index.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: em_index.cpp 102384 2024-04-29 11:24:44Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * File Name: em_index.cpp
27  *
28  * Author: Karl Sirotkin, Hsiu-Chuan Chen
29  *
30  * File Description:
31  * Parsing embl to blocks. Build Embl format index block.
32  *
33  */
34 
35 #include <ncbi_pch.hpp>
36 
37 #include "ftacpp.hpp"
38 
39 #include "index.h"
40 #include "embl.h"
41 
42 #include "ftaerr.hpp"
43 #include "indx_blk.h"
44 #include "indx_def.h"
45 #include "utilfun.h"
46 #include "entry.h"
47 #include "keyword_parse.hpp"
48 
49 #ifdef THIS_FILE
50 # undef THIS_FILE
51 #endif
52 #define THIS_FILE "em_index.cpp"
53 
55 
// Two-character EMBL line-type codes used by the indexer.
// EmblIndex() looks entries up by position using ParFlat_* / ParFlatEM_END
// constants (e.g. emblKeywords[ParFlat_ID]), so the order of the elements
// is significant.
// NOTE(review): the order presumably mirrors the enumerator order in
// embl.h - confirm before inserting, removing or reordering entries.
vector<string> emblKeywords = {
    "ID", "AC", "NI", "DT",
    "DE", "KW", "OS", "RN",
    "DR", "CC", "FH", "SQ",
    "SV", "CO", "AH", "PR",
    "//",
};
75 
// Line-type codes handed to CheckLineType() by EmblIndex() for every input
// line that does not start with a blank or a tab.
vector<string> checkedEmblKeywords = {
    "ID", "AC", "NI", "DT", "DE", "KW",
    "OS", "OC", "OG",
    "RN", "RP", "RX", "RC", "RG", "RA", "RT", "RL",
    "DR", "FH", "FT", "SQ", "CC", "SV", "CO",
    "XX", "AH", "AS", "PR", "//"
};
79 
80 
81 // LCOV_EXCL_START
82 // Excluded per Mark's request on 12/14/2016
83 /**********************************************************
84  *
85  * static void EmblSegment(pp):
86  *
87  * 2-24-93
88  *
89  **********************************************************/
90 static void EmblSegment(ParserPtr pp)
91 {
92  size_t i = 0;
93  int j;
94  IndexblkPtr ibp;
95  char* locus;
96 
97  locus = StringSave(pp->entrylist[0]->locusname);
98 
99  for (i = StringLen(locus); isdigit(locus[i - 1]) != 0 && i > 0; i--)
100  locus[i - 1] = '\0';
101 
102  for (j = 0; j < pp->indx; j++) {
103  ibp = pp->entrylist[j];
104  ibp->segnum = static_cast<Uint2>(j + 1);
105  ibp->segtotal = pp->indx;
106 
107  StringCpy(ibp->blocusname, locus);
108  }
109 
110  MemFree(locus);
111 }
112 // LCOV_EXCL_STOP
113 
114 /**********************************************************/
115 static bool em_err_field(const char* str)
116 {
117  ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingField, "No %s in Embl format file, entry dropped", str);
118  return true;
119 }
120 
121 /**********************************************************/
122 static void ParseEmblVersion(IndexblkPtr entry, char* line)
123 {
124  char* p;
125  char* q;
126 
127  p = StringRChr(line, '.');
128  if (! p) {
129  ErrPostEx(SEV_FATAL, ERR_VERSION_MissingVerNum, "Missing VERSION number in SV line.");
130  entry->drop = true;
131  return;
132  }
133  *p++ = '\0';
134  for (q = p; *q >= '0' && *q <= '9';)
135  q++;
136  if (*q != '\0') {
137  ErrPostEx(SEV_FATAL, ERR_VERSION_NonDigitVerNum, "Incorrect VERSION number in SV line: \"%s\".", p);
138  entry->drop = true;
139  return;
140  }
141  if (! StringEqu(entry->acnum, line)) {
142  ErrPostEx(SEV_FATAL, ERR_VERSION_AccessionsDontMatch, "Accessions in SV and AC lines don't match: \"%s\" vs \"%s\".", line, entry->acnum);
143  entry->drop = true;
144  return;
145  }
146  entry->vernum = atoi(p);
147  if (entry->vernum < 1) {
148  ErrPostEx(SEV_FATAL, ERR_VERSION_InvalidVersion, "Version number \"%d\" from Accession.Version value \"%s.%d\" is not a positive integer.", entry->vernum, entry->acnum, entry->vernum);
149  entry->drop = true;
150  }
151 }
152 
153 /**********************************************************/
154 static char* EmblGetNewIDVersion(char* locus, char* str)
155 {
156  char* res;
157  char* p;
158  char* q;
159 
160  if (! locus || ! str)
161  return nullptr;
162  p = StringChr(str, ';');
163  if (! p)
164  return nullptr;
165  for (p++; *p == ' ';)
166  p++;
167  if (p[0] != 'S' || p[1] != 'V')
168  return nullptr;
169  for (p += 2; *p == ' ';)
170  p++;
171  q = StringChr(p, ';');
172  if (! q)
173  return nullptr;
174  *q = '\0';
175 
176  string s = locus;
177  s.append(".");
178  s.append(p);
179  res = StringSave(s);
180 
181  *q = ';';
182  return (res);
183 }
184 
185 /**********************************************************
186  *
187  * bool EmblIndex(pp, (*fun)()):
188  *
189  * 3-25-93
190  *
191  **********************************************************/
192 bool EmblIndex(ParserPtr pp, void (*fun)(IndexblkPtr entry, char* offset, Int4 len))
193 {
194  TokenStatBlkPtr stoken;
195  FinfoBlk finfo;
196 
197  bool after_AC;
198  bool after_NI;
199  bool after_ID;
200  bool after_OS;
201  bool after_OC;
202  bool after_RN;
203  bool after_SQ;
204  bool after_SV;
205  bool after_DT;
206 
207  bool end_of_file;
208 
209  IndexblkPtr entry;
211  Int4 indx = 0;
212  IndBlkNextPtr ibnp;
213  IndBlkNextPtr tibnp;
214  size_t i;
215  int j;
216  char* line_sv;
217  char* p;
218  char* q;
219 
220  end_of_file = SkipTitleBuf(pp->ffbuf, finfo, emblKeywords[ParFlat_ID]);
221  if (end_of_file) {
222  MsgSkipTitleFail("Embl", finfo);
223  return false;
224  }
225 
226  bool tpa_check = (pp->source == Parser::ESource::EMBL);
227 
228  ibnp = new IndBlkNode(nullptr);
229  tibnp = ibnp;
230 
231  while (! end_of_file) {
232  entry = InitialEntry(pp, finfo);
233 
234  if (entry) {
235  pp->curindx = indx;
236  tibnp->next = new IndBlkNode(entry);
237  tibnp = tibnp->next;
238 
239  indx++;
240 
241  entry->is_contig = false;
242  entry->origin = false;
243  after_AC = false;
244  after_ID = false;
245  after_OS = false;
246  after_OC = false;
247  after_RN = false;
248  after_SQ = false;
249  after_NI = false;
250  after_SV = false;
251  after_DT = false;
252 
253  line_sv = nullptr;
254 
255  auto keywordEnd = emblKeywords[ParFlatEM_END];
256  auto keywordId = emblKeywords[ParFlat_ID];
257  auto keywordNi = emblKeywords[ParFlat_NI];
258  auto keywordAh = emblKeywords[ParFlat_AH];
259  auto keywordSq = emblKeywords[ParFlat_SQ];
260  auto keywordOs = emblKeywords[ParFlat_OS];
261  auto keywordSv = emblKeywords[ParFlat_SV];
262  auto keywordKw = emblKeywords[ParFlat_KW];
263 
264  while (! end_of_file &&
265  ! StringEquN(finfo.str, keywordEnd.c_str(), keywordEnd.size())) {
266  if (StringEquN(finfo.str, keywordKw.c_str(), 2)) {
267  if (pp->source == Parser::ESource::EMBL ||
268  pp->source == Parser::ESource::DDBJ) {
269  pp->KeywordParser().AddDataLine(finfo.str);
270  }
271  } else if (StringEquN(finfo.str, keywordId.c_str(), keywordId.size())) {
272  if (after_ID) {
273  ErrPostStr(SEV_ERROR, ERR_FORMAT_MissingEnd, "Missing end of the entry, entry dropped");
274  entry->drop = true;
275  break;
276  }
277  after_ID = true;
278  if (entry->embl_new_ID)
279  line_sv = EmblGetNewIDVersion(entry->locusname,
280  finfo.str);
281  } else if (StringEquN(finfo.str, keywordAh.c_str(), keywordAh.size())) {
282  if (entry->is_tpa == false && entry->tsa_allowed == false) {
283  ErrPostEx(SEV_ERROR, ERR_ENTRY_InvalidLineType, "Line type \"AH\" is allowed for TPA or TSA records only. Continue anyway.");
284  }
285  }
286  if (after_SQ && isalpha(finfo.str[0]) != 0) {
287  ErrPostStr(SEV_ERROR, ERR_FORMAT_MissingEnd, "Missing end of the entry, entry dropped");
288  entry->drop = true;
289  break;
290  }
291  if (StringEquN(finfo.str, keywordNi.c_str(), 2)) {
292  if (after_NI) {
293  ErrPostStr(SEV_ERROR, ERR_FORMAT_Multiple_NI, "Multiple NI lines in the entry, entry dropped");
294  entry->drop = true;
295  break;
296  }
297  after_NI = true;
298  } else if (StringEquN(finfo.str, keywordSq.c_str(), keywordSq.size())) {
299  after_SQ = true;
300  entry->origin = true;
301  } else if (StringEquN(finfo.str, keywordOs.c_str(), keywordOs.size())) {
302  if (after_OS && pp->source != Parser::ESource::EMBL) {
303  ErrPostStr(SEV_INFO, ERR_ORGANISM_Multiple, "Multiple OS lines in the entry");
304  }
305  after_OS = true;
306  }
307  if (pp->accver &&
308  StringEquN(finfo.str, keywordSv.c_str(), keywordSv.size())) {
309  if (entry->embl_new_ID) {
310  ErrPostEx(SEV_ERROR, ERR_ENTRY_InvalidLineType, "Line type \"SV\" is not allowed in conjunction with the new format of \"ID\" line. Entry dropped.");
311  entry->drop = true;
312  } else {
313  if (after_SV) {
314  ErrPostStr(SEV_FATAL, ERR_FORMAT_Multiple_SV, "Multiple SV lines in the entry");
315  entry->drop = true;
316  break;
317  }
318  after_SV = true;
319  p = finfo.str + ParFlat_COL_DATA_EMBL;
320  while (*p == ' ' || *p == '\t')
321  p++;
322  for (q = p; *q != '\0' && *q != ' ' && *q != '\t' &&
323  *q != '\n';)
324  q++;
325  i = q - p;
326  line_sv = StringNew(i);
327  StringNCpy(line_sv, p, i);
328  line_sv[i] = '\0';
329  }
330  }
331  if (StringEquN(finfo.str, "OC", 2))
332  after_OC = true;
333 
334  auto keywordRn = emblKeywords[ParFlat_RN];
335  if (StringEquN(finfo.str, keywordRn.c_str(), keywordRn.size()))
336  after_RN = true;
337 
338  auto keywordCo = emblKeywords[ParFlat_CO];
339  if (StringEquN(finfo.str, keywordCo.c_str(), keywordCo.size()))
340  entry->is_contig = true;
341 
342  auto keywordAc = emblKeywords[ParFlat_AC];
343  auto keywordDt = emblKeywords[ParFlat_DT];
344  if (StringEquN(finfo.str, keywordAc.c_str(), keywordAc.size())) {
345  if (after_AC == false) {
346  after_AC = true;
347  if (GetAccession(pp, finfo.str, entry, 2) == false)
348  pp->num_drop++;
349  } else if (! entry->drop &&
350  GetAccession(pp, finfo.str, entry, 1) == false)
351  pp->num_drop++;
352  } else if (StringEquN(finfo.str, keywordDt.c_str(), keywordDt.size())) {
353  stoken = TokenString(finfo.str, ' ');
354  if (stoken->num > 2) {
355  after_DT = true;
356  entry->date = GetUpdateDate(stoken->list->next->c_str(),
357  pp->source);
358  }
359 
360  FreeTokenstatblk(stoken);
361  }
362 
363  end_of_file = XReadFileBuf(pp->ffbuf, finfo);
364 
365  if (finfo.str[0] != ' ' && finfo.str[0] != '\t') {
366  if (CheckLineType(finfo.str, finfo.line, checkedEmblKeywords, false) == false)
367  entry->drop = true;
368  }
369  } /* while, end of one entry */
370 
372  pp->KeywordParser().KeywordList(),
373  tpa_check,
374  entry);
375 
376  entry->is_tpa_wgs_con = (entry->is_contig && entry->is_wgs && entry->is_tpa);
377 
378  if (! entry->drop) {
379  if (after_AC == false) {
380  ErrPostStr(SEV_ERROR, ERR_ACCESSION_NoAccessNum, "No AC in Embl format file, entry dropped");
381  entry->drop = true;
382  }
383 
384  if (after_ID == false)
385  entry->drop = em_err_field("ID");
386 
387  if (after_SV == false && pp->accver &&
388  entry->embl_new_ID == false)
389  entry->drop = em_err_field("Version number (SV)");
390 
391  if (after_OS == false)
392  entry->drop = em_err_field("Organism data (OS)");
393 
394  if (after_OC == false)
395  entry->drop = em_err_field("Organism data (OC)");
396 
397  if (after_RN == false)
398  entry->drop = em_err_field("Reference data");
399 
400  if (after_DT == false)
401  entry->drop = em_err_field("Update and Create dates");
402 
403  if (after_SQ == false && entry->is_contig == false)
404  entry->drop = em_err_field("Sequence data");
405  }
406  if (! entry->drop && pp->accver) {
407  ParseEmblVersion(entry, line_sv);
408  }
409  if (line_sv) {
410  MemFree(line_sv);
411  line_sv = nullptr;
412  }
413 
414  entry->len = pp->ffbuf.get_offs() - entry->offset;
415 
416  if (fun) {
417  data = LoadEntry(pp, entry->offset, entry->len);
418  (*fun)(entry, data->mOffset, static_cast<Int4>(data->len));
419  delete data;
420  }
421  } /* if, entry */
422  else {
423  end_of_file = FindNextEntryBuf(
424  end_of_file, pp->ffbuf, finfo, emblKeywords[ParFlatEM_END]);
425  }
426 
427  end_of_file = FindNextEntryBuf(
428  end_of_file, pp->ffbuf, finfo, emblKeywords[ParFlat_ID]);
429 
430  } /* while, end_of_file */
431 
432  pp->indx = indx;
433 
435 
436  if (pp->qsfd && QSIndex(pp, ibnp->next) == false)
437  return false;
438 
439  pp->entrylist.resize(indx);
440  tibnp = ibnp->next;
441  delete ibnp;
442  for (j = 0; j < indx && tibnp; j++, tibnp = ibnp) {
443  pp->entrylist[j] = tibnp->ibp;
444  ibnp = tibnp->next;
445  delete tibnp;
446  }
447 
448  if (pp->segment)
449  // LCOV_EXCL_START
450  // Excluded per Mark's request on 12/14/2016
451  EmblSegment(pp);
452  // LCOV_EXCL_STOP
453 
454  return (end_of_file);
455 }
456 
bool QSIndex(ParserPtr pp, IndBlkNextPtr ibnp)
Definition: block.cpp:207
const list< string > KeywordList() const
void AddDataLine(const string &line)
vector< string > emblKeywords
Definition: em_index.cpp:56
static char * EmblGetNewIDVersion(char *locus, char *str)
Definition: em_index.cpp:154
bool EmblIndex(ParserPtr pp, void(*fun)(IndexblkPtr entry, char *offset, Int4 len))
Definition: em_index.cpp:192
static void ParseEmblVersion(IndexblkPtr entry, char *line)
Definition: em_index.cpp:122
static void EmblSegment(ParserPtr pp)
Definition: em_index.cpp:90
static bool em_err_field(const char *str)
Definition: em_index.cpp:115
vector< string > checkedEmblKeywords
Definition: em_index.cpp:76
@ ParFlat_NI
Definition: embl.h:44
@ ParFlat_KW
Definition: embl.h:47
@ ParFlat_AH
Definition: embl.h:56
@ ParFlat_DT
Definition: embl.h:45
@ ParFlat_SQ
Definition: embl.h:53
@ ParFlat_OS
Definition: embl.h:48
@ ParFlat_CO
Definition: embl.h:55
@ ParFlat_SV
Definition: embl.h:54
@ ParFlat_RN
Definition: embl.h:49
@ ParFlat_ID
Definition: embl.h:42
@ ParFlat_AC
Definition: embl.h:43
@ ParFlatEM_END
Definition: embl.h:58
#define ParFlat_COL_DATA_EMBL
Definition: embl.h:38
DataBlkPtr LoadEntry(ParserPtr pp, size_t offset, size_t len)
Definition: entry.cpp:300
#define ERR_FORMAT_MissingEnd
Definition: flat2err.h:39
bool StringEquN(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:121
bool StringEqu(const char *s1, const char *s2)
Definition: ftacpp.hpp:111
void StringCpy(char *d, const char *s)
Definition: ftacpp.hpp:89
void StringNCpy(char *d, const char *s, size_t n)
Definition: ftacpp.hpp:90
void MemFree(char *p)
Definition: ftacpp.hpp:55
size_t StringLen(const char *s)
Definition: ftacpp.hpp:60
char * StringRChr(char *s, const char c)
Definition: ftacpp.hpp:93
char * StringNew(size_t sz)
Definition: ftacpp.hpp:43
void FtaDeletePrefix(int prefix)
Definition: ftaerr.cpp:346
#define PREFIX_LOCUS
Definition: ftaerr.hpp:15
#define PREFIX_ACCESSION
Definition: ftaerr.hpp:14
static const char * str(char *buf, int n)
Definition: stats.c:84
int offset
Definition: replacements.h:160
char data[12]
Definition: iconv.c:80
#define SEV_INFO
Definition: gicache.c:89
#define SEV_ERROR
Definition: gicache.c:91
#define SEV_FATAL
Definition: gicache.c:93
#define StringSave
Definition: ncbistr.hpp:326
#define ErrPostStr
Definition: ncbierr.hpp:68
#define StringChr
Definition: ncbistr.hpp:317
#define ErrPostEx(sev, err_code,...)
Definition: ncbierr.hpp:78
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint16_t Uint2
2-byte (16-bit) unsigned integer
Definition: ncbitype.h:101
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
CRef< CDate_std > GetUpdateDate(const char *ptr, Parser::ESource source)
Definition: indx_blk.cpp:611
bool FindNextEntryBuf(bool end_of_file, FileBuf &fbuf, FinfoBlk &finfo, const CTempString &keyword)
Definition: indx_blk.cpp:2187
IndexblkPtr InitialEntry(ParserPtr pp, FinfoBlk &finfo)
Definition: indx_blk.cpp:788
bool SkipTitleBuf(FileBuf &fbuf, FinfoBlk &finfo, const CTempString &keyword)
Definition: indx_blk.cpp:358
bool XReadFileBuf(FileBuf &fbuf, FinfoBlk &finfo)
Definition: indx_blk.cpp:314
void MsgSkipTitleFail(const char *flatfile, FinfoBlk &finfo)
Definition: indx_blk.cpp:2179
#define ERR_VERSION_NonDigitVerNum
Definition: indx_err.h:83
#define ERR_ORGANISM_Multiple
Definition: indx_err.h:97
#define ERR_VERSION_MissingVerNum
Definition: indx_err.h:82
#define ERR_ACCESSION_NoAccessNum
Definition: indx_err.h:68
#define ERR_FORMAT_Multiple_SV
Definition: indx_err.h:48
#define ERR_VERSION_InvalidVersion
Definition: indx_err.h:88
#define ERR_FORMAT_MissingField
Definition: indx_err.h:42
#define ERR_FORMAT_Multiple_NI
Definition: indx_err.h:46
#define ERR_ENTRY_InvalidLineType
Definition: indx_err.h:64
#define ERR_VERSION_AccessionsDontMatch
Definition: indx_err.h:84
int i
int len
int isalpha(Uchar c)
Definition: ncbictype.hpp:61
int isdigit(Uchar c)
Definition: ncbictype.hpp:64
const CConstRef< CSeq_id > GetAccession(const CSeq_id_Handle &id_handle)
size_t get_offs() const
Char str[256]
Definition: indx_blk.h:42
Int4 line
Definition: indx_blk.h:43
Indexblk * ibp
Definition: indx_blk.h:56
IndBlkNode * next
Definition: indx_blk.h:57
Char acnum[200]
Definition: ftablock.h:177
bool tsa_allowed
Definition: ftablock.h:222
Char blocusname[200]
Definition: ftablock.h:189
CRef< objects::CDate_std > date
Definition: ftablock.h:198
bool is_tpa_wgs_con
Definition: ftablock.h:220
Int2 vernum
Definition: ftablock.h:178
bool is_tpa
Definition: ftablock.h:217
bool embl_new_ID
Definition: ftablock.h:229
bool is_wgs
Definition: ftablock.h:216
bool origin
Definition: ftablock.h:212
bool is_contig
Definition: ftablock.h:208
bool drop
Definition: ftablock.h:193
Uint2 segtotal
Definition: ftablock.h:186
size_t len
Definition: ftablock.h:195
size_t offset
Definition: ftablock.h:179
Uint2 segnum
Definition: ftablock.h:184
Char locusname[200]
Definition: ftablock.h:181
vector< IndexblkPtr > entrylist
CKeywordParser & KeywordParser()
TokenBlk * next
Definition: ftablock.h:139
const char * c_str() const
Definition: ftablock.h:141
TokenBlk * list
Definition: ftablock.h:148
bool CheckLineType(char *ptr, Int4 line, const vector< string > &keywordList, bool after_origin)
Definition: utilfun.cpp:993
void FreeTokenstatblk(TokenStatBlkPtr tsbp)
Definition: utilfun.cpp:528
void xCheckEstStsGssTpaKeywords(const list< string > keywordList, bool tpa_check, IndexblkPtr entry)
Definition: utilfun.cpp:1418
TokenStatBlkPtr TokenString(const char *str, Char delimiter)
Definition: utilfun.cpp:479
Modified on Wed May 01 14:23:07 2024 by modify_doxy.py rev. 669887