NCBI C++ ToolKit
utilfeat.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: utilfeat.cpp 102411 2024-05-02 10:00:24Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * File Name: utilfeat.cpp
27  *
28  * Author: Karl Sirotkin, Hsiu-Chuan Chen
29  *
30  * File Description: functions for features parsing
31  *
32  */
33 
34 #include <ncbi_pch.hpp>
35 
36 #include "ftacpp.hpp"
37 
40 #include <objects/seq/Bioseq.hpp>
45 
46 #include "index.h"
47 
50 
51 #include "ftaerr.hpp"
52 #include "asci_blk.h"
53 #include "add.h"
54 #include "utilfeat.h"
55 #include "utilfun.h"
56 
57 #ifdef THIS_FILE
58 # undef THIS_FILE
59 #endif
60 #define THIS_FILE "utilfeat.cpp"
61 
64 
/* nullptr-terminated table of genome/organelle modifier keywords.
 * NOTE(review): GetGenomeInfo() appears to branch on the index of the
 * matched entry, so the order of the entries must stay as it is -
 * confirm before reordering or inserting.
 */
const char* ParFlat_GImod[] = {
    "Mitochondr",   /*  0 */
    "Chloroplast",  /*  1 */
    "Kinetoplas",   /*  2 */
    "Cyanelle",     /*  3 */
    "Chromoplast",  /*  4 */
    "Plastid",      /*  5 */
    "Macronuclear", /*  6 */
    "Extrachrom",   /*  7 */
    "Plasmid",      /*  8 */
    "Leucoplast",   /*  9 */
    "Apicoplast",   /* 10 */
    nullptr         /* terminator */
};
79 
/* nullptr-terminated list of organelle words. It is matched
 * case-insensitively against the first word of a GB-block source
 * line in CheckDelGbblockSourceFromDescrs().
 */
const char* valid_organelle[] = {
    "apicoplast",
    "chloroplast",
    "chromatophore",
    "chromoplast",
    "cyanelle",
    "hydrogenosome",
    "kinetoplast",
    "leucoplast",
    "mitochondrion",
    "nucleomorph",
    "plastid",
    "proplastid",
    nullptr /* terminator */
};
95 
96 /**********************************************************/
97 bool SeqLocHaveFuzz(const CSeq_loc& loc)
98 {
99  bool flag;
100 
101  string loc_str;
102  loc.GetLabel(&loc_str);
103 
104  if (loc_str.find('<') == string::npos && loc_str.find('>') == string::npos)
105  flag = false;
106  else
107  flag = true;
108 
109  return (flag);
110 }
111 
112 /**********************************************************
113  *
114  * char* CpTheQualValue(qlist, qual):
115  *
116  * Return qual's value if found the "qual" in the
117  * "qlist"; otherwise, return NULL.
118  *
119  **********************************************************/
120 string CpTheQualValue(const TQualVector& qlist, const Char* qual)
121 {
122  for (const auto& cur : qlist) {
123  if (cur->GetQual() != qual)
124  continue;
125 
126  const string& val = cur->GetVal();
127  if (val == "\"\"") {
128  ErrPostEx(SEV_ERROR, ERR_FEATURE_UnknownQualSpelling, "Empty qual %s : %s", qual, val.c_str());
129  break;
130  }
131 
132  return NStr::Sanitize(val);
133  }
134 
135  return {};
136 }
137 
138 /**********************************************************
139  *
140  * char* GetTheQualValue(qlist, qual):
141  *
142  * Return qual's value if found the "qual" in the
143  * "qlist", and remove the "qual" from the qlist;
144  * otherwise, return NULL.
145  *
146  **********************************************************/
147 optional<string> GetTheQualValue(TQualVector& qlist, const Char* qual)
148 {
149  optional<string> qvalue;
150 
151  for (TQualVector::iterator cur = qlist.begin(); cur != qlist.end(); ++cur) {
152  if ((*cur)->GetQual() != qual)
153  continue;
154 
155  const string& val = (*cur)->GetVal();
156  if (val == "\"\"") {
157  ErrPostEx(SEV_ERROR, ERR_FEATURE_UnknownQualSpelling, "Empty qual %s : %s", qual, val.c_str());
158  break;
159  }
160 
161  string s = tata_save(val);
162  if (! s.empty())
163  qvalue = s;
164 
165  qlist.erase(cur);
166  break;
167  }
168 
169  return qvalue;
170 }
171 
172 /**********************************************************
173  *
174  * bool DeleteQual(qlist, qual):
175  *
176  * Return TRUE the "qual" has found in and removed
177  * from the "qlist".
178  *
179  **********************************************************/
180 bool DeleteQual(TQualVector& qlist, const Char* qual)
181 {
182  bool got = false;
183  for (TQualVector::iterator cur = qlist.begin(); cur != qlist.end();) {
184  if ((*cur)->GetQual() != qual) {
185  ++cur;
186  continue;
187  }
188 
189  cur = qlist.erase(cur);
190  got = true;
191  }
192 
193  return (got);
194 }
195 
196 /**********************************************************
197  *
198  * Uint1 GetQualValueAa(qual, checkseq):
199  *
200  * Return 255 if not a valid amino acid, not in
201  * "ParFlat_AA_array".
202  *
203  **********************************************************/
204 Uint1 GetQualValueAa(const char* qval, bool checkseq)
205 {
206  const char* str;
207  const char* p;
208 
209  str = StringStr(qval, "aa:");
210  if (! str)
211  return (255);
212 
213  for (str += 3; *str == ' ';)
214  str++;
215  for (p = str; (*p >= 'a' && *p <= 'z') || (*p >= 'A' && *p <= 'Z');)
216  p++;
217 
218  if (checkseq && ! StringStr(p, "seq:"))
219  ErrPostEx(SEV_ERROR, ERR_QUALIFIER_AntiCodonLacksSequence, "Anticodon qualifier \"%s\" lacks a 'seq' field for the sequence of the anticodon.", qval);
220 
221  return CCleanup::ValidAminoAcid(string_view(str, p - str));
222 }
223 
224 /**********************************************************/
225 bool GetGenomeInfo(CBioSource& bsp, string_view bptr)
226 {
228  if (i < 0)
229  return false;
230 
231  if (i == 0)
233  else if (i == 1)
235  else if (i == 2)
237  else if (i == 3)
239  else if (i == 4)
241  else if (i == 5)
243  else if (i == 6)
245  else if (i == 7)
247  else if (i == 8)
249  else
251 
252  return true;
253 }
254 
255 /**********************************************************/
256 static void GetTaxnameNameFromDescrs(const TSeqdescList& descrs, vector<string>& names)
257 {
258  for (const auto& descr : descrs) {
259  if (! descr->IsSource() || ! descr->GetSource().IsSetOrg() ||
260  ! descr->GetSource().GetOrg().IsSetTaxname())
261  continue;
262 
263  const COrg_ref& org_ref = descr->GetSource().GetOrg();
264  names[0] = org_ref.GetTaxname();
265 
266  if (org_ref.IsSetOrgname() && org_ref.GetOrgname().IsSetMod()) {
267  for (const auto& mod : org_ref.GetOrgname().GetMod()) {
268  if (! mod->IsSetSubname() || ! mod->IsSetSubtype())
269  continue;
270 
271  COrgMod::TSubtype stype = mod->GetSubtype();
272 
273  if (stype == COrgMod::eSubtype_old_name)
274  names[1] = mod->GetSubname();
275  /* acronym(19), synonym(28), anamorph(29), teleomorph(30),
276  gb-acronym(32), gb-anamorph(33), gb-synonym(34) */
277  else if (stype == COrgMod::eSubtype_acronym || stype == COrgMod::eSubtype_synonym ||
280  stype == COrgMod::eSubtype_gb_synonym) {
281  names.push_back(mod->GetSubname());
282  }
283  }
284  }
285 
286  if (descr->GetSource().IsSetSubtype()) {
287  for (const auto& subtype : descr->GetSource().GetSubtype()) {
288  /* subtype = "other" */
289  if (! subtype->IsSetSubtype() || subtype->GetSubtype() != CSubSource::eSubtype_other || ! subtype->IsSetName())
290  continue;
291 
292  const Char* p = StringIStr(subtype->GetName().c_str(), "common:");
293  if (! p)
294  continue;
295 
296  for (p += 7; *p == ' ';)
297  p++;
298  if (*p == '\0')
299  continue;
300 
301  names.push_back(p);
302  }
303  }
304 
305  if (org_ref.IsSetCommon())
306  names[2] = org_ref.GetCommon();
307 
308  break;
309  }
310 }
311 
312 /**********************************************************/
313 static void GetTaxnameName(TEntryList& seq_entries, vector<string>& names)
314 {
315  names.resize(3);
316 
317  for (auto& entry : seq_entries) {
318  for (CTypeIterator<CBioseq_set> bio_set(Begin(*entry)); bio_set; ++bio_set) {
319  if (bio_set->IsSetDescr())
320  GetTaxnameNameFromDescrs(bio_set->SetDescr().Set(), names);
321  }
322 
323  for (CTypeIterator<CBioseq> bioseq(Begin(*entry)); bioseq; ++bioseq) {
324  if (bioseq->IsSetDescr())
325  GetTaxnameNameFromDescrs(bioseq->SetDescr().Set(), names);
326  }
327  }
328 }
329 
330 /**********************************************************/
331 static void CheckDelGbblockSourceFromDescrs(TSeqdescList& descrs, const vector<string>& names)
332 {
333  for (auto& descr : descrs) {
334  if (! descr->IsGenbank())
335  continue;
336 
337  if (! descr->GetGenbank().IsSetSource())
338  break;
339 
340  CGB_block& gb_block = descr->SetGenbank();
341  char* p = StringSave(gb_block.GetSource());
342  char* pper = nullptr;
343 
344  size_t len = StringLen(p);
345  if (p[len - 1] == '.') {
346  pper = StringSave(p);
347  p[len - 1] = '\0';
348  }
349 
350  char* q = StringChr(p, ' ');
351  if (q)
352  *q = '\0';
353 
354  if (StringMatchIcase(valid_organelle, p) >= 0) {
355  if (q) {
356  for (q++; *q == ' ';)
357  q++;
358  fta_StringCpy(p, q);
359  }
360  } else if (q)
361  *q = ' ';
362 
363  vector<string>::const_iterator name = names.begin();
364  for (name += 2; name != names.end(); ++name) {
365  if (name->empty())
366  continue;
367 
368  len = name->size();
369  for (q = p;; q++) {
370  q = StringChr(q, '(');
371  if (! q)
372  break;
373  char* s = q + 1;
374  if (StringEquN(s, "acronym:", 8) ||
375  StringEquN(s, "synonym:", 8))
376  s += 8;
377  else if (StringEquN(s, "anamorph:", 9))
378  s += 9;
379  else if (StringEquN(s, "teleomorph:", 11))
380  s += 11;
381  if (*s == ' ')
382  while (*s == ' ')
383  s++;
384  if (StringEquNI(s, name->c_str(), len) && s[len] == ')') {
385  char* t = nullptr;
386  for (t = s + len + 1; *t == ' ';)
387  t++;
388  if (*t != '\0')
389  fta_StringCpy(q, t);
390  else {
391  if (q > p)
392  q--;
393  *q = '\0';
394  }
395  break;
396  }
397  }
398  }
399 
400  if (pper) {
401  string s = p;
402  s.append(".");
403  MemFree(pper);
404  pper = StringSave(s);
405  }
406 
407  const string& first_name = names[0];
408  const string& second_name = names[1];
409 
410  if (NStr::CompareNocase(p, first_name.c_str()) == 0 || (pper && NStr::CompareNocase(pper, first_name.c_str()) == 0)) {
411  gb_block.ResetSource();
412  } else if (NStr::CompareNocase(p, second_name.c_str()) == 0 || (pper && NStr::CompareNocase(pper, second_name.c_str()) == 0)) {
413  gb_block.ResetSource();
414  }
415 
416  MemFree(p);
417  if (pper)
418  MemFree(pper);
419  break;
420  }
421 }
422 
423 /**********************************************************/
424 static void CheckDelGbblockSource(TEntryList& seq_entries, const vector<string>& names)
425 {
426  for (auto& entry : seq_entries) {
427  for (CTypeIterator<CBioseq> bioseq(Begin(*entry)); bioseq; ++bioseq) {
428  if (bioseq->IsSetDescr())
429  CheckDelGbblockSourceFromDescrs(bioseq->SetDescr().Set(), names);
430  }
431  }
432 }
433 
434 /**********************************************************/
436 {
437  vector<string> names; /* 0 - taxname */
438  /* 1 - 254 old-name */
439  /* 2 etc. - common name */
440 
441  GetTaxnameName(seq_entries, names);
442 
443  if (! names[0].empty())
444  CheckDelGbblockSource(seq_entries, names);
445 }
446 
447 /**********************************************************/
/* Rewrites a location label in place so that it matches the form used
 * by the C-toolkit version:
 *   - a leading '[' becomes '(' and a trailing ']' becomes ')';
 *   - every occurrence of "minus" is replaced by "c".
 */
void MakeLocStrCompatible(string& str)
{
    static const char kMinus[]  = "minus";
    const size_t      kMinusLen = sizeof(kMinus) - 1;

    // changing brackets is for backward compatibility
    if (! str.empty()) {
        if (str.front() == '[')
            str.front() = '(';

        if (str.back() == ']')
            str.back() = ')';
    }

    // for backward compatibility with C-toolkit version
    // The replacement text holds none of the letters of "minus", so no new
    // match can start at or before "pos"; resume the search right after it.
    for (size_t pos = str.find(kMinus); pos != string::npos; pos = str.find(kMinus, pos + 1))
        str.replace(pos, kMinusLen, "c");
}
469 
470 /**********************************************************/
471 string location_to_string(const CSeq_loc& loc)
472 {
473  string loc_str;
474  loc.GetLabel(&loc_str);
475 
476  MakeLocStrCompatible(loc_str);
477  return loc_str.substr(0, 50);
478 }
479 
string tata_save(string_view t)
Definition: add.cpp:148
static char ValidAminoAcid(string_view abbrev)
Definition: cleanup.cpp:4974
Template class for iteration on objects of class C.
Definition: iterator.hpp:673
#define ERR_QUALIFIER_AntiCodonLacksSequence
Definition: flat2err.h:134
#define ERR_FEATURE_UnknownQualSpelling
Definition: flat2err.h:334
list< CRef< objects::CSeq_entry > > TEntryList
std::list< CRef< objects::CSeqdesc > > TSeqdescList
Definition: ftablock.h:61
bool StringEquNI(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:131
bool StringEquN(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:121
void MemFree(char *p)
Definition: ftacpp.hpp:55
size_t StringLen(const char *s)
Definition: ftacpp.hpp:60
static const struct name_t names[]
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:51
static const char * str(char *buf, int n)
Definition: stats.c:84
#define SEV_ERROR
Definition: gicache.c:91
#define StringStr
Definition: ncbistr.hpp:322
#define StringSave
Definition: ncbistr.hpp:326
#define StringChr
Definition: ncbistr.hpp:317
#define ErrPostEx(sev, err_code,...)
Definition: ncbierr.hpp:78
void GetLabel(string *label) const
Appends a label suitable for display (e.g., error messages) label must point to an existing string ob...
Definition: Seq_loc.cpp:3467
CBeginInfo Begin(C &obj)
Get starting point of object hierarchy.
Definition: iterator.hpp:1004
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
char Char
Alias for char.
Definition: ncbitype.h:93
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
static string Sanitize(CTempString str, TSS_Flags flags=fSS_print)
Sanitize a string, allowing only specified classes of characters.
Definition: ncbistr.hpp:2878
void ResetSource(void)
Reset Source data member.
Definition: GB_block_.cpp:57
const TSource & GetSource(void) const
Get the Source member data.
Definition: GB_block_.hpp:479
void SetGenome(TGenome value)
Assign a value to Genome data member.
Definition: BioSource_.hpp:428
const TMod & GetMod(void) const
Get the Mod member data.
Definition: OrgName_.hpp:839
bool IsSetCommon(void) const
common name Check if a value has been assigned to Common data member.
Definition: Org_ref_.hpp:407
const TTaxname & GetTaxname(void) const
Get the Taxname member data.
Definition: Org_ref_.hpp:372
const TCommon & GetCommon(void) const
Get the Common member data.
Definition: Org_ref_.hpp:419
bool IsSetMod(void) const
Check if a value has been assigned to Mod data member.
Definition: OrgName_.hpp:827
bool IsSetOrgname(void) const
Check if a value has been assigned to Orgname data member.
Definition: Org_ref_.hpp:529
const TOrgname & GetOrgname(void) const
Get the Orgname member data.
Definition: Org_ref_.hpp:541
@ eSubtype_gb_acronym
used by taxonomy database
Definition: OrgMod_.hpp:115
@ eSubtype_gb_synonym
used by taxonomy database
Definition: OrgMod_.hpp:117
@ eSubtype_anamorph
Definition: OrgMod_.hpp:112
@ eSubtype_old_name
Definition: OrgMod_.hpp:124
@ eSubtype_teleomorph
Definition: OrgMod_.hpp:113
@ eSubtype_synonym
Definition: OrgMod_.hpp:111
@ eSubtype_acronym
Definition: OrgMod_.hpp:102
@ eSubtype_gb_anamorph
used by taxonomy database
Definition: OrgMod_.hpp:116
int i
int len
constexpr bool empty(list< Ts... >) noexcept
EIPRangeType t
Definition: ncbi_localip.c:101
Int mod(Int i, Int j)
Definition: njn_integer.hpp:67
USING_SCOPE(objects)
bool DeleteQual(TQualVector &qlist, const Char *qual)
Definition: utilfeat.cpp:180
const char * ParFlat_GImod[]
Definition: utilfeat.cpp:65
Uint1 GetQualValueAa(const char *qval, bool checkseq)
Definition: utilfeat.cpp:204
string location_to_string(const CSeq_loc &loc)
Definition: utilfeat.cpp:471
static void CheckDelGbblockSourceFromDescrs(TSeqdescList &descrs, const vector< string > &names)
Definition: utilfeat.cpp:331
static void GetTaxnameNameFromDescrs(const TSeqdescList &descrs, vector< string > &names)
Definition: utilfeat.cpp:256
string CpTheQualValue(const TQualVector &qlist, const Char *qual)
Definition: utilfeat.cpp:120
bool GetGenomeInfo(CBioSource &bsp, string_view bptr)
Definition: utilfeat.cpp:225
static void GetTaxnameName(TEntryList &seq_entries, vector< string > &names)
Definition: utilfeat.cpp:313
bool SeqLocHaveFuzz(const CSeq_loc &loc)
Definition: utilfeat.cpp:97
const char * valid_organelle[]
Definition: utilfeat.cpp:80
void MaybeCutGbblockSource(TEntryList &seq_entries)
Definition: utilfeat.cpp:435
void MakeLocStrCompatible(string &str)
Definition: utilfeat.cpp:448
optional< string > GetTheQualValue(TQualVector &qlist, const Char *qual)
Definition: utilfeat.cpp:147
static void CheckDelGbblockSource(TEntryList &seq_entries, const vector< string > &names)
Definition: utilfeat.cpp:424
Int2 StringMatchIcase(const Char **array, string_view text)
Definition: utilfun.cpp:507
void fta_StringCpy(char *dst, const char *src)
Definition: utilfun.cpp:1496
Char * StringIStr(const Char *where, const Char *what)
Definition: utilfun.cpp:591
std::vector< CRef< objects::CGb_qual > > TQualVector
Definition: xgbfeat.h:12
Modified on Fri Sep 20 14:57:44 2024 by modify_doxy.py rev. 669887