NCBI C++ ToolKit
keywords_item.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: keywords_item.cpp 102556 2024-05-30 15:00:13Z kans $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Mati Shomrat, NCBI
27 *
28 * File Description:
29 * flat-file generator -- keywords item implementation
30 *
31 */
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbistd.hpp>
39 #include <objects/seq/MolInfo.hpp>
45 #include <objmgr/bioseq_ci.hpp>
46 #include <objmgr/seqdesc_ci.hpp>
47 #include <util/static_set.hpp>
48 #include <algorithm>
49 
56 
58 
59 
62 
63 
65  CFlatItem(&ctx)
66 {
67  x_GatherInfo(ctx);
68 }
69 
71 {
72  return eItem_Keywords;
73 }
74 
76 (IFormatter& formatter,
77  IFlatTextOStream& text_os) const
78 {
79  formatter.FormatKeywords(*this, text_os);
80 }
81 
82 
83 /***************************************************************************/
84 /* PRIVATE */
85 /***************************************************************************/
86 
87 
88 enum ETechFlags {
92  eGSS
93 };
94 
95 
// Keywords that identify a record as an EST (expressed sequence tag).
// The entries are in ascending strcmp() order; presumably the static lookup
// set built over this array (sc_EST_kw) depends on that ordering, so keep
// the list sorted when adding entries -- TODO confirm against static_set.hpp.
static const char* const sc_EST[] = {
    "EST",
    "EST (expressed sequence tag)",
    "EST PROTO((expressed sequence tag)",
    "EST(expressed sequence tag)",
    "TSR",
    "UK putts",
    "expressed sequence tag",
    "partial cDNA sequence",
    "putatively transcribed partial sequence",
    "transcribed sequence fragment"
};
105 
106 
// Keywords that identify a record as a GSS.
// Listed in ascending strcmp() order; presumably this array backs the
// sc_GSS_kw lookup set and must stay sorted -- TODO confirm.
static const char* const sc_GSS[] = {
    "GSS",
    "trapped exon"
};
112 
// Keywords that identify a record as an STS (sequence tagged site).
// Listed in ascending strcmp() order; presumably this array backs the
// sc_STS_kw lookup set and must stay sorted -- TODO confirm.
static const char* const sc_STS[] = {
    "STS",
    "STS (sequence tagged site)",
    "STS sequence",
    "STS(sequence tagged site)",
    "sequence tagged site"
};
119 
120 
121 static bool s_CheckSpecialKeyword(const string& keyword, ETechFlags tech)
122 {
123  if (tech == eEST) {
124  if (sc_STS_kw.find(keyword.c_str()) != sc_STS_kw.end()) {
125  return false;
126  }
127  if (sc_GSS_kw.find(keyword.c_str()) != sc_GSS_kw.end()) {
128  return false;
129  }
130  }
131 
132  if (tech == eSTS) {
133  if (sc_EST_kw.find(keyword.c_str()) != sc_EST_kw.end()) {
134  return false;
135  }
136  if (sc_GSS_kw.find(keyword.c_str()) != sc_GSS_kw.end()) {
137  return false;
138  }
139  }
140 
141  if (tech == eGSS) {
142  if (sc_EST_kw.find(keyword.c_str()) != sc_EST_kw.end()) {
143  return false;
144  }
145  if (sc_STS_kw.find(keyword.c_str()) != sc_STS_kw.end()) {
146  return false;
147  }
148  }
149 
150  return true;
151 }
152 
153 
155 {
156  switch( ctx.GetRepr() ) {
158  x_AddKeyword("Whole_Genome_Map");
159  break;
160  default:
161  // no action needed yet for other types
162  break;
163  }
164 
165  // check if env sample or metagenome_source
166  bool is_env_sample = false;
167  bool is_metagenome_source = false;
168  CSeqdesc_CI src_desc(ctx.GetHandle(), CSeqdesc::e_Source);
169  if (src_desc) {
170  ITERATE(CBioSource::TSubtype, it, src_desc->GetSource().GetSubtype()) {
171  if (! (*it)->IsSetSubtype()) continue;
172  if ((*it)->GetSubtype() == CSubSource::eSubtype_environmental_sample) {
173  is_env_sample = true;
174  }
175  }
176  if (src_desc->GetSource().IsSetOrg()) {
177  const CBioSource::TOrg& org = src_desc->GetSource().GetOrg();
178  if ( org.IsSetOrgname()) {
179  ITERATE (COrgName::TMod, it, org.GetOrgname().GetMod()) {
180  if (! (*it)->IsSetSubtype()) continue;
181  if ((*it)->GetSubtype() == COrgMod::eSubtype_metagenome_source) {
182  is_metagenome_source = true;
183  }
184  }
185  }
186  }
187  }
188 
189  // we might set this in the mol-info switch statement below
190  bool is_tsa = false;
191 
192  // add keywords based on mol-info
193  ETechFlags tech = e_not_set;
194  // don't do tech-related keywords if molinfo isn't set
195  if (ctx.GetMolinfo()) {
196  switch ( ctx.GetTech() ) {
197  case CMolInfo::eTech_est:
198  tech = eEST;
199  x_AddKeyword("EST");
200  break;
201 
202  case CMolInfo::eTech_sts:
203  tech = eSTS;
204  x_AddKeyword("STS");
205  break;
206 
208  tech = eGSS;
209  x_AddKeyword("GSS");
210  break;
211 
213  x_AddKeyword("HTG");
214  x_AddKeyword("HTGS_PHASE0");
215  break;
216 
218  x_AddKeyword("HTG");
219  x_AddKeyword("HTGS_PHASE1");
220  break;
221 
223  x_AddKeyword("HTG");
224  x_AddKeyword("HTGS_PHASE2");
225  break;
226 
228  x_AddKeyword("HTG");
229  break;
230 
232  x_AddKeyword("FLI_CDNA");
233  break;
234 
235  case CMolInfo::eTech_htc:
236  x_AddKeyword("HTC");
237  break;
238 
239  case CMolInfo::eTech_wgs:
240  x_AddKeyword("WGS");
241  break;
242 
243  case CMolInfo::eTech_tsa:
244  x_AddKeyword("TSA");
245  x_AddKeyword("Transcriptome Shotgun Assembly");
246  is_tsa = true; // remember so we don't add it twice
247  break;
248 
250  x_AddKeyword("TLS");
251  x_AddKeyword("Targeted Locus Study");
252  break;
253 
263  break;
264 
265  default:
266  break;
267  }
268  }
269 
270  if (is_env_sample) {
271  x_AddKeyword("ENV");
272  }
273 
274  if (is_metagenome_source) {
275  x_AddKeyword("Metagenome Assembled Genome");
276  x_AddKeyword("MAG");
277  }
278 
279  // propagate TSA keyword from nuc to prot in same nuc-prot set
280  if( ! is_tsa && ctx.IsProt() && ctx.IsInNucProt() ) {
281  CBioseq_set_Handle parent_bioseq_set = ctx.GetHandle().GetParentBioseq_set();
282  if( parent_bioseq_set ) {
283  CBioseq_CI bioseq_ci( parent_bioseq_set, CSeq_inst::eMol_na );
284  if( bioseq_ci ) {
285  CBioseq_Handle nuc = *bioseq_ci;
286  if( nuc ) {
287  CSeqdesc_CI desc_ci( nuc, CSeqdesc::e_Molinfo );
288  for( ; desc_ci; ++desc_ci ) {
289  if( desc_ci->GetMolinfo().CanGetTech() &&
290  desc_ci->GetMolinfo().GetTech() == CMolInfo::eTech_tsa )
291  {
292  x_AddKeyword("TSA");
293  x_AddKeyword("Transcriptome Shotgun Assembly");
294  break;
295  }
296  }
297  }
298  }
299  }
300  }
301 
302  CBioseq_Handle bsh = ctx.GetHandle();
303  for (CSeqdesc_CI di(bsh, CSeqdesc::e_User); di; ++di) {
304  const CUser_object& usr = di->GetUser();
305  if ( ! CComment_rule::IsStructuredComment (usr) ) continue;
306  string pfx = CComment_rule::GetStructuredCommentPrefix ( usr, true );
307  bool is_valid = false;
309  if (comment_rules) {
310  CConstRef<CComment_rule> ruler = comment_rules->FindCommentRuleEx(pfx);
311  if (ruler) {
312  const CComment_rule& rule = *ruler;
313  CComment_rule::TErrorList errors = rule.IsValid(usr);
314  if(errors.size() == 0) {
315  is_valid = true;
316  }
317  }
318  }
319  if ( is_valid ) {
320  if ( NStr::EqualNocase (pfx, "MIGS:5.0-Data" )) {
321  x_AddKeyword("GSC:MIxS");
322  x_AddKeyword("MIGS:5.0.");
323  } else if ( NStr::EqualNocase (pfx, "MIMS:5.0-Data" )) {
324  x_AddKeyword("GSC:MIxS");
325  x_AddKeyword("MIMS:5.0.");
326  } else if ( NStr::EqualNocase (pfx, "MIMARKS:5.0-Data" )) {
327  x_AddKeyword("GSC:MIxS");
328  x_AddKeyword("MIMARKS:5.0.");
329  } else if ( NStr::EqualNocase (pfx, "MISAG:5.0-Data" )) {
330  x_AddKeyword("GSC:MIxS");
331  x_AddKeyword("MISAG:5.0.");
332  } else if ( NStr::EqualNocase (pfx, "MIMAG:5.0-Data" )) {
333  x_AddKeyword("GSC:MIxS");
334  x_AddKeyword("MIMAG:5.0.");
335  } else if ( NStr::EqualNocase (pfx, "MIUVIG:5.0-Data" )) {
336  x_AddKeyword("GSC:MIxS");
337  x_AddKeyword("MIUVIG:5.0.");
338  }
339  }
340  try {
341  list<string> keywords = CComment_set::GetKeywords(usr);
342  FOR_EACH_STRING_IN_LIST ( s_itr, keywords ) {
343  x_AddKeyword(*s_itr);
344  }
345  } catch (CException&) {
346  }
347  }
348 
349  CBioseqContext::TUnverified unv = ctx.GetUnverifiedType();
351  x_AddKeyword("UNVERIFIED");
352  }
353  if ((unv & CBioseqContext::fUnverified_Organism) != 0) {
354  x_AddKeyword("UNVERIFIED");
355  x_AddKeyword("UNVERIFIED_ORGANISM");
356  }
357  if ((unv & CBioseqContext::fUnverified_Misassembled) != 0) {
358  x_AddKeyword("UNVERIFIED");
359  x_AddKeyword("UNVERIFIED_MISASSEMBLY");
360  }
361  if ((unv & CBioseqContext::fUnverified_Contaminant) != 0) {
362  x_AddKeyword("UNVERIFIED");
363  x_AddKeyword("UNVERIFIED_CONTAMINANT");
364  }
365 
366  CBioseqContext::TUnreviewed urv = ctx.GetUnreviewedType();
367  if (urv == 0) {
368  //x_AddKeyword("UNREVIEWED");
369  }
370  if ((urv & CBioseqContext::fUnreviewed_Unannotated) != 0) {
371  x_AddKeyword("UNREVIEWED");
372  x_AddKeyword("UNREVIEWED_UNANNOTATED");
373  }
374 
375  if (ctx.IsEncode()) {
376  x_AddKeyword("ENCODE");
377  }
378 
379  if( ctx.IsGenomeAssembly() && ! ctx.GetFinishingStatus().empty() ) {
380  x_AddKeyword( ctx.GetFinishingStatus() );
381  }
382 
383  if ( ctx.IsTPA() ) {
384  // add TPA keywords
385  x_AddKeyword("Third Party Data");
386  x_AddKeyword("TPA");
387  } else if ( ctx.IsRefSeq() ) {
388  // add RefSeq keyword
389  x_AddKeyword("RefSeq");
390  }
391 
392  if ( ctx.IsCrossKingdom() && ctx.IsRSUniqueProt() ) {
393  // add CrossKingdom keyword
394  x_AddKeyword("CROSS_KINGDOM");
395  }
396 
397  for (CSeqdesc_CI it(ctx.GetHandle()); it; ++it) {
398  const list<string>* keywords = nullptr;
399 
400  switch (it->Which()) {
401  case CSeqdesc::e_Pir:
402  keywords = &(it->GetPir().GetKeywords());
403  break;
404  case CSeqdesc::e_Genbank:
405  keywords = &(it->GetGenbank().GetKeywords());
406  break;
407  case CSeqdesc::e_Sp:
408  keywords = &(it->GetSp().GetKeywords());
409  break;
410  case CSeqdesc::e_Embl:
411  keywords = &(it->GetEmbl().GetKeywords());
412  break;
413  case CSeqdesc::e_Prf:
414  keywords = &(it->GetPrf().GetKeywords());
415  break;
416  default:
417  keywords = nullptr;
418  break;
419  }
420 
421  if (keywords) {
422  if (!IsSetObject()) {
423  x_SetObject(*it);
424  }
425  ITERATE (list<string>, kwd, *keywords) {
426  if (s_CheckSpecialKeyword(*kwd, tech)) {
427  x_AddKeyword(*kwd);
428  }
429  }
430  }
431  }
432 }
433 
434 
435 // Add a keyword to the list
436 static bool x_OkayToAddKeyword(const string& keyword, vector<string> keywords)
437 {
438  ITERATE (vector<string>, it, keywords) {
439  if (NStr::EqualNocase(keyword, *it)) {
440  return false;
441  }
442  }
443  return true;
444 }
445 void CKeywordsItem::x_AddKeyword(const string& keyword)
446 {
447  list<string> kywds;
448  NStr::Split( keyword, ";", kywds, NStr::fSplit_Tokenize );
449  FOR_EACH_STRING_IN_LIST ( k_itr, kywds ) {
450  const string& kw = *k_itr;
451  if (x_OkayToAddKeyword (kw, m_Keywords)) {
452  m_Keywords.push_back(kw);
453  }
454  }
455 }
456 
457 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
@ fUnreviewed_Unannotated
Definition: context.hpp:235
Int8 TUnreviewed
Definition: context.hpp:239
Int8 TUnverified
Definition: context.hpp:230
@ fUnverified_Contaminant
Definition: context.hpp:226
@ fUnverified_SequenceOrAnnotation
Definition: context.hpp:224
@ fUnverified_Organism
Definition: context.hpp:223
@ fUnverified_Misassembled
Definition: context.hpp:225
CBioseq_CI –.
Definition: bioseq_ci.hpp:69
CBioseq_Handle –.
CBioseq_set_Handle –.
vector< TError > TErrorList
TErrorList IsValid(const CUser_object &user) const
static string GetStructuredCommentPrefix(const CUser_object &user, bool normalize=true)
static bool IsStructuredComment(const CUser_object &user)
static list< string > GetKeywords(const CUser_object &user)
static CConstRef< CComment_set > GetCommentRules()
CConstRef –.
Definition: ncbiobj.hpp:1266
bool IsSetObject(void) const
Definition: item_base.hpp:106
void x_SetObject(const CSerialObject &obj)
Definition: item_base.hpp:160
TKeywords m_Keywords
void x_GatherInfo(CBioseqContext &ctx) override
EItem GetItemType() const override
void x_AddKeyword(const string &keyword)
void Format(IFormatter &formatter, IFlatTextOStream &text_os) const override
CSeqdesc_CI –.
Definition: seqdesc_ci.hpp:65
@ eItem_Keywords
Definition: item.hpp:68
virtual void FormatKeywords(const CKeywordsItem &keys, IFlatTextOStream &text_os)=0
static bool is_valid(const char *num, int type, CONV_RESULT *cr)
Include a standard set of the NCBI C++ Toolkit most basic headers.
CS_CONTEXT * ctx
Definition: t0006.c:12
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3452
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5347
@ fSplit_Tokenize
All delimiters are merged and trimmed, to get non-empty tokens only.
Definition: ncbistr.hpp:2510
const TSubtype & GetSubtype(void) const
Get the Subtype member data.
Definition: BioSource_.hpp:539
bool IsSetOrg(void) const
Check if a value has been assigned to Org data member.
Definition: BioSource_.hpp:497
list< CRef< CSubSource > > TSubtype
Definition: BioSource_.hpp:145
const TOrg & GetOrg(void) const
Get the Org member data.
Definition: BioSource_.hpp:509
@ eSubtype_environmental_sample
Definition: SubSource_.hpp:111
const TMod & GetMod(void) const
Get the Mod member data.
Definition: OrgName_.hpp:839
list< CRef< COrgMod > > TMod
Definition: OrgName_.hpp:332
bool IsSetOrgname(void) const
Check if a value has been assigned to Orgname data member.
Definition: Org_ref_.hpp:529
const TOrgname & GetOrgname(void) const
Get the Orgname member data.
Definition: Org_ref_.hpp:541
@ eSubtype_metagenome_source
Definition: OrgMod_.hpp:120
const TSource & GetSource(void) const
Get the variant data.
Definition: Seqdesc_.cpp:566
TTech GetTech(void) const
Get the Tech member data.
Definition: MolInfo_.hpp:497
bool CanGetTech(void) const
Check if it is safe to call GetTech method.
Definition: MolInfo_.hpp:478
const TMolinfo & GetMolinfo(void) const
Get the variant data.
Definition: Seqdesc_.cpp:588
@ eRepr_map
ordered map of any kind
Definition: Seq_inst_.hpp:99
@ eTech_htgs_2
ordered High Throughput sequence contig
Definition: MolInfo_.hpp:138
@ eTech_other
use Source.techexp
Definition: MolInfo_.hpp:148
@ eTech_htc
high throughput cDNA
Definition: MolInfo_.hpp:142
@ eTech_both
concept transl. w/ partial pept. seq.
Definition: MolInfo_.hpp:133
@ eTech_targeted
targeted locus sets/studies
Definition: MolInfo_.hpp:147
@ eTech_seq_pept_homol
sequenced peptide, ordered by homology
Definition: MolInfo_.hpp:135
@ eTech_sts
Sequence Tagged Site.
Definition: MolInfo_.hpp:126
@ eTech_htgs_3
finished High Throughput sequence
Definition: MolInfo_.hpp:139
@ eTech_seq_pept_overlap
sequenced peptide, ordered by overlap
Definition: MolInfo_.hpp:134
@ eTech_htgs_1
unordered High Throughput sequence contig
Definition: MolInfo_.hpp:137
@ eTech_concept_trans
conceptual translation
Definition: MolInfo_.hpp:131
@ eTech_tsa
transcriptome shotgun assembly
Definition: MolInfo_.hpp:146
@ eTech_standard
standard sequencing
Definition: MolInfo_.hpp:124
@ eTech_wgs
whole genome shotgun sequencing
Definition: MolInfo_.hpp:143
@ eTech_seq_pept
peptide was sequenced
Definition: MolInfo_.hpp:132
@ eTech_survey
one-pass genomic sequence
Definition: MolInfo_.hpp:127
@ eTech_htgs_0
single genomic reads for coordination
Definition: MolInfo_.hpp:141
@ eTech_fli_cdna
full length insert cDNA
Definition: MolInfo_.hpp:140
@ eTech_est
Expressed Sequence Tag.
Definition: MolInfo_.hpp:125
@ eTech_concept_trans_a
conceptual transl. supplied by author
Definition: MolInfo_.hpp:136
@ e_Embl
EMBL specific information.
Definition: Seqdesc_.hpp:127
@ e_User
user defined object
Definition: Seqdesc_.hpp:124
@ e_Pir
PIR specific info.
Definition: Seqdesc_.hpp:120
@ e_Genbank
GenBank specific info.
Definition: Seqdesc_.hpp:121
@ e_Prf
PRF specific information.
Definition: Seqdesc_.hpp:130
@ e_Sp
SWISSPROT specific info.
Definition: Seqdesc_.hpp:125
@ e_Molinfo
info on the molecule and techniques
Definition: Seqdesc_.hpp:134
@ e_Source
source of materials, includes Org-ref
Definition: Seqdesc_.hpp:133
@ eMol_na
just a nucleic acid
Definition: Seq_inst_.hpp:113
ETechFlags
@ eGSS
@ e_not_set
@ eEST
@ eSTS
static const char *const sc_EST[]
static const char *const sc_STS[]
static bool s_CheckSpecialKeyword(const string &keyword, ETechFlags tech)
static bool x_OkayToAddKeyword(const string &keyword, vector< string > keywords)
static const char *const sc_GSS[]
CStaticArraySet< const char *, PCase_CStr > TStaticKeywordSet
DEFINE_STATIC_ARRAY_MAP(TStaticKeywordSet, sc_EST_kw, sc_EST)
Generic utility macros and templates for exploring NCBI objects.
#define FOR_EACH_STRING_IN_LIST(Itr, Var)
FOR_EACH_STRING_IN_LIST EDIT_EACH_STRING_IN_LIST.
Modified on Fri Sep 20 14:57:35 2024 by modify_doxy.py rev. 669887