NCBI C++ ToolKit
influenza_set.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: influenza_set.cpp 101210 2023-11-16 14:19:11Z gotvyans $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Colleen Bollin, Justin Foley
27  *
28  * File Description:
29  *
30  */
31 #include <ncbi_pch.hpp>
32 #include <corelib/ncbistd.hpp>
34 
39 #include <objmgr/seqdesc_ci.hpp>
41 #include <objmgr/feat_ci.hpp>
42 
45 
46 // From SQD-4297
47 // Influenza is a multi-segmented virus. We would like to create
48 // small-genome sets when all segments of a particular viral strain
49 // are submitted together. This is made more difficult due to the fact
50 // that submitters often have large submissions with multiple strains
51 // at one time.
52 // This function will segregate sequences with the same taxname
53 // plus additional qualifiers into small-genome sets, if there are enough
54 // sequences for that type of Influenza *AND* all CDS and gene features
55 // on the sequences are complete.
56 // * Influenza A virus: 8 or more nucleotide sequences with same strain and serotype
57 // * Influenza B virus: 8 or more nucleotide sequences with same strain
58 // * Influenza C virus: 7 or more nucleotide sequences with same strain
59 // * Influenza D virus: 7 or more records with same strain
60 // Note that as long as we are making strain-specific organism names,
61 // the taxname must only start with the Influenza designation, not match it.
62 // Can only make a set if at least one instance of each segment value is represented.
63 
65 {
66  m_FluType = GetInfluenzaType(key);
67  m_Required = GetNumRequired(m_FluType);
68 }
69 
70 
72 {
73  if (fluType == eInfluenzaA || fluType == eInfluenzaB) {
74  return 8;
75  }
76  return 7;
77 }
78 
79 
81 {
82  if (NStr::StartsWith(taxname, "Influenza A virus", NStr::eNocase)) {
83  return eInfluenzaA;
84  } else if (NStr::StartsWith(taxname, "Influenza B virus", NStr::eNocase)) {
85  return eInfluenzaB;
86  } else if (NStr::StartsWith(taxname, "Influenza C virus", NStr::eNocase)) {
87  return eInfluenzaC;
88  } else if (NStr::StartsWith(taxname, "Influenza D virus", NStr::eNocase)) {
89  return eInfluenzaD;
90  }
91  return eNotInfluenza;
92 }
93 
94 
95 string CInfluenzaSet::GetKey(const COrg_ref& org)
96 {
97  if (!org.IsSetTaxname() || !org.IsSetOrgname() || !org.GetOrgname().IsSetMod()) {
98  return kEmptyStr;
99  }
100  EInfluenzaType flu_type = GetInfluenzaType(org.GetTaxname());
101  if (flu_type == eNotInfluenza) {
102  return kEmptyStr;
103  }
104 
105  CTempString strain = kEmptyStr;
106  CTempString serotype = kEmptyStr;
107 
108  for (auto pOrgMod : org.GetOrgname().GetMod()) {
109  if (pOrgMod->IsSetSubtype() && pOrgMod->IsSetSubname()) {
110  if (pOrgMod->GetSubtype() == COrgMod::eSubtype_strain) {
111  strain = pOrgMod->GetSubname();
112  } else if (pOrgMod->GetSubtype() == COrgMod::eSubtype_serotype &&
113  flu_type == eInfluenzaA) {
114  serotype = pOrgMod->GetSubname();
115  }
116  }
117  }
118 
119  if(NStr::IsBlank(strain)) {
120  return kEmptyStr;
121  }
122  if (flu_type == eInfluenzaA) {
123  if (NStr::IsBlank(serotype)) {
124  return kEmptyStr;
125  }
126  return org.GetTaxname() + ":" + strain + ":" + serotype;
127  }
128 
129  return org.GetTaxname() + ":" + strain;
130 }
131 
132 
134 {
135  m_Members.push_back(bsh);
136 }
137 
138 
139 bool g_FindSegs(const CBioSource& src, size_t numRequired, set<size_t>& segsFound)
140 {
141  if (!src.IsSetSubtype()) {
142  return false;
143  }
144 
145  bool foundSeg = false;
146  for (auto pSubSource : src.GetSubtype()) {
147  if (pSubSource && pSubSource->IsSetSubtype() && pSubSource->IsSetName() &&
148  pSubSource->GetSubtype() == CSubSource::eSubtype_segment) {
149  auto segment = NStr::StringToSizet(pSubSource->GetName(), NStr::fConvErr_NoThrow);
150  if (segment < 1 || segment > numRequired ) {
151  return false;
152  }
153  segsFound.insert(segment);
154  foundSeg = true;
155  }
156  }
157  return foundSeg;
158 }
159 
160 
162 {
163  if (m_Members.size() < m_Required) {
164  return false;
165  }
166 
167  set<size_t> segsFound;
168  for(auto bsh : m_Members) {
169  // check to make sure one of each segment is represented
171  if (!g_FindSegs(src->GetSource(), m_Required, segsFound)) {
172  return false;
173  }
174  // make sure all coding regions and genes are complete
175  SAnnotSelector sel;
178  CFeat_CI f(bsh, sel);
179  while (f) {
180  if (f->GetLocation().IsPartialStart(eExtreme_Biological) ||
181  f->GetLocation().IsPartialStop(eExtreme_Biological)) {
182  return false;
183  }
184  ++f;
185  }
186  }
187 
188  return (segsFound.size() == m_Required);
189 }
190 
191 
193 {
194  if (m_Members.empty()) {
195  return;
196  }
197  CBioseq_set_Handle parent = m_Members[0].GetParentBioseq_set();
198  if (!parent) {
199  return;
200  }
201  if (parent.IsSetClass() && parent.GetClass() == CBioseq_set::eClass_nuc_prot) {
202  parent = parent.GetParentBioseq_set();
203  }
204  if (!parent) {
205  return;
206  }
207  CSeq_entry_Handle peh = parent.GetParentEntry();
208  CSeq_entry_EditHandle peeh(peh);
209  CBioseq_set_EditHandle parent_edit(parent);
210  CRef<CSeq_entry> ns(new CSeq_entry());
212  CSeq_entry_EditHandle new_set = parent_edit.AttachEntry(*ns, -1);
213  ITERATE(TMembers, it, m_Members) {
215  if (np && np.IsSetClass() && np.GetClass() == CBioseq_set::eClass_nuc_prot) {
217  CSeq_entry_EditHandle npse(nps);
218  npse.Remove();
219  new_set.AttachEntry(npse);
220  } else {
222  CSeq_entry_EditHandle se(s);
223  se.Remove();
224  new_set.AttachEntry(se);
225  }
226  }
227 }
228 
@ eExtreme_Biological
5' and 3'
Definition: Na_strand.hpp:62
CBioseq_Handle –.
CBioseq_set_EditHandle –.
CBioseq_set_Handle –.
CFeat_CI –.
Definition: feat_ci.hpp:64
static size_t GetNumRequired(EInfluenzaType fluType)
TMembers m_Members
bool OkToMakeSet() const
static EInfluenzaType GetInfluenzaType(const string &taxname)
static string GetKey(const COrg_ref &org)
void AddBioseq(CBioseq_Handle bsh)
vector< CBioseq_Handle > TMembers
CSeq_entry_Handle –.
CSeq_entry_Handle –.
Definition: Seq_entry.hpp:56
CSeqdesc_CI –.
Definition: seqdesc_ci.hpp:65
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
size_type size() const
Definition: set.hpp:132
Include a standard set of the NCBI C++ Toolkit most basic headers.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
TClass GetClass(void) const
void Remove(void) const
Remove this Seq-entry from parent, or scope if it's top level Seq-entry.
CBioseq_set_Handle GetParentBioseq_set(void) const
Return a handle for the parent Bioseq-set, or null handle.
CSeq_entry_Handle GetParentEntry(void) const
Return a handle for the parent seq-entry of the bioseq.
CSeq_entry_EditHandle AttachEntry(CSeq_entry &entry, int index=-1) const
Attach an existing seq-entry.
bool IsSetClass(void) const
CSeq_entry_EditHandle AttachEntry(CSeq_entry &entry, int index=-1) const
Attach an existing seq-entry.
CSeq_entry_Handle GetParentEntry(void) const
Get parent Seq-entry handle.
SAnnotSelector & IncludeFeatType(TFeatType type)
Include feature type in the search.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
#define kEmptyStr
Definition: ncbistr.hpp:123
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
static size_t StringToSizet(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to size_t.
Definition: ncbistr.cpp:1769
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5411
@ fConvErr_NoThrow
Do not throw an exception on error.
Definition: ncbistr.hpp:285
@ eNocase
Case insensitive compare.
Definition: ncbistr.hpp:1206
const TSubtype & GetSubtype(void) const
Get the Subtype member data.
Definition: BioSource_.hpp:539
bool IsSetSubtype(void) const
Check if a value has been assigned to Subtype data member.
Definition: BioSource_.hpp:527
const TMod & GetMod(void) const
Get the Mod member data.
Definition: OrgName_.hpp:839
const TTaxname & GetTaxname(void) const
Get the Taxname member data.
Definition: Org_ref_.hpp:372
bool IsSetMod(void) const
Check if a value has been assigned to Mod data member.
Definition: OrgName_.hpp:827
bool IsSetOrgname(void) const
Check if a value has been assigned to Orgname data member.
Definition: Org_ref_.hpp:529
bool IsSetTaxname(void) const
preferred formal name Check if a value has been assigned to Taxname data member.
Definition: Org_ref_.hpp:360
const TOrgname & GetOrgname(void) const
Get the Orgname member data.
Definition: Org_ref_.hpp:541
@ eSubtype_strain
Definition: OrgMod_.hpp:85
@ eSubtype_serotype
Definition: OrgMod_.hpp:90
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
void SetClass(TClass value)
Assign a value to Class data member.
@ eClass_nuc_prot
nuc acid and coded proteins
Definition: Bioseq_set_.hpp:99
@ eClass_small_genome_set
viral segments or mitochondrial minicircles
const TSource & GetSource(void) const
Get the variant data.
Definition: Seqdesc_.cpp:566
@ e_Source
source of materials, includes Org-ref
Definition: Seqdesc_.hpp:133
bool g_FindSegs(const CBioSource &src, size_t numRequired, set< size_t > &segsFound)
const struct ncbi::grid::netcache::search::fields::KEY key
double f(double x_, const double &y_)
Definition: njn_root.hpp:188
SAnnotSelector –.
#define const
Definition: zconf.h:230
Modified on Tue Nov 28 02:24:10 2023 by modify_doxy.py rev. 669887