NCBI C++ ToolKit
molecule_identifier.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: molecule_identifier.cpp 97253 2022-06-29 17:35:29Z dzhang $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Paul Thiessen
27 *
28 * File Description:
29 * Class to hold, and factory to generate, general
30 * (instance-independent) identifier for any molecule
31 *
32 * ===========================================================================
33 */
34 
35 #include <ncbi_pch.hpp>
36 #include <corelib/ncbistre.hpp>
37 #include <corelib/ncbistl.hpp>
38 
44 
46 
47 #include "molecule_identifier.hpp"
48 #include "structure_set.hpp"
49 #include "molecule.hpp"
50 #include "sequence_set.hpp"
51 #include "cn3d_tools.hpp"
52 
54 
55 BEGIN_SCOPE(Cn3D)
57 
58 // there is one (global) list of molecule identifiers
59 
62 
63 const int MoleculeIdentifier::VALUE_NOT_SET = -1;
64 const TGi MoleculeIdentifier::GI_NOT_SET = -1;
65 
67 {
68  const StructureObject *object;
69  if (!molecule->GetParentOfType(&object)) return NULL;
70 
71  // get or create identifer
72  MoleculeIdentifier *identifier = ((ids.size() > 0) ? GetIdentifier(ids) : GetIdentifier(object->mmdbID, molecule->id));
73  if (!identifier)
74  return NULL;
75 
76  // check/assign mmdb id
77  if (object->mmdbID != StructureObject::NO_MMDB_ID) {
78  if ((identifier->mmdbID != VALUE_NOT_SET && identifier->mmdbID != object->mmdbID) ||
79  (identifier->moleculeID != VALUE_NOT_SET && identifier->moleculeID != molecule->id)) {
80  ERRORMSG("MoleculeIdentifier::GetIdentifier() - mmdb/molecule ID mismatch for " << identifier->ToString());
81  } else {
82  identifier->mmdbID = object->mmdbID;
83  identifier->moleculeID = molecule->id;
84  }
85  }
86 
87  // check/assign #residues
88  if (identifier->nResidues == 0)
89  identifier->nResidues = molecule->residues.size();
90  else if (identifier->nResidues != molecule->residues.size())
91  ERRORMSG("# residue mismatch in molecule identifier for " << identifier->ToString());
92 
93  // check/assign pdb id
94  string name = molecule->name;
95  if (identifier->pdbID.size() == 0 && identifier->pdbChain.empty()) {
96  identifier->pdbID = object->GetPDBID();
97  identifier->pdbChain = name;
98  } else if (identifier->pdbID != object->GetPDBID() || identifier->pdbChain != name)
99  ERRORMSG("PDB ID mismatch in molecule identifier for " << identifier->ToString());
100 
101  return identifier;
102 }
103 
104 const MoleculeIdentifier * MoleculeIdentifier::GetIdentifier(const Sequence *sequence, int mmdbID, const SeqIdList& ids)
105 {
106  // get or create identifer
107  MoleculeIdentifier *identifier = GetIdentifier(ids);
108  if (!identifier)
109  return NULL;
110 
111  // check/assign mmdb id
112  if (mmdbID != VALUE_NOT_SET) {
113  if (identifier->mmdbID != VALUE_NOT_SET) {
114  if (identifier->mmdbID != mmdbID)
115  ERRORMSG("MoleculeIdentifier::GetIdentifier() - mmdbID mismatch for " << identifier->ToString());
116  } else {
117  identifier->mmdbID = mmdbID;
118  }
119  }
120 
121  // check/assign length
122  if (identifier->nResidues == 0)
123  identifier->nResidues = sequence->Length();
124  else if (identifier->nResidues != sequence->Length())
125  ERRORMSG("Length mismatch in sequence identifier for " << identifier->ToString());
126 
127  return identifier;
128 }
129 
131 {
132  // first check known identifiers to see if there's a match, and posibly merge in new ids
133  MoleculeIdentifierList::iterator k, ke = knownIdentifiers.end();
134  for (k=knownIdentifiers.begin(); k!=ke; ++k) {
135 
136  // for each known, compare lists of Seq-ids, looking for matches and mismatches
137  SeqIdList newIDs;
138  vector < string > matches, mismatches;
139  bool mismatchGIonly = false;
140  SeqIdList::const_iterator o, oe = k->seqIDs.end(), n, ne = ids.end();
141  for (n=ids.begin(); n!=ne; ++n) {
142 
143  // does the new (incoming) Seq-id (mis)match any old (existing) Seq-id?
144  bool foundMatch = false, foundMismatch = false;
145  for (o=k->seqIDs.begin(); o!=oe; ++o) {
146  switch ((*o)->Compare(**n)) {
147  case CSeq_id::e_DIFF: // different types, can't compare; do nothing
148  break;
149  case CSeq_id::e_NO: // same type but different id -> mismatch
150  mismatches.push_back((*o)->GetSeqIdString() + " != " + (*n)->GetSeqIdString());
151  foundMismatch = true;
152  if (mismatches.size() == 1) {
153  if ((*n)->IsGi())
154  mismatchGIonly = true;
155  } else {
156  mismatchGIonly = false;
157  }
158  break;
159  case CSeq_id::e_YES: // same type and same id -> match
160  matches.push_back((*o)->GetSeqIdString() + " == " + (*n)->GetSeqIdString());
161  foundMatch = true;
162  break;
163  default:
164  ERRORMSG("Problem comparing Seq-ids " << (*o)->GetSeqIdString() << " and " << (*n)->GetSeqIdString());
165  continue;
166  }
167  }
168 
169  // if no match or mismatch is found, this is a potential new id for this known identifier
170  if (!foundMatch && !foundMismatch)
171  newIDs.push_back(*n);
172  }
173 
174  // if we have matches and no mismatches, then we've found the identifier; merge in any new ids
175  if (matches.size() > 0 && mismatches.size() == 0) {
176  if (newIDs.size() > 0)
177  k->AddFields(newIDs);
178  return &(*k);
179  }
180 
181  // if we have matches *and* mismatches then there's a problem
182  if (matches.size() > 0 && mismatches.size() > 0) {
183 
184  // special case: gi (only) is different but something else (presumably an accession) is the same, then
185  // warn about possibly outdated gi; don't merge in new ids
186  if (mismatchGIonly) {
187  ERRORMSG("GetIdentifier(): incoming Seq-id list has a GI mismatch ("
188  << mismatches.front() << ") with sequence " << k->seqIDs.front()->GetSeqIdString()
189  << " but otherwise matches (" << matches.front()
190  << "); please update outdated GI(s) for this sequence");
191  return &(*k);
192  }
193 
194  // otherwise, error
195  ERRORMSG("GetIdentifier(): incoming Seq-id list has match(es) ("
196  << matches.front() << ") and mismatch(es) ("
197  << mismatches.front() << ") with identifier " << k->ToString());
198  return NULL;
199  }
200  }
201 
202  // if we get here, then this is a new sequence
204  MoleculeIdentifier *identifier = &(knownIdentifiers.back());
205  identifier->AddFields(ids);
206  return identifier;
207 }
208 
210 {
211  // first check known identifiers to see if there's a match, and posibly merge in new ids
212  MoleculeIdentifierList::iterator k, ke = knownIdentifiers.end();
213  for (k=knownIdentifiers.begin(); k!=ke; ++k) {
214  if (k->mmdbID == mmdbID && k->moleculeID == moleculeID)
215  return &(*k);
216  }
217 
218  // if we get here, then this is a new sequence
220  MoleculeIdentifier *identifier = &(knownIdentifiers.back());
221  identifier->mmdbID = mmdbID;
222  identifier->moleculeID = moleculeID;
223  return identifier;
224 }
225 
227 {
228  // save these ids (should already know that the new ids don't overlap any existing ones)
229  seqIDs.insert(seqIDs.end(), ids.begin(), ids.end());
230 
231  bool bPdbChainNotSet = pdbChain.empty();
232 
233  SeqIdList::const_iterator n, ne = ids.end();
234  for (n=ids.begin(); n!=ne; ++n) {
235 
236  // pdb
237  if ((*n)->IsPdb()) {
238  string newID = (*n)->GetPdb().GetMol();
239  if (pdbID.size() == 0 && pdbChain.empty()) {
240  pdbID = newID;
241  pdbChain = (*n)->GetPdb().GetEffectiveChain_id();
242  } else if (pdbID != newID || pdbChain != (*n)->GetPdb().GetEffectiveChain_id()) {
243  // special case: for merged structures with multiple pdb ids, allow match to a sequence from a single specific pdb id
244  if (pdbID.size() > 4 && pdbChain == (*n)->GetPdb().GetEffectiveChain_id() && NStr::Find(pdbID, newID) != NPOS)
245  pdbID = newID;
246  else
247  ERRORMSG("AddFields(): identifier conflict, already has pdb ID '" << pdbID << "_" << pdbChain << "'");
248  }
249  }
250 
251  // gi
252  else if ((*n)->IsGi()) {
253  if (gi == GI_NOT_SET)
254  gi = (*n)->GetGi();
255  else if (gi != (*n)->GetGi())
256  ERRORMSG("AddFields(): identifier conflict: already has gi " << gi);
257  }
258 
259  // special case where local accession is actually a PDB identifier + chain + extra stuff,
260  // separated by spaces: of the format '1ABC X ...' where X can be a chain alphanum character or space
261  //else if (pdbID.size() == 0 && pdbChain == VALUE_NOT_SET &&
262  else if (pdbID.size() == 0 && bPdbChainNotSet &&
263  (*n)->IsLocal() && (*n)->GetLocal().IsStr() &&
264  (*n)->GetLocal().GetStr().size() >= 7 && (*n)->GetLocal().GetStr()[4] == ' ' &&
265  (*n)->GetLocal().GetStr()[6] == ' ' &&
266  (isalnum((unsigned char) (*n)->GetLocal().GetStr()[5]) || (*n)->GetLocal().GetStr()[5] == ' '))
267  {
268  pdbID = (*n)->GetLocal().GetStr().substr(0, 4);
269  string tmpStr(1, (*n)->GetLocal().GetStr()[5]);
270  pdbChain = tmpStr;
271  }
272  }
273 }
274 
275 const MoleculeIdentifier * MoleculeIdentifier::FindIdentifier(int mmdbID, int moleculeID)
276 {
277  const MoleculeIdentifier *identifier = NULL;
278  MoleculeIdentifierList::const_iterator i, ie = knownIdentifiers.end();
279  for (i=knownIdentifiers.begin(); i!=ie; ++i) {
280  if (mmdbID == i->mmdbID && moleculeID == i->moleculeID) {
281  identifier = &(*i);
282  break;
283  }
284  }
285  return identifier;
286 }
287 
289 {
290  knownIdentifiers.clear();
291 }
292 
293 bool MoleculeIdentifier::MatchesSeqId(const ncbi::objects::CSeq_id& sid) const
294 {
295  SeqIdList::const_iterator i, ie = seqIDs.end();
296  for (i=seqIDs.begin(); i!=ie; ++i)
297  if (sid.Match(**i))
298  return true;
299 
300  return false;
301 }
302 
304 {
305  // identifier sort - float sequences with PDB id's to the top, then gi's, then accessions
306  if (a->pdbID.size() > 0) {
307  if (b->pdbID.size() > 0) {
308  if (a->pdbID < b->pdbID)
309  return true;
310  else if (a->pdbID > b->pdbID)
311  return false;
312  else {
313  return (a->pdbChain.compare(b->pdbChain) < 0);
314  }
315  } else
316  return true;
317  }
318 
319  else if (a->gi != GI_NOT_SET) {
320  if (b->pdbID.size() > 0)
321  return false;
322  else if (b->gi != GI_NOT_SET)
323  return (a->gi < b->gi);
324  else
325  return true;
326  }
327 
328  else if (b->pdbID.size() > 0 || b->gi != GI_NOT_SET)
329  return false;
330 
331  else if (a->seqIDs.size() > 0 && b->seqIDs.size() > 0)
332  return (a->seqIDs.front()->GetSeqIdString() < b->seqIDs.front()->GetSeqIdString());
333 
334  ERRORMSG("Don't know how to compare identifiers " << a->ToString() << " and " << b->ToString());
335  return false;
336 }
337 
339 {
340  CNcbiOstrstream oss;
341  bool bPdbChainNotSet = pdbChain.empty();
342 
343  if (pdbID.size() == 4 && !bPdbChainNotSet) {
344  oss << pdbID;
345 
346  if (pdbChain != " ") {
347  oss << '_' << pdbChain;
348  }
349 
350  } else if (gi != GI_NOT_SET) {
351  oss << "gi " << gi;
352  } else if (mmdbID != VALUE_NOT_SET && moleculeID != VALUE_NOT_SET) {
353  oss << "mmdb " << mmdbID << " molecule " << moleculeID;
354  } else if (seqIDs.size() > 0) {
355  oss << seqIDs.front()->GetSeqIdString();
356  } else {
357  oss << '?';
358  }
359  return (string) CNcbiOstrstreamToString(oss);
360 }
361 
362 END_SCOPE(Cn3D)
#define static
User-defined methods of the data storage class.
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
Definition: ncbistre.hpp:802
void AddFields(const SeqIdList &ids)
static bool CompareIdentifiers(const MoleculeIdentifier *a, const MoleculeIdentifier *b)
bool MatchesSeqId(const ncbi::objects::CSeq_id &sid) const
static void ClearIdentifiers(void)
static const int VALUE_NOT_SET
std::string ToString(void) const
std::list< ncbi::CRef< ncbi::objects::CSeq_id > > SeqIdList
static const MoleculeIdentifier * GetIdentifier(const Molecule *molecule, const SeqIdList &ids)
static const TGi GI_NOT_SET
static const MoleculeIdentifier * FindIdentifier(int mmdbID, int moleculeID)
unsigned int Length(void) const
std::string GetPDBID(char separator='_') const
static const int NO_MMDB_ID
#define ERRORMSG(stream)
Definition: cn3d_tools.hpp:86
#define NULL
Definition: ncbistd.hpp:225
@ e_NO
SeqIds compared, but are different.
Definition: Seq_id.hpp:582
@ e_DIFF
different SeqId types-can't compare
Definition: Seq_id.hpp:581
@ e_YES
SeqIds compared, are equivalent.
Definition: Seq_id.hpp:583
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
#define NPOS
Definition: ncbistr.hpp:133
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2891
int i
yy_size_t n
USING_SCOPE(objects)
list< MoleculeIdentifier > MoleculeIdentifierList
USING_NCBI_SCOPE
static MoleculeIdentifierList knownIdentifiers
unsigned int a
Definition: ncbi_localip.c:102
int isalnum(Uchar c)
Definition: ncbictype.hpp:62
The NCBI C++/STL use hints.
NCBI C++ stream class wrappers for triggering between "new" and "old" C++ stream libraries.
bool ne(T x_, T y_, T round_)
Definition: njn_approx.hpp:82
#define const
Definition: zconf.h:232
Modified on Wed May 29 18:39:38 2024 by modify_doxy.py rev. 669887