NCBI C++ ToolKit
cuSequence.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: cuSequence.cpp 100598 2023-08-15 14:35:24Z gaudaensj $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Chris Lanczycki
27  *
28  * File Description:
29  *
30  * Functions for manipulating Bioseqs and other sequence representations
31  *
32  * ===========================================================================
33  */
34 
35 
36 
37 #include <ncbi_pch.hpp>
40 
42 #include <objects/seq/Seq_data.hpp>
44 #include <objects/seq/Seq_inst.hpp>
45 #include <objects/seq/Seqdesc.hpp>
46 
47 #include <objects/seq/NCBIeaa.hpp>
48 #include <objects/seq/IUPACaa.hpp>
50 
53 
60 
62 
65 
67 BEGIN_SCOPE(cd_utils)
68 
69 
71 //-------------------------------------------------------------------------
72 // return true if the 2 sequence-id's match
73 //-------------------------------------------------------------------------
74  if (ID1.Empty() || ID2.Empty()) {
75  return false;
76  }
77  return ID1->Match(*ID2);
78 }
79 
80 bool SeqIdHasMatchInBioseq(const CRef< CSeq_id>& id, const CBioseq& bioseq)
81 {
82 //-------------------------------------------------------------------------
83 // return true if 'id' matches at least one element in bioseq's id list
84 //-------------------------------------------------------------------------
85  if (id.Empty()) return false;
86 
87  bool result = false;
88  const CBioseq::TId& bioseqIds = bioseq.GetId();
89  CBioseq::TId::const_iterator cit = bioseqIds.begin(), cend = bioseqIds.end();
90  for (; cit != cend && !result; ++cit) {
91  result = SeqIdsMatch(id, *cit);
92  }
93  return result;
94 }
95 
96 // Return 0 if Seq_id is not of proper type (e_General and database 'CDD')
98  int pssmId = 0;
99  string database = "CDD";
100 
101  if (id.NotEmpty() && id->IsGeneral()) {
102  if (id->GetGeneral().GetDb() == database) {
103  if (id->GetGeneral().GetTag().IsId()) {
104  pssmId = id->GetGeneral().GetTag().GetId();
105  }
106  }
107  }
108  return pssmId;
109 }
110 
111 // formerly FindMMDBIdInBioseq(...)
112 int GetMMDBId(const CBioseq& bioseq) {
113  list< CRef< CSeq_annot > >::const_iterator j;
114  list< CRef< CSeq_id > >::const_iterator k;
115  int id = -1;
116 
117  // look through each seq-annot
118  if (bioseq.IsSetAnnot()) {
119  for (j = bioseq.GetAnnot().begin(); j!= bioseq.GetAnnot().end(); j++) {
120  // look through ids for an mmdb id
121  if ((*j)->GetData().IsIds()) {
122  for (k = (*j)->GetData().GetIds().begin(); k != (*j)->GetData().GetIds().end(); k++) {
123  if ((*k)->IsGeneral()) {
124  if ((*k)->GetGeneral().GetDb() == "mmdb") {
125  if ((*k)->GetGeneral().GetTag().IsId()) {
126  id = (*k)->GetGeneral().GetTag().GetId();
127  return(id);
128  }
129  }
130  }
131  }
132  }
133  }
134  }
135 
136  return(id);
137 }
138 
140 
141  bool isTaxIdFound = false;
142  TTaxId thisTaxid, taxid = ZERO_TAX_ID;
143  list< CRef< CSeqdesc > >::const_iterator j, jend;
144 
145  if (bioseq.IsSetDescr())
146  {
147  jend = bioseq.GetDescr().Get().end();
148 
149  // look through the sequence descriptions
150  for (j=bioseq.GetDescr().Get().begin(); j!=jend; j++)
151  {
152  const COrg_ref *org = NULL;
153  if ((*j)->IsOrg())
154  org = &((*j)->GetOrg());
155  else if ((*j)->IsSource())
156  org = &((*j)->GetSource().GetOrg());
157  if (org)
158  {
159  vector < CRef< CDbtag > >::const_iterator k, kend = org->GetDb().end();
160  for (k=org->GetDb().begin(); k != kend; ++k) {
161  if ((*k)->GetDb() == "taxon") {
162  if ((*k)->GetTag().IsId()) {
163  thisTaxid = TAX_ID_FROM(CObject_id::TId, (*k)->GetTag().GetId());
164 
165  // Mark the first valid tax id found; if there are others,
166  // return -(firstTaxid) if they are not all equal. Allow for
167  // thisTaxid < 0, which CTaxon1 allows when there are multiple ids.
168  if (isTaxIdFound && taxid != thisTaxid && taxid != -thisTaxid) {
169  if (taxid > ZERO_TAX_ID) taxid = -taxid;
170  } else if (taxid == ZERO_TAX_ID && thisTaxid != ZERO_TAX_ID && !isTaxIdFound) {
171  taxid = (thisTaxid > ZERO_TAX_ID) ? thisTaxid : -thisTaxid;
172  isTaxIdFound = true;
173  }
174 // break;
175  }
176  }
177  }
178  } //end if (org)
179  }//end for
180  }
181  return taxid;
182 }
183 
184 bool IsEnvironmentalSeq(const CBioseq& bioseq) {
185  //256318 is the taxid for Venter's infamous environmental sequences
187 }
188 
189 //string CCd::GetSpecies(int SeqIndex) {
190 string GetSpeciesFromBioseq(const CBioseq& bioseq) {
191 //-------------------------------------------------------------------------
192 // get the species of the SeqIndex sequence; does not use taxonomy server
193 //-------------------------------------------------------------------------
194 // int SeqCount;
195  list< CRef< CSeqdesc > >::const_iterator j;
196  if (bioseq.IsSetDescr()) {
197  // look through the sequence descriptions
198  for (j=bioseq.GetDescr().Get().begin();
199  j!=bioseq.GetDescr().Get().end(); j++) {
200  // if there's an organism identifier
201  if ((*j)->IsSource()) {
202  // retrieve common or formal name
203  if ((*j)->GetSource().GetOrg().IsSetTaxname()) {
204  return((*j)->GetSource().GetOrg().GetTaxname());
205  }
206  if ((*j)->GetSource().GetOrg().IsSetCommon()) {
207  return((*j)->GetSource().GetOrg().GetCommon());
208  }
209  }
210  }
211  }
212  return(kEmptyStr);
213 }
214 
215 // Length = 0 on failure.
216 int GetSeqLength(const CBioseq& bioseq)
217 {
218  int len = 0;
219 
220  if (bioseq.GetInst().IsSetLength()) {
221  len = bioseq.GetInst().GetLength();
222  } else if (bioseq.GetInst().IsSetSeq_data()) {
223  const CSeq_data & pDat = bioseq.GetInst().GetSeq_data();
224 
225  if (pDat.IsNcbieaa()) {
226  len = pDat.GetNcbieaa().Get().size();
227  } else if (pDat.IsIupacaa()) {
228  len = pDat.GetIupacaa().Get().size();
229  } else if (pDat.IsNcbistdaa()) {
230  len = pDat.GetNcbistdaa().Get().size();
231  } else {
232  len = 0;
233  }
234  }
235  return len;
236 }
237 
238 
239 
240 // Returns false (length = 0) on failure, empty seq_entry,
241 // or if the seq_entry represents a set of seq_entry objects.
242 bool GetSeqLength(const CRef< CSeq_entry >& Seq, int& len)
243 {
244  len = 0;
245  if (Seq.Empty() || Seq->IsSet()) return false;
246  if (Seq->GetSeq().GetInst().IsSetLength()) {
247  len = Seq->GetSeq().GetInst().GetLength();
248  } else {
249  len = GetSeqLength(Seq->GetSeq());
250  }
251  return (len != 0);
252 }
253 
254 
255 
256 
257 // for converting ncbistdaa sequences to ncbieaa sequences
258 void NcbistdaaToNcbieaaString(const std::vector < char >& vec, std::string *str)
259 {
260  if (str) {
261  str->erase();
262  str->resize(vec.size());
263  try {
265  } catch (exception& e) {
266  *str = e.what();
267  }
268  }
269 }
270 
271 // for converting ncbieaa sequences to ncbistdaa sequences
272 bool NcbieaaToNcbistdaaString(const std::string& str, vector < char >& vec)
273 {
274  bool result = true;
275  vec.clear();
276  if (str.size() > 0) {
277  vec.reserve(str.size());
278  try {
280  } catch (...) {
281  result = false;
282  }
283  }
284  return result;
285 }
286 
287 // some stuff from cdt_manipcd
288 // False if the seq_entry is not a single bioseq.
289 bool GetNcbieaaString(const CRef< CSeq_entry >& Seq, string & Str)
290 {
291  if (Seq->IsSeq() && Seq->GetSeq().GetInst().IsSetSeq_data()) {
292  const CBioseq& bioseq = Seq->GetSeq();
293  return GetNcbieaaString(bioseq, Str);
294  }
295  return false;
296 }
297 
298 bool GetNcbieaaString(const CBioseq& bioseq, string & Str)
299 {
300  if (bioseq.GetInst().IsSetSeq_data()) {
301 
302  const CSeq_data & pDat= bioseq.GetInst().GetSeq_data();
303 
304  if (pDat.IsNcbieaa()) Str = pDat.GetNcbieaa().Get();
305  else if (pDat.IsIupacaa()) Str = pDat.GetIupacaa().Get();
306  else if (pDat.IsNcbistdaa()) {
307  const std::vector < char >& vec = pDat.GetNcbistdaa().Get();
308  NcbistdaaToNcbieaaString(vec, &Str);
309  }
310  return true;
311  }
312  return false;
313 }
314 
315 bool GetNcbistdSeq(const CBioseq& bioseq, vector<char>& seqData)
316 {
317  if (bioseq.GetInst().IsSetSeq_data())
318  {
319  const CSeq_data & pDat= bioseq.GetInst().GetSeq_data();
320  if (pDat.IsNcbieaa())
321  {
322  string Str = pDat.GetNcbieaa().Get();
323  try {
324  CSeqConvert::Convert(Str, CSeqUtil::e_Ncbieaa, 0, Str.size(), seqData, CSeqUtil::e_Ncbistdaa);
325  } catch (...) {
326  return false;
327  }
328  }
329  else if (pDat.IsIupacaa())
330  {
331  string Str = pDat.GetIupacaa().Get();
332  try {
333  CSeqConvert::Convert(Str, CSeqUtil::e_Iupacaa, 0, Str.size(), seqData, CSeqUtil::e_Ncbistdaa);
334  } catch (...) {
335  return false;
336  }
337  }
338  else if (pDat.IsNcbistdaa()) {
339  const std::vector < char >& vec = pDat.GetNcbistdaa().Get();
340  seqData.assign(vec.begin(), vec.end());
341  }
342  return true;
343  }
344  return false;
345 }
346 
347 // Return as a raw string whatever found in the bioseq. Ignore types ncbi8aa
348 // and ncbipaa, and all nucleic acid encodings.
349 string GetRawSequenceString(const CBioseq& bioseq)
350 {
351  // copied from algo/blast/core/blast_encoding.h
352  // usage copied as per line 2190 in objtools/blast_format/blastfmtutil.cpp
353  static const char MY_NCBISTDAA_TO_AMINOACID[28] = {
354  '-','A','B','C','D','E','F','G','H','I','K','L','M',
355  'N','P','Q','R','S','T','V','W','X','Y','Z','U','*',
356  'O', 'J'};
357 
358 
359  string s = kEmptyStr;
360  if (bioseq.GetInst().IsSetSeq_data()) {
361  const CSeq_data & pDat= bioseq.GetInst().GetSeq_data();
362  if (pDat.IsNcbieaa()) {
363  s = pDat.GetNcbieaa().Get();
364  } else if (pDat.IsIupacaa()) {
365  s = pDat.GetIupacaa().Get();
366  } else if (pDat.IsNcbistdaa()) {
367  const std::vector < char >& vec = pDat.GetNcbistdaa().Get();
368  s.resize(vec.size());
369  for (unsigned int i=0; i<vec.size(); i++) {
370  // simply doing s.at(i) = vec[i] didn't work
371  s.at(i) = MY_NCBISTDAA_TO_AMINOACID[(int)vec[i]];
372  }
373  }
374  }
375  return s;
376 }
377 
378 
379 // If zeroBased == true, first letter is 0, otherwise number residues from 1.
380 char GetResidueAtPosition(const CRef< CSeq_entry >& seqEntry, int pos, bool zeroBased)
381 {
382  if (pos > 0 && seqEntry->IsSeq() && seqEntry->GetSeq().GetInst().IsSetSeq_data()) {
383  return GetResidueAtPosition(seqEntry->GetSeq(), pos, zeroBased);
384  }
385  return 0;
386 }
387 
388 // If zeroBased == true, first letter is 0, otherwise number residues from 1.
389 char GetResidueAtPosition(const CBioseq& bioseq, int pos, bool zeroBased)
390 {
391 
392  char residue = 0;
393  string str = "";
394  if (pos >= 0 && GetNcbieaaString(bioseq, str)) {
395  if (zeroBased && pos < (int) str.size()) {
396  residue = str[pos];
397  } else if (!zeroBased && pos <= (int) str.size() && pos != 0) {
398  residue = str[pos-1];
399  }
400  }
401  return residue;
402 }
403 
404 
405 bool IsConsensus(const CRef< CSeq_id >& seqId) {
406  bool result = false;
407 
408  if (seqId.NotEmpty()) {
409  if (seqId->IsLocal()) {
410  if (seqId->GetLocal().IsStr()) {
411  if(seqId->GetLocal().GetStr() == "consensus") {
412  result = true;
413  }
414  }
415  }
416  }
417  return result;
418 }
419 
420 bool GetAccAndVersion(const CRef< CBioseq > bioseq, string& acc, int& version, CRef< CSeq_id>& seqId)
421 {
422  acc.erase();
423  const list< CRef< CSeq_id > >& seqIds = bioseq->GetId();
424  for (list< CRef< CSeq_id > >::const_iterator cit = seqIds.begin();
425  cit != seqIds.end(); cit++)
426  {
427  const CTextseq_id* textId = (*cit)->GetTextseq_Id();
428  if (textId)
429  {
430  if (textId->CanGetAccession())
431  acc = textId->GetAccession();
432  if (acc.size() > 0)
433  {
434  if (textId->CanGetVersion())
435  version = textId->GetVersion();
436  seqId = new CSeq_id;
437  seqId->Assign(**cit);
438  break;
439  }
440  }
441  }
442  return acc.size() != 0;
443 }
444 
445 
447 {
448  if (seqEntry->IsSeq())
449  {
450  const list< CRef< CSeqdesc > >& descrList = seqEntry->GetSeq().GetDescr().Get();
451  list< CRef< CSeqdesc > >::const_iterator cit = descrList.begin();
452  for (; cit != descrList.end(); cit++)
453  {
454  if ((*cit)->IsPdb())
455  {
456  CRef< CSeqdesc > desc= *cit;
457  pdbBlock.Reset( &(desc->SetPdb()) );
458  return true;
459  }
460  }
461  }
462  else
463  {
464  const list< CRef< CSeqdesc > >& descrList = seqEntry->GetSet().GetDescr().Get();
465  list< CRef< CSeqdesc > >::const_iterator cit = descrList.begin();
466  for (; cit != descrList.end(); cit++)
467  {
468  if ((*cit)->IsPdb())
469  {
470  CRef< CSeqdesc > desc= *cit;
471  pdbBlock.Reset( &(desc->SetPdb()) );
472  return true;
473  }
474  }
475  list< CRef< CSeq_entry > >::const_iterator lsei;
476  const list< CRef< CSeq_entry > >& seqEntryList = seqEntry->GetSet().GetSeq_set();
477  for (lsei = seqEntryList.begin(); lsei != seqEntryList.end(); ++lsei)
478  {
479  if(GetPDBBlockFromSeqEntry(*lsei, pdbBlock))
480  return true;
481  }
482  }
483  return false;
484 }
485 
486 
487 bool CopyGiSeqId(const CRef<CBioseq>& bioseq, CRef<CSeq_id>& giSeqId, unsigned int nth)
488 {
489  bool result = false;
490  unsigned int ctr = 0;
491  CBioseq::TId::const_iterator idCit, idEnd;
492 
493  idEnd = bioseq->GetId().end();
494  for (idCit = bioseq->GetId().begin(); idCit != idEnd && ctr < nth; ++idCit) {
495  if ((*idCit).NotEmpty() && (*idCit)->IsGi()) {
496 
497  // Skip until hit the specified entry in the bioseq.
498  ++ctr;
499  if (ctr != nth) continue;
500 
501  giSeqId->Assign(**idCit);
502  result = true;
503  }
504  }
505  return result;
506 }
507 
508 bool ExtractGi(const CRef<CBioseq>& bioseq, TGi& gi, unsigned int nth)
509 {
510  bool result = false;
511  CRef< CSeq_id > giSeqId(new CSeq_id());
512 
513  gi = ZERO_GI;
514  if (CopyGiSeqId(bioseq, giSeqId, nth)) {
515  gi = giSeqId->GetGi();
516  result = true;
517  }
518  return result;
519 }
520 
521 // Last arg tells which id to use if there are multiple pdbs - which there shouldn't be.
522 bool CopyPdbSeqId(const CRef<CBioseq>& bioseq, CRef<CSeq_id>& pdbSeqId, unsigned int nth)
523 {
524  bool result = false;
525  unsigned int ctr = 0;
526  CBioseq::TId::const_iterator idCit, idEnd;
527 
528  idEnd = bioseq->GetId().end();
529  for (idCit = bioseq->GetId().begin(); idCit != idEnd && ctr < nth; ++idCit) {
530  if ((*idCit).NotEmpty() && (*idCit)->IsPdb()) {
531 
532  // Skip until hit the specified entry in the bioseq.
533  ++ctr;
534  if (ctr != nth) continue;
535 
536  pdbSeqId->Assign(**idCit);
537  result = true;
538  }
539  }
540  return result;
541 }
542 
543 // Last arg tells which id to use if there are multiple pdbs - which there shouldn't be.
544 bool ExtractPdbMolChain(const CRef<CBioseq>& bioseq, string& pdbMol, string& pdbChain, unsigned int nth)
545 {
546  bool result = false;
547  CRef< CSeq_id > pdbSeqId(new CSeq_id());
548 
549  pdbMol = "";
550  pdbChain = "";
551  if (CopyPdbSeqId(bioseq, pdbSeqId, nth)) {
552  pdbMol = pdbSeqId->GetPdb().GetMol().Get();
553  pdbChain = pdbSeqId->GetPdb().GetEffectiveChain_id();
554  result = true;
555  }
556  return result;
557 }
558 
559 bool HasSeqIdOfType(const CBioseq& bioseq, CSeq_id::E_Choice choice)
560 {
561  bool result = false;
562  CBioseq::TId::const_iterator idCit = bioseq.GetId().begin(), idEnd = bioseq.GetId().end();
563 
564  for (; idCit != idEnd && !result; ++idCit) {
565  if ((*idCit)->Which() == choice) {
566  result = true;
567  }
568  }
569  return result;
570 }
571 
573 {
574  bool result = false;
575  CBioseq_set::TSeq_set::const_iterator bssCit, bssEnd;
576 
577  if (seqEntry.NotEmpty()) {
578  if (seqEntry->IsSet()) {
579  bssCit = seqEntry->GetSet().GetSeq_set().begin();
580  bssEnd = seqEntry->GetSet().GetSeq_set().end();
581  for (; bssCit != bssEnd && !result; ++bssCit) {
582  if ((*bssCit)->IsSeq()) {
583  result = HasSeqIdOfType((*bssCit)->GetSeq(), choice);
584  } else if ((*bssCit)->IsSet()) {
585  result = HasSeqIdOfType(*bssCit, choice); // recursive
586  }
587  }
588  } else if (seqEntry->IsSeq()) {
589  result = HasSeqIdOfType(seqEntry->GetSeq(), choice);
590  }
591  }
592  return result;
593 }
594 
595 unsigned int CopySeqIdsOfType(const CBioseq& bioseq, CSeq_id::E_Choice choice, list< CRef< CSeq_id > >& idsOfType)
596 {
597  CBioseq::TId::const_iterator idCit = bioseq.GetId().begin(), idEnd = bioseq.GetId().end();
598 
599  idsOfType.clear();
600  for (; idCit != idEnd; ++idCit) {
601  if ((*idCit)->Which() == choice) {
602  CRef< CSeq_id > id(new CSeq_id);
603  id->Assign(**idCit);
604  idsOfType.push_back(id);
605  }
606  }
607  return idsOfType.size();
608 }
609 
610 unsigned int CopySeqIdsOfType(const CRef< CSeq_entry >& seqEntry, CSeq_id::E_Choice choice, list< CRef< CSeq_id > >& idsOfType)
611 {
612  list< CRef< CSeq_id > > tmpList;
613  CBioseq_set::TSeq_set::const_iterator bssCit, bssEnd;
614 
615  idsOfType.clear();
616  if (seqEntry.NotEmpty()) {
617  if (seqEntry->IsSet()) {
618  bssCit = seqEntry->GetSet().GetSeq_set().begin();
619  bssEnd = seqEntry->GetSet().GetSeq_set().end();
620  for (; bssCit != bssEnd; ++bssCit) {
621  tmpList.clear();
622  if ((*bssCit)->IsSeq()) {
623  if (CopySeqIdsOfType((*bssCit)->GetSeq(), choice, tmpList) > 0) {
624  idsOfType.insert(idsOfType.end(), tmpList.begin(), tmpList.end());
625  }
626  } else if ((*bssCit)->IsSet()) { //recursive
627  if (CopySeqIdsOfType(*bssCit, choice, tmpList) > 0) {
628  idsOfType.insert(idsOfType.end(), tmpList.begin(), tmpList.end());
629  }
630  }
631  }
632  } else if (seqEntry->IsSeq()) {
633  CopySeqIdsOfType(seqEntry->GetSeq(), choice, idsOfType);
634  }
635  }
636  return idsOfType.size();
637 }
638 
639 
640 bool CopyBioseqWithType(const CRef< CSeq_entry >& seqEntry, CSeq_id::E_Choice choice, CRef< CBioseq >& seqEntryBioseq)
641 {
642  bool result = false;
643  list< CRef< CSeq_id > > tmpList;
644  CBioseq_set::TSeq_set::const_iterator bssCit, bssEnd;
645 
646  if (seqEntry.NotEmpty()) {
647  if (seqEntry->IsSet()) {
648 
649  bssCit = seqEntry->GetSet().GetSeq_set().begin();
650  bssEnd = seqEntry->GetSet().GetSeq_set().end();
651  for (; bssCit != bssEnd && !result; ++bssCit) {
652  if ((*bssCit)->IsSeq()) {
653  tmpList.clear();
654  if (CopySeqIdsOfType((*bssCit)->GetSeq(), choice, tmpList) > 0) {
655  seqEntryBioseq->Assign((*bssCit)->GetSeq());
656  result = true;
657  }
658  }
659  }
660 
661  } else if (seqEntry->IsSeq()) {
662  if (CopySeqIdsOfType(seqEntry->GetSeq(), choice, tmpList) > 0) {
663  seqEntryBioseq->Assign(seqEntry->GetSeq());
664  result = true;
665  }
666  }
667  }
668 
669  return result;
670 }
671 
673 {
674  bool result = false;
675  list< CRef< CSeq_id > > tmpList;
676  CBioseq_set::TSeq_set::iterator bssIt, bssEnd;
677 
678  if (seqEntry.NotEmpty()) {
679  if (seqEntry->IsSet()) {
680 
681  bssIt = seqEntry->SetSet().SetSeq_set().begin();
682  bssEnd = seqEntry->SetSet().SetSeq_set().end();
683  for (; bssIt != bssEnd && !result; ++bssIt) {
684  if ((*bssIt)->IsSeq()) {
685  tmpList.clear();
686  if (CopySeqIdsOfType((*bssIt)->GetSeq(), choice, tmpList) > 0) {
687  seqEntryBioseq = &(*bssIt)->SetSeq();
688  result = true;
689  }
690  }
691  }
692 
693  } else if (seqEntry->IsSeq()) {
694  if (CopySeqIdsOfType(seqEntry->GetSeq(), choice, tmpList) > 0) {
695  seqEntryBioseq = &(seqEntry->SetSeq());
696  result = true;
697  }
698  }
699  }
700 
701  return result;
702 }
703 
704 bool AddCommentToBioseq(CBioseq& bioseq, const string& comment)
705 {
706  bool result = (bioseq.IsSetDescr() && comment.length() > 0);
707 
708  if (result) {
709  CSeq_descr& seqDescr = bioseq.SetDescr();
710  CRef< CSeqdesc> seqdescComment(new CSeqdesc);
711  seqdescComment->SetComment(comment);
712  seqDescr.Set().push_back(seqdescComment);
713  }
714 
715  return result;
716 }
717 
// Strips a bioseq down to the descriptors wanted in a CD, then drops all of
// its annotations.
//
// Descriptors that survive:
//   - the first 'source' descriptor (any later ones are erased);
//   - every 'title' descriptor;
//   - 'comment' descriptors whose text equals an entry of 'keptComments';
//   - 'pdb' descriptors, but only when 'keepPDBBlock' is true.
// Everything else is erased from the list.
//
// If the bioseq had no title, one is appended using the first string of the
// first PDB block's 'compound' list that is encountered (when non-empty);
// this happens whether or not the PDB block itself is kept.
//
// If the descriptor list ends up empty the descriptor field is reset, and the
// annotation field is always reset.
//
// NOTE(review): bioseq.SetDescr() is called unconditionally; presumably it
// creates an empty descriptor set when none exists, which the size check at
// the end then removes again -- confirm against the CBioseq API.
718 void SimplifyBioseqForCD(CBioseq& bioseq, const vector<string>& keptComments, bool keepPDBBlock)
719 {
720  bool hasSource = false;
721  bool hasTitle = false;
722  string newTitle = kEmptyStr;
723  CSeq_descr& seqDescr = bioseq.SetDescr();
724 
725  if (seqDescr.IsSet()) {
726  list< CRef< CSeqdesc > >& descrList = seqDescr.Set();
727  list< CRef< CSeqdesc > >::iterator it = descrList.begin();
728 
729  // See if we have a title present...
// (first pass: read-only scan that stops at the first title found)
730  while (!hasTitle && it != descrList.end()) {
731  hasTitle = ((*it)->IsTitle());
732  ++it;
733  }
734 
735  // Can't pre-compute descrList.end() since descrList may change within the while loop.
// (second pass: each branch either advances 'it' to keep the element, or
//  replaces 'it' with the iterator returned by erase() to drop it)
736  it = descrList.begin();
737  while (it != descrList.end()) {
738  //only keep one source field
739  if ((*it)->IsSource() && (!hasSource)) {
740  hasSource = true;
741  it++;
742  } else if ((*it)->IsTitle()) {
743  it++;
744  } else if ((*it)->IsComment() && find(keptComments.begin(), keptComments.end(), (*it)->GetComment()) != keptComments.end()) {
745  it++;
746  } else if ((*it)->IsPdb()) {
747 
748  // If there is no title, create one from the PDB-Block 'compound' if possible.
// The new title is pushed onto the back of the list being traversed, so
// the loop will reach it later and keep it through the IsTitle() branch.
749  const CPDB_block& pdbBlock = (*it)->GetPdb();
750  if (!hasTitle && pdbBlock.CanGetCompound() && pdbBlock.GetCompound().size() > 0) {
751  newTitle = pdbBlock.GetCompound().front();
752  if (newTitle.length() > 0) {
753  CRef< CSeqdesc > addedTitle(new CSeqdesc);
754  addedTitle->SetTitle(newTitle);
755  descrList.push_back(addedTitle);
756  hasTitle = true;
757  }
758  }
759  if (keepPDBBlock) {
760  it++;
761  } else {
762  it = descrList.erase(it);
763  }
764  } else {
// Any other descriptor (including a second or later 'source') is dropped.
765  it = descrList.erase(it);
766  }
767  }
768  }
769 
// Do not leave an empty descriptor set behind.
770  if(bioseq.GetDescr().Get().size() == 0){
771  bioseq.ResetDescr();
772  }
773 
774  // reset annot field
775  bioseq.ResetAnnot();
776 }
777 
778 void SimplifySeqEntryForCD(CRef< CSeq_entry >& seqEntry, const vector<string>& keptComments, bool keepPDBBlock)
779 {
780  if (seqEntry.Empty()) return;
781 
782  if (seqEntry->IsSeq()) {
783  SimplifyBioseqForCD(seqEntry->SetSeq(), keptComments, keepPDBBlock);
784  } else if (seqEntry->IsSet()) {
785  CBioseq_set::TSeq_set::iterator bssIt = seqEntry->SetSet().SetSeq_set().begin(), bssEnd = seqEntry->SetSet().SetSeq_set().end();
786  for (; bssIt != bssEnd; ++bssIt) {
787  SimplifySeqEntryForCD(*bssIt, keptComments, keepPDBBlock);
788 // if ((*bssIt)->IsSeq()) {
789 // SimplifyBioseqForCD((*bssIt)->SetSeq(), keptComment, keepPDBBlock);
790 // }
791  }
792  }
793 }
794 
795 
797 {
798  string acc, dbSource;
799  GetAccessionAndDatabaseSource(seqID, acc, dbSource, false);
800  return dbSource;
801 }
802 
804 {
805  string acc, dbSource;
806  GetAccessionAndDatabaseSource(seqID, acc, dbSource);
807  return acc;
808 }
809 
810 
811 void GetAccessionAndDatabaseSource(const CRef< CSeq_id >& seqID, string& accession, string& dbSource, bool getGenericSource)
812 {
814  accession = "unknown";
815  if (!seqID) {
816  return;
817  }
818 
819  // Only getting the generic source at this point.
820  dbSource = CCdDbPriority::SeqIdTypeToSource((unsigned int) seqID->Which());
821 
822  if (seqID->IsGi()) {
823  accession = NStr::NumericToString(seqID->GetGi());
824  }
825  else if (seqID->IsPdb()) {
826  const CPDB_seq_id& pPDB_ID = seqID->GetPdb();
827  accession = pPDB_ID.GetMol().Get() + " " + pPDB_ID.GetEffectiveChain_id();
828  }
829  else if (seqID->IsLocal()) {
830  const CObject_id& pLocal = seqID->GetLocal();
831  if (pLocal.IsId()) {
832  accession = NStr::IntToString(pLocal.GetId());
833  }
834  else if (pLocal.IsStr()) {
835  accession = pLocal.GetStr();
836  }
837  }
838  else if (seqID->IsGeneral()) {
839  const CDbtag& pGeneral = seqID->GetGeneral();
840  if (pGeneral.IsSetDb() && !getGenericSource) { // look for a specific type only if requested
841  dbSource = dbSource + ": " + pGeneral.GetDb();
842  }
843  if (pGeneral.IsSetTag()) {
844  if (pGeneral.GetTag().IsId()) {
845  accession = NStr::IntToString(pGeneral.GetTag().GetId());
846  }
847  else if (pGeneral.GetTag().IsStr()) {
848  accession = pGeneral.GetTag().GetStr();
849  }
850  }
851  }
852  // Four unexpected Seq-id types added for completeness
853  else if (seqID->IsGibbsq()) {
854  accession = NStr::IntToString(seqID->GetGibbsq());
855  }
856  else if (seqID->IsGibbmt()) {
857  accession = NStr::IntToString(seqID->GetGibbmt());
858  }
859  else if (seqID->IsGiim()) {
860  if (seqID->GetGiim().CanGetDb()) {
861  dbSource = seqID->GetGiim().GetDb();
862  }
863  accession = NStr::IntToString(seqID->GetGiim().GetId());
864  }
865  else if (seqID->IsPatent()) {
866  accession = NStr::IntToString(seqID->GetPatent().GetSeqid());
867  }
868 
869  // The rest have a CTextseq_id type....
870  else {
871  const CTextseq_id* textseqId = seqID->GetTextseq_Id();
872  if (!textseqId) return;
873 
874  // Report the 'accession' field as the accession.
875  // If the accession field is not set, use the 'name' field if available, as some
876  // types (PRF, PIR) use the 'name' field in Entrez Genpept reports as an accession.
877  string tidName = (textseqId->CanGetName()) ? textseqId->GetName() : "";
878  accession = (textseqId->CanGetAccession()) ? textseqId->GetAccession() : tidName;
879 
880  }
881 
882  // Use a specific source based on accession when requested...
883  // (as a special case, general IDs have the specific source set above)
884  if (!getGenericSource && !seqID->IsGeneral()) {
885  dbSource = CCdDbPriority::SeqIdTypeToSource((unsigned int) seqID->Which(), accession);
886  }
887 }
888 
890 {
891  info.acession.erase();
892  const list< CRef< CSeq_id > >& seqIds = bioseq->GetId();
893  for (list< CRef< CSeq_id > >::const_iterator cit = seqIds.begin();
894  cit != seqIds.end(); cit++)
895  {
896  const CTextseq_id* textId = (*cit)->GetTextseq_Id();
897  if (textId)
898  {
899  if (textId->CanGetAccession())
900  info.acession = textId->GetAccession();
901  if (info.acession.size() > 0)
902  {
903  if (textId->CanGetVersion())
904  info.version = textId->GetVersion();
905  info.dbsource = (*cit)->Which();
906  break;
907  }
908  }
909  }
910  if (bioseq->IsSetDescr())
911  {
912  list< CRef< CSeqdesc > >::const_iterator dit;
913  // look through the sequence descriptions
914  for (dit=bioseq->GetDescr().Get().begin();
915  dit!=bioseq->GetDescr().Get().end(); dit++)
916  {
917  // if there's a title, return that description
918  if ((*dit)->IsTitle())
919  info.defline = ((*dit)->GetTitle());
920  }
921  }
922  return !info.acession.empty();
923 }
924 
925 END_SCOPE(cd_utils) // namespace ncbi::objects::
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
static string SeqIdTypeToSource(unsigned int seqIdType, string accession=kEmptyStr)
static string GetSourceName(EDbSource priority)
Definition: Dbtag.hpp:53
CPDB_block –.
Definition: PDB_block.hpp:66
string GetEffectiveChain_id(EBothUnsetPriority bothUnsetPriority=eBothUnset_ChainId) const
Definition: PDB_seq_id.cpp:116
CRef –.
Definition: ncbiobj.hpp:618
static SIZE_TYPE Convert(const CTempString &src, TCoding src_coding, TSeqPos pos, TSeqPos length, string &dst, TCoding dst_coding)
@ e_Ncbieaa
Definition: sequtil.hpp:57
@ e_Ncbistdaa
Definition: sequtil.hpp:58
@ e_Iupacaa
Definition: sequtil.hpp:55
@Seq_descr.hpp User-defined methods of the data storage class.
Definition: Seq_descr.hpp:55
bool NcbieaaToNcbistdaaString(const std::string &str, vector< char > &vec)
Definition: cuSequence.cpp:272
bool ExtractGi(const CRef< CBioseq > &bioseq, TGi &gi, unsigned int nth)
Definition: cuSequence.cpp:508
bool SeqIdHasMatchInBioseq(const CRef< CSeq_id > &id, const CBioseq &bioseq)
Definition: cuSequence.cpp:80
void NcbistdaaToNcbieaaString(const std::vector< char > &vec, std::string *str)
Definition: cuSequence.cpp:258
TTaxId GetTaxIdInBioseq(const CBioseq &bioseq)
Definition: cuSequence.cpp:139
bool GetNcbistdSeq(const CBioseq &bioseq, vector< char > &seqData)
Definition: cuSequence.cpp:315
bool CopyBioseqWithType(const CRef< CSeq_entry > &seqEntry, CSeq_id::E_Choice choice, CRef< CBioseq > &seqEntryBioseq)
Definition: cuSequence.cpp:640
bool GetNcbieaaString(const CRef< CSeq_entry > &Seq, string &Str)
Definition: cuSequence.cpp:289
int GetSeqLength(const CBioseq &bioseq)
Definition: cuSequence.cpp:216
bool GetBioseqWithType(CRef< CSeq_entry > &seqEntry, CSeq_id::E_Choice choice, CRef< CBioseq > &seqEntryBioseq)
Definition: cuSequence.cpp:672
void SimplifySeqEntryForCD(CRef< CSeq_entry > &seqEntry, const vector< string > &keptComments, bool keepPDBBlock)
Definition: cuSequence.cpp:778
bool AddCommentToBioseq(CBioseq &bioseq, const string &comment)
Definition: cuSequence.cpp:704
string GetSpeciesFromBioseq(const CBioseq &bioseq)
Definition: cuSequence.cpp:190
bool CopyGiSeqId(const CRef< CBioseq > &bioseq, CRef< CSeq_id > &giSeqId, unsigned int nth)
Definition: cuSequence.cpp:487
void SimplifyBioseqForCD(CBioseq &bioseq, const vector< string > &keptComments, bool keepPDBBlock)
Definition: cuSequence.cpp:718
char GetResidueAtPosition(const CRef< CSeq_entry > &seqEntry, int pos, bool zeroBased)
Definition: cuSequence.cpp:380
bool GetPDBBlockFromSeqEntry(CRef< CSeq_entry > seqEntry, CRef< CPDB_block > &pdbBlock)
Definition: cuSequence.cpp:446
bool extractBioseqInfo(const CRef< CBioseq > bioseq, BioseqInfo &info)
Definition: cuSequence.cpp:889
string GetAccessionForSeqId(const CRef< CSeq_id > &seqID)
Definition: cuSequence.cpp:803
string GetDbSourceForSeqId(const CRef< CSeq_id > &seqID)
Definition: cuSequence.cpp:796
int GetCDDPssmIdFromSeqId(const CRef< CSeq_id > &id)
Definition: cuSequence.cpp:97
bool IsEnvironmentalSeq(const CBioseq &bioseq)
Definition: cuSequence.cpp:184
string GetRawSequenceString(const CBioseq &bioseq)
Definition: cuSequence.cpp:349
bool CopyPdbSeqId(const CRef< CBioseq > &bioseq, CRef< CSeq_id > &pdbSeqId, unsigned int nth)
Definition: cuSequence.cpp:522
bool ExtractPdbMolChain(const CRef< CBioseq > &bioseq, string &pdbMol, string &pdbChain, unsigned int nth)
Definition: cuSequence.cpp:544
void GetAccessionAndDatabaseSource(const CRef< CSeq_id > &seqID, string &accession, string &dbSource, bool getGenericSource)
Definition: cuSequence.cpp:811
unsigned int CopySeqIdsOfType(const CBioseq &bioseq, CSeq_id::E_Choice choice, list< CRef< CSeq_id > > &idsOfType)
Definition: cuSequence.cpp:595
bool HasSeqIdOfType(const CBioseq &bioseq, CSeq_id::E_Choice choice)
Definition: cuSequence.cpp:559
int GetMMDBId(const CBioseq &bioseq)
Definition: cuSequence.cpp:112
bool SeqIdsMatch(const CRef< CSeq_id > &ID1, const CRef< CSeq_id > &ID2)
Definition: cuSequence.cpp:70
bool IsConsensus(const CRef< CSeq_id > &seqId)
Definition: cuSequence.cpp:405
bool GetAccAndVersion(const CRef< CBioseq > bioseq, string &acc, int &version, CRef< CSeq_id > &seqId)
Definition: cuSequence.cpp:420
const TTaxId ENVIRONMENTAL_SEQUENCE_TAX_ID
Definition: cuSequence.hpp:53
bool Empty(const CNcbiOstrstream &src)
Definition: fileutil.cpp:523
#define bool
Definition: bool.h:34
static const char * str(char *buf, int n)
Definition: stats.c:84
#define ZERO_TAX_ID
Definition: ncbimisc.hpp:1115
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
#define TAX_ID_FROM(T, value)
Definition: ncbimisc.hpp:1111
#define ZERO_GI
Definition: ncbimisc.hpp:1088
string
Definition: cgiapp.hpp:690
#define NULL
Definition: ncbistd.hpp:225
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
const TPrim & Get(void) const
Definition: serialbase.hpp:347
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
Definition: Seq_id.cpp:318
const CTextseq_id * GetTextseq_Id(void) const
Return embedded CTextseq_id, if any.
Definition: Seq_id.cpp:169
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
#define kEmptyStr
Definition: ncbistr.hpp:123
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5078
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
bool IsStr(void) const
Check if variant Str is selected.
Definition: Object_id_.hpp:291
bool IsSetDb(void) const
Name of database or system. Check if a value has been assigned to Db data member.
Definition: Dbtag_.hpp:208
const TTag & GetTag(void) const
Get the Tag member data.
Definition: Dbtag_.hpp:267
bool IsId(void) const
Check if variant Id is selected.
Definition: Object_id_.hpp:264
bool IsSetTag(void) const
Appropriate tag. Check if a value has been assigned to Tag data member.
Definition: Dbtag_.hpp:255
const TDb & GetDb(void) const
Get the Db member data.
Definition: Dbtag_.hpp:220
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
TId GetId(void) const
Get the variant data.
Definition: Object_id_.hpp:270
const TDb & GetDb(void) const
Get the Db member data.
Definition: Org_ref_.hpp:491
bool IsPatent(void) const
Check if variant Patent is selected.
Definition: Seq_id_.hpp:865
const TPdb & GetPdb(void) const
Get the variant data.
Definition: Seq_id_.cpp:435
TGibbsq GetGibbsq(void) const
Get the variant data.
Definition: Seq_id_.hpp:787
TId GetId(void) const
Get the Id member data.
const TName & GetName(void) const
Get the Name member data.
bool IsGibbmt(void) const
Check if variant Gibbmt is selected.
Definition: Seq_id_.hpp:808
bool IsGeneral(void) const
Check if variant General is selected.
Definition: Seq_id_.hpp:877
bool CanGetName(void) const
Check if it is safe to call GetName method.
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_id_.hpp:746
bool IsPdb(void) const
Check if variant Pdb is selected.
Definition: Seq_id_.hpp:922
TGi GetGi(void) const
Get the variant data.
Definition: Seq_id_.hpp:889
bool CanGetDb(void) const
Check if it is safe to call GetDb method.
TVersion GetVersion(void) const
Get the Version member data.
bool CanGetVersion(void) const
Check if it is safe to call GetVersion method.
const TMol & GetMol(void) const
Get the Mol member data.
TSeqid GetSeqid(void) const
Get the Seqid member data.
E_Choice
Choice variants.
Definition: Seq_id_.hpp:93
const TGiim & GetGiim(void) const
Get the variant data.
Definition: Seq_id_.cpp:215
const TLocal & GetLocal(void) const
Get the variant data.
Definition: Seq_id_.cpp:193
bool IsGiim(void) const
Check if variant Giim is selected.
Definition: Seq_id_.hpp:835
bool IsLocal(void) const
Check if variant Local is selected.
Definition: Seq_id_.hpp:775
bool CanGetAccession(void) const
Check if it is safe to call GetAccession method.
const TGeneral & GetGeneral(void) const
Get the variant data.
Definition: Seq_id_.cpp:369
bool IsGi(void) const
Check if variant Gi is selected.
Definition: Seq_id_.hpp:883
const TPatent & GetPatent(void) const
Get the variant data.
Definition: Seq_id_.cpp:325
TGibbmt GetGibbmt(void) const
Get the variant data.
Definition: Seq_id_.hpp:814
bool IsGibbsq(void) const
Check if variant Gibbsq is selected.
Definition: Seq_id_.hpp:781
const TDb & GetDb(void) const
Get the Db member data.
const TAccession & GetAccession(void) const
Get the Accession member data.
const TSeq & GetSeq(void) const
Get the variant data.
Definition: Seq_entry_.cpp:102
const TDescr & GetDescr(void) const
Get the Descr member data.
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
const TSet & GetSet(void) const
Get the variant data.
Definition: Seq_entry_.cpp:124
bool IsSeq(void) const
Check if variant Seq is selected.
Definition: Seq_entry_.hpp:257
bool IsSet(void) const
Check if variant Set is selected.
Definition: Seq_entry_.hpp:263
const TSeq_set & GetSeq_set(void) const
Get the Seq_set member data.
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
TSeq_set & SetSeq_set(void)
Assign a value to Seq_set data member.
const TIupacaa & GetIupacaa(void) const
Get the variant data.
Definition: Seq_data_.hpp:530
void ResetDescr(void)
Reset Descr data member.
Definition: Bioseq_.cpp:60
bool IsSetSeq_data(void) const
The sequence. Check if a value has been assigned to Seq_data data member.
Definition: Seq_inst_.hpp:805
bool IsNcbieaa(void) const
Check if variant Ncbieaa is selected.
Definition: Seq_data_.hpp:644
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
TTitle & SetTitle(void)
Select the variant.
Definition: Seqdesc_.hpp:1039
bool IsIupacaa(void) const
Check if variant Iupacaa is selected.
Definition: Seq_data_.hpp:524
bool IsNcbistdaa(void) const
Check if variant Ncbistdaa is selected.
Definition: Seq_data_.hpp:684
bool IsSetAnnot(void) const
Check if a value has been assigned to Annot data member.
Definition: Bioseq_.hpp:354
TPdb & SetPdb(void)
Select the variant.
Definition: Seqdesc_.cpp:544
const TAnnot & GetAnnot(void) const
Get the Annot member data.
Definition: Bioseq_.hpp:366
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:290
void ResetAnnot(void)
Reset Annot data member.
Definition: Bioseq_.cpp:91
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
TLength GetLength(void) const
Get the Length member data.
Definition: Seq_inst_.hpp:659
list< CRef< CSeq_id > > TId
Definition: Bioseq_.hpp:94
const TNcbieaa & GetNcbieaa(void) const
Get the variant data.
Definition: Seq_data_.hpp:650
TComment & SetComment(void)
Select the variant.
Definition: Seqdesc_.hpp:1065
const TNcbistdaa & GetNcbistdaa(void) const
Get the variant data.
Definition: Seq_data_.hpp:690
bool IsSetLength(void) const
Length of sequence in residues. Check if a value has been assigned to Length data member.
Definition: Seq_inst_.hpp:640
bool IsSetDescr(void) const
Descriptors. Check if a value has been assigned to Descr data member.
Definition: Bioseq_.hpp:303
bool IsSet(void) const
Check if a value has been assigned to data member.
Definition: Seq_descr_.hpp:154
void SetDescr(TDescr &value)
Assign a value to Descr data member.
Definition: Bioseq_.cpp:65
Tdata & Set(void)
Assign a value to data member.
Definition: Seq_descr_.hpp:172
const TSeq_data & GetSeq_data(void) const
Get the Seq_data member data.
Definition: Seq_inst_.hpp:817
const TDescr & GetDescr(void) const
Get the Descr member data.
Definition: Bioseq_.hpp:315
bool CanGetCompound(void) const
Check if it is safe to call GetCompound method.
Definition: PDB_block_.hpp:441
const TCompound & GetCompound(void) const
Get the Compound member data.
Definition: PDB_block_.hpp:447
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
use only Cassandra database for the lookups; yes: do not use tables BIOSEQ_INFO and BLOB_PROP in the Cassandra database
int i
int len
static MDB_envinfo info
Definition: mdb_load.c:37
const string version
version string
Definition: variables.hpp:66
else result
Definition: token2.c:20
#define const
Definition: zconf.h:232
Modified on Fri Sep 20 14:57:20 2024 by modify_doxy.py rev. 669887