NCBI C++ ToolKit
cuSequence.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: cuSequence.hpp 95643 2021-12-03 16:24:20Z lanczyck $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Adapted from CDTree1 code by Chris Lanczycki
27  *
28  * File Description:
29  *
30  * Functions for manipulating Bioseqs and other sequence representations
31  *
32  * ===========================================================================
33  */
34 
35 #ifndef CU_SEQUENCE_HPP
36 #define CU_SEQUENCE_HPP
37 
38 // include ncbistd.hpp, ncbiobj.hpp, ncbi_limits.h, various stl containers
39 #include <corelib/ncbiargs.hpp>
40 #include <corelib/ncbienv.hpp>
41 #include <corelib/ncbistre.hpp>
42 #include <objects/seq/Bioseq.hpp>
46 
49 
50 BEGIN_SCOPE(cd_utils)
51 
52 // the taxid for environmental sequences
54 
55 // Wraps the CSeq_id.Match(id) method: id1.Match(id2).
57 bool SeqIdsMatch(const CRef< CSeq_id>& id1, const CRef< CSeq_id>& id2);
58 
59 // Does the CSeq_id match any CSeq_id in the CBioseq? (Uses SeqIdsMatch above.)
62 
63 // Return 0 if Seq_id is not of proper type (e_General and database 'CDD')
66 
67 // Return -1 on failure; was FindMMDBIdInBioseq
69 int GetMMDBId (const CBioseq& bioseq);
70 
71 // Consistent w/ CTaxon1 class, return 0 if no tax id was found,
72 // or -(firstTaxId) if multiple tax ids found.
75 
77 bool IsEnvironmentalSeq(const CBioseq& bioseq);
78 
79 // Return species description as a string.
80 // Empty string returned on failure; was CCd::GetSpecies(...).
82 string GetSpeciesFromBioseq(const CBioseq& bioseq);
83 
84 // Returns a length of 0 if an error condition is detected.
85 // Incorporates code from cdt_vutils & cdt_manipcd
87 int GetSeqLength(const CBioseq& bioseq);
89 bool GetSeqLength(const CRef< CSeq_entry >& seqEntry, int& len);
90 
92 void NcbistdaaToNcbieaaString(const vector< char >& vec, string* str); // StringFromStdaa(...)
93 // Return false if there was an exception trying to convert the input string.
94 // Returns true otherwise, including for the case of an empty input string.
96 bool NcbieaaToNcbistdaaString(const std::string& str, vector < char >& vec);
98 bool GetNcbieaaString(const CBioseq& bioseq, string& str);
100 bool GetNcbistdSeq(const CBioseq& bioseq, vector<char>& seqData);
102 bool GetNcbieaaString(const CRef< CSeq_entry >& seqEntry, string& str); // from cdt_manipcd
104 string GetRawSequenceString(const CBioseq& bioseq);
105 
106 // On failure, returns \0 (i.e., null character)
107 // If zeroBasedPos == true, the first residue is at index 0; otherwise residues are numbered from 1.
109 char GetResidueAtPosition(const CBioseq& bioseq, int pos, bool zeroBasedPos = true);
111 char GetResidueAtPosition(const CRef< CSeq_entry >& seqEntry, int pos, bool zeroBasedPos = true);
112 
114 bool IsConsensus(const CRef< CSeq_id >& seqId);
116 bool GetAccAndVersion(const CRef< CBioseq > bioseq, string& acc, int& version, CRef< CSeq_id>& seqId);
118 bool GetPDBBlockFromSeqEntry(CRef< CSeq_entry > seqEntry, CRef< CPDB_block >& pdbBlock);
119 
120 // Return 'false' if the bioseq doesn't have a gi-typed seq-id.
121 // Last arg tells which id to use if there are multiple gis.
123 bool ExtractGi(const CRef<CBioseq>& bioseq, TGi& gi, unsigned int nth = 1);
125 bool CopyGiSeqId(const CRef<CBioseq>& bioseq, CRef<CSeq_id>& giSeqId, unsigned int nth = 1);
126 
127 // Return 'false' if the bioseq doesn't have a pdb-typed seq-id.
128 // Last arg tells which id to use if there are multiple pdbs.
130 bool ExtractPdbMolChain(const CRef<CBioseq>& bioseq, string& pdbMol, string& pdbChain, unsigned int nth = 1);
132 bool CopyPdbSeqId(const CRef<CBioseq>& bioseq, CRef<CSeq_id>& pdbSeqId, unsigned int nth = 1);
133 
134 // Returns true iff at least one id of the requested type is found.
136 bool HasSeqIdOfType(const CBioseq& bioseq, CSeq_id::E_Choice choice);
138 bool HasSeqIdOfType(const CRef< CSeq_entry >& seqEntry, CSeq_id::E_Choice choice);
139 
140 // Returns number of ids of the requested type found.
141 // Returned CSeq_id objects are copies of those found in the bioseq/seqEntry.
143 unsigned int CopySeqIdsOfType(const CBioseq& bioseq, CSeq_id::E_Choice choice, list< CRef< CSeq_id > >& idsOfType);
145 unsigned int CopySeqIdsOfType(const CRef< CSeq_entry >& seqEntry, CSeq_id::E_Choice choice, list< CRef< CSeq_id > >& idsOfType);
146 
147 // Return 'false' if the seqEntry doesn't have a bioseq containing a seq-id of the requested type.
148 // Returned CBioseq object is a copy of that found in the bioseq/seqEntry.
150 bool CopyBioseqWithType(const CRef< CSeq_entry >& seqEntry, CSeq_id::E_Choice choice, CRef< CBioseq >& seqEntryBioseq) ;
151 
152 // Return 'false' if the seqEntry doesn't have a bioseq containing a seq-id of the requested type.
153 // Returned CBioseq object is an editable reference to the one in the CSeq_entry passed in.
155 bool GetBioseqWithType(CRef< CSeq_entry >& seqEntry, CSeq_id::E_Choice choice, CRef< CBioseq >& seqEntryBioseq) ;
156 
157 // Return 'false' if the comment was not added. Empty comment strings are not added.
159 bool AddCommentToBioseq(CBioseq& bioseq, const string& comment);
160 
161 // Simplify the CBioseq object to strip out elements not needed in a CD.
162 // Keep any comment-type CSeqdesc that matches a string in 'keptComments',
163 // and keep the CPDB_block for PDB CSeqdesc if 'keepPDBBlock' is true.
164 // Initially used for simplifying CBioseqs in CSeq_entry blobs retrieved from ID1.
166 void SimplifyBioseqForCD(CBioseq& bioseq, const vector<string>& keptComments, bool keepPDBBlock);
167 
168 // Simplify the CBioseq objects in a CSeq_entry to strip out elements not needed in a CD.
169 // Wrapper for SimplifyBioseqForCD.
171 void SimplifySeqEntryForCD(CRef< CSeq_entry >& seqEntry, const vector<string>& keptComments, bool keepPDBBlock);
172 
173 // The first two are wrappers for the third function, which extracts a database source or accession
174 // for any Seq-id type.
176 string GetDbSourceForSeqId(const CRef< CSeq_id >& seqID); // gets the most exact source
178 string GetAccessionForSeqId(const CRef< CSeq_id >& seqID);
179 
180 // If the 'getGenericSource' flag is true, only the generic type of the database source is reported;
181 // when false, a more exact dbSource is returned, where possible: relevant primarily when dealing
182 // with a refseq.
184 void GetAccessionAndDatabaseSource(const CRef< CSeq_id >& seqID, string& accession, string& dbSource, bool getGenericSource = true);
185 
187 {
188  string acession;
189  int version;
190  string defline;
191  short dbsource;
192 };
193 
194 //return false if no accession is found
196 bool extractBioseqInfo(const CRef< CBioseq > bioseq, BioseqInfo&);
197 
198 END_SCOPE(cd_utils) // namespace ncbi::objects::
199 
201 
202 
203 #endif // CU_SEQUENCE_HPP
User-defined methods of the data storage class.
CPDB_block –.
Definition: PDB_block.hpp:66
CRef –.
Definition: ncbiobj.hpp:618
Definition: Seq_entry.hpp:56
USING_SCOPE(objects)
bool NcbieaaToNcbistdaaString(const std::string &str, vector< char > &vec)
Definition: cuSequence.cpp:272
bool SeqIdHasMatchInBioseq(const CRef< CSeq_id > &id, const CBioseq &bioseq)
Definition: cuSequence.cpp:80
bool CopyPdbSeqId(const CRef< CBioseq > &bioseq, CRef< CSeq_id > &pdbSeqId, unsigned int nth=1)
Definition: cuSequence.cpp:522
bool CopyGiSeqId(const CRef< CBioseq > &bioseq, CRef< CSeq_id > &giSeqId, unsigned int nth=1)
Definition: cuSequence.cpp:487
bool SeqIdsMatch(const CRef< CSeq_id > &id1, const CRef< CSeq_id > &id2)
Definition: cuSequence.cpp:70
TTaxId GetTaxIdInBioseq(const CBioseq &bioseq)
Definition: cuSequence.cpp:139
bool GetNcbistdSeq(const CBioseq &bioseq, vector< char > &seqData)
Definition: cuSequence.cpp:315
bool CopyBioseqWithType(const CRef< CSeq_entry > &seqEntry, CSeq_id::E_Choice choice, CRef< CBioseq > &seqEntryBioseq)
Definition: cuSequence.cpp:640
bool ExtractGi(const CRef< CBioseq > &bioseq, TGi &gi, unsigned int nth=1)
Definition: cuSequence.cpp:508
char GetResidueAtPosition(const CBioseq &bioseq, int pos, bool zeroBasedPos=true)
Definition: cuSequence.cpp:389
void GetAccessionAndDatabaseSource(const CRef< CSeq_id > &seqID, string &accession, string &dbSource, bool getGenericSource=true)
Definition: cuSequence.cpp:811
int GetSeqLength(const CBioseq &bioseq)
Definition: cuSequence.cpp:216
bool GetBioseqWithType(CRef< CSeq_entry > &seqEntry, CSeq_id::E_Choice choice, CRef< CBioseq > &seqEntryBioseq)
Definition: cuSequence.cpp:672
void SimplifySeqEntryForCD(CRef< CSeq_entry > &seqEntry, const vector< string > &keptComments, bool keepPDBBlock)
Definition: cuSequence.cpp:778
bool AddCommentToBioseq(CBioseq &bioseq, const string &comment)
Definition: cuSequence.cpp:704
string GetSpeciesFromBioseq(const CBioseq &bioseq)
Definition: cuSequence.cpp:190
void SimplifyBioseqForCD(CBioseq &bioseq, const vector< string > &keptComments, bool keepPDBBlock)
Definition: cuSequence.cpp:718
void NcbistdaaToNcbieaaString(const vector< char > &vec, string *str)
bool ExtractPdbMolChain(const CRef< CBioseq > &bioseq, string &pdbMol, string &pdbChain, unsigned int nth=1)
Definition: cuSequence.cpp:544
bool GetPDBBlockFromSeqEntry(CRef< CSeq_entry > seqEntry, CRef< CPDB_block > &pdbBlock)
Definition: cuSequence.cpp:446
string GetAccessionForSeqId(const CRef< CSeq_id > &seqID)
Definition: cuSequence.cpp:803
string GetDbSourceForSeqId(const CRef< CSeq_id > &seqID)
Definition: cuSequence.cpp:796
int GetCDDPssmIdFromSeqId(const CRef< CSeq_id > &id)
Definition: cuSequence.cpp:97
bool IsEnvironmentalSeq(const CBioseq &bioseq)
Definition: cuSequence.cpp:184
string GetRawSequenceString(const CBioseq &bioseq)
Definition: cuSequence.cpp:349
bool GetNcbieaaString(const CBioseq &bioseq, string &str)
Definition: cuSequence.cpp:298
bool extractBioseqInfo(const CRef< CBioseq > bioseq, BioseqInfo &)
Definition: cuSequence.cpp:889
unsigned int CopySeqIdsOfType(const CBioseq &bioseq, CSeq_id::E_Choice choice, list< CRef< CSeq_id > > &idsOfType)
Definition: cuSequence.cpp:595
bool HasSeqIdOfType(const CBioseq &bioseq, CSeq_id::E_Choice choice)
Definition: cuSequence.cpp:559
int GetMMDBId(const CBioseq &bioseq)
Definition: cuSequence.cpp:112
const TTaxId ENVIRONMENTAL_SEQUENCE_TAX_ID
Definition: cuSequence.hpp:53
bool IsConsensus(const CRef< CSeq_id > &seqId)
Definition: cuSequence.cpp:405
bool GetAccAndVersion(const CRef< CBioseq > bioseq, string &acc, int &version, CRef< CSeq_id > &seqId)
Definition: cuSequence.cpp:420
#define true
Definition: bool.h:35
static const char * str(char *buf, int n)
Definition: stats.c:84
#define TAX_ID_CONST(id)
Definition: ncbimisc.hpp:1112
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
#define NCBI_CDUTILS_EXPORT
Definition: ncbi_export.h:376
int len
static int version
Definition: mdb_load.c:29
Defines command line argument related classes.
Defines unified interface to application:
NCBI C++ stream class wrappers for triggering between "new" and "old" C++ stream libraries.
short dbsource
Definition: cuSequence.hpp:191
string acession
Definition: cuSequence.hpp:188
string defline
Definition: cuSequence.hpp:190
#define const
Definition: zconf.h:232
Modified on Mon Apr 22 04:02:15 2024 by modify_doxy.py rev. 669887