NCBI C++ ToolKit
seq_loc_util.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef SEQ_LOC_UTIL__HPP
2 #define SEQ_LOC_UTIL__HPP
3 
4 /* $Id: seq_loc_util.hpp 90173 2020-05-19 12:49:06Z grichenk $
5 * ===========================================================================
6 *
7 * PUBLIC DOMAIN NOTICE
8 * National Center for Biotechnology Information
9 *
10 * This software/database is a "United States Government Work" under the
11 * terms of the United States Copyright Act. It was written as part of
12 * the author's official duties as a United States Government employee and
13 * thus cannot be copyrighted. This software/database is freely available
14 * to the public for use. The National Library of Medicine and the U.S.
15 * Government have not placed any restriction on its use or reproduction.
16 *
17 * Although all reasonable efforts have been taken to ensure the accuracy
18 * and reliability of the software and data, the NLM and the U.S.
19 * Government do not and cannot warrant the performance or results that
20 * may be obtained by using this software or data. The NLM and the U.S.
21 * Government disclaim all warranties, express or implied, including
22 * warranties of performance, merchantability or fitness for any particular
23 * purpose.
24 *
25 * Please cite the author in any work or product based on this material.
26 *
27 * ===========================================================================
28 *
29 * Author: Clifford Clausen, Aaron Ucko, Aleksey Grichenko
30 *
31 * File Description:
32 * Seq-loc utilities requiring CScope
33 */
34 
35 #include <corelib/ncbistd.hpp>
36 #include <corelib/ncbiobj.hpp>
39 #include <objmgr/scope.hpp>
40 
41 
44 
45 // Forward declarations
46 class CSeq_loc;
47 class CSeq_id_Handle;
48 class CSeq_id;
49 class CBioseq_Handle;
50 
51 BEGIN_SCOPE(sequence)
52 
53 
54 /** @addtogroup ObjUtilSeqLoc
55  *
56  * @{
57  */
58 
59 
60 /** @name Basic information
61  * Basic seq-loc information and verification
62  * @{
63  */
64 
65 /// Get sequence length if scope not null, else return max possible TSeqPos
67 TSeqPos GetLength(const CSeq_id& id, CScope* scope);
68 
69 /// Get length of sequence represented by CSeq_loc, if possible
71 TSeqPos GetLength(const CSeq_loc& loc, CScope* scope);
72 
73 /// Get number of unique bases in the location
75 TSeqPos GetCoverage(const CSeq_loc& loc, CScope* scope);
76 
77 /// Get length of CSeq_loc_mix == sum (length of embedded CSeq_locs)
79 TSeqPos GetLength(const CSeq_loc_mix& mix, CScope* scope);
80 
81 /// Checks that point >= 0 and point < length of Bioseq
83 bool IsValid(const CSeq_point& pt, CScope* scope);
84 
85 /// Checks that all points >=0 and < length of CBioseq. If scope is 0
86 /// assumes length of CBioseq is max value of TSeqPos.
88 bool IsValid(const CPacked_seqpnt& pts, CScope* scope);
89 
90 /// Checks from and to of CSeq_interval. If from < 0, from > to, or
91 /// to >= length of CBioseq this is an interval for, returns false, else true.
93 bool IsValid(const CSeq_interval& interval, CScope* scope);
94 
95 /// Determines if two CSeq_ids represent the same CBioseq
97 bool IsSameBioseq(const CSeq_id& id1, const CSeq_id& id2, CScope* scope,
100 bool IsSameBioseq(const CSeq_id_Handle& id1, const CSeq_id_Handle& id2, CScope* scope,
102 
103 /// Returns true if all embedded CSeq_ids represent the same CBioseq, else false
105 bool IsOneBioseq(const CSeq_loc& loc, CScope* scope);
106 
107 /// If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns
108 /// the first CSeq_id found, else throws CObjmgrUtilException exception.
110 const CSeq_id& GetId(const CSeq_loc& loc, CScope* scope);
112 CSeq_id_Handle GetIdHandle(const CSeq_loc& loc, CScope* scope);
113 
114 
115 /// Returns eNa_strand_unknown if multiple Bioseqs in loc
116 /// Returns eNa_strand_other if multiple strands in same loc
117 /// Returns eNa_strand_both if loc is a Whole
118 /// Returns strand otherwise
120 ENa_strand GetStrand(const CSeq_loc& loc, CScope* scope = 0);
121 
122 /// If only one CBioseq is represented by CSeq_loc, returns the position at the
123 /// start of the location. By default this is the lowest residue position
124 /// represented by the location.
125 /// If not null, scope is used to determine if two
126 /// CSeq_ids represent the same CBioseq. Throws CObjmgrUtilException if
127 /// CSeq_loc does not represent one CBioseq.
129 TSeqPos GetStart(const CSeq_loc& loc, CScope* scope,
131 
132 /// If only one CBioseq is represented by CSeq_loc, returns the position at the
133 /// stop of the location. By default this is the highest residue position
134 /// represented by the location.
135 /// If not null, scope is used to determine if two
136 /// CSeq_ids represent the same CBioseq. Throws CObjmgrUtilException exception
137 /// if CSeq_loc does not represent one CBioseq.
139 TSeqPos GetStop(const CSeq_loc& loc, CScope* scope,
141 
142 
143 /// SeqLocCheck results
148 };
149 
150 /// Checks that a CSeq_loc is all on one strand on one CBioseq. For embedded
151 /// points, checks that the point location is <= length of sequence of point.
152 /// For packed points, checks that all points are within length of sequence.
153 /// For intervals, ensures from <= to and interval is within length of sequence.
154 /// If no mixed strands and lengths are valid, returns eSeqLocCheck_ok. If
155 /// only mixed strands/CBioseq error, then returns eSeqLocCheck_warning. If
156 /// length error, then returns eSeqLocCheck_error.
158 ESeqLocCheck SeqLocCheck(const CSeq_loc& loc, CScope* scope);
159 
160 /// Returns true if the order of Seq_locs is bad, otherwise, false
162 bool BadSeqLocSortOrder(const CBioseq_Handle& bsh,
163  const CSeq_loc& loc);
165 bool BadSeqLocSortOrder(const CBioseq& seq,
166  const CSeq_loc& loc,
167  CScope* scope);
168 
169 /* @} */
170 
171 
172 /** @name Compare
173  * Containment relationships between CSeq_locs
174  * @{
175  */
176 
177 enum ECompare {
178  eNoOverlap = 0, ///< CSeq_locs do not overlap or abut
179  eContained, ///< First CSeq_loc contained by second
180  eContains, ///< First CSeq_loc contains second
181  eSame, ///< CSeq_locs contain each other
182  eOverlap, ///< CSeq_locs overlap
183 
184  /// Abutting seq-locs. The flag can be returned only by the 4-argument
185  /// version of Compare() if fCompareAbutting flag is set in the
186  /// TCompareFlags argument.
188  /// Seq-locs do both abut and overlap. The flag can be returned only by
189  /// the 4-argument version of Compare() if fCompareAbutting flag is set
190  /// in the TCompareFlags argument.
192 };
193 
194 /// Returns the sequence::ECompare containment relationship between CSeq_locs.
195 /// For backward compatibility the function does not check for abutting
196 /// seq-locs and never returns eAbutting or eAbutAndOverlap.
197 /// @deprecated Use the new Compare() taking the additional TCompareFlags arg.
201  const CSeq_loc& loc2,
202  CScope* scope);
203 
204 /* @} */
205 
206 
207 /** @name Compare
208  * Check if the two CSeq_locs are abutting
209  * @{
210  */
211 
213  /// Check if seq-locs are abutting (loc2 follows loc1)
215  /// Check if seq-locs are overlapping
218 
219  /// Use positional coordinates (ignore strands) when looking for
220  /// abutting locations.
221  fComparePositional = 1 << 2
222 };
223 typedef int TCompareFlags;
224 
225 /// Compare the seq-locs. Depending on the selected flags the function
226 /// checks if the seq-locs are abutting (loc2 immediately follows loc1),
227 /// overlap (see ECompare) or both. Unless fComparePositional flag is set,
228 /// locations are compared using biological order of ranges: if locations
229 /// are on minus strand, to be abutting the first location must begin
230 /// right after the second location's end.
233  const CSeq_loc& loc2,
234  CScope* scope,
236 
237 /* @} */
238 /** @name Change id
239  * Replace seq-id with the best or worst rank
240  * @{
241  */
242 
243 /// Change a CSeq_id to the one for the CBioseq that it represents
244 /// that has the best rank or worst rank according to the value of best.
245 /// Just returns if scope == 0
247 void ChangeSeqId(CSeq_id* id, bool best, CScope* scope);
248 
249 /// Change each of the CSeq_ids embedded in a CSeq_loc to the best
250 /// or worst CSeq_id according to the value of best. Just returns if
251 /// scope == 0
253 void ChangeSeqLocId(CSeq_loc* loc, bool best, CScope* scope);
254 
255 /* @} */
256 
257 
258 /** @name Overlapping
259  * Overlapping of seq-locs
260  * @{
261  */
262 
264  /// For positive-orientation strands, start = left and end = right;
265  /// for reverse-orientation strands, start = right and end = left.
266  eOffset_FromStart, ///< relative to beginning of location
267  eOffset_FromEnd, ///< relative to end of location
268  eOffset_FromLeft, ///< relative to low-numbered end
269  eOffset_FromRight ///< relative to high-numbered end
270 };
271 
272 /// returns (TSeqPos)-1 if the locations don't overlap
274 TSeqPos LocationOffset(const CSeq_loc& outer, const CSeq_loc& inner,
275  EOffsetType how = eOffset_FromStart, CScope* scope = 0);
276 
278  eOverlap_Simple, ///< any overlap of extremes
279  eOverlap_Contained, ///< 2nd contained within 1st extremes
280  eOverlap_Contains, ///< 2nd contains 1st extremes
281  eOverlap_Subset, ///< 2nd is a subset of 1st ranges
282  eOverlap_SubsetRev, ///< 1st is a subset of 2nd ranges
283  eOverlap_CheckIntervals, ///< 2nd is a subset of 1st with matching boundaries
284  eOverlap_CheckIntRev, ///< 1st is a subset of 2nd with matching boundaries
285  eOverlap_Interval ///< at least one pair of intervals must overlap
286 };
287 
288 /// 64-bit version of TestForOverlap()
289 /// Check if the two locations have overlap of a given type.
290 /// Return quality of the overlap: lower values mean better overlapping.
291 /// 0 = exact match of the ranges, -1 = no overlap.
293 Int8 TestForOverlap64(const CSeq_loc& loc1,
294  const CSeq_loc& loc2,
296  TSeqPos circular_len = kInvalidSeqPos,
297  CScope* scope = 0);
298 
299 /// Flags, controlling behavior of TestForOverlapEx().
301  fOverlap_NoMultiSeq = 1 << 0, ///< Throw if locations reference multiple bioseqs
302  fOverlap_NoMultiStrand = 1 << 1, ///< Throw if locations reference multiple strands
303  fOverlap_IgnoreTopology = 1 << 2, ///< Ignore sequence topology (circularity)
304  fOverlap_Default = 0 ///< Enable multi-id, multi-strand, check topology
305 };
306 typedef int TOverlapFlags;
307 
308 /// Updated version of TestForOverlap64(). Allows more control over
309 /// handling multi-id/multi-strand bioseqs.
310 /// Return quality of the overlap: lower values mean better overlapping.
311 /// 0 = exact match of the ranges, -1 = no overlap.
313 Int8 TestForOverlapEx(const CSeq_loc& loc1,
314  const CSeq_loc& loc2,
316  CScope* scope = 0,
318 
319 /// Calls TestForOverlap64() and if the result is greater than kMax_Int
320 /// truncates it to kMax_Int. To get the exact value use TestForOverlap64().
322 int TestForOverlap(const CSeq_loc& loc1,
323  const CSeq_loc& loc2,
325  TSeqPos circular_len = kInvalidSeqPos,
326  CScope* scope = 0);
327 
328 /* @} */
329 
330 
331 /** @name PartialCheck
332  * Sets bits for incomplete location and/or errors
333  * @{
334  */
335 
347 };
348 
350 int SeqLocPartialCheck(const CSeq_loc& loc, CScope* scope);
351 
352 /* @} */
353 
354 /// Get reverse complement of the seq-loc (?)
356 CSeq_loc* SeqLocRevCmpl(const CSeq_loc& loc, CScope* scope);
357 
358 /// Old name for SeqLocRevCmpl(). It is now just an inline wrapper that
359 /// forwards loc and scope to SeqLocRevCmpl() and returns its result unchanged.
/// The original note says removal is planned in the future - presumably of
/// this old-name wrapper; prefer calling SeqLocRevCmpl() directly.
361 inline
362 CSeq_loc* SeqLocRevCmp(const CSeq_loc& loc, CScope* scope)
363 {
364  return SeqLocRevCmpl(loc, scope);
365 }
366 
367 /** @name Operations
368  * Seq-loc operations
369  * All operations create and return a new seq-loc object.
370  * Optional scope or synonym mapper may be provided to detect and convert
371  * synonyms of a bioseq.
372  * @{
373  */
374 
375 /// Merge ranges in the seq-loc
379  CScope* scope);
380 
381 /// Merge multiple locations.
/// Every element of locs is added to one temporary CSeq_loc, which is then
/// handed to Seq_loc_Merge() together with flags and scope.
/// Elements are dereferenced twice (**it), so TSeq_loc_Set is expected to
/// hold pointer-like handles to CSeq_loc objects.
/// NOTE(review): Seq_loc_Merge() is listed as returning CRef< CSeq_loc > while
/// this function returns a raw CSeq_loc* - confirm who owns the returned
/// object and how long it stays valid.
382 template<typename TSeq_loc_Set>
383 CSeq_loc* Seq_locs_Merge(TSeq_loc_Set& locs,
385  CScope* scope)
386 {
387  // Collect all input locations into a single temporary Seq-loc
388  CSeq_loc temp;
389  ITERATE(typename TSeq_loc_Set, it, locs) {
390  temp.Add(**it);
391  }
392  return Seq_loc_Merge(temp, flags, scope);
393 }
394 
395 /// Add two seq-locs
398  const CSeq_loc& loc2,
400  CScope* scope);
401 
402 /// Subtract the second seq-loc from the first one
405  const CSeq_loc& loc2,
407  CScope* scope);
408 
409 /* @} */
410 
411 
412 END_SCOPE(sequence)
415 
416 #endif /* SEQ_LOC_UTIL__HPP */
ESeqLocExtremes
Used to determine the meaning of a location's Start/Stop positions.
Definition: Na_strand.hpp:61
@ eExtreme_Positional
numerical value
Definition: Na_strand.hpp:63
CBioseq_Handle –.
CScope –.
Definition: scope.hpp:92
Include a standard set of the NCBI C++ Toolkit most basic headers.
static uch flags
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
const TSeqPos kInvalidSeqPos
Define special value for invalid sequence position.
Definition: ncbimisc.hpp:878
void Add(const CSeq_loc &other)
Simple adding of seq-locs.
Definition: Seq_loc.cpp:3875
int TOpFlags
Definition: Seq_loc.hpp:336
ESeqlocPartial
FCompareFlags
TSeqPos GetStop(const CSeq_loc &loc, CScope *scope, ESeqLocExtremes ext=eExtreme_Positional)
If only one CBioseq is represented by CSeq_loc, returns the position at the stop of the location.
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
bool BadSeqLocSortOrder(const CBioseq_Handle &bsh, const CSeq_loc &loc)
Returns true if the order of Seq_locs is bad, otherwise, false.
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
ESeqLocCheck
SeqLocCheck results.
EOverlapType
ENa_strand GetStrand(const CSeq_loc &loc, CScope *scope=0)
Returns eNa_strand_unknown if multiple Bioseqs in loc Returns eNa_strand_other if multiple strands in...
bool IsValid(const CSeq_point &pt, CScope *scope)
Checks that point >= 0 and point < length of Bioseq.
Int8 TestForOverlapEx(const CSeq_loc &loc1, const CSeq_loc &loc2, EOverlapType type, CScope *scope=0, TOverlapFlags flags=fOverlap_Default)
Updated version of TestForOverlap64().
int TOverlapFlags
TSeqPos LocationOffset(const CSeq_loc &outer, const CSeq_loc &inner, EOffsetType how=eOffset_FromStart, CScope *scope=0)
returns (TSeqPos)-1 if the locations don't overlap
int TCompareFlags
void ChangeSeqLocId(CSeq_loc *loc, bool best, CScope *scope)
Change each of the CSeq_ids embedded in a CSeq_loc to the best or worst CSeq_id according to the value...
TSeqPos GetStart(const CSeq_loc &loc, CScope *scope, ESeqLocExtremes ext=eExtreme_Positional)
If only one CBioseq is represented by CSeq_loc, returns the position at the start of the location.
int SeqLocPartialCheck(const CSeq_loc &loc, CScope *scope)
sequence::ECompare Compare(const CSeq_loc &loc1, const CSeq_loc &loc2, CScope *scope)
Returns the sequence::ECompare containment relationship between CSeq_locs.
TSeqPos GetCoverage(const CSeq_loc &loc, CScope *scope)
Get number of unique bases in the location.
CSeq_loc * SeqLocRevCmp(const CSeq_loc &loc, CScope *scope)
Old name for this function.
CRef< CSeq_loc > Seq_loc_Subtract(const CSeq_loc &loc1, const CSeq_loc &loc2, CSeq_loc::TOpFlags flags, CScope *scope)
Subtract the second seq-loc from the first one.
Int8 TestForOverlap64(const CSeq_loc &loc1, const CSeq_loc &loc2, EOverlapType type, TSeqPos circular_len=kInvalidSeqPos, CScope *scope=0)
64-bit version of TestForOverlap(). Check if the two locations have overlap of a given type.
CSeq_id_Handle GetIdHandle(const CSeq_loc &loc, CScope *scope)
void ChangeSeqId(CSeq_id *id, bool best, CScope *scope)
Change a CSeq_id to the one for the CBioseq that it represents that has the best rank or worst rank a...
CRef< CSeq_loc > Seq_loc_Merge(const CSeq_loc &loc, CSeq_loc::TOpFlags flags, CScope *scope)
Merge ranges in the seq-loc.
bool IsOneBioseq(const CSeq_loc &loc, CScope *scope)
Returns true if all embedded CSeq_ids represent the same CBioseq, else false.
CSeq_loc * Seq_locs_Merge(TSeq_loc_Set &locs, CSeq_loc::TOpFlags flags, CScope *scope)
Merge multiple locations.
ECompare
CRef< CSeq_loc > Seq_loc_Add(const CSeq_loc &loc1, const CSeq_loc &loc2, CSeq_loc::TOpFlags flags, CScope *scope)
Add two seq-locs.
int TestForOverlap(const CSeq_loc &loc1, const CSeq_loc &loc2, EOverlapType type, TSeqPos circular_len=kInvalidSeqPos, CScope *scope=0)
Calls TestForOverlap64() and if the result is greater than kMax_Int truncates it to kMax_Int.
bool IsSameBioseq(const CSeq_id &id1, const CSeq_id &id2, CScope *scope, CScope::EGetBioseqFlag get_flag=CScope::eGetBioseq_All)
Determines if two CSeq_ids represent the same CBioseq.
EOverlapFlags
Flags, controlling behavior of TestForOverlapEx().
EOffsetType
CSeq_loc * SeqLocRevCmpl(const CSeq_loc &loc, CScope *scope)
Get reverse complement of the seq-loc (?)
ESeqLocCheck SeqLocCheck(const CSeq_loc &loc, CScope *scope)
Checks that a CSeq_loc is all on one strand on one CBioseq.
@ eSeqlocPartial_Nostart
@ eSeqlocPartial_Haderror
@ eSeqlocPartial_Nostop
@ eSeqlocPartial_Internal
@ eSeqlocPartial_Nointernal
@ eSeqlocPartial_Other
@ eSeqlocPartial_Complete
@ eSeqlocPartial_Stop
@ eSeqlocPartial_Limwrong
@ eSeqlocPartial_Start
@ fCompareAbutting
Check if seq-locs are abutting (loc2 follows loc1)
@ fCompareAll
@ fCompareOverlapping
Check if seq-locs are overlapping.
@ fComparePositional
Use positional coordinates (ignore strands) when looking for abutting locations.
@ eSeqLocCheck_ok
@ eSeqLocCheck_error
@ eSeqLocCheck_warning
@ eOverlap_SubsetRev
1st is a subset of 2nd ranges
@ eOverlap_CheckIntervals
2nd is a subset of 1st with matching boundaries
@ eOverlap_Contains
2nd contains 1st extremes
@ eOverlap_CheckIntRev
1st is a subset of 2nd with matching boundaries
@ eOverlap_Simple
any overlap of extremes
@ eOverlap_Interval
at least one pair of intervals must overlap
@ eOverlap_Contained
2nd contained within 1st extremes
@ eOverlap_Subset
2nd is a subset of 1st ranges
@ eContains
First CSeq_loc contains second.
@ eOverlap
CSeq_locs overlap.
@ eSame
CSeq_locs contain each other.
@ eAbutting
Abutting seq-locs.
@ eContained
First CSeq_loc contained by second.
@ eAbutAndOverlap
Seq-locs do both abut and overlap.
@ eNoOverlap
CSeq_locs do not overlap or abut.
@ fOverlap_NoMultiSeq
Throw if locations reference multiple bioseqs.
@ fOverlap_NoMultiStrand
Throw if locations reference multiple strands.
@ fOverlap_IgnoreTopology
Ignore sequence topology (circularity)
@ fOverlap_Default
Enable multi-id, multi-strand, check topology.
@ eOffset_FromLeft
relative to low-numbered end
@ eOffset_FromRight
relative to high-numbered end
@ eOffset_FromEnd
relative to end of location
@ eOffset_FromStart
For positive-orientation strands, start = left and end = right; for reverse-orientation strands,...
EGetBioseqFlag
Definition: scope.hpp:125
@ eGetBioseq_All
Search bioseq, load if not loaded yet.
Definition: scope.hpp:128
#define NCBI_DEPRECATED
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
#define NCBI_XOBJUTIL_EXPORT
Definition: ncbi_export.h:1339
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
Portable reference counted smart and weak pointers using CWeakRef, CRef, CObject and CObjectEx.
Definition: type.c:6
Modified on Sun Apr 21 03:43:50 2024 by modify_doxy.py rev. 669887