NCBI C++ ToolKit
gap_analysis.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: gap_analysis.cpp 70871 2016-01-27 18:57:01Z kornbluh $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:
27  * Michael Kornbluh, NCBI
28  *
29  * File Description:
30  * Given a Bioseq, etc. it returns analysis of the gap data.
31 
32  *
33  */
34 
35 #include <ncbi_pch.hpp>
36 
38 
39 #include <objmgr/seq_map_ci.hpp>
40 #include <objmgr/seq_vector.hpp>
41 
44 
45 CTempString CGapAnalysis::s_GapTypeToStr(
46  EGapType eGapType)
47 {
48  switch (eGapType)
49  {
50  case eGapType_All:
51  return "All Gaps";
52  case eGapType_SeqGap:
53  return "Seq Gaps";
54  case eGapType_UnknownBases:
55  return "Unknown Bases";
56  default:
57  return "UNKNOWN GAP TYPE";
58  }
59 }
60 
62  const CSeq_entry_Handle & entry_h,
63  CSeq_inst::EMol filter,
65  TAddFlag add_flags,
66  TFlag fFlags,
67  size_t max_resolve_count)
68 {
69  CBioseq_CI bioseq_ci(entry_h, filter, level);
70  for( ; bioseq_ci; ++bioseq_ci ) {
71  AddBioseqGaps(*bioseq_ci, add_flags, fFlags, max_resolve_count);
72  }
73 }
74 
76  const CBioseq_Handle & bioseq_h,
77  TAddFlag add_flags,
78  TFlag fFlags,
79  size_t max_resolve_count)
80 {
81  // get CSeq_id of CBioseq
82  TSeqIdConstRef pSeqId = bioseq_h.GetSeqId();
83  const TSeqPos bioseq_len = bioseq_h.GetBioseqLength();
84 
85  // fFlags control what we look at
86  CSeqMap::TFlags seq_map_flags = 0;
87  if( add_flags & fAddFlag_IncludeSeqGaps ) {
88  seq_map_flags |= CSeqMap::fFindGap;
89  }
90  if( add_flags & fAddFlag_IncludeUnknownBases ) {
91  seq_map_flags |= CSeqMap::fFindData;
92  }
93 
94  TSeqPos end_of_last_segment = 0; // exclusive
95  bool all_segments_and_in_order = true;
96 
97  SSeqMapSelector selector;
98  selector.SetFlags(seq_map_flags).SetResolveCount(max_resolve_count);
99  CSeqMap_CI seqmap_ci(bioseq_h, selector);
100  for( ; seqmap_ci; ++seqmap_ci ) {
101  if( seqmap_ci.GetPosition() != end_of_last_segment ) {
102  all_segments_and_in_order = false;
103  }
104  end_of_last_segment = seqmap_ci.GetEndPosition();
105 
106  CSeqMap::ESegmentType seg_type = seqmap_ci.GetType();
107  switch(seg_type) {
108  case CSeqMap::eSeqGap:
109  _ASSERT(add_flags & fAddFlag_IncludeSeqGaps);
110  AddGap(
111  eGapType_SeqGap, pSeqId,
112  seqmap_ci.GetLength(),
113  bioseq_len,
114  seqmap_ci.GetPosition(), seqmap_ci.GetEndPosition(),
115  fFlags);
116  break;
117  case CSeqMap::eSeqData:
120  seqmap_ci, pSeqId,
121  bioseq_len, fFlags);
122  break;
123  default:
125  "This segment type is not supported at this time: " <<
126  static_cast<int>(seg_type) );
127  }
128  }
129 
130  if( end_of_last_segment != bioseq_len ) {
131  all_segments_and_in_order = false;
132  }
133  if( ! all_segments_and_in_order ) {
134  ERR_POST(
135  Warning << "Not all segments on bioseq '"
136  << pSeqId->AsFastaString() << "' were in order "
137  "or some positions appear to have been skipped. "
138  "One possible reason is that there were far references for "
139  "which no attempt was made to resolve due to max resolve count "
140  "being reached.");
141  }
142 }
143 
145  EGapType eGapType,
146  TSeqIdConstRef pSeqId,
147  TGapLength iGapLength,
148  TSeqPos iBioseqLength,
149  TSeqPos iGapStartPos,
150  TSeqPos iGapEndPosExclusive,
151  TFlag fFlags)
152 {
153  // filter out edge gaps if requested
154  _ASSERT(iGapStartPos < iGapEndPosExclusive);
155  _ASSERT((iGapEndPosExclusive - iGapStartPos) == iGapLength);
156 
157  if( ! (fFlags & fFlag_IncludeEndGaps) ) {
158  if( iGapStartPos == 0 ||
159  iGapEndPosExclusive == iBioseqLength )
160  {
161  // skip since it's an end gap
162  return;
163  }
164  }
165 
166  m_gapTypeAndLengthToSeqIds[eGapType][iGapLength].insert(pSeqId);
167  m_gapTypeAndLengthToSeqIds[eGapType_All][iGapLength].insert(pSeqId);
168 
169  ++m_gapTypeAndLengthToNumAppearances[eGapType][iGapLength];
171 
172  x_GetOrCreateHistogramBinner(eGapType).AddNumber(iGapLength);
174 }
175 
177 {
181 }
182 
183 ostream& operator<<(
184  ostream& s,
185  const CGapAnalysis::SOneGapLengthSummary & one_gap_len_summary )
186 {
187  s << "SOneGapLengthSummary("
188  << "gap_length: " << one_gap_len_summary.gap_length
189  << ", num_seqs: " << one_gap_len_summary.num_seqs
190  << ", num_gaps: " << one_gap_len_summary.num_gaps
191  << ")";
192  return s;
193 }
194 
195 ostream& operator<<(
196  ostream& s,
197  const CGapAnalysis::TVectorGapLengthSummary& gap_len_summary)
198 {
199  s << "TVectorGapLengthSummary(" << endl;
200  ITERATE(CGapAnalysis::TVectorGapLengthSummary, summ_it, gap_len_summary )
201  {
202  s << **summ_it << endl;
203  }
204 
205  s << ")";
206  return s;
207 }
208 
211  EGapType eGapType,
212  ESortGapLength eSortGapLength,
213  ESortDir eSortDir) const
214 {
216  const TMapGapLengthToSeqIds & mapGapLengthToSeqIds =
217  GetGapLengthSeqIds(eGapType);
218  const TMapGapLengthToNumAppearances & m_mapGapLengthToNumAppearances =
220  ITERATE( TMapGapLengthToSeqIds, gap_map_iter, mapGapLengthToSeqIds ) {
221  const TGapLength iGapLength = gap_map_iter->first;
222  const TSetSeqIdConstRef & setSeqIds = gap_map_iter->second;
223 
224  // find appearances of each gap length
225  Uint8 num_gaps = 0;
227  m_mapGapLengthToNumAppearances.find(iGapLength);
228  _ASSERT( find_iter != m_mapGapLengthToNumAppearances.end() );
229  num_gaps = find_iter->second;
230 
231  pAnswer->push_back(
233  iGapLength,
234  setSeqIds.size(),
235  num_gaps )));
236  }
237 
238  // sort if user uses non-default ordering
239  if( eSortGapLength != eSortGapLength_Length ||
240  eSortDir != eSortDir_Ascending )
241  {
242  SOneGapLengthSummarySorter sorter(eSortGapLength, eSortDir);
243  stable_sort(pAnswer->begin(), pAnswer->end(), sorter );
244  }
245 
246  return pAnswer;
247 }
248 
251 {
252  static const TMapGapLengthToSeqIds empty_map;
255  if( find_it != m_gapTypeAndLengthToSeqIds.end() ) {
256  return find_it->second;
257  } else {
258  return empty_map;
259  }
260 }
261 
264 {
265  static TMapGapLengthToNumAppearances empty_map;
268  if( find_it != m_gapTypeAndLengthToNumAppearances.end() ) {
269  return find_it->second;
270  } else {
271  return empty_map;
272  }
273 }
274 
277  EGapType eGapType,
278  Uint8 num_bins,
280 {
281  CHistogramBinning & histogramBinner =
283  histogramBinner.SetNumBins(num_bins);
284  return histogramBinner.CalcHistogram(eHistAlgo);
285 }
286 
288  ESortGapLength sort_gap_length_arg,
289  ESortDir sort_dir_arg )
290  : sort_gap_length(sort_gap_length_arg), sort_dir(sort_dir_arg)
291 {
292  // nothing to do
293 }
294 
297  const CConstRef<SOneGapLengthSummary> & rhs ) const
298 {
299  // handle if sorting reversed
300  const SOneGapLengthSummary & real_lhs = (sort_dir == eSortDir_Ascending ? *lhs : *rhs);
301  const SOneGapLengthSummary & real_rhs = (sort_dir == eSortDir_Ascending ? *rhs : *lhs);
302 
303  switch(sort_gap_length) {
305  return real_lhs.gap_length < real_rhs.gap_length;
307  return real_lhs.num_seqs < real_rhs.num_seqs;
309  return real_lhs.num_gaps < real_rhs.num_gaps;
310  default:
311  NCBI_USER_THROW_FMT("Unknown sort_gap_length: " <<
312  static_cast<int>(sort_gap_length) );
313  }
314 }
315 
318 {
321  if( find_iter != m_gapTypeToHistogramBinner.end() ) {
322  return find_iter->second->GetData();
323  }
324 
325  // not found, so create
326  TRefHistogramBinning new_value(
328  typedef pair<TGapTypeToHistogramBinner::iterator, bool> TInsertResult;
329  TInsertResult insert_result = m_gapTypeToHistogramBinner.insert(
330  make_pair(eGapType, new_value));
331  _ASSERT(insert_result.second);
332 
333  return insert_result.first->second->GetData();
334 }
335 
337  const CSeqMap_CI & seqmap_ci,
338  TSeqIdConstRef bioseq_seq_id,
339  TSeqPos iBioseqLength,
340  TFlag fFlags)
341 {
342  const TSeqPos begin_pos = seqmap_ci.GetPosition();
343 
344  // get location representing this segment's bases
345  CRef<CSeq_loc> loc_of_bases(
346  new CSeq_loc(
347  *SerialClone(*bioseq_seq_id),
348  begin_pos,
349  (begin_pos + seqmap_ci.GetLength() - 1)));
350  CSeqVector seq_vec(
351  *loc_of_bases, *seqmap_ci.GetScope(), CBioseq_Handle::eCoding_Iupac);
352  const char kGapChar = seq_vec.GetGapChar(
354 
355  // a simple "runs of unknown bases" algo
356  size_t size_of_curr_gap = 0;
357  size_t start_pos_of_curr_gap = kInvalidSeqPos;
358 
359  CSeqVector_CI seq_vec_ci = seq_vec.begin();
360  for( ; seq_vec_ci; ++seq_vec_ci ) {
361  if( *seq_vec_ci == kGapChar ) {
362  ++size_of_curr_gap;
363  if( start_pos_of_curr_gap == kInvalidSeqPos ) {
364  start_pos_of_curr_gap = (begin_pos + seq_vec_ci.GetPos());
365  }
366  } else if( size_of_curr_gap > 0 ) {
367  _ASSERT(start_pos_of_curr_gap != kInvalidSeqPos);
368  AddGap(
369  eGapType_UnknownBases, bioseq_seq_id, size_of_curr_gap,
370  iBioseqLength,
371  start_pos_of_curr_gap, (begin_pos + seq_vec_ci.GetPos()),
372  fFlags);
373  size_of_curr_gap = 0;
374  start_pos_of_curr_gap = kInvalidSeqPos;
375  }
376  }
377  if( size_of_curr_gap > 0 ) {
378  _ASSERT(start_pos_of_curr_gap != kInvalidSeqPos);
379  AddGap(
380  eGapType_UnknownBases, bioseq_seq_id, size_of_curr_gap,
381  iBioseqLength,
382  start_pos_of_curr_gap, (begin_pos + seq_vec_ci.GetPos()),
383  fFlags);
384  }
385 }
386 
AutoPtr –.
Definition: ncbimisc.hpp:401
CBioseq_CI –.
Definition: bioseq_ci.hpp:69
CBioseq_Handle –.
This class does comparison of SOneGapLengthSummary, and it is adjustable which field to sort on and w...
bool operator()(const CConstRef< SOneGapLengthSummary > &lhs, const CConstRef< SOneGapLengthSummary > &rhs) const
less-than
SOneGapLengthSummarySorter(ESortGapLength sort_gap_length_arg, ESortDir sort_dir_arg)
Give this class gaps, or handles containing gaps, and then you can get statistics on those gaps.
const TMapGapLengthToSeqIds & GetGapLengthSeqIds(EGapType eGapType) const
Returns a map of gap_length to the set of all seq-ids that contain at least one gap of that length.
@ fFlag_IncludeEndGaps
include gaps that are at the very start or very end of their sequence.
void AddSeqEntryGaps(const CSeq_entry_Handle &entry_h, CSeq_inst::EMol filter=CSeq_inst::eMol_not_set, CBioseq_CI::EBioseqLevelFlag level=CBioseq_CI::eLevel_All, TAddFlag add_flags=fAddFlag_All, TFlag fFlags=0, size_t max_resolve_count=kMax_Int)
Calls AddGap for each gap anywhere under the given CSeq_entry.
void AddGap(EGapType eGapType, TSeqIdConstRef pSeqId, TGapLength iGapLength, TSeqPos iBioseqLength, TSeqPos iGapStartPos, TSeqPos iGapEndPosExclusive, TFlag fFlags=0)
AddSeqEntryGaps is more convenient, but if you want finer-grained control you can use this function t...
vector< CConstRef< SOneGapLengthSummary > > TVectorGapLengthSummary
This holds the information for every encountered gap length.
@ fAddFlag_IncludeSeqGaps
include seq-gaps
@ fAddFlag_IncludeUnknownBases
include runs of N for nucs or X for prots.
ESortGapLength
Use this to control what results are sorted on.
@ eSortGapLength_NumGaps
Sort gap lengths by number of times they appear anywhere.
@ eSortGapLength_Length
Sort by gap length.
@ eSortGapLength_NumSeqs
Sort gap lengths by number of sequences that contain one or more gaps of the given length.
AutoPtr< TVectorGapLengthSummary > GetGapLengthSummary(EGapType eGapType, ESortGapLength eSortGapLength=eSortGapLength_Length, ESortDir eSortDir=eSortDir_Ascending) const
This gives summary information about every gap-length encountered so far.
Uint8 TGapLength
Use typedef in case we change the underlying.
void AddBioseqGaps(const CBioseq_Handle &bioseq_h, TAddFlag add_flags=fAddFlag_All, TFlag fFlags=0, size_t max_resolve_count=kMax_Int)
Similar to AddSeqEntryGaps, but for one Bioseq.
const TMapGapLengthToNumAppearances & GetGapLengthToNumAppearances(EGapType eGapType) const
Returns a map of gap_length to the number of times such a gap appears.
TGapTypeToHistogramBinner m_gapTypeToHistogramBinner
void clear(void)
Start analysis over again.
CHistogramBinning & x_GetOrCreateHistogramBinner(EGapType eGapType)
Use this instead of operator[] because the default constructor of TRefHistogramBinning is an empty re...
AutoPtr< CHistogramBinning::TListOfBins > GetGapHistogram(EGapType eGapType, Uint8 num_bins=0, CHistogramBinning::EHistAlgo eHistAlgo=CHistogramBinning::eHistAlgo_Default)
This returns a histogram of gap length vs.
TGapTypeAndLengthToNumAppearances m_gapTypeAndLengthToNumAppearances
void x_AddGapsFromBases(const CSeqMap_CI &seqmap_ci, TSeqIdConstRef bioseq_seq_id, TSeqPos iBioseqLength, TFlag fFlags)
Add gaps based on unknown bases which are letters.
TGapTypeAndLengthToSeqIds m_gapTypeAndLengthToSeqIds
For each gap-type and gap length, this holds all the seq-ids which have one or more gaps of that leng...
ESortDir
This controls the direction of sort order for functions that also take ESortGapLength.
Given a set of integer data, this will bin the data for use in histograms.
void SetNumBins(Uint8 num_bins)
This should not normally be needed, since number of bins is usually picked in the constructor.
TListOfBins * CalcHistogram(EHistAlgo eHistAlgo=eHistAlgo_Default) const
Call this after data is loaded via AddNumber, etc.
void AddNumber(TValue the_number, Uint8 num_appearances=1)
Give this histogram another number to bin.
EHistAlgo
Pick which binning algorithm to use when generating the histogram.
CObjectFor –.
Definition: ncbiobj.hpp:2335
CRef –.
Definition: ncbiobj.hpp:618
Iterator over CSeqMap.
Definition: seq_map_ci.hpp:252
CSeqVector –.
Definition: seq_vector.hpp:65
CSeq_entry_Handle –.
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
container_type::const_iterator const_iterator
Definition: map.hpp:53
const_iterator end() const
Definition: map.hpp:152
iterator_bool insert(const value_type &val)
Definition: map.hpp:165
void clear()
Definition: map.hpp:169
const_iterator find(const key_type &key) const
Definition: map.hpp:153
Definition: map.hpp:338
Definition: set.hpp:45
size_type size() const
Definition: set.hpp:132
ostream & operator<<(ostream &s, const CGapAnalysis::SOneGapLengthSummary &one_gap_len_summary)
Analyzes gaps and produces various statistics.
static const char kGapChar('-')
The representation of a gap in ASCII format.
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
const TSeqPos kInvalidSeqPos
Define special value for invalid sequence position.
Definition: ncbimisc.hpp:878
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
#define NCBI_USER_THROW_FMT(message)
Throw a "user exception" with message processed as output to ostream.
Definition: ncbiexpt.hpp:724
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1191
C * SerialClone(const C &src)
Create on heap a clone of the source object.
Definition: serialbase.hpp:512
const string AsFastaString(void) const
Definition: Seq_id.cpp:2265
TSeqPos GetBioseqLength(void) const
CConstRef< CSeq_id > GetSeqId(void) const
Get id which can be used to access this bioseq handle. Throws an exception if none is available.
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
TSeqPos GetEndPosition(void) const
return end position of current segment in sequence (exclusive)
Definition: seq_map_ci.hpp:679
SSeqMapSelector & SetResolveCount(size_t res_cnt)
Set max depth of resolving seq-map.
Definition: seq_map_ci.hpp:151
EBioseqLevelFlag
Class of bioseqs to iterate.
Definition: bioseq_ci.hpp:72
TSeqPos GetPos(void) const
CScope * GetScope(void) const
Definition: seq_map_ci.hpp:644
SSeqMapSelector & SetFlags(TFlags flags)
Select segment type(s)
Definition: seq_map_ci.hpp:179
CSeqMap::ESegmentType GetType(void) const
Definition: seq_map_ci.hpp:651
TSeqPos GetPosition(void) const
return position of current segment in sequence
Definition: seq_map_ci.hpp:665
TSeqPos GetLength(void) const
return length of current segment
Definition: seq_map_ci.hpp:672
int TFlags
Definition: seq_map.hpp:142
const_iterator begin(void) const
Definition: seq_vector.hpp:298
ESegmentType
Definition: seq_map.hpp:96
TResidue GetGapChar(ECaseConversion case_cvt=eCaseConversion_none) const
Return gap symbol corresponding to the selected coding.
Definition: seq_vector.hpp:318
@ fFindGap
Definition: seq_map.hpp:130
@ fFindData
Definition: seq_map.hpp:129
@ eSeqData
real sequence data
Definition: seq_map.hpp:98
@ eSeqGap
gap
Definition: seq_map.hpp:97
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
Definition: ncbiobj.hpp:2015
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
EMol
molecule class in living organism
Definition: Seq_inst_.hpp:108
This holds information about a given gap_length.
Uint8 num_gaps
number of times gaps of this length appear anywhere
Uint8 num_seqs
number of sequences which contain one or more gaps of the given length
Selector used in CSeqMap methods returning iterators.
Definition: seq_map_ci.hpp:113
#define _ASSERT
Modified on Sat Dec 02 09:20:03 2023 by modify_doxy.py rev. 669887