54 case eGapType_UnknownBases:
55 return "Unknown Bases";
57 return "UNKNOWN GAP TYPE";
67 size_t max_resolve_count)
70 for( ; bioseq_ci; ++bioseq_ci ) {
71 AddBioseqGaps(*bioseq_ci, add_flags, fFlags, max_resolve_count);
79 size_t max_resolve_count)
94 TSeqPos end_of_last_segment = 0;
95 bool all_segments_and_in_order =
true;
100 for( ; seqmap_ci; ++seqmap_ci ) {
101 if( seqmap_ci.
GetPosition() != end_of_last_segment ) {
102 all_segments_and_in_order =
false;
125 "This segment type is not supported at this time: " <<
126 static_cast<int>(seg_type) );
130 if( end_of_last_segment != bioseq_len ) {
131 all_segments_and_in_order =
false;
133 if( ! all_segments_and_in_order ) {
135 Warning <<
"Not all segments on bioseq '"
137 "or some positions appear to have been skipped. "
138 "One possible reason is that there were far references for "
139 "which no attempt was made to resolve due to max resolve count "
154 _ASSERT(iGapStartPos < iGapEndPosExclusive);
155 _ASSERT((iGapEndPosExclusive - iGapStartPos) == iGapLength);
158 if( iGapStartPos == 0 ||
159 iGapEndPosExclusive == iBioseqLength )
187 s <<
"SOneGapLengthSummary("
188 <<
"gap_length: " << one_gap_len_summary.
gap_length
189 <<
", num_seqs: " << one_gap_len_summary.
num_seqs
190 <<
", num_gaps: " << one_gap_len_summary.
num_gaps
199 s <<
"TVectorGapLengthSummary(" << endl;
202 s << **summ_it << endl;
221 const TGapLength iGapLength = gap_map_iter->first;
227 m_mapGapLengthToNumAppearances.
find(iGapLength);
228 _ASSERT( find_iter != m_mapGapLengthToNumAppearances.
end() );
229 num_gaps = find_iter->second;
243 stable_sort(pAnswer->begin(), pAnswer->end(), sorter );
256 return find_it->second;
269 return find_it->second;
290 : sort_gap_length(sort_gap_length_arg), sort_dir(sort_dir_arg)
303 switch(sort_gap_length) {
312 static_cast<int>(sort_gap_length) );
322 return find_iter->second->GetData();
328 typedef pair<TGapTypeToHistogramBinner::iterator, bool> TInsertResult;
330 make_pair(eGapType, new_value));
333 return insert_result.first->second->GetData();
349 (begin_pos + seqmap_ci.
GetLength() - 1)));
356 size_t size_of_curr_gap = 0;
360 for( ; seq_vec_ci; ++seq_vec_ci ) {
364 start_pos_of_curr_gap = (begin_pos + seq_vec_ci.
GetPos());
366 }
else if( size_of_curr_gap > 0 ) {
371 start_pos_of_curr_gap, (begin_pos + seq_vec_ci.
GetPos()),
373 size_of_curr_gap = 0;
377 if( size_of_curr_gap > 0 ) {
382 start_pos_of_curr_gap, (begin_pos + seq_vec_ci.
GetPos()),
This class does comparison of SOneGapLengthSummary, and it is adjustable which field to sort on and w...
bool operator()(const CConstRef< SOneGapLengthSummary > &lhs, const CConstRef< SOneGapLengthSummary > &rhs) const
less-than
SOneGapLengthSummarySorter(ESortGapLength sort_gap_length_arg, ESortDir sort_dir_arg)
Give this class gaps, or handles containing gaps, and then you can get statistics on those gaps.
const TMapGapLengthToSeqIds & GetGapLengthSeqIds(EGapType eGapType) const
Returns a map of gap_length to the set of all seq-ids that contain at least one gap of that length.
@ fFlag_IncludeEndGaps
include gaps that are at the very start or very end of their sequence.
void AddSeqEntryGaps(const CSeq_entry_Handle &entry_h, CSeq_inst::EMol filter=CSeq_inst::eMol_not_set, CBioseq_CI::EBioseqLevelFlag level=CBioseq_CI::eLevel_All, TAddFlag add_flags=fAddFlag_All, TFlag fFlags=0, size_t max_resolve_count=kMax_Int)
Calls AddGap for each gap anywhere under the given CSeq_entry.
void AddGap(EGapType eGapType, TSeqIdConstRef pSeqId, TGapLength iGapLength, TSeqPos iBioseqLength, TSeqPos iGapStartPos, TSeqPos iGapEndPosExclusive, TFlag fFlags=0)
AddSeqEntryGaps is more convenient, but if you want finer-grained control you can use this function t...
vector< CConstRef< SOneGapLengthSummary > > TVectorGapLengthSummary
This holds the information for every encountered gap length.
@ fAddFlag_IncludeSeqGaps
include seq-gaps
@ fAddFlag_IncludeUnknownBases
include runs of N for nucleotides or X for proteins.
ESortGapLength
Use this to control what results are sorted on.
@ eSortGapLength_NumGaps
Sort gap lengths by number of times they appear anywhere.
@ eSortGapLength_Length
Sort by gap length.
@ eSortGapLength_NumSeqs
Sort gap lengths by number of sequences that contain one or more gaps of the given length.
AutoPtr< TVectorGapLengthSummary > GetGapLengthSummary(EGapType eGapType, ESortGapLength eSortGapLength=eSortGapLength_Length, ESortDir eSortDir=eSortDir_Ascending) const
This gives summary information about every gap-length encountered so far.
Uint8 TGapLength
Use a typedef in case we change the underlying type.
void AddBioseqGaps(const CBioseq_Handle &bioseq_h, TAddFlag add_flags=fAddFlag_All, TFlag fFlags=0, size_t max_resolve_count=kMax_Int)
Similar to AddSeqEntryGaps, but for one Bioseq.
const TMapGapLengthToNumAppearances & GetGapLengthToNumAppearances(EGapType eGapType) const
Returns a map of gap_length to the number of times such a gap appears.
TGapTypeToHistogramBinner m_gapTypeToHistogramBinner
void clear(void)
Start analysis over again.
CHistogramBinning & x_GetOrCreateHistogramBinner(EGapType eGapType)
Use this instead of operator[] because the default constructor of TRefHistogramBinning is an empty re...
AutoPtr< CHistogramBinning::TListOfBins > GetGapHistogram(EGapType eGapType, Uint8 num_bins=0, CHistogramBinning::EHistAlgo eHistAlgo=CHistogramBinning::eHistAlgo_Default)
This returns a histogram of gap length vs.
TGapTypeAndLengthToNumAppearances m_gapTypeAndLengthToNumAppearances
void x_AddGapsFromBases(const CSeqMap_CI &seqmap_ci, TSeqIdConstRef bioseq_seq_id, TSeqPos iBioseqLength, TFlag fFlags)
Add gaps based on unknown bases which are letters.
TGapTypeAndLengthToSeqIds m_gapTypeAndLengthToSeqIds
For each gap-type and gap length, this holds all the seq-ids which have one or more gaps of that leng...
ESortDir
This controls the direction of sort order for functions that also take ESortGapLength.
Given a set of integer data, this will bin the data for use in histograms.
void SetNumBins(Uint8 num_bins)
This should not normally be needed, since number of bins is usually picked in the constructor.
TListOfBins * CalcHistogram(EHistAlgo eHistAlgo=eHistAlgo_Default) const
Call this after data is loaded via AddNumber, etc.
void AddNumber(TValue the_number, Uint8 num_appearances=1)
Give this histogram another number to bin.
EHistAlgo
Pick which binning algorithm to use when generating the histogram.
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
container_type::const_iterator const_iterator
container_type::iterator iterator
const_iterator end() const
iterator_bool insert(const value_type &val)
const_iterator find(const key_type &key) const
ostream & operator<<(ostream &s, const CGapAnalysis::SOneGapLengthSummary &one_gap_len_summary)
Analyzes gaps and produces various statistics.
static const char kGapChar('-')
The representation of a gap in ASCII format.
unsigned int TSeqPos
Type for sequence locations and lengths.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
const TSeqPos kInvalidSeqPos
Define special value for invalid sequence position.
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
#define NCBI_USER_THROW_FMT(message)
Throw a "user exception" with message processed as output to ostream.
void Warning(CExceptionArgs_Base &args)
C * SerialClone(const C &src)
Create on heap a clone of the source object.
const string AsFastaString(void) const
TSeqPos GetBioseqLength(void) const
CConstRef< CSeq_id > GetSeqId(void) const
Get id which can be used to access this bioseq handle. Throws an exception if none is available.
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
TSeqPos GetEndPosition(void) const
return end position of current segment in sequence (exclusive)
SSeqMapSelector & SetResolveCount(size_t res_cnt)
Set max depth of resolving seq-map.
EBioseqLevelFlag
Class of bioseqs to iterate.
TSeqPos GetPos(void) const
CScope * GetScope(void) const
SSeqMapSelector & SetFlags(TFlags flags)
Select segment type(s)
CSeqMap::ESegmentType GetType(void) const
TSeqPos GetPosition(void) const
return position of current segment in sequence
TSeqPos GetLength(void) const
return length of current segment
const_iterator begin(void) const
TResidue GetGapChar(ECaseConversion case_cvt=eCaseConversion_none) const
Return gap symbol corresponding to the selected coding.
@ eSeqData
real sequence data
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
uint64_t Uint8
8-byte (64-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define BEGIN_SCOPE(ns)
Define a new scope.
EMol
molecule class in living organism
This holds information about a given gap_length.
Uint8 num_gaps
number of times gaps of this length appear anywhere
Uint8 num_seqs
number of sequences which contain one or more gaps of the given length
Selector used in CSeqMap methods returning iterators.