60 template<
typename TType,
int Size>
61 void s_FillArray(TType(&
array)[Size],
const TType &
value) {
68 template<
int TableSize,
int InputSize>
69 void s_SetAmbigLookupTableFromArray(
70 bool (&ambig_lookup_table)[TableSize],
76 _ASSERT( chInputChar >= kFirstCharInLookupTable &&
77 chInputChar <= kLastCharInLookupTable );
78 ambig_lookup_table[chInputChar - kFirstCharInLookupTable] =
value;
85 return (iSeqPos == 1 || iSeqPos == -1);
88 bool s_IsSupportedSegmentType(
const CSeqMap_CI & segment )
106 if( iTrimDirection < 0 ) {
107 return (iStartPos < iEndPos);
109 return (iStartPos > iEndPos);
113 struct PVecTrimRulesLessThan {
126 struct PVecTrimRulesHaveSameNumberOfBases {
134 struct PVecTrimRuleAlwaysPasses {
148 pTrimRuleVec->push_back(arrTrimRules[rule_idx]);
150 return pTrimRuleVec.release();
159 s_DefaultRuleCreator,
NULL );
160 return s_DefaultTrimRules.
Get();
168 : m_eMeaningOfAmbig(eMeaningOfAmbig),
170 m_vecTrimRules(trimRuleVec),
171 m_uMinSeqLen(uMinSeqLen)
177 (1 + kLastCharInLookupTable - kFirstCharInLookupTable) );
179 (1 + kLastCharInLookupTable - kFirstCharInLookupTable) );
197 s_SetAmbigLookupTableFromArray(
201 'B',
'J',
'X',
'Z' };
202 s_SetAmbigLookupTableFromArray(
222 if( bioseq_len < 1 ) {
230 seqvec, leftmost_good_base, rightmost_good_base,
233 if( leftmost_good_base > rightmost_good_base ) {
235 if( trimmed_ranges ) {
236 *trimmed_ranges +=
TSeqRange(0, bioseq_len - 1);
243 rightmost_good_base =
245 seqvec, rightmost_good_base, leftmost_good_base,
248 if( leftmost_good_base > rightmost_good_base ) {
250 if( trimmed_ranges ) {
251 *trimmed_ranges +=
TSeqRange(0, bioseq_len - 1);
258 if( (leftmost_good_base == 0) &&
259 (rightmost_good_base == (bioseq_len - 1)) )
266 leftmost_good_base, rightmost_good_base,
268 if ( trimmed_ranges ) {
269 if( leftmost_good_base > 0 ) {
270 *trimmed_ranges +=
TSeqRange(0, leftmost_good_base - 1);
272 if( rightmost_good_base < bioseq_len - 1 ) {
273 *trimmed_ranges +=
TSeqRange(rightmost_good_base + 1,
289 sort( vecTrimRules.begin(), vecTrimRules.end(),
290 PVecTrimRulesLessThan() );
297 TTrimRuleVec::iterator new_end_iter =
299 vecTrimRules.begin(), vecTrimRules.end(),
300 PVecTrimRulesHaveSameNumberOfBases() );
301 vecTrimRules.erase( new_end_iter, vecTrimRules.end() );
306 vecTrimRules.begin(), vecTrimRules.end(),
307 PVecTrimRuleAlwaysPasses() );
308 vecTrimRules.erase( new_end_iter, vecTrimRules.end() );
313 const STrimRule & trimRule = *trim_rule_it;
315 problems_strm <<
"A rule has a non-positive number of "
316 "bases to check" << endl;
321 problems_strm <<
"There is a rule where bases_to_check "
323 "equal to max bases allowed ("
332 if( ! sProblems.empty() ) {
334 "Cannot create CSequenceAmbigTrimmer due to issues with rules: "
356 bioseq_eh.
SetInst( *pNewSeqInst );
370 iStartPosInclusive_arg, iEndPosInclusive_arg, iTrimDirection) )
372 return ( iTrimDirection > 0
380 TSignedSeqPos uStartOfGoodBasesSoFar = iStartPosInclusive_arg;
394 1 +
abs(uEndOfGoodBasesSoFar - uStartOfGoodBasesSoFar );
398 uOldBasesLeft = iNumBasesLeft;
403 const STrimRule & trimRule = *trim_rule_it;
415 uStartOfGoodBasesSoFar +
422 uStartOfGoodBasesSoFar,
423 iEndPosToCheckForThisRule,
438 iEndPosToCheckForThisRule,
443 uStartOfGoodBasesSoFar +=
454 uStartOfGoodBasesSoFar,
455 uEndOfGoodBasesSoFar,
457 uFewestBasesCheckedInARule );
461 uStartOfGoodBasesSoFar =
470 if( s_IsEmptyRange(uStartOfGoodBasesSoFar, uEndOfGoodBasesSoFar, iTrimDirection) ) {
473 iNumBasesLeft = 1 +
abs(uEndOfGoodBasesSoFar - uStartOfGoodBasesSoFar );
475 if( iNumBasesLeft == uOldBasesLeft ) {
488 uStartOfGoodBasesSoFar,
489 uEndOfGoodBasesSoFar,
494 return uStartOfGoodBasesSoFar;
506 in_out_uStartOfGoodBasesSoFar, uEndOfGoodBasesSoFar, iTrimDirection) )
515 if( ! pAmbigLookupTable ) {
517 "Unable to determine molecule type of sequence");
520 TSignedSeqPos newStartOfGoodBases = in_out_uStartOfGoodBasesSoFar;
521 while( ! s_IsEmptyRange(newStartOfGoodBases, uEndOfGoodBasesSoFar, iTrimDirection) &&
522 (*pAmbigLookupTable)[ seqvec[newStartOfGoodBases] - kFirstCharInLookupTable] )
527 newStartOfGoodBases, &seqvec.
GetScope() );
532 while( ! s_IsEmptyRange(newStartOfGoodBases, end_of_segment, iTrimDirection) &&
533 ! s_IsEmptyRange(newStartOfGoodBases, uEndOfGoodBasesSoFar, iTrimDirection) &&
534 (*pAmbigLookupTable)[ seqvec[newStartOfGoodBases] - kFirstCharInLookupTable] )
536 newStartOfGoodBases += iTrimDirection;
555 if( s_IsEmptyRange(newStartOfGoodBases, uEndOfGoodBasesSoFar, iTrimDirection) )
558 iNumBasesToRemove = 1 +
abs(uEndOfGoodBasesSoFar - in_out_uStartOfGoodBasesSoFar);
560 iNumBasesToRemove =
abs(newStartOfGoodBases - in_out_uStartOfGoodBasesSoFar);
565 iNumBasesToRemove = (iNumBasesToRemove / uChunkSize) * uChunkSize;
568 in_out_uStartOfGoodBasesSoFar += (iTrimDirection * iNumBasesToRemove);
579 iStartPosInclusive_arg, iEndPosInclusive_arg, iTrimDirection) )
590 iStartPosInclusive_arg, pScope );
596 if(
NULL == pAmbigLookupTable ) {
604 iEndPosInclusive_arg, iTrimDirection);
616 switch( eSegmentType ) {
622 1 +
abs(segmentEndPosInclusive - segmentStartPosInclusive),
623 1 +
abs(segmentStartPosInclusive - iEndPosInclusive_arg) );
641 ! s_IsEmptyRange(pos, segmentEndPosInclusive, iTrimDirection) &&
642 ! s_IsEmptyRange(pos, iEndPosInclusive_arg, iTrimDirection)
644 pos += iTrimDirection)
647 if( residue < kFirstCharInLookupTable || residue > kLastCharInLookupTable ||
648 (*pAmbigLookupTable)[residue - kFirstCharInLookupTable])
654 }
else if( s_IsEmptyRange(
665 <<
static_cast<int>(eSegmentType)
666 <<
" are not supported at this time");
678 _ASSERT( s_IsSupportedSegmentType(segment) );
680 if( iTrimDirection == 1 ) {
685 _ASSERT( iTrimDirection == -1 );
697 return ( iTrimDirection == 1 ? ++in_out_segment_it : --in_out_segment_it );
714 1 + ( rightmost_good_base - leftmost_good_base ) );
715 for( ; seqmap_ci; ++seqmap_ci ) {
727 if( pOriginalGapSeqLiteral ) {
728 pNewGapLiteral->Assign(*pOriginalGapSeqLiteral);
730 if( ! bIsLengthKnown ) {
733 pNewGapLiteral->SetLength( uGapLength );
735 pDeltaSeq->SetLiteral( *pNewGapLiteral );
737 pDeltaExt->
Set().push_back(
Ref(&*pDeltaSeq) );
750 pDeltaSeq->SetLiteral().SetLength( seqmap_ci.
GetLength() );
751 pDeltaSeq->SetLiteral().SetSeq_data( *pSeqData );
753 pDeltaExt->
Set().push_back(
Ref(&*pDeltaSeq) );
758 "seqmap segments of type " <<
static_cast<int>(
eType) );
770 seq_inst.
SetLength( 1 + ( rightmost_good_base - leftmost_good_base ) );
771 if( pDeltaExt->
Set().empty() ) {
773 }
else if( pDeltaExt->
Set().size() == 1 ) {
776 CSeq_data & seq_data = pDeltaSeq->SetLiteral().SetSeq_data();
779 seq_inst.
SetExt().SetDelta( *pDeltaExt );
void remove_if(Container &c, Predicate *__pred)
void Set(T *object)
Initialize with an existing object.
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string. Sample usage:
position_type GetCoveredLength(void) const
Returns total length covered by ranges in this collection, i.e.
T & Get(void)
Create the variable if not created yet, return the reference.
Include a standard set of the NCBI C++ Toolkit most basic headers.
static bool s_IsValidDirection(const string &direction)
#define ITERATE_0_IDX(idx, up_to)
idx loops from 0 (inclusive) to up_to (exclusive)
unsigned int TSeqPos
Type for sequence locations and lengths.
constexpr size_t ArraySize(const Element(&)[Size])
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
int TSignedSeqPos
Type for signed sequence position.
#define NCBI_USER_THROW(message)
Throw a quick-and-dirty runtime exception of type 'CException' with the given error message and error...
#define NCBI_USER_THROW_FMT(message)
Throw a "user exception" with message processed as output to ostream.
C * SerialClone(const C &src)
Create on heap a clone of the source object.
TSignedSeqPos pos_after_last_gap
Inclusive.
EMeaningOfAmbig m_eMeaningOfAmbig
This holds the current interpretation for "ambiguous".
bool TAmbigLookupTable[26]
virtual EResult x_TrimToNothing(CBioseq_Handle &bioseq_handle)
The bioseq is trimmed to size 0.
TSignedSeqPos m_uMinSeqLen
When the bioseq gets trimmed down to less than this size, we halt the trimming.
TSignedSeqPos max_bases_allowed_to_be_ambig
TSignedSeqPos x_SegmentGetEndInclusive(const CSeqMap_CI &segment, const TSignedSeqPos iTrimDirection)
This returns the (inclusive) position at the end of the given segment.
TFlags m_fFlags
This holds the flags that affect the behavior of this class.
CSequenceAmbigTrimmer(EMeaningOfAmbig eMeaningOfAmbig, TFlags fFlags=0, const TTrimRuleVec &vecTrimRules=GetDefaultTrimRules(), TSignedSeqPos uMinSeqLen=50)
This sets up the parameters for how this trimmer will act.
CSeqMap_CI & x_SeqMapIterDoNext(CSeqMap_CI &in_out_segment_it, const TSignedSeqPos iTrimDirection)
Returns the "next" segment.
EMeaningOfAmbig
This enum is used to set what is meant by "ambiguous".
virtual void x_NormalizeVecTrimRules(TTrimRuleVec &vecTrimRules)
This prepares the vector of trimming rules to be used by the trimming algorithm.
TAmbigLookupTable m_arrNucAmbigLookupTable
EResult
This indicates what happened with the trim.
TSignedSeqPos bases_to_check
virtual TSignedSeqPos x_FindWhereToTrim(const CSeqVector &seqvec, const TSignedSeqPos iStartPosInclusive_arg, const TSignedSeqPos iEndPosInclusive_arg, TSignedSeqPos iTrimDirection)
This returns the last good base that won't be trimmed (note: last really means "first" when we're sta...
vector< STrimRule > TTrimRuleVec
Multiple STrimRules are allowed, which are applied from smallest bases_to_check to largest bases_to_c...
TAmbigLookupTable m_arrProtAmbigLookupTable
virtual EResult DoTrim(CBioseq_Handle &bioseq_handle, CRangeCollection< TSeqPos > *trimmed_ranges=nullptr)
This trims the given bioseq, using params set in the CSequenceAmbigTrimmer constructor.
virtual void x_EdgeSeqMapGapAdjust(const CSeqVector &seqvec, TSignedSeqPos &in_out_uStartOfGoodBasesSoFar, const TSignedSeqPos uEndOfGoodBasesSoFar, const TSignedSeqPos iTrimDirection, const TSignedSeqPos uChunkSize)
This adjusts in_out_uStartOfGoodBasesSoFar if we're at a CSeqMap gap.
static const TTrimRuleVec & GetDefaultTrimRules(void)
This returns a reasonable default for trimming rules.
TSignedSeqPos x_SegmentGetBeginningInclusive(const CSeqMap_CI &segment, const TSignedSeqPos iTrimDirection)
This returns the (inclusive) position at the beginning of the segment.
virtual void x_CountAmbigInRange(SAmbigCount &out_result, const CSeqVector &seqvec, const TSignedSeqPos iStartPosInclusive_arg, const TSignedSeqPos iEndPosInclusive_arg, const TSignedSeqPos iTrimDirection)
This counts the number of ambiguous bases in the inclusive range [iStartPosInclusive_arg, iEndPosInclusive_arg], traversed in the direction given by iTrimDirection, and stores the result in out_result.
void x_SliceBioseq(TSignedSeqPos leftmost_good_base, TSignedSeqPos rightmost_good_base, CBioseq_Handle &bioseq_handle)
TSignedSeqPos num_ambig_bases
the number of ambiguous bases found in the range supplied to x_CountAmbigInRange
TTrimRuleVec m_vecTrimRules
This holds the trimming rules that will be applied.
bool x_TestFlag(TFlags fFlag)
Test if a given flag is set.
@ fFlags_DoNotTrimBeginning
0x01 ("Beginning" as defined by CSeqVector)
@ fFlags_DoNotTrimEnd
0x02 ("End" as defined by CSeqVector)
@ fFlags_DoNotTrimSeqGap
0x04 (Seq-gaps are not considered trimmable if this flag is set, only letter gaps (e....
@ eMeaningOfAmbig_AnyAmbig
Here, anything that's not certain is considered ambiguous.
@ eMeaningOfAmbig_OnlyCompletelyUnknown
Here, only N for nucleotides and X for amino acids are considered ambiguous.
@ eResult_NoTrimNeeded
Bioseq is left unchanged because it did not need to be trimmed at all.
@ eResult_SuccessfullyTrimmed
Bioseq is now trimmed.
TSeqPos GetBioseqLength(void) const
void SetInst(TInst &v) const
CBioseq_EditHandle GetEditHandle(void) const
Get 'edit' version of handle.
CScope & GetScope(void) const
Get scope this handle belongs to.
const CSeqMap & GetSeqMap(void) const
Get sequence map.
const TInst & GetInst(void) const
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
TSeqPos GetEndPosition(void) const
return end position of current segment in sequence (exclusive)
CSeqMap::ESegmentType GetType(void) const
bool IsUnknownLength(void) const
return true if current segment is a gap of unknown length
TSeqPos GetPosition(void) const
return position of current segment in sequence
TSeqPos GetLength(void) const
return length of current segment
CConstRef< CSeq_literal > GetRefGapLiteral(void) const
return CSeq_literal with gap data, or null if either the segment is not a gap, or an unspecified gap
TCoding GetCoding(void) const
Target sequence coding.
const CSeqMap & GetSeqMap(void) const
CSeqMap_CI FindSegment(TSeqPos pos, CScope *scope) const
Find segment containing the position.
bool IsProtein(void) const
CSeqMap_CI ResolvedRangeIterator(CScope *scope, TSeqPos from, TSeqPos length, ENa_strand strand=eNa_strand_plus, size_t maxResolve=size_t(-1), TFlags flags=fDefaultFlags) const
Iterate segments in the range with specified strand coordinates.
bool IsNucleotide(void) const
void GetPackedSeqData(string &buffer, TSeqPos start=0, TSeqPos stop=kInvalidSeqPos)
TMol GetSequenceType(void) const
CScope & GetScope(void) const
@ eSeqData
real sequence data
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
CRange< TSeqPos > TSeqRange
typedefs for sequence ranges
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define BEGIN_SCOPE(ns)
Define a new scope.
void SetExt(TExt &value)
Assign a value to Ext data member.
void SetRepr(TRepr value)
Assign a value to Repr data member.
void SetLength(TLength value)
Assign a value to Length data member.
void SetSeq_data(TSeq_data &value)
Assign a value to Seq_data data member.
void ResetExt(void)
Reset Ext data member.
void ResetSeq_data(void)
Reset Seq_data data member.
@ eRepr_raw
continuous sequence
@ eRepr_virtual
no seq data
constexpr auto sort(_Init &&init)
const GenericPointer< typename T::ValueType > T2 value
Static variables safety - create on demand, destroy on application termination.
This holds the output of x_CountAmbigInRange.
For example, if bases_to_check is 10 and max_bases_allowed_to_be_ambig is 5, then on each iteration w...