NCBI C++ ToolKit
seq_trimmer.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: seq_trimmer.cpp 91808 2020-12-14 15:35:18Z grichenk $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Michael Kornbluh, NCBI
27 *
28 * File Description:
29 * Trims the end of sequences, based on various criteria.
30 *
31 * ===========================================================================
32 */
33 #include <ncbi_pch.hpp>
34 
35 #include <objmgr/util/sequence.hpp>
36 #include <objmgr/bioseq_handle.hpp>
37 #include <objmgr/seq_map.hpp>
38 #include <objmgr/annot_ci.hpp>
39 #include <objmgr/seq_map.hpp>
40 #include <objmgr/seq_map_ci.hpp>
41 #include <objmgr/seq_vector.hpp>
42 
43 #include <corelib/ncbistd.hpp>
46 
48 
49 #include <algorithm>
50 
53 
54 namespace {
55  const static CSeqVector::TResidue kFirstCharInLookupTable = 'A';
56  const static CSeqVector::TResidue kLastCharInLookupTable = 'Z';
57 
// Set every element of a fixed-size array to the given value
// (a thin wrapper over std::fill that deduces the array length).
//
// array: the array to overwrite; its length is deduced as Size.
// value: the value copied into each element.
template<typename TType, int Size>
void s_FillArray(TType (&array)[Size], const TType & value) {
    // Qualified on purpose: for pointers to built-in types there are no
    // associated namespaces, so an unqualified fill() is only found when
    // a using-directive for std happens to be in effect.
    std::fill( array, array + Size, value );
}
64 
65  // ambig_lookup_table is a table for each letter of the alphabet.
66  // input_table is an array of all the letters that should
67  // be set to value.
68  template<int TableSize, int InputSize>
69  void s_SetAmbigLookupTableFromArray(
70  bool (&ambig_lookup_table)[TableSize],
71  const CSeqVector::TResidue(&input_table)[InputSize],
72  const bool & value )
73  {
74  ITERATE_0_IDX(input_idx, InputSize) {
75  const CSeqVector::TResidue chInputChar = input_table[input_idx];
76  _ASSERT( chInputChar >= kFirstCharInLookupTable &&
77  chInputChar <= kLastCharInLookupTable );
78  ambig_lookup_table[chInputChar - kFirstCharInLookupTable] = value;
79  }
80  }
81 
82 #ifdef _DEBUG
83  bool s_IsValidDirection(const TSignedSeqPos iSeqPos)
84  {
85  return (iSeqPos == 1 || iSeqPos == -1);
86  }
87 
88  bool s_IsSupportedSegmentType(const CSeqMap_CI & segment )
89  {
90  switch( segment.GetType() ) {
91  case CSeqMap::eSeqGap:
92  case CSeqMap::eSeqData:
93  return true;
94  default:
95  return false;
96  }
97  }
98 #endif
99 
100  bool s_IsEmptyRange(
101  const TSignedSeqPos iStartPos,
102  const TSignedSeqPos iEndPos,
103  const TSignedSeqPos iTrimDirection)
104  {
105  _ASSERT( s_IsValidDirection(iTrimDirection) );
106  if( iTrimDirection < 0 ) {
107  return (iStartPos < iEndPos);
108  } else {
109  return (iStartPos > iEndPos);
110  }
111  }
112 
113  struct PVecTrimRulesLessThan {
114  bool operator()( const CSequenceAmbigTrimmer::STrimRule & lhs,
115  const CSequenceAmbigTrimmer::STrimRule & rhs ) const
116  {
117  if( lhs.bases_to_check != rhs.bases_to_check ) {
118  return lhs.bases_to_check < rhs.bases_to_check;
119  }
120 
121  return lhs.max_bases_allowed_to_be_ambig <
123  }
124  };
125 
126  struct PVecTrimRulesHaveSameNumberOfBases {
127  bool operator()( const CSequenceAmbigTrimmer::STrimRule & lhs,
128  const CSequenceAmbigTrimmer::STrimRule & rhs ) const
129  {
130  return lhs.bases_to_check == rhs.bases_to_check;
131  }
132  };
133 
134  struct PVecTrimRuleAlwaysPasses {
135  bool operator()( const CSequenceAmbigTrimmer::STrimRule & rule ) {
136  return rule.bases_to_check == rule.max_bases_allowed_to_be_ambig;
137  }
138  };
139 
140  CSequenceAmbigTrimmer::TTrimRuleVec *s_DefaultRuleCreator(void)
141  {
142  unique_ptr<CSequenceAmbigTrimmer::TTrimRuleVec> pTrimRuleVec( new CSequenceAmbigTrimmer::TTrimRuleVec );
143  CSequenceAmbigTrimmer::STrimRule arrTrimRules[] = {
144  { 10, 5 },
145  { 50, 15 }
146  };
147  ITERATE_0_IDX( rule_idx, ArraySize(arrTrimRules) ) {
148  pTrimRuleVec->push_back(arrTrimRules[rule_idx]);
149  }
150  return pTrimRuleVec.release();
151  }
152 }
153 
154 // static
157 {
158  static CSafeStatic<TTrimRuleVec> s_DefaultTrimRules(
159  s_DefaultRuleCreator, NULL );
160  return s_DefaultTrimRules.Get();
161 }
162 
164  EMeaningOfAmbig eMeaningOfAmbig,
165  TFlags fFlags,
166  const TTrimRuleVec & trimRuleVec,
167  TSignedSeqPos uMinSeqLen )
168  : m_eMeaningOfAmbig(eMeaningOfAmbig),
169  m_fFlags(fFlags),
170  m_vecTrimRules(trimRuleVec),
171  m_uMinSeqLen(uMinSeqLen)
172 {
174 
175  // set up ambig lookup tables:
177  (1 + kLastCharInLookupTable - kFirstCharInLookupTable) );
179  (1 + kLastCharInLookupTable - kFirstCharInLookupTable) );
180 
181  // most letters are unambiguous for amino acids
182  s_FillArray(m_arrProtAmbigLookupTable, false);
183 
184  switch( m_eMeaningOfAmbig ) {
186  // here, only one letter is considered ambig
187  s_FillArray(m_arrNucAmbigLookupTable, false);
188  m_arrNucAmbigLookupTable['N'- kFirstCharInLookupTable] = true;
189  m_arrProtAmbigLookupTable['X' - kFirstCharInLookupTable] = true;
190  break;
192  // anything not specific is considered ambiguous:
193 
194  s_FillArray(m_arrNucAmbigLookupTable, true);
195  const static CSeqVector::TResidue arrNucCertainBases[] = {
196  'A', 'C', 'G', 'T'};
197  s_SetAmbigLookupTableFromArray(
198  m_arrNucAmbigLookupTable, arrNucCertainBases, false);
199 
200  const static CSeqVector::TResidue arrProtAmbigBases[] = {
201  'B', 'J', 'X', 'Z' };
202  s_SetAmbigLookupTableFromArray(
203  m_arrProtAmbigLookupTable, arrProtAmbigBases, true);
204  break;
205  }
206  default:
207  NCBI_USER_THROW_FMT("Unknown EMeaningOfAmbig: "
208  << static_cast<int>(m_eMeaningOfAmbig) );
209  }
210 }
211 
214  CRangeCollection<TSeqPos> *trimmed_ranges )
215 {
216  _ASSERT( bioseq_handle );
217 
218  const CSeqVector seqvec( bioseq_handle, CBioseq_Handle::eCoding_Iupac );
219 
220  // there's already no sequence, so nothing to trim
221  const TSignedSeqPos bioseq_len = bioseq_handle.GetBioseqLength();
222  if( bioseq_len < 1 ) {
223  return eResult_NoTrimNeeded;
224  }
225 
226  TSignedSeqPos leftmost_good_base = 0;
227  TSignedSeqPos rightmost_good_base = (bioseq_len - 1);
229  leftmost_good_base = x_FindWhereToTrim(
230  seqvec, leftmost_good_base, rightmost_good_base,
231  1 ); // 1 means "towards the right"
232  }
233  if( leftmost_good_base > rightmost_good_base ) {
234  // trimming leaves nothing left
235  if( trimmed_ranges ) {
236  *trimmed_ranges += TSeqRange(0, bioseq_len - 1);
237  _ASSERT(bioseq_len == trimmed_ranges->GetCoveredLength());
238  }
239  return x_TrimToNothing( bioseq_handle );
240  }
241 
243  rightmost_good_base =
245  seqvec, rightmost_good_base, leftmost_good_base,
246  -1 ); // -1 means "towards the left"
247  }
248  if( leftmost_good_base > rightmost_good_base ) {
249  // trimming leaves nothing left
250  if( trimmed_ranges ) {
251  *trimmed_ranges += TSeqRange(0, bioseq_len - 1);
252  _ASSERT(bioseq_len == trimmed_ranges->GetCoveredLength());
253  }
254  return x_TrimToNothing( bioseq_handle );
255  }
256 
257  // check if nothing to do
258  if( (leftmost_good_base == 0) &&
259  (rightmost_good_base == (bioseq_len - 1)) )
260  {
261  return eResult_NoTrimNeeded;
262  }
263 
264  // do the actual slicing of the bioseq
265  x_SliceBioseq(
266  leftmost_good_base, rightmost_good_base,
267  bioseq_handle );
268  if ( trimmed_ranges ) {
269  if( leftmost_good_base > 0 ) {
270  *trimmed_ranges += TSeqRange(0, leftmost_good_base - 1);
271  }
272  if( rightmost_good_base < bioseq_len - 1 ) {
273  *trimmed_ranges += TSeqRange(rightmost_good_base + 1,
274  bioseq_len - 1);
275  }
276  _ASSERT( bioseq_handle.GetBioseqLength() ==
277  bioseq_len - trimmed_ranges->GetCoveredLength());
278  }
279 
281 }
282 
283 void
285  TTrimRuleVec & vecTrimRules )
286 {
287  // we want rules that check fewer bases first.
288  // then, we sort by number of ambig bases in the rule.
289  sort( vecTrimRules.begin(), vecTrimRules.end(),
290  PVecTrimRulesLessThan() );
291 
292  // For trim rules that represent the same number of bases,
293  // we want only the strictest
294 
295  /// unique keeps only the first element of each run of
296  /// consecutive elements that compare equal.
297  TTrimRuleVec::iterator new_end_iter =
298  unique(
299  vecTrimRules.begin(), vecTrimRules.end(),
300  PVecTrimRulesHaveSameNumberOfBases() );
301  vecTrimRules.erase( new_end_iter, vecTrimRules.end() );
302 
303  // remove rules that will always pass because they don't
304  // do anything.
305  new_end_iter = remove_if(
306  vecTrimRules.begin(), vecTrimRules.end(),
307  PVecTrimRuleAlwaysPasses() );
308  vecTrimRules.erase( new_end_iter, vecTrimRules.end() );
309 
310  // check if rules have any consistency issues
311  CNcbiOstrstream problems_strm;
312  ITERATE(TTrimRuleVec, trim_rule_it, vecTrimRules) {
313  const STrimRule & trimRule = *trim_rule_it;
314  if( trimRule.bases_to_check <= 0 ) {
315  problems_strm << "A rule has a non-positive number of "
316  "bases to check" << endl;
317  continue;
318  }
319  if( trimRule.bases_to_check <= trimRule.max_bases_allowed_to_be_ambig )
320  {
321  problems_strm << "There is a rule where bases_to_check "
322  << "(" << trimRule.bases_to_check << ") is less than or "
323  "equal to max bases allowed ("
324  << trimRule.max_bases_allowed_to_be_ambig << ")" << endl;
325  continue;
326  }
327 
328  // if we're here, this rule is okay
329  }
330 
331  const string sProblems = CNcbiOstrstreamToString(problems_strm);
332  if( ! sProblems.empty() ) {
334  "Cannot create CSequenceAmbigTrimmer due to issues with rules: "
335  << sProblems );
336  }
337 }
338 
341 {
342  // nothing to do if already empty
343  if( bioseq_handle.GetBioseqLength() < 1 ) {
344  return eResult_NoTrimNeeded;
345  }
346 
347  // create new CSeq_inst since we're destroying the whole Bioseq
348  CRef<CSeq_inst> pNewSeqInst( SerialClone(bioseq_handle.GetInst()) );
349 
350  pNewSeqInst->SetRepr( CSeq_inst::eRepr_virtual );
351  pNewSeqInst->SetLength(0);
352  pNewSeqInst->ResetSeq_data();
353  pNewSeqInst->ResetExt();
354 
355  CBioseq_EditHandle bioseq_eh = bioseq_handle.GetEditHandle();
356  bioseq_eh.SetInst( *pNewSeqInst );
357 
359 }
360 
363  const CSeqVector & seqvec,
364  const TSignedSeqPos iStartPosInclusive_arg,
365  const TSignedSeqPos iEndPosInclusive_arg,
366  const TSignedSeqPos iTrimDirection )
367 {
368  _ASSERT( s_IsValidDirection(iTrimDirection) );
369  if( s_IsEmptyRange(
370  iStartPosInclusive_arg, iEndPosInclusive_arg, iTrimDirection) )
371  {
372  return ( iTrimDirection > 0
375  }
376 
377  // there is a range in the middle of the bioseq from
378  // start to end where we keep the sequence.
379  // These are the inclusive bounds of that range:
380  TSignedSeqPos uStartOfGoodBasesSoFar = iStartPosInclusive_arg;
381  TSignedSeqPos uEndOfGoodBasesSoFar = iEndPosInclusive_arg;
382 
383  // if no rules given, there's nothing to initially do
384  if( ! m_vecTrimRules.empty() ) {
385 
386  // holds the minimum number of bases that will be checked
387  // out of all the rules.
388  const TSignedSeqPos uFewestBasesCheckedInARule =
389  m_vecTrimRules.front().bases_to_check;
390 
391  // while sequence hasn't been shrunken too far, we
392  // apply the trimming rules we're given
393  TSignedSeqPos iNumBasesLeft =
394  1 + abs(uEndOfGoodBasesSoFar - uStartOfGoodBasesSoFar );
395  // so we can see if any rule was triggered
396  TSignedSeqPos uOldBasesLeft = -1;
397  while( iNumBasesLeft >= m_uMinSeqLen ) {
398  uOldBasesLeft = iNumBasesLeft;
399 
400  // apply rules in order, restarting our rule-checking whenever
401  // a rule matches.
402  ITERATE(TTrimRuleVec, trim_rule_it, m_vecTrimRules) {
403  const STrimRule & trimRule = *trim_rule_it;
404 
405  // skip the rule if the sequence is too small
406  // (later rules are greater in number of
407  // bases_to_check, so no point in even checking them
408  // this go-round)
409  if( trimRule.bases_to_check > iNumBasesLeft ) {
410  break;
411  }
412 
413  // get the positions to check just for this rule
414  const TSignedSeqPos iEndPosToCheckForThisRule =
415  uStartOfGoodBasesSoFar +
416  (iTrimDirection * (trimRule.bases_to_check - 1));
417 
418  // get ambig count, etc. for the given range
419  SAmbigCount ambig_count(iTrimDirection);
420  x_CountAmbigInRange( ambig_count,
421  seqvec,
422  uStartOfGoodBasesSoFar,
423  iEndPosToCheckForThisRule,
424  iTrimDirection );
425 
426  // would this rule trigger?
427  if( ambig_count.num_ambig_bases <=
429  {
430  // rule did not trigger, so go to next rule
431  continue;
432  }
433 
434  // at this point, the rule has been triggered
435 
436  if( s_IsEmptyRange(
437  ambig_count.pos_after_last_gap,
438  iEndPosToCheckForThisRule,
439  iTrimDirection) )
440  {
441  // here, the entire region we checked is
442  // all ambiguous bases.
443  uStartOfGoodBasesSoFar +=
444  (iTrimDirection * trimRule.bases_to_check);
445 
446  // optimization:
447 
448  // are we at a tremendous gap feature?
449  // consider that gap's length instead of
450  // turning it into ambig bases that we slowly iterate over
451  // individually.
453  seqvec,
454  uStartOfGoodBasesSoFar, // this var will be adjusted
455  uEndOfGoodBasesSoFar,
456  iTrimDirection,
457  uFewestBasesCheckedInARule );
458  } else {
459  // this part happens when there is at least one
460  // non-ambiguous base in the region we checked.
461  uStartOfGoodBasesSoFar =
462  ambig_count.pos_after_last_gap;
463  }
464 
465  // when a rule triggers, we start over from the first rule
466  break;
467  } // end of iterating through the trimRules
468 
469  // calculate how many bases are left now
470  if( s_IsEmptyRange(uStartOfGoodBasesSoFar, uEndOfGoodBasesSoFar, iTrimDirection) ) {
471  iNumBasesLeft = 0;
472  } else {
473  iNumBasesLeft = 1 + abs(uEndOfGoodBasesSoFar - uStartOfGoodBasesSoFar );
474  }
475  if( iNumBasesLeft == uOldBasesLeft ) {
476  // no rule triggered this iteration,
477  // so break to avoid an infinite loop
478  break;
479  }
480  } // end of iterating while the remaining sequence is big enough
481  } // end of "if(there are rules to process)"
482 
483  // always perform final edge trimming, regardless of
484  // m_uMinSeqLen. There should only be a few left, so
485  // this can be done with simple base iteration.
487  seqvec,
488  uStartOfGoodBasesSoFar,
489  uEndOfGoodBasesSoFar,
490  iTrimDirection,
491  1 // "1" means "no chunking"
492  );
493 
494  return uStartOfGoodBasesSoFar;
495 }
496 
498  const CSeqVector & seqvec,
499  TSignedSeqPos & in_out_uStartOfGoodBasesSoFar,
500  const TSignedSeqPos uEndOfGoodBasesSoFar,
501  const TSignedSeqPos iTrimDirection,
502  const TSignedSeqPos uChunkSize )
503 {
504  // check if we've already removed the whole thing
505  if( s_IsEmptyRange(
506  in_out_uStartOfGoodBasesSoFar, uEndOfGoodBasesSoFar, iTrimDirection) )
507  {
508  return;
509  }
510 
511  const TAmbigLookupTable * const pAmbigLookupTable =
512  ( seqvec.IsNucleotide() ? & m_arrNucAmbigLookupTable :
513  seqvec.IsProtein() ? & m_arrProtAmbigLookupTable :
514  NULL );
515  if( ! pAmbigLookupTable ) {
517  "Unable to determine molecule type of sequence");
518  }
519 
520  TSignedSeqPos newStartOfGoodBases = in_out_uStartOfGoodBasesSoFar;
521  while( ! s_IsEmptyRange(newStartOfGoodBases, uEndOfGoodBasesSoFar, iTrimDirection) &&
522  (*pAmbigLookupTable)[ seqvec[newStartOfGoodBases] - kFirstCharInLookupTable] )
523  {
524  // find the end of this sequence of gaps
525  CSeqMap_CI gap_seqmap_ci =
526  seqvec.GetSeqMap().FindSegment(
527  newStartOfGoodBases, &seqvec.GetScope() );
528  if( gap_seqmap_ci.GetType() == CSeqMap::eSeqData ) {
529  const TSignedSeqPos end_of_segment =
530  x_SegmentGetEndInclusive(gap_seqmap_ci, iTrimDirection);
531 
532  while( ! s_IsEmptyRange(newStartOfGoodBases, end_of_segment, iTrimDirection) &&
533  ! s_IsEmptyRange(newStartOfGoodBases, uEndOfGoodBasesSoFar, iTrimDirection) &&
534  (*pAmbigLookupTable)[ seqvec[newStartOfGoodBases] - kFirstCharInLookupTable] )
535  {
536  newStartOfGoodBases += iTrimDirection;
537  }
538  } else if( gap_seqmap_ci.GetType() == CSeqMap::eSeqGap ) {
540  // do not trim past Seq-gaps here
541  break;
542  }
543  newStartOfGoodBases = iTrimDirection + x_SegmentGetEndInclusive(gap_seqmap_ci, iTrimDirection);
544  } else {
545  // this is not a gap segment
546  // (Note that this does NOT check for sequence data with ambiguous
547  // bases in it).
548  return;
549  }
550  }
551 
552  // if endOfGapPos is past uEndOfGoodBasesSoFar, then
553  // stop it there
554  TSignedSeqPos iNumBasesToRemove = 0;
555  if( s_IsEmptyRange(newStartOfGoodBases, uEndOfGoodBasesSoFar, iTrimDirection) )
556  {
557  // we're removing all bases
558  iNumBasesToRemove = 1 + abs(uEndOfGoodBasesSoFar - in_out_uStartOfGoodBasesSoFar);
559  } else {
560  iNumBasesToRemove = abs(newStartOfGoodBases - in_out_uStartOfGoodBasesSoFar);
561  }
562 
563  // chunking:
564  // iNumBasesToRemove must be a multiple of uChunkSize
565  iNumBasesToRemove = (iNumBasesToRemove / uChunkSize) * uChunkSize;
566 
567  // adjust our output variable
568  in_out_uStartOfGoodBasesSoFar += (iTrimDirection * iNumBasesToRemove);
569 }
570 
572  SAmbigCount & out_result,
573  const CSeqVector & seqvec,
574  const TSignedSeqPos iStartPosInclusive_arg,
575  const TSignedSeqPos iEndPosInclusive_arg,
576  TSignedSeqPos iTrimDirection )
577 {
578  if( s_IsEmptyRange(
579  iStartPosInclusive_arg, iEndPosInclusive_arg, iTrimDirection) )
580  {
581  out_result = SAmbigCount(iTrimDirection);
582  return;
583  }
584 
585  const CSeqMap & seqmap = seqvec.GetSeqMap();
586  CScope * const pScope = & seqvec.GetScope();
587 
588  CSeqMap_CI segment_ci =
589  seqmap.FindSegment(
590  iStartPosInclusive_arg, pScope );
591 
592  const TAmbigLookupTable * const pAmbigLookupTable =
593  ( seqvec.IsNucleotide() ? & m_arrNucAmbigLookupTable :
594  seqvec.IsProtein() ? & m_arrProtAmbigLookupTable :
595  NULL );
596  if( NULL == pAmbigLookupTable ) {
597  NCBI_USER_THROW_FMT("Unexpected seqvector mol: "
598  << static_cast<int>(seqvec.GetSequenceType()) );
599  }
600 
601  for( ; segment_ci &&
602  ! s_IsEmptyRange(
603  x_SegmentGetBeginningInclusive(segment_ci, iTrimDirection),
604  iEndPosInclusive_arg, iTrimDirection);
605  x_SeqMapIterDoNext(segment_ci, iTrimDirection) )
606  {
607  // get type of segment at current_pos
608  const CSeqMap::ESegmentType eSegmentType = segment_ci.GetType();
609 
610  // get the part of this segment that we're actually considering
611  const TSignedSeqPos segmentStartPosInclusive =
612  x_SegmentGetBeginningInclusive(segment_ci, iTrimDirection);
613  const TSignedSeqPos segmentEndPosInclusive =
614  x_SegmentGetEndInclusive(segment_ci, iTrimDirection);
615 
616  switch( eSegmentType ) {
617  case CSeqMap::eSeqGap: {
618  // the "min" is to catch the case of the gap segment going past
619  // the end of the range we're looking at
620  const TSignedSeqPos numBasesInGapInRange =
621  min(
622  1 + abs(segmentEndPosInclusive - segmentStartPosInclusive),
623  1 + abs(segmentStartPosInclusive - iEndPosInclusive_arg) );
625  // if fFlags_DoNotTrimSeqGap is set, then we return 0 ambig
626  // bases to make sure no rule is triggered
627  out_result = SAmbigCount(iTrimDirection);
628  return;
629  }
630  // gaps are always ambiguous no matter what our definition
631  // of ambiguous is.
632  out_result.num_ambig_bases += numBasesInGapInRange;
633  out_result.pos_after_last_gap = ( (iTrimDirection > 0)
636  break;
637  }
638  case CSeqMap::eSeqData: {
639  // count ambig in this chunk
640  for( TSignedSeqPos pos = segmentStartPosInclusive;
641  ! s_IsEmptyRange(pos, segmentEndPosInclusive, iTrimDirection) &&
642  ! s_IsEmptyRange(pos, iEndPosInclusive_arg, iTrimDirection)
643  ;
644  pos += iTrimDirection)
645  {
646  const CSeqVector::TResidue residue = seqvec[pos];
647  if( residue < kFirstCharInLookupTable || residue > kLastCharInLookupTable ||
648  (*pAmbigLookupTable)[residue - kFirstCharInLookupTable])
649  {
650  ++out_result.num_ambig_bases;
651  out_result.pos_after_last_gap = ( (iTrimDirection > 0)
654  } else if( s_IsEmptyRange(
655  out_result.pos_after_last_gap, iEndPosInclusive_arg,
656  iTrimDirection) )
657  {
658  out_result.pos_after_last_gap = pos;
659  }
660  }
661  break;
662  }
663  default:
664  NCBI_USER_THROW_FMT( "CSeqMap segments of type "
665  << static_cast<int>(eSegmentType)
666  << " are not supported at this time");
667  }
668  }
669 }
670 
671 /// This returns the (inclusive) position at the end of the
672 /// segment currently at iStartPosInclusive_arg. As always,
673 /// the definition of "end" depends on iTrimDirection.
675  const CSeqMap_CI & segment,
676  const TSignedSeqPos iTrimDirection )
677 {
678  _ASSERT( s_IsSupportedSegmentType(segment) );
679  _ASSERT( s_IsValidDirection(iTrimDirection) );
680  if( iTrimDirection == 1 ) {
681  // the "-1" turns an exclusive end-position
682  // into an inclusive one.
683  return (segment.GetEndPosition() - 1);
684  } else {
685  _ASSERT( iTrimDirection == -1 );
686  return segment.GetPosition();
687  }
688 }
689 
690 
691 CSeqMap_CI &
693  CSeqMap_CI & in_out_segment_it,
694  const TSignedSeqPos iTrimDirection )
695 {
696  _ASSERT( s_IsValidDirection(iTrimDirection) );
697  return ( iTrimDirection == 1 ? ++in_out_segment_it : --in_out_segment_it );
698 }
699 
700 void
702  TSignedSeqPos leftmost_good_base,
703  TSignedSeqPos rightmost_good_base,
704  CBioseq_Handle & bioseq_handle )
705 {
706  CSeqVector seqvec( bioseq_handle );
707 
708  CAutoInitRef<CDelta_ext> pDeltaExt;
709 
710  const CSeqMap & seqmap = bioseq_handle.GetSeqMap();
711  CSeqMap_CI seqmap_ci = seqmap.ResolvedRangeIterator(
712  &bioseq_handle.GetScope(),
713  leftmost_good_base,
714  1 + ( rightmost_good_base - leftmost_good_base ) );
715  for( ; seqmap_ci; ++seqmap_ci ) {
716  CSeqMap::ESegmentType eType = seqmap_ci.GetType();
717  switch( eType ) {
718  case CSeqMap::eSeqGap: {
719  const TSeqPos uGapLength = seqmap_ci.GetLength();
720  const bool bIsLengthKnown = ! seqmap_ci.IsUnknownLength();
721  CConstRef<CSeq_literal> pOriginalGapSeqLiteral =
722  seqmap_ci.GetRefGapLiteral();
723 
724  CAutoInitRef<CDelta_seq> pDeltaSeq;
725 
726  CAutoInitRef<CSeq_literal> pNewGapLiteral;
727  if( pOriginalGapSeqLiteral ) {
728  pNewGapLiteral->Assign(*pOriginalGapSeqLiteral);
729  }
730  if( ! bIsLengthKnown ) {
731  pNewGapLiteral->SetFuzz().SetLim( CInt_fuzz::eLim_unk );
732  }
733  pNewGapLiteral->SetLength( uGapLength );
734 
735  pDeltaSeq->SetLiteral( *pNewGapLiteral );
736 
737  pDeltaExt->Set().push_back( Ref(&*pDeltaSeq) );
738  break;
739  }
740  case CSeqMap::eSeqData: {
741  string new_data;
742  seqvec.GetPackedSeqData(
743  new_data, seqmap_ci.GetPosition(),
744  seqmap_ci.GetEndPosition() );
745 
746  CRef<CSeq_data> pSeqData(
747  new CSeq_data( new_data, seqvec.GetCoding() ) );
748 
749  CAutoInitRef<CDelta_seq> pDeltaSeq;
750  pDeltaSeq->SetLiteral().SetLength( seqmap_ci.GetLength() );
751  pDeltaSeq->SetLiteral().SetSeq_data( *pSeqData );
752 
753  pDeltaExt->Set().push_back( Ref(&*pDeltaSeq) );
754  break;
755  }
756  default:
757  NCBI_USER_THROW_FMT("CSequenceAmbigTrimmer does not yet support "
758  "seqmap segments of type " << static_cast<int>(eType) );
759  break;
760  }
761  }
762 
763  // use the new pDeltaExt
764 
765  // (we use const_cast to defeat the type system to avoid the expense of
766  // copying the Seq-inst. We must be careful, though.)
767  CSeq_inst & seq_inst = const_cast<CSeq_inst &>(bioseq_handle.GetInst());
768  seq_inst.ResetExt();
769  seq_inst.ResetSeq_data();
770  seq_inst.SetLength( 1 + ( rightmost_good_base - leftmost_good_base ) );
771  if( pDeltaExt->Set().empty() ) {
772  seq_inst.SetRepr( CSeq_inst::eRepr_virtual );
773  } else if( pDeltaExt->Set().size() == 1 ) {
774  seq_inst.SetRepr( CSeq_inst::eRepr_raw );
775  CRef<CDelta_seq> pDeltaSeq = *pDeltaExt->Set().begin();
776  CSeq_data & seq_data = pDeltaSeq->SetLiteral().SetSeq_data();
777  seq_inst.SetSeq_data( seq_data );
778  } else {
779  seq_inst.SetExt().SetDelta( *pDeltaExt );
780  }
781  CBioseq_EditHandle bioseq_eh = bioseq_handle.GetEditHandle();
782  bioseq_eh.SetInst( seq_inst );
783 
784  // at this time, annots aren't sliced, but that may be supported in the
785  // future.
786 }
787 
void remove_if(Container &c, Predicate *__pred)
Definition: chainer.hpp:69
CAutoInitRef<>::
void Set(T *object)
Initialize with an existing object.
CBioseq_EditHandle –.
CBioseq_Handle –.
CConstRef –.
Definition: ncbiobj.hpp:1266
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
Definition: ncbistre.hpp:802
position_type GetCoveredLength(void) const
Returns total length covered by ranges in this collection, i.e.
Definition: range_coll.hpp:157
CSafeStatic<>::
T & Get(void)
Create the variable if not created yet, return the reference.
CScope –.
Definition: scope.hpp:92
Iterator over CSeqMap.
Definition: seq_map_ci.hpp:252
CSeqMap –.
Definition: seq_map.hpp:93
CSeqVector –.
Definition: seq_vector.hpp:65
char value[7]
Definition: config.c:431
Include a standard set of the NCBI C++ Toolkit most basic headers.
static bool s_IsValidDirection(const string &direction)
#define ITERATE_0_IDX(idx, up_to)
idx loops from 0 (inclusive) to up_to (exclusive)
Definition: ncbimisc.hpp:865
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
constexpr size_t ArraySize(const Element(&)[Size])
Definition: ncbimisc.hpp:1532
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
int TSignedSeqPos
Type for signed sequence position.
Definition: ncbimisc.hpp:887
#define NULL
Definition: ncbistd.hpp:225
#define NCBI_USER_THROW(message)
Throw a quick-and-dirty runtime exception of type 'CException' with the given error message and error...
Definition: ncbiexpt.hpp:715
#define NCBI_USER_THROW_FMT(message)
Throw a "user exception" with message processed as output to ostream.
Definition: ncbiexpt.hpp:724
C * SerialClone(const C &src)
Create on heap a clone of the source object.
Definition: serialbase.hpp:512
@ eType
Definition: feature.hpp:86
TSignedSeqPos pos_after_last_gap
Inclusive.
Definition: sequence.hpp:1483
EMeaningOfAmbig m_eMeaningOfAmbig
This holds the current interpretation for "ambiguous".
Definition: sequence.hpp:1380
bool TAmbigLookupTable[26]
Definition: sequence.hpp:1566
virtual EResult x_TrimToNothing(CBioseq_Handle &bioseq_handle)
The bioseq is trimmed to size 0.
TSignedSeqPos m_uMinSeqLen
When the bioseq gets trimmed down to less than this size, we halt the trimming.
Definition: sequence.hpp:1389
TSignedSeqPos max_bases_allowed_to_be_ambig
Definition: sequence.hpp:1316
TSignedSeqPos x_SegmentGetEndInclusive(const CSeqMap_CI &segment, const TSignedSeqPos iTrimDirection)
This returns the (inclusive) position at the end of the segment currently at iStartPosInclusive_arg.
TFlags m_fFlags
This holds the flags that affect the behavior of this class.
Definition: sequence.hpp:1382
CSequenceAmbigTrimmer(EMeaningOfAmbig eMeaningOfAmbig, TFlags fFlags=0, const TTrimRuleVec &vecTrimRules=GetDefaultTrimRules(), TSignedSeqPos uMinSeqLen=50)
This sets up the parameters for how this trimmer will act.
CSeqMap_CI & x_SeqMapIterDoNext(CSeqMap_CI &in_out_segment_it, const TSignedSeqPos iTrimDirection)
Returns the "next" segment.
EMeaningOfAmbig
This enum is used to set what is meant by "ambiguous".
Definition: sequence.hpp:1289
virtual void x_NormalizeVecTrimRules(TTrimRuleVec &vecTrimRules)
This prepares the vector of trimming rules to be used by the trimming algorithm.
TAmbigLookupTable m_arrNucAmbigLookupTable
Definition: sequence.hpp:1567
EResult
This indicates what happened with the trim.
Definition: sequence.hpp:1352
virtual TSignedSeqPos x_FindWhereToTrim(const CSeqVector &seqvec, const TSignedSeqPos iStartPosInclusive_arg, const TSignedSeqPos iEndPosInclusive_arg, TSignedSeqPos iTrimDirection)
This returns the last good base that won't be trimmed (note: last really means "first" when we're sta...
vector< STrimRule > TTrimRuleVec
Multiple STrimRules are allowed, which are applied from smallest bases_to_check to largest bases_to_c...
Definition: sequence.hpp:1322
TAmbigLookupTable m_arrProtAmbigLookupTable
Definition: sequence.hpp:1568
virtual EResult DoTrim(CBioseq_Handle &bioseq_handle, CRangeCollection< TSeqPos > *trimmed_ranges=nullptr)
This trims the given bioseq, using params set in the CSequenceAmbigTrimmer constructor.
virtual void x_EdgeSeqMapGapAdjust(const CSeqVector &seqvec, TSignedSeqPos &in_out_uStartOfGoodBasesSoFar, const TSignedSeqPos uEndOfGoodBasesSoFar, const TSignedSeqPos iTrimDirection, const TSignedSeqPos uChunkSize)
This adjusts in_out_uStartOfGoodBasesSoFar if we're at a CSeqMap gap.
static const TTrimRuleVec & GetDefaultTrimRules(void)
This returns a reasonable default for trimming rules.
TSignedSeqPos x_SegmentGetBeginningInclusive(const CSeqMap_CI &segment, const TSignedSeqPos iTrimDirection)
This returns the (inclusive) position at the beginning of the segment.
Definition: sequence.hpp:1521
virtual void x_CountAmbigInRange(SAmbigCount &out_result, const CSeqVector &seqvec, const TSignedSeqPos iStartPosInclusive_arg, const TSignedSeqPos iEndPosInclusive_arg, const TSignedSeqPos iTrimDirection)
This counts the number of ambiguous bases in the range [leftmost_pos_to_check, rightmost_pos_to_check...
void x_SliceBioseq(TSignedSeqPos leftmost_good_base, TSignedSeqPos rightmost_good_base, CBioseq_Handle &bioseq_handle)
TSignedSeqPos num_ambig_bases
the number of ambiguous bases found in the range supplied to x_CountAmbigInRange
Definition: sequence.hpp:1480
TTrimRuleVec m_vecTrimRules
This holds the trimming rules that will be applied.
Definition: sequence.hpp:1386
bool x_TestFlag(TFlags fFlag)
Test if a given flag is set.
Definition: sequence.hpp:1392
@ fFlags_DoNotTrimBeginning
0x01 ("Beginning" as defined by CSeqVector)
Definition: sequence.hpp:1301
@ fFlags_DoNotTrimEnd
0x02 ("End" as defined by CSeqVector)
Definition: sequence.hpp:1302
@ fFlags_DoNotTrimSeqGap
0x04 (Seq-gaps are not considered trimmable if this flag is set, only letter gaps (e....
Definition: sequence.hpp:1304
@ eMeaningOfAmbig_AnyAmbig
Here, anything that's not certain is considered ambiguous.
Definition: sequence.hpp:1297
@ eMeaningOfAmbig_OnlyCompletelyUnknown
Here, only N for nucleotides and X for amino acids is considered ambiguous.
Definition: sequence.hpp:1292
@ eResult_NoTrimNeeded
Bioseq is left unchanged because it did not need to be trimmed at all.
Definition: sequence.hpp:1358
@ eResult_SuccessfullyTrimmed
Bioseq is now trimmed.
Definition: sequence.hpp:1354
TSeqPos GetBioseqLength(void) const
void SetInst(TInst &v) const
CBioseq_EditHandle GetEditHandle(void) const
Get 'edit' version of handle.
CScope & GetScope(void) const
Get scope this handle belongs to.
const CSeqMap & GetSeqMap(void) const
Get sequence map.
const TInst & GetInst(void) const
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
TSeqPos GetEndPosition(void) const
return end position of current segment in sequence (exclusive)
Definition: seq_map_ci.hpp:679
unsigned char TResidue
CSeqMap::ESegmentType GetType(void) const
Definition: seq_map_ci.hpp:651
bool IsUnknownLength(void) const
return true if current segment is a gap of unknown length
Definition: seq_map_ci.cpp:302
TSeqPos GetPosition(void) const
return position of current segment in sequence
Definition: seq_map_ci.hpp:665
TSeqPos GetLength(void) const
return length of current segment
Definition: seq_map_ci.hpp:672
CConstRef< CSeq_literal > GetRefGapLiteral(void) const
return CSeq_literal with gap data, or null if either the segment is not a gap, or an unspecified gap
Definition: seq_map_ci.cpp:292
TCoding GetCoding(void) const
Target sequence coding.
Definition: seq_vector.hpp:312
const CSeqMap & GetSeqMap(void) const
Definition: seq_vector.hpp:324
CSeqMap_CI FindSegment(TSeqPos pos, CScope *scope) const
Find segment containing the position.
Definition: seq_map.cpp:812
bool IsProtein(void) const
Definition: seq_vector.hpp:350
CSeqMap_CI ResolvedRangeIterator(CScope *scope, TSeqPos from, TSeqPos length, ENa_strand strand=eNa_strand_plus, size_t maxResolve=size_t(-1), TFlags flags=fDefaultFlags) const
Iterate segments in the range with specified strand coordinates.
Definition: seq_map.cpp:868
bool IsNucleotide(void) const
Definition: seq_vector.hpp:357
void GetPackedSeqData(string &buffer, TSeqPos start=0, TSeqPos stop=kInvalidSeqPos)
Definition: seq_vector.cpp:311
TMol GetSequenceType(void) const
Definition: seq_vector.hpp:343
CScope & GetScope(void) const
Definition: seq_vector.hpp:330
ESegmentType
Definition: seq_map.hpp:96
@ eSeqData
real sequence data
Definition: seq_map.hpp:98
@ eSeqGap
gap
Definition: seq_map.hpp:97
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
Definition: ncbiobj.hpp:2015
CRange< TSeqPos > TSeqRange
typedefs for sequence ranges
Definition: range.hpp:419
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
@ eLim_unk
unknown
Definition: Int_fuzz_.hpp:210
void SetExt(TExt &value)
Assign a value to Ext data member.
Definition: Seq_inst_.cpp:147
void SetRepr(TRepr value)
Assign a value to Repr data member.
Definition: Seq_inst_.hpp:574
void SetLength(TLength value)
Assign a value to Length data member.
Definition: Seq_inst_.hpp:668
void SetSeq_data(TSeq_data &value)
Assign a value to Seq_data data member.
Definition: Seq_inst_.cpp:130
void ResetExt(void)
Reset Ext data member.
Definition: Seq_inst_.cpp:142
void ResetSeq_data(void)
Reset Seq_data data member.
Definition: Seq_inst_.cpp:125
@ eRepr_raw
continuous sequence
Definition: Seq_inst_.hpp:94
@ eRepr_virtual
no seq data
Definition: Seq_inst_.hpp:93
constexpr auto sort(_Init &&init)
#define abs(a)
Definition: ncbi_heapmgr.c:130
Static variables safety - create on demand, destroy on application termination.
T max(T x_, T y_)
T min(T x_, T y_)
This holds the output of x_CountAmbigInRange.
Definition: sequence.hpp:1469
For example, if bases_to_check is 10 and max_bases_allowed_to_be_ambig is 5, then on each iteration w...
Definition: sequence.hpp:1314
#define _ASSERT
Modified on Sun Mar 03 03:13:25 2024 by modify_doxy.py rev. 669887