NCBI C++ ToolKit
sequtil_manip.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: sequtil_manip.cpp 70627 2016-01-08 13:02:41Z ivanov $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Mati Shomrat
27  *
28  * File Description:
29  *
30  */
31 #include <ncbi_pch.hpp>
32 #include <corelib/ncbistd.hpp>
33 #include <corelib/ncbistr.hpp>
34 #include <vector>
35 #include <algorithm>
36 
37 #include <util/sequtil/sequtil.hpp>
41 #include "sequtil_shared.hpp"
42 #include "sequtil_tables.hpp"
43 
44 
46 
47 /////////////////////////////////////////////////////////////////////////////
48 //
49 // Reverse
50 
51 // When reversing a sequence the packed formats ncbi2na and ncbi4na
52 // get "special" treatment, since the requested interval might not
53 // fall on a byte boundary.
54 // Other formats perform a simple conversion on the sequence. Note that
55 // if the original sequence is erroneous (e.g. lower case) the reverse
56 // isn't "fixed".
57 
// s_2naReverse (name taken from its call site in CSeqManip::Reverse):
// reverses `length` residues of 2-bit packed data (4 residues per byte),
// starting at residue index `pos` of `src`, and writes the packed result to
// `dst`.  Returns `length`.
// NOTE(review): this listing has lost the line carrying the function name and
// the line declaring `table` (the lookup table indexed below) -- take both
// from the repository copy before compiling.
59 (const char* src,
60  TSeqPos pos,
61  TSeqPos length,
62  char* dst)
63 {
// `begin` is the first source byte touched, `end` is one past the last.
64  const char* begin = src + (pos / 4);
65  const char* end = src + ((pos + length - 1) / 4) + 1;
66  const char* iter = end;
67 
// Slot (0..3) of the last requested residue inside its byte.
68  size_t offset = (pos + length - 1) % 4;
70 
71  if ( offset == 3 ) { // byte boundary when viewed from the end
// Walk the source bytes backwards, translating one whole byte at a time.
72  for ( ; iter != begin; ++dst ) {
73  *dst = table[static_cast<Uint1>(*--iter)];
74  }
// Step back onto the last byte written so the final mask applies to it.
75  --dst;
76  } else {
77  --iter;
// Each full output byte is assembled from two neighbouring source bytes.
78  for ( size_t count = length / 4; count; --count, ++dst ) {
79  *dst =
80  table[static_cast<Uint1>(*iter) * 2 + 1] |
81  table[static_cast<Uint1>(*(iter - 1)) * 2];
82  --iter;
83  }
84 
85  // handle the overhang
86  if ( length % 4 != 0 ) {
87  *dst = table[static_cast<Uint1>(*iter) * 2 + 1];
88  if ( iter != begin ) {
89  --iter;
90  *dst |= (char)table[static_cast<Uint1>(*iter) * 2];
91  }
92  }
93  }
94 
95  // now, take care of the last byte
// Clears the 2-bit slots beyond the last residue (mask is 0xFF when
// length is a multiple of 4).
// NOTE(review): in the unaligned branch with length % 4 == 0, dst appears to
// point one past the last byte written here; the 0xFF mask leaves the value
// unchanged but the byte is still read and written -- confirm dst has room.
96  *dst &= char(0xFF << ((4 - (length % 4)) % 4) * 2);
97 
98  return length;
99 }
100 
101 
103 (const char* src,
104  TSeqPos pos,
105  TSeqPos length,
106  char* dst)
107 {
108  size_t start_offset = (pos + length - 1) % 2;
109 
110  const Uint1* table = C4naReverse::GetTable();
111 
112  const char* begin = src + (pos / 2);
113  const char* end = src + ((pos + length - 1) / 2) + 1;
114  const char* iter = end;
115 
116  switch ( start_offset ) {
117  case 1:
118  // byte boundry
119  {{
120  for ( ; iter != begin; ++dst ) {
121  *dst = table[static_cast<Uint1>(*--iter)];
122  }
123  --dst;
124  if ( length % 2 != 0 ) {
125  *dst &= (char)0xF0;
126  }
127  }}
128  break;
129 
130  case 0:
131  {{
132  for ( size_t count = length / 2; count; --count, ++dst ) {
133  --iter;
134  *dst = char((static_cast<Uint1>(*iter) & 0xF0) |
135  (static_cast<Uint1>(*(iter - 1)) & 0x0F));
136  }
137 
138  if ( length % 2 != 0 ) {
139  --iter;
140  *dst = char(static_cast<Uint1>(*iter) & 0xF0);
141  }
142  }}
143  break;
144  }
145 
146  return length;
147 }
148 
149 
150 template <typename SrcCont, typename DstCont>
152 (const SrcCont& src,
153  CSeqUtil::TCoding src_coding,
154  TSeqPos pos,
155  TSeqPos length,
156  DstCont& dst)
157 {
158  _ASSERT(!OutOfRange(pos, src, src_coding));
159  if ( src.empty() || (length == 0) ) {
160  return 0;
161  }
162 
163  AdjustLength(src, src_coding, pos, length);
164  ResizeDst(dst, src_coding, length);
165 
166  return CSeqManip::Reverse(&*src.begin(), src_coding,
167  pos, length, &*dst.begin());
168 }
169 
170 
172 (const string& src,
173  TCoding src_coding,
174  TSeqPos pos,
175  TSeqPos length,
176  string& dst)
177 {
178  // call the templated version
179  return s_Reverse(src, src_coding, pos, length, dst);
180 }
181 
182 
184 (const vector<char>& src,
185  TCoding coding,
186  TSeqPos pos,
187  TSeqPos length,
188  vector<char>& dst)
189 {
190  // call the templated version
191  return s_Reverse(src, coding,pos, length, dst);
192 }
193 
194 
196 (const char* src,
197  TCoding src_coding,
198  TSeqPos pos,
199  TSeqPos length,
200  char* dst)
201 {
202  _ASSERT((dst != 0) && (src != 0));
203 
204  switch ( src_coding ) {
205 
206  // "special" treatment
207  case CSeqUtil::e_Ncbi2na:
208  return s_2naReverse(src, pos, length, dst);
209 
210  case CSeqUtil::e_Ncbi4na:
211  return s_4naReverse(src, pos, length, dst);
212 
213  // a simple reverse
214  default:
215  reverse_copy(src + pos, src + pos + length, dst);
216  return length;
217  }
218 
219  NCBI_THROW(CSeqUtilException, eInvalidCoding, "Unknown coding");
220 }
221 
222 
223 /////////////////////////////////////////////////////////////////////////////
224 //
225 // Complement
226 
227 template <typename SrcCont, typename DstCont>
229 (const SrcCont& src,
230  CSeqUtil::TCoding src_coding,
231  TSeqPos pos,
232  TSeqPos length,
233  DstCont& dst)
234 {
235  _ASSERT(!OutOfRange(pos, src, src_coding));
236  if ( src.empty() || (length == 0) ) {
237  return 0;
238  }
239 
240  AdjustLength(src, src_coding, pos, length);
241  ResizeDst(dst, src_coding, length);
242 
243  return CSeqManip::Complement(&*src.begin(), src_coding,
244  pos, length, &*dst.begin());
245 }
246 
247 
249 (const string& src,
250  TCoding coding,
251  TSeqPos pos,
252  TSeqPos length,
253  string& dst)
254 {
255  // call the templated version
256  return s_Complement(src, coding,pos, length, dst);
257 }
258 
259 
261 (const vector<char>& src,
262  TCoding coding,
263  TSeqPos pos,
264  TSeqPos length,
265  vector<char>& dst)
266 {
267  // call the templated version
268  return s_Complement(src, coding,pos, length, dst);
269 }
270 
271 
273 (const char* src,
274  TSeqPos pos,
275  TSeqPos length,
276  char* dst)
277 {
278  const char* iter = src + (pos / 4);
279  const char* end = src + ((pos + length - 1) / 4) + 1;
280 
281  if ( pos % 4 == 0 ) {
282  for ( ; iter != end; ++iter, ++dst ) {
283  *dst = char(~(*iter));
284  }
285 
286  if ( length % 4 != 0 ) {
287  *(--dst) &= char(0xFF << (8 - (length % 4) * 2));
288  }
289  } else {
290  const Uint1* table = C2naCmp::GetTable(pos % 4);
291 
292  for ( size_t count = length / 4; count; --count, ++dst, ++iter ) {
293  *dst= char(
294  table[static_cast<Uint1>(*iter) * 2] |
295  table[static_cast<Uint1>(*(iter + 1)) * 2 + 1] );
296  }
297 
298  // handle the overhang
299  if ( length % 4 != 0 ) {
300  *dst = (char)table[static_cast<Uint1>(*iter) * 2];
301  if ( ++iter != end ) {
302  *dst |= (char)table[static_cast<Uint1>(*iter) * 2 + 1];
303  }
304  }
305  }
306  // now, take care of the last byte
307  *dst &= char(0xFF << ((4 - (length % 4)) % 4) * 2);
308 
309  return length;
310 }
311 
312 
314 (const char* src,
315  TSeqPos pos,
316  TSeqPos length,
317  char* dst)
318 {
319  const char* end = src + pos + length;
320  const char* iter = src + pos;
321 
322  for ( ; iter != end; ++iter, ++dst ) {
323  *dst = char(3 - static_cast<Uint1>(*iter));
324  }
325 
326  return length;
327 }
328 
329 
331 (const char* src,
332  TSeqPos pos,
333  TSeqPos length,
334  char* dst)
335 {
336  const char* iter = src + (pos / 2);
337  const char* end = src + (pos + length - 1) / 2 + 1;
338 
339  const Uint1* table = C4naCmp::GetTable(pos % 2);
340 
341  switch ( pos % 2 ) {
342  case 0:
343  {{
344  for ( ; iter != end; ++iter, ++dst ) {
345  *dst = (char)table[static_cast<Uint1>(*iter)];
346  }
347 
348  if ( length % 2 != 0 ) {
349  *dst &= char(0xF0);
350  }
351  }}
352  break;
353 
354  case 1:
355  {{
356  for ( size_t count = length / 2; count; --count, ++iter, ++dst ) {
357  *dst = char(
358  table[static_cast<Uint1>(*iter) * 2] |
359  table[static_cast<Uint1>(*(iter + 1)) * 2 + 1] );
360  }
361 
362  if ( length % 2 != 0 ) {
363  *dst = (char)table[static_cast<Uint1>(*iter) * 2];
364  }
365  }}
366  break;
367  }
368 
369  return length;
370 }
371 
372 
// Raw-buffer complement: picks the routine for `src_coding` and returns its
// result; codings without a matching case raise CSeqUtilException with
// eInvalidCoding.
// NOTE(review): this listing has lost the line carrying the function name
// (callers use CSeqManip::Complement) and two `case` labels inside the
// switch; restore them from the repository copy before compiling.
374 (const char* src,
375  TCoding src_coding,
376  TSeqPos pos,
377  TSeqPos length,
378  char* dst)
379 {
380  _ASSERT((dst != 0) && (src != 0));
381 
382  switch ( src_coding ) {
383  case CSeqUtil::e_Iupacna:
384  return convert_1_to_1(src, pos, length, dst, CIupacnaCmp::GetTable());
385 
386  case CSeqUtil::e_Ncbi2na:
387  return s_Ncbi2naComplement(src, pos, length, dst);
388 
// NOTE(review): the case label selecting the next return is absent from the
// listing -- presumably CSeqUtil::e_Ncbi2na_expand; confirm.
390  return s_Ncbi2naExpandComplement(src, pos, length, dst);
391 
392  case CSeqUtil::e_Ncbi4na:
393  return s_Ncbi4naComplement(src, pos, length, dst);
394 
395  case CSeqUtil::e_Ncbi8na:
// NOTE(review): a line is absent here -- presumably a second case label
// sharing the table-driven conversion below; confirm.
397  return convert_1_to_1(src, pos, length, dst, C8naCmp::GetTable());
398 
399  default:
400  break;
401  }
402 
// Reached only through `default`.
403  NCBI_THROW(CSeqUtilException, eInvalidCoding,
404  "There is no complement for the specified coding.");
405 }
406 
407 /////////////////////////////////////////////////////////////////////////////
408 //
409 // ReverseComplement
410 
411 template <typename SrcCont, typename DstCont>
413 (const SrcCont& src,
414  CSeqUtil::TCoding src_coding,
415  TSeqPos pos,
416  TSeqPos length,
417  DstCont& dst)
418 {
419  _ASSERT(!OutOfRange(pos, src, src_coding));
420  if ( src.empty() || (length == 0) ) {
421  return 0;
422  }
423 
424  AdjustLength(src, src_coding, pos, length);
425  ResizeDst(dst, src_coding, length);
426 
427  return CSeqManip::ReverseComplement(&*src.begin(), src_coding,
428  pos, length, &*dst.begin());
429 }
430 
431 
433 (const string& src,
434  TCoding coding,
435  TSeqPos pos,
436  TSeqPos length,
437  string& dst)
438 {
439  // call the templated version
440  return s_ReverseComplement(src, coding,pos, length, dst);
441 }
442 
443 
445 (const vector<char>& src,
446  TCoding coding,
447  TSeqPos pos,
448  TSeqPos length,
449  vector<char>& dst)
450 {
451  // call the templated version
452  return s_ReverseComplement(src, coding,pos, length, dst);
453 }
454 
455 
// s_Ncbi2naRevCmp, copying form (name taken from its call site in
// CSeqManip::ReverseComplement): processes `length` residues of 2-bit packed
// data starting at residue index `pos` of `src`, walking the source bytes
// from the end towards the beginning and translating them through `table`
// into `dst`.  Returns `length`.
// NOTE(review): this listing has lost the line carrying the function name and
// the line declaring `table` -- take both from the repository copy.
457 (const char* src,
458  TSeqPos pos,
459  TSeqPos length,
460  char* dst)
461 {
// Slot (0..3) of the last requested residue inside its byte.
462  size_t offset = (pos + length - 1) % 4;
464 
// `begin` is the first source byte touched; `iter` starts one past the last.
465  const char* begin = src + (pos / 4);
466  const char* iter = src + (pos + length - 1) / 4 + 1;
467  switch ( offset ) {
468  case 0:
469  case 1:
470  case 2:
471  --iter;
// Each full output byte is assembled from two neighbouring source bytes.
472  for ( size_t count = length / 4; count; --count, ++dst, --iter ) {
473  *dst =
474  table[static_cast<Uint1>(*iter) * 2] |
475  table[static_cast<Uint1>(*(iter - 1)) * 2 + 1];
476  }
477 
478  // handle the overhang
479  if ( length % 4 != 0 ) {
480  *dst = table[static_cast<Uint1>(*iter) * 2];
481  if ( iter != begin ) {
482  --iter;
483  *dst |= (char)table[static_cast<Uint1>(*iter) * 2 + 1];
484  }
485  }
486  break;
487 
488  case 3:
489  // aligned operation
// NOTE(review): this loop leaves dst one past the last byte written and,
// unlike the aligned branch of the plain 2na reverse routine in this file,
// there is no `--dst` afterwards -- so the mask below lands on the following
// byte instead of the last output byte.  Confirm whether a `--dst` is missing.
490  for ( ; iter != begin; ++dst ) {
491  *dst = table[static_cast<Uint1>(*--iter)];
492  }
493  break;
494  }
495 
496  // zero redundant bits
// Mask is 0xFF when length is a multiple of 4; otherwise it clears the
// 2-bit slots beyond the last residue.
497  *dst &= char(0xFF << ((4 - (length % 4)) % 4) * 2);
498 
499  return length;
500 }
501 
502 
504 (const char* src,
505  TSeqPos pos,
506  TSeqPos length,
507  char* dst)
508 {
509  const char* begin = src + pos;
510  const char* iter = src + pos + length;
511 
512  for ( ; iter != begin; ++dst ) {
513  *dst = char(3 - static_cast<Uint1>(*--iter));
514  }
515 
516  return length;
517 }
518 
519 
// s_Ncbi4naRevCmp, copying form (name taken from its call site in
// CSeqManip::ReverseComplement): processes `length` residues of 4-bit packed
// data starting at residue index `pos` of `src`, walking the source bytes
// from the end towards the beginning and translating them through `table`
// into `dst`.  Returns `length`.
// NOTE(review): this listing has lost the line carrying the function name and
// the line declaring `table` -- take both from the repository copy.
521 (const char* src,
522  TSeqPos pos,
523  TSeqPos length,
524  char* dst)
525 {
// `begin` is the first source byte touched; `iter` starts one past the last.
526  const char* begin = src + (pos / 2);
527  const char* iter = src + ((pos + length - 1) / 2) + 1;
528 
// Nibble slot (0 or 1) of the last requested residue inside its byte.
529  size_t offset = (pos + length - 1) % 2;
531 
532  switch ( offset ) {
533  case 0:
534  {{
535  --iter;
// Each full output byte is assembled from two neighbouring source bytes.
536  for ( size_t count = length / 2; count; --count, --iter, ++dst ) {
537  *dst =
538  table[static_cast<Uint1>(*iter) * 2] |
539  table[static_cast<Uint1>(*(iter - 1)) * 2 + 1];
540  }
541 
// A leftover residue comes from a single source byte.
542  if ( length % 2 != 0 ) {
543  *dst = table[static_cast<Uint1>(*iter) * 2];
544  }
545  }}
546  break;
547 
548  case 1:
549  {{
550  for ( ; iter != begin; ++dst ) {
551  *dst = table[static_cast<Uint1>(*--iter)];
552  }
553 
// NOTE(review): dst is one past the last byte written at this point, so this
// mask hits the following byte; the plain 4na reverse routine in this file
// steps dst back before masking.  Confirm whether a `--dst` is missing.
554  if ( length % 2 != 0 ) {
555  *dst &= char(0xF0);
556  }
557  }}
558  break;
559  }
560 
561  return length;
562 }
563 
564 
// Raw-buffer reverse complement into a separate destination: picks the
// routine for `src_coding` and returns its result; codings without a
// matching case raise CSeqUtilException with eInvalidCoding.
// NOTE(review): this listing has lost the line carrying the function name
// (callers use CSeqManip::ReverseComplement), some `case` labels and the
// continuation lines of both copy_1_to_1_reverse calls; restore them from
// the repository copy before compiling.
566 (const char* src,
567  TCoding src_coding,
568  TSeqPos pos,
569  TSeqPos length,
570  char* dst)
571 {
572  _ASSERT((dst != 0) && (src != 0));
573 
574  switch ( src_coding ) {
575  case CSeqUtil::e_Iupacna:
// NOTE(review): the final argument (the lookup table) and the closing of
// this call are absent from the listing.
576  return copy_1_to_1_reverse(src, pos, length, dst,
578 
579  case CSeqUtil::e_Ncbi2na:
580  return s_Ncbi2naRevCmp(src, pos, length, dst);
581 
// NOTE(review): the case label selecting the next return is absent from the
// listing -- presumably CSeqUtil::e_Ncbi2na_expand; confirm.
583  return s_Ncbi2naExpandRevCmp(src, pos, length, dst);
584 
585  case CSeqUtil::e_Ncbi4na:
586  return s_Ncbi4naRevCmp(src, pos, length, dst);
587 
588  case CSeqUtil::e_Ncbi8na:
// NOTE(review): a line before this call and the call's final argument are
// absent from the listing.
590  return copy_1_to_1_reverse(src, pos, length, dst,
592  default:
593  break;
594  }
595 
// Reached only through `default`.
596  NCBI_THROW(CSeqUtilException, eInvalidCoding,
597  "There is no complement for the specified coding.");
598 }
599 
600 
601 // in place
602 
603 template <typename SrcCont>
605 (SrcCont& src,
606  CSeqUtil::TCoding src_coding,
607  TSeqPos pos,
608  TSeqPos length)
609 {
610  _ASSERT(!OutOfRange(pos, src, src_coding));
611  if ( src.empty() || (length == 0) ) {
612  return 0;
613  }
614 
615  AdjustLength(src, src_coding, pos, length);
616 
617  return CSeqManip::ReverseComplement(&*src.begin(), src_coding,
618  pos, length);
619 }
620 
621 
623 (string& src,
624  TCoding src_coding,
625  TSeqPos pos,
626  TSeqPos length)
627 {
628  // call the templated version
629  return s_ReverseComplement(src, src_coding, pos, length);
630 }
631 
632 
634 (vector<char>& src,
635  TCoding src_coding,
636  TSeqPos pos,
637  TSeqPos length)
638 {
639  // call the templated version
640  return s_ReverseComplement(src, src_coding, pos, length);
641 }
642 
643 
645 (char* src,
646  TSeqPos pos,
647  TSeqPos length)
648 {
649  char* first = src + pos;
650  char* last = first + length;
651  char temp;
652 
653  for ( ; first <= last; ++first, --last ) {
654  temp = char(3 - *first);
655  *first = char(3 - *last);
656  *last = temp;
657  }
658 
659  if ( pos != 0 ) {
660  copy(src + pos, src + pos + length, src);
661  }
662 
663  return length;
664 }
665 
666 
// s_Ncbi2naRevCmp, in-place form (name taken from its call site in the
// in-place CSeqManip::ReverseComplement): works through a temporary buffer
// of `length` bytes -- convert out of e_Ncbi2na, run revcmp() with the
// C8naCmp table, convert back into `src` as e_Ncbi2na.  Returns `length`.
// NOTE(review): this listing has lost the line carrying the function name,
// the tail of the first Convert call and the head of the second one; restore
// them from the repository copy before compiling.
668 (char* src,
669  TSeqPos pos,
670  TSeqPos length)
671 {
// NOTE(review): raw new[]/delete[] pair -- if any call in between throws,
// buf is not released.
672  char* buf = new char[length];
673  CSeqConvert::Convert(src, CSeqUtil::e_Ncbi2na, pos, length,
// NOTE(review): buf holds only `length` bytes, yet revcmp receives `pos`;
// if revcmp offsets into its buffer by pos this overruns for pos != 0 --
// verify against revcmp's definition.
675  revcmp(buf, pos, length, C8naCmp::GetTable());
677  src, CSeqUtil::e_Ncbi2na);
678  delete[] buf;
679 
680  return length;
681 }
682 
683 
// s_Ncbi4naRevCmp, in-place form (name taken from its call site in the
// in-place CSeqManip::ReverseComplement): works through a temporary buffer
// of `length` bytes -- convert out of e_Ncbi4na, run revcmp() with the
// C8naCmp table, convert back into `src` as e_Ncbi4na.  Returns `length`.
// NOTE(review): this listing has lost the line carrying the function name,
// the tail of the first Convert call and the head of the second one; restore
// them from the repository copy before compiling.
685 (char* src,
686  TSeqPos pos,
687  TSeqPos length)
688 {
// NOTE(review): raw new[]/delete[] pair -- if any call in between throws,
// buf is not released.
689  char* buf = new char[length];
690  CSeqConvert::Convert(src, CSeqUtil::e_Ncbi4na, pos, length,
// NOTE(review): buf holds only `length` bytes, yet revcmp receives `pos`;
// if revcmp offsets into its buffer by pos this overruns for pos != 0 --
// verify against revcmp's definition.
692  revcmp(buf, pos, length, C8naCmp::GetTable());
694  src, CSeqUtil::e_Ncbi4na);
695  delete[] buf;
696 
697  return length;
698 }
699 
700 
// Raw-buffer in-place reverse complement: picks the routine for
// `src_coding` and returns its result; codings without a matching case
// raise CSeqUtilException with eInvalidCoding.
// NOTE(review): this listing has lost the line carrying the function name
// (callers use CSeqManip::ReverseComplement) and two `case` labels inside
// the switch; restore them from the repository copy before compiling.
702 (char* src,
703  TCoding src_coding,
704  TSeqPos pos,
705  TSeqPos length)
706 {
707  _ASSERT(src != 0);
708 
709  switch ( src_coding ) {
710  case CSeqUtil::e_Iupacna:
711  return revcmp(src, pos, length, CIupacnaCmp::GetTable());
712 
713  case CSeqUtil::e_Ncbi2na:
714  return s_Ncbi2naRevCmp(src, pos, length);
715 
// NOTE(review): the case label selecting the next return is absent from the
// listing -- presumably CSeqUtil::e_Ncbi2na_expand; confirm.
717  return s_Ncbi2naExpandRevCmp(src, pos, length);
718 
719  case CSeqUtil::e_Ncbi4na:
720  return s_Ncbi4naRevCmp(src, pos, length);
721 
722  case CSeqUtil::e_Ncbi8na:
// NOTE(review): a line is absent here -- presumably a second case label
// sharing the table-driven call below; confirm.
724  return revcmp(src, pos, length, C8naCmp::GetTable());
725 
726  default:
727  break;
728  }
729 
// Reached only through `default`.
730  NCBI_THROW(CSeqUtilException, eInvalidCoding,
731  "There is no complement for the specified coding.");
732 }
733 
static const Uint1 * GetTable(size_t offset)
static const Uint1 * GetTable(size_t offset)
static const Uint1 * GetTable(size_t offset)
static const Uint1 * GetTable(size_t offset)
static const Uint1 * GetTable(size_t offset)
static const Uint1 * GetTable(void)
static const Uint1 * GetTable(void)
static const Uint1 * GetTable(void)
static SIZE_TYPE Convert(const CTempString &src, TCoding src_coding, TSeqPos pos, TSeqPos length, string &dst, TCoding dst_coding)
static SIZE_TYPE Reverse(const string &src, TCoding src_coding, TSeqPos pos, TSeqPos length, string &dst)
static SIZE_TYPE ReverseComplement(const string &src, TCoding src_coding, TSeqPos pos, TSeqPos length, string &dst)
static SIZE_TYPE Complement(const string &src, TCoding src_coding, TSeqPos pos, TSeqPos length, string &dst)
@ e_Ncbi4na_expand
Definition: sequtil.hpp:51
@ e_Ncbi8na
Definition: sequtil.hpp:52
@ e_Iupacna
Definition: sequtil.hpp:47
@ e_Ncbi4na
Definition: sequtil.hpp:50
@ e_Ncbi2na_expand
Definition: sequtil.hpp:49
@ e_Ncbi2na
Definition: sequtil.hpp:48
Include a standard set of the NCBI C++ Toolkit most basic headers.
The NCBI C++ standard methods for dealing with std::string.
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:51
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
<!DOCTYPE HTML >< html > n< header > n< title > PubSeq Gateway Help Page</title > n< style > n table
char * buf
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
int offset
Definition: replacements.h:160
static SIZE_TYPE s_Ncbi2naRevCmp(const char *src, TSeqPos pos, TSeqPos length, char *dst)
static SIZE_TYPE s_Ncbi2naExpandRevCmp(const char *src, TSeqPos pos, TSeqPos length, char *dst)
static SIZE_TYPE s_4naReverse(const char *src, TSeqPos pos, TSeqPos length, char *dst)
static SIZE_TYPE s_Ncbi4naRevCmp(const char *src, TSeqPos pos, TSeqPos length, char *dst)
SIZE_TYPE s_ReverseComplement(const SrcCont &src, CSeqUtil::TCoding src_coding, TSeqPos pos, TSeqPos length, DstCont &dst)
static SIZE_TYPE s_Ncbi2naComplement(const char *src, TSeqPos pos, TSeqPos length, char *dst)
static SIZE_TYPE s_2naReverse(const char *src, TSeqPos pos, TSeqPos length, char *dst)
SIZE_TYPE s_Complement(const SrcCont &src, CSeqUtil::TCoding src_coding, TSeqPos pos, TSeqPos length, DstCont &dst)
static SIZE_TYPE s_Ncbi2naExpandComplement(const char *src, TSeqPos pos, TSeqPos length, char *dst)
SIZE_TYPE s_Reverse(const SrcCont &src, CSeqUtil::TCoding src_coding, TSeqPos pos, TSeqPos length, DstCont &dst)
static SIZE_TYPE s_Ncbi4naComplement(const char *src, TSeqPos pos, TSeqPos length, char *dst)
SIZE_TYPE copy_1_to_1_reverse(const char *src, TSeqPos pos, TSeqPos length, char *dst, const Uint1 *table)
SIZE_TYPE convert_1_to_1(const char *src, TSeqPos pos, TSeqPos length, char *dst, const Uint1 *table)
SIZE_TYPE revcmp(char *buf, TSeqPos pos, TSeqPos length, const Uint1 *table)
bool OutOfRange(TSeqPos pos, const C &container, CSeqUtil::TCoding coding)
void ResizeDst(C &container, CSeqUtil::TCoding coding, TSeqPos length)
void AdjustLength(C &container, CSeqUtil::TCoding coding, TSeqPos pos, TSeqPos &length)
#define _ASSERT
Modified on Sat Dec 02 09:21:57 2023 by modify_doxy.py rev. 669887