65 unsigned int bases[4];
66 for (
int na4 = 0; na4 < 16; na4++) {
69 for (
int bit = 0; bit < 4; bit++) {
71 if ( !na4 || (na4 & (1 << bit)) ) {
87 for (
int bit = 0; bit < 4; bit++) {
93 for (
int base = 0; base < 4; base++) {
94 if (!bases[base] ||
rnd > bases[base]) {
116 for (
char* stop = data + count; data < stop; ++data, ++pos) {
142 m_SeqMap(vec.m_SeqMap),
146 m_Strand(vec.m_Strand),
147 m_Coding(vec.m_Coding)
155 m_SeqMap(&bioseq.GetSeqMap()),
156 m_TSE(bioseq.GetTSE_Handle()),
196 m_SeqMap(
CSeqMap::GetSeqMapForSeq_loc(loc, &scope)),
202 m_TSE = bh.GetTSE_Handle();
229 m_SeqMap(
CSeqMap::CreateSeqMapForBioseq(bioseq)),
246 if ( &vec !=
this ) {
316 src_end =
min(src_end,
size());
317 if ( src_pos >= src_end ) {
323 "CSeqVector::GetPackedSeqData: "
324 "cannot get seq-data in range: "
325 <<src_pos<<
"-"<<src_end);
329 switch ( dst_coding ) {
346 "Can not pack data using the selected coding: "<<
355 size_t src_pos,
size_t count)
357 _ASSERT(src_pos+count >= src_pos);
358 _ASSERT(src_pos+count <= src_str.size());
360 dst_str.append(src_str.data()+src_pos, count);
367 size_t src_pos,
size_t count)
369 _ASSERT(src_pos+count >= src_pos);
370 _ASSERT(src_pos+count <= src_str.size());
372 dst_str.append(&src_str[src_pos], count);
381 dst_str.append(count, gap);
388 const char* src,
size_t count)
395 dst += char((dst_c<<4)|*src);
401 for ( ; count >= 2; dst_pos += 2, src += 2, count -= 2 ) {
402 dst += char((src[0]<<4)|src[1]);
412 const vector<char>& src,
TSeqPos src_pos,
415 _ASSERT(src_pos+count >= src_pos);
416 _ASSERT(src_pos+count <= src.size()*2);
420 if ( (src_pos^dst_pos) & 1 ) {
424 dst += char((dst_c<<4)|((src[src_pos>>1]>>4)&15));
431 size_t pos = src_pos>>1;
432 for ( ; count >= 2; dst_pos += 2, pos += 1, count -= 2 ) {
433 dst += char(((src[pos]<<4)&0xf0)|((src[pos+1]>>4)&0x0f));
437 dst_c = (src[pos])&15;
444 dst += char((dst_c<<4)|((src[src_pos>>1])&15));
452 size_t octets = count>>1;
453 size_t pos = src_pos>>1;
455 dst.append(&src[pos], octets);
459 dst_c = (src[pos+octets]>>4)&15;
474 dst_str += char((dst_c << 4)|gap);
480 size_t octets = count>>1;
482 dst_str.append(octets,
char((gap<<4)|gap));
497 _ASSERT(dst_str.size() == dst_pos>>2);
498 const char* unpacked =
buffer;
501 for ( ; count && (dst_pos&3); --count, ++dst_pos ) {
502 c = char((c<<2)|*unpacked++);
504 if ( (dst_pos&3) == 0 ) {
516 _ASSERT(dst_str.size() == dst_pos>>2);
518 char* packed_end = packed_buffer;
519 for ( ; count >= 4; count -= 4, unpacked += 4 ) {
520 *packed_end++ = char(
521 (unpacked[0]<<6)|(unpacked[1]<<4)|(unpacked[2]<<2)|unpacked[3] );
523 dst_str.append(packed_buffer, packed_end);
529 dst_c = char((unpacked[0]<<2)|unpacked[1]);
532 dst_c = char((unpacked[0]<<4)|(unpacked[1]<<2)|unpacked[2]);
543 const vector<char>& src,
TSeqPos src_pos,
546 _ASSERT(src_pos+count >= src_pos);
547 _ASSERT(src_pos+count <= src.size()*4);
551 if ( (src_pos^dst_pos) & 3 ) {
574 char c = char((dst_c<<(add*2))|(src[src_pos>>2]&((1<<(add*2))-1)));
576 dst_c = char(c >> (2*(add-count)));
587 size_t octets = count>>2;
588 size_t pos = src_pos>>2;
590 dst.append(&src[pos], octets);
592 size_t rem = count&3;
595 dst_c = char((src[pos+octets]&255)>>(2*(4-rem)));
605 _ASSERT(src_pos+count >= src_pos);
608 _ASSERT(dst_str.size() == dst_pos>>2);
612 fill_n(
buffer, chunk, gap);
618 _ASSERT(dst_str.size() == dst_pos>>2);
627 const char*
table = 0,
bool reverse =
false)
629 _ASSERT(dataPos+total_count >= dataPos);
633 dataPos += total_count;
635 while ( total_count ) {
640 switch ( src_coding ) {
675 "Invalid data coding: "<<src_coding);
677 dst_str.append(
buffer, count);
681 total_count -= count;
690 const char*
table,
bool reverse)
692 _ASSERT(dataPos+total_count >= dataPos);
697 dataPos += total_count;
699 while ( total_count ) {
704 switch ( src_coding ) {
739 "Invalid data coding: "<<src_coding);
746 total_count -= count;
755 const char*
table,
bool reverse,
758 _ASSERT(dataPos+total_count >= dataPos);
763 dataPos += total_count;
765 while ( total_count ) {
770 switch ( src_coding ) {
805 "Invalid data coding: "<<src_coding);
815 randomizer_pos += count;
816 total_count -= count;
833 dst_str.reserve(src_end-src_pos);
836 while ( src_pos < src_end ) {
837 _ASSERT(dst_str.size() == dst_pos);
847 const char*
table = 0;
848 if ( dst_coding != src_coding || reverse ||
851 reverse, case_conversion);
852 if ( !
table && src_coding != dst_coding ) {
854 "Incompatible sequence codings: "<<
855 src_coding<<
" -> "<<dst_coding);
871 switch ( src_coding ) {
907 _ASSERT(dst_str.size() == (dst_pos+=count));
924 dst_str.reserve((src_end-src_pos+1)>>1);
928 while ( src_pos < src_end ) {
929 _ASSERT(dst_str.size() == dst_pos>>1);
939 const char*
table = 0;
940 if ( dst_coding != src_coding || reverse ||
943 reverse, case_conversion);
944 if ( !
table && src_coding != dst_coding ) {
946 "Incompatible sequence codings: "<<
947 src_coding<<
" -> "<<dst_coding);
963 data, dataPos, count,
table, reverse);
975 _ASSERT(dst_str.size() == dst_pos>>1);
978 dst_str += char(dst_c<<4);
995 dst_str.reserve((src_end-src_pos+3)>>2);
999 while ( src_pos < src_end ) {
1000 _ASSERT(dst_str.size() == dst_pos>>2);
1005 "Cannot fill NCBI2na gap without randomizer");
1024 const char*
table = 0;
1025 if ( dst_coding != src_coding || reverse ||
1028 reverse, case_conversion);
1029 if ( !
table && src_coding != dst_coding ) {
1031 "Incompatible sequence codings: "<<
1032 src_coding<<
" -> "<<dst_coding);
1051 data, dataPos, count,
table, reverse,
1052 randomizer, src_pos);
1065 _ASSERT(dst_str.size() == dst_pos>>2);
1104 "Can not indicate gap using the selected coding: "<<
1117 typedef pair<TCoding, TCoding> TMainConversion;
1118 typedef pair<bool, ECaseConversion> TConversionFlags;
1119 typedef pair<TMainConversion, TConversionFlags> TConversionKey;
1120 typedef vector<char> TConversionTable;
1125 key.first = TMainConversion(src, dst);
1126 key.second = TConversionFlags(reverse, case_cvt);
1127 TTables::iterator it =
tables->find(
key);
1128 if ( it !=
tables->end() ) {
1130 switch (it->second.size()) {
1133 default:
return &it->second[0];
1136 TConversionTable&
table = (*tables)[
key];
1147 if ( srcIndex.second >= COUNT ) {
1157 catch ( exception& ) {
1171 pair<unsigned, unsigned> dstIndex =
1173 if ( dstIndex.second >= COUNT ) {
1182 catch ( exception& ) {
1192 table.resize(COUNT,
char(kInvalidCode));
1193 bool different =
false;
1194 for (
unsigned i = srcIndex.first;
i <= srcIndex.second; ++
i ) {
1215 catch ( exception& ) {
1228 '\x00',
'\x01',
'\x02',
'\x03',
'\x04',
'\x05',
'\x06',
'\x07',
1229 '\x08',
'\x09',
'\x0a',
'\x0b',
'\x0c',
'\x0d',
'\x0e',
'\x0f',
1230 '\x10',
'\x11',
'\x12',
'\x13',
'\x14',
'\x15',
'\x16',
'\x17',
1231 '\x18',
'\x19',
'\x1a',
'\x1b',
'\x1c',
'\x1d',
'\x1e',
'\x1f',
1232 '\x20',
'\x21',
'\x22',
'\x23',
'\x24',
'\x25',
'\x26',
'\x27',
1233 '\x28',
'\x29',
'\x2a',
'\x2b',
'\x2c',
'\x2d',
'\x2e',
'\x2f',
1234 '\x30',
'\x31',
'\x32',
'\x33',
'\x34',
'\x35',
'\x36',
'\x37',
1235 '\x38',
'\x39',
'\x3a',
'\x3b',
'\x3c',
'\x3d',
'\x3e',
'\x3f',
1236 '\x40',
'\x41',
'\x42',
'\x43',
'\x44',
'\x45',
'\x46',
'\x47',
1237 '\x48',
'\x49',
'\x4a',
'\x4b',
'\x4c',
'\x4d',
'\x4e',
'\x4f',
1238 '\x50',
'\x51',
'\x52',
'\x53',
'\x54',
'\x55',
'\x56',
'\x57',
1239 '\x58',
'\x59',
'\x5a',
'\x5b',
'\x5c',
'\x5d',
'\x5e',
'\x5f',
1240 '\x60',
'\x61',
'\x62',
'\x63',
'\x64',
'\x65',
'\x66',
'\x67',
1241 '\x68',
'\x69',
'\x6a',
'\x6b',
'\x6c',
'\x6d',
'\x6e',
'\x6f',
1242 '\x70',
'\x71',
'\x72',
'\x73',
'\x74',
'\x75',
'\x76',
'\x77',
1243 '\x78',
'\x79',
'\x7a',
'\x7b',
'\x7c',
'\x7d',
'\x7e',
'\x7f',
1244 '\x80',
'\x81',
'\x82',
'\x83',
'\x84',
'\x85',
'\x86',
'\x87',
1245 '\x88',
'\x89',
'\x8a',
'\x8b',
'\x8c',
'\x8d',
'\x8e',
'\x8f',
1246 '\x90',
'\x91',
'\x92',
'\x93',
'\x94',
'\x95',
'\x96',
'\x97',
1247 '\x98',
'\x99',
'\x9a',
'\x9b',
'\x9c',
'\x9d',
'\x9e',
'\x9f',
1248 '\xa0',
'\xa1',
'\xa2',
'\xa3',
'\xa4',
'\xa5',
'\xa6',
'\xa7',
1249 '\xa8',
'\xa9',
'\xaa',
'\xab',
'\xac',
'\xad',
'\xae',
'\xaf',
1250 '\xb0',
'\xb1',
'\xb2',
'\xb3',
'\xb4',
'\xb5',
'\xb6',
'\xb7',
1251 '\xb8',
'\xb9',
'\xba',
'\xbb',
'\xbc',
'\xbd',
'\xbe',
'\xbf',
1252 '\xc0',
'\xc1',
'\xc2',
'\xc3',
'\xc4',
'\xc5',
'\xc6',
'\xc7',
1253 '\xc8',
'\xc9',
'\xca',
'\xcb',
'\xcc',
'\xcd',
'\xce',
'\xcf',
1254 '\xd0',
'\xd1',
'\xd2',
'\xd3',
'\xd4',
'\xd5',
'\xd6',
'\xd7',
1255 '\xd8',
'\xd9',
'\xda',
'\xdb',
'\xdc',
'\xdd',
'\xde',
'\xdf',
1256 '\xe0',
'\xe1',
'\xe2',
'\xe3',
'\xe4',
'\xe5',
'\xe6',
'\xe7',
1257 '\xe8',
'\xe9',
'\xea',
'\xeb',
'\xec',
'\xed',
'\xee',
'\xef',
1258 '\xf0',
'\xf1',
'\xf2',
'\xf3',
'\xf4',
'\xf5',
'\xf6',
'\xf7',
1259 '\xf8',
'\xf9',
'\xfa',
'\xfb',
'\xfc',
'\xfd',
'\xfe',
'\xff'
static CRef< CScope > m_Scope
CScope * GetScopeOrNull(void) const
SeqVector related exceptions.
static TPair GetCodeIndexFromTo(CSeq_data::E_Choice code_type)
static bool IsCodeAvailable(CSeq_data::E_Choice code_type)
static TIndex GetIndexComplement(CSeq_data::E_Choice code_type, TIndex idx)
static TIndex GetMapToIndex(CSeq_data::E_Choice from_type, CSeq_data::E_Choice to_type, TIndex from_idx)
void reset(element_type *p=0, EOwnership ownership=eTakeOwnership)
Reset will delete the old pointer (if owned), set content to the new value, and assume the ownership ...
unsigned int TSeqPos
Type for sequence locations and lengths.
int TSignedSeqPos
Type for signed sequence position.
element_type * get(void) const
Get pointer.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
#define NCBI_THROW_FMT(exception_class, err_code, message)
The same as NCBI_THROW but with message processed as output to ostream.
const TPrim & Get(void) const
const CSeq_id * GetId(void) const
Get the id of the location return NULL if has multiple ids or no id at all.
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
TSeqPos GetBioseqLength(void) const
EVectorCoding
CSeqVector constructor flags.
TMol GetSequenceType(void) const
@ eCoding_Ncbi
Set coding to binary coding (Ncbi4na or Ncbistdaa)
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
virtual void RandomizeData(char *buffer, size_t count, TSeqPos pos)=0
Convert count unpacked bases in buffer 4na -> 2na with randomization.
TSeqPos GetEndPosition(void) const
return end position of current segment in sequence (exclusive)
const CSeq_data & GetRefData(void) const
will allow any data segments, user should check for position and strand
static const char sm_TrivialTable[256]
SSeqMapSelector & SetLinkUsedTSE(bool link=true)
TSeqPos GetRefPosition(void) const
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer)
Fill the buffer string with the sequence data for the interval [start, stop).
TSeqPos GetGapSizeForward(void) const
returns number of gap symbols ahead including current symbol returns 0 if current position is not in ...
CConstRef< CSeq_literal > GetGapSeq_literal(void) const
returns gap Seq-data object ref returns null if it's not a gap or an unspecified gap
bool GetRefMinusStrand(void) const
CSeqMap::ESegmentType GetType(void) const
static const char * sx_GetConvertTable(TCoding src, TCoding dst, bool reverse, ECaseConversion case_cvt)
bool CanGetRange(TSeqPos start, TSeqPos stop)
Check if the sequence can be obtained for the interval [start, stop)
TSeqPos GetRefEndPosition(void) const
SSeqMapSelector & SetStrand(ENa_strand strand)
Set strand to iterate over.
static TResidue sx_GetGapChar(TCoding coding, ECaseConversion case_cvt)
TSeqPos GetPosition(void) const
return position of current segment in sequence
AutoPtr< CSeqVector_CI > m_Iterator
TCoding GetCoding(void) const
Target sequence coding.
~CNcbi2naRandomizer(void)
friend class CSeqVector_CI
bool CanGetRange(TSeqPos start, TSeqPos stop) const
Check if the sequence data is available for the interval [start, stop).
CSeqVector & operator=(const CSeqVector &vec)
CConstRef< CSeqMap > m_SeqMap
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
CSeqVector_CI & x_GetIterator(TSeqPos pos) const
void SetNoAmbiguities(void)
void x_InitRandomizer(CRandom &random_gen)
CRef< INcbi2naRandomizer > m_Randomizer
virtual ~CSeqVector(void)
CConstRef< CSeq_literal > GetGapSeq_literal(TSeqPos pos) const
returns gap Seq-literal object ref returns null if it's not a gap or an unspecified gap
void x_ResetIterator(void) const
CSeqVector_CI * x_CreateIterator(TSeqPos pos) const
TMutex & GetMutex(void) const
Get mutex for a few non-MT-safe methods to make them MT-safe at a cost of performance.
CNcbi2naRandomizer(CRandom &gen)
bool IsProtein(void) const
void SetCoding(TCoding coding)
void SetIupacCoding(void)
Set coding to either Iupacaa or Iupacna depending on molecule type.
char m_RandomTable[16][kRandomDataSize]
TSeqPos GetLength(CScope *scope) const
void x_GetPacked8SeqData(string &dst_str, TSeqPos src_pos, TSeqPos src_end)
void SetRandomizeAmbiguities(void)
Randomization of ambiguities and gaps in ncbi2na coding.
TSeqPos GetGapSizeForward(TSeqPos pos) const
returns number of gap symbols ahead including base at position 'pos' returns 0 if the position is not...
void x_GetPacked2naSeqData(string &dst_str, TSeqPos src_pos, TSeqPos src_end)
void SetNcbiCoding(void)
Set coding to either Ncbi8aa or Ncbi8na depending on molecule type.
void SetStrand(ENa_strand strand)
void GetPackedSeqData(string &buffer, TSeqPos start=0, TSeqPos stop=kInvalidSeqPos)
void RandomizeData(char *buffer, size_t count, TSeqPos pos)
Convert count unpacked bases in buffer 4na -> 2na with randomization.
void x_GetPacked4naSeqData(string &dst_str, TSeqPos src_pos, TSeqPos src_end)
TResidue GetGapChar(ECaseConversion case_cvt=eCaseConversion_none) const
Return gap symbol corresponding to the selected coding.
TObjectType * GetPointer(void) THROWS_NONE
Get pointer,.
uint32_t Uint4
4-byte (32-bit) unsigned integer
Uint4 TValue
Type of the generated integer value and/or the seed value.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define BEGIN_SCOPE(ns)
Define a new scope.
void set_bit(unsigned *dest, unsigned bitpos) noexcept
Set 1 bit in a block.
ENa_strand
strand of nucleic acid
const TIupacaa & GetIupacaa(void) const
Get the variant data.
const TInst & GetInst(void) const
Get the Inst member data.
const TIupacna & GetIupacna(void) const
Get the variant data.
const TNcbi8aa & GetNcbi8aa(void) const
Get the variant data.
TMol GetMol(void) const
Get the Mol member data.
const TNcbieaa & GetNcbieaa(void) const
Get the variant data.
const TNcbistdaa & GetNcbistdaa(void) const
Get the variant data.
const TNcbi4na & GetNcbi4na(void) const
Get the variant data.
const TNcbi2na & GetNcbi2na(void) const
Get the variant data.
const TNcbi8na & GetNcbi8na(void) const
Get the variant data.
E_Choice Which(void) const
Which variant is currently selected.
@ e_not_set
No variant selected.
@ e_Ncbipna
nucleic acid probabilities
@ e_Ncbieaa
extended ASCII 1 letter aa codes
@ e_Ncbistdaa
consecutive codes for std aas
@ e_Ncbi2na
2 bit nucleic acid code
@ e_Iupacna
IUPAC 1 letter nuc acid code.
@ e_Ncbipaa
amino acid probabilities
@ e_Ncbi8na
8 bit extended nucleic acid code
@ e_Ncbi4na
4 bit nucleic acid code
@ e_Iupacaa
IUPAC 1 letter amino acid code.
@ e_Ncbi8aa
8 bit extended amino acid codes
<!DOCTYPE HTML >< html > n< header > n< title > PubSeq Gateway Help Page</title > n< style > n table
const struct ncbi::grid::netcache::search::fields::KEY key
static size_t rnd(size_t minimal, size_t maximal)
Multi-threading – mutexes; rw-locks; semaphore.
static const unsigned char * tables(int mode)
static pcre_uint8 * buffer
static void x_AppendGapTo4(string &dst_str, char &dst_c, TSeqPos dst_pos, TSeqPos count, char gap)
static void x_AppendAnyTo4(string &dst_str, char &dst_c, TSeqPos dst_pos, const CSeq_data &data, TSeqPos dataPos, TSeqPos total_count, const char *table, bool reverse)
static void x_AppendAnyTo8(string &dst_str, const CSeq_data &data, TSeqPos dataPos, TSeqPos total_count, const char *table=0, bool reverse=false)
static void x_Append2To2(string &dst, char &dst_c, TSeqPos dst_pos, const vector< char > &src, TSeqPos src_pos, TSeqPos count)
static void x_Append8To2(string &dst_str, char &dst_c, TSeqPos dst_pos, const char *buffer, TSeqPos count)
static const size_t kBufferSize
static void x_AppendAnyTo2(string &dst_str, char &dst_c, TSeqPos dst_pos, const CSeq_data &data, TSeqPos dataPos, TSeqPos total_count, const char *table, bool reverse, INcbi2naRandomizer *randomizer, TSeqPos randomizer_pos)
static void x_AppendGapTo8(string &dst_str, size_t count, char gap)
static void x_Append8To8(string &dst_str, const string &src_str, size_t src_pos, size_t count)
static void x_Append4To4(string &dst, char &dst_c, TSeqPos dst_pos, const vector< char > &src, TSeqPos src_pos, TSeqPos count)
static void x_AppendRandomTo2(string &dst_str, char &dst_c, TSeqPos dst_pos, TSeqPos src_pos, TSeqPos count, INcbi2naRandomizer &randomizer, char gap)
static void x_Append8To4(string &dst, char &dst_c, TSeqPos dst_pos, const char *src, size_t count)
DEFINE_STATIC_FAST_MUTEX(s_ConvertTableMutex2)
void copy_8bit_any(DstIter dst, size_t count, const SrcCont &srcCont, size_t srcPos, const char *table, bool reverse)
void copy_4bit_any(DstIter dst, size_t count, const SrcCont &srcCont, size_t srcPos, const char *table, bool reverse)
void copy_2bit_any(DstIter dst, size_t count, const SrcCont &srcCont, size_t srcPos, const char *table, bool reverse)
void copy_2bit(DstIter dst, size_t count, const SrcCont &srcCont, size_t srcPos)
Selector used in CSeqMap methods returning iterators.