NCBI C++ ToolKit
seq_vector.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: seq_vector.cpp 100321 2023-07-20 14:27:32Z vasilche $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Aleksey Grichenko, Eugene Vasilchenko
27 *
28 * File Description:
29 * Sequence data container for object manager
30 *
31 */
32 
33 
34 #include <ncbi_pch.hpp>
35 #include <objmgr/seq_vector.hpp>
36 #include <objmgr/seq_vector_ci.hpp>
37 #include <corelib/ncbimtx.hpp>
41 #include <objmgr/seq_map.hpp>
44 #include <algorithm>
45 #include <map>
46 #include <vector>
47 #include <util/random_gen.hpp>
48 
51 
52 
53 ////////////////////////////////////////////////////////////////////
54 //
55 // CNcbi2naRandomizer::
56 //
57 
59 {
60 }
61 
62 
64 {
65  unsigned int bases[4]; // Count of each base in the random distribution
66  for (int na4 = 0; na4 < 16; na4++) {
67  int bit_count = 0;
68  char set_bit = 0;
69  for (int bit = 0; bit < 4; bit++) {
70  // na4 == 0 is special case (gap) should be treated as 0xf
71  if ( !na4 || (na4 & (1 << bit)) ) {
72  bit_count++;
73  bases[bit] = 1;
74  set_bit = (char)bit;
75  }
76  else {
77  bases[bit] = 0;
78  }
79  }
80  if (bit_count == 1) {
81  // Single base
82  m_FixedTable[na4] = set_bit;
83  continue;
84  }
86  // Ambiguity: create random distribution with possible bases
87  for (int bit = 0; bit < 4; bit++) {
88  bases[bit] *= kRandomDataSize/bit_count +
89  kRandomDataSize % bit_count;
90  }
91  for (int i = kRandomDataSize - 1; i >= 0; i--) {
92  CRandom::TValue rnd = gen.GetRand(0, i);
93  for (int base = 0; base < 4; base++) {
94  if (!bases[base] || rnd > bases[base]) {
95  rnd -= bases[base];
96  continue;
97  }
98  m_RandomTable[na4][i] = (char)base;
99  bases[base]--;
100  break;
101  }
102  }
103  }
104 }
105 
106 
108 {
109 }
110 
111 
113  size_t count,
114  TSeqPos pos)
115 {
116  for (char* stop = data + count; data < stop; ++data, ++pos) {
117  int base4na = *data;
118  char base2na = m_FixedTable[base4na];
119  if ( base2na == kRandomValue ) {
120  // Ambiguity, use random value
121  base2na = m_RandomTable[base4na][(pos & kRandomizerPosMask)];
122  }
123  *data = base2na;
124  }
125 }
126 
127 
128 ////////////////////////////////////////////////////////////////////
129 //
130 // CSeqVector::
131 //
132 
133 
135  : m_Size(0)
136 {
137 }
138 
139 
141  : m_Scope(vec.m_Scope),
142  m_SeqMap(vec.m_SeqMap),
143  m_TSE(vec.m_TSE),
144  m_Size(vec.m_Size),
145  m_Mol(vec.m_Mol),
146  m_Strand(vec.m_Strand),
147  m_Coding(vec.m_Coding)
148 {
149 }
150 
151 
153  EVectorCoding coding, ENa_strand strand)
154  : m_Scope(bioseq.GetScope()),
155  m_SeqMap(&bioseq.GetSeqMap()),
156  m_TSE(bioseq.GetTSE_Handle()),
157  m_Strand(strand),
158  m_Coding(CSeq_data::e_not_set)
159 {
160  m_Size = bioseq.GetBioseqLength();
161  m_Mol = bioseq.GetSequenceType();
162  SetCoding(coding);
163 }
164 
165 
166 CSeqVector::CSeqVector(const CSeqMap& seqMap, CScope& scope,
167  EVectorCoding coding, ENa_strand strand)
168  : m_Scope(&scope),
169  m_SeqMap(&seqMap),
170  m_Strand(strand),
171  m_Coding(CSeq_data::e_not_set)
172 {
174  m_Mol = m_SeqMap->GetMol();
175  SetCoding(coding);
176 }
177 
178 
179 CSeqVector::CSeqVector(const CSeqMap& seqMap, const CTSE_Handle& top_tse,
180  EVectorCoding coding, ENa_strand strand)
181  : m_Scope(top_tse.GetScope()),
182  m_SeqMap(&seqMap),
183  m_TSE(top_tse),
184  m_Strand(strand),
185  m_Coding(CSeq_data::e_not_set)
186 {
188  m_Mol = m_SeqMap->GetMol();
189  SetCoding(coding);
190 }
191 
192 
194  EVectorCoding coding, ENa_strand strand)
195  : m_Scope(&scope),
196  m_SeqMap(CSeqMap::GetSeqMapForSeq_loc(loc, &scope)),
197  m_Strand(strand),
198  m_Coding(CSeq_data::e_not_set)
199 {
200  if ( const CSeq_id* id = loc.GetId() ) {
201  if ( CBioseq_Handle bh = scope.GetBioseqHandle(*id) ) {
202  m_TSE = bh.GetTSE_Handle();
203  }
204  }
206  m_Mol = m_SeqMap->GetMol();
207  SetCoding(coding);
208 }
209 
210 
211 CSeqVector::CSeqVector(const CSeq_loc& loc, const CTSE_Handle& top_tse,
212  EVectorCoding coding, ENa_strand strand)
213  : m_Scope(top_tse.GetScope()),
214  m_SeqMap(CSeqMap::GetSeqMapForSeq_loc(loc, &top_tse.GetScope())),
215  m_TSE(top_tse),
216  m_Strand(strand),
217  m_Coding(CSeq_data::e_not_set)
218 {
220  m_Mol = m_SeqMap->GetMol();
221  SetCoding(coding);
222 }
223 
224 
226  CScope* scope,
227  EVectorCoding coding, ENa_strand strand)
228  : m_Scope(scope),
229  m_SeqMap(CSeqMap::CreateSeqMapForBioseq(bioseq)),
230  m_Strand(strand),
231  m_Coding(CSeq_data::e_not_set)
232 {
233  m_Size = m_SeqMap->GetLength(scope);
234  m_Mol = bioseq.GetInst().GetMol();
235  SetCoding(coding);
236 }
237 
238 
240 {
241 }
242 
243 
245 {
246  if ( &vec != this ) {
247  TMutexGuard guard(GetMutex());
248  m_Scope = vec.m_Scope;
249  m_SeqMap = vec.m_SeqMap;
250  m_TSE = vec.m_TSE;
251  m_Size = vec.m_Size;
252  m_Mol = vec.m_Mol;
253  m_Strand = vec.m_Strand;
254  m_Coding = vec.m_Coding;
255  m_Iterator.reset();
256  }
257  return *this;
258 }
259 
260 
262 {
263  CSeqVector_CI* iter;
264  m_Iterator.reset(iter = new CSeqVector_CI(*this, pos));
265  return iter;
266 }
267 
268 
270 {
271  if ( m_Iterator.get() ) {
272  TMutexGuard guard(GetMutex());
273  m_Iterator.reset();
274  }
275 }
276 
277 
279 {
280  TMutexGuard guard(GetMutex());
281  return x_GetIterator(pos).GetGapSizeForward();
282 }
283 
284 
286 {
287  TMutexGuard guard(GetMutex());
288  return x_GetIterator(pos).GetGapSeq_literal();
289 }
290 
291 
292 bool CSeqVector::CanGetRange(TSeqPos start, TSeqPos stop) const
293 {
294  try {
295  TMutexGuard guard(GetMutex());
296  return x_GetIterator(start).CanGetRange(start, stop);
297  }
298  catch ( CException& /*ignored*/ ) {
299  return false;
300  }
301 }
302 
303 
// Fill 'buffer' with the sequence data for positions start..stop.
// The work is delegated to the iterator returned by x_GetIterator(start);
// the vector's mutex is held for the whole call.
304 void CSeqVector::GetSeqData(TSeqPos start, TSeqPos stop, string& buffer) const
305 {
306  TMutexGuard guard(GetMutex());
307  x_GetIterator(start).GetSeqData(start, stop, buffer);
308 }
309 
310 
311 void CSeqVector::GetPackedSeqData(string& dst_str,
312  TSeqPos src_pos,
313  TSeqPos src_end)
314 {
315  dst_str.erase();
316  src_end = min(src_end, size());
317  if ( src_pos >= src_end ) {
318  return;
319  }
320 
321  if ( m_TSE && !CanGetRange(src_pos, src_end) ) {
323  "CSeqVector::GetPackedSeqData: "
324  "cannot get seq-data in range: "
325  <<src_pos<<"-"<<src_end);
326  }
327 
328  TCoding dst_coding = GetCoding();
329  switch ( dst_coding ) {
336  x_GetPacked8SeqData(dst_str, src_pos, src_end);
337  break;
339  x_GetPacked4naSeqData(dst_str, src_pos, src_end);
340  break;
342  x_GetPacked2naSeqData(dst_str, src_pos, src_end);
343  break;
344  default:
345  NCBI_THROW_FMT(CSeqVectorException, eCodingError,
346  "Can not pack data using the selected coding: "<<
347  GetCoding());
348  }
349 }
350 
// Element count of the on-stack scratch arrays declared by the x_Append*
// helpers in this file; x_Append8To2 sizes its packed array as kBufferSize/4.
351 static const size_t kBufferSize = 1024; // must be multiple of 4
352 
353 static inline
354 void x_Append8To8(string& dst_str, const string& src_str,
355  size_t src_pos, size_t count)
356 {
357  _ASSERT(src_pos+count >= src_pos); // check for overflow
358  _ASSERT(src_pos+count <= src_str.size());
359  if ( count ) {
360  dst_str.append(src_str.data()+src_pos, count);
361  }
362 }
363 
364 
365 static inline
366 void x_Append8To8(string& dst_str, const vector<char>& src_str,
367  size_t src_pos, size_t count)
368 {
369  _ASSERT(src_pos+count >= src_pos); // check for overflow
370  _ASSERT(src_pos+count <= src_str.size());
371  if ( count ) {
372  dst_str.append(&src_str[src_pos], count);
373  }
374 }
375 
376 
// Extend dst_str by 'count' copies of the gap character.
static inline
void x_AppendGapTo8(string& dst_str, size_t count, char gap)
{
    if ( count == 0 ) {
        return;
    }
    dst_str.resize(dst_str.size() + count, gap);
}
384 
385 
386 static
387 void x_Append8To4(string& dst, char& dst_c, TSeqPos dst_pos,
388  const char* src, size_t count)
389 {
390  _ASSERT(src+count >= src); // check for overflow
391  if ( !count ) {
392  return;
393  }
394  if ( dst_pos & 1 ) {
395  dst += char((dst_c<<4)|*src);
396  dst_c = 0;
397  ++dst_pos;
398  ++src;
399  --count;
400  }
401  for ( ; count >= 2; dst_pos += 2, src += 2, count -= 2 ) {
402  dst += char((src[0]<<4)|src[1]);
403  }
404  if ( count&1 ) {
405  dst_c = *src;
406  }
407 }
408 
409 
410 static
411 void x_Append4To4(string& dst, char& dst_c, TSeqPos dst_pos,
412  const vector<char>& src, TSeqPos src_pos,
413  TSeqPos count)
414 {
415  _ASSERT(src_pos+count >= src_pos); // check for overflow
416  _ASSERT(src_pos+count <= src.size()*2);
417  if ( !count ) {
418  return;
419  }
420  if ( (src_pos^dst_pos) & 1 ) {
421  // misaligned data -> dst_str
422  if ( dst_pos & 1 ) {
423  // align dst_pos
424  dst += char((dst_c<<4)|((src[src_pos>>1]>>4)&15));
425  dst_c = 0;
426  ++dst_pos;
427  ++src_pos;
428  --count;
429  }
430  _ASSERT((src_pos&1));
431  size_t pos = src_pos>>1;
432  for ( ; count >= 2; dst_pos += 2, pos += 1, count -= 2 ) {
433  dst += char(((src[pos]<<4)&0xf0)|((src[pos+1]>>4)&0x0f));
434  }
435  if ( count&1 ) {
436  _ASSERT((src_pos&1));
437  dst_c = (src[pos])&15;
438  }
439  }
440  else {
441  // aligned data -> dst_str
442  if ( dst_pos & 1 ) {
443  // align dst_pos
444  dst += char((dst_c<<4)|((src[src_pos>>1])&15));
445  dst_c = 0;
446  ++dst_pos;
447  ++src_pos;
448  --count;
449  }
450  _ASSERT(!(src_pos&1));
451  _ASSERT(!(dst_pos&1));
452  size_t octets = count>>1;
453  size_t pos = src_pos>>1;
454  if ( octets ) {
455  dst.append(&src[pos], octets);
456  }
457  if ( count&1 ) {
458  _ASSERT(!(src_pos&1));
459  dst_c = (src[pos+octets]>>4)&15;
460  }
461  }
462 }
463 
464 
// Append 'count' copies of the 4-bit gap value to the packed string
// 'dst_str' (two values per octet).
// 'dst_pos' is the number of values emitted so far: when it is odd, 'dst_c'
// carries the value still waiting for its partner.  When the run ends on an
// unpaired gap value, it is left in 'dst_c'.
static
void x_AppendGapTo4(string& dst_str, char& dst_c, TSeqPos dst_pos,
                    TSeqPos count, char gap)
{
    if ( count == 0 ) {
        return;
    }
    if ( dst_pos & 1 ) {
        // finish the octet started by a previous call
        dst_str.push_back(char((dst_c << 4) | gap));
        dst_c = 0;
        --count;
    }
    const size_t whole_octets = count / 2;
    if ( whole_octets != 0 ) {
        const char gap_pair = char((gap << 4) | gap);
        dst_str.resize(dst_str.size() + whole_octets, gap_pair);
    }
    if ( count % 2 != 0 ) {
        dst_c = gap;
    }
}
488 
489 
490 static
491 void x_Append8To2(string& dst_str, char& dst_c, TSeqPos dst_pos,
492  const char* buffer, TSeqPos count)
493 {
494  if ( !count ) {
495  return;
496  }
497  _ASSERT(dst_str.size() == dst_pos>>2);
498  const char* unpacked = buffer;
499  if ( dst_pos&3 ) {
500  char c = dst_c;
501  for ( ; count && (dst_pos&3); --count, ++dst_pos ) {
502  c = char((c<<2)|*unpacked++);
503  }
504  if ( (dst_pos&3) == 0 ) {
505  dst_str += c;
506  dst_c = 0;
507  }
508  else {
509  dst_c = c;
510  }
511  if ( !count ) {
512  return;
513  }
514  }
515  _ASSERT((dst_pos&3) == 0);
516  _ASSERT(dst_str.size() == dst_pos>>2);
517  char packed_buffer[kBufferSize/4];
518  char* packed_end = packed_buffer;
519  for ( ; count >= 4; count -= 4, unpacked += 4 ) {
520  *packed_end++ = char(
521  (unpacked[0]<<6)|(unpacked[1]<<4)|(unpacked[2]<<2)|unpacked[3] );
522  }
523  dst_str.append(packed_buffer, packed_end);
524  switch ( count ) {
525  case 1:
526  dst_c = unpacked[0];
527  break;
528  case 2:
529  dst_c = char((unpacked[0]<<2)|unpacked[1]);
530  break;
531  case 3:
532  dst_c = char((unpacked[0]<<4)|(unpacked[1]<<2)|unpacked[2]);
533  break;
534  default:
535  dst_c = 0;
536  break;
537  }
538 }
539 
540 
541 static
542 void x_Append2To2(string& dst, char& dst_c, TSeqPos dst_pos,
543  const vector<char>& src, TSeqPos src_pos,
544  TSeqPos count)
545 {
546  _ASSERT(src_pos+count >= src_pos); // check for overflow
547  _ASSERT(src_pos+count <= src.size()*4);
548  if ( !count ) {
549  return;
550  }
551  if ( (src_pos^dst_pos) & 3 ) {
552  // misaligned src -> dst
553  char buffer[kBufferSize];
554  while ( count ) {
555  // if count is larger than buffer size make sure
556  // that the next dst_pos is aligned to 4.
557  TSeqPos chunk = min(count, TSeqPos(kBufferSize-(dst_pos&3)));
558  copy_2bit(buffer, chunk, src, src_pos);
559  // Array buffer[] is properly initialized in copy_2bit()
560  // but Clang static analyzer fails to notice it
561  // and issues false warning inside x_Append8To2() call.
562  x_Append8To2(dst, dst_c, dst_pos, buffer, chunk);
563  dst_pos += chunk;
564  src_pos += chunk;
565  count -= chunk;
566  }
567  return;
568  }
569 
570  // aligned src -> dst
571  if ( dst_pos&3 ) {
572  // align dst_pos
573  TSeqPos add = 4-(dst_pos&3);
574  char c = char((dst_c<<(add*2))|(src[src_pos>>2]&((1<<(add*2))-1)));
575  if ( count < add ) {
576  dst_c = char(c >> (2*(add-count)));
577  return;
578  }
579  dst += c;
580  dst_c = 0;
581  src_pos += add;
582  // Dead increment: dst_pos is not used anymore
583  //dst_pos += add;
584  count -= add;
585  }
586  _ASSERT(!(src_pos&3));
587  size_t octets = count>>2;
588  size_t pos = src_pos>>2;
589  if ( octets ) {
590  dst.append(&src[pos], octets);
591  }
592  size_t rem = count&3;
593  if ( rem ) {
594  _ASSERT(!(src_pos&3));
595  dst_c = char((src[pos+octets]&255)>>(2*(4-rem)));
596  }
597 }
598 
599 
600 static
601 void x_AppendRandomTo2(string& dst_str, char& dst_c, TSeqPos dst_pos,
602  TSeqPos src_pos, TSeqPos count,
603  INcbi2naRandomizer& randomizer, char gap)
604 {
605  _ASSERT(src_pos+count >= src_pos); // check for overflow
606  char buffer[kBufferSize];
607  while ( count ) {
608  _ASSERT(dst_str.size() == dst_pos>>2);
609  // if count is larger than buffer size make sure
610  // that the next dst_pos is aligned to 4.
611  TSeqPos chunk = min(count, TSeqPos(kBufferSize-(dst_pos&3)));
612  fill_n(buffer, chunk, gap);
613  randomizer.RandomizeData(buffer, chunk, src_pos);
614  x_Append8To2(dst_str, dst_c, dst_pos, buffer, chunk);
615  count -= chunk;
616  src_pos += chunk;
617  dst_pos += chunk;
618  _ASSERT(dst_str.size() == dst_pos>>2);
619  }
620 }
621 
622 
623 static
624 void x_AppendAnyTo8(string& dst_str,
625  const CSeq_data& data, TSeqPos dataPos,
626  TSeqPos total_count,
627  const char* table = 0, bool reverse = false)
628 {
629  _ASSERT(dataPos+total_count >= dataPos); // check for overflow
630  char buffer[kBufferSize];
631  CSeq_data::E_Choice src_coding = data.Which();
632  if ( reverse ) {
633  dataPos += total_count;
634  }
635  while ( total_count ) {
636  TSeqPos count = min(total_count, TSeqPos(sizeof(buffer)));
637  if ( reverse ) {
638  dataPos -= count;
639  }
640  switch ( src_coding ) {
642  copy_8bit_any(buffer, count, data.GetIupacna().Get(), dataPos,
643  table, reverse);
644  break;
646  copy_8bit_any(buffer, count, data.GetIupacaa().Get(), dataPos,
647  table, reverse);
648  break;
650  copy_2bit_any(buffer, count, data.GetNcbi2na().Get(), dataPos,
651  table, reverse);
652  break;
654  copy_4bit_any(buffer, count, data.GetNcbi4na().Get(), dataPos,
655  table, reverse);
656  break;
658  copy_8bit_any(buffer, count, data.GetNcbi8na().Get(), dataPos,
659  table, reverse);
660  break;
662  copy_8bit_any(buffer, count, data.GetNcbi8aa().Get(), dataPos,
663  table, reverse);
664  break;
666  copy_8bit_any(buffer, count, data.GetNcbieaa().Get(), dataPos,
667  table, reverse);
668  break;
670  copy_8bit_any(buffer, count, data.GetNcbistdaa().Get(), dataPos,
671  table, reverse);
672  break;
673  default:
674  NCBI_THROW_FMT(CSeqVectorException, eCodingError,
675  "Invalid data coding: "<<src_coding);
676  }
677  dst_str.append(buffer, count);
678  if ( !reverse ) {
679  dataPos += count;
680  }
681  total_count -= count;
682  }
683 }
684 
685 
686 static
687 void x_AppendAnyTo4(string& dst_str, char& dst_c, TSeqPos dst_pos,
688  const CSeq_data& data, TSeqPos dataPos,
689  TSeqPos total_count,
690  const char* table, bool reverse)
691 {
692  _ASSERT(dataPos+total_count >= dataPos); // check for overflow
693  _ASSERT(table || reverse);
694  char buffer[kBufferSize];
695  CSeq_data::E_Choice src_coding = data.Which();
696  if ( reverse ) {
697  dataPos += total_count;
698  }
699  while ( total_count ) {
700  TSeqPos count = min(total_count, TSeqPos(sizeof(buffer)));
701  if ( reverse ) {
702  dataPos -= count;
703  }
704  switch ( src_coding ) {
706  copy_8bit_any(buffer, count, data.GetIupacna().Get(), dataPos,
707  table, reverse);
708  break;
710  copy_8bit_any(buffer, count, data.GetIupacaa().Get(), dataPos,
711  table, reverse);
712  break;
714  copy_2bit_any(buffer, count, data.GetNcbi2na().Get(), dataPos,
715  table, reverse);
716  break;
718  copy_4bit_any(buffer, count, data.GetNcbi4na().Get(), dataPos,
719  table, reverse);
720  break;
722  copy_8bit_any(buffer, count, data.GetNcbi8na().Get(), dataPos,
723  table, reverse);
724  break;
726  copy_8bit_any(buffer, count, data.GetNcbi8aa().Get(), dataPos,
727  table, reverse);
728  break;
730  copy_8bit_any(buffer, count, data.GetNcbieaa().Get(), dataPos,
731  table, reverse);
732  break;
734  copy_8bit_any(buffer, count, data.GetNcbistdaa().Get(), dataPos,
735  table, reverse);
736  break;
737  default:
738  NCBI_THROW_FMT(CSeqVectorException, eCodingError,
739  "Invalid data coding: "<<src_coding);
740  }
741  x_Append8To4(dst_str, dst_c, dst_pos, buffer, count);
742  if ( !reverse ) {
743  dataPos += count;
744  }
745  dst_pos += count;
746  total_count -= count;
747  }
748 }
749 
750 
751 static
752 void x_AppendAnyTo2(string& dst_str, char& dst_c, TSeqPos dst_pos,
753  const CSeq_data& data, TSeqPos dataPos,
754  TSeqPos total_count,
755  const char* table, bool reverse,
756  INcbi2naRandomizer* randomizer, TSeqPos randomizer_pos)
757 {
758  _ASSERT(dataPos+total_count >= dataPos); // check for overflow
759  _ASSERT(table || reverse || randomizer);
760  char buffer[kBufferSize];
761  CSeq_data::E_Choice src_coding = data.Which();
762  if ( reverse ) {
763  dataPos += total_count;
764  }
765  while ( total_count ) {
766  TSeqPos count = min(total_count, TSeqPos(sizeof(buffer)));
767  if ( reverse ) {
768  dataPos -= count;
769  }
770  switch ( src_coding ) {
772  copy_8bit_any(buffer, count, data.GetIupacna().Get(), dataPos,
773  table, reverse);
774  break;
776  copy_8bit_any(buffer, count, data.GetIupacaa().Get(), dataPos,
777  table, reverse);
778  break;
780  copy_2bit_any(buffer, count, data.GetNcbi2na().Get(), dataPos,
781  table, reverse);
782  break;
784  copy_4bit_any(buffer, count, data.GetNcbi4na().Get(), dataPos,
785  table, reverse);
786  break;
788  copy_8bit_any(buffer, count, data.GetNcbi8na().Get(), dataPos,
789  table, reverse);
790  break;
792  copy_8bit_any(buffer, count, data.GetNcbi8aa().Get(), dataPos,
793  table, reverse);
794  break;
796  copy_8bit_any(buffer, count, data.GetNcbieaa().Get(), dataPos,
797  table, reverse);
798  break;
800  copy_8bit_any(buffer, count, data.GetNcbistdaa().Get(), dataPos,
801  table, reverse);
802  break;
803  default:
804  NCBI_THROW_FMT(CSeqVectorException, eCodingError,
805  "Invalid data coding: "<<src_coding);
806  }
807  if ( randomizer ) {
808  randomizer->RandomizeData(buffer, count, randomizer_pos);
809  }
810  x_Append8To2(dst_str, dst_c, dst_pos, buffer, count);
811  if ( !reverse ) {
812  dataPos += count;
813  }
814  dst_pos += count;
815  randomizer_pos += count;
816  total_count -= count;
817  }
818 }
819 
820 
821 void CSeqVector::x_GetPacked8SeqData(string& dst_str,
822  TSeqPos src_pos,
823  TSeqPos src_end)
824 {
825  ECaseConversion case_conversion = eCaseConversion_none;
827  sel.SetStrand(m_Strand);
828  if ( m_TSE ) {
829  sel.SetLinkUsedTSE(m_TSE);
830  }
831  CSeqMap_CI seg(m_SeqMap, m_Scope.GetScopeOrNull(), sel, src_pos);
832 
833  dst_str.reserve(src_end-src_pos);
834  TCoding dst_coding = GetCoding();
835  _DEBUG_ARG(TSeqPos dst_pos = 0);
836  while ( src_pos < src_end ) {
837  _ASSERT(dst_str.size() == dst_pos);
838  TSeqPos count = min(src_end-src_pos, seg.GetEndPosition()-src_pos);
839  if ( seg.GetType() == CSeqMap::eSeqGap ) {
840  x_AppendGapTo8(dst_str, count, GetGapChar());
841  }
842  else {
843  const CSeq_data& data = seg.GetRefData();
844  bool reverse = seg.GetRefMinusStrand();
845  TCoding src_coding = data.Which();
846 
847  const char* table = 0;
848  if ( dst_coding != src_coding || reverse ||
849  case_conversion != eCaseConversion_none ) {
850  table = sx_GetConvertTable(src_coding, dst_coding,
851  reverse, case_conversion);
852  if ( !table && src_coding != dst_coding ) {
853  NCBI_THROW_FMT(CSeqVectorException, eCodingError,
854  "Incompatible sequence codings: "<<
855  src_coding<<" -> "<<dst_coding);
856  }
857  }
858 
859  TSeqPos dataPos;
860  if ( reverse ) {
861  // Revert segment offset
862  dataPos = seg.GetRefEndPosition() -
863  (src_pos - seg.GetPosition()) - count;
864  }
865  else {
866  dataPos = seg.GetRefPosition() +
867  (src_pos - seg.GetPosition());
868  }
869 
870  if ( ( !table || table == sm_TrivialTable) && !reverse ) {
871  switch ( src_coding ) {
873  x_Append8To8(dst_str, data.GetIupacna().Get(),
874  dataPos, count);
875  break;
877  x_Append8To8(dst_str, data.GetIupacaa().Get(),
878  dataPos, count);
879  break;
881  x_Append8To8(dst_str, data.GetNcbi8na().Get(),
882  dataPos, count);
883  break;
885  x_Append8To8(dst_str, data.GetNcbi8aa().Get(),
886  dataPos, count);
887  break;
889  x_Append8To8(dst_str, data.GetNcbieaa().Get(),
890  dataPos, count);
891  break;
893  x_Append8To8(dst_str, data.GetNcbistdaa().Get(),
894  dataPos, count);
895  break;
896  default:
897  x_AppendAnyTo8(dst_str, data, dataPos, count);
898  break;
899  }
900  }
901  else {
902  x_AppendAnyTo8(dst_str, data, dataPos, count, table, reverse);
903  }
904  }
905  ++seg;
906  src_pos += count;
907  _ASSERT(dst_str.size() == (dst_pos+=count));
908  }
909 }
910 
911 
913  TSeqPos src_pos,
914  TSeqPos src_end)
915 {
916  ECaseConversion case_conversion = eCaseConversion_none;
918  sel.SetStrand(m_Strand);
919  if ( m_TSE ) {
920  sel.SetLinkUsedTSE(m_TSE);
921  }
922  CSeqMap_CI seg(m_SeqMap, m_Scope.GetScopeOrNull(), sel, src_pos);
923 
924  dst_str.reserve((src_end-src_pos+1)>>1);
925  TCoding dst_coding = GetCoding();
926  TSeqPos dst_pos = 0;
927  char dst_c = 0;
928  while ( src_pos < src_end ) {
929  _ASSERT(dst_str.size() == dst_pos>>1);
930  TSeqPos count = min(src_end-src_pos, seg.GetEndPosition()-src_pos);
931  if ( seg.GetType() == CSeqMap::eSeqGap ) {
932  x_AppendGapTo4(dst_str, dst_c, dst_pos, count, GetGapChar());
933  }
934  else {
935  const CSeq_data& data = seg.GetRefData();
936  bool reverse = seg.GetRefMinusStrand();
937  TCoding src_coding = data.Which();
938 
939  const char* table = 0;
940  if ( dst_coding != src_coding || reverse ||
941  case_conversion != eCaseConversion_none ) {
942  table = sx_GetConvertTable(src_coding, dst_coding,
943  reverse, case_conversion);
944  if ( !table && src_coding != dst_coding ) {
945  NCBI_THROW_FMT(CSeqVectorException, eCodingError,
946  "Incompatible sequence codings: "<<
947  src_coding<<" -> "<<dst_coding);
948  }
949  }
950 
951  if ( (table && table != sm_TrivialTable) || reverse ) {
952  TSeqPos dataPos;
953  if ( reverse ) {
954  // Revert segment offset
955  dataPos = seg.GetRefEndPosition() -
956  (src_pos - seg.GetPosition()) - count;
957  }
958  else {
959  dataPos = seg.GetRefPosition() +
960  (src_pos - seg.GetPosition());
961  }
962  x_AppendAnyTo4(dst_str, dst_c, dst_pos,
963  data, dataPos, count, table, reverse);
964  }
965  else {
966  TSeqPos dataPos = seg.GetRefPosition() +
967  (src_pos - seg.GetPosition());
968  x_Append4To4(dst_str, dst_c, dst_pos,
969  data.GetNcbi4na().Get(), dataPos, count);
970  }
971  }
972  ++seg;
973  dst_pos += count;
974  src_pos += count;
975  _ASSERT(dst_str.size() == dst_pos>>1);
976  }
977  if ( dst_pos&1 ) {
978  dst_str += char(dst_c<<4);
979  }
980 }
981 
982 
984  TSeqPos src_pos,
985  TSeqPos src_end)
986 {
987  ECaseConversion case_conversion = eCaseConversion_none;
989  sel.SetStrand(m_Strand);
990  if ( m_TSE ) {
991  sel.SetLinkUsedTSE(m_TSE);
992  }
993  CSeqMap_CI seg(m_SeqMap, m_Scope.GetScopeOrNull(), sel, src_pos);
994 
995  dst_str.reserve((src_end-src_pos+3)>>2);
997  TSeqPos dst_pos = 0;
998  char dst_c = 0;
999  while ( src_pos < src_end ) {
1000  _ASSERT(dst_str.size() == dst_pos>>2);
1001  TSeqPos count = min(src_end-src_pos, seg.GetEndPosition()-src_pos);
1002  if ( seg.GetType() == CSeqMap::eSeqGap ) {
1003  if ( !m_Randomizer ) {
1004  NCBI_THROW(CSeqVectorException, eCodingError,
1005  "Cannot fill NCBI2na gap without randomizer");
1006  }
1007  x_AppendRandomTo2(dst_str, dst_c, dst_pos, src_pos, count,
1008  *m_Randomizer,
1011  }
1012  else {
1013  const CSeq_data& data = seg.GetRefData();
1014  bool reverse = seg.GetRefMinusStrand();
1015  TCoding src_coding = data.Which();
1016  TCoding dst_coding = CSeq_data::e_Ncbi2na;
1017  INcbi2naRandomizer* randomizer = 0;
1018  if ( src_coding != dst_coding && m_Randomizer) {
1019  randomizer = m_Randomizer.GetPointer();
1020  _ASSERT(randomizer);
1021  dst_coding = CSeq_data::e_Ncbi4na;
1022  }
1023 
1024  const char* table = 0;
1025  if ( dst_coding != src_coding || reverse ||
1026  case_conversion != eCaseConversion_none ) {
1027  table = sx_GetConvertTable(src_coding, dst_coding,
1028  reverse, case_conversion);
1029  if ( !table && src_coding != dst_coding ) {
1030  NCBI_THROW_FMT(CSeqVectorException, eCodingError,
1031  "Incompatible sequence codings: "<<
1032  src_coding<<" -> "<<dst_coding);
1033  }
1034  }
1035 
1036  if ( (table && table != sm_TrivialTable) || reverse
1037  || randomizer ) {
1038  TSeqPos dataPos;
1039  if ( reverse ) {
1040  // Revert segment offset
1041  dataPos = seg.GetRefEndPosition() -
1042  (src_pos - seg.GetPosition()) - count;
1043  }
1044  else {
1045  dataPos = seg.GetRefPosition() +
1046  (src_pos - seg.GetPosition());
1047  }
1048  _ASSERT((!randomizer && dst_coding == CSeq_data::e_Ncbi2na) ||
1049  (randomizer && dst_coding == CSeq_data::e_Ncbi4na));
1050  x_AppendAnyTo2(dst_str, dst_c, dst_pos,
1051  data, dataPos, count, table, reverse,
1052  randomizer, src_pos);
1053  }
1054  else {
1055  _ASSERT(dst_coding == CSeq_data::e_Ncbi2na);
1056  TSeqPos dataPos = seg.GetRefPosition() +
1057  (src_pos - seg.GetPosition());
1058  x_Append2To2(dst_str, dst_c, dst_pos,
1059  data.GetNcbi2na().Get(), dataPos, count);
1060  }
1061  }
1062  ++seg;
1063  dst_pos += count;
1064  src_pos += count;
1065  _ASSERT(dst_str.size() == dst_pos>>2);
1066  }
1067  if ( dst_pos&3 ) {
1068  dst_str += char(dst_c << 2*TSeqPos(-TSignedSeqPos(dst_pos)&3));
1069  }
1070 }
1071 
1072 
1075 {
1076  switch (coding) {
1077  case CSeq_data::e_Iupacna: // DNA - N
1078  return case_cvt == eCaseConversion_lower? 'n': 'N';
1079 
1080  case CSeq_data::e_Ncbi8na: // DNA - bit representation
1081  case CSeq_data::e_Ncbi4na:
1082  return 0; // all bits set == any base
1083 
1084  case CSeq_data::e_Ncbieaa: // Proteins - X
1085  case CSeq_data::e_Ncbi8aa: // Protein - numeric representation
1086  return '-';
1087  case CSeq_data::e_Iupacaa:
1088  return case_cvt == eCaseConversion_lower? 'x': 'X';
1089 
1091  return 0;
1092 
1093  case CSeq_data::e_not_set:
1094  return 0; // It's not good to throw an exception here
1095 
1096  case CSeq_data::e_Ncbi2na: // Codings without gap symbols
1097  // Exception is not good here because it conflicts with CSeqVector_CI.
1098  return 0xff;
1099 
1100  case CSeq_data::e_Ncbipaa: //### Not sure about this
1101  case CSeq_data::e_Ncbipna: //### Not sure about this
1102  default:
1103  NCBI_THROW_FMT(CSeqVectorException, eCodingError,
1104  "Can not indicate gap using the selected coding: "<<
1105  coding);
1106  }
1107 }
1108 
1109 
1110 DEFINE_STATIC_FAST_MUTEX(s_ConvertTableMutex2);
1111 
1112 const char*
1114  bool reverse, ECaseConversion case_cvt)
1115 {
1116  CFastMutexGuard guard(s_ConvertTableMutex2);
1117  typedef pair<TCoding, TCoding> TMainConversion;
1118  typedef pair<bool, ECaseConversion> TConversionFlags;
1119  typedef pair<TMainConversion, TConversionFlags> TConversionKey;
1120  typedef vector<char> TConversionTable;
1121  typedef map<TConversionKey, TConversionTable> TTables;
1123 
1124  TConversionKey key;
1125  key.first = TMainConversion(src, dst);
1126  key.second = TConversionFlags(reverse, case_cvt);
1127  TTables::iterator it = tables->find(key);
1128  if ( it != tables->end() ) {
1129  // already created, but may be a stand-in
1130  switch (it->second.size()) {
1131  case 0: return 0; // error -- incompatible codings or the like
1132  case 1: return sm_TrivialTable;
1133  default: return &it->second[0];
1134  }
1135  }
1136  TConversionTable& table = (*tables)[key];
1137  if ( !CSeqportUtil::IsCodeAvailable(src) ||
1139  // invalid types
1140  return 0;
1141  }
1142 
1143  const size_t COUNT = kMax_UChar+1;
1144  const unsigned kInvalidCode = kMax_UChar;
1145 
1146  pair<unsigned, unsigned> srcIndex = CSeqportUtil::GetCodeIndexFromTo(src);
1147  if ( srcIndex.second >= COUNT ) {
1148  // too large range
1149  return 0;
1150  }
1151 
1152  if ( reverse ) {
1153  // check if src needs complement conversion
1154  try {
1155  CSeqportUtil::GetIndexComplement(src, srcIndex.first);
1156  }
1157  catch ( exception& /*noComplement*/ ) {
1158  reverse = false;
1159  }
1160  }
1161  if ( case_cvt != eCaseConversion_none ) {
1162  // check if dst is text format
1163  if ( dst != CSeq_data::e_Iupacaa &&
1164  dst != CSeq_data::e_Iupacna &&
1165  dst != CSeq_data::e_Ncbieaa ) {
1166  case_cvt = eCaseConversion_none;
1167  }
1168  }
1169 
1170  if ( dst != src ) {
1171  pair<unsigned, unsigned> dstIndex =
1173  if ( dstIndex.second >= COUNT ) {
1174  // too large range
1175  return 0;
1176  }
1177 
1178  try {
1179  // check for types compatibility
1180  CSeqportUtil::GetMapToIndex(src, dst, srcIndex.first);
1181  }
1182  catch ( exception& /*badType*/ ) {
1183  // incompatible types
1184  return 0;
1185  }
1186  }
1187  else if ( !reverse && case_cvt == eCaseConversion_none ) {
1188  // no need to convert at all
1189  return 0;
1190  }
1191 
1192  table.resize(COUNT, char(kInvalidCode));
1193  bool different = false;
1194  for ( unsigned i = srcIndex.first; i <= srcIndex.second; ++i ) {
1195  try {
1196  unsigned code = i;
1197  if ( reverse ) {
1199  }
1200  if ( dst != src ) {
1201  code = CSeqportUtil::GetMapToIndex(src, dst, code);
1202  }
1203  code = min(kInvalidCode, code);
1204  if ( case_cvt == eCaseConversion_upper ) {
1205  code = toupper((unsigned char) code);
1206  }
1207  else if( case_cvt == eCaseConversion_lower ) {
1208  code = tolower((unsigned char) code);
1209  }
1210  if ( code != i ) {
1211  different = true;
1212  }
1213  table[i] = char(code);
1214  }
1215  catch ( exception& /*noConversion or noComplement*/ ) {
1216  different = true;
1217  }
1218  }
1219  if ( !different ) {
1220  table.resize(1);
1221  return sm_TrivialTable;
1222  }
1223  return &table[0];
1224 }
1225 
1226 
// Identity lookup table: the entry at index i is the byte value i
// (0x00 through 0xff in order).  Code in this file compares a conversion
// table pointer against sm_TrivialTable to recognise a mapping that leaves
// every code unchanged.
1227 const char CSeqVectorTypes::sm_TrivialTable[256] = {
1228  '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
1229  '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
1230  '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
1231  '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
1232  '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',
1233  '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',
1234  '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',
1235  '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',
1236  '\x40', '\x41', '\x42', '\x43', '\x44', '\x45', '\x46', '\x47',
1237  '\x48', '\x49', '\x4a', '\x4b', '\x4c', '\x4d', '\x4e', '\x4f',
1238  '\x50', '\x51', '\x52', '\x53', '\x54', '\x55', '\x56', '\x57',
1239  '\x58', '\x59', '\x5a', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',
1240  '\x60', '\x61', '\x62', '\x63', '\x64', '\x65', '\x66', '\x67',
1241  '\x68', '\x69', '\x6a', '\x6b', '\x6c', '\x6d', '\x6e', '\x6f',
1242  '\x70', '\x71', '\x72', '\x73', '\x74', '\x75', '\x76', '\x77',
1243  '\x78', '\x79', '\x7a', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',
1244  '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',
1245  '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',
1246  '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
1247  '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',
1248  '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',
1249  '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',
1250  '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',
1251  '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',
1252  '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',
1253  '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',
1254  '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',
1255  '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',
1256  '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',
1257  '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',
1258  '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',
1259  '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff'
1260 };
1261 
1262 
1264 {
1265  if ( strand != m_Strand ) {
1266  m_Strand = strand;
1267  x_ResetIterator();
1268  }
1269 }
1270 
1271 
1273 {
1274  if (m_Coding != coding) {
1275  m_Coding = coding;
1276  x_ResetIterator();
1277  }
1278 }
1279 
1280 
1282 {
1284 }
1285 
1286 
1288 {
1290 }
1291 
1292 
1294 {
1295  switch ( coding ) {
1297  SetIupacCoding();
1298  break;
1300  SetNcbiCoding();
1301  break;
1302  default:
1304  break;
1305  }
1306 }
1307 
1308 
1310 {
1311  CRandom random_gen;
1312  x_InitRandomizer(random_gen);
1313 }
1314 
1315 
1317 {
1318  CRandom random_gen(seed);
1319  x_InitRandomizer(random_gen);
1320 }
1321 
1322 
1324 {
1325  x_InitRandomizer(random_gen);
1326 }
1327 
1328 
1330 {
1331  CRef<INcbi2naRandomizer> randomizer(new CNcbi2naRandomizer(random_gen));
1332  SetRandomizeAmbiguities(randomizer);
1333 }
1334 
1335 
1337 {
1338  if ( m_Randomizer != randomizer ) {
1339  m_Randomizer = randomizer;
1340  x_ResetIterator();
1341  }
1342 }
1343 
1344 
1346 {
1348 }
1349 
1350 
static CRef< CScope > m_Scope
CBioseq_Handle –.
CConstRef –.
Definition: ncbiobj.hpp:1266
CScope * GetScopeOrNull(void) const
Definition: heap_scope.cpp:74
CNcbi2naRandomizer –.
Definition: seq_vector.hpp:219
CRandom::
Definition: random_gen.hpp:66
CSafeStatic<>::
CScope –.
Definition: scope.hpp:92
Iterator over CSeqMap.
Definition: seq_map_ci.hpp:252
CSeqMap –.
Definition: seq_map.hpp:93
SeqVector related exceptions.
CSeqVector –.
Definition: seq_vector.hpp:65
static TPair GetCodeIndexFromTo(CSeq_data::E_Choice code_type)
static bool IsCodeAvailable(CSeq_data::E_Choice code_type)
static TIndex GetIndexComplement(CSeq_data::E_Choice code_type, TIndex idx)
static TIndex GetMapToIndex(CSeq_data::E_Choice from_type, CSeq_data::E_Choice to_type, TIndex from_idx)
Definition: map.hpp:338
char data[12]
Definition: iconv.c:80
void reset(element_type *p=0, EOwnership ownership=eTakeOwnership)
Reset will delete the old pointer (if owned), set content to the new value, and assume the ownership ...
Definition: ncbimisc.hpp:480
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
int TSignedSeqPos
Type for signed sequence position.
Definition: ncbimisc.hpp:887
element_type * get(void) const
Get pointer.
Definition: ncbimisc.hpp:469
#define _DEBUG_ARG(arg)
Definition: ncbidbg.hpp:134
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
#define NCBI_THROW_FMT(exception_class, err_code, message)
The same as NCBI_THROW but with message processed as output to ostream.
Definition: ncbiexpt.hpp:719
const CSeq_id * GetId(void) const
Get the id of the location; returns NULL if it has multiple ids or no id at all.
Definition: Seq_loc.hpp:941
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
TSeqPos GetBioseqLength(void) const
EVectorCoding
CSeqVector constructor flags.
TMol GetSequenceType(void) const
@ eCoding_Ncbi
Set coding to binary coding (Ncbi4na or Ncbistdaa)
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
virtual void RandomizeData(char *buffer, size_t count, TSeqPos pos)=0
Convert count unpacked bases in buffer 4na -> 2na with randomization.
TSeqPos GetEndPosition(void) const
return end position of current segment in sequence (exclusive)
Definition: seq_map_ci.hpp:679
const CSeq_data & GetRefData(void) const
Will allow any data segments; the user should check for position and strand.
Definition: seq_map_ci.cpp:282
static const char sm_TrivialTable[256]
SSeqMapSelector & SetLinkUsedTSE(bool link=true)
Definition: seq_map_ci.hpp:157
TSeqPos GetRefPosition(void) const
Definition: seq_map_ci.hpp:693
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer)
Fill the buffer string with the sequence data for the interval [start, stop).
TSeqPos GetGapSizeForward(void) const
returns number of gap symbols ahead including current symbol returns 0 if current position is not in ...
CConstRef< CSeq_literal > GetGapSeq_literal(void) const
Returns gap Seq-data object ref; returns null if it's not a gap or an unspecified gap.
unsigned char TResidue
bool GetRefMinusStrand(void) const
Definition: seq_map_ci.hpp:700
CSeqMap::ESegmentType GetType(void) const
Definition: seq_map_ci.hpp:651
static const char * sx_GetConvertTable(TCoding src, TCoding dst, bool reverse, ECaseConversion case_cvt)
bool CanGetRange(TSeqPos start, TSeqPos stop)
Check if the sequence can be obtained for the interval [start, stop)
TSeqPos GetRefEndPosition(void) const
Definition: seq_map_ci.hpp:707
SSeqMapSelector & SetStrand(ENa_strand strand)
Set strand to iterate over.
Definition: seq_map_ci.hpp:144
static TResidue sx_GetGapChar(TCoding coding, ECaseConversion case_cvt)
TSeqPos GetPosition(void) const
return position of current segment in sequence
Definition: seq_map_ci.hpp:665
AutoPtr< CSeqVector_CI > m_Iterator
Definition: seq_vector.hpp:209
TCoding GetCoding(void) const
Target sequence coding.
Definition: seq_vector.hpp:312
friend class CSeqVector_CI
Definition: seq_vector.hpp:179
bool CanGetRange(TSeqPos start, TSeqPos stop) const
Check if the sequence data is available for the interval [start, stop).
Definition: seq_vector.cpp:292
CSeqVector & operator=(const CSeqVector &vec)
Definition: seq_vector.cpp:244
CConstRef< CSeqMap > m_SeqMap
Definition: seq_vector.hpp:200
TCoding m_Coding
Definition: seq_vector.hpp:205
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
Definition: seq_vector.cpp:304
CSeqVector_CI & x_GetIterator(TSeqPos pos) const
Definition: seq_vector.hpp:249
CTSE_Handle m_TSE
Definition: seq_vector.hpp:201
CSeqVector(void)
Definition: seq_vector.cpp:134
void SetNoAmbiguities(void)
CHeapScope m_Scope
Definition: seq_vector.hpp:199
void x_InitRandomizer(CRandom &random_gen)
CRef< INcbi2naRandomizer > m_Randomizer
Definition: seq_vector.hpp:206
TSeqPos m_Size
Definition: seq_vector.hpp:202
virtual ~CSeqVector(void)
Definition: seq_vector.cpp:239
CConstRef< CSeq_literal > GetGapSeq_literal(TSeqPos pos) const
Returns gap Seq-literal object ref; returns null if it's not a gap or an unspecified gap.
Definition: seq_vector.cpp:285
void x_ResetIterator(void) const
Definition: seq_vector.cpp:269
CSeqVector_CI * x_CreateIterator(TSeqPos pos) const
Definition: seq_vector.cpp:261
TSeqPos size(void) const
Definition: seq_vector.hpp:291
TMutex & GetMutex(void) const
Get mutex for a few non-MT-safe methods to make them MT-safe at a cost of performance.
Definition: seq_vector.hpp:263
CNcbi2naRandomizer(CRandom &gen)
Definition: seq_vector.cpp:63
bool IsProtein(void) const
Definition: seq_vector.hpp:350
TMol GetMol(void) const
Definition: seq_map.hpp:492
void SetCoding(TCoding coding)
void SetIupacCoding(void)
Set coding to either Iupacaa or Iupacna depending on molecule type.
ENa_strand m_Strand
Definition: seq_vector.hpp:204
char m_RandomTable[16][kRandomDataSize]
Definition: seq_vector.hpp:237
TSeqPos GetLength(CScope *scope) const
Definition: seq_map.hpp:482
void x_GetPacked8SeqData(string &dst_str, TSeqPos src_pos, TSeqPos src_end)
Definition: seq_vector.cpp:821
void SetRandomizeAmbiguities(void)
Randomization of ambiguities and gaps in ncbi2na coding.
TSeqPos GetGapSizeForward(TSeqPos pos) const
returns number of gap symbols ahead including base at position 'pos' returns 0 if the position is not...
Definition: seq_vector.cpp:278
void x_GetPacked2naSeqData(string &dst_str, TSeqPos src_pos, TSeqPos src_end)
Definition: seq_vector.cpp:983
void SetNcbiCoding(void)
Set coding to either Ncbi8aa or Ncbi8na depending on molecule type.
void SetStrand(ENa_strand strand)
void GetPackedSeqData(string &buffer, TSeqPos start=0, TSeqPos stop=kInvalidSeqPos)
Definition: seq_vector.cpp:311
void RandomizeData(char *buffer, size_t count, TSeqPos pos)
Convert count unpacked bases in buffer 4na -> 2na with randomization.
Definition: seq_vector.cpp:112
void x_GetPacked4naSeqData(string &dst_str, TSeqPos src_pos, TSeqPos src_end)
Definition: seq_vector.cpp:912
TResidue GetGapChar(ECaseConversion case_cvt=eCaseConversion_none) const
Return gap symbol corresponding to the selected coding.
Definition: seq_vector.hpp:318
@ fDefaultFlags
Definition: seq_map.hpp:140
@ eSeqGap
gap
Definition: seq_map.hpp:97
TObjectType * GetPointer(void) THROWS_NONE
Get pointer.
Definition: ncbiobj.hpp:998
#define kMax_UChar
Definition: ncbi_limits.h:177
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
#define kMax_UInt
Definition: ncbi_limits.h:185
Uint4 TValue
Type of the generated integer value and/or the seed value.
Definition: random_gen.hpp:69
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
void set_bit(unsigned *dest, unsigned bitpos) noexcept
Set 1 bit in a block.
Definition: bmfunc.h:3721
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
E_Choice
Choice variants.
Definition: Seq_data_.hpp:102
TMol GetMol(void) const
Get the Mol member data.
Definition: Seq_inst_.hpp:612
@ e_not_set
No variant selected.
Definition: Seq_data_.hpp:103
@ e_Ncbipna
nucleic acid probabilities
Definition: Seq_data_.hpp:109
@ e_Ncbieaa
extended ASCII 1 letter aa codes
Definition: Seq_data_.hpp:111
@ e_Ncbistdaa
consecutive codes for std aas
Definition: Seq_data_.hpp:113
@ e_Ncbi2na
2 bit nucleic acid code
Definition: Seq_data_.hpp:106
@ e_Iupacna
IUPAC 1 letter nuc acid code.
Definition: Seq_data_.hpp:104
@ e_Ncbipaa
amino acid probabilities
Definition: Seq_data_.hpp:112
@ e_Ncbi8na
8 bit extended nucleic acid code
Definition: Seq_data_.hpp:108
@ e_Ncbi4na
4 bit nucleic acid code
Definition: Seq_data_.hpp:107
@ e_Iupacaa
IUPAC 1 letter amino acid code.
Definition: Seq_data_.hpp:105
@ e_Ncbi8aa
8 bit extended amino acid codes
Definition: Seq_data_.hpp:110
<!DOCTYPE HTML >< html > n< header > n< title > PubSeq Gateway Help Page</title > n< style > n table
@ e_not_set
int i
const struct ncbi::grid::netcache::search::fields::KEY key
static size_t rnd(size_t minimal, size_t maximal)
int tolower(Uchar c)
Definition: ncbictype.hpp:72
int toupper(Uchar c)
Definition: ncbictype.hpp:73
Multi-threading – mutexes; rw-locks; semaphore.
T min(T x_, T y_)
static const unsigned char * tables(int mode)
static pcre_uint8 * buffer
Definition: pcretest.c:1051
static void x_AppendGapTo4(string &dst_str, char &dst_c, TSeqPos dst_pos, TSeqPos count, char gap)
Definition: seq_vector.cpp:466
static void x_AppendAnyTo4(string &dst_str, char &dst_c, TSeqPos dst_pos, const CSeq_data &data, TSeqPos dataPos, TSeqPos total_count, const char *table, bool reverse)
Definition: seq_vector.cpp:687
static void x_AppendAnyTo8(string &dst_str, const CSeq_data &data, TSeqPos dataPos, TSeqPos total_count, const char *table=0, bool reverse=false)
Definition: seq_vector.cpp:624
static void x_Append2To2(string &dst, char &dst_c, TSeqPos dst_pos, const vector< char > &src, TSeqPos src_pos, TSeqPos count)
Definition: seq_vector.cpp:542
static void x_Append8To2(string &dst_str, char &dst_c, TSeqPos dst_pos, const char *buffer, TSeqPos count)
Definition: seq_vector.cpp:491
static const size_t kBufferSize
Definition: seq_vector.cpp:351
static void x_AppendAnyTo2(string &dst_str, char &dst_c, TSeqPos dst_pos, const CSeq_data &data, TSeqPos dataPos, TSeqPos total_count, const char *table, bool reverse, INcbi2naRandomizer *randomizer, TSeqPos randomizer_pos)
Definition: seq_vector.cpp:752
static void x_AppendGapTo8(string &dst_str, size_t count, char gap)
Definition: seq_vector.cpp:378
static void x_Append8To8(string &dst_str, const string &src_str, size_t src_pos, size_t count)
Definition: seq_vector.cpp:354
static void x_Append4To4(string &dst, char &dst_c, TSeqPos dst_pos, const vector< char > &src, TSeqPos src_pos, TSeqPos count)
Definition: seq_vector.cpp:411
static void x_AppendRandomTo2(string &dst_str, char &dst_c, TSeqPos dst_pos, TSeqPos src_pos, TSeqPos count, INcbi2naRandomizer &randomizer, char gap)
Definition: seq_vector.cpp:601
static void x_Append8To4(string &dst, char &dst_c, TSeqPos dst_pos, const char *src, size_t count)
Definition: seq_vector.cpp:387
DEFINE_STATIC_FAST_MUTEX(s_ConvertTableMutex2)
void copy_8bit_any(DstIter dst, size_t count, const SrcCont &srcCont, size_t srcPos, const char *table, bool reverse)
void copy_4bit_any(DstIter dst, size_t count, const SrcCont &srcCont, size_t srcPos, const char *table, bool reverse)
void copy_2bit_any(DstIter dst, size_t count, const SrcCont &srcCont, size_t srcPos, const char *table, bool reverse)
void copy_2bit(DstIter dst, size_t count, const SrcCont &srcCont, size_t srcPos)
Selector used in CSeqMap methods returning iterators.
Definition: seq_map_ci.hpp:113
Definition: inftrees.h:24
#define _ASSERT
static int seed
Definition: test_table.cpp:132
CScope & GetScope()
Modified on Thu Jul 11 17:55:21 2024 by modify_doxy.py rev. 669887