98 string errstr =
string(
"CAlnVec::GetBioseqHandle(): ")
99 +
"Seq-id cannot be resolved: "
113 seq_vec = iter->second;
160 for (
int i=0;
i<chunk_vec->size();
i++) {
163 if (chunk->GetType() &
fSeq) {
166 seq_vec.
GetSeqData(chunk->GetRange().GetFrom(),
167 chunk->GetRange().GetTo() + 1,
170 seq_vec.
GetSeqData(seq_vec_size - chunk->GetRange().GetTo() - 1,
171 seq_vec_size - chunk->GetRange().GetFrom(),
180 const int n = chunk->GetAlnRange().GetLength();
181 char* ch_buff =
new char[
n+1];
189 memset(ch_buff, fill_ch,
n);
204 unsigned int scrn_width,
217 scrn_lft_seq_pos = -1,
218 scrn_rgt_seq_pos = -1,
222 int pos, nscrns,
delta;
232 const bool record_inserts = insert_starts && insert_lens;
233 const bool record_coords = scrn_width && scrn_lefts && scrn_rights;
244 for (seg = 0, pos =
row, aln_pos = 0, anchor_pos =
m_Anchor;
250 len = seg_len * width;
252 if (anchored &&
m_Starts[anchor_pos] < 0) {
255 if (record_inserts) {
261 insert_lens->pop_back();
262 insert_lens->push_back(ttl_len);
264 insert_starts->pop_back();
265 insert_starts->push_back(start);
268 prev_aln_pos = aln_pos / width;
270 insert_starts->push_back(start);
271 insert_aln_starts->push_back(prev_aln_pos);
272 insert_lens->push_back(
len);
280 stop = start +
len - 1;
286 if (buf_len < seg_len) {
288 buf_len = seg_len - buf_len;
291 if (seg < left_seg || seg > right_seg) {
297 for (
size_t i = 0;
i < buf_len; ++
i) {
304 if (scrn_lft_seq_pos < 0) {
305 scrn_lft_seq_pos =
plus ? start : stop;
306 if (scrn_rgt_seq_pos < 0) {
307 scrn_rgt_seq_pos = scrn_lft_seq_pos;
311 nscrns = (aln_pos - scrn_pos) / scrn_width;
312 for (
int i = 0;
i < nscrns;
i++) {
313 scrn_lefts->push_back(scrn_lft_seq_pos);
314 scrn_rights->push_back(scrn_rgt_seq_pos);
316 scrn_lft_seq_pos =
plus ? start : stop;
318 scrn_pos += scrn_width;
321 scrn_lft_seq_pos =
plus ? start : stop;
324 nscrns = (aln_pos +
len - scrn_pos) / scrn_width;
326 for (
int i = 0;
i < nscrns;
i++) {
328 scrn_width - (curr_pos - scrn_pos) :
329 curr_pos - scrn_pos - scrn_width);
331 scrn_lefts->push_back(scrn_lft_seq_pos);
333 scrn_lft_seq_pos < start :
334 scrn_lft_seq_pos > stop) {
335 scrn_lft_seq_pos = (
plus ? start : stop) +
337 scrn_rgt_seq_pos = scrn_lft_seq_pos +
340 scrn_rgt_seq_pos = scrn_lft_seq_pos + (
plus ? -1 : 1)
342 scrn_lft_seq_pos +=
delta;
344 if (seg == left_seg &&
345 scrn_lft_seq_pos == scrn_rgt_seq_pos) {
352 scrn_rights->push_back(scrn_rgt_seq_pos);
353 curr_pos = scrn_pos += scrn_width;
355 if (aln_pos +
len <= scrn_pos) {
356 scrn_lft_seq_pos = -1;
358 scrn_rgt_seq_pos =
plus ? stop : start;
365 if (seg < left_seg || seg > right_seg) {
371 for (
size_t i = 0;
i < seg_len; ++
i) {
383 TSeqPos pos_diff = aln_pos - scrn_pos;
385 nscrns = pos_diff / scrn_width;
386 if (pos_diff % scrn_width) {
389 for (
int i = 0;
i < nscrns;
i++) {
390 scrn_lefts->push_back(scrn_lft_seq_pos);
391 scrn_rights->push_back(scrn_rgt_seq_pos);
393 scrn_lft_seq_pos = scrn_rgt_seq_pos;
395 scrn_pos += scrn_width;
419 vector<string>* consens)
const
421 consensus_seq.
Reset();
433 if (consens ==
NULL) {
457 for (
i = 0;
i < consens->size(); ++
i) {
459 for (j = 0; j < (size_t)
m_NumRows; ++j) {
472 if ((*consens)[
i].length() != 0) {
473 new_ds->
SetStarts().push_back(total_bases);
483 data += (*consens)[
i];
487 for (
i = 0;
i <
m_Ids.size(); ++
i) {
496 id->Assign(consensus_id);
497 consensus_seq.
SetId().push_back(
id);
499 new_ds->
SetIds().push_back(
id);
504 desc.
Set().push_back(d);
505 d->
SetComment(
"This is a generated consensus sequence");
523 consensus_row =
int(new_ds->
GetIds().size()) - 1;
531 size_t rows = segs.size();
533 for (
size_t row = 0;
row < rows; ++
row) {
534 const string& s = segs[
row];
541 buf =
new char[(rows+1)*(cols+1)];
543 const char* src = s.c_str();
544 char* dst =
buf+(
row-gap_rows);
545 while ((*dst = *src++)) {
550 for (
size_t col = 0; col < cols; ++col) {
551 char* col_buf =
buf + col*(rows+1);
552 *(col_buf+(rows-gap_rows)) = 0;
553 segs.push_back(
string(col_buf));
562 fill_n(base_count, numBases, 0);
564 const char*
i = col.c_str();
640 fill_n(base_count, numBases, 0);
642 const char*
i = col.c_str();
646 if (0<=pos && pos < numBases)
655 const int numBases = isNucleotide ? 4 : 26;
663 for (
size_t j = 0; j < (size_t)
m_NumSegs; ++j) {
674 if ( gap_count > gap_seg_thresh )
684 consens[j].resize(
m_Lens[j]);
711 for (
int k = 0; k < numBases; ++k) {
721 if (rev_map.count(rev_map.begin()->first) == 1 &&
722 rev_map.begin()->first >= base_thresh) {
723 consens[j][
i] = isNucleotide ?
724 ToIupac(rev_map.begin()->second) :
725 (rev_map.begin()->second+
'A');
730 unsigned char c = 0x00;
732 TRevMap::iterator curr = rev_map.begin();
733 TRevMap::iterator
prev = rev_map.begin();
735 curr != rev_map.end() &&
736 (freq < base_thresh ||
prev->first == curr->first);
743 unsigned char cur_char = curr->second+
'A';
749 c = (cur_char ==
'N' || cur_char ==
'D') ?
'B' :
'X';
752 c = (cur_char ==
'Q' || cur_char ==
'E') ?
'Z' :
'X';
755 c = (cur_char ==
'I' || cur_char ==
'L') ?
'J' :
'X';
767 consens[j][
i] = isNucleotide ?
'N' :
'X';
769 consens[j][
i] = isNucleotide ?
ToIupac(c) : c;
779 size_t segment_row_index = segment*
m_NumRows;
780 for (
size_t i = 0;
i < (size_t)
m_NumRows; ++
i, ++segment_row_index) {
802 const CSeq_id& consensus_id)
const
806 *bioseq, consensus_id);
827 bool s1_is_prot,
bool s2_is_prot,
828 int gen_code1,
int gen_code2)
831 if (s1_is_prot == s2_is_prot && s1.length() != s2.length()) {
833 "CAlnVec::CalculateScore(): "
834 "Strings should have equal lenghts.");
835 }
else if (s1.length() * (s1_is_prot ? 1 : 3) !=
836 s2.length() * (s2_is_prot ? 1 : 3)) {
838 "CAlnVec::CalculateScore(): "
839 "Strings lengths do not match.");
844 const unsigned char * res1 = (
unsigned char *) s1.c_str();
845 const unsigned char * res2 = (
unsigned char *) s2.c_str();
846 const unsigned char * end1 = res1 + s1.length();
847 const unsigned char * end2 = res2 + s2.length();
849 static bool s_FullScoreMatrixInitialized =
false;
850 if (s1_is_prot && s2_is_prot) {
851 if ( !s_FullScoreMatrixInitialized ) {
852 s_FullScoreMatrixInitialized =
true;
857 for ( ; res1 != end1; res1++, res2++) {
862 }
else if ( !s1_is_prot && !s2_is_prot ) {
864 for ( ; res1 != end1; res1++, res2++) {
865 if (*res1 == *res2) {
875 for ( ; res1 != end1; res1++, res2++) {
882 for ( ; res2 != end2; res1++, res2++) {
899 "CAlnVec::TranslateNAToAA(): "
900 "NA size expected to be divisible by 3");
905 size_t na_size = na.size();
908 aa.resize(na_size / 3);
913 for (
size_t na_i = 0; na_i < na_size; ) {
914 for (
size_t i = 0;
i < 3;
i++) {
929 TNumrow index1 = row1, index2 = row2;
951 if (start1 >=0 && start2 >= 0) {
986 bool gaps_in_count)
const
1012 string na_buff, aa_buff;
1025 if (residue_count) {
1042 if (gaps_in_count && residue_count) {
1057 residue_cnt.resize(16, 0);
1061 int max = 0, total = 0;
1069 return 100 *
max / total;
static CRef< CScope > m_Scope
User-defined methods of the data storage class.
static SNCBIFullScoreMatrix s_FullScoreMatrix
bool IsSetAnchor(void) const
list< TSeqPos > TSeqPosList
int GetWidth(TNumrow row) const
const TNumseg & x_GetSeqLeftSeg(TNumrow row) const
const CDense_seg::TStarts & m_Starts
TSegTypeFlags GetSegType(TNumrow row, TNumseg seg, int offset=0) const
TSignedSeqPos GetStart(TNumrow row, TNumseg seg, int offset=0) const
const CSeq_id & GetSeqId(TNumrow row) const
bool IsPositiveStrand(TNumrow row) const
const CDense_seg::TIds & m_Ids
TNumseg GetSeg(TSeqPos aln_pos) const
TDim GetNumRows(void) const
const CDense_seg::TStrands & m_Strands
CConstRef< CDense_seg > m_DS
const TNumseg & x_GetSeqRightSeg(TNumrow row) const
CRef< CAlnChunkVec > GetAlnChunks(TNumrow row, const TSignedRange &range, TGetChunkFlags flags=fAlnSegsOnly) const
unsigned int TSegTypeFlags
TSeqPos GetAlnStart(void) const
TSeqPos GetLen(TNumseg seg, int offset=0) const
CDense_seg::TNumseg TNumseg
TSeqPos GetAlnStop(void) const
const CDense_seg::TLens & m_Lens
static void CollectNucleotideFrequences(const string &col, int base_count[], int numBases)
CAlnVec(const CDense_seg &ds, CScope &scope)
const CBioseq_Handle & GetBioseqHandle(TNumrow row) const
TResidue GetGapChar(TNumrow row) const
string & GetSeqString(string &buffer, TNumrow row, TSeqPos seq_from, TSeqPos seq_to) const
TResidue GetEndChar() const
static void TranslateNAToAA(const string &na, string &aa, int gen_code=kDefaultGenCode)
int GetGenCode(TNumrow row) const
string & GetColumnVector(string &buffer, TSeqPos aln_pos, TResidueCount *residue_count=0, bool gaps_in_count=false) const
string & GetWholeAlnSeqString(TNumrow row, string &buffer, TSeqPosList *insert_aln_starts=0, TSeqPosList *insert_starts=0, TSeqPosList *insert_lens=0, unsigned int scrn_width=0, TSeqPosList *scrn_lefts=0, TSeqPosList *scrn_rights=0) const
CSeqVector & x_GetSeqVector(TNumrow row) const
CScope & GetScope(void) const
static void CollectProteinFrequences(const string &col, int base_count[], int numBases)
static unsigned char ToIupac(unsigned char c)
CRef< CDense_seg > CreateConsensus(int &consensus_row) const
string & GetAlnSeqString(string &buffer, TNumrow row, const CAlnMap::TSignedRange &aln_rng) const
int CalculateScore(TNumrow row1, TNumrow row2) const
void RetrieveSegmentSequences(size_t segment, vector< string > &segs) const
TBioseqHandleCache m_BioseqHandlesCache
vector< int > TResidueCount
static unsigned char FromIupac(unsigned char c)
TSeqVectorCache m_SeqVectorCache
int CalculatePercentIdentity(TSeqPos aln_pos) const
static void TransposeSequences(vector< string > &segs)
static const CTrans_table & GetTransTable(int id)
@Seq_descr.hpp User-defined methods of the data storage class.
char GetCodonResidue(int state) const
static int NextCodonState(int state, unsigned char ch)
container_type::iterator iterator
const_iterator end() const
const_iterator find(const key_type &key) const
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
static const char * column
unsigned int TSeqPos
Type for sequence locations and lengths.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
int TSignedSeqPos
Type for signed sequence position.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
const string AsFastaString(void) const
CSeq_entry_Handle AddTopLevelSeqEntry(CSeq_entry &top_entry, TPriority pri=kPriority_Default, EExist action=eExist_Default)
Add seq_entry, default priority is higher than for defaults or loaders Add object to the score with p...
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
bool IsNucleotide(void) const
TBioseqCore GetBioseqCore(void) const
Get bioseq core structure.
CSeqVector GetSeqVector(EVectorCoding coding, ENa_strand strand=eNa_strand_plus) const
Get sequence: Iupacna or Iupacaa if use_iupac_coding is true.
@ eStrand_Plus
Plus strand.
@ eStrand_Minus
Minus strand.
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
bool IsProtein(void) const
void SetCoding(TCoding coding)
void SetIupacCoding(void)
Set coding to either Iupacaa or Iupacna depending on molecule type.
bool IsNucleotide(void) const
void Reset(void)
Reset reference object.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
TLens & SetLens(void)
Assign a value to Lens data member.
void SetDim(TDim value)
Assign a value to Dim data member.
TStarts & SetStarts(void)
Assign a value to Starts data member.
TStrands & SetStrands(void)
Assign a value to Strands data member.
void SetNumseg(TNumseg value)
Assign a value to Numseg data member.
const TIds & GetIds(void) const
Get the Ids member data.
TIds & SetIds(void)
Assign a value to Ids data member.
TSeq & SetSeq(void)
Select the variant.
TId & SetId(void)
Assign a value to Id data member.
const TInst & GetInst(void) const
Get the Inst member data.
TIupacna & SetIupacna(void)
Select the variant.
TMol GetMol(void) const
Get the Mol member data.
TComment & SetComment(void)
Select the variant.
void SetInst(TInst &value)
Assign a value to Inst data member.
virtual void Reset(void)
Reset the whole object.
void SetDescr(TDescr &value)
Assign a value to Descr data member.
void SetRepr(TRepr value)
Assign a value to Repr data member.
Tdata & Set(void)
Assign a value to data member.
void SetLength(TLength value)
Assign a value to Length data member.
void SetSeq_data(TSeq_data &value)
Assign a value to Seq_data data member.
TIupacaa & SetIupacaa(void)
Select the variant.
void SetMol(TMol value)
Assign a value to Mol data member.
@ eRepr_raw
continuous sequence
@ e_not_set
No variant selected.
@ eMol_na
just a nucleic acid
unsigned int
A callback function used to compare two keys in a database.
double value_type
The numeric datatype used by the parser.
const struct ncbi::grid::netcache::search::fields::SIZE size
Int4 delta(size_t dimension_, const Int4 *score_)
const SNCBIPackedScoreMatrix NCBISM_Blosum62
#define NCBI_FSM_DIM
Recommended approach: unpack and index directly.
void NCBISM_Unpack(const SNCBIPackedScoreMatrix *psm, SNCBIFullScoreMatrix *fsm)
Expand a packed score matrix into an unpacked one, which callers can proceed to index directly by sta...
#define row(bind, expected)