56 const size_t kNrMersTotal (1 << (kNr * 2));
59 const double kRepeatsPercentile (0.995);
68 const char *kFileExt_Masked =
".rep";
71 const char *kFileExt_Remap =
".idc";
72 const char *kFileExt_Offsets =
".ofs";
73 const char *kFileExt_Positions =
".pos";
75 const Uint8 kUI8_LoWord (0xFFFFFFFF);
76 const Uint8 kUI8_LoByte (kUI8_LoWord >> 24);
77 const Uint8 kUI8_HiWord (kUI8_LoWord << 32);
78 const Uint8 kUI8_LoFive (kUI8_LoWord | (kUI8_LoWord << 8));
80 const Uint8 kUI8_LoHalfWordEachByte ((
Uint8(0x0F0F0F0F) << 32) | 0x0F0F0F0F);
82 const Uint4 kUI4_Lo28 (0xFFFFFFFF >> 4);
84 const Uint8 kUI8_SeqDb_lo (0x03030303);
85 const Uint8 kUI8_SeqDb ((kUI8_SeqDb_lo << 32) | kUI8_SeqDb_lo);
89 const size_t kMapGran (512 * 1024 * 1024);
97 case 0: rv =
'A';
break;
98 case 1: rv =
'C';
break;
99 case 2: rv =
'G';
break;
100 case 3: rv =
'T';
break;
106 #ifdef ALGO_ALIGN_SPLIGN_QCOMP_DEBUG
109 string GetBinString(
T v)
113 const size_t bitdim (
sizeof(
T) * 8);
114 for(
size_t i(0);
i < bitdim; ++
i) {
120 for(
size_t i (bitdim);
i > 0; --
i) {
121 if(
i < bitdim &&
i % 8 == 0) {
135 Int8 reported_len (-1);
136 for(
size_t attempt(0); attempt < 1; ++attempt) {
138 if(reported_len >= 0 &&
Uint8(reported_len) == len_bytes) {
148 if(reported_len < 0) {
149 ostr <<
"Cannot write " << filename
150 <<
" (error code = " << reported_len <<
"). ";
153 ostr <<
"The size of " << filename <<
" (" << reported_len <<
')'
154 <<
" is different from the expected " << len_bytes <<
". ";
156 ostr <<
"Please make sure there is enough disk space.";
170 for(
size_t i (0), imax (4*
sizeof(
T)); i < imax; ++i, v >>= 2) {
171 rv = (rv << 2) | (v & 3);
202 for(
size_t i(0), imax(
sizeof(
T));
i < imax; ++
i) {
225 const Uint4 kMaxTwoBaseContent (14);
227 vector<Uint4> counts(4, 0);
229 for(
Uint4 k = 0; k < 14; ++k) {
230 ++counts[0x00000003 &
key];
234 const Uint4 ag (counts[0] + counts[1]);
235 const Uint4 at (counts[0] + counts[2]);
236 const Uint4 ac (counts[0] + counts[3]);
237 const Uint4 gt (counts[1] + counts[2]);
238 const Uint4 gc (counts[1] + counts[3]);
239 const Uint4 tc (counts[2] + counts[3]);
242 ag >= kMaxTwoBaseContent || at >= kMaxTwoBaseContent ||
243 ac >= kMaxTwoBaseContent ||
gt >= kMaxTwoBaseContent ||
244 gc >= kMaxTwoBaseContent || tc >= kMaxTwoBaseContent;
250 string dir, base, ext;
260 string ReplaceExt(
const string& extended_name,
const string & new_ext)
262 string dir, base, ext;
277 template <
typename VectorT>
283 const Uint8 len_bytes (v.size() *
sizeof(TElem));
286 CNcbiOfstream tempcgrfile (filename.c_str(), IOS_BASE::binary);
287 tempcgrfile.write((
const char* ) & v.front(), len_bytes);
297 template <
typename VectorT>
306 CNcbiIfstream tempcgrfile (filename.c_str(), IOS_BASE::binary);
307 tempcgrfile.read((
char* ) & v.front(), dim *
sizeof(TElem));
320 const string sfx (
string(strand?
".p":
".m") +
".v*");
321 const string mask_ofs_q (
m_lbn_q + sfx + kFileExt_Offsets);
325 const string filename ((*ii)->GetPath());
329 for(
const Uint8 * p8 (
reinterpret_cast<const Uint8 *
> (mf.
Map())),
330 * p8e (p8 + vol_length / 8); p8 != p8e;
351 ostr <<
"Sequence volumes with total length exceeding "
353 <<
" are not yet supported. Please split your FASTA file and re-run "
359 Uint4 current_offset (0);
361 unique_ptr<vector<Uint4> > pNrCounts (
new vector<Uint4> (kNrMersTotal, 0));
362 vector<Uint4> & NrCounts (*pNrCounts);
364 cerr <<
" Scanning " << subjdb->
GetNumSeqs() <<
" genomic sequences ... ";
367 const char * pcb (0);
369 const char * pcbe (pcb + bases / 4);
372 if(npcb < npcb0) npcb += 8;
373 const Uint8 gcbase (4*(npcb - npcb0));
374 const Uint8 * pui8 (
reinterpret_cast<const Uint8*
>(npcb));
375 const Uint8 * pui8_e (
reinterpret_cast<const Uint8*
>(pcbe));
377 for(
size_t gccur (current_offset + gcbase); pui8 < pui8_e; ++pui8) {
383 #define QCOMP_COUNT_NrMERS {{ \
384 if(gccur + 16 >= current_offset + bases) { \
387 const Uint4 mer4 (ui8 & kUI8_LoWord); \
388 ++NrCounts[mer4 >> 4]; \
398 if(pui8 + 1 < pui8_e) {
400 ui8 |= (*(pui8 + 1) << 32);
408 #undef QCOMP_COUNT_NMERS
413 current_offset += bases;
418 cerr <<
"Ok" << endl;
419 cerr <<
" Constructing FV ... ";
421 string filename_temp_01;
422 unique_ptr<vector<Uint4> > pNrCounts2 (
new vector<Uint4>);
423 vector<Uint4> & NrCounts2 (* pNrCounts2);
425 NrCounts2 = NrCounts;
433 size_t total_mers (0);
434 ITERATE(vector<Uint4>, ii, NrCounts) {
435 if(*ii > 0) ++total_mers;
437 const size_t percentile ((
size_t)(kNrMersTotal -
438 total_mers * (1 - kRepeatsPercentile)));
439 nth_element(NrCounts.begin(), NrCounts.begin() + percentile, NrCounts.end());
440 const Uint4 max_repeat_count (NrCounts[percentile]);
442 if(filename_temp_01.empty()) {
443 NrCounts = NrCounts2;
451 for(
size_t i (0);
i < kNrMersTotal; ++
i) {
461 const Uint8 len_bytes (kNrMersTotal / 8);
470 copy(src, src + kNrMersTotal / 64, dest);
477 cerr <<
" Reading/transforming FV ... ";
489 for(
size_t i(0);
i < kNrMersTotal; ++
i) {
492 const size_t irc (g_RC(
Uint4(
i) << 4) & kUI4_Lo28);
498 cerr <<
"Ok" << endl;
510 seq_infos.reserve(num_seqs);
512 Uint4 current_offset (0);
516 if(seqlen <= 0 ||
size_t(seqlen) >= 0xFFFFFFFF) {
518 ostr <<
"Cannot create remap data for:\t" <<
519 blastdb.
GetSeqIDs(oid).front()->GetSeqIdString(
true);
524 const Uint4 bases (seqlen);
525 seq_infos.push_back(
SSeqInfo(current_offset, bases, oid));
526 current_offset += bases;
534 CNcbiOfstream ofstr_remap (full_remap_filename.c_str(), IOS_BASE::binary);
536 ofstr_remap.write((
const char*) &seq_infos.front(), len_bytes);
541 cerr <<
" Remap data created for " << db <<
"; max offset = "
542 << current_offset << endl;
551 seq_infos.reserve(num_seqs);
553 Uint4 current_offset (0);
557 if(seqlen <= 0 ||
size_t(seqlen) >= 0xFFFFFFFF) {
559 ostr <<
"Cannot create remap data for:\t" <<
565 const Uint4 bases (seqlen);
567 current_offset += bases;
575 CNcbiOfstream ofstr_remap (full_remap_filename.c_str(), IOS_BASE::binary);
577 ofstr_remap.write((
const char*) &seq_infos.front(), len_bytes);
582 cerr <<
" Remap data created for sequences; max offset = "
583 << current_offset << endl;
592 cerr <<
" Scanning " << db <<
" for N-mers and their positions." << endl;
599 vector<Uint8> MersAndCoords (mcidx_max, 0);
601 size_t current_offset (0);
610 ostr <<
"Sequence volumes with total length exceeding "
612 <<
" are not yet supported. Please split your FASTA file and re-run "
621 const char * pcb (0);
623 const char * pce (pcb + bases/4);
626 if(npcb < npcb0) npcb += 8;
627 const Uint8 gcbase (4*(npcb - npcb0));
628 const Uint8* pui8_start (
reinterpret_cast<const Uint8*
>(npcb));
629 const Uint8* pui8 (pui8_start);
637 if(mcidx > 1000 && mcidx + max_new_elems >= mcidx_max) {
638 MersAndCoords.resize(mcidx);
640 MersAndCoords.assign(mcidx_max, 0);
647 for(
size_t gccur (current_offset + gcbase);
648 gccur + 16 < current_offset + bases && mcidx < mcidx_max;
653 #define QCOMP_PREPARE_SHIFTED_GENOMIC_IDX \
654 size_t gccur2 (gccur + 2); \
655 const Uint8 ui8_2930 (w8 >> 60); \
656 Uint8 ui8_ls4 (w8 << 4); \
657 const Uint8 ui8_mask (ui8_ls4 & kUI8_LoHalfWordEachByte); \
658 ui8_ls4 &= kUI8_LoHalfWordEachByte << 4; \
659 ui8_ls4 |= (ui8_mask >> 16) | (ui8_2930 << 48);
663 #define QCOMP_CREATE_GENOMIC_IDX(w8,gccur) \
665 if(gccur + 16 >= current_offset + bases) { \
668 const Uint8 mer (w8 & kUI8_LoWord); \
670 if(m_Mers.get_at(mer)) { \
671 MersAndCoords[mcidx++] = (mer << 32) | gccur; \
675 const Uint4 rc (g_RC(Uint4(mer))); \
676 if(m_Mers.get_at(rc)) { \
677 MersAndCoords[mcidx++] = (Uint8(rc) << 32) | \
678 ((current_offset + bases - gccur - 16) \
696 if(gccur + 32 >= current_offset + bases) {
701 w8 |= ((*(pui8 + 1)) << 32);
715 #undef QCOMP_PREPARE_SHIFTED_GENOMIC_IDX
716 #undef QCOMP_CREATE_GENOMIC_IDX
725 for(
const char * p (pcb),
726 *pche ( (
reinterpret_cast<const char*
>(pui8)) + 5 );
729 const Uint8 c8 (*p & kUI8_LoByte);
730 const Uint8 ui8curbyte (c8);
732 fivebytes = (fivebytes >> 8) | (ui8curbyte << 32);
736 for(
Uint4 k(0); k < 4; ++k) {
738 const Uint4 mer (fivebytes & kUI8_LoWord);
739 if(
m_Mers.
get_at(strand? (mer >> 4): (mer & kUI4_Lo28))) {
740 const Uint8 ui8 (mer);
741 const Uint8 coord (current_offset +
742 4*(p - pcb - 5) + k);
743 MersAndCoords[mcidx++] = (ui8 << 32) | coord;
746 fivebytes &= kUI8_LoFive;
748 const Uint8 ui8m ((fivebytes & kUI8_SeqDb) >> 16);
749 fivebytes &= ~kUI8_SeqDb;
752 fivebytes |= ui8curbyte << 32;
758 Uint8 gccur (current_offset + gcbase);
759 for(; gccur + 32 < current_offset + bases; ++pui8)
763 for(
Uint8 gccur_hi (gccur + 16); gccur < gccur_hi; ++gccur) {
765 const Uint8 loword = ui8 & kUI8_LoWord;
767 (loword & kUI4_Lo28)))
769 MersAndCoords[mcidx++] = (loword << 32) | gccur;
772 const Uint8 ui8hi2 ((ui8 >> 62) << 48);
774 const Uint8 ui8m ((ui8 & kUI8_SeqDb) >> 16);
776 ui8 |= (ui8m | ui8hi2);
779 if(gccur + 32 < current_offset + bases) {
783 const Uint4 * pui4 (
reinterpret_cast<const Uint4*
>(
n));
786 ui8 |= (ui8_4 << 32);
788 for(
Uint8 gccur_hi (gccur + 16); gccur < gccur_hi; ++gccur) {
790 const Uint8 loword = ui8 & kUI8_LoWord;
792 (loword & kUI4_Lo28)))
794 MersAndCoords[mcidx++] = (loword << 32) | gccur;
797 const Uint8 ui8hi2 ((ui8 >> 62) << 48);
799 const Uint8 ui8m ((ui8 & kUI8_SeqDb) >> 16);
801 ui8 |= (ui8m | ui8hi2);
807 if(gccur + 16 <= current_offset + bases) {
809 fivebytes = (gccur == current_offset + gcbase)? fivebytes:
811 const char * p (
reinterpret_cast<const char*
>(pui8_start) + 4
812 + (gccur - current_offset - gcbase) / 4);
815 const Uint8 loword = fivebytes & kUI8_LoWord;
818 (loword & kUI4_Lo28)))
820 MersAndCoords[mcidx++] = (loword << 32) | gccur;
825 const Uint8 ui8 (*p++ & kUI8_LoByte);
826 fivebytes |= (ui8 << 32);
833 fivebytes &= kUI8_LoFive;
835 const Uint8 ui8m ((fivebytes & kUI8_SeqDb) >> 16);
836 fivebytes &= ~kUI8_SeqDb;
850 current_offset += bases;
852 if(mcidx >= mcidx_max) {
854 ostr <<
"Selected max volume size is too small: "
855 <<
"it must be large enough to fit the index for the "
856 <<
"longest input sequence.";
865 MersAndCoords.resize(mcidx);
878 cerr <<
" Scanning sequences for N-mers and their positions." << endl;
885 vector<Uint8> MersAndCoords (mcidx_max, 0);
887 size_t current_offset (0);
895 ostr <<
"Sequence volumes with total length exceeding "
897 <<
" are not yet supported. Please split your FASTA file and re-run "
906 const char * pcb (0);
908 const char * pce (pcb + bases/4);
911 if(npcb < npcb0) npcb += 8;
912 const Uint8 gcbase (4*(npcb - npcb0));
913 const Uint8* pui8_start (
reinterpret_cast<const Uint8*
>(npcb));
914 const Uint8* pui8 (pui8_start);
922 if(mcidx > 1000 && mcidx + max_new_elems >= mcidx_max) {
923 MersAndCoords.resize(mcidx);
925 MersAndCoords.assign(mcidx_max, 0);
932 for(
size_t gccur (current_offset + gcbase);
933 gccur + 16 < current_offset + bases && mcidx < mcidx_max;
938 #define QCOMP_PREPARE_SHIFTED_GENOMIC_IDX \
939 size_t gccur2 (gccur + 2); \
940 const Uint8 ui8_2930 (w8 >> 60); \
941 Uint8 ui8_ls4 (w8 << 4); \
942 const Uint8 ui8_mask (ui8_ls4 & kUI8_LoHalfWordEachByte); \
943 ui8_ls4 &= kUI8_LoHalfWordEachByte << 4; \
944 ui8_ls4 |= (ui8_mask >> 16) | (ui8_2930 << 48);
948 #define QCOMP_CREATE_GENOMIC_IDX(w8,gccur) \
950 if(gccur + 16 >= current_offset + bases) { \
953 const Uint8 mer (w8 & kUI8_LoWord); \
955 if(m_Mers.get_at(mer)) { \
956 MersAndCoords[mcidx++] = (mer << 32) | gccur; \
960 const Uint4 rc (g_RC(Uint4(mer))); \
961 if(m_Mers.get_at(rc)) { \
962 MersAndCoords[mcidx++] = (Uint8(rc) << 32) | \
963 ((current_offset + bases - gccur - 16) \
981 if(gccur + 32 >= current_offset + bases) {
986 w8 |= ((*(pui8 + 1)) << 32);
1000 #undef QCOMP_PREPARE_SHIFTED_GENOMIC_IDX
1001 #undef QCOMP_CREATE_GENOMIC_IDX
1009 Uint8 fivebytes (0);
1010 for(
const char * p (pcb),
1011 *pche ( (
reinterpret_cast<const char*
>(pui8)) + 5 );
1014 const Uint8 c8 (*p & kUI8_LoByte);
1015 const Uint8 ui8curbyte (c8);
1017 fivebytes = (fivebytes >> 8) | (ui8curbyte << 32);
1021 for(
Uint4 k(0); k < 4; ++k) {
1023 const Uint4 mer (fivebytes & kUI8_LoWord);
1024 if(
m_Mers.
get_at(strand? (mer >> 4): (mer & kUI4_Lo28))) {
1025 const Uint8 ui8 (mer);
1026 const Uint8 coord (current_offset +
1027 4*(p - pcb - 5) + k);
1028 MersAndCoords[mcidx++] = (ui8 << 32) | coord;
1031 fivebytes &= kUI8_LoFive;
1033 const Uint8 ui8m ((fivebytes & kUI8_SeqDb) >> 16);
1034 fivebytes &= ~kUI8_SeqDb;
1037 fivebytes |= ui8curbyte << 32;
1043 Uint8 gccur (current_offset + gcbase);
1044 for(; gccur + 32 < current_offset + bases; ++pui8)
1048 for(
Uint8 gccur_hi (gccur + 16); gccur < gccur_hi; ++gccur) {
1050 const Uint8 loword = ui8 & kUI8_LoWord;
1052 (loword & kUI4_Lo28)))
1054 MersAndCoords[mcidx++] = (loword << 32) | gccur;
1057 const Uint8 ui8hi2 ((ui8 >> 62) << 48);
1059 const Uint8 ui8m ((ui8 & kUI8_SeqDb) >> 16);
1061 ui8 |= (ui8m | ui8hi2);
1064 if(gccur + 32 < current_offset + bases) {
1068 const Uint4 * pui4 (
reinterpret_cast<const Uint4*
>(
n));
1071 ui8 |= (ui8_4 << 32);
1073 for(
Uint8 gccur_hi (gccur + 16); gccur < gccur_hi; ++gccur) {
1075 const Uint8 loword = ui8 & kUI8_LoWord;
1077 (loword & kUI4_Lo28)))
1079 MersAndCoords[mcidx++] = (loword << 32) | gccur;
1082 const Uint8 ui8hi2 ((ui8 >> 62) << 48);
1084 const Uint8 ui8m ((ui8 & kUI8_SeqDb) >> 16);
1086 ui8 |= (ui8m | ui8hi2);
1092 if(gccur + 16 <= current_offset + bases) {
1094 fivebytes = (gccur == current_offset + gcbase)? fivebytes:
1095 (ui8 & kUI8_LoWord);
1096 const char * p (
reinterpret_cast<const char*
>(pui8_start) + 4
1097 + (gccur - current_offset - gcbase) / 4);
1100 const Uint8 loword = fivebytes & kUI8_LoWord;
1103 (loword & kUI4_Lo28)))
1105 MersAndCoords[mcidx++] = (loword << 32) | gccur;
1110 const Uint8 ui8 (*p++ & kUI8_LoByte);
1111 fivebytes |= (ui8 << 32);
1118 fivebytes &= kUI8_LoFive;
1120 const Uint8 ui8m ((fivebytes & kUI8_SeqDb) >> 16);
1121 fivebytes &= ~kUI8_SeqDb;
1135 current_offset += bases;
1137 if(mcidx >= mcidx_max) {
1139 ostr <<
"Selected max volume size is too small: "
1140 <<
"it must be large enough to fit the index for the "
1141 <<
"longest input sequence.";
1148 MersAndCoords.resize(mcidx);
1161 vector<Uint8>& MersAndCoords)
1163 const size_t t0 (time(0));
1175 CNcbiOfstream ofstr_offs (filename_offs.c_str(), IOS_BASE::binary);
1176 Uint8 bytes_offs (0);
1179 CNcbiOfstream ofstr_positions (filename_positions.c_str(), IOS_BASE::binary);
1180 Uint8 bytes_positions (0);
1182 cerr <<
" Generating index volume: " <<
basename <<
" ... ";
1187 sort(MersAndCoords.begin(), MersAndCoords.end());
1189 ITERATE(vector<Uint8>, ii, MersAndCoords) {
1191 const Uint4 mer ((*ii & kUI8_HiWord) >> 32);
1194 ofstr_offs.write((
const char*) &mer,
sizeof mer);
1195 ofstr_offs.write((
const char*) &curofs,
sizeof curofs);
1197 bytes_offs +=
sizeof(mer) +
sizeof(curofs);
1199 const Uint4 pos (*ii & kUI8_LoWord);
1201 ofstr_positions.write((
const char*) &pos,
sizeof pos);
1202 bytes_positions +=
sizeof(pos);
1207 const Uint4 mer (0);
1208 ofstr_offs.write((
const char*)&mer,
sizeof mer);
1209 ofstr_offs.write((
const char*)&curofs,
sizeof curofs);
1210 bytes_offs +=
sizeof(mer) +
sizeof(curofs);
1213 ofstr_positions.close();
1218 cerr <<
"Ok" << endl;
1220 return time(0) - t0;
1223 #define CHECK_MEMMAP(mm,ofs,len) \
1225 const size_t ofs1 (mm.GetOffset()); \
1227 cerr << "Real offset " << ofs1 << " different from " << ofs << endl; \
1229 const size_t len1 (mm.GetSize()); \
1231 cerr << "Real length " << len1 << " different from " << len << endl; \
1240 cerr <<
" Matching (strand = " << (strand?
"plus":
"minus") <<
") ... ";
1245 const string sfx (
string(strand?
".p":
".m") +
".v*");
1247 const string mask_ofs_q (
m_lbn_q + sfx + kFileExt_Offsets);
1248 const string mask_ofs_s (
m_lbn_s + sfx + kFileExt_Offsets);
1252 Uint8 elem_hits_total (0);
1256 const string filename_ofs_s ((*ii_s)->GetPath());
1257 const string filename_pos_s (
ReplaceExt(filename_ofs_s, kFileExt_Positions));
1258 const CFile file_subj (filename_ofs_s);
1259 const size_t dim_ofs_s (file_subj.
GetLength() / 8);
1263 const string filename_ofs_q ((*ii_q)->GetPath());
1264 const string filename_pos_q (
ReplaceExt(filename_ofs_q,
1265 kFileExt_Positions));
1267 Uint8 hit_index_dim (0), elem_hits_this_pair (0);
1268 string filename_hit_index;
1274 const Uint8 * ofs_s (
reinterpret_cast<const Uint8*
>
1277 const CFile file_query ((*ii_q)->GetPath());
1278 const size_t dim_ofs_q (file_query.
GetLength() / 8);
1281 const Uint8 * ofs_q (
reinterpret_cast<const Uint8*
>
1286 hit_index.reserve(
min(dim_ofs_q, dim_ofs_s));
1288 while((*ofs_q & kUI8_LoWord) && (*ofs_s & kUI8_LoWord)) {
1290 while( (*ofs_q & kUI8_LoWord)
1291 && ((*ofs_q & kUI8_LoWord) < (*ofs_s & kUI8_LoWord)))
1295 if((*ofs_q & kUI8_LoWord) == 0)
break;
1297 while( (*ofs_s & kUI8_LoWord)
1298 && ((*ofs_s & kUI8_LoWord) < (*ofs_q & kUI8_LoWord)))
1302 if((*ofs_s & kUI8_LoWord) == 0)
break;
1304 if((*ofs_s & kUI8_LoWord) == (*ofs_q & kUI8_LoWord)) {
1326 hit_index_dim = (hit_index.size());
1327 elem_hits_total += elem_hits_this_pair;
1332 const CFile file_pos_s (filename_pos_s);
1333 const size_t dim_pos_s (file_pos_s.
GetLength() / 4);
1336 size_t map_offset_s (0);
1337 size_t map_length_s (
min(kMapGran, 4*dim_pos_s - map_offset_s));
1338 const Uint4 * pos_s (
reinterpret_cast<const Uint4*
>(
1339 mf_pos_s.
Map(map_offset_s, map_length_s)));
1341 const CFile file_pos_q (filename_pos_q);
1342 const size_t dim_pos_q (file_pos_q.
GetLength() / 4);
1345 size_t map_offset_q (0);
1346 size_t map_length_q (
min(kMapGran, 4*dim_pos_q - map_offset_q));
1347 const Uint4 * pos_q (
reinterpret_cast<const Uint4*
>(
1348 mf_pos_q.
Map(map_offset_q, map_length_q)));
1351 vector<Uint8> hits (elem_hits_this_pair);
1354 CNcbiIfstream ifstr (filename_hit_index.c_str(), IOS_BASE::binary);
1355 for(
size_t cnt(0);
cnt < hit_index_dim; ++
cnt) {
1358 ifstr.read((
char*) &hie,
sizeof(hie));
1362 if(idx_s_max > dim_pos_s || idx_q_max > dim_pos_q) {
1366 if(4*idx_s_max >= map_offset_s + map_length_s) {
1369 map_length_s =
min(kMapGran, 4*dim_pos_s - map_offset_s);
1371 pos_s = (
reinterpret_cast<const Uint4*
>
1372 (mf_pos_s.
Map(map_offset_s, map_length_s)));
1375 if(4*idx_q_max >= map_offset_q + map_length_q) {
1378 map_length_q =
min(kMapGran, 4*dim_pos_q - map_offset_q);
1380 pos_q = (
reinterpret_cast<const Uint4*
>
1381 (mf_pos_q.
Map(map_offset_q, map_length_q)));
1384 for(
size_t idx_s (hie.
m_SubjOfs); idx_s < idx_s_max; ++idx_s) {
1386 Uint8 hiword = pos_s[idx_s - map_offset_s/4];
1388 for(
size_t idx_q (hie.
m_QueryOfs); idx_q < idx_q_max; ++idx_q) {
1390 const Uint8 loword = pos_q[idx_q - map_offset_q/4];
1391 hits[hidx++] = (hiword | loword);
1396 if(hidx != elem_hits_this_pair) {
1398 ostr <<
"The number of hits found (" << hidx
1399 <<
") does not match the expected " << elem_hits_this_pair;
1414 cerr <<
"Ok" << endl;
1422 return (lhs & kUI8_LoWord) < (rhs & kUI8_LoWord);
1428 const double lhs_q (
double(lhs & kUI8_LoWord) / 2);
1429 const double lhs_s (
double(lhs >> 32) / 2);
1430 const double rhs_q (
double(rhs & kUI8_LoWord) / 2);
1431 const double rhs_s (
double(rhs >> 32) / 2);
1433 const double lhs_q1 (lhs_q + lhs_s);
1434 const double lhs_s1 (lhs_s - lhs_q);
1435 const double rhs_q1 (rhs_q + rhs_s);
1436 const double rhs_s1 (rhs_s - rhs_q);
1438 if(lhs_s1 == rhs_s1) {
1439 return lhs_q1 < rhs_q1;
1442 return lhs_s1 < rhs_s1;
1459 const string mask (lbn +
"*" + *ii);
1462 fdl.
Add((*jj)->GetPath());
1470 if(phits->size() == 0) {
1475 sort(phits->begin(), phits->end());
1479 TSeqInfos::const_iterator ii_genomic (ii_genomic_b);
1487 size_t idx_compacted (0);
1488 for(
size_t idx (0), idx_hi (phits->size()); idx < idx_hi; ) {
1490 Uint4 gc_s (((*phits)[idx]) >> 32);
1493 ii_genomic = lower_bound(ii_genomic + 1, ii_genomic_e,
SSeqInfo(gc_s, 0, 0));
1494 if(ii_genomic == ii_genomic_e || ii_genomic->m_Start > gc_s) {
1498 if(gc_s < ii_genomic->m_Start ||
1499 ii_genomic->m_Start + ii_genomic->m_Length <= gc_s)
1502 ostr <<
"Global genomic coordinate "
1503 << gc_s <<
" out of range: ["
1504 << ii_genomic->m_Start <<
", "
1505 << (ii_genomic->m_Start + ii_genomic->m_Length) <<
"), "
1511 const Uint4 gc_s_max (ii_genomic->m_Start + ii_genomic->m_Length);
1518 while(idx < idx_hi && (((*phits)[idx]) >> 32) < gc_s_max) {
1523 sort(phits->begin() + idx0, phits->begin() + idx,
PLoWord);
1526 TSeqInfos::const_iterator ii_cdna (ii_cdna_b);
1527 Uint4 gc_q_min (ii_cdna->m_Start);
1528 Uint4 gc_q_max (ii_cdna->m_Start + ii_cdna->m_Length);
1529 size_t jdx (idx0), jdx0 (jdx);
1530 for(; jdx < idx; ++jdx) {
1532 Uint4 gc_q (((*phits)[jdx]) & kUI8_LoWord);
1534 if(gc_q < gc_q_min || gc_q >= gc_q_max) {
1544 if(
kN * (jdx - jdx0) >= min_matches / 2) {
1549 jdx0, jdx, &idx_compacted);
1554 ii_cdna = lower_bound(ii_cdna + 1, ii_cdna_e,
SSeqInfo(gc_q, 0, 0));
1556 if(ii_cdna == ii_cdna_e || ii_cdna->m_Start > gc_q) {
1560 if(gc_q < ii_cdna->m_Start ||
1561 ii_cdna->m_Start + ii_cdna->m_Length <= gc_q)
1564 ostr <<
"Global cDNA coordinate "
1565 << gc_q <<
" out of range: ["
1566 << ii_cdna->m_Start <<
", "
1567 << (ii_cdna->m_Start + ii_cdna->m_Length) <<
"), "
1568 << (
m_cDNASeqIds[ii_cdna->m_Oid]->GetSeqIdString(
true));
1573 gc_q_min = ii_cdna->m_Start;
1574 gc_q_max = ii_cdna->m_Start + ii_cdna->m_Length;
1587 if(
kN * (jdx - jdx0) >= min_matches / 2) {
1592 jdx0, jdx, &idx_compacted);
1613 const Int8 & right_limit0,
1616 const int Wm (1), Wms (-2);
1617 int score (
int(hitref->GetLength()) * Wm
1618 +
int(hitref->GetMismatches()) * (Wms - Wm));
1619 int score_max (score);
1621 const int overrun (6);
1622 const Int8 left_limit (left_limit0 >= overrun? left_limit0 - overrun: 0);
1623 const Int8 right_limit (right_limit0 >= kDiagMax - overrun?
1624 kDiagMax: right_limit0 + overrun);
1627 Int4 q0 (hitref->GetQueryStart()), s0 (hitref->GetSubjStart());
1628 Uint4 mm (0), mm0 (0);
1629 bool no_overrun_yet (
true);
1630 for(
Int4 q (q0 - 1), s (s0 - 1);
1631 (q + s > left_limit && score +
m_XDropOff >= score_max
1636 if(q + s == left_limit0) {
1637 no_overrun_yet =
false;
1650 if(score > score_max) {
1654 if(no_overrun_yet) {
1666 while(q0 + s0 <= left_limit0) {++q0; ++s0;}
1668 bool extended_left (
false);
1669 if(q0 < hitref->GetQueryStart()) {
1670 hitref->SetQueryStart(q0);
1671 hitref->SetSubjStart(s0);
1672 extended_left =
true;
1676 q0 = (hitref->GetQueryStop());
1677 s0 = (hitref->GetSubjStop());
1681 no_overrun_yet =
true;
1683 for(
Int4 q (q0 + 1), s (s0 + 1);
1684 (q + s < right_limit && score + m_XDropOff >= score_max
1689 if(q + s == right_limit0) {
1690 no_overrun_yet =
false;
1703 if(score > score_max) {
1707 if(no_overrun_yet) {
1719 while(q0 + s0 >= right_limit0) {--q0; --s0; }
1721 bool extended_right (
false);
1722 if(q0 > hitref->GetQueryStop()) {
1723 hitref->SetQueryStop(q0);
1724 hitref->SetSubjStop(s0);
1725 extended_right =
true;
1728 if(extended_left || extended_right) {
1730 hitref->SetMismatches(mm0);
1731 const THit::TCoord len (hitref->GetQueryStop() - hitref->GetQueryStart() + 1);
1732 hitref->SetLength(
len);
1733 hitref->SetIdentity((
len - mm0) /
double(
len));
1734 hitref->SetScore(2 *
len);
1742 TSeqInfos::const_iterator ii_cdna,
1743 TSeqInfos::const_iterator ii_genomic,
1746 size_t* pidx_compacted)
1748 if(idx_start >= idx_stop) {
1757 sort(pvol->begin() + idx_start, pvol->begin() + idx_stop,
PDiag);
1759 const size_t idx_compacted_start (*pidx_compacted);
1762 lens.reserve(pvol->size() / 2);
1764 Uint8 qs0 ((*pvol)[idx_start]);
1765 Uint4 q0 (qs0 & kUI8_LoWord);
1766 Uint4 s0 (qs0 >> 32);
1767 (*pvol)[(*pidx_compacted)++] = qs0;
1770 for(
size_t idx (idx_start + 1); idx < idx_stop; ++idx) {
1772 const Uint8 qs ((*pvol)[idx]);
1773 const Uint4 q (qs & kUI8_LoWord);
1774 const Uint4 s (qs >> 32);
1776 if((q0 >= s0 && q0 - s0 == q - s && q <= q0 + 16) ||
1777 (q0 < s0 && s0 - q0 == s - q && q <= q0 + 16) )
1779 lens.back() += q - q0;
1782 (*pvol)[(*pidx_compacted)++] = qs;
1792 for(
size_t idx (idx_compacted_start); idx < *pidx_compacted; ++idx) {
1794 const Uint8 qs ((*pvol)[idx]);
1795 const Uint4 q (qs & kUI8_LoWord);
1796 const Uint4 s (qs >> 32);
1799 hitref->SetQueryStart(q);
1800 hitref->SetSubjStart(s);
1801 const Uint4 len (lens[idx - idx_compacted_start]);
1802 hitref->SetQueryStop(q +
len - 1);
1803 hitref->SetSubjStop(s +
len - 1);
1804 hitref->SetMismatches(0);
1806 hitref->SetEValue(0);
1807 hitref->SetIdentity(1.0);
1808 hitref->SetLength(
len);
1809 hitref->SetScore(2*
len);
1810 hitrefs[idx - idx_compacted_start] = hitref;
1813 #define EXTEND_USING_SEQUENCE_CHARS
1814 #ifdef EXTEND_USING_SEQUENCE_CHARS
1818 Int8 left_limit (0);
1819 Int8 right_limit (kDiagMax);
1820 Int8 s_prime_cur (0);
1822 const size_t kn (hitrefs.size());
1823 for(
size_t k(0); k < kn; ++k) {
1828 if(
Int8(box[2]) -
Int8(box[0]) != s_prime_cur) {
1829 left_limit = box[0];
1830 s_prime_cur =
Int8(box[2]) -
Int8(box[0]);
1833 right_limit = kDiagMax;
1836 if(
Int8(boX[2]) -
Int8(boX[0]) == s_prime_cur) {
1837 right_limit = boX[0] + boX[2];
1841 left_limit =
x_ExtendHit(left_limit, right_limit, hitrefs[k]);
1846 Int8 rlimit (0), spc (0);
1847 for(
size_t k (0); k < kn; ++k) {
1849 const THit::TCoord cur_diag_stop (hitrefs[k]->GetQueryStop()
1850 + hitrefs[k]->GetSubjStop());
1852 const Int8 s_prime (-
Int8(hitrefs[k]->GetQueryStart())
1853 +
Int8(hitrefs[k]->GetSubjStart()));
1854 if(k == 0 || spc != s_prime) {
1856 hitrefs[++d] = hitrefs[k];
1857 rlimit = cur_diag_stop;
1862 const THit::TCoord cur_diag_start (hitrefs[k]->GetQueryStart()
1863 + hitrefs[k]->GetSubjStart());
1867 if(rlimit + 2 == cur_diag_start) {
1869 rlimit = cur_diag_stop;
1870 hrd->SetQueryStop(hitrefs[k]->GetQueryStop());
1871 hrd->SetSubjStop(hitrefs[k]->GetSubjStop());
1872 hrd->SetMismatches(hrd->GetMismatches()
1873 + hitrefs[k]->GetMismatches());
1875 - hrd->GetQueryStart() + 1);
1876 hrd->SetLength(
len);
1877 hrd->SetIdentity(
double(
len - hrd->GetMismatches()) /
len);
1878 hrd->SetScore(2 *
len);
1880 else if (rlimit + 1 < cur_diag_start) {
1882 hitrefs[++d] = hitrefs[k];
1883 rlimit = cur_diag_stop;
1887 "x_CompartPair(): Unexpected alignment overlap");
1891 hitrefs.resize(
min(
size_t(d + 1), kn));
1893 #undef EXTEND_USING_SEQUENCE_CHARS
1899 if((*ii)->GetQueryStop() >= qmax) {
1900 (*ii)->Modify(1, qmax - 1);
1929 const THit& h (**ii);
1951 ca.
Run(hitrefs.begin(), hitrefs.end());
1958 const THit& h (**ii);
1970 pair_results->Set().begin(),
1971 pair_results->Set().end());
1983 CNcbiIfstream ifstr_genomic (filename_genomic.c_str(), IOS_BASE::binary);
1985 elems_genomic * sizeof (
SSeqInfo));
1986 ifstr_genomic.close();
1991 CNcbiIfstream ifstr_cdna (filename_cdna.c_str(), IOS_BASE::binary);
2024 rv = (rv * 3 + (*ii)) % 3571;
2027 return t - 5000 + rv;
2084 vol_exts.push_back(kFileExt_Offsets);
2085 vol_exts.push_back(kFileExt_Positions);
2106 vol_exts.push_back(kFileExt_Offsets);
2107 vol_exts.push_back(kFileExt_Positions);
2108 vol_exts.push_back(kFileExt_Masked);
2109 vol_exts.push_back(kFileExt_Remap);
const TCoord * GetBox(void) const
void SetSubjId(const TId &id)
void SetBox(const TCoord box[4])
void SetQueryId(const TId &id)
static CSafeStatic< vector< CSeq_id_Handle > > s_ids
TCoord GetLength(void) const
void Run(typename THitRefs::iterator start, typename THitRefs::iterator finish, CScope *scope=NULL, const vector< pair< TCoord, TCoord > > *gaps=NULL)
Execute: identify compartments.
void SetMaxIntron(TCoord mi)
Assign the maximum intron length, in base pairs.
bool GetFirst(THitRefs &compartment)
Initialize iteration over the results.
CRef< objects::CSeq_align_set > AsSeqAlignSet(void) const
Retrieve all valid compartments in a seq-align-set.
bool GetNext(THitRefs &compartment)
Proceed with iteration over the results.
static TCoord s_GetDefaultMaxIntron(void)
Retrieve the default maximum length of an intron.
void reset_at(const Uint8 idx)
void set_at(const Uint8 idx)
bool get_at(const Uint8 idx) const
void Init(Uint8 dim_bits, bool init_value)
const Uint8 * GetBuffer(void) const
const char * m_CurSeq_cDNA
double m_MinCompartmentIdty
vector< SSeqInfo > TSeqInfos
void x_LoadRemapData(ISequenceSource *m_qsrc, const string &sdb)
void x_InitParticipationVector(bool strand)
void x_CleanVolumes(const string &lbn, const TStrings &vol_extensions)
void x_Search(bool strand)
vector< THitRef > THitRefs
void x_CreateRemapData(const string &db, EIndexMode mode)
vector< string > TStrings
TSeqInfos m_SeqInfos_Genomic
TSeqInfos::const_iterator m_ii_cdna
size_t x_WriteIndexFile(size_t volume, EIndexMode mode, bool strand, vector< Uint8 > &MersAndCoords)
static bool s_IsLowComplexity(size_t key)
double m_MinSingletonIdty
Int8 x_ExtendHit(const Int8 &left_limit, const Int8 &right_limit, THitRef hitref)
TSeqInfos m_SeqInfos_cdna
bool x_IsMatch(Uint8 q, Uint8 s) const
vector< SHitIndexEntry > THitIndex
const char * m_CurSeq_Genomic
void x_CreateIndex(const string &db, EIndexMode index_more, bool strand)
void x_CompartPair(vector< Uint8 > *pvol, TSeqInfos::const_iterator ii_cdna, TSeqInfos::const_iterator ii_genomic, size_t idx_start, size_t idx_stop, size_t *pidx_compacted)
void x_InitFilteringVector(const string &sdb, bool strand)
void x_CompartVolume(vector< Uint8 > *phits)
TSeqInfos::const_iterator m_ii_genomic
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
CReverseAndComplement(void)
Uint8 GetTotalLength() const
Returns the sum of the lengths of all available sequences.
list< CRef< CSeq_id > > GetSeqIDs(int oid) const
Gets a list of sequence identifiers.
int GetSeqLength(int oid) const
Returns the sequence length in base pairs or residues.
void RetSequence(const char **buffer) const
Returns any resources associated with the sequence.
int GetNumSeqs() const
Returns the number of sequences available.
int GetSequence(int oid, const char **buffer) const
Get a pointer to raw sequence data.
bool CheckOrFindOID(int &next_oid) const
Find an included OID, incrementing next_oid if necessary.
virtual CConstRef< CSeq_id > GetSeqID(int idx)
virtual Uint8 GetTotalLength(void)
virtual int GetSeqLength(int idx)
virtual bool GetNext(void)
virtual int GetNumSeqs(void)
virtual void ResetIndex(void)
virtual int GetSeq(const char **buffer)
virtual int GetCurrentIndex()
virtual void RetSequence(const char **)
void g_RestoreFromTemp(const string &filename, VectorT *pvd)
string GetLocalBaseName(const string &extended_name, const string &sfx)
#define QCOMP_COUNT_NrMERS
bool PLoWord(const Uint8 &lhs, const Uint8 &rhs)
void CheckWrittenFile(const string &filename, const Uint8 &len_bytes)
char DecodeSeqDbChar(Uint1 c)
CRandom::TValue GenerateSeed(const string &str)
string ReplaceExt(const string &extended_name, const string &new_ext)
string g_SaveToTemp(const VectorT &v, const string &path)
#define QCOMP_CREATE_GENOMIC_IDX(w8, gccur)
T ReverseAndComplement(T v)
#define QCOMP_PREPARE_SHIFTED_GENOMIC_IDX
bool PDiag(const Uint8 &lhs, const Uint8 &rhs)
static const char * str(char *buf, int n)
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
TEntries GetEntries(const string &mask=kEmptyStr, TGetEntriesFlags flags=0) const
Get directory entries based on the specified "mask".
static string GetTmpNameEx(const string &dir=kEmptyStr, const string &prefix=kEmptyStr, ETmpFileCreationMode mode=eTmpFileGetName)
Get temporary file name.
void * Map(TOffsetType offset=0, size_t length=0)
Map file.
Int8 GetLength(void) const
Get size of file.
virtual bool Remove(TRemoveFlags flags=eRecursive) const
Remove a directory entry.
bool Unmap(void)
Unmap file if mapped.
static char GetPathSeparator(void)
Get path separator symbol specific for the current platform.
void Add(const string &path)
Add a path for later deletion.
static void SplitPath(const string &path, string *dir=0, string *base=0, string *ext=0)
Split a path string into its basic components.
@ eCreate
Create new file or rewrite existent with zeros.
@ eMMS_Shared
Changes are shared.
@ eMMP_Write
Data can be written.
string GetSeqIdString(bool with_version=false) const
Return seqid string with optional version for text seqid type.
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
void Reset(void)
Reset reference object.
uint8_t Uint1
1-byte (8-bit) unsigned integer
int32_t Int4
4-byte (32-bit) signed integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
int64_t Int8
8-byte (64-bit) signed integer
uint64_t Uint8
8-byte (64-bit) unsigned integer
Uint4 TValue
Type of the generated integer value and/or the seed value.
TValue GetRand(void)
Get the next random number in the interval [0..GetMax()] (inclusive)
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
IO_PREFIX::ofstream CNcbiOfstream
Portable alias for ofstream.
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
constexpr auto sort(_Init &&init)
double value_type
The numeric datatype used by the parser.
const struct ncbi::grid::netcache::search::fields::KEY key
void SleepSec(unsigned long sec, EInterruptOnSignal onsignal=eRestartOnSignal)
Sleep.
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Defines BLAST database access classes.
const value_slice::CValueConvert< value_slice::SRunTimeCP, FROM > Convert(const FROM &value)