NCBI C++ ToolKit
compart_matching.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: compart_matching.cpp 102826 2024-07-25 12:33:19Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Yuri Kapustin
27 *
28 * ===========================================================================
29 */
30 
31 
32 #include <ncbi_pch.hpp>
33 
34 #include <math.h>
35 
36 #include <corelib/ncbi_system.hpp>
37 #include <corelib/ncbifile.hpp>
38 #include <util/random_gen.hpp>
42 
43 #include <util/value_convert.hpp>
44 
46 
48 
49 
50 namespace {
51 
52  // N-mer size for repeat filtering
53  const Uint4 kNr (14);
54 
55  // total number of Nr-mers for repeat filtering
56  const size_t kNrMersTotal (1 << (kNr * 2));
57 
58  // percentile used for repeat filtering
59  const double kRepeatsPercentile (0.995);
60 
61  // N-mer size for matching
62  const Uint4 kN (16);
63 
64  // total number of N-mers for indexing
65  const Uint8 kNMersTotal (Uint8(1) << (kN * 2));
66 
67  // extension to indicate repeat filtering table
68  const char *kFileExt_Masked = ".rep";
69 
70  // extension to indicate ID and coordinate remapping table
71  const char *kFileExt_Remap = ".idc";
72  const char *kFileExt_Offsets = ".ofs";
73  const char *kFileExt_Positions = ".pos";
74 
75  const Uint8 kUI8_LoWord (0xFFFFFFFF);
76  const Uint8 kUI8_LoByte (kUI8_LoWord >> 24);
77  const Uint8 kUI8_HiWord (kUI8_LoWord << 32);
78  const Uint8 kUI8_LoFive (kUI8_LoWord | (kUI8_LoWord << 8));
79 
80  const Uint8 kUI8_LoHalfWordEachByte ((Uint8(0x0F0F0F0F) << 32) | 0x0F0F0F0F);
81 
82  const Uint4 kUI4_Lo28 (0xFFFFFFFF >> 4);
83 
84  const Uint8 kUI8_SeqDb_lo (0x03030303);
85  const Uint8 kUI8_SeqDb ((kUI8_SeqDb_lo << 32) | kUI8_SeqDb_lo);
86 
87  const Int8 kDiagMax (numeric_limits<Int8>::max());
88 
89  const size_t kMapGran (512 * 1024 * 1024);
90 }
91 
92 
94 {
95  char rv ('*');
96  switch(c) {
97  case 0: rv = 'A'; break;
98  case 1: rv = 'C'; break;
99  case 2: rv = 'G'; break;
100  case 3: rv = 'T'; break;
101  }
102  return rv;
103 }
104 
105 //#define ALGO_ALIGN_SPLIGN_QCOMP_DEBUG
106 #ifdef ALGO_ALIGN_SPLIGN_QCOMP_DEBUG
107 
108 template<typename T>
109 string GetBinString(T v)
110 {
111  vector<bool> vb;
112 
113  const size_t bitdim (sizeof(T) * 8);
114  for(size_t i(0); i < bitdim; ++i) {
115  vb.push_back(v & 1);
116  v >>= 1;
117  }
118 
119  CNcbiOstrstream ostr;
120  for(size_t i (bitdim); i > 0; --i) {
121  if(i < bitdim && i % 8 == 0) {
122  ostr << ' ';
123  }
124  ostr << vb[i-1];
125  }
126 
127  const string rv = CNcbiOstrstreamToString(ostr);
128  return rv;
129 }
130 
131 #endif
132 
133 void CheckWrittenFile(const string& filename, const Uint8& len_bytes)
134 {
135  Int8 reported_len (-1);
136  for(size_t attempt(0); attempt < 1; ++attempt) {
137  reported_len = CFile(filename).GetLength();
138  if(reported_len >= 0 && Uint8(reported_len) == len_bytes) {
139  return;
140  }
141  else {
142  SleepSec(1);
143  }
144  }
145 
146  CNcbiOstrstream ostr;
147 
148  if(reported_len < 0) {
149  ostr << "Cannot write " << filename
150  << " (error code = " << reported_len << "). ";
151  }
152  else {
153  ostr << "The size of " << filename << " (" << reported_len << ')'
154  << " is different from the expected " << len_bytes << ". ";
155  }
156  ostr << "Please make sure there is enough disk space.";
157 
158  const string errmsg = CNcbiOstrstreamToString(ostr);
159  NCBI_THROW(CException, eUnknown, errmsg);
160 }
161 
162 
163 //
164 // Brute-force reversal & complementation
165 
166 template<typename T>
168 {
169  T rv (0);
170  for(size_t i (0), imax (4*sizeof(T)); i < imax; ++i, v >>= 2) {
171  rv = (rv << 2) | (v & 3);
172  }
173  rv = ~rv;
174 
175  return rv;
176 }
177 
178 
179 //
180 // Table-based bit reversal & complementation
181 
182 template<typename T>
184 
185 public:
186 
188 
189  m_Table.resize(256);
190 
191  for(Uint1 i(1); i < 0xFF; ++i) {
193  }
194 
195  m_Table[0] = 0xFF;
196  m_Table[0xFF] = 0;
197  }
198 
199  T operator() (T v) const {
200 
201  T rv (0);
202  for(size_t i(0), imax(sizeof(T)); i < imax; ++i) {
203  rv <<= 8;
204  rv |= m_Table[v & 0xFF];
205  v >>= 8;
206  }
207  return rv;
208  }
209 
210 private:
211 
212  vector<Uint1> m_Table;
213 };
214 
215 
216 namespace {
218 }
219 
220 
222 {
223  // evaluate the lower 28 bits (14 residues) only
224 
225  const Uint4 kMaxTwoBaseContent (14);
226 
227  vector<Uint4> counts(4, 0);
228 
229  for(Uint4 k = 0; k < 14; ++k) {
230  ++counts[0x00000003 & key];
231  key >>= 2;
232  }
233 
234  const Uint4 ag (counts[0] + counts[1]);
235  const Uint4 at (counts[0] + counts[2]);
236  const Uint4 ac (counts[0] + counts[3]);
237  const Uint4 gt (counts[1] + counts[2]);
238  const Uint4 gc (counts[1] + counts[3]);
239  const Uint4 tc (counts[2] + counts[3]);
240 
241  return
242  ag >= kMaxTwoBaseContent || at >= kMaxTwoBaseContent ||
243  ac >= kMaxTwoBaseContent || gt >= kMaxTwoBaseContent ||
244  gc >= kMaxTwoBaseContent || tc >= kMaxTwoBaseContent;
245 }
246 
247 
248 string GetLocalBaseName(const string& extended_name, const string & sfx)
249 {
250  string dir, base, ext;
251  CDirEntry::SplitPath(extended_name, &dir, &base, &ext);
252  string rv (base);
253  if(!ext.empty()) {
254  rv += ext;
255  }
256  rv += "." + sfx;
257  return rv;
258 }
259 
260 string ReplaceExt(const string& extended_name, const string & new_ext)
261 {
262  string dir, base, ext;
263  CDirEntry::SplitPath(extended_name, &dir, &base, &ext);
264  string rv;
265  if(!dir.empty()) {
266  rv = dir; // + CDirEntry::GetPathSeparator();
267  }
268  if(!base.empty()) {
269  rv += base;
270  }
271  rv += new_ext;
272 
273  return rv;
274 }
275 
276 
277 template <typename VectorT>
278 string g_SaveToTemp(const VectorT& v, const string& path)
279 {
280  typedef typename VectorT::value_type TElem;
281 
282  const string filename(CFile::GetTmpNameEx(path, "splqcomp_"));
283  const Uint8 len_bytes (v.size() * sizeof(TElem));
284 
285  {
286  CNcbiOfstream tempcgrfile (filename.c_str(), IOS_BASE::binary);
287  tempcgrfile.write((const char* ) & v.front(), len_bytes);
288  tempcgrfile.close();
289  }
290 
291  CheckWrittenFile(filename, len_bytes);
292 
293  return filename;
294 }
295 
296 
297 template <typename VectorT>
298 void g_RestoreFromTemp(const string& filename, VectorT* pvd)
299 {
300  typedef typename VectorT::value_type TElem;
301 
302  VectorT& v = *pvd;
303 
304  const Uint8 dim (CFile(filename).GetLength() / sizeof(TElem));
305 
306  CNcbiIfstream tempcgrfile (filename.c_str(), IOS_BASE::binary);
307  tempcgrfile.read((char* ) & v.front(), dim * sizeof(TElem));
308 
309  CFile(filename).Remove();
310 }
311 
312 
314 {
315  // Skim over the query offset volumes and set the bitmask
316 
317  m_Mers.Init(kNMersTotal, false);
318 
319  CDir dir (m_FilePath);
320  const string sfx (string(strand?".p": ".m") + ".v*");
321  const string mask_ofs_q (m_lbn_q + sfx + kFileExt_Offsets);
322  CDir::TEntries vols_ofs_q (dir.GetEntries(mask_ofs_q));
323  ITERATE(CDir::TEntries, ii, vols_ofs_q) {
324 
325  const string filename ((*ii)->GetPath());
326  const Int8 vol_length (CFile(filename).GetLength());
327 
328  CMemoryFile mf (filename);
329  for(const Uint8 * p8 (reinterpret_cast<const Uint8 *> (mf.Map())),
330  * p8e (p8 + vol_length / 8); p8 != p8e;
331  m_Mers.set_at(*p8++ & kUI8_LoWord));
332  mf.Unmap();
333  }
334 
335  m_Mers.reset_at(0);
336 }
337 
338 
339 void CElementaryMatching::x_InitFilteringVector(const string& sdb, bool strand)
340 {
341  // for plus strand create using the actual sequence data;
342  // use the plus strand table to init the minus strand table
343 
344  if(strand) {
345 
346  // create repeat filtering table (genome);
347  CRef<CSeqDB> subjdb (new CSeqDB(sdb, CSeqDB::eNucleotide));
349  {
350  CNcbiOstrstream ostr;
351  ostr << "Sequence volumes with total length exceeding "
353  << " are not yet supported. Please split your FASTA file and re-run "
354  << " formatdb.";
355  const string err = CNcbiOstrstreamToString(ostr);
357  }
358 
359  Uint4 current_offset (0);
360 
361  unique_ptr<vector<Uint4> > pNrCounts (new vector<Uint4> (kNrMersTotal, 0));
362  vector<Uint4> & NrCounts (*pNrCounts);
363 
364  cerr << " Scanning " << subjdb->GetNumSeqs() << " genomic sequences ... ";
365  for(int oid (0); subjdb->CheckOrFindOID(oid); ++oid)
366  {
367  const char * pcb (0);
368  const Uint4 bases (subjdb->GetSequence(oid, &pcb));
369  const char * pcbe (pcb + bases / 4);
370  uintptr_t npcb (reinterpret_cast<uintptr_t>(pcb)), npcb0 (npcb);
371  npcb -= npcb % 8;
372  if(npcb < npcb0) npcb += 8;
373  const Uint8 gcbase (4*(npcb - npcb0));
374  const Uint8 * pui8 (reinterpret_cast<const Uint8*>(npcb));
375  const Uint8 * pui8_e (reinterpret_cast<const Uint8*>(pcbe));
376 
377  for(size_t gccur (current_offset + gcbase); pui8 < pui8_e; ++pui8) {
378 
379  Uint8 ui8 (*pui8);
380 
381  // counting the hi 28 bits correspond to
382  // a subsequence at positions 1,2,5-16
383 #define QCOMP_COUNT_NrMERS {{ \
384  if(gccur + 16 >= current_offset + bases) { \
385  break; \
386  } \
387  const Uint4 mer4 (ui8 & kUI8_LoWord); \
388  ++NrCounts[mer4 >> 4]; \
389  gccur += 4; \
390  ui8 >>= 8; \
391  }}
392 
397 
398  if(pui8 + 1 < pui8_e) {
399 
400  ui8 |= (*(pui8 + 1) << 32);
401 
406  }
407 
408 #undef QCOMP_COUNT_NMERS
409  }
410 
411  subjdb->RetSequence(&pcb);
412 
413  current_offset += bases;
414  } // OIDs
415 
416  subjdb.Reset(0);
417 
418  cerr << "Ok" << endl;
419  cerr << " Constructing FV ... ";
420 
421  string filename_temp_01;
422  unique_ptr<vector<Uint4> > pNrCounts2 (new vector<Uint4>);
423  vector<Uint4> & NrCounts2 (* pNrCounts2);
424  try {
425  NrCounts2 = NrCounts;
426  }
427  catch(...) {
428  filename_temp_01 = g_SaveToTemp(NrCounts, m_FilePath);
429  pNrCounts2.reset(0);
430  }
431 
432  // find the percentile
433  size_t total_mers (0);
434  ITERATE(vector<Uint4>, ii, NrCounts) {
435  if(*ii > 0) ++total_mers;
436  }
437  const size_t percentile ((size_t)(kNrMersTotal -
438  total_mers * (1 - kRepeatsPercentile)));
439  nth_element(NrCounts.begin(), NrCounts.begin() + percentile, NrCounts.end());
440  const Uint4 max_repeat_count (NrCounts[percentile]);
441 
442  if(filename_temp_01.empty()) {
443  NrCounts = NrCounts2;
444  }
445  else {
446  g_RestoreFromTemp(filename_temp_01, &NrCounts);
447  }
448  pNrCounts2.reset(0);
449 
450  m_Mers.Init(kNrMersTotal, false);
451  for(size_t i (0); i < kNrMersTotal; ++i) {
452  const bool v (NrCounts[i] <= max_repeat_count && !s_IsLowComplexity(i));
453  if(v) {
454  m_Mers.set_at(i);
455  }
456  }
457  pNrCounts.reset(0);
458 
459  // serialize
460  const string masked_filename (m_FilePath + CDirEntry::GetPathSeparator() + m_lbn_s + kFileExt_Masked);
461  const Uint8 len_bytes (kNrMersTotal / 8);
462  {{
463  CMemoryFile mf (masked_filename, CMemoryFile::eMMP_Write,
464  CMemoryFile::eMMS_Shared, 0, kNrMersTotal / 8,
466  len_bytes);
467  Uint8* dest (reinterpret_cast<Uint8*>(mf.Map()));
468 
469  const Uint8 * src (m_Mers.GetBuffer());
470  copy(src, src + kNrMersTotal / 64, dest);
471  }}
472 
473  CheckWrittenFile(masked_filename, len_bytes);
474  }
475  else {
476 
477  cerr << " Reading/transforming FV ... ";
478 
479  // read the plus strand vector and trasnform
480  const string masked_filename (m_FilePath + CDirEntry::GetPathSeparator() + m_lbn_s + kFileExt_Masked);
481 
482  CMemoryFile mf (masked_filename);
483  const Uint8 * p8 (reinterpret_cast<Uint8*>(mf.Map()));
484  CBoolVector nrmers_plus (kNrMersTotal, p8);
485  mf.Unmap();
486 
487  m_Mers.Init(kNrMersTotal, false);
488 
489  for(size_t i(0); i < kNrMersTotal; ++i) {
490  if(nrmers_plus.get_at(i)) {
491  // todo: optimize using tables
492  const size_t irc (g_RC(Uint4(i) << 4) & kUI4_Lo28);
493  m_Mers.set_at(irc);
494  }
495  }
496  }
497 
498  cerr << "Ok" << endl;
499 }
500 
501 
503 {
504  // create ID and coordinate remapping tables
505 
506  CSeqDB blastdb (db, CSeqDB::eNucleotide);
507 
508  TSeqInfos seq_infos;
509  const size_t num_seqs (blastdb.GetNumSeqs());
510  seq_infos.reserve(num_seqs);
511 
512  Uint4 current_offset (0);
513  for(int oid (0); blastdb.CheckOrFindOID(oid); ++oid) {
514 
515  const int seqlen (blastdb.GetSeqLength(oid));
516  if(seqlen <= 0 || size_t(seqlen) >= 0xFFFFFFFF) {
517  CNcbiOstrstream ostr;
518  ostr << "Cannot create remap data for:\t" <<
519  blastdb.GetSeqIDs(oid).front()->GetSeqIdString(true);
520  const string err = CNcbiOstrstreamToString(ostr);
522  }
523 
524  const Uint4 bases (seqlen);
525  seq_infos.push_back(SSeqInfo(current_offset, bases, oid));
526  current_offset += bases;
527  }
528 
529  const string remap_filename ((mode == eIM_Genomic? m_lbn_s: m_lbn_q) +
530  kFileExt_Remap);
531 
532  const string full_remap_filename = m_FilePath + CDirEntry::GetPathSeparator() + remap_filename;
533 
534  CNcbiOfstream ofstr_remap (full_remap_filename.c_str(), IOS_BASE::binary);
535  const Uint8 len_bytes (seq_infos.size() * sizeof(SSeqInfo));
536  ofstr_remap.write((const char*) &seq_infos.front(), len_bytes);
537  ofstr_remap.close();
538 
539  CheckWrittenFile(full_remap_filename, len_bytes);
540 
541  cerr << " Remap data created for " << db << "; max offset = "
542  << current_offset << endl;
543 }
544 
546 {
547  // create ID and coordinate remapping tables
548 
549  TSeqInfos seq_infos;
550  const size_t num_seqs (m_qsrc->GetNumSeqs());
551  seq_infos.reserve(num_seqs);
552 
553  Uint4 current_offset (0);
554  for(m_qsrc->ResetIndex(); m_qsrc->GetNext(); ) {
555 
556  const int seqlen (m_qsrc->GetSeqLength());
557  if(seqlen <= 0 || size_t(seqlen) >= 0xFFFFFFFF) {
558  CNcbiOstrstream ostr;
559  ostr << "Cannot create remap data for:\t" <<
560  m_qsrc->GetSeqID()->GetSeqIdString(true);
561  const string err = CNcbiOstrstreamToString(ostr);
563  }
564 
565  const Uint4 bases (seqlen);
566  seq_infos.push_back(SSeqInfo(current_offset, bases, m_qsrc->GetCurrentIndex()));
567  current_offset += bases;
568  }
569 
570  const string remap_filename ((mode == eIM_Genomic? m_lbn_s: m_lbn_q) +
571  kFileExt_Remap);
572 
573  const string full_remap_filename = m_FilePath + CDirEntry::GetPathSeparator() + remap_filename;
574 
575  CNcbiOfstream ofstr_remap (full_remap_filename.c_str(), IOS_BASE::binary);
576  const Uint8 len_bytes (seq_infos.size() * sizeof(SSeqInfo));
577  ofstr_remap.write((const char*) &seq_infos.front(), len_bytes);
578  ofstr_remap.close();
579 
580  CheckWrittenFile(full_remap_filename, len_bytes);
581 
582  cerr << " Remap data created for sequences; max offset = "
583  << current_offset << endl;
584 }
585 
586 
587 void CElementaryMatching::x_CreateIndex(const string& db, EIndexMode mode, bool strand)
588 {
589  // sort all adjacent genomic N-mers and their positions
590  // except for those marked in the Nr-mer bit vector
591 
592  cerr << " Scanning " << db << " for N-mers and their positions." << endl;
593 
594  if(m_Mers.get_at(0)) {
595  NCBI_THROW(CException, eUnknown, "NULL mer not excluded!");
596  }
597 
598  const size_t mcidx_max (m_MaxVolSize / 8);
599  vector<Uint8> MersAndCoords (mcidx_max, 0);
600  size_t mcidx (0);
601  size_t current_offset (0);
602 
603  CRef<CSeqDB> blastdb (new CSeqDB(db, CSeqDB::eNucleotide));
604 
605  const Uint8 blastdb_total_length (blastdb->GetTotalLength());
606  if((mode == eIM_Genomic && blastdb_total_length / kN > numeric_limits<Uint4>::max())
607  || (mode == eIM_cDNA && blastdb_total_length > numeric_limits<Uint4>::max()))
608  {
609  CNcbiOstrstream ostr;
610  ostr << "Sequence volumes with total length exceeding "
612  << " are not yet supported. Please split your FASTA file and re-run "
613  << " formatdb.";
614  const string err = CNcbiOstrstreamToString(ostr);
616  }
617 
618  size_t volume(0);
619  for(int oid (0); blastdb->CheckOrFindOID(oid); ++oid) {
620 
621  const char * pcb (0);
622  const Uint4 bases (blastdb->GetSequence(oid, &pcb));
623  const char * pce (pcb + bases/4);
624  uintptr_t npcb (reinterpret_cast<uintptr_t>(pcb)), npcb0 (npcb);
625  npcb -= npcb % 8;
626  if(npcb < npcb0) npcb += 8;
627  const Uint8 gcbase (4*(npcb - npcb0));
628  const Uint8* pui8_start (reinterpret_cast<const Uint8*>(npcb));
629  const Uint8* pui8 (pui8_start);
630 
631  // It helps not to break volumes in the middle of a sequence.
632  // We explicitly check here if the volume is close to its limit.
633 
634  const size_t max_new_elems (mode == eIM_Genomic? size_t(8.0 * bases / kN):
635  bases);
636 
637  if(mcidx > 1000 && mcidx + max_new_elems >= mcidx_max) {
638  MersAndCoords.resize(mcidx);
639  x_WriteIndexFile(++volume, mode, strand, MersAndCoords);
640  MersAndCoords.assign(mcidx_max, 0);
641  mcidx = 0;
642  }
643 
644  if(mode == eIM_Genomic) {
645 
646  // index every other position
647  for(size_t gccur (current_offset + gcbase);
648  gccur + 16 < current_offset + bases && mcidx < mcidx_max;
649  ++pui8)
650  {
651  Uint8 w8 (*pui8);
652 
653 #define QCOMP_PREPARE_SHIFTED_GENOMIC_IDX \
654  size_t gccur2 (gccur + 2); \
655  const Uint8 ui8_2930 (w8 >> 60); \
656  Uint8 ui8_ls4 (w8 << 4); \
657  const Uint8 ui8_mask (ui8_ls4 & kUI8_LoHalfWordEachByte); \
658  ui8_ls4 &= kUI8_LoHalfWordEachByte << 4; \
659  ui8_ls4 |= (ui8_mask >> 16) | (ui8_2930 << 48);
660 
662 
663 #define QCOMP_CREATE_GENOMIC_IDX(w8,gccur) \
664  { \
665  if(gccur + 16 >= current_offset + bases) { \
666  break; \
667  } \
668  const Uint8 mer (w8 & kUI8_LoWord); \
669  if(strand) { \
670  if(m_Mers.get_at(mer)) { \
671  MersAndCoords[mcidx++] = (mer << 32) | gccur; \
672  } \
673  } \
674  else { \
675  const Uint4 rc (g_RC(Uint4(mer))); \
676  if(m_Mers.get_at(rc)) { \
677  MersAndCoords[mcidx++] = (Uint8(rc) << 32) | \
678  ((current_offset + bases - gccur - 16) \
679  + current_offset); \
680  } \
681  } \
682  gccur += 4; \
683  w8 >>= 8; \
684  }
685 
686  QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
687  QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
688  QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
689  QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
690 
691  QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
692  QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
693  QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
694  QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
695 
696  if(gccur + 32 >= current_offset + bases) {
697  break;
698  }
699  else {
700 
701  w8 |= ((*(pui8 + 1)) << 32);
702 
704 
705  QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
706  QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
707  QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
708  QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
709 
710  QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
711  QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
712  QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
713  QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
714  }
715 #undef QCOMP_PREPARE_SHIFTED_GENOMIC_IDX
716 #undef QCOMP_CREATE_GENOMIC_IDX
717  }
718  }
719  else { // cDNA mode
720 
721  if(bases >= m_MinQueryLength && bases <= m_MaxQueryLength) {
722 
723  // head
724  Uint8 fivebytes (0);
725  for(const char * p (pcb),
726  *pche ( (reinterpret_cast<const char*>(pui8)) + 5 );
727  p < pche; ++p)
728  {
729  const Uint8 c8 (*p & kUI8_LoByte);
730  const Uint8 ui8curbyte (c8);
731  if(pcb + 5 > p) {
732  fivebytes = (fivebytes >> 8) | (ui8curbyte << 32);
733  }
734  else {
735 
736  for(Uint4 k(0); k < 4; ++k) {
737 
738  const Uint4 mer (fivebytes & kUI8_LoWord);
739  if(m_Mers.get_at(strand? (mer >> 4): (mer & kUI4_Lo28))) {
740  const Uint8 ui8 (mer);
741  const Uint8 coord (current_offset +
742  4*(p - pcb - 5) + k);
743  MersAndCoords[mcidx++] = (ui8 << 32) | coord;
744  }
745 
746  fivebytes &= kUI8_LoFive;
747  fivebytes <<= 2;
748  const Uint8 ui8m ((fivebytes & kUI8_SeqDb) >> 16);
749  fivebytes &= ~kUI8_SeqDb;
750  fivebytes |= ui8m;
751  }
752  fivebytes |= ui8curbyte << 32;
753  }
754  }
755 
756  // body
757  Uint8 ui8 (0);
758  Uint8 gccur (current_offset + gcbase);
759  for(; gccur + 32 < current_offset + bases; ++pui8)
760  {
761  ui8 = *pui8;
762 
763  for(Uint8 gccur_hi (gccur + 16); gccur < gccur_hi; ++gccur) {
764 
765  const Uint8 loword = ui8 & kUI8_LoWord;
766  if(m_Mers.get_at(strand? (loword >> 4):
767  (loword & kUI4_Lo28)))
768  {
769  MersAndCoords[mcidx++] = (loword << 32) | gccur;
770  }
771 
772  const Uint8 ui8hi2 ((ui8 >> 62) << 48);
773  ui8 <<= 2;
774  const Uint8 ui8m ((ui8 & kUI8_SeqDb) >> 16);
775  ui8 &= ~kUI8_SeqDb;
776  ui8 |= (ui8m | ui8hi2);
777  }
778 
779  if(gccur + 32 < current_offset + bases) {
780 
781  // pre-read next 16 residues
782  const uintptr_t n (reinterpret_cast<uintptr_t>(pui8 + 1));
783  const Uint4 * pui4 (reinterpret_cast<const Uint4*>(n));
784  Uint4 ui4 (*pui4);
785  Uint8 ui8_4 (ui4);
786  ui8 |= (ui8_4 << 32);
787 
788  for(Uint8 gccur_hi (gccur + 16); gccur < gccur_hi; ++gccur) {
789 
790  const Uint8 loword = ui8 & kUI8_LoWord;
791  if(m_Mers.get_at(strand? (loword >> 4):
792  (loword & kUI4_Lo28)))
793  {
794  MersAndCoords[mcidx++] = (loword << 32) | gccur;
795  }
796 
797  const Uint8 ui8hi2 ((ui8 >> 62) << 48);
798  ui8 <<= 2;
799  const Uint8 ui8m ((ui8 & kUI8_SeqDb) >> 16);
800  ui8 &= ~kUI8_SeqDb;
801  ui8 |= (ui8m | ui8hi2);
802  }
803  }
804  }
805 
806  // tail
807  if(gccur + 16 <= current_offset + bases) {
808 
809  fivebytes = (gccur == current_offset + gcbase)? fivebytes:
810  (ui8 & kUI8_LoWord);
811  const char * p (reinterpret_cast<const char*>(pui8_start) + 4
812  + (gccur - current_offset - gcbase) / 4);
813  size_t k (0);
814  do {
815  const Uint8 loword = fivebytes & kUI8_LoWord;
816 
817  if(m_Mers.get_at(strand? (loword >> 4):
818  (loword & kUI4_Lo28)))
819  {
820  MersAndCoords[mcidx++] = (loword << 32) | gccur;
821  }
822 
823  if(k % 4 == 0) {
824  if(p < pce) {
825  const Uint8 ui8 (*p++ & kUI8_LoByte);
826  fivebytes |= (ui8 << 32);
827  }
828  else {
829  break;
830  }
831  }
832 
833  fivebytes &= kUI8_LoFive;
834  fivebytes <<= 2;
835  const Uint8 ui8m ((fivebytes & kUI8_SeqDb) >> 16);
836  fivebytes &= ~kUI8_SeqDb;
837  fivebytes |= ui8m;
838 
839  ++k;
840  ++gccur;
841  } while (true);
842  }
843 
844  } // min cDNA length check
845 
846  } // genomic or cDNA
847 
848  blastdb->RetSequence(&pcb);
849 
850  current_offset += bases;
851 
852  if(mcidx >= mcidx_max) {
853  CNcbiOstrstream ostr;
854  ostr << "Selected max volume size is too small: "
855  << "it must be large enough to fit the index for the "
856  << "longest input sequence.";
857  const string err = CNcbiOstrstreamToString(ostr);
859  }
860  } // seqdb iteration
861 
862  blastdb.Reset(0);
863 
864  if(mcidx > 0) {
865  MersAndCoords.resize(mcidx);
866  x_WriteIndexFile(++volume, mode, strand, MersAndCoords);
867  mcidx = 0;
868  }
869 
870  m_Mers.Clear();
871 }
872 
874 {
875  // sort all adjacent genomic N-mers and their positions
876  // except for those marked in the Nr-mer bit vector
877 
878  cerr << " Scanning sequences for N-mers and their positions." << endl;
879 
880  if(m_Mers.get_at(0)) {
881  NCBI_THROW(CException, eUnknown, "NULL mer not excluded!");
882  }
883 
884  const size_t mcidx_max (m_MaxVolSize / 8);
885  vector<Uint8> MersAndCoords (mcidx_max, 0);
886  size_t mcidx (0);
887  size_t current_offset (0);
888 
889 
890  const Uint8 blastdb_total_length (m_qsrc->GetTotalLength());
891  if((mode == eIM_Genomic && blastdb_total_length / kN > numeric_limits<Uint4>::max())
892  || (mode == eIM_cDNA && blastdb_total_length > numeric_limits<Uint4>::max()))
893  {
894  CNcbiOstrstream ostr;
895  ostr << "Sequence volumes with total length exceeding "
897  << " are not yet supported. Please split your FASTA file and re-run "
898  << " formatdb.";
899  const string err = CNcbiOstrstreamToString(ostr);
901  }
902 
903  size_t volume(0);
904  for(m_qsrc->ResetIndex(); m_qsrc->GetNext(); ) {
905 
906  const char * pcb (0);
907  const Uint4 bases (m_qsrc->GetSeq(&pcb));
908  const char * pce (pcb + bases/4);
909  uintptr_t npcb (reinterpret_cast<uintptr_t>(pcb)), npcb0 (npcb);
910  npcb -= npcb % 8;
911  if(npcb < npcb0) npcb += 8;
912  const Uint8 gcbase (4*(npcb - npcb0));
913  const Uint8* pui8_start (reinterpret_cast<const Uint8*>(npcb));
914  const Uint8* pui8 (pui8_start);
915 
916  // It helps not to break volumes in the middle of a sequence.
917  // We explicitly check here if the volume is close to its limit.
918 
919  const size_t max_new_elems (mode == eIM_Genomic? size_t(8.0 * bases / kN):
920  bases);
921 
922  if(mcidx > 1000 && mcidx + max_new_elems >= mcidx_max) {
923  MersAndCoords.resize(mcidx);
924  x_WriteIndexFile(++volume, mode, strand, MersAndCoords);
925  MersAndCoords.assign(mcidx_max, 0);
926  mcidx = 0;
927  }
928 
929  if(mode == eIM_Genomic) {
930 
931  // index every other position
932  for(size_t gccur (current_offset + gcbase);
933  gccur + 16 < current_offset + bases && mcidx < mcidx_max;
934  ++pui8)
935  {
936  Uint8 w8 (*pui8);
937 
938 #define QCOMP_PREPARE_SHIFTED_GENOMIC_IDX \
939  size_t gccur2 (gccur + 2); \
940  const Uint8 ui8_2930 (w8 >> 60); \
941  Uint8 ui8_ls4 (w8 << 4); \
942  const Uint8 ui8_mask (ui8_ls4 & kUI8_LoHalfWordEachByte); \
943  ui8_ls4 &= kUI8_LoHalfWordEachByte << 4; \
944  ui8_ls4 |= (ui8_mask >> 16) | (ui8_2930 << 48);
945 
947 
948 #define QCOMP_CREATE_GENOMIC_IDX(w8,gccur) \
949  { \
950  if(gccur + 16 >= current_offset + bases) { \
951  break; \
952  } \
953  const Uint8 mer (w8 & kUI8_LoWord); \
954  if(strand) { \
955  if(m_Mers.get_at(mer)) { \
956  MersAndCoords[mcidx++] = (mer << 32) | gccur; \
957  } \
958  } \
959  else { \
960  const Uint4 rc (g_RC(Uint4(mer))); \
961  if(m_Mers.get_at(rc)) { \
962  MersAndCoords[mcidx++] = (Uint8(rc) << 32) | \
963  ((current_offset + bases - gccur - 16) \
964  + current_offset); \
965  } \
966  } \
967  gccur += 4; \
968  w8 >>= 8; \
969  }
970 
971  QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
972  QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
973  QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
974  QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
975 
976  QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
977  QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
978  QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
979  QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
980 
981  if(gccur + 32 >= current_offset + bases) {
982  break;
983  }
984  else {
985 
986  w8 |= ((*(pui8 + 1)) << 32);
987 
989 
990  QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
991  QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
992  QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
993  QCOMP_CREATE_GENOMIC_IDX(w8,gccur);
994 
995  QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
996  QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
997  QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
998  QCOMP_CREATE_GENOMIC_IDX(ui8_ls4,gccur2);
999  }
1000 #undef QCOMP_PREPARE_SHIFTED_GENOMIC_IDX
1001 #undef QCOMP_CREATE_GENOMIC_IDX
1002  }
1003  }
1004  else { // cDNA mode
1005 
1006  if(bases >= m_MinQueryLength && bases <= m_MaxQueryLength) {
1007 
1008  // head
1009  Uint8 fivebytes (0);
1010  for(const char * p (pcb),
1011  *pche ( (reinterpret_cast<const char*>(pui8)) + 5 );
1012  p < pche; ++p)
1013  {
1014  const Uint8 c8 (*p & kUI8_LoByte);
1015  const Uint8 ui8curbyte (c8);
1016  if(pcb + 5 > p) {
1017  fivebytes = (fivebytes >> 8) | (ui8curbyte << 32);
1018  }
1019  else {
1020 
1021  for(Uint4 k(0); k < 4; ++k) {
1022 
1023  const Uint4 mer (fivebytes & kUI8_LoWord);
1024  if(m_Mers.get_at(strand? (mer >> 4): (mer & kUI4_Lo28))) {
1025  const Uint8 ui8 (mer);
1026  const Uint8 coord (current_offset +
1027  4*(p - pcb - 5) + k);
1028  MersAndCoords[mcidx++] = (ui8 << 32) | coord;
1029  }
1030 
1031  fivebytes &= kUI8_LoFive;
1032  fivebytes <<= 2;
1033  const Uint8 ui8m ((fivebytes & kUI8_SeqDb) >> 16);
1034  fivebytes &= ~kUI8_SeqDb;
1035  fivebytes |= ui8m;
1036  }
1037  fivebytes |= ui8curbyte << 32;
1038  }
1039  }
1040 
1041  // body
1042  Uint8 ui8 (0);
1043  Uint8 gccur (current_offset + gcbase);
1044  for(; gccur + 32 < current_offset + bases; ++pui8)
1045  {
1046  ui8 = *pui8;
1047 
1048  for(Uint8 gccur_hi (gccur + 16); gccur < gccur_hi; ++gccur) {
1049 
1050  const Uint8 loword = ui8 & kUI8_LoWord;
1051  if(m_Mers.get_at(strand? (loword >> 4):
1052  (loword & kUI4_Lo28)))
1053  {
1054  MersAndCoords[mcidx++] = (loword << 32) | gccur;
1055  }
1056 
1057  const Uint8 ui8hi2 ((ui8 >> 62) << 48);
1058  ui8 <<= 2;
1059  const Uint8 ui8m ((ui8 & kUI8_SeqDb) >> 16);
1060  ui8 &= ~kUI8_SeqDb;
1061  ui8 |= (ui8m | ui8hi2);
1062  }
1063 
1064  if(gccur + 32 < current_offset + bases) {
1065 
1066  // pre-read next 16 residues
1067  const uintptr_t n (reinterpret_cast<uintptr_t>(pui8 + 1));
1068  const Uint4 * pui4 (reinterpret_cast<const Uint4*>(n));
1069  Uint4 ui4 (*pui4);
1070  Uint8 ui8_4 (ui4);
1071  ui8 |= (ui8_4 << 32);
1072 
1073  for(Uint8 gccur_hi (gccur + 16); gccur < gccur_hi; ++gccur) {
1074 
1075  const Uint8 loword = ui8 & kUI8_LoWord;
1076  if(m_Mers.get_at(strand? (loword >> 4):
1077  (loword & kUI4_Lo28)))
1078  {
1079  MersAndCoords[mcidx++] = (loword << 32) | gccur;
1080  }
1081 
1082  const Uint8 ui8hi2 ((ui8 >> 62) << 48);
1083  ui8 <<= 2;
1084  const Uint8 ui8m ((ui8 & kUI8_SeqDb) >> 16);
1085  ui8 &= ~kUI8_SeqDb;
1086  ui8 |= (ui8m | ui8hi2);
1087  }
1088  }
1089  }
1090 
1091  // tail
1092  if(gccur + 16 <= current_offset + bases) {
1093 
1094  fivebytes = (gccur == current_offset + gcbase)? fivebytes:
1095  (ui8 & kUI8_LoWord);
1096  const char * p (reinterpret_cast<const char*>(pui8_start) + 4
1097  + (gccur - current_offset - gcbase) / 4);
1098  size_t k (0);
1099  do {
1100  const Uint8 loword = fivebytes & kUI8_LoWord;
1101 
1102  if(m_Mers.get_at(strand? (loword >> 4):
1103  (loword & kUI4_Lo28)))
1104  {
1105  MersAndCoords[mcidx++] = (loword << 32) | gccur;
1106  }
1107 
1108  if(k % 4 == 0) {
1109  if(p < pce) {
1110  const Uint8 ui8 (*p++ & kUI8_LoByte);
1111  fivebytes |= (ui8 << 32);
1112  }
1113  else {
1114  break;
1115  }
1116  }
1117 
1118  fivebytes &= kUI8_LoFive;
1119  fivebytes <<= 2;
1120  const Uint8 ui8m ((fivebytes & kUI8_SeqDb) >> 16);
1121  fivebytes &= ~kUI8_SeqDb;
1122  fivebytes |= ui8m;
1123 
1124  ++k;
1125  ++gccur;
1126  } while (true);
1127  }
1128 
1129  } // min cDNA length check
1130 
1131  } // genomic or cDNA
1132 
1133  m_qsrc->RetSequence(&pcb);
1134 
1135  current_offset += bases;
1136 
1137  if(mcidx >= mcidx_max) {
1138  CNcbiOstrstream ostr;
1139  ostr << "Selected max volume size is too small: "
1140  << "it must be large enough to fit the index for the "
1141  << "longest input sequence.";
1142  const string err = CNcbiOstrstreamToString(ostr);
1144  }
1145  } // seqdb iteration
1146 
1147  if(mcidx > 0) {
1148  MersAndCoords.resize(mcidx);
1149  x_WriteIndexFile(++volume, mode, strand, MersAndCoords);
1150  mcidx = 0;
1151  }
1152 
1153  m_Mers.Clear();
1154 }
1155 
1156 
1158  size_t volume,
1159  EIndexMode mode,
1160  bool strand,
1161  vector<Uint8>& MersAndCoords)
1162 {
1163  const size_t t0 (time(0));
1164 
1165  string basename;
1166  if(mode == eIM_Genomic) {
1167  basename = m_lbn_s;
1168  }
1169  else {
1170  basename = m_lbn_q;
1171  }
1172  basename += string(strand? ".p": ".m") + ".v" + NStr::NumericToString(volume);
1173  const string filename_offs ( m_FilePath + CDirEntry::GetPathSeparator() + basename + kFileExt_Offsets );
1174 
1175  CNcbiOfstream ofstr_offs (filename_offs.c_str(), IOS_BASE::binary);
1176  Uint8 bytes_offs (0);
1177 
1178  const string filename_positions (m_FilePath + CDirEntry::GetPathSeparator() + basename + kFileExt_Positions);
1179  CNcbiOfstream ofstr_positions (filename_positions.c_str(), IOS_BASE::binary);
1180  Uint8 bytes_positions (0);
1181 
1182  cerr << " Generating index volume: " << basename << " ... ";
1183 
1184  Uint4 curmer (numeric_limits<Uint4>::max());
1185  Uint4 curofs (0);
1186 
1187  sort(MersAndCoords.begin(), MersAndCoords.end());
1188 
1189  ITERATE(vector<Uint8>, ii, MersAndCoords) {
1190 
1191  const Uint4 mer ((*ii & kUI8_HiWord) >> 32);
1192  if(mer != curmer) {
1193 
1194  ofstr_offs.write((const char*) &mer, sizeof mer);
1195  ofstr_offs.write((const char*) &curofs, sizeof curofs);
1196  curmer = mer;
1197  bytes_offs += sizeof(mer) + sizeof(curofs);
1198  }
1199  const Uint4 pos (*ii & kUI8_LoWord);
1200 
1201  ofstr_positions.write((const char*) &pos, sizeof pos);
1202  bytes_positions += sizeof(pos);
1203  ++curofs;
1204  }
1205 
1206  // termination zero mer
1207  const Uint4 mer (0);
1208  ofstr_offs.write((const char*)&mer, sizeof mer);
1209  ofstr_offs.write((const char*)&curofs, sizeof curofs);
1210  bytes_offs += sizeof(mer) + sizeof(curofs);
1211 
1212  ofstr_offs.close();
1213  ofstr_positions.close();
1214 
1215  CheckWrittenFile(filename_offs, bytes_offs);
1216  CheckWrittenFile(filename_positions, bytes_positions);
1217 
1218  cerr << "Ok" << endl;
1219 
1220  return time(0) - t0;
1221 }
1222 
1223 #define CHECK_MEMMAP(mm,ofs,len) \
1224 {{ \
1225  const size_t ofs1 (mm.GetOffset()); \
1226  if(ofs1 != ofs) { \
1227  cerr << "Real offset " << ofs1 << " different from " << ofs << endl; \
1228  } \
1229  const size_t len1 (mm.GetSize()); \
1230  if(len1 != len) { \
1231  cerr << "Real length " << len1 << " different from " << len << endl; \
1232  } \
1233 }}
1234 
1235 
1237 {
1238  m_Mers.Clear();
1239 
1240  cerr << " Matching (strand = " << (strand? "plus": "minus") << ") ... ";
1241  m_CurGenomicStrand = strand;
1242 
1243  CDir dir (m_FilePath);
1244 
1245  const string sfx (string(strand?".p": ".m") + ".v*");
1246 
1247  const string mask_ofs_q (m_lbn_q + sfx + kFileExt_Offsets);
1248  const string mask_ofs_s (m_lbn_s + sfx + kFileExt_Offsets);
1249  CDir::TEntries vols_ofs_q (dir.GetEntries(mask_ofs_q));
1250  CDir::TEntries vols_ofs_s (dir.GetEntries(mask_ofs_s));
1251 
1252  Uint8 elem_hits_total (0);
1253 
1254  ITERATE(CDir::TEntries, ii_s, vols_ofs_s) {
1255 
1256  const string filename_ofs_s ((*ii_s)->GetPath());
1257  const string filename_pos_s (ReplaceExt(filename_ofs_s, kFileExt_Positions));
1258  const CFile file_subj (filename_ofs_s);
1259  const size_t dim_ofs_s (file_subj.GetLength() / 8);
1260 
1261  ITERATE(CDir::TEntries, ii_q, vols_ofs_q) {
1262 
1263  const string filename_ofs_q ((*ii_q)->GetPath());
1264  const string filename_pos_q (ReplaceExt(filename_ofs_q,
1265  kFileExt_Positions));
1266 
1267  Uint8 hit_index_dim (0), elem_hits_this_pair (0);
1268  string filename_hit_index;
1269 
1270  {{
1271 
1272  // map offset files
1273  CMemoryFile mf_ofs_s ((*ii_s)->GetPath());
1274  const Uint8 * ofs_s (reinterpret_cast<const Uint8*>
1275  (mf_ofs_s.Map()));
1276 
1277  const CFile file_query ((*ii_q)->GetPath());
1278  const size_t dim_ofs_q (file_query.GetLength() / 8);
1279 
1280  CMemoryFile mf_ofs_q ((*ii_q)->GetPath());
1281  const Uint8 * ofs_q (reinterpret_cast<const Uint8*>
1282  (mf_ofs_q.Map()));
1283 
1284  // build hit index
1285  THitIndex hit_index;
1286  hit_index.reserve(min(dim_ofs_q, dim_ofs_s));
1287 
1288  while((*ofs_q & kUI8_LoWord) && (*ofs_s & kUI8_LoWord)) {
1289 
1290  while( (*ofs_q & kUI8_LoWord)
1291  && ((*ofs_q & kUI8_LoWord) < (*ofs_s & kUI8_LoWord)))
1292  {
1293  ++ofs_q;
1294  }
1295  if((*ofs_q & kUI8_LoWord) == 0) break;
1296 
1297  while( (*ofs_s & kUI8_LoWord)
1298  && ((*ofs_s & kUI8_LoWord) < (*ofs_q & kUI8_LoWord)))
1299  {
1300  ++ofs_s;
1301  }
1302  if((*ofs_s & kUI8_LoWord) == 0) break;
1303 
1304  if((*ofs_s & kUI8_LoWord) == (*ofs_q & kUI8_LoWord)) {
1305 
1306  hit_index.push_back(SHitIndexEntry());
1307  SHitIndexEntry& elem (hit_index.back());
1308 
1309  elem.m_QueryOfs = (*ofs_q) >> 32;
1310  size_t count ((*(ofs_q + 1) >> 32) - elem.m_QueryOfs);
1311  elem.m_QueryCount = (count > 0xFFFF)? 0xFFFF: count;
1312 
1313  elem.m_SubjOfs = (*ofs_s) >> 32;
1314  count = ((*(ofs_s + 1) >> 32) - elem.m_SubjOfs);
1315  elem.m_SubjCount = (count > 0xFFFF)? 0xFFFF: count;
1316 
1317  ++ofs_s;
1318  ++ofs_q;
1319 
1320  elem_hits_this_pair += elem.m_QueryCount * elem.m_SubjCount;
1321  }
1322  }
1323 
1324  // unload offset files; save hit index
1325  filename_hit_index = g_SaveToTemp(hit_index, m_FilePath);
1326  hit_index_dim = (hit_index.size());
1327  elem_hits_total += elem_hits_this_pair;
1328  }}
1329 
1330  // load position files
1331 
1332  const CFile file_pos_s (filename_pos_s);
1333  const size_t dim_pos_s (file_pos_s.GetLength() / 4);
1334 
1335  CMemoryFile mf_pos_s (filename_pos_s);
1336  size_t map_offset_s (0);
1337  size_t map_length_s (min(kMapGran, 4*dim_pos_s - map_offset_s));
1338  const Uint4 * pos_s (reinterpret_cast<const Uint4*>(
1339  mf_pos_s.Map(map_offset_s, map_length_s)));
1340 
1341  const CFile file_pos_q (filename_pos_q);
1342  const size_t dim_pos_q (file_pos_q.GetLength() / 4);
1343 
1344  CMemoryFile mf_pos_q (filename_pos_q);
1345  size_t map_offset_q (0);
1346  size_t map_length_q (min(kMapGran, 4*dim_pos_q - map_offset_q));
1347  const Uint4 * pos_q (reinterpret_cast<const Uint4*>(
1348  mf_pos_q.Map(map_offset_q, map_length_q)));
1349 
1350  // scan hit index file and build hits
1351  vector<Uint8> hits (elem_hits_this_pair);
1352  size_t hidx (0);
1353 
1354  CNcbiIfstream ifstr (filename_hit_index.c_str(), IOS_BASE::binary);
1355  for(size_t cnt(0); cnt < hit_index_dim; ++cnt) {
1356 
1357  SHitIndexEntry hie;
1358  ifstr.read((char*) &hie, sizeof(hie));
1359 
1360  const size_t idx_s_max (hie.m_SubjOfs + hie.m_SubjCount);
1361  const size_t idx_q_max (hie.m_QueryOfs + hie.m_QueryCount);
1362  if(idx_s_max > dim_pos_s || idx_q_max > dim_pos_q) {
1363  NCBI_THROW(CException, eUnknown, "Coordinate out of scope");
1364  }
1365 
1366  if(4*idx_s_max >= map_offset_s + map_length_s) {
1367 
1368  map_offset_s = 4 * hie.m_SubjOfs;
1369  map_length_s = min(kMapGran, 4*dim_pos_s - map_offset_s);
1370  mf_pos_s.Unmap();
1371  pos_s = (reinterpret_cast<const Uint4*>
1372  (mf_pos_s.Map(map_offset_s, map_length_s)));
1373  }
1374 
1375  if(4*idx_q_max >= map_offset_q + map_length_q) {
1376 
1377  map_offset_q = 4 * hie.m_QueryOfs;
1378  map_length_q = min(kMapGran, 4*dim_pos_q - map_offset_q);
1379  mf_pos_q.Unmap();
1380  pos_q = (reinterpret_cast<const Uint4*>
1381  (mf_pos_q.Map(map_offset_q, map_length_q)));
1382  }
1383 
1384  for(size_t idx_s (hie.m_SubjOfs); idx_s < idx_s_max; ++idx_s) {
1385 
1386  Uint8 hiword = pos_s[idx_s - map_offset_s/4];
1387  hiword <<= 32;
1388  for(size_t idx_q (hie.m_QueryOfs); idx_q < idx_q_max; ++idx_q) {
1389 
1390  const Uint8 loword = pos_q[idx_q - map_offset_q/4];
1391  hits[hidx++] = (hiword | loword);
1392  }
1393  }
1394  }
1395 
1396  if(hidx != elem_hits_this_pair) {
1397  CNcbiOstrstream ostr;
1398  ostr << "The number of hits found (" << hidx
1399  << ") does not match the expected " << elem_hits_this_pair;
1400  const string str = CNcbiOstrstreamToString(ostr);
1402  }
1403 
1404  // remove the hit index file
1405  CFile(filename_hit_index).Remove();
1406 
1407  // detect compartments
1408  x_CompartVolume (&hits);
1409 
1410  } // query vols
1411 
1412  } // subj vols
1413 
1414  cerr << "Ok" << endl;
1415 }
1416 
1417 #undef CHECK_MEMMAP
1418 
1419 
1420 bool PLoWord(const Uint8& lhs, const Uint8& rhs)
1421 {
1422  return (lhs & kUI8_LoWord) < (rhs & kUI8_LoWord);
1423 }
1424 
1425 
1426 bool PDiag(const Uint8& lhs, const Uint8& rhs)
1427 {
1428  const double lhs_q (double(lhs & kUI8_LoWord) / 2);
1429  const double lhs_s (double(lhs >> 32) / 2);
1430  const double rhs_q (double(rhs & kUI8_LoWord) / 2);
1431  const double rhs_s (double(rhs >> 32) / 2);
1432 
1433  const double lhs_q1 (lhs_q + lhs_s);
1434  const double lhs_s1 (lhs_s - lhs_q);
1435  const double rhs_q1 (rhs_q + rhs_s);
1436  const double rhs_s1 (rhs_s - rhs_q);
1437 
1438  if(lhs_s1 == rhs_s1) {
1439  return lhs_q1 < rhs_q1;
1440  }
1441  else {
1442  return lhs_s1 < rhs_s1;
1443  }
1444 }
1445 
1446 
1448  const TStrings& vol_extensions)
1449 {
1450 
1451  // make sure there are no offset or position files left before
1452  // we generate the new ones
1453 
1454  CDir dir (m_FilePath);
1455 
1456  CFileDeleteList fdl;
1457  ITERATE(TStrings, ii, vol_extensions) {
1458 
1459  const string mask (lbn + "*" + *ii);
1460  CDir::TEntries dir_entries (dir.GetEntries(mask));
1461  ITERATE(CDir::TEntries, jj, dir_entries) {
1462  fdl.Add((*jj)->GetPath());
1463  }
1464  }
1465 }
1466 
1467 
1468 void CElementaryMatching::x_CompartVolume(vector<Uint8>* phits)
1469 {
1470  if(phits->size() == 0) {
1471  return;
1472  }
1473 
1474  // sort by the global genomic corrdinate
1475  sort(phits->begin(), phits->end());
1476 
1477  TSeqInfos::const_iterator ii_genomic_b (m_SeqInfos_Genomic.begin());
1478  TSeqInfos::const_iterator ii_genomic_e (m_SeqInfos_Genomic.end());
1479  TSeqInfos::const_iterator ii_genomic (ii_genomic_b);
1480  TSeqInfos::const_iterator ii_cdna_b (m_SeqInfos_cdna.begin());
1481  TSeqInfos::const_iterator ii_cdna_e (m_SeqInfos_cdna.end());
1482 
1483  CSeqDB seqdb_genomic (m_sdb, CSeqDB::eNucleotide);
1484 
1486 
1487  size_t idx_compacted (0);
1488  for(size_t idx (0), idx_hi (phits->size()); idx < idx_hi; ) {
1489 
1490  Uint4 gc_s (((*phits)[idx]) >> 32);
1491 
1492  // find the relevant genomic record
1493  ii_genomic = lower_bound(ii_genomic + 1, ii_genomic_e, SSeqInfo(gc_s, 0, 0));
1494  if(ii_genomic == ii_genomic_e || ii_genomic->m_Start > gc_s) {
1495  --ii_genomic;
1496  }
1497 
1498  if(gc_s < ii_genomic->m_Start ||
1499  ii_genomic->m_Start + ii_genomic->m_Length <= gc_s)
1500  {
1501  CNcbiOstrstream ostr;
1502  ostr << "Global genomic coordinate "
1503  << gc_s << " out of range: ["
1504  << ii_genomic->m_Start << ", "
1505  << (ii_genomic->m_Start + ii_genomic->m_Length) << "), "
1506  << (m_GenomicSeqIds[ii_genomic->m_Oid]->GetSeqIdString(true));
1507  const string str = CNcbiOstrstreamToString(ostr);
1509  }
1510 
1511  const Uint4 gc_s_max (ii_genomic->m_Start + ii_genomic->m_Length);
1512 
1513  // preload genomic sequence
1514  seqdb_genomic.GetSequence(ii_genomic->m_Oid, &m_CurSeq_Genomic);
1515 
1516  // find all hits that belong to the current genomic record
1517  size_t idx0 (idx);
1518  while(idx < idx_hi && (((*phits)[idx]) >> 32) < gc_s_max) {
1519  ++idx;
1520  }
1521 
1522  // sort by the global cDNA coordinate
1523  sort(phits->begin() + idx0, phits->begin() + idx, PLoWord);
1524 
1525  // find the relevant cDNA record
1526  TSeqInfos::const_iterator ii_cdna (ii_cdna_b);
1527  Uint4 gc_q_min (ii_cdna->m_Start);
1528  Uint4 gc_q_max (ii_cdna->m_Start + ii_cdna->m_Length);
1529  size_t jdx (idx0), jdx0 (jdx);
1530  for(; jdx < idx; ++jdx) {
1531 
1532  Uint4 gc_q (((*phits)[jdx]) & kUI8_LoWord);
1533 
1534  if(gc_q < gc_q_min || gc_q >= gc_q_max) {
1535 
1536  const THit::TCoord min_matches1 = THit::TCoord(
1537  round(m_MinCompartmentIdty * ii_cdna->m_Length));
1538 
1539  const THit::TCoord min_matches2 = THit::TCoord(
1540  round(m_MinSingletonIdty * ii_cdna->m_Length));
1541 
1542  const THit::TCoord min_matches (min(min_matches1,min_matches2));
1543 
1544  if(kN * (jdx - jdx0) >= min_matches / 2) {
1545 
1546  // preload genomic sequence
1547  m_qsrc->GetSeq(ii_cdna->m_Oid, &m_CurSeq_cDNA);
1548  x_CompartPair(phits, ii_cdna, ii_genomic,
1549  jdx0, jdx, &idx_compacted);
1551  m_CurSeq_cDNA = 0;
1552  }
1553 
1554  ii_cdna = lower_bound(ii_cdna + 1, ii_cdna_e, SSeqInfo(gc_q, 0, 0));
1555 
1556  if(ii_cdna == ii_cdna_e || ii_cdna->m_Start > gc_q) {
1557  --ii_cdna;
1558  }
1559 
1560  if(gc_q < ii_cdna->m_Start ||
1561  ii_cdna->m_Start + ii_cdna->m_Length <= gc_q)
1562  {
1563  CNcbiOstrstream ostr;
1564  ostr << "Global cDNA coordinate "
1565  << gc_q << " out of range: ["
1566  << ii_cdna->m_Start << ", "
1567  << (ii_cdna->m_Start + ii_cdna->m_Length) << "), "
1568  << (m_cDNASeqIds[ii_cdna->m_Oid]->GetSeqIdString(true));
1569  const string str = CNcbiOstrstreamToString(ostr);
1571  }
1572 
1573  gc_q_min = ii_cdna->m_Start;
1574  gc_q_max = ii_cdna->m_Start + ii_cdna->m_Length;
1575  jdx0 = jdx;
1576  }
1577  }
1578 
1579  const THit::TCoord min_matches1 = THit::TCoord(
1580  round(m_MinCompartmentIdty * ii_cdna->m_Length));
1581 
1582  const THit::TCoord min_matches2 = THit::TCoord(
1583  round(m_MinSingletonIdty * ii_cdna->m_Length));
1584 
1585  const THit::TCoord min_matches (min(min_matches1,min_matches2));
1586 
1587  if(kN * (jdx - jdx0) >= min_matches / 2) {
1588 
1589  m_qsrc->GetSeq(ii_cdna->m_Oid, &m_CurSeq_cDNA);
1590 
1591  x_CompartPair(phits, ii_cdna, ii_genomic,
1592  jdx0, jdx, &idx_compacted);
1594  m_CurSeq_cDNA = 0;
1595  }
1596 
1597  seqdb_genomic.RetSequence(&m_CurSeq_Genomic);
1598  m_CurSeq_Genomic = 0;
1599  }
1600 }
1601 
1602 
1604 {
1605  const Uint1 cq (((m_CurSeq_cDNA[q / 4]) >> ((3 - (q % 4))*2)) & 0x03);
1606  const Uint1 cs (((m_CurSeq_Genomic[s / 4]) >> ((3 - (s % 4))*2)) & 0x03);
1607  const bool rv (m_CurGenomicStrand? (cq == cs): ((cq^cs) == 3));
1608  return rv;
1609 }
1610 
1611 
1613  const Int8 & right_limit0,
1614  THitRef hitref)
1615 {
1616  const int Wm (1), Wms (-2);
1617  int score (int(hitref->GetLength()) * Wm
1618  + int(hitref->GetMismatches()) * (Wms - Wm));
1619  int score_max (score);
1620 
1621  const int overrun (6); // enables connecting hits over a mismatch
1622  const Int8 left_limit (left_limit0 >= overrun? left_limit0 - overrun: 0);
1623  const Int8 right_limit (right_limit0 >= kDiagMax - overrun?
1624  kDiagMax: right_limit0 + overrun);
1625 
1626  // extend left
1627  Int4 q0 (hitref->GetQueryStart()), s0 (hitref->GetSubjStart());
1628  Uint4 mm (0), mm0 (0);
1629  bool no_overrun_yet (true);
1630  for(Int4 q (q0 - 1), s (s0 - 1);
1631  (q + s > left_limit && score + m_XDropOff >= score_max
1632  && q >= m_ii_cdna->m_Start && s >= m_ii_genomic->m_Start);
1633  --q, --s)
1634  {
1635 
1636  if(q + s == left_limit0) {
1637  no_overrun_yet = false;
1638  mm0 += mm;
1639  }
1640 
1641  Uint8 qq (q - m_ii_cdna->m_Start);
1642  Uint8 ss (s - m_ii_genomic->m_Start);
1643  if(m_CurGenomicStrand == false) {
1644  ss = m_ii_genomic->m_Length - ss - 1;
1645  }
1646 
1647  if(x_IsMatch(qq, ss)) {
1648 
1649  score += Wm;
1650  if(score > score_max) {
1651  q0 = q;
1652  s0 = s;
1653  score_max = score;
1654  if(no_overrun_yet) {
1655  mm0 += mm;
1656  mm = 0;
1657  }
1658  }
1659  }
1660  else {
1661  score += Wms;
1662  ++mm;
1663  }
1664  }
1665 
1666  while(q0 + s0 <= left_limit0) {++q0; ++s0;}
1667 
1668  bool extended_left (false);
1669  if(q0 < hitref->GetQueryStart()) {
1670  hitref->SetQueryStart(q0);
1671  hitref->SetSubjStart(s0);
1672  extended_left = true;
1673  }
1674 
1675  // extend right
1676  q0 = (hitref->GetQueryStop());
1677  s0 = (hitref->GetSubjStop());
1678  score = score_max;
1679  mm = mm0;
1680 
1681  no_overrun_yet = true;
1682 
1683  for(Int4 q (q0 + 1), s (s0 + 1);
1684  (q + s < right_limit && score + m_XDropOff >= score_max
1685  && q < m_ii_cdna->m_Start + m_ii_cdna->m_Length
1686  && s < m_ii_genomic->m_Start + m_ii_genomic->m_Length);
1687  ++q, ++s)
1688  {
1689  if(q + s == right_limit0) {
1690  no_overrun_yet = false;
1691  mm0 += mm;
1692  }
1693 
1694  Uint8 qq (q - m_ii_cdna->m_Start);
1695  Uint8 ss (s - m_ii_genomic->m_Start);
1696  if(m_CurGenomicStrand == false) {
1697  ss = m_ii_genomic->m_Length - ss - 1;
1698  }
1699 
1700  if(x_IsMatch(qq, ss)) {
1701 
1702  score += Wm;
1703  if(score > score_max) {
1704  q0 = q;
1705  s0 = s;
1706  score_max = score;
1707  if(no_overrun_yet) {
1708  mm0 += mm;
1709  mm = 0;
1710  }
1711  }
1712  }
1713  else {
1714  score += Wms;
1715  ++mm;
1716  }
1717  }
1718 
1719  while(q0 + s0 >= right_limit0) {--q0; --s0; }
1720 
1721  bool extended_right (false);
1722  if(q0 > hitref->GetQueryStop()) {
1723  hitref->SetQueryStop(q0);
1724  hitref->SetSubjStop(s0);
1725  extended_right = true;
1726  }
1727 
1728  if(extended_left || extended_right) {
1729 
1730  hitref->SetMismatches(mm0);
1731  const THit::TCoord len (hitref->GetQueryStop() - hitref->GetQueryStart() + 1);
1732  hitref->SetLength(len);
1733  hitref->SetIdentity((len - mm0) / double(len));
1734  hitref->SetScore(2 * len);
1735  }
1736 
1737  return q0 + s0;
1738 }
1739 
1740 
1741 void CElementaryMatching::x_CompartPair(vector<Uint8>* pvol,
1742  TSeqInfos::const_iterator ii_cdna,
1743  TSeqInfos::const_iterator ii_genomic,
1744  size_t idx_start,
1745  size_t idx_stop,
1746  size_t* pidx_compacted)
1747 {
1748  if(idx_start >= idx_stop) {
1749  return;
1750  }
1751 
1752  // init "global" members
1753  m_ii_cdna = ii_cdna;
1754  m_ii_genomic = ii_genomic;
1755 
1756  // re-sort for better compactization
1757  sort(pvol->begin() + idx_start, pvol->begin() + idx_stop, PDiag);
1758 
1759  const size_t idx_compacted_start (*pidx_compacted);
1760 
1761  vector<Uint4> lens;
1762  lens.reserve(pvol->size() / 2);
1763 
1764  Uint8 qs0 ((*pvol)[idx_start]);
1765  Uint4 q0 (qs0 & kUI8_LoWord);
1766  Uint4 s0 (qs0 >> 32);
1767  (*pvol)[(*pidx_compacted)++] = qs0;
1768  lens.push_back(16);
1769 
1770  for(size_t idx (idx_start + 1); idx < idx_stop; ++idx) {
1771 
1772  const Uint8 qs ((*pvol)[idx]);
1773  const Uint4 q (qs & kUI8_LoWord);
1774  const Uint4 s (qs >> 32);
1775 
1776  if((q0 >= s0 && q0 - s0 == q - s && q <= q0 + 16) ||
1777  (q0 < s0 && s0 - q0 == s - q && q <= q0 + 16) )
1778  {
1779  lens.back() += q - q0;
1780  }
1781  else {
1782  (*pvol)[(*pidx_compacted)++] = qs;
1783  lens.push_back(16);
1784  }
1785 
1786  q0 = q;
1787  s0 = s;
1788  }
1789 
1790  THitRefs hitrefs (lens.size());
1791 
1792  for(size_t idx (idx_compacted_start); idx < *pidx_compacted; ++idx) {
1793 
1794  const Uint8 qs ((*pvol)[idx]);
1795  const Uint4 q (qs & kUI8_LoWord);
1796  const Uint4 s (qs >> 32);
1797 
1798  THitRef hitref (new THit);
1799  hitref->SetQueryStart(q);
1800  hitref->SetSubjStart(s);
1801  const Uint4 len (lens[idx - idx_compacted_start]);
1802  hitref->SetQueryStop(q + len - 1);
1803  hitref->SetSubjStop(s + len - 1);
1804  hitref->SetMismatches(0);
1805  hitref->SetGaps(0);
1806  hitref->SetEValue(0);
1807  hitref->SetIdentity(1.0);
1808  hitref->SetLength(len);
1809  hitref->SetScore(2*len);
1810  hitrefs[idx - idx_compacted_start] = hitref;
1811  }
1812 
1813 #define EXTEND_USING_SEQUENCE_CHARS
1814 #ifdef EXTEND_USING_SEQUENCE_CHARS
1815 
1816  // try to extend even further over possible mismatches
1817 
1818  Int8 left_limit (0); // diag extension left limit
1819  Int8 right_limit (kDiagMax); // diag ext right limit
1820  Int8 s_prime_cur (0);
1821 
1822  const size_t kn (hitrefs.size());
1823  for(size_t k(0); k < kn; ++k) {
1824 
1825  // diag coords: q_prime = q + s; s_prime = s - q;
1826  const THit::TCoord * box (hitrefs[k]->GetBox());
1827 
1828  if(Int8(box[2]) - Int8(box[0]) != s_prime_cur) {
1829  left_limit = box[0];
1830  s_prime_cur = Int8(box[2]) - Int8(box[0]);
1831  }
1832 
1833  right_limit = kDiagMax;
1834  if(k + 1 < kn) {
1835  const THit::TCoord * boX (hitrefs[k+1]->GetBox());
1836  if(Int8(boX[2]) - Int8(boX[0]) == s_prime_cur) {
1837  right_limit = boX[0] + boX[2];
1838  }
1839  }
1840 
1841  left_limit = x_ExtendHit(left_limit, right_limit, hitrefs[k]);
1842  }
1843 
1844  // merge adjacent hits
1845  int d (-1);
1846  Int8 rlimit (0), spc (0);
1847  for(size_t k (0); k < kn; ++k) {
1848 
1849  const THit::TCoord cur_diag_stop (hitrefs[k]->GetQueryStop()
1850  + hitrefs[k]->GetSubjStop());
1851 
1852  const Int8 s_prime (-Int8(hitrefs[k]->GetQueryStart())
1853  + Int8(hitrefs[k]->GetSubjStart()));
1854  if(k == 0 || spc != s_prime) {
1855 
1856  hitrefs[++d] = hitrefs[k];
1857  rlimit = cur_diag_stop;
1858  spc = s_prime;
1859  }
1860  else {
1861 
1862  const THit::TCoord cur_diag_start (hitrefs[k]->GetQueryStart()
1863  + hitrefs[k]->GetSubjStart());
1864 
1865  THitRef & hrd (hitrefs[d]);
1866 
1867  if(rlimit + 2 == cur_diag_start) {
1868 
1869  rlimit = cur_diag_stop;
1870  hrd->SetQueryStop(hitrefs[k]->GetQueryStop());
1871  hrd->SetSubjStop(hitrefs[k]->GetSubjStop());
1872  hrd->SetMismatches(hrd->GetMismatches()
1873  + hitrefs[k]->GetMismatches());
1874  const THit::TCoord len (hrd->GetQueryStop()
1875  - hrd->GetQueryStart() + 1);
1876  hrd->SetLength(len);
1877  hrd->SetIdentity(double(len - hrd->GetMismatches()) / len);
1878  hrd->SetScore(2 * len);
1879  }
1880  else if (rlimit + 1 < cur_diag_start) {
1881 
1882  hitrefs[++d] = hitrefs[k];
1883  rlimit = cur_diag_stop;
1884  }
1885  else {
1887  "x_CompartPair(): Unexpected alignment overlap");
1888  }
1889  }
1890  }
1891  hitrefs.resize(min(size_t(d + 1), kn));
1892 
1893 #undef EXTEND_USING_SEQUENCE_CHARS
1894 #endif
1895 
1896  // make sure nothing is stretching out
1897  const Uint4 qmax (m_ii_cdna->m_Start + m_ii_cdna->m_Length);
1898  NON_CONST_ITERATE(THitRefs, ii, hitrefs) {
1899  if((*ii)->GetQueryStop() >= qmax) {
1900  (*ii)->Modify(1, qmax - 1);
1901  }
1902  }
1903 
1904  // Remap hit coordinates.
1905  NON_CONST_ITERATE(THitRefs, ii, hitrefs) {
1906  THit& h (**ii);
1907 
1908  h.SetQueryId (m_cDNASeqIds[m_ii_cdna->m_Oid]);
1910 
1911  const Uint4 * box0 (h.GetBox());
1912  Uint4 box [4] = {
1913  box0[0] - m_ii_cdna->m_Start,
1914  box0[1] - m_ii_cdna->m_Start,
1915  box0[2] - m_ii_genomic->m_Start,
1916  box0[3] - m_ii_genomic->m_Start
1917  };
1918 
1919  if(m_CurGenomicStrand == false) {
1920  box[2] = m_ii_genomic->m_Length - box[2] - 1;
1921  box[3] = m_ii_genomic->m_Length - box[3] - 1;
1922  }
1923 
1924  h.SetBox(box);
1925  }
1926 
1927  if (m_HitsOnly) {
1928  ITERATE(THitRefs, ii, hitrefs) {
1929  const THit& h (**ii);
1930  if(h.GetLength() < m_MinHitLength) continue;
1931  cout << h << endl;
1932  }
1933  } else {
1934 
1935  // identify compartments
1936  const Uint4 qlen (m_ii_cdna->m_Length);
1937 
1938  const THit::TCoord min_matches1 = THit::TCoord(
1939  round(m_MinCompartmentIdty * qlen));
1940 
1941  const THit::TCoord min_matches2 = THit::TCoord(
1942  round(m_MinSingletonIdty * qlen));
1943 
1944  const THit::TCoord penalty = THit::TCoord(round(m_Penalty * qlen));
1945 
1946  CCompartmentAccessor<THit> ca(penalty,
1947  min_matches1,
1948  min_matches2,
1949  true);
1951  ca.Run(hitrefs.begin(), hitrefs.end());
1952 
1953  if (m_OutputMethod) {
1954  // print individual compartments
1955  THitRefs comp;
1956  for(bool b0 (ca.GetFirst(comp)); b0; b0 = ca.GetNext(comp)) {
1957  ITERATE(THitRefs, ii, comp) {
1958  const THit& h (**ii);
1959  cout << h << endl;
1960  }
1961  // empty line to separate compartments
1962  cout << endl;
1963  }
1964  } else {
1965  TResults pair_results = ca.AsSeqAlignSet();
1966  if (! m_Results) { // Store first results, or append to existing?
1967  m_Results = pair_results;
1968  } else {
1969  m_Results->Set().insert(m_Results->Set().end(),
1970  pair_results->Set().begin(),
1971  pair_results->Set().end());
1972  }
1973  }
1974  }
1975 }
1976 
1977 
1979 {
1980  const string filename_genomic (m_FilePath + CDirEntry::GetPathSeparator() + m_lbn_s + kFileExt_Remap);
1981  const size_t elems_genomic (CFile(filename_genomic).GetLength()/sizeof(SSeqInfo));
1982  m_SeqInfos_Genomic.resize(elems_genomic);
1983  CNcbiIfstream ifstr_genomic (filename_genomic.c_str(), IOS_BASE::binary);
1984  ifstr_genomic.read((char *) &m_SeqInfos_Genomic.front(),
1985  elems_genomic * sizeof (SSeqInfo));
1986  ifstr_genomic.close();
1987 
1988  const string filename_cdna (m_FilePath + CDirEntry::GetPathSeparator() + m_lbn_q + kFileExt_Remap);
1989  const size_t elems_cdna (CFile(filename_cdna).GetLength()/sizeof(SSeqInfo));
1990  m_SeqInfos_cdna.resize(elems_cdna);
1991  CNcbiIfstream ifstr_cdna (filename_cdna.c_str(), IOS_BASE::binary);
1992  ifstr_cdna.read((char *) &m_SeqInfos_cdna.front(),
1993  elems_cdna * sizeof (SSeqInfo));
1994  ifstr_cdna.close();
1995 
1996  {{
1997  CSeqDB blastdb (sdb, CSeqDB::eNucleotide);
1998  m_GenomicSeqIds.clear();
1999  for(int oid (0); blastdb.CheckOrFindOID(oid); ++oid) {
2000  THit::TId id (blastdb.GetSeqIDs(oid).front());
2001  m_GenomicSeqIds.push_back(id);
2002  }
2003  }}
2004 
2005  {{
2006  m_cDNASeqIds.clear();
2007  for(m_qsrc->ResetIndex(); m_qsrc->GetNext(); ) {
2008 
2009  THit::TId id (m_qsrc->GetSeqID());
2010  m_cDNASeqIds.push_back(id);
2011  }
2012  }}
2013 }
2014 
2015 
2016 ////////////////////////////////////////////////////////////////////////////////////
2017 ////
2018 
2019 
2021 {
2022  CRandom::TValue rv (0);
2023  ITERATE(string, ii, str) {
2024  rv = (rv * 3 + (*ii)) % 3571;
2025  }
2026  CRandom::TValue t = Convert(time(0));
2027  return t - 5000 + rv;
2028 }
2029 
2030 
2032 {
2033  CRandom rand (GenerateSeed("qq" + m_sdb));
2034  const string base_sfx ( NStr::NumericToString(rand.GetRand()));
2035  m_lbn_q = GetLocalBaseName("qq", base_sfx);
2036  m_lbn_s = GetLocalBaseName(m_sdb, base_sfx);
2037 
2038  m_OutputMethod = false;
2039 
2040  m_Penalty = 0.55;
2043  m_HitsOnly = false;
2044  m_MaxVolSize = 512 * 1024 * 1024;
2045 
2046  m_MinQueryLength = 50;
2047  m_MaxQueryLength = 500000;
2048  m_MinHitLength = 1;
2049 }
2050 
2051 
2053 {
2054  x_Cleanup();
2055 
2056  // create genomic ID and coordinate remapping tables.
2058 
2059  // create the filtering (repeats + low complexity) table
2060  // using the plus strand of the genomic sequence;
2062 
2063  // create cDNA ID and coordinate remapping tables;
2065 
2066  // create cDNA index using the current filtering table;
2067  x_CreateIndex(m_qsrc, eIM_cDNA, true);
2068 
2069  // init the N-mer participation vector;
2071 
2072  // create plus-strand genomic index using the participation vector
2074 
2075  // generate N-hits for both strands of the genome;
2076  // compact and identify compartments;
2077  // restore original coordinates and IDs in final hits;
2078  // print compartments
2080 
2081  x_Search(true);
2082 
2083  TStrings vol_exts;
2084  vol_exts.push_back(kFileExt_Offsets);
2085  vol_exts.push_back(kFileExt_Positions);
2086  x_CleanVolumes(m_lbn_q, vol_exts);
2087  x_CleanVolumes(m_lbn_s, vol_exts);
2088 
2089  // create the filtering table for the minus genomic strand
2090  // from the plus strand filtering table
2091  x_InitFilteringVector(m_sdb, false);
2092 
2093  // repeat the steps to create cDNA and genomic indices
2094  x_CreateIndex(m_qsrc, eIM_cDNA, false);
2096  x_CreateIndex(m_sdb, eIM_Genomic, false);
2097 
2098  x_Search(false);
2099 }
2100 
2101 
2103 {
2104  m_Mers.Clear();
2105  TStrings vol_exts;
2106  vol_exts.push_back(kFileExt_Offsets);
2107  vol_exts.push_back(kFileExt_Positions);
2108  vol_exts.push_back(kFileExt_Masked);
2109  vol_exts.push_back(kFileExt_Remap);
2110  x_CleanVolumes(m_lbn_q, vol_exts);
2111  x_CleanVolumes(m_lbn_s, vol_exts);
2112  m_Results.Reset();
2113 }
2114 
2115 
T round(const T &v)
ncbi::TMaskedQueryRegions mask
const TCoord * GetBox(void) const
void SetSubjId(const TId &id)
void SetBox(const TCoord box[4])
void SetQueryId(const TId &id)
static CSafeStatic< vector< CSeq_id_Handle > > s_ids
TParent::TCoord TCoord
TCoord GetLength(void) const
void Run(typename THitRefs::iterator start, typename THitRefs::iterator finish, CScope *scope=NULL, const vector< pair< TCoord, TCoord > > *gaps=NULL)
Execute: identify compartments.
void SetMaxIntron(TCoord mi)
Assign the maximum intron length, in base pairs.
bool GetFirst(THitRefs &compartment)
Initialize iteration over the results.
CRef< objects::CSeq_align_set > AsSeqAlignSet(void) const
Retrieve all valid compartments in a seq-align-set.
bool GetNext(THitRefs &compartment)
Proceed with iteration over the results.
static TCoord s_GetDefaultMaxIntron(void)
Retrieve the default maximum length of an intron.
CDir –.
Definition: ncbifile.hpp:1696
bool get_at(const Uint8 idx) const
void Init(Uint8 dim_bits, bool init_value)
const Uint8 * GetBuffer(void) const
vector< SSeqInfo > TSeqInfos
void x_LoadRemapData(ISequenceSource *m_qsrc, const string &sdb)
void x_InitParticipationVector(bool strand)
void x_CleanVolumes(const string &lbn, const TStrings &vol_extensions)
void x_Search(bool strand)
vector< THitRef > THitRefs
void x_CreateRemapData(const string &db, EIndexMode mode)
vector< string > TStrings
TSeqInfos::const_iterator m_ii_cdna
size_t x_WriteIndexFile(size_t volume, EIndexMode mode, bool strand, vector< Uint8 > &MersAndCoords)
ISequenceSource * m_qsrc
static bool s_IsLowComplexity(size_t key)
Int8 x_ExtendHit(const Int8 &left_limit, const Int8 &right_limit, THitRef hitref)
bool x_IsMatch(Uint8 q, Uint8 s) const
vector< SHitIndexEntry > THitIndex
const char * m_CurSeq_Genomic
void x_CreateIndex(const string &db, EIndexMode index_more, bool strand)
void x_CompartPair(vector< Uint8 > *pvol, TSeqInfos::const_iterator ii_cdna, TSeqInfos::const_iterator ii_genomic, size_t idx_start, size_t idx_stop, size_t *pidx_compacted)
void x_InitFilteringVector(const string &sdb, bool strand)
void x_CompartVolume(vector< Uint8 > *phits)
TSeqInfos::const_iterator m_ii_genomic
CFileDeleteList –.
Definition: ncbifile.hpp:2287
CFile –.
Definition: ncbifile.hpp:1605
CMemoryFile –.
Definition: ncbifile.hpp:2861
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
Definition: ncbistre.hpp:802
CRandom::
Definition: random_gen.hpp:66
CSafeStatic<>::
CSeqDB.
Definition: seqdb.hpp:161
Uint8 GetTotalLength() const
Returns the sum of the lengths of all available sequences.
Definition: seqdb.cpp:685
list< CRef< CSeq_id > > GetSeqIDs(int oid) const
Gets a list of sequence identifiers.
Definition: seqdb.cpp:765
int GetSeqLength(int oid) const
Returns the sequence length in base pairs or residues.
Definition: seqdb.cpp:400
@ eNucleotide
Definition: seqdb.hpp:175
void RetSequence(const char **buffer) const
Returns any resources associated with the sequence.
Definition: seqdb.cpp:523
int GetNumSeqs() const
Returns the number of sequences available.
Definition: seqdb.cpp:670
int GetSequence(int oid, const char **buffer) const
Get a pointer to raw sequence data.
Definition: seqdb.cpp:530
bool CheckOrFindOID(int &next_oid) const
Find an included OID, incrementing next_oid if necessary.
Definition: seqdb.cpp:728
virtual CConstRef< CSeq_id > GetSeqID(int idx)
virtual Uint8 GetTotalLength(void)
virtual int GetSeqLength(int idx)
virtual bool GetNext(void)
virtual int GetNumSeqs(void)
virtual void ResetIndex(void)
virtual int GetSeq(const char **buffer)
virtual int GetCurrentIndex()
virtual void RetSequence(const char **)
void g_RestoreFromTemp(const string &filename, VectorT *pvd)
string GetLocalBaseName(const string &extended_name, const string &sfx)
#define QCOMP_COUNT_NrMERS
bool PLoWord(const Uint8 &lhs, const Uint8 &rhs)
void CheckWrittenFile(const string &filename, const Uint8 &len_bytes)
char DecodeSeqDbChar(Uint1 c)
CRandom::TValue GenerateSeed(const string &str)
string ReplaceExt(const string &extended_name, const string &new_ext)
string g_SaveToTemp(const VectorT &v, const string &path)
#define QCOMP_CREATE_GENOMIC_IDX(w8, gccur)
T ReverseAndComplement(T v)
#define QCOMP_PREPARE_SHIFTED_GENOMIC_IDX
bool PDiag(const Uint8 &lhs, const Uint8 &rhs)
#define T(s)
Definition: common.h:230
static const char * str(char *buf, int n)
Definition: stats.c:84
#define basename(path)
Definition: replacements.h:116
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
string
Definition: cgiapp.hpp:687
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
TEntries GetEntries(const string &mask=kEmptyStr, TGetEntriesFlags flags=0) const
Get directory entries based on the specified "mask".
Definition: ncbifile.cpp:3846
static string GetTmpNameEx(const string &dir=kEmptyStr, const string &prefix=kEmptyStr, ETmpFileCreationMode mode=eTmpFileGetName)
Get temporary file name.
Definition: ncbifile.cpp:2940
void * Map(TOffsetType offset=0, size_t length=0)
Map file.
Definition: ncbifile.cpp:6127
Int8 GetLength(void) const
Get size of file.
Definition: ncbifile.cpp:3204
virtual bool Remove(TRemoveFlags flags=eRecursive) const
Remove a directory entry.
Definition: ncbifile.cpp:2595
bool Unmap(void)
Unmap file if mapped.
Definition: ncbifile.cpp:6138
static char GetPathSeparator(void)
Get path separator symbol specific for the current platform.
Definition: ncbifile.cpp:433
list< TEntry > TEntries
Definition: ncbifile.hpp:1751
void Add(const string &path)
Add a path for later deletion.
Definition: ncbifile.hpp:4166
static void SplitPath(const string &path, string *dir=0, string *base=0, string *ext=0)
Split a path string into its basic components.
Definition: ncbifile.cpp:358
@ eCreate
Create new file or rewrite existent with zeros.
Definition: ncbifile.hpp:2445
@ eMMS_Shared
Changes are shared.
Definition: ncbifile.hpp:2439
@ eMMP_Write
Data can be written.
Definition: ncbifile.hpp:2433
@ eUnknown
Definition: app_popup.hpp:72
string GetSeqIdString(bool with_version=false) const
Return seqid string with optional version for text seqid type.
Definition: Seq_id.cpp:2145
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
unsigned int uintptr_t
Definition: ncbitype.h:197
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
Uint4 TValue
Type of the generated integer value and/or the seed value.
Definition: random_gen.hpp:69
TValue GetRand(void)
Get the next random number in the interval [0..GetMax()] (inclusive)
Definition: random_gen.hpp:238
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::ofstream CNcbiOfstream
Portable alias for ofstream.
Definition: ncbistre.hpp:500
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
sat & gt
int i
yy_size_t n
int len
constexpr auto sort(_Init &&init)
mdb_mode_t mode
Definition: lmdb++.h:38
double value_type
The numeric datatype used by the parser.
Definition: muParserDef.h:228
const struct ncbi::grid::netcache::search::fields::KEY key
EIPRangeType t
Definition: ncbi_localip.c:101
void SleepSec(unsigned long sec, EInterruptOnSignal onsignal=eRestartOnSignal)
Sleep.
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
T max(T x_, T y_)
T min(T x_, T y_)
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
static unsigned cnt[256]
#define count
Defines BLAST database access classes.
static const char * kN
Definition: showdefline.cpp:85
const value_slice::CValueConvert< value_slice::SRunTimeCP, FROM > Convert(const FROM &value)
Modified on Wed Sep 04 15:04:21 2024 by modify_doxy.py rev. 669887