NCBI C++ ToolKit
srsearch_app.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: srsearch_app.cpp 92022 2020-12-17 15:27:44Z grichenk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Aleksandr Morgulis
27  *
28  * File Description:
29  * Implementation of class CSRSearchApplication.
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 
35 #include <algorithm>
36 
38 #include <objmgr/util/sequence.hpp>
39 
43 
44 #include "srsearch_app.hpp"
45 
47 USING_SCOPE( blastdbindex );
48 USING_SCOPE( dbindex_search );
49 
50 void PrintResults(
51  CNcbiOstream & ostream,
52  const vector< string > & idmap,
53  CDbIndex::TSeqNum qnum,
54  const vector< CSRSearch::SResultData > & results,
55  const string & idstr1, const string & idstr2 = "" );
56 
57 //------------------------------------------------------------------------------
58 class CRCache
59 {
60  public: // for Solaris only
61 
63 
64  struct SDataItem
65  {
68  vector< CSRSearch::SResultData > * r1;
69  vector< CSRSearch::SResultData > * r2;
70  string id1;
71  string id2;
72  };
73 
74  public:
75 
76  CRCache( Uint4 max_results_per_query )
77  : max_res_( max_results_per_query ),
78  rv_pool( max_results_per_query )
79  {}
80 
81  void update(
82  TSeqNum query,
83  CSRSearch::TResults & results,
84  const string & idstr1, const string & idstr2 );
85 
86  void updateSIdMap( const vector< string > & idmap, TSeqNum start )
87  {
88  if( sidmap.size() < start + idmap.size() ) {
89  sidmap.resize( start + idmap.size(), "unknown" );
90  }
91 
92  for( TSeqNum i = start; i < start + idmap.size(); ++i ) {
93  sidmap[i] = idmap[i - start];
94  }
95  }
96 
97  const vector< string > & getSIdMap() const { return sidmap; }
98 
99  void dump( CNcbiOstream & ostream );
100 
102  { return data_pool[q].l1; }
103 
105  { return data_pool[q].l2; }
106 
108  {
109  vector< CSRSearch::SResultData > * r = data_pool[q].r1;
110  return (r == 0) ? 0 : r->size();
111  }
112 
114  {
115  vector< CSRSearch::SResultData > * r = data_pool[q].r2;
116  return (r == 0) ? 0 : r->size();
117  }
118 
119  private:
120 
121  class CRVPool
122  {
123  const static Uint4 BLOCK_SIZE = 1024UL*1024UL;
124  const static Uint4 BLOCKS_RESERVE = 1024UL;
125 
126  typedef vector< CSRSearch::SResultData > TItem;
127  typedef vector< TItem > TBlock;
128  typedef vector< TBlock > TBlocks;
129 
130  public:
131 
132  CRVPool( Uint4 mres )
133  : max_res( mres ), num_items( BLOCK_SIZE )
134  {
135  blocks.reserve( BLOCKS_RESERVE );
136  }
137 
139  {
140  if( num_items == BLOCK_SIZE ) newBlock();
141  return &(*blocks.rbegin())[num_items++];
142  }
143 
144  private:
145 
146  void newBlock()
147  {
148  blocks.push_back( TBlock( BLOCK_SIZE ) );
149  TBlock & b = *blocks.rbegin();
150 
151  for( TBlock::iterator i = b.begin(); i != b.end(); ++i ) {
152  i->reserve( max_res );
153  }
154 
155  num_items = 0;
156  }
157 
161  };
162 
163  class CDataPool
164  {
165  const static Uint4 BLOCK_SIZE = 1024UL*1024UL;
166  const static Uint4 BLOCKS_RESERVE = 1024UL;
167 
168  const static Uint4 BLOCK_SHIFT = 20;
169  const static Uint4 BLOCK_MASK = ((1UL<<BLOCK_SHIFT) - 1);
170 
171  typedef vector< SDataItem > TBlock;
172  typedef vector< TBlock > TBlocks;
173 
174  public:
175 
176  CDataPool() : q_max( 0 ), num_blocks( 0 )
177  {
178  blocks.reserve( BLOCKS_RESERVE );
179  }
180 
182  {
183  ensure( q );
184  return at( q );
185  }
186 
187  TSeqNum size() const { return q_max; }
188 
189  private:
190 
192  { return blocks[q>>BLOCK_SHIFT][q&BLOCK_MASK]; }
193 
194  void ensure( TSeqNum q )
195  {
196  Uint4 block = (q>>BLOCK_SHIFT);
197 
198  while( block >= num_blocks ) {
199  blocks.push_back( TBlock( BLOCK_SIZE ) );
200  ++num_blocks;
201  }
202 
203  while( q >= q_max ) {
204  SDataItem & i = at( q_max );
205  i.l1 = i.l2 = CSRSearch::EM;
206  i.r1 = i.r2 = 0;
207  ++q_max;
208  }
209  }
210 
214  };
215 
219  vector< string > idmap1;
220  vector< string > idmap2;
221  vector< string > sidmap;
222 };
223 
224 //------------------------------------------------------------------------------
225 void CRCache::dump( CNcbiOstream & ostream )
226 {
227  for( TSeqNum i = 0; i < data_pool.size(); ++i ) {
228  SDataItem & data = data_pool[i];
229 
230  if( data.r1 != 0 ) {
231  PrintResults( ostream, sidmap, i, *data.r1, data.id1 );
232  }
233 
234  if( data.r2 != 0 ) {
235  PrintResults( ostream, sidmap, i, *data.r2, "", data.id2 );
236  }
237  }
238 }
239 
240 //------------------------------------------------------------------------------
243  const string & idstr1, const string & idstr2 )
244 {
245  if( res.res.empty() ) return;
246 
248 
249  if( data.r1 == 0 ) data.r1 = rv_pool.newItem();
250  if( data.r2 == 0 ) data.r2 = rv_pool.newItem();
251 
252  if( res.res.size() == res.nres_1 ) {
253  if( data.l1 > res.level_1 ) data.r1->swap( res.res );
254  else {
255  for( Uint4 i = 0; i < res.res.size(); ++i ) {
256  data.r1->push_back( res.res[i] );
257  }
258  }
259 
260  if( data.l2 > res.level_2 ) data.r2->clear();
261  data.l1 = res.level_1;
262  }
263  else if( data.l1 < CSRSearch::SE ) {}
264  else if( res.nres_1 == 0 ) {
265  if( data.l2 > res.level_2 ) data.r2->swap( res.res );
266  else {
267  for( Uint4 i = 0; i < res.res.size(); ++i ) {
268  data.r2->push_back( res.res[i] );
269  }
270  }
271 
272  if( data.l1 > res.level_1 ) data.r1->clear();
273  data.l2 = res.level_2;
274  }
275  else {
276  if( data.l1 > res.level_1 ) data.r1->clear();
277  if( data.l2 > res.level_2 ) data.r2->clear();
278 
279  Uint4 sz = res.nres_1;
280 
281  for( Uint4 i = 0 ; i < sz; ++i ) {
282  data.r1->push_back( res.res[i] );
283  }
284 
285  sz = res.res.size();
286 
287  for( Uint4 i = res.nres_1; i < sz; ++i ) {
288  data.r2->push_back( res.res[i] );
289  }
290 
291  data.l1 = res.level_1;
292  data.l2 = res.level_2;
293  }
294 
295  if( !data.r1->empty() ) data.id1 = idstr1;
296  if( !data.r2->empty() ) data.id2 = idstr2;
297 }
298 
299 //------------------------------------------------------------------------------
300 const char * const CSRSearchApplication::USAGE_LINE =
301  "Search for close matches to short sequences.";
302 
303 //------------------------------------------------------------------------------
305 {
306  unique_ptr< CArgDescriptions > arg_desc( new CArgDescriptions );
307  arg_desc->SetUsageContext(
308  GetArguments().GetProgramBasename(), USAGE_LINE );
309  arg_desc->AddKey(
310  "input", "input_file_name", "input file name",
312  arg_desc->AddOptionalKey(
313  "input1", "paired_input_file_name",
314  "file containing query sequence pairs",
316  arg_desc->AddOptionalKey(
317  "output", "output_file_name", "output file name",
319  arg_desc->AddOptionalKey(
320  "pair_distance", "pair_distance",
321  "distance between query pairs",
323  arg_desc->AddOptionalKey(
324  "pair_distance_fuzz", "pair_distance_fuzz",
325  "how much deviation from pair_distance is allowed",
327  arg_desc->AddDefaultKey(
328  "mismatch", "allow_mismatch",
329  "flag to allow one mismatch",
330  CArgDescriptions::eBoolean, "false" );
331  arg_desc->AddDefaultKey(
332  "nomap", "no_mmap_index",
333  "read index rather than mmap()'ing it.",
334  CArgDescriptions::eBoolean, "false" );
335  arg_desc->AddKey(
336  "index", "index_name", "index file name",
338  arg_desc->AddDefaultKey(
339  "start_vol", "index_volume",
340  "the first index volume to process",
342  arg_desc->AddDefaultKey(
343  "end_vol", "index_volume",
344  "one past the last index volume to process",
346  arg_desc->AddDefaultKey(
347  "restrict_matches", "number_of_matches",
348  "restrict the number of matches per query to at most this number",
350  arg_desc->AddDefaultKey(
351  "noid", "use_ordinal",
352  "use ordinal numbers for queries and database in output",
353  CArgDescriptions::eBoolean, "false" );
354  SetupArgDescriptions( arg_desc.release() );
355 }
356 
358 
360 
361 //------------------------------------------------------------------------------
/** Compose the file name of one index volume.

    The name has the form "<prefix>.<NN>.idx", where NN is the volume
    number zero-padded to at least two digits. Volume numbers with more
    than two digits are written out in full.

    @param prefix   [I] common prefix of all index volume names
    @param vol      [I] volume number

    @return the index volume file name
*/
string MakeIndexName( const string & prefix, Uint4 vol )
{
    // Sized for any 32-bit value (10 digits + terminator) so that volumes
    // >= 100 are not cut down to their first two digits, which would make
    // e.g. volume 100 alias volume 10.
    char volstr[16];
    snprintf( volstr, sizeof( volstr ), "%02u",
              static_cast< unsigned int >( vol ) );
    return prefix + "." + volstr + ".idx";
}
369 
370 //------------------------------------------------------------------------------
372  CNcbiOstream & ostream, const vector< string > & idmap,
373  CDbIndex::TSeqNum qnum,
374  const vector< CSRSearch::SResultData > & results,
375  const string & qidstr1, const string & qidstr2 )
376 {
377  typedef vector< CSRSearch::SResultData > TRes;
378 
379  for( TRes::const_iterator i = results.begin();
380  i != results.end(); ++i ) {
381  const string & qidstr = (i->type == 2) ? qidstr2 : qidstr1;
382  ostream << (int)i->type << "\t";
383 
384  if( qidstr.empty() ) {
385  ostream << qnum << "\t" << i->snum << "\t";
386  }
387  else {
388  ostream << qidstr << "\t";
389 
390  if( i->snum < idmap.size() ) ostream << idmap[i->snum] << "\t";
391  else ostream << "unknown" << "\t";
392  }
393 
394  ostream << i->spos_1 << "\t"
395  << ((i->fw_strand_1 == 0) ? '-' : '+') << "\t"
396  << i->mpos_1 << "\t"
397  << (char)i->mbase_1;
398 
399  if( i->pair ) {
400  ostream << "\t"
401  << i->spos_2 << "\t"
402  << ((i->fw_strand_2 == 0) ? '-' : '+') << "\t"
403  << i->mpos_2 << "\t"
404  << (char)i->mbase_2;
405  }
406 
407  ostream << "\n";
408  }
409 }
410 
411 //------------------------------------------------------------------------------
412 CSeqVector ExtractSeqVector( TSeqData & sd, bool noid, string & idstr )
413 {
414  objects::CSeq_entry * entry = sd.seq_entry_.GetPointerOrNull();
415 
416  if( entry == 0 ||
417  entry->Which() != objects::CSeq_entry_Base::e_Seq ) {
418  NCBI_THROW(
419  CDbIndex_Exception, eBadOption,
420  "input seq-entry is NULL or not a sequence" );
421  }
422 
423  if( !noid ){
424  objects::CScope scope( *om );
425  objects::CSeq_entry_Handle seh = scope.AddTopLevelSeqEntry( *entry );
426  objects::CBioseq_Handle bsh = seh.GetSeq();
427  idstr = objects::sequence::GetTitle( bsh );
428  Uint4 pos = idstr.find_first_of( " \t" );
429  idstr = idstr.substr( 0, pos );
430  }
431 
432  return CSeqVector(
433  entry->SetSeq(), 0, objects::CBioseq_Handle::eCoding_Iupac );
434 }
435 
436 //------------------------------------------------------------------------------
438 {
439  om.Reset( objects::CObjectManager::GetInstance() );
440  CNcbiOstream * ostream = 0;
441 
442  if( GetArgs()["output"] ) {
443  ostream = new CNcbiOfstream( GetArgs()["output"].AsString().c_str() );
444  }
445  else ostream = &NcbiCout;
446 
447  Uint4 pd = 0, pdfuzz = 0;
448 
449  if( GetArgs()["input1"] ) {
450  if( !GetArgs()["pair_distance"] ) {
451  ERR_POST( Error << "-pair_distance must be provided for paired input" );
452  exit( 1 );
453  }
454 
455  pd = GetArgs()["pair_distance"].AsInteger();
456 
457  if( GetArgs()["pair_distance_fuzz"] )
458  pdfuzz = GetArgs()["pair_distance_fuzz"].AsInteger();
459  else pdfuzz = pd/2;
460 
461  if( pdfuzz > pd ) {
462  ERR_POST( Error << "the value of -pair_distance_fuzz can not be greater "
463  << "than the value of -pair_distance" );
464  exit( 1 );
465  }
466  }
467 
468  string index_prefix = GetArgs()["index"].AsString();
469 
470  Uint4 start_vol = GetArgs()["start_vol"].AsInteger();
471  Uint4 end_vol = GetArgs()["end_vol"].AsInteger();
472 
473  bool noid = GetArgs()["noid"].AsBoolean();
474 
475  Uint4 nr = GetArgs()["restrict_matches"].AsInteger();
476  bool use_cache = (nr != 0);
477  if( nr == 0 ) nr = 0xFFFFFFFF;
478  bool mismatch = GetArgs()["mismatch"].AsBoolean();
479  bool nomap = GetArgs()["nomap"].AsBoolean();
480 
481  CRCache rcache( nr );
482 
483  while( true ) {
484  string index_name = MakeIndexName( index_prefix, start_vol );
485  cerr << "searching volume " << index_name << endl;
486  CRef< CDbIndex > index( null );
487  try {
488  index = CDbIndex::Load( index_name, nomap );
489  }
490  catch( ... ) {}
491  if( index == 0 ) break;
492  rcache.updateSIdMap( index->getIdMap(), index->getStartOId() );
493  CRef< CSRSearch > search_obj =
494  CSRSearch::MakeSRSearch( index, pd, pdfuzz );
495  CSequenceIStream * iseqstream = new CSequenceIStreamFasta(
496  ( GetArgs()["input"].AsString() ) );
497  CSequenceIStream * iseqstream1 = 0;
498 
499  if( GetArgs()["input1"] ) {
500  iseqstream1 = new CSequenceIStreamFasta(
501  GetArgs()["input1"].AsString() );
502  }
503 
504  bool paired = (iseqstream1 != 0);
505  CDbIndex::TSeqNum seq_counter = 0;
506 
507  while( true ) {
508  CRef< TSeqData > seq_data( iseqstream->next() );
509  TSeqData * sd = seq_data.GetNonNullPointer();
510  if( !*sd ) break;
511  CSRSearch::TResults results;
512  string qidstr1, qidstr2;
513  CSeqVector seq = ExtractSeqVector( *sd, noid, qidstr1 );
514  CSeqVector seq1;
515  Uint4 s1 = rcache.getNRes1( seq_counter );
516  Uint4 s2 = rcache.getNRes2( seq_counter );
517  CSRSearch::ELevel l1 = rcache.getLevel1( seq_counter );
518  CSRSearch::ELevel l2 = rcache.getLevel2( seq_counter );
519 
520  if( paired ) {
521  CRef< TSeqData > seq_data1( iseqstream1->next() );
522  TSeqData * sd1 = seq_data1.GetNonNullPointer();
523 
524  if( !*sd1 ) {
525  ERR_POST( Error << "failed to read a pair to sequence "
526  << seq_counter );
527  exit( 1 );
528  }
529 
530  seq1 = ExtractSeqVector( *sd1, noid, qidstr2 );
531  CSRSearch::SSearchData sdata(
532  seq, seq1, nr, s1, s2, l1, l2, !mismatch );
533  search_obj->search( sdata, results );
534  }
535  else {
536  CSRSearch::SSearchData sdata(
537  seq, nr, s1, s2, l1, l2, !mismatch );
538  search_obj->search( sdata, results );
539  }
540 
541  if( use_cache ) {
542  rcache.update( seq_counter, results, qidstr1, qidstr2 );
543  }
544  else {
545  PrintResults( *ostream, rcache.getSIdMap(), seq_counter, results.res, qidstr1, qidstr2 );
546  }
547 
548  ++seq_counter;
549 
550  if( seq_counter%100000 == 0 ) {
551  cerr << seq_counter << " sequences processed" << endl;
552  }
553  }
554 
555  if( ++start_vol == end_vol ) break;
556  }
557 
558  if( use_cache ) rcache.dump( *ostream );
559  *ostream << flush;
560  return 0;
561 }
562 
CArgDescriptions –.
Definition: ncbiargs.hpp:541
Types of exception the indexing library can throw.
Definition: dbindex.hpp:409
const vector< string > & getIdMap() const
Definition: dbindex.hpp:941
static CRef< CDbIndex > Load(const std::string &fname, bool nomap=false)
Load index.
Definition: dbindex.cpp:415
CSequenceIStream::TStreamPos TSeqNum
Type used to enumerate sequences in the index.
Definition: dbindex.hpp:484
TSeqNum getStartOId() const
Definition: dbindex.hpp:869
static const Uint4 BLOCK_SIZE
vector< TBlock > TBlocks
void ensure(TSeqNum q)
SDataItem & at(TSeqNum q)
static const Uint4 BLOCKS_RESERVE
SDataItem & operator[](TSeqNum q)
vector< SDataItem > TBlock
static const Uint4 BLOCK_SHIFT
static const Uint4 BLOCK_MASK
TSeqNum size() const
vector< TItem > TBlock
static const Uint4 BLOCK_SIZE
vector< TBlock > TBlocks
vector< CSRSearch::SResultData > TItem
CRVPool(Uint4 mres)
static const Uint4 BLOCKS_RESERVE
Uint4 max_res_
CDataPool data_pool
CSRSearch::ELevel getLevel2(TSeqNum q)
const vector< string > & getSIdMap() const
Uint4 getNRes1(TSeqNum q)
CSRSearch::ELevel getLevel1(TSeqNum q)
void updateSIdMap(const vector< string > &idmap, TSeqNum start)
vector< string > idmap2
CRVPool rv_pool
CDbIndex::TSeqNum TSeqNum
Uint4 getNRes2(TSeqNum q)
vector< string > idmap1
void update(TSeqNum query, CSRSearch::TResults &results, const string &idstr1, const string &idstr2)
vector< string > sidmap
void dump(CNcbiOstream &ostream)
CRCache(Uint4 max_results_per_query)
virtual int Run()
Application main procedure.
static const char *const USAGE_LINE
String containing program usage information.
virtual void Init()
Application initialization.
static CRef< CSRSearch > MakeSRSearch(CRef< CDbIndex > index, TSeqPos d=0, TSeqPos dfuzz=0)
Definition: sr_search.cpp:46
CSeqVector –.
Definition: seq_vector.hpp:65
Sequence stream for reading FASTA formatted files.
Class used to abstract reading nucleotide sequences from various sources.
virtual CRef< TSeqData > next()=0
Extract the next sequence from the stream.
CDbIndex::TSeqNum TSeqNum
Forwarding declarations for convenience.
Definition: dbindex_sp.hpp:45
char data[12]
Definition: iconv.c:80
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:305
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
Definition: ncbiapp.cpp:1195
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
@ eBoolean
{'true', 't', 'false', 'f'}, case-insensitive
Definition: ncbiargs.hpp:590
@ eString
An arbitrary string.
Definition: ncbiargs.hpp:589
@ eInteger
Convertible into an integer number (int or Int8)
Definition: ncbiargs.hpp:592
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
NCBI_XOBJUTIL_EXPORT string GetTitle(const CBioseq_Handle &hnd, TGetTitleFlags flags=0)
Definition: seqtitle.cpp:106
TObjectType * GetNonNullPointer(void)
Get pointer value and throw a null pointer exception if pointer is null.
Definition: ncbiobj.hpp:968
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
TObjectType * GetPointerOrNull(void) THROWS_NONE
Get pointer value.
Definition: ncbiobj.hpp:986
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
IO_PREFIX::ofstream CNcbiOfstream
Portable alias for ofstream.
Definition: ncbistre.hpp:500
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
#define NcbiCout
Definition: ncbistre.hpp:543
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
exit(2)
int i
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
The Object manager core.
static const char * prefix[]
Definition: pcregrep.c:405
CRef< objects::CObjectManager > om
string MakeIndexName(const string &prefix, Uint4 vol)
void PrintResults(CNcbiOstream &ostream, const vector< string > &idmap, CDbIndex::TSeqNum qnum, const vector< CSRSearch::SResultData > &results, const string &idstr1, const string &idstr2="")
CSequenceIStream::TSeqData TSeqData
CSeqVector ExtractSeqVector(TSeqData &sd, bool noid, string &idstr)
USING_NCBI_SCOPE
USING_SCOPE(blastdbindex)
CSRSearch::ELevel l2
vector< CSRSearch::SResultData > * r2
CSRSearch::ELevel l1
vector< CSRSearch::SResultData > * r1
vector< SResultData > res
Definition: sr_search.hpp:149
Type containing the sequence itself along with the masking information.
CRef< objects::CSeq_entry > seq_entry_
Sequence data.
static string query
Modified on Tue Apr 23 07:38:09 2024 by modify_doxy.py rev. 669887