NCBI C++ ToolKit
bl2seq_unit_test.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: bl2seq_unit_test.cpp 91987 2020-12-17 15:27:07Z grichenk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Christiam Camacho
27  *
28  */
29 
30 /** @file blast_unit_test.cpp
31  * Unit tests for the CBl2Seq class
32  */
33 
34 #include <ncbi_pch.hpp>
35 #include <corelib/test_boost.hpp>
43 
44 #include <serial/serial.hpp>
45 #include <serial/iterator.hpp>
46 #include <serial/objostr.hpp>
47 
50 
57 #include <algo/blast/api/local_blast.hpp> // for CLocalBlast
58 #include <algo/blast/api/local_db_adapter.hpp> // for CLocalDbAdapter
59 #include <algo/blast/api/objmgr_query_data.hpp> // for CObjMgr_QueryFactory
63 
65 #include <objtools/simple/simple_om.hpp> // for CSimpleOM
66 #include <objtools/readers/fasta.hpp> // for CFastaReader
68 #include <objmgr/util/sequence.hpp>
69 
70 #include "test_objmgr.hpp"
71 
72 #include <util/random_gen.hpp>
73 
74 #include <corelib/test_boost.hpp>
75 
76 #ifndef SKIP_DOXYGEN_PROCESSING
77 
79 USING_SCOPE(blast);
81 
83 
85 {
86  CRef<CSeq_loc> retval(new CSeq_loc());
87  NON_CONST_ITERATE(vector<TSeqRange>, itr, range_vec)
88  {
89  retval->SetPacked_int().AddInterval(*id, (*itr).GetFrom(), (*itr).GetTo());
90  }
91  return retval;
92 }
93 
94 BOOST_AUTO_TEST_CASE(NucleotideMultipleSeqLocs1) {
95 
96  CRef<CSeq_id> id(new CSeq_id(CSeq_id::e_Gi, 224514841));
99 
100  vector<TSeqRange> range_vec;
101 
102  range_vec.push_back(TSeqRange(73011288, 73011591));
103  range_vec.push_back(TSeqRange(73080052, 73080223));
104  range_vec.push_back(TSeqRange(73096483, 73096589));
105  range_vec.push_back(TSeqRange(73097765, 73097864));
106  range_vec.push_back(TSeqRange(73113762, 73113809));
107  range_vec.push_back(TSeqRange(73119266, 73119340));
108  range_vec.push_back(TSeqRange(73168955, 73169141));
109  range_vec.push_back(TSeqRange(73178294, 73178376));
110  range_vec.push_back(TSeqRange(73220818, 73220920));
111  range_vec.push_back(TSeqRange(73223091, 73223365));
112 
113  CRef<CSeq_loc> temp_loc1 = s_MakePackedInt2(id, range_vec);
114 ofstream o1("temploc1.out.asn");
115 o1 << MSerial_AsnText << *temp_loc1 ;
116  query.push_back(SSeqLoc(temp_loc1, scope));
117 
118  TSeqLocVector subjects;
119  CRef<CSeq_id> sid1(new CSeq_id(CSeq_id::e_Gi, 262050671));
120  CRef<CSeq_loc> ssl1(new CSeq_loc());
121  ssl1->SetWhole(*sid1);
122  SSeqLoc subj_seqloc(ssl1, scope);
123  subjects.push_back(subj_seqloc);
124 
125  CBl2Seq bl2seq(query, subjects, eMegablast);
126  TSeqAlignVector sav(bl2seq.Run());
127  BOOST_CHECK_EQUAL(10, (int) sav.size());
128  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
129  BOOST_CHECK_EQUAL(1, (int)sar->GetSegs().GetDenseg().GetNumseg());
130  int num_ident = 0;
132  BOOST_CHECK_EQUAL((int)303, num_ident);
133  BOOST_CHECK_EQUAL((TSeqPos)73011288, sar->GetSeqStart(0));
134  BOOST_CHECK_EQUAL((TSeqPos)1, sar->GetSeqStart(1));
135 
136 #if 1
137 ofstream o("seqalign.out.asn");
138 ITERATE(TSeqAlignVector, v, sav) {
139 o << MSerial_AsnText << **v ;
140 }
141 o.close();
142 #endif
143 }
144 
145 
146 BOOST_AUTO_TEST_CASE(ProteinBlastInvalidSeqIdSelfHit)
147 {
148  CRef<CSeq_loc> loc(new CSeq_loc());
149  loc->SetWhole().SetGi(INVALID_GI);
150 
151  CRef<CScope> scope(new CScope(CTestObjMgr::Instance().GetObjMgr()));
152  scope->AddDefaults();
153  SSeqLoc query(loc, scope);
154 
155  TSeqLocVector subjects;
156  {
157  CRef<CSeq_loc> local_loc(new CSeq_loc());
158  local_loc->SetWhole().SetGi(INVALID_GI);
159 
160  CScope* local_scope = new CScope(CTestObjMgr::Instance().GetObjMgr());
161  local_scope->AddDefaults();
162  subjects.push_back(SSeqLoc(local_loc, local_scope));
163  }
164 
165  // BLAST by concatenating all queries
166  CBl2Seq blaster4all(query, subjects, eBlastp);
167  TSeqAlignVector sas_v;
168  BOOST_CHECK_THROW(sas_v = blaster4all.Run(), CBlastException);
169 }
170 
190 };
191 
192 /* The following functions are used to test the functionality to interrupt
193  * CBl2Seq runs */
194 
195 /// Returns true so that the processing stops upon the first invocation of this
196 /// callback
197 extern "C" Boolean interrupt_immediately(SBlastProgress* /*progress_info*/)
198 {
199  return TRUE;
200 }
201 
202 /// Returns false so that the processing never stops in spite of a callback
203 /// function to interrupt the process is provided
204 extern "C" Boolean do_not_interrupt(SBlastProgress* /*progress_info*/)
205 {
206  return FALSE;
207 }
208 
209 /// This callback never interrupts the BLAST search, its only purpose is to
210 /// count the number of times this is invoked for the given input. Also to be
211 /// used in CBl2SeqTest::testInterruptXExitAtRandom.
212 extern "C" Boolean callback_counter(SBlastProgress* progress_info)
213 {
214  int& counter = *reinterpret_cast<int*>(progress_info->user_data);
215  counter++;
216  return FALSE;
217 }
218 
219 /// This callback interrupts the BLAST search after the callback has been
220 /// executed the requested number of times in the pair's second member.
221 /// This is used in CBl2SeqTest::testInterruptXExitAtRandom.
222 extern "C" Boolean interrupt_at_random(SBlastProgress* progress_info)
223 {
224  pair<int, int>& progress_pair =
225  *reinterpret_cast< pair<int, int>* >(progress_info->user_data);
226 
227  if (++progress_pair.first == progress_pair.second) {
228  return TRUE;
229  } else {
230  return FALSE;
231  }
232 }
233 
234 /// The interruption occurs after 3 invokations of this callback
235 extern "C" Boolean interrupt_after3calls(SBlastProgress* /*progress_info*/)
236 {
237  static int num_calls = 0;
238  if (++num_calls < 3) {
239  return FALSE;
240  } else {
241  return TRUE;
242  }
243 }
244 
245 /// The interruption occurs after starting the traceback stage
247 {
248  if (progress_info->stage == eTracebackSearch) {
249  return TRUE;
250  } else {
251  return FALSE;
252  }
253 }
254 
255 void testRawCutoffs(CBl2Seq& blaster, EProgram program,
256  EBl2seqTest test_id)
257 {
258  BlastRawCutoffs* raw_cutoffs =
259  blaster.GetDiagnostics()->cutoffs;
260  int x_drop_ungapped;
261  int gap_trigger;
262 
263  if (program == eBlastn || program == eDiscMegablast) {
264  x_drop_ungapped = 22;
265  gap_trigger = 16;
266  } else if (program == eMegablast) {
267  x_drop_ungapped = 8;
268  gap_trigger = 8;
269  } else {
270  x_drop_ungapped = 16;
271  gap_trigger = 41;
272  }
273 
274  switch (test_id) {
275  case eBlastn_555_3090:
276  x_drop_ungapped = 22;
277  gap_trigger = 18;
278  break;
279  case eBlastn_multi_q:
280  x_drop_ungapped = 22;
281  gap_trigger = 18;
282  break;
283  case eMegablast_chrom_mrna:
284  x_drop_ungapped = 7;
285  gap_trigger = 7;
286  break;
288  x_drop_ungapped = 22;
289  gap_trigger = 20;
290  break;
291  case eBlastp_multi_q:
292  gap_trigger = 23;
293  break;
294  case eBlastp_multi_q_s:
295  gap_trigger = 19;
296  break;
298  gap_trigger = 19; break;
299  case eTblastn_129295_555:
300  gap_trigger = 23; break;
302  gap_trigger = 23; break;
304  gap_trigger = 23; break;
305  case eBlastx_555_129295:
306  gap_trigger = 19; break;
307  case eTblastn_oof:
308  gap_trigger = 43;
309  default:
310  break;
311  }
312 
313  switch (program) {
314  case eBlastn: case eDiscMegablast:
315  BOOST_CHECK_EQUAL(x_drop_ungapped,
316  raw_cutoffs->x_drop_ungapped);
317  BOOST_CHECK_EQUAL(33, raw_cutoffs->x_drop_gap);
318  // CC changed 08/07/08
319  //BOOST_CHECK_EQUAL(55, raw_cutoffs->x_drop_gap_final);
320  BOOST_CHECK_EQUAL(110, raw_cutoffs->x_drop_gap_final);
321  BOOST_CHECK_EQUAL(gap_trigger, raw_cutoffs->ungapped_cutoff);
322  break;
323  case eMegablast:
324  BOOST_CHECK_EQUAL(x_drop_ungapped,
325  raw_cutoffs->x_drop_ungapped);
326  BOOST_CHECK_EQUAL(13, raw_cutoffs->x_drop_gap);
327  // CC changed 08/07/08
328  //BOOST_CHECK_EQUAL(27, raw_cutoffs->x_drop_gap_final);
329  BOOST_CHECK_EQUAL(54, raw_cutoffs->x_drop_gap_final);
330  BOOST_CHECK_EQUAL(gap_trigger, raw_cutoffs->ungapped_cutoff);
331  break;
332  case eBlastp: case eBlastx: case eTblastn:
333  BOOST_CHECK_EQUAL(38, raw_cutoffs->x_drop_gap);
334  BOOST_CHECK_EQUAL(64, raw_cutoffs->x_drop_gap_final);
335  BOOST_CHECK_EQUAL(gap_trigger, raw_cutoffs->ungapped_cutoff);
336  /* No break intentional: next test is valid for all the above
337  programs */
338  case eTblastx:
339  BOOST_CHECK_EQUAL(x_drop_ungapped,
340  raw_cutoffs->x_drop_ungapped);
341  break;
342  default: break;
343  }
344 }
345 
346 void testResultAlignments(size_t num_queries,
347  size_t num_subjects,
348  TSeqAlignVector result_alnvec)
349 {
350  size_t num_total_alns = num_queries * num_subjects;
351 
352  // test the number of resulting alignments
353  BOOST_REQUIRE_EQUAL(result_alnvec.size(), num_total_alns);
354 
355  // test the correct ordering of resulting alignments
356  // (q1 s1 q1 s2 ... q2 s1 q2 s2 ...)
357 
358  CConstRef<CSeq_id> id_query, id_prev_query;
359  CConstRef<CSeq_id> id_subject;
360  vector< CConstRef<CSeq_id> > id_prev_subjects;
361  id_prev_subjects.resize(num_subjects);
362 
363  bool prev_query_available = false;
364  vector<bool> prev_subjects_available(num_subjects, false);
365 
366  /* DEBUG OUTPUT
367  cerr << "................................................" << endl;
368  for (size_t i = 0; i < result_alnvec.size(); i++)
369  cerr << "\n<" << i << ">\n"
370  << MSerial_AsnText << result_alnvec[i].GetObject() << endl;
371  cerr << "................................................" << endl;
372  ------------ */
373 
374  for (size_t i_query = 0; i_query < num_queries; i_query++)
375  {
376  prev_query_available = false;
377  for (size_t i_subject = 0; i_subject < num_subjects; i_subject++)
378  {
379  size_t i_lin_index = i_query * num_subjects + i_subject;
380  CRef<CSeq_align_set> aln_set = result_alnvec[i_lin_index];
381 
382  // test if the alignment set is available (even if empty)
383  BOOST_REQUIRE(aln_set.NotNull());
384 
385  // if the alignment set is not empty, take the first alignment
386  // and see if the ID's are in correct order
387  if (aln_set->Get().size() > 0)
388  {
389  CRef<CSeq_align> aln = aln_set->Get().front();
390  id_query.Reset(&(aln->GetSeq_id(0)));
391  id_subject.Reset(&(aln->GetSeq_id(1)));
392 
393  // check if the query id was the same
394  // for the previous subject
395  if (i_subject > 0 &&
396  prev_query_available)
397  {
398  BOOST_REQUIRE(
399  id_query->Match(
400  id_prev_query.GetObject()));
401  }
402 
403  // check if the subject id was the same
404  // on the same position for the previous query
405  if (i_query > 0 &&
406  prev_subjects_available[i_subject])
407  {
408  BOOST_REQUIRE(
409  id_subject->Match(
410  id_prev_subjects[i_subject].GetObject()));
411  }
412 
413  // update the entry in previous subjects vector
414  prev_subjects_available[i_subject] = true;
415  id_prev_subjects[i_subject] = id_subject;
416 
417  // update the previous query entry
418  prev_query_available = true;
419  id_prev_query = id_query;
420  }
421  }
422  }
423 }
424 
425 void testBlastHitCounts(CBl2Seq& blaster, EBl2seqTest test_id)
426 {
427  BlastUngappedStats* ungapped_stats =
428  blaster.GetDiagnostics()->ungapped_stat;
429  BlastGappedStats* gapped_stats =
430  blaster.GetDiagnostics()->gapped_stat;
431 
432  switch (test_id) {
434  BOOST_CHECK_EQUAL(314, (int)ungapped_stats->lookup_hits);
435  BOOST_CHECK_EQUAL(3, ungapped_stats->init_extends);
436  BOOST_CHECK_EQUAL(1, ungapped_stats->good_init_extends);
437  BOOST_CHECK_EQUAL(1, gapped_stats->extensions);
438  BOOST_CHECK_EQUAL(1, gapped_stats->good_extensions);
439  break;
440  case eBlastn_555_555:
441  BOOST_CHECK_EQUAL(157, (int)ungapped_stats->lookup_hits);
442  BOOST_CHECK_EQUAL(3, ungapped_stats->init_extends);
443  BOOST_CHECK_EQUAL(3, ungapped_stats->good_init_extends);
444  BOOST_CHECK_EQUAL(3, gapped_stats->extensions);
445  BOOST_CHECK_EQUAL(3, gapped_stats->good_extensions);
446  break;
447  case eMegablast_555_555:
448  BOOST_CHECK_EQUAL(30, (int)ungapped_stats->lookup_hits);
449  BOOST_CHECK_EQUAL(1, ungapped_stats->init_extends);
450  BOOST_CHECK_EQUAL(1, ungapped_stats->good_init_extends);
451  BOOST_CHECK_EQUAL(1, gapped_stats->extensions);
452  BOOST_CHECK_EQUAL(1, gapped_stats->good_extensions);
453  break;
455  BOOST_CHECK_EQUAL(582, (int)ungapped_stats->lookup_hits);
456  // CC changed 08/07/08
457  //BOOST_CHECK_EQUAL(32, ungapped_stats->init_extends);
458  BOOST_CHECK_EQUAL(1, ungapped_stats->init_extends);
459  // CC changed 08/07/08
460  //BOOST_CHECK_EQUAL(32, ungapped_stats->good_init_extends);
461  BOOST_CHECK_EQUAL(1, ungapped_stats->good_init_extends);
462  BOOST_CHECK_EQUAL(1, gapped_stats->extensions);
463  BOOST_CHECK_EQUAL(1, gapped_stats->good_extensions);
464  break;
465  case eBlastx_555_129295:
466  BOOST_CHECK_EQUAL(282, (int)ungapped_stats->lookup_hits);
467  BOOST_CHECK_EQUAL(3, ungapped_stats->init_extends);
468  BOOST_CHECK_EQUAL(1, ungapped_stats->good_init_extends);
469  BOOST_CHECK_EQUAL(1, gapped_stats->extensions);
470  BOOST_CHECK_EQUAL(1, gapped_stats->good_extensions);
471  break;
472  case eTblastn_129295_555:
473  BOOST_CHECK_EQUAL(157, (int)ungapped_stats->lookup_hits);
474  BOOST_CHECK_EQUAL(1, ungapped_stats->init_extends);
475  BOOST_CHECK_EQUAL(1, ungapped_stats->good_init_extends);
476  BOOST_CHECK_EQUAL(1, gapped_stats->extensions);
477  BOOST_CHECK_EQUAL(1, gapped_stats->good_extensions);
478  break;
480  BOOST_CHECK_EQUAL(5, (int)ungapped_stats->lookup_hits);
481  BOOST_CHECK_EQUAL(4, ungapped_stats->init_extends);
482  BOOST_CHECK_EQUAL(1, ungapped_stats->good_init_extends);
483  BOOST_CHECK_EQUAL(1, gapped_stats->extensions);
484  BOOST_CHECK_EQUAL(1, gapped_stats->good_extensions);
485  break;
486  case eTblastx_555_555:
487  BOOST_CHECK_EQUAL(2590, (int)ungapped_stats->lookup_hits);
488  BOOST_CHECK_EQUAL(57, ungapped_stats->init_extends);
489  BOOST_CHECK_EQUAL(41, ungapped_stats->good_init_extends);
490  break;
491  case eTblastx_many_hits:
492  BOOST_CHECK_EQUAL(18587, (int)ungapped_stats->lookup_hits);
493  BOOST_CHECK_EQUAL(348, ungapped_stats->init_extends);
494  BOOST_CHECK_EQUAL(66, ungapped_stats->good_init_extends);
495  break;
497  BOOST_CHECK_EQUAL(210, (int)ungapped_stats->lookup_hits);
498  BOOST_CHECK_EQUAL(10, ungapped_stats->init_extends);
499  BOOST_CHECK_EQUAL(3, ungapped_stats->good_init_extends);
500  BOOST_CHECK_EQUAL(3, gapped_stats->extensions);
501  BOOST_CHECK_EQUAL(3, gapped_stats->good_extensions);
502  break;
503  case eBlastn_555_3090:
504  BOOST_CHECK_EQUAL(15, (int)ungapped_stats->lookup_hits);
505  BOOST_CHECK_EQUAL(2, ungapped_stats->init_extends);
506  BOOST_CHECK_EQUAL(2, ungapped_stats->good_init_extends);
507  BOOST_CHECK_EQUAL(2, gapped_stats->extensions);
508  BOOST_CHECK_EQUAL(2, gapped_stats->good_extensions);
509  break;
510  case eBlastp_multi_q:
511  BOOST_CHECK_EQUAL(2129, (int)ungapped_stats->lookup_hits);
512  BOOST_CHECK_EQUAL(76, ungapped_stats->init_extends);
513  BOOST_CHECK_EQUAL(14, ungapped_stats->good_init_extends);
514  BOOST_CHECK_EQUAL(8, gapped_stats->extensions);
515  BOOST_CHECK_EQUAL(8, gapped_stats->good_extensions);
516  break;
517  case eBlastn_multi_q:
518  BOOST_CHECK_EQUAL(963, (int)ungapped_stats->lookup_hits);
519  BOOST_CHECK_EQUAL(13, ungapped_stats->init_extends);
520  BOOST_CHECK_EQUAL(13, ungapped_stats->good_init_extends);
521  BOOST_CHECK_EQUAL(11, gapped_stats->extensions);
522  BOOST_CHECK_EQUAL(11, gapped_stats->good_extensions);
523  break;
524  case eBlastp_multi_q_s:
525 #if 0
526  // The following 2 numbers are different in Release and Debug modes
527  // due to a minor discrepancy in locations masked by seg filtering.
528  // The latter is due to a tiny difference in values involved in
529  // a comparison of real numbers inside the seg algorithm.
530  // In Debug mode:
531  BOOST_CHECK_EQUAL(3579, (int)ungapped_stats->lookup_hits);
532  BOOST_CHECK_EQUAL(138, ungapped_stats->init_extends);
533  // In Release mode:
534  BOOST_CHECK_EQUAL(3580, (int)ungapped_stats->lookup_hits);
535  BOOST_CHECK_EQUAL(140, ungapped_stats->init_extends);
536 #endif
537  // Note: Seg is not enabled for this case anymore
538  // (changed blastp defaults)
539  BOOST_CHECK_EQUAL(3939, (int)ungapped_stats->lookup_hits);
540  BOOST_CHECK_EQUAL(159, ungapped_stats->init_extends);
541  BOOST_CHECK_EQUAL(63, ungapped_stats->good_init_extends);
542  BOOST_CHECK_EQUAL(27, gapped_stats->extensions);
543  BOOST_CHECK_EQUAL(24, gapped_stats->good_extensions);
544  break;
545  case eTblastn_oof:
546  BOOST_CHECK_EQUAL(2666, (int)ungapped_stats->lookup_hits);
547  BOOST_CHECK_EQUAL(50, ungapped_stats->init_extends);
548  BOOST_CHECK_EQUAL(4, ungapped_stats->good_init_extends);
549  BOOST_CHECK_EQUAL(2, gapped_stats->extensions);
550  BOOST_CHECK_EQUAL(2, gapped_stats->good_extensions);
551  break;
552  case eBlastx_oof:
553  BOOST_CHECK_EQUAL(5950, (int)ungapped_stats->lookup_hits);
554  BOOST_CHECK_EQUAL(155, ungapped_stats->init_extends);
555  BOOST_CHECK_EQUAL(6, ungapped_stats->good_init_extends);
556  BOOST_CHECK_EQUAL(2, gapped_stats->extensions);
557  BOOST_CHECK_EQUAL(2, gapped_stats->good_extensions);
558  break;
560  BOOST_CHECK_EQUAL(108, (int)ungapped_stats->lookup_hits);
561  // CC changed 08/07/08
562  //BOOST_CHECK_EQUAL(15, ungapped_stats->init_extends);
563  //BOOST_CHECK_EQUAL(15, ungapped_stats->good_init_extends);
564  BOOST_CHECK_EQUAL(3, ungapped_stats->init_extends);
565  BOOST_CHECK_EQUAL(3, ungapped_stats->good_init_extends);
566  BOOST_CHECK_EQUAL(3, gapped_stats->extensions);
567  BOOST_CHECK_EQUAL(3, gapped_stats->good_extensions);
568  break;
570  BOOST_CHECK_EQUAL(14, (int)ungapped_stats->lookup_hits);
571  BOOST_CHECK_EQUAL(1, ungapped_stats->init_extends);
572  BOOST_CHECK_EQUAL(1, ungapped_stats->good_init_extends);
573  BOOST_CHECK_EQUAL(1, gapped_stats->extensions);
574  BOOST_CHECK_EQUAL(1, gapped_stats->good_extensions);
575  break;
576  default: break;
577  }
578 }
579 
580 BOOST_AUTO_TEST_CASE(ProteinBlastSelfHit)
581 {
582  //const int kSeqLength = 232;
583  CSeq_id id("gi|129295");
584  unique_ptr<SSeqLoc> sl(CTestObjMgr::Instance().CreateSSeqLoc(id));
585 
586  CBl2Seq blaster(*sl, *sl, eBlastp);
587  TSeqAlignVector sav(blaster.Run());
588  BOOST_REQUIRE(sav[0].NotEmpty());
589  BOOST_REQUIRE( !sav[0]->IsEmpty() );
590  BOOST_REQUIRE(sav[0]->Get().begin()->NotEmpty());
591  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
592  BOOST_CHECK_EQUAL(1, (int)sar->GetSegs().GetDenseg().GetNumseg());
595 
596  // the number of identities is NOT calculated when composition based
597  // statistics is turned on (default for blastp)
598  int num_ident = 0;
600 #if 0
601  ofstream o("0.asn");
602  o << MSerial_AsnText << *sar ;
603  o.close();
604 #endif
605  BOOST_CHECK_EQUAL(232, num_ident);
606 
607  // calculate the number of identities using the BLAST formatter
608 /*
609  double percent_identity =
610  CBlastFormatUtil::GetPercentIdentity(*sar, *sl->scope, false);
611  BOOST_CHECK_EQUAL(1, (int) percent_identity);
612 */
613 
614  // Check the ancillary results
615  CSearchResultSet::TAncillaryVector ancillary_data;
616  blaster.GetAncillaryResults(ancillary_data);
617  BOOST_CHECK_EQUAL((size_t)1, ancillary_data.size());
618  BOOST_CHECK( ancillary_data.front()->GetGappedKarlinBlk() != NULL );
619  BOOST_CHECK( ancillary_data.front()->GetUngappedKarlinBlk() != NULL );
620  BOOST_CHECK( ancillary_data.front()->GetSearchSpace() != (Int8)0 );
621 
622 }
623 
624 BOOST_AUTO_TEST_CASE(TBlastn2Seqs)
625 {
626  CSeq_id qid("gi|129295");
627  unique_ptr<SSeqLoc> query(CTestObjMgr::Instance().CreateSSeqLoc(qid));
628 
629  CSeq_id sid("gi|555");
630  unique_ptr<SSeqLoc> subj(
631  CTestObjMgr::Instance().CreateSSeqLoc(sid, eNa_strand_both));
632 
633  CBl2Seq blaster(*query, *subj, eTblastn);
634  TSeqAlignVector sav(blaster.Run());
635  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
636 
637 #if 0
638  ofstream o("1.asn");
639  o << MSerial_AsnText << *sar ;
640  o.close();
641 #endif
642 
643  BOOST_CHECK_EQUAL(1, (int)sar->GetSegs().GetStd().size());
646 
647  int score = 0, comp_adj = 0;
650  BOOST_CHECK_EQUAL(26, score);
651  BOOST_CHECK_EQUAL(2, comp_adj);
652 
653  // Check the ancillary results
654  CSearchResultSet::TAncillaryVector ancillary_data;
655  blaster.GetAncillaryResults(ancillary_data);
656  BOOST_CHECK_EQUAL((size_t)1, ancillary_data.size());
657  BOOST_REQUIRE( ancillary_data.front().NotEmpty() );
658  BOOST_CHECK( ancillary_data.front()->GetGappedKarlinBlk() != NULL );
659  BOOST_CHECK( ancillary_data.front()->GetUngappedKarlinBlk() != NULL );
660  BOOST_CHECK( ancillary_data.front()->GetSearchSpace() != (Int8)0 );
661 }
662 
663 BOOST_AUTO_TEST_CASE(TBlastn2SeqsRevStrand1)
664 {
665  CSeq_id qid("gi|1945390");
666  unique_ptr<SSeqLoc> query(CTestObjMgr::Instance().CreateSSeqLoc(qid));
667 
668  pair<TSeqPos, TSeqPos> range(150000, 170000);
669  CSeq_id sid("gi|4755212");
670  unique_ptr<SSeqLoc> subj(CTestObjMgr::Instance().CreateSSeqLoc(sid, range, eNa_strand_minus));
671 
672  CBl2Seq blaster(*query, *subj, eTblastn);
673  TSeqAlignVector sav(blaster.Run());
674  BOOST_CHECK_EQUAL(11, (int) sav[0]->Get().size());
675  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
676  BOOST_CHECK_EQUAL(1, (int)sar->GetSegs().GetStd().size());
677  vector < CRef< CSeq_loc > > locs = sar->GetSegs().GetStd().front()->GetLoc();
678  BOOST_CHECK_EQUAL((int)eNa_strand_minus, (int)(locs[1])->GetStrand());
679  int num_ident = 0;
681  BOOST_CHECK_EQUAL(161, num_ident);
682 #if 0
683 ofstream o("minus1.new.asn");
684 ITERATE(TSeqAlignVector, v, sav) {
685 o << MSerial_AsnText << **v ;
686 }
687 o.close();
688 #endif
689 }
690 
691 BOOST_AUTO_TEST_CASE(TBlastn2SeqsRevStrand2)
692 {
693  CSeq_id qid("gi|1945390");
694  unique_ptr<SSeqLoc> query(CTestObjMgr::Instance().CreateSSeqLoc(qid));
695 
696  CSeq_id sid("gi|1945388");
697  unique_ptr<SSeqLoc> subj(CTestObjMgr::Instance().CreateSSeqLoc(sid, eNa_strand_minus));
698 
699  CBl2Seq blaster(*query, *subj, eTblastn);
700  TSeqAlignVector sav(blaster.Run());
701  BOOST_CHECK_EQUAL(1, (int) sav[0]->Get().size());
702  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
703  BOOST_CHECK_EQUAL(1, (int)sar->GetSegs().GetStd().size());
704  vector < CRef< CSeq_loc > > locs = sar->GetSegs().GetStd().front()->GetLoc();
705  BOOST_CHECK_EQUAL((int)eNa_strand_minus, (int)(locs[1])->GetStrand());
706  int num_ident = 0;
708  BOOST_CHECK_EQUAL(11, num_ident);
709 #if 0
710 ofstream o("minus2.asn");
711 o << MSerial_AsnText << *sar ;
712 o.close();
713 #endif
714 }
715 
716 
717 BOOST_AUTO_TEST_CASE(TBlastn2SeqsCompBasedStats)
718 {
719  CSeq_id qid("gi|68737"); // "pir|A01243|DXCH"
720  unique_ptr<SSeqLoc> query(CTestObjMgr::Instance().CreateSSeqLoc(qid));
721 
722  CSeq_id sid("gi|118086484");
723  unique_ptr<SSeqLoc> subj(
724  CTestObjMgr::Instance().CreateSSeqLoc(sid, eNa_strand_both));
725 
728 
729  CBl2Seq blaster(*query, *subj, *opts);
730  TSeqAlignVector sav(blaster.Run());
731  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
732  BOOST_CHECK_EQUAL(1, (int)sar->GetSegs().GetStd().size());
733 
734  int num_ident = 0;
736  BOOST_CHECK_EQUAL(229, num_ident);
737 #if 0
738 ofstream o("2.asn");
739 o << MSerial_AsnText << *sar ;
740 o.close();
741 #endif
742 
743  // Check the ancillary results
744  CSearchResultSet::TAncillaryVector ancillary_data;
745  blaster.GetAncillaryResults(ancillary_data);
746  BOOST_CHECK_EQUAL((size_t)1, ancillary_data.size());
747  BOOST_CHECK( ancillary_data.front()->GetGappedKarlinBlk() != NULL );
748  BOOST_CHECK( ancillary_data.front()->GetUngappedKarlinBlk() != NULL );
749  BOOST_CHECK( ancillary_data.front()->GetSearchSpace() != (Int8)0 );
750 }
751 
752 BOOST_AUTO_TEST_CASE(TBlastn2SeqsLargeWord)
753 {
754  CSeq_id qid("gi|129295");
755  unique_ptr<SSeqLoc> query(CTestObjMgr::Instance().CreateSSeqLoc(qid));
756 
757  CSeq_id sid("gi|555");
758  unique_ptr<SSeqLoc> subj(
759  CTestObjMgr::Instance().CreateSSeqLoc(sid, eNa_strand_both));
760 
762  opts->SetOptions().SetWordSize(6);
764  opts->SetOptions().SetWordThreshold(21.69);
765  opts->SetOptions().SetWindowSize(0);
767 
768  CBl2Seq blaster(*query, *subj, *opts);
769  TSeqAlignVector sav(blaster.Run());
770  BOOST_CHECK_EQUAL(1, (int)sav[0]->Size());
773 
774  int num_ident = 0;
775  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
777 #if 0
778 ofstream o("3.asn");
779 o << MSerial_AsnText << *sar ;
780 o.close();
781 #endif
782  BOOST_CHECK_EQUAL(5, num_ident);
783 }
784 
785 BOOST_AUTO_TEST_CASE(IdenticalProteins)
786 {
787  //const int kSeqLength = 377;
788  CSeq_id qid("gi|34810917");
789  unique_ptr<SSeqLoc> query(CTestObjMgr::Instance().CreateSSeqLoc(qid));
790  CSeq_id sid("gi|34810916");
791  unique_ptr<SSeqLoc> subj(CTestObjMgr::Instance().CreateSSeqLoc(sid));
792 
793  CBl2Seq blaster(*query, *subj, eBlastp);
794  TSeqAlignVector sav(blaster.Run());
795  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
796  BOOST_CHECK_EQUAL(1, (int)sar->GetSegs().GetDenseg().GetNumseg());
797 
798  // the number of identities is NOT calculated when composition based
799  // statistics is turned on (default for blastp)
800  int num_ident = 0;
802 #if 0
803  ofstream o("4.asn");
804  o << MSerial_AsnText << *sar ;
805  o.close();
806 #endif
807  BOOST_CHECK_EQUAL(377, num_ident);
808 
809  // calculate the number of identities using the BLAST formatter
810 /*
811  double percent_identity =
812  CBlastFormatUtil::GetPercentIdentity(*sar, *query->scope, false);
813  BOOST_CHECK_EQUAL(1, (int) percent_identity);
814 */
815 
816  // Check the ancillary results
817  CSearchResultSet::TAncillaryVector ancillary_data;
818  blaster.GetAncillaryResults(ancillary_data);
819  BOOST_CHECK_EQUAL((size_t)1, ancillary_data.size());
820  BOOST_CHECK( ancillary_data.front()->GetGappedKarlinBlk() != NULL );
821  BOOST_CHECK( ancillary_data.front()->GetUngappedKarlinBlk() != NULL );
822  BOOST_CHECK( ancillary_data.front()->GetSearchSpace() != (Int8)0 );
823 }
824 
825 BOOST_AUTO_TEST_CASE(UnsupportedOption) {
826  CDiscNucleotideOptionsHandle opts_handle;
827  BOOST_REQUIRE_THROW(opts_handle.SetTraditionalBlastnDefaults(),
829 }
830 
831 BOOST_AUTO_TEST_CASE(PositiveMismatchOption) {
832  CSeq_id qid("gi|408478"); // zebrafish sequence U02544
833  CSeq_id sid("gi|1546012"); // mouse sequence U61969
834 
835  unique_ptr<SSeqLoc> query(
836  CTestObjMgr::Instance().CreateSSeqLoc(qid, eNa_strand_both));
837  unique_ptr<SSeqLoc> subj(CTestObjMgr::Instance().CreateSSeqLoc(sid));
838 
839  const int kMatch = 2;
840  const int kMismatch = 5; // Positive mismatch not allowed.
841 
842  CBlastNucleotideOptionsHandle nucl_options_handle;
843 
844  nucl_options_handle.SetMatchReward(kMatch);
845  nucl_options_handle.SetMismatchPenalty(kMismatch);
846  CBl2Seq blaster(*query, *subj, nucl_options_handle);
847  try {
848  TSeqAlignVector sav(blaster.Run());
849  } catch (const CException& e) {
850  BOOST_REQUIRE(
851  !strcmp("BLASTN penalty must be negative",
852  e.GetMsg().c_str()));
853  }
854 }
855 
856 // If a single query is fully masked, throw an exception (i.e.: an error)
857 BOOST_AUTO_TEST_CASE(FullyMaskedSequence) {
858  CSeq_id qid("ref|NT_024524.13");
859  pair<TSeqPos, TSeqPos> range(27886902, 27886932);
860  unique_ptr<SSeqLoc> query(
861  CTestObjMgr::Instance().CreateSSeqLoc(qid, range,
862  eNa_strand_plus));
863  range.first = 2052;
864  range.second = 2082;
865  CSeq_id sid("emb|BX641126.1");
866  unique_ptr<SSeqLoc> subj(
867  CTestObjMgr::Instance().CreateSSeqLoc(sid, range,
870  options->SetTraditionalBlastnDefaults();
871  options->SetMismatchPenalty(-1);
872  options->SetMatchReward(1);
873  options->SetGapXDropoff(100);
874  options->SetMaskAtHash(false);
875  CBl2Seq blaster(*query, *subj, *options);
876  CRef<CSearchResultSet> results;
877  BOOST_REQUIRE_NO_THROW(results = blaster.RunEx());
878 }
879 
880 // In the case of multiple queries and one being masked, only emit a warning
881 // N.B.: this doesn't test the MT case as MT unit tests aren't supported
882 BOOST_AUTO_TEST_CASE(MultipleQueries1FullyMasked) {
883  const bool kHasProteinQuery(true);
884  CBlastInputSourceConfig iconfig(kHasProteinQuery);
885  const CSearchDatabase dbinfo("data/nt.41646578", CSearchDatabase::eBlastDbIsNucleotide);
887 
888  CNcbiIfstream infile("data/mult_queries_1fully_masked.fsa");
889  CRef<CScope> scope = CBlastScopeSource(kHasProteinQuery).NewScope();
891  (new CBlastFastaInputSource(infile, iconfig));
892  CRef<CBlastInput> input(new CBlastInput(&*fasta_src));
893  TSeqLocVector query_vec = input->GetAllSeqLocs(*scope);
894 
895  // The first iteration runs the search *without* splitting, the second one
896  // forces to use query splitting
897  for (int env = 0; env < 2; env++) {
898 
899  CRef<IQueryFactory> queries(new CObjMgr_QueryFactory(query_vec));
900  unique_ptr<CAutoEnvironmentVariable> envvar1, envvar2;
901  if (env) {
902  envvar1.reset(new CAutoEnvironmentVariable("OVERLAP_CHUNK_SIZE", NStr::SizetToString(10)));
903  envvar2.reset(new CAutoEnvironmentVariable("CHUNK_SIZE", NStr::SizetToString(100)));
904  }
905  CLocalBlast blaster(queries, opts_handle, dbinfo);
906  CRef<CSearchResultSet> results;
907  BOOST_REQUIRE_NO_THROW(results = blaster.Run());
908  BOOST_REQUIRE_EQUAL(3U, results->size());
909  for (size_t query_idx = 0; query_idx < results->size(); query_idx++) {
910  const string& warnings((*results)[query_idx].GetWarningStrings());
911  const string& errors((*results)[query_idx].GetErrorStrings());
912 
913  CNcbiOstrstream oss;
914  if (env) {
915  oss << "Forced splitting of queries: ";
916  } else {
917  oss << "No splitting of queries: ";
918  }
919 
920  if (query_idx == 0 || query_idx == 2) {
921  oss << " expected no warnings/errors, got errors='" << errors << "'; "
922  << "warnings='" << warnings << "'";
923  const string msg = CNcbiOstrstreamToString(oss);
924  BOOST_CHECK_MESSAGE(kEmptyStr == warnings, msg);
925  BOOST_CHECK_MESSAGE(kEmptyStr == errors, msg);
926  } else {
927  oss << " expected warnings to match '" << kBlastErrMsg_CantCalculateUngappedKAParams
928  << "; instead got '" << warnings << "'";
929  const string msg = CNcbiOstrstreamToString(oss);
930  BOOST_CHECK_MESSAGE(warnings.find(kBlastErrMsg_CantCalculateUngappedKAParams) != NPOS, msg);
931  }
932  }
933  }
934 }
935 
936 BOOST_AUTO_TEST_CASE(testInterruptBlastpExitImmediately) {
937  CSeq_id id("gi|129295");
938  unique_ptr<SSeqLoc> sl(CTestObjMgr::Instance().CreateSSeqLoc(id));
939 
940  CBl2Seq blaster(*sl, *sl, eBlastp);
941  TInterruptFnPtr fnptr =
943  BOOST_REQUIRE(fnptr == NULL);
944 
945  TSeqAlignVector sav;
946  try { sav = blaster.Run(); }
947  catch (...) {
948  BOOST_REQUIRE_EQUAL((size_t)0, sav.size());
949  }
950 }
951 
952 // SB-814
953 BOOST_AUTO_TEST_CASE(Tblastn2Seqs_PlusStrandOnly) {
954  // We expect only 1 hit between these two sequences in the plus strand
955  TSeqLocVector query_vec, subj_vec;
956  CSeq_id qid(CSeq_id::e_Gi, 7019569);
957  unique_ptr<SSeqLoc> ql(CTestObjMgr::Instance().CreateSSeqLoc(qid));
958  query_vec.push_back(*ql);
959  CRef<IQueryFactory> queries(new CObjMgr_QueryFactory(query_vec));
960 
961  const ENa_strand target_strand = eNa_strand_plus;
962  CSeq_id sid(CSeq_id::e_Gi, 17865806);
963  unique_ptr<SSeqLoc> sl(CTestObjMgr::Instance().CreateSSeqLoc(sid,
964  target_strand));
965  subj_vec.push_back(*sl);
966  CRef<IQueryFactory> subjects(new CObjMgr_QueryFactory(subj_vec));
967 
969  // N.B.: this setting is not applicable, instead the strand should be set
970  // in the subject seqlocs. FIXME: add a comment to this effect to
971  // SetStrandOption.
972  // N.B.2: BLAST doesn't support setting the subject strand for a database
973  // search
974  //tblastn_opts->SetOptions().SetStrandOption(target_strand);
975  CRef<CLocalDbAdapter> db_adapter(new CLocalDbAdapter(subjects, tblastn_opts));
976 
977  CLocalBlast blaster(queries, tblastn_opts, db_adapter);
978 
979  CRef<CSearchResultSet> results = blaster.Run();
980  BOOST_REQUIRE_EQUAL(eSequenceComparison, results->GetResultType());
981  BOOST_CHECK_EQUAL((size_t)1, results->GetNumResults());
982  const CSearchResults& result = (*results)[0];
983  BOOST_CHECK(result.HasAlignments());
984  CConstRef<CSeq_align_set> alignment = result.GetSeqAlign();
985  BOOST_CHECK_EQUAL((size_t)1, alignment->Size());
986  ITERATE(CSeq_align_set::Tdata, aln, alignment->Get()) {
987  // check the query's strand (protein)
988  BOOST_CHECK_EQUAL(static_cast<int>(eNa_strand_unknown), static_cast<int>((*aln)->GetSeqStrand(0)));
989  // check the subject's strand (nucleotide)
990  BOOST_CHECK_EQUAL(static_cast<int>(eNa_strand_plus), static_cast<int>((*aln)->GetSeqStrand(1)));
991  }
992 }
993 
994 // SB-386
995 BOOST_AUTO_TEST_CASE_TIMEOUT(testInterruptBlastSetup, 3);
996 BOOST_AUTO_TEST_CASE(testInterruptBlastSetup) {
997  CSeq_id id("NC_000002.11");
998  unique_ptr<SSeqLoc> sl(CTestObjMgr::Instance().CreateSSeqLoc(id));
999 
1000  CBl2Seq blaster(*sl, *sl, eBlastn);
1001  TInterruptFnPtr fnptr =
1003  BOOST_REQUIRE(fnptr == NULL);
1004 
1005  TSeqAlignVector sav;
1006  try { sav = blaster.Run(); }
1007  catch (...) {
1008  BOOST_REQUIRE_EQUAL((size_t)0, sav.size());
1009  }
1010 }
1011 
1012 BOOST_AUTO_TEST_CASE(testInterruptBlastnExitImmediately) {
1013  CSeq_id id("gi|555");
1014  unique_ptr<SSeqLoc> sl(CTestObjMgr::Instance().CreateSSeqLoc(id));
1015 
1016  CBl2Seq blaster(*sl, *sl, eBlastn);
1017  TInterruptFnPtr fnptr =
1019  BOOST_REQUIRE(fnptr == NULL);
1020 
1021  TSeqAlignVector sav;
1022  try { sav = blaster.Run(); }
1023  catch (...) {
1024  BOOST_REQUIRE_EQUAL((size_t)0, sav.size());
1025  }
1026 }
1027 
1028 BOOST_AUTO_TEST_CASE(testInterruptBlastxExitImmediately) {
1029  CSeq_id query_id("gi|555");
1030  unique_ptr<SSeqLoc> slq(CTestObjMgr::Instance().CreateSSeqLoc(query_id));
1031  CSeq_id subj_id("gi|129295");
1032  unique_ptr<SSeqLoc> sls(CTestObjMgr::Instance().CreateSSeqLoc(subj_id));
1033 
1034  CBl2Seq blaster(*slq, *sls, eBlastx);
1035  TInterruptFnPtr fnptr =
1037  BOOST_REQUIRE(fnptr == NULL);
1038 
1039  TSeqAlignVector sav;
1040  try { sav = blaster.Run(); }
1041  catch (...) {
1042  BOOST_REQUIRE_EQUAL((size_t)0, sav.size());
1043  }
1044 }
1045 
1046 BOOST_AUTO_TEST_CASE(testInterruptTblastxExitImmediately) {
1047  CSeq_id query_id("gi|555");
1048  unique_ptr<SSeqLoc> slq(CTestObjMgr::Instance().CreateSSeqLoc(query_id));
1049  CSeq_id subj_id("gi|555");
1050  unique_ptr<SSeqLoc> sls(CTestObjMgr::Instance().CreateSSeqLoc(subj_id));
1051 
1052  CBl2Seq blaster(*slq, *sls, eTblastx);
1053  TInterruptFnPtr fnptr =
1055  BOOST_REQUIRE(fnptr == NULL);
1056 
1057  TSeqAlignVector sav;
1058  try { sav = blaster.Run(); }
1059  catch (...) {
1060  BOOST_REQUIRE_EQUAL((size_t)0, sav.size());
1061  }
1062 }
1063 
1064 BOOST_AUTO_TEST_CASE(testInterruptTblastnExitImmediately) {
1065  CSeq_id query_id("gi|129295");
1066  unique_ptr<SSeqLoc> slq(CTestObjMgr::Instance().CreateSSeqLoc(query_id));
1067  CSeq_id subj_id("gi|555");
1068  unique_ptr<SSeqLoc> sls(CTestObjMgr::Instance().CreateSSeqLoc(subj_id));
1069 
1070  CBl2Seq blaster(*slq, *sls, eTblastn);
1071  TInterruptFnPtr fnptr =
1073  BOOST_REQUIRE(fnptr == NULL);
1074 
1075  TSeqAlignVector sav;
1076  try { sav = blaster.Run(); }
1077  catch (...) {
1078  BOOST_REQUIRE_EQUAL((size_t)0, sav.size());
1079  }
1080 }
1081 
1082 #define ARRAY_SIZE(a) (sizeof(a)/sizeof(*a))
1083 static
1085  bool subj_is_nucl,
1086  EProgram program) {
1087 
1088  TIntId protein_gis[] = { 6, 129295, 15606659, 4336138, 5556 };
1089  TIntId nucl_gis[] = { 272208, 272217, 272211, 272247, 272227, 272236,
1090  272219 };
1091 
1092  vector<TIntId> q_gis, s_gis;
1093  if (query_is_nucl) {
1094  copy(&nucl_gis[0],
1095  &nucl_gis[ARRAY_SIZE(nucl_gis)],
1096  back_inserter(q_gis));
1097  } else {
1098  copy(&protein_gis[0],
1099  &protein_gis[ARRAY_SIZE(protein_gis)],
1100  back_inserter(q_gis));
1101  }
1102 
1103  if (subj_is_nucl) {
1104  copy(&nucl_gis[0],
1105  &nucl_gis[ARRAY_SIZE(nucl_gis)],
1106  back_inserter(s_gis));
1107  } else {
1108  copy(&protein_gis[0],
1109  &protein_gis[ARRAY_SIZE(protein_gis)],
1110  back_inserter(s_gis));
1111  }
1112 
1113 
1114  TSeqLocVector queries;
1115  ITERATE(vector<TIntId>, itr, q_gis) {
1116  CRef<CSeq_loc> loc(new CSeq_loc());
1117  loc->SetWhole().SetGi(GI_FROM(TIntId, *itr));
1118 
1119  CScope* scope = new CScope(CTestObjMgr::Instance().GetObjMgr());
1120  scope->AddDefaults();
1121  queries.push_back(SSeqLoc(loc, scope));
1122  }
1123 
1124  TSeqLocVector subjects;
1125  ITERATE(vector<TIntId>, itr, s_gis) {
1126  CRef<CSeq_loc> loc(new CSeq_loc());
1127  loc->SetWhole().SetGi(GI_FROM(TIntId, *itr));
1128 
1129  CScope* scope = new CScope(CTestObjMgr::Instance().GetObjMgr());
1130  scope->AddDefaults();
1131  subjects.push_back(SSeqLoc(loc, scope));
1132  }
1133 
1134  return CRef<CBl2Seq>(new CBl2Seq(queries, subjects, program));
1135 }
1136 
1137 BOOST_AUTO_TEST_CASE(testInterruptBlastpExitAtRandom) {
1138 
1140  false,
1141  eBlastp);
1142 
1143  int num_callbacks_executed(0);
1144  TInterruptFnPtr fnptr =
1145  blaster->SetInterruptCallback(callback_counter,
1146  (void*) &num_callbacks_executed);
1147  BOOST_REQUIRE(fnptr == NULL);
1148 
1149  TSeqAlignVector sav(blaster->Run()); // won't throw
1150  CRandom r((CRandom::TValue)time(0));
1151  int max_interrupt_callbacks = r.GetRand(1, num_callbacks_executed);
1152  pair<int, int> progress_pair(make_pair(0, max_interrupt_callbacks));
1153 
1154  fnptr = blaster->SetInterruptCallback(interrupt_at_random,
1155  (void*)&progress_pair);
1156  BOOST_REQUIRE(fnptr == callback_counter);
1157  sav.clear();
1158 
1159  try { sav = blaster->Run(); }
1160  catch (...) {
1161  BOOST_REQUIRE_EQUAL((size_t)0, sav.size());
1162  }
1163 }
1164 
1165 BOOST_AUTO_TEST_CASE(testInterruptBlastnExitAtRandom) {
1166 
1167  CRef<CBl2Seq> blaster =
1169 
1170  int num_callbacks_executed(0);
1171  TInterruptFnPtr fnptr =
1172  blaster->SetInterruptCallback(callback_counter,
1173  (void*)&num_callbacks_executed);
1174  BOOST_REQUIRE(fnptr == NULL);
1175 
1176  TSeqAlignVector sav(blaster->Run()); // won't throw
1177  CRandom r((CRandom::TValue)time(0));
1178  int max_interrupt_callbacks = r.GetRand(1, num_callbacks_executed);
1179  pair<int, int> progress_pair(make_pair(0, max_interrupt_callbacks));
1180 
1181  fnptr = blaster->SetInterruptCallback(interrupt_at_random,
1182  (void*)&progress_pair);
1183  BOOST_REQUIRE(fnptr == callback_counter);
1184  sav.clear();
1185 
1186  try { sav = blaster->Run(); }
1187  catch (...) {
1188  BOOST_REQUIRE_EQUAL((size_t)0, sav.size());
1189  }
1190 }
1191 
1192 // interrupt_at_random.
1193 BOOST_AUTO_TEST_CASE(testInterruptBlastxExitAtRandom) {
1194 
1195  CRef<CBl2Seq> blaster =
1197 
1198  int num_callbacks_executed(0);
1199  TInterruptFnPtr fnptr =
1200  blaster->SetInterruptCallback(callback_counter,
1201  (void*) & num_callbacks_executed);
1202  BOOST_REQUIRE(fnptr == NULL);
1203 
1204  TSeqAlignVector sav(blaster->Run()); // won't throw
1205  CRandom r((CRandom::TValue)time(0));
1206  int max_interrupt_callbacks = r.GetRand(1, num_callbacks_executed);
1207  pair<int, int> progress_pair(make_pair(0, max_interrupt_callbacks));
1208 
1209  fnptr = blaster->SetInterruptCallback(interrupt_at_random,
1210  (void*)&progress_pair);
1211  BOOST_REQUIRE(fnptr == callback_counter);
1212  sav.clear();
1213 
1214  try { sav = blaster->Run(); }
1215  catch (...) {
1216  BOOST_REQUIRE_EQUAL((size_t)0, sav.size());
1217  }
1218 }
1219 
1220 BOOST_AUTO_TEST_CASE(testInterruptTblastnExitAtRandom) {
1221 
1222  CRef<CBl2Seq> blaster =
1224 
1225  int num_callbacks_executed(0);
1226  TInterruptFnPtr fnptr =
1227  blaster->SetInterruptCallback(callback_counter,
1228  (void*)&num_callbacks_executed);
1229  BOOST_REQUIRE(fnptr == NULL);
1230 
1231  TSeqAlignVector sav(blaster->Run()); // won't throw
1232  CRandom r((CRandom::TValue)time(0));
1233  int max_interrupt_callbacks = r.GetRand(1, num_callbacks_executed);
1234  pair<int, int> progress_pair(make_pair(0, max_interrupt_callbacks));
1235 
1236  fnptr = blaster->SetInterruptCallback(interrupt_at_random,
1237  (void*)&progress_pair);
1238  BOOST_REQUIRE(fnptr == callback_counter);
1239  sav.clear();
1240 
1241  try { sav = blaster->Run(); }
1242  catch (...) {
1243  BOOST_REQUIRE_EQUAL((size_t)0, sav.size());
1244  }
1245 }
1246 
1247 BOOST_AUTO_TEST_CASE(testInterruptTblastxExitAtRandom) {
1248 
1249  CRef<CBl2Seq> blaster =
1251 
1252  int num_callbacks_executed(0);
1253  TInterruptFnPtr fnptr =
1254  blaster->SetInterruptCallback(callback_counter,
1255  (void*) & num_callbacks_executed);
1256  BOOST_REQUIRE(fnptr == NULL);
1257 
1258  TSeqAlignVector sav(blaster->Run()); // won't throw
1259  CRandom r((CRandom::TValue)time(0));
1260  int max_interrupt_callbacks = r.GetRand(1, num_callbacks_executed);
1261  pair<int, int> progress_pair(make_pair(0, max_interrupt_callbacks));
1262 
1263  fnptr = blaster->SetInterruptCallback(interrupt_at_random,
1264  (void*)&progress_pair);
1265  BOOST_REQUIRE(fnptr == callback_counter);
1266  sav.clear();
1267 
1268  try { sav = blaster->Run(); }
1269  catch (...) {
1270  BOOST_REQUIRE_EQUAL((size_t)0, sav.size());
1271  }
1272 }
1273 
1274 BOOST_AUTO_TEST_CASE(testInterruptBlastpExitAfter3Callbacks) {
1275  CSeq_id id("gi|129295");
1276  unique_ptr<SSeqLoc> sl(CTestObjMgr::Instance().CreateSSeqLoc(id));
1277 
1278  CBl2Seq blaster(*sl, *sl, eBlastp);
1279  TInterruptFnPtr fnptr =
1281  BOOST_REQUIRE(fnptr == NULL);
1282 
1283  TSeqAlignVector sav;
1284  try { sav = blaster.Run(); }
1285  catch (...) {
1286  BOOST_REQUIRE_EQUAL((size_t)0, sav.size());
1287  }
1288 }
1289 
1290 BOOST_AUTO_TEST_CASE(testInterruptBlastxExitOnTraceback) {
1291 
1293  false,
1294  eBlastx);
1295  TInterruptFnPtr fnptr =
1296  blaster->SetInterruptCallback(interrupt_on_traceback);
1297  BOOST_REQUIRE(fnptr == NULL);
1298 
1299  TSeqAlignVector sav;
1300  try { sav = blaster->Run(); }
1301  catch (...) {
1302  BOOST_REQUIRE_EQUAL((size_t)0, sav.size());
1303  }
1304 }
1305 
1306 BOOST_AUTO_TEST_CASE(testInterruptTblastxExitOnTraceback) {
1307 
1309  (true, true, eTblastx);
1310  TInterruptFnPtr fnptr =
1311  blaster->SetInterruptCallback(interrupt_on_traceback);
1312  BOOST_REQUIRE(fnptr == NULL);
1313 
1314  TSeqAlignVector sav;
1315  try { sav = blaster->Run(); }
1316  catch (...) {
1317  BOOST_REQUIRE_EQUAL((size_t)0, sav.size());
1318  }
1319 }
1320 
1321 BOOST_AUTO_TEST_CASE(ProteinBlastMultipleQueriesWithInvalidSeqId) {
1322  vector<TIntId> q_gis, s_gis;
1323 
1324  // Setup the queries
1325  q_gis.push_back(129295);
1326  q_gis.push_back(-1); // invalid seqid
1327 
1328  // setup the subjects
1329  s_gis.push_back(129295);
1330  s_gis.push_back(4336138); // no hits with gi 129295
1331 
1332  TSeqLocVector queries;
1333  ITERATE(vector<TIntId>, itr, q_gis) {
1334  CRef<CSeq_loc> loc(new CSeq_loc());
1335  loc->SetWhole().SetGi(GI_FROM(TIntId, *itr));
1336 
1337  CScope* scope = new CScope(CTestObjMgr::Instance().GetObjMgr());
1338  scope->AddDefaults();
1339  queries.push_back(SSeqLoc(loc, scope));
1340  }
1341 
1342  TSeqLocVector subjects;
1343  ITERATE(vector<TIntId>, itr, s_gis) {
1344  CRef<CSeq_loc> loc(new CSeq_loc());
1345  loc->SetWhole().SetGi(GI_FROM(TIntId, *itr));
1346 
1347  CScope* scope = new CScope(CTestObjMgr::Instance().GetObjMgr());
1348  scope->AddDefaults();
1349  subjects.push_back(SSeqLoc(loc, scope));
1350  }
1351 
1352  // BLAST by concatenating all queries
1353  CBl2Seq blaster4all(queries, subjects, eBlastp);
1354  TSeqAlignVector sas_v = blaster4all.Run();
1355 
1356  TSearchMessages m;
1357  blaster4all.GetMessages(m);
1358  BOOST_REQUIRE_EQUAL(subjects.size()*queries.size(), sas_v.size());
1359 
1360  BOOST_REQUIRE(m[0].empty());
1361  BOOST_REQUIRE(!m[1].empty());
1362 
1363  // Verify the error message
1364  TQueryMessages qm = m[1];
1365  BOOST_REQUIRE(qm.front()->GetMessage().find("Cannot resolve") !=
1366  string::npos);
1367 
1368  // Verify that the alignments corresponding to the 2nd query are indeed empty
1369  // in older version this was sas_v[1], order has changed
1370  BOOST_REQUIRE_EQUAL(0, (int) sas_v[2]->Size());
1371 
1372  CRef<CSearchResultSet> res = blaster4all.RunEx();
1373  BOOST_REQUIRE(res.NotEmpty());
1374  BOOST_REQUIRE_EQUAL(q_gis.size()*s_gis.size(), res->GetNumResults());
1375  BOOST_REQUIRE(res->GetResults(0,0).HasAlignments());
1376  BOOST_REQUIRE(!res->GetResults(0,1).HasAlignments());
1377  BOOST_REQUIRE(!res->GetResults(1,0).HasAlignments());
1378  BOOST_REQUIRE(!res->GetResults(1,1).HasAlignments());
1379 }
1380 
1381 // JIRA:SB-1242
1382 BOOST_AUTO_TEST_CASE(ProteinBlastMultipleQueriesWithBadQuery) {
1383  vector<TIntId> q_gis, s_gis;
1384 
1385  // Setup the queries
1386  q_gis.push_back(296863684); // All X's
1387  q_gis.push_back(129295);
1388 
1389  // setup the subjects
1390  s_gis.push_back(129296);
1391 
1392  TSeqLocVector queries;
1393  ITERATE(vector<TIntId>, itr, q_gis) {
1394  CRef<CSeq_loc> loc(new CSeq_loc());
1395  loc->SetWhole().SetGi(GI_FROM(TIntId, *itr));
1396 
1397  CScope* scope = new CScope(CTestObjMgr::Instance().GetObjMgr());
1398  scope->AddDefaults();
1399  queries.push_back(SSeqLoc(loc, scope));
1400  }
1401 
1402  TSeqLocVector subjects;
1403  ITERATE(vector<TIntId>, itr, s_gis) {
1404  CRef<CSeq_loc> loc(new CSeq_loc());
1405  loc->SetWhole().SetGi(GI_FROM(TIntId, *itr));
1406 
1407  CScope* scope = new CScope(CTestObjMgr::Instance().GetObjMgr());
1408  scope->AddDefaults();
1409  subjects.push_back(SSeqLoc(loc, scope));
1410  }
1411 
1412  // BLAST by concatenating all queries
1413  CBl2Seq blaster4all(queries, subjects, eBlastp);
1414  TSeqAlignVector sas_v = blaster4all.Run();
1415 
1416  TSearchMessages m;
1417  blaster4all.GetMessages(m);
1418  BOOST_REQUIRE_EQUAL(subjects.size()*queries.size(), sas_v.size());
1419 
1420  BOOST_REQUIRE(!m[0].empty());
1421  BOOST_REQUIRE(m[1].empty());
1422 }
1423 
1424 BOOST_AUTO_TEST_CASE(NucleotideBlastMultipleQueriesWithInvalidSeqId) {
1425  CRef<CSeq_id> id1(new CSeq_id(CSeq_id::e_Gi, 555));
1426  unique_ptr<SSeqLoc> sl1(CTestObjMgr::Instance().CreateSSeqLoc(*id1));
1427  CRef<CSeq_id> id2(new CSeq_id(CSeq_id::e_Gi, 556));
1428  unique_ptr<SSeqLoc> sl2(CTestObjMgr::Instance().CreateSSeqLoc(*id2));
1429 
1430  const TSeqPos kFakeBioseqLength = 12;
1431  const char byte(0); // string of 4 A's in ncbi2na
1432  vector<char> na_data(kFakeBioseqLength/4, byte);
1433 
1434  CRef<CSeq_id> fake_id(new CSeq_id("lcl|77"));
1435  CRef<CBioseq> fake_bioseq(new CBioseq);
1436  fake_bioseq->SetInst().SetLength(kFakeBioseqLength);
1437  fake_bioseq->SetInst().SetSeq_data().SetNcbi2na().Set().swap(na_data);
1438  fake_bioseq->SetInst().SetMol(CSeq_inst::eMol_na);
1439  fake_bioseq->SetInst().SetRepr(CSeq_inst::eRepr_raw);
1440  fake_bioseq->SetId().push_back(fake_id);
1441  CRef<CSeq_loc> fake_loc(new CSeq_loc);
1442  fake_loc->SetWhole(*fake_id);
1443 
1444  CRef<CScope> scope(CSimpleOM::NewScope(false));
1445  scope->AddBioseq(*fake_bioseq);
1446  unique_ptr<SSeqLoc> sl_bad(new SSeqLoc(*fake_loc, *scope));
1447 
1448  TSeqPos len = sequence::GetLength(*sl_bad->seqloc, sl_bad->scope);
1449  BOOST_REQUIRE_EQUAL(kFakeBioseqLength, len);
1450 
1451  TSeqLocVector queries;
1452  queries.push_back(*sl1);
1453  queries.push_back(*sl_bad);
1454  queries.push_back(*sl2);
1455 
1456  // All subjects have matches against this gi
1457  CRef<CSeq_id> subj_id(new CSeq_id(CSeq_id::e_Gi, 555));
1458  unique_ptr<SSeqLoc> subj_loc
1459  (CTestObjMgr::Instance().CreateSSeqLoc(*subj_id));
1461  subject.push_back(*subj_loc);;
1462 
1464  opts_handle->SetMaskAtHash(false);
1465  CBl2Seq bl2seq(queries, subject, *opts_handle);
1466  TSeqAlignVector sas_v = bl2seq.Run();
1467  sas_v = bl2seq.Run();
1468  TSearchMessages m;
1469  bl2seq.GetMessages(m);
1470  BOOST_REQUIRE_EQUAL(sas_v.size(), m.size());
1471  BOOST_REQUIRE_EQUAL(queries.size(), sas_v.size());
1472 
1473  BOOST_REQUIRE(m[0].empty());
1474  BOOST_REQUIRE(!m[1].empty());
1475  BOOST_REQUIRE(m[2].empty());
1476 
1477  TQueryMessages qm = m[1];
1478 
1479  // no duplicate messages for the contexts
1480  BOOST_REQUIRE(qm.size() == 1);
1481  // Verify the error message
1482  ITERATE(TQueryMessages, itr, qm) {
1483  BOOST_REQUIRE((*itr)->GetMessage().find("Could not calculate "
1484  "ungapped Karlin-Altschul "
1485  "parameters")
1486  != string::npos);
1487  }
1488  // Verify that the alignments corresponding to the 2nd query are indeed
1489  // empty
1490  ITERATE(CSeq_align_set::Tdata, alignments, sas_v[1]->Get()) {
1491  BOOST_REQUIRE((*alignments)->GetSegs().IsDisc());
1492  BOOST_REQUIRE((*alignments)->GetSegs().GetDisc().Get().empty());
1493  }
1494 }
1495 
1496 BOOST_AUTO_TEST_CASE(ProteinSelfHitWithMask) {
1497  CRef<CSeq_id> id(new CSeq_id(CSeq_id::e_Gi, 129295));
1498  CRef<CSeq_loc> sl(new CSeq_loc());
1499  sl->SetWhole(*id);
1500  CRef<CSeq_loc> mask(new CSeq_loc(*id, 50, 100));
1502  SSeqLoc seqloc(sl, scope, mask);
1503 
1504  CBl2Seq bl2seq(seqloc, seqloc, eBlastp);
1505  TSeqAlignVector sav(bl2seq.Run());
1506  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
1507  BOOST_REQUIRE_EQUAL(1, (int)sar->GetSegs().GetDenseg().GetNumseg());
1508 }
1509 
1510 // Inspired by SB-285
1511 BOOST_AUTO_TEST_CASE(NucleotideMaskedLocation) {
1512  CRef<CSeq_id> id(new CSeq_id(CSeq_id::e_Gi, 83219349));
1513  CRef<CSeq_loc> sl(new CSeq_loc());
1514  sl->SetWhole(*id);
1515  CRef<CSeq_loc> mask(new CSeq_loc(*id, 57, 484));
1517  SSeqLoc query_seqloc(sl, scope, mask);
1518 
1519  CRef<CSeq_id> sid(new CSeq_id(CSeq_id::e_Gi, 88954065));
1520  CRef<CSeq_loc> ssl(new CSeq_loc(*sid, 9909580-100, 9909607+100));
1521  SSeqLoc subj_seqloc(ssl, scope);
1522 
1523  CBl2Seq bl2seq(query_seqloc, subj_seqloc, eMegablast);
1524  TSeqAlignVector sav(bl2seq.Run());
1525  BOOST_REQUIRE_EQUAL(0U, sav[0]->Get().size());
1526 }
1527 
1528 // Inspired by SB-285
1529 BOOST_AUTO_TEST_CASE(NucleotideMaskedLocation_FromFile) {
1530  CNcbiIfstream infile("data/masked.fsa");
1531  const bool is_protein(false);
1532  CBlastInputSourceConfig iconfig(is_protein);
1533  iconfig.SetLowercaseMask(true);
1535  (new CBlastFastaInputSource(infile, iconfig));
1536  CRef<CBlastInput> input(new CBlastInput(&*fasta_src));
1537  //CRef<CScope> scope(new CScope(*CObjectManager::GetInstance()));
1538  //scope->AddDefaults();
1539  CRef<CScope> scope = CBlastScopeSource(is_protein).NewScope();
1540 
1541  CRef<blast::CBlastQueryVector> seqs = input->GetNextSeqBatch(*scope);
1542  CRef<IQueryFactory> queries(new CObjMgr_QueryFactory(*seqs));
1543 
1544  TSeqLocVector subj_vec;
1545  CRef<CSeq_id> sid(new CSeq_id(CSeq_id::e_Gi, 88954065));
1546  CRef<CSeq_loc> ssl(new CSeq_loc(*sid, 9909580-100, 9909607+100));
1547  subj_vec.push_back(SSeqLoc(ssl, scope));
1548  CRef<IQueryFactory> subj_qf(new CObjMgr_QueryFactory(subj_vec));
1550  opts_handle(CBlastOptionsFactory::Create(eBlastn));
1551  CRef<CLocalDbAdapter> subjects(new CLocalDbAdapter(subj_qf,
1552  opts_handle));
1553 
1554  size_t num_queries = seqs->Size();
1555  size_t num_subjects = subj_vec.size();
1556  BOOST_REQUIRE_EQUAL((size_t)1, num_queries);
1557  BOOST_REQUIRE_EQUAL((size_t)1, num_subjects);
1558 
1559  // BLAST by concatenating all queries
1560  CLocalBlast blaster(queries, opts_handle, subjects);
1561  CRef<CSearchResultSet> results = blaster.Run();
1562  BOOST_REQUIRE(results->GetResultType() == eSequenceComparison);
1563  BOOST_REQUIRE_EQUAL((num_queries*num_subjects),
1564  results->GetNumResults());
1565  BOOST_REQUIRE_EQUAL((num_queries*num_subjects), results->size());
1566  BOOST_REQUIRE_EQUAL(num_queries, results->GetNumQueries());
1567  BOOST_REQUIRE_EQUAL(num_subjects,
1568  results->GetNumResults()/results->GetNumQueries());
1569 
1570  CSearchResults& res = (*results)[0];
1571  BOOST_REQUIRE(res.HasAlignments() == false);
1572 }
1573 
1574 // JIRA SB-732
1575 BOOST_AUTO_TEST_CASE(InvalidMaskingAlgorithm) {
1576  CRef<blast::CSearchDatabase> m_SearchDb;
1577  m_SearchDb.Reset(new CSearchDatabase("nr",
1579  m_SearchDb->SetFilteringAlgorithm(66, eSoftSubjMasking);
1580  CRef<CLocalDbAdapter> subject_adapter(new CLocalDbAdapter(*m_SearchDb));
1582  vector<TIntId> q_gis;
1583  // Setup the queries
1584  CSeq_id seq_id("P01013.1");
1585 
1586  TSeqLocVector queries;
1587  CRef<CSeq_loc> loc(new CSeq_loc());
1588  loc->SetWhole(seq_id);
1589 
1590  CScope* scope = new CScope(CTestObjMgr::Instance().GetObjMgr());
1591  scope->AddDefaults();
1592  queries.push_back(SSeqLoc(loc, scope));
1593 
1594  CRef<IQueryFactory> query_fact(new CObjMgr_QueryFactory(queries));
1595  BOOST_REQUIRE_THROW(CLocalBlast blaster(query_fact, opts, subject_adapter),
1596  CBlastException);
1597 }
1598 
1599 // test for the case where the use of composition based
1600 // satistics should have deleted a hit but did not (used to crash)
1601 BOOST_AUTO_TEST_CASE(ProteinCompBasedStats) {
1602 
1604  CRef<CScope> scope(new CScope(*kObjMgr));
1605  CRef<CSeq_entry> seq_entry1;
1606  const string kFileName("data/blastp_compstats.fa");
1607  ifstream in1(kFileName.c_str());
1608  if ( !in1 )
1609  throw runtime_error("Failed to open " + kFileName);
1610  if ( !(seq_entry1 = CFastaReader(in1).ReadOneSeq()))
1611  throw runtime_error("Failed to read sequence from " + kFileName);
1612  scope->AddTopLevelSeqEntry(*seq_entry1);
1613  CRef<CSeq_loc> seqloc1(new CSeq_loc);
1614  const string kSeqIdString1("lcl|1");
1615  CRef<CSeq_id> id1(new CSeq_id(kSeqIdString1));
1616  seqloc1->SetWhole(*id1);
1617  SSeqLoc ss1(seqloc1, scope);
1618 
1619  CSeq_id id("gi|4503637");
1620  unique_ptr<SSeqLoc> ss2(CTestObjMgr::Instance().CreateSSeqLoc(id));
1621 
1623  opts_handle->SetWordSize(2);
1624  opts_handle->SetEvalueThreshold(20000);
1625  opts_handle->SetFilterString("F");/* NCBI_FAKE_WARNING */
1626  opts_handle->SetMatrixName("PAM30");
1627  opts_handle->SetGapOpeningCost(9);
1628  opts_handle->SetGapExtensionCost(1);
1629  opts_handle->SetOptions().SetCompositionBasedStats(
1631 
1632  CBl2Seq blaster(ss1, *ss2, *opts_handle);
1633  TSeqAlignVector sav(blaster.Run());
1634  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
1635  BOOST_REQUIRE_EQUAL(1, (int)sar->GetSegs().GetDenseg().GetNumseg());
1636 }
1637 
1638 BOOST_AUTO_TEST_CASE(Blastx2Seqs_QueryBothStrands) {
1639  CSeq_id qid("gi|555");
1640  unique_ptr<SSeqLoc> query(
1641  CTestObjMgr::Instance().CreateSSeqLoc(qid, eNa_strand_both));
1642  query->genetic_code_id = 1;
1643 
1644  CSeq_id sid("gi|129295");
1645  unique_ptr<SSeqLoc> subj(CTestObjMgr::Instance().CreateSSeqLoc(sid));
1646 
1647  CBl2Seq blaster(*query, *subj, eBlastx);
1648  TSeqAlignVector sav(blaster.Run());
1649  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
1650  // cerr << "Align " << MSerial_AsnText << *sar << endl;
1651  BOOST_REQUIRE_EQUAL(1, (int)sar->GetSegs().GetStd().size());
1654 }
1655 
1656 BOOST_AUTO_TEST_CASE(NucleotideSelfHitWithSubjectMask) {
1657  CRef<CSeq_id> query_id(new CSeq_id(CSeq_id::e_Gi, 148727250));
1658  CRef<CSeq_id> subj_id(new CSeq_id(CSeq_id::e_Gi, 89059606));
1659  CRef<CSeq_loc> qsl(new CSeq_loc(*query_id, 0, 1000));
1660  CRef<CSeq_loc> ssl(new CSeq_loc(*subj_id, 0, 1000));
1661  CPacked_seqint::TRanges mask_vector;
1662  mask_vector.push_back(TSeqRange(0, 44));
1663  mask_vector.push_back(TSeqRange(69, 582));
1664  mask_vector.push_back(TSeqRange(610, 834));
1665  mask_vector.push_back(TSeqRange(854, 1000));
1666  CRef<CPacked_seqint> masks(new CPacked_seqint(*subj_id,
1667  mask_vector));
1668  CRef<CSeq_loc> subj_mask(new CSeq_loc());
1669  subj_mask->SetPacked_int(*masks);
1671  SSeqLoc query(qsl, scope);
1672  unique_ptr<SSeqLoc> subject(new SSeqLoc(ssl, scope, subj_mask));
1673  {
1674  CBl2Seq bl2seq(query, *subject, eBlastn);
1675  TSeqAlignVector sav(bl2seq.Run());
1676  BOOST_REQUIRE_EQUAL((size_t)1, sav.front()->Get().size());
1677  }
1678 
1679  // Now compare the same sequences, without the subject masks
1680  subject.reset(new SSeqLoc(ssl, scope));
1681  {
1682  CBl2Seq bl2seq(query, *subject, eBlastn);
1683  TSeqAlignVector sav(bl2seq.Run());
1684  BOOST_REQUIRE_EQUAL((size_t)4, sav.front()->Get().size());
1685  }
1686 }
1687 
1688 BOOST_AUTO_TEST_CASE(NucleotideBlastSelfHit) {
1689  CSeq_id id("gi|555");
1690  unique_ptr<SSeqLoc> sl(
1691  CTestObjMgr::Instance().CreateSSeqLoc(id, eNa_strand_both));
1692 
1693  // Traditional blastn search
1695  CBl2Seq blaster(*sl, *sl, *opts);
1696  TSeqAlignVector sav = blaster.Run();
1697  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
1698  BOOST_REQUIRE_EQUAL(1, (int)sar->GetSegs().GetDenseg().GetNumseg());
1701 
1702  // Change the options to megablast
1704  blaster.SetOptionsHandle() = *opts;
1705  sav = blaster.Run();
1706  BOOST_REQUIRE_EQUAL(1, (int)sav.size());
1707  sar = *(sav[0]->Get().begin());
1708  BOOST_REQUIRE_EQUAL(1, (int)sar->GetSegs().GetDenseg().GetNumseg());
1711 
1712  // Change the options to discontiguous megablast
1714  blaster.SetOptionsHandle() = *opts;
1715  sav = blaster.Run();
1716  sar = *(sav[0]->Get().begin());
1717  BOOST_REQUIRE_EQUAL(1, (int)sar->GetSegs().GetDenseg().GetNumseg());
1720 }
1721 
1722 BOOST_AUTO_TEST_CASE(MegablastGreedyTraceback) {
1723  CSeq_id query_id("gi|2655203");
1724  unique_ptr<SSeqLoc> ql(
1725  CTestObjMgr::Instance().CreateSSeqLoc(query_id,
1726  eNa_strand_plus));
1727 
1728  CSeq_id subject_id("gi|200811");
1729  unique_ptr<SSeqLoc> sl(
1730  CTestObjMgr::Instance().CreateSSeqLoc(subject_id,
1731  eNa_strand_minus));
1732 
1733  // test a fix for a bug that corrupted the traceback
1734  // in the one hit returned from this search
1735 
1737  opts->SetTraditionalMegablastDefaults();
1738  opts->SetMatchReward(1);
1739  opts->SetMismatchPenalty(-2);
1740  opts->SetGapOpeningCost(3);
1741  opts->SetGapExtensionCost(1);
1742  opts->SetWordSize(24);
1743  opts->SetGapExtnAlgorithm(eGreedyScoreOnly);
1744  opts->SetGapTracebackAlgorithm(eGreedyTbck);
1745 
1746  CBl2Seq blaster(*ql, *sl, *opts);
1747 
1748  CRef<CSearchResultSet> res = blaster.RunEx();
1749  BOOST_REQUIRE_EQUAL(eSequenceComparison, res->GetResultType());
1750  BOOST_REQUIRE_EQUAL(1U, res->size());
1751  CConstRef<CSeq_align_set> sas = (*res)[0].GetSeqAlign();
1752  BOOST_REQUIRE(sas.NotEmpty());
1753  BOOST_REQUIRE_EQUAL(1U, sas->Get().size());
1754  CRef<CSeq_align> sa = sas->Get().front();
1755  int score = 0;
1757  BOOST_REQUIRE_EQUAL(832, score);
1758 
1759  TSeqAlignVector alignments = blaster.Run();
1760  BOOST_REQUIRE_EQUAL(1U, alignments.size());
1761  sa = alignments[0]->Get().front();
1763  BOOST_REQUIRE_EQUAL(832, score);
1764 }
1765 
1766 
1767 BOOST_AUTO_TEST_CASE(MegablastGreedyTraceback2) {
1769  CRef<CScope> scope(new CScope(*kObjMgr));
1770 
1771  CRef<CSeq_entry> seq_entry1;
1772  ifstream in1("data/greedy1a.fsa");
1773  if ( !in1 )
1774  throw runtime_error("Failed to open file1");
1775  if ( !(seq_entry1 = CFastaReader(in1).ReadOneSeq()))
1776  throw runtime_error("Failed to read sequence from file1");
1777  scope->AddTopLevelSeqEntry(*seq_entry1);
1778  CRef<CSeq_loc> seqloc1(new CSeq_loc);
1779  const string kSeqIdString1("lcl|1");
1780  CRef<CSeq_id> id1(new CSeq_id(kSeqIdString1));
1781  seqloc1->SetWhole(*id1);
1782  SSeqLoc ss1(seqloc1, scope);
1783 
1784  CRef<CSeq_entry> seq_entry2;
1785  ifstream in2("data/greedy1b.fsa");
1786  if ( !in2 )
1787  throw runtime_error("Failed to open file2");
1788  if ( !(seq_entry2 = CFastaReader(in2).ReadOneSeq()))
1789  throw runtime_error("Failed to read sequence from file2");
1790  scope->AddTopLevelSeqEntry(*seq_entry2);
1791  CRef<CSeq_loc> seqloc2(new CSeq_loc);
1792  const string kSeqIdString2("lcl|2");
1793  CRef<CSeq_id> id2(new CSeq_id(kSeqIdString2));
1794  seqloc2->SetWhole(*id2);
1795  SSeqLoc ss2(seqloc2, scope);
1796 
1798  handle->SetGapOpeningCost(0);
1799  handle->SetGapExtensionCost(0);
1800  handle->SetDustFiltering(false);
1801 
1802  // test multiple bug fixes in greedy gapped alignment
1803 
1804  CBl2Seq blaster1(ss1, ss2, *handle);
1805  TSeqAlignVector sav(blaster1.Run());
1806  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
1807  BOOST_REQUIRE_EQUAL(1, (int)sav[0]->Size());
1808 
1809  const CSeq_align& seqalign1 = *sar;
1810  BOOST_REQUIRE(seqalign1.IsSetScore());
1811  ITERATE(CSeq_align::TScore, itr, seqalign1.GetScore()) {
1812  BOOST_REQUIRE((*itr)->IsSetId());
1813  if ((*itr)->GetId().GetStr() == "score") {
1814  BOOST_REQUIRE_EQUAL(619, (*itr)->GetValue().GetInt());
1815  break;
1816  }
1817  }
1818 
1819  handle->SetMatchReward(10);
1820  handle->SetMismatchPenalty(-25);
1821  handle->SetGapXDropoff(100.0);
1822  handle->SetGapXDropoffFinal(100.0);
1823 
1824  CBl2Seq blaster2(ss1, ss2, *handle);
1825  sav = blaster2.Run();
1826  sar = *(sav[0]->Get().begin());
1827  BOOST_REQUIRE_EQUAL(1, (int)sav[0]->Size());
1828 
1829  const CSeq_align& seqalign2 = *sar;
1830  BOOST_REQUIRE(seqalign2.IsSetScore());
1831  ITERATE(CSeq_align::TScore, itr, seqalign2.GetScore()) {
1832  BOOST_REQUIRE((*itr)->IsSetId());
1833  if ((*itr)->GetId().GetStr() == "score") {
1834  BOOST_REQUIRE_EQUAL(6035, (*itr)->GetValue().GetInt());
1835  break;
1836  }
1837  }
1838 }
1839 
1840 BOOST_AUTO_TEST_CASE(MegablastGreedyTracebackSelfHits) {
1841  // the following tests are for JIRA SB-1041, where the greedy
1842  // traceback may stop prematurely.
1843  CSeq_id query_id("gi|56384585");
1844  unique_ptr<SSeqLoc> ql(
1845  CTestObjMgr::Instance().CreateSSeqLoc(query_id,
1846  eNa_strand_plus));
1847 
1848  CSeq_id subject_id("gi|56384585");
1849  unique_ptr<SSeqLoc> sl(
1850  CTestObjMgr::Instance().CreateSSeqLoc(subject_id,
1851  eNa_strand_plus));
1852 
1854  opts->SetTraditionalMegablastDefaults();
1855  opts->SetMatchReward(1);
1856  opts->SetMismatchPenalty(-2);
1857  opts->SetGapOpeningCost(0);
1858  opts->SetGapExtensionCost(0);
1859  opts->SetWordSize(28);
1860  opts->SetGapExtnAlgorithm(eGreedyScoreOnly);
1861  opts->SetGapTracebackAlgorithm(eGreedyTbck);
1862 
1863  CBl2Seq blaster(*ql, *sl, *opts);
1864 
1865  CRef<CSearchResultSet> res = blaster.RunEx();
1866  BOOST_REQUIRE_EQUAL(eSequenceComparison, res->GetResultType());
1867  BOOST_REQUIRE_EQUAL(1U, res->size());
1868  CConstRef<CSeq_align_set> sas = (*res)[0].GetSeqAlign();
1869  BOOST_REQUIRE(sas.NotEmpty());
1870  CRef<CSeq_align> sa = sas->Get().front();
1871  int score = 0;
1873  BOOST_REQUIRE_EQUAL(3794584, score);
1874 }
1875 
1876 BOOST_AUTO_TEST_CASE(Blastx2Seqs_QueryPlusStrand) {
1877  CSeq_id qid("gi|555");
1878  unique_ptr<SSeqLoc> query(
1879  CTestObjMgr::Instance().CreateSSeqLoc(qid, eNa_strand_plus));
1880 
1881  CSeq_id sid("gi|129295");
1882  unique_ptr<SSeqLoc> subj(CTestObjMgr::Instance().CreateSSeqLoc(sid));
1883 
1884  CBl2Seq blaster(*query, *subj, eBlastx);
1885  TSeqAlignVector sav(blaster.Run());
1886  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
1887  BOOST_REQUIRE_EQUAL(1, (int)sar->GetSegs().GetStd().size());
1888 }
1889 
1890 BOOST_AUTO_TEST_CASE(Blastx2Seqs_QueryMinusStrand) {
1891  CSeq_id qid("gi|555");
1892  unique_ptr<SSeqLoc> query(
1893  CTestObjMgr::Instance().CreateSSeqLoc(qid, eNa_strand_minus));
1894 
1895  CSeq_id sid("gi|129295");
1896  unique_ptr<SSeqLoc> subj(CTestObjMgr::Instance().CreateSSeqLoc(sid));
1897 
1898  CBl2Seq blaster(*query, *subj, eBlastx);
1899  TSeqAlignVector sav(blaster.Run());
1900  // No hits. Empty CSeq_align_set returned.
1901  BOOST_REQUIRE(sav[0]->IsEmpty() == true);
1902 }
1903 
1904 
1905 BOOST_AUTO_TEST_CASE(TBlastx2Seqs_QueryBothStrands) {
1906  CSeq_id id("gi|555");
1907  unique_ptr<SSeqLoc> sl(
1908  CTestObjMgr::Instance().CreateSSeqLoc(id, eNa_strand_both));
1909 
1910  CBl2Seq blaster(*sl, *sl, eTblastx);
1911  TSeqAlignVector sav(blaster.Run());
1912  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
1913  BOOST_REQUIRE_EQUAL(39, (int)sar->GetSegs().GetStd().size());
1916 }
1917 
1918 BOOST_AUTO_TEST_CASE(TBlastx2Seqs_QueryPlusStrand) {
1919  CSeq_id id("gi|555");
1920  unique_ptr<SSeqLoc> sl(
1921  CTestObjMgr::Instance().CreateSSeqLoc(id, eNa_strand_plus));
1922 
1923  CBl2Seq blaster(*sl, *sl, eTblastx);
1924  TSeqAlignVector sav(blaster.Run());
1925  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
1926  BOOST_REQUIRE_EQUAL(11, (int)sar->GetSegs().GetStd().size());
1927 }
1928 
1929 BOOST_AUTO_TEST_CASE(TBlastx2Seqs_QueryMinusStrand) {
1930  CSeq_id id("gi|555");
1931  unique_ptr<SSeqLoc> sl(
1932  CTestObjMgr::Instance().CreateSSeqLoc(id, eNa_strand_minus));
1933 
1934  CBl2Seq blaster(*sl, *sl, eTblastx);
1935  TSeqAlignVector sav(blaster.Run());
1936  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
1937  BOOST_REQUIRE_EQUAL(12, (int)sar->GetSegs().GetStd().size());
1938 }
1939 
1940 
1941 BOOST_AUTO_TEST_CASE(TblastxManyHits) {
1942  const int total_num_hsps = 49;
1943  const int num_hsps_to_check = 8;
1944  const int score_array[num_hsps_to_check] =
1945  { 947, 125, 820, 113, 624, 221, 39, 778};
1946  const int sum_n_array[num_hsps_to_check] =
1947  { 2, 2, 2, 2, 3, 3, 3, 0};
1948  CSeq_id qid("gi|24719404");
1949  unique_ptr<SSeqLoc> qsl(
1950  CTestObjMgr::Instance().CreateSSeqLoc(qid, eNa_strand_both));
1951  CSeq_id sid("gi|29807292");
1952  pair<TSeqPos, TSeqPos> range(15185000, 15195000);
1953  unique_ptr<SSeqLoc> ssl(
1954  CTestObjMgr::Instance().CreateSSeqLoc(sid, range, eNa_strand_both));
1955  CBl2Seq blaster(*qsl, *ssl, eTblastx);
1956  blaster.SetOptionsHandle().SetMaxNumHspPerSequence(total_num_hsps);
1957 
1958  TSeqAlignVector sav(blaster.Run());
1959 
1962 
1963  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
1964  list< CRef<CStd_seg> >& segs = sar->SetSegs().SetStd();
1965  BOOST_REQUIRE_EQUAL(total_num_hsps, (int)segs.size());
1966  int index = 0;
1967  ITERATE(list< CRef<CStd_seg> >, itr, segs) {
1968  const vector< CRef< CScore > >& score_v = (*itr)->GetScores();
1969  ITERATE(CSeq_align::TScore, sitr, score_v) {
1970  BOOST_REQUIRE((*sitr)->IsSetId());
1971  if ((*sitr)->GetId().GetStr() == "score") {
1972  BOOST_REQUIRE_EQUAL(score_array[index],
1973  (*sitr)->GetValue().GetInt());
1974  } else if ((*sitr)->GetId().GetStr() == "sum_n") {
1975  BOOST_REQUIRE_EQUAL(sum_n_array[index],
1976  (*sitr)->GetValue().GetInt());
1977  }
1978  }
1979  if (++index == num_hsps_to_check)
1980  break;
1981  }
1982 }
1983 
1984 BOOST_AUTO_TEST_CASE(ProteinBlast2Seqs) {
1985  CSeq_id id("gi|129295");
1986  unique_ptr<SSeqLoc> query(CTestObjMgr::Instance().CreateSSeqLoc(id));
1987 
1988  id.SetGi(GI_CONST(7662354));
1989  unique_ptr<SSeqLoc> subj(CTestObjMgr::Instance().CreateSSeqLoc(id));
1990 
1991  CBl2Seq blaster(*query, *subj, eBlastp);
1992  TSeqAlignVector sav(blaster.Run());
1993  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
1994  BOOST_REQUIRE_EQUAL(1, (int)sar->GetSegs().GetDenseg().GetNumseg());
1997 }
1998 
1999 BOOST_AUTO_TEST_CASE(BlastnWithRepeatFiltering_InvalidDB) {
2000  CSeq_id qid("gi|555");
2001  unique_ptr<SSeqLoc> query(
2002  CTestObjMgr::Instance().CreateSSeqLoc(qid));
2003 
2006  const string kRepeatDb("junk");
2007  opts.SetRepeatFilteringDB(kRepeatDb.c_str());
2008  bool is_repeat_filtering_on = opts.GetRepeatFiltering();
2009  BOOST_REQUIRE(is_repeat_filtering_on);
2010  string repeat_db(opts.GetRepeatFilteringDB()
2011  ? opts.GetRepeatFilteringDB()
2012  : kEmptyStr);
2013  BOOST_REQUIRE_EQUAL(kRepeatDb, repeat_db);
2014 
2015  CBl2Seq blaster(*query, *query, opts);
2016  BOOST_CHECK_THROW((void) blaster.Run(), CBlastException);
2017  BOOST_CHECK_THROW((void) blaster.RunEx(), CBlastException);
2018 }
2019 
2020 BOOST_AUTO_TEST_CASE(BlastnWithRepeatFiltering) {
2021  CSeq_id qid("gi|555");
2022  unique_ptr<SSeqLoc> query(
2023  CTestObjMgr::Instance().CreateSSeqLoc(qid));
2024 
2026  opts->SetTraditionalMegablastDefaults();
2027  opts->SetRepeatFiltering(true);
2028  string repeat_db(opts->GetRepeatFilteringDB()
2029  ? opts->GetRepeatFilteringDB()
2030  : kEmptyStr);
2031  BOOST_REQUIRE_EQUAL(string(kDefaultRepeatFilterDb), repeat_db);
2032  // it's harmless to set them both, but only the latter one will be used
2033  const string kRepeatDb("repeat/repeat_9606");
2034  opts->SetRepeatFilteringDB(kRepeatDb.c_str());
2035  repeat_db.assign(opts->GetRepeatFilteringDB()
2036  ? opts->GetRepeatFilteringDB()
2037  : kEmptyStr);
2038  BOOST_REQUIRE_EQUAL(kRepeatDb, repeat_db);
2039 
2040  bool is_repeat_filtering_on = opts->GetRepeatFiltering();
2041  BOOST_REQUIRE(is_repeat_filtering_on);
2042 
2043  CBl2Seq blaster(*query, *query, *opts);
2044  TSeqAlignVector sav(blaster.Run());
2045  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
2046  BOOST_REQUIRE(sar.NotEmpty());
2047  BOOST_REQUIRE(sar->GetSegs().GetDenseg().GetNumseg() >= 1);
2048 }
2049 
2050 BOOST_AUTO_TEST_CASE(BlastnWithWindowMasker_Db) {
2051  CSeq_id qid("gi|555");
2052  unique_ptr<SSeqLoc> query(
2053  CTestObjMgr::Instance().CreateSSeqLoc(qid));
2054 
2056  opts->SetTraditionalMegablastDefaults();
2057  const string kWindowMaskerDb = WindowMaskerTaxidToDb(9606);
2058  opts->SetWindowMaskerDatabase(kWindowMaskerDb.c_str());
2059  string wmdb(opts->GetWindowMaskerDatabase()
2060  ? opts->GetWindowMaskerDatabase() : kEmptyStr);
2061  BOOST_REQUIRE_EQUAL(kWindowMaskerDb, wmdb);
2062  BOOST_REQUIRE_EQUAL(0, opts->GetWindowMaskerTaxId());
2063  CBl2Seq blaster(*query, *query, *opts);
2064  TSeqAlignVector sav(blaster.Run());
2065  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
2066  BOOST_REQUIRE(sar.NotEmpty());
2067  BOOST_REQUIRE(sar->GetSegs().GetDenseg().GetNumseg() >= 1);
2068 }
2069 
2070 BOOST_AUTO_TEST_CASE(BlastnWithWindowMasker_Taxid) {
2071  CSeq_id qid("gi|555");
2072  unique_ptr<SSeqLoc> query(
2073  CTestObjMgr::Instance().CreateSSeqLoc(qid));
2074 
2076  opts->SetTraditionalMegablastDefaults();
2077  opts->SetWindowMaskerTaxId(9606);
2078  CBl2Seq blaster(*query, *query, *opts);
2079  TSeqAlignVector sav(blaster.Run());
2080  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
2081  BOOST_REQUIRE(sar.NotEmpty());
2082  BOOST_REQUIRE(sar->GetSegs().GetDenseg().GetNumseg() >= 1);
2083 }
2084 
2085 BOOST_AUTO_TEST_CASE(BlastnWithWindowMasker_InvalidDb) {
2086  CSeq_id qid("gi|555");
2087  unique_ptr<SSeqLoc> query(
2088  CTestObjMgr::Instance().CreateSSeqLoc(qid));
2089 
2092  const string kWindowMaskerDb("Dummydb");
2093  opts.SetWindowMaskerDatabase(kWindowMaskerDb.c_str());
2094  string wmdb(opts.GetWindowMaskerDatabase()
2095  ? opts.GetWindowMaskerDatabase() : kEmptyStr);
2096  BOOST_REQUIRE_EQUAL(kWindowMaskerDb, wmdb);
2097  CBl2Seq blaster(*query, *query, opts);
2098  BOOST_CHECK_THROW((void) blaster.Run(), CBlastException);
2099  BOOST_CHECK_THROW((void) blaster.RunEx(), CBlastException);
2100 }
2101 
2102 BOOST_AUTO_TEST_CASE(BlastnWithWindowMasker_InvalidTaxid) {
2103  CSeq_id qid("gi|555");
2104  unique_ptr<SSeqLoc> query(
2105  CTestObjMgr::Instance().CreateSSeqLoc(qid));
2106 
2109  const int kInvalidTaxId = -1;
2110  opts.SetWindowMaskerTaxId(kInvalidTaxId);
2111  BOOST_REQUIRE_EQUAL(kInvalidTaxId, opts.GetWindowMaskerTaxId());
2112  CBl2Seq blaster(*query, *query, opts);
2113  BOOST_CHECK_THROW((void) blaster.Run(), CBlastException);
2114  BOOST_CHECK_THROW((void) blaster.RunEx(), CBlastException);
2115 }
2116 
2117 BOOST_AUTO_TEST_CASE(WindowMaskerPathInitInvalid) {
2118  int rv = WindowMaskerPathInit(kEmptyStr);
2119  BOOST_CHECK_EQUAL(1, rv);
2120  rv = WindowMaskerPathInit("blast_unit_test");
2121  BOOST_CHECK_EQUAL(1, rv);
2122 }
2123 
2124 BOOST_AUTO_TEST_CASE(WindowMaskerPathInitValid) {
2125  int rv = WindowMaskerPathInit(CDir::GetCwd());
2126  BOOST_CHECK_EQUAL(0, rv);
2128 }
2129 
2130 BOOST_AUTO_TEST_CASE(BlastnWithWindowMasker_DbAndTaxid) {
2131  CSeq_id qid("gi|555");
2132  unique_ptr<SSeqLoc> query(
2133  CTestObjMgr::Instance().CreateSSeqLoc(qid));
2134 
2136  opts->SetTraditionalMegablastDefaults();
2137  // if both are set, the database name will be given preference
2138  opts->SetWindowMaskerDatabase(WindowMaskerTaxidToDb(9606).c_str());
2139  opts->SetWindowMaskerTaxId(-1);
2140  CBl2Seq blaster(*query, *query, *opts);
2141  TSeqAlignVector sav(blaster.Run());
2142  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
2143  BOOST_REQUIRE(sar.NotEmpty());
2144  BOOST_REQUIRE(sar->GetSegs().GetDenseg().GetNumseg() >= 1);
2145 }
2146 
2147 BOOST_AUTO_TEST_CASE(ConvertTaxIdToWindowMaskerDb) {
2148  string path;
2149  BOOST_CHECK_NO_THROW(path = WindowMaskerTaxidToDb(9606));
2150  BOOST_REQUIRE(NStr::EndsWith(path, "wmasker.obinary") ||
2151  NStr::EndsWith(path, "wmasker.oascii") );
2152 }
2153 
2154 // Bug report from Alex Astashyn
2156  CSeq_id qid("NG_007092.2");
2157  TSeqRange qr(0, 2311633);
2158  unique_ptr<SSeqLoc> query(
2159  CTestObjMgr::Instance().CreateSSeqLoc(qid, qr, eNa_strand_plus));
2160 
2161  CSeq_id sid("NT_007914.14");
2162  TSeqRange sr(5233652, 9849919);
2163  unique_ptr<SSeqLoc> subj(
2164  CTestObjMgr::Instance().CreateSSeqLoc(sid, sr));
2165 
2167  opts->SetTraditionalMegablastDefaults();
2168  opts->SetRepeatFiltering(true);
2169  CBl2Seq blaster(*query, *subj, *opts);
2170  TSeqAlignVector sav(blaster.Run());
2171  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
2172  BOOST_REQUIRE(sar.NotEmpty());
2173  BOOST_REQUIRE(sar->GetSegs().GetDenseg().GetNumseg() >= 1);
2174 }
2175 
2176 BOOST_AUTO_TEST_CASE(NucleotideBlast2Seqs) {
2177  CSeq_id qid("gi|555");
2178  unique_ptr<SSeqLoc> query(
2179  CTestObjMgr::Instance().CreateSSeqLoc(qid, eNa_strand_both));
2180 
2181  CSeq_id sid("gi|3090");
2182  unique_ptr<SSeqLoc> subj(
2183  CTestObjMgr::Instance().CreateSSeqLoc(sid, eNa_strand_both));
2184 
2186  opts->SetTraditionalBlastnDefaults();
2187  CBl2Seq blaster(*query, *subj, *opts);
2188  TSeqAlignVector sav(blaster.Run());
2189  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
2190  BOOST_REQUIRE_EQUAL(3, (int)sar->GetSegs().GetDenseg().GetNumseg());
2193 }
2194 
2195 BOOST_AUTO_TEST_CASE(ProteinBlastChangeQuery) {
2196  CSeq_id id("gi|129295");
2197  unique_ptr<SSeqLoc> query(CTestObjMgr::Instance().CreateSSeqLoc(id));
2198 
2199  id.SetGi(GI_CONST(7662354));
2200  unique_ptr<SSeqLoc> subj(CTestObjMgr::Instance().CreateSSeqLoc(id));
2201 
2202  // Run self hit first
2203  CBl2Seq blaster(*subj, *subj, eBlastp);
2204  TSeqAlignVector sav(blaster.Run());
2205  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
2206  BOOST_REQUIRE_EQUAL(1, (int)sar->GetSegs().GetDenseg().GetNumseg());
2207 
2208  // Change the query sequence (recreates the lookup table)
2209  blaster.SetQuery(*query);
2210  sav = blaster.Run();
2211  sar = *(sav[0]->Get().begin());
2212  BOOST_REQUIRE_EQUAL(1, (int)sar->GetSegs().GetDenseg().GetNumseg());
2213 }
2214 
2215 BOOST_AUTO_TEST_CASE(ProteinBlastChangeSubject) {
2216  CSeq_id qid("gi|129295");
2217  unique_ptr<SSeqLoc> query(CTestObjMgr::Instance().CreateSSeqLoc(qid));
2218 
2219  CSeq_id sid("gi|7662354");
2220  unique_ptr<SSeqLoc> subj(CTestObjMgr::Instance().CreateSSeqLoc(sid));
2221 
2222  // Run self hit first
2223  CBl2Seq blaster(*query, *query, eBlastp);
2224  TSeqAlignVector sav(blaster.Run());
2225  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
2226  BOOST_REQUIRE_EQUAL(1, (int)sar->GetSegs().GetDenseg().GetNumseg());
2227 
2228  // Change the subject sequence
2229  blaster.SetSubject(*subj);
2230  sav = blaster.Run();
2231  BOOST_REQUIRE_EQUAL(1, (int)sav.size());
2232  sar = *(sav[0]->Get().begin());
2233  BOOST_REQUIRE_EQUAL(1, (int)sar->GetSegs().GetDenseg().GetNumseg());
2234 }
2235 
2236 BOOST_AUTO_TEST_CASE(NucleotideBlastChangeQuery) {
2237  CSeq_id qid("gi|555");
2238  unique_ptr<SSeqLoc> query(
2239  CTestObjMgr::Instance().CreateSSeqLoc(qid, eNa_strand_both));
2240 
2241  CSeq_id sid("gi|3090");
2242  unique_ptr<SSeqLoc> subj(
2243  CTestObjMgr::Instance().CreateSSeqLoc(sid, eNa_strand_both));
2244 
2245  // Run self hit first
2247  opts->SetTraditionalBlastnDefaults();
2248  CBl2Seq blaster(*subj, *subj, *opts);
2249  TSeqAlignVector sav(blaster.Run());
2250  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
2251  BOOST_REQUIRE_EQUAL(1, (int)sar->GetSegs().GetDenseg().GetNumseg());
2252 
2253  // Change the query sequence (recreates the lookup table)
2254  blaster.SetQuery(*query);
2255  sav = blaster.Run();
2256  BOOST_REQUIRE_EQUAL(2, (int)sav[0]->Size());
2257  sar = *(sav[0]->Get().begin());
2258  BOOST_REQUIRE_EQUAL(3, (int)sar->GetSegs().GetDenseg().GetNumseg());
2259 }
2260 
2261 BOOST_AUTO_TEST_CASE(NucleotideBlastChangeSubject) {
2262  CSeq_id qid("gi|555");
2263  unique_ptr<SSeqLoc> query(
2264  CTestObjMgr::Instance().CreateSSeqLoc(qid, eNa_strand_both));
2265 
2266  CSeq_id sid("gi|3090");
2267  unique_ptr<SSeqLoc> subj(
2268  CTestObjMgr::Instance().CreateSSeqLoc(sid, eNa_strand_both));
2269 
2270  // Run self hit first
2272  opts->SetTraditionalBlastnDefaults();
2273  CBl2Seq blaster(*query, *query, *opts);
2274  TSeqAlignVector sav(blaster.Run());
2275  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
2276  BOOST_REQUIRE_EQUAL(1, (int)sar->GetSegs().GetDenseg().GetNumseg());
2277 
2278  // Change the subject sequence
2279  blaster.SetSubject(*subj);
2280  sav = blaster.Run();
2281  sar = *(sav[0]->Get().begin());
2282  BOOST_REQUIRE_EQUAL(3, (int)sar->GetSegs().GetDenseg().GetNumseg());
2283 }
2284 
2285 
2286 BOOST_AUTO_TEST_CASE(ProteinBlastMultipleQueries) {
2287  TSeqLocVector sequences;
2288 
2289  CSeq_id qid("gi|129295");
2290  unique_ptr<SSeqLoc> sl1(CTestObjMgr::Instance().CreateSSeqLoc(qid));
2291  sequences.push_back(*sl1);
2292 
2293  CSeq_id sid("gi|7662354");
2294  unique_ptr<SSeqLoc> sl2(CTestObjMgr::Instance().CreateSSeqLoc(sid));
2295  sequences.push_back(*sl2);
2296 
2297  CBl2Seq blaster(sequences, sequences, eBlastp);
2298  TSeqAlignVector seqalign_v = blaster.Run();
2299 
2300  /* DEBUG OUTPUT
2301  for (size_t i = 0; i < seqalign_v.size(); i++)
2302  cerr << "\n<" << i << ">\n"
2303  << MSerial_AsnText << seqalign_v[i].GetObject() << endl;
2304  */
2305 
2306  BOOST_REQUIRE_EQUAL(4, (int)seqalign_v.size());
2307  BOOST_REQUIRE_EQUAL(2, (int)sequences.size());
2308 
2309  CRef<CSeq_align> sar;
2310 
2311  BOOST_REQUIRE_EQUAL(1U, seqalign_v[0]->Get().size());
2312  sar = *(seqalign_v[0]->Get().begin());
2313  BOOST_REQUIRE_EQUAL((CDense_seg::TNumseg)1, (int)sar->GetSegs().GetDenseg().GetNumseg());
2314 
2315  BOOST_REQUIRE_EQUAL((size_t)2, seqalign_v[1]->Get().size());
2316  sar = *(seqalign_v[1]->Get().begin());
2317  BOOST_REQUIRE_EQUAL((CDense_seg::TNumseg)1, (int)sar->GetSegs().GetDenseg().GetNumseg());
2318  sar = *(++(seqalign_v[1]->Get().begin()));
2319  BOOST_REQUIRE_EQUAL((CDense_seg::TNumseg)1, (int)sar->GetSegs().GetDenseg().GetNumseg());
2320 
2321  BOOST_REQUIRE_EQUAL((size_t)3, seqalign_v[2]->Get().size());
2322  sar = *(seqalign_v[2]->Get().begin());
2323  BOOST_REQUIRE_EQUAL((CDense_seg::TNumseg)1, (int)sar->GetSegs().GetDenseg().GetNumseg());
2324  sar = *(++(seqalign_v[2]->Get().begin()));
2325  BOOST_REQUIRE_EQUAL((CDense_seg::TNumseg)1, (int)sar->GetSegs().GetDenseg().GetNumseg());
2326 
2327  BOOST_REQUIRE_EQUAL((size_t)1, seqalign_v[3]->Get().size());
2328  sar = *(seqalign_v[3]->Get().begin());
2329  BOOST_REQUIRE_EQUAL((CDense_seg::TNumseg)1, (int)sar->GetSegs().GetDenseg().GetNumseg());
2330 
2331 
2334 
2335  // test the order of queries and subjects:
2336  testResultAlignments(sequences.size(), sequences.size(),
2337  seqalign_v);
2338 }
2339 
2340 BOOST_AUTO_TEST_CASE(NucleotideBlastMultipleQueries) {
2341  TSeqLocVector sequences;
2342 
2343  CSeq_id qid("gi|555");
2344  unique_ptr<SSeqLoc> sl1(
2345  CTestObjMgr::Instance().CreateSSeqLoc(qid, eNa_strand_both));
2346  sequences.push_back(*sl1);
2347  BOOST_REQUIRE(sl1->mask.Empty());
2348 
2349  CSeq_id sid("gi|3090");
2350  unique_ptr<SSeqLoc> sl2(
2351  CTestObjMgr::Instance().CreateSSeqLoc(sid, eNa_strand_both));
2352  sequences.push_back(*sl2);
2353  BOOST_REQUIRE(sl2->mask.Empty());
2354 
2355  CBl2Seq blaster(sequences, sequences, eBlastn);
2356  TSeqAlignVector seqalign_v = blaster.Run();
2357  BOOST_REQUIRE_EQUAL(2, (int)sequences.size());
2358  BOOST_REQUIRE_EQUAL(4, (int)seqalign_v.size());
2359 
2360  CRef<CSeq_align> sar = *(seqalign_v[0]->Get().begin());
2361  BOOST_REQUIRE_EQUAL((CDense_seg::TNumseg)1, (int)sar->GetSegs().GetDenseg().GetNumseg());
2362 
2363  // in older version this was seqalign_v[1], order has changed
2364  sar = *(seqalign_v[2]->Get().begin());
2365  BOOST_REQUIRE_EQUAL((CDense_seg::TNumseg)1, (int)sar->GetSegs().GetDenseg().GetNumseg());
2366 
2369 
2370  // test the order of queries and subjects:
2371  testResultAlignments(sequences.size(), sequences.size(),
2372  seqalign_v);
2373 }
2374 
2375 #if 0
2376 void DoSearchWordSize4(const char *file1, const char *file2) {
2378  CRef<CScope> scope(new CScope(*kObjMgr));
2379 
2380  CRef<CSeq_entry> seq_entry1;
2381  ifstream in1(file1);
2382  if ( !in1 )
2383  throw runtime_error("Failed to open file1");
2384  if ( !(seq_entry1 = CFastaReader(in1).ReadOneSeq()))
2385  throw runtime_error("Failed to read sequence from file1");
2386  scope->AddTopLevelSeqEntry(*seq_entry1);
2387  CRef<CSeq_loc> seqloc1(new CSeq_loc);
2388  const string kSeqIdString1("lcl|1");
2389  CRef<CSeq_id> id1(new CSeq_id(kSeqIdString1));
2390  seqloc1->SetWhole(*id1);
2391  SSeqLoc ss1(seqloc1, scope);
2392 
2393  CRef<CSeq_entry> seq_entry2;
2394  ifstream in2(file2);
2395  if ( !in2 )
2396  throw runtime_error("Failed to open file2");
2397  if ( !(seq_entry2 = CFastaReader(in2).ReadOneSeq()))
2398  throw runtime_error("Failed to read sequence from file2");
2399  scope->AddTopLevelSeqEntry(*seq_entry2);
2400  CRef<CSeq_loc> seqloc2(new CSeq_loc);
2401  const string kSeqIdString2("lcl|2");
2402  CRef<CSeq_id> id2(new CSeq_id(kSeqIdString2));
2403  seqloc2->SetWhole(*id2);
2404  SSeqLoc ss2(seqloc2, scope);
2405 
2408  handle.SetWordSize(4);
2409  handle.SetDustFiltering(false);
2410  handle.SetMismatchPenalty(-1);
2411  handle.SetMatchReward(1);
2412  handle.SetEvalueThreshold(10000);
2413 
2414  CBl2Seq blaster(ss1, ss2, handle);
2415  blaster.RunWithoutSeqalignGeneration(); /* NCBI_FAKE_WARNING */
2416  BlastHSPResults *results = blaster.GetResults(); /* NCBI_FAKE_WARNING */
2417  BOOST_REQUIRE(results != NULL);
2418  BOOST_REQUIRE(results->hitlist_array[0] != NULL);
2419  BOOST_REQUIRE(results->hitlist_array[0]->hsplist_array[0] != NULL);
2420  BlastHSPList *hsp_list = results->hitlist_array[0]->hsplist_array[0];
2421  BOOST_REQUIRE(hsp_list->hspcnt > 0);
2422  BOOST_REQUIRE(hsp_list->hsp_array[0] != NULL);
2423 
2424  // verify that all hits are properly formed, and
2425  // at least as long as the word size
2426 
2427  for (int i = 0; i < hsp_list->hspcnt; i++) {
2428  BlastHSP *hsp = hsp_list->hsp_array[i];
2429  BOOST_REQUIRE(hsp != NULL);
2430  BOOST_REQUIRE(hsp->query.offset < hsp->query.end);
2431  BOOST_REQUIRE(hsp->subject.offset < hsp->subject.end);
2432  BOOST_REQUIRE(hsp->query.gapped_start >= hsp->query.offset &&
2433  hsp->query.gapped_start < hsp->query.end);
2434  BOOST_REQUIRE(hsp->subject.gapped_start >= hsp->subject.offset &&
2435  hsp->subject.gapped_start < hsp->subject.end);
2436  BOOST_REQUIRE(hsp->query.end - hsp->query.offset >= 4);
2437  BOOST_REQUIRE(hsp->subject.end - hsp->subject.offset >= 4);
2438  }
2439 
2440  CRef<CSearchResultSet> res = blaster.RunEx();
2441  BOOST_REQUIRE_EQUAL(eSequenceComparison, res->GetResultType());
2442  BOOST_REQUIRE_EQUAL(1, res->GetNumQueries());
2443  BOOST_REQUIRE_EQUAL(1, res->GetNumResults());
2444  BOOST_REQUIRE_EQUAL(1, res->size());
2445  CSearchResults& r = res->GetResults(0, 0);
2446  BOOST_REQUIRE(r.HasAlignments());
2447  CConstRef<CSeq_align_set> a = r.GetSeqAlign();
2448  cerr << MSerial_AsnText << * a<< endl;
2449 
2450 }
2451 
2452 BOOST_AUTO_TEST_CASE(NucleotideBlastWordSize4) {
2453  DoSearchWordSize4("data/blastn_size4a.fsa",
2454  "data/blastn_size4b.fsa");
2455 }
2456 
2457 // test bug fix when size-4 seed falls at the end of
2458 // the subject sequence
2459 BOOST_AUTO_TEST_CASE(NucleotideBlastWordSize4_EOS) {
2460  DoSearchWordSize4("data/blastn_size4c.fsa",
2461  "data/blastn_size4d.fsa");
2462 }
2463 #endif
2464 
2465 BOOST_AUTO_TEST_CASE(TblastnOutOfFrame) {
2466  CSeq_id qid("NP_647642.2"); // Protein sequence
2467  CSeq_id sid("BC042576.1"); // DNA sequence
2468 
2469  unique_ptr<SSeqLoc> query(CTestObjMgr::Instance().CreateSSeqLoc(qid));
2470  unique_ptr<SSeqLoc> subj(CTestObjMgr::Instance().CreateSSeqLoc(sid));
2471 
2472  // Set the options
2474  opts->SetOutOfFrameMode();
2475  opts->SetFrameShiftPenalty(10);
2476  opts->SetFilterString("m;L");/* NCBI_FAKE_WARNING */
2477  opts->SetEvalueThreshold(0.01);
2478  opts->SetCompositionBasedStats(eNoCompositionBasedStats);
2479 
2480  CBl2Seq blaster(*query, *subj, *opts);
2481  TSeqAlignVector sav(blaster.Run());
2482  BOOST_REQUIRE_EQUAL(1, (int)sav.size());
2483  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
2484  BOOST_REQUIRE_EQUAL(2, (int)sav[0]->Size());
2485  testBlastHitCounts(blaster, eTblastn_oof);
2487 }
2488 
2489 // test multiple fixes for OOF alignments on both subject strands
2490 BOOST_AUTO_TEST_CASE(TblastnOutOfFrame2) {
2491  CSeq_id qid("gi|38111923"); // Protein sequence
2492  CSeq_id sid("gi|6648925"); // DNA sequence
2493 
2494  unique_ptr<SSeqLoc> query(CTestObjMgr::Instance().CreateSSeqLoc(qid));
2495  unique_ptr<SSeqLoc> subj(CTestObjMgr::Instance().CreateSSeqLoc(sid));
2496 
2497  // Set the options
2499  opts->SetOutOfFrameMode();
2500  opts->SetFrameShiftPenalty(5);
2501  opts->SetCompositionBasedStats(eNoCompositionBasedStats);
2502  opts->SetFilterString("L");/* NCBI_FAKE_WARNING */
2503 
2504  CBl2Seq blaster(*query, *subj, *opts);
2505  TSeqAlignVector sav(blaster.Run());
2506  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
2507  BOOST_REQUIRE_EQUAL(3, (int)sav[0]->Size());
2508 
2509  // test fix for a bug generating OOF traceback
2510 
2511  const CSeq_align& seqalign = *sar;
2512  BOOST_REQUIRE(seqalign.IsSetScore());
2513  ITERATE(CSeq_align::TScore, itr, seqalign.GetScore()) {
2514  BOOST_REQUIRE((*itr)->IsSetId());
2515  if ((*itr)->GetId().GetStr() == "num_ident") {
2516  BOOST_REQUIRE_EQUAL(80, (*itr)->GetValue().GetInt());
2517  break;
2518  }
2519  }
2520 }
2521 
2522 BOOST_AUTO_TEST_CASE(BlastxOutOfFrame) {
2523  CSeq_id qid("BC042576.1"); // DNA sequence
2524  CSeq_id sid("NP_647642.2"); // Protein sequence
2525 
2526  unique_ptr<SSeqLoc> query(
2527  CTestObjMgr::Instance().CreateSSeqLoc(qid, eNa_strand_both));
2528  unique_ptr<SSeqLoc> subj(CTestObjMgr::Instance().CreateSSeqLoc(sid));
2529 
2530  // Set the options
2532  opts->SetOutOfFrameMode();
2533  opts->SetFrameShiftPenalty(10);
2534  opts->SetCompositionBasedStats(eNoCompositionBasedStats);
2535  opts->SetFilterString("m;L");/* NCBI_FAKE_WARNING */
2536  opts->SetEvalueThreshold(0.01);
2537 
2538  CBl2Seq blaster(*query, *subj, *opts);
2539  TSeqAlignVector sav(blaster.Run());
2540  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
2541  BOOST_REQUIRE_EQUAL(2, (int)sav[0]->Size());
2542  testBlastHitCounts(blaster, eBlastx_oof);
2543  testRawCutoffs(blaster, eBlastx, eBlastx_oof);
2544 }
2545 
2546 // test for a bug computing OOF sequence lengths during traceback
2547 
2548 BOOST_AUTO_TEST_CASE(BlastxOutOfFrame_DifferentFrames) {
2549  CSeq_id qid("gi|27486285"); // DNA sequence
2550  CSeq_id sid("gi|7331210"); // Protein sequence
2551 
2552  unique_ptr<SSeqLoc> query(
2553  CTestObjMgr::Instance().CreateSSeqLoc(qid, eNa_strand_both));
2554  unique_ptr<SSeqLoc> subj(CTestObjMgr::Instance().CreateSSeqLoc(sid));
2555 
2556  // Set the options
2558  opts->SetOutOfFrameMode();
2559  opts->SetFrameShiftPenalty(10);
2560  opts->SetFilterString("L");/* NCBI_FAKE_WARNING */
2561  opts->SetCompositionBasedStats(eNoCompositionBasedStats);
2562 
2563  CBl2Seq blaster(*query, *subj, *opts);
2564  TSeqAlignVector sav(blaster.Run());
2565  BOOST_REQUIRE_EQUAL(5, (int)sav[0]->Size());
2566 }
2567 
2568 // The following 3 functions are for checking results in the strand
2569 // combinations tests.
2570 
2572  string aligned_strands) {
2573 
2574  // Starting offsets in alignment as query/subject pairs
2575  vector< pair<TSignedSeqPos, TSignedSeqPos> > starts;
2576  starts.push_back(make_pair(7685759, 10));
2577  starts.push_back(make_pair(7685758, -1));
2578  starts.push_back(make_pair(7685718, 269));
2579  starts.push_back(make_pair(7685717, -1));
2580  starts.push_back(make_pair(7685545, 309));
2581 
2582  const size_t kNumSegments(starts.size());
2583 
2584  // Lengths of the aligned regions defined in starts vector
2585  vector<TSeqPos> lengths;
2586  lengths.reserve(kNumSegments);
2587  lengths.push_back(259);
2588  lengths.push_back(1);
2589  lengths.push_back(40);
2590  lengths.push_back(1);
2591  lengths.push_back(172);
2592 
2593  // Strands of the involved aligned segments as query/subject pairs
2594  typedef vector< pair<ENa_strand, ENa_strand> > TStrandPairs;
2595  TStrandPairs strands(kNumSegments,
2596  make_pair(eNa_strand_minus, eNa_strand_plus));
2597 
2598  // Reverse the contents of the vectors if necessary
2599  if (aligned_strands == "plus-minus") {
2600  reverse(starts.begin(), starts.end());
2601  reverse(lengths.begin(), lengths.end());
2602  NON_CONST_ITERATE(TStrandPairs, itr, strands) {
2603  swap(itr->first, itr->second);
2604  }
2605  }
2606  BOOST_REQUIRE_EQUAL(kNumSegments, lengths.size());
2607  BOOST_REQUIRE_EQUAL(kNumSegments, strands.size());
2608 
2609  // Obtain the data from the Seq-align's dense segs ...
2610  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
2611  BOOST_REQUIRE_EQUAL(1, (int)sav[0]->Size());
2612  // BOOST_REQUIRE_EQUAL(1, (int)sar->GetSegs().GetDenseg().GetNumseg());
2613  const CDense_seg& ds = sar->GetSegs().GetDenseg();
2614  // CTypeIterator<CDense_seg> segs_itr(Begin(*sar));
2615  const size_t kNumDim(ds.GetDim());
2616  vector< TSignedSeqPos > seg_starts = ds.GetStarts();
2617  vector< TSeqPos> seg_lengths = ds.GetLens();
2618  vector< ENa_strand> seg_strands = ds.GetStrands();
2619  BOOST_REQUIRE_EQUAL(kNumSegments, seg_lengths.size());
2620  BOOST_REQUIRE_EQUAL(kNumSegments*kNumDim, seg_starts.size());
2621 
2622  // ... and compare it to what is expected
2623  for (size_t index = 0; index < kNumSegments; ++index) {
2624  ostringstream os;
2625  os << "Segment " << index << ": expected " << lengths[index]
2626  << " actual " << seg_lengths[index];
2627  BOOST_REQUIRE_MESSAGE(lengths[index] == seg_lengths[index],
2628  os.str());
2629 
2630  os.str("");
2631  os << "Segment " << index << ": expected " << starts[index].first
2632  << " actual " << seg_starts[2*index];
2633  BOOST_REQUIRE_MESSAGE(starts[index].first == seg_starts[2*index],
2634  os.str());
2635  os.str("");
2636  os << "Segment " << index << ": expected " << starts[index].second
2637  << " actual " << seg_starts[2*index];
2638  BOOST_REQUIRE_MESSAGE(starts[index].second == seg_starts[2*index+1],
2639  os.str());
2640  os.str("");
2641  os << "Segment " << index << ": expected " << (int)strands[index].first
2642  << " actual " << (int)seg_strands[2*index];
2643  BOOST_REQUIRE_MESSAGE(strands[index].first == seg_strands[2*index],
2644  os.str());
2645  os.str("");
2646  os << "Segment " << index << ": expected " << (int)strands[index].second
2647  << " actual " << (int)seg_strands[2*index];
2648  BOOST_REQUIRE_MESSAGE(strands[index].second == seg_strands[2*index+1],
2649  os.str());
2650  }
2651 }
2652 
2654 {
2655  const int num_segs = 5;
2656  const int num_starts = 10;
2657  const int starts[num_starts] = { 7685759, 0, 7685758, -1, 7685718,
2658  269, 7685717, -1, 7685545, 309 };
2659  const int lengths[num_segs] = { 269, 1, 40, 1, 172 };
2660  int index;
2661 
2662  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
2663  BOOST_REQUIRE_EQUAL(1, (int)sav[0]->Size());
2664  CTypeIterator<CDense_seg> segs_itr(Begin(*sar));
2665  vector< TSignedSeqPos > seg_starts = segs_itr->GetStarts();
2666  vector< TSeqPos> seg_lengths = segs_itr->GetLens();
2667  vector< ENa_strand> seg_strands = segs_itr->GetStrands();
2668  BOOST_REQUIRE_EQUAL(num_segs, (int)seg_lengths.size());
2669  BOOST_REQUIRE_EQUAL(num_starts, (int)seg_starts.size());
2670  for (index = 0; index < num_segs; ++index) {
2671  BOOST_REQUIRE_EQUAL(lengths[index], (int)seg_lengths[index]);
2672  BOOST_REQUIRE_EQUAL(starts[2*index], (int)seg_starts[2*index]);
2673  BOOST_REQUIRE_EQUAL(starts[2*index+1], (int)seg_starts[2*index+1]);
2674  BOOST_REQUIRE(seg_strands[2*index] == eNa_strand_minus);
2675  BOOST_REQUIRE(seg_strands[2*index+1] == eNa_strand_plus);
2676  }
2677 }
2678 
2680 {
2681  const int num_segs = 5;
2682  const int num_starts = 10;
2683  const int starts[num_starts] = { 309, 7685545, -1, 7685717, 269, 7685718,
2684  -1, 7685758, 0, 7685759 };
2685  const int lengths[num_segs] = { 172, 1, 40, 1, 269 };
2686  int index;
2687 
2688  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
2689  BOOST_REQUIRE_EQUAL(1, (int)sav[0]->Size());
2690  CTypeIterator<CDense_seg> segs_itr(Begin(*sar));
2691  vector< TSignedSeqPos > seg_starts = segs_itr->GetStarts();
2692  vector< TSeqPos> seg_lengths = segs_itr->GetLens();
2693  vector< ENa_strand> seg_strands = segs_itr->GetStrands();
2694  BOOST_REQUIRE_EQUAL(num_segs, (int)seg_lengths.size());
2695  BOOST_REQUIRE_EQUAL(num_starts, (int)seg_starts.size());
2696  for (index = 0; index < num_segs; ++index) {
2697  BOOST_REQUIRE_EQUAL(lengths[index], (int)seg_lengths[index]);
2698  BOOST_REQUIRE_EQUAL(starts[2*index], (int)seg_starts[2*index]);
2699  BOOST_REQUIRE_EQUAL(starts[2*index+1], (int)seg_starts[2*index+1]);
2700  BOOST_REQUIRE(seg_strands[2*index] == eNa_strand_minus);
2701  BOOST_REQUIRE(seg_strands[2*index+1] == eNa_strand_plus);
2702  }
2703 }
2704 
2705 // should find alignments
2706 BOOST_AUTO_TEST_CASE(Blastn_QueryBothStrands_SubjBothStrands) {
2707  // Alignment in these sequences is from plus/minus strands
2708  CSeq_id qid("NT_004487.15");
2709  pair<TSeqPos, TSeqPos> range(7685545, 7686027);
2710  unique_ptr<SSeqLoc> query(
2711  CTestObjMgr::Instance().CreateSSeqLoc(qid, range,
2712  eNa_strand_both));
2713 
2714  CSeq_id sid("AA441981.1");
2715  range.first = 10;
2716  range.second = 480;
2717  unique_ptr<SSeqLoc> subj(
2718  CTestObjMgr::Instance().CreateSSeqLoc(sid, range,
2719  eNa_strand_both));
2720 
2723  CBl2Seq blaster(*query, *subj, *opts);
2724  TSeqAlignVector sav(blaster.Run());
2725  x_TestAlignmentQuerySubjStrandCombinations(sav, "minus-plus");
2726 }
2727 
2728 // should find alignment
2729 BOOST_AUTO_TEST_CASE(Blastn_QueryBothStrands_SubjPlusStrand) {
2730  // Alignment in these sequences is from plus/minus strands
2731  CSeq_id qid("NT_004487.15");
2732  pair<TSeqPos, TSeqPos> range(7685545, 7686027);
2733  unique_ptr<SSeqLoc> query(
2734  CTestObjMgr::Instance().CreateSSeqLoc(qid, range,
2735  eNa_strand_both));
2736 
2737  CSeq_id sid("AA441981.1");
2738  range.first = 10;
2739  range.second = 480;
2740  unique_ptr<SSeqLoc> subj(
2741  CTestObjMgr::Instance().CreateSSeqLoc(sid, range,
2742  eNa_strand_plus));
2743 
2745  opts->SetTraditionalBlastnDefaults();
2746  CBl2Seq blaster(*query, *subj, *opts);
2747  TSeqAlignVector sav(blaster.Run());
2748  x_TestAlignmentQuerySubjStrandCombinations(sav, "minus-plus");
2749 }
2750 
2751 // should find alignment
2752 BOOST_AUTO_TEST_CASE(Blastn_QueryBothStrands_SubjMinusStrand) {
2753  // Alignment in these sequences is from plus/minus strands
2754  CSeq_id qid("NT_004487.15");
2755  pair<TSeqPos, TSeqPos> range(7685545, 7686027);
2756  unique_ptr<SSeqLoc> query(
2757  CTestObjMgr::Instance().CreateSSeqLoc(qid, range,
2758  eNa_strand_both));
2759 
2760  CSeq_id sid("AA441981.1");
2761  range.first = 10;
2762  range.second = 480;
2763  unique_ptr<SSeqLoc> subj(
2764  CTestObjMgr::Instance().CreateSSeqLoc(sid, range,
2765  eNa_strand_minus));
2766 
2768  opts->SetTraditionalBlastnDefaults();
2769  CBl2Seq blaster(*query, *subj, *opts);
2770  TSeqAlignVector sav(blaster.Run());
2771  x_TestAlignmentQuerySubjStrandCombinations(sav, "plus-minus");
2772 }
2773 
2774 // shouldn't find an alignment
2775 BOOST_AUTO_TEST_CASE(Blastn_QueryPlusStrand_SubjPlusStrand) {
2776  // Alignment in these sequences is from plus/minus strands
2777  CSeq_id qid("NT_004487.15");
2778  pair<TSeqPos, TSeqPos> range(7685545, 7686027);
2779  unique_ptr<SSeqLoc> query(
2780  CTestObjMgr::Instance().CreateSSeqLoc(qid, range,
2781  eNa_strand_plus));
2782 
2783  CSeq_id sid("AA441981.1");
2784  range.first = 10;
2785  range.second = 480;
2786  unique_ptr<SSeqLoc> subj(
2787  CTestObjMgr::Instance().CreateSSeqLoc(sid, range,
2788  eNa_strand_plus));
2789 
2791  opts->SetTraditionalBlastnDefaults();
2792  CBl2Seq blaster(*query, *subj, *opts);
2793  TSeqAlignVector sav(blaster.Run());
2794  BOOST_REQUIRE(sav[0]->IsEmpty() == true);
2795 }
2796 
2797 // should find an alignment
2798 BOOST_AUTO_TEST_CASE(Blastn_QueryPlusStrand_SubjMinusStrand) {
2799  // Alignment in these sequences is from plus/minus strands
2800  CSeq_id qid("NT_004487.15");
2801  pair<TSeqPos, TSeqPos> range(7685545, 7686027);
2802  unique_ptr<SSeqLoc> query(
2803  CTestObjMgr::Instance().CreateSSeqLoc(qid, range,
2804  eNa_strand_plus));
2805 
2806  CSeq_id sid("AA441981.1");
2807  range.first = 10;
2808  range.second = 480;
2809  unique_ptr<SSeqLoc> subj(
2810  CTestObjMgr::Instance().CreateSSeqLoc(sid, range,
2811  eNa_strand_minus));
2812 
2814  opts->SetTraditionalBlastnDefaults();
2815  CBl2Seq blaster(*query, *subj, *opts);
2816  TSeqAlignVector sav(blaster.Run());
2817  x_TestAlignmentQuerySubjStrandCombinations(sav, "plus-minus");
2818 }
2819 
2820 // should NOT find an alignment because we only search the plus strand of
2821 // the subject sequence
2822 BOOST_AUTO_TEST_CASE(Blastn_QueryPlusStrand_SubjBothStrands) {
2823  // Alignment in these sequences is from plus/minus strands
2824  CSeq_id qid("NT_004487.15");
2825  pair<TSeqPos, TSeqPos> range(7685545, 7686027);
2826  unique_ptr<SSeqLoc> query(
2827  CTestObjMgr::Instance().CreateSSeqLoc(qid, range,
2828  eNa_strand_plus));
2829 
2830  CSeq_id sid("AA441981.1");
2831  range.first = 10;
2832  range.second = 480;
2833  unique_ptr<SSeqLoc> subj(
2834  CTestObjMgr::Instance().CreateSSeqLoc(sid, range,
2835  eNa_strand_both));
2836 
2838  opts->SetTraditionalBlastnDefaults();
2839  CBl2Seq blaster(*query, *subj, *opts);
2840  TSeqAlignVector sav(blaster.Run());
2841  BOOST_REQUIRE(sav[0]->IsEmpty() == true);
2842 }
2843 
2844 // should not find an alignment because alignment is on opposite strands
2845 BOOST_AUTO_TEST_CASE(Blastn_QueryMinusStrand_SubjMinusStrand) {
2846  // Alignment in these sequences is from plus/minus strands
2847  CSeq_id qid("NT_004487.15");
2848  pair<TSeqPos, TSeqPos> range(7685545, 7686027);
2849  unique_ptr<SSeqLoc> query(
2850  CTestObjMgr::Instance().CreateSSeqLoc(qid, range,
2851  eNa_strand_minus));
2852 
2853  CSeq_id sid("AA441981.1");
2854  range.first = 10;
2855  range.second = 480;
2856  unique_ptr<SSeqLoc> subj(
2857  CTestObjMgr::Instance().CreateSSeqLoc(sid, range,
2858  eNa_strand_minus));
2859 
2861  opts->SetTraditionalBlastnDefaults();
2862  CBl2Seq blaster(*query, *subj, *opts);
2863  TSeqAlignVector sav(blaster.Run());
2864  BOOST_REQUIRE(sav[0]->IsEmpty() == true);
2865 }
2866 
2867 // should find alignment
2868 BOOST_AUTO_TEST_CASE(Blastn_QueryMinusStrand_SubjPlusStrand) {
2869  // Alignment in these sequences is from plus/minus strands
2870  CSeq_id qid("NT_004487.15");
2871  pair<TSeqPos, TSeqPos> range(7685545, 7686027);
2872  unique_ptr<SSeqLoc> query(
2873  CTestObjMgr::Instance().CreateSSeqLoc(qid, range,
2874  eNa_strand_minus));
2875 
2876  CSeq_id sid("AA441981.1");
2877  range.first = 10;
2878  range.second = 480;
2879  unique_ptr<SSeqLoc> subj(
2880  CTestObjMgr::Instance().CreateSSeqLoc(sid, range,
2881  eNa_strand_plus));
2882 
2884  opts->SetTraditionalBlastnDefaults();
2885  CBl2Seq blaster(*query, *subj, *opts);
2886  TSeqAlignVector sav(blaster.Run());
2887  x_TestAlignmentQuerySubjStrandCombinations(sav, "minus-plus");
2888 }
2889 
2890 // should find alignment
2891 BOOST_AUTO_TEST_CASE(Blastn_QueryMinusStrand_SubjBothStrands) {
2892  // Alignment in these sequences is from plus/minus strands
2893  CSeq_id qid("NT_004487.15");
2894  pair<TSeqPos, TSeqPos> range(7685545, 7686027);
2895  unique_ptr<SSeqLoc> query(
2896  CTestObjMgr::Instance().CreateSSeqLoc(qid, range,
2897  eNa_strand_minus));
2898 
2899  CSeq_id sid("AA441981.1");
2900  range.first = 10;
2901  range.second = 480;
2902  unique_ptr<SSeqLoc> subj(
2903  CTestObjMgr::Instance().CreateSSeqLoc(sid, range,
2904  eNa_strand_both));
2905 
2907  opts->SetTraditionalBlastnDefaults();
2908  CBl2Seq blaster(*query, *subj, *opts);
2909  TSeqAlignVector sav(blaster.Run());
2910  x_TestAlignmentQuerySubjStrandCombinations(sav, "minus-plus");
2911 }
2912 
2913 // Should properly find alignment
2914 BOOST_AUTO_TEST_CASE(Blastn_QueryWhole_SubjInterval)
2915 {
2916  CRef<CSeq_id> qid(new CSeq_id("AA441981.1"));
2917  unique_ptr<SSeqLoc> query(CTestObjMgr::Instance().CreateWholeSSeqLoc(*qid));
2918 
2919  CRef<CSeq_id> sid(new CSeq_id("NT_004487.15"));
2920  pair<TSeqPos, TSeqPos> range(7685545, 7686027);
2921  unique_ptr<SSeqLoc> subj(
2922  CTestObjMgr::Instance().CreateSSeqLoc(*sid, range,
2923  eNa_strand_both));
2924 
2926  opts->SetTraditionalBlastnDefaults();
2927  CBl2Seq blaster(*query, *subj, *opts);
2928  TSeqAlignVector sav(blaster.Run());
2930 }
2931 
2932 BOOST_AUTO_TEST_CASE(Blastn_QueryInterval_SubjWhole)
2933 {
2934  CRef<CSeq_id> qid(new CSeq_id("NT_004487.15"));
2935  pair<TSeqPos, TSeqPos> range(7685545, 7686027);
2936  unique_ptr<SSeqLoc> query(
2937  CTestObjMgr::Instance().CreateSSeqLoc(*qid, range,
2938  eNa_strand_both));
2939 
2940  CRef<CSeq_id> sid(new CSeq_id("AA441981.1"));
2941  unique_ptr<SSeqLoc> subj(CTestObjMgr::Instance().CreateWholeSSeqLoc(*sid));
2942 
2944  opts->SetTraditionalBlastnDefaults();
2945  CBl2Seq blaster(*query, *subj, *opts);
2946  TSeqAlignVector sav(blaster.Run());
2948 }
2949 
2950 BOOST_AUTO_TEST_CASE(BlastpMultipleQueries_MultipleSubjs) {
2951  vector<TIntId> q_gis, s_gis;
2952 
2953  // Setup the queries
2954  q_gis.push_back(6);
2955  q_gis.push_back(129295);
2956  q_gis.push_back(15606659);
2957 
2958  // setup the subjects
2959  s_gis.push_back(129295);
2960  s_gis.push_back(6);
2961  s_gis.push_back(4336138); // no hits with gis 6 and 129295
2962  s_gis.push_back(15606659);
2963  s_gis.push_back(5556);
2964 
2965  TSeqLocVector queries;
2966  ITERATE(vector<TIntId>, itr, q_gis) {
2967  CRef<CSeq_loc> loc(new CSeq_loc());
2968  loc->SetWhole().SetGi(GI_FROM(TIntId, *itr));
2969 
2970  CScope* scope = new CScope(CTestObjMgr::Instance().GetObjMgr());
2971  scope->AddDefaults();
2972  queries.push_back(SSeqLoc(loc, scope));
2973  }
2974 
2975  TSeqLocVector subjects;
2976  ITERATE(vector<TIntId>, itr, s_gis) {
2977  CRef<CSeq_loc> loc(new CSeq_loc());
2978  loc->SetWhole().SetGi(GI_FROM(TIntId, *itr));
2979 
2980  CScope* scope = new CScope(CTestObjMgr::Instance().GetObjMgr());
2981  scope->AddDefaults();
2982  subjects.push_back(SSeqLoc(loc, scope));
2983  }
2984 
2985  size_t num_queries = queries.size();
2986  size_t num_subjects = subjects.size();
2987 
2988  // BLAST by concatenating all queries
2989  CBl2Seq blaster4all(queries, subjects, eBlastp);
2990  TSeqAlignVector sas_v = blaster4all.Run();
2991  BOOST_REQUIRE_EQUAL(num_queries*num_subjects, sas_v.size());
2992  testBlastHitCounts(blaster4all, eBlastp_multi_q_s);
2993  testRawCutoffs(blaster4all, eBlastp, eBlastp_multi_q_s);
2994 
2995  // test the order of queries and subjects:
2996  testResultAlignments(num_queries, num_subjects,
2997  sas_v);
2998 }
2999 
3000 BOOST_AUTO_TEST_CASE(BlastpMultipleQueries_MultipleSubjs_RunEx) {
3001  vector<TIntId> q_gis, s_gis;
3002 
3003  // Setup the queries
3004  q_gis.push_back(6);
3005  q_gis.push_back(129295);
3006  q_gis.push_back(15606659);
3007 
3008  // setup the subjects
3009  s_gis.push_back(129295);
3010  s_gis.push_back(6);
3011  s_gis.push_back(4336138); // no hits with gis 6 and 129295
3012  s_gis.push_back(15606659);
3013  s_gis.push_back(5556);
3014 
3015  TSeqLocVector queries;
3016  ITERATE(vector<TIntId>, itr, q_gis) {
3017  CRef<CSeq_loc> loc(new CSeq_loc());
3018  loc->SetWhole().SetGi(GI_FROM(TIntId, *itr));
3019 
3020  CScope* scope = new CScope(CTestObjMgr::Instance().GetObjMgr());
3021  scope->AddDefaults();
3022  queries.push_back(SSeqLoc(loc, scope));
3023  }
3024 
3025  TSeqLocVector subjects;
3026  ITERATE(vector<TIntId>, itr, s_gis) {
3027  CRef<CSeq_loc> loc(new CSeq_loc());
3028  loc->SetWhole().SetGi(GI_FROM(TIntId, *itr));
3029 
3030  CScope* scope = new CScope(CTestObjMgr::Instance().GetObjMgr());
3031  scope->AddDefaults();
3032  subjects.push_back(SSeqLoc(loc, scope));
3033  }
3034 
3035  size_t num_queries = queries.size();
3036  size_t num_subjects = subjects.size();
3037 
3038  // BLAST by concatenating all queries
3039  CBl2Seq blaster4all(queries, subjects, eBlastp);
3040  CRef<CSearchResultSet> results = blaster4all.RunEx();
3041  BOOST_REQUIRE(results->GetResultType() == eSequenceComparison);
3042  BOOST_REQUIRE_EQUAL((num_queries*num_subjects),
3043  results->GetNumResults());
3044 
3045  // build the seqalign vector from the result set
3046  TSeqAlignVector sas_v;
3047  for (size_t i = 0; i < num_queries; i++)
3048  {
3049  for (size_t j = 0; j < num_subjects; j++)
3050  {
3051  CSearchResults& res_ij = results->GetResults(i, j);
3052  CRef<CSeq_align_set> aln_set;
3053  aln_set.Reset(const_cast<CSeq_align_set*>
3054  (res_ij.GetSeqAlign().GetPointer()));
3055  sas_v.push_back(aln_set);
3056  }
3057  }
3058 
3059  // do the rest of the tests on sas_v as in the
3060  // BlastpMultipleQueries_MultipleSubjs function:
3061 
3062  BOOST_REQUIRE_EQUAL(num_queries*num_subjects, sas_v.size());
3063  testBlastHitCounts(blaster4all, eBlastp_multi_q_s);
3064  testRawCutoffs(blaster4all, eBlastp, eBlastp_multi_q_s);
3065 
3066  // test the order of queries and subjects:
3067  testResultAlignments(num_queries, num_subjects,
3068  sas_v);
3069 }
3070 
3071 // This closely resembles how the command line applications invoke bl2seq
3072 BOOST_AUTO_TEST_CASE(BlastpMultipleQueries_MultipleSubjs_CLocalBlast) {
3073  vector<TIntId> q_gis, s_gis;
3074 
3075  // Setup the queries
3076  q_gis.push_back(6);
3077  q_gis.push_back(129295);
3078  q_gis.push_back(15606659);
3079 
3080  // setup the subjects
3081  s_gis.push_back(129295);
3082  s_gis.push_back(6);
3083  s_gis.push_back(4336138); // no hits with gis 6 and 129295
3084  s_gis.push_back(15606659);
3085  s_gis.push_back(5556);
3086 
3087  TSeqLocVector query_vec;
3088  ITERATE(vector<TIntId>, itr, q_gis) {
3089  CRef<CSeq_loc> loc(new CSeq_loc());
3090  loc->SetWhole().SetGi(GI_FROM(TIntId, *itr));
3091 
3092  CScope* scope = new CScope(CTestObjMgr::Instance().GetObjMgr());
3093  scope->AddDefaults();
3094  query_vec.push_back(SSeqLoc(loc, scope));
3095  }
3096  CRef<IQueryFactory> queries(new CObjMgr_QueryFactory(query_vec));
3097 
3099  opts_handle(CBlastOptionsFactory::Create(eBlastp));
3100 
3101  TSeqLocVector subj_vec;
3102  ITERATE(vector<TIntId>, itr, s_gis) {
3103  CRef<CSeq_loc> loc(new CSeq_loc());
3104  loc->SetWhole().SetGi(GI_FROM(TIntId, *itr));
3105 
3106  CScope* scope = new CScope(CTestObjMgr::Instance().GetObjMgr());
3107  scope->AddDefaults();
3108  subj_vec.push_back(SSeqLoc(loc, scope));
3109  }
3110  CRef<IQueryFactory> subj_qf(new CObjMgr_QueryFactory(subj_vec));
3111  CRef<CLocalDbAdapter> subjects(new CLocalDbAdapter(subj_qf,
3112  opts_handle));
3113 
3114  BOOST_REQUIRE(subjects->IsDbScanMode() == false);
3115 
3116  size_t num_queries = query_vec.size();
3117  size_t num_subjects = subj_vec.size();
3118 
3119  // BLAST by concatenating all queries
3120  CLocalBlast blaster(queries, opts_handle, subjects);
3121  CRef<CSearchResultSet> results = blaster.Run();
3122  BOOST_REQUIRE(results->GetResultType() == eSequenceComparison);
3123  BOOST_REQUIRE_EQUAL((num_queries*num_subjects),
3124  results->GetNumResults());
3125  BOOST_REQUIRE_EQUAL((num_queries*num_subjects), results->size());
3126  BOOST_REQUIRE_EQUAL(num_queries, results->GetNumQueries());
3127  BOOST_REQUIRE_EQUAL(num_subjects,
3128  results->GetNumResults()/results->GetNumQueries());
3129 
3130  // build the seqalign vector from the result set
3131  TSeqAlignVector sas_v;
3132  for (size_t i = 0; i < num_queries; i++)
3133  {
3134  for (size_t j = 0; j < num_subjects; j++)
3135  {
3136  CSearchResults& res_ij = results->GetResults(i, j);
3137  CRef<CSeq_align_set> aln_set;
3138  aln_set.Reset(const_cast<CSeq_align_set*>
3139  (res_ij.GetSeqAlign().GetPointer()));
3140  sas_v.push_back(aln_set);
3141  }
3142  }
3143 
3144  // do the rest of the tests on sas_v as in the
3145  // BlastpMultipleQueries_MultipleSubjs function:
3146  BOOST_REQUIRE_EQUAL(num_queries*num_subjects, sas_v.size());
3147 
3148  // test the order of queries and subjects:
3149  testResultAlignments(num_queries, num_subjects, sas_v);
3150 }
3151 
3152 BOOST_AUTO_TEST_CASE(Blastp_MultipleSubjs_SearchAsSet) {
3153  vector<TIntId> q_gis, s_gis;
3154 
3155  // Setup the queries
3156  q_gis.push_back(816838863);
3157 
3158  // setup the subjects
3159  s_gis.push_back(16130156);
3160  s_gis.push_back(15644111);
3161  s_gis.push_back(126699345);
3162  s_gis.push_back(504220075);
3163  s_gis.push_back(21222553);
3164  s_gis.push_back(24376189);
3165  s_gis.push_back(15598078);
3166  s_gis.push_back(15599919);
3167  s_gis.push_back(15597767);
3168  s_gis.push_back(16131833);
3169  s_gis.push_back(15599742);
3170  s_gis.push_back(15598387);
3171  s_gis.push_back(15600358);
3172  s_gis.push_back(24375949);
3173  s_gis.push_back(126698248);
3174  s_gis.push_back(24375956);
3175  s_gis.push_back(24375382);
3176  s_gis.push_back(126698598);
3177 
3178  TSeqLocVector query_vec;
3179  ITERATE(vector<TIntId>, itr, q_gis) {
3180  CRef<CSeq_loc> loc(new CSeq_loc());
3181  loc->SetWhole().SetGi(GI_FROM(TIntId, *itr));
3182 
3183  CScope* scope = new CScope(CTestObjMgr::Instance().GetObjMgr());
3184  scope->AddDefaults();
3185  query_vec.push_back(SSeqLoc(loc, scope));
3186  }
3187  CRef<IQueryFactory> queries(new CObjMgr_QueryFactory(query_vec));
3188 
3190  opts_handle(CBlastOptionsFactory::Create(eBlastp));
3191 
3192  TSeqLocVector subj_vec;
3193  ITERATE(vector<TIntId>, itr, s_gis) {
3194  CRef<CSeq_loc> loc(new CSeq_loc());
3195  loc->SetWhole().SetGi(GI_FROM(TIntId, *itr));
3196 
3197  CScope* scope = new CScope(CTestObjMgr::Instance().GetObjMgr());
3198  scope->AddDefaults();
3199  subj_vec.push_back(SSeqLoc(loc, scope));
3200  }
3201  CRef<IQueryFactory> subj_qf(new CObjMgr_QueryFactory(subj_vec));
3202  CRef<CLocalDbAdapter> subjects(new CLocalDbAdapter(subj_qf,
3203  opts_handle, true));
3204 
3205  BOOST_REQUIRE(subjects->IsDbScanMode() == true);
3206 
3207  size_t num_queries = query_vec.size();
3208  size_t num_subjects = subj_vec.size();
3209 
3210  // BLAST by concatenating all queries
3211  CLocalBlast blaster(queries, opts_handle, subjects);
3212  CRef<CSearchResultSet> results = blaster.Run();
3213  BOOST_REQUIRE(results->GetResultType() == eDatabaseSearch);
3214  BOOST_REQUIRE_EQUAL(num_queries,
3215  results->GetNumResults());
3216  BOOST_REQUIRE_EQUAL(num_queries, results->size());
3217  BOOST_REQUIRE_EQUAL(num_queries, results->GetNumQueries());
3218 
3219  // build the seqalign vector from the result set
3220  CRef<CSeq_align_set> aln_set;
3221  CSearchResults& res = (*results)[0];
3222  aln_set.Reset(const_cast<CSeq_align_set*>
3223  (res.GetSeqAlign().GetPointer()));
3224  // gi|15599742|ref|NP_253236.1| has two alignments, all others have one.
3225  BOOST_REQUIRE_EQUAL(num_subjects+1, aln_set->Size());
3226 
3227 }
3228 
3229 BOOST_AUTO_TEST_CASE(BlastOptionsEquality) {
3230  // Create options object through factory
3231  unique_ptr<CBlastOptionsHandle> megablast_options_handle(
3233  CBlastNucleotideOptionsHandle nucl_options_handle;
3234  BOOST_REQUIRE(megablast_options_handle->GetOptions() ==
3235  nucl_options_handle.GetOptions());
3236 }
3237 
3238 BOOST_AUTO_TEST_CASE(BlastOptionsInequality) {
3239  CBlastProteinOptionsHandle prot_options_handle;
3240  CBlastNucleotideOptionsHandle nucl_options_handle;
3241  BOOST_REQUIRE(prot_options_handle.GetOptions() !=
3242  nucl_options_handle.GetOptions());
3243 
3244  // Blastn and Megablast are different
3245  unique_ptr<CBlastOptionsHandle> blastn_options_handle(
3247  BOOST_REQUIRE(blastn_options_handle->GetOptions() !=
3248  nucl_options_handle.GetOptions());
3249 
3250  // Change the matrix and compare
3251  CBlastProteinOptionsHandle prot_options_handle2;
3252  prot_options_handle.SetMatrixName("pam30");
3253  BOOST_REQUIRE(prot_options_handle.GetOptions() !=
3254  prot_options_handle2.GetOptions());
3255 }
3256 
3257 BOOST_AUTO_TEST_CASE(DiscontiguousMB) {
3258  CSeq_id qid("gi|408478"); // zebrafish sequence U02544
3259  CSeq_id sid("gi|1546012"); // mouse sequence U61969
3260  unique_ptr<SSeqLoc> query(
3261  CTestObjMgr::Instance().CreateSSeqLoc(qid, eNa_strand_both));
3262  unique_ptr<SSeqLoc> subj(CTestObjMgr::Instance().CreateSSeqLoc(sid));
3263 
3264  CBl2Seq blaster(*query, *subj, eDiscMegablast);
3265  TSeqAlignVector sav(blaster.Run());
3266  BOOST_REQUIRE_EQUAL(1, (int)sav.size());
3267 
3268  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
3269  BOOST_REQUIRE_EQUAL(13, (int)sar->GetSegs().GetDenseg().GetNumseg());
3272 }
3273 
3274 BOOST_AUTO_TEST_CASE(BlastnHumanChrom_MRNA) {
3275  CSeq_id qid("NT_004487.16");
3276  CSeq_id sid("AA621478.1");
3277  pair<TSeqPos, TSeqPos> qrange(7868209-1, 7868602-1);
3278  pair<TSeqPos, TSeqPos> srange(2-1, 397-1);
3279  unique_ptr<SSeqLoc> query(
3280  CTestObjMgr::Instance().CreateSSeqLoc(qid,
3281  qrange, eNa_strand_plus));
3282  unique_ptr<SSeqLoc> subj(
3283  CTestObjMgr::Instance().CreateSSeqLoc(sid,
3284  srange, eNa_strand_plus));
3285 
3287  CBl2Seq blaster(*query, *subj, *options);
3288  TSeqAlignVector sav(blaster.Run());
3289  BOOST_REQUIRE_EQUAL(1, (int)sav.size());
3290 
3291  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
3292  BOOST_REQUIRE_EQUAL(5, (int)sar->GetSegs().GetDenseg().GetNumseg());
3295 }
3296 
3297 // Checks that results for multiple subjects are put in correct places
3298 // in the vector of Seq-aligns
3299 BOOST_AUTO_TEST_CASE(testOneSubjectResults2CSeqAlign)
3300 {
3301  const int num_subjects = 15;
3302  const int results_size[num_subjects] =
3303  { 1, 1, 0, 1, 1, 1, 2, 1, 2, 0, 0, 0, 0, 2, 1 };
3304  const int query_gi = 7274302;
3305  const int gi_diff = 28;
3306  CRef<CSeq_id> id(new CSeq_id(CSeq_id::e_Gi, query_gi));
3307  unique_ptr<SSeqLoc> sl(
3308  CTestObjMgr::Instance().CreateSSeqLoc(*id, eNa_strand_both));
3310  query.push_back(*sl);
3311  TSeqLocVector subjects;
3312  int index;
3313  for (index = 0; index < num_subjects; ++index) {
3314  id.Reset(new CSeq_id(CSeq_id::e_Gi, (query_gi + gi_diff + index)));
3315  //cerr << query_gi + gi_diff + index << endl;
3316  sl.reset(CTestObjMgr::Instance().CreateSSeqLoc(*id,
3317  eNa_strand_both));
3318  subjects.push_back(*sl);
3319  }
3320  CBl2Seq blaster(query, subjects, eMegablast);
3321  TSeqAlignVector seqalign_v = blaster.Run();
3322  BOOST_REQUIRE_EQUAL(num_subjects, (int)seqalign_v.size());
3323 
3324  index = 0;
3325  ITERATE(TSeqAlignVector, itr, seqalign_v)
3326  {
3327  //cerr << index << endl;
3328  BOOST_REQUIRE_EQUAL(results_size[index], (int) (*itr)->Get().size());
3329  index++;
3330  }
3331 }
3332 
3333 BOOST_AUTO_TEST_CASE(testMultiSeqSearchSymmetry)
3334 {
3335  const size_t num_seqs = 19;
3336  const int gi_list[num_seqs] =
3337  { 1346057, 125527, 121064, 1711551, 125412, 128337, 2507199,
3338  1170625, 1730070, 585365, 140977, 1730069, 20455504, 125206,
3339  125319, 114152, 1706450, 1706307, 125565 };
3340  const int score_cutoff = 70;
3341 
3342  TSeqLocVector seq_vec;
3343  size_t index;
3344  for (index = 0; index < num_seqs; ++index) {
3345  CRef<CSeq_id> id(new CSeq_id(CSeq_id::e_Gi, gi_list[index]));
3346  unique_ptr<SSeqLoc> sl(
3347  CTestObjMgr::Instance().CreateSSeqLoc(*id, eNa_strand_both));
3348  seq_vec.push_back(*sl);
3349  }
3350 
3352  prot_opts->SetSegFiltering(false);
3353  prot_opts->SetCutoffScore(score_cutoff);
3354  CBl2Seq blaster(seq_vec, seq_vec, *prot_opts);
3355 #if 0
3356  blaster.RunWithoutSeqalignGeneration(); /* NCBI_FAKE_WARNING */
3357  BlastHSPResults* results = blaster.GetResults(); /* NCBI_FAKE_WARNING */
3358 
3359  int qindex, sindex, qindex1, sindex1;
3360  for (qindex = 0; qindex < num_seqs; ++qindex) {
3361  for (sindex = 0; sindex < results->hitlist_array[qindex]->hsplist_count;
3362  ++sindex) {
3363  BlastHSPList* hsp_list1, *hsp_list2 = NULL;
3364  hsp_list1 = results->hitlist_array[qindex]->hsplist_array[sindex];
3365  qindex1 = hsp_list1->oid;
3366  BlastHitList* hitlist = results->hitlist_array[qindex1];
3367  for (sindex1 = 0; sindex1 < hitlist->hsplist_count; ++sindex1) {
3368  if (hitlist->hsplist_array[sindex1]->oid == qindex) {
3369  hsp_list2 = hitlist->hsplist_array[sindex1];
3370  break;
3371  }
3372  }
3373  BOOST_REQUIRE(hsp_list2 != NULL);
3374  int hindex;
3375  for (hindex = 0; hindex < hsp_list1->hspcnt; ++hindex) {
3376  if (hsp_list1->hsp_array[hindex]->score <= score_cutoff)
3377  break;
3378  BOOST_REQUIRE(hindex < hsp_list2->hspcnt);
3379  BOOST_REQUIRE_EQUAL(hsp_list1->hsp_array[hindex]->score,
3380  hsp_list2->hsp_array[hindex]->score);
3381  }
3382  }
3383  }
3384 #endif
3385 
3386  CRef<CSearchResultSet> res = blaster.RunEx();
3387  BOOST_REQUIRE_EQUAL(eSequenceComparison, res->GetResultType());
3388  BOOST_REQUIRE_EQUAL(num_seqs, res->GetNumQueries());
3389  BOOST_REQUIRE_EQUAL(num_seqs*num_seqs, res->GetNumResults());
3390  BOOST_REQUIRE_EQUAL(num_seqs*num_seqs, res->size());
3391  typedef CSearchResultSet::size_type size_type;
3392 
3393  for (size_type q = 0; q < res->GetNumQueries(); q++) {
3394  for (size_type s = 0; s < res->GetNumQueries(); s++) {
3395  const CSearchResults& results = res->GetResults(q, s);
3396  if (q == s) { // self-hit
3397  BOOST_REQUIRE(results.HasAlignments());
3398  CConstRef<CSeq_align_set> sas = results.GetSeqAlign();
3399  BOOST_REQUIRE(sas.NotEmpty());
3400  BOOST_REQUIRE_EQUAL((size_type)1, sas->Get().size());
3401  CConstRef<CSeq_align> sa = sas->Get().front();
3402  const CSeq_id& qid = sa->GetSeq_id(0);
3403  const CSeq_id& sid = sa->GetSeq_id(1);
3404  BOOST_REQUIRE(qid.Match(sid));
3405  int num_ident = 0;
3407  int seqlen = (int)sequence::GetLength(*seq_vec[q].seqloc,
3408  &*seq_vec[q].scope);
3409  BOOST_REQUIRE_EQUAL(seqlen, num_ident);
3410  }
3411  const CSearchResults& results2 = res->GetResults(s, q);
3412  BOOST_REQUIRE_EQUAL(results.HasAlignments(),
3413  results2.HasAlignments());
3414  if ( !results.HasAlignments() ) {
3415  continue;
3416  }
3417 
3418  // Compare Q-S and S-Q hits
3419  CConstRef<CSeq_align_set> sas1 = results.GetSeqAlign();
3420  CConstRef<CSeq_align_set> sas2 = results2.GetSeqAlign();
3421  BOOST_REQUIRE(sas1.NotEmpty());
3422  BOOST_REQUIRE(sas2.NotEmpty());
3423  //cerr << "Alignments1 " << MSerial_AsnText << *sas2 << endl;
3424  //cerr << "Alignments2 " << MSerial_AsnText << *sas2 << endl;
3425  BOOST_REQUIRE_EQUAL(sas1->Get().size(), sas2->Get().size());
3426  CSeq_align_set::Tdata::const_iterator i1 = sas1->Get().begin(),
3427  i2 = sas2->Get().begin();
3428  for (; i1 != sas1->Get().end(); ++i1, ++i2) {
3429  CConstRef<CSeq_align> al1 = *i1;
3430  CConstRef<CSeq_align> al2 = *i2;
3431  BOOST_REQUIRE(al1.NotEmpty());
3432  BOOST_REQUIRE(al2.NotEmpty());
3433  //cerr << "Align1 " << MSerial_AsnText << *al1 << endl;
3434  //cerr << "Align2 " << MSerial_AsnText << *al2 << endl;
3435 
3436  int score1 = 0, score2 = 0;
3439  BOOST_REQUIRE_EQUAL(score1, score2);
3440 
3441  double bit_score1 = .0, bit_score2 = .0;
3442  al1->GetNamedScore(CSeq_align::eScore_BitScore, bit_score1);
3443  al2->GetNamedScore(CSeq_align::eScore_BitScore, bit_score2);
3444  BOOST_REQUIRE_CLOSE(bit_score1, bit_score2, 10e-2);
3445 
3446  double evalue1 = .0, evalue2 = .0;
3449  BOOST_REQUIRE_CLOSE(evalue1, evalue2, 10e-5);
3450  }
3451  }
3452  }
3453 
3454 
3455  // Repeat the tests above using the TSeqAlignVector
3456  TSeqAlignVector alignments = blaster.Run();
3457  BOOST_REQUIRE_EQUAL(num_seqs*num_seqs, alignments.size());
3458  for (size_t q = 0; q < num_seqs; q++) {
3459  for (size_t s = 0; s < num_seqs; s++) {
3460  size_t idx1 = q*num_seqs + s;
3461  CConstRef<CSeq_align_set> al1 = alignments[idx1];
3462  if (idx1 == (q*num_seqs+q)) { // self-hit
3463  BOOST_REQUIRE(al1.NotEmpty());
3464  BOOST_REQUIRE_EQUAL(1U, al1->Get().size());
3465  CConstRef<CSeq_align> sa = al1->Get().front();
3466  const CSeq_id& qid = sa->GetSeq_id(0);
3467  const CSeq_id& sid = sa->GetSeq_id(1);
3468  BOOST_REQUIRE(qid.Match(sid));
3469  int num_ident = 0;
3471  int seqlen = (int)sequence::GetLength(*seq_vec[q].seqloc,
3472  &*seq_vec[q].scope);
3473  BOOST_REQUIRE_EQUAL(seqlen, num_ident);
3474  continue;
3475  }
3476  size_t idx2 = s*num_seqs + q;
3477  CConstRef<CSeq_align_set> al2 = alignments[idx2];
3478  if (al1.Empty() || al2.Empty()) {
3479  BOOST_REQUIRE_EQUAL(al1.Empty(), al2.Empty());
3480  continue;
3481  }
3482  BOOST_REQUIRE_EQUAL(al1.NotEmpty(), al2.NotEmpty());
3483  // compare al1 with al2
3484  {{
3485  CConstRef<CSeq_align> sa1 = al1->Get().front();
3486  CConstRef<CSeq_align> sa2 = al2->Get().front();
3487  BOOST_REQUIRE(sa1.NotEmpty());
3488  BOOST_REQUIRE(sa2.NotEmpty());
3489  //cerr << "Align1 " << MSerial_AsnText << *sa1 << endl;
3490  //cerr << "Align2 " << MSerial_AsnText << *sa2 << endl;
3491 
3492  int score1 = 0, score2 = 0;
3495  BOOST_REQUIRE_EQUAL(score1, score2);
3496 
3497  double bit_score1 = .0, bit_score2 = .0;
3498  sa1->GetNamedScore(CSeq_align::eScore_BitScore, bit_score1);
3499  sa2->GetNamedScore(CSeq_align::eScore_BitScore, bit_score2);
3500  BOOST_REQUIRE_CLOSE(bit_score1, bit_score2, 10e-2);
3501 
3502  double evalue1 = .0, evalue2 = .0;
3505  BOOST_REQUIRE_CLOSE(evalue1, evalue2, 10e-5);
3506  }}
3507  }
3508  }
3509 
3510 }
3511 
3512 BOOST_AUTO_TEST_CASE(testInterruptCallbackWithNull) {
3513  CSeq_id id("gi|129295");
3514  unique_ptr<SSeqLoc> sl(CTestObjMgr::Instance().CreateSSeqLoc(id));
3515 
3516  CBl2Seq blaster(*sl, *sl, eBlastp);
3517  TInterruptFnPtr null_fnptr = 0;
3518  TInterruptFnPtr fnptr = blaster.SetInterruptCallback(null_fnptr);
3519  BOOST_REQUIRE(fnptr == NULL);
3520 
3521  TSeqAlignVector sav(blaster.Run());
3522  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
3523  BOOST_REQUIRE_EQUAL(1, (int)sar->GetSegs().GetDenseg().GetNumseg());
3524 
3526  // make sure we get the previous interrupt callback
3527  BOOST_REQUIRE(fnptr == null_fnptr);
3528 
3529  fnptr = blaster.SetInterruptCallback(null_fnptr);
3530  // make sure we get the previous interrupt callback
3531  BOOST_REQUIRE(fnptr == interrupt_immediately);
3532 
3533  // Retry the search now that we've removed the interrupt callback
3534  sav = blaster.Run();
3535  BOOST_REQUIRE_EQUAL(1, (int)sav.size());
3536  sar = *(sav[0]->Get().begin());
3537  BOOST_REQUIRE_EQUAL(1, (int)sar->GetSegs().GetDenseg().GetNumseg());
3538 }
3539 
3540 BOOST_AUTO_TEST_CASE(testInterruptCallbackDoNotInterrupt) {
3541  CSeq_id id("gi|129295");
3542  unique_ptr<SSeqLoc> sl(CTestObjMgr::Instance().CreateSSeqLoc(id));
3543 
3544  CBl2Seq blaster(*sl, *sl, eBlastp);
3546  BOOST_REQUIRE(fnptr == NULL);
3547 
3548  TSeqAlignVector sav(blaster.Run());
3549  BOOST_REQUIRE_EQUAL(1, (int)sav.size());
3550  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
3551  BOOST_REQUIRE_EQUAL(1, (int)sar->GetSegs().GetDenseg().GetNumseg());
3552 }
3553 
3554 #if SEQLOC_MIX_QUERY_OK
3555 BOOST_AUTO_TEST_CASE(MultiIntervalLoc) {
3556  const size_t kNumInts = 20;
3557  const size_t kStarts[kNumInts] =
3558  { 838, 1838, 6542, 7459, 9246, 10431, 14807, 16336, 19563,
3559  20606, 21232, 22615, 23822, 27941, 29597, 30136, 31287,
3560  31786, 33315, 35402 };
3561  const size_t kEnds[kNumInts] =
3562  { 961, 2010, 6740, 7573, 9408, 10609, 15043, 16511, 19783,
3563  20748, 21365, 22817, 24049, 28171, 29839, 30348, 31362,
3564  31911, 33485, 37952 };
3565  size_t index;
3566 
3567  CSeq_id qid("gi|3417288");
3568  CRef<CSeq_loc> qloc(new CSeq_loc());
3569  for (index = 0; index < kNumInts; ++index) {
3570  CRef<CSeq_loc> next_loc(new CSeq_loc());
3571  next_loc->SetInt().SetFrom(kStarts[index]);
3572  next_loc->SetInt().SetTo(kEnds[index]);
3573  next_loc->SetInt().SetId(qid);
3574  qloc->SetMix().Set().push_back(next_loc);
3575  }
3576 
3577  CRef<CScope> scope(new CScope(CTestObjMgr::Instance().GetObjMgr()));
3578  scope->AddDefaults();
3579 
3580  unique_ptr<SSeqLoc> query(new SSeqLoc(qloc, scope));
3581 
3582  CSeq_id sid("gi|51511732");
3583  pair<TSeqPos, TSeqPos> range(15595732, 15705419);
3584  unique_ptr<SSeqLoc> subject(
3585  CTestObjMgr::Instance().CreateSSeqLoc(sid, range, eNa_strand_both));
3586  CBl2Seq blaster(*query, *subject, eBlastn);
3587  TSeqAlignVector sav(blaster.Run());
3588  CRef<CSeq_align> sar = *(sav[0]->Get().begin());
3589  BOOST_REQUIRE_EQUAL(60, (int)sar->GetSegs().GetDisc().Get().size());
3590 }
3591 #endif
3592 
3593 BOOST_AUTO_TEST_CASE(QueryMaskIgnoredInMiniExtension) {
3594  CRef<CSeq_loc> qloc(new CSeq_loc());
3595  qloc->SetWhole().SetGi(GI_CONST(4505696));
3596  CSeq_id sid("gi|29809252");
3597  pair<TSeqPos, TSeqPos> range(662070, 662129);
3598 
3599  CRef<CScope> scope(new CScope(CTestObjMgr::Instance().GetObjMgr()));
3600  scope->AddDefaults();
3601 
3602  unique_ptr<SSeqLoc> query(new SSeqLoc(qloc, scope));
3603  unique_ptr<SSeqLoc> subject(
3604  CTestObjMgr::Instance().CreateSSeqLoc(sid, range, eNa_strand_both));
3605 
3606  CBl2Seq blaster(*query, *subject, eMegablast);
3607  TSeqAlignVector sav(blaster.Run());
3608  CRef<CSeq_align_set> sas = sav.front();
3609  BOOST_REQUIRE(sas->Get().empty());
3610 }
3611 
3612 #endif /* SKIP_DOXYGEN_PROCESSING */
3613 
User-defined methods of the data storage class.
BLAST formatter utilities.
static const char * kFileName
Declares the CBl2Seq (BLAST 2 Sequences) class.
void testBlastHitCounts(CBl2Seq &blaster, EBl2seqTest test_id)
CRef< CSeq_loc > s_MakePackedInt2(CRef< CSeq_id > id, vector< TSeqRange > &range_vec)
#define ARRAY_SIZE(a)
static CRef< CBl2Seq > s_SetupWithMultipleQueriesAndSubjects(bool query_is_nucl, bool subj_is_nucl, EProgram program)
Boolean do_not_interrupt(SBlastProgress *)
Returns false so that the processing never stops in spite of a callback function to interrupt the pro...
BOOST_AUTO_TEST_CASE_TIMEOUT(testInterruptBlastSetup, 3)
USING_SCOPE(blast)
Boolean interrupt_after3calls(SBlastProgress *)
The interruption occurs after 3 invokations of this callback.
BOOST_AUTO_TEST_CASE(NucleotideMultipleSeqLocs1)
Boolean callback_counter(SBlastProgress *progress_info)
This callback never interrupts the BLAST search, its only purpose is to count the number of times thi...
Boolean interrupt_immediately(SBlastProgress *)
Returns true so that the processing stops upon the first invocation of this callback.
static void testWholeIntervalAlignment(TSeqAlignVector &sav)
void testRawCutoffs(CBl2Seq &blaster, EProgram program, EBl2seqTest test_id)
static void testIntervalWholeAlignment(TSeqAlignVector &sav)
Boolean interrupt_at_random(SBlastProgress *progress_info)
This callback interrupts the BLAST search after the callback has been executed the requested number o...
Boolean interrupt_on_traceback(SBlastProgress *progress_info)
The interruption occurs after starting the traceback stage.
USING_NCBI_SCOPE
void testResultAlignments(size_t num_queries, size_t num_subjects, TSeqAlignVector result_alnvec)
void x_TestAlignmentQuerySubjStrandCombinations(TSeqAlignVector &sav, string aligned_strands)
@ eTblastx_many_hits
@ eBlastx_oof
@ eMegablast_555_555
@ eBlastp_multi_q_s
@ eTblastn_129295_555
@ eTblastn_oof
@ eDiscMegablast_555_555
@ eMegablast_chrom_mrna
@ eBlastn_multi_q
@ eTblastx_555_555
@ eBlastn_555_3090
@ eDiscMegablast_U02544_U61969
@ eBlastp_129295_129295
@ eBlastp_129295_7662354
@ eBlastn_555_555
@ eBlastx_555_129295
@ eBlastp_multi_q
@ eTblastn_129295_555_large_word
@ eSoftSubjMasking
Definition: blast_def.h:237
Boolean(* TInterruptFnPtr)(SBlastProgress *progress_info)
Prototype for function pointer to determine whether the BLAST search should proceed or be interrupted...
Definition: blast_def.h:354
@ eTracebackSearch
Traceback stage.
Definition: blast_def.h:330
Interface for reading SRA sequences into blast input.
Interface for converting sources of sequence data into blast sequence input.
const char * kBlastErrMsg_CantCalculateUngappedKAParams
Definition: blast_message.c:38
Declares the CBlastNucleotideOptionsHandle class.
@ eGreedyScoreOnly
Greedy extension (megaBlast)
@ eGreedyTbck
Greedy extension (megaBlast)
#define kDefaultRepeatFilterDb
Default value for repeats database filtering.
@ eCompressedAaLookupTable
compressed alphabet (blastp) lookup table
Declares the CBlastOptionsHandle and CBlastOptionsFactory classes.
Declares the CBlastProteinOptionsHandle class.
vector< CRef< objects::CSeq_align_set > > TSeqAlignVector
Vector of Seq-align-sets.
@ eDatabaseSearch
Seq-aligns in the style of a database search.
@ eSequenceComparison
Seq-aligns in the BLAST 2 Sequence style (one alignment per query-subject pair)
EProgram
This enumeration is to evolve into a task/program specific list that specifies sets of default parame...
Definition: blast_types.hpp:56
@ eTblastx
Translated nucl-Translated nucl.
Definition: blast_types.hpp:62
@ eBlastn
Nucl-Nucl (traditional blastn)
Definition: blast_types.hpp:58
@ eBlastp
Protein-Protein.
Definition: blast_types.hpp:59
@ eTblastn
Protein-Translated nucl.
Definition: blast_types.hpp:61
@ eMegablast
Nucl-Nucl (traditional megablast)
Definition: blast_types.hpp:65
@ eDiscMegablast
Nucl-Nucl using discontiguous megablast.
Definition: blast_types.hpp:66
@ eBlastx
Translated nucl-Protein.
Definition: blast_types.hpp:60
BOOST_AUTO_TEST_SUITE_END() static int s_GetSegmentFlags(const CBioseq &bioseq)
Declares the CBlastxOptionsHandle class.
ncbi::TMaskedQueryRegions mask
CAutoEnvironmentVariable –.
Definition: ncbienv.hpp:173
Runs the BLAST algorithm between 2 sequences.
Definition: bl2seq.hpp:58
Defines BLAST error codes (user errors included)
Class representing a text file containing sequences in fasta format.
Class that centralizes the configuration data for sequences to be converted.
Definition: blast_input.hpp:48
void SetLowercaseMask(bool mask)
Turn lowercase masking on/off.
Generalized converter from an abstract source of biological sequence data to collections of blast inp...
Handle to the nucleotide-nucleotide options to the BLAST algorithm.
Handle to the protein-protein options to the BLAST algorithm.
Class whose purpose is to create CScope objects which have data loaders added with different prioriti...
CRef< objects::CScope > NewScope()
Create a new, properly configured CScope.
Handle to the translated nucleotide-protein options to the BLAST algorithm.
Handle to the nucleotide-nucleotide options to the discontiguous BLAST algorithm.
Base class for reading FASTA sequences.
Definition: fasta.hpp:80
Class to perform a BLAST search on local BLAST databases Note that PHI-BLAST can be run using this cl...
Definition: local_blast.hpp:62
Interface to create a BlastSeqSrc suitable for use in CORE BLAST from a a variety of BLAST database/s...
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
Definition: ncbistre.hpp:802
NCBI C++ Object Manager dependant implementation of IQueryFactory.
vector< CRange< TSeqPos > > TRanges
CRandom::
Definition: random_gen.hpp:66
CRef –.
Definition: ncbiobj.hpp:618
CScope –.
Definition: scope.hpp:92
Blast Search Subject.
Search Results for One Query.
Tdata::size_type Size() const
@ eScore_CompAdjMethod
Definition: Seq_align.hpp:174
@ eScore_IdentityCount
Definition: Seq_align.hpp:145
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
Definition: Seq_align.cpp:317
bool GetNamedScore(const string &id, int &score) const
Get score.
Definition: Seq_align.cpp:563
TSeqPos GetSeqStart(TDim row) const
Definition: Seq_align.cpp:252
static CRef< CScope > NewScope(bool with_defaults=true)
Return a new scope, possibly (by default) with default loaders, which will include the Genbank loader...
Definition: simple_om.cpp:202
Handle to the protein-translated nucleotide options to the BLAST algorithm.
static CTestObjMgr & Instance()
Definition: test_objmgr.cpp:71
Template class for iteration on objects of class C.
Definition: iterator.hpp:673
Class for the messages for an individual query sequence.
typedef for the messages for an entire BLAST search, which could be comprised of multiple query seque...
@ eCompositionBasedStats
Composition-based statistics as in NAR 29:2994-3005, 2001.
@ eNoCompositionBasedStats
Don't use composition based statistics.
static const string kMismatch
Declares the CDiscNucleotideOptionsHandle class.
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
Operators to edit gaps in sequences.
void SetCompositionBasedStats(ECompoAdjustModes mode)
void SetEvalueThreshold(double eval)
Sets EvalueThreshold.
void SetWindowSize(int w)
void SetSubject(const SSeqLoc &subject)
Set the subject sequence.
Definition: bl2seq.hpp:250
string WindowMaskerTaxidToDb(int taxid)
Get the windowmasker file path for a given taxid.
void GetMessages(TSearchMessages &messages) const
Returns error messages/warnings.
Definition: bl2seq.hpp:296
void SetMatchReward(int r)
Sets MatchReward.
CConstRef< objects::CSeq_align_set > GetSeqAlign() const
Accessor for the Seq-align results.
void SetWindowMaskerTaxId(int taxid)
Enable window masker and select a taxid (or 0 to disable).
CRef< CSearchResultSet > Run()
Executes the search.
TInterruptFnPtr SetInterruptCallback(TInterruptFnPtr fnptr, void *user_data=NULL)
Set a function callback to be invoked by the CORE of BLAST to allow interrupting a BLAST search in pr...
Definition: bl2seq.hpp:302
static CBlastOptionsHandle * Create(EProgram program, EAPILocality locality=CBlastOptions::eLocal)
Creates an options handle object configured with default options for the requested program,...
void SetWordThreshold(double w)
Sets WordThreshold.
CBlastOptions & SetOptions()
Returns a reference to the internal options class which this object is a handle for.
const char * GetWindowMaskerDatabase() const
Get the window masker database name (or NULL if not set).
size_type GetNumQueries() const
Return the number of unique query ID's represented by this object.
void SetTraditionalBlastnDefaults()
Sets TraditionalBlastnDefaults.
void SetMismatchPenalty(int p)
Sets MismatchPenalty.
void WindowMaskerPathReset()
Resets the path to the windowmasker data files.
void SetRepeatFilteringDB(const char *db)
Enable repeat filtering.
virtual TSeqAlignVector Run()
Perform BLAST search Assuming N queries and M subjects, the structure of the returned vector is as fo...
Definition: bl2seq.cpp:173
size_type size() const
Identical to GetNumResults, provided to facilitate STL-style iteration.
void SetTraditionalBlastnDefaults()
NOTE: Unavailable for discontiguous megablast.
vector< value_type >::size_type size_type
size_type type definition
void SetWordSize(int ws)
const CBlastOptions & GetOptions() const
Return the object which this object is a handle for.
CRef< CSearchResultSet > RunEx()
Performs the same functionality as Run(), but it returns a different data type.
Definition: bl2seq.cpp:196
void SetTraditionalMegablastDefaults()
Sets TraditionalMegablastDefaults.
void SetMatrixName(const char *matrix)
Sets MatrixName.
void GetAncillaryResults(CSearchResultSet::TAncillaryVector &retval) const
Get the ancillary results for a BLAST search (to be used with the Run() method)
Definition: bl2seq.hpp:311
bool GetRepeatFiltering() const
Is repeat filtering enabled?
CBlastOptionsHandle & SetOptionsHandle()
Set the options handle.
Definition: bl2seq.hpp:278
int GetWindowMaskerTaxId() const
Get the window masker taxid (or 0 if not set).
EResultType GetResultType() const
Get the type of results contained in this object.
const char * GetRepeatFilteringDB() const
Get the repeat filtering database.
void SetWordSize(int ws)
Sets WordSize.
void SetDustFiltering(bool val)
Enable dust filtering.
void SetWindowMaskerDatabase(const char *db)
Enable window masker and select a database (or NULL to disable).
void SetMaxNumHspPerSequence(int m)
Sets MaxNumHspPerSequence.
vector< CRef< CBlastAncillaryData > > TAncillaryVector
typedef for a vector of CRef<CBlastAncillaryData>
void SetQuery(const SSeqLoc &query)
Set the query.
Definition: bl2seq.hpp:222
int WindowMaskerPathInit(const string &window_masker_path)
Initialize the path to the windowmasker data files.
size_type GetNumResults() const
Return the number of results contained by this object.
void SetLookupTableType(ELookupTableType type)
CSearchResults & GetResults(size_type qi, size_type si)
Retrieve results for a query-subject pair contained by this object.
BlastDiagnostics * GetDiagnostics() const
Retrieves the diagnostics information returned from the engine.
Definition: bl2seq.hpp:290
bool IsDbScanMode() const
Returns true if this is not a database but is database scanning mode.
bool HasAlignments() const
Return true if there are any alignments for this query.
@ eBlastDbIsNucleotide
nucleotide
@ eBlastDbIsProtein
protein
#define INVALID_GI
Definition: ncbimisc.hpp:1089
#define GI_FROM(T, value)
Definition: ncbimisc.hpp:1086
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
Int8 TIntId
Definition: ncbimisc.hpp:999
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define GI_CONST(gi)
Definition: ncbimisc.hpp:1087
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
Definition: ncbimisc.hpp:1508
#define NULL
Definition: ncbistd.hpp:225
const string & GetMsg(void) const
Get message string.
Definition: ncbiexpt.cpp:461
static string GetCwd(void)
Get the current working directory.
Definition: ncbifile.cpp:3708
#define MSerial_AsnText
I/O stream manipulators –.
Definition: serialbase.hpp:696
bool Match(const CSeq_id &sid2) const
Match() - TRUE if SeqIds are equivalent.
Definition: Seq_id.hpp:1033
void SetPacked_int(TPacked_int &v)
Definition: Seq_loc.hpp:984
void SetWhole(TWhole &v)
Definition: Seq_loc.hpp:982
CBeginInfo Begin(C &obj)
Get starting point of object hierarchy.
Definition: iterator.hpp:1004
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
ENa_strand GetStrand(const CSeq_loc &loc, CScope *scope=0)
Returns eNa_strand_unknown if multiple Bioseqs in loc Returns eNa_strand_other if multiple strands in...
CBioseq_Handle AddBioseq(CBioseq &bioseq, TPriority pri=kPriority_Default, EExist action=eExist_Throw)
Add bioseq, return bioseq handle.
Definition: scope.cpp:530
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CSeq_entry_Handle AddTopLevelSeqEntry(CSeq_entry &top_entry, TPriority pri=kPriority_Default, EExist action=eExist_Default)
Add seq_entry, default priority is higher than for defaults or loaders Add object to the score with p...
Definition: scope.cpp:522
void AddDefaults(TPriority pri=kPriority_Default)
Add default data loaders from object manager.
Definition: scope.cpp:504
bool Empty(void) const THROWS_NONE
Check if CConstRef is empty – not pointing to any object which means having a null value.
Definition: ncbiobj.hpp:1385
TObjectType * GetPointer(void) const THROWS_NONE
Get pointer,.
Definition: ncbiobj.hpp:1684
bool NotNull(void) const THROWS_NONE
Check if pointer is not null – same effect as NotEmpty().
Definition: ncbiobj.hpp:744
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:1439
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool NotEmpty(void) const THROWS_NONE
Check if CConstRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:1392
TObjectType & GetObject(void) const
Get object.
Definition: ncbiobj.hpp:1697
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
Uint4 TValue
Type of the generated integer value and/or the seed value.
Definition: random_gen.hpp:69
CRange< TSeqPos > TSeqRange
typedefs for sequence ranges
Definition: range.hpp:419
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
static string SizetToString(size_t value, TNumToStringFlags flags=0, int base=10)
Convert size_t to string.
Definition: ncbistr.cpp:2751
#define kEmptyStr
Definition: ncbistr.hpp:123
static bool EndsWith(const CTempString str, const CTempString end, ECase use_case=eCase)
Check if a string ends with a specified suffix value.
Definition: ncbistr.hpp:5429
#define NPOS
Definition: ncbistr.hpp:133
const TDenseg & GetDenseg(void) const
Get the variant data.
Definition: Seq_align_.cpp:153
vector< CRef< CScore > > TScore
Definition: Seq_align_.hpp:398
const TStarts & GetStarts(void) const
Get the Starts member data.
Definition: Dense_seg_.hpp:530
void SetSegs(TSegs &value)
Assign a value to Segs data member.
Definition: Seq_align_.cpp:310
const TLens & GetLens(void) const
Get the Lens member data.
Definition: Dense_seg_.hpp:555
TDim GetDim(void) const
Get the Dim member data.
Definition: Dense_seg_.hpp:421
const TStd & GetStd(void) const
Get the variant data.
Definition: Seq_align_.hpp:752
bool IsSetScore(void) const
for whole alignment Check if a value has been assigned to Score data member.
Definition: Seq_align_.hpp:884
TNumseg GetNumseg(void) const
Get the Numseg member data.
Definition: Dense_seg_.hpp:465
list< CRef< CSeq_align > > Tdata
const TScore & GetScore(void) const
Get the Score member data.
Definition: Seq_align_.hpp:896
const TDisc & GetDisc(void) const
Get the variant data.
Definition: Seq_align_.cpp:197
const TStrands & GetStrands(void) const
Get the Strands member data.
Definition: Dense_seg_.hpp:580
const Tdata & Get(void) const
Get the member data.
const TSegs & GetSegs(void) const
Get the Segs member data.
Definition: Seq_align_.hpp:921
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
@ eNa_strand_unknown
Definition: Na_strand_.hpp:65
@ eNa_strand_both
in forward orientation
Definition: Na_strand_.hpp:68
@ e_Gi
GenInfo Integrated Database.
Definition: Seq_id_.hpp:106
TId & SetId(void)
Assign a value to Id data member.
Definition: Bioseq_.hpp:296
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
@ eRepr_raw
continuous sequence
Definition: Seq_inst_.hpp:94
@ eMol_na
just a nucleic acid
Definition: Seq_inst_.hpp:113
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
static int input()
int i
int len
Main class to perform a BLAST search on the local machine.
Declares class which provides internal BLAST database representations to the internal BLAST APIs.
static void byte(MDB_val *v)
Definition: mdb_dump.c:81
const TYPE & Get(const CNamedParameterList *param)
range(_Ty, _Ty) -> range< _Ty >
constexpr bool empty(list< Ts... >) noexcept
const struct ncbi::grid::netcache::search::fields::SIZE size
int strcmp(const char *str1, const char *str2)
Definition: odbc_utils.hpp:160
unsigned int a
Definition: ncbi_localip.c:102
Uint1 Boolean
bool replacment for C
Definition: ncbi_std.h:94
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
NOTE: This file contains work in progress and the APIs are likely to change, please do not rely on th...
BOOST_AUTO_TEST_SUITE(psiblast_iteration)
vector< SSeqLoc > TSeqLocVector
Vector of sequence locations.
Definition: sseqloc.hpp:129
BlastUngappedStats * ungapped_stat
Ungapped extension counts.
BlastRawCutoffs * cutoffs
Various raw values for the cutoffs.
BlastGappedStats * gapped_stat
Gapped extension counts.
Structure containing hit counts from the gapped stage of a BLAST search.
Int4 good_extensions
Number of HSPs below the e-value threshold after gapped extension.
Int4 extensions
Total number of gapped extensions performed.
The structure to hold all HSPs for a given sequence after the gapped alignment.
Definition: blast_hits.h:153
Int4 oid
The ordinal id of the subject sequence this HSP list is for.
Definition: blast_hits.h:154
Int4 hspcnt
Number of HSPs saved.
Definition: blast_hits.h:158
BlastHSP ** hsp_array
Array of pointers to individual HSPs.
Definition: blast_hits.h:157
The structure to contain all BLAST results, for multiple queries.
Definition: blast_hits.h:183
BlastHitList ** hitlist_array
Array of results for individual query sequences.
Definition: blast_hits.h:185
Structure holding all information about an HSP.
Definition: blast_hits.h:126
BlastSeg query
Query sequence info.
Definition: blast_hits.h:131
BlastSeg subject
Subject sequence info.
Definition: blast_hits.h:132
Int4 score
This HSP's raw score.
Definition: blast_hits.h:127
The structure to contain all BLAST results for one query sequence.
Definition: blast_hits.h:169
BlastHSPList ** hsplist_array
Array of HSP lists for individual database hits.
Definition: blast_hits.h:176
Int4 hsplist_count
Filled size of the HSP lists array.
Definition: blast_hits.h:170
Structure holding raw cutoff and gap-x-drop values.
Int4 ungapped_cutoff
Minimal raw score for starting gapped extension.
Int4 x_drop_gap_final
Raw value of the x-dropoff for gapped extensions with traceback.
Int4 x_drop_gap
Raw value of the x-dropoff for preliminary gapped extensions.
Int4 x_drop_ungapped
Raw value of the x-dropoff for ungapped extensions.
Int4 end
End of hsp.
Definition: blast_hits.h:99
Int4 gapped_start
Where the gapped extension started.
Definition: blast_hits.h:100
Int4 offset
Start of hsp.
Definition: blast_hits.h:98
Structure containing hit counts from the ungapped stage of a BLAST search.
Int8 lookup_hits
Number of successful lookup table hits.
Int4 init_extends
Number of initial words found and extended.
Int4 good_init_extends
Number of successful initial extensions, i.e.
Progress monitoring structure.
Definition: blast_def.h:341
EBlastStage stage
Stage of the BLAST search currently in progress.
Definition: blast_def.h:342
void * user_data
Pointer to user-provided data.
Definition: blast_def.h:344
Structure to represent a single sequence to be fed to BLAST.
Definition: sseqloc.hpp:47
static string subject
static string query
Declares the CTBlastnOptionsHandle class.
Utility stuff for more convenient using of Boost.Test library.
@ FALSE
Definition: testodbc.c:27
@ TRUE
Definition: testodbc.c:27
static const char * kMatch
else result
Definition: token2.c:20
static HENV env
Definition: transaction2.c:38
Interface to retrieve list of available windowmasker filtering.
Modified on Fri Dec 08 08:19:22 2023 by modify_doxy.py rev. 669887