NCBI C++ ToolKit
blast_args.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: blast_args.cpp 102828 2024-07-25 12:37:59Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================*/
25 
26 /*****************************************************************************
27 
28 File name: blast_args.cpp
29 
30 Author: Jason Papadopoulos
31 
32 ******************************************************************************/
33 
34 /** @file blast_args.cpp
35  * convert blast-related command line
36  * arguments into blast options
37 */
38 #include <ncbi_pch.hpp>
39 #include <corelib/ncbi_system.hpp>
44 #include <algo/blast/api/objmgr_query_data.hpp> /* for CObjMgrQueryFactory */
48 #include <util/format_guess.hpp>
49 #include <util/line_reader.hpp>
51 #include <algo/blast/blastinput/blast_input.hpp> // for CInputException
52 #include <algo/winmask/seq_masker_istat_factory.hpp> // for CSeqMaskerIstatFactory::DiscoverStatType
53 #include <connect/ncbi_connutil.h>
55 
56 #include <algo/blast/api/msa_pssm_input.hpp> // for CPsiBlastInputClustalW
57 #include <algo/blast/api/pssm_engine.hpp> // for CPssmEngine
58 
60 #include <objtools/blast/seqdb_reader/tax4blastsqlite.hpp> // for taxid to their descendant taxids lookup
62 BEGIN_SCOPE(blast)
64 USING_SCOPE(align_format);
65 
66 void
67 IBlastCmdLineArgs::ExtractAlgorithmOptions(const CArgs& /* cmd_line_args */,
68  CBlastOptions& /* options */)
69 {}
70 
72  const string& program_desc)
73  : m_ProgName(program_name), m_ProgDesc(program_desc)
74 {}
75 
76 void
78 {
79  // program description
80  arg_desc.SetUsageContext(m_ProgName, m_ProgDesc + " " +
81  CBlastVersion().Print());
82 }
83 
85  const string& default_task)
86 : m_SupportedTasks(supported_tasks), m_DefaultTask(default_task)
87 {
89  if ( !m_DefaultTask.empty() ) {
91  }
92 }
93 
94 void
96 {
97  arg_desc.SetCurrentGroup("General search options");
98  if ( !m_DefaultTask.empty() ) {
99  arg_desc.AddDefaultKey(kTask, "task_name", "Task to execute",
101  } else {
102  arg_desc.AddKey(kTask, "task_name", "Task to execute",
104  }
106  arg_desc.SetCurrentGroup("");
107 
108 }
109 
110 void
112  CBlastOptions& /* options */)
113 {
114  // N.B.: handling of tasks occurs at the application level to ensure that
115  // only relevant tasks are added (@sa CBlastnAppArgs)
116 }
117 
119 {
120  // Only support blastn for now
121  if (program == eBlastTypeBlastn) {
122  m_QueryIsProtein = false;
123  m_IsRpsBlast = false;
124  m_ShowPercentIdentity= true;
125  m_IsTblastx= false;
126  m_IsIgBlast = false;
127  m_SuppressSumStats = true;
128  m_IsBlastn = true;
129  }
130  else {
131  NCBI_THROW(CInputException, eInvalidInput, "Invalid program");
132  }
133 }
134 
135 void
137 {
138  arg_desc.SetCurrentGroup("General search options");
139 
140  // evalue cutoff
141  if (!m_IsIgBlast) {
142  string des = "Expectation value (E) threshold for saving hits. Default = 10";
143  if(m_IsBlastn) {
144  des += " (1000 for blastn-short)";
145  }
147  } else if (m_QueryIsProtein) {
148  arg_desc.AddDefaultKey(kArgEvalue, "evalue",
149  "Expectation value (E) threshold for saving hits ",
151  NStr::DoubleToString(1.0));
152  } else {
153  //igblastn
154  arg_desc.AddDefaultKey(kArgEvalue, "evalue",
155  "Expectation value (E) threshold for saving hits ",
157  NStr::DoubleToString(20.0));
158  }
159 
160  // word size
161  // Default values: blastn=11, megablast=28, others=3
162  if(!m_IsRpsBlast) {
163  const string description = m_QueryIsProtein
164  ? "Word size for wordfinder algorithm"
165  : "Word size for wordfinder algorithm (length of best perfect match)";
166  arg_desc.AddOptionalKey(kArgWordSize, "int_value", description,
171  }
172 
173  if ( !m_IsRpsBlast && !m_IsTblastx) {
174  // gap open penalty
175  arg_desc.AddOptionalKey(kArgGapOpen, "open_penalty",
176  "Cost to open a gap",
178 
179  // gap extend penalty
180  arg_desc.AddOptionalKey(kArgGapExtend, "extend_penalty",
181  "Cost to extend a gap",
183  }
184 
185 
187  arg_desc.SetCurrentGroup("Restrict search or results");
188  arg_desc.AddOptionalKey(kArgPercentIdentity, "float_value",
189  "Percent identity",
192  new CArgAllow_Doubles(0.0, 100.0));
193  }
194 
195  if (!m_IsIgBlast) {
196  arg_desc.SetCurrentGroup("Restrict search or results");
197  arg_desc.AddOptionalKey(kArgQueryCovHspPerc, "float_value",
198  "Percent query coverage per hsp",
201  new CArgAllow_Doubles(0.0, 100.0));
202 
203  arg_desc.AddOptionalKey(kArgMaxHSPsPerSubject, "int_value",
204  "Set maximum number of HSPs per subject sequence to save for each query",
208 
209  arg_desc.SetCurrentGroup("Extension options");
210  // ungapped X-drop
211  // Default values: blastn=20, megablast=10, others=7
212  arg_desc.AddOptionalKey(kArgUngappedXDropoff, "float_value",
213  "X-dropoff value (in bits) for ungapped extensions",
215 
216  // Tblastx is ungapped only.
217  if (!m_IsTblastx) {
218  // initial gapped X-drop
219  // Default values: blastn=30, megablast=20, tblastx=0, others=15
220  arg_desc.AddOptionalKey(kArgGappedXDropoff, "float_value",
221  "X-dropoff value (in bits) for preliminary gapped extensions",
223 
224  // final gapped X-drop
225  // Default values: blastn/megablast=50, tblastx=0, others=25
226  arg_desc.AddOptionalKey(kArgFinalGappedXDropoff, "float_value",
227  "X-dropoff value (in bits) for final gapped alignment",
229  }
230  }
231  arg_desc.SetCurrentGroup("Statistical options");
232  // effective search space
233  // Default value is the real size
234  arg_desc.AddOptionalKey(kArgEffSearchSpace, "int_value",
235  "Effective length of the search space",
239 
240  if (!m_SuppressSumStats) {
241  arg_desc.AddOptionalKey(kArgSumStats, "bool_value",
242  "Use sum statistics",
244  }
245 
246  arg_desc.SetCurrentGroup("");
247 }
248 
249 void
251  CBlastOptions& opt)
252 {
253  if (args.Exist(kArgEvalue) && args[kArgEvalue]) {
254  opt.SetEvalueThreshold(args[kArgEvalue].AsDouble());
255  }
256 
257  int gap_open=0, gap_extend=0;
258  if (args.Exist(kArgMatrixName) && args[kArgMatrixName])
260  (args[kArgMatrixName].AsString().c_str(), &gap_open, &gap_extend);
261 
262  if (args.Exist(kArgGapOpen) && args[kArgGapOpen]) {
263  opt.SetGapOpeningCost(args[kArgGapOpen].AsInteger());
264  }
265  else if (args.Exist(kArgMatrixName) && args[kArgMatrixName]) {
266  opt.SetGapOpeningCost(gap_open);
267  }
268 
269  if (args.Exist(kArgGapExtend) && args[kArgGapExtend]) {
270  opt.SetGapExtensionCost(args[kArgGapExtend].AsInteger());
271  }
272  else if (args.Exist(kArgMatrixName) && args[kArgMatrixName]) {
273  opt.SetGapExtensionCost(gap_extend);
274  }
275 
276  if (args.Exist(kArgUngappedXDropoff) && args[kArgUngappedXDropoff]) {
277  opt.SetXDropoff(args[kArgUngappedXDropoff].AsDouble());
278  }
279 
280  if (args.Exist(kArgGappedXDropoff) && args[kArgGappedXDropoff]) {
281  opt.SetGapXDropoff(args[kArgGappedXDropoff].AsDouble());
282  }
283 
285  opt.SetGapXDropoffFinal(args[kArgFinalGappedXDropoff].AsDouble());
286  }
287 
288  if ( args.Exist(kArgWordSize) && args[kArgWordSize]) {
289  if (m_QueryIsProtein && args[kArgWordSize].AsInteger() > 4){
291  opt.SetWordThreshold(19.3);
292  if (args[kArgWordSize].AsInteger() > 5) {
293  opt.SetWordThreshold(21.0);
294  }
295  if (args[kArgWordSize].AsInteger() > 6) {
296  opt.SetWordThreshold(20.25);
297  }
298  }
299  opt.SetWordSize(args[kArgWordSize].AsInteger());
300 
301  }
302 
303  if (args.Exist(kArgEffSearchSpace) && args[kArgEffSearchSpace]) {
305  env.Set("OLD_FSC", "true");
306  opt.SetEffectiveSearchSpace(args[kArgEffSearchSpace].AsInt8());
307  }
308 
309  if (args.Exist(kArgPercentIdentity) && args[kArgPercentIdentity]) {
310  opt.SetPercentIdentity(args[kArgPercentIdentity].AsDouble());
311  }
312 
313  if (args.Exist(kArgQueryCovHspPerc) && args[kArgQueryCovHspPerc]) {
314  opt.SetQueryCovHspPerc(args[kArgQueryCovHspPerc].AsDouble());
315  }
316 
318  opt.SetMaxHspsPerSubject(args[kArgMaxHSPsPerSubject].AsInteger());
319  }
320 
321  if (args.Exist(kArgSumStats) && args[kArgSumStats]) {
322  opt.SetSumStatisticsMode(args[kArgSumStats].AsBoolean());
323  }
324 }
325 
326 void
328 {
329  arg_desc.SetCurrentGroup("Query filtering options");
330 
331  if (m_QueryIsProtein) {
332  arg_desc.AddDefaultKey(kArgSegFiltering, "SEG_options",
333  "Filter query sequence with SEG "
334  "(Format: '" + kDfltArgApplyFiltering + "', " +
335  "'window locut hicut', or '" + kDfltArgNoFiltering +
336  "' to disable)",
339  arg_desc.AddDefaultKey(kArgLookupTableMaskingOnly, "soft_masking",
340  "Apply filtering locations as soft masks",
343  } else {
344  arg_desc.AddOptionalKey(kArgDustFiltering, "DUST_options",
345  "Filter query sequence with DUST "
346  "(Format: '" + kDfltArgApplyFiltering + "', " +
347  "'level window linker', or '" + kDfltArgNoFiltering +
348  "' to disable) Default = '20 64 1' ('" + kDfltArgNoFiltering + "' for blastn-short)",
350  arg_desc.AddOptionalKey(kArgFilteringDb, "filtering_database",
351  "BLAST database containing filtering elements (i.e.: repeats)",
353 
354  arg_desc.AddOptionalKey(kArgWindowMaskerTaxId, "window_masker_taxid",
355  "Enable WindowMasker filtering using a Taxonomic ID",
357 
358  arg_desc.AddOptionalKey(kArgWindowMaskerDatabase, "window_masker_db",
359  "Enable WindowMasker filtering using this repeats database.",
361 
362  arg_desc.AddDefaultKey(kArgLookupTableMaskingOnly, "soft_masking",
363  "Apply filtering locations as soft masks",
366  }
367 
368  arg_desc.SetCurrentGroup("");
369 }
370 
371 void
372 CFilteringArgs::x_TokenizeFilteringArgs(const string& filtering_args,
373  vector<string>& output) const
374 {
375  output.clear();
376  NStr::Split(filtering_args, " ", output);
377  if (output.size() != 3) {
378  NCBI_THROW(CInputException, eInvalidInput,
379  "Invalid number of arguments to filtering option");
380  }
381 }
382 
383 void
385 {
386  if (args[kArgLookupTableMaskingOnly]) {
387  opt.SetMaskAtHash(args[kArgLookupTableMaskingOnly].AsBoolean());
388  }
389 
390  vector<string> tokens;
391 
392  try {
393  if (m_QueryIsProtein && args[kArgSegFiltering]) {
394  const string& seg_opts = args[kArgSegFiltering].AsString();
395  if (seg_opts == kDfltArgNoFiltering) {
396  opt.SetSegFiltering(false);
397  } else if (seg_opts == kDfltArgApplyFiltering) {
398  opt.SetSegFiltering(true);
399  } else {
400  x_TokenizeFilteringArgs(seg_opts, tokens);
404  }
405  }
406 
407  if ( !m_QueryIsProtein && args[kArgDustFiltering]) {
408  const string& dust_opts = args[kArgDustFiltering].AsString();
409  if (dust_opts == kDfltArgNoFiltering) {
410  opt.SetDustFiltering(false);
411  } else if (dust_opts == kDfltArgApplyFiltering) {
412  opt.SetDustFiltering(true);
413  } else {
414  x_TokenizeFilteringArgs(dust_opts, tokens);
418  }
419  }
420  } catch (const CStringException& e) {
422  NCBI_THROW(CInputException, eInvalidInput,
423  "Invalid input for filtering parameters");
424  }
425  }
426 
427  int filter_dbs = 0;
428 
429  if (args.Exist(kArgFilteringDb) && args[kArgFilteringDb]) {
430  opt.SetRepeatFilteringDB(args[kArgFilteringDb].AsString().c_str());
431  filter_dbs++;
432  }
433 
434  if (args.Exist(kArgWindowMaskerTaxId) &&
435  args[kArgWindowMaskerTaxId]) {
436 
438  (args[kArgWindowMaskerTaxId].AsInteger());
439 
440  filter_dbs++;
441  }
442 
443  if (args.Exist(kArgWindowMaskerDatabase) &&
444  args[kArgWindowMaskerDatabase]) {
445  const string& stat_file = args[kArgWindowMaskerDatabase].AsString();
450  string msg("Only optimized binary windowmasker stat files are supported");
451  NCBI_THROW(CInputException, eInvalidInput, msg);
452  }
453 
454  opt.SetWindowMaskerDatabase(stat_file.c_str());
455  filter_dbs++;
456  }
457 
458  if (filter_dbs > 1) {
459  string msg =
460  string("Please specify at most one of ") + kArgFilteringDb + ", " +
462 
463  NCBI_THROW(CInputException, eInvalidInput, msg);
464  }
465 }
466 
467 void
469 {
470  arg_desc.SetCurrentGroup("Extension options");
471  // 2-hit wordfinder window size
472  arg_desc.AddOptionalKey(kArgWindowSize, "int_value",
473  "Multiple hits window size, use 0 to specify "
474  "1-hit algorithm",
476  arg_desc.SetConstraint(kArgWindowSize,
478  arg_desc.SetCurrentGroup("");
479 }
480 
481 void
483 {
484  if (args[kArgWindowSize]) {
485  opt.SetWindowSize(args[kArgWindowSize].AsInteger());
486  } else {
487  int window = -1;
489  opt.GetMatrixName(),
490  &window);
491  if (window != -1) {
492  opt.SetWindowSize(window);
493  }
494  }
495 }
496 
497 void
499 {
500  arg_desc.SetCurrentGroup("Extension options");
501  // 2-hit wordfinder off diagonal range
502  arg_desc.AddDefaultKey(kArgOffDiagonalRange, "int_value",
503  "Number of off-diagonals to search for the 2nd hit, "
504  "use 0 to turn off",
509  arg_desc.SetCurrentGroup("");
510 }
511 
512 void
514 {
515  if (args[kArgOffDiagonalRange]) {
516  opt.SetOffDiagonalRange(args[kArgOffDiagonalRange].AsInteger());
517  } else {
518  opt.SetOffDiagonalRange(0);
519  }
520 }
521 
522 // Options specific to rmblastn -RMH-
523 void
525 {
526  arg_desc.SetCurrentGroup("General search options");
527 
528  arg_desc.AddDefaultKey(kArgMatrixName, "matrix_name",
529  "Scoring matrix name",
531  string(""));
532 
533  arg_desc.AddFlag(kArgComplexityAdj,
534  "Use complexity adjusted scoring",
535  true);
536 
537 
538  arg_desc.AddDefaultKey(kArgMaskLevel, "int_value",
539  "Masklevel - percentage overlap allowed per "
540  "query domain [0-101]",
543  arg_desc.SetConstraint(kArgMaskLevel,
545 
546  arg_desc.SetCurrentGroup("");
547 }
548 
549 // Options specific to rmblastn -RMH-
550 void
552 {
553  if (args[kArgMatrixName]) {
554  opt.SetMatrixName(args[kArgMatrixName].AsString().c_str());
555  }
556 
558 
559  if (args[kArgMaskLevel]) {
560  opt.SetMaskLevel(args[kArgMaskLevel].AsInteger());
561  }
562 
563  if (args[kArgMinRawGappedScore]) {
564  opt.SetCutoffScore(args[kArgMinRawGappedScore].AsInteger());
565  }else if (args[kArgUngappedXDropoff]) {
566  opt.SetCutoffScore(args[kArgUngappedXDropoff].AsInteger());
567  }
568 }
569 
570 void
572 {
573  arg_desc.SetCurrentGroup("General search options");
574  // lookup table word score threshold
575  arg_desc.AddOptionalKey(kArgWordScoreThreshold, "float_value",
576  "Minimum word score such that the word is added to the "
577  "BLAST lookup table",
581  arg_desc.SetCurrentGroup("");
582 }
583 
584 static bool
585 s_IsDefaultWordThreshold(EProgram program, double threshold)
586 {
587  int word_threshold = static_cast<int>(threshold);
588  bool retval = true;
589  if (program == eBlastp &&
590  word_threshold != BLAST_WORD_THRESHOLD_BLASTP) {
591  retval = false;
592  } else if (program == eBlastx &&
593  word_threshold != BLAST_WORD_THRESHOLD_BLASTX) {
594  retval = false;
595  } else if (program == eTblastn &&
596  word_threshold != BLAST_WORD_THRESHOLD_TBLASTN) {
597  retval = false;
598  }
599  return retval;
600 }
601 
602 void
604  CBlastOptions& opt)
605 {
606  if (args[kArgWordScoreThreshold]) {
607  opt.SetWordThreshold(args[kArgWordScoreThreshold].AsDouble());
608  } else if (s_IsDefaultWordThreshold(opt.GetProgram(),
609  opt.GetWordThreshold())) {
610  double threshold = -1;
612  opt.GetMatrixName(),
613  &threshold);
614  if (threshold != -1) {
615  opt.SetWordThreshold(threshold);
616  }
617  }
618 }
619 
620 void
622 {
623  arg_desc.SetCurrentGroup("General search options");
624  arg_desc.AddOptionalKey(kArgMatrixName, "matrix_name",
625  "Scoring matrix name (normally BLOSUM62)",
627  arg_desc.SetCurrentGroup("");
628 }
629 
630 void
632 {
633  if (args[kArgMatrixName]) {
634  opt.SetMatrixName(args[kArgMatrixName].AsString().c_str());
635  }
636 }
637 
638 void
640 {
641  // TLM arg_desc.SetCurrentGroup("Nucleotide scoring options");
642 
643  arg_desc.SetCurrentGroup("General search options");
644  // blastn mismatch penalty
645  arg_desc.AddOptionalKey(kArgMismatch, "penalty",
646  "Penalty for a nucleotide mismatch",
648  arg_desc.SetConstraint(kArgMismatch,
650 
651  // blastn match reward
652  arg_desc.AddOptionalKey(kArgMatch, "reward",
653  "Reward for a nucleotide match",
655  arg_desc.SetConstraint(kArgMatch,
657 
658 
659  arg_desc.SetCurrentGroup("Extension options");
661  "Use non-greedy dynamic programming extension",
662  true);
663 
664  arg_desc.SetCurrentGroup("");
665 }
666 
667 void
669  CBlastOptions& options)
670 {
671  if (cmd_line_args.Exist(kArgMismatch) && cmd_line_args[kArgMismatch]) {
672  options.SetMismatchPenalty(cmd_line_args[kArgMismatch].AsInteger());
673  }
674  if (cmd_line_args.Exist(kArgMatch) && cmd_line_args[kArgMatch]) {
675  options.SetMatchReward(cmd_line_args[kArgMatch].AsInteger());
676  }
677 
678  if (cmd_line_args.Exist(kArgNoGreedyExtension) &&
679  cmd_line_args[kArgNoGreedyExtension]) {
682  }
683 }
684 
685 /// Template type value "coding"; compared against the kArgDMBTemplateType argument and mapped to eMBWordCoding
686 const char* kTemplType_Coding = "coding";
687 /// Template type value "optimal"; compared against the kArgDMBTemplateType argument and mapped to eMBWordOptimal
688 const char* kTemplType_Optimal = "optimal";
689 /// Template type value "coding_and_optimal"; compared against the kArgDMBTemplateType argument and mapped to eMBWordTwoTemplates
690 const char* kTemplType_CodingAndOptimal = "coding_and_optimal";
691 
692 void
694 {
695  arg_desc.SetCurrentGroup("Extension options");
696  // FIXME: this can be applied to any program, but since it was only offered
697  // in megablast, we're putting it here
698  arg_desc.AddOptionalKey(kArgMinRawGappedScore, "int_value",
699  "Minimum raw gapped score to keep an alignment "
700  "in the preliminary gapped and traceback stages",
702 
703  arg_desc.SetCurrentGroup("Discontiguous MegaBLAST options");
704 
705  arg_desc.AddOptionalKey(kArgDMBTemplateType, "type",
706  "Discontiguous MegaBLAST template type",
715 
716  arg_desc.AddOptionalKey(kArgDMBTemplateLength, "int_value",
717  "Discontiguous MegaBLAST template length",
719  set<int> allowed_values;
720  allowed_values.insert(16);
721  allowed_values.insert(18);
722  allowed_values.insert(21);
724  new CArgAllowIntegerSet(allowed_values));
728 
729  arg_desc.SetCurrentGroup("");
730 }
731 
732 void
734  CBlastOptions& options)
735 {
736  if (args[kArgMinRawGappedScore]) {
737  options.SetCutoffScore(args[kArgMinRawGappedScore].AsInteger());
738  }
739 
740  if (args[kArgDMBTemplateType]) {
741  const string& type = args[kArgDMBTemplateType].AsString();
742  EDiscWordType temp_type = eMBWordCoding;
743 
744  if (type == kTemplType_Coding) {
745  temp_type = eMBWordCoding;
746  } else if (type == kTemplType_Optimal) {
747  temp_type = eMBWordOptimal;
748  } else if (type == kTemplType_CodingAndOptimal) {
749  temp_type = eMBWordTwoTemplates;
750  } else {
751  abort();
752  }
753  options.SetMBTemplateType(static_cast<unsigned char>(temp_type));
754  }
755 
756  if (args[kArgDMBTemplateLength]) {
757  unsigned char tlen =
758  static_cast<unsigned char>(args[kArgDMBTemplateLength].AsInteger());
759  options.SetMBTemplateLength(tlen);
760  }
761 
762  // FIXME: should the window size be adjusted if this is set?
763 }
764 
765 void
767 {
768  arg_desc.SetCurrentGroup("General search options");
769  // composition based statistics, keep in sync with ECompoAdjustModes
770  // documentation in composition_constants.h
771 
772  string zero_opt = !m_ZeroOptDescr.empty() ?
773  (string)" 0 or F or f: " + m_ZeroOptDescr + "\n" :
774  " 0 or F or f: No composition-based statistics\n";
775 
776  string one_opt_insrt = m_Is2and3Supported ? "" : " or T or t";
777 
778  string more_opts = m_Is2and3Supported ?
779  " 2 or T or t : Composition-based score adjustment as in "
780  "Bioinformatics 21:902-911,\n"
781  " 2005, conditioned on sequence properties\n"
782  " 3: Composition-based score adjustment as in "
783  "Bioinformatics 21:902-911,\n"
784  " 2005, unconditionally\n" : "";
785 
786  string legend = (string)"Use composition-based statistics:\n"
787  " D or d: default (equivalent to " + m_DefaultOpt + " )\n"
788  + zero_opt
789  + " 1" + one_opt_insrt + ": Composition-based statistics "
790  "as in NAR 29:2994-3005, 2001\n"
791  + more_opts;
792 
793  arg_desc.AddDefaultKey(kArgCompBasedStats, "compo", legend,
795 
796 
797  arg_desc.SetCurrentGroup("Miscellaneous options");
798  // Use Smith-Waterman algorithm in traceback stage
799  // FIXME: available only for gapped blastp/tblastn, and with
800  // composition-based statistics
801  arg_desc.AddFlag(kArgUseSWTraceback,
802  "Compute locally optimal Smith-Waterman alignments?",
803  true);
804  arg_desc.SetCurrentGroup("");
805 }
806 
807 /**
808  * @brief Auxiliary function to set the composition based statistics and smith
809  * waterman options
810  *
811  * @param opt BLAST options object [in|out]
812  * @param comp_stat_string command line value for composition based statistics
813  * [in]
814  * @param smith_waterman_value command line value for determining the use of
815  * the smith-waterman algorithm [in]
816  * @param ungapped pointer to the value which determines whether the search
817  * should be ungapped or not. It is NULL if ungapped searches are not
818  * applicable
819  * @param is_deltablast is program deltablast [in]
820  */
821 static void
823  const string& comp_stat_string,
824  bool smith_waterman_value,
825  bool* ungapped)
826 {
827  const EProgram program = opt.GetProgram();
828  if (program == eBlastp || program == eTblastn ||
829  program == ePSIBlast || program == ePSITblastn ||
830  program == eRPSBlast || program == eRPSTblastn ||
831  program == eBlastx || program == eDeltaBlast) {
832 
834 
835  switch (comp_stat_string[0]) {
836  case '0': case 'F': case 'f':
837  compo_mode = eNoCompositionBasedStats;
838  break;
839  case '1':
840  compo_mode = eCompositionBasedStats;
841  break;
842  case 'D': case 'd':
843  if ((program == eRPSBlast) || (program == eRPSTblastn)) {
844  compo_mode = eNoCompositionBasedStats;
845  }
846  else if (program == eDeltaBlast) {
847  compo_mode = eCompositionBasedStats;
848  }
849  else {
850  compo_mode = eCompositionMatrixAdjust;
851  }
852  break;
853  case '2':
854  compo_mode = eCompositionMatrixAdjust;
855  break;
856  case '3':
857  compo_mode = eCompoForceFullMatrixAdjust;
858  break;
859  case 'T': case 't':
860  compo_mode = (program == eRPSBlast || program == eRPSTblastn || program == eDeltaBlast) ?
862  break;
863  }
864 
865  if(program == ePSITblastn) {
866  compo_mode = eNoCompositionBasedStats;
867  }
868 
869  if (ungapped && *ungapped && compo_mode != eNoCompositionBasedStats) {
870  NCBI_THROW(CInputException, eInvalidInput,
871  "Composition-adjusted searched are not supported with "
872  "an ungapped search, please add -comp_based_stats F or "
873  "do a gapped search");
874  }
875 
876  opt.SetCompositionBasedStats(compo_mode);
877  if (program == eBlastp &&
878  compo_mode != eNoCompositionBasedStats &&
879  tolower(comp_stat_string[1]) == 'u') {
880  opt.SetUnifiedP(1);
881  }
882  opt.SetSmithWatermanMode(smith_waterman_value);
883  }
884 }
885 
886 void
888  CBlastOptions& opt)
889 {
890  if (args[kArgCompBasedStats]) {
891  unique_ptr<bool> ungapped(args.Exist(kArgUngapped)
892  ? new bool(args[kArgUngapped]) : 0);
894  args[kArgCompBasedStats].AsString(),
895  args[kArgUseSWTraceback],
896  ungapped.get());
897  }
898 
899 }
900 
901 void
903 {
904  // perform gapped search
905 #if 0
906  arg_desc.AddOptionalKey(ARG_GAPPED, "gapped",
907  "Perform gapped alignment (default T, but "
908  "not available for tblastx)",
911  arg_desc.AddAlias("-gapped", ARG_GAPPED);
912 #endif
913  arg_desc.SetCurrentGroup("Extension options");
914  arg_desc.AddFlag(kArgUngapped, "Perform ungapped alignment only?", true);
915  arg_desc.SetCurrentGroup("");
916 }
917 
918 void
920 {
921 #if 0
922  if (args[ARG_GAPPED] && options.GetProgram() != eTblastx) {
923  options.SetGappedMode(args[ARG_GAPPED].AsBoolean());
924  }
925 #endif
926  options.SetGappedMode( !args[kArgUngapped] );
927 }
928 
929 void
931 {
932  arg_desc.SetCurrentGroup("General search options");
933  // largest intron length
934  arg_desc.AddDefaultKey(kArgMaxIntronLength, "length",
935  "Length of the largest intron allowed in a translated "
936  "nucleotide sequence when linking multiple distinct "
937  "alignments",
942  arg_desc.SetCurrentGroup("");
943 }
944 
945 void
947  CBlastOptions& opt)
948 {
949  if ( !args[kArgMaxIntronLength] ) {
950  return;
951  }
952 
953  // sum statistics are defauled to be on unless a cmdline option is set
954  opt.SetLongestIntronLength(args[kArgMaxIntronLength].AsInteger());
955 
956 }
957 
958 void
960 {
961  arg_desc.SetCurrentGroup("General search options");
962  // applicable in blastx/tblastn, off by default
963  arg_desc.AddOptionalKey(kArgFrameShiftPenalty, "frameshift",
964  "Frame shift penalty (for use with out-of-frame "
965  "gapped alignment in blastx or tblastn, default "
966  "ignored)",
971  arg_desc.SetCurrentGroup("");
972 }
973 
974 void
976  CBlastOptions& opt)
977 {
978  if (args[kArgFrameShiftPenalty]) {
979  if (args[kArgCompBasedStats]) {
980  string cbs = args[kArgCompBasedStats].AsString();
981 
982  if ((cbs[0] != '0' )&& (cbs[0] != 'F') && (cbs[0] != 'f')) {
983  NCBI_THROW(CInputException, eInvalidInput,
984  "Composition-adjusted searches are not supported with "
985  "Out-Of-Frame option, please add -comp_based_stats F ");
986  }
987  }
988 
989  opt.SetOutOfFrameMode();
990  opt.SetFrameShiftPenalty(args[kArgFrameShiftPenalty].AsInteger());
991  }
992 }
993 
994 /// Auxiliary class to validate the genetic code input
996 {
997 protected:
998  /// Overloaded method from CArgAllow
999  virtual bool Verify(const string& value) const {
1000  static int gcs[] = {1,2,3,4,5,6,9,10,11,12,13,14,15,16,21,22,23,24,25,26,27,28,29,30,31,33};
1001  static const set<int> genetic_codes(gcs, gcs+sizeof(gcs)/sizeof(*gcs));
1002  const int val = NStr::StringToInt(value);
1003  return (genetic_codes.find(val) != genetic_codes.end());
1004  }
1005 
1006  /// Overloaded method from CArgAllow
1007  virtual string GetUsage(void) const {
1008  return "values between: 1-6, 9-16, 21-31, 33";
1009  }
1010 };
1011 
1012 void
1014 {
1015  if (m_Target == eQuery) {
1016  arg_desc.SetCurrentGroup("Input query options");
1017  // query genetic code
1018  arg_desc.AddDefaultKey(kArgQueryGeneticCode, "int_value",
1019  "Genetic code to use to translate query (see https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=cgencodes for details)\n",
1024  } else {
1025  arg_desc.SetCurrentGroup("General search options");
1026  // DB genetic code
1027  arg_desc.AddDefaultKey(kArgDbGeneticCode, "int_value",
1028  "Genetic code to use to translate "
1029  "database/subjects (see user manual for details)\n",
1034 
1035  }
1036  arg_desc.SetCurrentGroup("");
1037 }
1038 
1039 void
1041  CBlastOptions& opt)
1042 {
1043  const EProgram program = opt.GetProgram();
1044 
1045  if (m_Target == eQuery && args[kArgQueryGeneticCode]) {
1046  opt.SetQueryGeneticCode(args[kArgQueryGeneticCode].AsInteger());
1047  }
1048 
1049  if (m_Target == eDatabase && args[kArgDbGeneticCode] &&
1050  (program == eTblastn || program == eTblastx) ) {
1051  opt.SetDbGeneticCode(args[kArgDbGeneticCode].AsInteger());
1052  }
1053 }
1054 
1055 void
1057 {
1058  arg_desc.SetCurrentGroup("Extension options");
1059 
1060  const double default_value = m_QueryIsProtein
1062  arg_desc.AddDefaultKey(kArgGapTrigger, "float_value",
1063  "Number of bits to trigger gapping",
1065  NStr::DoubleToString(default_value));
1066  arg_desc.SetCurrentGroup("");
1067 }
1068 
1069 void
1071  CBlastOptions& opt)
1072 {
1073  if (args[kArgGapTrigger]) {
1074  opt.SetGapTrigger(args[kArgGapTrigger].AsDouble());
1075  }
1076 }
1077 
1078 void
1080 {
1081  arg_desc.SetCurrentGroup("PSSM engine options");
1082 
1083  // Pseudo count
1084  arg_desc.AddDefaultKey(kArgPSIPseudocount, "pseudocount",
1085  "Pseudo-count value used when constructing PSSM",
1088 
1089  if (m_IsDeltaBlast) {
1090  arg_desc.AddDefaultKey(kArgDomainInclusionEThreshold, "ethresh",
1091  "E-value inclusion threshold for alignments "
1092  "with conserved domains",
1095  }
1096 
1097  // Evalue inclusion threshold
1098  arg_desc.AddDefaultKey(kArgPSIInclusionEThreshold, "ethresh",
1099  "E-value inclusion threshold for pairwise alignments",
1102 
1103  arg_desc.SetCurrentGroup("");
1104 }
1105 
1106 void
1108  CBlastOptions& opt)
1109 {
1110  if (args[kArgPSIPseudocount]) {
1111  opt.SetPseudoCount(args[kArgPSIPseudocount].AsInteger());
1112  }
1113 
1114  if (args[kArgPSIInclusionEThreshold]) {
1115  opt.SetInclusionThreshold(args[kArgPSIInclusionEThreshold].AsDouble());
1116  }
1117 
1119  && args[kArgDomainInclusionEThreshold]) {
1120 
1122  args[kArgDomainInclusionEThreshold].AsDouble());
1123  }
1124 }
1125 
1126 void
1128 {
1129 
1130  if (m_DbTarget == eNucleotideDb) {
1131  arg_desc.SetCurrentGroup("PSI-TBLASTN options");
1132 
1133  // PSI-tblastn checkpoint
1134  arg_desc.AddOptionalKey(kArgPSIInputChkPntFile, "psi_chkpt_file",
1135  "PSI-TBLASTN checkpoint file",
1139  kArgRemote);
1140  } else {
1141  arg_desc.SetCurrentGroup("PSI-BLAST options");
1142 
1143  // Number of iterations
1144  arg_desc.AddDefaultKey(kArgPSINumIterations, "int_value",
1145  "Number of iterations to perform (0 means run "
1146  "until convergence)", CArgDescriptions::eInteger,
1152  kArgRemote);
1153  // checkpoint file
1154  arg_desc.AddOptionalKey(kArgPSIOutputChkPntFile, "checkpoint_file",
1155 
1156  "File name to store checkpoint file",
1158  // ASCII matrix file
1159  arg_desc.AddOptionalKey(kArgAsciiPssmOutputFile, "ascii_mtx_file",
1160  "File name to store ASCII version of PSSM",
1162 
1163  arg_desc.AddFlag(kArgSaveLastPssm, "Save PSSM after the last database "
1164  "search");
1165  arg_desc.AddFlag(kArgSaveAllPssms, "Save PSSM after each iteration "
1166  "(file name is given in -save_pssm or "
1167  "-save_ascii_pssm options)");
1168 
1169  if (!m_IsDeltaBlast) {
1170  vector<string> msa_exclusions;
1171  msa_exclusions.push_back(kArgPSIInputChkPntFile);
1172  msa_exclusions.push_back(kArgQuery);
1173  msa_exclusions.push_back(kArgQueryLocation);
1174  // pattern and MSA is not supported
1175  msa_exclusions.push_back(kArgPHIPatternFile);
1176  arg_desc.SetCurrentGroup("");
1177  arg_desc.SetCurrentGroup("");
1178 
1179  // MSA restart file
1180  arg_desc.SetCurrentGroup("PSSM engine options");
1181  arg_desc.AddOptionalKey(kArgMSAInputFile, "align_restart",
1182  "File name of multiple sequence alignment to "
1183  "restart PSI-BLAST",
1185  ITERATE(vector<string>, exclusion, msa_exclusions) {
1188  *exclusion);
1189  }
1190 
1191  arg_desc.AddOptionalKey(kArgMSAMasterIndex, "index",
1192  "Ordinal number (1-based index) of the sequence"
1193  " to use as a master in the multiple sequence "
1194  "alignment. If not provided, the first sequence"
1195  " in the multiple sequence alignment will be "
1196  "used", CArgDescriptions::eInteger);
1199  ITERATE(vector<string>, exclusion, msa_exclusions) {
1202  *exclusion);
1203  }
1210 
1211  arg_desc.AddFlag(kArgIgnoreMsaMaster,
1212  "Ignore the master sequence when creating PSSM", true);
1213 
1214  vector<string> ignore_pssm_master_exclusions;
1215  ignore_pssm_master_exclusions.push_back(kArgMSAMasterIndex);
1216  ignore_pssm_master_exclusions.push_back(kArgPSIInputChkPntFile);
1217  ignore_pssm_master_exclusions.push_back(kArgQuery);
1218  ignore_pssm_master_exclusions.push_back(kArgQueryLocation);
1219  ITERATE(vector<string>, exclusion, msa_exclusions) {
1222  *exclusion);
1223  }
1227 
1228  // PSI-BLAST checkpoint
1229  arg_desc.AddOptionalKey(kArgPSIInputChkPntFile, "psi_chkpt_file",
1230  "PSI-BLAST checkpoint file",
1234  kArgRemote);
1235  }
1236  }
1237 
1238  if (!m_IsDeltaBlast) {
1241  kArgQuery);
1245  }
1246  arg_desc.SetCurrentGroup("");
1247 }
1248 
// Builds a PSSM from the alignment read from input_stream and returns the
// result of CPssmEngine::Run().
// NOTE(review): the opening lines of this definition (return type, function
// name and the input_stream parameter) are missing from this listing; the
// name x_CreatePssmFromMsa is taken from the call site in this file.
//   opt                   - supplies the matrix name and the gap costs
//   save_ascii_pssm       - forwarded to PSIDiagnosticsRequestNewEx()
//   msa_master_idx        - passed as the last CPsiBlastInputClustalW argument
//   ignore_pssm_tmplt_seq - copied into nsg_compatibility_mode
1251  CBlastOptions& opt, bool save_ascii_pssm,
1252  unsigned int msa_master_idx,
1253  bool ignore_pssm_tmplt_seq)
1254 {
1255  // FIXME get these from CBlastOptions
1256  CPSIBlastOptions psiblast_opts;
1257  PSIBlastOptionsNew(&psiblast_opts);
1258  psiblast_opts->nsg_compatibility_mode = ignore_pssm_tmplt_seq;
1259 
      // The PSSM input source combines the stream, the PSI-BLAST options, the
      // diagnostics request and the scoring parameters taken from opt.
1260  CPSIDiagnosticsRequest diags(PSIDiagnosticsRequestNewEx(save_ascii_pssm));
1261  CPsiBlastInputClustalW pssm_input(input_stream, *psiblast_opts,
1262  opt.GetMatrixName(), diags, NULL, 0,
1263  opt.GetGapOpeningCost(),
1264  opt.GetGapExtensionCost(),
1265  msa_master_idx);
1266  CPssmEngine pssm_engine(&pssm_input);
1267  return pssm_engine.Run();
1268 }
1269 
// Reads the PSI-BLAST related command line arguments from args.
// For a protein database target it picks up the iteration count, rejects
// the save-PSSM flags when no PSSM output file was given, and builds m_Pssm
// from a multiple sequence alignment file when one was supplied. The final
// section deserialises a PSSM from the checkpoint input file.
// NOTE(review): the line naming this function and declaring args, and
// several statement lines below, are missing from this listing.
// Throws CInputException (eInvalidInput) on inconsistent options or on an
// unreadable PSSM.
1270 void
1272  CBlastOptions& opt)
1273 {
1274  if (m_DbTarget == eProteinDb) {
1275  if (args[kArgPSINumIterations]) {
      // The command line value is applied only while m_NumIterations is 1.
1276  if(m_NumIterations == 1)
1277  m_NumIterations = args[kArgPSINumIterations].AsInteger();
1278  }
1279 
      // The save-last-PSSM flag needs at least one of the two PSSM output
      // file arguments.
1280  if (args.Exist(kArgSaveLastPssm) && args[kArgSaveLastPssm] &&
1281  (!args.Exist(kArgPSIOutputChkPntFile) ||
1282  !args[kArgPSIOutputChkPntFile]) &&
1283  (!args.Exist(kArgAsciiPssmOutputFile) ||
1284  !args[kArgAsciiPssmOutputFile])) {
1285 
1286  NCBI_THROW(CInputException, eInvalidInput, kArgSaveLastPssm +
1287  " option requires " + kArgPSIOutputChkPntFile + " or " +
1289  }
1290 
      // Same requirement for the save-all-PSSMs flag.
1291  if (args.Exist(kArgSaveAllPssms) && args[kArgSaveAllPssms] &&
1292  (!args.Exist(kArgPSIOutputChkPntFile) ||
1293  !args[kArgPSIOutputChkPntFile]) &&
1294  (!args.Exist(kArgAsciiPssmOutputFile) ||
1295  !args[kArgAsciiPssmOutputFile])) {
1296 
1297  NCBI_THROW(CInputException, eInvalidInput, kArgSaveAllPssms +
1298  " option requires " + kArgPSIOutputChkPntFile + " or " +
1300  }
1301 
1302  const bool kSaveAllPssms
1303  = args.Exist(kArgSaveAllPssms) && args[kArgSaveAllPssms];
1304  if (args.Exist(kArgPSIOutputChkPntFile) &&
1305  args[kArgPSIOutputChkPntFile]) {
      // NOTE(review): the start of this statement is missing from the listing.
1308  (args[kArgPSIOutputChkPntFile].AsString(), kSaveAllPssms));
1309  }
      // NOTE(review): unlike the checks above, this lookup is not guarded by
      // args.Exist(); confirm the argument is always registered on this path.
1310  const bool kSaveAsciiPssm = args[kArgAsciiPssmOutputFile];
1311  if (kSaveAsciiPssm) {
1314  (args[kArgAsciiPssmOutputFile].AsString(), kSaveAllPssms));
1315  }
1316  if (args.Exist(kArgMSAInputFile) && args[kArgMSAInputFile]) {
1317  CNcbiIstream& in = args[kArgMSAInputFile].AsInputFile();
1318  unsigned int msa_master_idx = 0;
1319  if (args[kArgMSAMasterIndex]) {
      // The command line index is 1-based; it is shifted to 0-based here.
      // NOTE(review): a value below 1 would wrap in the unsigned variable;
      // presumably a constraint registered elsewhere rules that out - confirm.
1320  msa_master_idx = args[kArgMSAMasterIndex].AsInteger() - 1;
1321  }
1322  m_Pssm = x_CreatePssmFromMsa(in, opt, kSaveAsciiPssm,
1323  msa_master_idx,
1324  args[kArgIgnoreMsaMaster]);
1325  }
1326  if (!m_IsDeltaBlast) {
1328  }
1329 
1330  if (args.Exist(kArgSaveLastPssm) && args[kArgSaveLastPssm]) {
1331  m_SaveLastPssm = true;
1332  }
1333  }
1334 
      // NOTE(review): the condition opening this section and the statement
      // that allocates m_Pssm are missing from the listing.
1336  CNcbiIstream& in = args[kArgPSIInputChkPntFile].AsInputFile();
1337  _ASSERT(m_Pssm.Empty());
1339  try {
      // The serialisation format is detected from the stream content; two of
      // the case labels are missing from the listing.
1340  switch (CFormatGuess().Format(in)) {
1342  in >> MSerial_AsnBinary >> *m_Pssm;
1343  break;
1345  in >> MSerial_AsnText >> *m_Pssm;
1346  break;
1347  case CFormatGuess::eXml:
1348  in >> MSerial_Xml >> *m_Pssm;
1349  break;
1350  default:
1351  NCBI_THROW(CInputException, eInvalidInput,
1352  "Unsupported format for PSSM");
1353  }
1354  } catch (const CSerialException&) {
      // Deserialisation failures are reported as invalid user input.
1355  string msg("Unrecognized format for PSSM in ");
1356  msg += args[kArgPSIInputChkPntFile].AsString() + " (must be ";
1357  msg += "PssmWithParameters)";
1358  NCBI_THROW(CInputException, eInvalidInput, msg);
1359  }
1360  _ASSERT(m_Pssm.NotEmpty());
1361  }
1362 }
1363 
// Registers the PHI-BLAST pattern file argument under the
// "PHI-BLAST options" group and then resets the current group.
// NOTE(review): the line naming this function and the tail of the
// AddOptionalKey call are missing from this listing.
1364 void
1366 {
1367  arg_desc.SetCurrentGroup("PHI-BLAST options");
1368 
1369  arg_desc.AddOptionalKey(kArgPHIPatternFile, "file",
1370  "File name containing pattern to search",
1375 
1376  arg_desc.SetCurrentGroup("");
1377 }
1378 
// Reads the PHI-BLAST pattern file, when given, and stores the pattern in
// opt via SetPHIPattern().
// The file is scanned line by line; the first two characters select the
// record type: "ID" lines set name and "PA" lines set pattern, both taken
// from the fifth character onwards. A later record overwrites an earlier one.
// Throws CInputException (eInvalidInput) when no non-empty pattern was read.
// NOTE(review): the line naming this function and part of the SetPHIPattern
// call are missing from this listing.
1379 void
1381  CBlastOptions& opt)
1382 {
1383  if (args.Exist(kArgPHIPatternFile) && args[kArgPHIPatternFile]) {
1384  CNcbiIstream& in = args[kArgPHIPatternFile].AsInputFile();
      // Clear any error state and rewind so the file is read from the start.
1385  in.clear();
1386  in.seekg(0);
1387  char buffer[4096];
1388  string line;
1389  string pattern;
1390  string name;
      // getline() with a fixed buffer stops the loop at end of file and also
      // on a line that does not fit into the buffer.
1391  while (in.getline(buffer, 4096)) {
1392  line = buffer;
1393  string ltype = line.substr(0, 2);
      // NOTE(review): substr(4) throws std::out_of_range when an ID or PA
      // line is shorter than four characters; consider guarding on
      // line.size() before taking the substring.
1394  if (ltype == "ID")
1395  name = line.substr(4);
1396  else if (ltype == "PA")
1397  pattern = line.substr(4);
1398  }
      // name is not referenced in the lines visible here.
1399  if (!pattern.empty())
1400  opt.SetPHIPattern(pattern.c_str(),
1402  ? true : false));
1403  else
1404  NCBI_THROW(CInputException, eInvalidInput,
1405  "PHI pattern not read");
1406  }
1407 }
1408 
// Registers the three KBLASTP arguments (Jaccard distance threshold, minimal
// number of LSH matches, number of candidate sequences) as keys with default
// values under the "KBLASTP options" group.
// NOTE(review): the line naming this function and the tail of each
// AddDefaultKey call (value type and default) are missing from this listing.
1409 void
1411 {
1412  arg_desc.SetCurrentGroup("KBLASTP options");
1413  arg_desc.AddDefaultKey(kArgJDistance, "threshold", "Jaccard Distance",
1415  arg_desc.AddDefaultKey(kArgMinHits, "minhits", "minimal number of LSH matches",
1417  arg_desc.AddDefaultKey(kArgCandidateSeqs, "candidates", "Number of candidate sequences to process with BLAST",
1419 }
1420 
// Copies the KBLASTP argument values into m_JDistance, m_MinHits and
// m_CandidateSeqs. Each value is read whenever the argument is declared in
// args; opt is not referenced in the body.
// NOTE(review): the line naming this function and declaring args is missing
// from this listing.
1421 void
1423  CBlastOptions& opt)
1424 {
1425  if (args.Exist(kArgJDistance))
1426  m_JDistance = args[kArgJDistance].AsDouble();
1427  if (args.Exist(kArgMinHits))
1428  m_MinHits = args[kArgMinHits].AsInteger();
1429  if (args.Exist(kArgCandidateSeqs))
1430  m_CandidateSeqs = args[kArgCandidateSeqs].AsInteger();
1431 }
1432 
1433 
// Registers the DELTA-BLAST arguments: the domain database name (string key
// defaulting to kDfltArgRpsDb) and the flag that shows domain hits.
// NOTE(review): the line naming this function and the start of the two
// trailing calls (the ones ending in kArgRemote and kArgSubject) are missing
// from this listing.
1434 void
1436 {
1437  arg_desc.SetCurrentGroup("DELTA-BLAST options");
1438 
1439  arg_desc.AddDefaultKey(kArgRpsDb, "database_name", "BLAST domain "
1440  "database name", CArgDescriptions::eString,
1441  kDfltArgRpsDb);
1442 
1443  arg_desc.AddFlag(kArgShowDomainHits, "Show domain hits");
1445  kArgRemote);
1447  kArgSubject);
1448 }
1449 
// Creates the domain search database object (m_DomainDb) from the database
// name given in kArgRpsDb and handles the show-domain-hits flag.
// NOTE(review): the line naming this function, the second CSearchDatabase
// constructor argument and the body of the final if are missing from this
// listing.
1450 void
1452  CBlastOptions& opt)
1453 {
1454  m_DomainDb.Reset(new CSearchDatabase(args[kArgRpsDb].AsString(),
1456 
1457  if (args.Exist(kArgShowDomainHits)) {
1459  }
1460 }
1461 
// Registers the read mapping arguments in two groups:
//   "Mapping options"         - score cutoff, maximum edit distance, spliced
//                               alignment switch and reference type
//   "Query filtering options" - lookup table filtering, database word count
//                               limit and lookup stride
// The reference type is restricted to "genome" or "transcriptome".
// NOTE(review): the line naming this function and the tails of several
// calls (value types and defaults) are missing from this listing.
1462 void
1464 {
1465 
1466  arg_desc.SetCurrentGroup("Mapping options");
1467  arg_desc.AddDefaultKey(kArgScore, "num", "Cutoff score for accepting "
1468  "alignments. Can be expressed as a number or a "
1469  "function of read length: "
1470  "L,b,a for a * length + b.\n"
1471  "Zero means that the cutoff score will be equal to:\n"
1472  "read length, if read length <= 20,\n"
1473  "20, if read length <= 30,\n"
1474  "read length - 10, if read length <= 50,\n"
1475  "40, otherwise.",
1477  arg_desc.AddOptionalKey(kArgMaxEditDist, "num", "Cutoff edit distance for "
1478  "accepting an alignment\nDefault = unlimited",
1480  arg_desc.AddDefaultKey(kArgSplice, "TF", "Search for spliced alignments",
1481  CArgDescriptions::eBoolean, "true");
1482  arg_desc.AddDefaultKey(kArgRefType, "type", "Type of the reference: "
1483  "genome or transcriptome",
1484  CArgDescriptions::eString, "genome");
1485  arg_desc.SetConstraint(kArgRefType,
1486  &(*new CArgAllow_Strings, "genome", "transcriptome"));
1487 
1488  arg_desc.SetCurrentGroup("Query filtering options");
1489  arg_desc.AddDefaultKey(kArgLimitLookup, "TF", "Remove word seeds with "
1490  "high frequency in the searched database",
1491  CArgDescriptions::eBoolean, "true");
1492  arg_desc.AddDefaultKey(kArgMaxDbWordCount, "num", "Words that appear more "
1493  "than this number of times in the database will be"
1494  " masked in the lookup table",
      // NOTE(review): the start of the constraint call below is missing from
      // the listing; it allows values between 2 and 255.
1498  new CArgAllowValuesBetween(2, 255, true));
1499  arg_desc.AddDefaultKey(kArgLookupStride, "num", "Number of words to skip "
1500  "after collecting one while creating a lookup table",
1502 
1503  arg_desc.SetCurrentGroup("");
1504 }
1505 
1506 
// Transfers the read mapping arguments from args into opt.
// The score cutoff is either a linear function of the query length, written
// as L,b,a, or a numerical constant. Edit distance, splicing, lookup
// filtering, database word count and lookup stride are copied when present.
// Throws CInputException (eInvalidInput) when the score cutoff cannot be
// parsed.
// NOTE(review): the line naming this function and declaring args, and the
// statement that parses the constant score, are missing from this listing.
1507 void
1509  CBlastOptions& opt)
1510 {
1511  if (args.Exist(kArgScore) && args[kArgScore]) {
1512 
1513  string s = args[kArgScore].AsString();
1514  // score cutoff may be defined as a linear function of query length:
1515  // L,0.0,0.6 ...
1516  if (s[0] == 'L') {
1517  list<string> tokens;
1518  NStr::Split(s, ",", tokens);
1519  vector<double> coeffs;
      // At least the leading L and two coefficients are required.
1520  if (tokens.size() < 3) {
1521  NCBI_THROW(CInputException, eInvalidInput,
1522  (string)"Incorrectly formatted score function: " +
1523  s + ". It should be of the form 'L,b,a' for ax + b,"
1524  "a, b must be numbers");
1525  }
      // Skip the leading L token; every remaining token is converted to a
      // double, so coeffs holds the values in the order they were written.
1526  auto it = tokens.begin();
1527  ++it;
1528  try {
1529  for (; it != tokens.end(); ++it) {
1530  coeffs.push_back(NStr::StringToDouble(*it));
1531  }
1532  }
1533  catch (CException&) {
1534  NCBI_THROW(CInputException, eInvalidInput,
1535  (string)"Incorrectly formatted score function: " +
1536  s + ". It should be of the form 'L,b,a' for ax + b,"
1537  " a, b must be real numbers");
1538  }
1539  opt.SetCutoffScoreCoeffs(coeffs);
1540  }
1541  else {
1542  // ... or a numerical constant
1543  try {
1545  }
1546  catch (CException&) {
1547  NCBI_THROW(CInputException, eInvalidInput,
1548  (string)"Incorrectly formatted score threshold: " +
1549  s + ". It must be either an integer or a linear "
1550  "function in the form: L,b,a for ax + b, a and b "
1551  "must be real numbers");
1552  }
1553  }
1554  }
1555 
1556  if (args.Exist(kArgMaxEditDist) && args[kArgMaxEditDist]) {
1557  opt.SetMaxEditDistance(args[kArgMaxEditDist].AsInteger());
1558  }
1559 
1560  if (args.Exist(kArgSplice) && args[kArgSplice]) {
1561  opt.SetSpliceAlignments(args[kArgSplice].AsBoolean());
1562  }
1563 
      // The reference type is only used below, to choose the lookup filter
      // setting when kArgLimitLookup is not available.
1564  string ref_type = "genome";
1565  if (args.Exist(kArgRefType) && args[kArgRefType]) {
1566  ref_type = args[kArgRefType].AsString();
1567  }
1568 
1569  if (args.Exist(kArgLimitLookup) && args[kArgLimitLookup]) {
1570  opt.SetLookupDbFilter(args[kArgLimitLookup].AsBoolean());
1571  }
1572  else {
1573  opt.SetLookupDbFilter(ref_type == "genome");
1574  }
1575 
1576  if (args.Exist(kArgMaxDbWordCount) && args[kArgMaxDbWordCount]) {
1577  opt.SetMaxDbWordCount(args[kArgMaxDbWordCount].AsInteger());
1578  }
1579 
1580  if (args.Exist(kArgLookupStride) && args[kArgLookupStride]) {
1581  opt.SetLookupTableStride(args[kArgLookupStride].AsInteger());
1582  }
1583 }
1584 
1585 
// Registers the Ig-BLAST command line arguments under the
// "Ig-BLAST options" group.
// Per-gene arguments (database name, number of alignments, seqidlist) are
// created in a loop by appending one letter of "VDJ" to a base name; only
// the first letter is used when m_IsProtein is set. Nucleotide-only
// arguments are added inside the !m_IsProtein sections.
// NOTE(review): the line naming this function and the tails of many calls
// (value types, defaults, constraints) are missing from this listing.
1586 void
1588 {
1589  arg_desc.SetCurrentGroup("Ig-BLAST options");
1590  const static char suffix[] = "VDJ";
      // Default number of alignments shown for each of the three gene types.
1591  const static int df_num_align[3] = {3,3,3};
1592  int num_genes = (m_IsProtein) ? 1 : 3;
1593 
1594 
1595  for (int gene=0; gene<num_genes; ++gene) {
1596  // Subject sequence input
1597  /* TODO disabled for now
1598  string arg_sub = kArgGLSubject;
1599  arg_sub.push_back(suffix[gene]);
1600  arg_desc.AddOptionalKey(arg_sub , "filename",
1601  "Germline subject sequence to align",
1602  CArgDescriptions::eInputFile);
1603  */
1604  // Germline database file name
1605  string arg_db = kArgGLDatabase;
1606  arg_db.push_back(suffix[gene]);
1607  arg_desc.AddOptionalKey(arg_db, "germline_database_name",
1608  "Germline database name",
1610  //arg_desc.SetDependency(arg_db, CArgDescriptions::eExcludes, arg_sub);
1611  // Number of alignments to show
1612  string arg_na = kArgGLNumAlign;
1613  arg_na.push_back(suffix[gene]);
1614  arg_desc.AddDefaultKey(arg_na, "int_value",
1615  "Number of Germline sequences to show alignments for",
1617  NStr::IntToString(df_num_align[gene]));
1618  //arg_desc.SetConstraint(arg_na,
1619  // new CArgAllowValuesBetween(0, 4));
1620  // Seqidlist
1621  arg_desc.AddOptionalKey(arg_db + "_seqidlist", "filename",
1622  "Restrict search of germline database to list of SeqIds's",
1624  }
1625 
      // Arguments that are registered only for nucleotide searches.
1626  if (!m_IsProtein) {
1627  arg_desc.AddDefaultKey(kArgCRegionNumAlign, "int_value",
1628  "Number of Germline sequences to show alignments for",
1630 
1631  arg_desc.AddOptionalKey(kArgCRegionDatabase, "constant_region_database_name",
1632  "C region database name",
1634 
1635  arg_desc.AddOptionalKey(kArgCustomInternalData, "filename",
1636  "custom internal data file for V region annotation",
1638 
1639  arg_desc.AddOptionalKey(kArgDFrameDefinitionFile, "filename",
1640  "D gene frame definition file",
1642 
1643  arg_desc.AddOptionalKey(kArgGLChainType, "filename",
1644  "File containing the coding frame start positions for sequences in germline J database",
1646 
1647  arg_desc.AddOptionalKey(kArgMinDMatch, "min_D_match",
1648  "Required minimal consecutive nucleotide base matches for D genes ",
1650  arg_desc.SetConstraint(kArgMinDMatch,
1652 
      // Mismatch penalties: V and J accept -4..0, D accepts -5..0.
1653  arg_desc.AddDefaultKey(kArgVPenalty, "V_penalty",
1654  "Penalty for a nucleotide mismatch in V gene",
1656  arg_desc.SetConstraint(kArgVPenalty,
1657  new CArgAllowValuesBetween(-4, 0));
1658 
1659 
1660  arg_desc.AddDefaultKey(kArgDPenalty, "D_penalty",
1661  "Penalty for a nucleotide mismatch in D gene",
1663 
1664  arg_desc.SetConstraint(kArgDPenalty,
1665  new CArgAllowValuesBetween(-5, 0));
1666 
1667  arg_desc.AddDefaultKey(kArgJPenalty, "J_penalty",
1668  "Penalty for a nucleotide mismatch in J gene",
1670 
1671  arg_desc.SetConstraint(kArgJPenalty,
1672  new CArgAllowValuesBetween(-4, 0));
1673 
1674  arg_desc.AddDefaultKey(kArgNumClonotype, "num_clonotype",
1675  "Number of top clonotypes to show ",
1679 
1680  arg_desc.AddOptionalKey(kArgClonotypeFile, "clonotype_out",
1681  "Output file name for clonotype info",
1683 
1684  arg_desc.AddFlag(kArgDetectOverlap, "Allow V(D)J genes to overlap. This option is active only when D_penalty and J_penalty are set to -4 and -3, respectively", true);
1685 
1686 
1687  }
1688 
      // Arguments registered for both protein and nucleotide searches.
1689  arg_desc.AddDefaultKey(kArgGLOrigin, "germline_origin",
1690  "The organism for your query sequence. Supported organisms include human, mouse, rat, rabbit and rhesus_monkey for Ig and human and mouse for TCR. Custom organism is also supported but you need to supply your own germline annotations (see IgBLAST web site for details)",
1691  CArgDescriptions::eString, "human");
1692 
1693  arg_desc.AddDefaultKey(kArgGLDomainSystem, "domain_system",
1694  "Domain system to be used for segment annotation",
1695  CArgDescriptions::eString, "imgt");
1696  arg_desc.SetConstraint(kArgGLDomainSystem, &(*new CArgAllow_Strings, "kabat", "imgt"));
1697 
1698  arg_desc.AddDefaultKey(kArgIgSeqType, "sequence_type",
1699  "Specify Ig or T cell receptor sequence",
1701  arg_desc.SetConstraint(kArgIgSeqType, &(*new CArgAllow_Strings, "Ig", "TCR"));
1702 
1703 
1704  arg_desc.AddFlag(kArgGLFocusV, "Should the search only be for V segment (effective only for non-germline database search using -db option)?", true);
1705 
1706  arg_desc.AddFlag(kArgExtendAlign5end, "Extend V gene alignment at 5' end", true);
1707 
1708  arg_desc.AddFlag(kArgExtendAlign3end, "Extend J gene alignment at 3' end", true);
1709 
1710  arg_desc.AddDefaultKey(kArgMinVLength, "Min_V_Length",
1711  "Minimal required V gene length",
1713 
1714  arg_desc.SetConstraint(kArgMinVLength,
1716 
1717  if (! m_IsProtein) {
1718  arg_desc.AddDefaultKey(kArgMinJLength, "Min_J_Length",
1719  "Minimal required J gene length",
1721 
1722  arg_desc.SetConstraint(kArgMinJLength,
1724  }
1725 
1726  if (! m_IsProtein) {
1727  arg_desc.AddFlag(kArgTranslate, "Show translated alignments", true);
1728  }
1729 
1730  arg_desc.SetCurrentGroup("");
1731 }
1732 
// Returns the data loader name that CBlastDbDataLoader derives from the
// given database handle.
// NOTE(review): the statements between the opening comment and the param
// declaration, and the tail of the _TRACE call, are missing from this
// listing; judging by the function name and the trace text they register
// the loader with the object manager - confirm against the full source.
1733 static string s_RegisterOMDataLoader(CRef<CSeqDB> db_handle)
1734 { // the blast formatter requires that the database coexist in
1735  // the same scope with the query sequences
1740  CBlastDbDataLoader::SBlastDbParam param(db_handle);
1741  string retval(CBlastDbDataLoader::GetLoaderNameFromArgs(param));
1742  _TRACE("Registering " << retval << " at priority " <<
1744  return retval;
1745 }
1746 
// Fills m_IgOptions from the Ig-BLAST command line arguments and opens the
// germline databases.
// Visible steps: collect up to three candidate data directories in paths,
// copy scalar options, locate the auxiliary file and the internal_data
// directory by probing the candidate directories in order, open the default
// annotation database (slot 3 of m_Db), then set up one subject or database
// per V/D/J gene (slots 0..2) and the optional C region database (slot 4).
// Throws CInputException (eInvalidInput) when the annotation database
// cannot be opened.
// NOTE(review): this listing has lost many lines of this function,
// including the line naming it, the setup of paths[0], env, app, mol_type
// and m_IgOptions, and several statements inside the blocks below.
1747 void
1749  CBlastOptions& opts)
1750 {
1751  string paths[3];
      // paths[1]: directory named by the IGDATA environment variable.
1754  paths[1] = CDirEntry::NormalizePath(env.Get("IGDATA"), eFollowLinks);
      // paths[2]: IGDATA entry of the BLAST registry section when an
      // application object is available, otherwise a fallback.
1756  if (app) {
1757  const CNcbiRegistry& registry = app->GetConfig();
1758  paths[2] = CDirEntry::NormalizePath(registry.Get("BLAST","IGDATA"), eFollowLinks);
1759  } else {
1760 #if defined(NCBI_OS_DARWIN)
1761  paths[2] = "/usr/local/ncbi/igblast/data";
1762 #else
1763  paths[2] = paths[0];
1764 #endif
1765  }
1766 
1768 
1772 
1774  m_IgOptions->m_Origin = args[kArgGLOrigin].AsString();
1775  m_IgOptions->m_DomainSystem = args[kArgGLDomainSystem].AsString();
1776  m_IgOptions->m_FocusV = args.Exist(kArgGLFocusV) ? args[kArgGLFocusV] : false;
1780  m_IgOptions->m_MinVLength = args[kArgMinVLength].AsInteger();
1781  if (args.Exist(kArgMinJLength) && args[kArgMinJLength]) {
1782  m_IgOptions->m_MinJLength = args[kArgMinJLength].AsInteger();
1783  } else {
1785  }
1786  m_IgOptions->m_Translate = args.Exist(kArgTranslate) ? args[kArgTranslate] : false;
1789 
1790  if (!m_IsProtein) {
      // Auxiliary file: explicit argument if given, otherwise a name derived
      // from the organism. The bare name is kept unless the file is found in
      // one of the candidate directories; the first hit wins.
1791  string aux_file = (args.Exist(kArgGLChainType) && args[kArgGLChainType])
1792  ? args[kArgGLChainType].AsString()
1793  : m_IgOptions->m_Origin + "_gl.aux";
1794  m_IgOptions->m_AuxFilename = aux_file;
1795  for (int i=0; i<3; i++) {
1796  string aux_path = CDirEntry::ConcatPath(paths[i], aux_file);
1797  CDirEntry entry(aux_path);
1798  if (entry.Exists() && entry.IsFile()) {
1799  m_IgOptions->m_AuxFilename = aux_path;
1800  break;
1801  }
1802  }
1803 
      // NOTE(review): the statements opening the blocks closed below are
      // missing from the listing.
1806  }
1807 
1810  }
1811  }
1812 
1814 
1816 
1817  // default germline database name for annotation
      // The first candidate directory containing an internal_data
      // subdirectory provides m_IgDataPath.
1818  for (int i=0; i<3; i++) {
1819  string int_data = CDirEntry::ConcatPath(paths[i], "internal_data");
1820  CDirEntry entry(int_data);
1821  if (entry.Exists() && entry.IsDir()) {
1822  m_IgOptions->m_IgDataPath = int_data;
1823  break;
1824  }
1825  }
1826 
1827  m_IgOptions->m_SequenceType = "Ig";
1828  if (args.Exist(kArgIgSeqType) && args[kArgIgSeqType]) {
1829  m_IgOptions->m_SequenceType = args[kArgIgSeqType].AsString();
1830  }
1831 
1832  string df_db_name = CDirEntry::ConcatPath(
1835  ((m_IgOptions->m_SequenceType == "TCR")?"_TR":"") + "_V");
1836  CRef<CSearchDatabase> db(new CSearchDatabase(df_db_name, mol_type));
1837  m_IgOptions->m_Db[3].Reset(new CLocalDbAdapter(*db));
      // GetSeqDb() is called here only for its failure: any exception is
      // turned into an invalid-input error naming the database.
1838  try {
1839  db->GetSeqDb();
1840  } catch(...) {
1841  NCBI_THROW(CInputException, eInvalidInput,
1842  "Germline annotation database " + df_db_name + " could not be found in [internal_data] directory");
1843  }
1844 
1846  if (args.Exist(kArgMinDMatch) && args[kArgMinDMatch]) {
1847  m_IgOptions->m_Min_D_match = args[kArgMinDMatch].AsInteger();
1848  }
1849 
1850  if (args.Exist(kArgVPenalty) && args[kArgVPenalty]) {
1851  m_IgOptions->m_V_penalty = args[kArgVPenalty].AsInteger();
1852  }
1853 
1854  if (args.Exist(kArgDPenalty) && args[kArgDPenalty]) {
1855  m_IgOptions->m_D_penalty = args[kArgDPenalty].AsInteger();
1856  }
1857 
1858  if (args.Exist(kArgJPenalty) && args[kArgJPenalty]) {
1859  m_IgOptions->m_J_penalty = args[kArgJPenalty].AsInteger();
1860  }
1861 
      // NOTE(review): the statements assigning opts_hndl in both branches
      // are missing from the listing.
1862  CRef<CBlastOptionsHandle> opts_hndl;
1863  if (m_IgOptions->m_IsProtein) {
1865  } else {
1867  }
1868 
1869 
      // Per-gene setup: argument names are built by appending V, D or J.
1870  const static char suffix[] = "VDJ";
1871  int num_genes = (m_IsProtein) ? 1: 3;
1872  for (int gene=0; gene< num_genes; ++gene) {
1873  string arg_sub = kArgGLSubject;
1874  string arg_db = kArgGLDatabase;
1875  string arg_na = kArgGLNumAlign;
1876 
1877  arg_sub.push_back(suffix[gene]);
1878  arg_db.push_back(suffix[gene]);
1879  arg_na.push_back(suffix[gene]);
1880 
1881  m_IgOptions->m_NumAlign[gene] = args[arg_na].AsInteger();
1882 
      // A subject sequence file, when supplied, takes the place of the
      // germline database for this gene.
1883  if (args.Exist(arg_sub) && args[arg_sub]) {
1884  CNcbiIstream& subj_input_stream = args[arg_sub].AsInputFile();
1885  TSeqRange subj_range;
1886 
1887  const bool parse_deflines = args.Exist(kArgParseDeflines)
1888  ? bool(args[kArgParseDeflines])
1890  const bool use_lcase_masks = args.Exist(kArgUseLCaseMasking)
1891  ? bool(args[kArgUseLCaseMasking])
1894  CRef<CScope> scope = ReadSequencesToBlast(subj_input_stream,
1896  subj_range, parse_deflines,
1897  use_lcase_masks, subjects);
1898  m_Scope->AddScope(*scope,
1900  CRef<IQueryFactory> sub_seqs(
1901  new blast::CObjMgr_QueryFactory(*subjects));
1903  sub_seqs, opts_hndl));
1904  } else {
      // Database name: explicit argument if given, otherwise the organism
      // name followed by _gl_ and the gene letter.
1905  string gl_db_name = m_IgOptions->m_Origin + "_gl_";
1906  gl_db_name.push_back(suffix[gene]);
1907  string db_name = (args.Exist(arg_db) && args[arg_db])
1908  ? args[arg_db].AsString() : gl_db_name;
1909  db.Reset(new CSearchDatabase(db_name, mol_type));
1910 
1911  if (args.Exist(arg_db + "_seqidlist") && args[arg_db + "_seqidlist"]) {
1912  string fn(SeqDB_ResolveDbPath(args[arg_db + "_seqidlist"].AsString()));
1915  }
1916 
1917  m_IgOptions->m_Db[gene].Reset(new CLocalDbAdapter(*db));
1918  m_Scope->AddDataLoader(s_RegisterOMDataLoader(db->GetSeqDb()));
1919  }
1920  }
1921 
      // Optional C region database; the slot is cleared when none is given.
1922  if (args.Exist(kArgCRegionDatabase) && args[kArgCRegionDatabase]) {
1923  m_IgOptions->m_NumAlign[3] = args[kArgCRegionNumAlign].AsInteger();
1924  db.Reset(new CSearchDatabase(args[kArgCRegionDatabase].AsString(), mol_type));
1925  m_IgOptions->m_Db[4].Reset(new CLocalDbAdapter(*db));
1926  m_Scope->AddDataLoader(s_RegisterOMDataLoader(db->GetSeqDb()));
1927  } else {
1928  m_IgOptions->m_Db[4].Reset(0);
1929  }
1930 }
1931 
// Registers the generic query arguments: the lower-case masking flag, the
// query location range, the strand selector (only when nucleotide queries
// are possible, i.e. m_QueryCannotBeNucl is false) and the defline parsing
// flag. Each is placed in its own help group.
// NOTE(review): the line naming this function and the tails of two calls
// are missing from this listing.
1932 void
1934 {
1935 
1936  arg_desc.SetCurrentGroup("Query filtering options");
1937  // lowercase masking
1938  arg_desc.AddFlag(kArgUseLCaseMasking,
1939  "Use lower case filtering in query and subject sequence(s)?", true);
1940 
1941  arg_desc.SetCurrentGroup("Input query options");
1942  // query location
1943  arg_desc.AddOptionalKey(kArgQueryLocation, "range",
1944  "Location on the query sequence in 1-based offsets "
1945  "(Format: start-stop)",
1947 
1948  if ( !m_QueryCannotBeNucl) {
1949  // search strands
1950  arg_desc.AddDefaultKey(kArgStrand, "strand",
1951  "Query strand(s) to search against database/subject",
      // Accepted values: the default strand constant, "plus" and "minus".
1953  arg_desc.SetConstraint(kArgStrand, &(*new CArgAllow_Strings,
1954  kDfltArgStrand, "plus", "minus"));
1955  }
1956 
1957  arg_desc.SetCurrentGroup("Miscellaneous options");
1958  arg_desc.AddFlag(kArgParseDeflines,
1959  "Should the query and subject defline(s) be parsed?", true);
1960 
1961  arg_desc.SetCurrentGroup("");
1962 }
1963 
// Extracts the query strand, the query sequence range, and the lower-case
// masking and defline parsing switches from args.
// The strand is only examined when the program type in opt does not take a
// protein query. The range is parsed from the start-stop string and stored
// in m_Range.
// NOTE(review): the line naming this function and declaring args, the
// statements inside every strand branch, and the left-hand sides of the two
// final assignments are missing from this listing.
1964 void
1966  CBlastOptions& opt)
1967 {
1968  // Get the strand
1969  {
1971 
1972  if (!Blast_QueryIsProtein(opt.GetProgramType())) {
1973 
1974  if (args.Exist(kArgStrand) && args[kArgStrand]) {
1975  const string& kStrand = args[kArgStrand].AsString();
1976  if (kStrand == "both") {
1978  } else if (kStrand == "plus") {
1980  } else if (kStrand == "minus") {
1982  } else {
      // NOTE(review): any other value terminates the process; presumably the
      // string constraint registered for this argument makes this branch
      // unreachable - confirm.
1983  abort();
1984  }
1985  }
1986  else {
1988  }
1989  }
1990  }
1991 
1992  // set the sequence range
1993  if (args.Exist(kArgQueryLocation) && args[kArgQueryLocation]) {
1994  m_Range = ParseSequenceRange(args[kArgQueryLocation].AsString(),
1995  "Invalid specification of query location");
1996  }
1997 
1999  static_cast<bool>(args[kArgUseLCaseMasking]);
2001  static_cast<bool>(args[kArgParseDeflines]);
2002 }
2003 
// Registers the query input arguments of the read mapper: lower-case
// masking, quality filtering, input format, paired-read flag, mate file,
// SRA accessions (inline or from a file), defline parsing and SRA caching.
// NOTE(review): the line naming this function and the start of several
// calls (those ending in kArgQuery or kArgInputFormat, and the input format
// constraint) are missing from this listing.
2004 void
2006 {
2007 
2008  arg_desc.SetCurrentGroup("Query filtering options");
2009  // lowercase masking
2010  arg_desc.AddFlag(kArgUseLCaseMasking,
2011  "Use lower case filtering in subject sequence(s)?", true);
2012  arg_desc.AddDefaultKey(kArgQualityFilter, "TF", "Reject low quality "
2013  "sequences ", CArgDescriptions::eBoolean, "true");
2014 
2015  arg_desc.SetCurrentGroup("Input query options");
2016  arg_desc.AddDefaultKey(kArgInputFormat, "format", "Input format for "
2017  "sequences", CArgDescriptions::eString, "fasta");
      // Values accepted for the input format argument.
2019  "fasta", "fastc", "fastq",
2020  "asn1", "asn1b"));
2021  arg_desc.AddFlag(kArgPaired, "Input query sequences are paired", true);
2022  arg_desc.AddOptionalKey(kArgQueryMate, "infile", "FASTA file with "
2023  "mates for query sequences (if given in "
2024  "another file)", CArgDescriptions::eInputFile);
2026  kArgQuery);
2027 
2028  arg_desc.AddOptionalKey(kArgSraAccession, "accession",
2029  "Comma-separated SRA accessions",
2032  kArgQuery);
2034  kArgInputFormat);
2035 
2036  arg_desc.AddOptionalKey(kArgSraAccessionBatch, "file",
2037  "File with a list of SRA accessions, one per line",
2042  kArgQuery);
2044  kArgInputFormat);
2045 
2046  arg_desc.SetCurrentGroup("Miscellaneous options");
      // Here defline parsing is a boolean key defaulting to true.
2047  arg_desc.AddDefaultKey(kArgParseDeflines, "TF", "Should the query and "
2048  "subject defline(s) be parsed?",
2049  CArgDescriptions::eBoolean, "true");
2050 
2051  arg_desc.AddFlag(kArgEnableSraCache, "Enable SRA caching in local files");
2054 
2055 
2056  arg_desc.SetCurrentGroup("");
2057 }
2058 
// Extracts the read mapper query input options from args.
// It determines the input format, decides whether reads are paired, opens
// the mate stream, collects SRA accessions and records the SRA cache flag.
// Pairing is switched on (in opt and in m_IsPaired) by the paired flag, by
// the fastc format, by a mate file, or by SRA input.
// Throws CInputException (eInvalidInput) for an unknown input format or an
// empty list of SRA accessions.
// NOTE(review): the line naming this function and declaring args, the
// assignments inside the format branches and parts of two other statements
// are missing from this listing.
2059 void
2061  CBlastOptions& opt)
2062 {
2064 
2065  if (args.Exist(kArgPaired) && args[kArgPaired]) {
2066  opt.SetPaired(true);
2067  m_IsPaired = true;
2068  }
2069 
2070  if (args.Exist(kArgInputFormat) && args[kArgInputFormat]) {
2071  if (args[kArgInputFormat].AsString() == "fasta") {
2073  }
2074  else if (args[kArgInputFormat].AsString() == "fastc") {
2076  }
2077  else if (args[kArgInputFormat].AsString() == "fastq") {
2079  }
2080  else if (args[kArgInputFormat].AsString() == "asn1") {
2082  }
2083  else if (args[kArgInputFormat].AsString() == "asn1b") {
2085  }
2086  else {
2087  NCBI_THROW(CInputException, eInvalidInput,
2088  "Unexpected input format: " +
2089  args[kArgInputFormat].AsString());
2090  }
2091  }
2092 
2093  if (m_InputFormat == eFastc) {
2094  // FASTC format always has pairs in a single file
2095  opt.SetPaired(true);
2096  m_IsPaired = true;
2097  }
2098 
2099  if (args.Exist(kArgQualityFilter) && args[kArgQualityFilter]) {
2100  opt.SetReadQualityFiltering(args[kArgQualityFilter].AsBoolean());
2101  }
2102 
2103  if (args.Exist(kArgQueryMate) && args[kArgQueryMate]) {
2104  // create a decompress stream if the file is compressed
2105  // (the primary query file is handled by CStdCmdLineArgs object)
2106  if (NStr::EndsWith(args[kArgQueryMate].AsString(), ".gz",
2107  NStr::eNocase)) {
2109  args[kArgQueryMate].AsInputFile(),
2112  }
2113  else {
2114  m_MateInputStream = &args[kArgQueryMate].AsInputFile();
2115  }
2116 
2117  // queries have pairs in the mate stream
2118  opt.SetPaired(true);
2119  m_IsPaired = true;
2120  }
2121 
2122  if ((args.Exist(kArgSraAccession) && args[kArgSraAccession]) ||
2124 
2125  if (args[kArgSraAccession]) {
2126  // accessions given in the command-line
2127  NStr::Split((CTempString)args[kArgSraAccession].AsString(), ",",
2128  m_SraAccessions);
2129  }
2130  else {
2131  // accessions given in a file
      // Whitespace separated tokens are read until end of file; empty reads
      // are skipped.
      // NOTE(review): the loop tests only eof(); a stream that fails without
      // reaching end of file would keep it spinning - confirm this cannot
      // happen for this input.
2132  while (!args[kArgSraAccessionBatch].AsInputFile().eof()) {
2133  string line;
2134  args[kArgSraAccessionBatch].AsInputFile() >> line;
2135  if (!line.empty()) {
2136  m_SraAccessions.push_back(line);
2137  }
2138  }
2139  }
2140 
2141  if (m_SraAccessions.empty()) {
2142  NCBI_THROW(CInputException, eInvalidInput,
2143  "No SRA accessions provided");
2144  }
2145 
      // SRA input overrides any input format selected above.
2146  m_InputFormat = eSra;
2147  // assume SRA input is paired, that information for each read is in
2148  // SRA database, this option will trigger checking for pairs
2149  opt.SetPaired(true);
2150  m_IsPaired = true;
2151  }
2152 
2153  if (args.Exist(kArgEnableSraCache) && args[kArgEnableSraCache]) {
2154  m_EnableSraCache = true;
2155  }
2156 }
2157 
2158 
2159 
// Constructs the database argument handler.
// Each constructor parameter is stored in the member of the matching name.
// The remaining members get fixed initial values: m_IsProtein starts as
// true, and both m_SupportsDatabaseMasking and m_SupportIPGFiltering start
// as false. The parameter defaults noted in the inline comments are declared
// elsewhere (presumably in the header).
2160 CBlastDatabaseArgs::CBlastDatabaseArgs(bool request_mol_type /* = false */,
2161  bool is_rpsblast /* = false */,
2162  bool is_igblast /* = false */,
2163  bool is_mapper /* = false */,
2164  bool is_kblast /* = false */)
2165  : m_RequestMoleculeType(request_mol_type),
2166  m_IsRpsBlast(is_rpsblast),
2167  m_IsIgBlast(is_igblast),
2168  m_IsProtein(true),
2169  m_IsMapper(is_mapper),
2170  m_IsKBlast(is_kblast),
2171  m_SupportsDatabaseMasking(false),
2172  m_SupportIPGFiltering(false)
2173 {}
2174 
// Returns true when either the database argument (kArgDb) or the subject
// argument (kArgSubject) is declared in args and has a value; false
// otherwise.
// NOTE(review): the line naming this function and declaring args is missing
// from this listing.
2175 bool
2177 {
2178  if ( (args.Exist(kArgDb) && args[kArgDb].HasValue()) ||
2179  (args.Exist(kArgSubject) && args[kArgSubject].HasValue()) ) {
2180  return true;
2181  }
2182  return false;
2183 }
2184 
2185 void
2187 {
2188  arg_desc.SetCurrentGroup("General search options");
2189  // database filename
2190  if (m_IsIgBlast){
2191  arg_desc.AddOptionalKey(kArgDb, "database_name", "Optional additional database name",
2193  } else {
2194  arg_desc.AddOptionalKey(kArgDb, "database_name", "BLAST database name",
2196  }
2197 
2198  arg_desc.SetCurrentGroup("");
2199 
2200  if (m_RequestMoleculeType) {
2201  arg_desc.AddKey(kArgDbType, "database_type",
2202  "BLAST database molecule type",
2204  arg_desc.SetConstraint(kArgDbType,
2205  &(*new CArgAllow_Strings, "prot", "nucl"));
2206  }
2207 
2208  vector<string> database_args;
2209  database_args.push_back(kArgDb);
2210  database_args.push_back(kArgGiList);
2211  database_args.push_back(kArgSeqIdList);
2212  database_args.push_back(kArgNegativeGiList);
2213  database_args.push_back(kArgNegativeSeqidList);
2214  database_args.push_back(kArgTaxIdList);
2215  database_args.push_back(kArgTaxIdListFile);
2216  database_args.push_back(kArgNegativeTaxIdList);
2217  database_args.push_back(kArgNegativeTaxIdListFile);
2218  database_args.push_back(kArgNoTaxIdExpansion);
2219  if (m_SupportIPGFiltering) {
2220  database_args.push_back(kArgIpgList);
2221  database_args.push_back(kArgNegativeIpgList);
2222  }
2224  database_args.push_back(kArgDbSoftMask);
2225  database_args.push_back(kArgDbHardMask);
2226  }
2227 
2228  // DB size
2229  if (!m_IsMapper) {
2230  arg_desc.SetCurrentGroup("Statistical options");
2231  arg_desc.AddOptionalKey(kArgDbSize, "num_letters",
2232  "Effective length of the database ",
2234  }
2235 
2236  arg_desc.SetCurrentGroup("Restrict search or results");
2237  // GI list
2238  if (!m_IsRpsBlast && !m_IsIgBlast) {
2239  arg_desc.AddOptionalKey(kArgGiList, "filename",
2240  "Restrict search of database to list of GIs",
2242  // SeqId list
2243  arg_desc.AddOptionalKey(kArgSeqIdList, "filename",
2244  "Restrict search of database to list of SeqIDs",
2246  // Negative GI list
2247  arg_desc.AddOptionalKey(kArgNegativeGiList, "filename",
2248  "Restrict search of database to everything"
2249  " except the specified GIs",
2251 
2252  // Negative SeqId list
2253  arg_desc.AddOptionalKey(kArgNegativeSeqidList, "filename",
2254  "Restrict search of database to everything"
2255  " except the specified SeqIDs",
2257 
2258  // Tax ID list
2259  arg_desc.AddOptionalKey(kArgTaxIdList, "taxids",
2260  "Restrict search of database to include only "
2261  "the specified taxonomy IDs and their descendants "
2262  "(multiple IDs delimited by ',')",
2264  arg_desc.AddOptionalKey(kArgNegativeTaxIdList, "taxids",
2265  "Restrict search of database to everything "
2266  "except the specified taxonomy IDs and their descendants "
2267  "(multiple IDs delimited by ',')",
2269  // Tax ID list file
2270  arg_desc.AddOptionalKey(kArgTaxIdListFile, "filename",
2271  "Restrict search of database to include only "
2272  "the specified taxonomy IDs and their descendants ",
2274  arg_desc.AddOptionalKey(kArgNegativeTaxIdListFile, "filename",
2275  "Restrict search of database to everything "
2276  "except the specified taxonomy IDs and their descendants ",
2278  // Disable Tax ID resoution to the descendants
2279  arg_desc.AddFlag(kArgNoTaxIdExpansion, "Do not expand the taxonomy IDs provided to their descendant taxonomy IDs ", true);
2287 
2288  if (m_SupportIPGFiltering) {
2289  arg_desc.AddOptionalKey(kArgIpgList, "filename",
2290  "Restrict search of database to list of IPGs",
2292 
2293  // Negative IPG list
2294  arg_desc.AddOptionalKey(kArgNegativeIpgList, "filename",
2295  "Restrict search of database to everything"
2296  " except the specified IPGs",
2298  }
2299  // N.B.: all restricting options are mutually exclusive
2300  const vector<string> kBlastDBFilteringOptions = {
2301  kArgGiList,
2302  kArgSeqIdList,
2303  kArgTaxIdList,
2305 
2310  };
2311  for (size_t i = 0; i < kBlastDBFilteringOptions.size(); i++) {
2312  for (size_t j = i+1; j < kBlastDBFilteringOptions.size(); j++) {
2313  arg_desc.SetDependency(kBlastDBFilteringOptions[i], CArgDescriptions::eExcludes,
2314  kBlastDBFilteringOptions[j]);
2315  }
2316  }
2317 
2318  // For now, disable pairing -remote with either -gilist or
2319  // -negative_gilist as this is not implemented in the BLAST server
2320  for (const string& s: kBlastDBFilteringOptions) {
2322  }
2323  }
2324 
2325  // Entrez Query
2326  if (!m_IsMapper) {
2327  arg_desc.AddOptionalKey(kArgEntrezQuery, "entrez_query",
2328  "Restrict search with the given Entrez query",
2330 
2331  // Entrez query currently requires the -remote option
2333  kArgRemote);
2334  }
2335 
2336 
2337 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
2338  (!defined(NCBI_COMPILER_MIPSPRO)) )
2339  // Masking of database
2341  arg_desc.AddOptionalKey(kArgDbSoftMask,
2342  "filtering_algorithm",
2343  "Filtering algorithm ID to apply to the BLAST database as soft "
2344  "masking",
2347  kArgDbHardMask);
2348 
2349  arg_desc.AddOptionalKey(kArgDbHardMask,
2350  "filtering_algorithm",
2351  "Filtering algorithm ID to apply to the BLAST database as hard "
2352  "masking",
2354  }
2355 #endif
2356 
2357  // There is no RPS-BLAST 2 sequences
2358  if ( !m_IsRpsBlast && !m_IsKBlast && !m_IsIgBlast) {
2359  arg_desc.SetCurrentGroup("BLAST-2-Sequences options");
2360  // subject sequence input (for bl2seq)
2361  arg_desc.AddOptionalKey(kArgSubject, "subject_input_file",
2362  "Subject sequence(s) to search",
2364  ITERATE(vector<string>, dbarg, database_args) {
2366  *dbarg);
2367  }
2368 
2369  // subject location
2370  arg_desc.AddOptionalKey(kArgSubjectLocation, "range",
2371  "Location on the subject sequence in 1-based offsets "
2372  "(Format: start-stop)",
2374  ITERATE(vector<string>, dbarg, database_args) {
2377  *dbarg);
2378  }
2379  // Because Blast4-subject does not support Seq-locs, specifying a
2380  // subject range does not work for remote searches
2383  }
2384 
2385  arg_desc.SetCurrentGroup("");
2386 }
2387 
2388 
2389 
2390 //
2391 // Get taid(s) from user provided string or file, optionally resolve taxid to it's descendant if isTargetOnly == false
2392 // logic to add/resolve is next:
2393 // --------------------------------------------------------------------------------------
2394 // isTargetOnly | decsendant(s) found |
2395 // --------------------------------------------------------------------------------------
2396 // TRUE | N/A | add user's taxids, no lookup for decsendant
2397 // FALSE | TRUE | add user's taxid AND add only found descendant(s)
2398 // --------------------------------------------------------------------------------------
2399 //
2400 static void s_GetTaxIDList(const string & in, bool isFile, bool isNegativeList, CRef<CSearchDatabase> & sdb, bool isTargetOnly )
2401 {
2402  vector<string> ids;
2403  if (isFile) {
2404  string filename(SeqDB_ResolveDbPath(in));
2405  if(filename == kEmptyStr) {
2406  NCBI_THROW(CInputException, eInvalidInput, "File is not acessible: "+ in );
2407  }
2408  CNcbiIfstream instream(filename.c_str());
2409  CStreamLineReader reader(instream);
2410 
2411  while (!reader.AtEOF()) {
2412  reader.ReadLine();
2413  ids.push_back(reader.GetCurrentLine());
2414  }
2415  } else {
2416  NStr::Split(in, ",", ids, NStr::fSplit_Tokenize);
2417  }
2418  unique_ptr<ITaxonomy4Blast> tb;
2419  if( !isTargetOnly ) {
2420  try{
2421  tb.reset(new CTaxonomy4BlastSQLite());
2422  }
2423  catch(CException &){
2424  LOG_POST(Warning << "The -taxids command line option requires additional data files. Please see the section 'Taxonomic filtering for BLAST databases' in https://www.ncbi.nlm.nih.gov/books/NBK569839/ for details.");
2425  }
2426  }
2427  set<TTaxId> tax_ids;
2428  for (auto id : ids) {
2429  try {
2430  if (NStr::IsBlank(id)) {
2431  continue;
2432  }
2433  auto taxid = NStr::StringToNumeric<TTaxId>(id, NStr::fAllowLeadingSpaces | NStr::fAllowTrailingSpaces);
2434  if( isTargetOnly ) {
2435  tax_ids.insert(taxid);
2436  } else if (tb) {
2437  tax_ids.insert(taxid);
2438  vector<int> desc;
2439  tb->GetLeafNodeTaxids(taxid, desc);
2440  for (auto i: desc)
2441  tax_ids.insert( static_cast<TTaxId>(i) );
2442  }
2443  } catch(CException &){
2444  NCBI_THROW(CInputException, eInvalidInput, "Invalid taxidlist file ");
2445  }
2446  }
2447 
2448  CRef<CSeqDBGiList> taxid_list(new CSeqDBGiList());
2449  taxid_list->AddTaxIds(tax_ids);
2450  if(isNegativeList) {
2451  sdb->SetNegativeGiList(taxid_list.GetPointer());
2452  }
2453  else {
2454  sdb->SetGiList(taxid_list.GetPointer());
2455  }
2456 
2457 }
2458 
2459 
2460 void
2462  CBlastOptions& opts)
2463 {
2468 
2469  if (args.Exist(kArgDb) && args[kArgDb]) {
2470  std::string local_dblist = NStr::TruncateSpaces( args[kArgDb].AsString() );
2471 
2472  m_SearchDb.Reset(new CSearchDatabase( local_dblist,
2473  mol_type));
2474 
2475  if (args.Exist(kArgGiList) && args[kArgGiList]) {
2476  string fn(SeqDB_ResolveDbPath(args[kArgGiList].AsString()));
2478 
2479  } else if (args.Exist(kArgNegativeGiList) && args[kArgNegativeGiList]) {
2480  string fn(SeqDB_ResolveDbPath(args[kArgNegativeGiList].AsString()));
2482 
2483  } else if (args.Exist(kArgSeqIdList) && args[kArgSeqIdList]) {
2484  string fn(SeqDB_ResolveDbPath(args[kArgSeqIdList].AsString()));
2487  } else if (args.Exist(kArgNegativeSeqidList) && args[kArgNegativeSeqidList]) {
2488  string fn(SeqDB_ResolveDbPath(args[kArgNegativeSeqidList].AsString()));
2490  } else if (args.Exist(kArgTaxIdList) && args[kArgTaxIdList]) {
2491  s_GetTaxIDList(args[kArgTaxIdList].AsString(), false, false, m_SearchDb,args[kArgNoTaxIdExpansion].AsBoolean());
2492 
2493  } else if (args.Exist(kArgTaxIdListFile) && args[kArgTaxIdListFile]) {
2494  s_GetTaxIDList(args[kArgTaxIdListFile].AsString(), true, false, m_SearchDb, args[kArgNoTaxIdExpansion].AsBoolean());
2495 
2496  } else if (args.Exist(kArgNegativeTaxIdList) && args[kArgNegativeTaxIdList]) {
2497  s_GetTaxIDList(args[kArgNegativeTaxIdList].AsString(), false, true, m_SearchDb, args[kArgNoTaxIdExpansion].AsBoolean());
2498 
2499  } else if (args.Exist(kArgNegativeTaxIdListFile) && args[kArgNegativeTaxIdListFile]) {
2500  s_GetTaxIDList(args[kArgNegativeTaxIdListFile].AsString(), true, true, m_SearchDb,args[kArgNoTaxIdExpansion].AsBoolean());
2501 
2502  } else if (args.Exist(kArgIpgList) && args[kArgIpgList]) {
2503  string fn(SeqDB_ResolveDbPath(args[kArgIpgList].AsString()));
2505  } else if (args.Exist(kArgNegativeIpgList) && args[kArgNegativeIpgList]) {
2506  string fn(SeqDB_ResolveDbPath(args[kArgNegativeIpgList].AsString()));
2508 
2509  }
2510 
2511  if (args.Exist(kArgEntrezQuery) && args[kArgEntrezQuery])
2513 
2514 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
2515  (!defined(NCBI_COMPILER_MIPSPRO)) )
2516  if (args.Exist(kArgDbSoftMask) && args[kArgDbSoftMask]) {
2518  } else if (args.Exist(kArgDbHardMask) && args[kArgDbHardMask]) {
2520  }
2521 #endif
2522  } else if (args.Exist(kArgSubject) && args[kArgSubject]) {
2523 
2524  CNcbiIstream* subj_input_stream = NULL;
2525  unique_ptr<CDecompressIStream> decompress_stream;
2526  if (m_IsMapper &&
2527  NStr::EndsWith(args[kArgSubject].AsString(), ".gz", NStr::eNocase)) {
2528  decompress_stream.reset(
2529  new CDecompressIStream(args[kArgSubject].AsInputFile(),
2531  subj_input_stream = decompress_stream.get();
2532  }
2533  else {
2534  subj_input_stream = &args[kArgSubject].AsInputFile();
2535  }
2536 
2537  TSeqRange subj_range;
2538  if (args.Exist(kArgSubjectLocation) && args[kArgSubjectLocation]) {
2539  subj_range =
2540  ParseSequenceRange(args[kArgSubjectLocation].AsString(),
2541  "Invalid specification of subject location");
2542  }
2543 
2544  const bool parse_deflines = args.Exist(kArgParseDeflines)
2545  ? args[kArgParseDeflines].AsBoolean()
2547  const bool use_lcase_masks = args.Exist(kArgUseLCaseMasking)
2548  ? bool(args[kArgUseLCaseMasking])
2551  m_Scope = ReadSequencesToBlast(*subj_input_stream, IsProtein(),
2552  subj_range, parse_deflines,
2553  use_lcase_masks, subjects, m_IsMapper);
2554  m_Subjects.Reset(new blast::CObjMgr_QueryFactory(*subjects));
2555 
2556  } else if (!m_IsIgBlast){
2557  // IgBlast permits use of germline database
2558  NCBI_THROW(CInputException, eInvalidInput,
2559  "Either a BLAST database or subject sequence(s) must be specified");
2560  }
2561 
2562  if (opts.GetEffectiveSearchSpace() != 0) {
2563  // no need to set any other options, as this trumps them
2564  return;
2565  }
2566 
2567  if (args.Exist(kArgDbSize) && args[kArgDbSize]) {
2568  opts.SetDbLength(args[kArgDbSize].AsInt8());
2569  }
2570 
2571 }
2572 
2573 void
2575 {
2576  arg_desc.SetCurrentGroup("Formatting options");
2577 
2578  string kOutputFormatDescription = string(
2579  "alignment view options:\n"
2580  " 0 = Pairwise,\n"
2581  " 1 = Query-anchored showing identities,\n"
2582  " 2 = Query-anchored no identities,\n"
2583  " 3 = Flat query-anchored showing identities,\n"
2584  " 4 = Flat query-anchored no identities,\n"
2585  " 5 = BLAST XML,\n"
2586  " 6 = Tabular,\n"
2587  " 7 = Tabular with comment lines,\n"
2588  " 8 = Seqalign (Text ASN.1),\n"
2589  " 9 = Seqalign (Binary ASN.1),\n"
2590  " 10 = Comma-separated values,\n"
2591  " 11 = BLAST archive (ASN.1),\n"
2592  " 12 = Seqalign (JSON),\n"
2593  " 13 = Multiple-file BLAST JSON,\n"
2594  " 14 = Multiple-file BLAST XML2,\n"
2595  " 15 = Single-file BLAST JSON,\n"
2596  " 16 = Single-file BLAST XML2");
2597 
2598  if(m_FormatFlags & eIsSAM) {
2599  kOutputFormatDescription += ",\n 17 = Sequence Alignment/Map (SAM)";
2600  }
2601  kOutputFormatDescription += ",\n 18 = Organism Report\n\n";
2602  if(m_FormatFlags & eIsSAM) {
2603  kOutputFormatDescription +=
2604  "Options 6, 7, 10 and 17 "
2605  "can be additionally configured to produce\n"
2606  "a custom format specified by space delimited format specifiers,\n"
2607  "or in the case of options 6, 7, and 10, by a token specified\n"
2608  "by the delim keyword. E.g.: \"17 delim=@ qacc sacc score\".\n"
2609  "The delim keyword must appear after the numeric output format\n"
2610  "specification.\n"
2611  "The supported format specifiers for options 6, 7 and 10 are:\n";
2612  }
2613  else {
2614  kOutputFormatDescription +=
2615  "Options 6, 7 and 10 "
2616  "can be additionally configured to produce\n"
2617  "a custom format specified by space delimited format specifiers,\n"
2618  "or by a token specified by the delim keyword.\n"
2619  " E.g.: \"10 delim=@ qacc sacc score\".\n"
2620  "The delim keyword must appear after the numeric output format\n"
2621  "specification.\n"
2622  "The supported format specifiers are:\n";
2623  }
2624 
2625  kOutputFormatDescription += DescribeTabularOutputFormatSpecifiers() + string("\n");
2626 
2627  if(m_FormatFlags & eIsSAM) {
2628  kOutputFormatDescription +=
2629  "The supported format specifier for option 17 is:\n" +
2631  }
2632 
2633 
2634  int dft_outfmt = kDfltArgOutputFormat;
2635 
2636  // Igblast shows extra column of gaps
2637  if (m_IsIgBlast) {
2638  kOutputFormatDescription = string(
2639  "alignment view options:\n"
2640  " 3 = Flat query-anchored, show identities,\n"
2641  " 4 = Flat query-anchored, no identities,\n"
2642  " 7 = Tabular with comment lines\n"
2643  " 19 = Rearrangement summary report (AIRR format)\n\n"
2644  "Options 7 can be additionally configured to produce\n"
2645  "a custom format specified by space delimited format specifiers.\n"
2646  "The supported format specifiers are:\n") +
2648  string("\n");
2649  dft_outfmt = 3;
2650  }
2651 
2652  // alignment view
2653  arg_desc.AddDefaultKey(kArgOutputFormat, "format",
2654  kOutputFormatDescription,
2656  NStr::IntToString(dft_outfmt));
2657 
2658  // show GIs in deflines
2659  arg_desc.AddFlag(kArgShowGIs, "Show NCBI GIs in deflines?", true);
2660 
2661  // number of one-line descriptions to display
2662  arg_desc.AddOptionalKey(kArgNumDescriptions, "int_value",
2663  "Number of database sequences to show one-line "
2664  "descriptions for\n"
2665  "Not applicable for outfmt > 4\n"
2666  "Default = `"+ NStr::IntToString(m_DfltNumDescriptions)+ "'",
2670 
2671  // number of alignments per DB sequence
2672  arg_desc.AddOptionalKey(kArgNumAlignments, "int_value",
2673  "Number of database sequences to show alignments for\n"
2674  "Default = `" + NStr::IntToString(m_DfltNumAlignments) + "'",
2678 
2679  arg_desc.AddOptionalKey(kArgLineLength, "line_length",
2680  "Line length for formatting alignments\n"
2681  "Not applicable for outfmt > 4\n"
2684  arg_desc.SetConstraint(kArgLineLength,
2686 
2687  if(!m_IsIgBlast){
2688  // Produce HTML?
2689  arg_desc.AddFlag(kArgProduceHtml, "Produce HTML output?", true);
2690 
2691 
2692  arg_desc.AddOptionalKey(kArgSortHits, "sort_hits",
2693  "Sorting option for hits:\n"
2694  "alignment view options:\n"
2695  " 0 = Sort by evalue,\n"
2696  " 1 = Sort by bit score,\n"
2697  " 2 = Sort by total score,\n"
2698  " 3 = Sort by percent identity,\n"
2699  " 4 = Sort by query coverage\n"
2700  "Not applicable for outfmt > 4\n",
2702  arg_desc.SetConstraint(kArgSortHits,
2705  true));
2706 
2707  arg_desc.AddOptionalKey(kArgSortHSPs, "sort_hsps",
2708  "Sorting option for hps:\n"
2709  " 0 = Sort by hsp evalue,\n"
2710  " 1 = Sort by hsp score,\n"
2711  " 2 = Sort by hsp query start,\n"
2712  " 3 = Sort by hsp percent identity,\n"
2713  " 4 = Sort by hsp subject start\n"
2714  "Not applicable for outfmt != 0\n",
2716  arg_desc.SetConstraint(kArgSortHSPs,
2719  true));
2720  /// Hit list size, listed here for convenience only
2721  arg_desc.SetCurrentGroup("Restrict search or results");
2722  arg_desc.AddOptionalKey(kArgMaxTargetSequences, "num_sequences",
2723  "Maximum number of aligned sequences to keep \n"
2724  "(value of 5 or more is recommended)\n"
2725  "Default = `" + NStr::IntToString(BLAST_HITLIST_SIZE) + "'",
2735  }
2736  arg_desc.SetCurrentGroup("");
2737 }
2738 
/// Report whether the output format selected on the command line is the
/// BLAST archive format (eArchiveFormat).
///
/// @return true if the parsed numeric format equals eArchiveFormat
2739 bool
2741 {
      // NOTE(review): output_fmt is not initialized here; this relies on
      // ParseFormattingString assigning it - confirm the option always has a value.
2742  EOutputFormat output_fmt;
      // Only the numeric format matters; the custom specifier and the custom
      // delimiter are parsed but discarded.
2743  string ignore1, ignore2;
2744  ParseFormattingString(args, output_fmt, ignore1, ignore2);
2745  return (output_fmt == eArchiveFormat ? true : false);
2746 }
2747 
2748 
2749 static void s_ValidateCustomDelim(string custom_fmt_spec,string customDelim)
2750 {
2751  bool error = false;
2752  string checkfield;
2753  custom_fmt_spec = NStr::TruncateSpaces(custom_fmt_spec);
2754  if(custom_fmt_spec.empty()) return;
2755 
2756  //Check if delim is already used
2757  const string kFieldsWithSemicolSeparator = "sallseqid staxids sscinames scomnames sblastnames sskingdoms";//sep = ";"
2758  const string kFramesField = "frames"; //sep = "/"
2759  const string kAllTitlesField ="salltitles"; //sep = "<>""
2760 
2761  if(customDelim == ";") {
2762  vector <string> tokens;
2763  NStr::Split(kFieldsWithSemicolSeparator," ", tokens);
2764  for(size_t i = 0; i < tokens.size(); i++) {
2765  if(NStr::Find(custom_fmt_spec,tokens[i]) != NPOS) {
2766  checkfield = tokens[i];
2767  error = true;
2768  break;
2769  }
2770  }
2771  }
2772  else {
2773  if(customDelim == "/") {
2774  checkfield = kFramesField;
2775  }
2776  else if(customDelim == "<>") {
2777  checkfield = kAllTitlesField;
2778  }
2779  if(!checkfield.empty() && NStr::Find(custom_fmt_spec,checkfield) != NPOS) {
2780  error = true;
2781  }
2782  }
2783 
2784  if(error) {
2785  string msg("Your custom record separator (" + customDelim + ") is also used by the format specifier (" + checkfield +
2786  ") to separate multiple entries. Please use a different record separator (delim keyword).");
2787  NCBI_THROW(CInputException, eInvalidInput, msg);
2788  }
2789 }
2790 
/// Split the value of the output format option into its numeric format,
/// the optional custom format specifier and the optional custom delimiter.
///
/// The value is read as "<number>[ <rest>]"; when <rest> starts with "delim",
/// its first space-separated token must have the form delim=<value>.
///
/// @param args             parsed command line; only kArgOutputFormat is read
/// @param fmt_type         [out] numeric format; assigned only when the option is set
/// @param custom_fmt_spec  [out] cleared on entry; kept only for eTabular,
///                         eTabularWithComments, eCommaSeparatedValues and eSAM
/// @param custom_delim     [out] text after "delim="; left untouched when no
///                         delim token is present
/// @throw CInputException   if the delim token is malformed or the number
///                          cannot be converted
/// @throw std::out_of_range if the number is outside [0, eEndValue), or if
///                          m_IsIgBlast is set and the number is not 3, 4, 7
///                          or eAirrRearrangement
2791 void
2793  EOutputFormat& fmt_type,
2794  string& custom_fmt_spec,
2795  string& custom_delim) const
2796 {
2797  custom_fmt_spec.clear();
2798  if (args[kArgOutputFormat]) {
2799  string fmt_choice =
2800  NStr::TruncateSpaces(args[kArgOutputFormat].AsString());
2801  string::size_type pos;
      // Everything after the first space is the custom part; what precedes
      // it is the numeric format.
2802  if ( (pos = fmt_choice.find_first_of(' ')) != string::npos) {
2803  custom_fmt_spec.assign(fmt_choice, pos+1,
2804  fmt_choice.size()-(pos+1));
2805  fmt_choice.erase(pos);
2806  }
2807  if(!custom_fmt_spec.empty()) {
      // The delim keyword is only recognized at the very start of the custom part.
2808  if(NStr::StartsWith(custom_fmt_spec, "delim")) {
2809  vector <string> tokens;
2810  NStr::Split(custom_fmt_spec," ",tokens);
2811  if(tokens.size() > 0) {
2812  string tag;
2813  bool isValid = NStr::SplitInTwo(tokens[0],"=",tag,custom_delim);
2814  if(!isValid) {
2815  string msg("Delimiter format is invalid. Valid format is delim=<delimiter value>");
2816  NCBI_THROW(CInputException, eInvalidInput, msg);
2817  }
2818  else {
      // Strip the delim token so only format specifiers remain.
      // NOTE(review): the token text is passed to NStr::Replace, so a later
      // identical substring would presumably be removed too - confirm.
2819  custom_fmt_spec = NStr::Replace(custom_fmt_spec,tokens[0],"");
2820  }
2821  }
2822  }
2823  }
2824  int val = 0;
2825  try { val = NStr::StringToInt(fmt_choice); }
2826  catch (const CStringException&) { // probably a conversion error
2827  CNcbiOstrstream os;
2828  os << "'" << fmt_choice << "' is not a valid output format";
2829  string msg = CNcbiOstrstreamToString(os);
2830  NCBI_THROW(CInputException, eInvalidInput, msg);
2831  }
      // NOTE(review): range violations are reported with std::out_of_range
      // while the other failures use CInputException.
2832  if (val < 0 || val >= static_cast<int>(eEndValue)) {
2833  string msg("Formatting choice is out of range");
2834  throw std::out_of_range(msg);
2835  }
2836  if (m_IsIgBlast && (val != 3 && val != 4 && val != 7 && val != eAirrRearrangement)) {
2837  string msg("Formatting choice is not valid");
2838  throw std::out_of_range(msg);
2839  }
2840  fmt_type = static_cast<EOutputFormat>(val);
      // A custom specifier is discarded for every other format.
2841  if ( !(fmt_type == eTabular ||
2842  fmt_type == eTabularWithComments ||
2843  fmt_type == eCommaSeparatedValues ||
2844  fmt_type == eSAM) ) {
2845  custom_fmt_spec.clear();
2846  }
2847  }
2848 }
2849 
2850 
2851 void
2853  CBlastOptions& opt)
2854 {
2856  if((m_OutputFormat == eSAM) && !(m_FormatFlags & eIsSAM) ){
2857  NCBI_THROW(CInputException, eInvalidInput,
2858  "SAM format is only applicable to blastn" );
2859  }
2861  NCBI_THROW(CInputException, eInvalidInput,
2862  "AIRR rearrangement format is only applicable to igblastn" );
2863  }
2864  if (m_OutputFormat == eFasta) {
2865  NCBI_THROW(CInputException, eInvalidInput,
2866  "FASTA output format is only applicable to magicblast");
2867  }
2869  m_ShowGis = static_cast<bool>(args[kArgShowGIs]);
2870  if(m_IsIgBlast){
2871  m_Html = false;
2872  } else {
2873  m_Html = static_cast<bool>(args[kArgProduceHtml]);
2874  }
2875  // Default hitlist size 500, value can be changed if import search strategy is used
2876  int hitlist_size = opt.GetHitlistSize();
2877 
2878  // To preserve a hitlist size > 500 coming from an imported search strategy,
2879  // we need to increase num_descriptions and num_alignments
2880  if(hitlist_size > BLAST_HITLIST_SIZE )
2881  {
2882  if((!args.Exist(kArgNumDescriptions) || !args[kArgNumDescriptions]) &&
2883  (!args.Exist(kArgNumAlignments) || !args[kArgNumAlignments]) &&
2885  m_NumDescriptions = hitlist_size;
2886  m_NumAlignments = hitlist_size/ 2;
2887  return;
2888  }
2889  }
2890 
2892 
2893 
2896 
2897  if (args.Exist(kArgNumDescriptions) && args[kArgNumDescriptions]) {
2898  m_NumDescriptions = args[kArgNumDescriptions].AsInteger();
2899  }
2900 
2901  if (args.Exist(kArgNumAlignments) && args[kArgNumAlignments]) {
2902  m_NumAlignments = args[kArgNumAlignments].AsInteger();
2903  }
2904 
2906  m_NumDescriptions = args[kArgMaxTargetSequences].AsInteger();
2907  m_NumAlignments = args[kArgMaxTargetSequences].AsInteger();
2908  hitlist_size = m_NumAlignments;
2909  }
2910 
2911  // The if clause handles an import_search_strategy (iss) hitlist size < 500.
2912  // We want to preserve the iss hitlist size if no formatting input is entered on the command line.
2913  // If formatting option(s) are entered, then the iss hitlist size is overridden.
2914  // FIXME: does this work with import search strategies?
2915  if ((args.Exist(kArgNumDescriptions) && args[kArgNumDescriptions]) ||
2916  (args.Exist(kArgNumAlignments) && args[kArgNumAlignments])) {
2917  hitlist_size = max(m_NumDescriptions, m_NumAlignments);
2918  }
2919 
2920  if (args[kArgLineLength]) {
2921  m_LineLength = args[kArgLineLength].AsInteger();
2922  }
2923  if(args.Exist(kArgSortHits) && args[kArgSortHits])
2924  {
2925  m_HitsSortOption = args[kArgSortHits].AsInteger();
2926  }
2927  }
2928  else
2929  {
2930  if (args.Exist(kArgNumDescriptions) && args[kArgNumDescriptions]) {
2931  ERR_POST(Warning << "The parameter -num_descriptions is ignored for "
2932  "output formats > 4 . Use -max_target_seqs "
2933  "to control output");
2934  }
2935 
2936  if (args[kArgLineLength]) {
2937  ERR_POST(Warning << "The parameter -line_length is not applicable for "
2938  "output formats > 4 .");
2939  }
2940 
2942  hitlist_size = args[kArgMaxTargetSequences].AsInteger();
2943  }
2944  else if (args.Exist(kArgNumAlignments) && args[kArgNumAlignments]) {
2945  hitlist_size = args[kArgNumAlignments].AsInteger();
2946  }
2947 
2948  m_NumDescriptions = hitlist_size;
2949  m_NumAlignments = hitlist_size;
2950 
2951  if(args.Exist(kArgSortHits) && args[kArgSortHits]) {
2952  ERR_POST(Warning << "The parameter -sorthits is ignored for output formats > 4.");
2953  }
2954  }
2955 
2956  if(hitlist_size < 5){
2957  ERR_POST(Warning << "Examining 5 or more matches is recommended");
2958  }
2959  opt.SetHitlistSize(hitlist_size);
2960 
2961  if(args.Exist(kArgSortHSPs) && args[kArgSortHSPs])
2962  {
2963  int hspsSortOption = args[kArgSortHSPs].AsInteger();
2964  if(m_OutputFormat == ePairwise) {
2965  m_HspsSortOption = hspsSortOption;
2966  }
2967  else {
2968  ERR_POST(Warning << "The parameter -sorthsps is ignored for output formats != 0.");
2969  }
2970  }
2971  return;
2972 }
2973 
2974 
2975 void
2977 {
2978  arg_desc.SetCurrentGroup("Formatting options");
2979  string kOutputFormatDescription = string(
2980  "alignment view options:\n"
2981  "sam = SAM format,\n"
2982  "tabular = Tabular format,\n"
2983  "asn = text ASN.1\n");
2984 
2985  string kUnalignedOutputFormatDescription = string(
2986  "format for reporting unaligned reads:\n"
2987  "sam = SAM format,\n"
2988  "tabular = Tabular format,\n"
2989  "fasta = sequences in FASTA format\n"
2990  "Default = same as ") +
2992 
2993  arg_desc.AddDefaultKey(align_format::kArgOutputFormat, "format",
2994  kOutputFormatDescription,
2996  "sam");
2997 
2998  set<string> allowed_formats = {"sam", "tabular", "asn"};
3000  new CArgAllowStringSet(allowed_formats));
3001 
3002  arg_desc.AddOptionalKey(kArgUnalignedFormat, "format",
3003  kUnalignedOutputFormatDescription,
3005 
3006  set<string> allowed_unaligned_formats = {"sam", "tabular", "fasta"};
3008  new CArgAllowStringSet(allowed_unaligned_formats));
3009 
3012 
3013 
3014  arg_desc.AddFlag(kArgPrintMdTag, "Include MD tag in SAM report");
3015  arg_desc.AddFlag(kArgNoReadIdTrim, "Do not trim '.1', '/1', '.2', " \
3016  "or '/2' at the end of read ids for SAM format and" \
3017  "paired runs");
3018 
3019  arg_desc.AddFlag(kArgNoUnaligned, "Do not report unaligned reads");
3020 
3021  arg_desc.AddFlag(kArgNoDiscordant,
3022  "Suppress discordant alignments for paired reads");
3023 
3024  arg_desc.AddOptionalKey(kArgUserTag, "tag",
3025  "A user tag to add to each alignment",
3027 
3028  arg_desc.SetCurrentGroup("");
3029 }
3030 
3032  CBlastOptions& opt)
3033 {
3035  string fmt_choice = args[align_format::kArgOutputFormat].AsString();
3036  if (fmt_choice == "sam") {
3037  m_OutputFormat = eSAM;
3038  }
3039  else if (fmt_choice == "tabular") {
3041  }
3042  else if (fmt_choice == "asn") {
3044  }
3045  else {
3046  CNcbiOstrstream os;
3047  os << "'" << fmt_choice << "' is not a valid output format";
3048  string msg = CNcbiOstrstreamToString(os);
3049  NCBI_THROW(CInputException, eInvalidInput, msg);
3050  }
3051 
3053  }
3054 
3055  if (args.Exist(kArgUnalignedFormat) && args[kArgUnalignedFormat]) {
3056  string fmt_choice = args[kArgUnalignedFormat].AsString();
3057  if (fmt_choice == "sam") {
3059  }
3060  else if (fmt_choice == "tabular") {
3062  }
3063  else if (fmt_choice == "fasta") {
3065  }
3066  else {
3067  CNcbiOstrstream os;
3068  os << "'" << fmt_choice
3069  << "' is not a valid output format for unaligned reads";
3070  string msg = CNcbiOstrstreamToString(os);
3071  NCBI_THROW(CInputException, eInvalidInput, msg);
3072  }
3073  }
3074 
3075  m_ShowGis = true;
3076  m_Html = false;
3077 
3078  if (args.Exist(kArgNoReadIdTrim) && args[kArgNoReadIdTrim]) {
3079  m_TrimReadIds = false;
3080  }
3081 
3082  if (args.Exist(kArgNoUnaligned) && args[kArgNoUnaligned]) {
3083  m_PrintUnaligned = false;
3084  }
3085 
3086  if (args.Exist(kArgNoDiscordant) && args[kArgNoDiscordant]) {
3087  m_NoDiscordant = true;
3088  }
3089 
3090  if (args.Exist(kArgFwdRev) && args[kArgFwdRev]) {
3091  m_FwdRev = true;
3092  }
3093 
3094  if (args.Exist(kArgRevFwd) && args[kArgRevFwd]) {
3095  m_RevFwd = true;
3096  }
3097 
3098  if (args.Exist(kArgFwdOnly) && args[kArgFwdOnly]) {
3099  m_FwdOnly = true;
3100  }
3101 
3102  if (args.Exist(kArgRevOnly) && args[kArgRevOnly]) {
3103  m_RevOnly = true;
3104  }
3105 
3107  m_OnlyStrandSpecific = true;
3108  }
3109 
3110  if (args.Exist(kArgPrintMdTag) && args[kArgPrintMdTag]) {
3111  m_PrintMdTag = true;
3112  }
3113 
3114  // only the fast tabular format is able to show merged HSPs with
3115  // common query bases
3116  if (m_OutputFormat != eTabular) {
3117  // FIXME: This is a hack. Merging should be done by the formatter,
3118  // but is currently done by HSP stream writer. This is an easy
3119  // switch until merging is implemented properly.
3120  CNcbiEnvironment().Set("MAPPER_NO_OVERLAPPED_HSP_MERGE", "1");
3121  }
3122 
3123  if (args.Exist(kArgUserTag) && args[kArgUserTag]) {
3124  NStr::Replace(args[kArgUserTag].AsString(), "\\t", "\t", m_UserTag);
3125  }
3126 }
3127 
3128 void
3130 {
3131  // number of threads
3132  arg_desc.SetCurrentGroup("Miscellaneous options");
3133 #ifdef NCBI_THREADS
3134  const int kMinValue = static_cast<int>(CThreadable::kMinNumThreads);
3135  const int kMaxValue = static_cast<int>(CSystemInfo::GetCpuCount());
3136  const int kDfltValue = m_NumThreads != CThreadable::kMinNumThreads
3137  ? std::min<int>(static_cast<int>(m_NumThreads), kMaxValue) : kMinValue;
3138 
3139  arg_desc.AddDefaultKey(kArgNumThreads, "int_value",
3140  "Number of threads (CPUs) to use in the BLAST search",
3142  NStr::IntToString(kDfltValue));
3143  arg_desc.SetConstraint(kArgNumThreads,
3144  new CArgAllowValuesGreaterThanOrEqual(kMinValue));
3145  arg_desc.SetDependency(kArgNumThreads,
3147  kArgRemote);
3148 
3149  if (m_MTMode >= 0) {
3150  arg_desc.AddDefaultKey(kArgMTMode, "int_value",
3151  "Multi-thread mode to use in BLAST search:\n "
3152  "0 auto split by database or queries \n "
3153  "1 split by queries\n "
3154  "2 split by database",
3156  NStr::IntToString(0));
3157  arg_desc.SetConstraint(kArgMTMode,
3158  new CArgAllowValuesBetween(0, 2, true));
3159  arg_desc.SetDependency(kArgMTMode,
3161  kArgNumThreads);
3162  }
3163  /*
3164  arg_desc.SetDependency(kArgNumThreads,
3165  CArgDescriptions::eExcludes,
3166  kArgUseIndex);
3167  */
3168 #endif
3169  arg_desc.SetCurrentGroup("");
3170 }
3171 
3173 {
3175 }
3176 
3177 
3178 void
3180 {
3182 }
3183 void
3185 {
3186  const int kMaxValue = static_cast<int>(CSystemInfo::GetCpuCount());
3187 
3188  if (args.Exist(kArgNumThreads) &&
3189  args[kArgNumThreads].HasValue()) { // could be cancelled by the exclusion in CRemoteArgs
3190 
3191  // use the minimum of the two: user requested number of threads and
3192  // number of available CPUs for number of threads
3193  int num_threads = args[kArgNumThreads].AsInteger();
3194  if (num_threads > kMaxValue) {
3195  m_NumThreads = kMaxValue;
3196 
3197  ERR_POST(Warning << (string)"Number of threads was reduced to " +
3198  NStr::IntToString((unsigned int)m_NumThreads) +
3199  " to match the number of available CPUs");
3200  }
3201  else {
3202  m_NumThreads = num_threads;
3203  }
3204 
3205  // This is temporarily ignored (per SB-635)
3206  if (args.Exist(kArgSubject) && args[kArgSubject].HasValue() &&
3209  string opt = kArgNumThreads;
3210  if (args.Exist(kArgMTMode) &&
3211  (args[kArgMTMode].AsInteger() == CMTArgs::eSplitByQueries)) {
3213  opt += " and " + kArgMTMode;
3214  }
3215  ERR_POST(Warning << "'" << opt << "' is currently "
3216  << "ignored when '" << kArgSubject << "' is specified.");
3217  return;
3218  }
3219  }
3220  if (args.Exist(kArgMTMode) && args[kArgMTMode].HasValue()) {
3221  m_MTMode = (EMTMode) args[kArgMTMode].AsInteger();
3222  }
3223 
3224 }
3225 
/// Register the kArgRemote flag ("Execute search remotely?") in the
/// "Miscellaneous options" group, then reset the current group to the
/// unnamed one.
3226 void
3228 {
3229  arg_desc.SetCurrentGroup("Miscellaneous options");
3230  arg_desc.AddFlag(kArgRemote, "Execute search remotely?", true);
3231 
3232  arg_desc.SetCurrentGroup("");
3233 }
3234 
/// Record in m_IsRemote whether the kArgRemote flag was given.
/// m_IsRemote is left unchanged when the argument is not defined at all.
3235 void
3237 {
3238  if (args.Exist(kArgRemote)) {
3239  m_IsRemote = static_cast<bool>(args[kArgRemote]);
3240  }
3241 }
3242 
/// Register the debugging flags "verbose", "remote_verbose" and
/// "use_test_remote_service" in the "Miscellaneous options" group.
/// The body is compiled only when _BLAST_DEBUG is set; otherwise this
/// function registers nothing.
3243 void
3245 {
3246 #if _BLAST_DEBUG
3247  arg_desc.SetCurrentGroup("Miscellaneous options");
3248  arg_desc.AddFlag("verbose", "Produce verbose output (show BLAST options)",
3249  true);
3250  arg_desc.AddFlag("remote_verbose",
3251  "Produce verbose output for remote searches", true);
3252  arg_desc.AddFlag("use_test_remote_service",
3253  "Send remote requests to test servers", true);
3254  arg_desc.SetCurrentGroup("");
3255 #endif /* _BLAST_DEBUG */
3256 }
3257 
3258 void
3260 {
3261 #if _BLAST_DEBUG
3262  m_DebugOutput = static_cast<bool>(args["verbose"]);
3263  m_RmtDebugOutput = static_cast<bool>(args["remote_verbose"]);
3264  if (args["use_test_remote_service"]) {
3267  "blast4_test");
3268  }
3269 #endif /* _BLAST_DEBUG */
3270 }
3271 
3272 void
3274 {
3275  // culling limit
3276  arg_desc.SetCurrentGroup("Restrict search or results");
3277  arg_desc.AddOptionalKey(kArgCullingLimit, "int_value",
3278  "If the query range of a hit is enveloped by that of at "
3279  "least this many higher-scoring hits, delete the hit",
3282  // best hit algorithm arguments
3284 
3285  arg_desc.AddOptionalKey(kArgBestHitOverhang, "float_value",
3286  "Best Hit algorithm overhang value "
3287  "(recommended value: " +
3289  ")",
3297 
3298  arg_desc.AddOptionalKey(kArgBestHitScoreEdge, "float_value",
3299  "Best Hit algorithm score edge value "
3300  "(recommended value: " +
3302  ")",
3310  arg_desc.AddFlag(kArgSubjectBestHit, "Turn on best hit per subject sequence", true);
3311 
3312  arg_desc.SetCurrentGroup("");
3313 }
3314 
/// Copy the HSP filtering arguments that were given on the command line
/// into the BLAST options; arguments that were not given leave the
/// corresponding option untouched.
///
/// @param opts  options object updated with the culling limit, best-hit
///              overhang, best-hit score edge and subject-best-hit settings
3315 void
3317  CBlastOptions& opts)
3318 {
3319  if (args[kArgCullingLimit]) {
3320  opts.SetCullingLimit(args[kArgCullingLimit].AsInteger());
3321  }
3322  if (args[kArgBestHitOverhang]) {
3323  opts.SetBestHitOverhang(args[kArgBestHitOverhang].AsDouble());
3324  }
3325  if (args[kArgBestHitScoreEdge]) {
3326  opts.SetBestHitScoreEdge(args[kArgBestHitScoreEdge].AsDouble());
3327  }
      // Subject best hit is a plain switch: it is only ever turned on here.
3328  if (args[kArgSubjectBestHit]) {
3329  opts.SetSubjectBestHit();
3330  }
3331 }
3332 
3333 void
3335 {
3336  arg_desc.SetCurrentGroup("General search options");
3337  arg_desc.AddDefaultKey(
3338  kArgUseIndex, "boolean",
3339  "Use MegaBLAST database index",
3341  arg_desc.AddOptionalKey(
3342  kArgIndexName, "string",
3343  "MegaBLAST database index name (deprecated; use only for old style indices)",
3345  arg_desc.SetCurrentGroup( "" );
3346 }
3347 
/// Report whether either database index argument carries a value.
///
/// @return true if kArgUseIndex or kArgIndexName is defined and has a value,
///         false otherwise
3348 bool
3350 {
3351  if ( (args.Exist(kArgUseIndex) && args[kArgUseIndex].HasValue()) ||
3352  (args.Exist(kArgIndexName) && args[kArgIndexName].HasValue()) ) {
3353  return true;
3354  }
3355  return false;
3356 }
3357 
/// Decide whether a MegaBLAST database index is used and, if so, pass its
/// name to the BLAST options.
///
/// Nothing is done when kArgUseIndex is not defined or when a subject
/// (kArgSubject) was given. Otherwise indexing starts out enabled and:
///  - an explicit true value of kArgUseIndex additionally forces the index,
///  - an explicit false value disables it,
///  - a task (kTask) other than "megablast" disables it.
/// The index name is kArgIndexName when given (marked as an old style
/// index), else the kArgDb value.
///
/// @param opts  options object that receives SetUseIndex()
/// @throw CInputException (eInvalidInput) if indexing is enabled but neither
///        an index name nor a database was given
3358 void
3360  CBlastOptions& opts)
3361 {
3362  // MB Index does not apply to Blast2Sequences
3363  if( args.Exist( kArgUseIndex ) &&
3364  !(args.Exist( kArgSubject ) && args[kArgSubject])) {
3365 
3366  bool use_index = true;
3367  bool force_index = false;
3368  bool old_style_index = false;
3369 
3370  if( args[kArgUseIndex] ) {
3371  if( args[kArgUseIndex].AsBoolean() ) force_index = true;
3372  else use_index = false;
3373  }
3374 
      // This check runs after the one above, so a non-megablast task
      // disables the index even when it was explicitly requested.
3375  if( args.Exist( kTask ) && args[kTask] &&
3376  args[kTask].AsString() != "megablast" ) {
3377  use_index = false;
3378  }
3379 
3380  if( use_index ) {
3381  string index_name;
3382 
      // An explicit index name takes precedence over the database name.
3383  if( args.Exist( kArgIndexName ) && args[kArgIndexName] ) {
3384  index_name = args[kArgIndexName].AsString();
3385  old_style_index = true;
3386  }
3387  else if( args.Exist( kArgDb ) && args[kArgDb] ) {
3388  index_name = args[kArgDb].AsString();
3389  }
3390  else {
3391  NCBI_THROW(CInputException, eInvalidInput,
3392  "Can not deduce database index name" );
3393  }
3394 
3395  opts.SetUseIndex( true, index_name, force_index, old_style_index );
3396  }
3397  }
3398 }
3399 
/// Register the query-input and report-output arguments with `arg_desc`.
/// The query file goes under "Input query options" (plus the SRA accession
/// key when m_SRAaccessionEnabled is set); the output file and, when
/// m_GzipEnabled is set, the gzip flag go under "General search options".
/// The current group is reset to "" before returning.
/// NOTE(review): listing lines 3401, 3408, 3413-3415 and 3424-3425 (the
/// declarator and the trailing arguments of the Add* calls) are missing from
/// this extract.
3400 void
3402 {
3403  arg_desc.SetCurrentGroup("Input query options");
3404 
3405  // query filename
3406  arg_desc.AddDefaultKey(kArgQuery, "input_file",
3407  "Input file name",
3409  // for now it's either -query or -sra
3410  if( m_SRAaccessionEnabled ) {
3411  arg_desc.AddOptionalKey(kArgSraAccession, "accession",
3412  "Comma-separated SRA accessions",
3416  kArgQuery);
3417  }
3418 
3419  arg_desc.SetCurrentGroup("General search options");
3420 
3421  // report output file
3422  arg_desc.AddDefaultKey(kArgOutput, "output_file",
3423  "Output file name",
3426 
3427  if (m_GzipEnabled) {
3428  arg_desc.AddFlag(kArgOutputGzip, "Output will be compressed");
3429  }
3430 
3431  arg_desc.SetCurrentGroup("");
3432 }
3433 
/// Bind the input, output and unaligned-output streams from the command line.
/// The BLAST options parameter is unused (its name is commented out).
/// The input stream is only chosen when a query value exists and
/// m_InputStream is still NULL, so a stream set earlier is left alone.
/// NOTE(review): listing lines 3435, 3443, 3445-3446, 3454, 3456-3457, 3466
/// and 3468-3469 are missing from this extract; they hold the declarator and
/// the statements of the ".gz"/gzip branches (presumably wrapping the file
/// streams in (de)compression streams - confirm against the full source).
3434 void
3436  CBlastOptions& /* opt */)
3437 {
3438  if (args.Exist(kArgQuery) && args[kArgQuery].HasValue() &&
3439  m_InputStream == NULL) {
3440 
3441  // ".gz" suffix is matched case-insensitively, and only if gzip is enabled
3441  if (m_GzipEnabled &&
3442  NStr::EndsWith(args[kArgQuery].AsString(), ".gz", NStr::eNocase)) {
3444  args[kArgQuery].AsInputFile(),
3447  }
3448  else {
3449  m_InputStream = &args[kArgQuery].AsInputFile();
3450  }
3451  }
3452 
3453  if (args.Exist(kArgOutputGzip) && args[kArgOutputGzip]) {
3455  args[kArgOutput].AsOutputFile(),
3458  }
3459  else {
3460  m_OutputStream = &args[kArgOutput].AsOutputFile();
3461  }
3462 
3463  // stream for unaligned reads in magicblast
3464  if (args.Exist(kArgUnalignedOutput) && args[kArgUnalignedOutput]) {
3465  if (args.Exist(kArgOutputGzip) && args[kArgOutputGzip]) { // same gzip flag as the main output
3467  args[kArgUnalignedOutput].AsOutputFile(),
3470  }
3471  else {
3472  m_UnalignedOutputStream = &args[kArgUnalignedOutput].AsOutputFile();
3473  }
3474  }
3475 }
3476 
/// Return the input stream held in m_InputStream.
/// Calls abort() if m_InputStream is null, i.e. if no input stream has been
/// set up yet; this is treated as a programming error, not a user error.
/// NOTE(review): listing line 3478 (the declarator) is missing from this
/// extract.
3477 CNcbiIstream&
3479 {
3480  // programmer must ensure the ExtractAlgorithmOptions method is called
3481  // before this method is invoked
3482  if ( !m_InputStream ) {
3483  abort();
3484  }
3485  return *m_InputStream;
3486 }
3487 
/// Return the output stream held in m_OutputStream.
/// NOTE(review): listing lines 3489 (the declarator) and 3493 are missing from
/// this extract; 3493 is presumably a null check on m_OutputStream - confirm.
/// As shown here the pointer is dereferenced without a visible check.
3488 CNcbiOstream&
3490 {
3491  // programmer must ensure the ExtractAlgorithmOptions method is called
3492  // before this method is invoked
3494  return *m_OutputStream;
3495 }
3496 
// NOTE(review): listing lines 3498 (the declarator) and 3500-3501 (the whole
// body) are missing from this extract, so this function's name and behaviour
// cannot be read here; consult the full source before editing.
3497 void
3499 {
3502 }
3503 
/// Register the search-strategy arguments under the "Search strategy options"
/// group: one file name to read a strategy from and one file name to record
/// the strategy used. The current group is reset to "" before returning.
/// NOTE(review): listing lines 3505, 3509, 3512-3513 and 3516-3519 (the
/// declarator, the Add* call heads and their trailing arguments) are missing
/// from this extract.
3504 void
3506 {
3507  arg_desc.SetCurrentGroup("Search strategy options");
3508 
3510  "filename",
3511  "Search strategy to use",
3514  "filename",
3515  "File name to record the search strategy used",
3520 
3521  arg_desc.SetCurrentGroup("");
3522 }
3523 
/// Intentionally empty: the body contains no statements, so no options are
/// changed by this function.
/// NOTE(review): listing line 3525 (function name and first parameter) is
/// missing from this extract.
3524 void
3526  CBlastOptions& /* options */)
3527 {
3528 }
3529 
/// Return the stream to read a saved search strategy from.
/// @return address of the input file stream for kArgInputSearchStrategy when
///         that argument is defined in `args` and has a value; NULL otherwise.
/// NOTE(review): listing line 3531 (the declarator) is missing from this
/// extract.
3530 CNcbiIstream*
3532 {
3533  CNcbiIstream* retval = NULL;
3534  if (args.Exist(kArgInputSearchStrategy) &&
3535  args[kArgInputSearchStrategy].HasValue()) {
3536  retval = &args[kArgInputSearchStrategy].AsInputFile();
3537  }
3538  return retval;
3539 }
3540 
/// Return the stream to write the search strategy to.
/// @return address of the output file stream for kArgOutputSearchStrategy when
///         that argument is defined in `args` and has a value; NULL otherwise.
/// NOTE(review): listing line 3542 (the declarator) is missing from this
/// extract.
3541 CNcbiOstream*
3543 {
3544  CNcbiOstream* retval = NULL;
3545  if (args.Exist(kArgOutputSearchStrategy) &&
3546  args[kArgOutputSearchStrategy].HasValue()) {
3547  retval = &args[kArgOutputSearchStrategy].AsOutputFile();
3548  }
3549  return retval;
3550 }
3551 
// NOTE(review): listing lines 3552 (the declarator) and 3554-3555 are missing
// from this extract; presumably this is the CBlastAppArgs constructor body -
// confirm. The only visible statement starts the object in gapped mode.
3553 {
3556  m_IsUngapped = false;
3557 }
3558 
// NOTE(review): listing lines 3559-3560 (the declarator) and 3562 (the only
// body line) are missing from this extract; presumably this is
// CBlastAppArgs::SetCommandLine - confirm against the full source.
3561 {
3563 }
3564 
// Build (or update) the options handle from the command line.
// Two paths:
//  * m_OptsHandle already set: argument objects are applied to it, except
//    that CMbIndexArgs / CBlastDatabaseArgs objects are applied only when the
//    matching HasBeenSet(args) returned true; m_OptsHandle is returned.
//  * otherwise: a new handle is created via x_CreateOptionsHandle() and every
//    argument object is applied to it.
// On both paths m_IsUngapped is refreshed and Validate() is called; a
// CBlastException from Validate() is rethrown as CInputException
// (eInvalidInput) with the same message.
// NOTE(review): listing lines 3565-3566, 3574, 3578, 3584, 3600-3601, 3606 and
// 3612 (the declarator, the `opts` declaration, both loop headers and a few
// statements) are missing from this extract.
3567 {
3568  // We're recovering from a saved strategy or combining
3569  // CBlastOptions/CBlastOptionsHandle with command line options (in GBench,
3570  // see GB-1116), so we need to still extract
3571  // certain options from the command line, including overriding query
3572  // and/or database
3573  if (m_OptsHandle.NotEmpty()) {
3575  //opts.DebugDumpText(cerr, "OptionsBeforeLoop", 1);
3576  const bool mbidxargs_set = CMbIndexArgs::HasBeenSet(args);
3577  const bool dbargs_set = CBlastDatabaseArgs::HasBeenSet(args);
3579  if (dynamic_cast<CMbIndexArgs*>(&**arg)) {
3580  if (mbidxargs_set)
3581  (*arg)->ExtractAlgorithmOptions(args, opts);
3582  } else if (dynamic_cast<CBlastDatabaseArgs*>(&**arg)) {
3583  if (dbargs_set)
3585  } else {
3586  (*arg)->ExtractAlgorithmOptions(args, opts);
3587  }
3588  }
3589  m_IsUngapped = !opts.GetGappedMode();
3590  try { m_OptsHandle->Validate(); }
3591  catch (const CBlastException& e) {
3592  NCBI_THROW(CInputException, eInvalidInput, e.GetMsg());
3593  }
3594  //opts.DebugDumpText(cerr, "OptionsAfterLoop", 1);
3595  return m_OptsHandle;
3596  }
3597 
3598  CBlastOptions::EAPILocality locality =
3599  (args.Exist(kArgRemote) && args[kArgRemote])
3602 
3603  // This is needed as a CRemoteBlast object and its options are instantiated
3604  // to create the search strategy
3605  if (GetExportSearchStrategyStream(args) ||
3607  locality = CBlastOptions::eBoth;
3608  }
3609 
3610  CRef<CBlastOptionsHandle> retval(x_CreateOptionsHandle(locality, args));
3611  CBlastOptions& opts = retval->SetOptions();
3613  (*arg)->ExtractAlgorithmOptions(args, opts);
3614  }
3615 
3616  m_IsUngapped = !opts.GetGappedMode();
3617  try { retval->Validate(); }
3618  catch (const CBlastException& e) {
3619  NCBI_THROW(CInputException, eInvalidInput, e.GetMsg());
3620  }
3621  return retval;
3622 }
3623 
/// Store `task` in m_Task.
/// ThrowIfInvalidTask(task) is called first only when _BLAST_DEBUG is
/// non-zero; in other builds the value is stored without that call.
3624 void CBlastAppArgs::SetTask(const string& task)
3625 {
3626 #if _BLAST_DEBUG
3627  ThrowIfInvalidTask(task);
3628 #endif
3629  m_Task.assign(task);
3630 }
3631 
3632 /// Get the input stream
/// Forwards to m_StdCmdLineArgs->GetInputStream().
/// NOTE(review): listing line 3633 (the declarator) is missing from this
/// extract.
3634  return m_StdCmdLineArgs->GetInputStream();
3635 }
3636 /// Get the output stream
/// NOTE(review): listing lines 3637-3638 (the declarator and the return
/// statement) are missing from this extract.
3639 }
3640 
// Create a CArgDescriptions object and let every element of `args` add its
// own argument descriptions to it.
// The returned raw pointer comes from unique_ptr::release(), so ownership
// passes to the caller.
// NOTE(review): listing lines 3641-3642 (the declarator) are missing from
// this extract.
3643 {
3644  unique_ptr<CArgDescriptions> retval(new CArgDescriptions);
3645 
3646  // Create the groups so that the ordering is established
3647  retval->SetCurrentGroup("Input query options");
3648  retval->SetCurrentGroup("General search options");
3649  retval->SetCurrentGroup("BLAST database options");
3650  retval->SetCurrentGroup("BLAST-2-Sequences options");
3651  retval->SetCurrentGroup("Formatting options");
3652  retval->SetCurrentGroup("Query filtering options");
3653  retval->SetCurrentGroup("Restrict search or results");
3654  retval->SetCurrentGroup("Discontiguous MegaBLAST options");
3655  retval->SetCurrentGroup("Statistical options");
3656  retval->SetCurrentGroup("Search strategy options");
3657  retval->SetCurrentGroup("Extension options");
3658  retval->SetCurrentGroup(""); // leave no group selected for the callbacks below
3659 
3660 
3661  NON_CONST_ITERATE(TBlastCmdLineArgs, arg, args) {
3662  (*arg)->SetArgumentDescriptions(*retval);
3663  }
3664  return retval.release();
3665 }
3666 
// Record `task` via SetTask() and create the options handle for it with
// CBlastOptionsFactory::CreateTask(GetTask(), locality).
// `task` must be non-empty and the factory result non-empty; both are checked
// with _ASSERT only.
// NOTE(review): listing lines 3667-3668 (return type and function name) and
// 3672 (presumably the declaration of `retval`) are missing from this extract.
3669  (CBlastOptions::EAPILocality locality, const string& task)
3670 {
3671  _ASSERT(!task.empty());
3673  SetTask(task);
3674  retval.Reset(CBlastOptionsFactory::CreateTask(GetTask(), locality));
3675  _ASSERT(retval.NotEmpty());
3676  return retval;
3677 }
3678 
/// Warn about command-line arguments that cannot take effect when a search
/// strategy is in use.
/// Two tables drive the check: `can_override` (argument names that may be
/// given freely) and `has_defaults` (argument name -> default value; giving
/// exactly the default is silently accepted). Every argument returned by
/// args.GetAll() is tested against both and warnings go through ERR_POST.
/// NOTE(review): listing lines 3680, 3711, 3713, 3715, 3727, 3729, 3733, 3735,
/// 3738, 3741-3743, 3746, 3748 and 3750 are missing from this extract: the
/// declarator, the declaration of `prog`, and the right-hand sides of several
/// has_defaults assignments.
3679 void
3681 {
3682  set<string> can_override;
3683  can_override.insert(kArgQuery);
3684  can_override.insert(kArgQueryLocation);
3685  can_override.insert(kArgSubject);
3686  can_override.insert(kArgSubjectLocation);
3687  can_override.insert(kArgUseLCaseMasking);
3688  can_override.insert(kArgDb);
3689  can_override.insert(kArgDbSize);
3690  can_override.insert(kArgEntrezQuery);
3691  can_override.insert(kArgDbSoftMask);
3692  can_override.insert(kArgDbHardMask);
3693  can_override.insert(kArgUseIndex);
3694  can_override.insert(kArgIndexName);
3695  can_override.insert(kArgStrand);
3696  can_override.insert(kArgParseDeflines);
3697  can_override.insert(kArgOutput);
3698  can_override.insert(kArgOutputFormat);
3699  can_override.insert(kArgNumDescriptions);
3700  can_override.insert(kArgNumAlignments);
3701  can_override.insert(kArgMaxTargetSequences);
3702  can_override.insert(kArgRemote);
3703  can_override.insert(kArgNumThreads);
3704  can_override.insert(kArgInputSearchStrategy);
3705  can_override.insert(kArgRemote); // repeated insert of kArgRemote (see 3702); redundant
3706  can_override.insert("remote_verbose");
3707  can_override.insert("verbose");
3708 
3709  // this stores the arguments (and their defaults) that cannot be overridden
3710  map<string, string> has_defaults;
3712  has_defaults[kArgCompBasedStats] =
3714  // FIX the line below for igblast, and add igblast options
3716  has_defaults[kTask] = m_Task;
3717  has_defaults[kArgOldStyleIndex] = kDfltArgOldStyleIndex;
3718 
3719  if (Blast_QueryIsProtein(prog)) {
3720  if (NStr::Find(m_Task, "blastp") != NPOS ||
3721  NStr::Find(m_Task, "psiblast") != NPOS) {
3722  has_defaults[kArgSegFiltering] = kDfltArgNoFiltering;
3723  } else {
3724  has_defaults[kArgSegFiltering] = kDfltArgSegFiltering;
3725  }
3726  has_defaults[kArgLookupTableMaskingOnly] =
3728  has_defaults[kArgGapTrigger] =
3730  } else {
3731  has_defaults[kArgDustFiltering] = kDfltArgDustFiltering;
3732  has_defaults[kArgLookupTableMaskingOnly] =
3734  has_defaults[kArgGapTrigger] =
3736  }
3737  has_defaults[kArgOffDiagonalRange] =
3739  has_defaults[kArgMaskLevel] = kDfltArgMaskLevel;
3740  has_defaults[kArgMaxIntronLength] =
3744  // pssm engine/psiblast default options
3745  has_defaults[kArgPSIPseudocount] =
3747  has_defaults[kArgPSIInclusionEThreshold] =
3749  has_defaults[kArgPSINumIterations] =
3751 
3752  // get arguments, remove the supported ones and warn about those that
3753  // cannot be overridden.
3754  typedef vector< CRef<CArgValue> > TArgs;
3755  TArgs arguments = args.GetAll();
3756  ITERATE(TArgs, a, arguments) {
3757  const string& arg_name = (*a)->GetName();
3758  const string& arg_value = (*a)->AsString();
3759  // if it has a default value, ignore it if it's not different from the
3760  // default, otherwise, issue a warning
3761  if (has_defaults.find(arg_name) != has_defaults.end()) {
3762  if (has_defaults[arg_name] == arg_value) {
3763  continue;
3764  } else {
3765  if (arg_name == kTask && arg_value == "megablast") {
3766  // No need to issue warning here, as it's OK to change this
3767  continue;
3768  }
3769  ERR_POST(Warning << arg_name << " cannot be overridden when "
3770  "using a search strategy");
3771  // NOTE(review): there is no `continue` after this warning, so
3771  // control falls through to the can_override test below. None of
3771  // the has_defaults keys visible in this extract is inserted into
3771  // can_override, so such an argument gets the same warning twice.
3771  // Looks unintended - confirm before changing.
3771  }
3772  }
3773  // if the argument cannot be overridden, issue a warning
3774  if (can_override.find(arg_name) == can_override.end()) {
3775  ERR_POST(Warning << arg_name << " cannot be overridden when "
3776  "using a search strategy");
3777  }
3778  }
3779 }
3780 
// Apply the command-line arguments that may still be honoured to the options
// handle already stored in m_OptsHandle, validate it, and return it.
// Throws CInputException (eInvalidInput) when m_OptsHandle is empty, and
// rethrows a CBlastException from Validate() as CInputException with the
// same message. m_IsUngapped is refreshed from the resulting options.
// NOTE(review): listing lines 3781-3782, 3792, 3795-3797, 3799, 3802, 3805 and
// 3812 (the declarator, the `opts` declaration, several Extract calls and a
// loop header) are missing from this extract.
3783 {
3784  if(m_OptsHandle.Empty())
3785  {
3786  NCBI_THROW(CInputException, eInvalidInput, "Empty Blast Options Handle");
3787  }
3788 
3789  // We're recovering from a saved strategy, so we need to still extract
3790  // certain options from the command line, including overriding query
3791  // and/or database
3793  // invoke ExtractAlgorithmOptions on certain argument classes, i.e.: those
3794  // that should have their arguments overridden
3798  m_DebugArgs->ExtractAlgorithmOptions(args, opts);
3800  m_MTArgs->ExtractAlgorithmOptions(args, opts);
3801  if (CBlastDatabaseArgs::HasBeenSet(args)) {
3803  }
3804  if (CMbIndexArgs::HasBeenSet(args)) {
3806  if (dynamic_cast<CMbIndexArgs*>(arg->GetPointer()) != NULL) {
3807  (*arg)->ExtractAlgorithmOptions(args, opts);
3808  }
3809  }
3810  }
3811  m_IsUngapped = !opts.GetGappedMode();
3813  try { m_OptsHandle->Validate(); }
3814  catch (const CBlastException& e) {
3815  NCBI_THROW(CInputException, eInvalidInput, e.GetMsg());
3816  }
3817  return m_OptsHandle;
3818 }
3819 
3820 END_SCOPE(blast)
User-defined methods of the data storage class.
Declares singleton objects to store the version and reference for the BLAST engine.
USING_SCOPE(objects)
static void s_GetTaxIDList(const string &in, bool isFile, bool isNegativeList, CRef< CSearchDatabase > &sdb, bool isTargetOnly)
static bool s_IsDefaultWordThreshold(EProgram program, double threshold)
Definition: blast_args.cpp:585
static void s_ValidateCustomDelim(string custom_fmt_spec, string customDelim)
static void s_SetCompositionBasedStats(CBlastOptions &opt, const string &comp_stat_string, bool smith_waterman_value, bool *ungapped)
Auxiliary function to set the composition based statistics and smith waterman options.
Definition: blast_args.cpp:822
const char * kTemplType_Coding
Value to specify coding template type.
Definition: blast_args.cpp:686
const char * kTemplType_Optimal
Value to specify optimal template type.
Definition: blast_args.cpp:688
const char * kTemplType_CodingAndOptimal
Value to specify coding+optimal template type.
Definition: blast_args.cpp:690
CArgDescriptions * SetUpCommandLineArguments(TBlastCmdLineArgs &args)
Create a CArgDescriptions object and invoke SetArgumentDescriptions for each of the TBlastCmdLineArgs...
static string s_RegisterOMDataLoader(CRef< CSeqDB > db_handle)
Interface for converting blast-related command line arguments into blast options.
vector< CRef< IBlastCmdLineArgs > > TBlastCmdLineArgs
Type definition of a container of IBlastCmdLineArgs.
Contains C++ wrapper classes to structures in algo/blast/core as well as some auxiliary functions to ...
@ eHardSubjMasking
Definition: blast_def.h:238
@ eSoftSubjMasking
Definition: blast_def.h:237
Declares the BLAST exception class.
Interface for converting sources of sequence data into blast sequence input.
TSeqRange ParseSequenceRange(const string &range_str, const char *error_prefix=NULL)
Parse and extract a sequence range from argument provided to this function.
CRef< objects::CScope > ReadSequencesToBlast(CNcbiIstream &in, bool read_proteins, const TSeqRange &range, bool parse_deflines, bool use_lcase_masking, CRef< CBlastQueryVector > &sequences, bool gaps_to_Ns=false)
Read sequence input for BLAST.
Routines for creating nucleotide BLAST lookup tables.
EDiscWordType
General types of discontiguous word templates.
@ eMBWordOptimal
@ eMBWordCoding
@ eMBWordTwoTemplates
#define PSI_INCLUSION_ETHRESH
Defaults for PSI-BLAST and DELTA-BLAST options.
#define BLAST_HITLIST_SIZE
Number of database sequences to save hits for.
#define BLAST_WORD_THRESHOLD_BLASTX
default threshold (blastx)
Int2 BLAST_GetSuggestedThreshold(EBlastProgramType program_number, const char *matrixName, double *threshold)
Get thresholds for word-finding suggested by Stephen Altschul.
@ eDynProgScoreOnly
standard affine gapping
Int2 BLAST_GetSuggestedWindowSize(EBlastProgramType program_number, const char *matrixName, Int4 *window_size)
Get window sizes for two hit algorithm suggested by Stephen Altschul.
#define BLAST_GAP_TRIGGER_NUCL
default bit score that will trigger a gapped extension for blastn
#define MAX_DB_WORD_COUNT_MAPPER
Default max frequency for a database word.
#define BLAST_EXPECT_VALUE
Default parameters for saving hits.
#define DELTA_INCLUSION_ETHRESH
Inclusion threshold for DELTA-BLAST.
#define BLAST_WORD_THRESHOLD_BLASTP
neighboring word score thresholds; a threshold of zero means that only query and subject words that m...
#define BLAST_GAP_TRIGGER_PROT
default bit score that will trigger gapped extension
#define PSI_PSEUDO_COUNT_CONST
Pseudo-count constant for PSI-BLAST.
@ eDynProgTbck
standard affine gapping
Int2 PSIBlastOptionsNew(PSIBlastOptions **psi_options)
Initialize default options for PSI BLAST.
#define BLAST_GENETIC_CODE
Default genetic code for query and/or database.
#define BLAST_WORD_THRESHOLD_TBLASTN
default neighboring threshold (tblastn/rpstblastn)
@ eCompressedAaLookupTable
compressed alphabet (blastp) lookup table
Boolean Blast_SubjectIsNucleotide(EBlastProgramType p)
Returns true if the subject is nucleotide.
Definition: blast_program.c:53
Boolean Blast_QueryIsNucleotide(EBlastProgramType p)
Returns true if the query is nucleotide.
Definition: blast_program.c:43
Boolean Blast_QueryIsProtein(EBlastProgramType p)
Returns true if the query is protein.
Definition: blast_program.c:40
Boolean Blast_ProgramIsRpsBlast(EBlastProgramType p)
Returns true if program is RPS-BLAST (i.e.
Definition: blast_program.c:73
EBlastProgramType
Defines the engine's notion of the different applications of the BLAST algorithm.
Definition: blast_program.h:72
@ eBlastTypeBlastn
Definition: blast_program.h:74
PSIDiagnosticsRequest * PSIDiagnosticsRequestNewEx(Boolean save_ascii_pssm)
Allocates a PSIDiagnosticsRequest structure, setting fields to their default values for their use in ...
Definition: blast_psi.c:591
Int2 BLAST_GetProteinGapExistenceExtendParams(const char *matrixName, Int4 *gap_existence, Int4 *gap_extension)
Extract the recommended gap existence and extension values.
Definition: blast_stat.c:3374
EProgram
This enumeration is to evolve into a task/program specific list that specifies sets of default parame...
Definition: blast_types.hpp:56
@ eTblastx
Translated nucl-Translated nucl.
Definition: blast_types.hpp:62
@ eBlastn
Nucl-Nucl (traditional blastn)
Definition: blast_types.hpp:58
@ eRPSBlast
protein-pssm (reverse-position-specific BLAST)
Definition: blast_types.hpp:63
@ eBlastp
Protein-Protein.
Definition: blast_types.hpp:59
@ ePSIBlast
PSI Blast.
Definition: blast_types.hpp:67
@ eTblastn
Protein-Translated nucl.
Definition: blast_types.hpp:61
@ eDeltaBlast
Delta Blast.
Definition: blast_types.hpp:71
@ ePSITblastn
PSI Tblastn.
Definition: blast_types.hpp:68
@ eRPSTblastn
nucleotide-pssm (RPS blast with translated query)
Definition: blast_types.hpp:64
@ eBlastx
Translated nucl-Protein.
Definition: blast_types.hpp:60
Auxiliary class to validate the genetic code input.
Definition: blast_args.cpp:996
virtual string GetUsage(void) const
Overloaded method from CArgAllow.
virtual bool Verify(const string &value) const
Overloaded method from CArgAllow.
Definition: blast_args.cpp:999
Class to constrain the length of the file name passed to a given CArgDescriptions key.
Class to constrain the values of an argument to those in between the values specified in the construc...
Class to constrain the values of an argument to those greater than or equal to the value specified in...
Class to constrain the values of an argument to those less than or equal to the value specified in th...
CArgAllow_Doubles –.
Definition: ncbiargs.hpp:1781
CArgAllow_Strings –.
Definition: ncbiargs.hpp:1641
CArgAllow –.
Definition: ncbiargs.hpp:1488
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgs –.
Definition: ncbiargs.hpp:379
Auxiliary class to store the name of an output file, which is reset every time its GetStream method i...
CRef< CRemoteArgs > m_RemoteArgs
remote vs. local execution options
CRef< CBlastOptionsHandle > SetOptionsForSavedStrategy(const CArgs &args)
Combine the command line arguments into a CBlastOptions object recovered from saved search strategy.
string GetTask() const
Get the task for this object.
virtual CNcbiIstream & GetInputStream()
Get the input stream.
CRef< CBlastOptionsHandle > m_OptsHandle
The BLAST options handle, only non-NULL if assigned via SetOptionsHandle, i.e.
CRef< CQueryOptionsArgs > m_QueryOptsArgs
query options object
CRef< CBlastDatabaseArgs > m_BlastDbArgs
database/subject object
virtual CRef< CBlastOptionsHandle > x_CreateOptionsHandle(CBlastOptions::EAPILocality locality, const CArgs &args)=0
Create the options handle based on the command line arguments.
CRef< CBlastOptionsHandle > SetOptions(const CArgs &args)
Extract the command line arguments into a CBlastOptionsHandle object.
CRef< CSearchStrategyArgs > m_SearchStrategyArgs
arguments for dealing with search strategies
string m_Task
Task specified in the command line.
CRef< CDebugArgs > m_DebugArgs
Debugging arguments.
CRef< CBlastOptionsHandle > x_CreateOptionsHandleWithTask(CBlastOptions::EAPILocality locality, const string &task)
Creates the BLAST options handle based on the task argument.
CBlastAppArgs()
Default constructor.
CRef< CMTArgs > m_MTArgs
multi-threaded options
CArgDescriptions * SetCommandLine()
Set the command line arguments.
CRef< CFormattingArgs > m_FormattingArgs
formatting options
void x_IssueWarningsForIgnoredOptions(const CArgs &args)
Issue warnings when recovering from a search strategy (command line applications only)
bool m_IsUngapped
Is this application being run ungapped.
TBlastCmdLineArgs m_Args
Set of command line argument objects.
CNcbiOstream * GetExportSearchStrategyStream(const CArgs &args)
Get the output stream for the search strategy.
void SetTask(const string &task)
Set the task for this object.
virtual CNcbiOstream & GetOutputStream()
Get the output stream.
CRef< CStdCmdLineArgs > m_StdCmdLineArgs
standard command line arguments class
Argument class to collect database/subject arguments.
Definition: blast_args.hpp:882
CBlastDatabaseArgs(bool request_mol_type=false, bool is_rpsblast=false, bool is_igblast=false, bool is_mapper=false, bool is_kblast=false)
Constructor.
virtual void ExtractAlgorithmOptions(const CArgs &args, CBlastOptions &opts)
Interface method,.
static bool HasBeenSet(const CArgs &args)
Auxiliary function to determine if the database/subject sequence has been set.
CRef< objects::CScope > m_Scope
CScope object in which all subject sequences read are kept.
Definition: blast_args.hpp:978
virtual void SetArgumentDescriptions(CArgDescriptions &arg_desc)
Interface method,.
bool m_IsMapper
true for short read mapper
Definition: blast_args.hpp:975
bool IsProtein() const
Is the database/subject protein?
Definition: blast_args.hpp:919
bool m_SupportsDatabaseMasking
true if it's supported
Definition: blast_args.hpp:980
static const int kSubjectsDataLoaderPriority
The default priority for subjects, should be used for subjects/databases.
Definition: blast_args.hpp:886
bool m_IsProtein
Is the database/subject(s) protein?
Definition: blast_args.hpp:974
bool m_RequestMoleculeType
Determines whether the database's molecule type should be requested in the command line,...
Definition: blast_args.hpp:967
bool m_IsIgBlast
true if the search is Ig-BLAST
Definition: blast_args.hpp:972
CRef< IQueryFactory > m_Subjects
The subject sequences.
Definition: blast_args.hpp:977
bool m_IsRpsBlast
true if the search is RPS-BLAST
Definition: blast_args.hpp:971
CRef< CSearchDatabase > m_SearchDb
Description of the BLAST database.
Definition: blast_args.hpp:966
bool m_SupportIPGFiltering
true if IPG filtering is supported
Definition: blast_args.hpp:981
bool m_IsKBlast
true for Kblastp
Definition: blast_args.hpp:976
static TRegisterLoaderInfo RegisterInObjectManager(CObjectManager &om, const string &dbname="nr", const EDbType dbtype=eUnknown, bool use_fixed_size_slices=true, CObjectManager::EIsDefault is_default=CObjectManager::eNonDefault, CObjectManager::TPriority priority=CObjectManager::kPriority_NotSet)
Definition: bdbloader.cpp:52
static string GetLoaderNameFromArgs(CConstRef< CSeqDB > db_handle)
Definition: bdbloader.cpp:164
Defines BLAST error codes (user errors included)
Encapsulates ALL the BLAST algorithm's options.
EAPILocality
Enumerates the possible contexts in which objects of this type can be used.
@ eLocal
To be used for running BLAST locally.
@ eRemote
To be used when running BLAST remotely.
Keeps track of the version of the BLAST engine in the NCBI C++ toolkit.
Definition: version.hpp:53
bool m_Is2and3Supported
Are options 2 and 3 supported.
Definition: blast_args.hpp:414
virtual void ExtractAlgorithmOptions(const CArgs &cmd_line_args, CBlastOptions &options)
Interface method,.
Definition: blast_args.cpp:887
string m_ZeroOptDescr
Non standard description for option zero.
Definition: blast_args.hpp:418
string m_DefaultOpt
Default option.
Definition: blast_args.hpp:416
virtual void SetArgumentDescriptions(CArgDescriptions &arg_desc)
Interface method,.
Definition: blast_args.cpp:766
CCompressOStream –.
bool m_DebugOutput
Should debugging (verbose) output be printed.
bool m_RmtDebugOutput
Should debugging (verbose) output be printed for remote BLAST.
virtual void ExtractAlgorithmOptions(const CArgs &cmd_line_args, CBlastOptions &options)
Interface method,.
virtual void SetArgumentDescriptions(CArgDescriptions &arg_desc)
Interface method,.
CDecompressIStream –.
virtual void SetArgumentDescriptions(CArgDescriptions &arg_desc)
Interface method,.
virtual void ExtractAlgorithmOptions(const CArgs &cmd_line_args, CBlastOptions &options)
Interface method,.
CRef< CSearchDatabase > m_DomainDb
Conserved Domain Database.
Definition: blast_args.hpp:743
bool m_ShowDomainHits
Is printing CDD hits requested.
Definition: blast_args.hpp:746
CDirEntry –.
Definition: ncbifile.hpp:262
virtual void ExtractAlgorithmOptions(const CArgs &cmd_line_args, CBlastOptions &options)
Interface method,.
Definition: blast_args.cpp:733
virtual void SetArgumentDescriptions(CArgDescriptions &arg_desc)
Interface method,.
Definition: blast_args.cpp:693
virtual void SetArgumentDescriptions(CArgDescriptions &arg_desc)
Interface method,.
Definition: blast_args.cpp:327
bool m_QueryIsProtein
true if the query is protein
Definition: blast_args.hpp:352
virtual void ExtractAlgorithmOptions(const CArgs &cmd_line_args, CBlastOptions &options)
Interface method,.
Definition: blast_args.cpp:384
void x_TokenizeFilteringArgs(const string &filtering_args, vector< string > &output) const
Auxiliary method to tokenize the filtering string.
Definition: blast_args.cpp:372
bool m_FilterByDefault
Should filtering be applied by default?
Definition: blast_args.hpp:353
Class implements different ad-hoc unreliable file format identifications.
@ eBinaryASN
Binary ASN.1.
@ eTextASN
Text ASN.1.
TSeqPos m_NumDescriptions
Number of 1-line descr. to show.
TSeqPos m_DfltNumDescriptions
Default value for num descriptions.
TSeqPos m_NumAlignments
Number of alignments to show.
virtual bool ArchiveFormatRequested(const CArgs &args) const
virtual void SetArgumentDescriptions(CArgDescriptions &arg_desc)
Interface method,.
EFormatFlags m_FormatFlags
virtual void ParseFormattingString(const CArgs &args, EOutputFormat &fmt_type, string &custom_fmt_spec, string &custom_delim) const
Parses the output format command line option value, returns the requested output format type and any ...
virtual void ExtractAlgorithmOptions(const CArgs &args, CBlastOptions &opts)
Interface method,.
EOutputFormat
Defines the output formats supported by our command line formatter.
@ eEndValue
Sentinel value for error checking.
@ ePairwise
Standard pairwise alignments.
@ eTabular
Tabular output.
@ eSAM
SAM format.
@ eCommaSeparatedValues
Comma-separated values.
@ eAsnText
ASN.1 text output.
@ eArchiveFormat
BLAST archive format.
@ eAirrRearrangement
igblast AIRR rearrangement, 19
@ eFasta
unaligned reads in magicblast
@ eFlatQueryAnchoredNoIdentities
@ eTabularWithComments
Tabular output with comments.
bool m_IsIgBlast
IgBlast has a different default num_alignments.
string m_CustomOutputFormatSpec
The format specification for custom output, e.g.
EOutputFormat m_OutputFormat
Choice of formatting output.
TSeqPos m_DfltNumAlignments
Default value for num alignments.
bool m_ShowGis
Display NCBI GIs?
bool m_Html
Display HTML output?
virtual void ExtractAlgorithmOptions(const CArgs &cmd_line_args, CBlastOptions &options)
Interface method,.
Definition: blast_args.cpp:975
virtual void SetArgumentDescriptions(CArgDescriptions &arg_desc)
Interface method,.
Definition: blast_args.cpp:959
bool m_QueryIsProtein
true if the query is protein
Definition: blast_args.hpp:499
virtual void SetArgumentDescriptions(CArgDescriptions &arg_desc)
Interface method,.
virtual void ExtractAlgorithmOptions(const CArgs &cmd_line_args, CBlastOptions &options)
Interface method,.
virtual void SetArgumentDescriptions(CArgDescriptions &arg_desc)
Interface method,.
Definition: blast_args.cpp:902
virtual void ExtractAlgorithmOptions(const CArgs &cmd_line_args, CBlastOptions &options)
Interface method,.
Definition: blast_args.cpp:919
bool m_QueryIsProtein
true if the query is protein
Definition: blast_args.hpp:320
bool m_IsRpsBlast
true if the search is RPS-BLAST
Definition: blast_args.hpp:321
virtual void ExtractAlgorithmOptions(const CArgs &cmd_line_args, CBlastOptions &options)
Interface method,.
Definition: blast_args.cpp:250
bool m_IsIgBlast
true if the search is igblast
Definition: blast_args.hpp:325
bool m_IsTblastx
true if the search is tblastx
Definition: blast_args.hpp:324
bool m_ShowPercentIdentity
true if the percent identity option should be shown
Definition: blast_args.hpp:322
CGenericSearchArgs(bool query_is_protein=true, bool is_rpsblast=false, bool show_perc_identity=false, bool is_tblastx=false, bool is_igblast=false, bool suppress_sum_stats=false)
Constructor.
Definition: blast_args.hpp:303
virtual void SetArgumentDescriptions(CArgDescriptions &arg_desc)
Interface method,.
Definition: blast_args.cpp:136
bool m_SuppressSumStats
true if search is blastn or blastp
Definition: blast_args.hpp:326
virtual void ExtractAlgorithmOptions(const CArgs &cmd_line_args, CBlastOptions &options)
Interface method,.
virtual void SetArgumentDescriptions(CArgDescriptions &arg_desc)
Interface method,.
ETarget m_Target
Genetic code target.
Definition: blast_args.hpp:479
@ eQuery
Query genetic code.
Definition: blast_args.hpp:460
@ eDatabase
Database genetic code.
Definition: blast_args.hpp:461
virtual void ExtractAlgorithmOptions(const CArgs &args, CBlastOptions &opts)
Interface method,.
virtual void SetArgumentDescriptions(CArgDescriptions &arg_desc)
Interface method,.
CRef< CIgBlastOptions > m_IgOptions
Igblast options to fill.
virtual void ExtractAlgorithmOptions(const CArgs &cmd_line_args, CBlastOptions &options)
Interface method,.
bool m_IsProtein
Is this a protein search?
virtual void SetArgumentDescriptions(CArgDescriptions &arg_desc)
Interface method,.
CRef< objects::CScope > m_Scope
scope to get sequences
Defines user input exceptions.
double m_JDistance
Jaccard distance.
Definition: blast_args.hpp:698
virtual void ExtractAlgorithmOptions(const CArgs &cmd_line_args, CBlastOptions &options)
Interface method,.
int m_CandidateSeqs
Number of candidate sequences to try BLAST on.
Definition: blast_args.hpp:707
virtual void SetArgumentDescriptions(CArgDescriptions &arg_desc)
Interface method,.
int m_MinHits
Minimum number of hits in LSH phase.
Definition: blast_args.hpp:701
virtual void ExtractAlgorithmOptions(const CArgs &cmd_line_args, CBlastOptions &options)
Interface method,.
Definition: blast_args.cpp:946
virtual void SetArgumentDescriptions(CArgDescriptions &arg_desc)
Interface method,.
Definition: blast_args.cpp:930
Interface to create a BlastSeqSrc suitable for use in CORE BLAST from a variety of BLAST database/s...
size_t m_NumThreads
Number of threads to spawn.
void x_ExtractAlgorithmOptions(const CArgs &args)
CMTArgs(size_t default_num_threads=CThreadable::kMinNumThreads, EMTMode mt_mode=eNotSupported)
Default Constructor.
EMTMode m_MTMode
@ eSplitByQueries
virtual void SetArgumentDescriptions(CArgDescriptions &arg_desc)
Interface method,.
virtual void ExtractAlgorithmOptions(const CArgs &cmd_line_args, CBlastOptions &options)
Interface method,.
virtual void SetArgumentDescriptions(CArgDescriptions &arg_desc)
Interface method,.
EOutputFormat m_UnalignedOutputFormat
virtual void ExtractAlgorithmOptions(const CArgs &args, CBlastOptions &opt)
Interface method,.
virtual void SetArgumentDescriptions(CArgDescriptions &arg_desc)
Interface method,.
CNcbiIstream * m_MateInputStream
Definition: blast_args.hpp:873
EInputFormat m_InputFormat
Definition: blast_args.hpp:870
virtual void ExtractAlgorithmOptions(const CArgs &args, CBlastOptions &opt)
Interface method,.
vector< string > m_SraAccessions
Definition: blast_args.hpp:871
unique_ptr< CDecompressIStream > m_DecompressIStream
Definition: blast_args.hpp:874
virtual void ExtractAlgorithmOptions(const CArgs &cmd_line_args, CBlastOptions &options)
Interface method,.
virtual void SetArgumentDescriptions(CArgDescriptions &arg_desc)
Interface method,.
virtual void SetArgumentDescriptions(CArgDescriptions &arg_desc)
Interface method,.
Definition: blast_args.cpp:621
virtual void ExtractAlgorithmOptions(const CArgs &cmd_line_args, CBlastOptions &options)
Interface method,.
Definition: blast_args.cpp:631
Argument class to retrieve megablast database indexing options.
static bool HasBeenSet(const CArgs &args)
Auxiliary function to determine if the megablast database indexing options have been set.
virtual void SetArgumentDescriptions(CArgDescriptions &arg_desc)
Interface method,.
virtual void ExtractAlgorithmOptions(const CArgs &args, CBlastOptions &opts)
Interface method,.
static CNcbiApplication * Instance(void)
Singleton method.
Definition: ncbiapp.cpp:264
CNcbiEnvironment –.
Definition: ncbienv.hpp:110
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
Definition: ncbistre.hpp:802
CNcbiRegistry –.
Definition: ncbireg.hpp:913
virtual void ExtractAlgorithmOptions(const CArgs &cmd_line_args, CBlastOptions &options)
Interface method,.
Definition: blast_args.cpp:668
virtual void SetArgumentDescriptions(CArgDescriptions &arg_desc)
Interface method,.
Definition: blast_args.cpp:639
virtual void ExtractAlgorithmOptions(const CArgs &cmd_line_args, CBlastOptions &options)
Interface method,.
Definition: blast_args.cpp:513
virtual void SetArgumentDescriptions(CArgDescriptions &arg_desc)
Interface method,.
Definition: blast_args.cpp:498
Wrapper class for PSIBlastOptions .
Definition: blast_aux.hpp:330
Wrapper class for PSIDiagnosticsRequest .
Definition: blast_aux.hpp:347
virtual void SetArgumentDescriptions(CArgDescriptions &arg_desc)
Interface method,.
virtual void ExtractAlgorithmOptions(const CArgs &cmd_line_args, CBlastOptions &options)
Interface method,.
virtual void SetArgumentDescriptions(CArgDescriptions &arg_desc)
Interface method,.
Definition: blast_args.cpp:77
string m_ProgDesc
Application's description.
Definition: blast_args.hpp:193
string m_ProgName
Application's name.
Definition: blast_args.hpp:192
CProgramDescriptionArgs(const string &program_name, const string &program_description)
Constructor.
Definition: blast_args.cpp:71
virtual void SetArgumentDescriptions(CArgDescriptions &arg_desc)
Interface method,.
bool m_SaveLastPssm
Save PSSM after the last database search.
Definition: blast_args.hpp:635
CRef< CAutoOutputFileReset > m_AsciiMatrixOutput
ASCII matrix output file.
Definition: blast_args.hpp:627
bool m_IsDeltaBlast
Are the arguments set up for Delta Blast.
Definition: blast_args.hpp:632
@ eProteinDb
Traditional, iterated PSI-BLAST.
Definition: blast_args.hpp:544
@ eNucleotideDb
PSI-Tblastn, non-iterated.
Definition: blast_args.hpp:545
CRef< CAutoOutputFileReset > m_CheckPointOutput
checkpoint output file
Definition: blast_args.hpp:625
ETargetDatabase m_DbTarget
Molecule of the database.
Definition: blast_args.hpp:621
CRef< objects::CPssmWithParameters > x_CreatePssmFromMsa(CNcbiIstream &input_stream, CBlastOptions &opt, bool save_ascii_pssm, unsigned int msa_master_idx, bool ignore_pssm_tmpl_seq)
Auxiliary function to create a PSSM from a multiple sequence alignment file.
virtual void ExtractAlgorithmOptions(const CArgs &cmd_line_args, CBlastOptions &options)
Interface method,.
CRef< objects::CPssmWithParameters > m_Pssm
PSSM.
Definition: blast_args.hpp:629
size_t m_NumIterations
number of iterations to perform
Definition: blast_args.hpp:623
This class is a concrete strategy for IPssmInputData which converts the CLUSTALW-style output contain...
bool m_IsDeltaBlast
Are these arguments for Delta Blast.
Definition: blast_args.hpp:519
virtual void SetArgumentDescriptions(CArgDescriptions &arg_desc)
Interface method,.
virtual void ExtractAlgorithmOptions(const CArgs &cmd_line_args, CBlastOptions &options)
Interface method,.
Computes a PSSM as specified in PSI-BLAST.
virtual void ExtractAlgorithmOptions(const CArgs &cmd_line_args, CBlastOptions &options)
Interface method,.
bool m_UseLCaseMask
use lowercase masking in FASTA input
Definition: blast_args.hpp:808
virtual void SetArgumentDescriptions(CArgDescriptions &arg_desc)
Interface method,.
objects::ENa_strand m_Strand
Strand(s) to search.
Definition: blast_args.hpp:804
TSeqRange m_Range
range to restrict the query sequence(s)
Definition: blast_args.hpp:806
bool m_ParseDeflines
Should the deflines be parsed?
Definition: blast_args.hpp:810
bool m_QueryCannotBeNucl
only false for blast[xn] and tblastx; true in case of PSI-BLAST
Definition: blast_args.hpp:814
virtual void SetArgumentDescriptions(CArgDescriptions &arg_desc)
Interface method,.
Definition: blast_args.cpp:524
virtual void ExtractAlgorithmOptions(const CArgs &cmd_line_args, CBlastOptions &options)
Interface method,.
Definition: blast_args.cpp:551
bool m_IsRemote
Should the search be executed remotely?
virtual void SetArgumentDescriptions(CArgDescriptions &arg_desc)
Interface method,.
virtual void ExtractAlgorithmOptions(const CArgs &cmd_line_args, CBlastOptions &options)
Interface method,.
CScope –.
Definition: scope.hpp:92
Blast Search Subject.
Argument class to import/export the search strategy.
Definition: blast_args.hpp:524
virtual void ExtractAlgorithmOptions(const CArgs &cmd_line_args, CBlastOptions &options)
Interface method,.
CNcbiIstream * GetImportStream(const CArgs &args) const
Get the input stream for the search strategy.
CNcbiOstream * GetExportStream(const CArgs &args) const
Get the output stream for the search strategy.
virtual void SetArgumentDescriptions(CArgDescriptions &arg_desc)
Interface method,.
CSeqDBFileGiList.
CSeqDBGiList.
void AddTaxIds(const set< TTaxId > &tax_ids)
EStatType
Counts statistics formats.
static EStatType DiscoverStatType(string const &name)
Return the format of the counts statistics file.
Root class for all serialization exceptions.
Definition: exception.hpp:50
bool m_GzipEnabled
If true input file will be decompressed with gzip if filename ends with ".gz".
Definition: blast_args.hpp:165
virtual void SetArgumentDescriptions(CArgDescriptions &arg_desc)
Interface method,.
CNcbiIstream & GetInputStream() const
Get the input stream for a command line application.
unique_ptr< CDecompressIStream > m_DecompressIStream
Definition: blast_args.hpp:156
CNcbiOstream & GetOutputStream() const
Get the output stream for a command line application.
CRef< CTmpFile > m_QueryTmpInputFile
ASN.1 specification of query sequences when read from a saved search strategy.
Definition: blast_args.hpp:161
unique_ptr< CCompressOStream > m_CompressOStream
Definition: blast_args.hpp:157
CNcbiOstream * m_OutputStream
Application's output stream.
Definition: blast_args.hpp:155
virtual void ExtractAlgorithmOptions(const CArgs &cmd_line_args, CBlastOptions &options)
Interface method,.
CNcbiIstream * m_InputStream
Application's input stream.
Definition: blast_args.hpp:154
bool m_SRAaccessionEnabled
If true, option to specify SRA runs will be presented as possible query input.
Definition: blast_args.hpp:169
void SetInputStream(CRef< CTmpFile > input_file)
Set the input stream if read from a saved search strategy.
CNcbiOstream * m_UnalignedOutputStream
Output stream to report unaligned sequences/reads.
Definition: blast_args.hpp:172
unique_ptr< CCompressOStream > m_UnalignedCompressOStream
Definition: blast_args.hpp:173
Simple implementation of ILineReader for i(o)streams.
CStringException –.
Definition: ncbistr.hpp:4508
static unsigned int GetCpuCount(void)
Return number of active CPUs/cores (never less than 1).
const set< string > m_SupportedTasks
Set of supported tasks by this command line argument.
Definition: blast_args.hpp:215
virtual void ExtractAlgorithmOptions(const CArgs &cmd_line_args, CBlastOptions &options)
Interface method,.
Definition: blast_args.cpp:111
CTaskCmdLineArgs(const set< string > &supported_tasks, const string &default_task)
Constructor.
Definition: blast_args.cpp:84
string m_DefaultTask
Default task for this command line argument.
Definition: blast_args.hpp:217
virtual void SetArgumentDescriptions(CArgDescriptions &arg_desc)
Interface method,.
Definition: blast_args.cpp:95
Class to retrieve taxonomic information for filtering BLASTDBs.
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
virtual void ExtractAlgorithmOptions(const CArgs &cmd_line_args, CBlastOptions &options)
Interface method,.
Definition: blast_args.cpp:482
virtual void SetArgumentDescriptions(CArgDescriptions &arg_desc)
Interface method,.
Definition: blast_args.cpp:468
virtual void ExtractAlgorithmOptions(const CArgs &cmd_line_args, CBlastOptions &options)
Interface method,.
Definition: blast_args.cpp:603
virtual void SetArgumentDescriptions(CArgDescriptions &arg_desc)
Interface method,.
Definition: blast_args.cpp:571
BLAST Command line arguments design The idea is to have several small objects (subclasses of IBlastCm...
Definition: blast_args.hpp:84
IRWRegistry –.
Definition: ncbireg.hpp:407
const_iterator end() const
Definition: map.hpp:152
const_iterator find(const key_type &key) const
Definition: map.hpp:153
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
bool empty() const
Definition: set.hpp:133
const_iterator find(const key_type &key) const
Definition: set.hpp:137
const_iterator end() const
Definition: set.hpp:136
const string kArgMatrixName
Argument for scoring matrix.
const string kArgWindowMaskerDatabase
Argument to specify a path to a Window Masker database.
const string kArgGLChainType
Argument to specify the germline database chaintype name for igblast.
const string kArgAsciiPssmOutputFile
Argument to specify the file name for saving the ASCII representation of the PSSM.
const string kArgMaxDbWordCount
Argument to specify a maximum number of times a word can be repeated in a database.
const string kArgGLOrigin
Argument to specify the germline origin for igblast.
const string kDfltArgJDistance
Jaccard default value.
const string kArgPSIPseudocount
Argument to specify the pseudo-count value used when constructing PSSM.
const string kArgNoGreedyExtension
Argument to specify non-greedy dynamic programming extension.
const string kDfltArgApplyFiltering
Default argument to specify filtering.
const string kArgPSIOutputChkPntFile
Argument to specify a 'checkpoint' file to write the PSSM.
const string kArgSplice
Argument to specify whether to search for spliced alignments.
const string kArgMinRawGappedScore
Argument for minimum raw gapped score for preliminary gapped and traceback stages.
const string kArgGLNumAlign
Argument to specify the number of alignments for germline database.
const string kArgLookupStride
Argument to specify the stride when creating a lookup table.
const string kArgDbSize
Effective length of BLAST database.
const string kArgDbGeneticCode
Database genetic code.
const string kArgPSIInclusionEThreshold
Argument to specify the evalue inclusion threshold for considering aligned sequences for PSSM constru...
const string kArgRevFwd
Argument to specify reverse/forward strand specificity.
const string kArgPSIInputChkPntFile
Argument to specify a 'checkpoint' file to recover the PSSM from.
const string kArgScore
Argument to specify cutoff score for accepting a spliced alignment.
const string kArgMaxIntronLength
Argument to specify the maximum length of an intron when linking multiple distinct alignments (applic...
const string kArgTranslate
Argument to specify if Igblast alignment should be translated to protein.
const bool kDfltArgParseDeflines
Default argument to specify whether sequences deflines should be parsed.
const string kArgDMBTemplateLength
Argument to specify the discontinuous megablast template length.
const string kArgOutput
Output file name.
const string kArgClonotypeFile
Argument to specify the clonotype file.
const int kDfltArgCullingLimit
Default argument to specify the culling limit.
const string kArgPercentIdentity
Argument to specify the target percent identity.
const string kArgStrand
Argument to select the query strand(s) to search.
const string kDfltArgCompBasedStatsDelta
const string kArgDMBTemplateType
Argument to specify the discontinuous megablast template type.
const string kArgCandidateSeqs
Number of sequences to attempt BLAST on.
const string kArgOutputSearchStrategy
Argument to specify the file name to save the search strategy used for a BLAST search.
const string kArgDPenalty
Argument to specify mismatch penalty for D gene search.
const string kArgGapExtend
Argument to select the gap extending penalty.
const string kArgRemote
Argument to determine whether searches should be run locally or remotely.
const string kArgQueryLocation
Argument to specify a location to restrict the query sequence(s)
const string kArgDbHardMask
const string kArgDbSoftMask
List of filtering algorithms to apply to subjects as soft masking.
const string kArgOnlyStrandSpecific
Argument to specify only strand specific results.
const int kDfltArgMaxIntronLength
Default value for maximum intron length.
const double kDfltArgBestHitOverhang
Default argument for the overhang parameter to the best hit algorithm.
const string kArgJPenalty
Argument to specify mismatch penalty for J gene search.
const string kArgFilteringDb
Argument to specify a filtering database (i.e.
const string kArgSegFiltering
Argument to specify SEG filtering on query sequence(s)
const string kArgDbType
BLAST database molecule type.
const string kArgTaxIdListFile
Argument to specify file with taxonomy ids for filtering.
const string kArgUnalignedOutput
Argument to output unaligned reads in a separate file.
const string kArgNoTaxIdExpansion
Argument to not to resolve TaxId to descendant.
const string kArgMinJLength
Argument to specify minimal required J gene length.
const string kArgPrintMdTag
Argument to specify printing SAM MD tag.
const string kArgGappedXDropoff
Argument to select the gapped X dropoff value.
const string kArgUseSWTraceback
Argument to specify that Smith-Waterman algorithm should be used to compute locally optimal alignment...
const string kArgIndexName
Megablast database index name.
const string kArgGapOpen
Argument to select the gap opening penalty.
const string kArgDustFiltering
Argument to specify DUST filtering on query sequence(s)
const string kArgSubjectBestHit
Argument to turn on the best hit per subject sequence option.
const string kArgQueryMate
Mates for the query sequences if given in a separate file.
const string kArgFinalGappedXDropoff
Argument to select the final gapped X dropoff value.
const string kArgBestHitOverhang
Argument to specify the overhang parameter to the best hit algorithm.
const string kArgNegativeSeqidList
argument for seqid list to exclude from a BLAST database search
const string kArgEntrezQuery
Entrez query.
const string kArgJDistance
KBLASTP arguments: specifies Jaccard distance (threshold)
const string kArgGLDatabase
Argument to specify the germline database name for igblast.
const string kArgGLFocusV
Argument to specify if Igblast alignment should restrict to V seg.
const string kTask
Task to perform.
const string kArgSraAccessionBatch
Argument to specify a file with a list of SRA accessions.
const string kArgLineLength
Argument to specify line length for displaying alignments.
const string kArgMaxTargetSequences
Argument to specify the maximum number of target sequences to keep (a.k.a.
const string kArgFrameShiftPenalty
Argument to specify the frame shift penalty.
const string kArgUseIndex
Flag to force using or not using megablast database index.
const bool kDfltArgUseIndex
Default value for megablast database index flag.
const string kArgMinDMatch
Argument to specify the Igblast min D gene match.
const string kDfltArgQuery
Default value for query sequence input.
const string kArgRpsDb
Argument to specify domain database name for DELTA-BLAST.
const string kArgQualityFilter
Argument to specify whether quality filtering is to be done.
const string kArgNegativeGiList
argument for gi list to exclude from a BLAST database search
const string kArgInputFormat
Argument to specify input format.
const string kArgLookupTableMaskingOnly
Argument to specify to mask query during lookup table creation.
const string kArgMismatch
Argument to select the nucleotide mismatch penalty.
const string kArgParseDeflines
Argument to specify if the query and subject sequences defline should be parsed.
const string kArgSaveAllPssms
Argument to specify whether to save PSSM after each psiblast iteration.
const string kDfltArgCandidateSeqs
const string kArgIgnoreMsaMaster
Argument to specify whether the template sequence (usually the query) should be ignored for the purpo...
const string kArgEvalue
Argument for expectation value cutoff.
const string kArgFwdRev
Argument to specify forward/reverse strand specificity.
const string kArgOldStyleIndex
Use old style megablast index.
const string kArgMaskLevel
const string kArgIgSeqType
Argument to specify IgBlast sequence type.
const string kArgGLDomainSystem
Argument to specify the Ig domain system.
const string kArgIpgList
IPG list file name to restrict BLAST database.
const string kArgMaxEditDist
Argument to specify a cutoff edit distance for an alignment.
const string kArgEnableSraCache
Argument to enable SRA caching in local files.
const bool kDfltArgUseLCaseMasking
Default argument to specify whether lowercase masking should be used.
const string kArgCullingLimit
Argument to specify the culling limit.
const string kArgGapTrigger
Argument to specify number of bits to initiate gapping.
const string kArgEffSearchSpace
Argument to specify the effective length of the search space.
const string kArgSubjectLocation
Argument to specify a location to restrict the subject sequence(s)
const string kArgOffDiagonalRange
Argument to select the off-diagonal scan range in the 2-hit wordfinder algorithm.
const string kDfltArgStrand
Default value for strand selection.
const string kArgPaired
Argument to specify whether mapped reads are paired.
const string kArgQueryCovHspPerc
Argument to specify min query coverage percentage for each hsp.
const string kDfltArgSegFiltering
Default arguments to apply SEG filtering on query sequence(s)
const string kArgMTMode
Argument to specify mt mode (split by db or split by queries)
const string kArgPSINumIterations
Argument to select the number of iterations to perform in PSI-BLAST.
const string kArgQuery
Query sequence(s)
const string kArgNumClonotype
Argument to specify number of clonotype to show.
const string kArgMinVLength
Argument to specify minimal required V length.
const string kArgNegativeIpgList
argument for IPG list to exclude from a BLAST database search
const string kArgNoUnaligned
Argument to turn off printing of unaligned reads.
const string kArgComplexityAdj
const string kArgMSAInputFile
Argument to specify a multiple sequence alignment file to create a PSSM from.
const string kArgUnalignedFormat
Argument to specify format for reporting unaligned reads.
const string kArgNegativeTaxIdList
Argument to specify negative taxonomy ids filtering.
const string kDfltArgOldStyleIndex
Default value for use old style megablast index.
const string kArgVPenalty
Argument to specify mismatch penalty for V gene search.
const string kDfltArgDustFiltering
Default arguments to apply DUST filtering on query sequence(s)
const string kArgSeqIdList
seqid list file name to restrict BLAST database
const string kDfltArgLookupTableMaskingOnlyProt
Default argument to mask a protein query during lookup table construction.
const unsigned int kDfltArgPSINumIterations
const string kArgRevOnly
Argument to specify reverse-only strand specificity.
const string kArgDb
BLAST database name.
const string kArgOutputGzip
Argument to specify that the output will be compressed with gzip.
const string kArgCustomInternalData
Argument to specify custom internal data file.
const string kArgWindowMaskerTaxId
Argument to specify a taxid for Window Masker.
const string kArgCRegionNumAlign
Argument to specify the number of alignments for c gene db.
const string kArgWindowSize
Argument to select the window size in the 2-hit wordfinder algorithm.
const string kArgRefType
Reference type: genome or transcriptome.
const string kArgWordSize
Argument to select the wordfinder's word size.
<