NCBI C++ ToolKit
makeprofiledb.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: makeprofiledb.cpp 101992 2024-03-15 12:45:07Z fongah2 $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Amelia Fong
27  *
28  */
29 
30 /** @file makeprofiledb.cpp
31  * Command line tool to create RPS,COBALT & DELTA BLAST databases.
32  * This is the successor to formatrpsdb from the C toolkit
33  */
34 
35 #include <ncbi_pch.hpp>
36 #include <corelib/ncbiapp.hpp>
37 #include <corelib/ncbimisc.hpp>
38 #include <corelib/ncbitime.hpp>
39 #include <util/math/matrix.hpp>
40 #include <serial/objistrasn.hpp>
64 #include "../blast/blast_app_util.hpp"
65 
66 #ifndef SKIP_DOXYGEN_PROCESSING
68 USING_SCOPE(blast);
70 #endif /* SKIP_DOXYGEN_PROCESSING */
71 
72 
73 //Input args specific to makeprofiledb
74 static const string kInPssmList("in");
75 static const string kOutDbName("out");
76 static const string kOutDbType("dbtype");
77 static const string kPssmScaleFactor("scale");
78 static const string kOutIndexFile("index");
79 static const string kObsrThreshold("obsr_threshold");
80 static const string kExcludeInvalid("exclude_invalid");
81 static const string kBinaryScoremat("binary");
82 static const string kUseCmdlineThreshold("force");
83 static const string kMaxSmpFilesPerVol("max_smp_vol");
84 
85 static const string kLogFile("logfile");
86 
87 //Supported Output Database Types
88 static const string kOutDbRps = "rps";
89 static const string kOutDbCobalt = "cobalt";
90 static const string kOutDbDelta = "delta";
91 
92 //Supported Matrices
93 static const string kMatrixBLOSUM62 = "BLOSUM62";
94 static const string kMatrixBLOSUM80 = "BLOSUM80";
95 static const string kMatrixBLOSUM50 = "BLOSUM50";
96 static const string kMatrixBLOSUM45 = "BLOSUM45";
97 static const string kMatrixBLOSUM90 = "BLOSUM90";
98 static const string kMatrixPAM250 = "PAM250";
99 static const string kMatrixPAM30 = "PAM30";
100 static const string kMatrixPAM70 = "PAM70";
101 
102 //Default Input Values
103 static const string kDefaultMatrix(kMatrixBLOSUM62);
104 static const string kDefaultOutDbType(kOutDbRps);
105 static const string kDefaultOutIndexFile("true");
106 static const string kDefaultExcludeInvalid("true");
107 #define kDefaultWordScoreThreshold (9.82)
108 #define kDefaultPssmScaleFactor (100.00)
109 #define kDefaultObsrThreshold (6.0)
110 #define kDefaultMaxSmpFilesPerVol (2500)
111 
112 //Fixed-point scale factor for delta blast
113 static const Uint4 kFixedPointScaleFactor = 1000;
114 #define kEpsylon (0.0001)
115 
116 #define DEFAULT_POS_MATRIX_SIZE 2000
117 #define RPS_NUM_LOOKUP_CELLS 32768
118 #if BLASTAA_SIZE == 28
119 #define RPS_DATABASE_VERSION RPS_MAGIC_NUM_28
120 #else
121 #define RPS_DATABASE_VERSION RPS_MAGIC_NUM
122 #endif
123 
124 #define kSingleVol (-1)
125 
127 {
128 public:
131 
132  void Create(int seq_size);
133  void Delete(void);
134 
135  Int4 ** Get(void) { return m_posMatrix;};
136  unsigned int GetSize(void){return m_size;};
137 
138 private:
139 
141  int m_size;
142 };
143 
145 {
146  Delete();
147 
148  m_posMatrix = new Int4* [size];
149 
150  for(int i = 0; i < size; ++ i)
151  {
152  m_posMatrix[i] = new Int4[BLASTAA_SIZE];
153  }
154  m_size = size;
155 
156  return;
157 }
158 
160 {
161  if( NULL == m_posMatrix)
162  return;
163 
164  for(int i = 0; i < m_size; ++ i)
165  {
166  if (m_posMatrix[i] != NULL)
167  delete [] m_posMatrix[i];
168  }
169 
170  delete [] m_posMatrix;
171  m_posMatrix = NULL;
172  return;
173 }
174 
176 {
177 public:
178  /** @inheritDoc */
179  CMakeProfileDBApp(void);
181 private:
182  /** @inheritDoc */
183  virtual void Init();
184  /** @inheritDoc */
185  virtual int Run();
186 
187  enum op_mode
188  {
192  op_invalid
193  };
194 
196  {
197  public:
198  string db_name;
205 
214  string matrix;
216 
217  CRPS_DbInfo(void):
221  { };
223  {
224  if( NULL != query_options) {
226  }
227 
228  if(NULL != lookup) {
230  }
231 
232  if(NULL != lookup_options) {
234  }
235  };
236  };
237 
239  {
242  sm_invalid
243  };
244 
245  enum
246  {
249  eTrue
250  };
251 
253  const string & filename);
254  void x_SetupArgDescriptions(void);
255  void x_InitProgramParameters(void);
256  vector<string> x_GetSMPFilenames(void);
257  void x_InitOutputDb(CRPS_DbInfo & rpsDBInfo);
258  void x_InitRPSDbInfo(CRPS_DbInfo & rpsDBInfo, Int4 vol, Int4 num_files);
259  void x_UpdateRPSDbInfo(CRPS_DbInfo & rpsDbInfo, const CPssmWithParameters & pssm_p);
260  void x_RPSAddFirstSequence(CRPS_DbInfo & rpsDbInfo, CPssmWithParameters & pssm_w_parameters, bool freq_only);
261  void x_RPSUpdateLookup(CRPS_DbInfo & rpsDbInfo, Int4 seq_size);
262  void x_RPSUpdateStatistics(CRPS_DbInfo & rpsDbInfo, CPssmWithParameters & seq, Int4 seq_size);
263  void x_FillInRPSDbParameters(CRPS_DbInfo & rpsDbInfo, CPssmWithParameters & pssm_p);
264  void x_RPSUpdatePSSM(CRPS_DbInfo & rpsDbInfo, const CPssm & pssm, Int4 seq_index, Int4 seq_size);
265  void x_RPS_DbClose(CRPS_DbInfo & rpsDbInfo);
266  void x_UpdateCobalt(CRPS_DbInfo & rpsDbInfo, const CPssmWithParameters & pssm_p, Int4 seq_size);
267  bool x_CheckDelta( const CPssm & pssm, Int4 seq_size, const string & filename);
268  bool x_ValidateCd(const list<double>& freqs, const list<double>& observ, unsigned int alphabet_size);
269  void x_WrapUpDelta(CRPS_DbInfo & rpsDbInfo, CTmpFile & tmp_obsr_file, CTmpFile & tmp_freq_file,
270  list<Int4> & FreqOffsets, list<Int4> & ObsrOffsets, Int4 CurrFreqOffset, Int4 CurrObsrOffset);
271  vector<string> x_CreateDeltaList(void);
272  void x_UpdateFreqRatios(CRPS_DbInfo & rpsDbInfo, const CPssmWithParameters & pssm_p, Int4 seq_index, Int4 seq_size);
273  void x_UpdateDelta(CRPS_DbInfo & rpsDbInfo, vector<string> & smpFilenames);
274  bool x_IsUpdateFreqRatios(const CPssm & p);
275  void x_MakeVol(Int4 vol, vector<string> & smps);
276 
277  int x_Run(void);
278 
279  void x_AddCmdOptions(void);
280  void x_CreateAliasFile(void);
281 
282  // Data
285  string m_Title;
287  string m_OutDbName;
288  string m_OutDbType;
293  string m_Matrix;
298 
302  bool m_Done;
303 
304  //For Delta Blast
307 
310 
311  vector<string> m_VolNames;
314 };
315 
317  : m_LogFile(NULL), m_InPssmList(NULL), m_Title(kEmptyStr),
318  m_WordDefaultScoreThreshold(0), m_OutDbName(kEmptyStr),
319  m_OutDbType(kEmptyStr), m_CreateIndexFile(false),m_GapOpenPenalty(0),
320  m_GapExtPenalty(0), m_PssmScaleFactor(0),m_Matrix(kEmptyStr), m_op_mode(op_invalid),
321  m_binary_scoremat(false), m_MaxSmpFilesPerVol(0), m_NumOfVols(0), m_DbVer(eBDB_Version5),
322  m_Taxids(new CTaxIdSet()), m_UserTaxIds(false), m_Done(false),
323  m_ObsrvThreshold(0), m_ExcludeInvalid(false),
324  m_UpdateFreqRatios(eUndefined), m_UseModelThreshold(true)
325 {
327  version->SetVersionInfo(new CBlastVersion());
329  m_StopWatch.Start();
330  if (m_UsageReport.IsEnabled()) {
332  m_UsageReport.AddParam(CBlastUsageReport::eProgram, (string) "makeprofiledb");
333  }
334 }
335 
337 {
338  // NEED CLEAN UP CODE !!!!
339  if(m_Done == false)
340  {
341  for(unsigned int i =0; i < m_VolNames.size(); i ++)
342  {
343  string rps_str = m_VolNames[i] + ".rps";
344  string lookup_str = m_VolNames[i] + ".loo";
345  string aux_str = m_VolNames[i] + ".aux";
346  string freq_str = m_VolNames[i] + ".freq";
347  CFile(rps_str).Remove();
348  CFile(lookup_str).Remove();
349  CFile(aux_str).Remove();
350  CFile(freq_str).Remove();
351 
352  if(op_cobalt == m_op_mode)
353  {
354  string blocks_str = m_VolNames[i] + ".blocks";
355  CFile(blocks_str).Remove();
356  }
357 
358  if(op_delta == m_op_mode)
359  {
360  string wcounts_str = m_VolNames[i] + ".wcounts";
361  string obsr_str = m_VolNames[i] + ".obsr";
362  CFile(wcounts_str).Remove();
363  CFile(obsr_str).Remove();
364  }
365  }
366  if (m_VolNames.size() > 1) {
367  string pal_str = m_OutDbName + ".pal";
368  CFile(pal_str).Remove();
369  }
370  }
371  else
372  {
373  for(unsigned int i =0; i < m_VolNames.size(); i ++) {
374  string pog_str = m_VolNames[i] + ".pog";
375  CFile(pog_str).Remove();
376  }
377  }
379 }
380 
382 {
384 
385  unique_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
386 
387  // Specify USAGE context
388  arg_desc->SetUsageContext(GetArguments().GetProgramBasename(),
389  "Application to create databases for rpsblast, cobalt and deltablast, version "
390  + CBlastVersion().Print());
391 
392  string dflt("Default = input file name provided to -");
393  dflt += kInPssmList + " argument";
394 
395  arg_desc->SetCurrentGroup("Input options");
396  arg_desc->AddKey(kInPssmList, "in_pssm_list",
397  "Input file that contains a list of smp files (delimited by space, tab or newline)",
399 
400  arg_desc->AddFlag(kBinaryScoremat,
401  "Scoremats are in binary format",
402  true);
403 
404  arg_desc->SetCurrentGroup("Configuration options");
405  arg_desc->AddOptionalKey(kArgDbTitle, "database_title",
406  "Title for database\n" + dflt,
408 
409  arg_desc->AddDefaultKey(kArgWordScoreThreshold, "word_score_threshold",
410  "Minimum word score to add a word to the lookup table",
413  arg_desc->AddFlag(kUseCmdlineThreshold, "Use cmdline threshold", true);
414 
415  arg_desc->SetCurrentGroup("Output options");
416  arg_desc->AddOptionalKey(kOutDbName, "database_name",
417  "Name of database to be created\n" +
419 
420  arg_desc->AddDefaultKey("blastdb_version", "version",
421  "Version of BLAST database to be created",
423  NStr::NumericToString(static_cast<int>(eBDB_Version5)));
424  arg_desc->SetConstraint("blastdb_version",
426 
427  arg_desc->AddDefaultKey(kMaxSmpFilesPerVol, "max_smp_files_per_vol",
428  "Maximum number of SMP files per DB volume",
430 
431  arg_desc->AddDefaultKey(kOutDbType, "output_db_type",
432  "Output database type: cobalt, delta, rps",
434  arg_desc->SetConstraint(kOutDbType, &(*new CArgAllow_Strings, kOutDbRps, kOutDbCobalt , kOutDbDelta ));
435 
436  arg_desc->AddDefaultKey(kOutIndexFile, "create_index_files",
437  "Create Index Files",
439 
440  arg_desc->SetCurrentGroup("Used only if scoremat files do not contain PSSM scores, ignored otherwise.");
441  arg_desc->AddOptionalKey(kArgGapOpen, "gap_open_penalty",
442  "Cost to open a gap",
444 
445  arg_desc->AddOptionalKey(kArgGapExtend, "gap_extend_penalty",
446  "Cost to extend a gap, ",
448 
449  arg_desc->AddDefaultKey(kPssmScaleFactor, "pssm_scale_factor",
450  "Pssm Scale factor ",
453 
454  arg_desc->AddDefaultKey(kArgMatrixName, "matrix_name",
455  "Scoring matrix name",
458  arg_desc->SetConstraint(kArgMatrixName, &(*new CArgAllow_Strings,kMatrixBLOSUM62, kMatrixBLOSUM80,
460 
461  arg_desc->AddFlag("without_freq_ratios", "Build rps db without freq ratios",true);
462 
463  //Delta Blast Options
464  arg_desc->SetCurrentGroup("Delta Blast Options");
465  arg_desc->AddDefaultKey(kObsrThreshold, "observations_threshold", "Exclude domains with "
466  "with maximum number of independent observations "
467  "below this threshold", CArgDescriptions::eDouble,
469 
470  arg_desc->AddDefaultKey(kExcludeInvalid, "exclude_invalid", "Exclude domains that do "
471  "not pass validation test",
473 
474  arg_desc->SetCurrentGroup("Taxonomy options");
475  arg_desc->AddOptionalKey("taxid", "TaxID",
476  "Taxonomy ID to assign to all sequences",
478  arg_desc->SetConstraint("taxid", new CArgAllowValuesGreaterThanOrEqual(0));
479  arg_desc->SetDependency("taxid", CArgDescriptions::eExcludes, "taxid_map");
480 
481  arg_desc->AddOptionalKey("taxid_map", "TaxIDMapFile",
482  "Text file mapping sequence IDs to taxonomy IDs.\n"
483  "Format:<SequenceId> <TaxonomyId><newline>",
485 
486  SetupArgDescriptions(arg_desc.release());
487 }
488 
490 {
491  const CArgs& args = GetArgs();
492 
493  //log_file
494  if (args[kLogFile].HasValue())
495  m_LogFile = &args[kLogFile].AsOutputFile();
496  else
497  m_LogFile = &cout;
498 
499 
500  //in_list
501  if (args[kInPssmList].HasValue())
502  m_InPssmList = &args[kInPssmList].AsInputFile();
503  else
504  NCBI_THROW(CInputException, eInvalidInput, "Please provide an input file with list of smp files");
505 
506  // Binary Scoremat
508 
509  //title
510  if (args[kArgDbTitle].HasValue())
511  m_Title = args[kArgDbTitle].AsString();
512  else
513  m_Title = args[kInPssmList].AsString();
514 
515  //threshold
517 
518  //Out
519  if(args[kOutDbName].HasValue())
520  m_OutDbName = args[kOutDbName].AsString();
521  else
522  m_OutDbName = args[kInPssmList].AsString();
523 
524  //Number of SMP files per db vol
525  m_MaxSmpFilesPerVol = args[kMaxSmpFilesPerVol].AsInteger();
526 
527  //out_db_type
528  m_OutDbType = args[kOutDbType].AsString();
529  if(kOutDbRps == m_OutDbType)
530  m_op_mode = op_rps;
531  else if (kOutDbCobalt == m_OutDbType)
533  else if(kOutDbDelta == m_OutDbType)
535  else
536  NCBI_THROW(CInputException, eInvalidInput, "Invalid Output database type");
537 
538  m_CreateIndexFile = args[kOutIndexFile].AsBoolean();
539 
540  int default_gap_open = 0;
541  int default_gap_extend = 0;
542  //matrix
543  m_Matrix = args[kArgMatrixName].AsString();
544  BLAST_GetProteinGapExistenceExtendParams(m_Matrix.c_str(), &default_gap_open, &default_gap_extend);
545 
546  //gapopen
547  if(args[kArgGapOpen].HasValue())
548  m_GapOpenPenalty = args[kArgGapOpen].AsInteger();
549  else
550  m_GapOpenPenalty = default_gap_open;
551 
552  //gapextend
553  if(args[kArgGapExtend].HasValue())
554  m_GapExtPenalty = args[kArgGapExtend].AsInteger();
555  else
556  m_GapExtPenalty = default_gap_extend;
557 
558  if(args.Exist("without_freq_ratios")) {
559  if (m_op_mode == op_rps) {
560  m_UpdateFreqRatios = !args["without_freq_ratios"].AsBoolean();
561  }
562  else {
563  if (args["without_freq_ratios"].AsBoolean()) {
564  NCBI_THROW(CInputException, eInvalidInput, "without_freq_ratios can only be used for rps db");
565  }
566  }
567  }
568  //pssm scale factor
569  m_PssmScaleFactor = args[kPssmScaleFactor].AsDouble();
570 
571  //matrix
572  m_Matrix = args[kArgMatrixName].AsString();
573 
574  //Delta Blast Parameters
575  m_ObsrvThreshold = args[kObsrThreshold].AsDouble();
576  m_ExcludeInvalid = args[kExcludeInvalid].AsBoolean();
577 
578  if (args[kUseCmdlineThreshold]){
579  m_UseModelThreshold = false;
580  }
581  m_DbVer = static_cast<EBlastDbVersion>(args["blastdb_version"].AsInteger());
582 
583  if (args["taxid"].HasValue()) {
584  _ASSERT( !args["taxid_map"].HasValue() );
585  m_Taxids.Reset(new CTaxIdSet(TAX_ID_FROM(int, args["taxid"].AsInteger())));
586  m_UserTaxIds = true;
587  } else if (args["taxid_map"].HasValue()) {
588  _ASSERT( !args["taxid"].HasValue() );
589  _ASSERT( !m_Taxids.Empty() );
590  m_Taxids->SetMappingFromFile(args["taxid_map"].AsInputFile());
591  m_UserTaxIds = true;
592  }
593 }
594 
596 {
597  vector<string> filenames;
598 
599  while(!m_InPssmList->eof())
600  {
601  string line;
602  vector<string> tmp;
605 
606  if(tmp.size() > 0)
607  filenames.insert(filenames.end(), tmp.begin(), tmp.end() );
608  }
609 
610  if( 0 == filenames.size())
611  NCBI_THROW(CInputException, eInvalidInput, "Input file contains no smp filnames");
612 
613  return filenames;
614 }
615 
618  const string & filename)
619 {
621 
622  if(pssm_w_parameters.IsSetPssm())
623  {
624  const CPssm & pssm = pssm_w_parameters.GetPssm();
625 
626  if(!pssm.IsSetQuery() || (0 == pssm.GetQueryLength()))
627  {
628  string err = filename + " contains no bioseq data";
629  NCBI_THROW(CInputException, eInvalidInput, err);
630  }
631 
632  if(!pssm.IsSetNumRows() || !pssm.IsSetNumColumns())
633  {
634  string err = filename + " contains no info on num of columns or num of rows";
635  NCBI_THROW(CInputException, eInvalidInput, err);
636  }
637 
638  if((int) (pssm.GetQueryLength()) != pssm.GetNumColumns())
639  {
640  string err = filename + " 's num of columns does not match size of sequence";
641  NCBI_THROW(CInputException, eInvalidInput, err);
642  }
643 
644  int num_rows = pssm.GetNumRows();
645  if( num_rows <= 0 || num_rows > BLASTAA_SIZE )
646  {
647  string err = filename + " has invalid alphabet size";
648  NCBI_THROW(CInputException, eInvalidInput, err);
649  }
650 
651  // First time around
653  {
655  }
656 
658  {
659  string err = filename + " contains no frequence ratios.\n" +
660  "You can use the -without_freq_ratios option to build the database without frequency ratios.\n" +
661  "However composition based statistics will have to be disabled for RPSBLAST searches against\n" +
662  "this database (not recommended).";
663  NCBI_THROW(CInputException, eInvalidInput, err);
664  }
665 
666  if(op_cobalt == m_op_mode)
667  {
668  if(!pssm_w_parameters.IsSetParams() || !pssm_w_parameters.GetParams().IsSetConstraints() ||
669  ! pssm_w_parameters.GetParams().GetConstraints().IsSetBlocks())
670  {
671  string err = filename + " contains no core block to build cobalt database";
672  NCBI_THROW(CInputException, eInvalidInput, err);
673  }
674  }
675 
676  if(pssm.IsSetFinalData())
677  {
678  sm = sm_valid_has_pssm;
679  }
680  else if(pssm.IsSetIntermediateData())
681  {
683  {
684  sm = sm_valid_freq_only;
685  }
686  }
687 
688  if(sm_invalid == sm)
689  {
690  string err = filename + " contains no pssm or residue frequencies";
691  NCBI_THROW(CInputException, eInvalidInput, err);
692  }
693  }
694  else
695  {
696  string err = filename + " contains no scoremat";
697  NCBI_THROW(CInputException, eInvalidInput, err);
698  }
699 
700  return sm;
701 }
702 
704 {
705  if(op_delta == m_op_mode) {
706  return eFalse;
707  }
708 
709  return eTrue;
710 }
711 
713 {
715  rpsDbInfo.output_db.Reset(new CWriteDB(rpsDbInfo.db_name, CWriteDB::eProtein, m_Title, index_type, m_CreateIndexFile, false, false, m_DbVer));
716  rpsDbInfo.output_db->SetMaxFileSize(4000000000);
717  return;
718 }
719 
// Removes the makeprofiledb-specific files that belong to database `name`.
// For every extension in mp_ext it first tries to delete "<name><ext>"; when
// that file cannot be removed it deletes numbered volume files
// "<name>.<index/10><index%10><ext>" starting at index 0 and stops at the
// first one that cannot be removed.
// Returns isRemoved.
// NOTE(review): the embedded listing numbers jump from 741 to 743, so one
// source line directly before `isRemoved = true;` is not visible here. It may
// be a condition guarding that assignment -- confirm against the repository
// before relying on the return value.
720 static bool s_DeleteMakeprofileDb(const string & name )
721 {
722  bool isRemoved = false;
// NULL-terminated list of the extensions handled by this function.
723  static const char * mp_ext[]={".rps", ".loo", ".aux", ".freq", ".blocks", ".wcounts", ".obsr", NULL};
724  for(const char ** mp=mp_ext; *mp != NULL; mp++) {
725  CNcbiOstrstream oss;
726  oss << name << *mp;
727  const string fname = CNcbiOstrstreamToString(oss);
// Single-file case: only this deletion is reported through LOG_POST.
728  if (CFile(fname).Remove()) {
729  LOG_POST(Info << "Deleted " << fname);
730  }
731  else {
// Volume case: the index is rendered as its tens part followed by its
// units digit, e.g. index 3 gives ".03".
732  unsigned int index = 0;
733  string vfname = name + "." + NStr::IntToString(index/10) +
734  NStr::IntToString(index%10) + *mp;
735  while (CFile(vfname).Remove()) {
736  index++;
737  vfname = name + "." + NStr::IntToString(index/10) +
738  NStr::IntToString(index%10) + *mp;
739  }
740  }
741  }
743  isRemoved = true;
744 
745  return isRemoved;
746 }
747 
748 
749 void CMakeProfileDBApp::x_InitRPSDbInfo(CRPS_DbInfo & rpsDbInfo, Int4 vol, Int4 num_files)
750 {
751 
752  rpsDbInfo.num_seqs = num_files;
753  if(vol == kSingleVol) {
754  rpsDbInfo.db_name = m_OutDbName;
755  }
756  else if (vol >= 0) {
758  }
759  else {
760  NCBI_THROW(CBlastException, eCoreBlastError,"Invalid vol number");
761  }
762 
763  string rps_str = rpsDbInfo.db_name + ".rps";
764  rpsDbInfo.pssm_file.open(rps_str.c_str(), IOS_BASE::out|IOS_BASE::binary);
765  if (!rpsDbInfo.pssm_file.is_open())
766  NCBI_THROW(CSeqDBException, eFileErr,"Failed to open output .rps file ");
767 
768  string lookup_str = rpsDbInfo.db_name + ".loo";
769  rpsDbInfo.lookup_file.open(lookup_str.c_str(), IOS_BASE::out|IOS_BASE::binary);
770  if (!rpsDbInfo.lookup_file.is_open())
771  NCBI_THROW(CSeqDBException, eFileErr,"Failed to open output .loo file");
772 
773  string aux_str = rpsDbInfo.db_name + ".aux";
774  rpsDbInfo.aux_file.open(aux_str.c_str());
775  if (!rpsDbInfo.aux_file.is_open())
776  NCBI_THROW(CSeqDBException, eFileErr,"Failed to open output .aux file");
777 
778  string freq_str = rpsDbInfo.db_name + ".freq";
779  rpsDbInfo.freq_file.open(freq_str.c_str(), IOS_BASE::out|IOS_BASE::binary);
780  if (!rpsDbInfo.freq_file.is_open())
781  NCBI_THROW(CSeqDBException, eFileErr,"Failed to open output .freq file");
782 
783  /* Write the magic numbers to the PSSM file */
784 
786  rpsDbInfo.pssm_file.write ((char *)&version , sizeof(Int4));
787  rpsDbInfo.freq_file.write ((char *)&version , sizeof(Int4));
788 
789  /* Fill in space for the sequence offsets. The PSSM
790  data gets written after this list of integers. Also
791  write the number of sequences to the PSSM file */
792 
793  rpsDbInfo.pssm_file.write((char *) &num_files, sizeof(Int4));
794  rpsDbInfo.freq_file.write((char *) &num_files, sizeof(Int4));
795  for (Int4 i = 0; i <= num_files; i++)
796  {
797  rpsDbInfo.pssm_file.write((char *)&i, sizeof(Int4));
798  rpsDbInfo.freq_file.write((char *)&i, sizeof(Int4));
799  }
800 
801  if(op_cobalt == m_op_mode)
802  {
803  string blocks_str = rpsDbInfo.db_name + ".blocks";
804  rpsDbInfo.blocks_file.open(blocks_str.c_str());
805  if (!rpsDbInfo.blocks_file.is_open())
806  NCBI_THROW(CSeqDBException, eFileErr,"Failed to open output .blocks file");
807  }
808 
809 
810  rpsDbInfo.curr_seq_offset = 0;
811  //Init them to input arg values first , may change after reading in the first sequence
812  rpsDbInfo.gap_extend = m_GapExtPenalty;
813  rpsDbInfo.gap_open = m_GapOpenPenalty;
814  rpsDbInfo.matrix = m_Matrix;
815  rpsDbInfo.scale_factor = (Int4) ceil(m_PssmScaleFactor);
816 
817  return;
818  }
819 
820 //For first sequence only
822 {
823  if(pssm_p.IsSetParams())
824  {
825  if(pssm_p.GetParams().IsSetRpsdbparams())
826  {
827  const CFormatRpsDbParameters & rps_db_params = pssm_p.GetParams().GetRpsdbparams();
828  if(rps_db_params.IsSetGapExtend())
829  rpsDbInfo.gap_extend = rps_db_params.GetGapExtend();
830 
831  if(rps_db_params.IsSetGapOpen())
832  rpsDbInfo.gap_open = rps_db_params.GetGapOpen();
833 
834  if(rps_db_params.IsSetMatrixName())
835  rpsDbInfo.matrix = rps_db_params.GetMatrixName();
836  }
837  }
838  return;
839 }
840 
842 {
843  if(!pssm_p.IsSetParams())
844  pssm_p.SetParams();
845 
846  if(!pssm_p.GetParams().IsSetRpsdbparams())
847  pssm_p.SetParams().SetRpsdbparams();
848 
849  CFormatRpsDbParameters & rps_params= pssm_p.SetParams().SetRpsdbparams();
850  if(!rps_params.IsSetGapExtend())
851  rps_params.SetGapExtend(rpsDbInfo.gap_extend);
852  else if(rps_params.GetGapExtend() != rpsDbInfo.gap_extend)
853  NCBI_THROW(CBlastException, eCoreBlastError, "Gap extend penalties do not match");
854 
855  if(!rps_params.IsSetGapOpen())
856  rps_params.SetGapOpen(rpsDbInfo.gap_open);
857  else if(rps_params.GetGapOpen() != rpsDbInfo.gap_open)
858  NCBI_THROW(CBlastException, eCoreBlastError, "Gap open penalties do not match");
859 
860  if(!rps_params.IsSetMatrixName())
861  rps_params.SetMatrixName (rpsDbInfo.matrix);
862  else if(rps_params.GetMatrixName()!= rpsDbInfo.matrix)
863  NCBI_THROW(CBlastException, eCoreBlastError, "Score matrix does not match");
864 
865  return;
866 }
867 
868 /* Update the input scoremat with a new PSSM and modified
869  statistics. Scoremat must contain only residue frequencies.
870  Note that upon completion the new PSSM will always have
871  columns of length BLASTAA_SIZE
872  seq is the sequence and set of score frequencies read in
873  from the next data file
874  seq_size is the number of letters in this sequence
875  alphabet_size refers to the number of PSSM rows
876  ScalingFactor is the multiplier for all PSSM scores
877 */
879 {
880 
881  CPssm & pssm = seq.SetPssm();
882  const CPssmParameters & params = seq.GetParams();
883  string matrix_name = params.GetRpsdbparams().GetMatrixName();
884 
885  /* Read in the sequence residues from the scoremat structure. */
886  CNCBIstdaa query_stdaa;
887  pssm.GetQuerySequenceData(query_stdaa);
888 
889  vector <char> query_v = query_stdaa.Get();
890 
891  if((Int4) (query_v.size()) != seq_size)
892  NCBI_THROW(CBlastException, eCoreBlastError, "Query sequence lengths mismatch");
893 
894  /* allocate query array and PSSM row array */
895  AutoArray<Uint1> query(seq_size);
896 
897  for(unsigned int i = 0; i < query_v.size(); i++)
898  query[i] = query_v[i];
899 
900  unique_ptr<CNcbiMatrix <double> > freq_list (CScorematPssmConverter::GetFreqRatios(seq));
901 
902  CPsiBlastInputFreqRatios pssm_freq_ratio(query.get(), seq_size, *freq_list,
903  matrix_name.c_str(), rpsDbInfo.gap_open,
904  rpsDbInfo.gap_extend, rpsDbInfo.scale_factor);
905  CPssmEngine pssm_engine(&pssm_freq_ratio);
906  CRef<CPssmWithParameters> out_par(pssm_engine.Run());
907 
908  CPssmFinalData & i = pssm.SetFinalData();
909  const CPssmFinalData & o = out_par->GetPssm().GetFinalData();
910  i.SetScores() = o.GetScores();
911  i.SetLambda() = o.GetLambda();
912  i.SetKappa() = o.GetKappa();
913  i.SetH() = o.GetH();
914  i.SetScalingFactor(rpsDbInfo.scale_factor);
915 
916  return;
917 }
918 
919  /* The first sequence in the list determines several
920  parameters that all other sequences in the list must
921  have. In this case, extra initialization is required
922 
923  info contains all the information on data files
924  and parameters from previously added sequences
925  seq is the sequence and PSSM read in from the next data file
926  seq_index refers to the (0-based) position of this sequence
927  in the complete list of sequences
928  seq_size is the number of letters in this sequence
929  alphabet_size refers to the number of PSSM rows
930  */
931  void CMakeProfileDBApp::x_RPSAddFirstSequence(CRPS_DbInfo & rpsDbInfo, CPssmWithParameters & pssm_w_parameters, bool freq_only )
932  {
933  x_UpdateRPSDbInfo(rpsDbInfo, pssm_w_parameters);
934 
935  x_FillInRPSDbParameters(rpsDbInfo, pssm_w_parameters);
936  double wordScoreThreshold = m_WordDefaultScoreThreshold;
937 
938  if(!freq_only)
939  {
940  if(pssm_w_parameters.GetPssm().GetFinalData().IsSetScalingFactor())
941  {
942  rpsDbInfo.scale_factor = pssm_w_parameters.GetPssm().GetFinalData().GetScalingFactor();
943  }
944  else
945  {
946  // asn1 default value is 1
947  rpsDbInfo.scale_factor = 1.0;
948  }
949  if(m_UseModelThreshold && pssm_w_parameters.GetPssm().GetFinalData().IsSetWordScoreThreshold())
950  {
951  wordScoreThreshold = pssm_w_parameters.GetPssm().GetFinalData().GetWordScoreThreshold();
952  }
953  }
954  else
955  {
956  x_RPSUpdateStatistics(rpsDbInfo, pssm_w_parameters, pssm_w_parameters.GetPssm().GetQueryLength());
957  }
958 
959  /* scale up the threshold value and convert to integer */
960  double threshold = rpsDbInfo.scale_factor * wordScoreThreshold;
961 
962  /* create BLAST lookup table */
963  if (LookupTableOptionsNew(eBlastTypeBlastp, &(rpsDbInfo.lookup_options)) != 0)
964  NCBI_THROW(CBlastException, eCoreBlastError, "Cannot create lookup options");
965 
967  FALSE, /* no megablast */
968  threshold, /* neighboring threshold */
969  BLAST_WORDSIZE_PROT ) != 0)
970  NCBI_THROW(CBlastException, eCoreBlastError, "Cannot set lookup table options");
971 
972  if (BlastAaLookupTableNew(rpsDbInfo.lookup_options, &(rpsDbInfo.lookup)) != 0)
973  NCBI_THROW(CBlastException, eCoreBlastError, "Cannot allocate lookup table");
974 
975  rpsDbInfo.lookup->use_pssm = TRUE; /* manually turn on use of PSSMs */
976 
977  /* Perform generic query setup */
978 
979  if (BlastQuerySetUpOptionsNew(&(rpsDbInfo.query_options)) != 0)
980  NCBI_THROW(CBlastException, eCoreBlastError, "Generic query setup failed");
981 
983  NULL, /* no filtering */
984  0 /* strand not applicable */ ) != 0)
985  NCBI_THROW(CBlastException, eCoreBlastError, "Cannot fill query options");
986 
987  /* Write the header of the RPS .aux file */
988  rpsDbInfo.aux_file << rpsDbInfo.matrix << "\n";
989  rpsDbInfo.aux_file << rpsDbInfo.gap_open << "\n";
990  rpsDbInfo.aux_file << rpsDbInfo.gap_extend << "\n";
991  rpsDbInfo.aux_file << scientific << 0.0 << "\n";
992  rpsDbInfo.aux_file << scientific << 0.0 << "\n";
993  rpsDbInfo.aux_file << (int) 0 << "\n";
994  rpsDbInfo.aux_file << (int) 0 << "\n";
995  rpsDbInfo.aux_file << fixed << (double) rpsDbInfo.scale_factor << "\n";
996 
997  return;
998  }
999 
1000  void CMakeProfileDBApp::x_UpdateCobalt(CRPS_DbInfo & rpsDbInfo, const CPssmWithParameters & pssm_p, Int4 seq_size)
1001  {
1002  const CPssm & pssm = pssm_p.GetPssm();
1003  // Update .blocks file
1004  const list<CRef<CCoreBlock> > & block_list = pssm_p.GetParams().GetConstraints().GetBlocks();
1005 
1006  list<CRef<CCoreBlock> >::const_iterator itr = block_list.begin();
1007 
1008  int count =0;
1009 
1010  while(itr != block_list.end())
1011  {
1012  const CCoreBlock & block = (**itr);
1013  if(!block.IsSetStart() || !block.IsSetStop())
1014  NCBI_THROW(CInputException, eInvalidInput, "No start Or stop found in conserved block");
1015 
1016  string seq_id_str = "id" + NStr::IntToString(count);
1017  if(pssm.IsSetQuery())
1018  {
1019  if(pssm.GetQuery().IsSeq())
1020  {
1021  if(pssm.GetQuery().GetSeq().IsSetDescr())
1022  {
1023  const list<CRef<CSeqdesc> > descr_list= pssm.GetQuery().GetSeq().GetDescr();
1024  if(descr_list.size() > 0)
1025  {
1026  const CRef<CSeqdesc> descr = descr_list.front();
1027  if(descr->IsTitle())
1028  {
1029  string title = descr->GetTitle();
1030  string accession;
1031  string tmp;
1032  if(NStr::SplitInTwo(title, ",", accession, tmp))
1033  seq_id_str = accession;
1034  }
1035  }
1036  }
1037  }
1038  }
1039 
1040  rpsDbInfo.blocks_file << seq_id_str << "\t";
1041  rpsDbInfo.blocks_file << count << "\t";
1042  rpsDbInfo.blocks_file << block.GetStart() << "\t";
1043  rpsDbInfo.blocks_file << block.GetStop() << "\n";
1044  count++;
1045  ++itr;
1046  }
1047  return;
1048  }
/*
 * Append the scaled frequency ratios of one profile to the .freq file.
 *
 * Returns immediately when m_UpdateFreqRatios is false. Otherwise seq_size
 * rows of BLASTAA_SIZE Int4 values are written at the end of
 * rpsDbInfo.freq_file (each ratio is multiplied by FREQ_RATIO_SCALE and
 * rounded with BLAST_Nint), followed by one all-zero row. Finally
 * rpsDbInfo.curr_seq_offset is stored in the file's offset table at byte
 * position 8 + seq_index * sizeof(Int4).
 *
 * NOTE(review): the buffer `row` is used below but its declaration is not
 * visible in this listing (the embedded numbering skips 1058); presumably
 * it is an Int4 array of BLASTAA_SIZE elements -- confirm in the real file.
 */
1049 void CMakeProfileDBApp::x_UpdateFreqRatios(CRPS_DbInfo & rpsDbInfo, const CPssmWithParameters & pssm_p, Int4 seq_index, Int4 seq_size)
1050  {
1051  if (!m_UpdateFreqRatios)
1052  return;
1053 
1054  const CPssm & pssm = pssm_p.GetPssm();
1055  // Update .freq file
1056  Int4 i = 0;
1057  Int4 j = 0;
1059  Int4 alphabet_size = pssm.GetNumRows();
1060 
1061  const list<double> & freq_ratios = pssm.GetIntermediateData().GetFreqRatios();
1062  list<double>::const_iterator itr_fr = freq_ratios.begin();
 // New rows are always appended after whatever the file already holds.
1063  rpsDbInfo.freq_file.seekp(0, ios_base::end);
1064 
 // GetByRow() == FALSE: the ratio list is consumed sequentially,
 // alphabet_size values per sequence position.
1065  if (pssm.GetByRow() == FALSE) {
1066  for (i = 0; i < seq_size; i++) {
1067  for (j = 0; j < alphabet_size; j++) {
 // If the list runs out, the rest of this row (and every later row)
 // is left to the zero fill below.
1068  if (itr_fr == freq_ratios.end())
1069  break;
1070  row[j] = (Int4) BLAST_Nint(*itr_fr * FREQ_RATIO_SCALE);
1071  ++itr_fr;
1072  }
 // Zero-pad the row up to BLASTAA_SIZE entries.
1073  for ( ;j < BLASTAA_SIZE; j++) {
1074  row[j] = 0;
1075  }
1076  rpsDbInfo.freq_file.write((const char *)row, sizeof(Int4)*BLASTAA_SIZE);
1077  }
1078  }
1079  else {
 // Otherwise the ratios are taken from the matrix returned by
 // CScorematPssmConverter::GetFreqRatios(), read here as
 // (position i, residue j) for all BLASTAA_SIZE residues.
1080  unique_ptr<CNcbiMatrix<double> > matrix (CScorematPssmConverter::GetFreqRatios(pssm_p));
1081 
1082  for (i = 0; i < seq_size; i++) {
1083  for (j = 0; j < BLASTAA_SIZE; j++) {
1084  row[j] = (Int4) BLAST_Nint((*matrix)(i,j ) * FREQ_RATIO_SCALE);
1085  }
1086  rpsDbInfo.freq_file.write((const char *)row, sizeof(Int4)*BLASTAA_SIZE);
1087  }
1088  }
1089 
 // One extra all-zero row is written after the profile's own rows.
1090  memset(row, 0, sizeof(row));
1091  rpsDbInfo.freq_file.write((const char *)row, sizeof(Int4)*BLASTAA_SIZE);
1092 
 // Record the current sequence offset in this profile's slot of the
 // offset table near the start of the file.
1093  rpsDbInfo.freq_file.seekp( 8 + (seq_index) * sizeof(Int4), ios_base::beg);
1094  rpsDbInfo.freq_file.write((const char *) &rpsDbInfo.curr_seq_offset, sizeof(Int4));
1095  return;
1096  }
1097 
1098  /* Incrementally update the BLAST lookup table with
1099  words derived from the present sequence
1100  info contains all the information on data files
1101  and parameters from previously added sequences
1102  seq is the sequence and PSSM read in from the next data file
1103  seq_size is the number of letters in this sequence
1104  */
1106  {
 // NOTE(review): the function header line is not visible in this listing
 // (the embedded numbering skips 1105). The body uses `rpsDbInfo` and
 // `seq_size`; presumably this is x_RPSUpdateLookup(CRPS_DbInfo &, Int4).
1107  BlastSeqLoc *lookup_segment = NULL;
1108 
1109  /* Tell the blast engine to index the entire input
1110  sequence. Since only the PSSM matters for lookup
1111  table creation, the process does not require
1112  actually extracting the sequence data from 'seq'*/
1113 
 // The segment spans positions 0 .. seq_size - 1.
1114  BlastSeqLocNew(&lookup_segment, 0, seq_size - 1);
1115 
1116  /* add this sequence to the lookup table. NULL
1117  is passed in place of the query */
1118 
1119  Int4 ** posMatrix = rpsDbInfo.pos_matrix.Get();
 // NOTE(review): this throw happens after BlastSeqLocNew() and before
 // BlastSeqLocFree(), so lookup_segment looks like it is leaked on this
 // path -- confirm and consider checking the matrix first.
1120  if (NULL == posMatrix)
1121  NCBI_THROW(CBlastException, eCoreBlastError, "Empty pos matrix");
1122 
 // curr_seq_offset is passed as the last argument (query_bias).
1123  BlastAaLookupIndexQuery(rpsDbInfo.lookup, posMatrix,
1124  NULL, lookup_segment, rpsDbInfo.curr_seq_offset);
1125 
1126  BlastSeqLocFree(lookup_segment);
1127  return;
1128  }
1129 
1130  /* Incrementally update the RPS PSSM file with the
1131  PSSM for the next input sequence
1132  info contains all the information on data files
1133  and parameters from previously added sequences
1134  seq is the sequence and PSSM read in from the next data file
1135  seq_index refers to the (0-based) position of this sequence
1136  in the complete list of seqences
1137  seq_size is the number of letters in this sequence
1138  alphabet_size refers to the number of PSSM rows
1139  */
1140 void CMakeProfileDBApp::x_RPSUpdatePSSM(CRPS_DbInfo & rpsDbInfo, const CPssm & pssm, Int4 seq_index, Int4 seq_size)
1141 {
1142  Int4 i = 0;
1143  Int4 j = 0;
1144 
1145  /* Note that RPS blast requires an extra column at
1146  * the end of the PSSM */
1147 
1148  list<int>::const_iterator score_list_itr = pssm.GetFinalData().GetScores().begin();
1149  list<int>::const_iterator score_list_end = pssm.GetFinalData().GetScores().end();
1150  Int4 alphabet_size = pssm.GetNumRows();
1151 
1152  rpsDbInfo.pos_matrix.Create(seq_size + 1);
1153  Int4 ** posMatrix = rpsDbInfo.pos_matrix.Get();
1154  if (pssm.GetByRow() == FALSE) {
1155  for (i = 0; i < seq_size; i++) {
1156  for (j = 0; j < alphabet_size; j++) {
1157  if (score_list_itr == score_list_end)
1158  break;
1159  posMatrix[i][j] = *score_list_itr;
1160  score_list_itr++;
1161  }
1162  if (j < alphabet_size)
1163  break;
1164  for (; j < BLASTAA_SIZE; j++) {
1165  posMatrix[i][j] = INT2_MIN;
1166  }
1167  }
1168  }
1169  else {
1170  for (j = 0; j < alphabet_size; j++) {
1171  for (i = 0; i < seq_size; i++) {
1172  if (score_list_itr == score_list_end)
1173  break;
1174  posMatrix[i][j] = *score_list_itr;
1175  score_list_itr++;
1176  }
1177  if (i < seq_size)
1178  break;
1179  }
1180  if (j == alphabet_size) {
1181  for (; j < BLASTAA_SIZE; j++) {
1182  for (i = 0; i < seq_size; i++) {
1183  posMatrix[i][j] = INT2_MIN;
1184  }
1185  }
1186  }
1187  }
1188 
1189  if (i < seq_size || j < alphabet_size)
1190  NCBI_THROW(CBlastException, eCoreBlastError, "PSSM was truncated early");
1191 
1192  if(score_list_itr != score_list_end)
1193  NCBI_THROW(CBlastException, eCoreBlastError, "PSSM too large for this sequence");
1194 
1195  /* manually fill in the extra (last) column of the PSSM.
1196  Note that the value to use should more appropriately
1197  be BLAST_SCORE_MIN, but we instead follow the convention
1198  used in copymat */
1199 
1200  for (i = 0; i < BLASTAA_SIZE; i++)
1201  posMatrix[seq_size][i] = -BLAST_SCORE_MAX;
1202 
1203  /* Dump the score matrix, column by column */
1204  rpsDbInfo.pssm_file.seekp(0, ios_base::end);
1205  for (i = 0; i < seq_size + 1; i++) {
1206  rpsDbInfo.pssm_file.write((const char *) posMatrix[i], sizeof(Int4)*BLASTAA_SIZE);
1207  }
1208  /* Write the next context offset. Note that the
1209  RPSProfileHeader structure is one int too large for
1210  our purposes, so that the index of this sequence
1211  must be decremented to get the right byte offset
1212  into the file */
1213 
1214  rpsDbInfo.pssm_file.seekp( 8 + (seq_index) * sizeof(Int4), ios_base::beg);
1215  rpsDbInfo.pssm_file.write((const char *) &rpsDbInfo.curr_seq_offset, sizeof(Int4));
1216 
1217  return;
1218  }
1219 
1220 /* Once all sequences have been processed, perform
1221  final setup on the BLAST lookup table and finish
1222  up the RPS files */
1223 
1225 {
 // NOTE(review): the function header line is not visible in this listing
 // (the embedded numbering skips 1224); the body is called as
 // x_RPS_DbClose(rpsDbInfo) elsewhere in the file.
 //
 // Steps performed below: write the final offset into the .pssm and .freq
 // offset tables, finalize the lookup table and convert it in place to the
 // legacy RPS layout, write the lookup file, then free the lookup table and
 // flush/close the output streams.
1226  /* Write the last context offset to the PSSM file.
1227  This is the total number of letters for all RPS
1228  DB sequences combined */
1229 
1230  rpsDbInfo.pssm_file.seekp(8 + (rpsDbInfo.num_seqs) * sizeof(Int4), ios::beg);
1231  rpsDbInfo.pssm_file.write((const char *) &rpsDbInfo.curr_seq_offset, sizeof(Int4));
1232  rpsDbInfo.freq_file.seekp(8 + (rpsDbInfo.num_seqs) * sizeof(Int4), ios::beg);
1233  rpsDbInfo.freq_file.write((const char *) &rpsDbInfo.curr_seq_offset, sizeof(Int4));
1234 
1235  /* Pack the lookup table into its compressed form */
 // A NULL lookup table is reported as "Empty database".
1236  if(NULL == rpsDbInfo.lookup)
1237  NCBI_THROW(CBlastException, eCoreBlastError, "Empty database");
1238 
1239  if (BlastAaLookupFinalize(rpsDbInfo.lookup, eBackbone) != 0) {
1240  NCBI_THROW(CBlastException, eCoreBlastError, "Failed to compress lookup table");
1241  }
1242  else {
1243  /* Change the lookup table format to match that
1244  of the legacy BLAST lookup table */
1245 
1246  BlastRPSLookupFileHeader header;
1247  BlastAaLookupTable *lut = rpsDbInfo.lookup;
1248  Int4 i, index;
1249  Int4 cursor, old_cursor;
1250  AaLookupBackboneCell *cell;
1251  RPSBackboneCell empty_cell;
1252 
1253  memset(&header, 0, sizeof(header));
 // NOTE(review): listing line 1254 is not visible here; presumably it
 // sets a header field -- confirm in the real file.
1255 
1256  /* for each lookup table cell */
1257 
 // `cursor` counts the Int4 entries kept in the compacted overflow array.
1258  for (index = cursor = 0; index < lut->backbone_size; index++) {
1259  cell = (AaLookupBackboneCell*)lut->thick_backbone + index;
1260 
1261 
1262  if (cell->num_used == 0)
1263  continue;
1264 
1265  /* The cell contains hits */
1266 
1267  if (cell->num_used <= RPS_HITS_PER_CELL) {
1268  /* if 3 hits or less, just update each hit offset
1269  to point to the end of the word rather than
1270  the beginning */
1271 
1272  for (i = 0; i < cell->num_used; i++)
1273  cell->payload.entries[i] += BLAST_WORDSIZE_PROT - 1;
1274  }
1275  else {
1276  /* if more than 3 hits, pack the first hit into the
1277  lookup table cell, pack the overflow array byte
1278  offset into the cell, and compress the resulting
1279  'hole' in the overflow array. Update the hit
1280  offsets as well */
1281 
1282  old_cursor = cell->payload.overflow_cursor;
1283  cell->payload.entries[0] = ((Int4*)lut->overflow)[old_cursor] +
1284  BLAST_WORDSIZE_PROT - 1;
 // entries[1] holds a byte offset, hence the multiplication by sizeof(Int4).
1285  cell->payload.entries[1] = cursor * sizeof(Int4);
 // Hits 1 .. num_used-1 are copied down to position `cursor`; the first
 // hit was moved into the cell above.
1286  for (i = 1; i < cell->num_used; i++, cursor++) {
1287  ((Int4*)lut->overflow)[cursor]
1288  = ((Int4*)lut->overflow)[old_cursor + i] +
1289  BLAST_WORDSIZE_PROT - 1;
1290  }
1291  }
1292  }
1293 
 // File layout: header, RPS_NUM_LOOKUP_CELLS + 1 backbone cells, then the
 // `cursor` Int4 entries of the compacted overflow array.
1294  header.start_of_backbone = sizeof(header);
1295  header.end_of_overflow = header.start_of_backbone +
1296  (RPS_NUM_LOOKUP_CELLS + 1) * sizeof(RPSBackboneCell) +
1297  cursor * sizeof(Int4);
1298 
1299  /* write the lookup file header */
1300 
1301  rpsDbInfo.lookup_file.write((const char *)&header, sizeof(header));
1302 
1303  /* write the thick backbone */
1304 
1305  rpsDbInfo.lookup_file.write((const char *)lut->thick_backbone,
1306  sizeof(RPSBackboneCell)* lut->backbone_size);
1307 
1308  /* write extra backbone cells */
 // Zeroed cells pad the backbone up to RPS_NUM_LOOKUP_CELLS + 1 entries.
1309  memset(&empty_cell, 0, sizeof(empty_cell));
1310  for (i = lut->backbone_size; i < RPS_NUM_LOOKUP_CELLS + 1; i++) {
1311  rpsDbInfo.lookup_file.write((const char *)&empty_cell, sizeof(empty_cell));
1312  }
1313 
1314  /* write the new overflow array */
1315  rpsDbInfo.lookup_file.write((const char *)lut->overflow, sizeof(Int4)*cursor);
1316  }
1317 
1318  /* Free data, close files */
1319 
1320  rpsDbInfo.lookup = BlastAaLookupTableDestruct(rpsDbInfo.lookup);
 // NOTE(review): listing line 1321 is not visible here.
1322  rpsDbInfo.lookup_file.flush();
1323  rpsDbInfo.lookup_file.close();
1324  rpsDbInfo.pssm_file.flush();
1325  rpsDbInfo.pssm_file.close();
1326  rpsDbInfo.aux_file.flush();
1327  rpsDbInfo.aux_file.close();
1328  rpsDbInfo.freq_file.flush();
1329  rpsDbInfo.freq_file.close();
1330 
 // The blocks file is only flushed/closed in cobalt mode. In any other
 // mode the .freq file is deleted when frequency ratios were not updated.
 // Because of the `else if`, the .freq file is never deleted in cobalt mode.
1331  if(op_cobalt == m_op_mode)
1332  {
1333  rpsDbInfo.blocks_file.flush();
1334  rpsDbInfo.blocks_file.close();
1335  }
1336  else if(!m_UpdateFreqRatios)
1337  {
1338  string freq_str = rpsDbInfo.db_name + ".freq";
1339  CFile(freq_str).Remove();
1340  }
1341 
1342 }
1343 
1345 {
1348 }
1349 
1350 static bool s_HasDefline(const CBioseq & bio)
1351 {
1352  if (bio.CanGetDescr()) {
1353  return true;
1354  }
1355 
1356  return false;
1357 }
1358 
1360 {
 // NOTE(review): the function header and the declaration of `defline_set`
 // are not visible in this listing (the embedded numbering skips 1359 and
 // 1361). The call site elsewhere in the file is
 // s_GenerateBlastDefline(bioseq), whose result is stored in a
 // CRef<CBlast_def_line_set>.
 //
 // Builds a single defline whose Seq-id list is copied from bio.GetId(),
 // appends it to defline_set and returns the set.
1362  CRef<CBlast_def_line> defline(new CBlast_def_line());
1363  defline->SetSeqid() = bio.GetId();
1364  defline_set->Set().push_back(defline);
1365  return defline_set;
1366 }
1367 
1369 {
 // NOTE(review): the function header and listing lines 1370-1371 and 1400
 // are not visible here (the embedded numbering skips them). Presumably this
 // is the x_Run() invoked from Run() -- confirm in the real file.
1372  *m_LogFile << "Deleted existing BLAST database with identical name." << endl;
1373  }
 // In delta mode the input list is the one produced by x_CreateDeltaList();
 // otherwise all scoremat files from x_GetSMPFilenames() are used.
1374  vector<string> smpFilenames = (op_delta == m_op_mode )? x_CreateDeltaList():x_GetSMPFilenames();
1375  int num_smps = smpFilenames.size();
 // Volume count, files per volume (num_seqs) and the leftover files that
 // the loop below hands out one at a time.
1376  m_NumOfVols = num_smps/m_MaxSmpFilesPerVol + 1;
1377  int num_seqs = num_smps/m_NumOfVols;
1378  int residue_seqs = num_smps % m_NumOfVols;
1379  if(m_NumOfVols == 1) {
 // Single volume: x_MakeVol is called with -1 as the volume number.
1380  x_MakeVol( -1, smpFilenames);
1381  m_Done = true;
1382  return 0;
1383  }
1384  else {
 // [b, r) is the slice of file names for the current volume.
1385  vector<string>::iterator b = smpFilenames.begin();
1386  vector<string>::iterator r = b + num_seqs;
1387  for(int i=0; i < m_NumOfVols; i++) {
1388  vector<string> vol_smps(b, r);
1389  x_MakeVol(i, vol_smps);
1390  b= r;
1391  r = b + num_seqs;
 // The first volume gets num_seqs files; following volumes get one extra
 // file each until the leftover count is used up.
1392  if(residue_seqs > 0) {
1393  r++;
1394  residue_seqs--;
1395  }
1396  }
 // NOTE(review): after the last volume `r` is advanced beyond
 // smpFilenames.end() without being used -- confirm this is acceptable.
1397  _ASSERT(b==smpFilenames.end());
1398  }
1399  if (m_NumOfVols == m_VolNames.size()) {
1401  m_Done = true;
1402  }
1403  return 0;
1404 }
1405 
/*
 * Build one database volume from the given scoremat files.
 *
 * vol  : volume number passed on to x_InitRPSDbInfo().
 * smps : scoremat file names that make up this volume.
 *
 * For every file the PSSM is read and checked, the query Bioseq and its
 * deflines are added to rpsDbInfo.output_db, and the PSSM, lookup table and
 * frequency-ratio files are updated. Throws CInputException for a missing
 * file or an invalid scoremat and CBlastException when scaling factors
 * disagree.
 */
1406 void CMakeProfileDBApp::x_MakeVol(Int4 vol, vector<string> & smps)
1407 {
1408 
1409  CRPS_DbInfo rpsDbInfo;
1410  x_InitRPSDbInfo(rpsDbInfo, vol, smps.size());
 // The volume name is recorded in m_VolNames.
1411  m_VolNames.push_back(rpsDbInfo.db_name);
1412  x_InitOutputDb(rpsDbInfo);
1413 
1414  for(int seq_index=0; seq_index < rpsDbInfo.num_seqs; seq_index++)
1415  {
1416  string filename = smps[seq_index];
1417  CFile f(filename);
1418  if(!f.Exists())
1419  {
1420  string err = filename + " does not exists";
1421  NCBI_THROW(CInputException, eInvalidInput, err);
1422  }
1423 
1424  //Read PssmWithParameters from file
 // m_binary_scoremat selects binary vs. text ASN.1 input.
1425  CPssmWithParameters pssm_w_parameters;
1426  if(m_binary_scoremat)
1427  {
1428  CNcbiIfstream in_stream(filename.c_str(), ios::binary);
1429  in_stream >> MSerial_AsnBinary >> pssm_w_parameters;
1430  }
1431  else
1432  {
1433  CNcbiIfstream in_stream(filename.c_str());
1434  in_stream >> MSerial_AsnText >> pssm_w_parameters;
1435  }
1436 
1437  CheckInputScoremat_RV sm = x_CheckInputScoremat(pssm_w_parameters, filename);
1438  // Should have error out already....
1439  if(sm_invalid == sm)
1440  {
1441  string err = filename + " contains invalid scoremat";
1442  NCBI_THROW(CInputException, eInvalidInput, err);
1443  }
1444 
1445  const CPssm & pssm = pssm_w_parameters.GetPssm();
1446  int seq_size = pssm.GetQueryLength();
1447 
 // Deflines are extracted from the Bioseq when s_HasDefline() says so,
 // otherwise they are generated from the Bioseq's ids.
1448  const CBioseq & bioseq = pssm.GetQuery().GetSeq();
1449  CRef<CBlast_def_line_set> deflines;
1450  if(s_HasDefline(bioseq)) {
1451  deflines = CWriteDB::ExtractBioseqDeflines(bioseq);
1452  }
1453  else {
1454  deflines = s_GenerateBlastDefline(bioseq);
1455  }
1456 
1457  // set taxids from the PSSM unless -taxid or -taxid_map option was used
 // Only the first Org descriptor is used (note the break below).
1458  if (!m_UserTaxIds) {
1459  if (bioseq.IsSetDescr()) {
1460  for (const auto& it: bioseq.GetDescr().Get()) {
1461  if (it->IsOrg()) {
1462  TTaxId taxid = it->GetOrg().GetTaxId();
1463  const CSeq_id* seqid = bioseq.GetFirstId();
1464  _ASSERT(seqid);
1465  if (seqid) {
1466  m_Taxids->AddTaxId(*seqid, taxid);
1467  }
1468 
1469  break;
1470  }
1471  }
1472  }
1473  }
1474 
1475  m_Taxids->FixTaxId(deflines);
1476  rpsDbInfo.output_db->AddSequence(bioseq);
1477  rpsDbInfo.output_db->SetDeflines(*deflines);
1478 
1479  //Complete RpsDnInfo init with data from first file
 // A NULL lookup table marks the first sequence of the volume.
1480  if(NULL == rpsDbInfo.lookup)
1481  {
1482  x_RPSAddFirstSequence( rpsDbInfo, pssm_w_parameters, sm == sm_valid_freq_only);
1483  }
1484  else
1485  {
1486  x_FillInRPSDbParameters(rpsDbInfo, pssm_w_parameters);
1487  if(sm_valid_freq_only == sm){
1488  x_RPSUpdateStatistics(rpsDbInfo, pssm_w_parameters, seq_size);
1489  }
1490 
 // Later sequences must use the same scaling factor as the volume.
1491  if( pssm.GetFinalData().IsSetScalingFactor())
1492  {
1493  if( pssm.GetFinalData().GetScalingFactor() != rpsDbInfo.scale_factor) {
1494  NCBI_THROW(CBlastException, eCoreBlastError, "Scaling factors do not match");
1495  }
1496  }
1497  else
1498  {
1499  // If scaling factor not specified, the default is 1
1500  if( 1 != rpsDbInfo.scale_factor) {
1501  NCBI_THROW(CBlastException, eCoreBlastError, "Scaling factors do not match");
1502  }
1503  }
1504 
 // NOTE(review): the `if` header that pairs with the `else` below is not
 // visible in this listing (the embedded numbering skips 1505); presumably
 // it tests whether the PSSM carries its own word score threshold.
1506  rpsDbInfo.lookup->threshold = rpsDbInfo.scale_factor * pssm_w_parameters.GetPssm().GetFinalData().GetWordScoreThreshold();
1507  }
1508  else {
1509  rpsDbInfo.lookup->threshold = rpsDbInfo.scale_factor * m_WordDefaultScoreThreshold;
1510  }
1511 
1512  }
1513 
1514  x_RPSUpdatePSSM(rpsDbInfo, pssm, seq_index, seq_size);
1515  x_RPSUpdateLookup(rpsDbInfo, seq_size);
1516  x_UpdateFreqRatios(rpsDbInfo, pssm_w_parameters, seq_index, seq_size);
1517 
 // The aux file receives the sequence length and kappa for each profile.
1518  rpsDbInfo.aux_file << seq_size << "\n";
1519  rpsDbInfo.aux_file << scientific << pssm.GetFinalData().GetKappa() << "\n";
 // The running offset advances by seq_size + 1.
1520  rpsDbInfo.curr_seq_offset +=(seq_size +1);
1521  rpsDbInfo.pos_matrix.Delete();
1522 
1523  if(op_cobalt == m_op_mode) {
1524  x_UpdateCobalt(rpsDbInfo, pssm_w_parameters, seq_size);
1525  }
1526  }
1527 
1528  if(op_delta == m_op_mode) {
1529  x_UpdateDelta(rpsDbInfo, smps);
1530  }
1531  rpsDbInfo.output_db->Close();
1532  x_RPS_DbClose(rpsDbInfo);
1533 }
1534 
1535 static void s_WriteInt4List(CNcbiOfstream & ostr, const list<Int4> & l)
1536 {
1537  ITERATE(list<Int4>, it, l)
1538  {
1539  ostr.write((char*)&(*it), sizeof(Int4));
1540  }
1541 }
1542 
1543 static void s_WriteUint4List(CNcbiOfstream & ostr, const list<Uint4> & l)
1544 {
1545  ITERATE(list<Uint4>, it, l)
1546  {
1547  ostr.write((char*)&(*it), sizeof(Uint4));
1548  }
1549 }
1550 
1552 {
 // NOTE(review): the function header line is not visible in this listing
 // (the embedded numbering skips 1551). The body returns a vector<string>
 // and is used elsewhere in the file as x_CreateDeltaList().
 //
 // Reads every scoremat file from x_GetSMPFilenames() and returns the names
 // of those for which x_CheckDelta() returns true. Throws CInputException
 // for a missing file, an invalid scoremat, or missing delta-specific data.
1553  vector<string> smpFilenames = x_GetSMPFilenames();
1554  vector<string> deltaList;
1555 
1556  for(unsigned int seq_index=0; seq_index < smpFilenames.size(); seq_index++)
1557  {
1558  string filename = smpFilenames[seq_index];
1559  CFile f(filename);
1560  if(!f.Exists())
1561  {
1562  string err = filename + " does not exists";
1563  NCBI_THROW(CInputException, eInvalidInput, err);
1564  }
1565 
1566  //Read PssmWithParameters from file
 // m_binary_scoremat selects binary vs. text ASN.1 input.
1567  CPssmWithParameters pssm_w_parameters;
1568  if(m_binary_scoremat)
1569  {
1570  CNcbiIfstream in_stream(filename.c_str(), ios::binary);
1571  in_stream >> MSerial_AsnBinary >> pssm_w_parameters;
1572  }
1573  else
1574  {
1575  CNcbiIfstream in_stream(filename.c_str());
1576  in_stream >> MSerial_AsnText >> pssm_w_parameters;
1577  }
1578 
1579  CheckInputScoremat_RV sm = x_CheckInputScoremat(pssm_w_parameters, filename);
1580  // Should have error out already....
1581  if(sm_invalid == sm)
1582  {
1583  string err = filename + " contains invalid scoremat";
1584  NCBI_THROW(CInputException, eInvalidInput, err);
1585  }
1586 
1587  const CPssm & pssm = pssm_w_parameters.GetPssm();
1588  int seq_size = pssm.GetQueryLength();
 // NOTE(review): the condition guarding the next block is not visible in
 // this listing (numbering skips 1589); judging by the message it checks
 // for missing weighted residue frequencies -- confirm in the real file.
1590  {
1591  string err = filename + " contains no weighted residue frequencies for building delta database";
1592  NCBI_THROW(CInputException, eInvalidInput, err);
1593  }
1594 
 // NOTE(review): likewise the condition for this block (numbering skips
 // 1595) is not visible; the message refers to missing observations data.
1596  {
1597  string err = filename + " contains no observations information for building delta database";
1598  NCBI_THROW(CInputException, eInvalidInput, err);
1599  }
1600 
 // Only files accepted by x_CheckDelta() become part of the delta database.
1601  if (true == x_CheckDelta(pssm, seq_size, filename))
1602  {
1603  deltaList.push_back(filename);
1604  }
1605  }
1606 
1607  return deltaList;
1608 }
1609 
1610 void CMakeProfileDBApp::x_UpdateDelta(CRPS_DbInfo & rpsDbInfo, vector<string> & smpFilenames)
1611 {
1612  CTmpFile tmp_obsr_file(CTmpFile::eRemove);
1613  CTmpFile tmp_freq_file(CTmpFile::eRemove);
1614  CNcbiOfstream tmp_obsr_buff(tmp_obsr_file.GetFileName().c_str(), IOS_BASE::out | IOS_BASE::binary);
1615  CNcbiOfstream tmp_freq_buff(tmp_freq_file.GetFileName().c_str(), IOS_BASE::out | IOS_BASE::binary);
1616 
1617  list<Int4> FreqOffsets;
1618  list<Int4> ObsrOffsets;
1619  Int4 CurrFreqOffset = 0;
1620  Int4 CurrObsrOffset= 0;
1621 
1622  for(unsigned int seq_index=0; seq_index < smpFilenames.size(); seq_index++)
1623  {
1624  string filename = smpFilenames[seq_index];
1625  //Read PssmWithParameters from file
1626  CPssmWithParameters pssm_w_parameters;
1627  if(m_binary_scoremat)
1628  {
1629  CNcbiIfstream in_stream(filename.c_str(), ios::binary);
1630  in_stream >> MSerial_AsnBinary >> pssm_w_parameters;
1631  }
1632  else
1633  {
1634  CNcbiIfstream in_stream(filename.c_str());
1635  in_stream >> MSerial_AsnText >> pssm_w_parameters;
1636  }
1637 
1638  const CPssm & pssm = pssm_w_parameters.GetPssm();
1639  int seq_size = pssm.GetQueryLength();
1640 
1641  // get weightd residue frequencies
1642  const list<double>& orig_freqs = pssm.GetIntermediateData().GetWeightedResFreqsPerPos();
1643 
1644  // get number of independent observations
1645  const list<double>& obsr = pssm.GetIntermediateData().GetNumIndeptObsr();
1646 
1647  int alphabet_size = pssm.GetNumRows();
1648  list<double> modify_freqs;
1649 
1650  if(pssm.GetByRow())
1651  {
1652  // need to flip the freq matrix
1653  vector<double> tmp(orig_freqs.size());
1654  list<double>::const_iterator f_itr = orig_freqs.begin();
1655 
1656  for(int i = 0; i < alphabet_size; i++)
1657  {
1658  for(int j = 0; j < seq_size; j++)
1659  {
1660  tmp[i + j*alphabet_size] = *f_itr;
1661  ++f_itr;
1662  }
1663  }
1664  copy(tmp.begin(), tmp.end(), modify_freqs.begin());
1665  }
1666 
1667  // Pad matrix if necessary
1668  if(alphabet_size < BLASTAA_SIZE)
1669  {
1670  if(0 == modify_freqs.size())
1671  copy(orig_freqs.begin(), orig_freqs.end(), modify_freqs.begin());
1672 
1673  list<double>::iterator p_itr = modify_freqs.begin();
1674 
1675  for (int j=0; j < seq_size; j++)
1676  {
1677  for(int i=0; i < alphabet_size; i++)
1678  {
1679  if(modify_freqs.end() == p_itr)
1680  break;
1681 
1682  ++p_itr;
1683  }
1684 
1685  modify_freqs.insert(p_itr, (BLASTAA_SIZE-alphabet_size), 0);
1686  }
1687  }
1688 
1689  const list<double> & freqs = (modify_freqs.size()? modify_freqs:orig_freqs );
1690 
1691  //save offset for this record
1692  ObsrOffsets.push_back(CurrObsrOffset);
1693 
1694  list<Uint4> ObsrBuff;
1695  // write effective observations in compressed form
1696  // as a list of pairs: value, number of occurences
1697  unsigned int num_obsr_columns = 0;
1698  list<double>::const_iterator obsr_it = obsr.begin();
1699  do
1700  {
1701  double current = *obsr_it;
1702  Uint4 num = 1;
1703  num_obsr_columns++;
1704  obsr_it++;
1705  while (obsr_it != obsr.end() && fabs(*obsr_it - current) < 1e-4)
1706  {
1707  obsr_it++;
1708  num++;
1709  num_obsr_columns++;
1710  }
1711 
1712  // +1 because pssm engine returns alpha (in psi-blast papers)
1713  // which is number of independent observations - 1
1714  ObsrBuff.push_back((Uint4)((current + 1.0) * kFixedPointScaleFactor));
1715  ObsrBuff.push_back(num);
1716  }
1717  while (obsr_it != obsr.end());
1718 
1719  Uint4 num_weighted_counts = 0;
1720 
1721  // save offset for this frequencies record
1722  FreqOffsets.push_back(CurrFreqOffset / BLASTAA_SIZE);
1723 
1724  list<Uint4> FreqBuff;
1725  // save weighted residue frequencies
1726  ITERATE (list<double>, it, freqs)
1727  {
1728  FreqBuff.push_back((Uint4)(*it * kFixedPointScaleFactor));
1729  num_weighted_counts++;
1730  }
1731 
1732  if (num_obsr_columns != num_weighted_counts / BLASTAA_SIZE)
1733  {
1734  string err = "Number of frequencies and observations columns do not match in " + filename;
1735  NCBI_THROW(CException, eInvalid, err);
1736  }
1737 
1738  // additional column of zeros is added for compatibility with rps database
1739  unsigned int padded_size = FreqBuff.size() + BLASTAA_SIZE;
1740  FreqBuff.resize(padded_size, 0);
1741 
1742  CurrFreqOffset += FreqBuff.size();
1743  CurrObsrOffset += ObsrBuff.size();
1744  s_WriteUint4List(tmp_freq_buff, FreqBuff);
1745  s_WriteUint4List(tmp_obsr_buff, ObsrBuff);
1746 
1747  }
1748 
1749  tmp_obsr_buff.flush();
1750  tmp_freq_buff.flush();
1751  x_WrapUpDelta(rpsDbInfo, tmp_obsr_file, tmp_freq_file, FreqOffsets, ObsrOffsets, CurrFreqOffset, CurrObsrOffset);
1752 }
1753 
1754 
1755 bool CMakeProfileDBApp::x_ValidateCd(const list<double>& freqs,
1756  const list<double>& observ,
1757  unsigned int alphabet_size)
1758 {
1759 
1760  if (freqs.size() / alphabet_size != observ.size())
1761  {
1762  string err = "Number of frequency and observations columns do not match";
1763  NCBI_THROW(CException, eInvalid, err);
1764  }
1765 
1766  ITERATE (list<double>, it, freqs)
1767  {
1768  unsigned int residue = 0;
1769  double sum = 0.0;
1770  while (residue < alphabet_size - 1)
1771  {
1772  sum += *it;
1773  it++;
1774  residue++;
1775  }
1776  sum += *it;
1777 
1778  if (fabs(sum - 1.0) > kEpsylon)
1779  return false;
1780  }
1781 
1782  ITERATE (list<double>, it, observ)
1783  {
1784  if (*it < 1.0)
1785  return false;
1786  }
1787 
1788  return true;
1789 }
1790 
1791 
1792 bool CMakeProfileDBApp::x_CheckDelta( const CPssm & pssm, Int4 seq_size, const string & filename)
1793 {
1794  // get weightd residue frequencies
1795  const list<double>& orig_freqs = pssm.GetIntermediateData().GetWeightedResFreqsPerPos();
1796 
1797  // get number of independent observations
1798  const list<double>& obsr = pssm.GetIntermediateData().GetNumIndeptObsr();
1799 
1800  int alphabet_size = pssm.GetNumRows();
1801  list<double> modify_freqs;
1802 
1803  if(pssm.GetByRow())
1804  {
1805  // need to flip the freq matrix
1806  vector<double> tmp(orig_freqs.size());
1807  list<double>::const_iterator f_itr = orig_freqs.begin();
1808 
1809  for(int i = 0; i < alphabet_size; i++)
1810  {
1811  for(int j = 0; j < seq_size; j++)
1812  {
1813  tmp[i + j*alphabet_size] = *f_itr;
1814  ++f_itr;
1815  }
1816  }
1817  copy(tmp.begin(), tmp.end(), modify_freqs.begin());
1818  }
1819 
1820  // Pad matrix if necessary
1821  if(alphabet_size < BLASTAA_SIZE)
1822  {
1823  if(0 == modify_freqs.size())
1824  copy(orig_freqs.begin(), orig_freqs.end(), modify_freqs.begin());
1825 
1826  list<double>::iterator p_itr = modify_freqs.begin();
1827 
1828  for (int j=0; j < seq_size; j++)
1829  {
1830  for(int i=0; i < alphabet_size; i++)
1831  {
1832  if(modify_freqs.end() == p_itr)
1833  break;
1834 
1835  ++p_itr;
1836  }
1837 
1838  modify_freqs.insert(p_itr, (BLASTAA_SIZE-alphabet_size), 0);
1839  }
1840  }
1841 
1842  const list<double> & freqs = (modify_freqs.size()? modify_freqs:orig_freqs );
1843  double max_obsr = *max_element(obsr.begin(), obsr.end()) + 1.0;
1844  if(max_obsr < m_ObsrvThreshold)
1845  {
1846  *m_LogFile << filename +
1847  " was excluded: due to too few independent observations\n";
1848  return false;
1849  }
1850 
1851  if( !x_ValidateCd(freqs, obsr, BLASTAA_SIZE) && m_ExcludeInvalid)
1852  {
1853  *m_LogFile << filename +
1854  " was excluded: it conatins an invalid CD \n";
1855  return false;
1856  }
1857  return true;
1858 }
1859 
1860 
1861 
1862 void CMakeProfileDBApp::x_WrapUpDelta(CRPS_DbInfo & rpsDbInfo, CTmpFile & tmp_obsr_file, CTmpFile & tmp_freq_file,
1863  list<Int4> & FreqOffsets, list<Int4> & ObsrOffsets, Int4 CurrFreqOffset, Int4 CurrObsrOffset)
1864 {
1865  FreqOffsets.push_back(CurrFreqOffset / BLASTAA_SIZE);
1866  ObsrOffsets.push_back(CurrObsrOffset);
1867 
1868  string wcounts_str = rpsDbInfo.db_name + ".wcounts";
1869  CNcbiOfstream wcounts_file(wcounts_str.c_str(), ios::out | ios::binary);
1870  if (!wcounts_file.is_open())
1871  NCBI_THROW(CSeqDBException, eFileErr,"Failed to open output .wcounts file");
1872 
1873  string obsr_str = rpsDbInfo.db_name + ".obsr";
1874  CNcbiOfstream obsr_file(obsr_str.c_str(), IOS_BASE::out|IOS_BASE::binary);
1875  if (!obsr_file.is_open())
1876  NCBI_THROW(CSeqDBException, eFileErr,"Failed to open output .obsr file");
1877 
1878  CNcbiIfstream tmp_obsr_buff (tmp_obsr_file.GetFileName().c_str(), IOS_BASE::in | IOS_BASE::binary);
1879  CNcbiIfstream tmp_freq_buff (tmp_freq_file.GetFileName().c_str(), IOS_BASE::in | IOS_BASE::binary);
1880 
1881  // write RPS BLAST database magic number
1882  Int4 magic_number = RPS_MAGIC_NUM_28;
1883  wcounts_file.write((char*)&magic_number, sizeof(Int4));
1884  obsr_file.write((char*)&magic_number, sizeof(Int4));
1885 
1886  // write number of recrods
1887  Int4 num_wcounts_records = FreqOffsets.size() -1;
1888  Int4 num_obsr_records = ObsrOffsets.size() -1;
1889  wcounts_file.write((char*)&num_wcounts_records, sizeof(Int4));
1890  obsr_file.write((char*)&num_obsr_records, sizeof(Int4));
1891 
1892  s_WriteInt4List(wcounts_file, FreqOffsets);
1893  wcounts_file.flush();
1894  wcounts_file << tmp_freq_buff.rdbuf();
1895  wcounts_file.flush();
1896  wcounts_file.close();
1897 
1898  s_WriteInt4List(obsr_file, ObsrOffsets);
1899  obsr_file.flush();
1900  obsr_file << tmp_obsr_buff.rdbuf();
1901  obsr_file.flush();
1902  obsr_file.close();
1903 }
1904 
1906 {
 // NOTE(review): the function header and listing lines 1910, 1914 and 1915
 // are not visible here (the embedded numbering skips them), including the
 // declaration of `s` and whatever consumes `v`. Presumably this is
 // x_CreateAliasFile() -- confirm in the real file.
 //
 // Visible behaviour: one string per entry of m_VolNames is obtained via
 // s.GetString(t) and collected in `v`.
1907  vector<string> v;
1908  for(unsigned int i=0; i < m_VolNames.size(); i++) {
1909  string t = kEmptyStr;
1911  s.GetString(t);
1912  v.push_back(t);
1913  }
1916 }
1917 
1919 {
 // NOTE(review): the function header and listing line 1944 are not visible
 // in this listing (the embedded numbering skips 1918 and 1944); presumably
 // this is the application's Run() override.
 //
 // Runs x_Run() and converts any exception into an exit status:
 // input and BLAST exceptions -> BLAST_INPUT_ERROR, CSeqDBException ->
 // BLAST_DATABASE_ERROR, anything else -> BLAST_UNKNOWN_ERROR.
1920  int status = 0;
1921  try { x_Run(); }
1922  catch(const blast::CInputException& e) {
1923  ERR_POST(Error << "INPUT ERROR: " << e.GetMsg());
1924  status = BLAST_INPUT_ERROR;
1925  }
1926  catch (const CSeqDBException& e) {
1927  ERR_POST(Error << "ERROR: " << e.GetMsg());
1928  status = BLAST_DATABASE_ERROR;
1929  }
1930  catch (const blast::CBlastException& e) {
1931  ERR_POST(Error << "ERROR: " << e.GetMsg());
1932  status = BLAST_INPUT_ERROR;
1933  }
 // Handlers for the specific exception types come before this CException handler.
1934  catch (const CException& e) {
1935  ERR_POST(Error << "ERROR: " << e.GetMsg());
1936  status = BLAST_UNKNOWN_ERROR;
1937  }
1938  catch (...) {
1939  ERR_POST(Error << "Error: Unknown exception");
1940  status = BLAST_UNKNOWN_ERROR;
1941  }
1942 
 // Command-line options are recorded whether or not x_Run() succeeded.
1943  x_AddCmdOptions();
1945  return status;
1946 }
1947 
1949 {
 // NOTE(review): the function header and listing line 1955 are not visible
 // in this listing (the embedded numbering skips 1948 and 1955); the body
 // is invoked as x_AddCmdOptions() elsewhere in the file.
 //
 // Adds selected command-line arguments to the usage report.
1950  const CArgs & args = GetArgs();
1951  if (args["dbtype"].HasValue()) {
1952  m_UsageReport.AddParam(CBlastUsageReport::eDBType, args["dbtype"].AsString());
1953  }
 // The statement executed when -taxid or -taxid_map was given is the
 // line missing from this listing.
1954  if(args["taxid"].HasValue() || args["taxid_map"].HasValue()) {
1956  }
1957 }
1958 
1959 
1960 #ifndef SKIP_DOXYGEN_PROCESSING
1961 int main(int argc, const char* argv[] /*, const char* envp[]*/)
1962 {
1963  return CMakeProfileDBApp().AppMain(argc, argv);
1964 }
1965 
1966 
1967 
1968 
1969 #endif /* SKIP_DOXYGEN_PROCESSING */
1970 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
Declares singleton objects to store the version and reference for the BLAST engine.
Routines for creating protein BLAST lookup tables.
@ eBackbone
BlastAaLookupTable * BlastAaLookupTableDestruct(BlastAaLookupTable *lookup)
Free the lookup table.
void BlastAaLookupIndexQuery(BlastAaLookupTable *lookup, Int4 **matrix, BLAST_SequenceBlk *query, BlastSeqLoc *unmasked_regions, Int4 query_bias)
Index a protein query.
struct RPSBackboneCell RPSBackboneCell
structure defining one cell of the RPS lookup table
#define RPS_HITS_PER_CELL
maximum number of hits in an RPS backbone cell; this may be redundant (have the same value as AA_HITS...
Int4 BlastAaLookupFinalize(BlastAaLookupTable *lookup, EBoneType bone_type)
Pack the data structures comprising a protein lookup table into their final form.
Int4 BlastAaLookupTableNew(const LookupTableOptions *opt, BlastAaLookupTable **lut)
Create a new protein lookup table.
#define BLAST_INPUT_ERROR
Command line binary exit code: error in input query/options.
#define BLAST_UNKNOWN_ERROR
Command line binary exit code: unknown error.
#define BLAST_DATABASE_ERROR
Command line binary exit code: error in database/subject.
BlastSeqLoc * BlastSeqLocFree(BlastSeqLoc *loc)
Deallocate all BlastSeqLoc objects in a chain.
Definition: blast_filter.c:737
BlastSeqLoc * BlastSeqLocNew(BlastSeqLoc **head, Int4 from, Int4 to)
Create and initialize a new sequence interval.
Definition: blast_filter.c:608
Interface for converting sources of sequence data into blast sequence input.
The structures and functions in blast_options.
Int2 BLAST_FillQuerySetUpOptions(QuerySetUpOptions *options, EBlastProgramType program, const char *filter_string, Uint1 strand_option)
Fill non-default contents of the QuerySetUpOptions.
Int2 BlastQuerySetUpOptionsNew(QuerySetUpOptions **options)
Allocate memory for QuerySetUpOptions and fill with default values.
Int2 BLAST_FillLookupTableOptions(LookupTableOptions *options, EBlastProgramType program, Boolean is_megablast, double threshold, Int4 word_size)
Allocate memory for lookup table options and fill with default values.
Int2 LookupTableOptionsNew(EBlastProgramType program, LookupTableOptions **options)
Allocate memory for lookup table options and fill with default values.
#define BLAST_WORDSIZE_PROT
length of word to trigger an extension.
Definition: blast_options.h:66
LookupTableOptions * LookupTableOptionsFree(LookupTableOptions *options)
Deallocates memory for LookupTableOptions*.
QuerySetUpOptions * BlastQuerySetUpOptionsFree(QuerySetUpOptions *options)
Deallocate memory for QuerySetUpOptions.
@ eBlastTypePsiBlast
Definition: blast_program.h:82
@ eBlastTypeBlastp
Definition: blast_program.h:73
#define FREQ_RATIO_SCALE
header for RPS blast frequency ratios ('.freq') file
Definition: blast_rps.h:83
#define RPS_MAGIC_NUM_28
Version number for 28-letter alphabet.
Definition: blast_rps.h:44
Int2 BLAST_GetProteinGapExistenceExtendParams(const char *matrixName, Int4 *gap_existence, Int4 *gap_extension)
Extract the recommended gap existence and extension values.
Definition: blast_stat.c:3374
#define BLAST_SCORE_MAX
maximum allowed score (for one letter comparison).
Definition: blast_stat.h:122
Code to build a database given various sources of sequence data.
AutoArray –.
Definition: ncbimisc.hpp:527
Class to constrain the values of an argument to those greater than or equal to the value specified in...
CArgAllow_Integers –.
Definition: ncbiargs.hpp:1751
CArgAllow_Strings –.
Definition: ncbiargs.hpp:1641
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgs –.
Definition: ncbiargs.hpp:379
const CSeq_id * GetFirstId() const
Definition: Bioseq.cpp:271
Defines BLAST error codes (user errors included)
void AddParam(EUsageParams p, int val)
Keeps track of the version of the BLAST engine in the NCBI C++ toolkit.
Definition: version.hpp:53
static void CreateDirectories(const string &dbname)
Create Directory for blast db.
Definition: build_db.cpp:1051
CCoreBlock –.
Definition: CoreBlock.hpp:66
CFile –.
Definition: ncbifile.hpp:1604
Defines user input exceptions.
void Create(int seq_size)
Int4 ** Get(void)
unsigned int GetSize(void)
QuerySetUpOptions * query_options
LookupTableOptions * lookup_options
CMakeProfileDBApp(void)
@inheritDoc
void x_AddCmdOptions(void)
virtual void Init()
@inheritDoc
CNcbiOstream * m_LogFile
CheckInputScoremat_RV x_CheckInputScoremat(const CPssmWithParameters &pssm_w_parameters, const string &filename)
CRef< CTaxIdSet > m_Taxids
CNcbiIstream * m_InPssmList
EBlastDbVersion m_DbVer
void x_RPSUpdateLookup(CRPS_DbInfo &rpsDbInfo, Int4 seq_size)
vector< string > x_CreateDeltaList(void)
void x_WrapUpDelta(CRPS_DbInfo &rpsDbInfo, CTmpFile &tmp_obsr_file, CTmpFile &tmp_freq_file, list< Int4 > &FreqOffsets, list< Int4 > &ObsrOffsets, Int4 CurrFreqOffset, Int4 CurrObsrOffset)
void x_RPSUpdateStatistics(CRPS_DbInfo &rpsDbInfo, CPssmWithParameters &seq, Int4 seq_size)
virtual int Run()
@inheritDoc
void x_CreateAliasFile(void)
void x_FillInRPSDbParameters(CRPS_DbInfo &rpsDbInfo, CPssmWithParameters &pssm_p)
void x_InitProgramParameters(void)
void x_InitRPSDbInfo(CRPS_DbInfo &rpsDBInfo, Int4 vol, Int4 num_files)
void x_RPS_DbClose(CRPS_DbInfo &rpsDbInfo)
bool x_CheckDelta(const CPssm &pssm, Int4 seq_size, const string &filename)
void x_RPSAddFirstSequence(CRPS_DbInfo &rpsDbInfo, CPssmWithParameters &pssm_w_parameters, bool freq_only)
void x_UpdateRPSDbInfo(CRPS_DbInfo &rpsDbInfo, const CPssmWithParameters &pssm_p)
void x_UpdateDelta(CRPS_DbInfo &rpsDbInfo, vector< string > &smpFilenames)
double m_WordDefaultScoreThreshold
void x_RPSUpdatePSSM(CRPS_DbInfo &rpsDbInfo, const CPssm &pssm, Int4 seq_index, Int4 seq_size)
void x_InitOutputDb(CRPS_DbInfo &rpsDBInfo)
void x_SetupArgDescriptions(void)
CBlastUsageReport m_UsageReport
vector< string > m_VolNames
CStopWatch m_StopWatch
bool x_ValidateCd(const list< double > &freqs, const list< double > &observ, unsigned int alphabet_size)
void x_UpdateFreqRatios(CRPS_DbInfo &rpsDbInfo, const CPssmWithParameters &pssm_p, Int4 seq_index, Int4 seq_size)
bool x_IsUpdateFreqRatios(const CPssm &p)
vector< string > x_GetSMPFilenames(void)
void x_UpdateCobalt(CRPS_DbInfo &rpsDbInfo, const CPssmWithParameters &pssm_p, Int4 seq_size)
void x_MakeVol(Int4 vol, vector< string > &smps)
CNCBIstdaa –.
Definition: NCBIstdaa.hpp:66
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string. Sample usage:
Definition: ncbistre.hpp:802
Implements the interface to retrieve data for the last 2 stages of the PSSM creation.
Computes a PSSM as specified in PSI-BLAST.
CPssmFinalData –.
CPssmParameters –.
Definition: Pssm.hpp:55
void GetQuerySequenceData(CNCBIstdaa &sequence) const
Retrieve the query sequence data in ncbistdaa format.
Definition: Pssm.cpp:77
SIZE_TYPE GetQueryLength() const
Return the query length or 0 if no query is available.
Definition: Pssm.cpp:62
CSeqDBException.
Definition: seqdbcommon.hpp:73
String slicing.
void GetString(string &s) const
Return the data by assigning it to a string.
@ eProtein
Definition: seqdb.hpp:174
CStopWatch –.
Definition: ncbitime.hpp:1938
void FixTaxId(CRef< objects::CBlast_def_line_set > deflines)
Check that each defline has the specified taxid; if not, replace the defline and set the taxid.
Definition: taxid_set.cpp:131
void AddTaxId(const objects::CSeq_id &seqid, const TTaxId &taxid)
Definition: taxid_set.cpp:77
void SetMappingFromFile(CNcbiIstream &f)
Definition: taxid_set.cpp:45
CTmpFile –.
Definition: ncbifile.hpp:2352
static string MakeShortName(const string &base, int index)
Construct the short name for a volume.
CWriteDB.
Definition: writedb.hpp:92
void SetMaxFileSize(Uint8 sz)
Set maximum size for output files.
Definition: writedb.cpp:118
@ eProtein
Protein database.
Definition: writedb.hpp:97
void AddSequence(const CBioseq &bs)
Add a sequence as a CBioseq.
Definition: writedb.cpp:79
EIndexType
Whether and what kind of indices to build.
Definition: writedb.hpp:104
@ eDefault
Like eFullIndex but also build a numeric Trace ID index.
Definition: writedb.hpp:121
@ eNoIndex
Build a database without any indices.
Definition: writedb.hpp:106
static CRef< CBlast_def_line_set > ExtractBioseqDeflines(const CBioseq &bs, bool parse_ids=true, bool long_ids=false, bool scan_bioseq_4_cfastareader_usrobj=false)
Extract Deflines From Bioseq.
Definition: writedb.cpp:129
void SetDeflines(const CBlast_def_line_set &deflines)
Set the deflines to be used for the sequence.
Definition: writedb.cpp:94
void Close()
Close the Database.
Definition: writedb.cpp:104
Constant declarations for command line arguments for BLAST programs.
const string kArgMatrixName
Argument for scoring matrix.
const string kArgDbTitle
Title for the BLAST database.
const string kArgGapExtend
Argument to select the gap extending penalty.
const string kArgGapOpen
Argument to select the gap opening penalty.
const string kArgWordScoreThreshold
Argument to specify the minimum word score such that the word is added to the lookup table.
void Print(const CCompactSAMApplication::AlignInfo &ai)
std::ofstream out("events_result.xml")
main entry point for tests
#define true
Definition: bool.h:35
#define false
Definition: bool.h:36
static char tmp[3200]
Definition: utf8.c:42
CRef< objects::CPssmWithParameters > Run()
Runs the PSSM engine to compute the PSSM.
#define BLASTAA_SIZE
Size of aminoacid alphabet.
static CNcbiMatrix< double > * GetFreqRatios(const objects::CPssmWithParameters &pssm)
Returns matrix of BLASTAA_SIZE by query size (dimensions are opposite of what is stored in the BlastS...
void SetFullVersion(CRef< CVersionAPI > version)
Set version data for the program.
Definition: ncbiapp.cpp:1174
void HideStdArgs(THideStdArgs hide_mask)
Set the hide mask for the Hide Std Flags.
Definition: ncbiapp.cpp:1312
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:305
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
Definition: ncbiapp.cpp:819
CVersionInfo GetVersion(void) const
Get the program version information.
Definition: ncbiapp.cpp:1184
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
Definition: ncbiapp.cpp:1195
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
#define TAX_ID_FROM(T, value)
Definition: ncbimisc.hpp:1111
@ fHideXmlHelp
Hide XML help description.
@ fHideFullVersion
Hide full version description.
@ fHideDryRun
Hide dryrun description.
@ fHideConffile
Hide configuration file description.
bool Exist(const string &name) const
Check existence of argument description.
Definition: ncbiargs.cpp:1813
@ eExcludes
One argument excludes another.
Definition: ncbiargs.hpp:957
@ eInputFile
Name of file (must exist and be readable)
Definition: ncbiargs.hpp:595
@ eBoolean
{'true', 't', 'false', 'f'}, case-insensitive
Definition: ncbiargs.hpp:590
@ eDouble
Convertible into a floating point number (double)
Definition: ncbiargs.hpp:594
@ eString
An arbitrary string.
Definition: ncbiargs.hpp:589
@ eInteger
Convertible into an integer number (int or Int8)
Definition: ncbiargs.hpp:592
#define NULL
Definition: ncbistd.hpp:225
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
void Info(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1185
virtual bool Remove(TRemoveFlags flags=eRecursive) const
Remove a directory entry.
Definition: ncbifile.cpp:2595
const string & GetFileName(void) const
Return used file name (generated or given in the constructor).
Definition: ncbifile.cpp:5429
@ eRemove
Remove file.
Definition: ncbifile.hpp:2356
#define MSerial_AsnBinary
Definition: serialbase.hpp:697
const TPrim & Get(void) const
Definition: serialbase.hpp:347
#define MSerial_AsnText
I/O stream manipulators –.
Definition: serialbase.hpp:696
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
bool IsEnabled(void)
Indicates whether application usage statistics collection is enabled for a current reporter instance.
CNcbiIstream & NcbiGetlineEOL(CNcbiIstream &is, string &str, string::size_type *count=NULL)
Read from "is" to "str" the next line (taking into account platform specifics of End-of-Line)
IO_PREFIX::ofstream CNcbiOfstream
Portable alias for ofstream.
Definition: ncbistre.hpp:500
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
static string DoubleToString(double value, int precision=-1, TNumToStringFlags flags=0)
Convert double to string.
Definition: ncbistr.hpp:5187
#define kEmptyStr
Definition: ncbistr.hpp:123
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3461
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
static bool SplitInTwo(const CTempString str, const CTempString delim, string &str1, string &str2, TSplitFlags flags=0)
Split a string into two pieces using the specified delimiters.
Definition: ncbistr.cpp:3554
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
@ fSplit_Truncate
Definition: ncbistr.hpp:2501
@ fSplit_MergeDelimiters
Merge adjacent delimiters.
Definition: ncbistr.hpp:2498
double Elapsed(void) const
Return time elapsed since first Start() or last Restart() call (in seconds).
Definition: ncbitime.hpp:2776
void Start(void)
Start the timer.
Definition: ncbitime.hpp:2765
#define CVersion
Tdata & Set(void)
Assign a value to data member.
const TFreqRatios & GetFreqRatios(void) const
Get the FreqRatios member data.
const TNumIndeptObsr & GetNumIndeptObsr(void) const
Get the NumIndeptObsr member data.
const TMatrixName & GetMatrixName(void) const
Get the MatrixName member data.
const TQuery & GetQuery(void) const
Get the Query member data.
Definition: Pssm_.hpp:772
TNumRows GetNumRows(void) const
Get the NumRows member data.
Definition: Pssm_.hpp:610
void SetParams(TParams &value)
Assign a value to Params data member.
bool IsSetFinalData(void) const
Final representation for the PSSM. Check if a value has been assigned to FinalData data member.
Definition: Pssm_.hpp:802
bool IsSetStop(void) const
End of block on query. Check if a value has been assigned to Stop data member.
Definition: CoreBlock_.hpp:367
TH GetH(void) const
Get the H member data.
TKappa GetKappa(void) const
Get the Kappa member data.
const TScores & GetScores(void) const
Get the Scores member data.
const TWeightedResFreqsPerPos & GetWeightedResFreqsPerPos(void) const
Get the WeightedResFreqsPerPos member data.
bool IsSetGapOpen(void) const
Gap opening penalty corresponding to the matrix above. Check if a value has been assigned to GapOpen d...
TGapExtend GetGapExtend(void) const
Get the GapExtend member data.
TWordScoreThreshold GetWordScoreThreshold(void) const
Get the WordScoreThreshold member data.
TScalingFactor GetScalingFactor(void) const
Get the ScalingFactor member data.
const TBlocks & GetBlocks(void) const
Get the Blocks member data.
Definition: CoreDef_.hpp:369
bool IsSetStart(void) const
Begin of block on query. Check if a value has been assigned to Start data member.
Definition: CoreBlock_.hpp:320
bool IsSetWordScoreThreshold(void) const
Word score threshold. Check if a value has been assigned to WordScoreThreshold data member.
bool IsSetScalingFactor(void) const
scaling factor used to obtain more precision when building the PSSM.
bool IsSetFreqRatios(void) const
PSSM's frequency ratios. Check if a value has been assigned to FreqRatios data member.
TStop GetStop(void) const
Get the Stop member data.
Definition: CoreBlock_.hpp:386
void SetMatrixName(const TMatrixName &value)
Assign a value to MatrixName data member.
bool IsSetIntermediateData(void) const
both intermediateData and finalData can be provided, but at least one of them must be provided.
Definition: Pssm_.hpp:781
const TFinalData & GetFinalData(void) const
Get the FinalData member data.
Definition: Pssm_.hpp:814
bool IsSetWeightedResFreqsPerPos(void) const
Weighted observed residue frequencies per position of the PSSM.
bool IsSetRpsdbparams(void) const
data needed by formatrpsdb to create RPS-BLAST databases.
void SetPssm(TPssm &value)
Assign a value to Pssm data member.
TNumColumns GetNumColumns(void) const
Get the NumColumns member data.
Definition: Pssm_.hpp:657
const TConstraints & GetConstraints(void) const
Get the Constraints member data.
bool IsSetMatrixName(void) const
name of the underlying score matrix whose frequency ratios were used in PSSM construction (e....
bool IsSetNumRows(void) const
The dimensions of the matrix are returned so the client can verify that all data was received.
Definition: Pssm_.hpp:591
void SetFinalData(TFinalData &value)
Assign a value to FinalData data member.
Definition: Pssm_.cpp:116
TStart GetStart(void) const
Get the Start member data.
Definition: CoreBlock_.hpp:339
bool IsSetQuery(void) const
PSSM representative sequence (master). Check if a value has been assigned to Query data member.
Definition: Pssm_.hpp:760
TGapOpen GetGapOpen(void) const
Get the GapOpen member data.
bool IsSetNumIndeptObsr(void) const
Number of independent observations per position of the PSSM NOTE: this is needed for building CDD dat...
bool IsSetConstraints(void) const
alignment constraints needed by sequence-structure threader and other global or local block-alignment...
bool IsSetGapExtend(void) const
Gap extension penalty corresponding to the matrix above. Check if a value has been assigned to GapExte...
bool IsSetNumColumns(void) const
Number of columns. Check if a value has been assigned to NumColumns data member.
Definition: Pssm_.hpp:638
const TIntermediateData & GetIntermediateData(void) const
Get the IntermediateData member data.
Definition: Pssm_.hpp:793
TByRow GetByRow(void) const
Get the ByRow member data.
Definition: Pssm_.hpp:735
void SetGapOpen(TGapOpen value)
Assign a value to GapOpen data member.
const TParams & GetParams(void) const
Get the Params member data.
bool IsSetBlocks(void) const
nblocks locations. Check if a value has been assigned to Blocks data member.
Definition: CoreDef_.hpp:357
bool IsSetPssm(void) const
This field is applicable to PSI-BLAST and formatrpsdb.
void SetGapExtend(TGapExtend value)
Assign a value to GapExtend data member.
const TPssm & GetPssm(void) const
Get the Pssm member data.
bool IsSetParams(void) const
This field's rpsdbparams is used to specify the values of options for processing by formatrpsdb.
const TRpsdbparams & GetRpsdbparams(void) const
Get the Rpsdbparams member data.
TLambda GetLambda(void) const
Get the Lambda member data.
const TSeq & GetSeq(void) const
Get the variant data.
Definition: Seq_entry_.cpp:102
bool IsSeq(void) const
Check if variant Seq is selected.
Definition: Seq_entry_.hpp:257
const TTitle & GetTitle(void) const
Get the variant data.
Definition: Seqdesc_.hpp:1032
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:290
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
bool CanGetDescr(void) const
Check if it is safe to call GetDescr method.
Definition: Bioseq_.hpp:309
bool IsSetDescr(void) const
Descriptors. Check if a value has been assigned to Descr data member.
Definition: Bioseq_.hpp:303
bool IsTitle(void) const
Check if variant Title is selected.
Definition: Seqdesc_.hpp:1026
const TDescr & GetDescr(void) const
Get the Descr member data.
Definition: Bioseq_.hpp:315
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
if(yy_accept[yy_current_state])
static void s_WriteInt4List(CNcbiOfstream &ostr, const list< Int4 > &l)
static const string kDefaultMatrix(kMatrixBLOSUM62)
static const string kOutDbName("out")
static CRef< CBlast_def_line_set > s_GenerateBlastDefline(const CBioseq &bio)
static const string kMatrixBLOSUM80
static const string kOutDbType("dbtype")
static const string kExcludeInvalid("exclude_invalid")
#define RPS_NUM_LOOKUP_CELLS
USING_SCOPE(blast)
static const string kMatrixPAM250
static const string kMaxSmpFilesPerVol("max_smp_vol")
static const string kMatrixBLOSUM62
static bool s_HasDefline(const CBioseq &bio)
static const Uint4 kFixedPointScaleFactor
static const string kLogFile("logfile")
static const string kDefaultOutIndexFile("true")
static const string kDefaultOutDbType(kOutDbRps)
#define kEpsylon
static const string kDefaultExcludeInvalid("true")
#define RPS_DATABASE_VERSION
static const string kMatrixBLOSUM50
static const string kOutDbRps
static void s_WriteUint4List(CNcbiOfstream &ostr, const list< Uint4 > &l)
static const string kMatrixBLOSUM90
#define kDefaultWordScoreThreshold
#define kDefaultObsrThreshold
static const string kInPssmList("in")
int main(int argc, const char *argv[])
#define kDefaultMaxSmpFilesPerVol
static const string kObsrThreshold("obsr_threshold")
USING_NCBI_SCOPE
static const string kMatrixPAM70
static const string kMatrixBLOSUM45
#define kSingleVol
static const string kOutDbDelta
static bool s_DeleteMakeprofileDb(const string &name)
static const string kMatrixPAM30
static const string kBinaryScoremat("binary")
static const string kOutDbCobalt
static const string kUseCmdlineThreshold("force")
static const string kPssmScaleFactor("scale")
static const string kOutIndexFile("index")
#define kDefaultPssmScaleFactor
static int version
Definition: mdb_load.c:29
const struct ncbi::grid::netcache::search::fields::SIZE size
#define fabs(v)
Definition: ncbi_dispd.c:46
EIPRangeType t
Definition: ncbi_localip.c:101
Prototypes for portable math library (ported from C Toolkit)
long BLAST_Nint(double x)
Nearest integer.
Definition: ncbi_math.c:437
#define TRUE
bool replacement for C indicating true.
Definition: ncbi_std.h:97
#define FALSE
bool replacement for C indicating false.
Definition: ncbi_std.h:101
#define INT2_MIN
smallest (most negative) number represented by signed (two byte) short
Definition: ncbi_std.h:161
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Miscellaneous common-use basic types and functionality.
Defines: CTimeFormat - storage class for time format.
std::istream & in(std::istream &in_, double &x_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
double f(double x_, const double &y_)
Definition: njn_root.hpp:188
static int filenames
Definition: pcregrep.c:172
Defines a concrete strategy to obtain PSSM input data for PSI-BLAST.
C++ API for the PSI-BLAST PSSM engine.
bool DeleteBlastDb(const string &dbpath, CSeqDB::ESeqType seq_type)
Deletes all files associated with a BLAST database.
Definition: seqdb.cpp:1542
EBlastDbVersion
BLAST database version.
Definition: seqdbcommon.hpp:51
@ eBDB_Version4
Definition: seqdbcommon.hpp:52
@ eBDB_Version5
Definition: seqdbcommon.hpp:53
This file defines several SeqDB utility functions related to byte order and file system portability.
CSeqDB_Substring SeqDB_RemoveDirName(CSeqDB_Substring s)
Returns a filename minus greedy path.
Definition: seqdbcommon.cpp:50
#define row(bind, expected)
Definition: string_bind.c:73
structure defining one cell of the compacted lookup table
union AaLookupBackboneCell::@3 payload
union that specifies either entries stored right on the backbone if fewer than AA_HITS_PER_CELL are p...
Int4 entries[3]
if the number of hits for this cell is AA_HITS_PER_CELL or less, the hits are all stored directly in ...
Int4 overflow_cursor
integer offset into the overflow array where the list of hits for this cell begins
Int4 num_used
number of hits stored for this cell
The basic lookup table structure for blastp searches.
void * thick_backbone
may point to BackboneCell, SmallboneCell, or TinyboneCell.
Boolean use_pssm
if TRUE, lookup table construction will assume that the underlying score matrix is position-specific
Int4 threshold
the score threshold for neighboring words
void * overflow
may point to Int4 or Uint2, the overflow array for the compacted lookup table
Int4 backbone_size
number of cells in the backbone
header of RPS blast '.loo' file
Definition: blast_rps.h:49
Int4 magic_number
value should be RPS_MAGIC_NUM
Definition: blast_rps.h:50
Int4 start_of_backbone
byte offset of start of backbone
Definition: blast_rps.h:56
Int4 end_of_overflow
byte offset to end of overflow array
Definition: blast_rps.h:57
Used to hold a set of positions, mostly used for filtering.
Definition: blast_def.h:204
Options needed to construct a lookup table. Also needed: query sequence and query length.
Options required for setting up the query sequence.
structure defining one cell of the RPS lookup table
static string query
Class which defines sequence id to taxid mapping.
#define _ASSERT
Defines BLAST database construction classes.
void CWriteDB_CreateAliasFile(const string &file_name, const string &db_name, CWriteDB::ESeqType seq_type, const string &gi_file_name, const string &title=string(), EAliasFileFilterType alias_type=eGiList)
Writes an alias file that restricts a database with a gi list.
@ eNoAliasFilterType
Sentinel value.
Definition: writedb.hpp:610
Code for database files construction.
Modified on Thu Apr 25 08:17:01 2024 by modify_doxy.py rev. 669887