NCBI C++ ToolKit
makeprofiledb.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: makeprofiledb.cpp 100753 2023-09-07 13:33:06Z madden $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Amelia Fong
27  *
28  */
29 
30 /** @file makeprofiledb.cpp
31  * Command line tool to create RPS,COBALT & DELTA BLAST databases.
32  * This is the successor to formatrpsdb from the C toolkit
33  */
34 
35 #include <ncbi_pch.hpp>
36 #include <corelib/ncbiapp.hpp>
37 #include <corelib/ncbimisc.hpp>
38 #include <corelib/ncbitime.hpp>
39 #include <util/math/matrix.hpp>
40 #include <serial/objistrasn.hpp>
64 #include "../blast/blast_app_util.hpp"
65 
66 #ifndef SKIP_DOXYGEN_PROCESSING
68 USING_SCOPE(blast);
70 #endif /* SKIP_DOXYGEN_PROCESSING */
71 
72 
73 //Input args specific to makeprofiledb
74 static const string kInPssmList("in");
75 static const string kOutDbName("out");
76 static const string kOutDbType("dbtype");
77 static const string kPssmScaleFactor("scale");
78 static const string kOutIndexFile("index");
79 static const string kObsrThreshold("obsr_threshold");
80 static const string kExcludeInvalid("exclude_invalid");
81 static const string kBinaryScoremat("binary");
82 static const string kUseCmdlineThreshold("force");
83 static const string kMaxSmpFilesPerVol("max_smp_vol");
84 
85 static const string kLogFile("logfile");
86 
87 //Supported Output Database Types
88 static const string kOutDbRps = "rps";
89 static const string kOutDbCobalt = "cobalt";
90 static const string kOutDbDelta = "delta";
91 
92 //Supported Matrices
93 static const string kMatrixBLOSUM62 = "BLOSUM62";
94 static const string kMatrixBLOSUM80 = "BLOSUM80";
95 static const string kMatrixBLOSUM50 = "BLOSUM50";
96 static const string kMatrixBLOSUM45 = "BLOSUM45";
97 static const string kMatrixBLOSUM90 = "BLOSUM90";
98 static const string kMatrixPAM250 = "PAM250";
99 static const string kMatrixPAM30 = "PAM30";
100 static const string kMatrixPAM70 = "PAM70";
101 
102 //Default Input Values
103 static const string kDefaultMatrix(kMatrixBLOSUM62);
104 static const string kDefaultOutDbType(kOutDbRps);
105 static const string kDefaultOutIndexFile("true");
106 static const string kDefaultExcludeInvalid("true");
107 #define kDefaultWordScoreThreshold (9.82)
108 #define kDefaultPssmScaleFactor (100.00)
109 #define kDefaultObsrThreshold (6.0)
110 #define kDefaultMaxSmpFilesPerVol (2500)
111 
112 //Fixed-point scale factor for delta blast
113 static const Uint4 kFixedPointScaleFactor = 1000;
114 #define kEpsylon (0.0001)
115 
116 #define DEFAULT_POS_MATRIX_SIZE 2000
117 #define RPS_NUM_LOOKUP_CELLS 32768
118 #if BLASTAA_SIZE == 28
119 #define RPS_DATABASE_VERSION RPS_MAGIC_NUM_28
120 #else
121 #define RPS_DATABASE_VERSION RPS_MAGIC_NUM
122 #endif
123 
124 #define kSingleVol (-1)
125 
127 {
128 public:
131 
132  void Create(int seq_size);
133  void Delete(void);
134 
135  Int4 ** Get(void) { return m_posMatrix;};
136  unsigned int GetSize(void){return m_size;};
137 
138 private:
139 
141  int m_size;
142 };
143 
145 {
146  Delete();
147 
148  m_posMatrix = new Int4* [size];
149 
150  for(int i = 0; i < size; ++ i)
151  {
152  m_posMatrix[i] = new Int4[BLASTAA_SIZE];
153  }
154  m_size = size;
155 
156  return;
157 }
158 
160 {
161  if( NULL == m_posMatrix)
162  return;
163 
164  for(int i = 0; i < m_size; ++ i)
165  {
166  if (m_posMatrix[i] != NULL)
167  delete [] m_posMatrix[i];
168  }
169 
170  delete [] m_posMatrix;
171  m_posMatrix = NULL;
172  return;
173 }
174 
176 {
177 public:
178  /** @inheritDoc */
179  CMakeProfileDBApp(void);
181 private:
182  /** @inheritDoc */
183  virtual void Init();
184  /** @inheritDoc */
185  virtual int Run();
186 
187  enum op_mode
188  {
192  op_invalid
193  };
194 
196  {
197  public:
198  string db_name;
205 
214  string matrix;
216 
217  CRPS_DbInfo(void):
221  { };
223  {
224  if( NULL != query_options) {
226  }
227 
228  if(NULL != lookup) {
230  }
231 
232  if(NULL != lookup_options) {
234  }
235  };
236  };
237 
239  {
242  sm_invalid
243  };
244 
245  enum
246  {
249  eTrue
250  };
251 
253  const string & filename);
254  void x_SetupArgDescriptions(void);
255  void x_InitProgramParameters(void);
256  vector<string> x_GetSMPFilenames(void);
257  void x_InitOutputDb(CRPS_DbInfo & rpsDBInfo);
258  void x_InitRPSDbInfo(CRPS_DbInfo & rpsDBInfo, Int4 vol, Int4 num_files);
259  void x_UpdateRPSDbInfo(CRPS_DbInfo & rpsDbInfo, const CPssmWithParameters & pssm_p);
260  void x_RPSAddFirstSequence(CRPS_DbInfo & rpsDbInfo, CPssmWithParameters & pssm_w_parameters, bool freq_only);
261  void x_RPSUpdateLookup(CRPS_DbInfo & rpsDbInfo, Int4 seq_size);
262  void x_RPSUpdateStatistics(CRPS_DbInfo & rpsDbInfo, CPssmWithParameters & seq, Int4 seq_size);
263  void x_FillInRPSDbParameters(CRPS_DbInfo & rpsDbInfo, CPssmWithParameters & pssm_p);
264  void x_RPSUpdatePSSM(CRPS_DbInfo & rpsDbInfo, const CPssm & pssm, Int4 seq_index, Int4 seq_size);
265  void x_RPS_DbClose(CRPS_DbInfo & rpsDbInfo);
266  void x_UpdateCobalt(CRPS_DbInfo & rpsDbInfo, const CPssmWithParameters & pssm_p, Int4 seq_size);
267  bool x_CheckDelta( const CPssm & pssm, Int4 seq_size, const string & filename);
268  bool x_ValidateCd(const list<double>& freqs, const list<double>& observ, unsigned int alphabet_size);
269  void x_WrapUpDelta(CRPS_DbInfo & rpsDbInfo, CTmpFile & tmp_obsr_file, CTmpFile & tmp_freq_file,
270  list<Int4> & FreqOffsets, list<Int4> & ObsrOffsets, Int4 CurrFreqOffset, Int4 CurrObsrOffset);
271  vector<string> x_CreateDeltaList(void);
272  void x_UpdateFreqRatios(CRPS_DbInfo & rpsDbInfo, const CPssmWithParameters & pssm_p, Int4 seq_index, Int4 seq_size);
273  void x_UpdateDelta(CRPS_DbInfo & rpsDbInfo, vector<string> & smpFilenames);
274  bool x_IsUpdateFreqRatios(const CPssm & p);
275  void x_MakeVol(Int4 vol, vector<string> & smps);
276 
277  int x_Run(void);
278 
279  void x_AddCmdOptions(void);
280  void x_CreateAliasFile(void);
281 
282  // Data
285  string m_Title;
287  string m_OutDbName;
288  string m_OutDbType;
293  string m_Matrix;
298 
302  bool m_Done;
303 
304  //For Delta Blast
307 
310 
311  vector<string> m_VolNames;
314 };
315 
317  : m_LogFile(NULL), m_InPssmList(NULL), m_Title(kEmptyStr),
318  m_WordDefaultScoreThreshold(0), m_OutDbName(kEmptyStr),
319  m_OutDbType(kEmptyStr), m_CreateIndexFile(false),m_GapOpenPenalty(0),
320  m_GapExtPenalty(0), m_PssmScaleFactor(0),m_Matrix(kEmptyStr), m_op_mode(op_invalid),
321  m_binary_scoremat(false), m_MaxSmpFilesPerVol(0), m_NumOfVols(0), m_DbVer(eBDB_Version5),
322  m_Taxids(new CTaxIdSet()), m_UserTaxIds(false), m_Done(false),
323  m_ObsrvThreshold(0), m_ExcludeInvalid(false),
324  m_UpdateFreqRatios(eUndefined), m_UseModelThreshold(true)
325 {
327  version->SetVersionInfo(new CBlastVersion());
329  m_StopWatch.Start();
330  if (m_UsageReport.IsEnabled()) {
332  m_UsageReport.AddParam(CBlastUsageReport::eProgram, (string) "makeprofiledb");
333  }
334 }
335 
337 {
338  // NEED CLEAN UP CODE !!!!
339  if(m_Done == false)
340  {
341  for(unsigned int i =0; i < m_VolNames.size(); i ++)
342  {
343  string rps_str = m_VolNames[i] + ".rps";
344  string lookup_str = m_VolNames[i] + ".loo";
345  string aux_str = m_VolNames[i] + ".aux";
346  string freq_str = m_VolNames[i] + ".freq";
347  CFile(rps_str).Remove();
348  CFile(lookup_str).Remove();
349  CFile(aux_str).Remove();
350  CFile(freq_str).Remove();
351 
352  if(op_cobalt == m_op_mode)
353  {
354  string blocks_str = m_VolNames[i] + ".blocks";
355  CFile(blocks_str).Remove();
356  }
357 
358  if(op_delta == m_op_mode)
359  {
360  string wcounts_str = m_VolNames[i] + ".wcounts";
361  string obsr_str = m_VolNames[i] + ".obsr";
362  CFile(wcounts_str).Remove();
363  CFile(obsr_str).Remove();
364  }
365  }
366  if (m_VolNames.size() > 1) {
367  string pal_str = m_OutDbName + ".pal";
368  CFile(pal_str).Remove();
369  }
370  }
371  else
372  {
373  for(unsigned int i =0; i < m_VolNames.size(); i ++) {
374  string pog_str = m_VolNames[i] + ".pog";
375  CFile(pog_str).Remove();
376  }
377  }
379 }
380 
382 {
384 
385  unique_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
386 
387  // Specify USAGE context
388  arg_desc->SetUsageContext(GetArguments().GetProgramBasename(),
389  "Application to create databases for rpsblast, cobalt and deltablast, version "
390  + CBlastVersion().Print());
391 
392  string dflt("Default = input file name provided to -");
393  dflt += kInPssmList + " argument";
394 
395  arg_desc->SetCurrentGroup("Input options");
396  arg_desc->AddKey(kInPssmList, "in_pssm_list",
397  "Input file that contains a list of smp files (delimited by space, tab or newline)",
399 
400  arg_desc->AddFlag(kBinaryScoremat,
401  "Scoremats are in binary format",
402  true);
403 
404  arg_desc->SetCurrentGroup("Configuration options");
405  arg_desc->AddOptionalKey(kArgDbTitle, "database_title",
406  "Title for database\n" + dflt,
408 
409  arg_desc->AddDefaultKey(kArgWordScoreThreshold, "word_score_threshold",
410  "Minimum word score to add a word to the lookup table",
413  arg_desc->AddFlag(kUseCmdlineThreshold, "Use cmdline threshold", true);
414 
415  arg_desc->SetCurrentGroup("Output options");
416  arg_desc->AddOptionalKey(kOutDbName, "database_name",
417  "Name of database to be created\n" +
419 
420  arg_desc->AddDefaultKey("blastdb_version", "version",
421  "Version of BLAST database to be created",
423  NStr::NumericToString(static_cast<int>(eBDB_Version5)));
424  arg_desc->SetConstraint("blastdb_version",
426 
427  arg_desc->AddDefaultKey(kMaxSmpFilesPerVol, "max_smp_files_per_vol",
428  "Maximum number of SMP files per DB volume",
430 
431  arg_desc->AddDefaultKey(kOutDbType, "output_db_type",
432  "Output database type: cobalt, delta, rps",
434  arg_desc->SetConstraint(kOutDbType, &(*new CArgAllow_Strings, kOutDbRps, kOutDbCobalt , kOutDbDelta ));
435 
436  arg_desc->AddDefaultKey(kOutIndexFile, "create_index_files",
437  "Create Index Files",
439 
440  arg_desc->SetCurrentGroup("Used only if scoremat files do not contain PSSM scores, ignored otherwise.");
441  arg_desc->AddOptionalKey(kArgGapOpen, "gap_open_penalty",
442  "Cost to open a gap",
444 
445  arg_desc->AddOptionalKey(kArgGapExtend, "gap_extend_penalty",
446  "Cost to extend a gap, ",
448 
449  arg_desc->AddDefaultKey(kPssmScaleFactor, "pssm_scale_factor",
450  "Pssm Scale factor ",
453 
454  arg_desc->AddDefaultKey(kArgMatrixName, "matrix_name",
455  "Scoring matrix name",
458  arg_desc->SetConstraint(kArgMatrixName, &(*new CArgAllow_Strings,kMatrixBLOSUM62, kMatrixBLOSUM80,
460 
461  //Delta Blast Options
462  arg_desc->SetCurrentGroup("Delta Blast Options");
463  arg_desc->AddDefaultKey(kObsrThreshold, "observations_threshold", "Exclude domains with "
464  "with maximum number of independent observations "
465  "below this threshold", CArgDescriptions::eDouble,
467 
468  arg_desc->AddDefaultKey(kExcludeInvalid, "exclude_invalid", "Exclude domains that do "
469  "not pass validation test",
471 
472  arg_desc->SetCurrentGroup("Taxonomy options");
473  arg_desc->AddOptionalKey("taxid", "TaxID",
474  "Taxonomy ID to assign to all sequences",
476  arg_desc->SetConstraint("taxid", new CArgAllowValuesGreaterThanOrEqual(0));
477  arg_desc->SetDependency("taxid", CArgDescriptions::eExcludes, "taxid_map");
478 
479  arg_desc->AddOptionalKey("taxid_map", "TaxIDMapFile",
480  "Text file mapping sequence IDs to taxonomy IDs.\n"
481  "Format:<SequenceId> <TaxonomyId><newline>",
483 
484  SetupArgDescriptions(arg_desc.release());
485 }
486 
488 {
489  const CArgs& args = GetArgs();
490 
491  //log_file
492  if (args[kLogFile].HasValue())
493  m_LogFile = &args[kLogFile].AsOutputFile();
494  else
495  m_LogFile = &cout;
496 
497 
498  //in_list
499  if (args[kInPssmList].HasValue())
500  m_InPssmList = &args[kInPssmList].AsInputFile();
501  else
502  NCBI_THROW(CInputException, eInvalidInput, "Please provide an input file with list of smp files");
503 
504  // Binary Scoremat
506 
507  //title
508  if (args[kArgDbTitle].HasValue())
509  m_Title = args[kArgDbTitle].AsString();
510  else
511  m_Title = args[kInPssmList].AsString();
512 
513  //threshold
515 
516  //Out
517  if(args[kOutDbName].HasValue())
518  m_OutDbName = args[kOutDbName].AsString();
519  else
520  m_OutDbName = args[kInPssmList].AsString();
521 
522  //Number of SMP files per db vol
523  m_MaxSmpFilesPerVol = args[kMaxSmpFilesPerVol].AsInteger();
524 
525  //out_db_type
526  m_OutDbType = args[kOutDbType].AsString();
527  if(kOutDbRps == m_OutDbType)
528  m_op_mode = op_rps;
529  else if (kOutDbCobalt == m_OutDbType)
531  else if(kOutDbDelta == m_OutDbType)
533  else
534  NCBI_THROW(CInputException, eInvalidInput, "Invalid Output database type");
535 
536  m_CreateIndexFile = args[kOutIndexFile].AsBoolean();
537 
538  int default_gap_open = 0;
539  int default_gap_extend = 0;
540  //matrix
541  m_Matrix = args[kArgMatrixName].AsString();
542  BLAST_GetProteinGapExistenceExtendParams(m_Matrix.c_str(), &default_gap_open, &default_gap_extend);
543 
544  //gapopen
545  if(args[kArgGapOpen].HasValue())
546  m_GapOpenPenalty = args[kArgGapOpen].AsInteger();
547  else
548  m_GapOpenPenalty = default_gap_open;
549 
550  //gapextend
551  if(args[kArgGapExtend].HasValue())
552  m_GapExtPenalty = args[kArgGapExtend].AsInteger();
553  else
554  m_GapExtPenalty = default_gap_extend;
555 
556  //pssm scale factor
557  m_PssmScaleFactor = args[kPssmScaleFactor].AsDouble();
558 
559  //matrix
560  m_Matrix = args[kArgMatrixName].AsString();
561 
562  //Delta Blast Parameters
563  m_ObsrvThreshold = args[kObsrThreshold].AsDouble();
564  m_ExcludeInvalid = args[kExcludeInvalid].AsBoolean();
565 
566  if (args[kUseCmdlineThreshold]){
567  m_UseModelThreshold = false;
568  }
569  m_DbVer = static_cast<EBlastDbVersion>(args["blastdb_version"].AsInteger());
570 
571  if (args["taxid"].HasValue()) {
572  _ASSERT( !args["taxid_map"].HasValue() );
573  m_Taxids.Reset(new CTaxIdSet(TAX_ID_FROM(int, args["taxid"].AsInteger())));
574  m_UserTaxIds = true;
575  } else if (args["taxid_map"].HasValue()) {
576  _ASSERT( !args["taxid"].HasValue() );
577  _ASSERT( !m_Taxids.Empty() );
578  m_Taxids->SetMappingFromFile(args["taxid_map"].AsInputFile());
579  m_UserTaxIds = true;
580  }
581 }
582 
584 {
585  vector<string> filenames;
586 
587  while(!m_InPssmList->eof())
588  {
589  string line;
590  vector<string> tmp;
593 
594  if(tmp.size() > 0)
595  filenames.insert(filenames.end(), tmp.begin(), tmp.end() );
596  }
597 
598  if( 0 == filenames.size())
599  NCBI_THROW(CInputException, eInvalidInput, "Input file contains no smp filnames");
600 
601  return filenames;
602 }
603 
606  const string & filename)
607 {
609 
610  if(pssm_w_parameters.IsSetPssm())
611  {
612  const CPssm & pssm = pssm_w_parameters.GetPssm();
613 
614  if(!pssm.IsSetQuery() || (0 == pssm.GetQueryLength()))
615  {
616  string err = filename + " contains no bioseq data";
617  NCBI_THROW(CInputException, eInvalidInput, err);
618  }
619 
620  if(!pssm.IsSetNumRows() || !pssm.IsSetNumColumns())
621  {
622  string err = filename + " contains no info on num of columns or num of rows";
623  NCBI_THROW(CInputException, eInvalidInput, err);
624  }
625 
626  if((int) (pssm.GetQueryLength()) != pssm.GetNumColumns())
627  {
628  string err = filename + " 's num of columns does not match size of sequence";
629  NCBI_THROW(CInputException, eInvalidInput, err);
630  }
631 
632  int num_rows = pssm.GetNumRows();
633  if( num_rows <= 0 || num_rows > BLASTAA_SIZE )
634  {
635  string err = filename + " has invalid alphabet size";
636  NCBI_THROW(CInputException, eInvalidInput, err);
637  }
638 
639  // First time around
641  {
643  }
644 
646  {
647  string err = filename + " contains no frequence ratios for building database";
648  NCBI_THROW(CInputException, eInvalidInput, err);
649  }
650 
651  if(op_cobalt == m_op_mode)
652  {
653  if(!pssm_w_parameters.IsSetParams() || !pssm_w_parameters.GetParams().IsSetConstraints() ||
654  ! pssm_w_parameters.GetParams().GetConstraints().IsSetBlocks())
655  {
656  string err = filename + " contains no core block to build cobalt database";
657  NCBI_THROW(CInputException, eInvalidInput, err);
658  }
659  }
660 
661  if(pssm.IsSetFinalData())
662  {
663  sm = sm_valid_has_pssm;
664  }
665  else if(pssm.IsSetIntermediateData())
666  {
668  {
669  sm = sm_valid_freq_only;
670  }
671  }
672 
673  if(sm_invalid == sm)
674  {
675  string err = filename + " contains no pssm or residue frequencies";
676  NCBI_THROW(CInputException, eInvalidInput, err);
677  }
678  }
679  else
680  {
681  string err = filename + " contains no scoremat";
682  NCBI_THROW(CInputException, eInvalidInput, err);
683  }
684 
685  return sm;
686 }
687 
689 {
690  if(op_cobalt == m_op_mode)
691  return eTrue;
692 
694  return eFalse;
695 
696  return eTrue;
697 }
698 
700 {
702  rpsDbInfo.output_db.Reset(new CWriteDB(rpsDbInfo.db_name, CWriteDB::eProtein, m_Title, index_type, m_CreateIndexFile, false, false, m_DbVer));
703  rpsDbInfo.output_db->SetMaxFileSize(4000000000);
704  return;
705 }
706 
707 static bool s_DeleteMakeprofileDb(const string & name )
708 {
709  bool isRemoved = false;
710  static const char * mp_ext[]={".rps", ".loo", ".aux", ".freq", ".blocks", ".wcounts", ".obsr", NULL};
711  for(const char ** mp=mp_ext; *mp != NULL; mp++) {
712  CNcbiOstrstream oss;
713  oss << name << *mp;
714  const string fname = CNcbiOstrstreamToString(oss);
715  if (CFile(fname).Remove()) {
716  LOG_POST(Info << "Deleted " << fname);
717  }
718  else {
719  unsigned int index = 0;
720  string vfname = name + "." + NStr::IntToString(index/10) +
721  NStr::IntToString(index%10) + *mp;
722  while (CFile(vfname).Remove()) {
723  index++;
724  vfname = name + "." + NStr::IntToString(index/10) +
725  NStr::IntToString(index%10) + *mp;
726  }
727  }
728  }
730  isRemoved = true;
731 
732  return isRemoved;
733 }
734 
735 
736 void CMakeProfileDBApp::x_InitRPSDbInfo(CRPS_DbInfo & rpsDbInfo, Int4 vol, Int4 num_files)
737 {
738 
739  rpsDbInfo.num_seqs = num_files;
740  if(vol == kSingleVol) {
741  rpsDbInfo.db_name = m_OutDbName;
742  }
743  else if (vol >= 0) {
745  }
746  else {
747  NCBI_THROW(CBlastException, eCoreBlastError,"Invalid vol number");
748  }
749 
750  string rps_str = rpsDbInfo.db_name + ".rps";
751  rpsDbInfo.pssm_file.open(rps_str.c_str(), IOS_BASE::out|IOS_BASE::binary);
752  if (!rpsDbInfo.pssm_file.is_open())
753  NCBI_THROW(CSeqDBException, eFileErr,"Failed to open output .rps file ");
754 
755  string lookup_str = rpsDbInfo.db_name + ".loo";
756  rpsDbInfo.lookup_file.open(lookup_str.c_str(), IOS_BASE::out|IOS_BASE::binary);
757  if (!rpsDbInfo.lookup_file.is_open())
758  NCBI_THROW(CSeqDBException, eFileErr,"Failed to open output .loo file");
759 
760  string aux_str = rpsDbInfo.db_name + ".aux";
761  rpsDbInfo.aux_file.open(aux_str.c_str());
762  if (!rpsDbInfo.aux_file.is_open())
763  NCBI_THROW(CSeqDBException, eFileErr,"Failed to open output .aux file");
764 
765  string freq_str = rpsDbInfo.db_name + ".freq";
766  rpsDbInfo.freq_file.open(freq_str.c_str(), IOS_BASE::out|IOS_BASE::binary);
767  if (!rpsDbInfo.freq_file.is_open())
768  NCBI_THROW(CSeqDBException, eFileErr,"Failed to open output .freq file");
769 
770  /* Write the magic numbers to the PSSM file */
771 
773  rpsDbInfo.pssm_file.write ((char *)&version , sizeof(Int4));
774  rpsDbInfo.freq_file.write ((char *)&version , sizeof(Int4));
775 
776  /* Fill in space for the sequence offsets. The PSSM
777  data gets written after this list of integers. Also
778  write the number of sequences to the PSSM file */
779 
780  rpsDbInfo.pssm_file.write((char *) &num_files, sizeof(Int4));
781  rpsDbInfo.freq_file.write((char *) &num_files, sizeof(Int4));
782  for (Int4 i = 0; i <= num_files; i++)
783  {
784  rpsDbInfo.pssm_file.write((char *)&i, sizeof(Int4));
785  rpsDbInfo.freq_file.write((char *)&i, sizeof(Int4));
786  }
787 
788  if(op_cobalt == m_op_mode)
789  {
790  string blocks_str = rpsDbInfo.db_name + ".blocks";
791  rpsDbInfo.blocks_file.open(blocks_str.c_str());
792  if (!rpsDbInfo.blocks_file.is_open())
793  NCBI_THROW(CSeqDBException, eFileErr,"Failed to open output .blocks file");
794  }
795 
796 
797  rpsDbInfo.curr_seq_offset = 0;
798  //Init them to input arg values first , may change after reading in the first sequence
799  rpsDbInfo.gap_extend = m_GapExtPenalty;
800  rpsDbInfo.gap_open = m_GapOpenPenalty;
801  rpsDbInfo.matrix = m_Matrix;
802  rpsDbInfo.scale_factor = (Int4) ceil(m_PssmScaleFactor);
803 
804  return;
805  }
806 
807 //For first sequence only
809 {
810  if(pssm_p.IsSetParams())
811  {
812  if(pssm_p.GetParams().IsSetRpsdbparams())
813  {
814  const CFormatRpsDbParameters & rps_db_params = pssm_p.GetParams().GetRpsdbparams();
815  if(rps_db_params.IsSetGapExtend())
816  rpsDbInfo.gap_extend = rps_db_params.GetGapExtend();
817 
818  if(rps_db_params.IsSetGapOpen())
819  rpsDbInfo.gap_open = rps_db_params.GetGapOpen();
820 
821  if(rps_db_params.IsSetMatrixName())
822  rpsDbInfo.matrix = rps_db_params.GetMatrixName();
823  }
824  }
825  return;
826 }
827 
829 {
830  if(!pssm_p.IsSetParams())
831  pssm_p.SetParams();
832 
833  if(!pssm_p.GetParams().IsSetRpsdbparams())
834  pssm_p.SetParams().SetRpsdbparams();
835 
836  CFormatRpsDbParameters & rps_params= pssm_p.SetParams().SetRpsdbparams();
837  if(!rps_params.IsSetGapExtend())
838  rps_params.SetGapExtend(rpsDbInfo.gap_extend);
839  else if(rps_params.GetGapExtend() != rpsDbInfo.gap_extend)
840  NCBI_THROW(CBlastException, eCoreBlastError, "Gap extend penalties do not match");
841 
842  if(!rps_params.IsSetGapOpen())
843  rps_params.SetGapOpen(rpsDbInfo.gap_open);
844  else if(rps_params.GetGapOpen() != rpsDbInfo.gap_open)
845  NCBI_THROW(CBlastException, eCoreBlastError, "Gap open penalties do not match");
846 
847  if(!rps_params.IsSetMatrixName())
848  rps_params.SetMatrixName (rpsDbInfo.matrix);
849  else if(rps_params.GetMatrixName()!= rpsDbInfo.matrix)
850  NCBI_THROW(CBlastException, eCoreBlastError, "Score matrix does not match");
851 
852  return;
853 }
854 
855 /* Update the input scoremat with a new PSSM and modified
856  statistics. Scoremat must contain only residue frequencies.
857  Note that upon completion the new PSSM will always have
858  columns of length BLASTAA_SIZE
859  seq is the sequence and set of score frequencies read in
860  from the next data file
861  seq_size is the number of letters in this sequence
862  alphabet_size refers to the number of PSSM rows
863  ScalingFactor is the multiplier for all PSSM scores
864 */
866 {
867 
868  CPssm & pssm = seq.SetPssm();
869  const CPssmParameters & params = seq.GetParams();
870  string matrix_name = params.GetRpsdbparams().GetMatrixName();
871 
872  /* Read in the sequence residues from the scoremat structure. */
873  CNCBIstdaa query_stdaa;
874  pssm.GetQuerySequenceData(query_stdaa);
875 
876  vector <char> query_v = query_stdaa.Get();
877 
878  if((Int4) (query_v.size()) != seq_size)
879  NCBI_THROW(CBlastException, eCoreBlastError, "Query sequence lengths mismatch");
880 
881  /* allocate query array and PSSM row array */
882  AutoArray<Uint1> query(seq_size);
883 
884  for(unsigned int i = 0; i < query_v.size(); i++)
885  query[i] = query_v[i];
886 
887  unique_ptr<CNcbiMatrix <double> > freq_list (CScorematPssmConverter::GetFreqRatios(seq));
888 
889  CPsiBlastInputFreqRatios pssm_freq_ratio(query.get(), seq_size, *freq_list,
890  matrix_name.c_str(), rpsDbInfo.gap_open,
891  rpsDbInfo.gap_extend, rpsDbInfo.scale_factor);
892  CPssmEngine pssm_engine(&pssm_freq_ratio);
893  CRef<CPssmWithParameters> out_par(pssm_engine.Run());
894 
895  CPssmFinalData & i = pssm.SetFinalData();
896  const CPssmFinalData & o = out_par->GetPssm().GetFinalData();
897  i.SetScores() = o.GetScores();
898  i.SetLambda() = o.GetLambda();
899  i.SetKappa() = o.GetKappa();
900  i.SetH() = o.GetH();
901  i.SetScalingFactor(rpsDbInfo.scale_factor);
902 
903  return;
904 }
905 
906  /* The first sequence in the list determines several
907  parameters that all other sequences in the list must
908  have. In this case, extra initialization is required
909 
910  info contains all the information on data files
911  and parameters from previously added sequences
912  seq is the sequence and PSSM read in from the next data file
913  seq_index refers to the (0-based) position of this sequence
914  in the complete list of sequences
915  seq_size is the number of letters in this sequence
916  alphabet_size refers to the number of PSSM rows
917  */
918  void CMakeProfileDBApp::x_RPSAddFirstSequence(CRPS_DbInfo & rpsDbInfo, CPssmWithParameters & pssm_w_parameters, bool freq_only )
919  {
920  x_UpdateRPSDbInfo(rpsDbInfo, pssm_w_parameters);
921 
922  x_FillInRPSDbParameters(rpsDbInfo, pssm_w_parameters);
923  double wordScoreThreshold = m_WordDefaultScoreThreshold;
924 
925  if(!freq_only)
926  {
927  if(pssm_w_parameters.GetPssm().GetFinalData().IsSetScalingFactor())
928  {
929  rpsDbInfo.scale_factor = pssm_w_parameters.GetPssm().GetFinalData().GetScalingFactor();
930  }
931  else
932  {
933  // asn1 default value is 1
934  rpsDbInfo.scale_factor = 1.0;
935  }
936  if(m_UseModelThreshold && pssm_w_parameters.GetPssm().GetFinalData().IsSetWordScoreThreshold())
937  {
938  wordScoreThreshold = pssm_w_parameters.GetPssm().GetFinalData().GetWordScoreThreshold();
939  }
940  }
941  else
942  {
943  x_RPSUpdateStatistics(rpsDbInfo, pssm_w_parameters, pssm_w_parameters.GetPssm().GetQueryLength());
944  }
945 
946  /* scale up the threshold value and convert to integer */
947  double threshold = rpsDbInfo.scale_factor * wordScoreThreshold;
948 
949  /* create BLAST lookup table */
950  if (LookupTableOptionsNew(eBlastTypeBlastp, &(rpsDbInfo.lookup_options)) != 0)
951  NCBI_THROW(CBlastException, eCoreBlastError, "Cannot create lookup options");
952 
954  FALSE, /* no megablast */
955  threshold, /* neighboring threshold */
956  BLAST_WORDSIZE_PROT ) != 0)
957  NCBI_THROW(CBlastException, eCoreBlastError, "Cannot set lookup table options");
958 
959  if (BlastAaLookupTableNew(rpsDbInfo.lookup_options, &(rpsDbInfo.lookup)) != 0)
960  NCBI_THROW(CBlastException, eCoreBlastError, "Cannot allocate lookup table");
961 
962  rpsDbInfo.lookup->use_pssm = TRUE; /* manually turn on use of PSSMs */
963 
964  /* Perform generic query setup */
965 
966  if (BlastQuerySetUpOptionsNew(&(rpsDbInfo.query_options)) != 0)
967  NCBI_THROW(CBlastException, eCoreBlastError, "Generic query setup failed");
968 
970  NULL, /* no filtering */
971  0 /* strand not applicable */ ) != 0)
972  NCBI_THROW(CBlastException, eCoreBlastError, "Cannot fill query options");
973 
974  /* Write the header of the RPS .aux file */
975  rpsDbInfo.aux_file << rpsDbInfo.matrix << "\n";
976  rpsDbInfo.aux_file << rpsDbInfo.gap_open << "\n";
977  rpsDbInfo.aux_file << rpsDbInfo.gap_extend << "\n";
978  rpsDbInfo.aux_file << scientific << 0.0 << "\n";
979  rpsDbInfo.aux_file << scientific << 0.0 << "\n";
980  rpsDbInfo.aux_file << (int) 0 << "\n";
981  rpsDbInfo.aux_file << (int) 0 << "\n";
982  rpsDbInfo.aux_file << fixed << (double) rpsDbInfo.scale_factor << "\n";
983 
984  return;
985  }
986 
987  void CMakeProfileDBApp::x_UpdateCobalt(CRPS_DbInfo & rpsDbInfo, const CPssmWithParameters & pssm_p, Int4 seq_size)
988  {
989  const CPssm & pssm = pssm_p.GetPssm();
990  // Update .blocks file
991  const list<CRef<CCoreBlock> > & block_list = pssm_p.GetParams().GetConstraints().GetBlocks();
992 
993  list<CRef<CCoreBlock> >::const_iterator itr = block_list.begin();
994 
995  int count =0;
996 
997  while(itr != block_list.end())
998  {
999  const CCoreBlock & block = (**itr);
1000  if(!block.IsSetStart() || !block.IsSetStop())
1001  NCBI_THROW(CInputException, eInvalidInput, "No start Or stop found in conserved block");
1002 
1003  string seq_id_str = "id" + NStr::IntToString(count);
1004  if(pssm.IsSetQuery())
1005  {
1006  if(pssm.GetQuery().IsSeq())
1007  {
1008  if(pssm.GetQuery().GetSeq().IsSetDescr())
1009  {
1010  const list<CRef<CSeqdesc> > descr_list= pssm.GetQuery().GetSeq().GetDescr();
1011  if(descr_list.size() > 0)
1012  {
1013  const CRef<CSeqdesc> descr = descr_list.front();
1014  if(descr->IsTitle())
1015  {
1016  string title = descr->GetTitle();
1017  string accession;
1018  string tmp;
1019  if(NStr::SplitInTwo(title, ",", accession, tmp))
1020  seq_id_str = accession;
1021  }
1022  }
1023  }
1024  }
1025  }
1026 
1027  rpsDbInfo.blocks_file << seq_id_str << "\t";
1028  rpsDbInfo.blocks_file << count << "\t";
1029  rpsDbInfo.blocks_file << block.GetStart() << "\t";
1030  rpsDbInfo.blocks_file << block.GetStop() << "\n";
1031  count++;
1032  ++itr;
1033  }
1034  return;
1035  }
1036 void CMakeProfileDBApp::x_UpdateFreqRatios(CRPS_DbInfo & rpsDbInfo, const CPssmWithParameters & pssm_p, Int4 seq_index, Int4 seq_size)
1037  {
1038  if (!m_UpdateFreqRatios)
1039  return;
1040 
1041  const CPssm & pssm = pssm_p.GetPssm();
1042  // Update .freq file
1043  Int4 i = 0;
1044  Int4 j = 0;
1045  Int4 row[BLASTAA_SIZE];
1046  Int4 alphabet_size = pssm.GetNumRows();
1047 
1048  const list<double> & freq_ratios = pssm.GetIntermediateData().GetFreqRatios();
1049  list<double>::const_iterator itr_fr = freq_ratios.begin();
1050  rpsDbInfo.freq_file.seekp(0, ios_base::end);
1051 
1052  if (pssm.GetByRow() == FALSE) {
1053  for (i = 0; i < seq_size; i++) {
1054  for (j = 0; j < alphabet_size; j++) {
1055  if (itr_fr == freq_ratios.end())
1056  break;
1057  row[j] = (Int4) BLAST_Nint(*itr_fr * FREQ_RATIO_SCALE);
1058  ++itr_fr;
1059  }
1060  for ( ;j < BLASTAA_SIZE; j++) {
1061  row[j] = 0;
1062  }
1063  rpsDbInfo.freq_file.write((const char *)row, sizeof(Int4)*BLASTAA_SIZE);
1064  }
1065  }
1066  else {
1067  unique_ptr<CNcbiMatrix<double> > matrix (CScorematPssmConverter::GetFreqRatios(pssm_p));
1068 
1069  for (i = 0; i < seq_size; i++) {
1070  for (j = 0; j < BLASTAA_SIZE; j++) {
1071  row[j] = (Int4) BLAST_Nint((*matrix)(i,j ) * FREQ_RATIO_SCALE);
1072  }
1073  rpsDbInfo.freq_file.write((const char *)row, sizeof(Int4)*BLASTAA_SIZE);
1074  }
1075  }
1076 
1077  memset(row, 0, sizeof(row));
1078  rpsDbInfo.freq_file.write((const char *)row, sizeof(Int4)*BLASTAA_SIZE);
1079 
1080  rpsDbInfo.freq_file.seekp( 8 + (seq_index) * sizeof(Int4), ios_base::beg);
1081  rpsDbInfo.freq_file.write((const char *) &rpsDbInfo.curr_seq_offset, sizeof(Int4));
1082  return;
1083  }
1084 
1085  /* Incrementally update the BLAST lookup table with
1086  words derived from the present sequence
1087  info contains all the information on data files
1088  and parameters from previously added sequences
1089  seq is the sequence and PSSM read in from the next data file
1090  seq_size is the number of letters in this sequence
1091  */
1093  {
1094  BlastSeqLoc *lookup_segment = NULL;
1095 
1096  /* Tell the blast engine to index the entire input
1097  sequence. Since only the PSSM matters for lookup
1098  table creation, the process does not require
1099  actually extracting the sequence data from 'seq'*/
1100 
1101  BlastSeqLocNew(&lookup_segment, 0, seq_size - 1);
1102 
1103  /* add this sequence to the lookup table. NULL
1104  is passed in place of the query */
1105 
1106  Int4 ** posMatrix = rpsDbInfo.pos_matrix.Get();
1107  if (NULL == posMatrix)
1108  NCBI_THROW(CBlastException, eCoreBlastError, "Empty pos matrix");
1109 
1110  BlastAaLookupIndexQuery(rpsDbInfo.lookup, posMatrix,
1111  NULL, lookup_segment, rpsDbInfo.curr_seq_offset);
1112 
1113  BlastSeqLocFree(lookup_segment);
1114  return;
1115  }
1116 
1117  /* Incrementally update the RPS PSSM file with the
1118  PSSM for the next input sequence
1119  info contains all the information on data files
1120  and parameters from previously added sequences
1121  seq is the sequence and PSSM read in from the next data file
1122  seq_index refers to the (0-based) position of this sequence
1123  in the complete list of seqences
1124  seq_size is the number of letters in this sequence
1125  alphabet_size refers to the number of PSSM rows
1126  */
1127 void CMakeProfileDBApp::x_RPSUpdatePSSM(CRPS_DbInfo & rpsDbInfo, const CPssm & pssm, Int4 seq_index, Int4 seq_size)
1128 {
1129  Int4 i = 0;
1130  Int4 j = 0;
1131 
1132  /* Note that RPS blast requires an extra column at
1133  * the end of the PSSM */
1134 
1135  list<int>::const_iterator score_list_itr = pssm.GetFinalData().GetScores().begin();
1136  list<int>::const_iterator score_list_end = pssm.GetFinalData().GetScores().end();
1137  Int4 alphabet_size = pssm.GetNumRows();
1138 
1139  rpsDbInfo.pos_matrix.Create(seq_size + 1);
1140  Int4 ** posMatrix = rpsDbInfo.pos_matrix.Get();
1141  if (pssm.GetByRow() == FALSE) {
1142  for (i = 0; i < seq_size; i++) {
1143  for (j = 0; j < alphabet_size; j++) {
1144  if (score_list_itr == score_list_end)
1145  break;
1146  posMatrix[i][j] = *score_list_itr;
1147  score_list_itr++;
1148  }
1149  if (j < alphabet_size)
1150  break;
1151  for (; j < BLASTAA_SIZE; j++) {
1152  posMatrix[i][j] = INT2_MIN;
1153  }
1154  }
1155  }
1156  else {
1157  for (j = 0; j < alphabet_size; j++) {
1158  for (i = 0; i < seq_size; i++) {
1159  if (score_list_itr == score_list_end)
1160  break;
1161  posMatrix[i][j] = *score_list_itr;
1162  score_list_itr++;
1163  }
1164  if (i < seq_size)
1165  break;
1166  }
1167  if (j == alphabet_size) {
1168  for (; j < BLASTAA_SIZE; j++) {
1169  for (i = 0; i < seq_size; i++) {
1170  posMatrix[i][j] = INT2_MIN;
1171  }
1172  }
1173  }
1174  }
1175 
1176  if (i < seq_size || j < alphabet_size)
1177  NCBI_THROW(CBlastException, eCoreBlastError, "PSSM was truncated early");
1178 
1179  if(score_list_itr != score_list_end)
1180  NCBI_THROW(CBlastException, eCoreBlastError, "PSSM too large for this sequence");
1181 
1182  /* manually fill in the extra (last) column of the PSSM.
1183  Note that the value to use should more appropriately
1184  be BLAST_SCORE_MIN, but we instead follow the convention
1185  used in copymat */
1186 
1187  for (i = 0; i < BLASTAA_SIZE; i++)
1188  posMatrix[seq_size][i] = -BLAST_SCORE_MAX;
1189 
1190  /* Dump the score matrix, column by column */
1191  rpsDbInfo.pssm_file.seekp(0, ios_base::end);
1192  for (i = 0; i < seq_size + 1; i++) {
1193  rpsDbInfo.pssm_file.write((const char *) posMatrix[i], sizeof(Int4)*BLASTAA_SIZE);
1194  }
1195  /* Write the next context offset. Note that the
1196  RPSProfileHeader structure is one int too large for
1197  our purposes, so that the index of this sequence
1198  must be decremented to get the right byte offset
1199  into the file */
1200 
1201  rpsDbInfo.pssm_file.seekp( 8 + (seq_index) * sizeof(Int4), ios_base::beg);
1202  rpsDbInfo.pssm_file.write((const char *) &rpsDbInfo.curr_seq_offset, sizeof(Int4));
1203 
1204  return;
1205  }
1206 
1207 /* Once all sequences have been processed, perform
1208  final setup on the BLAST lookup table and finish
1209  up the RPS files */
1210 
1212 {
1213  /* Write the last context offset to the PSSM file.
1214  This is the total number of letters for all RPS
1215  DB sequences combined */
1216 
1217  rpsDbInfo.pssm_file.seekp(8 + (rpsDbInfo.num_seqs) * sizeof(Int4), ios::beg);
1218  rpsDbInfo.pssm_file.write((const char *) &rpsDbInfo.curr_seq_offset, sizeof(Int4));
1219  rpsDbInfo.freq_file.seekp(8 + (rpsDbInfo.num_seqs) * sizeof(Int4), ios::beg);
1220  rpsDbInfo.freq_file.write((const char *) &rpsDbInfo.curr_seq_offset, sizeof(Int4));
1221 
1222  /* Pack the lookup table into its compressed form */
1223  if(NULL == rpsDbInfo.lookup)
1224  NCBI_THROW(CBlastException, eCoreBlastError, "Empty database");
1225 
1226  if (BlastAaLookupFinalize(rpsDbInfo.lookup, eBackbone) != 0) {
1227  NCBI_THROW(CBlastException, eCoreBlastError, "Failed to compress lookup table");
1228  }
1229  else {
1230  /* Change the lookup table format to match that
1231  of the legacy BLAST lookup table */
1232 
1233  BlastRPSLookupFileHeader header;
1234  BlastAaLookupTable *lut = rpsDbInfo.lookup;
1235  Int4 i, index;
1236  Int4 cursor, old_cursor;
1237  AaLookupBackboneCell *cell;
1238  RPSBackboneCell empty_cell;
1239 
1240  memset(&header, 0, sizeof(header));
1242 
1243  /* for each lookup table cell */
1244 
1245  for (index = cursor = 0; index < lut->backbone_size; index++) {
1246  cell = (AaLookupBackboneCell*)lut->thick_backbone + index;
1247 
1248 
1249  if (cell->num_used == 0)
1250  continue;
1251 
1252  /* The cell contains hits */
1253 
1254  if (cell->num_used <= RPS_HITS_PER_CELL) {
1255  /* if 3 hits or less, just update each hit offset
1256  to point to the end of the word rather than
1257  the beginning */
1258 
1259  for (i = 0; i < cell->num_used; i++)
1260  cell->payload.entries[i] += BLAST_WORDSIZE_PROT - 1;
1261  }
1262  else {
1263  /* if more than 3 hits, pack the first hit into the
1264  lookup table cell, pack the overflow array byte
1265  offset into the cell, and compress the resulting
1266  'hole' in the overflow array. Update the hit
1267  offsets as well */
1268 
1269  old_cursor = cell->payload.overflow_cursor;
1270  cell->payload.entries[0] = ((Int4*)lut->overflow)[old_cursor] +
1271  BLAST_WORDSIZE_PROT - 1;
1272  cell->payload.entries[1] = cursor * sizeof(Int4);
1273  for (i = 1; i < cell->num_used; i++, cursor++) {
1274  ((Int4*)lut->overflow)[cursor]
1275  = ((Int4*)lut->overflow)[old_cursor + i] +
1276  BLAST_WORDSIZE_PROT - 1;
1277  }
1278  }
1279  }
1280 
1281  header.start_of_backbone = sizeof(header);
1282  header.end_of_overflow = header.start_of_backbone +
1283  (RPS_NUM_LOOKUP_CELLS + 1) * sizeof(RPSBackboneCell) +
1284  cursor * sizeof(Int4);
1285 
1286  /* write the lookup file header */
1287 
1288  rpsDbInfo.lookup_file.write((const char *)&header, sizeof(header));
1289 
1290  /* write the thick backbone */
1291 
1292  rpsDbInfo.lookup_file.write((const char *)lut->thick_backbone,
1293  sizeof(RPSBackboneCell)* lut->backbone_size);
1294 
1295  /* write extra backbone cells */
1296  memset(&empty_cell, 0, sizeof(empty_cell));
1297  for (i = lut->backbone_size; i < RPS_NUM_LOOKUP_CELLS + 1; i++) {
1298  rpsDbInfo.lookup_file.write((const char *)&empty_cell, sizeof(empty_cell));
1299  }
1300 
1301  /* write the new overflow array */
1302  rpsDbInfo.lookup_file.write((const char *)lut->overflow, sizeof(Int4)*cursor);
1303  }
1304 
1305  /* Free data, close files */
1306 
1307  rpsDbInfo.lookup = BlastAaLookupTableDestruct(rpsDbInfo.lookup);
1309  rpsDbInfo.lookup_file.flush();
1310  rpsDbInfo.lookup_file.close();
1311  rpsDbInfo.pssm_file.flush();
1312  rpsDbInfo.pssm_file.close();
1313  rpsDbInfo.aux_file.flush();
1314  rpsDbInfo.aux_file.close();
1315  rpsDbInfo.freq_file.flush();
1316  rpsDbInfo.freq_file.close();
1317 
1318  if(op_cobalt == m_op_mode)
1319  {
1320  rpsDbInfo.blocks_file.flush();
1321  rpsDbInfo.blocks_file.close();
1322  }
1323  else if(!m_UpdateFreqRatios)
1324  {
1325  string freq_str = m_OutDbName + ".freq";
1326  CFile(freq_str).Remove();
1327  }
1328 
1329 }
1330 
1332 {
1335 }
1336 
1337 static bool s_HasDefline(const CBioseq & bio)
1338 {
1339  if (bio.CanGetDescr()) {
1340  return true;
1341  }
1342 
1343  return false;
1344 }
1345 
1347 {
1349  CRef<CBlast_def_line> defline(new CBlast_def_line());
1350  defline->SetSeqid() = bio.GetId();
1351  defline_set->Set().push_back(defline);
1352  return defline_set;
1353 }
1354 
1356 {
1359  *m_LogFile << "Deleted existing BLAST database with identical name." << endl;
1360  }
1361  vector<string> smpFilenames = (op_delta == m_op_mode )? x_CreateDeltaList():x_GetSMPFilenames();
1362  int num_smps = smpFilenames.size();
1363  m_NumOfVols = num_smps/m_MaxSmpFilesPerVol + 1;
1364  int num_seqs = num_smps/m_NumOfVols;
1365  int residue_seqs = num_smps % m_NumOfVols;
1366  if(m_NumOfVols == 1) {
1367  x_MakeVol( -1, smpFilenames);
1368  m_Done = true;
1369  return 0;
1370  }
1371  else {
1372  vector<string>::iterator b = smpFilenames.begin();
1373  vector<string>::iterator r = b + num_seqs;
1374  for(int i=0; i < m_NumOfVols; i++) {
1375  vector<string> vol_smps(b, r);
1376  x_MakeVol(i, vol_smps);
1377  b= r;
1378  r = b + num_seqs;
1379  if(residue_seqs > 0) {
1380  r++;
1381  residue_seqs--;
1382  }
1383  }
1384  _ASSERT(b==smpFilenames.end());
1385  }
1386  if (m_NumOfVols == m_VolNames.size()) {
1388  m_Done = true;
1389  }
1390  return 0;
1391 }
1392 
1393 void CMakeProfileDBApp::x_MakeVol(Int4 vol, vector<string> & smps)
1394 {
1395 
1396  CRPS_DbInfo rpsDbInfo;
1397  x_InitRPSDbInfo(rpsDbInfo, vol, smps.size());
1398  m_VolNames.push_back(rpsDbInfo.db_name);
1399  x_InitOutputDb(rpsDbInfo);
1400 
1401  for(int seq_index=0; seq_index < rpsDbInfo.num_seqs; seq_index++)
1402  {
1403  string filename = smps[seq_index];
1404  CFile f(filename);
1405  if(!f.Exists())
1406  {
1407  string err = filename + " does not exists";
1408  NCBI_THROW(CInputException, eInvalidInput, err);
1409  }
1410 
1411  //Read PssmWithParameters from file
1412  CPssmWithParameters pssm_w_parameters;
1413  if(m_binary_scoremat)
1414  {
1415  CNcbiIfstream in_stream(filename.c_str(), ios::binary);
1416  in_stream >> MSerial_AsnBinary >> pssm_w_parameters;
1417  }
1418  else
1419  {
1420  CNcbiIfstream in_stream(filename.c_str());
1421  in_stream >> MSerial_AsnText >> pssm_w_parameters;
1422  }
1423 
1424  CheckInputScoremat_RV sm = x_CheckInputScoremat(pssm_w_parameters, filename);
1425  // Should have error out already....
1426  if(sm_invalid == sm)
1427  {
1428  string err = filename + " contains invalid scoremat";
1429  NCBI_THROW(CInputException, eInvalidInput, err);
1430  }
1431 
1432  const CPssm & pssm = pssm_w_parameters.GetPssm();
1433  int seq_size = pssm.GetQueryLength();
1434 
1435  const CBioseq & bioseq = pssm.GetQuery().GetSeq();
1436  CRef<CBlast_def_line_set> deflines;
1437  if(s_HasDefline(bioseq)) {
1438  deflines = CWriteDB::ExtractBioseqDeflines(bioseq);
1439  }
1440  else {
1441  deflines = s_GenerateBlastDefline(bioseq);
1442  }
1443 
1444  // set taxids from the PSSM unless -taxid or -taxid_map option was used
1445  if (!m_UserTaxIds) {
1446  if (bioseq.IsSetDescr()) {
1447  for (const auto& it: bioseq.GetDescr().Get()) {
1448  if (it->IsOrg()) {
1449  TTaxId taxid = it->GetOrg().GetTaxId();
1450  const CSeq_id* seqid = bioseq.GetFirstId();
1451  _ASSERT(seqid);
1452  if (seqid) {
1453  m_Taxids->AddTaxId(*seqid, taxid);
1454  }
1455 
1456  break;
1457  }
1458  }
1459  }
1460  }
1461 
1462  m_Taxids->FixTaxId(deflines);
1463  rpsDbInfo.output_db->AddSequence(bioseq);
1464  rpsDbInfo.output_db->SetDeflines(*deflines);
1465 
1466  //Complete RpsDnInfo init with data from first file
1467  if(NULL == rpsDbInfo.lookup)
1468  {
1469  x_RPSAddFirstSequence( rpsDbInfo, pssm_w_parameters, sm == sm_valid_freq_only);
1470  }
1471  else
1472  {
1473  x_FillInRPSDbParameters(rpsDbInfo, pssm_w_parameters);
1474  if(sm_valid_freq_only == sm){
1475  x_RPSUpdateStatistics(rpsDbInfo, pssm_w_parameters, seq_size);
1476  }
1477 
1478  if( pssm.GetFinalData().IsSetScalingFactor())
1479  {
1480  if( pssm.GetFinalData().GetScalingFactor() != rpsDbInfo.scale_factor) {
1481  NCBI_THROW(CBlastException, eCoreBlastError, "Scaling factors do not match");
1482  }
1483  }
1484  else
1485  {
1486  // If scaling factor not specified, the default is 1
1487  if( 1 != rpsDbInfo.scale_factor) {
1488  NCBI_THROW(CBlastException, eCoreBlastError, "Scaling factors do not match");
1489  }
1490  }
1491 
1493  rpsDbInfo.lookup->threshold = rpsDbInfo.scale_factor * pssm_w_parameters.GetPssm().GetFinalData().GetWordScoreThreshold();
1494  }
1495  else {
1496  rpsDbInfo.lookup->threshold = rpsDbInfo.scale_factor * m_WordDefaultScoreThreshold;
1497  }
1498 
1499  }
1500 
1501  x_RPSUpdatePSSM(rpsDbInfo, pssm, seq_index, seq_size);
1502  x_RPSUpdateLookup(rpsDbInfo, seq_size);
1503  x_UpdateFreqRatios(rpsDbInfo, pssm_w_parameters, seq_index, seq_size);
1504 
1505  rpsDbInfo.aux_file << seq_size << "\n";
1506  rpsDbInfo.aux_file << scientific << pssm.GetFinalData().GetKappa() << "\n";
1507  rpsDbInfo.curr_seq_offset +=(seq_size +1);
1508  rpsDbInfo.pos_matrix.Delete();
1509 
1510  if(op_cobalt == m_op_mode) {
1511  x_UpdateCobalt(rpsDbInfo, pssm_w_parameters, seq_size);
1512  }
1513  }
1514 
1515  if(op_delta == m_op_mode) {
1516  x_UpdateDelta(rpsDbInfo, smps);
1517  }
1518  rpsDbInfo.output_db->Close();
1519  x_RPS_DbClose(rpsDbInfo);
1520 }
1521 
1522 static void s_WriteInt4List(CNcbiOfstream & ostr, const list<Int4> & l)
1523 {
1524  ITERATE(list<Int4>, it, l)
1525  {
1526  ostr.write((char*)&(*it), sizeof(Int4));
1527  }
1528 }
1529 
1530 static void s_WriteUint4List(CNcbiOfstream & ostr, const list<Uint4> & l)
1531 {
1532  ITERATE(list<Uint4>, it, l)
1533  {
1534  ostr.write((char*)&(*it), sizeof(Uint4));
1535  }
1536 }
1537 
1539 {
1540  vector<string> smpFilenames = x_GetSMPFilenames();
1541  vector<string> deltaList;
1542 
1543  for(unsigned int seq_index=0; seq_index < smpFilenames.size(); seq_index++)
1544  {
1545  string filename = smpFilenames[seq_index];
1546  CFile f(filename);
1547  if(!f.Exists())
1548  {
1549  string err = filename + " does not exists";
1550  NCBI_THROW(CInputException, eInvalidInput, err);
1551  }
1552 
1553  //Read PssmWithParameters from file
1554  CPssmWithParameters pssm_w_parameters;
1555  if(m_binary_scoremat)
1556  {
1557  CNcbiIfstream in_stream(filename.c_str(), ios::binary);
1558  in_stream >> MSerial_AsnBinary >> pssm_w_parameters;
1559  }
1560  else
1561  {
1562  CNcbiIfstream in_stream(filename.c_str());
1563  in_stream >> MSerial_AsnText >> pssm_w_parameters;
1564  }
1565 
1566  CheckInputScoremat_RV sm = x_CheckInputScoremat(pssm_w_parameters, filename);
1567  // Should have error out already....
1568  if(sm_invalid == sm)
1569  {
1570  string err = filename + " contains invalid scoremat";
1571  NCBI_THROW(CInputException, eInvalidInput, err);
1572  }
1573 
1574  const CPssm & pssm = pssm_w_parameters.GetPssm();
1575  int seq_size = pssm.GetQueryLength();
1577  {
1578  string err = filename + " contains no weighted residue frequencies for building delta database";
1579  NCBI_THROW(CInputException, eInvalidInput, err);
1580  }
1581 
1583  {
1584  string err = filename + " contains no observations information for building delta database";
1585  NCBI_THROW(CInputException, eInvalidInput, err);
1586  }
1587 
1588  if (true == x_CheckDelta(pssm, seq_size, filename))
1589  {
1590  deltaList.push_back(filename);
1591  }
1592  }
1593 
1594  return deltaList;
1595 }
1596 
1597 void CMakeProfileDBApp::x_UpdateDelta(CRPS_DbInfo & rpsDbInfo, vector<string> & smpFilenames)
1598 {
1599  CTmpFile tmp_obsr_file(CTmpFile::eRemove);
1600  CTmpFile tmp_freq_file(CTmpFile::eRemove);
1601  CNcbiOfstream tmp_obsr_buff(tmp_obsr_file.GetFileName().c_str(), IOS_BASE::out | IOS_BASE::binary);
1602  CNcbiOfstream tmp_freq_buff(tmp_freq_file.GetFileName().c_str(), IOS_BASE::out | IOS_BASE::binary);
1603 
1604  list<Int4> FreqOffsets;
1605  list<Int4> ObsrOffsets;
1606  Int4 CurrFreqOffset = 0;
1607  Int4 CurrObsrOffset= 0;
1608 
1609  for(unsigned int seq_index=0; seq_index < smpFilenames.size(); seq_index++)
1610  {
1611  string filename = smpFilenames[seq_index];
1612  //Read PssmWithParameters from file
1613  CPssmWithParameters pssm_w_parameters;
1614  if(m_binary_scoremat)
1615  {
1616  CNcbiIfstream in_stream(filename.c_str(), ios::binary);
1617  in_stream >> MSerial_AsnBinary >> pssm_w_parameters;
1618  }
1619  else
1620  {
1621  CNcbiIfstream in_stream(filename.c_str());
1622  in_stream >> MSerial_AsnText >> pssm_w_parameters;
1623  }
1624 
1625  const CPssm & pssm = pssm_w_parameters.GetPssm();
1626  int seq_size = pssm.GetQueryLength();
1627 
1628  // get weightd residue frequencies
1629  const list<double>& orig_freqs = pssm.GetIntermediateData().GetWeightedResFreqsPerPos();
1630 
1631  // get number of independent observations
1632  const list<double>& obsr = pssm.GetIntermediateData().GetNumIndeptObsr();
1633 
1634  int alphabet_size = pssm.GetNumRows();
1635  list<double> modify_freqs;
1636 
1637  if(pssm.GetByRow())
1638  {
1639  // need to flip the freq matrix
1640  vector<double> tmp(orig_freqs.size());
1641  list<double>::const_iterator f_itr = orig_freqs.begin();
1642 
1643  for(int i = 0; i < alphabet_size; i++)
1644  {
1645  for(int j = 0; j < seq_size; j++)
1646  {
1647  tmp[i + j*alphabet_size] = *f_itr;
1648  ++f_itr;
1649  }
1650  }
1651  copy(tmp.begin(), tmp.end(), modify_freqs.begin());
1652  }
1653 
1654  // Pad matrix if necessary
1655  if(alphabet_size < BLASTAA_SIZE)
1656  {
1657  if(0 == modify_freqs.size())
1658  copy(orig_freqs.begin(), orig_freqs.end(), modify_freqs.begin());
1659 
1660  list<double>::iterator p_itr = modify_freqs.begin();
1661 
1662  for (int j=0; j < seq_size; j++)
1663  {
1664  for(int i=0; i < alphabet_size; i++)
1665  {
1666  if(modify_freqs.end() == p_itr)
1667  break;
1668 
1669  ++p_itr;
1670  }
1671 
1672  modify_freqs.insert(p_itr, (BLASTAA_SIZE-alphabet_size), 0);
1673  }
1674  }
1675 
1676  const list<double> & freqs = (modify_freqs.size()? modify_freqs:orig_freqs );
1677 
1678  //save offset for this record
1679  ObsrOffsets.push_back(CurrObsrOffset);
1680 
1681  list<Uint4> ObsrBuff;
1682  // write effective observations in compressed form
1683  // as a list of pairs: value, number of occurences
1684  unsigned int num_obsr_columns = 0;
1685  list<double>::const_iterator obsr_it = obsr.begin();
1686  do
1687  {
1688  double current = *obsr_it;
1689  Uint4 num = 1;
1690  num_obsr_columns++;
1691  obsr_it++;
1692  while (obsr_it != obsr.end() && fabs(*obsr_it - current) < 1e-4)
1693  {
1694  obsr_it++;
1695  num++;
1696  num_obsr_columns++;
1697  }
1698 
1699  // +1 because pssm engine returns alpha (in psi-blast papers)
1700  // which is number of independent observations - 1
1701  ObsrBuff.push_back((Uint4)((current + 1.0) * kFixedPointScaleFactor));
1702  ObsrBuff.push_back(num);
1703  }
1704  while (obsr_it != obsr.end());
1705 
1706  Uint4 num_weighted_counts = 0;
1707 
1708  // save offset for this frequencies record
1709  FreqOffsets.push_back(CurrFreqOffset / BLASTAA_SIZE);
1710 
1711  list<Uint4> FreqBuff;
1712  // save weighted residue frequencies
1713  ITERATE (list<double>, it, freqs)
1714  {
1715  FreqBuff.push_back((Uint4)(*it * kFixedPointScaleFactor));
1716  num_weighted_counts++;
1717  }
1718 
1719  if (num_obsr_columns != num_weighted_counts / BLASTAA_SIZE)
1720  {
1721  string err = "Number of frequencies and observations columns do not match in " + filename;
1722  NCBI_THROW(CException, eInvalid, err);
1723  }
1724 
1725  // additional column of zeros is added for compatibility with rps database
1726  unsigned int padded_size = FreqBuff.size() + BLASTAA_SIZE;
1727  FreqBuff.resize(padded_size, 0);
1728 
1729  CurrFreqOffset += FreqBuff.size();
1730  CurrObsrOffset += ObsrBuff.size();
1731  s_WriteUint4List(tmp_freq_buff, FreqBuff);
1732  s_WriteUint4List(tmp_obsr_buff, ObsrBuff);
1733 
1734  }
1735 
1736  tmp_obsr_buff.flush();
1737  tmp_freq_buff.flush();
1738  x_WrapUpDelta(rpsDbInfo, tmp_obsr_file, tmp_freq_file, FreqOffsets, ObsrOffsets, CurrFreqOffset, CurrObsrOffset);
1739 }
1740 
1741 
1742 bool CMakeProfileDBApp::x_ValidateCd(const list<double>& freqs,
1743  const list<double>& observ,
1744  unsigned int alphabet_size)
1745 {
1746 
1747  if (freqs.size() / alphabet_size != observ.size())
1748  {
1749  string err = "Number of frequency and observations columns do not match";
1750  NCBI_THROW(CException, eInvalid, err);
1751  }
1752 
1753  ITERATE (list<double>, it, freqs)
1754  {
1755  unsigned int residue = 0;
1756  double sum = 0.0;
1757  while (residue < alphabet_size - 1)
1758  {
1759  sum += *it;
1760  it++;
1761  residue++;
1762  }
1763  sum += *it;
1764 
1765  if (fabs(sum - 1.0) > kEpsylon)
1766  return false;
1767  }
1768 
1769  ITERATE (list<double>, it, observ)
1770  {
1771  if (*it < 1.0)
1772  return false;
1773  }
1774 
1775  return true;
1776 }
1777 
1778 
1779 bool CMakeProfileDBApp::x_CheckDelta( const CPssm & pssm, Int4 seq_size, const string & filename)
1780 {
1781  // get weightd residue frequencies
1782  const list<double>& orig_freqs = pssm.GetIntermediateData().GetWeightedResFreqsPerPos();
1783 
1784  // get number of independent observations
1785  const list<double>& obsr = pssm.GetIntermediateData().GetNumIndeptObsr();
1786 
1787  int alphabet_size = pssm.GetNumRows();
1788  list<double> modify_freqs;
1789 
1790  if(pssm.GetByRow())
1791  {
1792  // need to flip the freq matrix
1793  vector<double> tmp(orig_freqs.size());
1794  list<double>::const_iterator f_itr = orig_freqs.begin();
1795 
1796  for(int i = 0; i < alphabet_size; i++)
1797  {
1798  for(int j = 0; j < seq_size; j++)
1799  {
1800  tmp[i + j*alphabet_size] = *f_itr;
1801  ++f_itr;
1802  }
1803  }
1804  copy(tmp.begin(), tmp.end(), modify_freqs.begin());
1805  }
1806 
1807  // Pad matrix if necessary
1808  if(alphabet_size < BLASTAA_SIZE)
1809  {
1810  if(0 == modify_freqs.size())
1811  copy(orig_freqs.begin(), orig_freqs.end(), modify_freqs.begin());
1812 
1813  list<double>::iterator p_itr = modify_freqs.begin();
1814 
1815  for (int j=0; j < seq_size; j++)
1816  {
1817  for(int i=0; i < alphabet_size; i++)
1818  {
1819  if(modify_freqs.end() == p_itr)
1820  break;
1821 
1822  ++p_itr;
1823  }
1824 
1825  modify_freqs.insert(p_itr, (BLASTAA_SIZE-alphabet_size), 0);
1826  }
1827  }
1828 
1829  const list<double> & freqs = (modify_freqs.size()? modify_freqs:orig_freqs );
1830  double max_obsr = *max_element(obsr.begin(), obsr.end()) + 1.0;
1831  if(max_obsr < m_ObsrvThreshold)
1832  {
1833  *m_LogFile << filename +
1834  " was excluded: due to too few independent observations\n";
1835  return false;
1836  }
1837 
1838  if( !x_ValidateCd(freqs, obsr, BLASTAA_SIZE) && m_ExcludeInvalid)
1839  {
1840  *m_LogFile << filename +
1841  " was excluded: it conatins an invalid CD \n";
1842  return false;
1843  }
1844  return true;
1845 }
1846 
1847 
1848 
1849 void CMakeProfileDBApp::x_WrapUpDelta(CRPS_DbInfo & rpsDbInfo, CTmpFile & tmp_obsr_file, CTmpFile & tmp_freq_file,
1850  list<Int4> & FreqOffsets, list<Int4> & ObsrOffsets, Int4 CurrFreqOffset, Int4 CurrObsrOffset)
1851 {
1852  FreqOffsets.push_back(CurrFreqOffset / BLASTAA_SIZE);
1853  ObsrOffsets.push_back(CurrObsrOffset);
1854 
1855  string wcounts_str = rpsDbInfo.db_name + ".wcounts";
1856  CNcbiOfstream wcounts_file(wcounts_str.c_str(), ios::out | ios::binary);
1857  if (!wcounts_file.is_open())
1858  NCBI_THROW(CSeqDBException, eFileErr,"Failed to open output .wcounts file");
1859 
1860  string obsr_str = rpsDbInfo.db_name + ".obsr";
1861  CNcbiOfstream obsr_file(obsr_str.c_str(), IOS_BASE::out|IOS_BASE::binary);
1862  if (!obsr_file.is_open())
1863  NCBI_THROW(CSeqDBException, eFileErr,"Failed to open output .obsr file");
1864 
1865  CNcbiIfstream tmp_obsr_buff (tmp_obsr_file.GetFileName().c_str(), IOS_BASE::in | IOS_BASE::binary);
1866  CNcbiIfstream tmp_freq_buff (tmp_freq_file.GetFileName().c_str(), IOS_BASE::in | IOS_BASE::binary);
1867 
1868  // write RPS BLAST database magic number
1869  Int4 magic_number = RPS_MAGIC_NUM_28;
1870  wcounts_file.write((char*)&magic_number, sizeof(Int4));
1871  obsr_file.write((char*)&magic_number, sizeof(Int4));
1872 
1873  // write number of recrods
1874  Int4 num_wcounts_records = FreqOffsets.size() -1;
1875  Int4 num_obsr_records = ObsrOffsets.size() -1;
1876  wcounts_file.write((char*)&num_wcounts_records, sizeof(Int4));
1877  obsr_file.write((char*)&num_obsr_records, sizeof(Int4));
1878 
1879  s_WriteInt4List(wcounts_file, FreqOffsets);
1880  wcounts_file.flush();
1881  wcounts_file << tmp_freq_buff.rdbuf();
1882  wcounts_file.flush();
1883  wcounts_file.close();
1884 
1885  s_WriteInt4List(obsr_file, ObsrOffsets);
1886  obsr_file.flush();
1887  obsr_file << tmp_obsr_buff.rdbuf();
1888  obsr_file.flush();
1889  obsr_file.close();
1890 }
1891 
1893 {
1894  vector<string> v;
1895  for(unsigned int i=0; i < m_VolNames.size(); i++) {
1896  string t = kEmptyStr;
1898  s.GetString(t);
1899  v.push_back(t);
1900  }
1903 }
1904 
1906 {
1907  int status = 0;
1908  try { x_Run(); }
1909  catch(const blast::CInputException& e) {
1910  ERR_POST(Error << "INPUT ERROR: " << e.GetMsg());
1911  status = BLAST_INPUT_ERROR;
1912  }
1913  catch (const CSeqDBException& e) {
1914  ERR_POST(Error << "ERROR: " << e.GetMsg());
1915  status = BLAST_DATABASE_ERROR;
1916  }
1917  catch (const blast::CBlastException& e) {
1918  ERR_POST(Error << "ERROR: " << e.GetMsg());
1919  status = BLAST_INPUT_ERROR;
1920  }
1921  catch (const CException& e) {
1922  ERR_POST(Error << "ERROR: " << e.GetMsg());
1923  status = BLAST_UNKNOWN_ERROR;
1924  }
1925  catch (...) {
1926  ERR_POST(Error << "Error: Unknown exception");
1927  status = BLAST_UNKNOWN_ERROR;
1928  }
1929 
1930  x_AddCmdOptions();
1932  return status;
1933 }
1934 
1936 {
1937  const CArgs & args = GetArgs();
1938  if (args["dbtype"].HasValue()) {
1939  m_UsageReport.AddParam(CBlastUsageReport::eDBType, args["dbtype"].AsString());
1940  }
1941  if(args["taxid"].HasValue() || args["taxid_map"].HasValue()) {
1943  }
1944 }
1945 
1946 
1947 #ifndef SKIP_DOXYGEN_PROCESSING
1948 int main(int argc, const char* argv[] /*, const char* envp[]*/)
1949 {
1950  return CMakeProfileDBApp().AppMain(argc, argv);
1951 }
1952 
1953 
1954 
1955 
1956 #endif /* SKIP_DOXYGEN_PROCESSING */
1957 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
Declares singleton objects to store the version and reference for the BLAST engine.
Routines for creating protein BLAST lookup tables.
@ eBackbone
BlastAaLookupTable * BlastAaLookupTableDestruct(BlastAaLookupTable *lookup)
Free the lookup table.
void BlastAaLookupIndexQuery(BlastAaLookupTable *lookup, Int4 **matrix, BLAST_SequenceBlk *query, BlastSeqLoc *unmasked_regions, Int4 query_bias)
Index a protein query.
struct RPSBackboneCell RPSBackboneCell
structure defining one cell of the RPS lookup table
#define RPS_HITS_PER_CELL
maximum number of hits in an RPS backbone cell; this may be redundant (have the same value as AA_HITS...
Int4 BlastAaLookupFinalize(BlastAaLookupTable *lookup, EBoneType bone_type)
Pack the data structures comprising a protein lookup table into their final form.
Int4 BlastAaLookupTableNew(const LookupTableOptions *opt, BlastAaLookupTable **lut)
Create a new protein lookup table.
#define BLAST_INPUT_ERROR
Command line binary exit code: error in input query/options.
#define BLAST_UNKNOWN_ERROR
Command line binary exit code: unknown error.
#define BLAST_DATABASE_ERROR
Command line binary exit code: error in database/subject.
BlastSeqLoc * BlastSeqLocFree(BlastSeqLoc *loc)
Deallocate all BlastSeqLoc objects in a chain.
Definition: blast_filter.c:737
BlastSeqLoc * BlastSeqLocNew(BlastSeqLoc **head, Int4 from, Int4 to)
Create and initialize a new sequence interval.
Definition: blast_filter.c:608
Interface for converting sources of sequence data into blast sequence input.
The structures and functions in blast_options.
Int2 BLAST_FillQuerySetUpOptions(QuerySetUpOptions *options, EBlastProgramType program, const char *filter_string, Uint1 strand_option)
Fill non-default contents of the QuerySetUpOptions.
Int2 BlastQuerySetUpOptionsNew(QuerySetUpOptions **options)
Allocate memory for QuerySetUpOptions and fill with default values.
Int2 BLAST_FillLookupTableOptions(LookupTableOptions *options, EBlastProgramType program, Boolean is_megablast, double threshold, Int4 word_size)
Allocate memory for lookup table options and fill with default values.
Int2 LookupTableOptionsNew(EBlastProgramType program, LookupTableOptions **options)
Allocate memory for lookup table options and fill with default values.
#define BLAST_WORDSIZE_PROT
length of word to trigger an extension.
Definition: blast_options.h:66
LookupTableOptions * LookupTableOptionsFree(LookupTableOptions *options)
Deallocates memory for LookupTableOptions*.
QuerySetUpOptions * BlastQuerySetUpOptionsFree(QuerySetUpOptions *options)
Deallocate memory for QuerySetUpOptions.
@ eBlastTypePsiBlast
Definition: blast_program.h:82
@ eBlastTypeBlastp
Definition: blast_program.h:73
#define FREQ_RATIO_SCALE
header for RPS blast frequency ratios ('.freq') file
Definition: blast_rps.h:83
#define RPS_MAGIC_NUM_28
Version number for 28-letter alphabet.
Definition: blast_rps.h:44
Int2 BLAST_GetProteinGapExistenceExtendParams(const char *matrixName, Int4 *gap_existence, Int4 *gap_extension)
Extract the recommended gap existence and extension values.
Definition: blast_stat.c:3373
#define BLAST_SCORE_MAX
maximum allowed score (for one letter comparison).
Definition: blast_stat.h:122
#define true
Definition: bool.h:35
#define false
Definition: bool.h:36
Code to build a database given various sources of sequence data.
AutoArray –.
Definition: ncbimisc.hpp:527
Class to constrain the values of an argument to those greater than or equal to the value specified in...
CArgAllow_Integers –.
Definition: ncbiargs.hpp:1751
CArgAllow_Strings –.
Definition: ncbiargs.hpp:1641
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgs –.
Definition: ncbiargs.hpp:379
const CSeq_id * GetFirstId() const
Definition: Bioseq.cpp:271
Defines BLAST error codes (user errors included)
void AddParam(EUsageParams p, int val)
Keeps track of the version of the BLAST engine in the NCBI C++ toolkit.
Definition: version.hpp:53
static void CreateDirectories(const string &dbname)
Create Directory for blast db.
Definition: build_db.cpp:1051
CCoreBlock –.
Definition: CoreBlock.hpp:66
CFile –.
Definition: ncbifile.hpp:1604
Defines user input exceptions.
void Create(int seq_size)
Int4 ** Get(void)
unsigned int GetSize(void)
QuerySetUpOptions * query_options
LookupTableOptions * lookup_options
CMakeProfileDBApp(void)
@inheritDoc
void x_AddCmdOptions(void)
virtual void Init()
@inheritDoc
CNcbiOstream * m_LogFile
CheckInputScoremat_RV x_CheckInputScoremat(const CPssmWithParameters &pssm_w_parameters, const string &filename)
CRef< CTaxIdSet > m_Taxids
CNcbiIstream * m_InPssmList
EBlastDbVersion m_DbVer
void x_RPSUpdateLookup(CRPS_DbInfo &rpsDbInfo, Int4 seq_size)
vector< string > x_CreateDeltaList(void)
void x_WrapUpDelta(CRPS_DbInfo &rpsDbInfo, CTmpFile &tmp_obsr_file, CTmpFile &tmp_freq_file, list< Int4 > &FreqOffsets, list< Int4 > &ObsrOffsets, Int4 CurrFreqOffset, Int4 CurrObsrOffset)
void x_RPSUpdateStatistics(CRPS_DbInfo &rpsDbInfo, CPssmWithParameters &seq, Int4 seq_size)
virtual int Run()
@inheritDoc
void x_CreateAliasFile(void)
void x_FillInRPSDbParameters(CRPS_DbInfo &rpsDbInfo, CPssmWithParameters &pssm_p)
void x_InitProgramParameters(void)
void x_InitRPSDbInfo(CRPS_DbInfo &rpsDBInfo, Int4 vol, Int4 num_files)
void x_RPS_DbClose(CRPS_DbInfo &rpsDbInfo)
bool x_CheckDelta(const CPssm &pssm, Int4 seq_size, const string &filename)
void x_RPSAddFirstSequence(CRPS_DbInfo &rpsDbInfo, CPssmWithParameters &pssm_w_parameters, bool freq_only)
void x_UpdateRPSDbInfo(CRPS_DbInfo &rpsDbInfo, const CPssmWithParameters &pssm_p)
void x_UpdateDelta(CRPS_DbInfo &rpsDbInfo, vector< string > &smpFilenames)
double m_WordDefaultScoreThreshold
void x_RPSUpdatePSSM(CRPS_DbInfo &rpsDbInfo, const CPssm &pssm, Int4 seq_index, Int4 seq_size)
void x_InitOutputDb(CRPS_DbInfo &rpsDBInfo)
void x_SetupArgDescriptions(void)
CBlastUsageReport m_UsageReport
vector< string > m_VolNames
CStopWatch m_StopWatch
bool x_ValidateCd(const list< double > &freqs, const list< double > &observ, unsigned int alphabet_size)
void x_UpdateFreqRatios(CRPS_DbInfo &rpsDbInfo, const CPssmWithParameters &pssm_p, Int4 seq_index, Int4 seq_size)
bool x_IsUpdateFreqRatios(const CPssm &p)
vector< string > x_GetSMPFilenames(void)
void x_UpdateCobalt(CRPS_DbInfo &rpsDbInfo, const CPssmWithParameters &pssm_p, Int4 seq_size)
void x_MakeVol(Int4 vol, vector< string > &smps)
CNCBIstdaa –.
Definition: NCBIstdaa.hpp:66
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string. Sample usage:
Definition: ncbistre.hpp:802
Implements the interface to retrieve data for the last 2 stages of the PSSM creation.
Computes a PSSM as specified in PSI-BLAST.
CPssmFinalData –.
CPssmParameters –.
Definition: Pssm.hpp:55
void GetQuerySequenceData(CNCBIstdaa &sequence) const
Retrieve the query sequence data in ncbistdaa format.
Definition: Pssm.cpp:77
SIZE_TYPE GetQueryLength() const
Return the query length or 0 if no query is available.
Definition: Pssm.cpp:62
CSeqDBException.
Definition: seqdbcommon.hpp:73
String slicing.
void GetString(string &s) const
Return the data by assigning it to a string.
@ eProtein
Definition: seqdb.hpp:174
CStopWatch –.
Definition: ncbitime.hpp:1938
void FixTaxId(CRef< objects::CBlast_def_line_set > deflines)
Check that each defline has the specified taxid; if not, replace the defline and set the taxid.
Definition: taxid_set.cpp:131
void AddTaxId(const objects::CSeq_id &seqid, const TTaxId &taxid)
Definition: taxid_set.cpp:77
void SetMappingFromFile(CNcbiIstream &f)
Definition: taxid_set.cpp:45
CTmpFile –.
Definition: ncbifile.hpp:2352
static string MakeShortName(const string &base, int index)
Construct the short name for a volume.
CWriteDB.
Definition: writedb.hpp:92
void SetMaxFileSize(Uint8 sz)
Set maximum size for output files.
Definition: writedb.cpp:118
@ eProtein
Protein database.
Definition: writedb.hpp:97
void AddSequence(const CBioseq &bs)
Add a sequence as a CBioseq.
Definition: writedb.cpp:79
EIndexType
Whether and what kind of indices to build.
Definition: writedb.hpp:104
@ eDefault
Like eFullIndex but also build a numeric Trace ID index.
Definition: writedb.hpp:121
@ eNoIndex
Build a database without any indices.
Definition: writedb.hpp:106
static CRef< CBlast_def_line_set > ExtractBioseqDeflines(const CBioseq &bs, bool parse_ids=true, bool long_ids=false, bool scan_bioseq_4_cfastareader_usrobj=false)
Extract Deflines From Bioseq.
Definition: writedb.cpp:129
void SetDeflines(const CBlast_def_line_set &deflines)
Set the deflines to be used for the sequence.
Definition: writedb.cpp:94
void Close()
Close the Database.
Definition: writedb.cpp:104
Constant declarations for command line arguments for BLAST programs.
const string kArgMatrixName
Argument for scoring matrix.
const string kArgDbTitle
Title for the BLAST database.
const string kArgGapExtend
Argument to select the gap extending penalty.
const string kArgGapOpen
Argument to select the gap opening penalty.
const string kArgWordScoreThreshold
Argument to specify the minimum word score such that the word is added to the lookup table.
void Print(const CCompactSAMApplication::AlignInfo &ai)
std::ofstream out("events_result.xml")
main entry point for tests
CRef< objects::CPssmWithParameters > Run()
Runs the PSSM engine to compute the PSSM.
#define BLASTAA_SIZE
Size of aminoacid alphabet.
static CNcbiMatrix< double > * GetFreqRatios(const objects::CPssmWithParameters &pssm)
Returns matrix of BLASTAA_SIZE by query size (dimensions are opposite of what is stored in the BlastS...
void SetFullVersion(CRef< CVersionAPI > version)
Set version data for the program.
Definition: ncbiapp.cpp:1154
void HideStdArgs(THideStdArgs hide_mask)
Set the hide mask for the Hide Std Flags.
Definition: ncbiapp.cpp:1292
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:285
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
Definition: ncbiapp.cpp:799
CVersionInfo GetVersion(void) const
Get the program version information.
Definition: ncbiapp.cpp:1164
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
Definition: ncbiapp.cpp:1175
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
#define TAX_ID_FROM(T, value)
Definition: ncbimisc.hpp:1111
@ fHideXmlHelp
Hide XML help description.
@ fHideFullVersion
Hide full version description.
@ fHideDryRun
Hide dryrun description.
@ fHideConffile
Hide configuration file description.
@ eExcludes
One argument excludes another.
Definition: ncbiargs.hpp:957
@ eInputFile
Name of file (must exist and be readable)
Definition: ncbiargs.hpp:595
@ eBoolean
{'true', 't', 'false', 'f'}, case-insensitive
Definition: ncbiargs.hpp:590
@ eDouble
Convertible into a floating point number (double)
Definition: ncbiargs.hpp:594
@ eString
An arbitrary string.
Definition: ncbiargs.hpp:589
@ eInteger
Convertible into an integer number (int or Int8)
Definition: ncbiargs.hpp:592
#define NULL
Definition: ncbistd.hpp:225
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
void Info(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1185
virtual bool Remove(TRemoveFlags flags=eRecursive) const
Remove a directory entry.
Definition: ncbifile.cpp:2595
const string & GetFileName(void) const
Return used file name (generated or given in the constructor).
Definition: ncbifile.cpp:5429
@ eRemove
Remove file.
Definition: ncbifile.hpp:2356
#define MSerial_AsnBinary
Definition: serialbase.hpp:697
const TPrim & Get(void) const
Definition: serialbase.hpp:347
#define MSerial_AsnText
I/O stream manipulators –.
Definition: serialbase.hpp:696
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
bool IsEnabled(void)
Indicates whether application usage statistics collection is enabled for a current reporter instance.
CNcbiIstream & NcbiGetlineEOL(CNcbiIstream &is, string &str, string::size_type *count=NULL)
Read from "is" to "str" the next line (taking into account platform specifics of End-of-Line)
IO_PREFIX::ofstream CNcbiOfstream
Portable alias for ofstream.
Definition: ncbistre.hpp:500
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
static string DoubleToString(double value, int precision=-1, TNumToStringFlags flags=0)
Convert double to string.
Definition: ncbistr.hpp:5186
#define kEmptyStr
Definition: ncbistr.hpp:123
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3457
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5083
static bool SplitInTwo(const CTempString str, const CTempString delim, string &str1, string &str2, TSplitFlags flags=0)
Split a string into two pieces using the specified delimiters.
Definition: ncbistr.cpp:3550
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
@ fSplit_Truncate
Definition: ncbistr.hpp:2501
@ fSplit_MergeDelimiters
Merge adjacent delimiters.
Definition: ncbistr.hpp:2498
double Elapsed(void) const
Return time elapsed since first Start() or last Restart() call (in seconds).
Definition: ncbitime.hpp:2776
void Start(void)
Start the timer.
Definition: ncbitime.hpp:2765
#define CVersion
Tdata & Set(void)
Assign a value to data member.
const TFreqRatios & GetFreqRatios(void) const
Get the FreqRatios member data.
const TNumIndeptObsr & GetNumIndeptObsr(void) const
Get the NumIndeptObsr member data.
const TMatrixName & GetMatrixName(void) const
Get the MatrixName member data.
const TQuery & GetQuery(void) const
Get the Query member data.
Definition: Pssm_.hpp:772
TNumRows GetNumRows(void) const
Get the NumRows member data.
Definition: Pssm_.hpp:610
void SetParams(TParams &value)
Assign a value to Params data member.
bool IsSetFinalData(void) const
Final representation for the PSSM. Check if a value has been assigned to FinalData data member.
Definition: Pssm_.hpp:802
bool IsSetStop(void) const
end of block on query. Check if a value has been assigned to Stop data member.
Definition: CoreBlock_.hpp:367
TH GetH(void) const
Get the H member data.
TKappa GetKappa(void) const
Get the Kappa member data.
const TScores & GetScores(void) const
Get the Scores member data.
const TWeightedResFreqsPerPos & GetWeightedResFreqsPerPos(void) const
Get the WeightedResFreqsPerPos member data.
bool IsSetGapOpen(void) const
gap opening penalty corresponding to the matrix above Check if a value has been assigned to GapOpen d...
TGapExtend GetGapExtend(void) const
Get the GapExtend member data.
TWordScoreThreshold GetWordScoreThreshold(void) const
Get the WordScoreThreshold member data.
TScalingFactor GetScalingFactor(void) const
Get the ScalingFactor member data.
const TBlocks & GetBlocks(void) const
Get the Blocks member data.
Definition: CoreDef_.hpp:369
bool IsSetStart(void) const
begin of block on query. Check if a value has been assigned to Start data member.
Definition: CoreBlock_.hpp:320
bool IsSetWordScoreThreshold(void) const
Word score threshold. Check if a value has been assigned to WordScoreThreshold data member.
bool IsSetScalingFactor(void) const
scaling factor used to obtain more precision when building the PSSM.
bool IsSetFreqRatios(void) const
PSSM's frequency ratios. Check if a value has been assigned to FreqRatios data member.
TStop GetStop(void) const
Get the Stop member data.
Definition: CoreBlock_.hpp:386
void SetMatrixName(const TMatrixName &value)
Assign a value to MatrixName data member.
bool IsSetIntermediateData(void) const
both intermediateData and finalData can be provided, but at least one of them must be provided.
Definition: Pssm_.hpp:781
const TFinalData & GetFinalData(void) const
Get the FinalData member data.
Definition: Pssm_.hpp:814
bool IsSetWeightedResFreqsPerPos(void) const
Weighted observed residue frequencies per position of the PSSM.
bool IsSetRpsdbparams(void) const
data needed by formatrpsdb to create RPS-BLAST databases.
void SetPssm(TPssm &value)
Assign a value to Pssm data member.
TNumColumns GetNumColumns(void) const
Get the NumColumns member data.
Definition: Pssm_.hpp:657
const TConstraints & GetConstraints(void) const
Get the Constraints member data.
bool IsSetMatrixName(void) const
name of the underlying score matrix whose frequency ratios were used in PSSM construction (e....
bool IsSetNumRows(void) const
The dimensions of the matrix are returned so the client can verify that all data was received.
Definition: Pssm_.hpp:591
void SetFinalData(TFinalData &value)
Assign a value to FinalData data member.
Definition: Pssm_.cpp:116
TStart GetStart(void) const
Get the Start member data.
Definition: CoreBlock_.hpp:339
bool IsSetQuery(void) const
PSSM representative sequence (master). Check if a value has been assigned to Query data member.
Definition: Pssm_.hpp:760
TGapOpen GetGapOpen(void) const
Get the GapOpen member data.
bool IsSetNumIndeptObsr(void) const
Number of independent observations per position of the PSSM NOTE: this is needed for building CDD dat...
bool IsSetConstraints(void) const
alignment constraints needed by sequence-structure threader and other global or local block-alignment...
bool IsSetGapExtend(void) const
gap extension penalty corresponding to the matrix above Check if a value has been assigned to GapExte...
bool IsSetNumColumns(void) const
number of columns. Check if a value has been assigned to NumColumns data member.
Definition: Pssm_.hpp:638
const TIntermediateData & GetIntermediateData(void) const
Get the IntermediateData member data.
Definition: Pssm_.hpp:793
TByRow GetByRow(void) const
Get the ByRow member data.
Definition: Pssm_.hpp:735
void SetGapOpen(TGapOpen value)
Assign a value to GapOpen data member.
const TParams & GetParams(void) const
Get the Params member data.
bool IsSetBlocks(void) const
nblocks locations. Check if a value has been assigned to Blocks data member.
Definition: CoreDef_.hpp:357
bool IsSetPssm(void) const
This field is applicable to PSI-BLAST and formatrpsdb.
void SetGapExtend(TGapExtend value)
Assign a value to GapExtend data member.
const TPssm & GetPssm(void) const
Get the Pssm member data.
bool IsSetParams(void) const
This field's rpsdbparams is used to specify the values of options for processing by formatrpsdb.
const TRpsdbparams & GetRpsdbparams(void) const
Get the Rpsdbparams member data.
TLambda GetLambda(void) const
Get the Lambda member data.
const TSeq & GetSeq(void) const
Get the variant data.
Definition: Seq_entry_.cpp:102
bool IsSeq(void) const
Check if variant Seq is selected.
Definition: Seq_entry_.hpp:257
const TTitle & GetTitle(void) const
Get the variant data.
Definition: Seqdesc_.hpp:1032
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:290
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
bool CanGetDescr(void) const
Check if it is safe to call GetDescr method.
Definition: Bioseq_.hpp:309
bool IsSetDescr(void) const
descriptors Check if a value has been assigned to Descr data member.
Definition: Bioseq_.hpp:303
bool IsTitle(void) const
Check if variant Title is selected.
Definition: Seqdesc_.hpp:1026
const TDescr & GetDescr(void) const
Get the Descr member data.
Definition: Bioseq_.hpp:315
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
if(yy_accept[yy_current_state])
static void s_WriteInt4List(CNcbiOfstream &ostr, const list< Int4 > &l)
static const string kDefaultMatrix(kMatrixBLOSUM62)
static const string kOutDbName("out")
static CRef< CBlast_def_line_set > s_GenerateBlastDefline(const CBioseq &bio)
static const string kMatrixBLOSUM80
static const string kOutDbType("dbtype")
static const string kExcludeInvalid("exclude_invalid")
#define RPS_NUM_LOOKUP_CELLS
USING_SCOPE(blast)
static const string kMatrixPAM250
static const string kMaxSmpFilesPerVol("max_smp_vol")
static const string kMatrixBLOSUM62
static bool s_HasDefline(const CBioseq &bio)
static const Uint4 kFixedPointScaleFactor
static const string kLogFile("logfile")
static const string kDefaultOutIndexFile("true")
static const string kDefaultOutDbType(kOutDbRps)
#define kEpsylon
static const string kDefaultExcludeInvalid("true")
#define RPS_DATABASE_VERSION
static const string kMatrixBLOSUM50
static const string kOutDbRps
static void s_WriteUint4List(CNcbiOfstream &ostr, const list< Uint4 > &l)
static const string kMatrixBLOSUM90
#define kDefaultWordScoreThreshold
#define kDefaultObsrThreshold
static const string kInPssmList("in")
int main(int argc, const char *argv[])
#define kDefaultMaxSmpFilesPerVol
static const string kObsrThreshold("obsr_threshold")
USING_NCBI_SCOPE
static const string kMatrixPAM70
static const string kMatrixBLOSUM45
#define kSingleVol
static const string kOutDbDelta
static bool s_DeleteMakeprofileDb(const string &name)
static const string kMatrixPAM30
static const string kBinaryScoremat("binary")
static const string kOutDbCobalt
static const string kUseCmdlineThreshold("force")
static const string kPssmScaleFactor("scale")
static const string kOutIndexFile("index")
#define kDefaultPssmScaleFactor
static int version
Definition: mdb_load.c:29
const struct ncbi::grid::netcache::search::fields::SIZE size
#define fabs(v)
Definition: ncbi_dispd.c:46
EIPRangeType t
Definition: ncbi_localip.c:101
Prototypes for portable math library (ported from C Toolkit)
long BLAST_Nint(double x)
Nearest integer.
Definition: ncbi_math.c:437
#define INT2_MIN
smallest (most negative) number represented by signed (two byte) short
Definition: ncbi_std.h:161
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Miscellaneous common-use basic types and functionality.
Defines: CTimeFormat - storage class for time format.
std::istream & in(std::istream &in_, double &x_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
double f(double x_, const double &y_)
Definition: njn_root.hpp:188
static char tmp[2048]
Definition: utf8.c:42
static int filenames
Definition: pcregrep.c:172
Defines a concrete strategy to obtain PSSM input data for PSI-BLAST.
C++ API for the PSI-BLAST PSSM engine.
bool DeleteBlastDb(const string &dbpath, CSeqDB::ESeqType seq_type)
Deletes all files associated with a BLAST database.
Definition: seqdb.cpp:1510
EBlastDbVersion
BLAST database version.
Definition: seqdbcommon.hpp:51
@ eBDB_Version4
Definition: seqdbcommon.hpp:52
@ eBDB_Version5
Definition: seqdbcommon.hpp:53
This file defines several SeqDB utility functions related to byte order and file system portability.
CSeqDB_Substring SeqDB_RemoveDirName(CSeqDB_Substring s)
Returns a filename minus greedy path.
Definition: seqdbcommon.cpp:50
structure defining one cell of the compacted lookup table
union AaLookupBackboneCell::@3 payload
union that specifies either entries stored right on the backbone if fewer than AA_HITS_PER_CELL are p...
Int4 entries[3]
if the number of hits for this cell is AA_HITS_PER_CELL or less, the hits are all stored directly in ...
Int4 overflow_cursor
integer offset into the overflow array where the list of hits for this cell begins
Int4 num_used
number of hits stored for this cell
The basic lookup table structure for blastp searches.
void * thick_backbone
may point to BackboneCell, SmallboneCell, or TinyboneCell.
Boolean use_pssm
if TRUE, lookup table construction will assume that the underlying score matrix is position- specific
Int4 threshold
the score threshold for neighboring words
void * overflow
may point to Int4 or Uint2, the overflow array for the compacted lookup table
Int4 backbone_size
number of cells in the backbone
header of RPS blast '.loo' file
Definition: blast_rps.h:49
Int4 magic_number
value should be RPS_MAGIC_NUM
Definition: blast_rps.h:50
Int4 start_of_backbone
byte offset of start of backbone
Definition: blast_rps.h:56
Int4 end_of_overflow
byte offset to end of overflow array
Definition: blast_rps.h:57
Used to hold a set of positions, mostly used for filtering.
Definition: blast_def.h:204
Options needed to construct a lookup table Also needed: query sequence and query length.
Options required for setting up the query sequence.
structure defining one cell of the RPS lookup table
static string query
Class which defines sequence id to taxid mapping.
#define _ASSERT
@ FALSE
Definition: testodbc.c:27
@ TRUE
Definition: testodbc.c:27
Defines BLAST database construction classes.
void CWriteDB_CreateAliasFile(const string &file_name, const string &db_name, CWriteDB::ESeqType seq_type, const string &gi_file_name, const string &title=string(), EAliasFileFilterType alias_type=eGiList)
Writes an alias file that restricts a database with a gi list.
@ eNoAliasFilterType
Sentinel value.
Definition: writedb.hpp:610
Code for database files construction.
Modified on Tue Nov 28 02:18:36 2023 by modify_doxy.py rev. 669887