NCBI C++ ToolKit
makeprofiledb.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: makeprofiledb.cpp 102656 2024-06-21 14:24:39Z fongah2 $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Amelia Fong
27  *
28  */
29 
30 /** @file makeprofiledb.cpp
31  * Command line tool to create RPS,COBALT & DELTA BLAST databases.
32  * This is the successor to formatrpsdb from the C toolkit
33  */
34 
35 #include <ncbi_pch.hpp>
36 #include <corelib/ncbiapp.hpp>
37 #include <corelib/ncbimisc.hpp>
38 #include <corelib/ncbitime.hpp>
39 #include <util/math/matrix.hpp>
40 #include <serial/objistrasn.hpp>
64 #include "../blast/blast_app_util.hpp"
65 
66 #ifndef SKIP_DOXYGEN_PROCESSING
68 USING_SCOPE(blast);
70 #endif /* SKIP_DOXYGEN_PROCESSING */
71 
72 
73 //Input args specific to makeprofiledb
// Each constant is the name of a command-line argument; the same string is
// used when the argument is registered and when its value is read from CArgs.
74 static const string kInPssmList("in");
75 static const string kOutDbName("out");
76 static const string kOutDbType("dbtype");
77 static const string kPssmScaleFactor("scale");
78 static const string kOutIndexFile("index");
79 static const string kObsrThreshold("obsr_threshold");
80 static const string kExcludeInvalid("exclude_invalid");
81 static const string kBinaryScoremat("binary");
82 static const string kUseCmdlineThreshold("force");
83 static const string kMaxSmpFilesPerVol("max_smp_vol");
84 
// Name of the log-file argument; when it has no value the log goes to cout.
85 static const string kLogFile("logfile");
86 
87 //Supported Output Database Types
// These are the only values accepted for the "dbtype" argument.
88 static const string kOutDbRps = "rps";
89 static const string kOutDbCobalt = "cobalt";
90 static const string kOutDbDelta = "delta";
91 
92 //Supported Matrices
93 static const string kMatrixBLOSUM62 = "BLOSUM62";
94 static const string kMatrixBLOSUM80 = "BLOSUM80";
95 static const string kMatrixBLOSUM50 = "BLOSUM50";
96 static const string kMatrixBLOSUM45 = "BLOSUM45";
97 static const string kMatrixBLOSUM90 = "BLOSUM90";
98 static const string kMatrixPAM250 = "PAM250";
99 static const string kMatrixPAM30 = "PAM30";
100 static const string kMatrixPAM70 = "PAM70";
101 
102 //Default Input Values
// NOTE(review): the lines that pass these defaults to AddDefaultKey() are
// missing from this extract; the pairing with arguments is by name only.
103 static const string kDefaultMatrix(kMatrixBLOSUM62);
104 static const string kDefaultOutDbType(kOutDbRps);
105 static const string kDefaultOutIndexFile("true");
106 static const string kDefaultExcludeInvalid("true");
107 #define kDefaultWordScoreThreshold (9.82)
108 #define kDefaultPssmScaleFactor (100.00)
109 #define kDefaultObsrThreshold (6.0)
110 #define kDefaultMaxSmpFilesPerVol (2500)
111 
112 //Fixed-point scale factor for delta blast
113 static const Uint4 kFixedPointScaleFactor = 1000;
114 #define kEpsylon (0.0001)
115 
116 #define DEFAULT_POS_MATRIX_SIZE 2000
117 #define RPS_NUM_LOOKUP_CELLS 32768
// Select the RPS magic number that matches the compiled-in alphabet size.
118 #if BLASTAA_SIZE == 28
119 #define RPS_DATABASE_VERSION RPS_MAGIC_NUM_28
120 #else
121 #define RPS_DATABASE_VERSION RPS_MAGIC_NUM
122 #endif
123 
// Volume number meaning "no volume suffix": x_InitRPSDbInfo() uses the plain
// output database name when it receives this value.
124 #define kSingleVol (-1)
125 
127 {
128 public:
131 
132  void Create(int seq_size);
133  void Delete(void);
134 
135  Int4 ** Get(void) { return m_posMatrix;};
136  unsigned int GetSize(void){return m_size;};
137 
138 private:
139 
141  int m_size;
142 };
143 
// Allocate a fresh position matrix with `size` rows of BLASTAA_SIZE Int4
// cells each, releasing any matrix that was held before.
// NOTE(review): the signature line is missing from this extract; the class
// declares `void Create(int seq_size)` while the body refers to `size`.
// The cells are allocated with plain new[] and are not zero-initialized here.
145 {
    // Drop the previous allocation (no-op when nothing is held).
146  Delete();
147 
148  m_posMatrix = new Int4* [size];
149 
150  for(int i = 0; i < size; ++ i)
151  {
152  m_posMatrix[i] = new Int4[BLASTAA_SIZE];
153  }
    // Remember the row count; Delete() uses it to free the rows.
154  m_size = size;
155 
156  return;
157 }
158 
// Free every row of the position matrix and then the row-pointer array.
// Safe to call when no matrix is held. Only m_posMatrix is reset to NULL;
// m_size keeps its previous value.
// NOTE(review): the signature line is missing from this extract.
160 {
161  if( NULL == m_posMatrix)
162  return;
163 
164  for(int i = 0; i < m_size; ++ i)
165  {
166  if (m_posMatrix[i] != NULL)
167  delete [] m_posMatrix[i];
168  }
169 
170  delete [] m_posMatrix;
171  m_posMatrix = NULL;
172  return;
173 }
174 
176 {
177 public:
178  /** @inheritDoc */
179  CMakeProfileDBApp(void);
181 private:
182  /** @inheritDoc */
183  virtual void Init();
184  /** @inheritDoc */
185  virtual int Run();
186 
187  enum op_mode
188  {
192  op_invalid
193  };
194 
196  {
197  public:
198  string db_name;
205 
214  string matrix;
216 
217  CRPS_DbInfo(void):
221  { };
223  {
224  if( NULL != query_options) {
226  }
227 
228  if(NULL != lookup) {
230  }
231 
232  if(NULL != lookup_options) {
234  }
235  };
236  };
237 
239  {
242  sm_invalid
243  };
244 
245  enum
246  {
249  eTrue
250  };
251 
253  const string & filename);
254  void x_SetupArgDescriptions(void);
255  void x_InitProgramParameters(void);
256  vector<string> x_GetSMPFilenames(void);
257  void x_InitOutputDb(CRPS_DbInfo & rpsDBInfo);
258  void x_InitRPSDbInfo(CRPS_DbInfo & rpsDBInfo, Int4 vol, Int4 num_files);
259  void x_UpdateRPSDbInfo(CRPS_DbInfo & rpsDbInfo, const CPssmWithParameters & pssm_p);
260  void x_RPSAddFirstSequence(CRPS_DbInfo & rpsDbInfo, CPssmWithParameters & pssm_w_parameters, bool freq_only);
261  void x_RPSUpdateLookup(CRPS_DbInfo & rpsDbInfo, Int4 seq_size);
262  void x_RPSUpdateStatistics(CRPS_DbInfo & rpsDbInfo, CPssmWithParameters & seq, Int4 seq_size);
263  void x_FillInRPSDbParameters(CRPS_DbInfo & rpsDbInfo, CPssmWithParameters & pssm_p);
264  void x_RPSUpdatePSSM(CRPS_DbInfo & rpsDbInfo, const CPssm & pssm, Int4 seq_index, Int4 seq_size);
265  void x_RPS_DbClose(CRPS_DbInfo & rpsDbInfo);
266  void x_UpdateCobalt(CRPS_DbInfo & rpsDbInfo, const CPssmWithParameters & pssm_p, Int4 seq_size);
267  bool x_CheckDelta( const CPssm & pssm, Int4 seq_size, const string & filename);
268  bool x_ValidateCd(const list<double>& freqs, const list<double>& observ, unsigned int alphabet_size);
269  void x_WrapUpDelta(CRPS_DbInfo & rpsDbInfo, CTmpFile & tmp_obsr_file, CTmpFile & tmp_freq_file,
270  list<Int4> & FreqOffsets, list<Int4> & ObsrOffsets, Int4 CurrFreqOffset, Int4 CurrObsrOffset);
271  vector<string> x_CreateDeltaList(void);
272  void x_UpdateFreqRatios(CRPS_DbInfo & rpsDbInfo, const CPssmWithParameters & pssm_p, Int4 seq_index, Int4 seq_size);
273  void x_UpdateDelta(CRPS_DbInfo & rpsDbInfo, vector<string> & smpFilenames);
274  bool x_IsUpdateFreqRatios(const CPssm & p);
275  void x_MakeVol(Int4 vol, vector<string> & smps);
276 
277  int x_Run(void);
278 
279  void x_AddCmdOptions(void);
280  void x_CreateAliasFile(void);
281 
282  // Data
285  string m_Title;
287  string m_OutDbName;
288  string m_OutDbType;
293  string m_Matrix;
298 
302  bool m_Done;
303 
304  //For Delta Blast
307 
310 
311  vector<string> m_VolNames;
314 };
315 
// Constructor: puts every member into a neutral "not configured yet" state
// (null streams, empty strings, zero penalties, op_invalid mode); the real
// values are read from the command line in x_InitProgramParameters().
// m_UseModelThreshold starts as true and m_DbVer as eBDB_Version5.
// NOTE(review): the signature line and three body lines (326, 328, 331 in the
// embedded numbering) are missing from this extract, including the
// declaration of `version`.
317  : m_LogFile(NULL), m_InPssmList(NULL), m_Title(kEmptyStr),
318  m_WordDefaultScoreThreshold(0), m_OutDbName(kEmptyStr),
319  m_OutDbType(kEmptyStr), m_CreateIndexFile(false),m_GapOpenPenalty(0),
320  m_GapExtPenalty(0), m_PssmScaleFactor(0),m_Matrix(kEmptyStr), m_op_mode(op_invalid),
321  m_binary_scoremat(false), m_MaxSmpFilesPerVol(0), m_NumOfVols(0), m_DbVer(eBDB_Version5),
322  m_Taxids(new CTaxIdSet()), m_UserTaxIds(false), m_Done(false),
323  m_ObsrvThreshold(0), m_ExcludeInvalid(false),
324  m_UpdateFreqRatios(eUndefined), m_UseModelThreshold(true)
325 {
327  version->SetVersionInfo(new CBlastVersion());
    // Start timing the run.
329  m_StopWatch.Start();
    // Record the program name only when usage reporting is enabled.
330  if (m_UsageReport.IsEnabled()) {
332  m_UsageReport.AddParam(CBlastUsageReport::eProgram, (string) "makeprofiledb");
333  }
334 }
335 
// Cleanup of output files, keyed on m_Done:
//  - m_Done == false: remove the per-volume .rps/.loo/.aux/.freq files
//    (plus .blocks in cobalt mode, .wcounts/.obsr in delta mode) and, when
//    more than one volume was recorded, the .pal file of the output database.
//  - m_Done == true: remove only the per-volume .pog files.
// NOTE(review): the signature line is missing from this extract; this looks
// like the destructor of CMakeProfileDBApp. One line (378) before the closing
// brace is also missing.
337 {
338  // NEED CLEAN UP CODE !!!!
339  if(m_Done == false)
340  {
341  for(unsigned int i =0; i < m_VolNames.size(); i ++)
342  {
343  string rps_str = m_VolNames[i] + ".rps";
344  string lookup_str = m_VolNames[i] + ".loo";
345  string aux_str = m_VolNames[i] + ".aux";
346  string freq_str = m_VolNames[i] + ".freq";
347  CFile(rps_str).Remove();
348  CFile(lookup_str).Remove();
349  CFile(aux_str).Remove();
350  CFile(freq_str).Remove();
351 
352  if(op_cobalt == m_op_mode)
353  {
354  string blocks_str = m_VolNames[i] + ".blocks";
355  CFile(blocks_str).Remove();
356  }
357 
358  if(op_delta == m_op_mode)
359  {
360  string wcounts_str = m_VolNames[i] + ".wcounts";
361  string obsr_str = m_VolNames[i] + ".obsr";
362  CFile(wcounts_str).Remove();
363  CFile(obsr_str).Remove();
364  }
365  }
    // The .pal file is removed only when more than one volume was recorded.
366  if (m_VolNames.size() > 1) {
367  string pal_str = m_OutDbName + ".pal";
368  CFile(pal_str).Remove();
369  }
370  }
371  else
372  {
373  for(unsigned int i =0; i < m_VolNames.size(); i ++) {
374  string pog_str = m_VolNames[i] + ".pog";
375  CFile(pog_str).Remove();
376  }
377  }
379 }
380 
382 {
384 
385  unique_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
386 
387  // Specify USAGE context
388  arg_desc->SetUsageContext(GetArguments().GetProgramBasename(),
389  "Application to create databases for rpsblast, cobalt and deltablast, version "
390  + CBlastVersion().Print());
391 
392  string dflt("Default = input file name provided to -");
393  dflt += kInPssmList + " argument";
394 
395  arg_desc->SetCurrentGroup("Input options");
396  arg_desc->AddKey(kInPssmList, "in_pssm_list",
397  "Input file that contains a list of smp files (delimited by space, tab or newline)",
399 
400  arg_desc->AddFlag(kBinaryScoremat,
401  "Scoremats are in binary format",
402  true);
403 
404  arg_desc->SetCurrentGroup("Configuration options");
405  arg_desc->AddOptionalKey(kArgDbTitle, "database_title",
406  "Title for database\n" + dflt,
408 
409  arg_desc->AddDefaultKey(kArgWordScoreThreshold, "word_score_threshold",
410  "Minimum word score to add a word to the lookup table",
413  arg_desc->AddFlag(kUseCmdlineThreshold, "Use cmdline threshold", true);
414 
415  arg_desc->SetCurrentGroup("Output options");
416  arg_desc->AddOptionalKey(kOutDbName, "database_name",
417  "Name of database to be created\n" +
419 
420  arg_desc->AddDefaultKey("blastdb_version", "version",
421  "Version of BLAST database to be created",
423  NStr::NumericToString(static_cast<int>(eBDB_Version5)));
424  arg_desc->SetConstraint("blastdb_version",
426 
427  arg_desc->AddDefaultKey(kMaxSmpFilesPerVol, "max_smp_files_per_vol",
428  "Maximum number of SMP files per DB volume",
430 
431  arg_desc->AddDefaultKey(kOutDbType, "output_db_type",
432  "Output database type: cobalt, delta, rps",
434  arg_desc->SetConstraint(kOutDbType, &(*new CArgAllow_Strings, kOutDbRps, kOutDbCobalt , kOutDbDelta ));
435 
436  arg_desc->AddDefaultKey(kOutIndexFile, "create_index_files",
437  "Create Index Files",
439 
440  arg_desc->SetCurrentGroup("Used only if scoremat files do not contain PSSM scores, ignored otherwise.");
441  arg_desc->AddOptionalKey(kArgGapOpen, "gap_open_penalty",
442  "Cost to open a gap",
444 
445  arg_desc->AddOptionalKey(kArgGapExtend, "gap_extend_penalty",
446  "Cost to extend a gap, ",
448 
449  arg_desc->AddDefaultKey(kPssmScaleFactor, "pssm_scale_factor",
450  "Pssm Scale factor ",
453 
454  arg_desc->AddDefaultKey(kArgMatrixName, "matrix_name",
455  "Scoring matrix name",
458  arg_desc->SetConstraint(kArgMatrixName, &(*new CArgAllow_Strings,kMatrixBLOSUM62, kMatrixBLOSUM80,
460 
461  //Delta Blast Options
462  arg_desc->SetCurrentGroup("Delta Blast Options");
463  arg_desc->AddDefaultKey(kObsrThreshold, "observations_threshold", "Exclude domains with "
464  "with maximum number of independent observations "
465  "below this threshold", CArgDescriptions::eDouble,
467 
468  arg_desc->AddDefaultKey(kExcludeInvalid, "exclude_invalid", "Exclude domains that do "
469  "not pass validation test",
471 
472  arg_desc->SetCurrentGroup("Taxonomy options");
473  arg_desc->AddOptionalKey("taxid", "TaxID",
474  "Taxonomy ID to assign to all sequences",
476  arg_desc->SetConstraint("taxid", new CArgAllowValuesGreaterThanOrEqual(0));
477  arg_desc->SetDependency("taxid", CArgDescriptions::eExcludes, "taxid_map");
478 
479  arg_desc->AddOptionalKey("taxid_map", "TaxIDMapFile",
480  "Text file mapping sequence IDs to taxonomy IDs.\n"
481  "Format:<SequenceId> <TaxonomyId><newline>",
483 
484  SetupArgDescriptions(arg_desc.release());
485 }
486 
// Copy the parsed command-line arguments into the application's members.
// Throws CInputException when the smp list argument has no value or the
// output database type is not one of rps/cobalt/delta.
// NOTE(review): the signature line is missing from this extract (presumably
// x_InitProgramParameters). Four statements are missing as well: the
// "Binary Scoremat" and "threshold" assignments and the statements of the
// cobalt and delta branches of the database-type chain.
488 {
489  const CArgs& args = GetArgs();
490 
491  //log_file
    // Fall back to standard output when no log file was given.
492  if (args[kLogFile].HasValue())
493  m_LogFile = &args[kLogFile].AsOutputFile();
494  else
495  m_LogFile = &cout;
496 
497 
498  //in_list
499  if (args[kInPssmList].HasValue())
500  m_InPssmList = &args[kInPssmList].AsInputFile();
501  else
502  NCBI_THROW(CInputException, eInvalidInput, "Please provide an input file with list of smp files");
503 
504  // Binary Scoremat
506 
507  //title
    // The title defaults to the name of the smp list file.
508  if (args[kArgDbTitle].HasValue())
509  m_Title = args[kArgDbTitle].AsString();
510  else
511  m_Title = args[kInPssmList].AsString();
512 
513  //threshold
515 
516  //Out
    // The output database name also defaults to the name of the smp list file.
517  if(args[kOutDbName].HasValue())
518  m_OutDbName = args[kOutDbName].AsString();
519  else
520  m_OutDbName = args[kInPssmList].AsString();
521 
522  //Number of SMP files per db vol
523  m_MaxSmpFilesPerVol = args[kMaxSmpFilesPerVol].AsInteger();
524 
525  //out_db_type
    // Map the textual database type onto the op_mode enum.
526  m_OutDbType = args[kOutDbType].AsString();
527  if(kOutDbRps == m_OutDbType)
528  m_op_mode = op_rps;
529  else if (kOutDbCobalt == m_OutDbType)
531  else if(kOutDbDelta == m_OutDbType)
533  else
534  NCBI_THROW(CInputException, eInvalidInput, "Invalid Output database type");
535 
536  m_CreateIndexFile = args[kOutIndexFile].AsBoolean();
537 
538  int default_gap_open = 0;
539  int default_gap_extend = 0;
540  //matrix
    // The matrix name is needed first so its default gap costs can be looked up.
541  m_Matrix = args[kArgMatrixName].AsString();
542  BLAST_GetProteinGapExistenceExtendParams(m_Matrix.c_str(), &default_gap_open, &default_gap_extend);
543 
544  //gapopen
    // Explicit command-line penalties override the matrix defaults.
545  if(args[kArgGapOpen].HasValue())
546  m_GapOpenPenalty = args[kArgGapOpen].AsInteger();
547  else
548  m_GapOpenPenalty = default_gap_open;
549 
550  //gapextend
551  if(args[kArgGapExtend].HasValue())
552  m_GapExtPenalty = args[kArgGapExtend].AsInteger();
553  else
554  m_GapExtPenalty = default_gap_extend;
555 
556  //pssm scale factor
557  m_PssmScaleFactor = args[kPssmScaleFactor].AsDouble();
558 
559  //matrix
    // NOTE(review): m_Matrix was already assigned from the same argument above;
    // this second assignment is redundant.
560  m_Matrix = args[kArgMatrixName].AsString();
561 
562  //Delta Blast Parameters
563  m_ObsrvThreshold = args[kObsrThreshold].AsDouble();
564  m_ExcludeInvalid = args[kExcludeInvalid].AsBoolean();
565 
    // The "force" flag switches off the use of thresholds stored in the model.
566  if (args[kUseCmdlineThreshold]){
567  m_UseModelThreshold = false;
568  }
569  m_DbVer = static_cast<EBlastDbVersion>(args["blastdb_version"].AsInteger());
570 
    // "taxid" and "taxid_map" are mutually exclusive; the asserts double-check
    // that in debug builds.
571  if (args["taxid"].HasValue()) {
572  _ASSERT( !args["taxid_map"].HasValue() );
573  m_Taxids.Reset(new CTaxIdSet(TAX_ID_FROM(int, args["taxid"].AsInteger())));
574  m_UserTaxIds = true;
575  } else if (args["taxid_map"].HasValue()) {
576  _ASSERT( !args["taxid"].HasValue() );
577  _ASSERT( !m_Taxids.Empty() );
578  m_Taxids->SetMappingFromFile(args["taxid_map"].AsInputFile());
579  m_UserTaxIds = true;
580  }
581 }
582 
// Collect the smp file names listed in the input stream m_InPssmList.
// Returns the names in the order they were appended; throws CInputException
// when the list ends up empty.
// NOTE(review): the signature line is missing from this extract (presumably
// x_GetSMPFilenames), and so are the two lines (591-592) that read `line`
// from the stream and split it into `tmp`.
584 {
585  vector<string> filenames;
586 
    // Keep reading until the stream reports end-of-file.
587  while(!m_InPssmList->eof())
588  {
589  string line;
590  vector<string> tmp;
593 
    // Append whatever names were found on this line.
594  if(tmp.size() > 0)
595  filenames.insert(filenames.end(), tmp.begin(), tmp.end() );
596  }
597 
598  if( 0 == filenames.size())
599  NCBI_THROW(CInputException, eInvalidInput, "Input file contains no smp filnames");
600 
601  return filenames;
602 }
603 
606  const string & filename)
607 {
609 
610  if(pssm_w_parameters.IsSetPssm())
611  {
612  const CPssm & pssm = pssm_w_parameters.GetPssm();
613 
614  if(!pssm.IsSetQuery() || (0 == pssm.GetQueryLength()))
615  {
616  string err = filename + " contains no bioseq data";
617  NCBI_THROW(CInputException, eInvalidInput, err);
618  }
619 
620  if(!pssm.IsSetNumRows() || !pssm.IsSetNumColumns())
621  {
622  string err = filename + " contains no info on num of columns or num of rows";
623  NCBI_THROW(CInputException, eInvalidInput, err);
624  }
625 
626  if((int) (pssm.GetQueryLength()) != pssm.GetNumColumns())
627  {
628  string err = filename + " 's num of columns does not match size of sequence";
629  NCBI_THROW(CInputException, eInvalidInput, err);
630  }
631 
632  int num_rows = pssm.GetNumRows();
633  if( num_rows <= 0 || num_rows > BLASTAA_SIZE )
634  {
635  string err = filename + " has invalid alphabet size";
636  NCBI_THROW(CInputException, eInvalidInput, err);
637  }
638 
639  // First time around
641  {
643  }
644 
646  {
647  string err = filename + " contains no frequency ratios.\n" +
648  "Please use a recent version of psiblast to regenerate PSSM files\n" ;
649  NCBI_THROW(CInputException, eInvalidInput, err);
650  }
651 
652  if(op_cobalt == m_op_mode)
653  {
654  if(!pssm_w_parameters.IsSetParams() || !pssm_w_parameters.GetParams().IsSetConstraints() ||
655  ! pssm_w_parameters.GetParams().GetConstraints().IsSetBlocks())
656  {
657  string err = filename + " contains no core block to build cobalt database";
658  NCBI_THROW(CInputException, eInvalidInput, err);
659  }
660  }
661 
662  if(pssm.IsSetFinalData())
663  {
664  sm = sm_valid_has_pssm;
665  }
666  else if(pssm.IsSetIntermediateData())
667  {
669  {
670  sm = sm_valid_freq_only;
671  }
672  }
673 
674  if(sm_invalid == sm)
675  {
676  string err = filename + " contains no pssm or residue frequencies";
677  NCBI_THROW(CInputException, eInvalidInput, err);
678  }
679  }
680  else
681  {
682  string err = filename + " contains no scoremat";
683  NCBI_THROW(CInputException, eInvalidInput, err);
684  }
685 
686  return sm;
687 }
688 
// Returns eFalse when the application runs in delta mode and eTrue for every
// other mode.
// NOTE(review): the signature line is missing from this extract, so the
// function name, parameters and return type cannot be confirmed from here.
690 {
691  if(op_delta == m_op_mode) {
692  return eFalse;
693  }
694 
695  return eTrue;
696 }
697 
// Create the CWriteDB protein database writer for rpsDbInfo.db_name, using
// the configured title, index-file setting and database version, and cap the
// size of each output file.
// NOTE(review): the signature line (presumably x_InitOutputDb) and the line
// that defines `index_type` are missing from this extract.
699 {
701  rpsDbInfo.output_db.Reset(new CWriteDB(rpsDbInfo.db_name, CWriteDB::eProtein, m_Title, index_type, m_CreateIndexFile, false, false, m_DbVer));
    // Limit passed to SetMaxFileSize: 4,000,000,000.
702  rpsDbInfo.output_db->SetMaxFileSize(4000000000);
703  return;
704 }
705 
// Remove the makeprofiledb-specific files that belong to database `name`.
// For each known extension the single-volume file "<name><ext>" is tried
// first; if that removal fails, numbered volume files "<name>.NN<ext>" are
// removed starting at 00 until a removal fails.
// NOTE(review): line 728 of the embedded numbering, just before
// `isRemoved = true;`, is missing from this extract. As shown here the
// function always returns true; the missing line may make that conditional.
706 static bool s_DeleteMakeprofileDb(const string & name )
707 {
708  bool isRemoved = false;
    // NULL-terminated list of the file extensions to delete.
709  static const char * mp_ext[]={".rps", ".loo", ".aux", ".freq", ".blocks", ".wcounts", ".obsr", NULL};
710  for(const char ** mp=mp_ext; *mp != NULL; mp++) {
711  CNcbiOstrstream oss;
712  oss << name << *mp;
713  const string fname = CNcbiOstrstreamToString(oss);
714  if (CFile(fname).Remove()) {
715  LOG_POST(Info << "Deleted " << fname);
716  }
717  else {
    // No single-volume file: walk the numbered volumes instead. The volume
    // tag is built from index/10 and index%10, i.e. two digits for 0-99.
718  unsigned int index = 0;
719  string vfname = name + "." + NStr::IntToString(index/10) +
720  NStr::IntToString(index%10) + *mp;
721  while (CFile(vfname).Remove()) {
722  index++;
723  vfname = name + "." + NStr::IntToString(index/10) +
724  NStr::IntToString(index%10) + *mp;
725  }
726  }
727  }
729  isRemoved = true;
730 
731  return isRemoved;
732 }
733 
734 
// Prepare `rpsDbInfo` for writing one database volume: choose the volume
// name, open the .rps, .loo, .aux and .freq output files (plus .blocks in
// cobalt mode), write the file headers, and seed the gap costs, matrix name
// and scale factor from the command-line values.
//   vol       - kSingleVol for an unsuffixed database, otherwise a volume
//               number >= 0; any other value throws CBlastException
//   num_files - number of PSSMs that will be stored in this volume
// Throws CSeqDBException when an output file cannot be opened.
// NOTE(review): two lines are missing from this extract: the db_name
// assignment for vol >= 0 (743) and the declaration of `version` (771).
735 void CMakeProfileDBApp::x_InitRPSDbInfo(CRPS_DbInfo & rpsDbInfo, Int4 vol, Int4 num_files)
736 {
737 
738  rpsDbInfo.num_seqs = num_files;
739  if(vol == kSingleVol) {
740  rpsDbInfo.db_name = m_OutDbName;
741  }
742  else if (vol >= 0) {
744  }
745  else {
746  NCBI_THROW(CBlastException, eCoreBlastError,"Invalid vol number");
747  }
748 
    // .rps, .loo and .freq are opened in binary mode.
749  string rps_str = rpsDbInfo.db_name + ".rps";
750  rpsDbInfo.pssm_file.open(rps_str.c_str(), IOS_BASE::out|IOS_BASE::binary);
751  if (!rpsDbInfo.pssm_file.is_open())
752  NCBI_THROW(CSeqDBException, eFileErr,"Failed to open output .rps file ");
753 
754  string lookup_str = rpsDbInfo.db_name + ".loo";
755  rpsDbInfo.lookup_file.open(lookup_str.c_str(), IOS_BASE::out|IOS_BASE::binary);
756  if (!rpsDbInfo.lookup_file.is_open())
757  NCBI_THROW(CSeqDBException, eFileErr,"Failed to open output .loo file");
758 
    // .aux is opened with the stream's default mode (no binary flag).
759  string aux_str = rpsDbInfo.db_name + ".aux";
760  rpsDbInfo.aux_file.open(aux_str.c_str());
761  if (!rpsDbInfo.aux_file.is_open())
762  NCBI_THROW(CSeqDBException, eFileErr,"Failed to open output .aux file");
763 
764  string freq_str = rpsDbInfo.db_name + ".freq";
765  rpsDbInfo.freq_file.open(freq_str.c_str(), IOS_BASE::out|IOS_BASE::binary);
766  if (!rpsDbInfo.freq_file.is_open())
767  NCBI_THROW(CSeqDBException, eFileErr,"Failed to open output .freq file");
768 
769  /* Write the magic numbers to the PSSM file */
770 
    // The same Int4 is written as the first field of both .rps and .freq.
772  rpsDbInfo.pssm_file.write ((char *)&version , sizeof(Int4));
773  rpsDbInfo.freq_file.write ((char *)&version , sizeof(Int4));
774 
775  /* Fill in space for the sequence offsets. The PSSM
776  data gets written after this list of integers. Also
777  write the number of sequences to the PSSM file */
778 
779  rpsDbInfo.pssm_file.write((char *) &num_files, sizeof(Int4));
780  rpsDbInfo.freq_file.write((char *) &num_files, sizeof(Int4));
    // Reserve num_files + 1 Int4 slots; the loop index is written as filler.
    // x_UpdateFreqRatios() later overwrites .freq slots with real offsets.
781  for (Int4 i = 0; i <= num_files; i++)
782  {
783  rpsDbInfo.pssm_file.write((char *)&i, sizeof(Int4));
784  rpsDbInfo.freq_file.write((char *)&i, sizeof(Int4));
785  }
786 
    // The .blocks file exists only for cobalt databases.
787  if(op_cobalt == m_op_mode)
788  {
789  string blocks_str = rpsDbInfo.db_name + ".blocks";
790  rpsDbInfo.blocks_file.open(blocks_str.c_str());
791  if (!rpsDbInfo.blocks_file.is_open())
792  NCBI_THROW(CSeqDBException, eFileErr,"Failed to open output .blocks file");
793  }
794 
795 
796  rpsDbInfo.curr_seq_offset = 0;
797  //Init them to input arg values first , may change after reading in the first sequence
798  rpsDbInfo.gap_extend = m_GapExtPenalty;
799  rpsDbInfo.gap_open = m_GapOpenPenalty;
800  rpsDbInfo.matrix = m_Matrix;
    // The scale factor is stored as an integer, rounded up.
801  rpsDbInfo.scale_factor = (Int4) ceil(m_PssmScaleFactor);
802 
803  return;
804  }
805 
806 //For first sequence only
// Override the gap-extend cost, gap-open cost and matrix name held in
// rpsDbInfo with the values stored in the PSSM's rpsdbparams, for each field
// that is actually set there. Fields that are not set keep their value.
// NOTE(review): the signature line is missing from this extract (presumably
// x_UpdateRPSDbInfo).
808 {
809  if(pssm_p.IsSetParams())
810  {
811  if(pssm_p.GetParams().IsSetRpsdbparams())
812  {
813  const CFormatRpsDbParameters & rps_db_params = pssm_p.GetParams().GetRpsdbparams();
814  if(rps_db_params.IsSetGapExtend())
815  rpsDbInfo.gap_extend = rps_db_params.GetGapExtend();
816 
817  if(rps_db_params.IsSetGapOpen())
818  rpsDbInfo.gap_open = rps_db_params.GetGapOpen();
819 
820  if(rps_db_params.IsSetMatrixName())
821  rpsDbInfo.matrix = rps_db_params.GetMatrixName();
822  }
823  }
824  return;
825 }
826 
// Make the PSSM's rpsdbparams agree with the database-wide settings in
// rpsDbInfo. Params and rpsdbparams are created if absent. For each of
// gap-extend, gap-open and matrix name: an unset field is filled in from
// rpsDbInfo, while a field that is set to a different value throws
// CBlastException.
// NOTE(review): the signature line is missing from this extract (presumably
// x_FillInRPSDbParameters).
828 {
829  if(!pssm_p.IsSetParams())
830  pssm_p.SetParams();
831 
832  if(!pssm_p.GetParams().IsSetRpsdbparams())
833  pssm_p.SetParams().SetRpsdbparams();
834 
835  CFormatRpsDbParameters & rps_params= pssm_p.SetParams().SetRpsdbparams();
836  if(!rps_params.IsSetGapExtend())
837  rps_params.SetGapExtend(rpsDbInfo.gap_extend);
838  else if(rps_params.GetGapExtend() != rpsDbInfo.gap_extend)
839  NCBI_THROW(CBlastException, eCoreBlastError, "Gap extend penalties do not match");
840 
841  if(!rps_params.IsSetGapOpen())
842  rps_params.SetGapOpen(rpsDbInfo.gap_open);
843  else if(rps_params.GetGapOpen() != rpsDbInfo.gap_open)
844  NCBI_THROW(CBlastException, eCoreBlastError, "Gap open penalties do not match");
845 
846  if(!rps_params.IsSetMatrixName())
847  rps_params.SetMatrixName (rpsDbInfo.matrix);
848  else if(rps_params.GetMatrixName()!= rpsDbInfo.matrix)
849  NCBI_THROW(CBlastException, eCoreBlastError, "Score matrix does not match");
850 
851  return;
852 }
853 
854 /* Update the input scoremat with a new PSSM and modified
855  statistics. Scoremat must contain only residue frequencies.
856  Note that upon completion the new PSSM will always have
857  columns of length BLASTAA_SIZE
858  seq is the sequence and set of score frequencies read in
859  from the next data file
860  seq_size is the number of letters in this sequence
861  alphabet_size refers to the number of PSSM rows
862  ScalingFactor is the multiplier for all PSSM scores
863 */
// The scores are computed by CPssmEngine from the scoremat's frequency
// ratios, using the matrix name stored in the scoremat's rpsdbparams and the
// gap costs and scale factor held in rpsDbInfo. Scores, lambda, kappa and H
// from the engine output are then copied into seq's final data.
// Throws CBlastException when the query length differs from seq_size.
// NOTE(review): the signature line is missing from this extract (presumably
// x_RPSUpdateStatistics). The header comment above mentions alphabet_size
// and ScalingFactor, which are not parameters of the declared method.
865 {
866 
867  CPssm & pssm = seq.SetPssm();
868  const CPssmParameters & params = seq.GetParams();
869  string matrix_name = params.GetRpsdbparams().GetMatrixName();
870 
871  /* Read in the sequence residues from the scoremat structure. */
872  CNCBIstdaa query_stdaa;
873  pssm.GetQuerySequenceData(query_stdaa);
874 
875  vector <char> query_v = query_stdaa.Get();
876 
877  if((Int4) (query_v.size()) != seq_size)
878  NCBI_THROW(CBlastException, eCoreBlastError, "Query sequence lengths mismatch");
879 
880  /* allocate query array and PSSM row array */
881  AutoArray<Uint1> query(seq_size);
882 
    // Copy the residues into the Uint1 buffer handed to the PSSM engine input.
883  for(unsigned int i = 0; i < query_v.size(); i++)
884  query[i] = query_v[i];
885 
886  unique_ptr<CNcbiMatrix <double> > freq_list (CScorematPssmConverter::GetFreqRatios(seq));
887 
888  CPsiBlastInputFreqRatios pssm_freq_ratio(query.get(), seq_size, *freq_list,
889  matrix_name.c_str(), rpsDbInfo.gap_open,
890  rpsDbInfo.gap_extend, rpsDbInfo.scale_factor);
891  CPssmEngine pssm_engine(&pssm_freq_ratio);
892  CRef<CPssmWithParameters> out_par(pssm_engine.Run());
893 
    // `i` is the final data of the input scoremat, `o` that of the engine output.
894  CPssmFinalData & i = pssm.SetFinalData();
895  const CPssmFinalData & o = out_par->GetPssm().GetFinalData();
896  i.SetScores() = o.GetScores();
897  i.SetLambda() = o.GetLambda();
898  i.SetKappa() = o.GetKappa();
899  i.SetH() = o.GetH();
900  i.SetScalingFactor(rpsDbInfo.scale_factor);
901 
902  return;
903 }
904 
905  /* The first sequence in the list determines several
906  parameters that all other sequences in the list must
907  have. In this case, extra initialization is required
908 
909  rpsDbInfo contains all the information on data files
910  and parameters for the database being built
911  pssm_w_parameters is the sequence and PSSM read in from
912  the first data file
913  freq_only is true when that scoremat holds only residue
914  frequencies, so its PSSM scores are computed here
915  (x_RPSUpdateStatistics) instead of being read from it
916  */
// NOTE(review): two lines (952 and 968 in the embedded numbering) are missing
// from this extract; each held the start of a call whose trailing arguments
// are still visible below.
917  void CMakeProfileDBApp::x_RPSAddFirstSequence(CRPS_DbInfo & rpsDbInfo, CPssmWithParameters & pssm_w_parameters, bool freq_only )
918  {
    // Take gap costs / matrix from the scoremat, then write the agreed values
    // back into it so both sides match.
919  x_UpdateRPSDbInfo(rpsDbInfo, pssm_w_parameters);
920 
921  x_FillInRPSDbParameters(rpsDbInfo, pssm_w_parameters);
922  double wordScoreThreshold = m_WordDefaultScoreThreshold;
923 
924  if(!freq_only)
925  {
    // The scoremat already has scores: adopt its scaling factor.
926  if(pssm_w_parameters.GetPssm().GetFinalData().IsSetScalingFactor())
927  {
928  rpsDbInfo.scale_factor = pssm_w_parameters.GetPssm().GetFinalData().GetScalingFactor();
929  }
930  else
931  {
932  // asn1 default value is 1
933  rpsDbInfo.scale_factor = 1.0;
934  }
    // Prefer the threshold stored in the model unless the user forced the
    // command-line value.
935  if(m_UseModelThreshold && pssm_w_parameters.GetPssm().GetFinalData().IsSetWordScoreThreshold())
936  {
937  wordScoreThreshold = pssm_w_parameters.GetPssm().GetFinalData().GetWordScoreThreshold();
938  }
939  }
940  else
941  {
942  x_RPSUpdateStatistics(rpsDbInfo, pssm_w_parameters, pssm_w_parameters.GetPssm().GetQueryLength());
943  }
944 
945  /* scale up the threshold value and convert to integer */
946  double threshold = rpsDbInfo.scale_factor * wordScoreThreshold;
947 
948  /* create BLAST lookup table */
949  if (LookupTableOptionsNew(eBlastTypeBlastp, &(rpsDbInfo.lookup_options)) != 0)
950  NCBI_THROW(CBlastException, eCoreBlastError, "Cannot create lookup options");
951 
953  FALSE, /* no megablast */
954  threshold, /* neighboring threshold */
955  BLAST_WORDSIZE_PROT ) != 0)
956  NCBI_THROW(CBlastException, eCoreBlastError, "Cannot set lookup table options");
957 
958  if (BlastAaLookupTableNew(rpsDbInfo.lookup_options, &(rpsDbInfo.lookup)) != 0)
959  NCBI_THROW(CBlastException, eCoreBlastError, "Cannot allocate lookup table");
960 
961  rpsDbInfo.lookup->use_pssm = TRUE; /* manually turn on use of PSSMs */
962 
963  /* Perform generic query setup */
964 
965  if (BlastQuerySetUpOptionsNew(&(rpsDbInfo.query_options)) != 0)
966  NCBI_THROW(CBlastException, eCoreBlastError, "Generic query setup failed");
967 
969  NULL, /* no filtering */
970  0 /* strand not applicable */ ) != 0)
971  NCBI_THROW(CBlastException, eCoreBlastError, "Cannot fill query options");
972 
973  /* Write the header of the RPS .aux file */
    // Matrix name and gap costs, four zero fields (two in scientific
    // notation, two integers), then the scale factor in fixed notation.
974  rpsDbInfo.aux_file << rpsDbInfo.matrix << "\n";
975  rpsDbInfo.aux_file << rpsDbInfo.gap_open << "\n";
976  rpsDbInfo.aux_file << rpsDbInfo.gap_extend << "\n";
977  rpsDbInfo.aux_file << scientific << 0.0 << "\n";
978  rpsDbInfo.aux_file << scientific << 0.0 << "\n";
979  rpsDbInfo.aux_file << (int) 0 << "\n";
980  rpsDbInfo.aux_file << (int) 0 << "\n";
981  rpsDbInfo.aux_file << fixed << (double) rpsDbInfo.scale_factor << "\n";
982 
983  return;
984  }
985 
986  void CMakeProfileDBApp::x_UpdateCobalt(CRPS_DbInfo & rpsDbInfo, const CPssmWithParameters & pssm_p, Int4 seq_size)
987  {
988  const CPssm & pssm = pssm_p.GetPssm();
989  // Update .blocks file
990  const list<CRef<CCoreBlock> > & block_list = pssm_p.GetParams().GetConstraints().GetBlocks();
991 
992  list<CRef<CCoreBlock> >::const_iterator itr = block_list.begin();
993 
994  int count =0;
995 
996  while(itr != block_list.end())
997  {
998  const CCoreBlock & block = (**itr);
999  if(!block.IsSetStart() || !block.IsSetStop())
1000  NCBI_THROW(CInputException, eInvalidInput, "No start Or stop found in conserved block");
1001 
1002  string seq_id_str = "id" + NStr::IntToString(count);
1003  if(pssm.IsSetQuery())
1004  {
1005  if(pssm.GetQuery().IsSeq())
1006  {
1007  if(pssm.GetQuery().GetSeq().IsSetDescr())
1008  {
1009  const list<CRef<CSeqdesc> > descr_list= pssm.GetQuery().GetSeq().GetDescr();
1010  if(descr_list.size() > 0)
1011  {
1012  const CRef<CSeqdesc> descr = descr_list.front();
1013  if(descr->IsTitle())
1014  {
1015  string title = descr->GetTitle();
1016  string accession;
1017  string tmp;
1018  if(NStr::SplitInTwo(title, ",", accession, tmp))
1019  seq_id_str = accession;
1020  }
1021  }
1022  }
1023  }
1024  }
1025 
1026  rpsDbInfo.blocks_file << seq_id_str << "\t";
1027  rpsDbInfo.blocks_file << count << "\t";
1028  rpsDbInfo.blocks_file << block.GetStart() << "\t";
1029  rpsDbInfo.blocks_file << block.GetStop() << "\n";
1030  count++;
1031  ++itr;
1032  }
1033  return;
1034  }
// Append the frequency ratios of one PSSM to the .freq file and record the
// sequence's offset in the file's offset table.
// Each sequence position becomes one row of BLASTAA_SIZE Int4 values
// (ratio * FREQ_RATIO_SCALE, rounded with BLAST_Nint); an all-zero row is
// written after the last position.
//   seq_index - 0-based slot in the offset table to update
//   seq_size  - number of positions (rows) to write
// Does nothing when m_UpdateFreqRatios is zero.
// NOTE(review): the declaration of `row` (line 1044 of the embedded
// numbering) is missing from this extract; the memset/sizeof usage below
// implies a local array.
1035 void CMakeProfileDBApp::x_UpdateFreqRatios(CRPS_DbInfo & rpsDbInfo, const CPssmWithParameters & pssm_p, Int4 seq_index, Int4 seq_size)
1036  {
1037  if (!m_UpdateFreqRatios)
1038  return;
1039 
1040  const CPssm & pssm = pssm_p.GetPssm();
1041  // Update .freq file
1042  Int4 i = 0;
1043  Int4 j = 0;
1045  Int4 alphabet_size = pssm.GetNumRows();
1046 
1047  const list<double> & freq_ratios = pssm.GetIntermediateData().GetFreqRatios();
1048  list<double>::const_iterator itr_fr = freq_ratios.begin();
    // New rows always go to the end of the file.
1049  rpsDbInfo.freq_file.seekp(0, ios_base::end);
1050 
1051  if (pssm.GetByRow() == FALSE) {
    // The flat list is consumed alphabet_size values per position.
1052  for (i = 0; i < seq_size; i++) {
1053  for (j = 0; j < alphabet_size; j++) {
    // If the list runs out early, the rest of the row is zero-filled below.
1054  if (itr_fr == freq_ratios.end())
1055  break;
1056  row[j] = (Int4) BLAST_Nint(*itr_fr * FREQ_RATIO_SCALE);
1057  ++itr_fr;
1058  }
    // Pad up to BLASTAA_SIZE when the PSSM has fewer rows.
1059  for ( ;j < BLASTAA_SIZE; j++) {
1060  row[j] = 0;
1061  }
1062  rpsDbInfo.freq_file.write((const char *)row, sizeof(Int4)*BLASTAA_SIZE);
1063  }
1064  }
1065  else {
    // Otherwise let the converter build a matrix and index it as
    // (position, residue).
1066  unique_ptr<CNcbiMatrix<double> > matrix (CScorematPssmConverter::GetFreqRatios(pssm_p));
1067 
1068  for (i = 0; i < seq_size; i++) {
1069  for (j = 0; j < BLASTAA_SIZE; j++) {
1070  row[j] = (Int4) BLAST_Nint((*matrix)(i,j ) * FREQ_RATIO_SCALE);
1071  }
1072  rpsDbInfo.freq_file.write((const char *)row, sizeof(Int4)*BLASTAA_SIZE);
1073  }
1074  }
1075 
    // Extra all-zero row after the last position of this sequence.
1076  memset(row, 0, sizeof(row));
1077  rpsDbInfo.freq_file.write((const char *)row, sizeof(Int4)*BLASTAA_SIZE);
1078 
    // The offset table starts 8 bytes into the file, after the two Int4 header
    // fields written by x_InitRPSDbInfo(); overwrite this sequence's slot.
1079  rpsDbInfo.freq_file.seekp( 8 + (seq_index) * sizeof(Int4), ios_base::beg);
1080  rpsDbInfo.freq_file.write((const char *) &rpsDbInfo.curr_seq_offset, sizeof(Int4));
1081  return;
1082  }
1083 
1084  /* Incrementally update the BLAST lookup table with
1085  words derived from the present sequence
1086  rpsDbInfo contains all the information on data files
1087  and parameters from previously added sequences,
1088  including the position matrix that is indexed here
1089  seq_size is the number of letters in this sequence
1090  */
// Throws CBlastException when rpsDbInfo holds no position matrix.
// NOTE(review): the signature line is missing from this extract (presumably
// x_RPSUpdateLookup).
1092  {
1093  BlastSeqLoc *lookup_segment = NULL;
1094 
1095  /* Tell the blast engine to index the entire input
1096  sequence. Since only the PSSM matters for lookup
1097  table creation, the process does not require
1098  actually extracting the sequence data from 'seq'*/
1099 
1100  BlastSeqLocNew(&lookup_segment, 0, seq_size - 1);
1101 
1102  /* add this sequence to the lookup table. NULL
1103  is passed in place of the query */
1104 
1105  Int4 ** posMatrix = rpsDbInfo.pos_matrix.Get();
    // NOTE(review): lookup_segment is not freed when this throws.
1106  if (NULL == posMatrix)
1107  NCBI_THROW(CBlastException, eCoreBlastError, "Empty pos matrix");
1108 
    // curr_seq_offset is passed as the last argument of the indexing call.
1109  BlastAaLookupIndexQuery(rpsDbInfo.lookup, posMatrix,
1110  NULL, lookup_segment, rpsDbInfo.curr_seq_offset);
1111 
1112  BlastSeqLocFree(lookup_segment);
1113  return;
1114  }
1115 
1116  /* Incrementally update the RPS PSSM file with the
1117  PSSM for the next input sequence
1118  info contains all the information on data files
1119  and parameters from previously added sequences
1120  seq is the sequence and PSSM read in from the next data file
1121  seq_index refers to the (0-based) position of this sequence
1122  in the complete list of seqences
1123  seq_size is the number of letters in this sequence
1124  alphabet_size refers to the number of PSSM rows
1125  */
1126 void CMakeProfileDBApp::x_RPSUpdatePSSM(CRPS_DbInfo & rpsDbInfo, const CPssm & pssm, Int4 seq_index, Int4 seq_size)
1127 {
1128  Int4 i = 0;
1129  Int4 j = 0;
1130 
1131  /* Note that RPS blast requires an extra column at
1132  * the end of the PSSM */
1133 
1134  list<int>::const_iterator score_list_itr = pssm.GetFinalData().GetScores().begin();
1135  list<int>::const_iterator score_list_end = pssm.GetFinalData().GetScores().end();
1136  Int4 alphabet_size = pssm.GetNumRows();
1137 
1138  rpsDbInfo.pos_matrix.Create(seq_size + 1);
1139  Int4 ** posMatrix = rpsDbInfo.pos_matrix.Get();
1140  if (pssm.GetByRow() == FALSE) {
1141  for (i = 0; i < seq_size; i++) {
1142  for (j = 0; j < alphabet_size; j++) {
1143  if (score_list_itr == score_list_end)
1144  break;
1145  posMatrix[i][j] = *score_list_itr;
1146  score_list_itr++;
1147  }
1148  if (j < alphabet_size)
1149  break;
1150  for (; j < BLASTAA_SIZE; j++) {
1151  posMatrix[i][j] = INT2_MIN;
1152  }
1153  }
1154  }
1155  else {
1156  for (j = 0; j < alphabet_size; j++) {
1157  for (i = 0; i < seq_size; i++) {
1158  if (score_list_itr == score_list_end)
1159  break;
1160  posMatrix[i][j] = *score_list_itr;
1161  score_list_itr++;
1162  }
1163  if (i < seq_size)
1164  break;
1165  }
1166  if (j == alphabet_size) {
1167  for (; j < BLASTAA_SIZE; j++) {
1168  for (i = 0; i < seq_size; i++) {
1169  posMatrix[i][j] = INT2_MIN;
1170  }
1171  }
1172  }
1173  }
1174 
1175  if (i < seq_size || j < alphabet_size)
1176  NCBI_THROW(CBlastException, eCoreBlastError, "PSSM was truncated early");
1177 
1178  if(score_list_itr != score_list_end)
1179  NCBI_THROW(CBlastException, eCoreBlastError, "PSSM too large for this sequence");
1180 
1181  /* manually fill in the extra (last) column of the PSSM.
1182  Note that the value to use should more appropriately
1183  be BLAST_SCORE_MIN, but we instead follow the convention
1184  used in copymat */
1185 
1186  for (i = 0; i < BLASTAA_SIZE; i++)
1187  posMatrix[seq_size][i] = -BLAST_SCORE_MAX;
1188 
1189  /* Dump the score matrix, column by column */
1190  rpsDbInfo.pssm_file.seekp(0, ios_base::end);
1191  for (i = 0; i < seq_size + 1; i++) {
1192  rpsDbInfo.pssm_file.write((const char *) posMatrix[i], sizeof(Int4)*BLASTAA_SIZE);
1193  }
1194  /* Write the next context offset. Note that the
1195  RPSProfileHeader structure is one int too large for
1196  our purposes, so that the index of this sequence
1197  must be decremented to get the right byte offset
1198  into the file */
1199 
1200  rpsDbInfo.pssm_file.seekp( 8 + (seq_index) * sizeof(Int4), ios_base::beg);
1201  rpsDbInfo.pssm_file.write((const char *) &rpsDbInfo.curr_seq_offset, sizeof(Int4));
1202 
1203  return;
1204  }
1205 
1206 /* Once all sequences have been processed, perform
1207  final setup on the BLAST lookup table and finish
1208  up the RPS files */
1209 
// NOTE(review): the signature line (listing line 1210) is missing from this
// extract. The body is presumably CMakeProfileDBApp::x_RPS_DbClose(CRPS_DbInfo &
// rpsDbInfo), which x_MakeVol calls last - confirm against the real file.
// Throws CBlastException (eCoreBlastError) if no lookup table exists or if
// BlastAaLookupFinalize fails.
1211 {
1212  /* Write the last context offset to the PSSM file.
1213  This is the total number of letters for all RPS
1214  DB sequences combined */
1215 
// The same final offset goes into slot num_seqs of both the PSSM file and the
// frequency file.
1216  rpsDbInfo.pssm_file.seekp(8 + (rpsDbInfo.num_seqs) * sizeof(Int4), ios::beg);
1217  rpsDbInfo.pssm_file.write((const char *) &rpsDbInfo.curr_seq_offset, sizeof(Int4));
1218  rpsDbInfo.freq_file.seekp(8 + (rpsDbInfo.num_seqs) * sizeof(Int4), ios::beg);
1219  rpsDbInfo.freq_file.write((const char *) &rpsDbInfo.curr_seq_offset, sizeof(Int4));
1220 
1221  /* Pack the lookup table into its compressed form */
1222  if(NULL == rpsDbInfo.lookup)
1223  NCBI_THROW(CBlastException, eCoreBlastError, "Empty database");
1224 
1225  if (BlastAaLookupFinalize(rpsDbInfo.lookup, eBackbone) != 0) {
1226  NCBI_THROW(CBlastException, eCoreBlastError, "Failed to compress lookup table");
1227  }
1228  else {
1229  /* Change the lookup table format to match that
1230  of the legacy BLAST lookup table */
1231 
1232  BlastRPSLookupFileHeader header;
1233  BlastAaLookupTable *lut = rpsDbInfo.lookup;
1234  Int4 i, index;
1235  Int4 cursor, old_cursor;
1236  AaLookupBackboneCell *cell;
1237  RPSBackboneCell empty_cell;
1238 
1239  memset(&header, 0, sizeof(header));
// NOTE(review): listing line 1240 is missing here. In the visible code only
// start_of_backbone and end_of_overflow are assigned after the memset;
// presumably the missing line sets another header field - confirm.
1241 
1242  /* for each lookup table cell */
1243 
// `cursor` counts the Int4 entries kept in the compacted overflow array.
1244  for (index = cursor = 0; index < lut->backbone_size; index++) {
1245  cell = (AaLookupBackboneCell*)lut->thick_backbone + index;
1246 
1247 
1248  if (cell->num_used == 0)
1249  continue;
1250 
1251  /* The cell contains hits */
1252 
1253  if (cell->num_used <= RPS_HITS_PER_CELL) {
1254  /* if 3 hits or less, just update each hit offset
1255  to point to the end of the word rather than
1256  the beginning */
1257 
1258  for (i = 0; i < cell->num_used; i++)
1259  cell->payload.entries[i] += BLAST_WORDSIZE_PROT - 1;
1260  }
1261  else {
1262  /* if more than 3 hits, pack the first hit into the
1263  lookup table cell, pack the overflow array byte
1264  offset into the cell, and compress the resulting
1265  'hole' in the overflow array. Update the hit
1266  offsets as well */
1267 
// old_cursor must be read before entries[] is overwritten: both are reached
// through cell->payload.
1268  old_cursor = cell->payload.overflow_cursor;
1269  cell->payload.entries[0] = ((Int4*)lut->overflow)[old_cursor] +
1270  BLAST_WORDSIZE_PROT - 1;
1271  cell->payload.entries[1] = cursor * sizeof(Int4);
1272  for (i = 1; i < cell->num_used; i++, cursor++) {
1273  ((Int4*)lut->overflow)[cursor]
1274  = ((Int4*)lut->overflow)[old_cursor + i] +
1275  BLAST_WORDSIZE_PROT - 1;
1276  }
1277  }
1278  }
1279 
// File layout written below: header, RPS_NUM_LOOKUP_CELLS + 1 backbone cells
// (the cells beyond backbone_size are written as zeros), then `cursor`
// overflow entries.
1280  header.start_of_backbone = sizeof(header);
1281  header.end_of_overflow = header.start_of_backbone +
1282  (RPS_NUM_LOOKUP_CELLS + 1) * sizeof(RPSBackboneCell) +
1283  cursor * sizeof(Int4);
1284 
1285  /* write the lookup file header */
1286 
1287  rpsDbInfo.lookup_file.write((const char *)&header, sizeof(header));
1288 
1289  /* write the thick backbone */
1290 
1291  rpsDbInfo.lookup_file.write((const char *)lut->thick_backbone,
1292  sizeof(RPSBackboneCell)* lut->backbone_size);
1293 
1294  /* write extra backbone cells */
1295  memset(&empty_cell, 0, sizeof(empty_cell));
1296  for (i = lut->backbone_size; i < RPS_NUM_LOOKUP_CELLS + 1; i++) {
1297  rpsDbInfo.lookup_file.write((const char *)&empty_cell, sizeof(empty_cell));
1298  }
1299 
1300  /* write the new overflow array */
1301  rpsDbInfo.lookup_file.write((const char *)lut->overflow, sizeof(Int4)*cursor);
1302  }
1303 
1304  /* Free data, close files */
1305 
1306  rpsDbInfo.lookup = BlastAaLookupTableDestruct(rpsDbInfo.lookup);
// NOTE(review): listing line 1307 is missing from this extract.
1308  rpsDbInfo.lookup_file.flush();
1309  rpsDbInfo.lookup_file.close();
1310  rpsDbInfo.pssm_file.flush();
1311  rpsDbInfo.pssm_file.close();
1312  rpsDbInfo.aux_file.flush();
1313  rpsDbInfo.aux_file.close();
1314  rpsDbInfo.freq_file.flush();
1315  rpsDbInfo.freq_file.close();
1316 
// In cobalt mode the blocks file is closed as well. Otherwise, when frequency
// ratios were not updated, the "<db_name>.freq" file is removed.
1317  if(op_cobalt == m_op_mode)
1318  {
1319  rpsDbInfo.blocks_file.flush();
1320  rpsDbInfo.blocks_file.close();
1321  }
1322  else if(!m_UpdateFreqRatios)
1323  {
1324  string freq_str = rpsDbInfo.db_name + ".freq";
1325  CFile(freq_str).Remove();
1326  }
1327 
1328 }
1329 
1331 {
1334 }
1335 
1336 static bool s_HasDefline(const CBioseq & bio)
1337 {
1338  if (bio.CanGetDescr()) {
1339  return true;
1340  }
1341 
1342  return false;
1343 }
1344 
// NOTE(review): listing lines 1345 (signature) and 1347 are missing from this
// extract. The body is presumably s_GenerateBlastDefline(const CBioseq & bio),
// which x_MakeVol calls when s_HasDefline() is false; the missing line 1347
// presumably declares `defline_set` - confirm against the real file.
1346 {
// Build a single defline carrying only the Bioseq's Seq-ids and append it to
// the set that is returned.
1348  CRef<CBlast_def_line> defline(new CBlast_def_line());
1349  defline->SetSeqid() = bio.GetId();
1350  defline_set->Set().push_back(defline);
1351  return defline_set;
1352 }
1353 
// NOTE(review): listing lines 1354 (signature), 1356-1357 and 1386 are missing
// from this extract. The body is presumably CMakeProfileDBApp::x_Run(), which
// Run() calls inside its try block - confirm against the real file.
//
// Visible behaviour: obtain the list of scoremat files (filtered through
// x_CreateDeltaList() in delta mode), split it into m_NumOfVols volumes and
// build each one with x_MakeVol(). Always returns 0.
1355 {
1358  *m_LogFile << "Deleted existing BLAST database with identical name." << endl;
1359  }
1360  vector<string> smpFilenames = (op_delta == m_op_mode )? x_CreateDeltaList():x_GetSMPFilenames();
1361  int num_smps = smpFilenames.size();
// num_seqs is the base number of files per volume; residue_seqs files are
// left over and handed out one per volume, starting with the second volume.
1362  m_NumOfVols = num_smps/m_MaxSmpFilesPerVol + 1;
1363  int num_seqs = num_smps/m_NumOfVols;
1364  int residue_seqs = num_smps % m_NumOfVols;
// A single volume is built with volume index -1.
1365  if(m_NumOfVols == 1) {
1366  x_MakeVol( -1, smpFilenames);
1367  m_Done = true;
1368  return 0;
1369  }
1370  else {
1371  vector<string>::iterator b = smpFilenames.begin();
1372  vector<string>::iterator r = b + num_seqs;
1373  for(int i=0; i < m_NumOfVols; i++) {
1374  vector<string> vol_smps(b, r);
1375  x_MakeVol(i, vol_smps);
1376  b= r;
// NOTE(review): after the last volume b equals end(), so `b + num_seqs`
// advances an iterator past end() (never dereferenced, but formally
// undefined) - consider computing r only when another volume follows.
1377  r = b + num_seqs;
1378  if(residue_seqs > 0) {
1379  r++;
1380  residue_seqs--;
1381  }
1382  }
1383  _ASSERT(b==smpFilenames.end());
1384  }
1385  if (m_NumOfVols == m_VolNames.size()) {
1387  m_Done = true;
1388  }
1389  return 0;
1390 }
1391 
// Build one database volume from the scoremat files listed in `smps`.
//
// vol   volume index passed to x_InitRPSDbInfo (x_Run passes -1 when there is
//       only one volume)
// smps  scoremat file names; one sequence is added per file
//
// For every file: read the CPssmWithParameters (binary or text ASN.1 according
// to m_binary_scoremat), validate it, add the query Bioseq and its deflines to
// the output database, then update the PSSM file, the lookup table, the
// frequency-ratio file and the aux file. After the loop the delta files are
// written in delta mode and all outputs are closed.
//
// Throws CInputException for a missing file or an invalid scoremat, and
// CBlastException when a scaling factor differs from rpsDbInfo.scale_factor.
1392 void CMakeProfileDBApp::x_MakeVol(Int4 vol, vector<string> & smps)
1393 {
1394 
1395  CRPS_DbInfo rpsDbInfo;
1396  x_InitRPSDbInfo(rpsDbInfo, vol, smps.size());
1397  m_VolNames.push_back(rpsDbInfo.db_name);
1398  x_InitOutputDb(rpsDbInfo);
1399 
1400  for(int seq_index=0; seq_index < rpsDbInfo.num_seqs; seq_index++)
1401  {
1402  string filename = smps[seq_index];
1403  CFile f(filename);
1404  if(!f.Exists())
1405  {
1406  string err = filename + " does not exists";
1407  NCBI_THROW(CInputException, eInvalidInput, err);
1408  }
1409 
1410  //Read PssmWithParameters from file
1411  CPssmWithParameters pssm_w_parameters;
1412  if(m_binary_scoremat)
1413  {
1414  CNcbiIfstream in_stream(filename.c_str(), ios::binary);
1415  in_stream >> MSerial_AsnBinary >> pssm_w_parameters;
1416  }
1417  else
1418  {
1419  CNcbiIfstream in_stream(filename.c_str());
1420  in_stream >> MSerial_AsnText >> pssm_w_parameters;
1421  }
1422 
1423  CheckInputScoremat_RV sm = x_CheckInputScoremat(pssm_w_parameters, filename);
1424  // Should have error out already....
1425  if(sm_invalid == sm)
1426  {
1427  string err = filename + " contains invalid scoremat";
1428  NCBI_THROW(CInputException, eInvalidInput, err);
1429  }
1430 
1431  const CPssm & pssm = pssm_w_parameters.GetPssm();
1432  int seq_size = pssm.GetQueryLength();
1433 
// Use the Bioseq's own deflines when s_HasDefline() says it has descriptors;
// otherwise generate deflines from the Bioseq with s_GenerateBlastDefline().
1434  const CBioseq & bioseq = pssm.GetQuery().GetSeq();
1435  CRef<CBlast_def_line_set> deflines;
1436  if(s_HasDefline(bioseq)) {
1437  deflines = CWriteDB::ExtractBioseqDeflines(bioseq);
1438  }
1439  else {
1440  deflines = s_GenerateBlastDefline(bioseq);
1441  }
1442 
1443  // set taxids from the PSSM unless -taxid or -taxid_map option was used
// Only the first Org descriptor is used (the loop breaks after it).
1444  if (!m_UserTaxIds) {
1445  if (bioseq.IsSetDescr()) {
1446  for (const auto& it: bioseq.GetDescr().Get()) {
1447  if (it->IsOrg()) {
1448  TTaxId taxid = it->GetOrg().GetTaxId();
1449  const CSeq_id* seqid = bioseq.GetFirstId();
1450  _ASSERT(seqid);
1451  if (seqid) {
1452  m_Taxids->AddTaxId(*seqid, taxid);
1453  }
1454 
1455  break;
1456  }
1457  }
1458  }
1459  }
1460 
1461  m_Taxids->FixTaxId(deflines);
1462  rpsDbInfo.output_db->AddSequence(bioseq);
1463  rpsDbInfo.output_db->SetDeflines(*deflines);
1464 
1465  //Complete RpsDnInfo init with data from first file
// A NULL lookup table means no sequence has been added to this volume yet.
1466  if(NULL == rpsDbInfo.lookup)
1467  {
1468  x_RPSAddFirstSequence( rpsDbInfo, pssm_w_parameters, sm == sm_valid_freq_only);
1469  }
1470  else
1471  {
1472  x_FillInRPSDbParameters(rpsDbInfo, pssm_w_parameters);
1473  if(sm_valid_freq_only == sm){
1474  x_RPSUpdateStatistics(rpsDbInfo, pssm_w_parameters, seq_size);
1475  }
1476 
// Every later scoremat must use the scaling factor already stored in rpsDbInfo.
1477  if( pssm.GetFinalData().IsSetScalingFactor())
1478  {
1479  if( pssm.GetFinalData().GetScalingFactor() != rpsDbInfo.scale_factor) {
1480  NCBI_THROW(CBlastException, eCoreBlastError, "Scaling factors do not match");
1481  }
1482  }
1483  else
1484  {
1485  // If scaling factor not specified, the default is 1
1486  if( 1 != rpsDbInfo.scale_factor) {
1487  NCBI_THROW(CBlastException, eCoreBlastError, "Scaling factors do not match");
1488  }
1489  }
1490 
// NOTE(review): listing line 1491 is missing here. The `else` below implies
// an `if`, presumably testing whether the PSSM supplies its own word score
// threshold - confirm against the real file.
1492  rpsDbInfo.lookup->threshold = rpsDbInfo.scale_factor * pssm_w_parameters.GetPssm().GetFinalData().GetWordScoreThreshold();
1493  }
1494  else {
1495  rpsDbInfo.lookup->threshold = rpsDbInfo.scale_factor * m_WordDefaultScoreThreshold;
1496  }
1497 
1498  }
1499 
1500  x_RPSUpdatePSSM(rpsDbInfo, pssm, seq_index, seq_size);
1501  x_RPSUpdateLookup(rpsDbInfo, seq_size);
1502  x_UpdateFreqRatios(rpsDbInfo, pssm_w_parameters, seq_index, seq_size);
1503 
// Aux file: the sequence length, then Kappa in scientific notation.
1504  rpsDbInfo.aux_file << seq_size << "\n";
1505  rpsDbInfo.aux_file << scientific << pssm.GetFinalData().GetKappa() << "\n";
// Each sequence advances the running offset by seq_size + 1, matching the
// extra PSSM column that x_RPSUpdatePSSM writes.
1506  rpsDbInfo.curr_seq_offset +=(seq_size +1);
1507  rpsDbInfo.pos_matrix.Delete();
1508 
1509  if(op_cobalt == m_op_mode) {
1510  x_UpdateCobalt(rpsDbInfo, pssm_w_parameters, seq_size);
1511  }
1512  }
1513 
1514  if(op_delta == m_op_mode) {
1515  x_UpdateDelta(rpsDbInfo, smps);
1516  }
1517  rpsDbInfo.output_db->Close();
1518  x_RPS_DbClose(rpsDbInfo);
1519 }
1520 
1521 static void s_WriteInt4List(CNcbiOfstream & ostr, const list<Int4> & l)
1522 {
1523  ITERATE(list<Int4>, it, l)
1524  {
1525  ostr.write((char*)&(*it), sizeof(Int4));
1526  }
1527 }
1528 
1529 static void s_WriteUint4List(CNcbiOfstream & ostr, const list<Uint4> & l)
1530 {
1531  ITERATE(list<Uint4>, it, l)
1532  {
1533  ostr.write((char*)&(*it), sizeof(Uint4));
1534  }
1535 }
1536 
// NOTE(review): listing lines 1537 (signature), 1575 and 1581 are missing from
// this extract. The body is presumably CMakeProfileDBApp::x_CreateDeltaList(),
// which returns vector<string> and is called from the volume-splitting code in
// delta mode - confirm against the real file.
//
// Visible behaviour: read and validate every scoremat returned by
// x_GetSMPFilenames() and return the names of those accepted by x_CheckDelta().
// Throws CInputException for a missing file, an invalid scoremat, or a
// scoremat lacking the data named in the two error messages below.
1538 {
1539  vector<string> smpFilenames = x_GetSMPFilenames();
1540  vector<string> deltaList;
1541 
1542  for(unsigned int seq_index=0; seq_index < smpFilenames.size(); seq_index++)
1543  {
1544  string filename = smpFilenames[seq_index];
1545  CFile f(filename);
1546  if(!f.Exists())
1547  {
1548  string err = filename + " does not exists";
1549  NCBI_THROW(CInputException, eInvalidInput, err);
1550  }
1551 
1552  //Read PssmWithParameters from file
1553  CPssmWithParameters pssm_w_parameters;
1554  if(m_binary_scoremat)
1555  {
1556  CNcbiIfstream in_stream(filename.c_str(), ios::binary);
1557  in_stream >> MSerial_AsnBinary >> pssm_w_parameters;
1558  }
1559  else
1560  {
1561  CNcbiIfstream in_stream(filename.c_str());
1562  in_stream >> MSerial_AsnText >> pssm_w_parameters;
1563  }
1564 
1565  CheckInputScoremat_RV sm = x_CheckInputScoremat(pssm_w_parameters, filename);
1566  // Should have error out already....
1567  if(sm_invalid == sm)
1568  {
1569  string err = filename + " contains invalid scoremat";
1570  NCBI_THROW(CInputException, eInvalidInput, err);
1571  }
1572 
1573  const CPssm & pssm = pssm_w_parameters.GetPssm();
1574  int seq_size = pssm.GetQueryLength();
// NOTE(review): listing line 1575 (the condition guarding this block) is
// missing; judging by the message it presumably tests for absent weighted
// residue frequencies - confirm.
1576  {
1577  string err = filename + " contains no weighted residue frequencies for building delta database";
1578  NCBI_THROW(CInputException, eInvalidInput, err);
1579  }
1580 
// NOTE(review): listing line 1581 (the condition guarding this block) is
// missing; judging by the message it presumably tests for absent
// independent-observation counts - confirm.
1582  {
1583  string err = filename + " contains no observations information for building delta database";
1584  NCBI_THROW(CInputException, eInvalidInput, err);
1585  }
1586 
// Files rejected by x_CheckDelta() are left out of the returned list.
1587  if (true == x_CheckDelta(pssm, seq_size, filename))
1588  {
1589  deltaList.push_back(filename);
1590  }
1591  }
1592 
1593  return deltaList;
1594 }
1595 
1596 void CMakeProfileDBApp::x_UpdateDelta(CRPS_DbInfo & rpsDbInfo, vector<string> & smpFilenames)
1597 {
1598  CTmpFile tmp_obsr_file(CTmpFile::eRemove);
1599  CTmpFile tmp_freq_file(CTmpFile::eRemove);
1600  CNcbiOfstream tmp_obsr_buff(tmp_obsr_file.GetFileName().c_str(), IOS_BASE::out | IOS_BASE::binary);
1601  CNcbiOfstream tmp_freq_buff(tmp_freq_file.GetFileName().c_str(), IOS_BASE::out | IOS_BASE::binary);
1602 
1603  list<Int4> FreqOffsets;
1604  list<Int4> ObsrOffsets;
1605  Int4 CurrFreqOffset = 0;
1606  Int4 CurrObsrOffset= 0;
1607 
1608  for(unsigned int seq_index=0; seq_index < smpFilenames.size(); seq_index++)
1609  {
1610  string filename = smpFilenames[seq_index];
1611  //Read PssmWithParameters from file
1612  CPssmWithParameters pssm_w_parameters;
1613  if(m_binary_scoremat)
1614  {
1615  CNcbiIfstream in_stream(filename.c_str(), ios::binary);
1616  in_stream >> MSerial_AsnBinary >> pssm_w_parameters;
1617  }
1618  else
1619  {
1620  CNcbiIfstream in_stream(filename.c_str());
1621  in_stream >> MSerial_AsnText >> pssm_w_parameters;
1622  }
1623 
1624  const CPssm & pssm = pssm_w_parameters.GetPssm();
1625  int seq_size = pssm.GetQueryLength();
1626 
1627  // get weightd residue frequencies
1628  const list<double>& orig_freqs = pssm.GetIntermediateData().GetWeightedResFreqsPerPos();
1629 
1630  // get number of independent observations
1631  const list<double>& obsr = pssm.GetIntermediateData().GetNumIndeptObsr();
1632 
1633  int alphabet_size = pssm.GetNumRows();
1634  list<double> modify_freqs;
1635 
1636  if(pssm.GetByRow())
1637  {
1638  // need to flip the freq matrix
1639  vector<double> tmp(orig_freqs.size());
1640  list<double>::const_iterator f_itr = orig_freqs.begin();
1641 
1642  for(int i = 0; i < alphabet_size; i++)
1643  {
1644  for(int j = 0; j < seq_size; j++)
1645  {
1646  tmp[i + j*alphabet_size] = *f_itr;
1647  ++f_itr;
1648  }
1649  }
1650  copy(tmp.begin(), tmp.end(), modify_freqs.begin());
1651  }
1652 
1653  // Pad matrix if necessary
1654  if(alphabet_size < BLASTAA_SIZE)
1655  {
1656  if(0 == modify_freqs.size())
1657  copy(orig_freqs.begin(), orig_freqs.end(), modify_freqs.begin());
1658 
1659  list<double>::iterator p_itr = modify_freqs.begin();
1660 
1661  for (int j=0; j < seq_size; j++)
1662  {
1663  for(int i=0; i < alphabet_size; i++)
1664  {
1665  if(modify_freqs.end() == p_itr)
1666  break;
1667 
1668  ++p_itr;
1669  }
1670 
1671  modify_freqs.insert(p_itr, (BLASTAA_SIZE-alphabet_size), 0);
1672  }
1673  }
1674 
1675  const list<double> & freqs = (modify_freqs.size()? modify_freqs:orig_freqs );
1676 
1677  //save offset for this record
1678  ObsrOffsets.push_back(CurrObsrOffset);
1679 
1680  list<Uint4> ObsrBuff;
1681  // write effective observations in compressed form
1682  // as a list of pairs: value, number of occurences
1683  unsigned int num_obsr_columns = 0;
1684  list<double>::const_iterator obsr_it = obsr.begin();
1685  do
1686  {
1687  double current = *obsr_it;
1688  Uint4 num = 1;
1689  num_obsr_columns++;
1690  obsr_it++;
1691  while (obsr_it != obsr.end() && fabs(*obsr_it - current) < 1e-4)
1692  {
1693  obsr_it++;
1694  num++;
1695  num_obsr_columns++;
1696  }
1697 
1698  // +1 because pssm engine returns alpha (in psi-blast papers)
1699  // which is number of independent observations - 1
1700  ObsrBuff.push_back((Uint4)((current + 1.0) * kFixedPointScaleFactor));
1701  ObsrBuff.push_back(num);
1702  }
1703  while (obsr_it != obsr.end());
1704 
1705  Uint4 num_weighted_counts = 0;
1706 
1707  // save offset for this frequencies record
1708  FreqOffsets.push_back(CurrFreqOffset / BLASTAA_SIZE);
1709 
1710  list<Uint4> FreqBuff;
1711  // save weighted residue frequencies
1712  ITERATE (list<double>, it, freqs)
1713  {
1714  FreqBuff.push_back((Uint4)(*it * kFixedPointScaleFactor));
1715  num_weighted_counts++;
1716  }
1717 
1718  if (num_obsr_columns != num_weighted_counts / BLASTAA_SIZE)
1719  {
1720  string err = "Number of frequencies and observations columns do not match in " + filename;
1721  NCBI_THROW(CException, eInvalid, err);
1722  }
1723 
1724  // additional column of zeros is added for compatibility with rps database
1725  unsigned int padded_size = FreqBuff.size() + BLASTAA_SIZE;
1726  FreqBuff.resize(padded_size, 0);
1727 
1728  CurrFreqOffset += FreqBuff.size();
1729  CurrObsrOffset += ObsrBuff.size();
1730  s_WriteUint4List(tmp_freq_buff, FreqBuff);
1731  s_WriteUint4List(tmp_obsr_buff, ObsrBuff);
1732 
1733  }
1734 
1735  tmp_obsr_buff.flush();
1736  tmp_freq_buff.flush();
1737  x_WrapUpDelta(rpsDbInfo, tmp_obsr_file, tmp_freq_file, FreqOffsets, ObsrOffsets, CurrFreqOffset, CurrObsrOffset);
1738 }
1739 
1740 
1741 bool CMakeProfileDBApp::x_ValidateCd(const list<double>& freqs,
1742  const list<double>& observ,
1743  unsigned int alphabet_size)
1744 {
1745 
1746  if (freqs.size() / alphabet_size != observ.size())
1747  {
1748  string err = "Number of frequency and observations columns do not match";
1749  NCBI_THROW(CException, eInvalid, err);
1750  }
1751 
1752  ITERATE (list<double>, it, freqs)
1753  {
1754  unsigned int residue = 0;
1755  double sum = 0.0;
1756  while (residue < alphabet_size - 1)
1757  {
1758  sum += *it;
1759  it++;
1760  residue++;
1761  }
1762  sum += *it;
1763 
1764  if (fabs(sum - 1.0) > kEpsylon)
1765  return false;
1766  }
1767 
1768  ITERATE (list<double>, it, observ)
1769  {
1770  if (*it < 1.0)
1771  return false;
1772  }
1773 
1774  return true;
1775 }
1776 
1777 
1778 bool CMakeProfileDBApp::x_CheckDelta( const CPssm & pssm, Int4 seq_size, const string & filename)
1779 {
1780  // get weightd residue frequencies
1781  const list<double>& orig_freqs = pssm.GetIntermediateData().GetWeightedResFreqsPerPos();
1782 
1783  // get number of independent observations
1784  const list<double>& obsr = pssm.GetIntermediateData().GetNumIndeptObsr();
1785 
1786  int alphabet_size = pssm.GetNumRows();
1787  list<double> modify_freqs;
1788 
1789  if(pssm.GetByRow())
1790  {
1791  // need to flip the freq matrix
1792  vector<double> tmp(orig_freqs.size());
1793  list<double>::const_iterator f_itr = orig_freqs.begin();
1794 
1795  for(int i = 0; i < alphabet_size; i++)
1796  {
1797  for(int j = 0; j < seq_size; j++)
1798  {
1799  tmp[i + j*alphabet_size] = *f_itr;
1800  ++f_itr;
1801  }
1802  }
1803  copy(tmp.begin(), tmp.end(), modify_freqs.begin());
1804  }
1805 
1806  // Pad matrix if necessary
1807  if(alphabet_size < BLASTAA_SIZE)
1808  {
1809  if(0 == modify_freqs.size())
1810  copy(orig_freqs.begin(), orig_freqs.end(), modify_freqs.begin());
1811 
1812  list<double>::iterator p_itr = modify_freqs.begin();
1813 
1814  for (int j=0; j < seq_size; j++)
1815  {
1816  for(int i=0; i < alphabet_size; i++)
1817  {
1818  if(modify_freqs.end() == p_itr)
1819  break;
1820 
1821  ++p_itr;
1822  }
1823 
1824  modify_freqs.insert(p_itr, (BLASTAA_SIZE-alphabet_size), 0);
1825  }
1826  }
1827 
1828  const list<double> & freqs = (modify_freqs.size()? modify_freqs:orig_freqs );
1829  double max_obsr = *max_element(obsr.begin(), obsr.end()) + 1.0;
1830  if(max_obsr < m_ObsrvThreshold)
1831  {
1832  *m_LogFile << filename +
1833  " was excluded: due to too few independent observations\n";
1834  return false;
1835  }
1836 
1837  if( !x_ValidateCd(freqs, obsr, BLASTAA_SIZE) && m_ExcludeInvalid)
1838  {
1839  *m_LogFile << filename +
1840  " was excluded: it conatins an invalid CD \n";
1841  return false;
1842  }
1843  return true;
1844 }
1845 
1846 
1847 
1848 void CMakeProfileDBApp::x_WrapUpDelta(CRPS_DbInfo & rpsDbInfo, CTmpFile & tmp_obsr_file, CTmpFile & tmp_freq_file,
1849  list<Int4> & FreqOffsets, list<Int4> & ObsrOffsets, Int4 CurrFreqOffset, Int4 CurrObsrOffset)
1850 {
1851  FreqOffsets.push_back(CurrFreqOffset / BLASTAA_SIZE);
1852  ObsrOffsets.push_back(CurrObsrOffset);
1853 
1854  string wcounts_str = rpsDbInfo.db_name + ".wcounts";
1855  CNcbiOfstream wcounts_file(wcounts_str.c_str(), ios::out | ios::binary);
1856  if (!wcounts_file.is_open())
1857  NCBI_THROW(CSeqDBException, eFileErr,"Failed to open output .wcounts file");
1858 
1859  string obsr_str = rpsDbInfo.db_name + ".obsr";
1860  CNcbiOfstream obsr_file(obsr_str.c_str(), IOS_BASE::out|IOS_BASE::binary);
1861  if (!obsr_file.is_open())
1862  NCBI_THROW(CSeqDBException, eFileErr,"Failed to open output .obsr file");
1863 
1864  CNcbiIfstream tmp_obsr_buff (tmp_obsr_file.GetFileName().c_str(), IOS_BASE::in | IOS_BASE::binary);
1865  CNcbiIfstream tmp_freq_buff (tmp_freq_file.GetFileName().c_str(), IOS_BASE::in | IOS_BASE::binary);
1866 
1867  // write RPS BLAST database magic number
1868  Int4 magic_number = RPS_MAGIC_NUM_28;
1869  wcounts_file.write((char*)&magic_number, sizeof(Int4));
1870  obsr_file.write((char*)&magic_number, sizeof(Int4));
1871 
1872  // write number of recrods
1873  Int4 num_wcounts_records = FreqOffsets.size() -1;
1874  Int4 num_obsr_records = ObsrOffsets.size() -1;
1875  wcounts_file.write((char*)&num_wcounts_records, sizeof(Int4));
1876  obsr_file.write((char*)&num_obsr_records, sizeof(Int4));
1877 
1878  s_WriteInt4List(wcounts_file, FreqOffsets);
1879  wcounts_file.flush();
1880  wcounts_file << tmp_freq_buff.rdbuf();
1881  wcounts_file.flush();
1882  wcounts_file.close();
1883 
1884  s_WriteInt4List(obsr_file, ObsrOffsets);
1885  obsr_file.flush();
1886  obsr_file << tmp_obsr_buff.rdbuf();
1887  obsr_file.flush();
1888  obsr_file.close();
1889 }
1890 
// NOTE(review): listing lines 1891 (signature), 1896 and 1900-1901 are missing
// from this extract. The class declares x_CreateAliasFile(void) and this body
// iterates over the volume names, so it is presumably that member - confirm.
1892 {
1893  vector<string> v;
// One string is collected per entry of m_VolNames.
1894  for(unsigned int i=0; i < m_VolNames.size(); i++) {
1895  string t = kEmptyStr;
// NOTE(review): `s` is declared on the missing listing line 1896.
1897  s.GetString(t);
1898  v.push_back(t);
1899  }
// NOTE(review): whatever consumes `v` is on the missing lines 1900-1901.
1902 }
1903 
// NOTE(review): listing lines 1904 (signature) and 1930 are missing from this
// extract. The class declares `virtual int Run()` and this body returns an int
// status, so it is presumably that member - confirm against the real file.
//
// Runs x_Run() and converts any exception into an exit status: input and BLAST
// exceptions give BLAST_INPUT_ERROR, CSeqDBException gives
// BLAST_DATABASE_ERROR, everything else gives BLAST_UNKNOWN_ERROR. The status
// stays 0 when nothing is thrown.
1905 {
1906  int status = 0;
1907  try { x_Run(); }
// The more specific exception types are listed before the general CException handler.
1908  catch(const blast::CInputException& e) {
1909  ERR_POST(Error << "INPUT ERROR: " << e.GetMsg());
1910  status = BLAST_INPUT_ERROR;
1911  }
1912  catch (const CSeqDBException& e) {
1913  ERR_POST(Error << "ERROR: " << e.GetMsg());
1914  status = BLAST_DATABASE_ERROR;
1915  }
1916  catch (const blast::CBlastException& e) {
1917  ERR_POST(Error << "ERROR: " << e.GetMsg());
1918  status = BLAST_INPUT_ERROR;
1919  }
1920  catch (const CException& e) {
1921  ERR_POST(Error << "ERROR: " << e.GetMsg());
1922  status = BLAST_UNKNOWN_ERROR;
1923  }
1924  catch (...) {
1925  ERR_POST(Error << "Error: Unknown exception");
1926  status = BLAST_UNKNOWN_ERROR;
1927  }
1928 
// Usage-report parameters are added whether or not x_Run() succeeded.
1929  x_AddCmdOptions();
// NOTE(review): listing line 1930 is missing from this extract.
1931  return status;
1932 }
1933 
// NOTE(review): listing lines 1934 (signature) and 1941 are missing from this
// extract. The class declares x_AddCmdOptions(void), which is called before the
// exit status is returned, and this body fills m_UsageReport from the command
// line arguments, so it is presumably that member - confirm.
1935 {
1936  const CArgs & args = GetArgs();
// Record the database type in the usage report when -dbtype was given.
1937  if (args["dbtype"].HasValue()) {
1938  m_UsageReport.AddParam(CBlastUsageReport::eDBType, args["dbtype"].AsString());
1939  }
// NOTE(review): the body of this branch is on the missing listing line 1941.
1940  if(args["taxid"].HasValue() || args["taxid_map"].HasValue()) {
1942  }
1943 }
1944 
1945 
1946 #ifndef SKIP_DOXYGEN_PROCESSING
1947 int main(int argc, const char* argv[] /*, const char* envp[]*/)
1948 {
1949  return CMakeProfileDBApp().AppMain(argc, argv);
1950 }
1951 
1952 
1953 
1954 
1955 #endif /* SKIP_DOXYGEN_PROCESSING */
1956 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
Declares singleton objects to store the version and reference for the BLAST engine.
Routines for creating protein BLAST lookup tables.
@ eBackbone
BlastAaLookupTable * BlastAaLookupTableDestruct(BlastAaLookupTable *lookup)
Free the lookup table.
void BlastAaLookupIndexQuery(BlastAaLookupTable *lookup, Int4 **matrix, BLAST_SequenceBlk *query, BlastSeqLoc *unmasked_regions, Int4 query_bias)
Index a protein query.
struct RPSBackboneCell RPSBackboneCell
structure defining one cell of the RPS lookup table
#define RPS_HITS_PER_CELL
maximum number of hits in an RPS backbone cell; this may be redundant (have the same value as AA_HITS...
Int4 BlastAaLookupFinalize(BlastAaLookupTable *lookup, EBoneType bone_type)
Pack the data structures comprising a protein lookup table into their final form.
Int4 BlastAaLookupTableNew(const LookupTableOptions *opt, BlastAaLookupTable **lut)
Create a new protein lookup table.
#define BLAST_INPUT_ERROR
Command line binary exit code: error in input query/options.
#define BLAST_UNKNOWN_ERROR
Command line binary exit code: unknown error.
#define BLAST_DATABASE_ERROR
Command line binary exit code: error in database/subject.
BlastSeqLoc * BlastSeqLocFree(BlastSeqLoc *loc)
Deallocate all BlastSeqLoc objects in a chain.
Definition: blast_filter.c:737
BlastSeqLoc * BlastSeqLocNew(BlastSeqLoc **head, Int4 from, Int4 to)
Create and initialize a new sequence interval.
Definition: blast_filter.c:608
Interface for converting sources of sequence data into blast sequence input.
The structures and functions in blast_options.
Int2 BLAST_FillQuerySetUpOptions(QuerySetUpOptions *options, EBlastProgramType program, const char *filter_string, Uint1 strand_option)
Fill non-default contents of the QuerySetUpOptions.
Int2 BlastQuerySetUpOptionsNew(QuerySetUpOptions **options)
Allocate memory for QuerySetUpOptions and fill with default values.
Int2 BLAST_FillLookupTableOptions(LookupTableOptions *options, EBlastProgramType program, Boolean is_megablast, double threshold, Int4 word_size)
Allocate memory for lookup table options and fill with default values.
Int2 LookupTableOptionsNew(EBlastProgramType program, LookupTableOptions **options)
Allocate memory for lookup table options and fill with default values.
#define BLAST_WORDSIZE_PROT
length of word to trigger an extension.
Definition: blast_options.h:66
LookupTableOptions * LookupTableOptionsFree(LookupTableOptions *options)
Deallocates memory for LookupTableOptions*.
QuerySetUpOptions * BlastQuerySetUpOptionsFree(QuerySetUpOptions *options)
Deallocate memory for QuerySetUpOptions.
@ eBlastTypePsiBlast
Definition: blast_program.h:82
@ eBlastTypeBlastp
Definition: blast_program.h:73
#define FREQ_RATIO_SCALE
header for RPS blast frequency ratios ('.freq') file
Definition: blast_rps.h:83
#define RPS_MAGIC_NUM_28
Version number for 28-letter alphabet.
Definition: blast_rps.h:44
Int2 BLAST_GetProteinGapExistenceExtendParams(const char *matrixName, Int4 *gap_existence, Int4 *gap_extension)
Extract the recommended gap existence and extension values.
Definition: blast_stat.c:3374
#define BLAST_SCORE_MAX
maximum allowed score (for one letter comparison).
Definition: blast_stat.h:122
Code to build a database given various sources of sequence data.
AutoArray –.
Definition: ncbimisc.hpp:527
Class to constrain the values of an argument to those greater than or equal to the value specified in...
CArgAllow_Integers –.
Definition: ncbiargs.hpp:1751
CArgAllow_Strings –.
Definition: ncbiargs.hpp:1641
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgs –.
Definition: ncbiargs.hpp:379
const CSeq_id * GetFirstId() const
Definition: Bioseq.cpp:271
Defines BLAST error codes (user errors included)
void AddParam(EUsageParams p, int val)
Keeps track of the version of the BLAST engine in the NCBI C++ toolkit.
Definition: version.hpp:53
static void CreateDirectories(const string &dbname)
Create Directory for blast db.
Definition: build_db.cpp:1051
CCoreBlock –.
Definition: CoreBlock.hpp:66
CFile –.
Definition: ncbifile.hpp:1605
Defines user input exceptions.
void Create(int seq_size)
Int4 ** Get(void)
unsigned int GetSize(void)
QuerySetUpOptions * query_options
LookupTableOptions * lookup_options
CMakeProfileDBApp(void)
@inheritDoc
void x_AddCmdOptions(void)
virtual void Init()
@inheritDoc
CNcbiOstream * m_LogFile
CheckInputScoremat_RV x_CheckInputScoremat(const CPssmWithParameters &pssm_w_parameters, const string &filename)
CRef< CTaxIdSet > m_Taxids
CNcbiIstream * m_InPssmList
EBlastDbVersion m_DbVer
void x_RPSUpdateLookup(CRPS_DbInfo &rpsDbInfo, Int4 seq_size)
vector< string > x_CreateDeltaList(void)
void x_WrapUpDelta(CRPS_DbInfo &rpsDbInfo, CTmpFile &tmp_obsr_file, CTmpFile &tmp_freq_file, list< Int4 > &FreqOffsets, list< Int4 > &ObsrOffsets, Int4 CurrFreqOffset, Int4 CurrObsrOffset)
void x_RPSUpdateStatistics(CRPS_DbInfo &rpsDbInfo, CPssmWithParameters &seq, Int4 seq_size)
virtual int Run()
@inheritDoc
void x_CreateAliasFile(void)
void x_FillInRPSDbParameters(CRPS_DbInfo &rpsDbInfo, CPssmWithParameters &pssm_p)
void x_InitProgramParameters(void)
void x_InitRPSDbInfo(CRPS_DbInfo &rpsDBInfo, Int4 vol, Int4 num_files)
void x_RPS_DbClose(CRPS_DbInfo &rpsDbInfo)
bool x_CheckDelta(const CPssm &pssm, Int4 seq_size, const string &filename)
void x_RPSAddFirstSequence(CRPS_DbInfo &rpsDbInfo, CPssmWithParameters &pssm_w_parameters, bool freq_only)
void x_UpdateRPSDbInfo(CRPS_DbInfo &rpsDbInfo, const CPssmWithParameters &pssm_p)
void x_UpdateDelta(CRPS_DbInfo &rpsDbInfo, vector< string > &smpFilenames)
double m_WordDefaultScoreThreshold
void x_RPSUpdatePSSM(CRPS_DbInfo &rpsDbInfo, const CPssm &pssm, Int4 seq_index, Int4 seq_size)
void x_InitOutputDb(CRPS_DbInfo &rpsDBInfo)
void x_SetupArgDescriptions(void)
CBlastUsageReport m_UsageReport
vector< string > m_VolNames
CStopWatch m_StopWatch
bool x_ValidateCd(const list< double > &freqs, const list< double > &observ, unsigned int alphabet_size)
void x_UpdateFreqRatios(CRPS_DbInfo &rpsDbInfo, const CPssmWithParameters &pssm_p, Int4 seq_index, Int4 seq_size)
bool x_IsUpdateFreqRatios(const CPssm &p)
vector< string > x_GetSMPFilenames(void)
void x_UpdateCobalt(CRPS_DbInfo &rpsDbInfo, const CPssmWithParameters &pssm_p, Int4 seq_size)
void x_MakeVol(Int4 vol, vector< string > &smps)
CNCBIstdaa –.
Definition: NCBIstdaa.hpp:66
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string. Sample usage:
Definition: ncbistre.hpp:802
Implements the interface to retrieve data for the last 2 stages of the PSSM creation.
Computes a PSSM as specified in PSI-BLAST.
CPssmFinalData –.
CPssmParameters –.
Definition: Pssm.hpp:55
void GetQuerySequenceData(CNCBIstdaa &sequence) const
Retrieve the query sequence data in ncbistdaa format.
Definition: Pssm.cpp:77
SIZE_TYPE GetQueryLength() const
Return the query length or 0 if no query is available.
Definition: Pssm.cpp:62
CSeqDBException.
Definition: seqdbcommon.hpp:73
String slicing.
void GetString(string &s) const
Return the data by assigning it to a string.
@ eProtein
Definition: seqdb.hpp:174
CStopWatch –.
Definition: ncbitime.hpp:1937
void FixTaxId(CRef< objects::CBlast_def_line_set > deflines)
Check that each defline has the specified taxid; if not, replace the defline and set the taxid.
Definition: taxid_set.cpp:131
void AddTaxId(const objects::CSeq_id &seqid, const TTaxId &taxid)
Definition: taxid_set.cpp:77
void SetMappingFromFile(CNcbiIstream &f)
Definition: taxid_set.cpp:45
CTmpFile –.
Definition: ncbifile.hpp:2353
static string MakeShortName(const string &base, int index)
Construct the short name for a volume.
CWriteDB.
Definition: writedb.hpp:92
void SetMaxFileSize(Uint8 sz)
Set maximum size for output files.
Definition: writedb.cpp:118
@ eProtein
Protein database.
Definition: writedb.hpp:97
void AddSequence(const CBioseq &bs)
Add a sequence as a CBioseq.
Definition: writedb.cpp:79
EIndexType
Whether and what kind of indices to build.
Definition: writedb.hpp:104
@ eDefault
Like eFullIndex but also build a numeric Trace ID index.
Definition: writedb.hpp:121
@ eNoIndex
Build a database without any indices.
Definition: writedb.hpp:106
static CRef< CBlast_def_line_set > ExtractBioseqDeflines(const CBioseq &bs, bool parse_ids=true, bool long_ids=false, bool scan_bioseq_4_cfastareader_usrobj=false)
Extract Deflines From Bioseq.
Definition: writedb.cpp:129
void SetDeflines(const CBlast_def_line_set &deflines)
Set the deflines to be used for the sequence.
Definition: writedb.cpp:94
void Close()
Close the Database.
Definition: writedb.cpp:104
Constant declarations for command line arguments for BLAST programs.
const string kArgMatrixName
Argument for scoring matrix.
const string kArgDbTitle
Title for the BLAST database.
const string kArgGapExtend
Argument to select the gap extending penalty.
const string kArgGapOpen
Argument to select the gap opening penalty.
const string kArgWordScoreThreshold
Argument to specify the minimum word score such that the word is added to the lookup table.
void Print(const CCompactSAMApplication::AlignInfo &ai)
std::ofstream out("events_result.xml")
main entry point for tests
#define true
Definition: bool.h:35
#define false
Definition: bool.h:36
static char tmp[3200]
Definition: utf8.c:42
static FILE * f
Definition: readconf.c:23
CRef< objects::CPssmWithParameters > Run()
Runs the PSSM engine to compute the PSSM.
#define BLASTAA_SIZE
Size of aminoacid alphabet.
static CNcbiMatrix< double > * GetFreqRatios(const objects::CPssmWithParameters &pssm)
Returns matrix of BLASTAA_SIZE by query size (dimensions are opposite of what is stored in the BlastS...
void SetFullVersion(CRef< CVersionAPI > version)
Set version data for the program.
Definition: ncbiapp.cpp:1187
void HideStdArgs(THideStdArgs hide_mask)
Set the hide mask for the Hide Std Flags.
Definition: ncbiapp.cpp:1325
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:305
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
Definition: ncbiapp.cpp:832
CVersionInfo GetVersion(void) const
Get the program version information.
Definition: ncbiapp.cpp:1197
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
Definition: ncbiapp.cpp:1208
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
#define TAX_ID_FROM(T, value)
Definition: ncbimisc.hpp:1111
@ fHideXmlHelp
Hide XML help description.
@ fHideFullVersion
Hide full version description.
@ fHideDryRun
Hide dryrun description.
@ fHideConffile
Hide configuration file description.
@ eExcludes
One argument excludes another.
Definition: ncbiargs.hpp:957
@ eInputFile
Name of file (must exist and be readable)
Definition: ncbiargs.hpp:595
@ eBoolean
{'true', 't', 'false', 'f'}, case-insensitive
Definition: ncbiargs.hpp:590
@ eDouble
Convertible into a floating point number (double)
Definition: ncbiargs.hpp:594
@ eString
An arbitrary string.
Definition: ncbiargs.hpp:589
@ eInteger
Convertible into an integer number (int or Int8)
Definition: ncbiargs.hpp:592
#define NULL
Definition: ncbistd.hpp:225
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
void Info(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1185
virtual bool Remove(TRemoveFlags flags=eRecursive) const
Remove a directory entry.
Definition: ncbifile.cpp:2595
const string & GetFileName(void) const
Return used file name (generated or given in the constructor).
Definition: ncbifile.cpp:5429
@ eRemove
Remove file.
Definition: ncbifile.hpp:2357
#define MSerial_AsnBinary
Definition: serialbase.hpp:697
const TPrim & Get(void) const
Definition: serialbase.hpp:347
#define MSerial_AsnText
I/O stream manipulators –.
Definition: serialbase.hpp:696
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
bool IsEnabled(void)
Indicates whether application usage statistics collection is enabled for a current reporter instance.
CNcbiIstream & NcbiGetlineEOL(CNcbiIstream &is, string &str, string::size_type *count=NULL)
Read from "is" to "str" the next line (taking into account platform specifics of End-of-Line)
IO_PREFIX::ofstream CNcbiOfstream
Portable alias for ofstream.
Definition: ncbistre.hpp:500
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
static string DoubleToString(double value, int precision=-1, TNumToStringFlags flags=0)
Convert double to string.
Definition: ncbistr.hpp:5181
#define kEmptyStr
Definition: ncbistr.hpp:123
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3452
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5078
static bool SplitInTwo(const CTempString str, const CTempString delim, string &str1, string &str2, TSplitFlags flags=0)
Split a string into two pieces using the specified delimiters.
Definition: ncbistr.cpp:3545
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
@ fSplit_Truncate
Definition: ncbistr.hpp:2503
@ fSplit_MergeDelimiters
Merge adjacent delimiters.
Definition: ncbistr.hpp:2500
double Elapsed(void) const
Return time elapsed since first Start() or last Restart() call (in seconds).
Definition: ncbitime.hpp:2775
void Start(void)
Start the timer.
Definition: ncbitime.hpp:2764
#define CVersion
Tdata & Set(void)
Assign a value to data member.
const TFreqRatios & GetFreqRatios(void) const
Get the FreqRatios member data.
const TNumIndeptObsr & GetNumIndeptObsr(void) const
Get the NumIndeptObsr member data.
const TMatrixName & GetMatrixName(void) const
Get the MatrixName member data.
const TQuery & GetQuery(void) const
Get the Query member data.
Definition: Pssm_.hpp:772
TNumRows GetNumRows(void) const
Get the NumRows member data.
Definition: Pssm_.hpp:610
void SetParams(TParams &value)
Assign a value to Params data member.
bool IsSetFinalData(void) const
Final representation for the PSSM. Check if a value has been assigned to FinalData data member.
Definition: Pssm_.hpp:802
bool IsSetStop(void) const
End of block on query. Check if a value has been assigned to Stop data member.
Definition: CoreBlock_.hpp:367
TH GetH(void) const
Get the H member data.
TKappa GetKappa(void) const
Get the Kappa member data.
const TScores & GetScores(void) const
Get the Scores member data.
const TWeightedResFreqsPerPos & GetWeightedResFreqsPerPos(void) const
Get the WeightedResFreqsPerPos member data.
bool IsSetGapOpen(void) const
gap opening penalty corresponding to the matrix above Check if a value has been assigned to GapOpen d...
TGapExtend GetGapExtend(void) const
Get the GapExtend member data.
TWordScoreThreshold GetWordScoreThreshold(void) const
Get the WordScoreThreshold member data.
TScalingFactor GetScalingFactor(void) const
Get the ScalingFactor member data.
const TBlocks & GetBlocks(void) const
Get the Blocks member data.
Definition: CoreDef_.hpp:369
bool IsSetStart(void) const
Begin of block on query. Check if a value has been assigned to Start data member.
Definition: CoreBlock_.hpp:320
bool IsSetWordScoreThreshold(void) const
Word score threshold. Check if a value has been assigned to WordScoreThreshold data member.
bool IsSetScalingFactor(void) const
scaling factor used to obtain more precision when building the PSSM.
bool IsSetFreqRatios(void) const
PSSM's frequency ratios. Check if a value has been assigned to FreqRatios data member.
TStop GetStop(void) const
Get the Stop member data.
Definition: CoreBlock_.hpp:386
void SetMatrixName(const TMatrixName &value)
Assign a value to MatrixName data member.
bool IsSetIntermediateData(void) const
both intermediateData and finalData can be provided, but at least one of them must be provided.
Definition: Pssm_.hpp:781
const TFinalData & GetFinalData(void) const
Get the FinalData member data.
Definition: Pssm_.hpp:814
bool IsSetWeightedResFreqsPerPos(void) const
Weighted observed residue frequencies per position of the PSSM.
bool IsSetRpsdbparams(void) const
data needed by formatrpsdb to create RPS-BLAST databases.
void SetPssm(TPssm &value)
Assign a value to Pssm data member.
TNumColumns GetNumColumns(void) const
Get the NumColumns member data.
Definition: Pssm_.hpp:657
const TConstraints & GetConstraints(void) const
Get the Constraints member data.
bool IsSetMatrixName(void) const
name of the underlying score matrix whose frequency ratios were used in PSSM construction (e....
bool IsSetNumRows(void) const
The dimensions of the matrix are returned so the client can verify that all data was received.
Definition: Pssm_.hpp:591
void SetFinalData(TFinalData &value)
Assign a value to FinalData data member.
Definition: Pssm_.cpp:116
TStart GetStart(void) const
Get the Start member data.
Definition: CoreBlock_.hpp:339
bool IsSetQuery(void) const
PSSM representative sequence (master). Check if a value has been assigned to Query data member.
Definition: Pssm_.hpp:760
TGapOpen GetGapOpen(void) const
Get the GapOpen member data.
bool IsSetNumIndeptObsr(void) const
Number of independent observations per position of the PSSM NOTE: this is needed for building CDD dat...
bool IsSetConstraints(void) const
alignment constraints needed by sequence-structure threader and other global or local block-alignment...
bool IsSetGapExtend(void) const
Gap extension penalty corresponding to the matrix above. Check if a value has been assigned to GapExte...
bool IsSetNumColumns(void) const
Number of columns. Check if a value has been assigned to NumColumns data member.
Definition: Pssm_.hpp:638
const TIntermediateData & GetIntermediateData(void) const
Get the IntermediateData member data.
Definition: Pssm_.hpp:793
TByRow GetByRow(void) const
Get the ByRow member data.
Definition: Pssm_.hpp:735
void SetGapOpen(TGapOpen value)
Assign a value to GapOpen data member.
const TParams & GetParams(void) const
Get the Params member data.
bool IsSetBlocks(void) const
nblocks locations. Check if a value has been assigned to Blocks data member.
Definition: CoreDef_.hpp:357
bool IsSetPssm(void) const
This field is applicable to PSI-BLAST and formatrpsdb.
void SetGapExtend(TGapExtend value)
Assign a value to GapExtend data member.
const TPssm & GetPssm(void) const
Get the Pssm member data.
bool IsSetParams(void) const
This field's rpsdbparams is used to specify the values of options for processing by formatrpsdb.
const TRpsdbparams & GetRpsdbparams(void) const
Get the Rpsdbparams member data.
TLambda GetLambda(void) const
Get the Lambda member data.
const TSeq & GetSeq(void) const
Get the variant data.
Definition: Seq_entry_.cpp:102
bool IsSeq(void) const
Check if variant Seq is selected.
Definition: Seq_entry_.hpp:257
const TTitle & GetTitle(void) const
Get the variant data.
Definition: Seqdesc_.hpp:1032
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:290
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
bool CanGetDescr(void) const
Check if it is safe to call GetDescr method.
Definition: Bioseq_.hpp:309
bool IsSetDescr(void) const
Descriptors. Check if a value has been assigned to Descr data member.
Definition: Bioseq_.hpp:303
bool IsTitle(void) const
Check if variant Title is selected.
Definition: Seqdesc_.hpp:1026
const TDescr & GetDescr(void) const
Get the Descr member data.
Definition: Bioseq_.hpp:315
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
if(yy_accept[yy_current_state])
static void s_WriteInt4List(CNcbiOfstream &ostr, const list< Int4 > &l)
static const string kDefaultMatrix(kMatrixBLOSUM62)
static const string kOutDbName("out")
static CRef< CBlast_def_line_set > s_GenerateBlastDefline(const CBioseq &bio)
static const string kMatrixBLOSUM80
static const string kOutDbType("dbtype")
static const string kExcludeInvalid("exclude_invalid")
#define RPS_NUM_LOOKUP_CELLS
USING_SCOPE(blast)
static const string kMatrixPAM250
static const string kMaxSmpFilesPerVol("max_smp_vol")
static const string kMatrixBLOSUM62
static bool s_HasDefline(const CBioseq &bio)
static const Uint4 kFixedPointScaleFactor
static const string kLogFile("logfile")
static const string kDefaultOutIndexFile("true")
static const string kDefaultOutDbType(kOutDbRps)
#define kEpsylon
static const string kDefaultExcludeInvalid("true")
#define RPS_DATABASE_VERSION
static const string kMatrixBLOSUM50
static const string kOutDbRps
static void s_WriteUint4List(CNcbiOfstream &ostr, const list< Uint4 > &l)
static const string kMatrixBLOSUM90
#define kDefaultWordScoreThreshold
#define kDefaultObsrThreshold
static const string kInPssmList("in")
int main(int argc, const char *argv[])
#define kDefaultMaxSmpFilesPerVol
static const string kObsrThreshold("obsr_threshold")
USING_NCBI_SCOPE
static const string kMatrixPAM70
static const string kMatrixBLOSUM45
#define kSingleVol
static const string kOutDbDelta
static bool s_DeleteMakeprofileDb(const string &name)
static const string kMatrixPAM30
static const string kBinaryScoremat("binary")
static const string kOutDbCobalt
static const string kUseCmdlineThreshold("force")
static const string kPssmScaleFactor("scale")
static const string kOutIndexFile("index")
#define kDefaultPssmScaleFactor
const string version
version string
Definition: variables.hpp:66
const struct ncbi::grid::netcache::search::fields::SIZE size
#define fabs(v)
Definition: ncbi_dispd.c:46
EIPRangeType t
Definition: ncbi_localip.c:101
Prototypes for portable math library (ported from C Toolkit)
long BLAST_Nint(double x)
Nearest integer.
Definition: ncbi_math.c:437
#define TRUE
bool replacement for C indicating true.
Definition: ncbi_std.h:97
#define FALSE
bool replacement for C indicating false.
Definition: ncbi_std.h:101
#define INT2_MIN
smallest (most negative) number represented by signed (two byte) short
Definition: ncbi_std.h:161
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Miscellaneous common-use basic types and functionality.
Defines: CTimeFormat - storage class for time format.
std::istream & in(std::istream &in_, double &x_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
static int filenames
Definition: pcre2grep.c:247
#define count
Defines a concrete strategy to obtain PSSM input data for PSI-BLAST.
C++ API for the PSI-BLAST PSSM engine.
bool DeleteBlastDb(const string &dbpath, CSeqDB::ESeqType seq_type)
Deletes all files associated with a BLAST database.
Definition: seqdb.cpp:1542
EBlastDbVersion
BLAST database version.
Definition: seqdbcommon.hpp:51
@ eBDB_Version4
Definition: seqdbcommon.hpp:52
@ eBDB_Version5
Definition: seqdbcommon.hpp:53
This file defines several SeqDB utility functions related to byte order and file system portability.
CSeqDB_Substring SeqDB_RemoveDirName(CSeqDB_Substring s)
Returns a filename minus greedy path.
Definition: seqdbcommon.cpp:50
static SLJIT_INLINE sljit_ins l(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
#define row(bind, expected)
Definition: string_bind.c:73
structure defining one cell of the compacted lookup table
union AaLookupBackboneCell::@3 payload
union that specifies either entries stored right on the backbone if fewer than AA_HITS_PER_CELL are p...
Int4 entries[3]
if the number of hits for this cell is AA_HITS_PER_CELL or less, the hits are all stored directly in ...
Int4 overflow_cursor
integer offset into the overflow array where the list of hits for this cell begins
Int4 num_used
number of hits stored for this cell
The basic lookup table structure for blastp searches.
void * thick_backbone
may point to BackboneCell, SmallboneCell, or TinyboneCell.
Boolean use_pssm
if TRUE, lookup table construction will assume that the underlying score matrix is position- specific
Int4 threshold
the score threshold for neighboring words
void * overflow
may point to Int4 or Uint2, the overflow array for the compacted lookup table
Int4 backbone_size
number of cells in the backbone
header of RPS blast '.loo' file
Definition: blast_rps.h:49
Int4 magic_number
value should be RPS_MAGIC_NUM
Definition: blast_rps.h:50
Int4 start_of_backbone
byte offset of start of backbone
Definition: blast_rps.h:56
Int4 end_of_overflow
byte offset to end of overflow array
Definition: blast_rps.h:57
Used to hold a set of positions, mostly used for filtering.
Definition: blast_def.h:204
Options needed to construct a lookup table Also needed: query sequence and query length.
Options required for setting up the query sequence.
structure defining one cell of the RPS lookup table
static string query
Class which defines sequence id to taxid mapping.
#define _ASSERT
Defines BLAST database construction classes.
void CWriteDB_CreateAliasFile(const string &file_name, const string &db_name, CWriteDB::ESeqType seq_type, const string &gi_file_name, const string &title=string(), EAliasFileFilterType alias_type=eGiList)
Writes an alias file that restricts a database with a gi list.
@ eNoAliasFilterType
Sentinel value.
Definition: writedb.hpp:610
Code for database files construction.
Modified on Fri Sep 20 14:57:27 2024 by modify_doxy.py rev. 669887