64 #include "../blast/blast_app_util.hpp"
66 #ifndef SKIP_DOXYGEN_PROCESSING
107 #define kDefaultWordScoreThreshold (9.82)
108 #define kDefaultPssmScaleFactor (100.00)
109 #define kDefaultObsrThreshold (6.0)
110 #define kDefaultMaxSmpFilesPerVol (2500)
114 #define kEpsylon (0.0001)
116 #define DEFAULT_POS_MATRIX_SIZE 2000
117 #define RPS_NUM_LOOKUP_CELLS 32768
118 #if BLASTAA_SIZE == 28
119 #define RPS_DATABASE_VERSION RPS_MAGIC_NUM_28
121 #define RPS_DATABASE_VERSION RPS_MAGIC_NUM
124 #define kSingleVol (-1)
132 void Create(
int seq_size);
253 const string & filename);
268 bool x_ValidateCd(
const list<double>& freqs,
const list<double>& observ,
unsigned int alphabet_size);
270 list<Int4> & FreqOffsets, list<Int4> & ObsrOffsets,
Int4 CurrFreqOffset,
Int4 CurrObsrOffset);
273 void x_UpdateDelta(CRPS_DbInfo & rpsDbInfo, vector<string> & smpFilenames);
318 m_WordDefaultScoreThreshold(0), m_OutDbName(
kEmptyStr),
319 m_OutDbType(
kEmptyStr), m_CreateIndexFile(
false),m_GapOpenPenalty(0),
320 m_GapExtPenalty(0), m_PssmScaleFactor(0),m_Matrix(
kEmptyStr), m_op_mode(op_invalid),
321 m_binary_scoremat(
false), m_MaxSmpFilesPerVol(0), m_NumOfVols(0), m_DbVer(
eBDB_Version5),
323 m_ObsrvThreshold(0), m_ExcludeInvalid(
false),
388 arg_desc->SetUsageContext(
GetArguments().GetProgramBasename(),
389 "Application to create databases for rpsblast, cobalt and deltablast, version "
392 string dflt(
"Default = input file name provided to -");
395 arg_desc->SetCurrentGroup(
"Input options");
397 "Input file that contains a list of smp files (delimited by space, tab or newline)",
401 "Scoremats are in binary format",
404 arg_desc->SetCurrentGroup(
"Configuration options");
405 arg_desc->AddOptionalKey(
kArgDbTitle,
"database_title",
406 "Title for database\n" + dflt,
410 "Minimum word score to add a word to the lookup table",
415 arg_desc->SetCurrentGroup(
"Output options");
416 arg_desc->AddOptionalKey(
kOutDbName,
"database_name",
417 "Name of database to be created\n" +
420 arg_desc->AddDefaultKey(
"blastdb_version",
"version",
421 "Version of BLAST database to be created",
424 arg_desc->SetConstraint(
"blastdb_version",
428 "Maximum number of SMP files per DB volume",
431 arg_desc->AddDefaultKey(
kOutDbType,
"output_db_type",
432 "Output database type: cobalt, delta, rps",
437 "Create Index Files",
440 arg_desc->SetCurrentGroup(
"Used only if scoremat files do not contain PSSM scores, ignored otherwise.");
441 arg_desc->AddOptionalKey(
kArgGapOpen,
"gap_open_penalty",
442 "Cost to open a gap",
445 arg_desc->AddOptionalKey(
kArgGapExtend,
"gap_extend_penalty",
446 "Cost to extend a gap, ",
450 "Pssm Scale factor ",
455 "Scoring matrix name",
462 arg_desc->SetCurrentGroup(
"Delta Blast Options");
463 arg_desc->AddDefaultKey(
kObsrThreshold,
"observations_threshold",
"Exclude domains with "
464 "with maximum number of independent observations "
468 arg_desc->AddDefaultKey(
kExcludeInvalid,
"exclude_invalid",
"Exclude domains that do "
469 "not pass validation test",
472 arg_desc->SetCurrentGroup(
"Taxonomy options");
473 arg_desc->AddOptionalKey(
"taxid",
"TaxID",
474 "Taxonomy ID to assign to all sequences",
479 arg_desc->AddOptionalKey(
"taxid_map",
"TaxIDMapFile",
480 "Text file mapping sequence IDs to taxonomy IDs.\n"
481 "Format:<SequenceId> <TaxonomyId><newline>",
538 int default_gap_open = 0;
539 int default_gap_extend = 0;
575 }
else if (args[
"taxid_map"].
HasValue()) {
606 const string & filename)
616 string err = filename +
" contains no bioseq data";
622 string err = filename +
" contains no info on num of columns or num of rows";
628 string err = filename +
" 's num of columns does not match size of sequence";
635 string err = filename +
" has invalid alphabet size";
647 string err = filename +
" contains no frequency ratios.\n" +
648 "Please use a recent version of psiblast to regenerate PSSM files\n" ;
657 string err = filename +
" contains no core block to build cobalt database";
676 string err = filename +
" contains no pssm or residue frequencies";
682 string err = filename +
" contains no scoremat";
708 bool isRemoved =
false;
709 static const char * mp_ext[]={
".rps",
".loo",
".aux",
".freq",
".blocks",
".wcounts",
".obsr",
NULL};
710 for(
const char ** mp=mp_ext; *mp !=
NULL; mp++) {
714 if (
CFile(fname).Remove()) {
718 unsigned int index = 0;
721 while (
CFile(vfname).Remove()) {
749 string rps_str = rpsDbInfo.
db_name +
".rps";
754 string lookup_str = rpsDbInfo.
db_name +
".loo";
759 string aux_str = rpsDbInfo.
db_name +
".aux";
760 rpsDbInfo.
aux_file.open(aux_str.c_str());
764 string freq_str = rpsDbInfo.
db_name +
".freq";
779 rpsDbInfo.
pssm_file.write((
char *) &num_files,
sizeof(
Int4));
780 rpsDbInfo.
freq_file.write((
char *) &num_files,
sizeof(
Int4));
781 for (
Int4 i = 0;
i <= num_files;
i++)
789 string blocks_str = rpsDbInfo.
db_name +
".blocks";
875 vector <char> query_v = query_stdaa.
Get();
877 if((
Int4) (query_v.size()) != seq_size)
883 for(
unsigned int i = 0;
i < query_v.size();
i++)
889 matrix_name.c_str(), rpsDbInfo.
gap_open,
946 double threshold = rpsDbInfo.
scale_factor * wordScoreThreshold;
977 rpsDbInfo.
aux_file << scientific << 0.0 <<
"\n";
978 rpsDbInfo.
aux_file << scientific << 0.0 <<
"\n";
992 list<CRef<CCoreBlock> >::const_iterator itr = block_list.begin();
996 while(itr != block_list.end())
1010 if(descr_list.size() > 0)
1019 seq_id_str = accession;
1048 list<double>::const_iterator itr_fr = freq_ratios.begin();
1049 rpsDbInfo.
freq_file.seekp(0, ios_base::end);
1052 for (
i = 0;
i < seq_size;
i++) {
1053 for (j = 0; j < alphabet_size; j++) {
1054 if (itr_fr == freq_ratios.end())
1068 for (
i = 0;
i < seq_size;
i++) {
1076 memset(
row, 0,
sizeof(
row));
1079 rpsDbInfo.
freq_file.seekp( 8 + (seq_index) *
sizeof(
Int4), ios_base::beg);
1106 if (
NULL == posMatrix)
1141 for (
i = 0;
i < seq_size;
i++) {
1142 for (j = 0; j < alphabet_size; j++) {
1143 if (score_list_itr == score_list_end)
1145 posMatrix[
i][j] = *score_list_itr;
1148 if (j < alphabet_size)
1156 for (j = 0; j < alphabet_size; j++) {
1157 for (
i = 0;
i < seq_size;
i++) {
1158 if (score_list_itr == score_list_end)
1160 posMatrix[
i][j] = *score_list_itr;
1166 if (j == alphabet_size) {
1168 for (
i = 0;
i < seq_size;
i++) {
1175 if (
i < seq_size || j < alphabet_size)
1178 if(score_list_itr != score_list_end)
1190 rpsDbInfo.
pssm_file.seekp(0, ios_base::end);
1191 for (
i = 0;
i < seq_size + 1;
i++) {
1200 rpsDbInfo.
pssm_file.seekp( 8 + (seq_index) *
sizeof(
Int4), ios_base::beg);
1235 Int4 cursor, old_cursor;
1239 memset(&header, 0,
sizeof(header));
1244 for (index = cursor = 0; index < lut->
backbone_size; index++) {
1283 cursor *
sizeof(
Int4);
1287 rpsDbInfo.
lookup_file.write((
const char *)&header,
sizeof(header));
1295 memset(&empty_cell, 0,
sizeof(empty_cell));
1297 rpsDbInfo.
lookup_file.write((
const char *)&empty_cell,
sizeof(empty_cell));
1324 string freq_str = rpsDbInfo.
db_name +
".freq";
1349 defline->SetSeqid() = bio.
GetId();
1350 defline_set->
Set().push_back(defline);
1358 *
m_LogFile <<
"Deleted existing BLAST database with identical name." << endl;
1361 int num_smps = smpFilenames.size();
1371 vector<string>::iterator
b = smpFilenames.begin();
1372 vector<string>::iterator
r =
b + num_seqs;
1374 vector<string> vol_smps(
b,
r);
1378 if(residue_seqs > 0) {
1400 for(
int seq_index=0; seq_index < rpsDbInfo.
num_seqs; seq_index++)
1402 string filename = smps[seq_index];
1406 string err = filename +
" does not exists";
1427 string err = filename +
" contains invalid scoremat";
1448 TTaxId taxid = it->GetOrg().GetTaxId();
1504 rpsDbInfo.
aux_file << seq_size <<
"\n";
1525 ostr.write((
char*)&(*it),
sizeof(
Int4));
1533 ostr.write((
char*)&(*it),
sizeof(
Uint4));
1540 vector<string> deltaList;
1542 for(
unsigned int seq_index=0; seq_index < smpFilenames.size(); seq_index++)
1544 string filename = smpFilenames[seq_index];
1548 string err = filename +
" does not exists";
1569 string err = filename +
" contains invalid scoremat";
1577 string err = filename +
" contains no weighted residue frequencies for building delta database";
1583 string err = filename +
" contains no observations information for building delta database";
1589 deltaList.push_back(filename);
1603 list<Int4> FreqOffsets;
1604 list<Int4> ObsrOffsets;
1605 Int4 CurrFreqOffset = 0;
1606 Int4 CurrObsrOffset= 0;
1608 for(
unsigned int seq_index=0; seq_index < smpFilenames.size(); seq_index++)
1610 string filename = smpFilenames[seq_index];
1634 list<double> modify_freqs;
1639 vector<double>
tmp(orig_freqs.size());
1640 list<double>::const_iterator f_itr = orig_freqs.begin();
1642 for(
int i = 0;
i < alphabet_size;
i++)
1644 for(
int j = 0; j < seq_size; j++)
1646 tmp[
i + j*alphabet_size] = *f_itr;
1650 copy(
tmp.begin(),
tmp.end(), modify_freqs.begin());
1656 if(0 == modify_freqs.size())
1657 copy(orig_freqs.begin(), orig_freqs.end(), modify_freqs.begin());
1659 list<double>::iterator p_itr = modify_freqs.begin();
1661 for (
int j=0; j < seq_size; j++)
1663 for(
int i=0;
i < alphabet_size;
i++)
1665 if(modify_freqs.end() == p_itr)
1671 modify_freqs.insert(p_itr, (
BLASTAA_SIZE-alphabet_size), 0);
1675 const list<double> & freqs = (modify_freqs.size()? modify_freqs:orig_freqs );
1678 ObsrOffsets.push_back(CurrObsrOffset);
1680 list<Uint4> ObsrBuff;
1683 unsigned int num_obsr_columns = 0;
1684 list<double>::const_iterator obsr_it = obsr.begin();
1687 double current = *obsr_it;
1691 while (obsr_it != obsr.end() &&
fabs(*obsr_it - current) < 1e-4)
1701 ObsrBuff.push_back(num);
1703 while (obsr_it != obsr.end());
1705 Uint4 num_weighted_counts = 0;
1710 list<Uint4> FreqBuff;
1712 ITERATE (list<double>, it, freqs)
1715 num_weighted_counts++;
1718 if (num_obsr_columns != num_weighted_counts /
BLASTAA_SIZE)
1720 string err =
"Number of frequencies and observations columns do not match in " + filename;
1725 unsigned int padded_size = FreqBuff.size() +
BLASTAA_SIZE;
1726 FreqBuff.resize(padded_size, 0);
1728 CurrFreqOffset += FreqBuff.size();
1729 CurrObsrOffset += ObsrBuff.size();
1735 tmp_obsr_buff.flush();
1736 tmp_freq_buff.flush();
1737 x_WrapUpDelta(rpsDbInfo, tmp_obsr_file, tmp_freq_file, FreqOffsets, ObsrOffsets, CurrFreqOffset, CurrObsrOffset);
1742 const list<double>& observ,
1743 unsigned int alphabet_size)
1746 if (freqs.size() / alphabet_size != observ.size())
1748 string err =
"Number of frequency and observations columns do not match";
1752 ITERATE (list<double>, it, freqs)
1754 unsigned int residue = 0;
1756 while (residue < alphabet_size - 1)
1768 ITERATE (list<double>, it, observ)
1787 list<double> modify_freqs;
1792 vector<double>
tmp(orig_freqs.size());
1793 list<double>::const_iterator f_itr = orig_freqs.begin();
1795 for(
int i = 0;
i < alphabet_size;
i++)
1797 for(
int j = 0; j < seq_size; j++)
1799 tmp[
i + j*alphabet_size] = *f_itr;
1803 copy(
tmp.begin(),
tmp.end(), modify_freqs.begin());
1809 if(0 == modify_freqs.size())
1810 copy(orig_freqs.begin(), orig_freqs.end(), modify_freqs.begin());
1812 list<double>::iterator p_itr = modify_freqs.begin();
1814 for (
int j=0; j < seq_size; j++)
1816 for(
int i=0;
i < alphabet_size;
i++)
1818 if(modify_freqs.end() == p_itr)
1824 modify_freqs.insert(p_itr, (
BLASTAA_SIZE-alphabet_size), 0);
1828 const list<double> & freqs = (modify_freqs.size()? modify_freqs:orig_freqs );
1829 double max_obsr = *max_element(obsr.begin(), obsr.end()) + 1.0;
1833 " was excluded: due to too few independent observations\n";
1840 " was excluded: it conatins an invalid CD \n";
1849 list<Int4> & FreqOffsets, list<Int4> & ObsrOffsets,
Int4 CurrFreqOffset,
Int4 CurrObsrOffset)
1852 ObsrOffsets.push_back(CurrObsrOffset);
1854 string wcounts_str = rpsDbInfo.
db_name +
".wcounts";
1856 if (!wcounts_file.is_open())
1859 string obsr_str = rpsDbInfo.
db_name +
".obsr";
1861 if (!obsr_file.is_open())
1869 wcounts_file.write((
char*)&magic_number,
sizeof(
Int4));
1870 obsr_file.write((
char*)&magic_number,
sizeof(
Int4));
1873 Int4 num_wcounts_records = FreqOffsets.size() -1;
1874 Int4 num_obsr_records = ObsrOffsets.size() -1;
1875 wcounts_file.write((
char*)&num_wcounts_records,
sizeof(
Int4));
1876 obsr_file.write((
char*)&num_obsr_records,
sizeof(
Int4));
1879 wcounts_file.flush();
1880 wcounts_file << tmp_freq_buff.rdbuf();
1881 wcounts_file.flush();
1882 wcounts_file.close();
1886 obsr_file << tmp_obsr_buff.rdbuf();
1908 catch(
const blast::CInputException& e) {
1916 catch (
const blast::CBlastException& e) {
1946 #ifndef SKIP_DOXYGEN_PROCESSING
1947 int main(
int argc,
const char* argv[] )
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
Declares singleton objects to store the version and reference for the BLAST engine.
Routines for creating protein BLAST lookup tables.
BlastAaLookupTable * BlastAaLookupTableDestruct(BlastAaLookupTable *lookup)
Free the lookup table.
void BlastAaLookupIndexQuery(BlastAaLookupTable *lookup, Int4 **matrix, BLAST_SequenceBlk *query, BlastSeqLoc *unmasked_regions, Int4 query_bias)
Index a protein query.
struct RPSBackboneCell RPSBackboneCell
structure defining one cell of the RPS lookup table
#define RPS_HITS_PER_CELL
maximum number of hits in an RPS backbone cell; this may be redundant (have the same value as AA_HITS...
Int4 BlastAaLookupFinalize(BlastAaLookupTable *lookup, EBoneType bone_type)
Pack the data structures comprising a protein lookup table into their final form.
Int4 BlastAaLookupTableNew(const LookupTableOptions *opt, BlastAaLookupTable **lut)
Create a new protein lookup table.
#define BLAST_INPUT_ERROR
Command line binary exit code: error in input query/options.
#define BLAST_UNKNOWN_ERROR
Command line binary exit code: unknown error.
#define BLAST_DATABASE_ERROR
Command line binary exit code: error in database/subject.
BlastSeqLoc * BlastSeqLocFree(BlastSeqLoc *loc)
Deallocate all BlastSeqLoc objects in a chain.
BlastSeqLoc * BlastSeqLocNew(BlastSeqLoc **head, Int4 from, Int4 to)
Create and initialize a new sequence interval.
The structures and functions in blast_options.
Int2 BLAST_FillQuerySetUpOptions(QuerySetUpOptions *options, EBlastProgramType program, const char *filter_string, Uint1 strand_option)
Fill non-default contents of the QuerySetUpOptions.
Int2 BlastQuerySetUpOptionsNew(QuerySetUpOptions **options)
Allocate memory for QuerySetUpOptions and fill with default values.
Int2 BLAST_FillLookupTableOptions(LookupTableOptions *options, EBlastProgramType program, Boolean is_megablast, double threshold, Int4 word_size)
Allocate memory for lookup table options and fill with default values.
Int2 LookupTableOptionsNew(EBlastProgramType program, LookupTableOptions **options)
Allocate memory for lookup table options and fill with default values.
#define BLAST_WORDSIZE_PROT
length of word to trigger an extension.
LookupTableOptions * LookupTableOptionsFree(LookupTableOptions *options)
Deallocates memory for LookupTableOptions*.
QuerySetUpOptions * BlastQuerySetUpOptionsFree(QuerySetUpOptions *options)
Deallocate memory for QuerySetUpOptions.
#define FREQ_RATIO_SCALE
header for RPS blast frequency ratios ('.freq') file
#define RPS_MAGIC_NUM_28
Version number for 28-letter alphabet.
Int2 BLAST_GetProteinGapExistenceExtendParams(const char *matrixName, Int4 *gap_existence, Int4 *gap_extension)
Extract the recommended gap existence and extension values.
#define BLAST_SCORE_MAX
maximum allowed score (for one letter comparison).
Code to build a database given various sources of sequence data.
Class to constrain the values of an argument to those greater than or equal to the value specified in...
const CSeq_id * GetFirstId() const
Defines BLAST error codes (user errors included)
void AddParam(EUsageParams p, int val)
Keeps track of the version of the BLAST engine in the NCBI C++ toolkit.
static void CreateDirectories(const string &dbname)
Create Directory for blast db.
void Create(int seq_size)
unsigned int GetSize(void)
CMakeDbPosMatrix pos_matrix
CNcbiOfstream lookup_file
QuerySetUpOptions * query_options
CNcbiOfstream blocks_file
BlastAaLookupTable * lookup
CRef< CWriteDB > output_db
LookupTableOptions * lookup_options
CMakeProfileDBApp(void)
@inheritDoc
void x_AddCmdOptions(void)
virtual void Init()
@inheritDoc
CheckInputScoremat_RV x_CheckInputScoremat(const CPssmWithParameters &pssm_w_parameters, const string &filename)
CRef< CTaxIdSet > m_Taxids
CNcbiIstream * m_InPssmList
void x_RPSUpdateLookup(CRPS_DbInfo &rpsDbInfo, Int4 seq_size)
vector< string > x_CreateDeltaList(void)
void x_WrapUpDelta(CRPS_DbInfo &rpsDbInfo, CTmpFile &tmp_obsr_file, CTmpFile &tmp_freq_file, list< Int4 > &FreqOffsets, list< Int4 > &ObsrOffsets, Int4 CurrFreqOffset, Int4 CurrObsrOffset)
void x_RPSUpdateStatistics(CRPS_DbInfo &rpsDbInfo, CPssmWithParameters &seq, Int4 seq_size)
virtual int Run()
@inheritDoc
void x_CreateAliasFile(void)
void x_FillInRPSDbParameters(CRPS_DbInfo &rpsDbInfo, CPssmWithParameters &pssm_p)
void x_InitProgramParameters(void)
void x_InitRPSDbInfo(CRPS_DbInfo &rpsDBInfo, Int4 vol, Int4 num_files)
void x_RPS_DbClose(CRPS_DbInfo &rpsDbInfo)
bool x_CheckDelta(const CPssm &pssm, Int4 seq_size, const string &filename)
void x_RPSAddFirstSequence(CRPS_DbInfo &rpsDbInfo, CPssmWithParameters &pssm_w_parameters, bool freq_only)
void x_UpdateRPSDbInfo(CRPS_DbInfo &rpsDbInfo, const CPssmWithParameters &pssm_p)
void x_UpdateDelta(CRPS_DbInfo &rpsDbInfo, vector< string > &smpFilenames)
double m_WordDefaultScoreThreshold
void x_RPSUpdatePSSM(CRPS_DbInfo &rpsDbInfo, const CPssm &pssm, Int4 seq_index, Int4 seq_size)
void x_InitOutputDb(CRPS_DbInfo &rpsDBInfo)
void x_SetupArgDescriptions(void)
CBlastUsageReport m_UsageReport
vector< string > m_VolNames
bool x_ValidateCd(const list< double > &freqs, const list< double > &observ, unsigned int alphabet_size)
void x_UpdateFreqRatios(CRPS_DbInfo &rpsDbInfo, const CPssmWithParameters &pssm_p, Int4 seq_index, Int4 seq_size)
bool x_IsUpdateFreqRatios(const CPssm &p)
vector< string > x_GetSMPFilenames(void)
void x_UpdateCobalt(CRPS_DbInfo &rpsDbInfo, const CPssmWithParameters &pssm_p, Int4 seq_size)
void x_MakeVol(Int4 vol, vector< string > &smps)
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
Computes a PSSM as specified in PSI-BLAST.
void GetQuerySequenceData(CNCBIstdaa &sequence) const
Retrieve the query sequence data in ncbistdaa format.
SIZE_TYPE GetQueryLength() const
Return the query length or 0 if no query is available.
void GetString(string &s) const
Return the data by assigning it to a string.
void FixTaxId(CRef< objects::CBlast_def_line_set > deflines)
Check that each defline has the specified taxid; if not, replace the defline and set the taxid.
void AddTaxId(const objects::CSeq_id &seqid, const TTaxId &taxid)
void SetMappingFromFile(CNcbiIstream &f)
static string MakeShortName(const string &base, int index)
Construct the short name for a volume.
void SetMaxFileSize(Uint8 sz)
Set maximum size for output files.
@ eProtein
Protein database.
void AddSequence(const CBioseq &bs)
Add a sequence as a CBioseq.
EIndexType
Whether and what kind of indices to build.
@ eDefault
Like eFullIndex but also build a numeric Trace ID index.
@ eNoIndex
Build a database without any indices.
static CRef< CBlast_def_line_set > ExtractBioseqDeflines(const CBioseq &bs, bool parse_ids=true, bool long_ids=false, bool scan_bioseq_4_cfastareader_usrobj=false)
Extract Deflines From Bioseq.
void SetDeflines(const CBlast_def_line_set &deflines)
Set the deflines to be used for the sequence.
void Close()
Close the Database.
Constant declarations for command line arguments for BLAST programs.
const string kArgMatrixName
Argument for scoring matrix.
const string kArgDbTitle
Title for the BLAST database.
const string kArgGapExtend
Argument to select the gap extending penalty.
const string kArgGapOpen
Argument to select the gap opening penalty.
const string kArgWordScoreThreshold
Argument to specify the minimum word score such that the word is added to the lookup table.
void Print(const CCompactSAMApplication::AlignInfo &ai)
std::ofstream out("events_result.xml")
main entry point for tests
CRef< objects::CPssmWithParameters > Run()
Runs the PSSM engine to compute the PSSM.
#define BLASTAA_SIZE
Size of aminoacid alphabet.
static CNcbiMatrix< double > * GetFreqRatios(const objects::CPssmWithParameters &pssm)
Returns matrix of BLASTAA_SIZE by query size (dimensions are opposite of what is stored in the BlastS...
void SetFullVersion(CRef< CVersionAPI > version)
Set version data for the program.
void HideStdArgs(THideStdArgs hide_mask)
Set the hide mask for the Hide Std Flags.
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
CVersionInfo GetVersion(void) const
Get the program version information.
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
SStrictId_Tax::TId TTaxId
Taxon id type.
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
#define TAX_ID_FROM(T, value)
@ fHideXmlHelp
Hide XML help description.
@ fHideFullVersion
Hide full version description.
@ fHideDryRun
Hide dryrun description.
@ fHideConffile
Hide configuration file description.
@ eExcludes
One argument excludes another.
@ eInputFile
Name of file (must exist and be readable)
@ eBoolean
{'true', 't', 'false', 'f'}, case-insensitive
@ eDouble
Convertible into a floating point number (double)
@ eString
An arbitrary string.
@ eInteger
Convertible into an integer number (int or Int8)
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
void Error(CExceptionArgs_Base &args)
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
void Info(CExceptionArgs_Base &args)
virtual bool Remove(TRemoveFlags flags=eRecursive) const
Remove a directory entry.
const string & GetFileName(void) const
Return used file name (generated or given in the constructor).
#define MSerial_AsnBinary
const TPrim & Get(void) const
#define MSerial_AsnText
I/O stream manipulators –.
void Reset(void)
Reset reference object.
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
int32_t Int4
4-byte (32-bit) signed integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
bool IsEnabled(void)
Indicates whether application usage statistics collection is enabled for a current reporter instance.
CNcbiIstream & NcbiGetlineEOL(CNcbiIstream &is, string &str, string::size_type *count=NULL)
Read from "is" to "str" the next line (taking into account platform specifics of End-of-Line)
IO_PREFIX::ofstream CNcbiOfstream
Portable alias for ofstream.
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
static string DoubleToString(double value, int precision=-1, TNumToStringFlags flags=0)
Convert double to string.
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
static bool SplitInTwo(const CTempString str, const CTempString delim, string &str1, string &str2, TSplitFlags flags=0)
Split a string into two pieces using the specified delimiters.
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
@ fSplit_MergeDelimiters
Merge adjacent delimiters.
double Elapsed(void) const
Return time elapsed since first Start() or last Restart() call (in seconds).
void Start(void)
Start the timer.
Tdata & Set(void)
Assign a value to data member.
const TFreqRatios & GetFreqRatios(void) const
Get the FreqRatios member data.
const TNumIndeptObsr & GetNumIndeptObsr(void) const
Get the NumIndeptObsr member data.
const TMatrixName & GetMatrixName(void) const
Get the MatrixName member data.
const TQuery & GetQuery(void) const
Get the Query member data.
TNumRows GetNumRows(void) const
Get the NumRows member data.
void SetParams(TParams &value)
Assign a value to Params data member.
bool IsSetFinalData(void) const
Final representation for the PSSM Check if a value has been assigned to FinalData data member.
bool IsSetStop(void) const
end of block on query Check if a value has been assigned to Stop data member.
TH GetH(void) const
Get the H member data.
TKappa GetKappa(void) const
Get the Kappa member data.
const TScores & GetScores(void) const
Get the Scores member data.
const TWeightedResFreqsPerPos & GetWeightedResFreqsPerPos(void) const
Get the WeightedResFreqsPerPos member data.
bool IsSetGapOpen(void) const
gap opening penalty corresponding to the matrix above Check if a value has been assigned to GapOpen d...
TGapExtend GetGapExtend(void) const
Get the GapExtend member data.
TWordScoreThreshold GetWordScoreThreshold(void) const
Get the WordScoreThreshold member data.
TScalingFactor GetScalingFactor(void) const
Get the ScalingFactor member data.
const TBlocks & GetBlocks(void) const
Get the Blocks member data.
bool IsSetStart(void) const
begin of block on query Check if a value has been assigned to Start data member.
bool IsSetWordScoreThreshold(void) const
Word score threshold Check if a value has been assigned to WordScoreThreshold data member.
bool IsSetScalingFactor(void) const
scaling factor used to obtain more precision when building the PSSM.
bool IsSetFreqRatios(void) const
PSSM's frequency ratios Check if a value has been assigned to FreqRatios data member.
TStop GetStop(void) const
Get the Stop member data.
void SetMatrixName(const TMatrixName &value)
Assign a value to MatrixName data member.
bool IsSetIntermediateData(void) const
both intermediateData and finalData can be provided, but at least one of them must be provided.
const TFinalData & GetFinalData(void) const
Get the FinalData member data.
bool IsSetWeightedResFreqsPerPos(void) const
Weighted observed residue frequencies per position of the PSSM.
bool IsSetRpsdbparams(void) const
data needed by formatrpsdb to create RPS-BLAST databases.
void SetPssm(TPssm &value)
Assign a value to Pssm data member.
TNumColumns GetNumColumns(void) const
Get the NumColumns member data.
const TConstraints & GetConstraints(void) const
Get the Constraints member data.
bool IsSetMatrixName(void) const
name of the underlying score matrix whose frequency ratios were used in PSSM construction (e....
bool IsSetNumRows(void) const
The dimensions of the matrix are returned so the client can verify that all data was received.
void SetFinalData(TFinalData &value)
Assign a value to FinalData data member.
TStart GetStart(void) const
Get the Start member data.
bool IsSetQuery(void) const
PSSM representative sequence (master) Check if a value has been assigned to Query data member.
TGapOpen GetGapOpen(void) const
Get the GapOpen member data.
bool IsSetNumIndeptObsr(void) const
Number of independent observations per position of the PSSM NOTE: this is needed for building CDD dat...
bool IsSetConstraints(void) const
alignment constraints needed by sequence-structure threader and other global or local block-alignment...
bool IsSetGapExtend(void) const
gap extension penalty corresponding to the matrix above Check if a value has been assigned to GapExte...
bool IsSetNumColumns(void) const
number of columns Check if a value has been assigned to NumColumns data member.
const TIntermediateData & GetIntermediateData(void) const
Get the IntermediateData member data.
TByRow GetByRow(void) const
Get the ByRow member data.
void SetGapOpen(TGapOpen value)
Assign a value to GapOpen data member.
const TParams & GetParams(void) const
Get the Params member data.
bool IsSetBlocks(void) const
nblocks locations Check if a value has been assigned to Blocks data member.
bool IsSetPssm(void) const
This field is applicable to PSI-BLAST and formatrpsdb.
void SetGapExtend(TGapExtend value)
Assign a value to GapExtend data member.
const TPssm & GetPssm(void) const
Get the Pssm member data.
bool IsSetParams(void) const
This field's rpsdbparams is used to specify the values of options for processing by formatrpsdb.
const TRpsdbparams & GetRpsdbparams(void) const
Get the Rpsdbparams member data.
TLambda GetLambda(void) const
Get the Lambda member data.
const TSeq & GetSeq(void) const
Get the variant data.
bool IsSeq(void) const
Check if variant Seq is selected.
const TTitle & GetTitle(void) const
Get the variant data.
const TId & GetId(void) const
Get the Id member data.
const Tdata & Get(void) const
Get the member data.
bool CanGetDescr(void) const
Check if it is safe to call GetDescr method.
bool IsSetDescr(void) const
descriptors Check if a value has been assigned to Descr data member.
bool IsTitle(void) const
Check if variant Title is selected.
const TDescr & GetDescr(void) const
Get the Descr member data.
unsigned int
A callback function used to compare two keys in a database.
if(yy_accept[yy_current_state])
static void s_WriteInt4List(CNcbiOfstream &ostr, const list< Int4 > &l)
static const string kDefaultMatrix(kMatrixBLOSUM62)
static const string kOutDbName("out")
static CRef< CBlast_def_line_set > s_GenerateBlastDefline(const CBioseq &bio)
static const string kMatrixBLOSUM80
static const string kOutDbType("dbtype")
static const string kExcludeInvalid("exclude_invalid")
#define RPS_NUM_LOOKUP_CELLS
static const string kMatrixPAM250
static const string kMaxSmpFilesPerVol("max_smp_vol")
static const string kMatrixBLOSUM62
static bool s_HasDefline(const CBioseq &bio)
static const Uint4 kFixedPointScaleFactor
static const string kLogFile("logfile")
static const string kDefaultOutIndexFile("true")
static const string kDefaultOutDbType(kOutDbRps)
static const string kDefaultExcludeInvalid("true")
#define RPS_DATABASE_VERSION
static const string kMatrixBLOSUM50
static const string kOutDbRps
static void s_WriteUint4List(CNcbiOfstream &ostr, const list< Uint4 > &l)
static const string kMatrixBLOSUM90
#define kDefaultWordScoreThreshold
#define kDefaultObsrThreshold
static const string kInPssmList("in")
int main(int argc, const char *argv[])
#define kDefaultMaxSmpFilesPerVol
static const string kObsrThreshold("obsr_threshold")
static const string kMatrixPAM70
static const string kMatrixBLOSUM45
static const string kOutDbDelta
static bool s_DeleteMakeprofileDb(const string &name)
static const string kMatrixPAM30
static const string kBinaryScoremat("binary")
static const string kOutDbCobalt
static const string kUseCmdlineThreshold("force")
static const string kPssmScaleFactor("scale")
static const string kOutIndexFile("index")
#define kDefaultPssmScaleFactor
const string version
version string
const struct ncbi::grid::netcache::search::fields::SIZE size
Prototypes for portable math library (ported from C Toolkit)
long BLAST_Nint(double x)
Nearest integer.
#define TRUE
bool replacement for C indicating true.
#define FALSE
bool replacement for C indicating false.
#define INT2_MIN
smallest (most negative) number represented by signed (two byte) short
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Miscellaneous common-use basic types and functionality.
Defines: CTimeFormat - storage class for time format.
std::istream & in(std::istream &in_, double &x_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
C++ API for the PSI-BLAST PSSM engine.
bool DeleteBlastDb(const string &dbpath, CSeqDB::ESeqType seq_type)
Deletes all files associated with a BLAST database.
EBlastDbVersion
BLAST database version.
This file defines several SeqDB utility functions related to byte order and file system portability.
CSeqDB_Substring SeqDB_RemoveDirName(CSeqDB_Substring s)
Returns a filename minus greedy path.
static SLJIT_INLINE sljit_ins l(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
#define row(bind, expected)
structure defining one cell of the compacted lookup table
union AaLookupBackboneCell::@3 payload
union that specifies either entries stored right on the backbone if fewer than AA_HITS_PER_CELL are p...
Int4 entries[3]
if the number of hits for this cell is AA_HITS_PER_CELL or less, the hits are all stored directly in ...
Int4 overflow_cursor
integer offset into the overflow array where the list of hits for this cell begins
Int4 num_used
number of hits stored for this cell
The basic lookup table structure for blastp searches.
void * thick_backbone
may point to BackboneCell, SmallboneCell, or TinyboneCell.
Boolean use_pssm
if TRUE, lookup table construction will assume that the underlying score matrix is position- specific
Int4 threshold
the score threshold for neighboring words
void * overflow
may point to Int4 or Uint2, the overflow array for the compacted lookup table
Int4 backbone_size
number of cells in the backbone
Used to hold a set of positions, mostly used for filtering.
Options needed to construct a lookup table Also needed: query sequence and query length.
Options required for setting up the query sequence.
structure defining one cell of the RPS lookup table
Class which defines sequence id to taxid mapping.
Defines BLAST database construction classes.
void CWriteDB_CreateAliasFile(const string &file_name, const string &db_name, CWriteDB::ESeqType seq_type, const string &gi_file_name, const string &title=string(), EAliasFileFilterType alias_type=eGiList)
Writes an alias file that restricts a database with a gi list.
@ eNoAliasFilterType
Sentinel value.
Code for database files construction.