64 #include "../blast/blast_app_util.hpp"
66 #ifndef SKIP_DOXYGEN_PROCESSING
107 #define kDefaultWordScoreThreshold (9.82)
108 #define kDefaultPssmScaleFactor (100.00)
109 #define kDefaultObsrThreshold (6.0)
110 #define kDefaultMaxSmpFilesPerVol (2500)
114 #define kEpsylon (0.0001)
116 #define DEFAULT_POS_MATRIX_SIZE 2000
117 #define RPS_NUM_LOOKUP_CELLS 32768
118 #if BLASTAA_SIZE == 28
119 #define RPS_DATABASE_VERSION RPS_MAGIC_NUM_28
121 #define RPS_DATABASE_VERSION RPS_MAGIC_NUM
124 #define kSingleVol (-1)
132 void Create(
int seq_size);
253 const string & filename);
268 bool x_ValidateCd(
const list<double>& freqs,
const list<double>& observ,
unsigned int alphabet_size);
270 list<Int4> & FreqOffsets, list<Int4> & ObsrOffsets,
Int4 CurrFreqOffset,
Int4 CurrObsrOffset);
273 void x_UpdateDelta(CRPS_DbInfo & rpsDbInfo, vector<string> & smpFilenames);
318 m_WordDefaultScoreThreshold(0), m_OutDbName(
kEmptyStr),
319 m_OutDbType(
kEmptyStr), m_CreateIndexFile(
false),m_GapOpenPenalty(0),
320 m_GapExtPenalty(0), m_PssmScaleFactor(0),m_Matrix(
kEmptyStr), m_op_mode(op_invalid),
321 m_binary_scoremat(
false), m_MaxSmpFilesPerVol(0), m_NumOfVols(0), m_DbVer(
eBDB_Version5),
323 m_ObsrvThreshold(0), m_ExcludeInvalid(
false),
388 arg_desc->SetUsageContext(
GetArguments().GetProgramBasename(),
389 "Application to create databases for rpsblast, cobalt and deltablast, version "
392 string dflt(
"Default = input file name provided to -");
395 arg_desc->SetCurrentGroup(
"Input options");
397 "Input file that contains a list of smp files (delimited by space, tab or newline)",
401 "Scoremats are in binary format",
404 arg_desc->SetCurrentGroup(
"Configuration options");
405 arg_desc->AddOptionalKey(
kArgDbTitle,
"database_title",
406 "Title for database\n" + dflt,
410 "Minimum word score to add a word to the lookup table",
415 arg_desc->SetCurrentGroup(
"Output options");
416 arg_desc->AddOptionalKey(
kOutDbName,
"database_name",
417 "Name of database to be created\n" +
420 arg_desc->AddDefaultKey(
"blastdb_version",
"version",
421 "Version of BLAST database to be created",
424 arg_desc->SetConstraint(
"blastdb_version",
428 "Maximum number of SMP files per DB volume",
431 arg_desc->AddDefaultKey(
kOutDbType,
"output_db_type",
432 "Output database type: cobalt, delta, rps",
437 "Create Index Files",
440 arg_desc->SetCurrentGroup(
"Used only if scoremat files do not contain PSSM scores, ignored otherwise.");
441 arg_desc->AddOptionalKey(
kArgGapOpen,
"gap_open_penalty",
442 "Cost to open a gap",
445 arg_desc->AddOptionalKey(
kArgGapExtend,
"gap_extend_penalty",
446 "Cost to extend a gap, ",
450 "Pssm Scale factor ",
455 "Scoring matrix name",
461 arg_desc->AddFlag(
"without_freq_ratios",
"Build rps db without freq ratios",
true);
464 arg_desc->SetCurrentGroup(
"Delta Blast Options");
465 arg_desc->AddDefaultKey(
kObsrThreshold,
"observations_threshold",
"Exclude domains with "
466 "with maximum number of independent observations "
470 arg_desc->AddDefaultKey(
kExcludeInvalid,
"exclude_invalid",
"Exclude domains that do "
471 "not pass validation test",
474 arg_desc->SetCurrentGroup(
"Taxonomy options");
475 arg_desc->AddOptionalKey(
"taxid",
"TaxID",
476 "Taxonomy ID to assign to all sequences",
481 arg_desc->AddOptionalKey(
"taxid_map",
"TaxIDMapFile",
482 "Text file mapping sequence IDs to taxonomy IDs.\n"
483 "Format:<SequenceId> <TaxonomyId><newline>",
540 int default_gap_open = 0;
541 int default_gap_extend = 0;
558 if(args.
Exist(
"without_freq_ratios")) {
563 if (args[
"without_freq_ratios"].AsBoolean()) {
587 }
else if (args[
"taxid_map"].
HasValue()) {
618 const string & filename)
628 string err = filename +
" contains no bioseq data";
634 string err = filename +
" contains no info on num of columns or num of rows";
640 string err = filename +
" 's num of columns does not match size of sequence";
647 string err = filename +
" has invalid alphabet size";
659 string err = filename +
" contains no frequence ratios.\n" +
660 "You can use the -without_freq_ratios option to build the database without frequency ratios.\n" +
661 "However composition based statistics will have to be disabled for RPSBLAST searches against\n" +
662 "this database (not recommended).";
671 string err = filename +
" contains no core block to build cobalt database";
690 string err = filename +
" contains no pssm or residue frequencies";
696 string err = filename +
" contains no scoremat";
722 bool isRemoved =
false;
723 static const char * mp_ext[]={
".rps",
".loo",
".aux",
".freq",
".blocks",
".wcounts",
".obsr",
NULL};
724 for(
const char ** mp=mp_ext; *mp !=
NULL; mp++) {
728 if (
CFile(fname).Remove()) {
732 unsigned int index = 0;
735 while (
CFile(vfname).Remove()) {
763 string rps_str = rpsDbInfo.
db_name +
".rps";
768 string lookup_str = rpsDbInfo.
db_name +
".loo";
773 string aux_str = rpsDbInfo.
db_name +
".aux";
774 rpsDbInfo.
aux_file.open(aux_str.c_str());
778 string freq_str = rpsDbInfo.
db_name +
".freq";
793 rpsDbInfo.
pssm_file.write((
char *) &num_files,
sizeof(
Int4));
794 rpsDbInfo.
freq_file.write((
char *) &num_files,
sizeof(
Int4));
795 for (
Int4 i = 0;
i <= num_files;
i++)
803 string blocks_str = rpsDbInfo.
db_name +
".blocks";
889 vector <char> query_v = query_stdaa.
Get();
891 if((
Int4) (query_v.size()) != seq_size)
897 for(
unsigned int i = 0;
i < query_v.size();
i++)
903 matrix_name.c_str(), rpsDbInfo.
gap_open,
960 double threshold = rpsDbInfo.
scale_factor * wordScoreThreshold;
991 rpsDbInfo.
aux_file << scientific << 0.0 <<
"\n";
992 rpsDbInfo.
aux_file << scientific << 0.0 <<
"\n";
1006 list<CRef<CCoreBlock> >::const_iterator itr = block_list.begin();
1010 while(itr != block_list.end())
1024 if(descr_list.size() > 0)
1033 seq_id_str = accession;
1062 list<double>::const_iterator itr_fr = freq_ratios.begin();
1063 rpsDbInfo.
freq_file.seekp(0, ios_base::end);
1066 for (
i = 0;
i < seq_size;
i++) {
1067 for (j = 0; j < alphabet_size; j++) {
1068 if (itr_fr == freq_ratios.end())
1082 for (
i = 0;
i < seq_size;
i++) {
1090 memset(
row, 0,
sizeof(
row));
1093 rpsDbInfo.
freq_file.seekp( 8 + (seq_index) *
sizeof(
Int4), ios_base::beg);
1120 if (
NULL == posMatrix)
1155 for (
i = 0;
i < seq_size;
i++) {
1156 for (j = 0; j < alphabet_size; j++) {
1157 if (score_list_itr == score_list_end)
1159 posMatrix[
i][j] = *score_list_itr;
1162 if (j < alphabet_size)
1170 for (j = 0; j < alphabet_size; j++) {
1171 for (
i = 0;
i < seq_size;
i++) {
1172 if (score_list_itr == score_list_end)
1174 posMatrix[
i][j] = *score_list_itr;
1180 if (j == alphabet_size) {
1182 for (
i = 0;
i < seq_size;
i++) {
1189 if (
i < seq_size || j < alphabet_size)
1192 if(score_list_itr != score_list_end)
1204 rpsDbInfo.
pssm_file.seekp(0, ios_base::end);
1205 for (
i = 0;
i < seq_size + 1;
i++) {
1214 rpsDbInfo.
pssm_file.seekp( 8 + (seq_index) *
sizeof(
Int4), ios_base::beg);
1249 Int4 cursor, old_cursor;
1253 memset(&header, 0,
sizeof(header));
1258 for (index = cursor = 0; index < lut->
backbone_size; index++) {
1297 cursor *
sizeof(
Int4);
1301 rpsDbInfo.
lookup_file.write((
const char *)&header,
sizeof(header));
1309 memset(&empty_cell, 0,
sizeof(empty_cell));
1311 rpsDbInfo.
lookup_file.write((
const char *)&empty_cell,
sizeof(empty_cell));
1338 string freq_str = rpsDbInfo.
db_name +
".freq";
1363 defline->SetSeqid() = bio.
GetId();
1364 defline_set->
Set().push_back(defline);
1372 *
m_LogFile <<
"Deleted existing BLAST database with identical name." << endl;
1375 int num_smps = smpFilenames.size();
1385 vector<string>::iterator
b = smpFilenames.begin();
1386 vector<string>::iterator
r =
b + num_seqs;
1388 vector<string> vol_smps(
b,
r);
1392 if(residue_seqs > 0) {
1414 for(
int seq_index=0; seq_index < rpsDbInfo.
num_seqs; seq_index++)
1416 string filename = smps[seq_index];
1420 string err = filename +
" does not exists";
1441 string err = filename +
" contains invalid scoremat";
1462 TTaxId taxid = it->GetOrg().GetTaxId();
1518 rpsDbInfo.
aux_file << seq_size <<
"\n";
1539 ostr.write((
char*)&(*it),
sizeof(
Int4));
1547 ostr.write((
char*)&(*it),
sizeof(
Uint4));
1554 vector<string> deltaList;
1556 for(
unsigned int seq_index=0; seq_index < smpFilenames.size(); seq_index++)
1558 string filename = smpFilenames[seq_index];
1562 string err = filename +
" does not exists";
1583 string err = filename +
" contains invalid scoremat";
1591 string err = filename +
" contains no weighted residue frequencies for building delta database";
1597 string err = filename +
" contains no observations information for building delta database";
1603 deltaList.push_back(filename);
1617 list<Int4> FreqOffsets;
1618 list<Int4> ObsrOffsets;
1619 Int4 CurrFreqOffset = 0;
1620 Int4 CurrObsrOffset= 0;
1622 for(
unsigned int seq_index=0; seq_index < smpFilenames.size(); seq_index++)
1624 string filename = smpFilenames[seq_index];
1648 list<double> modify_freqs;
1653 vector<double>
tmp(orig_freqs.size());
1654 list<double>::const_iterator f_itr = orig_freqs.begin();
1656 for(
int i = 0;
i < alphabet_size;
i++)
1658 for(
int j = 0; j < seq_size; j++)
1660 tmp[
i + j*alphabet_size] = *f_itr;
1664 copy(
tmp.begin(),
tmp.end(), modify_freqs.begin());
1670 if(0 == modify_freqs.size())
1671 copy(orig_freqs.begin(), orig_freqs.end(), modify_freqs.begin());
1673 list<double>::iterator p_itr = modify_freqs.begin();
1675 for (
int j=0; j < seq_size; j++)
1677 for(
int i=0;
i < alphabet_size;
i++)
1679 if(modify_freqs.end() == p_itr)
1685 modify_freqs.insert(p_itr, (
BLASTAA_SIZE-alphabet_size), 0);
1689 const list<double> & freqs = (modify_freqs.size()? modify_freqs:orig_freqs );
1692 ObsrOffsets.push_back(CurrObsrOffset);
1694 list<Uint4> ObsrBuff;
1697 unsigned int num_obsr_columns = 0;
1698 list<double>::const_iterator obsr_it = obsr.begin();
1701 double current = *obsr_it;
1705 while (obsr_it != obsr.end() &&
fabs(*obsr_it - current) < 1e-4)
1715 ObsrBuff.push_back(num);
1717 while (obsr_it != obsr.end());
1719 Uint4 num_weighted_counts = 0;
1724 list<Uint4> FreqBuff;
1726 ITERATE (list<double>, it, freqs)
1729 num_weighted_counts++;
1732 if (num_obsr_columns != num_weighted_counts /
BLASTAA_SIZE)
1734 string err =
"Number of frequencies and observations columns do not match in " + filename;
1739 unsigned int padded_size = FreqBuff.size() +
BLASTAA_SIZE;
1740 FreqBuff.resize(padded_size, 0);
1742 CurrFreqOffset += FreqBuff.size();
1743 CurrObsrOffset += ObsrBuff.size();
1749 tmp_obsr_buff.flush();
1750 tmp_freq_buff.flush();
1751 x_WrapUpDelta(rpsDbInfo, tmp_obsr_file, tmp_freq_file, FreqOffsets, ObsrOffsets, CurrFreqOffset, CurrObsrOffset);
1756 const list<double>& observ,
1757 unsigned int alphabet_size)
1760 if (freqs.size() / alphabet_size != observ.size())
1762 string err =
"Number of frequency and observations columns do not match";
1766 ITERATE (list<double>, it, freqs)
1768 unsigned int residue = 0;
1770 while (residue < alphabet_size - 1)
1782 ITERATE (list<double>, it, observ)
1801 list<double> modify_freqs;
1806 vector<double>
tmp(orig_freqs.size());
1807 list<double>::const_iterator f_itr = orig_freqs.begin();
1809 for(
int i = 0;
i < alphabet_size;
i++)
1811 for(
int j = 0; j < seq_size; j++)
1813 tmp[
i + j*alphabet_size] = *f_itr;
1817 copy(
tmp.begin(),
tmp.end(), modify_freqs.begin());
1823 if(0 == modify_freqs.size())
1824 copy(orig_freqs.begin(), orig_freqs.end(), modify_freqs.begin());
1826 list<double>::iterator p_itr = modify_freqs.begin();
1828 for (
int j=0; j < seq_size; j++)
1830 for(
int i=0;
i < alphabet_size;
i++)
1832 if(modify_freqs.end() == p_itr)
1838 modify_freqs.insert(p_itr, (
BLASTAA_SIZE-alphabet_size), 0);
1842 const list<double> & freqs = (modify_freqs.size()? modify_freqs:orig_freqs );
1843 double max_obsr = *max_element(obsr.begin(), obsr.end()) + 1.0;
1847 " was excluded: due to too few independent observations\n";
1854 " was excluded: it conatins an invalid CD \n";
1863 list<Int4> & FreqOffsets, list<Int4> & ObsrOffsets,
Int4 CurrFreqOffset,
Int4 CurrObsrOffset)
1866 ObsrOffsets.push_back(CurrObsrOffset);
1868 string wcounts_str = rpsDbInfo.
db_name +
".wcounts";
1870 if (!wcounts_file.is_open())
1873 string obsr_str = rpsDbInfo.
db_name +
".obsr";
1875 if (!obsr_file.is_open())
1883 wcounts_file.write((
char*)&magic_number,
sizeof(
Int4));
1884 obsr_file.write((
char*)&magic_number,
sizeof(
Int4));
1887 Int4 num_wcounts_records = FreqOffsets.size() -1;
1888 Int4 num_obsr_records = ObsrOffsets.size() -1;
1889 wcounts_file.write((
char*)&num_wcounts_records,
sizeof(
Int4));
1890 obsr_file.write((
char*)&num_obsr_records,
sizeof(
Int4));
1893 wcounts_file.flush();
1894 wcounts_file << tmp_freq_buff.rdbuf();
1895 wcounts_file.flush();
1896 wcounts_file.close();
1900 obsr_file << tmp_obsr_buff.rdbuf();
1922 catch(
const blast::CInputException& e) {
1930 catch (
const blast::CBlastException& e) {
1960 #ifndef SKIP_DOXYGEN_PROCESSING
1961 int main(
int argc,
const char* argv[] )
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
Declares singleton objects to store the version and reference for the BLAST engine.
Routines for creating protein BLAST lookup tables.
BlastAaLookupTable * BlastAaLookupTableDestruct(BlastAaLookupTable *lookup)
Free the lookup table.
void BlastAaLookupIndexQuery(BlastAaLookupTable *lookup, Int4 **matrix, BLAST_SequenceBlk *query, BlastSeqLoc *unmasked_regions, Int4 query_bias)
Index a protein query.
struct RPSBackboneCell RPSBackboneCell
structure defining one cell of the RPS lookup table
#define RPS_HITS_PER_CELL
maximum number of hits in an RPS backbone cell; this may be redundant (have the same value as AA_HITS...
Int4 BlastAaLookupFinalize(BlastAaLookupTable *lookup, EBoneType bone_type)
Pack the data structures comprising a protein lookup table into their final form.
Int4 BlastAaLookupTableNew(const LookupTableOptions *opt, BlastAaLookupTable **lut)
Create a new protein lookup table.
#define BLAST_INPUT_ERROR
Command line binary exit code: error in input query/options.
#define BLAST_UNKNOWN_ERROR
Command line binary exit code: unknown error.
#define BLAST_DATABASE_ERROR
Command line binary exit code: error in database/subject.
BlastSeqLoc * BlastSeqLocFree(BlastSeqLoc *loc)
Deallocate all BlastSeqLoc objects in a chain.
BlastSeqLoc * BlastSeqLocNew(BlastSeqLoc **head, Int4 from, Int4 to)
Create and initialize a new sequence interval.
The structures and functions in blast_options.
Int2 BLAST_FillQuerySetUpOptions(QuerySetUpOptions *options, EBlastProgramType program, const char *filter_string, Uint1 strand_option)
Fill non-default contents of the QuerySetUpOptions.
Int2 BlastQuerySetUpOptionsNew(QuerySetUpOptions **options)
Allocate memory for QuerySetUpOptions and fill with default values.
Int2 BLAST_FillLookupTableOptions(LookupTableOptions *options, EBlastProgramType program, Boolean is_megablast, double threshold, Int4 word_size)
Allocate memory for lookup table options and fill with default values.
Int2 LookupTableOptionsNew(EBlastProgramType program, LookupTableOptions **options)
Allocate memory for lookup table options and fill with default values.
#define BLAST_WORDSIZE_PROT
length of word to trigger an extension.
LookupTableOptions * LookupTableOptionsFree(LookupTableOptions *options)
Deallocates memory for LookupTableOptions*.
QuerySetUpOptions * BlastQuerySetUpOptionsFree(QuerySetUpOptions *options)
Deallocate memory for QuerySetUpOptions.
#define FREQ_RATIO_SCALE
header for RPS blast frequency ratios ('.freq') file
#define RPS_MAGIC_NUM_28
Version number for 28-letter alphabet.
Int2 BLAST_GetProteinGapExistenceExtendParams(const char *matrixName, Int4 *gap_existence, Int4 *gap_extension)
Extract the recommended gap existence and extension values.
#define BLAST_SCORE_MAX
maximum allowed score (for one letter comparison).
Code to build a database given various sources of sequence data.
Class to constrain the values of an argument to those greater than or equal to the value specified in...
const CSeq_id * GetFirstId() const
Defines BLAST error codes (user errors included)
void AddParam(EUsageParams p, int val)
Keeps track of the version of the BLAST engine in the NCBI C++ toolkit.
static void CreateDirectories(const string &dbname)
Create Directory for blast db.
void Create(int seq_size)
unsigned int GetSize(void)
CMakeDbPosMatrix pos_matrix
CNcbiOfstream lookup_file
QuerySetUpOptions * query_options
CNcbiOfstream blocks_file
BlastAaLookupTable * lookup
CRef< CWriteDB > output_db
LookupTableOptions * lookup_options
CMakeProfileDBApp(void)
@inheritDoc
void x_AddCmdOptions(void)
virtual void Init()
@inheritDoc
CheckInputScoremat_RV x_CheckInputScoremat(const CPssmWithParameters &pssm_w_parameters, const string &filename)
CRef< CTaxIdSet > m_Taxids
CNcbiIstream * m_InPssmList
void x_RPSUpdateLookup(CRPS_DbInfo &rpsDbInfo, Int4 seq_size)
vector< string > x_CreateDeltaList(void)
void x_WrapUpDelta(CRPS_DbInfo &rpsDbInfo, CTmpFile &tmp_obsr_file, CTmpFile &tmp_freq_file, list< Int4 > &FreqOffsets, list< Int4 > &ObsrOffsets, Int4 CurrFreqOffset, Int4 CurrObsrOffset)
void x_RPSUpdateStatistics(CRPS_DbInfo &rpsDbInfo, CPssmWithParameters &seq, Int4 seq_size)
virtual int Run()
@inheritDoc
void x_CreateAliasFile(void)
void x_FillInRPSDbParameters(CRPS_DbInfo &rpsDbInfo, CPssmWithParameters &pssm_p)
void x_InitProgramParameters(void)
void x_InitRPSDbInfo(CRPS_DbInfo &rpsDBInfo, Int4 vol, Int4 num_files)
void x_RPS_DbClose(CRPS_DbInfo &rpsDbInfo)
bool x_CheckDelta(const CPssm &pssm, Int4 seq_size, const string &filename)
void x_RPSAddFirstSequence(CRPS_DbInfo &rpsDbInfo, CPssmWithParameters &pssm_w_parameters, bool freq_only)
void x_UpdateRPSDbInfo(CRPS_DbInfo &rpsDbInfo, const CPssmWithParameters &pssm_p)
void x_UpdateDelta(CRPS_DbInfo &rpsDbInfo, vector< string > &smpFilenames)
double m_WordDefaultScoreThreshold
void x_RPSUpdatePSSM(CRPS_DbInfo &rpsDbInfo, const CPssm &pssm, Int4 seq_index, Int4 seq_size)
void x_InitOutputDb(CRPS_DbInfo &rpsDBInfo)
void x_SetupArgDescriptions(void)
CBlastUsageReport m_UsageReport
vector< string > m_VolNames
bool x_ValidateCd(const list< double > &freqs, const list< double > &observ, unsigned int alphabet_size)
void x_UpdateFreqRatios(CRPS_DbInfo &rpsDbInfo, const CPssmWithParameters &pssm_p, Int4 seq_index, Int4 seq_size)
bool x_IsUpdateFreqRatios(const CPssm &p)
vector< string > x_GetSMPFilenames(void)
void x_UpdateCobalt(CRPS_DbInfo &rpsDbInfo, const CPssmWithParameters &pssm_p, Int4 seq_size)
void x_MakeVol(Int4 vol, vector< string > &smps)
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
Computes a PSSM as specified in PSI-BLAST.
void GetQuerySequenceData(CNCBIstdaa &sequence) const
Retrieve the query sequence data in ncbistdaa format.
SIZE_TYPE GetQueryLength() const
Return the query length or 0 if no query is available.
void GetString(string &s) const
Return the data by assigning it to a string.
void FixTaxId(CRef< objects::CBlast_def_line_set > deflines)
Check that each defline has the specified taxid; if not, replace the defline and set the taxid.
void AddTaxId(const objects::CSeq_id &seqid, const TTaxId &taxid)
void SetMappingFromFile(CNcbiIstream &f)
static string MakeShortName(const string &base, int index)
Construct the short name for a volume.
void SetMaxFileSize(Uint8 sz)
Set maximum size for output files.
@ eProtein
Protein database.
void AddSequence(const CBioseq &bs)
Add a sequence as a CBioseq.
EIndexType
Whether and what kind of indices to build.
@ eDefault
Like eFullIndex but also build a numeric Trace ID index.
@ eNoIndex
Build a database without any indices.
static CRef< CBlast_def_line_set > ExtractBioseqDeflines(const CBioseq &bs, bool parse_ids=true, bool long_ids=false, bool scan_bioseq_4_cfastareader_usrobj=false)
Extract Deflines From Bioseq.
void SetDeflines(const CBlast_def_line_set &deflines)
Set the deflines to be used for the sequence.
void Close()
Close the Database.
Constant declarations for command line arguments for BLAST programs.
const string kArgMatrixName
Argument for scoring matrix.
const string kArgDbTitle
Title for the BLAST database.
const string kArgGapExtend
Argument to select the gap extending penalty.
const string kArgGapOpen
Argument to select the gap opening penalty.
const string kArgWordScoreThreshold
Argument to specify the minimum word score such that the word is added to the lookup table.
void Print(const CCompactSAMApplication::AlignInfo &ai)
std::ofstream out("events_result.xml")
main entry point for tests
CRef< objects::CPssmWithParameters > Run()
Runs the PSSM engine to compute the PSSM.
#define BLASTAA_SIZE
Size of aminoacid alphabet.
static CNcbiMatrix< double > * GetFreqRatios(const objects::CPssmWithParameters &pssm)
Returns matrix of BLASTAA_SIZE by query size (dimensions are opposite of what is stored in the BlastS...
void SetFullVersion(CRef< CVersionAPI > version)
Set version data for the program.
void HideStdArgs(THideStdArgs hide_mask)
Set the hide mask for the Hide Std Flags.
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
CVersionInfo GetVersion(void) const
Get the program version information.
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
SStrictId_Tax::TId TTaxId
Taxon id type.
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
#define TAX_ID_FROM(T, value)
@ fHideXmlHelp
Hide XML help description.
@ fHideFullVersion
Hide full version description.
@ fHideDryRun
Hide dryrun description.
@ fHideConffile
Hide configuration file description.
bool Exist(const string &name) const
Check existence of argument description.
@ eExcludes
One argument excludes another.
@ eInputFile
Name of file (must exist and be readable)
@ eBoolean
{'true', 't', 'false', 'f'}, case-insensitive
@ eDouble
Convertible into a floating point number (double)
@ eString
An arbitrary string.
@ eInteger
Convertible into an integer number (int or Int8)
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
void Error(CExceptionArgs_Base &args)
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
void Info(CExceptionArgs_Base &args)
virtual bool Remove(TRemoveFlags flags=eRecursive) const
Remove a directory entry.
const string & GetFileName(void) const
Return used file name (generated or given in the constructor).
#define MSerial_AsnBinary
const TPrim & Get(void) const
#define MSerial_AsnText
I/O stream manipulators –.
void Reset(void)
Reset reference object.
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
int32_t Int4
4-byte (32-bit) signed integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
bool IsEnabled(void)
Indicates whether application usage statistics collection is enabled for a current reporter instance.
CNcbiIstream & NcbiGetlineEOL(CNcbiIstream &is, string &str, string::size_type *count=NULL)
Read from "is" to "str" the next line (taking into account platform specifics of End-of-Line)
IO_PREFIX::ofstream CNcbiOfstream
Portable alias for ofstream.
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
static string DoubleToString(double value, int precision=-1, TNumToStringFlags flags=0)
Convert double to string.
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
static bool SplitInTwo(const CTempString str, const CTempString delim, string &str1, string &str2, TSplitFlags flags=0)
Split a string into two pieces using the specified delimiters.
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
@ fSplit_MergeDelimiters
Merge adjacent delimiters.
double Elapsed(void) const
Return time elapsed since first Start() or last Restart() call (in seconds).
void Start(void)
Start the timer.
Tdata & Set(void)
Assign a value to data member.
const TFreqRatios & GetFreqRatios(void) const
Get the FreqRatios member data.
const TNumIndeptObsr & GetNumIndeptObsr(void) const
Get the NumIndeptObsr member data.
const TMatrixName & GetMatrixName(void) const
Get the MatrixName member data.
const TQuery & GetQuery(void) const
Get the Query member data.
TNumRows GetNumRows(void) const
Get the NumRows member data.
void SetParams(TParams &value)
Assign a value to Params data member.
bool IsSetFinalData(void) const
Final representation for the PSSM Check if a value has been assigned to FinalData data member.
bool IsSetStop(void) const
end of block on query Check if a value has been assigned to Stop data member.
TH GetH(void) const
Get the H member data.
TKappa GetKappa(void) const
Get the Kappa member data.
const TScores & GetScores(void) const
Get the Scores member data.
const TWeightedResFreqsPerPos & GetWeightedResFreqsPerPos(void) const
Get the WeightedResFreqsPerPos member data.
bool IsSetGapOpen(void) const
gap opening penalty corresponding to the matrix above Check if a value has been assigned to GapOpen d...
TGapExtend GetGapExtend(void) const
Get the GapExtend member data.
TWordScoreThreshold GetWordScoreThreshold(void) const
Get the WordScoreThreshold member data.
TScalingFactor GetScalingFactor(void) const
Get the ScalingFactor member data.
const TBlocks & GetBlocks(void) const
Get the Blocks member data.
bool IsSetStart(void) const
begin of block on query Check if a value has been assigned to Start data member.
bool IsSetWordScoreThreshold(void) const
Word score threshold Check if a value has been assigned to WordScoreThreshold data member.
bool IsSetScalingFactor(void) const
scaling factor used to obtain more precision when building the PSSM.
bool IsSetFreqRatios(void) const
PSSM's frequency ratios Check if a value has been assigned to FreqRatios data member.
TStop GetStop(void) const
Get the Stop member data.
void SetMatrixName(const TMatrixName &value)
Assign a value to MatrixName data member.
bool IsSetIntermediateData(void) const
both intermediateData and finalData can be provided, but at least one of them must be provided.
const TFinalData & GetFinalData(void) const
Get the FinalData member data.
bool IsSetWeightedResFreqsPerPos(void) const
Weighted observed residue frequencies per position of the PSSM.
bool IsSetRpsdbparams(void) const
data needed by formatrpsdb to create RPS-BLAST databases.
void SetPssm(TPssm &value)
Assign a value to Pssm data member.
TNumColumns GetNumColumns(void) const
Get the NumColumns member data.
const TConstraints & GetConstraints(void) const
Get the Constraints member data.
bool IsSetMatrixName(void) const
name of the underlying score matrix whose frequency ratios were used in PSSM construction (e....
bool IsSetNumRows(void) const
The dimensions of the matrix are returned so the client can verify that all data was received.
void SetFinalData(TFinalData &value)
Assign a value to FinalData data member.
TStart GetStart(void) const
Get the Start member data.
bool IsSetQuery(void) const
PSSM representative sequence (master) Check if a value has been assigned to Query data member.
TGapOpen GetGapOpen(void) const
Get the GapOpen member data.
bool IsSetNumIndeptObsr(void) const
Number of independent observations per position of the PSSM NOTE: this is needed for building CDD dat...
bool IsSetConstraints(void) const
alignment constraints needed by sequence-structure threader and other global or local block-alignment...
bool IsSetGapExtend(void) const
gap extension penalty corresponding to the matrix above Check if a value has been assigned to GapExte...
bool IsSetNumColumns(void) const
number of columns Check if a value has been assigned to NumColumns data member.
const TIntermediateData & GetIntermediateData(void) const
Get the IntermediateData member data.
TByRow GetByRow(void) const
Get the ByRow member data.
void SetGapOpen(TGapOpen value)
Assign a value to GapOpen data member.
const TParams & GetParams(void) const
Get the Params member data.
bool IsSetBlocks(void) const
nblocks locations Check if a value has been assigned to Blocks data member.
bool IsSetPssm(void) const
This field is applicable to PSI-BLAST and formatrpsdb.
void SetGapExtend(TGapExtend value)
Assign a value to GapExtend data member.
const TPssm & GetPssm(void) const
Get the Pssm member data.
bool IsSetParams(void) const
This field's rpsdbparams is used to specify the values of options for processing by formatrpsdb.
const TRpsdbparams & GetRpsdbparams(void) const
Get the Rpsdbparams member data.
TLambda GetLambda(void) const
Get the Lambda member data.
const TSeq & GetSeq(void) const
Get the variant data.
bool IsSeq(void) const
Check if variant Seq is selected.
const TTitle & GetTitle(void) const
Get the variant data.
const TId & GetId(void) const
Get the Id member data.
const Tdata & Get(void) const
Get the member data.
bool CanGetDescr(void) const
Check if it is safe to call GetDescr method.
bool IsSetDescr(void) const
descriptors Check if a value has been assigned to Descr data member.
bool IsTitle(void) const
Check if variant Title is selected.
const TDescr & GetDescr(void) const
Get the Descr member data.
unsigned int
A callback function used to compare two keys in a database.
if(yy_accept[yy_current_state])
static void s_WriteInt4List(CNcbiOfstream &ostr, const list< Int4 > &l)
static const string kDefaultMatrix(kMatrixBLOSUM62)
static const string kOutDbName("out")
static CRef< CBlast_def_line_set > s_GenerateBlastDefline(const CBioseq &bio)
static const string kMatrixBLOSUM80
static const string kOutDbType("dbtype")
static const string kExcludeInvalid("exclude_invalid")
#define RPS_NUM_LOOKUP_CELLS
static const string kMatrixPAM250
static const string kMaxSmpFilesPerVol("max_smp_vol")
static const string kMatrixBLOSUM62
static bool s_HasDefline(const CBioseq &bio)
static const Uint4 kFixedPointScaleFactor
static const string kLogFile("logfile")
static const string kDefaultOutIndexFile("true")
static const string kDefaultOutDbType(kOutDbRps)
static const string kDefaultExcludeInvalid("true")
#define RPS_DATABASE_VERSION
static const string kMatrixBLOSUM50
static const string kOutDbRps
static void s_WriteUint4List(CNcbiOfstream &ostr, const list< Uint4 > &l)
static const string kMatrixBLOSUM90
#define kDefaultWordScoreThreshold
#define kDefaultObsrThreshold
static const string kInPssmList("in")
int main(int argc, const char *argv[])
#define kDefaultMaxSmpFilesPerVol
static const string kObsrThreshold("obsr_threshold")
static const string kMatrixPAM70
static const string kMatrixBLOSUM45
static const string kOutDbDelta
static bool s_DeleteMakeprofileDb(const string &name)
static const string kMatrixPAM30
static const string kBinaryScoremat("binary")
static const string kOutDbCobalt
static const string kUseCmdlineThreshold("force")
static const string kPssmScaleFactor("scale")
static const string kOutIndexFile("index")
#define kDefaultPssmScaleFactor
const struct ncbi::grid::netcache::search::fields::SIZE size
Prototypes for portable math library (ported from C Toolkit)
long BLAST_Nint(double x)
Nearest integer.
#define TRUE
bool replacement for C indicating true.
#define FALSE
bool replacement for C indicating false.
#define INT2_MIN
smallest (most negative) number represented by signed (two byte) short
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Miscellaneous common-use basic types and functionality.
Defines: CTimeFormat - storage class for time format.
std::istream & in(std::istream &in_, double &x_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
double f(double x_, const double &y_)
C++ API for the PSI-BLAST PSSM engine.
bool DeleteBlastDb(const string &dbpath, CSeqDB::ESeqType seq_type)
Deletes all files associated with a BLAST database.
EBlastDbVersion
BLAST database version.
This file defines several SeqDB utility functions related to byte order and file system portability.
CSeqDB_Substring SeqDB_RemoveDirName(CSeqDB_Substring s)
Returns a filename minus greedy path.
#define row(bind, expected)
structure defining one cell of the compacted lookup table
union AaLookupBackboneCell::@3 payload
union that specifies either entries stored right on the backbone if fewer than AA_HITS_PER_CELL are p...
Int4 entries[3]
if the number of hits for this cell is AA_HITS_PER_CELL or less, the hits are all stored directly in ...
Int4 overflow_cursor
integer offset into the overflow array where the list of hits for this cell begins
Int4 num_used
number of hits stored for this cell
The basic lookup table structure for blastp searches.
void * thick_backbone
may point to BackboneCell, SmallboneCell, or TinyboneCell.
Boolean use_pssm
if TRUE, lookup table construction will assume that the underlying score matrix is position-specific
Int4 threshold
the score threshold for neighboring words
void * overflow
may point to Int4 or Uint2, the overflow array for the compacted lookup table
Int4 backbone_size
number of cells in the backbone
Used to hold a set of positions, mostly used for filtering.
Options needed to construct a lookup table Also needed: query sequence and query length.
Options required for setting up the query sequence.
structure defining one cell of the RPS lookup table
Class which defines sequence id to taxid mapping.
Defines BLAST database construction classes.
void CWriteDB_CreateAliasFile(const string &file_name, const string &db_name, CWriteDB::ESeqType seq_type, const string &gi_file_name, const string &title=string(), EAliasFileFilterType alias_type=eGiList)
Writes an alias file that restricts a database with a gi list.
@ eNoAliasFilterType
Sentinel value.
Code for database files construction.