NCBI C++ ToolKit
Classes | Functions
seqdbcommon.cpp File Reference

Definitions of various helper functions for SeqDB. More...

#include <ncbi_pch.hpp>
#include <corelib/metareg.hpp>
#include <corelib/ncbienv.hpp>
#include <corelib/ncbifile.hpp>
#include <objtools/blast/seqdb_reader/seqdbcommon.hpp>
#include <util/sequtil/sequtil.hpp>
#include <util/sequtil/sequtil_convert.hpp>
#include <objects/seq/seq__.hpp>
#include <objects/general/general__.hpp>
#include <objtools/blast/seqdb_reader/impl/seqdbgeneral.hpp>
#include <objtools/blast/seqdb_reader/impl/seqdbatlas.hpp>
#include <objtools/blast/seqdb_reader/seqidlist_reader.hpp>
#include <algorithm>
+ Include dependency graph for seqdbcommon.cpp:

Go to the source code of this file.

Go to the SVN repository for this file.

Classes

class  CSeqDB_FileExistence
 File existence test interface. More...
 
class  CSeqDB_AtlasAccessor
 Check file existence using CSeqDBAtlas. More...
 
class  CSeqDB_SimpleAccessor
 Check file existence using CFile. More...
 
class  CSeqDB_SortOidLessThan
 Compare SGiOid structs by OID. More...
 
class  CSeqDB_SortGiLessThan
 Compare SGiOid structs by GI. More...
 
class  CSeqDB_SortPigLessThan
 
class  CSeqDB_SortTiLessThan
 Compare STiOid structs by TI. More...
 
class  CSeqDB_SortSiLessThan
 Compare SSeqIdOid structs by SeqId. More...
 

Functions

const string kSeqDBGroupAliasFileName ("index.alx")
 
CSeqDB_Substring SeqDB_RemoveDirName (CSeqDB_Substring s)
 Returns a filename minus greedy path. More...
 
CSeqDB_Substring SeqDB_RemoveFileName (CSeqDB_Substring s)
 Returns a path minus filename. More...
 
CSeqDB_Substring SeqDB_RemoveExtn (CSeqDB_Substring s)
 Returns a filename minus extension. More...
 
bool SeqDB_SplitString (CSeqDB_Substring &buffer, CSeqDB_Substring &front, char delim)
 Parse a prefix from a substring. More...
 
void SeqDB_CombinePath (const CSeqDB_Substring &one, const CSeqDB_Substring &two, const CSeqDB_Substring *extn, string &outp)
 Combine a filesystem path and file name. More...
 
bool SeqDB_CompareVolume (const string &s1, const string &s2)
 Compares two volume file names and determines the volume order. More...
 
static bool s_SeqDB_DBExists (const string &dbname, char dbtype, CSeqDB_FileExistence &access, bool linkoutdb_search)
 Test whether an index or alias file exists. More...
 
static string s_GetPathSplitter ()
 Returns the character used to separate path components in the current operating system or platform. More...
 
void SeqDB_ConvertOSPath (string &dbs)
 Change path delimiters to platform preferred kind in-place. More...
 
string SeqDB_MakeOSPath (const string &dbs)
 Return path with delimiters changed to platform preferred kind. More...
 
static string s_SeqDB_TryPaths (const string &blast_paths, const string &dbname, char dbtype, bool exact, CSeqDB_FileExistence &access, bool linkoutdb_search=false)
 Search for a file in a provided set of paths. More...
 
static string s_SeqDB_FindBlastDBPath (const string &dbname, char dbtype, string *sp, bool exact, CSeqDB_FileExistence &access, const string path="")
 
string SeqDB_FindBlastDBPath (const string &dbname, char dbtype, string *sp, bool exact, CSeqDBAtlas &atlas)
 Finds a file in the search path. More...
 
string SeqDB_ResolveDbPath (const string &filename)
 Resolve a file path using SeqDB's path algorithms. More...
 
string SeqDB_ResolveDbPathNoExtension (const string &filename, char dbtype)
 Resolve a file path using SeqDB's path algorithms. More...
 
string SeqDB_ResolveDbPathForLinkoutDB (const string &filename)
 Resolve a file path using SeqDB's path algorithms. More...
 
void SeqDB_JoinDelim (string &a, const string &b, const string &delim)
 Join two strings with a delimiter. More...
 
template<class TCompare , class TVector >
void s_InsureOrder (TVector &v)
 
void SeqDB_ReadBinaryGiList (const string &fname, vector< TGi > &gis)
 Read a binary-format GI list from a file. More...
 
static bool s_SeqDB_IsBinaryNumericList (const char *fbeginp, const char *fendp, bool &has_long_ids, bool *has_tis=NULL)
 This function determines whether a file is a valid binary GI/TI file. More...
 
int s_ReadDigit (const char d, const string &list_type)
 
void SeqDB_ReadMemoryGiList (const char *fbeginp, const char *fendp, vector< CSeqDBGiList::SGiOid > &gis, bool *in_order)
 Read a text or binary GI list from an area of memory. More...
 
void SeqDB_ReadMemoryPigList (const char *fbeginp, const char *fendp, vector< CSeqDBGiList::SPigOid > &pigs, bool *in_order)
 
void SeqDB_ReadMemoryTaxIdList (const char *fbeginp, const char *fendp, CSeqDBGiList::STaxIdsOids &taxids)
 
void SeqDB_ReadMemoryTiList (const char *fbeginp, const char *fendp, vector< CSeqDBGiList::STiOid > &tis, bool *in_order)
 Read a text or binary TI list from an area of memory. More...
 
void SeqDB_ReadMemorySiList (const char *fbeginp, const char *fendp, vector< CSeqDBGiList::SSiOid > &sis, bool *in_order)
 Read a text SeqID list from an area of memory. More...
 
void SeqDB_ReadMemoryMixList (const char *fbeginp, const char *fendp, vector< CSeqDBGiList::SGiOid > &gis, vector< CSeqDBGiList::STiOid > &tis, vector< CSeqDBGiList::SSiOid > &sis, bool *in_order)
 Read an ID list (mixed type) from an area of memory. More...
 
static bool s_ContainsBinaryNumericIdList (const string &fname, CSeqDBFileGiList::EIdType type)
 
bool SeqDB_IsBinaryTiList (const string &fname)
 Returns true if the file name passed contains a binary TI list. More...
 
bool SeqDB_IsBinaryGiList (const string &fname)
 Returns true if the file name passed contains a binary GI list. More...
 
void SeqDB_ReadGiList (const string &fname, vector< CSeqDBGiList::SGiOid > &gis, bool *in_order)
 Read a text or binary GI list from a file. More...
 
void SeqDB_ReadTiList (const string &fname, vector< CSeqDBGiList::STiOid > &tis, bool *in_order)
 Read a text or binary TI list from a file. More...
 
void SeqDB_ReadMixList (const string &fname, vector< CSeqDBGiList::SGiOid > &gis, vector< CSeqDBGiList::STiOid > &tis, vector< CSeqDBGiList::SSiOid > &sis, bool *in_order)
 Read a text SeqId list from a file. More...
 
void SeqDB_ReadPigList (const string &fname, vector< CSeqDBGiList::SPigOid > &pigs, bool *in_order)
 
void SeqDB_ReadTaxIdList (const string &fname, CSeqDBGiList::STaxIdsOids &taxids)
 
void SeqDB_ReadGiList (const string &fname, vector< TGi > &gis, bool *in_order)
 Read a text or binary GI list from a file. More...
 
void SeqDB_ReadSiList (const string &fname, vector< CSeqDBGiList::SSiOid > &sis, bool *in_order, SBlastSeqIdListInfo &db_info)
 Read a text SeqId list from a file. More...
 
void SeqDB_CombineAndQuote (const vector< string > &dbs, string &dbname)
 Combine and quote list of database names. More...
 
void SeqDB_SplitQuoted (const string &dbname, vector< CTempString > &dbs, bool keep_quote)
 Split a (possibly) quoted list of database names into pieces. More...
 
void SeqDB_SplitQuoted (const string &dbname, vector< CSeqDB_Substring > &dbs, bool keep_quote)
 Split a (possibly) quoted list of database names into pieces. More...
 
void SeqDB_FileIntegrityAssert (const string &file, int line, const string &text)
 Report file corruption by throwing an eFile CSeqDBException. More...
 
ESeqDBIdType SeqDB_SimplifySeqid (CSeq_id &bestid, const string *acc, Int8 &num_id, string &str_id, bool &simpler)
 Seq-id simplification. More...
 
static size_t s_SeqDB_EndOfFastaID (const string &str, size_t pos)
 Find the end of a single element in a Seq-id set. More...
 
static bool s_SeqDB_ParseSeqIDs (const string &line, vector< CRef< CSeq_id > > &seqids)
 Parse string into a sequence of Seq-id objects. More...
 
ESeqDBIdType SeqDB_SimplifyAccession (const string &acc, Int8 &num_id, string &str_id, bool &simpler)
 String id simplification. More...
 
const string SeqDB_SimplifyAccession (const string &acc)
 String id simplification. More...
 
void SeqDB_GetFileExtensions (bool db_is_protein, vector< string > &extn, EBlastDbVersion dbver)
 Retrieves a list of all supported file extensions for BLAST databases. More...
 
void SeqDB_GetLMDBFileExtensions (bool db_is_protein, vector< string > &extn)
 Retrieves file extensions for BLAST LMDB files. More...
 
void SeqDB_GetMetadataFileExtension (bool db_is_protein, string &extn)
 
bool IsStringId (const CSeq_id &id)
 Determine if id is a string id. More...
 
string GetBlastSeqIdString (const CSeq_id &seqid, bool version)
 Return ID string as stored in lmdb. More...
 
const string SeqDB_GetOidMaskFileExt (bool db_is_protein, EOidMaskType t)
 

Detailed Description

Definitions of various helper functions for SeqDB.

Definition in file seqdbcommon.cpp.

Function Documentation

◆ GetBlastSeqIdString()

string GetBlastSeqIdString ( const CSeq_id seqid,
bool  version 
)

Return ID string as stored in lmdb.

Definition at line 2696 of file seqdbcommon.cpp.

References CSeq_id::AsFastaString(), CSeq_id::GetSeqIdString(), CSeq_id_Base::IsPir(), CSeq_id_Base::IsPrf(), and version.

Referenced by CSeqDBGiList::FindId(), and CSeqDBNegativeList::FindId().

◆ IsStringId()

bool IsStringId ( const CSeq_id id)

Determine if id is a string id.

Parameters
idinput id for check
Returns
Return true if id is not of type gi, ti or pig

Definition at line 2677 of file seqdbcommon.cpp.

References CDbtag_Base::CanGetDb(), CSeq_id_Base::e_General, CSeq_id_Base::e_Gi, and CDbtag_Base::GetDb().

Referenced by CSeqDBImpl::SeqidToOids(), and CBlastDBExtractor::SetSeqId().

◆ kSeqDBGroupAliasFileName()

const string kSeqDBGroupAliasFileName ( "index.alx"  )

◆ s_ContainsBinaryNumericIdList()

static bool s_ContainsBinaryNumericIdList ( const string fname,
CSeqDBFileGiList::EIdType  type 
)
static

◆ s_GetPathSplitter()

static string s_GetPathSplitter ( )
static

Returns the character used to separate path components in the current operating system or platform.

Definition at line 270 of file seqdbcommon.cpp.

Referenced by s_SeqDB_TryPaths().

◆ s_InsureOrder()

template<class TCompare , class TVector >
void s_InsureOrder ( TVector v)

Definition at line 590 of file seqdbcommon.cpp.

References i, int, and ct::sort().

◆ s_ReadDigit()

int s_ReadDigit ( const char  d,
const string list_type 
)

◆ s_SeqDB_DBExists()

static bool s_SeqDB_DBExists ( const string dbname,
char  dbtype,
CSeqDB_FileExistence access,
bool  linkoutdb_search 
)
static

Test whether an index or alias file exists.

The provided filename is combined with both of the extensions appropriate to the database sequence type, and the resulting strings are checked for existence in the file system. The 'access' object defines how to check file existence.

Parameters
dbnameInput path and filename
dbtypeDatabase type, either protein or nucleotide
accessThe file access object.
linkoutdb_searchDetermines whether linkoutdb files should be searched for
Returns
true if either of the index or alias files is found

Definition at line 232 of file seqdbcommon.cpp.

References _ASSERT, dbname(), and CSeqDB_FileExistence::DoesFileExist().

Referenced by s_SeqDB_TryPaths().

◆ s_SeqDB_EndOfFastaID()

static size_t s_SeqDB_EndOfFastaID ( const string str,
size_t  pos 
)
static

Find the end of a single element in a Seq-id set.

Seq-id strings sometimes contain several Seq-ids. This function looks for the end of the first Seq-id, and will return its length. Static methods of CSeq_id are used to evaluate tokens.

Parameters
strSeq-id string to search.
posPosition at which to start search.
Returns
End position of first fasta id, or string::npos in case of error.

Definition at line 2442 of file seqdbcommon.cpp.

References CSeq_id_Base::e_not_set, int, str(), and CSeq_id::WhichInverseSeqId().

Referenced by s_SeqDB_ParseSeqIDs().

◆ s_SeqDB_FindBlastDBPath()

static string s_SeqDB_FindBlastDBPath ( const string dbname,
char  dbtype,
string sp,
bool  exact,
CSeqDB_FileExistence access,
const string  path = "" 
)
static

◆ s_SeqDB_IsBinaryNumericList()

static bool s_SeqDB_IsBinaryNumericList ( const char *  fbeginp,
const char *  fendp,
bool has_long_ids,
bool has_tis = NULL 
)
static

This function determines whether a file is a valid binary GI/TI file.

Parameters
fbeginppointer to start of file [in]
fendppointer to end of file [in]
has_long_idswill be set to true if the gi file contains long IDs [out]
has_tiswill be set to true if the input file contains Trace IDs, otherwise the file contains GIs [out]
Returns
true if file is binary
Exceptions
CSeqDBExceptionif file is empty or invalid gi file

Definition at line 854 of file seqdbcommon.cpp.

References isdigit(), and NCBI_THROW.

Referenced by s_ContainsBinaryNumericIdList(), SeqDB_ReadMemoryGiList(), SeqDB_ReadMemoryPigList(), SeqDB_ReadMemoryTaxIdList(), and SeqDB_ReadMemoryTiList().

◆ s_SeqDB_ParseSeqIDs()

static bool s_SeqDB_ParseSeqIDs ( const string line,
vector< CRef< CSeq_id > > &  seqids 
)
static

Parse string into a sequence of Seq-id objects.

A string is broken down into Seq-ids and the set of Seq-ids is returned.

Parameters
lineThe string to interpret.
seqidsThe returned set of Seq-id objects.
Returns
true if any Seq-id objects were found.

Definition at line 2497 of file seqdbcommon.cpp.

References s_SeqDB_EndOfFastaID().

Referenced by SeqDB_SimplifyAccession().

◆ s_SeqDB_TryPaths()

static string s_SeqDB_TryPaths ( const string blast_paths,
const string dbname,
char  dbtype,
bool  exact,
CSeqDB_FileExistence access,
bool  linkoutdb_search = false 
)
static

Search for a file in a provided set of paths.

This function takes a search path as a ":" delimited set of path names, and searches in those paths for the given database component. The component name may include path components. If the exact flag is set, the path is assumed to contain any required extension; otherwise extensions for index and alias files will be tried. Each element of the search path is tried in sequential order for both index or alias files (if exact is not set), before moving to the next element of the search path. The path returned from this function will not contain a file extension unless the provided filename did (in which case, exact is normally set).

Parameters
blast_pathsList of filesystem paths separated by ":".
dbnameBase name of the database index or alias file to search for.
dbtypeType of database, either protein or nucleotide.
exactSet to true if dbname already contains any needed extension.
linkoutdb_searchDetermines whether linkoutdb files should be searched for
Returns
Full pathname, minus extension, or empty string if none found.

Definition at line 332 of file seqdbcommon.cpp.

References dbname(), CSeqDB_FileExistence::DoesFileExist(), NStr::fSplit_Tokenize, ITERATE, result, s_GetPathSplitter(), s_SeqDB_DBExists(), SeqDB_CombinePath(), SeqDB_MakeOSPath(), and NStr::Split().

Referenced by s_SeqDB_FindBlastDBPath(), and SeqDB_ResolveDbPathForLinkoutDB().

◆ SeqDB_CombineAndQuote()

void SeqDB_CombineAndQuote ( const vector< string > &  dbs,
string dbname 
)

Combine and quote list of database names.

Combine and quote a list of database names.

Parameters
dbsDatabase names to combine.
dbnameCombined database name.

Definition at line 1717 of file seqdbcommon.cpp.

References dbname(), i, int, and ncbi::grid::netcache::search::fields::size.

Referenced by CSeqDB::CSeqDB(), and CMakeBlastDBApp::x_ProcessInputData().

◆ SeqDB_CombinePath()

void SeqDB_CombinePath ( const CSeqDB_Substring path,
const CSeqDB_Substring file,
const CSeqDB_Substring extn,
string outp 
)

Combine a filesystem path and file name.

Combine a provided filesystem path and a file name. This function tries to avoid duplicated delimiters. If either string is empty, the other is returned. Conceptually, the first path might be the current working directory and the second path is a filename. So, if the second path starts with "/", the first path is ignored. Also, care is taken to avoid duplicated delimiters. If the first path ends with the delimiter character, another delimiter will not be added between the strings. The delimiter used will vary from operating system to operating system, and is adjusted accordingly. If a file extension is specified, it will also be appended.

Parameters
pathThe filesystem path to use
fileThe name of the file (may include path components)
extnThe file extension (without the "."), or NULL if none.
outpA returned string containing the combined path and file name

Definition at line 131 of file seqdbcommon.cpp.

References CSeqDB_Substring::Empty(), CSeqDB_Substring::GetBegin(), CSeqDB_Substring::GetEnd(), CDirEntry::GetPathSeparator(), CSeqDB_Substring::GetString(), isalpha(), and CSeqDB_Substring::Size().

Referenced by CSeqDB_BasePath::CSeqDB_BasePath(), CSeqDB_Path::CSeqDB_Path(), CSeqDBLMDBSet::CSeqDBLMDBSet(), CSeqDB_Path::ReplaceFilename(), s_SeqDB_TryPaths(), and CVDBAliasNode::x_ResolveVDBList().

◆ SeqDB_CompareVolume()

bool SeqDB_CompareVolume ( const string volpath1,
const string volpath2 
)

Compares two volume file names and determines the volume order.

Parameters
volpath1The 1st volume path
volpath2The 2nd volume path
Returns
true if vol1 should appear before vol2

Definition at line 190 of file seqdbcommon.cpp.

References CSeqDB_Path::FindBaseName(), and CSeqDB_Substring::GetString().

Referenced by CIndexedDb_Old::CIndexedDb_Old(), and CSeqDBAliasNode::FindVolumePaths().

◆ SeqDB_ConvertOSPath()

void SeqDB_ConvertOSPath ( string dbs)

Change path delimiters to platform preferred kind in-place.

The path is modified in place. The 'Convert' interface is more efficient for cases where the new path would be assigned to the same string object. Delimiter conversion should be called by SeqDB at least once on any path received from the user, or via filesystem sources such as alias files.

Parameters
dbsThis string will be changed in-place.

Definition at line 284 of file seqdbcommon.cpp.

References CDirEntry::GetPathSeparator(), and i.

Referenced by CSeqDB_BaseName::FixDelimiters(), CSeqDB_FileName::FixDelimiters(), CSeqDB_BasePath::FixDelimiters(), s_Tokenize(), and SeqDB_MakeOSPath().

◆ SeqDB_FileIntegrityAssert()

void SeqDB_FileIntegrityAssert ( const string file,
int  line,
const string text 
)

Report file corruption by throwing an eFile CSeqDBException.

This function is only called in the case of validation failure, and is used in code paths where the validation failure may be related to file corruption or filesystem problems. File data is considered a user input, so checks for corrupt file are treated as input validation. This means that (1) checks that may be caused by file corruption scenarios are not disabled in debug mode, and (2) an exception (rather than an abort) is used. Note that this function does not check the assert, so it should only be called in case of failure.

Parameters
fileName of the file containing the assert.
lineThe line the assert is on.
textThe text version of the asserted condition.

Definition at line 2255 of file seqdbcommon.cpp.

References CSeqDBException::eFileErr, file, NStr::IntToString(), SeqDB_ThrowException(), and text().

◆ SeqDB_FindBlastDBPath()

string SeqDB_FindBlastDBPath ( const string file_name,
char  dbtype,
string sp,
bool  exact,
CSeqDBAtlas atlas 
)

Finds a file in the search path.

This function resolves the full name of a file. It searches for a file of the provided base name and returns the provided name with the full path attached. If the exact flag is set, the file is assumed to have any extension it may need, and none is added for searching or stripped from the return value. If exact is not set, the file is assumed to end in ".pin", ".nin", ".pal", or ".nal", and if such a file is found, that extension is stripped from the returned string. Furthermore, in the exact == false case, only file extensions relevant to the dbtype are considered. Thus, if dbtype is set to 'p' for protein, only ".pin" and ".pal" are checked for; if it is set to nucleotide, only ".nin" and ".nal" are considered. The places where the file may be found are dependent on the search path. The search path consists of the current working directory, the contents of the BLASTDB environment variable, and the BLASTDB member of the BLAST group of settings in the NCBI meta-registry. This registry is an interface to settings found in (for example) a ".ncbirc" file found in the user's home directory (but several paths are usually checked). Finally, if the provided file_name starts with the default path delimiter (which is OS dependent, but for example, "/" on Linux), the path will be taken to be absolute, and the search path will not affect the results.

Parameters
file_nameFile base name for which to search
dbtypeDatabase type, either protein or nucleotide
spIf non-null, the ":" delimited search path is returned here
exactIf true, the file_name already includes any needed extension
atlasThe memory management layer.
lockedThe lock holder object for this thread.
Returns
Fully qualified filename and path, minus extension

Definition at line 416 of file seqdbcommon.cpp.

References dbname(), CSeqDBAtlas::GetSearchPath(), and s_SeqDB_FindBlastDBPath().

Referenced by CSeqDBAliasSets::x_FindBlastDBPath(), and CSeqDBAliasNode::x_ResolveNames().

◆ SeqDB_GetFileExtensions()

void SeqDB_GetFileExtensions ( bool  db_is_protein,
vector< string > &  extensions,
EBlastDbVersion  dbver = eBDB_Version4 
)

Retrieves a list of all supported file extensions for BLAST databases.

Parameters
db_is_proteinset to true if the database is protein else false [in]
extensionswhere the return value will be stored [in|out]
dbverBLASTDB version to use [in]

Definition at line 2620 of file seqdbcommon.cpp.

References eBDB_Version4, eBDB_Version5, and SeqDB_GetLMDBFileExtensions().

Referenced by DeleteBlastDb(), CSeqDB::GetDiskUsage(), CBlastdbConvertApp::Run(), and CSeqDB::x_GetDBFilesMetaData().

◆ SeqDB_GetLMDBFileExtensions()

void SeqDB_GetLMDBFileExtensions ( bool  db_is_protein,
vector< string > &  extn 
)

Retrieves file extensions for BLAST LMDB files.

Parameters
db_is_proteinset to true if the database is protein else false [in]
extnwhere the return value will be stored [in|out]

Definition at line 2658 of file seqdbcommon.cpp.

References NULL.

Referenced by DeleteBlastDb(), DeleteLMDBFiles(), and SeqDB_GetFileExtensions().

◆ SeqDB_GetMetadataFileExtension()

void SeqDB_GetMetadataFileExtension ( bool  db_is_protein,
string extn 
)

◆ SeqDB_GetOidMaskFileExt()

const string SeqDB_GetOidMaskFileExt ( bool  db_is_protein,
EOidMaskType  t 
)

Definition at line 2705 of file seqdbcommon.cpp.

References fExcludeModel, NCBI_THROW, and t.

Referenced by CWriteDB_CreateOidMaskDB(), and CSeqDBOIDList::x_ComputeFilters().

◆ SeqDB_IsBinaryGiList()

bool SeqDB_IsBinaryGiList ( const string fname)

Returns true if the file name passed contains a binary GI list.

The SeqIds in a file are read into the provided vector<string>. If the in_order parameter is not null, the function will test the SeqIds for orderedness. It will set the bool to which in_order points to true if so, false if not.

Parameters
fnameThe name of the SeqId list file. [in]
sisThe SeqIds returned by this function. [out]
in_orderIf non-null, returns true iff the SeqIds were in order. [out]

void SeqDB_ReadSeqIdList(const string & fname, vector<string> & sis, bool * in_order = 0); Returns true if the file name passed contains a binary gi list

Parameters
fnameThe name of the GI list file. [in]
Exceptions
CSeqDBExceptionif file is invalid or empty

Definition at line 1400 of file seqdbcommon.cpp.

References CSeqDBFileGiList::eGiList, and s_ContainsBinaryNumericIdList().

Referenced by BOOST_AUTO_TEST_CASE(), and CBlastDBAliasApp::CreateAliasFile().

◆ SeqDB_IsBinaryTiList()

bool SeqDB_IsBinaryTiList ( const string fname)

Returns true if the file name passed contains a binary TI list.

Parameters
fnameThe name of the TI list file. [in]
Exceptions
CSeqDBExceptionif file is invalid or empty

Definition at line 1395 of file seqdbcommon.cpp.

References CSeqDBFileGiList::eTiList, and s_ContainsBinaryNumericIdList().

Referenced by CBlastDBAliasApp::CreateAliasFile().

◆ SeqDB_JoinDelim()

void SeqDB_JoinDelim ( string a,
const string b,
const string delim 
)

Join two strings with a delimiter.

This function returns whichever of two provided strings is non-empty. If both are non-empty, they are joined with a delimiter placed between them. It is intended for use when combining strings, such as a space delimited list of database volumes. It is probably not suitable for joining file system paths with filenames (use something like SeqDB_CombinePath).

Parameters
aFirst component and returned path
bSecond component
delimThe delimiter to use when joining elements

Definition at line 480 of file seqdbcommon.cpp.

References a, b, and s_SeqDB_QuickAssign().

Referenced by CSeqDB_TitleWalker::AddString().

◆ SeqDB_MakeOSPath()

string SeqDB_MakeOSPath ( const string dbs)

Return path with delimiters changed to platform preferred kind.

The path is modified and returned. The 'Make' interface is more convenient for cases where the input path and output path are different objects. Delimiter conversion should be called by SeqDB at least once on any path received from the user, or via filesystem sources such as alias files.

Parameters
dbsThis is the input path.
Returns
The modified path is returned.

Definition at line 298 of file seqdbcommon.cpp.

References SeqDB_ConvertOSPath().

Referenced by CSeqDB_SimpleAccessor::DoesFileExist(), s_ContainsBinaryNumericIdList(), s_SeqDB_TryPaths(), SeqDB_ReadBinaryGiList(), SeqDB_ReadGiList(), SeqDB_ReadMixList(), SeqDB_ReadPigList(), SeqDB_ReadSiList(), SeqDB_ReadTaxIdList(), and SeqDB_ReadTiList().

◆ SeqDB_ReadBinaryGiList()

void SeqDB_ReadBinaryGiList ( const string name,
vector< TGi > &  gis 
)

Read a binary-format GI list from a file.

Parameters
nameThe name of the file containing GIs. [in]
gisThe GIs returned by this function. [out]

Definition at line 819 of file seqdbcommon.cpp.

References CMemoryFile::GetPtr(), CMemoryFile::GetSize(), GI_FROM, NCBI_THROW, SeqDB_GetStdOrd(), and SeqDB_MakeOSPath().

◆ SeqDB_ReadGiList() [1/2]

void SeqDB_ReadGiList ( const string fname,
vector< CSeqDBGiList::SGiOid > &  gis,
bool in_order = 0 
)

Read a text or binary GI list from a file.

The GIs in a file are read into the provided SGiOid vector. The GI half of each element of the vector is assigned, but the OID half will be left as -1. If the in_order parameter is not null, the function will test the GIs for orderedness. It will set the bool to which in_order points to true if so, false if not.

Parameters
fnameThe name of the GI list file. [in]
gisThe GIs returned by this function. [out]
in_orderIf non-null, returns true iff the GIs were in order. [out]

Definition at line 1405 of file seqdbcommon.cpp.

References CMemoryFile::GetPtr(), CMemoryFile::GetSize(), SeqDB_MakeOSPath(), and SeqDB_ReadMemoryGiList().

Referenced by CSeqDBFileGiList::CSeqDBFileGiList(), and SeqDB_ReadGiList().

◆ SeqDB_ReadGiList() [2/2]

void SeqDB_ReadGiList ( const string fname,
vector< TGi > &  gis,
bool in_order = 0 
)

Read a text or binary GI list from a file.

The GIs in a file are read into the provided vector<TGi>. If the in_order parameter is not null, the function will test the GIs for orderedness. It will set the bool to which in_order points to true if so, false if not.

Parameters
fnameThe name of the GI list file. [in]
gisThe GIs returned by this function. [out]
in_orderIf non-null, returns true iff the GIs were in order. [out]

Definition at line 1462 of file seqdbcommon.cpp.

References ITERATE, and SeqDB_ReadGiList().

◆ SeqDB_ReadMemoryGiList()

void SeqDB_ReadMemoryGiList ( const char *  fbeginp,
const char *  fendp,
vector< CSeqDBGiList::SGiOid > &  gis,
bool in_order = 0 
)

Read a text or binary GI list from an area of memory.

The GIs in a memory region are read into the provided SGiOid vector. The GI half of each element of the vector is assigned, but the OID half will be left as -1. If the in_order parameter is not null, the function will test the GIs for orderedness. It will set the bool to which in_order points to true if so, false if not.

Parameters
fbeginpThe start of the memory region holding the GI list. [in]
fendpThe end of the memory region holding the GI list. [in]
gisThe GIs returned by this function. [out]
in_orderIf non-null, returns true iff the GIs were in order. [out]

Definition at line 925 of file seqdbcommon.cpp.

References _ASSERT, GI_FROM, NCBI_THROW, s_ReadDigit(), s_SeqDB_IsBinaryNumericList(), SeqDB_GetStdOrd(), and ZERO_GI.

Referenced by CSeqDBNodeFileIdList::CSeqDBNodeFileIdList(), and SeqDB_ReadGiList().

◆ SeqDB_ReadMemoryMixList()

void SeqDB_ReadMemoryMixList ( const char *  fbeginp,
const char *  fendp,
vector< CSeqDBGiList::SGiOid > &  gis,
vector< CSeqDBGiList::STiOid > &  tis,
vector< CSeqDBGiList::SSiOid > &  sis,
bool in_order 
)

Read an ID list (mixed type) from an area of memory.

The Seq ids in a memory region are read into the provided SSeqIdOid vector. The gi, ti or seqid half of each element of the vector is assigned, but the OID half will be left as -1. If the in_order parameter is not null, the function will test the SeqIds for orderedness. It will set the bool to which in_order points to true if so, false if not.

Parameters
fbeginpThe start of the memory region holding the SeqId list. [in]
fendpThe end of the memory region holding the SeqId list. [in]
gisThe gis returned by this function. [out]
tisThe tis returned by this function. [out]
sisThe seqids returned by this function. [out]
in_orderIf non-null, returns true iff the seqids were in order. [out]

Definition at line 1324 of file seqdbcommon.cpp.

References eGiId, eStringId, eTiId, GI_FROM, head, SeqDB_SimplifyAccession(), and NStr::ToLower().

Referenced by SeqDB_ReadMixList().

◆ SeqDB_ReadMemoryPigList()

void SeqDB_ReadMemoryPigList ( const char *  fbeginp,
const char *  fendp,
vector< CSeqDBGiList::SPigOid > &  pigs,
bool in_order 
)

◆ SeqDB_ReadMemorySiList()

void SeqDB_ReadMemorySiList ( const char *  fbeginp,
const char *  fendp,
vector< CSeqDBGiList::SSiOid > &  sis,
bool in_order = 0 
)

Read a text SeqID list from an area of memory.

The Seqids in a memory region are read into the provided SSeqIdOid vector. The SeqId half of each element of the vector is assigned, but the OID half will be left as -1. If the in_order parameter is not null, the function will test the SeqIds for orderedness. It will set the bool to which in_order points to true if so, false if not.

Parameters
fbeginpThe start of the memory region holding the SeqId list. [in]
fendpThe end of the memory region holding the SeqId list. [in]
sisThe SeqIds returned by this function. [out]
in_orderIf non-null, returns true iff the seqids were in order. [out]

Definition at line 1284 of file seqdbcommon.cpp.

References NStr::eTrunc_Both, head, and NStr::TruncateSpaces().

Referenced by CSeqDBNodeFileIdList::CSeqDBNodeFileIdList(), and SeqDB_ReadSiList().

◆ SeqDB_ReadMemoryTaxIdList()

void SeqDB_ReadMemoryTaxIdList ( const char *  fbeginp,
const char *  fendp,
CSeqDBGiList::STaxIdsOids taxids 
)

◆ SeqDB_ReadMemoryTiList()

void SeqDB_ReadMemoryTiList ( const char *  fbeginp,
const char *  fendp,
vector< CSeqDBGiList::STiOid > &  tis,
bool in_order = 0 
)

Read a text or binary TI list from an area of memory.

The TIs in a memory region are read into the provided STiOid vector. The TI half of each element of the vector is assigned, but the OID half will be left as -1. If the in_order parameter is not null, the function will test the TIs for orderedness. It will set the bool to which in_order points to true if so, false if not.

Parameters
fbeginpThe start of the memory region holding the TI list. [in]
fendpThe end of the memory region holding the TI list. [in]
tisThe TIs returned by this function. [out]
in_orderIf non-null, returns true iff the TIs were in order. [out]

Definition at line 1149 of file seqdbcommon.cpp.

References int, NCBI_THROW, s_ReadDigit(), s_SeqDB_IsBinaryNumericList(), and SeqDB_GetStdOrd().

Referenced by CSeqDBNodeFileIdList::CSeqDBNodeFileIdList(), and SeqDB_ReadTiList().

◆ SeqDB_ReadMixList()

void SeqDB_ReadMixList ( const string fname,
vector< CSeqDBGiList::SGiOid > &  gis,
vector< CSeqDBGiList::STiOid > &  tis,
vector< CSeqDBGiList::SSiOid > &  sis,
bool *  in_order 
)

Read a text list of mixed IDs (GIs, TIs, and SeqIds) from a file.

The IDs in a file are read into the provided SGiOid, STiOid, and SSiOid vectors. The Gi/Ti/Si half of each element of these vectors is assigned, but the OID half will be left as -1. If the in_order parameter is not null, the function will test the SeqIds for orderedness. It will set the bool to which in_order points to true if so, false if not.

Parameters
fname: The name of the SeqId list file. [in]
gis: The GIs returned by this function. [out]
tis: The TIs returned by this function. [out]
sis: The SeqIds returned by this function. [out]
in_order: If non-null, returns true iff the SeqIds were in order. [out]

Definition at line 1428 of file seqdbcommon.cpp.

References CMemoryFile::GetPtr(), CMemoryFile::GetSize(), SeqDB_MakeOSPath(), and SeqDB_ReadMemoryMixList().

Referenced by CSeqDBFileGiList::CSeqDBFileGiList().

◆ SeqDB_ReadPigList()

void SeqDB_ReadPigList ( const string fname,
vector< CSeqDBGiList::SPigOid > &  pigs,
bool in_order 
)

◆ SeqDB_ReadSiList()

void SeqDB_ReadSiList ( const string fname,
vector< CSeqDBGiList::SSiOid > &  sis,
bool *  in_order,
SBlastSeqIdListInfo db_info 
)

Read a text SeqId list from a file.

The Seqids in a file are read into the provided SSeqIdOid vector. The SeqId half of each element of the vector is assigned, but the OID half will be left as -1. If the in_order parameter is not null, the function will test the SeqIds for orderedness. It will set the bool to which in_order points to true if so, false if not.

Parameters
fname: The name of the SeqId list file. [in]
sis: The SeqIds returned by this function. [out]
in_order: If non-null, returns true iff the SeqIds were in order. [out]
db_info: Information about the SeqId list, as an SBlastSeqIdListInfo structure.

Definition at line 1476 of file seqdbcommon.cpp.

References CMemoryFile::GetPtr(), CBlastSeqidlistFile::GetSeqidlist(), CMemoryFile::GetSize(), SeqDB_MakeOSPath(), and SeqDB_ReadMemorySiList().

Referenced by CSeqDBFileGiList::CSeqDBFileGiList().

◆ SeqDB_ReadTaxIdList()

void SeqDB_ReadTaxIdList ( const string fname,
CSeqDBGiList::STaxIdsOids taxids 
)

◆ SeqDB_ReadTiList()

void SeqDB_ReadTiList ( const string fname,
vector< CSeqDBGiList::STiOid > &  tis,
bool *  in_order = 0 
)

Read a text or binary TI list from a file.

The TIs in a file are read into the provided STiOid vector. The TI half of each element of the vector is assigned, but the OID half will be left as -1. If the in_order parameter is not null, the function will test the TIs for orderedness. It will set the bool to which in_order points to true if so, false if not.

Parameters
fname: The name of the TI list file. [in]
tis: The TIs returned by this function. [out]
in_order: If non-null, returns true iff the TIs were in order. [out]

Definition at line 1417 of file seqdbcommon.cpp.

References CMemoryFile::GetPtr(), CMemoryFile::GetSize(), SeqDB_MakeOSPath(), and SeqDB_ReadMemoryTiList().

Referenced by CSeqDBFileGiList::CSeqDBFileGiList().

◆ SeqDB_RemoveDirName()

CSeqDB_Substring SeqDB_RemoveDirName ( CSeqDB_Substring  s)

Returns a filename minus greedy path.

Substring version. This returns the part of a file name after the last path delimiter, or the whole path if no delimiter was found.

Parameters
s: Input path
Returns
Filename portion of path

Definition at line 50 of file seqdbcommon.cpp.

References CSeqDB_Substring::EraseFront(), CSeqDB_Substring::FindLastOf(), and CDirEntry::GetPathSeparator().

Referenced by CSeqDB_BasePath::FindBaseName(), CSeqDB_Path::FindBaseName(), CSeqDB_Path::FindFileName(), and CMakeProfileDBApp::x_CreateAliasFile().

◆ SeqDB_RemoveExtn()

CSeqDB_Substring SeqDB_RemoveExtn ( CSeqDB_Substring  s)

Returns a path minus its file extension.

This returns the part of a path before the file extension, or the whole path if no extension is recognized.

Parameters
s: Input path
Returns
Path minus file extension

Definition at line 76 of file seqdbcommon.cpp.

References CSeqDB_Substring::GetEnd(), CSeqDB_Substring::Resize(), and CSeqDB_Substring::Size().

Referenced by CSeqDB_Path::FindBaseName(), and CSeqDB_Path::FindBasePath().

◆ SeqDB_RemoveFileName()

CSeqDB_Substring SeqDB_RemoveFileName ( CSeqDB_Substring  s)

Returns a path minus filename.

Substring version. This returns the part of a file path before the last path delimiter, or an empty substring if no delimiter was found.

Parameters
s: Input path
Returns
Path minus file name (the directory portion of the path)

Definition at line 62 of file seqdbcommon.cpp.

References CSeqDB_Substring::Clear(), CSeqDB_Substring::FindLastOf(), CDirEntry::GetPathSeparator(), and CSeqDB_Substring::Resize().

Referenced by CSeqDB_BasePath::FindDirName(), and CSeqDB_Path::FindDirName().

◆ SeqDB_ResolveDbPath()

string SeqDB_ResolveDbPath ( const string filename)

Resolve a file path using SeqDB's path algorithms.

This finds a file using the same algorithm used by SeqDB to find blast database filenames. The filename must include the extension if any. Paths which start with '/', '\', or a drive letter (depending on operating system) will be treated as absolute paths. If the file is not found an empty string will be returned.

Parameters
filename: Name of file to find.
Returns
Resolved path or empty string if not found.

Definition at line 453 of file seqdbcommon.cpp.

References s_SeqDB_FindBlastDBPath().

Referenced by CIndexedDb_New::AddIndexInfo(), BOOST_AUTO_TEST_CASE(), CIgAnnotationInfo::CIgAnnotationInfo(), CIndexedDb_Old::CIndexedDb_Old(), CTaxDBFileInfo::CTaxDBFileInfo(), CTaxonomy4BlastSQLite::CTaxonomy4BlastSQLite(), CBlastDatabaseArgs::ExtractAlgorithmOptions(), CIgBlastArgs::ExtractAlgorithmOptions(), CBlastSeqidlistFile::GetSeqidlistInfo(), s_GetTaxIDList(), CBlastTabularInfo::x_CheckTaxDB(), CCmdLineBlastXML2ReportData::x_InitCommon(), CSeqDBGiMask::x_Open(), CVDBAliasNode::x_ResolveDBList(), and CVDBAliasNode::x_ResolveVDBList().

◆ SeqDB_ResolveDbPathForLinkoutDB()

string SeqDB_ResolveDbPathForLinkoutDB ( const string filename)

Resolve a file path using SeqDB's path algorithms.

Identical to SeqDB_ResolveDbPathNoExtension with the exception that this function searches for ISAM or SQLite files, specifically those storing numeric and string data (for LinkoutDB; i.e.: '.sqlite3'). This is intended to check whether the files used in LinkoutDB exist or not.

Parameters
filename: Name of file to find.
Returns
Resolved path or empty string if not found.

Definition at line 472 of file seqdbcommon.cpp.

References CSeqDBAtlas::GenerateSearchPath(), and s_SeqDB_TryPaths().

◆ SeqDB_ResolveDbPathNoExtension()

string SeqDB_ResolveDbPathNoExtension ( const string filename,
char  dbtype = '-' 
)

Resolve a file path using SeqDB's path algorithms.

Identical to SeqDB_ResolveDbPath with the exception that this function does not require the extension to be provided. This is intended to check whether a BLAST DB exists or not.

Parameters
filename: Name of file to find.
dbtype: Determines whether the BLAST DB is protein ('p'), nucleotide ('n'), or whether the algorithm should guess it ('-')
Returns
Resolved path or empty string if not found.

Definition at line 464 of file seqdbcommon.cpp.

References s_SeqDB_FindBlastDBPath().

Referenced by CBlastDBCmdApp::Run(), s_DoesBlastDbExist(), and CVDBAliasNode::x_ResolveDBList().

◆ SeqDB_SimplifyAccession() [1/2]

const string SeqDB_SimplifyAccession ( const string acc)

String id simplification.

This simpler version will convert string id to the standard ISAM form, and return "" if the conversion fails.

Parameters
acc: The string to look up. [in]
Returns
The resulting converted id.

Definition at line 2610 of file seqdbcommon.cpp.

References eStringId, result, and SeqDB_SimplifyAccession().

◆ SeqDB_SimplifyAccession() [2/2]

ESeqDBIdType SeqDB_SimplifyAccession ( const string acc,
Int8 num_id,
string str_id,
bool simpler 
)

String id simplification.

This routine tries to produce a numerical type from a string identifier. SeqDB can use faster lookup mechanisms if a PIG, GI, or OID type can be recognized in the string, for example. Even when the output is a string, it may be better formed for the purpose of lookup in the string ISAM file.

Parameters
acc: The string to look up. [in]
num_id: The returned identifier, if numeric. [out]
str_id: The returned identifier, if a string. [out]
simpler: Whether an adjustment was done at all. [out]
Returns
The resulting identifier type.

Definition at line 2535 of file seqdbcommon.cpp.

References CSeq_id::BestRank(), NStr::EqualNocase(), eStringId, NStr::fConvErr_NoThrow, FindBestChoice(), NULL, CSeq_id::ParseFastaIds(), result, s_SeqDB_ParseSeqIDs(), SeqDB_SimplifySeqid(), NStr::SplitInTwo(), and NStr::ToLower().

Referenced by CSeqDBVol::AccessionToOids(), CInputGiList::AppendSi(), BOOST_AUTO_TEST_CASE(), CSeqDBGiList::PreprocessIdsForISAMSiLookup(), CSeqDBNegativeList::PreprocessIdsForISAMSiLookup(), SeqDB_ReadMemoryMixList(), and SeqDB_SimplifyAccession().

◆ SeqDB_SimplifySeqid()

ESeqDBIdType SeqDB_SimplifySeqid ( CSeq_id bestid,
const string acc,
Int8 num_id,
string str_id,
bool simpler 
)

Seq-id simplification.

Given a Seq-id, this routine devolves it to a GI or PIG if possible. If not, it formats the Seq-id into a canonical form for lookup in the string ISAM files. If the Seq-id was parsed from an accession, it can be provided in the "acc" parameter, and it will be used if the Seq-id is not in a form this code can recognize. In the case that new Seq-id types are added, support for which has not been added to this code, this mechanism will try to use the original string.

Parameters
bestid: The Seq-id to look up. [in]
acc: The original string the Seq-id was created from (or NULL). [in]
num_id: The returned identifier, if numeric. [out]
str_id: The returned identifier, if a string. [out]
simpler: Whether an adjustment was done at all. [out]
Returns
The resulting identifier type.

Definition at line 2264 of file seqdbcommon.cpp.

References CSeq_id::AsFastaString(), CTextseq_id_Base::CanGetAccession(), CDbtag_Base::CanGetDb(), CTextseq_id_Base::CanGetName(), CDbtag_Base::CanGetTag(), CTextseq_id_Base::CanGetVersion(), NStr::CompareNocase(), CSeq_id_Base::e_Ddbj, CSeq_id_Base::e_Embl, CSeq_id_Base::e_Genbank, CSeq_id_Base::e_General, CSeq_id_Base::e_Gi, CSeq_id_Base::e_Gibbsq, CSeq_id_Base::e_Gpipe, CSeq_id_Base::e_Local, CSeq_id_Base::e_Other, CSeq_id_Base::e_Pir, CSeq_id_Base::e_Prf, CSeq_id_Base::e_Swissprot, CSeq_id_Base::e_Tpd, CSeq_id_Base::e_Tpe, CSeq_id_Base::e_Tpg, CSeq_id::eFasta, eGiId, eOID, ePigId, eStringId, eTiId, CSeq_id::fLabel_GeneralDbIsContent, CSeq_id::fLabel_Version, CTextseq_id_Base::GetAccession(), CDbtag_Base::GetDb(), CSeq_id_Base::GetGeneral(), CSeq_id_Base::GetGi(), CSeq_id_Base::GetGibbsq(), CObject_id_Base::GetId(), CSeq_id::GetLabel(), CSeq_id_Base::GetLocal(), CTextseq_id_Base::GetName(), CObject_id_Base::GetStr(), CDbtag_Base::GetTag(), CSeq_id::GetTextseq_Id(), CTextseq_id_Base::GetVersion(), GI_TO, NStr::IntToString(), CObject_id_Base::IsStr(), result, NStr::StringToInt8(), NStr::ToLower(), NStr::UIntToString(), and CSeq_id_Base::Which().

Referenced by CSeqDBGiList::FindId(), CSeqDBNegativeList::FindId(), SeqDB_SimplifyAccession(), CSeqDBVol::SeqidToOids(), and CBlastDB_BioseqFormatter::Write().

◆ SeqDB_SplitQuoted() [1/2]

void SeqDB_SplitQuoted ( const string dbname,
vector< CSeqDB_Substring > &  dbs,
bool  keep_quote = false 
)

Split a (possibly) quoted list of database names into pieces.

Parameters
dbname: Combined database name. [in]
dbs: Database names produced by splitting the combined name. [out]

Definition at line 1762 of file seqdbcommon.cpp.

References dbname(), and i.

Referenced by CAlignFormatUtil::GetBlastDbInfo(), CSeqDBAliasNode::GetMaskList(), s_Tokenize(), CMakeBlastDBApp::x_BuildDatabase(), CMakeBlastDBApp::x_ProcessInputData(), and CSeqDBAliasNode::x_Tokenize().

◆ SeqDB_SplitQuoted() [2/2]

void SeqDB_SplitQuoted ( const string dbname,
vector< CTempString > &  dbs,
bool  keep_quote = false 
)

Split a (possibly) quoted list of database names into pieces.

SeqDB permits multiple databases to be opened by a single CSeqDB instance, by passing the database names as a space-delimited list to the CSeqDB constructor. To support paths and filenames with embedded spaces, surround any space-containing names with double quotes ('"'). Filenames not containing spaces may be quoted safely with no effect. (This solution prevents the use of names containing embedded double quotes.)

This method splits a string encoded in this way into individual database names. Note that the resulting vector's objects are CTempString "slice" objects, and are only valid while the original (encoded) string is unchanged.

Parameters
dbname: Combined database name. [in]
dbs: Database names produced by splitting the combined name. [out]

Definition at line 1744 of file seqdbcommon.cpp.

References dbname(), ITERATE, and tmp.

◆ SeqDB_SplitString()

bool SeqDB_SplitString ( CSeqDB_Substring buffer,
CSeqDB_Substring front,
char  delim 
)

Parse a prefix from a substring.

The `buffer' argument is searched for a character. If found, the region before the delimiter is returned in `front' and the region after the delimiter is returned in `buffer', and true is returned. If not found, neither argument changes and false is returned.

Parameters
buffer: Source data to search and remainder if found. [in|out]
front: Region before delim if found. [out]
delim: Character for which to search. [in]
Returns
true if the character was found, false otherwise.

Definition at line 113 of file seqdbcommon.cpp.

References buffer, ctll::front(), and i.

Referenced by CSeqDBTaxInfo::GetTaxNames().

Modified on Tue May 21 10:57:56 2024 by modify_doxy.py rev. 669887