92 string extn2(extn, 2, 4);
98 (extn[1] ==
'n' || extn[1] ==
'p') &&
99 (extn2 ==
"al" || extn2 ==
"in" || extn2 ==
"db")) {
138 int extn_amt = extn ? (extn->
Size()+1) : 0;
146 bool only_two =
false;
148 if (one.
Empty() || two[0] == delim) {
163 outp.reserve(two.
Size() + extn_amt);
173 outp.reserve(one.
Size() + two.
Size() + 1 + extn_amt);
177 if (outp[outp.size() - 1] != delim) {
195 if (x1 != x2)
return (x1 < x2);
196 else return (s1 < s2);
235 bool linkoutdb_search)
238 path.reserve(
dbname.size() + 4);
241 if (linkoutdb_search) {
243 path.append(
".sqlite3");
250 path[path.size()-3] = dbtype;
256 path[path.size()-2] =
'i';
257 path[path.size()-1] =
'n';
272 const char * splitter = 0;
274 #if defined(NCBI_OS_UNIX)
290 for(
size_t i = 0;
i<dbs.size();
i++) {
291 if (dbs[
i] ==
'/' || dbs[
i] ==
'\\') {
337 bool linkoutdb_search =
false)
347 vector<string> roads;
353 ITERATE(vector<string>, road, roads) {
383 const string path=
"")
474 const char dbtype(
'p');
492 size_t newlen =
a.length() +
b.length() + delim.length();
494 if (
a.capacity() < newlen) {
497 while(newcap < newlen) {
510 : m_CurrentOrder(
eNone), m_MaskOpts(0)
540 return lhs.
gi < rhs.
gi;
569 return lhs.
ti < rhs.
ti;
584 return lhs.
si < rhs.
si;
589 template<
class TCompare,
class TVector>
594 TCompare compare_less;
596 for(
int i = 1;
i < (
int) v.size();
i++) {
597 if (compare_less(v[
i], v[
i-1])) {
604 sort(v.begin(), v.end(), compare_less);
625 "Out of sequence sort order requested.");
637 s_InsureOrder<CSeqDB_SortGiLessThan>(
m_GisOids);
638 s_InsureOrder<CSeqDB_SortTiLessThan>(
m_TisOids);
639 s_InsureOrder<CSeqDB_SortSiLessThan>(
m_SisOids);
640 s_InsureOrder<CSeqDB_SortPigLessThan>(
m_PigsOids);
646 "Unrecognized sort order requested.");
656 int oid(0), index(0);
657 return (
const_cast<CSeqDBGiList *
>(
this))->GiToOid(gi, oid, index);
664 return GiToOid(gi, oid, index);
680 }
else if (m_gi > gi) {
696 int oid(0), index(0);
697 return (
const_cast<CSeqDBGiList *
>(
this))->TiToOid(ti, oid, index);
704 return TiToOid(ti, oid, index);
720 }
else if (m_ti > ti) {
735 int oid(0), index(0);
736 return (
const_cast<CSeqDBGiList *
>(
this))->SiToOid(
si, oid, index);
757 }
else if (
si < m_si) {
777 gis.push_back(itr->gi);
788 pigs.push_back(itr->pig);
799 tis.push_back(itr->ti);
811 sis.push_back(itr->si);
826 Int4 num_gis = (
Int4) (endp - beginp) - 2;
830 if (((endp - beginp) < 2U)
831 || (beginp[0] != 0xFFFFFFFFU)
835 "Specified file is not a valid binary GI file.");
838 gis.reserve(num_gis);
840 for(
Uint4 * elem = (beginp + 2); elem < endp; ++elem) {
855 bool& has_long_ids,
bool* has_tis =
NULL)
858 has_long_ids =
false;
861 Uint8 file_size = fendp - fbeginp;
863 if (file_size == 0) {
866 "Specified file is empty.");
867 }
else if (
isdigit((
unsigned char)(*((
char*) fbeginp))) ||
868 ((
unsigned char)(*((
char*) fbeginp)) ==
'#')) {
870 }
else if ((file_size >= 8) && ((*fbeginp & 0xFF) == 0xFF)) {
873 int marker = fbeginp[3] & 0xFF;
875 if (marker == 0xFE || marker == 0xFC) {
878 if (has_tis && (marker == 0xFD || marker == 0xFC)) {
884 "Specified file is not a valid GI/TI list.");
918 string msg =
string(
"Invalid byte in text" + list_type +
" list [") +
927 vector<CSeqDBGiList::SGiOid> & gis,
930 bool long_ids =
false;
931 Uint8 file_size = fendp - fbeginp;
938 Uint8 num_gis = bendp - bbeginp - 2;
942 if ((bbeginp[0] != 0xFFFFFFFFU)
946 "Specified file is not a valid binary GI file.");
949 gis.reserve(num_gis);
953 bool in_gi_order =
true;
955 Uint4* elem = bbeginp + 2;
956 while(elem < bendp) {
958 gis.push_back(this_gi);
960 if (prev_gi > this_gi) {
968 while(elem < bendp) {
972 *in_order = in_gi_order;
974 for(
Uint4 * elem = (bbeginp + 2); elem < bendp; ++elem) {
985 gis.reserve((
int) (file_size / 7));
988 const string list_type(
"GI");
990 for(
const char * p = fbeginp; p < fendp; p ++) {
1007 vector<CSeqDBGiList::SPigOid> & pigs,
1010 bool long_ids =
false;
1011 Int8 file_size = fendp - fbeginp;
1017 Int4 num_pigs = (
Int4) (bendp - bbeginp) - 2;
1021 if (((bendp - bbeginp) < 2U)
1022 || (bbeginp[0] != 0xFFFFFFFFU)
1026 "Specified file is not a valid binary IPG file.");
1029 pigs.reserve(num_pigs);
1035 Uint4* elem = bbeginp + 2;
1036 while(elem < bendp) {
1038 pigs.push_back(this_pig);
1040 if (prev_pig > this_pig) {
1044 prev_pig = this_pig;
1048 while(elem < bendp) {
1054 for(
Uint4 * elem = (bbeginp + 2); elem < bendp; ++elem) {
1059 pigs.reserve((
int) (file_size / 7));
1062 const string list_type(
"IPG");
1064 for(
const char * p = fbeginp; p < fendp; p ++) {
1069 pigs.push_back(elem);
1084 bool long_ids =
false;
1089 Uint8 num_taxids = (bendp - bbeginp) - 2;
1092 taxids.
oids.clear();
1094 if (((bendp - bbeginp) < 2) || (bbeginp[0] != 0xFFFFFFFF)
1097 "Specified file is not a valid binary Tax Id List file.");
1100 for(
Int4 * elem = (bbeginp + 2); elem < bendp; ++elem) {
1105 const string list_type(
"TAXID");
1107 for(
const char * p = fbeginp; p < fendp; p ++) {
1151 vector<CSeqDBGiList::STiOid> & tis,
1154 bool long_ids =
false;
1155 Int8 file_size = fendp - fbeginp;
1160 Int4 * bdatap = bbeginp + 2;
1162 Uint4 num_tis = (
int)(bendp-bdatap);
1164 int remainder = num_tis % 2;
1172 bool bad_fmt =
false;
1174 if (bendp < bdatap) {
1180 if ((marker != -3 && marker != -4) ||
1181 (num_ids != num_tis) ||
1182 (remainder && long_ids)) {
1191 "Specified file is not a valid binary GI or TI file.");
1194 tis.reserve(num_tis);
1202 bool in_ti_order =
true;
1204 Int8 * elem = bdatap8;
1206 while(elem < bendp8) {
1208 tis.push_back(this_ti);
1210 if (prev_ti > this_ti) {
1211 in_ti_order =
false;
1218 while(elem < bendp8) {
1222 *in_order = in_ti_order;
1224 for(
Int8 * elem = bdatap8; elem < bendp8; elem ++) {
1231 bool in_ti_order =
true;
1233 Int4 * elem = bdatap;
1235 while(elem < bendp) {
1237 tis.push_back(this_ti);
1239 if (prev_ti > this_ti) {
1240 in_ti_order =
false;
1247 while(elem < bendp) {
1251 *in_order = in_ti_order;
1253 for(
Int4 * elem = bdatap; elem < bendp; elem ++) {
1264 tis.reserve(
int(file_size / 7));
1267 const string list_type(
"TI");
1269 for(
const char * p = fbeginp; p < fendp; p ++) {
1273 tis.push_back(elem);
1286 vector<CSeqDBGiList::SSiOid> & sis,
1289 Int8 file_size = fendp - fbeginp;
1296 sis.reserve(sis.size() +
int(file_size / 7));
1298 const char * p = fbeginp;
1300 while ( p < fendp) {
1302 while (p< fendp && (*p==
'>' || *p==
' ' || *p==
'\t' || *p==
'\n' || *p==
'\r')) ++p;
1303 if (p< fendp && *p ==
'#') {
1305 while (p< fendp && *p!=
'\n') ++p;
1309 while (p< fendp && *p!=
' ' && *p!=
'\t' && *p!=
'\n' && *p!=
'\r') ++p;
1311 string acc(
head, p);
1314 sis.push_back(str_id);
1316 cerr <<
"WARNING: " << acc
1317 <<
" is not a valid seqid string." << endl;
1321 if (in_order) *in_order =
false;
1326 vector<CSeqDBGiList::SGiOid> & gis,
1327 vector<CSeqDBGiList::STiOid> & tis,
1328 vector<CSeqDBGiList::SSiOid> & sis,
1331 Int8 file_size = fendp - fbeginp;
1338 sis.reserve(sis.size() +
int(file_size / 7));
1340 const char * p = fbeginp;
1342 while ( p < fendp) {
1344 while (p< fendp && (*p==
'>' || *p==
' ' || *p==
'\t' || *p==
'\n' || *p==
'\r')) ++p;
1345 if (p< fendp && *p ==
'#') {
1347 while (p< fendp && *p!=
'\n') ++p;
1351 while (p< fendp && *p!=
' ' && *p!=
'\t' && *p!=
'\n' && *p!=
'\r') ++p;
1353 string acc(
head, p);
1361 else if (
eTiId == id_type) {
1362 tis.push_back((
TTi) num_id);
1364 else if (
eGiId == id_type) {
1368 cerr <<
"WARNING: " << acc
1369 <<
" is not a valid seqid string." << endl;
1373 if (in_order) *in_order =
false;
1381 const char * fbeginp = (
char*) mfile.
GetPtr();
1382 const char * fendp = fbeginp + (
int)file_size;
1384 bool ignore =
false;
1385 bool has_tis =
false;
1390 retval = has_tis && retval;
1405 void SeqDB_ReadGiList(
const string & fname, vector<CSeqDBGiList::SGiOid> & gis,
bool * in_order)
1410 const char * fbeginp = (
char*) mfile.
GetPtr();
1411 const char * fendp = fbeginp + file_size;
1417 void SeqDB_ReadTiList(
const string & fname, vector<CSeqDBGiList::STiOid> & tis,
bool * in_order)
1422 const char * fbeginp = (
char*) mfile.
GetPtr();
1423 const char * fendp = fbeginp + file_size;
1429 vector<CSeqDBGiList::STiOid> & tis, vector<CSeqDBGiList::SSiOid> & sis,
bool * in_order)
1434 const char *fbeginp = (
char*) mfile.
GetPtr();
1435 const char *fendp = fbeginp + file_size;
1445 const char * fbeginp = (
char*) mfile.
GetPtr();
1446 const char * fendp = fbeginp + file_size;
1456 const char * fbeginp = (
char*) mfile.
GetPtr();
1457 const char * fendp = fbeginp + file_size;
1464 typedef vector<CSeqDBGiList::SGiOid> TPairList;
1469 gis.reserve(pairs.size());
1471 ITERATE(TPairList, iter, pairs) {
1472 gis.push_back(iter->gi);
1485 const char *fbeginp = (
char*) mfile.
GetPtr();
1486 const char *fendp = fbeginp + file_size;
1494 int b(0), e((
int)
m_Gis.size());
1502 }
else if (m_gi > gi) {
1517 int b(0), e((
int)
m_Tis.size());
1525 }
else if (m_ti > ti) {
1537 bool match_type =
false;
1538 return FindId(
id, match_type);
1545 int b(0), e((
int)
m_Sis.size());
1549 string m_si =
m_Sis[m];
1553 }
else if (m_si >
si) {
1567 match_type = (
GetNumGis() > 0) ?
true :
false;
1571 }
else if (
id.IsGeneral() &&
id.GetGeneral().GetDb() ==
"ti") {
1572 match_type = (
GetNumTis() > 0) ?
true :
false;
1575 const CObject_id & obj =
id.GetGeneral().GetTag();
1584 match_type = (
GetNumSis() > 0) ?
true :
false;
1602 size_t pos = str_id.find(
".");
1603 if (pos != str_id.npos) {
1604 string nover(str_id, 0, pos);
1636 return FindGi(
id.GetGi());
1637 }
else if (
id.IsGeneral() &&
id.GetGeneral().GetDb() ==
"ti") {
1638 const CObject_id & obj =
id.GetGeneral().GetTag();
1654 if (
FindSi(str_id))
return true;
1657 size_t pos = str_id.find(
".");
1658 if (pos != str_id.npos) {
1659 string nover(str_id, 0, pos);
1669 bool in_order =
false;
1722 for(
unsigned i = 0;
i < dbs.size();
i++) {
1728 for(
unsigned i = 0;
i < dbs.size();
i++) {
1733 if (dbs[
i].find(
" ") != string::npos) {
1745 vector<CTempString> & dbs,
1748 vector<CSeqDB_Substring> subs;
1753 dbs.reserve(subs.size());
1755 ITERATE(vector<CSeqDB_Substring>, iter, subs) {
1763 vector<CSeqDB_Substring> & dbs,
1768 const char * sp =
dbname.data();
1770 bool quoted =
false;
1773 for(
unsigned i = 0;
i <
dbname.size();
i++) {
1794 }
else if (ch ==
'"') {
1798 begin = keep_quote ?
i :
i + 1;
1804 if (begin <
dbname.size()) {
1815 sort(gis.begin(), gis.end());
1820 int gis_n = (
int) gis.size();
1822 while(list_i < list_n && gis_i < gis_n) {
1849 sort(gis.begin(), gis.end());
1854 int gis_n = (
int) gis.size();
1856 while(list_i < list_n && gis_i < gis_n) {
1873 TGi last_gi = gis[gis_i];
1874 do { gis_i++; }
while (gis_i < gis_n && gis[gis_i] == last_gi);
1878 while (gis_i < gis_n) {
1904 #ifdef NCBI_STRICT_GI
1920 sort(ids.begin(), ids.end());
1921 ids.erase(unique(ids.begin(), ids.end()), ids.end());
1927 sort(ids.begin(), ids.end());
1928 ids.erase(unique(ids.begin(), ids.end()), ids.end());
1947 incl_A = incl_B = incl_AB =
false;
1976 if ((! A_pos) && (! B_pos)) {
1979 A_pos = B_pos =
true;
1984 if ((! A_pos) || (! B_pos)) {
1993 result_pos = A_pos == B_pos;
2009 incl_AB = A_pos && B_pos;
2014 incl_A = incl_B = incl_AB =
true;
2018 incl_AB = (A_pos != B_pos);
2019 incl_A = incl_B = ! incl_AB;
2029 const vector<Int8> &
A,
2031 const vector<Int8> &
B,
2048 size_t A_i(0), B_i(0);
2050 while((A_i <
A.size()) && (B_i <
B.size())) {
2051 Int8 Ax(
A[A_i]),
Bx(
B[B_i]), target(-1);
2052 bool included(
false);
2058 }
else if (Ax >
Bx) {
2070 result.push_back(target);
2075 while(A_i <
A.size()) {
2081 while(B_i <
B.size()) {
2088 const vector<Int4> & ids,
2097 bool result_pos(
true);
2112 const vector<Int8> & ids,
2120 bool result_pos(
true);
2135 const vector<Uint8> & ids,
2143 bool result_pos(
true);
2162 "Set operation requested but ID types don't match.");
2166 bool result_pos(
true);
2187 "Positive ID list requested but only negative exists.");
2200 _ASSERT(((*iter) >> 32) == 0);
2213 "Negative ID list requested but only positive exists.");
2228 _ASSERT(((*iter) >> 32) == 0);
2244 : m_Positive (
false),
2257 const string &
text)
2259 string msg =
"Validation failed: [" +
text +
"] at ";
2274 bool matched =
true;
2276 switch(bestid.
Which()) {
2294 if (dbt.
GetDb() ==
"BL_ORD_ID") {
2301 if (dbt.
GetDb() ==
"PIG") {
2308 if (dbt.
GetDb() ==
"ti") {
2344 if (objid.
IsStr()) {
2446 size_t vbar =
str.find(
'|', pos);
2448 if (vbar == string::npos) {
2449 return string::npos;
2452 string portion(
str, pos, vbar - pos);
2458 size_t vbar_prev = vbar;
2461 vbar =
str.find(
'|', vbar_prev + 1);
2463 if (vbar == string::npos) {
2467 int start_pt =
int(vbar_prev + 1);
2468 string element(
str, start_pt, vbar - start_pt);
2478 return string::npos;
2481 return (vbar == string::npos) ?
str.size() : vbar;
2505 while (pos < line.size()) {
2508 if (end == string::npos) {
2514 string element(line, pos, end - pos);
2521 catch(invalid_argument &) {
2526 seqids.push_back(
id);
2530 return ! seqids.empty();
2543 vector< CRef< CSeq_id > > seqid_set;
2556 list< CRef<CSeq_id> > seqids;
2564 if (!seqids.empty() && seqids.front()->IsPdb() &&
2565 acc.find(
"_") != string::npos) {
2567 str_id = seqids.front()->AsFastaString();
2570 else if (!seqids.empty() && seqids.front()->IsLocal()) {
2572 if( acc.find(
":") != string::npos) {
2573 static const char* GNL_DBs[] = {
"CDD",
"SRA",
"TSA",
"GNOMON",
NULL};
2574 string db_tag, gnl_id;
2576 const char** p = GNL_DBs;
2577 for (; p && *p; ++p) {
2579 str_id =
"gnl|" + db_tag +
"|" + gnl_id;
2580 seqids.front().Reset();
2582 seqids.front() = new_id;
2593 str_id =
"lcl|" + acc;
2614 bool simpler(
false);
2627 const string kExtnMol(1, db_is_protein ?
'p' :
'n');
2629 extn.push_back(kExtnMol +
"al");
2630 extn.push_back(kExtnMol +
"in");
2631 extn.push_back(kExtnMol +
"hr");
2632 extn.push_back(kExtnMol +
"sq");
2633 extn.push_back(kExtnMol +
"ni");
2634 extn.push_back(kExtnMol +
"nd");
2636 extn.push_back(kExtnMol +
"si");
2637 extn.push_back(kExtnMol +
"sd");
2639 extn.push_back(kExtnMol +
"pi");
2640 extn.push_back(kExtnMol +
"pd");
2641 extn.push_back(kExtnMol +
"js");
2643 vector<string> lmdbs;
2645 extn.insert(extn.end(), lmdbs.begin(), lmdbs.end());
2648 extn.push_back(kExtnMol +
"aa");
2649 extn.push_back(kExtnMol +
"ab");
2650 extn.push_back(kExtnMol +
"ac");
2651 extn.push_back(kExtnMol +
"og");
2652 extn.push_back(kExtnMol +
"hi");
2653 extn.push_back(kExtnMol +
"hd");
2654 extn.push_back(kExtnMol +
"ti");
2655 extn.push_back(kExtnMol +
"td");
2662 static const char * ext[]={
"db",
"os",
"ot",
"tf",
"to",
"db-lock",
"tf-lock",
NULL};
2664 const string kExtnMol(1, db_is_protein ?
'p' :
'n');
2665 for(
const char ** p=ext; *p !=
NULL; p++) {
2666 extn.push_back(kExtnMol + (*p));
2673 const string kExtnMol(1, db_is_protein ?
'p' :
'n');
2674 extn = kExtnMol +
"js";
2680 switch(
id.Which()) {
2686 const CDbtag & dbt =
id.GetGeneral();
2710 return (db_is_protein ?
"pxm":
"nxm");
static int GetSeqidlist(CMemoryFile &file, vector< CSeqDBGiList::SSiOid > &idlist, SBlastSeqIdListInfo &list_info)
Get seqidlist from dbv5 seqidlist file.
CIntersectionGiList(CSeqDBGiList &gilist, vector< TGi > &gis)
Construct an intersection of two lists of GIs.
bool DoesFileExist(const string &fname)
Check if file exists.
const string GetSearchPath() const
Get BlastDB search path.
static const string GenerateSearchPath()
Generate search path.
@ eFileErr
Files were missing or contents were incorrect.
CSeqDBFileGiList(const string &fname, EIdType idtype=eGiList)
Build a GI list from a file.
void AddTi(TTi ti)
Add a new TI to the list.
vector< SGiOid > m_GisOids
Pairs of GIs and OIDs.
int GetNumGis() const
Get the number of GIs in the array.
bool GiToOid(TGi gi, int &oid)
Try to find a GI and return the associated OID.
const SGiOid & GetGiOid(int index) const
Access an element of the array.
vector< SPigOid > m_PigsOids
bool FindSi(const string &si) const
int GetNumSis() const
Get the number of Seq-ids in the array.
bool SiToOid(const string &si, int &oid)
CSeqDBGiList()
Constructor.
void GetPigList(vector< TPig > &pigs) const
void GetGiList(vector< TGi > &gis) const
Get the gi list.
bool TiToOid(TTi ti, int &oid)
Try to find a TI and return the associated OID.
bool FindTi(TTi ti) const
Test for existence of a TI.
void GetTiList(vector< TTi > &tis) const
Get the ti list.
int GetNumTis() const
Get the number of TIs in the array.
vector< STiOid > m_TisOids
Pairs of TIs and OIDs.
SBlastSeqIdListInfo m_ListInfo
void GetSiList(vector< string > &sis) const
TODO Get the seqid list?
ESortOrder
Possible sorting states.
@ eNone
The array is unsorted or the sortedness is unknown.
@ eGi
The array is sorted by GI.
void AddGi(TGi gi)
Add a new GI to the list.
void ReserveGis(size_t n)
Reserve space for GIs.
void ReserveTis(size_t n)
Reserve space for TIs.
void PreprocessIdsForISAMSiLookup()
Preprocess ids for ISAM string id lookup.
bool FindGi(TGi gi) const
Test for existence of a GI.
void InsureOrder(ESortOrder order)
Sort if necessary to insure order of elements.
vector< SSiOid > m_SisOids
Pairs of Seq-ids and OIDs.
bool FindId(const CSeq_id &id)
Test for existence of a Seq-id by type.
ESortOrder m_CurrentOrder
Indicates the current sort order, if any, of this container.
Helper class to allow copy-on-write semantics for CSeqDBIdSet.
const vector< Int8 > & Get() const
Access the Int8 set.
size_t Size() const
Get the number of elements stored here.
vector< Int8 > & Set()
Access the Int8 set.
vector< string > & SetSeqIDs()
Access the string set.
SeqDB ID list for performing boolean set operations.
CRef< CSeqDBIdSet_Vector > m_Ids
Ids stored here.
static void x_SortAndUnique(vector< Int8 > &ids)
Sort and unique the internal set.
void x_BooleanSetOperation(EOperation op, const vector< Int8 > &A, bool A_pos, const vector< Int8 > &B, bool B_pos, vector< Int8 > &result, bool &result_pos)
Compute boolean operation on two vectors.
static void x_SummarizeBooleanOp(EOperation op, bool A_pos, bool B_pos, bool &result_pos, bool &incl_A, bool &incl_B, bool &incl_AB)
Compute inclusion flags for a boolean operation.
CSeqDBIdSet()
Construct a 'blank' CSeqDBIdSet object.
bool m_Positive
True if the current list is positive.
void Negate()
Invert the current list.
bool Blank() const
Check if an ID list is blank.
EIdType
Type of IDs stored here.
EOperation
Types of operations that may be performed on GI lists.
void Compute(EOperation op, const vector< int > &ids, bool positive=true)
Perform a logical operation on a list.
CRef< CSeqDBNegativeList > GetNegativeList()
Retrieve a negative GI list.
CRef< CSeqDBGiList > GetPositiveList()
Retrieve a positive GI list.
void AddSi(const string &si)
Add a new SeqId to the list.
void AddGi(TGi gi)
Add a new GI to the list.
void ReserveGis(size_t n)
Reserve space for GIs.
void AddTi(TTi ti)
Add a new TI to the list.
int GetNumTis() const
Get the number of TIs in the array.
bool FindId(const CSeq_id &id, bool &match_type)
Test for existence of a TI or GI here and report whether the ID was one of those types.
void ReserveTis(size_t n)
Reserve space for TIs.
vector< TTi > m_Tis
TIs to exclude from the SeqDB instance.
TGi GetGi(int index) const
Access an element of the GI array.
vector< string > m_Sis
SeqIds to exclude from the SeqDB instance.
bool FindTi(TTi ti)
Test for existence of a TI.
void PreprocessIdsForISAMSiLookup()
int GetNumGis() const
Get the number of GIs in the array.
bool FindGi(TGi gi)
Test for existence of a GI.
size_t m_LastSortSize
Zero if unsorted, or the size it had after the last sort.
vector< TGi > m_Gis
GIs to exclude from the SeqDB instance.
int GetNumSis() const
Get the number of SeqIds in the array.
void ReserveSis(size_t n)
void InsureOrder()
Sort list if not already sorted.
Check file existence using CSeqDBAtlas.
CSeqDB_AtlasAccessor(CSeqDBAtlas &atlas)
Constructor.
virtual bool DoesFileExist(const string &fname)
Test file existence.
File existence test interface.
virtual ~CSeqDB_FileExistence()
Destructor.
virtual bool DoesFileExist(const string &fname)=0
Check if file exists at fully qualified path.
CSeqDB_Substring FindBaseName() const
Returns the portion of this path containing the base name.
Check file existence using CFile.
virtual bool DoesFileExist(const string &fname)
Test file existence.
CSeqDB_SimpleAccessor()
Constructor.
Compare SGiOid structs by GI.
int operator()(const CSeqDBGiList::SGiOid &lhs, const CSeqDBGiList::SGiOid &rhs)
Test whether lhs is less than (occurs before) rhs.
Compare SGiOid structs by OID.
int operator()(const CSeqDBGiList::SGiOid &lhs, const CSeqDBGiList::SGiOid &rhs)
Test whether lhs is less than (occurs before) rhs.
int operator()(const CSeqDBGiList::SPigOid &lhs, const CSeqDBGiList::SPigOid &rhs)
Test whether lhs is less than (occurs before) rhs.
Compare SSiOid structs by SeqId.
int operator()(const CSeqDBGiList::SSiOid &lhs, const CSeqDBGiList::SSiOid &rhs)
Test whether lhs is less than (occurs before) rhs.
Compare STiOid structs by TI.
int operator()(const CSeqDBGiList::STiOid &lhs, const CSeqDBGiList::STiOid &rhs)
Test whether lhs is less than (occurs before) rhs.
void GetString(string &s) const
Return the data by assigning it to a string.
int Size() const
Return the length of the string in bytes.
void Clear()
Reset the string to an empty state.
const char * GetEnd() const
Returns a pointer to the end of the string, which is always a pointer to the character past the last ...
void EraseFront(int n)
Disinclude data from the beginning of the string.
void Resize(int n)
Change the length of the string.
const char * GetBegin() const
Returns a pointer to the start of the string.
bool Empty() const
Returns true iff the string is empty.
int FindLastOf(char ch) const
Find last instance of a character in the substring.
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
iterator_bool insert(const value_type &val)
static const char si[8][64]
static const char * str(char *buf, int n)
#define GI_FROM(T, value)
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
#define TAX_ID_FROM(T, value)
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
size_t GetSize(void) const
Get length of the mapped region.
void * GetPtr(void) const
Get pointer to beginning of data.
static char GetPathSeparator(void)
Get path separator symbol specific for the current platform.
const string AsFastaString(void) const
string GetSeqIdString(bool with_version=false) const
Return seqid string with optional version for text seqid type.
void GetLabel(string *label, ELabelType type=eDefault, TLabelFlags flags=fLabel_Default) const
Append a label for this Seq-id to the supplied string.
static SIZE_TYPE ParseFastaIds(CBioseq::TId &ids, const CTempString &s, bool allow_partial_failure=false)
Parse an entire set of |-delimited FASTA-style IDs, appending the results to IDS.
static E_Choice WhichInverseSeqId(const CTempString &SeqIdCode)
Converts a string to a choice, no need to require a member.
static int BestRank(const CRef< CSeq_id > &id)
const CTextseq_id * GetTextseq_Id(void) const
Return embedded CTextseq_id, if any.
@ fLabel_Version
Show the version.
@ fLabel_GeneralDbIsContent
For type general, use the database name as the tag and the (text or numeric) key as the content.
@ eFasta
Tagged ID in NCBI's traditional FASTA style.
int32_t Int4
4-byte (32-bit) signed integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
int64_t Int8
8-byte (64-bit) signed integer
uint64_t Uint8
8-byte (64-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
static Int8 StringToInt8(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to Int8.
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
static string UIntToString(unsigned int value, TNumToStringFlags flags=0, int base=10)
Convert UInt to string.
static bool SplitInTwo(const CTempString str, const CTempString delim, string &str1, string &str2, TSplitFlags flags=0)
Split a string into two pieces using the specified delimiters.
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
static string TruncateSpaces(const string &str, ETrunc where=eTrunc_Both)
Truncate whitespace in a string.
static string & ToLower(string &str)
Convert string to lower case – string& version.
@ fConvErr_NoThrow
Do not throw an exception on error.
@ fSplit_Tokenize
All delimiters are merged and trimmed, to get non-empty tokens only.
@ eTrunc_Both
Truncate whitespace at both begin and end of string.
C::value_type FindBestChoice(const C &container, F score_func)
Find the best choice (lowest score) for values in a container.
bool IsStr(void) const
Check if variant Str is selected.
const TTag & GetTag(void) const
Get the Tag member data.
bool IsId(void) const
Check if variant Id is selected.
bool CanGetDb(void) const
Check if it is safe to call GetDb method.
bool CanGetTag(void) const
Check if it is safe to call GetTag method.
const TDb & GetDb(void) const
Get the Db member data.
const TStr & GetStr(void) const
Get the variant data.
TId GetId(void) const
Get the variant data.
TGibbsq GetGibbsq(void) const
Get the variant data.
const TName & GetName(void) const
Get the Name member data.
bool CanGetName(void) const
Check if it is safe to call GetName method.
bool IsPrf(void) const
Check if variant Prf is selected.
E_Choice Which(void) const
Which variant is currently selected.
TGi GetGi(void) const
Get the variant data.
TVersion GetVersion(void) const
Get the Version member data.
bool CanGetVersion(void) const
Check if it is safe to call GetVersion method.
const TLocal & GetLocal(void) const
Get the variant data.
bool CanGetAccession(void) const
Check if it is safe to call GetAccession method.
const TGeneral & GetGeneral(void) const
Get the variant data.
bool IsPir(void) const
Check if variant Pir is selected.
const TAccession & GetAccession(void) const
Get the Accession member data.
@ e_Other
for historical reasons, 'other' = 'refseq'
@ e_Gpipe
Internal NCBI genome pipeline processing ID.
@ e_Tpe
Third Party Annot/Seq EMBL.
@ e_Tpd
Third Party Annot/Seq DDBJ.
@ e_Gibbsq
Geninfo backbone seqid.
@ e_General
for other databases
@ e_Gi
GenInfo Integrated Database.
@ e_not_set
No variant selected.
@ e_Tpg
Third Party Annot/Seq Genbank.
char * dbname(DBPROCESS *dbproc)
Get name of current database.
unsigned int
A callback function used to compare two keys in a database.
where boath are integers</td > n< td ></td > n</tr > n< tr > n< td > tse</td > n< td > optional</td > n< td > String</td > n< td class=\"description\"> TSE option controls what blob is whole
static void text(MDB_val *v)
constexpr auto sort(_Init &&init)
constexpr auto front(list< Head, As... >, T=T()) noexcept -> Head
const string version
version string
const struct ncbi::grid::netcache::search::fields::SIZE size
Defines unified interface to application:
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
The SeqDB memory management layer.
bool SeqDB_IsBinaryTiList(const string &fname)
Returns true if the file name passed contains a binary TI list.
void SeqDB_ReadPigList(const string &fname, vector< CSeqDBGiList::SPigOid > &pigs, bool *in_order)
void SeqDB_ReadMemoryPigList(const char *fbeginp, const char *fendp, vector< CSeqDBGiList::SPigOid > &pigs, bool *in_order)
static bool s_SeqDB_DBExists(const string &dbname, char dbtype, CSeqDB_FileExistence &access, bool linkoutdb_search)
Test whether an index or alias file exists.
void SeqDB_ReadGiList(const string &fname, vector< CSeqDBGiList::SGiOid > &gis, bool *in_order)
Read a text or binary GI list from a file.
void SeqDB_GetLMDBFileExtensions(bool db_is_protein, vector< string > &extn)
Retrieves file extensions for BLAST LMDB files.
void SeqDB_ReadMemorySiList(const char *fbeginp, const char *fendp, vector< CSeqDBGiList::SSiOid > &sis, bool *in_order)
Read a text SeqID list from an area of memory.
void SeqDB_ReadBinaryGiList(const string &fname, vector< TGi > &gis)
Read a binary-format GI list from a file.
void SeqDB_ReadMemoryGiList(const char *fbeginp, const char *fendp, vector< CSeqDBGiList::SGiOid > &gis, bool *in_order)
Read a text or binary GI list from an area of memory.
CSeqDB_Substring SeqDB_RemoveExtn(CSeqDB_Substring s)
Returns a filename minus its extension.
bool SeqDB_CompareVolume(const string &s1, const string &s2)
Compares two volume file names and determine the volume order.
ESeqDBIdType SeqDB_SimplifySeqid(CSeq_id &bestid, const string *acc, Int8 &num_id, string &str_id, bool &simpler)
Seq-id simplification.
void SeqDB_ReadMemoryMixList(const char *fbeginp, const char *fendp, vector< CSeqDBGiList::SGiOid > &gis, vector< CSeqDBGiList::STiOid > &tis, vector< CSeqDBGiList::SSiOid > &sis, bool *in_order)
Read an ID list (mixed type) from an area of memory.
string SeqDB_FindBlastDBPath(const string &dbname, char dbtype, string *sp, bool exact, CSeqDBAtlas &atlas)
Finds a file in the search path.
string GetBlastSeqIdString(const CSeq_id &seqid, bool version)
Return ID string as stored in lmdb.
void SeqDB_ReadMixList(const string &fname, vector< CSeqDBGiList::SGiOid > &gis, vector< CSeqDBGiList::STiOid > &tis, vector< CSeqDBGiList::SSiOid > &sis, bool *in_order)
Read an ID list (mixed type) from a file.
static string s_SeqDB_FindBlastDBPath(const string &dbname, char dbtype, string *sp, bool exact, CSeqDB_FileExistence &access, const string path="")
void SeqDB_SplitQuoted(const string &dbname, vector< CTempString > &dbs, bool keep_quote)
Split a (possibly) quoted list of database names into pieces.
int s_ReadDigit(const char d, const string &list_type)
static string s_SeqDB_TryPaths(const string &blast_paths, const string &dbname, char dbtype, bool exact, CSeqDB_FileExistence &access, bool linkoutdb_search=false)
Search for a file in a provided set of paths.
CSeqDB_Substring SeqDB_RemoveFileName(CSeqDB_Substring s)
Returns a path minus filename.
static string s_GetPathSplitter()
Returns the character used to separate path components in the current operating system or platform.
void SeqDB_GetFileExtensions(bool db_is_protein, vector< string > &extn, EBlastDbVersion dbver)
Retrieves a list of all supported file extensions for BLAST databases.
bool IsStringId(const CSeq_id &id)
Determine if id is a string id.
void SeqDB_JoinDelim(string &a, const string &b, const string &delim)
Join two strings with a delimiter.
ESeqDBIdType SeqDB_SimplifyAccession(const string &acc, Int8 &num_id, string &str_id, bool &simpler)
String id simplification.
static bool s_SeqDB_ParseSeqIDs(const string &line, vector< CRef< CSeq_id > > &seqids)
Parse string into a sequence of Seq-id objects.
bool SeqDB_IsBinaryGiList(const string &fname)
Returns true if the file name passed contains a binary GI list.
void s_InsureOrder(TVector &v)
void SeqDB_FileIntegrityAssert(const string &file, int line, const string &text)
Report file corruption by throwing an eFile CSeqDBException.
CSeqDB_Substring SeqDB_RemoveDirName(CSeqDB_Substring s)
Returns a filename minus greedy path.
void SeqDB_ReadTaxIdList(const string &fname, CSeqDBGiList::STaxIdsOids &taxids)
void SeqDB_ReadSiList(const string &fname, vector< CSeqDBGiList::SSiOid > &sis, bool *in_order, SBlastSeqIdListInfo &db_info)
Read a text SeqId list from a file.
const string SeqDB_GetOidMaskFileExt(bool db_is_protein, EOidMaskType t)
string SeqDB_ResolveDbPathForLinkoutDB(const string &filename)
Resolve a file path using SeqDB's path algorithms.
void SeqDB_ReadMemoryTaxIdList(const char *fbeginp, const char *fendp, CSeqDBGiList::STaxIdsOids &taxids)
void SeqDB_ReadMemoryTiList(const char *fbeginp, const char *fendp, vector< CSeqDBGiList::STiOid > &tis, bool *in_order)
Read a text or binary TI list from an area of memory.
void SeqDB_CombineAndQuote(const vector< string > &dbs, string &dbname)
Combine and quote list of database names.
string SeqDB_MakeOSPath(const string &dbs)
Return path with delimiters changed to platform preferred kind.
void SeqDB_ReadTiList(const string &fname, vector< CSeqDBGiList::STiOid > &tis, bool *in_order)
Read a text or binary TI list from a file.
static bool s_SeqDB_IsBinaryNumericList(const char *fbeginp, const char *fendp, bool &has_long_ids, bool *has_tis=NULL)
This function determines whether a file is a valid binary GI/TI file.
string SeqDB_ResolveDbPath(const string &filename)
Resolve a file path using SeqDB's path algorithms.
string SeqDB_ResolveDbPathNoExtension(const string &filename, char dbtype)
Resolve a file path using SeqDB's path algorithms.
void SeqDB_GetMetadataFileExtension(bool db_is_protein, string &extn)
bool SeqDB_SplitString(CSeqDB_Substring &buffer, CSeqDB_Substring &front, char delim)
Parse a prefix from a substring.
static bool s_ContainsBinaryNumericIdList(const string &fname, CSeqDBFileGiList::EIdType type)
static size_t s_SeqDB_EndOfFastaID(const string &str, size_t pos)
Find the end of a single element in a Seq-id set.
void SeqDB_CombinePath(const CSeqDB_Substring &one, const CSeqDB_Substring &two, const CSeqDB_Substring *extn, string &outp)
Combine a filesystem path and file name.
const string kSeqDBGroupAliasFileName("index.alx")
void SeqDB_ConvertOSPath(string &dbs)
Change path delimiters to platform preferred kind in-place.
Defines exception class and several constants for SeqDB.
EBlastDbVersion
BLAST database version.
ESeqDBIdType
Various identifier formats used in Id lookup.
@ eStringId
Each PIG identifier refers to exactly one protein sequence.
@ eTiId
Genomic ID is a relatively stable numeric identifier for sequences.
@ ePigId
Trace ID is a numeric identifier for Trace sequences.
@ eOID
Lookup from sequence hash values to OIDs.
This file defines several SeqDB utility functions related to byte order and file system portability.
void SeqDB_ThrowException(CSeqDBException::EErrCode code, const string &msg)
Throw a SeqDB exception; this is separated into a function primarily to allow a breakpoint to be set.
T SeqDB_GetStdOrd(const T *stdord_obj)
Read a network order integer value.
void s_SeqDB_QuickAssign(string &dst, const char *bp, const char *ep)
Higher Performance String Assignment.
static SLJIT_INLINE sljit_ins msg(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
Structure that holds GI,OID pairs.
TGi gi
The GI or 0 if unknown.
int oid
The OID or -1 if unknown.
TPig pig
The PIG or 0 if unknown.
Structure that holds Seq-id,OID pairs.
string si
The String-id or "" if unknown.
vector< blastdb::TOid > oids
Structure that holds TI,OID pairs.
TTi ti
The TI or 0 if unknown.
Blast DB v5 seqid list info.
vector< CRef< CSeq_id > > TIdList