49 #define NOT_AVAILABLE "N/A"
65 }
else if (
id.IsGi()) {
69 }
else if (
id.IsPig()) {
72 string acc(
id.GetStringId());
81 target_seq_id = &(*seq_id);
88 "Entry not found in BLAST database");
94 "Entry found in BLAST database has invalid length");
107 "start pos > length of sequence");
123 if (e.GetMsg().find("oid headers do not contain target gi") != NPOS) {
// find() returns NPOS (a non-zero value) when the text is absent and 0 when the
// message begins with it, so its result must be compared against NPOS rather
// than used directly as a boolean.
// NOTE(review): messages that do not contain this text no longer take this
// branch - confirm the enclosing handler deals with them as intended.
125 "Entry not found in BLAST database");
153 if ((*itr)->IsGi()) {
154 m_Gi = (*itr)->GetGi();
183 if ((*itr)->IsSetLinks()) {
199 retval.erase(retval.size()-1, 1);
220 (*itr)->IsSetMemberships()) {
222 (*itr)->GetMemberships()) {
320 gi2title[gi] = (*bd)->GetTitle();
362 retval = retval.erase(0, 4);
379 if ((*itr)->IsTitle()) {
380 return (*itr)->GetTitle();
393 if (taxids.
empty()) {
398 if (retval.empty()) {
425 const TTaxId kTaxID = *taxid_iter;
429 if (retval.empty()) {
436 if (retval.empty()) {
461 const TTaxId kTaxID = *taxid_iter;
465 if (retval.empty()) {
472 if (retval.empty()) {
553 #if ((defined(NCBI_COMPILER_WORKSHOP) && (NCBI_COMPILER_VERSION <= 550)) || \
554 defined(NCBI_COMPILER_MIPSPRO))
604 #define CTRL_A "\001"
608 static const string kTarget(
" >gi|");
623 return (*desc)->GetTitle();
633 static const string kStandardSeparator(
" >");
639 for (
auto token : tokens) {
645 const string kPossibleId(token, 0, pos !=
NPOS ? pos : token.length());
652 if (!seqids.empty()) {
657 retval += token.substr(pos, token.length() - pos);
659 retval += kStandardSeparator + token;
666 stringstream
out(
"");
697 if (!masked_ranges.
empty()) {
703 fasta.
SetMask(kMaskType, masks);
710 lcl_tmp = lcl_tmp.erase(0, 4);
722 CRef<CSeq_id> id = FindBestChoice(m_Bioseq->GetId(), CSeq_id::Score);
723 out << GetBareId(*id);
725 string title = s_GetTitle(*m_Bioseq.GetNonNullPointer());
726 out << ' ' << s_ConfigureDeflineTitle(title, m_UseCtrlA);
729 CScope scope(*CObjectManager::GetInstance());
730 fasta.WriteSequence(scope.AddBioseq(*m_Bioseq), range);
733 catch (const CObjmgrUtilException& e) {
734 if (e.GetErrCode() == CObjmgrUtilException::eBadLocation) {
735 NCBI_THROW(CInvalidDataException, eInvalidRange,
736 "Invalid sequence range");
742 TTaxId CBlastDBExtractor::x_ExtractTaxId()
746 if (m_Gi != ZERO_GI) {
747 if (m_Gi2TaxidMap.first != m_Oid)
749 m_Gi2TaxidMap.first = m_Oid;
750 m_BlastDb.GetTaxIDs(m_Oid, m_Gi2TaxidMap.second);
752 return m_Gi2TaxidMap.second[m_Gi];
754 // for database without Gi:
755 vector<TTaxId> taxid;
756 m_BlastDb.GetTaxIDs(m_Oid, taxid);
757 return taxid.size() ? taxid[0] : ZERO_TAX_ID;
760 void CBlastDBExtractor::x_ExtractLeafTaxIds(set<TTaxId>& taxids)
764 if (m_Gi != ZERO_GI) {
765 if (m_Gi2TaxidSetMap.first != m_Oid)
767 m_Gi2TaxidSetMap.first = m_Oid;
768 m_BlastDb.GetLeafTaxIDs(m_Oid, m_Gi2TaxidSetMap.second);
771 const set<TTaxId>& taxid_set = m_Gi2TaxidSetMap.second[m_Gi];
772 taxids.insert(taxid_set.begin(), taxid_set.end());
775 // for database without Gi:
776 vector<TTaxId> taxid;
777 m_BlastDb.GetLeafTaxIDs(m_Oid, taxid);
779 taxids.insert(taxid.begin(), taxid.end());
783 CBlastDBExtractor::x_ExtractMaskingData(CSeqDB::TSequenceRanges &ranges,
788 m_BlastDb.GetMaskData(m_Oid, algo_id, ranges);
792 void CBlastDBExtractor::SetConfig(TSeqRange range, objects::ENa_strand strand,
795 m_OrigSeqRange = range;
797 m_FiltAlgoId = filt_algo_id;
800 void CBlastDeflineUtil::ExtractDataFromBlastDeflineSet(const CBlast_def_line_set & dl_set,
801 vector<string> & results,
802 BlastDeflineFields fields,
806 CSeq_id target_seq_id (target_id, CSeq_id::fParse_PartialOK | CSeq_id::fParse_Default);
807 Int8 num_id = NStr::StringToNumeric<Int8>(target_id, NStr::fConvErr_NoThrow);
808 bool can_be_gi = errno ? false: true;
809 ITERATE(CBlast_def_line_set::Tdata, itr, dl_set.Get()) {
810 ITERATE(CBlast_def_line::TSeqid, id, (*itr)->GetSeqid()) {
811 if ((*id)->Match(target_seq_id) || (can_be_gi && (*id)->IsGi() && ((*id)->GetGi() == GI_FROM(TIntId, num_id)))) {
812 CBlastDeflineUtil::ExtractDataFromBlastDefline( **itr, results, fields, use_long_id);
818 NCBI_THROW(CException, eInvalid, "Failed to find target id " + target_id);
821 static string s_CheckName(const string & name)
823 if(name == "-") return NOT_AVAILABLE;
824 if(name == "unclassified") return NOT_AVAILABLE;
829 void CBlastDeflineUtil::ExtractDataFromBlastDefline(const CBlast_def_line & dl,
830 vector<string> & results,
831 BlastDeflineFields fields,
835 results.resize(CBlastDeflineUtil::max_index, kEmptyStr);
836 if (fields.gi == 1) {
837 results[CBlastDeflineUtil::gi] = NOT_AVAILABLE;
838 ITERATE(CBlast_def_line::TSeqid, id, dl.GetSeqid()) {
840 TGi gi = (*id)->GetGi();
841 results[CBlastDeflineUtil::gi] = NStr::NumericToString(gi);
846 if ((fields.accession == 1) || (fields.seq_id == 1)) {
847 CRef<CSeq_id> theId = FindBestChoice(dl.GetSeqid(), CSeq_id::WorstRank);
848 if(fields.seq_id == 1) {
849 results[CBlastDeflineUtil::seq_id] = theId->AsFastaString();
851 if(fields.accession == 1) {
852 results[CBlastDeflineUtil::accession] = GetBareId(*theId);
855 if(fields.title == 1) {
856 if(dl.IsSetTitle()) {
857 results[CBlastDeflineUtil::title] = dl.GetTitle();
860 results[CBlastDeflineUtil::title] = NOT_AVAILABLE;
863 if ((fields.tax_id == 1) || (fields.tax_names == 1)) {
864 TTaxId tax_id = ZERO_TAX_ID;
865 if (dl.IsSetTaxid()) {
866 tax_id = dl.GetTaxid();
869 if (fields.tax_id == 1) {
870 results[CBlastDeflineUtil::tax_id] = NStr::NumericToString(tax_id);
873 if (fields.tax_names == 1) {
875 SSeqDBTaxInfo taxinfo;
876 CSeqDB::GetTaxInfo(tax_id, taxinfo);
877 results[CBlastDeflineUtil::scientific_name] = taxinfo.scientific_name;
878 results[CBlastDeflineUtil::common_name] = taxinfo.common_name;
879 results[CBlastDeflineUtil::blast_name] = s_CheckName(taxinfo.blast_name);
880 results[CBlastDeflineUtil::super_kingdom] = s_CheckName(taxinfo.s_kingdom);
881 } catch (const CException&) {
882 results[CBlastDeflineUtil::scientific_name] = NOT_AVAILABLE;
883 results[CBlastDeflineUtil::common_name] = NOT_AVAILABLE;
884 results[CBlastDeflineUtil::blast_name] = NOT_AVAILABLE;
885 results[CBlastDeflineUtil::super_kingdom] = NOT_AVAILABLE;
890 if ((fields.leaf_node_tax_ids == 1) || (fields.leaf_node_tax_names == 1)) {
891 set<TTaxId> tax_id_set = dl.GetLeafTaxIds();
892 if (tax_id_set.empty()) {
893 if (dl.IsSetTaxid()) {
894 tax_id_set.insert(dl.GetTaxid());
897 tax_id_set.insert(ZERO_TAX_ID);
901 string separator = kEmptyStr;
902 ITERATE(set<TTaxId>, itr, tax_id_set) {
903 if (fields.leaf_node_tax_names == 1) {
905 SSeqDBTaxInfo taxinfo;
906 CSeqDB::GetTaxInfo(*itr, taxinfo);
907 results[CBlastDeflineUtil::leaf_node_scientific_names] += separator + taxinfo.scientific_name;
908 results[CBlastDeflineUtil::leaf_node_common_names] += separator + taxinfo.common_name;
909 } catch (const CException&) {
910 results[CBlastDeflineUtil::leaf_node_scientific_names] += separator + NOT_AVAILABLE;
911 results[CBlastDeflineUtil::leaf_node_common_names] += separator + NOT_AVAILABLE;
914 results[CBlastDeflineUtil::leaf_node_tax_ids] += separator + NStr::NumericToString(*itr);
915 separator = SEPARATOR;
919 if (fields.membership == 1) {
921 if(dl.IsSetMemberships()) {
922 ITERATE(CBlast_def_line::TMemberships, memb_int, dl.GetMemberships()) {
923 membership += *memb_int;
926 results[CBlastDeflineUtil::membership] = NStr::NumericToString(membership);
929 if (fields.pig == 1) {
931 if (dl.IsSetOther_info()) {
932 ITERATE(CBlast_def_line::TOther_info, itr, dl.GetOther_info()) {
939 results[CBlastDeflineUtil::pig] = NStr::NumericToString(pig);
941 if(fields.links == 1) {
942 if (dl.IsSetLinks()) {
943 ITERATE(CBlast_def_line::TLinks, links_int, dl.GetLinks()) {
944 results[CBlastDeflineUtil::links] += NStr::NumericToString(*links_int) + SEPARATOR;
948 results[CBlastDeflineUtil::links] = NOT_AVAILABLE;
952 if(fields.asn_defline == 1) {
954 tmp << MSerial_AsnText << dl;
955 results[CBlastDeflineUtil::asn_defline] = CNcbiOstrstreamToString(tmp);
959 void CBlastDeflineUtil::ProcessFastaDeflines(
966 const CSeq_id* id = bioseq.GetFirstId();
970 if (id->IsGeneral() && id->GetGeneral().GetDb() == "BL_ORD_ID") {
971 out = ">" + s_GetTitle(bioseq) + '\n';
973 else if (id->IsLocal()) {
974 string lcl_tmp = id->AsFastaString();
975 lcl_tmp = lcl_tmp.erase(0,4);
976 out = ">" + lcl_tmp + ' ' + s_GetTitle(bioseq) + '\n';
979 id = FindBestChoice(bioseq.GetId(), CSeq_id::Score);
980 out += GetBareId(*id) + ' ';
982 string title = s_GetTitle(bioseq);
983 out += s_ConfigureDeflineTitle(title, use_ctrla);
988 void CBlastDeflineUtil::ProcessFastaDeflines(
992 const CSeq_loc* location,
997 const CSeq_id* id = bioseq.GetFirstId();
1002 if (location != NULL) {
1003 TSeqPos start = location->GetStart(eExtreme_Biological) + 1;
1004 TSeqPos stop = location->GetStop(eExtreme_Biological) + 1;
1005 if (strand == eNa_strand_minus) {
1007 + NStr::IntToString(stop) + "-" + NStr::IntToString(start)
1011 + NStr::IntToString(start) + "-" + NStr::IntToString(stop)
1015 if (id->IsGeneral() && id->GetGeneral().GetDb() == "BL_ORD_ID") {
1016 out = ">" + range + s_GetTitle(bioseq) + '\n';
1018 else if (id->IsLocal()) {
1019 string lcl_tmp = id->AsFastaString();
1020 lcl_tmp = lcl_tmp.erase(0,4);
1021 out = ">" + lcl_tmp + (range.empty() ? " " : range)
1022 + s_GetTitle(bioseq) + '\n';
1025 id = FindBestChoice(bioseq.GetId(), CSeq_id::Score);
1026 out += GetBareId(*id) + (range.empty() ? " " : range);
1028 string title = s_GetTitle(bioseq);
1029 out += s_ConfigureDeflineTitle(title, use_ctrla);
1034 // Calculates hash for a buffer in IUPACna (NCBIeaa for proteins) format.
1035 // NOTE: if sequence is in a different format, the function below can be modified to convert
1036 // each byte into IUPACna encoding on the fly.
// @param buffer  Characters to hash; bytes buffer[0] .. buffer[length-1] are examined.
// @param length  Number of bytes of `buffer` to examine.
// @return The CChecksum::eCRC32ZIP checksum of all examined bytes other than '\n',
//         XOR-ed with 0xFFFFFFFF.
1037 Uint4 CBlastSeqUtil::GetSeqHash(const char* buffer, int length)
1039 CChecksum crc(CChecksum::eCRC32ZIP);
1041 for(int ii = 0; ii < length; ii++) {
1042 if (buffer[ii] != '\n')  // '\n' bytes are never fed to the checksum
1043 crc.AddChars(buffer+ii,1);  // one byte at a time
1045 return (crc.GetChecksum() ^ (0xFFFFFFFFL));  // flip the low 32 bits of the checksum
1048 void CBlastSeqUtil::ApplySeqMask(string & seq, const CSeqDB::TSequenceRanges & masks, const TSeqRange r)
1051 ITERATE(CSeqDB::TSequenceRanges, itr, masks) {
1052 transform(&seq[itr->first], &seq[itr->second],
1053 &seq[itr->first], (int (*)(int))::tolower);
1057 const TSeqPos r_from = r.GetFrom();
1058 ITERATE(CSeqDB::TSequenceRanges, itr, masks) {
1059 TSeqRange mask (*itr);
1060 if(mask.GetFrom() > r.GetTo()) {
1063 TSeqRange tmp = r.IntersectionWith(mask);
1065 transform(&seq[tmp.GetFrom() -r_from], &seq[tmp.GetToOpen() - r_from],
1066 &seq[tmp.GetFrom() -r_from], (int (*)(int))::tolower);
// Reverse-complements `seq` over its whole length (start 0, length seq.size()),
// treating it as IUPACna-coded (CSeqUtil::e_Iupacna).
// NOTE(review): the function returns nothing, so the result is presumably
// written back into `seq` by CSeqManip::ReverseComplement - confirm.
1072 void CBlastSeqUtil::GetReverseStrandSeq(string & seq)
1074 CSeqManip::ReverseComplement(seq, CSeqUtil::e_Iupacna, 0, static_cast<ncbi::TSeqPos>(seq.size()));
// Formats a list of masked ranges as text.
// @param masks  Ranges to format; each element's `first` and `second` are printed.
// @return kNoMasksFound when `masks` is empty; otherwise the concatenation of
//         "<first>-<second>" followed by SEPARATOR for every range, in order
//         (the last range is also followed by SEPARATOR).
1077 string CBlastSeqUtil::GetMasksString(const CSeqDB::TSequenceRanges & masks)
1079 if (masks.empty()) {
1080 return kNoMasksFound;
1082 CNcbiOstrstream out;
1083 ITERATE(CSeqDB::TSequenceRanges, range, masks) {
1084 out << range->first << "-" << range->second << SEPARATOR;
1086 return CNcbiOstrstreamToString(out);
void transform(Container &c, UnaryFunction *op)
Checksum and hash calculation classes.
TSeqPos GetLength(void) const
Encapsulates identifier to retrieve data from a BLAST database.
static Uint4 GetSeqHash(const char *buffer, int length)
FASTA-format output; see also ReadFasta in <objtools/readers/fasta.hpp>
Defines invalid user input exceptions.
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string. Sample usage:
@ eFileErr
Files were missing or contents were incorrect.
@ eArgErr
Argument validation failed.
int TPIG
Sequence type accepted and returned for PIG indices.
bool OidToPig(int oid, int &pig) const
Translate an OID to a PIG.
bool PigToOid(int pig, int &oid) const
Translate a PIG to an OID.
void GetSequenceAsString(int oid, CSeqUtil::ECoding coding, string &output, TSeqRange range=TSeqRange()) const
Get a sequence in a given encoding.
int GetSeqLength(int oid) const
Returns the sequence length in base pairs or residues.
CRef< CBioseq > GetBioseq(int oid, TGi target_gi=ZERO_GI, const CSeq_id *target_seq_id=NULL) const
Get a CBioseq for a sequence.
CRef< CBioseq > GetBioseqNoData(int oid, TGi target_gi=ZERO_GI, const CSeq_id *target_seq_id=NULL) const
Get a CBioseq for a sequence without sequence data.
void AccessionToOids(const string &acc, vector< int > &oids) const
Translate an Accession to a list of OIDs.
static void GetTaxInfo(TTaxId taxid, SSeqDBTaxInfo &info)
Get taxonomy information.
CRef< CBlast_def_line_set > GetHdr(int oid) const
Get the ASN.1 header for the sequence.
static CRef< CBlast_def_line_set > ExtractBlastDefline(const CBioseq &bioseq)
Extract a Blast-def-line-set object from a Bioseq retrieved by CSeqDB.
bool GiToOid(TGi gi, int &oid) const
Translate a GI to an OID.
static SIZE_TYPE ReverseComplement(const string &src, TCoding src_coding, TSeqPos pos, TSeqPos length, string &dst)
std::ofstream out("events_result.xml")
main entry point for tests
#define GI_FROM(T, value)
unsigned int TSeqPos
Type for sequence locations and lengths.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
SStrictId_Tax::TId TTaxId
Taxon id type.
TErrCode GetErrCode(void) const
Get error code.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
const string & GetMsg(void) const
Get message string.
#define MSerial_AsnText
I/O stream manipulators.
const string AsFastaString(void) const
static SIZE_TYPE ParseIDs(CBioseq::TId &ids, const CTempString &s, TParseFlags flags=fParse_Default)
Parse a string representing one or more Seq-ids, appending the results to IDS.
void GetLabel(string *label, ELabelType type=eDefault, TLabelFlags flags=fLabel_Default) const
Append a label for this Seq-id to the supplied string.
static int WorstRank(const CRef< CSeq_id > &id)
static int Score(const CRef< CSeq_id > &id)
Wrappers for use with FindBestChoice from <corelib/ncbiutil.hpp>
static int BestRank(const CRef< CSeq_id > &id)
@ fParse_PartialOK
Warn rather than throwing an exception when a FASTA-style ID set contains unparsable portions,...
@ fParse_Default
By default in ParseIDs and IsValid, allow raw parsable non-numeric accessions and plausible local acc...
@ eContent
Untagged human-readable accession or the like.
void SetMask(EMaskType type, CConstRef< CSeq_loc > location)
virtual void Write(const CSeq_entry_Handle &handle, const CSeq_loc *location=0)
Unspecified locations designate complete sequences; non-empty custom titles override the usual title ...
void SetWidth(TSeqPos width)
EMaskType
Which residues to mask out in subsequent output.
virtual void WriteSequence(const CBioseq_Handle &handle, const CSeq_loc *location=0, CSeq_loc::EOpFlags merge_flags=CSeq_loc::fMerge_AbuttingOnly)
void SetFlag(EFlags flag)
void SetAllFlags(TFlags flags)
void ResetFlag(EFlags flag)
@ fKeepGTSigns
don't convert '>' to '_' in title
@ fSuppressRange
never include location details in defline
@ fEnableGI
Use this flag to enable GI output in the defline.
@ eSoftMask
write as lowercase rather than uppercase
CBioseq_Handle AddBioseq(CBioseq &bioseq, TPriority pri=kPriority_Default, EExist action=eExist_Throw)
Add bioseq, return bioseq handle.
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
void Reset(void)
Reset reference object.
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
bool NotEmpty(void) const
static position_type GetPositionMax(void)
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
NCBI_NS_STD::string::size_type SIZE_TYPE
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
static string & ReplaceInPlace(string &src, const string &search, const string &replace, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
static string & ToUpper(string &str)
Convert string to upper case – string& version.
@ fSplit_ByPattern
Require full delimiter strings.
C::value_type FindBestChoice(const C &container, F score_func)
Find the best choice (lowest score) for values in a container.
list< CRef< CSeq_id > > TSeqid
const Tdata & Get(void) const
Get the member data.
list< CRef< CBlast_def_line > > Tdata
TTo GetTo(void) const
Get the To member data.
TFrom GetFrom(void) const
Get the From member data.
void SetTo(TTo value)
Assign a value to To data member.
const TDb & GetDb(void) const
Get the Db member data.
bool IsGeneral(void) const
Check if variant General is selected.
TGi GetGi(void) const
Get the variant data.
bool IsLocal(void) const
Check if variant Local is selected.
const TGeneral & GetGeneral(void) const
Get the variant data.
bool IsGi(void) const
Check if variant Gi is selected.
list< CRef< CSeqdesc > > Tdata
const TId & GetId(void) const
Get the Id member data.
const Tdata & Get(void) const
Get the member data.
bool CanGetDescr(void) const
Check if it is safe to call GetDescr method.
list< CRef< CSeq_id > > TId
void SetDescr(TDescr &value)
Assign a value to Descr data member.
const TDescr & GetDescr(void) const
Get the Descr member data.
@ e_Title
a title for this sequence
unsigned int
A callback function used to compare two keys in a database.
range(_Ty, _Ty) -> range< _Ty >
Useful/utility classes and methods.
bool IsStringId(const CSeq_id &id)
Determine if id is a string id.
List of sequence offset ranges.
string common_name
Common name, such as "noisy night monkey".
string blast_name
A simple category name, such as "birds".
string s_kingdom
A string of length 1 indicating the "Super Kingdom".
string scientific_name
Scientific name, such as "Aotus vociferans".
TTaxId taxid
An identifier for this species or taxonomic group.