57 #include <wx/richmsgdlg.h>
58 #include <wx/clipbrd.h>
59 #include <unordered_set>
70 you can import a gff3 file, an ncbi 5-column feature table file, or a fasta file with protein sequences for your coding regions. click ok to select and load the file.\r\n\
72 if you are importing a protein fasta file, make sure that the sequence ids for the protein sequences match the nucleotide sequences on which the coding regions should be created.\r\n\
74 features tables for influenza a and b sequences can be created here:\r\n\
75 https://www.ncbi.nlm.nih.gov/genomes/flu/database/annotation.cgi \r\n\
77 an ncbi 5-column feature table must be a plain text file. \r\n\
79 - the header line begins with >feature lcl| \r\n\
81 - the text following \"lcl|\" must contain the sequence id of the sequences in your records.\r\n\
82 for example: >feature lcl|abc-1\r\n\
83 in this example, abc-1 is the sequence id. \r\n\
85 - the table is composed of 5, tab-separated columns:\r\n\
86 column 1- nucleotide position of the start of the feature\r\n\
87 column 2- nucleotide location of the end of a feature\r\n\
88 column 3- feature type (gene, cds, etc.)\r\n\
89 column 4- feature qualifier (note, product, etc.)\r\n\
90 column 5- qualifier value (for example: gag protein)\r\n\
92 - the columns in the table must be separated by tabs. \r\n\
93 use the tab key on your keyboard to separate each column.\r\n\
94 the qualifiers follow on lines starting with three tabs. \r\n\
96 - for more feature table format information: \r\n\
97 https://www.ncbi.nlm.nih.gov/sequin/table.html#table layout \r\n\
99 - questions about the feature table format? write to: info@ncbi.nlm.nih.gov\r\n\
100 --------------------------------------------------\r\n\
101 example feature table for 2 sequences:\r\n\
102 --------------------------------------------------\r\n\
103 >feature lcl|abc-1\r\n\
107 product polymerase pb2\r\n\
108 >feature lcl|abc-2\r\n\
113 product matrix protein 2\r\n\
117 product matrix protein 1\r\n\
122 static bool showDialog(
true);
126 wxRichMessageDialog dlg(
NULL,
wxT(
"Feature table instructions"), wxEmptyString, wxOK|wxCENTER);
128 dlg.ShowCheckBox(
"Don't show this dialog in the current session");
131 if (dlg.IsCheckBoxChecked()) {
143 report->SetTitle(
wxT(
"Import Feature Table Report"));
160 report->SetTitle(
wxT(
"Import Feature Table Report"));
172 unsigned int &startingLocusTagNumber,
unsigned int &startingFeatureId,
const string &locus_tag,
bool euk)
175 startingLocusTagNumber, startingFeatureId, locus_tag, euk);
215 vector<string> format_ids;
216 if (format_id.empty())
218 format_ids.push_back(
"file_loader_gff");
219 format_ids.push_back(
"file_loader_5col");
223 format_ids.push_back(format_id);
225 fileManager->LoadFormats(format_ids);
226 fileManager->SetWorkDir(workDir);
228 vector<CIRef<IOpenObjectsPanelClient> > loadManagers;
232 dlg.SetSize(710, 480);
236 if (format_id ==
"file_loader_5col")
237 dlg.
SetHelpUrl(
_(
"https://www.ncbi.nlm.nih.gov/tools/gbench/manual11/#5-column-feature-table"));
238 if (format_id ==
"file_loader_gff")
239 dlg.
SetHelpUrl(
_(
"https://www.ncbi.nlm.nih.gov/tools/gbench/manual11/#gff3-file"));
241 if (dlg.ShowModal() != wxID_OK)
245 if (!object_loader) {
246 wxMessageBox(
wxT(
"Failed to get object loader"),
wxT(
"Error"), wxOK | wxICON_ERROR);
286 if (wxTheClipboard->Open())
288 wxTextDataObject
data;
289 if (wxTheClipboard->IsSupported( wxDF_UNICODETEXT ))
290 wxTheClipboard->GetData(
data );
291 if (
data.GetText().length() == 0) {
292 wxMessageBox(
"No data in clipboard for table");
293 wxTheClipboard->Close();
299 fname =
f.GetFileName();
300 wxTheClipboard->Close();
303 vector<wxString> fnames;
308 wxMessageBox(
wxT(
"Failed to get object loader"),
wxT(
"Error"), wxOK | wxICON_ERROR);
316 if (!execute_unit->PreExecute())
322 if (!execute_unit->PostExecute())
325 CFile tmp_file(fname);
346 unordered_set<string> product_ids;
352 for (
auto& feat_it : ftbl) {
353 if (feat_it->IsSetProduct() &&
354 feat_it->IsSetData() &&
356 if (
const CSeq_id*
id = feat_it->GetProduct().GetId()) {
357 auto id_string =
id->GetSeqIdString(
true);
358 auto res = product_ids.insert(id_string);
377 unordered_set<string> product_ids;
382 for (
auto& feat_it : ftbl) {
383 if (feat_it->IsSetProduct() && feat_it->IsSetData()) {
384 const CSeq_id*
id = feat_it->GetProduct().GetId();
386 if (
id && feat_it->GetData().IsCdregion()) {
387 product_ids.insert(id_string);
395 vector<string> dupl_product_ids;
397 for (
auto&& id_it : bseq->GetCompleteBioseq()->GetId()) {
398 const string id_string = id_it->GetSeqIdString(
true);
399 if (product_ids.find(id_string) != product_ids.end()) {
400 dupl_product_ids.push_back(id_string);
405 for (
auto&& it : dupl_product_ids) {
408 msg +=
"CDS feature with product id " + it +
" already exists in your data";
431 if (ftbl.size() == 1)
434 auto feat_it = ftbl.begin();
435 while (feat_it != ftbl.end()) {
436 auto feature = *feat_it;
437 if (feature->IsSetData() && feature->GetData().IsCdregion() && !
s_IsPseudo(*feature)) {
438 auto it_cds = feat_it;
441 while (feat_it != ftbl.end() &&
442 !((*feat_it)->IsSetData() && (*feat_it)->GetData().IsCdregion() && !
s_IsPseudo(**feat_it))) {
445 auto next_cds = feat_it;
446 if (next_cds != ftbl.end()) {
447 const string& this_pid = (*it_cds)->GetNamedQual(
"protein_id");
448 const string& next_pid = (*next_cds)->GetNamedQual(
"protein_id");
449 bool has_pid = !this_pid.empty();
450 bool next_has_pid = !next_pid.empty();
452 bool has_local_pid =
false;
453 bool next_has_local_pid =
false;
454 if (!has_pid != !next_has_pid) {
462 if (!prot_ids.empty() && prot_ids.front()->IsLocal()) {
463 has_local_pid =
true;
474 if (!prot_ids.empty() && prot_ids.front()->IsLocal()) {
475 next_has_local_pid =
true;
478 if (!has_local_pid != !next_has_local_pid) {
479 msg =
"Some coding regions have a protein_id qualifier and others do not. "
480 "Protein id qualifiers are used to generate protein sequence ids and they should "
481 "be consistently present or absent in the feature table";
503 const CObject& ptr = obj_it->GetObject();
524 if (!non_matched_ftbl_ids.
empty()) {
526 if (dlg.ShowModal() == wxID_OK) {
534 ftbl_seqid_map[it->first] = it->second;
537 unsigned int startingLocusTagNumber = 1;
538 unsigned int startingFeatureId = 1;
545 const CObject& ptr = obj_it->GetObject();
558 if (!locus_tag.empty())
562 if (!locus_tag.empty())
572 startingLocusTagNumber = tail;
578 ERR_POST(
Error <<
"Invalid locus tag: Only one \"_\", and suffix must be numeric");
589 if (dlg.ShowModal() == wxID_OK)
593 if (locus_tag.empty())
594 startingLocusTagNumber = 1;
607 const CObject& ptr = obj_it->GetObject();
647 unsigned int &startingLocusTagNumber,
648 unsigned int &startingFeatureId,
649 const string &locus_tag,
657 annot->
Assign(orig_annot);
660 xPostProcessAnnot(*annot, startingLocusTagNumber, startingFeatureId, locus_tag, euk);
662 vector<CRef<CSeq_feat>> imported_cds;
670 if (!bsh && it == ftbl_seqid_map.
end()) {
672 m_Error.assign(
"Feature table identifiers do not match record\n");
683 if (!bsh && it != ftbl_seqid_map.
end()) {
685 bsh =
m_Seh.GetScope().GetBioseqHandle(it->second.GetSeqId().GetObject());
693 imported_cds.push_back(new_feat);
702 for (
auto new_feat : imported_cds) {
708 result->AddCommand(*cds_cmd);
718 edit::CFeatTableEdit fte(annot, locus_tag, startingLocusTagNumber, startingFeatureId);
720 fte.GenerateMissingParentFeatures(euk);
721 fte.GenerateLocusTags();
722 fte.GenerateProteinAndTranscriptIds();
724 fte.InstantiateProductsNames();
725 fte.EliminateBadQualifiers();
726 fte.SubmitFixProducts();
728 startingLocusTagNumber = fte.PendingLocusTagNumber();
729 startingFeatureId = fte.PendingFeatureId();
808 if (prot_str.empty())
812 prot_str = prot_str.substr(0, prot_str.length() - 1);
818 protein->
SetInst().SetSeq_data().SetIupacaa().Set(prot_str);
820 protein->
SetInst().SetSeq_data().SetNcbieaa().Set(prot_str);
829 for (
auto& it : prot_ids) {
830 protein->
SetId().push_back(it);
834 const string& prot_id_qual = feat->
GetNamedQual(
"protein_id");
835 if (!prot_id_qual.empty()) {
836 for (
auto& it : prot_ids) {
837 protein->
SetId().push_back(it);
842 feat->
SetProduct().SetWhole().Assign(*best_id);
877 feat->
SetProduct().SetWhole().Assign(*prot_id);
879 bool has_protid =
false;
880 for (
auto& it : prot_ids) {
883 protein->
SetId().push_back(it);
884 has_protid |= (prot_id->
Match(*it));
888 protein->
SetId().push_back(prot_id);
909 prot_ref->
Assign(*orig_ref);
910 prot->SetData().SetProt(*prot_ref);
930 if (
id.GetLocal().IsStr()) {
941 }
else if (
id.GetLocal().IsId()) {
User-defined methods of the data storage class.
@ eExtreme_Positional
numerical value
@ eExtreme_Biological
5' and 3'
bool GUI_AsyncExecUnit(IExecuteUnit &exec_unit, const wxString &msg)
CRef< objects::CSeq_id > GetNewLocalProtId(const string &id_base, CScope &scope, int &offset)
CRef< CGenetic_code > GetGeneticCodeForBioseq(CBioseq_Handle bh)
GetGeneticCodeForBioseq A function to construct the appropriate CGenetic_code object to use when cons...
static bool ParseCodeBreaks(CSeq_feat &feat, CScope &scope)
Parses all valid transl_except Gb-quals into code-breaks for cdregion, then removes the transl_except...
void AddCommand(IEditCommand &command)
static CIRef< IEditCommand > Create(const objects::CSeq_entry_Handle &seh, const vector< string > &quals)
virtual void SetRegistryPath(const string &path)
void SetText(const wxString &text)
map< string, int > m_OffsetForId
CIRef< IEditCommand > x_CreateCommand(const IObjectLoader::TObjects &objects)
CIRef< IEditCommand > x_DoImportCDS(CRef< objects::CSeq_feat > feat)
objects::CBioseq_Handle x_FindLocalBioseq_Handle(const objects::CSeq_feat &feat, const objects::CTSE_Handle &tseh) const
CIRef< IEditCommand > ImportFeaturesFromClipboard(const wxString &workDir)
string x_CheckCollidingIds(const IObjectLoader::TObjects &objects)
void xPostProcessAnnot(objects::CSeq_annot &annot, unsigned int &startingLocusTagNumber, unsigned int &startingFeatureId, const string &locus_tag, bool euk)
objects::CSeq_id_Handle x_GetFixedId(const objects::CSeq_id &id, const objects::CTSE_Handle &tseh) const
CIRef< IEditCommand > x_DoImportFeaturesFromFile(const wxString &workDir, const string &format_id=kEmptyStr)
CIRef< IEditCommand > x_DoImportFeaturesFromClipboard(const wxString &workDir)
CIRef< IEditCommand > ImportFeaturesFromFile(const wxString &workDir=wxEmptyString, const string &format_id=kEmptyStr)
static void ShowTableInfo()
CIRef< IEditCommand > x_GatherAdditionalChanges(CIRef< IEditCommand > &start_cmd)
CIRef< IEditCommand > AddSeqAnnotToSeqEntry(const objects::CSeq_annot &orig_annot, TFeatSeqIDMap &ftbl_seqid_map, unsigned int &startingLocusTagNumber, unsigned int &startingFeatureId, const string &locus_tag, bool euk)
objects::CSeq_entry_Handle m_Seh
bool x_ContainsDuplicateIds(const IObjectLoader::TObjects &objects)
returns true if the feature table contains at least one pair of duplicate protein/transcript ids
string x_CheckConsistentProteinIds(const IObjectLoader::TObjects &objects)
CIRef< IEditCommand > TestImport(const objects::CSeq_annot &orig_annot, TFeatSeqIDMap &ftbl_seqid_map, unsigned int &startingLocusTagNumber, unsigned int &startingFeatureId, const string &locus_tag, bool euk)
void GetFtableIDToSeqIDMap(TFeatSeqIDMap &ftbl_seqid_map)
IObjectLoader * GetObjectLoader()
void SetManagers(vector< CIRef< IOpenObjectsPanelClient > > &managers)
void SetWorkDir(const wxString &workDir)
ESubtype GetSubtype(void) const
static CTempString SubtypeValueToName(ESubtype eSubtype)
Turns a ESubtype into its string value which is NOT necessarily related to the identifier of the enum...
SeqVector related exceptions.
bool IsFtable(void) const
namespace ncbi::objects::
const CProt_ref * GetProtXref(void) const
get protein (if present) from Seq-feat.xref list
const string & GetNamedQual(const CTempString &qual_name) const
Return a named qualifier.
void RemoveQualifier(const string &qual_name)
Remove all qualifiers with the given name; do nothing if no such qualifier exists.
const CGene_ref * GetGeneXref(void) const
See related function in util/feature.hpp.
CBioseq_Handle GetBioseqHandle(const CSeq_id &id) const
Get Bioseq handle from this TSE.
Template class for iteration on objects of class C.
void SetHelpUrl(const wxString &url)
virtual bool PreExecute()=0
virtual bool PostExecute()=0
vector< SObject > TObjects
container_type::const_iterator const_iterator
const_iterator end() const
const_iterator find(const key_type &key) const
iterator_bool insert(const value_type &val)
void SetMolinfoForProtein(CRef< objects::CSeq_entry > protein, bool partial5, bool partial3)
CRef< objects::CSeq_feat > AddProteinFeatureToProtein(CRef< objects::CSeq_entry > protein, bool partial5, bool partial3)
static void cleanup(void)
unsigned int TSeqPos
Type for sequence locations and lengths.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
void Error(CExceptionArgs_Base &args)
const string & GetMsg(void) const
Get message string.
void Info(CExceptionArgs_Base &args)
virtual bool Remove(TRemoveFlags flags=eRecursive) const
Remove a directory entry.
@ eIfExists_ReturnCurrent
Return reference to current stream, create new one if it does not exist yet.
@ eNoRemove
Do not remove file.
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
virtual bool Equals(const CSerialObject &object, ESerialRecursionMode how=eRecursive) const
Check if both objects contain the same values.
const string AsFastaString(void) const
string GetSeqIdString(bool with_version=false) const
Return seqid string with optional version for text seqid type.
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
void GetLabel(string *label, ELabelType type=eDefault, TLabelFlags flags=fLabel_Default) const
Append a label for this Seq-id to the supplied string.
CConstRef< CSeq_id > GetSeqId(void) const
static SIZE_TYPE ParseFastaIds(CBioseq::TId &ids, const CTempString &s, bool allow_partial_failure=false)
Parse an entire set of |-delimited FASTA-style IDs, appending the results to IDS.
bool Match(const CSeq_id &sid2) const
Match() - TRUE if SeqIds are equivalent.
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
static int BestRank(const CRef< CSeq_id > &id)
bool IsBetter(const CSeq_id_Handle &h) const
True if "this" is a better bioseq than "h".
@ eContent
Untagged human-readable accession or the like.
bool IsPartialStart(ESeqLocExtremes ext) const
check start or stop of location for e_Lim fuzz
void SetId(CSeq_id &id)
set the 'id' field in all parts of this location
const CSeq_id * GetId(void) const
Get the id of the location return NULL if has multiple ids or no id at all.
bool IsPartialStop(ESeqLocExtremes ext) const
TSeqPos GetStop(ESeqLocExtremes ext) const
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
bool IsPseudo(const CSeq_feat &feat, CScope &scope)
Determines whether given feature is pseudo, using gene associated with feature if necessary. Checks to...
static void Translate(const string &seq, string &prot, const CGenetic_code *code, bool include_stop=true, bool remove_trailing_X=false, bool *alt_start=NULL, bool is_5prime_complete=true, bool is_3prime_complete=true)
Translate a string using a specified genetic code.
vector< CSeq_id_Handle > TId
TSeqPos GetBioseqLength(void) const
CSeq_entry_Handle GetSeq_entry_Handle(void) const
Get parent Seq-entry handle.
CScope & GetScope(void) const
Get scope this handle belongs to.
const TId & GetId(void) const
TObjectType * GetPointer(void) THROWS_NONE
Get pointer.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
static int StringToNonNegativeInt(const CTempString str, TStringToNumFlags flags=0)
Convert string to non-negative integer value.
static bool EndsWith(const CTempString str, const CTempString end, ECase use_case=eCase)
Check if a string ends with a specified suffix value.
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
static bool SplitInTwo(const CTempString str, const CTempString delim, string &str1, string &str2, TSplitFlags flags=0)
Split a string into two pieces using the specified delimiters.
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
@ eCase
Case sensitive compare.
double Elapsed(void) const
Return time elapsed since first Start() or last Restart() call (in seconds).
void Start(void)
Start the timer.
C::value_type FindBestChoice(const C &container, F score_func)
Find the best choice (lowest score) for values in a container.
bool IsSetLocus_tag(void) const
Systematic gene name (e.g., MI0001, ORF0069). Check if a value has been assigned to Locus_tag data mem...
const TLocus_tag & GetLocus_tag(void) const
Get the Locus_tag member data.
TStr & SetStr(void)
Select the variant.
TId & SetId(void)
Select the variant.
bool IsSetData(void) const
The specific data. Check if a value has been assigned to Data data member.
bool IsSetCode(void) const
Genetic code used. Check if a value has been assigned to Code data member.
bool IsCdregion(void) const
Check if variant Cdregion is selected.
void SetProduct(TProduct &value)
Assign a value to Product data member.
void SetCode(TCode &value)
Assign a value to Code data member.
const TLocation & GetLocation(void) const
Get the Location member data.
bool IsGene(void) const
Check if variant Gene is selected.
const TData & GetData(void) const
Get the Data member data.
void SetData(TData &value)
Assign a value to Data data member.
TPseudo GetPseudo(void) const
Get the Pseudo member data.
const TProduct & GetProduct(void) const
Get the Product member data.
bool IsSetPseudo(void) const
Annotated on pseudogene? Checks if a value has been assigned to Pseudo data member.
const TGene & GetGene(void) const
Get the variant data.
bool IsSetProduct(void) const
Product of process. Check if a value has been assigned to Product data member.
void SetFrame(TFrame value)
Assign a value to Frame data member.
bool IsSetFrame(void) const
Check if a value has been assigned to Frame data member.
bool IsGenbank(void) const
Check if variant Genbank is selected.
const TWhole & GetWhole(void) const
Get the variant data.
bool IsGeneral(void) const
Check if variant General is selected.
TLocal & SetLocal(void)
Select the variant.
bool IsLocal(void) const
Check if variant Local is selected.
bool IsWhole(void) const
Check if variant Whole is selected.
TSeq & SetSeq(void)
Select the variant.
TId & SetId(void)
Assign a value to Id data member.
list< CRef< CSeq_id > > TId
void SetInst(TInst &value)
Assign a value to Inst data member.
const TFtable & GetFtable(void) const
Get the variant data.
list< CRef< CSeq_feat > > TFtable
const TData & GetData(void) const
Get the Data member data.
@ eRepr_raw
continuous sequence
static string s_GetIdBase(const CSeq_id &id)
static void s_RemapFeatureSeqIds(CSeq_feat &feat, const CSeq_id &set_id)
const wxString kfeaturetableinstructions
static bool s_IsPseudo(const CSeq_feat &feat)
static SLJIT_INLINE sljit_ins msg(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
wxString ToWxString(const string &s)