85 #include <common/ncbi_revision.h>
88 #ifndef NCBI_SC_VERSION
89 # define FLATFILE_PARSER_ENABLED
90 #elif (NCBI_SC_VERSION == 0)
91 # define FLATFILE_PARSER_ENABLED
94 #ifdef FLATFILE_PARSER_ENABLED
115 CSeq_loc& loc = *visitor;
131 auto& bioseqAnnots = bioseq.
SetAnnot();
132 auto it = find_if(bioseqAnnots.begin(),
136 return (pAnnot && pAnnot->IsFtable());
138 if (it != bioseqAnnots.end()) {
145 void s_AddAnnotsToBioseq(
151 for (
auto pAnnot : annots) {
152 objects::edit::CFeatTableEdit featEdit(*pBioseqAnnot);
153 featEdit.MergeFeatures(pAnnot->SetData().SetFtable());
158 pBioseqAnnot = s_GetBioseqAnnot(bioseq);
161 pBioseqAnnot = annots.front();
162 bioseq.
SetAnnot().push_back(pBioseqAnnot);
163 auto it =
next(annots.begin());
164 while (it != annots.end()) {
165 objects::edit::CFeatTableEdit featEdit(*pBioseqAnnot);
166 featEdit.MergeFeatures((*it)->SetData().SetFtable());
171 for (
auto pAnnot : annots) {
172 objects::edit::CFeatTableEdit featEdit(*pBioseqAnnot);
173 featEdit.MergeFeatures(pAnnot->SetData().SetFtable());
181 CBioseq_set::GetTypeInfo(),
182 CBioseq::GetTypeInfo(),
183 CSeq_entry::GetTypeInfo(),
184 CSeq_submit::GetTypeInfo(),
185 CSeq_annot::GetTypeInfo(),
191 if (content_type ==
"Bioseq-set") {
193 auto& bioseq_set = obj->SetSet();
198 if (content_type ==
"Seq-submit") {
204 if (content_type ==
"Seq-entry") {
210 if (content_type ==
"Bioseq") {
234 if (sType == CBioseq_set::GetTypeInfo()->GetName()) {
237 }
else if (sType == CSeq_submit::GetTypeInfo()->GetName()) {
246 entry = *submit->
SetData().SetEntrys().begin();
247 }
else if (sType == CSeq_entry::GetTypeInfo()->GetName()) {
250 }
else if (sType == CSeq_annot::GetTypeInfo()->GetName()) {
262 }
while (sType == CSeq_annot::GetTypeInfo()->GetName());
289 reader.
SetAllGap(args[
"aln-gapchar"].AsString());
290 reader.
SetMissing(args[
"aln-gapchar"].AsString());
291 if (args[
"aln-alphabet"].AsString() ==
"nuc") {
303 if (pSeqEntry && args[
"a"]) {
314 auto it = s_StringToClass.
find(args[
"a"].AsString());
315 if (it != s_StringToClass.
end()) {
316 pSeqEntry->SetSet().SetClass(it->second);
355 "File format not supported", 0);
365 pReader->SetGapLinkageEvidence(
408 if (! content_info) {
419 auto*
in =
file.m_stream.get();
430 #ifdef FLATFILE_PARSER_ENABLED
439 switch (
file.m_format)
465 mAtSequenceData(
false)
481 const string sType = pObjIstrm->ReadFileHeader();
482 if (sType == CSeq_descr::GetTypeInfo()->GetName()) {
486 out_desc->
Set().insert(out_desc->
Set().end(), descr->
Get().begin(), descr->
Get().end());
487 }
else if (sType == CSeqdesc::GetTypeInfo()->GetName()) {
491 out_desc->
Set().push_back(desc);
492 }
else if (sType == CPubdesc::GetTypeInfo()->GetName()) {
496 out_desc->
Set().push_back(desc);
498 throw runtime_error(
"Descriptor file must contain "
499 "either Seq_descr or Seqdesc elements");
503 throw runtime_error(
"Unable to read descriptor from file:" + ex.
GetMsg());
515 string sType = pObjIstrm->ReadFileHeader();
518 if (sType == CSeq_entry::GetTypeInfo()->GetName()) {
521 }
else if (sType == CBioseq::GetTypeInfo()->GetName()) {
526 }
else if (sType == CSeq_submit::GetTypeInfo()->GetName()) {
531 throw runtime_error(
"Seq-submit template must contain "
532 "exactly one Seq-entry");
534 }
else if (sType == CSubmit_block::GetTypeInfo()->GetName()) {
548 }
else if (sType == CSeqdesc::GetTypeInfo()->GetName()) {
552 "Submit-block. Object seems to be of type: " << sType);
566 if (ent_iter->IsSetDescr()) {
567 descr = &ent_iter->GetDescr();
571 tmp->SetSeq().SetInst();
575 switch ((*desc_iter)->Which()) {
583 desc->
Assign(**desc_iter);
584 tmp->SetSeq().SetDescr().Set().push_back(desc);
590 if (
tmp->IsSetDescr() && !
tmp->GetDescr().Get().empty())
595 if (!pObjIstrm->EndOfData()) {
596 if (sType != CSeqdesc::GetTypeInfo()->GetName())
597 sType = pObjIstrm->ReadFileHeader();
599 while (sType == CSeqdesc::GetTypeInfo()->GetName()) {
617 if (pObjIstrm->EndOfData())
621 sType = pObjIstrm->ReadFileHeader();
640 throw runtime_error(
"The Seq-entry must be a Bioseq not a Bioseq-set.");
652 if (args[
"output-type"].AsString() ==
"Seq-entry") {
661 class AllowedDuplicates :
public set<CSeqdesc_Base::E_Choice>
669 AllowedDuplicates m_allowed_duplicates;
671 template <
typename _which>
673 typename _which::E_Choice compare_to;
674 bool operator()(_which
l)
const
676 return l.Which() == compare_to;
680 return l->Which() == compare_to;
695 bool duplicates = (m_allowed_duplicates.find(
source.Which()) != m_allowed_duplicates.end());
698 desc.Set(duplicates).Assign(
source);
732 bool post_process =
false;
736 "Specified GFF3 file does not include any sequence data", 0);
743 const string& objectType,
744 unique_ptr<istream>& pIstr,
765 if (! pInputObject) {
767 "File format not recognized", 0);
811 istream.
get().open(filename);
816 if (input_sequence.
Empty())
818 "File format not recognized", 0);
821 input_sequence =
xApplyTemplate(input_sequence, merge_template_descriptors);
830 entry = submit->
SetData().SetEntrys().front();
850 CopyDescr(*seq, *entry);
851 CopyAnnot(*seq, *entry);
857 if (merge_template_descriptors) {
862 "Template file descriptors are ignored if input is ASN.1");
864 *unique_ptr<CLineError>(
908 for (
const auto&
msg : readerListener) {
922 for (
auto pFeat :
ftable) {
923 if (pFeat->IsSetDbxref()) {
924 auto& dbxrefs = pFeat->SetDbxref();
925 auto it =
remove_if(dbxrefs.begin(), dbxrefs.end(),
927 return(pDbtag && pDbtag->IsSetDb() &&
928 NStr::EqualNocase(pDbtag->GetDb(),
"GenBank"));
930 dbxrefs.erase(it, dbxrefs.end());
931 if (dbxrefs.empty()) {
932 pFeat->ResetDbxref();
941 unsigned int startingLocusTagNumber = 1;
942 unsigned int startingFeatureId = 1;
943 for (
auto it = annots.begin(); it != annots.end(); ++it) {
947 if (!
data.IsFtable() ||
data.GetFtable().empty()) {
953 edit::CFeatTableEdit fte(
960 "GFF annotation requires locus tags, which are missing from one or more genes, so the command line argument -locus-tag-prefix is needed");
962 fte.GenerateLocusTags();
964 fte.GenerateProteinAndTranscriptIds();
966 fte.ProcessCodonRecognized();
967 fte.EliminateBadQualifiers();
968 fte.SubmitFixProducts();
970 startingLocusTagNumber = fte.PendingLocusTagNumber();
971 startingFeatureId = fte.PendingFeatureId();
1003 "Descriptor file seems to be in an unsupported format: "
1011 unique_ptr<CObjectIStream> pObjIstrm(
1023 auto hugefile = std::make_unique<objects::edit::CHugeFile>();
1024 hugefile->OpenPlain(filename);
1033 if (ext ==
".gff" || ext ==
".gff3")
1035 else if (ext ==
".gtf")
1037 else if (ext ==
".tbl")
1039 else if (ext ==
".asn" || ext ==
".sqn" || ext ==
".sap")
1056 auto*
in = hugefile->m_stream.get();
1060 auto reader5col = std::make_unique<CFast5colReader>();
1069 reader5col->Open(std::move(hugefile));
1070 reader = std::move(reader5col);
1073 auto obj_stream = hugefile->MakeObjStream(0);
1089 #ifdef FLATFILE_PARSER_ENABLED
1094 if (pEntry && pEntry->IsSetAnnot()) {
1095 annots = pEntry->GetAnnot();
1102 "Annotation file format not recognized. Run format validator on your annotation file", 1);
1105 if (!reader.get() && !annots.empty()) {
1106 auto whole_file = std::make_unique<CWholeFileAnnotation>();
1108 whole_file->AddAnnots(annots);
1109 reader = std::move(whole_file);
1120 std::vector<CRef<CSeq_id>> ids(bioseq.
GetId().begin(), bioseq.
GetId().end());
1123 return CSeq_id::Score(l) < CSeq_id::Score(r);
1127 for (
auto pSeqId : ids) {
1130 if (annots.empty()) {
1137 for (
auto pAnnot : annots) {
1144 s_ModifySeqIds(*pAnnot, *pAnnotId, matching_id);
1147 s_AddAnnotsToBioseq(annots, bioseq, pBioseqAnnot);
1172 #ifdef FLATFILE_PARSER_ENABLED
1175 unique_ptr<Parser> pp(
new Parser);
1196 "This flat file format is not supported: " + filename, 0);
1203 auto obj = ffparser.
Parse(*pp, instream);
1204 if (obj.NotEmpty()) {
1205 if (obj->GetThisTypeInfo() == CBioseq_set::GetTypeInfo()) {
1209 auto& annot = entry->SetAnnot();
1210 for (
auto& bioseq : bioseq_set->SetSeq_set()) {
1212 annot.splice(annot.end(), bioseq->
SetAnnot());
1214 if (entry->IsSetAnnot())
1224 auto indexed_annots = std::make_unique<CWholeFileAnnotation>();
1226 indexed_annots->AddAnnots(annots);
1227 reader = std::move(indexed_annots);
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
void g_LogGeneralParsingError(EDiagSev sev, const string &idString, const string &msg, objects::ILineErrorListener &listener)
void remove_if(Container &c, Predicate *__pred)
class CAlnReader supports importing a large variety of text-based alignment formats into standard dat...
void Read(bool guess, bool generate_local_ids=false, objects::ILineErrorListener *pErrorListener=nullptr)
void SetAlphabet(const string &value)
CRef< objects::CSeq_entry > GetSeqEntry(TFastaFlags fasta_flags=objects::CFastaReader::fAddMods, objects::ILineErrorListener *pErrorListener=nullptr)
void SetMissing(const string &value)
void SetAllGap(const string &value)
Convenience function for setting beginning, middle, and end gap to the same thing.
Modification of the CFastaReader class that allows for reading a single sequence as a degenarate mult...
@ fAllIdsAsLocal
= 0x100 (Do not attempt to parse accessions)
@ fLeaveProteinIds
= 0x80 (Leave all protein_id as a qualifiers)
@ fCreateGenesFromCDSs
= 0x10 (If a CDS has a gene xref, create a gene with the same intervals if one doesn't already exist....
@ fPreferGenbankId
= 0x200 (Prefer Genbank accession ids)
CRef< CSerialObject > Parse(Parser &parseInfo)
void ConvertNs2Gaps(CSeq_entry &entry)
bool AtSequenceData() const
void ReadSeqAnnots(TAnnotList &, CNcbiIstream &, ILineErrorListener *=nullptr) override
Read all objects from given insput stream, returning them as a vector of Seq-annots.
shared_ptr< CGff3LocationMerger > GetLocationMerger()
static CLineError * Create(EProblem eProblem, EDiagSev eSeverity, const std::string &strSeqId, unsigned int uLine, const std::string &strFeatureName=string(""), const std::string &strQualifierName=string(""), const std::string &strQualifierValue=string(""), const std::string &strErrorMessage=string(""), const TVecOfLines &vecOfOtherLines=TVecOfLines())
Use this because the constructor is protected.
CFormatGuess::EFormat OpenFile(const string &filename, CRef< CSerialObject > &input_sequence, TAnnots &annots)
void LoadGFF3Fasta(istream &in, TAnnots &annots)
static void GetSeqEntry(CRef< objects::CSeq_entry > &entry, CRef< objects::CSeq_submit > &submit, CRef< CSerialObject > obj)
TAnnots xReadGFF3(CNcbiIstream &instream, bool post_process)
void MergeDescriptors(objects::CSeq_descr &dest, const objects::CSeq_descr &source) const
CMultiReader(CTable2AsnContext &context)
list< CRef< objects::CSeq_annot > > TAnnots
unique_ptr< CObjectIStream > xCreateASNStream(const string &filename) const
void LoadIndexedAnnot(std::unique_ptr< IIndexedFeatureReader > &reader, const string &filename)
void AddAnnots(IIndexedFeatureReader *reader, CBioseq &bioseq) const
CTable2AsnContext & m_context
CRef< CSerialObject > ReadNextEntry()
void xAnnotGetFormat(objects::edit::CHugeFile &file) const
static const set< TTypeInfo > kSupportedTypes
TAnnots xReadGTF(CNcbiIstream &instream) const
shared_ptr< objects::CGff3LocationMerger > m_gff3_merger
CRef< CSerialObject > xReadASN1Binary(CObjectIStream &pObjIstrm, const string &content_type) const
bool AtSeqenceData() const
void WriteObject(const CSerialObject &, ostream &)
void LoadDescriptors(const string &ifname, CRef< objects::CSeq_descr > &out_desc) const
void ApplyDescriptors(objects::CSeq_entry &obj, const objects::CSeq_descr &source) const
CRef< CSerialObject > xApplyTemplate(CRef< CSerialObject > obj, bool merge_template_descriptors) const
CRef< objects::CSeq_entry > xReadFasta(CNcbiIstream &instream)
CRef< CSerialObject > FetchEntry(const CFormatGuess::EFormat &format, const string &objectType, unique_ptr< CNcbiIstream > &pIstr, TAnnots &annots)
CRef< objects::CSeq_entry > ReadAlignment(CNcbiIstream &instream, const CArgs &args)
void LoadTemplate(const string &ifname)
void GetIndexedAnnot(std::unique_ptr< IIndexedFeatureReader > &reader, TAnnots &annots)
void x_PostProcessAnnots(TAnnots &annots) const
CFormatGuess::EFormat xInputGetFormat(CNcbiIstream &, CFileContentInfo *=nullptr) const
unique_ptr< CObjectIStream > m_obj_stream
CRef< objects::CSeq_entry > xReadFlatfile(CFormatGuess::EFormat format, const string &filename, CNcbiIstream &instream)
CRef< CSerialObject > xReadASN1Text(CObjectIStream &pObjIstrm) const
static CRef< CSeq_id > AsSeqId(const string &rawId, long flags=0, bool localInts=true)
Convert a raw ID string to a Seq-id, based in given customization flags.
@ fAllIdsAsLocal
all identifiers are local IDs
@Seq_descr.hpp User-defined methods of the data storage class.
const TAnnot & GetAnnot(void) const
const CSeq_descr & GetDescr(void) const
bool IsSetAnnot(void) const
void ResetParentEntry(void)
void SetDescr(CSeq_descr &value)
bool IsSetDescr(void) const
Base class for all serializable objects.
Simple implementation of ILineReader for i(o)streams.
string m_genome_center_id
bool m_binary_asn1_output
objects::ILineErrorListener * m_logger
TSeqPos m_gap_Unknown_length
objects::CGapsEditor::TEvidenceSet m_DefaultEvidence
CRef< objects::CSeq_entry > m_entry_template
CRef< objects::CSeq_submit > m_submit_template
void MergeWithTemplate(objects::CSeq_entry &entry) const
string m_locus_tag_prefix
objects::CGapsEditor::TCountToEvidenceMap m_GapsizeToEvidence
objects::CBioseq_set::TClass m_ClassValue
void MakeGenomeCenterId(objects::CSeq_entry &entry) const
Template class for iteration on objects of class C.
virtual std::list< CRef< objects::CSeq_annot > > GetAndUseAnnot(CRef< objects::CSeq_id > seqid)=0
static CRef< objects::CSeq_id > GetAnnotId(const objects::CSeq_annot &annot)
const_iterator end() const
const_iterator find(const key_type &key) const
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
@ eTakeOwnership
An object can take ownership of another.
@ eDiag_Info
Informational message.
@ eDiag_Warning
Warning message.
#define NCBI_USER_THROW_FMT(message)
Throw a "user exception" with message processed as output to ostream.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
const string & GetMsg(void) const
Get message string.
#define NCBI_THROW2(exception_class, err_code, message, extra)
Throw exception with extra parameter.
static void SplitPath(const string &path, string *dir=0, string *base=0, string *ext=0)
Split a path string into its basic components.
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
virtual const CTypeInfo * GetThisTypeInfo(void) const =0
ESerialDataFormat
Data file format.
@ eSerial_AsnText
ASN.1 text.
@ eSerial_AsnBinary
ASN.1 binary.
@ fHyphensIgnoreAndWarn
When a hyphen is encountered in seq data, ignore it but warn.
@ fLetterGaps
Parse runs of Ns when splitting data.
@ fIgnoreMods
Ignore mods entirely. Incompatible with fAddMods.
@ fNoUserObjs
Don't save raw deflines in User-objects.
@ fForceType
Force specified type regardless of accession.
@ fParseRawID
Try to identify raw accessions.
@ fNoSplit
Don't split out ambiguous sequence regions.
@ fAssumeNuc
Assume nucs unless accns indicate otherwise.
@ fParseGaps
Make a delta sequence if gaps found.
@ fValidate
Check (alphabetic) residue validity.
@ fDisableParseRange
No ranges in seq-ids. Ranges part of seq-id instead.
@ e_YES
SeqIds compared, but are different.
void Read(const CObjectInfo &object)
Read object of know type.
pair< TObjectPtr, TTypeInfo > ObjectInfo(C &obj)
virtual string ReadFileHeader(void)
Read file header.
static CObjectIStream * Open(ESerialDataFormat format, CNcbiIstream &inStream, bool deleteInStream)
Create serial object reader and attach it to an input stream.
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
TObjectType * GetPointer(void) THROWS_NONE
Get pointer,.
void Reset(void)
Reset reference object.
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
static string & ToLower(string &str)
Convert string to lower case – string& version.
@ eCurrent
Use current time. See also CCurrentTime.
TData & SetData(void)
Assign a value to Data data member.
TSub & SetSub(void)
Select the variant.
TSet & SetSet(void)
Select the variant.
const TSet & GetSet(void) const
Get the variant data.
bool IsSet(void) const
Check if variant Set is selected.
const TSeq_set & GetSeq_set(void) const
Get the Seq_set member data.
TSeq & SetSeq(void)
Select the variant.
TSeq_set & SetSeq_set(void)
Assign a value to Seq_set data member.
@ eClass_pop_set
population study
@ eClass_phy_set
phylogenetic study
@ eClass_mut_set
set of mutations
@ eClass_eco_set
ecological sample study
@ eClass_genbank
converted genbank
@ eClass_small_genome_set
viral segments or mitochondrial minicircles
void SetData(TData &value)
Assign a value to Data data member.
list< CRef< CSeqdesc > > Tdata
TId & SetId(void)
Assign a value to Id data member.
const TUser & GetUser(void) const
Get the variant data.
void SetPub(TPub &value)
Assign a value to Pub data member.
TPub & SetPub(void)
Select the variant.
bool IsSetAnnot(void) const
Check if a value has been assigned to Annot data member.
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
const TId & GetId(void) const
Get the Id member data.
const Tdata & Get(void) const
Get the member data.
void SetInst(TInst &value)
Assign a value to Inst data member.
TUser & SetUser(void)
Select the variant.
Tdata & Set(void)
Assign a value to data member.
bool IsUser(void) const
Check if variant User is selected.
@ eRepr_raw
continuous sequence
@ e_User
user defined object
@ e_Pub
a reference to the publication
@ e_Source
source of materials, includes Org-ref
const TEntrys & GetEntrys(void) const
Get the variant data.
const TData & GetData(void) const
Get the Data member data.
void SetData(TData &value)
Assign a value to Data data member.
static const CS_INT unused
constexpr auto sort(_Init &&init)
const CharType(& source)[N]
NCBI C++ stream class wrappers for triggering between "new" and "old" C++ stream libraries.
std::istream & in(std::istream &in_, double &x_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
static int match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, uint16_t top_bracket, PCRE2_SIZE frame_size, pcre2_match_data *match_data, match_block *mb)
static SLJIT_INLINE sljit_ins l(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
static SLJIT_INLINE sljit_ins msg(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
static SLJIT_INLINE sljit_ins lr(sljit_gpr dst, sljit_gpr src)
static void s_RemoveGenBankDbxrefs(list< CRef< CSeq_feat >> &ftable)
void g_LogDiagMessage(ILineErrorListener *logger, EDiagSev sev, const string &msg)
CFileContentInfoGenbank mInfoGenbank
static CS_CONTEXT * context