88 #define NCBI_USE_ERRCODE_X Objtools_Rd_RepMask
106 if (family.empty()) {
265 if (it ==
m_Map.
end())
return false;
274 vector<string> tokens;
276 while (! stream.eof()) {
300 if (line.length() < 6 || line.substr(2, 3) !=
" ")
continue;
301 string code(line.substr(0, 2));
302 string value(line.substr(5));
317 string bp(
value.substr(
value.rfind(
';') + 1));
320 }
else if (
code ==
"DE") {
325 }
else if (
code ==
"CC") {
356 pair<TSpecificity2Taxid::iterator, bool> i_specificity =
358 if (i_specificity.second) {
360 if (! i_specificity.first->second) {
362 <<
"RepeatMasker library species failed lookup to taxonomy: "
377 const string& name)
const
382 template <
typename T>
384 const string& qual,
const T val)
390 qual_list.push_back(
result);
411 if (! family.empty())
val += family;
415 if (! family.empty())
s_SetQual(qual_list,
"rpt_family", family);
427 s_SetQual(qual_list,
"rpt_family", klass);
443 if (! family.empty())
s_SetQual(qual_list,
"rpt_family", family);
476 m_Ids.Reset(&generator);
492 imp.
SetKey(
"repeat_region");
501 ref->SetId().Assign(*id_it->second);
502 feat->
SetXref().push_back(ref);
511 bool standardized(
false);
523 if (! standardized) {
542 bool include_specificity_name(
false);
546 include_specificity_name = ! specificity_name.empty();
552 tag->SetTag().SetId(specificity);
559 include_specificity_name=
false;
578 include_rpt_left =
false;
593 if (include_rpt_left) {
606 s_SetQual(qual_list,
"overlapped",
true);
615 s_SetQual(qual_list,
"rpt_length", rpt_length);
618 if (include_specificity_name) {
628 uo->
SetType().SetStr(
"RepeatMasker");
635 if (include_rpt_left) {
644 uo->
AddField(
"query_length",
static_cast<int>(
657 uo->
AddField(
"rpt_length",
static_cast<int>(rpt_length));
660 if (include_specificity_name) {
669 if (qual_list.empty()) feat->
ResetQual();
674 tag->SetDb(
"REPBASE");
689 const char eq(
'='), sep(
' ');
691 comment <<
"source=RepeatMasker";
707 <<
"query_range" <<
eq;
709 if (reverse) comment <<
"complement(";
712 if (reverse) comment <<
")";
742 : m_SeqIdResolver(&seqid_resolver)
743 , m_ToFeat(
flags, lib, ids)
783 size_t record_counter = 0;
785 while ( !
lr.AtEOF() ) {
802 "RepeatMasker Reader: Parse error in record = " + line) );
812 "RepeatMasker Reader: Verification error in record = " + line) );
823 "RepeatMasker Reader: Aborting file import, "
824 "unable to create feature table for record = " + line) );
842 string labels_1st_line[] = {
"perc",
"query",
"position",
"matching",
"" };
843 string labels_2nd_line[] = {
"score",
"div.",
"del.",
"ins.",
"sequence",
"" };
846 size_t current_offset = 0;
848 for ( ; labels_1st_line[
i] !=
""; ++
i ) {
849 current_offset =
NStr::FindCase( line, labels_1st_line[
i], current_offset );
850 if (
NPOS == current_offset ) {
854 if ( labels_1st_line[
i] ==
"" ) {
861 for ( ; labels_2nd_line[
i] !=
""; ++
i ) {
862 current_offset =
NStr::FindCase( line, labels_2nd_line[
i], current_offset );
863 if (
NPOS == current_offset ) {
873 if (
NStr::StartsWith(line,
"There were no repetitive sequences detected in "))
885 if (e > 0 && s[
b] ==
'(') {
887 if (s[e - 1] ==
')') --e;
891 s = s.substr(
b, e -
b);
897 const size_t MIN_VALUE_COUNT = 15;
900 list< string > values;
907 list<string>::iterator it = values.begin();
927 if (!
id)
return false;
935 if (pos_begin == 0)
return false;
941 if (pos_end == 0 || pos_end < pos_begin)
return false;
960 string class_family = *it;
966 string field12 = *it;
974 string field14 = *it;
995 mask_data.
overlapped = (it != values.end() && (*it) ==
"*");
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
@ eExtreme_Positional
numerical value
User-defined methods of the data storage class.
User-defined methods of the data storage class.
Default implementation of a Seq-id resolver, which knows about FASTA-formatted sequence identifiers.
CSeq_id_Handle ResolveSeqId(const string &id) const
Returns a normalized representation of a sequence identifier, as Seq-id handle.
@Gb_qual.hpp User-defined methods of the data storage class.
*** Import *********************************************** * * Features imported from other databases...
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
static CObjReaderLineException * Create(EDiagSev eSeverity, unsigned int uLine, const std::string &strMessage, EProblem eProblem=eProblem_GeneralParsingError, const std::string &strSeqId=string(""), const std::string &strFeatureName=string(""), const std::string &strQualifierName=string(""), const std::string &strQualifierValue=string(""), CObjReaderLineException::EErrCode eErrCode=eFormat, const TVecOfLines &vecOfOtherLines=TVecOfLines())
Please use this instead of the constructor because the ctor is protected.
Default implementation for a generator of identifiers, as integers, marshalled as CFeat_id objects.
void ProcessError(CObjReaderLineException &, ILineErrorListener *)
virtual void xAddConversionInfo(CSeq_annot &, ILineErrorListener *)
Class acting as an interface to a RepeatMasker library.
TSpecificity2Taxid m_Specificity2TaxId
CConstIRef< ITaxonomyResolver > m_Taxonomy
bool TestSpecificityMatchesName(TRepeat::TTaxId taxid, const string &name) const
Check if a given taxid's scientific name matches the original specificity string.
void Read(CNcbiIstream &stream)
Reads a library from the RepeatMaskerLib.embl-style input.
bool Get(const string &name, TRepeat &dest) const
Gets information about a given repeat, specified by name.
Implements a concrete class for reading RepeatMasker output from tabular form and rendering it as ASN...
void SetSeqIdResolver(ISeqIdResolver &seqid_resolver)
Use specified delegate for Seq-id resolution.
virtual CRef< CSerialObject > ReadObject(CNcbiIstream &istr, ILineErrorListener *pErrors=nullptr)
Read an object from a given input stream, render it as the most appropriate Genbank object.
virtual bool VerifyData(const SRepeatRegion &mask_data)
CRepeatMaskerReader(TFlags flags=fDefaults, CConstRef< TRepeatLibrary > lib=null, const ISeqIdResolver &seqid_resolver= *(CConstIRef< ISeqIdResolver >(new CFastaIdsResolver)), TIdGenerator &ids= *(CIRef< TIdGenerator >(new COrdinalFeatIdGenerator)))
Implement CReaderBase.
virtual bool ParseRecord(const string &record, SRepeatRegion &mask_data)
virtual bool IsHeaderLine(const string &line)
TConverter & SetConverter()
Delegate for conversion from IRepeatRegion to ASN.1.
virtual bool IsIgnoredLine(const string &line)
void ResetSeqIdResolver()
Use default Seq-id resolution.
virtual CRef< CSeq_annot > ReadSeqAnnot(CNcbiIstream &istr, ILineErrorListener *pErrors=nullptr)
Read an object from a given input stream, render it as a single Seq-annot.
CConstIRef< ISeqIdResolver > m_SeqIdResolver
virtual ~CRepeatMaskerReader(void)
Class which, given an input IRepeatRegion, can generate an appropriate and normalized NCBI ASN....
void ResetRepeatLibrary()
Clear out any repeat library which may be used to add additional attributes to repeats.
CRepeatToFeat(TFlags flags=fDefaults, CConstRef< TRepeatLibrary > lib=null, TIdGenerator &ids= *(CIRef< TIdGenerator >(new COrdinalFeatIdGenerator)))
void SetRepeatLibrary(const TRepeatLibrary &lib)
Set a repeat library which may be used to add additional attributes to repeats.
CRef< CSeq_feat > operator()(const IRepeatRegion &repeat)
Transforms the input repeat into a repeat feature.
void ResetIdGenerator()
Reset the Feature-id generator, to use a default implementation which will generate unique integer lo...
void AssertReferencesResolved()
Asserts that all forward/backward references between any objects visited have now been resolved.
CIRef< TIdGenerator > m_Ids
void SetIdGenerator(TIdGenerator &generator)
Set the Feature-id generator which will be used to assign unique feature IDs.
CConstRef< TRepeatLibrary > m_Library
Deprecated, old API for loading RepeatMasker output.
void Read(CRef< CSeq_annot > annot, TFlags flags=fDefaults, size_t errors=kMax_UInt)
static CRmReader * OpenReader(CNcbiIstream &istr)
CRmReader(CNcbiIstream &istr)
static void CloseReader(CRmReader *reader)
namespace ncbi::objects::
CUser_object & AddField(const string &label, const string &value, EParseField parse=eParse_String)
add a data field to the user object that holds a given value
Abstract base class for lightweight line-by-line reading.
virtual TSeqPos GetRptPosBegin() const =0
virtual TPercent GetPercIns() const =0
virtual TRptId GetRptId() const =0
string GetRptClassFamily() const
Convenience function to get the class and family as one value, the way that RepeatMasker emits them.
virtual bool IsOverlapped() const =0
Flag that there is a higher-scoring match whose domain partly (<80%) includes the domain of this matc...
virtual TPercent GetPercDiv() const =0
virtual TScore GetSwScore() const =0
virtual TSeqPos GetRptPosEnd() const =0
virtual TSeqPos GetRptLeft() const =0
virtual TSeqPos GetSeqLeft() const =0
virtual TPercent GetPercDel() const =0
Interface defining a read-only RepeatMasker repeat feature.
virtual bool IsReverseStrand() const
Convenience function that gets the strand on the sequence, without dealing with a Seq-loc.
virtual string GetSeqIdString() const
Gets the sequence from the location of the repeat, without dealing with a Seq-loc.
virtual TSeqPos GetSeqPosBegin() const
Convenience function that gets the position start on the sequence, without dealing with a Seq-loc.
virtual TSeqPos GetSeqPosEnd() const
Convenience function that gets the position end on the sequence, without dealing with a Seq-loc.
virtual CConstRef< CSeq_loc > GetLocation(void) const =0
Gets the location of this repeat.
virtual string GetRptFamily() const =0
Gets repeat family, or empty string if not known.
virtual string GetRptClass() const =0
Gets repeat class, or empty string if not known.
virtual string GetRptName() const =0
Gets repeat name.
ITaxonomyResolver::TTaxId TTaxId
@ fIncludeRepeatId
Store original RepeatMasker repeat_id.
@ fIncludeRepeatSpecificity
Store the specificity from the RepeatMasker library, if provided.
@ fIncludeRepeatPos
Store the repeat position, that is, the interval on the repeat sequence.
@ fRemoveRedundancy
Removes redundant fields.
@ fAllowNonstandardQualifiers
Avoid user objects and instead, put selected information in non-standard and invalid GenBank qualifie...
@ fStandardizeNomenclature
Translate RepeatMasker output to INSDC standard nomenclature for repeats.
@ fIncludeCoreStatistics
Store core statistics, which include the scores of sw_score, perc_div, perc_del, perc_ins,...
@ fIncludeRepeatRepbaseId
Store the RepbaseID from the RepeatMasker library, if provided.
@ fIncludeRepeatLength
Store the repeat length as reported in the library.
@ fSetComment
Selected attributes beyond what is stored in GenBank standard qualifiers will be included as comments...
@ fIncludeExtraStatistics
Store extra statistics, which includes the length of the query (or query_left, equivalently),...
Interface for resolving a sequence identifier given a textual representation.
Structure implementing the IRepeatRegion API as a simple store of data members.
string GetRptClass() const
Gets repeat class, or empty string if not known.
TPercent GetPercDiv() const
TPercent GetPercDel() const
string GetSeqIdString() const
Overridden version returns the original unparsed sequence identifier, if it was set (non-empty).
bool IsOverlapped() const
Flag that there is a higher-scoring match whose domain partly (<80%) includes the domain of this matc...
TPercent GetPercIns() const
TSeqPos GetSeqLeft() const
string GetRptName() const
Gets repeat name.
string GetRptFamily() const
Gets repeat family, or empty string if not known.
TSeqPos GetRptLeft() const
TScore GetSwScore() const
TTaxId GetRptSpecificity() const
Returns 0, not known.
TSeqPos GetRptPosEnd() const
TSeqPos GetRptLength() const
Gets repeat length, or kInvalidSeqPos if not known.
string GetRptRepbaseId() const
Returns an empty string, not known.
TSeqPos GetRptPosBegin() const
CRef< CSeq_loc > query_location
string GetRptSpecificityName() const
Returns an empty string, not known.
CConstRef< CSeq_loc > GetLocation(void) const
Gets the location of this repeat.
CConstRef< CFeat_id > GetId() const
Gets the more general feature ID for this repeat, which identifies a single repeat,...
Implementation of IRepeat backed by a simple structure.
string GetRptRepbaseId() const
Gets the RepbaseID, or empty string if not known.
string m_RptSpecificityName
string GetRptSpecificityName() const
Gets specificity as a name, or empty string if not known.
TSeqPos GetRptLength() const
Gets repeat length, or kInvalidSeqPos if not known.
TTaxId GetRptSpecificity() const
Gets specificity as a taxonomy ID, or 0 if not known.
container_type::const_iterator const_iterator
container_type::iterator iterator
const_iterator end() const
iterator_bool insert(const value_type &val)
container_type::value_type value_type
const_iterator find(const key_type &key) const
Include a standard set of the NCBI C++ Toolkit most basic headers.
static const char location[]
unsigned int TSeqPos
Type for sequence locations and lengths.
const TSeqPos kInvalidSeqPos
Define special value for invalid sequence position.
#define DIAG_COMPILE_INFO
Make compile time diagnostic information object to use in CNcbiDiag and CException.
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
@ eDiag_Error
Error message.
void Warning(CExceptionArgs_Base &args)
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
@ eShallow
Assign/Compare pointers only.
const string AsFastaString(void) const
CConstRef< CSeq_id > GetSeqIdOrNull(void) const
static SIZE_TYPE ParseFastaIds(CBioseq::TId &ids, const CTempString &s, bool allow_partial_failure=false)
Parse an entire set of |-delimited FASTA-style IDs, appending the results to IDS.
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
static int Score(const CRef< CSeq_id > &id)
Wrappers for use with FindBestChoice from <corelib/ncbiutil.hpp>
bool IsReverseStrand(void) const
Return true if all ranges have reverse strand.
TSeqPos GetStart(ESeqLocExtremes ext) const
Return start and stop positions of the seq-loc.
const CSeq_id * GetId(void) const
Get the id of the location return NULL if has multiple ids or no id at all.
TSeqPos GetStop(ESeqLocExtremes ext) const
void Reset(void)
Reset reference object.
void Reset(void)
Reset reference object.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
CNcbiIstream & NcbiGetlineEOL(CNcbiIstream &is, string &str, string::size_type *count=NULL)
Read from "is" to "str" the next line (taking into account platform specifics of End-of-Line)
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
NCBI_NS_STD::string::size_type SIZE_TYPE
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
static bool MatchesMask(CTempString str, CTempString mask, ECase use_case=eCase)
Match "str" against the "mask".
static double StringToDouble(const CTempStringEx str, TStringToNumFlags flags=0)
Convert string to double.
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate whitespace in a string (in-place)
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
static SIZE_TYPE FindCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case sensitive search.
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
static bool SplitInTwo(const CTempString str, const CTempString delim, string &str1, string &str2, TSplitFlags flags=0)
Split a string into two pieces using the specified delimiters.
static unsigned int StringToUInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to unsigned int.
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
static string TruncateSpaces(const string &str, ETrunc where=eTrunc_Both)
Truncate whitespace in a string.
@ fSplit_Tokenize
All delimiters are merged and trimmed, to get non-empty tokens only.
C::value_type FindBestChoice(const C &container, F score_func)
Find the best choice (lowest score) for values in a container.
bool IsSetData(void) const
the object itself Check if a value has been assigned to Data data member.
void SetType(TType &value)
Assign a value to Type data member.
TXref & SetXref(void)
Assign a value to Xref data member.
TDbxref & SetDbxref(void)
Assign a value to Dbxref data member.
void SetLocation(TLocation &value)
Assign a value to Location data member.
void SetComment(const TComment &value)
Assign a value to Comment data member.
TExts & SetExts(void)
Assign a value to Exts data member.
void ResetExts(void)
Reset Exts data member.
void SetId(TId &value)
Assign a value to Id data member.
void SetData(TData &value)
Assign a value to Data data member.
vector< CRef< CGb_qual > > TQual
TQual & SetQual(void)
Assign a value to Qual data member.
void ResetQual(void)
Reset Qual data member.
void SetKey(const TKey &value)
Assign a value to Key data member.
void SetData(TData &value)
Assign a value to Data data member.
list< CRef< CSeq_id > > TId
virtual void Reset(void)
Reset the whole object.
list< CRef< CSeq_feat > > TFtable
Lightweight interface for getting lines of data with minimal memory copying.
double value_type
The numeric datatype used by the parser.
const struct ncbi::grid::netcache::search::fields::SIZE size
const GenericPointer< typename T::ValueType > T2 value
Defines NCBI C++ exception handling.
Multi-threading – classes, functions, and features.
Useful/utility classes and methods.
bool eq(T x_, T y_, T round_)
static bool s_StandardizeNomenclature(const IRepeatRegion &repeat, CSeq_feat::TQual &qual_list)
Translate RepeatMasker output to INSDC standard nomenclature for repeats.
static void StripParens(string &s)
static void s_SetQual(CSeq_feat::TQual &qual_list, const string &qual, const T val)
static SLJIT_INLINE sljit_ins lr(sljit_gpr dst, sljit_gpr src)
const value_slice::CValueConvert< value_slice::SRunTimeCP, FROM > Convert(const FROM &value)