41 #include <objtools/readers/source_mod_parser.hpp>
91 #define FASTA_LINE_EXPT(_eSeverity, _uLineNum, _MessageStrmOps, _eErrCode, _eProblem, _sFeature, _sQualName, _sQualValue) \
93 stringstream err_strm_49518053; \
94 err_strm_49518053 << _MessageStrmOps; \
95 PostWarning(pMessageListener, (_eSeverity), (_uLineNum), (err_strm_49518053.str()), (_eErrCode), (_eProblem), (_sFeature), (_sQualName), (_sQualValue)); \
101 #define FASTA_PROGRESS(_MessageStrmOps) \
103 stringstream err_strm_49518053; \
104 err_strm_49518053 << _MessageStrmOps; \
105 if( pMessageListener ) { \
106 pMessageListener->PutProgress(err_strm_49518053.str()); \
111 #define FASTA_WARNING(_uLineNum, _MessageStrmOps, _eProblem, _Feature) \
112 FASTA_LINE_EXPT(eDiag_Warning, _uLineNum, _MessageStrmOps, CObjReaderParseException::eFormat, _eProblem, _Feature, kEmptyStr, kEmptyStr)
114 #define FASTA_WARNING_EX(_uLineNum, _MessageStrmOps, _eProblem, _Feature, _sQualName, _sQualValue) \
115 FASTA_LINE_EXPT(eDiag_Warning, _uLineNum, _MessageStrmOps, CObjReaderParseException::eFormat, _eProblem, _Feature, _sQualName, _sQualValue)
117 #define FASTA_ERROR(_uLineNum, _MessageStrmOps, _eErrCode) \
118 FASTA_LINE_EXPT(eDiag_Error, _uLineNum, _MessageStrmOps, _eErrCode, ILineError::eProblem_GeneralParsingError, kEmptyStr, kEmptyStr, kEmptyStr)
120 #define NCBI_USE_ERRCODE_X Objtools_Rd_Fasta
125 template <
typename TStack>
139 template<
class TObject>
170 return c >=
'A' && c <=
'Z';
175 return c >=
'a' && c <=
'z';
186 return c + (
'A' -
'a');
213 case 'A':
case 'C':
case 'G':
case 'T':
214 case 'a':
case 'c':
case 'g':
case 't':
223 const list<string>& stringFlags,
260 : m_LineReader(&reader), m_MaskVec(0),
261 m_gapNmin(0), m_gap_Unknown_length(0),
263 m_fIdCheck(f_idcheck)
277 m_gapNmin(0), m_gap_Unknown_length(0),
279 m_fIdCheck(f_idcheck)
337 bool need_defline =
true;
346 "CFastaReader: Unexpected end-of-file around line " <<
LineNumber(),
356 strmodified.append(
tmp.data(),
tmp.length());
357 next_line = strmodified;
366 need_defline =
false;
383 if (c ==
'!' || c ==
'#' || c ==
';') {
386 }
else if (need_defline) {
389 need_defline =
false;
394 "CFastaReader: Input doesn't start with"
406 strmodified.append(
tmp.data(),
tmp.length());
424 "CFastaReader: There are invalid " +
x_NucOrProt() +
"residue(s) in input sequence",
425 bad_residue_positions );
430 "CFastaReader: Expected defline around line " <<
LineNumber(),
513 ids = std::move(
data.ids);
514 hasRange =
data.has_range;
515 rangeStart =
data.range_start;
516 rangeEnd =
data.range_end;
517 seqTitles = std::move(
data.titles);
532 denseg.
SetIds().push_back(new_id);
533 denseg.
SetIds().push_back(old_id);
535 denseg.
SetStarts().push_back(range_start);
536 if (range_start > range_end) {
537 denseg.
SetLens().push_back(range_start + 1 - range_end);
541 denseg.
SetLens().push_back(range_end + 1 - range_start);
550 for (
auto pId : ids) {
576 if (
data.ids.empty()) {
580 "CFastaReader: Defline lacks a proper ID around line " <<
LineNumber(),
610 "CFastaReader: Seq-id " << h.
AsString()
611 <<
" is a duplicate around line " <<
LineNumber(),
622 const bool has_range,
626 if (defline_ids.empty()) {
673 const static size_t kWarnTitleLength = 1000;
674 if( lineInfo.
m_sLineText.length() > kWarnTitleLength ) {
676 "FASTA-Reader: Title is very long: " << lineInfo.
m_sLineText.length()
677 <<
" characters (max is " << kWarnTitleLength <<
")",
719 size_t good = 0, bad = 0;
724 static_cast<size_t>(70));
725 const bool bIsNuc = (
729 size_t ambig_nuc = 0;
730 for (
size_t pos = 0; pos < len_to_check; ++pos) {
731 unsigned char c = s[pos];
737 }
else if( c ==
'-' ) {
738 if( ! bIgnoreHyphens ) {
743 }
else if (
isspace(c) || (c >=
'0' && c <=
'9')) {
745 }
else if (c ==
';') {
751 if (bad >= good / 3 &&
752 (len_to_check > 3 || good == 0 || bad > good))
756 <<
", there's a line that doesn't look like plausible data, "
757 "but it's not marked as defline or comment.",
761 const static size_t kWarnPercentAmbiguous = 40;
762 const size_t percent_ambig = (good == 0)?100:((ambig_nuc * 100) / good);
763 if( len_to_check > 3 && percent_ambig > kWarnPercentAmbiguous ) {
765 "FASTA-Reader: Start of first data line in seq is about "
766 << percent_ambig <<
"% ambiguous nucleotides (shouldn't be over "
767 << kWarnPercentAmbiguous <<
"%)",
787 const size_t s_len = s.
length();
800 for( ; pos < s_len && (c = s[pos]) !=
';'; ++pos) {
809 const bool bIsNuc = (
819 Int8 bad_pos_line_num = -1;
820 vector<TSeqPos> bad_pos_vec;
823 const bool bHyphensAreGaps =
825 const bool bAllowLetterGaps =
828 bool bIgnorableHyphenSeen =
false;
832 eCharType_NormalNonGap,
833 eCharType_MaskedNonGap,
835 eCharType_JustIgnore,
836 eCharType_HyphenToIgnoreAndWarn,
841 for (
size_t pos = 0; pos < s_len; ++pos) {
842 const unsigned char c = s[pos];
856 case 'A':
case 'B':
case 'C':
case 'D':
860 case 'R':
case 'S':
case 'T':
case 'U':
case 'V':
case 'W':
867 case 'a':
case 'b':
case 'c':
case 'd':
871 case 'r':
case 's':
case 't':
case 'u':
case 'v':
case 'w':
879 case 'O':
case 'P':
case 'Q':
895 case 'o':
case 'p':
case 'q':
897 char_type = (bIsNuc ? eCharType_Bad : eCharType_MaskedNonGap );
901 char_type = ( bIsNuc && bAllowLetterGaps ?
902 eCharType_Gap : eCharType_NormalNonGap );
905 char_type = ( bIsNuc && bAllowLetterGaps ?
906 eCharType_Gap : eCharType_MaskedNonGap );
911 bAllowLetterGaps ? eCharType_Gap :
912 eCharType_NormalNonGap);
916 bAllowLetterGaps ? eCharType_Gap :
917 eCharType_MaskedNonGap);
922 bHyphensAreGaps ? eCharType_Gap :
923 bHyphensIgnoreAndWarn ? eCharType_HyphenToIgnoreAndWarn :
924 eCharType_NormalNonGap );
930 case '\t':
case '\n':
case '\v':
case '\f':
case '\r':
case ' ':
939 case eCharType_NormalNonGap:
945 case eCharType_MaskedNonGap:
951 case eCharType_Gap: {
955 size_t pos2 = pos + 1;
956 while( pos2 < s_len && s[pos2] == c ) {
965 case eCharType_JustIgnore:
967 case eCharType_HyphenToIgnoreAndWarn:
968 bIgnorableHyphenSeen =
true;
970 case eCharType_Comment:
975 if( bad_pos_line_num < 0 ) {
978 bad_pos_vec.push_back(pos);
987 if( bIgnorableHyphenSeen ) {
988 _ASSERT( bHyphensIgnoreAndWarn );
990 "CFastaReader: Hyphens are invalid and will be ignored around line " <<
LineNumber(),
998 if( ! bad_pos_vec.empty() ) {
1001 "CFastaReader: There are invalid " +
x_NucOrProt() +
"residue(s) in input sequence",
1004 stringstream warn_strm;
1005 warn_strm <<
"FASTA-Reader: Ignoring invalid " <<
x_NucOrProt()
1006 <<
"residues at position(s): ";
1052 if ((l == pos) || (l == pos + (*
GetLineReader()).length() && atStartOfLine)) {
1061 const auto& gap_linkage_evidence =
1071 gap_linkage_evidence));
1113 sRemainingLine = sRemainingLine.
substr(3);
1122 while( uNumDigits != sRemainingLine.
size() &&
1123 ::
isdigit(sRemainingLine[uNumDigits]) )
1130 if( uGapSize <= 0 ) {
1132 "CFastaReader: Bad gap size at line " <<
LineNumber(),
1138 sRemainingLine = sRemainingLine.
substr(sDigits.
length());
1144 TModKeyValueMultiMap modKeyValueMultiMap;
1145 while( ! sRemainingLine.
empty() ) {
1148 uOpenBracketPos = 0;
1153 uPosOfEqualSign = sRemainingLine.
find(
'=', uOpenBracketPos + 1);
1157 uCloseBracketPos = sRemainingLine.
find(
']', uPosOfEqualSign + 1);
1162 "CFastaReader: Problem parsing gap mods at line "
1171 sRemainingLine.
substr(uOpenBracketPos + 1,
1172 (uPosOfEqualSign - uOpenBracketPos - 1) ) );
1174 sRemainingLine.
substr(uPosOfEqualSign + 1,
1175 uCloseBracketPos - uPosOfEqualSign - 1) );
1178 modKeyValueMultiMap.insert(
1182 sRemainingLine = sRemainingLine.
substr(uCloseBracketPos + 1);
1191 bool bConflictingGapTypes =
false;
1197 if (
m_gap_type && modKeyValueMultiMap.empty())
1205 if (rec.second.m_eType == *pGapType)
1207 eLinkEvid = rec.second.m_eLinkEvid;
1212 ITERATE(TModKeyValueMultiMap, modKeyValue_it, modKeyValueMultiMap) {
1213 const TStr & sKey = modKeyValue_it->first;
1214 const TStr & sValue = modKeyValue_it->second;
1217 if( canonicalKey ==
"gap-type") {
1220 if( pGapTypeInfo ) {
1226 }
else if( eGapType != *pGapType ) {
1228 bConflictingGapTypes =
true;
1233 "Unknown gap-type: " << sValue,
1243 if( canonicalKey ==
"linkage-evidence") {
1245 vector<CTempString> arrLinkageEvidences;
1249 ITERATE(vector<CTempString>, link_evid_it, arrLinkageEvidences) {
1253 if( find_iter != linkage_evidence_to_value_map.
end() ) {
1254 setOfLinkageEvidence.
insert(
1256 find_iter->second));
1260 "Unknown linkage-evidence: " << sValue,
1273 "Unknown gap modifier name(s): " << sKey,
1278 if( bConflictingGapTypes ) {
1280 "There were conflicting gap-types around line " <<
LineNumber(),
1288 if( ! pGapType && ! setOfLinkageEvidence.
empty() ) {
1293 switch( eLinkEvid ) {
1295 if( setOfLinkageEvidence.
empty() ) {
1300 }
else if( setOfLinkageEvidence.
size() > 1 ||
1306 "FASTA-Reader: Unknown gap-type can have linkage-evidence "
1307 "of type 'unspecified' only.",
1310 setOfLinkageEvidence.
clear();
1315 if( ! setOfLinkageEvidence.
empty() ) {
1317 "FASTA-Reader: This gap-type cannot have any "
1318 "linkage-evidence specified, so any will be ignored.",
1321 setOfLinkageEvidence.
clear();
1325 if( setOfLinkageEvidence.
empty() ) {
1328 if( setOfLinkageEvidence.
size() == 1 &&
1332 "CFastaReader: This gap-type should have at least one "
1333 "specified linkage-evidence.",
1343 uPos, uGapSize, eIsKnown,
LineNumber(), pGapType,
1344 setOfLinkageEvidence ) );
1377 vector<TSeqPos> badIndexes;
1379 if ( ! badIndexes.empty() ) {
1381 "CFastaReader: Invalid " +
x_NucOrProt() +
"residue(s) in input sequence",
1389 char gap_char(inst.
IsAa() ?
'X' :
'N');
1397 const bool bHasSpecifiedGapType =
1399 const bool bHasSpecifiedLinkEvid =
1400 ( ! (*it)->m_setOfLinkageEvidence.empty() &&
1401 ( (*it)->m_setOfLinkageEvidence.size() > 1 ||
1403 if( bHasSpecifiedGapType || bHasSpecifiedLinkEvid )
1406 "CFastaReader: Gap mods are ignored because gaps are "
1407 "becoming N's or X's in this case.",
1411 if ((*it)->m_uPos > pos) {
1412 new_data.append(
m_SeqData, pos, (*it)->m_uPos - pos);
1413 pos = (*it)->m_uPos;
1415 new_data.append((*it)->m_uLen, gap_char);
1436 "FASTA-Reader: No residues given",
1455 if (
n==0 ||
m_Gaps[0]->m_uPos > 0)
1466 gap_ds->SetLoc().SetNull();
1468 gap_ds->SetLiteral().SetLength(
m_Gaps[
i]->m_uLen);
1473 if(
m_Gaps[
i]->m_pGapType || !
m_Gaps[
i]->m_setOfLinkageEvidence.empty() ) {
1474 CSeq_gap & seq_gap = gap_ds->SetLiteral().SetSeq_data().SetGap();
1480 if( !
m_Gaps[
i]->m_setOfLinkageEvidence.empty() ) {
1488 m_Gaps[
i]->m_setOfLinkageEvidence )
1492 pNewLinkEvid->SetType( *link_evid_it );
1493 vecLinkEvids.push_back(std::move(pNewLinkEvid));
1498 delta_ext.
Set().push_back(std::move(gap_ds));
1501 if (next_start !=
m_Gaps[
i]->m_uPos) {
1507 if (delta_ext.
Get().size() == 1) {
1511 ->SetLiteral().SetSeq_data());
1521 pDesc->SetMolinfo().SetBiomol(biomol);
1522 bioseq.
SetDescr().Set().emplace_back(std::move(pDesc));
1527 auto& seqInst = bioseq.
SetInst();
1529 (find_if(beginSeqData, endSeqData, [](
char c) {
return (c==
't' || c ==
'T'); }) != endSeqData);
1531 (find_if(beginSeqData, endSeqData, [](
char c) {
return (c==
'u' || c ==
'U'); }) != endSeqData);
1533 if (hasT && !hasU) {
1539 if (hasU && !hasT) {
1571 inst.SetMol(default_mol);
1575 if (inst.IsSetMol()) {
1610 inst.SetMol(default_mol);
1618 const TStr& sLineText,
1624 const static size_t kWarnNumNucCharsAtEnd = 20;
1625 const static size_t kWarnAminoAcidCharsAtEnd = 50;
1627 const size_t length = sLineText.
length();
1632 const SIZE_TYPE last_pos_to_check_for_nuc = (sLineText.
length() - kWarnNumNucCharsAtEnd);
1633 for( ; pos_to_check >= last_pos_to_check_for_nuc; --pos_to_check ) {
1639 if( pos_to_check < last_pos_to_check_for_nuc ) {
1641 "FASTA-Reader: Title ends with at least " << kWarnNumNucCharsAtEnd
1642 <<
" valid nucleotide characters. Was the sequence "
1643 <<
"accidentally put in the title line?",
1655 const SIZE_TYPE last_pos_to_check_for_amino_acid =
1656 ( sLineText.
length() - kWarnAminoAcidCharsAtEnd );
1657 for( ; pos_to_check >= last_pos_to_check_for_amino_acid; --pos_to_check ) {
1660 const char ch = sLineText[pos_to_check];
1661 if( ( ch >=
'A' && ch <=
'Z') || (ch >=
'a' && ch <=
'z') ) {
1669 if( pos_to_check < last_pos_to_check_for_amino_acid ) {
1671 "FASTA-Reader: Title ends with at least " << kWarnAminoAcidCharsAtEnd
1672 <<
" valid amino acid characters. Was the sequence "
1673 <<
"accidentally put in the title line?",
1690 if ( !entry->
IsSet()
1692 static_cast<unsigned int>(
max(reference_row + 1, 2)))
1695 "CFastaReader::ReadAlignedSet: not enough input sequences.",
1697 }
else if (reference_row >= 0) {
1712 vector<TSeqPos> lengths;
1739 vector<TSeqPos>::const_iterator it(lengths.begin());
1741 for (++it; it != lengths.end(); ++it) {
1744 "CFastaReader::ReadAlignedSet: Rows have different "
1745 "lengths. For example, look around line " <<
LineNumber(),
1761 vector<TBuilderRef> builders(rows);
1764 if (
r != reference_row) {
1765 builders[
r].Reset(
new TBuilder(ids[reference_row], ids[
r]));
1769 const TSubMap& submap = it->second;
1771 if (rr_it2 == submap.
end()) {
1775 builders[
r]->AddData(it->first, TBuilder::kContinued,
1781 if (it2 != submap.
end() &&
r == it2->first) {
1782 if (
r != reference_row) {
1783 builders[
r]->AddData(it->first, rr_it2->second,
1789 builders[
r]->AddData(it->first, rr_it2->second,
1790 TBuilder::kContinued);
1799 if (
r != reference_row) {
1800 annot_align.push_back(builders[
r]->GetCompletedAlignment());
1825 const TSubMap& submap = it->second;
1828 if (it2 != submap.
end() &&
r == it2->first) {
1829 dss.push_back(it2->second);
1832 _ASSERT(dss.size() >=
size_t(rows) && old_len > 0);
1835 dss.push_back(last_pos);
1837 dss.push_back(last_pos + old_len);
1846 annot.
SetData().SetAlign().push_back(sa);
1938 if ( !
input.is_open() ) {
1952 if ( !
input.is_open() ) {
1959 while ( !lr->
AtEOF() ) {
1967 if ( !lr->
AtEOF() ) {
1980 for (
const auto&
mod : mods) {
2005 transform(postponed_mods.begin(), postponed_mods.end(),
2007 [](
const string& mod_name) { return CModHandler::GetCanonicalName(mod_name); });
2019 const string& title,
2024 string processed_title = title;
2026 x_AddMods(line_number, bioseq, processed_title, pMessageListener);
2032 "FASTA-Reader: Ignoring FASTA modifier(s) found because "
2033 "the input was not expected to have any.",
2039 if (!processed_title.empty()) {
2041 pDesc->SetTitle() = processed_title;
2042 bioseq.
SetDescr().Set().push_back(std::move(pDesc));
2050 string& processed_title,
2062 const auto idString = pFirstID->AsFastaString();
2070 errorReporter(idString, line_number, pMessageListener);
2078 const bool logInfo =
2085 processed_title = remainder;
2097 auto it = mods.begin();
2098 while(it != mods.end()) {
2104 mit->second.second.push_back(*it);
2109 it = mods.
erase(it);
2132 newString.reserve(sValue.
length());
2135 const char ch = sValue[ii];
2137 newString.push_back(
tolower(ch));
2138 }
else if( ch ==
' ' || ch ==
'_' ) {
2139 newString.push_back(
'-');
2141 newString.push_back(ch);
2158 m_eKnownSize(eKnownSize),
2159 m_uLineNumber(uLineNumber),
2160 m_pGapType(pGapType),
2161 m_setOfLinkageEvidence(setOfLinkageEvidence)
2175 for (
const auto& key_val : countToEvidenceMap) {
2176 const auto& input_evidence_set = key_val.second;
2178 for (
const auto& evidence : input_evidence_set) {
2190 for (
const auto& evidence : evidences) {
2207 (_eSeverity),
static_cast<unsigned>(_uLineNum),
2210 sSeqId, (_sFeature),
2211 (_sQualName), (_sQualValue),
2213 if ( ! pMessageListener && (_eSeverity) <=
eDiag_Warning ) {
2215 }
else if ( ! pMessageListener || ! pMessageListener->
PutError( *pLineExpt ) )
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
static const NStr::TNumToStringFlags kFlags
void transform(Container &c, UnaryFunction *op)
const SBadResiduePositions & GetBadResiduePositions(void) const THROWS_NONE
const CSeq_id * GetFirstId() const
void AddAndSplit(const CTempString &src, CSeq_data::E_Choice format, TSeqPos length, bool gaps_ok=false, bool allow_packing=true)
add a chunk of sequence, splitting it as necessary for the sake of compactness (isolating ambiguous p...
Helper class to build pairwise alignments, with double gaps automatically spliced out.
static TSeqPos ParseRange(const CTempString &s, TSeqPos &start, TSeqPos &end, ILineErrorListener *pMessageListener)
static size_t s_MaxGeneralTagLength
static size_t s_MaxAccessionLength
static void ParseDefline(const CTempString &defline, const SDeflineParseInfo &info, const TIgnoredProblems &ignoredErrors, TIds &ids, bool &hasRange, TSeqPos &rangeStart, TSeqPos &rangeEnd, TSeqTitles &seqTitles, ILineErrorListener *pMessageListener)
static size_t s_MaxLocalIDLength
bool CacheIdHandle(CSeq_id_Handle idh)
virtual CRef< CSeq_id > GenerateID(bool unique_id)
void SetGenerator(CSeqIdGenerator &generator)
void ParseDefLine(const TStr &s, ILineErrorListener *pMessageListener)
void ParseTitle(const SLineTextAndLoc &lineInfo, ILineErrorListener *pMessageListener=0)
SFastaFileMap::SFastaEntry m_MapEntry
void AssembleSeq(ILineErrorListener *pMessageListener)
CFastaMapper(ILineReader &reader, SFastaFileMap *fasta_map, TFlags flags, FIdCheck f_idcheck=CSeqIdCheck())
Base class for reading FASTA sequences.
static void Apply(const CModHandler &mod_handler, CBioseq &bioseq, TSkippedMods &skipped_mods, FPostMessage fPostMessage=nullptr)
void SetExcludedMods(const vector< string > &excluded_mods)
static const string & GetCanonicalName(const TModEntry &mod_entry)
void SetIgnoredMods(const list< string > &ignored_mods)
list< CModData > TModList
void AddMods(const TModList &mods, EHandleExisting handle_existing, TModList &rejected_mods, FReportError fReportError=nullptr)
static CObjReaderLineException * Create(EDiagSev eSeverity, unsigned int uLine, const std::string &strMessage, EProblem eProblem=eProblem_GeneralParsingError, const std::string &strSeqId=string(""), const std::string &strFeatureName=string(""), const std::string &strQualifierName=string(""), const std::string &strQualifierValue=string(""), CObjReaderLineException::EErrCode eErrCode=eFormat, const TVecOfLines &vecOfOtherLines=TVecOfLines())
Please use this instead of the constructor because the ctor is protected.
@ eFormat
Some of these are pretty specialized.
Defines and provides stubs for a general interface to a variety of file readers.
static void xAddStringFlagsWithMap(const list< string > &stringFlags, const map< string, TReaderFlags > flagMap, TReaderFlags &baseFlags)
void SetCounter(TCount count)
TCount GetCounter(void) const
static const TGapTypeMap & GetNameToGapTypeInfoMap(void)
This is for if the user needs to get the gap-type string to SGapTypeInfo info directly (For example,...
ELinkEvid
indicates which linkage-evidences a given gap-type can accept, if any
@ eLinkEvid_UnspecifiedOnly
only the "unspecified" linkage-evidence is allowed
@ eLinkEvid_Forbidden
no linkage-evidence is allowed
@ eLinkEvid_Required
any linkage-evidence is allowed, and at least one is required
static const SGapTypeInfo * NameToGapTypeInfo(const CTempString &sName)
From a gap-type string, get the SGapTypeInfo, insensitive to case, etc.
static bool IsAa(EMol mol)
static void Validate(const CSeq_data &in_seq, vector< TSeqPos > *badIdx, TSeqPos uBeginIdx=0, TSeqPos uLength=0)
static TSeqPos Pack(CSeq_data *in_seq, TSeqPos uLength=ncbi::numeric_limits< TSeqPos >::max())
CTempPusher(TStack &s, const TValue &v)
TStack::value_type TValue
CRef< TObject > & m_pObj1
CRef< TObject > & m_pObj2
CTempRefSwap(CRef< TObject > &pObj1, CRef< TObject > &pObj2)
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
static bool HasMods(const CTempString &title)
static void Apply(const CTempString &title, TModList &mods, string &remainder)
CUser_object & AddField(const string &label, const string &value, EParseField parse=eParse_String)
add a data field to the user object that holds a given value
Callback interface to scan fasta file for entries.
virtual bool PutError(const ILineError &)=0
Store error in the container, and return true if error was stored fine, and return false if the calle...
@ eProblem_InvalidResidue
@ eProblem_IgnoredResidue
@ eProblem_ParsingModifiers
@ eProblem_UnexpectedAminoAcids
@ eProblem_ModifierFoundButNoneExpected
@ eProblem_UnexpectedNucResidues
@ eProblem_TooManyAmbiguousResidues
@ eProblem_UnrecognizedQualifierName
@ eProblem_NonPositiveLength
@ eProblem_ContradictoryModifiers
@ eProblem_ExpectedModifierMissing
@ eProblem_ExtraModifierFound
Abstract base class for lightweight line-by-line reading.
container_type::const_iterator const_iterator
const_iterator begin() const
const_iterator end() const
iterator_bool insert(const value_type &val)
const_iterator find(const key_type &key) const
iterator_bool insert(const value_type &val)
const_iterator begin() const
ECharType
Porter's Stemming Algorithm.
static bool s_ASCII_IsUnAmbigNuc(unsigned char c)
bool s_ASCII_IsAmbigNuc(unsigned char c)
bool s_ASCII_IsUpper(unsigned char c)
unsigned char s_ASCII_MustBeLowerToUpper(unsigned char c)
CTempPusher< stack< CFastaReader::TFlags > > CFlagGuard
#define FASTA_WARNING(_uLineNum, _MessageStrmOps, _eProblem, _Feature)
#define FASTA_PROGRESS(_MessageStrmOps)
bool s_ASCII_IsLower(unsigned char c)
static void s_AppendMods(const CModHandler::TModList &mods, string &title)
#define FASTA_WARNING_EX(_uLineNum, _MessageStrmOps, _eProblem, _Feature, _sQualName, _sQualValue)
static bool sRefineNaMol(const char *beginSeqData, const char *endSeqData, CBioseq &bioseq)
bool s_ASCII_IsAlpha(unsigned char c)
#define FASTA_ERROR(_uLineNum, _MessageStrmOps, _eErrCode)
static void s_AddBiomol(CMolInfo::EBiomol biomol, CBioseq &bioseq)
Operators to edit gaps in sequences.
Helper class to build pairwise alignments, with double gaps automatically spliced out.
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
#define ITERATE_0_IDX(idx, up_to)
idx loops from 0 (inclusive) to up_to (exclusive)
unsigned int TSeqPos
Type for sequence locations and lengths.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
int TSignedSeqPos
Type for signed sequence position.
const TSeqPos kInvalidSeqPos
Define special value for invalid sequence position.
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
#define LOG_POST_X(err_subcode, message)
#define DIAG_COMPILE_INFO
Make compile time diagnostic information object to use in CNcbiDiag and CException.
EDiagSev
Severity level for the posted diagnostics.
@ eDiag_Info
Informational message.
@ eDiag_Warning
Warning message.
void Warning(CExceptionArgs_Base &args)
#define NCBI_THROW2(exception_class, err_code, message, extra)
Throw exception with extra parameter.
#define ENUM_METHOD_NAME(EnumName)
virtual bool ParseGapLine(const TStr &s, ILineErrorListener *pMessageListener)
virtual void ParseTitle(const SLineTextAndLoc &lineInfo, ILineErrorListener *pMessageListener)
TFastaSeqIds all_seq_ids
List of all seq.ids.
TSeqTitles m_CurrentSeqTitles
bool TestFlag(EFlags flag) const
TSeqPos m_CurrentGapLength
void SetIgnoredMods(const list< string > &ignored_mods)
virtual bool IsValidLocalID(const TStr &s) const
const CSeqIdGenerator & GetIDGenerator(void) const
virtual void CheckDataLine(const TStr &s, ILineErrorListener *pMessageListener)
virtual CRef< CSeq_entry > ReadOneSeq(ILineErrorListener *pMessageListener=nullptr)
Read a single effective sequence, which may turn out to be a segmented set.
void SetGapLinkageEvidence(CSeq_gap::EType type, const set< int > &defaultEvidence, const map< TSeqPos, set< int >> &countToEvidenceMap)
virtual ~CFastaReader(void)
static size_t ParseRange(const TStr &s, TSeqPos &start, TSeqPos &end, ILineErrorListener *pMessageListener)
SGap::TNullableGapType m_gap_type
virtual CRef< CSerialObject > ReadObject(ILineReader &lr, ILineErrorListener *pErrors)
CReaderBase overrides.
std::string x_NucOrProt(void) const
void x_ApplyMods(const string &title, TSeqPos line_number, CBioseq &bioseq, ILineErrorListener *pMessageListener)
TFlags GetFlags(void) const
CRef< CSeq_entry > ReadSet(int max_seqs=kMax_Int, ILineErrorListener *pMessageListener=nullptr)
Read multiple sequences (by default, as many as are available.)
static string CanonicalizeString(const CTempString &sValue)
CSeqIdGenerator & SetIDGenerator(void)
static CRef< ILineReader > New(const string &filename)
Return a new ILineReader object corresponding to the given filename, taking "-" (but not "....
void SaveMasks(TMasks *masks)
long TFlags
binary OR of EFlags
virtual CRef< CSeq_entry > ReadSeqEntry(ILineReader &lr, ILineErrorListener *pErrors)
Read an object from a given line reader, render it as a single Seq-entry, if possible.
SGap::TLinkEvidSet m_DefaultLinkageEvidence
virtual void x_CloseMask(void)
virtual void UngetLine(void)=0
Unget current line, which must be valid.
virtual bool CreateWarningsForSeqDataInTitle(const TStr &sLineText, TSeqPos iLineNum, ILineErrorListener *pMessageListener) const
void SetGapLinkageEvidences(CSeq_gap::EType type, const set< int > &evidences)
CRef< CSeq_loc > SaveMask(void)
Directs the *following* call to ReadOneSeq to note the locations of lowercase letters.
CRef< CFastaIdHandler > m_IDHandler
SGap(TSeqPos pos, TSignedSeqPos len, EKnownSize eKnownSize, Uint8 uLineNumber, TNullableGapType pGapType=TNullableGapType(), const set< CLinkage_evidence::EType > &setOfLinkageEvidence=set< CLinkage_evidence::EType >())
TSeqPos m_gap_Unknown_length
CRef< CSeq_entry > x_ReadSeqsToAlign(TIds &ids, ILineErrorListener *pMessageListener)
static void AddStringFlags(const list< string > &stringFlags, TFlags &baseFlags)
virtual void AssembleSeq(ILineErrorListener *pMessageListener)
CRef< ILineReader > m_LineReader
void x_AddMultiwayAlignment(CSeq_annot &annot, const TIds &ids)
const CSeq_id & GetBestID(void) const
CBioseq::TId & SetIDs(void)
TSeqPos GetCurrentPos(EPosType pos_type)
const TPostponedModMap & GetPostponedModMap() const
void x_CheckForPostponedMods(const string &idString, TSeqPos line_number, CModHandler::TModList &mods)
TPostponedModMap m_PostponedModMap
virtual void PostProcessIDs(const CBioseq::TId &defline_ids, const string &defline, bool has_range=false, TSeqPos range_start=kInvalidSeqPos, TSeqPos range_end=kInvalidSeqPos)
CRef< CBioseq > m_CurrentSeq
virtual void EntryFound(CRef< CSeq_entry > se, CNcbiStreampos stream_position)=0
Callback function, called after reading the fasta entry.
void SetMinGaps(TSeqPos gapNmin, TSeqPos gap_Unknown_length)
vector< CRef< CSeq_id > > TIds
virtual void x_OpenMask(void)
bool xSetSeqMol(const list< CRef< CSeq_id >> &ids, CSeq_inst_Base::EMol &mol)
CRef< CSeq_entry > ReadAlignedSet(int reference_row, ILineErrorListener *pMessageListener=nullptr)
Read as many sequences as are available, and interpret them as an alignment, with hyphens marking rel...
void x_AddMods(TSeqPos line_number, CBioseq &bioseq, string &processed_title, ILineErrorListener *pMessageListener)
virtual void ParseDataLine(const TStr &s, ILineErrorListener *pMessageListener)
std::vector< ILineError::EProblem > m_ignorable
virtual void x_CloseGap(TSeqPos len, bool atStartOfLine, ILineErrorListener *pMessageListener)
CNcbiStreampos stream_offset
Molecule offset in file.
vector< ILineError::EProblem > TIgnoredProblems
void x_AddPairwiseAlignments(CSeq_annot &annot, const TIds &ids, TRowNum reference_row)
CRef< CSeq_align > xCreateAlignment(CRef< CSeq_id > old_id, CRef< CSeq_id > new_id, TSeqPos range_start, TSeqPos range_end)
void SetExcludedMods(const vector< string > &excluded_mods)
static void ParseDefLine(const TStr &defLine, const SDefLineParseInfo &info, const TIgnoredProblems &ignoredErrors, list< CRef< CSeq_id >> &ids, bool &hasRange, TSeqPos &rangeStart, TSeqPos &rangeEnd, TSeqTitles &seqTitles, ILineErrorListener *pMessageListener)
bool AtEOF(void) const
Indicates (negatively) whether there is any more input.
virtual bool AtEOF(void) const =0
Indicates (negatively) whether there is any more input.
void SetMaxIDLength(Uint4 max_len)
If this is set, an exception will be thrown if a Sequence ID exceeds the given length.
CFastaReader(ILineReader &reader, TFlags flags=0, FIdCheck f_idcheck=CSeqIdCheck())
void ScanFastaFile(IFastaEntryScan *scanner, CNcbiIfstream &input, CFastaReader::TFlags fread_flags)
Scan FASTA files, call IFastaEntryScan::EntryFound (payload function)
Uint8 LineNumber(void) const
Int8 StreamPosition(void) const
string seq_id
Primary sequence Id.
virtual char PeekChar(void) const =0
Returns the first character of the next string without consuming it.
CFastaDeflineReader::FIdCheck FIdCheck
void SetPostponedMods(const list< string > &postponed_mods)
virtual ~IFastaEntryScan()
void ReadFastaFileMap(SFastaFileMap *fasta_map, CNcbiIfstream &input)
Function reads input stream (assumed that it is FASTA format) one molecule entry after another fillin...
bool m_bModifiedMaxIdLength
virtual void AssignMolType(ILineErrorListener *pMessageListener)
string description
Molecule description.
void CloseGap(bool atStartOfLine=false, ILineErrorListener *pMessageListener=nullptr)
unordered_set< string > m_PostponedMods
ILineReader & GetLineReader(void)
void IgnoreProblem(ILineError::EProblem problem)
const CBioseq::TId & GetIDs(void) const
void x_SetDeflineParseInfo(SDefLineParseInfo &info)
vector< SLineTextAndLoc > TSeqTitles
TCountToLinkEvidMap m_GapsizeToLinkageEvidence
virtual CT_POS_TYPE GetPosition(void) const =0
Return the current (absolute) position.
CRef< CSeq_entry > ReadFasta(CNcbiIstream &in, CFastaReader::TFlags flags, int *counter, CFastaReader::TMasks *lcv, ILineErrorListener *pMessageListener)
A const-correct replacement for the deprecated ReadFasta function.
virtual void GenerateID(void)
virtual void PostWarning(ILineErrorListener *pMessageListener, EDiagSev _eSeverity, size_t _uLineNum, CTempString _MessageStrmOps, CObjReaderParseException::EErrCode _eErrCode, ILineError::EProblem _eProblem, CTempString _sFeature, CTempString _sQualName, CTempString _sQualValue) const
@ fNoParseID
Generate an ID (whole defline -> title)
@ fQuickIDCheck
Just check local IDs' first characters.
@ fDLOptional
Don't require a leading defline.
@ fLaxGuess
Use legacy heuristic for guessing seq. type.
@ fHyphensIgnoreAndWarn
When a hyphen is encountered in seq data, ignore it but warn.
@ fRequireID
Reject deflines that lack IDs.
@ fSkipCheck
Skip (rudimentary) body content check.
@ fDisableNoResidues
If no residues found do not raise an error.
@ fUniqueIDs
Forbid duplicate IDs.
@ fLetterGaps
Parse runs of Ns when splitting data.
@ fIgnoreMods
Ignore mods entirely. Incompatible with fAddMods.
@ fAddMods
Parse defline mods and add to SeqEntry.
@ fNoUserObjs
Don't save raw deflines in User-objects.
@ fForceType
Force specified type regardless of accession.
@ fParseRawID
Try to identify raw accessions.
@ fNoSplit
Don't split out ambiguous sequence regions.
@ fUseIupacaa
If Prot, use iupacaa instead of the default ncbieaa.
@ fAssumeNuc
Assume nucs unless accns indicate otherwise.
@ fParseGaps
Make a delta sequence if gaps found.
@ fValidate
Check (alphabetic) residue validity.
@ fOneSeq
Just read the first sequence found.
@ fLeaveAsText
Don't reencode at all, just parse.
@ fAssumeProt
Assume prots unless accns indicate otherwise.
@ fNoSeqData
Parse the deflines but skip the data.
@ fStrictGuess
Assume no typos when guessing sequence type.
@ fDisableParseRange
No ranges in seq-ids. Ranges part of seq-id instead.
const string AsFastaString(void) const
EAccessionInfo
For IdentifyAccession (below)
static bool IsValidLocalID(const CTempString &s)
Perform rudimentary validation on potential local IDs, whose contents should be pure ASCII and limite...
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
string AsString(void) const
static int BestRank(const CRef< CSeq_id > &id)
void SetPacked_int(TPacked_int &v)
void SetNull(void)
Override all setters to incorporate cache invalidation.
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
void Reset(void)
Reset reference object.
void Reset(void)
Reset reference object.
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
void Swap(TThisType &ref)
Swaps the pointer with another reference.
bool IsNull(void) const THROWS_NONE
Check if pointer is null – same effect as Empty().
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
uint32_t Uint4
4-byte (32-bit) unsigned integer
int64_t Int8
8-byte (64-bit) signed integer
uint64_t Uint8
8-byte (64-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define BEGIN_SCOPE(ns)
Define a new scope.
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
IO_PREFIX::streampos CNcbiStreampos
Portable alias for streampos.
NCBI_NS_STD::string::size_type SIZE_TYPE
static string PrintableString(const CTempString str, TPrintableMode mode=fNewLine_Quote|fNonAscii_Passthru)
Get a printable version of the specified string.
static CTempString TruncateSpaces_Unsafe(const CTempString str, ETrunc where=eTrunc_Both)
Truncate spaces in a string.
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
bool empty(void) const
Return true if the represented string is empty (i.e., the length is zero)
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
size_type length(void) const
Return the length of the represented array.
static unsigned int StringToUInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to unsigned int.
CTempString substr(size_type pos) const
Obtain a substring from this string, beginning at a given offset.
TErrCode GetErrCode(void) const
Get error code.
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
size_type find(const CTempString match, size_type pos=0) const
Find the first instance of the entire matching string within the current string, beginning at an optional offset.
size_type size(void) const
Return the length of the represented array.
static const size_type npos
@ fConvErr_NoThrow
Do not throw an exception on error.
@ fSplit_MergeDelimiters
Merge adjacent delimiters.
@ eTrunc_Begin
Truncate leading spaces only.
C::value_type FindBestChoice(const C &container, F score_func)
Find the best choice (lowest score) for values in a container.
void SetType(TType &value)
Assign a value to Type data member.
TLens & SetLens(void)
Assign a value to Lens data member.
void SetSegs(TSegs &value)
Assign a value to Segs data member.
const TLens & GetLens(void) const
Get the Lens member data.
void SetDim(TDim value)
Assign a value to Dim data member.
vector< TSignedSeqPos > TStarts
void SetDim(TDim value)
Assign a value to Dim data member.
void SetType(TType value)
Assign a value to Type data member.
TStarts & SetStarts(void)
Assign a value to Starts data member.
TStrands & SetStrands(void)
Assign a value to Strands data member.
void SetNumseg(TNumseg value)
Assign a value to Numseg data member.
TIds & SetIds(void)
Assign a value to Ids data member.
@ eType_partial
mapping pieces together
TSet & SetSet(void)
Select the variant.
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
const TSet & GetSet(void) const
Get the variant data.
bool IsSeq(void) const
Check if variant Seq is selected.
bool IsSet(void) const
Check if variant Set is selected.
const TSeq_set & GetSeq_set(void) const
Get the Seq_set member data.
TSeq & SetSeq(void)
Select the variant.
TSeq_set & SetSeq_set(void)
Assign a value to Seq_set data member.
void SetData(TData &value)
Assign a value to Data data member.
const TInst & GetInst(void) const
Get the Inst member data.
void SetExt(TExt &value)
Assign a value to Ext data member.
bool IsSetMol(void) const
Check if a value has been assigned to Mol data member.
list< CRef< CSeq_align > > TAlign
void SetType(TType value)
Assign a value to Type data member.
bool IsSetInst(void) const
The sequence data. Check if a value has been assigned to Inst data member.
Tdata & Set(void)
Assign a value to data member.
TLength GetLength(void) const
Get the Length member data.
list< CRef< CSeq_id > > TId
void SetInst(TInst &value)
Assign a value to Inst data member.
EMol
molecule class in living organism
void SetDescr(TDescr &value)
Assign a value to Descr data member.
TUser & SetUser(void)
Select the variant.
void SetRepr(TRepr value)
Assign a value to Repr data member.
const Tdata & Get(void) const
Get the member data.
void SetLength(TLength value)
Assign a value to Length data member.
TLinkage_evidence & SetLinkage_evidence(void)
Assign a value to Linkage_evidence data member.
void SetLinkage(TLinkage value)
Assign a value to Linkage data member.
void SetSeq_data(TSeq_data &value)
Assign a value to Seq_data data member.
void ResetExt(void)
Reset Ext data member.
list< CRef< CLinkage_evidence > > TLinkage_evidence
@ eRepr_delta
sequence made by changes (delta) to others
@ eRepr_raw
continuous sequence
@ eRepr_virtual
no seq data
@ e_Ncbieaa
extended ASCII 1 letter aa codes
@ e_Iupacna
IUPAC 1 letter nuc acid code.
@ e_Iupacaa
IUPAC 1 letter amino acid code.
@ eMol_not_set
> cdna = rna
@ eMol_na
just a nucleic acid
constexpr auto front(list< Head, As... >, T=T()) noexcept -> Head
string_type::value_type char_type
The character type used by the parser.
double value_type
The numeric datatype used by the parser.
Useful/utility classes and methods.
std::istream & in(std::istream &in_, double &x_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
#define FIELD_CHAIN_OF_2_IS_SET(Var, Fld1, Fld2)
FIELD_CHAIN_OF_2_IS_SET.
void AddBadIndexMap(const TBadIndexMap &additionalBadIndexMap)
void ConvertBadIndexesToString(CNcbiOstream &out, unsigned int maxRanges=1000) const
CConstRef< CSeq_id > m_SeqId
TBadIndexMap m_BadIndexMap
Holds information about a given gap-type string.
CSeq_gap::EType m_eType
The underlying type that the string corresponds to.
ELinkEvid m_eLinkEvid
Indicates what linkage-evidences are compatible with this.