57 #define NCBI_USE_ERRCODE_X Objtools_Rd_Align
65 auto lineNumber =
error.GetLineNum();
66 if (lineNumber == -1) {
68 "At ID '" <<
error.GetID() <<
"' "
69 "in category '" <<
static_cast<int>(
error.GetCategory()) <<
"': "
70 <<
error.GetMsg() <<
"'");
73 "At ID '" <<
error.GetID() <<
"' "
74 "in category '" <<
static_cast<int>(
error.GetCategory()) <<
"' "
75 "at line " <<
error.GetLineNum() <<
": "
76 <<
error.GetMsg() <<
"'");
131 const string& idString,
152 using TIds = list<CRef<CSeq_id>>;
172 m_fValidateIds(fValidateIds),
174 m_IS(is), m_ReadDone(
false), m_ReadSucceeded(
false),
188 if (!fSingleIdValidate) {
192 return [fSingleIdValidate](
const list<CRef<CSeq_id>>& ids,
195 for (
const auto& pId : ids) {
196 fSingleIdValidate(*pId, lineNum, errorReporter);
212 {EAlphabet::eAlpha_Default,
215 {EAlphabet::eAlpha_Nucleotide,
216 "ABCDGHKMNRSTUVWXYabcdghkmnrstuvwxy"},
218 {EAlphabet::eAlpha_Protein,
219 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz*"},
221 {EAlphabet::eAlpha_Dna,
222 "ABCDGHKMNRSTVWXYabcdghkmnrstvwxy"},
224 {EAlphabet::eAlpha_Rna,
225 "ABCDGHKMNRSTVWXYabcdghkmnrstvwxy"},
227 {EAlphabet::eAlpha_Dna_no_ambiguity,
230 {EAlphabet::eAlpha_Rna_no_ambiguity,
233 return alphaMap[alphaId];
274 const string& message,
294 TReadFlags readFlags,
295 ncbi::objects::ILineErrorListener* pErrorListener)
321 bool generate_local_ids,
322 ncbi::objects::ILineErrorListener* pErrorListener)
347 const auto& idString = seqIdInfo.
mData;
362 "Unable to parse sequence ID string.");
377 const auto num_sequences = alignmentInfo.
NumSequences();
379 if (num_sequences == 0) {
383 "No sequence data was detected in alignment file.");
387 if (num_sequences == 1) {
391 "Only one sequence was detected in the alignment file. An alignment file must contain more than one sequence.");
398 for (
auto seqIdInfo : alignmentInfo.
mIds) {
402 m_Ids.push_back(ids);
407 if (numDeflines ==
m_Ids.size()) {
409 for (
int i=0;
i< numDeflines; ++
i) {
418 "Expected %d deflines but finding %d. ",
422 "If deflines are used, each sequence must have a corresponding defline. "
423 "Note that deflines are optional.",
440 if (begin_len <
m_Seqs[row_i].length()) {
441 string::iterator s =
m_Seqs[row_i].end();
442 while (s !=
m_Seqs[row_i].begin()) {
444 if (
GetEndGap().find(*s) != string::npos) {
523 "CAlnReader::GetSeqAlign(): "
524 "Seq_align is not available until after Read()", 0);
550 aln_stop =
m_Seqs[row_i].size();
560 vector<bool> is_gap; is_gap.resize(
m_Dim,
true);
561 vector<bool> prev_is_gap; prev_is_gap.resize(
m_Dim,
true);
562 vector<TSignedSeqPos> next_start; next_start.resize(
m_Dim, 0);
564 TSeqPos prev_aln_pos = 0, prev_len = 0;
568 for (
TSeqPos aln_pos = 0; aln_pos < aln_stop; aln_pos++) {
570 if (aln_pos >=
m_Seqs[row_i].length()) {
571 if (!is_gap[row_i]) {
572 is_gap[row_i] =
true;
576 string residue =
m_Seqs[row_i].substr(aln_pos, 1);
578 if (!
x_IsGap(row_i, aln_pos, residue)) {
581 is_gap[row_i] =
false;
590 if ( !is_gap[row_i] ) {
591 is_gap[row_i] =
true;
601 lens.push_back(prev_len = aln_pos - prev_aln_pos);
603 if ( !prev_is_gap[row_i] ) {
604 next_start[row_i] += prev_len;
609 starts.resize(starts_i +
m_Dim);
612 starts[starts_i++] = -1;
614 starts[starts_i++] = next_start[row_i];;
616 prev_is_gap[row_i] = is_gap[row_i];
619 prev_aln_pos = aln_pos;
630 lens.push_back(aln_stop - prev_aln_pos);
632 _ASSERT(lens.size() == numseg);
636 m_Aln->Validate(
true);
644 const string& alphabet,
645 const string& seqData,
655 const string& alphabet,
656 const string& seqData,
662 string seqChars = seqData;
663 if (!missingChars.empty()) {
665 remove_if(seqChars.begin(), seqChars.end(),
666 [&](
char c) { return missingChars.find(c) != string::npos;}),
678 alphabet.size() >= 2*26) {
682 auto posFirstT = seqChars.find_first_of(
"Tt");
683 auto posFirstU = seqChars.find_first_of(
"Uu");
684 if (posFirstT != string::npos && posFirstU != string::npos) {
685 string msg =
"Invalid Mol Type: "
686 "U and T cannot appear in the same nucleotide sequence. "
687 "Reinterpreting as protein.";
705 const string& seqData)
const
709 pSeqInst->SetMol(mol);
710 pSeqInst->SetLength(seqData.size());
711 CSeq_data& data = pSeqInst->SetSeq_data();
729 "CAlnReader::GetSeqEntry(): "
730 "Seq_entry is not available until after Read()", 0);
741 seq_annot->
SetData().SetAlign().push_back(seq_align);
744 m_Entry->SetSet().SetAnnot().push_back(seq_annot);
746 auto& seq_set =
m_Entry->SetSet().SetSeq_set();
750 const string& seq_str =
m_SeqVec[row_i];
754 auto& ids = pSubEntry->SetSeq().SetId();
765 const string seqId = ids.front()->AsFastaString();
770 pSubEntry->SetSeq().SetInst(*pSeqInst);
771 seq_set.push_back(pSubEntry);
777 for (
auto& pSeqEntry : seq_set) {
782 for (
auto& pSeqEntry : seq_set) {
784 pSeqEntry->SetSeq());
798 for (
const auto&
mod : mods) {
811 auto defline = defline_info.
mData;
818 const auto idString = pFirstID->AsFastaString();
821 errorReporter(idString, defline_info.
mNumLine, pErrorListener);
838 const bool logInfo = pErrorListener ?
842 CModAdder::Apply(mod_handler, bioseq, skipped_mods, logInfo, errorReporter);
858 pDesc->SetTitle() = title;
859 bioseq.
SetDescr().Set().push_back(std::move(pDesc));
User-defined methods of the data storage class.
User-defined methods of the data storage class.
END_ENUM_INFO string ErrorPrintf(const char *format,...)
thread_local unique_ptr< CAlnErrorReporter > theErrorReporter
static void sReportError(ILineErrorListener *pEC, EDiagSev severity, int code, int subcode, const string &seqId, int lineNumber, const string &message, ILineError::EProblem problemType=ILineError::eProblem_GeneralParsingError)
static CAlnReader::FValidateIds s_GetMultiIdValidate(CAlnReader::FIdValidate fSingleIdValidate)
static void s_AppendMods(const CModHandler::TModList &mods, string &title)
string sAlnErrorToString(const CAlnError &error)
bool ReadAlignmentFile(istream &istr, bool gen_local_ids, bool use_nexus_info, CSequenceInfo &sequence_info, SAlignmentFile &alignmentInfo, ILineErrorListener *pErrorListener=nullptr)
static unsigned int line_num
void remove_if(Container &c, Predicate *__pred)
void Report(int lineNumber, EDiagSev severity, EReaderCode subsystem, EAlnSubcode errorCode, const string &descr, const string &seqId="")
CAlnError(int category, int line_num, string id, string message)
EAlnErr GetCategory() const
const string & GetMsg() const
const string & GetID() const
class CAlnReader supports importing a large variety of text-based alignment formats into standard dat...
vector< string > m_IdStrings
void ParseDefline(const string &defline, const SDeflineParseInfo &info, const TIgnoredProblems &ignoredErrors, list< CRef< objects::CSeq_id >> &ids, bool &hasRange, TSeqPos &rangeStart, TSeqPos &rangeEnd, TSeqTitles &seqTitles, objects::ILineErrorListener *pMessageListener)
void x_ParseAndValidateSeqIds(const TLineInfo &seqIdInfo, TReadFlags flags, TIdList &ids)
objects::CFastaDeflineReader::TIgnoredProblems TIgnoredProblems
objects::CSeq_inst::EMol GetSequenceMolType(const string &alphabet, const string &seqData, objects::ILineErrorListener *pErrorListener=nullptr)
Get a sequence's moltype, also considering the alphabet used to read it.
void x_CalculateMiddleSections()
virtual ~CAlnReader(void)
objects::CFastaDeflineReader::SDeflineParseInfo SDeflineParseInfo
void SetPaup(EAlphabet alpha)
vector< string > m_SeqVec
void Read(bool guess, bool generate_local_ids=false, objects::ILineErrorListener *pErrorListener=nullptr)
TAlignMiddles m_MiddleSections
function< void(const list< CRef< objects::CSeq_id > > &, int, objects::CAlnErrorReporter *)> FValidateIds
static string GetAlphabetLetters(EAlphabet)
vector< TSeqPos > m_SeqLen
objects::CSeq_inst::EMol x_GetSequenceMolType(const string &alphabet, const string &seqData, const string &seqId="", objects::ILineErrorListener *pErrorListener=nullptr)
const string & GetMiddleGap(void) const
int TReadFlags
binary OR of EReadFlags
objects::CDense_seg::TDim TNumrow
const string & GetAlphabet(void) const
pair< TSeqPos, TSeqPos > TAlignMiddleInterval
characters have different contexts, depending on whether they are before the first non-gap character,...
FValidateIds m_fValidateIds
objects::CFastaDeflineReader::TFastaFlags TFastaFlags
CRef< objects::CSeq_inst > x_GetSeqInst(objects::CSeq_inst::EMol mol, const string &seqData) const
void SetClustal(EAlphabet alpha)
void x_AddMods(const TLineInfo &defline_info, objects::CBioseq &bioseq, objects::ILineErrorListener *pErrorListener)
function< void(const objects::CSeq_id &, int, objects::CAlnErrorReporter *)> FIdValidate
ncbi::objects::CSequenceInfo mSequenceInfo
CRef< objects::CSeq_align > GetSeqAlign(TFastaFlags fasta_flags=0, objects::ILineErrorListener *pErrorListener=nullptr)
Create ASN.1 classes from the parsed alignment.
EAlignFormat m_AlignFormat
CNcbiIstream & m_IS
Other internal data.
CRef< objects::CSeq_entry > m_Entry
const string & GetEndGap(void) const
bool x_IsGap(TNumrow row, TSeqPos pos, const string &residue)
const string & GetMissing(void) const
objects::CFastaDeflineReader::TSeqTitles TSeqTitles
void SetPhylip(EAlphabet alpha)
void SetAlphabet(const string &value)
virtual CRef< objects::CSeq_id > GenerateID(const string &fasta_defline, const TSeqPos &line_number, TFastaFlags fasta_flags)
CRef< objects::CSeq_align > m_Aln
void x_VerifyAlignmentInfo(const ncbi::objects::SAlignmentFile &, TReadFlags readFlags)
void x_AssignDensegIds(TFastaFlags fasta_flags, objects::CDense_seg &denseg)
CRef< objects::CSeq_entry > GetSeqEntry(TFastaFlags fasta_flags=objects::CFastaReader::fAddMods, objects::ILineErrorListener *pErrorListener=nullptr)
void SetAllGap(const string &value)
Convenience function for setting beginning, middle, and end gap to the same thing.
void x_AddTitle(const string &defline, objects::CBioseq &bioseq)
list< CRef< objects::CSeq_id > > TIdList
Parsed result data (analogous to SAlignmentFile) Seqs are upper-case strings representing the sequenc...
vector< TLineInfo > m_DeflineInfo
void SetFastaGap(EAlphabet alpha)
Alternative & easy way to choose alphabet, etc.
const string & GetBeginningGap(void) const
CAlnReader(CNcbiIstream &is, FValidateIds fIdValidate=nullptr)
const CSeq_id * GetFirstId() const
CAlnErrorReporter * m_pErrorReporter
CDefaultIdErrorReporter(CAlnErrorReporter *pErrorReporter)
void operator()(EDiagSev severity, int lineNum, const string &idString, CFastaIdValidate::EErrCode, const string &msg)
void operator()(const TIds &ids, int lineNum, CAlnErrorReporter *pErrorReporter)
list< CRef< CSeq_id > > TIds
CFastaIdValidate m_FastaIdValidate
static void ParseDefline(const CTempString &defline, const SDeflineParseInfo &info, const TIgnoredProblems &ignoredErrors, TIds &ids, bool &hasRange, TSeqPos &rangeStart, TSeqPos &rangeEnd, TSeqTitles &seqTitles, ILineErrorListener *pMessageListener)
static CLineErrorEx * Create(EProblem eProblem, EDiagSev eSeverity, int code, int subcode, const std::string &strSeqId, unsigned int uLine, const std::string &strErrorMessage=string(""), const std::string &strFeatureName=string(""), const std::string &strQualifierName=string(""), const std::string &strQualifierValue=string(""), const TVecOfLines &vecOfOtherLines=TVecOfLines())
Use this because the constructor is protected.
static void Apply(const CModHandler &mod_handler, CBioseq &bioseq, TSkippedMods &skipped_mods, FPostMessage fPostMessage=nullptr)
list< CModData > TModList
void AddMods(const TModList &mods, EHandleExisting handle_existing, TModList &rejected_mods, FReportError fReportError=nullptr)
static TSeqPos Pack(CSeq_data *in_seq, TSeqPos uLength=ncbi::numeric_limits< TSeqPos >::max())
static void Apply(const CTempString &title, TModList &mods, string &remainder)
virtual bool PutError(const ILineError &)=0
Store error in the container, and return true if error was stored fine, and return false if the calle...
@ eProblem_GeneralParsingError
vector< string > mSequences
size_t NumDeflines() const
size_t NumSequences() const
vector< TLineInfo > mDeflines
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Operators to edit gaps in sequences.
unsigned int TSeqPos
Type for sequence locations and lengths.
EDiagSev
Severity level for the posted diagnostics.
@ eDiag_Info
Informational message.
@ eDiag_Error
Error message.
#define NCBI_THROW2(exception_class, err_code, message, extra)
Throw exception with extra parameter.
#define FORMAT(message)
Format message using iostreams library.
@ fAddMods
Parse defline mods and add to SeqEntry.
static SIZE_TYPE ParseIDs(CBioseq::TId &ids, const CTempString &s, TParseFlags flags=fParse_Default)
Parse a string representing one or more Seq-ids, appending the results to IDS.
EAccessionInfo
For IdentifyAccession (below)
static int BestRank(const CRef< CSeq_id > &id)
@ fParse_RawText
Try to ID raw non-numeric accessions.
@ fParse_AnyLocal
Treat otherwise unidentified strings as local accessions as long as they don't resemble FASTA-style I...
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
static string & ToUpper(string &str)
Convert string to upper case – string& version.
static string TruncateSpaces(const string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string.
C::value_type FindBestChoice(const C &container, F score_func)
Find the best choice (lowest score) for values in a container.
TLens & SetLens(void)
Assign a value to Lens data member.
vector< TSignedSeqPos > TStarts
void SetDim(TDim value)
Assign a value to Dim data member.
vector< CRef< CSeq_id > > TIds
TStarts & SetStarts(void)
Assign a value to Starts data member.
void SetNumseg(TNumseg value)
Assign a value to Numseg data member.
TIds & SetIds(void)
Assign a value to Ids data member.
@ eClass_pop_set
population study
void SetData(TData &value)
Assign a value to Data data member.
TIupacna & SetIupacna(void)
Select the variant.
EMol
molecule class in living organism
void SetDescr(TDescr &value)
Assign a value to Descr data member.
TIupacaa & SetIupacaa(void)
Select the variant.
@ eRepr_raw
continuous sequence
@ eMol_not_set
> cdna = rna
@ eMol_na
just a nucleic acid
constexpr bool empty(list< Ts... >) noexcept
const struct ncbi::grid::netcache::search::fields::SIZE size
@ eAlnSubcode_BadSequenceCount
@ eAlnSubcode_IllegalSequenceId
@ eAlnSubcode_InconsistentMolType
@ eAlnSubcode_InsufficientDeflineInfo