92 #include <unordered_set>
98 #define NCBI_USE_ERRCODE_X Objtools_Rd_Feature
108 static const char *
const kCdsFeatName =
"CDS";
142 Uchar best_weight = UCHAR_MAX;
145 Uchar new_weight = std_order[(*it)->Which()];
146 if (new_weight < best_weight)
149 best_weight = new_weight;
160 {
'A', list<char>({
'A'})},
161 {
'G', list<char>({
'G'})},
162 {
'C', list<char>({
'C'})},
163 {
'T', list<char>({
'T'})},
164 {
'U', list<char>({
'U'})},
165 {
'M', list<char>({
'A',
'C'})},
166 {
'R', list<char>({
'A',
'G'})},
167 {
'W', list<char>({
'A',
'T'})},
168 {
'S', list<char>({
'C',
'G'})},
169 {
'Y', list<char>({
'C',
'T'})},
170 {
'K', list<char>({
'G',
'T'})},
171 {
'V', list<char>({
'A',
'C',
'G'})},
172 {
'H', list<char>({
'A',
'C',
'T'})},
173 {
'D', list<char>({
'A',
'G',
'T'})},
174 {
'B', list<char>({
'C',
'G',
'T'})},
175 {
'N', list<char>({
'A',
'C',
'G',
'T'})}
314 const string &seq_id,
319 const string& feat_name,
323 const string &seq_id );
331 const unsigned int line_number,
380 const string &feat_name,
381 const string& qual,
const string&
val,
385 const string& qual_val,
386 const string& feat_name,
397 EQual qtype,
const string& qual,
const string&
val);
399 const string &feat_name,
409 const string& feat_name,
414 const string& qual,
const string&
val);
780 {
"Asp or Asn",
'B' },
781 {
"Asparagine",
'N' },
782 {
"Aspartate",
'D' },
783 {
"Aspartic Acid",
'D' },
789 {
"Glu or Gln",
'Z' },
790 {
"Glutamate",
'E' },
791 {
"Glutamic Acid",
'E' },
792 {
"Glutamine",
'Q' },
797 {
"Histidine",
'H' },
800 {
"Isoleucine",
'I' },
802 {
"Leu or Ile",
'J' },
807 {
"Methionine",
'M' },
810 {
"Phenylalanine",
'F' },
814 {
"Pyrrolysine",
'O' },
816 {
"Selenocysteine",
'U' },
821 {
"Termination",
'*' },
823 {
"Threonine",
'T' },
825 {
"Tryptophan",
'W' },
841 "environmental_sample",
847 "ribosomal_slippage",
855 : m_reader(reader), m_LineNumber(
line_num), m_pMessageListener(pMessageListener)
904 out_offset = new_offset;
922 bool isminus =
false;
923 bool ispoint =
false;
925 bool partial5 =
false;
926 bool partial3 =
false;
930 string start, stop, feat, qual,
val, stnd;
934 if (line.
empty ())
return false;
941 numtkns = tkns.size ();
958 if(
val.length() >= 2 &&
val[0] ==
'"' &&
val[
val.length()-1] ==
'"' ) {
966 bool has_start =
false;
967 if (! start.empty ()) {
968 if (start [0] ==
'<') {
972 len = start.length ();
973 if (
len > 1 && start [
len - 1] ==
'^') {
975 start [
len - 1] =
'\0';
982 bool has_stop =
false;
983 if (! stop.empty ()) {
984 if (stop [0] ==
'>') {
993 if ( startv <= 0 || stopv <= 0 ) {
999 if (! stnd.empty ()) {
1000 if (stnd ==
"minus" || stnd ==
"-" || stnd ==
"complement") {
1018 if ((has_start && startv < 0) || (has_stop && stopv < 0)) {
1025 loc_info.
start_pos = ( startv < 0 ? -1 : startv);
1026 loc_info.
stop_pos = ( stopv < 0 ? -1 : stopv);
1175 vector<string> &out_tokens )
1180 string::size_type startPosOfNextRoundOfTokenization = 0;
1181 while ( startPosOfNextRoundOfTokenization < line.
size() ) {
1182 auto posAfterSpaces = line.
find_first_not_of(
" ", startPosOfNextRoundOfTokenization );
1183 if( posAfterSpaces == string::npos ) {
1187 string::size_type posOfTab = line.
find(
'\t', posAfterSpaces );
1188 if( posOfTab == string::npos ) {
1189 posOfTab = line.
length();
1194 string &new_token = out_tokens.back();
1195 copy( line.
begin() + posAfterSpaces, line.
begin() + posOfTab, back_inserter(new_token) );
1198 startPosOfNextRoundOfTokenization = ( posOfTab + 1 );
1215 vector<string> &out_tokens )
1219 if( line.
empty() ) {
1229 if( start_of_qual == line.
end() ) {
1232 auto start_of_whitespace_after_qual = find_if( start_of_qual, line.
end(),
CIsSpace() );
1233 auto start_of_val = find_if( start_of_whitespace_after_qual, line.
end(),
CIsNotSpace() );
1242 string &qual = out_tokens.back();
1243 copy( start_of_qual, start_of_whitespace_after_qual, back_inserter(qual) );
1246 if( start_of_val != line.
end() ) {
1248 string &
val = out_tokens.back();
1249 copy( start_of_val, line.
end(), back_inserter(
val) );
1257 auto first_column_start = line.
begin();
1258 auto first_whitespace = find_if( first_column_start, line.
end(),
CIsSpace() );
1259 auto second_column_start = find_if( first_whitespace, line.
end(),
CIsNotSpace() );
1260 auto second_whitespace = find_if( second_column_start, line.
end(),
CIsSpace() );
1261 auto third_column_start = find_if( second_whitespace, line.
end(),
CIsNotSpace() );
1262 auto third_whitespace = find_if( third_column_start, line.
end(),
CIsSpace() );
1264 auto sixth_column_start = find_if( third_whitespace, line.
end(),
CIsNotSpace() );
1265 auto sixth_whitespace = find_if( sixth_column_start, line.
end(),
CIsSpace() );
1268 string &
first = out_tokens.back();
1269 copy( first_column_start, first_whitespace, back_inserter(
first) );
1272 string &second = out_tokens.back();
1273 copy( second_column_start, second_whitespace, back_inserter(second) );
1276 string &third = out_tokens.back();
1277 copy( third_column_start, third_whitespace, back_inserter(third) );
1279 if( sixth_column_start != line.
end() ) {
1285 string &sixth = out_tokens.back();
1286 copy( sixth_column_start, sixth_whitespace, back_inserter(sixth) );
1313 syn.push_back (
val);
1335 EQual qtype,
const string& val
1373 fun.push_back (
val);
1380 prod.push_back (
val);
1416 kCdsFeatName,
"transl_table",
val);
1435 if (ch >
' ' && ch !=
'"' && ch !=
'\'')
return false;
1446 const static char* kOrder =
"ORDER";
1449 string::size_type pos = 0;
1455 if( pos >= line.
length() ) {
1478 for( ; loc_iter; ++loc_iter ) {
1479 if( ! mix_pieces.empty() ) {
1480 mix_pieces.push_back( loc_piece_null );
1484 mix_pieces.push_back( new_piece );
1488 if( mix_pieces.size() > 1 ) {
1521 string normalized_string =
str;
1522 normalized_string.erase(
1524 end(normalized_string),
1525 [](
char c) {
return isspace(c);}),
1526 end(normalized_string));
1531 if (pos_end != string::npos) {
1532 string pos_str = normalized_string.substr (5, pos_end - 5);
1534 if (aa_start != string::npos) {
1536 if (seq_start != string::npos &&
1537 seq_start < aa_start+3) {
1541 size_t aa_length = (seq_start ==
NPOS) ?
1542 pos_str.size() - (aa_start+3) :
1543 seq_start - (aa_start+3);
1545 string abbrev = pos_str.substr (aa_start + 3, aa_length);
1553 aa->SetNcbieaa (t_iter->second);
1554 ext_trna.
SetAa(*aa);
1555 pos_str = pos_str.substr (0, aa_start);
1558 pos_str = pos_str.substr (0, pos_str.length() - 1);
1595 for( ; pos <
str.length(); ++pos ) {
1596 switch(
str[pos] ) {
1631 if( ! strToConvert.
empty() &&
isdigit(strToConvert[0]) ) {
1644 strFeatureName, strQualifierName, strToConvert );
1658 strFeatureName, strQualifierName, strToConvert );
1715 rrp.
SetExt().SetGen().SetQuals().Set().push_back(q);
1735 const auto aaval_it =
sm_TrnaKeys.find(aa_string.c_str());
1741 taa.SetNcbieaa(aaval_it->second);
1742 if (aa_string ==
"fMet" ||
1743 aa_string ==
"iMet" ||
1744 aa_string ==
"Ile2") {
1751 "tRNA",
"product",
val);
1763 "tRNA",
"anticodon",
val );
1797 if (
val.size() != 3) {
1803 for (
char char1 : s_IUPACmap.at(
val[0])) {
1804 for (
char char2 : s_IUPACmap.at(
val[1])) {
1805 for (
char char3 : s_IUPACmap.at(
val[2])) {
1807 codons.insert(codon_index);
1813 trna_ext.
SetAa().SetNcbieaa();
1814 for (
const auto codon_index :
codons) {
1815 trna_ext.
SetCodon().push_back(codon_index);
1835 const char*
str =
nullptr;
1845 if (
val !=
"other") {
1847 const vector<string>& allowed_values =
1849 if (find(allowed_values.cbegin(), allowed_values.cend(),
val)
1850 == allowed_values.cend()) {
1887 str =
"dbSnpSynonymyData";
1900 str =
"stsUserObject";
1916 str =
"cloneUserObject";
1943 const string &feat_name,
1960 if (g_iter != sm_GenomeKeys.end ()) {
1966 feat_name,
"organelle",
val );
2019 slist.push_back (ssp);
2038 mlist.push_back (omp);
2050 if (qual.empty ())
return false;
2061 if( ! potential_normalized_qual.
empty() ) {
2062 normalized_qual = potential_normalized_qual;
2066 auto& qlist = sfp->
SetQual ();
2068 gbq->
SetQual() = normalized_qual;
2074 qlist.push_back (gbq);
2087 typedef pair<TChoiceCI, TChoiceCI> TChoiceEqualRange;
2088 TChoiceEqualRange cds_equal_range =
2090 if( cds_equal_range.first == cds_equal_range.second )
2098 TStringToGeneAndLineMap locusToGeneAndLineMap;
2099 TStringToGeneAndLineMap locusTagToGeneAndLineMap;
2100 const TChoiceEqualRange gene_equal_range =
2102 for( TChoiceCI gene_choice_ci = gene_equal_range.first;
2103 gene_choice_ci != gene_equal_range.second;
2109 locusToGeneAndLineMap.insert(
2111 gene_ref.
GetLocus(), gene_feat_ref_and_line));
2114 locusTagToGeneAndLineMap.insert(
2122 for( TChoiceCI cds_choice_ci = cds_equal_range.first;
2123 cds_choice_ci != cds_equal_range.second ; ++cds_choice_ci)
2126 const TSeqPos cds_line_num = cds_choice_ci->second.m_uLineNum;
2131 if( ! pGeneXrefOnCDS ) {
2142 const string locus =
2147 const string locus_tag =
2156 typedef TStringToGeneAndLineMap::iterator TStrToGeneCI;
2157 typedef pair<TStrToGeneCI, TStrToGeneCI> TStrToGeneEqualRange;
2161 TStrToGeneEqualRange locus_equal_range =
2163 for( TStrToGeneCI locus_gene_ci = locus_equal_range.first;
2164 locus_gene_ci != locus_equal_range.second;
2168 auto gene_feat = locus_gene_ci->second.m_pFeat;
2169 if (gene_feat->GetData().GetGene().IsSetLocus_tag() &&
2170 gene_feat->GetData().GetGene().GetLocus_tag() != locus_tag) {
2174 locusGeneMatches.
insert(locus_gene_ci->second);
2180 TStrToGeneEqualRange locus_tag_equal_range =
2182 for( TStrToGeneCI locus_tag_gene_ci = locus_tag_equal_range.first;
2183 locus_tag_gene_ci != locus_tag_equal_range.second;
2184 ++locus_tag_gene_ci )
2187 auto gene_feat = locus_tag_gene_ci->second.m_pFeat;
2188 if (gene_feat->GetData().GetGene().IsSetLocus() &&
2189 gene_feat->GetData().GetGene().GetLocus() != locus) {
2193 locusTagGeneMatches.
insert(locus_tag_gene_ci->second);
2197 if( locusGeneMatches.
empty() ) {
2199 matchingGenes.
swap(locusTagGeneMatches);
2200 }
else if( locusTagGeneMatches.
empty() ) {
2202 matchingGenes.
swap(locusGeneMatches);
2206 locusGeneMatches.
begin(), locusGeneMatches.
end(),
2207 locusTagGeneMatches.
begin(), locusTagGeneMatches.
end(),
2208 inserter(matchingGenes, matchingGenes.
begin()));
2216 const CSeq_loc & gene_loc = gene_feat_and_line_ci->m_pFeat->GetLocation();
2217 const TSeqPos gene_line_num = gene_feat_and_line_ci->m_uLineNum;
2225 if( pCdsMinusGeneLoc &&
2226 ! pCdsMinusGeneLoc->
IsNull() &&
2227 ! pCdsMinusGeneLoc->
IsEmpty() )
2230 if( gene_line_num > 0 ) {
2231 gene_lines.push_back(gene_line_num);
2246 matchingGenes.
empty() )
2250 pNewGene->
SetData().SetGene().Assign( *pGeneXrefOnCDS );
2257 the_ftable.push_back(pNewGene);
2265 locusToGeneAndLineMap.insert(
2267 pGeneXrefOnCDS->
GetLocus(), gene_feat_and_line));
2270 locusTagToGeneAndLineMap.insert(
2290 string lqual = qual;
2324 const string& feat_name,
2326 const string& val) {
2332 if (qual !=
"note") {
2333 string error_message =
2334 qual +
" is not a valid qualifier for this feature. Converting to note.";
2337 feat_name, qual,
kEmptyStr, error_message);
2344 const string &feat_name,
2367 if (o_iter != sm_OrgRefKeys.end ()) {
2368 EOrgRef rtype = o_iter->second;
2373 if (s_iter != sm_SubSrcKeys.end ()) {
2381 if (m_iter != sm_OrgModKeys.end ()) {
2395 if (q_iter != sm_QualKeys.end ()) {
2396 EQual qtype = q_iter->second;
2438 sfdata.
SetPub().SetPub().Set().push_back( new_pub );
2448 sfdata.
SetProt().SetActivity().push_back(
val );
2481 if (
val ==
"experimental") {
2483 }
else if (
val ==
"not_experimental" ||
val ==
"non_experimental" ||
2484 val ==
"not-experimental" ||
val ==
"non-experimental") {
2492 string prefix, remainder;
2500 feat_name, qual,
val);
2506 string val_copy =
val;
2581 syn.push_back (
val);
2605 static const char* digits =
"0123456789";
2611 dblist.push_back (dbt);
2634 feat_name, qual,
val,
2655 feat_name, qual,
val,
2656 "Invalid transcript_id : " +
val);
2660 for (
const auto&
id : ids) {
2661 auto id_string =
id->GetSeqIdString(
true);
2663 if (res.second ==
false) {
2666 feat_name, qual,
val,
2667 "Transcript ID " + id_string +
" appears on multiple mRNA features"
2694 feat_name, qual,
val,
2695 "Invalid protein_id : " +
val);
2700 for (
const auto&
id : ids) {
2701 auto id_string =
id->GetSeqIdString(
true);
2703 if (res.second ==
false) {
2706 feat_name, qual,
val,
2707 "Protein ID " + id_string +
" appears on multiple CDS features"
2730 feat_name, qual,
val );
2746 if( line.
length() < 6 ) {
2750 if( line[0] ==
'=' ) {
2752 "===================================================================";
2756 }
else if( line[0] ==
' ') {
2803 const Int4 orig_start = start;
2817 if (loc_info.
is_point || start == stop ) {
2819 if (mix_set.empty())
2834 if( stop != (start+1) ) {
2869 if( ! mix_set.empty() ) {
2870 const CSeq_loc & last_loc = *mix_set.back();
2880 mix_set.push_back(loc);
2900 if (feat.empty ())
return false;
2942 rrp.
SetExt().SetGen().SetClass(
"snRNA");
2946 rrp.
SetExt().SetGen().SetClass(
"scRNA");
2950 rrp.
SetExt().SetGen().SetClass(
"snoRNA");
2961 rrp.
SetExt().SetName(
"misc_RNA");
3024 imp.
SetKey (
"misc_feature");
3045 const string& strFeatureName,
3046 const string& strQualifierName,
3047 const string& strQualifierValue,
3048 const string& strErrorMessage,
3066 const string & strFeatureName,
3067 const string & strQualifierName,
3068 const string & strQualifierValue,
3069 const string& strErrorMessage,
3080 strQualifierName, strQualifierValue));
3082 pErr->AddOtherLine(*line_it);
3093 const unsigned int line_number,
3113 curr_feat_intervals_done =
false;
3121 if (
last.IsInt() &&
last.GetInt().IsSetStrand())
3123 strand =
last.GetInt().GetStrand();
3126 if (
last.IsPnt() &&
last.GetPnt().IsSetStrand())
3128 strand =
last.GetPnt().GetStrand();
3138 for (
auto pSeqLoc : feat.
SetLocation().SetMix().Set()) {
3139 if (pSeqLoc->IsPnt()) {
3140 auto& seq_point = pSeqLoc->SetPnt();
3141 const auto old_strand =
3142 seq_point.IsSetStrand() ?
3143 seq_point.GetStrand() :
3146 seq_point.SetStrand(strand);
3147 if (old_strand != strand) {
3172 (!feat->
SetData().SetPub().IsSetPub() ||
3173 feat->
SetData().SetPub().GetPub().Get().empty())) {
3178 string msg =
"Reference feature is empty. Skipping feature.";
3209 const string& qual_val,
3210 const string& feat_name,
3259 string feat, qual, qual_value;
3260 string curr_feat_name;
3264 bool ignore_until_next_feature_key =
false;
3271 const bool bIgnoreWebComments =
3295 bool curr_feat_intervals_done =
false;
3297 if (! in_annotname.
empty ()) {
3300 annot->
SetName (in_annotname);
3301 descr.
Set().push_back (annot);
3327 if (line [0] ==
'[') {
3344 if( loc_with_nulls ) {
3351 replace( qual_value.begin(), qual_value.end(),
'\"',
'\'' );
3374 best_CDS_finder.
AddFeat( *sfp );
3380 ignore_until_next_feature_key =
false;
3382 curr_feat_name = feat;
3388 ignore_until_next_feature_key =
true;
3391 }
else if (ignore_until_next_feature_key) {
3402 qual_value.empty ()) {
3404 if( curr_feat_intervals_done ) {
3408 ignore_until_next_feature_key =
true;
3422 curr_feat_intervals_done =
true;
3425 else if (!feat.empty()) {
3431 curr_feat_intervals_done =
true;
3459 const string &seq_id,
3490 if (id->IsGenbank())
3505 const string& feat_name,
3509 const string &seq_id1 )
3518 if (!
val.empty ()) {
3545 out_annotname.
clear();
3599 ReadSeqAnnot( lr, pMessageListener ).ReleaseOrNull() );
3614 const string& seqid,
3615 const string& annotname,
3627 const string& seqid,
3628 const string& annotname,
3636 return impl.ReadSequinFeatureTable(seqid, annotname,
flags, filter);
3665 const string& seqid_prefix)
3675 while (orig_seqid.
empty () && !pLineReader->AtEOF() ) {
3679 static_cast<unsigned>(pLineReader->GetLineNumber()),
3685 if (seqid_prefix.empty()) {
3688 if (orig_seqid.
find(
'|') == string::npos)
3689 temp_seqid = seqid_prefix + orig_seqid;
3693 temp_seqid = seqid_prefix + orig_seqid.
substr(4);
3695 orig_seqid = temp_seqid;
3706 const string& seqid_prefix
3717 const string& seqid_prefix
3738 const list<string>& stringFlags,
3758 struct SCSeqidCompare
3763 return *left < *right;
3780 seq_map[seq_id->GetPointer()].Reset(&*seqit);
3785 while ( !reader.
AtEOF() ) {
3788 if (entry.
IsSeq()) {
3792 _ASSERT(annot->GetData().IsFtable());
3793 if (annot->GetData().GetFtable().empty()) {
3801 seq = seq_map[feat_id].GetPointer();
3806 <<
"ReadSequinFeatureTables: unable to find match for "
3819 unsigned int line_number,
3825 return impl.CreateSeqFeat (feat,
location,
flags, (seq_id ? *seq_id :
string() ), filter);
3831 const string& feat_name,
3837 const string &seq_id
3842 impl.AddFeatQual (sfp, feat_name, qual,
val,
flags, seq_id) ;
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
@ eExtreme_Biological
5' and 3'
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
static unsigned int line_num
void remove_if(Container &c, Predicate *__pred)
bool AddFeat(const CSeq_feat &new_cds)
bool x_AddQualifierToFeature(CRef< CSeq_feat > sfp, const string &feat_name, const string &qual, const string &val, const TFlags flags)
static void PutProgress(const CTempString &seq_id, const unsigned int line_number, ILineErrorListener *pListener)
CSeq_annot::C_Data::TFtable TFtable
bool x_TryToParseOffset(const CTempString &sLine, Int4 &out_offset)
void AddFeatQual(CRef< CSeq_feat > sfp, const string &feat_name, const string &qual, const string &val, const TFlags flags, const string &seq_id)
@ eQual_mobile_element_type
@ eQual_ribosomal_slippage
@ eQual_secondary_accession
bool x_AddQualifierToImp(CRef< CSeq_feat > sfp, CSeqFeatData &sfdata, EQual qtype, const string &qual, const string &val)
static bool ParseInitialFeatureLine(const CTempString &line_arg, CTempStringEx &out_seqid, CTempStringEx &out_annotname)
bool x_AddNoteToFeature(CRef< CSeq_feat > sfp, const string &note)
CConstRef< CSeq_feat > TFeatConstRef
void x_TokenizeStrict(const CTempString &line, vector< string > &out_tokens)
bool x_AddQualifierToGene(CSeqFeatData &sfdata, EQual qtype, const string &val)
ILineReader *const GetLineReaderPtr(void)
string x_TrnaToAaString(const string &val)
bool x_AddIntervalToFeature(CTempString strFeatureName, CRef< CSeq_feat > &sfp, const SFeatLocInfo &loc_info)
unsigned int m_LineNumber
void x_ProcessQualifier(const string &qual_name, const string &qual_val, const string &feat_name, CRef< CSeq_feat > feat, TFlags flags)
void x_InitId(const CTempString &seq_id, const TFlags flags)
multimap< CSeqFeatData::E_Choice, SFeatAndLineNum > TChoiceToFeatMap
bool x_AddQualifierToRna(CRef< CSeq_feat > sfp, EQual qtype, const string &val)
void x_ResetFeat(CRef< CSeq_feat > &feat, bool &curr_feat_intervals_done)
CRef< CSeq_annot > ReadSequinFeatureTable(const CTempString &seqid, const CTempString &annotname, const TFlags flags, ITableFilter *filter)
bool x_ParseTrnaExtString(CTrna_ext &ext_trna, const string &str)
void x_CreateGenesFromCDSs(CRef< CSeq_annot > sap, TChoiceToFeatMap &choiceToFeatMap, const TFlags flags)
long x_StringToLongNoThrow(CTempString strToConvert, CTempString strFeatureName, CTempString strQualifierName, ILineError::EProblem eProblem=ILineError::eProblem_Unset)
ILineErrorListener * m_pMessageListener
void x_FinishFeature(CRef< CSeq_feat > &feat, TFtable &ftable)
CFeatureTableReader_Imp(ILineReader *reader, unsigned int line_num, ILineErrorListener *pMessageListener)
bool x_AddGBQualToFeature(CRef< CSeq_feat > sfp, const string &qual, const string &val)
bool x_IsWebComment(CTempString line)
CFeature_table_reader::TFlags TFlags
ILineErrorListener *const GetErrorListenerPtr(void)
bool x_StringIsJustQuotes(const string &str)
void x_GetPointStrand(const CSeq_feat &feat, CSeq_interval::TStrand &strand) const
CRef< CSeq_feat > CreateSeqFeat(const string &feat, CSeq_loc &location, const TFlags flags, const string &seq_id, ITableFilter *filter)
SIZE_TYPE x_MatchingParenPos(const string &str, SIZE_TYPE open_paren_pos)
bool x_AddQualifierToCdregion(CRef< CSeq_feat > sfp, CSeqFeatData &sfdata, EQual qtype, const string &val)
void x_TokenizeLenient(const CTempString &line, vector< string > &out_tokens)
void x_UpdatePointStrand(CSeq_feat &feat, CSeq_interval::TStrand strand) const
unordered_set< string > m_ProcessedProteinIds
bool x_ParseFeatureTableLine(const CTempString &line, SFeatLocInfo &loc_info, string &feat, string &qual, string &val, Int4 offset)
bool x_AddQualifierToBioSrc(CSeqFeatData &sfdata, const string &feat_name, EOrgRef rtype, const string &val)
CFeatureTableReader_Imp & operator=(const CFeatureTableReader_Imp &value)
~CFeatureTableReader_Imp(void)
bool x_AddCodons(const string &val, CTrna_ext &trna_ext) const
bool x_SetupSeqFeat(CRef< CSeq_feat > sfp, const string &feat, const TFlags flags, ITableFilter *filter)
unordered_set< string > m_ProcessedTranscriptIds
void x_ProcessMsg(int line_num, ILineError::EProblem eProblem, EDiagSev eSeverity, const std::string &strFeatureName=kEmptyStr, const std::string &strQualifierName=kEmptyStr, const std::string &strQualifierValue=kEmptyStr, const std::string &strErrorMessage=kEmptyStr, const ILineError::TVecOfLines &vecOfOtherLines=ILineError::TVecOfLines())
void x_ProcessMsg(ILineError::EProblem eProblem, EDiagSev eSeverity, const std::string &strFeatureName=kEmptyStr, const std::string &strQualifierName=kEmptyStr, const std::string &strQualifierValue=kEmptyStr, const std::string &strErrorMessage=kEmptyStr, const ILineError::TVecOfLines &vecOfOtherLines=ILineError::TVecOfLines())
CFeatureTableReader_Imp(const CFeatureTableReader_Imp &value)
CFeature_table_reader(TReaderFlags fReaderFlags=0)
CRef< CSerialObject > ReadObject(ILineReader &lr, ILineErrorListener *pErrors) override
Read an object from a given line reader, render it as the most appropriate Genbank object.
long TFlags
binary OR of EFlags
static bool ParseInitialFeatureLine(const CTempString &line_arg, CTempStringEx &out_seqid, CTempStringEx &out_annotname)
If line_arg is a feature line (e.g.
static void ReadSequinFeatureTables(ILineReader &reader, CSeq_entry &entry, const TFlags flags=0, ILineErrorListener *pMessageListener=nullptr, ITableFilter *filter=nullptr)
@ fSuppressBadKeyWarning
= 0x400 (Suppress 'bad key' errors; Not recommended.)
@ fReportDiscouragedKey
= 0x40 (Report discouraged keys into the error container)
@ fKeepBadKey
= 0x02 (As much as possible, try to use bad keys as if they were acceptable)
@ fIgnoreWebComments
= 0x08 (ignore web comment lines such as lines that start with " INFO:", or consist of many equals si...
@ fAllIdsAsLocal
= 0x100 (Do not attempt to parse accessions)
@ fLeaveProteinIds
= 0x80 (Leave all protein_id as a qualifiers)
@ fCreateGenesFromCDSs
= 0x10 (If a CDS has a gene xref, create a gene with the same intervals if one doesn't already exist....
@ fPreferGenbankId
= 0x200 (Prefer Genbank accession ids)
@ fTranslateBadKey
= 0x04 (yields misc_feature /standard_name="...")
@ fCDSsMustBeInTheirGenes
= 0x20 (If a CDS has a gene xref, it *must* be inside of that gene)
CRef< CSeq_annot > ReadSeqAnnot(ILineReader &lr, ILineErrorListener *pErrors) override
Read an object from a given line reader, render it as a single Seq-annot, if possible.
unique_ptr< CFeatureTableReader_Imp > m_pImpl
static CRef< CSeq_annot > x_ReadFeatureTable(CFeatureTableReader_Imp &reader, const CTempString &seqid, const CTempString &annot_name, const TFlags flags, ITableFilter *filter)
static CRef< CSeq_feat > CreateSeqFeat(const string &feat, CSeq_loc &location, const TFlags flags=0, ILineErrorListener *pMessageListener=nullptr, unsigned int line=0, std::string *seq_id=nullptr, ITableFilter *filter=nullptr)
static void AddFeatQual(CRef< CSeq_feat > sfp, const string &feat_name, const string &qual, const string &val, const TFlags flags=0, ILineErrorListener *pMessageListener=nullptr, int line=0, const string &seq_id=std::string())
static void AddStringFlags(const list< string > &stringFlags, TFlags &baseFlags)
CRef< CSeq_annot > ReadSequinFeatureTable(const TFlags flags=0, ITableFilter *filter=nullptr, const string &seqid_prefix=kEmptyStr)
@Gb_qual.hpp User-defined methods of the data storage class.
static const CTrans_table & GetTransTable(int id)
static int CodonToIndex(char base1, char base2, char base3)
*** Import *********************************************** * * Features imported from other databases...
static void GetPrefixAndRemainder(const string &inference, string &prefix, string &remainder)
static CObjReaderLineException * Create(EDiagSev eSeverity, unsigned int uLine, const std::string &strMessage, EProblem eProblem=eProblem_GeneralParsingError, const std::string &strSeqId=string(""), const std::string &strFeatureName=string(""), const std::string &strQualifierName=string(""), const std::string &strQualifierValue=string(""), CObjReaderLineException::EErrCode eErrCode=eFormat, const TVecOfLines &vecOfOtherLines=TVecOfLines())
Please use this instead of the constructor because the ctor is protected.
@OrgMod.hpp User-defined methods of the data storage class.
@RNA_ref.hpp User-defined methods of the data storage class.
static void AddGeneOntologyTerm(CSeq_feat &feature, const CTempString &qual, const CTempString &val)
Defines and provides stubs for a general interface to a variety of file readers.
static void xAddStringFlagsWithMap(const list< string > &stringFlags, const map< string, TReaderFlags > flagMap, TReaderFlags &baseFlags)
void SetRegion(const TRegion &v)
void SetBiosrc(TBiosrc &v)
static bool IsDiscouragedQual(EQualifier qual)
EQualifier
List of available qualifiers for feature keys.
void SetBond(const TBond &v)
static bool CanHaveGene(ESubtype subtype)
void SetSite(const TSite &v)
static const CSiteList * GetSiteList()
ESubtype GetSubtype(void) const
static bool IsDiscouragedSubtype(ESubtype subtype)
static E_Choice GetTypeFromSubtype(ESubtype subtype)
void SetCdregion(TCdregion &v)
@ eSubtype_bad
These no longer need to match the FEATDEF values in the C toolkit's objfdef.h.
@ eSubtype_transit_peptide_aa
@ eSubtype_sig_peptide_aa
@ eSubtype_mat_peptide_aa
static EQualifier GetQualifierType(CTempString qual)
convert qual string to enumerated value
static const CBondList * GetBondList()
static CTempString GetQualifierAsString(EQualifier qual)
Convert a qualifier from an enumerated value to a string representation or empty if not found.
static ESubtype SubtypeNameToValue(CTempString sName)
Turn a string into its ESubtype which is NOT necessarily related to the identifier of the enum.
static bool IsRegulatory(ESubtype subtype)
static const vector< string > & GetRegulatoryClassList()
bool IsFtable(void) const
namespace ncbi::objects::
const CGene_ref * GetGeneXref(void) const
See related function in util/feature.hpp.
void SetGeneXref(CGene_ref &value)
void SetProtXref(CProt_ref &value)
void SetPartialStart(bool val, ESeqLocExtremes ext)
void SetPartialStop(bool val, ESeqLocExtremes ext)
Seq-loc iterator class – iterates all intervals from a seq-loc in the correct order.
void SetRightOf(bool val)
void SetPartialStart(bool val, ESeqLocExtremes ext)
void SetPartialStop(bool val, ESeqLocExtremes ext)
class CStaticArrayMap<> is an array adaptor that provides an STLish interface to statically-defined a...
TBase::const_iterator const_iterator
Simple implementation of ILineReader for i(o)streams.
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
CUser_object & AddField(const string &label, const string &value, EParseField parse=eParse_String)
add a data field to the user object that holds a given value
@ eParse_Number
Parse a real or integer number, otherwise string.
virtual bool PutError(const ILineError &)=0
Store error in the container, and return true if error was stored fine, and return false if the calle...
virtual void PutProgress(const string &sMessage, const Uint8 iNumDone=0, const Uint8 iNumTotal=0)=0
This is used for processing progress messages.
virtual EDiagSev Severity(void) const
@ eProblem_InvalidQualifier
@ eProblem_QualifierBadValue
@ eProblem_NumericQualifierValueIsNotANumber
@ eProblem_InternalPartialsInFeatLocation
@ eProblem_FeatMustBeInXrefdGene
@ eProblem_UnrecognizedFeatureName
@ eProblem_FeatureNameNotAllowed
@ eProblem_IncompleteFeature
@ eProblem_QualifierWithoutFeature
@ eProblem_FeatureBadStartAndOrStop
@ eProblem_NumericQualifierValueHasExtraTrailingCharacters
@ eProblem_UnrecognizedSquareBracketCommand
@ eProblem_UnrecognizedQualifierName
@ eProblem_BadFeatureInterval
@ eProblem_DiscouragedFeatureName
@ eProblem_NoFeatureProvidedOnIntervals
@ eProblem_DiscouragedQualifierName
virtual const std::string & ErrorMessage(void) const
vector< unsigned int > TVecOfLines
virtual EProblem Problem(void) const =0
Abstract base class for lightweight line-by-line reading.
Use to give a feature filter to CFeature_table_reader.
EAction
How a given feature name should be handled.
@ eAction_Okay
Just accept the feat.
@ eAction_Disallowed
Do not accept the feat and give message eProblem_FeatureNameNotAllowed.
virtual EAction GetFeatAction(const string &feature_name) const =0
Returns how we should treat the given feature name.
const_iterator_pair equal_range(const key_type &key) const
iterator insert(const value_type &val)
container_type::iterator iterator
container_type::value_type value_type
iterator_bool insert(const value_type &val)
const_iterator begin() const
const_iterator_pair equal_range(const key_type &key) const
const_iterator end() const
static const char location[]
Include a standard set of the NCBI C++ Toolkit most basic headers.
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
const TResidue codons[4][4]
unsigned int TSeqPos
Type for sequence locations and lengths.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
#define ENTREZ_ID_FROM(T, value)
#define ERR_POST_X(err_subcode, message)
Error posting with default error code and given error subcode.
EDiagSev
Severity level for the posted diagnostics.
@ eDiag_Error
Error message.
@ eDiag_Warning
Warning message.
void Warning(CExceptionArgs_Base &args)
virtual void UngetLine(void)=0
Unget current line, which must be valid.
virtual Uint8 GetLineNumber(void) const =0
Returns the current line number (counting from 1, not 0).
virtual bool AtEOF(void) const =0
Indicates (negatively) whether there is any more input.
const string AsFastaString(void) const
static SIZE_TYPE ParseIDs(CBioseq::TId &ids, const CTempString &s, TParseFlags flags=fParse_Default)
Parse a string representing one or more Seq-ids, appending the results to IDS.
void GetLabel(string *label, ELabelType type=eDefault, TLabelFlags flags=fLabel_Default) const
Append a label for this Seq-id to the supplied string.
@ fParse_PartialOK
Warn rather than throwing an exception when a FASTA-style ID set contains unparsable portions,...
@ fParse_AnyLocal
Treat otherwise unidentified strings as local accessions as long as they don't resemble FASTA-style I...
@ fParse_Default
By default in ParseIDs and IsValid, allow raw parsable non-numeric accessions and plausible local acc...
@ fParse_ValidLocal
Treat otherwise unidentified strings as raw accessions, provided that they pass rudimentary validatio...
@ eFasta
Tagged ID in NCBI's traditional FASTA style.
bool IsPartialStart(ESeqLocExtremes ext) const
check start or stop of location for e_Lim fuzz
ENa_strand GetStrand(void) const
Get the location's strand.
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Override Assign() to incorporate cache invalidation.
CRef< CSeq_loc > Subtract(const CSeq_loc &other, TOpFlags flags, ISynonymMapper *syn_mapper, ILengthGetter *len_getter) const
Subtract seq-loc from this, merge/sort resulting ranges depending on flags.
const CSeq_loc & GetEmbeddingSeq_loc(void) const
Get the nearest seq-loc containing the current range.
const CSeq_id * GetId(void) const
Get the id of the location; return NULL if it has multiple ids or no id at all.
void SetNull(void)
Override all setters to incorporate cache invalidation.
bool IsPartialStop(ESeqLocExtremes ext) const
void Reset(void)
Reset reference object.
TObjectType * GetPointerOrNull(void) const THROWS_NONE
Get pointer value.
bool IsNull(void) const THROWS_NONE
Check if pointer is null – same effect as Empty().
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
int32_t Int4
4-byte (32-bit) signed integer
unsigned char Uchar
Alias for unsigned char.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
NCBI_NS_STD::string::size_type SIZE_TYPE
const_iterator end() const
Return an iterator to the string's ending position (one past the end of the represented sequence)
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
static bool EndsWith(const CTempString str, const CTempString end, ECase use_case=eCase)
Check if a string ends with a specified suffix value.
CTempStringEx substr(size_type pos) const
Obtain a substring from this string, beginning at a given offset.
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
bool empty(void) const
Return true if the represented string is empty (i.e., the length is zero)
void clear(void)
Clear value to an empty string.
static long StringToLong(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to long.
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
static bool SplitInTwo(const CTempString str, const CTempString delim, string &str1, string &str2, TSplitFlags flags=0)
Split a string into two pieces using the specified delimiters.
size_type length(void) const
Return the length of the represented array.
CTempString substr(size_type pos) const
Obtain a substring from this string, beginning at a given offset.
size_type find_first_not_of(const CTempString match, size_type pos=0) const
Find the first occurrence of any character not in the matching string within the current string,...
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
size_type find(const CTempString match, size_type pos=0) const
Find the first instance of the entire matching string within the current string, beginning at an opti...
static string TruncateSpaces(const string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string.
size_type size(void) const
Return the length of the represented array.
static string & ToLower(string &str)
Convert string to lower case – string& version.
static const size_type npos
const_iterator begin() const
Return an iterator to the string's starting position.
@ fAllowTrailingSymbols
Ignore trailing non-numeric characters.
@ fSplit_Tokenize
All delimiters are merged and trimmed, to get non-empty tokens only.
@ eTrunc_End
Truncate trailing spaces only.
@ eTrunc_Begin
Truncate leading spaces only.
@ eNocase
Case insensitive compare.
void SetSubtype(TSubtype value)
Assign a value to Subtype data member.
list< CRef< CSubSource > > TSubtype
void SetGenome(TGenome value)
Assign a value to Genome data member.
void SetOrg(TOrg &value)
Assign a value to Org data member.
void SetName(const TName &value)
Assign a value to Name data member.
EGenome
biological context
TSubtype & SetSubtype(void)
Assign a value to Subtype data member.
@ eSubtype_collection_date
DD-MMM-YYYY format.
@ eSubtype_insertion_seq_name
@ eSubtype_transposon_name
@ eSubtype_fwd_primer_seq
sequence (possibly more than one; semicolon-separated)
@ eSubtype_lat_lon
+/- decimal degrees
@ eSubtype_rev_primer_name
@ eSubtype_collected_by
name of person who collected the sample
@ eSubtype_fwd_primer_name
@ eSubtype_rev_primer_seq
sequence (possibly more than one; semicolon-separated)
@ eSubtype_isolation_source
@ eSubtype_environmental_sample
@ eSubtype_endogenous_virus_name
@ eSubtype_identified_by
name of person who identified the sample
@ eGenome_endogenous_virus
TSyn & SetSyn(void)
Assign a value to Syn data member.
void SetAllele(const TAllele &value)
Assign a value to Allele data member.
bool IsSetLocus_tag(void) const
Systematic gene name (e.g., MI0001, ORF0069). Check if a value has been assigned to Locus_tag data member.
bool IsSetLocus(void) const
Official gene symbol. Check if a value has been assigned to Locus data member.
void SetLocus(const TLocus &value)
Assign a value to Locus data member.
void SetLocus_tag(const TLocus_tag &value)
Assign a value to Locus_tag data member.
void SetMaploc(const TMaploc &value)
Assign a value to Maploc data member.
const TLocus_tag & GetLocus_tag(void) const
Get the Locus_tag member data.
const TLocus & GetLocus(void) const
Get the Locus member data.
void SetDesc(const TDesc &value)
Assign a value to Desc data member.
bool IsStr(void) const
Check if variant Str is selected.
void SetTag(TTag &value)
Assign a value to Tag data member.
const TStr & GetStr(void) const
Get the variant data.
TStr & SetStr(void)
Select the variant.
void SetType(TType &value)
Assign a value to Type data member.
void SetDb(const TDb &value)
Assign a value to Db data member.
TId & SetId(void)
Select the variant.
void SetDiv(const TDiv &value)
Assign a value to Div data member.
void SetSubtype(TSubtype value)
Assign a value to Subtype data member.
void SetTaxname(const TTaxname &value)
Assign a value to Taxname data member.
list< CRef< COrgMod > > TMod
void SetGcode(TGcode value)
Assign a value to Gcode data member.
void SetMgcode(TMgcode value)
Assign a value to Mgcode data member.
TMod & SetMod(void)
Assign a value to Mod data member.
void SetOrgname(TOrgname &value)
Assign a value to Orgname data member.
void SetSubname(const TSubname &value)
Assign a value to Subname data member.
void SetLineage(const TLineage &value)
Assign a value to Lineage data member.
@ eSubtype_gb_acronym
used by taxonomy database
@ eSubtype_gb_synonym
used by taxonomy database
@ eSubtype_substrain
Definition: