92 #include <unordered_set>
98 #define NCBI_USE_ERRCODE_X Objtools_Rd_Feature
108 static const char *
const kCdsFeatName =
"CDS";
142 Uchar best_weight = UCHAR_MAX;
145 Uchar new_weight = std_order[(*it)->Which()];
146 if (new_weight < best_weight)
149 best_weight = new_weight;
160 {
'A', list<char>({
'A'})},
161 {
'G', list<char>({
'G'})},
162 {
'C', list<char>({
'C'})},
163 {
'T', list<char>({
'T'})},
164 {
'U', list<char>({
'U'})},
165 {
'M', list<char>({
'A',
'C'})},
166 {
'R', list<char>({
'A',
'G'})},
167 {
'W', list<char>({
'A',
'T'})},
168 {
'S', list<char>({
'C',
'G'})},
169 {
'Y', list<char>({
'C',
'T'})},
170 {
'K', list<char>({
'G',
'T'})},
171 {
'V', list<char>({
'A',
'C',
'G'})},
172 {
'H', list<char>({
'A',
'C',
'T'})},
173 {
'D', list<char>({
'A',
'G',
'T'})},
174 {
'B', list<char>({
'C',
'G',
'T'})},
175 {
'N', list<char>({
'A',
'C',
'G',
'T'})}
314 const string &seq_id,
319 const string& feat_name,
323 const string &seq_id );
331 const unsigned int line_number,
382 const string &feat_name,
383 const string& qual,
const string&
val,
387 const string& qual_val,
388 const string& feat_name,
399 EQual qtype,
const string& qual,
const string&
val);
401 const string &feat_name,
411 const string& feat_name,
416 const string& qual,
const string&
val);
785 {
"Asp or Asn",
'B' },
786 {
"Asparagine",
'N' },
787 {
"Aspartate",
'D' },
788 {
"Aspartic Acid",
'D' },
794 {
"Glu or Gln",
'Z' },
795 {
"Glutamate",
'E' },
796 {
"Glutamic Acid",
'E' },
797 {
"Glutamine",
'Q' },
802 {
"Histidine",
'H' },
805 {
"Isoleucine",
'I' },
807 {
"Leu or Ile",
'J' },
812 {
"Methionine",
'M' },
815 {
"Phenylalanine",
'F' },
819 {
"Pyrrolysine",
'O' },
821 {
"Selenocysteine",
'U' },
826 {
"Termination",
'*' },
828 {
"Threonine",
'T' },
830 {
"Tryptophan",
'W' },
846 "environmental_sample",
852 "ribosomal_slippage",
860 : m_reader(reader), m_LineNumber(
line_num), m_pMessageListener(pMessageListener)
909 out_offset = new_offset;
927 bool isminus =
false;
928 bool ispoint =
false;
930 bool partial5 =
false;
931 bool partial3 =
false;
935 string start, stop, feat, qual,
val, stnd;
939 if (line.
empty ())
return false;
946 numtkns = tkns.size ();
963 if(
val.length() >= 2 &&
val[0] ==
'"' &&
val[
val.length()-1] ==
'"' ) {
971 bool has_start =
false;
972 if (! start.empty ()) {
973 if (start [0] ==
'<') {
977 len = start.length ();
978 if (
len > 1 && start [
len - 1] ==
'^') {
980 start [
len - 1] =
'\0';
987 bool has_stop =
false;
988 if (! stop.empty ()) {
989 if (stop [0] ==
'>') {
998 if ( startv <= 0 || stopv <= 0 ) {
1004 if (! stnd.empty ()) {
1005 if (stnd ==
"minus" || stnd ==
"-" || stnd ==
"complement") {
1023 if ((has_start && startv < 0) || (has_stop && stopv < 0)) {
1030 loc_info.
start_pos = ( startv < 0 ? -1 : startv);
1031 loc_info.
stop_pos = ( stopv < 0 ? -1 : stopv);
1047 vector<string> &out_tokens )
1052 string::size_type startPosOfNextRoundOfTokenization = 0;
1053 while ( startPosOfNextRoundOfTokenization < line.
size() ) {
1054 auto posAfterSpaces = line.
find_first_not_of(
" ", startPosOfNextRoundOfTokenization );
1055 if( posAfterSpaces == string::npos ) {
1059 string::size_type posOfTab = line.
find(
'\t', posAfterSpaces );
1060 if( posOfTab == string::npos ) {
1061 posOfTab = line.
length();
1066 string &new_token = out_tokens.back();
1067 copy( line.
begin() + posAfterSpaces, line.
begin() + posOfTab, back_inserter(new_token) );
1070 startPosOfNextRoundOfTokenization = ( posOfTab + 1 );
1087 vector<string> &out_tokens )
1091 if( line.
empty() ) {
1101 if( start_of_qual == line.
end() ) {
1104 auto start_of_whitespace_after_qual = find_if( start_of_qual, line.
end(),
CIsSpace() );
1105 auto start_of_val = find_if( start_of_whitespace_after_qual, line.
end(),
CIsNotSpace() );
1114 string &qual = out_tokens.back();
1115 copy( start_of_qual, start_of_whitespace_after_qual, back_inserter(qual) );
1118 if( start_of_val != line.
end() ) {
1120 string &
val = out_tokens.back();
1121 copy( start_of_val, line.
end(), back_inserter(
val) );
1129 auto first_column_start = line.
begin();
1130 auto first_whitespace = find_if( first_column_start, line.
end(),
CIsSpace() );
1131 auto second_column_start = find_if( first_whitespace, line.
end(),
CIsNotSpace() );
1132 auto second_whitespace = find_if( second_column_start, line.
end(),
CIsSpace() );
1133 auto third_column_start = find_if( second_whitespace, line.
end(),
CIsNotSpace() );
1134 auto third_whitespace = find_if( third_column_start, line.
end(),
CIsSpace() );
1136 auto sixth_column_start = find_if( third_whitespace, line.
end(),
CIsNotSpace() );
1137 auto sixth_whitespace = find_if( sixth_column_start, line.
end(),
CIsSpace() );
1140 string &
first = out_tokens.back();
1141 copy( first_column_start, first_whitespace, back_inserter(
first) );
1144 string &second = out_tokens.back();
1145 copy( second_column_start, second_whitespace, back_inserter(second) );
1148 string &third = out_tokens.back();
1149 copy( third_column_start, third_whitespace, back_inserter(third) );
1151 if( sixth_column_start != line.
end() ) {
1157 string &sixth = out_tokens.back();
1158 copy( sixth_column_start, sixth_whitespace, back_inserter(sixth) );
1185 syn.push_back (
val);
1207 EQual qtype,
const string& val
1245 fun.push_back (
val);
1252 prod.push_back (
val);
1288 kCdsFeatName,
"transl_table",
val);
1307 if (ch >
' ' && ch !=
'"' && ch !=
'\'')
return false;
1318 const static char* kOrder =
"ORDER";
1321 string::size_type pos = 0;
1327 if( pos >= line.
length() ) {
1350 for( ; loc_iter; ++loc_iter ) {
1351 if( ! mix_pieces.empty() ) {
1352 mix_pieces.push_back( loc_piece_null );
1356 mix_pieces.push_back( new_piece );
1360 if( mix_pieces.size() > 1 ) {
1393 string normalized_string =
str;
1394 normalized_string.erase(
1396 end(normalized_string),
1397 [](
char c) {
return isspace(c);}),
1398 end(normalized_string));
1403 if (pos_end != string::npos) {
1404 string pos_str = normalized_string.substr (5, pos_end - 5);
1406 if (aa_start != string::npos) {
1408 if (seq_start != string::npos &&
1409 seq_start < aa_start+3) {
1413 size_t aa_length = (seq_start ==
NPOS) ?
1414 pos_str.size() - (aa_start+3) :
1415 seq_start - (aa_start+3);
1417 string abbrev = pos_str.substr (aa_start + 3, aa_length);
1425 aa->SetNcbieaa (t_iter->second);
1426 ext_trna.
SetAa(*aa);
1427 pos_str = pos_str.substr (0, aa_start);
1430 pos_str = pos_str.substr (0, pos_str.length() - 1);
1467 for( ; pos <
str.length(); ++pos ) {
1468 switch(
str[pos] ) {
1503 if( ! strToConvert.
empty() &&
isdigit(strToConvert[0]) ) {
1516 strFeatureName, strQualifierName, strToConvert );
1530 strFeatureName, strQualifierName, strToConvert );
1587 rrp.
SetExt().SetGen().SetQuals().Set().push_back(q);
1607 const auto aaval_it =
sm_TrnaKeys.find(aa_string.c_str());
1613 taa.SetNcbieaa(aaval_it->second);
1614 if (aa_string ==
"fMet" ||
1615 aa_string ==
"iMet" ||
1616 aa_string ==
"Ile2") {
1623 "tRNA",
"product",
val);
1635 "tRNA",
"anticodon",
val );
1669 if (
val.size() != 3) {
1675 for (
char char1 : s_IUPACmap.at(
val[0])) {
1676 for (
char char2 : s_IUPACmap.at(
val[1])) {
1677 for (
char char3 : s_IUPACmap.at(
val[2])) {
1679 codons.insert(codon_index);
1685 trna_ext.
SetAa().SetNcbieaa();
1686 for (
const auto codon_index :
codons) {
1687 trna_ext.
SetCodon().push_back(codon_index);
1707 const char*
str =
nullptr;
1717 if (
val !=
"other") {
1719 const vector<string>& allowed_values =
1721 if (find(allowed_values.cbegin(), allowed_values.cend(),
val)
1722 == allowed_values.cend()) {
1759 str =
"dbSnpSynonymyData";
1772 str =
"stsUserObject";
1788 str =
"cloneUserObject";
1815 const string &feat_name,
1832 if (g_iter != sm_GenomeKeys.end ()) {
1838 feat_name,
"organelle",
val );
1891 slist.push_back (ssp);
1910 mlist.push_back (omp);
1922 if (qual.empty ())
return false;
1933 if( ! potential_normalized_qual.
empty() ) {
1934 normalized_qual = potential_normalized_qual;
1938 auto& qlist = sfp->
SetQual ();
1940 gbq->
SetQual() = normalized_qual;
1946 qlist.push_back (gbq);
1959 typedef pair<TChoiceCI, TChoiceCI> TChoiceEqualRange;
1960 TChoiceEqualRange cds_equal_range =
1962 if( cds_equal_range.first == cds_equal_range.second )
1970 TStringToGeneAndLineMap locusToGeneAndLineMap;
1971 TStringToGeneAndLineMap locusTagToGeneAndLineMap;
1972 const TChoiceEqualRange gene_equal_range =
1974 for( TChoiceCI gene_choice_ci = gene_equal_range.first;
1975 gene_choice_ci != gene_equal_range.second;
1981 locusToGeneAndLineMap.insert(
1983 gene_ref.
GetLocus(), gene_feat_ref_and_line));
1986 locusTagToGeneAndLineMap.insert(
1994 for( TChoiceCI cds_choice_ci = cds_equal_range.first;
1995 cds_choice_ci != cds_equal_range.second ; ++cds_choice_ci)
1998 const TSeqPos cds_line_num = cds_choice_ci->second.m_uLineNum;
2003 if( ! pGeneXrefOnCDS ) {
2014 const string locus =
2019 const string locus_tag =
2028 typedef TStringToGeneAndLineMap::iterator TStrToGeneCI;
2029 typedef pair<TStrToGeneCI, TStrToGeneCI> TStrToGeneEqualRange;
2033 TStrToGeneEqualRange locus_equal_range =
2035 for( TStrToGeneCI locus_gene_ci = locus_equal_range.first;
2036 locus_gene_ci != locus_equal_range.second;
2040 auto gene_feat = locus_gene_ci->second.m_pFeat;
2041 if (gene_feat->GetData().GetGene().IsSetLocus_tag() &&
2042 gene_feat->GetData().GetGene().GetLocus_tag() != locus_tag) {
2046 locusGeneMatches.
insert(locus_gene_ci->second);
2052 TStrToGeneEqualRange locus_tag_equal_range =
2054 for( TStrToGeneCI locus_tag_gene_ci = locus_tag_equal_range.first;
2055 locus_tag_gene_ci != locus_tag_equal_range.second;
2056 ++locus_tag_gene_ci )
2059 auto gene_feat = locus_tag_gene_ci->second.m_pFeat;
2060 if (gene_feat->GetData().GetGene().IsSetLocus() &&
2061 gene_feat->GetData().GetGene().GetLocus() != locus) {
2065 locusTagGeneMatches.
insert(locus_tag_gene_ci->second);
2069 if( locusGeneMatches.
empty() ) {
2071 matchingGenes.
swap(locusTagGeneMatches);
2072 }
else if( locusTagGeneMatches.
empty() ) {
2074 matchingGenes.
swap(locusGeneMatches);
2078 locusGeneMatches.
begin(), locusGeneMatches.
end(),
2079 locusTagGeneMatches.
begin(), locusTagGeneMatches.
end(),
2080 inserter(matchingGenes, matchingGenes.
begin()));
2088 const CSeq_loc & gene_loc = gene_feat_and_line_ci->m_pFeat->GetLocation();
2089 const TSeqPos gene_line_num = gene_feat_and_line_ci->m_uLineNum;
2097 if( pCdsMinusGeneLoc &&
2098 ! pCdsMinusGeneLoc->
IsNull() &&
2099 ! pCdsMinusGeneLoc->
IsEmpty() )
2102 if( gene_line_num > 0 ) {
2103 gene_lines.push_back(gene_line_num);
2118 matchingGenes.
empty() )
2122 pNewGene->
SetData().SetGene().Assign( *pGeneXrefOnCDS );
2129 the_ftable.push_back(pNewGene);
2137 locusToGeneAndLineMap.insert(
2139 pGeneXrefOnCDS->
GetLocus(), gene_feat_and_line));
2142 locusTagToGeneAndLineMap.insert(
2162 string lqual = qual;
2196 const string& feat_name,
2198 const string& val) {
2204 if (qual !=
"note") {
2205 string error_message =
2206 qual +
" is not a valid qualifier for this feature. Converting to note.";
2209 feat_name, qual,
kEmptyStr, error_message);
2216 const string &feat_name,
2239 if (o_iter != sm_OrgRefKeys.end ()) {
2240 EOrgRef rtype = o_iter->second;
2245 if (s_iter != sm_SubSrcKeys.end ()) {
2253 if (m_iter != sm_OrgModKeys.end ()) {
2267 if (q_iter != sm_QualKeys.end ()) {
2268 EQual qtype = q_iter->second;
2310 sfdata.
SetPub().SetPub().Set().push_back( new_pub );
2320 sfdata.
SetProt().SetActivity().push_back(
val );
2353 if (
val ==
"experimental") {
2355 }
else if (
val ==
"not_experimental" ||
val ==
"non_experimental" ||
2356 val ==
"not-experimental" ||
val ==
"non-experimental") {
2364 string prefix, remainder;
2372 feat_name, qual,
val);
2378 string val_copy =
val;
2453 syn.push_back (
val);
2477 static const char* digits =
"0123456789";
2483 dblist.push_back (dbt);
2506 feat_name, qual,
val,
2527 feat_name, qual,
val,
2528 "Invalid transcript_id : " +
val);
2532 for (
const auto&
id : ids) {
2533 auto id_string =
id->GetSeqIdString(
true);
2535 if (res.second ==
false) {
2538 feat_name, qual,
val,
2539 "Transcript ID " + id_string +
" appears on multiple mRNA features"
2566 feat_name, qual,
val,
2567 "Invalid protein_id : " +
val);
2572 for (
const auto&
id : ids) {
2573 auto id_string =
id->GetSeqIdString(
true);
2575 if (res.second ==
false) {
2578 feat_name, qual,
val,
2579 "Protein ID " + id_string +
" appears on multiple CDS features"
2602 feat_name, qual,
val );
2618 if( line.
length() < 6 ) {
2622 if( line[0] ==
'=' ) {
2624 "===================================================================";
2628 }
else if( line[0] ==
' ') {
2675 const Int4 orig_start = start;
2689 if (loc_info.
is_point || start == stop ) {
2691 if (mix_set.empty())
2706 if( stop != (start+1) ) {
2741 if( ! mix_set.empty() ) {
2742 const CSeq_loc & last_loc = *mix_set.back();
2752 mix_set.push_back(loc);
2772 if (feat.empty ())
return false;
2814 rrp.
SetExt().SetGen().SetClass(
"snRNA");
2818 rrp.
SetExt().SetGen().SetClass(
"scRNA");
2822 rrp.
SetExt().SetGen().SetClass(
"snoRNA");
2833 rrp.
SetExt().SetName(
"misc_RNA");
2896 imp.
SetKey (
"misc_feature");
2917 const string& strFeatureName,
2918 const string& strQualifierName,
2919 const string& strQualifierValue,
2920 const string& strErrorMessage,
2938 const string & strFeatureName,
2939 const string & strQualifierName,
2940 const string & strQualifierValue,
2941 const string& strErrorMessage,
2949 unique_ptr<CObjReaderLineException> pErr (
2952 strQualifierName, strQualifierValue));
2958 for (
auto line : vecOfOtherLines) {
2959 pErr->AddOtherLine(line);
2970 const unsigned int line_number,
2989 curr_feat_intervals_done =
false;
2997 if (
last.IsInt() &&
last.GetInt().IsSetStrand())
2999 strand =
last.GetInt().GetStrand();
3002 if (
last.IsPnt() &&
last.GetPnt().IsSetStrand())
3004 strand =
last.GetPnt().GetStrand();
3014 for (
auto pSeqLoc : feat.
SetLocation().SetMix().Set()) {
3015 if (pSeqLoc->IsPnt()) {
3016 auto& seq_point = pSeqLoc->SetPnt();
3017 const auto old_strand =
3018 seq_point.IsSetStrand() ?
3019 seq_point.GetStrand() :
3022 seq_point.SetStrand(strand);
3023 if (old_strand != strand) {
3046 const auto& featData = feat->
GetData();
3048 (!featData.GetPub().IsSetPub() ||
3049 !featData.GetPub().GetPub().IsSet() ||
3050 featData.GetPub().GetPub().Get().empty())) {
3056 string msg =
"Reference feature is empty. Skipping feature.";
3087 const string& qual_val,
3088 const string& feat_name,
3138 string feat, qual, qual_value;
3139 string curr_feat_name;
3143 bool ignore_until_next_feature_key =
false;
3150 const bool bIgnoreWebComments =
3174 bool curr_feat_intervals_done =
false;
3176 if (! in_annotname.
empty ()) {
3179 annot->
SetName (in_annotname);
3180 descr.
Set().push_back (annot);
3206 if (line [0] ==
'[') {
3225 if( loc_with_nulls ) {
3232 replace( qual_value.begin(), qual_value.end(),
'\"',
'\'' );
3261 ignore_until_next_feature_key =
false;
3263 curr_feat_name = feat;
3269 ignore_until_next_feature_key =
true;
3272 }
else if (ignore_until_next_feature_key) {
3283 qual_value.empty ()) {
3285 if( curr_feat_intervals_done ) {
3289 ignore_until_next_feature_key =
true;
3303 curr_feat_intervals_done =
true;
3306 else if (!feat.empty()) {
3312 curr_feat_intervals_done =
true;
3340 const string &seq_id,
3371 if (id->IsGenbank())
3386 const string& feat_name,
3390 const string &seq_id1 )
3399 if (!
val.empty ()) {
3426 out_annotname.
clear();
3503 const string& seqid,
3504 const string& annotname,
3516 const string& seqid,
3517 const string& annotname,
3525 return impl.ReadSequinFeatureTable(seqid, annotname,
flags, filter);
3554 const string& seqid_prefix)
3564 while (orig_seqid.
empty () && !pLineReader->AtEOF() ) {
3568 static_cast<unsigned>(pLineReader->GetLineNumber()),
3574 if (seqid_prefix.empty()) {
3577 if (orig_seqid.
find(
'|') == string::npos)
3578 temp_seqid = seqid_prefix + orig_seqid;
3582 temp_seqid = seqid_prefix + orig_seqid.
substr(4);
3584 orig_seqid = temp_seqid;
3595 const string& seqid_prefix
3606 const string& seqid_prefix
3627 const list<string>& stringFlags,
3652 return *left < *right;
3669 seq_map[seq_id->GetPointer()].Reset(&*seqit);
3674 while ( !reader.
AtEOF() ) {
3676 if (entry.
IsSeq()) {
3680 _ASSERT(annot->GetData().IsFtable());
3681 if (annot->GetData().GetFtable().empty()) {
3689 seq = seq_map[feat_id].GetPointer();
3694 <<
"ReadSequinFeatureTables: unable to find match for "
3707 unsigned int line_number,
3713 return impl.CreateSeqFeat (feat,
location,
flags, (seq_id ? *seq_id :
string() ), filter);
3719 const string& feat_name,
3725 const string &seq_id
3730 impl.AddFeatQual (sfp, feat_name, qual,
val,
flags, seq_id) ;
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
@ eExtreme_Biological
5' and 3'
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
void remove_if(Container &c, Predicate *__pred)
bool AddFeat(const CSeq_feat &new_cds)
bool x_AddQualifierToFeature(CRef< CSeq_feat > sfp, const string &feat_name, const string &qual, const string &val, const TFlags flags)
static void PutProgress(const CTempString &seq_id, const unsigned int line_number, ILineErrorListener *pListener)
CSeq_annot::C_Data::TFtable TFtable
bool x_TryToParseOffset(const CTempString &sLine, Int4 &out_offset)
void AddFeatQual(CRef< CSeq_feat > sfp, const string &feat_name, const string &qual, const string &val, const TFlags flags, const string &seq_id)
@ eQual_mobile_element_type
@ eQual_ribosomal_slippage
@ eQual_secondary_accession
bool x_AddQualifierToImp(CRef< CSeq_feat > sfp, CSeqFeatData &sfdata, EQual qtype, const string &qual, const string &val)
static bool ParseInitialFeatureLine(const CTempString &line_arg, CTempStringEx &out_seqid, CTempStringEx &out_annotname)
bool x_AddNoteToFeature(CRef< CSeq_feat > sfp, const string &note)
CConstRef< CSeq_feat > TFeatConstRef
void x_TokenizeStrict(const CTempString &line, vector< string > &out_tokens)
bool x_AddQualifierToGene(CSeqFeatData &sfdata, EQual qtype, const string &val)
ILineReader *const GetLineReaderPtr(void)
string x_TrnaToAaString(const string &val)
bool x_AddIntervalToFeature(CTempString strFeatureName, CRef< CSeq_feat > &sfp, const SFeatLocInfo &loc_info)
unsigned int m_LineNumber
void x_ProcessQualifier(const string &qual_name, const string &qual_val, const string &feat_name, CRef< CSeq_feat > feat, TFlags flags)
void x_InitId(const CTempString &seq_id, const TFlags flags)
multimap< CSeqFeatData::E_Choice, SFeatAndLineNum > TChoiceToFeatMap
bool x_AddQualifierToRna(CRef< CSeq_feat > sfp, EQual qtype, const string &val)
void x_ResetFeat(CRef< CSeq_feat > &feat, bool &curr_feat_intervals_done)
CRef< CSeq_annot > ReadSequinFeatureTable(const CTempString &seqid, const CTempString &annotname, const TFlags flags, ITableFilter *filter)
bool x_ParseTrnaExtString(CTrna_ext &ext_trna, const string &str)
void x_CreateGenesFromCDSs(CRef< CSeq_annot > sap, TChoiceToFeatMap &choiceToFeatMap, const TFlags flags)
long x_StringToLongNoThrow(CTempString strToConvert, CTempString strFeatureName, CTempString strQualifierName, ILineError::EProblem eProblem=ILineError::eProblem_Unset)
ILineErrorListener * m_pMessageListener
void x_FinishFeature(CRef< CSeq_feat > &feat, TFtable &ftable)
CFeatureTableReader_Imp(ILineReader *reader, unsigned int line_num, ILineErrorListener *pMessageListener)
bool x_AddGBQualToFeature(CRef< CSeq_feat > sfp, const string &qual, const string &val)
bool x_IsWebComment(CTempString line)
CFeature_table_reader::TFlags TFlags
ILineErrorListener *const GetErrorListenerPtr(void)
bool x_StringIsJustQuotes(const string &str)
void x_GetPointStrand(const CSeq_feat &feat, CSeq_interval::TStrand &strand) const
CRef< CSeq_feat > CreateSeqFeat(const string &feat, CSeq_loc &location, const TFlags flags, const string &seq_id, ITableFilter *filter)
SIZE_TYPE x_MatchingParenPos(const string &str, SIZE_TYPE open_paren_pos)
bool x_AddQualifierToCdregion(CRef< CSeq_feat > sfp, CSeqFeatData &sfdata, EQual qtype, const string &val)
void x_TokenizeLenient(const CTempString &line, vector< string > &out_tokens)
void x_UpdatePointStrand(CSeq_feat &feat, CSeq_interval::TStrand strand) const
unordered_set< string > m_ProcessedProteinIds
bool x_ParseFeatureTableLine(const CTempString &line, SFeatLocInfo &loc_info, string &feat, string &qual, string &val, Int4 offset)
CRef< CSeq_feat > m_pCurrentFeat
bool x_AddQualifierToBioSrc(CSeqFeatData &sfdata, const string &feat_name, EOrgRef rtype, const string &val)
unsigned int x_GetLineNumber() const
CFeatureTableReader_Imp & operator=(const CFeatureTableReader_Imp &value)
~CFeatureTableReader_Imp(void)
bool x_AddCodons(const string &val, CTrna_ext &trna_ext) const
bool x_SetupSeqFeat(CRef< CSeq_feat > sfp, const string &feat, const TFlags flags, ITableFilter *filter)
unordered_set< string > m_ProcessedTranscriptIds
void x_ProcessMsg(int line_num, ILineError::EProblem eProblem, EDiagSev eSeverity, const std::string &strFeatureName=kEmptyStr, const std::string &strQualifierName=kEmptyStr, const std::string &strQualifierValue=kEmptyStr, const std::string &strErrorMessage=kEmptyStr, const ILineError::TVecOfLines &vecOfOtherLines=ILineError::TVecOfLines())
void x_ProcessMsg(ILineError::EProblem eProblem, EDiagSev eSeverity, const std::string &strFeatureName=kEmptyStr, const std::string &strQualifierName=kEmptyStr, const std::string &strQualifierValue=kEmptyStr, const std::string &strErrorMessage=kEmptyStr, const ILineError::TVecOfLines &vecOfOtherLines=ILineError::TVecOfLines())
CFeatureTableReader_Imp(const CFeatureTableReader_Imp &value)
CFeature_table_reader(TReaderFlags fReaderFlags=0)
CRef< CSerialObject > ReadObject(ILineReader &lr, ILineErrorListener *pErrors) override
Read an object from a given line reader, render it as the most appropriate Genbank object.
long TFlags
binary OR of EFlags
static bool ParseInitialFeatureLine(const CTempString &line_arg, CTempStringEx &out_seqid, CTempStringEx &out_annotname)
If line_arg is a feature line (e.g.
static void ReadSequinFeatureTables(ILineReader &reader, CSeq_entry &entry, const TFlags flags=0, ILineErrorListener *pMessageListener=nullptr, ITableFilter *filter=nullptr)
@ fSuppressBadKeyWarning
= 0x400 (Suppress 'bad key' errors; Not recommended.)
@ fReportDiscouragedKey
= 0x40 (Report discouraged keys into the error container)
@ fKeepBadKey
= 0x02 (As much as possible, try to use bad keys as if they were acceptable)
@ fIgnoreWebComments
= 0x08 (ignore web comment lines such as lines that start with " INFO:", or consist of many equals si...
@ fIncludeObjectInMsg
= 0x800 (Include reference to feature object in message).
@ fAllIdsAsLocal
= 0x100 (Do not attempt to parse accessions)
@ fLeaveProteinIds
= 0x80 (Leave all protein_id as a qualifiers)
@ fCreateGenesFromCDSs
= 0x10 (If a CDS has a gene xref, create a gene with the same intervals if one doesn't already exist....
@ fPreferGenbankId
= 0x200 (Prefer Genbank accession ids)
@ fTranslateBadKey
= 0x04 (yields misc_feature /standard_name="...")
@ fCDSsMustBeInTheirGenes
= 0x20 (If a CDS has a gene xref, it *must* be inside of that gene)
CRef< CSeq_annot > ReadSeqAnnot(ILineReader &lr, ILineErrorListener *pErrors) override
Read an object from a given line reader, render it as a single Seq-annot, if possible.
unique_ptr< CFeatureTableReader_Imp > m_pImpl
static CRef< CSeq_annot > x_ReadFeatureTable(CFeatureTableReader_Imp &reader, const CTempString &seqid, const CTempString &annot_name, const TFlags flags, ITableFilter *filter)
static CRef< CSeq_feat > CreateSeqFeat(const string &feat, CSeq_loc &location, const TFlags flags=0, ILineErrorListener *pMessageListener=nullptr, unsigned int line=0, std::string *seq_id=nullptr, ITableFilter *filter=nullptr)
static void AddFeatQual(CRef< CSeq_feat > sfp, const string &feat_name, const string &qual, const string &val, const TFlags flags=0, ILineErrorListener *pMessageListener=nullptr, int line=0, const string &seq_id=std::string())
static void AddStringFlags(const list< string > &stringFlags, TFlags &baseFlags)
CRef< CSeq_annot > ReadSequinFeatureTable(const TFlags flags=0, ITableFilter *filter=nullptr, const string &seqid_prefix=kEmptyStr)
@Gb_qual.hpp User-defined methods of the data storage class.
static const CTrans_table & GetTransTable(int id)
static int CodonToIndex(char base1, char base2, char base3)
*** Import *********************************************** * * Features imported from other databases...
static void GetPrefixAndRemainder(const string &inference, string &prefix, string &remainder)
static CObjReaderLineException * Create(EDiagSev eSeverity, unsigned int uLine, const std::string &strMessage, EProblem eProblem=eProblem_GeneralParsingError, const std::string &strSeqId=string(""), const std::string &strFeatureName=string(""), const std::string &strQualifierName=string(""), const std::string &strQualifierValue=string(""), CObjReaderLineException::EErrCode eErrCode=eFormat, const TVecOfLines &vecOfOtherLines=TVecOfLines())
Please use this instead of the constructor because the ctor is protected.
@OrgMod.hpp User-defined methods of the data storage class.
@RNA_ref.hpp User-defined methods of the data storage class.
static void AddGeneOntologyTerm(CSeq_feat &feature, const CTempString &qual, const CTempString &val)
Defines and provides stubs for a general interface to a variety of file readers.
static void xAddStringFlagsWithMap(const list< string > &stringFlags, const map< string, TReaderFlags > flagMap, TReaderFlags &baseFlags)
void SetRegion(const TRegion &v)
void SetBiosrc(TBiosrc &v)
static bool IsDiscouragedQual(EQualifier qual)
EQualifier
List of available qualifiers for feature keys.
void SetBond(const TBond &v)
static bool CanHaveGene(ESubtype subtype)
void SetSite(const TSite &v)
static const CSiteList * GetSiteList()
ESubtype GetSubtype(void) const
static bool IsDiscouragedSubtype(ESubtype subtype)
static E_Choice GetTypeFromSubtype(ESubtype subtype)
void SetCdregion(TCdregion &v)
@ eSubtype_bad
These no longer need to match the FEATDEF values in the C toolkit's objfdef.h.
@ eSubtype_transit_peptide_aa
@ eSubtype_sig_peptide_aa
@ eSubtype_mat_peptide_aa
static EQualifier GetQualifierType(CTempString qual)
convert qual string to enumerated value
static const CBondList * GetBondList()
static CTempString GetQualifierAsString(EQualifier qual)
Convert a qualifier from an enumerated value to a string representation or empty if not found.
static ESubtype SubtypeNameToValue(CTempString sName)
Turn a string into its ESubtype which is NOT necessarily related to the identifier of the enum.
static bool IsRegulatory(ESubtype subtype)
static const vector< string > & GetRegulatoryClassList()
bool IsFtable(void) const
namespace ncbi::objects::
const CGene_ref * GetGeneXref(void) const
See related function in util/feature.hpp.
void SetGeneXref(CGene_ref &value)
void SetProtXref(CProt_ref &value)
void SetPartialStart(bool val, ESeqLocExtremes ext)
void SetPartialStop(bool val, ESeqLocExtremes ext)
Seq-loc iterator class – iterates all intervals from a seq-loc in the correct order.
void SetRightOf(bool val)
void SetPartialStart(bool val, ESeqLocExtremes ext)
void SetPartialStop(bool val, ESeqLocExtremes ext)
class CStaticArrayMap<> is an array adaptor that provides an STLish interface to statically-defined a...
TBase::const_iterator const_iterator
Simple implementation of ILineReader for i(o)streams.
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
CUser_object & AddField(const string &label, const string &value, EParseField parse=eParse_String)
add a data field to the user object that holds a given value
@ eParse_Number
Parse a real or integer number, otherwise string.
virtual bool PutError(const ILineError &)=0
Store error in the container, and return true if error was stored fine, and return false if the calle...
virtual void PutProgress(const string &sMessage, const Uint8 iNumDone=0, const Uint8 iNumTotal=0)=0
This is used for processing progress messages.
virtual EDiagSev Severity(void) const
@ eProblem_InvalidQualifier
@ eProblem_QualifierBadValue
@ eProblem_NumericQualifierValueIsNotANumber
@ eProblem_InternalPartialsInFeatLocation
@ eProblem_FeatMustBeInXrefdGene
@ eProblem_UnrecognizedFeatureName
@ eProblem_FeatureNameNotAllowed
@ eProblem_IncompleteFeature
@ eProblem_QualifierWithoutFeature
@ eProblem_FeatureBadStartAndOrStop
@ eProblem_NumericQualifierValueHasExtraTrailingCharacters
@ eProblem_UnrecognizedSquareBracketCommand
@ eProblem_UnrecognizedQualifierName
@ eProblem_BadFeatureInterval
@ eProblem_DiscouragedFeatureName
@ eProblem_NoFeatureProvidedOnIntervals
@ eProblem_DiscouragedQualifierName
vector< unsigned int > TVecOfLines
virtual EProblem Problem(void) const =0
virtual const string & ErrorMessage() const
Abstract base class for lightweight line-by-line reading.
Use to give a feature filter to CFeature_table_reader.
EAction
How a given feature name should be handled.
@ eAction_Okay
Just accept the feat.
@ eAction_Disallowed
Do not accept the feat and give message eProblem_FeatureNameNotAllowed.
virtual EAction GetFeatAction(const string &feature_name) const =0
Returns how we should treat the given feature name.
const_iterator_pair equal_range(const key_type &key) const
iterator insert(const value_type &val)
container_type::iterator iterator
container_type::value_type value_type
iterator_bool insert(const value_type &val)
const_iterator begin() const
const_iterator_pair equal_range(const key_type &key) const
const_iterator end() const
Include a standard set of the NCBI C++ Toolkit most basic headers.
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
static unsigned int line_num
static const char * str(char *buf, int n)
static const char location[]
const TResidue codons[4][4]
unsigned int TSeqPos
Type for sequence locations and lengths.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
#define ENTREZ_ID_FROM(T, value)
#define ERR_POST_X(err_subcode, message)
Error posting with default error code and given error subcode.
EDiagSev
Severity level for the posted diagnostics.
@ eDiag_Error
Error message.
@ eDiag_Warning
Warning message.
void Warning(CExceptionArgs_Base &args)
virtual void UngetLine(void)=0
Unget current line, which must be valid.
virtual Uint8 GetLineNumber(void) const =0
Returns the current line number (counting from 1, not 0).
virtual bool AtEOF(void) const =0
Indicates (negatively) whether there is any more input.
const string AsFastaString(void) const
static SIZE_TYPE ParseIDs(CBioseq::TId &ids, const CTempString &s, TParseFlags flags=fParse_Default)
Parse a string representing one or more Seq-ids, appending the results to IDS.
void GetLabel(string *label, ELabelType type=eDefault, TLabelFlags flags=fLabel_Default) const
Append a label for this Seq-id to the supplied string.
@ fParse_PartialOK
Warn rather than throwing an exception when a FASTA-style ID set contains unparsable portions,...
@ fParse_AnyLocal
Treat otherwise unidentified strings as local accessions as long as they don't resemble FASTA-style I...
@ fParse_Default
By default in ParseIDs and IsValid, allow raw parsable non-numeric accessions and plausible local acc...
@ fParse_ValidLocal
Treat otherwise unidentified strings as raw accessions, provided that they pass rudimentary validatio...
@ eFasta
Tagged ID in NCBI's traditional FASTA style.
bool IsPartialStart(ESeqLocExtremes ext) const
check start or stop of location for e_Lim fuzz
ENa_strand GetStrand(void) const
Get the location's strand.
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Override Assign() to incorporate cache invalidation.
CRef< CSeq_loc > Subtract(const CSeq_loc &other, TOpFlags flags, ISynonymMapper *syn_mapper, ILengthGetter *len_getter) const
Subtract seq-loc from this, merge/sort resulting ranges depending on flags.
const CSeq_loc & GetEmbeddingSeq_loc(void) const
Get the nearest seq-loc containing the current range.
const CSeq_id * GetId(void) const
Get the id of the location; return NULL if it has multiple ids or no id at all.
void SetNull(void)
Override all setters to incorporate cache invalidation.
bool IsPartialStop(ESeqLocExtremes ext) const
void Reset(void)
Reset reference object.
TObjectType * GetPointerOrNull(void) const THROWS_NONE
Get pointer value.
bool IsNull(void) const THROWS_NONE
Check if pointer is null – same effect as Empty().
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
int32_t Int4
4-byte (32-bit) signed integer
unsigned char Uchar
Alias for unsigned char.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
NCBI_NS_STD::string::size_type SIZE_TYPE
const_iterator end() const
Return an iterator to the string's ending position (one past the end of the represented sequence)
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
static bool EndsWith(const CTempString str, const CTempString end, ECase use_case=eCase)
Check if a string ends with a specified suffix value.
CTempStringEx substr(size_type pos) const
Obtain a substring from this string, beginning at a given offset.
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate whitespace in a string (in-place)
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
bool empty(void) const
Return true if the represented string is empty (i.e., the length is zero)
void clear(void)
Clear value to an empty string.
static long StringToLong(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to long.
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
static bool SplitInTwo(const CTempString str, const CTempString delim, string &str1, string &str2, TSplitFlags flags=0)
Split a string into two pieces using the specified delimiters.
size_type length(void) const
Return the length of the represented array.
CTempString substr(size_type pos) const
Obtain a substring from this string, beginning at a given offset.
size_type find_first_not_of(const CTempString match, size_type pos=0) const
Find the first occurrence of any character not in the matching string within the current string,...
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
size_type find(const CTempString match, size_type pos=0) const
Find the first instance of the entire matching string within the current string, beginning at an opti...
static string TruncateSpaces(const string &str, ETrunc where=eTrunc_Both)
Truncate whitespace in a string.
size_type size(void) const
Return the length of the represented array.
static string & ToLower(string &str)
Convert string to lower case – string& version.
static const size_type npos
const_iterator begin() const
Return an iterator to the string's starting position.
@ fAllowTrailingSymbols
Ignore trailing non-numeric characters.
@ fSplit_Tokenize
All delimiters are merged and trimmed, to get non-empty tokens only.
@ eTrunc_End
Truncate trailing whitespace only.
@ eTrunc_Begin
Truncate leading whitespace only.
@ eNocase
Case insensitive compare.
void SetSubtype(TSubtype value)
Assign a value to Subtype data member.
list< CRef< CSubSource > > TSubtype
void SetGenome(TGenome value)
Assign a value to Genome data member.
void SetOrg(TOrg &value)
Assign a value to Org data member.
void SetName(const TName &value)
Assign a value to Name data member.
EGenome
biological context
TSubtype & SetSubtype(void)
Assign a value to Subtype data member.
@ eSubtype_collection_date
DD-MMM-YYYY format.
@ eSubtype_insertion_seq_name
@ eSubtype_transposon_name
@ eSubtype_fwd_primer_seq
sequence (possibly more than one; semicolon-separated)
@ eSubtype_lat_lon
+/- decimal degrees
@ eSubtype_rev_primer_name
@ eSubtype_collected_by
name of person who collected the sample
@ eSubtype_fwd_primer_name
@ eSubtype_rev_primer_seq
sequence (possibly more than one; semicolon-separated)
@ eSubtype_isolation_source
@ eSubtype_environmental_sample
@ eSubtype_endogenous_virus_name
@ eSubtype_identified_by
name of person who identified the sample
@ eGenome_endogenous_virus
TSyn & SetSyn(void)
Assign a value to Syn data member.
void SetAllele(const TAllele &value)
Assign a value to Allele data member.
bool IsSetLocus_tag(void) const
Systematic gene name (e.g., MI0001, ORF0069). Check if a value has been assigned to Locus_tag data mem...
bool IsSetLocus(void) const
Official gene symbol. Check if a value has been assigned to Locus data member.
void SetLocus(const TLocus &value)
Assign a value to Locus data member.
void SetLocus_tag(const TLocus_tag &value)
Assign a value to Locus_tag data member.
void SetMaploc(const TMaploc &value)
Assign a value to Maploc data member.
const TLocus_tag & GetLocus_tag(void) const
Get the Locus_tag member data.
const TLocus & GetLocus(void) const
Get the Locus member data.
void SetDesc(const TDesc &value)
Assign a value to Desc data member.
bool IsStr(void) const
Check if variant Str is selected.
void SetTag(TTag &value)
Assign a value to Tag data member.
const TStr & GetStr(void) const
Get the variant data.
TStr & SetStr(void)
Select the variant.
void SetType(TType &value)
Assign a value to Type data member.
void SetDb(const TDb &value)
Assign a value to Db data member.
TId & SetId(void)
Select the variant.
void SetDiv(const TDiv &value)
Assign a value to Div data member.
void SetSubtype(TSubtype value)
Assign a value to Subtype data member.
void SetTaxname(const TTaxname &value)
Assign a value to Taxname data member.
list< CRef< COrgMod > > TMod
void SetGcode(TGcode value)
Assign a value to Gcode data member.
void SetMgcode(TMgcode value)
Assign a value to Mgcode data member.
TMod & SetMod(void)
Assign a value to Mod data member.
void SetOrgname(TOrgname &value)
Assign a value to Orgname data member.
void SetSubname(const TSubname &value)
Assign a value to Subname data member.
void SetLineage(const TLineage &value)
Assign a value to Lineage data member.
@ eSubtype_gb_acronym
used by taxonomy database
@ eSubtype_gb_synonym
used by taxonomy database
@ eSubtype_dosage
chromosome dosage of hybrid
@ eSubtype_nat_host
natural host of this specimen
@ eSubtype_metagenome_source