86 #include <unordered_set>
95 static string kAssemblyGap_feature =
"assembly_gap";
96 static string kGapType_qual =
"gap_type";
97 static string kLinkageEvidence_qual =
"linkage_evidence";
102 CSeq_descr::Tdata::iterator it = src.
SetDescr().Set().begin();
104 while(it != src.
SetDescr().Set().end())
106 switch ((**it).Which())
111 dest.
SetDescr().Set().push_back(*it);
122 dest.
SetDescr().Set().push_back(*it);
132 const char mapids[] = {
158 struct SSeqAnnotCompare
163 const char* m = mapids;
167 return strchr(m, c)-m;
175 return mapwhich(left->
GetData().
Which()) < mapwhich(right->GetData().Which());
183 if (!(**annot_it).IsFtable())
continue;
198 void FindMaximumId(
const CSeq_entry& entry,
int&
id)
202 FindMaximumId(entry.
GetAnnot(),
id);
212 FindMaximumId(**set_it,
id);
234 if ((**xref_it).IsSetData())
236 if ((**xref_it).GetData().IsProt() &&
237 (**xref_it).GetData().GetProt().IsSetName())
239 protein_name = (**xref_it).GetData().GetProt().GetName().front();
262 id->SetLocal().SetStr(id_label);
275 if (!hid || !it->IsBetter(hid)) {
284 return GetNewProteinId(seh.
GetScope(), id_base);
287 string NewProteinName(
const CSeq_feat& feature,
bool make_hypotethic)
293 if (protein_name.empty() && make_hypotethic)
295 protein_name =
"hypothetical protein";
308 for (
auto pId : pProtEntry->GetSeq().GetId()) {
331 seqid->Assign(*bioseq.
GetId().begin()->GetPointerOrNull());
333 best.push_back(orfs.front());
336 if ((**it).GetTotalRange().GetLength() >
337 best.front()->GetTotalRange().GetLength() )
353 for (
auto it: seq.
GetId()) {
365 for (
auto it: seq_ids) {
366 if (!BioseqHasId(bioseq, it))
368 bioseq.
SetId().push_back(it);
377 if (it->IsGenbank() || best.
Empty())
385 for (
auto it = seq_ftable.begin(); it != seq_ftable.end(); ++it) {
386 auto prot_feat = *it;
392 seq_ftable.erase(it);
408 ftable = &it->SetData().SetFtable();
416 ftable = &annot->SetData().SetFtable();
422 if (prot_feat.
Empty())
424 ftable->push_back(prot_feat);
426 prot_feat =
ftable->front();
430 int GetGenomicCodeOfBioseq(
const CBioseq& bioseq)
433 if (closest_biosource.
Empty())
454 auto append_nonduplicated_item = [](list<string>& current_list,
455 const list<string>& other_list)
457 unordered_set<string> current_set;
458 for (
const auto& item : current_list) {
459 current_set.insert(item);
462 for (
const auto& item : other_list) {
463 if (current_set.find(item) == current_set.end()) {
464 current_list.push_back(item);
470 append_nonduplicated_item(current_ref.
SetName(),
479 append_nonduplicated_item(current_ref.
SetEc(),
484 append_nonduplicated_item(current_ref.
SetActivity(),
489 for (
const auto& pDBtag : other_ref.
GetDb()) {
490 current_ref.
SetDb().push_back(pDBtag);
511 bool nameFromRNAProduct{
false };
516 nameFromRNAProduct =
true;
519 prot_ref.
SetName().push_back(product_name);
523 if (pMrna.
Empty() || nameFromRNAProduct) {
530 if (extName.empty()) {
535 for (
auto& protName : prot_ref.
SetName()) {
541 prot_ref.
SetName().push_back(extName);
553 bool was_extended =
false;
568 protein_entry->
SetSeq(*protein);
584 if (protein->
GetId().empty())
586 const string* protein_ids =
nullptr;
588 qual_to_remove =
"protein_id";
591 if (protein_ids->empty())
593 qual_to_remove =
"orig_protein_id";
597 if (protein_ids->empty())
603 if (protein_ids->empty())
609 if (protein_ids->empty()) {
614 MergeSeqIds(*protein, {
whole });
622 MergeSeqIds(*protein, new_ids);
631 if (protein->
GetId().empty())
634 if (!bioseq.
GetId().empty()) {
637 protein->
SetId().push_back(GetNewProteinId(*token.
scope, base_name));
640 for (
auto prot_id : protein->
GetId()) {
641 prot_feat = MoveParentProt(seq_ftable, *prot_id);
646 CreateOrSetFTable(*protein, prot_feat);
654 prot_ref.
SetName().push_back(
"hypothetical protein");
659 prot_feat->
SetLocation().SetInt().SetId().Assign(*GetAccessionId(protein->
GetId()));
664 cd_feature.
SetProduct().SetWhole().Assign(*GetAccessionId(protein->
GetId()));
684 auto& ext = mrna->
SetData().SetRna().SetExt();
686 (ext.IsName() && ext.SetName().empty()))
687 ext.SetName() = prot_ref.
GetName().front();
703 return protein_entry;
717 return *left < *right;
739 const CBioseq* pNucSeq=
nullptr;
741 const auto& bioseqSet = nuc_prot.
GetSet();
742 for (
const auto& pSubEntry : bioseqSet.GetSeq_set()) {
743 const auto& bioseq = pSubEntry->GetSeq();
754 inserter(proteinIds, proteinIds.
end()),
764 for (
auto pAnnot : pNucSeq->
GetAnnot()) {
765 if (pAnnot->IsFtable()) {
766 for (
auto pSeqFeat : pAnnot->GetData().GetFtable()) {
768 !pSeqFeat->IsSetData() ||
769 !pSeqFeat->GetData().IsCdregion()) {
773 if (!pSeqFeat->IsSetProduct() ||
774 !pSeqFeat->GetProduct().GetId() ||
775 proteinIds.
find(pSeqFeat->GetProduct().GetId())
776 == proteinIds.
end()) {
798 switch (entry.
Which())
836 switch(entry.
Which())
866 seq_ftable.sort(SSeqAnnotCompare());
867 auto feat_it = seq_ftable.begin();
868 while (feat_it != seq_ftable.end())
878 if (
data.IsCdregion())
880 if (!
data.GetCdregion().IsSetCode())
882 int code = GetGenomicCodeOfBioseq(*token.
bioseq);
886 data.SetCdregion().SetCode().SetId(
code);
888 if (!
data.GetCdregion().IsSetFrame())
914 set_ftable.push_back(feature);
915 feat_it = seq_ftable.erase(feat_it);
927 if (!entry.
IsSet() ||
932 auto entry_it = find_if(seq_set.begin(), seq_set.end(),
937 pEntry->GetSeq().IsSetInst() &&
938 pEntry->GetSeq().IsNa() &&
939 pEntry->GetSeq().IsSetAnnot());
942 if (entry_it == seq_set.end()) {
946 auto& bioseq = token.
bioseq;
947 bioseq.
Reset(&((*entry_it)->SetSeq()));
952 find_if(annots.begin(), annots.end(),
955 if (annot_it == annots.end()) {
959 auto main_ftable = *annot_it;
962 while (annot_it != annots.end()) {
963 auto pAnnot = *annot_it;
965 main_ftable->
SetData().SetFtable().splice(
966 end(main_ftable->SetData().SetFtable()),
967 pAnnot->
SetData().SetFtable());
968 annot_it = annots.erase(annot_it);
975 auto seq_ftable = main_ftable->SetData().SetFtable();
993 if (seq_ftable.empty()) {
994 bioseq->
SetAnnot().remove(main_ftable);
997 main_ftable->SetData().SetFtable() = move(seq_ftable);
1005 if (!set_ftable.empty()) {
1026 unique_ptr<CFastaReader> pReader(
new CFastaReader(0,
flags));
1027 pReader->SetPostponedMods({
"gene",
"allele"});
1038 if (
result->IsSetDescr())
1040 if (
result->GetDescr().Get().empty())
1043 result->SetSeq().ResetDescr();
1045 result->SetSet().ResetDescr();
1052 set->SetSet().SetSeq_set().push_back(
result);
1066 list<CConstRef<CBioseq>> proteins;
1067 if (possible_proteins.
IsSeq()) {
1068 proteins.emplace_back(&(possible_proteins.
GetSeq()));
1074 proteins.emplace_back(&(pSubEntry->GetSeq()));
1082 auto it = proteins.begin();
1083 while(it != proteins.end()) {
1085 it = proteins.erase(it);
1113 if ((**annot_it).IsFtable())
1117 if((**feat_it).CanGetData())
1119 switch ((**feat_it).GetData().Which())
1144 MoveSomeDescr(entry, bioseq);
1189 if (ival.
GetTo() < bioseqLength - 4) {
1192 ival.
SetTo(bioseqLength - 1);
1199 bool changed =
false;
1201 if ( partial5 && partial3 ) {
1203 }
else if ( partial5 ) {
1205 }
else if ( partial3 ) {
1229 if ((*annot_it)->IsFtable()) {
1241 if ((*feat_it)->IsSetData() && (*feat_it)->GetData().IsProt() && !(*feat_it)->GetData().GetProt().IsSetProcessed()) {
1242 prot_feat = *feat_it;
1248 prot_feat->
SetData().SetProt();
1249 ftable->SetData().SetFtable().push_back(prot_feat);
1252 prot_id->Assign(*(protein.
GetId().front()));
1253 prot_feat->
SetLocation().SetInt().SetId(*prot_id);
1258 if (partial5 || partial3) {
1283 if (
set &&
set->IsSetSeq_set()) {
1286 CBioseq_set::TDescr::Tdata::const_iterator it =
nuc->GetDescr().Get().begin();
1287 while (it !=
nuc->GetDescr().Get().end()) {
1288 if (!(*it)->IsMolinfo() && !(*it)->IsTitle()) {
1293 it =
nuc->GetDescr().Get().begin();
1320 for (; annot_ci; ++annot_ci) {
1321 if ((*annot_ci).IsFtable()) {
1337 m_feh = aeh.AddFeat(*m_Feat);
1346 const string& idString,
1348 objects::ILineErrorListener& logger)
1350 for (
const auto& modName : duplicateMods) {
1351 string message =
"Multiple '" + modName +
"' modifiers. Only the first will be used.";
1352 logger.PutError(*unique_ptr<CLineError>(
1354 "",
"",
"", message)));
1361 CBioseq& protein,
bool partial5,
bool partial3)
1365 const auto& proteinIds = pOriginalProtIds.empty() ?
1369 for (
auto pId : proteinIds) {
1370 const auto idString = pId->AsFastaString();
1372 const auto& modList = it->second.second;
1373 lineNumber = it->second.first;
1375 for (
const auto&
mod : modList) {
1376 if (!
smp.AddMods(
mod.GetName(),
mod.GetValue())) {
1386 if (!
smp.GetAllMods().empty()) {
1387 smp.ApplyAllMods(protein);
1389 smp.ApplyAllMods(
nuc->SetSeq(),
"", cds_loc);
1392 for (
auto pEntry :
nuc->SetSet().SetSeq_set()) {
1393 if (pEntry->IsSeq() && pEntry->GetSeq().IsNa()) {
1394 smp.ApplyAllMods(pEntry->SetSeq(),
"", cds_loc);
1407 for (
auto pId : protein.
GetId()) {
1413 else if (seh.
IsSet()) {
1415 if (bit->IsSynonym(*pId)) {
1431 if (nuc_count > 1) {
1442 const CSeq_loc& genomicLoc,
1449 auto alignment = prosplign.
FindAlignment(scope, proteinId, genomicLoc,
1461 if (!filter.
Match(*alignment)) {
1466 bool found_start_codon =
false;
1467 bool found_stop_codon =
false;
1468 list<CRef<CSeq_loc>> exonLocs;
1470 if (alignment->IsSetSegs() && alignment->GetSegs().IsSpliced()) {
1472 seq_id->
Assign(*(genomicLoc.GetId()));
1473 const auto& splicedSegs = alignment->GetSegs().GetSpliced();
1474 const bool isMinusStrand = (splicedSegs.IsSetGenomic_strand() &&
1477 for (
auto pExon : splicedSegs.GetExons()) {
1478 auto pExonLoc =
Ref(
new CSeq_loc(*seq_id,
1479 pExon->GetGenomic_start(),
1480 pExon->GetGenomic_end()));
1482 if (isMinusStrand) {
1484 }
else if (pExon->IsSetGenomic_strand()) {
1485 pExonLoc->SetStrand(pExon->GetGenomic_strand());
1487 exonLocs.push_back(pExonLoc);
1490 for (
auto pModifier : splicedSegs.GetModifiers()) {
1491 if (pModifier->IsStart_codon_found()) {
1492 found_start_codon = pModifier->GetStart_codon_found();
1494 if (pModifier->IsStop_codon_found()) {
1495 found_stop_codon = pModifier->GetStop_codon_found();
1500 if (exonLocs.empty()) {
1504 auto pCDSLoc =
Ref(
new CSeq_loc());
1505 if (exonLocs.size() == 1) {
1506 pCDSLoc->Assign(*(exonLocs.front()));
1509 pCDSLoc->SetMix().Set() = exonLocs;
1512 if (!found_start_codon) {
1516 if (found_stop_codon) {
1518 auto& finalInterval = pCDSLoc->IsMix() ?
1519 pCDSLoc->SetMix().Set().back()->SetInt() :
1532 pCds->SetLocation(loc);
1534 pCds->SetPartial(
true);
1536 pCds->SetData().SetCdregion();
1537 pCds->SetProduct().SetWhole(productId);
1554 bool id_match{
false};
1568 bioseq_id->
Assign(*(bsh_match.GetSeqId()));
1569 CRef<CSeq_loc> match_loc(
new CSeq_loc(*bioseq_id, 0, bsh_match.GetBioseqLength() - 1));
1575 pOriginalIds = move(protein_entry->
SetSeq().
SetId());
1578 protein_entry->
SetSeq().
SetId().push_back(product_id);
1589 string error =
"Unable to find coding region location for protein sequence " +
label +
".";
1601 protein_entry->
SetSeq(), partial5, partial3);
1603 AddSeqEntry(bsh_match.GetParentEntry(), protein_entry);
1605 auto new_cds =
s_MakeCDSFeat(*cds_loc, (partial5 || partial3),
1607 AddFeature(seh, new_cds);
1612 string title = protein_name;
1613 if (!org_name.empty())
1629 for (CBioseq::TAnnot::iterator annot_it = bioseq.
SetAnnot().begin(); annot_it != bioseq.
SetAnnot().end(); )
1631 if ((**annot_it).IsFtable() && (**annot_it).GetData().GetFtable().empty())
1633 annot_it = bioseq.
SetAnnot().erase(annot_it);
1649 return (feat.
GetNamedQual(
"estimated_length") ==
"unknown");
1655 const string& sGT = feature_gap.
GetNamedQual(kGapType_qual);
1669 gap_type = gap_type_info->
m_eType;
1676 const string& sLE_name = (**sLE_qual).GetQual();
1677 if (sLE_name != kLinkageEvidence_qual)
1683 if (it == linkage_evidence_to_value_map.
end())
1686 string(
"Unrecognized linkage evidence ") + (**sLE_qual).GetVal(),
1702 string(
"Linkage evidence must not be specified for ") + sGT,
1713 string(
"Linkage evidence must be specified for ") + sGT,
1726 evidences.
insert(evidence);
1732 string(
"Unrecognized gap type ") + sGT,
1756 for (
CBioseq_CI bioseq_it(seh); bioseq_it; ++bioseq_it)
1760 for (
CFeat_CI feature_it(*bioseq_it, annot_sel); feature_it; )
1762 if (feature_it->IsSetData() && feature_it->GetData().IsImp())
1764 const CImp_feat& imp = feature_it->GetData().GetImp();
1768 const CSeq_feat& feature_gap = feature_it->GetOriginalFeature();
1773 auto pBioseq =
const_cast<CBioseq*
>(bioseq_it->GetCompleteBioseq().GetPointer());
1779 "Failed to convert feature gap into a gap",
1798 CBioseq& bioseq = (
CBioseq&)*bioseq_it->GetEditHandle().GetCompleteBioseq();
1817 for (
auto pAnnot : bioseq.
SetAnnot()) {
1818 if (!pAnnot->IsSetData() ||
1824 auto&
ftable = pAnnot->SetData().SetFtable();
1825 auto fit =
ftable.begin();
1826 while (fit !=
ftable.end()) {
1827 auto pSeqFeat = *fit;
1828 if (pSeqFeat->IsSetData() &&
1829 pSeqFeat->GetData().IsImp() &&
1830 pSeqFeat->GetData().GetImp().IsSetKey() &&
1831 pSeqFeat->GetData().GetImp().GetKey() == kAssemblyGap_feature) {
1834 if (
MakeGap(bioseq, *pSeqFeat)) {
1839 "Failed to convert feature gap into a gap",
1862 CSeqTranslator::ChangeDeltaProteinToRawProtein(Ref(&bioseq));
1872 switch(loc.Which()) {
1874 return &loc.GetWhole();
1876 return &(loc.GetInt().GetId());
1878 return &(loc.GetPnt().GetId());
1880 if (!loc.GetPacked_int().Get().empty()) {
1881 return &(loc.GetPacked_int().Get().front()->GetId());
1885 if (loc.GetPacked_pnt().IsSetId()) {
1886 return &(loc.GetPacked_pnt().GetId());
1899 using TFeatIt = list<CRef<CSeq_feat>>::const_iterator;
1909 list<SRegionIterators>& its)
1912 for (
auto annot_it = annots.begin();
1913 annot_it != annots.end();
1916 const auto& annot = **annot_it;
1917 if (annot.IsFtable()) {
1918 const auto&
ftable = annot.GetData().GetFtable();
1919 list<SRegionIterators::TFeatIt> feat_its;
1920 for (
auto feat_it =
ftable.begin(); feat_it !=
ftable.end(); ++feat_it) {
1921 const auto& pFeat = *feat_it;
1922 if (pFeat->IsSetData() &&
1923 pFeat->GetData().IsRegion()) {
1924 feat_its.push_back(feat_it);
1927 if (!feat_its.empty()) {
1937 if (!seq_entry.
IsSet()) {
1941 auto& bioseq_set = seq_entry.
SetSet();
1943 if (!bioseq_set.IsSetClass() ||
1945 if (bioseq_set.IsSetSeq_set()) {
1946 for (
auto pEntry : bioseq_set.SetSeq_set()) {
1955 _ASSERT(bioseq_set.IsSetSeq_set());
1961 list<SRegionIterators> region_its;
1963 for (
auto pSubEntry : bioseq_set.SetSeq_set()) {
1965 auto& seq = pSubEntry->SetSeq();
1976 region_its.empty()) {
1981 pScope->AddTopLevelSeqEntry(seq_entry);
1984 for (
auto its : region_its) {
1985 for (
auto feat_it : its.feat_its) {
1986 auto pRegion = *feat_it;
1992 pRegion->SetLocation(*pMappedLoc);
1996 (*its.annot_it)->SetData().SetFtable().
erase(feat_it);
1999 if ((*its.annot_it)->GetData().GetFtable().empty()) {
2000 pNucSeq->
SetAnnot().erase(its.annot_it);
2008 for (
auto pSubEntry : bioseq_set.SetSeq_set()) {
2009 auto& bioseq = pSubEntry->SetSeq();
2010 if (bioseq.
IsNa()) {
2015 for (
auto pId : bioseq.
GetId()) {
2017 while (it != mapped_regions.
end() && (it->first->Compare(*pId) ==
CSeq_id::e_YES)) {
2023 it = mapped_regions.
erase(it);
2028 bioseq.
SetAnnot().push_back(pAnnot);
2031 if(mapped_regions.
empty()) {
2040 if (entry.
IsSeq()) {
2044 auto& bioseq_set = entry.
SetSet();
2045 if (!bioseq_set.IsSetSeq_set()) {
2049 bool any_change =
false;
2050 if (!bioseq_set.IsSetClass() ||
2052 for (
auto pSubEntry : bioseq_set.SetSeq_set()) {
User-defined methods of the data storage class.
User-defined methods of the data storage class.
@ eExtreme_Positional
numerical value
@ eExtreme_Biological
5' and 3'
User-defined methods of the data storage class.
void g_LogGeneralParsingError(EDiagSev sev, const string &idString, const string &msg, objects::ILineErrorListener &listener)
string GetIdHashOrValue(const string &base, int offset)
void transform(Container &c, UnaryFunction *op)
CAlignFilter exposes a query language for inspecting properties and scores placed on Seq-align object...
bool Match(const objects::CSeq_align &align)
Match a single alignment.
CSeqdesc & Set(bool skip_lookup=false)
int GetGenCode(int def=1) const
CConstRef< CSeqdesc > GetClosestDescriptor(CSeqdesc::E_Choice choice, int *level=NULL) const
TSeqPos GetLength(void) const
static bool ExtendToStopIfShortAndNotPartial(CSeq_feat &f, CBioseq_Handle bsh, bool check_for_stop=true)
Extends a coding region up to 50 nt.
static bool ParseCodeBreaks(CSeq_feat &feat, CScope &scope)
Parses all valid transl_except Gb-quals into code-breaks for cdregion, then removes the transl_except...
static bool ExtendStopPosition(CSeq_feat &f, const CSeq_feat *cdregion, size_t extension=0)
static CRef< CSeq_loc > GetProteinLocationFromNucleotideLocation(const CSeq_loc &nuc_loc, CScope &scope)
static bool MoveProteinSpecificFeats(CSeq_entry_Handle seh)
Moves protein-specific features from nucleotide sequences in the Seq-entry to the appropriate protein...
static bool LocationMayBeExtendedToMatch(const CSeq_loc &orig, const CSeq_loc &improved)
Checks whether it is possible to extend the original location up to improved one.
void xParseCdregions(objects::CSeq_entry &entry, TAsyncToken &)
objects::CFastaReader::TPostponedModMap m_PrtModMap
void AddProteins(const objects::CSeq_entry &possible_proteins, objects::CSeq_entry &entry)
void MoveRegionsToProteins(objects::CSeq_entry &entry)
void xMoveCdRegions(objects::CSeq_entry_Handle entry_h, objects::CSeq_annot::TData::TFtable &seq_ftable, objects::CSeq_annot::TData::TFtable &set_ftable, TAsyncToken &)
void FindOpenReadingFrame(objects::CSeq_entry &entry) const
bool xAddProteinToSeqEntry(const objects::CBioseq &protein, objects::CSeq_entry_Handle seh)
void ConvertNucSetToSet(CRef< objects::CSeq_entry > &entry) const
CFeatureTableReader(CTable2AsnContext &context)
CTable2AsnContext & m_context
void MakeGapsFromFeatures(objects::CSeq_entry_Handle seh) const
CRef< objects::CSeq_entry > ReadProtein(ILineReader &line_reader)
void ChangeDeltaProteinToRawProtein(objects::CSeq_entry &entry) const
void xConvertSeqIntoSeqSet(objects::CSeq_entry &entry, bool nuc_prod_set) const
static void RemoveEmptyFtable(objects::CBioseq &bioseq)
CRef< objects::CSeq_feat > x_AddProteinFeatureToProtein(CRef< objects::CSeq_entry > nuc, CConstRef< objects::CSeq_loc > cds_loc, const list< CRef< objects::CSeq_id >> &pOriginalProtIds, objects::CBioseq &protein, bool partial5, bool partial3)
CRef< objects::CDelta_seq > MakeGap(objects::CBioseq &bioseq, const objects::CSeq_feat &feature_gap) const
void xMergeCDSFeatures_impl(objects::CSeq_entry &, TAsyncToken &)
void MergeCDSFeatures(objects::CSeq_entry &, TAsyncToken &)
void MoveProteinSpecificFeats(objects::CSeq_entry &entry)
CRef< objects::CSeq_entry > m_replacement_protein
bool xCheckIfNeedConversion(const objects::CSeq_entry &entry) const
CRef< objects::CSeq_entry > xTranslateProtein(const objects::CBioseq &bioseq, objects::CSeq_feat &cd_feature, list< CRef< CSeq_feat >> &seq_ftable, TAsyncToken &)
CRef< CDelta_seq > CreateGap(CBioseq &bioseq, TSeqPos gap_start, TSeqPos gap_length)
@Imp_feat.hpp User-defined methods of the data storage class.
static CLineError * Create(EProblem eProblem, EDiagSev eSeverity, const std::string &strSeqId, unsigned int uLine, const std::string &strFeatureName=string(""), const std::string &strQualifierName=string(""), const std::string &strQualifierValue=string(""), const std::string &strErrorMessage=string(""), const TVecOfLines &vecOfOtherLines=TVecOfLines())
Use this because the constructor is protected.
vector< CRef< objects::CSeq_loc > > TLocVec
static CRef< objects::CSeq_annot > MakeCDSAnnot(const TLocVec &orfs, int genetic_code=1, objects::CSeq_id *id=NULL)
This version returns an annot full of CDS features.
static void FindOrfs(const string &seq, TLocVec &results, unsigned int min_length_bp=3, int genetic_code=1, const vector< string > &allowable_starts=vector< string >(), bool longest_orfs=true, size_t max_seq_gap=k_default_max_seq_gap)
Find ORFs in both orientations.
CProSplignOptions_Base & SetAltStarts(bool allow_alt_start)
Output filtering parameters.
@ ePassThrough
all zeroes - no filtering
@ eWithHoles
default filtering parameters
spliced protein to genomic alignment
CRef< objects::CSeq_align > FindAlignment(objects::CScope &scope, const objects::CSeq_id &protein, const objects::CSeq_loc &genomic, CProSplignOutputOptions output_options=CProSplignOutputOptions())
Aligns protein to a region on genomic sequence.
void GetLabel(string *label) const
bool IsFtable(void) const
@Seq_descr.hpp User-defined methods of the data storage class.
const TAnnot & GetAnnot(void) const
bool IsSetAnnot(void) const
void SetDescr(CSeq_descr &value)
list< CRef< CSeq_annot > > TAnnot
CSeq_entry * GetParentEntry(void) const
namespace ncbi::objects::
const CProt_ref * GetProtXref(void) const
get protein (if present) from Seq-feat.xref list
const string & GetNamedQual(const CTempString &qual_name) const
Return a named qualifier.
void RemoveQualifier(const string &qual_name)
Remove all qualifiers with the given name; do nothing if no such qualifier exists.
bool AddSeqFeatXref(const CSeqFeatXref::TId &id)
@ eLinkEvid_UnspecifiedOnly
only the "unspecified" linkage-evidence is allowed
@ eLinkEvid_Forbidden
no linkage-evidence is allowed
@ eLinkEvid_Required
any linkage-evidence is allowed, and at least one is required
static const SGapTypeInfo * NameToGapTypeInfo(const CTempString &sName)
From a gap-type string, get the SGapTypeInfo, insensitive to case, etc.
static bool GetOrgName(string &name, const objects::CSeq_entry &entry)
objects::ILineErrorListener * m_logger
bool m_use_hypothetic_protein
SPrtAlnOptions prtAlnOptions
static bool IsDBLink(const objects::CSeqdesc &desc)
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
@ eProblem_GeneralParsingError
Abstract base class for lightweight line-by-line reading.
container_type::const_iterator const_iterator
const_iterator end() const
const_iterator lower_bound(const key_type &key) const
const_iterator find(const key_type &key) const
iterator_bool insert(const value_type &val)
const_iterator find(const key_type &key) const
const_iterator end() const
void SetMolinfoForProtein(CRef< objects::CSeq_entry > protein, bool partial5, bool partial3)
CRef< objects::CSeq_feat > AddEmptyProteinFeatureToProtein(CRef< objects::CSeq_entry > protein, bool partial5, bool partial3)
bool SetMolinfoCompleteness(objects::CMolInfo &mi, bool partial5, bool partial3)
Operators to edit gaps in sequences.
static void s_SetProtRef(const CSeq_feat &cds, CConstRef< CSeq_feat > pMrna, CProt_ref &prot_ref)
static CBioseq_Handle s_MatchProteinById(const CBioseq &protein, CSeq_entry_Handle seh)
static void s_AppendProtRefInfo(CProt_ref &current_ref, const CProt_ref &other_ref)
static void s_ReportDuplicateMods(const set< string > &duplicateMods, const string &idString, TSeqPos lineNumber, objects::ILineErrorListener &logger)
static bool s_MoveProteinSpecificFeats(CSeq_entry &entry)
static bool s_TranslateCds(const CSeq_feat &cds, CScope &scope)
static CRef< CSeq_loc > s_GetCDSLoc(CScope &scope, const CSeq_id &proteinId, const CSeq_loc &genomicLoc, TSeqPos bioseqLength, const CTable2AsnContext::SPrtAlnOptions &prtAlnOptions)
static bool s_HasUnprocessedCdregions(const CSeq_entry &nuc_prot)
static CRef< CSeq_feat > s_MakeCDSFeat(CSeq_loc &loc, bool isPartial, CSeq_id &productId)
static const CSeq_id * s_GetIdFromLocation(const CSeq_loc &loc)
static bool s_UnknownEstimatedLength(const CSeq_feat &feat)
static CBioseq_Handle s_GetSingleNucSeq(CSeq_entry_Handle seh)
static void s_GatherRegionIterators(list< CRef< CSeq_annot >> &annots, list< SRegionIterators > &its)
unsigned int TSeqPos
Type for sequence locations and lengths.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
const TSeqPos kInvalidSeqPos
Define special value for invalid sequence position.
@ eDiag_Error
Error message.
const string & GetMsg(void) const
Get message string.
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
CNcbiIos & MSerial_VerifyNo(CNcbiIos &io)
#define ENUM_METHOD_NAME(EnumName)
virtual const CTypeInfo * GetThisTypeInfo(void) const =0
#define MSerial_AsnText
I/O stream manipulators.
static string CanonicalizeString(const CTempString &sValue)
@ fAddMods
Parse defline mods and add to SeqEntry.
@ fNoUserObjs
Don't save raw deflines in User-objects.
@ fForceType
Force specified type regardless of accession.
@ fAssumeProt
Assume prots unless accns indicate otherwise.
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
static SIZE_TYPE ParseIDs(CBioseq::TId &ids, const CTempString &s, TParseFlags flags=fParse_Default)
Parse a string representing one or more Seq-ids, appending the results to IDS.
void GetLabel(string *label, ELabelType type=eDefault, TLabelFlags flags=fLabel_Default) const
Append a label for this Seq-id to the supplied string.
CConstRef< CSeq_id > GetSeqId(void) const
E_SIC Compare(const CSeq_id &sid2) const
Compare() - more general.
@ e_YES
SeqIds compared, and are equivalent.
@ fParse_PartialOK
Warn rather than throwing an exception when a FASTA-style ID set contains unparsable portions,...
@ fParse_ValidLocal
Treat otherwise unidentified strings as raw accessions, provided that they pass rudimentary validatio...
@ eContent
Untagged human-readable accession or the like.
TSeqPos GetStart(ESeqLocExtremes ext) const
Return start and stop positions of the seq-loc.
const CSeq_id * GetId(void) const
Get the id of the location return NULL if has multiple ids or no id at all.
TSeqPos GetStop(ESeqLocExtremes ext) const
bool CopyFeaturePartials(CSeq_feat &dst, const CSeq_feat &src)
CopyFeaturePartials A function to copy the start and end partialness from one feature to another.
bool AdjustProteinMolInfoToMatchCDS(CMolInfo &molinfo, const CSeq_feat &cds)
AdjustProteinMolInfoToMatchCDS A function to change an existing MolInfo to match a coding region.
bool IsPseudo(const CSeq_feat &feat, CScope &scope)
Determines whether given feature is pseudo, using gene associated with feature if necessary Checks to...
static CRef< CBioseq > TranslateToProtein(const CSeq_feat &cds, CScope &scope)
string GetProteinName(const CBioseq_Handle &seq)
Return protein name from corresponding Prot-ref feature.
static CCdregion::EFrame FindBestFrame(const CSeq_feat &cds, CScope &scope)
Find "best" frame for a coding region.
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CSeq_entry_Handle AddTopLevelSeqEntry(CSeq_entry &top_entry, TPriority pri=kPriority_Default, EExist action=eExist_Default)
Add seq_entry, default priority is higher than for defaults or loaders Add object to the score with p...
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
void AddDefaults(TPriority pri=kPriority_Default)
Add default data loaders from object manager.
void RemoveTopLevelSeqEntry(const CTSE_Handle &entry)
Revoke TSE previously added using AddTopLevelSeqEntry() or AddBioseq().
CSeq_entry_EditHandle GetSeq_entryEditHandle(const CSeq_entry &entry)
CBioseq_set_EditHandle GetEditHandle(void) const
Get 'edit' version of handle.
vector< CSeq_id_Handle > TId
TClass GetClass(void) const
CRef< CSeqdesc > RemoveSeqdesc(const CSeqdesc &v) const
CBioseq_set_Handle GetParentBioseq_set(void) const
Return a handle for the parent Bioseq-set, or null handle.
CBioseq_set_EditHandle GetParentBioseq_set(void) const
Get parent bioseq-set edit handle.
CSeq_annot_EditHandle AttachAnnot(CSeq_annot &annot) const
Attach an annotation.
TSet ConvertSeqToSet(TClass set_class=CBioseq_set::eClass_not_set) const
Convert the entry from Bioseq to Bioseq-set.
CConstRef< CBioseq_set > GetCompleteBioseq_set(void) const
Return the complete bioseq-set object.
CBioseq_Handle GetBioseqHandle(const CSeq_id &id) const
Get Bioseq handle from the TSE of this Seq-entry.
CSeq_entry_EditHandle GetEditHandle(void) const
Get 'edit' version of handle.
CSeq_entry_Handle GetTopLevelEntry(void) const
Get top level Seq-entry handle.
CSeq_entry_Handle GetParentEntry(void) const
Return a handle for the parent seq-entry of the bioseq.
CSeq_entry_EditHandle AttachEntry(CSeq_entry &entry, int index=-1) const
Attach an existing seq-entry.
bool IsSetClass(void) const
CConstRef< CSeq_entry > GetCompleteSeq_entry(void) const
Complete and get const reference to the seq-entry.
CScope & GetScope(void) const
Get scope this handle belongs to.
CSeq_entry_EditHandle AttachEntry(CSeq_entry &entry, int index=-1) const
Attach an existing seq-entry.
CConstRef< TObject > GetCompleteObject(void) const
CSeq_entry_Handle GetParentEntry(void) const
Get parent Seq-entry handle.
bool AddSeqdesc(CSeqdesc &v) const
const TId & GetId(void) const
int GetSeq_entry_Index(const CSeq_entry_Handle &handle) const
bool IsSynonym(const CSeq_id &id) const
Check if this id can be used to obtain this bioseq handle.
bool Empty(void) const THROWS_NONE
Check if CConstRef is empty – not pointing to any object which means having a null value.
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
void Reset(void)
Reset reference object.
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
TObjectType * GetPointerOrNull(void) const THROWS_NONE
Get pointer value.
TObjectType * GetPointerOrNull(void) THROWS_NONE
Get pointer value.
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
IO_PREFIX::ofstream CNcbiOfstream
Portable alias for ofstream.
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
virtual bool IsType(TTypeInfo type) const
static const char label[]
bool IsSetLocus_tag(void) const
systematic gene name (e.g., MI0001, ORF0069) Check if a value has been assigned to Locus_tag data mem...
const TLocus_tag & GetLocus_tag(void) const
Get the Locus_tag member data.
bool IsId(void) const
Check if variant Id is selected.
TId GetId(void) const
Get the variant data.
TActivity & SetActivity(void)
Assign a value to Activity data member.
bool IsSetDesc(void) const
description (instead of name) Check if a value has been assigned to Desc data member.
const TDb & GetDb(void) const
Get the Db member data.
const TActivity & GetActivity(void) const
Get the Activity member data.
TEc & SetEc(void)
Assign a value to Ec data member.
const TName & GetName(void) const
Get the Name member data.
bool IsSetDb(void) const
ids in other dbases Check if a value has been assigned to Db data member.
bool IsSetEc(void) const
E.C.
void SetDesc(const TDesc &value)
Assign a value to Desc data member.
TProcessed GetProcessed(void) const
Get the Processed member data.
void SetProcessed(TProcessed value)
Assign a value to Processed data member.
bool IsSetName(void) const
protein name Check if a value has been assigned to Name data member.
const TDesc & GetDesc(void) const
Get the Desc member data.
bool IsSetActivity(void) const
activities Check if a value has been assigned to Activity data member.
const TEc & GetEc(void) const
Get the Ec member data.
TDb & SetDb(void)
Assign a value to Db data member.
TName & SetName(void)
Assign a value to Name data member.
bool IsSetExt(void) const
Generic fields for ncRNA, tmRNA, miscRNA. Check if a value has been assigned to Ext data member.
const TName & GetName(void) const
Get the variant data.
const TExt & GetExt(void) const
Get the Ext member data.
bool IsName(void) const
Check if variant Name is selected.
@ e_not_set
No variant selected.
TXref & SetXref(void)
Assign a value to Xref data member.
const TKey & GetKey(void) const
Get the Key member data.
void ResetPartial(void)
Reset Partial data member.
bool IsSetData(void) const
The specific data. Check if a value has been assigned to Data data member.
E_Choice Which(void) const
Which variant is currently selected.
bool IsProt(void) const
Check if variant Prot is selected.
void SetLocation(TLocation &value)
Assign a value to Location data member.
bool IsCdregion(void) const
Check if variant Cdregion is selected.
void SetPartial(TPartial value)
Assign a value to Partial data member.
void SetProduct(TProduct &value)
Assign a value to Product data member.
const TQual & GetQual(void) const
Get the Qual member data.
bool IsSetKey(void) const
Check if a value has been assigned to Key data member.
const TId & GetId(void) const
Get the Id member data.
const TLocal & GetLocal(void) const
Get the variant data.
bool IsSetXref(void) const
Cite other relevant features. Check if a value has been assigned to Xref data member.
const TLocation & GetLocation(void) const
Get the Location member data.
bool IsLocal(void) const
Check if variant Local is selected.
bool IsGene(void) const
Check if variant Gene is selected.
const TData & GetData(void) const
Get the Data member data.
const TExcept_text & GetExcept_text(void) const
Get the Except_text member data.
bool IsSetExcept_text(void) const
Explain if except=TRUE. Check if a value has been assigned to Except_text data member.
void SetData(TData &value)
Assign a value to Data data member.
bool IsSetId(void) const
Check if a value has been assigned to Id data member.
const TProduct & GetProduct(void) const
Get the Product member data.
const TGene & GetGene(void) const
Get the variant data.
const TProt & GetProt(void) const
Get the variant data.
const TXref & GetXref(void) const
Get the Xref member data.
vector< CRef< CSeqFeatXref > > TXref
vector< CRef< CGb_qual > > TQual
const TRna & GetRna(void) const
Get the variant data.
bool IsSetProduct(void) const
Product of process. Check if a value has been assigned to Product data member.
bool IsSetLocation(void) const
Feature made from. Check if a value has been assigned to Location data member.
@ e_Het
cofactor, prosthetic grp, etc, bound to seq
@ e_not_set
No variant selected.
@ e_Region
named region (globin locus)
@ e_Seq
to annotate origin from another seq
@ e_Txinit
transcription initiation
@ e_Num
a numbering system
@ e_Pub
publication applies to this seq
@ e_User
user defined structure
@ e_Rsite
restriction site (for maps really)
@ e_Comment
just a comment
@ e_Non_std_residue
non-standard residue here in seq
void SetTo(TTo value)
Assign a value to To data member.
const TWhole & GetWhole(void) const
Get the variant data.
TFrom GetFrom(void) const
Get the From member data.
void SetFrom(TFrom value)
Assign a value to From data member.
bool IsSetStrand(void) const
Check if a value has been assigned to Strand data member.
TStrand GetStrand(void) const
Get the Strand member data.
TTo GetTo(void) const
Get the To member data.
bool IsWhole(void) const
Check if variant Whole is selected.
const TSeq & GetSeq(void) const
Get the variant data.
bool IsSetClass(void) const
Check if a value has been assigned to Class data member.
TSet & SetSet(void)
Select the variant.
TClass GetClass(void) const
Get the Class member data.
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
const TSet & GetSet(void) const
Get the variant data.
bool IsSeq(void) const
Check if variant Seq is selected.
bool IsSetSeq_set(void) const
Check if a value has been assigned to Seq_set data member.
E_Choice Which(void) const
Which variant is currently selected.
bool IsSet(void) const
Check if variant Set is selected.
const TSeq_set & GetSeq_set(void) const
Get the Seq_set member data.
void SetClass(TClass value)
Assign a value to Class data member.
list< CRef< CSeq_entry > > TSeq_set
TSeq & SetSeq(void)
Select the variant.
TSeq_set & SetSeq_set(void)
Assign a value to Seq_set data member.
@ eClass_nuc_prot
nuc acid and coded proteins
@ eClass_gen_prod_set
genomic products, chrom+mRNA+protein
@ eClass_genbank
converted genbank
void SetCompleteness(TCompleteness value)
Assign a value to Completeness data member.
void SetData(TData &value)
Assign a value to Data data member.
bool IsSetCompleteness(void) const
Check if a value has been assigned to Completeness data member.
TId & SetId(void)
Assign a value to Id data member.
void ResetId(void)
Reset Id data member.
const TInst & GetInst(void) const
Get the Inst member data.
TTitle & SetTitle(void)
Select the variant.
bool IsSetAnnot(void) const
Check if a value has been assigned to Annot data member.
bool IsSetRepr(void) const
Check if a value has been assigned to Repr data member.
bool IsSetMol(void) const
Check if a value has been assigned to Mol data member.
const TSource & GetSource(void) const
Get the variant data.
bool IsSetBiomol(void) const
Check if a value has been assigned to Biomol data member.
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
const TAnnot & GetAnnot(void) const
Get the Annot member data.
const TId & GetId(void) const
Get the Id member data.
void ResetAnnot(void)
Reset Annot data member.
bool IsSetInst(void) const
The sequence data. Check if a value has been assigned to Inst data member.
TLength GetLength(void) const
Get the Length member data.
list< CRef< CSeq_id > > TId
void SetInst(TInst &value)
Assign a value to Inst data member.
virtual void Reset(void)
Reset the whole object.
void SetBiomol(TBiomol value)
Assign a value to Biomol data member.
void SetDescr(TDescr &value)
Assign a value to Descr data member.
TCompleteness GetCompleteness(void) const
Get the Completeness member data.
list< CRef< CSeq_feat > > TFtable
bool IsSetId(void) const
Equivalent identifiers. Check if a value has been assigned to Id data member.
void SetTech(TTech value)
Assign a value to Tech data member.
TMolinfo & SetMolinfo(void)
Select the variant.
@ eCompleteness_complete
complete biological entity
@ eCompleteness_no_left
missing 5' or NH3 end
@ eCompleteness_no_right
missing 3' or COOH end
@ eCompleteness_no_ends
missing both ends
@ eTech_concept_trans
conceptual translation
@ e_User
user defined object
@ e_Update_date
date of last update
@ e_Pub
a reference to the publication
@ e_Molinfo
info on the molecule and techniques
@ e_Create_date
date entry first created/released
@ e_Title
a title for this sequence
@ e_Source
source of materials, includes Org-ref
@ eMol_na
just a nucleic acid
bm::gap_word_t gap_length(const bm::gap_word_t *buf) noexcept
Returns GAP block length.
where both are integers</td>\n<td></td>\n</tr>\n<tr>\n<td>tse</td>\n<td>optional</td>\n<td>String</td>\n<td class=\"description\">TSE option controls what blob is whole
void VisitAllBioseqs(objects::CSeq_entry &entry, _M &&m)
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Utility macros and typedefs for exploring NCBI objects from seq.asn.
Utility macros and typedefs for exploring NCBI objects from seqset.asn.
static SLJIT_INLINE sljit_ins l(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
Holds information about a given gap-type string.
CSeq_gap::EType m_eType
The underlying type that the string corresponds to.
ELinkEvid m_eLinkEvid
Indicates what linkage-evidences are compatible with this.
Compare objects pointed to by (smart) pointer.
bool operator()(const CSeq_id *const left, const CSeq_id *const right) const
list< CRef< CSeq_feat > >::const_iterator TFeatIt
list< CRef< CSeq_annot > >::iterator TAnnotIt
CRef< objects::CBioseq > bioseq
CRef< objects::CSeq_feat > ParentGene(const objects::CSeq_feat &cds)
CRef< objects::CSeq_feat > ParentMrna(const objects::CSeq_feat &cds)
static void s_ExtendIntervalToEnd(objects::CSeq_interval &ival, objects::CBioseq_Handle bsh)
bool AssignLocalIdIfEmpty(CSeq_feat &feature, int &id)
static CS_CONTEXT * context