86 #include <unordered_set>
// Imp-feature key that identifies an assembly-gap feature; compared against GetImp().GetKey() when scanning feature tables.
95 static string kAssemblyGap_feature =
"assembly_gap";
// Qualifier name passed to GetNamedQual() to read the gap type of a gap feature.
96 static string kGapType_qual =
"gap_type";
// Qualifier name used to recognise linkage-evidence qualifiers while iterating a gap feature's qualifiers.
97 static string kLinkageEvidence_qual =
"linkage_evidence";
102 CSeq_descr::Tdata::iterator it = src.
SetDescr().Set().begin();
104 while(it != src.
SetDescr().Set().end())
106 switch ((**it).Which())
111 dest.
SetDescr().Set().push_back(*it);
122 dest.
SetDescr().Set().push_back(*it);
132 const char mapids[] = {
158 struct SSeqAnnotCompare
163 const char* m = mapids;
167 return strchr(m, c)-m;
175 return mapwhich(left->
GetData().
Which()) < mapwhich(right->GetData().Which());
183 if (!(**annot_it).IsFtable())
continue;
198 void FindMaximumId(
const CSeq_entry& entry,
int&
id)
202 FindMaximumId(entry.
GetAnnot(),
id);
212 FindMaximumId(**set_it,
id);
234 if ((**xref_it).IsSetData())
236 if ((**xref_it).GetData().IsProt() &&
237 (**xref_it).GetData().GetProt().IsSetName())
239 protein_name = (**xref_it).GetData().GetProt().GetName().front();
262 id->SetLocal().SetStr(id_label);
275 if (!hid || !it->IsBetter(hid)) {
284 return GetNewProteinId(seh.
GetScope(), id_base);
287 string NewProteinName(
const CSeq_feat& feature,
bool make_hypotethic)
293 if (protein_name.empty() && make_hypotethic)
295 protein_name =
"hypothetical protein";
308 for (
auto pId : pProtEntry->GetSeq().GetId()) {
331 seqid->Assign(*bioseq.
GetId().begin()->GetPointerOrNull());
333 best.push_back(orfs.front());
336 if ((**it).GetTotalRange().GetLength() >
337 best.front()->GetTotalRange().GetLength() )
353 for (
auto it: seq.
GetId()) {
365 for (
auto it: seq_ids) {
366 if (!BioseqHasId(bioseq, it))
368 bioseq.
SetId().push_back(it);
377 if (it->IsGenbank() || best.
Empty())
385 for (
auto it = seq_ftable.begin(); it != seq_ftable.end(); ++it) {
386 auto prot_feat = *it;
392 seq_ftable.erase(it);
408 ftable = &it->SetData().SetFtable();
416 ftable = &annot->SetData().SetFtable();
422 if (prot_feat.
Empty())
424 ftable->push_back(prot_feat);
426 prot_feat =
ftable->front();
430 int GetGenomicCodeOfBioseq(
const CBioseq& bioseq)
433 if (closest_biosource.
Empty())
454 auto append_nonduplicated_item = [](list<string>& current_list,
455 const list<string>& other_list)
457 unordered_set<string> current_set;
458 for (
const auto& item : current_list) {
459 current_set.insert(item);
462 for (
const auto& item : other_list) {
463 if (current_set.find(item) == current_set.end()) {
464 current_list.push_back(item);
470 append_nonduplicated_item(current_ref.
SetName(),
479 append_nonduplicated_item(current_ref.
SetEc(),
484 append_nonduplicated_item(current_ref.
SetActivity(),
489 for (
const auto& pDBtag : other_ref.
GetDb()) {
490 current_ref.
SetDb().push_back(pDBtag);
513 const string& product_name = cds.
GetNamedQual(
"product");
515 prot_ref.
SetName().push_back(product_name);
524 for (
auto& prot_name : prot_ref.
SetName()) {
543 bool was_extended =
false;
558 protein_entry->
SetSeq(*protein);
574 if (protein->
GetId().empty())
576 const string* protein_ids =
nullptr;
578 qual_to_remove =
"protein_id";
581 if (protein_ids->empty())
583 qual_to_remove =
"orig_protein_id";
587 if (protein_ids->empty())
593 if (protein_ids->empty())
599 if (protein_ids->empty()) {
604 MergeSeqIds(*protein, {
whole });
612 MergeSeqIds(*protein, new_ids);
621 if (protein->
GetId().empty())
624 if (!bioseq.
GetId().empty()) {
627 protein->
SetId().push_back(GetNewProteinId(*token.
scope, base_name));
630 for (
auto prot_id : protein->
GetId()) {
631 prot_feat = MoveParentProt(seq_ftable, *prot_id);
636 CreateOrSetFTable(*protein, prot_feat);
644 prot_ref.
SetName().push_back(
"hypothetical protein");
649 prot_feat->
SetLocation().SetInt().SetId().Assign(*GetAccessionId(protein->
GetId()));
653 cd_feature.
SetProduct().SetWhole().Assign(*GetAccessionId(protein->
GetId()));
673 auto& ext = mrna->
SetData().SetRna().SetExt();
675 (ext.IsName() && ext.SetName().empty()))
676 ext.SetName() = prot_ref.
GetName().front();
692 return protein_entry;
706 return *left < *right;
718 const CBioseq* pNucSeq=
nullptr;
720 const auto& bioseqSet = nuc_prot.
GetSet();
721 for (
const auto& pSubEntry : bioseqSet.GetSeq_set()) {
722 const auto& bioseq = pSubEntry->GetSeq();
733 inserter(proteinIds, proteinIds.
end()),
743 for (
auto pAnnot : pNucSeq->
GetAnnot()) {
744 if (pAnnot->IsFtable()) {
745 for (
auto pSeqFeat : pAnnot->GetData().GetFtable()) {
747 !pSeqFeat->IsSetData() ||
748 !pSeqFeat->GetData().IsCdregion()) {
752 if (!pSeqFeat->IsSetProduct() ||
753 !pSeqFeat->GetProduct().GetId() ||
754 proteinIds.
find(pSeqFeat->GetProduct().GetId())
755 == proteinIds.
end()) {
777 switch (entry.
Which())
815 switch(entry.
Which())
845 seq_ftable.sort(SSeqAnnotCompare());
846 auto feat_it = seq_ftable.begin();
847 while (feat_it != seq_ftable.end())
861 int code = GetGenomicCodeOfBioseq(*token.
bioseq);
893 set_ftable.push_back(feature);
894 feat_it = seq_ftable.erase(feat_it);
906 if (!entry.
IsSet() ||
911 auto entry_it = find_if(seq_set.begin(), seq_set.end(),
916 pEntry->GetSeq().IsSetInst() &&
917 pEntry->GetSeq().IsNa() &&
918 pEntry->GetSeq().IsSetAnnot());
921 if (entry_it == seq_set.end()) {
925 auto& bioseq = token.
bioseq;
926 bioseq.
Reset(&((*entry_it)->SetSeq()));
931 find_if(annots.begin(), annots.end(),
934 if (annot_it == annots.end()) {
938 auto main_ftable = *annot_it;
941 while (annot_it != annots.end()) {
942 auto pAnnot = *annot_it;
944 main_ftable->
SetData().SetFtable().splice(
945 end(main_ftable->SetData().SetFtable()),
946 pAnnot->
SetData().SetFtable());
947 annot_it = annots.erase(annot_it);
954 auto seq_ftable = main_ftable->SetData().SetFtable();
972 if (seq_ftable.empty()) {
973 bioseq->
SetAnnot().remove(main_ftable);
976 main_ftable->SetData().SetFtable() = move(seq_ftable);
984 if (!set_ftable.empty()) {
1005 unique_ptr<CFastaReader> pReader(
new CFastaReader(0,
flags));
1006 pReader->SetPostponedMods({
"gene",
"allele"});
1017 if (
result->IsSetDescr())
1019 if (
result->GetDescr().Get().empty())
1022 result->SetSeq().ResetDescr();
1024 result->SetSet().ResetDescr();
1031 set->SetSet().SetSeq_set().push_back(
result);
1045 list<CConstRef<CBioseq>> proteins;
1046 if (possible_proteins.
IsSeq()) {
1047 proteins.emplace_back(&(possible_proteins.
GetSeq()));
1053 proteins.emplace_back(&(pSubEntry->GetSeq()));
1061 auto it = proteins.begin();
1062 while(it != proteins.end()) {
1064 it = proteins.erase(it);
1092 if ((**annot_it).IsFtable())
1096 if((**feat_it).CanGetData())
1098 switch ((**feat_it).GetData().Which())
1123 MoveSomeDescr(entry, bioseq);
1168 if (ival.
GetTo() < bioseqLength - 4) {
1171 ival.
SetTo(bioseqLength - 1);
1178 bool changed =
false;
1180 if ( partial5 && partial3 ) {
1182 }
else if ( partial5 ) {
1184 }
else if ( partial3 ) {
1208 if ((*annot_it)->IsFtable()) {
1220 if ((*feat_it)->IsSetData() && (*feat_it)->GetData().IsProt() && !(*feat_it)->GetData().GetProt().IsSetProcessed()) {
1221 prot_feat = *feat_it;
1227 prot_feat->
SetData().SetProt();
1228 ftable->SetData().SetFtable().push_back(prot_feat);
1231 prot_id->Assign(*(protein.
GetId().front()));
1232 prot_feat->
SetLocation().SetInt().SetId(*prot_id);
1237 if (partial5 || partial3) {
1262 if (
set &&
set->IsSetSeq_set()) {
1265 CBioseq_set::TDescr::Tdata::const_iterator it =
nuc->GetDescr().Get().begin();
1266 while (it !=
nuc->GetDescr().Get().end()) {
1267 if (!(*it)->IsMolinfo() && !(*it)->IsTitle()) {
1272 it =
nuc->GetDescr().Get().begin();
1299 for (; annot_ci; ++annot_ci) {
1300 if ((*annot_ci).IsFtable()) {
1316 m_feh = aeh.AddFeat(*m_Feat);
1325 const string& idString,
1327 objects::ILineErrorListener& logger)
1329 for (
const auto& modName : duplicateMods) {
1330 string message =
"Multiple '" + modName +
"' modifiers. Only the first will be used.";
1331 logger.PutError(*unique_ptr<CLineError>(
1333 "",
"",
"", message)));
1340 CBioseq& protein,
bool partial5,
bool partial3)
1344 const auto& proteinIds = pOriginalProtIds.empty() ?
1348 for (
auto pId : proteinIds) {
1349 const auto idString = pId->AsFastaString();
1351 const auto& modList = it->second.second;
1352 lineNumber = it->second.first;
1354 for (
const auto&
mod : modList) {
1371 for (
auto pEntry :
nuc->SetSet().SetSeq_set()) {
1372 if (pEntry->IsSeq() && pEntry->GetSeq().IsNa()) {
1386 for (
auto pId : protein.
GetId()) {
1392 else if (seh.
IsSet()) {
1394 if (bit->IsSynonym(*pId)) {
1410 if (nuc_count > 1) {
1421 const CSeq_loc& genomicLoc,
1428 auto alignment = prosplign.
FindAlignment(scope, proteinId, genomicLoc,
1440 if (!filter.
Match(*alignment)) {
1445 bool found_start_codon =
false;
1446 bool found_stop_codon =
false;
1447 list<CRef<CSeq_loc>> exonLocs;
1449 if (alignment->IsSetSegs() && alignment->GetSegs().IsSpliced()) {
1451 seq_id->
Assign(*(genomicLoc.GetId()));
1452 const auto& splicedSegs = alignment->GetSegs().GetSpliced();
1453 const bool isMinusStrand = (splicedSegs.IsSetGenomic_strand() &&
1456 for (
auto pExon : splicedSegs.GetExons()) {
1457 auto pExonLoc =
Ref(
new CSeq_loc(*seq_id,
1458 pExon->GetGenomic_start(),
1459 pExon->GetGenomic_end()));
1461 if (isMinusStrand) {
1463 }
else if (pExon->IsSetGenomic_strand()) {
1464 pExonLoc->SetStrand(pExon->GetGenomic_strand());
1466 exonLocs.push_back(pExonLoc);
1469 for (
auto pModifier : splicedSegs.GetModifiers()) {
1470 if (pModifier->IsStart_codon_found()) {
1471 found_start_codon = pModifier->GetStart_codon_found();
1473 if (pModifier->IsStop_codon_found()) {
1474 found_stop_codon = pModifier->GetStop_codon_found();
1479 if (exonLocs.empty()) {
1483 auto pCDSLoc =
Ref(
new CSeq_loc());
1484 if (exonLocs.size() == 1) {
1485 pCDSLoc->Assign(*(exonLocs.front()));
1488 pCDSLoc->SetMix().Set() = exonLocs;
1491 if (!found_start_codon) {
1495 if (found_stop_codon) {
1497 auto& finalInterval = pCDSLoc->IsMix() ?
1498 pCDSLoc->SetMix().Set().back()->SetInt() :
1511 pCds->SetLocation(loc);
1513 pCds->SetPartial(
true);
1515 pCds->SetData().SetCdregion();
1516 pCds->SetProduct().SetWhole(productId);
1533 bool id_match{
false};
1547 bioseq_id->
Assign(*(bsh_match.GetSeqId()));
1548 CRef<CSeq_loc> match_loc(
new CSeq_loc(*bioseq_id, 0, bsh_match.GetBioseqLength() - 1));
1554 pOriginalIds = move(protein_entry->
SetSeq().
SetId());
1557 protein_entry->
SetSeq().
SetId().push_back(product_id);
1568 string error =
"Unable to find coding region location for protein sequence " +
label +
".";
1580 protein_entry->
SetSeq(), partial5, partial3);
1582 AddSeqEntry(bsh_match.GetParentEntry(), protein_entry);
1584 auto new_cds =
s_MakeCDSFeat(*cds_loc, (partial5 || partial3),
1586 AddFeature(seh, new_cds);
1591 string title = protein_name;
1592 if (!org_name.empty())
1608 for (CBioseq::TAnnot::iterator annot_it = bioseq.
SetAnnot().begin(); annot_it != bioseq.
SetAnnot().end(); )
1610 if ((**annot_it).IsFtable() && (**annot_it).GetData().GetFtable().empty())
1612 annot_it = bioseq.
SetAnnot().erase(annot_it);
1628 return (feat.
GetNamedQual(
"estimated_length") ==
"unknown");
1634 const string& sGT = feature_gap.
GetNamedQual(kGapType_qual);
1648 gap_type = gap_type_info->
m_eType;
1651 linkage_evidence_to_value_map = CLinkage_evidence::GetTypeInfo_enum_EType()->NameToValue();
1655 const string& sLE_name = (**sLE_qual).GetQual();
1656 if (sLE_name != kLinkageEvidence_qual)
1662 if (it == linkage_evidence_to_value_map.
end())
1665 string(
"Unrecognized linkage evidence ") + (**sLE_qual).GetVal(),
1681 string(
"Linkage evidence must not be specified for ") + sGT,
1692 string(
"Linkage evidence must be specified for ") + sGT,
1705 evidences.
insert(evidence);
1711 string(
"Unrecognized gap type ") + sGT,
1735 for (
CBioseq_CI bioseq_it(seh); bioseq_it; ++bioseq_it)
1739 for (
CFeat_CI feature_it(*bioseq_it, annot_sel); feature_it; )
1741 if (feature_it->IsSetData() && feature_it->GetData().IsImp())
1743 const CImp_feat& imp = feature_it->GetData().GetImp();
1747 const CSeq_feat& feature_gap = feature_it->GetOriginalFeature();
1752 auto pBioseq =
const_cast<CBioseq*
>(bioseq_it->GetCompleteBioseq().GetPointer());
1758 "Failed to convert feature gap into a gap",
1777 CBioseq& bioseq = (
CBioseq&)*bioseq_it->GetEditHandle().GetCompleteBioseq();
1796 for (
auto pAnnot : bioseq.
SetAnnot()) {
1797 if (!pAnnot->IsSetData() ||
1803 auto&
ftable = pAnnot->SetData().SetFtable();
1804 auto fit =
ftable.begin();
1805 while (fit !=
ftable.end()) {
1806 auto pSeqFeat = *fit;
1807 if (pSeqFeat->IsSetData() &&
1808 pSeqFeat->GetData().IsImp() &&
1809 pSeqFeat->GetData().GetImp().IsSetKey() &&
1810 pSeqFeat->GetData().GetImp().GetKey() == kAssemblyGap_feature) {
1813 if (
MakeGap(bioseq, *pSeqFeat)) {
1818 "Failed to convert feature gap into a gap",
1841 CSeqTranslator::ChangeDeltaProteinToRawProtein(Ref(&bioseq));
1851 switch(loc.Which()) {
1853 return &loc.GetWhole();
1855 return &(loc.GetInt().GetId());
1857 return &(loc.GetPnt().GetId());
1859 if (!loc.GetPacked_int().Get().empty()) {
1860 return &(loc.GetPacked_int().Get().front()->GetId());
1864 if (loc.GetPacked_pnt().IsSetId()) {
1865 return &(loc.GetPacked_pnt().GetId());
1878 using TFeatIt = list<CRef<CSeq_feat>>::const_iterator;
1888 list<SRegionIterators>& its)
1891 for (
auto annot_it = annots.begin();
1892 annot_it != annots.end();
1895 const auto& annot = **annot_it;
1896 if (annot.IsFtable()) {
1897 const auto&
ftable = annot.GetData().GetFtable();
1898 list<SRegionIterators::TFeatIt> feat_its;
1899 for (
auto feat_it =
ftable.begin(); feat_it !=
ftable.end(); ++feat_it) {
1900 const auto& pFeat = *feat_it;
1901 if (pFeat->IsSetData() &&
1902 pFeat->GetData().IsRegion()) {
1903 feat_its.push_back(feat_it);
1906 if (!feat_its.empty()) {
1916 if (!seq_entry.
IsSet()) {
1920 auto& bioseq_set = seq_entry.
SetSet();
1922 if (!bioseq_set.IsSetClass() ||
1924 if (bioseq_set.IsSetSeq_set()) {
1925 for (
auto pEntry : bioseq_set.SetSeq_set()) {
1934 _ASSERT(bioseq_set.IsSetSeq_set());
1940 list<SRegionIterators> region_its;
1942 for (
auto pSubEntry : bioseq_set.SetSeq_set()) {
1944 auto& seq = pSubEntry->SetSeq();
1955 region_its.empty()) {
1960 pScope->AddTopLevelSeqEntry(seq_entry);
1963 for (
auto its : region_its) {
1964 for (
auto feat_it : its.feat_its) {
1965 auto pRegion = *feat_it;
1971 pRegion->SetLocation(*pMappedLoc);
1975 (*its.annot_it)->SetData().SetFtable().
erase(feat_it);
1978 if ((*its.annot_it)->GetData().GetFtable().empty()) {
1979 pNucSeq->
SetAnnot().erase(its.annot_it);
1987 for (
auto pSubEntry : bioseq_set.SetSeq_set()) {
1988 auto& bioseq = pSubEntry->SetSeq();
1989 if (bioseq.
IsNa()) {
1994 for (
auto pId : bioseq.
GetId()) {
1996 while (it != mapped_regions.
end() && (it->first->Compare(*pId) ==
CSeq_id::e_YES)) {
2002 it = mapped_regions.
erase(it);
2007 bioseq.
SetAnnot().push_back(pAnnot);
2010 if(mapped_regions.
empty()) {
2019 if (entry.
IsSeq()) {
2023 auto& bioseq_set = entry.
SetSet();
2024 if (!bioseq_set.IsSetSeq_set()) {
2028 bool any_change =
false;
2029 if (!bioseq_set.IsSetClass() ||
2031 for (
auto pSubEntry : bioseq_set.SetSeq_set()) {
User-defined methods of the data storage class.
User-defined methods of the data storage class.
@ eExtreme_Positional
numerical value
@ eExtreme_Biological
5' and 3'
User-defined methods of the data storage class.
void g_LogGeneralParsingError(EDiagSev sev, const string &idString, const string &msg, objects::ILineErrorListener &listener)
string GetIdHashOrValue(const string &base, int offset)
void transform(Container &c, UnaryFunction *op)
CAlignFilter exposes a query language for inspecting properties and scores placed on Seq-align object...
bool Match(const objects::CSeq_align &align)
Match a single alignment.
CSeqdesc & Set(bool skip_lookup=false)
int GetGenCode(int def=1) const
CConstRef< CSeqdesc > GetClosestDescriptor(CSeqdesc::E_Choice choice, int *level=NULL) const
TSeqPos GetLength(void) const
static bool ExtendToStopIfShortAndNotPartial(CSeq_feat &f, CBioseq_Handle bsh, bool check_for_stop=true)
Extends a coding region up to 50 nt.
static bool ParseCodeBreaks(CSeq_feat &feat, CScope &scope)
Parses all valid transl_except Gb-quals into code-breaks for cdregion, then removes the transl_except...
static bool ExtendStopPosition(CSeq_feat &f, const CSeq_feat *cdregion, size_t extension=0)
static CRef< CSeq_loc > GetProteinLocationFromNucleotideLocation(const CSeq_loc &nuc_loc, CScope &scope)
static bool MoveProteinSpecificFeats(CSeq_entry_Handle seh)
Moves protein-specific features from nucleotide sequences in the Seq-entry to the appropriate protein...
static bool LocationMayBeExtendedToMatch(const CSeq_loc &orig, const CSeq_loc &improved)
Checks whether it is possible to extend the original location up to improved one.
void xParseCdregions(objects::CSeq_entry &entry, TAsyncToken &)
objects::CFastaReader::TPostponedModMap m_PrtModMap
void AddProteins(const objects::CSeq_entry &possible_proteins, objects::CSeq_entry &entry)
void MoveRegionsToProteins(objects::CSeq_entry &entry)
void xMoveCdRegions(objects::CSeq_entry_Handle entry_h, objects::CSeq_annot::TData::TFtable &seq_ftable, objects::CSeq_annot::TData::TFtable &set_ftable, TAsyncToken &)
void FindOpenReadingFrame(objects::CSeq_entry &entry) const
bool xAddProteinToSeqEntry(const objects::CBioseq &protein, objects::CSeq_entry_Handle seh)
void ConvertNucSetToSet(CRef< objects::CSeq_entry > &entry) const
CFeatureTableReader(CTable2AsnContext &context)
CTable2AsnContext & m_context
void MakeGapsFromFeatures(objects::CSeq_entry_Handle seh) const
CRef< objects::CSeq_entry > ReadProtein(ILineReader &line_reader)
void ChangeDeltaProteinToRawProtein(objects::CSeq_entry &entry) const
void xConvertSeqIntoSeqSet(objects::CSeq_entry &entry, bool nuc_prod_set) const
static void RemoveEmptyFtable(objects::CBioseq &bioseq)
CRef< objects::CSeq_feat > x_AddProteinFeatureToProtein(CRef< objects::CSeq_entry > nuc, CConstRef< objects::CSeq_loc > cds_loc, const list< CRef< objects::CSeq_id >> &pOriginalProtIds, objects::CBioseq &protein, bool partial5, bool partial3)
CRef< objects::CDelta_seq > MakeGap(objects::CBioseq &bioseq, const objects::CSeq_feat &feature_gap) const
void xMergeCDSFeatures_impl(objects::CSeq_entry &, TAsyncToken &)
void MergeCDSFeatures(objects::CSeq_entry &, TAsyncToken &)
void MoveProteinSpecificFeats(objects::CSeq_entry &entry)
CRef< objects::CSeq_entry > m_replacement_protein
bool xCheckIfNeedConversion(const objects::CSeq_entry &entry) const
CRef< objects::CSeq_entry > xTranslateProtein(const objects::CBioseq &bioseq, objects::CSeq_feat &cd_feature, list< CRef< CSeq_feat >> &seq_ftable, TAsyncToken &)
CRef< CDelta_seq > CreateGap(CBioseq &bioseq, TSeqPos gap_start, TSeqPos gap_length)
@Imp_feat.hpp User-defined methods of the data storage class.
static CLineError * Create(EProblem eProblem, EDiagSev eSeverity, const std::string &strSeqId, unsigned int uLine, const std::string &strFeatureName=string(""), const std::string &strQualifierName=string(""), const std::string &strQualifierValue=string(""), const std::string &strErrorMessage=string(""), const TVecOfLines &vecOfOtherLines=TVecOfLines())
Use this because the constructor is protected.
vector< CRef< objects::CSeq_loc > > TLocVec
static CRef< objects::CSeq_annot > MakeCDSAnnot(const TLocVec &orfs, int genetic_code=1, objects::CSeq_id *id=NULL)
This version returns an annot full of CDS features.
static void FindOrfs(const string &seq, TLocVec &results, unsigned int min_length_bp=3, int genetic_code=1, const vector< string > &allowable_starts=vector< string >(), bool longest_orfs=true, size_t max_seq_gap=k_default_max_seq_gap)
Find ORFs in both orientations.
CProSplignOptions_Base & SetAltStarts(bool allow_alt_start)
Output filtering parameters.
@ ePassThrough
all zeroes - no filtering
@ eWithHoles
default filtering parameters
spliced protein to genomic alignment
CRef< objects::CSeq_align > FindAlignment(objects::CScope &scope, const objects::CSeq_id &protein, const objects::CSeq_loc &genomic, CProSplignOutputOptions output_options=CProSplignOutputOptions())
Aligns protein to a region on genomic sequence.
void GetLabel(string *label) const
void SetCdregion(TCdregion &v)
bool IsFtable(void) const
@Seq_descr.hpp User-defined methods of the data storage class.
const TAnnot & GetAnnot(void) const
bool IsSetAnnot(void) const
void SetDescr(CSeq_descr &value)
list< CRef< CSeq_annot > > TAnnot
CSeq_entry * GetParentEntry(void) const
namespace ncbi::objects::
const CProt_ref * GetProtXref(void) const
get protein (if present) from Seq-feat.xref list
const string & GetNamedQual(const CTempString &qual_name) const
Return a named qualifier.
void RemoveQualifier(const string &qual_name)
Remove all qualifiers with the given name; do nothing if no such qualifier exists.
bool AddSeqFeatXref(const CSeqFeatXref::TId &id)
@ eLinkEvid_UnspecifiedOnly
only the "unspecified" linkage-evidence is allowed
@ eLinkEvid_Forbidden
no linkage-evidence is allowed
@ eLinkEvid_Required
any linkage-evidence is allowed, and at least one is required
static const SGapTypeInfo * NameToGapTypeInfo(const CTempString &sName)
From a gap-type string, get the SGapTypeInfo, insensitive to case, etc.
static bool GetOrgName(string &name, const objects::CSeq_entry &entry)
objects::ILineErrorListener * m_logger
bool m_use_hypothetic_protein
SPrtAlnOptions prtAlnOptions
static bool IsDBLink(const objects::CSeqdesc &desc)
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
@ eProblem_GeneralParsingError
Abstract base class for lightweight line-by-line reading.
container_type::const_iterator const_iterator
const_iterator end() const
const_iterator lower_bound(const key_type &key) const
const_iterator find(const key_type &key) const
iterator_bool insert(const value_type &val)
const_iterator find(const key_type &key) const
const_iterator end() const
void SetMolinfoForProtein(CRef< objects::CSeq_entry > protein, bool partial5, bool partial3)
CRef< objects::CSeq_feat > AddEmptyProteinFeatureToProtein(CRef< objects::CSeq_entry > protein, bool partial5, bool partial3)
bool SetMolinfoCompleteness(objects::CMolInfo &mi, bool partial5, bool partial3)
Operators to edit gaps in sequences.
static void s_SetProtRef(const CSeq_feat &cds, CConstRef< CSeq_feat > pMrna, CProt_ref &prot_ref)
static CBioseq_Handle s_MatchProteinById(const CBioseq &protein, CSeq_entry_Handle seh)
static void s_AppendProtRefInfo(CProt_ref &current_ref, const CProt_ref &other_ref)
static void s_ReportDuplicateMods(const set< string > &duplicateMods, const string &idString, TSeqPos lineNumber, objects::ILineErrorListener &logger)
static bool s_MoveProteinSpecificFeats(CSeq_entry &entry)
static CRef< CSeq_loc > s_GetCDSLoc(CScope &scope, const CSeq_id &proteinId, const CSeq_loc &genomicLoc, TSeqPos bioseqLength, const CTable2AsnContext::SPrtAlnOptions &prtAlnOptions)
static bool s_HasUnprocessedCdregions(const CSeq_entry &nuc_prot)
static CRef< CSeq_feat > s_MakeCDSFeat(CSeq_loc &loc, bool isPartial, CSeq_id &productId)
static const CSeq_id * s_GetIdFromLocation(const CSeq_loc &loc)
static bool s_UnknownEstimatedLength(const CSeq_feat &feat)
static CBioseq_Handle s_GetSingleNucSeq(CSeq_entry_Handle seh)
static void s_GatherRegionIterators(list< CRef< CSeq_annot >> &annots, list< SRegionIterators > &its)
unsigned int TSeqPos
Type for sequence locations and lengths.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
const TSeqPos kInvalidSeqPos
Define special value for invalid sequence position.
@ eDiag_Error
Error message.
const string & GetMsg(void) const
Get message string.
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
CNcbiIos & MSerial_VerifyNo(CNcbiIos &io)
virtual const CTypeInfo * GetThisTypeInfo(void) const =0
#define MSerial_AsnText
I/O stream manipulators –.
bool AddMods(const CTempString &name, const CTempString &value)
static string CanonicalizeString(const CTempString &sValue)
void ApplyAllMods(CBioseq &seq, CTempString organism=kEmptyStr, CConstRef< CSeq_loc > location=CConstRef< CSeq_loc >())
Apply previously extracted modifiers to the given object, marking all relevant ones as used.
const TMods & GetAllMods(void) const
@ fAddMods
Parse defline mods and add to SeqEntry.
@ fNoUserObjs
Don't save raw deflines in User-objects.
@ fForceType
Force specified type regardless of accession.
@ fAssumeProt
Assume prots unless accns indicate otherwise.
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
static SIZE_TYPE ParseIDs(CBioseq::TId &ids, const CTempString &s, TParseFlags flags=fParse_Default)
Parse a string representing one or more Seq-ids, appending the results to IDS.
void GetLabel(string *label, ELabelType type=eDefault, TLabelFlags flags=fLabel_Default) const
Append a label for this Seq-id to the supplied string.
CConstRef< CSeq_id > GetSeqId(void) const
E_SIC Compare(const CSeq_id &sid2) const
Compare() - more general.
@ e_YES
SeqIds compared, are equivalent.
@ fParse_PartialOK
Warn rather than throwing an exception when a FASTA-style ID set contains unparsable portions,...
@ fParse_ValidLocal
Treat otherwise unidentified strings as raw accessions, provided that they pass rudimentary validatio...
@ eContent
Untagged human-readable accession or the like.
TSeqPos GetStart(ESeqLocExtremes ext) const
Return start and stop positions of the seq-loc.
const CSeq_id * GetId(void) const
Get the id of the location return NULL if has multiple ids or no id at all.
TSeqPos GetStop(ESeqLocExtremes ext) const
bool CopyFeaturePartials(CSeq_feat &dst, const CSeq_feat &src)
CopyFeaturePartials A function to copy the start and end partialness from one feature to another.
bool AdjustProteinMolInfoToMatchCDS(CMolInfo &molinfo, const CSeq_feat &cds)
AdjustProteinMolInfoToMatchCDS A function to change an existing MolInfo to match a coding region.
bool IsPseudo(const CSeq_feat &feat, CScope &scope)
Determines whether given feature is pseudo, using gene associated with feature if necessary Checks to...
static CRef< CBioseq > TranslateToProtein(const CSeq_feat &cds, CScope &scope)
string GetProteinName(const CBioseq_Handle &seq)
Return protein name from corresponding Prot-ref feature.
static CCdregion::EFrame FindBestFrame(const CSeq_feat &cds, CScope &scope)
Find "best" frame for a coding region.
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CSeq_entry_Handle AddTopLevelSeqEntry(CSeq_entry &top_entry, TPriority pri=kPriority_Default, EExist action=eExist_Default)
Add seq_entry; default priority is higher than for defaults or loaders. Add object to the scope with p...
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
void AddDefaults(TPriority pri=kPriority_Default)
Add default data loaders from object manager.
void RemoveTopLevelSeqEntry(const CTSE_Handle &entry)
Revoke TSE previously added using AddTopLevelSeqEntry() or AddBioseq().
CSeq_entry_EditHandle GetSeq_entryEditHandle(const CSeq_entry &entry)
CBioseq_set_EditHandle GetEditHandle(void) const
Get 'edit' version of handle.
vector< CSeq_id_Handle > TId
TClass GetClass(void) const
CRef< CSeqdesc > RemoveSeqdesc(const CSeqdesc &v) const
CBioseq_set_Handle GetParentBioseq_set(void) const
Return a handle for the parent Bioseq-set, or null handle.
CBioseq_set_EditHandle GetParentBioseq_set(void) const
Get parent bioseq-set edit handle.
CSeq_annot_EditHandle AttachAnnot(CSeq_annot &annot) const
Attach an annotation.
TSet ConvertSeqToSet(TClass set_class=CBioseq_set::eClass_not_set) const
Convert the entry from Bioseq to Bioseq-set.
CConstRef< CBioseq_set > GetCompleteBioseq_set(void) const
Return the complete bioseq-set object.
CBioseq_Handle GetBioseqHandle(const CSeq_id &id) const
Get Bioseq handle from the TSE of this Seq-entry.
CSeq_entry_EditHandle GetEditHandle(void) const
Get 'edit' version of handle.
CSeq_entry_Handle GetTopLevelEntry(void) const
Get top level Seq-entry handle.
CSeq_entry_Handle GetParentEntry(void) const
Return a handle for the parent seq-entry of the bioseq.
CSeq_entry_EditHandle AttachEntry(CSeq_entry &entry, int index=-1) const
Attach an existing seq-entry.
bool IsSetClass(void) const
CConstRef< CSeq_entry > GetCompleteSeq_entry(void) const
Complete and get const reference to the seq-entry.
CScope & GetScope(void) const
Get scope this handle belongs to.
CSeq_entry_EditHandle AttachEntry(CSeq_entry &entry, int index=-1) const
Attach an existing seq-entry.
CConstRef< TObject > GetCompleteObject(void) const
CSeq_entry_Handle GetParentEntry(void) const
Get parent Seq-entry handle.
bool AddSeqdesc(CSeqdesc &v) const
const TId & GetId(void) const
int GetSeq_entry_Index(const CSeq_entry_Handle &handle) const
bool IsSynonym(const CSeq_id &id) const
Check if this id can be used to obtain this bioseq handle.
bool Empty(void) const THROWS_NONE
Check if CConstRef is empty – not pointing to any object which means having a null value.
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
void Reset(void)
Reset reference object.
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
TObjectType * GetPointerOrNull(void) const THROWS_NONE
Get pointer value.
TObjectType * GetPointerOrNull(void) THROWS_NONE
Get pointer value.
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
IO_PREFIX::ofstream CNcbiOfstream
Portable alias for ofstream.
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
virtual bool IsType(TTypeInfo type) const
static const char label[]
bool IsSetLocus_tag(void) const
systematic gene name (e.g., MI0001, ORF0069) Check if a value has been assigned to Locus_tag data mem...
const TLocus_tag & GetLocus_tag(void) const
Get the Locus_tag member data.
bool IsId(void) const
Check if variant Id is selected.
TId GetId(void) const
Get the variant data.
TActivity & SetActivity(void)
Assign a value to Activity data member.
bool IsSetDesc(void) const
description (instead of name) Check if a value has been assigned to Desc data member.
const TDb & GetDb(void) const
Get the Db member data.
const TActivity & GetActivity(void) const
Get the Activity member data.
TEc & SetEc(void)
Assign a value to Ec data member.
const TName & GetName(void) const
Get the Name member data.
bool IsSetDb(void) const
ids in other dbases Check if a value has been assigned to Db data member.
bool IsSetEc(void) const
E.C.
void SetDesc(const TDesc &value)
Assign a value to Desc data member.
TProcessed GetProcessed(void) const
Get the Processed member data.
void SetProcessed(TProcessed value)
Assign a value to Processed data member.
bool IsSetName(void) const
protein name Check if a value has been assigned to Name data member.
const TDesc & GetDesc(void) const
Get the Desc member data.
bool IsSetActivity(void) const
activities Check if a value has been assigned to Activity data member.
const TEc & GetEc(void) const
Get the Ec member data.
TDb & SetDb(void)
Assign a value to Db data member.
TName & SetName(void)
Assign a value to Name data member.
bool IsSetExt(void) const
generic fields for ncRNA, tmRNA, miscRNA Check if a value has been assigned to Ext data member.
const TName & GetName(void) const
Get the variant data.
const TExt & GetExt(void) const
Get the Ext member data.
bool IsName(void) const
Check if variant Name is selected.
@ e_not_set
No variant selected.
TXref & SetXref(void)
Assign a value to Xref data member.
const TKey & GetKey(void) const
Get the Key member data.
void ResetPartial(void)
Reset Partial data member.
bool IsSetData(void) const
the specific data Check if a value has been assigned to Data data member.
E_Choice Which(void) const
Which variant is currently selected.
bool IsSetCode(void) const
genetic code used Check if a value has been assigned to Code data member.
bool IsProt(void) const
Check if variant Prot is selected.
void SetLocation(TLocation &value)
Assign a value to Location data member.
bool IsCdregion(void) const
Check if variant Cdregion is selected.
void SetPartial(TPartial value)
Assign a value to Partial data member.
void SetProduct(TProduct &value)
Assign a value to Product data member.
const TQual & GetQual(void) const
Get the Qual member data.
bool IsSetKey(void) const
Check if a value has been assigned to Key data member.
const TId & GetId(void) const
Get the Id member data.
const TLocal & GetLocal(void) const
Get the variant data.
bool IsSetXref(void) const
cite other relevant features Check if a value has been assigned to Xref data member.
const TLocation & GetLocation(void) const
Get the Location member data.
bool IsLocal(void) const
Check if variant Local is selected.
bool IsGene(void) const
Check if variant Gene is selected.
const TData & GetData(void) const
Get the Data member data.
const TExcept_text & GetExcept_text(void) const
Get the Except_text member data.
bool IsSetExcept_text(void) const
explain if except=TRUE Check if a value has been assigned to Except_text data member.
void SetData(TData &value)
Assign a value to Data data member.
const TCdregion & GetCdregion(void) const
Get the variant data.
bool IsSetId(void) const
Check if a value has been assigned to Id data member.
const TProduct & GetProduct(void) const
Get the Product member data.
const TGene & GetGene(void) const
Get the variant data.
const TProt & GetProt(void) const
Get the variant data.
const TXref & GetXref(void) const
Get the Xref member data.
vector< CRef< CSeqFeatXref > > TXref
vector< CRef< CGb_qual > > TQual
const TRna & GetRna(void) const
Get the variant data.
bool IsSetProduct(void) const
product of process Check if a value has been assigned to Product data member.
bool IsSetFrame(void) const
Check if a value has been assigned to Frame data member.
bool IsSetLocation(void) const
feature made from Check if a value has been assigned to Location data member.
@ e_Het
cofactor, prosthetic grp, etc, bound to seq
@ e_not_set
No variant selected.
@ e_Region
named region (globin locus)
@ e_Seq
to annotate origin from another seq
@ e_Txinit
transcription initiation
@ e_Num
a numbering system
@ e_Pub
publication applies to this seq
@ e_User
user defined structure
@ e_Rsite
restriction site (for maps really)
@ e_Comment
just a comment
@ e_Non_std_residue
non-standard residue here in seq
void SetTo(TTo value)
Assign a value to To data member.
const TWhole & GetWhole(void) const
Get the variant data.
TFrom GetFrom(void) const
Get the From member data.
void SetFrom(TFrom value)
Assign a value to From data member.
bool IsSetStrand(void) const
Check if a value has been assigned to Strand data member.
TStrand GetStrand(void) const
Get the Strand member data.
TTo GetTo(void) const
Get the To member data.
bool IsWhole(void) const
Check if variant Whole is selected.
const TSeq & GetSeq(void) const
Get the variant data.
bool IsSetClass(void) const
Check if a value has been assigned to Class data member.
TSet & SetSet(void)
Select the variant.
TClass GetClass(void) const
Get the Class member data.
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
const TSet & GetSet(void) const
Get the variant data.
bool IsSeq(void) const
Check if variant Seq is selected.
bool IsSetSeq_set(void) const
Check if a value has been assigned to Seq_set data member.
E_Choice Which(void) const
Which variant is currently selected.
bool IsSet(void) const
Check if variant Set is selected.
const TSeq_set & GetSeq_set(void) const
Get the Seq_set member data.
void SetClass(TClass value)
Assign a value to Class data member.
list< CRef< CSeq_entry > > TSeq_set
TSeq & SetSeq(void)
Select the variant.
TSeq_set & SetSeq_set(void)
Assign a value to Seq_set data member.
@ eClass_nuc_prot
nuc acid and coded proteins
@ eClass_gen_prod_set
genomic products, chrom+mRNA+protein
@ eClass_genbank
converted genbank
void SetCompleteness(TCompleteness value)
Assign a value to Completeness data member.
void SetData(TData &value)
Assign a value to Data data member.
bool IsSetCompleteness(void) const
Check if a value has been assigned to Completeness data member.
TId & SetId(void)
Assign a value to Id data member.
void ResetId(void)
Reset Id data member.
const TInst & GetInst(void) const
Get the Inst member data.
TTitle & SetTitle(void)
Select the variant.
bool IsSetAnnot(void) const
Check if a value has been assigned to Annot data member.
bool IsSetRepr(void) const
Check if a value has been assigned to Repr data member.
bool IsSetMol(void) const
Check if a value has been assigned to Mol data member.
const TSource & GetSource(void) const
Get the variant data.
bool IsSetBiomol(void) const
Check if a value has been assigned to Biomol data member.
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
const TAnnot & GetAnnot(void) const
Get the Annot member data.
const TId & GetId(void) const
Get the Id member data.
void ResetAnnot(void)
Reset Annot data member.
bool IsSetInst(void) const
the sequence data Check if a value has been assigned to Inst data member.
TLength GetLength(void) const
Get the Length member data.
list< CRef< CSeq_id > > TId
void SetInst(TInst &value)
Assign a value to Inst data member.
virtual void Reset(void)
Reset the whole object.
void SetBiomol(TBiomol value)
Assign a value to Biomol data member.
void SetDescr(TDescr &value)
Assign a value to Descr data member.
TCompleteness GetCompleteness(void) const
Get the Completeness member data.
list< CRef< CSeq_feat > > TFtable
bool IsSetId(void) const
equivalent identifiers Check if a value has been assigned to Id data member.
void SetTech(TTech value)
Assign a value to Tech data member.
TMolinfo & SetMolinfo(void)
Select the variant.
@ eCompleteness_complete
complete biological entity
@ eCompleteness_no_left
missing 5' or NH3 end
@ eCompleteness_no_right
missing 3' or COOH end
@ eCompleteness_no_ends
missing both ends
@ eTech_concept_trans
conceptual translation
@ e_User
user defined object
@ e_Update_date
date of last update
@ e_Pub
a reference to the publication
@ e_Molinfo
info on the molecule and techniques
@ e_Create_date
date entry first created/released
@ e_Title
a title for this sequence
@ e_Source
source of materials, includes Org-ref
@ eMol_na
just a nucleic acid
bm::gap_word_t gap_length(const bm::gap_word_t *buf) noexcept
Returns GAP block length.
where both are integers</td>\n<td></td>\n</tr>\n<tr>\n<td>tse</td>\n<td>optional</td>\n<td>String</td>\n<td class=\"description\">TSE option controls what blob is whole
void VisitAllBioseqs(objects::CSeq_entry &entry, _M &&m)
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Utility macros and typedefs for exploring NCBI objects from seq.asn.
Utility macros and typedefs for exploring NCBI objects from seqset.asn.
Holds information about a given gap-type string.
CSeq_gap::EType m_eType
The underlying type that the string corresponds to.
ELinkEvid m_eLinkEvid
Indicates what linkage-evidences are compatible with this.
Compare objects pointed to by (smart) pointer.
bool operator()(const CSeq_id *const left, const CSeq_id *const right) const
list< CRef< CSeq_feat > >::const_iterator TFeatIt
list< CRef< CSeq_annot > >::iterator TAnnotIt
CRef< objects::CBioseq > bioseq
CRef< objects::CSeq_feat > ParentGene(const objects::CSeq_feat &cds)
CRef< objects::CSeq_feat > ParentMrna(const objects::CSeq_feat &cds)
static void s_ExtendIntervalToEnd(objects::CSeq_interval &ival, objects::CBioseq_Handle bsh)
bool AssignLocalIdIfEmpty(CSeq_feat &feature, int &id)