55 : m_CleanupOptions(options) {}
66 auto& biosets = m_bioseq_set_list;
67 if (biosets.size()>1) {
68 auto firstTrueBioset =
next(biosets.begin());
79 TParent::FlattenGenbankSet();
92 auto it = m_top_ids.
begin();
93 while (it != m_top_ids.end()) {
96 if (smallGenomeLabelToId.
find(mit->second)
97 == smallGenomeLabelToId.
end()) {
98 smallGenomeLabelToId.emplace(mit->second, *it);
100 it = m_top_ids.
erase(it);
106 for (
auto entry : smallGenomeLabelToId) {
107 m_top_ids.push_back(entry.second);
116 for (
const auto& bioseq : GetBioseqs()) {
131 _ASSERT(m_bioseq_set_list.size()>1);
133 auto it =
next(m_bioseq_set_list.begin(),2);
134 while (it != m_bioseq_set_list.end()) {
159 !m_top_entry->IsSetDescr() ||
160 !m_top_entry->GetDescr().IsSet()) {
171 auto& descriptors = m_top_entry->SetDescr().Set();
172 auto it = descriptors.begin();
173 while (it != descriptors.end()) {
175 it = descriptors.erase(it);
177 else if ((*it)->IsSource()) {
179 it = descriptors.erase(it);
183 else if ((*it)->IsMolinfo()) {
187 it = descriptors.erase(it);
200 if (descriptors.empty()) {
201 m_top_entry->SetSet().ResetDescr();
216 bool addMolInfo =
false;
221 auto it = find_if(descriptors.begin(), descriptors.end(),
223 return (pDesc && pDesc->IsMolinfo());
225 if (it == descriptors.end()) {
231 entry.
SetDescr().Set().push_back(pSource);
250 if (
auto it=idMap.
find(
id); it != idMap.
end()) {
263 for (
auto pFeatId : feat.
SetIds()) {
271 for (
auto pXref : feat.
SetXref()) {
272 if (pXref && pXref->IsSetId()) {
286 for (
auto pSeqFeat : annot.
SetData().SetFtable()) {
299 for (
auto pAnnot : bioseq.
SetAnnot()) {
310 for (
auto pAnnot : bioseqSet.
SetAnnot()) {
318 for (
auto pSubEntry : bioseqSet.
SetSeq_set()) {
320 if (pSubEntry->IsSeq()) {
345 const TBioseqSetInfo&
info,
346 eAddTopEntry add_top_entry)
const
354 auto pSubEntry = TParent::LoadSeqEntry(setInfo, eAddTopEntry::no);
362 pSmallGenomeEntry->SetSet().SetSeq_set().push_back(pSubEntry);
367 return pSmallGenomeEntry;
371 auto pEntry = TParent::LoadSeqEntry(
info, eAddTopEntry::no);
391 for (
const auto& pDesc : descr.
Get()) {
392 if (pDesc->IsSource()) {
393 const auto&
source = pDesc->GetSource();
407 template<
typename TMap>
410 if (mapToVal.empty()) {
414 auto it = mapToVal.begin();
415 while (it != mapToVal.end()) {
416 if (it->second ==
val) {
417 it = mapToVal.erase(it);
427 const string& fluLabel,
440 if (seqDescrs && seqDescrs->
IsSet()) {
441 for (
auto pDesc : seqDescrs->
Get()) {
442 if (pDesc->IsSource()) {
443 return g_FindSegs(pDesc->GetSource(), numRequired, segments);
448 if (setDescrs && setDescrs->
IsSet()) {
449 for (
auto pDesc : setDescrs->
Get()) {
450 if (pDesc->IsSource()) {
451 return g_FindSegs(pDesc->GetSource(), numRequired, segments);
462 auto it = idSet.lower_bound(pId);
463 if (it != idSet.end()) {
464 if ((*it)->CompareOrdered(*pId) == 0 ||
465 (*it)->Compare(*pId) == CSeq_id::E_SIC::e_YES) {
477 for (
const auto& bioseqInfo : GetBioseqs()) {
481 auto parent = bioseqInfo.m_parent_set;
483 pNpSetDescr = parent->m_descr;
484 parent = parent->m_parent_set;
486 if (!IsHugeSet(parent->m_class)) {
490 if (bioseqInfo.m_descr) {
497 bool makeSmallGenomeSet =
498 s_CheckForSegments(bioseqInfo.m_descr, pNpSetDescr, fluLabel, fluLabelToSegs[fluLabel]);
500 if (makeSmallGenomeSet) {
501 const auto& setInfo = *FindTopObject(bioseqInfo.m_ids.front());
504 for (
auto pId : bioseqInfo.m_ids) {
512 for (
const auto& entry : fluLabelToSegs) {
513 const auto& fluLabel = entry.first;
514 const auto& segsFound = entry.second;
527 if (segsFound.
size() != numRequired) {
543 auto fluLabel = it->second;
544 fluLabelsToRemove.
insert(fluLabel);
556 for (
const auto& fluLabel : fluLabelsToRemove) {
568 if (it->first->CompareOrdered(*pId) == 0 ||
569 it->first->Compare(*pId) == CSeq_id::E_SIC::e_YES) {
627 auto pos =
in.GetStreamPos() + m_next_pos;
628 context.bioseq_stack.push_back({});
630 auto parent =
context.bioseq_set_stack.back();
632 if (hasGenbankParent) {
638 type.GetTypeInfo()->DefaultSkipData(
in);
640 auto& bioseqinfo =
context.bioseq_stack.back();
641 m_bioseq_list.push_back({pos, parent, bioseqinfo.m_length, bioseqinfo.m_descr, bioseqinfo.m_ids, bioseqinfo.m_mol, bioseqinfo.m_repr});
642 context.bioseq_stack.pop_back();
663 auto pos =
in.GetStreamPos() + m_next_pos;
664 auto parent =
context.bioseq_set_stack.back();
666 if (hasGenbankParent) {
672 m_bioseq_set_list.push_back({pos, parent});
674 auto last =
prev(m_bioseq_set_list.end());
680 it.ReadClassMember(objectInfo);
681 if ((*it).GetAlias() ==
"class") {
682 auto memIdx = (*it).GetMemberIndex();
690 if (pBioseqSet->IsSetLevel()) {
691 last->m_Level = pBioseqSet->GetLevel();
694 if (pBioseqSet->IsSetDescr()) {
695 last->m_descr.Reset(&(pBioseqSet->GetDescr()));
698 if (IsHugeSet(
last->m_class) &&
700 m_HasHugeSetAnnot =
true;
703 context.bioseq_set_stack.pop_back();
722 auto* pObject =
object.GetObjectPtr();
723 object.GetTypeInfo()->DefaultReadData(
in, pObject);
731 if (pSeqFeat->IsSetId()) {
735 if (pSeqFeat->IsSetIds()) {
736 for (
auto pFeatId : pSeqFeat->GetIds()) {
749 type.GetTypeInfo()->DefaultReadData(
in, pSeqFeat);
752 if (pSeqFeat->IsSetId()) {
756 if (pSeqFeat->IsSetIds()) {
757 for (
auto pFeatId : pSeqFeat->GetIds()) {
770 if (pSeqFeat->IsSetData() &&
771 (pSeqFeat->GetData().IsCdregion() ||
772 pSeqFeat->GetData().IsGene())) {
775 const auto* pSeqId = pSeqFeat->GetLocation().GetId();
792 TParent::x_SetHooks(objStream,
context);
User-defined methods of the data storage class.
@ eExtreme_Biological
5' and 3'
All the changes made during cleanup.
void SetChanged(EChanges e)
void x_PruneIfFeatsIncomplete()
TIdToFluLabel m_IdToFluLabel
void x_SetBioseqHooks(CObjectIStream &objStream, TContext &context) override
void x_CreateSmallGenomeSets()
const CCleanupChangeCore & GetChanges() const
void x_RecordFeatureId(const CFeat_id &featId)
map< TFileSize, string > m_SetPosToFluLabel
map< string, list< TBioseqSetInfo > > m_FluLabelToSetInfo
void x_SetHooks(CObjectIStream &objStream, TContext &context) override
void x_SetSeqFeatHooks(CObjectIStream &objStream, TContext &context)
void FlattenGenbankSet() override
set< CConstRef< CSeq_id >, CRefLess > m_HasIncompleteFeats
void x_PruneIfSegsMissing(const string &fluLabel, const set< size_t > &segsFound)
CRef< CSeqdesc > m_pTopLevelMolInfo
void x_AddTopLevelDescriptors(CSeq_entry &entry) const
CCleanupChangeCore m_Changes
bool x_IsExtendedCleanup() const
list< CRef< CSeqdesc > > m_TopLevelBiosources
void x_PruneAndReorderTopIds()
const TOptions m_CleanupOptions
CRef< CSeq_entry > LoadSeqEntry(const TBioseqSetInfo &info, eAddTopEntry add_top_entry=eAddTopEntry::yes) const override
CFeat_id::TLocal::TId TFeatId
void x_SetBioseqSetHooks(CObjectIStream &objStream, TContext &context) override
void x_CleanupTopLevelDescriptors()
TIdToFluLabel::iterator x_GetFluLabel(const CConstRef< CSeq_id > &pId)
bool x_LooksLikeNucProtSet() const
static bool NormalizeDescriptorOrder(CSeq_descr &descr)
Normalize Descriptor Order on a specific Seq-entry.
static void AddNcbiCleanupObject(int ncbi_cleanup_version, CSeq_descr &descr)
Adds NcbiCleanup User Object to Seq-descr.
Reading (iterating through) members of the class (SET, SEQUENCE)
static size_t GetNumRequired(EInfluenzaType fluType)
static EInfluenzaType GetInfluenzaType(const string &taxname)
static string GetKey(const COrg_ref &org)
bool IsFtable(void) const
@Seq_descr.hpp User-defined methods of the data storage class.
const CSeq_descr & GetDescr(void) const
void SetDescr(CSeq_descr &value)
bool IsSetDescr(void) const
namespace ncbi::objects::
container_type::iterator iterator
const_iterator begin() const
const_iterator end() const
const_iterator lower_bound(const key_type &key) const
const_iterator find(const key_type &key) const
iterator_bool insert(const value_type &val)
const_iterator begin() const
const_iterator find(const key_type &key) const
const_iterator end() const
static void cleanup(void)
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
static const TObjectType * SafeCast(TTypeInfo type)
TObjectPtr GetObjectPtr(void) const
Get pointer to object.
CObjectInfo GetMember(void) const
Get class member data.
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
void Reset(void)
Reset reference object.
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define BEGIN_SCOPE(ns)
Define a new scope.
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
bool IsId(void) const
Check if variant Id is selected.
TId & SetId(void)
Select the variant.
TId GetId(void) const
Get the variant data.
TXref & SetXref(void)
Assign a value to Xref data member.
TIds & SetIds(void)
Assign a value to Ids data member.
const TLocal & GetLocal(void) const
Get the variant data.
bool IsSetXref(void) const
Cite other relevant features. Check if a value has been assigned to Xref data member.
bool IsLocal(void) const
Check if variant Local is selected.
TLocal & SetLocal(void)
Select the variant.
void SetId(TId &value)
Assign a value to Id data member.
bool IsSetIds(void) const
Set of Ids; will replace 'id' field. Check if a value has been assigned to Ids data member.
bool IsSetId(void) const
Check if a value has been assigned to Id data member.
TSet & SetSet(void)
Select the variant.
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
bool IsSeq(void) const
Check if variant Seq is selected.
bool IsSetAnnot(void) const
Check if a value has been assigned to Annot data member.
bool IsSetSeq_set(void) const
Check if a value has been assigned to Seq_set data member.
TSeq & SetSeq(void)
Select the variant.
TSeq_set & SetSeq_set(void)
Assign a value to Seq_set data member.
@ eClass_parts
parts for 2 or 3
@ eClass_nuc_prot
nuc acid and coded proteins
@ eClass_genbank
converted genbank
@ eClass_segset
segmented sequence + parts
@ eClass_small_genome_set
viral segments or mitochondrial minicircles
void SetData(TData &value)
Assign a value to Data data member.
bool IsSetAnnot(void) const
Check if a value has been assigned to Annot data member.
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
const Tdata & Get(void) const
Get the member data.
bool IsSet(void) const
Check if a value has been assigned to data member.
static void s_FindNextOffset(const TFeatIdSet &existing_ids, const TFeatIdSet &new_existing_ids, const TFeatIdSet &current_ids, CCleanupHugeAsnReader::TFeatId &offset)
static void s_UpdateFeatureId(CFeat_id &featId, const TFeatIdMap &idMap)
static string s_GetInfluenzaLabel(const CSeq_descr &descr)
static bool s_CheckForSegments(CConstRef< CSeq_descr > seqDescrs, CConstRef< CSeq_descr > setDescrs, const string &fluLabel, set< size_t > &segments)
static void s_RemoveEntriesWithVal(const string &val, TMap &mapToVal)
static void s_UpdateFeatureIds(CSeq_feat &feat, const TFeatIdMap &idMap)
static bool s_IdInSet(const CConstRef< CSeq_id > &pId, const set< CConstRef< CSeq_id >, CHugeAsnReader::CRefLess > &idSet)
bool g_FindSegs(const CBioSource &src, size_t numRequired, set< size_t > &segsFound)
fallback to Cassandra storage</td > n</tr > n</table > n</td > n< td > yes
const struct ncbi::grid::netcache::search::fields::KEY key
const CharType(& source)[N]
std::istream & in(std::istream &in_, double &x_)
void SetLocalSkipHook(const CObjectTypeInfo &obj_type_info, CObjectIStream &istr, _Func _func)
void SetLocalReadHook(const CObjectTypeInfo &obj_type_info, CObjectIStream &ostr, _Func _func)
set< TFeatId > ExistingIds
set< TFeatId > NewExistingIds
Compare objects pointed to by (smart) pointer.
static CS_CONTEXT * context