src/objects/genomecoll/genome_collection.asn

Go to the SVN repository for this file
Go to list of all specification files

-- $Id: genome_collection.asn 95484 2021-11-18 18:50:23Z smithrg $
-- **********************************************************************
--
--  NCBI Genome Collections
--  by Mike DiCuccio, Avi Kimchi, Karl Sirotkin, Josh Cherry
--
-- **********************************************************************




NCBI-GenomeCollection DEFINITIONS ::=
BEGIN

EXPORTS GC-Assembly, GC-Genome, GC-Project, GC-GenomicPart, GC-Replicon,
        GC-Sequence, GC-SequenceStats, GC-Scaf-stats, GC-SeqIdAlias,
        GC-TypedSeqId, GC-DbTagAlias, GC-SequenceRole;

IMPORTS Dbtag, Date FROM NCBI-General
        MolInfo, Seqdesc, Seq-descr, Delta-ext, Seq-annot, Seq-gap FROM NCBI-Sequence
        Seq-id FROM NCBI-Seqloc
        BioSource FROM NCBI-BioSource;


-- GC-Genome: record for one genome. Carries the genome's identifiers,
-- optional project links and descriptors, an optional list of official
-- chromosome names, and the genomic parts themselves.
-- Only "id" and "parts" are mandatory.
GC-Genome ::= SEQUENCE
{
    -- Gencoll identifier for this genome
    id SET OF Dbtag,

    -- project ID for this genome
    -- this is the ID for this assembly and may reflect the submitter/source
    projects SET OF GC-Project OPTIONAL,

    -- Various attributes assigned at this level:
    -- biosrc, comments, publications...
    descr Seq-descr OPTIONAL ,

    -- official list of chromosome names
    -- this should be the complete expected set of chromosomes
    -- regardless of whether they are represented in *any*
    -- genomic part
    chr-names SET OF VisibleString OPTIONAL,

    -- the bits that make up this genome
    -- this preserves the expected display order
    -- NOTE(review): ASN.1 SET OF is formally an unordered collection, so
    -- the ordering promise above depends on the reading/writing tools
    -- keeping element order; confirm against the toolkit in use.
    parts SET OF GC-GenomicPart

}

-- GC-Project: a project reference, pairing a textual role with a numeric
-- project ID. Both fields are mandatory.
GC-Project ::= SEQUENCE {
        project-role VisibleString, -- Genbank / Refseq
        project-id INTEGER
    }

-- GC-GenomicPart: element type of GC-Genome.parts. Exactly one of the
-- three alternatives is present in each value.
GC-GenomicPart ::= CHOICE {
    -- an assembly (a single unit or an assembly set, see GC-Assembly)
    assembly GC-Assembly,
    -- a replicon
    mol GC-Replicon,
    -- a sequence record
    seqs GC-Sequence
}



--
-- GC-Assemblies (defined below, after its helper type GC-Query) is a
-- plain set of assemblies, used for returning a list of assemblies
-- from a search.
--

-- GC-Query: one name/value pair of strings. GC-Assemblies.query holds a
-- set of these.
-- NOTE(review): presumably the parameters of the search that produced the
-- accompanying assembly list; confirm against the producer.
GC-Query ::= SEQUENCE {
        param   VisibleString,
        value   VisibleString
}

-- GC-Assemblies: a set of assemblies together with an optional set of
-- GC-Query name/value pairs.
GC-Assemblies ::= SEQUENCE {
    -- optional name/value pairs (see GC-Query)
    query SET OF GC-Query OPTIONAL,
    -- the assemblies themselves (mandatory)
    assemblies SET OF GC-Assembly
}

--
-- A GC-Assembly is either a single assembly unit (GC-AssemblyUnit) or a
-- set of assemblies (GC-AssemblySet). Because GC-AssemblySet itself
-- contains GC-Assembly values, the structure is recursive.
--
GC-Assembly ::= CHOICE {
    unit GC-AssemblyUnit,
    assembly-set GC-AssemblySet
}

-- GC-DbTagAlias reports relationships between assembly-releases.
-- It pairs the RefSeq and GenBank release ids (each optional) with a
-- mandatory similarity code.
--
GC-DbTagAlias ::= SEQUENCE {
    refseq   Dbtag OPTIONAL,   -- the refseq assembly release-id
    genbank  Dbtag OPTIONAL,   -- the genbank assembly release-id

    -- Similarity: relationship between the 2 assemblies represented by
    -- the DbTags above.
    -- Can be expanded to multiple levels of identity;
    -- a bitmap could be used to flag ambiguity between different levels.
    -- This field has neither OPTIONAL nor DEFAULT, so it is always present.
    similarity  INTEGER {
        unknown (0) ,
        identical (1) ,
        different (255)
    }
}

-- GC-AssemblySet: a hierarchical grouping of assemblies. It holds one
-- mandatory primary assembly plus an optional set of further assemblies,
-- and uses the same GC-AssemblyDesc descriptor type as GC-AssemblyUnit.
GC-AssemblySet ::= SEQUENCE {
    -- The identifier of this assembly
    -- examples: GC internal id, Assembly-accession.version
    -- Types of DbTags used:
    -- db "GenColl", tag id
    -- db "GenColl", tag str
    -- db "AGP", tag id
    -- db "GenColl_Chain", tag id

    -- db "UCSC_name", tag str
    -- db "Ensembl_name", tag str
    id SET OF Dbtag,

    -- set type: Assembly-Sets can be of 2 categories
    set-type INTEGER {
        full-assembly(0), -- full-assembly: set of asm-units
        assembly-set(100), -- set of full-assemblies
        -- stopper
        other(255)
    },

    -- 'class' defines the semantics of how to interpret this item
    class INTEGER {
        -- general type of full-assembly (not expected to be used)
        full-assembly(0),
        -- this assembly represents a single haploid assembly in its entirety
        haploid(1),
        -- this assembly has exactly one unit and at least one alt-loci
        haploid-with-alt-loci(2),
        -- assembly represents a diploid assembly; we expect at least two
        -- haploid-unit assemblies contained herein
        diploid(3),
        -- unresolved-diploid: single unit which includes the diploid sequences
        unresolved-diploid(4),
        -- second part of diploid; linked to another haploid full-assembly
        -- this one is alt-loci or second full-haploid
        alternate-haplotype(5),
        -- NOTE(review): no description was given for this value; presumably
        -- the pseudohaplotype counterpart of alternate-haplotype. Confirm.
        alternate-pseudohaplotype(6),
        -- assembly is a collection for annotation
        annotation-target-set(101),
        -- analysis set used for sequencing by alignments
        analysis-set(102),
        -- stopper
        other(255)
    },

    -- descriptors live in a shared data block
    desc GC-AssemblyDesc,

    -- we contain hierarchically a set of assemblies:
    -- one mandatory primary assembly, then any number of additional ones
    primary-assembly GC-Assembly,
    more-assemblies SET OF GC-Assembly OPTIONAL,
    -- statistics
    -- "stats" field holds stats for all chromosomes combined:
    stats GC-SequenceStats OPTIONAL

}

-- GC-TaggedSequences: set of sequences in a specific role.
-- "state" tags the whole set with one of the values below; "seqs" holds
-- the sequences themselves. Both fields are mandatory.
GC-TaggedSequences ::= SEQUENCE
{
    -- NOTE: the numbering skips 5 (aligned is 4, bits is 6).
    state  INTEGER {
        not-set(0),         -- error
        placed (1),         -- exist only within a replicon. placed sequences
                            -- on higher sequence
        unlocalized (2) ,   -- exist only within a replicon. "random" on a
                            -- given chromosome
        unplaced (3),       -- exist only on primary/diploid assembly-unit.
                            -- unknown chromosome
        aligned (4),        -- exist only on alt-loci/patch units. List all
                            -- aligned (eg cross-placed) sequences.
                            -- (unaligned are listed as unlocalized).
        bits (6)            -- exist only on primary/diploid unit. if need to
                            -- report low-level contigs
    },
    seqs SET OF GC-Sequence
}

-- GC-AssemblyUnit: a single assembly unit, the "unit" alternative of
-- GC-Assembly. Mandatory fields are id, class and desc; molecules,
-- other sequences and all statistics are optional.
GC-AssemblyUnit ::= SEQUENCE
{
    -- The identifier of this assembly
    -- Contains: GenColl internal id, GenColl accession.version,
    --            AGP id, UCSC name, Ensembl name
    -- Types of DbTags used:
    -- db "GenColl", tag id
    -- db "GenColl", tag str
    -- db "AGP", tag id
    -- db "GenColl_Chain", tag id
    -- "Ensembl_name" and "UCSC_name" will not appear in the Unit level since
    -- these organizations do not define assembly-units.
    id SET OF Dbtag,

    -- 'class' defines the semantics of how to interpret this item
    class INTEGER {
        --  units for haploid/diploid assemblies
        haploid-unit(1),
        -- this unit represents one or more alternate loci for a haploid
        -- assembly
        alt-loci(2),
        -- assembly-patch
        assembly-patch(3),
        -- stopper
        other(255)
    },

    -- descriptors live in a shared data block
    desc GC-AssemblyDesc,

    -- collections of molecules for this assembly
    mols SET OF GC-Replicon OPTIONAL,

    -- On primary assembly-unit: here will be the unplaced sequences
    -- On alt-loci: list of sequences aligned/unaligned to primary unit
    other-sequences SET OF GC-TaggedSequences  OPTIONAL,

    -- statistics
    -- "stats" field holds stats for all chromosomes combined:
    --     ordered/unordered scaffolds
    -- "unplaced-stats" holds stats for ChrUn which is omitted from "stats"
    -- NOTE(review): "unplaced-unlocalized-stats" had no description;
    -- presumably stats over unplaced plus unlocalized scaffolds. Confirm.
    stats GC-SequenceStats OPTIONAL,
    unplaced-stats SET OF GC-Scaf-stats OPTIONAL,
    unplaced-unlocalized-stats SET OF GC-Scaf-stats OPTIONAL

}

-- GC-AssemblyDesc: descriptor block used by both GC-AssemblySet and
-- GC-AssemblyUnit (their "desc" field). Every field is OPTIONAL except
-- release-level, which carries DEFAULT major, so an empty value is legal.
GC-AssemblyDesc ::= SEQUENCE {
    -- project ID for this genome
    -- this is the ID for this assembly and may reflect the submitter/source
    projects SET OF GC-Project OPTIONAL,

    -- Names of the assembly
    -- (long-name is a UTF8String; the other names are VisibleString)
    name VisibleString OPTIONAL,
    submitter-name VisibleString OPTIONAL,
    display-name VisibleString OPTIONAL,
    long-name UTF8String OPTIONAL,
    filesafe-name VisibleString OPTIONAL,
    wgs-acc-prefix VisibleString OPTIONAL,
    wgs-acc-number INTEGER OPTIONAL,

    -- release type: RefSeq / GenBank.
    release-type INTEGER {
        genbank (1),
        refseq(2)
    } OPTIONAL,

    -- release status (numeric values have gaps for possible additional
    -- states)
    release-status INTEGER {
        new(0),
        gpipe(5),
        public(10),
        suppressed(15),
        hup(100),
        withdrawn(105)
    } OPTIONAL,

    -- in alt-loci units - contain alignment of this sequence to the primary
    -- unit
    annot SET OF Seq-annot OPTIONAL,

    -- Synonyms: Other releases of same assembly
    synonyms   SET OF GC-DbTagAlias OPTIONAL,

    -- Submitter release date
    submitter-date Date OPTIONAL,

    -- Various attributes assigned at this level:
    -- biosrc, comments, publications...
    --
    -- Special user-objects:
    -- Internal identifiers (GCOL-1178):  type  str ="gencoll-misc"
    --              data : 2 user fields :
    --              label  str "asm-name" str
    --              label  str "asm-id" int
    -- ftp-sites:  type  str ="ftp-sites"
    --              data : one or 2 fields, depending on whether both GB
    --                     and RS have ftp-sites
    --              label  str : "genbank" or "refseq" str : path to ftp-site
    -- diffs between GB / RS assemblies: type str "diff-from-synonym",
    --         data label id  str

    descr Seq-descr OPTIONAL ,

    -- flag: is this assembly partial?
    -- **NOTE: not set = not known!!
    partial BOOLEAN OPTIONAL,

    -- level of coverage for this assembly
    -- **NOTE: not set = not known!!
    coverage REAL OPTIONAL,

    -- release level: most releases are major, which is why an absent
    -- value decodes as major (see DEFAULT below)
    release-level INTEGER {
        major(0),
        patch(1),
        minor(2),
        -- stopper
        other(255)
    } DEFAULT major,

    -- organization which submitted this assembly.
    -- populated from BioProject
    submitter-organization UTF8String OPTIONAL
}

-- GC-Replicon: one molecule of an assembly. It carries an optional
-- official name, an optional assembly-local name, and the mandatory
-- sequence or sequences that represent the molecule.
GC-Replicon ::= SEQUENCE
{
    -- name for this molecule
    name VisibleString OPTIONAL,  -- the official name,

    -- If the chromosome has a different name in this assembly.
    local-name VisibleString OPTIONAL,

    -- the sequence(s) representing this molecule
    -- in the case of 2L and 2R - the molecule is represented by
    -- several sequences, and there is no seq-id of "chr 2" as a whole.
    -- "single" is used for one sequence, "set" for several.
    sequence CHOICE {
        single GC-Sequence,
        set SET OF GC-Sequence
    }
}


-- Seq-id-alias
-- A sequence has multiple seq-ids (refseq, genbank, local), and each one
-- has both a gi and an accession.version.
-- We cannot provide them as simply a list of seq-ids, since it would be
-- unclear which gi goes with which accession; therefore we group the
-- related ones as an alias pair.

GC-SeqIdAlias ::= SEQUENCE {
    public Seq-id,   -- the refseq/genbank accession
    gpipe  Seq-id OPTIONAL,   -- the gpipe accession
    gi     Seq-id OPTIONAL,   -- optional since not all sequences have GIs
    -- Similarity: relationship between this synonym and the main seq-id of
    -- the GC-Sequence.
    -- Can be expanded to multiple levels of identity;
    -- a bitmap could be used to flag ambiguity between different levels.
    -- This field has neither OPTIONAL nor DEFAULT, so it is always present.
    similarity INTEGER  {
        unknown (0) ,
        identical (1) ,
        different (255)
    }
}

-- GC-External-Seqid: pairs a free-text "external" string with a Seq-id.
-- Used as the "external" alternative of GC-TypedSeqId.
-- NOTE(review): "external" presumably names the outside organization or
-- naming authority that the id belongs to; confirm against the producer.
GC-External-Seqid::= SEQUENCE {
    external VisibleString,
    id       Seq-id
}

-- GC-TypedSeqId: a sequence identifier tagged with its origin. The CHOICE
-- alternative that is selected says which kind of id this is.
-- Used as the element type of GC-Sequence.seq-id-synonyms.
GC-TypedSeqId ::= CHOICE {
    -- GenBank ids, grouped as an alias (accession, gpipe, gi)
    genbank  GC-SeqIdAlias,
    -- RefSeq ids, grouped the same way
    refseq   GC-SeqIdAlias,
    -- a bare Seq-id with no alias grouping
    private  Seq-id,
    -- a Seq-id paired with a free-text "external" string
    external GC-External-Seqid
}

-- sequence-role: what role(s) does this sequence have in the assembly.
-- A sequence may have more than one role, e.g. a complete-sequence is
-- all 4 roles.
-- pseudo-scaffold - this is for UCSC chr1_random etc
-- submitter-pseudo-scaffold - this is for FlyBase pseudo scaffolds.
-- Each value is a single role; multiple roles are expressed by listing
-- several values in GC-Sequence.roles (a SET OF GC-SequenceRole).
-- The numbering is not contiguous and does not use 0 or 1.
GC-SequenceRole ::= INTEGER {
        chromosome (2),
        scaffold (3),
        component(4),
        top-level (10),
        pseudo-scaffold (20),
        submitter-pseudo-scaffold (21)
}

-- GC-Sequence:
-- Used for scaffolds, scaffold-sets and components in Gencoll terminology.
-- Theoretically can support sequences described with more levels than
-- Chromosome / scaffold-set / scaffold / component.
-- The meaning of the GC-Sequence record - whether it is a component,
-- scaffold or scaffold-set - is defined by its context.
-- ??? DO WE NEED A FIELD TO SPECIFY WHAT LEVEL IT IS???
-- NOTE(review): the optional "roles" field below may already answer the
-- question above; confirm and then retire the question.
--
-- GC-Sequences are made of GC-Sequences: At the lowest level there is only one
-- seq-id of a component with no further structure.
-- (The nesting goes through "sequences", whose GC-TaggedSequences elements
-- contain further GC-Sequence values.)
-- Only "seq-id" is mandatory; every other field is OPTIONAL.
GC-Sequence ::= SEQUENCE {
    -- identifiers are: Local / gpipe-satellite / genbank / refseq
    -- local is merely local name (or maybe with WGS accession??)
    -- others are accession/ver/gi
    --
    -- Main identifier:
    -- we will report the one that matches the context of who asked.
    seq-id Seq-id,
    -- Other known identifiers: Local / gpipe-satellite / genbank / refseq
    seq-id-synonyms SET OF GC-TypedSeqId OPTIONAL,

    -- Various attributes assigned at this level:
    -- biosrc, comments, publications...
    descr Seq-descr OPTIONAL ,

    -- Feature annotation;
    -- Contains Pseudo Autosomal regions on chromosomes and scaffolds in the
    -- following format:
    --   Id =
    --   Desc (set of Annot-descr)
    --      Name: "pseudo autosomal region"
    --       Region (seq-loc): int (seq-interval)
    --          Id:
    --            From:
    --            To :
    --      Data: locs (set of seq-loc):
    --         List of int (seq-interval):
    --            Id: GI of scaffold
    --              From/to: position on scaffold that belongs to this PAR region.
    --
    --
    -- in alt-loci units - contain alignment of this sequence to the primary
    -- unit
    annot SET OF Seq-annot OPTIONAL,

    -- placed: populated both on chromosome and scaffold levels
    -- unlocalized: populated on chromosome level
    sequences SET OF GC-TaggedSequences  OPTIONAL,

    -- locations of ordered scaffolds/components
    structure Delta-ext OPTIONAL,

    -- statistics
    stats GC-SequenceStats OPTIONAL,

    -- kind of patch, using the same novel/fix wording as the patch
    -- counters in GC-Scaf-stats
    -- NOTE(review): presumably only set on sequences that belong to an
    -- assembly-patch unit; confirm.
    patch-type INTEGER {
        novel(0),
        fix(1),
        -- stopper
        other(255)
    } OPTIONAL,

    -- sequence-role: what role(s) does this sequence have in the assembly
    roles  SET OF GC-SequenceRole OPTIONAL
}


-- Stats of complex objects
-- Used at assembly and chromosome levels
-- Each field is a set of (category, value) pairs, see GC-Scaf-stats.
-- The first three sets are mandatory; the last three are OPTIONAL.
GC-SequenceStats ::= SEQUENCE {
    all-scaf        SET OF GC-Scaf-stats,
    ordered-scaf    SET OF GC-Scaf-stats,
    unordered-scaf  SET OF GC-Scaf-stats,
    unplaced-scaf   SET OF GC-Scaf-stats OPTIONAL, -- unplaced at full assembly level
    aligned-scaf    SET OF GC-Scaf-stats OPTIONAL, -- aligned at full assembly level
    unaligned-scaf  SET OF GC-Scaf-stats OPTIONAL -- unaligned at full assembly level
}

-- GC-Scaf-stats: one statistic, expressed as a category code plus an
-- integer value. Both fields are mandatory.
-- Several categories are yes/no flags encoded as 1/0 in "value", as noted
-- on the individual entries.
GC-Scaf-stats ::= SEQUENCE {
    -- NOTE: These values are equal to the stats_cd values in the
    -- CodeStatistics table in GenomeColl.
    -- The entries below are not listed in numeric order and the numbering
    -- has gaps; check the existing codes before assigning a new one.
    -- NOTE(review): the trailing "INTEGER OPTIONAL" remarks on some
    -- entries look like leftovers from an earlier field-based layout;
    -- confirm before removing them.
    stats-category  INTEGER {
        replicon-count (70),
        scaffold-count (22),
        component-count (23),-- How many components
        component-span-count (24),-- How many placements of components
        total-length (1),
        ungapped-length (2),
        min-gapped-scaf-length (25),
        max-gapped-scaf-length (26),
        min-ungapped-scaf-length (27),
        max-ungapped-scaf-length (28),
        active-finishing-bases (3),
        draft-bases (5),
        finished-bases (7),
        whole-genome-finishing-bases (9),
        other-sequences (11),
        pre-draft (13),
        wgs-bases (15),
        bases-in-spanned-gaps (17),
        n50 (19),
        spanned-gaps (20),
        unspanned-gaps (21),
        bases-in-unspanned-gaps (29),
        count-contig (30),  -- contig count
        contig-n50 (31),  -- contig N50
        contig-L50 (32),  -- contig L50
        contig-n75 (33),  -- contig N75
        contig-n90 (34),  -- contig N90
        scaf-L50 (35),  -- scaffold L50
        scaf-n75 (36),  -- scaffold N75
        scaf-n90 (37),  -- scaffold N90
        gc-count (38),  -- base counts for G,C
        atgc-count (39),  -- base count for A,T,G,C
        gc-perc (40),   -- gc-count / atgc-count as percentage.
        is-there-unplaced-scaf (72), -- 1 means there is unplaced scaffold. 0 means none
        count-singleton-unordered-scaffold (73),
        count-chr-made-of-singleton-ordered-scaffold (74),
        -- GCOL-675 - assembly release review stats. Meaningful only at
        -- assembly level.
        -- Do any of the chromosomes have a single component each?
        --   if yes, do all chromosomes have single component each?
        count-chr-with-single-component (75), -- INTEGER OPTIONAL,
        -- number of unplaced/unlocalized singleton scaffolds in minus orientation
        count-singleton-in-minus-orientation (76),-- INTEGER OPTIONAL,
        --chr-terminal-gap-types SET OF GC-ChrGapTypes OPTIONAL,
        -- are any gaps flagged as 'unknown' and how many? 0 means none.
        count-unknown-gaps (80), --  INTEGER OPTIONAL,
        -- 1.2c Scaffold source names: count implicit/explicit
        count-implicit-scaf-src-name (82), -- INTEGER OPTIONAL,
        count-explicit-scaf-src-name (83), -- INTEGER OPTIONAL
        all-chr-has-single-component (84), -- if 1 yes, 0 no. BOOLEAN OPTIONAL,
        -- Do any scaffolds have terminal gaps? How many?
        count-scaf-with-terminal-gaps (85), -- INTEGER OPTIONAL,
        -- Do any chromosomes have terminal gaps? what is the gap type for each?
        count-chr-with-terminal-gaps  (86), -- INTEGER OPTIONAL,
        -- Number of component GIs (uniq components). Is the count different
        -- from the count in the latest GenBank/Refseq release for the same
        -- assembly, by how many?
        count-uniq-components (87), --INTEGER OPTIONAL ,
        count-uniq-component-diff-from-last-release (88), --INTEGER OPTIONAL,
        count-chromosome-types (89), --INTEGER OPTIONAL,
        count-chromosome-terminal-gap-types (90), -- # of chromosome terminal gap types
        count-dropped-components (91), -- dropped component due to foreign screen
        count-non-chromosome-replicon (92), -- number of non-Chromosome replicons
        count-assembly-units (93), -- # of assembly-units
        count-alt-loci-units (94), -- # of alt-loci units
        count-fixed-patches (95),  -- # of fixed patches
        count-novel-patches (96),  -- # of novel patches
        count-regions (97),        -- # of regions
        count-patches (98),        -- # of patches
        count-par-regions (99),    -- # of PAR regions
        count-genomic-regions (100), -- # of genomic regions
        count-chromosome-replicons (101), -- # of chromosomal replicons
        -- Assembly status, carried in "value":
        --   1 - Contig only; 2 - Unplaced scaffolds only;
        --   3 - Some chromosomes assembled; 4 - all chromosomes assembled;
        --   5 - complete sequence genome;
        --   6 - unlocalized and unplaced scaffolds.
        assembly-status (102),
        -- In the two entries below, the numbers in parentheses appear to
        -- refer to stats-category codes in this list.
        net-count-scaffold (103),  -- number of scaffolds that are not also chromosomes (22 - 75)
        net-count-component (104), -- number of components that are not also chromosomes or scaffolds (30 - 75 - 73)
        count-regions-contain-alt-loci (105), -- number of regions containing alt-loci
        count-regions-contain-fix-patch (106), -- number of regions containing fix patch
        count-regions-contain-novel-patch (107), -- number of regions containing novel patch
        count-fix-patch-with-alignment (108), -- number of fix patch with alignment to primary assembly
        count-novel-patch-with-alignment (109), -- number of novel patch with alignment to primary assembly
        count-alt-scaf-with-alignment (110), -- number of alt scaffolds with alignment to primary assembly
        count-alt-loci-scaf (111), -- number of alt-loci scaffolds.
        count-real-scaffolds (112), -- number of scaffolds with gb_is_skipped = 0.
        top-level-count (113), -- Number of chromosomes or plasmids, unplaced/unlocalized scaffolds, alt-loci scaffolds, and patch scaffolds
        total-gap-length (114), -- Total length of gaps
        count-replicons-without-ordered-scaf (115), -- count of replicons without ordered scaffold

        other (255) -- catch all
    },
    -- the number measured for the category above
    value INTEGER
}



END