Go to the SVN repository for this file
Go to list of all specification files

--$Revision: 32084 $
--  Biological Macromolecule 3-D Structure Data Types for MMDB,
--                A Molecular Modeling Database
--  Definitions for a biomolecular assembly and the MMDB database
--  By Hitomi Ohkawa, Jim Ostell, Chris Hogue, and Steve Bryant 
--  National Center for Biotechnology Information
--  National Institutes of Health
--  Bethesda, MD 20894 USA
--  July 1995

-- Contents of the MMDB database are currently based on files distributed by
-- the Protein Data Bank, PDB.  These data are changed in form, as described
-- in this specification. To some extent they are also changed in content, in 
-- that many data items implicit in PDB are made explicit, and others are
-- corrected or omitted as a consequence of validation checks.  The semantics
-- of MMDB data items are indicated by comments within the specification below.
-- These comments explain in detail the manner in which data items from  PDB 
-- have been mapped into MMDB. 



EXPORTS Biostruc, Biostruc-id, Biostruc-set, Biostruc-annot-set,

IMPORTS Biostruc-graph, Biomol-descr, Residue-graph FROM MMDB-Chemical-graph 
	Biostruc-model FROM MMDB-Structural-model
	Biostruc-feature-set FROM MMDB-Features
	Date, Object-id, Dbtag FROM NCBI-General;

-- A structure report or "biostruc" describes the components of a biomolecular 
-- assembly in terms of their names and descriptions, and a chemical graph 
-- giving atomic formula, connectivity and chirality. It also gives one or more
-- three-dimensional model structures, literally a mapping of the atoms, 
-- residues and/or molecules of each component into a measured three-
-- dimensional space. Structure may also be described by named features, which 
-- associate nodes in the chemical graph, or regions in space, with text or 
-- numeric descriptors.

-- Note that a biostruc may also contain cross references to other databases,
-- including citations to relevant scientific literature. These cross 
-- references use object types from other NCBI data specifications, which are 
-- "imported" into MMDB, and not repeated in this specification. 

Biostruc ::= SEQUENCE {
	id			SEQUENCE OF Biostruc-id,
	descr			SEQUENCE OF Biostruc-descr OPTIONAL,
	chemical-graph		Biostruc-graph,
	features		SEQUENCE OF Biostruc-feature-set OPTIONAL,
	model			SEQUENCE OF Biostruc-model OPTIONAL }

-- A Biostruc-id is a collection identifiers for the molecular assembly.
-- Mmdb-id's are NCBI-assigned, and are intended to be unique and stable 
-- identifiers.  Other-id's are synonyms.

Biostruc-id ::= CHOICE {
	mmdb-id			Mmdb-id,
	other-database		Dbtag,
	local-id		Object-id }

Mmdb-id ::= INTEGER

-- The description of a biostruc refers to both the reported chemical and 
-- spatial structure of a biomolecular assembly.  PDB-derived descriptors
-- which refer specifically to the chemical components or spatial structure
-- are not provided here, but instead as descriptors of the biostruc-graph or 
-- biostruc-model. For PDB-derived structures the biostruc name is the PDB 
-- id-code.  PDB-derived citations appear as publications within the biostruc 
-- description, and include a data-submission citation derived from PDB AUTHOR 
-- records.  Citations are described using the NCBI Pub specification.

Biostruc-descr ::= CHOICE {
	name			VisibleString,
	pdb-comment		VisibleString,
	other-comment		VisibleString,
	history			Biostruc-history, 
	attribution		Pub }

-- The history of a biostruc indicates it's origin and it's update history
-- within MMDB, the NCBI-maintained molecular structure database.  

Biostruc-history ::= SEQUENCE {
	replaces		Biostruc-replace OPTIONAL,
	replaced-by		Biostruc-replace OPTIONAL,
	data-source		Biostruc-source OPTIONAL }

Biostruc-replace ::= SEQUENCE {
	id			Biostruc-id,
	date			Date }

-- The origin of a biostruc is a reference to another database.  PDB release 
-- date and PDB-assigned id codes are recorded here, as are the PDB-assigned 
-- entry date and replacement history.

Biostruc-source ::= SEQUENCE {
	name-of-database	VisibleString,
	version-of-database	CHOICE {
		release-date		Date,
		release-code		VisibleString } OPTIONAL,
	database-entry-id	Biostruc-id,
	database-entry-date	Date,
	database-entry-history	SEQUENCE OF VisibleString OPTIONAL}

-- A biostruc set is a means to collect ASN.1 data for many biostrucs in 
-- one file, as convenient for application programs.  The object type is not
-- inteded to imply similarity of the biostrucs grouped together.

Biostruc-set ::= SEQUENCE {
	id		SEQUENCE OF Biostruc-id OPTIONAL,
	descr		SEQUENCE OF Biostruc-descr OPTIONAL,
	biostrucs	SEQUENCE OF Biostruc }

-- A biostruc annotation set is a means to collect ASN.1 data for biostruc
-- features into one file. The object type is intended as a means to store 
-- feature annotation of similar type, such as "core" definitions for a 
-- threading program, or structure-structure alignments for a structure-
-- similarity browser.

Biostruc-annot-set ::= SEQUENCE {
	id		SEQUENCE OF Biostruc-id OPTIONAL,
	descr		SEQUENCE OF Biostruc-descr OPTIONAL,
	features	SEQUENCE OF Biostruc-feature-set }

-- A biostruc residue graph set is a collection of residue graphs.  The object
-- type is intended as a means to record dictionaries containing the chemical
-- subgraphs of "standard" residue types, which are used as a means to 
-- simplify discription of the covalent structure of a biomolecular assembly.
-- The standard residue graph dictionary supplied with the MMDB database 
-- contains 20 standard L amino acids and 8 standard ribonucleotide groups. 
-- These graphs are complete, including explicit hydrogen atoms and separate 
-- instances for the terminal polypeptide and polynucleotide residues. 

Biostruc-residue-graph-set ::= SEQUENCE {
	id			SEQUENCE OF Biostruc-id OPTIONAL,
	descr			SEQUENCE OF Biomol-descr OPTIONAL,
	residue-graphs		SEQUENCE OF Residue-graph }


--  Biological Macromolecule 3-D Structure Data Types for MMDB,
--                A Molecular Modeling Database
--  Definitions for a chemical graph
--  By Hitomi Ohkawa, Jim Ostell, Chris Hogue and Steve Bryant 
--  National Center for Biotechnology Information
--  National Institutes of Health
--  Bethesda, MD 20894 USA
--  July, 1995

MMDB-Chemical-graph DEFINITIONS ::=


EXPORTS Biostruc-graph, Biomol-descr, Residue-graph,
	Molecule-id, PCSubstance-id, Residue-id, Atom-id;

	BioSource FROM NCBI-BioSource
	Seq-id FROM NCBI-Seqloc
	Biostruc-id FROM MMDB;

-- A biostruc graph contains the complete chemical graph of the biomolecular 
-- assembly.  The assembly graph is defined hierarchically, in terms of 
-- subgraphs graphs of component molecules.  For PDB-derived biostrucs,
-- the molecules forming the assembly are the individual biopolymer chains and 
-- any non-polymer or "heterogen" groups which are present. 

-- The PDB-derived  "compound name" field appears as the name within the
-- biostruc-graph description.  PDB "class" and "source" fields appear as 
-- explicit attributes.  PDB-derived structures are assigned an assembly type 
-- of "other" unless they have been further classified as the "physiological
-- form" or "crystallographic cell" contents.  If they have, the source of the 
-- type classification appears as a citation within the  assembly description. 

-- Note that the biostruc-graph also includes as literals the subgraphs of 
-- any nonstandard residues present within it. For PDB-derived biostrucs these 
-- subgraphs are constructed automatically, with validation as described below.

Biostruc-graph ::= SEQUENCE {
	descr			SEQUENCE OF Biomol-descr OPTIONAL,
	molecule-graphs		SEQUENCE OF Molecule-graph,
	inter-molecule-bonds	SEQUENCE OF Inter-residue-bond OPTIONAL,
	residue-graphs		SEQUENCE OF Residue-graph OPTIONAL }

-- A biomolecule description refers to the chemical structure of a molecule or 
-- component substructures.  This descriptor type is used at the level of
-- assemblies, molecules and residues, and also for residue-graph dictionaries.
-- The BioSource object type is drawn from NCBI taxonomy data specifications,
-- and is not repeated here.

Biomol-descr ::= CHOICE {
	name			VisibleString,
	pdb-class		VisibleString,
	pdb-source		VisibleString,
	pdb-comment		VisibleString,
	other-comment		VisibleString,
	organism		BioSource,
	attribution		Pub,
	assembly-type		INTEGER {	physiological-form(1),
						other(255) },
	molecule-type		INTEGER {	dna(1),
						other(255) } }

-- A molecule chemical graph is defined by a sequence of residues.  Nonpolymers
-- are described in the same way, but may contain only a single residue.  

-- Biopolymer molecules are identified within PDB entries according to their
-- appearance on SEQRES records, which formally define a biopolymer as such. 
-- Biopolymers are defined by the distinction between ATOM and HETATM 
-- coordinate records only in cases where the chemical sequence from SEQRES
-- is in conflict with coordinate data. The PDB-assigned chain code appears as 
-- the name within the molecule descriptions of the biopolymers.

-- Nonpolymer molecules from PDB correspond to individual HETEROGEN groups, 
-- excluding any HETEROGEN groups which represent modified biopolymer residues.
-- These molecules are named according to the chain, residue type and residue 
-- number fields as assigned by PDB. Any description appearing in the PDB HET 
-- record appears as a pdb-comment within the molecule description. 

-- Molecule types for PDB-derived molecule graphs are assigned by matching 
-- residue and atom names against the PDB-documented standard types for protein,
-- DNA and RNA, and against residue codes commonly used to indicate solvent.
-- Classification is by "majority rule". If more than half of the residues in
-- a biopolymer are standard groups of one type, then the molecule is of that 
-- type, and otherwise classified as "other". Note that this classification does
-- not preclude the presence of modified residues, but insists they constitute 
-- less than half the biopolymer. Non-polymers are classified only as "solvent"
-- or "other".  

-- Note that a molecule graph may also contain a set of cross references 
-- to biopolymer sequence databases.  All biopolymer molecules in MMDB contain 
-- appropriate identifiers for the corresponding entry in the NCBI-Sequences 
-- database, in particular the NCBI "gi" number, which may be used for sequence
-- retrieval. The Seq-id object type is defined in the NCBI molecular sequence 
-- specification, and not repeated here.

Molecule-graph ::= SEQUENCE {
	id			Molecule-id,
	descr			SEQUENCE OF Biomol-descr OPTIONAL,
	seq-id			Seq-id OPTIONAL,
	residue-sequence	SEQUENCE OF Residue,
	inter-residue-bonds	SEQUENCE OF Inter-residue-bond OPTIONAL, 
	sid                     PCSubstance-id OPTIONAL }
Molecule-id ::= INTEGER

-- Pubchem substance id

PCSubstance-id ::= INTEGER

-- Residues may be assigned a text-string name as well as an id number. PDB 
-- assigned residue numbers appear as the residue name.

Residue ::= SEQUENCE {
	id			Residue-id,
	name			VisibleString OPTIONAL,
	residue-graph		Residue-graph-pntr }

Residue-id ::= INTEGER

-- Residue graphs from different sources may be referenced within a molecule
-- graph.  The allowed choices are the nonstandard residue graphs included in 
-- the present biostruc, residue graphs within other biostrucs, or residue 
-- graphs within tables of standard residue definitions.

Residue-graph-pntr ::= CHOICE {
	local			Residue-graph-id,
	biostruc		Biostruc-graph-pntr,
	standard		Biostruc-residue-graph-set-pntr }
Biostruc-graph-pntr ::= SEQUENCE {
	biostruc-id		Biostruc-id,
	residue-graph-id	Residue-graph-id }

Biostruc-residue-graph-set-pntr ::= SEQUENCE {
	biostruc-residue-graph-set-id	Biostruc-id,
	residue-graph-id		Residue-graph-id } 

-- Residue graphs define atomic formulae, connectivity, chirality, and names.
-- For standard residue graphs from the MMDB dictionary the PDB-assigned 
-- residue-type code appears as the name within the residue graph description,
-- and the full trivial name of the residue as a comment within that 
-- description.  For any nonstandard residue graphs provided with an MMDB 
-- biostruc the PDB-assigned residue-type code similarly appears as the name 
-- within the description, and any information provided on PDB HET records as 
-- a pdb-comment within that description.  

-- Note that nonstandard residue graphs for a PDB-derived biostruc may be 
-- incomplete. Current PDB format cannot represent connectivity for groups 
-- which are disordered, and for which no coordinates are given.  In these 
-- cases the residue graph defined in MMDB represents only the subgraph that 
-- could be identified from available ATOM, HETATM and CONECT records.

Residue-graph ::= SEQUENCE {
	id			Residue-graph-id,
	descr			SEQUENCE OF Biomol-descr OPTIONAL,
	residue-type		INTEGER {	deoxyribonucleotide(1),
						other(255) } OPTIONAL,
	iupac-code		SEQUENCE OF VisibleString OPTIONAL,
	atoms			SEQUENCE OF Atom,
	bonds			SEQUENCE OF Intra-residue-bond,
	chiral-centers		SEQUENCE OF Chiral-center OPTIONAL }
Residue-graph-id ::= INTEGER

-- Atoms in residue graphs are defined by elemental symbols and names.  PDB-
-- assigned atom names appear here in the name field, except in cases of known 
-- PDB synonyms.  In these cases atom names are mapped to the names used in the
-- MMDB standard dictionary. This occurs primarily for hydrogen atoms, where 
-- PDB practice allows synonyms for several atom types.  For PDB atoms the 
-- elemental symbol is obtained by parsing the PDB atom name field, allowing 
-- for known special-semantics cases where the atom name does not follow the
-- documented encoding rule.  Ionizable protons are identified within standard 
-- residue graphs in the MMDB dictionary, but not within automatically-defined
-- nonstandard graphs.

Atom ::= SEQUENCE {
	id			Atom-id,
	name			VisibleString OPTIONAL,
	iupac-code		SEQUENCE OF VisibleString OPTIONAL,
	element			ENUMERATED {
				h(1),   he(2),  li(3),  be(4),  b(5), 
				c(6),   n(7),   o(8),   f(9),   ne(10), 
				na(11), mg(12), al(13), si(14), p(15), 
				s(16),  cl(17), ar(18), k(19),  ca(20), 
				sc(21), ti(22), v(23),  cr(24), mn(25), 
				fe(26), co(27), ni(28), cu(29), zn(30), 
				ga(31), ge(32), as(33), se(34), br(35), 
				kr(36), rb(37), sr(38), y(39),  zr(40),
				nb(41), mo(42), tc(43), ru(44), rh(45),
				pd(46), ag(47), cd(48), in(49), sn(50),
				sb(51), te(52), i(53),  xe(54), cs(55),
				ba(56), la(57), ce(58), pr(59), nd(60),
				pm(61), sm(62), eu(63), gd(64), tb(65),
				dy(66), ho(67), er(68), tm(69), yb(70),
				lu(71), hf(72), ta(73), w(74),  re(75),
				os(76), ir(77), pt(78), au(79), hg(80),
				tl(81), pb(82), bi(83), po(84), at(85),
				rn(86), fr(87), ra(88), ac(89), th(90),
				pa(91), u(92),  np(93), pu(94), am(95),
				cm(96), bk(97), cf(98), es(99), 
				fm(100), md(101), no(102), lr(103),
				other(254), unknown(255) },
	ionizable-proton	ENUMERATED {
					unknown(255) } OPTIONAL }
Atom-id ::= INTEGER

-- Intra-residue-bond specifies connectivity between atoms in Residue-graph.
-- Unlike Inter-residue-bond defined later, its participating atoms are part of
-- a residue subgraph dictionary, not part of a specific biostruc-graph.

-- For residue graphs in the standard MMDB dictionary bonds are defined from
-- the known chemical structures of amino acids and nucleotides.  For 
-- nonstandard residue graphs bonds are defined from PDB CONECT records, with
-- validation for consistency with coordinate data, and from stereochemical
-- calculation to identify unreported bonds.  Validation and bond identification
-- are based on comparison of inter-atomic distances to the sum of covalent
-- radii for the corresponding elements. 

Intra-residue-bond ::= SEQUENCE {
	atom-id-1		Atom-id,
	atom-id-2		Atom-id,
	bond-order		INTEGER {
					unknown(255)} OPTIONAL }

-- Chiral centers are atoms with tetrahedral geometry.  Chirality is defined
-- by a chiral volume involving the chiral center and 3 other atoms bonded to 
-- it.  For any coordinates assigned to atoms c, n1, n2, and n3, the vector 
-- triple product (n1-c) dot ( (n2-c) cross (n3-c) ) must have the indicated
-- sign.  The calculation assumes an orthogonal right-handed coordinate system
-- as is used for MMDB model structures.  

-- Chirality is defined for standard residues in the MMDB dictionary, but is 
-- not assigned automatically for PDB-derived nonstandard residues. If assigned
-- for nonstandard residues, the source of chirality information is described 
-- by a citation within the residue description.

Chiral-center ::= SEQUENCE {
	c			Atom-id,
	n1			Atom-id,
	n2			Atom-id,
	n3			Atom-id,
	sign			ENUMERATED { positive(1),
					     negative(2) } }

-- Inter-residue bonds are defined by a reference to two atoms. For PDB-derived 
-- structures bonds are identified from biopolymer connectivity according to
-- SEQRES and from other connectivity information on SSBOND and CONECT 
-- records. These data are validated and unreported bonds identified by
-- stereochemical calculation, using the same criteria as for intra-residue 
-- bonds.

Inter-residue-bond ::= SEQUENCE {
	atom-id-1		Atom-pntr,
	atom-id-2		Atom-pntr,
	bond-order		INTEGER {
					unknown(255)} OPTIONAL }

-- Atoms, residues and molecules within the current biostruc are referenced 
-- by hierarchical pointers.

Atom-pntr ::= SEQUENCE {
	molecule-id		Molecule-id,
	residue-id		Residue-id,
	atom-id			Atom-id }

Atom-pntr-set ::= SEQUENCE OF Atom-pntr