-- src/objects/scoremat/scoremat.asn
--
-- Go to the SVN repository for this file
-- Go to list of all specification files

--$Id: scoremat.asn 98157 2022-10-05 15:42:23Z lanczyck $
-- ===========================================================================
--
--                            PUBLIC DOMAIN NOTICE
--               National Center for Biotechnology Information
--
--  This software/database is a "United States Government Work" under the
--  terms of the United States Copyright Act.  It was written as part of
--  the author's official duties as a United States Government employee and
--  thus cannot be copyrighted.  This software/database is freely available
--  to the public for use. The National Library of Medicine and the U.S.
--  Government have not placed any restriction on its use or reproduction.
--
--  Although all reasonable efforts have been taken to ensure the accuracy
--  and reliability of the software and data, the NLM and the U.S.
--  Government do not and cannot warrant the performance or results that
--  may be obtained by using this software or data. The NLM and the U.S.
--  Government disclaim all warranties, express or implied, including
--  warranties of performance, merchantability or fitness for any particular
--  purpose.
--
--  Please cite the author in any work or product based on this material.
--
-- ===========================================================================
--
-- Author:  Christiam Camacho
--
-- File Description:
--      ASN.1 definitions for scoring matrix
--
-- ===========================================================================

NCBI-ScoreMat DEFINITIONS ::= BEGIN

EXPORTS    Pssm, PssmIntermediateData, PssmFinalData, 
           PssmParameters, PssmWithParameters;
    
IMPORTS    Object-id   FROM NCBI-General
           Seq-entry   FROM NCBI-Seqset;

-- a rudimentary block/core-model, to be used with block-based alignment 
-- routines and threading

BlockProperty ::= SEQUENCE {
  -- kind of property being attached to a block
  type      INTEGER {
    unassigned  (0),
    threshold   (1),     -- score threshold for heuristics
    minscore    (2),     -- observed minimum score in CD
    maxscore    (3),     -- observed maximum score in CD
    meanscore   (4),     -- observed mean score in CD
    variance    (5),     -- observed score variance
    name        (10),    -- just name the block
    is-optional (20),    -- block may not have to be used
    other       (255)
  },

  -- optional integer value carried by the property
  intvalue  INTEGER OPTIONAL,

  -- optional text value carried by the property
  textvalue VisibleString OPTIONAL
}

CoreBlock ::= SEQUENCE {
  -- begin of block on query
  start     INTEGER,

  -- end of block on query
  stop      INTEGER,

  -- optional N-terminal extension
  minstart  INTEGER OPTIONAL,

  -- optional C-terminal extension
  maxstop   INTEGER OPTIONAL,

  -- optional list of properties attached to this block
  property  SEQUENCE OF BlockProperty OPTIONAL
}

LoopConstraint ::= SEQUENCE {
  -- minimum length of unaligned region
  minlength  INTEGER DEFAULT 0,

  -- maximum length of unaligned region
  maxlength  INTEGER DEFAULT 100000
}

CoreDef ::= SEQUENCE {
  -- number of core elements/blocks
  nblocks          INTEGER,

  -- nblocks locations
  blocks           SEQUENCE OF CoreBlock,

  -- (nblocks+1) constraints
  loops            SEQUENCE OF LoopConstraint,

  -- is it a discontinuous domain
  isDiscontinuous  BOOLEAN OPTIONAL,

  -- positions of long insertions
  insertions       SEQUENCE OF INTEGER OPTIONAL
}

Site-annot ::= SEQUENCE {
  -- location of the annotation: start and stop position in the PSSM
  startPosition  INTEGER,
  stopPosition   INTEGER,

  -- holds description or names, that can be used for labels in
  -- visualization
  description    VisibleString OPTIONAL,

  -- type of the annotated feature, similarly to Align-annot in NCBI-Cdd
  type           INTEGER OPTIONAL,

  -- additional names for the annotation
  aliases        SEQUENCE OF VisibleString OPTIONAL,

  -- motif to validate mapping of sites
  motif          VisibleString OPTIONAL,

  -- how the motif is used:
  --   0 for validation
  --   1 for motif in seqloc
  --   2 for multiple motifs in seqloc
  motifuse       INTEGER OPTIONAL
}

-- A list of Site-annot entries
Site-annot-set ::= SEQUENCE OF Site-annot

-- ===========================================================================
-- PSI-BLAST, formatrpsdb, RPS-BLAST workflow:
-- ===========================================
--
-- Two possible inputs to PSI-BLAST and formatrpsdb:
-- 1) PssmWithParameters where pssm field contains intermediate PSSM data
--    (matrix of frequency ratios)
-- 2) PssmWithParameters where pssm field contains final PSSM data (matrix of
--    scores and statistical parameters) - such as written by cddumper
--
-- In case 1, PSI-BLAST's PSSM engine is invoked to create the PSSM and perform
-- the PSI-BLAST search or build the PSSM to then build the RPS-BLAST database.
-- In case 2, PSI-BLAST's PSSM engine is not invoked and the matrix of scores
-- and statistical parameters are used to perform the search in PSI-BLAST and
-- the same data and the data in PssmWithParameters::params::rpsdbparams is
-- used to build the PSSM and ultimately the RPS-BLAST database
--
--
--                 reads    ++++++++++++++ writes
-- PssmWithParameters ===>  + PSI-BLAST  + =====> PssmWithParameters
--                          ++++++++++++++             |  ^
--         ^                                           |  |
--         |                                           |  |
--         +===========================================+  |
--                                                     |  |
--         +===========================================+  |
--         |                                              |
-- reads   |                                              | 
--         v                                              |
--  +++++++++++++++ writes +++++++++++++++++++++++        |
--  | formatrpsdb | =====> | RPS-BLAST databases |        |
--  +++++++++++++++        +++++++++++++++++++++++        |
--                                   ^                    |
--                                   |                    |
--                                   | reads              |
--                             +++++++++++++              |
--                             | RPS-BLAST |              |
--                             +++++++++++++              |
--                                                        |
--       reads  ++++++++++++               writes         |
--  Cdd ======> | cddumper | =============================+
--              ++++++++++++
--
-- ===========================================================================

-- Final form of a PSSM: its scores together with the associated statistical
-- parameters. The scores must be stored with the same dimensions and in the
-- same order as specified by Pssm::numRows, Pssm::numColumns, and
-- Pssm::byRow
PssmFinalData ::= SEQUENCE {

    -- the scores of the PSSM
    scores              SEQUENCE OF INTEGER,

    -- Karlin & Altschul parameter obtained while the PSSM was calculated
    lambda              REAL,

    -- Karlin & Altschul parameter obtained while the PSSM was calculated
    kappa               REAL,

    -- Karlin & Altschul parameter obtained while the PSSM was calculated
    h                   REAL,

    -- Factor by which the scores are scaled in order to gain precision when
    -- the PSSM is built. PSI-BLAST's PSSM engine by default generates PSSMs
    -- that are not scaled-up; however, when PSI-BLAST is given a scaled-up
    -- PSSM (indicated by a scalingFactor greater than 1), it scales the PSSM
    -- down to perform the initial stages of the search with it.
    -- N.B.: When formatrpsdb is provided scaled-up PSSMs to build RPS-BLAST
    -- databases, it ensures that all PSSMs used to build the RPS-BLAST
    -- database are scaled by the same factor (otherwise, RPS-BLAST will
    -- silently produce incorrect results).
    scalingFactor       INTEGER DEFAULT 1,

    -- Karlin & Altschul parameter obtained while the PSSM was calculated
    lambdaUngapped      REAL OPTIONAL,

    -- Karlin & Altschul parameter obtained while the PSSM was calculated
    kappaUngapped       REAL OPTIONAL,

    -- Karlin & Altschul parameter obtained while the PSSM was calculated
    hUngapped           REAL OPTIONAL,

    -- word score threshold
    wordScoreThreshold  REAL OPTIONAL
}

-- Intermediate data from which the PSSM's scores and statistical parameters
-- are created. The values must be stored with the same dimensions and in
-- the same order as specified by Pssm::numRows, Pssm::numColumns, and
-- Pssm::byRow
PssmIntermediateData ::= SEQUENCE {

    -- Observed residue frequencies (or counts) for each position of the
    -- PSSM, before pseudocounts are applied
    resFreqsPerPos          SEQUENCE OF INTEGER OPTIONAL,

    -- Weighted observed residue frequencies for each position of the PSSM
    -- (N.B.: each position's weights should add up to 1.0).
    -- Corresponds to f_i (f sub i) in equation 2 of
    -- Nucleic Acids Res. 2001 Jul 15;29(14):2994-3005.
    -- NOTE: only needed for diagnostics information (i.e.: the
    -- -out_ascii_pssm option in psiblast)
    weightedResFreqsPerPos  SEQUENCE OF REAL OPTIONAL,

    -- Frequency ratios of the PSSM
    freqRatios              SEQUENCE OF REAL,

    -- Information content for each position of the PSSM
    -- NOTE: only needed for diagnostics information (i.e.: the
    -- -out_ascii_pssm option in psiblast)
    informationContent      SEQUENCE OF REAL OPTIONAL,

    -- Relative weight for columns of the PSSM without gaps to pseudocounts
    -- NOTE: only needed for diagnostics information (i.e.: the
    -- -out_ascii_pssm option in psiblast)
    gaplessColumnWeights    SEQUENCE OF REAL OPTIONAL,

    -- Used in sequence weights computation
    -- NOTE: only needed for diagnostics information (i.e.: the
    -- -out_ascii_pssm option in psiblast)
    sigma                   SEQUENCE OF REAL OPTIONAL,

    -- Length of the aligned regions for each position of the query sequence
    -- NOTE: only needed for diagnostics information (i.e.: the
    -- -out_ascii_pssm option in psiblast)
    intervalSizes           SEQUENCE OF INTEGER OPTIONAL,

    -- Number of matching sequences for each position of the PSSM (including
    -- the query)
    -- NOTE: only needed for diagnostics information (i.e.: the
    -- -out_ascii_pssm option in psiblast)
    numMatchingSeqs         SEQUENCE OF INTEGER OPTIONAL,

    -- Number of independent observations for each position of the PSSM
    -- NOTE: needed for building CDD database for DELTA-BLAST
    numIndeptObsr           SEQUENCE OF REAL OPTIONAL
}

-- Position-specific scoring matrix
--
-- Columns of the PSSM correspond to positions of the query/master sequence,
-- i.e. the number of columns (N) equals the length of the query/master
-- sequence.
-- Rows correspond to individual amino acid types, i.e. the number of rows
-- (M) equals the number of different residues in the alphabet we use.
-- Consequently, row labels are amino acid identifiers.
--
-- PSSMs are stored as linear arrays of integers. The default layout is
-- column-by-column: M values for the first column, then M values for the
-- second column, and so on. To give external applications flexibility, the
-- boolean field "byRow" is provided to specify the storage order.
Pssm ::= SEQUENCE {

    -- Is this a protein or nucleotide scoring matrix?
    isProtein         BOOLEAN DEFAULT TRUE,

    -- PSSM identifier
    identifier        Object-id OPTIONAL,

    -- The dimensions of the matrix are returned so that the client can
    -- verify that all data was received.

    -- number of rows
    numRows           INTEGER,

    -- number of columns
    numColumns        INTEGER,

    -- rowLabels notes the order of residue types so that it can be
    -- cross-checked between applications.
    -- When this field is not given, the matrix values are presented in the
    -- order of the alphabet: ncbistdaa is used for protein, ncbi4na for
    -- nucl.
    -- For proteins the values returned correspond to
    -- (-,-), (-,A), (-,B), (-,C) ... (A,-), (A,A), (A,B), (A,C) ...
    rowLabels         SEQUENCE OF VisibleString OPTIONAL,

    -- are matrices stored row by row?
    byRow             BOOLEAN DEFAULT FALSE,

    -- PSSM representative sequence (master)
    query             Seq-entry OPTIONAL,

    -- intermediateData and finalData may both be provided, but at least one
    -- of them must be provided.
    -- N.B.: by default PSI-BLAST will return the PSSM in its
    -- PssmIntermediateData representation.

    -- Intermediate data for the PSSM
    intermediateData  PssmIntermediateData OPTIONAL,

    -- Final representation for the PSSM
    finalData         PssmFinalData OPTIONAL
}

-- Parameters set at creation time of the PSSM; this structure is used to
-- create the RPS-BLAST database auxiliary file (*.aux).
-- In addition, formatrpsdb uses the matrixName field to build a PSSM from a
-- Pssm structure which only contains PssmIntermediateData.
FormatRpsDbParameters ::= SEQUENCE {

    -- name of the underlying score matrix whose frequency ratios were used
    -- in PSSM construction (e.g.: BLOSUM62)
    matrixName  VisibleString,

    -- gap opening penalty corresponding to the matrix above
    gapOpen     INTEGER OPTIONAL,

    -- gap extension penalty corresponding to the matrix above
    gapExtend   INTEGER OPTIONAL
}

-- Populated by the PSSM engine of PSI-BLAST; the original source for these
-- values are the PSI-BLAST options specified using the BLAST options API
PssmParameters ::= SEQUENCE {

    -- pseudocount constant used for the PSSM. Corresponds to beta in
    -- equation 2 of Nucleic Acids Res. 2001 Jul 15;29(14):2994-3005.
    pseudocount              INTEGER OPTIONAL,

    -- data needed by formatrpsdb to create RPS-BLAST databases. matrixName
    -- is populated by PSI-BLAST
    rpsdbparams              FormatRpsDbParameters OPTIONAL,

    -- alignment constraints needed by the sequence-structure threader and
    -- other global or local block-alignment algorithms
    constraints              CoreDef OPTIONAL,

    -- bit score threshold for specific conserved domain hits
    bitScoreThresh           REAL OPTIONAL,

    -- bit score threshold for reporting any conserved domain hits
    bitScoreReportingThresh  REAL OPTIONAL,

    -- conserved functional sites with annotations
    annotatedSites           Site-annot-set OPTIONAL
}

-- Envelope holding a PSSM together with the parameters used to create it.
-- Provided for use in PSI-BLAST, formatrpsdb, and for the structure group.
PssmWithParameters ::= SEQUENCE {

    -- Applicable to PSI-BLAST and formatrpsdb.
    -- If this field provides both the intermediate and the final PSSM data,
    -- the final data (matrix of scores and associated statistical
    -- parameters) takes precedence and is the data used for further
    -- processing. The rationale is that the PSSM's scores and statistical
    -- parameters might have been calculated by other applications, and it
    -- might not be possible to recreate them with PSI-BLAST's PSSM engine.
    pssm    Pssm,

    -- The rpsdbparams member of this field specifies the values of options
    -- for processing by formatrpsdb. If these are not set, the command line
    -- defaults of formatrpsdb are applied. PSI-BLAST uses this field to
    -- verify that the underlying score matrix used to build the PSSM is the
    -- same as the one being specified through the BLAST Options API. If
    -- this field is omitted, no verification will be performed, so be
    -- careful to keep track of what matrix was used to build the PSSM or
    -- else the results produced by PSI-BLAST will be unreliable.
    params  PssmParameters OPTIONAL
}

END