Source code for bayes_logistic.bayes_logistic

# Copyright (c) 2015 MaxPoint Interactive, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
# following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following
#    disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following
#    disclaimer in the documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products
#    derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
# INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


from __future__ import absolute_import, print_function
import numpy as np
from scipy.optimize import minimize
from scipy.stats import norm

def logistic_prob(X, w):
    """ MAP (Bayes point) logistic regression probability with overflow prevention via exponent truncation

    Parameters
    ----------
    X : array-like, shape (N, p)
        Feature matrix
    w : array-like, shape (p, )
        Parameter vector

    Returns
    -------

    pr : array-like, shape (N, )
        vector of logistic regression probabilities

    References
    ----------
    Chapter 8 of Murphy, K. 'Machine Learning a Probabilistic Perspective', MIT Press (2012)
    Chapter 4 of Bishop, C. 'Pattern Recognition and Machine Learning', Springer (2006)

    """

    # set a truncation exponent.
    trunc = 8.  # exp(8)/(1+exp(8)) = 0.9997 which is close enough to 1 as to not matter in most cases.

    # calculate argument of logit
    z = np.dot(X, w)

    # truncate to avoid numerical over/underflow
    z = np.clip(z, -trunc, trunc)

    # calculate logitstic probability
    pr = np.exp(z)
    pr = pr / (1. + pr)

    return pr


def f_log_posterior(w, wprior, H, y, X, weights=None):
    """Returns negative log posterior probability.

    Parameters
    ----------
    w : array-like, shape (p, )
        vector of parameters at which the negative log posterior is to be evaluated
    wprior : array-like, shape (p, )
        vector of prior means on the parameters to be fit
    H : array-like, shape (p, p) or (p, )
        Array of prior Hessian (inverse covariance of prior distribution of parameters)
    y : array-like, shape (N, )
        vector of binary ({0,1} responses)
    X : array-like, shape (N, p)
        array of features
    weights : array-like, shape (N, )
        vector of data point weights. Should be within [0,1]

    Returns
    -------
    neg_log_post : float
                negative log posterior probability

    References
    ----------
    Chapter 8 of Murphy, K. 'Machine Learning a Probabilistic Perspective', MIT Press (2012)
    Chapter 4 of Bishop, C. 'Pattern Recognition and Machine Learning', Springer (2006)

    """

    # fill in weights if need be
    if weights is None:
        weights = np.ones(len(np.atleast_1d(y)), )
    if len(np.atleast_1d(weights)) != len(np.atleast_1d(y)):
        raise ValueError(' weight vector must be same length as response vector')

    # calculate negative log posterior

    eps = 1e-6  # this defined to ensure that we never take a log of zero

    mu = logistic_prob(X, w)

    if len(H.shape) == 2:
        neg_log_post = (- (np.dot(y.T, weights * np.log(mu + eps))
                           + np.dot((1. - y).T, weights * np.log(1. - mu + eps)))
                        + 0.5 * np.dot((w - wprior).T, np.dot(H, (w - wprior))))
    elif len(H.shape) == 1:
        neg_log_post = (- (np.dot(y.T, weights * np.log(mu + eps))
                           + np.dot((1. - y).T, weights * np.log(1. - mu + eps)))
                        + 0.5 * np.dot((w - wprior).T, H * (w - wprior)))
    else:
        raise ValueError('Incompatible Hessian')

    return float(neg_log_post)


def g_log_posterior(w, wprior, H, y, X, weights=None):
    """Returns gradient of the negative log posterior probability.

    Parameters
    ----------
    w : array-like, shape (p, )
        parameter vector at which the gradient is to be evaluated
    wprior : array-like, shape (p, )
        array of prior means on the parameters to be fit
    H : array-like, shape (p, p) or (p, )
        array of prior Hessian (inverse covariance of prior distribution of parameters)
    y : array-like, shape (N, )
        array of binary ({0,1} responses)
    X : array-like, shape (N, p)
        array of features
    weights : array-like, shape (N, )
        array of data point weights. Should be within [0,1]

    Returns
    -------
    grad_log_post : array-like, shape (p, )
                gradient of negative log posterior

    References
    ----------
    Chapter 8 of Murphy, K. 'Machine Learning a Probabilistic Perspective', MIT Press (2012)
    Chapter 4 of Bishop, C. 'Pattern Recognition and Machine Learning', Springer (2006)

    """

    # fill in weights if need be
    if weights is None:
        weights = np.ones(len(np.atleast_1d(y)), )
    if len(np.atleast_1d(weights)) != len(np.atleast_1d(y)):
        raise ValueError(' weight vector must be same length as response vector')

    # calculate gradient

    mu_ = logistic_prob(X, w)

    if len(H.shape) == 2:
        grad_log_post = np.dot(X.T, weights * (mu_ - y)) + np.dot(H, (w - wprior))
    elif len(H.shape) == 1:
        grad_log_post = np.dot(X.T, weights * (mu_ - y)) + H * (w - wprior)
    else:
        raise ValueError('Incompatible Hessian')

    return grad_log_post


def g_log_posterior_small(w, wprior, H, y, X, weights=None):
    """Returns normalized (to 1) gradient of the negative log posterior probability.
    This is used for BFGS and L-BFGS-B solvers which tend to not converge unless
    the gradient is normalized.

    Parameters
    ----------
    w : array-like, shape (p, )
        parameter vector at which the gradient is to be evaluated
    wprior : array-like, shape (p, )
        array of prior means on the parameters to be fit
    H : array-like, shape (p, p) or (p, )
        array of prior Hessian (inverse covariance of prior distribution of parameters)
    y : array-like, shape (N, )
        array of binary ({0,1} responses)
    X : array-like, shape (N, p)
        array of features
    weights : array-like, shape (N, )
        array of data point weights. Should be within [0,1]

    Returns
    -------
    grad_log_post : array-like, shape (p, )
                normalized (to 1) gradient of negative log posterior

    References
    ----------
    Chapter 8 of Murphy, K. 'Machine Learning a Probabilistic Perspective', MIT Press (2012)
    Chapter 4 of Bishop, C. 'Pattern Recognition and Machine Learning', Springer (2006)

    """

    # fill in weights if need be
    if weights is None:
        weights = np.ones(len(np.atleast_1d(y)), )
    if len(np.atleast_1d(weights)) != len(np.atleast_1d(y)):
        raise ValueError(' weight vector must be same length as response vector')

    # calculate gradient

    mu = logistic_prob(X, w)

    if len(H.shape) == 2:
        grad_log_post = np.dot(X.T, weights * (mu - y)) + np.dot(H, (w - wprior))
    elif len(H.shape) == 1:
        grad_log_post = np.dot(X.T, weights * (mu - y)) + H * (w - wprior)
    else:
        raise ValueError('Incompatible Hessian')

    # normalize gradient to length 1
    grad_log_post = grad_log_post / np.sqrt(np.sum(grad_log_post * grad_log_post))

    return grad_log_post


def H_log_posterior(w, wprior, H, y, X, weights=None):
    """Returns Hessian (either full or diagonal) of the negative log posterior probability.

    Parameters
    ----------
    w : array-like, shape (p, )
        parameter vector at which the Hessian is to be evaluated
    wprior : array-like, shape (p, )
        array of prior means on the parameters to be fit
    H : array-like, shape (p, p) or (p, )
        array of log prior Hessian (inverse covariance of prior distribution of parameters)
    y : array-like, shape (N, )
        array of binary ({0,1} responses)
    X : array-like, shape (N, p)
        array of features
    weights : array-like, shape (N, )
        array of data point weights. Should be within [0,1]

    Returns
    -------
    H_log_post : array-like, shape like `H`
                Hessian of negative log posterior

    References
    ----------
    Chapter 8 of Murphy, K. 'Machine Learning a Probabilistic Perspective', MIT Press (2012)
    Chapter 4 of Bishop, C. 'Pattern Recognition and Machine Learning', Springer (2006)

    """

    # fill in weights if need be
    if weights is None:
        weights = np.ones(len(np.atleast_1d(y)), )
    if len(np.atleast_1d(weights)) != len(np.atleast_1d(y)):
        raise ValueError(' weight vector must be same length as response vector')

    # calculate log posterior Hessian

    mu = logistic_prob(X, w)

    S = mu * (1. - mu) * weights

    if len(H.shape) == 2:
        H_log_post = np.dot(X.T, X * S[:, np.newaxis]) + H
    elif len(H.shape) == 1:
        H_log_post = np.diag(np.dot(X.T, X * S[:, np.newaxis])) + H
    else:
        raise ValueError('Incompatible Hessian')

    return H_log_post


def HP_log_posterior(w, q, wprior, H, y, X, weights=None):
    """Returns diagonal Hessian of the negative log posterior probability multiplied by an arbitrary vector.
    This is useful for the Newton-CG solver, particularly when we only want to store a diagonal Hessian.

    Parameters
    ----------
    w : array-like, shape (p, )
        parameter vector at which the Hessian is to be evaluated
    q : array-like, shape (p, )
        arbitrary vector to multiply Hessian by
    wprior : array-like, shape (p, )
        array of prior means on the parameters to be fit
    H : array-like, shape (p, )
        array of diagonal log prior Hessian (inverse covariance of prior distribution of parameters)
    y : array-like, shape (N, )
        array of binary ({0,1} responses)
    X : array-like, shape (N, p)
        array of features
    weights : array-like, shape (N, )
        array of data point weights. Should be within [0,1]

    Returns
    -------
    HP : array-like, shape (p, )
        Hessian of log posterior (diagonal approx) multiplied by arbitrary vector

    References
    ----------
    Chapter 8 of Murphy, K. 'Machine Learning a Probabilistic Perspective', MIT Press (2012)
    Chapter 4 of Bishop, C. 'Pattern Recognition and Machine Learning', Springer (2006)

    """

    # fill in weights if need be
    if weights is None:
        weights = np.ones(len(np.atleast_1d(y)), )
    if len(np.atleast_1d(weights)) != len(np.atleast_1d(y)):
        raise ValueError(' weight vector must be same length as response vector')

    HP = H_log_posterior(w, wprior, H, y, X, weights)
    HP = HP * q

    return HP


[docs]def fit_bayes_logistic(y, X, wprior, H, weights=None, solver='Newton-CG', bounds=None, maxiter=100):
    """ Bayesian Logistic Regression Solver.  Assumes Laplace (Gaussian) Approximation
    to the posterior of the fitted parameter vector. Uses scipy.optimize.minimize

    Parameters
    ----------
    y : array-like, shape (N, )
        array of binary {0,1} responses
    X : array-like, shape (N, p)
        array of features
    wprior : array-like, shape (p, )
        array of prior means on the parameters to be fit
    H : array-like, shape (p, p) or (p, )
        array of prior Hessian (inverse covariance of prior distribution of parameters)
    weights : array-like, shape (N, )
        array of data point weights. Should be within [0,1]
    solver : string
        scipy optimize solver used.  this should be either 'Newton-CG', 'BFGS' or 'L-BFGS-B'.
        The default is Newton-CG.
    bounds : iterable of length p
        a length p list (or tuple) of tuples each of length 2.
        This is only used if the solver is set to 'L-BFGS-B'. In that case, a tuple
        (lower_bound, upper_bound), both floats, is defined for each parameter.  See the
        scipy.optimize.minimize docs for further information.
    maxiter : int
        maximum number of iterations for scipy.optimize.minimize solver.

    Returns
    -------
    w_fit : array-like, shape (p, )
        posterior parameters (MAP estimate)
    H_fit : array-like, shape like `H`
        posterior Hessian  (Hessian of negative log posterior evaluated at MAP parameters)

    References
    ----------
    Chapter 8 of Murphy, K. 'Machine Learning a Probabilistic Perspective', MIT Press (2012)
    Chapter 4 of Bishop, C. 'Pattern Recognition and Machine Learning', Springer (2006)

    """

    # Check that dimensionality of inputs agrees

    # check X
    if len(X.shape) != 2:
        raise ValueError('X must be a N*p matrix')
    (nX, pX) = X.shape

    # check y
    if len(y.shape) > 1:
        raise ValueError('y must be a vector of shape (p, )')
    if len(np.atleast_1d(y)) != nX:
        raise ValueError('y and X do not have the same number of rows')

    # check wprior
    if len(wprior.shape) > 1:
        raise ValueError('prior should be a vector of shape (p, )')
    if len(np.atleast_1d(wprior)) != pX:
        raise ValueError('prior mean has incompatible length')

    # check H
    if len(H.shape) == 1:
        if np.atleast_1d(H).shape[0] != pX:
            raise ValueError('prior Hessian is diagonal but has incompatible length')
    elif len(H.shape) == 2:
        (h1,h2) = np.atleast_2d(H).shape
        if h1 != h2:
            raise ValueError('prior Hessian must either be a p*p square matrix or a vector or shape (p, ) ')
        if h1 != pX:
            raise ValueError('prior Hessian is square but has incompatible size')

    # fill in weights if need be
    if weights is None:
        weights = np.ones(len(np.atleast_1d(y)), )
    if len(np.atleast_1d(weights)) != len(np.atleast_1d(y)):
        raise ValueError(' weight vector must be same length as response vector')

    # Do the regression

    if solver == 'Newton-CG':

        if len(H.shape) == 2:

            ww = minimize(f_log_posterior, wprior, args=(wprior, H, y, X, weights), jac=g_log_posterior,
                          hess=H_log_posterior, method='Newton-CG', options={'maxiter': maxiter})
            w_fit = ww.x
            H_fit = H_log_posterior(w_fit, wprior, H, y, X, weights)

        elif len(H.shape) == 1:

            ww = minimize(f_log_posterior, wprior, args=(wprior, H, y, X, weights), jac=g_log_posterior,
                          hessp=HP_log_posterior, method='Newton-CG', options={'maxiter': maxiter})
            w_fit = ww.x
            H_fit = H_log_posterior(w_fit, wprior, H, y, X, weights)

        else:
            raise ValueError(' You must either use the full Hessian or its diagonal as a vector')

    elif solver == 'BFGS':
        ww = minimize(f_log_posterior, wprior, args=(wprior, H, y, X, weights), jac=g_log_posterior_small,
                      method='BFGS', options={'maxiter': maxiter})
        w_fit = ww.x
        H_fit = H_log_posterior(w_fit, wprior, H, y, X, weights)

    elif solver == 'L-BFGS-B':
        ww = minimize(f_log_posterior, wprior, args=(wprior, H, y, X, weights), jac=g_log_posterior_small,
                      method='L-BFGS-B', bounds=bounds, options={'maxiter': maxiter})
        w_fit = ww.x
        H_fit = H_log_posterior(w_fit, wprior, H, y, X, weights)

    else:
        raise ValueError('Unknown solver specified: "{0}"'.format(solver))

    return w_fit, H_fit


[docs]def get_pvalues(w, H):
    """ Calculates p-values on fitted parameters. This can be used for variable selection by,
    for example, discarding every parameter with a p-value less than 0.05 (or some other cutoff)

    Parameters
    ----------
    w : array-like, shape (p, )
        array of posterior means on the fitted parameters
    H : array-like, shape (p, p) or (p, )
        array of log posterior Hessian

    Returns
    -------
    pvals : array-like, shape (p, )
        array of p-values for each of the fitted parameters

    References
    ----------
    Chapter 2 of Pawitan, Y. 'In All Likelihood', Oxford University Press (2013)
    Also see: Gerhard, F. 'Extraction of network topology from multi-electrode recordings: is there
    a small world effect', Frontiers in Computational Neuroscience (2011) for a use case of
    p-value based variable selection.

    """

    # get inverse standard error of each parameter from the square root of the Hessian,
    # which is equal to the Fisher information
    if len(H.shape) == 2:
        inv_std_err = np.sqrt(np.diag(H))
    elif len(H.shape) == 1:
        inv_std_err = np.sqrt(H)
    else:
        raise ValueError("Incompatible Hessian provided")

    # calculate Wald statistic
    z_ = w * inv_std_err

    # get p-value by comparing Wald statistic to cdf of Normal distribution
    pvals = 2. * (1. - norm.cdf(np.abs(z_)))

    return pvals


[docs]def bayes_logistic_prob(X, w, H):
    """ Posterior predictive logistic regression probability.  Uses probit approximation
        to the logistic regression sigmoid. Also has overflow prevention via exponent truncation.

    Parameters
    ----------
    X : array-like, shape (N, p)
        array of covariates
    w : array-like, shape (p, )
        array of fitted MAP parameters
    H : array-like, shape (p, p) or (p, )
        array of log posterior Hessian (covariance matrix of fitted MAP parameters)

    Returns
    -------
    pr : array-like, shape (N, )
        moderated (by full distribution) logistic probability

    References
    ----------
    Chapter 8 of Murphy, K. 'Machine Learning a Probabilistic Perspective', MIT Press (2012)
    Chapter 4 of Bishop, C. 'Pattern Recognition and Machine Learning', Springer (2006)

    """

    # set a truncation exponent
    trunc = 8.  # exp(8)/(1+exp(8)) = 0.9997 which is close enough to 1 as to not matter in most cases.

    # unmoderated argument of exponent
    z_a = np.dot(X, w)

    # find  the moderation
    if len(H.shape) == 2:
        H_inv_ = np.linalg.inv(H)
        sig2_a = np.sum(X * np.dot(H_inv_, X.T).T, axis=1)
    elif len(H.shape) == 1:
        H_inv_ = 1. / H
        sig2_a = np.sum(X * (H_inv_ * X), axis=1)
    else:
        raise ValueError(' You must either use the full Hessian or its diagonal as a vector')

    # get the moderation factor. Implicit in here is approximating the logistic sigmoid with
    # a probit by setting the probit and sigmoid slopes to be equal at the origin. This is where
    # the factor of pi/8 comes from.
    kappa_sig2_a = 1. / np.sqrt(1. + 0.125 * np.pi * sig2_a)

    # calculate the moderated argument of the logit
    z = z_a * kappa_sig2_a

    # do a truncation to prevent exp overflow
    z = np.clip(z, -trunc, trunc)

    # get the moderated logistic probability
    pr = np.exp(z)
    pr = pr / (1. + pr)

    return pr