Source code for bayes_logistic.bayes_logistic

# Copyright (c) 2015 MaxPoint Interactive, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
# following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following
#    disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following
#    disclaimer in the documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products
#    derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
# INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


from __future__ import absolute_import, print_function
import numpy as np
from scipy.optimize import minimize
from scipy.stats import norm

def logistic_prob(X, w):
    """ MAP (Bayes point) logistic regression probability with overflow prevention via exponent truncation

    Parameters
    ----------
    X : array-like, shape (N, p)
        Feature matrix
    w : array-like, shape (p, )
        Parameter vector

    Returns
    -------

    pr : array-like, shape (N, )
        vector of logistic regression probabilities

    References
    ----------
    Chapter 8 of Murphy, K. 'Machine Learning a Probabilistic Perspective', MIT Press (2012)
    Chapter 4 of Bishop, C. 'Pattern Recognition and Machine Learning', Springer (2006)

    """

    # set a truncation exponent.
    trunc = 8.  # exp(8)/(1+exp(8)) = 0.9997 which is close enough to 1 as to not matter in most cases.

    # calculate argument of logit
    z = np.dot(X, w)

    # truncate to avoid numerical over/underflow
    z = np.clip(z, -trunc, trunc)

    # calculate logitstic probability
    pr = np.exp(z)
    pr = pr / (1. + pr)

    return pr


def f_log_posterior(w, wprior, H, y, X, weights=None):
    """Returns negative log posterior probability.

    Parameters
    ----------
    w : array-like, shape (p, )
        vector of parameters at which the negative log posterior is to be evaluated
    wprior : array-like, shape (p, )
        vector of prior means on the parameters to be fit
    H : array-like, shape (p, p) or (p, )
        Array of prior Hessian (inverse covariance of prior distribution of parameters)
    y : array-like, shape (N, )
        vector of binary ({0,1} responses)
    X : array-like, shape (N, p)
        array of features
    weights : array-like, shape (N, )
        vector of data point weights. Should be within [0,1]

    Returns
    -------
    neg_log_post : float
                negative log posterior probability

    References
    ----------
    Chapter 8 of Murphy, K. 'Machine Learning a Probabilistic Perspective', MIT Press (2012)
    Chapter 4 of Bishop, C. 'Pattern Recognition and Machine Learning', Springer (2006)

    """

    # fill in weights if need be
    if weights is None:
        weights = np.ones(len(np.atleast_1d(y)), )
    if len(np.atleast_1d(weights)) != len(np.atleast_1d(y)):
        raise ValueError(' weight vector must be same length as response vector')

    # calculate negative log posterior

    eps = 1e-6  # this defined to ensure that we never take a log of zero

    mu = logistic_prob(X, w)

    if len(H.shape) == 2:
        neg_log_post = (- (np.dot(y.T, weights * np.log(mu + eps))
                           + np.dot((1. - y).T, weights * np.log(1. - mu + eps)))
                        + 0.5 * np.dot((w - wprior).T, np.dot(H, (w - wprior))))
    elif len(H.shape) == 1:
        neg_log_post = (- (np.dot(y.T, weights * np.log(mu + eps))
                           + np.dot((1. - y).T, weights * np.log(1. - mu + eps)))
                        + 0.5 * np.dot((w - wprior).T, H * (w - wprior)))
    else:
        raise ValueError('Incompatible Hessian')

    return float(neg_log_post)


def g_log_posterior(w, wprior, H, y, X, weights=None):
    """Returns gradient of the negative log posterior probability.

    Parameters
    ----------
    w : array-like, shape (p, )
        parameter vector at which the gradient is to be evaluated
    wprior : array-like, shape (p, )
        array of prior means on the parameters to be fit
    H : array-like, shape (p, p) or (p, )
        array of prior Hessian (inverse covariance of prior distribution of parameters)
    y : array-like, shape (N, )
        array of binary ({0,1} responses)
    X : array-like, shape (N, p)
        array of features
    weights : array-like, shape (N, )
        array of data point weights. Should be within [0,1]

    Returns
    -------
    grad_log_post : array-like, shape (p, )
                gradient of negative log posterior

    References
    ----------
    Chapter 8 of Murphy, K. 'Machine Learning a Probabilistic Perspective', MIT Press (2012)
    Chapter 4 of Bishop, C. 'Pattern Recognition and Machine Learning', Springer (2006)

    """

    # fill in weights if need be
    if weights is None:
        weights = np.ones(len(np.atleast_1d(y)), )
    if len(np.atleast_1d(weights)) != len(np.atleast_1d(y)):
        raise ValueError(' weight vector must be same length as response vector')

    # calculate gradient

    mu_ = logistic_prob(X, w)

    if len(H.shape) == 2:
        grad_log_post = np.dot(X.T, weights * (mu_ - y)) + np.dot(H, (w - wprior))
    elif len(H.shape) == 1:
        grad_log_post = np.dot(X.T, weights * (mu_ - y)) + H * (w - wprior)
    else:
        raise ValueError('Incompatible Hessian')

    return grad_log_post


def g_log_posterior_small(w, wprior, H, y, X, weights=None):
    """Returns normalized (to 1) gradient of the negative log posterior probability.
    This is used for BFGS and L-BFGS-B solvers which tend to not converge unless
    the gradient is normalized.

    Parameters
    ----------
    w : array-like, shape (p, )
        parameter vector at which the gradient is to be evaluated
    wprior : array-like, shape (p, )
        array of prior means on the parameters to be fit
    H : array-like, shape (p, p) or (p, )
        array of prior Hessian (inverse covariance of prior distribution of parameters)
    y : array-like, shape (N, )
        array of binary ({0,1} responses)
    X : array-like, shape (N, p)
        array of features
    weights : array-like, shape (N, )
        array of data point weights. Should be within [0,1]

    Returns
    -------
    grad_log_post : array-like, shape (p, )
                normalized (to 1) gradient of negative log posterior

    References
    ----------
    Chapter 8 of Murphy, K. 'Machine Learning a Probabilistic Perspective', MIT Press (2012)
    Chapter 4 of Bishop, C. 'Pattern Recognition and Machine Learning', Springer (2006)

    """

    # fill in weights if need be
    if weights is None:
        weights = np.ones(len(np.atleast_1d(y)), )
    if len(np.atleast_1d(weights)) != len(np.atleast_1d(y)):
        raise ValueError(' weight vector must be same length as response vector')

    # calculate gradient

    mu = logistic_prob(X, w)

    if len(H.shape) == 2:
        grad_log_post = np.dot(X.T, weights * (mu - y)) + np.dot(H, (w - wprior))
    elif len(H.shape) == 1:
        grad_log_post = np.dot(X.T, weights * (mu - y)) + H * (w - wprior)
    else:
        raise ValueError('Incompatible Hessian')

    # normalize gradient to length 1
    grad_log_post = grad_log_post / np.sqrt(np.sum(grad_log_post * grad_log_post))

    return grad_log_post


def H_log_posterior(w, wprior, H, y, X, weights=None):
    """Returns Hessian (either full or diagonal) of the negative log posterior probability.

    Parameters
    ----------
    w : array-like, shape (p, )
        parameter vector at which the Hessian is to be evaluated
    wprior : array-like, shape (p, )
        array of prior means on the parameters to be fit
    H : array-like, shape (p, p) or (p, )
        array of log prior Hessian (inverse covariance of prior distribution of parameters)
    y : array-like, shape (N, )
        array of binary ({0,1} responses)
    X : array-like, shape (N, p)
        array of features
    weights : array-like, shape (N, )
        array of data point weights. Should be within [0,1]

    Returns
    -------
    H_log_post : array-like, shape like `H`
                Hessian of negative log posterior

    References
    ----------
    Chapter 8 of Murphy, K. 'Machine Learning a Probabilistic Perspective', MIT Press (2012)
    Chapter 4 of Bishop, C. 'Pattern Recognition and Machine Learning', Springer (2006)

    """

    # fill in weights if need be
    if weights is None:
        weights = np.ones(len(np.atleast_1d(y)), )
    if len(np.atleast_1d(weights)) != len(np.atleast_1d(y)):
        raise ValueError(' weight vector must be same length as response vector')

    # calculate log posterior Hessian

    mu = logistic_prob(X, w)

    S = mu * (1. - mu) * weights

    if len(H.shape) == 2:
        H_log_post = np.dot(X.T, X * S[:, np.newaxis]) + H
    elif len(H.shape) == 1:
        H_log_post = np.diag(np.dot(X.T, X * S[:, np.newaxis])) + H
    else:
        raise ValueError('Incompatible Hessian')

    return H_log_post


def HP_log_posterior(w, q, wprior, H, y, X, weights=None):
    """Returns diagonal Hessian of the negative log posterior probability multiplied by an arbitrary vector.
    This is useful for the Newton-CG solver, particularly when we only want to store a diagonal Hessian.

    Parameters
    ----------
    w : array-like, shape (p, )
        parameter vector at which the Hessian is to be evaluated
    q : array-like, shape (p, )
        arbitrary vector to multiply Hessian by
    wprior : array-like, shape (p, )
        array of prior means on the parameters to be fit
    H : array-like, shape (p, )
        array of diagonal log prior Hessian (inverse covariance of prior distribution of parameters)
    y : array-like, shape (N, )
        array of binary ({0,1} responses)
    X : array-like, shape (N, p)
        array of features
    weights : array-like, shape (N, )
        array of data point weights. Should be within [0,1]

    Returns
    -------
    HP : array-like, shape (p, )
        Hessian of log posterior (diagonal approx) multiplied by arbitrary vector

    References
    ----------
    Chapter 8 of Murphy, K. 'Machine Learning a Probabilistic Perspective', MIT Press (2012)
    Chapter 4 of Bishop, C. 'Pattern Recognition and Machine Learning', Springer (2006)

    """

    # fill in weights if need be
    if weights is None:
        weights = np.ones(len(np.atleast_1d(y)), )
    if len(np.atleast_1d(weights)) != len(np.atleast_1d(y)):
        raise ValueError(' weight vector must be same length as response vector')

    HP = H_log_posterior(w, wprior, H, y, X, weights)
    HP = HP * q

    return HP


[docs]def fit_bayes_logistic(y, X, wprior, H, weights=None, solver='Newton-CG', bounds=None, maxiter=100): """ Bayesian Logistic Regression Solver. Assumes Laplace (Gaussian) Approximation to the posterior of the fitted parameter vector. Uses scipy.optimize.minimize Parameters ---------- y : array-like, shape (N, ) array of binary {0,1} responses X : array-like, shape (N, p) array of features wprior : array-like, shape (p, ) array of prior means on the parameters to be fit H : array-like, shape (p, p) or (p, ) array of prior Hessian (inverse covariance of prior distribution of parameters) weights : array-like, shape (N, ) array of data point weights. Should be within [0,1] solver : string scipy optimize solver used. this should be either 'Newton-CG', 'BFGS' or 'L-BFGS-B'. The default is Newton-CG. bounds : iterable of length p a length p list (or tuple) of tuples each of length 2. This is only used if the solver is set to 'L-BFGS-B'. In that case, a tuple (lower_bound, upper_bound), both floats, is defined for each parameter. See the scipy.optimize.minimize docs for further information. maxiter : int maximum number of iterations for scipy.optimize.minimize solver. Returns ------- w_fit : array-like, shape (p, ) posterior parameters (MAP estimate) H_fit : array-like, shape like `H` posterior Hessian (Hessian of negative log posterior evaluated at MAP parameters) References ---------- Chapter 8 of Murphy, K. 'Machine Learning a Probabilistic Perspective', MIT Press (2012) Chapter 4 of Bishop, C. 'Pattern Recognition and Machine Learning', Springer (2006) """ # Check that dimensionality of inputs agrees # check X if len(X.shape) != 2: raise ValueError('X must be a N*p matrix') (nX, pX) = X.shape # check y if len(y.shape) > 1: raise ValueError('y must be a vector of shape (p, )') if len(np.atleast_1d(y)) != nX: raise ValueError('y and X do not have the same number of rows') # check wprior if len(wprior.shape) > 1: raise ValueError('prior should be a vector of shape (p, )') if len(np.atleast_1d(wprior)) != pX: raise ValueError('prior mean has incompatible length') # check H if len(H.shape) == 1: if np.atleast_1d(H).shape[0] != pX: raise ValueError('prior Hessian is diagonal but has incompatible length') elif len(H.shape) == 2: (h1,h2) = np.atleast_2d(H).shape if h1 != h2: raise ValueError('prior Hessian must either be a p*p square matrix or a vector or shape (p, ) ') if h1 != pX: raise ValueError('prior Hessian is square but has incompatible size') # fill in weights if need be if weights is None: weights = np.ones(len(np.atleast_1d(y)), ) if len(np.atleast_1d(weights)) != len(np.atleast_1d(y)): raise ValueError(' weight vector must be same length as response vector') # Do the regression if solver == 'Newton-CG': if len(H.shape) == 2: ww = minimize(f_log_posterior, wprior, args=(wprior, H, y, X, weights), jac=g_log_posterior, hess=H_log_posterior, method='Newton-CG', options={'maxiter': maxiter}) w_fit = ww.x H_fit = H_log_posterior(w_fit, wprior, H, y, X, weights) elif len(H.shape) == 1: ww = minimize(f_log_posterior, wprior, args=(wprior, H, y, X, weights), jac=g_log_posterior, hessp=HP_log_posterior, method='Newton-CG', options={'maxiter': maxiter}) w_fit = ww.x H_fit = H_log_posterior(w_fit, wprior, H, y, X, weights) else: raise ValueError(' You must either use the full Hessian or its diagonal as a vector') elif solver == 'BFGS': ww = minimize(f_log_posterior, wprior, args=(wprior, H, y, X, weights), jac=g_log_posterior_small, method='BFGS', options={'maxiter': maxiter}) w_fit = ww.x H_fit = H_log_posterior(w_fit, wprior, H, y, X, weights) elif solver == 'L-BFGS-B': ww = minimize(f_log_posterior, wprior, args=(wprior, H, y, X, weights), jac=g_log_posterior_small, method='L-BFGS-B', bounds=bounds, options={'maxiter': maxiter}) w_fit = ww.x H_fit = H_log_posterior(w_fit, wprior, H, y, X, weights) else: raise ValueError('Unknown solver specified: "{0}"'.format(solver)) return w_fit, H_fit
[docs]def get_pvalues(w, H): """ Calculates p-values on fitted parameters. This can be used for variable selection by, for example, discarding every parameter with a p-value less than 0.05 (or some other cutoff) Parameters ---------- w : array-like, shape (p, ) array of posterior means on the fitted parameters H : array-like, shape (p, p) or (p, ) array of log posterior Hessian Returns ------- pvals : array-like, shape (p, ) array of p-values for each of the fitted parameters References ---------- Chapter 2 of Pawitan, Y. 'In All Likelihood', Oxford University Press (2013) Also see: Gerhard, F. 'Extraction of network topology from multi-electrode recordings: is there a small world effect', Frontiers in Computational Neuroscience (2011) for a use case of p-value based variable selection. """ # get inverse standard error of each parameter from the square root of the Hessian, # which is equal to the Fisher information if len(H.shape) == 2: inv_std_err = np.sqrt(np.diag(H)) elif len(H.shape) == 1: inv_std_err = np.sqrt(H) else: raise ValueError("Incompatible Hessian provided") # calculate Wald statistic z_ = w * inv_std_err # get p-value by comparing Wald statistic to cdf of Normal distribution pvals = 2. * (1. - norm.cdf(np.abs(z_))) return pvals
[docs]def bayes_logistic_prob(X, w, H): """ Posterior predictive logistic regression probability. Uses probit approximation to the logistic regression sigmoid. Also has overflow prevention via exponent truncation. Parameters ---------- X : array-like, shape (N, p) array of covariates w : array-like, shape (p, ) array of fitted MAP parameters H : array-like, shape (p, p) or (p, ) array of log posterior Hessian (covariance matrix of fitted MAP parameters) Returns ------- pr : array-like, shape (N, ) moderated (by full distribution) logistic probability References ---------- Chapter 8 of Murphy, K. 'Machine Learning a Probabilistic Perspective', MIT Press (2012) Chapter 4 of Bishop, C. 'Pattern Recognition and Machine Learning', Springer (2006) """ # set a truncation exponent trunc = 8. # exp(8)/(1+exp(8)) = 0.9997 which is close enough to 1 as to not matter in most cases. # unmoderated argument of exponent z_a = np.dot(X, w) # find the moderation if len(H.shape) == 2: H_inv_ = np.linalg.inv(H) sig2_a = np.sum(X * np.dot(H_inv_, X.T).T, axis=1) elif len(H.shape) == 1: H_inv_ = 1. / H sig2_a = np.sum(X * (H_inv_ * X), axis=1) else: raise ValueError(' You must either use the full Hessian or its diagonal as a vector') # get the moderation factor. Implicit in here is approximating the logistic sigmoid with # a probit by setting the probit and sigmoid slopes to be equal at the origin. This is where # the factor of pi/8 comes from. kappa_sig2_a = 1. / np.sqrt(1. + 0.125 * np.pi * sig2_a) # calculate the moderated argument of the logit z = z_a * kappa_sig2_a # do a truncation to prevent exp overflow z = np.clip(z, -trunc, trunc) # get the moderated logistic probability pr = np.exp(z) pr = pr / (1. + pr) return pr