Source code for permute.core

"""
Core functions.
"""


import numpy as np
import warnings
from scipy.optimize import brentq, fsolve
from scipy.stats import ttest_ind, ttest_1samp
from fractions import Fraction

from .utils import get_prng, potential_outcomes, permute


def corr(x, y, alternative='greater', reps=10**4, seed=None, plus1=True):
    r"""
    Simulate permutation p-value for Pearson correlation coefficient

    Parameters
    ----------
    x : array-like
    y : array-like
    alternative : {'greater', 'less', 'two-sided'}
        The alternative hypothesis to test
    reps : int
    seed : RandomState instance or {None, int, RandomState instance}
        If None, the pseudorandom number generator is the RandomState
        instance used by `np.random`;
        If int, seed is the seed used by the random number generator;
        If RandomState instance, seed is the pseudorandom number generator
    plus1 : bool
        flag for whether to add 1 to the numerator and denominator of the
        p-value based on the empirical permutation distribution.
        Default is True.

    Returns
    -------
    tuple
        Returns test statistic, p-value, simulated distribution
    """
    prng = get_prng(seed)
    tst = np.corrcoef(x, y)[0, 1]
    sims = [np.corrcoef(permute(x, prng), y)[0, 1] for i in range(reps)]
    left_pv = (np.sum(sims <= tst)+plus1) / (reps+plus1)
    right_pv = (np.sum(sims >= tst)+plus1) / (reps+plus1)
    if alternative == 'greater':
        pvalue = right_pv
    elif alternative == 'less':
        pvalue = left_pv
    elif alternative == 'two-sided':
        pvalue = np.min([1, 2 * np.min([left_pv, right_pv])])
    return tst, pvalue, sims
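

# Usage sketch (editorial illustration, not part of the permute source): a
# minimal example of calling corr() above on synthetic data. The helper name
# `_example_corr`, the data, and the seeds are assumptions made for the example.
def _example_corr():
    prng = np.random.RandomState(20190826)
    x = prng.normal(size=50)
    y = x + prng.normal(scale=0.5, size=50)   # positively associated with x
    # corr returns the observed Pearson r, the simulated p-value, and the
    # permutation distribution of the statistic.
    tst, pvalue, sims = corr(x, y, alternative='greater', reps=10**4, seed=42)
    return tst, pvalue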


def spearman_corr(x, y, alternative='greater', reps=10**4, seed=None,
                  plus1=True):
    r"""
    Simulate permutation p-value for Spearman correlation coefficient

    Parameters
    ----------
    x : array-like
    y : array-like
    alternative : {'greater', 'less', 'two-sided'}
        The alternative hypothesis to test
    reps : int
    seed : RandomState instance or {None, int, RandomState instance}
        If None, the pseudorandom number generator is the RandomState
        instance used by `np.random`;
        If int, seed is the seed used by the random number generator;
        If RandomState instance, seed is the pseudorandom number generator
    plus1 : bool
        flag for whether to add 1 to the numerator and denominator of the
        p-value based on the empirical permutation distribution.
        Default is True.

    Returns
    -------
    tuple
        Returns test statistic, p-value, simulated distribution
    """
    xnew = np.argsort(x)+1
    ynew = np.argsort(y)+1
    return corr(xnew, ynew, alternative=alternative, reps=reps, seed=seed)
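

# Usage sketch (editorial illustration, not part of the permute source): calls
# spearman_corr() above on data with a monotone but nonlinear association, the
# setting where a rank-based statistic is natural. The helper name, data, and
# seeds are assumptions made for the example.
def _example_spearman_corr():
    prng = np.random.RandomState(1)
    x = prng.uniform(0, 1, size=50)
    y = np.exp(3 * x) + prng.normal(scale=0.1, size=50)   # monotone in x
    tst, pvalue, sims = spearman_corr(x, y, alternative='greater', seed=42)
    return tst, pvalue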


def two_sample_core(potential_outcomes_all, nx, tst_stat, alternative='greater',
                    reps=10**5, keep_dist=False, seed=None, plus1=True):
    r"""
    Main workhorse function for two_sample and two_sample_shift

    Parameters
    ----------
    potential_outcomes_all : array-like
        2D array of potential outcomes under treatment (1st column)
        and control (2nd column). To be passed in from potential_outcomes
    nx : int
        Size of the treatment group x
    tst_stat : function
        The test statistic
    alternative : {'greater', 'less', 'two-sided'}
        The alternative hypothesis to test
    reps : int
        number of repetitions
    keep_dist : bool
        flag for whether to store and return the array of values
        of the test statistic. Default is False.
    seed : RandomState instance or {None, int, RandomState instance}
        If None, the pseudorandom number generator is the RandomState
        instance used by `np.random`;
        If int, seed is the seed used by the random number generator;
        If RandomState instance, seed is the pseudorandom number generator
    plus1 : bool
        flag for whether to add 1 to the numerator and denominator of the
        p-value based on the empirical permutation distribution.
        Default is True.

    Returns
    -------
    float
        the estimated p-value
    list
        The distribution of test statistics.
        These values are only returned if `keep_dist` == True
    """
    prng = get_prng(seed)

    rr = list(range(potential_outcomes_all.shape[0]))
    tst = tst_stat(potential_outcomes_all[:nx, 0],
                   potential_outcomes_all[nx:, 1])

    thePvalue = {
        'greater': lambda pUp, pDn: pUp+plus1/(reps+plus1),
        'less': lambda pUp, pDn: pDn+plus1/(reps+plus1),
        'two-sided': lambda pUp, pDn: 2 * np.min([0.5,
                                                  pUp+plus1/(reps+plus1),
                                                  pDn+plus1/(reps+plus1)])
    }

    if keep_dist:
        dist = np.empty(reps)
        for i in range(reps):
            prng.shuffle(rr)
            pp = np.take(potential_outcomes_all, rr, axis=0)
            dist[i] = tst_stat(pp[:nx, 0], pp[nx:, 1])
        pUp = np.sum(dist >= tst)/(reps+plus1)
        pDn = np.sum(dist <= tst)/(reps+plus1)
        return thePvalue[alternative](pUp, pDn), dist
    else:
        hitsUp = 0
        hitsDn = 0
        for i in range(reps):
            prng.shuffle(rr)
            pp = np.take(potential_outcomes_all, rr, axis=0)
            hitsUp += tst_stat(pp[:nx, 0], pp[nx:, 1]) >= tst
            hitsDn += tst_stat(pp[:nx, 0], pp[nx:, 1]) <= tst
        pUp = hitsUp/(reps+plus1)
        pDn = hitsDn/(reps+plus1)
        return thePvalue[alternative](pUp, pDn)
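

# Usage sketch (editorial illustration, not part of the permute source):
# two_sample_core() is normally reached through two_sample or two_sample_shift,
# but calling it directly shows the expected layout of the potential-outcomes
# array: column 0 holds each unit's outcome under treatment, column 1 under
# control, and the first nx rows belong to the treatment group. The helper name,
# data, and seeds below are assumptions made for the example.
def _example_two_sample_core():
    prng = np.random.RandomState(7)
    x = prng.normal(1, 1, size=30)    # "treatment" sample
    y = prng.normal(0, 1, size=40)    # "control" sample
    # Under the strong null of no treatment effect the two potential outcomes
    # coincide, so both columns hold the pooled data (this mirrors two_sample).
    pooled = np.concatenate([x, y])
    pot_out_all = np.column_stack([pooled, pooled])
    mean_diff = lambda u, v: np.mean(u) - np.mean(v)
    pvalue, dist = two_sample_core(pot_out_all, len(x), mean_diff,
                                   alternative='greater', reps=10**4,
                                   keep_dist=True, seed=42)
    return pvalue, dist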


def two_sample(x, y, reps=10**5, stat='mean', alternative="greater",
               keep_dist=False, seed=None, plus1=True):
    r"""
    One-sided or two-sided, two-sample permutation test for equality of
    two means, with p-value estimated by simulated random sampling with
    reps replications.

    Tests the hypothesis that x and y are a random partition of x,y
    against the alternative that x comes from a population with mean

    (a) greater than that of the population from which y comes,
        if alternative = 'greater'
    (b) less than that of the population from which y comes,
        if alternative = 'less'
    (c) different from that of the population from which y comes,
        if alternative = 'two-sided'

    If ``keep_dist``, also return the simulated distribution of values of the
    test statistic; otherwise, return only the test statistic and the
    estimated p-value.

    Parameters
    ----------
    x : array-like
        Sample 1
    y : array-like
        Sample 2
    reps : int
        number of repetitions
    stat : {'mean', 't'}
        The test statistic.

        (a) If stat == 'mean', the test statistic is (mean(x) - mean(y))
            (equivalently, sum(x), since those are monotonically related)
        (b) If stat == 't', the test statistic is the two-sample t-statistic--
            but the p-value is still estimated by the randomization,
            approximating the permutation distribution.
            The t-statistic is computed using scipy.stats.ttest_ind
        (c) If stat is a function (a callable object), the test statistic is
            that function. The function should take two arguments:
            given a permutation of the pooled data, the first argument is the
            "new" x and the second argument is the "new" y.
            For instance, if the test statistic is the Kolmogorov-Smirnov
            distance between the empirical distributions of the two samples,
            $\max_t |F_x(t) - F_y(t)|$, the test statistic could be written:

            f = lambda u, v: np.max(
                [abs(sum(u <= val)/len(u) - sum(v <= val)/len(v))
                 for val in np.concatenate([u, v])])
    alternative : {'greater', 'less', 'two-sided'}
        The alternative hypothesis to test
    keep_dist : bool
        flag for whether to store and return the array of values
        of the test statistic
    seed : RandomState instance or {None, int, RandomState instance}
        If None, the pseudorandom number generator is the RandomState
        instance used by `np.random`;
        If int, seed is the seed used by the random number generator;
        If RandomState instance, seed is the pseudorandom number generator
    plus1 : bool
        flag for whether to add 1 to the numerator and denominator of the
        p-value based on the empirical permutation distribution.
        Default is True.

    Returns
    -------
    float
        the estimated p-value
    float
        the test statistic
    list
        The distribution of test statistics.
        These values are only returned if `keep_dist` == True
    """
    # Set up potential outcomes; under the null, all units are exchangeable
    pot_out_all = np.column_stack(
        [np.concatenate([x, y]), np.concatenate([x, y])])

    # If stat is callable, use it as the test function. Otherwise, look in the
    # dictionary
    stats = {
        'mean': lambda u, v: np.mean(u) - np.mean(v),
        't': lambda u, v: ttest_ind(u, v, equal_var=True)[0]
    }
    if callable(stat):
        tst_fun = stat
    else:
        tst_fun = stats[stat]

    nx = len(x)
    observed_tst = tst_fun(pot_out_all[:nx, 0], pot_out_all[nx:, 1])

    res = two_sample_core(pot_out_all, nx, tst_fun, alternative=alternative,
                          reps=reps, keep_dist=keep_dist, seed=seed,
                          plus1=plus1)

    if keep_dist:
        return res[0], observed_tst, res[1]
    else:
        return res, observed_tst
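

# Usage sketch (editorial illustration, not part of the permute source): runs
# two_sample() above with a user-supplied statistic, here the Kolmogorov-Smirnov
# distance quoted in the docstring. The helper name, data, and seeds are
# assumptions made for the example.
def _example_two_sample():
    prng = np.random.RandomState(11)
    x = prng.normal(1, 1, size=30)
    y = prng.normal(0, 1, size=40)
    ks = lambda u, v: np.max(
        [abs(sum(u <= val)/len(u) - sum(v <= val)/len(v))
         for val in np.concatenate([u, v])])
    # With keep_dist=False (the default), two_sample returns (p-value, statistic).
    pvalue, tst = two_sample(x, y, reps=10**4, stat=ks,
                             alternative='two-sided', seed=42)
    return pvalue, tst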


def two_sample_shift(x, y, reps=10**5, stat='mean', alternative="greater",
                     keep_dist=False, seed=None, shift=None, plus1=True):
    r"""
    One-sided or two-sided, two-sample permutation test for equality of
    two means, with p-value estimated by simulated random sampling with
    reps replications.

    Tests the hypothesis that x and y are a random partition of x,y
    against the alternative that x comes from a population with mean

    (a) greater than that of the population from which y comes,
        if alternative = 'greater'
    (b) less than that of the population from which y comes,
        if alternative = 'less'
    (c) different from that of the population from which y comes,
        if alternative = 'two-sided'

    If ``keep_dist``, also return the simulated distribution of values of the
    test statistic; otherwise, return only the test statistic and the
    estimated p-value.

    Parameters
    ----------
    x : array-like
        Sample 1
    y : array-like
        Sample 2
    reps : int
        number of repetitions
    stat : {'mean', 't'}
        The test statistic.

        (a) If stat == 'mean', the test statistic is (mean(x) - mean(y))
            (equivalently, sum(x), since those are monotonically related)
        (b) If stat == 't', the test statistic is the two-sample t-statistic--
            but the p-value is still estimated by the randomization,
            approximating the permutation distribution.
            The t-statistic is computed using scipy.stats.ttest_ind
        (c) If stat is a function (a callable object), the test statistic is
            that function. The function should take two arguments:
            given a permutation of the pooled data, the first argument is the
            "new" x and the second argument is the "new" y.
            For instance, if the test statistic is the Kolmogorov-Smirnov
            distance between the empirical distributions of the two samples,
            $\max_t |F_x(t) - F_y(t)|$, the test statistic could be written:

            f = lambda u, v: np.max(
                [abs(sum(u <= val)/len(u) - sum(v <= val)/len(v))
                 for val in np.concatenate([u, v])])
    alternative : {'greater', 'less', 'two-sided'}
        The alternative hypothesis to test
    keep_dist : bool
        flag for whether to store and return the array of values
        of the test statistic
    seed : RandomState instance or {None, int, RandomState instance}
        If None, the pseudorandom number generator is the RandomState
        instance used by `np.random`;
        If int, seed is the seed used by the random number generator;
        If RandomState instance, seed is the pseudorandom number generator
    shift : float
        The relationship between x and y under the null hypothesis.

        (a) A constant scalar shift in the distribution of y. That is, x is
            equal in distribution to y + shift.
        (b) A tuple containing the function and its inverse $(f, f^{-1})$, so
            $x_i = f(y_i)$ and $y_i = f^{-1}(x_i)$
    plus1 : bool
        flag for whether to add 1 to the numerator and denominator of the
        p-value based on the empirical permutation distribution.
        Default is True.

    Returns
    -------
    float
        the estimated p-value
    float
        the test statistic
    list
        The distribution of test statistics.
        These values are only returned if `keep_dist` == True
    """
    # Set up potential outcomes according to shift
    if isinstance(shift, float) or isinstance(shift, int):
        # Potential outcomes for all units under treatment
        pot_outx = np.concatenate([x, y + shift])
        # Potential outcomes for all units under control
        pot_outy = np.concatenate([x - shift, y])
        pot_out_all = np.column_stack([pot_outx, pot_outy])
    elif isinstance(shift, tuple):
        assert (callable(shift[0])), "Supply f and finverse in shift tuple"
        assert (callable(shift[1])), "Supply f and finverse in shift tuple"
        pot_out_all = potential_outcomes(x, y, shift[0], shift[1])
    else:
        raise ValueError("Bad input for shift")

    # If stat is callable, use it as the test function. Otherwise, look in the
    # dictionary
    stats = {
        'mean': lambda u, v: np.mean(u) - np.mean(v),
        't': lambda u, v: ttest_ind(u, v, equal_var=True)[0]
    }
    if callable(stat):
        tst_fun = stat
    else:
        tst_fun = stats[stat]

    nx = len(x)
    observed_tst = tst_fun(pot_out_all[:nx, 0], pot_out_all[nx:, 1])

    res = two_sample_core(pot_out_all, nx, tst_fun, alternative=alternative,
                          reps=reps, keep_dist=keep_dist, seed=seed,
                          plus1=plus1)

    if keep_dist:
        return res[0], observed_tst, res[1]
    else:
        return res, observed_tst
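

# Usage sketch (editorial illustration, not part of the permute source): tests
# the null hypothesis that x is distributed as y + 0.5, i.e. a scalar shift of
# 0.5, using two_sample_shift() above. The helper name, data, and seeds are
# assumptions made for the example.
def _example_two_sample_shift():
    prng = np.random.RandomState(3)
    y = prng.normal(0, 1, size=40)
    x = prng.normal(0.5, 1, size=40)   # shifted by roughly 0.5 relative to y
    pvalue, tst = two_sample_shift(x, y, reps=10**4, stat='mean',
                                   alternative='two-sided', seed=42, shift=0.5)
    return pvalue, tst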


def two_sample_conf_int(x, y, cl=0.95, alternative="two-sided", seed=None,
                        reps=10**4, stat="mean", shift=None, plus1=True):
    r"""
    One-sided or two-sided confidence interval for the parameter determining
    the treatment effect. The default is the "shift model", where we are
    interested in the parameter d such that x is equal in distribution to
    y + d. In general, if we have some family of invertible functions
    parameterized by d, we'd like to find d such that x is equal in
    distribution to f(y, d).

    Parameters
    ----------
    x : array-like
        Sample 1
    y : array-like
        Sample 2
    cl : float in (0, 1)
        The desired confidence level. Default 0.95.
    alternative : {"two-sided", "lower", "upper"}
        Indicates the alternative hypothesis.
    seed : RandomState instance or {None, int, RandomState instance}
        If None, the pseudorandom number generator is the RandomState
        instance used by `np.random`;
        If int, seed is the seed used by the random number generator;
        If RandomState instance, seed is the pseudorandom number generator
    reps : int
        number of repetitions in two_sample
    stat : {'mean', 't'}
        The test statistic.

        (a) If stat == 'mean', the test statistic is (mean(x) - mean(y))
            (equivalently, sum(x), since those are monotonically related)
        (b) If stat == 't', the test statistic is the two-sample t-statistic--
            but the p-value is still estimated by the randomization,
            approximating the permutation distribution.
            The t-statistic is computed using scipy.stats.ttest_ind
        (c) If stat is a function (a callable object), the test statistic is
            that function. The function should take two arguments:
            given a permutation of the pooled data, the first argument is the
            "new" x and the second argument is the "new" y.
            For instance, if the test statistic is the Kolmogorov-Smirnov
            distance between the empirical distributions of the two samples,
            $\max_t |F_x(t) - F_y(t)|$, the test statistic could be written:

            f = lambda u, v: np.max(
                [abs(sum(u <= val)/len(u) - sum(v <= val)/len(v))
                 for val in np.concatenate([u, v])])
    shift : float
        The relationship between x and y under the null hypothesis.

        (a) If None, the relationship is assumed to be additive
            (e.g. x = y + d)
        (b) A tuple containing the function and its inverse $(f, f^{-1})$, so
            $x_i = f(y_i, d)$ and $y_i = f^{-1}(x_i, d)$
    plus1 : bool
        flag for whether to add 1 to the numerator and denominator of the
        p-value based on the empirical permutation distribution.
        Default is True.

    Returns
    -------
    tuple
        the estimated confidence limits

    Notes
    -----
    xtol : float
        Tolerance in brentq
    rtol : float
        Tolerance in brentq
    maxiter : int
        Maximum number of iterations in brentq
    """
    # print warning
    warnings.warn('This function is under construction and outputs '
                  'may be unreliable.')
    assert alternative in ("two-sided", "lower", "upper")

    if shift is None:
        shift_limit = max(abs(max(x) - min(y)), abs(max(y) - min(x)))
        # FIXME: unused observed
        # observed = np.mean(x) - np.mean(y)
    elif isinstance(shift, tuple):
        assert (callable(shift[0])), "Supply f and finverse in shift tuple"
        assert (callable(shift[1])), "Supply f and finverse in shift tuple"
        f = shift[0]
        finverse = shift[1]
        # Check that f is increasing in d; this is very ad hoc!
        assert (f(5, 1) < f(5, 2)), "f must be increasing in the parameter d"
        shift_limit = max(abs(fsolve(lambda d: f(max(y), d) - min(x), 0)),
                          abs(fsolve(lambda d: f(min(y), d) - max(x), 0)))
        # FIXME: unused observed
        # observed = fsolve(lambda d: np.mean(x) - np.mean(f(y, d)), 0)
    else:
        raise ValueError("Bad input for shift")
    ci_low = -shift_limit
    ci_upp = shift_limit

    if alternative == 'two-sided':
        cl = 1 - (1 - cl) / 2

    if alternative != "upper":
        if shift is None:
            g = lambda q: cl - two_sample_shift(x, y, alternative="less",
                                                seed=seed, shift=q, reps=reps,
                                                stat=stat, plus1=plus1)[0]
        else:
            g = lambda q: cl - two_sample_shift(x, y, alternative="less",
                                                seed=seed,
                                                shift=(lambda u: f(u, q),
                                                       lambda u: finverse(u, q)),
                                                reps=reps, stat=stat,
                                                plus1=plus1)[0]
        ci_low = brentq(g, -2 * shift_limit, 2 * shift_limit)
    if alternative != "lower":
        if shift is None:
            g = lambda q: cl - two_sample_shift(x, y, alternative="greater",
                                                seed=seed, shift=q, reps=reps,
                                                stat=stat, plus1=plus1)[0]
        else:
            g = lambda q: cl - two_sample_shift(x, y, alternative="greater",
                                                seed=seed,
                                                shift=(lambda u: f(u, q),
                                                       lambda u: finverse(u, q)),
                                                reps=reps, stat=stat,
                                                plus1=plus1)[0]
        ci_upp = brentq(g, -2 * shift_limit, 2 * shift_limit)

    return ci_low, ci_upp
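

# Usage sketch (editorial illustration, not part of the permute source): an
# approximate 95% confidence interval for the shift d with x distributed as
# y + d, obtained by inverting the permutation test via two_sample_conf_int()
# above. reps is kept small here because brentq calls two_sample_shift
# repeatedly, and the function itself warns that it is under construction.
# The helper name, data, and seeds are assumptions made for the example.
def _example_two_sample_conf_int():
    prng = np.random.RandomState(5)
    y = prng.normal(0, 1, size=40)
    x = prng.normal(0.5, 1, size=40)
    ci_low, ci_upp = two_sample_conf_int(x, y, cl=0.95,
                                         alternative='two-sided',
                                         seed=42, reps=10**3)
    return ci_low, ci_upp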


def one_sample(x, y=None, reps=10**5, stat='mean', alternative="greater",
               keep_dist=False, seed=None, plus1=True):
    r"""
    One-sided or two-sided, one-sample permutation test for the mean,
    with p-value estimated by simulated random sampling with
    reps replications.

    Alternatively, a permutation test for equality of means of two paired
    samples.

    Tests the hypothesis that x is distributed symmetrically about 0
    (or x and y have the same center) against the alternative that x comes
    from a population with mean

    (a) greater than 0 (greater than that of the population from which
        y comes), if alternative = 'greater'
    (b) less than 0 (less than that of the population from which y comes),
        if alternative = 'less'
    (c) different from 0 (different from that of the population from which
        y comes), if alternative = 'two-sided'

    If ``keep_dist``, also return the simulated distribution of values of the
    test statistic; otherwise, return only the test statistic and the
    estimated p-value.

    Parameters
    ----------
    x : array-like
        Sample 1
    y : array-like
        Sample 2. Must preserve the order of pairs with x.
        If None, x is taken to be the one sample.
    reps : int
        number of repetitions
    stat : {'mean', 't'}
        The test statistic. The statistic is computed based on either z = x or
        z = x - y, if y is specified.

        (a) If stat == 'mean', the test statistic is mean(z).
        (b) If stat == 't', the test statistic is the t-statistic--
            but the p-value is still estimated by the randomization,
            approximating the permutation distribution.
        (c) If stat is a function (a callable object), the test statistic is
            that function. The function should take a permutation of the data
            and compute the test function from it. For instance, if the test
            statistic is the maximum absolute value, $\max_i |z_i|$,
            the test statistic could be written:

            f = lambda u: np.max(abs(u))
    alternative : {'greater', 'less', 'two-sided'}
        The alternative hypothesis to test
    keep_dist : bool
        flag for whether to store and return the array of values
        of the test statistic
    seed : RandomState instance or {None, int, RandomState instance}
        If None, the pseudorandom number generator is the RandomState
        instance used by `np.random`;
        If int, seed is the seed used by the random number generator;
        If RandomState instance, seed is the pseudorandom number generator
    plus1 : bool
        flag for whether to add 1 to the numerator and denominator of the
        p-value based on the empirical permutation distribution.
        Default is True.

    Returns
    -------
    float
        the estimated p-value
    float
        the test statistic
    list
        The distribution of test statistics.
        These values are only returned if `keep_dist` == True
    """
    prng = get_prng(seed)

    if y is None:
        z = x
    elif len(x) != len(y):
        raise ValueError('x and y must be pairs')
    else:
        z = np.array(x) - np.array(y)

    thePvalue = {
        'greater': lambda pUp, pDn: pUp+plus1/(reps+plus1),
        'less': lambda pUp, pDn: pDn+plus1/(reps+plus1),
        'two-sided': lambda pUp, pDn: 2 * np.min([0.5,
                                                  pUp+plus1/(reps+plus1),
                                                  pDn+plus1/(reps+plus1)])
    }
    stats = {
        'mean': lambda u: np.mean(u),
        't': lambda u: ttest_1samp(u, 0)[0]
    }
    if callable(stat):
        tst_fun = stat
    else:
        tst_fun = stats[stat]

    tst = tst_fun(z)
    n = len(z)
    if keep_dist:
        dist = []
        for i in range(reps):
            dist.append(tst_fun(z * (1 - 2 * prng.randint(0, 2, n))))
        pUp = np.sum(dist >= tst)/(reps + plus1)
        pDn = np.sum(dist <= tst)/(reps + plus1)
        return thePvalue[alternative](pUp, pDn), tst, dist
    else:
        hitsUp = 0
        hitsDn = 0
        for i in range(reps):
            tv = tst_fun(z * (1 - 2 * prng.randint(0, 2, n)))
            hitsUp += (tv >= tst)
            hitsDn += (tv <= tst)
        pUp = hitsUp/(reps+plus1)
        pDn = hitsDn/(reps+plus1)
        return thePvalue[alternative](pUp, pDn), tst
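

# Usage sketch (editorial illustration, not part of the permute source): a
# paired test with one_sample() above, asking whether the mean of the paired
# differences x - y is greater than 0. The helper name, data, and seeds are
# assumptions made for the example.
def _example_one_sample():
    prng = np.random.RandomState(13)
    before = prng.normal(10, 2, size=25)
    after = before + prng.normal(0.5, 1, size=25)   # paired with `before`
    # With y supplied, the test is based on z = x - y; with keep_dist=False
    # the return value is (p-value, statistic).
    pvalue, tst = one_sample(after, before, reps=10**4, stat='mean',
                             alternative='greater', seed=42)
    return pvalue, tst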