Source code for permute.core

"""
Core functions.
"""


import numpy as np
import warnings
from scipy.optimize import brentq, fsolve
from scipy.stats import ttest_ind, ttest_1samp
from fractions import Fraction

from .utils import get_prng, potential_outcomes, permute


def corr(x, y, alternative='greater', reps=10**4, seed=None, plus1=True):
    r"""
    Simulate permutation p-value for Pearson correlation coefficient

    Parameters
    ----------
    x : array-like
    y : array-like
    alternative : {'greater', 'less', 'two-sided'}
        The alternative hypothesis to test
    reps : int
    seed : RandomState instance or {None, int, RandomState instance}
        If None, the pseudorandom number generator is the RandomState
        instance used by `np.random`;
        If int, seed is the seed used by the random number generator;
        If RandomState instance, seed is the pseudorandom number generator
    plus1 : bool
        flag for whether to add 1 to the numerator and denominator of the
        p-value based on the empirical permutation distribution.
        Default is True.

    Returns
    -------
    tuple
        Returns test statistic, p-value, simulated distribution
    """
    prng = get_prng(seed)
    tst = np.corrcoef(x, y)[0, 1]
    sims = [np.corrcoef(permute(x, prng), y)[0, 1] for i in range(reps)]
    left_pv = (np.sum(sims <= tst)+plus1) / (reps+plus1)
    right_pv = (np.sum(sims >= tst)+plus1) / (reps+plus1)
    if alternative == 'greater':
        pvalue = right_pv
    elif alternative == 'less':
        pvalue = left_pv
    elif alternative == 'two-sided':
        pvalue = np.min([1, 2 * np.min([left_pv, right_pv])])
    return tst, pvalue, sims
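

# Usage sketch (editorial illustration, not part of the permute source): a
# minimal example of calling corr() above on synthetic data. The helper name
# `_example_corr`, the data, and the seeds are assumptions made for the example.
def _example_corr():
    prng = np.random.RandomState(20190826)
    x = prng.normal(size=50)
    y = x + prng.normal(scale=0.5, size=50)   # positively associated with x
    # corr returns the observed Pearson r, the simulated p-value, and the
    # permutation distribution of the statistic.
    tst, pvalue, sims = corr(x, y, alternative='greater', reps=10**4, seed=42)
    return tst, pvalue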


def spearman_corr(x, y, alternative='greater', reps=10**4, seed=None,
                  plus1=True):
    r"""
    Simulate permutation p-value for Spearman correlation coefficient

    Parameters
    ----------
    x : array-like
    y : array-like
    alternative : {'greater', 'less', 'two-sided'}
        The alternative hypothesis to test
    reps : int
    seed : RandomState instance or {None, int, RandomState instance}
        If None, the pseudorandom number generator is the RandomState
        instance used by `np.random`;
        If int, seed is the seed used by the random number generator;
        If RandomState instance, seed is the pseudorandom number generator
    plus1 : bool
        flag for whether to add 1 to the numerator and denominator of the
        p-value based on the empirical permutation distribution.
        Default is True.

    Returns
    -------
    tuple
        Returns test statistic, p-value, simulated distribution
    """
    xnew = np.argsort(x)+1
    ynew = np.argsort(y)+1
    return corr(xnew, ynew, alternative=alternative, reps=reps, seed=seed)
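

# Usage sketch (editorial illustration, not part of the permute source): calls
# spearman_corr() above on data with a monotone but nonlinear association, the
# setting where a rank-based statistic is natural. The helper name, data, and
# seeds are assumptions made for the example.
def _example_spearman_corr():
    prng = np.random.RandomState(1)
    x = prng.uniform(0, 1, size=50)
    y = np.exp(3 * x) + prng.normal(scale=0.1, size=50)   # monotone in x
    tst, pvalue, sims = spearman_corr(x, y, alternative='greater', seed=42)
    return tst, pvalue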


def two_sample_core(potential_outcomes_all, nx, tst_stat, alternative='greater',
                    reps=10**5, keep_dist=False, seed=None, plus1=True):
    r"""
    Main workhorse function for two_sample and two_sample_shift

    Parameters
    ----------
    potential_outcomes_all : array-like
        2D array of potential outcomes under treatment (1st column)
        and control (2nd column). To be passed in from potential_outcomes
    nx : int
        Size of the treatment group x
    tst_stat : function
        The test statistic
    alternative : {'greater', 'less', 'two-sided'}
        The alternative hypothesis to test
    reps : int
        number of repetitions
    keep_dist : bool
        flag for whether to store and return the array of values
        of the test statistic. Default is False.
    seed : RandomState instance or {None, int, RandomState instance}
        If None, the pseudorandom number generator is the RandomState
        instance used by `np.random`;
        If int, seed is the seed used by the random number generator;
        If RandomState instance, seed is the pseudorandom number generator
    plus1 : bool
        flag for whether to add 1 to the numerator and denominator of the
        p-value based on the empirical permutation distribution.
        Default is True.

    Returns
    -------
    float
        the estimated p-value
    list
        The distribution of test statistics.
        These values are only returned if `keep_dist` == True
    """
    prng = get_prng(seed)

    rr = list(range(potential_outcomes_all.shape[0]))
    tst = tst_stat(potential_outcomes_all[:nx, 0],
                   potential_outcomes_all[nx:, 1])

    thePvalue = {
        'greater': lambda pUp, pDn: pUp+plus1/(reps+plus1),
        'less': lambda pUp, pDn: pDn+plus1/(reps+plus1),
        'two-sided': lambda pUp, pDn: 2 * np.min([0.5,
                                                  pUp+plus1/(reps+plus1),
                                                  pDn+plus1/(reps+plus1)])
    }

    if keep_dist:
        dist = np.empty(reps)
        for i in range(reps):
            prng.shuffle(rr)
            pp = np.take(potential_outcomes_all, rr, axis=0)
            dist[i] = tst_stat(pp[:nx, 0], pp[nx:, 1])
        pUp = np.sum(dist >= tst)/(reps+plus1)
        pDn = np.sum(dist <= tst)/(reps+plus1)
        return thePvalue[alternative](pUp, pDn), dist
    else:
        hitsUp = 0
        hitsDn = 0
        for i in range(reps):
            prng.shuffle(rr)
            pp = np.take(potential_outcomes_all, rr, axis=0)
            hitsUp += tst_stat(pp[:nx, 0], pp[nx:, 1]) >= tst
            hitsDn += tst_stat(pp[:nx, 0], pp[nx:, 1]) <= tst
        pUp = hitsUp/(reps+plus1)
        pDn = hitsDn/(reps+plus1)
        return thePvalue[alternative](pUp, pDn)
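

# Usage sketch (editorial illustration, not part of the permute source):
# two_sample_core() is normally reached through two_sample or two_sample_shift,
# but calling it directly shows the expected layout of the potential-outcomes
# array: column 0 holds each unit's outcome under treatment, column 1 under
# control, and the first nx rows belong to the treatment group. The helper name,
# data, and seeds below are assumptions made for the example.
def _example_two_sample_core():
    prng = np.random.RandomState(7)
    x = prng.normal(1, 1, size=30)    # "treatment" sample
    y = prng.normal(0, 1, size=40)    # "control" sample
    # Under the strong null of no treatment effect the two potential outcomes
    # coincide, so both columns hold the pooled data (this mirrors two_sample).
    pooled = np.concatenate([x, y])
    pot_out_all = np.column_stack([pooled, pooled])
    mean_diff = lambda u, v: np.mean(u) - np.mean(v)
    pvalue, dist = two_sample_core(pot_out_all, len(x), mean_diff,
                                   alternative='greater', reps=10**4,
                                   keep_dist=True, seed=42)
    return pvalue, dist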


def two_sample(x, y, reps=10**5, stat='mean', alternative="greater",
               keep_dist=False, seed=None, plus1=True):
    r"""
    One-sided or two-sided, two-sample permutation test for equality of
    two means, with p-value estimated by simulated random sampling with
    reps replications.

    Tests the hypothesis that x and y are a random partition of x,y
    against the alternative that x comes from a population with mean

    (a) greater than that of the population from which y comes,
        if alternative = 'greater'
    (b) less than that of the population from which y comes,
        if alternative = 'less'
    (c) different from that of the population from which y comes,
        if alternative = 'two-sided'

    If ``keep_dist``, also return the simulated distribution of values of the
    test statistic; otherwise, return only the test statistic and the
    estimated p-value.

    Parameters
    ----------
    x : array-like
        Sample 1
    y : array-like
        Sample 2
    reps : int
        number of repetitions
    stat : {'mean', 't'}
        The test statistic.

        (a) If stat == 'mean', the test statistic is (mean(x) - mean(y))
            (equivalently, sum(x), since those are monotonically related)
        (b) If stat == 't', the test statistic is the two-sample t-statistic--
            but the p-value is still estimated by the randomization,
            approximating the permutation distribution.
            The t-statistic is computed using scipy.stats.ttest_ind
        (c) If stat is a function (a callable object), the test statistic is
            that function. The function should take two arguments:
            given a permutation of the pooled data, the first argument is the
            "new" x and the second argument is the "new" y.
            For instance, if the test statistic is the Kolmogorov-Smirnov
            distance between the empirical distributions of the two samples,
            $\max_t |F_x(t) - F_y(t)|$, the test statistic could be written:

            f = lambda u, v: np.max(
                [abs(sum(u <= val)/len(u) - sum(v <= val)/len(v))
                 for val in np.concatenate([u, v])])
    alternative : {'greater', 'less', 'two-sided'}
        The alternative hypothesis to test
    keep_dist : bool
        flag for whether to store and return the array of values
        of the test statistic
    seed : RandomState instance or {None, int, RandomState instance}
        If None, the pseudorandom number generator is the RandomState
        instance used by `np.random`;
        If int, seed is the seed used by the random number generator;
        If RandomState instance, seed is the pseudorandom number generator
    plus1 : bool
        flag for whether to add 1 to the numerator and denominator of the
        p-value based on the empirical permutation distribution.
        Default is True.

    Returns
    -------
    float
        the estimated p-value
    float
        the test statistic
    list
        The distribution of test statistics.
        These values are only returned if `keep_dist` == True
    """
    # Set up potential outcomes; under the null, all units are exchangeable
    pot_out_all = np.column_stack(
        [np.concatenate([x, y]), np.concatenate([x, y])])

    # If stat is callable, use it as the test function. Otherwise, look in the
    # dictionary
    stats = {
        'mean': lambda u, v: np.mean(u) - np.mean(v),
        't': lambda u, v: ttest_ind(u, v, equal_var=True)[0]
    }
    if callable(stat):
        tst_fun = stat
    else:
        tst_fun = stats[stat]

    nx = len(x)
    observed_tst = tst_fun(pot_out_all[:nx, 0], pot_out_all[nx:, 1])

    res = two_sample_core(pot_out_all, nx, tst_fun, alternative=alternative,
                          reps=reps, keep_dist=keep_dist, seed=seed,
                          plus1=plus1)

    if keep_dist:
        return res[0], observed_tst, res[1]
    else:
        return res, observed_tst
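

# Usage sketch (editorial illustration, not part of the permute source): runs
# two_sample() above with a user-supplied statistic, here the Kolmogorov-Smirnov
# distance quoted in the docstring. The helper name, data, and seeds are
# assumptions made for the example.
def _example_two_sample():
    prng = np.random.RandomState(11)
    x = prng.normal(1, 1, size=30)
    y = prng.normal(0, 1, size=40)
    ks = lambda u, v: np.max(
        [abs(sum(u <= val)/len(u) - sum(v <= val)/len(v))
         for val in np.concatenate([u, v])])
    # With keep_dist=False (the default), two_sample returns (p-value, statistic).
    pvalue, tst = two_sample(x, y, reps=10**4, stat=ks,
                             alternative='two-sided', seed=42)
    return pvalue, tst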


def two_sample_shift(x, y, reps=10**5, stat='mean', alternative="greater",
                     keep_dist=False, seed=None, shift=None, plus1=True):
    r"""
    One-sided or two-sided, two-sample permutation test for equality of
    two means, with p-value estimated by simulated random sampling with
    reps replications.

    Tests the hypothesis that x and y are a random partition of x,y
    against the alternative that x comes from a population with mean

    (a) greater than that of the population from which y comes,
        if alternative = 'greater'
    (b) less than that of the population from which y comes,
        if alternative = 'less'
    (c) different from that of the population from which y comes,
        if alternative = 'two-sided'

    If ``keep_dist``, also return the simulated distribution of values of the
    test statistic; otherwise, return only the test statistic and the
    estimated p-value.

    Parameters
    ----------
    x : array-like
        Sample 1
    y : array-like
        Sample 2
    reps : int
        number of repetitions
    stat : {'mean', 't'}
        The test statistic.

        (a) If stat == 'mean', the test statistic is (mean(x) - mean(y))
            (equivalently, sum(x), since those are monotonically related)
        (b) If stat == 't', the test statistic is the two-sample t-statistic--
            but the p-value is still estimated by the randomization,
            approximating the permutation distribution.
            The t-statistic is computed using scipy.stats.ttest_ind
        (c) If stat is a function (a callable object), the test statistic is
            that function. The function should take two arguments:
            given a permutation of the pooled data, the first argument is the
            "new" x and the second argument is the "new" y.
            For instance, if the test statistic is the Kolmogorov-Smirnov
            distance between the empirical distributions of the two samples,
            $\max_t |F_x(t) - F_y(t)|$, the test statistic could be written:

            f = lambda u, v: np.max(
                [abs(sum(u <= val)/len(u) - sum(v <= val)/len(v))
                 for val in np.concatenate([u, v])])
    alternative : {'greater', 'less', 'two-sided'}
        The alternative hypothesis to test
    keep_dist : bool
        flag for whether to store and return the array of values
        of the test statistic
    seed : RandomState instance or {None, int, RandomState instance}
        If None, the pseudorandom number generator is the RandomState
        instance used by `np.random`;
        If int, seed is the seed used by the random number generator;
        If RandomState instance, seed is the pseudorandom number generator
    shift : float
        The relationship between x and y under the null hypothesis.

        (a) A constant scalar shift in the distribution of y. That is, x is
            equal in distribution to y + shift.
        (b) A tuple containing the function and its inverse $(f, f^{-1})$, so
            $x_i = f(y_i)$ and $y_i = f^{-1}(x_i)$
    plus1 : bool
        flag for whether to add 1 to the numerator and denominator of the
        p-value based on the empirical permutation distribution.
        Default is True.

    Returns
    -------
    float
        the estimated p-value
    float
        the test statistic
    list
        The distribution of test statistics.
        These values are only returned if `keep_dist` == True
    """
    # Set up potential outcomes according to shift
    if isinstance(shift, float) or isinstance(shift, int):
        # Potential outcomes for all units under treatment
        pot_outx = np.concatenate([x, y + shift])
        # Potential outcomes for all units under control
        pot_outy = np.concatenate([x - shift, y])
        pot_out_all = np.column_stack([pot_outx, pot_outy])
    elif isinstance(shift, tuple):
        assert (callable(shift[0])), "Supply f and finverse in shift tuple"
        assert (callable(shift[1])), "Supply f and finverse in shift tuple"
        pot_out_all = potential_outcomes(x, y, shift[0], shift[1])
    else:
        raise ValueError("Bad input for shift")

    # If stat is callable, use it as the test function. Otherwise, look in the
    # dictionary
    stats = {
        'mean': lambda u, v: np.mean(u) - np.mean(v),
        't': lambda u, v: ttest_ind(u, v, equal_var=True)[0]
    }
    if callable(stat):
        tst_fun = stat
    else:
        tst_fun = stats[stat]

    nx = len(x)
    observed_tst = tst_fun(pot_out_all[:nx, 0], pot_out_all[nx:, 1])

    res = two_sample_core(pot_out_all, nx, tst_fun, alternative=alternative,
                          reps=reps, keep_dist=keep_dist, seed=seed,
                          plus1=plus1)

    if keep_dist:
        return res[0], observed_tst, res[1]
    else:
        return res, observed_tst
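

# Usage sketch (editorial illustration, not part of the permute source): tests
# the null hypothesis that x is distributed as y + 0.5, i.e. a scalar shift of
# 0.5, using two_sample_shift() above. The helper name, data, and seeds are
# assumptions made for the example.
def _example_two_sample_shift():
    prng = np.random.RandomState(3)
    y = prng.normal(0, 1, size=40)
    x = prng.normal(0.5, 1, size=40)   # shifted by roughly 0.5 relative to y
    pvalue, tst = two_sample_shift(x, y, reps=10**4, stat='mean',
                                   alternative='two-sided', seed=42, shift=0.5)
    return pvalue, tst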


def two_sample_conf_int(x, y, cl=0.95, alternative="two-sided", seed=None,
                        reps=10**4, stat="mean", shift=None, plus1=True):
    r"""
    One-sided or two-sided confidence interval for the parameter determining
    the treatment effect. The default is the "shift model", where we are
    interested in the parameter d such that x is equal in distribution to
    y + d. In general, if we have some family of invertible functions
    parameterized by d, we'd like to find d such that x is equal in
    distribution to f(y, d).

    Parameters
    ----------
    x : array-like
        Sample 1
    y : array-like
        Sample 2
    cl : float in (0, 1)
        The desired confidence level. Default 0.95.
    alternative : {"two-sided", "lower", "upper"}
        Indicates the alternative hypothesis.
    seed : RandomState instance or {None, int, RandomState instance}
        If None, the pseudorandom number generator is the RandomState
        instance used by `np.random`;
        If int, seed is the seed used by the random number generator;
        If RandomState instance, seed is the pseudorandom number generator
    reps : int
        number of repetitions in two_sample
    stat : {'mean', 't'}
        The test statistic.

        (a) If stat == 'mean', the test statistic is (mean(x) - mean(y))
            (equivalently, sum(x), since those are monotonically related)
        (b) If stat == 't', the test statistic is the two-sample t-statistic--
            but the p-value is still estimated by the randomization,
            approximating the permutation distribution.
            The t-statistic is computed using scipy.stats.ttest_ind
        (c) If stat is a function (a callable object), the test statistic is
            that function. The function should take two arguments:
            given a permutation of the pooled data, the first argument is the
            "new" x and the second argument is the "new" y.
            For instance, if the test statistic is the Kolmogorov-Smirnov
            distance between the empirical distributions of the two samples,
            $\max_t |F_x(t) - F_y(t)|$, the test statistic could be written:

            f = lambda u, v: np.max(
                [abs(sum(u <= val)/len(u) - sum(v <= val)/len(v))
                 for val in np.concatenate([u, v])])
    shift : float
        The relationship between x and y under the null hypothesis.

        (a) If None, the relationship is assumed to be additive
            (e.g. x = y + d)
        (b) A tuple containing the function and its inverse $(f, f^{-1})$, so
            $x_i = f(y_i, d)$ and $y_i = f^{-1}(x_i, d)$
    plus1 : bool
        flag for whether to add 1 to the numerator and denominator of the
        p-value based on the empirical permutation distribution.
        Default is True.

    Returns
    -------
    tuple
        the estimated confidence limits

    Notes
    -----
    xtol : float
        Tolerance in brentq
    rtol : float
        Tolerance in brentq
    maxiter : int
        Maximum number of iterations in brentq
    """
    # print warning
    warnings.warn('This function is under construction and outputs '
                  'may be unreliable.')
    assert alternative in ("two-sided", "lower", "upper")

    if shift is None:
        shift_limit = max(abs(max(x) - min(y)), abs(max(y) - min(x)))
        # FIXME: unused observed
        # observed = np.mean(x) - np.mean(y)
    elif isinstance(shift, tuple):
        assert (callable(shift[0])), "Supply f and finverse in shift tuple"
        assert (callable(shift[1])), "Supply f and finverse in shift tuple"
        f = shift[0]
        finverse = shift[1]
        # Check that f is increasing in d; this is very ad hoc!
        assert (f(5, 1) < f(5, 2)), "f must be increasing in the parameter d"
        shift_limit = max(abs(fsolve(lambda d: f(max(y), d) - min(x), 0)),
                          abs(fsolve(lambda d: f(min(y), d) - max(x), 0)))
        # FIXME: unused observed
        # observed = fsolve(lambda d: np.mean(x) - np.mean(f(y, d)), 0)
    else:
        raise ValueError("Bad input for shift")
    ci_low = -shift_limit
    ci_upp = shift_limit

    if alternative == 'two-sided':
        cl = 1 - (1 - cl) / 2

    if alternative != "upper":
        if shift is None:
            g = lambda q: cl - two_sample_shift(x, y, alternative="less",
                                                seed=seed, shift=q, reps=reps,
                                                stat=stat, plus1=plus1)[0]
        else:
            g = lambda q: cl - two_sample_shift(x, y, alternative="less",
                                                seed=seed,
                                                shift=(lambda u: f(u, q),
                                                       lambda u: finverse(u, q)),
                                                reps=reps, stat=stat,
                                                plus1=plus1)[0]
        ci_low = brentq(g, -2 * shift_limit, 2 * shift_limit)
    if alternative != "lower":
        if shift is None:
            g = lambda q: cl - two_sample_shift(x, y, alternative="greater",
                                                seed=seed, shift=q, reps=reps,
                                                stat=stat, plus1=plus1)[0]
        else:
            g = lambda q: cl - two_sample_shift(x, y, alternative="greater",
                                                seed=seed,
                                                shift=(lambda u: f(u, q),
                                                       lambda u: finverse(u, q)),
                                                reps=reps, stat=stat,
                                                plus1=plus1)[0]
        ci_upp = brentq(g, -2 * shift_limit, 2 * shift_limit)

    return ci_low, ci_upp
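

# Usage sketch (editorial illustration, not part of the permute source): an
# approximate 95% confidence interval for the shift d with x distributed as
# y + d, obtained by inverting the permutation test via two_sample_conf_int()
# above. reps is kept small here because brentq calls two_sample_shift
# repeatedly, and the function itself warns that it is under construction.
# The helper name, data, and seeds are assumptions made for the example.
def _example_two_sample_conf_int():
    prng = np.random.RandomState(5)
    y = prng.normal(0, 1, size=40)
    x = prng.normal(0.5, 1, size=40)
    ci_low, ci_upp = two_sample_conf_int(x, y, cl=0.95,
                                         alternative='two-sided',
                                         seed=42, reps=10**3)
    return ci_low, ci_upp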


def one_sample(x, y=None, reps=10**5, stat='mean', alternative="greater",
               keep_dist=False, seed=None, plus1=True):
    r"""
    One-sided or two-sided, one-sample permutation test for the mean,
    with p-value estimated by simulated random sampling with
    reps replications.

    Alternatively, a permutation test for equality of means of two paired
    samples.

    Tests the hypothesis that x is distributed symmetrically about 0
    (or x and y have the same center) against the alternative that x comes
    from a population with mean

    (a) greater than 0 (greater than that of the population from which
        y comes), if alternative = 'greater'
    (b) less than 0 (less than that of the population from which y comes),
        if alternative = 'less'
    (c) different from 0 (different from that of the population from which
        y comes), if alternative = 'two-sided'

    If ``keep_dist``, also return the simulated distribution of values of the
    test statistic; otherwise, return only the test statistic and the
    estimated p-value.

    Parameters
    ----------
    x : array-like
        Sample 1
    y : array-like
        Sample 2. Must preserve the order of pairs with x.
        If None, x is taken to be the one sample.
    reps : int
        number of repetitions
    stat : {'mean', 't'}
        The test statistic. The statistic is computed based on either z = x or
        z = x - y, if y is specified.

        (a) If stat == 'mean', the test statistic is mean(z).
        (b) If stat == 't', the test statistic is the t-statistic--
            but the p-value is still estimated by the randomization,
            approximating the permutation distribution.
        (c) If stat is a function (a callable object), the test statistic is
            that function. The function should take a permutation of the data
            and compute the test function from it. For instance, if the test
            statistic is the maximum absolute value, $\max_i |z_i|$,
            the test statistic could be written:

            f = lambda u: np.max(abs(u))
    alternative : {'greater', 'less', 'two-sided'}
        The alternative hypothesis to test
    keep_dist : bool
        flag for whether to store and return the array of values
        of the test statistic
    seed : RandomState instance or {None, int, RandomState instance}
        If None, the pseudorandom number generator is the RandomState
        instance used by `np.random`;
        If int, seed is the seed used by the random number generator;
        If RandomState instance, seed is the pseudorandom number generator
    plus1 : bool
        flag for whether to add 1 to the numerator and denominator of the
        p-value based on the empirical permutation distribution.
        Default is True.

    Returns
    -------
    float
        the estimated p-value
    float
        the test statistic
    list
        The distribution of test statistics.
        These values are only returned if `keep_dist` == True
    """
    prng = get_prng(seed)

    if y is None:
        z = x
    elif len(x) != len(y):
        raise ValueError('x and y must be pairs')
    else:
        z = np.array(x) - np.array(y)

    thePvalue = {
        'greater': lambda pUp, pDn: pUp+plus1/(reps+plus1),
        'less': lambda pUp, pDn: pDn+plus1/(reps+plus1),
        'two-sided': lambda pUp, pDn: 2 * np.min([0.5,
                                                  pUp+plus1/(reps+plus1),
                                                  pDn+plus1/(reps+plus1)])
    }
    stats = {
        'mean': lambda u: np.mean(u),
        't': lambda u: ttest_1samp(u, 0)[0]
    }
    if callable(stat):
        tst_fun = stat
    else:
        tst_fun = stats[stat]

    tst = tst_fun(z)
    n = len(z)
    if keep_dist:
        dist = []
        for i in range(reps):
            dist.append(tst_fun(z * (1 - 2 * prng.randint(0, 2, n))))
        pUp = np.sum(dist >= tst)/(reps + plus1)
        pDn = np.sum(dist <= tst)/(reps + plus1)
        return thePvalue[alternative](pUp, pDn), tst, dist
    else:
        hitsUp = 0
        hitsDn = 0
        for i in range(reps):
            tv = tst_fun(z * (1 - 2 * prng.randint(0, 2, n)))
            hitsUp += (tv >= tst)
            hitsDn += (tv <= tst)
        pUp = hitsUp/(reps+plus1)
        pDn = hitsDn/(reps+plus1)
        return thePvalue[alternative](pUp, pDn), tst
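

# Usage sketch (editorial illustration, not part of the permute source): a
# paired test with one_sample() above, asking whether the mean of the paired
# differences x - y is greater than 0. The helper name, data, and seeds are
# assumptions made for the example.
def _example_one_sample():
    prng = np.random.RandomState(13)
    before = prng.normal(10, 2, size=25)
    after = before + prng.normal(0.5, 1, size=25)   # paired with `before`
    # With y supplied, the test is based on z = x - y; with keep_dist=False
    # the return value is (p-value, statistic).
    pvalue, tst = one_sample(after, before, reps=10**4, stat='mean',
                             alternative='greater', seed=42)
    return pvalue, tst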