Source code for pyyeti.stats

# -*- coding: utf-8 -*-
"""
Statistics tools for tolerance bounds/intervals and order statistics.
"""

import warnings
import numpy as np
from scipy.stats import norm, nct, chi2, binom
from scipy.special import betainc
from scipy.optimize import brentq


[docs] def ksingle(p, c, n): """ Compute statistical k-factor for a single-sided tolerance limit. Parameters ---------- p : scalar or array_like; real Portion of population to bound; 0 < p < 1 c : scalar or array_like; real Probability level (confidence); 0 < c < 1 n : scalar or array_like; integer Number of observations in sample; n > 1 Returns ------- k : scalar or ndarray; real The statistical k-factor for a single-sided tolerance limit. Notes ----- The inputs `p`, `c`, and `n` must be broadcast-compatible. The k-factor allows the computation of a tolerance bound that has the probability `c` of bounding at least the proportion `p` of the population. The statistics are based on having a sample of a normally distributed population; `n` is the number of observations in the sample. The tolerance bound is computed by:: bound = m + k std (or bound = m - k std) where `m` is the sample mean and `std` is the sample standard deviation. .. note:: The math behind this routine is covered in the pyYeti :ref:`tutorial`: :doc:`/tutorials/flight_data_statistics`. There is also a link to the source Jupyter notebook at the top of the tutorial. See also -------- :func:`kdouble` Examples -------- Assume we have 21 samples. Determine the k-factor to have a 90% probability of bounding 99% of the population. In other words, we need the 'P99/90' single-sided k-factor for N = 21. (From table: N = 21, k = 3.028) >>> from pyyeti.stats import ksingle >>> ksingle(.99, .90, 21) # doctest: +ELLIPSIS 3.02823... Make a table of single-sided k-factors using 50% confidence. The probabilities will be: 95%, 97.725%, 99% and 99.865%. Number of samples will be: 2-10, 1000000. Have `n` define the rows and `p` define the columns: >>> import numpy as np >>> from pandas import DataFrame >>> n = [[i] for i in range(2, 11)] # create list of lists >>> n.append([1000000]) >>> p = [.95, .97725, .99, .99865] >>> table = ksingle(p, .50, n) >>> DataFrame(table, index=[i[0] for i in n], columns=p) 0.95000 0.97725 0.99000 0.99865 2 2.338727 2.880624 3.375968 4.391208 3 1.938416 2.369068 2.764477 3.579188 4 1.829514 2.231482 2.600817 3.362580 5 1.779283 2.168283 2.525770 3.263359 6 1.750462 2.132099 2.482840 3.206631 7 1.731792 2.108690 2.455081 3.169958 8 1.718720 2.092314 2.435669 3.144318 9 1.709060 2.080220 2.421337 3.125390 10 1.701632 2.070925 2.410323 3.110845 1000000 1.644854 2.000003 2.326349 2.999978 """ n = np.asarray(n) sn = np.sqrt(n) pnonc = sn * norm.ppf(p) return nct.ppf(c, n - 1, pnonc) / sn
def _getr(n, prob, tol): """ Get R needed by :func:`kdouble` routine. This routine calculates R defined by the integral:: 1/sqrt(n) + R / prob = 1/sqrt(2*pi) | exp(-t^2/2) dt / 1/sqrt(n) - R The inputs `n` and `prob` must be broadcast-compatible. `tol` is the error tolerance. Uses Newton's method (with derivative from Leibniz's rule) for root finding. See also :func:`kdouble`. """ sn = 1 / np.sqrt(n) spi = 1 / np.sqrt(2 * np.pi) # initial guess at r = r_inf * (1+1/(2*n) r = norm.ppf(prob + (1 - prob) / 2) * (1 + 1 / (2 * n)) rold = r + 10 loops = 0 MAXLOOPS = 100 while np.any(abs(r - rold) > tol) and loops < MAXLOOPS: rold = r lhi = sn + rold llo = sn - rold num = norm.cdf(lhi) - norm.cdf(llo) - prob den = spi * (np.exp(-(lhi**2) / 2) + np.exp(-(llo**2) / 2)) r = rold - num / den loops += 1 if loops == MAXLOOPS: # pragma: no cover warnings.warn( "maximum number of loops exceeded. Solution will likely be inaccurate.", RuntimeWarning, ) return r
[docs] def kdouble(p, c, n, tol=1e-12): """ Compute statistical k-factor for a double-sided tolerance interval. Parameters ---------- p : scalar or array_like; real Portion of population to bound; 0 < p < 1 c : scalar or array_like; real Probability level (confidence); 0 < c < 1 n : scalar or array_like; integer Number of observations in sample; n > 1 tol : scalar; optional Error tolerance to pass to the :func:`_getr` routine Returns ------- k : scalar or ndarray; real The statistical k-factor for a double-sided tolerance interval. Notes ----- The inputs `p`, `c`, and `n` must be broadcast-compatible. The k-factor allows the computation of a tolerance interval that has the probability `c` of containing at least the proportion `p` of the population. The statistics are based on having a sample of a normally distributed population; `n` is the number of observations in the sample. The bounds of the tolerance interval are calculated by:: lower = m - k std upper = m + k std where `m` is the sample mean and `std` is the sample standard deviation. See references [#stat1]_, [#stat2]_, and [#stat3]_ for the mathematical background on these tolerance limits. References ---------- .. [#stat1] Churchill Eisenhart; Millard Hastay; and W. Allen Wallis, 'Selected Techniques of Statistical Analysis for Scientific and Industrial Research and Production and Management Engineering,' by the Statistical Research Group, Columbia University, First Edition, New York and London, McGraw-Hill Book Company, Inc, 1947. .. [#stat2] Albert H. Bowker, 'Computation of Factors for Tolerance Limits on a Normal Distribution when the Sample Size is Large,' Annals of Mathematical Statistics, Vol. 17, 1946, pp 238-240. .. [#stat3] A. Wald and J. Wolfowitz, 'Tolerance Limits for a Normal Distribution,' Annals of Mathematical Statistics, Vol. 17, 1946, pp 208-215. See also -------- :func:`ksingle` Examples -------- Assume we have 21 samples. Determine the k-factor to have a 90% probability of the interval containing 99% of the population. In other words, we need the 'P99/90' double-sided k-factor for N = 21. (From table: N = 21, k = 3.340) >>> from pyyeti.stats import kdouble >>> kdouble(.99, .90, 21) # doctest: +ELLIPSIS 3.3404115111514... Make a table of double-sided k-factors using 50% confidence. The probabilities `p` will be: 95%, 97.725%, 99% and 99.865%. Number of samples `n` will be: 2-10, 1000000. Have `n` define the rows and `p` define the columns: >>> import numpy as np >>> from pandas import DataFrame >>> n = [[i] for i in range(2, 11)] # create list of lists >>> n.append([1000000]) >>> p = [.95, .9545, .99, .9973] >>> table = kdouble(p, .50, n) >>> DataFrame(table, index=[i[0] for i in n], columns=p) 0.9500 0.9545 0.9900 0.9973 2 3.502564 3.568593 4.502465 5.175585 3 2.697379 2.750060 3.498689 4.041051 4 2.456441 2.505211 3.200478 3.706120 5 2.336939 2.383750 3.052519 3.540237 6 2.264675 2.310279 2.962781 3.439587 7 2.215998 2.260775 2.902111 3.371448 8 2.180884 2.225054 2.858183 3.322030 9 2.154316 2.198021 2.824834 3.284445 10 2.133494 2.176829 2.798616 3.254847 1000000 1.959966 2.000004 2.575831 2.999979 """ p = np.asarray(p) c = np.asarray(c) n = np.asarray(n) chi = chi2.ppf(1 - c, n - 1) r = _getr(n, p, tol) return np.sqrt((n - 1) / chi) * r
[docs] def order_stats(which, *, p=None, c=None, n=None, r=None): """ Compute a parameter from order statistics. Parameters ---------- which : str Either 'p', 'c', 'n', or 'r' to specify which of the following arguments is to be computed from the others. p : scalar or array_like; real, (0, 1) Proportion of population c : scalar or array_like; real, (0, 1) Probability of bounding proportion `p` of the population (confidence level). n : scalar or array_like; integer Sample size r : scalar or ndarray; integer Largest-value order statistic. Note:: number of failures = r - 1 .. note:: Zero will be returned for `r` if there are not enough samples to meet the criteria. For example, it takes at least 230 samples to reach a ``p=0.99, c=0.90`` (P99/90) level. Therefore, ``order_stats("r", p=0.99, c=0.90, n=25)`` will return 0. Returns ------- One of `p`, `c`, `n`, or `r`; according to `which`. Notes ----- One of the inputs of `p`, `c`, `n`, and `r` can be left as None; the remaining inputs must be broadcast-compatible and must be named. The binomial distribution forms the mathematical foundation of this routine; see reference [#stat4]_. [#stat5]_ has a good definition of the order statistic. See also "Bernoulli Trials", reference [#stat6]_, which ties some of these ideas together in the analysis of success/failure probabilities. References ---------- .. [#stat4] Wikipedia, "Binomial distribution", https://en.wikipedia.org/wiki/Binomial_distribution .. [#stat5] Wikipedia, "Order statistic", https://en.wikipedia.org/wiki/Order_statistic .. [#stat6] Wikipedia, "Bernoulli trial", https://en.wikipedia.org/wiki/Bernoulli_trial Examples -------- Start with 700 samples of unknown distribution. After sorting, which of the samples represents at least a P99/90 level? From published tables, `r` should be 4, meaning the 4-th highest value of the 700 is an estimate of the P99/90 level (or higher). Another way to look at that result is that 3 failures (or fewer) out of 700 trials demonstrates at least a P99/90 level. >>> from pyyeti.stats import order_stats >>> order_stats('r', p=.99, c=.90, n=700) 4 Holding the probability constant at 90%, the portion of the population bounded has to be at least 99%. But, what did it turn out to be? >>> order_stats('p', c=.90, n=700, r=4) # doctest: +ELLIPSIS 0.99048109... Instead, hold the portion constant. What is the probability of covering 99% percent of the population by selecting the 4th highest of 700? >>> order_stats('c', p=.99, n=700, r=4) # doctest: +ELLIPSIS 0.91927834... How many samples did we really need to reach at least the P99/90 level by selecting the 4th highest? >>> order_stats('n', p=.99, c=.90, r=4) 667 Generate a 90% confidence table showing the number of trials needed for: `r` will go from 1 to 12 (defining the rows), and population coverage will be: [.95, .9772, .99, .9973, .99865] (defining the columns). Display using a pandas DataFrame: >>> from pandas import DataFrame >>> r = np.arange(1, 13).reshape(-1, 1) >>> p = [.95, .97725, .99, .9973, .99865] >>> table = order_stats('n', c=.90, r=r, p=p) >>> DataFrame(table, index=r.ravel(), columns=p) 0.95000 0.97725 0.99000 0.99730 0.99865 1 45 101 230 852 1705 2 77 170 388 1440 2880 3 105 233 531 1970 3941 4 132 292 667 2473 4947 5 158 350 798 2959 5920 6 184 406 926 3433 6868 7 209 461 1051 3899 7800 8 234 516 1175 4358 8717 9 258 569 1297 4811 9624 10 282 622 1418 5259 10521 11 306 675 1538 5704 11410 12 330 727 1658 6145 12293 """ if which == "c": r = np.asarray(r) p = np.asarray(p) return binom.sf(r - 1, n, 1 - p) elif which == "r": # c = np.asarray(c) # p = np.asarray(p) # return binom.ppf(1-c, n, 1-p) # gets 'value too deep error' b = np.broadcast(c, n, p) r = np.empty(b.shape) r.flat = [binom.ppf(1 - c, n, 1 - p) for (c, n, p) in b] if r.ndim == 0: return int(r[()]) return r.astype(int) elif which == "n": def _func(n, p, s, pr): return p - (1 - betainc(s + 1, n - s, pr)) def _run_brentq(c, r, p): # find [a, b] interval by brute force: a = r b = 2 * a loops = 0 while _func(b, 1 - c, r - 1, 1 - p) < 0 and loops < 30: a = b b = 2 * a loops += 1 return brentq(_func, a, b, args=(1 - c, r - 1, 1 - p)) b = np.broadcast(c, r, p) n = np.empty(b.shape) n.flat = [_run_brentq(c, r, p) for (c, r, p) in b] return np.ceil(n).astype(int) elif which == "p": def _func(pr, p, s, n): return p - binom.cdf(s, n, pr) b = np.broadcast(c, r, n) n = np.empty(b.shape) n.flat = [1 - brentq(_func, 0, 1, args=(1 - c, r - 1, n)) for (c, r, n) in b] if n.ndim == 0: return n[()] return n raise ValueError("invalid `which` setting")