# -*- coding: utf-8 -*-
"""
Statistics tools for tolerance bounds/intervals and order statistics.
"""
import warnings
import numpy as np
from scipy.stats import norm, nct, chi2, binom
from scipy.special import betainc
from scipy.optimize import brentq
[docs]
def ksingle(p, c, n):
"""
Compute statistical k-factor for a single-sided tolerance limit.
Parameters
----------
p : scalar or array_like; real
Portion of population to bound; 0 < p < 1
c : scalar or array_like; real
Probability level (confidence); 0 < c < 1
n : scalar or array_like; integer
Number of observations in sample; n > 1
Returns
-------
k : scalar or ndarray; real
The statistical k-factor for a single-sided tolerance
limit.
Notes
-----
The inputs `p`, `c`, and `n` must be broadcast-compatible.
The k-factor allows the computation of a tolerance bound that has
the probability `c` of bounding at least the proportion `p` of the
population. The statistics are based on having a sample of a
normally distributed population; `n` is the number of observations
in the sample.
The tolerance bound is computed by::
bound = m + k std (or bound = m - k std)
where `m` is the sample mean and `std` is the sample standard
deviation.
.. note::
The math behind this routine is covered in the pyYeti
:ref:`tutorial`: :doc:`/tutorials/flight_data_statistics`.
There is also a link to the source Jupyter notebook at the top
of the tutorial.
See also
--------
:func:`kdouble`
Examples
--------
Assume we have 21 samples. Determine the k-factor to have a 90%
probability of bounding 99% of the population. In other words, we
need the 'P99/90' single-sided k-factor for N = 21. (From table:
N = 21, k = 3.028)
>>> from pyyeti.stats import ksingle
>>> ksingle(.99, .90, 21) # doctest: +ELLIPSIS
3.02823...
Make a table of single-sided k-factors using 50% confidence. The
probabilities will be: 95%, 97.725%, 99% and 99.865%. Number of
samples will be: 2-10, 1000000. Have `n` define the rows and `p`
define the columns:
>>> import numpy as np
>>> from pandas import DataFrame
>>> n = [[i] for i in range(2, 11)] # create list of lists
>>> n.append([1000000])
>>> p = [.95, .97725, .99, .99865]
>>> table = ksingle(p, .50, n)
>>> DataFrame(table, index=[i[0] for i in n], columns=p)
0.95000 0.97725 0.99000 0.99865
2 2.338727 2.880624 3.375968 4.391208
3 1.938416 2.369068 2.764477 3.579188
4 1.829514 2.231482 2.600817 3.362580
5 1.779283 2.168283 2.525770 3.263359
6 1.750462 2.132099 2.482840 3.206631
7 1.731792 2.108690 2.455081 3.169958
8 1.718720 2.092314 2.435669 3.144318
9 1.709060 2.080220 2.421337 3.125390
10 1.701632 2.070925 2.410323 3.110845
1000000 1.644854 2.000003 2.326349 2.999978
"""
n = np.asarray(n)
sn = np.sqrt(n)
pnonc = sn * norm.ppf(p)
return nct.ppf(c, n - 1, pnonc) / sn
def _getr(n, prob, tol):
"""
Get R needed by :func:`kdouble` routine.
This routine calculates R defined by the integral::
1/sqrt(n) + R
/
prob = 1/sqrt(2*pi) | exp(-t^2/2) dt
/
1/sqrt(n) - R
The inputs `n` and `prob` must be broadcast-compatible. `tol` is
the error tolerance.
Uses Newton's method (with derivative from Leibniz's rule) for
root finding.
See also :func:`kdouble`.
"""
sn = 1 / np.sqrt(n)
spi = 1 / np.sqrt(2 * np.pi)
# initial guess at r = r_inf * (1+1/(2*n)
r = norm.ppf(prob + (1 - prob) / 2) * (1 + 1 / (2 * n))
rold = r + 10
loops = 0
MAXLOOPS = 100
while np.any(abs(r - rold) > tol) and loops < MAXLOOPS:
rold = r
lhi = sn + rold
llo = sn - rold
num = norm.cdf(lhi) - norm.cdf(llo) - prob
den = spi * (np.exp(-(lhi**2) / 2) + np.exp(-(llo**2) / 2))
r = rold - num / den
loops += 1
if loops == MAXLOOPS: # pragma: no cover
warnings.warn(
"maximum number of loops exceeded. Solution will likely be inaccurate.",
RuntimeWarning,
)
return r
[docs]
def kdouble(p, c, n, tol=1e-12):
"""
Compute statistical k-factor for a double-sided tolerance interval.
Parameters
----------
p : scalar or array_like; real
Portion of population to bound; 0 < p < 1
c : scalar or array_like; real
Probability level (confidence); 0 < c < 1
n : scalar or array_like; integer
Number of observations in sample; n > 1
tol : scalar; optional
Error tolerance to pass to the :func:`_getr` routine
Returns
-------
k : scalar or ndarray; real
The statistical k-factor for a double-sided tolerance
interval.
Notes
-----
The inputs `p`, `c`, and `n` must be broadcast-compatible.
The k-factor allows the computation of a tolerance interval that
has the probability `c` of containing at least the proportion `p`
of the population. The statistics are based on having a sample of
a normally distributed population; `n` is the number of
observations in the sample.
The bounds of the tolerance interval are calculated by::
lower = m - k std
upper = m + k std
where `m` is the sample mean and `std` is the sample standard
deviation.
See references [#stat1]_, [#stat2]_, and [#stat3]_ for the
mathematical background on these tolerance limits.
References
----------
.. [#stat1] Churchill Eisenhart; Millard Hastay; and W. Allen
Wallis, 'Selected Techniques of Statistical Analysis for
Scientific and Industrial Research and Production and
Management Engineering,' by the Statistical Research Group,
Columbia University, First Edition, New York and London,
McGraw-Hill Book Company, Inc, 1947.
.. [#stat2] Albert H. Bowker, 'Computation of Factors for
Tolerance Limits on a Normal Distribution when the Sample
Size is Large,' Annals of Mathematical Statistics, Vol. 17,
1946, pp 238-240.
.. [#stat3] A. Wald and J. Wolfowitz, 'Tolerance Limits for a
Normal Distribution,' Annals of Mathematical Statistics,
Vol. 17, 1946, pp 208-215.
See also
--------
:func:`ksingle`
Examples
--------
Assume we have 21 samples. Determine the k-factor to have a 90%
probability of the interval containing 99% of the population. In
other words, we need the 'P99/90' double-sided k-factor for N =
21. (From table: N = 21, k = 3.340)
>>> from pyyeti.stats import kdouble
>>> kdouble(.99, .90, 21) # doctest: +ELLIPSIS
3.3404115111514...
Make a table of double-sided k-factors using 50% confidence. The
probabilities `p` will be: 95%, 97.725%, 99% and 99.865%. Number
of samples `n` will be: 2-10, 1000000. Have `n` define the rows
and `p` define the columns:
>>> import numpy as np
>>> from pandas import DataFrame
>>> n = [[i] for i in range(2, 11)] # create list of lists
>>> n.append([1000000])
>>> p = [.95, .9545, .99, .9973]
>>> table = kdouble(p, .50, n)
>>> DataFrame(table, index=[i[0] for i in n], columns=p)
0.9500 0.9545 0.9900 0.9973
2 3.502564 3.568593 4.502465 5.175585
3 2.697379 2.750060 3.498689 4.041051
4 2.456441 2.505211 3.200478 3.706120
5 2.336939 2.383750 3.052519 3.540237
6 2.264675 2.310279 2.962781 3.439587
7 2.215998 2.260775 2.902111 3.371448
8 2.180884 2.225054 2.858183 3.322030
9 2.154316 2.198021 2.824834 3.284445
10 2.133494 2.176829 2.798616 3.254847
1000000 1.959966 2.000004 2.575831 2.999979
"""
p = np.asarray(p)
c = np.asarray(c)
n = np.asarray(n)
chi = chi2.ppf(1 - c, n - 1)
r = _getr(n, p, tol)
return np.sqrt((n - 1) / chi) * r
[docs]
def order_stats(which, *, p=None, c=None, n=None, r=None):
"""
Compute a parameter from order statistics.
Parameters
----------
which : str
Either 'p', 'c', 'n', or 'r' to specify which of the following
arguments is to be computed from the others.
p : scalar or array_like; real, (0, 1)
Proportion of population
c : scalar or array_like; real, (0, 1)
Probability of bounding proportion `p` of the population
(confidence level).
n : scalar or array_like; integer
Sample size
r : scalar or ndarray; integer
Largest-value order statistic. Note::
number of failures = r - 1
.. note::
Zero will be returned for `r` if there are not enough
samples to meet the criteria. For example, it takes at
least 230 samples to reach a ``p=0.99, c=0.90`` (P99/90)
level. Therefore, ``order_stats("r", p=0.99, c=0.90,
n=25)`` will return 0.
Returns
-------
One of `p`, `c`, `n`, or `r`; according to `which`.
Notes
-----
One of the inputs of `p`, `c`, `n`, and `r` can be left as None;
the remaining inputs must be broadcast-compatible and must be
named.
The binomial distribution forms the mathematical foundation of
this routine; see reference [#stat4]_. [#stat5]_ has a good
definition of the order statistic. See also "Bernoulli Trials",
reference [#stat6]_, which ties some of these ideas together in
the analysis of success/failure probabilities.
References
----------
.. [#stat4] Wikipedia, "Binomial distribution",
https://en.wikipedia.org/wiki/Binomial_distribution
.. [#stat5] Wikipedia, "Order statistic",
https://en.wikipedia.org/wiki/Order_statistic
.. [#stat6] Wikipedia, "Bernoulli trial",
https://en.wikipedia.org/wiki/Bernoulli_trial
Examples
--------
Start with 700 samples of unknown distribution. After sorting,
which of the samples represents at least a P99/90 level? From
published tables, `r` should be 4, meaning the 4-th highest value
of the 700 is an estimate of the P99/90 level (or higher). Another
way to look at that result is that 3 failures (or fewer) out of
700 trials demonstrates at least a P99/90 level.
>>> from pyyeti.stats import order_stats
>>> order_stats('r', p=.99, c=.90, n=700)
4
Holding the probability constant at 90%, the portion of the
population bounded has to be at least 99%. But, what did it turn
out to be?
>>> order_stats('p', c=.90, n=700, r=4) # doctest: +ELLIPSIS
0.99048109...
Instead, hold the portion constant. What is the probability of
covering 99% percent of the population by selecting the 4th highest
of 700?
>>> order_stats('c', p=.99, n=700, r=4) # doctest: +ELLIPSIS
0.91927834...
How many samples did we really need to reach at least the P99/90
level by selecting the 4th highest?
>>> order_stats('n', p=.99, c=.90, r=4)
667
Generate a 90% confidence table showing the number of trials
needed for: `r` will go from 1 to 12 (defining the rows), and
population coverage will be: [.95, .9772, .99, .9973, .99865]
(defining the columns). Display using a pandas DataFrame:
>>> from pandas import DataFrame
>>> r = np.arange(1, 13).reshape(-1, 1)
>>> p = [.95, .97725, .99, .9973, .99865]
>>> table = order_stats('n', c=.90, r=r, p=p)
>>> DataFrame(table, index=r.ravel(), columns=p)
0.95000 0.97725 0.99000 0.99730 0.99865
1 45 101 230 852 1705
2 77 170 388 1440 2880
3 105 233 531 1970 3941
4 132 292 667 2473 4947
5 158 350 798 2959 5920
6 184 406 926 3433 6868
7 209 461 1051 3899 7800
8 234 516 1175 4358 8717
9 258 569 1297 4811 9624
10 282 622 1418 5259 10521
11 306 675 1538 5704 11410
12 330 727 1658 6145 12293
"""
if which == "c":
r = np.asarray(r)
p = np.asarray(p)
return binom.sf(r - 1, n, 1 - p)
elif which == "r":
# c = np.asarray(c)
# p = np.asarray(p)
# return binom.ppf(1-c, n, 1-p) # gets 'value too deep error'
b = np.broadcast(c, n, p)
r = np.empty(b.shape)
r.flat = [binom.ppf(1 - c, n, 1 - p) for (c, n, p) in b]
if r.ndim == 0:
return int(r[()])
return r.astype(int)
elif which == "n":
def _func(n, p, s, pr):
return p - (1 - betainc(s + 1, n - s, pr))
def _run_brentq(c, r, p):
# find [a, b] interval by brute force:
a = r
b = 2 * a
loops = 0
while _func(b, 1 - c, r - 1, 1 - p) < 0 and loops < 30:
a = b
b = 2 * a
loops += 1
return brentq(_func, a, b, args=(1 - c, r - 1, 1 - p))
b = np.broadcast(c, r, p)
n = np.empty(b.shape)
n.flat = [_run_brentq(c, r, p) for (c, r, p) in b]
return np.ceil(n).astype(int)
elif which == "p":
def _func(pr, p, s, n):
return p - binom.cdf(s, n, pr)
b = np.broadcast(c, r, n)
n = np.empty(b.shape)
n.flat = [1 - brentq(_func, 0, 1, args=(1 - c, r - 1, n)) for (c, r, n) in b]
if n.ndim == 0:
return n[()]
return n
raise ValueError("invalid `which` setting")