1
0
mirror of https://github.com/exaloop/codon.git synced 2025-06-03 15:03:52 +08:00

stdlib/statistics.codon

This commit is contained in:
Ishak Numanagić 2022-01-24 11:12:38 +01:00
parent 76e025305c
commit cfc0f563f1

View File

@ -1,22 +1,33 @@
# (c) 2022 Exaloop Inc. All rights reserved.
import bisect import bisect
import random import random
from math import frexp as _frexp, floor as _floor, fabs as _fabs, erf as _erf from math import frexp as _frexp, floor as _floor, fabs as _fabs, erf as _erf
from math import gcd as _gcd, exp as _exp, log as _log, sqrt as _sqrt, tau as _tau, hypot as _hypot from math import (
gcd as _gcd,
exp as _exp,
log as _log,
sqrt as _sqrt,
tau as _tau,
hypot as _hypot,
)
class StatisticsError: class StatisticsError:
_hdr: ExcHeader _hdr: ExcHeader
def __init__(self): def __init__(self):
self._hdr = ('StatisticsError', '', '', '', 0, 0) self._hdr = ("StatisticsError", "", "", "", 0, 0)
def __init__(self, message: str): def __init__(self, message: str):
self._hdr = ('StatisticsError', message, '', '', 0, 0) self._hdr = ("StatisticsError", message, "", "", 0, 0)
@property @property
def message(self): def message(self):
return self._hdr.msg return self._hdr.msg
def median[T](data: List[T]) -> float:
def median(data: List[T], T: type) -> float:
""" """
Return the median (middle value) of numeric data. Return the median (middle value) of numeric data.
@ -28,13 +39,14 @@ def median[T](data: List[T]) -> float:
n = len(data) n = len(data)
if n == 0: if n == 0:
raise StatisticsError("no median for empty data") raise StatisticsError("no median for empty data")
if n%2 == 1: if n % 2 == 1:
return float(data[n//2]) return float(data[n // 2])
else: else:
i = n//2 i = n // 2
return (data[i - 1] + data[i]) / 2 return (data[i - 1] + data[i]) / 2
def median_low[T](data: List[T]) -> float:
def median_low(data: List[T], T: type) -> float:
""" """
Return the low median of numeric data. Return the low median of numeric data.
@ -45,12 +57,13 @@ def median_low[T](data: List[T]) -> float:
n = len(data) n = len(data)
if n == 0: if n == 0:
raise StatisticsError("no median for empty data") raise StatisticsError("no median for empty data")
if n%2 == 1: if n % 2 == 1:
return float(data[n//2]) return float(data[n // 2])
else: else:
return float(data[n//2 -1]) return float(data[n // 2 - 1])
def median_high[T](data: List[T]) -> float:
def median_high(data: List[T], T: type) -> float:
""" """
Return the high median of data. Return the high median of data.
@ -61,9 +74,10 @@ def median_high[T](data: List[T]) -> float:
n = len(data) n = len(data)
if n == 0: if n == 0:
raise StatisticsError("no median for empty data") raise StatisticsError("no median for empty data")
return float(data[n//2]) return float(data[n // 2])
def _find_lteq[T](a: List[T], x: float):
def _find_lteq(a: List[T], x: float, T: type):
""" """
Locate the leftmost value exactly equal to x Locate the leftmost value exactly equal to x
""" """
@ -72,16 +86,18 @@ def _find_lteq[T](a: List[T], x: float):
return i return i
assert False assert False
def _find_rteq[T](a: List[T], l: int, x: float):
def _find_rteq(a: List[T], l: int, x: float, T: type):
""" """
Locate the rightmost value exactly equal to x Locate the rightmost value exactly equal to x
""" """
i = bisect.bisect_right(a, x, lo=l) i = bisect.bisect_right(a, x, lo=l)
if i != (len(a)+1) and a[i-1] == x: if i != (len(a) + 1) and a[i - 1] == x:
return i-1 return i - 1
assert False assert False
def median_grouped[T,S=int](data: List[T], interval: S = 1) -> float:
def median_grouped(data: List[T], interval: S = 1, T: type, S: type = int) -> float:
""" """
Return the 50th percentile (median) of grouped continuous data. Return the 50th percentile (median) of grouped continuous data.
""" """
@ -93,8 +109,8 @@ def median_grouped[T,S=int](data: List[T], interval: S = 1) -> float:
return float(data[0]) return float(data[0])
# Find the value at the midpoint. # Find the value at the midpoint.
x = float(data[n//2]) x = float(data[n // 2])
L = x - float(interval)/2 # The lower limit of the median interval. L = x - float(interval) / 2 # The lower limit of the median interval.
# Find the position of leftmost occurrence of x in data # Find the position of leftmost occurrence of x in data
l1 = _find_lteq(data, x) l1 = _find_lteq(data, x)
@ -103,9 +119,10 @@ def median_grouped[T,S=int](data: List[T], interval: S = 1) -> float:
l2 = _find_rteq(data, l1, x) l2 = _find_rteq(data, l1, x)
cf = l1 cf = l1
f = l2 - l1 + 1 f = l2 - l1 + 1
return L + interval*(n/2 - cf)/f return L + interval * (n / 2 - cf) / f
def mode[T](data: List[T]) -> T:
def mode(data: List[T], T: type) -> T:
""" """
Return the most common data point from discrete or nominal data. Return the most common data point from discrete or nominal data.
""" """
@ -119,7 +136,8 @@ def mode[T](data: List[T]) -> T:
elem = i elem = i
return elem return elem
def multimode[T](data: List[T]):
def multimode(data: List[T], T: type):
""" """
Return a list of the most frequently occurring values. Return a list of the most frequently occurring values.
@ -141,7 +159,10 @@ def multimode[T](data: List[T]):
mulmode.append(i) mulmode.append(i)
return mulmode return mulmode
def quantiles[T](data: List[T], n: int = 4, method: str = 'exclusive') -> List[float]:
def quantiles(
data: List[T], n: int = 4, method: str = "exclusive", T: type
) -> List[float]:
""" """
Divide *data* into *n* continuous intervals with equal probability. Divide *data* into *n* continuous intervals with equal probability.
@ -159,32 +180,33 @@ def quantiles[T](data: List[T], n: int = 4, method: str = 'exclusive') -> List[f
maximum value is treated as the 100th percentile. maximum value is treated as the 100th percentile.
""" """
if n < 1: if n < 1:
raise StatisticsError('n must be at least 1') raise StatisticsError("n must be at least 1")
data = sorted(data) data = sorted(data)
ld = len(data) ld = len(data)
if ld < 2: if ld < 2:
raise StatisticsError('must have at least two data points') raise StatisticsError("must have at least two data points")
if method == 'inclusive': if method == "inclusive":
m = ld - 1 m = ld - 1
result = List[float]() result = List[float]()
for i in range(1, n): for i in range(1, n):
j = i * m // n j = i * m // n
delta = (i * m) - (j * n) delta = (i * m) - (j * n)
interpolated = (data[j] * (n - delta) + data[j+1] * delta) / n interpolated = (data[j] * (n - delta) + data[j + 1] * delta) / n
result.append(interpolated) result.append(interpolated)
return result return result
if method == 'exclusive': if method == "exclusive":
m = ld + 1 m = ld + 1
result = List[float]() result = List[float]()
for i in range(1, n): for i in range(1, n):
j = i * m // n # rescale i to m/n j = i * m // n # rescale i to m/n
j = 1 if j < 1 else ld-1 if j > ld-1 else j # clamp to 1 .. ld-1 j = 1 if j < 1 else ld - 1 if j > ld - 1 else j # clamp to 1 .. ld-1
delta = (i * m) - (j * n) # exact integer math delta = (i * m) - (j * n) # exact integer math
interpolated = (data[j-1] * (n - delta) + data[j] * delta) / n interpolated = (data[j - 1] * (n - delta) + data[j] * delta) / n
result.append(interpolated) result.append(interpolated)
return result return result
raise ValueError(f'Unknown method: {method}') raise ValueError(f"Unknown method: {method}")
def _lcm(x: int, y: int): def _lcm(x: int, y: int):
""" """
@ -233,6 +255,7 @@ def _sum(data: List[float]) -> float:
i += 1 i += 1
return s + c return s + c
def mean(data: List[float]) -> float: def mean(data: List[float]) -> float:
""" """
Return the sample arithmetic mean of data. Return the sample arithmetic mean of data.
@ -243,9 +266,10 @@ def mean(data: List[float]) -> float:
""" """
n = len(data) n = len(data)
if n < 1: if n < 1:
raise StatisticsError('mean requires at least one data point') raise StatisticsError("mean requires at least one data point")
total = _sum(data) total = _sum(data)
return total/n return total / n
''' '''
def fmean(data: List[float]) -> float: def fmean(data: List[float]) -> float:
@ -259,6 +283,7 @@ def fmean(data: List[float]) -> float:
pass pass
''' '''
def geometric_mean(data: List[float]) -> float: def geometric_mean(data: List[float]) -> float:
""" """
Convert data to floats and compute the geometric mean. Convert data to floats and compute the geometric mean.
@ -274,6 +299,7 @@ def geometric_mean(data: List[float]) -> float:
raise StatisticsError("geometric mean requires a non-empty dataset") raise StatisticsError("geometric mean requires a non-empty dataset")
return _exp(mean(list(map(_log, data)))) return _exp(mean(list(map(_log, data))))
def _fail_neg(values: List[float], errmsg: str): def _fail_neg(values: List[float], errmsg: str):
""" """
Iterate over values, failing if any are less than zero. Iterate over values, failing if any are less than zero.
@ -283,6 +309,7 @@ def _fail_neg(values: List[float], errmsg: str):
raise StatisticsError(errmsg) raise StatisticsError(errmsg)
yield x yield x
def harmonic_mean(data: List[float]) -> float: def harmonic_mean(data: List[float]) -> float:
""" """
Return the harmonic mean of data. Return the harmonic mean of data.
@ -307,9 +334,10 @@ def harmonic_mean(data: List[float]) -> float:
for x in _fail_neg(data, errmsg): for x in _fail_neg(data, errmsg):
if x == 0.0: if x == 0.0:
return 0.0 return 0.0
li.append(1/x) li.append(1 / x)
total = _sum(li) total = _sum(li)
return n/total return n / total
def _ss(data: List[float], c: float): def _ss(data: List[float], c: float):
""" """
@ -319,13 +347,14 @@ def _ss(data: List[float], c: float):
from the mean are calculated in a second pass. Otherwise, deviations are from the mean are calculated in a second pass. Otherwise, deviations are
calculated from c as given. calculated from c as given.
""" """
total = _sum([(x-c)**2 for x in data]) total = _sum([(x - c) ** 2 for x in data])
total2 = _sum([(x-c) for x in data]) total2 = _sum([(x - c) for x in data])
total -= total2**2/len(data) total -= total2 ** 2 / len(data)
return total return total
def pvariance(data: List[float], mu: Optional[float]=None):
def pvariance(data: List[float], mu: Optional[float] = None):
""" """
Return the population variance of `data`. Return the population variance of `data`.
@ -336,24 +365,28 @@ def pvariance(data: List[float], mu: Optional[float]=None):
TODO/CAVEATS: TODO/CAVEATS:
- Assumes input is a list of floats - Assumes input is a list of floats
""" """
mu1 = ~mu if mu else mean(data) if mu is None:
mu = mean(data)
n = len(data) n = len(data)
if n < 1: if n < 1:
raise StatisticsError("pvariance requires at least one data point") raise StatisticsError("pvariance requires at least one data point")
ss = _ss(data, mu1) ss = _ss(data, mu)
return ss/n return ss / n
def pstdev(data: List[float], mu: Optional[float]=None):
def pstdev(data: List[float], mu: Optional[float] = None):
""" """
Return the square root of the population variance. Return the square root of the population variance.
""" """
mu1 = ~mu if mu else mean(data) if mu is None:
var = pvariance(data, mu1) mu = mean(data)
var = pvariance(data, mu)
return _sqrt(var) return _sqrt(var)
def variance(data: List[float], xbar: Optional[float]=None):
def variance(data: List[float], xbar: Optional[float] = None):
""" """
Return the sample variance of data. Return the sample variance of data.
@ -361,36 +394,43 @@ def variance(data: List[float], xbar: Optional[float]=None):
The optional argument xbar, if given, should be the mean of The optional argument xbar, if given, should be the mean of
the data. If it is missing or None, the mean is automatically calculated. the data. If it is missing or None, the mean is automatically calculated.
""" """
xbar1 = ~xbar if xbar else mean(data) if xbar is None:
xbar = mean(data)
n = len(data) n = len(data)
if n < 2: if n < 2:
raise StatisticsError("variance requires at least two data points") raise StatisticsError("variance requires at least two data points")
ss = _ss(data, xbar1) ss = _ss(data, xbar)
return ss/(n-1) return ss / (n - 1)
def stdev(data, xbar: Optional[float]=None):
def stdev(data, xbar: Optional[float] = None):
""" """
Return the square root of the sample variance. Return the square root of the sample variance.
""" """
xbar1 = ~xbar if xbar else mean(data) if xbar is None:
var = variance(data, xbar1) xbar = mean(data)
var = variance(data, xbar)
return _sqrt(var) return _sqrt(var)
class NormalDist: class NormalDist:
""" """
Normal distribution of a random variable Normal distribution of a random variable
""" """
PRECISION: float PRECISION: float
_mu: float _mu: float
_sigma: float _sigma: float
def __eq__(self, other: NormalDist): def __eq__(self, other: NormalDist):
return (self._mu - other._mu) < self.PRECISION and (self._sigma - other._sigma) < self.PRECISION return (self._mu - other._mu) < self.PRECISION and (
self._sigma - other._sigma
) < self.PRECISION
def _init(self, mu: float, sigma: float): def _init(self, mu: float, sigma: float):
self.PRECISION = 1e-6 self.PRECISION = 1e-6
if sigma < 0.0: if sigma < 0.0:
raise StatisticsError('sigma must be non-negative') raise StatisticsError("sigma must be non-negative")
self._mu = mu self._mu = mu
self._sigma = sigma self._sigma = sigma
@ -447,7 +487,7 @@ class NormalDist:
""" """
variance = self._sigma ** 2.0 variance = self._sigma ** 2.0
if not variance: if not variance:
raise StatisticsError('pdf() not defined when sigma is zero') raise StatisticsError("pdf() not defined when sigma is zero")
return _exp((x - self._mu) ** 2.0 / (-2.0 * variance)) / _sqrt(_tau * variance) return _exp((x - self._mu) ** 2.0 / (-2.0 * variance)) / _sqrt(_tau * variance)
def cdf(self, x): def cdf(self, x):
@ -455,7 +495,7 @@ class NormalDist:
Cumulative distribution function. P(X <= x) Cumulative distribution function. P(X <= x)
""" """
if not self._sigma: if not self._sigma:
raise StatisticsError('cdf() not defined when sigma is zero') raise StatisticsError("cdf() not defined when sigma is zero")
return 0.5 * (1.0 + _erf((x - self._mu) / (self._sigma * _sqrt(2.0)))) return 0.5 * (1.0 + _erf((x - self._mu) / (self._sigma * _sqrt(2.0))))
def _normal_dist_inv_cdf(self, p: float, mu: float, sigma: float): def _normal_dist_inv_cdf(self, p: float, mu: float, sigma: float):
@ -470,22 +510,55 @@ class NormalDist:
if _fabs(q) <= 0.425: if _fabs(q) <= 0.425:
r = 0.180625 - q * q r = 0.180625 - q * q
# Hash sum: 55.88319_28806_14901_4439 # Hash sum: 55.88319_28806_14901_4439
num = (((((((2.5090809287301226727e+3 * r + num = (
3.3430575583588128105e+4) * r + (
6.7265770927008700853e+4) * r + (
4.5921953931549871457e+4) * r + (
1.3731693765509461125e+4) * r + (
1.9715909503065514427e+3) * r + (
1.3314166789178437745e+2) * r + (
3.3871328727963666080e+0) * q 2.5090809287301226727e3 * r
den = (((((((5.2264952788528545610e+3 * r + + 3.3430575583588128105e4
2.8729085735721942674e+4) * r + )
3.9307895800092710610e+4) * r + * r
2.1213794301586595867e+4) * r + + 6.7265770927008700853e4
5.3941960214247511077e+3) * r + )
6.8718700749205790830e+2) * r + * r
4.2313330701600911252e+1) * r + + 4.5921953931549871457e4
1.0) )
* r
+ 1.3731693765509461125e4
)
* r
+ 1.9715909503065514427e3
)
* r
+ 1.3314166789178437745e2
)
* r
+ 3.3871328727963666080e0
) * q
den = (
(
(
(
(
(5.2264952788528545610e3 * r + 2.8729085735721942674e4)
* r
+ 3.9307895800092710610e4
)
* r
+ 2.1213794301586595867e4
)
* r
+ 5.3941960214247511077e3
)
* r
+ 6.8718700749205790830e2
)
* r
+ 4.2313330701600911252e1
) * r + 1.0
x = num / den x = num / den
return mu + (x * sigma) return mu + (x * sigma)
r = p if q <= 0.0 else 1.0 - p r = p if q <= 0.0 else 1.0 - p
@ -493,41 +566,105 @@ class NormalDist:
if r <= 5.0: if r <= 5.0:
r = r - 1.6 r = r - 1.6
# Hash sum: 49.33206_50330_16102_89036 # Hash sum: 49.33206_50330_16102_89036
num = (((((((7.74545014278341407640e-4 * r + num = (
2.27238449892691845833e-2) * r + (
2.41780725177450611770e-1) * r + (
1.27045825245236838258e+0) * r + (
3.64784832476320460504e+0) * r + (
5.76949722146069140550e+0) * r + (
4.63033784615654529590e+0) * r + 7.74545014278341407640e-4 * r
1.42343711074968357734e+0) + 2.27238449892691845833e-2
den = (((((((1.05075007164441684324e-9 * r + )
5.47593808499534494600e-4) * r + * r
1.51986665636164571966e-2) * r + + 2.41780725177450611770e-1
1.48103976427480074590e-1) * r + )
6.89767334985100004550e-1) * r + * r
1.67638483018380384940e+0) * r + + 1.27045825245236838258e0
2.05319162663775882187e+0) * r + )
1.0) * r
+ 3.64784832476320460504e0
)
* r
+ 5.76949722146069140550e0
)
* r
+ 4.63033784615654529590e0
) * r + 1.42343711074968357734e0
den = (
(
(
(
(
(
1.05075007164441684324e-9 * r
+ 5.47593808499534494600e-4
)
* r
+ 1.51986665636164571966e-2
)
* r
+ 1.48103976427480074590e-1
)
* r
+ 6.89767334985100004550e-1
)
* r
+ 1.67638483018380384940e0
)
* r
+ 2.05319162663775882187e0
) * r + 1.0
else: else:
r = r - 5.0 r = r - 5.0
# Hash sum: 47.52583_31754_92896_71629 # Hash sum: 47.52583_31754_92896_71629
num = (((((((2.01033439929228813265e-7 * r + num = (
2.71155556874348757815e-5) * r + (
1.24266094738807843860e-3) * r + (
2.65321895265761230930e-2) * r + (
2.96560571828504891230e-1) * r + (
1.78482653991729133580e+0) * r + (
5.46378491116411436990e+0) * r + 2.01033439929228813265e-7 * r
6.65790464350110377720e+0) + 2.71155556874348757815e-5
den = (((((((2.04426310338993978564e-15 * r + )
1.42151175831644588870e-7) * r + * r
1.84631831751005468180e-5) * r + + 1.24266094738807843860e-3
7.86869131145613259100e-4) * r + )
1.48753612908506148525e-2) * r + * r
1.36929880922735805310e-1) * r + + 2.65321895265761230930e-2
5.99832206555887937690e-1) * r + )
1.0) * r
+ 2.96560571828504891230e-1
)
* r
+ 1.78482653991729133580e0
)
* r
+ 5.46378491116411436990e0
) * r + 6.65790464350110377720e0
den = (
(
(
(
(
(
2.04426310338993978564e-15 * r
+ 1.42151175831644588870e-7
)
* r
+ 1.84631831751005468180e-5
)
* r
+ 7.86869131145613259100e-4
)
* r
+ 1.48753612908506148525e-2
)
* r
+ 1.36929880922735805310e-1
)
* r
+ 5.99832206555887937690e-1
) * r + 1.0
x = num / den x = num / den
if q < 0.0: if q < 0.0:
x = -x x = -x
@ -542,12 +679,12 @@ class NormalDist:
probability. probability.
""" """
if p <= 0.0 or p >= 1.0: if p <= 0.0 or p >= 1.0:
raise StatisticsError('p must be in the range 0.0 < p < 1.0') raise StatisticsError("p must be in the range 0.0 < p < 1.0")
if self._sigma <= 0.0: if self._sigma <= 0.0:
raise StatisticsError('cdf() not defined when sigma at or below zero') raise StatisticsError("cdf() not defined when sigma at or below zero")
return self._normal_dist_inv_cdf(p, self._mu, self._sigma) return self._normal_dist_inv_cdf(p, self._mu, self._sigma)
def quantiles(self, n: int=4): def quantiles(self, n: int = 4):
""" """
Divide into *n* continuous intervals with equal probability. Divide into *n* continuous intervals with equal probability.
@ -557,7 +694,7 @@ class NormalDist:
Set *n* to 100 for percentiles which gives the 99 cuts points that Set *n* to 100 for percentiles which gives the 99 cuts points that
separate the normal distribution in to 100 equal sized groups. separate the normal distribution in to 100 equal sized groups.
""" """
return [self.inv_cdf(float(i)/float(n)) for i in range(1, n)] return [self.inv_cdf(float(i) / float(n)) for i in range(1, n)]
def overlap(self, other: NormalDist) -> float: def overlap(self, other: NormalDist) -> float:
""" """
@ -575,7 +712,7 @@ class NormalDist:
X_var, Y_var = X.variance, Y.variance X_var, Y_var = X.variance, Y.variance
if not X_var or not Y_var: if not X_var or not Y_var:
raise StatisticsError('overlap() not defined when sigma is zero') raise StatisticsError("overlap() not defined when sigma is zero")
dv = Y_var - X_var dv = Y_var - X_var
dm = _fabs(Y._mu - X._mu) dm = _fabs(Y._mu - X._mu)
@ -584,7 +721,7 @@ class NormalDist:
return 1.0 - _erf(dm / (2.0 * X._sigma * _sqrt(2.0))) return 1.0 - _erf(dm / (2.0 * X._sigma * _sqrt(2.0)))
a = X._mu * Y_var - Y._mu * X_var a = X._mu * Y_var - Y._mu * X_var
b = X._sigma * Y._sigma * _sqrt(dm**2.0 + dv * _log(Y_var / X_var)) b = X._sigma * Y._sigma * _sqrt(dm ** 2.0 + dv * _log(Y_var / X_var))
x1 = (a + b) / dv x1 = (a + b) / dv
x2 = (a - b) / dv x2 = (a - b) / dv
@ -689,4 +826,4 @@ class NormalDist:
return hash((self._mu, self._sigma)) return hash((self._mu, self._sigma))
def __repr__(self): def __repr__(self):
return f'NormalDist(mu={self._mu}, sigma={self._sigma})' return f"NormalDist(mu={self._mu}, sigma={self._sigma})"