# 我花了一年时间研究不确定性估算，写下了这份最全指南

import datetime
import random

import numpy


def generate_time_series(k=200, m=1000, sigma=100, n=50,
                         start_date=datetime.date(2017, 7, 1)):
    """Generate a noisy linear time series y = k*x + m + N(0, sigma).

    Returns:
        xs: n sample positions in [0, 1).
        ys: noisy observations at each x.
        ts: dates for each x, spread over ~one year from start_date.
        x_scale, t_scale: a denser 500-point grid over [-1, 2) (i.e. one
            year before and one year after the data) for plotting fits.
    """
    xs = numpy.linspace(0, 1, n, endpoint=False)
    ys = [k * x + m + random.gauss(0, sigma) for x in xs]
    # timedelta(x) is x days; multiplying by 365 maps x in [0, 1) onto a year.
    ts = [start_date + datetime.timedelta(x) * 365 for x in xs]
    x_scale = numpy.linspace(-1, 2, 500)  # for plotting
    t_scale = [start_date + datetime.timedelta(x) * 365 for x in x_scale]
    return xs, ys, ts, x_scale, t_scale
# Generate one dataset and plot the raw observations.
xs, ys, ts, x_scale, t_scale = generate_time_series()

pyplot.scatter(ts, ys, alpha=0.5, s=100)
pyplot.xlabel('Date')
pyplot.ylabel('Weight of elephant (kg)')

1. 数据本身。给定一定的时间范围（t，t′），在这个时间间隔内大象体重的分布是什么？

2.某些参数的不确定性。如参数k在线性关系y = k t + m里，或者某些估算器的不确定性，就像许多观测值的平均值一样。

3.预测数值的不确定性。因此，如果我们预测日期为t（可能在未来）时大象的重量是y公斤，我们想知道数量y的不确定性。

# Group the observations by calendar month and draw a box plot per month.
d = pandas.DataFrame({'x': xs, 't': ts, 'Weight (kg)': ys})
d['Month'] = d['t'].apply(lambda t: t.strftime('%Y-%m'))
seaborn.boxplot(data=d, x='Month', y='Weight (kg)')

def plot_confidence_interval(observations_by_group):
    """Shade the 95% confidence interval of the mean for each group.

    Args:
        observations_by_group: dict mapping group label -> list of
            observations. Groups are plotted in sorted key order.
    """
    groups = sorted(observations_by_group.keys())
    lo_bound = []
    hi_bound = []
    for group in groups:
        series = observations_by_group[group]
        mu, std, n = numpy.mean(series), numpy.std(series), len(series)
        # 1.96 standard errors on each side = 95% CI under a normal
        # approximation of the sampling distribution of the mean.
        lo_bound.append(mu - 1.96 * std * n ** -0.5)
        hi_bound.append(mu + 1.96 * std * n ** -0.5)
    pyplot.fill_between(groups, lo_bound, hi_bound, alpha=0.2,
                        label='Confidence interval (normal)')
# Bucket observations by month and overlay the normal-approximation CI.
pyplot.scatter(ts, ys, alpha=0.5, s=100)
observations_by_month = {}
for month, y in zip(d['Month'], d['Weight (kg)']):
    observations_by_month.setdefault(month, []).append(y)
plot_confidence_interval(observations_by_month)
pyplot.ylabel('Weight of elephant (kg)')
pyplot.legend()

STATES = ['CA', 'NY', 'FL', 'TX', 'PA', 'IL', 'OH']
GROUPS = ['test', 'control']


def generate_binary_categorical(states=STATES, groups=GROUPS, k=400,
                                zs=(0, 0.2), z_std=0.1, b=-3, b_std=1):
    """Simulate per-state "k out of n" binary outcomes for each group.

    Don't pay too much attention to the details: the main thing happens in
    numpy.random.binomial, which is where we draw the "k out of n" outcomes.

    Args:
        states: labels for the categorical dimension.
        groups: experiment arms, paired positionally with zs.
        k: mean of the exponential draw for expected observations per state.
        zs: per-group shift in log-odds (default makes 'control' convert
            more than 'test').
        z_std: noise std around each group's shift.
        b, b_std: mean and std of the per-state baseline log-odds bias.

    Returns:
        dict mapping group -> (ns, ks) arrays: ks[i] successes out of
        ns[i] trials for states[i].
    """
    output = {}
    # Expected observation volume and baseline log-odds bias per state.
    e_obs_per_state = numpy.random.exponential(k, size=len(states))
    state_biases = numpy.random.normal(b, b_std, size=len(states))
    for group, z in zip(groups, zs):
        noise = numpy.random.normal(z, z_std, size=len(states))
        # Logistic link: squash log-odds into a probability in (0, 1).
        ps = 1 / (1 + numpy.exp(-(state_biases + noise)))
        ns = numpy.random.poisson(e_obs_per_state)
        ks = numpy.random.binomial(ns, ps)
        output[group] = (ns, ks)
    return output

# Plot each group's observed conversion rate (ks / ns) per state.
data = generate_binary_categorical()
for group, (ns, ks) in data.items():
    pyplot.scatter(STATES, ks / ns, label=group, alpha=0.7, s=400)
pyplot.ylabel('Conversion rate')
pyplot.legend()

# 95% interval for a conversion rate with k=3 successes out of n=100,
# from the Beta(k, n - k) distribution's percent-point function.
n, k = 100, 3
ci = scipy.stats.beta.ppf([0.025, 0.975], k, n - k)
# ci -> array([0.00629335, 0.07107612])

# Error bars per state/group from the Beta 2.5th/97.5th percentiles.
for group, (ns, ks) in data.items():
    lo = scipy.stats.beta.ppf(0.025, ks, ns - ks)
    hi = scipy.stats.beta.ppf(0.975, ks, ns - ks)
    mean = ks / ns
    # yerr wants offsets from the point, not absolute bounds.
    pyplot.errorbar(STATES, y=mean, yerr=[mean - lo, hi - mean],
                    label=group, alpha=0.7, linewidth=0, elinewidth=50)
pyplot.ylabel('Conversion rate')
pyplot.legend()

Bootstrapping算法（拔靴法）

# Bootstrap a 95% CI of each month's mean weight: resample each month's
# observations with replacement 1000 times and take percentiles of the
# resampled means.
lo_bound = []
hi_bound = []
months = sorted(observations_by_month.keys())
for month in months:
    series = observations_by_month[month]
    bootstrapped_means = []
    for i in range(1000):
        # sample with replacement
        bootstrap = [random.choice(series) for _ in series]
        bootstrapped_means.append(numpy.mean(bootstrap))
    lo_bound.append(numpy.percentile(bootstrapped_means, 2.5))
    hi_bound.append(numpy.percentile(bootstrapped_means, 97.5))
pyplot.scatter(ts, ys, alpha=0.5, s=100)
pyplot.fill_between(months, lo_bound, hi_bound, alpha=0.2,
                    label='Confidence interval')
pyplot.ylabel('Weight of elephant (kg)')
pyplot.legend()

Bootstrapping算法很不错，因为它让你得以回避任何关于生成数据的概率分布的问题。虽然它可能有点慢，但它基本上是即插即用的，并且几乎适用于所有问题。

# seaborn computes its own bootstrapped confidence intervals for the bars.
seaborn.barplot(data=d, x='Month', y='Weight (kg)')

# Fresh dataset for the regression examples that follow.
xs, ys, ts, x_scale, t_scale = generate_time_series()
def model(xs, k, m):
    """Linear model: predict k * xs + m (works on scalars and arrays)."""
    return k * xs + m


def l2_loss(tup, xs, ys):
    """Sum of squared residuals of the linear model with params tup=(k, m)."""
    k, m = tup
    delta = model(xs, k, m) - ys
    return numpy.dot(delta, delta)
# Fit (k, m) by minimizing squared loss and draw the fitted line.
import scipy.optimize  # needed here; the article imported it only later

k_hat, m_hat = scipy.optimize.minimize(l2_loss, (0, 0), args=(xs, ys)).x
pyplot.scatter(ts, ys, alpha=0.5, s=100)
pyplot.plot(t_scale, model(x_scale, k_hat, m_hat), color='red',
            linewidth=5, alpha=0.5)
pyplot.ylabel('Weight of elephant (kg)')

import scipy.optimize


def neg_log_likelihood(tup, xs, ys):
    """Negative log-likelihood of ys under y = k*xs + m + N(0, sigma).

    Since sigma > 0, we use log(sigma) as the parameter instead.
    That way we have an unconstrained problem.
    """
    k, m, log_sigma = tup
    sigma = numpy.exp(log_sigma)
    delta = model(xs, k, m) - ys
    # Gaussian NLL: n/2 * log(2*pi*sigma^2) + SSE / (2*sigma^2)
    return len(xs) / 2 * numpy.log(2 * numpy.pi * sigma ** 2) + \
        numpy.dot(delta, delta) / (2 * sigma ** 2)

# Fit (k, m, sigma) by maximum likelihood and shade the +/- 1.96*sigma
# band: the uncertainty of the *data* around the fitted line.
k_hat, m_hat, log_sigma_hat = scipy.optimize.minimize(
    neg_log_likelihood, (0, 0, 0), args=(xs, ys)
).x
sigma_hat = numpy.exp(log_sigma_hat)
pyplot.scatter(ts, ys, alpha=0.5, s=100)
pyplot.plot(t_scale, model(x_scale, k_hat, m_hat),
            color='green', linewidth=5)
pyplot.fill_between(
    t_scale,
    model(x_scale, k_hat, m_hat) - 1.96 * sigma_hat,
    model(x_scale, k_hat, m_hat) + 1.96 * sigma_hat,
    color='red', alpha=0.3)
pyplot.legend()
pyplot.ylabel('Weight of elephant (kg)')

# Bootstrap the *fit*: refit (k, m) on resampled data many times and plot
# the spread of fitted lines — uncertainty about the trend itself.
pyplot.scatter(ts, ys, alpha=0.5, s=100)
xys = list(zip(xs, ys))
curves = []
for i in range(100):
    # sample with replacement
    bootstrap = [random.choice(xys) for _ in xys]
    xs_bootstrap = numpy.array([x for x, y in bootstrap])
    ys_bootstrap = numpy.array([y for x, y in bootstrap])
    k_hat, m_hat = scipy.optimize.minimize(
        l2_loss, (0, 0), args=(xs_bootstrap, ys_bootstrap)
    ).x
    curves.append(model(x_scale, k_hat, m_hat))
# Plot individual lines
for curve in curves:
    pyplot.plot(t_scale, curve, alpha=0.1, linewidth=3, color='green')
# Plot 95% confidence interval (pointwise percentiles across curves)
lo, hi = numpy.percentile(curves, (2.5, 97.5), axis=0)
pyplot.fill_between(t_scale, lo, hi, color='red', alpha=0.5)
pyplot.ylabel('Weight of elephant (kg)')

●  第一个图找到k和m的一个解，并显示预测的不确定性。所以，如果你被问到下个月大象体重的范围是什么，你可以从图表中得到它。
●  第二个图找到了k和m的许多解，并显示了kx + m的不确定性。因此，这回答了一个不同的问题 - 大象体重随时间变化的趋势是什么，趋势的不确定性是什么？

# Bootstrap + maximum likelihood, now adding the noise term back into each
# curve — uncertainty of the *prediction*, not just the trend.
pyplot.scatter(ts, ys, alpha=0.5, s=100)
xys = list(zip(xs, ys))
curves = []
for i in range(4000):
    # sample with replacement
    bootstrap = [random.choice(xys) for _ in xys]
    xs_bootstrap = numpy.array([x for x, y in bootstrap])
    ys_bootstrap = numpy.array([y for x, y in bootstrap])
    k_hat, m_hat, log_sigma_hat = scipy.optimize.minimize(
        neg_log_likelihood, (0, 0, 0), args=(xs_bootstrap, ys_bootstrap)
    ).x
    curves.append(
        model(x_scale, k_hat, m_hat) +
        # Note what's going on here: we're _adding_ the random term
        # to the predictions!
        numpy.exp(log_sigma_hat) * numpy.random.normal(size=x_scale.shape)
    )
# Plot 95% confidence interval
lo, hi = numpy.percentile(curves, (2.5, 97.5), axis=0)
pyplot.fill_between(t_scale, lo, hi, color='red', alpha=0.5)
pyplot.ylabel('Weight of elephant (kg)')

import emcee

# Sample the posterior over (k, m, log_sigma) with an MCMC ensemble
# sampler; with a flat prior the log-posterior is just the log-likelihood.
xs, ys, ts, x_scale, t_scale = generate_time_series()


def log_likelihood(tup, xs, ys):
    """Log-likelihood (negated NLL) — emcee wants the log-probability."""
    return -neg_log_likelihood(tup, xs, ys)


ndim, nwalkers = 3, 10
p0 = [numpy.random.rand(ndim) for i in range(nwalkers)]
sampler = emcee.EnsembleSampler(nwalkers, ndim, log_likelihood,
                                args=[xs, ys])
sampler.run_mcmc(p0, 10000)

# Grab the last 10 from each walker
samples = sampler.chain[:, -10:, :].reshape((-1, ndim))
pyplot.scatter(ts, ys, alpha=0.5, s=100)
for k, m, log_sigma in samples:
    pyplot.plot(t_scale, model(x_scale, k, m), alpha=0.1,
                linewidth=3, color='green')
pyplot.ylabel('Weight of elephant (kg)')

# Grab slightly more samples this time and look at each parameter's
# marginal posterior distribution.
samples = sampler.chain[:, -500:, :].reshape((-1, ndim))
k_samples, m_samples, log_sigma_samples = samples.T
seaborn.distplot(k_samples, label='k')
seaborn.distplot(m_samples, label='m')
seaborn.distplot(numpy.exp(log_sigma_samples), label='sigma')
pyplot.legend()

# Posterior predictive interval: one curve per posterior sample, each with
# Gaussian observation noise added, then pointwise percentiles.
pyplot.scatter(ts, ys, alpha=0.5, s=100)
samples = sampler.chain[:, -4000:, :].reshape((-1, ndim))
curves = []
for k, m, log_sigma in samples:
    curves.append(
        model(x_scale, k, m) +
        numpy.exp(log_sigma) * numpy.random.normal(size=x_scale.shape)
    )
# Plot 95% confidence interval
lo, hi = numpy.percentile(curves, (2.5, 97.5), axis=0)
pyplot.fill_between(t_scale, lo, hi, color='red', alpha=0.5)
pyplot.ylabel('Weight of elephant (kg)')

|
24天前
|

29 1
|
6月前
|
Cloud Native 算法 Go

69 0
|

【基础理论-白盒测试】：只要你看完这篇文章，你就超过了99.99%的同行了
【基础理论-白盒测试】：只要你看完这篇文章，你就超过了99.99%的同行了
233 0
|

176 0
LeetCode 训练场：1450. 在既定时间做作业的学生人数
LeetCode 训练场：1450. 在既定时间做作业的学生人数
98 0
|

1704 0
|