IV. State Uncertainty (1)
The preceding parts dealt with two kinds of uncertainty: uncertainty in the outcome (the next state) and uncertainty in the model. This part extends the discussion to uncertainty in the state itself. Specifically, the received observations are only probabilistically related to the state rather than revealing it exactly. Such problems can be modeled as partially observable Markov decision processes (POMDPs), but POMDPs are hard to solve optimally in general, so further approximation strategies are needed.
1. Beliefs
A common way to solve a POMDP is to infer a belief distribution over the underlying state at the current time step and then apply a policy that maps beliefs to actions. Both parametric and nonparametric belief representations are discussed below.
In a POMDP, the agent receives observations from an observation space $\mathcal{O}$. Given the action $a$ that was taken and the resulting state $s^{\prime}$, the probability of receiving observation $o$ is $O(o \mid a, s^{\prime})$.
struct POMDP
    γ   # discount factor
    𝒮   # state space
    𝒜   # action space
    𝒪   # observation space
    T   # transition function
    R   # reward function
    O   # observation function
    TRO # sample transition, reward, and observation
end
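For concreteness, the examples that follow reuse a minimal hypothetical two-state instance (an illustrative toy caregiving problem constructed here, not taken from the text above); TRO is set to nothing because the filters below never call it.

𝒮 = [:hungry, :sated];  𝒜 = [:feed, :ignore];  𝒪 = [:cry, :quiet]
T(s, a, s′) = a == :feed ? (s′ == :sated ? 1.0 : 0.0) :
              s == :hungry ? (s′ == :hungry ? 1.0 : 0.0) :
                             (s′ == :hungry ? 0.1 : 0.9)
R(s, a) = (s == :hungry ? -10.0 : 0.0) + (a == :feed ? -5.0 : 0.0)
O(a, s′, o) = s′ == :hungry ? (o == :cry ? 0.8 : 0.2) :
                              (o == :cry ? 0.1 : 0.9)
𝒫 = POMDP(0.9, 𝒮, 𝒜, 𝒪, T, R, O, nothing)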
Recursive Bayesian estimation can be used to infer and update the belief over the current state, given the most recent action and observation. Let $b(s)$ denote the probability assigned to state $s$; a particular belief $b$ lives in the belief space $\mathcal{B}$.
1.1 Discrete State Filter
If both the state space and the observation space are finite, a discrete state filter can perform this inference exactly.
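Concretely, given action $a$ and observation $o$, the discrete belief update is $$b^{\prime}(s^{\prime}) \propto O(o \mid a, s^{\prime}) \sum_{s} T(s^{\prime} \mid s, a)\, b(s),$$ followed by normalization, which is exactly what the following function computes.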
using LinearAlgebra  # normalize!, dot (⋅), I, and cholesky are used throughout this section

function update(b::Vector{Float64}, 𝒫, a, o)
    𝒮, T, O = 𝒫.𝒮, 𝒫.T, 𝒫.O
    b′ = similar(b)
    for (i′, s′) in enumerate(𝒮)
        po = O(a, s′, o)
        b′[i′] = po * sum(T(s, a, s′) * b[i] for (i, s) in enumerate(𝒮))
    end
    if sum(b′) ≈ 0.0
        fill!(b′, 1)  # fall back to a uniform belief if the observation has zero likelihood
    end
    return normalize!(b′, 1)
end
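As a quick usage sketch with the hypothetical two-state problem defined above, starting from a uniform belief:

b = [0.5, 0.5]
b = update(b, 𝒫, :ignore, :cry)   # the belief shifts heavily toward :hungry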
The success of belief updating depends on having accurate observation and transition models. When these models are not well known, it is advisable to use simplified models with more diffuse distributions, which helps prevent the overconfidence that makes state estimates brittle.
1.2 Kalman Filter
Under suitable assumptions about the system, the Kalman filter provides a simple iterative solution to the continuous belief-update problem $$b^{\prime}(s^{\prime}) \propto O(o \mid a, s^{\prime}) \int T(s^{\prime} \mid s, a) b(s) \, {\rm d}s.$$
Specifically, when $T$ and $O$ are linear Gaussian and $b$ is Gaussian, $$\begin{align*} T(s^{\prime} \mid s, a) & = \mathcal{N}(s^{\prime} \mid T_{s} s + T_{a} a, \Sigma_{s}), \\ O(o \mid s^{\prime}) & = \mathcal{N}(o \mid O_{s} s^{\prime}, \Sigma_{o}), \\ b(s) & = \mathcal{N}(s \mid \mu_{b}, \Sigma_{b}),\end{align*}$$ the updated belief is again Gaussian.
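The update consists of a predict step followed by a measurement update with Kalman gain $\mathbf{K}$, which is exactly what the code below computes: $$\begin{align*} \mu_{p} & = T_{s}\mu_{b} + T_{a}a, & \Sigma_{p} & = T_{s}\Sigma_{b}T_{s}^{\rm T} + \Sigma_{s}, \\ \mathbf{K} & = \Sigma_{p}O_{s}^{\rm T}\left(O_{s}\Sigma_{p}O_{s}^{\rm T} + \Sigma_{o}\right)^{-1}, \\ \mu_{b^{\prime}} & = \mu_{p} + \mathbf{K}(o - O_{s}\mu_{p}), & \Sigma_{b^{\prime}} & = (I - \mathbf{K}O_{s})\Sigma_{p}. \end{align*}$$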
struct KalmanFilter
    μb # mean vector
    Σb # covariance matrix
end

function update(b::KalmanFilter, 𝒫, a, o)
    μb, Σb = b.μb, b.Σb
    Ts, Ta, Os = 𝒫.Ts, 𝒫.Ta, 𝒫.Os
    Σs, Σo = 𝒫.Σs, 𝒫.Σo
    # predict step
    μp = Ts*μb + Ta*a
    Σp = Ts*Σb*Ts' + Σs
    # update step
    Σpo = Σp*Os'
    K = Σpo/(Os*Σp*Os' + Σo)
    μb′ = μp + K*(o - Os*μp)
    Σb′ = (I - K*Os)*Σp
    return KalmanFilter(μb′, Σb′)
end
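A minimal usage sketch, assuming a hypothetical one-dimensional linear-Gaussian model packed into a NamedTuple with the field names the function expects (Ts, Ta, Os, Σs, Σo):

𝒫lin = (Ts = fill(1.0,1,1), Ta = fill(1.0,1,1), Os = fill(1.0,1,1),
        Σs = fill(0.1,1,1), Σo = fill(0.5,1,1))
b = KalmanFilter([0.0], fill(1.0,1,1))
b = update(b, 𝒫lin, [1.0], [1.2])   # take action a = 1.0, then observe o = 1.2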
1.2.1 Extended Kalman Filter
The extended Kalman filter (EKF) extends the Kalman filter to nonlinear dynamics with Gaussian noise, $$\begin{align*} T(s^{\prime} \mid s, a) & = \mathcal{N}(s^{\prime} \mid f_{T}(s, a), \Sigma_{s}), \\ O(o \mid s^{\prime}) & = \mathcal{N}(o \mid f_{O}(s^{\prime}), \Sigma_{o}), \\ b(s) & = \mathcal{N}(s \mid \mu_{b}, \Sigma_{b}),\end{align*}$$ by linearizing $f_{T}$ and $f_{O}$ about the mean using their Jacobians.
struct ExtendedKalmanFilter
    μb # mean vector
    Σb # covariance matrix
end

import ForwardDiff: jacobian

function update(b::ExtendedKalmanFilter, 𝒫, a, o)
    μb, Σb = b.μb, b.Σb
    fT, fO = 𝒫.fT, 𝒫.fO
    Σs, Σo = 𝒫.Σs, 𝒫.Σo
    # predict
    μp = fT(μb, a)
    Ts = jacobian(s->fT(s, a), μb)
    Os = jacobian(fO, μp)
    Σp = Ts*Σb*Ts' + Σs
    # update
    Σpo = Σp*Os'
    K = Σpo/(Os*Σp*Os' + Σo)
    μb′ = μp + K*(o - fO(μp))
    Σb′ = (I - K*Os)*Σp
    return ExtendedKalmanFilter(μb′, Σb′)
end
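A usage sketch with a hypothetical scalar nonlinear model (the functions and noise values are chosen purely for illustration), again packed into a NamedTuple with the expected field names:

𝒫ekf = (fT = (s, a) -> s .+ a .* cos.(s), fO = s -> s.^2,
        Σs = fill(0.1,1,1), Σo = fill(0.5,1,1))
b = ExtendedKalmanFilter([0.3], fill(1.0,1,1))
b = update(b, 𝒫ekf, [0.5], [0.2])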
1.2.2 Unscented Kalman Filter (UKF)
The UKF also addresses nonlinear problems with Gaussian noise.[1] The method is derivative-free and relies on a deterministic sampling strategy: a set of $2n+1$ sigma points is passed through the nonlinear transformation to approximate its effect on the distribution.
struct UnscentedKalmanFilter
    μb # mean vector
    Σb # covariance matrix
    λ  # spread parameter
end

function unscented_transform(μ, Σ, f, λ, ws)
    n = length(μ)
    Δ = cholesky((n + λ) * Σ).L
    # 2n + 1 sigma points: the mean plus symmetric perturbations along each column of Δ
    S = [μ]
    for i in 1:n
        push!(S, μ + Δ[:,i])
        push!(S, μ - Δ[:,i])
    end
    S′ = f.(S)
    μ′ = sum(w*s for (w,s) in zip(ws, S′))
    Σ′ = sum(w*(s - μ′)*(s - μ′)' for (w,s) in zip(ws, S′))
    return (μ′, Σ′, S, S′)
end

function update(b::UnscentedKalmanFilter, 𝒫, a, o)
    μb, Σb, λ = b.μb, b.Σb, b.λ
    fT, fO = 𝒫.fT, 𝒫.fO
    n = length(μb)
    ws = [λ / (n + λ); fill(1/(2(n + λ)), 2n)]
    # predict
    μp, Σp, Sp, Sp′ = unscented_transform(μb, Σb, s->fT(s,a), λ, ws)
    Σp += 𝒫.Σs
    # update
    μo, Σo, So, So′ = unscented_transform(μp, Σp, fO, λ, ws)
    Σo += 𝒫.Σo
    Σpo = sum(w*(s - μp)*(s′ - μo)' for (w,s,s′) in zip(ws, So, So′))
    K = Σpo / Σo
    μb′ = μp + K*(o - μo)
    Σb′ = Σp - K*Σo*K'
    return UnscentedKalmanFilter(μb′, Σb′, λ)
end
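The same hypothetical nonlinear model from the EKF sketch can be reused, here with spread parameter λ = 2:

b = UnscentedKalmanFilter([0.3], fill(1.0,1,1), 2.0)
b = update(b, 𝒫ekf, [0.5], [0.2])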
1.3 Particle Filter
Discrete problems with large state spaces, and continuous problems whose dynamics are not well approximated by the Kalman filter's linear-Gaussian assumptions, call for approximate representations and updates of the belief. A common approach is the particle filter, which represents the belief as a collection of sampled states; each sample maintained in the belief is called a particle. In the code below, $T(s, a)$ is treated as a generative model that returns a sampleable distribution over next states, and SetCategorical (not defined in this section) is a categorical distribution over a set of values with given weights.
struct ParticleFilter
    states # vector of state samples
end

function update(b::ParticleFilter, 𝒫, a, o)
    T, O = 𝒫.T, 𝒫.O
    # propagate each particle through the transition model, weight by the
    # observation likelihood, then resample with replacement
    states = [rand(T(s, a)) for s in b.states]
    weights = [O(a, s′, o) for s′ in states]
    D = SetCategorical(states, weights)
    return ParticleFilter(rand(D, length(states)))
end
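SetCategorical is not defined in this section; the following is a minimal stand-in built on Distributions.jl (its interface, construct from elements and weights and then sample one or many elements, is an assumption inferred from how it is used above):

using Distributions

struct SetCategorical{S}
    elements::Vector{S} # the set of values
    distr::Categorical  # categorical distribution over their indices
end

function SetCategorical(elements::AbstractVector{S}, weights::AbstractVector{Float64}) where S
    ℓ = sum(weights)
    p = ℓ ≈ 0.0 ? fill(1/length(elements), length(elements)) : weights ./ ℓ
    return SetCategorical(collect(elements), Categorical(p))
end

Base.rand(D::SetCategorical) = D.elements[rand(D.distr)]
Base.rand(D::SetCategorical, n::Int) = [rand(D) for _ in 1:n]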
For problems with discrete observations, rejection sampling can be layered on top of the particle filter: a propagated particle is kept only if an observation sampled from $O(a, s^{\prime})$ matches the actual observation.
struct RejectionParticleFilter
    states # vector of state samples
end

function update(b::RejectionParticleFilter, 𝒫, a, o)
    T, O = 𝒫.T, 𝒫.O
    states = similar(b.states)
    i = 1
    while i ≤ length(states)
        s = rand(b.states)     # draw a particle
        s′ = rand(T(s,a))      # propagate it
        if rand(O(a,s′)) == o  # keep it only if the simulated observation matches
            states[i] = s′
            i += 1
        end
    end
    return RejectionParticleFilter(states)
end
1.3.1 Particle Injection
For robot localization problems, it is common practice to inject particles distributed uniformly over all possible robot poses and weight them by the current observation. Injecting a fixed number of particles drawn from a broader distribution helps the filter recover when the particle set has drifted away from the true state.
struct InjectionParticleFilter
    states   # vector of state samples
    m_inject # number of samples to inject
    D_inject # injection distribution
end

function update(b::InjectionParticleFilter, 𝒫, a, o)
    T, O, m_inject, D_inject = 𝒫.T, 𝒫.O, b.m_inject, b.D_inject
    states = [rand(T(s, a)) for s in b.states]
    weights = [O(a, s′, o) for s′ in states]
    D = SetCategorical(states, weights)
    m = length(states)
    states = vcat(rand(D, m - m_inject), rand(D_inject, m_inject))
    return InjectionParticleFilter(states, m_inject, D_inject)
end
The number of injected particles can also be chosen adaptively. The filter below tracks fast and slow exponential moving averages of the mean particle weight and injects more particles when the fast average falls well below the slow one, a sign that recent observations fit the particle set poorly.
using Statistics  # for mean

mutable struct AdaptiveInjectionParticleFilter
    states   # vector of state samples
    w_slow   # slow moving average
    w_fast   # fast moving average
    α_slow   # slow moving average parameter
    α_fast   # fast moving average parameter
    ν        # injection parameter
    D_inject # injection distribution
end

function update(b::AdaptiveInjectionParticleFilter, 𝒫, a, o)
    T, O = 𝒫.T, 𝒫.O
    w_slow, w_fast, α_slow, α_fast, ν, D_inject =
        b.w_slow, b.w_fast, b.α_slow, b.α_fast, b.ν, b.D_inject
    states = [rand(T(s, a)) for s in b.states]
    weights = [O(a, s′, o) for s′ in states]
    w_mean = mean(weights)
    w_slow += α_slow*(w_mean - w_slow)
    w_fast += α_fast*(w_mean - w_fast)
    m = length(states)
    m_inject = round(Int, m * max(0, 1.0 - ν*w_fast / w_slow))
    D = SetCategorical(states, weights)
    states = vcat(rand(D, m - m_inject), rand(D_inject, m_inject))
    b.w_slow, b.w_fast = w_slow, w_fast
    return AdaptiveInjectionParticleFilter(states,
        w_slow, w_fast, α_slow, α_fast, ν, D_inject)
end
2. Exact Belief-State Planning
Any POMDP can be viewed as an MDP whose states are beliefs, called a belief-state MDP. Its reward function depends on the belief and the action taken and is simply the expected reward, $$R(b, a) = \sum_{s} R(s, a) b(s).$$ Its transition function is $$\begin{align*} T(b^{\prime} \mid b, a) & = \mathbb{P}(b^{\prime} \mid b, a) \\ & = \sum_{o}\sum_{s^{\prime}}\sum_{s} \mathbb{P}(b^{\prime} \mid b, a, o)\, O(o \mid a, s^{\prime})\, T(s^{\prime} \mid s, a)\, b(s),\end{align*}$$ where $\mathbb{P}(b^{\prime} \mid b, a, o)$ equals $1$ if $b^{\prime}$ is the belief obtained by updating $b$ with $a$ and $o$, and $0$ otherwise. Because the resulting state space (the space of beliefs) is continuous, solving a belief-state MDP exactly is challenging.
2.1 Conditional Plans
There are many ways to represent a POMDP policy. One of them is the conditional plan, represented as a tree: the root specifies the action to take, and each observation leads to a subplan specifying subsequent behavior.
struct ConditionalPlan
    a        # action to take at root
    subplans # dictionary mapping observations to subplans
end

ConditionalPlan(a) = ConditionalPlan(a, Dict())

(π::ConditionalPlan)() = π.a
(π::ConditionalPlan)(o) = π.subplans[o]

# compute expected utility
function lookahead(𝒫::POMDP, U, s, a)
    𝒮, 𝒪, T, O, R, γ = 𝒫.𝒮, 𝒫.𝒪, 𝒫.T, 𝒫.O, 𝒫.R, 𝒫.γ
    u′ = sum(T(s,a,s′)*sum(O(a,s′,o)*U(o,s′) for o in 𝒪) for s′ in 𝒮)
    return R(s,a) + γ*u′
end

function evaluate_plan(𝒫::POMDP, π::ConditionalPlan, s)
    U(o,s′) = evaluate_plan(𝒫, π(o), s′)
    return isempty(π.subplans) ? 𝒫.R(s,π()) : lookahead(𝒫, U, s, π())
end
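As a brief sketch, a hypothetical two-step plan for the toy problem defined earlier: ignore first, then feed only after hearing a cry.

plan = ConditionalPlan(:ignore, Dict(:cry   => ConditionalPlan(:feed),
                                     :quiet => ConditionalPlan(:ignore)))
evaluate_plan(𝒫, plan, :hungry)   # expected two-step utility starting from :hungry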
2.2 Belief Utility
The utility of a belief $b$ under a policy $\pi$ can be computed from the state utilities: $$\mathcal{U}^{\pi}(b) = \sum_{s} b(s) \mathcal{U}^{\pi}(s).$$
2.2.1 Alpha Vectors
This expression can be rewritten as $$\mathcal{U}^{\pi}(b) = \bm{\alpha}_{\pi}^{\rm T} \bm{b},$$ where $\bm{\alpha}_{\pi}$ is called an alpha vector; its components are the expected utilities of following $\pi$ from each state.
function alphavector(𝒫::POMDP, π::ConditionalPlan)
    return [evaluate_plan(𝒫, π, s) for s in 𝒫.𝒮]
end
2.2.2 Alpha Vector Policy
Based on the alpha vector representation, a policy can be defined as follows: given a belief $b$, choose the action associated with the alpha vector that maximizes $\bm{\alpha} \cdot \bm{b}$.
struct AlphaVectorPolicy
    𝒫 # POMDP problem
    Γ # alpha vectors
    a # actions associated with alpha vectors
end

function utility(π::AlphaVectorPolicy, b)
    return maximum(α⋅b for α in π.Γ)
end

function (π::AlphaVectorPolicy)(b)
    i = argmax([α⋅b for α in π.Γ])
    return π.a[i]
end
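A small usage sketch on the toy problem; the alpha vectors here are made-up numbers rather than values computed from the model:

Γ = [[-5.0, -5.0], [-12.0, 0.0]]   # hypothetical utilities of two plans, indexed by state
policy = AlphaVectorPolicy(𝒫, Γ, [:feed, :ignore])
utility(policy, [0.8, 0.2]), policy([0.8, 0.2])   # -5.0 beats -9.6, so :feed is chosen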
Alpha vectors can also be combined with one-step lookahead, which scores each action by its immediate expected reward plus the discounted utility of the updated belief.
function lookahead(𝒫::POMDP, U, b::Vector, a)
    𝒮, 𝒪, T, O, R, γ = 𝒫.𝒮, 𝒫.𝒪, 𝒫.T, 𝒫.O, 𝒫.R, 𝒫.γ
    r = sum(R(s,a)*b[i] for (i,s) in enumerate(𝒮))
    Posa(o,s,a) = sum(O(a,s′,o)*T(s,a,s′) for s′ in 𝒮)
    Poba(o,b,a) = sum(b[i]*Posa(o,s,a) for (i,s) in enumerate(𝒮))
    return r + γ*sum(Poba(o,b,a)*U(update(b, 𝒫, a, o)) for o in 𝒪)
end

function greedy(𝒫::POMDP, U, b::Vector)
    u, i = findmax([lookahead(𝒫, U, b, a) for a in 𝒫.𝒜])
    return (a=𝒫.𝒜[i], u=u)
end

struct LookaheadAlphaVectorPolicy
    𝒫 # POMDP problem
    Γ # alpha vectors
end

function utility(π::LookaheadAlphaVectorPolicy, b)
    return maximum(α⋅b for α in π.Γ)
end

function greedy(π, b)
    U(b) = utility(π, b)
    return greedy(π.𝒫, U, b)
end

(π::LookaheadAlphaVectorPolicy)(b) = greedy(π, b).a
2.2.3 Pruning
When there are many alpha vectors, pruning can improve efficiency: an alpha vector (and its associated plan) can be discarded if there is no belief at which it attains the maximum, a condition that can be checked with a linear program.
using JuMP, GLPK  # linear programming for the domination check

function find_maximal_belief(α, Γ)
    m = length(α)
    if isempty(Γ)
        return fill(1/m, m) # arbitrary belief
    end
    model = Model(GLPK.Optimizer)
    @variable(model, δ)
    @variable(model, b[i=1:m] ≥ 0)
    @constraint(model, sum(b) == 1.0)
    for a in Γ
        @constraint(model, (α-a)⋅b ≥ δ)
    end
    @objective(model, Max, δ)
    optimize!(model)
    return value(δ) > 0 ? value.(b) : nothing
end

function find_dominating(Γ)
    n = length(Γ)
    candidates, dominating = trues(n), falses(n)
    while any(candidates)
        i = findfirst(candidates)
        b = find_maximal_belief(Γ[i], Γ[dominating])
        if b === nothing
            candidates[i] = false
        else
            k = argmax([candidates[j] ? b⋅Γ[j] : -Inf for j in 1:n])
            candidates[k], dominating[k] = false, true
        end
    end
    return dominating
end

function prune(plans, Γ)
    d = find_dominating(Γ)
    return (plans[d], Γ[d])
end
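A quick sketch with made-up alpha vectors for a two-state problem; the middle vector is never maximal at any belief, so it is pruned along with its placeholder plan:

Γ = [[1.0, 0.0], [0.4, 0.4], [0.0, 1.0]]
plans = ["plan1", "plan2", "plan3"]   # placeholders standing in for conditional plans
prune(plans, Γ)                       # keeps only the first and third entries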
2.3 Value Iteration
Value iteration for MDPs can be adapted to POMDPs: starting from one-step conditional plans, each iteration expands the plans by one step, computes their alpha vectors, and prunes the dominated ones.
# This version of value iteration is expressed in terms of
# conditional plans and alpha vectors.
import Base.Iterators: product

# one-step
function value_iteration(𝒫::POMDP, k_max)
    𝒮, 𝒜, R = 𝒫.𝒮, 𝒫.𝒜, 𝒫.R
    plans = [ConditionalPlan(a) for a in 𝒜]
    Γ = [[R(s,a) for s in 𝒮] for a in 𝒜]
    plans, Γ = prune(plans, Γ)
    for k in 2:k_max
        plans, Γ = expand(plans, Γ, 𝒫)
        plans, Γ = prune(plans, Γ)
    end
    return (plans, Γ)
end

# ValueIteration is assumed to be defined elsewhere (a planner struct with field k_max)
function solve(M::ValueIteration, 𝒫::POMDP)
    plans, Γ = value_iteration(𝒫, M.k_max)
    return LookaheadAlphaVectorPolicy(𝒫, Γ)
end

# expansion-step
function ConditionalPlan(𝒫::POMDP, a, plans)
    subplans = Dict(o=>π for (o, π) in zip(𝒫.𝒪, plans))
    return ConditionalPlan(a, subplans)
end

function combine_lookahead(𝒫::POMDP, s, a, Γo)
    𝒮, 𝒪, T, O, R, γ = 𝒫.𝒮, 𝒫.𝒪, 𝒫.T, 𝒫.O, 𝒫.R, 𝒫.γ
    U′(s′,i) = sum(O(a,s′,o)*α[i] for (o,α) in zip(𝒪,Γo))
    return R(s,a) + γ*sum(T(s,a,s′)*U′(s′,i) for (i,s′) in enumerate(𝒮))
end

function combine_alphavector(𝒫::POMDP, a, Γo)
    return [combine_lookahead(𝒫, s, a, Γo) for s in 𝒫.𝒮]
end

function expand(plans, Γ, 𝒫)
    𝒮, 𝒜, 𝒪, T, O, R = 𝒫.𝒮, 𝒫.𝒜, 𝒫.𝒪, 𝒫.T, 𝒫.O, 𝒫.R
    plans′, Γ′ = [], []
    for a in 𝒜
        # iterate over all possible mappings from observations to plans
        for inds in product([eachindex(plans) for o in 𝒪]...)
            πo = plans[[inds...]]
            Γo = Γ[[inds...]]
            π = ConditionalPlan(𝒫, a, πo)
            α = combine_alphavector(𝒫, a, Γo)
            push!(plans′, π)
            push!(Γ′, α)
        end
    end
    return (plans′, Γ′)
end
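To tie things together, a short usage sketch on the hypothetical toy problem from the beginning of this section, calling value_iteration directly and wrapping the resulting alpha vectors in a lookahead policy:

plans, Γ = value_iteration(𝒫, 2)       # two-step conditional plans and their alpha vectors
policy = LookaheadAlphaVectorPolicy(𝒫, Γ)
policy([0.5, 0.5])                      # action chosen by one-step lookahead at a uniform belief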
- [1] S. J. Julier and J. K. Uhlmann, “Unscented filtering and nonlinear estimation,” Proceedings of the IEEE, vol. 92, no. 3, pp. 401–422, 2004.