四、状态不确定性(2)
即使在有限的离散状态、动作与观测空间下,POMDP 问题的精确解也常常难以计算,因此考虑求近似解的方法是合理的。本部分按照近似方法的常规思路,先讨论离线近似解,再讨论在线近似解。
3. 离线置信状态规划
3.1 QMDP
一种简单的离线逼近技术是 QMDP:它假设当前一步之后状态完全可观测,相当于用对应全观测 MDP 的动作值函数为每个动作构造一个 α 向量,因而给出最优值函数的一个上界。其核心迭代为:$$\alpha_{a}^{(k+1)}(s) = R(s,a) + \gamma \sum_{s^{\prime}}T(s^{\prime} \mid s,a) \max_{a^{\prime}} \alpha_{a^{\prime}}^{(k)}(s^{\prime}). $$
struct QMDP
k_max # maximum number of iterations
end
function update(𝒫::POMDP, M::QMDP, Γ)
𝒮, 𝒜, R, T, γ = 𝒫.𝒮, 𝒫.𝒜, 𝒫.R, 𝒫.T, 𝒫.γ
Γ′ = [[R(s,a) + γ*sum(T(s,a,s′)*maximum(α′[j] for α′ in Γ)
for (j,s′) in enumerate(𝒮)) for s in 𝒮] for a in 𝒜]
return Γ′
end
function solve(M::QMDP, 𝒫::POMDP)
Γ = [zeros(length(𝒫.𝒮)) for a in 𝒫.𝒜]
Γ = alphavector_iteration(𝒫, M, Γ)
return AlphaVectorPolicy(𝒫, Γ, 𝒫.𝒜)
end
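一个最小的用法示意(假设 𝒫 为前文已构造好的 POMDP 实例,alphavector_iteration、AlphaVectorPolicy 等辅助定义可用;策略对象以字段 Γ 保存 α 向量属于这里的假设):由于 QMDP 假设一步之后完全可观测,$\max_a \alpha_a \cdot b$ 给出最优值函数在置信 $b$ 处的一个上界。

using LinearAlgebra                        # 提供点积 ⋅

π = solve(QMDP(100), 𝒫)                    # 100 次迭代,得到每个动作一个 α 向量
b = fill(1/length(𝒫.𝒮), length(𝒫.𝒮))       # 均匀置信
U_qmdp = maximum(α⋅b for α in π.Γ)         # 最优值函数在 b 处的上界(字段 Γ 为假设)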
3.1.1 Fast Informed Bound
Fast Informed Bound(FIB)在迭代中进一步利用了观测模型,因此给出的上界不松于 QMDP。其核心迭代为:$$\alpha_{a}^{(k+1)}(s) = R(s,a) + \gamma \sum_{o} \max_{a^{\prime}} \sum_{s^{\prime}}O(o \mid a, s^{\prime}) T(s^{\prime} \mid s,a)\alpha_{a^{\prime}}^{(k)}(s^{\prime}). $$
struct FastInformedBound
k_max # maximum number of iterations
end
function update(𝒫::POMDP, M::FastInformedBound, Γ)
𝒮, 𝒜, 𝒪, R, T, O, γ = 𝒫.𝒮, 𝒫.𝒜, 𝒫.𝒪, 𝒫.R, 𝒫.T, 𝒫.O, 𝒫.γ
Γ′ = [[R(s, a) + γ*sum(maximum(sum(O(a,s′,o)*T(s,a,s′)*α′[j]
for (j,s′) in enumerate(𝒮)) for α′ in Γ) for o in 𝒪) for s in 𝒮] for a in 𝒜]
return Γ′
end
function solve(M::FastInformedBound, 𝒫::POMDP)
Γ = [zeros(length(𝒫.𝒮)) for a in 𝒫.𝒜]
Γ = alphavector_iteration(𝒫, M, Γ)
return AlphaVectorPolicy(𝒫, Γ, 𝒫.𝒜)
end
3.1.2 Fast Lower Bounds
一个常见的快速下界是 best-action worst-state(BAWS)下界:对每个动作取最差状态下的即时奖励,再在动作间取最大值,并按折扣几何级数求和得到一个常数 α 向量。
function baws_lowerbound(𝒫::POMDP)
    𝒮, 𝒜, R, γ = 𝒫.𝒮, 𝒫.𝒜, 𝒫.R, 𝒫.γ
    r = maximum(minimum(R(s, a) for s in 𝒮) for a in 𝒜) / (1-γ)
    α = fill(r, length(𝒮))
    return α
end
另一个是 blind lower bound:它假设智能体盲目地始终执行同一个动作,迭代形式与 QMDP 类似,但去掉了对动作的最大化,因此得到的是下界。
function blind_lowerbound(𝒫, k_max)
    𝒮, 𝒜, T, R, γ = 𝒫.𝒮, 𝒫.𝒜, 𝒫.T, 𝒫.R, 𝒫.γ
    Q(s,a,α) = R(s,a) + γ*sum(T(s,a,s′)*α[j] for (j,s′) in enumerate(𝒮))
    Γ = [baws_lowerbound(𝒫) for a in 𝒜]
    for k in 1:k_max
        Γ = [[Q(s,a,α) for s in 𝒮] for (α,a) in zip(Γ, 𝒜)]
    end
    return Γ
end
3.2 Point-Based Value Iteration(PBVI)
PBVI 只在一组有限的置信点 $B$ 上维护 α 向量:每次迭代对 $B$ 中的每个置信点执行一次 backup,利用一步 lookahead 和当前的 α 向量集合构造新的 α 向量,从而在可控的计算量下逼近值函数的下界。
struct PointBasedValueIteration
B # set of belief points
k_max # maximum number of iterations
end
function backup(𝒫::POMDP, Γ, b)
𝒮, 𝒜, 𝒪, γ = 𝒫.𝒮, 𝒫.𝒜, 𝒫.𝒪, 𝒫.γ
R, T, O = 𝒫.R, 𝒫.T, 𝒫.O
Γa = []
for a in 𝒜
Γao = []
for o in 𝒪
b′ = update(b, 𝒫, a, o)
push!(Γao, argmax(α->α⋅b′, Γ))
end
α = [R(s, a) + γ*sum(sum(T(s, a, s′)*O(a, s′, o)*Γao[i][j]
for (j,s′) in enumerate(𝒮)) for (i,o) in enumerate(𝒪)) for s in 𝒮]
push!(Γa, α)
end
return argmax(α->α⋅b, Γa)
end
function update(𝒫::POMDP, M::PointBasedValueIteration, Γ)
return [backup(𝒫, Γ, b) for b in M.B]
end
function solve(M::PointBasedValueIteration, 𝒫)
Γ = fill(baws_lowerbound(𝒫), length(𝒫.𝒜))
Γ = alphavector_iteration(𝒫, M, Γ)
return LookaheadAlphaVectorPolicy(𝒫, Γ)
end
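一个简单的用法示意(假设 𝒫 为前文的 POMDP 实例,迭代次数 30 为任取;更系统的置信点选择方法见 3.3.1 节):这里置信点集简单取各角点置信加上均匀置信。

n = length(𝒫.𝒮)
corners = [[i == j ? 1.0 : 0.0 for j in 1:n] for i in 1:n]   # 各确定性(角点)置信
B = vcat(corners, [fill(1/n, n)])                            # 再加一个均匀置信
π = solve(PointBasedValueIteration(B, 30), 𝒫)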
3.2.1 Randomized PBVI
struct RandomizedPointBasedValueIteration
B # set of belief points
k_max # maximum number of iterations
end
function update(𝒫::POMDP, M::RandomizedPointBasedValueIteration, Γ)
Γ′, B′ = [], copy(M.B)
while !isempty(B′)
b = rand(B′)
α = argmax(α->α⋅b, Γ)
α′ = backup(𝒫, Γ, b)
if α′⋅b ≥ α⋅b
push!(Γ′, α′)
else
push!(Γ′, α)
end
filter!(b->maximum(α⋅b for α in Γ′) <
maximum(α⋅b for α in Γ), B′)
end
return Γ′
end
function solve(M::RandomizedPointBasedValueIteration, 𝒫)
Γ = [baws_lowerbound(𝒫)]
Γ = alphavector_iteration(𝒫, M, Γ)
return LookaheadAlphaVectorPolicy(𝒫, Γ)
end
3.3 Sawtooth Upper Bound
该方法是用一组“基”来表示值函数的上界,具体来说,基是一组 belief-utility 对:其中必须包含各个角点置信(单位基向量)及其效用,任意置信处的效用由 sawtooth 形式的插值给出,内点对只会把仅用角点线性插值得到的上界进一步收紧(代码后附一个两状态的数值例子)。
struct SawtoothPolicy
𝒫 # POMDP problem
V # dictionary mapping beliefs to utilities
end
function basis(𝒫)
n = length(𝒫.𝒮)
e(i) = [j == i ? 1.0 : 0.0 for j in 1:n]
return [e(i) for i in 1:n]
end
function utility(π::SawtoothPolicy, b)
𝒫, V = π.𝒫, π.V
if haskey(V, b)
return V[b]
end
n = length(𝒫.𝒮)
E = basis(𝒫)
u = sum(V[E[i]] * b[i] for i in 1:n)
for (b′, u′) in V
if b′ ∉ E
i = argmax([norm(b-e, 1) - norm(b′-e, 1) for e in E])
w = [norm(b - e, 1) for e in E]
w[i] = norm(b - b′, 1)
w /= sum(w)
w = [1 - wi for wi in w]
α = [V[e] for e in E]
α[i] = u′
u = min(u, w⋅α)
end
end
return u
end
(π::SawtoothPolicy)(b) = greedy(π, b).a   # greedy(π, b) 返回置信 b 处的贪心动作与效用 (a, u)
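下面用一个两状态的小例子逐步演示 utility 中的 sawtooth 插值如何利用内点的 belief-utility 对收紧上界(数值均为假设,仅为说明计算步骤):

using LinearAlgebra                                # 提供 norm 与 ⋅

E = [[1.0, 0.0], [0.0, 1.0]]                       # 角点置信
V = Dict(E[1] => 10.0, E[2] => 20.0, [0.5, 0.5] => 12.0)   # 角点效用及一个内点对
b = [0.25, 0.75]

u = sum(V[E[i]]*b[i] for i in 1:2)                 # 仅用角点的线性插值:17.5
b′, u′ = [0.5, 0.5], V[[0.5, 0.5]]
i = argmax([norm(b - e, 1) - norm(b′ - e, 1) for e in E])
w = [norm(b - e, 1) for e in E];  w[i] = norm(b - b′, 1)
w /= sum(w);  w = [1 - wi for wi in w]
α = [V[e] for e in E];  α[i] = u′
u = min(u, w⋅α)                                    # 内点把上界收紧到 16.0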
3.3.1 点选择
无论是 PBVI 还是基于 sawtooth 的迭代算法,都需要一组置信点 $B$。一种做法是从现有置信出发随机选动作、向前模拟一步来扩充 $B$(random_belief_expansion);另一种是探索式扩充(exploratory_belief_expansion),对每个置信尝试各个动作,保留与现有点集 L1 距离最大的新置信,从而更均匀地覆盖可达的置信空间。
function randstep(𝒫::POMDP, b, a)
s = rand(SetCategorical(𝒫.𝒮, b))   # SetCategorical:按权重 b 在 𝒮 上采样的分类分布(辅助类型)
s′, r, o = 𝒫.TRO(s, a)
b′ = update(b, 𝒫, a, o)
return b′, r
end
function random_belief_expansion(𝒫, B)
B′ = copy(B)
for b in B
a = rand(𝒫.𝒜)
b′, r = randstep(𝒫, b, a)
push!(B′, b′)
end
return unique!(B′)
end
function exploratory_belief_expansion(𝒫, B)
B′ = copy(B)
for b in B
best = (b=copy(b), d=0.0)
for a in 𝒫.𝒜
b′, r = randstep(𝒫, b, a)
d = minimum(norm(b - b′, 1) for b in B′)
if d > best.d
best = (b=b′, d=d)
end
end
push!(B′, best.b)
end
return unique!(B′)
end
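一个把点扩充与 PBVI 衔接起来的用法示意(假设 𝒫 为前文的 POMDP 实例;expand_beliefs 是这里为演示临时写的辅助函数):

function expand_beliefs(𝒫, B, k)                # 连续做 k 轮探索式扩充
    for i in 1:k
        B = exploratory_belief_expansion(𝒫, B)
    end
    return B
end

n = length(𝒫.𝒮)
B = expand_beliefs(𝒫, [fill(1/n, n)], 4)        # 从均匀置信出发扩充置信点集
π = solve(PointBasedValueIteration(B, 30), 𝒫)   # 再交给 PBVI 求解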
3.3.2 Sawtooth Heuristic Search
Sawtooth heuristic search 同时维护值函数的上界(sawtooth 表示,由 fast informed bound 初始化)和下界(α 向量,由 BAWS 下界初始化),从初始置信出发按上界策略选动作、沿上下界差距最大的观测向下探索,直到达到最大深度或差距小于阈值 δ。
struct SawtoothHeuristicSearch
b # initial belief
δ # gap threshold
d # depth
k_max # maximum number of iterations
k_fib # number of iterations for fast informed bound
end
function explore!(M::SawtoothHeuristicSearch, 𝒫, πhi, πlo, b, d=0)
𝒮, 𝒜, 𝒪, γ = 𝒫.𝒮, 𝒫.𝒜, 𝒫.𝒪, 𝒫.γ
ϵ(b′) = utility(πhi, b′) - utility(πlo, b′)
if d ≥ M.d || ϵ(b) ≤ M.δ / γ^d
return
end
a = πhi(b)
o = argmax(o -> ϵ(update(b, 𝒫, a, o)), 𝒪)
b′ = update(b, 𝒫, a, o)
explore!(M, 𝒫, πhi, πlo, b′, d+1)
if b′ ∉ basis(𝒫)
πhi.V[b′] = greedy(πhi, b′).u
end
push!(πlo.Γ, backup(𝒫, πlo.Γ, b′))
end
function solve(M::SawtoothHeuristicSearch, 𝒫::POMDP)
πfib = solve(FastInformedBound(M.k_fib), 𝒫)
Vhi = Dict(e => utility(πfib, e) for e in basis(𝒫))
πhi = SawtoothPolicy(𝒫, Vhi)
πlo = LookaheadAlphaVectorPolicy(𝒫, [baws_lowerbound(𝒫)])
for i in 1:M.k_max
explore!(M, 𝒫, πhi, πlo, M.b)
if utility(πhi, M.b) - utility(πlo, M.b) < M.δ
break
end
end
return πlo
end
4. 在线置信状态规划
在线方法在执行过程中、从当前置信出发进行规划,其思想来源于树搜索;虽然每一步决策都需要大量计算,但对高维问题往往更加可行。
- 一个简单的在线策略是执行一步 lookahead:考虑从当前置信出发的每个动作,并用近似值函数估计其期望效用(见列表后的示意代码)。
- 前向搜索可以得到更好的策略,但其计算量随规划深度(horizon)的增加而显著增长。
- 分支定界是前向搜索的一种更高效的变体,利用值函数的上下界进行剪枝,从而避免搜索某些分支。
- 稀疏采样是一种近似方法,通过只采样部分观测,减少在全部可能观测空间上展开的计算负担。
- 蒙特卡罗树搜索通过对历史(动作-观测序列)而非状态进行操作来推广到 POMDP。
- 确定化稀疏树搜索使用单粒子置信,使观测变为确定的,从而大幅缩小搜索树。
- 启发式搜索有选择地扩展动作-观测对,优先探索其维护的值函数上下界之间差距较大的区域。
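承接上面第一条,一步 lookahead 的最小示意如下(假设 update(b, 𝒫, a, o) 为前文的置信更新函数;U 为任意近似效用函数,例如用 QMDP/FIB 得到的 α 向量集合 Γ 取 U(b) = maximum(α⋅b for α in Γ);greedy_action 为这里临时取的名字):

function lookahead(𝒫::POMDP, U, b::Vector, a)
    𝒮, 𝒪, T, O, R, γ = 𝒫.𝒮, 𝒫.𝒪, 𝒫.T, 𝒫.O, 𝒫.R, 𝒫.γ
    r = sum(R(s,a)*b[i] for (i,s) in enumerate(𝒮))                  # 期望即时奖励
    Posa(o,s,a) = sum(O(a,s′,o)*T(s,a,s′) for s′ in 𝒮)              # P(o ∣ s, a)
    Poba(o,b,a) = sum(b[i]*Posa(o,s,a) for (i,s) in enumerate(𝒮))   # P(o ∣ b, a)
    return r + γ*sum(Poba(o,b,a)*U(update(b,𝒫,a,o)) for o in 𝒪)
end

greedy_action(𝒫::POMDP, U, b) = argmax(a -> lookahead(𝒫, U, b, a), 𝒫.𝒜)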
5. Controller Abstractions
POMDP 策略的控制器(controller)表示允许策略维护自身的内部状态。与前面基于置信的方法相比,这类表示可以提升可扩展性。
5.1 控制器
控制器是一种维护自身内部状态的策略表示,具体为由有限节点集合 $X$ 构成的图:在节点 $x$ 处按分布 $\psi(a \mid x)$ 选择动作,收到观测 $o$ 后按分布 $\eta(x^{\prime} \mid x, a, o)$ 转移到后继节点。
mutable struct ControllerPolicy
𝒫 # problem
X # set of controller nodes
ψ # action selection distribution
η # successor selection distribution
end
function (π::ControllerPolicy)(x)
𝒜, ψ = π.𝒫.𝒜, π.ψ
dist = [ψ[x, a] for a in 𝒜]
return rand(SetCategorical(𝒜, dist))
end
function update(π::ControllerPolicy, x, a, o)
X, η = π.X, π.η
dist = [η[x, a, o, x′] for x′ in X]
return rand(SetCategorical(X, dist))
end
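一个简单的仿真示意(假设 𝒫.TRO(s, a) 与 3.3.1 节中一样返回 (s′, r, o);simulate_controller 为这里临时取的名字),可以看出控制器只依赖内部节点,而不需要在执行时维护置信:

function simulate_controller(π::ControllerPolicy, s, x, d)
    G, γ = 0.0, π.𝒫.γ
    for t in 0:d-1
        a = π(x)                    # 按 ψ 在当前节点 x 采样动作
        s, r, o = π.𝒫.TRO(s, a)     # 环境转移,返回新状态、奖励与观测
        x = update(π, x, a, o)      # 按 η 采样后继节点
        G += γ^t * r
    end
    return G                        # 深度 d 内的折扣回报
end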
通过构造状态空间为 $X\times S$ 的 MDP,可以计算遵循控制器策略的效用。当节点 $x$ 处于活动状态、环境状态为 $s$ 时,效用满足$$U(x,s) = \sum_{a}\psi(a\mid x)\Big[R(s,a) + \gamma \sum_{s^{\prime}}T(s^{\prime}\mid s,a)\sum_{o}O(o\mid a,s^{\prime})\sum_{x^{\prime}}\eta(x^{\prime}\mid x,a,o)\,U(x^{\prime},s^{\prime})\Big],$$对应的迭代策略评估实现如下:
function utility(π::ControllerPolicy, U, x, s)
𝒮, 𝒜, 𝒪 = π.𝒫.𝒮, π.𝒫.𝒜, π.𝒫.𝒪
T, O, R, γ = π.𝒫.T, π.𝒫.O, π.𝒫.R, π.𝒫.γ
X, ψ, η = π.X, π.ψ, π.η
U′(a,s′,o) = sum(η[x,a,o,x′]*U[x′,s′] for x′ in X)
U′(a,s′) = T(s,a,s′)*sum(O(a,s′,o)*U′(a,s′,o) for o in 𝒪)
U′(a) = R(s,a) + γ*sum(U′(a,s′) for s′ in 𝒮)
return sum(ψ[x,a]*U′(a) for a in 𝒜)
end
function iterative_policy_evaluation(π::ControllerPolicy, k_max)
𝒮, X = π.𝒫.𝒮, π.X
U = Dict((x, s) => 0.0 for x in X, s in 𝒮)
for k in 1:k_max
U = Dict((x, s) => utility(π, U, x, s) for x in X, s in 𝒮)
end
return U
end
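得到 $X\times S$ 上的效用 U 之后,控制器从节点 x1、初始置信 b 出发的价值就是按 b 加权的期望(controller_value 为这里临时取的名字):

function controller_value(π::ControllerPolicy, U, x1, b)
    return sum(b[i]*U[x1, s] for (i, s) in enumerate(π.𝒫.𝒮))
end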
5.2 实现案例
5.2.1 策略迭代
策略迭代从任意初始控制器出发,在策略评估与策略改进之间交替进行:改进步骤为每一种“动作 + 各观测对应的后继节点”组合新增一个节点,并将其动作选择与后继选择设为确定性的,之后再对节点集合进行剪枝。
struct ControllerPolicyIteration
k_max # number of iterations
eval_max # number of evaluation iterations
end
function solve(M::ControllerPolicyIteration, 𝒫::POMDP)
𝒜, 𝒪, k_max, eval_max = 𝒫.𝒜, 𝒫.𝒪, M.k_max, M.eval_max
X = [1]
ψ = Dict((x, a) => 1.0 / length(𝒜) for x in X, a in 𝒜)
η = Dict((x, a, o, x′) => 1.0 for x in X, a in 𝒜, o in 𝒪, x′ in X)
π = ControllerPolicy(𝒫, X, ψ, η)
for i in 1:k_max
prevX = copy(π.X)
U = iterative_policy_evaluation(π, eval_max)
policy_improvement!(π, U, prevX)
prune!(π, U, prevX)   # 剪除冗余节点(prune! 的实现此处未给出)
end
return π
end
function policy_improvement!(π::ControllerPolicy, U, prevX)
𝒮, 𝒜, 𝒪 = π.𝒫.𝒮, π.𝒫.𝒜, π.𝒫.𝒪
X, ψ, η = π.X, π.ψ, π.η
repeatX𝒪 = fill(X, length(𝒪))
assign𝒜X′ = vec(collect(product(𝒜, repeatX𝒪...)))   # product 来自 Base.Iterators
for ax′ in assign𝒜X′
x, a = maximum(X) + 1, ax′[1]
push!(X, x)
successor(o) = ax′[findfirst(isequal(o), 𝒪) + 1]
U′(o,s′) = U[successor(o), s′]
for s in 𝒮
U[x, s] = lookahead(π.𝒫, U′, s, a)
end
for a′ in 𝒜
ψ[x, a′] = a′ == a ? 1.0 : 0.0
for (o, x′) in product(𝒪, prevX)
η[x, a′, o, x′] = x′ == successor(o) ? 1.0 : 0.0
end
end
end
for (x, a, o, x′) in product(X, 𝒜, 𝒪, X)
if !haskey(η, (x, a, o, x′))
η[x, a, o, x′] = 0.0
end
end
end
5.2.2 非线性规划
整个控制器优化问题也可以表述为一个单一的大型非线性规划(NLP):在固定节点数的前提下,同时对效用 U、动作选择分布 ψ 和后继选择分布 η 进行优化,目标是最大化初始置信下的期望效用。
using JuMP, Ipopt, LinearAlgebra   # JuMP 建模、Ipopt 非线性求解器、点积 ⋅

struct NonlinearProgramming
b # initial belief
ℓ # number of nodes
end
function tensorform(𝒫::POMDP)
𝒮, 𝒜, 𝒪, R, T, O = 𝒫.𝒮, 𝒫.𝒜, 𝒫.𝒪, 𝒫.R, 𝒫.T, 𝒫.O
𝒮′ = eachindex(𝒮)
𝒜′ = eachindex(𝒜)
𝒪′ = eachindex(𝒪)
R′ = [R(s,a) for s in 𝒮, a in 𝒜]
T′ = [T(s,a,s′) for s in 𝒮, a in 𝒜, s′ in 𝒮]
O′ = [O(a,s′,o) for a in 𝒜, s′ in 𝒮, o in 𝒪]
return 𝒮′, 𝒜′, 𝒪′, R′, T′, O′
end
function solve(M::NonlinearProgramming, 𝒫::POMDP)
x1, X = 1, collect(1:M.ℓ)
𝒫, γ, b = 𝒫, 𝒫.γ, M.b
𝒮, 𝒜, 𝒪, R, T, O = tensorform(𝒫)
model = Model(Ipopt.Optimizer)
@variable(model, U[X,𝒮])
@variable(model, ψ[X,𝒜] ≥ 0)
@variable(model, η[X,𝒜,𝒪,X] ≥ 0)
@objective(model, Max, b⋅U[x1,:])
@NLconstraint(model, [x=X,s=𝒮],
U[x,s] == (sum(ψ[x,a]*(R[s,a] + γ*sum(T[s,a,s′]*sum(O[a,s′,o]
*sum(η[x,a,o,x′]*U[x′,s′] for x′ in X) for o in 𝒪) for s′ in 𝒮)) for a in 𝒜)))
@constraint(model, [x=X], sum(ψ[x,:]) == 1)
@constraint(model, [x=X,a=𝒜,o=𝒪], sum(η[x,a,o,:]) == 1)
optimize!(model)
ψ′, η′ = value.(ψ), value.(η)
return ControllerPolicy(𝒫, X,
Dict((x, 𝒫.𝒜[a]) => ψ′[x, a] for x in X, a in 𝒜),
Dict((x, 𝒫.𝒜[a], 𝒫.𝒪[o], x′) => η′[x, a, o, x′]
for x in X, a in 𝒜, o in 𝒪, x′ in X))
end
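用法示意(假设 𝒫 为前文的 POMDP 实例):节点数 ℓ 需要在表达能力与求解难度之间权衡;该 NLP 是非凸的,Ipopt 一般只能得到局部最优的控制器。

n = length(𝒫.𝒮)
b0 = fill(1/n, n)                           # 以均匀置信作为初始置信
π = solve(NonlinearProgramming(b0, 5), 𝒫)   # 5 个控制器节点(任取)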
总结
从单个决策到序列决策,整个过程就是引入迭代、并让迭代不断变优的过程;从确定性状态到模型不确定、再到状态不确定,则是不断用参数化表示(置信、α 向量、控制器等)进行逼近,进而得到策略解的过程。