[Reading Notes] Algorithms for Decision Making (11)


IV. State Uncertainty (2)

Even in finite-dimensional settings, exact solutions to POMDPs are often intractable to compute, so it is reasonable to turn to approximation methods. This part moves from offline approximate solutions to online approximate solutions, which is the usual logical progression for approximation methods.


3. Offline Belief State Planning

3.1 QMDP

A simple offline approximation technique is QMDP, which uses the action value function of the corresponding fully observable MDP and therefore yields an upper bound on the true value. Its core iteration over alpha vectors is:$$\alpha_{a}^{(k+1)}(s) = R(s,a) + \gamma \sum_{s^{\prime}}T(s^{\prime} \mid s,a) \max_{a^{\prime}} \alpha_{a^{\prime}}^{(k)}(s^{\prime}). $$

struct QMDP
    k_max # maximum number of iterations
end

function update(𝒫::POMDP, M::QMDP, Γ)
    𝒮, 𝒜, R, T, γ = 𝒫.𝒮, 𝒫.𝒜, 𝒫.R, 𝒫.T, 𝒫.γ
    Γ′ = [[R(s,a) + γ*sum(T(s,a,s′)*maximum(α′[j] for α′ in Γ)
            for (j,s′) in enumerate(𝒮)) for s in 𝒮] for a in 𝒜]
    return Γ′
end

function solve(M::QMDP, 𝒫::POMDP)
    Γ = [zeros(length(𝒫.𝒮)) for a in 𝒫.𝒜]
    Γ = alphavector_iteration(𝒫, M, Γ)
    return AlphaVectorPolicy(𝒫, Γ, 𝒫.𝒜)
end
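
The solve routines in this section call an alphavector_iteration helper that this note does not reproduce. A minimal sketch consistent with how it is used here (each method supplies an update(𝒫, M, Γ) backup and a k_max field) would be:

function alphavector_iteration(𝒫::POMDP, M, Γ)
    # repeatedly apply the method-specific alpha-vector backup for k_max iterations
    for k in 1:M.k_max
        Γ = update(𝒫, M, Γ)
    end
    return Γ
end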

3.1.1 Fast Informed Bound

The fast informed bound (FIB) accounts for the observation model and therefore gives a tighter upper bound than QMDP. Its core iteration is:$$\alpha_{a}^{(k+1)}(s) = R(s,a) + \gamma \sum_{o} \max_{a^{\prime}} \sum_{s^{\prime}}O(o \mid a, s^{\prime}) T(s^{\prime} \mid s,a)\alpha_{a^{\prime}}^{(k)}(s^{\prime}). $$

struct FastInformedBound
    k_max # maximum number of iterations
end

function update(𝒫::POMDP, M::FastInformedBound, Γ)
    𝒮, 𝒜, 𝒪, R, T, O, γ = 𝒫.𝒮, 𝒫.𝒜, 𝒫.𝒪, 𝒫.R, 𝒫.T, 𝒫.O, 𝒫.γ
    Γ′ = [[R(s, a) + γ*sum(maximum(sum(O(a,s′,o)*T(s,a,s′)*α′[j]
            for (j,s′) in enumerate(𝒮)) for α′ in Γ) for o in 𝒪) for s in 𝒮] for a in 𝒜]
    return Γ′
end

function solve(M::FastInformedBound, 𝒫::POMDP)
    Γ = [zeros(length(𝒫.𝒮)) for a in 𝒫.𝒜]
    Γ = alphavector_iteration(𝒫, M, Γ)
    return AlphaVectorPolicy(𝒫, Γ, 𝒫.𝒜)
end

3.1.2 Fast Lower Bounds

  • A common lower bound is the best-action worst-state (BAWS) lower bound, which discounts over an infinite horizon the reward of the best action taken from the worst state.

    function baws_lowerbound(𝒫::POMDP)
        𝒮, 𝒜, R, γ = 𝒫.𝒮, 𝒫.𝒜, 𝒫.R, 𝒫.γ
        r = maximum(minimum(R(s, a) for s in 𝒮) for a in 𝒜) / (1-γ)
        α = fill(r, length(𝒮))
        return α
    end
  • The blind lower bound is similar to QMDP, except that each alpha vector commits blindly to a single action instead of maximizing over actions, which makes it a lower bound.

    function blind_lowerbound(𝒫, k_max)
        𝒮, 𝒜, T, R, γ = 𝒫.𝒮, 𝒫.𝒜, 𝒫.T, 𝒫.R, 𝒫.γ
        Q(s,a,α) = R(s,a) + γ*sum(T(s,a,s′)*α[j] for (j,s′) in enumerate(𝒮))
        Γ = [baws_lowerbound(𝒫) for a in 𝒜]
        for k in 1:k_max
            Γ = [[Q(s,a,α) for s in 𝒮] for (α,a) in zip(Γ, 𝒜)]
        end
        return Γ
    end

3.2 Point-Based Value Iteration(PBVI)

PBVI restricts the value backup to a finite set of belief points $B$, producing one alpha vector per belief point; the details of the approach can be found in the PBVI literature.

struct PointBasedValueIteration
    B # set of belief points
    k_max # maximum number of iterations
end

function backup(𝒫::POMDP, Γ, b)
    𝒮, 𝒜, 𝒪, γ = 𝒫.𝒮, 𝒫.𝒜, 𝒫.𝒪, 𝒫.γ
    R, T, O = 𝒫.R, 𝒫.T, 𝒫.O
    Γa = []
    for a in 𝒜
        Γao = []
        for o in 𝒪
            b′ = update(b, 𝒫, a, o)
            push!(Γao, argmax(α->α⋅b′, Γ))
        end
        α = [R(s, a) + γ*sum(sum(T(s, a, s′)*O(a, s′, o)*Γao[i][j]
                for (j,s′) in enumerate(𝒮)) for (i,o) in enumerate(𝒪)) for s in 𝒮]
        push!(Γa, α)
    end
    return argmax(α->α⋅b, Γa)
end
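
backup above and the belief-expansion routines later in this section rely on a discrete belief filter update(b, 𝒫, a, o) that is not reproduced in this note. A minimal sketch under the same conventions (beliefs as probability vectors over 𝒮, with T(s,a,s′) and O(a,s′,o) as above) would be:

using LinearAlgebra: normalize!

function update(b::Vector{Float64}, 𝒫, a, o)
    𝒮, T, O = 𝒫.𝒮, 𝒫.T, 𝒫.O
    b′ = similar(b)
    for (i′, s′) in enumerate(𝒮)
        # predictive probability of s′ weighted by the likelihood of observing o
        b′[i′] = O(a, s′, o) * sum(T(s, a, s′) * b[i] for (i, s) in enumerate(𝒮))
    end
    if sum(b′) ≈ 0.0
        fill!(b′, 1.0)   # fall back to a uniform belief if o has zero probability
    end
    return normalize!(b′, 1)
end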

function update(𝒫::POMDP, M::PointBasedValueIteration, Γ)
    return [backup(𝒫, Γ, b) for b in M.B]
end

function solve(M::PointBasedValueIteration, 𝒫)
    Γ = fill(baws_lowerbound(𝒫), length(𝒫.𝒜))
    Γ = alphavector_iteration(𝒫, M, Γ)
    return LookaheadAlphaVectorPolicy(𝒫, Γ)
end

3.2.1 Randomized PBVI

struct RandomizedPointBasedValueIteration
    B # set of belief points
    k_max # maximum number of iterations
end

function update(𝒫::POMDP, M::RandomizedPointBasedValueIteration, Γ)
    Γ′, B′ = [], copy(M.B)
    while !isempty(B′)
        b = rand(B′)
        α = argmax(α->α⋅b, Γ)
        α′ = backup(𝒫, Γ, b)
        if α′⋅b ≥ α⋅b
            push!(Γ′, α′)
        else
            push!(Γ′, α)
        end
        filter!(b->maximum(α⋅b for α in Γ′) < maximum(α⋅b for α in Γ), B′)
    end
    return Γ′
end

function solve(M::RandomizedPointBasedValueIteration, 𝒫)
    Γ = [baws_lowerbound(𝒫)]
    Γ = alphavector_iteration(𝒫, M, Γ)
    return LookaheadAlphaVectorPolicy(𝒫, Γ)
end

3.3 Sawtooth Upper Bound

This method represents the value function with a set of belief-utility pairs: the corners of the belief simplex form a basis, and the utility of any other belief is obtained by the sawtooth interpolation below, which yields an upper bound.

struct SawtoothPolicy
    𝒫 # POMDP problem
    V # dictionary mapping beliefs to utilities
end

function basis(𝒫)
    n = length(𝒫.𝒮)
    e(i) = [j == i ? 1.0 : 0.0 for j in 1:n]
    return [e(i) for i in 1:n]
end

function utility(π::SawtoothPolicy, b)
    𝒫, V = π.𝒫, π.V
    if haskey(V, b)
        return V[b]
    end
    n = length(𝒫.𝒮)
    E = basis(𝒫)
    u = sum(V[E[i]] * b[i] for i in 1:n)
    for (b′, u′) in V
        if b′ ∉ E
            i = argmax([norm(b-e, 1) - norm(b′-e, 1) for e in E])
            w = [norm(b - e, 1) for e in E]
            w[i] = norm(b - b′, 1)
            w /= sum(w)
            w = [1 - wi for wi in w]
            α = [V[e] for e in E]
            α[i] = u′
            u = min(u, w⋅α)
        end
    end
    return u
end

(π::SawtoothPolicy)(b) = greedy(π, b).a
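
The policy above calls greedy, which performs a one-step lookahead from b using the sawtooth utility as the value estimate; the same call appears again in the heuristic search below. Neither greedy nor the belief-based lookahead is reproduced in this note, so the following is a minimal sketch consistent with their use here:

function lookahead(𝒫::POMDP, U, b::Vector, a)
    𝒮, 𝒪, T, O, R, γ = 𝒫.𝒮, 𝒫.𝒪, 𝒫.T, 𝒫.O, 𝒫.R, 𝒫.γ
    r = sum(R(s, a)*b[i] for (i, s) in enumerate(𝒮))                     # expected reward
    Posa(o, s, a) = sum(O(a, s′, o)*T(s, a, s′) for s′ in 𝒮)
    Poba(o, b, a) = sum(b[i]*Posa(o, s, a) for (i, s) in enumerate(𝒮))   # P(o | b, a)
    return r + γ*sum(Poba(o, b, a)*U(update(b, 𝒫, a, o)) for o in 𝒪)
end

function greedy(π::SawtoothPolicy, b)
    U(b′) = utility(π, b′)
    us = [lookahead(π.𝒫, U, b, a) for a in π.𝒫.𝒜]
    u, i = findmax(us)
    return (a=π.𝒫.𝒜[i], u=u)
end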

3.3.1 Point Selection

Both PBVI and the sawtooth-based iteration require a set of belief points $B$. The routines below expand such a set by adding beliefs reachable from the ones already in it.

function randstep(𝒫::POMDP, b, a)
    s = rand(SetCategorical(𝒫.𝒮, b))
    s′, r, o = 𝒫.TRO(s, a)
    b′ = update(b, 𝒫, a, o)
    return b′, r
end

function random_belief_expansion(𝒫, B)
    B′ = copy(B)
    for b in B
        a = rand(𝒫.𝒜)
        b′, r = randstep(𝒫, b, a)
        push!(B′, b′)
    end
    return unique!(B′)
end

function exploratory_belief_expansion(𝒫, B)
    B′ = copy(B)
    for b in B
        best = (b=copy(b), d=0.0)
        for a in 𝒫.𝒜
            b′, r = randstep(𝒫, b, a)
            d = minimum(norm(b - b′, 1) for b in B′)
            if d > best.d
                best = (b=b′, d=d)
            end
        end
        push!(B′, best.b)
    end
    return unique!(B′)
end
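
An illustrative sketch of how these expansions might be combined with PBVI; build_pbvi_policy is a hypothetical helper name, b0 is an assumed initial belief vector, and the iteration counts are arbitrary:

function build_pbvi_policy(𝒫, b0; n_expansions=5, k_max=30)
    B = [b0]
    for i in 1:n_expansions
        B = exploratory_belief_expansion(𝒫, B)   # add the most novel reachable beliefs
    end
    return solve(PointBasedValueIteration(B, k_max), 𝒫)
end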

3.3.2 Sawtooth Heuristic Search

struct SawtoothHeuristicSearch
    b # initial belief
    δ # gap threshold
    d # depth
    k_max # maximum number of iterations
    k_fib # number of iterations for fast informed bound
end

function explore!(M::SawtoothHeuristicSearch, 𝒫, πhi, πlo, b, d=0)
    𝒮, 𝒜, 𝒪, γ = 𝒫.𝒮, 𝒫.𝒜, 𝒫.𝒪, 𝒫.γ
    ϵ(b′) = utility(πhi, b′) - utility(πlo, b′)
    if d ≥ M.d || ϵ(b) ≤ M.δ / γ^d
        return
    end
    a = πhi(b)
    o = argmax(o -> ϵ(update(b, 𝒫, a, o)), 𝒪)
    b′ = update(b, 𝒫, a, o)
    explore!(M, 𝒫, πhi, πlo, b′, d+1)
    if b′ ∉ basis(𝒫)
        πhi.V[b′] = greedy(πhi, b′).u
    end
    push!(πlo.Γ, backup(𝒫, πlo.Γ, b′))
end

function solve(M::SawtoothHeuristicSearch, 𝒫::POMDP)
    πfib = solve(FastInformedBound(M.k_fib), 𝒫)
    Vhi = Dict(e => utility(πfib, e) for e in basis(𝒫))
    πhi = SawtoothPolicy(𝒫, Vhi)
    πlo = LookaheadAlphaVectorPolicy(𝒫, [baws_lowerbound(𝒫)])
    for i in 1:M.k_max
        explore!(M, 𝒫, πhi, πlo, M.b)
        if utility(πhi, M.b) - utility(πlo, M.b) < M.δ
            break
        end
    end
    return πlo
end

4. Online Belief State Planning

The idea behind online methods comes from tree search rooted at the current belief. Although they perform a large amount of computation at every step, they are effective for high-dimensional problems.

  • A simple online strategy is to perform a one-step lookahead: it considers every action available from the current belief and uses an approximate value function to estimate the expected value (a minimal sketch of such a policy follows this list).
  • Forward search to a deeper horizon can yield better policies, but its computational cost grows dramatically with the horizon.
  • Branch and bound is a more efficient version of forward search; by keeping upper and lower bounds on the value function it can avoid searching certain paths.
  • Sparse sampling is an approximation that reduces the computational burden of iterating over the space of all possible observations.
  • Monte Carlo tree search can be adapted to POMDPs by operating on histories instead of states.
  • Determinized sparse tree search uses a particle belief and makes the observations deterministic, which greatly shrinks the search tree.
  • Heuristic search intelligently chooses action-observation pairs so as to explore regions where the gap between the upper and lower bounds it maintains on the value function is large.
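
A minimal sketch of the one-step lookahead policy from the first bullet, using the belief-based lookahead helper sketched in section 3.3 and a set of alpha vectors Γ as the approximate value function (OneStepLookaheadPolicy is an illustrative name, not one used earlier):

struct OneStepLookaheadPolicy
    𝒫 # POMDP problem
    Γ # alpha vectors approximating the value function
end

function (π::OneStepLookaheadPolicy)(b)
    U(b′) = maximum(α⋅b′ for α in π.Γ)               # alpha-vector value estimate
    us = [lookahead(π.𝒫, U, b, a) for a in π.𝒫.𝒜]    # one-step lookahead for each action
    return π.𝒫.𝒜[argmax(us)]
end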

5. Controller Abstractions

Controller representations of POMDP policies allow the policy to maintain its own internal state. Compared with the preceding methods, these representations can improve scalability.

5.1 Controllers

A controller is a policy representation that maintains its own internal state, represented concretely as a graph over a finite set of nodes $X$: an action-selection distribution $\psi(a \mid x)$ chooses the action at node $x$, and a successor-selection distribution $\eta(x^{\prime} \mid x, a, o)$ chooses the next node given the action and observation.

mutable struct ControllerPolicy
    𝒫 # problem
    X # set of controller nodes
    ψ # action selection distribution
    η # successor selection distribution
end

function (π::ControllerPolicy)(x)
    𝒜, ψ = π.𝒫.𝒜, π.ψ
    dist = [ψ[x, a] for a in 𝒜]
    return rand(SetCategorical(𝒜, dist))
end

function update(π::ControllerPolicy, x, a, o)
    X, η = π.X, π.η
    dist = [η[x, a, o, x′] for x′ in X]
    return rand(SetCategorical(X, dist))
end
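
An illustrative sketch of rolling out a controller alongside the environment, using the generative model 𝒫.TRO exactly as randstep did earlier; simulate_controller is a hypothetical helper name:

function simulate_controller(π::ControllerPolicy, s, x, d)
    r_total, γt = 0.0, 1.0
    for t in 1:d
        a = π(x)                    # sample an action from ψ(· | x)
        s, r, o = π.𝒫.TRO(s, a)     # generative step: next state, reward, observation
        x = update(π, x, a, o)      # sample the successor node from η(· | x, a, o)
        r_total += γt * r
        γt *= π.𝒫.γ
    end
    return r_total
end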

The utility of following a controller policy can be computed by forming an MDP over the state space $X \times S$. Iterative policy evaluation with node $x$ active uses the update $$U(x, s) = \sum_{a}\psi(a \mid x)\left[R(s,a) + \gamma \sum_{s^{\prime}} T(s^{\prime} \mid s,a) \sum_{o} O(o \mid a, s^{\prime}) \sum_{x^{\prime}} \eta(x^{\prime} \mid x, a, o) U(x^{\prime}, s^{\prime})\right],$$ which the code below implements:

function utility(π::ControllerPolicy, U, x, s)
    𝒮, 𝒜, 𝒪 = π.𝒫.𝒮, π.𝒫.𝒜, π.𝒫.𝒪
    T, O, R, γ = π.𝒫.T, π.𝒫.O, π.𝒫.R, π.𝒫.γ
    X, ψ, η = π.X, π.ψ, π.η
    U′(a,s′,o) = sum(η[x,a,o,x′]*U[x′,s′] for x′ in X)
    U′(a,s′) = T(s,a,s′)*sum(O(a,s′,o)*U′(a,s′,o) for o in 𝒪)
    U′(a) = R(s,a) + γ*sum(U′(a,s′) for s′ in 𝒮)
    return sum(ψ[x,a]*U′(a) for a in 𝒜)
end

function iterative_policy_evaluation(π::ControllerPolicy, k_max)
    𝒮, X = π.𝒫.𝒮, π.X
    U = Dict((x, s) => 0.0 for x in X, s in 𝒮)
    for k in 1:k_max
        U = Dict((x, s) => utility(π, U, x, s) for x in X, s in 𝒮)
    end
    return U
end

5.2 Implementation Examples

5.2.1 Policy Iteration

Policy iteration starts from an arbitrary initial controller and then alternates between policy evaluation and policy improvement. The prune! step invoked below is not reproduced in this note; it removes newly introduced nodes that duplicate or are dominated by existing ones.

struct ControllerPolicyIteration
    k_max # number of iterations
    eval_max # number of evaluation iterations
end

function solve(M::ControllerPolicyIteration, 𝒫::POMDP)
    𝒜, 𝒪, k_max, eval_max = 𝒫.𝒜, 𝒫.𝒪, M.k_max, M.eval_max
    X = [1]
    ψ = Dict((x, a) => 1.0 / length(𝒜) for x in X, a in 𝒜)
    η = Dict((x, a, o, x′) => 1.0 for x in X, a in 𝒜, o in 𝒪, x′ in X)
    π = ControllerPolicy(𝒫, X, ψ, η)
    for i in 1:k_max
        prevX = copy(π.X)
        U = iterative_policy_evaluation(π, eval_max)
        policy_improvement!(π, U, prevX)
        prune!(π, U, prevX)
    end
    return π
end

function policy_improvement!(π::ControllerPolicy, U, prevX)
    𝒮, 𝒜, 𝒪 = π.𝒫.𝒮, π.𝒫.𝒜, π.𝒫.𝒪
    X, ψ, η = π.X, π.ψ, π.η
    repeatX𝒪 = fill(X, length(𝒪))
    assign𝒜X′ = vec(collect(product(𝒜, repeatX𝒪...)))
    for ax′ in assign𝒜X′
        x, a = maximum(X) + 1, ax′[1]
        push!(X, x)
        successor(o) = ax′[findfirst(isequal(o), 𝒪) + 1]
        U′(o,s′) = U[successor(o), s′]
        for s in 𝒮
            U[x, s] = lookahead(π.𝒫, U′, s, a)
        end
        for a′ in 𝒜
            ψ[x, a′] = a′ == a ? 1.0 : 0.0
            for (o, x′) in product(𝒪, prevX)
                η[x, a′, o, x′] = x′ == successor(o) ? 1.0 : 0.0
            end
        end
    end
    for (x, a, o, x′) in product(X, 𝒜, 𝒪, X)
        if !haskey(η, (x, a, o, x′))
            η[x, a, o, x′] = 0.0
        end
    end
end

5.2.2 Nonlinear Programming

The policy improvement problem can also be posed as a single large nonlinear program over a controller with a fixed number of nodes $\ell$; the code below formulates it with JuMP and solves it with Ipopt.

struct NonlinearProgramming
    b # initial belief
    ℓ # number of nodes
end

function tensorform(𝒫::POMDP)
    𝒮, 𝒜, 𝒪, R, T, O = 𝒫.𝒮, 𝒫.𝒜, 𝒫.𝒪, 𝒫.R, 𝒫.T, 𝒫.O
    𝒮′ = eachindex(𝒮)
    𝒜′ = eachindex(𝒜)
    𝒪′ = eachindex(𝒪)
    R′ = [R(s,a) for s in 𝒮, a in 𝒜]
    T′ = [T(s,a,s′) for s in 𝒮, a in 𝒜, s′ in 𝒮]
    O′ = [O(a,s′,o) for a in 𝒜, s′ in 𝒮, o in 𝒪]
    return 𝒮′, 𝒜′, 𝒪′, R′, T′, O′
end

function solve(M::NonlinearProgramming, 𝒫::POMDP)
    x1, X = 1, collect(1:M.ℓ)
    𝒫, γ, b = 𝒫, 𝒫.γ, M.b
    𝒮, 𝒜, 𝒪, R, T, O = tensorform(𝒫)
    model = Model(Ipopt.Optimizer)
    @variable(model, U[X,𝒮])
    @variable(model, ψ[X,𝒜] ≥ 0)
    @variable(model, η[X,𝒜,𝒪,X] ≥ 0)
    @objective(model, Max, b⋅U[x1,:])
    @NLconstraint(model, [x=X,s=𝒮],
    U[x,s] == (sum(ψ[x,a]*(R[s,a] + γ*sum(T[s,a,s′]*sum(O[a,s′,o]
        *sum(η[x,a,o,x′]*U[x′,s′] for x′ in X) for o in 𝒪) for s′ in 𝒮)) for a in 𝒜)))
    @constraint(model, [x=X], sum(ψ[x,:]) == 1)
    @constraint(model, [x=X,a=𝒜,o=𝒪], sum(η[x,a,o,:]) == 1)
    optimize!(model)
    ψ′, η′ = value.(ψ), value.(η)
    return ControllerPolicy(𝒫, X,
    Dict((x, 𝒫.𝒜[a]) => ψ′[x, a] for x in X, a in 𝒜),
    Dict((x, 𝒫.𝒜[a], 𝒫.𝒪[o], x′) => η′[x, a, o, x′] 
        for x in X, a in 𝒜, o in 𝒪, x′ in X))
end
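
A brief hypothetical usage sketch; it assumes a POMDP 𝒫 and an initial belief vector b0 are defined, and that the solver code above has access to JuMP and Ipopt (using JuMP, Ipopt):

function solve_small_controller(𝒫, b0; ℓ=3)
    π = solve(NonlinearProgramming(b0, ℓ), 𝒫)   # fixed-size controller with ℓ nodes
    return π, π(1)                              # the controller and an action sampled at node x1 = 1
end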

Summary

From single decisions to sequential decisions, the whole progression is about introducing iteration and making that iteration better. From deterministic states to model uncertainty and then state uncertainty, it is about using parameterized representations to approximate values and policies, and from them obtaining policy solutions.
