【读书笔记】Algorithms for Decision Making(7)

简介: 策略搜索即搜索策略空间,而无需直接计算值函数。策略空间的维数通常低于状态空间,并且通常可以更有效地搜索。本部分首先讨论在初始状态分布下估计策略价值的方法。然后讨论不使用策略梯度估计的搜索方法和策略梯度方法。接着介绍Actor-Critic方法用值函数的估计来指导优化。

三、序列问题(3)

策略搜索即搜索策略空间,而无需直接计算值函数。策略空间的维数通常低于状态空间,并且通常可以更有效地搜索。本部分首先讨论在初始状态分布下估计策略价值的方法。然后讨论不使用策略梯度估计的搜索方法和策略梯度方法。接着介绍Actor-Critic方法用值函数的估计来指导优化。


4. 策略搜索

4.1 近似策略评估

在已知初始状态$b(s)$的情况下,可计算策略$\pi$的预期折扣回报:$$\mathcal{U}(\pi) = \sum_{s} b(s) \mathcal{U}^{\pi}(s).$$当状态空间大或连续时,可近似表示为:$$\mathcal{U}(\pi) = \mathbb{E} [R(\tau)] = \int p_{\pi}(\tau) R(\tau) \ {\rm d} \tau.$$ 蒙特卡罗策略评估可将建立上述两式的关系。

struct MonteCarloPolicyEvaluation
    𝒫 # problem
    b # initial state distribution
    d # depth
    m # number of samples
end

function (U::MonteCarloPolicyEvaluation)(π)
    R(π) = rollout(U.𝒫, rand(U.b), π, U.d)
    return mean(R(π) for i = 1:U.m)
end

(U::MonteCarloPolicyEvaluation)(π, θ) = U(s->π(θ, s))

4.2 局部搜索

struct HookeJeevesPolicySearch
    θ # initial parameterization
    α # step size
    c # step size reduction factor
    ϵ # termination step size
end

function optimize(M::HookeJeevesPolicySearch, π, U)
    θ, θ′, α, c, ϵ = copy(M.θ), similar(M.θ), M.α, M.c, M.ϵ
    u, n = U(π, θ), length(θ)
    while α > ϵ
        copyto!(θ′, θ)
        best = (i=0, sgn=0, u=u)
        for i in 1:n
            for sgn in (-1,1)
                θ′[i] = θ[i] + sgn*α
                u′ = U(π, θ′)
                if u′ > best.u
                    best = (i=i, sgn=sgn, u=u′)
                end
            end
            θ′[i] = θ[i]
        end
        if best.i != 0
            θ[best.i] += best.sgn*α
            u = best.u
        else
            α *= c
        end
    end
    return θ
end

4.3 遗传算法

struct GeneticPolicySearch
    θs # initial population
    σ # initial standard deviation
    m_elite # number of elite samples
    k_max # number of iterations
end

function optimize(M::GeneticPolicySearch, π, U)
    θs, σ = M.θs, M.σ
    n, m = length(first(θs)), length(θs)
    for k in 1:M.k_max
        us = [U(π, θ) for θ in θs]
        sp = sortperm(us, rev=true)
        θ_best = θs[sp[1]]
        rand_elite() = θs[sp[rand(1:M.m_elite)]]
        θs = [rand_elite() + σ.*randn(n) for i in 1:(m-1)]
        push!(θs, θ_best)
    end
    return last(θs)
end

4.4 交叉熵方法

该方法想要寻找$$\psi^{\ast} = \argmax_{\psi} \mathbb{E}_{\theta \sim p(\cdot \mid \psi)} [\mathcal{U}(\pi_{\theta})].$$事实上,直接去解上式很难实现。一般使用采样估计的方法去迭代。

struct CrossEntropyPolicySearch
    p # initial distribution
    m # number of samples
    m_elite # number of elite samples
    k_max # number of iterations
end

function optimize_dist(M::CrossEntropyPolicySearch, π, U)
    p, m, m_elite, k_max = M.p, M.m, M.m_elite, M.k_max
    for k in 1:k_max
        θs = rand(p, m)
        us = [U(π, θs[:,i]) for i in 1:m]
        θ_elite = θs[:,sortperm(us)[(m-m_elite+1):m]]
        p = Distributions.fit(typeof(p), θ_elite)
    end
    return p
end

function optimize(M, π, U)
    return Distributions.mode(optimize_dist(M, π, U))
end

4.5 进化策略

struct EvolutionStrategies
    D # distribution constructor
    ψ # initial distribution parameterization
    ∇logp # log search likelihood gradient
    m # number of samples
    α # step factor
    k_max # number of iterations
end

function evolution_strategy_weights(m)
    ws = [max(0, log(m/2+1) - log(i)) for i in 1:m]
    ws ./= sum(ws)
    ws .-= 1/m
    return ws
end

function optimize_dist(M::EvolutionStrategies, π, U)
    D, ψ, m, ∇logp, α = M.D, M.ψ, M.m, M.∇logp, M.α
    ws = evolution_strategy_weights(m)
    for k in 1:M.k_max
        θs = rand(D(ψ), m)
        us = [U(π, θs[:,i]) for i in 1:m]
        sp = sortperm(us, rev=true)
        ∇ = sum(w.*∇logp(ψ, θs[:,i]) for (w,i) in zip(ws,sp))
        ψ += α.*∇
    end
    return D(ψ)
end

5. 策略梯度

5.1 有限差分

在策略优化的背景下,估计遵循$\theta$参数化的策略所期望的效用梯度:$$\begin{align*} \nabla\mathcal{U}(\theta) & = \begin{bmatrix} \frac{\partial \mathcal{U}}{\partial \theta_{1}}(\theta), \cdots, \frac{\partial \mathcal{U}}{\partial \theta_{n}} \end{bmatrix} \\ & \approx \begin{bmatrix} \frac{\mathcal{U}(\theta + \delta e^{(1)}) - \mathcal{U}(\theta)}{\delta}, \cdots, \frac{\mathcal{U}(\theta + \delta e^{(n)}) - \mathcal{U}(\theta)}{\delta} \end{bmatrix}.\end{align*}$$

struct FiniteDifferenceGradient
    𝒫 # problem
    b # initial state distribution
    d # depth
    m # number of samples
    δ # step size
end

function gradient(M::FiniteDifferenceGradient, π, θ)
    𝒫, b, d, m, δ, γ, n = M.𝒫, M.b, M.d, M.m, M.δ, M.𝒫.γ, length(θ)
    Δθ(i) = [i == k ? δ : 0.0 for k in 1:n]
    R(τ) = sum(r*γ^(k-1) for (k, (s,a,r)) in enumerate(τ))
    U(θ) = mean(R(simulate(𝒫, rand(b), s->π(θ, s), d)) for i in 1:m)
    ΔU = [U(θ + Δθ(i)) - U(θ) for i in 1:n]
    return ΔU ./ δ
end

5.2 回归梯度

struct RegressionGradient
    𝒫 # problem
    b # initial state distribution
    d # depth
    m # number of samples
    δ # step size
end

function gradient(M::RegressionGradient, π, θ)
    𝒫, b, d, m, δ, γ = M.𝒫, M.b, M.d, M.m, M.δ, M.𝒫.γ
    ΔΘ = [δ.*normalize(randn(length(θ)), 2) for i = 1:m]
    R(τ) = sum(r*γ^(k-1) for (k, (s,a,r)) in enumerate(τ))
    U(θ) = R(simulate(𝒫, rand(b), s->π(θ,s), d))
    ΔU = [U(θ + Δθ) - U(θ) for Δθ in ΔΘ]
    return pinv(reduce(hcat, ΔΘ)') * ΔU
end

5.3 似然比

struct LikelihoodRatioGradient
    𝒫 # problem
    b # initial state distribution
    d # depth
    m # number of samples
    ∇logπ # gradient of log likelihood
end

function gradient(M::LikelihoodRatioGradient, π, θ)
    𝒫, b, d, m, ∇logπ, γ = M.𝒫, M.b, M.d, M.m, M.∇logπ, M.𝒫.γ
    πθ(s) = π(θ, s)
    R(τ) = sum(r*γ^(k-1) for (k, (s,a,r)) in enumerate(τ))
    ∇U(τ) = sum(∇logπ(θ, a, s) for (s,a) in τ)*R(τ)
    return mean(∇U(simulate(𝒫, rand(b), πθ, d)) for i in 1:m)
end

5.4 Reward-to-Go

struct RewardToGoGradient
    𝒫 # problem
    b # initial state distribution
    d # depth
    m # number of samples
    ∇logπ # gradient of log likelihood
end

function gradient(M::RewardToGoGradient, π, θ)
    𝒫, b, d, m, ∇logπ, γ = M.𝒫, M.b, M.d, M.m, M.∇logπ, M.𝒫.γ
    πθ(s) = π(θ, s)
    R(τ, j) = sum(r*γ^(k-1) for (k,(s,a,r)) in zip(j:d, τ[j:end]))
    ∇U(τ) = sum(∇logπ(θ, a, s)*R(τ,j) for (j, (s,a,r)) in enumerate(τ))
    return mean(∇U(simulate(𝒫, rand(b), πθ, d)) for i in 1:m)
end

5.5 基线校正(Baseline Subtraction)

struct BaselineSubtractionGradient
    𝒫 # problem
    b # initial state distribution
    d # depth
    m # number of samples
    ∇logπ # gradient of log likelihood
end

function gradient(M::BaselineSubtractionGradient, π, θ)
    𝒫, b, d, m, ∇logπ, γ = M.𝒫, M.b, M.d, M.m, M.∇logπ, M.𝒫.γ
    πθ(s) = π(θ, s)
    ℓ(a, s, k) = ∇logπ(θ, a, s)*γ^(k-1)
    R(τ, k) = sum(r*γ^(j-1) for (j,(s,a,r)) in enumerate(τ[k:end]))
    numer(τ) = sum(ℓ(a,s,k).^2*R(τ,k) for (k,(s,a,r)) in enumerate(τ))
    denom(τ) = sum(ℓ(a,s,k).^2 for (k,(s,a)) in enumerate(τ))
    base(τ) = numer(τ) ./ denom(τ)
    trajs = [simulate(𝒫, rand(b), πθ, d) for i in 1:m]
    rbase = mean(base(τ) for τ in trajs)
    ∇U(τ) = sum(ℓ(a,s,k).*(R(τ,k).-rbase) for (k,(s,a,r)) in enumerate(τ))
    return mean(∇U(τ) for τ in trajs)
end

6. Actor-Critic方法

6.1 Actor-Critic

具体讲,策略$\pi_{\theta}$是Actor,由$\theta$确定的参数,并借助$\phi$参数化的值函数$\mathcal{U}_{\phi}(s)$,$\mathcal{Q}_{\phi}(s,a)$或$\mathcal{A}_{\phi}(s,a)$作为Critic。

struct ActorCritic
    𝒫 # problem
    b # initial state distribution
    d # depth
    m # number of samples
    ∇logπ # gradient of log likelihood ∇logπ(θ,a,s)
    U # parameterized value function U(ϕ, s)
    ∇U # gradient of value function ∇U(ϕ,s)
end

function gradient(M::ActorCritic, π, θ, ϕ)
    𝒫, b, d, m, ∇logπ = M.𝒫, M.b, M.d, M.m, M.∇logπ
    U, ∇U, γ = M.U, M.∇U, M.𝒫.γ
    πθ(s) = π(θ, s)
    R(τ,j) = sum(r*γ^(k-1) for (k,(s,a,r)) in enumerate(τ[j:end]))
    A(τ,j) = τ[j][3] + γ*U(ϕ,τ[j+1][1]) - U(ϕ,τ[j][1])
    ∇Uθ(τ) = sum(∇logπ(θ,a,s)*A(τ,j)*γ^(j-1) for (j, (s,a,r)) in enumerate(τ[1:end-1]))
    ∇ℓϕ(τ) = sum((U(ϕ,s) - R(τ,j))*∇U(ϕ,s) for (j, (s,a,r)) in enumerate(τ))
    trajs = [simulate(𝒫, rand(b), πθ, d) for i in 1:m]
    return mean(∇Uθ(τ) for τ in trajs), mean(∇ℓϕ(τ) for τ in trajs)
end

6.2 广义的优势估计

该方法能够平衡偏差和方差。

struct GeneralizedAdvantageEstimation
    𝒫 # problem
    b # initial state distribution
    d # depth
    m # number of samples
    ∇logπ # gradient of log likelihood ∇logπ(θ,a,s)
    U # parameterized value function U(ϕ, s)
    ∇U # gradient of value function ∇U(ϕ,s)
    λ # weight ∈ [0,1]
end

function gradient(M::GeneralizedAdvantageEstimation, π, θ, ϕ)
    𝒫, b, d, m, ∇logπ = M.𝒫, M.b, M.d, M.m, M.∇logπ
    U, ∇U, γ, λ = M.U, M.∇U, M.𝒫.γ, M.λ
    πθ(s) = π(θ, s)
    R(τ,j) = sum(r*γ^(k-1) for (k,(s,a,r)) in enumerate(τ[j:end]))
    δ(τ,j) = τ[j][3] + γ*U(ϕ,τ[j+1][1]) - U(ϕ,τ[j][1])
    A(τ,j) = sum((γ*λ)^(ℓ-1)*δ(τ, j+ℓ-1) for ℓ in 1:d-j)
    ∇Uθ(τ) = sum(∇logπ(θ,a,s)*A(τ,j)*γ^(j-1) for (j, (s,a,r)) in enumerate(τ[1:end-1]))
    ∇ℓϕ(τ) = sum((U(ϕ,s) - R(τ,j))*∇U(ϕ,s) for (j, (s,a,r)) in enumerate(τ))
    trajs = [simulate(𝒫, rand(b), πθ, d) for i in 1:m]
    return mean(∇Uθ(τ) for τ in trajs), mean(∇ℓϕ(τ) for τ in trajs)
end

6.3 确定性策略梯度

struct DeterministicPolicyGradient
    𝒫 # problem
    b # initial state distribution
    d # depth
    m # number of samples
    ∇π # gradient of deterministic policy π(θ, s)
    Q # parameterized value function Q(ϕ,s,a)
    ∇Qϕ # gradient of value function with respect to ϕ
    ∇Qa # gradient of value function with respect to a
    σ # policy noise
end

function gradient(M::DeterministicPolicyGradient, π, θ, ϕ)
    𝒫, b, d, m, ∇π = M.𝒫, M.b, M.d, M.m, M.∇π
    Q, ∇Qϕ, ∇Qa, σ, γ = M.Q, M.∇Qϕ, M.∇Qa, M.σ, M.𝒫.γ
    π_rand(s) = π(θ, s) + σ*randn()*I
    ∇Uθ(τ) = sum(∇π(θ,s)*∇Qa(ϕ,s,π(θ,s))*γ^(j-1) for (j,(s,a,r)) in enumerate(τ))
    ∇ℓϕ(τ,j) = begin
        s, a, r = τ[j]
        s′ = τ[j+1][1]
        a′ = π(θ,s′)
        δ = r + γ*Q(ϕ,s′,a′) - Q(ϕ,s,a)
        return δ*(γ*∇Qϕ(ϕ,s′,a′) - ∇Qϕ(ϕ,s,a))
    end
    ∇ℓϕ(τ) = sum(∇ℓϕ(τ,j) for j in 1:length(τ)-1)
    trajs = [simulate(𝒫, rand(b), π_rand, d) for i in 1:m]
    return mean(∇Uθ(τ) for τ in trajs), mean(∇ℓϕ(τ) for τ in trajs)
end
在线方法,如蒙特卡罗树搜索,可用于指导策略和值函数估计的优化。

总结

到目前为止,在序列决策问题的讨论中,假设过渡和奖励模型是已知的。然而,在许多问题中,这些模型并不确切,代理必须通过经验来学习行动。解决存在模型不确定性的此类问题是强化学习领域的主题,也是接下来讨论的重点。

相关文章
|
机器学习/深度学习 算法 流计算
【读书笔记】Algorithms for Decision Making(6)
对于较大状态空间的问题,计算精确解需要极大的内存量,因而考虑近似解的方法。常使用approximate dynamic programming的方法去寻求近似解,进而使用在线方法实现实时计算。
163 0
【读书笔记】Algorithms for Decision Making(6)
|
机器学习/深度学习 人工智能 算法
【读书笔记】Algorithms for Decision Making(1)
我自己的粗浅看法:机器学习要不是拟合逼近(经常提及的machine learning),要不就是决策过程(reinforcement learning),这本书主要讲述后者的前世今生。
327 0
【读书笔记】Algorithms for Decision Making(1)
|
算法
【读书笔记】Algorithms for Decision Making(3)
上一部分给出了概率分布的表示论。本部分将展示如何使用概率表示进行推理,即确定一组给定观察变量相关值的一个或多个未观察变量的分布。在该部分中首先介绍直接推断的办法,然后给出几种有效的近似方法。
155 0
|
存储 安全 编译器
[笔记]读书笔记 C++设计新思维《一》基于策略的类设计(下)
[笔记]读书笔记 C++设计新思维《一》基于策略的类设计(下)
|
存储 关系型数据库 编译器
C++ Primer Plus 第6版 读书笔记(9)第 9章 函数——内存模型和名称空间
C++ Primer Plus 第6版 读书笔记(9)第 9章 函数——内存模型和名称空间
115 1
|
存储 算法 编译器
C++ Primer Plus 第6版 读书笔记(8)第 8章 函数探幽(二)
C++ Primer Plus 第6版 读书笔记(8)第 8章 函数探幽(二)
73 1
|
存储 算法 Java
[笔记]读书笔记 C++设计新思维《二》技术(Techniques)(二)
[笔记]读书笔记 C++设计新思维《二》技术(Techniques)(二)
|
安全 Java C++
[笔记]读书笔记 C++设计新思维《一》基于策略的类设计(上)
[笔记]读书笔记 C++设计新思维《一》基于策略的类设计
|
存储 编译器 程序员
C++ Primer Plus 第6版 读书笔记(10) 第十章 类与对象
C++ Primer Plus 第6版 读书笔记(10) 第十章 类与对象
72 0
|
存储 Java 编译器
C++ Primer Plus 第6版 读书笔记(8)第 8章 函数探幽(一)
C++ Primer Plus 第6版 读书笔记(8)第 8章 函数探幽(一)
61 0