三、序列问题(3)
策略搜索即直接在策略空间中进行搜索,而无需直接计算值函数。策略空间的维数通常低于状态空间,因此通常可以更有效地搜索。本部分首先讨论在初始状态分布下估计策略价值的方法,然后讨论不使用策略梯度估计的搜索方法以及策略梯度方法,最后介绍Actor-Critic方法,它利用值函数的估计来指导策略优化。
4. 策略搜索
4.1 近似策略评估
在已知初始状态分布$b(s)$的情况下,可计算策略$\pi$的预期折扣回报:$$\mathcal{U}(\pi) = \sum_{s} b(s) \mathcal{U}^{\pi}(s).$$当状态空间很大或连续时,可改用轨迹$\tau$的回报期望来表示,并通过采样近似:$$\mathcal{U}(\pi) = \mathbb{E} [R(\tau)] = \int p_{\pi}(\tau) R(\tau) \ {\rm d} \tau.$$ 蒙特卡罗策略评估建立了上述两式之间的联系:从$b$中采样初始状态,生成$m$条轨迹,并以其回报的平均值来近似该期望。
# Settings for Monte Carlo policy evaluation. Instances are callable: U(π) averages
# m rollouts of depth d, and U(π, θ) does the same for the policy s -> π(θ, s).
struct MonteCarloPolicyEvaluation
𝒫 # problem, passed through to rollout
b # initial state distribution; one state is drawn with rand(b) per rollout
d # depth handed to rollout
m # number of samples (rollouts) averaged into the estimate
end
# Estimate the value of policy π as the mean of U.m rollout returns. Each rollout
# starts from a fresh state drawn from U.b and runs to depth U.d.
function (U::MonteCarloPolicyEvaluation)(π)
    sample_return() = rollout(U.𝒫, rand(U.b), π, U.d)
    return mean(sample_return() for _ in 1:U.m)
end
# Evaluate a parameterized policy: fix the parameters θ, then evaluate the resulting
# state -> action function with the single-argument method.
function (U::MonteCarloPolicyEvaluation)(π, θ)
    policy(s) = π(θ, s)
    return U(policy)
end
4.2 局部搜索
# Settings for Hooke-Jeeves (coordinate pattern) policy search.
struct HookeJeevesPolicySearch
θ # initial parameterization; optimize works on a copy of it
α # step size used for the first round of coordinate probes
c # step size reduction factor; α is multiplied by c when no probe improves
ϵ # termination step size; the search stops once α is no longer greater than ϵ
end
"""
    optimize(M::HookeJeevesPolicySearch, π, U)

Run Hooke-Jeeves search over the parameters of policy `π`, scoring candidates with
`U(π, θ)`.

Each round probes every coordinate at plus and minus the current step size and moves to
the single best strictly improving probe. When no probe improves, the step size is
multiplied by `M.c`. The search ends once the step size is no longer greater than `M.ϵ`.
Returns the final parameter vector, which is a modified copy of `M.θ`.
"""
function optimize(M::HookeJeevesPolicySearch, π, U)
    θ = copy(M.θ)
    trial = similar(M.θ)
    step = M.α
    current = U(π, θ)
    while step > M.ϵ
        trial .= θ
        best_i, best_sgn, best_u = 0, 0, current
        for i in 1:length(θ), sgn in (-1, 1)
            trial[i] = θ[i] + sgn*step
            candidate = U(π, trial)
            if candidate > best_u
                best_i, best_sgn, best_u = i, sgn, candidate
            end
            # Undo the probe so only one coordinate is ever perturbed at a time.
            trial[i] = θ[i]
        end
        if best_i == 0
            # No probe beat the incumbent: shrink the step instead of moving.
            step *= M.c
        else
            θ[best_i] += best_sgn*step
            current = best_u
        end
    end
    return θ
end
4.3 遗传算法
# Settings for genetic policy search.
struct GeneticPolicySearch
θs # initial population (collection of parameter vectors)
σ # standard deviation of the Gaussian noise added to each selected elite sample
m_elite # number of elite samples; parents are drawn uniformly from the top m_elite
k_max # number of iterations
end
"""
    optimize(M::GeneticPolicySearch, π, U)

Evolve a population of policy parameters for `M.k_max` generations, scoring each member
with `U(π, θ)`.

Every generation ranks the population, builds all but one member of the next generation
by perturbing uniformly chosen elite members with Gaussian noise of scale `M.σ`, and
appends the top-ranked member unchanged. Returns the last element of the final
population, i.e. the member carried over unchanged in the last generation.
"""
function optimize(M::GeneticPolicySearch, π, U)
    population = M.θs
    dim = length(first(population))
    popsize = length(population)
    for _ in 1:M.k_max
        scores = [U(π, θ) for θ in population]
        ranking = sortperm(scores, rev=true)
        champion = population[ranking[1]]
        offspring = [
            population[ranking[rand(1:M.m_elite)]] + M.σ .* randn(dim)
            for _ in 1:(popsize - 1)
        ]
        # The unperturbed champion always goes last so `last` can retrieve it.
        push!(offspring, champion)
        population = offspring
    end
    return last(population)
end
4.4 交叉熵方法
该方法旨在寻找$$\psi^{\ast} = \operatorname*{arg\,max}_{\psi} \mathbb{E}_{\theta \sim p(\cdot \mid \psi)} [\mathcal{U}(\pi_{\theta})].$$事实上,直接求解上式很难实现,一般采用基于采样估计的迭代方法:每轮从当前分布中采样,再用得分最高的精英样本重新拟合分布。
# Settings for cross-entropy policy search.
struct CrossEntropyPolicySearch
p # initial distribution over policy parameters; sampled with rand(p, m)
m # number of samples drawn per iteration
m_elite # number of elite samples used to refit the distribution
k_max # number of iterations
end
"""
    optimize_dist(M::CrossEntropyPolicySearch, π, U)

Iteratively refine a search distribution over policy parameters.

Each of the `M.k_max` iterations draws `M.m` parameter samples (one per column), scores
them with `U(π, θ)`, and refits a distribution of the same type to the `M.m_elite`
highest-scoring columns. Returns the final distribution.
"""
function optimize_dist(M::CrossEntropyPolicySearch, π, U)
    dist = M.p
    nsamples, nelite = M.m, M.m_elite
    for _ in 1:M.k_max
        samples = rand(dist, nsamples)
        scores = [U(π, samples[:, j]) for j in 1:nsamples]
        # Scores are sorted ascending, so the elite columns are the trailing indices.
        ascending = sortperm(scores)
        elite_cols = ascending[(nsamples - nelite + 1):nsamples]
        dist = Distributions.fit(typeof(dist), samples[:, elite_cols])
    end
    return dist
end
"""
    optimize(M, π, U)

Run `optimize_dist(M, π, U)` and return the mode of the resulting distribution.
"""
function optimize(M, π, U)
    final_dist = optimize_dist(M, π, U)
    return Distributions.mode(final_dist)
end
4.5 进化策略
# Settings for evolution strategies.
struct EvolutionStrategies
D # distribution constructor; D(ψ) builds the search distribution
ψ # initial distribution parameterization
∇logp # log search likelihood gradient, called as ∇logp(ψ, θ)
m # number of samples drawn per iteration
α # step factor applied to the estimated gradient
k_max # number of iterations
end
"""
    evolution_strategy_weights(m)

Return `m` rank-based weights, ordered from the best rank to the worst.

The raw weight of rank `i` is `max(0, log(m/2 + 1) - log(i))`. The raw weights are
normalized to sum to one and then shifted down by `1/m`, so the result sums to zero.
"""
function evolution_strategy_weights(m)
    cutoff = log(m/2 + 1)
    raw = [max(0, cutoff - log(rank)) for rank in 1:m]
    normalized = raw ./ sum(raw)
    return normalized .- 1/m
end
"""
    optimize_dist(M::EvolutionStrategies, π, U)

Update the search-distribution parameters for `M.k_max` iterations and return the
distribution built from the final parameters.

Each iteration draws `M.m` parameter samples (one per column) from `M.D(ψ)`, ranks them
by `U(π, θ)` from best to worst, and steps `ψ` along the rank-weighted sum of
`M.∇logp(ψ, θ)` scaled by `M.α`.
"""
function optimize_dist(M::EvolutionStrategies, π, U)
    params = M.ψ
    weights = evolution_strategy_weights(M.m)
    for _ in 1:M.k_max
        samples = rand(M.D(params), M.m)
        scores = [U(π, samples[:, j]) for j in 1:M.m]
        ranking = sortperm(scores, rev=true)
        # Pair the largest weight with the best sample, and so on down the ranking.
        step = sum(
            w .* M.∇logp(params, samples[:, j]) for (w, j) in zip(weights, ranking)
        )
        params = params + M.α .* step
    end
    return M.D(params)
end
5. 策略梯度
5.1 有限差分
在策略优化的背景下,需要估计由$\theta$参数化的策略的期望效用关于$\theta$的梯度。有限差分法用前向差商逐分量近似该梯度:$$\begin{align*} \nabla\mathcal{U}(\theta) & = \begin{bmatrix} \frac{\partial \mathcal{U}}{\partial \theta_{1}}(\theta), \cdots, \frac{\partial \mathcal{U}}{\partial \theta_{n}}(\theta) \end{bmatrix} \\ & \approx \begin{bmatrix} \frac{\mathcal{U}(\theta + \delta e^{(1)}) - \mathcal{U}(\theta)}{\delta}, \cdots, \frac{\mathcal{U}(\theta + \delta e^{(n)}) - \mathcal{U}(\theta)}{\delta} \end{bmatrix},\end{align*}$$其中$e^{(i)}$为第$i$个标准基向量,$\delta$为步长。
# Settings for the finite-difference policy gradient estimate.
struct FiniteDifferenceGradient
𝒫 # problem; its discount factor γ is read and it is passed to simulate
b # initial state distribution; one state is drawn with rand(b) per trajectory
d # depth handed to simulate
m # number of samples (trajectories) averaged per utility estimate
δ # step size of the perturbation applied to one coordinate at a time
end
"""
    gradient(M::FiniteDifferenceGradient, π, θ)

Estimate the gradient of the expected discounted return with respect to `θ` using
forward differences.

For every coordinate `i`, the utility at `θ` perturbed by `M.δ` along `i` and the utility
at `θ` are each estimated from `M.m` fresh simulated trajectories. Their difference
divided by `M.δ` is the `i`-th component of the returned vector.
"""
function gradient(M::FiniteDifferenceGradient, π, θ)
    γ = M.𝒫.γ
    n = length(θ)
    perturbation(i) = [i == k ? M.δ : 0.0 for k in 1:n]
    discounted_return(τ) = sum(r*γ^(k-1) for (k, (s, a, r)) in enumerate(τ))
    function utility(θ′)
        policy = s -> π(θ′, s)
        return mean(
            discounted_return(simulate(M.𝒫, rand(M.b), policy, M.d)) for _ in 1:M.m
        )
    end
    # The unperturbed utility is re-estimated for every coordinate, as in each difference.
    differences = [utility(θ + perturbation(i)) - utility(θ) for i in 1:n]
    return differences ./ M.δ
end
5.2 回归梯度
# Settings for the regression-based policy gradient estimate.
struct RegressionGradient
𝒫 # problem; its discount factor γ is read and it is passed to simulate
b # initial state distribution; one state is drawn with rand(b) per trajectory
d # depth handed to simulate
m # number of samples (random perturbations of θ)
δ # step size; every perturbation is a normalized random direction scaled by δ
end
"""
    gradient(M::RegressionGradient, π, θ)

Estimate the policy gradient by least-squares regression over random perturbations.

Draws `M.m` random directions of 2-norm `M.δ`, measures the change in single-trajectory
discounted return for each, and solves for the gradient by applying the pseudoinverse of
the perturbation matrix (one perturbation per row) to the vector of changes.
"""
function gradient(M::RegressionGradient, π, θ)
    γ = M.𝒫.γ
    perturbations = [M.δ .* normalize(randn(length(θ)), 2) for _ in 1:M.m]
    discounted_return(τ) = sum(r*γ^(k-1) for (k, (s, a, r)) in enumerate(τ))
    function rollout_return(θ′)
        return discounted_return(simulate(M.𝒫, rand(M.b), s -> π(θ′, s), M.d))
    end
    differences = [rollout_return(θ + Δ) - rollout_return(θ) for Δ in perturbations]
    # Stack perturbations as columns, then transpose so each row is one perturbation.
    design = reduce(hcat, perturbations)'
    return pinv(design) * differences
end
5.3 似然比
# Settings for the likelihood-ratio policy gradient estimate.
struct LikelihoodRatioGradient
𝒫 # problem; its discount factor γ is read and it is passed to simulate
b # initial state distribution; one state is drawn with rand(b) per trajectory
d # depth handed to simulate
m # number of samples (trajectories) averaged
∇logπ # gradient of log likelihood, called as ∇logπ(θ, a, s)
end
"""
    gradient(M::LikelihoodRatioGradient, π, θ)

Estimate the policy gradient with the likelihood-ratio method.

For each of `M.m` simulated trajectories, the summed log-likelihood gradients of all
steps are multiplied by the discounted return of the whole trajectory. The mean over
trajectories is returned.
"""
function gradient(M::LikelihoodRatioGradient, π, θ)
    γ = M.𝒫.γ
    policy(s) = π(θ, s)
    discounted_return(τ) = sum(r*γ^(k-1) for (k, (s, a, r)) in enumerate(τ))
    score(τ) = sum(M.∇logπ(θ, a, s) for (s, a) in τ)
    estimate(τ) = score(τ)*discounted_return(τ)
    return mean(estimate(simulate(M.𝒫, rand(M.b), policy, M.d)) for _ in 1:M.m)
end
5.4 Reward-to-Go
# Settings for the reward-to-go policy gradient estimate.
struct RewardToGoGradient
𝒫 # problem; its discount factor γ is read and it is passed to simulate
b # initial state distribution; one state is drawn with rand(b) per trajectory
d # depth handed to simulate; also the last step index used for discounting
m # number of samples (trajectories) averaged
∇logπ # gradient of log likelihood, called as ∇logπ(θ, a, s)
end
"""
    gradient(M::RewardToGoGradient, π, θ)

Estimate the policy gradient using reward-to-go.

Each step's log-likelihood gradient is weighted only by the rewards from that step
onward, with every reward discounted by its absolute step index in the trajectory. The
per-trajectory sums are averaged over `M.m` simulated trajectories.
"""
function gradient(M::RewardToGoGradient, π, θ)
    γ, depth = M.𝒫.γ, M.d
    policy(s) = π(θ, s)
    function reward_to_go(τ, j)
        # zip pairs the tail of τ with step indices j, j+1, ... up to depth.
        return sum(r*γ^(k-1) for (k, (s, a, r)) in zip(j:depth, τ[j:end]))
    end
    function estimate(τ)
        return sum(
            M.∇logπ(θ, a, s)*reward_to_go(τ, j) for (j, (s, a, r)) in enumerate(τ)
        )
    end
    return mean(estimate(simulate(M.𝒫, rand(M.b), policy, depth)) for _ in 1:M.m)
end
5.5 基线校正(Baseline Subtraction)
# Settings for the policy gradient estimate with baseline subtraction.
struct BaselineSubtractionGradient
𝒫 # problem; its discount factor γ is read and it is passed to simulate
b # initial state distribution; one state is drawn with rand(b) per trajectory
d # depth handed to simulate
m # number of samples (trajectories) used for both baseline and gradient
∇logπ # gradient of log likelihood, called as ∇logπ(θ, a, s)
end
"""
    gradient(M::BaselineSubtractionGradient, π, θ)

Estimate the policy gradient with a componentwise baseline subtracted from the
reward-to-go.

`M.m` trajectories are simulated once and reused. The baseline of a trajectory is the
ratio of the squared discounted scores weighted by reward-to-go to the plain squared
discounted scores, computed elementwise. The baselines are averaged across trajectories,
and the gradient is the mean over trajectories of the discounted scores times
(reward-to-go minus that mean baseline).
"""
function gradient(M::BaselineSubtractionGradient, π, θ)
    γ = M.𝒫.γ
    policy(s) = π(θ, s)
    # Log-likelihood gradient at step k, scaled by γ^(k-1).
    score(a, s, k) = M.∇logπ(θ, a, s)*γ^(k-1)
    # Return from step k onward, with the discount exponent restarting at step k.
    tail_return(τ, k) = sum(r*γ^(j-1) for (j, (s, a, r)) in enumerate(τ[k:end]))
    function baseline(τ)
        weighted = sum(
            score(a, s, k).^2*tail_return(τ, k) for (k, (s, a, r)) in enumerate(τ)
        )
        total = sum(score(a, s, k).^2 for (k, (s, a)) in enumerate(τ))
        return weighted ./ total
    end
    trajectories = [simulate(M.𝒫, rand(M.b), policy, M.d) for _ in 1:M.m]
    mean_baseline = mean(baseline(τ) for τ in trajectories)
    function estimate(τ)
        return sum(
            score(a, s, k).*(tail_return(τ, k) .- mean_baseline)
            for (k, (s, a, r)) in enumerate(τ)
        )
    end
    return mean(estimate(τ) for τ in trajectories)
end
6. Actor-Critic方法
6.1 Actor-Critic
具体来讲,由$\theta$参数化的策略$\pi_{\theta}$是Actor,而由$\phi$参数化的值函数$\mathcal{U}_{\phi}(s)$,$\mathcal{Q}_{\phi}(s,a)$或$\mathcal{A}_{\phi}(s,a)$则作为Critic,用于指导Actor的优化。
# Settings for the basic actor-critic gradient estimate.
struct ActorCritic
𝒫 # problem; its discount factor γ is read and it is passed to simulate
b # initial state distribution; one state is drawn with rand(b) per trajectory
d # depth handed to simulate
m # number of samples (trajectories) averaged
∇logπ # gradient of log likelihood ∇logπ(θ,a,s)
U # parameterized value function U(ϕ, s)
∇U # gradient of value function ∇U(ϕ,s)
end
"""
    gradient(M::ActorCritic, π, θ, ϕ)

Return a tuple `(actor_gradient, critic_gradient)` averaged over `M.m` simulated
trajectories.

The actor term weights each step's log-likelihood gradient by a one-step advantage
estimate (reward plus discounted value of the next state minus value of the current
state) and by `γ^(j-1)`; the final step is skipped because it has no successor. The
critic term sums, over all steps, the gap between the value estimate and the observed
reward-to-go multiplied by the value-function gradient.
"""
function gradient(M::ActorCritic, π, θ, ϕ)
    U, γ = M.U, M.𝒫.γ
    policy(s) = π(θ, s)
    reward_to_go(τ, j) = sum(r*γ^(k-1) for (k, (s, a, r)) in enumerate(τ[j:end]))
    function advantage(τ, j)
        step, next = τ[j], τ[j+1]
        return step[3] + γ*U(ϕ, next[1]) - U(ϕ, step[1])
    end
    function actor_grad(τ)
        return sum(
            M.∇logπ(θ, a, s)*advantage(τ, j)*γ^(j-1)
            for (j, (s, a, r)) in enumerate(τ[1:end-1])
        )
    end
    function critic_grad(τ)
        return sum(
            (U(ϕ, s) - reward_to_go(τ, j))*M.∇U(ϕ, s)
            for (j, (s, a, r)) in enumerate(τ)
        )
    end
    trajectories = [simulate(M.𝒫, rand(M.b), policy, M.d) for _ in 1:M.m]
    actor = mean(actor_grad(τ) for τ in trajectories)
    critic = mean(critic_grad(τ) for τ in trajectories)
    return actor, critic
end
6.2 广义的优势估计
该方法能够平衡偏差和方差。
# Settings for the actor-critic gradient estimate with generalized advantage estimation.
struct GeneralizedAdvantageEstimation
𝒫 # problem; its discount factor γ is read and it is passed to simulate
b # initial state distribution; one state is drawn with rand(b) per trajectory
d # depth handed to simulate; also bounds the number of terms in each advantage sum
m # number of samples (trajectories) averaged
∇logπ # gradient of log likelihood ∇logπ(θ,a,s)
U # parameterized value function U(ϕ, s)
∇U # gradient of value function ∇U(ϕ,s)
λ # weight ∈ [0,1]; residual ℓ in an advantage sum is scaled by (γ*λ)^(ℓ-1)
end
"""
    gradient(M::GeneralizedAdvantageEstimation, π, θ, ϕ)

Return a tuple `(actor_gradient, critic_gradient)` averaged over `M.m` simulated
trajectories, using generalized advantage estimation for the actor term.

The advantage at step `j` is the sum of one-step temporal-difference residuals from step
`j` through step `M.d - 1`, with the `i`-th residual scaled by `(γ*λ)^(i-1)`. The critic
term sums, over all steps, the gap between the value estimate and the observed
reward-to-go multiplied by the value-function gradient.
"""
function gradient(M::GeneralizedAdvantageEstimation, π, θ, ϕ)
    U, γ, λ = M.U, M.𝒫.γ, M.λ
    policy(s) = π(θ, s)
    reward_to_go(τ, j) = sum(r*γ^(k-1) for (k, (s, a, r)) in enumerate(τ[j:end]))
    function td_residual(τ, j)
        step, next = τ[j], τ[j+1]
        return step[3] + γ*U(ϕ, next[1]) - U(ϕ, step[1])
    end
    function advantage(τ, j)
        return sum((γ*λ)^(i-1)*td_residual(τ, j + i - 1) for i in 1:(M.d - j))
    end
    function actor_grad(τ)
        return sum(
            M.∇logπ(θ, a, s)*advantage(τ, j)*γ^(j-1)
            for (j, (s, a, r)) in enumerate(τ[1:end-1])
        )
    end
    function critic_grad(τ)
        return sum(
            (U(ϕ, s) - reward_to_go(τ, j))*M.∇U(ϕ, s)
            for (j, (s, a, r)) in enumerate(τ)
        )
    end
    trajectories = [simulate(M.𝒫, rand(M.b), policy, M.d) for _ in 1:M.m]
    actor = mean(actor_grad(τ) for τ in trajectories)
    critic = mean(critic_grad(τ) for τ in trajectories)
    return actor, critic
end
6.3 确定性策略梯度
# Settings for the deterministic policy gradient estimate.
struct DeterministicPolicyGradient
𝒫 # problem; its discount factor γ is read and it is passed to simulate
b # initial state distribution; one state is drawn with rand(b) per trajectory
d # depth handed to simulate
m # number of samples (trajectories) averaged
∇π # gradient of deterministic policy π(θ, s), called as ∇π(θ, s)
Q # parameterized value function Q(ϕ,s,a)
∇Qϕ # gradient of value function with respect to ϕ, called as ∇Qϕ(ϕ,s,a)
∇Qa # gradient of value function with respect to a, called as ∇Qa(ϕ,s,a)
σ # policy noise; scales the randn() draw added to the action during simulation
end
"""
    gradient(M::DeterministicPolicyGradient, π, θ, ϕ)

Return a tuple `(actor_gradient, critic_gradient)` averaged over `M.m` trajectories
simulated with a noise-perturbed version of the deterministic policy.

The actor term sums `∇π(θ, s) * ∇Qa(ϕ, s, π(θ, s))` over all steps, scaled by
`γ^(j-1)`, evaluating the action gradient at the noise-free action. The critic term
sums, over every step that has a successor, the temporal-difference error times
`γ*∇Qϕ(ϕ, s′, a′) - ∇Qϕ(ϕ, s, a)`, where `a′` is the noise-free action at the next state.
"""
function gradient(M::DeterministicPolicyGradient, π, θ, ϕ)
    Q, ∇Qϕ, γ = M.Q, M.∇Qϕ, M.𝒫.γ
    # Exploration policy: a single randn() draw scaled by σ is added to the action.
    noisy_policy(s) = π(θ, s) + M.σ*randn()*I
    function actor_grad(τ)
        return sum(
            M.∇π(θ, s)*M.∇Qa(ϕ, s, π(θ, s))*γ^(j-1)
            for (j, (s, a, r)) in enumerate(τ)
        )
    end
    function critic_term(τ, j)
        s, a, r = τ[j]
        s′ = τ[j+1][1]
        a′ = π(θ, s′)
        td_error = r + γ*Q(ϕ, s′, a′) - Q(ϕ, s, a)
        return td_error*(γ*∇Qϕ(ϕ, s′, a′) - ∇Qϕ(ϕ, s, a))
    end
    critic_grad(τ) = sum(critic_term(τ, j) for j in 1:length(τ)-1)
    trajectories = [simulate(M.𝒫, rand(M.b), noisy_policy, M.d) for _ in 1:M.m]
    actor = mean(actor_grad(τ) for τ in trajectories)
    critic = mean(critic_grad(τ) for τ in trajectories)
    return actor, critic
end
在线方法,如蒙特卡罗树搜索,可用于指导策略和值函数估计的优化。
总结
到目前为止,在序列决策问题的讨论中,都假设转移模型和奖励模型是已知的。然而,在许多问题中,这些模型并不能确切地获知,代理必须通过经验来学习如何行动。解决存在模型不确定性的此类问题是强化学习领域的主题,也是接下来讨论的重点。