III. Model Uncertainty (2)
In contrast to model-based methods, model-free methods do not construct explicit representations of the transition and reward functions; instead they model the action value function directly. Going further, imitation learning is then considered as a way to reconstruct the reward function.
3. Model-Free Methods
3.1 Incremental Estimation of the Mean
Many model-free methods incrementally estimate the action value function $Q(s,a)$ from samples. Consider first the running mean of a single variable $X$ over $m$ samples:$$\hat{x}_{m} = \frac{1}{m} \sum_{i=1}^{m} x^{(i)}.$$The corresponding incremental update is:$$\hat{x}_{m} = \hat{x}_{m - 1} + \frac{1}{m} \left(x^{(m)} - \hat{x}_{m - 1}\right).$$More generally, introducing a learning rate $\alpha(m)$:$$\hat{x}_{m} = \hat{x}_{m - 1} + \alpha(m) \left(x^{(m)} - \hat{x}_{m - 1}\right).$$To guarantee convergence, $\alpha(m)$ must satisfy $\sum_{m} \alpha(m) = \infty$ and $\sum_{m} \alpha^{2}(m) < \infty$.
mutable struct IncrementalEstimate
μ # mean estimate
α # learning rate function
m # number of updates
end
function update!(model::IncrementalEstimate, x)
model.m += 1
model.μ += model.α(model.m) * (x - model.μ)
return model
end
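As a quick sanity check, here is a sketch of using this estimator on a hypothetical stream of samples; with α(m) = 1/m the estimate equals the exact running mean, while a constant α would weight recent samples more heavily (useful for nonstationary targets).
# Hypothetical data stream; with α(m) = 1/m this reproduces the sample mean.
est = IncrementalEstimate(0.0, m -> 1/m, 0)
for x in randn(1000)
    update!(est, x)
end
est.μ   # ≈ mean of the 1000 samples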
3.2 Q-Learning
Q-learning applies this incremental estimation to the action value function: after observing a transition $(s, a, r, s^{\prime})$, $Q(s,a)$ is updated toward $r + \gamma \max_{a^{\prime}} Q(s^{\prime}, a^{\prime})$.
mutable struct QLearning
𝒮 # state space (assumes 1:nstates)
𝒜 # action space (assumes 1:nactions)
γ # discount
Q # action value function
α # learning rate
end
lookahead(model::QLearning, s, a) = model.Q[s,a]
function update!(model::QLearning, s, a, r, s′)
γ, Q, α = model.γ, model.Q, model.α
Q[s,a] += α*(r + γ*maximum(Q[s′,:]) - Q[s,a])
return model
end
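The struct above only implements the update rule; exploration and interaction with the environment live outside it. Below is a minimal driver sketch, assuming a hypothetical env_step(s, a) function that returns a reward and next state, with ε-greedy action selection; since the Sarsa and SarsaLambda models that follow expose the same lookahead/update! interface, the same loop works for them as well.
# Hypothetical driver loop; env_step(s, a) -> (r, s′) is assumed to simulate
# the environment, and model.𝒜 is assumed to be 1:nactions as in the struct.
function run_episode!(model, env_step, s; d=100, ε=0.1)
    for t in 1:d
        # ε-greedy exploration over the current value estimates
        a = rand() < ε ? rand(model.𝒜) :
            argmax(a -> lookahead(model, s, a), model.𝒜)
        r, s′ = env_step(s, a)
        update!(model, s, a, r, s′)
        s = s′
    end
    return model
end
# e.g. model = QLearning(1:10, 1:2, 0.95, zeros(10, 2), 0.1)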
3.3 Sarsa
Sarsa is an alternative to Q-learning; its name comes from the fact that each update of the Q function requires $(s, a, r, s^{\prime}, a^{\prime})$: the value of the action actually taken in the next state is used in place of the maximum.
mutable struct Sarsa
𝒮 # state space (assumes 1:nstates)
𝒜 # action space (assumes 1:nactions)
γ # discount
Q # action value function
α # learning rate
ℓ # most recent experience tuple (s,a,r)
end
lookahead(model::Sarsa, s, a) = model.Q[s,a]
function update!(model::Sarsa, s, a, r, s′)
if model.ℓ != nothing
γ, Q, α, ℓ = model.γ, model.Q, model.α, model.ℓ
model.Q[ℓ.s,ℓ.a] += α*(ℓ.r + γ*Q[s,a] - Q[ℓ.s,ℓ.a])
end
model.ℓ = (s=s, a=a, r=r)
return model
end
3.4 Eligibility Traces
One drawback of Q-learning and Sarsa is that learning can be slow, especially when rewards are sparse. Both can be modified with eligibility traces, which propagate reward backward to the states and actions that led to it, improving learning efficiency.
mutable struct SarsaLambda
𝒮 # state space (assumes 1:nstates)
𝒜 # action space (assumes 1:nactions)
γ # discount
Q # action value function
N # trace
α # learning rate
λ # trace decay rate
ℓ # most recent experience tuple (s,a,r)
end
lookahead(model::SarsaLambda, s, a) = model.Q[s,a]
function update!(model::SarsaLambda, s, a, r, s′)
if model.ℓ != nothing
γ, λ, Q, α, ℓ = model.γ, model.λ, model.Q, model.α, model.ℓ
model.N[ℓ.s,ℓ.a] += 1
δ = ℓ.r + γ*Q[s,a] - Q[ℓ.s,ℓ.a]
for s in model.𝒮
for a in model.𝒜
model.Q[s,a] += α*δ*model.N[s,a]
model.N[s,a] *= γ*λ
end
end
else
model.N[:,:] .= 0.0
end
model.ℓ = (s=s, a=a, r=r)
return model
end
Special care must be taken when applying this method to off-policy algorithms such as Q-learning, because they attempt to learn the value of the optimal policy rather than that of the policy being followed.
3.5 Reward Shaping
Reward shaping [1] can also improve learning, and is particularly useful for problems with sparse rewards. Concretely, the traditional reward function $R(s,a,s^{\prime})$ is replaced by $R(s,a,s^{\prime}) + F(s,a,s^{\prime})$, where the second term is an additional, hand-designed reward.
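One well-known choice, from the shaping result of Ng et al. [1], is potential-based shaping, $F(s,a,s^{\prime}) = \gamma\Phi(s^{\prime}) - \Phi(s)$, which leaves the optimal policy unchanged. A minimal sketch, assuming a hypothetical one-dimensional chain problem with a goal state s_goal and using the negative distance to the goal as the potential:
# Potential-based shaping: F(s, a, s′) = γ*Φ(s′) - Φ(s). Per Ng et al. [1],
# adding F of this form does not change the optimal policy.
const s_goal = 10                # hypothetical goal state on a 1-D chain
Φ(s) = -abs(s - s_goal)          # potential: negative distance to the goal
shaped_reward(R, s, a, s′; γ=0.95) = R(s, a, s′) + γ*Φ(s′) - Φ(s)
Shaping terms not of this potential-based form can, in general, alter the optimal policy.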
3.6 Action Value Function Approximation
When the state space is large or continuous, the action value function can be represented by a parametric approximation $Q_{\theta}(s,a)$, whose parameters are updated by gradient descent on the temporal difference error, as in the following listing.
struct GradientQLearning
𝒜 # action space (assumes 1:nactions)
γ # discount
Q # parameterized action value function Q(θ,s,a)
∇Q # gradient of action value function
θ # action value function parameter
α # learning rate
end
function lookahead(model::GradientQLearning, s, a)
return model.Q(model.θ, s,a)
end
function update!(model::GradientQLearning, s, a, r, s′)
𝒜, γ, Q, θ, α = model.𝒜, model.γ, model.Q, model.θ, model.α
u = maximum(Q(θ,s′,a′) for a′ in 𝒜)
Δ = (r + γ*u - Q(θ,s,a))*model.∇Q(θ,s,a)
θ[:] += α*scale_gradient(Δ, 1)
return model
end
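The scale_gradient call above is not defined in this post; a common choice, assumed here, is to clip the gradient's L2 norm. A minimal usage sketch with that assumption and a hypothetical linear parameterization $Q(\theta,s,a) = \theta^{\top}\beta(s,a)$ over hand-picked basis features:
using LinearAlgebra

# Assumed helper: rescale the gradient so its L2 norm is at most L.
scale_gradient(∇, L) = min(L / norm(∇), 1) * ∇

# Hypothetical linear parameterization over basis features β(s, a).
β(s, a) = [s, a, s*a, 1.0]
Q(θ, s, a) = dot(θ, β(s, a))
∇Q(θ, s, a) = β(s, a)

θ = zeros(4)
model = GradientQLearning(1:2, 0.95, Q, ∇Q, θ, 0.01)
update!(model, 1.0, 1, 0.5, 2.0)   # one hypothetical transition (s, a, r, s′)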
The catastrophic forgetting experienced by Q-learning and Sarsa can be mitigated through experience replay.
struct ReplayGradientQLearning
𝒜 # action space (assumes 1:nactions)
γ # discount
Q # parameterized action value function Q(θ,s,a)
∇Q # gradient of action value function
θ # action value function parameter
α # learning rate
buffer # circular memory buffer
m # number of steps between gradient updates
m_grad # batch size
end
function lookahead(model::ReplayGradientQLearning, s, a)
return model.Q(model.θ, s,a)
end
function update!(model::ReplayGradientQLearning, s, a, r, s′)
𝒜, γ, Q, θ, α = model.𝒜, model.γ, model.Q, model.θ, model.α
buffer, m, m_grad = model.buffer, model.m, model.m_grad
if isfull(buffer)
U(s) = maximum(Q(θ,s,a) for a in 𝒜)
∇Q(s,a,r,s′) = (r + γ*U(s′) - Q(θ,s,a))*model.∇Q(θ,s,a)
Δ = mean(∇Q(s,a,r,s′) for (s,a,r,s′) in rand(buffer, m_grad))
θ[:] += α*scale_gradient(Δ, 1)
for i in 1:m # discard oldest experiences
popfirst!(buffer)
end
else
push!(buffer, (s,a,r,s′))
end
return model
end
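The buffer field is assumed to behave like DataStructures.jl's CircularBuffer, which provides isfull, push!, popfirst!, and indexing (so rand can sample from it); update! above also relies on mean from Statistics. A construction sketch, reusing the hypothetical linear Q, ∇Q, and θ from the previous sketch:
using DataStructures   # CircularBuffer: isfull, push!, popfirst!, getindex
using Statistics       # mean, used in update! above

# Q, ∇Q, θ as in the linear-parameterization sketch above (assumptions).
buffer = CircularBuffer{Tuple}(10_000)   # hypothetical replay capacity
model = ReplayGradientQLearning(1:2, 0.95, Q, ∇Q, θ, 0.01, buffer, 100, 32)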
4. Imitation Learning
4.1 Behavioral Cloning
A simple form of imitation learning is to treat it as a supervised learning problem; this approach is called behavioral cloning.
struct BehavioralCloning
α # step size
k_max # number of iterations
∇logπ # log likelihood gradient
end
function optimize(M::BehavioralCloning, D, θ)
α, k_max, ∇logπ = M.α, M.k_max, M.∇logπ
for k in 1:k_max
∇ = mean(∇logπ(θ, a, s) for (s,a) in D)
θ += α*∇
end
return θ
end
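A minimal usage sketch, assuming a hypothetical softmax policy over two actions with state features $x(s)$, whose log-likelihood gradient has the usual indicator-minus-probability form; the expert data set D here is made up for illustration:
using Statistics        # optimize above relies on mean
using LinearAlgebra

# Hypothetical softmax policy over actions 1:2 with state features x(s).
x(s) = [s, 1.0]
function ∇logπ(θ, a, s)            # θ is a 2×2 matrix of logit weights
    p = exp.(θ * x(s))
    p ./= sum(p)                   # action probabilities
    ∇ = -p * x(s)'                 # gradient of the log-normalizer term
    ∇[a, :] .+= x(s)               # plus the feature of the chosen action
    return ∇                       # row b equals ([a == b] - p_b) * x(s)'
end

D = [(0.5, 1), (1.5, 2), (2.0, 2)] # hypothetical expert (s, a) pairs
M = BehavioralCloning(0.1, 100, ∇logπ)
θ = optimize(M, D, zeros(2, 2))    # maximum likelihood fit to the expert data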
The closer the expert demonstrations are to optimal, the better the resulting behavioral cloning policy performs, but the approach suffers from cascading errors. Although behavioral cloning is attractive for its simplicity, cascading errors cause it to perform poorly on many problems, particularly when the policy must be run over long horizons.
4.2 Sequential Interactive Demonstration
One way to address cascading errors is to use additional expert input to correct the trained policy as it runs, known as sequential interactive demonstration.
Data set aggregation (DAgger): alternates between training a policy on the accumulated data set and using the trained policy to generate new states for the expert to label.
struct DataSetAggregation
    𝒫     # problem with unknown reward function
    bc    # behavioral cloning struct
    k_max # number of iterations
    m     # number of rollouts per iteration
    d     # rollout depth
    b     # initial state distribution
    πE    # expert
    πθ    # parameterized policy
end
function optimize(M::DataSetAggregation, D, θ)
    𝒫, bc, k_max, m = M.𝒫, M.bc, M.k_max, M.m
    d, b, πE, πθ = M.d, M.b, M.πE, M.πθ
    θ = optimize(bc, D, θ)
    for k in 2:k_max
        for i in 1:m
            s = rand(b)
            for j in 1:d
                push!(D, (s, πE(s)))
                a = rand(πθ(θ, s))
                s = rand(𝒫.T(s, a))
            end
        end
        θ = optimize(bc, D, θ)
    end
    return θ
end
Stochastic mixing iterative learning (SMILe): iteratively builds a policy by stochastically mixing in newly trained policies.
struct SMILe
    𝒫     # problem with unknown reward
    bc    # behavioral cloning struct
    k_max # number of iterations
    m     # number of rollouts per iteration
    d     # rollout depth
    b     # initial state distribution
    β     # mixing scalar (e.g., d^-3)
    πE    # expert policy
    πθ    # parameterized policy
end
function optimize(M::SMILe, θ)
    𝒫, bc, k_max, m = M.𝒫, M.bc, M.k_max, M.m
    d, b, β, πE, πθ = M.d, M.b, M.β, M.πE, M.πθ
    𝒜, T = 𝒫.𝒜, 𝒫.T
    θs = []
    π = s -> πE(s)
    for k in 1:k_max
        # execute latest π to get new data set D
        D = []
        for i in 1:m
            s = rand(b)
            for j in 1:d
                push!(D, (s, πE(s)))
                a = π(s)
                s = rand(T(s, a))
            end
        end
        # train new policy classifier
        θ = optimize(bc, D, θ)
        push!(θs, θ)
        # compute a new policy mixture
        Pπ = Categorical(normalize([(1-β)^(i-1) for i in 1:k], 1))
        π = s -> begin
            if rand() < (1-β)^(k-1)
                return πE(s)
            else
                return rand(Categorical(πθ(θs[rand(Pπ)], s)))
            end
        end
    end
    Ps = normalize([(1-β)^(i-1) for i in 1:k_max], 1)
    return Ps, θs
end
4.3 Inverse Reinforcement Learning (IRL)
In IRL, the expert is assumed to be optimizing an unknown reward function; starting from the expert's data set $\mathcal{D}$, we try to infer that reward function. With a reward function in hand, the methods already discussed can be used to derive an optimal policy. Different parameterizations of the reward function lead to different concrete algorithms.
4.3.1 Maximum Margin IRL
This method seeks a policy whose binary feature frequencies match those observed in the expert data set [2].
struct InverseReinforcementLearning
𝒫 # problem
b # initial state distribution
d # depth
m # number of samples
π # parameterized policy
β # binary feature mapping
μE # expert feature expectations
RL # reinforcement learning method
ϵ # tolerance
end
function feature_expectations(M::InverseReinforcementLearning, π)
𝒫, b, m, d, β, γ = M.𝒫, M.b, M.m, M.d, M.β, M.𝒫.γ
μ(τ) = sum(γ^(k-1)*β(s, a) for (k,(s,a)) in enumerate(τ))
τs = [simulate(𝒫, rand(b), π, d) for i in 1:m]
return mean(μ(τ) for τ in τs)
end
function calc_weighting(M::InverseReinforcementLearning, μs)
μE = M.μE
k = length(μE)
model = Model(Ipopt.Optimizer)
@variable(model, t)
@variable(model, ϕ[1:k] ≥ 0)
@objective(model, Max, t)
for μ in μs
@constraint(model, ϕ⋅μE ≥ ϕ⋅μ + t)
end
@constraint(model, ϕ⋅ϕ ≤ 1)
optimize!(model)
return (value(t), value.(ϕ))
end
function calc_policy_mixture(M::InverseReinforcementLearning, μs)
μE = M.μE
k = length(μs)
model = Model(Ipopt.Optimizer)
@variable(model, λ[1:k] ≥ 0)
@objective(model, Min, (μE - sum(λ[i]*μs[i] for i in 1:k))⋅
(μE - sum(λ[i]*μs[i] for i in 1:k)))
@constraint(model, sum(λ) == 1)
optimize!(model)
return value.(λ)
end
function optimize(M::InverseReinforcementLearning, θ)
π, ϵ, RL = M.π, M.ϵ, M.RL
θs = [θ]
μs = [feature_expectations(M, s->π(θ,s))]
while true
t, ϕ = calc_weighting(M, μs)
if t ≤ ϵ
break
end
copyto!(RL.ϕ, ϕ) # R(s,a) = ϕ⋅β(s,a)
θ = optimize(RL, π, θ)
push!(θs, θ)
push!(μs, feature_expectations(M, s->π(θ,s)))
end
λ = calc_policy_mixture(M, μs)
return λ, θs
end
4.3.2 Maximum Entropy IRL
This method frames the search for the best reward parameters as a maximum likelihood estimation problem.
struct MaximumEntropyIRL
𝒫 # problem
b # initial state distribution
d # depth
π # parameterized policy π(θ,s)
Pπ # parameterized policy likelihood π(θ, a, s)
∇R # reward function gradient
RL # reinforcement learning method
α # step size
k_max # number of iterations
end
function discounted_state_visitations(M::MaximumEntropyIRL, θ)
𝒫, b, d, Pπ = M.𝒫, M.b, M.d, M.Pπ
𝒮, 𝒜, T, γ = 𝒫.𝒮, 𝒫.𝒜, 𝒫.T, 𝒫.γ
b_sk = zeros(length(𝒫.𝒮), d)
b_sk[:,1] = [pdf(b, s) for s in 𝒮]
for k in 2:d
for (si′, s′) in enumerate(𝒮)
b_sk[si′,k] = γ*sum(sum(b_sk[si,k-1]*Pπ(θ, a, s)*T(s, a, s′)
for (si,s) in enumerate(𝒮)) for a in 𝒜)
end
end
return normalize!(vec(mean(b_sk, dims=2)),1)
end
function optimize(M::MaximumEntropyIRL, D, ϕ, θ)
𝒫, π, Pπ, ∇R, RL, α, k_max = M.𝒫, M.π, M.Pπ, M.∇R, M.RL, M.α, M.k_max
𝒮, 𝒜, γ, nD = 𝒫.𝒮, 𝒫.𝒜, 𝒫.γ, length(D)
for k in 1:k_max
copyto!(RL.ϕ, ϕ) # update parameters
θ = optimize(RL, π, θ)
b = discounted_state_visitations(M, θ)
∇Rτ = τ -> sum(γ^(i-1)*∇R(ϕ,s,a) for (i,(s,a)) in enumerate(τ))
∇f = sum(∇Rτ(τ) for τ in D) - nD*sum(b[si]*sum(Pπ(θ,a,s)*∇R(ϕ,s,a)
for (ai,a) in enumerate(𝒜)) for (si, s) in enumerate(𝒮))
ϕ += α*∇f
end
return ϕ, θ
end
4.4 Generative Adversarial Imitation Learning
Generative adversarial imitation learning trains a discriminator to distinguish state-action pairs produced by the learned policy from those of the expert, while the policy is optimized to fool the discriminator, avoiding an explicit reconstruction of the reward function.
Summary
In many problems the model is not known exactly, and actions must be learned from experience, which requires balancing the exploitation of accumulated experience against exploration. Following conventional machine learning ideas, model-based and model-free methods were discussed for the case where rewards are observed. Beyond that, imitation learning methods, which assume even less prior knowledge and seem closer to practice, were discussed. At least for me, this was my first encounter with IRL.
- [1] A. Y. Ng, D. Harada, and S. Russell, "Policy invariance under reward transformations: Theory and application to reward shaping," in International Conference on Machine Learning (ICML), 1999.
- [2] P. Abbeel and A. Y. Ng, "Apprenticeship learning via inverse reinforcement learning," in International Conference on Machine Learning (ICML), 2004.