三、序列问题(2)
上文中提及的精确解方法适用于小型离散问题,对于较大状态空间的问题,计算精确解需要极大的内存量,因而考虑近似解的方法。常使用approximate dynamic programming的方法去寻求近似解,进而使用在线方法实现实时计算。
2. 近似值函数
2.1 参数化表示
记值函数的参数化表示为$\mathcal{U}_{\theta} (s)$。
"""
    ApproximateValueIteration(Uθ, S, k_max)

Settings for approximate value iteration: the parameterized value function to
refine, the states at which backups are computed, and the iteration budget.
"""
struct ApproximateValueIteration
    # initial parameterized value function; it must support fit!
    Uθ
    # set of discrete states at which backups are performed
    S
    # maximum number of iterations
    k_max
end
"""
    solve(M::ApproximateValueIteration, 𝒫::MDP)

Run `M.k_max` rounds of "back up at every state in `M.S`, then refit `M.Uθ`",
and return a `ValueFunctionPolicy` built from the problem and the fitted
value function. `M.Uθ` is updated in place through `fit!`.
"""
function solve(M::ApproximateValueIteration, 𝒫::MDP)
    value_fn = M.Uθ
    states = M.S
    for _ in 1:M.k_max
        # Backed-up utilities at the sample states become the regression targets
        targets = [backup(𝒫, value_fn, s) for s in states]
        fit!(value_fn, states, targets)
    end
    return ValueFunctionPolicy(𝒫, value_fn)
end
接下来提及的所有参数表示均可与上述逼近算法一起使用,且参数表示需要支持$\mathcal{U}_{\theta}$的计算以及$\mathcal{U}_{\theta}$与$S$中各点效用估计的拟合。
参数化表示分为两类:
- 局部近似方法,其中$\theta$对应于$S$中状态的值。
- 全局近似方法,其中$\theta$与$S$中状态的值不直接相关。
但两者本质上都可以视为一个线性函数逼近,即$\mathcal{U}_{\theta}(s) = \theta^{\rm T} \beta(s)$。
2.2 最近邻方法
"""
    NearestNeighborValueFunction(k, d, S, θ)

Value function approximated from the values stored at the nearest states in `S`.
Mutable so that `θ` can be replaced when the function is refitted.
"""
mutable struct NearestNeighborValueFunction
    # number of neighbors
    k
    # distance function d(s, s′)
    d
    # set of discrete states
    S
    # vector of values at the states in S
    θ
end
# Evaluate the approximation at s: the mean of the stored values θ at the
# Uθ.k states of Uθ.S with the smallest distance to s.
function (Uθ::NearestNeighborValueFunction)(s)
    order = sortperm([Uθ.d(s, s′) for s′ in Uθ.S])
    nearest = order[1:Uθ.k]
    return mean(Uθ.θ[i] for i in nearest)
end
"""
    fit!(Uθ::NearestNeighborValueFunction, S, U)

Replace the stored values `Uθ.θ` with `U` and return `Uθ`.
The argument `S` is accepted but not used.
"""
function fit!(Uθ::NearestNeighborValueFunction, S, U)
    Uθ.θ = U  # the parameters are simply the utilities themselves
    return Uθ
end
2.3 核光滑方法
"""
    LocallyWeightedValueFunction(k, S, θ)

Value function approximated by kernel-weighted averaging of the values stored
at the states in `S`. Mutable so that `θ` can be replaced when refitted.
"""
mutable struct LocallyWeightedValueFunction
    # kernel function k(s, s′)
    k
    # set of discrete states
    S
    # vector of values at the states in S
    θ
end
# Evaluate the approximation at s: kernel values between s and every state in
# Uθ.S are scaled to unit 1-norm and used as weights on the stored values θ.
function (Uθ::LocallyWeightedValueFunction)(s)
    kernel_values = [Uθ.k(s, s′) for s′ in Uθ.S]
    weights = normalize(kernel_values, 1)
    return dot(Uθ.θ, weights)
end
"""
    fit!(Uθ::LocallyWeightedValueFunction, S, U)

Replace the stored values `Uθ.θ` with `U` and return `Uθ`.
The argument `S` is accepted but not used.
"""
function fit!(Uθ::LocallyWeightedValueFunction, S, U)
    Uθ.θ = U  # the parameters are simply the utilities themselves
    return Uθ
end
2.4 线性插值
"""
    MultilinearValueFunction(o, δ, θ)

Value function defined by multilinear interpolation over a regular grid.
Mutable so that `θ` can be replaced when refitted.
"""
mutable struct MultilinearValueFunction
    # position of the lower-left corner of the grid
    o
    # vector of cell widths
    δ
    # values at the grid states (indexed with one index per dimension)
    θ
end
# Evaluate the multilinear interpolant at s by summing, over all 2^d vertices of
# the enclosing grid cell, the vertex value times the product of its
# per-dimension weights.
function (Uθ::MultilinearValueFunction)(s)
    origin, widths, grid = Uθ.o, Uθ.δ, Uθ.θ
    # Position of s in units of cell widths, measured from the grid origin
    coords = (s - origin) ./ widths
    # One-based index of the cell's lower-left vertex, capped so that index + 1
    # stays within size(grid)
    lower = min.(floor.(Int, coords) .+ 1, size(grid) .- 1)
    corner = similar(lower)
    ndim = length(s)
    total = 0.0
    for mask in 0:2^ndim-1
        w = 1.0
        for j in 1:ndim
            # Bit j of mask picks the upper (set) or lower (clear) vertex
            # along dimension j
            if mask & (1 << (j-1)) > 0
                corner[j] = lower[j] + 1
                w *= coords[j] - lower[j] + 1
            else
                corner[j] = lower[j]
                w *= lower[j] - coords[j]
            end
        end
        total += grid[corner...]*w
    end
    return total
end
"""
    fit!(Uθ::MultilinearValueFunction, S, U)

Replace the stored grid values `Uθ.θ` with `U` and return `Uθ`.
The argument `S` is accepted but not used.
"""
function fit!(Uθ::MultilinearValueFunction, S, U)
    Uθ.θ = U  # the parameters are simply the utilities themselves
    return Uθ
end
2.5 单纯形插值
"""
    SimplexValueFunction(o, δ, θ)

Value function defined by simplex interpolation over a regular grid.
Mutable so that `θ` can be replaced when refitted.
"""
mutable struct SimplexValueFunction
    # position of the lower-left corner of the grid
    o
    # vector of cell widths
    δ
    # values at the grid states (indexed with one index per dimension)
    θ
end
# Evaluate the simplex interpolant at s. Starting from the upper-right vertex
# of the enclosing cell, the index is decremented one dimension at a time in
# order of increasing in-cell coordinate; each visited vertex gets the
# increment in that coordinate as weight, and the last vertex gets the rest.
function (Uθ::SimplexValueFunction)(s)
    origin, widths, grid = Uθ.o, Uθ.δ, Uθ.θ
    coords = (s - origin) ./ widths
    # One-based index of the cell's upper-right vertex
    idx = min.(floor.(Int, coords) .+ 1, size(grid) .- 1) .+ 1
    # Position of s within the cell, in cell widths from the lower-left vertex
    local_pos = (s - (origin + widths .* (idx .- 2))) ./ widths
    total = 0.0
    used = 0.0  # weight handed out so far
    for j in sortperm(local_pos)  # increasing order
        w = local_pos[j] - used
        total += w*grid[idx...]
        idx[j] -= 1
        used += w
    end
    total += (1 - used)*grid[idx...]
    return total
end
"""
    fit!(Uθ::SimplexValueFunction, S, U)

Replace the stored grid values `Uθ.θ` with `U` and return `Uθ`.
The argument `S` is accepted but not used.
"""
function fit!(Uθ::SimplexValueFunction, S, U)
    Uθ.θ = U  # the parameters are simply the utilities themselves
    return Uθ
end
2.6 线性回归与神经网络回归
下面介绍全局方法。线性回归需要一组基函数,并将值函数表示为这些基函数的线性组合(基函数本身不必是线性的,模型只对参数$\theta$线性),如下:
"""
    LinearRegressionValueFunction(β, θ)

Value function represented as the inner product of a basis vector `β(s)` with a
parameter vector `θ`. Mutable so that `θ` can be replaced when refitted.
"""
mutable struct LinearRegressionValueFunction
    # basis vector function
    β
    # vector of parameters
    θ
end
# Evaluate the approximation at s as the dot product of the basis vector β(s)
# with the parameter vector θ.
function (Uθ::LinearRegressionValueFunction)(s)
    features = Uθ.β(s)
    return dot(features, Uθ.θ)
end
"""
    fit!(Uθ::LinearRegressionValueFunction, S, U)

Refit the parameters: stack the basis vectors of the states in `S` as rows of
a matrix, multiply its pseudoinverse by `U`, and store the result in `Uθ.θ`.
Returns `Uθ`.
"""
function fit!(Uθ::LinearRegressionValueFunction, S, U)
    columns = [Uθ.β(s) for s in S]
    # hcat places each basis vector in a column; the adjoint turns them into rows
    X = adjoint(hcat(columns...))
    Uθ.θ = pinv(X) * U
    return Uθ
end
神经网络回归不必按照线性回归的要求构造一组适当的基函数。相反,使用神经网络来表示值函数。
3. 在线规划
3.1 滚动时域规划(Receding Horizon Planning)
预测控制的优化不是一次离线进行,而是随着采样时刻的前进反复地在线进行,故而该方法面临着确定滚动深度的问题。这种优化虽然得不到理想的全局最优解,但是反复对每一采样时刻的偏差进行优化计算,将可及时地校正控制过程中出现的各种复杂情况。
3.2 Lookahead with Rollouts
"""
    RolloutLookahead(𝒫, π, d)

Online policy that estimates utilities with rollouts of a fixed depth.
"""
struct RolloutLookahead
    # problem
    𝒫
    # rollout policy
    π
    # rollout depth
    d
end
"""
    randstep(𝒫::MDP, s, a)

Return the result of calling the problem's `TR` field on `(s, a)`; callers in
this file destructure it as a `(next state, reward)` pair.
"""
function randstep(𝒫::MDP, s, a)
    return 𝒫.TR(s, a)
end
"""
    rollout(𝒫, s, π, d)

Follow policy `π` from state `s` for `d` steps, sampling each transition with
`randstep`, and return the accumulated reward discounted by `𝒫.γ`. The first
reward is undiscounted. Returns `0.0` when `d` is less than 1.
"""
function rollout(𝒫, s, π, d)
    total = 0.0
    state = s
    for step in 1:d
        action = π(state)
        state, reward = randstep(𝒫, state, action)
        total += 𝒫.γ^(step-1) * reward
    end
    return total
end
# Choose an action at s: take the greedy result with respect to a utility
# estimate obtained from one rollout of the stored policy π.π to depth π.d,
# and return its action field.
function (π::RolloutLookahead)(s)
    U(s) = rollout(π.𝒫, s, π.π, π.d)
    # The original line ended in a dangling `.`, which does not parse; return
    # the action field, as the other greedy-based policies in this file do.
    return greedy(π.𝒫, U, s).a
end
3.3 正向搜索(Forward Search)
"""
    ForwardSearch(𝒫, d, U)

Online policy that searches all action sequences to a fixed depth.
"""
struct ForwardSearch
    # problem
    𝒫
    # search depth
    d
    # value function applied at depth d
    U
end
"""
    forward_search(𝒫, s, d, U)

Depth-limited search from state `s`. Returns a named tuple `(a, u)` holding
the best action found and its value. At depth `d ≤ 0` the result is
`(a=nothing, u=U(s))`. Otherwise every action in `𝒫.𝒜` is scored with
`lookahead`, using the value of a depth `d-1` search as the successor utility.
Ties keep the earlier action.
"""
function forward_search(𝒫, s, d, U)
    d ≤ 0 && return (a=nothing, u=U(s))
    # Utility of a successor state: value of searching one level shallower
    next_value(s′) = forward_search(𝒫, s′, d-1, U).u
    best = (a=nothing, u=-Inf)
    for a in 𝒫.𝒜
        candidate = lookahead(𝒫, next_value, s, a)
        if candidate > best.u
            best = (a=a, u=candidate)
        end
    end
    return best
end
# Choose an action at s: run forward_search with the stored problem, depth and
# leaf value function, and return the action of the result.
function (π::ForwardSearch)(s)
    result = forward_search(π.𝒫, s, π.d, π.U)
    return result.a
end
3.4 分支定界方法(Branch and Bound)
"""
    BranchAndBound(𝒫, d, Ulo, Qhi)

Online policy that searches to a fixed depth and prunes with value bounds.
"""
struct BranchAndBound
    # problem
    𝒫
    # search depth
    d
    # lower bound on the value function at depth d
    Ulo
    # upper bound on the action value function
    Qhi
end
"""
    branch_and_bound(𝒫, s, d, Ulo, Qhi)

Depth-limited search from state `s` with pruning. Returns a named tuple
`(a, u)`. At depth `d ≤ 0` the result is `(a=nothing, u=Ulo(s))`. Otherwise
the actions in `𝒫.𝒜` are visited in decreasing order of `Qhi(s, a)`, and the
search stops as soon as an action's upper bound falls below the best value
found so far.
"""
function branch_and_bound(𝒫, s, d, Ulo, Qhi)
    d ≤ 0 && return (a=nothing, u=Ulo(s))
    # Utility of a successor state: value of searching one level shallower
    next_value(s′) = branch_and_bound(𝒫, s′, d-1, Ulo, Qhi).u
    ordered = sort(𝒫.𝒜, by=a->Qhi(s,a), rev=true)
    best = (a=nothing, u=-Inf)
    for a in ordered
        # Bounds only decrease from here on, so no remaining action can win
        Qhi(s, a) < best.u && return best
        candidate = lookahead(𝒫, next_value, s, a)
        if candidate > best.u
            best = (a=a, u=candidate)
        end
    end
    return best
end
# Choose an action at s: run branch_and_bound with the stored problem, depth
# and bounds, and return the action of the result.
function (π::BranchAndBound)(s)
    result = branch_and_bound(π.𝒫, s, π.d, π.Ulo, π.Qhi)
    return result.a
end
3.5 稀疏采样
"""
    SparseSampling(𝒫, d, m, U)

Online policy that searches to a fixed depth using sampled transitions.
"""
struct SparseSampling
    # problem
    𝒫
    # search depth
    d
    # number of samples per action
    m
    # value function applied at depth d
    U
end
"""
    sparse_sampling(𝒫, s, d, m, U)

Depth-limited search from state `s` using `m` sampled transitions per action.
Returns a named tuple `(a, u)`. At depth `d ≤ 0` the result is
`(a=nothing, u=U(s))`. Otherwise each action in `𝒫.𝒜` is scored by the
average, over `m` calls to `randstep`, of the reward plus `𝒫.γ` times the
value of a depth `d-1` search from the sampled state. Ties keep the earlier
action.
"""
function sparse_sampling(𝒫, s, d, m, U)
    d ≤ 0 && return (a=nothing, u=U(s))
    best = (a=nothing, u=-Inf)
    for a in 𝒫.𝒜
        estimate = 0.0
        for _ in 1:m
            s′, r = randstep(𝒫, s, a)
            future = sparse_sampling(𝒫, s′, d-1, m, U).u
            estimate += (r + 𝒫.γ*future) / m
        end
        if estimate > best.u
            best = (a=a, u=estimate)
        end
    end
    return best
end
# Choose an action at s: run sparse_sampling with the stored problem, depth,
# sample count and leaf value function, and return the action of the result.
function (π::SparseSampling)(s)
    result = sparse_sampling(π.𝒫, s, π.d, π.m, π.U)
    return result.a
end
3.6 蒙特卡罗树搜索
"""
    MonteCarloTreeSearch(𝒫, N, Q, d, m, c, U)

State and settings for Monte Carlo tree search.
"""
struct MonteCarloTreeSearch
    # problem
    𝒫
    # visit counts
    N
    # action value estimates
    Q
    # depth
    d
    # number of simulations
    m
    # exploration constant
    c
    # value function estimate
    U
end
# Choose an action at s: run π.m simulations from s, then return the action
# in π.𝒫.𝒜 with the largest entry π.Q[(s, a)].
function (π::MonteCarloTreeSearch)(s)
    for _ in 1:π.m
        simulate!(π, s)
    end
    action_value(a) = π.Q[(s, a)]
    return argmax(action_value, π.𝒫.𝒜)
end
3.7 启发式搜索
"""
    HeuristicSearch(𝒫, Uhi, d, m)

Online policy that refines an upper-bound value function with greedy
simulations.
"""
struct HeuristicSearch
    # problem
    𝒫
    # upper bound on the value function
    Uhi
    # depth of each simulation
    d
    # number of simulations
    m
end
"""
    simulate!(π::HeuristicSearch, U, s)

Run one greedy simulation of `π.d` steps from `s`, updating `U` in place.
At each step the greedy action and value at the current state are computed,
the value is written to `U` at that state, and the next state is sampled
from `𝒫.T`. Returns `nothing`.
"""
function simulate!(π::HeuristicSearch, U, s)
    problem = π.𝒫
    state = s
    for _ in 1:π.d
        action, value = greedy(problem, U, state)
        U[state] = value
        state = rand(problem.T(state, action))
    end
    return nothing
end
# Choose an action at s: initialize a value table from the upper bound π.Uhi
# over all states in π.𝒫.𝒮, run π.m simulations from s to refine it, and
# return the greedy action at s.
function (π::HeuristicSearch)(s)
    U = [π.Uhi(s′) for s′ in π.𝒫.𝒮]
    for _ in 1:π.m
        simulate!(π, U, s)
    end
    decision = greedy(π.𝒫, U, s)
    return decision.a
end
3.8 标签启发式搜索
"""
    LabeledHeuristicSearch(𝒫, Uhi, d, δ)

Online policy based on heuristic search that tracks which states are solved.
"""
struct LabeledHeuristicSearch
    # problem
    𝒫
    # upper bound on the value function
    Uhi
    # depth
    d
    # gap threshold
    δ
end
# Choose an action at s: initialize a value table from the upper bound π.Uhi,
# run simulations from s until s is in the solved set, and return the greedy
# action at s.
function (π::LabeledHeuristicSearch)(s)
    # The state space is read through π.𝒫: no variable named 𝒫 is bound inside
    # this method, so the bare `𝒫.𝒮` of the original would fail (or silently
    # pick up an unrelated global).
    U = [π.Uhi(s′) for s′ in π.𝒫.𝒮]
    solved = Set()
    while s ∉ solved
        simulate!(π, U, solved, s)
    end
    return greedy(π.𝒫, U, s).a
end
3.9 开环规划/model predictive control
开环规划可提供最佳闭环规划的满意近似,同时通过避免对未来信息的获取进行推理提高了计算效率。过程可表示为$$\max_{a_{1:d}} \mathcal{U}(a_{1:d}),$$其中$\mathcal{U}(a_{1:d})$是执行动作序列$a_{1:d}$时的期望回报。
- 确定性模型预测控制
$$\begin{align*} & \max_{a_{1:d}, s_{2:d}} \qquad \sum_{t = 1}^{d} \gamma^{t} R(s_{t}, a_{t}) \\ & {\rm s.t.} \qquad \qquad s_{t+1} = T(s_{t}, a_{t}), \ t \in 1:d-1. \end{align*}$$
- 鲁棒模型预测控制
$$\begin{align*} & \max_{a_{1:d}} \qquad \min_{s_{2:d}} \sum_{t = 1}^{d} \gamma^{t} R(s_{t}, a_{t}) \\ & {\rm s.t.} \qquad \quad s_{t+1} \in T(s_{t}, a_{t}), \ t \in 1:d-1. \end{align*}$$
- 多预测模型预测控制
$$\begin{align*} & \max_{a_{1:d}^{(1:m)}, s_{2:d}^{(1:m)}} \qquad \frac{1}{m} \sum_{i=1}^{m}\sum_{k = 1}^{d} \gamma^{k} R(s_{k}^{(i)}, a_{k}^{(i)}) \\ & {\rm s.t.} \qquad \qquad s_{k+1}^{(i)} = T_{i}(s_{k}^{(i)}, a_{k}^{(i)}), \ k \in 1:d-1, i \in 1:m, \\ & \quad \qquad \qquad \ \ a_{1}^{(i)} = a_{1}^{(j)}, \qquad \qquad i, j \in 1:m. \end{align*}$$