[Reading Notes] Algorithms for Decision Making (12)

Summary: We now extend the core concepts of single-agent decision making to problems involving multiagent systems, in which other agents can be modeled as potential allies or adversaries and adapted to over time.

V. Multiagent Systems (1)

We now extend the core concepts of single-agent decision making to problems involving multiagent systems. In such systems, other agents can be modeled as potential allies or adversaries, and we adjust to them over time.


1. Multiagent Reasoning

Multiagent reasoning is in essence a topic of game theory¹.


1.1 Simple Games

A simple game consists of the following components:

struct SimpleGame
    γ # discount factor
    ℐ # agents
    𝒜 # joint action space
    R # joint reward function
end

Specifically, each agent $i \in \mathcal{I}$ selects an action $a^{i} \in \mathcal{A}^{i}$ in order to maximize its own accumulated reward $r^{i}$, given by its reward component $R^{i}$. A concrete example game is sketched after the list below.

  • The joint action space $\mathcal{A} = \mathcal{A}^{1} \times \cdots \times \mathcal{A}^{k}$ contains all possible joint actions of the multiagent system.
  • A joint action is $\bm{a} = \left(a^{1}, \cdots, a^{k}\right) \in \mathcal{A}$.
  • The joint reward function $R(\bm{a})$ gives each agent's reward for a joint action.
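
As a concrete running example for the snippets in these notes, the following sketch encodes a two-agent prisoner's dilemma as a SimpleGame. The payoff values, action names, and the variable names (PD_R, 𝒫) are illustrative choices made here, not taken from the book.

PD_R = Dict(
    (:cooperate, :cooperate) => [-1.0, -1.0],
    (:cooperate, :defect)    => [-4.0,  0.0],
    (:defect,    :cooperate) => [ 0.0, -4.0],
    (:defect,    :defect)    => [-3.0, -3.0]
)

𝒫 = SimpleGame(
    0.9,                                            # discount factor γ (unused by the static solvers below)
    1:2,                                            # agent indices ℐ
    [[:cooperate, :defect], [:cooperate, :defect]], # per-agent action spaces 𝒜
    a -> PD_R[Tuple(a)]                             # joint reward R; Tuple(a) accepts tuples or vectors
)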

A joint policy $\bm{\pi}$ specifies a probability distribution over the joint actions taken by the agents. In game theory, a deterministic policy is called a pure strategy and a stochastic policy is called a mixed strategy. From the perspective of agent $i$, the utility of a joint policy $\bm{\pi}$ is $$\mathcal{U}^{i}(\bm{\pi}) = \sum_{\bm{a} \in \mathcal{A}} R^{i}(\bm{a}) \prod_{j \in \mathcal{I}} \pi^{j}(a^{j}).$$

struct SimpleGamePolicy
    p # dictionary mapping actions to probabilities
    function SimpleGamePolicy(p::Base.Generator)
        return SimpleGamePolicy(Dict(p))
    end
    
    function SimpleGamePolicy(p::Dict)
        vs = collect(values(p))
        vs ./= sum(vs)
        return new(Dict(k => v for (k,v) in zip(keys(p), vs)))
    end

    SimpleGamePolicy(ai) = new(Dict(ai => 1.0))
end

(πi::SimpleGamePolicy)(ai) = get(πi.p, ai, 0.0)   # probability that πi assigns to action ai

function (πi::SimpleGamePolicy)()
    # sample an action; SetCategorical is the categorical-distribution helper
    # defined earlier in the book's codebase (not reproduced in these notes)
    D = SetCategorical(collect(keys(πi.p)), collect(values(πi.p)))
    return rand(D)
end

joint(X) = vec(collect(product(X...)))   # all joint actions; product is Base.Iterators.product

joint(π, πi, i) = [i == j ? πi : πj for (j, πj) in enumerate(π)]   # replace agent i's policy in π by πi

function utility(𝒫::SimpleGame, π, i)
    𝒜, R = 𝒫.𝒜, 𝒫.R
    p(a) = prod(πj(aj) for (πj, aj) in zip(π, a))   # probability of joint action a under π
    return sum(R(a)[i]*p(a) for a in joint(𝒜))      # expected reward of agent i
end
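
As a quick check on the prisoner's dilemma game 𝒫 sketched in §1.1 (the uniform policies here are illustrative):

# each agent plays uniformly at random over its two actions
π_uniform = [SimpleGamePolicy(ai => 0.5 for ai in 𝒫.𝒜[i]) for i in 𝒫.ℐ]

utility(𝒫, π_uniform, 1)   # expected reward of agent 1: (-1 - 4 + 0 - 3)/4 = -2.0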

1.2 Response Models

We first consider modeling the response of a single agent $i$ given fixed policies for the other agents.

function best_response(𝒫::SimpleGame, π, i)
    U(ai) = utility(𝒫, joint(π, SimpleGamePolicy(ai), i), i)
    ai = argmax(U, 𝒫.𝒜[i])
    return SimpleGamePolicy(ai)
end

function softmax_response(𝒫::SimpleGame, π, i, λ)
    𝒜i = 𝒫.𝒜[i]
    U(ai) = utility(𝒫, joint(π, SimpleGamePolicy(ai), i), i)
    return SimpleGamePolicy(ai => exp(λ*U(ai)) for ai in 𝒜i)
end
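
For instance, with the game 𝒫 and the uniform joint policy π_uniform from above (illustrative, not from the book):

π1_best = best_response(𝒫, π_uniform, 1)         # pure policy on :defect, since defecting dominates
π1_soft = softmax_response(𝒫, π_uniform, 1, 2.0) # mixes both actions; larger λ concentrates on :defect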

1.3 Nash Equilibrium

Finding a Nash equilibrium can be formulated as the following optimization problem, whose objective attains its minimum value of zero exactly at an equilibrium:

$$\begin{align*} \min_{\pi, \mathcal{U}} \quad & \sum_{i} \left(\mathcal{U}^{i} - \mathcal{U}^{i}(\pi)\right) \\ {\rm s.t.} \quad & \mathcal{U}^{i} \geq \mathcal{U}^{i} (a^{i}, \pi^{-i}), \ \forall i, a^{i} \\ & \sum_{a^{i}} \pi^{i} (a^{i}) = 1,\ \forall i \\ & \pi^{i} (a^{i}) \geq 0, \ \forall i, a^{i} \end{align*}$$

struct NashEquilibrium end

function tensorform(𝒫::SimpleGame)
    ℐ, 𝒜, R = 𝒫.ℐ, 𝒫.𝒜, 𝒫.R
    ℐ′ = eachindex(ℐ)
    𝒜′ = [eachindex(𝒜[i]) for i in ℐ]
    R′ = [R(a) for a in joint(𝒜)]
    return ℐ′, 𝒜′, R′
end

function solve(M::NashEquilibrium, 𝒫::SimpleGame)
    ℐ, 𝒜, R = tensorform(𝒫)
    model = Model(Ipopt.Optimizer)
    @variable(model, U[ℐ])
    @variable(model, π[i=ℐ, 𝒜[i]] ≥ 0)
    @NLobjective(model, Min,
        sum(U[i] - sum(prod(π[j,a[j]] for j in ℐ) * R[y][i]
        for (y,a) in enumerate(joint(𝒜))) for i in ℐ))
    @NLconstraint(model, [i=ℐ, ai=𝒜[i]],
        U[i] ≥ sum(
            prod(j==i ? (a[j]==ai ? 1.0 : 0.0) : π[j,a[j]] for j in ℐ)
            * R[y][i] for (y,a) in enumerate(joint(𝒜))))
    @constraint(model, [i=ℐ], sum(π[i,ai] for ai in 𝒜[i]) == 1)
    optimize!(model)
    πi′(i) = SimpleGamePolicy(𝒫.𝒜[i][ai] => value(π[i,ai]) for ai in 𝒜[i])
    return [πi′(i) for i in ℐ]
end
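
The solver above relies on JuMP with the Ipopt optimizer, so both packages must be loaded before the definitions are evaluated. A usage sketch on the prisoner's dilemma game 𝒫 (variable names are mine):

using JuMP, Ipopt   # must be loaded before evaluating the solver definition above

π_nash = solve(NashEquilibrium(), 𝒫)
π_nash[1](:defect)  # expected ≈ 1.0: mutual defection is the unique Nash equilibrium of this game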

1.3.1 Correlated Equilibrium

A correlated equilibrium relaxes the Nash equilibrium: instead of independent per-agent policies, it uses a single joint distribution over joint actions such that no agent can improve its expected reward by deviating from the action recommended to it. It can be found by solving a linear program:

struct CorrelatedEquilibrium end

function solve(M::CorrelatedEquilibrium, 𝒫::SimpleGame)
    ℐ, 𝒜, R = 𝒫.ℐ, 𝒫.𝒜, 𝒫.R
    model = Model(Ipopt.Optimizer)
    @variable(model, π[joint(𝒜)] ≥ 0)
    @objective(model, Max, sum(sum(π[a]*R(a) for a in joint(𝒜))))
    @constraint(model, [i=ℐ, ai=𝒜[i], ai′=𝒜[i]],
        sum(R(a)[i]*π[a] for a in joint(𝒜) if a[i]==ai)
        ≥ sum(R(joint(a,ai′,i))[i]*π[a] for a in joint(𝒜) if a[i]==ai))
    @constraint(model, sum(π) == 1)
    optimize!(model)
    return JointCorrelatedPolicy(a => value(π[a]) for a in joint(𝒜))
end
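
The solver returns a JointCorrelatedPolicy, which is not defined in these notes. A minimal sketch, mirroring SimpleGamePolicy but mapping joint actions to probabilities (reconstructed here; the book's codebase defines its own version):

struct JointCorrelatedPolicy
    p # dictionary mapping joint actions to probabilities
    JointCorrelatedPolicy(p::Base.Generator) = new(Dict(p))
end

(π::JointCorrelatedPolicy)(a) = get(π.p, a, 0.0)   # probability of joint action a

function (π::JointCorrelatedPolicy)()
    # sample a joint action from the correlated distribution
    D = SetCategorical(collect(keys(π.p)), collect(values(π.p)))
    return rand(D)
end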

1.4 Iterated Best Response

Because computing a Nash equilibrium can require a great deal of computation, an alternative is to apply best responses iteratively over a series of repeated games.

struct IteratedBestResponse
    k_max # number of iterations
    π # initial policy
end

function IteratedBestResponse(𝒫::SimpleGame, k_max)
    # start from uniform policies (the SimpleGamePolicy constructor normalizes the weights)
    π = [SimpleGamePolicy(ai => 1.0 for ai in 𝒜i) for 𝒜i in 𝒫.𝒜]
    return IteratedBestResponse(k_max, π)
end

function solve(M::IteratedBestResponse, 𝒫)
    π = M.π
    for k in 1:M.k_max
        π = [best_response(𝒫, π, i) for i in 𝒫.ℐ]
    end
    return π
end
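
A usage sketch (the iteration count is arbitrary):

π_ibr = solve(IteratedBestResponse(𝒫, 10), 𝒫)

Note that iterated best response is not guaranteed to converge; in some games the policies cycle indefinitely.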

1.4.1 Hierarchical Softmax

Hierarchical softmax is a level-k style model: the initial (level 0) policies are uniform, and each subsequent level responds to the level below it with a softmax response of precision λ.

struct HierarchicalSoftmax
    λ # precision parameter
    k # level
    π # initial policy
end

function HierarchicalSoftmax(𝒫::SimpleGame, λ, k)
    π = [SimpleGamePolicy(ai => 1.0 for ai in 𝒜i) for 𝒜i in 𝒫.𝒜]
    return HierarchicalSoftmax(λ, k, π)
end

function solve(M::HierarchicalSoftmax, 𝒫)
    π = M.π
    for k in 1:M.k
        π = [softmax_response(𝒫, π, i, M.λ) for i in 𝒫.ℐ]
    end
    return π
end
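
A usage sketch (precision and depth values are illustrative):

# agents reason k = 5 levels deep with precision λ = 0.3
π_hs = solve(HierarchicalSoftmax(𝒫, 0.3, 5), 𝒫)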

1.5 Fictitious Play

Another way to compute policies for the different agents is to let them interact in simulation and learn to best respond to one another. In fictitious play, each agent keeps counts of the actions the other agents have taken, treats the resulting empirical distributions as their mixed policies, and plays a best response to them.

mutable struct FictitiousPlay
    𝒫 # simple game
    i # agent index
    N # array of action count dictionaries
    πi # current policy
end

function FictitiousPlay(𝒫::SimpleGame, i)
    N = [Dict(aj => 1 for aj in 𝒫.𝒜[j]) for j in 𝒫.ℐ]
    πi = SimpleGamePolicy(ai => 1.0 for ai in 𝒫.𝒜[i])
    return FictitiousPlay(𝒫, i, N, πi)
end

(πi::FictitiousPlay)() = πi.πi()

(πi::FictitiousPlay)(ai) = πi.πi(ai)

function update!(πi::FictitiousPlay, a)
    N, 𝒫, ℐ, i = πi.N, πi.𝒫, πi.𝒫.ℐ, πi.i
    for (j, aj) in enumerate(a)
        N[j][aj] += 1                       # record the joint action just observed
    end
    # empirical (count-normalized) policy of agent j
    p(j) = SimpleGamePolicy(aj => u/sum(values(N[j])) for (aj, u) in N[j])
    π = [p(j) for j in ℐ]
    πi.πi = best_response(𝒫, π, i)          # best respond to the empirical policies
end
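
Fictitious play policies are meant to interact in simulation. A minimal loop on the prisoner's dilemma game 𝒫 (the loop structure and iteration count are my own sketch; the book provides a similar simulation helper):

π_fp = [FictitiousPlay(𝒫, i) for i in 𝒫.ℐ]   # one learner per agent

for k in 1:100
    a = [πi() for πi in π_fp]   # each agent samples an action from its current policy
    for πi in π_fp
        update!(πi, a)          # all agents observe the joint action and update their counts
    end
end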

1.6 Gradient Ascent

Gradient ascent updates an agent's policy by taking a step in the direction of the utility gradient and projecting the result back onto the probability simplex; the step size decays over time.

mutable struct GradientAscent
    𝒫 # simple game
    i # agent index
    t # time step
    πi # current policy
end

function GradientAscent(𝒫::SimpleGame, i)
    uniform() = SimpleGamePolicy(ai => 1.0 for ai in 𝒫.𝒜[i])
    return GradientAscent(𝒫, i, 1, uniform())
end

(πi::GradientAscent)() = πi.πi()

(πi::GradientAscent)(ai) = πi.πi(ai)

function update!(πi::GradientAscent, a)
    𝒫, ℐ, 𝒜i, i, t = πi.𝒫, πi.𝒫.ℐ, πi.𝒫.𝒜[πi.i], πi.i, πi.t
    # utility of each of agent i's actions against the other agents' observed actions
    jointπ(ai) = [SimpleGamePolicy(j == i ? ai : a[j]) for j in ℐ]
    r = [utility(𝒫, jointπ(ai), i) for ai in 𝒜i]
    π′ = [πi.πi(ai) for ai in 𝒜i]
    # gradient step with decaying step size, projected back onto the probability simplex
    π = project_to_simplex(π′ + r / sqrt(t))
    πi.t = t + 1
    πi.πi = SimpleGamePolicy(ai => p for (ai, p) in zip(𝒜i, π))
end
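
The update relies on project_to_simplex, which is not defined in these notes. A standard Euclidean projection onto the probability simplex can be sketched as follows (this is the usual sorting-based algorithm; the book's helper may differ in details):

function project_to_simplex(y)
    u = sort(y, rev=true)
    # largest index i such that u[i] + (1 - sum of the first i entries)/i > 0
    i = maximum([j for j in eachindex(u) if u[j] + (1 - sum(u[1:j]))/j > 0])
    δ = (1 - sum(u[1:i])) / i
    return [max(yj + δ, 0.0) for yj in y]
end

Gradient-ascent learners can then be simulated exactly like fictitious play, e.g. π_ga = [GradientAscent(𝒫, i) for i in 𝒫.ℐ], updated with the joint action sampled at each step.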


  1. For foundational texts on game theory, see [1] D. Fudenberg and J. Tirole, Game Theory. MIT Press, 1991. [2] R. B. Myerson, Game Theory: Analysis of Conflict. Harvard University Press, 1997. [3] Y. Shoham and K. Leyton-Brown, Multiagent Systems: Algorithmic, Game-Theoretic, and Logical Foundations. Cambridge University Press, 2009.