# 【翻译】Sklearn与TensorFlow机器学习实用指南 —— 第16章 强化学习（上）

# Step-by-step excerpts of the policy-gradient graph construction. They rely
# on the `action` and `logits` tensors produced by the policy network and
# assume `import tensorflow as tf` (TF 1.x API) is in scope.

# Target probability for action 0: pretend the sampled action was the right
# one, so y is 1.0 when action == 0 and 0.0 when action == 1.
y = 1. - tf.to_float(action)

learning_rate = 0.01
cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(
    labels=y, logits=logits)
# NOTE(review): the optimizer definition was missing from this excerpt;
# Adam is assumed here -- confirm against the intended training setup.
optimizer = tf.train.AdamOptimizer(learning_rate)
# compute_gradients() is used instead of minimize() so that the gradients can
# be fetched, re-weighted outside the graph, and fed back in later.
grads_and_vars = optimizer.compute_gradients(cross_entropy)

# Gradient tensors only, so they can be evaluated at every step.
gradients = [grad for grad, variable in grads_and_vars]

# One placeholder per gradient: the re-weighted gradients are fed through
# these placeholders when the training op is run.
gradient_placeholders = []
grads_and_vars_feed = []
for grad, variable in grads_and_vars:
    gradient_placeholder = tf.placeholder(tf.float32, shape=grad.get_shape())
    gradient_placeholders.append(gradient_placeholder)
    grads_and_vars_feed.append((gradient_placeholder, variable))
training_op = optimizer.apply_gradients(grads_and_vars_feed)

# Complete construction of the policy network and its training ops.
# Assumes `import tensorflow as tf` (TF 1.x) and
# `from tensorflow.contrib.layers import fully_connected` are in scope.

# Network architecture.
n_inputs = 4
n_hidden = 4
n_outputs = 1
initializer = tf.contrib.layers.variance_scaling_initializer()

learning_rate = 0.01

# Policy network: one hidden ELU layer, one logit as output.
X = tf.placeholder(tf.float32, shape=[None, n_inputs])
hidden = fully_connected(X, n_hidden, activation_fn=tf.nn.elu,
                         weights_initializer=initializer)
logits = fully_connected(hidden, n_outputs, activation_fn=None,
                         weights_initializer=initializer)
outputs = tf.nn.sigmoid(logits)

# `outputs` is used as the probability of action 0; sample one action from
# the two-way distribution (tf.multinomial expects log-probabilities).
p_left_and_right = tf.concat(axis=1, values=[outputs, 1 - outputs])
action = tf.multinomial(tf.log(p_left_and_right), num_samples=1)

# Target probability for action 0: pretend the sampled action was the right
# one, so y is 1.0 when action == 0 and 0.0 when action == 1.
y = 1. - tf.to_float(action)
cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)

# NOTE(review): the optimizer/gradient part was missing from this listing
# (learning_rate and cross_entropy were otherwise unused); Adam is assumed.
optimizer = tf.train.AdamOptimizer(learning_rate)
# compute_gradients() is used instead of minimize() so that the gradients can
# be fetched, re-weighted outside the graph, and fed back in later.
grads_and_vars = optimizer.compute_gradients(cross_entropy)
gradients = [grad for grad, variable in grads_and_vars]
gradient_placeholders = []
grads_and_vars_feed = []
for grad, variable in grads_and_vars:
    gradient_placeholder = tf.placeholder(tf.float32, shape=grad.get_shape())
    gradient_placeholders.append(gradient_placeholder)
    grads_and_vars_feed.append((gradient_placeholder, variable))
training_op = optimizer.apply_gradients(grads_and_vars_feed)

init = tf.global_variables_initializer()
saver = tf.train.Saver()


def discount_rewards(rewards, discount_rate):
    """Return the discounted cumulative reward for each step of one episode.

    The value at index ``i`` is
    ``rewards[i] + discount_rate * rewards[i+1] + discount_rate**2 * rewards[i+2] + ...``.

    Args:
        rewards: Sequence of raw per-step rewards for a single episode.
        discount_rate: Factor applied to the running total at each step back.

    Returns:
        A float NumPy array with the same length as ``rewards`` (empty when
        ``rewards`` is empty).
    """
    discounted_rewards = np.empty(len(rewards))
    cumulative_rewards = 0
    # Walk backwards so every step reuses the running total of later steps.
    for step in reversed(range(len(rewards))):
        cumulative_rewards = rewards[step] + cumulative_rewards * discount_rate
        discounted_rewards[step] = cumulative_rewards
    return discounted_rewards

def discount_and_normalize_rewards(all_rewards, discount_rate):
    """Discount the rewards of every episode, then normalize across episodes.

    Each episode is discounted independently with ``discount_rewards``. The
    mean and standard deviation are computed over all discounted rewards of
    all episodes together and applied to every episode.

    Args:
        all_rewards: List of per-episode reward sequences.
        discount_rate: Discount factor forwarded to ``discount_rewards``.

    Returns:
        A list with one NumPy array of normalized discounted rewards per
        episode, in the same order as ``all_rewards``.

    Note:
        There is no guard against a zero standard deviation: if every
        discounted reward is identical the division yields NaN values.
    """
    # discount_rate must be forwarded: discount_rewards() requires it.
    all_discounted_rewards = [discount_rewards(rewards, discount_rate)
                              for rewards in all_rewards]
    flat_rewards = np.concatenate(all_discounted_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    return [(discounted_rewards - reward_mean) / reward_std
            for discounted_rewards in all_discounted_rewards]


>>> discount_rewards([10, 0, -50], discount_rate=0.8)
array([-22., -40., -50.])
>>> discount_and_normalize_rewards([[10, 0, -50], [10, 20]], discount_rate=0.8)
[array([-0.28435071, -0.86597718, -1.18910299]), array([ 1.26665318,  1.0727777 ])]


n_iterations = 250       # number of training iterations
n_max_steps = 1000       # maximum number of steps per episode
n_games_per_update = 10  # train the policy network every 10 episodes
save_iterations = 10     # save the model every 10 training iterations
discount_rate = 0.95

# Assumes the following are already defined: `env` (an environment exposing
# reset() and step()), the graph nodes X, action, gradients,
# gradient_placeholders, training_op, init and saver, plus the helper
# discount_and_normalize_rewards().
with tf.Session() as sess:
    init.run()
    for iteration in range(n_iterations):
        all_rewards = []    # raw reward sequences, one list per episode
        all_gradients = []  # gradients saved at every step of every episode
        for game in range(n_games_per_update):
            current_rewards = []    # raw rewards of the current episode
            current_gradients = []  # gradients of the current episode
            obs = env.reset()
            for step in range(n_max_steps):
                # Sample an action and fetch the matching gradients in a
                # single run so that both come from the same forward pass.
                action_val, gradients_val = sess.run(
                    [action, gradients],
                    feed_dict={X: obs.reshape(1, n_inputs)})  # a single obs
                obs, reward, done, info = env.step(action_val[0][0])
                current_rewards.append(reward)
                current_gradients.append(gradients_val)
                if done:
                    break
            all_rewards.append(current_rewards)
            all_gradients.append(current_gradients)

        # The policy has now been run for n_games_per_update episodes; use
        # their results to update the current policy.
        all_rewards = discount_and_normalize_rewards(all_rewards, discount_rate)
        feed_dict = {}
        for var_index, grad_placeholder in enumerate(gradient_placeholders):
            # Multiply the gradients by the action scores and average them
            # over every step of every episode.
            mean_gradients = np.mean(
                [reward * all_gradients[game_index][step][var_index]
                 for game_index, rewards in enumerate(all_rewards)
                 for step, reward in enumerate(rewards)],
                axis=0)
            feed_dict[grad_placeholder] = mean_gradients
        sess.run(training_op, feed_dict=feed_dict)
        if iteration % save_iterations == 0:
            saver.save(sess, "./my_policy_net_pg.ckpt")


Bellman 找到了一种估计任意状态 s 的最优状态值的方法，记作 V*(s)：它是智能体到达状态 s 之后，在始终采取最优行为的前提下，平均所能期望获得的所有衰减未来奖励的总和。他表明，如果智能体的行为是最优的，那么贝尔曼最优性公式成立（见公式 16-1）。这个递归公式表示，如果智能体最优地行动，那么当前状态的最优值等于采取一个最优动作之后平均得到的奖励，加上该动作可能导致的所有下一个状态的期望最优值。

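公式 16-1 贝尔曼最优性公式：

$$V^*(s) = \max_a \sum_{s'} T(s, a, s') \left[ R(s, a, s') + \gamma \cdot V^*(s') \right] \quad \text{对所有状态 } s$$

其中：

T(s, a, s') 为智能体选择动作 a 时从状态 s 转移到状态 s' 的概率
R(s, a, s') 为智能体选择动作 a 并从状态 s 转移到状态 s' 的过程中得到的奖励
γ 为衰减率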

nan = np.nan  # marks impossible actions
_NO_ACTION = [nan, nan, nan]  # row used for an action unavailable in a state

# Transition probabilities, indexed as T[s, a, s'].
T = np.array([
    [  # state 0
        [0.7, 0.3, 0.0],
        [1.0, 0.0, 0.0],
        [0.8, 0.2, 0.0],
    ],
    [  # state 1
        [0.0, 1.0, 0.0],
        _NO_ACTION,
        [0.0, 0.0, 1.0],
    ],
    [  # state 2
        _NO_ACTION,
        [0.8, 0.1, 0.1],
        _NO_ACTION,
    ],
])

# Rewards, indexed as R[s, a, s'].
R = np.array([
    [  # state 0
        [10., 0.0, 0.0],
        [0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0],
    ],
    [  # state 1
        [10., 0.0, 0.0],
        _NO_ACTION,
        [0.0, 0.0, -50.],
    ],
    [  # state 2
        _NO_ACTION,
        [40., 0.0, 0.0],
        _NO_ACTION,
    ],
])

# Actions available in each state (index = state).
possible_actions = [
    [0, 1, 2],
    [0, 2],
    [1],
]


# Q-Value Iteration over the 3-state MDP described by T, R and
# possible_actions (all three must already be defined).
Q = np.full((3, 3), -np.inf)  # -inf marks impossible actions
for state, actions in enumerate(possible_actions):
    Q[state, actions] = 0.0  # initialise every possible action to 0.0

learning_rate = 0.01  # not read by the iteration loop below
discount_rate = 0.95
n_iterations = 100

for iteration in range(n_iterations):
    # Update from a snapshot so every Q[s, a] of this sweep is computed from
    # the previous iteration's values only.
    Q_prev = Q.copy()
    for s in range(3):
        for a in possible_actions[s]:
            # Expected reward plus discounted best Q-value of the next
            # state, weighted by the transition probabilities.
            Q[s, a] = np.sum([
                T[s, a, sp] * (R[s, a, sp] + discount_rate * np.max(Q_prev[sp]))
                for sp in range(3)])


>>> Q
array([[ 21.89498982,  20.80024033,  16.86353093],
[  1.11669335,         -inf,   1.17573546],
[        -inf,  53.86946068,         -inf]])
>>> np.argmax(Q, axis=1)  # 每一状态的最优动作
array([0, 2, 1])


