I previously worked mainly on recurrent neural networks and recently wanted to get into reinforcement learning, so I wrote a few test examples. I have been stuck on the MountainCar example for a month and would be very grateful if someone could spare the time to take a look. The program is a DQN reinforcement learning implementation based on the Keras library. It performs well on the CartPole problem, but it never converges on MountainCar. I have compared it line by line with programs written by others and still cannot find the problem. I would appreciate any explanation, or pointers to resources that might help. This is my first question here and I really cannot figure it out. Thank you very much.
The full code:

```
import numpy as np
from tensorflow.keras import models, layers, optimizers
import gym
import random
from collections import deque

BATCH_SIZE = 64
TRAINING_EPISODE = 1000
SAMPLE_EPISODE = 3
LEARNING_EPISODE = 3


class model(object):
    def __init__(self, obs_num, act_num):
        self.obs_num = obs_num
        self.dense1_size = 100
        self.act_num = act_num

    def model_construct(self):
        inputs = layers.Input(shape=(self.obs_num,), batch_size=BATCH_SIZE)
        x = layers.Dense(self.dense1_size, activation='relu')(inputs)
        outputs = layers.Dense(self.act_num)(x)
        model = models.Model(inputs=inputs, outputs=outputs)
        return model


class RL_algorithm(model):
    def __init__(self, obs_num, act_num, learning_rate=0.001, r_delay=0.95,
                 e_greedy=[0.1, 0.99, 0.01], memory_size=2000):
        self.obs_num = obs_num
        self.act_num = act_num
        self.step_num = 0
        super(RL_algorithm, self).__init__(obs_num=self.obs_num, act_num=self.act_num)
        # online network and target network with identical architectures
        self.model = self.model_construct()
        self.model.compile(loss='mse', optimizer=optimizers.Adam(learning_rate))
        self.model_target = self.model_construct()
        self.model_target.compile(loss='mse', optimizer=optimizers.Adam(learning_rate))
        self.model_target.set_weights(self.model.get_weights())
        self.memory = deque(maxlen=memory_size)
        self.r_delay = r_delay
        # initial epsilon, decay factor, minimum value
        self.e_greedy, self.e_greedy_decay, self.e_greedy_min = e_greedy

    def predict(self, obs):
        act = np.argmax(self.model.predict(obs))
        return act

    def esample(self, obs):
        # epsilon-greedy action selection
        if np.random.uniform(0, 1) > self.e_greedy:
            act = self.predict(obs)
        else:
            act = np.random.randint(self.act_num)
        return act

    def sync_target(self):
        self.model_target.set_weights(self.model.get_weights())

    def egreedy_update(self):
        if self.e_greedy > self.e_greedy_min:
            self.e_greedy *= self.e_greedy_decay

    def remember(self, data):
        self.memory.append(data)

    def learn(self, obs, act, reward, obs_, done):
        Q_predict = self.model.predict(obs)
        Q_target = self.model_target.predict(obs_)
        # Bellman target for the action actually taken
        for i in range(BATCH_SIZE):
            Q_predict[i, act[i]] = reward[i] + (1 - done[i]) * self.r_delay * np.max(Q_target[i, :])
        loss = self.model.train_on_batch(obs, Q_predict)
        return loss


def run_episode():
    obs = env.reset()
    done = False
    reward_total = 0
    while not done:
        act = DQN.esample(obs.reshape([1, -1]))
        obs_, reward, done, _ = env.step(act)
        reward_total += reward
        # bonus reward when the episode finishes before the 200-step limit
        if done and reward_total > -200:
            reward = 100
        DQN.remember([obs, act, reward, obs_, done])
        obs = obs_
    return reward_total


def learn_episode():
    DQN.step_num += 1
    samples = random.sample(DQN.memory, BATCH_SIZE)
    S, A, R, S_, D = [], [], [], [], []
    for experiment in samples:
        S.append(experiment[0])
        A.append(experiment[1])
        R.append(experiment[2])
        S_.append(experiment[3])
        D.append(experiment[4])
    S = np.array(S).astype(np.float32)
    A = np.array(A)
    R = np.array(R).astype(np.float32)
    S_ = np.array(S_).astype(np.float32)
    D = np.array(D).astype(np.float32)
    loss = DQN.learn(S, A, R, S_, D)
    return loss


def test_episode():
    obs = env.reset()
    done = False
    reward_total = 0
    step = 0
    while not done:
        act = DQN.predict(obs.reshape([1, -1]))
        obs_, reward, done, _ = env.step(act)
        reward_total += reward
        obs = obs_
        step += 1
    return reward_total, step


def train():
    reward_max = -200
    for j in range(TRAINING_EPISODE):
        for i in range(SAMPLE_EPISODE):
            reward = run_episode()
            if reward > reward_max:
                reward_max = reward
        if len(DQN.memory) > 0.2 * DQN.memory.maxlen:
            for i in range(LEARNING_EPISODE):
                loss = learn_episode()
        if j % 50 == 0:
            DQN.sync_target()
            DQN.egreedy_update()
            reward, step = test_episode()
            print('training_step: ', j, ', reward: ', reward, 'reward_max: ', reward_max,
                  ', complete_step: ', step, 'loss: ', loss)


def play():
    obs = env.reset()
    env.render()
    done = False
    reward_total = 0
    while not done:
        act = DQN.predict(obs.reshape([1, -1]))
        obs_, reward, done, _ = env.step(act)
        reward_total += reward
        obs = obs_
        env.render()
    print('play ', ' reward:', int(reward_total))


env = gym.make('MountainCar-v0')
print('env.observation_space.shape[0] ', env.observation_space.shape[0],
      'env.action_space.n ', env.action_space.n)
DQN = RL_algorithm(obs_num=env.observation_space.shape[0], act_num=env.action_space.n)
train()
play()
```
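For reference, my CartPole experiment reuses the same `RL_algorithm` class and driver functions, with only the environment swapped. Below is a minimal sketch of that comparison run; the `CartPole-v0` id is just an assumption for illustration, and everything else comes from the code above.

```
# Comparison sketch: same RL_algorithm and train()/test_episode() helpers as defined above,
# only the environment is different. 'CartPole-v0' is assumed here for illustration.
import gym

env = gym.make('CartPole-v0')
print('obs dim:', env.observation_space.shape[0], 'n actions:', env.action_space.n)

DQN = RL_algorithm(obs_num=env.observation_space.shape[0],
                   act_num=env.action_space.n)

train()                        # same training loop; this setup does learn for me
reward, step = test_episode()  # greedy evaluation with the trained network
print('CartPole test reward:', reward, 'steps:', step)
```

Note that the `if done and reward_total > -200` bonus in `run_episode` is MountainCar-specific, so a real CartPole run would adjust or drop that line; otherwise the network, replay buffer, and hyperparameters stay identical. Back on MountainCar, the log printed every 50 training episodes looks like this: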
```
training_step: 0 , reward: -200.0 reward_max: -200 , complete_step: 200 loss: 0.31284344
training_step: 50 , reward: -200.0 reward_max: -200 , complete_step: 200 loss: 0.007191833
training_step: 100 , reward: -200.0 reward_max: -200 , complete_step: 200 loss: 0.008682369
training_step: 150 , reward: -200.0 reward_max: -200 , complete_step: 200 loss: 0.001420379
training_step: 200 , reward: -200.0 reward_max: -200 , complete_step: 200 loss: 0.00081316015
training_step: 250 , reward: -200.0 reward_max: -200 , complete_step: 200 loss: 0.0016744572
training_step: 300 , reward: -200.0 reward_max: -200 , complete_step: 200 loss: 0.0013028746
training_step: 350 , reward: -200.0 reward_max: -200 , complete_step: 200 loss: 0.0029593487
training_step: 400 , reward: -200.0 reward_max: -200 , complete_step: 200 loss: 0.13958925
training_step: 450 , reward: -200.0 reward_max: -200 , complete_step: 200 loss: 0.26738104
training_step: 500 , reward: -200.0 reward_max: -200 , complete_step: 200 loss: 0.0024652109
training_step: 550 , reward: -200.0 reward_max: -200 , complete_step: 200 loss: 0.002346919
training_step: 600 , reward: -200.0 reward_max: -200 , complete_step: 200 loss: 0.001029348
training_step: 650 , reward: -200.0 reward_max: -200 , complete_step: 200 loss: 0.3641917
training_step: 700 , reward: -200.0 reward_max: -200 , complete_step: 200 loss: 0.41639945
training_step: 750 , reward: -200.0 reward_max: -200 , complete_step: 200 loss: 0.0008203614
training_step: 800 , reward: -200.0 reward_max: -200 , complete_step: 200 loss: 0.0031921547
training_step: 850 , reward: -200.0 reward_max: -200 , complete_step: 200 loss: 0.003967342
training_step: 900 , reward: -200.0 reward_max: -200 , complete_step: 200 loss: 0.0011411577
training_step: 950 , reward: -200.0 reward_max: -200 , complete_step: 200 loss: 0.0059003183
```