# 蒙特卡洛法的简介以及实战应用(python实现 基于同策略首次访问蒙特卡洛算法 附源码)

## 三、蒙特卡洛预测

from collections import defaultdict
from my_book_gridworld0406 import GridWorldEnv
import numpy as np
class Agent():
    """Tabular agent trained with on-policy first-visit Monte Carlo control.

    Action values are kept in a nested dict: ``self.Q == {state: {action: q}}``.
    The training loop relies on module-level globals defined by the driver
    script: ``epsilon_by_epsiode``, ``print_states`` and ``print_episodes``.
    """

    def __init__(self):
        # {state: {action: q_value}} -- tabular action-value estimates.
        self.Q = {}

    def create_epsilon_greedy_policy(self, nA):
        """Create an epsilon-greedy policy function based on the current Q values.

        nA: total number of actions in the environment's action space.

        Returns ``policy_fn(valid_actions_for_states, state, epsilon)`` which
        yields a length-``nA`` numpy array of action probabilities: each action
        valid in ``state`` gets ``epsilon / valid_nA`` exploration mass, the
        greedy action additionally gets ``1 - epsilon``; invalid actions get 0.

        NOTE(review): the original source was truncated here; this body is
        reconstructed from the call sites and the surviving fragment
        ("valid_nA", best_action lines) -- confirm against the book's code.
        """
        def policy_fn(valid_actions_for_states, state, epsilon):
            valid_nA = len(valid_actions_for_states[state])
            A = np.zeros(nA, dtype=float)
            # Spread the exploration mass only over the actions valid in `state`.
            for a in valid_actions_for_states[state]:
                A[a] = epsilon / valid_nA
            # Greedy action: the key with the largest Q value for this state.
            best_action = max(self.Q[state], key=self.Q[state].get)
            A[best_action] += 1.0 - epsilon
            return A
        return policy_fn

    def mc_control_epsilon_greedy(self, env, gamma, max_episode_num):
        """On-policy first-visit Monte Carlo control with an epsilon-greedy policy.

        env:             gridworld environment (gym-like: reset/step/spaces,
                         plus ``valid_actions_for_states``).
        gamma:           discount factor for the return G.
        max_episode_num: number of episodes to sample.

        Returns the learned Q dict. Prints diagnostics for the states in the
        global ``print_states`` at the episode counts in ``print_episodes``.
        """
        returns_sum = defaultdict(float)
        returns_count = defaultdict(float)
        target_policy = self.create_epsilon_greedy_policy(env.action_space.n)
        num_episode = 0
        # Initialise Q(s, a) = 0 for every state's valid actions.
        for state in range(env.observation_space.n):
            self.initValue(state, env.valid_actions_for_states, randomized=False)

        # Initial diagnostic dump before any training has happened.
        print("episode:{}".format(num_episode))
        print(epsilon_by_epsiode(num_episode))
        for s in print_states:
            if s in self.Q.keys():
                print("{}_Q:".format(s), end="")
                Q_s = []
                for a in self.Q[s].keys():
                    Q_s.append(round(self.Q[s][a], 3))
                print(Q_s)
                probs = target_policy(env.valid_actions_for_states, s, epsilon_by_epsiode(num_episode))
                action = np.random.choice(np.arange(len(probs)), p=probs)
                p = []
                for a in range(len(probs)):
                    p.append(round(probs[a], 3))
                print(p)
                print(action)

        while num_episode < max_episode_num:
            # --- sample one full episode under the current policy ---
            episode = []
            state = env.reset()
            while True:
                # env.render()
                probs = target_policy(env.valid_actions_for_states, state, epsilon_by_epsiode(num_episode))
                action = np.random.choice(np.arange(len(probs)), p=probs)
                next_state, reward, done, _ = env.step(action)
                episode.append((state, action, reward))
                if done:
                    break
                state = next_state
            num_episode += 1

            # --- first-visit MC update ---
            # Find all (state, action) pairs we've visited in this episode.
            sa_in_episode = set([(x[0], x[1]) for x in episode])
            for state, action in sa_in_episode:
                sa_pair = (state, action)
                # Find the first occurrence of the (state, action) pair.
                first_occurence_idx = next(i for i, x in enumerate(episode)
                                           if x[0] == state and x[1] == action)
                # Discounted return from the first occurrence onwards.
                G = sum([x[2] * (gamma ** i) for i, x in enumerate(episode[first_occurence_idx:])])
                # Incremental average of the sampled returns.
                returns_sum[sa_pair] += G
                returns_count[sa_pair] += 1.0
                self.__setQValue(state, action, returns_sum[sa_pair] / returns_count[sa_pair])

            if num_episode in print_episodes:
                print("episode:{}".format(num_episode))
                print(epsilon_by_epsiode(num_episode))
                for s in print_states:
                    if s in self.Q.keys():
                        print("{}_Q:".format(s), end="")
                        Q_s = []
                        for a in self.Q[s].keys():
                            Q_s.append(round(self.Q[s][a], 3))
                        print(Q_s)
                        probs = target_policy(env.valid_actions_for_states, s, epsilon_by_epsiode(num_episode))
                        action = np.random.choice(np.arange(len(probs)), p=probs)
                        p = []
                        for a in range(len(probs)):
                            p.append(round(probs[a], 3))
                        print(p)
                        print(action)
        return self.Q

    # return a possible action list for a given state
    # def possibleActionsForstate(self, state):
    #     actions = []
    #     return actions

    def __isStateInQ(self, state):
        """Return True if ``state`` already has an entry in Q."""
        # dict.get returns None for a missing key; None is not None -> False.
        return self.Q.get(state) is not None

    def initValue(self, s, valid_actions_list, randomized=False):
        """Initialise Q[s][a] for every valid action of state ``s``.

        Values start at 0.0, or at a small random value in [0, 0.1) when
        ``randomized`` is True. Does nothing if ``s`` is already initialised.
        """
        if not self.__isStateInQ(s):
            self.Q[s] = {}
            for a in valid_actions_list[s]:
                # BUGFIX: original used np.random().random() -- np.random is a
                # module, not a callable; np.random.random() is the intended call.
                self.Q[s][a] = np.random.random() / 10 if randomized is True else 0.0

    # --- Q accessors ---
    def __getQValue(self, s, a):
        """Return Q(s, a)."""
        return self.Q[s][a]

    def __setQValue(self, s, a, new_q):
        """Overwrite Q(s, a) with ``new_q``."""
        self.Q[s][a] = new_q
# ---------------------------------------------------------------------------
# Experiment driver (runs at import time, as in the original article).
# ---------------------------------------------------------------------------
np.random.seed(1)  # reproducible action sampling

# Linear epsilon decay: 0.5 -> 0 over the first `epsilon_episodes` episodes,
# then held at 0 (pure greedy) for any further episodes.
epsilon_start = 0.5
epsilon_final = 0
epsilon_episodes = 20000
epsilon_by_epsiode = lambda episode_idx: epsilon_start - (epsilon_start - epsilon_final) * min(
    episode_idx, epsilon_episodes) / epsilon_episodes

agent = Agent()
env = GridWorldEnv()
# States, and episode counts, at which the training loop prints diagnostics.
print_states = [5, 10, 18, 20, 24]
print_episodes = [1, 7500, 12500, 19999, 20000]
Q = agent.mc_control_epsilon_greedy(env=env, gamma=0.8, max_episode_num=20000)

|
10天前
|

45 12
|
7天前
|

【9月更文挑战第12天】决策树算法作为机器学习领域的一颗明珠，凭借其直观易懂和强大的解释能力，在分类与回归任务中表现出色。相比传统统计方法，决策树通过简单的分支逻辑实现了数据的精准分类。本文将借助Python和scikit-learn库，以鸢尾花数据集为例，展示如何使用决策树进行分类，并探讨其优势与局限。通过构建一系列条件判断，决策树不仅模拟了人类决策过程，还确保了结果的可追溯性和可解释性。无论您是新手还是专家，都能轻松上手，享受机器学习的乐趣。
22 9
|
4天前
|

Python中的列表推导式：简介与应用
【9月更文挑战第14天】本文旨在介绍Python中一种强大且简洁的构造列表的方法——列表推导式。我们将从基础语法入手，通过实例演示其用法，并探讨在数据处理和算法优化中的应用价值。文章将不包含代码示例，而是专注于概念理解和应用场景的描述，以促进读者对列表推导式的深入认识。
11 3
|
8天前
|

【9月更文挑战第11天】在数据的海洋中，线性回归算法犹如智慧的预言家，助我们揭示未知。本案例通过收集房屋面积、距市中心距离等数据，利用Python的pandas和scikit-learn库构建房价预测模型。经过训练与测试，模型展现出较好的预测能力，均方根误差（RMSE）低，帮助房地产投资者做出更明智决策。尽管现实关系复杂多变，线性回归仍提供了有效工具，引领我们在数据世界中自信前行。
22 5
|
10天前
|
JavaScript Java C语言
Python简介
Python简介。
17 4
|
17天前
|

R语言中的支持向量机（SVM）与K最近邻（KNN）算法实现与应用
【9月更文挑战第2天】无论是支持向量机还是K最近邻算法，都是机器学习中非常重要的分类算法。它们在R语言中的实现相对简单，但各有其优缺点和适用场景。在实际应用中，应根据数据的特性、任务的需求以及计算资源的限制来选择合适的算法。通过不断地实践和探索，我们可以更好地掌握这些算法并应用到实际的数据分析和机器学习任务中。
40 3
|
21天前
|
Ubuntu Linux 数据安全/隐私保护

32 2
|
9天前
|

39 0
|
21天前
|

61 0
|
13天前
|

40 20