The key component of policy iteration is policy evaluation. The state values are first evaluated under the current policy; the policy is then improved according to the resulting state-action values, and the next round of evaluation and improvement follows, until the policy is stable. Policy improvement amounts to solving a static optimization problem, selecting actions by their state-action values, and is usually easier than policy evaluation.
Policy iteration based on state values consists of the following three stages:
1: Initialize the policy function and the state-value function.
2: Policy evaluation: under the current policy, update the state-value function with the Bellman expectation equation until it converges.
3: Policy improvement: obtain a better policy by acting greedily with respect to the current state values; repeat evaluation and improvement until the policy no longer changes.
The detailed algorithm steps are summarized below.
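As a rough illustration of these three stages, here is a minimal sketch of tabular policy iteration on a toy finite MDP. The transition tensor P and reward matrix R below are random placeholders for illustration only; they are not the car rental model solved later in this post.

import numpy as np

# Toy MDP: P[s, a, s'] is a transition probability, R[s, a] an expected reward.
n_states, n_actions = 3, 2
gamma, theta = 0.9, 1e-6
rng = np.random.default_rng(0)
P = rng.random((n_states, n_actions, n_states))
P /= P.sum(axis=2, keepdims=True)        # normalize so each (s, a) row sums to 1
R = rng.random((n_states, n_actions))

# Stage 1: initialize the state-value function and an arbitrary policy.
V = np.zeros(n_states)
policy = np.zeros(n_states, dtype=int)

while True:
    # Stage 2: policy evaluation - sweep the Bellman expectation update
    # V(s) <- r(s, pi(s)) + gamma * sum_{s'} P(s'|s, pi(s)) V(s') until convergence.
    while True:
        V_new = np.array([R[s, policy[s]] + gamma * P[s, policy[s]] @ V
                          for s in range(n_states)])
        converged = np.max(np.abs(V_new - V)) < theta
        V = V_new
        if converged:
            break

    # Stage 3: policy improvement - act greedily with respect to Q(s, a).
    Q = R + gamma * P @ V                # matmul over the last axis gives shape (n_states, n_actions)
    new_policy = Q.argmax(axis=1)
    if np.array_equal(new_policy, policy):
        break                            # policy is stable, hence optimal
    policy = new_policy

print('optimal policy:', policy)
print('optimal values:', V)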
Car rental problem: Jack runs two car rental locations in different towns. Customers arrive at both locations every day to rent cars; if a location has a car available, Jack earns 10 dollars for each car rented out. To keep enough cars available at each location, Jack moves cars between the two locations overnight, at a cost of 2 dollars per car moved. The numbers of cars rented and returned at each location per day are Poisson random variables (a small sketch of this rental model follows the assumptions below). Three assumptions are given:
1: The total number of cars at either location never exceeds 20.
2: Cars returned on a given day only become available for rental on the following day.
3: At most 5 cars can be moved between the two locations per night.
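Under these assumptions, the expected one-day rental income at a single location can be computed by summing over a truncated Poisson distribution, which is what the full code below does jointly for both locations. The helper below is only a sketch of this idea; the function name, the default lambda, and the truncation bound are illustrative.

from scipy.stats import poisson

def expected_rental_income(n_cars, lam=4, credit=10, upper_bound=11):
    # Expected one-day income at one location holding n_cars,
    # with rental requests ~ Poisson(lam), truncated at upper_bound.
    income = 0.0
    for requests in range(upper_bound):
        rented = min(requests, n_cars)   # cannot rent out more cars than are on the lot
        income += poisson.pmf(requests, lam) * rented * credit
    return income

print(expected_rental_income(5))         # e.g. 5 cars on hand, lambda = 4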
Policy iteration is used below to compute the optimal car-moving policy between the two rental locations.
After five rounds of policy evaluation and improvement, the optimal policy and the optimal values are obtained.
The optimal policy is shown below.
The optimal values are shown below.
Part of the code is as follows.
# Code 10 - Example 4.6 - Jack's car rental problem
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from scipy.stats import poisson
# %matplotlib inline

plt.rcParams['font.sans-serif'] = ['SimHei']   # display Chinese labels
plt.rcParams['axes.unicode_minus'] = False

# maximum number of cars at each location
MAX_CARS = 20
# maximum number of cars moved overnight
MAX_MOVE_OF_CARS = 5
# lambda of rental requests at the first location
RENTAL_REQUEST_FIRST_LOC = 4
# lambda of rental requests at the second location
RENTAL_REQUEST_SECOND_LOC = 3
# lambda of returns at the first location
RETURNS_FIRST_LOC = 2
# lambda of returns at the second location
RETURNS_SECOND_LOC = 3
DISCOUNT = 0.9
# reward for renting out one car
RENTAL_CREDIT = 10
# cost of moving one car
MOVE_CAR_COST = 2

# all possible actions
actions = np.arange(-MAX_MOVE_OF_CARS, MAX_MOVE_OF_CARS + 1)

# An upper bound for the Poisson distribution.
# If n is greater than this value, the probability of n is truncated to 0.
POISSON_UPPER_BOUND = 11

# Probability cache for the Poisson distribution
# @lam: lambda should be less than 10 for this function
poisson_cache = dict()


def poisson_probability(n, lam):
    global poisson_cache
    key = n * 10 + lam
    if key not in poisson_cache:
        poisson_cache[key] = poisson.pmf(n, lam)
    return poisson_cache[key]


def expected_return(state, action, state_value, constant_returned_cars):
    """
    @state: [# of cars in first location, # of cars in second location]
    @action: positive if moving cars from the first location to the second,
             negative if moving cars from the second location to the first
    @state_value: state value matrix
    @constant_returned_cars: if True, the model is simplified so that the number of
        cars returned during the day is constant rather than Poisson-distributed,
        which reduces computation time and leaves the optimal policy/value
        matrices almost unchanged
    """
    # initialize total return
    returns = 0.0

    # cost for moving cars
    returns -= MOVE_CAR_COST * abs(action)

    # moving cars
    NUM_OF_CARS_FIRST_LOC = min(state[0] - action, MAX_CARS)
    NUM_OF_CARS_SECOND_LOC = min(state[1] + action, MAX_CARS)

    # go through all possible rental requests
    for rental_request_first_loc in range(POISSON_UPPER_BOUND):
        for rental_request_second_loc in range(POISSON_UPPER_BOUND):
            # probability for the current combination of rental requests
            prob = poisson_probability(rental_request_first_loc, RENTAL_REQUEST_FIRST_LOC) * \
                poisson_probability(rental_request_second_loc, RENTAL_REQUEST_SECOND_LOC)

            num_of_cars_first_loc = NUM_OF_CARS_FIRST_LOC
            num_of_cars_second_loc = NUM_OF_CARS_SECOND_LOC

            # valid rental requests cannot exceed the actual number of cars
            valid_rental_first_loc = min(num_of_cars_first_loc, rental_request_first_loc)
            valid_rental_second_loc = min(num_of_cars_second_loc, rental_request_second_loc)

            # get credits for renting
            reward = (valid_rental_first_loc + valid_rental_second_loc) * RENTAL_CREDIT
            num_of_cars_first_loc -= valid_rental_first_loc
            num_of_cars_second_loc -= valid_rental_second_loc

            if constant_returned_cars:
                # returned cars become available for renting tomorrow
                returned_cars_first_loc = RETURNS_FIRST_LOC
                returned_cars_second_loc = RETURNS_SECOND_LOC
                num_of_cars_first_loc = min(num_of_cars_first_loc + returned_cars_first_loc, MAX_CARS)
                num_of_cars_second_loc = min(num_of_cars_second_loc + returned_cars_second_loc, MAX_CARS)
                returns += prob * (reward + DISCOUNT *
                                   state_value[num_of_cars_first_loc, num_of_cars_second_loc])
            else:
                for returned_cars_first_loc in range(POISSON_UPPER_BOUND):
                    for returned_cars_second_loc in range(POISSON_UPPER_BOUND):
                        prob_return = poisson_probability(returned_cars_first_loc, RETURNS_FIRST_LOC) * \
                            poisson_probability(returned_cars_second_loc, RETURNS_SECOND_LOC)
                        num_of_cars_first_loc_ = min(num_of_cars_first_loc + returned_cars_first_loc, MAX_CARS)
                        num_of_cars_second_loc_ = min(num_of_cars_second_loc + returned_cars_second_loc, MAX_CARS)
                        prob_ = prob_return * prob
                        returns += prob_ * (reward + DISCOUNT *
                                            state_value[num_of_cars_first_loc_, num_of_cars_second_loc_])
    return returns


def figure_4_2(constant_returned_cars=True):
    value = np.zeros((MAX_CARS + 1, MAX_CARS + 1))
    policy = np.zeros(value.shape, dtype=int)  # np.int is deprecated; use the built-in int

    iterations = 0
    _, axes = plt.subplots(2, 3, figsize=(40, 20))
    plt.subplots_adjust(wspace=0.1, hspace=0.2)
    axes = axes.flatten()
    while True:
        fig = sns.heatmap(np.flipud(policy), cmap="YlGnBu", vmax=5, vmin=-5, ax=axes[iterations], center=0)
        fig.set_ylabel('# B', fontsize=30)
        fig.set_yticks(list(reversed(range(MAX_CARS + 1))))
        fig.set_xlabel('# A', fontsize=30)
        fig.set_title('policy {}'.format(iterations), fontsize=30)

        # policy evaluation (in-place)
        while True:
            old_value = value.copy()
            for i in range(MAX_CARS + 1):
                for j in range(MAX_CARS + 1):
                    new_state_value = expected_return([i, j], policy[i, j], value, constant_returned_cars)
                    value[i, j] = new_state_value
            max_value_change = abs(old_value - value).max()
            print('max value change {}'.format(max_value_change))
            if max_value_change < 1e-4:
                break

        # policy improvement
        policy_stable = True
        for i in range(MAX_CARS + 1):
            for j in range(MAX_CARS + 1):
                old_action = policy[i, j]
                action_returns = []
                for action in actions:
                    # only actions that do not move more cars than a location holds are valid
                    # if (-i <= action <= 0) or (0 <= action <= j):
                    if (0 <= action <= i) or (-j <= action <= 0):
                        action_returns.append(expected_return([i, j], action, value, constant_returned_cars))
                    else:
                        action_returns.append(-np.inf)
                new_action = actions[np.argmax(action_returns)]
                policy[i, j] = new_action
                if policy_stable and old_action != new_action:
                    policy_stable = False
        print('policy stable {}'.format(policy_stable))

        if policy_stable:
            fig = sns.heatmap(np.flipud(value), cmap="YlGnBu", ax=axes[-1])
            fig.set_ylabel('# 第一地点的车辆数', fontsize=30)
            fig.set_yticks(list(reversed(range(MAX_CARS + 1))))
            fig.set_xlabel('# 第二地点的车辆数', fontsize=30)
            fig.set_title('optimal value', fontsize=30)
            break

        iterations += 1

    # save path
    # plt.savefig('images/figure_4_2.png')
    plt.show()
    # plt.close()


if __name__ == '__main__':
    figure_4_2()
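By default, figure_4_2 uses the simplified model in which the number of returned cars is constant. To evaluate the full model with Poisson-distributed returns, which is noticeably slower but, per the function's docstring, yields almost the same policy and values, the flag can be switched off:

figure_4_2(constant_returned_cars=False)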