1 Overview
Dynamic programming is a model-based solution method, usually introduced alongside reinforcement learning, that uses knowledge of the environment's dynamics and the Markov property, together with sufficient computational resources, to construct a policy that performs optimally in that environment. With this powerful technique, a seemingly complex problem can be analysed and solved with a fairly short program. This post walks through the complete process of solving a multi-queue network problem with a Matlab-based dynamic program. Based on the results obtained, it concludes that dynamic programming with policy iteration is an effective way to solve this class of problems.
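Concretely, the program in the next section implements the two standard steps of policy iteration, as can be read off the listing. Policy evaluation repeatedly sweeps the state space with the Bellman expectation update

$$V(s) \leftarrow \sum_{s'} P(s' \mid s, \pi(s))\,\bigl[R(s,\pi(s),s') + \gamma V(s')\bigr],$$

stopping once the largest change in any state value falls below a threshold $\theta$ (the listing uses $\gamma = 0.4$ and $\theta = 0.001$). Policy improvement then replaces the action in every state with the one whose one-step look-ahead value is largest, and the outer loop terminates when a full improvement sweep leaves the policy unchanged.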
2 Matlab Implementation
Partial code:
% setup.m -- build the 9x9 state space (two queue lengths), the transition
% probabilities, the reward tables, and an initial policy.
clear all; clc;

state2state = zeros(9, 9, 9, 9);   % P(next state | current state)
rewardfor1  = zeros(9, 9, 9, 9);   % reward table associated with queue 1
rewardfor2  = zeros(9, 9, 9, 9);   % reward table associated with queue 2
policy      = zeros(9, 9);         % action (1 or 2) chosen in each state
values      = zeros(9, 9);         % state-value estimates

% Joint per-step arrival probabilities (independent arrivals with prob. 0.3 and 0.6);
% the same values are hard-coded again inside applyProb.
none  = .7 * .4;   % no arrival in either queue
just1 = .3 * .4;   % arrival in queue 1 only
just2 = .6 * .7;   % arrival in queue 2 only
both  = .6 * .3;   % arrivals in both queues

% Fill in the transition probabilities out of every state
for i = 1:9
    for j = 1:9
        state2state = applyProb(state2state, i, j);
    end
end
stateDone = 1      % progress flag, echoed to the console

% Fill in the reward tables for each queue
for i = 1:9
    for j = 1:9
        rewardfor1 = applyReward(rewardfor1, i, j, 1);
    end
end
reward1Done = 1

for i = 1:9
    for j = 1:9
        rewardfor2 = applyReward(rewardfor2, i, j, 2);
    end
end

% Initial policy: always serve queue 1 (uncomment the second line to randomize instead)
for i = 1:9
    for j = 1:9
        policy(i, j) = 1;
        %policy(i,j) = binornd(1,.5) + 1;
    end
end
reward2Done = 1
setupDone = 1

%policyIteration.m -- alternate policy evaluation and policy improvement
% until the policy stops changing.
run setup.m
iterating = 1;
total = 0;
while iterating == 1
    total = total + 1     % number of policy-iteration sweeps so far

    % --- Policy evaluation: iterate the Bellman expectation update ---
    evaluating = 1;
    while evaluating == 1
        theta = 0.001;    % convergence threshold
        delta = 0;
        discount = .4;    % discount factor
        for q2 = 1:9
            for q1 = 1:9
                v = values(q2, q1);
                sumOfValues = 0;
                for nextq2 = 1:9
                    for nextq1 = 1:9
                        myProb    = tranP(q1-1, q2-1, policy(q2,q1), nextq1-1, nextq2-1, state2state);
                        myReward  = tranR(q1-1, q2-1, policy(q2,q1), nextq1-1, nextq2-1, rewardfor1, rewardfor2);
                        myNextVal = values(nextq2, nextq1);
                        sumOfValues = sumOfValues + myProb * (myReward + discount * myNextVal);
                    end
                end
                values(q2, q1) = sumOfValues;
                delta = max(delta, abs(v - values(q2, q1)));
            end
        end
        if delta < theta
            evaluating = 0;
        end
    end

    % --- Policy improvement: pick the greedy action in every state ---
    policyStable = 1;
    for q2 = 1:9
        for q1 = 1:9
            b = policy(q2, q1);
            action1sumOfValues = 0;
            action2sumOfValues = 0;
            for nextq2 = 1:9
                for nextq1 = 1:9
                    myProb    = tranP(q1-1, q2-1, 1, nextq1-1, nextq2-1, state2state);
                    myReward  = tranR(q1-1, q2-1, 1, nextq1-1, nextq2-1, rewardfor1, rewardfor2);
                    myNextVal = values(nextq2, nextq1);
                    action1sumOfValues = action1sumOfValues + myProb * (myReward + discount * myNextVal);
                end
            end
            for nextq2 = 1:9
                for nextq1 = 1:9
                    myProb    = tranP(q1-1, q2-1, 2, nextq1-1, nextq2-1, state2state);
                    myReward  = tranR(q1-1, q2-1, 2, nextq1-1, nextq2-1, rewardfor1, rewardfor2);
                    myNextVal = values(nextq2, nextq1);
                    action2sumOfValues = action2sumOfValues + myProb * (myReward + discount * myNextVal);
                end
            end
            if (action1sumOfValues > action2sumOfValues)
                policy(q2, q1) = 1;
            elseif (action2sumOfValues >= action1sumOfValues)
                policy(q2, q1) = 2;   % ties go to action 2
            end
            if b ~= policy(q2, q1)
                policyStable = 0;
            end
        end
    end

    if policyStable == 1
        iterating = 0;
    end
end
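The listing above concatenates two scripts; the `run setup.m` line suggests the original layout was a setup.m that builds the model and a policyIteration.m that iterates on it. Assuming those file names, a minimal driver session for inspecting the result might look like this (the variable names are the ones created by the listing):

% Hypothetical session, assuming the two blocks are saved as setup.m and policyIteration.m
policyIteration        % runs setup.m internally, then iterates until the policy is stable
disp(policy)           % 9x9 array of 1s and 2s: which queue to serve in each state
disp(values)           % converged state values under the final policy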
function matrix = applyProb(matrix, row, col)
% applyProb fills in the one-step transition probabilities out of state (row, col).
% Indexing convention: matrix(nextRow, nextCol, row, col) = P(next state | current state).
    matrix(row, col, row, col)         = .28;   % no arrival in either queue
    if row < 9
        matrix(row+1, col, row, col)   = .42;   % arrival on the row dimension only
    end
    if col < 9
        matrix(row, col+1, row, col)   = .12;   % arrival on the column dimension only
    end
    if row < 9 && col < 9
        matrix(row+1, col+1, row, col) = .18;   % arrivals in both queues
    end
    % Boundary handling: a full queue (length 9) absorbs further arrivals, so the
    % overflow probability mass is folded back onto the boundary state.
    if row == 9 && col == 9
        matrix(row, col, row, col) = 1;
    end
    if row ~= 9 && col == 9
        matrix(row+1, 9, row, col) = .42 + .18;
        matrix(row,   9, row, col) = .28 + .12;
    end
    if row == 9 && col ~= 9
        matrix(9, col+1, row, col) = .12 + .18;
        matrix(9, col,   row, col) = .28 + .42;
    end
end
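The listing is only partial: besides applyProb, the scripts also call applyReward, tranP, and tranR, which are not included above. The sketch below is a guess at what those helpers might look like, inferred only from how they are called; the index convention and the holding-cost reward are my assumptions, not the original author's code.

% Sketch of the missing helpers, reconstructed from their call sites only.
% Each function would normally live in its own file (applyReward.m, tranP.m, tranR.m).

function matrix = applyReward(matrix, row, col, queue)
% ASSUMED reward structure: a negative holding cost equal to the length of the
% given queue (row-1 or col-1), charged on every transition out of (row, col).
    if queue == 1
        cost = -(row - 1);
    else
        cost = -(col - 1);
    end
    matrix(:, :, row, col) = cost;
end

function p = tranP(q1, q2, action, nextq1, nextq2, state2state)
% ASSUMED transition lookup. Queue lengths are passed 0-based (0..8). Serving
% the chosen non-empty queue removes one customer; the arrival transition
% probability is then read from the 4-D table built by applyProb (converted
% back to 1-based indices).
    if action == 1 && q1 > 0, q1 = q1 - 1; end
    if action == 2 && q2 > 0, q2 = q2 - 1; end
    p = state2state(nextq1 + 1, nextq2 + 1, q1 + 1, q2 + 1);
end

function r = tranR(q1, q2, action, nextq1, nextq2, rewardfor1, rewardfor2)
% ASSUMED reward lookup: the sum of the two per-queue reward tables for the
% same post-service transition.
    if action == 1 && q1 > 0, q1 = q1 - 1; end
    if action == 2 && q2 > 0, q2 = q2 - 1; end
    r = rewardfor1(nextq1 + 1, nextq2 + 1, q1 + 1, q2 + 1) ...
      + rewardfor2(nextq1 + 1, nextq2 + 1, q1 + 1, q2 + 1);
end

With three such helpers on the Matlab path, the two scripts above should run end to end, although the resulting policy will of course depend on the actual reward structure used.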
3 Closing Remarks
Some of the theory cited here comes from online sources; if any of it infringes a copyright, please contact the blogger and it will be removed.