Policy_Improvement.m
function greedypi = Policy_Improvement(MDP, v, gamma)
%% Description: Policy-Improvement algorithm in MATLAB
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Input:   MDP       Markov Decision Process with fields States, Actions,
%                    T (n-by-n-by-m transition probabilities T(s,s',a)) and
%                    R (n-by-n-by-m rewards R(s,s',a))
%          v         state-value function (v-function)
%          gamma     discount factor (γ)
%
% Output:  greedypi  greedy policy based on the provided value function and MDP
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
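%% Example (a minimal sketch with a hypothetical 2-state, 2-action MDP;
%  the array layout mirrors how this function indexes MDP.T and MDP.R):
%   MDP.States   = 1:2;
%   MDP.Actions  = 1:2;
%   MDP.T(:,:,1) = [1 0; 0 1];   % action 1: stay in the current state
%   MDP.T(:,:,2) = [0 1; 1 0];   % action 2: switch to the other state
%   MDP.R(:,:,1) = [0 0; 0 0];   % action 1 yields no reward
%   MDP.R(:,:,2) = [0 1; 0 0];   % action 2 rewards the move from 1 to 2
%   v = [0 1];                   % current estimate of the v-function
%   greedypi = Policy_Improvement(MDP, v, 0.9)   % expected result: [2 1]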
%% Initialization
n = length(MDP.States); % Number of states in MDP
m = length(MDP.Actions); % Number of actions in MDP
q = zeros(n, m); % Initialization of action-value function (q-function)
% %% Vectorized alternative: compute the q-function for all states at once
% %  (assumes v is a column vector, so that v' spreads across next states)
% for a = 1:m
%     q(:,a) = sum(MDP.T(:,:,a) .* (MDP.R(:,:,a) + gamma * v'), 2);
% end
%% Compute the action-value function (q-function) for all state-action pairs
for s = 1:n
    for a = 1:m
        prob   = MDP.T(s,:,a);              % Transition probabilities from state s under action a
        reward = MDP.R(s,:,a);              % Rewards for the corresponding transitions
        s_next = find(MDP.T(s,:,a) ~= 0);   % Reachable next states only
        for j = 1:length(s_next)
            % One-step lookahead: q(s,a) = sum_s' P(s'|s,a) * (R(s,s',a) + gamma * v(s'))
            q(s,a) = q(s,a) + prob(s_next(j)) * (reward(s_next(j)) + gamma * v(s_next(j)));
        end
    end
end
%% Calculate the greedy policy (π' = argmax_a q(s,a)) based on the action-value function (q-function)
greedypi = zeros(1, n);                     % Preallocate the policy vector
for s = 1:n
    [~, greedypi(s)] = max(q(s,:));         % max returns the first maximizing action, breaking ties by lowest index
end