-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathRL_Dynamic_Programing.m
More file actions
66 lines (53 loc) · 2.69 KB
/
RL_Dynamic_Programing.m
File metadata and controls
66 lines (53 loc) · 2.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
%% Solution for Reinforcement Learning with Markov Decision Process (MDP)
%% Creation of the Markov Decision Process
MDP.States = 1:12; % 12 states
MDP.Actions = 1:4; % 4 actions: a1 (left), a2 (down), a3 (right), a4 (up)
% Reward matrix R(s, s', a): reward collected when moving from state s to
% state s' under action a. Holes are states s5 and s7; the goal is s12.
MDP.R = zeros(12,12,4); % Initialize the reward matrix
MDP.R(6,5,:)  = -1; % Penalty of -1: falling into hole s5 from s6
MDP.R(4,5,:)  = -1; % Penalty of -1: falling into hole s5 from s4
MDP.R(6,7,:)  = -1; % Penalty of -1: falling into hole s7 from s6
MDP.R(8,7,:)  = -1; % Penalty of -1: falling into hole s7 from s8
MDP.R(2,7,:)  = -1; % Penalty of -1: falling into hole s7 from s2
MDP.R(10,7,:) = -1; % Penalty of -1: falling into hole s7 from s10
                    % (fix: s10 lies directly above s7 in the grid, so this
                    % entry was missing and that fall went unpenalised)
MDP.R(11,12,:) = 1; % Reward of +1 if goal state s12 is reached from s11
MDP.T = zeros(12,12,4); % Initialize the transition matrix T(s, s', a)
%% Define the grid for the RL environment
% Layout of the 12 states as a 3x4 board; row 1 is the top of the world.
% NOTE(review): the name `grid` shadows MATLAB's built-in grid() command;
% it is kept unchanged here because the rest of this script refers to it.
grid = [9, 10, 11, 12;
        8,  7,  6,  5;
        1,  2,  3,  4];
%% Function to find the position of a state in the RL environment
% Maps a state number to its [row, col] coordinates within `grid` by
% reducing the logical match matrix along each dimension.
find_position = @(state) [find(max(grid == state, [], 2)), find(max(grid == state, [], 1))];
%% Determination of transition probabilities
% Stochastic motion model: the agent moves in the intended direction with
% probability 0.8 and slips to each of the two perpendicular directions
% with probability 0.1. A move that would leave the grid keeps the agent
% in its current state (that probability mass accumulates on s), so every
% non-terminal row MDP.T(s,:,a) sums to exactly 1.
% (Fix: the previous version used inconsistent wall constants (0.1/0.8/0)
% and added 0.1 to every neighbour under ALL actions, so rows of MDP.T did
% not sum to 1.)
for s = MDP.States
    if ~ismember(s, [5, 7, 12]) % s5, s7 (holes) and s12 (goal) are terminal
        position = find_position(s); % Position of the current state in the grid
        row = position(1); % Row position in the grid
        col = position(2); % Column position in the grid
        % Clamp neighbour indices at the borders: a blocked move maps back
        % onto the current cell, since grid(row,col) == s.
        left  = max(col - 1, 1);
        right = min(col + 1, 4);
        up    = max(row - 1, 1);
        down  = min(row + 1, 3);
        % Target cell for each action: a1 = left, a2 = down, a3 = right, a4 = up
        target = [grid(row,left), grid(down,col), grid(row,right), grid(up,col)];
        % Perpendicular (slip) cells per action: vertical slips for the
        % horizontal actions, horizontal slips for the vertical actions.
        slips = {[grid(up,col),   grid(down,col)],  ... % a1 (left)
                 [grid(row,left), grid(row,right)], ... % a2 (down)
                 [grid(up,col),   grid(down,col)],  ... % a3 (right)
                 [grid(row,left), grid(row,right)]};    % a4 (up)
        for a = MDP.Actions
            % 80% probability of the intended move (accumulate: walls may
            % fold this onto s together with slip mass below)
            MDP.T(s, target(a), a) = MDP.T(s, target(a), a) + 0.8;
            for sp = slips{a}
                % 10% probability for each perpendicular slip
                MDP.T(s, sp, a) = MDP.T(s, sp, a) + 0.1;
            end
        end
    end
end
%% Policy and evaluation parameters
% Fixed policy: always choose action a4 ("up") in every state.
% (Renamed from `pi`, which shadowed MATLAB's built-in constant pi.)
policy = ones(1,12) * 4;
gamma = 0.9; % Discount factor (gamma)
%% Calculate the state-value function under the fixed policy
% Policy_Evaluation is defined elsewhere in this project; the call is
% unchanged apart from the renamed local variable.
v = Policy_Evaluation(MDP, policy, gamma);
%% Display the calculated state-value function
disp(v);