-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathRL_Dynamic_Programing.m
More file actions
66 lines (53 loc) · 2.69 KB
/
RL_Dynamic_Programing.m
File metadata and controls
66 lines (53 loc) · 2.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
%% Solution for Reinforcement Learning with Markov Decision Process (MDP)
%% Creation of the Markov Decision Process
MDP.States = 1:12; % 12 states
MDP.Actions = 1:4; % 4 actions: a1 (left), a2 (down), a3 (right), a4 (up)
% Reward matrix R(s, s', a): reward collected when moving from state s to
% state s' under action a. Holes are states s5 and s7; the goal is s12.
MDP.R = zeros(12,12,4); % Initialize the reward matrix
MDP.R(6,5,:)  = -1; % Penalty of -1: falling into hole s5 from s6
MDP.R(4,5,:)  = -1; % Penalty of -1: falling into hole s5 from s4
MDP.R(6,7,:)  = -1; % Penalty of -1: falling into hole s7 from s6
MDP.R(8,7,:)  = -1; % Penalty of -1: falling into hole s7 from s8
MDP.R(2,7,:)  = -1; % Penalty of -1: falling into hole s7 from s2
MDP.R(10,7,:) = -1; % Penalty of -1: falling into hole s7 from s10
                    % (fix: s10 lies directly above s7 in the grid, so this
                    % entry was missing and that fall went unpenalised)
MDP.R(11,12,:) = 1; % Reward of +1 if goal state s12 is reached from s11
MDP.T = zeros(12,12,4); % Initialize the transition matrix T(s, s', a)
%% Define the grid for the RL environment
% Layout of the 12 states as a 3x4 board; row 1 is the top of the world.
% NOTE(review): the name `grid` shadows MATLAB's built-in grid() command;
% it is kept unchanged here because the rest of this script refers to it.
grid = [9, 10, 11, 12;
        8,  7,  6,  5;
        1,  2,  3,  4];
%% Function to find the position of a state in the RL environment
% Maps a state number to its [row, col] coordinates within `grid` by
% reducing the logical match matrix along each dimension.
find_position = @(state) [find(max(grid == state, [], 2)), find(max(grid == state, [], 1))];
%% Determination of transition probabilities
% Stochastic motion model: the agent moves in the intended direction with
% probability 0.8 and slips to each of the two perpendicular directions
% with probability 0.1. A move that would leave the grid keeps the agent
% in its current state (that probability mass accumulates on s), so every
% non-terminal row MDP.T(s,:,a) sums to exactly 1.
% (Fix: the previous version used inconsistent wall constants (0.1/0.8/0)
% and added 0.1 to every neighbour under ALL actions, so rows of MDP.T did
% not sum to 1.)
for s = MDP.States
    if ~ismember(s, [5, 7, 12]) % s5, s7 (holes) and s12 (goal) are terminal
        position = find_position(s); % Position of the current state in the grid
        row = position(1); % Row position in the grid
        col = position(2); % Column position in the grid
        % Clamp neighbour indices at the borders: a blocked move maps back
        % onto the current cell, since grid(row,col) == s.
        left  = max(col - 1, 1);
        right = min(col + 1, 4);
        up    = max(row - 1, 1);
        down  = min(row + 1, 3);
        % Target cell for each action: a1 = left, a2 = down, a3 = right, a4 = up
        target = [grid(row,left), grid(down,col), grid(row,right), grid(up,col)];
        % Perpendicular (slip) cells per action: vertical slips for the
        % horizontal actions, horizontal slips for the vertical actions.
        slips = {[grid(up,col),   grid(down,col)],  ... % a1 (left)
                 [grid(row,left), grid(row,right)], ... % a2 (down)
                 [grid(up,col),   grid(down,col)],  ... % a3 (right)
                 [grid(row,left), grid(row,right)]};    % a4 (up)
        for a = MDP.Actions
            % 80% probability of the intended move (accumulate: walls may
            % fold this onto s together with slip mass below)
            MDP.T(s, target(a), a) = MDP.T(s, target(a), a) + 0.8;
            for sp = slips{a}
                % 10% probability for each perpendicular slip
                MDP.T(s, sp, a) = MDP.T(s, sp, a) + 0.1;
            end
        end
    end
end
%% Policy and evaluation parameters
% Fixed policy: always choose action a4 ("up") in every state.
% (Renamed from `pi`, which shadowed MATLAB's built-in constant pi.)
policy = ones(1,12) * 4;
gamma = 0.9; % Discount factor (gamma)
%% Calculate the state-value function under the fixed policy
% Policy_Evaluation is defined elsewhere in this project; the call is
% unchanged apart from the renamed local variable.
v = Policy_Evaluation(MDP, policy, gamma);
%% Display the calculated state-value function
disp(v);