BlackJack/Learning.py at main · DeepDeducing/BlackJack · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import numpy as np
import gym
import math

#--------------------------------------------------------------------

def inverse_sigmoid(input):
    """Return the logit (inverse of the logistic sigmoid) of *input*.

    A small constant is added to both the numerator and the denominator so
    that inputs of exactly 0 or 1 yield a large finite value instead of
    -inf / +inf. Works element-wise on NumPy arrays as well as on scalars.
    """
    eps = 0.0000000001
    numerator = input + eps
    denominator = 1 - input + eps
    return np.log(numerator / denominator)

def vectorizing(array_size, init, interv, input):
    """Return a one-hot vector of length *array_size* encoding *input*.

    The hot position is ``array_size // 2 - 1`` shifted by the number of
    whole *interv* steps between *init* and *input* (floor division), so
    ``input == init`` lands just left of the middle of the vector.

    The computed position is used as a plain NumPy index: a negative
    position counts from the end of the vector, and a position outside the
    vector raises ``IndexError``.
    """
    centre = array_size // 2 - 1
    steps_from_init = (input - init) // interv
    one_hot = np.zeros(array_size)
    one_hot[int(centre + steps_from_init)] = 1
    return one_hot

def quantifying(array_size, init, interval, input):
    """Return a thermometer ("unary") encoding of *input*.

    The result is a vector of length *array_size* whose first ``k`` entries
    are 1 and whose remaining entries are 0, where
    ``k = (input - init) // interval + 1`` (floor division).

    A negative ``k`` gives an all-zero vector; a ``k`` larger than
    *array_size* gives an all-one vector.
    """
    encoded = np.zeros(array_size)
    n_active = int((input - init) // interval + 1)
    # A negative count must not reach the slice below: ``encoded[:-2] = 1``
    # would fill everything except the last two entries.
    if n_active < 0:
        return encoded
    encoded[:n_active] = 1
    return encoded

#--------------------------------------------------------------------

# Training driver.
#
# For every set index in [start_set, end_set] a fresh Brain is trained on
# randomly played Blackjack episodes: the initial observation plus the random
# action sequence is the network input, and the thermometer-encoded total
# reward of the episode is the training target.  The trained parameters of
# each set are saved to ``.npy`` files tagged with the set index.
#
# Lines marked with "<<<<<<<<<<<<" are the values meant to be tuned by hand.

# Imported once, by name, instead of a star-import repeated on every set.
from Brain_for_learning import Brain

start_set     = 1    # <<<<<<<<<<<<
end_set       = 10   # <<<<<<<<<<<<

n_sets        = end_set - start_set + 1

state_bits    = 100  # thermometer width used for each of state[0] and state[1]
max_actions   = 10   # <<<<<<<<<<<<  upper bound on random actions per episode

for n in range(n_sets):

    # Input layer = two thermometer-coded observations + one flag bit
    # + a 2-way one-hot slot for each possible action step (221 in total).
    network_size              = np.array([2 * state_bits + 1 + 2 * max_actions, 100, 100, 100, 100])  # <<<<<<<<<<<<
    slope                     = 25                                                 # <<<<<<<<<<<<
    alpha                     = 0.000001                                           # <<<<<<<<<<<<
    epoch_of_learning         = 50000000                                           # <<<<<<<<<<<<
    drop_rate                 = 0.2                                                # <<<<<<<<<<<<
    momentum_rate             = 0.9                                                # <<<<<<<<<<<<

    Machine                   = Brain(network_size, slope, alpha, epoch_of_learning, drop_rate, momentum_rate)

    # Common prefix of every parameter file belonging to this set.
    file_prefix               = "100x100x100_25_0.000001_50m_0.2_[" + str(start_set + n) + "]_"

    retrain = False                                                                # <<<<<<<<<<<<
    if retrain:
        # NOTE(security): allow_pickle=True executes pickled payloads while
        # loading -- only load parameter files produced by this script.
        Machine.weight_list            = np.load(file_prefix + "weight_list.npy"          , allow_pickle=True)
        Machine.slope_list             = np.load(file_prefix + "slope_list.npy"           , allow_pickle=True)
        Machine.weight_list_momentum   = np.load(file_prefix + "weight_list_momentum.npy" , allow_pickle=True)
        Machine.slope_list_momentum    = np.load(file_prefix + "slope_list_momentum.npy"  , allow_pickle=True)

    # One environment per set, reset for every episode.  Creating it inside
    # the episode loop built a new environment per episode and only ever
    # closed the last one.
    # NOTE(review): relies on the classic Gym API used below (reset() returns
    # the observation, step() returns a 4-tuple) -- confirm the installed gym.
    env = gym.make('Blackjack-v0')                                                 # <<<<<<<<<<<<
    try:
        for i_episode in range(epoch_of_learning):

            print(i_episode)

            final_reward         = 0
            done                 = False

            state                = env.reset()
            #env.render()                                                          # <<<<<<<<<<<<

            # Optional random warm-up moves before the observation is recorded.
            random_initial_moves = 0                                               # <<<<<<<<<<<<
            for _ in range(random_initial_moves):
                action                    = env.action_space.sample()
                state, reward, done, info = env.step(action)
                # env.render()                                                     # <<<<<<<<<<<<
                final_reward             += reward
                if done:
                    # Never keep stepping an episode that has already ended.
                    break

            # Encode the observation the action sequence starts from.
            # NOTE(review): per the Gym Blackjack docs the observation is
            # (player sum, dealer's showing card, usable ace) -- confirm.
            state_0            = quantifying(state_bits, 0, 1, state[0])           # <<<<<<<<<<<<
            state_1            = quantifying(state_bits, 0, 1, state[1])
            state_2            = np.ones(1) if state[2] else np.zeros(1)

            # Play random actions until the episode ends (or max_actions is
            # reached), recording each action as a 2-way one-hot in its slot.
            action_list = np.zeros(2 * max_actions)                                # <<<<<<<<<<<<
            for t in range(max_actions):
                if done:                                                           # <<<<<<<<<<<<
                    break
                action                          = env.action_space.sample()
                state, reward, done, info       = env.step(action)
                #env.render()                                                      # <<<<<<<<<<<<
                action_list[t * 2 + action]     = 1
                final_reward                   += reward

            # Target: accumulated reward, thermometer-coded from -1 in steps of 0.02.
            reward                = quantifying(100, -1, 0.02, final_reward)       # <<<<<<<<<<<<

            features              = np.concatenate((state_0,
                                                    state_1,
                                                    state_2,
                                                    action_list))

            Machine.learn_batch(np.atleast_2d(features), np.array([reward]))
    finally:
        # Always release the environment, even if training is interrupted.
        env.close()


    np.save(file_prefix + "weight_list"             , Machine.weight_list                 ) # <<<<<<<<<<<<
    np.save(file_prefix + "slope_list"              , Machine.slope_list                  ) # <<<<<<<<<<<<
    np.save(file_prefix + "weight_list_momentum"    , Machine.weight_list_momentum        ) # <<<<<<<<<<<<
    np.save(file_prefix + "slope_list_momentum"     , Machine.slope_list_momentum         ) # <<<<<<<<<<<<