# -*- coding: utf-8 -*-
"""
Created on Mon May 5 14:02:49 2025
@author: Max Robinson
"""
import math
from graphviz import Digraph
#%matplotlib inline
class Value:
    def __init__(self, data, _children=(), _op=''):
        self.data = data
        self.grad = 0.0
        self._prev = set(_children)
        self._backward = lambda: None
        self._op = _op
    def __repr__(self):
        return f"Value(data={self.data})"
    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')
        def _backward():
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad
        out._backward = _backward
        return out
    def __radd__(self, other):
        return self.__add__(other)
    def __neg__(self):
        return self * -1
    def __sub__(self, other):
        return self + (-other)
    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')
        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward
        return out
    def __rmul__(self, other):
        return self * other
    def __pow__(self, other):
        assert isinstance(other, (int, float)), "only int or float powers"
        out = Value(self.data**other, (self,), f"**{other}")
        def _backward():
            self.grad += other * self.data**(other - 1) * out.grad
        out._backward = _backward
        return out
    def __truediv__(self, other):
        return self * other**-1
    def exp(self):
        x = self.data
        out = Value(math.exp(x), (self,), 'exp')
        def _backward():
            self.grad += out.data * out.grad
        out._backward = _backward
        return out
    def tanh(self):
        n = self.data
        t = (math.exp(2*n) - 1) / (math.exp(2*n) + 1)
        out = Value(t, (self,), 'tanh')
        def _backward():
            # d(tanh)/dn = sech^2(n) = 4 / (e^(2n) + e^(-2n) + 2)
            self.grad += 4 / (math.exp(2*n) + math.exp(-2*n) + 2) * out.grad
        out._backward = _backward
        return out
    def backward(self):
        # topological order: append a node only after all of its children,
        # then apply the chain rule from the output back to the leaves
        topo = []
        visited = set()
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)
        self.grad = 1.0
        for node in reversed(topo):
            node._backward()
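# Quick sanity check one could run here (illustrative, commented out; a, b and f
# are example names only): for f = a*b + b**3 with a=2, b=3, the chain rule gives
# a.grad = b = 3 and b.grad = a + 3*b**2 = 29. Note b is used twice, which
# exercises the topological ordering in backward().
#a, b = Value(2.0), Value(3.0)
#f = a*b + b**3
#f.backward()
#print(a.grad, b.grad)  # expect roughly 3.0 and 29.0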
# builds the graph's nodes and edges set
def trace(root):
    nodes, edges = set(), set()
    def build(v):
        if v not in nodes:
            nodes.add(v)
            for child in v._prev:
                edges.add((child, v))
                build(child)
    build(root)
    return nodes, edges

def draw_dot(root):
    dot = Digraph(format='svg', graph_attr={'rankdir': 'LR'})
    nodes, edges = trace(root)
    for n in nodes:
        uid = str(id(n))
        dot.node(name=uid, label="{data %.4f}" % (n.data, ), shape="record")
        # '' evaluates to false
        if n._op:
            dot.node(name=uid + n._op, label=n._op)
            dot.edge(uid + n._op, uid)
    for n1, n2 in edges:
        dot.edge(str(id(n1)), str(id(n2)) + n2._op)
    return dot
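# Example usage sketch (assumes the system Graphviz binaries are installed in
# addition to the Python package): draw_dot(o).render('graph') would write
# graph.svg for the expression graph rooted at o, defined further below.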
a = Value(2)
b = Value(-2)
c = Value(-1)
e = a*b + c
f = Value(2)
#print(a/b)
L = e + f # i.e. loss function (how far off the output is from being correct)
# manual back propagation
L.grad = 1.0 # dL/dL = 1
f.grad = 1.0 # dL/df = 1
e.grad = 1.0 # dL/de = 1
a.grad = e.grad*b.data # dL/da = dL/de * de/da
b.grad = e.grad*a.data # dL/db = dL/de * de/db
c.grad = 1.0 # dL/dc = dL/de * de/dc
# building a neuron
# inputs x1, x2
x1 = Value(2.0)
w1 = Value(-3.0)
x2 = Value(-0.0)
w2 = Value(1.0)
b = Value(6.88137)
# weighted sum of the inputs plus the bias, i.e. modelling the neuron
x1w1 = x1*w1
x2w2 = x2*w2
x1w1x2w2 = x1w1 + x2w2
n = x1w1x2w2 + b # the neuron's pre-activation value
o = n.tanh() # squashing n between -1 and 1
# or, equivalently, build tanh out of exp
e = (2*n).exp()
o = (e - 1)/ (e + 1)
#print(o)
# back propagating the neuron
o.backward()
#print(w1.grad)
# sech^2(n) = 4/(e^2n + e^-2n + 2)
# n.grad = 4 / (math.exp(2*n.data) + math.exp(-2*n.data) + 2)
# b.grad = n.grad * 1.0 # do/dn * dn/db
# x1w1x2w2.grad = n.grad * 1.0
# x1w1.grad = x1w1x2w2.grad * 1.0
# x2w2.grad = x1w1x2w2.grad * 1.0
# x1.grad = x1w1.grad * w1.data # x1 * w1 = x1w1
# w1.grad = x1w1.grad * x1.data
# x2.grad = x2w2.grad * w2.data
# w2.grad = x2w2.grad * x2.data
#draw_dot(o)
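# Sanity check one could add here (illustrative): by the chain rule,
# do/dw1 = (1 - tanh(n)^2) * x1, so w1.grad should match
# x1.data * (1 - o.data**2), roughly 1.0 for these inputs.
#print(w1.grad, x1.data * (1 - o.data**2))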
#------------------------------------------------------------------------------
# Using pytorch API
# import torch
# x1 = torch.Tensor([2.0]).double() #from float32 to float64
# x1.requires_grad = True
# w1 = torch.Tensor([-3.0]).double(); w1.requires_grad = True
# x2 = torch.Tensor([0.0]).double(); x2.requires_grad = True
# w2 = torch.Tensor([1.0]).double(); w2.requires_grad = True
# b = torch.Tensor([6.88137358]).double(); b.requires_grad = True
# n = x1*w1 + x2*w2 + b
# o = torch.tanh(n)
#print(o.data.item())
#------------------------------------------------------------------------------
#neural net
import random
class Neuron:
    def __init__(self, nin):
        # one random weight per input (nin of them), plus a random bias, all in [-1, 1]
        self.w = [Value(random.uniform(-1, 1)) for _ in range(nin)]
        self.b = Value(random.uniform(-1, 1))
    def __call__(self, x):
        # w * x + b: the weighted sum of the inputs plus the bias,
        # equivalent to looping over zip(self.w, x) and accumulating wi*xi
        activation = sum((wi*xi for wi, xi in zip(self.w, x)), self.b)
        out = activation.tanh()
        return out
    def parameters(self):
        return self.w + [self.b]
x = [2.0, 3.0]
n = Neuron(2)
#print(n(x)) # this will call __call__
class Layer:
    def __init__(self, nin, nout): # nin: number of inputs per neuron in this layer, nout: number of neurons (i.e. outputs) in this layer
        self.neurons = [Neuron(nin) for _ in range(nout)]
    def __call__(self, x): # x: the input vector fed to every neuron in the layer
        outs = [n(x) for n in self.neurons]
        return outs[0] if len(outs) == 1 else outs
    def parameters(self):
        return [p for neuron in self.neurons for p in neuron.parameters()]
        # equivalent explicit loop:
        #params = []
        #for neuron in self.neurons:
        #    ps = neuron.parameters()
        #    params.extend(ps)
        #return params
class MLP:
    def __init__(self, nin, nouts): # nin: number of inputs, nouts: list with the number of neurons in each layer
        sz = [nin] + nouts # layer sizes, e.g. [nin, nout1, nout2, ...]
        self.layers = [Layer(sz[i], sz[i+1]) for i in range(len(nouts))]
    def __call__(self, x):
        for layer in self.layers:
            x = layer(x) # the output of each layer becomes the input of the next
        return x
    def parameters(self):
        return [p for layer in self.layers for p in layer.parameters()]
# a layer of 3 neurons, each taking 2 inputs (so 3 outputs)
l = Layer(2, 3)
#print(l(x))
# multi-layer perceptron with 3 inputs and 3 layers of 4, 4 and 1 neurons
mlp = MLP(3, [4, 4, 1])
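# Sanity check on parameters() (illustrative): this MLP should hold
# 4*(3+1) + 4*(4+1) + 1*(4+1) = 41 trainable Values.
#print(len(mlp.parameters()))  # expected: 41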
# -----example data-----------------------------------------------------------------------------
# inputs
xs = [[2.0, 3.0, -1.0], # one of these represents the three inputs to a single run of the network
[3.0, -1.0, 0.5],
[0.5, 1.0, 1.0],
[1.0, 1.0, -1.0]]
# desired outputs
ys = [1.0, -1.0, -1.0, 1.0]
# use the mean square error loss.
# I.e. find the distance between the target and the predicted value
# gradient descent
for k in range(30): # 30 training steps
    # forward pass
    ypred = [mlp(x) for x in xs]
    loss = sum((yout - ygt)**2 for ygt, yout in zip(ys, ypred))
    # backward pass
    # zero grad: reset gradients so they don't accumulate across iterations
    for p in mlp.parameters():
        p.grad = 0.0
    loss.backward()
    # update: step each parameter against its gradient
    for p in mlp.parameters():
        #print(f"before={p.data}")
        p.data += -0.1 * p.grad
        #print(f"after={p.data}")
    #print(k, loss)
#print(ypred)
print(mlp(xs[0]))
print(mlp(xs[1]))
print(mlp(xs[2]))
print(mlp(xs[3]))
print(f"loss={loss}")