In this lab session, you will implement a multilayer perceptron (MLP) neural network in NumPy.
Start by launching Spyder from a terminal:
spyder &
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(0)
#%% DEFINE AND PLOT DATA
def make_meshgrid(x, y, h=.02):
    # build a grid of points covering the data range (used to draw decision boundaries)
    x_min, x_max = x.min() - 1, x.max() + 1
    y_min, y_max = y.min() - 1, y.max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    return xx, yy
style_per_class = ['xb', 'or', 'sg']
X = np.array([[1.2, 2.3, -0.7, 3.2, -1.3],[-3.4, 2.8, 1.2, -0.4, -2.3]]).T
y = np.array([0,0,1,1,2])
C = len(style_per_class)
N = X.shape[0]
xx, yy = make_meshgrid(X[:,0].ravel(), X[:,1].ravel(), h=0.1)
plt.figure(1)
ax = plt.subplot(111)
ax.set_xlim(xx.min(), xx.max())
ax.set_ylim(yy.min(), yy.max())
plt.grid(True)
for i in range(C):
    x_c = X[(y == i).ravel(), :]
    plt.plot(x_c[:, 0], x_c[:, 1], style_per_class[i])
plt.pause(0.1)
class MLP:
    def __init__(self, H):
        self.C = 3  # number of classes
        self.D = 2  # input dimension
        self.H = H  # hidden layer size
        # parameters (uniform initialization in a range set by the fan-in of each layer)
        self.W1 = (np.sqrt(6./self.D))*(2*(np.random.uniform(size=(self.D, self.H)) - 0.5))
        self.b1 = (1./np.sqrt(self.D))*(2*(np.random.uniform(size=(self.H)) - 0.5))
        self.W3 = (np.sqrt(6./self.H))*(2*(np.random.uniform(size=(self.H, self.C)) - 0.5))
        self.b3 = (1./np.sqrt(self.H))*(2*(np.random.uniform(size=(self.C)) - 0.5))
        # gradients (accumulated during the backward pass)
        self.dl_dW1 = np.zeros_like(self.W1)
        self.dl_db1 = np.zeros_like(self.b1)
        self.dl_dW3 = np.zeros_like(self.W3)
        self.dl_db3 = np.zeros_like(self.b3)

    def forward(self, X):
        X1 = X.dot(self.W1) + self.b1   # NxH, first fully connected layer
        X2 = np.maximum(0., X1)         # NxH, ReLU
        O = X2.dot(self.W3) + self.b3   # NxC, second fully connected layer (class scores)
        return X, X1, X2, O

    def backward(self, dl_dO, O, X2, X1, X0):
        # backpropagation of dl_dO through the last fully connected layer
        dl_dX2 = ???
        self.dl_dW3 += ???
        self.dl_db3 += ???
        # backpropagation of dl_dX2 through the ReLU
        dl_dX1 = ???
        # backpropagation of dl_dX1 through the first fully connected layer
        self.dl_dW1 += ???
        self.dl_db1 += ???
        return
Task: implement the backward function, using the equations derived during the lab session.
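For reference, one possible way to fill in the blanks is sketched below. It simply applies the standard chain rule to a linear layer (O = X2.W3 + b3) followed by a ReLU, using the variable names of the skeleton above; it is only a sketch to compare against the equations you derived, not necessarily the exact form expected in your report.

def backward(self, dl_dO, O, X2, X1, X0):
    # last fully connected layer: O = X2.W3 + b3
    dl_dX2 = dl_dO.dot(self.W3.T)        # NxH
    self.dl_dW3 += X2.T.dot(dl_dO)       # HxC
    self.dl_db3 += dl_dO.sum(axis=0)     # C
    # ReLU: the gradient only flows where the pre-activation X1 is positive
    dl_dX1 = dl_dX2 * (X1 > 0)           # NxH
    # first fully connected layer: X1 = X0.W1 + b1
    self.dl_dW1 += X0.T.dot(dl_dX1)      # DxH
    self.dl_db1 += dl_dX1.sum(axis=0)    # H
    return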
def logsoftmax(x):
    # shift by the row-wise maximum for numerical stability (log-sum-exp trick)
    x_shift = x - np.amax(x, axis=1, keepdims=True)
    return x_shift - np.log(np.exp(x_shift).sum(axis=1, keepdims=True))

def softmax(x):
    e_x = np.exp(x - np.amax(x, axis=1, keepdims=True))
    return e_x / e_x.sum(axis=1, keepdims=True)
def multinoulliCrossEntropyLoss(O, y):
    N = y.shape[0]
    P = softmax(O.astype('double'))
    log_p = logsoftmax(O.astype('double'))
    a = log_p[np.arange(N), y]   # log-probability of the correct class for each sample
    l = -a.sum()/N               # average cross-entropy loss
    # gradient of the loss with respect to the scores O
    dl_do = P
    dl_do[np.arange(N), y] -= 1
    dl_do = dl_do/N
    return (l, dl_do)
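In equation form, the loss and the gradient returned by multinoulliCrossEntropyLoss are, with $p_{n,c}$ the softmax probabilities of the scores $O$:
\[
\ell = -\frac{1}{N}\sum_{n=1}^{N} \log p_{n,\,y_n},
\qquad
\frac{\partial \ell}{\partial O_{n,c}} = \frac{1}{N}\bigl(p_{n,c} - \mathbb{1}[c = y_n]\bigr).
\]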
class GradientDescentWithMomentum:
    def __init__(self, model, beta, lr):
        self.model = model
        self.beta = beta
        self.lr = lr
        # momentum terms (exponential moving averages of the gradients)
        self.VW1 = np.zeros_like(self.model.W1)
        self.Vb1 = np.zeros_like(self.model.b1)
        self.VW3 = np.zeros_like(self.model.W3)
        self.Vb3 = np.zeros_like(self.model.b3)

    def step(self):
        self.VW1 = self.beta*self.VW1 + (1.0 - self.beta)*self.model.dl_dW1
        self.model.W1 -= self.lr*self.VW1
        self.VW3 = self.beta*self.VW3 + (1.0 - self.beta)*self.model.dl_dW3
        self.model.W3 -= self.lr*self.VW3
        self.Vb1 = self.beta*self.Vb1 + (1.0 - self.beta)*self.model.dl_db1
        self.model.b1 -= self.lr*self.Vb1
        self.Vb3 = self.beta*self.Vb3 + (1.0 - self.beta)*self.model.dl_db3
        self.model.b3 -= self.lr*self.Vb3

    def zero_gradients(self):
        self.model.dl_dW1.fill(0.)
        self.model.dl_db1.fill(0.)
        self.model.dl_dW3.fill(0.)
        self.model.dl_db3.fill(0.)
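Each call to step therefore applies, to every parameter $\theta \in \{W_1, b_1, W_3, b_3\}$, the momentum update
\[
V \leftarrow \beta V + (1-\beta)\,\frac{\partial \ell}{\partial \theta},
\qquad
\theta \leftarrow \theta - \mathrm{lr}\cdot V.
\]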
def plot_contours(ax, model, xx, yy, **params):
    """Plot the decision boundaries for a classifier.

    Parameters
    ----------
    ax: matplotlib axes object
    model: neural network
    xx: meshgrid ndarray
    yy: meshgrid ndarray
    params: dictionary of params to pass to contourf, optional
    """
    _, _, _, O = model.forward(np.c_[xx.ravel(), yy.ravel()])
    pred = np.argmax(O, axis=1)
    Z = pred.reshape(xx.shape)
    out = ax.contourf(xx, yy, Z, **params)
    return out
#%% HYPERPARAMETERS
H = 300
lr = 1e-2 #learning rate
beta = 0.9 #momentum parameter
n_epoch = 10000 #number of iterations
model = MLP(H)
optimizer = GradientDescentWithMomentum(model, beta, lr)
for i in range(n_epoch):
    # Forward pass
    X0, X1, X2, O = model.forward(X)
    # Compute loss
    l, dl_dO = multinoulliCrossEntropyLoss(O, y)
    # Print loss and classification accuracy
    pred = np.argmax(O, axis=1)
    acc = (pred == y).astype('float').sum()/N
    print('Iter {} | Loss = {} | Training Accuracy = {}%'.format(i, l, acc*100))
    # Backward pass (compute gradients)
    optimizer.zero_gradients()
    model.backward(dl_dO, O, X2, X1, X0)
    # Update parameters
    optimizer.step()
    if np.mod(i, 10) == 0:
        # Plot the current decision boundary
        ax.cla()
        for c in range(C):  # distinct loop variable, so the epoch counter i is not shadowed
            x_c = X[(y == c).ravel(), :]
            plt.plot(x_c[:, 0], x_c[:, 1], style_per_class[c])
        plot_contours(ax, model, xx, yy, cmap=plt.cm.coolwarm, alpha=0.8)
        plt.pause(0.5)
Task: once the backward function is implemented, you should observe that training on this toy example "works", i.e. the loss gradually decreases toward zero and the classification accuracy quickly reaches 100%.
Task: inspect the intermediate variables (X1, X2, O) and the derivatives (in particular dl_dW3 and dl_dW1). What do you observe? Why?
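For instance, one can print their shapes right after the training loop; a minimal sketch, assuming the script above has just been run and using its variable names:

print('X1:', X1.shape, '| X2:', X2.shape, '| O:', O.shape)
print('dl_dW1:', model.dl_dW1.shape, '| dl_db1:', model.dl_db1.shape)
print('dl_dW3:', model.dl_dW3.shape, '| dl_db3:', model.dl_db3.shape)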