import numpy as np
We will build a two-layer network (one hidden layer) with a configurable number of units in each layer.
Ni = 2   # number of input features
Nh = 2   # number of hidden units
No = 3   # number of output classes
sig = 0.01  # scale of the random weight initialization
W1 = sig * np.random.rand(Ni, Nh)  # input-to-hidden weights
b1 = np.zeros((1, Nh))             # hidden biases start at zero
W2 = sig * np.random.rand(Nh, No)  # hidden-to-output weights
b2 = np.zeros((1, No))             # output biases start at zero
The hidden layer uses a ReLU activation: $h = \max(0,\, xW_1 + b_1)$. The output layer uses a softmax activation: $o_i = \frac{e^{\text{scores}_i}}{\sum_j e^{\text{scores}_j}}$, where $\text{scores} = hW_2 + b_2$.
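As a quick standalone illustration (not part of the network code), the softmax of the scores $(1, 2, 3)$ is roughly $(0.09, 0.24, 0.67)$:
# Softmax of a single row of scores, computed with plain numpy
scores = np.array([[1.0, 2.0, 3.0]])
probs = np.exp(scores)
probs /= np.sum(probs, axis=1, keepdims=True)
print(probs)  # approximately [[0.09 0.24 0.67]]; the row sums to 1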
def forward(x):
    # Hidden layer: affine transform followed by ReLU
    h = np.maximum(0, np.dot(x, W1) + b1)
    # Output layer: affine transform followed by softmax
    score = np.dot(h, W2) + b2
    o = np.exp(score)
    o /= np.sum(o, axis=1, keepdims=True)  # normalize each row into probabilities
    return o
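A minimal sanity check, assuming the initialization cell above has been run: every row of the output should be a probability distribution, i.e. sum to 1. The array x_check is just an arbitrary small batch chosen for this check.
x_check = np.array([[1.0, 3.0], [3.0, -1.0]])
o_check = forward(x_check)
print(o_check)
print(np.sum(o_check, axis=1))  # each entry should be 1.0 (up to floating-point error)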
To train the network we need the gradient of the loss with respect to every parameter, so we compute the partial derivatives with backpropagation.
def backward(x, y):
    #### Forward pass (repeated here so the intermediate values are available)
    h = np.maximum(0, np.dot(x, W1) + b1)
    score = np.dot(h, W2) + b2
    o = np.exp(score)
    o /= np.sum(o, axis=1, keepdims=True)
    #### Backward pass
    n_samples = x.shape[0]
    # Gradient of the mean cross-entropy loss w.r.t. the scores: softmax output minus one-hot labels
    dscore = np.copy(o)
    dscore[range(n_samples), y] -= 1
    dscore /= n_samples
    # Gradients of the output-layer parameters
    dW2 = np.dot(h.T, dscore)
    db2 = np.sum(dscore, axis=0, keepdims=True)
    # Backpropagate into the hidden layer; ReLU blocks the gradient where h <= 0
    dh = np.dot(dscore, W2.T)
    dh[h <= 0] = 0
    # Gradients of the input-layer parameters
    dW1 = np.dot(x.T, dh)
    db1 = np.sum(dh, axis=0, keepdims=True)
    return dW1, db1, dW2, db2
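The compact form of dscore follows from combining the softmax with the cross-entropy loss: for a single example with true class $y$, $\frac{\partial}{\partial \text{scores}_j}\left(-\log o_y\right) = o_j - \mathbb{1}[j = y]$, which is exactly the softmax output with 1 subtracted at the true class; dividing by n_samples then averages the gradient over the batch.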
def loss(x, y):
    # Mean cross-entropy: average negative log-probability of the true class
    p = forward(x)
    logloss = -np.log(p[range(x.shape[0]), y])
    return np.sum(logloss) / x.shape[0]
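An optional sketch for gaining confidence in backward: compare its analytic dW1 against a centered finite-difference estimate of the loss. The helper name numerical_grad_W1 and the step size eps are my own choices, not part of the original code.
def numerical_grad_W1(x, y, eps=1e-5):
    # Centered finite differences over each entry of the global W1
    grad = np.zeros_like(W1)
    for i in range(W1.shape[0]):
        for j in range(W1.shape[1]):
            old = W1[i, j]
            W1[i, j] = old + eps
            plus = loss(x, y)
            W1[i, j] = old - eps
            minus = loss(x, y)
            W1[i, j] = old  # restore the original weight
            grad[i, j] = (plus - minus) / (2 * eps)
    return grad
# dW1 returned by backward(x, y) should match numerical_grad_W1(x, y) to several decimal places.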
Example loss
from sklearn.datasets import make_classification
# A tiny hand-made dataset (overwritten below by a generated one)
x = np.array(((1, 3), (3, -1), (6, 9)))
y = np.array((0, 1, 1))
# Generate 20 samples with 2 informative features and 2 classes
x, y = make_classification(n_samples=20, n_features=2, n_informative=2,
                           n_redundant=0, n_repeated=0, n_classes=2,
                           n_clusters_per_class=1)
print(forward(x))
# With sig = 0.01 the scores are near zero, so this initial loss should be close to ln(3) ≈ 1.10
print('Example loss: {}'.format(loss(x, y)))
epochs = 10000
lr = 1e-2  # learning rate
for i in range(epochs):
    # Compute the gradients and take a gradient-descent step
    dW1, db1, dW2, db2 = backward(x, y)
    W1 -= lr * dW1
    b1 -= lr * db1
    W2 -= lr * dW2
    b2 -= lr * db2
    if i % 1000 == 0:
        print("Loss at iteration {}: {}".format(i, loss(x, y)))
print(forward(x))
print(y)
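After training, a quick way to read the output is to take the argmax class for each sample and compare it to the labels. This accuracy check is an addition, not part of the original code.
predictions = np.argmax(forward(x), axis=1)  # predicted class = most probable output
accuracy = np.mean(predictions == y)
print('Training accuracy: {}'.format(accuracy))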