Introduction¶
Forward Operation and 4 Backpropagation Formulas¶
The forward operation is fully defined by the following 3 formulas.
$$z^l = w^l a^{l-1} + b^l$$

$$a^l = \sigma(z^l)$$

$$C = \frac{1}{2} \|y(x)-a^L(x)\|^2$$
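As an illustrative sketch (not part of the original derivation), the snippet below runs the first two formulas on a toy network to make the shape conventions explicit: $w^l$ has shape $n_l \times n_{l-1}$ and activations are stored as column vectors. The layer sizes and random inputs are arbitrary.

import numpy as np

def sigmoid(z):
    return 1.0/(1.0+np.exp(-z))

# a toy network with layer sizes 4 -> 3 -> 2 (sizes chosen arbitrarily)
sizes = [4, 3, 2]
biases = [np.random.randn(n, 1) for n in sizes[1:]]
weights = [np.random.randn(n, m) for m, n in zip(sizes[:-1], sizes[1:])]

a = np.random.randn(sizes[0], 1)   # a^0: the input as a column vector
for w, b in zip(weights, biases):
    z = np.dot(w, a) + b           # z^l = w^l a^{l-1} + b^l
    a = sigmoid(z)                 # a^l = sigma(z^l)
# a now holds a^L, the network output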
Set $\delta_j^l = \frac{\partial C}{\partial z_j^l}$. The following 4 backpropagation formulas are direct consequences of the 3 formulas above.

$$\delta^L = \nabla_a C \odot \sigma' (z^L)$$

$$\delta^l = \big((w^{l+1})^T \delta^{l+1} \big) \odot \sigma' (z^l)$$

$$\frac{\partial C}{\partial b_j^l} = \delta_j^l$$

$$\frac{\partial C}{\partial w_{jk}^{l}} = a_k^{l-1} \delta_j^l$$

These 4 formulas make it possible to compute the derivatives of the cost function with respect to all the weights and biases efficiently. The computation starts at the output layer and proceeds backwards to the first hidden layer, hence the name backpropagation algorithm.
Derivation of 4 Backpropagation Formulas¶
1st Formula
$$\delta_j^L = \frac{\partial C}{\partial z_j^L} = \sum_k \frac{\partial C}{\partial a_k^L} \frac{\partial a_k^L}{\partial z_j^L} = \frac{\partial C}{\partial a_j^L} \frac{\partial a_j^L}{\partial z_j^L} = \frac{\partial C}{\partial a_j^L} \sigma' (z_j^L)$$

Only the $k = j$ term survives in the sum because $a_k^L = \sigma(z_k^L)$ depends on $z_k^L$ alone.
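One further step is worth making explicit, since it is what the implementation below relies on: for the quadratic cost defined above, $\frac{\partial C}{\partial a_j^L} = a_j^L - y_j$, so the first formula specializes to

$$\delta^L = (a^L - y) \odot \sigma'(z^L).$$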
2nd Formula

$$\delta_j^l = \frac{\partial C}{\partial z_j^l} = \sum_k \frac{\partial C}{\partial z_k^{l+1}} \frac{\partial z_k^{l+1}}{\partial z_j^l} = \sum_k \frac{\partial z_k^{l+1}}{\partial z_j^l} \delta_k^{l+1}$$

$$z_k^{l+1} = \sum_j w^{l+1}_{kj} a^l_j + b^{l+1}_k = \sum_j w^{l+1}_{kj} \sigma(z^l_j) + b^{l+1}_k$$

$$\frac{\partial z_k^{l+1}}{\partial z_j^{l}} = w^{l+1}_{kj} \sigma'(z^l_j)$$

$$\delta_j^l = \Big(\sum_k w^{l+1}_{kj} \delta_k^{l+1}\Big) \sigma'(z^l_j)$$
3rd Formula

$$\frac{\partial C}{\partial b_j^l} = \sum_k \frac{\partial C}{\partial z_k^l}\frac{\partial z_k^l}{\partial b_j^l} = \frac{\partial C}{\partial z_j^l}\frac{\partial z_j^l}{\partial b_j^l} = \frac{\partial C}{\partial z_j^l} = \delta_j^l$$
4th Formula

$$\frac{\partial C}{\partial w_{jk}^{l}} = \sum_m \frac{\partial C}{\partial z_m^l}\frac{\partial z_m^l}{\partial w_{jk}^{l}} = \frac{\partial C}{\partial z_j^l}\frac{\partial z_j^l}{\partial w_{jk}^{l}} = \frac{\partial C}{\partial z_j^l}a^{l-1}_k = a_k^{l-1} \delta_j^l$$

Implementation of Batch Gradient Descent¶
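In the implementation below, a whole batch is processed at once: each column of X (and of Y) is one training example, so the activations and deltas become matrices whose columns correspond to examples. The derivatives returned by compute_derivative are accordingly summed over the batch,

$$\frac{\partial C}{\partial w^l} = \sum_x \delta^{l}(x)\,\big(a^{l-1}(x)\big)^T, \qquad \frac{\partial C}{\partial b^l} = \sum_x \delta^{l}(x),$$

and the gradient-descent step subtracts $\eta$ times these sums (no division by the batch size).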
import numpy as np

class Network:

    def __init__(self, sizes):
        # biases[i] is a column vector; weights[i] has shape (sizes[i+1], sizes[i])
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        self.weights = [np.random.randn(x, y) for x, y in zip(sizes[1:], sizes[0:-1])]

    def compute_derivative(self, X, Y):
        # X and Y hold one training example per column
        all_z = []
        all_a = []
        all_delta = []
        # forward pass to compute the activations
        a = X
        all_a.append(a)
        for weight, bias in zip(self.weights, self.biases):
            z = np.dot(weight, a) + np.repeat(bias, X.shape[1]).reshape(-1, X.shape[1])
            all_z.append(z)
            a = sigmoid(z)
            all_a.append(a)
        # backward pass to compute the deltas, from the output layer inwards
        delta = np.multiply(all_a[-1] - Y, sigmoid_prime(all_z[-1]))
        all_delta.append(delta)
        for weight, z in zip(self.weights[1:][::-1], all_z[:-1][::-1]):
            delta = np.multiply(np.dot(np.transpose(weight), delta), sigmoid_prime(z))
            all_delta.append(delta)
        all_a = all_a[:-1]            # drop the final activation so all_a[i] pairs with all_delta[i] as a^(l-1) with delta^l
        all_delta = all_delta[::-1]
        # compute derivatives based on activations and deltas, summed over the batch
        weight_deris = [np.transpose(np.dot(a, np.transpose(delta))) for a, delta in zip(all_a, all_delta)]
        bias_deris = [np.sum(delta, axis=1, keepdims=True) for delta in all_delta]
        return weight_deris, bias_deris

    def BGD(self, training_data, epochs, batch_size, eta, test_data=None):
        sample_num = training_data[0].shape[0]
        for epoch in range(epochs):
            print '>>> epoch #', epoch
            indices = np.arange(sample_num)
            np.random.shuffle(indices)
            for batch in np.arange(0, sample_num, batch_size):
                l_bound = batch
                r_bound = batch + batch_size if batch + batch_size < sample_num else sample_num
                X_batch = np.transpose(training_data[0][indices[l_bound:r_bound]])
                Y_batch = np.transpose(training_data[1][indices[l_bound:r_bound]])
                # compute the derivatives with respect to the instances in this batch
                weight_deris, bias_deris = self.compute_derivative(X_batch, Y_batch)
                # adjust weights and biases so as to favour these instances
                for i in range(len(self.weights)):
                    self.weights[i] = self.weights[i] - eta * weight_deris[i]
                    self.biases[i] = self.biases[i] - eta * bias_deris[i]
            self.evaluate(training_data, 'train rate\t')
            if test_data:
                self.evaluate(test_data, 'test rate\t')

    def evaluate(self, test_data, label=''):
        # classification rate: fraction of examples whose most active output unit matches the label
        X = test_data[0]
        Y = np.argmax(test_data[1], axis=1)
        a = np.transpose(X)
        for weight, bias in zip(self.weights, self.biases):
            z = np.dot(weight, a) + np.repeat(bias, a.shape[1]).reshape(-1, a.shape[1])
            a = sigmoid(z)
        print label, float(np.sum((np.argmax(a, axis=0) == Y).astype('int'))) / len(Y)

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def sigmoid_prime(z):
    temp = sigmoid(z)
    return temp * (1 - temp)
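As a sanity check on the 4 formulas (an illustrative sketch, not part of the original post), the analytic derivatives returned by compute_derivative can be compared against a centered finite difference of the batch cost $\frac{1}{2}\sum_x \|a^L(x)-y(x)\|^2$. The helper names total_cost and gradient_check, the toy layer sizes, and the random data below are hypothetical.

def total_cost(net, X, Y):
    # forward pass on a batch whose columns are individual examples
    a = X
    for weight, bias in zip(net.weights, net.biases):
        a = sigmoid(np.dot(weight, a) + bias)
    return 0.5 * np.sum((a - Y) ** 2)

def gradient_check(net, X, Y, l=0, j=0, k=0, eps=1e-5):
    # analytic derivative of the batch cost w.r.t. a single weight ...
    weight_deris, _ = net.compute_derivative(X, Y)
    analytic = weight_deris[l][j, k]
    # ... versus a centered finite difference of the same cost
    net.weights[l][j, k] += eps
    c_plus = total_cost(net, X, Y)
    net.weights[l][j, k] -= 2 * eps
    c_minus = total_cost(net, X, Y)
    net.weights[l][j, k] += eps   # restore the original weight
    numeric = (c_plus - c_minus) / (2 * eps)
    return analytic, numeric

# example: the two numbers should agree to several decimal places
net = Network([4, 3, 2])
X = np.random.randn(4, 5)   # 5 examples as columns
Y = np.random.randn(2, 5)
print gradient_check(net, X, Y, l=1, j=0, k=2)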
Comparison with Vanilla SVM¶
Preprocessing¶
import cPickle, gzip
import numpy as np
import matplotlib.pyplot as plt
from keras.utils import np_utils
from sklearn import svm
from keras.datasets import mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = x_train.reshape(-1,784).astype('float64')/255.
y_train = np_utils.to_categorical(y_train).astype('float64')
x_test = x_test.reshape(-1,784).astype('float64')/255.
y_test = np_utils.to_categorical(y_test).astype('float64')
plt.imshow(x_train[0].reshape(28,28)); plt.show()
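A quick sanity check on the preprocessed arrays (an added note, not in the original post): MNIST provides 60,000 training and 10,000 test images, each flattened to 784 pixels, with labels one-hot encoded over 10 classes.

print x_train.shape, y_train.shape   # (60000, 784) (60000, 10)
print x_test.shape, y_test.shape     # (10000, 784) (10000, 10)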
Neural Network Training and Testing¶
# two hidden layers of 30 units each; 20 epochs, batch size 50, learning rate 0.1
net = Network([784, 30, 30, 10])
net.BGD((x_train, y_train), 20, 50, 0.1, (x_test, y_test))
Vanilla SVM Training and Testing¶
y_train = np.argmax(y_train,1)
y_test = np.argmax(y_test,1)
# train
clf = svm.SVC()
clf.fit(x_train, y_train)
# test
predictions = [int(a) for a in clf.predict(x_test)]
print 'test rate\t', float(sum(int(a == y) for a, y in zip(predictions, y_test)))/len(y_test)