The goal of this short project is to train a neural network on the MNIST dataset using only linear algebra implemented with NumPy. The MNIST dataset is a collection of 28x28 images of handwritten digits, each labeled with the integer it represents. The model as currently constructed correctly identifies about 90% of hand-drawn digits; with dedicated machine learning libraries, accuracy well over 98% can be achieved. The data for this project can be accessed here.
#Import modules
import numpy as np
#NumPy for all data processing and modeling
from matplotlib import pyplot as plt
#pyplot for image generation
from keras.datasets import mnist
#keras.datasets for loading the MNIST dataset
from time import perf_counter_ns
#Load the data and reshape it so each 28x28 image becomes a vector of 784 values
(train_X_2d, train_y), (test_X_2d, test_y) = mnist.load_data()
#Collapse the 28x28 images into 784-value rows and keep the first 10,000 training samples to shorten training time
train_X = np.reshape(train_X_2d, (60000, 784))[0:10000, :]
train_X_2d = train_X_2d[0:10000, :, :]
train_y = train_y[0:10000]
test_X = np.reshape(test_X_2d, (10000, 784))
m, n = train_X.shape
#Transpose so that each column holds one image
train_X = train_X.T
test_X = test_X.T
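As a quick sanity check (illustrative only, not part of the original run), the shapes after the transpose can be printed; each column should now hold one 784-pixel image.
#Illustrative sanity check: each column is now one 784-pixel image
print(train_X.shape, test_X.shape, train_y.shape)
#Expected: (784, 10000) (784, 10000) (10000,)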
#Generate random sample indices for inspection
nums = np.random.randint(0, 10000, 9)
plt.figure(1)
for i in range(9):
    plt.subplot(3, 3, i+1)
    plt.imshow(test_X[:, nums[i]].reshape(28, 28), cmap='Greys')
    plt.yticks([])
    plt.xticks([])
    plt.title('Actual:' + str(test_y[nums[i]]))
plt.suptitle('Sample of Images and Values from the Test Dataset', y=1)
plt.subplots_adjust(hspace=.5, wspace=1.5)
This project uses the sigmoid function as the activation function for the hidden layers and the softmax function on the output layer to generate probabilities, and thus predictions. While other activation functions train faster than the sigmoid, it was selected here because its bounded output keeps values from exceeding the limits of float64. (A numerically safer softmax and an alternative sigmoid derivative are sketched after the definitions below.)
def sigmoid(Z):
    return 1 / (1 + np.exp(-Z))

def softmax(Z):
    A = np.exp(Z) / sum(np.exp(Z))
    return A
def sigmoid_deriv(Z):
    dsig = (np.exp(-Z)) / ((np.exp(-Z) + 1)**2)
    #Large negative values of Z make np.exp(-Z) overflow, producing inf/inf = nan here; set those entries to 0, since the true derivative is vanishingly small there
    dsig = np.nan_to_num(dsig)
    return dsig
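If the overflow warnings or the nan_to_num patch become a concern, the sketch below shows one common alternative (illustrative only, not used in the results reported later): a softmax that subtracts the column-wise maximum before exponentiating, and a sigmoid derivative written through the identity sigmoid'(Z) = sigmoid(Z) * (1 - sigmoid(Z)).
def softmax_stable(Z):
    #Subtracting the column-wise max leaves the result unchanged but keeps np.exp from overflowing
    shifted = Z - np.max(Z, axis=0, keepdims=True)
    expZ = np.exp(shifted)
    return expZ / np.sum(expZ, axis=0, keepdims=True)

def sigmoid_deriv_alt(Z):
    #Equivalent derivative via sigmoid(Z) * (1 - sigmoid(Z)); avoids the inf/inf = nan case patched above
    s = sigmoid(Z)
    return s * (1 - s)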
def y_col_arr(Y):
    #Set the y values to be one-hot columns where the index of the solution is 1 and all other values are 0
    y_col_array = np.zeros((Y.size, Y.max() + 1))
    y_col_array[np.arange(Y.size), Y] = 1
    y_col_array = y_col_array.T
    return y_col_array
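A small illustrative example of y_col_arr with made-up labels shows the one-hot layout; note that the number of rows follows the largest label present, so on the full MNIST labels each column has 10 entries.
#Illustrative example with made-up labels; on the real MNIST labels the result has 10 rows
print(y_col_arr(np.array([5, 0, 4])))
#[[0. 1. 0.]
# [0. 0. 0.]
# [0. 0. 0.]
# [0. 0. 0.]
# [0. 0. 1.]
# [1. 0. 0.]]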
def get_predictions(A3, Y):
    #Find predicted values from the probability vectors
    predictions = np.argmax(A3, 0)
    print('Predictions:')
    print(predictions)
    print('Solutions:')
    print(Y)
    #Generate accuracy value
    accuracy = np.sum(predictions == Y) / Y.size
    return accuracy, predictions
def init_params():
    #Initialize the weights and biases as random values between -0.5 and 0.5
    W1 = np.random.rand(72, 784) - 0.5
    b1 = np.random.rand(72, 1) - 0.5
    W2 = np.random.rand(36, 72) - 0.5
    b2 = np.random.rand(36, 1) - 0.5
    W3 = np.random.rand(10, 36) - 0.5
    b3 = np.random.rand(10, 1) - 0.5
    return (W1, b1, W2, b2, W3, b3)
This neural network uses two hidden layers with 72 and 36 nodes respectively. This combination gives good accuracy without requiring too much time to run.
Gradient descent runs for 500 iterations to train the model.
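For a rough sense of model size, the number of trainable parameters implied by this 784-72-36-10 layout can be tallied (illustrative only; the names below are not part of the original notebook):
#Illustrative tally of trainable weights and biases for the 784-72-36-10 layout
layer_sizes = [784, 72, 36, 10]
total_params = sum(out * inp + out for inp, out in zip(layer_sizes[:-1], layer_sizes[1:]))
print(total_params)  #59518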
def forward_prop(W1, b1, W2, b2, W3, b3, X):
    #Compute node values from the X data and the current weights and biases
    Z1 = np.dot(W1, X) + b1
    A1 = sigmoid(Z1)
    Z2 = np.dot(W2, A1) + b2
    A2 = sigmoid(Z2)
    Z3 = np.dot(W3, A2) + b3
    A3 = softmax(Z3)
    return (Z1, A1, Z2, A2, Z3, A3)
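As an illustrative check, forward_prop run with freshly initialized parameters should return a 10 x 10000 matrix of probabilities whose columns each sum to 1 (the overflow warnings from np.exp are expected here and are suppressed during training below):
#Illustrative check with fresh random parameters (not the trained model)
params_check = init_params()
A3_check = forward_prop(*params_check, train_X)[-1]
print(A3_check.shape)                        #(10, 10000)
print(np.allclose(A3_check.sum(axis=0), 1))  #True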
def backward_prop(Z1, A1, Z2, A2, Z3, A3, W1, W2, W3, X, Y):
    #Find the gradient of the cost function to determine the dW and db values used to adjust the weights and biases
    y_col_arr_Y = y_col_arr(Y)
    dZ3 = A3 - y_col_arr_Y
    dW3 = 1/m * dZ3.dot(A2.T)
    db3 = 1/m * np.sum(dZ3, axis=1, keepdims=True)
    dZ2 = W3.T.dot(dZ3) * sigmoid_deriv(Z2)
    dW2 = 1/m * dZ2.dot(A1.T)
    db2 = 1/m * np.sum(dZ2, axis=1, keepdims=True)
    dZ1 = W2.T.dot(dZ2) * sigmoid_deriv(Z1)
    dW1 = 1/m * dZ1.dot(X.T)
    db1 = 1/m * np.sum(dZ1, axis=1, keepdims=True)
    return (dW1, db1, dW2, db2, dW3, db3)
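The shortcut dZ3 = A3 - y_col_arr(Y) is where softmax and cross-entropy meet. The finite-difference check below (illustrative only, with a made-up loss function and toy data, since the notebook never writes the loss out explicitly) confirms that it matches the gradient of the mean cross-entropy loss with respect to Z3.
def mean_cross_entropy(Z, Y_onehot):
    #Assumed loss for this check only: mean negative log-likelihood over the batch
    A = softmax(Z)
    return -np.mean(np.sum(Y_onehot * np.log(A + 1e-12), axis=0))

rng = np.random.default_rng(0)
Z_small = rng.normal(size=(10, 5))              #10 classes, toy batch of 5
Y_small = y_col_arr(np.array([3, 9, 0, 7, 4]))  #one-hot targets, shape (10, 5)

analytic = (softmax(Z_small) - Y_small) / Z_small.shape[1]  #(A3 - Y) averaged over the batch
numeric = np.zeros_like(Z_small)
eps = 1e-6
for r in range(Z_small.shape[0]):
    for c in range(Z_small.shape[1]):
        Z_plus, Z_minus = Z_small.copy(), Z_small.copy()
        Z_plus[r, c] += eps
        Z_minus[r, c] -= eps
        numeric[r, c] = (mean_cross_entropy(Z_plus, Y_small) - mean_cross_entropy(Z_minus, Y_small)) / (2 * eps)
print(np.max(np.abs(analytic - numeric)))  #on the order of 1e-9 or smaller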
def update_par(W1, b1, W2, b2, W3, b3, dW1, db1, dW2, db2, dW3, db3, alpha):
    #Adjust the weights and biases based on dW, db and the learning rate alpha
    W1 = W1 - alpha * dW1
    b1 = b1 - alpha * db1
    W2 = W2 - alpha * dW2
    b2 = b2 - alpha * db2
    W3 = W3 - alpha * dW3
    b3 = b3 - alpha * db3
    return (W1, b1, W2, b2, W3, b3)
import warnings
def gradient_descent(X, Y, alpha, iterations):
    W1, b1, W2, b2, W3, b3 = init_params()
    #Suppress the overflow warnings raised by np.exp on large inputs
    warnings.filterwarnings('ignore')
    #Iterate through forward prop - backward prop - update par for the requested number of iterations
    for i in range(iterations):
        Z1, A1, Z2, A2, Z3, A3 = forward_prop(W1, b1, W2, b2, W3, b3, X)
        dW1, db1, dW2, db2, dW3, db3 = backward_prop(Z1, A1, Z2, A2, Z3, A3, W1, W2, W3, X, Y)
        W1, b1, W2, b2, W3, b3 = update_par(W1, b1, W2, b2, W3, b3, dW1, db1, dW2, db2, dW3, db3, alpha)
        if i % 25 == 0:
            #Report the status of the model every 25th iteration
            print('Iterations: ', i)
            accuracy, predictions = get_predictions(A3, Y)
            print(str(accuracy*100) + '%')
    print('Final Accuracy')
    accuracy, predictions = get_predictions(A3, Y)
    print(str(accuracy*100) + '%')
    return (W1, b1, W2, b2, W3, b3)
#Train Model
W1, b1, W2, b2, W3, b3 = gradient_descent(train_X, train_y, 1, 500)
print(np.shape(W1), np.shape(b1), np.shape(W2), np.shape(b2), np.shape(W3), np.shape(b3))
92.9% accuracy on training data.
#Test Model
Z1, A1, Z2, A2, Z3, A3 = forward_prop(W1, b1, W2, b2, W3, b3, test_X)
accuracy, predictions = get_predictions(A3, test_y)
print(str(accuracy*100) + '%')
91.1% accuracy on test data.
plt.figure(1)
for i in range(9):
    plt.subplot(3, 3, i+1)
    plt.imshow(test_X[:, nums[i]].reshape(28, 28), cmap='Greys')
    plt.yticks([])
    plt.xticks([])
    plt.title('Actual:' + str(test_y[nums[i]]) + ' Prediction:' + str(predictions[nums[i]]))
plt.suptitle('Sample of Images and Values from the Test Dataset With Predictions', y=1)
plt.subplots_adjust(hspace=.5, wspace=1.5)
Here you can see examples of the digits that the model correctly identifies, along with one example of a 2 that it misclassifies as a 9.
Using only NumPy, linear algebra, and multivariable calculus, this neural network correctly identifies hand-drawn digits 91.1% of the time. With more tuning or more training data it is possible this network could be more accurate, but considering its limited resources, 91.1% is a respectable success rate.