Going Back-to-Basics – Logistic Regression solved using SGD from Scratch


import numpy as np
import h5py
import time
import copy
from random import randint
#load MNIST data
MNIST_data = h5py.File('MNISTdata.hdf5', 'r')
x_train = np.float32(MNIST_data['x_train'][:])
y_train = np.int32(np.array(MNIST_data['y_train'][:, 0]))
x_test = np.float32(MNIST_data['x_test'][:])
y_test = np.int32(np.array(MNIST_data['y_test'][:, 0]))
#close the HDF5 file once the arrays are copied into memory
MNIST_data.close()
#Implementation of the stochastic gradient descent algorithm
#number of inputs (28x28 pixel images, flattened)
num_inputs = 28*28
#number of outputs
num_outputs = 10
model = {}
#Xavier initialization of the single weight matrix W1
model['W1'] = np.random.randn(num_outputs,num_inputs) / np.sqrt(num_inputs)
#model gradients
model_grads = copy.deepcopy(model)

#softmax function (logits shifted by their max for numerical stability)
def softmax_function(z):
    z = z - np.max(z)
    return np.exp(z)/np.sum(np.exp(z))

#Forward Pass
def forward(x,y, model):
    Z = np.dot(model['W1'], x)
    p = softmax_function(Z)
    return p
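
#For reference (not in the original script): the cross-entropy loss that this model
#implicitly minimizes is the negative log-likelihood of the true class. The loss is
#never materialized because the backward pass uses its closed-form gradient directly.
def cross_entropy_loss(p, y):
    #negative log probability assigned to the correct digit y
    return -np.log(p[y] + 1e-12)  #small epsilon guards against log(0)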

#Backward pass
#note: the gradient is hard-coded as dZ = e_y - p, the derivative of the log-likelihood w.r.t. the logits
def backward(x,y,p, model, model_grads):
    dZ = -1.0*p
    dZ[y] = dZ[y] + 1.0
    for i in range(num_outputs):
        model_grads['W1'][i,:] = dZ[i]*x
    return model_grads
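
#Equivalent vectorized gradient (a sketch for comparison, not the author's original code):
#the per-class loop above builds the outer product of dZ = (e_y - p) and x row by row,
#so the whole gradient can also be written in one line with np.outer.
def backward_vectorized(x, y, p, model, model_grads):
    dZ = -1.0*p
    dZ[y] = dZ[y] + 1.0                   #dZ = e_y - p
    model_grads['W1'] = np.outer(dZ, x)   #shape (num_outputs, num_inputs)
    return model_grads
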
#Training Loop
print("!"*40)
print("!"*15 + " TRAINING " + "!"*15)
print("!"*40)

time1 = time.time()
LR = .01
num_epochs = 20
for epochs in range(num_epochs):
    #Learning rate schedule
    if (epochs > 5):
        LR = 0.001
    if (epochs > 10):
        LR = 0.0001
    if (epochs > 15):
        LR = 0.00001
    total_correct = 0
    for n in range( len(x_train)):
        n_random = randint(0,len(x_train)-1 )
        y = y_train[n_random]
        x = x_train[n_random][:]
        p = forward(x, y, model)
        prediction = np.argmax(p)
        if (prediction == y):
            total_correct += 1
        model_grads = backward(x,y,p, model, model_grads)
        #gradient step: ascent on the log-likelihood (descent on the cross-entropy loss)
        model['W1'] = model['W1'] + LR*model_grads['W1']
    print(total_correct/float(len(x_train)))
time2 = time.time()
print(time2-time1, "seconds")
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!!!!!!!!!!!!!!! TRAINING !!!!!!!!!!!!!!!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
0.8959333333333334
0.9126833333333333
0.9195166666666666
0.9216
0.9215
0.9220666666666667
0.9278333333333333
0.9292166666666667
0.9293
0.9288
0.93085
0.9315666666666667
0.9313333333333333
0.93
0.9316833333333333
0.9301333333333334
0.93095
0.9329
0.9314333333333333
0.9308166666666666
84.81015992164612
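
The loop above updates the weights one example at a time. As a point of comparison only, the same update can be written for a mini-batch with a single matrix multiply; the helper below is an illustrative sketch (the function name and batch handling are not part of the original script).

def sgd_minibatch_step(model, x_batch, y_batch, lr):
    #x_batch: (B, 784) images, y_batch: (B,) integer labels
    logits = np.dot(x_batch, model['W1'].T)          #(B, 10)
    logits -= logits.max(axis=1, keepdims=True)      #stabilize the softmax
    p = np.exp(logits)
    p /= p.sum(axis=1, keepdims=True)
    dZ = -p
    dZ[np.arange(len(y_batch)), y_batch] += 1.0      #e_y - p for every row
    model['W1'] += lr * np.dot(dZ.T, x_batch) / len(y_batch)   #averaged ascent step
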
#Testing the trained model on the held-out MNIST test set
#~92% accuracy, in line with the training accuracy, so the model is barely overfit
######################################################
#test data
total_correct = 0
for n in range( len(x_test)):
    y = y_test[n]
    x = x_test[n][:]
    p = forward(x, y, model)
    prediction = np.argmax(p)
    if (prediction == y):
        total_correct += 1
        
print("{} accuracy on test ".format(total_correct/np.float(len(x_test) )))
0.9242 accuracy on test 
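
The same test accuracy can be computed without the Python loop: since the argmax of the logits equals the argmax of the softmax output, one matrix multiply scores the whole test set. A minimal sketch, assuming x_test and y_test are still in memory:

predictions = np.argmax(np.dot(x_test, model['W1'].T), axis=1)   #(10000,) predicted digits
print("{} accuracy on test".format(np.mean(predictions == y_test)))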
