I just wrote my first neural network class in Python. As far as I can tell everything should work, but there is some bug in it that I can't seem to find (probably staring me right in the face).
I first tried it on 10,000 examples of the MNIST data, then again when trying to replicate the sign function, and again when trying to replicate an XOR gate. Every time, regardless of the number of epochs, all of the output neurons (however many there are) produce roughly the same value, even though the cost function seems to be going down.
I am using batch gradient descent, all done with vectorized operations (no loop over individual training examples).
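For context, the kind of fully vectorized step I am aiming for, per layer and for the whole batch at once, is roughly the sketch below. The names and data here are made up for illustration; this is not the actual class that follows.

import numpy as np

#Illustrative sketch of one vectorized batch gradient descent step for a
#single sigmoid layer (made-up example data, not my class).
X = np.array([[0., 0.], [0., 1.], [1., 0.], [1., 1.]])   #whole batch, shape (cases, features)
y = np.array([[0.], [1.], [1.], [0.]])                   #targets, shape (cases, 1)
W = np.random.normal(0.1, 0.1, (1, 3))                   #weights incl. bias column, shape (outputs, features+1)

X_bias = np.concatenate((np.ones((X.shape[0], 1)), X), 1)  #prepend the bias column
out = 1/(1 + np.exp(-np.dot(X_bias, W.T)))                 #forward pass for the whole batch at once
delta = -(y - out) * out * (1 - out)                       #squared-error delta at the output
grad = (1/X.shape[0]) * np.dot(delta.T, X_bias)            #gradient averaged over the batch
W = W - 0.01 * grad                                        #one batch gradient descent update

The full class is below: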
#Neural Network Class
import numpy as np

class NeuralNetwork:

    #methods
    def __init__(self, layer_shape):
        #Useful Network Info
        self.__layer_shape = layer_shape
        self.__layers = len(layer_shape)
        #Initialize Random Weights
        self.__weights = []
        self.__weight_sizes = []
        for i in range(len(layer_shape)-1):
            current_weight_size = (layer_shape[i+1], layer_shape[i]+1)
            self.__weight_sizes.append(current_weight_size)
            self.__weights.append(np.random.normal(loc=0.1, scale=0.1, size=current_weight_size))
    def sigmoid(self, z):
        return 1/(1+np.exp(-z))

    def sig_prime(self, z):
        return np.multiply(self.sigmoid(z), (1-self.sigmoid(z)))
    def Feedforward(self, input, Train=False):
        self.__input_cases = np.shape(input)[0]
        #Empty list to hold the output of every layer.
        output_list = []
        #Appends the output of the 1st (input) layer.
        output_list.append(input)
        for i in range(self.__layers-1):
            if i == 0:
                output = self.sigmoid(np.dot(np.concatenate((np.ones((self.__input_cases, 1)), input), 1), self.__weights[0].T))
                output_list.append(output)
            else:
                output = self.sigmoid(np.dot(np.concatenate((np.ones((self.__input_cases, 1)), output), 1), self.__weights[i].T))
                output_list.append(output)
        #Returns the final output if not training.
        if Train == False:
            return output_list[-1]
        #Returns the entire output_list if needed for training.
        else:
            return output_list
    def CostFunction(self, input, target, error_func=1):
        """Gives the cost of using a particular weight matrix
        based off of the input and targeted output"""
        #Run the network to get output using current theta matrices.
        output = self.Feedforward(input)

        #####Allows user to choose Cost Functions.#####
        #
        #Log Based Error Function
        #
        if error_func == 0:
            error = np.multiply(-target, np.log(output)) - np.multiply((1-target), np.log(1-output))
            total_error = np.sum(np.sum(error))
        #
        #Squared Error Cost Function
        #
        elif error_func == 1:
            error = (target - output)**2
            total_error = 0.5 * np.sum(np.sum(error))

        return total_error
    def Weight_Grad(self, input, target, output_list):
        #Finds the Error Deltas for Each Layer
        #
        deltas = []
        for i in range(self.__layers - 1):
            #Finds Error Delta for the last layer
            if i == 0:
                error = (target - output_list[-1])
                error_delta = -1*np.multiply(error, np.multiply(output_list[-1], (1-output_list[-1])))
                deltas.append(error_delta)
            #Finds Error Delta for the hidden layers
            else:
                #Weight matrices have bias values removed
                error_delta = np.multiply(np.dot(deltas[-1], self.__weights[-i][:, 1:]), output_list[-i-1]*(1-output_list[-i-1]))
                deltas.append(error_delta)
        #
        #Finds the Deltas for each Weight Matrix
        #
        Weight_Delta_List = []
        deltas.reverse()
        for i in range(len(self.__weights)):
            current_weight_delta = (1/self.__input_cases) * np.dot(deltas[i].T, np.concatenate((np.ones((self.__input_cases, 1)), output_list[i]), 1))
            Weight_Delta_List.append(current_weight_delta)
            #print("Weight", i, "Delta:", "\n", current_weight_delta)
            #print()
        #
        #Combines all Weight Deltas into a single row vector
        #
        Weight_Delta_Vector = np.array([[]])
        for i in Weight_Delta_List:
            Weight_Delta_Vector = np.concatenate((Weight_Delta_Vector, np.reshape(i, (1, -1))), 1)

        return Weight_Delta_List
    def Train(self, input_data, target):
        #
        #Gradient Checking:
        #
        #First Get Gradients from first iteration of Back Propagation
        output_list = self.Feedforward(input_data, Train=True)
        self.__input_cases = np.shape(input_data)[0]
        Weight_Delta_List = self.Weight_Grad(input_data, target, output_list)

        #Creates List of Gradient Approx. arrays set to zero.
        grad_approx_list = []
        for i in self.__weight_sizes:
            current_grad_approx = np.zeros(i)
            grad_approx_list.append(current_grad_approx)

        #Compute Approx. Gradient for every Weight Change
        for W in range(len(self.__weights)):
            for index, value in np.ndenumerate(self.__weights[W]):
                orig_value = self.__weights[W][index]  #Saves the Original Value
                print("Orig Value:", orig_value)
                #Sets weight to weight +/- epsilon
                self.__weights[W][index] = orig_value+.00001
                cost_plusE = self.CostFunction(input_data, target)
                self.__weights[W][index] = orig_value-.00001
                cost_minusE = self.CostFunction(input_data, target)
                #Solves for grad approx:
                grad_approx = (cost_plusE-cost_minusE)/(2*.00001)
                grad_approx_list[W][index] = grad_approx
                #Sets Weight Value back to its original value
                self.__weights[W][index] = orig_value

        #
        #Print Gradients from Back Prop. and Grad Approx. side-by-side:
        #
        print("Back Prop. Grad", "", "Grad. Approx")
        print("-"*15, "", "-"*15)
        for W in range(len(self.__weights)):
            for index, value in np.ndenumerate(self.__weights[W]):
                print(self.__weights[W][index], ""*3, grad_approx_list[W][index])
            print("\n"*3)
        input_ = input("Press Enter to continue:")

        #
        #Perform Weight Updates for X number of Iterations
        #
        for i in range(10000):
            #Run the network
            output_list = self.Feedforward(input_data, Train=True)
            self.__input_cases = np.shape(input_data)[0]
            Weight_Delta_List = self.Weight_Grad(input_data, target, output_list)
            for w in range(len(self.__weights)):
                #print(self.__weights[w])
                #print(Weight_Delta_List[w])
                self.__weights[w] = self.__weights[w] - (.01*Weight_Delta_List[w])

        print("Done")
I even implemented gradient checking, and the values are different. I also tried replacing the backpropagation updates with the approximate gradient-checking values, but that gave the same results, which makes me doubt even my gradient checking code.
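For reference, the gradient check I implemented is just the standard central-difference approximation. As a standalone sketch (illustrative names, not my exact Train code) it amounts to:

import numpy as np

def numeric_grad(cost_fn, W, eps=1e-5):
    """Central-difference approximation of dCost/dW for every entry of W.
    cost_fn must read this same W array (it is modified in place)."""
    approx = np.zeros_like(W)
    for index, value in np.ndenumerate(W):
        orig_value = W[index]                      #save the original weight
        W[index] = orig_value + eps
        cost_plus = cost_fn()
        W[index] = orig_value - eps
        cost_minus = cost_fn()
        W[index] = orig_value                      #restore the original weight
        approx[index] = (cost_plus - cost_minus)/(2*eps)
    return approx

Each entry of the returned array should agree with the corresponding backpropagation gradient to several decimal places, but as the values below show, mine clearly don't.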
Here are some of the values being produced when training on the XOR gate:
Back Prop. Grad: 0.0756102610697 0.261814503398 0.0292734023876
Grad Approx: 0.05302210631166 0.0416095559674 0.0246847342122
Cost before training: 0.508019225507; after training: 0.50007095103 (after 10,000 epochs)
Output for 4 different examples (after training):
[ 0.49317733] [ 0.49294556] [ 0.50489004] [ 0.50465824]
So my question is: is there any obvious problem with my backpropagation or my gradient checking? Are there any usual suspects when an ANN shows these symptoms (outputs all roughly the same, yet the cost is going down)?