import pandas as pd import scipy as sp import seaborn as sn from mpl_toolkits.mplot3d import Axes3D from __future__ import division %pylab inline cd ex1 #load in the data, set header to None so that first line isn't assigned as header data = pd.read_csv('ex1data1.txt', header = None) data.head() # assign the first and second columns to the variables X and y, respectively X = data.iloc[:,0] y = data.iloc[:,1] #check X X.head() #check y y.head() #check that the length of y matches the MatLab file (97 rows), which it does. Assign this to the variable m m = len(y) m #Plot a scatter of population size by profit plt.scatter(X,y, c = 'r', marker = 'x') plt.xlabel("Population of city in 10,000s") plt.ylabel("Profit in $10,000s") plt.title("Scatter plot of training data") plt.xlim(4,24) plt.ylim(-5,25) # X was automatically made into a series, so make it into a dataframe X = pd.DataFrame(X) #insert a column of ones in the first column of X # This takes into account the intercept term, theta0 X.insert(0, "theta zero",value =pd.Series(np.ones([m]))) #check this worked correctly X.head() #initialize fitting parameters as stated in the homework theta = np.zeros([2, 1]) iterations = 1500 alpha = 0.01 #this is how you calculate the hypothesis parameter hypothesis = X.dot(theta) hypothesis.head() # define a function that computes the cost function J() def costJ(X,y,theta): '''where X is a dataframe of input features, plus a column of ones to accommodate theta 0; y is a vector that we are trying to predict using these features, and theta is an array of the parameters''' m = len(y) hypothesis = X.dot(theta) # if you're using a np array, you need to use flatten to remove nesting a = (hypothesis[0]-y)**2 # otherwise this will give you completely the wrong answerX[:, 0] J = (1/(2*m)*sum(a)) return J costJ(X,y,theta) # define a function that will implement the gradient descent algorithm def gradDesc(X,y,theta,alpha,num_iters): '''Implement the gradient descent algorithm, where alpha is 
the learning rate and num_iters is the number of iterations to run''' Jhistory = [] #initiate an empty list to store values of cost function after each cycle theta_update = theta.copy() for num_iter in range(num_iters): #these update only once for each iteration hypothesis = X.dot(theta_update) loss = hypothesis[0]-y for i in range(len(X.columns)): #these will update once for every parameter theta_update[i] = theta_update[i] - (alpha*(1.0/m))*((loss*(X.iloc[:,i])).sum()) Jhistory.append(costJ(X,y,theta_update)) return Jhistory, theta_update #try running the function Jhistory, theta_update= gradDesc(X,y,theta,alpha,1500) Jhistory[:10] # These are the final values for theta theta_update # Check the output of Jhistory is decreasing towards a minimum plt.plot(Jhistory) plt.xlabel("number of iterations") plt.ylabel("cost function") #Add a line which shows the outcome of the gradient descent after 1000 iterations plt.scatter(X.iloc[:,1],y, c = 'r', marker = 'x') plt.xlabel("Population of city in 10,000s") plt.ylabel("Profit in $10,000s") plt.title("Scatter plot of training data") plt.xlim(4,24) plt.ylim(-5,25) plt.plot(X.iloc[:,1],X.dot(theta_update)) # Next I want to make a contour plot that plots theta0 against theta 1 and the outcome of J # First initiate values for theta0 and theta 1 theta0_vals = np.linspace(-10, 10, 100) theta1_vals = np.linspace(-1, 4, 100) #they both have length 100 len(theta1_vals) # initialize J_vals to a matrix of 0's J_vals = np.zeros([len(theta0_vals), len(theta1_vals)]) J_vals.shape for i in range(len(theta0_vals)): for j in range(len(theta1_vals)): t = np.array([[theta0_vals[i]], [theta1_vals[j]]]) J_vals[i,j] = costJ(X, y, t) plt.contour(theta0_vals, theta1_vals, J_vals, logspace(-2, 3, 20)) plt.contour(theta0mesh,theta1mesh,J_vals, np.logspace(-2, 3, 20)) sn.set_palette("Blues") plt.scatter(theta_update[0], theta_update[1], ) plt.contour(theta0_vals, theta1_vals, J_vals.T, logspace(-2, 3, 20)) sn.set_style("darkgrid") #numpy.meshgrid 
# numpy.meshgrid takes two (or more) 1-D coordinate arrays and returns a
# pair of 2-D grid arrays: one holding the x-coordinate of every grid
# point, one the y-coordinate -- the form plot_surface requires.
theta0mesh, theta1mesh = np.meshgrid(theta0_vals, theta1_vals)

# Surface plot of the cost over the (theta0, theta1) grid.
fig = plt.figure()
# fig.gca(projection='3d') was deprecated in Matplotlib 3.4 and removed
# in 3.6; add_subplot is the supported way to create a 3-D axes.
ax = fig.add_subplot(projection='3d')
# J_vals[i, j] has theta0 along axis 0, but meshgrid puts theta0 along
# axis 1 of the mesh arrays -- hence the transpose.
ax.plot_surface(theta0mesh, theta1mesh, J_vals.T)