"""Boston housing regression demo.

Loads the Boston house-price dataset, explores it visually, fits a
LinearRegression and a DecisionTreeRegressor on the full data (in-sample
fit, no train/test split -- this is a teaching demo), and finally shows
how a linear model extrapolates on synthetic data outside its training
range.
"""
# Explicit imports replace the former `%pylab inline` IPython magic,
# which is not valid plain Python and was the only thing that defined
# `np` and `plt`.
import numpy as np
import matplotlib.pyplot as plt

# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and
# removed in 1.2 (ethical concerns about the dataset). On modern
# scikit-learn this import fails -- confirm the pinned sklearn version,
# or port the demo to e.g. `fetch_california_housing`.
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

# ---------------------------------------------------------------------
# Load and inspect the dataset.
# ---------------------------------------------------------------------
data = load_boston()
print(data.keys())
print(data.data.shape)    # (n_samples, n_features)
print(data.target.shape)  # (n_samples,) -- median house value in $1000s
print(data.DESCR)

# Distribution of the target (house prices).
plt.hist(data.target)
plt.xlabel('price ($1000s)')
plt.ylabel('count')
plt.show()

# Scatter the first n_chns features against the target, sharing the
# price axis so the panels are directly comparable.
n_chns = 6
fig, axs = plt.subplots(1, n_chns, sharey=True, figsize=(24, 8))
for ii in range(n_chns):
    axs[ii].scatter(data.data[:, ii], data.target)
    axs[ii].set_title(
        'Median value (Y) vs {0} (X)'.format(data.feature_names[ii]),
        fontsize=16)
plt.show()

# ---------------------------------------------------------------------
# Linear regression: fit on the full dataset and plot predicted vs true.
# ---------------------------------------------------------------------
clf = LinearRegression()
clf.fit(data.data, data.target)
predicted = clf.predict(data.data)

plt.figure(figsize=(10, 8))
plt.scatter(data.target, predicted)
# Diagonal y = x: a perfect model would put every point on this line.
plt.plot([0, 50], [0, 50], '--k', linewidth=3)
plt.axis('tight')
plt.xlabel('True price ($1000s)', fontsize=15)
plt.ylabel('Predicted price ($1000s)', fontsize=15)
plt.tick_params(axis='both', which='major', labelsize=15)
plt.grid(True)  # boolean arg; the old string form 'on' is deprecated
plt.show()

# What is the correlation between the observations and the property price?
# The coefficients (shows the correlation between the property price and
# property related information).
print('Attribute information and Model coefficients:')
for name, coef in zip(data.feature_names, clf.coef_):
    print('{0}: {1:.3f}'.format(name, coef))

# In-sample error. NOTE(review): despite the label, np.mean(...) is the
# *mean* squared error, not the residual sum of squares.
print("\nResidual sum of squares: %.2f"
      % np.mean((predicted - data.target) ** 2))
print("Residual mean deviation: %.2f (K USD)"
      % np.sqrt(np.mean((predicted - data.target) ** 2)))

# ---------------------------------------------------------------------
# Decision tree: same in-sample diagnostic plot. An unconstrained tree
# memorizes the training data, so expect a near-perfect diagonal.
# ---------------------------------------------------------------------
clf = DecisionTreeRegressor()
clf.fit(data.data, data.target)
predicted = clf.predict(data.data)

plt.figure(figsize=(10, 8))
plt.scatter(data.target, predicted)
plt.plot([0, 50], [0, 50], '--k', linewidth=3)
plt.axis('tight')
plt.xlabel('True price ($1000s)', fontsize=15)
plt.ylabel('Predicted price ($1000s)', fontsize=15)
plt.tick_params(axis='both', which='major', labelsize=15)
plt.grid(True)
plt.show()

# ---------------------------------------------------------------------
# Extrapolation demo on synthetic data: fit on x in [0, 30), predict on
# x in [20, 50] -- mostly outside the training range.
# ---------------------------------------------------------------------
# x from 0 to 30 (40 random samples)
x = 30 * np.random.random(40)
# y = a*x + b with Gaussian noise
y = 0.5 * x + 5.0 + np.random.normal(size=x.shape) * 3

# Fit a linear regression; x[:, None] reshapes to the (n_samples, 1)
# 2-D array that scikit-learn expects.
clf = LinearRegression()
clf.fit(x[:, None], y)

# Predict y at new x values beyond the training interval.
x_new = np.linspace(20, 50, 10)
y_new = clf.predict(x_new[:, None])

# Plot training data alongside the extrapolated predictions.
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111)
ax.scatter(x, y, label='model fitting data')
ax.scatter(x_new, y_new, color='r', label='new observations & predictions')
ax.set_xlabel('x', fontsize=15)
ax.set_ylabel('y', fontsize=15)
ax.legend(loc='upper left', fontsize=15)
ax.tick_params(axis='both', which='major', labelsize=15)
ax.axis('tight')
ax.grid(True)
plt.show()  # was missing; required outside of `%pylab inline` notebooks