import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import f_regression
from sklearn.metrics import mean_squared_error, mean_absolute_error
# Load the daily bike-sharing dataset (UCI Bike Sharing "day.csv").
data = pd.read_csv('day.csv')
# A bare `data` expression only renders in a notebook; in a script it is a
# no-op, so print explicitly to preserve the original display intent.
print(data)
 | instant | dteday | season | yr | mnth | holiday | weekday | workingday | weathersit | temp | atemp | hum | windspeed | casual | registered | cnt |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2011-01-01 | 1 | 0 | 1 | 0 | 6 | 0 | 2 | 0.344167 | 0.363625 | 0.805833 | 0.160446 | 331 | 654 | 985 |
1 | 2 | 2011-01-02 | 1 | 0 | 1 | 0 | 0 | 0 | 2 | 0.363478 | 0.353739 | 0.696087 | 0.248539 | 131 | 670 | 801 |
2 | 3 | 2011-01-03 | 1 | 0 | 1 | 0 | 1 | 1 | 1 | 0.196364 | 0.189405 | 0.437273 | 0.248309 | 120 | 1229 | 1349 |
3 | 4 | 2011-01-04 | 1 | 0 | 1 | 0 | 2 | 1 | 1 | 0.200000 | 0.212122 | 0.590435 | 0.160296 | 108 | 1454 | 1562 |
4 | 5 | 2011-01-05 | 1 | 0 | 1 | 0 | 3 | 1 | 1 | 0.226957 | 0.229270 | 0.436957 | 0.186900 | 82 | 1518 | 1600 |
5 | 6 | 2011-01-06 | 1 | 0 | 1 | 0 | 4 | 1 | 1 | 0.204348 | 0.233209 | 0.518261 | 0.089565 | 88 | 1518 | 1606 |
6 | 7 | 2011-01-07 | 1 | 0 | 1 | 0 | 5 | 1 | 2 | 0.196522 | 0.208839 | 0.498696 | 0.168726 | 148 | 1362 | 1510 |
7 | 8 | 2011-01-08 | 1 | 0 | 1 | 0 | 6 | 0 | 2 | 0.165000 | 0.162254 | 0.535833 | 0.266804 | 68 | 891 | 959 |
8 | 9 | 2011-01-09 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0.138333 | 0.116175 | 0.434167 | 0.361950 | 54 | 768 | 822 |
9 | 10 | 2011-01-10 | 1 | 0 | 1 | 0 | 1 | 1 | 1 | 0.150833 | 0.150888 | 0.482917 | 0.223267 | 41 | 1280 | 1321 |
10 | 11 | 2011-01-11 | 1 | 0 | 1 | 0 | 2 | 1 | 2 | 0.169091 | 0.191464 | 0.686364 | 0.122132 | 43 | 1220 | 1263 |
11 | 12 | 2011-01-12 | 1 | 0 | 1 | 0 | 3 | 1 | 1 | 0.172727 | 0.160473 | 0.599545 | 0.304627 | 25 | 1137 | 1162 |
12 | 13 | 2011-01-13 | 1 | 0 | 1 | 0 | 4 | 1 | 1 | 0.165000 | 0.150883 | 0.470417 | 0.301000 | 38 | 1368 | 1406 |
13 | 14 | 2011-01-14 | 1 | 0 | 1 | 0 | 5 | 1 | 1 | 0.160870 | 0.188413 | 0.537826 | 0.126548 | 54 | 1367 | 1421 |
14 | 15 | 2011-01-15 | 1 | 0 | 1 | 0 | 6 | 0 | 2 | 0.233333 | 0.248112 | 0.498750 | 0.157963 | 222 | 1026 | 1248 |
15 | 16 | 2011-01-16 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0.231667 | 0.234217 | 0.483750 | 0.188433 | 251 | 953 | 1204 |
16 | 17 | 2011-01-17 | 1 | 0 | 1 | 1 | 1 | 0 | 2 | 0.175833 | 0.176771 | 0.537500 | 0.194017 | 117 | 883 | 1000 |
17 | 18 | 2011-01-18 | 1 | 0 | 1 | 0 | 2 | 1 | 2 | 0.216667 | 0.232333 | 0.861667 | 0.146775 | 9 | 674 | 683 |
18 | 19 | 2011-01-19 | 1 | 0 | 1 | 0 | 3 | 1 | 2 | 0.292174 | 0.298422 | 0.741739 | 0.208317 | 78 | 1572 | 1650 |
19 | 20 | 2011-01-20 | 1 | 0 | 1 | 0 | 4 | 1 | 2 | 0.261667 | 0.255050 | 0.538333 | 0.195904 | 83 | 1844 | 1927 |
20 | 21 | 2011-01-21 | 1 | 0 | 1 | 0 | 5 | 1 | 1 | 0.177500 | 0.157833 | 0.457083 | 0.353242 | 75 | 1468 | 1543 |
21 | 22 | 2011-01-22 | 1 | 0 | 1 | 0 | 6 | 0 | 1 | 0.059130 | 0.079070 | 0.400000 | 0.171970 | 93 | 888 | 981 |
22 | 23 | 2011-01-23 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0.096522 | 0.098839 | 0.436522 | 0.246600 | 150 | 836 | 986 |
23 | 24 | 2011-01-24 | 1 | 0 | 1 | 0 | 1 | 1 | 1 | 0.097391 | 0.117930 | 0.491739 | 0.158330 | 86 | 1330 | 1416 |
24 | 25 | 2011-01-25 | 1 | 0 | 1 | 0 | 2 | 1 | 2 | 0.223478 | 0.234526 | 0.616957 | 0.129796 | 186 | 1799 | 1985 |
25 | 26 | 2011-01-26 | 1 | 0 | 1 | 0 | 3 | 1 | 3 | 0.217500 | 0.203600 | 0.862500 | 0.293850 | 34 | 472 | 506 |
26 | 27 | 2011-01-27 | 1 | 0 | 1 | 0 | 4 | 1 | 1 | 0.195000 | 0.219700 | 0.687500 | 0.113837 | 15 | 416 | 431 |
27 | 28 | 2011-01-28 | 1 | 0 | 1 | 0 | 5 | 1 | 2 | 0.203478 | 0.223317 | 0.793043 | 0.123300 | 38 | 1129 | 1167 |
28 | 29 | 2011-01-29 | 1 | 0 | 1 | 0 | 6 | 0 | 1 | 0.196522 | 0.212126 | 0.651739 | 0.145365 | 123 | 975 | 1098 |
29 | 30 | 2011-01-30 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0.216522 | 0.250322 | 0.722174 | 0.073983 | 140 | 956 | 1096 |
30 | 31 | 2011-01-31 | 1 | 0 | 1 | 0 | 1 | 1 | 2 | 0.180833 | 0.186250 | 0.603750 | 0.187192 | 42 | 1459 | 1501 |
31 | 32 | 2011-02-01 | 1 | 0 | 2 | 0 | 2 | 1 | 2 | 0.192174 | 0.234530 | 0.829565 | 0.053213 | 47 | 1313 | 1360 |
32 | 33 | 2011-02-02 | 1 | 0 | 2 | 0 | 3 | 1 | 2 | 0.260000 | 0.254417 | 0.775417 | 0.264308 | 72 | 1454 | 1526 |
33 | 34 | 2011-02-03 | 1 | 0 | 2 | 0 | 4 | 1 | 1 | 0.186957 | 0.177878 | 0.437826 | 0.277752 | 61 | 1489 | 1550 |
34 | 35 | 2011-02-04 | 1 | 0 | 2 | 0 | 5 | 1 | 2 | 0.211304 | 0.228587 | 0.585217 | 0.127839 | 88 | 1620 | 1708 |
35 | 36 | 2011-02-05 | 1 | 0 | 2 | 0 | 6 | 0 | 2 | 0.233333 | 0.243058 | 0.929167 | 0.161079 | 100 | 905 | 1005 |
36 | 37 | 2011-02-06 | 1 | 0 | 2 | 0 | 0 | 0 | 1 | 0.285833 | 0.291671 | 0.568333 | 0.141800 | 354 | 1269 | 1623 |
37 | 38 | 2011-02-07 | 1 | 0 | 2 | 0 | 1 | 1 | 1 | 0.271667 | 0.303658 | 0.738333 | 0.045408 | 120 | 1592 | 1712 |
38 | 39 | 2011-02-08 | 1 | 0 | 2 | 0 | 2 | 1 | 1 | 0.220833 | 0.198246 | 0.537917 | 0.361950 | 64 | 1466 | 1530 |
39 | 40 | 2011-02-09 | 1 | 0 | 2 | 0 | 3 | 1 | 2 | 0.134783 | 0.144283 | 0.494783 | 0.188839 | 53 | 1552 | 1605 |
40 | 41 | 2011-02-10 | 1 | 0 | 2 | 0 | 4 | 1 | 1 | 0.144348 | 0.149548 | 0.437391 | 0.221935 | 47 | 1491 | 1538 |
41 | 42 | 2011-02-11 | 1 | 0 | 2 | 0 | 5 | 1 | 1 | 0.189091 | 0.213509 | 0.506364 | 0.108550 | 149 | 1597 | 1746 |
42 | 43 | 2011-02-12 | 1 | 0 | 2 | 0 | 6 | 0 | 1 | 0.222500 | 0.232954 | 0.544167 | 0.203367 | 288 | 1184 | 1472 |
43 | 44 | 2011-02-13 | 1 | 0 | 2 | 0 | 0 | 0 | 1 | 0.316522 | 0.324113 | 0.457391 | 0.260883 | 397 | 1192 | 1589 |
44 | 45 | 2011-02-14 | 1 | 0 | 2 | 0 | 1 | 1 | 1 | 0.415000 | 0.398350 | 0.375833 | 0.417908 | 208 | 1705 | 1913 |
45 | 46 | 2011-02-15 | 1 | 0 | 2 | 0 | 2 | 1 | 1 | 0.266087 | 0.254274 | 0.314348 | 0.291374 | 140 | 1675 | 1815 |
46 | 47 | 2011-02-16 | 1 | 0 | 2 | 0 | 3 | 1 | 1 | 0.318261 | 0.316200 | 0.423478 | 0.251791 | 218 | 1897 | 2115 |
47 | 48 | 2011-02-17 | 1 | 0 | 2 | 0 | 4 | 1 | 1 | 0.435833 | 0.428658 | 0.505000 | 0.230104 | 259 | 2216 | 2475 |
48 | 49 | 2011-02-18 | 1 | 0 | 2 | 0 | 5 | 1 | 1 | 0.521667 | 0.511983 | 0.516667 | 0.264925 | 579 | 2348 | 2927 |
49 | 50 | 2011-02-19 | 1 | 0 | 2 | 0 | 6 | 0 | 1 | 0.399167 | 0.391404 | 0.187917 | 0.507463 | 532 | 1103 | 1635 |
50 | 51 | 2011-02-20 | 1 | 0 | 2 | 0 | 0 | 0 | 1 | 0.285217 | 0.277330 | 0.407826 | 0.223235 | 639 | 1173 | 1812 |
51 | 52 | 2011-02-21 | 1 | 0 | 2 | 1 | 1 | 0 | 2 | 0.303333 | 0.284075 | 0.605000 | 0.307846 | 195 | 912 | 1107 |
52 | 53 | 2011-02-22 | 1 | 0 | 2 | 0 | 2 | 1 | 1 | 0.182222 | 0.186033 | 0.577778 | 0.195683 | 74 | 1376 | 1450 |
53 | 54 | 2011-02-23 | 1 | 0 | 2 | 0 | 3 | 1 | 1 | 0.221739 | 0.245717 | 0.423043 | 0.094113 | 139 | 1778 | 1917 |
54 | 55 | 2011-02-24 | 1 | 0 | 2 | 0 | 4 | 1 | 2 | 0.295652 | 0.289191 | 0.697391 | 0.250496 | 100 | 1707 | 1807 |
55 | 56 | 2011-02-25 | 1 | 0 | 2 | 0 | 5 | 1 | 2 | 0.364348 | 0.350461 | 0.712174 | 0.346539 | 120 | 1341 | 1461 |
56 | 57 | 2011-02-26 | 1 | 0 | 2 | 0 | 6 | 0 | 1 | 0.282500 | 0.282192 | 0.537917 | 0.186571 | 424 | 1545 | 1969 |
57 | 58 | 2011-02-27 | 1 | 0 | 2 | 0 | 0 | 0 | 1 | 0.343478 | 0.351109 | 0.680000 | 0.125248 | 694 | 1708 | 2402 |
58 | 59 | 2011-02-28 | 1 | 0 | 2 | 0 | 1 | 1 | 2 | 0.407273 | 0.400118 | 0.876364 | 0.289686 | 81 | 1365 | 1446 |
59 | 60 | 2011-03-01 | 1 | 0 | 3 | 0 | 2 | 1 | 1 | 0.266667 | 0.263879 | 0.535000 | 0.216425 | 137 | 1714 | 1851 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
731 rows × 16 columns
# NOTE: `%pylab inline` is IPython-only magic and is a syntax error in a
# plain .py file; the explicit numpy/matplotlib imports at the top of the
# file already provide everything it would have injected.
fig = plt.figure()
fig.set_figwidth(20.0)
ax1 = fig.add_subplot(1, 3, 1)  # one row, three columns, first plot
ax1.hist(np.log(data['cnt']))  # distribution of log total rentals
ax2 = fig.add_subplot(1, 3, 2)  # second plot
ax2.hist(np.log(data['casual']))  # distribution of log casual rentals
ax3 = fig.add_subplot(1, 3, 3)  # third plot
ax3.scatter(np.log(data['season']), np.log(data['registered']))
print(fig)
# Notebook output:
# Populating the interactive namespace from numpy and matplotlib Figure(1600x320)
def ols_regression(x_input, y_response):
    """Solve ordinary least squares via the normal equations.

    Parameters
    ----------
    x_input : array-like
        Input features X: either a 1-d array of n samples or a 2-d array
        of shape (n, k) with k feature columns.
    y_response : array-like
        1-d array of n expected outputs (y). Must have the same length
        as ``x_input``.

    Returns
    -------
    numpy.ndarray
        1-d array ``[intercept, coef_1, ..., coef_k]`` solving
        ``(X'X) b = X'y`` with a column of ones prepended for the
        intercept.
    """
    x_var = np.asarray(x_input, dtype=float)
    # Promote a 1-d feature vector to a single-column matrix so the same
    # code path handles the multi-dimensional X the docstring promises
    # (the original np.array([ones, x]).T only worked for 1-d input).
    if x_var.ndim == 1:
        x_var = x_var[:, np.newaxis]
    # Design matrix with an intercept column of ones.
    design = np.column_stack([np.ones(len(x_var)), x_var])
    # Solve the normal equations directly; np.linalg.solve is more
    # numerically stable than forming the explicit inverse.
    return np.linalg.solve(design.T.dot(design), design.T.dot(y_response))
# Model total ridership (cnt): a linear fit on raw temp/casual/hum, and a
# log-log fit on temp/casual.
linear_fit = LinearRegression()
log_fit = LinearRegression()
linear_fit.fit(data[['temp', 'casual', 'hum']].values, data['cnt'].values)
log_fit.fit(np.log(data[['temp', 'casual']]), np.log(data['cnt'].values))
# print() calls replace Python-2 print statements so the file runs on Python 3.
print("sklearn intercept and coef (linear):", linear_fit.intercept_, linear_fit.coef_)
print("sklearn intercept and coef (log):", log_fit.intercept_, log_fit.coef_)
# Notebook output:
# sklearn intercept and coef (linear): 2339.35657509 [ 4.22992224e+03 1.26015129e+00 -1.59147475e+03] sklearn intercept and coef (log): 5.97738169165 [ 0.22239077 0.39074807]
# Training-set R-squared for each cnt model (log-log explains more variance).
print("Linear R-squared", round(linear_fit.score(data[['temp', 'casual', 'hum']].values, data['cnt'].values), 4))
print("Log R-squared", round(log_fit.score(np.log(data[['temp', 'casual']].values), np.log(data['cnt'].values)), 4))
# Notebook output:
# Linear R-squared 0.5631 Log R-squared 0.6495
# Univariate F-tests of the features against cnt; only the first feature's
# (temp) statistic and p-value are reported.
lin_f, lin_p = f_regression(data[['temp', 'casual']].values, data['cnt'].values)
log_f, log_p = f_regression(np.log(data[['temp', 'casual']]), np.log(data['cnt'].values))
print('LINEAR F-Test Values:', lin_f[0])
print('LINEAR p-values: ', lin_p[0])
print('LOG F-Test Values:', log_f[0])
print('LOG p-values: ', log_p[0])
# Notebook output:
# LINEAR F-Test Values: 473.471710535 LINEAR p-values: 2.81062239759e-81 LOG F-Test Values: 517.386209663 LOG p-values: 5.74528625902e-87
# Model casual ridership from temp and registered riders (linear and log-log).
linear_fit2 = LinearRegression()
log_fit2 = LinearRegression()
linear_fit2.fit(data[['temp', 'registered']].values, data['casual'].values)
log_fit2.fit(np.log(data[['temp', 'registered']]), np.log(data['casual'].values))
print("sklearn intercept and coef (linear):", linear_fit2.intercept_, linear_fit2.coef_)
print("sklearn intercept and coef (log):", log_fit2.intercept_, log_fit2.coef_)
# Notebook output:
# sklearn intercept and coef (linear): -248.4524661 [ 1.74647366e+03 6.33045792e-02] sklearn intercept and coef (log): 2.11544820589 [ 1.18827215 0.63908617]
# Training-set R-squared for each casual-rider model.
print("Linear R-squared", round(linear_fit2.score(data[['temp', 'registered']].values, data['casual'].values), 4))
print("Log R-squared", round(log_fit2.score(np.log(data[['temp', 'registered']].values), np.log(data['casual'].values)), 4))
# Notebook output:
# Linear R-squared 0.3098 Log R-squared 0.5809
# Univariate F-tests against registered riders; first feature (temp) reported.
lin_f, lin_p = f_regression(data[['temp', 'casual', 'holiday']].values, data['registered'].values)
log_f, log_p = f_regression(np.log(data[['temp', 'casual']]), np.log(data['registered'].values))
print('LINEAR F-Test Values:', lin_f[0])
print('LINEAR p-values: ', lin_p[0])
print('LOG F-Test Values:', log_f[0])
print('LOG p-values: ', log_p[0])
# Notebook output:
# LINEAR F-Test Values: 300.098390251 LINEAR p-values: 1.44622865127e-56 LOG F-Test Values: 355.274052677 LOG p-values: 7.37054113943e-65
# Model registered ridership: linear on temp/casual/hum, log-log on temp/casual.
linear_fit3 = LinearRegression()
log_fit3 = LinearRegression()
linear_fit3.fit(data[['temp', 'casual', 'hum']].values, data['registered'].values)
log_fit3.fit(np.log(data[['temp', 'casual']]), np.log(data['registered'].values))
print("sklearn intercept and coef (linear):", linear_fit3.intercept_, linear_fit3.coef_)
print("sklearn intercept and coef (log):", log_fit3.intercept_, log_fit3.coef_)
# Notebook output:
# sklearn intercept and coef (linear): 2339.35657509 [ 4.22992224e+03 2.60151290e-01 -1.59147475e+03] sklearn intercept and coef (log): 6.65864600463 [ 0.31431862 0.26287702]
# Training-set R-squared for each registered-rider model.
print("Linear R-squared", round(linear_fit3.score(data[['temp', 'casual', 'hum']].values, data['registered'].values), 4))
print("Log R-squared", round(log_fit3.score(np.log(data[['temp', 'casual']].values), np.log(data['registered'].values)), 4))
# Notebook output:
# Linear R-squared 0.3265 Log R-squared 0.4406