import pandas as pd
pd.options.display.max_columns = 5200
pd.options.display.max_rows = 5200
# Insert your BigQuery Project ID Here
# Can be found in the Google web console
#projectid = "dazzling-will-91618"
#df = pd.read_gbq('SELECT * FROM dazzling-will-91618:taxi_all.nycb2010_stats LIMIT 200', project_id = projectid)
#http://stackoverflow.com/questions/18267749/importerror-no-module-named-apiclient-discovery
#sudo pip install --upgrade google-api-python-client
#data_frame = pd.read_gbq('SELECT * FROM test_dataset.test_table LIMIT 200',index_col='index_c_name',, project_id = projectid)
import pandas as pd
#dfmIN = '/Volumes/Hotel/Dropbox/data/output/all/taxi_2013.csv'
dfsIN = '/Volumes/Hotel/Dropbox/data/output/all/nycb2010_taxi_2013_stats_bldg_cnt.csv'
dfbIN = '/Volumes/Hotel/Dropbox/data/input/building/bldg_dist_height.csv'
dfs = pd.read_csv(dfsIN, dtype={'geoid':object})
dfb = pd.read_csv(dfbIN).rename(columns=lambda x: x.lower())
dfb = dfb[['geoid','building_block_int_dis_tbl_bulkdens']]
dfb = dfb.fillna(0)
dfb['geoid'] = dfb['geoid'].astype(str)
print dfb.dtypes
dfs = dfs[(dfs.areasqft >= 50000)]
dfs = dfs[(dfs.areasqft <= 300000)]
dfs = dfs[(dfs.countbldg >= 1)]
df = dfs.merge(dfb, on='geoid', how='left')
df = df[(df.building_block_int_dis_tbl_bulkdens >= 3)]
df
geoid object building_block_int_dis_tbl_bulkdens float64 dtype: object
geoid | avgbrdist | medbrdist | stdbrdist | count | areasqft | countbldg | building_block_int_dis_tbl_bulkdens | |
---|---|---|---|---|---|---|---|---|
0 | 360850112010001 | 0.000000 | 0.000000 | 0.000000 | 0 | 6807.30 | NaN | 0.000000 |
1 | 360850226000001 | 0.000000 | 0.000000 | 0.000000 | 0 | 45247.20 | NaN | 0.000000 |
2 | 360850007001000 | 0.000000 | 0.000000 | 0.000000 | 0 | 58960.10 | 1 | 0.037041 |
3 | 360850009001000 | 2.615134 | 0.000000 | 6.133327 | 30 | 244590.00 | 28 | 1.660016 |
4 | 360850011001000 | 11.767236 | 0.000000 | 23.266198 | 31 | 314611.00 | 84 | 2.380148 |
5 | 360850017001000 | 11.665143 | 0.000000 | 30.009733 | 10 | 293775.00 | 49 | 1.366823 |
6 | 360850020011000 | 9.132777 | 0.000000 | 24.635996 | 15 | 111006.00 | 19 | 1.222633 |
7 | 360850027001000 | 13.078075 | 0.000000 | 23.066772 | 8 | 150407.00 | 44 | 3.716140 |
8 | 360850029001000 | 4.820950 | 0.499503 | 12.474478 | 10 | 373418.00 | 90 | 1.586655 |
9 | 360850036001000 | 9.455927 | 0.000000 | 14.165112 | 11 | 244605.00 | 28 | 1.142436 |
10 | 360850039001000 | 1.230096 | 0.000000 | 4.224633 | 29 | 469846.00 | 61 | 1.710747 |
11 | 360850040001000 | 15.627651 | 0.000000 | 34.944490 | 5 | 141297.00 | 19 | 1.042037 |
12 | 360850047001000 | 9.113771 | 0.000000 | 13.433418 | 43 | 1058270.00 | 93 | 0.566017 |
13 | 360850059001000 | 34.489629 | 0.000000 | 103.354739 | 71 | 4824710.00 | 7 | 0.012687 |
14 | 360850064001000 | 12.578383 | 0.000000 | 24.247516 | 27 | 200785.00 | 20 | 1.063275 |
15 | 360850074001000 | 8.874304 | 0.000000 | 15.494223 | 7 | 139084.00 | 36 | 1.875482 |
16 | 360850075001000 | 0.000000 | 0.000000 | 0.000000 | 3 | 123561.00 | 32 | 1.739414 |
17 | 360850077001000 | 2.584788 | 0.000000 | 4.974750 | 5 | 205771.00 | 32 | 2.054480 |
18 | 360850096011000 | 3.783601 | 0.000000 | 7.817140 | 30 | 640395.00 | 96 | 1.226833 |
19 | 360850105001000 | 0.414795 | 0.000000 | 1.436892 | 12 | 288531.00 | 31 | 1.422864 |
20 | 360850112011000 | 6.755672 | 0.000000 | 17.918980 | 12 | 196112.00 | 21 | 1.340195 |
21 | 360850112021000 | 0.000000 | 0.000000 | 0.000000 | 3 | 92737.80 | 12 | 1.092448 |
22 | 360850114011000 | 0.000000 | 0.000000 | 0.000000 | 2 | 66684.10 | 12 | 1.083342 |
23 | 360850114021000 | 54.202656 | 54.202656 | 10.059157 | 2 | 183061.00 | 43 | 1.538521 |
24 | 360850121001000 | 16.750273 | 0.000000 | 24.619993 | 5 | 170251.00 | 43 | 1.644445 |
25 | 360850122001000 | 6.492126 | 1.490844 | 11.085928 | 4 | 139819.00 | 24 | 1.239573 |
26 | 360850125001000 | 7.792671 | 0.000000 | 13.168025 | 6 | 215178.00 | 51 | 1.971548 |
27 | 360850128041000 | 32.096027 | 0.026801 | 55.767375 | 47 | 2184280.00 | 88 | 1.767975 |
28 | 360850128051000 | 0.587746 | 0.000000 | 1.097798 | 12 | 635446.00 | 118 | 1.299699 |
29 | 360850128061000 | 0.000000 | 0.000000 | 0.000000 | 0 | 281051.00 | 48 | 1.138468 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
38686 | 360470644001003 | 4.748647 | 0.000000 | 14.293095 | 56 | 126338.00 | 28 | 1.424973 |
38687 | 360470642001000 | 6.843542 | 0.000000 | 15.789983 | 256 | 401457.00 | 92 | 3.244198 |
38688 | 360470644001004 | 14.778127 | 0.000000 | 34.967502 | 15 | 16403.10 | NaN | 0.000000 |
38689 | 360470644002004 | 6.599847 | 0.000000 | 18.427728 | 83 | 197433.00 | 65 | 2.601140 |
38690 | 360471010001002 | 3.207278 | 0.000000 | 7.562953 | 55 | 202878.00 | 65 | 2.243298 |
38691 | 360471010001003 | 11.044633 | 0.000000 | 20.867847 | 57 | 202716.00 | 56 | 3.036255 |
38692 | 360471010002002 | 4.459066 | 0.000000 | 13.049675 | 48 | 202659.00 | 72 | 1.803503 |
38693 | 360471010002003 | 2.933292 | 0.000000 | 7.813724 | 26 | 109458.00 | 31 | 1.710231 |
38694 | 360810171001003 | 6.939177 | 0.000000 | 19.450770 | 2633 | 119689.00 | 1 | 10.133345 |
38695 | 360810171001004 | 7.080989 | 0.000000 | 22.479680 | 6735 | 369285.00 | 9 | 9.983437 |
38696 | 360810055001006 | 5.237518 | 0.000000 | 16.423084 | 574 | 9436.48 | NaN | 0.000000 |
38697 | 360810055001007 | 5.929513 | 0.000000 | 19.269368 | 2529 | 70482.60 | 1 | 2.416705 |
38698 | 360810945002022 | 20.544891 | 0.000000 | 43.236024 | 581 | 3553480.00 | 84 | 2.260479 |
38699 | 360810945002023 | 12.476821 | 0.000000 | 20.423062 | 35 | 118509.00 | 20 | 1.532482 |
38700 | 360810945002024 | 7.718515 | 0.000000 | 20.488599 | 63 | 192455.00 | 63 | 1.750790 |
38701 | 360850226002020 | 0.000000 | 0.000000 | 0.000000 | 0 | 591.06 | NaN | 0.000000 |
38702 | 360850226003000 | 9.074053 | 0.000000 | 20.077495 | 30 | 2356680.00 | 231 | 0.823473 |
38703 | 360850226003001 | 1.090007 | 0.000000 | 2.437330 | 5 | 126572.00 | 35 | 2.327165 |
38704 | 360850239002005 | 2.557576 | 0.000000 | 5.115152 | 4 | 178216.00 | 32 | 1.420231 |
38705 | 360850239002006 | 1.332799 | 0.000000 | 2.590392 | 6 | 81905.40 | 16 | 1.400480 |
38706 | 360850239002002 | 1.466704 | 0.000000 | 2.518669 | 7 | 259159.00 | 62 | 1.695666 |
38707 | 360850239002003 | 0.000000 | 0.000000 | 0.000000 | 3 | 141550.00 | 24 | 1.222984 |
38708 | 360850239002007 | 18.457392 | 0.000000 | 28.704296 | 7 | 273797.00 | 53 | 1.362336 |
38709 | 360050274020001 | 0.000000 | 0.000000 | 0.000000 | 0 | 66837.40 | NaN | 0.000000 |
38710 | 360850248003001 | 0.000000 | 0.000000 | 0.000000 | 0 | 633172.00 | 2 | 1.220499 |
38711 | 360850248003002 | 0.000000 | 0.000000 | 0.000000 | 0 | 857302.00 | 7 | 0.047088 |
38712 | 360850226004017 | 25.372098 | 3.201143 | 43.240281 | 6 | 484725.00 | 54 | 1.346176 |
38713 | 360850226004027 | 25.617130 | 7.881637 | 39.441979 | 9 | 1755900.00 | 41 | 0.531735 |
38714 | 360610275001006 | 7.036360 | 0.000000 | 19.466528 | 133 | 93888.00 | NaN | 0.000000 |
38715 | 360610265003002 | 6.298910 | 0.000000 | 18.609811 | 1696 | 57478.40 | NaN | 0.000000 |
38716 rows × 8 columns
df = pd.read_csv('/Users/danielmsheehan/Dropbox/data/output/all/nycb2010_taxi_2013_stats_bldg_cnt_pctcbbldg.csv', dtype={'geoid':object})
#dfs[(dfs.areasqft >= 50000)] pctcbbldg > 0.20 AND count > 1000
#blaahahaahahah
df = df[(df.pctcbbldg > 0.333)] #df.pctcbbldg > 0.2 & df.count > 1000
df = df[(df['count'].astype(float) > 1000)]
df.dtypes
df.count()
#df.head(50)
geoid 4611 avgbrdist 4611 medbrdist 4611 stdbrdist 4611 count 4611 areasqft 4611 countbldg 4611 bldgareaft 4611 pctcbbldg 4611 dist_bldg_hght 4611 dtype: int64
# #%matplotlib inline
# #from http://stackoverflow.com/questions/7714677/r-scatterplot-with-too-many-points
# import numpy as np
# import matplotlib.pyplot as plt
# # N = 10000
# # mean = [0, 0]
# # cov = [[2, 2], [0, 2]]
# # x,y = np.random.multivariate_normal(mean, cov, N).T
# x = df.dist_bldg_hght
# y = df.avgbrdist
# plt.xlabel('building_block_int_dis_tbl_bulkdens')
# plt.ylabel('avgbrdist')
# plt.scatter(x, y, s=70, alpha=0.03)
# plt.ylim((0, 30))
# plt.xlim((0, 40))
# plt.show()
from scipy.stats import linregress
linregress(x,y) #x and y are arrays or lists.
(0.15933978469092655, 7.0073655131505124, 0.67502521913795477, 5.4965139892681322e-302, 0.0036561371976772286)
df['avgbrdist_meters'] = df['avgbrdist'] * 0.3048
#http://code.hammerpig.com/log-transformations-in-python.html
import math
z = math.exp(10)
print z
df['avgbrdist_logtran'] = df['avgbrdist'] * z
22026.4657948
import matplotlib.pyplot as plt
import numpy as np
x = df.dist_bldg_hght
y = df.avgbrdist_logtran
fig, ax = plt.subplots()
plt.xlabel('dist_bldg_hght')
plt.ylabel('avgbrdist')
plt.scatter(x, y, s=70, alpha=0.03)
# plt.ylim((0, 30))
# plt.xlim((0, 40))
plt.ylim((0, 1000000))
plt.xlim((0, 160))
fit = np.polyfit(x, y, deg=1)
ax.plot(x, fit[0] * x + fit[1], color='red')
plt.show()
import numpy
numpy.corrcoef(x, y)[0, 1]
0.6482169495740886
from scipy import stats
import numpy as np
slope, intercept, r_value, p_value, std_err = stats.linregress(x,y)
print "r-squared:", r_value**2 #http://docs.scipy.org/doc/scipy-0.15.1/reference/generated/scipy.stats.linregress.html
r-squared: 0.420185213715
df.to_csv('/Users/danielmsheehan/Dropbox/data/output/all/nycb2010_taxi_2013_stats_bldg_cnt_pctcbbldg_LIMIT_third_bldg_1000_pts.csv', index=False)