# Exploratory analysis script (originally a sequence of notebook cells).
#
# Steps, in order:
#   1. Load per-block taxi statistics and a building table, filter both, and
#      merge them on 'geoid'.
#   2. Load a pre-computed statistics CSV, keep rows with pctcbbldg > 0.333
#      and count > 1000, and add two derived distance columns.
#   3. Scatter-plot 'dist_bldg_hght' against 'avgbrdist_logtran' with a
#      first-degree least-squares fit, print r-squared, and write the
#      filtered frame to CSV.
import math

import matplotlib.pyplot as plt
import numpy  # both spellings (numpy / np) were bound by the original cells
import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import linregress

pd.options.display.max_columns = 5200
pd.options.display.max_rows = 5200

# --- Unused BigQuery alternative -------------------------------------------
# Insert your BigQuery Project ID here (can be found in the Google web console).
# projectid = "dazzling-will-91618"
# df = pd.read_gbq('SELECT * FROM dazzling-will-91618:taxi_all.nycb2010_stats LIMIT 200', project_id=projectid)
# http://stackoverflow.com/questions/18267749/importerror-no-module-named-apiclient-discovery
# sudo pip install --upgrade google-api-python-client
# data_frame = pd.read_gbq('SELECT * FROM test_dataset.test_table LIMIT 200', index_col='index_c_name', project_id=projectid)

# --- Step 1: block statistics merged with the building table ----------------
# dfmIN = '/Volumes/Hotel/Dropbox/data/output/all/taxi_2013.csv'
dfsIN = '/Volumes/Hotel/Dropbox/data/output/all/nycb2010_taxi_2013_stats_bldg_cnt.csv'
dfbIN = '/Volumes/Hotel/Dropbox/data/input/building/bldg_dist_height.csv'

# 'geoid' is read as object so the identifier is not parsed as a number.
dfs = pd.read_csv(dfsIN, dtype={'geoid': object})

# Lower-case every column name of the building table before selecting columns.
dfb = pd.read_csv(dfbIN).rename(columns=lambda name: name.lower())
dfb = dfb[['geoid', 'building_block_int_dis_tbl_bulkdens']]
dfb = dfb.fillna(0)
# NOTE(review): this file is read without a dtype override, so 'geoid' is
# stringified from whatever dtype pandas inferred -- confirm the resulting
# strings match the object-typed 'geoid' in dfs.
dfb['geoid'] = dfb['geoid'].astype(str)
print(dfb.dtypes)

# Keep blocks with 50000 <= areasqft <= 300000 and at least one building.
dfs = dfs[(dfs.areasqft >= 50000)]
dfs = dfs[(dfs.areasqft <= 300000)]
dfs = dfs[(dfs.countbldg >= 1)]

df = dfs.merge(dfb, on='geoid', how='left')
df = df[(df.building_block_int_dis_tbl_bulkdens >= 3)]

# --- Step 2: pre-computed statistics with pctcbbldg --------------------------
# NOTE(review): this read replaces the merged frame built in step 1, so the
# step-1 result does not feed the analysis below -- confirm that is intended.
df = pd.read_csv('/Users/danielmsheehan/Dropbox/data/output/all/nycb2010_taxi_2013_stats_bldg_cnt_pctcbbldg.csv', dtype={'geoid': object})

# Earlier note mentioned "pctcbbldg > 0.20 AND count > 1000"; the thresholds
# actually applied are 0.333 and 1000.
df = df[(df.pctcbbldg > 0.333)]
# df.count is the DataFrame method, so the 'count' column needs bracket access.
# .copy() makes the filtered frame independent, so the column assignments
# below do not trigger pandas' SettingWithCopyWarning.
df = df[(df['count'].astype(float) > 1000)].copy()

# Multiply by 0.3048 (metres per foot); presumably 'avgbrdist' is in feet --
# TODO confirm against the upstream data.
df['avgbrdist_meters'] = df['avgbrdist'] * 0.3048

# http://code.hammerpig.com/log-transformations-in-python.html
# NOTE(review): despite the column name, this scales 'avgbrdist' by the
# constant exp(10); it does not take a logarithm. Left unchanged because the
# axis limits below and the exported CSV depend on the current values.
z = math.exp(10)
print(z)
df['avgbrdist_logtran'] = df['avgbrdist'] * z

# --- Step 3: scatter plot, linear fit and export -----------------------------
# Plotting approach from
# http://stackoverflow.com/questions/7714677/r-scatterplot-with-too-many-points
x = df.dist_bldg_hght
y = df.avgbrdist_logtran

fig, ax = plt.subplots()
plt.xlabel('dist_bldg_hght')
# NOTE(review): the label says 'avgbrdist' but the plotted column is
# 'avgbrdist_logtran'.
plt.ylabel('avgbrdist')
plt.scatter(x, y, s=70, alpha=0.03)
# plt.ylim((0, 30))
# plt.xlim((0, 40))
plt.ylim((0, 1000000))
plt.xlim((0, 160))

# First-degree polynomial fit: fit[0] is the slope, fit[1] the intercept.
fit = np.polyfit(x, y, deg=1)
ax.plot(x, fit[0] * x + fit[1], color='red')
plt.show()

# Correlation coefficient of x and y (kept for interactive inspection).
pearson_r = numpy.corrcoef(x, y)[0, 1]

# x and y are arrays or lists. The original script also called linregress
# before x and y were assigned, which raised NameError; it is called once
# here, after both exist.
# http://docs.scipy.org/doc/scipy-0.15.1/reference/generated/scipy.stats.linregress.html
slope, intercept, r_value, p_value, std_err = linregress(x, y)
# Single-argument print produces the same text on Python 2 and Python 3.
print("r-squared: %s" % (r_value ** 2))

df.to_csv('/Users/danielmsheehan/Dropbox/data/output/all/nycb2010_taxi_2013_stats_bldg_cnt_pctcbbldg_LIMIT_third_bldg_1000_pts.csv', index=False)