%matplotlib inline
import pandas as pd
import numpy as np
import glob
# Base directory that is prepended to the data file paths read by this notebook.
# Alternate location, kept for reference only (it used to be assigned and then
# immediately overwritten by the line below, so it never took effect):
# pj = '/Users/danielmsheehan/Desktop/data/'
pj = '/Users/danielmsheehan/Dropbox/data/'
from ggplot import *
# Load the taxi table; `geoid` is forced to object dtype
# (presumably to keep the identifier as text -- confirm against the data).
df = pd.read_csv(pj+'output/all/taxi_2013.csv', dtype={'geoid':object})

# Draw 1,000,000 index labels at random.
# NOTE: np.random.choice samples WITH replacement by default, so the same row
# may be selected more than once and `dfs` can contain duplicate rows.
rows = np.random.choice(df.index.values, 1000000)

# `rows` holds index labels, so use label-based `.loc`
# (`.ix` is deprecated and has been removed from current pandas releases).
dfs = df.loc[rows]

# Single-argument print(...) produces the same output on Python 2 and Python 3.
print(len(dfs.index))
print(dfs.dtypes)
1000000 tuid int64 dist_roadbed float64 geoid object dist_bldg_hght float64 type object dtype: object
# ggplot(dfs, aes('dist_bldg_hght', 'dist_roadbed')) + \
# geom_point(alpha=1/20.) + \
# ylim(0, 20000)
import matplotlib.pyplot as plt
# N = 10000
# mean = [0, 0]
# cov = [[2, 2], [0, 2]]
# x,y = np.random.multivariate_normal(mean, cov, N).T
# Scatter the sampled rows: `dist_bldg_hght` on the x axis, `dist_roadbed` on the y axis.
x = dfs['dist_bldg_hght']
y = dfs['dist_roadbed']

fig, ax = plt.subplots()
# Large, almost transparent markers: overlapping points accumulate into darker regions.
ax.scatter(x, y, s=70, alpha=0.03)
# Label both axes and restrict the view to the window x in [0, 200], y in [0, 600].
ax.set(
    xlabel='dist_bldg_hght',
    ylabel='dist_roadbed',
    xlim=(0, 200),
    ylim=(0, 600),
)
plt.show()
%reset
Once deleted, variables cannot be recovered. Proceed (y/[n])? y