"""Exploratory comparison of 2012 daily-activity incidents and UCR records.

Reads two CSV exports, prints frequency summaries for each, and shows a
sequence of matplotlib charts: frequency distributions, top-25 categories,
volume by month / weekday / time of day, cumulative totals, and the
distribution of incidents per day.

The script runs top to bottom on import/execution; every ``plt.show()`` call
blocks until the chart window is closed (with an interactive backend).
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


months = ["Jan.", "Feb.", "March", "April", "May", "June", "July", "Aug.",
          "Sept.", "Oct.", "Nov.", "Dec."]
days = ['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun']


def get_activity_date(activity):
    """Return the date encoded in the last eight characters of a URL.

    The parameter keeps its original name for backward compatibility; it is a
    single URL string (one value of the ``url`` column), not the module-level
    ``activity`` DataFrame.

    The trailing ``YYYYMMDD`` is the date the report was *posted*, so the
    result is likely off by one day relative to the incident itself.
    """
    # TODO scrape dates of the incidents from the pages
    return pd.to_datetime(activity[-8:], format='%Y%m%d')


def get_md(ucr_date):
    """Return ``'<month>/<day>'`` (no zero padding) for a timestamp."""
    return str(ucr_date.month) + '/' + str(ucr_date.day)


def _report_frequencies(heading, counts, hist_title):
    """Print the ten most common categories plus summary stats, then plot a
    histogram of how often each category occurs.

    ``counts`` is the result of ``Series.value_counts()`` (sorted descending).
    """
    print(heading + "\n")
    print(counts.head(10))
    print("\n\n")
    print("Distinct crimes: %s" % len(counts))
    print("Average incidents per crime: %s" % counts.mean())
    print("Median incidents per crime: %s" % counts.median())
    counts.hist()
    plt.title(hist_title)
    plt.show()


def _plot_top_categories(top_counts, title, xlim=None):
    """Show a horizontal bar chart of ``top_counts`` on a fresh figure."""
    fig = plt.figure()
    fig.subplots_adjust(left=0.2, wspace=0.6)
    # Create the axes first and draw into it explicitly instead of handing
    # an already-drawn Axes back to ``fig.add_subplot``.
    ax = fig.add_subplot(111)
    top_counts.plot(kind="barh", ax=ax)
    ax.set_title(title)
    if xlim is not None:
        ax.set_xlim(*xlim)
    ax.set_ylabel('')
    plt.show()


def _label_ticks(ax, positions, labels):
    """Pin tick positions before labelling them.

    ``set_xticklabels`` alone labels whatever ticks matplotlib auto-selected,
    which misaligns the month / weekday names with the data.
    """
    ax.set_xticks(list(positions))
    ax.set_xticklabels(labels)


def _report_per_day(heading, per_day, hist_title):
    """Print mean / standard deviation of per-day counts and plot them."""
    print(heading)
    print("Mean: %s" % per_day.mean())
    print("Standard dev: %s" % per_day.std())
    hist_ax = per_day.hist(bins=25)
    hist_ax.set_title(hist_title)
    plt.show()


activity = pd.read_csv('data/refine-crimes.csv')
ucr = pd.read_csv('data/RMSData_2012-01-01_to_2012-12-31.csv')

# Ratio of activity incidents to UCR records.
print(float(activity['Crime'].count()) / float(ucr['InternalID'].count()))

# Compute each frequency table once and reuse it below.
activity_counts = activity['Crime'].value_counts()
ucr_counts = ucr['Description'].value_counts()

_report_frequencies("Daily activity frequencies", activity_counts,
                    'Distribution of activity frequencies')
_report_frequencies("UCR frequencies", ucr_counts,
                    'Distribution of UCR frequencies')

# value_counts() is already sorted by descending frequency, so the first 25
# rows are the 25 most common categories.
top_ucrs = ucr_counts.head(25)
top_activities = activity_counts.head(25)

_plot_top_categories(top_ucrs, 'Top 25 UCR descriptions')
_plot_top_categories(top_activities, 'Top 25 activity incidents',
                     xlim=(0, 600))

# --- Activity incidents over time -------------------------------------------
activity['date'] = activity.url.apply(get_activity_date)
activity['month'] = activity.date.apply(lambda d: d.month)
activity['weekday'] = activity.date.apply(lambda d: d.weekday())

activity_by_month = activity.groupby('month').size().plot()
activity_by_month.set_title('Activity by month')
_label_ticks(activity_by_month, range(1, 13), months)
activity_by_month.set_xlim(1, 12)
plt.show()

# --- UCR records over time --------------------------------------------------
ucr['OccDate'] = pd.to_datetime(ucr['OccDate'])

ucr['weekday'] = ucr.OccDate.apply(lambda d: d.weekday())
ucr_by_day = ucr.groupby('weekday').size().plot()
_label_ticks(ucr_by_day, range(7), days)  # weekday() is 0 (Mon) .. 6 (Sun)
ucr_by_day.set_title('UCR by weekday')
plt.show()

ucr['month'] = ucr.OccDate.apply(lambda d: d.month)
ucr_by_month = ucr.groupby('month').size().plot()
ucr_by_month.set_title('UCR by month')
_label_ticks(ucr_by_month, range(1, 13), months)
ucr_by_month.set_xlim(1, 12)
plt.show()

# Fractional hour of day. Divide by 60.0 (not 100): minutes are sixtieths of
# an hour, and the float divisor avoids Python 2 integer division, which
# previously collapsed every record onto the whole hour.
ucr['time'] = ucr.OccDate.apply(lambda d: d.hour + d.minute / 60.0)
ucr_by_time = ucr.groupby('time').size().plot()
ucr_by_time.set_title('UCR by time of day')
ucr_by_time.set_xlim(0, 24)
plt.show()

# --- Cumulative totals ------------------------------------------------------
activity_cumsum = activity.groupby('date').size().cumsum().plot()
activity_cumsum.set_title('Total 2012 activities by date')
plt.show()

ucr_cumsum = ucr.groupby('OccDate').size().cumsum().plot()
ucr_cumsum.set_title('Total 2012 UCR by date')
plt.show()

# --- Per-day distributions --------------------------------------------------
activity_per_day = activity.groupby('date').size()
_report_per_day("Daily activity incidents per day", activity_per_day,
                'Distribution of daily activity incidents per day')

# Group UCR records by calendar day ('M/D'), discarding the time component.
ucr['md'] = ucr.OccDate.apply(get_md)
ucr_per_day = ucr.groupby('md').size()
_report_per_day("UCR crimes per day", ucr_per_day,
                'Distribution of UCR crimes per day')

# TODO nltk?
# TODO combine charts
# TODO Monte Carlo