import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the two data sets compared throughout this script: the refined
# activity records and the 2012 RMS (UCR) export.
activity = pd.read_csv('data/refine-crimes.csv')
ucr = pd.read_csv('data/RMSData_2012-01-01_to_2012-12-31.csv')

# Ratio of non-null activity 'Crime' entries to non-null UCR 'InternalID'
# entries. print(...) with a single argument behaves the same on Python 2 and
# Python 3, whereas the bare print statement is a SyntaxError on Python 3.
print(float(activity['Crime'].count()) / float(ucr['InternalID'].count()))

# Frequency of each distinct 'Crime' value in the activity data. Computed once
# and reused below instead of calling value_counts() for every statistic.
activity_counts = activity['Crime'].value_counts()

# Single-argument print(...) works identically on Python 2 and Python 3.
print("Daily activity frequencies\n")

# value_counts() orders by descending count, so the first ten rows are the
# ten most frequent crimes.
print(activity_counts[0:10])
print("\n\n")
print("Distinct crimes: %s" % len(activity_counts))
print("Average incidents per crime: %s" % activity_counts.mean())
print("Median incidents per crime: %s" % activity_counts.median())

# Histogram of the per-crime counts (how many crimes occur how often).
activity_counts.hist()
plt.title('Distribution of activity frequencies')
plt.show()

# Frequency of each distinct 'Description' value in the UCR data. Computed
# once and reused below instead of calling value_counts() for every statistic.
ucr_counts = ucr['Description'].value_counts()

# Single-argument print(...) works identically on Python 2 and Python 3.
print("UCR frequencies\n")
# value_counts() orders by descending count, so these are the ten most
# frequent descriptions.
print(ucr_counts[0:10])
print("\n\n")
print("Distinct crimes: %s" % len(ucr_counts))
print("Average incidents per crime: %s" % ucr_counts.mean())
print("Median incidents per crime: %s" % ucr_counts.median())

# Histogram of the per-description counts.
ucr_counts.hist()
plt.title('Distribution of UCR frequencies')
plt.show()

# Top 25 categories of each data set by number of records.
#
# value_counts() already returns counts in descending order, which replaces the
# former groupby().count().sort_index(by=...) chain: the `by` argument of
# sort_index() is no longer available in current pandas, and the chain relied
# on count() keeping the grouping column. The counts are wrapped in a
# one-column DataFrame so top_ucrs['Description'] / top_activities['Crime']
# keep working as before.
top_ucrs = pd.DataFrame(
    {'Description': ucr['Description'].value_counts()[0:25]})

top_activities = pd.DataFrame(
    {'Crime': activity['Crime'].value_counts()[0:25]})

# --- UCR bar chart ---------------------------------------------------------
fig = plt.figure()
# Extra left margin leaves room for the long category labels.
fig.subplots_adjust(left=0.2, wspace=0.6)

# Create the axes explicitly and draw into them, rather than plotting onto an
# implicit axes and then handing that axes back to fig.add_subplot().
ax1 = fig.add_subplot(111)
ucr_plot = top_ucrs['Description'].plot(kind="barh", ax=ax1)
ax1.set_title('Top 25 UCR descriptions')
ax1.set_ylabel('')
plt.show()

# --- Activity bar chart ----------------------------------------------------
fig = plt.figure()
fig.subplots_adjust(left=0.2, wspace=0.6)

ax2 = fig.add_subplot(111)
activity_plot = top_activities['Crime'].plot(kind="barh", ax=ax2)
# Title spelling corrected (was 'acitivity incididents').
ax2.set_title('Top 25 activity incidents')
ax2.set_xlim(0, 600)
ax2.set_ylabel('')
plt.show()

# Axis tick labels for the month plots, in calendar order (index 0 = January).
months = [
    "Jan.", "Feb.", "March", "April", "May", "June",
    "July", "Aug.", "Sept.", "Oct.", "Nov.", "Dec.",
]
# Axis tick labels for the weekday plots, starting with Monday, which is the
# day that weekday() numbers as 0.
days = [
    'Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun',
]

def get_activity_date(activity):
    """Parse the trailing ``YYYYMMDD`` stamp of an activity URL.

    The last eight characters of ``activity`` are read as a date. That stamp
    is the date the report was posted, so it is likely off by one day from
    the incident itself.

    Returns the value produced by ``pd.to_datetime`` for that stamp.
    """
    # TODO scrape dates of the incidents from the pages
    posted_stamp = activity[-8:]
    return pd.to_datetime(posted_stamp, format='%Y%m%d')

# Derive date columns for the activity data from the date stamp at the end of
# each record's URL. The helper is passed directly; no wrapping lambda needed.
activity['date'] = activity.url.apply(get_activity_date)
activity['month'] = activity.date.apply(lambda x: x.month)
activity['weekday'] = activity.date.apply(lambda x: x.weekday())

# Line plot of the number of activity records per month.
activity_by_month = activity.groupby('month').size().plot()
activity_by_month.set_title('Activity by month')
# Fix one tick per month (1..12) before labelling. Without this the twelve
# labels are applied, in order, to whatever default tick positions matplotlib
# chose, so the names do not line up with the months they describe.
activity_by_month.set_xticks(list(range(1, 13)))
activity_by_month.set_xticklabels(months)
activity_by_month.set_xlim(1, 12)
plt.show()

# Parse the UCR occurrence timestamps in place, then tag each record with its
# weekday number (0 = Monday ... 6 = Sunday, matching the order of `days`).
ucr['OccDate'] = pd.to_datetime(ucr['OccDate'])
ucr['weekday'] = ucr.OccDate.apply(lambda x: x.weekday())

# Line plot of the number of UCR records per weekday.
ucr_by_day = ucr.groupby('weekday').size().plot()
# Fix one tick per weekday (0..6) before labelling. Without this the labels
# are applied, in order, to matplotlib's default tick positions and do not
# line up with the weekdays they describe.
ucr_by_day.set_xticks(list(range(7)))
ucr_by_day.set_xticklabels(days)
ucr_by_day.set_title('UCR by weekday')

plt.show()

# Tag each UCR record with the month number of its occurrence timestamp.
ucr['month'] = ucr.OccDate.apply(lambda x: x.month)

# Line plot of the number of UCR records per month.
ucr_by_month = ucr.groupby('month').size().plot()
ucr_by_month.set_title('UCR by month')
# Fix one tick per month (1..12) before labelling. Without this the twelve
# labels are applied, in order, to matplotlib's default tick positions and do
# not line up with the months they describe.
ucr_by_month.set_xticks(list(range(1, 13)))
ucr_by_month.set_xticklabels(months)
ucr_by_month.set_xlim(1, 12)
plt.show()

# Time of day as decimal hours (e.g. 13:30 -> 13.5) so that points sit at the
# right position on the 0-24 hour axis.
# The former `x.minute / 100` was integer division on Python 2 (always 0, so
# the minutes were silently dropped) and, on Python 3, produced an HH.MM
# encoding that is not a fraction of an hour. Dividing by 60.0 forces float
# division on both.
ucr['time'] = ucr.OccDate.apply(lambda x: x.hour + (x.minute / 60.0))
# Line plot of the number of UCR records at each distinct time of day.
ucr_by_time = ucr.groupby('time').size().plot()
ucr_by_time.set_title('UCR by time of day')
ucr_by_time.set_xlim(0, 24)
plt.show()

# Running total of activity records, accumulated over the 'date' column.
activity_running_total = activity.groupby('date').size().cumsum()
activity_cumsum = activity_running_total.plot()
activity_cumsum.set_title('Total 2012 activities by date')
plt.show()

# Running total of UCR records, accumulated over the distinct 'OccDate' values.
ucr_running_total = ucr.groupby('OccDate').size().cumsum()
ucr_cumsum = ucr_running_total.plot()
ucr_cumsum.set_title('Total 2012 UCR by date')
plt.show()

# Number of activity records per distinct 'date', with summary statistics.
# Single-argument print(...) behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print("Daily activity incidents per day")
activity_per_day = activity.groupby('date').size()
print("Mean: %s" % activity_per_day.mean())
print("Standard dev: %s" % activity_per_day.std())

# Histogram of the per-day record counts.
apd_hist = activity_per_day.hist(bins=25)
apd_hist.set_title('Distribution of daily activity incidents per day')
plt.show()

def get_md(ucr_date):
    """Return ``ucr_date`` as a "month/day" string with no zero padding.

    ``ucr_date`` is any object exposing ``month`` and ``day`` attributes.
    """
    parts = (ucr_date.month, ucr_date.day)
    return '/'.join(str(part) for part in parts)

# Collapse each occurrence timestamp to a "month/day" key so that records are
# counted per calendar day rather than per exact timestamp.
ucr['md'] = ucr.OccDate.apply(get_md)

# Heading corrected: this section reports the UCR data, not the activity data
# (the previous text was copied from the activity section).
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("UCR crimes per day")
ucr_per_day = ucr.groupby('md').size()
print("Mean: %s" % ucr_per_day.mean())
print("Standard dev: %s" % ucr_per_day.std())

# Histogram of the per-day record counts.
upd_hist = ucr_per_day.hist(bins=25)
upd_hist.set_title('Distribution of UCR crimes per day')
plt.show()

# TODO nltk?
# TODO combine charts
# TODO Monte Carlo