"""Summarise the 2012 parking-tag CSV into small per-dimension count tables.

Reads ``Parking_Tags_Data_2012.csv`` from ``base`` and writes
``weekdays.csv``, ``fines.csv``, ``provinces.csv``, ``infractions.csv`` and
``times.csv`` back into the same directory.

The bare expressions scattered through the script (``df``, ``weekday_counts``,
...) are notebook-style inspection cells: they display a value in an
interactive session and are no-ops when the file runs as a plain script.
"""
import datetime as dt

import pandas as pd

# ``%matplotlib inline`` is IPython-only syntax.  Go through the IPython API
# when it is available so the file is also valid plain Python.
try:
    get_ipython().run_line_magic("matplotlib", "inline")
except NameError:
    # Not running under IPython; there is no inline backend to enable.
    pass

base = '/home/ovis/cr/dvto'

df = pd.read_csv("%s/Parking_Tags_Data_2012.csv" % base)
df
df.dtypes

# ---------------------------------------------------------------------------
# Tags per weekday
# ---------------------------------------------------------------------------
pd.isnull(df['date_of_infraction']).value_counts()

date_format = '%Y%m%d'


def create_weekday(x):
    """Return the weekday name (``%A``) of a row's ``date_of_infraction``.

    ``x`` is one row of ``df``; its ``date_of_infraction`` value is
    stringified and parsed with ``date_format``.  Raises ``ValueError`` when
    the value does not match that format.
    """
    d = dt.datetime.strptime(str(x['date_of_infraction']), date_format)
    return d.strftime('%A')


df['weekday'] = df.apply(create_weekday, axis=1)
weekday_counts = df['weekday'].value_counts()
weekday_counts

weekdays = pd.DataFrame({
    'weekday': weekday_counts.keys(),
    'count': weekday_counts.values,
})
weekdays
# Put the label column before 'count'.  ``reindex_axis`` was removed in
# pandas 1.0; ``reindex(columns=...)`` is the equivalent call.
weekdays = weekdays.reindex(columns=sorted(weekdays.columns, reverse=True))
weekdays
weekdays.to_csv("%s/weekdays.csv" % base, index=False)

# ---------------------------------------------------------------------------
# Tags per fine amount
# ---------------------------------------------------------------------------
fine_counts = df['set_fine_amount'].value_counts()
fine_counts

fines = pd.DataFrame({
    'amount': fine_counts.keys(),
    'count': fine_counts.values,
})
fines
fines.to_csv("%s/fines.csv" % base, index=False)

# ---------------------------------------------------------------------------
# Tags per province (most frequent ones kept, the rest folded into 'Other')
# ---------------------------------------------------------------------------
province_counts = df['province'].value_counts()
province_counts

# ``value_counts`` sorts descending, so position 9 is the 10th most frequent
# province; only provinces with strictly more tags than that are kept.
# ``.iloc`` makes the positional lookup explicit (the index holds province
# labels, not integers).  With fewer than 10 provinces every one is kept.
if len(province_counts) > 9:
    province_cutoff = province_counts.iloc[9]
else:
    province_cutoff = 0
top_provinces = province_counts[province_counts > province_cutoff].keys()
top_provinces
# Must be a real list: on Python 3 ``map`` returns a one-shot iterator that
# the first membership test below would exhaust, sending every later row to
# 'Other'.
top_provinces = [str(p) for p in top_provinces]
top_provinces


def create_trimmed_provinces(x):
    """Return the row's ``province`` if it is in ``top_provinces``, else 'Other'."""
    if x['province'] in top_provinces:
        return x['province']
    else:
        return 'Other'


df['trimmed_provinces'] = df.apply(create_trimmed_provinces, axis=1)
trimmed_provinces = df['trimmed_provinces'].value_counts()
trimmed_provinces

provinces = pd.DataFrame({
    'province': trimmed_provinces.keys(),
    'count': trimmed_provinces.values,
})
provinces
provinces = provinces.reindex(columns=sorted(provinces.columns, reverse=True))
provinces
provinces.to_csv("%s/provinces.csv" % base, index=False)

# ---------------------------------------------------------------------------
# Tags per infraction
# ---------------------------------------------------------------------------
infraction_counts = df['infraction_description'].value_counts()
infraction_counts
infraction_counts = df['infraction_code'].value_counts()
infraction_counts
df[df['infraction_code'] == 5].head()

# Maps each infraction code to the first description seen for it, so that
# every row with the same code ends up with the same description text.
descriptions = {}


def create_sane_infraction_description(x):
    """Return one canonical description per ``infraction_code``.

    The first description encountered for a code is remembered in the
    module-level ``descriptions`` dict and returned for every later row that
    carries the same code.
    """
    code = x['infraction_code']
    if code not in descriptions:
        descriptions[code] = x['infraction_description']
    return descriptions[code]


df['sane_infraction_description'] = df.apply(
    create_sane_infraction_description, axis=1)
infraction_counts = df['sane_infraction_description'].value_counts()
infraction_counts

infractions = pd.DataFrame({
    'infraction': infraction_counts.keys(),
    'count': infraction_counts.values,
})
infractions = infractions.reindex(
    columns=sorted(infractions.columns, reverse=True))
infractions
infractions.to_csv("%s/infractions.csv" % base, index=False)

# ---------------------------------------------------------------------------
# Tags per time of day
# ---------------------------------------------------------------------------
pd.isnull(df['time_of_infraction']).value_counts()
time_of_infraction = df['time_of_infraction']
time_of_infraction
time_of_infraction.value_counts()

# Values whose last two digits exceed 59 are inspected, then dropped
# (presumably the column is HHMM-encoded, making those invalid minutes --
# TODO confirm against the data dictionary).  Missing values fail the
# ``<= 59`` comparison and are dropped by the same filter.
time_of_infraction[time_of_infraction % 100 > 59]
time_of_infraction = time_of_infraction[time_of_infraction % 100 <= 59]

time_counts = time_of_infraction.value_counts()
time_counts
# Recount without sorting by frequency.
time_counts = time_of_infraction.value_counts(sort=False)
time_counts

times = pd.DataFrame({
    'time': time_counts.keys(),
    'count': time_counts.values,
})
times = times.reindex(columns=sorted(times.columns, reverse=True))
times
times.dtypes
# Write the times as integers; a column-wise cast replaces the row-by-row
# ``int()`` conversion.
times['time'] = times['time'].astype('int64')
times
times.to_csv("%s/times.csv" % base, index=False)