import re import matplotlib.pyplot as plt import matplotlib.ticker as pltticker import numpy as np import pandas as pd from datetime import date, timedelta def compose(*funcs): return lambda x: reduce(lambda acc, f: f(acc), funcs, x) pd.options.display.max_colwidth = 0 pd.options.display.show_dimensions = False def line_break(s, n=40): # line break every n chars return re.sub('(.{%d})' % n, '\\1\n', s) def expense_item_hist(row): amounts = pd.Series(row['amounts']).map(abs) # histogram of amounts counts, bins, patches = plt.hist(amounts, color=['lightgray']) # colour current amount bar red for patch, rightside, leftside in zip(patches, bins[1:], bins[:-1]): if leftside <= abs(row['Amount']) <= rightside: patch.set_facecolor('red') # add probability density estimate amounts.plot(kind="kde", secondary_y=True, style=':') plt.axis('off') # desc on top row # date - amount - catg on second row plt.title('%s\n%s' % (line_break(row['desc']), row['Entered Date'].date()), loc='left') plt.title('$%.2f' % abs(row['Amount']), loc='center') plt.title(row['catg'], loc='right') def expense_histograms(table, ncols=3, width=16, height=3, hspace=1): nrows = ceil(len(table.index) / float(ncols)) plt.figure(figsize=(width, nrows*height)) plt.subplots_adjust(hspace=hspace) # TODO: axis scale in $10, $100, $1000, $10000 increments #loc = pltticker.MaxNLocator() for i, (idx, row) in enumerate(table.iterrows()): ax = plt.subplot(nrows, ncols, i+1) #ax.xaxis.set_major_locator(loc) expense_item_hist(row) statement_csv = "data/eg-statement.csv" !head {statement_csv} statement = pd.read_csv(statement_csv, parse_dates=[0, 1], dayfirst=True) statement['Entered Date'].describe() #statement.head() #statement['Transaction Description'].value_counts() def re_sub(regex, repl): return lambda desc: re.sub(regex, repl, desc) clean_up = [lambda desc: desc.lower(), re_sub('\s+', ' '), re_sub('atm#\d+', ''), re_sub('atm owner fee \d+', ''), re_sub('pos( including cash out)?\s*#\d+', ''), re_sub('receipt no \d+', ''), re_sub('crn#\d+', ''), re_sub('\(?ref\.?\d+\)?', ''), re_sub('(au|gb|us)#\d+', '')] statement['desc'] = statement['Transaction Description'].map(compose(*clean_up)) #statement.head() def categorise(desc): if '' in desc: return 'atm' if '' in desc: return 'pos' if 'paypal' in desc: return 'paypal' if 'bpay' in desc: return 'bpay' if '' in desc: return 'fee' if 'transaction fee' in desc: return 'fee' if 'visa access card fee' in desc: return 'fee' if 'member net transfer' in desc: return 'transfer' if 'visa-internode' in desc: return 'bill' if 'visa-linode.com' in desc: return 'bill' if 'visa-citylink' in desc: return 'bill' if 'visa-virgin mobile' in desc: return 'bill' return '' statement['catg'] = statement['desc'].map(categorise) stats = statement.groupby('desc')['Amount'].agg({#'total' : np.sum, 'average' : np.mean, #'min': np.min, 'max': np.max, 'count': np.count_nonzero, 'amounts': lambda amounts: list(amounts)}) last_date = statement['Entered Date'].describe()['last'] start_date = last_date - timedelta(days=15) table = statement.join(stats, on='desc')[statement['Entered Date'].gt(start_date)] cols = ['Entered Date', 'catg', 'desc', 'Amount'] table.sort(['catg'], inplace=True) new = table['count'].eq(1) # only one instance of item description seen not_new = new.eq(False) debit = table['Amount'].lt(0) credit = table['Amount'].gt(0) # only consider it recurring if we've seen it more than twice, # otherwise it could be a doubled-up transaction we want to catch table['recurring'] = [amounts.count(amount) > 1 and len(amounts) > 2 for amount, amounts in zip(table['Amount'], table['amounts'])] recurring = table['recurring'].eq(True) not_recurring = table['recurring'].eq(False) & not_new table[cols][debit & new] expense_histograms(table[cols+['amounts']][debit & not_recurring]) table[cols][debit & recurring] table[cols][credit & new] expense_histograms(table[cols+['amounts']][credit & not_recurring]) table[cols][credit & recurring] left_overs = sorted(statement['desc'].unique()) len(left_overs), left_overs