# -*- coding: utf-8 -*-
"""Exploratory look at the scraped EPCI accounts.

Loads ``scraped_data/epci_all.csv``, derives debt and staff-cost ratios
relative to operating revenues, and compares the scraped SIREN codes with the
ones listed in the 2013 EPCI composition workbook.

Several statements below are bare expressions: they only display something in
an interactive session and have no visible effect when run as a plain script.
"""
import os

import pandas as pd

# NOTE(review): ``plt`` is used further down but is never imported in the
# visible source. Presumably it is provided by the interactive environment
# (e.g. ``import matplotlib.pyplot as plt`` in an earlier cell) -- confirm.

try:
    text_type = unicode  # Python 2
except NameError:
    text_type = str  # Python 3: ``str`` is already the text type

# Data directories are resolved relative to the parent of the working dir.
curdir = os.path.abspath('./..')

df = pd.read_csv(os.path.join(curdir, 'scraped_data', 'epci_all.csv'))

# Quick preview of the main accounting columns.
df[['year', 'net_profit', 'staff_costs', 'financial_costs',
    'debt_repayments', 'allocation']].head(n=20)
df.columns

# Ratios expressed as a share of operating revenues.
df['debt_ratio'] = df['debt_annual_costs'] / df['operating_revenues']
df['staff_costs_ratio'] = df['staff_costs'] / df['operating_revenues']

# Parenthesised single-argument form prints the same text on Python 2 and 3.
print("Nombre d'EPCI crawlés par an")
df.groupby('year').year.count()

xls = pd.ExcelFile(os.path.join(curdir, 'data', 'epci-au-01-01-2013.xls'))
data = xls.parse('Composition communale des EPCI')

# Row 0 of that column is skipped; index alignment on assignment leaves it as
# NaN in the new ``siren`` column.
data['siren'] = data[u'Établissement public à fiscalité propre'][1:]
data['siren'].dropna().unique().size

# Number of SIREN codes present in only one of the two sources.
# There is a strange EPCI code "ZZZZZZZZZZZZZZ" in the reference data.
scraped_sirens = set(df['siren'].apply(text_type).unique())
len(scraped_sirens.symmetric_difference(data['siren'].unique()))

plt.figure(figsize=(12, 12))
df[['debt_ratio', 'staff_costs_ratio']].boxplot()

df[['debt_ratio', 'staff_costs_ratio', 'name']].head(20)

# Highest debt ratios first. ``DataFrame.sort(columns=...)`` was removed from
# pandas; ``sort_values(by=...)`` is its replacement.
_df = df.sort_values(by='debt_ratio', ascending=False)
_df[['year', 'debt_ratio', 'name']].head(n=20)