%pylab inline import pandas as pd import matplotlib.pyplot as plt import textwrap import datetime rc('font', family='Ubuntu') rc('figure', figsize=(18, 10)) pd.set_option('display.max_rows', 500) wrap = lambda text, width: '\n'.join(textwrap.wrap(text, width)) now = datetime.datetime.now() format_percent = '{:.02f}%'.format data = pd.read_csv('tarybu_nariai_-_main_information.csv', sep='|', parse_dates=['birthdate']) get_segment = lambda: data[data.county=='Vilniaus miesto'] get_segment().party.value_counts().to_frame('Kandidatų skaičius') fig, ax = plt.subplots(1, 1) parties = get_segment().party.unique() labels = [wrap(party.decode('utf-8'), 42) for party in parties] cmap = map(plt.cm.brg, linspace(0, 1, len(parties))) data['visasturtas'] = data.turtas + data.vertybiniai + data.pinigai + data.suteiktospaskolos segment = get_segment() for color, party, label in zip(cmap, parties, labels): segment[segment.party==party].plot(kind='scatter', x='mokesciai', y='visasturtas', color=color, label=label, ax=ax) data['tms'] = ((data.mokesciai / data.visasturtas).replace(inf, 0).fillna(0) * 100).replace(0, NaN) segment = get_segment() segment = segment[segment.tms.notnull()].groupby('party').tms.mean().to_frame().sort('tms') segment['tms'] = segment.tms.map(format_percent) segment.index.rename(None, inplace=True) segment.rename(columns={'tms': 'Viso turto ir mokesčių santykis'}) data['pms'] = ((data.mokesciai / data.pajamos).replace(np.inf, 0).fillna(0) * 100).astype(int).replace(0, np.NaN) segment = get_segment() segment = segment[segment.pms.notnull()].groupby('party').pms.mean().to_frame().sort('pms') segment['pms'] = segment.pms.map(format_percent) segment.index.rename(None, inplace=True) segment.rename(columns={'pms': 'Pajamų ir mokesčių santykis'}) data['age'] = (now - data.birthdate).astype('2 mln.']) segment = get_segment() segment = segment.groupby(['party', 'ts']).visasturtas.count().to_frame() segment.index.rename(['Partija', 'Turto segmentai'], inplace=True) 
# Display-only expression: the renamed copy is not assigned back to `segment`.
segment.rename(columns={'visasturtas': 'Kandidatai'})

# --- Split rows into two wealth categories at 1e6 ---
data['visasturtas'] = (
    data.turtas + data.vertybiniai + data.pinigai + data.suteiktospaskolos
).fillna(0)
# An open upper edge keeps the bins strictly increasing even when the largest
# total is at or below 1e6 (``int(max + 1)`` made ``pd.cut`` raise then);
# for larger maxima the binning is unchanged.
segments = [-1, 1e6, float('inf')]
data['ts'] = pd.cut(
    data.visasturtas, segments, labels=['varguoliai', 'milijonieriai']
)

segment = (
    get_segment().groupby(['party', 'ts']).visasturtas.agg(['count', 'sum'])
)
# Per-party total broadcast back onto every (party, ts) row; replaces the
# removed ``Series.sum(level=...)``.
party_totals = segment['sum'].groupby(level='party').transform('sum')
segment['percent'] = (segment['sum'] / party_totals * 100).map(format_percent)
segment['sum'] = segment['sum'].map('{:,}'.format)
segment.index.rename(['Partija', 'Kategorija'], inplace=True)
segment.rename(
    columns={'count': 'Kandidatai', 'sum': 'Turtas', 'percent': 'Turto %'}
)

# --- Per party: non-null `otherelections` values next to non-null `name`s ---
grp = get_segment().groupby('party')
result = grp.otherelections.count().to_frame()
result['count'] = grp['name'].count()
result.index.rename(None, inplace=True)
result.sort_values('otherelections').rename(columns={
    'otherelections': 'Kandidatuoja ne pirmą kartą',
    'count': 'Viso kandidatų partijoje',
})


def extract_languages(frame):
    """Yield one ``{'party', 'language'}`` dict per language token of each row.

    The `languages` value of every row is converted with ``str``, commas are
    treated as separators, and the result is split on whitespace. The token
    'pagrindai' is skipped, and 'nan' (the string form of a missing value)
    is reported as 'jokios'.

    Args:
        frame: DataFrame providing `party` and `languages` columns.

    Yields:
        dict with the row's party and one language token.
    """
    replacements = {'nan': 'jokios'}
    for _, row in frame.iterrows():
        for token in str(row.languages).replace(',', ' ').split():
            if token == 'pagrindai':
                continue
            yield {
                'party': row.party,
                'language': replacements.get(token, token),
            }


# --- Language mentions per party ---
# ``reset_index`` adds an `index` column whose per-group count becomes the
# number of mentions of each (party, language) pair.
result = pd.DataFrame(list(extract_languages(get_segment()))).reset_index()
result = result.groupby(['party', 'language']).count()
result = (
    result.reset_index()
    .sort_values(['party', 'index'], ascending=[True, False])
    .set_index(['party'])
)
# `result` has one row per (party, language), so its party index repeats.
# Look the party sizes up per row and divide positionally rather than
# relying on label alignment over a duplicated index.
party_sizes = get_segment().party.value_counts()
result['Procentas'] = (
    result['index'] / party_sizes.reindex(result.index).values * 100
).map(format_percent)
result.set_index('language', append=True, inplace=True)
result.index.rename(['Partija', 'Kalba'], inplace=True)
result.rename(columns={'index': 'Kandidatai'})