%pylab inline
from pandas import *
import seaborn as sns
sns.set_context(rc={'lines.markeredgewidth': 0.1})
import re
from scipy.stats import kendalltau, spearmanr, pearsonr, rankdata

# Country lookup tables: country id <-> name, and calling code <-> country id.
country_details = read_pickle('data/countries/countries.pd')
code_to_id = read_pickle('data/countries/code_to_id.pd')
name_to_id = country_details.reset_index().rename(columns={'index': 'Country_id'}).set_index('Country_name')['Country_id']


def id_to_name(x):
    return country_details.ix[x]['Country_name']


def id_to_name_eng(x):
    return country_details.ix[x]['name']


def clean_district(data, district='Quartiere'):
    # Drop the catch-all district and normalise accents and common
    # abbreviations so that district names match across sources.
    data = data[data[district] != 'QUARTIERI RESIDUALI']
    data[district] = (data[district].str.replace(u'à', u"a'")
                                    .str.replace(u'ò', u"o'")
                                    .str.replace(u'è', u"e'")
                                    .str.replace(u'é', u"e'")
                                    .str.upper())
    data[district] = data[district].str.replace(u'S/N', u"SUL NAVIGLIO")
    data[district] = data[district].str.replace(u'VLE', u"VIALE")
    return data


# Historical population per district, age and citizenship (1999-2012).
pop = read_csv('data/districts/pop_sto_quartiere_1999_2012.csv', sep=';', encoding='latin1', decimal=',', thousands='.')
# Collapse the open-ended "100 and over" age class to 100.
pop.Eta = pop.Eta.str.replace('100.*', '100').astype(int)
census_data = clean_district(pop)

# Foreign residents only, aggregated by district and citizenship for 2012.
census_year = '2012'
census_data = census_data[census_data.Cittadinanza != 'Italia'].groupby(['Quartiere', 'Cittadinanza'])[census_year].sum()
census_data = census_data[census_data > 0]
census_countries = census_data.index.get_level_values('Cittadinanza').unique()
census_ids = name_to_id.ix[census_countries]
# Citizenship labels that do not match any canonical country name.
missing_ids = census_ids[census_ids.isnull()].reset_index().rename(columns={'index': 'Country_name'})


def recover_code(x):
    # Match an unrecognised citizenship label to the canonical country name
    # that shares the largest number of non-trivial words with it.
    x = re.compile('[ ]+').sub(r' ', x)
    x = re.compile('[,]+').sub(r',', x)
    x = re.compile('[-]+').sub(r'-', x)
    x = re.compile("[()]").sub(r'', x)
    words = [w.lower() for w in re.compile('[\s,-]+').split(x)]
    countries = name_to_id.index.values
    count = {}
    for country in countries:
        count[country] = 0
        for word in words:
            country_words = [w.lower() for w in re.compile('[\s,-]+').split(re.compile("[()]").sub(r'', country))]
            if len(word) > 1 and word not in ['rep.', 'di', 'repubblica'] and word in country_words:
                count[country] = count[country] + 1
    count = Series(count)
    if count.max() > 0:
        return count.idxmax()
    else:
        return NaN


missing_ids['Recover_name'] = missing_ids.Country_name.apply(recover_code)

# Manual overrides for a handful of labels, mapped to their canonical spellings.
misspelled_names = ['Cinese, Repubblica Popolare', 'Maurizio', 'Russa, Federazione', "Stati Uniti d'America", 'Thailandia', 'Azerbaigian', 'Seicelle', 'Costarica', 'Malaysia', 'Bahama']
correct_names = ['Cina', 'Mauritius', 'Russia', "Stati Uniti D'America", 'Tailandia', 'Azerbaijan', 'Seychelles', 'Costa Rica', 'Malesia', 'Bahamas']
for i, j in zip(misspelled_names, correct_names):
    missing_ids.loc[missing_ids.Country_name.isin([i]), 'Recover_name'] = [j]

name_correction = missing_ids.set_index('Country_name')['Recover_name']

census_data = census_data.reset_index()
census_data = census_data[census_data['Cittadinanza'] != 'Apolide']  # drop stateless residents
census_data['Cittadinanza'] = census_data['Cittadinanza'].apply(lambda x: name_correction.ix[x] if x in name_correction.index else x)
# Rows whose citizenship could not be mapped to a country: inspect, count, drop.
census_data[census_data.Cittadinanza.isnull()]
census_data.isnull().any(axis=1).sum()
census_data = census_data.dropna()
census_data['Country_id'] = name_to_id.ix[census_data.Cittadinanza].values
assert ~census_data.Country_id.isnull().any()

# Census countries with no matching entry in the calling-code table.
census_codes = census_data.groupby(['Country_id', 'Cittadinanza']).sum().reset_index(level=1).join(code_to_id.reset_index().set_index('Country_id'))
census_codes[census_codes.Country_code.isnull()]
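# --- Sketch (added for illustration, not part of the original notebook) ---
# A quick sanity check on the recovery step: after the manual overrides, every
# non-null entry of `name_correction` should itself be a canonical country name
# known to `name_to_id`. `still_unmatched` is a hypothetical helper used only here.
still_unmatched = name_correction[name_correction.notnull() & ~name_correction.isin(name_to_id.index)]
still_unmatched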
# Countries that cannot be told apart by calling code are folded into the country
# they share a code with: the NANP members share +1 with the USA, Kazakhstan
# shares +7 with Russia, and Kosovo is mapped to Serbia.
census_data.Cittadinanza = census_data.Cittadinanza.str.replace("(Trinidad e Tobago|Canada|Giamaica|Bahamas|Dominica$)", "Stati Uniti D'America")
census_data.Cittadinanza = census_data.Cittadinanza.str.replace("Kazakistan", "Russia")
census_data.Cittadinanza = census_data.Cittadinanza.str.replace("Republic of Kosovo", "Serbia")
census_data['Country_id'] = name_to_id.ix[census_data.Cittadinanza].values

# Census counts per district (NIL) and country.
census_pop = census_data.groupby(['Quartiere', 'Country_id'])[census_year].sum()
census_pop = DataFrame(census_pop).rename(columns={census_year: 'census'})
census_pop.index.names = ('NIL', 'Country_id')
census_pop.groupby(level=0).size().plot(kind='bar', figsize=(15, 5))

# Call intensity per grid cell and calling code.
store = HDFStore('./stores/aggregated_dataset.h5')
intensity_data = store.select('intensity_ni_1M').reset_index()
# Only November data was used for this analysis.
intensity_data = intensity_data[intensity_data.time < '2013-12']
store.close()

# Grid cell -> district (NIL) mapping.
nil_cells = read_csv('data/districts/celle_nil.csv')
nil_cells = nil_cells[['cellId', 'NIL']].set_index('cellId')

# Drop unknown (0) and Italian (39) calling codes; keep only codes with a known country.
intensity_data = intensity_data[logical_not(intensity_data.Country_code.isin([0, 39])) & intensity_data.Country_code.isin(code_to_id.index)]
intensity_data = intensity_data.join(code_to_id, on='Country_code')
intensity_data = intensity_data.groupby(['Square_id', 'Country_id'])[['Call']].agg(sum)
intensity_data = intensity_data.reset_index()
intensity_data = intensity_data.join(nil_cells, on='Square_id')

# Call volume per district (NIL) and country.
call_intensity = intensity_data.groupby(['NIL', 'Country_id'])['Call'].sum()
call_intensity = DataFrame(call_intensity).rename(columns={'Call': 'telco'})
call_intensity.groupby(level=0).size().plot(kind='bar', figsize=(15, 5))


def kt_test(df):
    # Kendall's tau between the two columns of df; returns None when the test
    # cannot be computed (single observation or a scipy failure).
    if len(df) == 1:
        return
    x, y = df.transpose().values
    try:
        k, p = kendalltau(x, y)
    except:
        return
    return Series([k, p], index=['k', 'p'])


# City-wide comparison: total census population vs total call volume per country.
census_pop_tot = census_pop.groupby(level=1).sum()
call_intensity_tot = call_intensity.groupby(level=1).sum()
census_vs_telco = merge(census_pop_tot, call_intensity_tot, how='outer', left_index=True, right_index=True)
census_vs_telco.index = Series(census_vs_telco.index).astype(int).map(id_to_name)
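# --- Sketch (added for illustration, not part of the original notebook) ---
# kt_test above wraps scipy's kendalltau, which measures how well two rankings
# agree: roughly (concordant pairs - discordant pairs) / total pairs, with a
# correction for ties (tau-b). A toy call, on hypothetical values:
toy_tau, toy_p = kendalltau([10, 20, 30, 40], [1, 3, 2, 4])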
# 1. Countries with call traffic but no census residents.
census_vs_telco[census_vs_telco['census'].isnull()].sort(columns='telco', ascending=False)
# 2. Countries with census residents but no call traffic.
census_vs_telco[census_vs_telco['telco'].isnull()].sort(columns='census', ascending=False)

# Census vs telco totals per country on a log scale, outer join (missing values as 0) vs inner join.
sns.jointplot("census", "telco", log10(1 + census_vs_telco.fillna(0)), kind='reg')
sns.jointplot("census", "telco", log10(1 + census_vs_telco.dropna()), kind='reg')

# Kendall's tau on the country totals, outer join vs inner join.
kt_test(census_vs_telco.fillna(0))
kt_test(census_vs_telco.dropna())

# Country rankings induced by the two sources, again outer vs inner join.
census_vs_telco_ranked = census_vs_telco.fillna(0).rank(method='max').sort(columns='census', ascending=False)
census_vs_telco_ranked.plot(kind='bar', figsize=(20, 5))
census_vs_telco_ranked = census_vs_telco.dropna().rank(method='max').sort(columns='census', ascending=False)
census_vs_telco_ranked.plot(kind='bar', figsize=(20, 5))

# District-level comparison: drop districts with no census data at all.
nil_vs_telco = merge(call_intensity, census_pop, how='outer', left_index=True, right_index=True)
nil_vs_telco.groupby(level=0).filter(lambda x: x['census'].isnull().all()).index.get_level_values('NIL').unique()
nil_vs_telco = nil_vs_telco.groupby(level=0).filter(lambda x: ~x['census'].isnull().all())
nil_vs_telco = nil_vs_telco.drop("QUARTIERI RESIDUALI")

# Top-15 countries by census population and by call volume, side by side.
top_census = nil_vs_telco[['census']].groupby(level=1).sum().sort(ascending=False, columns='census').reset_index()[['Country_id']]
top_census.Country_id = top_census.Country_id.astype(int).apply(id_to_name_eng)
top_telco = nil_vs_telco[['telco']].groupby(level=1).sum().sort(ascending=False, columns='telco').reset_index()[['Country_id']]
top_telco.Country_id = top_telco.Country_id.astype(int).apply(id_to_name_eng)
top_ranked = top_census.join(top_telco, lsuffix='_census', rsuffix='_telco')[:15]
top_ranked.index = top_ranked.index + 1
top_ranked
# How many of the top-15 census countries also appear in the top-15 telco countries?
top_ranked.Country_id_census.isin(top_ranked.Country_id_telco).sum()

# Missing values per district.
nil_vs_telco.isnull().groupby(level=0).sum().sort(columns='census', ascending=False)[:10]
nil_vs_telco.groupby(level=0).agg({'census': lambda x: x.notnull().sum(), 'telco': lambda x: x.notnull().sum()}).plot(kind='bar', figsize=(20, 5))

# Kendall's tau per district, outer join (missing values as 0) vs inner join.
kt_nil_outer = nil_vs_telco.fillna(0).groupby(level='NIL').apply(kt_test)
kt_nil_inner = nil_vs_telco.dropna().groupby(level='NIL').apply(kt_test)
kt_nil_outer.k.describe()
(kt_nil_outer.k > .4).sum()
kt_nil_inner.k.describe()
(kt_nil_inner.k > .4).sum()

# Per-district tau, outer vs inner join, plotted side by side.
figure(figsize=(25, 5))
kt_nil_outer[['k']].sort(columns='k').join(kt_nil_inner[['k']], rsuffix='_inner').plot(kind='bar', figsize=(20, 5))
title('Call')
# p-values per district, and tau masked to the districts where p < 0.01.
kt_nil_inner['p'].order().plot(kind='bar', figsize=(20, 5))
title('Tot')
((kt_nil_inner['p'] < 0.01) * kt_nil_inner['k']).order().plot(kind='bar', figsize=(20, 5))
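# --- Sketch (added for illustration, not part of the original notebook) ---
# spearmanr and pearsonr are imported above but never used; a cross-check of the
# city-wide agreement with other correlation measures could look like this
# (`inner_totals` is a hypothetical name used only in this sketch).
inner_totals = census_vs_telco.dropna()
spearmanr(inner_totals['census'], inner_totals['telco'])
pearsonr(log10(1 + inner_totals['census']), log10(1 + inner_totals['telco']))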