%pylab inline
Populating the interactive namespace from numpy and matplotlib
from pandas import *
import seaborn as sns
sns.set_context(rc={'lines.markeredgewidth': 0.1})
import re
from scipy.stats import kendalltau, spearmanr, pearsonr, rankdata
source: mledoze / countries
country_details = read_pickle('data/countries/countries.pd')
code_to_id = read_pickle('data/countries/code_to_id.pd')
name_to_id = country_details.reset_index().rename(columns={'index': 'Country_id'}).set_index('Country_name')['Country_id']
def id_to_name(x):
return country_details.ix[x]['Country_name']
def id_to_name_eng(x):
return country_details.ix[x]['name']
def clean_district(data, district='Quartiere'):
data = data[data[district] != 'QUARTIERI RESIDUALI']
data[district] = data[district].str.replace(u'à', u"a'").str.replace(u'ò', u"o'").str.replace(u'è', u"e'").str.replace(u'é', u"e'").str.upper()
data[district] = data[district].str.replace(u'S/N', u"SUL NAVIGLIO")
data[district] = data[district].str.replace(u'VLE', u"VIALE")
return data
pop = read_csv('data/districts/pop_sto_quartiere_1999_2012.csv', sep=';', encoding='latin1', decimal=',', thousands='.')
pop.Eta = pop.Eta.str.replace('100.*', '100').astype(int)
census_data = clean_district(pop)
census_year = '2012'
census_data = census_data[census_data.Cittadinanza != 'Italia'].groupby(['Quartiere', 'Cittadinanza'])[census_year].sum()
census_data = census_data[census_data > 0]
census_countries = census_data.index.get_level_values('Cittadinanza').unique()
census_ids = name_to_id.ix[census_countries]
missing_ids = census_ids[census_ids.isnull()].reset_index().rename(columns={'index': 'Country_name'})
def recover_code(x):
x = re.compile('[ ]+').sub(r' ', x)
x = re.compile('[,]+').sub(r',', x)
x = re.compile('[-]+').sub(r'-', x)
x = re.compile("[()]").sub(r'', x)
words = [w.lower() for w in re.compile('[\s,-]+').split(x)]
countries = name_to_id.index.values
count = {}
for country in countries:
count[country] = 0
for word in words:
country_words = [w.lower() for w in re.compile('[\s,-]+').split(re.compile("[()]").sub(r'', country))]
if len(word) > 1 and word not in ['rep.', 'di', 'repubblica'] and word in country_words:
count[country] = count[country] + 1
count = Series(count)
if count.max() > 0:
return count.idxmax()
else:
return NaN
missing_ids['Recover_name'] = missing_ids.Country_name.apply(recover_code)
mispelled_names = ['Cinese, Repubblica Popolare', 'Maurizio', 'Russa, Federazione', "Stati Uniti d'America", 'Thailandia', 'Azerbaigian',
'Seicelle', 'Costarica', 'Malaysia', 'Bahama']
correct_names = ['Cina', 'Mauritius', 'Russia', "Stati Uniti D'America", 'Tailandia', 'Azerbaijan',
'Seychelles', 'Costa Rica', 'Malesia', 'Bahamas']
for i, j in zip(mispelled_names, correct_names):
missing_ids.loc[missing_ids.Country_name.isin([i]), 'Recover_name'] = [j]
name_correction = missing_ids.set_index('Country_name')['Recover_name']
census_data = census_data.reset_index()
census_data = census_data[census_data['Cittadinanza'] != 'Apolide']
census_data['Cittadinanza'] = census_data['Cittadinanza'].apply(lambda x: name_correction.ix[x] if x in name_correction.index else x)
census_data[census_data.Cittadinanza.isnull()]
Quartiere | Cittadinanza | 2012 |
---|
census_data.isnull().any(axis=1).sum()
0
census_data = census_data.dropna()
census_data['Country_id'] = name_to_id.ix[census_data.Cittadinanza].values
assert ~census_data.Country_id.isnull().any()
census_codes = census_data.groupby(['Country_id', 'Cittadinanza']).sum().reset_index(level=1).join(code_to_id.reset_index().set_index('Country_id'))
census_codes[census_codes.Country_code.isnull()]
Cittadinanza | 2012 | Country_code | Country_pop | |
---|---|---|---|---|
Country_id | ||||
40 | Canada | 199 | NaN | NaN |
111 | Giamaica | 14 | NaN | NaN |
115 | Kazakistan | 87 | NaN | NaN |
180 | Republic of Kosovo | 210 | NaN | NaN |
227 | Trinidad e Tobago | 6 | NaN | NaN |
Trinidad, Canada, Jamaica, Bahamas, Dominica, have the same calling code as US, Kazakistan has the same calling code as Russia. Kosovo has the same calling code as Serbia.
census_data.Cittadinanza = census_data.Cittadinanza.str.replace("(Trinidad e Tobago|Canada|Giamaica|Bahamas|Dominica$)", "Stati Uniti D'America")
census_data.Cittadinanza = census_data.Cittadinanza.str.replace("Kazakistan", "Russia")
census_data.Cittadinanza = census_data.Cittadinanza.str.replace("Republic of Kosovo", "Serbia")
census_data['Country_id'] = name_to_id.ix[census_data.Cittadinanza].values
census_pop = census_data.groupby(['Quartiere', 'Country_id'])[census_year].sum()
census_pop = DataFrame(census_pop).rename(columns={census_year: 'census'})
census_pop.index.names = ('NIL', 'Country_id')
census_pop.groupby(level=0).size().plot(kind='bar', figsize=(15, 5))
<matplotlib.axes._subplots.AxesSubplot at 0x7f97b5635a50>
store = HDFStore('./stores/aggregated_dataset.h5')
intensity_data = store.select('intensity_ni_1M').reset_index()
# only November data was used for this analysis
intensity_data = intensity_data[intensity_data.time < '2013-12']
store.close()
nil_cells = read_csv('data/districts/celle_nil.csv')
nil_cells = nil_cells[['cellId', 'NIL']].set_index('cellId')
intensity_data = intensity_data[logical_not(intensity_data.Country_code.isin([0, 39])) &
intensity_data.Country_code.isin(code_to_id.index)]
intensity_data = intensity_data.join(code_to_id, on='Country_code')
intensity_data = intensity_data.groupby(['Square_id', 'Country_id'])[['Call']].agg(sum)
intensity_data = intensity_data.reset_index()
intensity_data = intensity_data.join(nil_cells, on='Square_id')
call_intensity = intensity_data.groupby(['NIL', 'Country_id'])['Call'].sum()
call_intensity = DataFrame(call_intensity).rename(columns={'Call': 'telco'})
call_intensity.groupby(level=0).size().plot(kind='bar', figsize=(15, 5))
<matplotlib.axes._subplots.AxesSubplot at 0x7f97b57eadd0>
def kt_test(df):
if len(df) == 1:
return
x, y = df.transpose().values
try:
k, p = kendalltau(x, y)
except:
return
return Series([k, p], index=['k', 'p'])
census_pop_tot = census_pop.groupby(level=1).sum()
call_intensity_tot = call_intensity.groupby(level=1).sum()
census_vs_telco = merge(census_pop_tot, call_intensity_tot, how='outer', left_index=True, right_index=True)
census_vs_telco.index = Series(census_vs_telco.index).astype(int).map(id_to_name)
NaN values correspond either to:
# 1.
census_vs_telco[census_vs_telco['census'].isnull()].sort(columns='telco', ascending=False)
census | telco | |
---|---|---|
Country_id | ||
Emirati Arabi Uniti | NaN | 20512.689453 |
Hong Kong | NaN | 9334.054688 |
Qatar | NaN | 4778.136230 |
oman | NaN | 1292.375000 |
Bahrein | NaN | 711.344421 |
Bhutan | NaN | 526.156982 |
Sudan del sud | NaN | 430.463745 |
Liechtenstein | NaN | 310.338776 |
Riunione | NaN | 261.787476 |
Macao | NaN | 241.948807 |
Cambogia | NaN | 240.152267 |
Guinea Equatoriale | NaN | 229.740540 |
Andorra | NaN | 168.752899 |
Guadeloupa | NaN | 159.081924 |
Porto Rico | NaN | 147.603287 |
São Tomé e Príncipe | NaN | 137.702866 |
Namibia | NaN | 133.522461 |
Lesotho | NaN | 127.724556 |
Sint Maarten | NaN | 106.729385 |
Suriname | NaN | 87.424164 |
Gibilterra | NaN | 76.608322 |
Saint Vincent e Grenadine | NaN | 72.256630 |
Aruba | NaN | 65.973450 |
Bahamas | NaN | 59.690262 |
Gibuti | NaN | 56.548664 |
Vanuatu | NaN | 53.244602 |
Botswana | NaN | 48.090466 |
Guyana francese | NaN | 45.207359 |
Santa Lucia | NaN | 40.840706 |
Nuova Caledonia | NaN | 38.907169 |
... | ... | ... |
Tonga | NaN | 28.187069 |
Timor Est | NaN | 28.187067 |
Belize | NaN | 25.146940 |
Comore | NaN | 25.138527 |
Polinesia Francese | NaN | 25.132738 |
Grenada | NaN | 23.558239 |
Groenlandia | NaN | 21.991148 |
Isole Far Oer | NaN | 21.970684 |
Kiribati | NaN | 18.849556 |
Samoa | NaN | 18.849556 |
Isole Vergini americane | NaN | 15.724266 |
Guyana | NaN | 15.707963 |
Isole Cayman | NaN | 15.707963 |
Isole Cook | NaN | 12.566371 |
Dominica | NaN | 12.566371 |
Bermuda | NaN | 9.424779 |
Tuvalu | NaN | 9.424778 |
Samoa Americane | NaN | 8.495346 |
Isole Turks e Caicos | NaN | 6.283185 |
Città del Vaticano | NaN | 6.283185 |
Wallis e Futuna | NaN | 6.283185 |
Nauru | NaN | 6.283185 |
Territorio britannico dell'oceano indiano | NaN | 6.283185 |
Isole Marshall | NaN | 3.952756 |
Isole Tokelau | NaN | 3.145879 |
Isole Marianne Settentrionali | NaN | 3.141593 |
Niue | NaN | 3.141593 |
Anguilla | NaN | 3.141593 |
Palau | NaN | 3.141593 |
Isole Falkland o Isole Malvine | NaN | 1.509182 |
69 rows × 2 columns
# 2.
census_vs_telco[census_vs_telco['telco'].isnull()].sort(columns='census', ascending=False)
census | telco | |
---|---|---|
Country_id | ||
Corea del Nord | 5 | NaN |
sns.jointplot("census", "telco", log10(1 + census_vs_telco.fillna(0)), kind='reg')
<seaborn.axisgrid.JointGrid at 0x7f97da7df410>
sns.jointplot("census", "telco", log10(1 + census_vs_telco.dropna()), kind='reg')
<seaborn.axisgrid.JointGrid at 0x7f97b663d950>
73 countries are missing from census data but present in telco data, depending on the handling of this missing data we will get different correlations:
# outer join
kt_test(census_vs_telco.fillna(0))
k 6.976619e-01 p 4.958213e-53 dtype: float64
# inner join
kt_test(census_vs_telco.dropna())
k 5.978741e-01 p 4.160940e-27 dtype: float64
A graphical comparision (outer join):
census_vs_telco_ranked = census_vs_telco.fillna(0).rank(method='max').sort(columns='census', ascending=False)
census_vs_telco_ranked.plot(kind='bar', figsize=(20,5))
<matplotlib.axes._subplots.AxesSubplot at 0x7f97b6e9b790>
A graphical comparision (inner join):
census_vs_telco_ranked = census_vs_telco.dropna().rank(method='max').sort(columns='census', ascending=False)
census_vs_telco_ranked.plot(kind='bar', figsize=(20,5))
<matplotlib.axes._subplots.AxesSubplot at 0x7f97b5a39790>
nil_vs_telco = merge(call_intensity, census_pop, how='outer', left_index=True, right_index=True)
Drop NIL for which we have no census data:
nil_vs_telco.groupby(level=0).filter(lambda x: x['census'].isnull().all()).index.get_level_values('NIL').unique()
array(['CANTALUPA', 'CASCINA TRIULZA - EXPO', 'CHIARAVALLE', 'FIGINO', 'MUGGIANO', 'PARCO AGRICOLO SUD', "PARCO BOSCO IN CITTA'", 'PARCO DEI NAVIGLI', 'PARCO DELLE ABBAZIE', 'PARCO FORLANINI - ORTICA', 'PARCO NORD', 'PARCO SEMPIONE', 'QUINTOSOLE', 'SACCO', 'STEPHENSON', 'TRE TORRI', 'TRIULZO SUPERIORE'], dtype=object)
nil_vs_telco = nil_vs_telco.groupby(level=0).filter(lambda x: ~x['census'].isnull().all())
nil_vs_telco = nil_vs_telco.drop("QUARTIERI RESIDUALI")
top_census = nil_vs_telco[['census']].groupby(level=1).sum().sort(ascending=False, columns='census').reset_index()[['Country_id']]
top_census.Country_id = top_census.Country_id.astype(int).apply(id_to_name_eng)
top_telco = nil_vs_telco[['telco']].groupby(level=1).sum().sort(ascending=False, columns='telco').reset_index()[['Country_id']]
top_telco.Country_id = top_telco.Country_id.astype(int).apply(id_to_name_eng)
top_ranked = top_census.join(top_telco, lsuffix='_census', rsuffix='_telco')[:15]
top_ranked.index = top_ranked.index + 1
top_ranked
Country_id_census | Country_id_telco | |
---|---|---|
1 | Philippines | Egypt |
2 | Egypt | Bangladesh |
3 | China | Switzerland |
4 | Peru | China |
5 | Sri Lanka | Ukraine |
6 | Ecuador | France |
7 | Romania | United Kingdom |
8 | Morocco | Sri Lanka |
9 | Ukraine | Philippines |
10 | Bangladesh | Romania |
11 | Albania | Russia |
12 | El Salvador | Germany |
13 | France | Spain |
14 | Brazil | Senegal |
15 | Moldova | Ecuador |
top_ranked.Country_id_census.isin(top_ranked.Country_id_telco).sum()
9
Many countries are missing from census data but present in telco data, depending on the handling of this missing data we will get different correlations:
nil_vs_telco.isnull().groupby(level=0).sum().sort(columns='census', ascending=False)[:10]
telco | census | |
---|---|---|
NIL | ||
MAGGIORE - MUSOCCO | 1 | 105 |
DUOMO | 0 | 99 |
FARINI | 2 | 94 |
GARIBALDI REPUBBLICA | 2 | 92 |
BRERA | 2 | 89 |
BOVISASCA | 1 | 88 |
QT 8 | 0 | 87 |
PORTELLO | 0 | 84 |
QUINTO ROMANO | 1 | 84 |
PAGANO | 0 | 83 |
nil_vs_telco.groupby(level=0).agg({'census': lambda x: x.notnull().sum(), 'telco': lambda x: x.notnull().sum()}).plot(kind='bar', figsize=(20, 5))
<matplotlib.axes._subplots.AxesSubplot at 0x7f97acd0ebd0>
kt_nil_outer = nil_vs_telco.fillna(0).groupby(level='NIL').apply(kt_test)
kt_nil_inner = nil_vs_telco.dropna().groupby(level='NIL').apply(kt_test)
kt_nil_outer.k.describe()
count 69.000000 mean 0.504946 std 0.089336 min 0.159067 25% 0.449533 50% 0.510122 75% 0.578244 max 0.682520 Name: k, dtype: float64
(kt_nil_outer.k > .4).sum()
58
kt_nil_inner.k.describe()
count 69.000000 mean 0.468872 std 0.081513 min 0.212301 25% 0.426314 50% 0.482826 75% 0.528603 max 0.625140 Name: k, dtype: float64
(kt_nil_inner.k > .4).sum()
55
figure(figsize=(25,5));
kt_nil_inner[['k']].sort(columns='k').join(kt_nil_inner[['k']], rsuffix='_inner').plot(kind='bar', figsize=(20,5));
title('Call');
<matplotlib.figure.Figure at 0x7f97afad0f50>
kt_nil_inner['p'].order().plot(kind='bar', figsize=(20,5))
title('Tot')
<matplotlib.text.Text at 0x7f97ac0fa310>
((kt_nil_inner['p'] < 0.01) * kt_nil_inner['k']).order().plot(kind='bar', figsize=(20, 5))
<matplotlib.axes._subplots.AxesSubplot at 0x7f97ac3cd450>