%pylab inline
Populating the interactive namespace from numpy and matplotlib
from pandas import *
import dateutil
from datetime import timedelta
time_span = '1D'
store = HDFStore('./stores/aggregated_dataset.h5')
intensity = store['intensity_' + time_span].fillna(0)
store.close()
def fix_timezone(df):
if type(df.index) is DatetimeIndex and df.index.tz is None:
df.index = df.index.tz_localize('UTC').tz_convert('Europe/Rome')
return df
intensity.head()
SMS_in | SMS_out | Call_in | Call_out | |||
---|---|---|---|---|---|---|
Square_id | Country_code | |||||
2013-11-01 00:00:00+01:00 | 1 | 0 | 20.821161 | 0.058800 | 0.161477 | 0.670390 |
1 | 0.026137 | 0.000000 | 0.000000 | 0.000000 | ||
7 | 0.000000 | 0.001787 | 0.000000 | 0.000000 | ||
20 | 0.000000 | 0.000000 | 0.000000 | 0.290462 | ||
32 | 0.027300 | 0.000000 | 0.000000 | 0.000000 |
intensity = fix_timezone(intensity.reset_index(level=[1,2]))
intensity.index.name = 'time'
intensity['Call'] = intensity['Call_in'].values + intensity['Call_out'].values
intensity.drop(['SMS_in', 'SMS_out', 'Call_in', 'Call_out'], axis=1, inplace=True)
intensity = intensity[intensity['Call'] > 0]
intensity.head()
Square_id | Country_code | Call | |
---|---|---|---|
time | |||
2013-11-01 00:00:00+01:00 | 1 | 0 | 0.831867 |
2013-11-01 00:00:00+01:00 | 1 | 20 | 0.290462 |
2013-11-01 00:00:00+01:00 | 1 | 33 | 0.027300 |
2013-11-01 00:00:00+01:00 | 1 | 34 | 0.027300 |
2013-11-01 00:00:00+01:00 | 1 | 39 | 87.455475 |
def entropy(v):
# v : countries intensity vector
n = len(v)
# drop null values
v = v.dropna()
# drop intensities equal to 0
v = v[v > 0]
# return NaN if no country is left or no intensity is left:
if len(v) == 0 or v.sum() == 0:
return Series([NaN, NaN, len(v)], index=['entropy', 'entropy_n', 'n'])
# return 0 if one country is left:
if len(v) == 1:
return Series([0, 0, 1], index=['entropy', 'entropy_n', 'n'])
# frequencies
# p_i = v_i / \sum_j v_j
p = v.div(float(v.sum()))
# entropy
# e = - \sum_i p_i * log(p_i)
e = - p.map(lambda x: x * log(x)).sum()
return Series([e, e / log(len(p)), len(p)], index=['entropy', 'entropy_n', 'n'])
def plot_entropy(edf, n=12):
for time in edf.index.unique()[:n]:
figure(figsize=(20,5))
suptitle(time)
ref = Series(index=Index(arange(1, 10**4 + 1)), name='Square_id')
subplot(131)
data, _ = edf.ix[time].set_index('Square_id')['entropy'].align(ref)
pcolormesh(data.fillna(0).reshape(100,100), cmap=cm.Spectral, vmax=3.8)
colorbar()
title("entropy")
subplot(132)
data, _ = edf.ix[time].set_index('Square_id')['entropy_n'].align(ref)
pcolormesh(data.fillna(0).reshape(100,100), cmap=cm.Spectral, vmax=1)
colorbar()
title("entropy (normalized)")
subplot(133)
data, _ = edf.ix[time].set_index('Square_id')['n'].align(ref)
pcolormesh(data.fillna(0).reshape(100,100), cmap=cm.Blues)
colorbar()
title("entropy (n)")
show()
# drop italy codes but keep every other country
intensity_ni = (intensity[logical_not(intensity['Country_code'].isin([0,39]))]
.set_index(['Country_code', 'Square_id'], append=True))
t_entropy = intensity_ni.unstack(level='Country_code').apply(entropy, axis=1)
t_entropy = fix_timezone(t_entropy.reset_index(level='Square_id'))
plot_entropy(t_entropy)
m_intensity = intensity.groupby(TimeGrouper('M')).apply(lambda x: x.groupby(['Square_id', 'Country_code']).sum())
m_intensity.index.set_names(['time'] + m_intensity.index.names[1:], inplace=True)
m_intensity_ni = m_intensity[logical_not(m_intensity.index.get_level_values('Country_code').isin([0, 39]))]
m_entropy = m_intensity_ni.unstack(level='Country_code').apply(entropy, axis=1)
m_entropy = fix_timezone(m_entropy.reset_index(level='Square_id'))
plot_entropy(m_entropy)
a_intensity = intensity.groupby(TimeGrouper('A')).apply(lambda x: x.groupby(['Square_id', 'Country_code']).sum())
a_intensity.index.set_names(['time'] + a_intensity.index.names[1:], inplace=True)
a_intensity_ni = a_intensity[logical_not(a_intensity.index.get_level_values('Country_code').isin([0, 39]))]
a_entropy = a_intensity_ni.unstack(level='Country_code').apply(entropy, axis=1)
a_entropy = fix_timezone(a_entropy.reset_index(level='Square_id'))
plot_entropy(a_entropy)
def store_if_new(store, name, df):
if name in store:
print "[INFO] skipping %s, already in store" % name
store[name] = df
store.open()
# entropy
store_if_new(store, 'entropy_' + time_span, t_entropy)
store_if_new(store, 'entropy_1M', m_entropy)
store_if_new(store, 'entropy_1A', a_entropy)
# intensity_ni
store['intensity_ni_' + time_span] = intensity_ni
store['intensity_ni_1M'] = m_intensity_ni
store['intensity_ni_1A'] = a_intensity_ni
store.flush()
store.close()