%pylab inline
from pandas import *
import dateutil
from datetime import timedelta

time_span = '1D'

# load the aggregated call/SMS intensity for the chosen time span
store = HDFStore('./stores/aggregated_dataset.h5')
intensity = store['intensity_' + time_span].fillna(0)
store.close()


def fix_timezone(df):
    # localize naive timestamps to UTC, then convert to the local timezone
    if type(df.index) is DatetimeIndex and df.index.tz is None:
        df.index = df.index.tz_localize('UTC').tz_convert('Europe/Rome')
    return df


intensity.head()

# move Square_id and Country_code into columns, keep only the total call
# intensity (incoming + outgoing), drop the SMS columns and rows with no calls
intensity = fix_timezone(intensity.reset_index(level=[1, 2]))
intensity.index.name = 'time'
intensity['Call'] = intensity['Call_in'].values + intensity['Call_out'].values
intensity.drop(['SMS_in', 'SMS_out', 'Call_in', 'Call_out'], axis=1, inplace=True)
intensity = intensity[intensity['Call'] > 0]
intensity.head()


def entropy(v):
    # v : countries intensity vector
    # drop null values
    v = v.dropna()
    # drop intensities equal to 0
    v = v[v > 0]
    # return NaN if no country or no intensity is left
    if len(v) == 0 or v.sum() == 0:
        return Series([NaN, NaN, len(v)], index=['entropy', 'entropy_n', 'n'])
    # return 0 if a single country is left
    if len(v) == 1:
        return Series([0, 0, 1], index=['entropy', 'entropy_n', 'n'])
    # frequencies: p_i = v_i / \sum_j v_j
    p = v.div(float(v.sum()))
    # entropy: e = - \sum_i p_i * log(p_i)
    e = -p.map(lambda x: x * log(x)).sum()
    # return the entropy, the entropy normalized by its maximum log(n),
    # and the number of countries n
    return Series([e, e / log(len(p)), len(p)], index=['entropy', 'entropy_n', 'n'])


def plot_entropy(edf, n=12):
    # plot entropy, normalized entropy and number of countries on the
    # 100x100 grid for the first n time steps
    for time in edf.index.unique()[:n]:
        figure(figsize=(20, 5))
        suptitle(time)
        # reference index covering every square id, so missing squares show up as 0
        ref = Series(index=Index(arange(1, 10**4 + 1)), name='Square_id')

        subplot(131)
        data, _ = edf.ix[time].set_index('Square_id')['entropy'].align(ref)
        pcolormesh(data.fillna(0).reshape(100, 100), cmap=cm.Spectral, vmax=3.8)
        colorbar()
        title("entropy")

        subplot(132)
        data, _ = edf.ix[time].set_index('Square_id')['entropy_n'].align(ref)
        pcolormesh(data.fillna(0).reshape(100, 100), cmap=cm.Spectral, vmax=1)
        colorbar()
        title("entropy (normalized)")

        subplot(133)
        data, _ = edf.ix[time].set_index('Square_id')['n'].align(ref)
        pcolormesh(data.fillna(0).reshape(100, 100), cmap=cm.Blues)
        colorbar()
        title("number of countries (n)")
        show()


# drop the Italian codes (0 and 39) but keep every other country
intensity_ni = (intensity[logical_not(intensity['Country_code'].isin([0, 39]))]
                .set_index(['Country_code', 'Square_id'], append=True))

# entropy at the original time resolution
t_entropy = intensity_ni.unstack(level='Country_code').apply(entropy, axis=1)
t_entropy = fix_timezone(t_entropy.reset_index(level='Square_id'))
plot_entropy(t_entropy)

# monthly aggregation
m_intensity = (intensity.groupby(TimeGrouper('M'))
               .apply(lambda x: x.groupby(['Square_id', 'Country_code']).sum()))
m_intensity.index.set_names(['time'] + m_intensity.index.names[1:], inplace=True)
m_intensity_ni = m_intensity[logical_not(m_intensity.index.get_level_values('Country_code').isin([0, 39]))]
m_entropy = m_intensity_ni.unstack(level='Country_code').apply(entropy, axis=1)
m_entropy = fix_timezone(m_entropy.reset_index(level='Square_id'))
plot_entropy(m_entropy)

# yearly aggregation
a_intensity = (intensity.groupby(TimeGrouper('A'))
               .apply(lambda x: x.groupby(['Square_id', 'Country_code']).sum()))
a_intensity.index.set_names(['time'] + a_intensity.index.names[1:], inplace=True)
a_intensity_ni = a_intensity[logical_not(a_intensity.index.get_level_values('Country_code').isin([0, 39]))]
a_entropy = a_intensity_ni.unstack(level='Country_code').apply(entropy, axis=1)
a_entropy = fix_timezone(a_entropy.reset_index(level='Square_id'))
plot_entropy(a_entropy)
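# Quick sanity check of entropy() on toy country vectors (illustrative only,
# not part of the original analysis; the country codes below are made up):
# a uniform vector over k countries should give entropy log(k) and normalized
# entropy 1, while a vector concentrated on a single country should give 0.
uniform = Series([10.0, 10.0, 10.0, 10.0], index=[1, 30, 33, 49])
single = Series([25.0, 0.0, 0.0], index=[1, 30, 33])
print(entropy(uniform))   # entropy ~= log(4) ~= 1.386, entropy_n == 1, n == 4
print(entropy(single))    # entropy == 0, entropy_n == 0, n == 1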
def store_if_new(store, name, df):
    # write the dataframe only if its key is not already in the HDF store
    if name in store:
        print("[INFO] skipping %s, already in store" % name)
        return
    store[name] = df


# persist the results (re-open the store that was closed after loading)
store.open()
# entropy
store_if_new(store, 'entropy_' + time_span, t_entropy)
store_if_new(store, 'entropy_1M', m_entropy)
store_if_new(store, 'entropy_1A', a_entropy)
# intensity_ni
store['intensity_ni_' + time_span] = intensity_ni
store['intensity_ni_1M'] = m_intensity_ni
store['intensity_ni_1A'] = a_intensity_ni
store.flush()
store.close()
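# Usage sketch (not in the original notebook): how a later session could read
# the persisted tables back from the same store, assuming the path and key
# names used above.
store = HDFStore('./stores/aggregated_dataset.h5')
print(store.keys())                    # e.g. ['/entropy_1D', '/entropy_1M', ...]
monthly_entropy = store['entropy_1M']
store.close()
monthly_entropy.head()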