%pylab inline import pandas as pd import joblib column_names = ['Square_id', 'Time_interval', 'Country_code', 'SMS_in', 'SMS_out', 'Call_in', 'Call_out', 'Internet_traffic'] dtypes = {'Square_id': int32, 'Time_interval': uint64, 'Country_code': int32, 'SMS_in': float32, 'SMS_out': float32, 'Call_in': float32, 'Call_out': float32, 'Internet_traffic': float32} file_pattern = 'data/telco/sms-call-internet-mi-2013-{:02d}-{:02d}.txt' df_list = [] for day in range(1,31): print 'loading', file_pattern.format(11, day) df = pd.read_csv(file_pattern.format(11, day), sep='\t', header=None, names=column_names, dtype=dtypes) df_list.append(df) for day in range(1,32): print 'loading', file_pattern.format(12, day) df = pd.read_csv(file_pattern.format(12, day), sep='\t', header=None, names=column_names, dtype=dtypes) df_list.append(df) df = pd.concat(df_list) df.index = pd.to_datetime(df.Time_interval.values, unit='ms').tz_localize('utc').tz_convert('Europe/Rome') df.drop('Time_interval', axis=1, inplace=True) df.sort_index(inplace=True) STORE_FILE = 'stores/sms-call-internet-mi-table-blosc.h5' store = pd.HDFStore(STORE_FILE, 'w', complib='blosc') store.append('telco_data', df, format='t', data_columns=['Square_id', 'Country_code']) store.append('telco_codes', df.Country_code.drop_duplicates(), format='t') store.close()