"""Compare pressure readings on heat-alert days with readings on other days.

Flat notebook-style script.  It loads several pressure-reading JSON exports,
drops readings outside (900, 1100), left-joins them by calendar date with a
list of heat alerts, and then compares the ``reading`` distribution of rows
that matched an alert against rows that did not (summary statistics,
histograms and an independent two-sample t-test), first for all rows and then
for rows recorded between 13:00 and 23:59.

Bare expressions such as ``pn[:5]`` are kept from the original notebook, where
they rendered as cell output; they have no effect when run as a plain script.
"""

import datetime
import json

import arrow
import numpy
import numpy as np
import pandas as pd
import scipy.stats
from IPython.display import display, HTML


# Pressure-reading exports, concatenated in this order.
PRESSURE_FILES = [
    'file://localhost/home/natalie/pressurenet/data/June-01-2013:00:00:00_June-30-2013:00:00:00_43-44_-80--79.json',
    'file://localhost/home/natalie/pressurenet/data/July-01-2013:00:00:00_July-31-2013:00:00:00_43-44_-80--79.json',
    'file://localhost/home/natalie/pressurenet/data/August-01-2013:00:00:00_August-31-2013:00:00:00_43-44_-80--79.json',
    'file://localhost/home/natalie/pressurenet/data/September-01-2013:00:00:00_September-07-2013:00:00:00_43-44_-80--79.json',
    'file://localhost/home/natalie/pressurenet/data/September-07-2013:00:00:00_September-11-2013:00:00:00_43-44_-80--79.json',
]


def _parse_recorded(value):
    """Parse one ``daterecorded`` value with arrow.

    The value is stringified and every space in it is replaced by ``T``
    before it is handed to ``arrow.get``.
    """
    return arrow.get(str(value).replace(" ", "T"))


def _recorded_dates(frame):
    """Return the calendar date of each ``daterecorded`` value as datetimes.

    The date is rendered as a string and converted with ``pd.to_datetime`` so
    the result can be used as a merge key against a parsed date column.
    """
    as_text = frame['daterecorded'].apply(
        lambda value: str(_parse_recorded(value).date()))
    return pd.to_datetime(as_text)


def _recorded_times(frame):
    """Return the time of day (``datetime.time``) of each ``daterecorded``."""
    return frame['daterecorded'].apply(
        lambda value: _parse_recorded(value).time())


def _within_window(frame, start, end):
    """Return the rows of *frame* whose ``time`` is strictly between bounds."""
    return frame[(frame['time'] > start) & (frame['time'] < end)]


# --- Load pressure readings -------------------------------------------------

df0, df1, df2, df3, df4 = [
    pd.read_json(url, convert_dates=['daterecorded']) for url in PRESSURE_FILES
]

pn = pd.concat([df0, df1, df2, df3, df4])
pn[:5]

pn.plot(x='daterecorded', y='reading', figsize=(15, 6))

# Keep only readings strictly between 900 and 1100 (presumably hPa/mbar, i.e.
# discarding implausible values -- TODO confirm the unit).  The copy makes the
# filtered frame independent so the column assignment below does not write
# into a slice of the unfiltered frame.
pn = pn[(pn['reading'] > 900) & (pn['reading'] < 1100)].copy()
pn.plot(x='daterecorded', y='reading', figsize=(15, 6))

# --- Load heat alerts -------------------------------------------------------

heat = pd.read_json('file://localhost/home/natalie/torontodata/heat_alerts_list.json', convert_dates=['date'])

print(heat.code.unique())
print(heat.text.unique())
heat[:5]

heat.dtypes

# --- Join readings to alerts by calendar date -------------------------------

pn["date"] = _recorded_dates(pn)
pn[:1]

# Left join: every reading is kept; readings on dates with no alert get
# missing values in the alert columns.
tor = pd.merge(pn, heat, on='date', suffixes=('_p', '_h'), how='left')
tor[:5]

tor.text.unique()

# Number the alert codes.  factorize() marks missing codes with -1, so after
# adding 1 a reading with no matching alert is always 0 and the distinct codes
# are numbered from 1 in sorted order.  This does not depend on how missing
# values would sort against the codes (assumes the codes are strings, which
# cannot be ordered against NaN on Python 3 -- TODO confirm).
_code_positions, unique_vals = pd.factorize(tor['code'], sort=True)
tor['code_no'] = _code_positions + 1
tor['code_no'].unique()

tor[tor['code_no'] == 1][:1]['text']
tor[tor['code_no'] == 2][:1]['text']
tor[tor['code_no'] == 3][:1]['text']
tor[tor['code_no'] == 4][:1]['text']
tor[tor['code_no'] == 5][:1]['text']

tor.corr()

# --- Split into alert / non-alert readings ----------------------------------

# warnings and nonwarnings get a "time" column assigned further down, so they
# are taken as copies rather than as slices of tor.
warnings = tor[tor['code_no'].isin([1, 2, 3, 4, 5])].copy()
warnings2 = tor[tor['code_no'] == 2]
warnings1 = tor[tor['code_no'] == 1]
warnings3 = tor[tor['code_no'] == 3]
warnings4 = tor[tor['code_no'] == 4]
warnings5 = tor[tor['code_no'] == 5]
nonwarnings = tor[tor['code_no'] == 0].copy()

tor.describe()
nonwarnings.describe()
warnings.describe()

print(len(warnings1))
warnings1.describe()

print(len(warnings2))
warnings2.describe()

print(len(warnings3))
warnings3.describe()

print(len(warnings4))
warnings4.describe()

print(len(warnings5))
warnings5.describe()

# --- Compare readings: all rows ---------------------------------------------

scipy.stats.ttest_ind(warnings['reading'], nonwarnings['reading'])

warnings.hist('reading')
nonwarnings.hist('reading')
tor.hist('reading')

# --- Compare readings: rows recorded between 13:00 and 23:59 ----------------

warnings["time"] = _recorded_times(warnings)
nonwarnings["time"] = _recorded_times(nonwarnings)
tor["time"] = _recorded_times(tor)

starttime = datetime.time(13, 0)
endtime = datetime.time(23, 59)

warnings_day = _within_window(warnings, starttime, endtime)
nonwarnings_day = _within_window(nonwarnings, starttime, endtime)
days = _within_window(tor, starttime, endtime)

scipy.stats.ttest_ind(warnings_day['reading'], nonwarnings_day['reading'])

warnings_day.describe()
nonwarnings_day.describe()

warnings_day.hist('reading')
nonwarnings_day.hist('reading')
days.hist('reading')