import pandas as pd import numpy # temperature vs ice_cream, fill the series temperatures = pd.Series([21,30,16,18]) icecreams_sold = pd.Series([70,90,35,40]) temperatures.plot() icecreams_sold.plot() # x,y plot plt.scatter(temperatures, icecreams_sold) # calc correlation temperatures.corr(icecreams_sold) # negative correlation? fill the series coats_sold = pd.Series([25, 10, 47, 40]) plt.scatter(temperatures, coats_sold) # calc corr coats_sold.corr(temperatures) #bigquery imports import bq import datetime import pandas as pd client = bq.Client.Get() # first 10 days of NY query = """ SELECT date, departure_state, AVG(departure_delay) avg , COUNT(*) c FROM [bigquery-samples:airline_ontime_data.flights] WHERE departure_state = 'NY' GROUP BY 1,2 HAVING c > 5 ORDER BY 1 LIMIT 10 """ fields, data = client.ReadSchemaAndRows(client.Query(query)['configuration']['query']['destinationTable'], max_rows = 1000) data[0:10] # last 10 days of NY query = """ SELECT date, departure_state, AVG(departure_delay) avg , COUNT(*) c FROM [bigquery-samples:airline_ontime_data.flights] WHERE departure_state = 'NY' GROUP BY 1,2 HAVING c > 5 ORDER BY 1 DESC LIMIT 10 """ fields, data = client.ReadSchemaAndRows(client.Query(query)['configuration']['query']['destinationTable'], max_rows = 1000) data[0:10] # plot NY by day query = """ SELECT date, departure_state, AVG(departure_delay) avg , COUNT(*) c FROM [bigquery-samples:airline_ontime_data.flights] WHERE departure_state = 'NY' GROUP BY 1,2 HAVING c > 5 ORDER BY 1 """ fields, data = client.ReadSchemaAndRows(client.Query(query)['configuration']['query']['destinationTable'], max_rows = 1000) for x in data: x[2] = float(x[2]) frame = pd.DataFrame(data) xx=frame.plot() xx.set_xticklabels([frame[0][min(int(x), len(frame[0])-1)][0:7] for x in xx.get_xticks()]) # too much data, plot NY by month query = """ SELECT left(date, 7) date, departure_state, AVG(departure_delay) avg , COUNT(*) c FROM [bigquery-samples:airline_ontime_data.flights] WHERE departure_state = 'NY' GROUP BY 1,2 HAVING c > 5 ORDER BY 1 """ fields, data = client.ReadSchemaAndRows(client.Query(query)['configuration']['query']['destinationTable'], max_rows = 1000) for x in data: x[2] = float(x[2]) frame = pd.DataFrame(data) xx=frame.plot() xx.set_xticklabels([frame[0][min(int(x), len(frame[0])-1)][0:7] for x in xx.get_xticks()]) "" #comparing NY to NJ query = """ SELECT a.date, a.departure_state, a.avg, b.departure_state, b.avg FROM ( SELECT left(date, 7) date, departure_state, AVG(departure_delay) avg , COUNT(*) c FROM [bigquery-samples:airline_ontime_data.flights] WHERE departure_state = 'NY' GROUP BY 1,2 HAVING c > 5) a JOIN ( SELECT left(date, 7) date, departure_state, AVG(departure_delay) avg , COUNT(*) c FROM [bigquery-samples:airline_ontime_data.flights] WHERE departure_state = 'NJ' GROUP BY 1,2 HAVING c > 5) b ON a.date=b.date ORDER BY 1 """ fields, data = client.ReadSchemaAndRows(client.Query(query)['configuration']['query']['destinationTable'], max_rows = 1000) for x in data: x[2] = float(x[2]) x[4] = float(x[4]) data[0:30] #plot NJ vs NY frame = pd.DataFrame(data) xx=frame.plot() xx.set_xticklabels([frame[0][min(int(x), len(frame[0])-1)][0:7] for x in xx.get_xticks()]) #scatter plot NY vs NJ plt.scatter(frame[2],frame[4]) plt.xlabel('avg NY delay in minutes') plt.ylabel('avg NJ delay in minutes') frame[2].corr(frame[4]) # repeat NJ vs HI query = """ SELECT a.date, a.departure_state, b.departure_state, a.avg, b.avg corr FROM (SELECT LEFT(date, 7) date, departure_state , AVG(departure_delay) avg , COUNT(*) c FROM [bigquery-samples:airline_ontime_data.flights] WHERE departure_state = 'NY' GROUP BY 1,2 HAVING c > 5 ) a JOIN (SELECT LEFT(date, 7) date, departure_state, AVG(departure_delay) avg, COUNT(*) c FROM [bigquery-samples:airline_ontime_data.flights] WHERE departure_state = 'HI' GROUP BY 1,2 HAVING c > 5 ) b ON a.date=b.date ORDER BY a.date; """ fields, data = client.ReadSchemaAndRows(client.Query(query)['configuration']['query']['destinationTable'], max_rows = 1000) for x in data: x[3] = float(x[3]) x[4] = float(x[4]) #NY vs HI frame = pd.DataFrame(data) xx=frame.plot() xx.set_xticklabels([frame[0][min(int(x), len(frame[0])-1)][0:7] for x in xx.get_xticks()]) frame[3].corr(frame[4]) plt.scatter(frame[3],frame[4]) plt.xlabel('avg NY delay in minutes') plt.ylabel('avg Hawaii delay in minutes') print "Correlation: %s" % frame[3].corr(frame[4]) # The best correlations query = """SELECT a.departure_state, b.departure_state, corr(a.avg, b.avg) corr, COUNT(*) c FROM (SELECT date, departure_state, AVG(departure_delay) avg , COUNT(*) c FROM [bigquery-samples:airline_ontime_data.flights] GROUP BY 1,2 HAVING c > 5 ) a JOIN (SELECT date, departure_state , AVG(departure_delay) avg, COUNT(*) c FROM [bigquery-samples:airline_ontime_data.flights] GROUP BY 1,2 HAVING c > 5 ) b ON a.date=b.date WHERE a.departure_state < b.departure_state GROUP EACH BY 1, 2 HAVING c > 5 ORDER BY corr DESC;""" fields, data = client.ReadSchemaAndRows(client.Query(query)['configuration']['query']['destinationTable'], max_rows = 1000) data[0:30] # The best predictors query = """SELECT a.departure_state, b.departure_state, corr(a.avg, b.avg) corr, COUNT(*) c FROM (SELECT date, departure_state, AVG(departure_delay) avg , COUNT(*) c FROM [bigquery-samples:airline_ontime_data.flights] GROUP BY 1,2 HAVING c > 5 ) a JOIN (SELECT DATE(DATE_ADD(timestamp(date), 1, 'DAY')) date, departure_state , AVG(departure_delay) avg, COUNT(*) c FROM [bigquery-samples:airline_ontime_data.flights] GROUP BY 1,2 HAVING c > 5 ) b ON a.date=b.date GROUP EACH BY 1, 2 HAVING c > 5 ORDER BY corr DESC;""" fields, data = client.ReadSchemaAndRows(client.Query(query)['configuration']['query']['destinationTable'], max_rows = 1000) data[0:30] # The best NY predictors query = """SELECT a.departure_state, b.departure_state, corr(a.avg, b.avg) corr, COUNT(*) c FROM (SELECT date, departure_state, AVG(departure_delay) avg , COUNT(*) c FROM [bigquery-samples:airline_ontime_data.flights] WHERE departure_state = 'NY' GROUP BY 1,2 HAVING c > 5 ) a JOIN (SELECT DATE(DATE_ADD(timestamp(date), 1, 'DAY')) date, departure_state , AVG(departure_delay) avg, COUNT(*) c FROM [bigquery-samples:airline_ontime_data.flights] GROUP BY 1,2 HAVING c > 5 ) b ON a.date=b.date GROUP EACH BY 1, 2 HAVING c > 5 ORDER BY corr DESC;""" fields, data = client.ReadSchemaAndRows(client.Query(query)['configuration']['query']['destinationTable'], max_rows = 1000) data[0:30] # The best NY predictors, by quarter query = """SELECT a.q, a.departure_state, b.departure_state, corr(a.avg, b.avg) corr, COUNT(*) c FROM (SELECT QUARTER(timestamp(date)) q, date, departure_state , AVG(departure_delay) avg , COUNT(*) c FROM [bigquery-samples:airline_ontime_data.flights] WHERE departure_state = 'NY' GROUP BY 1,2,3 HAVING c > 5 ) a JOIN (SELECT QUARTER(timestamp(date)) q, DATE(DATE_ADD(timestamp(date), 1, 'DAY')) date, departure_state , AVG(departure_delay) avg, COUNT(*) c FROM [bigquery-samples:airline_ontime_data.flights] GROUP BY 1,2,3 HAVING c > 5 ) b ON a.date=b.date AND a.q = b.q GROUP EACH BY 1, 2, 3 HAVING c > 5 ORDER BY corr DESC;""" fields, data = client.ReadSchemaAndRows(client.Query(query)['configuration']['query']['destinationTable'], max_rows = 1000) data[0:30] # Bonus: When is a correlation a good correlation? nnn = 1100 # Move it from 2 to 4000 s1 = pd.Series(np.random.random(nnn)) s2 = pd.Series(np.random.random(nnn)) plt.scatter(s1, s2) s1.corr(s2)