Will your plane be late tomorrow? We'll use Pearson Correlation over a 70 million flights dataset.
Watch these queries live at http://youtu.be/tqS4vZ2Rxlo
import pandas as pd
import numpy
First let's learn what is Pearson Correlation.
# temperature vs ice_cream, fill the series
temperatures = pd.Series([21,30,16,18])
icecreams_sold = pd.Series([70,90,35,40])
temperatures.plot()
icecreams_sold.plot()
# x,y plot
plt.scatter(temperatures, icecreams_sold)
# calc correlation
temperatures.corr(icecreams_sold)
# negative correlation? fill the series
coats_sold = pd.Series([25, 10, 47, 40])
plt.scatter(temperatures, coats_sold)
# calc corr
coats_sold.corr(temperatures)
#bigquery imports
import bq
import datetime
import pandas as pd
client = bq.Client.Get()
We have a cool web UI too! You can run all the following queries at https://bigquery.cloud.google.com/.
# first 10 days of NY
query = """
SELECT date, departure_state, AVG(departure_delay) avg , COUNT(*) c
FROM [bigquery-samples:airline_ontime_data.flights]
WHERE departure_state = 'NY'
GROUP BY 1,2 HAVING c > 5
ORDER BY 1
LIMIT 10
"""
fields, data = client.ReadSchemaAndRows(client.Query(query)['configuration']['query']['destinationTable'], max_rows = 1000)
data[0:10]
# last 10 days of NY
query = """
SELECT date, departure_state, AVG(departure_delay) avg , COUNT(*) c
FROM [bigquery-samples:airline_ontime_data.flights]
WHERE departure_state = 'NY'
GROUP BY 1,2 HAVING c > 5
ORDER BY 1 DESC
LIMIT 10
"""
fields, data = client.ReadSchemaAndRows(client.Query(query)['configuration']['query']['destinationTable'], max_rows = 1000)
data[0:10]
# plot NY by day
query = """
SELECT date, departure_state, AVG(departure_delay) avg , COUNT(*) c
FROM [bigquery-samples:airline_ontime_data.flights]
WHERE departure_state = 'NY'
GROUP BY 1,2 HAVING c > 5
ORDER BY 1
"""
fields, data = client.ReadSchemaAndRows(client.Query(query)['configuration']['query']['destinationTable'], max_rows = 1000)
for x in data:
x[2] = float(x[2])
frame = pd.DataFrame(data)
xx=frame.plot()
xx.set_xticklabels([frame[0][min(int(x), len(frame[0])-1)][0:7] for x in xx.get_xticks()])
# too much data, plot NY by month
query = """
SELECT left(date, 7) date, departure_state, AVG(departure_delay) avg , COUNT(*) c
FROM [bigquery-samples:airline_ontime_data.flights]
WHERE departure_state = 'NY'
GROUP BY 1,2 HAVING c > 5
ORDER BY 1
"""
fields, data = client.ReadSchemaAndRows(client.Query(query)['configuration']['query']['destinationTable'], max_rows = 1000)
for x in data:
x[2] = float(x[2])
frame = pd.DataFrame(data)
xx=frame.plot()
xx.set_xticklabels([frame[0][min(int(x), len(frame[0])-1)][0:7] for x in xx.get_xticks()])
""
#comparing NY to NJ
query = """
SELECT a.date, a.departure_state, a.avg, b.departure_state, b.avg FROM
(
SELECT left(date, 7) date, departure_state, AVG(departure_delay) avg , COUNT(*) c
FROM [bigquery-samples:airline_ontime_data.flights]
WHERE departure_state = 'NY'
GROUP BY 1,2 HAVING c > 5) a
JOIN
(
SELECT left(date, 7) date, departure_state, AVG(departure_delay) avg , COUNT(*) c
FROM [bigquery-samples:airline_ontime_data.flights]
WHERE departure_state = 'NJ'
GROUP BY 1,2 HAVING c > 5) b
ON a.date=b.date
ORDER BY 1
"""
fields, data = client.ReadSchemaAndRows(client.Query(query)['configuration']['query']['destinationTable'], max_rows = 1000)
for x in data:
x[2] = float(x[2])
x[4] = float(x[4])
data[0:30]
#plot NJ vs NY
frame = pd.DataFrame(data)
xx=frame.plot()
xx.set_xticklabels([frame[0][min(int(x), len(frame[0])-1)][0:7] for x in xx.get_xticks()])
#scatter plot NY vs NJ
plt.scatter(frame[2],frame[4])
plt.xlabel('avg NY delay in minutes')
plt.ylabel('avg NJ delay in minutes')
frame[2].corr(frame[4])
# repeat NJ vs HI
query = """
SELECT a.date, a.departure_state, b.departure_state, a.avg, b.avg corr
FROM
(SELECT LEFT(date, 7) date, departure_state , AVG(departure_delay) avg , COUNT(*) c
FROM [bigquery-samples:airline_ontime_data.flights]
WHERE departure_state = 'NY'
GROUP BY 1,2 HAVING c > 5 ) a
JOIN
(SELECT LEFT(date, 7) date, departure_state, AVG(departure_delay) avg, COUNT(*) c
FROM [bigquery-samples:airline_ontime_data.flights]
WHERE departure_state = 'HI'
GROUP BY 1,2 HAVING c > 5 ) b
ON a.date=b.date
ORDER BY a.date;
"""
fields, data = client.ReadSchemaAndRows(client.Query(query)['configuration']['query']['destinationTable'], max_rows = 1000)
for x in data:
x[3] = float(x[3])
x[4] = float(x[4])
#NY vs HI
frame = pd.DataFrame(data)
xx=frame.plot()
xx.set_xticklabels([frame[0][min(int(x), len(frame[0])-1)][0:7] for x in xx.get_xticks()])
frame[3].corr(frame[4])
plt.scatter(frame[3],frame[4])
plt.xlabel('avg NY delay in minutes')
plt.ylabel('avg Hawaii delay in minutes')
print "Correlation: %s" % frame[3].corr(frame[4])
# The best correlations
query = """SELECT a.departure_state, b.departure_state, corr(a.avg, b.avg) corr, COUNT(*) c
FROM
(SELECT date, departure_state, AVG(departure_delay) avg , COUNT(*) c
FROM [bigquery-samples:airline_ontime_data.flights]
GROUP BY 1,2 HAVING c > 5
) a
JOIN
(SELECT date, departure_state ,
AVG(departure_delay) avg, COUNT(*) c FROM [bigquery-samples:airline_ontime_data.flights]
GROUP BY 1,2 HAVING c > 5 ) b
ON a.date=b.date
WHERE a.departure_state < b.departure_state
GROUP EACH BY 1, 2
HAVING c > 5
ORDER BY corr DESC;"""
fields, data = client.ReadSchemaAndRows(client.Query(query)['configuration']['query']['destinationTable'], max_rows = 1000)
data[0:30]
# The best predictors
query = """SELECT a.departure_state, b.departure_state, corr(a.avg, b.avg) corr, COUNT(*) c
FROM
(SELECT date, departure_state, AVG(departure_delay) avg , COUNT(*) c
FROM [bigquery-samples:airline_ontime_data.flights]
GROUP BY 1,2 HAVING c > 5
) a
JOIN
(SELECT DATE(DATE_ADD(timestamp(date), 1, 'DAY')) date, departure_state ,
AVG(departure_delay) avg, COUNT(*) c FROM [bigquery-samples:airline_ontime_data.flights]
GROUP BY 1,2 HAVING c > 5 ) b
ON a.date=b.date
GROUP EACH BY 1, 2
HAVING c > 5
ORDER BY corr DESC;"""
fields, data = client.ReadSchemaAndRows(client.Query(query)['configuration']['query']['destinationTable'], max_rows = 1000)
data[0:30]
# The best NY predictors
query = """SELECT a.departure_state, b.departure_state, corr(a.avg, b.avg) corr, COUNT(*) c
FROM
(SELECT date, departure_state, AVG(departure_delay) avg , COUNT(*) c
FROM [bigquery-samples:airline_ontime_data.flights]
WHERE departure_state = 'NY'
GROUP BY 1,2 HAVING c > 5
) a
JOIN
(SELECT DATE(DATE_ADD(timestamp(date), 1, 'DAY')) date, departure_state ,
AVG(departure_delay) avg, COUNT(*) c FROM [bigquery-samples:airline_ontime_data.flights]
GROUP BY 1,2 HAVING c > 5 ) b
ON a.date=b.date
GROUP EACH BY 1, 2
HAVING c > 5
ORDER BY corr DESC;"""
fields, data = client.ReadSchemaAndRows(client.Query(query)['configuration']['query']['destinationTable'], max_rows = 1000)
data[0:30]
# The best NY predictors, by quarter
query = """SELECT a.q, a.departure_state, b.departure_state, corr(a.avg, b.avg) corr, COUNT(*) c
FROM
(SELECT QUARTER(timestamp(date)) q, date, departure_state , AVG(departure_delay) avg , COUNT(*) c
FROM [bigquery-samples:airline_ontime_data.flights]
WHERE departure_state = 'NY'
GROUP BY 1,2,3 HAVING c > 5
) a
JOIN
(SELECT QUARTER(timestamp(date)) q, DATE(DATE_ADD(timestamp(date), 1, 'DAY')) date, departure_state ,
AVG(departure_delay) avg, COUNT(*) c FROM [bigquery-samples:airline_ontime_data.flights]
GROUP BY 1,2,3 HAVING c > 5 ) b
ON a.date=b.date AND a.q = b.q
GROUP EACH BY 1, 2, 3
HAVING c > 5
ORDER BY corr DESC;"""
fields, data = client.ReadSchemaAndRows(client.Query(query)['configuration']['query']['destinationTable'], max_rows = 1000)
data[0:30]
# Bonus: When is a correlation a good correlation?
nnn = 1100 # Move it from 2 to 4000
s1 = pd.Series(np.random.random(nnn))
s2 = pd.Series(np.random.random(nnn))
plt.scatter(s1, s2)
s1.corr(s2)
Follow the most interesting BigQuery news at http://www.reddit.com/r/bigquery. Ask your technical questions using the [google-bigquery] tag on Stack Overflow.