The following is a public-facing walkthrough of the code used to for the predictive tool that powers @WorldLeaderTips. The styling of the out graph has been changed for this context.
%pylab inline
import json
import random
import datetime as dt
from collections import deque
import numpy as np
import pandas as pd
from pandas import *
from pandas.io.json import json_normalize
import matplotlib.pylab
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import statsmodels.api as sm
from statsmodels.tsa import *
from bigquery import get_client
tomorrow = str(dt.date.today())
twoweeksago = str(dt.date.today() - dt.timedelta(days = 14))
lastweek = dt.datetime.today() - dt.timedelta(days = 7)
nextweek = dt.date.today() + dt.timedelta(days = 8)
day_index = dt.datetime.today().weekday()
# Initialize Google BigQuery client:
with open('/PATH/TO/settings.json', 'rb') as json_data:
settings = json.load(json_data)
client = get_client(settings['PROJECT_ID'], service_account=settings['SERVICE_ACCOUNT'], private_key=settings['KEY'], readonly=True)
# Load the Goldstein suggstions table:
gsCodes = pd.read_csv('assets/goldstein_suggestions.tsv', index_col='code', sep='\t')
leader_display_name = 'Pope Francis'
gdelt_search = "WHERE Actor1Name CONTAINS 'POPE' AND SQLDATE > 20130313"
date_start = '19790101'
date_end = ''.join(str(dt.date.today()).split('-'))
query_template = 'SELECT GLOBALEVENTID, SQLDATE, Actor1Name, Actor1CountryCode, GoldsteinScale, NumMentions FROM [gdelt-bq:full.events] {} AND SQLDATE>{} AND SQLDATE<{} IGNORE CASE;'
query = query_template.format(gdelt_search, date_start, date_end)
print 'Search for: {}'.format(leader_display_name)
print 'Query: {}'.format(query)
filename = '{}.csv'.format(leader_display_name.replace(' ', '_'))
try:
job_id, results = client.query(query, timeout=2000)
print '{} results'.format(str(len(results)))
if len(results) == 0:
print "NO RESULTS"
df = json_normalize(results)
df = df.sort(columns='SQLDATE')
df.to_csv('{}'.format(filename), encoding='utf-8')
print 'Saved: {}'.format(filename)
except:
print 'ERROR: Timed out'
df = pd.read_csv(filename, index_col='SQLDATE', parse_dates=True)
df.tail()
# These steps incorporate the number of mentions a Goldstein score is associated with, reducing the impact of error in the event encoding, making the average better reflect the event's presence in the GDELT.
df['GoldMentions'] = df['GoldsteinScale'] * df['NumMentions']
goldstein = df.groupby([df.index.date]).agg({'GoldMentions': np.sum, 'NumMentions': np.sum})
goldstein['GoldAverage'] = goldstein['GoldMentions'] / goldstein['NumMentions']
full_daterange = pd.date_range(start=min(df.index), end=max(df.index))
# interpolate() takes care of days that do not have a entry / Goldstein score in GDELT:
goldstein = goldstein.reindex(full_daterange).interpolate(method='linear')
# In case there is not enough historical data for the world leader:
if len(goldstein['GoldAverage']) < 230:
day_difference = 230 - len(goldstein['GoldAverage']) + 1
new_dates = [goldstein.index[0] - dt.timedelta(days=x) for x in range(1, day_difference)]
new_dates = reversed(new_dates)
columns = ['NumMentions', 'GoldMentions', 'GoldAverage']
temp_data = {col: [None for x in range(1, day_difference)] for col in columns}
missing = pd.DataFrame(temp_data, index=new_dates, columns=columns)
temp_goldstein = missing.append(goldstein)
goldstein = temp_goldstein.fillna(method='bfill')
print 'Insufficient data, missing entries were backfilled.'
else:
print 'Sufficient data, backfilling was avoided.'
#Create two instances of the data, one to train the model...
training = pd.DataFrame(goldstein['GoldAverage'])
training.index = pd.to_datetime(training.index)
training.columns = ['Goldstein daily average']
print 'Test Sample {}\n'.format(training.tail())
# ... the other to create the plot:
plot_sample = pd.DataFrame(goldstein['GoldAverage'][-230:])
plot_sample.index = pd.to_datetime(plot_sample.index)
plot_sample.columns = ['Goldstein daily average']
print 'Plot Sample {}'.format(plot_sample.tail())
# Creates the forcasting model using Autoregressive Moving Average (ARMA):
tries = 0
success = False
while tries < 6 and success is False:
try:
model = sm.tsa.ARMA(training,(12, tries)).fit()
success = True
except:
tries += 1
if success is False:
print "NOT SUCCESSFUL"
print model
# Creates the prediction based on the proceeding two weeks through one week into the future:
prediction = model.predict(twoweeksago, str(nextweek), dynamic = False)
# The fitting of the prediction to the model:
prediction = prediction.shift(-1)
def stir_the_pot(sample_range, st_dev):
mean = sample_range.mean()
added_range = mean + st_dev
random_suggestion = None
# Randomly suggest something either above or below the stagnant Goldstein range:
if random.randint(0,1) == 1:
random_suggestion = "{0:.1f}".format(random.uniform(float(added_range),10))
else:
random_suggestion = "{0:.1f}".format(random.uniform(-10,-float(added_range)))
print 'new suggestion: {}'.format(random_suggestion)
return float(random_suggestion)
# Based on the Prediction, determine the Suggestion:
sample_window = training[-60:]
standard_deviation = float(sample_window.std())
suggestion = None
if standard_deviation < 2.1 and random.randint(1,5) > 2:
# The leader has been stuck in a rut and is not honing in on Goldstein Score of '1', 3/5 chance of getting a new suggestion:
print 'Your actions are too stagnant!'
suggestion = stir_the_pot(sample_window, standard_deviation)
predicts = round(prediction.ix[tomorrow:str(nextweek)].mean(), 1)
else:
# Finds the average of the Goldstein scores for the coming week:
predicts = round(prediction.ix[tomorrow:str(nextweek)].mean(), 1)
suggestion = round(((predicts - 1) * -1), 1)
print 'suggestion: {}'.format(suggestion)
# From the possible suggestions for the score, find one at random:
prediction_text = random.choice(gsCodes.loc[predicts].values[0].split(';')).strip()
gsDescription = random.choice(gsCodes.loc[suggestion].values[0].split(';')).strip()
print '=================================================='
print "{}'s Forecast: {} {} ".format(leader_display_name, predicts, prediction_text)
print "{}'s Suggested score: {} ".format(leader_display_name, suggestion)
if predicts == 1.0:
gsDescription = "You're doing great. Stay the course."
else:
print "Suggested actions for the coming week:\n{}".format(gsDescription)
print '=================================================='
plot_sample.plot(kind='line',
title='{}: Goldstein Trend and Prediction'.format(leader_display_name),
ylim=(-10,10),
figsize=(16,8),
color='lightblue')
prediction.plot(kind='line',
label='prediction',
legend=True,
color='red')