The following is the code used to generate a version of the predictive tool that powers the International Political Tele-Conferencing Equilibrium System. The styling of the output graph has been changed.
%pylab inline
import datetime as dt
import numpy as np
import pandas as pd
from pandas import *
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.tsa import *
tsa = sm.tsa
Populating the interactive namespace from numpy and matplotlib
# Source data (download it first):
# http://storage.googleapis.com/gdelt_bc/Barack_Obama.csv
PATH = '/Users/path/to/folder/'  # Local folder holding the downloaded .csv; the trailing slash matters because the filename is appended directly.
q = 'Barack_Obama'  # Query name: used as the file stem here and again in the plot title.
csv = ''.join((q, '.csv'))
# Load the events with SQLDATE as a parsed date index.
df = pd.read_csv(PATH + csv, parse_dates=True, index_col='SQLDATE')
df.head()
Actor1Name | GoldsteinScale | NumMentions | |
---|---|---|---|
SQLDATE | |||
2014-01-23 | BARACK OBAMA | 1.9 | 2 |
2014-01-23 | BARACK OBAMA | 1.9 | 2 |
2014-01-23 | BARACK OBAMA | 1.9 | 12 |
2014-01-23 | BARACK OBAMA | 1.9 | 2 |
2014-01-23 | BARACK OBAMA | 1.9 | 2 |
# Get the data here:
# http://storage.googleapis.com/gdelt_bc/goldstein_suggestions.csv
# NOTE(review): the download is named .csv while the local filename below ends
# in .tsv; read_csv is called with its default separator, so the file is
# presumably comma-delimited -- confirm the local filename.
# Absolute path (leading slash), like PATH used for the event data.
GS_PATH = '/Users/path/to/folder/goldstein_suggestions.tsv'
# Lookup table indexed by the 'code' column; used later to turn a numeric
# suggestion into its text description.
gsCodes = pd.read_csv(GS_PATH, index_col='code')
gsCodes.head()
description | |
---|---|
code | |
-10.0 | Military attack; clash; assault; Biological we... |
-9.9 | Assassination; Guerrilla assault; Paramilitary... |
-9.8 | Assassination; Guerrilla assault; Paramilitary... |
-9.7 | Assassination; Guerrilla assault; Paramilitary... |
-9.6 | Assassination; Guerrilla assault; Paramilitary... |
# Capture "today" once so all three anchors are derived from the same day,
# even if the cell happens to run across midnight.
today = dt.date.today()
# tomorrow / twoweeksago are ISO date strings; nextweek stays a date object
# and is converted with str() where a string is needed.
tomorrow = str(today + dt.timedelta(days=1))
twoweeksago = str(today - dt.timedelta(days=14))
# Eight days ahead rather than seven; NOTE(review): presumably to compensate
# for the one-day shift applied to the prediction later -- confirm.
nextweek = today + dt.timedelta(days=8)
# Weight each event's Goldstein score by its number of mentions, so the daily
# average better reflects the event's presence in GDELT and a rarely mentioned,
# possibly mis-encoded event has less influence.
df['GoldMentions'] = df['GoldsteinScale'] * df['NumMentions']
# Group on midnight-normalized timestamps rather than datetime.date objects so
# the result keeps a DatetimeIndex that aligns with the date_range built below.
goldstein = df.groupby(df.index.normalize()).agg({'GoldMentions': np.sum, 'NumMentions': np.sum})
# Mention-weighted mean score per day.
goldstein['GoldAverage'] = goldstein['GoldMentions'] / goldstein['NumMentions']
# Add a row for every calendar day in the observed span; days with no events
# take the previous day's values through the forward fill.
full_daterange = pd.date_range(start=goldstein.index.min(), end=goldstein.index.max())
goldstein = goldstein.reindex(full_daterange).ffill()
# Smooth the daily average with a 30-day simple moving average.
goldstein['sma-30'] = pd.rolling_mean(goldstein['GoldAverage'], 30)
# The leading entries (before a full 30-day window exists) are NaN; drop them.
grm = goldstein['sma-30'].dropna()

# Model on the most recent 400 smoothed values and plot the most recent 200.
# (The original note says about 200 entries are enough to determine
# stationarity; the fitting sample keeps 400.)
test_sample = pd.DataFrame(grm[-400:])
plot_sample = pd.DataFrame(grm[-200:])
# Both frames get the same treatment: a datetime index and a readable column name.
for frame in (test_sample, plot_sample):
    frame.index = pd.to_datetime(frame.index)
    frame.columns = ['Goldstein daily average']
# Fit an ARMA model of order (12, 0) to the smoothed series; 12 lags seemed to
# be enough to get an accurate prediction.
model = sm.tsa.ARMA(test_sample, (12, 0)).fit()
# Predict from two weeks ago through nextweek (non-dynamic).
prediction = model.predict(twoweeksago, str(nextweek), dynamic=False)
# The predicted curve looked about one day off, so move it back by one day.
prediction = prediction.shift(-1)
# Headline figure: mean of the predicted values from tomorrow through nextweek,
# rounded to one decimal. Label-based slicing is done with .loc.
predicts = round(prediction.loc[tomorrow:str(nextweek)].mean(), 1)
print(predicts)
1.2
# First day shown on the plot, as a plain datetime.date. Calling .date() on the
# index entry does not depend on which `datetime` name the star imports leave
# in the namespace.
startdate = plot_sample.index[0].date()
enddate = dt.date.today() + dt.timedelta(days=8)
# Every calendar day from startdate up to, but not including, enddate.
daterange = [startdate + dt.timedelta(days=x) for x in range((enddate - startdate).days)]
# Mirror the predicted score around +1 (suggestion = 1 - predicts) and round to
# one decimal so it can be used as a key into gsCodes, whose codes appear to be
# spaced 0.1 apart.
suggestion = round(1 - predicts, 1)
# Text description stored for that code.
gsDescription = gsCodes.loc[suggestion].values[0]
gsDescription
'Comment on a situation'
# Draw the smoothed history first, on a fixed -10..10 scale.
history_style = dict(
    kind='line',
    title=q + ': Goldstein Trend and Prediction',
    ylim=(-10, 10),
    figsize=(18, 8),
    color='steelblue',
)
plot_sample.plot(**history_style)
# Then draw the prediction in red, with a legend entry.
prediction.plot(kind='line', label='prediction', legend=True, color='red')
<matplotlib.axes.AxesSubplot at 0x10eb49990>