import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns
import statsmodels.api as sm
import statsmodels.graphics.tsaplots as tsaplots
from pylab import rcParams
from scipy import stats

%%bash 
# Print the first three lines of the raw CSV to eyeball its layout before loading it.
head -n 3 data/historic_euro_hrs_worked.csv

# Load the hours-worked table: the first CSV column becomes the row index and
# the literal string 'N.a.' is parsed as a missing value.
df = pd.read_csv('data/historic_euro_hrs_worked.csv', na_values='N.a.', index_col=0)

# Notebook-style inspection: preview, compact schema, summary statistics.
df.head(5)

df.info(verbose=False)

df.describe()

# Plot the second row of the summary table (the 'mean' row when describe()
# summarises numeric columns). `.iloc[1]` is the explicit positional lookup
# that the removed `.ix[1]` indexer performed on this label-indexed frame.
df.describe().iloc[1].plot(figsize=(10, 4), linewidth=3, alpha=.8)

# Transpose so that what used to be the row index becomes the columns
# (later code reads e.g. `df.France` as a column).
df = df.T

# Coerce every column to a numeric dtype; values that cannot be parsed become
# NaN. This replaces the removed `DataFrame.convert_objects(convert_numeric=True)`.
df = df.apply(pd.to_numeric, errors='coerce')

df.head(5)

# Build the 10-colour "Set2" palette (desaturation factor 1) once for the
# preview strip, and register the same palette as the default for later plots.
set2_palette = sns.color_palette("Set2", 10, 1)
sns.set_palette("Set2", 10, 1)
sns.palplot(set2_palette)

# Columns to compare: Germany, UK and France against the "PIIGS" group.
guf_vs_piigs = ['Germany', 'UK', 'France', 'Portugal', 'Italy', 'Ireland', 'Greece', 'Spain']

# `.loc` is the label-based replacement for the removed `.ix` indexer.
# NOTE: unlike `.ix`, `.loc` raises KeyError if any listed column is missing.
comparison_ax = df.loc[:, guf_vs_piigs].plot(
    figsize=(14, 10),
    linewidth=3,
    alpha=.8,
    title="Germany, UK, and France vs the PIIGS Annual Average Hours Worked per Employee",
)
# loc=3 places the legend in the lower-left corner.
comparison_ax.legend(loc=3, prop={'size': 15})

# One small panel per column on an 8x4 grid. The result is assigned only to
# keep the notebook from echoing the array of axes.
supress = df.plot(subplots=True, layout=(8, 4), figsize=(16, 16), sharex=False, alpha=.7)

plt.rcParams['figure.figsize'] = 15, 15

# Pairwise correlation between columns. `sns.corrplot` is no longer available
# in seaborn, so draw the correlation matrix with `sns.heatmap` instead and
# mask the redundant upper triangle (including the trivial diagonal).
corr = df.corr()
upper_triangle = np.triu(np.ones(corr.shape, dtype=bool))
supress = sns.heatmap(corr, mask=upper_triangle, annot=False, cmap="coolwarm",
                      vmin=-1, vmax=1, square=True)

# Sum of the row-over-row percentage changes for each column, sorted
# ascending. `sort_values()` returns the sorted Series; the old in-place
# `Series.sort()` has been removed from pandas.
pct_change_sum = df.pct_change().sum().sort_values()

# Single-argument print(...) calls give identical output on Python 2 and 3.
print('Absolute Mean Percentage Change: ' + str(pct_change_sum.abs().mean()))
print('')
print(pct_change_sum)

df['France']

# Response variable: the France column. Predictor: its 0-based row positions.
known_y = df['France']
known_x = np.arange(known_y.shape[0])

# Degree-1 least-squares fit; poly1d wraps the coefficients
# (highest power first) in a callable polynomial.
linear_coef = np.polyfit(known_x, known_y, deg=1)
linear_poly = np.poly1d(linear_coef)

# Evaluate the fitted line at position 23.
linear_poly(23)

# For a degree-1 polynomial the coefficients are (slope, intercept).
a, b = linear_poly.coeffs

# Same procedure with a degree-2 polynomial.
quadratic_coef = np.polyfit(known_x, known_y, deg=2)
quadratic_poly = np.poly1d(quadratic_coef)

# Use the DataFrame's index labels on the x-axis instead of 0-based positions.
known_x_labels = np.array(list(df.index))
plt.rcParams['figure.figsize'] = 9, 5

# Raw observations: markers carry the legend entry; the faint connecting line
# is left unlabelled so 'Original Data' is not listed twice in the legend.
plt.scatter(known_x_labels, known_y, label='Original Data')
plt.plot(known_x_labels, known_y, alpha=.3)

# Fitted curves are evaluated at the positional x values they were fitted on.
plt.plot(known_x_labels, linear_poly(known_x), 'r', alpha=.8, label='Linear Regression')  # y = a*x + b
plt.plot(known_x_labels, quadratic_poly(known_x), '-', label='Quadratic Fit')  # y = a*x^2 + b*x + c
plt.title('France Average Annual Hours Worked per Employee')
plt.legend()

# Single formatted string so the output is the same on Python 2 and 3.
print('Linear Regression')
print('slope: %s intercept: %s' % (a, b))

df.France

# Work on a copy so the original column is left untouched.
France_incomplete = df.France.copy()

# Blank out three observations to simulate missing data. `np.nan` is the
# canonical spelling; the `np.NaN` alias was removed in NumPy 2.0.
France_incomplete[['2004', '2010', '2011']] = np.nan
France_incomplete

# Preview mean imputation (the result is displayed, not stored).
France_incomplete.fillna(value=France_incomplete.mean())

plt.rcParams['figure.figsize'] = 7, 3.5

# Original data
df.France.plot(color='black')
# Gaps filled with the mean of the remaining values
France_incomplete.fillna(value=France_incomplete.mean()).plot(color='orange', alpha=.5)
# Gaps filled by carrying the last known value forward; `.ffill()` replaces
# the deprecated `fillna(method='ffill')` spelling.
France_incomplete.ffill().plot(color='dodgerblue', alpha=.5)

# Extend every column `years_into_future` steps past the observed data using
# a per-column straight-line fit, then plot the extended series.
colnames = list(df.columns)

# 1998 - 2013
years_into_future = 7
known_x = np.arange(df.shape[0])  # positions of the observed rows
new_x = np.arange(len(known_x), len(known_x) + years_into_future)  # positions to project
concat_x = np.concatenate((known_x, new_x))

# Every cell is overwritten in the loop below, so np.empty is sufficient.
out = np.empty((len(concat_x), len(colnames)), dtype=float)

for col_pos, _ in enumerate(colnames):
    # Select by position and fill gaps with the column mean so polyfit
    # receives no NaNs.
    known_y = df.iloc[:, col_pos]
    known_y = known_y.fillna(value=known_y.mean())

    linear_coef = np.polyfit(known_x, known_y, 1)
    linear_poly = np.poly1d(linear_coef)

    # Observed (gap-filled) values followed by the straight-line projection.
    new_y = linear_poly(new_x)
    concat_y = np.concatenate((known_y, new_y))
    out[:, col_pos] = concat_y

extended_df = pd.DataFrame(out, columns=colnames)

# Annual index starting in 1998. The length is derived from the data so it
# always matches the number of rows (equivalent to the former end='2021' when
# there are 16 observed rows).
# NOTE(review): the start year is hard-coded; confirm df's first row is 1998.
extended_df.index = pd.date_range(start='1998', periods=len(concat_x), freq='A')

supress = extended_df.plot(subplots=True, layout=(7, 4), figsize=(16, 16), alpha=.6, sharex=False)

# Alternative projection: pin a single target value at the end of the horizon
# (column max plus one standard deviation) and let pchip interpolation fill
# the years in between.
colnames = list(df.columns)

# 1998 - 2013
years_into_future = 7
known_x = np.arange(df.shape[0])  # positions of the observed rows
new_x = np.arange(len(known_x), len(known_x) + years_into_future)  # positions to project
concat_x = np.concatenate((known_x, new_x))

# Every cell is overwritten in the loop below, so np.empty is sufficient.
out = np.empty((len(concat_x), len(colnames)), dtype=float)

for col_pos, _ in enumerate(colnames):
    # Select by position and fill gaps with the column mean.
    known_y = df.iloc[:, col_pos]
    known_y = known_y.fillna(value=known_y.mean())

    # Future values start as NaN placeholders; only the final one is pinned.
    # Using NaN directly (rather than 0 followed by replace(0, NaN)) means a
    # genuine zero in the observed data is never mistaken for a placeholder.
    new_y = np.full(len(new_x), np.nan)
    new_y[-1] = np.max(known_y) + np.std(known_y)
    concat_y = np.concatenate((known_y, new_y))

    out[:, col_pos] = concat_y

extended_df = pd.DataFrame(out, columns=colnames)

# Annual index starting in 1998. The length is derived from the data so it
# always matches the number of rows (equivalent to the former end='2021' when
# there are 16 observed rows).
# NOTE(review): the start year is hard-coded; confirm df's first row is 1998.
extended_df.index = pd.date_range(start='1998', periods=len(concat_x), freq='A')

# Available methods are listed at
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.interpolate.html
# ('linear', 'time', 'index', 'values', 'nearest', 'zero', 'slinear',
#  'quadratic', 'cubic', 'barycentric', 'krogh', 'polynomial', 'spline',
#  'piecewise_polynomial', 'pchip').
# 'polynomial' and 'spline' additionally require an `order` argument.
# The deprecated `downcast` keyword and the redundant `limit=None` are omitted;
# both were left at their defaults.
extended_df.interpolate(method='pchip', axis=0, inplace=True)

supress = extended_df.plot(subplots=True, layout=(7, 4), figsize=(16, 16), alpha=.6, sharex=False)

# All projected series overlaid on a single axis.
extended_df.plot(figsize=(16, 9), alpha=.6)