# Exploratory analysis of average annual hours worked per employee in
# European countries, exported from a notebook.
#
# Bare expressions such as ``df.head(5)`` are notebook display cells; they
# are kept for fidelity but show nothing when this file runs as a script.

import itertools

import matplotlib.pyplot as plt  # ``plt`` was used below but never imported
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns
import statsmodels.api as sm
import statsmodels.graphics.tsaplots as tsaplots
from pylab import rcParams
from scipy import stats

DATA_PATH = 'data/historic_euro_hrs_worked.csv'

# First calendar year of the series.  Taken from the original hard-coded
# ``date_range(start='1998', ...)`` -- NOTE(review): confirm against the CSV.
START_YEAR = 1998


def _year_end_index(first_year, n_years):
    """Return a DatetimeIndex of ``n_years`` consecutive 31 December dates.

    The length follows the data instead of a hard-coded end year, so the
    index always matches the number of rows it is attached to.
    """
    return pd.to_datetime(
        ['%d-12-31' % year for year in range(first_year, first_year + n_years)]
    )


def _linear_projection(known_x, known_y, new_x):
    """Extrapolate a degree-1 least-squares fit of the known points to ``new_x``."""
    return np.poly1d(np.polyfit(known_x, known_y, 1))(new_x)


def _endpoint_anchor(known_x, known_y, new_x):
    """Return NaN for every future step except a final anchor value.

    The anchor is ``max(known_y) + 1 * std(known_y)``; the NaN gap in between
    is meant to be filled afterwards with ``DataFrame.interpolate``.  NaN is
    used directly as the "missing" marker (instead of writing zeros and later
    replacing every zero with NaN) so real zero observations are never lost.
    """
    new_y = np.full(len(new_x), np.nan)
    if len(new_y):
        new_y[-1] = np.max(known_y) + (np.std(known_y) * 1)
    return new_y


def _extend_with_projection(frame, known_x, new_x, project):
    """Append projected rows to every column of ``frame``.

    Parameters
    ----------
    frame : DataFrame with one row per observed period.
    known_x : integer positions of the observed rows.
    new_x : integer positions of the rows to append.
    project : callable ``(known_x, known_y, new_x) -> array`` producing the
        appended values for one column.

    Returns
    -------
    DataFrame with ``len(known_x) + len(new_x)`` rows, the same columns as
    ``frame`` and a year-end DatetimeIndex starting at ``START_YEAR``.
    """
    n_rows = len(known_x) + len(new_x)
    # np.empty is safe here: every cell is overwritten in the loop below.
    out = np.empty(shape=(n_rows, frame.shape[1]), dtype=float)
    for col_pos, _ in enumerate(frame.columns):
        known_y = frame.iloc[:, col_pos]
        # Missing observations are replaced by the column mean before fitting.
        known_y = known_y.fillna(value=known_y.mean())
        out[:, col_pos] = np.concatenate(
            (known_y, project(known_x, known_y, new_x)), axis=0
        )
    extended = pd.DataFrame(out, columns=list(frame.columns))
    extended.index = _year_end_index(START_YEAR, n_rows)
    return extended


# ---------------------------------------------------------------------------
# Load and inspect the data
# ---------------------------------------------------------------------------

# Peek at the first three raw lines.  The notebook did this with a
# ``%%bash head -n 3`` cell, which is not valid Python in a script.
with open(DATA_PATH) as raw_file:
    for raw_line in itertools.islice(raw_file, 3):
        print(raw_line.rstrip('\n'))

df = pd.read_csv(DATA_PATH, na_values='N.a.', index_col=0)
df.head(5)
df.info(verbose=False)
df.describe()
# Second row of the summary table, selected by position (was ``.ix[1]``).
df.describe().iloc[1].plot(figsize=(10, 4), linewidth=3, alpha=.8)

# After the transpose the countries are columns (e.g. ``df.France``).
df = df.T
# Coerce every column to numeric; unparseable cells become NaN
# (replacement for the removed ``convert_objects(convert_numeric=True)``).
df = df.apply(pd.to_numeric, errors='coerce')
df.head(5)

# ---------------------------------------------------------------------------
# Overview plots
# ---------------------------------------------------------------------------

sns.set_palette("Set2", 10, 1)
sns.palplot(sns.color_palette("Set2", 10, 1))

guf_vs_piigs = ['Germany', 'UK', 'France',
                'Portugal', 'Italy', 'Ireland', 'Greece', 'Spain']
df.loc[:, guf_vs_piigs].plot(
    figsize=(14, 10), linewidth=3, alpha=.8,
    title="Germany, UK, and France vs the PIIGS Annual Average Hours Worked per Employee",
).legend(loc=3, prop={'size': 15})

supress = df.plot(subplots=True, layout=(8, 4), figsize=(16, 16),
                  sharex=False, alpha=.7)

plt.rcParams['figure.figsize'] = 15, 15
# ``sns.corrplot`` no longer exists in seaborn; draw the correlation matrix
# with ``heatmap`` instead, hiding the redundant upper triangle.
# NOTE(review): this approximates the old corrplot look, it is not identical.
plt.figure()  # fresh figure so a script run does not draw into the last subplot
corr = df.corr()
upper_triangle = np.triu(np.ones(corr.shape, dtype=bool), k=1)
supress = sns.heatmap(corr, mask=upper_triangle, annot=False,
                      cmap="coolwarm", vmin=-1, vmax=1)

# ---------------------------------------------------------------------------
# Summed year-over-year percentage change per country
# ---------------------------------------------------------------------------

# ``Series.sort()`` (in place) was removed from pandas; sort_values returns
# the ascending-sorted copy.
pct_change_sum = df.pct_change().sum().sort_values()
print('Absolute Mean Percentage Change: ' + str(pct_change_sum.abs().mean()))
print('')
print(pct_change_sum)

# ---------------------------------------------------------------------------
# Linear and quadratic fit for France
# ---------------------------------------------------------------------------

df.France
known_x = np.arange(len(df.France))  # Create the index
known_y = df.France  # Return a pandas series from a df object

# Find the polynomial coefficients
linear_coef = np.polyfit(known_x, known_y, 1)
# Pass polynomial coefficients to poly1d convenience function
# (highest power first)
linear_poly = np.poly1d(linear_coef)
linear_poly(23)
a, b = linear_poly.coeffs  # slope, intercept

quadratic_coef = np.polyfit(known_x, known_y, 2)
quadratic_poly = np.poly1d(quadratic_coef)

known_x_labels = np.array(list(df.index))  # For x-axis labeling
plt.rcParams['figure.figsize'] = 9, 5
plt.figure()
plt.scatter(known_x_labels, known_y, label='Original Data')
plt.plot(known_x_labels, known_y, alpha=.3, label='Original Data')
# y = a + bx or y = mx + b
plt.plot(known_x_labels, linear_poly(known_x), 'r', alpha=.8,
         label='Linear Regression')
# y = ax^2 + bx + c
plt.plot(known_x_labels, quadratic_poly(known_x), '-', label='Quadratic Fit')
plt.title('France Average Annual Hours Worked per Employee')
plt.legend()

print('Linear Regression')
print('slope: %s intercept: %s' % (a, b))

# ---------------------------------------------------------------------------
# Filling artificially removed values
# ---------------------------------------------------------------------------

df.France
France_incomplete = df.France.copy()
# Blank out three years (the row labels are year strings) to compare fills.
France_incomplete.loc[['2004', '2010', '2011']] = np.nan
France_incomplete
France_incomplete.fillna(value=France_incomplete.mean())

plt.rcParams['figure.figsize'] = 7, 3.5
plt.figure()
# Original data
df.France.plot(color='black')
# Using the mean
France_incomplete.fillna(value=France_incomplete.mean()).plot(
    color='orange', alpha=.5)
# Filling forward
France_incomplete.ffill().plot(color='dodgerblue', alpha=.5)

# ---------------------------------------------------------------------------
# Extend every country 7 years ahead with a linear trend
# ---------------------------------------------------------------------------

colnames = list(df.columns)
# 1998 - 2013
years_into_future = 7
known_x = np.arange(df.shape[0])  # Current time interval
# 7 years into future
new_x = np.arange(len(known_x), len(known_x) + years_into_future, 1)

extended_df = _extend_with_projection(df, known_x, new_x, _linear_projection)
supress = extended_df.plot(subplots=True, layout=(7, 4), figsize=(16, 16),
                           alpha=.6, sharex=False)

# ---------------------------------------------------------------------------
# Extend every country towards a fixed end point and interpolate the gap
# ---------------------------------------------------------------------------

extended_df = _extend_with_projection(df, known_x, new_x, _endpoint_anchor)

# All the options are described here:
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.interpolate.html
# {'linear', 'time', 'index', 'values', 'nearest', 'zero', 'slinear',
#  'quadratic', 'cubic', 'barycentric', 'krogh', 'polynomial', 'spline',
#  'piecewise_polynomial', 'pchip'}
# The 'polynomial' and 'spline' methods additionally require ``order``.
extended_df = extended_df.interpolate(method='pchip', axis=0)

supress = extended_df.plot(subplots=True, layout=(7, 4), figsize=(16, 16),
                           alpha=.6, sharex=False)
extended_df.plot(figsize=(16, 9), alpha=.6)