# NOTE(review): this file is a notebook export that was collapsed onto single
# physical lines; statements are restored one-per-line so the module parses.
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

# '%matplotlib inline' is IPython magic, not Python syntax.  Run it only when
# an IPython kernel is present so the file also works as a plain script.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

a = pd.Series(np.random.randn(5), name='random numbers')
a

# Series can be sliced
a[-3:]
# Series can be indexed by position
a[3]
# Series can be indexed with arrays of indices
a[[1, 3, 2]]
# Series can be indexed with boolean arrays
a[[True, False, False, True, True]]

# Series can have labels as indices
a.index = list('abcde')  # list('abcde') --> ['a', 'b', 'c', 'd', 'e']
a
# Indexing by position still works, but once the index holds string labels it
# must be spelled .iloc -- the integer fallback through plain [] was
# deprecated and removed in pandas 2.0
a.iloc[2]
# But we can also index with labels...
a['c']
# ...and arrays of labels
a[['c', 'b', 'a']]

# Operations on Series align on the index (i)
a[-3:]
# Operations on Series align on the index (ii)
a[:3]
# Operations on Series align on the index (iii)
a[-3:] + a[:3]
# Compare to the same result for the underlying numpy array
a.values[-3:] + a.values[:3]

# A Series makes a great look-up table
gp_lut = pd.Series(data=[4, 3, 2, 1, 0], index=list('ABCDF'))
gp_lut
my_grades = ['A', 'B', 'B', 'A', 'A']
my_points = gp_lut[my_grades]
my_points
# Series have lots of numerical evaluation methods
my_gpa = my_points.mean()
my_gpa

students = pd.DataFrame(
    {'name': ['Peter', 'Paul', 'Mary', 'Peter', 'Paul', 'Mary'],
     'subject': ['English', 'English', 'English', 'Math', 'Math', 'Math'],
     'grade': [85.0, 76.0, 92.0, 77.0, 68.0, 87.0]})
students
# We can get the column names
students.columns
# We can get a single Series indexing by column name...
students['name']
# ...or as an attribute
students.name
# We can get a subset of columns indexing with an array of labels
students[['subject', 'grade']]
# To get a subset of rows, slicing and indexing still work
students[-3:]
# But we can give boolean indexing an interesting twist
students[students.name == 'Mary']

data = pd.read_csv('star_2013_clean_wide.csv')
data
# A closer look at the available columns
data.columns
# What are the unique test names?
data['Test.Name'].unique()

# How many students were tested, per test and grade?
data.pivot_table(values='Students.Tested', index='Test.Name',
                 columns='Grade', aggfunc='sum')

# First, keep the English test rows only.  .copy() gives us an independent
# frame so adding a column below does not raise SettingWithCopyWarning.
english = data[data['Test.Name'] == 'CST English-Language Arts'].copy()
# String aggregation names replace the deprecated np.mean/np.sum callables.
english_grade = english.groupby('Grade')['Mean.Scale.Score'].aggregate('mean')
english_grade
english_grade.plot()

# We add a new calculated column to our dataframe
english['Weighted.Score'] = english['Mean.Scale.Score'] * english['Pct.Test.Grade']
# Then group by grade and plot the aggregated sum of this column
english.groupby('Grade')['Weighted.Score'].aggregate('sum').plot()

# Keep the relevant rows only (.copy() again: we filter and plot this subset)
math_7th = data[(data['Test.Name'] == 'CST Mathematics')
                & (data['Grade'] == 7)].copy()
math_7th.plot(x='median.family.income', y='Mean.Scale.Score', kind='scatter')

# Let's peek at the offending row
math_7th[math_7th['Mean.Scale.Score'] < 300]
math_7th.plot(x='Students.Tested', y='Mean.Scale.Score',
              kind='scatter', logx=True)

# Drop the outlier county and look at the scatter again
math_7th = math_7th[math_7th['County.Name'] != 'Sierra']
math_7th.plot(x='median.family.income', y='Mean.Scale.Score', kind='scatter')


def _fit_ols(frame, predictor, response='Mean.Scale.Score'):
    """Fit response ~ const + predictor by ordinary least squares.

    Returns the fitted statsmodels results object (use .summary() on it).
    """
    y = frame[response]                 # response
    X = sm.add_constant(frame[predictor])  # predictor plus intercept column
    return sm.OLS(y, X).fit()


def _plot_fit(frame, predictor, est, response='Mean.Scale.Score'):
    """Scatter the data and overlay the fitted regression line in red."""
    endpoints = np.array([frame[predictor].min(), frame[predictor].max()])
    # add_constant builds the [1, x] design rows est.predict() expects
    fitted = est.predict(sm.add_constant(endpoints))
    frame.plot(x=predictor, y=response, kind='scatter')
    plt.plot(endpoints, fitted, 'r-')


# Let's fit a linear model to our data: score vs. median family income
est = _fit_ols(math_7th, 'median.family.income')
est.summary()
# Let's plot the regression line on top of the data
_plot_fit(math_7th, 'median.family.income', est)

# Same exercise for spending per average daily attendance
math_7th.plot(x='Spend.Per.ADA', y='Mean.Scale.Score', kind='scatter')
est = _fit_ols(math_7th, 'Spend.Per.ADA')
est.summary()
_plot_fit(math_7th, 'Spend.Per.ADA', est)

# Multivariate model: both predictors at once
y = math_7th['Mean.Scale.Score']                          # response
X = math_7th[['median.family.income', 'Spend.Per.ADA']]   # predictors
X = sm.add_constant(X)    # Add a constant term to the predictors
est = sm.OLS(y, X)
est = est.fit()
est.summary()

# ---- Building the clean data set from the raw STAR files -------------------
raw_data = pd.read_csv('ca2013_1_csv_v3.txt')
# Let's list the columns in the loaded data frame
raw_data.columns
# Keep only the columns the analysis needs
raw_data = raw_data[['County Code', 'District Code', 'Grade', 'Test Id',
                     'Students Tested', 'Mean Scale Score']]
# County-level rows have a district code of 0 (and a non-zero county code)
raw_data = raw_data[(raw_data['County Code'] != 0)
                    & (raw_data['District Code'] == 0)]
del raw_data['District Code']
raw_data[:5]

entities = pd.read_csv('ca2013entities_csv.txt')
# Let's take a look at the columns here
entities.columns
# Let's keep what we need only
entities = entities[['County Code', 'District Code', 'County Name']]
# And let's get rid of the non county specific data
entities = entities[(entities['County Code'] != 0)
                    & (entities['District Code'] == 0)]
entities[:5]
# We can now get rid of the 'District Code' column
del entities['District Code']
# And make the 'County Code' column the index of the data frame
entities.set_index('County Code', inplace=True)
# Lastly, we use this as a LUT for the raw_data 'County Code'.  Notice the use
# of .loc to ensure the indexing is label, not position, based
county_names = entities.loc[raw_data['County Code']]
# And we add this information as a new column...
# Re-index the county-name LUT rows to line up with raw_data, then attach
county_names.index = raw_data.index
raw_data['County Name'] = county_names
raw_data[:5]

# Map numeric test ids to human-readable names through a small LUT file
test_name_lut = pd.read_csv('test_names.csv')
test_name_lut.set_index('Test ID', inplace=True)
test_names = test_name_lut.loc[raw_data['Test Id']]
test_names.index = raw_data.index
raw_data['Test Name'] = test_names
raw_data[:5]

# pd.to_numeric replaces the removed Series.convert_objects();
# errors='coerce' turns non-numeric score entries into NaN, as before.
raw_data['Mean Scale Score'] = pd.to_numeric(raw_data['Mean Scale Score'],
                                             errors='coerce')
raw_data['Mean Scale Score'].dtype

# Drop county code 99 (not a regular county row in this dataset --
# presumably a statewide aggregate; TODO confirm against the data dictionary)
raw_data = raw_data[raw_data['County Code'] != 99]

# Join in per-county income figures, aligned by county name
income = pd.read_csv('income.csv')
income.set_index('County', inplace=True)
income = income.loc[raw_data['County Name']]
income.index = raw_data.index
raw_data = raw_data.join(income)
raw_data[:5]

# 'sheet_name' replaces the removed 'sheetname' keyword (pandas >= 1.0)
expense_lut = pd.read_excel('currentexpense1213.xls',
                            sheet_name='County Averages', skiprows=5)
expense_lut.set_index('County', inplace=True)
expenses = expense_lut.loc[raw_data['County Name']]
expenses.index = raw_data.index
raw_data = raw_data.join(expenses)
raw_data[:5]

# Share of students tested within each (grade, test) group.  transform('sum')
# guarantees index alignment with raw_data, which groupby.apply no longer
# provides consistently across pandas versions.
grouped = raw_data.groupby(['Grade', 'Test Id'])['Students Tested']
raw_data['Pct Test Grade'] = raw_data['Students Tested'] / grouped.transform('sum')
raw_data[:5]

# Dotted column names, to match the columns the analysis section expects
raw_data.rename(columns={'County Code': 'County.Code',
                         'Test Id': 'Test.Id',
                         'Students Tested': 'Students.Tested',
                         'Mean Scale Score': 'Mean.Scale.Score',
                         'County Name': 'County.Name',
                         'Test Name': 'Test.Name',
                         'Per capita income': 'per.capita.income',
                         'Median household income': 'median.household.income',
                         'Median family income': 'median.family.income',
                         'Expenditures (EDP 365)': 'Spend',
                         'Current Expense ADA': 'ADAttend',
                         'Current Expense Per ADA': 'Spend.Per.ADA',
                         'Pct Test Grade': 'Pct.Test.Grade'},
                inplace=True)
raw_data[:5]
raw_data.to_csv('star_2013_cleaned_up.csv', index=False)