# NOTE(review): this file is a notebook export that was collapsed onto single
# physical lines; statements are restored one-per-line so the module parses.
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

# '%matplotlib inline' is IPython magic, not Python syntax.  Run it only when
# an IPython kernel is present so the file also works as a plain script.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

a = pd.Series(np.random.randn(5), name='random numbers')
a

# Series can be sliced
a[-3:]
# Series can be indexed by position
a[3]
# Series can be indexed with arrays of indices
a[[1, 3, 2]]
# Series can be indexed with boolean arrays
a[[True, False, False, True, True]]

# Series can have labels as indices
a.index = list('abcde')  # list('abcde') --> ['a', 'b', 'c', 'd', 'e']
a
# Indexing by position still works, but once the index holds string labels it
# must be spelled .iloc -- the integer fallback through plain [] was
# deprecated and removed in pandas 2.0
a.iloc[2]
# But we can also index with labels...
a['c']
# ...and arrays of labels
a[['c', 'b', 'a']]

# Operations on Series align on the index (i)
a[-3:]
# Operations on Series align on the index (ii)
a[:3]
# Operations on Series align on the index (iii)
a[-3:] + a[:3]
# Compare to the same result for the underlying numpy array
a.values[-3:] + a.values[:3]

# A Series makes a great look-up table
gp_lut = pd.Series(data=[4, 3, 2, 1, 0], index=list('ABCDF'))
gp_lut
my_grades = ['A', 'B', 'B', 'A', 'A']
my_points = gp_lut[my_grades]
my_points
# Series have lots of numerical evaluation methods
my_gpa = my_points.mean()
my_gpa

students = pd.DataFrame(
    {'name': ['Peter', 'Paul', 'Mary', 'Peter', 'Paul', 'Mary'],
     'subject': ['English', 'English', 'English', 'Math', 'Math', 'Math'],
     'grade': [85.0, 76.0, 92.0, 77.0, 68.0, 87.0]})
students
# We can get the column names
students.columns
# We can get a single Series indexing by column name...
students['name']
# ...or as an attribute
students.name
# We can get a subset of columns indexing with an array of labels
students[['subject', 'grade']]
# To get a subset of rows, slicing and indexing still work
students[-3:]
# But we can give boolean indexing an interesting twist
students[students.name == 'Mary']

data = pd.read_csv('star_2013_clean_wide.csv')
data
# A closer look at the available columns
data.columns
# What are the unique test names?
data['Test.Name'].unique()

# How many students were tested, per test and grade?
data.pivot_table(values='Students.Tested', index='Test.Name',
                 columns='Grade', aggfunc='sum')

# First, keep the English test rows only.  .copy() gives us an independent
# frame so adding a column below does not raise SettingWithCopyWarning.
english = data[data['Test.Name'] == 'CST English-Language Arts'].copy()
# String aggregation names replace the deprecated np.mean/np.sum callables.
english_grade = english.groupby('Grade')['Mean.Scale.Score'].aggregate('mean')
english_grade
english_grade.plot()

# We add a new calculated column to our dataframe
english['Weighted.Score'] = english['Mean.Scale.Score'] * english['Pct.Test.Grade']
# Then group by grade and plot the aggregated sum of this column
english.groupby('Grade')['Weighted.Score'].aggregate('sum').plot()

# Keep the relevant rows only (.copy() again: we filter and plot this subset)
math_7th = data[(data['Test.Name'] == 'CST Mathematics')
                & (data['Grade'] == 7)].copy()
math_7th.plot(x='median.family.income', y='Mean.Scale.Score', kind='scatter')

# Let's peek at the offending row
math_7th[math_7th['Mean.Scale.Score'] < 300]
math_7th.plot(x='Students.Tested', y='Mean.Scale.Score',
              kind='scatter', logx=True)

# Drop the outlier county and look at the scatter again
math_7th = math_7th[math_7th['County.Name'] != 'Sierra']
math_7th.plot(x='median.family.income', y='Mean.Scale.Score', kind='scatter')


def _fit_ols(frame, predictor, response='Mean.Scale.Score'):
    """Fit response ~ const + predictor by ordinary least squares.

    Returns the fitted statsmodels results object (use .summary() on it).
    """
    y = frame[response]                 # response
    X = sm.add_constant(frame[predictor])  # predictor plus intercept column
    return sm.OLS(y, X).fit()


def _plot_fit(frame, predictor, est, response='Mean.Scale.Score'):
    """Scatter the data and overlay the fitted regression line in red."""
    endpoints = np.array([frame[predictor].min(), frame[predictor].max()])
    # add_constant builds the [1, x] design rows est.predict() expects
    fitted = est.predict(sm.add_constant(endpoints))
    frame.plot(x=predictor, y=response, kind='scatter')
    plt.plot(endpoints, fitted, 'r-')


# Let's fit a linear model to our data: score vs. median family income
est = _fit_ols(math_7th, 'median.family.income')
est.summary()
# Let's plot the regression line on top of the data
_plot_fit(math_7th, 'median.family.income', est)

# Same exercise for spending per average daily attendance
math_7th.plot(x='Spend.Per.ADA', y='Mean.Scale.Score', kind='scatter')
est = _fit_ols(math_7th, 'Spend.Per.ADA')
est.summary()
_plot_fit(math_7th, 'Spend.Per.ADA', est)

# Multivariate model: both predictors at once
y = math_7th['Mean.Scale.Score']                          # response
X = math_7th[['median.family.income', 'Spend.Per.ADA']]   # predictors
X = sm.add_constant(X)    # Add a constant term to the predictors
est = sm.OLS(y, X)
est = est.fit()
est.summary()

# ---- Building the clean data set from the raw STAR files -------------------
raw_data = pd.read_csv('ca2013_1_csv_v3.txt')
# Let's list the columns in the loaded data frame
raw_data.columns
# Keep only the columns the analysis needs
raw_data = raw_data[['County Code', 'District Code', 'Grade', 'Test Id',
                     'Students Tested', 'Mean Scale Score']]
# County-level rows have a district code of 0 (and a non-zero county code)
raw_data = raw_data[(raw_data['County Code'] != 0)
                    & (raw_data['District Code'] == 0)]
del raw_data['District Code']
raw_data[:5]

entities = pd.read_csv('ca2013entities_csv.txt')
# Let's take a look at the columns here
entities.columns
# Let's keep what we need only
entities = entities[['County Code', 'District Code', 'County Name']]
# And let's get rid of the non county specific data
entities = entities[(entities['County Code'] != 0)
                    & (entities['District Code'] == 0)]
entities[:5]
# We can now get rid of the 'District Code' column
del entities['District Code']
# And make the 'County Code' column the index of the data frame
entities.set_index('County Code', inplace=True)
# Lastly, we use this as a LUT for the raw_data 'County Code'.  Notice the use
# of .loc to ensure the indexing is label, not position, based
county_names = entities.loc[raw_data['County Code']]
# And we add this information as a new column...
# Re-index the county-name LUT rows to line up with raw_data, then attach
county_names.index = raw_data.index
raw_data['County Name'] = county_names
raw_data[:5]

# Map numeric test ids to human-readable names through a small LUT file
test_name_lut = pd.read_csv('test_names.csv')
test_name_lut.set_index('Test ID', inplace=True)
test_names = test_name_lut.loc[raw_data['Test Id']]
test_names.index = raw_data.index
raw_data['Test Name'] = test_names
raw_data[:5]

# pd.to_numeric replaces the removed Series.convert_objects();
# errors='coerce' turns non-numeric score entries into NaN, as before.
raw_data['Mean Scale Score'] = pd.to_numeric(raw_data['Mean Scale Score'],
                                             errors='coerce')
raw_data['Mean Scale Score'].dtype

# Drop county code 99 (not a regular county row in this dataset --
# presumably a statewide aggregate; TODO confirm against the data dictionary)
raw_data = raw_data[raw_data['County Code'] != 99]

# Join in per-county income figures, aligned by county name
income = pd.read_csv('income.csv')
income.set_index('County', inplace=True)
income = income.loc[raw_data['County Name']]
income.index = raw_data.index
raw_data = raw_data.join(income)
raw_data[:5]

# 'sheet_name' replaces the removed 'sheetname' keyword (pandas >= 1.0)
expense_lut = pd.read_excel('currentexpense1213.xls',
                            sheet_name='County Averages', skiprows=5)
expense_lut.set_index('County', inplace=True)
expenses = expense_lut.loc[raw_data['County Name']]
expenses.index = raw_data.index
raw_data = raw_data.join(expenses)
raw_data[:5]

# Share of students tested within each (grade, test) group.  transform('sum')
# guarantees index alignment with raw_data, which groupby.apply no longer
# provides consistently across pandas versions.
grouped = raw_data.groupby(['Grade', 'Test Id'])['Students Tested']
raw_data['Pct Test Grade'] = raw_data['Students Tested'] / grouped.transform('sum')
raw_data[:5]

# Dotted column names, to match the columns the analysis section expects
raw_data.rename(columns={'County Code': 'County.Code',
                         'Test Id': 'Test.Id',
                         'Students Tested': 'Students.Tested',
                         'Mean Scale Score': 'Mean.Scale.Score',
                         'County Name': 'County.Name',
                         'Test Name': 'Test.Name',
                         'Per capita income': 'per.capita.income',
                         'Median household income': 'median.household.income',
                         'Median family income': 'median.family.income',
                         'Expenditures (EDP 365)': 'Spend',
                         'Current Expense ADA': 'ADAttend',
                         'Current Expense Per ADA': 'Spend.Per.ADA',
                         'Pct Test Grade': 'Pct.Test.Grade'},
                inplace=True)
raw_data[:5]
raw_data.to_csv('star_2013_cleaned_up.csv', index=False)