import pandas as pd import pylab as P import matplotlib.pyplot as plt import numpy as np from collections import Counter from nltk import FreqDist %matplotlib inline # read in file as pandas dataframe def read_in_file(filename): data = pd.read_csv(filename) return data # for sanity checks of data within columns # prints out the unique categorical values found in specified columns def get_unique_values(df, verbose=False): column_names = list(df) agg_levels = pd.unique(df[column_names[0]]) names = pd.unique(df[column_names[2]]) discip_type = pd.unique(df[column_names[3]]) ethnic_groups = pd.unique(df[column_names[4]]) if verbose: print("Aggregate Levels are: %s" % agg_levels) print("Names of schools are: %s" % names) print("Discipline types are: %s" % discip_type) print("Ethnic groups are: %s" % ethnic_groups) return # subset the data by AGGREGATION LEVEL # D=Local educational agency totals (includes districts and direct funded charter schools) # O=County totals # S=SchoolTotals # T=State totals def subset_by_agg_level(df): column_names = list(df) county_data = df[df.AggegateLevel == 'O'] district_data = df[df.AggegateLevel == 'D'] school_data = df[df.AggegateLevel == 'S'] state_data = df[df.AggegateLevel == 'T'] # label columns in subset: county_data.columns = column_names district_data.columns = column_names school_data.columns = column_names state_data.columns = column_names return county_data, district_data, school_data, state_data # export a specified dataframe to csv def export_df_to_csv(df, output_filename): df.to_csv(output_filename, index=False, header=True) return # grab a subset of a dataframe by column indices def subset_cols_dataframe(df, col_indexes): subset = df.iloc[:, col_indexes] return subset # append together an array of dataframes(any number) in order # example: array_df = [dataframe1, dataframe2, dataframe3] def append_dataframes(array_df): temp = array_df[0] for i in range(1, len(array_df)): temp = temp.append(array_df[i]) return temp # this chunk of code just takes the input csv and subsets it ca14 = read_in_file('CA_discip14.csv') # 2014 California Data saved in ca14 county_data14, district_data14, school_data14, state_data14 = subset_by_agg_level(ca14) ca13 = read_in_file('CA_discip13.csv') # 2013 California Data saved in ca13 county_data13, district_data13, school_data13, state_data13 = subset_by_agg_level(ca13) ca12 = read_in_file('CA_discip12.csv') # 2012 California Data saved in ca12 county_data12, district_data12, school_data12, state_data12 = subset_by_agg_level(ca12) print("2012 data:") get_unique_values(ca12, verbose=True) print("2013 data:") get_unique_values(ca13, verbose=True) print("2014 data:") get_unique_values(ca14, verbose=True) print("-----------------------------------------------------------------------------------------") # creating the useful dataframe for visualizing change in schools over time indices_schools = [2, -2,-1] # for this information we only need the name of school, total referrals, and year schools14 = subset_cols_dataframe(school_data14, indices_schools) # slicing out columns for each year schools13 = subset_cols_dataframe(school_data13, indices_schools) schools12 = subset_cols_dataframe(school_data12, indices_schools) school_totals = append_dataframes([schools14, schools13, schools12]) # appending all years together by_school = school_totals.groupby(['Name','Year']).sum() print("Preview of groupby product: ") print(by_school[0:10]) print(by_school.describe()) by_school.sort('Total', ascending=False).head(50) total_ref = by_school.Total.tolist() # get just the number total referrals/year in a list cleanedList = [x for x in total_ref if str(x) != 'nan'] # temp fix since my 'nan' is string not NaN # make histogram from this list of values yearly_ref = Counter() for value in cleanedList: yearly_ref[value] += 1 yearly_ref_values = list(yearly_ref.keys()) print(yearly_ref.most_common(15)) P.hist(yearly_ref_values, bins=100, facecolor='green') plt.title('Counts of Number of Referrals per Year') plt.ylabel('Number of Schools') plt.xlabel('Number Referrals') # choose which columns you want to subset indices_ethnicity = [4, 6, 12] # state data w/o agg_level and CID and name indices_IDless = [4, 3, 5, 6, 7, 8, 9, 10, 12] ethnic14state = subset_cols_dataframe(state_data14, indices_IDless) ethnic13state = subset_cols_dataframe(state_data13, indices_IDless) ethnic12state = subset_cols_dataframe(state_data12, indices_IDless) # append the state data from all 3 years together ethnic_discip_state = append_dataframes([ethnic14state, ethnic13state, ethnic12state]) # optional: uncomment to export this dataframe in csv form # export_df_to_csv(ethnic_discip_state, 'ethnic_discip_state.csv') group_year_ethnicity = ethnic_discip_state.groupby(['Ethnicity', 'Year']).sum() group_ethnic = ethnic_discip_state.groupby(['Ethnicity']).sum() group_year = ethnic_discip_state.groupby(['Year']).sum() # optional: uncomment to export this dataframe in csv form # export_df_to_csv(group_year_ethnicity, 'group_year_ethnicity.csv') group_year_ethnicity.plot(kind = 'bar',stacked = True, title = 'Referrals by Ethnicity and Year') #InAmGroup = group_year_ethnicity.groupby(group_year_ethnicity.Ethnicity==1) group_year_ethnicity.describe().plot(kind = 'area', stacked =True, title = '') group_year_ethnicity.describe() ethnic_discip = read_in_file('ethnic_discip_state.csv') print("Columns are: %s" %list(ethnic_discip)) # columns of dataframe print("Number of entries is: %s" %len(ethnic_discip)) # num entries ethnic_discip.plot(x='Ethnicity',y ='Drugs', kind = 'scatter',xticks=[0,1,2,3,4,5,6,7,8,9], title = 'Total Referrals by Ethnicity') byInAm = ethnic_discip[ethnic_discip.Ethnicity==1] print(byInAm) year = byInAm.groupby(['Year']).sum() print(year)