import numpy as np import pandas as pd import matplotlib as mpl import matplotlib.pyplot as plt %matplotlib inline import MySQLdb import time import os from mpl_toolkits.basemap import Basemap from matplotlib.collections import LineCollection from matplotlib import cm import shapefile import inspect from itertools import islice, izip db_inputs = {"host":"localhost", "user":"root", "password":"root"} db = MySQLdb.connect(host=db_inputs["host"], user=db_inputs["user"], passwd=db_inputs["password"], port=8889) cursor = db.cursor() def create_database(dbname): sql1 = "SHOW DATABASES;" cursor.execute(sql1) current_databases = np.concatenate(cursor.fetchall()) if dbname not in current_databases: sql2 = "CREATE DATABASE %s;" % dbname cursor.execute(sql2) print "Database %s created at %s" % (dbname, time.strftime("%D %H:%M:%S")) else: print "Database '%s' already exists." % dbname def delete_database(dbname): sql1 = "SHOW DATABASES;" cursor.execute(sql1) current_databases = np.concatenate(cursor.fetchall()) if dbname in current_databases: sql2 = "DROP DATABASE %s;" % dbname cursor.execute(sql2) print "Database '%s' deleted at %s" % (dbname, time.strftime("%D %H:%M:%S")) else: print "'%s' does not exist." % dbname def create_table(tbname, dbname, column_names, dtypes, quiet=False): sql1 = "SHOW TABLES IN %s;" % dbname cursor.execute(sql1) current_tables = cursor.fetchall() if (len(current_tables) == 0) or ((len(current_tables) > 0) & (tbname not in np.concatenate(current_tables))): sql2 = "CREATE TABLE %s.%s (" % (dbname, tbname) sql2 += "%s %s" % (column_names[0], dtypes[0]) for ii in range(1, len(column_names)): sql2 += ", %s %s" % (column_names[ii], dtypes[ii]) sql2 += ");" cursor.execute(sql2) if quiet==False: print "Table %s created in database %s at %s" % (tbname, dbname, time.strftime("%D %H:%M:%S")) else: if quiet==False: print "Table %s in database %s already exists." % (tbname, dbname) def delete_table(tbname, dbname): sql1 = "SHOW TABLES IN %s;" % dbname cursor.execute(sql1) current_tables = np.concatenate(cursor.fetchall()) if tbname in current_tables: sql2 = "DROP TABLE %s.%s;" % (dbname, tbname) cursor.execute(sql2) print "Table '%s' in database '%s' deleted at %s" % (tbname, dbname, time.strftime("%D %H:%M:%S")) else: print "'%s' does not exist in database %s." % (tbname, dbname) def load_data_to_new_table(filename, dbname, tbname=None, print_sql=False): if not tbname: tbname = filename.split(".")[0].replace("-","_").lower() the_data = pd.read_csv(filename) nohead_fname = filename.split(".")[0]+"_no_head.csv" the_data.to_csv(nohead_fname, header=False, index=False, sep="\t") column_names = list(the_data.columns) dtypes = [] for ii in range(len(column_names)): if the_data[column_names[ii]].dtype == "O": dtypes.append("VARCHAR(100)") elif the_data[column_names[ii]].dtype == "int64": dtypes.append("INT") column_names[ii] = column_names[ii].lower().replace(" ","_") # Create the table create_table(tbname, dbname, column_names, dtypes) # Load the data sql = "LOAD DATA LOCAL INFILE '%s/%s' INTO TABLE %s.%s FIELDS TERMINATED BY '\\t' (%s" % (os.getcwd(), nohead_fname, dbname, tbname, column_names[0]) for ii in range(1, len(column_names)): sql += ", %s" % column_names[ii] sql += ");" if print_sql == False: cursor.execute(sql) else: sql_file = open("load_data.sql", "w") sql_file.write(sql) sql_file.close() os.system("mysql -u %s --password=%s < load_data.sql" % (db_inputs["user"], db_inputs["password"])) os.system("rm load_data.sql") print "\nData from %s loaded into %s.%s at %s" % (filename, dbname, tbname, time.strftime("%D %H:%M:%S")) def load_state_aggregate_data(filename, dbname, print_sql=False, skiprows=3, quiet=False, sheetname=0): data_year = filename.split(".")[0][-4:] tbname = "state_aggregate_"+data_year the_data = pd.read_excel(filename, skiprows=skiprows, sheetname=sheetname) nohead_fname = tbname+"_no_head.tsv" the_data.to_csv(nohead_fname, header=False, index=False, sep="\t") column_names = list(the_data.columns) dtypes = [] if "Year 1" in column_names: for ii in range(len(column_names)): if the_data[column_names[ii]].dtype == "O": dtypes.append("VARCHAR(100)") elif the_data[column_names[ii]].dtype == "int64": dtypes.append("INT") elif the_data[column_names[ii]].dtype == "float64": dtypes.append("DOUBLE") column_names[ii] = column_names[ii].lower().replace(" ","_").replace("-","_") if column_names[ii].startswith("year"): column_names[ii] = column_names[ii]+"_dropouts" if column_names[ii].find("actual") > -1: column_names[ii] = "adjusted_4_year_cohort_graduation_rate" else: for ii in range(len(column_names)): if the_data[column_names[ii]].dtype == "O": dtypes.append("VARCHAR(100)") elif the_data[column_names[ii]].dtype == "int64": dtypes.append("INT") elif the_data[column_names[ii]].dtype == "float64": dtypes.append("DOUBLE") if column_names[ii] == "Category": column_names[ii] = "Student " + column_names[ii] if type(column_names[ii]) == int: column_names[ii] = "Net Students Served in Grade "+str(column_names[ii]) if column_names[ii].find(".1") > -1: column_names[ii] = "Grade "+ column_names[ii].split(".")[0] + " Dropouts" if column_names[ii].find(".2") > -1: column_names[ii] = "Grade "+ column_names[ii].split(".")[0] + " Dropout Rate" if (column_names[ii] == "Number") or (column_names[ii] == "Count"): column_names[ii] = "Number Continuing" if column_names[ii] == "Rate": column_names[ii] = "Rate Continuing" if column_names[ii].startswith("Start"): column_names[ii] = "Percent Remaining " + column_names[ii] if column_names[ii].endswith("(minus continuing)"): column_names[ii] = column_names[ii].split("(")[0] if column_names[ii].find("Est") > -1: column_names[ii] = column_names[ii].replace("Est","Estimated") + " Rate" if column_names[ii].startswith("Annual"): column_names[ii] = column_names[ii] + " Rate" column_names[ii] = column_names[ii].lower().replace(" ","_").replace("-","_").replace("*","") # Create the table create_table(tbname, dbname, column_names, dtypes, quiet=quiet) # Load the data sql = "LOAD DATA LOCAL INFILE '%s/%s' INTO TABLE %s.%s FIELDS TERMINATED BY '\\t' (%s" % (os.getcwd(), nohead_fname, dbname, tbname, column_names[0]) for ii in range(1, len(column_names)): sql += ", %s" % column_names[ii] sql += ");" if print_sql == False: cursor.execute(sql) else: sql_file = open("load_data.sql", "w") sql_file.write(sql) sql_file.close() os.system("mysql -u %s --password=%s < load_data.sql" % (db_inputs["user"], db_inputs["password"])) os.system("rm load_data.sql") if quiet==False: print "Data from %s loaded into %s.%s at %s\n" % (filename, dbname, tbname, time.strftime("%D %H:%M:%S")) def load_county_aggregate_data(filename, dbname, print_sql=False, skiprows=3, quiet=False, sheetname=0): data_year = filename.split(".")[0][-4:] tbname = "county_aggregate_"+data_year the_data = pd.read_excel(filename, skiprows=skiprows, sheetname=sheetname) for ii in range(len(the_data)): the_data.County[ii] = the_data.County[ii].strip() nohead_fname = tbname+"_no_head.tsv" the_data.to_csv(nohead_fname, header=False, index=False, sep="\t") column_names = list(the_data.columns) dtypes = [] if "Year 1" in column_names: for ii in range(len(column_names)): if the_data[column_names[ii]].dtype == "O": dtypes.append("VARCHAR(100)") elif the_data[column_names[ii]].dtype == "int64": dtypes.append("INT") elif the_data[column_names[ii]].dtype == "float64": dtypes.append("DOUBLE") column_names[ii] = column_names[ii].lower().replace(" ","_").replace("-","_") if column_names[ii].startswith("year"): column_names[ii] = column_names[ii]+"_dropouts" if column_names[ii].find("actual") > -1: column_names[ii] = "adjusted_4_year_cohort_graduation_rate" else: for ii in range(len(column_names)): if the_data[column_names[ii]].dtype == "O": dtypes.append("VARCHAR(100)") elif the_data[column_names[ii]].dtype == "int64": dtypes.append("INT") elif the_data[column_names[ii]].dtype == "float64": dtypes.append("DOUBLE") if type(column_names[ii]) == int: column_names[ii] = "Net Students Served in Grade "+str(column_names[ii]) if column_names[ii].find(".1") > -1: column_names[ii] = "Grade "+ column_names[ii].split(".")[0] + " Dropouts" if column_names[ii].find(".2") > -1: column_names[ii] = "Grade "+ column_names[ii].split(".")[0] + " Dropout Rate" if (column_names[ii] == "Number") or (column_names[ii] == "Count"): column_names[ii] = "Number Continuing" if column_names[ii] == "Rate": column_names[ii] = "Rate Continuing" if column_names[ii].startswith("Start"): column_names[ii] = "Percent Remaining " + column_names[ii] if column_names[ii].endswith("(minus continuing)"): column_names[ii] = column_names[ii].split("(")[0] if column_names[ii].find("Est") > -1: column_names[ii] = column_names[ii].replace("Est","Estimated") + " Rate" if column_names[ii].startswith("Annual"): column_names[ii] = column_names[ii] + " Rate" column_names[ii] = column_names[ii].lower().replace(" ","_").replace("-","_").replace("*","") # Create the table create_table(tbname, dbname, column_names, dtypes, quiet=quiet) # Load the data sql = "LOAD DATA LOCAL INFILE '%s/%s' INTO TABLE %s.%s FIELDS TERMINATED BY '\\t' (%s" % (os.getcwd(), nohead_fname, dbname, tbname, column_names[0]) for ii in range(1, len(column_names)): sql += ", %s" % column_names[ii] sql += ");" if print_sql == False: cursor.execute(sql) else: sql_file = open("load_data.sql", "w") sql_file.write(sql) sql_file.close() os.system("mysql -u %s --password=%s < load_data.sql" % (db_inputs["user"], db_inputs["password"])) os.system("rm load_data.sql") if quiet==False: print "Data from %s loaded into %s.%s at %s\n" % (filename, dbname, tbname, time.strftime("%D %H:%M:%S")) def restore_df_columns(df, year, level): columns = list(df.columns) col_dict = {} for ii in range(len(columns)): col_dict[columns[ii]] = columns[ii].title().replace("_"," ")# + " " + str(year) if level == "state": df.index.rename("Student Category", inplace=True) return df.rename(columns=col_dict) def retrieve_aggregate_data(year, level="state"): if level == "state": df = pd.read_sql_query("SELECT * FROM testme.%s_aggregate_%s;" % (level, str(year)), db, index_col="student_category") elif level == "county": df = pd.read_sql_query("SELECT * FROM testme.%s_aggregate_%s;" % (level, str(year)), db, index_col="county") df = new_columns(restore_df_columns(df, year, level)) return df def get_stats(df, column, records): wash_counties = [record[4] for record in records if record[0] == "53"] data_dict = {} for county in wash_counties: data_dict[county] = df[column][county] return data_dict def get_differential(df1, df2, columns): diff = df2[columns] - df1[columns] return diff def new_columns(df): columns = df.columns if "Graduates" in columns: df["Estimated Grad Rate"] = (df["Graduates"] * 100.)/df["Adjusted Cohort"] df = df.rename(columns={"Cohort Dropout Rate":"Estimated Cohort Dropout Rate"}) else: df["Estimated Grad Rate"] = (df["On Time Grads"] * 100.)/df["Size Of Cohort"] df["Estimated Cohort Dropout Rate"] = df["Estimated Cohort Dropout Rate"]*100 if "Amer.Indian" in df.index: df = df.rename(index={"Amer.Indian":"American Indian"}) if "Asian Pac Islander" in df.index: df = df.rename(index={"Asian Pac Islander":"Asian/Pacific Islander"}) if "Asian/Pac Islander" in df.index: df = df.rename(index={"Asian/Pac Islander":"Asian/Pacific Islander", "Pac Islander":"Pacific Islander"}) return df def plot_aggregate_data(dfs, years, column, population, plot_type="bar",ylims=None,outfile=None): xaxis = [str(year) for year in years] years = np.array(years) fig = plt.figure(figsize=(11,6)) fig.subplots_adjust(left=0.07, right=0.99, top=0.97) ax = plt.subplot(111) if type(population) == str: data = [df[column][population] for df in dfs] if plot_type=="bar": ax.bar(years, data, align="center") elif plot_type=="scatter": ax.scatter(years,data,s=15) elif plot_type=="line": ax.plot(years,data,marker="o") else: N = len(population) #number of bins w = 0.9/N #bin width colors = ["b", "r", "g", "orange", "black", "white", "purple", "gray"] for ii in range(len(population)): data = [df[column][population[ii]] for df in dfs] if plot_type=="bar": ax.bar(years+(w/2)*(2*ii - (N-1)), data, align="center", width=w, color=colors[ii]) if plot_type=="scatter": ax.scatter(years, data, s=15, c=colors[ii]) if plot_type=="line": ax.plot(years, data, marker="o", color=colors[ii]) ax.minorticks_on() ax.grid() if iter(population): # ax.legend(population) legend = ax.legend(population, frameon = 1) frame = legend.get_frame() frame.set_color('white') frame.set_edgecolor('k') ax.set_xticks(years) ax.set_xticklabels(xaxis) ax.set_xlabel("Years", fontsize=18) ax.set_ylabel(column, fontsize=18) if ylims: ax.set_ylim(ylims) if outfile: plt.savefig(outfile) plt.show() def plot_county_data(df, column, year, vmin=0., vmax=100., cmap=plt.cm.hot, bounds="w", king_col="g", outfile=None): basemap_data_dir = os.path.join(os.path.dirname(inspect.getfile(Basemap)), "data") if os.path.exists(os.path.join(basemap_data_dir,"UScounties.shp")): shpf = shapefile.Reader(os.path.join(basemap_data_dir,"UScounties")) shapes = shpf.shapes() records = shpf.records() ## "53" is washington state # list(zip_filter_by_state(records, shapes, ['53']))[0] def zip_filter_by_state(records, shapes, included_states=None): # by default, no filtering # included_states is a list of states fips prefixes for (record, state) in izip(records, shapes): if record[0] in included_states: yield (record, state) fig = plt.figure(figsize=(6.4,4)) plt.subplots_adjust(left=0.01,right=0.9,top=0.97,bottom=0.05,wspace=0.15,hspace=0.05) ax = plt.subplot(111) #Let's create a basemap of WA x1 = -124.8 x2 = -116.9 y1 = 45.5 y2 = 49.1 # setup mercator map projection. m = Basemap(llcrnrlon=x1,llcrnrlat=y1,urcrnrlon=x2,urcrnrlat=y2,\ resolution='h',projection='merc',\ lat_ts=5.) m.drawcoastlines() m.fillcontinents() m.drawcountries() m.drawstates() df_data = get_stats(df, column, records) county_names = [] # show only WA for record, shape in zip_filter_by_state(records, shapes, ['53']): lons,lats = zip(*shape.points) data = np.array(m(lons, lats)).T county = record[4] value = df_data[county] color = cmap((value-vmin)/(vmax-vmin))[:3] if len(shape.parts) == 1: segs = [data,] else: segs = [] for i in range(1,len(shape.parts)): index = shape.parts[i-1] index2 = shape.parts[i] segs.append(data[index:index2]) segs.append(data[index2:]) lines = LineCollection(segs,antialiaseds=(1,)) lines.set_facecolors(color) lines.set_edgecolors(bounds) lines.set_linewidth(0.3) if county == "King": lines.set_linewidth(2) lines.set_edgecolors(king_col) ax.add_collection(lines) ax1 = fig.add_axes([0.89,0.05,0.03,0.92]) norm = mpl.colors.Normalize(vmin=vmin, vmax=vmax) cbar = mpl.colorbar.ColorbarBase(ax1, cmap=cmap, norm=norm, orientation="vertical") cbar.set_label(year + " " + column + " (%)") dtick = 5. cbar.set_ticks(np.arange(vmin, vmax+dtick, dtick)) if outfile: plt.savefig(outfile) plt.show() infiles = [f for f in os.listdir("./") if f.startswith("Appendix") and f.find("State") != -1] delete_table("state_aggregate_2007","testme") delete_table("state_aggregate_2008","testme") delete_table("state_aggregate_2009","testme") delete_table("state_aggregate_2010","testme") delete_table("state_aggregate_2011","testme") delete_table("state_aggregate_2012","testme") delete_table("state_aggregate_2013","testme") delete_table("state_aggregate_2014","testme") for f in infiles: if eval(f.split("State")[-1].split(".")[0]) <= 2010: load_state_aggregate_data(f, "testme", print_sql=True, quiet=False, skiprows=4, sheetname=-1) else: load_state_aggregate_data(f, "testme", print_sql=True, quiet=False) infiles = [f for f in os.listdir("./") if f.startswith("Appendix") and f.find("County") != -1] delete_table("county_aggregate_2007","testme") delete_table("county_aggregate_2008","testme") delete_table("county_aggregate_2009","testme") delete_table("county_aggregate_2010","testme") delete_table("county_aggregate_2011","testme") delete_table("county_aggregate_2012","testme") delete_table("county_aggregate_2013","testme") delete_table("county_aggregate_2014","testme") for f in infiles: if eval(f.split(".")[0][-4:]) <= 2010: load_county_aggregate_data(f, "testme", print_sql=True, quiet=False, skiprows=3) else: load_county_aggregate_data(f, "testme", print_sql=True, quiet=False, skiprows=3) data2007_state = retrieve_aggregate_data(2007, level="state") data2008_state = retrieve_aggregate_data(2008, level="state") data2009_state = retrieve_aggregate_data(2009, level="state") data2010_state = retrieve_aggregate_data(2010, level="state") data2011_state = retrieve_aggregate_data(2011, level="state") data2012_state = retrieve_aggregate_data(2012, level="state") data2013_state = retrieve_aggregate_data(2013, level="state") data2014_state = retrieve_aggregate_data(2014, level="state") data2007_county = retrieve_aggregate_data(2007, level="county") data2008_county = retrieve_aggregate_data(2008, level="county") data2009_county = retrieve_aggregate_data(2009, level="county") data2010_county = retrieve_aggregate_data(2010, level="county") data2011_county = retrieve_aggregate_data(2011, level="county") data2012_county = retrieve_aggregate_data(2012, level="county") data2013_county = retrieve_aggregate_data(2013, level="county") data2014_county = retrieve_aggregate_data(2014, level="county") plot_aggregate_data((data2007_state, data2008_state, data2009_state, data2010_state, data2011_state, data2012_state, data2013_state, data2014_state), years=(2007,2008,2009,2010,2011,2012,2013,2014), column="Estimated Cohort Dropout Rate", population=("All Students", "Black","White","Hispanic","American Indian"), plot_type="bar", ylims=(10,50), outfile="dropout_rate_2007_2014.pdf") plot_aggregate_data((data2007_state, data2008_state, data2009_state, data2010_state, data2011_state, data2012_state, data2013_state, data2014_state), years=(2007,2008,2009,2010,2011,2012,2013,2014), column="Estimated Grad Rate", population=("All Students", "Black","White","Hispanic","American Indian"), plot_type="bar", ylims=(40,100), outfile="grad_rate_2007_2014.pdf") plot_county_data(data2014_county, "Estimated Grad Rate", "2014", vmin=40., vmax=100., cmap=plt.cm.RdBu, bounds = 'k', king_col="r", outfile="statewide_grad_rate.pdf") plot_county_data(data2014_county, "Estimated Cohort Dropout Rate", "2014", vmax=40., cmap=plt.cm.RdBu_r, bounds = 'k', king_col="r", outfile="statewide_dropout_rate.pdf") columns = ["Estimated Cohort Dropout Rate", "Estimated Grad Rate"] diff_2007_2014 = get_differential(data2007_county, data2014_county, columns) plot_county_data(diff_2007_2014, "Estimated Cohort Dropout Rate", "2007 - 2014", vmin=-15., vmax=15., cmap=plt.cm.RdBu_r, bounds = 'k', king_col="r", outfile="differential_dropout_rate.pdf") plot_county_data(diff_2007_2014, "Estimated Grad Rate", "2007 - 2014", vmin=-15., vmax=15., cmap=plt.cm.RdBu, bounds = 'k', king_col="r", outfile="differential_grad_rate.pdf")