How can we help parents to choose elementary schools for their children? We’re interested in visualizing the statistical data from National Center for Educational Statistics with geo-data to help parents determine the best location to live in and send their children to school.
We will help parents assess the optimal location through analyzing a variety of factors including:
After ranking each of these on a scale of 1-10 (10: Most Important), we'll generate a heat map of the state they're interested in living in and dynamically color it according to how closely each county accords with their preferences.
The heatmap will be displayed within a web browser.
Example: http://nbviewer.ipython.org/5490713
Below is our initial code that extracts, cleans and coerces the data into Numpy data structures
## Setting up tools
import csv
import codecs
import pandas as pd
import numpy as np
import os
from pandas import DataFrame, Series
from itertools import islice
from bs4 import BeautifulSoup
dtype = {'FIPS':np.object}
## Extracting Literacy rates, and College degree rates
f1 = codecs.open(os.path.abspath("data/LiteracyCollegeDegreeData.csv"), encoding='iso-8859-1')
df1 = pd.read_csv(f1, dtype=dtype)
## Extracting Crime, motor vehicle mortality, average income and smoking rates
f2 = codecs.open(os.path.abspath("data/CrimeRate.csv"), encoding='iso-8859-1')
header = list()
data = list()
n = 0
for row in islice(f2, None):
if n == 0:
pass
elif n == 1:
header.append(row.split(','))
else:
data.append(row.split(','))
n = n + 1
df2 = DataFrame(data, columns = header[0])
df2 = df2[['FIPS', 'County', 'State', 'Violent Crime Rate', '% AFGR', 'MV Mortality Rate', '% Smokers']]
f2 = codecs.open(os.path.abspath("data/PopIncome.csv"), encoding='iso-8859-1')
header1 = list()
data1 = list()
n = 0
for row in islice(f2, None):
if n == 0:
pass
elif n == 1:
header1.append(row.split(','))
else:
data1.append(row.split(','))
n = n + 1
df3 = DataFrame(data1, columns = header1[0])
df3 = df3[['FIPS', 'County', 'State', 'Population', 'Household Income']]
## Extracting county health rankings
dtype = {'FIPS':np.object}
ff = codecs.open(os.path.abspath("data/CountyHealthRankings.csv"), encoding='iso-8859-1')
header = list()
data = list()
n = 0
for row in islice(ff, None):
if n == 0:
pass
elif n == 1:
header.append(row.split(','))
else:
data.append(row.split(','))
n = n + 1
health = DataFrame(data, columns = header[0])
dataframe = pd.merge(df2, df3)
df_final = pd.merge(df1, dataframe, on = 'FIPS')
# Deleting redundant or unnecessary columns, then cleaning up the dataframe by renaming the columns
for col in ('State_y', 'County_y', 'Population_x', '95%CI-Low(College)', '95%CI-Low(Illiterate)', '95%CI-High(Illiterate)', 'Quartile'):
del df_final[col]
df_final = df_final.rename(columns={'Population_y': 'Population', 'State_x': 'State', 'County_x': 'County'})
df_final['# of Ranked Counties'] = health['# of Ranked Counties']
df_final['Health Outcome Rank'] = health['Health Outcome Rank']
def get_state(state):
'''
Returns subset of df_final that is only the counties in the state
state should be a state's FIP code, e.g., '06' for California
'''
if len(state) > 2:
fip = state[:2]
else:
fip = state
return df_final[df_final['FIPS'].map(lambda x: x[:2] == fip)]
# False is for 'good' metrics and True is for 'bad' metrics.
ascending = {'NumberWentToSomeCollege': False, 'PercentWithCollegeDegree': False, '95%CI-High(College)': False, 'Illiterate': True, '% Smokers':True, 'MV Mortality Rate': True, '% AFGR': False, 'Violent Crime Rate': True, 'Population': False, 'Household Income':False, '# of Ranked Counties': False, 'Health Outcome Rank': False}
def getStateCountyMetricRankings(stateFips):
'''
Returns a dataframe with all the metrics ranked for all the counties in this state.
stateFips should be a FIPS code for a state with just the first two characters that are relevant.
'''
if len(stateFips) > 2:
stateFips = stateFips[:2]
finalDataFrame = None
for metric in ascending:
rankedColumn = get_state(stateFips)[metric].rank(ascending=ascending[metric])
if finalDataFrame is None:
finalDataFrame = DataFrame(index = rankedColumn.index)
finalDataFrame[metric] = rankedColumn
else:
finalDataFrame[metric] = rankedColumn
return finalDataFrame
Ranking a county characteristic as number 1 signifies that it is very important to you in choosing where to live. Ranking something as number 10 signifies that it is the least important factor to you.
translationMap = {"Crime Rate": "Violent Crime Rate", "HS Grad Rate": "% AFGR", \
"Healthcare Cost": "Health Outcome Rank", "College Degree Holders": "PercentWithCollegeDegree", "Population": "Population", \
"Income": "Household Income", "Literacy Rate": "Illiterate", "MV Mortality Rate": "MV Mortality Rate", \
"Percent of Smokers": "% Smokers", "Number attended College": "NumberWentToSomeCollege"}
ranks = [rank1, rank2, rank3, rank4, rank5, rank6, rank7, rank8, rank9, rank10]
weights = {}
# Populate the weights of the metrics.
for i,rank in enumerate(ranks):
translatedDFMetric = translationMap[rank]
weights[translatedDFMetric] = i + 1
print weights
{'Household Income': 3, 'Illiterate': 5, 'NumberWentToSomeCollege': 9, 'Violent Crime Rate': 1, '% Smokers': 6, 'MV Mortality Rate': 8, 'PercentWithCollegeDegree': 2, '% AFGR': 10, 'Population': 7, 'Health Outcome Rank': 4}
stateDF = get_state(state)
stateCountyMetrics = getStateCountyMetricRankings(state)
# Key is County FIPS, Value is the score
rankScores = {}
for index in stateDF.index:
countyFip = stateDF.ix[index].ix['FIPS']
for metric in weights:
weight = weights[metric]
metricValue = stateCountyMetrics.ix[index].ix[metric]
if countyFip not in rankScores:
rankScores[countyFip] = 0
rankScores[countyFip] += weight*metricValue
num_counties = len(stateDF)
def rgb_to_hex(rgb):
return '#%02x%02x%02x' % rgb
def generate_rgb(val, maxval):
f = float(val) / (maxval)
r, g, b = int((1-f)*255), int(f*255), 0
return rgb_to_hex((g, r, b))
colors = []
for val in xrange(0, num_counties+1):
colors.append(generate_rgb(val, num_counties))
from collections import defaultdict
counties_rating = rankScores
counties_sorted = sorted(counties_rating, key=counties_rating.get, reverse=False)
counties_colors = defaultdict(int)
i = 0
while (i < num_counties):
try:
counties_colors[counties_sorted[i]] = colors[i]
i = i + 1
except:
i = i + 1
continue
SVG file source: https://commons.wikimedia.org/wiki/File:USA_Counties.svg
from IPython.display import SVG
SVG(filename="counties.svg")
SVG file source: https://commons.wikimedia.org/wiki/File:USA_Counties.svg
svg = codecs.open(os.path.abspath("data/counties.svg"), encoding='iso-8859-1').read()
soup = BeautifulSoup(svg, selfClosingTags=['defs','sodipodi:namedview'])
paths = soup.findAll('path')
svgs = soup.findAll('svg')
viewbox = "5 90 650 300"
style = 'font-size:12px;fill-rule:nonzero;stroke:#FFFFFF;stroke-opacity:1; stroke-width:0.1;stroke-miterlimit:4;stroke-dasharray:none;stroke-linecap:butt; marker-start:none;stroke-linejoin:bevel;fill:'
blank = 'fill:#FFFFFF';
path_onmouseover_open = "displayName('"
path_onmouseover_close = "')"
legends = []
countyNames = defaultdict(int)
rank = 0
for p in paths:
if p['id'][0:2] == state[:2] and p['id'] not in ["State_Lines", "separator"]:
try:
color = counties_colors[p['id']]
p['style'] = style + color
rank = counties_sorted.index(p['id'])
countyNames[p['id']] = p['inkscape:label']
p['inkscape:label'] = p['inkscape:label'] + '. Rank: ' + str(rank+1)
name = p['inkscape:label']
legends.append(name)
p['onmouseover'] = path_onmouseover_open + name + path_onmouseover_close
count += 1
except:
continue
else:
p['style'] = blank
for s in svgs:
s['viewBox']= viewbox
with open("output.svg", "wb") as file:
file.write(soup.prettify(formatter=None))
for fips in counties_sorted[:10]:
print "Rank" + str(counties_sorted.index(fips) + 1) + " " + str(countyNames[fips])
Rank1 Nevada, CA Rank2 El Dorado, CA Rank3 San Mateo, CA Rank4 San Luis Obispo, CA Rank5 Placer, CA Rank6 Orange, CA Rank7 Marin, CA Rank8 Santa Barbara, CA Rank9 San Diego, CA Rank10 Sonoma, CA
from IPython.display import SVG
SVG(filename="output.svg")