import pandas as pd pd.set_option('display.max_rows', 0) import numpy as np import seaborn as sns from datetime import datetime from matplotlib import pyplot as plt import difflib import sklearn.linear_model as linear_model import sklearn import six from matplotlib import colors from sklearn.preprocessing import StandardScaler from sklearn.cluster import KMeans from numpy import hstack from numpy import atleast_2d, where # Matplotlib preferences %matplotlib inline plt.rcParams['figure.figsize'] = (15,12) plt.rc('xtick', labelsize=20) plt.rc('ytick', labelsize=20) plt.rc('axes', titlesize=25, labelsize=18) # Code for setting the style of the notebook from IPython.core.display import HTML def css_styling(): styles = open("theme/custom.css", "r").read() return HTML(styles) css_styling() # Output text to an external file def output(text): with open('ouput.txt', "a") as output: output.write("{}\n".format(text)) elect_res = pd.read_csv("csv_in/US_elect_county.csv") elect_res = elect_res[elect_res.FIPS > 0] elect_res.OBAMA_NUM = elect_res.OBAMA_NUM.str.replace(",", "") elect_res.ROMNEY_NUM = elect_res.ROMNEY_NUM.str.replace(",", "") elect_res = elect_res.convert_objects(convert_numeric=True) elect_res["OBAMA_PCT"] = elect_res.OBAMA_NUM / (elect_res.OBAMA_NUM + elect_res.ROMNEY_NUM) elect_res["OBAMA_WIN"] = elect_res.OBAMA_PCT > .5 elect_res.head(10) states = [("AK", "Alaska"), ("AL", "Alabama"), ("AR", "Arkansas"), ("AZ", "Arizona"), ("CA", "California"), ("CO", "Colorado"), ("CT", "Connecticut"), ("DE", "Delaware"), ("FL", "Florida"), ("GA", "Georgia"), ("HI", "Hawaii"), ("IA", "Iowa"), ("ID", "Idaho"), ("IL", "Illinois"), ("IN", "Indiana"), ("KS", "Kansas"), ("KY", "Kentucky"), ("LA", "Louisiana"), ("MA", "Massachusetts"), ("MD", "Maryland"), ("ME", "Maine"), ("MI", "Michigan"), ("MN", "Minnesota"), ("MO", "Missouri"), ("MS", "Mississippi"), ("MT", "Montana"), ("NC", "North Carolina"), ("ND", "North Dakota"), ("NE", "Nebraska"), ("NH", "New Hampshire"), ("NJ", "New Jersey"), ("NM", "New Mexico"), ("NV", "Nevada"), ("NY", "New York"), ("OH", "Ohio"), ("OK", "Oklahoma"), ("OR", "Oregon"), ("PA", "Pennsylvania"), ("RI", "Rhode Island"), ("SC", "South Carolina"), ("SD", "South Dakota"), ("TN", "Tennessee"), ("TX", "Texas"), ("UT", "Utah"), ("VA", "Virginia"), ("VT", "Vermont"), ("WA", "Washington"), ("WI", "Wisconsin"), ("WV", "West Virginia"), ("WY", "Wyoming")] have = elect_res.STATE_POST.unique() [state for (abbv, state) in states if abbv not in have] msa_def = pd.read_csv("csv_in/msa_definitions.csv") msa_def.MSA_AGG_NUM = msa_def.MSA_AGG_NUM.fillna(0) msa_def = msa_def.convert_objects(convert_numeric=True) msa_def.TOWNSHIP_NAME = msa_def.TOWNSHIP_NAME.str.rstrip("town") msa_def.TOWNSHIP_NAME = msa_def.TOWNSHIP_NAME.str.rstrip("city") msa_def.MSA_AGG_NUM = msa_def.MSA_AGG_NUM.astype(int) msa_def.info() fips_to_msa = {} for ind, data in msa_def.iterrows(): if pd.isnull(data.TOWNSHIP_NAME): if pd.isnull(data.MSA_AGG_NAME): # MSA fips_to_msa[data.FIPS] = data.MSA_NUM else: # MSA_AGG fips_to_msa[data.FIPS] = data.MSA_AGG_NUM # Towns else: try: fips_to_msa[data.FIPS][data.TOWNSHIP_NAME] = data.MSA_NUM except: fips_to_msa[data.FIPS] = {data.TOWNSHIP_NAME:data.MSA_NUM} # Create new dataframe df = pd.read_csv("csv_in/msa_list.csv") df["OBAMA_NUM"] = 0 df["ROMNEY_NUM"] = 0 df.index = df.MSA_NUM # Distribute votes for ind, data in elect_res.iterrows(): try: msa = fips_to_msa[data.FIPS] except: continue if type(msa) == dict: town = difflib.get_close_matches(data.COUNTY_NAME, msa.keys(), 1, .2) if town == []: continue msa = msa[town[0]] try: df.loc[msa, "OBAMA_NUM"] += data.OBAMA_NUM df.loc[msa, "ROMNEY_NUM"] += data.ROMNEY_NUM except: continue df = df[df.OBAMA_NUM > 0] df["TOTAL_VOTES"] = df.OBAMA_NUM + df.ROMNEY_NUM df["OBAMA_PCT"] = df.OBAMA_NUM / (df.OBAMA_NUM + df.ROMNEY_NUM) df["OBAMA_WIN"] = df.OBAMA_PCT > .5 df = df.sort() df.head() msa_pop = pd.read_csv("csv_in/msa_population.csv") msa_pop = msa_pop[pd.isnull(msa_pop.MSA_NUM) == False] for c in ["POP_2000","POP_2010","POP_DENSITY_2000","POP_DENSITY_2010"]: msa_pop[c] = msa_pop[c].str.replace(",","") msa_pop = msa_pop.convert_objects(convert_numeric=True) msa_pop.head() print ("2010 Population Size\n\tAverage: {}\n\tStd. Deviation: {}\n\n2010 Population Density (People / Square Mile)\n\tAverage: {}\n\tStd. Deviation: {}".format(round(msa_pop.POP_2010.mean(), 2), round(msa_pop.POP_2010.std(), 2), round(msa_pop.POP_DENSITY_2010.mean(), 2),round(msa_pop.POP_DENSITY_2010.std(), 2) )) msa_pop.sort("POP_DENSITY_2010").plot(x = "MSA_NAME", y="POP_DENSITY_2010", xticks=(), title="Population Density Distribution") msa_pop.sort("POP_DENSITY_2010", ascending = False).head() msa_pop.sort("POP_DENSITY_2010").head() msa_edu = pd.read_csv("csv_in/msa_educational-attainment.csv") msa_edu.EDU_10 = msa_edu.EDU_10 / msa_edu.EDU_TOTAL msa_edu.EDU_12 = msa_edu.EDU_12 / msa_edu.EDU_TOTAL msa_edu.EDU_14 = msa_edu.EDU_14 / msa_edu.EDU_TOTAL msa_edu.EDU_16 = msa_edu.EDU_16 / msa_edu.EDU_TOTAL msa_edu["EDU_AVG"] = (msa_edu.EDU_10 * 10 + msa_edu.EDU_12 * 12 + msa_edu.EDU_14 * 14 + msa_edu.EDU_16 * 16) msa_edu.head() print ("Educational Attainment\n\tAverage: {}\n\tStd. Deviation: {}".format(round(msa_edu.EDU_AVG.mean(), 3), round(msa_edu.EDU_AVG.std(), 3))) msa_edu.sort("EDU_AVG").plot(x = "MSA_NAME", y="EDU_AVG", xticks=(), title="Educational Attainment Distribution") msa_edu.sort("EDU_AVG", ascending = False).head() msa_edu.sort("EDU_AVG").head() msa_pop = msa_pop.drop("MSA_NAME", 1) df = df.merge(right=msa_pop, how='left', on="MSA_NUM") msa_edu = msa_edu.drop(["MSA_NAME","EDU_TOTAL"],1) df = df.merge(right=msa_edu, how='left', on="MSA_NUM") df.head() def plotLinReg(title, X_col, xlabel, Y, ylabel): X = X_col[:, np.newaxis] X_train, X_test, Y_train, Y_test = sklearn.cross_validation.train_test_split(X, Y, test_size = .5, random_state = 5) regr = linear_model.LinearRegression() regr.fit(X_train, Y_train) plt.axhline(y=0.5,xmin=0,xmax=max(X),c="grey",linewidth=3, zorder=3) plt.scatter(X, Y, color='blue', zorder=5) plt.plot(X_test, regr.predict(X_test), color='red', linewidth=3, zorder=4) plt.ylabel(ylabel) plt.xlabel(xlabel) plt.title(title) plotLinReg("Population Density vs Obama Percentage", df.POP_DENSITY_2010, "Popululation Density (people per square mile)", df.OBAMA_PCT, "% of votes for Obama") plt.savefig("images/votes-density-unfiltered.jpg") df[df.POP_DENSITY_2010 > 10000].sort("POP_DENSITY_2010", ascending=False) df_original = df.copy() df = df[pd.isnull(df.POP_DENSITY_2010) == False] df = df[pd.isnull(df.EDU_AVG) == False] df = df[df.POP_DENSITY_2010 < 10000] df.sort("POP_DENSITY_2010", ascending=False).head() plotLinReg("Population Density vs Obama Percentage (filtered)", df.POP_DENSITY_2010, "Popululation Density (people per square mile)", df.OBAMA_PCT, "% of votes for Obama") plt.savefig("images/votes-density-filtered.jpg") plt.show() df[df.OBAMA_WIN == False].sort("POP_DENSITY_2010", ascending = False).head() plotLinReg("Educational Attainment vs Obama Percentage", df.EDU_AVG, "Educational Attainment", df.OBAMA_PCT, "% of votes for Obama") plt.savefig("images/votes-education.jpg") plt.show() X = df.POP_DENSITY_2010.values Y = df.EDU_AVG.values X_scaler = StandardScaler(with_std=True) scaled_X = X_scaler.fit_transform(X) Y_scaler = StandardScaler(with_std=True) scaled_Y = Y_scaler.fit_transform(Y) def evaluate_clustering(X, max_k): inertia = [] inertia.append(0) for k in range(1, max_k + 1): kmeans = KMeans(init='k-means++', n_clusters=k, n_init=5) kmeans.fit_predict(X) inertia.append(kmeans.inertia_) return inertia data = hstack((atleast_2d(scaled_X).T, atleast_2d(scaled_Y).T)) inertia_k = evaluate_clustering(data, 15) ax = plt.subplot(111) ax.plot(inertia_k) plt.xlabel('Number of clusters') plt.ylabel('Inertia') ax.set_xlim(1, len(inertia_k)) plt.savefig("images/cluster-eval.jpg") plt.show() color_list_full = [("#D2691E", "Brown"), ("#1E90FF", "Blue"), ("#3CB371", "Green"), ("#9400D3", "Purple")] color_list = [a for (a, b) in color_list_full] def cluster(n_clusters): k_means = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10) k_means.fit_predict(data) k_means_labels = k_means.labels_ df["CLUSTER_{}".format(n_clusters)] = k_means_labels k_means_cluster_centers = k_means.cluster_centers_ k_means_labels_unique = np.unique(k_means_labels) # KMeans for k, col in zip(range(n_clusters), color_list[:n_clusters]): my_members = k_means_labels == k cluster_center = k_means_cluster_centers[k] plt.plot(data[my_members, 0], data[my_members, 1], 'w', markerfacecolor=col, marker='.') plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor="#2F4F4F", markeredgecolor='k', markersize=6) # plt.text(-3.5, 1.8, 'train time: %.2fs\ninertia: %f' % ( # t_batch, k_means.inertia_)) plt.xlabel("Population Density", fontsize="18") plt.ylabel("Educational Attainment", fontsize="18") plt.savefig("images/{}-cluster.jpg".format(n_clusters)) plt.show() return k_means_labels # analyzeCluster(n_clusters, k_means_labels) def analyzeCluster(n_clusters, labels): titles=["Population Density", "Educational Attainment"] titles = ["Population Density"] plt.rcParams['figure.figsize'] = (14,7 * n_clusters) for i, label in enumerate(set(labels)): point_indices = where(labels == label)[0] point_indices = point_indices.tolist() cluster = df.iloc[point_indices] obama = cluster.OBAMA_PCT.mean() print "Cluster {} ({}):\n\tSize: {}\n\tAvg Pop Density: {}\n\tAvg Edu Lvl: {}\n\tObama Avg Pct: {}%".format(i+1, color_list_full[i][1], len(point_indices), round(cluster.POP_DENSITY_2010.mean(), 2), round(cluster.EDU_AVG.mean(), 2), round(cluster.OBAMA_PCT.mean() * 100, 2)) with sns.axes_style('darkgrid'): ax = plt.subplot(n_clusters, 3, 3 * i + 1) sns.violinplot(cluster[['OBAMA_PCT']], color=color_list[i], ax=ax, names=["% Of Obama Votes"]) ax.set_title("Cluster %d" % (i + 1)) ax = plt.subplot(n_clusters, 3, 3 * i + 2) sns.violinplot(cluster[['POP_DENSITY_2010']], color=color_list[i], ax=ax, names=["Population Density"]) ax = plt.subplot(n_clusters, 3, 3 * i + 3) sns.violinplot(cluster[['EDU_AVG']], ax=ax, color=color_list[i], names=["Educational Attainment"]) plt.rcParams['figure.figsize'] = (15,12) plt.savefig("images/{}-cluster-analysis.jpg".format(n_clusters)) labels = cluster(2) analyzeCluster(2, labels) labels = cluster(4) analyzeCluster(4, labels) msa_list = pd.read_csv("csv_in/msa_list.csv") msa_name_to_num = {} for ind, data in msa_list.iterrows(): msa_name_to_num[data.MSA_NAME] = data.MSA_NUM msa_carto = pd.read_csv("csv_in/msa_carto.csv") msa_carto.rename(columns={'name': 'MSA_NAME'}, inplace=True) msa_carto = msa_carto.drop_duplicates(subset="MSA_NAME") msa_carto = msa_carto.drop(["msacmsa", "type", "created_at", "updated_at", "cartodb_id"], 1) msa_carto = msa_carto[msa_carto.MSA_NAME.str.contains(", PR") == False] for ind, data in msa_carto.iterrows(): name = difflib.get_close_matches(str(data.MSA_NAME), msa_name_to_num.keys(), 1, .2) num = msa_name_to_num[name[0]] msa_carto.loc[ind, "MSA_NAME_FOUND"] = name[0] msa_carto.loc[ind, "MSA_NUM"] = num msa_carto = msa_carto.drop(["MSA_NAME", "MSA_NAME_FOUND"], 1) df_carto = df.merge(right=msa_carto, on="MSA_NUM") df_carto.to_csv("csv_out/df_carto.csv") df_carto.head() df_original css_styling()