%pylab inline
import pandas as pd
Populating the interactive namespace from numpy and matplotlib
# Load the beer dataset (space-delimited text file) and display it
# sorted by calorie count.
df = pd.read_csv('./data/beer.txt', delimiter=' ')
# DataFrame.sort() was removed in pandas 0.20+; sort_values() is the
# replacement. Note this returns a sorted *copy* for display — df itself
# keeps its original row order.
df.sort_values('Calories')
Beer | Calories | Sodium | Alcohol | Cost | |
---|---|---|---|---|---|
15 | Pabst_Extra_Light | 68 | 15 | 2.3 | 0.38 |
18 | Olympia_Goled_Light | 72 | 6 | 2.9 | 0.46 |
19 | Schlitz_Light | 97 | 7 | 4.2 | 0.47 |
8 | Miller_Lite | 99 | 10 | 4.3 | 0.43 |
11 | Coors_Light | 102 | 15 | 4.1 | 0.46 |
9 | Budweiser_Light | 113 | 8 | 3.7 | 0.40 |
12 | Michelob_Light | 135 | 11 | 4.2 | 0.50 |
16 | Hamms | 139 | 19 | 4.4 | 0.43 |
10 | Coors | 140 | 18 | 4.6 | 0.44 |
17 | Heilemans_Old_Style | 144 | 24 | 4.9 | 0.43 |
0 | Budweiser | 144 | 15 | 4.7 | 0.43 |
5 | Old_Milwaukee | 145 | 23 | 4.6 | 0.28 |
7 | Srohs_Bohemian_Style | 149 | 27 | 4.7 | 0.42 |
14 | Kirin | 149 | 6 | 5.0 | 0.79 |
13 | Becks | 150 | 19 | 4.7 | 0.76 |
1 | Schlitz | 151 | 19 | 4.9 | 0.43 |
4 | Heineken | 152 | 11 | 5.0 | 0.77 |
2 | Lowenbrau | 157 | 15 | 0.9 | 0.48 |
3 | Kronenbourg | 170 | 7 | 5.2 | 0.73 |
6 | Augsberger | 175 | 24 | 5.5 | 0.40 |
from sklearn import cluster
from __future__ import division
from matplotlib import pyplot as plt
import seaborn as sns
beer = df.iloc[:, 1:]
krange = range(1, int(beer.shape[0] / 2))
sum_squares = [cluster.KMeans(n_clusters=k).fit(beer).inertia_ for k in krange]
variance_explained = [1.0 - (s / sum(sum_squares)) for s in sum_squares]
# Plot the elbow curve: explained variance versus number of clusters.
plt.figure()
# x-axis uses the k values that were actually fitted (krange) rather than
# a hard-coded range, so the plot stays correct if the dataset changes.
plt.plot(list(krange), variance_explained)
plt.ylabel('% of explained variance')
plt.xlabel('clusters K')
# Single xlim call — the original set [1, 75] and then immediately
# overrode it with [1, 11]; the first call was dead code.
plt.xlim([1, 11])
(1, 11)
## -- 3 Clusters --
# Fit the final 3-cluster model; random_state makes the assignment
# reproducible across runs.
clusters_3 = cluster.KMeans(n_clusters=3, random_state=1).fit(beer)
# labels_ already holds the training-set assignment — no need to call
# predict(beer) once per panel.
labels = clusters_3.labels_

fig, ax = plt.subplots(2, 2, figsize=[9, 4])
fig.suptitle('K Cluster Targets')
feature_names = list(beer.columns)
# Plot the first feature (calories) against each remaining feature,
# coloured by cluster assignment.
for axis, col in zip([ax[0][0], ax[0][1], ax[1][0]], [1, 2, 3]):
    axis.scatter(beer.iloc[:, 0], beer.iloc[:, col], c=labels)
    axis.set_xlabel(feature_names[0])
    axis.set_ylabel(feature_names[col])
# Only three feature pairs to show; hide the unused fourth panel.
ax[1][1].axis('off')
<matplotlib.collections.PathCollection at 0x10b397d90>
from sklearn import tree

# Fit a shallow classification tree on the cluster labels so the cluster
# boundaries can be read off as simple feature thresholds.
treeclf = tree.DecisionTreeClassifier(max_depth=2, random_state=1)
treeclf.fit(beer, clusters_3.predict(beer))

# Write the tree out in Graphviz .dot format.
# export_graphviz writes str, so the file must be opened in text mode
# ('w'); 'wb' raises TypeError on Python 3. Also do not rebind the file
# handle to the function's return value (None) inside the with-block.
with open("beer_clusters.dot", 'w') as f:
    tree.export_graphviz(treeclf, out_file=f, feature_names=list(beer.columns))