%pylab inline
import pandas as pd
Populating the interactive namespace from numpy and matplotlib
# Load the beer dataset (space-delimited text file) and display it
# sorted by calorie count.
df = pd.read_csv('./data/beer.txt', delimiter=' ')
# DataFrame.sort() was removed in pandas 0.20+; sort_values() is the
# replacement. Note this returns a sorted *copy* for display — df itself
# keeps its original row order.
df.sort_values('Calories')
Beer | Calories | Sodium | Alcohol | Cost | |
---|---|---|---|---|---|
15 | Pabst_Extra_Light | 68 | 15 | 2.3 | 0.38 |
18 | Olympia_Goled_Light | 72 | 6 | 2.9 | 0.46 |
19 | Schlitz_Light | 97 | 7 | 4.2 | 0.47 |
8 | Miller_Lite | 99 | 10 | 4.3 | 0.43 |
11 | Coors_Light | 102 | 15 | 4.1 | 0.46 |
9 | Budweiser_Light | 113 | 8 | 3.7 | 0.40 |
12 | Michelob_Light | 135 | 11 | 4.2 | 0.50 |
16 | Hamms | 139 | 19 | 4.4 | 0.43 |
10 | Coors | 140 | 18 | 4.6 | 0.44 |
17 | Heilemans_Old_Style | 144 | 24 | 4.9 | 0.43 |
0 | Budweiser | 144 | 15 | 4.7 | 0.43 |
5 | Old_Milwaukee | 145 | 23 | 4.6 | 0.28 |
7 | Srohs_Bohemian_Style | 149 | 27 | 4.7 | 0.42 |
14 | Kirin | 149 | 6 | 5.0 | 0.79 |
13 | Becks | 150 | 19 | 4.7 | 0.76 |
1 | Schlitz | 151 | 19 | 4.9 | 0.43 |
4 | Heineken | 152 | 11 | 5.0 | 0.77 |
2 | Lowenbrau | 157 | 15 | 0.9 | 0.48 |
3 | Kronenbourg | 170 | 7 | 5.2 | 0.73 |
6 | Augsberger | 175 | 24 | 5.5 | 0.40 |
from sklearn import cluster
from __future__ import division
from matplotlib import pyplot as plt
import seaborn as sns
beer = df.iloc[:, 1:]
krange = range(1, int(beer.shape[0] / 2))
sum_squares = [cluster.KMeans(n_clusters=k).fit(beer).inertia_ for k in krange]
variance_explained = [1.0 - (s / sum(sum_squares)) for s in sum_squares]
# Plot the elbow curve: explained variance versus number of clusters.
plt.figure()
# x-axis uses the k values that were actually fitted (krange) rather than
# a hard-coded range, so the plot stays correct if the dataset changes.
plt.plot(list(krange), variance_explained)
plt.ylabel('% of explained variance')
plt.xlabel('clusters K')
# Single xlim call — the original set [1, 75] and then immediately
# overrode it with [1, 11]; the first call was dead code.
plt.xlim([1, 11])
(1, 11)
## -- 3 Clusters --
# Fit the final 3-cluster model; random_state makes the assignment
# reproducible across runs.
clusters_3 = cluster.KMeans(n_clusters=3, random_state=1).fit(beer)
# labels_ already holds the training-set assignment — no need to call
# predict(beer) once per panel.
labels = clusters_3.labels_

fig, ax = plt.subplots(2, 2, figsize=[9, 4])
fig.suptitle('K Cluster Targets')
feature_names = list(beer.columns)
# Plot the first feature (calories) against each remaining feature,
# coloured by cluster assignment.
for axis, col in zip([ax[0][0], ax[0][1], ax[1][0]], [1, 2, 3]):
    axis.scatter(beer.iloc[:, 0], beer.iloc[:, col], c=labels)
    axis.set_xlabel(feature_names[0])
    axis.set_ylabel(feature_names[col])
# Only three feature pairs to show; hide the unused fourth panel.
ax[1][1].axis('off')
<matplotlib.collections.PathCollection at 0x10b397d90>
from sklearn import tree

# Fit a shallow classification tree on the cluster labels so the cluster
# boundaries can be read off as simple feature thresholds.
treeclf = tree.DecisionTreeClassifier(max_depth=2, random_state=1)
treeclf.fit(beer, clusters_3.predict(beer))

# Write the tree out in Graphviz .dot format.
# export_graphviz writes str, so the file must be opened in text mode
# ('w'); 'wb' raises TypeError on Python 3. Also do not rebind the file
# handle to the function's return value (None) inside the with-block.
with open("beer_clusters.dot", 'w') as f:
    tree.export_graphviz(treeclf, out_file=f, feature_names=list(beer.columns))