#!/usr/bin/env python
# coding: utf-8
"""Mushroom dataset: entropy and information gain per attribute.

Cleaned up from a Jupyter-notebook export.  Downloads the UCI
agaricus-lepiota data (column 0 is the edible/poisonous label, columns
1-22 are categorical attributes), computes the entropy of the class
label, the information gain of every attribute, and reports the
attribute with the highest gain.

Fixes over the original export:

- Python-2-only constructs (``urllib2``, ``print`` statements,
  ``dict.iteritems``) replaced with Python 3 equivalents.
- ``np.array(Counter(...).values())`` wrapped in ``list`` — on Python 3
  ``dict.values()`` is a view and ``np.array`` turns it into a useless
  0-d object array.
- ``groupby(...).groups.keys()[j]`` indexing removed — dict views are
  not indexable on Python 3; the code now iterates the groupby.
- The IPython-only ``get_ipython()`` magic removed so the file runs as
  a plain script, and all work moved behind a ``main()`` guard so the
  module can be imported without hitting the network.
"""
import operator
from collections import Counter
from math import log

import numpy as np
import pandas as pd

# UCI repository location of the raw CSV (the file has no header row).
DATA_URL = ('http://archive.ics.uci.edu/ml/machine-learning-databases/'
            'mushroom/agaricus-lepiota.data')


def entropy(labels):
    """Entropy of a label distribution.

    The logarithm base is the number of distinct classes present, so a
    perfectly uniform distribution has entropy 1.0 regardless of class
    count (same convention as the original notebook).

    Parameters
    ----------
    labels : sequence
        One class label per sample.

    Returns
    -------
    float
        Entropy in [0, 1]; 0 for empty, single-sample, or single-class
        input.
    """
    n_labels = len(labels)
    if n_labels <= 1:
        return 0
    # list(...) is required on Python 3: dict.values() is a view, and
    # np.array(view) yields a 0-d object array instead of a 1-d int array.
    counts = np.array(list(Counter(labels).values()))
    probs = counts / float(n_labels)
    n_classes = np.count_nonzero(probs)
    if n_classes <= 1:
        # log base 1 is undefined; a single class carries no information.
        return 0
    ent = 0.0
    for p in probs:
        ent -= p * log(p, n_classes)
    return ent


def information_gain(df, attr):
    """Information gain of splitting ``df`` on column ``attr``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose column 0 holds the class label.
    attr : hashable
        Column label to split on.

    Returns
    -------
    float
        Parent entropy minus the size-weighted entropy of the subsets.
    """
    parent_size = float(len(df))
    weighted_child_entropy = 0.0
    # Iterating the groupby directly replaces the original's fragile
    # groups.keys()[j] indexing, which breaks on Python 3 dict views.
    for _, subgroup in df.groupby(attr):
        weight = len(subgroup) / parent_size
        weighted_child_entropy += weight * entropy(subgroup[0].values)
    return entropy(df[0].values) - weighted_child_entropy


def load_data(path=DATA_URL):
    """Read the mushroom CSV into a DataFrame with integer column names."""
    # pandas reads URLs directly, so urllib is unnecessary.
    return pd.read_csv(path, names=list(range(23)))


def main():
    """Download the data, print each attribute's gain and the best one."""
    df = load_data()
    gains = {i: information_gain(df, i) for i in range(1, 23)}
    for attr in sorted(gains):
        print("attribute %2d: information gain = %5.4f" % (attr, gains[attr]))
    best = max(gains.items(), key=operator.itemgetter(1))
    print("Best attribute: %d (information gain = %5.4f)" % best)


if __name__ == '__main__':
    main()