#!/usr/bin/env python
# coding: utf-8

# # Mushroom Dataset

# ## 1. Data Set Description
# - http://archive.ics.uci.edu/ml/datasets/Mushroom
# - Attribute Information
#     - 0 - classes (target attribute): edible=e, poisonous=p
#     - 1 - cap-shape: bell=b, conical=c, convex=x, flat=f, knobbed=k, sunken=s
#     - 2 - cap-surface: fibrous=f, grooves=g, scaly=y, smooth=s
#     - 3 - cap-color: brown=n, buff=b, cinnamon=c, gray=g, green=r, pink=p, purple=u, red=e, white=w, yellow=y
#     - 4 - bruises: bruises=t, no=f
#     - 5 - odor: almond=a, anise=l, creosote=c, fishy=y, foul=f, musty=m, none=n, pungent=p, spicy=s
#     - 6 - gill-attachment: attached=a, descending=d, free=f, notched=n
#     - 7 - gill-spacing: close=c,crowded=w,distant=d
#     - 8 - gill-size: broad=b, narrow=n
#     - 9 - gill-color: black=k, brown=n, buff=b, chocolate=h, gray=g, green=r, orange=o, pink=p, purple=u, red=e, white=w, yellow=y
#     - 10 - stalk-shape: enlarging=e, tapering=t
#     - 11 - stalk-root: bulbous=b, club=c, cup=u, equal=e, rhizomorphs=z, rooted=r, missing=?
#     - 12 - stalk-surface-above-ring: fibrous=f, scaly=y, silky=k, smooth=s
#     - 13 - stalk-surface-below-ring: fibrous=f, scaly=y, silky=k, smooth=s
#     - 14 - stalk-color-above-ring: brown=n, buff=b, cinnamon=c, gray=g, orange=o, pink=p, red=e, white=w, yellow=y
#     - 15 - stalk-color-below-ring: brown=n, buff=b, cinnamon=c, gray=g, orange=o, pink=p, red=e, white=w, yellow=y
#     - 16 - veil-type: partial=p, universal=u
#     - 17 - veil-color: brown=n, orange=o, white=w, yellow=y
#     - 18 - ring-number: none=n, one=o, two=t
#     - 19 - ring-type: cobwebby=c, evanescent=e, flaring=f, large=l, none=n, pendant=p, sheathing=s, zone=z
#     - 20 - spore-print-color: black=k, brown=n, buff=b, chocolate=h, green=r, orange=o, purple=u, white=w, yellow=y
#     - 21 - population: abundant=a, clustered=c, numerous=n, scattered=s, several=v, solitary=y
#     - 22 - habitat: grasses=g, leaves=l, meadows=m, paths=p, urban=u, waste=w, woods=d

# ## 2. Working with Pandas DataFrames

# ### 1) Loading Data

# In[1]:


import contextlib

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3 moved urlopen to urllib.request; keep the original name bound
    # so the rest of the script reads the same on both interpreters.
    import urllib.request as urllib2

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from scipy import stats

get_ipython().run_line_magic('matplotlib', 'inline')

path = 'http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'

# Columns are labelled 0..22, matching the attribute numbers listed above.
col_names = list(range(23))

# closing() releases the HTTP response once the CSV has been parsed.
with contextlib.closing(urllib2.urlopen(path)) as raw_csv:
    df = pd.read_csv(raw_csv, names=col_names)


# In[2]:


df.head()


# In[3]:


df.describe()


# ### 2) Finding Missing Values
# NOTE(review): the attribute list marks missing stalk-root values as '?'.
# read_csv is called without na_values, so those are presumably loaded as
# ordinary strings and will not be reported by isnull() -- confirm.

# In[4]:


df.isnull().head()


# In[5]:


df.isnull().values


# In[6]:


df.isnull().values.any()


# ### 3) Learning DataFrame groupby

# In[7]:


temp_df = DataFrame([[1, 2], [1, 4], [5, 6], [5, 4], [5, 4]], columns=['A', 'B'])


# In[8]:


temp_df


# #### - Grouping by column A

# In[9]:


groupby_temp_df = temp_df.groupby('A')


# In[10]:


groupby_temp_df.groups


# In[11]:


groupby_temp_df.get_group(1)


# In[12]:


groupby_temp_df.get_group(5)


# - Return the number of items in each group

# In[13]:


groupby_temp_df.count()


# - Return the first n rows of each group

# In[14]:


groupby_temp_df.head(1)


# In[15]:


groupby_temp_df.describe()


# #### - Grouping by column B

# In[16]:


groupby_temp_df2 = temp_df.groupby('B')


# In[17]:


groupby_temp_df2.groups


# In[18]:


groupby_temp_df2.get_group(2)


# In[19]:


groupby_temp_df2.get_group(4)


# In[20]:


groupby_temp_df2.get_group(6)


# In[21]:


groupby_temp_df2.count()


# In[22]:


groupby_temp_df2.head(1)


# In[23]:


groupby_temp_df2.describe()


# ### 4) Inspecting the information needed to group the Mushroom data by attribute

# In[24]:


df[[0, 1]].head(5)


# In[25]:


a = df[[0, 1]].groupby(1)


# In[26]:


print(type(a))


# In[27]:


a.count()


# In[28]:


a.head(2)


# In[29]:


a.groups.keys()


# In[30]:


a.size()


# In[31]:


a.ngroups


# In[32]:


a.get_group("c")


# In[33]:


a.describe()


# ### 5) Building the df_per_attr dictionary with key information per attribute
# For every attribute column i (1..22), df_per_attr[i] holds:
#   'ngroups'    - number of distinct values of the attribute
#   'group_keys' - list of those distinct values
#   'subgroups'  - {position j in group_keys: rows (columns 0 and i) with that value}

# In[34]:


df_per_attr = {}
for i in range(1, 23):
    groupby_df = df[[0, i]].groupby(i)
    # Materialize the keys as a list: on Python 3 dict views cannot be indexed.
    group_keys = list(groupby_df.groups.keys())
    df_per_attr[i] = {
        'ngroups': groupby_df.ngroups,
        'group_keys': group_keys,
        'subgroups': {
            j: groupby_df.get_group(key) for j, key in enumerate(group_keys)
        },
    }


# In[35]:


df_per_attr[1]['group_keys']


# In[36]:


df_per_attr[1]['subgroups'][0]


# ### 6) Converting Categorical Attributes to Numerical Attributes

# In[37]:


# Target encoding: poisonous -> 1, edible -> 0.
class_map = {'p': 1, 'e': 0}
df[0] = df[0].map(class_map)
df


# In[38]:


print("%d %d" % df.shape)


# In[39]:


num_columns = df.shape[1]

# Column 0 was already encoded explicitly above, so only record its mapping.
# Re-encoding it by order of first appearance (as done for the attribute
# columns below) would swap the 0/1 labels whenever the first row is
# poisonous, which breaks the edible/poisonous split further down.
map_dic = {0: class_map}
for i in range(1, num_columns):
    # Each category letter gets an integer code in order of first appearance.
    map_dic[i] = {value: code for code, value in enumerate(df[i].unique())}
    df[i] = df[i].map(map_dic[i])
df


# In[40]:


map_dic


# In[41]:


df.describe()


# ### 7) Normalization of each column
# Integer codes 0..N-1 of every attribute column are rescaled to [0, 1].

# In[42]:


for i in range(1, num_columns):
    n_levels = len(df[i].unique())
    # A column with a single level only contains code 0; leave it untouched
    # rather than dividing by zero.
    if n_levels > 1:
        df[i] = df[i] / float(n_levels - 1)
df


# In[49]:


df.describe()


# ### 8) Splitting into Edible and Poisonous Mushrooms and drawing a Boxplot for each group
# - Typical Schematic Box Plot
#   - http://goo.gl/qh2ILu
#   - http://www.sfu.ca/~jackd/Stat203/Wk02_1_Full.pdf
# ![Box plot explanation](boxplot.png)

# In[50]:


df_edible = df[df[0] == 0]     # 0: edible
df_poisonous = df[df[0] == 1]  # 1: poisonous


# In[51]:


df_edible.describe()


# In[52]:


fig, ax = plt.subplots()
fig.set_size_inches(15, 4)
df_edible.boxplot(ax=ax)
plt.show()


# In[53]:


df_poisonous.describe()


# In[54]:


fig, ax = plt.subplots()
fig.set_size_inches(15, 4)
df_poisonous.boxplot(ax=ax)
plt.show()


# ### 9) EDA on the attributes that differ clearly between the two groups

# - Attribute 4 (bruises: bruises=t (0), no=f (1))

# In[55]:


map_dic[4]


# In[56]:


df_edible_4 = df_edible[[0, 4]]
df_edible_4.describe()


# In[57]:


df_edible_groupby_4 = df_edible_4.groupby(4)
df_edible_groupby_4.count()


# In[58]:


df_poisonous_4 = df_poisonous[[0, 4]]
df_poisonous_4.describe()


# In[59]:


df_poisonous_groupby_4 = df_poisonous_4.groupby(4)
df_poisonous_groupby_4.count()


# - Attribute 5 (odor: almond=a, anise=l, creosote=c, fishy=y, foul=f, musty=m, none=n, pungent=p, spicy=s)

# In[60]:


map_dic[5]


# In[61]:


df_edible_5 = df_edible[[0, 5]]
df_edible_5.describe()


# In[62]:


df_edible_groupby_5 = df_edible_5.groupby(5)
df_edible_groupby_5.count()


# In[63]:


df_poisonous_5 = df_poisonous[[0, 5]]
df_poisonous_5.describe()


# In[64]:


df_poisonous_groupby_5 = df_poisonous_5.groupby(5)
df_poisonous_groupby_5.count()


# - Attribute 16 (veil-type: partial=p, universal=u)

# In[65]:


map_dic[16]


# - Attribute 17 (veil-color: brown=n, orange=o, white=w, yellow=y)

# In[66]:


map_dic[17]


# In[67]:


df_edible_17 = df_edible[[0, 17]]
df_edible_17.describe()


# In[68]:


df_edible_groupby_17 = df_edible_17.groupby(17)
df_edible_groupby_17.count()


# In[69]:


df_poisonous_17 = df_poisonous[[0, 17]]
df_poisonous_17.describe()


# In[70]:


df_poisonous_groupby_17 = df_poisonous_17.groupby(17)
df_poisonous_groupby_17.count()