#!/usr/bin/env python
# coding: utf-8

# # Mushroom Dataset

# ## 1. Data Set Description
#   - http://archive.ics.uci.edu/ml/datasets/Mushroom
#   - Attribute Information
#     - 0 - classes (target attribute): edible=e, poisonous=p
#     - 1 - cap-shape: bell=b, conical=c, convex=x, flat=f, knobbed=k, sunken=s
#     - 2 - cap-surface: fibrous=f, grooves=g, scaly=y, smooth=s
#     - 3 - cap-color: brown=n, buff=b, cinnamon=c, gray=g, green=r, pink=p, purple=u, red=e, white=w, yellow=y
#     - 4 - bruises: bruises=t, no=f
#     - 5 - odor: almond=a, anise=l, creosote=c, fishy=y, foul=f, musty=m, none=n, pungent=p, spicy=s
#     - 6 - gill-attachment: attached=a, descending=d, free=f, notched=n
#     - 7 - gill-spacing: close=c,crowded=w,distant=d
#     - 8 - gill-size: broad=b, narrow=n
#     - 9 - gill-color: black=k, brown=n, buff=b, chocolate=h, gray=g, green=r, orange=o, pink=p, purple=u, red=e, white=w, yellow=y
#     - 10 - stalk-shape: enlarging=e, tapering=t
#     - 11 - stalk-root: bulbous=b, club=c, cup=u, equal=e, rhizomorphs=z, rooted=r, missing=?
#     - 12 - stalk-surface-above-ring: fibrous=f, scaly=y, silky=k, smooth=s
#     - 13 - stalk-surface-below-ring: fibrous=f, scaly=y, silky=k, smooth=s
#     - 14 - stalk-color-above-ring: brown=n, buff=b, cinnamon=c, gray=g, orange=o, pink=p, red=e, white=w, yellow=y
#     - 15 - stalk-color-below-ring: brown=n, buff=b, cinnamon=c, gray=g, orange=o, pink=p, red=e, white=w, yellow=y
#     - 16 - veil-type: partial=p, universal=u
#     - 17 - veil-color: brown=n, orange=o, white=w, yellow=y
#     - 18 - ring-number: none=n, one=o, two=t
#     - 19 - ring-type: cobwebby=c, evanescent=e, flaring=f, large=l, none=n, pendant=p, sheathing=s, zone=z
#     - 20 - spore-print-color: black=k, brown=n, buff=b, chocolate=h, green=r, orange=o, purple=u, white=w, yellow=y
#     - 21 - population: abundant=a, clustered=c, numerous=n, scattered=s, several=v, solitary=y
#     - 22 - habitat: grasses=g, leaves=l, meadows=m, paths=p, urban=u, waste=w, woods=d

# ## 2. Working with Pandas DataFrames

# ### 1) Loading Data

# In[1]:


import urllib2
from scipy import stats
from pandas import Series, DataFrame
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from contextlib import closing

# Render matplotlib figures inline; get_ipython() only exists when this
# script is executed by IPython/Jupyter.
get_ipython().run_line_magic('matplotlib', 'inline')

# Location of the UCI Mushroom data file.
path = 'http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'

# Columns are labelled 0..22 so they line up with the attribute indices in
# the header comment (0 is the edible/poisonous target). Because explicit
# names are supplied, the first line of the file is read as data.
col_names = list(range(23))

# Release the HTTP response as soon as pandas has consumed it. closing() is
# used because the object returned by urllib2.urlopen does not itself support
# the `with` statement.
with closing(urllib2.urlopen(path)) as raw_csv:
    df = pd.read_csv(raw_csv, names=col_names)


# In[2]:


# First five rows of the raw, still categorical, data.
df.head()


# In[3]:


# Per-column summary statistics.
df.describe()


# ### 2) Missing Value 찾기

# In[4]:


# Boolean frame with the same shape as df: True wherever a cell is null.
# Note that the attribute list marks a missing stalk-root with the string
# '?', which isnull() does not report as null.
null_mask = df.isnull()
null_mask.head()


# In[5]:


# The same mask as a plain NumPy boolean array.
null_mask.values


# In[6]:


# Single flag: True if at least one cell anywhere in the frame is null.
null_mask.values.any()


# ### 3) Learning DataFrame groupby

# In[7]:


# Toy frame for exploring groupby: five rows and two integer columns.
temp_df = DataFrame(
    {'A': [1, 1, 5, 5, 5], 'B': [2, 4, 6, 4, 4]},
    columns=['A', 'B'],
)


# In[8]:


temp_df


# #### - Grouping by column A

# In[9]:


groupby_temp_df = temp_df.groupby(by='A')


# In[10]:


# Mapping of each distinct A value to the row labels that carry it.
groupby_temp_df.groups


# In[11]:


# Rows whose A value is 1.
groupby_temp_df.get_group(1)


# In[12]:


# Rows whose A value is 5.
groupby_temp_df.get_group(5)


# - Number of items present in each group

# In[13]:


groupby_temp_df.count()


# - How to return the first n rows of every group

# In[14]:


groupby_temp_df.head(n=1)


# In[15]:


# Summary statistics computed separately for each group.
groupby_temp_df.describe()


# #### - Grouping by column B

# In[16]:


groupby_temp_df2 = temp_df.groupby(by='B')


# In[17]:


# Mapping of each distinct B value to the row labels that carry it.
groupby_temp_df2.groups


# In[18]:


# Rows whose B value is 2.
groupby_temp_df2.get_group(2)


# In[19]:


# Rows whose B value is 4.
groupby_temp_df2.get_group(4)


# In[20]:


# Rows whose B value is 6.
groupby_temp_df2.get_group(6)


# In[21]:


# Number of items present in each B group.
groupby_temp_df2.count()


# In[22]:


# First row of every B group.
groupby_temp_df2.head(n=1)


# In[23]:


# Summary statistics computed separately for each B group.
groupby_temp_df2.describe()


# ### 4) Mushroom 데이터의 각 속성별로 그룹핑 작업을 위한 정보 확인

# In[24]:


# Target (column 0) side by side with cap-shape (column 1).
df[[0, 1]].head(5)


# In[25]:


# Group the target/cap-shape pairs by the cap-shape value.
a = df[[0, 1]].groupby(1)


# In[26]:


# Parenthesised so the statement is valid, and prints the same text, on both
# Python 2 and Python 3.
print(type(a))


# In[27]:


# Number of rows in each cap-shape group.
a.count()


# In[28]:


# First two rows of every cap-shape group.
a.head(2)


# In[29]:


# The distinct cap-shape values that form the groups.
a.groups.keys()


# In[30]:


# Size of each group as a Series.
a.size()


# In[31]:


# Number of distinct groups.
a.ngroups


# In[32]:


# Rows whose cap-shape is "c" (conical in the attribute list).
a.get_group("c")


# In[33]:


# Summary statistics computed separately for each cap-shape group.
a.describe()


# ### 5) 각 속성별 주요 정보를 담은 df_per_attr 사전 만들기

# In[34]:


# df_per_attr[i] collects, for each attribute column i in 1..22:
#   'ngroups'    - number of distinct values in the column
#   'ngorups'    - the same number under the original misspelled key, kept
#                  so that any existing lookup keeps working
#   'group_keys' - list of the distinct values
#   'subgroups'  - {j: rows of df[[0, i]] whose column i equals group_keys[j]}
df_per_attr = {}
for i in range(1, 23):
    groupby_df = df[[0, i]].groupby(i)
    # Materialise the keys as a list: on Python 3 dict.keys() returns a view
    # that cannot be indexed, and positions in this list must stay aligned
    # with the integer keys of 'subgroups'.
    group_keys = list(groupby_df.groups.keys())
    df_per_attr[i] = {
        'ngroups': groupby_df.ngroups,
        'ngorups': groupby_df.ngroups,
        'group_keys': group_keys,
        'subgroups': {
            j: groupby_df.get_group(key) for j, key in enumerate(group_keys)
        },
    }


# In[35]:


# Distinct values found for attribute 1 (cap-shape).
df_per_attr[1]['group_keys']


# In[36]:


# Rows belonging to the first listed cap-shape value.
df_per_attr[1]['subgroups'][0]


# ### 6) Categorical Attribute를 Numerical Attribute로 변환

# In[37]:


# Encode the target explicitly so the meaning of each code is fixed:
# 1 = poisonous, 0 = edible.
target_map = {'p': 1, 'e': 0}
df[0] = df[0].map(target_map)
df


# In[38]:


# Row count and column count. Formatted explicitly so the output is the same
# on Python 2 and Python 3.
print('%d %d' % (df.shape[0], df.shape[1]))


# In[39]:


num_columns = df.shape[1]
# map_dic[i] records how each original value of column i was turned into an
# integer code; the target's explicit mapping is stored under key 0.
map_dic = {0: target_map}
# Start at column 1: the target was already encoded above. Re-encoding it by
# order of first appearance would make its codes depend on which class shows
# up first in the file and could silently invert the 0 = edible /
# 1 = poisonous meaning that the later edible/poisonous split relies on.
for i in range(1, num_columns):
    # Codes are assigned in order of first appearance within the column.
    map_dic[i] = {value: code for code, value in enumerate(df[i].unique())}
    df[i] = df[i].map(map_dic[i])
df


# In[40]:


map_dic


# In[41]:


df.describe()


# ### 7) 각 컬럼별 Normalization

# In[42]:


# Rescale every feature column (the target in column 0 is skipped) so that
# code k becomes k / (n - 1), where n is the number of distinct values.
# Assumes each column already holds the integer codes 0..n-1 produced by the
# encoding step; any other value would map to NaN.
for i in range(1, num_columns):
    n_levels = len(df[i].unique())
    # Code 0 is pinned to the literal 0 so that a column with a single
    # distinct value never evaluates a division by zero.
    scale_map = {
        code: (code / float(n_levels - 1) if code != 0 else 0)
        for code in range(n_levels)
    }
    df[i] = df[i].map(scale_map)
df


# In[49]:


df.describe()


# ### 8) Edible Mushrooms과 Poisonous Mushrooms 의 두 개의 그룹핑 작업 및 각 그룹별 Boxplot 그리기
# - Typical Schematic Box Plot
#   - http://goo.gl/qh2ILu
#   - http://www.sfu.ca/~jackd/Stat203/Wk02_1_Full.pdf
# ![Box Plot 설명](boxplot.png)

# In[50]:


# Split the rows on the target column: code 0 goes to df_edible and code 1
# to df_poisonous.
# NOTE(review): this relies on column 0 still being encoded as 0 = edible /
# 1 = poisonous - confirm the encoding step did not remap the target.
df_edible = df.loc[df[0] == 0]
df_poisonous = df.loc[df[0] == 1]


# In[51]:


df_edible.describe()


# In[52]:


import matplotlib.pyplot as plt
# One wide figure (15 x 4 inches) with a box per column of df_edible.
fig, ax = plt.subplots(figsize=(15, 4))
df_edible.boxplot(ax=ax)
plt.show()


# In[53]:


df_poisonous.describe()


# In[54]:


# Same layout for df_poisonous so the two plots can be compared directly.
fig, ax = plt.subplots(figsize=(15, 4))
df_poisonous.boxplot(ax=ax)
plt.show()


# ### 9) 두 그룹간에 확연하게 차이를 보이는 속성들에 대한 EDA 작업
# - 속성 4 (bruises: bruises=t (0), no=f (1))

# In[55]:


# Value-to-code mapping recorded for attribute 4 (bruises).
map_dic[4]


# In[56]:


# Target and attribute 4 for the rows in df_edible.
df_edible_4 = df_edible.loc[:, [0, 4]]
df_edible_4.describe()


# In[57]:


# Row count per attribute-4 value within df_edible.
df_edible_groupby_4 = df_edible_4.groupby(by=4)
df_edible_groupby_4.count()


# In[58]:


# Target and attribute 4 for the rows in df_poisonous.
df_poisonous_4 = df_poisonous.loc[:, [0, 4]]
df_poisonous_4.describe()


# In[59]:


# Row count per attribute-4 value within df_poisonous.
df_poisonous_groupby_4 = df_poisonous_4.groupby(by=4)
df_poisonous_groupby_4.count()


# - 속성 5 (odor: almond=a, anise=l, creosote=c, fishy=y, foul=f, musty=m, none=n, pungent=p, spicy=s)

# In[60]:


# Value-to-code mapping recorded for attribute 5 (odor).
map_dic[5]


# In[61]:


# Target and attribute 5 for the rows in df_edible.
df_edible_5 = df_edible.loc[:, [0, 5]]
df_edible_5.describe()


# In[62]:


# Row count per attribute-5 value within df_edible.
df_edible_groupby_5 = df_edible_5.groupby(by=5)
df_edible_groupby_5.count()


# In[63]:


# Target and attribute 5 for the rows in df_poisonous.
df_poisonous_5 = df_poisonous.loc[:, [0, 5]]
df_poisonous_5.describe()


# In[64]:


# Row count per attribute-5 value within df_poisonous.
df_poisonous_groupby_5 = df_poisonous_5.groupby(by=5)
df_poisonous_groupby_5.count()


# - 속성 16 (veil-type: partial=p, universal=u)

# In[65]:


# Value-to-code mapping recorded for attribute 16 (veil-type).
map_dic[16]


# - Attribute 17 (veil-color: brown=n, orange=o, white=w, yellow=y)

# In[66]:


# Value-to-code mapping recorded for attribute 17 (veil-color).
map_dic[17]


# In[67]:


# Target and attribute 17 for the rows in df_edible.
df_edible_17 = df_edible.loc[:, [0, 17]]
df_edible_17.describe()


# In[68]:


# Row count per attribute-17 value within df_edible.
df_edible_groupby_17 = df_edible_17.groupby(by=17)
df_edible_groupby_17.count()


# In[69]:


# Target and attribute 17 for the rows in df_poisonous.
df_poisonous_17 = df_poisonous.loc[:, [0, 17]]
df_poisonous_17.describe()


# In[70]:


# Row count per attribute-17 value within df_poisonous.
df_poisonous_groupby_17 = df_poisonous_17.groupby(by=17)
df_poisonous_groupby_17.count()