import urllib2
from scipy import stats
from pandas import Series, DataFrame
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
# Download the mushroom data set and load it into a DataFrame.
path = 'http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'
# Column labels are the integers 0..22; because they are passed via `names`,
# pandas treats the first line of the file as data rather than as a header.
# list(...) keeps this a real list on both Python 2 and Python 3.
col_names = list(range(23))
raw_csv = urllib2.urlopen(path)
try:
    df = pd.read_csv(raw_csv, names=col_names)
finally:
    # Release the HTTP connection once parsing is done (or has failed).
    raw_csv.close()

# Quick look at the first rows and the per-column summary.
df.head()
df.describe()

# Missing-value check: element-wise mask, its raw array, then a single
# boolean telling whether any cell at all is null.
df.isnull().head()
df.isnull().values
df.isnull().values.any()
# Toy 5x2 frame used to try out the groupby API before applying it to the
# real data.
temp_df = DataFrame(
    {'A': [1, 1, 5, 5, 5], 'B': [2, 4, 6, 4, 4]},
    columns=['A', 'B'],
)
temp_df

# --- Group by column 'A' (keys: 1 and 5) ---
groupby_temp_df = temp_df.groupby(by='A')
groupby_temp_df.groups          # key -> row labels belonging to that key
groupby_temp_df.get_group(1)    # rows where A == 1
groupby_temp_df.get_group(5)    # rows where A == 5
groupby_temp_df.count()         # non-null count per group
groupby_temp_df.head(1)         # first row of every group
groupby_temp_df.describe()      # summary statistics per group

# --- Group by column 'B' (keys: 2, 4 and 6) ---
groupby_temp_df2 = temp_df.groupby(by='B')
groupby_temp_df2.groups
groupby_temp_df2.get_group(2)
groupby_temp_df2.get_group(4)
groupby_temp_df2.get_group(6)
groupby_temp_df2.count()
groupby_temp_df2.head(1)
groupby_temp_df2.describe()
# Same groupby exploration on the real data: keep only column 0 and
# column 1, then group the rows by the value found in column 1.
df.loc[:, [0, 1]].head(5)
a = df.loc[:, [0, 1]].groupby(by=1)
print(type(a))

a.count()            # rows per distinct column-1 value
a.head(2)            # first two rows of every group
a.groups.keys()      # the distinct column-1 values
a.size()             # group sizes
a.ngroups            # number of distinct column-1 values
a.get_group("c")     # rows whose column-1 value is "c"
a.describe()
# Per-attribute summary. For every feature column i (1..22) the rows of
# [column 0, column i] are grouped by column i and the following is stored:
#   'ngorups'    - number of distinct values in column i
#   'group_keys' - list of those distinct values
#   'subgroups'  - position j -> sub-frame of rows whose column-i value is
#                  group_keys[j]
df_per_attr = {}
for i in range(1, 23):
    groupby_df = df[[0, i]].groupby(i)
    # Materialise the keys as a list: a dict view cannot be indexed by
    # position on Python 3, and 'subgroups' is addressed by position.
    group_keys = list(groupby_df.groups.keys())
    df_per_attr[i] = {
        # NOTE: the key spelling 'ngorups' is kept unchanged because other
        # code may look the value up under this name.
        'ngorups': groupby_df.ngroups,
        'group_keys': group_keys,
        'subgroups': {
            j: groupby_df.get_group(key)
            for j, key in enumerate(group_keys)
        },
    }

# Spot-check the result for column 1.
df_per_attr[1]['group_keys']
df_per_attr[1]['subgroups'][0]
# Encode the class label in column 0: 'p' -> 1, 'e' -> 0.
label_map = {'p': 1, 'e': 0}
df[0] = df[0].map(label_map)
df
print("%d %d" % (df.shape[0], df.shape[1]))

num_columns = df.shape[1]

# map_dic[i] maps every original value of column i to its integer code.
# Column 0 already carries the explicit label encoding above.
map_dic = {0: label_map}

# Start at column 1. Re-encoding column 0 here would renumber the labels in
# order of first appearance (Series.unique preserves that order), which can
# silently swap the meaning of 0 and 1 depending on the first row of data.
for i in range(1, num_columns):
    unique_array = df[i].unique()
    # Codes are assigned 0..N-1 in order of first appearance in the column.
    map_dic[i] = {value: code for code, value in enumerate(unique_array)}
    df[i] = df[i].map(map_dic[i])
df
map_dic
df.describe()
# Rescale the integer codes of every feature column (1 .. num_columns - 1)
# so that code j becomes j / (N - 1), where N is the number of distinct
# values in the column. This relies on the column already holding the codes
# 0..N-1 produced by the preceding encoding step.
for i in range(1, num_columns):
    unique_array = df[i].unique()
    N = len(unique_array)
    # Code 0 is mapped to a plain 0 without dividing, which also avoids a
    # 0 / 0 when the column has a single distinct value (N == 1).
    map_dic_sub = {
        j: (j / float(N - 1) if j != 0 else 0)
        for j in range(N)
    }
    df[i] = df[i].map(map_dic_sub)
df
df.describe()
# Split the samples by class label and draw one box plot per class.
# NOTE(review): the split relies on column 0 still holding the
# {'p': 1, 'e': 0} encoding from the label-mapping step; confirm that no
# later re-encoding of column 0 has changed which code means which class.
df_edible = df.loc[df[0] == 0] # 0: edible
df_poisonous = df.loc[df[0] == 1] # 1: poisonous

# Rows selected with label 0: summary, then a wide (15 x 4 inch) box plot
# of all columns.
df_edible.describe()
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(15, 4))
df_edible.boxplot(ax=ax)
plt.show()

# Same view for the rows selected with label 1.
df_poisonous.describe()
fig, ax = plt.subplots(figsize=(15, 4))
df_poisonous.boxplot(ax=ax)
plt.show()
# Column 4: how many samples of each class fall on each of its values.
# map_dic[4] gives original value -> integer code; the group keys below are
# the rescaled codes (code / (N - 1)), not the original values.
map_dic[4]

# Rows of df_edible, counted per column-4 value.
df_edible_4 = df_edible.loc[:, [0, 4]]
df_edible_4.describe()
df_edible_groupby_4 = df_edible_4.groupby(by=4)
df_edible_groupby_4.count()

# Rows of df_poisonous, counted per column-4 value.
df_poisonous_4 = df_poisonous.loc[:, [0, 4]]
df_poisonous_4.describe()
df_poisonous_groupby_4 = df_poisonous_4.groupby(by=4)
df_poisonous_groupby_4.count()
# Column 5: how many samples of each class fall on each of its values.
# map_dic[5] gives original value -> integer code; the group keys below are
# the rescaled codes (code / (N - 1)), not the original values.
map_dic[5]

# Rows of df_edible, counted per column-5 value.
df_edible_5 = df_edible.loc[:, [0, 5]]
df_edible_5.describe()
df_edible_groupby_5 = df_edible_5.groupby(by=5)
df_edible_groupby_5.count()

# Rows of df_poisonous, counted per column-5 value.
df_poisonous_5 = df_poisonous.loc[:, [0, 5]]
df_poisonous_5.describe()
df_poisonous_groupby_5 = df_poisonous_5.groupby(by=5)
df_poisonous_groupby_5.count()
# Inspect the value -> code tables of columns 16 and 17, then break
# column 17 down per class. The group keys below are the rescaled codes
# (code / (N - 1)), not the original values.
map_dic[16]
map_dic[17]

# Rows of df_edible, counted per column-17 value.
df_edible_17 = df_edible.loc[:, [0, 17]]
df_edible_17.describe()
df_edible_groupby_17 = df_edible_17.groupby(by=17)
df_edible_groupby_17.count()

# Rows of df_poisonous, counted per column-17 value.
df_poisonous_17 = df_poisonous.loc[:, [0, 17]]
df_poisonous_17.describe()
df_poisonous_groupby_17 = df_poisonous_17.groupby(by=17)
df_poisonous_groupby_17.count()