import numpy as np
import pandas as pd
import pandas.rpy.common as com

# Load the `spam` dataset that ships with the R package `kernlab`, using the
# pandas/rpy bridge imported above as `com`.
# NOTE(review): presumably this returns a pandas DataFrame, since later code
# indexes it by column name - confirm against the installed pandas.rpy.
spam = com.load_data('spam', package='kernlab')

# Dimensions of the data (rows, columns); displayed as output when this is the
# last expression of a notebook cell.
spam.shape

# Enable the rmagic IPython extension, which provides the %%R cell magic used
# below to run R code from inside the notebook.
%load_ext rmagic

%%R -o trainIndicator
# (R code) Fix the seed so the random split is reproducible, then draw 4601
# independent 0/1 values with probability 0.5 each. The `-o trainIndicator`
# option on the %%R line copies the result back into the Python namespace.
# NOTE(review): 4601 is presumably the number of rows in `spam` - confirm.
set.seed(3435)
trainIndicator = rbinom(4601,size=1,prob=0.5)

# Tabulate how many 0s and 1s were drawn (the pandas counterpart of R's
# table() command).
pd.Series(trainIndicator).value_counts()

## A pure-Python way to build the indicator instead. NumPy's generator is not
## expected to reproduce the exact draw that R produces from the same seed.
# np.random.seed(3435)
# trainIndicator = np.random.binomial(1, 0.5, 4601)
# np.bincount(trainIndicator)

# Partition the rows: indicator 1 -> training set, indicator 0 -> test set.
in_train = trainIndicator == 1
in_test = trainIndicator == 0
trainSpam = spam[in_train]
testSpam = spam[in_test]
trainSpam.shape

# Show the column names. Calling print with a single parenthesised argument
# behaves the same under the Python 2 print statement and the Python 3 print
# function, so this line runs on either interpreter.
print(trainSpam.columns)

# Equivalent of R's head(): the first rows of the first 20 columns.
# .iloc selects strictly by position, which is what the integer slice intends;
# .ix guessed between labels and positions and has been removed from pandas.
trainSpam.iloc[:, :20].head()

# Frequency counts of the `type` column, similar to R's table() command.
trainSpam['type'].value_counts()

# Boxplots of capitalAve grouped by `type`, using the pandas boxplot helper.
from pandas.tools.plotting import boxplot

# Raw values. The trailing semicolon suppresses the repr of the returned
# object in notebook output.
boxplot(trainSpam, column='capitalAve', by='type');

# Same plot on a log scale.
#
# Work on an explicit copy so the assignment below writes to a standalone
# frame rather than to a slice of trainSpam (assigning into a slice triggers
# SettingWithCopyWarning and leaves it unclear which object is modified).
df = trainSpam[['capitalAve', 'type']].copy()
# log10(x + 1) keeps x == 0 finite; the whole column is transformed in one
# vectorized call instead of element by element through apply().
df['capitalAve'] = np.log10(df['capitalAve'] + 1)

boxplot(df, column='capitalAve', by='type');

# Pairwise scatter plots to inspect relationships between a few predictors.
from pandas.tools.plotting import scatter_matrix

selected = trainSpam[['make', 'address', 'all', 'num3d']]
scatter_matrix(selected, figsize=(7, 7), marker='o');

# Hierarchical clustering of the columns of trainSpam.
# Requires the hcluster package.
from hcluster import pdist, linkage, dendrogram

# Transpose so that each row handed to pdist() is one column of trainSpam;
# the dendrogram leaves are then labelled with the column names.
# .iloc makes the positional slice explicit (replaces the removed .ix).
first56 = trainSpam.iloc[:, :56]
dendrogram(linkage(pdist(first56.T)), labels=first56.columns);

# Repeat on log10(x + 1)-transformed values, computed in one vectorized call.
# NOTE(review): this plot uses the first 55 columns while the one above uses
# the first 56 - confirm that the difference is intentional.
first55 = trainSpam.iloc[:, :55]
dendrogram(linkage(pdist(np.log10(first55.T + 1))), labels=first55.columns);