import numpy as np
import pandas as pd
import pandas.rpy.common as com
# load the data: fetch the `spam` dataset from the R package `kernlab`
# through the pandas/rpy bridge imported as `com`
spam = com.load_data('spam', package='kernlab')
# get the number of rows and columns in the data
# (kept as the last bare expression of the cell so the notebook displays it)
spam.shape
(4601, 58)
Here we'll use the rpy2 interface (through the `rmagic` IPython extension) to obtain the same `trainIndicator` variable as in the course, since the course uses a seeded random draw in R to split the data.
# load the rmagic IPython extension before using the %%R cell magic below
%load_ext rmagic
%%R -o trainIndicator
# the `-o trainIndicator` option above hands the R variable back to Python
# fix R's random seed so that the draw below is reproducible
set.seed(3435)
# one binomial draw (size=1, prob=0.5) for each of the 4601 rows, giving a 0/1 indicator
trainIndicator = rbinom(4601,size=1,prob=0.5)
Now the `trainIndicator` variable is available in the Python namespace, stored as a NumPy array.
# this is equivalent to the R table command:
# count how many entries of trainIndicator take each distinct value
pd.Series(trainIndicator).value_counts()
0 2314 1 2287
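## another way to do it completely in Python (note: NumPy's generator does not reproduce R's random stream, so even with the same seed the split will differ from the course's):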
# np.random.seed(3435)
# trainIndicator = np.random.binomial(1, 0.5, 4601)
# np.bincount(trainIndicator)
# Partition the rows by the indicator: rows flagged 1 form the training set,
# rows flagged 0 form the test set.
trainSpam, testSpam = spam[trainIndicator == 1], spam[trainIndicator == 0]
# dimensions of the training set, displayed as the cell's output
trainSpam.shape
(2287, 58)
# show column names
# (a parenthesised single-argument print produces the same output as the
# Python 2 print statement and is also valid Python 3 syntax)
print(trainSpam.columns)
Index([make, address, all, num3d, our, over, remove, internet, order, mail, receive, will, people, report, addresses, free, business, email, you, credit, your, font, num000, money, hp, hpl, george, num650, lab, labs, telnet, num857, data, num415, num85, technology, num1999, parts, pm, direct, cs, meeting, original, project, re, edu, table, conference, charSemicolon, charRoundbracket, charSquarebracket, charExclamation, charDollar, charHash, capitalAve, capitalLong, capitalTotal, type], dtype=object)
# this is the equivalent of R head command:
# restrict to the first 20 columns, then display the leading rows
# NOTE(review): `.ix` is deprecated and later removed in newer pandas releases;
# confirm the target pandas version before re-running this cell
trainSpam.ix[:,:20].head()
| | make | address | all | num3d | our | over | remove | internet | order | mail | receive | will | people | report | addresses | free | business | email | you | credit |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 0.00 | 0.64 | 0.64 | 0 | 0.32 | 0.00 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.64 | 0.00 | 0 | 0 | 0.32 | 0 | 1.29 | 1.93 | 0.00 |
7 | 0.00 | 0.00 | 0.00 | 0 | 1.92 | 0.00 | 0.00 | 0 | 0.00 | 0.64 | 0.96 | 1.28 | 0.00 | 0 | 0 | 0.96 | 0 | 0.32 | 3.85 | 0.00 |
9 | 0.15 | 0.00 | 0.46 | 0 | 0.61 | 0.00 | 0.30 | 0 | 0.92 | 0.76 | 0.76 | 0.92 | 0.00 | 0 | 0 | 0.00 | 0 | 0.15 | 1.23 | 3.53 |
12 | 0.00 | 0.00 | 0.25 | 0 | 0.38 | 0.25 | 0.25 | 0 | 0.00 | 0.00 | 0.12 | 0.12 | 0.12 | 0 | 0 | 0.00 | 0 | 0.00 | 1.16 | 0.00 |
14 | 0.00 | 0.00 | 0.00 | 0 | 0.90 | 0.00 | 0.90 | 0 | 0.00 | 0.90 | 0.90 | 0.00 | 0.90 | 0 | 0 | 0.00 | 0 | 0.00 | 2.72 | 0.00 |
# compute frequency counts, similar to R table command:
# number of training rows for each value of the `type` column
trainSpam['type'].value_counts()
nonspam 1381 spam 906
# boxplot using pandas boxplot:
# draw `capitalAve` grouped by the `type` column
from pandas.tools.plotting import boxplot
# NOTE(review): the trailing semicolon presumably suppresses the notebook's
# echo of the call's return value
boxplot(trainSpam, column='capitalAve', by='type');
# take the log
#
# Plot capitalAve on a log10(x + 1) scale (the transform is defined at x = 0).
# The two columns are taken as an explicit copy so that assigning the
# transformed column cannot write through to trainSpam or trigger a
# chained-assignment warning, and np.log10 is applied to the whole column at
# once rather than through a per-element Python lambda.
df = trainSpam[['capitalAve', 'type']].copy()
df['capitalAve'] = np.log10(df['capitalAve'] + 1)
boxplot(df, column='capitalAve', by='type');
# relationships between predictors:
# pairwise scatter plots for a handful of the word-frequency columns
from pandas.tools.plotting import scatter_matrix
pair_columns = ['make', 'address', 'all', 'num3d']
scatter_matrix(trainSpam[pair_columns], figsize=(7, 7), marker='o');
/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.py:3576: FutureWarning: rename with inplace=True will return None from pandas 0.11 onward " from pandas 0.11 onward", FutureWarning)
# hierarchical clustering of the variables
# requires the hcluster package
from hcluster import pdist, linkage, dendrogram
# Take the first 56 columns and transpose them so that each row handed to
# pdist is one variable; the column names label the leaves of the tree.
cluster_columns = trainSpam.ix[:, :56]
column_distances = pdist(cluster_columns.T)
dendrogram(linkage(column_distances), labels=cluster_columns.columns);
# repeat the clustering on log10(x + 1)-transformed values
# NOTE(review): this cell uses the first 55 columns whereas the untransformed
# dendrogram uses the first 56 -- confirm which range is intended.
log_columns = trainSpam.ix[:, :55]
log_distances = pdist(log_columns.T.apply(lambda values: np.log10(values + 1)))
dendrogram(linkage(log_distances), labels=log_columns.columns);