Structure of a data analysis -- in Python¶

In [1]:

import numpy as np
import pandas as pd
import pandas.rpy.common as com

In [5]:

# load the data: the `spam` dataset from the `kernlab` package, fetched
# through the pandas.rpy helper imported above as `com`
spam = com.load_data('spam', package='kernlab')

# get the number of rows and columns in the data
# (left as the last expression of the cell so the notebook displays it)
spam.shape

Out[5]:

(4601, 58)

Here we'll use the rpy2 interface to generate the same trainIndicator variable as in the course: the course draws its random sample in R with a fixed seed, so running the same R code reproduces the same split of the data.

In [6]:

# load the rmagic extension; the next cell relies on its %%R cell magic
%load_ext rmagic

In [7]:

%%R -o trainIndicator
# fix the seed so the random draw below is repeatable
set.seed(3435)
# one 0/1 draw (probability 0.5) per row of the spam data (4601 rows);
# the -o flag on the magic line hands trainIndicator back to Python
trainIndicator = rbinom(4601,size=1,prob=0.5)

Now the trainIndicator variable is in the Python namespace, stored as a NumPy array.

In [8]:

# this is equivalent to the R table command:
# wrap the array in a Series and count how often each value occurs
pd.Series(trainIndicator).value_counts()

Out[8]:

0    2314
1    2287

In [9]:

## another way to draw the indicator completely in Python
## NOTE(review): NumPy's generator is not expected to reproduce R's random
## stream for the same seed, so this split would presumably differ from the
## one used in the course -- confirm before enabling
# np.random.seed(3435)
# trainIndicator = np.random.binomial(1, 0.5, 4601)
# np.bincount(trainIndicator)

In [10]:

# partition the rows by the indicator: 1 -> training set, 0 -> test set
trainSpam, testSpam = spam[trainIndicator == 1], spam[trainIndicator == 0]

# (rows, columns) of the training set
trainSpam.shape

Out[10]:

(2287, 58)

In [11]:

# list the column labels of the training frame
print(trainSpam.columns)

Index([make, address, all, num3d, our, over, remove, internet, order, mail, receive, will, people, report, addresses, free, business, email, you, credit, your, font, num000, money, hp, hpl, george, num650, lab, labs, telnet, num857, data, num415, num85, technology, num1999, parts, pm, direct, cs, meeting, original, project, re, edu, table, conference, charSemicolon, charRoundbracket, charSquarebracket, charExclamation, charDollar, charHash, capitalAve, capitalLong, capitalTotal, type], dtype=object)

In [12]:

# equivalent of the R head command: keep the first 20 columns (picked by
# their labels) and show the first rows of that sub-frame
trainSpam[trainSpam.columns[:20]].head()

Out[12]:

	make	address	all	our	over	remove	order	mail	receive	will	people	free	email	you	credit
1	0.00	0.64	0.64	0.32	0.00	0.00	0.00	0.00	0.00	0.64	0.00	0.32	1.29	1.93	0.00
7	0.00	0.00	0.00	1.92	0.00	0.00	0.00	0.64	0.96	1.28	0.00	0.96	0.32	3.85	0.00
9	0.15	0.00	0.46	0.61	0.00	0.30	0.92	0.76	0.76	0.92	0.00	0.00	0.15	1.23	3.53
12	0.00	0.00	0.25	0.38	0.25	0.25	0.00	0.00	0.12	0.12	0.12	0.00	0.00	1.16	0.00
14	0.00	0.00	0.00	0.90	0.00	0.90	0.00	0.90	0.90	0.00	0.90	0.00	0.00	2.72	0.00

In [13]:

# compute frequency counts, similar to R table command:
# number of training rows for each distinct value of the `type` column
trainSpam['type'].value_counts()

Out[13]:

nonspam    1381
spam        906

Plots¶

In [14]:

# boxplot using pandas boxplot
# (the `boxplot` name imported here is reused by the log-scale plot in the next cell)
from pandas.tools.plotting import boxplot

# capitalAve grouped by the `type` column; the trailing `;` keeps the
# call's return value out of the cell output
boxplot(trainSpam, column='capitalAve', by='type');

In [15]:

# take the log
#
# Work on an explicit copy of the two columns so the assignment below can
# never write into (or warn about writing into) a slice of trainSpam, and
# call np.log10 on the whole column at once instead of element by element.
# The + 1 shift means an entry of 0 maps to 0 rather than -inf.
df = trainSpam[['capitalAve', 'type']].copy()
df['capitalAve'] = np.log10(df['capitalAve'] + 1)

# same grouped boxplot as before, now on the log-scaled values
boxplot(df, column='capitalAve', by='type');

In [16]:

# relationships between predictors
from pandas.tools.plotting import scatter_matrix

# pairwise scatter plots for the first four columns of the frame
pairCols = ['make', 'address', 'all', 'num3d']
scatter_matrix(trainSpam[pairCols], figsize=(7, 7), marker='o');

/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.py:3576: FutureWarning: rename with inplace=True  will return None from pandas 0.11 onward
  " from pandas 0.11 onward", FutureWarning)

In [17]:

# hierarchical clustering of the columns
# (needs the third-party hcluster package)
from hcluster import pdist, linkage, dendrogram

# Take the column block once; it is transposed for the distance computation
# and its column labels name the leaves of the dendrogram.
# NOTE(review): this cell uses the first 56 columns while the log-scale cell
# below uses the first 55 -- confirm which range is intended.
rawPredictors = trainSpam.ix[:,:56]
dendrogram(linkage(pdist(rawPredictors.T)), labels=rawPredictors.columns);

In [18]:

# take the log
#
# Slice the first 55 columns once, apply log10(x + 1) to the whole block in a
# single vectorised call (instead of a Python lambda run once per transposed
# column), then transpose for the distance computation. The same slice
# supplies the leaf labels, so data and labels cannot drift apart.
logCols = trainSpam.ix[:,:55]
dendrogram(linkage(pdist(np.log10(logCols + 1).T)), labels=logCols.columns);

In [ ]: