import numpy as np import pandas as pd import pandas.rpy.common as com # load the data spam = com.load_data('spam', package='kernlab') # get the number of rows and columns in the data spam.shape %load_ext rmagic %%R -o trainIndicator set.seed(3435) trainIndicator = rbinom(4601,size=1,prob=0.5) # this is equivalent to the R table command pd.Series(trainIndicator).value_counts() ## another way to do it completely in Python: # np.random.seed(3435) # trainIndicator = np.random.binomial(1, 0.5, 4601) # np.bincount(trainIndicator) # split into training and test data trainSpam = spam[trainIndicator == 1] testSpam = spam[trainIndicator == 0] trainSpam.shape # show column names print trainSpam.columns # this is the equivalent of R head command trainSpam.ix[:,:20].head() # compute frequency counts, similar to R table command trainSpam['type'].value_counts() # boxplot using pandas boxplot from pandas.tools.plotting import boxplot boxplot(trainSpam, column='capitalAve', by='type'); # take the log # # here we use the apply() function to compute the log df = trainSpam[['capitalAve', 'type']] df['capitalAve'] = df['capitalAve'].apply(lambda x: np.log10(x + 1)) boxplot(df, column='capitalAve', by='type'); # relationships between predictors from pandas.tools.plotting import scatter_matrix scatter_matrix(trainSpam[['make', 'address', 'all', 'num3d']], figsize=(7, 7), marker='o'); # hierarchical clustring # need hcluster package from hcluster import pdist, linkage, dendrogram dendrogram(linkage(pdist(trainSpam.ix[:,:56].T)), labels=trainSpam.ix[:,:56].columns); # take the log dendrogram(linkage(pdist(trainSpam.ix[:,:55].T.apply(lambda x: np.log10(x + 1)))), labels=trainSpam.ix[:,:55].columns);