import pandas as pd
import numpy as np
from sklearn import datasets
from IPython.core import display
%pylab inline
rcParams['axes.grid'] = True
filter(lambda m: 'grid' in m, plt.rcParams.keys())
Welcome to pylab, a matplotlib-based Python environment [backend: module://IPython.zmq.pylab.backend_inline]. For more information, type 'help(pylab)'.
['axes.grid', 'axes3d.grid', 'grid.alpha', 'grid.color', 'grid.linestyle', 'grid.linewidth', 'keymap.grid', 'polaraxes.grid']
## Plot (Time) Series
# 1000 daily random observations starting on 2013-01-01.
daily_index = pd.date_range('1/1/2013', periods=1000, freq="D")
ts = pd.Series(randn(1000), index=daily_index)
ts.plot()

# Running total of the series, drawn on a figure of its own.
cts = ts.cumsum()
figure()
cts.plot()
<matplotlib.axes.AxesSubplot at 0x310a290>
## Plot series with labelling and styling
figure()
# 'k--' is a black dashed line; the label is the text legend() shows.
series_style = dict(label='cumulative ts', style='k--',
                    title="ts plot with style and label")
cts.plot(**series_style)
legend(loc='best')
<matplotlib.legend.Legend at 0x35a0390>
## PLOT DATAFRAME - a convenience to plot all columns with labels
# Four random columns A-D sharing the date index of `ts`.
df = pd.DataFrame(randn(1000, 4), index=ts.index, columns=['A', 'B', 'C', 'D'])
# axis=0 accumulates down the rows, i.e. a running total within each column.
cdf = df.cumsum(axis=0)
cdf.plot(title='default setting')
figure()
cdf.plot(title='turn off legend', legend=False)
<matplotlib.axes.AxesSubplot at 0x38b3690>
## plot different series in a dataframe EACH ON A DIFFERENT AXIS
## with subplots=True
NCOLS = len(cdf.columns)
# Two inches of height per column so the stacked panels stay readable.
stacked_size = (10, 2 * NCOLS)
cdf.plot(subplots=True, figsize=stacked_size, title="df with subplots")
legend(loc="best")
figure()
# NOTE(review): cdf is a cumulative sum of random noise and can go negative;
# a log-scaled y axis cannot show such points - confirm this is intended.
cdf.plot(subplots=True, title='with log y', logy=True, figsize=stacked_size)
array([<matplotlib.axes.AxesSubplot object at 0x4022750>, <matplotlib.axes.AxesSubplot object at 0x3b6a190>, <matplotlib.axes.AxesSubplot object at 0x3b8c4d0>, <matplotlib.axes.AxesSubplot object at 0x426e950>], dtype=object)
## Plot with pandas on a manually created grid of subplots
fig, axes = subplots(nrows=2, ncols=2, figsize=(10, 6))
# Flatten the 2x2 grid so that panels can be addressed by a single index.
axes = axes.flatten()
for i, col in enumerate(cdf.columns):
    panel = axes[i]
    cdf[col].plot(ax=panel)
    panel.set_title(col)
Use `by="class_label"` to plot class-wise conditional boxplots; it is derived from `groupby()`.
**Conditional plotting (mainly on different class labels) - several things to try:** (1) use the `by`, `class_column`, or `c` parameter if there is one; (2) use the `groupby()` function and chain it with the plot; (3) use `groupby()` explicitly, iterating over each group and its DataFrame.
Most of them are plotted by using the `kind` parameter of `plot`; some of them use extra parameters such as `x` and `y`.
## load data
# `display` has to be the display *function*. Importing the name `display`
# from IPython.core only binds the module of that name, which is not
# callable, so import the function explicitly for this cell.
from IPython.core.display import display

iris_raw = datasets.load_iris()
iris = pd.DataFrame(data=iris_raw.data, columns=iris_raw.feature_names)
# Keep the integer class label next to the measurement columns.
iris["Species"] = np.asarray(iris_raw.target, dtype='int32')
# Summary statistics as a rendered table, then the dtype of each column.
display(iris.describe())
print(iris.dtypes)
sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) | Species | |
---|---|---|---|---|---|
count | 150.000000 | 150.000000 | 150.000000 | 150.000000 | 150.000000 |
mean | 5.843333 | 3.054000 | 3.758667 | 1.198667 | 1.000000 |
std | 0.828066 | 0.433594 | 1.764420 | 0.763161 | 0.819232 |
min | 4.300000 | 2.000000 | 1.000000 | 0.100000 | 0.000000 |
25% | 5.100000 | 2.800000 | 1.600000 | 0.300000 | 0.000000 |
50% | 5.800000 | 3.000000 | 4.350000 | 1.300000 | 1.000000 |
75% | 6.400000 | 3.300000 | 5.100000 | 1.800000 | 2.000000 |
max | 7.900000 | 4.400000 | 6.900000 | 2.500000 | 2.000000 |
sepal length (cm) float64 sepal width (cm) float64 petal length (cm) float64 petal width (cm) float64 Species int32 dtype: object
## SCATTER PLOT
## sepal_length vs sepal_width
figure(figsize=(8, 8))
# The first two columns are the x and y variables of the scatter.
x_col, y_col = iris.columns[:2]
# The string literal below is a disabled alternative: one explicit plot call
# per species, each with its own marker style and legend label.
"""
iris[iris.Species==0].plot(x = iris.columns[0], y = iris.columns[1],
style = 'ro', label="species0")
iris[iris.Species==1].plot(x = iris.columns[0], y = iris.columns[1],
style = 'b*', label="species1")
iris[iris.Species==2].plot(x = iris.columns[0], y = iris.columns[1],
style = 'k^', label="species2")
legend(loc="best")
"""
# Active version: let groupby() issue the same plot call for each species.
iris.groupby(by="Species").plot(x=x_col, y=y_col, style='o')
xlabel(x_col)
ylabel(y_col)
<matplotlib.text.Text at 0x9fa89d0>
## BAR PLOT
species_counts = iris.Species.value_counts()
print(species_counts)
# One bar per row of the first column; printing the call's return value
# shows the Axes object the bars were drawn on.
first_col_axes = iris.iloc[:, 0].plot(kind='bar', figsize=(8, 8))
print(first_col_axes)
legend(loc='best')

# Grouped, then stacked, bars for a small random frame with columns a-d.
figure()
df2 = pd.DataFrame(rand(10, 4), columns=['a', 'b', 'c', 'd'])
df2.plot(kind='bar', title="BarPlot of DF")
figure()
df2.plot(kind='bar', stacked=True, title='Stacked BarPlot of DF')
legend(loc='best')
2 50 1 50 0 50 dtype: int64 Axes(0.125,0.125;0.775x0.775)
<matplotlib.legend.Legend at 0x6d33e50>
## HISTOGRAM - by different class labels
iris.Species.hist()
# The string literal below is a disabled alternative that bins a column with
# pd.cut() first and then draws the histogram of the bin codes.
"""
## hard way of drawing histogram in pandas -
## discretize the variables then draw the histogram
figure()
cut_sepalLen = pd.cut(iris.iloc[:, 0], 3)
print cut_sepalLen.levels
pd.Series(cut_sepalLen.labels).hist()
"""
class_counts = iris.Species.value_counts()
print(class_counts)
fig, axes = subplots(nrows=3, ncols=1, figsize=(3, 3 * 3))
# value_counts() does not return the labels in ascending order, so the loop
# index is only the panel position; title each panel with the class label
# whose rows it actually shows.
for i, clabel in enumerate(class_counts.keys()):
    iris[iris.Species == clabel].iloc[:, 0].hist(ax=axes[i], bins=15)
    axes[i].set_title('hist of %s in label % i' % (iris.columns[0], clabel))
2 50 1 50 0 50 dtype: int64
## HISTOGRAM - on the data frame as a whole
iris.hist(figsize=(6, 4))
## DRAW the class conditioning DIAGRAM again
# The string literal below is a disabled alternative that filters the frame
# by label by hand instead of using groupby().
"""
## HARD way
colors = ['r', 'g', 'b']
for i, label in enumerate(np.unique(iris.Species)):
figure()
iris[iris.Species==label].hist(color=colors[i], layout = (1, 5), figsize=(3 * 5, 3))
"""
colors = list('rgb')
# groupby() yields (label, sub-frame) pairs; the integer label doubles as the
# index into `colors`.
for species_id, species_df in iris.groupby(by='Species'):
    figure()
    print(species_id)
    print(type(species_df))
    _ = species_df.hist(color=colors[species_id], layout=(1, 5), figsize=(15, 3))
0 <class 'pandas.core.frame.DataFrame'> 1 <class 'pandas.core.frame.DataFrame'> 2 <class 'pandas.core.frame.DataFrame'>
## BOXPLOT
# Boxplot of the whole frame; the result is assigned to `_` so that the
# notebook does not echo the returned object.
_ = iris.boxplot(figsize = (10, 10))
## CLASS WISE BOXPLOT
# Same plot, split by the values of the 'Species' column via `by`.
_ = iris.boxplot(by = 'Species', figsize = (10, 10), )
## SCATTER MATRIX PLOT
# Disabled alternative: one scatter matrix per species group.
#map(lambda (gn, gd): pd.scatter_matrix(gd), iris.groupby('Species'))
# Pairwise scatter plots of all columns, with the class label passed as the
# colour argument and `diagonal = 'kde'` selecting the diagonal panels' plot.
_ = pd.scatter_matrix(iris, figsize = (10, 10), diagonal = 'kde', c = iris.Species)
## Andrews Curves - two obvious clusters
figure(figsize = (8, 8))
# `class_column` names the column that holds the class label of each row.
pd.tools.plotting.andrews_curves(iris, class_column='Species',)
<matplotlib.axes.AxesSubplot at 0xa26d110>
## Parallel Coordinates
figure(figsize=(8, 8))
# `class_column` names the column that holds the class label of each row.
pd.tools.plotting.parallel_coordinates(iris, class_column = 'Species')
## Observations from the plot:
## classes 1 and 2 are very similar to each other, and
## class 0 is well distinguished from the other two,
## best by petal_length and petal_width
<matplotlib.axes.AxesSubplot at 0x5db2590>
## LAG PLOT - FINDING STRUCTURES
# Lag plot of the first iris column (sepal length).
sepal_length = iris.iloc[:, 0]
pd.tools.plotting.lag_plot(sepal_length)
title("Sepal Length Structure")

# Lag plot of the fifth column (the Species label).
figure()
species_labels = iris.iloc[:, 4]
pd.tools.plotting.lag_plot(species_labels)
title("Species Structure")

# Synthetic signal: a sine wave of amplitude 0.6 plus random noise scaled
# by 0.1, sampled at 1000 points.
figure()
phase = np.linspace(-99 * np.pi, 99 * np.pi, num=1000)
data = pd.Series(0.1 * rand(1000) + 0.6 * np.sin(phase))
pd.tools.plotting.lag_plot(data)
title('sin signal with noise')
<matplotlib.text.Text at 0x10901b50>
## Autocorrelation plot - finding potential frequency or period
# `data` is the noisy sine series built for the lag plot.
pd.tools.plotting.autocorrelation_plot(data)
title('Sin + noise with freq=$1$')
figure()
# np.random.random draws uniform samples on [0, 1); draw from randn instead
# so the series really is the standard Gaussian noise the title announces.
pd.tools.plotting.autocorrelation_plot(pd.Series(np.random.randn(100)))
title('standard gaussian noise')
<matplotlib.text.Text at 0x102f9850>
# Bootstrap plot of the first iris column: 500 resamples of size 10.
sepal_length = iris.iloc[:, 0]
pd.tools.plotting.bootstrap_plot(sepal_length, color='gray', size=10, samples=500)
## RadViz, compared with parallel coordinates view
# Two square panels side by side: RadViz on the left, parallel coordinates
# on the right.
fig, axes = subplots(nrows=1, ncols=2, figsize=(16, 8))
radviz_ax, parallel_ax = axes
pd.tools.plotting.radviz(iris, class_column='Species', ax=radviz_ax)
pd.tools.plotting.parallel_coordinates(iris, class_column='Species', ax=parallel_ax)
<matplotlib.axes.AxesSubplot at 0xdcb01d0>
## load tips data
import pandas as pd
import pandas.tools.rplot as rplot
tips = pd.read_csv('data/tips.csv')
print(tips.head())
# Python's builtin any() would iterate over the column labels of the mask
# (always truthy), so calling the bare name `any` on a DataFrame is only
# meaningful when the pylab namespace has replaced it. Reduce the boolean
# mask itself so the check does not depend on which `any` is in scope.
has_missing = pd.isnull(tips).values.any()
print("any missing values: %s" % has_missing)
total_bill tip sex smoker day time size 0 16.99 1.01 Female No Sun Dinner 2 1 10.34 1.66 Male No Sun Dinner 3 2 21.01 3.50 Male No Sun Dinner 3 3 23.68 3.31 Male No Sun Dinner 2 4 24.59 3.61 Female No Sun Dinner 4 any missing values: False
# Scatter of tip against total bill on a trellis grid split by sex and smoker.
# The RPlot object gets a name of its own: binding it to `plot` would
# overwrite the `plot` function that the pylab namespace provides.
trellis = rplot.RPlot(tips, x='total_bill', y='tip')
trellis.add(rplot.TrellisGrid(['sex', 'smoker']))
#trellis.add(rplot.GeomDensity())
trellis.add(rplot.GeomScatter())
trellis.render(gcf())

# Same data with the two grid variables swapped (smoker first, then sex).
figure()
trellis = rplot.RPlot(tips, x='total_bill', y='tip')
#trellis.add(rplot.TrellisGrid(['sex', '.']))
trellis.add(rplot.TrellisGrid(['smoker', 'sex']))
#trellis.add(rplot.TrellisGrid(['sex', 'smoker']))
trellis.add(rplot.GeomScatter())
trellis.render()

# Boxplots of both numeric columns, grouped by the same two variables.
pd.tools.plotting.boxplot(tips, by=['sex', 'smoker'], column=['total_bill', 'tip'],
                          figsize=(5 * 2, 5))
array([<matplotlib.axes.AxesSubplot object at 0x6e27bd0>, <matplotlib.axes.AxesSubplot object at 0x73f8750>], dtype=object)
# One colour code per row: 'b' where sex is 'Female', 'r' for every other row.
markers = tips.sex.map(lambda sex: 'b' if sex == 'Female' else 'r')
#print markers
_ = pd.scatter_matrix(tips, figsize=(8, 8), c=markers.tolist(), marker='*')
## FINALLY A WORKING CONDITIONAL PLOTTING
## key parameters to set: c (color), s (size), cmap (using get_cmap())
## and probably labels
# Both the colour argument `c` and the size argument `s` are derived from the
# integer class label: `s` is (label + 1) * 50, so each class gets its own
# marker size, and the "Spectral" colormap is passed through `cmap`.
_ = pd.scatter_matrix(iris, figsize=(14, 14), c = iris.Species, label = iris.Species,
s = (iris.Species+1) * 50,
diagonal = 'kde', marker = 'o', cmap = get_cmap("Spectral"))