This closely follows the ggplot book. The syntax in Python is obviously different - for a start, we don't have a qplot function, but always build plot objects.
But let's get started.
#imports and R magic
try:
import exptools # Don't worry about this
exptools.load_software('pyggplot')
except ImportError:
pass
import pandas
import pandas.rpy
import pyggplot as gg
#set plot size in the notebook to 'tiny'
gg.ipython_plot_width = 200
gg.ipython_plot_height = 200
import rpy2.robjects as ro
quiet = ro.r('library("ggplot2")')
import numpy
%load_ext rpy2.ipython
running inside docker Find out what's changed in ggplot2 with news(Version == "1.0.0", package = "ggplot2")
#this is the example data set from the ggplot book
diamonds = pandas.rpy.common.load_data('diamonds')
#but the book only uses one hundred samples
numpy.random.seed(1410) # make this reproducible
chosen = numpy.random.choice(diamonds.index, 100)
dsmall = diamonds.ix[chosen]
%Rpush chosen
%R dsmall = diamonds[chosen, ]
None
#A simple scatter plot - in R using the qplot function
%R -w 150 -h 150 -u px plot(qplot(carat, price, data=diamonds))
#Which translates to ggplot(diamonds) + geom_point(aes(carat, price))
None #to silence [Listvector...] output
#Same thing in python
gg.Plot(diamonds).geom_point('carat', 'price')
#virtually every function of the Plot object returns the Plot object, so you can chain them like this.
#R allows passing in functions of values - python does not
%R -w 150 -h 150 -u px plot(qplot(log(carat), log(price), data=diamonds))
None
#But in this case, we can simply transform the axis
#And we'll do it step by step in this example
#(note: the R call plots log() values, while here the axes are log10-scaled and the columns stay untransformed)
p = gg.Plot(diamonds)
p.geom_point('carat', 'price')
p.scale_x_continuous(trans='log10')
p.scale_y_continuous(trans='log10')
#arguments to qplot can be combinations of existing variables
%R -w 150 -h 150 -u px plot(qplot(carat, x * y * z, data=diamonds))
None
#In Python, you have to specify a dataframe column - not a function to plot
#so we have to prepare the data in python.
#the product of the x, y and z columns is stored as a new column named 'xyz'
diamonds['xyz' ] = diamonds['x'] * diamonds['y'] * diamonds['z']
gg.Plot(diamonds).geom_point('carat','xyz')
#Colors and shapes
%R -w 400 -h 200 -u px plot(qplot(carat, price, data=dsmall, color=color, shape=cut))
None
#now we'll add some color and shapes
#color and shape are given as column names of dsmall
p = gg.Plot(dsmall).geom_point('carat', 'price', color='color', shape='cut')
p.ipython_plot_width = 400 #you can also pass the width to plot.render when writing to a file
p.ipython_plot_height = 200
#a bare p as the last expression displays the plot
p
#overplotting was a problem for the complete dataset, but we can reduce the opacity with the alpha parameter
%R -w 150 -h 150 -u px plot(qplot(carat, price, data = diamonds, alpha=I(1/100)))
None
#The same plot in python: alpha is passed as a plain number instead of I(1/100)
#In R: qplot(carat, price, data = diamonds, alpha=I(1/100))
#the trailing dot in 100. makes this a float division
gg.Plot(diamonds).geom_point('carat','price', alpha=1/100.)
#Adding a smoother plot, combining two geoms. Once with the qplot interface
%R -w 150 -h 150 -u px plot(qplot(carat, price, data = dsmall, geom = c("point", "smooth")))
#and once with the + geom interface.
#Our python stuff is closer to the geom interface
%R -w 150 -h 150 -u px plot(ggplot(dsmall) + geom_smooth(aes(carat, price)) + geom_point(aes(carat, price)))
None
#We could also have
#NOTE(review): the comment above is unfinished in the original - confirm what was meant
geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
#In Python
#points first, then a smoother over the same two columns; the loess method is requested explicitly
gg.Plot(dsmall).geom_point('carat','price').add_smooth('carat','price', method='loess')
#Horribly overplotted jitter plots
%R -w 150 -h 150 -u px plot(qplot(color, price / carat, data=diamonds, geom="jitter", alpha=I(1/5), position = position_jitter(w=0.1)))
None
#the ratio has to exist as a column before it can be plotted
diamonds['price/carat'] = diamonds['price'] / diamonds['carat']
gg.Plot(diamonds).geom_jitter('color', 'price/carat', alpha = 1/5.,
position = gg.position_jitter(w = 0.1)
)
#Box plots
#price per carat, one box per color
%R -w 150 -h 150 -u px plot(qplot(color, price / carat, data=diamonds, geom="boxplot"))
None
#same 'price/carat' column as used for the jitter plot; assigning it again just overwrites it with the same values
diamonds['price/carat'] = diamonds['price'] / diamonds['carat']
gg.Plot(diamonds).geom_boxplot('color', 'price/carat') # Note added _ between box and plot
#histograms...
%R -w 150 -h 150 -u px plot(qplot(carat, data=diamonds, geom='histogram', binwidth = 1))
None
gg.Plot(diamonds).geom_histogram('carat', binwidth=1)
#Density
#the density geom applied to the carat column
%R -w 150 -h 150 -u px plot(qplot(carat, data=diamonds, geom='density'))
None
gg.Plot(diamonds).geom_density('carat')
#bar charts
%R -w 150 -h 150 -u px plot(qplot(color, data=diamonds, geom='bar'))
None
#'..count..' is passed as the y mapping and stat='bin' is spelled out
gg.Plot(diamonds).geom_bar('color', '..count..', stat='bin') #The python wrapper does not set the default stat to bin
#switching datasets for the line plots
economics = pandas.rpy.common.load_data('economics')
#define an R helper that extracts the calendar year (POSIXlt years count from 1900)
%R year = function(x) as.POSIXlt(x)$year + 1900
#evaluate the helper in R on R's own economics$date and store the result as a pandas column
economics['year'] = ro.r('year(economics$date)') # convert date into a year
%R -w 150 -h 150 -u px plot(qplot(date,uempmed, data=economics, geom = "line"))
None
gg.Plot(economics).geom_line('date', 'uempmed')
#points connected by a path - first in R, then in python
%R -w 150 -h 150 -u px plot(qplot(unemploy / pop, uempmed, data = economics,geom = c("point", "path")))
None
#the ratio must be a real column; its name is chosen to read like the R expression
economics['unemploy / pop'] = economics['unemploy'] / economics['pop']
gg.Plot(economics).geom_point('unemploy / pop', 'uempmed').add_path() #You can omit mandatory mappings on subsequent calls
#the R helper year() is defined again here (same definition as in the earlier cell)
%R year = function(x) as.POSIXlt(x)$year + 1900
%R -w 300 -h 150 -u px plot(qplot(unemploy / pop, uempmed, data = economics,geom = "path", colour = year(date)))
None
p = gg.Plot(economics).geom_path('unemploy / pop', 'uempmed', color = 'year') # remember, we converted date into year earlier
p.ipython_plot_width = 300 # just for this plot
p
#Since the book talked about scale_area - which is now called scale_size_area
#but the example actually didn't need an area scaled size, here is an example
#first without size area
%R -w 300 -h 150 -u px plot(qplot(unemploy / pop, uempmed, data = economics,geom = "point", colour = year(date), size=unemploy) )
None
#Since the book talked about scale_area - which is now called scale_size_area
#but the example actually didn't need an area scaled size, here is an example
#then with it.
#(identical to the call above except for the added + scale_size_area())
%R -w 300 -h 150 -u px plot(qplot(unemploy / pop, uempmed, data = economics,geom = "point", colour = year(date), size=unemploy) + scale_size_area())
None
#python without
#color is mapped to the derived 'year' column so that it mirrors colour = year(date) in the R cells
p = gg.Plot(economics).geom_point('unemploy / pop', 'uempmed', color = 'year', size='unemploy')
p.ipython_plot_width = 300
p
#python with
#same plot, the only difference being the area-proportional size scale
p = gg.Plot(economics).geom_point('unemploy / pop', 'uempmed', color = 'year', size='unemploy').scale_size_area()
p.ipython_plot_width = 300
p
#faceting: one histogram row per color, x axis limited to 0..3
%R -w 250 -h 250 -u px plot(qplot(carat, data = diamonds, facets = color ~ .,geom = "histogram", binwidth = 0.1, xlim = c(0, 3)))
None
#facets = color ~ . becomes facet_grid(rows='color'); xlim becomes limits on the x scale
p = gg.Plot(diamonds).geom_histogram('carat', binwidth=0.1).facet_grid(rows='color').scale_x_continuous(limits = [0,3])
p.ipython_plot_height = 250
p.ipython_plot_width = 250
p
#the same facets, but with ..density.. as the y mapping
%R -w 200 -h 200 -u px plot(qplot(carat, ..density.., data=diamonds, facets = color ~ ., geom="histogram", binwidth=0.1, xlim=c(0,3)))
None
gg.Plot(diamonds).geom_histogram('carat', '..density..', binwidth=0.1).facet_grid(rows='color').scale_x_continuous(limits = [0,3])
#some more options from section 2.7 of the ggplot book
#limits, label renaming, log, title setting
#NOTE(review): the parenthesis after trans='log10' closes scale_y_continuous, so limits=c(0,5000)
#is an argument to plot() rather than to the y scale - possibly the reason it has no effect; confirm
%R -w 150 -h 150 -u px plot(ggplot(dsmall) + ggtitle("Hello world") + geom_point(aes(carat, price)) + scale_x_continuous(name='The X Axis', limits=c(1,2)) + scale_y_continuous( trans='log10'),limits=c(0,5000)) #ggplot helpfully ignores these limits
None
p = gg.Plot(dsmall).geom_point('carat', 'price')
p.scale_x_continuous(limits=[1,2.], name = 'The X-Axis')
p.scale_y_continuous(trans='log10') # passing limits together with a transformation gets an error somewhere in ggplot
p.title('hello world')
#I guess that's better than showing the wrong graph :)
#switching to the mpg data set for the following plots
mpg = pandas.rpy.common.load_data('mpg')
Here we use the ggplot + geom* syntax in R - see chapter 4 of the ggplot book
#scatter plot colored by the number of cylinders, treated as a discrete variable
%R -w 250 -h 250 -u px plot(ggplot(mpg) + geom_point(aes(displ, hwy, color=factor(cyl))))
None
#the conversion is stored back into the data frame, so later cells also see cyl as categorical
mpg['cyl'] = pandas.Categorical(mpg['cyl']) # equivalent to factor
gg.Plot(mpg).geom_point('displ', 'hwy', color='cyl')
#let's add some linear regression
%R -w 250 -h 250 -u px plot(ggplot(mpg) + geom_point(aes(displ, hwy, color=factor(cyl))) + geom_smooth(aes(displ, hwy, color=factor(cyl)), method='lm'))
None
#What if you don't want the linear regressions to be colored by cyl? use group
#geom_smooth gets no x/y here and no color, only group='cyl'
gg.Plot(mpg).geom_point('displ', 'hwy', color='cyl').geom_smooth(group='cyl',method='lm')
#We have to repeat color - it's not a required argument and therefore needs to be filled in again!
#NOTE(review): the call above does not actually repeat color; to mirror the R cell (smoother colored
#by cyl) color='cyl' would presumably have to be passed to geom_smooth as well - confirm the intent
#stacked bar charts
%R -w 250 -h 150 -u px plot(ggplot(diamonds) + geom_bar(aes(color, fill=cut)))
None
#the three python cells below differ only in the position argument (stack, dodge, fill)
p = gg.Plot(diamonds).geom_bar('color', '..count..', fill='cut', stat='bin', position=gg.position_stack())
p.ipython_plot_width = 250
p
#or side by side (the default)
p = gg.Plot(diamonds).geom_bar('color', '..count..', fill='cut', stat='bin', position=gg.position_dodge())
p.ipython_plot_width = 400
p
#or filled to 100%
gg.Plot(diamonds).geom_bar('color', '..count..', fill='cut', stat='bin', position=gg.position_fill())