pyggplot examples¶

This closely follows the ggplot book. The syntax in python is obviously different - for a start, we don't have a qplot function, but always build plot objects.

But let's get started.

In [1]:

#imports and R magic
try:
    import exptools # Don't worry about this
    exptools.load_software('pyggplot')
except ImportError:
    pass
import pandas
import pandas.rpy
import pyggplot as gg
#set plot size in the notebook to 'tiny' 
gg.ipython_plot_width = 200
gg.ipython_plot_height = 200
import rpy2.robjects as ro
quiet = ro.r('library("ggplot2")')
import numpy 
%load_ext rpy2.ipython

running inside docker
Find out what's changed in ggplot2 with
news(Version == "1.0.0", package = "ggplot2")

In [2]:

#this is the example data set from the ggplot book
diamonds = pandas.rpy.common.load_data('diamonds')
#but the book only uses one hundred samples
numpy.random.seed(1410) # make this reproducible
chosen = numpy.random.choice(diamonds.index, 100)
dsmall = diamonds.ix[chosen]

In [3]:

%Rpush chosen
%R dsmall = diamonds[chosen,  ]
None

In [4]:

#A simpule scatter plot - in R using the qplot function

%R -w 150 -h 150 -u px plot(qplot(carat, price, data=diamonds))
#Which translates to ggplot(diamonds) + geom_point(carat, price)
None #to silence [Listvector...] output

In [5]:

#Same thing in python
gg.Plot(diamonds).geom_point('carat', 'price')
#virtually every function of the Plot object returns the Plot object, so you can chain them like this.

Out[5]:

In [6]:

#R allows passing in functions of values - python does not
%R -w 150 -h 150 -u px plot(qplot(log(carat), log(price), data=diamonds))
None

In [7]:

#But in this case, we can simply transform the axis
#And we'll do it step  by step in this example
p = gg.Plot(diamonds)
p.geom_point('carat', 'price')
p.scale_x_continuous(trans='log10')
p.scale_y_continuous(trans='log10')

Out[7]:

In [8]:

#arguments to qplot can be combinations of existing variables
%R -w 150 -h 150 -u px  plot(qplot(carat, x * y * z, data=diamonds))
None

In [9]:

#In Python, you have to specify a dataframe column - not a function to plot
#so we have to prepare the data in python.
diamonds['xyz' ] = diamonds['x'] * diamonds['y'] * diamonds['z']
gg.Plot(diamonds).geom_point('carat','xyz')

Out[9]:

In [10]:

#Colors and shapes
%R -w 400 -h 200 -u px  plot(qplot(carat, price, data=dsmall, color=color, shape=cut))
None

In [11]:

#now we'll add some color and shapes
p = gg.Plot(dsmall).geom_point('carat', 'price', color='color', shape='cut')
p.ipython_plot_width = 400 #you can also pass the width to plot.render when writing to a file
p.ipython_plot_height = 200
p

Out[11]:

In [12]:

#overplotting was a problem for the complete dataset, but we can reduce the opacity with the alpha parameter
%R -w 150 -h 150 -u px  plot(qplot(carat, price, data = diamonds, alpha=I(1/100)))
None

In [13]:

#overplotting was a problem for the complete dataset, but we can reduce the opacity with the alpha parameter
#In R: qplot(carat, price, data = diamonds, alpha=I(1/100))
gg.Plot(diamonds).geom_point('carat','price', alpha=1/100.)

Out[13]:

In [14]:

#Adding a smoother plot, combining two geoms. Once with the qplot interface
%R -w 150 -h 150 -u px  plot(qplot(carat, price, data = dsmall, geom = c("point", "smooth")))
#and once with the + geom interface.
#Our python stuff is closer to the geom interface
%R -w 150 -h 150 -u px  plot(ggplot(dsmall) + geom_smooth(aes(carat, price)) + geom_point(aes(carat, price)))
None
#We could also have 

geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.

geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.

In [15]:

#In Python
gg.Plot(dsmall).geom_point('carat','price').add_smooth('carat','price', method='loess')

Out[15]:

In [16]:

#Hourribly overprinterd jitter plots
%R -w 150 -h 150 -u px  plot(qplot(color, price / carat, data=diamonds, geom="jitter", alpha=I(1/5), position = position_jitter(w=0.1)))
None

In [17]:

diamonds['price/carat'] = diamonds['price'] / diamonds['carat']
gg.Plot(diamonds).geom_jitter('color', 'price/carat', alpha = 1/5.,
    position = gg.position_jitter(w = 0.1)
                             )

Out[17]:

In [18]:

#Box plots
#Hourribly overprinterd jitter plots
%R -w 150 -h 150 -u px plot(qplot(color, price / carat, data=diamonds, geom="boxplot"))
None

In [19]:

diamonds['price/carat'] = diamonds['price'] / diamonds['carat']
gg.Plot(diamonds).geom_boxplot('color', 'price/carat') # Note added _ between box and plot

Out[19]:

In [20]:

#histograms...
%R -w 150 -h 150 -u px plot(qplot(carat, data=diamonds, geom='histogram', binwidth = 1))
None

In [21]:

gg.Plot(diamonds).geom_histogram('carat', binwidth=1) 

Out[21]:

In [22]:

#Density
#histograms...
%R -w 150 -h 150 -u px  plot(qplot(carat, data=diamonds, geom='density'))
None

In [23]:

gg.Plot(diamonds).geom_density('carat')

Out[23]:

In [24]:

#bar charts
%R -w 150 -h 150 -u px plot(qplot(color, data=diamonds, geom='bar'))
None

In [25]:

gg.Plot(diamonds).geom_bar('color', '..count..', stat='bin') #The python wrapper does not set the default stat to bin 

Out[25]:

In [26]:

#switching datasets for the line plots
economics = pandas.rpy.common.load_data('economics')
%R year = function(x) as.POSIXlt(x)$year + 1900
economics['year'] = ro.r('year(economics$date)') # convert date into a year

In [27]:

%R -w 150 -h 150 -u px plot(qplot(date,uempmed, data=economics, geom = "line"))
None

In [28]:

gg.Plot(economics).geom_line('date', 'uempmed')

Out[28]:

In [29]:

%R -w 150 -h 150 -u px plot(qplot(unemploy / pop, uempmed, data = economics,geom = c("point", "path")))
None

In [30]:

economics['unemploy / pop'] = economics['unemploy'] / economics['pop']
gg.Plot(economics).geom_point('unemploy / pop', 'uempmed').add_path() #You can omit mandatory mappings an subsequent calls

Out[30]:

In [31]:

%R year = function(x) as.POSIXlt(x)$year + 1900
%R -w 300 -h 150 -u px plot(qplot(unemploy / pop, uempmed, data = economics,geom = "path", colour = year(date)))
None

In [32]:

p = gg.Plot(economics).geom_path('unemploy / pop', 'uempmed', color = 'year') # remember, we converted date into year earlier
p.ipython_plot_width = 300 # just for this plot
p

Out[32]:

In [33]:

#Since the book talked about scale_area - which is now called scale_size_area
#but the example actually didn't need an area scaled size, here is an example
#first without size area
%R -w 300 -h 150 -u px plot(qplot(unemploy / pop, uempmed, data = economics,geom = "point", colour = year(date), size=unemploy) )
None

In [34]:

#Since the book talked about scale_area - which is now called scale_size_area
#but the example actually didn't need an area scaled size, here is an example
#then with it.
%R -w 300 -h 150 -u px plot(qplot(unemploy / pop, uempmed, data = economics,geom = "point", colour = year(date), size=unemploy) + scale_size_area())
None

In [35]:

#python without
p = gg.Plot(economics).add_point('unemploy / pop', 'uempmed', color = 'date', size='unemploy')
p.ipython_plot_width = 300
p

Out[35]:

In [36]:

#python with
p = gg.Plot(economics).geom_point('unemploy / pop', 'uempmed', color = 'date', size='unemploy').scale_size_area()
p.ipython_plot_width = 300
p

Out[36]:

Faceting¶

In [37]:

%R -w 250 -h 250 -u px plot(qplot(carat, data = diamonds, facets = color ~ .,geom = "histogram", binwidth = 0.1, xlim = c(0, 3)))
None
        

In [38]:

p = gg.Plot(diamonds).geom_histogram('carat', binwidth=0.1).facet_grid(rows='color').scale_x_continuous(limits = [0,3])
p.ipython_plot_height = 250
p.ipython_plot_width = 250
p

Out[38]:

In [39]:

%R -w 200 -h 200 -u px plot(qplot(carat, ..density.., data=diamonds, facets = color ~ ., geom="histogram", binwidth=0.1, xlim=c(0,3)))
None

In [40]:

gg.Plot(diamonds).geom_histogram('carat', '..density..', binwidth=0.1).facet_grid(rows='color').scale_x_continuous(limits = [0,3])

Out[40]:

In [41]:

#some more options from section 2.7 of the ggplot book
#limits, label renaming, log, title setting
%R -w 150 -h 150 -u px plot(ggplot(dsmall) + ggtitle("Hello world") + geom_point(aes(carat, price)) + scale_x_continuous(name='The X Axis', limits=c(1,2)) + scale_y_continuous( trans='log10'),limits=c(0,5000)) #ggplot helpfully ignores thes limits
None

In [42]:

p = gg.Plot(dsmall).geom_point('carat', 'price')
p.scale_x_continuous(limits=[1,2.], name = 'The X-Axis')
p.scale_y_continuous(trans='log10')  # passing limits together with a transformation get's an error somewhere in ggplot
p.title('hello world')
#I guess that's better than showing the wrong graph :)

Out[42]:

In [43]:

mpg = pandas.rpy.common.load_data('mpg')

Building plots layer by layer¶

Here we use the ggplot + geom* syntax in R - see chapter 4 of the ggplot b ook

In [44]:

%R -w 250 -h 250 -u px plot(ggplot(mpg) + geom_point(aes(displ, hwy, color=factor(cyl))))
None

In [45]:

mpg['cyl'] = pandas.Categorical(mpg['cyl'])  # equivalent to factor
gg.Plot(mpg).geom_point('displ', 'hwy', color='cyl')

Out[45]:

In [46]:

#let's add some linear regression
%R -w 250 -h 250 -u px plot(ggplot(mpg) + geom_point(aes(displ, hwy, color=factor(cyl))) + geom_smooth(aes(displ, hwy, color=factor(cyl)), method='lm'))
None

In [47]:

#What if you don't want the linear regressions to be colored by cyl? use group
gg.Plot(mpg).geom_point('displ', 'hwy', color='cyl').geom_smooth(group='cyl',method='lm') 
#We have to repeat color - it's not a required argument and therefore needs to be filled in again!

Out[47]:

In [48]:

#stacked bar charts
%R -w 250 -h 150 -u px plot(ggplot(diamonds) + geom_bar(aes(color, fill=cut)))
None

In [49]:

p = gg.Plot(diamonds).geom_bar('color', '..count..', fill='cut', stat='bin', position=gg.position_stack())
p.ipython_plot_width = 250
p

Out[49]:

In [50]:

#or side by side (the default)
p = gg.Plot(diamonds).geom_bar('color', '..count..', fill='cut', stat='bin', position=gg.position_dodge())
p.ipython_plot_width = 400
p

Out[50]:

In [51]:

#or filled to 100%
gg.Plot(diamonds).geom_bar('color', '..count..', fill='cut', stat='bin', position=gg.position_fill())

Out[51]: