#!/usr/bin/env python # coding: utf-8 # pyggplot examples # === # This closely follows the ggplot book. # The syntax in python is obviously different - for a start, # we don't have a qplot function, but always build plot objects. # # But let's get started. # In[1]: #imports and R magic try: import exptools # Don't worry about this exptools.load_software('pyggplot') except ImportError: pass import pandas import pandas.rpy import pyggplot as gg #set plot size in the notebook to 'tiny' gg.ipython_plot_width = 200 gg.ipython_plot_height = 200 import rpy2.robjects as ro quiet = ro.r('library("ggplot2")') import numpy get_ipython().run_line_magic('load_ext', 'rpy2.ipython') # In[2]: #this is the example data set from the ggplot book diamonds = pandas.rpy.common.load_data('diamonds') #but the book only uses one hundred samples numpy.random.seed(1410) # make this reproducible chosen = numpy.random.choice(diamonds.index, 100) dsmall = diamonds.ix[chosen] # In[3]: get_ipython().run_line_magic('Rpush', 'chosen') get_ipython().run_line_magic('R', 'dsmall = diamonds[chosen, ]') None # In[4]: #A simpule scatter plot - in R using the qplot function get_ipython().run_line_magic('R', '-w 150 -h 150 -u px plot(qplot(carat, price, data=diamonds))') #Which translates to ggplot(diamonds) + geom_point(carat, price) None #to silence [Listvector...] output # In[5]: #Same thing in python gg.Plot(diamonds).geom_point('carat', 'price') #virtually every function of the Plot object returns the Plot object, so you can chain them like this. # In[6]: #R allows passing in functions of values - python does not get_ipython().run_line_magic('R', '-w 150 -h 150 -u px plot(qplot(log(carat), log(price), data=diamonds))') None # In[7]: #But in this case, we can simply transform the axis #And we'll do it step by step in this example p = gg.Plot(diamonds) p.geom_point('carat', 'price') p.scale_x_continuous(trans='log10') p.scale_y_continuous(trans='log10') # In[8]: #arguments to qplot can be combinations of existing variables get_ipython().run_line_magic('R', '-w 150 -h 150 -u px plot(qplot(carat, x * y * z, data=diamonds))') None # In[9]: #In Python, you have to specify a dataframe column - not a function to plot #so we have to prepare the data in python. diamonds['xyz' ] = diamonds['x'] * diamonds['y'] * diamonds['z'] gg.Plot(diamonds).geom_point('carat','xyz') # In[10]: #Colors and shapes get_ipython().run_line_magic('R', '-w 400 -h 200 -u px plot(qplot(carat, price, data=dsmall, color=color, shape=cut))') None # In[11]: #now we'll add some color and shapes p = gg.Plot(dsmall).geom_point('carat', 'price', color='color', shape='cut') p.ipython_plot_width = 400 #you can also pass the width to plot.render when writing to a file p.ipython_plot_height = 200 p # In[12]: #overplotting was a problem for the complete dataset, but we can reduce the opacity with the alpha parameter get_ipython().run_line_magic('R', '-w 150 -h 150 -u px plot(qplot(carat, price, data = diamonds, alpha=I(1/100)))') None # In[13]: #overplotting was a problem for the complete dataset, but we can reduce the opacity with the alpha parameter #In R: qplot(carat, price, data = diamonds, alpha=I(1/100)) gg.Plot(diamonds).geom_point('carat','price', alpha=1/100.) # In[14]: #Adding a smoother plot, combining two geoms. Once with the qplot interface get_ipython().run_line_magic('R', '-w 150 -h 150 -u px plot(qplot(carat, price, data = dsmall, geom = c("point", "smooth")))') #and once with the + geom interface. #Our python stuff is closer to the geom interface get_ipython().run_line_magic('R', '-w 150 -h 150 -u px plot(ggplot(dsmall) + geom_smooth(aes(carat, price)) + geom_point(aes(carat, price)))') None #We could also have # In[15]: #In Python gg.Plot(dsmall).geom_point('carat','price').add_smooth('carat','price', method='loess') # In[16]: #Hourribly overprinterd jitter plots get_ipython().run_line_magic('R', '-w 150 -h 150 -u px plot(qplot(color, price / carat, data=diamonds, geom="jitter", alpha=I(1/5), position = position_jitter(w=0.1)))') None # In[17]: diamonds['price/carat'] = diamonds['price'] / diamonds['carat'] gg.Plot(diamonds).geom_jitter('color', 'price/carat', alpha = 1/5., position = gg.position_jitter(w = 0.1) ) # In[18]: #Box plots #Hourribly overprinterd jitter plots get_ipython().run_line_magic('R', '-w 150 -h 150 -u px plot(qplot(color, price / carat, data=diamonds, geom="boxplot"))') None # In[19]: diamonds['price/carat'] = diamonds['price'] / diamonds['carat'] gg.Plot(diamonds).geom_boxplot('color', 'price/carat') # Note added _ between box and plot # In[20]: #histograms... get_ipython().run_line_magic('R', "-w 150 -h 150 -u px plot(qplot(carat, data=diamonds, geom='histogram', binwidth = 1))") None # In[21]: gg.Plot(diamonds).geom_histogram('carat', binwidth=1) # In[22]: #Density #histograms... get_ipython().run_line_magic('R', "-w 150 -h 150 -u px plot(qplot(carat, data=diamonds, geom='density'))") None # In[23]: gg.Plot(diamonds).geom_density('carat') # In[24]: #bar charts get_ipython().run_line_magic('R', "-w 150 -h 150 -u px plot(qplot(color, data=diamonds, geom='bar'))") None # In[25]: gg.Plot(diamonds).geom_bar('color', '..count..', stat='bin') #The python wrapper does not set the default stat to bin # In[26]: #switching datasets for the line plots economics = pandas.rpy.common.load_data('economics') get_ipython().run_line_magic('R', 'year = function(x) as.POSIXlt(x)$year + 1900') economics['year'] = ro.r('year(economics$date)') # convert date into a year # In[27]: get_ipython().run_line_magic('R', '-w 150 -h 150 -u px plot(qplot(date,uempmed, data=economics, geom = "line"))') None # In[28]: gg.Plot(economics).geom_line('date', 'uempmed') # In[29]: get_ipython().run_line_magic('R', '-w 150 -h 150 -u px plot(qplot(unemploy / pop, uempmed, data = economics,geom = c("point", "path")))') None # In[30]: economics['unemploy / pop'] = economics['unemploy'] / economics['pop'] gg.Plot(economics).geom_point('unemploy / pop', 'uempmed').add_path() #You can omit mandatory mappings an subsequent calls # In[31]: get_ipython().run_line_magic('R', 'year = function(x) as.POSIXlt(x)$year + 1900') get_ipython().run_line_magic('R', '-w 300 -h 150 -u px plot(qplot(unemploy / pop, uempmed, data = economics,geom = "path", colour = year(date)))') None # In[32]: p = gg.Plot(economics).geom_path('unemploy / pop', 'uempmed', color = 'year') # remember, we converted date into year earlier p.ipython_plot_width = 300 # just for this plot p # In[33]: #Since the book talked about scale_area - which is now called scale_size_area #but the example actually didn't need an area scaled size, here is an example #first without size area get_ipython().run_line_magic('R', '-w 300 -h 150 -u px plot(qplot(unemploy / pop, uempmed, data = economics,geom = "point", colour = year(date), size=unemploy) )') None # In[34]: #Since the book talked about scale_area - which is now called scale_size_area #but the example actually didn't need an area scaled size, here is an example #then with it. get_ipython().run_line_magic('R', '-w 300 -h 150 -u px plot(qplot(unemploy / pop, uempmed, data = economics,geom = "point", colour = year(date), size=unemploy) + scale_size_area())') None # In[35]: #python without p = gg.Plot(economics).add_point('unemploy / pop', 'uempmed', color = 'date', size='unemploy') p.ipython_plot_width = 300 p # In[36]: #python with p = gg.Plot(economics).geom_point('unemploy / pop', 'uempmed', color = 'date', size='unemploy').scale_size_area() p.ipython_plot_width = 300 p # Faceting # === # In[37]: get_ipython().run_line_magic('R', '-w 250 -h 250 -u px plot(qplot(carat, data = diamonds, facets = color ~ .,geom = "histogram", binwidth = 0.1, xlim = c(0, 3)))') None # In[38]: p = gg.Plot(diamonds).geom_histogram('carat', binwidth=0.1).facet_grid(rows='color').scale_x_continuous(limits = [0,3]) p.ipython_plot_height = 250 p.ipython_plot_width = 250 p # In[39]: get_ipython().run_line_magic('R', '-w 200 -h 200 -u px plot(qplot(carat, ..density.., data=diamonds, facets = color ~ ., geom="histogram", binwidth=0.1, xlim=c(0,3)))') None # In[40]: gg.Plot(diamonds).geom_histogram('carat', '..density..', binwidth=0.1).facet_grid(rows='color').scale_x_continuous(limits = [0,3]) # In[41]: #some more options from section 2.7 of the ggplot book #limits, label renaming, log, title setting get_ipython().run_line_magic('R', '-w 150 -h 150 -u px plot(ggplot(dsmall) + ggtitle("Hello world") + geom_point(aes(carat, price)) + scale_x_continuous(name=\'The X Axis\', limits=c(1,2)) + scale_y_continuous( trans=\'log10\'),limits=c(0,5000)) #ggplot helpfully ignores thes limits') None # In[42]: p = gg.Plot(dsmall).geom_point('carat', 'price') p.scale_x_continuous(limits=[1,2.], name = 'The X-Axis') p.scale_y_continuous(trans='log10') # passing limits together with a transformation get's an error somewhere in ggplot p.title('hello world') #I guess that's better than showing the wrong graph :) # In[43]: mpg = pandas.rpy.common.load_data('mpg') # Building plots layer by layer # === # Here we use the ggplot + geom* syntax in R - see chapter 4 of the ggplot b ook # In[44]: get_ipython().run_line_magic('R', '-w 250 -h 250 -u px plot(ggplot(mpg) + geom_point(aes(displ, hwy, color=factor(cyl))))') None # In[45]: mpg['cyl'] = pandas.Categorical(mpg['cyl']) # equivalent to factor gg.Plot(mpg).geom_point('displ', 'hwy', color='cyl') # In[46]: #let's add some linear regression get_ipython().run_line_magic('R', "-w 250 -h 250 -u px plot(ggplot(mpg) + geom_point(aes(displ, hwy, color=factor(cyl))) + geom_smooth(aes(displ, hwy, color=factor(cyl)), method='lm'))") None # In[47]: #What if you don't want the linear regressions to be colored by cyl? use group gg.Plot(mpg).geom_point('displ', 'hwy', color='cyl').geom_smooth(group='cyl',method='lm') #We have to repeat color - it's not a required argument and therefore needs to be filled in again! # In[48]: #stacked bar charts get_ipython().run_line_magic('R', '-w 250 -h 150 -u px plot(ggplot(diamonds) + geom_bar(aes(color, fill=cut)))') None # In[49]: p = gg.Plot(diamonds).geom_bar('color', '..count..', fill='cut', stat='bin', position=gg.position_stack()) p.ipython_plot_width = 250 p # In[50]: #or side by side (the default) p = gg.Plot(diamonds).geom_bar('color', '..count..', fill='cut', stat='bin', position=gg.position_dodge()) p.ipython_plot_width = 400 p # In[51]: #or filled to 100% gg.Plot(diamonds).geom_bar('color', '..count..', fill='cut', stat='bin', position=gg.position_fill())