#!/usr/bin/env python
# coding: utf-8

# # Data Visualization with Python
#
# ## Part 1: Python + Matplotlib
#
# ### [Guy Allard](mailto:w.g.allard@lumc.nl)

# # Matplotlib
# - Plotting library for Python
# - High-quality figures suitable for publication
# - Integrates with IPython, Jupyter and NumPy (in PyLab mode)
# - Established and robust
# - Large community / user base

# # Interfaces
# 1. Object Oriented
#     - Best for larger development projects
#     - Have to keep track of figures and axes
#     - Steep learning curve
#

# 2. Pyplot State Machine
#     - For interactive plotting
#     - Takes care of many housekeeping tasks
#     - Easier to learn than the OO interface
#

# 3. Pylab
#     - Modelled on MATLAB
#     - Imports common modules
#     - Handles most housekeeping tasks
#     - Easiest to learn
#     - The one we will be using!

# # Interfaces Example
#
# 1. Object-oriented interface
# ```python
# import matplotlib.pyplot as plt
# import numpy as np
# x = np.arange(0, 10, 0.2)
# y = np.sin(x)
# fig = plt.figure()
# ax = fig.add_subplot(111)
# ax.plot(x, y)
# ```
#
# 2. State-machine environment (pyplot)
# ```python
# import matplotlib.pyplot as plt
# import numpy as np
# x = np.arange(0, 10, 0.2)
# y = np.sin(x)
# plt.plot(x, y)
# ```
#
# 3. PyLab mode
# ```python
# %pylab
# x = arange(0, 10, 0.2)
# y = sin(x)
# plot(x, y)
# ```

# # Getting help
#
# Consult the built-in documentation, for example:
# ```
# >>> help(subplot)
# Help on function subplot in module matplotlib.pyplot:
#
# subplot(*args, **kwargs)
#     Return a subplot axes positioned by the given grid definition.
#     ...
# ```

# # Useful Resources
# - Matplotlib Homepage
#     - https://matplotlib.org/
#

# - Gallery
#     - https://matplotlib.org/gallery.html
#     - Many examples with source code
#

# - Online documentation
#     - https://matplotlib.org/contents.html
#     - Full API documentation

# # First Steps
#
# ## Preparing the Jupyter Notebook
# 1. Open a new Jupyter Notebook
# 2. Run this code in the first empty cell:
# ```
# %pylab inline
# ```
# 3. Now any pylab plotting commands will display in the notebook

# In[1]:


# Script form of the `%pylab inline` magic described above; it needs an
# IPython/Jupyter session because get_ipython() only exists there.
get_ipython().run_line_magic('pylab', 'inline')


# # Grab some data
# Use Pandas to load a dataset which contains population data for four countries

# In[2]:


import pandas as pd

populations = pd.read_csv(
    'https://git.lumc.nl/courses/programming-course/raw/visualization-2018/visualization/data/populations.csv'
)


# Take a quick look at the data

# In[3]:


populations.head()


# # Plot it!
# Let's plot the population of the Netherlands on the y-axis, and the year on the x-axis

# In[4]:


# The trailing semicolon keeps the notebook from echoing the returned objects.
plot(populations['Year'], populations['Netherlands']);


# # Add titles and label the axes

# In[5]:


plot(populations['Year'], populations['Netherlands'])
title('Historical Population of The Netherlands')
xlabel('Year')
ylabel('Population (Millions)');


# # Change some properties of the line
#
# How about a 5px thick orange line?
# In[6]:


plot(populations['Year'], populations['Netherlands'], linewidth=5, color='orange')
title('Historical Population of The Netherlands')
xlabel('Year')
ylabel('Population (Millions)');


# # Change some properties of the x-axis
#
# Label at five-year intervals
# Display the label vertically

# In[7]:


plot(populations['Year'], populations['Netherlands'], linewidth=5, color='orange')
title('Historical Population of The Netherlands')
xlabel('Year')
ylabel('Population (Millions)')
xticks(range(1950, 2016, 5), rotation=90);


# # Change which years are displayed

# In[8]:


plot(populations['Year'], populations['Netherlands'], linewidth=5, color='orange')
title('Historical Population of The Netherlands')
xlabel('Year')
ylabel('Population (Millions)')
xticks(range(1950, 2016, 5), rotation=90)
xlim(1970, 1990);


# # Change the y-axis scale

# In[9]:


plot(populations['Year'], populations['Netherlands'], linewidth=5, color='orange')
title('Historical Population of The Netherlands')
xlabel('Year')
ylabel('Population (Millions)')
xticks(range(1950, 2016, 5), rotation=90)
xlim(1970, 1990)
ylim(13, 15);


# # Clean up the number formatting on the y-axis
#
# Integer tick labels

# In[10]:


plot(populations['Year'], populations['Netherlands'], linewidth=5, color='orange')
title('Historical Population of The Netherlands')
xlabel('Year')
ylabel('Population (Millions)')
xticks(range(1950, 2016, 5), rotation=90)
xlim(1970, 1990)
ylim(13, 15)
yticks(range(13, 16));


# # Plot multiple series
#
# Calling **plot** multiple times within the same cell will add multiple series to the chart
#
# Let's compare the Dutch with the Danes

# In[11]:


plot(populations['Year'], populations['Netherlands'], color='orange')
plot(populations['Year'], populations['Denmark'], color='red')
title('Historical Populations of The Netherlands and Denmark')
xlabel('Year')
ylabel('Population (Millions)')
xticks(range(1950, 2016, 5), rotation=90);


# # Add a legend
#
# 1. Give each plotted line a label
# 2. Add a legend to the figure

# In[12]:


plot(populations['Year'], populations['Netherlands'], color='orange', label='The Netherlands')
plot(populations['Year'], populations['Denmark'], color='red', label='Denmark')
legend(loc='upper left')
title('Historical Populations of The Netherlands and Denmark')
xlabel('Year')
ylabel('Population (Millions)')
xticks(range(1950, 2016, 5), rotation=90);


# # Other plot types
#
# Let's load a different dataset and take a look at some different plot types

# In[13]:


flowers = pd.read_csv('https://git.lumc.nl/courses/programming-course/raw/visualization-2018/visualization/data/iris.csv')
flowers.head()


# # Boxplots
#
# A simple boxplot of the sepal-length distribution

# In[14]:


# NOTE(review): Matplotlib 3.9 deprecates boxplot's `labels` in favour of
# `tick_labels`; `labels` is kept here for older installs - confirm the
# target Matplotlib version.
boxplot(flowers['sepal_length'], labels=['Sepal_length']);


# # Boxplots
#
# Distributions of multiple features

# In[15]:


# make a list containing the numeric feature column names
# (every column except the last one)
features = list(flowers.columns[:-1])
features


# In[16]:


# plot the data
boxplot([flowers[f] for f in features], labels=features);


# # Controlling the size of the plot
#
# Let's change the shape of the boxplot

# In[17]:


# make the figure 10 'units' wide and 5 'units' high
figsize(10, 5)
# plot the data
boxplot([flowers[f] for f in features], labels=features);


# In[18]:


# set the figure size to 7 x 4 for the plots that follow
figsize(7, 4)


# # Histogram

# In[19]:


hist(flowers['petal_length'])
title('Petal Length Distribution')
xlabel('petal length')
ylabel('count');


# # Histogram
#
# change the number of 'bins'

# In[20]:


hist(flowers['petal_length'], bins=20)
title('Petal Length Distribution')
xlabel('petal length')
ylabel('count');


# # Histogram
#
# Some formatting
#
# - Lines around the bars
# - color

# In[21]:


hist(flowers['petal_length'], bins=20, facecolor='teal', edgecolor='black', alpha=0.7)
title('Petal Length Distribution')
xlabel('petal length')
ylabel('count');


# # Subplots
#
# Separate plots with their own axes within a single figure
#
# The syntax can be confusing!
# In[22]:


for i in range(1, 5):
    subplot(2, 2, i)
    # hide the ticks so only the cell label is visible
    xticks([])
    yticks([])
    text(0.5, 0.5, 'subplot(2, 2, %d)' % i, ha='center', size=18, alpha=0.75);


# subplot(2, 2, 1) indicates the first cell of a 2 row x 2 column matrix
#
# subplot(2, 2, 4) indicates the fourth cell of a 2 row x 2 column matrix

# # Subplots
#
# More complicated layouts

# In[23]:


subplot(1, 3, 1)  # 1 row, 3 columns, cell 1
xticks([])
yticks([])
text(0.5, 0.5, '(1, 3, 1)', ha='center', size=18, alpha=0.75)

subplot(2, 3, 3)  # 2 rows, 3 columns, cell 3
xticks([])
yticks([])
text(0.5, 0.5, '(2, 3, 3)', ha='center', size=18, alpha=0.75)

subplot(3, 2, 6)  # 3 rows, 2 columns, cell 6
xticks([])
yticks([])
text(0.5, 0.5, '(3, 2, 6)', ha='center', size=18, alpha=0.75)

subplot(3, 3, 5)  # 3 rows, 3 columns, cell 5
xticks([])
yticks([])
text(0.5, 0.5, '(3, 3, 5)', ha='center', size=18, alpha=0.75);


# # Subplots and Boxplots
#
# Compare how the features are distributed by species

# In[24]:


# sorted() gives a reproducible order; iterating a bare set of strings can
# yield a different order from one interpreter run to the next.
species = sorted(set(flowers.species))
print(species)


# In[25]:


# make a dataset for each species
setosa = flowers[flowers.species == 'setosa']
versicolor = flowers[flowers.species == 'versicolor']
virginica = flowers[flowers.species == 'virginica']


# In[26]:


figsize(10, 8)

# Keep every subset paired with its own label so the boxes and the tick
# labels are always in the same order. Previously the labels came from the
# unordered `species` list while the data order was hard-coded, so a box could
# be labelled with the wrong species.
subsets = [
    ('setosa', setosa),
    ('versicolor', versicolor),
    ('virginica', virginica),
]
subset_labels = [name for name, _ in subsets]

# NOTE(review): Matplotlib 3.9 deprecates boxplot's `labels` in favour of
# `tick_labels`; `labels` is kept here for older installs - confirm the
# target Matplotlib version.
for cell, feature in enumerate(features):
    subplot(2, 2, cell + 1)  # subplot cells are numbered from 1
    boxplot(
        [subset[feature] for _, subset in subsets],
        labels=subset_labels
    )
    ylabel(feature)


# In[27]:


# set the figure size to 7 x 4 for the plots that follow
figsize(7, 4)


# # Sketch-style drawing
#
# using xkcd mode

# In[28]:


# everything drawn inside the `with` block uses the xkcd sketch style
with xkcd():
    hist(flowers['petal_length'], bins=20, facecolor='teal', edgecolor='black')
    title('Petal Length Distribution')
    xlabel('petal length')
    ylabel('count');


# # Saving to a file
#
# Images can be saved to a file using savefig after the plotting commands:
# ```
# savefig('myplot.pdf')
# ```
#
# The format of the saved image will be inferred from the given file extension.
# # The End
#
# This lesson was based on previous work by [Jeroen Laros](mailto:j.f.j.laros@lumc.nl) and Martijn Vermaat
#
# License: [Creative Commons Attribution 3.0 License (CC-by)](http://creativecommons.org/licenses/by/3.0)

# In[ ]: