#!/usr/bin/env python
# coding: utf-8
# # Data Visualization with Python
#
# ## Part 1: Python + Matplotlib
#
# ### [Guy Allard](mailto:w.g.allard@lumc.nl)
# # Matplotlib
# - Plotting library for Python
# - High quality figures suitable for publication
# - Integrates with IPython, Jupyter and NumPy (in PyLab mode)
# - Established and robust
# - Large community / user base
# # Interfaces
# 1. Object Oriented
# - Best for larger development projects
# - Have to keep track of figures and axes
# - Steep learning curve
#
# 2. Pyplot State Machine
# - For interactive plotting
# - Takes care of many housekeeping tasks
# - Easier to learn than the OO interface
#
# 3. Pylab
# - Modelled on matlab
# - Imports common modules
# - Handles most housekeeping tasks
# - Easiest to learn
# - The one we will be using!
# # Interfaces Example
#
# 1. Object-oriented interface
# ```python
# import matplotlib.pyplot as plt
# import numpy as np
# x = np.arange(0, 10, 0.2)
# y = np.sin(x)
# fig = plt.figure()
# ax = fig.add_subplot(111)
# ax.plot(x, y)
# ```
#
# 2. State-machine environment (pyplot)
# ```python
# import matplotlib.pyplot as plt
# import numpy as np
# x = np.arange(0, 10, 0.2)
# y = np.sin(x)
# plt.plot(x, y)
# ```
#
# 3. PyLab mode
# ```python
# %pylab
# x = arange(0, 10, 0.2)
# y = sin(x)
# plot(x, y)
# ```
# # Getting help
#
# Consult the built-in documentation, for example:
# ```
# >>> help(subplot)
# Help on function subplot in module matplotlib.pyplot:
#
# subplot(*args, **kwargs)
# Return a subplot axes positioned by the given grid definition.
# ...
# ```
#
#
# # Useful Resources
# - Matplotlib Homepage
# - https://matplotlib.org/
#
# - Gallery
# - https://matplotlib.org/gallery.html
# - Many examples with source code
#
# - Online documentation
# - https://matplotlib.org/contents.html
# - Full API documentation
# # First Steps
#
# ## Preparing the Jupyter Notebook
# 1. Open a new Jupyter Notebook
# 2. Run this code in the first empty cell:
# ```
# %pylab inline
# ```
# 3. Now any pylab plotting commands will display in the notebook
# In[1]:
# Switch this notebook to PyLab mode with figures rendered inline
ipython_shell = get_ipython()
ipython_shell.run_line_magic('pylab', 'inline')
# # Grab some data
# Use Pandas to load a dataset which contains population data for four countries
# In[2]:
import pandas as pd
# Remote CSV file holding the population figures used throughout part 1
populations_url = 'https://git.lumc.nl/courses/programming-course/raw/visualization-2018/visualization/data/populations.csv'
populations = pd.read_csv(populations_url)
# Take a quick look at the data
# In[3]:
populations.head()
# # Plot it!
# Let's plot the population of the Netherlands on the y-axis, and the year on the x-axis
# In[4]:
# Short handles on the two columns plotted in this cell and the next one
years = populations['Year']
netherlands = populations['Netherlands']
plot(years, netherlands);
# # Add titles and label the axes
# In[5]:
plot(years, netherlands)
title('Historical Population of The Netherlands')
xlabel('Year')
ylabel('Population (Millions)');
# # Change some properties of the line
#
# How about a 5pt thick orange line? (matplotlib line widths are given in points)
# In[6]:
# Columns and line styling shared by this cell and the next one
years = populations['Year']
netherlands = populations['Netherlands']
thick_orange = {'linewidth': 5, 'color': 'orange'}
plot(years, netherlands, **thick_orange)
title('Historical Population of The Netherlands')
xlabel('Year')
ylabel('Population (Millions)');
# # Change some properties of the x-axis
#
# Label at five-year intervals
# Display the label vertically
# In[7]:
plot(years, netherlands, **thick_orange)
title('Historical Population of The Netherlands')
xlabel('Year')
ylabel('Population (Millions)')
# one tick every five years from 1950 up to 2015, with the tick text rotated by 90 degrees
five_year_ticks = range(1950, 2016, 5)
xticks(five_year_ticks, rotation=90);
# # Change which years are displayed
# In[8]:
# Inputs shared by the three zoomed-in variations of the plot below
years = populations['Year']
netherlands = populations['Netherlands']
thick_orange = {'linewidth': 5, 'color': 'orange'}
five_year_ticks = range(1950, 2016, 5)
plot(years, netherlands, **thick_orange)
title('Historical Population of The Netherlands')
xlabel('Year')
ylabel('Population (Millions)')
xticks(five_year_ticks, rotation=90)
# restrict the x-axis to the years 1970 through 1990
xlim(1970, 1990);
# # Change the y-axis scale
# In[9]:
plot(years, netherlands, **thick_orange)
title('Historical Population of The Netherlands')
xlabel('Year')
ylabel('Population (Millions)')
xticks(five_year_ticks, rotation=90)
xlim(1970, 1990)
# restrict the y-axis to the range 13 to 15
ylim(13, 15);
# # Clean up the number formatting on the y-axis
#
# Integer tick labels
# In[10]:
plot(years, netherlands, **thick_orange)
title('Historical Population of The Netherlands')
xlabel('Year')
ylabel('Population (Millions)')
xticks(five_year_ticks, rotation=90)
xlim(1970, 1990)
ylim(13, 15)
# place ticks at the whole numbers 13, 14 and 15 only
yticks(range(13, 16));
# # Plot multiple series
#
# Calling **plot** multiple times within the same cell will add multiple series to the chart
#
# Let's compare the Dutch with the Danes
# In[11]:
# Both series share the same x values
years = populations['Year']
plot(years, populations['Netherlands'], color='orange')
plot(years, populations['Denmark'], color='red')
title('Historical Populations of The Netherlands and Denmark')
xlabel('Year')
ylabel('Population (Millions)')
xticks(range(1950, 2016, 5), rotation=90);
# # Add a legend
#
# 1. Give each plotted line a label
# 2. Add a legend to the figure
# In[12]:
# (column name, line colour, legend text) for each country to draw
series = [
    ('Netherlands', 'orange', 'The Netherlands'),
    ('Denmark', 'red', 'Denmark'),
]
for column, colour, legend_text in series:
    plot(years, populations[column], color=colour, label=legend_text)
legend(loc='upper left')
title('Historical Populations of The Netherlands and Denmark')
xlabel('Year')
ylabel('Population (Millions)')
xticks(range(1950, 2016, 5), rotation=90);
# # Other plot types
#
# Let's load a different dataset and take a look at some different plot types
# In[13]:
# Remote CSV file with the iris measurements used for the remaining plot types
iris_url = 'https://git.lumc.nl/courses/programming-course/raw/visualization-2018/visualization/data/iris.csv'
flowers = pd.read_csv(iris_url)
flowers.head()
# # Boxplots
#
# A simple boxplot of the sepal-length distribution
# In[14]:
sepal_lengths = flowers['sepal_length']
boxplot(sepal_lengths, labels=['Sepal_length']);
# # Boxplots
#
# Distributions of multiple features
# In[15]:
# every column except the last one is taken as a feature to plot
features = flowers.columns[:-1].tolist()
features
# In[16]:
# draw one box per feature, labelled with the column name
feature_columns = [flowers[column] for column in features]
boxplot(feature_columns, labels=features);
# # Controlling the size of the plot
#
# Let's change the shape of the boxplot
# In[17]:
# make new figures 10 inches wide and 5 inches high by default
figsize(10, 5)
# draw the same multi-feature boxplot at the new size
feature_columns = [flowers[column] for column in features]
boxplot(feature_columns, labels=features);
# In[18]:
# switch to a 7 x 4 default figure size for the plots that follow
figsize(7, 4)
# # Histogram
# In[19]:
# The same column is binned in each of the three histogram cells
petal_lengths = flowers['petal_length']
hist(petal_lengths)
title('Petal Length Distribution')
xlabel('petal length')
ylabel('count');
# # Histogram
#
# change the number of 'bins'
# In[20]:
hist(petal_lengths, bins=20)
title('Petal Length Distribution')
xlabel('petal length')
ylabel('count');
# # Histogram
#
# Some formatting
#
# - Lines around the bars
# - color
# In[21]:
# teal, slightly transparent bars with a black outline
bar_style = {'facecolor': 'teal', 'edgecolor': 'black', 'alpha': 0.7}
hist(petal_lengths, bins=20, **bar_style)
title('Petal Length Distribution')
xlabel('petal length')
ylabel('count');
# # Subplots
#
# Separate plots with their own axes within a single figure
#
# The syntax can be confusing!
# In[22]:
# Walk through the four cells of a 2 x 2 grid and caption each with its own subplot call
for cell_number in range(1, 5):
    subplot(2, 2, cell_number)
    # remove the tick marks from both axes
    xticks([])
    yticks([])
    caption = 'subplot(2, 2, %d)' % cell_number
    text(0.5, 0.5, caption, ha='center', size=18, alpha=0.75)
# subplot(2, 2, 1) indicates the first cell of a 2 row x 2 column matrix
#
# subplot(2, 2, 4) indicates the fourth cell of the same 2 row x 2 column matrix
# # Subplots
#
# More complicated layouts
# In[23]:
# Each entry is ((rows, columns, cell), caption); every subplot uses its own grid definition
layouts = [
    ((1, 3, 1), '(1, 3, 1)'),  # 1 row, 3 columns, cell 1
    ((2, 3, 3), '(2, 3, 3)'),  # 2 rows, 3 columns, cell 3
    ((3, 2, 6), '(3, 2, 6)'),  # 3 rows, 2 columns, cell 6
    ((3, 3, 5), '(3, 3, 5)'),  # 3 rows, 3 columns, cell 5
]
for grid_position, caption in layouts:
    subplot(*grid_position)
    # remove the tick marks from both axes
    xticks([])
    yticks([])
    text(0.5, 0.5, caption, ha='center', size=18, alpha=0.75)
# # Subplots and Boxplots
#
# Compare how the features are distributed by species
# In[24]:
# Unique species names in alphabetical order. A bare set() has no guaranteed
# iteration order, so sorting keeps the printed list (and anything built from
# it) the same on every run.
species = sorted(set(flowers.species))
print(species)
# In[25]:
# make a dataset for each species
setosa = flowers[flowers.species == 'setosa']
versicolor = flowers[flowers.species == 'versicolor']
virginica = flowers[flowers.species == 'virginica']
# In[26]:
figsize(10, 8)
# Pair every label with its own dataset so that a box can never be drawn
# under the name of a different species.
species_frames = [
    ('setosa', setosa),
    ('versicolor', versicolor),
    ('virginica', virginica),
]
species_labels = [name for name, _ in species_frames]
# one subplot per feature, laid out on a 2 x 2 grid (subplot cells count from 1)
for cell, feature in enumerate(features, start=1):
    subplot(2, 2, cell)
    boxplot(
        [frame[feature] for _, frame in species_frames],
        labels=species_labels
    )
    ylabel(feature)
# In[27]:
# switch back to a 7 x 4 default figure size
figsize(7, 4)
# # Sketch-style drawing
#
# using xkcd mode
# In[28]:
petal_lengths = flowers['petal_length']
# everything drawn inside the xkcd() context uses the hand-drawn sketch style
with xkcd():
    hist(petal_lengths, bins=20, facecolor='teal', edgecolor='black')
    title('Petal Length Distribution')
    xlabel('petal length')
    ylabel('count');
# # Saving to a file
#
# Images can be saved to a file using savefig after the plotting commands:
# ```
# savefig('myplot.pdf')
# ```
#
# The format of the saved image will be inferred from the given file extension.
# # The End
#
# This lesson was based on previous work by [Jeroen Laros](mailto:j.f.j.laros@lumc.nl) and Martijn Vermaat
#
# License: [Creative Commons Attribution 3.0 License (CC-by)](http://creativecommons.org/licenses/by/3.0)
# In[ ]: