#!/usr/bin/env python
# coding: utf-8

# In[ ]:

import pandas as pd

# In[ ]:

# The inline-plot magic only exists inside IPython/Jupyter; when this file is
# run as a plain script, fall back to matplotlib's default backend.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

import numpy as np
import matplotlib.pyplot as plt

# seaborn is optional: importing it only restyles the plots.
try:
    import seaborn
except ImportError:
    pass

# # Tabular data

# In[ ]:

df = pd.read_csv("data/titanic.csv")

# In[ ]:

df.head()

# Starting from reading this dataset, to answering questions about this data in a few lines of code:

# **What is the age distribution of the passengers?**

# In[ ]:

df['Age'].hist()

# **How does the survival rate of the passengers differ between sexes?**

# In[ ]:

df.groupby('Sex')[['Survived']].aggregate(lambda x: x.sum() / len(x))

# **Or how does it differ between the different classes?**

# In[ ]:

df.groupby('Pclass')['Survived'].aggregate(lambda x: x.sum() / len(x)).plot(kind='bar')

# **Are young people more likely to survive?**

# In[ ]:

# Overall survival rate, for reference.
df['Survived'].sum() / df['Survived'].count()

# In[ ]:

# Survival rate of the passengers aged 25 or younger.
df25 = df[df['Age'] <= 25]
df25['Survived'].sum() / len(df25['Survived'])

# All the needed functionality for the above examples will be explained throughout this tutorial.

# # Data structures
#
# Pandas provides two fundamental data objects, for 1D (``Series``) and 2D data (``DataFrame``).

# ## Series
#
# A Series is a basic holder for **one-dimensional labeled data**.
# It can be created much as a NumPy array is created:

# In[ ]:

s = pd.Series([0.1, 0.2, 0.3, 0.4])
s

# ### Attributes of a Series: `index` and `values`
#
# The series has a built-in concept of an **index**, which by default is the numbers *0* through *N - 1*

# In[ ]:

s.index

# You can access the underlying numpy array representation with the `.values` attribute:

# In[ ]:

s.values

# We can access series values via the index, just like for NumPy arrays:

# In[ ]:

s[0]

# Unlike the NumPy array, though, this index can be something other than integers:

# In[ ]:

s2 = pd.Series(np.arange(4), index=['a', 'b', 'c', 'd'])
s2

# In[ ]:

s2['c']

# In this way, a ``Series`` object can be thought of as similar to an ordered dictionary mapping one typed value to another typed value.
#
# In fact, it's possible to construct a series directly from a Python dictionary:

# In[ ]:

pop_dict = {'Germany': 81.3,
            'Belgium': 11.3,
            'France': 64.3,
            'United Kingdom': 64.9,
            'Netherlands': 16.9}
population = pd.Series(pop_dict)
population

# We can index the populations like a dict as expected:

# In[ ]:

population['France']

# but with the power of numpy arrays:

# In[ ]:

population * 1000

# ## DataFrames: Multi-dimensional Data
#
# A DataFrame is a **tabular data structure** (multi-dimensional object to hold labeled data) comprised of rows and columns, akin to a spreadsheet, database table, or R's data.frame object. You can think of it as multiple Series objects which share the same index.
#
#
# One of the most common ways of creating a dataframe is from a dictionary of arrays or lists.
#
# Note that in the IPython notebook, the dataframe will display in a rich HTML view:

# In[ ]:

data = {'country': ['Belgium', 'France', 'Germany', 'Netherlands', 'United Kingdom'],
        'population': [11.3, 64.3, 81.3, 16.9, 64.9],
        'area': [30510, 671308, 357050, 41526, 244820],
        'capital': ['Brussels', 'Paris', 'Berlin', 'Amsterdam', 'London']}
countries = pd.DataFrame(data)
countries

# ### Attributes of the DataFrame
#
# Besides an `index` attribute, a DataFrame also has a `columns` attribute:

# In[ ]:

countries.index

# In[ ]:

countries.columns

# To check the data types of the different columns:

# In[ ]:

countries.dtypes

# An overview of that information can be given with the `info()` method:

# In[ ]:

countries.info()

# A DataFrame also has a `values` attribute, but attention: when you have heterogeneous data, all values will be upcast:

# In[ ]:

countries.values

# If we don't like what the index looks like, we can reset it and set one of our columns:

# In[ ]:

countries = countries.set_index('country')
countries

# To access a Series representing a column in the data, use typical indexing syntax:

# In[ ]:

countries['area']

# # Basic operations on Series/Dataframes

# As you play around with DataFrames, you'll notice that many operations which work on NumPy arrays will also work on dataframes.
#

# In[ ]:

# redefining the example objects

population = pd.Series({'Germany': 81.3, 'Belgium': 11.3, 'France': 64.3,
                        'United Kingdom': 64.9, 'Netherlands': 16.9})

countries = pd.DataFrame({'country': ['Belgium', 'France', 'Germany', 'Netherlands', 'United Kingdom'],
                          'population': [11.3, 64.3, 81.3, 16.9, 64.9],
                          'area': [30510, 671308, 357050, 41526, 244820],
                          'capital': ['Brussels', 'Paris', 'Berlin', 'Amsterdam', 'London']})

# ### Elementwise-operations (like numpy)

# Just like with numpy arrays, many operations are element-wise:

# In[ ]:

population / 100

# In[ ]:

countries['population'] / countries['area']

# ### Alignment! (unlike numpy)
#
# Only, pay attention to **alignment**: operations between series will align on the index:

# In[ ]:

s1 = population[['Belgium', 'France']]
s2 = population[['France', 'Germany']]

# In[ ]:

s1

# In[ ]:

s2

# In[ ]:

# Labels present in only one operand ('Belgium', 'Germany') give NaN.
s1 + s2

# ### Reductions (like numpy)

# The average population number:

# In[ ]:

population.mean()

# The minimum area:

# In[ ]:

countries['area'].min()

# For dataframes, restrict the reduction to the numeric columns:

# In[ ]:

# `numeric_only=True` is required on pandas >= 2.0, where the string columns
# ('country', 'capital') are no longer dropped silently and raise a TypeError.
countries.median(numeric_only=True)
# EXERCISE: Calculate the population numbers relative to Belgium

# In[ ]:


# EXERCISE: Calculate the population density for each country and add this as a new column to the dataframe.

# In[ ]:


# In[ ]:


# ### Some other useful methods

# Sorting the rows of the DataFrame according to the values in a column:

# In[ ]:

# NOTE: this relies on the 'density' column added in the exercise above and
# raises a KeyError until that exercise has been completed.
countries.sort_values('density', ascending=False)

# One useful method to use is the ``describe`` method, which computes summary statistics for each column:

# In[ ]:

countries.describe()

# The `plot` method can be used to quickly visualize the data in different ways:

# In[ ]:

countries.plot()

# However, for this dataset, it does not say that much:

# In[ ]:

countries['population'].plot(kind='bar')

# You can play with the `kind` keyword: 'line', 'bar', 'hist', 'density', 'area', 'pie', 'scatter', 'hexbin'

# ## Importing and exporting data

# A wide range of input/output formats are natively supported by pandas:
#
# * CSV, text
# * SQL database
# * Excel
# * HDF5
# * json
# * html
# * pickle
# * ...

# In[ ]:

# In a notebook, type `pd.read_` and press <TAB> to browse the readers.
# The listing below is the equivalent that also runs as a plain script.
[name for name in dir(pd) if name.startswith('read_')]

# In[ ]:

# Likewise, `countries.to_` + <TAB> shows the available writers.
[name for name in dir(countries) if name.startswith('to_')]

# ## Other features
#
# * Working with missing data (`.dropna()`, `pd.isnull()`)
# * Merging and joining (`concat`, `join`)
# * Grouping: `groupby` functionality
# * Reshaping (`stack`, `pivot`)
# * Time series manipulation (resampling, timezones, ..)
# * Easy plotting

# There are many, many more interesting operations that can be done on Series and DataFrame objects, but rather than continue using this toy data, we'll instead move to a real-world example, and illustrate some of the advanced concepts along the way.
#
# See the next notebooks!

# ## Acknowledgement
#
# > *© 2015, Stijn Van Hoey and Joris Van den Bossche. Licensed under [CC BY 4.0 Creative Commons](http://creativecommons.org/licenses/by/4.0/)*
#
# > This notebook is partly based on material of Jake Vanderplas (https://github.com/jakevdp/OsloWorkshop2014).
#
# ---