#!/usr/bin/env python
# coding: utf-8
# # Indexing and selecting data
# In[ ]:
get_ipython().run_line_magic('matplotlib', 'inline')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
try:
import seaborn
except ImportError:
pass
# In[ ]:
# redefining the example objects
# series
population = pd.Series({'Germany': 81.3, 'Belgium': 11.3, 'France': 64.3,
'United Kingdom': 64.9, 'Netherlands': 16.9})
# dataframe
data = {'country': ['Belgium', 'France', 'Germany', 'Netherlands', 'United Kingdom'],
'population': [11.3, 64.3, 81.3, 16.9, 64.9],
'area': [30510, 671308, 357050, 41526, 244820],
'capital': ['Brussels', 'Paris', 'Berlin', 'Amsterdam', 'London']}
countries = pd.DataFrame(data)
countries
# Setting the index to the country names:
# In[ ]:
countries = countries.set_index('country')
countries
# ## Some notes on selecting data
#
# One of pandas' basic features is the labeling of rows and columns, but this makes indexing also a bit more complex compared to numpy. We now have to distuinguish between:
#
# - selection by label
# - selection by position.
# ### `data[]` provides some convenience shortcuts
# For a DataFrame, basic indexing selects the columns.
#
# Selecting a single column:
# In[ ]:
countries['area']
# or multiple columns:
# In[ ]:
countries[['area', 'population']]
# But, slicing accesses the rows:
# In[ ]:
countries['France':'Netherlands']
#
# NOTE: Unlike slicing in numpy, the end label is **included**.
#
# So as a summary, `[]` provides the following convenience shortcuts:
#
# - Series: selecting a label: `s[label]`
# - DataFrame: selecting a single or multiple columns: `df['col']` or `df[['col1', 'col2']]`
# - DataFrame: slicing the rows: `df['row_label1':'row_label2']` or `df[mask]`
# ### Systematic indexing with `loc` and `iloc`
# When using `[]` like above, you can only select from one axis at once (rows or columns, not both). For more advanced indexing, you have some extra attributes:
#
# * `loc`: selection by label
# * `iloc`: selection by position
#
# These methods index the different dimensions of the frame:
#
# * `df.loc[row_indexer, column_indexer]`
# * `df.iloc[row_indexer, column_indexer]`
# Selecting a single element:
# In[ ]:
countries.loc['Germany', 'area']
# But the row or column indexer can also be a list, slice, boolean array, ..
# In[ ]:
countries.loc['France':'Germany', ['area', 'population']]
# ---
# Selecting by position with `iloc` works similar as indexing numpy arrays:
# In[ ]:
countries.iloc[0:2,1:3]
# The different indexing methods can also be used to assign data:
# In[ ]:
countries2 = countries.copy()
countries2.loc['Belgium':'Germany', 'population'] = 10
# In[ ]:
countries2
# ## Boolean indexing (filtering)
# Often, you want to select rows based on a certain condition. This can be done with 'boolean indexing' (like a where clause in SQL).
#
# The indexer (or boolean mask) should be 1-dimensional and the same length as the thing being indexed.
# In[ ]:
countries['area'] > 100000
# In[ ]:
countries[countries['area'] > 100000]
# ---
#
#
# EXERCISE: Add a column `density` with the population density (note: population column is expressed in millions)
#
# In[ ]:
#
# EXERCISE: Select the capital and the population column of those countries where the density is larger than 300
#
# In[ ]:
#
# EXERCISE: Add a column 'density_ratio' with the ratio of the density to the mean density
#
# In[ ]:
#
# EXERCISE: Change the capital of the UK to Cambridge
#
# In[ ]:
#
# EXERCISE: Select all countries whose population density is between 100 and 300 people/kmĀ²
#
# In[ ]:
# ## Some other useful methods: `isin` and string methods
# The `isin` method of Series is very useful to select rows that may contain certain values:
# In[ ]:
s = countries['capital']
# In[ ]:
get_ipython().run_line_magic('pinfo', 's.isin')
# In[ ]:
s.isin(['Berlin', 'London'])
# This can then be used to filter the dataframe with boolean indexing:
# In[ ]:
countries[countries['capital'].isin(['Berlin', 'London'])]
# Let's say we want to select all data for which the capital starts with a 'B'. In Python, when having a string, we could use the `startswith` method:
# In[ ]:
'Berlin'.startswith('B')
# In pandas, these are available on a Series through the `str` namespace:
# In[ ]:
countries['capital'].str.startswith('B')
# For an overview of all string methods, see: http://pandas.pydata.org/pandas-docs/stable/api.html#string-handling
#
# EXERCISE: Select all countries that have capital names with more than 7 characters
#
# In[ ]:
#
# EXERCISE: Select all countries that have capital names that contain the character sequence 'am'
#
# In[ ]:
# ## Pitfall: chained indexing (and the 'SettingWithCopyWarning')
# In[ ]:
countries.loc['Belgium', 'capital'] = 'Ghent'
# In[ ]:
countries
# In[ ]:
countries['capital']['Belgium'] = 'Antwerp'
# In[ ]:
countries
# In[ ]:
countries[countries['capital'] == 'Antwerp']['capital'] = 'Brussels'
# In[ ]:
countries
# How to avoid this?
#
# * Use `loc` instead of chained indexing if possible!
# * Or `copy` explicitly if you don't want to change the original data.