#!/usr/bin/env python # coding: utf-8 # # Indexing and selecting data # In[ ]: get_ipython().run_line_magic('matplotlib', 'inline') import pandas as pd import numpy as np import matplotlib.pyplot as plt try: import seaborn except ImportError: pass # In[ ]: # redefining the example objects # series population = pd.Series({'Germany': 81.3, 'Belgium': 11.3, 'France': 64.3, 'United Kingdom': 64.9, 'Netherlands': 16.9}) # dataframe data = {'country': ['Belgium', 'France', 'Germany', 'Netherlands', 'United Kingdom'], 'population': [11.3, 64.3, 81.3, 16.9, 64.9], 'area': [30510, 671308, 357050, 41526, 244820], 'capital': ['Brussels', 'Paris', 'Berlin', 'Amsterdam', 'London']} countries = pd.DataFrame(data) countries # Setting the index to the country names: # In[ ]: countries = countries.set_index('country') countries # ## Some notes on selecting data # # One of pandas' basic features is the labeling of rows and columns, but this makes indexing also a bit more complex compared to numpy. We now have to distuinguish between: # # - selection by label # - selection by position. # ### `data[]` provides some convenience shortcuts # For a DataFrame, basic indexing selects the columns. # # Selecting a single column: # In[ ]: countries['area'] # or multiple columns: # In[ ]: countries[['area', 'population']] # But, slicing accesses the rows: # In[ ]: countries['France':'Netherlands'] #
# NOTE: Unlike slicing in numpy, the end label is **included**. #
# So as a summary, `[]` provides the following convenience shortcuts: # # - Series: selecting a label: `s[label]` # - DataFrame: selecting a single or multiple columns: `df['col']` or `df[['col1', 'col2']]` # - DataFrame: slicing the rows: `df['row_label1':'row_label2']` or `df[mask]` # ### Systematic indexing with `loc` and `iloc` # When using `[]` like above, you can only select from one axis at once (rows or columns, not both). For more advanced indexing, you have some extra attributes: # # * `loc`: selection by label # * `iloc`: selection by position # # These methods index the different dimensions of the frame: # # * `df.loc[row_indexer, column_indexer]` # * `df.iloc[row_indexer, column_indexer]` # Selecting a single element: # In[ ]: countries.loc['Germany', 'area'] # But the row or column indexer can also be a list, slice, boolean array, .. # In[ ]: countries.loc['France':'Germany', ['area', 'population']] # --- # Selecting by position with `iloc` works similar as indexing numpy arrays: # In[ ]: countries.iloc[0:2,1:3] # The different indexing methods can also be used to assign data: # In[ ]: countries2 = countries.copy() countries2.loc['Belgium':'Germany', 'population'] = 10 # In[ ]: countries2 # ## Boolean indexing (filtering) # Often, you want to select rows based on a certain condition. This can be done with 'boolean indexing' (like a where clause in SQL). # # The indexer (or boolean mask) should be 1-dimensional and the same length as the thing being indexed. # In[ ]: countries['area'] > 100000 # In[ ]: countries[countries['area'] > 100000] # --- # #
# EXERCISE: Add a column `density` with the population density (note: population column is expressed in millions) #
# In[ ]: #
# EXERCISE: Select the capital and the population column of those countries where the density is larger than 300 #
# In[ ]: #
# EXERCISE: Add a column 'density_ratio' with the ratio of the density to the mean density #
# In[ ]: #
# EXERCISE: Change the capital of the UK to Cambridge #
# In[ ]: #
# EXERCISE: Select all countries whose population density is between 100 and 300 people/kmĀ² #
# In[ ]: # ## Some other useful methods: `isin` and string methods # The `isin` method of Series is very useful to select rows that may contain certain values: # In[ ]: s = countries['capital'] # In[ ]: get_ipython().run_line_magic('pinfo', 's.isin') # In[ ]: s.isin(['Berlin', 'London']) # This can then be used to filter the dataframe with boolean indexing: # In[ ]: countries[countries['capital'].isin(['Berlin', 'London'])] # Let's say we want to select all data for which the capital starts with a 'B'. In Python, when having a string, we could use the `startswith` method: # In[ ]: 'Berlin'.startswith('B') # In pandas, these are available on a Series through the `str` namespace: # In[ ]: countries['capital'].str.startswith('B') # For an overview of all string methods, see: http://pandas.pydata.org/pandas-docs/stable/api.html#string-handling #
# EXERCISE: Select all countries that have capital names with more than 7 characters #
# In[ ]: #
# EXERCISE: Select all countries that have capital names that contain the character sequence 'am' #
# In[ ]: # ## Pitfall: chained indexing (and the 'SettingWithCopyWarning') # In[ ]: countries.loc['Belgium', 'capital'] = 'Ghent' # In[ ]: countries # In[ ]: countries['capital']['Belgium'] = 'Antwerp' # In[ ]: countries # In[ ]: countries[countries['capital'] == 'Antwerp']['capital'] = 'Brussels' # In[ ]: countries # How to avoid this? # # * Use `loc` instead of chained indexing if possible! # * Or `copy` explicitly if you don't want to change the original data.