In [1]:

import pandas as pd
import numpy as np
import sys
%matplotlib inline

In [2]:

print('Python version ' + sys.version)
print('Pandas version ' + pd.__version__)

Python version 3.7.4 (default, Aug  9 2019, 18:34:13) [MSC v.1915 64 bit (AMD64)]
Pandas version 1.3.5

Select¶

How do I select a random sample of a group?¶

In [3]:

# Initial dataframe
df = pd.DataFrame({'group1' : ["a","b","a","a","b","c","c","c","c",
                            "c","a","a","a","b","b","b","b"],
                'group2' : [1,2,3,4,1,3,5,6,5,4,1,2,3,4,3,2,1],
                'value'  : ["apple","pear","orange","apple",
                            "banana","durian","lemon","lime",
                            "raspberry","durian","peach","nectarine",
                            "banana","lemon","guava","blackberry","grape"]})
df

Out[3]:

	group1	group2	value
0	a	1	apple
1	b	2	pear
2	a	3	orange
3	a	4	apple
4	b	1	banana
5	c	3	durian
6	c	5	lemon
7	c	6	lime
8	c	5	raspberry
9	c	4	durian
10	a	1	peach
11	a	2	nectarine
12	a	3	banana
13	b	4	lemon
14	b	3	guava
15	b	2	blackberry
16	b	1	grape

In [4]:

# We don't simply want to select random rows from df
# We want to first group df by (group1 & group2) then select random rows
from random import choice

# First create the group
grouped = df.groupby(['group1','group2'])
grouped.size()


#Notice that group (a,1) has two posibilities
#Notice that group (a,2) has one posibilities

#This means that if we select a random sample from group (a,1) we will get either "apple" or "peach"
#This means that if we select a random sample from group (a,2) we will always get "nectarine"

Out[4]:

group1  group2
a       1         2
        2         1
        3         2
        4         1
b       1         2
        2         2
        3         1
        4         1
c       3         1
        4         1
        5         2
        6         1
dtype: int64

In [5]:

#df.loc[select a random record from each group]
df.loc[[choice(x) for x in grouped.groups.values()]]

Out[5]:

	group1	group2	value
10	a	1	peach
11	a	2	nectarine
12	a	3	banana
3	a	4	apple
16	b	1	grape
15	b	2	blackberry
14	b	3	guava
13	b	4	lemon
5	c	3	durian
9	c	4	durian
6	c	5	lemon
7	c	6	lime

How do I slice each row of a column?¶

In [6]:

df = pd.DataFrame(data=['abcdef']*10, columns=['text'])
df

Out[6]:

	text
0	abcdef
1	abcdef
2	abcdef
3	abcdef
4	abcdef
5	abcdef
6	abcdef
7	abcdef
8	abcdef
9	abcdef

In [7]:

# Select the first 2 characters of each row
df['text'].apply(lambda x: x[:2])

Out[7]:

0    ab
1    ab
2    ab
3    ab
4    ab
5    ab
6    ab
7    ab
8    ab
9    ab
Name: text, dtype: object

How can I select rows of my dataframe based on a "complex" filter applied to multiple columns?¶

In [8]:

d = {'Dates':[pd.Timestamp('2013-01-02'),
              pd.Timestamp('2013-01-03'),
              pd.Timestamp('2013-01-04')],
     'Num1':[1,2,3],
     'Num2':[-1,-2,-3]}
                 

df = pd.DataFrame(data=d)
df

Out[8]:

	Dates	Num1	Num2
0	2013-01-02	1	-1
1	2013-01-03	2	-2
2	2013-01-04	3	-3

In [9]:

# where all values in column "Num1" are positive
positive = df['Num1'] > 0

# where values in column "Num2" is equal to -1
negativeOne = df['Num2'] == -1

# where values in the column "Dates" are in (1/2/2013 or 1/20/2013)
Dates = df['Dates'].isin(['2013-01-02','2013-01-20'])

df[positive & negativeOne & Dates]

Out[9]:

	Dates	Num1	Num2
0	2013-01-02	1	-1

How to get the maximum value of a group?¶

In [10]:

df = pd.DataFrame({'col1':['minus','minus','positive','nan'],
                'col2':[10,20,30,40],
                'col3':[-10,-20,30,np.nan]
                })
df

Out[10]:

	col1	col2	col3
0	minus	10	-10.0
1	minus	20	-20.0
2	positive	30	30.0
3	nan	40	NaN

In [11]:

# Method 1
df.groupby('col1').apply(lambda x: x.max())

Out[11]:

	col1	col2	col3
col1
minus	minus	20	-10.0
nan	nan	40	NaN
positive	positive	30	30.0

In [12]:

# Method 2
df.groupby('col1').agg('max')

Out[12]:

	col2	col3
col1
minus	20	-10.0
nan	40	NaN
positive	30	30.0

How to select records from one level of a multi-index data frame?¶

In [13]:

df = pd.DataFrame({'group1' : ["a","b","a","a","b","c","c","c","c",
                            "c","a","a","a","b","b","b","b"],
                'value' : [1,2,3,4,1,3,5,6,5,4,1,2,3,4,3,2,1],
                'group2'  : ["apple","pear","orange","apple",
                            "banana","durian","lemon","lime",
                            "raspberry","durian","peach","nectarine",
                            "banana","lemon","guava","blackberry","grape"]})
df = df.set_index(['group1','group2'])
df

Out[13]:

		value
group1	group2
a	apple	1
b	pear	2
a	orange	3
a	apple	4
b	banana	1
c	durian	3
	lemon	5
	lime	6
	raspberry	5
	durian	4
a	peach	1
	nectarine	2
	banana	3
b	lemon	4
	guava	3
	blackberry	2
	grape	1

In [14]:

df.xs('a', level='group1')

Out[14]:

	value
group2
apple	1
orange	3
apple	4
peach	1
nectarine	2
banana	3

How do I reset the index when the index names are the same as the column names?¶

In [15]:

df = pd.DataFrame({"Name":["Alice", "Bob", "Mallory", "Mallory", "Bob" , "Mallory"] , 
                "City":["Seattle", "Seattle", "Portland", "Seattle", "Seattle", "Portland"]}
               )
df

Out[15]:

	Name	City
0	Alice	Seattle
1	Bob	Seattle
2	Mallory	Portland
3	Mallory	Seattle
4	Bob	Seattle
5	Mallory	Portland

In [16]:

group = df.groupby(['City','Name'])
s = group.agg('size')
s.add_suffix('_size').reset_index()

Out[16]:

	City	Name	0
0	Portland_size	Mallory_size	2
1	Seattle_size	Alice_size	1
2	Seattle_size	Bob_size	2
3	Seattle_size	Mallory_size	1

This tutorial was created by HEDARO