import pandas as pd
import numpy as np
import sys
%matplotlib inline
print('Python version ' + sys.version)
print('Pandas version ' + pd.__version__)
Python version 3.7.4 (default, Aug 9 2019, 18:34:13) [MSC v.1915 64 bit (AMD64)] Pandas version 1.3.5
# Initial dataframe
df = pd.DataFrame({'group1' : ["a","b","a","a","b","c","c","c","c",
"c","a","a","a","b","b","b","b"],
'group2' : [1,2,3,4,1,3,5,6,5,4,1,2,3,4,3,2,1],
'value' : ["apple","pear","orange","apple",
"banana","durian","lemon","lime",
"raspberry","durian","peach","nectarine",
"banana","lemon","guava","blackberry","grape"]})
df
group1 | group2 | value | |
---|---|---|---|
0 | a | 1 | apple |
1 | b | 2 | pear |
2 | a | 3 | orange |
3 | a | 4 | apple |
4 | b | 1 | banana |
5 | c | 3 | durian |
6 | c | 5 | lemon |
7 | c | 6 | lime |
8 | c | 5 | raspberry |
9 | c | 4 | durian |
10 | a | 1 | peach |
11 | a | 2 | nectarine |
12 | a | 3 | banana |
13 | b | 4 | lemon |
14 | b | 3 | guava |
15 | b | 2 | blackberry |
16 | b | 1 | grape |
# We don't simply want to select random rows from df
# We want to first group df by (group1 & group2) then select random rows
from random import choice
# First create the group
grouped = df.groupby(['group1','group2'])
grouped.size()
#Notice that group (a,1) has two posibilities
#Notice that group (a,2) has one posibilities
#This means that if we select a random sample from group (a,1) we will get either "apple" or "peach"
#This means that if we select a random sample from group (a,2) we will always get "nectarine"
group1 group2 a 1 2 2 1 3 2 4 1 b 1 2 2 2 3 1 4 1 c 3 1 4 1 5 2 6 1 dtype: int64
#df.loc[select a random record from each group]
df.loc[[choice(x) for x in grouped.groups.values()]]
group1 | group2 | value | |
---|---|---|---|
10 | a | 1 | peach |
11 | a | 2 | nectarine |
12 | a | 3 | banana |
3 | a | 4 | apple |
16 | b | 1 | grape |
15 | b | 2 | blackberry |
14 | b | 3 | guava |
13 | b | 4 | lemon |
5 | c | 3 | durian |
9 | c | 4 | durian |
6 | c | 5 | lemon |
7 | c | 6 | lime |
df = pd.DataFrame(data=['abcdef']*10, columns=['text'])
df
text | |
---|---|
0 | abcdef |
1 | abcdef |
2 | abcdef |
3 | abcdef |
4 | abcdef |
5 | abcdef |
6 | abcdef |
7 | abcdef |
8 | abcdef |
9 | abcdef |
# Select the first 2 characters of each row
df['text'].apply(lambda x: x[:2])
0 ab 1 ab 2 ab 3 ab 4 ab 5 ab 6 ab 7 ab 8 ab 9 ab Name: text, dtype: object
d = {'Dates':[pd.Timestamp('2013-01-02'),
pd.Timestamp('2013-01-03'),
pd.Timestamp('2013-01-04')],
'Num1':[1,2,3],
'Num2':[-1,-2,-3]}
df = pd.DataFrame(data=d)
df
Dates | Num1 | Num2 | |
---|---|---|---|
0 | 2013-01-02 | 1 | -1 |
1 | 2013-01-03 | 2 | -2 |
2 | 2013-01-04 | 3 | -3 |
# where all values in column "Num1" are positive
positive = df['Num1'] > 0
# where values in column "Num2" is equal to -1
negativeOne = df['Num2'] == -1
# where values in the column "Dates" are in (1/2/2013 or 1/20/2013)
Dates = df['Dates'].isin(['2013-01-02','2013-01-20'])
df[positive & negativeOne & Dates]
Dates | Num1 | Num2 | |
---|---|---|---|
0 | 2013-01-02 | 1 | -1 |
df = pd.DataFrame({'col1':['minus','minus','positive','nan'],
'col2':[10,20,30,40],
'col3':[-10,-20,30,np.nan]
})
df
col1 | col2 | col3 | |
---|---|---|---|
0 | minus | 10 | -10.0 |
1 | minus | 20 | -20.0 |
2 | positive | 30 | 30.0 |
3 | nan | 40 | NaN |
# Method 1
df.groupby('col1').apply(lambda x: x.max())
col1 | col2 | col3 | |
---|---|---|---|
col1 | |||
minus | minus | 20 | -10.0 |
nan | nan | 40 | NaN |
positive | positive | 30 | 30.0 |
# Method 2
df.groupby('col1').agg('max')
col2 | col3 | |
---|---|---|
col1 | ||
minus | 20 | -10.0 |
nan | 40 | NaN |
positive | 30 | 30.0 |
df = pd.DataFrame({'group1' : ["a","b","a","a","b","c","c","c","c",
"c","a","a","a","b","b","b","b"],
'value' : [1,2,3,4,1,3,5,6,5,4,1,2,3,4,3,2,1],
'group2' : ["apple","pear","orange","apple",
"banana","durian","lemon","lime",
"raspberry","durian","peach","nectarine",
"banana","lemon","guava","blackberry","grape"]})
df = df.set_index(['group1','group2'])
df
value | ||
---|---|---|
group1 | group2 | |
a | apple | 1 |
b | pear | 2 |
a | orange | 3 |
apple | 4 | |
b | banana | 1 |
c | durian | 3 |
lemon | 5 | |
lime | 6 | |
raspberry | 5 | |
durian | 4 | |
a | peach | 1 |
nectarine | 2 | |
banana | 3 | |
b | lemon | 4 |
guava | 3 | |
blackberry | 2 | |
grape | 1 |
df.xs('a', level='group1')
value | |
---|---|
group2 | |
apple | 1 |
orange | 3 |
apple | 4 |
peach | 1 |
nectarine | 2 |
banana | 3 |
df = pd.DataFrame({"Name":["Alice", "Bob", "Mallory", "Mallory", "Bob" , "Mallory"] ,
"City":["Seattle", "Seattle", "Portland", "Seattle", "Seattle", "Portland"]}
)
df
Name | City | |
---|---|---|
0 | Alice | Seattle |
1 | Bob | Seattle |
2 | Mallory | Portland |
3 | Mallory | Seattle |
4 | Bob | Seattle |
5 | Mallory | Portland |
group = df.groupby(['City','Name'])
s = group.agg('size')
s.add_suffix('_size').reset_index()
City | Name | 0 | |
---|---|---|---|
0 | Portland_size | Mallory_size | 2 |
1 | Seattle_size | Alice_size | 1 |
2 | Seattle_size | Bob_size | 2 |
3 | Seattle_size | Mallory_size | 1 |
This tutorial was created by HEDARO