import pandas as pd
import numpy as np
# Sample test scores for seven students in two classes, written as compact
# (student, class, score) tuples.
_score_rows = [
    ('AJ', 'A', 9),
    ('Paul', 'A', 8),
    ('Raymond', 'A', 7),
    ('Jenny', 'B', 5),
    ('Pete', 'B', 4),
    ('Colin', 'B', 6),
    ('Sarah', 'B', 4),
]
# Expand every tuple into a record dict keyed by column name
class_data = [
    {'student': name, 'class': klass, 'score': points}
    for name, klass, points in _score_rows
]
df = pd.DataFrame(class_data)
# Display the dataframe: one row per student
df
| | class | score | student |
|---|---|---|---|
| 0 | A | 9 | AJ |
| 1 | A | 8 | Paul |
| 2 | A | 7 | Raymond |
| 3 | B | 5 | Jenny |
| 4 | B | 4 | Pete |
| 5 | B | 6 | Colin |
| 6 | B | 4 | Sarah |
7 rows × 3 columns
# Mean score across all students, regardless of class
df['score'].mean()
6.1428571428571432
# Build a boolean mask: True for rows whose 'class' is 'A', False otherwise.
# Used as an index, this mask selects only the students from class A.
df['class'] == 'A'
0     True
1     True
2     True
3    False
4    False
5    False
6    False
Name: class, dtype: bool
# Index the dataframe with the boolean mask to keep only class A's rows
df[df['class'] == 'A']
| | class | score | student |
|---|---|---|---|
| 0 | A | 9 | AJ |
| 1 | A | 8 | Paul |
| 2 | A | 7 | Raymond |
3 rows × 3 columns
# Instead of filtering one class at a time, group the rows by the value
# of the 'class' column so every class can be summarised in one step
group = df.groupby(by='class')
# groupby() returns a DataFrameGroupBy object rather than a DataFrame
type(group)
pandas.core.groupby.DataFrameGroupBy
# Average the numeric columns within each class. numeric_only=True leaves
# out the string 'student' column; without it pandas 2.0+ raises TypeError
# when it tries to average the names.
class_mean = group.mean(numeric_only=True)
# The aggregated result is an ordinary DataFrame
type(class_mean)
pandas.core.frame.DataFrame
# Display the per-class mean scores
class_mean
| class | score |
|---|---|
| A | 8.00 |
| B | 4.75 |
2 rows × 1 columns
# Next, select the 'score' column of each group and use .aggregate to
# compute several statistics on it at once. Passing a list of function
# names yields one output column per statistic, named after the function.
# (Passing a {name: func} dict on a single column was deprecated in
# pandas 0.20 and raises SpecificationError since pandas 1.0.)
class_info = group['score'].aggregate(['sum', 'mean', 'std'])
# The result is a DataFrame
type(class_info)
pandas.core.frame.DataFrame
# Display the per-class summary statistics for 'score'
class_info
| class | std | sum | mean |
|---|---|---|---|
| A | 1.000000 | 24 | 8.00 |
| B | 0.957427 | 19 | 4.75 |
2 rows × 3 columns