Mark Santcroos, Department of Human Genetics, Leiden University Medical Center
Examples and ideas taken from: Jupyter Documentation
pandas is a Python package aiming to provide
data structures designed to make working with
data both easy and intuitive.
For R users, DataFrame provides everything that R’s data.frame provides and much more.
pandas is built on top of NumPy and is intended to integrate well within a scientific computing environment with many other 3rd party libraries.
import pandas as pd
import numpy as np
pd.__version__
'2.1.3'
# Create Series with missing data
s = pd.Series([1,3,5,np.nan,6,8])
s
0 1.0 1 3.0 2 5.0 3 NaN 4 6.0 5 8.0 dtype: float64
# Create DatetimeIndex for 6 days
dates = pd.date_range('20170901', periods=6)
dates
DatetimeIndex(['2017-09-01', '2017-09-02', '2017-09-03', '2017-09-04', '2017-09-05', '2017-09-06'], dtype='datetime64[ns]', freq='D')
# Create 6x4 NP array with random values
ran_values = np.random.randn(6,4)
ran_values
array([[ 0.44057999, -0.14813443, -0.64273978, -0.79652794], [ 0.9941771 , -0.90039051, 0.45449045, 2.17529027], [-0.41381434, 1.21695023, -1.03746123, 1.08426416], [-0.51187413, -0.33306356, -1.16664576, -1.1092606 ], [ 0.79378938, 0.33526963, 0.2428412 , 0.80630805], [-0.38244712, 0.55489813, 1.66328568, -0.47015855]])
df = pd.DataFrame(ran_values, columns=list('ABDC'))
df
A | B | D | C | |
---|---|---|---|---|
0 | 0.440580 | -0.148134 | -0.642740 | -0.796528 |
1 | 0.994177 | -0.900391 | 0.454490 | 2.175290 |
2 | -0.413814 | 1.216950 | -1.037461 | 1.084264 |
3 | -0.511874 | -0.333064 | -1.166646 | -1.109261 |
4 | 0.793789 | 0.335270 | 0.242841 | 0.806308 |
5 | -0.382447 | 0.554898 | 1.663286 | -0.470159 |
df.set_index(dates, inplace=True)
df
A | B | D | C | |
---|---|---|---|---|
2017-09-01 | 0.440580 | -0.148134 | -0.642740 | -0.796528 |
2017-09-02 | 0.994177 | -0.900391 | 0.454490 | 2.175290 |
2017-09-03 | -0.413814 | 1.216950 | -1.037461 | 1.084264 |
2017-09-04 | -0.511874 | -0.333064 | -1.166646 | -1.109261 |
2017-09-05 | 0.793789 | 0.335270 | 0.242841 | 0.806308 |
2017-09-06 | -0.382447 | 0.554898 | 1.663286 | -0.470159 |
# Create DataFrame by using a dict of series-like objects.
df2 = pd.DataFrame({ 'A' : 1.,
'B' : pd.Timestamp('20170920'),
'C' : pd.Series(1, index=list(range(4)), dtype='float32'),
'D' : np.array([3] * 4, dtype='int32'),
'E' : pd.Categorical(["LUMC","EMC","LUMC","EMC"]),
'F' : 'researcher'
})
df2
A | B | C | D | E | F | |
---|---|---|---|---|---|---|
0 | 1.0 | 2017-09-20 | 1.0 | 3 | LUMC | researcher |
1 | 1.0 | 2017-09-20 | 1.0 | 3 | EMC | researcher |
2 | 1.0 | 2017-09-20 | 1.0 | 3 | LUMC | researcher |
3 | 1.0 | 2017-09-20 | 1.0 | 3 | EMC | researcher |
df2.dtypes
A float64 B datetime64[s] C float32 D int32 E category F object dtype: object
# df.<TAB> shows attributes and column names
df.head()
A | B | D | C | |
---|---|---|---|---|
2017-09-01 | 0.440580 | -0.148134 | -0.642740 | -0.796528 |
2017-09-02 | 0.994177 | -0.900391 | 0.454490 | 2.175290 |
2017-09-03 | -0.413814 | 1.216950 | -1.037461 | 1.084264 |
2017-09-04 | -0.511874 | -0.333064 | -1.166646 | -1.109261 |
2017-09-05 | 0.793789 | 0.335270 | 0.242841 | 0.806308 |
df.tail(2)
A | B | D | C | |
---|---|---|---|---|
2017-09-05 | 0.793789 | 0.335270 | 0.242841 | 0.806308 |
2017-09-06 | -0.382447 | 0.554898 | 1.663286 | -0.470159 |
df.index
DatetimeIndex(['2017-09-01', '2017-09-02', '2017-09-03', '2017-09-04', '2017-09-05', '2017-09-06'], dtype='datetime64[ns]', freq='D')
df.columns
Index(['A', 'B', 'D', 'C'], dtype='object')
df.values
array([[ 0.44057999, -0.14813443, -0.64273978, -0.79652794], [ 0.9941771 , -0.90039051, 0.45449045, 2.17529027], [-0.41381434, 1.21695023, -1.03746123, 1.08426416], [-0.51187413, -0.33306356, -1.16664576, -1.1092606 ], [ 0.79378938, 0.33526963, 0.2428412 , 0.80630805], [-0.38244712, 0.55489813, 1.66328568, -0.47015855]])
df.describe()
A | B | D | C | |
---|---|---|---|---|
count | 6.000000 | 6.000000 | 6.000000 | 6.000000 |
mean | 0.153402 | 0.120922 | -0.081038 | 0.281653 |
std | 0.670959 | 0.742581 | 1.081032 | 1.278096 |
min | -0.511874 | -0.900391 | -1.166646 | -1.109261 |
25% | -0.405973 | -0.286831 | -0.938781 | -0.714936 |
50% | 0.029066 | 0.093568 | -0.199949 | 0.168075 |
75% | 0.705487 | 0.499991 | 0.401578 | 1.014775 |
max | 0.994177 | 1.216950 | 1.663286 | 2.175290 |
df.T
2017-09-01 | 2017-09-02 | 2017-09-03 | 2017-09-04 | 2017-09-05 | 2017-09-06 | |
---|---|---|---|---|---|---|
A | 0.440580 | 0.994177 | -0.413814 | -0.511874 | 0.793789 | -0.382447 |
B | -0.148134 | -0.900391 | 1.216950 | -0.333064 | 0.335270 | 0.554898 |
D | -0.642740 | 0.454490 | -1.037461 | -1.166646 | 0.242841 | 1.663286 |
C | -0.796528 | 2.175290 | 1.084264 | -1.109261 | 0.806308 | -0.470159 |
# Sort on axis
df.sort_index(axis=1)
A | B | C | D | |
---|---|---|---|---|
2017-09-01 | 0.440580 | -0.148134 | -0.796528 | -0.642740 |
2017-09-02 | 0.994177 | -0.900391 | 2.175290 | 0.454490 |
2017-09-03 | -0.413814 | 1.216950 | 1.084264 | -1.037461 |
2017-09-04 | -0.511874 | -0.333064 | -1.109261 | -1.166646 |
2017-09-05 | 0.793789 | 0.335270 | 0.806308 | 0.242841 |
2017-09-06 | -0.382447 | 0.554898 | -0.470159 | 1.663286 |
# Sort by value
df.sort_values(by='B')
A | B | D | C | |
---|---|---|---|---|
2017-09-02 | 0.994177 | -0.900391 | 0.454490 | 2.175290 |
2017-09-04 | -0.511874 | -0.333064 | -1.166646 | -1.109261 |
2017-09-01 | 0.440580 | -0.148134 | -0.642740 | -0.796528 |
2017-09-05 | 0.793789 | 0.335270 | 0.242841 | 0.806308 |
2017-09-06 | -0.382447 | 0.554898 | 1.663286 | -0.470159 |
2017-09-03 | -0.413814 | 1.216950 | -1.037461 | 1.084264 |
# Select column, which returns a series
df['A']
2017-09-01 0.440580 2017-09-02 0.994177 2017-09-03 -0.413814 2017-09-04 -0.511874 2017-09-05 0.793789 2017-09-06 -0.382447 Freq: D, Name: A, dtype: float64
# Row based
df[1:4]
A | B | D | C | |
---|---|---|---|---|
2017-09-02 | 0.994177 | -0.900391 | 0.454490 | 2.175290 |
2017-09-03 | -0.413814 | 1.216950 | -1.037461 | 1.084264 |
2017-09-04 | -0.511874 | -0.333064 | -1.166646 | -1.109261 |
# Or index based
df['20170902':'20170904']
A | B | D | C | |
---|---|---|---|---|
2017-09-02 | 0.994177 | -0.900391 | 0.454490 | 2.175290 |
2017-09-03 | -0.413814 | 1.216950 | -1.037461 | 1.084264 |
2017-09-04 | -0.511874 | -0.333064 | -1.166646 | -1.109261 |
# Cross section using a label
df.loc['2017-09-02']
A 0.994177 B -0.900391 D 0.454490 C 2.175290 Name: 2017-09-02 00:00:00, dtype: float64
# Multi access selection based on label
df.loc[:,['A','B']]
A | B | |
---|---|---|
2017-09-01 | 0.440580 | -0.148134 |
2017-09-02 | 0.994177 | -0.900391 |
2017-09-03 | -0.413814 | 1.216950 |
2017-09-04 | -0.511874 | -0.333064 |
2017-09-05 | 0.793789 | 0.335270 |
2017-09-06 | -0.382447 | 0.554898 |
# Multi dimension label slicing
df.loc['20170902':'20170904',['B','C']]
B | C | |
---|---|---|
2017-09-02 | -0.900391 | 2.175290 |
2017-09-03 | 1.216950 | 1.084264 |
2017-09-04 | -0.333064 | -1.109261 |
# Reduced dimension of return object for single rows
df.loc['20170902',['A','B']]
A 0.994177 B -0.900391 Name: 2017-09-02 00:00:00, dtype: float64
# Scalar values
df.loc['20170902','A']
0.9941770977562511
The semantics closely follow Python and NumPy slicing.
# Row
df.iloc[3]
A -0.511874 B -0.333064 D -1.166646 C -1.109261 Name: 2017-09-04 00:00:00, dtype: float64
# Multi dimension
df.iloc[3:5,2:4]
D | C | |
---|---|---|
2017-09-04 | -1.166646 | -1.109261 |
2017-09-05 | 0.242841 | 0.806308 |
# Select rows only
df.iloc[1:3,:]
A | B | D | C | |
---|---|---|---|---|
2017-09-02 | 0.994177 | -0.900391 | 0.454490 | 2.175290 |
2017-09-03 | -0.413814 | 1.216950 | -1.037461 | 1.084264 |
# Select columns only
df.iloc[:,1:3]
B | D | |
---|---|---|
2017-09-01 | -0.148134 | -0.642740 |
2017-09-02 | -0.900391 | 0.454490 |
2017-09-03 | 1.216950 | -1.037461 |
2017-09-04 | -0.333064 | -1.166646 |
2017-09-05 | 0.335270 | 0.242841 |
2017-09-06 | 0.554898 | 1.663286 |
df.iloc[1,1]
-0.9003905086095164
# Using a single column’s values to select data.
df[df.A > 0]
A | B | D | C | |
---|---|---|---|---|
2017-09-01 | 0.440580 | -0.148134 | -0.642740 | -0.796528 |
2017-09-02 | 0.994177 | -0.900391 | 0.454490 | 2.175290 |
2017-09-05 | 0.793789 | 0.335270 | 0.242841 | 0.806308 |
# Selecting values from a DataFrame where a boolean condition is met.
df[df < 0]
A | B | D | C | |
---|---|---|---|---|
2017-09-01 | NaN | -0.148134 | -0.642740 | -0.796528 |
2017-09-02 | NaN | -0.900391 | NaN | NaN |
2017-09-03 | -0.413814 | NaN | -1.037461 | NaN |
2017-09-04 | -0.511874 | -0.333064 | -1.166646 | -1.109261 |
2017-09-05 | NaN | NaN | NaN | NaN |
2017-09-06 | -0.382447 | NaN | NaN | -0.470159 |
# Create new copy and add extra column
df3 = df.copy()
df3['E'] = ['one', 'one','two','three','four','three']
df3
A | B | D | C | E | |
---|---|---|---|---|---|
2017-09-01 | 0.440580 | -0.148134 | -0.642740 | -0.796528 | one |
2017-09-02 | 0.994177 | -0.900391 | 0.454490 | 2.175290 | one |
2017-09-03 | -0.413814 | 1.216950 | -1.037461 | 1.084264 | two |
2017-09-04 | -0.511874 | -0.333064 | -1.166646 | -1.109261 | three |
2017-09-05 | 0.793789 | 0.335270 | 0.242841 | 0.806308 | four |
2017-09-06 | -0.382447 | 0.554898 | 1.663286 | -0.470159 | three |
# Use isin() filtering
df3[df3['E'].isin(['two','four'])]
A | B | D | C | E | |
---|---|---|---|---|---|
2017-09-03 | -0.413814 | 1.21695 | -1.037461 | 1.084264 | two |
2017-09-05 | 0.793789 | 0.33527 | 0.242841 | 0.806308 | four |
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20170902', periods=6))
s1
2017-09-02 1 2017-09-03 2 2017-09-04 3 2017-09-05 4 2017-09-06 5 2017-09-07 6 Freq: D, dtype: int64
# Add column F, align by original index
df['F'] = s1
df
A | B | D | C | F | |
---|---|---|---|---|---|
2017-09-01 | 0.440580 | -0.148134 | -0.642740 | -0.796528 | NaN |
2017-09-02 | 0.994177 | -0.900391 | 0.454490 | 2.175290 | 1.0 |
2017-09-03 | -0.413814 | 1.216950 | -1.037461 | 1.084264 | 2.0 |
2017-09-04 | -0.511874 | -0.333064 | -1.166646 | -1.109261 | 3.0 |
2017-09-05 | 0.793789 | 0.335270 | 0.242841 | 0.806308 | 4.0 |
2017-09-06 | -0.382447 | 0.554898 | 1.663286 | -0.470159 | 5.0 |
# Setting values by label
df.at['20170902','A'] = 0
df
A | B | D | C | F | |
---|---|---|---|---|---|
2017-09-01 | 0.440580 | -0.148134 | -0.642740 | -0.796528 | NaN |
2017-09-02 | 0.000000 | -0.900391 | 0.454490 | 2.175290 | 1.0 |
2017-09-03 | -0.413814 | 1.216950 | -1.037461 | 1.084264 | 2.0 |
2017-09-04 | -0.511874 | -0.333064 | -1.166646 | -1.109261 | 3.0 |
2017-09-05 | 0.793789 | 0.335270 | 0.242841 | 0.806308 | 4.0 |
2017-09-06 | -0.382447 | 0.554898 | 1.663286 | -0.470159 | 5.0 |
# Set value at two dimensional location
df.iat[0,1] = 0
df
A | B | D | C | F | |
---|---|---|---|---|---|
2017-09-01 | 0.440580 | 0.000000 | -0.642740 | -0.796528 | NaN |
2017-09-02 | 0.000000 | -0.900391 | 0.454490 | 2.175290 | 1.0 |
2017-09-03 | -0.413814 | 1.216950 | -1.037461 | 1.084264 | 2.0 |
2017-09-04 | -0.511874 | -0.333064 | -1.166646 | -1.109261 | 3.0 |
2017-09-05 | 0.793789 | 0.335270 | 0.242841 | 0.806308 | 4.0 |
2017-09-06 | -0.382447 | 0.554898 | 1.663286 | -0.470159 | 5.0 |
# Setting a column based on a numpy array
df.loc[:,'D'] = np.array([5] * len(df))
df
A | B | D | C | F | |
---|---|---|---|---|---|
2017-09-01 | 0.440580 | 0.000000 | 5.0 | -0.796528 | NaN |
2017-09-02 | 0.000000 | -0.900391 | 5.0 | 2.175290 | 1.0 |
2017-09-03 | -0.413814 | 1.216950 | 5.0 | 1.084264 | 2.0 |
2017-09-04 | -0.511874 | -0.333064 | 5.0 | -1.109261 | 3.0 |
2017-09-05 | 0.793789 | 0.335270 | 5.0 | 0.806308 | 4.0 |
2017-09-06 | -0.382447 | 0.554898 | 5.0 | -0.470159 | 5.0 |
df2 = df.copy()
df2[df2 > 0] = -df2
df2
A | B | D | C | F | |
---|---|---|---|---|---|
2017-09-01 | -0.440580 | 0.000000 | -5.0 | -0.796528 | NaN |
2017-09-02 | 0.000000 | -0.900391 | -5.0 | -2.175290 | -1.0 |
2017-09-03 | -0.413814 | -1.216950 | -5.0 | -1.084264 | -2.0 |
2017-09-04 | -0.511874 | -0.333064 | -5.0 | -1.109261 | -3.0 |
2017-09-05 | -0.793789 | -0.335270 | -5.0 | -0.806308 | -4.0 |
2017-09-06 | -0.382447 | -0.554898 | -5.0 | -0.470159 | -5.0 |
# reindex (copy) a subset of the data and add an empty column E
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
# Set E to 1 for first two rows
df1.loc[dates[0]:dates[1],'E'] = 1
df1
A | B | D | C | F | E | |
---|---|---|---|---|---|---|
2017-09-01 | 0.440580 | 0.000000 | 5.0 | -0.796528 | NaN | 1.0 |
2017-09-02 | 0.000000 | -0.900391 | 5.0 | 2.175290 | 1.0 | 1.0 |
2017-09-03 | -0.413814 | 1.216950 | 5.0 | 1.084264 | 2.0 | NaN |
2017-09-04 | -0.511874 | -0.333064 | 5.0 | -1.109261 | 3.0 | NaN |
# Drop all rows that have any unknown values
df1.dropna(how='any')
A | B | D | C | F | E | |
---|---|---|---|---|---|---|
2017-09-02 | 0.0 | -0.900391 | 5.0 | 2.17529 | 1.0 | 1.0 |
# Replace NA with value
df1.fillna(value=42)
A | B | D | C | F | E | |
---|---|---|---|---|---|---|
2017-09-01 | 0.440580 | 0.000000 | 5.0 | -0.796528 | 42.0 | 1.0 |
2017-09-02 | 0.000000 | -0.900391 | 5.0 | 2.175290 | 1.0 | 1.0 |
2017-09-03 | -0.413814 | 1.216950 | 5.0 | 1.084264 | 2.0 | 42.0 |
2017-09-04 | -0.511874 | -0.333064 | 5.0 | -1.109261 | 3.0 | 42.0 |
# Show the boolean mask
pd.isnull(df1)
A | B | D | C | F | E | |
---|---|---|---|---|---|---|
2017-09-01 | False | False | False | False | True | False |
2017-09-02 | False | False | False | False | False | False |
2017-09-03 | False | False | False | False | False | True |
2017-09-04 | False | False | False | False | False | True |
# Mean per column
df.mean() # similar to axis=0
A -0.012294 B 0.145611 D 5.000000 C 0.281653 F 3.000000 dtype: float64
# Mean per row
df.mean(axis=1)
2017-09-01 1.161013 2017-09-02 1.454980 2017-09-03 1.777480 2017-09-04 1.209160 2017-09-05 2.187073 2017-09-06 1.940458 Freq: D, dtype: float64
# Create my own function that returns the negated value
def my_func(val):
    """Return the negation of *val* (works element-wise when used with DataFrame.apply)."""
    negated = val * -1
    return negated
# Apply my function to all values
df.apply(my_func)
A | B | D | C | F | |
---|---|---|---|---|---|
2017-09-01 | -0.440580 | -0.000000 | -5.0 | 0.796528 | NaN |
2017-09-02 | -0.000000 | 0.900391 | -5.0 | -2.175290 | -1.0 |
2017-09-03 | 0.413814 | -1.216950 | -5.0 | -1.084264 | -2.0 |
2017-09-04 | 0.511874 | 0.333064 | -5.0 | 1.109261 | -3.0 |
2017-09-05 | -0.793789 | -0.335270 | -5.0 | -0.806308 | -4.0 |
2017-09-06 | 0.382447 | -0.554898 | -5.0 | 0.470159 | -5.0 |
s = pd.Series(np.random.randint(0, 7, size=10))
s
0 2 1 0 2 6 3 5 4 1 5 0 6 6 7 1 8 5 9 3 dtype: int64
s.value_counts()
0 2 6 2 5 2 1 2 2 1 3 1 Name: count, dtype: int64
# Create 10x4 table with random numbers
df = pd.DataFrame(np.random.randn(10, 4))
df
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | -1.032198 | -0.887591 | -1.463087 | -2.050719 |
1 | 0.544080 | -1.138772 | -0.902064 | -0.911823 |
2 | 0.541855 | -1.523351 | -0.209272 | -1.610346 |
3 | -0.627728 | 1.108326 | -1.050366 | -0.566356 |
4 | -1.687683 | 1.139958 | -1.611219 | -1.009924 |
5 | -0.646679 | 1.548301 | 0.676723 | -0.693801 |
6 | -0.094278 | -0.076768 | -0.493994 | 1.250234 |
7 | -1.957639 | 0.148645 | -1.016835 | -0.541509 |
8 | -0.349715 | -0.890640 | 1.329320 | -0.174841 |
9 | -0.343248 | 0.883219 | -1.654050 | 0.349884 |
# Split them into 3 chunks (row-based)
chunks = [df[:3], df[3:7], df[7:]]
chunks
[ 0 1 2 3 0 -1.032198 -0.887591 -1.463087 -2.050719 1 0.544080 -1.138772 -0.902064 -0.911823 2 0.541855 -1.523351 -0.209272 -1.610346, 0 1 2 3 3 -0.627728 1.108326 -1.050366 -0.566356 4 -1.687683 1.139958 -1.611219 -1.009924 5 -0.646679 1.548301 0.676723 -0.693801 6 -0.094278 -0.076768 -0.493994 1.250234, 0 1 2 3 7 -1.957639 0.148645 -1.016835 -0.541509 8 -0.349715 -0.890640 1.329320 -0.174841 9 -0.343248 0.883219 -1.654050 0.349884]
# add them back together
pd.concat(chunks)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | -1.032198 | -0.887591 | -1.463087 | -2.050719 |
1 | 0.544080 | -1.138772 | -0.902064 | -0.911823 |
2 | 0.541855 | -1.523351 | -0.209272 | -1.610346 |
3 | -0.627728 | 1.108326 | -1.050366 | -0.566356 |
4 | -1.687683 | 1.139958 | -1.611219 | -1.009924 |
5 | -0.646679 | 1.548301 | 0.676723 | -0.693801 |
6 | -0.094278 | -0.076768 | -0.493994 | 1.250234 |
7 | -1.957639 | 0.148645 | -1.016835 | -0.541509 |
8 | -0.349715 | -0.890640 | 1.329320 | -0.174841 |
9 | -0.343248 | 0.883219 | -1.654050 | 0.349884 |
There are many ways to combine multiple DataFrames.
# Both frames carry the key 'foo' in every row, so merging on 'key' yields the cartesian product (4 rows)
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
left
key | lval | |
---|---|---|
0 | foo | 1 |
1 | foo | 2 |
right
key | rval | |
---|---|---|
0 | foo | 4 |
1 | foo | 5 |
pd.merge(left, right, on='key')
key | lval | rval | |
---|---|---|---|
0 | foo | 1 | 4 |
1 | foo | 1 | 5 |
2 | foo | 2 | 4 |
3 | foo | 2 | 5 |
# Here the keys are unique per frame, so merging on 'key' gives a simple one-to-one join (2 rows)
left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})
left
key | lval | |
---|---|---|
0 | foo | 1 |
1 | bar | 2 |
right
key | rval | |
---|---|---|
0 | foo | 4 |
1 | bar | 5 |
pd.merge(left, right, on='key')
key | lval | rval | |
---|---|---|---|
0 | foo | 1 | 4 |
1 | bar | 2 | 5 |
# create a 8x4 matrix
df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])
df
A | B | C | D | |
---|---|---|---|---|
0 | 1.144167 | -1.168476 | 0.249483 | -1.734513 |
1 | -0.709550 | 0.504567 | -1.146155 | -0.405963 |
2 | -0.157156 | -1.525090 | 0.704099 | 0.647796 |
3 | -0.114077 | -0.843899 | 0.331256 | -1.461805 |
4 | -0.215538 | 1.166669 | -0.425822 | 1.092165 |
5 | -1.386906 | 0.121748 | -1.278508 | -0.763579 |
6 | -0.381231 | 0.007251 | 0.928618 | -0.541720 |
7 | 0.548590 | 0.600198 | -0.402174 | 0.411751 |
# extract a row
s = df.iloc[3]
s
A -0.114077 B -0.843899 C 0.331256 D -1.461805 Name: 3, dtype: float64
# append the extracted row at the end
# df.append(s, ignore_index=False).reindex() - not working since version 2.0.0
df = pd.concat([df, pd.DataFrame([s])], ignore_index=True)
df
A | B | C | D | |
---|---|---|---|---|
0 | 1.144167 | -1.168476 | 0.249483 | -1.734513 |
1 | -0.709550 | 0.504567 | -1.146155 | -0.405963 |
2 | -0.157156 | -1.525090 | 0.704099 | 0.647796 |
3 | -0.114077 | -0.843899 | 0.331256 | -1.461805 |
4 | -0.215538 | 1.166669 | -0.425822 | 1.092165 |
5 | -1.386906 | 0.121748 | -1.278508 | -0.763579 |
6 | -0.381231 | 0.007251 | 0.928618 | -0.541720 |
7 | 0.548590 | 0.600198 | -0.402174 | 0.411751 |
8 | -0.114077 | -0.843899 | 0.331256 | -1.461805 |
Value-based grouping in order to execute methods on the results.
# Create
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
'foo', 'bar', 'foo', 'foo'],
'B' : ['one', 'one', 'two', 'three',
'two', 'two', 'one', 'three'],
'C' : np.random.randn(8),
'D' : np.random.randn(8)})
df
A | B | C | D | |
---|---|---|---|---|
0 | foo | one | 0.631038 | -0.022213 |
1 | bar | one | -0.413756 | -0.334146 |
2 | foo | two | -1.698029 | 0.804681 |
3 | bar | three | 1.334126 | 0.163851 |
4 | foo | two | -0.122999 | 0.510452 |
5 | bar | two | -1.698212 | 0.670997 |
6 | foo | one | 0.785731 | -0.144679 |
7 | foo | three | -0.666275 | 0.038802 |
df.groupby('A').sum()
B | C | D | |
---|---|---|---|
A | |||
bar | onethreetwo | -0.777843 | 0.500702 |
foo | onetwotwoonethree | -1.070535 | 1.187045 |
df.groupby(['A','B']).sum()
C | D | ||
---|---|---|---|
A | B | ||
bar | one | -0.413756 | -0.334146 |
three | 1.334126 | 0.163851 | |
two | -1.698212 | 0.670997 | |
foo | one | 1.416769 | -0.166891 |
three | -0.666275 | 0.038802 | |
two | -1.821028 | 1.315134 |
# Create a flat table with duplicated entries
df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
'B' : ['X', 'Y', 'Z'] * 4,
'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
'D' : np.random.randn(12),
'E' : np.random.randn(12)})
df
A | B | C | D | E | |
---|---|---|---|---|---|
0 | one | X | foo | 0.120945 | 0.899031 |
1 | one | Y | foo | -1.412810 | -1.848770 |
2 | two | Z | foo | -1.253503 | 2.625549 |
3 | three | X | bar | 1.049642 | 0.150325 |
4 | one | Y | bar | -0.680348 | -0.117214 |
5 | one | Z | bar | -0.776564 | -1.251835 |
6 | two | X | foo | -0.411001 | -1.549370 |
7 | three | Y | foo | -0.635409 | -0.170142 |
8 | one | Z | foo | 0.031961 | -0.123966 |
9 | one | X | bar | -1.048441 | 0.301670 |
10 | two | Y | bar | -0.334869 | -0.240000 |
11 | three | Z | bar | 1.223787 | 1.278326 |
# Create a pivot table using A and B as the index, making C columns, and using D as the values (E is not used)
pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])
C | bar | foo | |
---|---|---|---|
A | B | ||
one | X | -1.048441 | 0.120945 |
Y | -0.680348 | -1.412810 | |
Z | -0.776564 | 0.031961 | |
three | X | 1.049642 | NaN |
Y | NaN | -0.635409 | |
Z | 1.223787 | NaN | |
two | X | NaN | -0.411001 |
Y | -0.334869 | NaN | |
Z | NaN | -1.253503 |