import pandas as pd
import numpy as np
pd.__version__
'0.23.1'
df = pd.DataFrame({
'name':['john','mary','peter','jeff','bill','lisa'],
'age':[23,78,22,19,45,33],
'state':['iowa','dc','california','texas','washington','dc'],
'num_children':[2,2,0,1,2,1],
'num_pets':[0,4,0,5,0,0]
})
# sorting columns
df=df[['name','age','state','num_children','num_pets']]
df
name | age | state | num_children | num_pets | |
---|---|---|---|---|---|
0 | john | 23 | iowa | 2 | 0 |
1 | mary | 78 | dc | 2 | 4 |
2 | peter | 22 | california | 0 | 0 |
3 | jeff | 19 | texas | 1 | 5 |
4 | bill | 45 | washington | 2 | 0 |
5 | lisa | 33 | dc | 1 | 0 |
# select the first 2 rows
df.iloc[:2]
name | age | state | num_children | num_pets | |
---|---|---|---|---|---|
0 | john | 23 | iowa | 2 | 0 |
1 | mary | 78 | dc | 2 | 4 |
# select the last 2 rows
df.iloc[-2:]
name | age | state | num_children | num_pets | |
---|---|---|---|---|---|
4 | bill | 45 | washington | 2 | 0 |
5 | lisa | 33 | dc | 1 | 0 |
compare this with
iloc
above
# select rows up to and including the one
# with index=2
df.loc[:2]
name | age | state | num_children | num_pets | |
---|---|---|---|---|---|
0 | john | 23 | iowa | 2 | 0 |
1 | mary | 78 | dc | 2 | 4 |
2 | peter | 22 | california | 0 | 0 |
# by a simple numeric condition
df[df["age"] > 30]
name | age | state | num_children | num_pets | |
---|---|---|---|---|---|
1 | mary | 78 | dc | 2 | 4 |
4 | bill | 45 | washington | 2 | 0 |
5 | lisa | 33 | dc | 1 | 0 |
# comparing the value of two columns
df[ df["num_pets"] > df[ "num_children"] ]
name | age | state | num_children | num_pets | |
---|---|---|---|---|---|
1 | mary | 78 | dc | 2 | 4 |
3 | jeff | 19 | texas | 1 | 5 |
# using boolean AND
df[ (df["age"] > 40) & (df["num_pets"] > 0) ]
name | age | state | num_children | num_pets | |
---|---|---|---|---|---|
1 | mary | 78 | dc | 2 | 4 |
df[[colname for colname in df.columns if colname.startswith('n')]]
name | num_children | num_pets | |
---|---|---|---|
0 | john | 2 | 0 |
1 | mary | 2 | 4 |
2 | peter | 0 | 0 |
3 | jeff | 1 | 5 |
4 | bill | 2 | 0 |
5 | lisa | 1 | 0 |
df.drop(['age'],axis=1)
name | state | num_children | num_pets | |
---|---|---|---|---|
0 | john | iowa | 2 | 0 |
1 | mary | dc | 2 | 4 |
2 | peter | california | 0 | 0 |
3 | jeff | texas | 1 | 5 |
4 | bill | washington | 2 | 0 |
5 | lisa | dc | 1 | 0 |
df.drop(["age","num_children"],axis=1)
name | state | num_pets | |
---|---|---|---|
0 | john | iowa | 0 |
1 | mary | dc | 4 |
2 | peter | california | 0 |
3 | jeff | texas | 5 |
4 | bill | washington | 0 |
5 | lisa | dc | 0 |
Using numpy vectorized functions for numerical values
# get the mean for each of the selected columns
df[["age","num_pets","num_children"]].apply(lambda row: np.mean(row),axis=0).to_frame()
0 | |
---|---|
age | 36.666667 |
num_pets | 1.500000 |
num_children | 1.333333 |
Using numpy vectorized functions for numerical values
# sum columns age, num_pets and num_children for each row
df[["age","num_pets","num_children"]].apply(lambda row: np.sum(row),axis=1).to_frame()
0 | |
---|---|
0 | 25 |
1 | 84 |
2 | 22 |
3 | 25 |
4 | 47 |
5 | 34 |
df[["age"]].apply(lambda value: value*2)
age | |
---|---|
0 | 46 |
1 | 156 |
2 | 44 |
3 | 38 |
4 | 90 |
5 | 66 |
# certain numerical functions can also be used:
df[["age"]] * 2
age | |
---|---|
0 | 46 |
1 | 156 |
2 | 44 |
3 | 38 |
4 | 90 |
5 | 66 |
# also works for string values
df[["name"]].apply(lambda value: value.str.upper())
name | |
---|---|
0 | JOHN |
1 | MARY |
2 | PETER |
3 | JEFF |
4 | BILL |
5 | LISA |
use
apply
for DataFrame objects andmap
for Series objects
df['name'].map(lambda name: name.upper()).to_frame()
name | |
---|---|
0 | JOHN |
1 | MARY |
2 | PETER |
3 | JEFF |
4 | BILL |
5 | LISA |
# simple sum of two columns
df["pets_and_children"] = df["num_pets"] + df["num_children"]
df
name | age | state | num_children | num_pets | pets_and_children | |
---|---|---|---|---|---|---|
0 | john | 23 | iowa | 2 | 0 | 2 |
1 | mary | 78 | dc | 2 | 4 | 6 |
2 | peter | 22 | california | 0 | 0 | 0 |
3 | jeff | 19 | texas | 1 | 5 | 6 |
4 | bill | 45 | washington | 2 | 0 | 2 |
5 | lisa | 33 | dc | 1 | 0 | 1 |
df = pd.DataFrame({
'name':['john','mary','peter','jeff','bill','lisa'],
'age':[23,78,22,19,45,33],
'state':['iowa','dc','california','texas','washington','dc'],
'num_children':[2,2,0,1,2,1],
'num_pets':[0,4,0,5,0,0]
})
# sorting columns
df=df[['name','age','state','num_children','num_pets']]
df
# you can also use custom functions we used on "elementwise application"
df["name_uppercase"] = df[["name"]].apply(lambda name: name.str.upper())
df
name | age | state | num_children | num_pets | name_uppercase | |
---|---|---|---|---|---|---|
0 | john | 23 | iowa | 2 | 0 | JOHN |
1 | mary | 78 | dc | 2 | 4 | MARY |
2 | peter | 22 | california | 0 | 0 | PETER |
3 | jeff | 19 | texas | 1 | 5 | JEFF |
4 | bill | 45 | washington | 2 | 0 | BILL |
5 | lisa | 33 | dc | 1 | 0 | LISA |
df = pd.DataFrame({
'name':['john','mary','peter','jeff','bill','lisa'],
'age':[23,78,22,19,45,33],
'state':['iowa','dc','california','texas','washington','dc'],
'num_children':[2,2,0,1,2,1],
'num_pets':[0,4,0,5,0,0]
})
# sorting columns
df=df[['name','age','state','num_children','num_pets']]
df
df.reindex(np.random.permutation(df.index))
name | age | state | num_children | num_pets | |
---|---|---|---|---|---|
4 | bill | 45 | washington | 2 | 0 |
3 | jeff | 19 | texas | 1 | 5 |
2 | peter | 22 | california | 0 | 0 |
1 | mary | 78 | dc | 2 | 4 |
5 | lisa | 33 | dc | 1 | 0 |
0 | john | 23 | iowa | 2 | 0 |
for index,row in df.iterrows():
print("{0} has name: {1}".format(index,row["name"]))
0 has name: john 1 has name: mary 2 has name: peter 3 has name: jeff 4 has name: bill 5 has name: lisa
# sample 10 rows from df
random_indices = np.random.choice(df.index.values, 4, replace=False)
# iloc allows you to retrieve rows by their numeric indices
sampled_df = df.iloc[random_indices]
sampled_df
name | age | state | num_children | num_pets | |
---|---|---|---|---|---|
2 | peter | 22 | california | 0 | 0 |
3 | jeff | 19 | texas | 1 | 5 |
1 | mary | 78 | dc | 2 | 4 |
5 | lisa | 33 | dc | 1 | 0 |
# sort by age, largest first
df.sort_values("age",ascending=False )
name | age | state | num_children | num_pets | |
---|---|---|---|---|---|
1 | mary | 78 | dc | 2 | 4 |
4 | bill | 45 | washington | 2 | 0 |
5 | lisa | 33 | dc | 1 | 0 |
0 | john | 23 | iowa | 2 | 0 |
2 | peter | 22 | california | 0 | 0 |
3 | jeff | 19 | texas | 1 | 5 |
# sort by num_pets descending then sort by age ascending
df.sort_values( ["num_pets","age"], ascending=[False,True] )
name | age | state | num_children | num_pets | |
---|---|---|---|---|---|
3 | jeff | 19 | texas | 1 | 5 |
1 | mary | 78 | dc | 2 | 4 |
2 | peter | 22 | california | 0 | 0 |
0 | john | 23 | iowa | 2 | 0 |
5 | lisa | 33 | dc | 1 | 0 |
4 | bill | 45 | washington | 2 | 0 |
df = pd.DataFrame({
'name':['john','mary','peter','jeff','bill','lisa'],
'age':[23,78,22,19,12,33],
'state':['N/A','dc','california','texas','N/A','dc']
})
# sorting columns
df=df[['name','age','state']]
df
name | age | state | |
---|---|---|---|
0 | john | 23 | N/A |
1 | mary | 78 | dc |
2 | peter | 22 | california |
3 | jeff | 19 | texas |
4 | bill | 12 | N/A |
5 | lisa | 33 | dc |
def state_to_rank(state):
if state=="N/A":
return 1
else:
return 0
df['rank'] = df['state'].map(lambda x: state_to_rank(x))
df.sort_values(by=['rank','age']).drop(['rank'],axis=1).reset_index(drop=True)
name | age | state | |
---|---|---|---|
0 | jeff | 19 | texas |
1 | peter | 22 | california |
2 | lisa | 33 | dc |
3 | mary | 78 | dc |
4 | bill | 12 | N/A |
5 | john | 23 | N/A |
df[df.apply(lambda row: row['name'].startswith('j'),axis=1)]
name | age | state | num_children | num_pets | |
---|---|---|---|---|---|
0 | john | 23 | iowa | 2 | 0 |
3 | jeff | 19 | texas | 1 | 5 |
# use inplace=True if you want to mutate the current dataframe
df.rename(columns={"age":"age_years"} )
name | age_years | state | num_children | num_pets | |
---|---|---|---|---|---|
0 | john | 23 | iowa | 2 | 0 |
1 | mary | 78 | dc | 2 | 4 |
2 | peter | 22 | california | 0 | 0 |
3 | jeff | 19 | texas | 1 | 5 |
4 | bill | 45 | washington | 2 | 0 |
5 | lisa | 33 | dc | 1 | 0 |
df['num_children'].dtype
dtype('int64')
# we don't need 64 bits for num_children
df['num_children'] = df['num_children'].astype('int32')
df['num_children'].dtype
# it looks the same but takes less space
df
name | age | state | num_children | num_pets | |
---|---|---|---|---|---|
0 | john | 23 | iowa | 2 | 0 |
1 | mary | 78 | dc | 2 | 4 |
2 | peter | 22 | california | 0 | 0 |
3 | jeff | 19 | texas | 1 | 5 |
4 | bill | 45 | washington | 2 | 0 |
5 | lisa | 33 | dc | 1 | 0 |
This is done using the .isin()
method, which returns a boolean dataframe to indicate where the passed values are.
df = pd.DataFrame({
'name':['john','mary','peter','jeff','bill','lisa'],
'age':[23,78,22,19,45,33],
'state':['iowa','dc','california','texas','washington','dc'],
'num_children':[2,2,0,1,2,1],
'num_pets':[0,4,0,5,0,0]
})
# sorting columns
df=df[['name','age','state','num_children','num_pets']]
# if the method is passed a simple list, it matches
# those values anywhere in the dataframe
df.isin([2,4])
name | age | state | num_children | num_pets | |
---|---|---|---|---|---|
0 | False | False | False | True | False |
1 | False | False | False | True | True |
2 | False | False | False | False | False |
3 | False | False | False | False | False |
4 | False | False | False | True | False |
5 | False | False | False | False | False |
# you can also pass a dict or another dataframe
# as argument
df.isin({'num_pets':[4,5]})
name | age | state | num_children | num_pets | |
---|---|---|---|---|---|
0 | False | False | False | False | False |
1 | False | False | False | False | True |
2 | False | False | False | False | False |
3 | False | False | False | False | True |
4 | False | False | False | False | False |
5 | False | False | False | False | False |
# set column names and dtypes
new_df = pd.DataFrame(columns=['col_a','col_b']).astype({'col_a':'float32', 'col_b':'int8'})
# must reassign since the append method does not work in place
new_df = new_df.append({'col_a':5,'col_b':10}, ignore_index=True)
new_df = new_df.append({'col_a':1,'col_b':100}, ignore_index=True)
new_df
col_a | col_b | |
---|---|---|
0 | 5.0 | 10 |
1 | 1.0 | 100 |
new_df = pd.DataFrame(columns=['id','name'])
data_dict = [
{'id':1,'name':"john"},
{'id':2,'name':"mary"},
{'id':3,'name':"peter"}
]
# must reassign since the append method does not work in place
new_df = new_df.from_records(data_dict)
new_df
id | name | |
---|---|---|
0 | 1 | john |
1 | 2 | mary |
2 | 3 | peter |
df = pd.DataFrame({
'name':['john','mary','peter'],
"date_of_birth": ['27/05/2002','10/10/1999','01/04/1985']
})
df
date_of_birth | name | |
---|---|---|
0 | 27/05/2002 | john |
1 | 10/10/1999 | mary |
2 | 01/04/1985 | peter |
df['date_of_birth']=pd.to_datetime(df['date_of_birth'],format='%d/%m/%Y')
df
date_of_birth | name | |
---|---|---|
0 | 2002-05-27 | john |
1 | 1999-10-10 | mary |
2 | 1985-04-01 | peter |
df.dtypes
date_of_birth datetime64[ns] name object dtype: object