#!/usr/bin/env python
# coding: utf-8

# ## These are the follow-up code samples for the following blog post: [Pandas DataFrame by Example](http://queirozf.com/entries/pandas-dataframe-by-example)

# In[38]:


import pandas as pd
import numpy as np


# In[2]:


pd.__version__


# In[41]:


df = pd.DataFrame({
    'name':['john','mary','peter','jeff','bill','lisa'],
    'age':[23,78,22,19,45,33],
    'state':['iowa','dc','california','texas','washington','dc'],
    'num_children':[2,2,0,1,2,1],
    'num_pets':[0,4,0,5,0,0]
})
# sorting columns
df=df[['name','age','state','num_children','num_pets']]
df


# ## Select rows by position

# In[42]:


# select the first 2 rows
df.iloc[:2]


# In[43]:


# select the last 2 rows
df.iloc[-2:]


# ## Select rows by index value
# 
# > compare this with `iloc` above

# In[44]:


# select rows up to and including the one
# with index=2
df.loc[:2]


# ## Select rows based upon the value of columns

# In[45]:


# by a simple numeric condition
df[df["age"] > 30]


# In[46]:


# comparing the value of two columns
df[ df["num_pets"] > df[ "num_children"] ]


# In[47]:


# using boolean AND
df[ (df["age"] > 40) & (df["num_pets"] > 0) ]


# ## select columns starting with

# In[48]:


df[[colname for colname in df.columns if colname.startswith('n')]]


# ## select all columns but one

# In[49]:


df.drop(['age'],axis=1)


# ## Drop a column

# In[50]:


df.drop(["age","num_children"],axis=1)


# ## Apply a function to every column (as aggregates)
# 
# > Using **numpy** vectorized functions for numerical values

# In[52]:


# get the mean for each of the selected columns
df[["age","num_pets","num_children"]].apply(lambda row: np.mean(row),axis=0).to_frame()


# ## Apply a function to every row (as aggregates)
# > Using **numpy** vectorized functions for numerical values

# In[54]:


# sum columns age, num_pets and num_children for each row
df[["age","num_pets","num_children"]].apply(lambda row: np.sum(row),axis=1).to_frame()


# ## Apply a function elementwise using apply

# In[55]:


df[["age"]].apply(lambda value: value*2)


# In[56]:


# certain numerical functions can also be used:
df[["age"]] * 2


# In[57]:


# also works for string values
df[["name"]].apply(lambda value: value.str.upper())


# ## Apply a function elementwise using map
# 
# > use `apply` for DataFrame objects and `map` for Series objects
# 

# In[58]:


df['name'].map(lambda name: name.upper()).to_frame()


# ## Add new columns based on old ones

# In[59]:


# simple sum of two columns
df["pets_and_children"] = df["num_pets"] + df["num_children"]
df


# In[60]:


df = pd.DataFrame({
    'name':['john','mary','peter','jeff','bill','lisa'],
    'age':[23,78,22,19,45,33],
    'state':['iowa','dc','california','texas','washington','dc'],
    'num_children':[2,2,0,1,2,1],
    'num_pets':[0,4,0,5,0,0]
})
# sorting columns
df=df[['name','age','state','num_children','num_pets']]
df

# you can also use custom functions we used on "elementwise application"
df["name_uppercase"] = df[["name"]].apply(lambda name: name.str.upper())
df


# ## Shuffle rows

# In[82]:


df = pd.DataFrame({
    'name':['john','mary','peter','jeff','bill','lisa'],
    'age':[23,78,22,19,45,33],
    'state':['iowa','dc','california','texas','washington','dc'],
    'num_children':[2,2,0,1,2,1],
    'num_pets':[0,4,0,5,0,0]
})
# sorting columns
df=df[['name','age','state','num_children','num_pets']]
df

df.reindex(np.random.permutation(df.index))


# ## Iterate over all rows

# In[83]:


for index,row in df.iterrows():
    print("{0} has name: {1}".format(index,row["name"]))


# ## Randomly sample rows

# In[84]:


# sample 10 rows from df
random_indices = np.random.choice(df.index.values, 4, replace=False)

# iloc allows you to retrieve rows by their numeric indices
sampled_df = df.iloc[random_indices]
sampled_df


# ## Sort a dataframe

# In[85]:


# sort by age, largest first
df.sort_values("age",ascending=False )


# In[86]:


# sort by num_pets descending then sort by age ascending
df.sort_values( ["num_pets","age"], ascending=[False,True] )


# ## custom sort

# In[100]:


df = pd.DataFrame({
    'name':['john','mary','peter','jeff','bill','lisa'],
    'age':[23,78,22,19,12,33],
    'state':['N/A','dc','california','texas','N/A','dc']
})

# sorting columns
df=df[['name','age','state']]
df


# In[104]:


def state_to_rank(state):
    if state=="N/A":
        return 1
    else:
        return 0
    
df['rank'] = df['state'].map(lambda x: state_to_rank(x))

df.sort_values(by=['rank','age']).drop(['rank'],axis=1).reset_index(drop=True)


# ## Perform complex selections using lambdas

# In[66]:


df[df.apply(lambda row: row['name'].startswith('j'),axis=1)]


# ## Change column names

# In[68]:


# use inplace=True if you want to mutate the current dataframe
df.rename(columns={"age":"age_years"} )


# ## Change column dtype

# In[69]:


df['num_children'].dtype


# In[72]:


# we don't need 64 bits for num_children
df['num_children'] = df['num_children'].astype('int32')
df['num_children'].dtype

# it looks the same but takes less space
df


# ## Veryfing that the dataframe includes specific values
# 
# This is done using the `.isin()` method, which returns a **boolean** dataframe to indicate where the passed values are.
# 

# In[73]:


df = pd.DataFrame({
    'name':['john','mary','peter','jeff','bill','lisa'],
    'age':[23,78,22,19,45,33],
    'state':['iowa','dc','california','texas','washington','dc'],
    'num_children':[2,2,0,1,2,1],
    'num_pets':[0,4,0,5,0,0]
})
# sorting columns
df=df[['name','age','state','num_children','num_pets']]

# if the method is passed a simple list, it matches
# those values anywhere in the dataframe 
df.isin([2,4])


# In[74]:


# you can also pass a dict or another dataframe
# as argument
df.isin({'num_pets':[4,5]})


# ## Create an empty Dataframe and append rows one by one

# In[75]:


# set column names and dtypes
new_df = pd.DataFrame(columns=['col_a','col_b']).astype({'col_a':'float32', 'col_b':'int8'})


# must reassign since the append method does not work in place
new_df = new_df.append({'col_a':5,'col_b':10}, ignore_index=True)
new_df = new_df.append({'col_a':1,'col_b':100}, ignore_index=True)

new_df


# ## Create from list of dicts

# In[76]:


new_df = pd.DataFrame(columns=['id','name'])

data_dict = [
    {'id':1,'name':"john"},
    {'id':2,'name':"mary"},
    {'id':3,'name':"peter"}
]

# must reassign since the append method does not work in place

new_df = new_df.from_records(data_dict)
new_df


# ## converting types

# In[77]:


df = pd.DataFrame({
    'name':['john','mary','peter'],
    "date_of_birth": ['27/05/2002','10/10/1999','01/04/1985']
})
df


# In[79]:


df['date_of_birth']=pd.to_datetime(df['date_of_birth'],format='%d/%m/%Y')
df


# In[80]:


df.dtypes


# In[ ]: