#!/usr/bin/env python # coding: utf-8 # ## These are the follow-up code samples for the following blog post: [Pandas DataFrame by Example](http://queirozf.com/entries/pandas-dataframe-by-example) # In[38]: import pandas as pd import numpy as np # In[2]: pd.__version__ # In[41]: df = pd.DataFrame({ 'name':['john','mary','peter','jeff','bill','lisa'], 'age':[23,78,22,19,45,33], 'state':['iowa','dc','california','texas','washington','dc'], 'num_children':[2,2,0,1,2,1], 'num_pets':[0,4,0,5,0,0] }) # sorting columns df=df[['name','age','state','num_children','num_pets']] df # ## Select rows by position # In[42]: # select the first 2 rows df.iloc[:2] # In[43]: # select the last 2 rows df.iloc[-2:] # ## Select rows by index value # # > compare this with `iloc` above # In[44]: # select rows up to and including the one # with index=2 df.loc[:2] # ## Select rows based upon the value of columns # In[45]: # by a simple numeric condition df[df["age"] > 30] # In[46]: # comparing the value of two columns df[ df["num_pets"] > df[ "num_children"] ] # In[47]: # using boolean AND df[ (df["age"] > 40) & (df["num_pets"] > 0) ] # ## select columns starting with # In[48]: df[[colname for colname in df.columns if colname.startswith('n')]] # ## select all columns but one # In[49]: df.drop(['age'],axis=1) # ## Drop a column # In[50]: df.drop(["age","num_children"],axis=1) # ## Apply a function to every column (as aggregates) # # > Using **numpy** vectorized functions for numerical values # In[52]: # get the mean for each of the selected columns df[["age","num_pets","num_children"]].apply(lambda row: np.mean(row),axis=0).to_frame() # ## Apply a function to every row (as aggregates) # > Using **numpy** vectorized functions for numerical values # In[54]: # sum columns age, num_pets and num_children for each row df[["age","num_pets","num_children"]].apply(lambda row: np.sum(row),axis=1).to_frame() # ## Apply a function elementwise using apply # In[55]: df[["age"]].apply(lambda value: value*2) # In[56]: # certain numerical functions can also be used: df[["age"]] * 2 # In[57]: # also works for string values df[["name"]].apply(lambda value: value.str.upper()) # ## Apply a function elementwise using map # # > use `apply` for DataFrame objects and `map` for Series objects # # In[58]: df['name'].map(lambda name: name.upper()).to_frame() # ## Add new columns based on old ones # In[59]: # simple sum of two columns df["pets_and_children"] = df["num_pets"] + df["num_children"] df # In[60]: df = pd.DataFrame({ 'name':['john','mary','peter','jeff','bill','lisa'], 'age':[23,78,22,19,45,33], 'state':['iowa','dc','california','texas','washington','dc'], 'num_children':[2,2,0,1,2,1], 'num_pets':[0,4,0,5,0,0] }) # sorting columns df=df[['name','age','state','num_children','num_pets']] df # you can also use custom functions we used on "elementwise application" df["name_uppercase"] = df[["name"]].apply(lambda name: name.str.upper()) df # ## Shuffle rows # In[82]: df = pd.DataFrame({ 'name':['john','mary','peter','jeff','bill','lisa'], 'age':[23,78,22,19,45,33], 'state':['iowa','dc','california','texas','washington','dc'], 'num_children':[2,2,0,1,2,1], 'num_pets':[0,4,0,5,0,0] }) # sorting columns df=df[['name','age','state','num_children','num_pets']] df df.reindex(np.random.permutation(df.index)) # ## Iterate over all rows # In[83]: for index,row in df.iterrows(): print("{0} has name: {1}".format(index,row["name"])) # ## Randomly sample rows # In[84]: # sample 10 rows from df random_indices = np.random.choice(df.index.values, 4, replace=False) # iloc allows you to retrieve rows by their numeric indices sampled_df = df.iloc[random_indices] sampled_df # ## Sort a dataframe # In[85]: # sort by age, largest first df.sort_values("age",ascending=False ) # In[86]: # sort by num_pets descending then sort by age ascending df.sort_values( ["num_pets","age"], ascending=[False,True] ) # ## custom sort # In[100]: df = pd.DataFrame({ 'name':['john','mary','peter','jeff','bill','lisa'], 'age':[23,78,22,19,12,33], 'state':['N/A','dc','california','texas','N/A','dc'] }) # sorting columns df=df[['name','age','state']] df # In[104]: def state_to_rank(state): if state=="N/A": return 1 else: return 0 df['rank'] = df['state'].map(lambda x: state_to_rank(x)) df.sort_values(by=['rank','age']).drop(['rank'],axis=1).reset_index(drop=True) # ## Perform complex selections using lambdas # In[66]: df[df.apply(lambda row: row['name'].startswith('j'),axis=1)] # ## Change column names # In[68]: # use inplace=True if you want to mutate the current dataframe df.rename(columns={"age":"age_years"} ) # ## Change column dtype # In[69]: df['num_children'].dtype # In[72]: # we don't need 64 bits for num_children df['num_children'] = df['num_children'].astype('int32') df['num_children'].dtype # it looks the same but takes less space df # ## Veryfing that the dataframe includes specific values # # This is done using the `.isin()` method, which returns a **boolean** dataframe to indicate where the passed values are. # # In[73]: df = pd.DataFrame({ 'name':['john','mary','peter','jeff','bill','lisa'], 'age':[23,78,22,19,45,33], 'state':['iowa','dc','california','texas','washington','dc'], 'num_children':[2,2,0,1,2,1], 'num_pets':[0,4,0,5,0,0] }) # sorting columns df=df[['name','age','state','num_children','num_pets']] # if the method is passed a simple list, it matches # those values anywhere in the dataframe df.isin([2,4]) # In[74]: # you can also pass a dict or another dataframe # as argument df.isin({'num_pets':[4,5]}) # ## Create an empty Dataframe and append rows one by one # In[75]: # set column names and dtypes new_df = pd.DataFrame(columns=['col_a','col_b']).astype({'col_a':'float32', 'col_b':'int8'}) # must reassign since the append method does not work in place new_df = new_df.append({'col_a':5,'col_b':10}, ignore_index=True) new_df = new_df.append({'col_a':1,'col_b':100}, ignore_index=True) new_df # ## Create from list of dicts # In[76]: new_df = pd.DataFrame(columns=['id','name']) data_dict = [ {'id':1,'name':"john"}, {'id':2,'name':"mary"}, {'id':3,'name':"peter"} ] # must reassign since the append method does not work in place new_df = new_df.from_records(data_dict) new_df # ## converting types # In[77]: df = pd.DataFrame({ 'name':['john','mary','peter'], "date_of_birth": ['27/05/2002','10/10/1999','01/04/1985'] }) df # In[79]: df['date_of_birth']=pd.to_datetime(df['date_of_birth'],format='%d/%m/%Y') df # In[80]: df.dtypes # In[ ]: