#!/usr/bin/env python # coding: utf-8 # In[1]: import pandas as pd import numpy as np import sys get_ipython().run_line_magic('matplotlib', 'inline') # In[2]: print('Python version ' + sys.version) print('Pandas version ' + pd.__version__) # # Select # ### How do I select a random sample of a group? # In[3]: # Initial dataframe df = pd.DataFrame({'group1' : ["a","b","a","a","b","c","c","c","c", "c","a","a","a","b","b","b","b"], 'group2' : [1,2,3,4,1,3,5,6,5,4,1,2,3,4,3,2,1], 'value' : ["apple","pear","orange","apple", "banana","durian","lemon","lime", "raspberry","durian","peach","nectarine", "banana","lemon","guava","blackberry","grape"]}) df # In[4]: # We don't simply want to select random rows from df # We want to first group df by (group1 & group2) then select random rows from random import choice # First create the group grouped = df.groupby(['group1','group2']) grouped.size() #Notice that group (a,1) has two posibilities #Notice that group (a,2) has one posibilities #This means that if we select a random sample from group (a,1) we will get either "apple" or "peach" #This means that if we select a random sample from group (a,2) we will always get "nectarine" # In[5]: #df.loc[select a random record from each group] df.loc[[choice(x) for x in grouped.groups.values()]] # ### How do I slice each row of a column? # In[6]: df = pd.DataFrame(data=['abcdef']*10, columns=['text']) df # In[7]: # Select the first 2 characters of each row df['text'].apply(lambda x: x[:2]) # ### How can I select rows of my dataframe based on a "complex" filter applied to multiple columns? # In[8]: d = {'Dates':[pd.Timestamp('2013-01-02'), pd.Timestamp('2013-01-03'), pd.Timestamp('2013-01-04')], 'Num1':[1,2,3], 'Num2':[-1,-2,-3]} df = pd.DataFrame(data=d) df # In[9]: # where all values in column "Num1" are positive positive = df['Num1'] > 0 # where values in column "Num2" is equal to -1 negativeOne = df['Num2'] == -1 # where values in the column "Dates" are in (1/2/2013 or 1/20/2013) Dates = df['Dates'].isin(['2013-01-02','2013-01-20']) df[positive & negativeOne & Dates] # ### How to get the maximum value of a group? # In[10]: df = pd.DataFrame({'col1':['minus','minus','positive','nan'], 'col2':[10,20,30,40], 'col3':[-10,-20,30,np.nan] }) df # In[11]: # Method 1 df.groupby('col1').apply(lambda x: x.max()) # In[12]: # Method 2 df.groupby('col1').agg('max') # ### How to select records from one level of a multi-index data frame? # In[13]: df = pd.DataFrame({'group1' : ["a","b","a","a","b","c","c","c","c", "c","a","a","a","b","b","b","b"], 'value' : [1,2,3,4,1,3,5,6,5,4,1,2,3,4,3,2,1], 'group2' : ["apple","pear","orange","apple", "banana","durian","lemon","lime", "raspberry","durian","peach","nectarine", "banana","lemon","guava","blackberry","grape"]}) df = df.set_index(['group1','group2']) df # In[14]: df.xs('a', level='group1') # ### How do I reset the index when the index names are the same as the column names? # In[15]: df = pd.DataFrame({"Name":["Alice", "Bob", "Mallory", "Mallory", "Bob" , "Mallory"] , "City":["Seattle", "Seattle", "Portland", "Seattle", "Seattle", "Portland"]} ) df # In[16]: group = df.groupby(['City','Name']) s = group.agg('size') s.add_suffix('_size').reset_index() #

This tutorial was created by HEDARO