In [1]:
from pandas import DataFrame
import pandas as pd
import numpy as np
%matplotlib inline
In [2]:
print 'Pandas Version: ' + pd.__version__
Pandas Version: 0.13.0rc1

Select

How do I select a random sample of a group?

In [3]:
# Initial dataframe
df = DataFrame({'group1' : ["a","b","a","a","b","c","c","c","c",
                            "c","a","a","a","b","b","b","b"],
                'group2' : [1,2,3,4,1,3,5,6,5,4,1,2,3,4,3,2,1],
                'value'  : ["apple","pear","orange","apple",
                            "banana","durian","lemon","lime",
                            "raspberry","durian","peach","nectarine",
                            "banana","lemon","guava","blackberry","grape"]})
df
Out[3]:
group1 group2 value
0 a 1 apple
1 b 2 pear
2 a 3 orange
3 a 4 apple
4 b 1 banana
5 c 3 durian
6 c 5 lemon
7 c 6 lime
8 c 5 raspberry
9 c 4 durian
10 a 1 peach
11 a 2 nectarine
12 a 3 banana
13 b 4 lemon
14 b 3 guava
15 b 2 blackberry
16 b 1 grape

17 rows × 3 columns

In [4]:
# We don't simply want to select random rows from df
# We want to first group df by (group1 & group2) then select random rows
from random import choice

# First create the group
grouped = df.groupby(['group1','group2'])
grouped.size()


#Notice that group (a,1) has two posibilities
#Notice that group (a,2) has one posibilities

#This means that if we select a random sample from group (a,1) we will get either "apple" or "peach"
#This means that if we select a random sample from group (a,2) we will always get "nectarine"
Out[4]:
group1  group2
a       1         2
        2         1
        3         2
        4         1
b       1         2
        2         2
        3         1
        4         1
c       3         1
        4         1
        5         2
        6         1
dtype: int64
In [5]:
#df.loc[select a random record from each group]
df.loc[(choice(x) for x in grouped.groups.itervalues())]
Out[5]:
group1 group2 value
10 a 1 peach
15 b 2 blackberry
7 c 6 lime
14 b 3 guava
16 b 1 grape
9 c 4 durian
11 a 2 nectarine
5 c 3 durian
3 a 4 apple
2 a 3 orange
13 b 4 lemon
8 c 5 raspberry

12 rows × 3 columns

How do I slice each row of a column?

In [6]:
df = DataFrame(data=['abcdef']*10, columns=['text'])
df
Out[6]:
text
0 abcdef
1 abcdef
2 abcdef
3 abcdef
4 abcdef
5 abcdef
6 abcdef
7 abcdef
8 abcdef
9 abcdef

10 rows × 1 columns

In [7]:
# Select the first 2 characters of each row
df['text'].apply(lambda x: x[:2])
Out[7]:
0    ab
1    ab
2    ab
3    ab
4    ab
5    ab
6    ab
7    ab
8    ab
9    ab
Name: text, dtype: object

How can I select rows of my dataframe based on a "complex" filter applied to multiple columns?

In [8]:
d = {'Dates':[pd.Timestamp('2013-01-02'),
              pd.Timestamp('2013-01-03'),
              pd.Timestamp('2013-01-04')],
     'Num1':[1,2,3],
     'Num2':[-1,-2,-3]}
                 

df = DataFrame(data=d)
df
Out[8]:
Dates Num1 Num2
0 2013-01-02 00:00:00 1 -1
1 2013-01-03 00:00:00 2 -2
2 2013-01-04 00:00:00 3 -3

3 rows × 3 columns

In [9]:
# where all values in column "Num1" are positive
positive = df['Num1'] > 0

# where values in column "Num2" is equal to -1
negativeOne = df['Num2'] == -1

# where values in the column "Dates" are in (1/2/2013 or 1/20/2013)
Dates = df['Dates'].isin(['2013-01-02','2013-01-20'])

df[positive & negativeOne & Dates]
Out[9]:
Dates Num1 Num2
0 2013-01-02 00:00:00 1 -1

1 rows × 3 columns

How to get the maximum value of a group?

In [10]:
df = DataFrame({'col1':['minus','minus','positive','nan'],
                'col2':[10,20,30,40],
                'col3':[-10,-20,30,np.nan]
                })
df
Out[10]:
col1 col2 col3
0 minus 10 -10
1 minus 20 -20
2 positive 30 30
3 nan 40 NaN

4 rows × 3 columns

In [11]:
# Method 1
df.groupby('col1').apply(lambda x: x.max())
Out[11]:
col1 col2 col3
col1
minus minus 20 -10.0
nan NaN 40 NaN
positive positive 30 30.0

3 rows × 3 columns

In [12]:
# Method 2
df.groupby('col1').agg('max')
Out[12]:
col2 col3
col1
minus 20 -10
nan 40 NaN
positive 30 30

3 rows × 2 columns

How to select records from one level of a multi-index data frame?

In [13]:
df = DataFrame({'group1' : ["a","b","a","a","b","c","c","c","c",
                            "c","a","a","a","b","b","b","b"],
                'value' : [1,2,3,4,1,3,5,6,5,4,1,2,3,4,3,2,1],
                'group2'  : ["apple","pear","orange","apple",
                            "banana","durian","lemon","lime",
                            "raspberry","durian","peach","nectarine",
                            "banana","lemon","guava","blackberry","grape"]})
df = df.set_index(['group1','group2'])
df
Out[13]:
value
group1 group2
a apple 1
b pear 2
a orange 3
apple 4
b banana 1
c durian 3
lemon 5
lime 6
raspberry 5
durian 4
a peach 1
nectarine 2
banana 3
b lemon 4
guava 3
blackberry 2
grape 1

17 rows × 1 columns

In [14]:
df.xs('a', level='group1')
Out[14]:
value
group2
apple 1
orange 3
apple 4
peach 1
nectarine 2
banana 3

6 rows × 1 columns

How do I reset the index when the index names are the same as the column names?

In [15]:
df = DataFrame({"Name":["Alice", "Bob", "Mallory", "Mallory", "Bob" , "Mallory"] , 
                "City":["Seattle", "Seattle", "Portland", "Seattle", "Seattle", "Portland"]}
               )
df
Out[15]:
City Name
0 Seattle Alice
1 Seattle Bob
2 Portland Mallory
3 Seattle Mallory
4 Seattle Bob
5 Portland Mallory

6 rows × 2 columns

In [16]:
group = df.groupby(['City','Name'])
s = group.agg('count')
s.add_suffix('_count').reset_index()
Out[16]:
City Name City_count Name_count
0 Portland Mallory 2 2
1 Seattle Alice 1 1
2 Seattle Bob 2 2
3 Seattle Mallory 1 1

4 rows × 4 columns

Author: David Rojas LLC