In [1]:
import pandas as pd
import numpy as np
import sys
%matplotlib inline

In [2]:
print 'Python version ' + sys.version
print 'Pandas version ' + pd.__version__

Python version 2.7.5 |Anaconda 2.1.0 (64-bit)| (default, Jul  1 2013, 12:37:52) [MSC v.1500 64 bit (AMD64)]
Pandas version 0.15.2


# Select¶

### How do I select a random sample of a group?¶

In [3]:
# Initial dataframe
df = pd.DataFrame({'group1' : ["a","b","a","a","b","c","c","c","c",
"c","a","a","a","b","b","b","b"],
'group2' : [1,2,3,4,1,3,5,6,5,4,1,2,3,4,3,2,1],
'value'  : ["apple","pear","orange","apple",
"banana","durian","lemon","lime",
"raspberry","durian","peach","nectarine",
"banana","lemon","guava","blackberry","grape"]})
df

Out[3]:
group1 group2 value
0 a 1 apple
1 b 2 pear
2 a 3 orange
3 a 4 apple
4 b 1 banana
5 c 3 durian
6 c 5 lemon
7 c 6 lime
8 c 5 raspberry
9 c 4 durian
10 a 1 peach
11 a 2 nectarine
12 a 3 banana
13 b 4 lemon
14 b 3 guava
15 b 2 blackberry
16 b 1 grape
In [4]:
# We don't simply want to select random rows from df
# We want to first group df by (group1 & group2) then select random rows
from random import choice

# First create the group
grouped = df.groupby(['group1','group2'])
grouped.size()

#Notice that group (a,1) has two posibilities
#Notice that group (a,2) has one posibilities

#This means that if we select a random sample from group (a,1) we will get either "apple" or "peach"
#This means that if we select a random sample from group (a,2) we will always get "nectarine"

Out[4]:
group1  group2
a       1         2
2         1
3         2
4         1
b       1         2
2         2
3         1
4         1
c       3         1
4         1
5         2
6         1
dtype: int64
In [5]:
#df.loc[select a random record from each group]
df.loc[(choice(x) for x in grouped.groups.itervalues())]

Out[5]:
group1 group2 value

### How do I slice each row of a column?¶

In [6]:
df = pd.DataFrame(data=['abcdef']*10, columns=['text'])
df

Out[6]:
text
0 abcdef
1 abcdef
2 abcdef
3 abcdef
4 abcdef
5 abcdef
6 abcdef
7 abcdef
8 abcdef
9 abcdef
In [7]:
# Select the first 2 characters of each row
df['text'].apply(lambda x: x[:2])

Out[7]:
0    ab
1    ab
2    ab
3    ab
4    ab
5    ab
6    ab
7    ab
8    ab
9    ab
Name: text, dtype: object

### How can I select rows of my dataframe based on a "complex" filter applied to multiple columns?¶

In [8]:
d = {'Dates':[pd.Timestamp('2013-01-02'),
pd.Timestamp('2013-01-03'),
pd.Timestamp('2013-01-04')],
'Num1':[1,2,3],
'Num2':[-1,-2,-3]}

df = pd.DataFrame(data=d)
df

Out[8]:
Dates Num1 Num2
0 2013-01-02 1 -1
1 2013-01-03 2 -2
2 2013-01-04 3 -3
In [9]:
# where all values in column "Num1" are positive
positive = df['Num1'] > 0

# where values in column "Num2" is equal to -1
negativeOne = df['Num2'] == -1

# where values in the column "Dates" are in (1/2/2013 or 1/20/2013)
Dates = df['Dates'].isin(['2013-01-02','2013-01-20'])

df[positive & negativeOne & Dates]

Out[9]:
Dates Num1 Num2
0 2013-01-02 1 -1

### How to get the maximum value of a group?¶

In [10]:
df = pd.DataFrame({'col1':['minus','minus','positive','nan'],
'col2':[10,20,30,40],
'col3':[-10,-20,30,np.nan]
})
df

Out[10]:
col1 col2 col3
0 minus 10 -10
1 minus 20 -20
2 positive 30 30
3 nan 40 NaN
In [11]:
# Method 1
df.groupby('col1').apply(lambda x: x.max())

Out[11]:
col1 col2 col3
col1
minus minus 20 -10
nan NaN 40 NaN
positive positive 30 30
In [12]:
# Method 2
df.groupby('col1').agg('max')

Out[12]:
col2 col3
col1
minus 20 -10
nan 40 NaN
positive 30 30

### How to select records from one level of a multi-index data frame?¶

In [13]:
df = pd.DataFrame({'group1' : ["a","b","a","a","b","c","c","c","c",
"c","a","a","a","b","b","b","b"],
'value' : [1,2,3,4,1,3,5,6,5,4,1,2,3,4,3,2,1],
'group2'  : ["apple","pear","orange","apple",
"banana","durian","lemon","lime",
"raspberry","durian","peach","nectarine",
"banana","lemon","guava","blackberry","grape"]})
df = df.set_index(['group1','group2'])
df

Out[13]:
value
group1 group2
a apple 1
b pear 2
a orange 3
apple 4
b banana 1
c durian 3
lemon 5
lime 6
raspberry 5
durian 4
a peach 1
nectarine 2
banana 3
b lemon 4
guava 3
blackberry 2
grape 1
In [14]:
df.xs('a', level='group1')

Out[14]:
value
group2
apple 1
orange 3
apple 4
peach 1
nectarine 2
banana 3

### How do I reset the index when the index names are the same as the column names?¶

In [15]:
df = pd.DataFrame({"Name":["Alice", "Bob", "Mallory", "Mallory", "Bob" , "Mallory"] ,
"City":["Seattle", "Seattle", "Portland", "Seattle", "Seattle", "Portland"]}
)
df

Out[15]:
City Name
0 Seattle Alice
1 Seattle Bob
2 Portland Mallory
3 Seattle Mallory
4 Seattle Bob
5 Portland Mallory
In [16]:
group = df.groupby(['City','Name'])
s = group.agg('size')