In [1]:
from pandas import DataFrame
import pandas as pd
import numpy as np
%matplotlib inline
In [2]:
print 'Pandas Version: ' + pd.__version__
Pandas Version: 0.13.0rc1

Compute

How to get the sum and length of a group?

In [3]:
df = DataFrame({'group1':["a","a","b","b"],
                'value':[10,20,30,40]
                })
df
Out[3]:
group1 value
0 a 10
1 a 20
2 b 30
3 b 40

4 rows × 2 columns

In [4]:
group = df.groupby('group1')
group.agg([len,sum])
Out[4]:
value
len sum
group1
a 2 30
b 2 70

2 rows × 2 columns

How can I add a column that is equal to the sum of a group?

In [5]:
df = DataFrame({'labels':["a","a","b","b"],
                'value':[10,20,30,40]
                })
df
Out[5]:
labels value
0 a 10
1 a 20
2 b 30
3 b 40

4 rows × 2 columns

In [6]:
group = df.groupby('labels')['value']
df['value.sum'] = group.transform('sum')
df
Out[6]:
labels value value.sum
0 a 10 30
1 a 20 30
2 b 30 70
3 b 40 70

4 rows × 3 columns

How to get the month name out of a date column?

In [7]:
df = DataFrame({'col1':[pd.Timestamp('20130102000030'),
                         pd.Timestamp('2013-02-03 00:00:30'),
                         pd.Timestamp('3/4/2013 000030')]
                 })
df
Out[7]:
col1
0 2013-01-02 00:00:30
1 2013-02-03 00:00:30
2 2013-03-04 00:00:30

3 rows × 1 columns

In [8]:
df['MonthNumber'] = df['col1'].apply(lambda x: x.month)
df['Day'] = df['col1'].apply(lambda x: x.day)
df['Year'] = df['col1'].apply(lambda x: x.year)
df['MonthName'] = df['col1'].apply(lambda x: x.strftime('%B'))
df['WeekDay'] = df['col1'].apply(lambda x: x.strftime('%A'))
df
Out[8]:
col1 MonthNumber Day Year MonthName WeekDay
0 2013-01-02 00:00:30 1 2 2013 January Wednesday
1 2013-02-03 00:00:30 2 3 2013 February Sunday
2 2013-03-04 00:00:30 3 4 2013 March Monday

3 rows × 6 columns

How can I create a column based on two other columns?

In [9]:
df = DataFrame({'col1':['minus','minus','positive','nan'],
                'col2':[10,20,30,40]
                })
df
Out[9]:
col1 col2
0 minus 10
1 minus 20
2 positive 30
3 nan 40

4 rows × 2 columns

In [10]:
df['col3'] = df['col2']*df['col1'].apply(lambda x: -1 if x=='minus' else (1 if x=='positive' else np.nan))
df
Out[10]:
col1 col2 col3
0 minus 10 -10
1 minus 20 -20
2 positive 30 30
3 nan 40 NaN

4 rows × 3 columns

How can I apply a function to a group and add the results to my original data frame?

In [11]:
df = DataFrame({'group1':['a','a','a','b','b','b'],
                       'group2':['c','c','d','d','d','e'],
                       'value1':[1.1,2,3,4,5,6],
                       'value2':[7.1,8,9,10,11,12]
})

df
Out[11]:
group1 group2 value1 value2
0 a c 1.1 7.1
1 a c 2.0 8.0
2 a d 3.0 9.0
3 b d 4.0 10.0
4 b d 5.0 11.0
5 b e 6.0 12.0

6 rows × 4 columns

In [12]:
group = df.groupby(['group1','group2'])

def Half(x):
    return x.sum()

df['new'] = group['value1'].transform(Half)
df
Out[12]:
group1 group2 value1 value2 new
0 a c 1.1 7.1 3.1
1 a c 2.0 8.0 3.1
2 a d 3.0 9.0 3.0
3 b d 4.0 10.0 9.0
4 b d 5.0 11.0 9.0
5 b e 6.0 12.0 6.0

6 rows × 5 columns

In [13]:
# For multiple functions
def HalfPlus(x):
    return x.sum() + 1

newcol = group['value1'].agg([Half,HalfPlus])
newcol
Out[13]:
Half HalfPlus
group1 group2
a c 3.1 4.1
d 3.0 4.0
b d 9.0 10.0
e 6.0 7.0

4 rows × 2 columns

In [14]:
df.merge(newcol, left_on=['group1','group2'], right_index=True)
Out[14]:
group1 group2 value1 value2 new Half HalfPlus
0 a c 1.1 7.1 3.1 3.1 4.1
1 a c 2.0 8.0 3.1 3.1 4.1
2 a d 3.0 9.0 3.0 3.0 4.0
3 b d 4.0 10.0 9.0 9.0 10.0
4 b d 5.0 11.0 9.0 9.0 10.0
5 b e 6.0 12.0 6.0 6.0 7.0

6 rows × 7 columns

How to add two data frames and not get null values?

In [15]:
df1 = DataFrame(data=[26371, 1755, 2], index=[-9999, 240, 138.99], columns=['value'])
df1
Out[15]:
value
-9999.00 26371
240.00 1755
138.99 2

3 rows × 1 columns

In [16]:
df2 = DataFrame(data=[26371, 1755, 6, 4], index=[-9999, 240, 113.03, 110], columns=['value'])
df2
Out[16]:
value
-9999.00 26371
240.00 1755
113.03 6
110.00 4

4 rows × 1 columns

In [17]:
# If you simply add them, you will get null values
# were the index does not match
df1 + df2
Out[17]:
value
-9999.00 52742
110.00 NaN
113.03 NaN
138.99 NaN
240.00 3510

5 rows × 1 columns

In [18]:
# Here we fix this issue
df1.add(df2, fill_value=0)
Out[18]:
value
-9999.00 52742
110.00 4
113.03 6
138.99 2
240.00 3510

5 rows × 1 columns

Author: David Rojas LLC

Back to top