In [72]:
import numpy as np
import pandas as pd
from datetime import time
pd.set_option('html', False)
In [2]:
from IPython.core.display import Image
Image('http://akamaicovers.oreilly.com/images/0636920023784/lrg.jpg')
Out[2]:

Important Features in pandas

In [3]:
import pandas as pd
import numpy as np

Fast tabular data IO

In [62]:
temp = '/Users/wesm/Downloads/minutebars/%s.csv'
path = temp % 'AAPL'
!wc -l $path
  489598 /Users/wesm/Downloads/minutebars/AAPL.csv

In [63]:
aapl_bars = pd.read_csv(temp % 'AAPL')
aapl_bars
Out[63]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 489597 entries, 0 to 489596
Data columns:
volume         489597  non-null values
high           489597  non-null values
low            489597  non-null values
close_price    489597  non-null values
dt             489597  non-null values
open_price     489597  non-null values
dtypes: float64(4), int64(1), object(1)
In [64]:
%time _ = pd.read_csv(path)
CPU times: user 0.72 s, sys: 0.18 s, total: 0.90 s
Wall time: 0.90 s

Time series operations

In [65]:
aapl_bars.dt
Out[65]:
0     2008-01-07 14:31:00+00:00
1     2008-01-07 14:32:00+00:00
2     2008-01-07 14:33:00+00:00
3     2008-01-07 14:34:00+00:00
4     2008-01-07 14:35:00+00:00
5     2008-01-07 14:36:00+00:00
6     2008-01-07 14:37:00+00:00
7     2008-01-07 14:38:00+00:00
8     2008-01-07 14:39:00+00:00
9     2008-01-07 14:40:00+00:00
10    2008-01-07 14:41:00+00:00
11    2008-01-07 14:42:00+00:00
12    2008-01-07 14:43:00+00:00
13    2008-01-07 14:44:00+00:00
14    2008-01-07 14:45:00+00:00
...
489582    2013-01-07 20:46:00+00:00
489583    2013-01-07 20:47:00+00:00
489584    2013-01-07 20:48:00+00:00
489585    2013-01-07 20:49:00+00:00
489586    2013-01-07 20:50:00+00:00
489587    2013-01-07 20:51:00+00:00
489588    2013-01-07 20:52:00+00:00
489589    2013-01-07 20:53:00+00:00
489590    2013-01-07 20:54:00+00:00
489591    2013-01-07 20:55:00+00:00
489592    2013-01-07 20:56:00+00:00
489593    2013-01-07 20:57:00+00:00
489594    2013-01-07 20:58:00+00:00
489595    2013-01-07 20:59:00+00:00
489596    2013-01-07 21:00:00+00:00
Name: dt, Length: 489597
In [66]:
aapl_bars.index = pd.to_datetime(aapl_bars.pop('dt'))
In [69]:
aapl_bars.head()
Out[69]:
                     volume    high     low  close_price  open_price
dt                                                                  
2008-01-07 14:31:00  593143  182.07  181.00       182.03      181.25
2008-01-07 14:32:00  344139  182.56  181.92       182.54      182.04
2008-01-07 14:33:00  257436  182.75  182.23       182.50      182.53
2008-01-07 14:34:00  235358  182.62  182.15       182.30      182.51
2008-01-07 14:35:00  281763  182.50  181.71       181.97      182.33
In [70]:
def load_bars(ticker):
    bars = pd.read_csv(temp % ticker)
    bars.index = pd.to_datetime(bars.pop('dt'))
    return bars
In [79]:
aapl_bars.at_time(time(15, 0)).head(10)
Out[79]:
                     volume    high     low  close_price  open_price
dt                                                                  
2008-01-07 15:00:00  148512  183.10  182.60       182.64     182.860
2008-01-08 15:00:00  136202  179.44  178.87       179.40     179.020
2008-01-09 15:00:00  179490  173.49  172.71       172.80     173.431
2008-01-10 15:00:00  253765  177.08  176.08       176.19     176.540
2008-01-11 15:00:00   96055  176.70  176.17       176.43     176.520
2008-01-14 15:00:00  181223  176.70  176.00       176.53     176.600
2008-01-15 15:00:00  166167  177.78  177.19       177.72     177.200
2008-01-16 15:00:00  292435  165.91  164.70       164.76     165.402
2008-01-17 15:00:00  152760  164.24  163.29       163.29     163.600
2008-01-18 15:00:00  217712  163.97  163.50       163.75     163.500
In [80]:
aapl_bars.close_price['2009-10-15']
Out[80]:
dt
2009-10-15 14:31:00    189.870
2009-10-15 14:32:00    189.800
2009-10-15 14:33:00    189.890
2009-10-15 14:34:00    189.930
2009-10-15 14:35:00    189.920
2009-10-15 14:36:00    189.976
2009-10-15 14:37:00    190.030
2009-10-15 14:38:00    189.940
2009-10-15 14:39:00    189.910
2009-10-15 14:40:00    190.069
2009-10-15 14:41:00    190.020
2009-10-15 14:42:00    189.990
2009-10-15 14:43:00    189.920
2009-10-15 14:44:00    189.990
2009-10-15 14:45:00    189.830
...
2009-10-15 20:46:00    189.990
2009-10-15 20:47:00    189.940
2009-10-15 20:48:00    189.910
2009-10-15 20:49:00    189.821
2009-10-15 20:50:00    189.860
2009-10-15 20:51:00    189.940
2009-10-15 20:52:00    189.940
2009-10-15 20:53:00    189.980
2009-10-15 20:54:00    190.010
2009-10-15 20:55:00    190.090
2009-10-15 20:56:00    190.139
2009-10-15 20:57:00    190.170
2009-10-15 20:58:00    190.210
2009-10-15 20:59:00    190.200
2009-10-15 21:00:00    190.560
Name: close_price, Length: 390
In [81]:
aapl_bars.close_price
Out[81]:
dt
2008-01-07 14:31:00    182.03
2008-01-07 14:32:00    182.54
2008-01-07 14:33:00    182.50
2008-01-07 14:34:00    182.30
2008-01-07 14:35:00    181.97
2008-01-07 14:36:00    181.48
2008-01-07 14:37:00    181.04
2008-01-07 14:38:00    180.68
2008-01-07 14:39:00    180.80
2008-01-07 14:40:00    180.86
2008-01-07 14:41:00    181.03
2008-01-07 14:42:00    180.88
2008-01-07 14:43:00    181.11
2008-01-07 14:44:00    181.24
2008-01-07 14:45:00    181.84
...
2013-01-07 20:46:00    523.480
2013-01-07 20:47:00    522.970
2013-01-07 20:48:00    522.954
2013-01-07 20:49:00    523.850
2013-01-07 20:50:00    523.750
2013-01-07 20:51:00    524.270
2013-01-07 20:52:00    524.230
2013-01-07 20:53:00    524.940
2013-01-07 20:54:00    524.580
2013-01-07 20:55:00    524.490
2013-01-07 20:56:00    524.143
2013-01-07 20:57:00    523.750
2013-01-07 20:58:00    524.040
2013-01-07 20:59:00    523.780
2013-01-07 21:00:00    523.970
Name: close_price, Length: 489597
In [82]:
mth_mean = aapl_bars.close_price.resample('M', how=['mean', 'median', 'std'])
mth_mean
Out[82]:
                  mean    median        std
dt                                         
2008-01-31  155.579126  161.1000  19.396180
2008-02-29  125.582646  125.7500   4.720866
2008-03-31  130.494650  127.3600   8.403366
2008-04-30  158.145335  154.7350   9.087710
2008-05-31  184.774497  185.0800   3.618693
2008-06-30  178.654374  178.9300   6.429312
2008-07-31  167.784954  169.9300   7.347889
2008-08-31  171.233317  174.0500   7.244400
2008-09-30  141.942077  140.9100  16.775448
2008-10-31   99.343456   97.6300   6.335940
2008-11-30   93.985805   92.2400   7.549505
2008-12-31   91.581060   90.2600   4.812012
2009-01-31   88.675004   89.8200   4.586041
2009-02-28   94.207834   94.3400   4.092791
2009-03-31   97.352191   96.9470   7.816247
2009-04-30  119.858558  120.3500   4.900853
2009-05-31  127.983364  128.2600   4.061250
2009-06-30  139.360858  139.3000   2.799540
2009-07-31  149.103911  148.3800   9.518534
2009-08-31  166.170257  165.9500   2.354870
2009-09-30  177.893266  181.7000   7.385285
2009-10-31  193.195646  190.5500   6.595516
2009-11-30  200.248734  202.0900   5.619620
2009-12-31  199.102938  197.2200   6.731990
2010-01-31  208.156303  209.4800   5.058381
2010-02-28  198.559427  198.3900   3.454915
2010-03-31  223.336387  224.3300   7.781226
2010-04-30  250.766901  245.9600  11.538351
2010-05-31  251.805447  252.0230   7.846905
2010-06-30  261.842737  262.5325   8.737081
2010-07-31  254.440632  256.2700   5.695913
2010-08-31  251.561496  250.5900   7.495469
2010-09-30  273.745057  272.1700  13.610222
2010-10-31  300.674357  302.7455  10.336923
2010-11-30  311.558581  311.1500   5.419399
2010-12-31  321.606753  321.5000   2.633648
2011-01-31  338.037888  338.5950   5.570475
2011-02-28  351.196638  352.7500   7.126330
2011-03-31  347.578442  349.0200   7.478078
2011-04-30  340.500490  339.4450   8.392912
2011-05-31  341.849013  342.9230   5.946428
2011-06-30  331.625547  331.5605   8.191484
2011-07-31  371.942253  365.4800  19.378892
2011-08-31  377.103717  377.3100  10.823605
2011-09-30  393.123445  391.6450  13.226161
2011-10-31  397.190649  400.4600  15.248496
2011-11-30  385.337475  383.8800  11.555804
2011-12-31  393.056772  392.8600   8.102446
2012-01-31  428.571704  424.3000  12.856495
2012-02-29  496.902626  502.0500  25.815138
2012-03-31  576.550595  587.1600  30.231559
2012-04-30  607.127307  609.0175  21.316798
2012-05-31  565.350624  567.3405  13.847976
2012-06-30  574.423152  573.5770   7.848743
2012-07-31  600.786165  604.4400  11.827663
2012-08-31  642.572344  635.9400  23.148544
2012-09-30  682.100657  679.4900  13.630957
2012-10-31  635.947102  634.1975  20.621637
2012-11-30  564.845334  563.0600  21.851129
2012-12-31  532.420270  528.8600  21.166298
2013-01-31  536.401272  536.9800  10.776982
In [83]:
mth_mean.plot()
Out[83]:
<matplotlib.axes.AxesSubplot at 0x10db20e50>
In [84]:
close = aapl_bars.close_price
close / close.shift(1) - 1
Out[84]:
dt
2008-01-07 14:31:00         NaN
2008-01-07 14:32:00    0.002802
2008-01-07 14:33:00   -0.000219
2008-01-07 14:34:00   -0.001096
2008-01-07 14:35:00   -0.001810
2008-01-07 14:36:00   -0.002693
2008-01-07 14:37:00   -0.002425
2008-01-07 14:38:00   -0.001989
2008-01-07 14:39:00    0.000664
2008-01-07 14:40:00    0.000332
2008-01-07 14:41:00    0.000940
2008-01-07 14:42:00   -0.000829
2008-01-07 14:43:00    0.001272
2008-01-07 14:44:00    0.000718
2008-01-07 14:45:00    0.003311
...
2013-01-07 20:46:00   -0.000363
2013-01-07 20:47:00   -0.000974
2013-01-07 20:48:00   -0.000031
2013-01-07 20:49:00    0.001713
2013-01-07 20:50:00   -0.000191
2013-01-07 20:51:00    0.000993
2013-01-07 20:52:00   -0.000076
2013-01-07 20:53:00    0.001354
2013-01-07 20:54:00   -0.000686
2013-01-07 20:55:00   -0.000172
2013-01-07 20:56:00   -0.000662
2013-01-07 20:57:00   -0.000750
2013-01-07 20:58:00    0.000554
2013-01-07 20:59:00   -0.000496
2013-01-07 21:00:00    0.000363
Name: close_price, Length: 489597
In [85]:
minute_returns = aapl_bars.close_price.pct_change()
std_10day = pd.rolling_std(minute_returns, 390 * 10)
std_10day.resample('B').plot()
Out[85]:
<matplotlib.axes.AxesSubplot at 0x10d94ca90>

Data alignment

In [86]:
ts1 = pd.Series(np.random.randn(10), 
                index=pd.date_range('1/1/2000', periods=10))
ts1
Out[86]:
2000-01-01    0.583378
2000-01-02   -1.542445
2000-01-03   -0.215126
2000-01-04   -1.231664
2000-01-05    0.334308
2000-01-06    1.234491
2000-01-07   -0.678003
2000-01-08   -0.496512
2000-01-09   -2.023730
2000-01-10   -0.464823
Freq: D
In [87]:
ts2 = ts1[[0, 2, 4, 5, 6, 7, 8]]
ts2
Out[87]:
2000-01-01    0.583378
2000-01-03   -0.215126
2000-01-05    0.334308
2000-01-06    1.234491
2000-01-07   -0.678003
2000-01-08   -0.496512
2000-01-09   -2.023730
In [88]:
ts1 + ts2
Out[88]:
2000-01-01    1.166756
2000-01-02         NaN
2000-01-03   -0.430252
2000-01-04         NaN
2000-01-05    0.668617
2000-01-06    2.468981
2000-01-07   -1.356005
2000-01-08   -0.993023
2000-01-09   -4.047460
2000-01-10         NaN
In [89]:
df = pd.DataFrame({'A': ts1, 'B': ts2})
df
Out[89]:
                   A         B
2000-01-01  0.583378  0.583378
2000-01-02 -1.542445       NaN
2000-01-03 -0.215126 -0.215126
2000-01-04 -1.231664       NaN
2000-01-05  0.334308  0.334308
2000-01-06  1.234491  1.234491
2000-01-07 -0.678003 -0.678003
2000-01-08 -0.496512 -0.496512
2000-01-09 -2.023730 -2.023730
2000-01-10 -0.464823       NaN
In [90]:
ibm_bars = load_bars('IBM')
In [91]:
def subsample(frame, pct=0.9):
    N = len(frame)
    indexer = np.sort(np.random.permutation(N)[:pct*N])
    return frame.take(indexer)

f1 = subsample(ibm_bars)
f2 = subsample(aapl_bars)
f1
Out[91]:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 440607 entries, 2008-01-07 14:31:00 to 2013-01-07 21:00:00
Data columns:
volume         440607  non-null values
high           440607  non-null values
low            440607  non-null values
close_price    440607  non-null values
open_price     440607  non-null values
dtypes: float64(4), int64(1)
In [92]:
both = pd.concat([f1, f2], axis=1, keys=['IBM', 'AAPL'])
both.head(20)
Out[92]:
                        IBM                                             AAPL           \
                     volume    high     low  close_price  open_price  volume     high   
dt                                                                                      
2008-01-07 14:31:00  173600  100.47  100.02      100.100     100.250  593143  182.070   
2008-01-07 14:32:00   93937  100.13   99.77       99.900     100.110  344139  182.560   
2008-01-07 14:33:00   75100   99.93   99.75       99.890      99.900  257436  182.750   
2008-01-07 14:34:00   60300   99.93   99.55       99.560      99.900  235358  182.620   
2008-01-07 14:35:00   66300   99.84   99.51       99.659      99.550  281763  182.500   
2008-01-07 14:36:00  115600   99.70   99.56       99.610      99.694  184191  182.000   
2008-01-07 14:37:00   68400   99.65   99.50       99.600      99.620     NaN      NaN   
2008-01-07 14:38:00   94800   99.68   99.55       99.590      99.620  324597  181.100   
2008-01-07 14:39:00   71200   99.60   99.25       99.320      99.590  274712  181.160   
2008-01-07 14:40:00   61900   99.56   99.27       99.550      99.300  403421  181.040   
2008-01-07 14:41:00   50400   99.75   99.54       99.630      99.550  342292  181.172   
2008-01-07 14:42:00   64300   99.87   99.56       99.775      99.660  229040  181.430   
2008-01-07 14:43:00   41500   99.79   99.69       99.760      99.770  203759  181.120   
2008-01-07 14:44:00   60900   99.85   99.71       99.720      99.780  163075  181.240   
2008-01-07 14:45:00   46400  100.00   99.72      100.000      99.720  263733  181.870   
2008-01-07 14:46:00   64800  100.42   99.98      100.330     100.000  239652  182.500   
2008-01-07 14:47:00   43200  100.39  100.17      100.180     100.340  217670  182.680   
2008-01-07 14:48:00   66400  100.49  100.14      100.300     100.230  235761  182.420   
2008-01-07 14:49:00   80954  100.59  100.26      100.550     100.300  159339  182.380   
2008-01-07 14:50:00   81100  100.59  100.41      100.490     100.550     NaN      NaN   

                                                      
                        low  close_price  open_price  
dt                                                    
2008-01-07 14:31:00  181.00       182.03     181.250  
2008-01-07 14:32:00  181.92       182.54     182.040  
2008-01-07 14:33:00  182.23       182.50     182.530  
2008-01-07 14:34:00  182.15       182.30     182.510  
2008-01-07 14:35:00  181.71       181.97     182.330  
2008-01-07 14:36:00  181.41       181.48     181.904  
2008-01-07 14:37:00     NaN          NaN         NaN  
2008-01-07 14:38:00  180.65       180.68     181.050  
2008-01-07 14:39:00  180.62       180.80     180.680  
2008-01-07 14:40:00  180.21       180.86     180.800  
2008-01-07 14:41:00  180.67       181.03     180.860  
2008-01-07 14:42:00  180.83       180.88     181.025  
2008-01-07 14:43:00  180.80       181.11     180.890  
2008-01-07 14:44:00  180.99       181.24     181.120  
2008-01-07 14:45:00  181.19       181.84     181.240  
2008-01-07 14:46:00  181.84       182.50     181.880  
2008-01-07 14:47:00  182.20       182.35     182.500  
2008-01-07 14:48:00  181.62       181.63     182.240  
2008-01-07 14:49:00  181.67       181.99     181.720  
2008-01-07 14:50:00     NaN          NaN         NaN  

Missing data handling

In [93]:
df
Out[93]:
                   A         B
2000-01-01  0.583378  0.583378
2000-01-02 -1.542445       NaN
2000-01-03 -0.215126 -0.215126
2000-01-04 -1.231664       NaN
2000-01-05  0.334308  0.334308
2000-01-06  1.234491  1.234491
2000-01-07 -0.678003 -0.678003
2000-01-08 -0.496512 -0.496512
2000-01-09 -2.023730 -2.023730
2000-01-10 -0.464823       NaN
In [94]:
df.count()
Out[94]:
A    10
B     7
In [95]:
both.count()
Out[95]:
IBM   volume         440607
      high           440607
      low            440607
      close_price    440607
      open_price     440607
AAPL  volume         440637
      high           440637
      low            440637
      close_price    440637
      open_price     440637
In [96]:
df.sum()
Out[96]:
A   -4.500125
B   -1.261193
In [97]:
df.mean(1)
Out[97]:
2000-01-01    0.583378
2000-01-02   -1.542445
2000-01-03   -0.215126
2000-01-04   -1.231664
2000-01-05    0.334308
2000-01-06    1.234491
2000-01-07   -0.678003
2000-01-08   -0.496512
2000-01-09   -2.023730
2000-01-10   -0.464823
Freq: D
In [98]:
df.dropna()
Out[98]:
                   A         B
2000-01-01  0.583378  0.583378
2000-01-03 -0.215126 -0.215126
2000-01-05  0.334308  0.334308
2000-01-06  1.234491  1.234491
2000-01-07 -0.678003 -0.678003
2000-01-08 -0.496512 -0.496512
2000-01-09 -2.023730 -2.023730
In [99]:
df.fillna(0)
Out[99]:
                   A         B
2000-01-01  0.583378  0.583378
2000-01-02 -1.542445  0.000000
2000-01-03 -0.215126 -0.215126
2000-01-04 -1.231664  0.000000
2000-01-05  0.334308  0.334308
2000-01-06  1.234491  1.234491
2000-01-07 -0.678003 -0.678003
2000-01-08 -0.496512 -0.496512
2000-01-09 -2.023730 -2.023730
2000-01-10 -0.464823  0.000000
In [100]:
df.fillna(method='ffill')
Out[100]:
                   A         B
2000-01-01  0.583378  0.583378
2000-01-02 -1.542445  0.583378
2000-01-03 -0.215126 -0.215126
2000-01-04 -1.231664 -0.215126
2000-01-05  0.334308  0.334308
2000-01-06  1.234491  1.234491
2000-01-07 -0.678003 -0.678003
2000-01-08 -0.496512 -0.496512
2000-01-09 -2.023730 -2.023730
2000-01-10 -0.464823 -2.023730
In [101]:
df.asfreq('4h')
Out[101]:
                            A         B
2000-01-01 00:00:00  0.583378  0.583378
2000-01-01 04:00:00       NaN       NaN
2000-01-01 08:00:00       NaN       NaN
2000-01-01 12:00:00       NaN       NaN
2000-01-01 16:00:00       NaN       NaN
2000-01-01 20:00:00       NaN       NaN
2000-01-02 00:00:00 -1.542445       NaN
2000-01-02 04:00:00       NaN       NaN
2000-01-02 08:00:00       NaN       NaN
2000-01-02 12:00:00       NaN       NaN
2000-01-02 16:00:00       NaN       NaN
2000-01-02 20:00:00       NaN       NaN
2000-01-03 00:00:00 -0.215126 -0.215126
2000-01-03 04:00:00       NaN       NaN
2000-01-03 08:00:00       NaN       NaN
2000-01-03 12:00:00       NaN       NaN
2000-01-03 16:00:00       NaN       NaN
2000-01-03 20:00:00       NaN       NaN
2000-01-04 00:00:00 -1.231664       NaN
2000-01-04 04:00:00       NaN       NaN
2000-01-04 08:00:00       NaN       NaN
2000-01-04 12:00:00       NaN       NaN
2000-01-04 16:00:00       NaN       NaN
2000-01-04 20:00:00       NaN       NaN
2000-01-05 00:00:00  0.334308  0.334308
2000-01-05 04:00:00       NaN       NaN
2000-01-05 08:00:00       NaN       NaN
2000-01-05 12:00:00       NaN       NaN
2000-01-05 16:00:00       NaN       NaN
2000-01-05 20:00:00       NaN       NaN
2000-01-06 00:00:00  1.234491  1.234491
2000-01-06 04:00:00       NaN       NaN
2000-01-06 08:00:00       NaN       NaN
2000-01-06 12:00:00       NaN       NaN
2000-01-06 16:00:00       NaN       NaN
2000-01-06 20:00:00       NaN       NaN
2000-01-07 00:00:00 -0.678003 -0.678003
2000-01-07 04:00:00       NaN       NaN
2000-01-07 08:00:00       NaN       NaN
2000-01-07 12:00:00       NaN       NaN
2000-01-07 16:00:00       NaN       NaN
2000-01-07 20:00:00       NaN       NaN
2000-01-08 00:00:00 -0.496512 -0.496512
2000-01-08 04:00:00       NaN       NaN
2000-01-08 08:00:00       NaN       NaN
2000-01-08 12:00:00       NaN       NaN
2000-01-08 16:00:00       NaN       NaN
2000-01-08 20:00:00       NaN       NaN
2000-01-09 00:00:00 -2.023730 -2.023730
2000-01-09 04:00:00       NaN       NaN
2000-01-09 08:00:00       NaN       NaN
2000-01-09 12:00:00       NaN       NaN
2000-01-09 16:00:00       NaN       NaN
2000-01-09 20:00:00       NaN       NaN
2000-01-10 00:00:00 -0.464823       NaN
In [102]:
df.asfreq('4h').ffill(limit=3)
Out[102]:
                            A         B
2000-01-01 00:00:00  0.583378  0.583378
2000-01-01 04:00:00  0.583378  0.583378
2000-01-01 08:00:00  0.583378  0.583378
2000-01-01 12:00:00  0.583378  0.583378
2000-01-01 16:00:00       NaN       NaN
2000-01-01 20:00:00       NaN       NaN
2000-01-02 00:00:00 -1.542445       NaN
2000-01-02 04:00:00 -1.542445       NaN
2000-01-02 08:00:00 -1.542445       NaN
2000-01-02 12:00:00 -1.542445       NaN
2000-01-02 16:00:00       NaN       NaN
2000-01-02 20:00:00       NaN       NaN
2000-01-03 00:00:00 -0.215126 -0.215126
2000-01-03 04:00:00 -0.215126 -0.215126
2000-01-03 08:00:00 -0.215126 -0.215126
2000-01-03 12:00:00 -0.215126 -0.215126
2000-01-03 16:00:00       NaN       NaN
2000-01-03 20:00:00       NaN       NaN
2000-01-04 00:00:00 -1.231664       NaN
2000-01-04 04:00:00 -1.231664       NaN
2000-01-04 08:00:00 -1.231664       NaN
2000-01-04 12:00:00 -1.231664       NaN
2000-01-04 16:00:00       NaN       NaN
2000-01-04 20:00:00       NaN       NaN
2000-01-05 00:00:00  0.334308  0.334308
2000-01-05 04:00:00  0.334308  0.334308
2000-01-05 08:00:00  0.334308  0.334308
2000-01-05 12:00:00  0.334308  0.334308
2000-01-05 16:00:00       NaN       NaN
2000-01-05 20:00:00       NaN       NaN
2000-01-06 00:00:00  1.234491  1.234491
2000-01-06 04:00:00  1.234491  1.234491
2000-01-06 08:00:00  1.234491  1.234491
2000-01-06 12:00:00  1.234491  1.234491
2000-01-06 16:00:00       NaN       NaN
2000-01-06 20:00:00       NaN       NaN
2000-01-07 00:00:00 -0.678003 -0.678003
2000-01-07 04:00:00 -0.678003 -0.678003
2000-01-07 08:00:00 -0.678003 -0.678003
2000-01-07 12:00:00 -0.678003 -0.678003
2000-01-07 16:00:00       NaN       NaN
2000-01-07 20:00:00       NaN       NaN
2000-01-08 00:00:00 -0.496512 -0.496512
2000-01-08 04:00:00 -0.496512 -0.496512
2000-01-08 08:00:00 -0.496512 -0.496512
2000-01-08 12:00:00 -0.496512 -0.496512
2000-01-08 16:00:00       NaN       NaN
2000-01-08 20:00:00       NaN       NaN
2000-01-09 00:00:00 -2.023730 -2.023730
2000-01-09 04:00:00 -2.023730 -2.023730
2000-01-09 08:00:00 -2.023730 -2.023730
2000-01-09 12:00:00 -2.023730 -2.023730
2000-01-09 16:00:00       NaN       NaN
2000-01-09 20:00:00       NaN       NaN
2000-01-10 00:00:00 -0.464823       NaN

Groupby operations

In [103]:
import random, string
import matplotlib as mpl
def rands(n):
    choices = string.ascii_letters
    return ''.join([random.choice(choices) for _ in xrange(n)])
mpl.rc('figure', figsize=(12, 8))

ind_names = np.array(['ENERGY', 'FINANCIAL', 'TECH', 
                      'CONSDUR', 'SERVICES', 'UTILITIES'], dtype='O')
ccys = np.array(['USD', 'EUR'], dtype='O')

Nfull = 2000
tickers = np.array(sorted(rands(5).upper() for _ in xrange(Nfull)), dtype='O')
tickers = np.unique(tickers)

industries = pd.Series(ind_names.take(np.random.randint(0, 6, Nfull)), 
                       index=tickers, name='industry')
ccy = pd.Series(ccys.take(np.random.randint(0, len(ccys), Nfull)), 
                index=tickers, name='ccy')
In [104]:
ccy
Out[104]:
AAEUC    EUR
AAKGK    EUR
AATPS    EUR
AAUGH    EUR
AAXBK    USD
ABQLB    USD
ACPJP    EUR
ACSGL    EUR
ADCYV    EUR
ADFEZ    EUR
ADNMK    USD
ADUCW    EUR
AENYT    EUR
AEQKA    EUR
AETPI    USD
...
ZVHFY    EUR
ZVHKA    USD
ZVVJW    EUR
ZWCVS    EUR
ZWMOZ    EUR
ZXBDD    USD
ZXDQC    EUR
ZXNPV    EUR
ZYLEW    EUR
ZYTMF    EUR
ZYYHV    USD
ZZBYE    USD
ZZENJ    EUR
ZZJOX    EUR
ZZZCZ    USD
Name: ccy, Length: 2000
In [105]:
df = pd.DataFrame({'Momentum' : np.random.randn(1000) / 200 + 0.03,
                'Value' : np.random.randn(1000) / 200 + 0.08,
                'ShortInterest' : np.random.randn(1000) / 200 - 0.02},
                index=tickers.take(np.random.permutation(Nfull)[:1000]))
df.head()
Out[105]:
       Momentum  ShortInterest     Value
WPZGV  0.030320      -0.022210  0.067899
KTCLL  0.031851      -0.016965  0.079431
RMXIV  0.031954      -0.018996  0.077051
SUYKU  0.025723      -0.022544  0.076665
EEYEA  0.022493      -0.026726  0.079324
In [106]:
means = df.groupby(industries).mean()
means
Out[106]:
           Momentum  ShortInterest     Value
industry                                    
CONSDUR    0.029303      -0.020244  0.080010
ENERGY     0.029526      -0.019828  0.079985
FINANCIAL  0.029805      -0.019937  0.079605
SERVICES   0.030393      -0.020437  0.079563
TECH       0.029396      -0.019980  0.080333
UTILITIES  0.029690      -0.020159  0.080312
In [107]:
means.plot(kind='barh')
Out[107]:
<matplotlib.axes.AxesSubplot at 0x13025aa50>
In [108]:
means = df.groupby([industries, ccy]).mean()
means
Out[108]:
               Momentum  ShortInterest     Value
industry  ccy                                   
CONSDUR   EUR  0.029077      -0.020063  0.080763
          USD  0.029529      -0.020424  0.079257
ENERGY    EUR  0.029349      -0.019963  0.079538
          USD  0.029728      -0.019673  0.080498
FINANCIAL EUR  0.029460      -0.020284  0.079718
          USD  0.030215      -0.019524  0.079471
SERVICES  EUR  0.030222      -0.020150  0.079125
          USD  0.030539      -0.020683  0.079939
TECH      EUR  0.029018      -0.019371  0.080187
          USD  0.029731      -0.020521  0.080461
UTILITIES EUR  0.029562      -0.020285  0.080697
          USD  0.029847      -0.020006  0.079843
In [109]:
keys = [industries, ccy]
zscore = lambda x: (x - x.mean()) / x.std()
normed = df.groupby(keys).apply(zscore)
In [110]:
normed.groupby(keys).agg(['mean', 'std'])
Out[110]:
                   Momentum       ShortInterest              Value     
                       mean  std           mean  std          mean  std
industry  ccy                                                          
CONSDUR   EUR -1.717145e-16    1   1.281938e-15    1  2.158274e-15    1
          USD -1.576517e-15    1  -1.245300e-15    1 -3.043491e-15    1
ENERGY    EUR -4.680352e-16    1  -2.024524e-15    1  1.094985e-15    1
          USD  6.536594e-16    1   9.056426e-16    1 -1.187564e-15    1
FINANCIAL EUR -1.144085e-15    1   1.731948e-16    1 -7.722989e-16    1
          USD -4.130294e-17    1   4.378112e-16    1  2.775558e-15    1
SERVICES  EUR -3.221210e-16    1  -1.845159e-16    1 -5.222739e-15    1
          USD -2.193694e-16    1  -1.553643e-15    1  1.317888e-15    1
TECH      EUR -1.467181e-15    1   8.108844e-16    1 -2.259093e-15    1
          USD  1.066563e-15    1  -1.777604e-15    1  2.604658e-15    1
UTILITIES EUR -2.628850e-15    1   1.698377e-15    1 -3.649858e-15    1
          USD -6.454177e-16    1  -6.082091e-16    1 -1.024141e-15    1

Hierarchical indexing

In [111]:
means
Out[111]:
               Momentum  ShortInterest     Value
industry  ccy                                   
CONSDUR   EUR  0.029077      -0.020063  0.080763
          USD  0.029529      -0.020424  0.079257
ENERGY    EUR  0.029349      -0.019963  0.079538
          USD  0.029728      -0.019673  0.080498
FINANCIAL EUR  0.029460      -0.020284  0.079718
          USD  0.030215      -0.019524  0.079471
SERVICES  EUR  0.030222      -0.020150  0.079125
          USD  0.030539      -0.020683  0.079939
TECH      EUR  0.029018      -0.019371  0.080187
          USD  0.029731      -0.020521  0.080461
UTILITIES EUR  0.029562      -0.020285  0.080697
          USD  0.029847      -0.020006  0.079843
In [112]:
means['Momentum']
Out[112]:
industry   ccy
CONSDUR    EUR    0.029077
           USD    0.029529
ENERGY     EUR    0.029349
           USD    0.029728
FINANCIAL  EUR    0.029460
           USD    0.030215
SERVICES   EUR    0.030222
           USD    0.030539
TECH       EUR    0.029018
           USD    0.029731
UTILITIES  EUR    0.029562
           USD    0.029847
Name: Momentum
In [113]:
means.ix['TECH']
Out[113]:
     Momentum  ShortInterest     Value
ccy                                   
EUR  0.029018      -0.019371  0.080187
USD  0.029731      -0.020521  0.080461
In [114]:
means.stack()
Out[114]:
industry   ccy               
CONSDUR    EUR  Momentum         0.029077
                ShortInterest   -0.020063
                Value            0.080763
           USD  Momentum         0.029529
                ShortInterest   -0.020424
                Value            0.079257
ENERGY     EUR  Momentum         0.029349
                ShortInterest   -0.019963
                Value            0.079538
           USD  Momentum         0.029728
                ShortInterest   -0.019673
                Value            0.080498
FINANCIAL  EUR  Momentum         0.029460
                ShortInterest   -0.020284
                Value            0.079718
           USD  Momentum         0.030215
                ShortInterest   -0.019524
                Value            0.079471
SERVICES   EUR  Momentum         0.030222
                ShortInterest   -0.020150
                Value            0.079125
           USD  Momentum         0.030539
                ShortInterest   -0.020683
                Value            0.079939
TECH       EUR  Momentum         0.029018
                ShortInterest   -0.019371
                Value            0.080187
           USD  Momentum         0.029731
                ShortInterest   -0.020521
                Value            0.080461
UTILITIES  EUR  Momentum         0.029562
                ShortInterest   -0.020285
                Value            0.080697
           USD  Momentum         0.029847
                ShortInterest   -0.020006
                Value            0.079843
In [115]:
means.stack().unstack('industry')
Out[115]:
industry            CONSDUR    ENERGY  FINANCIAL  SERVICES      TECH  UTILITIES
ccy                                                                            
EUR Momentum       0.029077  0.029349   0.029460  0.030222  0.029018   0.029562
    ShortInterest -0.020063 -0.019963  -0.020284 -0.020150 -0.019371  -0.020285
    Value          0.080763  0.079538   0.079718  0.079125  0.080187   0.080697
USD Momentum       0.029529  0.029728   0.030215  0.030539  0.029731   0.029847
    ShortInterest -0.020424 -0.019673  -0.019524 -0.020683 -0.020521  -0.020006
    Value          0.079257  0.080498   0.079471  0.079939  0.080461   0.079843

Merging and joining

In [118]:
base = '/Users/wesm/Dropbox/book/svn/book_scripts/movielens/ml-1m'
get_path = lambda x: '%s/%s.dat' % (base, x)

unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table(get_path('users'), sep='::', header=None, names=unames)

rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table(get_path('ratings'), sep='::', header=None, names=rnames)
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table(get_path('movies'), sep='::', header=None, names=mnames)
In [119]:
movies.head()
Out[119]:
   movie_id                               title                        genres
0         1                    Toy Story (1995)   Animation|Children's|Comedy
1         2                      Jumanji (1995)  Adventure|Children's|Fantasy
2         3             Grumpier Old Men (1995)                Comedy|Romance
3         4            Waiting to Exhale (1995)                  Comedy|Drama
4         5  Father of the Bride Part II (1995)                        Comedy
In [120]:
ratings.head()
Out[120]:
   user_id  movie_id  rating  timestamp
0        1      1193       5  978300760
1        1       661       3  978302109
2        1       914       3  978301968
3        1      3408       4  978300275
4        1      2355       5  978824291
In [121]:
users.head()
Out[121]:
   user_id gender  age  occupation    zip
0        1      F    1          10  48067
1        2      M   56          16  70072
2        3      M   25          15  55117
3        4      M   45           7  02460
4        5      M   25          20  55455
In [122]:
data = pd.merge(pd.merge(ratings, users), movies)
data
Out[122]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000209 entries, 0 to 1000208
Data columns:
user_id       1000209  non-null values
movie_id      1000209  non-null values
rating        1000209  non-null values
timestamp     1000209  non-null values
gender        1000209  non-null values
age           1000209  non-null values
occupation    1000209  non-null values
zip           1000209  non-null values
title         1000209  non-null values
genres        1000209  non-null values
dtypes: int64(6), object(4)
In [123]:
rating_counts = data.groupby('title').size()
freq_titles = rating_counts.index[rating_counts > 1000]
freq_titles
Out[123]:
Index([2001: A Space Odyssey (1968), Abyss, The (1989), African Queen, The (1951), Air Force One (1997), Airplane! (1980), Aladdin (1992), Alien (1979), Aliens (1986), Amadeus (1984), American Beauty (1999), American Pie (1999), American President, The (1995), Animal House (1978), Annie Hall (1977), Apocalypse Now (1979), Apollo 13 (1995), Arachnophobia (1990), Armageddon (1998), As Good As It Gets (1997), Austin Powers: International Man of Mystery (1997), Austin Powers: The Spy Who Shagged Me (1999), Babe (1995), Back to the Future (1985), Back to the Future Part II (1989), Back to the Future Part III (1990), Batman (1989), Batman Returns (1992), Beauty and the Beast (1991), Beetlejuice (1988), Being John Malkovich (1999), Big (1988), Big Lebowski, The (1998), Blade Runner (1982), Blair Witch Project, The (1999), Blazing Saddles (1974), Blues Brothers, The (1980), Boat, The (Das Boot) (1981), Boogie Nights (1997), Braveheart (1995), Breakfast Club, The (1985), Bug's Life, A (1998), Bull Durham (1988), Butch Cassidy and the Sundance Kid (1969), Casablanca (1942), Chicken Run (2000), Chinatown (1974), Christmas Story, A (1983), Citizen Kane (1941), Clear and Present Danger (1994), Clerks (1994), Clockwork Orange, A (1971), Close Encounters of the Third Kind (1977), Clueless (1995), Cocoon (1985), Contact (1997), Crying Game, The (1992), Dances with Wolves (1990), Die Hard (1988), Die Hard 2 (1990), Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963), E.T. the Extra-Terrestrial (1982), Edward Scissorhands (1990), Election (1999), Erin Brockovich (2000), Face/Off (1997), Fargo (1996), Ferris Bueller's Day Off (1986), Few Good Men, A (1992), Fifth Element, The (1997), Fight Club (1999), Fish Called Wanda, A (1988), Fly, The (1986), Forrest Gump (1994), Four Weddings and a Funeral (1994), Fugitive, The (1993), Full Metal Jacket (1987), Full Monty, The (1997), Galaxy Quest (1999), Gattaca (1997), Get Shorty (1995), Ghost (1990), Ghostbusters (1984), Gladiator (2000), Glory (1989), Godfather, The (1972), Godfather: Part II, The (1974), Gone with the Wind (1939), Good Morning, Vietnam (1987), Good Will Hunting (1997), GoodFellas (1990), Graduate, The (1967), Green Mile, The (1999), Grosse Pointe Blank (1997), Groundhog Day (1993), High Fidelity (2000), Honey, I Shrunk the Kids (1989), Hunt for Red October, The (1990), Independence Day (ID4) (1996), Indiana Jones and the Last Crusade (1989), Indiana Jones and the Temple of Doom (1984), Jaws (1975), Jerry Maguire (1996), Jurassic Park (1993), L.A. Confidential (1997), League of Their Own, A (1992), Lethal Weapon (1987), Lethal Weapon 2 (1989), Life Is Beautiful (La Vita � bella) (1997), Lion King, The (1994), Little Mermaid, The (1989), Lost World: Jurassic Park, The (1997), M*A*S*H (1970), Mad Max (1979), Mad Max 2 (a.k.a. The Road Warrior) (1981), Magnolia (1999), Maltese Falcon, The (1941), Mars Attacks! (1996), Mary Poppins (1964), Mask, The (1994), Matrix, The (1999), Men in Black (1997), Mission: Impossible (1996), Mission: Impossible 2 (2000), Monty Python and the Holy Grail (1974), Mummy, The (1999), My Cousin Vinny (1992), North by Northwest (1959), One Flew Over the Cuckoo's Nest (1975), Patriot Games (1992), Patriot, The (2000), Perfect Storm, The (2000), Planet of the Apes (1968), Platoon (1986), Pleasantville (1998), Predator (1987), Pretty Woman (1990), Princess Bride, The (1987), Psycho (1960), Pulp Fiction (1994), Raiders of the Lost Ark (1981), Rain Man (1988), Raising Arizona (1987), Rear Window (1954), Reservoir Dogs (1992), Robocop (1987), Rock, The (1996), Rocky (1976), Rocky Horror Picture Show, The (1975), Romancing the Stone (1984), Run Lola Run (Lola rennt) (1998), Rushmore (1998), Saving Private Ryan (1998), Schindler's List (1993), Seven (Se7en) (1995), Shakespeare in Love (1998), Shawshank Redemption, The (1994), Shining, The (1980), Silence of the Lambs, The (1991), Sixth Sense, The (1999), Sleepy Hollow (1999), Sling Blade (1996), Sneakers (1992), South Park: Bigger, Longer and Uncut (1999), Speed (1994), Splash (1984), Stand by Me (1986), Star Trek IV: The Voyage Home (1986), Star Trek VI: The Undiscovered Country (1991), Star Trek: First Contact (1996), Star Trek: The Wrath of Khan (1982), Star Wars: Episode I - The Phantom Menace (1999), Star Wars: Episode IV - A New Hope (1977), Star Wars: Episode V - The Empire Strikes Back (1980), Star Wars: Episode VI - Return of the Jedi (1983), Stargate (1994), Starship Troopers (1997), Sting, The (1973), Superman (1978), Talented Mr. Ripley, The (1999), Taxi Driver (1976), Terminator 2: Judgment Day (1991), Terminator, The (1984), Thelma & Louise (1991), There's Something About Mary (1998), This Is Spinal Tap (1984), Thomas Crown Affair, The (1999), Three Kings (1999), Time Bandits (1981), Titanic (1997), Top Gun (1986), Total Recall (1990), Toy Story (1995), Toy Story 2 (1999), True Lies (1994), Truman Show, The (1998), Twelve Monkeys (1995), Twister (1996), Untouchables, The (1987), Usual Suspects, The (1995), Wayne's World (1992), When Harry Met Sally... (1989), Who Framed Roger Rabbit? (1988), Willy Wonka and the Chocolate Factory (1971), Witness (1985), Wizard of Oz, The (1939), X-Men (2000), Young Frankenstein (1974)], dtype=object)
In [124]:
highest_rated = data.groupby('title').rating.mean()[freq_titles].order()[-20:]
highest_rated
Out[124]:
title
Life Is Beautiful (La Vita � bella) (1997)                      4.329861
Monty Python and the Holy Grail (1974)                          4.335210
Saving Private Ryan (1998)                                      4.337354
Chinatown (1974)                                                4.339241
Silence of the Lambs, The (1991)                                4.351823
Godfather: Part II, The (1974)                                  4.357565
North by Northwest (1959)                                       4.384030
Citizen Kane (1941)                                             4.388889
One Flew Over the Cuckoo's Nest (1975)                          4.390725
Maltese Falcon, The (1941)                                      4.395973
Sixth Sense, The (1999)                                         4.406263
Casablanca (1942)                                               4.412822
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)    4.449890
Star Wars: Episode IV - A New Hope (1977)                       4.453694
Rear Window (1954)                                              4.476190
Raiders of the Lost Ark (1981)                                  4.477725
Schindler's List (1993)                                         4.510417
Usual Suspects, The (1995)                                      4.517106
Godfather, The (1972)                                           4.524966
Shawshank Redemption, The (1994)                                4.554558
Name: rating
In [125]:
filtered = data[data.title.isin(highest_rated.index)]
filtered.title = filtered.title.str[:25]
filtered.groupby(['title', 'gender']).rating.count().unstack()
Out[125]:
gender                       F     M
title                               
Casablanca (1942)          505  1164
Chinatown (1974)           255   930
Citizen Kane (1941)        280   836
Dr. Strangelove or: How I  231  1136
Godfather, The (1972)      483  1740
Godfather: Part II, The (  342  1350
Life Is Beautiful (La Vit  367   785
Maltese Falcon, The (1941  235   808
Monty Python and the Holy  352  1247
North by Northwest (1959)  332   983
One Flew Over the Cuckoo'  444  1281
Raiders of the Lost Ark (  572  1942
Rear Window (1954)         291   759
Saving Private Ryan (1998  575  2078
Schindler's List (1993)    615  1689
Shawshank Redemption, The  627  1600
Silence of the Lambs, The  706  1872
Sixth Sense, The (1999)    664  1795
Star Wars: Episode IV - A  647  2344
Usual Suspects, The (1995  413  1370

Pivot tables

In [126]:
mean_ratings = data.pivot_table('rating', rows='title',
                                cols='gender', aggfunc='mean')
mean_ratings.tail(20)
Out[126]:
gender                                             F         M
title                                                         
Year of the Horse (1997)                         NaN  3.250000
Yellow Submarine (1968)                     3.714286  3.689286
Yojimbo (1961)                              4.423077  4.402116
You Can't Take It With You (1938)           4.192308  3.921569
You So Crazy (1994)                         3.666667  2.300000
You've Got Mail (1998)                      3.542424  3.275591
Young Doctors in Love (1982)                1.923077  2.742424
Young Frankenstein (1974)                   4.289963  4.239177
Young Guns (1988)                           3.371795  3.425620
Young Guns II (1990)                        2.934783  2.904025
Young Poisoner's Handbook, The (1995)       4.000000  3.532258
Young Sherlock Holmes (1985)                3.514706  3.363344
Young and Innocent (1937)                   2.500000  3.500000
Your Friends and Neighbors (1998)           2.888889  3.536585
Zachariah (1971)                                 NaN  3.500000
Zed & Two Noughts, A (1985)                 3.500000  3.380952
Zero Effect (1998)                          3.864407  3.723140
Zero Kelvin (Kj�rlighetens kj�tere) (1995)       NaN  3.500000
Zeus and Roxanne (1997)                     2.777778  2.357143
eXistenZ (1999)                             3.098592  3.289086

Data summary, statistics

summary, value_counts, etc.

In [127]:
data.title.value_counts()
Out[127]:
American Beauty (1999)                                   3428
Star Wars: Episode IV - A New Hope (1977)                2991
Star Wars: Episode V - The Empire Strikes Back (1980)    2990
Star Wars: Episode VI - Return of the Jedi (1983)        2883
Jurassic Park (1993)                                     2672
Saving Private Ryan (1998)                               2653
Terminator 2: Judgment Day (1991)                        2649
Matrix, The (1999)                                       2590
Back to the Future (1985)                                2583
Silence of the Lambs, The (1991)                         2578
Men in Black (1997)                                      2538
Raiders of the Lost Ark (1981)                           2514
Fargo (1996)                                             2513
Sixth Sense, The (1999)                                  2459
Braveheart (1995)                                        2443
...
Beauty (1998)                                                   1
Legal Deceit (1997)                                             1
Silence of the Palace, The (Saimt el Qusur) (1994)              1
Relative Fear (1994)                                            1
For Ever Mozart (1996)                                          1
White Boys (1999)                                               1
Terror in a Texas Town (1958)                                   1
Schlafes Bruder (Brother of Sleep) (1995)                       1
Follow the Bitch (1998)                                         1
Even Dwarfs Started Small (Auch Zwerge haben klein angefangen) (1971)    1
Low Life, The (1994)                                            1
Wooden Man's Bride, The (Wu Kui) (1994)                         1
Back Stage (2000)                                               1
One Man's Hero (1999)                                           1
Beloved/Friend (Amigo/Amado) (1999)                             1
Length: 3706
In [128]:
data.rating.describe()
Out[128]:
count    1000209.000000
mean           3.581564
std            1.117102
min            1.000000
25%            3.000000
50%            4.000000
75%            4.000000
max            5.000000
In [129]:
by_gender = data.groupby('gender').rating.describe()
by_gender
Out[129]:
gender       
F       count    246440.000000
        mean          3.620366
        std           1.111228
        min           1.000000
        25%           3.000000
        50%           4.000000
        75%           4.000000
        max           5.000000
M       count    753769.000000
        mean          3.568879
        std           1.118724
        min           1.000000
        25%           3.000000
        50%           4.000000
        75%           4.000000
        max           5.000000
In [130]:
by_gender.unstack(0)
Out[130]:
gender              F              M
count   246440.000000  753769.000000
mean         3.620366       3.568879
std          1.111228       1.118724
min          1.000000       1.000000
25%          3.000000       3.000000
50%          4.000000       4.000000
75%          4.000000       4.000000
max          5.000000       5.000000
In [ ]: