import pandas as pd
import numpy as np
# Column names for the data file, which is read without a header row.
cols = ['time', 'cd4', 'age', 'packs', 'drugs', 'sex', 'cesd', 'id']
# Space-delimited file; skipinitialspace skips the blanks that follow a delimiter.
cd4Data = pd.read_csv(
    'https://spark-public.s3.amazonaws.com/dataanalysis/cd4.data',
    header=None, names=cols, skipinitialspace=True, delimiter=' ')
# Order the rows by time so the rolling windows below run along the time axis.
# (DataFrame.ix and Series.order() no longer exist in pandas; sort_values
# yields the same row order.)
cd4Data = cd4Data.sort_values('time')
# Shift the row labels up by one.
cd4Data.index = cd4Data.index + 1
cd4Data.head()
 | time | cd4 | age | packs | drugs | sex | cesd | id |
---|---|---|---|---|---|---|---|---|
1279 | -2.989733 | 814 | 6.17 | 3 | 1 | 5 | -3 | 30183 |
2190 | -2.989733 | 400 | -6.02 | 0 | 0 | 3 | -4 | 41406 |
1167 | -2.984257 | 467 | 13.94 | 0 | 1 | 1 | 0 | 30046 |
1427 | -2.956879 | 749 | -4.54 | 0 | 1 | -1 | -7 | 30498 |
2032 | -2.951403 | 1218 | 5.57 | 3 | 1 | 5 | 3 | 41032 |
# NOTE(review): `plot` is not defined in this file -- presumably provided by a
# pylab-style interactive session; confirm before running as a plain script.
# pd.rolling_mean() no longer exists in pandas; Series.rolling(...).mean() is
# the equivalent centred moving average.
plot(cd4Data['time'], cd4Data['cd4'], 'ok', markersize=1)
# average from 2 before until 2 after (window size = 5)
plot(cd4Data['time'], cd4Data['cd4'].rolling(5, center=True).mean(), linewidth=2);
plot(cd4Data['time'], cd4Data['cd4'], 'ok', markersize=1)
# average from 10 before until 10 after (window size = 21)
plot(cd4Data['time'], cd4Data['cd4'].rolling(21, center=True).mean(), linewidth=2);
plot(cd4Data['time'], cd4Data['cd4'], 'ok', markersize=1)
# average from 200 before until 200 after (window size = 401)
plot(cd4Data['time'], cd4Data['cd4'].rolling(401, center=True).mean(), linewidth=3);
def tukey(x):
    """Evaluate the kernel weight ``max(1 - x**2, 0) ** 2`` at ``x``.

    The weight is 1 at ``x = 0``, falls to 0 at ``|x| = 1`` and stays 0
    for larger ``|x|``.

    Args:
        x: A scalar, or a sequence / NumPy array that is handled
            element-wise.

    Returns:
        The weight for a scalar ``x``, or an array of weights with the
        same shape as ``x``.
    """
    # np.maximum clips element-wise, so arrays work as well as scalars; the
    # built-in max() cannot compare a multi-element array against 0.
    return np.maximum(1 - np.asarray(x) ** 2, 0) ** 2
# Half-width of the example window: 10 points on either side of the centre.
ws = 10.
# this is just for plotting, not really useful in the moving average later
# Build the weights as an array: on Python 3 `map` returns a one-shot
# iterator, which can be neither summed twice nor divided.
filt = np.array([tukey(u) for u in np.arange(-ws, ws + 1) / (ws + 1)])
# Normalise so the weights sum to one.
filt = filt / filt.sum()
plot(np.arange(-ws, ws + 1), filt, 'ok');
# here we'd like to define a 'reducer' function for the rolling apply,
# that is, a function that returns a scalar given a vector
def filt(x):
    """Return the kernel-weighted average of one window of values.

    Weights are obtained by evaluating ``tukey`` at offsets centred on the
    middle of the window and divided by ``ws + 1``; they are then
    normalised to sum to one.

    Args:
        x: One-dimensional sequence holding a complete window.

    Returns:
        The weighted sum of ``x`` as a scalar.
    """
    values = np.asarray(x)
    n = len(values)
    # determining ws is a bit tricky,
    # because x should contain the whole window
    ws = (n - 1) / 2.
    # Offsets -ws .. ws in unit steps; built from an integer range so the
    # number of weights always equals the number of values.
    offsets = np.arange(n) - ws
    # Materialise the weights as an array: on Python 3 `map` is a lazy
    # iterator that cannot be summed and then divided.
    f = np.array([tukey(u) for u in offsets / (ws + 1)])
    f = f / f.sum()
    return np.sum(f * values)
# ws=100 means 100 before until 100 after --> window size of 201
# The window size is a count of observations, so it is kept as an int.
window = 201
plot(cd4Data['time'], cd4Data['cd4'], 'ok', markersize=1)
# weighted moving average with the filt function
# (pd.rolling_apply() no longer exists in pandas; raw=True passes each window
# to filt as a plain ndarray)
plot(cd4Data['time'], cd4Data['cd4'].rolling(window, center=True).apply(filt, raw=True), linewidth=3);
# no loess implementation in pandas/statsmodels, and lowess implementation is somehow broken for this particular data
from statsmodels.nonparametric.smoothers_lowess import lowess
#lowess_line = lowess(cd4Data['cd4'], cd4Data['time'])
# the splines support is pretty basic also; R's regression with splines is just übercool