# http://stackoverflow.com/questions/11979194/subclasses-of-pandas-object-work-differently-from-subclass-of-other-object
import pandas as pd
class Support(pd.Series):
    """pd.Series subclass that carries extra helper methods.

    NOTE(review): the __new__/view recipe below dates from when Series
    subclassed ndarray; modern pandas subclassing uses the `_constructor`
    property instead — confirm against the pandas version in use.
    """

    def __new__(cls, *args, **kwargs):
        # FIX: `Series` was an unqualified name (NameError at runtime) —
        # only `pd` is imported at module level, so qualify the call.
        arr = pd.Series.__new__(cls, *args, **kwargs)
        # Re-view the freshly created array as the subclass type.
        return arr.view(Support)

    def supportMethod1(self):
        # print() with a single argument is valid in both Python 2 and 3.
        print('I am support method 1')

    def supportMethod2(self):
        print('I am support method 2')
class Compute(object):
    """Holds a pluggable support callable and invokes it on demand."""

    # Callable injected at runtime (see the Config class in this file);
    # left as None until something wires it in.
    supp = None

    def test(self):
        """Invoke the currently configured support callable.

        Raises TypeError if nothing has been wired into `supp` yet.
        """
        configured = self.supp
        configured()
class Config(object):
    """Registry that owns the shared Support instance and wires one of its
    methods onto Compute.supp."""

    # Shared Support instance; populated by initializeConfig().
    supp = None

    @classmethod
    def initializeConfig(cls):
        """Create the shared Support series used by the setConfig* hooks."""
        cls.supp = Support()

    @classmethod
    def _wire(cls, method_name):
        # Bind the named method of the shared Support instance onto Compute.
        Compute.supp = getattr(cls.supp, method_name)

    @classmethod
    def setConfig1(cls):
        """Route Compute.supp to supportMethod1 of the shared instance."""
        cls._wire('supportMethod1')

    @classmethod
    def setConfig2(cls):
        """Route Compute.supp to supportMethod2 of the shared instance."""
        cls._wire('supportMethod2')
# adding the __new__ works for this simple demo
# Construct the subclass and confirm that the bound-method call equals the
# class-level call (both print and return None), and that the instance type
# survives construction.
s = Support(range(10))
assert s.supportMethod1() == Support.supportMethod1(s)
assert isinstance(s, Support)
I am support method 1 I am support method 1
The problem is that there are many places where Series data is boxed and unboxed. That data will come back as a plain Series, which limits how useful a Series subclass can be.
# Operations that box a new Series drop the subclass: cumsum() and .ix
# slicing hand back plain Series objects, not Support.
# NOTE(review): .ix was removed in pandas 1.0 — modern code would use
# .iloc/.loc here.
assert not isinstance(s.cumsum(), Support)
assert not isinstance(s.ix[:5], Support)
Also, when you add a Series to a DataFrame, you're not really adding the Series to the frame — you're adding its data. The DataFrame holds the data and boxes it as a Series when you access it.
Whatever speed you're getting from pandas/numpy has a lot to do with how the data is stored. A DataFrame is not a collection of pointers to Series. Its data needs to be consolidated, and so each Series just becomes a row in a bigger data set, losing its Series-likeness until it's reboxed.
# Building a DataFrame copies the Series' data; the frame does not hold the
# Support object itself, so later mutation of `s` is not reflected in df.
s = Support(range(10))
df = pd.DataFrame({'s': s})
# df.s is not Support or even the Series. It's the data.
assert not isinstance(df.s, Support)
assert id(s) != id(df.s)
s.ix[0] = 888
# does not change df
df
s | |
---|---|
0 | 0 |
1 | 1 |
2 | 2 |
3 | 3 |
4 | 4 |
5 | 5 |
6 | 6 |
7 | 7 |
8 | 8 |
9 | 9 |
When you're working with Frames, your data is being consolidated like so:
import numpy as np
# vstack copies both series' data into one contiguous 2-D array, mimicking
# how a DataFrame consolidates its columns; the originals are no longer
# linked to the copy.
s1 = Support(range(10))
s2 = Support(range(10, 20))
stacked = np.vstack((s1, s2))
# NOTE(review): this mutates `s` (from the earlier demo), not s1/s2 —
# presumably `s1.ix[0]` was meant; either way the stacked copy is unaffected.
s.ix[0] = 10
assert stacked[0][0] != s[0]
stacked
array([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]])
After the consolidation, all you really know is that stacked is two rows of 10 columns. You could roughly simulate what a DataFrame does like so:
# Simulated DataFrame bookkeeping: map each "column" name to its row index
# in the consolidated `stacked` array.
items = {}
items['s1'] = 0
items['s2'] = 1
def get_support_series(frame, key):
    """Re-box one row of a consolidated 2-D array as a Support series.

    Mimics how a DataFrame boxes its internal row data on access.

    frame: 2-D array of consolidated row data (e.g. ``stacked``).
    key: "column" name registered in the module-level ``items`` mapping.
    Returns a new Support built from the looked-up row.
    """
    ind = items[key]
    # FIX: use the frame passed in by the caller instead of the module-level
    # global `stacked` (the original ignored its `frame` parameter; callers
    # pass `stacked`, so behavior at the existing call site is unchanged).
    row = frame[ind]
    # box row
    return Support(row)
s2_copy = get_support_series(stacked, 's2')
# values are same...
assert np.all(s2_copy == s2)
# but...
assert s2_copy is not s2
# Mutating the original does not touch the re-boxed copy, because vstack
# copied the data out of s2.
s2[0] = 888
assert s2_copy[0] != 888
# they are not the same!
So, to support subclasses, you'd have to keep track of each class type and the additional metadata that class requires. Anything in the instance's `__dict__` would be lost unless that data was stored somewhere to be boxed later. And then there's supporting HDF5 and whatever other persistence formats that assume the data is really just rows and columns.