import pandas as pd
import seaborn as sns
import numpy as np
from itertools import chain
sns.__version__
'0.3.dev'
# Pin the global NumPy RNG state so the randomly sized sample groups below are
# reproducible from run to run.
np.random.seed(0)
# For each group letter, draw an exclusive upper bound with randint(4, 12) and
# emit the IDs "<letter>_1" .. "<letter>_<bound - 1>". The per-letter lists are
# concatenated in A, B, C order.
columns = list(chain.from_iterable(
    ['{}_{}'.format(prefix, number)
     for number in range(1, np.random.randint(4, 12))]
    for prefix in ['A', 'B', 'C']
))
columns
['A_1', 'A_2', 'A_3', 'A_4', 'A_5', 'A_6', 'A_7', 'B_1', 'B_2', 'B_3', 'B_4', 'B_5', 'B_6', 'B_7', 'B_8', 'B_9', 'B_10', 'C_1', 'C_2', 'C_3', 'C_4', 'C_5', 'C_6', 'C_7', 'C_8']
# Row labels: ten gene identifiers, "gene_1" through "gene_10".
index = list(map('gene_{}'.format, range(1, 11)))
index
['gene_1', 'gene_2', 'gene_3', 'gene_4', 'gene_5', 'gene_6', 'gene_7', 'gene_8', 'gene_9', 'gene_10']
# Fill a (number of genes) x (number of samples) matrix with independent draws
# from a Beta(0.5, 0.5) distribution, using the global NumPy RNG.
data = np.random.beta(
    a=0.5,
    b=0.5,
    size=(len(index), len(columns)),
)
data
array([[ 6.98100192e-01, 9.99980802e-01, 9.87985539e-01, 6.48456486e-05, 4.91422547e-01, 5.51770233e-01, 3.65547338e-01, 7.54562380e-01, 6.81443796e-01, 1.17461282e-01, 6.00214414e-02, 7.34413612e-01, 5.34260221e-01, 4.25745408e-01, 5.60891272e-02, 9.39695362e-01, 4.15434953e-01, 1.74349946e-01, 5.45878475e-01, 1.67988593e-01, 2.34069578e-02, 9.98176618e-01, 2.34642998e-01, 5.99997180e-01, 6.50186035e-01], [ 7.94773534e-01, 4.98211414e-02, 1.47921376e-03, 7.32825054e-01, 4.05044659e-02, 1.25817383e-01, 9.77079335e-01, 3.98805371e-01, 6.14201306e-01, 4.12808223e-01, 2.04214151e-01, 6.16153638e-02, 4.43763051e-01, 9.97792090e-01, 1.17163829e-01, 7.06915152e-01, 5.11311813e-01, 1.89697752e-02, 8.92791126e-01, 3.41088229e-02, 1.09425771e-01, 6.93688009e-01, 1.74507838e-01, 6.46675246e-02, 9.51044390e-01], [ 6.93554352e-01, 4.86422745e-02, 6.14128795e-01, 4.95649205e-01, 1.94665972e-01, 4.13590079e-01, 6.02262986e-02, 9.06532035e-01, 7.82491408e-01, 6.07626154e-01, 9.00537707e-01, 9.33023594e-02, 8.60477814e-01, 9.92246312e-02, 5.00634531e-01, 8.18880892e-01, 5.97677942e-01, 2.21264366e-02, 2.48834956e-01, 1.91412940e-02, 9.78765522e-01, 8.78540106e-02, 5.94167377e-01, 7.40834945e-01, 8.24369558e-03], [ 7.02018765e-01, 5.01774642e-03, 5.92178720e-01, 7.89994948e-01, 6.76900754e-01, 4.98514063e-01, 2.16422230e-01, 7.91867610e-01, 7.15053966e-03, 8.63272053e-01, 9.11725089e-01, 1.83527843e-01, 2.36540540e-01, 9.98346226e-01, 9.95533258e-01, 1.82258144e-01, 7.44840543e-01, 9.34294400e-01, 1.18908411e-01, 9.23454444e-01, 8.68103188e-01, 7.09196634e-01, 9.97503573e-01, 5.83445817e-01, 5.41327763e-01], [ 6.41294099e-01, 6.70763284e-01, 9.61441011e-01, 2.31248805e-04, 2.71918714e-01, 8.35638470e-01, 9.82882282e-01, 9.64961362e-01, 1.00634449e-01, 4.78048031e-01, 9.94694956e-01, 9.97783358e-01, 9.99055516e-01, 4.10860037e-02, 9.20578423e-02, 6.45746396e-01, 9.34845107e-01, 4.06016701e-01, 7.17516175e-01, 2.77886419e-01, 2.67576079e-01, 9.85608326e-02, 9.80963462e-02, 
3.36243203e-02, 4.71808654e-01], [ 9.48493038e-01, 2.55259818e-01, 9.93840007e-01, 3.10356581e-01, 5.98019326e-01, 4.07848162e-02, 8.99448269e-01, 1.89758259e-01, 7.40082311e-01, 3.26953718e-02, 8.33773468e-01, 2.18068173e-01, 9.71635945e-01, 6.97629596e-01, 5.43586345e-01, 5.43218940e-01, 8.38715773e-01, 7.89581904e-01, 7.70038369e-03, 9.82768930e-01, 7.38049641e-01, 1.42405017e-01, 6.84192466e-01, 9.13087529e-01, 4.10372819e-01], [ 5.05443962e-01, 1.09224002e-01, 8.22652224e-02, 8.31753192e-01, 5.01107142e-01, 9.99999994e-01, 4.80554798e-01, 3.55548751e-01, 1.84943972e-01, 8.29308343e-01, 4.85519315e-01, 2.95649456e-01, 7.37892607e-01, 1.22655052e-03, 8.31416149e-03, 9.93992701e-01, 3.45354364e-03, 9.99998193e-01, 6.54093859e-01, 9.56963581e-01, 9.98170167e-01, 8.84769831e-01, 3.78547791e-02, 9.96764161e-01, 3.27943811e-01], [ 2.41484741e-01, 3.38728178e-01, 9.01967598e-01, 5.53795040e-01, 9.41379724e-01, 1.84962400e-03, 7.69768253e-01, 8.71942441e-01, 1.08504119e-01, 1.23407779e-01, 7.60803615e-02, 1.09248589e-01, 8.54598648e-01, 4.19990578e-01, 2.73573455e-01, 3.57240372e-01, 5.75850510e-01, 1.51749962e-01, 9.99346273e-01, 9.36074707e-01, 2.81528536e-02, 1.84426656e-01, 1.31432420e-01, 9.96173333e-01, 4.21688295e-01], [ 8.12883541e-01, 6.73983724e-01, 5.16632248e-01, 9.57743746e-01, 1.89601563e-01, 5.82302216e-01, 3.75355250e-01, 9.59248770e-01, 2.46486322e-02, 4.75795441e-01, 5.26710465e-01, 9.81340850e-01, 3.86436650e-02, 3.90178483e-01, 3.38850404e-01, 2.34728302e-03, 7.99547395e-01, 6.21292883e-01, 9.82193739e-01, 2.04466328e-02, 1.17947193e-02, 6.49078010e-01, 6.52980209e-01, 7.16551892e-01, 8.05175861e-04], [ 9.66808753e-01, 8.24591978e-03, 3.97266578e-01, 3.12554800e-01, 4.99522164e-01, 6.19962741e-01, 5.83116279e-03, 9.16792867e-02, 4.14137429e-01, 8.00321941e-01, 7.88816582e-01, 2.04200460e-01, 7.89804713e-04, 1.16922617e-01, 5.95642246e-01, 1.06832221e-01, 2.24981885e-02, 9.56742800e-01, 9.92740258e-01, 9.74729014e-01, 6.09302612e-01, 5.50276638e-02, 
4.43853410e-01, 9.31485985e-01, 7.42156757e-02]])
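Add some random NaNs: overwrite 100 randomly drawn entries of the matrix with NaN. The positions are sampled with replacement, so the number of distinct NaN cells can be smaller than 100.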
# Overwrite 100 randomly drawn flat positions of the matrix with NaN.
# np.random.choice samples with replacement by default, so positions may
# repeat and fewer than 100 distinct cells may end up as NaN.
nan_positions = np.random.choice(data.size, 100)
data.flat[nan_positions] = np.nan
data
array([[ 6.98100192e-01, 9.99980802e-01, nan, nan, nan, 5.51770233e-01, 3.65547338e-01, 7.54562380e-01, 6.81443796e-01, 1.17461282e-01, 6.00214414e-02, nan, 5.34260221e-01, 4.25745408e-01, 5.60891272e-02, 9.39695362e-01, 4.15434953e-01, nan, nan, 1.67988593e-01, 2.34069578e-02, 9.98176618e-01, 2.34642998e-01, 5.99997180e-01, 6.50186035e-01], [ 7.94773534e-01, nan, nan, 7.32825054e-01, nan, nan, 9.77079335e-01, nan, 6.14201306e-01, 4.12808223e-01, nan, 6.16153638e-02, nan, nan, 1.17163829e-01, nan, 5.11311813e-01, 1.89697752e-02, 8.92791126e-01, 3.41088229e-02, nan, 6.93688009e-01, 1.74507838e-01, 6.46675246e-02, nan], [ 6.93554352e-01, 4.86422745e-02, 6.14128795e-01, 4.95649205e-01, 1.94665972e-01, 4.13590079e-01, nan, nan, 7.82491408e-01, nan, nan, nan, 8.60477814e-01, nan, 5.00634531e-01, 8.18880892e-01, 5.97677942e-01, nan, nan, 1.91412940e-02, 9.78765522e-01, 8.78540106e-02, nan, nan, 8.24369558e-03], [ nan, 5.01774642e-03, 5.92178720e-01, 7.89994948e-01, nan, 4.98514063e-01, 2.16422230e-01, 7.91867610e-01, 7.15053966e-03, 8.63272053e-01, 9.11725089e-01, 1.83527843e-01, 2.36540540e-01, nan, nan, nan, nan, 9.34294400e-01, 1.18908411e-01, nan, 8.68103188e-01, 7.09196634e-01, nan, 5.83445817e-01, 5.41327763e-01], [ nan, 6.70763284e-01, 9.61441011e-01, 2.31248805e-04, nan, 8.35638470e-01, nan, 9.64961362e-01, 1.00634449e-01, 4.78048031e-01, 9.94694956e-01, nan, nan, 4.10860037e-02, 9.20578423e-02, 6.45746396e-01, nan, 4.06016701e-01, nan, nan, 2.67576079e-01, 9.85608326e-02, 9.80963462e-02, nan, 4.71808654e-01], [ nan, nan, 9.93840007e-01, 3.10356581e-01, 5.98019326e-01, 4.07848162e-02, nan, 1.89758259e-01, nan, 3.26953718e-02, 8.33773468e-01, 2.18068173e-01, 9.71635945e-01, 6.97629596e-01, nan, 5.43218940e-01, 8.38715773e-01, 7.89581904e-01, nan, 9.82768930e-01, nan, 1.42405017e-01, 6.84192466e-01, 9.13087529e-01, 4.10372819e-01], [ nan, 1.09224002e-01, 8.22652224e-02, 8.31753192e-01, 5.01107142e-01, nan, 4.80554798e-01, nan, nan, 8.29308343e-01, 4.85519315e-01, 
2.95649456e-01, 7.37892607e-01, 1.22655052e-03, 8.31416149e-03, 9.93992701e-01, 3.45354364e-03, nan, 6.54093859e-01, 9.56963581e-01, 9.98170167e-01, nan, 3.78547791e-02, 9.96764161e-01, 3.27943811e-01], [ 2.41484741e-01, 3.38728178e-01, nan, nan, 9.41379724e-01, 1.84962400e-03, nan, 8.71942441e-01, 1.08504119e-01, 1.23407779e-01, nan, nan, 8.54598648e-01, nan, 2.73573455e-01, 3.57240372e-01, nan, 1.51749962e-01, 9.99346273e-01, 9.36074707e-01, 2.81528536e-02, nan, nan, nan, 4.21688295e-01], [ nan, 6.73983724e-01, nan, nan, 1.89601563e-01, 5.82302216e-01, nan, 9.59248770e-01, 2.46486322e-02, 4.75795441e-01, 5.26710465e-01, nan, nan, nan, 3.38850404e-01, 2.34728302e-03, 7.99547395e-01, 6.21292883e-01, 9.82193739e-01, 2.04466328e-02, 1.17947193e-02, nan, 6.52980209e-01, nan, 8.05175861e-04], [ nan, 8.24591978e-03, 3.97266578e-01, nan, nan, 6.19962741e-01, nan, 9.16792867e-02, 4.14137429e-01, 8.00321941e-01, 7.88816582e-01, nan, 7.89804713e-04, 1.16922617e-01, 5.95642246e-01, 1.06832221e-01, 2.24981885e-02, 9.56742800e-01, 9.92740258e-01, 9.74729014e-01, 6.09302612e-01, nan, nan, 9.31485985e-01, nan]])
# Wrap the matrix in a labelled frame: gene names as the row index and
# sample IDs as the column labels.
df = pd.DataFrame(data=data, index=index, columns=columns)
df
A_1 | A_2 | A_3 | A_4 | A_5 | A_6 | A_7 | B_1 | B_2 | B_3 | B_4 | B_5 | B_6 | B_7 | B_8 | B_9 | B_10 | C_1 | C_2 | C_3 | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
gene_1 | 0.698100 | 0.999981 | NaN | NaN | NaN | 0.551770 | 0.365547 | 0.754562 | 0.681444 | 0.117461 | 0.060021 | NaN | 0.534260 | 0.425745 | 0.056089 | 0.939695 | 0.415435 | NaN | NaN | 0.167989 | ... |
gene_2 | 0.794774 | NaN | NaN | 0.732825 | NaN | NaN | 0.977079 | NaN | 0.614201 | 0.412808 | NaN | 0.061615 | NaN | NaN | 0.117164 | NaN | 0.511312 | 0.018970 | 0.892791 | 0.034109 | ... |
gene_3 | 0.693554 | 0.048642 | 0.614129 | 0.495649 | 0.194666 | 0.413590 | NaN | NaN | 0.782491 | NaN | NaN | NaN | 0.860478 | NaN | 0.500635 | 0.818881 | 0.597678 | NaN | NaN | 0.019141 | ... |
gene_4 | NaN | 0.005018 | 0.592179 | 0.789995 | NaN | 0.498514 | 0.216422 | 0.791868 | 0.007151 | 0.863272 | 0.911725 | 0.183528 | 0.236541 | NaN | NaN | NaN | NaN | 0.934294 | 0.118908 | NaN | ... |
gene_5 | NaN | 0.670763 | 0.961441 | 0.000231 | NaN | 0.835638 | NaN | 0.964961 | 0.100634 | 0.478048 | 0.994695 | NaN | NaN | 0.041086 | 0.092058 | 0.645746 | NaN | 0.406017 | NaN | NaN | ... |
gene_6 | NaN | NaN | 0.993840 | 0.310357 | 0.598019 | 0.040785 | NaN | 0.189758 | NaN | 0.032695 | 0.833773 | 0.218068 | 0.971636 | 0.697630 | NaN | 0.543219 | 0.838716 | 0.789582 | NaN | 0.982769 | ... |
gene_7 | NaN | 0.109224 | 0.082265 | 0.831753 | 0.501107 | NaN | 0.480555 | NaN | NaN | 0.829308 | 0.485519 | 0.295649 | 0.737893 | 0.001227 | 0.008314 | 0.993993 | 0.003454 | NaN | 0.654094 | 0.956964 | ... |
gene_8 | 0.241485 | 0.338728 | NaN | NaN | 0.941380 | 0.001850 | NaN | 0.871942 | 0.108504 | 0.123408 | NaN | NaN | 0.854599 | NaN | 0.273573 | 0.357240 | NaN | 0.151750 | 0.999346 | 0.936075 | ... |
gene_9 | NaN | 0.673984 | NaN | NaN | 0.189602 | 0.582302 | NaN | 0.959249 | 0.024649 | 0.475795 | 0.526710 | NaN | NaN | NaN | 0.338850 | 0.002347 | 0.799547 | 0.621293 | 0.982194 | 0.020447 | ... |
gene_10 | NaN | 0.008246 | 0.397267 | NaN | NaN | 0.619963 | NaN | 0.091679 | 0.414137 | 0.800322 | 0.788817 | NaN | 0.000790 | 0.116923 | 0.595642 | 0.106832 | 0.022498 | 0.956743 | 0.992740 | 0.974729 | ... |
10 rows × 25 columns
Map each sample ID to its cell type, using the long name that corresponds to the ID's leading letter.
# Long display name for each single-letter cell-type prefix.
celltype_to_long_name = {
    'A': 'Abracadabra',
    'B': 'Broccoli',
    'C': 'Carrot',
}
# The first character of a sample ID is its cell-type letter; look up the long
# name for every column of the frame.
sample_id_to_celltype = {
    sample_id: celltype_to_long_name[sample_id[0]]
    for sample_id in df.columns
}
# Try to plot the gene_8 row as violins grouped by cell type, passing the
# sample-ID -> cell-type dict as `groupby`. With the seaborn/pandas versions
# used in this notebook the call raises the AttributeError shown in the
# traceback that follows.
sns.violinplot(df.ix['gene_8'], groupby=sample_id_to_celltype)
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-55-92e529cfb4a6> in <module>() ----> 1 sns.violinplot(df.ix['gene_8'], groupby=sample_id_to_celltype) /Users/olga/workspace-git/seaborn/seaborn/distributions.py in violinplot(vals, groupby, inner, color, positions, names, order, kernel, bw, widths, alpha, join_rm, gridsize, cut, inner_kws, ax, **kwargs) 262 names = np.sort(pd.unique(groupby)) 263 ylabel = vals.name --> 264 grouped_vals = pd.groupby(vals, groupby).values 265 if order is not None: 266 grouped_vals = grouped_vals[order] /Users/olga/workspace-git/pandas/pandas/core/groupby.pyc in __getattr__(self, attr) 260 261 if hasattr(self.obj, attr) and attr != '_cache': --> 262 return self._make_wrapper(attr) 263 264 raise AttributeError("%r object has no attribute %r" % /Users/olga/workspace-git/pandas/pandas/core/groupby.pyc in _make_wrapper(self, name) 275 "using the 'apply' method".format(kind, name, 276 type(self).__name__)) --> 277 raise AttributeError(msg) 278 279 f = getattr(self.obj, name) AttributeError: Cannot access attribute 'values' of 'SeriesGroupBy' objects, try using the 'apply' method
# Workaround: group the gene_8 values by cell type by hand and pass the
# per-group arrays to seaborn as a plain list instead of using `groupby=`.
#
# `.loc` is the label-based indexer; `.ix` is deprecated and has been removed
# from current pandas releases.
s = df.loc['gene_8']
# Re-key the row by (cell type, sample ID) so it can be grouped on the
# 'celltype' level. The first character of a sample ID is its cell-type letter.
new_index = pd.MultiIndex.from_tuples(
    [(celltype_to_long_name[v[0]], v) for v in s.index],
    names=['celltype', 'sample_id'])
s.index = new_index
grouped = s.groupby(level='celltype')
# Drop the NaNs once per group, then build one array of valid values per cell
# type. A group with no valid values gets the placeholder [-1, -2], which lies
# outside the (0, 1) y-limits applied below.
# NOTE: this rebinds `data`, which previously held the raw value matrix.
valid_groups = (v.dropna() for _, v in grouped)
data = [g.values if g.shape[0] > 0 else [-1, -2] for g in valid_groups]
ax = sns.violin(data, names=s.index.levels[0], bw=0.05)
# Clip the y-axis to the unit interval.
ax.set_ylim(0, 1)
(0, 1)