%load_ext autoreload
%autoreload 2
The autoreload extension is already loaded. To reload it, use: %reload_ext autoreload
import pandas as pd
import seaborn as sns
# The stats are read from a local backup copy of ppg2008.csv; the commented-out
# call below reads ppg2008.csv from flowingdata.com instead.
# data = pd.read_csv('http://datasets.flowingdata.com/ppg2008.csv', index_col=0)
data = pd.read_csv(
    '/Users/olga/Dropbox/ipython/seaborn/ppg2008.csv.bak',
    index_col=0)
# Remove leading/trailing whitespace from every row label.
data.index = data.index.map(lambda name: name.strip())
# Readable column names, listed in the same order as the CSV columns.
# label source:https://en.wikipedia.org/wiki/Basketball_statistics
labels = [
    'Games',
    'Minutes',
    'Points',
    'Field goals made',
    'Field goal attempts',
    'Field goal percentage',
    'Free throws made',
    'Free throws attempts',
    'Free throws percentage',
    'Three-pointers made',
    'Three-point attempt',
    'Three-point percentage',
    'Offensive rebounds',
    'Defensive rebounds',
    'Total rebounds',
    'Assists',
    'Steals',
    'Blocks',
    'Turnover',
    'Personal foul',
]
data.columns = labels
# Plot the stats as loaded (no normalization has been applied yet) and keep the
# call's two return values as the row and column dendrograms.
row_dendrogram, col_dendrogram = sns.clusteredheatmap(data)
data_normalized = data
# Standardize each stat (column) so different stats are comparable: subtract
# the column mean and divide by the column standard deviation, i.e. turn every
# column into Z-scores. (Dividing by the variance would not give Z-scores.)
data_normalized = (data_normalized - data_normalized.mean()) / data_normalized.std()
# Rescale each centered column by its range (max - min). Every column then
# spans exactly 1 unit around its zero mean, so all values fall between -1
# and 1. This division cancels the scale factor applied above, so the final
# numbers do not depend on it beyond floating-point rounding.
data_normalized = data_normalized / (data_normalized.max() - data_normalized.min())
# Transpose so that stats become the rows and players become the columns.
data_normalized = data_normalized.T
# A trailing semicolon suppresses the notebook's display of the returned
# row_dendrogram and col_dendrogram.
sns.clusteredheatmap(data_normalized);
import matplotlib.pyplot as plt
# Draw the heatmap again and save it to a PNG file.
sns.clusteredheatmap(data_normalized);
# NOTE(review): assumes clusteredheatmap leaves its figure as pyplot's current
# figure, so that gcf() returns the plot drawn above -- confirm.
fig = plt.gcf()
# bbox_inches='tight' is passed to savefig; presumably this keeps the
# dendrograms and tick labels from being cropped -- check the saved image.
fig.savefig('clusteredheatmap_bbox_tight.png', bbox_inches='tight')
# Reshape to long ("tidy") form: reset_index() moves the row labels into an
# 'index' column, and melt keeps that column as the identifier while turning
# every other column into a ('variable', 'value') pair.
tidy_df = pd.melt(data_normalized.reset_index(), id_vars='index')
# Preview the first rows of the tidy table.
tidy_df.head()
index | variable | value | |
---|---|---|---|
0 | Games | Dwyane Wade | 0.143158 |
1 | Minutes | Dwyane Wade | 0.233535 |
2 | Points | Dwyane Wade | 0.718308 |
3 | Field goals made | Dwyane Wade | 0.595714 |
4 | Field goal attempts | Dwyane Wade | 0.561296 |
5 rows × 3 columns
# Arguments for DataFrame.pivot that turn the tidy table back into a wide one:
# 'index' values label the rows, 'variable' values label the columns and
# 'value' fills the cells.
pivot_kws = {
    'index': 'index',
    'columns': 'variable',
    'values': 'value',
}
# Preview the re-pivoted table.
tidy_df.pivot(**pivot_kws).head()
variable | Al Harrington | Al Jefferson | Allen Iverson | Amare Stoudemire | Andre Iguodala | Antawn Jamison | Ben Gordon | Brandon Roy | Carmelo Anthony | Caron Butler | Chauncey Billups | Chris Bosh | Chris Paul | Corey Maggette | Danny Granger | David West | Deron Williams | Devin Harris | Dirk Nowitzki | Dwight Howard | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
index | |||||||||||||||||||||
Assists | -0.256042 | -0.235208 | 0.118958 | -0.193542 | 0.150208 | -0.203958 | -0.047708 | 0.129375 | -0.047708 | 0.046042 | 0.264792 | -0.141458 | 0.743958 | -0.214375 | -0.120625 | -0.162292 | 0.712708 | 0.316875 | -0.151875 | -0.256042 | ... |
Blocks | -0.106429 | 0.393571 | -0.177857 | 0.179286 | -0.070714 | -0.106429 | -0.106429 | -0.106429 | -0.070714 | -0.106429 | -0.142143 | 0.143571 | -0.177857 | -0.142143 | 0.286429 | 0.107857 | -0.106429 | -0.142143 | 0.072143 | 0.822143 | ... |
Defensive rebounds | 0.050130 | 0.387792 | -0.261558 | 0.180000 | 0.011169 | 0.257922 | -0.222597 | -0.144675 | 0.089091 | -0.014805 | -0.248571 | 0.348831 | 0.024156 | 0.011169 | -0.014805 | 0.244935 | -0.261558 | -0.209610 | 0.361818 | 0.660519 | ... |
Field goal attempts | 0.061296 | 0.329815 | -0.123889 | -0.170185 | -0.179444 | 0.172407 | 0.005741 | 0.089074 | 0.218704 | 0.024259 | -0.327593 | 0.042778 | 0.015000 | -0.327593 | 0.292778 | 0.098333 | -0.133148 | -0.077593 | 0.376111 | -0.327593 | ... |
Field goal percentage | -0.155276 | 0.136181 | -0.265829 | 0.347236 | 0.015578 | -0.009548 | -0.074874 | 0.050754 | -0.135176 | -0.084925 | -0.260804 | 0.085930 | 0.166332 | -0.044724 | -0.115075 | 0.010553 | 0.005528 | -0.160302 | 0.045729 | 0.513065 | ... |
5 rows × 50 columns
# Pass the tidy table together with the pivot keywords.
# NOTE(review): presumably clusteredheatmap pivots tidy_df with pivot_kws
# before plotting -- confirm against its implementation.
sns.clusteredheatmap(tidy_df, pivot_kws=pivot_kws);
# Add a title and set its font size.
sns.clusteredheatmap(data_normalized, title='2008 NBA Stats', title_fontsize=24);
# Set the figure size and the label size.
sns.clusteredheatmap(data_normalized, figsize=(10, 5), labelsize=10);
# Plot the un-normalized stats with color_scale='log'.
sns.clusteredheatmap(data, color_scale='log');
# Same, but with every 0 replaced by 0.0001 first (presumably because a log
# scale cannot represent 0). The original frame is passed as data_na_ok.
# NOTE(review): the exact role of data_na_ok is not visible here -- confirm.
sns.clusteredheatmap(data.replace(0, 0.0001), color_scale='log', data_na_ok=data);
# Wacky example of custom linkage method and metric
sns.clusteredheatmap(data_normalized, linkage_method='complete', metric='hamming');
import matplotlib as mpl
# Extra keywords supplied as pcolormesh_kws: line width, lower color limit
# (vmin) and the Greens colormap.
sns.clusteredheatmap(data_normalized, pcolormesh_kws={'linewidth': 0.1, 'vmin': 0, 'cmap': mpl.cm.Greens});
# Six colors from the 'Set2' palette; stat_to_label returns one of them for
# each stat, indexed 0 through 5.
colors = sns.color_palette('Set2', n_colors=6)
def stat_to_label(stat):
    """Return the side color of the category that a stat belongs to.

    Categories are tested in order and the first match wins: the exact names
    'Games', 'Minutes' and 'Points'; names starting with 'Field', 'Free' or
    'Three'; names containing 'rebounds'; and finally everything else.

    Parameters
    ----------
    stat : str
        Name of the stat to categorize.

    Returns
    -------
    The entry of the module-level ``colors`` sequence (position 0 through 5)
    assigned to the matching category.
    """
    # Plain membership test rather than building and intersecting two sets.
    if stat in ('Games', 'Minutes', 'Points'):
        return colors[0]
    # Positions 1-3 of ``colors`` belong to the prefix-based categories, in
    # this order.
    for position, prefix in enumerate(('Field', 'Free', 'Three'), 1):
        if stat.startswith(prefix):
            return colors[position]
    if 'rebounds' in stat:
        return colors[4]
    # Anything that matched none of the categories above.
    return colors[5]
# One side color per row label of data_normalized, chosen by stat_to_label.
stat_colors = data_normalized.index.map(stat_to_label)
# Replacement row labels: each original row label prefixed with 'relabel '.
row_labels = data_normalized.index.map(lambda x: 'relabel {}'.format(x))
# Rows receive the side colors and the replacement labels. For the columns,
# {'cluster': False} is passed (presumably leaving them unclustered --
# confirm against clusteredheatmap).
sns.clusteredheatmap(data_normalized, row_kws={'side_colors': stat_colors, 'label': row_labels},
col_kws={'cluster': False});
# Colorbar options supplied as colorbar_kws: orientation, label text and
# font size.
sns.clusteredheatmap(data_normalized, colorbar_kws={'orientation': 'vertical', 'label': 'Normalized stat', 'fontsize': 10});