%matplotlib inline
import json
import pandas as pd
import numpy as np
import requests
import matplotlib.pyplot as plt
import operator
import math
from pattern import web
from fnmatch import fnmatch
# set some nicer defaults for matplotlib
from matplotlib import rcParams
# Dark2 palette from colorbrewer2.org.
# Every entry is an (R, G, B) triplet of floats between 0 and 1.
dark2_colors = [
    (0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
    (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
    (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
    (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
    (0.4, 0.6509803921568628, 0.11764705882352941),
    (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
    (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
    (0.4, 0.4, 0.4),
]
# Plot defaults as (rcParams key, value) pairs, applied one by one in the
# order listed so each assignment goes through rcParams item assignment.
for _rc_key, _rc_value in (
        ('figure.figsize', (10, 6)),
        ('figure.dpi', 150),
        ('axes.color_cycle', dark2_colors),
        ('lines.linewidth', 2),
        ('axes.grid', False),
        ('axes.facecolor', 'white'),
        ('font.size', 14),
        ('patch.edgecolor', 'none'),
        ('font.family', 'sans-serif'),
        ('font.sans-serif', 'Helvetica'),
):
    rcParams[_rc_key] = _rc_value
# do not leave the loop variables behind in the notebook namespace
del _rc_key, _rc_value
def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Strip plot borders and axis ticks, keeping only the requested sides.

    Each of the top/right/left/bottom keywords says whether that side's
    border (spine) is drawn and whether ticks are placed on that side.
    When axes is not given, the current axes from plt.gca() is used.
    """
    ax = axes or plt.gca()

    # show or hide each spine according to its keyword
    for side, visible in (('top', top), ('right', right),
                          ('left', left), ('bottom', bottom)):
        ax.spines[side].set_visible(visible)

    # start from no ticks at all ...
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')

    # ... then put ticks back on the sides that were requested
    if top:
        ax.xaxis.tick_top()
    if bottom:
        ax.xaxis.tick_bottom()
    if left:
        ax.yaxis.tick_left()
    if right:
        ax.yaxis.tick_right()
# taken from http://en.wikipedia.org/wiki/List_of_countries_by_population
population = {
'China' : 1361750000,
'United States' : 317307000,
'Japan' : 127290000,
'Switzerland' : 8112200,
'Germany' : 80548000,
'France' : 65834000,
'Italy' : 59859996,
'United Kingdom': 63705000,
'Australia' : 23320497,
'Spain' : 46704314,
'Russia' : 143600000,
'India' : 1237980000,
'Saudi Arabia' : 29994272,
'Sweden' : 9633490,
'Norway' : 5096300,
'Canada' : 35295770,
'Korea, South' : 50219669,
'Poland' : 38502396,
'Brazil' : 201032714,
'Hong Kong' : 7184000,
'Netherlands' : 16811000,
'Finland' : 5448025,
'Israel' : 8107000,
'Taiwan' : 23367320,
'Denmark' : 5623501,
'Austria' : 8501502,
'Belgium' : 11180320,
'Ireland' : 4593100
}
css tweaks in this cell
def get_top500_xml(year, month):
    """
    Download one TOP500 list and return it as XML text.

    Parameters
    -----------
    year: int
        year of the list, from 1993 through 2013 inclusive
    month: str
        month of the list, either '06' or '11'

    Returns
    --------
    the body of the HTTP response, as text

    Raises
    -------
    ValueError
        if year or month is not one of the accepted values
        (ValueError is a subclass of Exception, so existing
        `except Exception` handlers still catch it)
    requests.HTTPError
        if the server answers with an error status
    """
    # validate before touching the network
    if year not in range(1993, 2014):
        raise ValueError("Invalid year.")
    if month not in ('06', '11'):
        raise ValueError("Invalid month.")

    url = "http://s.top500.org/static/lists/xml/TOP500_{}{}_all.xml".format(year, month)
    response = requests.get(url)
    # fail loudly on a 4xx/5xx answer instead of returning an error page
    # that would later be parsed as if it were a TOP500 list
    response.raise_for_status()
    return response.text
def top500_to_df(xml):
    """
    Parse TOP500 XML text into a DataFrame indexed by rank.

    The columns are 'manufacturer', 'r-max mflops', 'r-max tflops',
    'country' and 'year'. The value of each top500:r-max element is
    multiplied by 1000 for the 'r-max mflops' column and divided by 1000
    for the 'r-max tflops' column.
    """
    dom = web.Element(xml)

    def contents(tag):
        # text content of every element with the given tag, in document order
        return [node.content for node in dom.by_tag(tag)]

    ranks = [int(text) for text in contents('top500:rank')]
    manufacturers = contents('top500:manufacturer')
    # parse r-max once and derive both scaled columns from it
    r_max_values = [float(text) for text in contents('top500:r-max')]
    r_maxs_mflops = [value*1000 for value in r_max_values]
    r_maxs_tflops = [value/1000 for value in r_max_values]
    countries = contents('top500:country')
    years = [int(text) for text in contents('top500:year')]

    columns = {
        'rank': ranks,
        'manufacturer': manufacturers,
        'r-max mflops': r_maxs_mflops,
        'r-max tflops': r_maxs_tflops,
        'country': countries,
        'year': years
    }
    return pd.DataFrame(columns).set_index('rank')
Practice with DataFrames
# download the November 2013 list and parse it into a DataFrame indexed by rank
nov_2013_xml = get_top500_xml(2013, '11')
nov_2013 = top500_to_df(nov_2013_xml)
# preview the first rows of the parsed list
nov_2013.head()
| rank | country | manufacturer | r-max mflops | r-max tflops | year |
|---|---|---|---|---|---|
| 1 | China | NUDT | 33862700000 | 33862.700 | 2013 |
| 2 | United States | Cray Inc. | 17590000000 | 17590.000 | 2012 |
| 3 | United States | IBM | 17173224000 | 17173.224 | 2011 |
| 4 | Japan | Fujitsu | 10510000000 | 10510.000 | 2011 |
| 5 | United States | IBM | 8586612000 | 8586.612 | 2012 |
The primary goal is to visualize the amount of computing power in teraFLOPS per country, and in megaFLOPS per capita by country.
# Per-country totals, keyed by country name and rounded to 4 decimals.
# NOTE(review): .head() keeps only the first five rows of each country's
# slice, so the sums below cover at most five systems per country --
# confirm this is intended rather than a leftover from exploration.
tflops_per_country = {}
mflops_per_capita = {}
for country in nov_2013['country'].unique():
    is_country = nov_2013['country'] == country
    country_rows = nov_2013[is_country].head()
    total_tflops = sum(country_rows['r-max tflops'])
    # the population lookup raises KeyError for a country missing from the table
    per_capita_mflops = sum(country_rows['r-max mflops'])/population[country]
    tflops_per_country[country] = round(total_tflops, 4)
    mflops_per_capita[country] = round(per_capita_mflops, 4)
# spot-check the aggregated TFLOPS figures for China and the United States
print tflops_per_country['China'], tflops_per_country['United States']
39267.3 52811.252
# spot-check the MFLOPS-per-capita figures for China and the United States
print mflops_per_capita['China'], mflops_per_capita['United States']
28.8359 166.4358
def hbar(title, item_count_dict, bar_color=dark2_colors[0], spacing=50, subplot=111):
    """
    Draw a horizontal bar graph of item counts, largest count at the top.

    Parameters
    -----------
    title: str
        the graph's title
    item_count_dict: dict
        maps each item (used as the bar's label) to its count
    bar_color: RGB triple (default: dark2_colors[0])
        the color of the bars
    spacing: number (default: 50)
        the horizontal offset between the end of a bar and its count label
    subplot: int (default: 111)
        three-digit subplot code passed to plt.subplot; a new figure is
        created first whenever its third digit is 1
    """
    # order the (item, count) pairs by count, largest first
    ranked = sorted(item_count_dict.items(), key=operator.itemgetter(1), reverse=True)
    items = [pair[0] for pair in ranked]
    counts = [pair[1] for pair in ranked]
    pos = np.arange(len(items))

    # the first panel of a layout opens a fresh figure whose height
    # depends on the number of bars
    starts_new_figure = str(subplot)[2] == '1'
    if starts_new_figure:
        plt.figure(figsize=(10, len(items)/3))
    plt.subplot(subplot)
    plt.title(title)
    plt.barh(pos, counts, color=bar_color)

    # write each count just past the end of its bar
    for p, count in zip(pos, counts):
        plt.annotate(str(count), xy=(count + spacing, p + .5), va='center')

    # label the bars with the item names; keep the x tick positions but
    # blank out their labels
    plt.yticks(pos + .5, items)
    xt = plt.xticks()[0]
    plt.xticks(xt, [' '] * len(xt))

    # minimize chartjunk
    remove_border(left=False, bottom=False)
    plt.grid(axis='x', color='white', linestyle='-')

    # reversed y limits place position 0 (the largest count) at the top
    plt.ylim(pos.max() + 1, pos.min())
    plt.tight_layout()
# natural log of each country's TFLOPS total, rounded to 4 decimals
log_tflops_per_country = {
    country: round(math.log(tflops), 4)
    for country, tflops in tflops_per_country.iteritems()
}

# raw totals in subplot 121, log-scaled totals in subplot 122
hbar("TFLOPS per Country", tflops_per_country, spacing=1500, subplot=121)
hbar("Log TFLOPS per Country", log_tflops_per_country,
     bar_color=dark2_colors[1], spacing=0.5, subplot=122)

# re-run the layout, then widen the gap between the two subplots
plt.tight_layout()
plt.subplots_adjust(wspace=1.2)
# natural log of each country's MFLOPS per capita, rounded to 4 decimals
log_mflops_per_capita = {
    country: round(math.log(mflops), 4)
    for country, mflops in mflops_per_capita.iteritems()
}

# raw per-capita figures in subplot 121, log-scaled figures in subplot 122
hbar("MFLOPS per capita, by Country", mflops_per_capita,
     bar_color=dark2_colors[2], spacing=25, subplot=121)
hbar("Log MFLOPS per capita, by Country", log_mflops_per_capita,
     bar_color=dark2_colors[3], spacing=0.15, subplot=122)

# re-run the layout, then widen the gap between the two subplots
plt.tight_layout()
plt.subplots_adjust(wspace=1.2)