%matplotlib inline import json import pandas as pd import numpy as np import requests import matplotlib.pyplot as plt import operator import math from pattern import web from fnmatch import fnmatch # set some nicer defaults for matplotlib from matplotlib import rcParams #these colors come from colorbrewer2.org. Each is an RGB triplet dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667), (0.8509803921568627, 0.37254901960784315, 0.00784313725490196), (0.4588235294117647, 0.4392156862745098, 0.7019607843137254), (0.9058823529411765, 0.1607843137254902, 0.5411764705882353), (0.4, 0.6509803921568628, 0.11764705882352941), (0.9019607843137255, 0.6705882352941176, 0.00784313725490196), (0.6509803921568628, 0.4627450980392157, 0.11372549019607843), (0.4, 0.4, 0.4)] rcParams['figure.figsize'] = (10, 6) rcParams['figure.dpi'] = 150 rcParams['axes.color_cycle'] = dark2_colors rcParams['lines.linewidth'] = 2 rcParams['axes.grid'] = False rcParams['axes.facecolor'] = 'white' rcParams['font.size'] = 14 rcParams['patch.edgecolor'] = 'none' rcParams['font.family'] = 'sans-serif' rcParams['font.sans-serif'] = 'Helvetica' def remove_border(axes=None, top=False, right=False, left=True, bottom=True): """ Minimize chartjunk by stripping out unnecessary plot borders and axis ticks The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn """ ax = axes or plt.gca() ax.spines['top'].set_visible(top) ax.spines['right'].set_visible(right) ax.spines['left'].set_visible(left) ax.spines['bottom'].set_visible(bottom) #turn off all ticks ax.yaxis.set_ticks_position('none') ax.xaxis.set_ticks_position('none') #now re-enable visibles if top: ax.xaxis.tick_top() if bottom: ax.xaxis.tick_bottom() if left: ax.yaxis.tick_left() if right: ax.yaxis.tick_right() # taken from http://en.wikipedia.org/wiki/List_of_countries_by_population population = { 'China' : 1361750000, 'United States' : 317307000, 'Japan' : 127290000, 'Switzerland' : 8112200, 'Germany' : 80548000, 'France' : 65834000, 'Italy' : 59859996, 'United Kingdom': 63705000, 'Australia' : 23320497, 'Spain' : 46704314, 'Russia' : 143600000, 'India' : 1237980000, 'Saudi Arabia' : 29994272, 'Sweden' : 9633490, 'Norway' : 5096300, 'Canada' : 35295770, 'Korea, South' : 50219669, 'Poland' : 38502396, 'Brazil' : 201032714, 'Hong Kong' : 7184000, 'Netherlands' : 16811000, 'Finland' : 5448025, 'Israel' : 8107000, 'Taiwan' : 23367320, 'Denmark' : 5623501, 'Austria' : 8501502, 'Belgium' : 11180320, 'Ireland' : 4593100 } def get_top500_xml(year, month): "Year is an int, month is a string, either '06' or '11.'" if year not in range(1993, 2014): raise Exception("Invalid year.") if month != '06' and month != '11': raise Exception("Invalid month.") xml = requests.get("http://s.top500.org/static/lists/xml/TOP500_{}{}_all.xml".format(year, month)).text return xml def top500_to_df(xml): dom = web.Element(xml) ranks = [int(rank.content) for rank in dom.by_tag('top500:rank')] manufacturers = [manufacturer.content for manufacturer in dom.by_tag('top500:manufacturer')] r_maxs_mflops = [float(r_max.content)*1000 for r_max in dom.by_tag('top500:r-max')] r_maxs_tflops = [float(r_max.content)/1000 for r_max in dom.by_tag('top500:r-max')] countries = [country.content for country in dom.by_tag('top500:country')] years = [int(year.content) for year in dom.by_tag('top500:year')] df = pd.DataFrame({ 'rank':ranks, 'manufacturer': manufacturers, 'r-max mflops': r_maxs_mflops, 'r-max tflops': r_maxs_tflops, 'country': countries, 'year': years }) df = df.set_index('rank') return df nov_2013_xml = get_top500_xml(2013, '11') nov_2013 = top500_to_df(nov_2013_xml) nov_2013.head() tflops_per_country = {} mflops_per_capita = {} for country in nov_2013['country'].unique(): country_df = nov_2013[nov_2013['country'] == country].head() country_tflops = sum(country_df['r-max tflops']) country_per_capita_mflops = sum(country_df['r-max mflops'])/population[country] tflops_per_country[country] = round(country_tflops, 4) mflops_per_capita[country] = round(country_per_capita_mflops, 4) print tflops_per_country['China'], tflops_per_country['United States'] print mflops_per_capita['China'], mflops_per_capita['United States'] def hbar(title, item_count_dict, bar_color=dark2_colors[0], spacing=50, subplot=111): """ Visualize sorted item and count data with a horizontal bar graph. Parameters ----------- title: str the graph's title item_count_tuples: lst of tuples each tuple contains the word and its count bar_color: RGB triple (default: dark2_colors[0]) the color of the bars spacing: int (default: 50) the spacing between the bars and their count labels """ item_count_tuples = item_count_dict.items() get_count = operator.itemgetter(1) item_count_tuples = sorted(item_count_tuples, key=get_count, reverse=True) items, counts = [], [] for item, count in item_count_tuples: items.append(item) counts.append(count) # plot pos = np.arange(len(items)) if str(subplot)[2] == '1': plt.figure(figsize=(10, len(items)/3)) plt.subplot(subplot) plt.title(title) plt.barh(pos, counts, color=bar_color) # add the numbers to the side of each bar for p, item, count in zip(pos, items, counts): plt.annotate(str(count), xy=(count + spacing, p + .5), va='center') # customize ticks ticks = plt.yticks(pos + .5, items) xt = plt.xticks()[0] plt.xticks(xt, [' '] * len(xt)) # minimize chartjunk remove_border(left=False, bottom=False) plt.grid(axis = 'x', color ='white', linestyle='-') # set plot limits plt.ylim(pos.max() + 1, pos.min()) plt.tight_layout() log_tflops_per_country = {} for country, tflops in tflops_per_country.iteritems(): log_tflops_per_country[country] = round(math.log(tflops), 4) hbar("TFLOPS per Country", tflops_per_country, subplot=121, spacing=1500) hbar("Log TFLOPS per Country", log_tflops_per_country, subplot=122, bar_color=dark2_colors[1], spacing=0.5) plt.tight_layout() plt.subplots_adjust(wspace=1.2) log_mflops_per_capita = {} for country, mflops in mflops_per_capita.iteritems(): log_mflops_per_capita[country] = round(math.log(mflops), 4) hbar("MFLOPS per capita, by Country", mflops_per_capita, subplot=121, bar_color=dark2_colors[2], spacing=25) hbar("Log MFLOPS per capita, by Country", log_mflops_per_capita, subplot=122, bar_color=dark2_colors[3], spacing=0.15) plt.tight_layout() plt.subplots_adjust(wspace=1.2)