In [133]:
%matplotlib inline

import json
import pandas as pd
import numpy as np
import requests
import matplotlib.pyplot as plt
import operator
import math

from pattern import web
from fnmatch import fnmatch

# set some nicer defaults for matplotlib
from matplotlib import rcParams

#these colors come from colorbrewer2.org. Each is an RGB triplet
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
                (0.4, 0.4, 0.4)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = False
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'none'
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = 'Helvetica'

def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecessary plot borders and axis ticks
    
    The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn
    """
    ax = axes or plt.gca()
    ax.spines['top'].set_visible(top)
    ax.spines['right'].set_visible(right)
    ax.spines['left'].set_visible(left)
    ax.spines['bottom'].set_visible(bottom)
    
    #turn off all ticks
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')
    
    #now re-enable visibles
    if top:
        ax.xaxis.tick_top()
    if bottom:
        ax.xaxis.tick_bottom()
    if left:
        ax.yaxis.tick_left()
    if right:
        ax.yaxis.tick_right()
In [82]:
# taken from http://en.wikipedia.org/wiki/List_of_countries_by_population
population = {
    'China'         : 1361750000,
    'United States' : 317307000,
    'Japan'         : 127290000,
    'Switzerland'   : 8112200, 
    'Germany'       : 80548000,
    'France'        : 65834000,
    'Italy'         : 59859996, 
    'United Kingdom': 63705000,
    'Australia'     : 23320497,
    'Spain'         : 46704314,
    'Russia'        : 143600000, 
    'India'         : 1237980000, 
    'Saudi Arabia'  : 29994272, 
    'Sweden'        : 9633490,
    'Norway'        : 5096300,
    'Canada'        : 35295770, 
    'Korea, South'  : 50219669,
    'Poland'        : 38502396, 
    'Brazil'        : 201032714, 
    'Hong Kong'     : 7184000,
    'Netherlands'   : 16811000, 
    'Finland'       : 5448025, 
    'Israel'        : 8107000, 
    'Taiwan'        : 23367320, 
    'Denmark'       : 5623501,
    'Austria'       : 8501502, 
    'Belgium'       : 11180320, 
    'Ireland'       : 4593100
}

css tweaks in this cell

In [7]:
def get_top500_xml(year, month):
    "Year is an int, month is a string, either '06' or '11.'"
    if year not in range(1993, 2014):
        raise Exception("Invalid year.")
    if month != '06' and month != '11':
        raise Exception("Invalid month.")
    
    xml = requests.get("http://s.top500.org/static/lists/xml/TOP500_{}{}_all.xml".format(year, month)).text
    return xml
In [108]:
def top500_to_df(xml):
    dom = web.Element(xml)
    
    ranks = [int(rank.content) for rank in dom.by_tag('top500:rank')]
    manufacturers = [manufacturer.content for manufacturer in dom.by_tag('top500:manufacturer')]
    r_maxs_mflops = [float(r_max.content)*1000 for r_max in dom.by_tag('top500:r-max')]
    r_maxs_tflops = [float(r_max.content)/1000 for r_max in dom.by_tag('top500:r-max')]
    countries = [country.content for country in dom.by_tag('top500:country')]
    years = [int(year.content) for year in dom.by_tag('top500:year')]

    df = pd.DataFrame({
        'rank':ranks,
        'manufacturer': manufacturers,
        'r-max mflops': r_maxs_mflops,
        'r-max tflops': r_maxs_tflops,
        'country': countries,
        'year': years
    })
    
    df = df.set_index('rank')
    
    return df

Practice with DataFrames

In [103]:
nov_2013_xml = get_top500_xml(2013, '11')
In [109]:
nov_2013 = top500_to_df(nov_2013_xml)
nov_2013.head()
Out[109]:
country manufacturer r-max mflops r-max tflops year
rank
1 China NUDT 33862700000 33862.700 2013
2 United States Cray Inc. 17590000000 17590.000 2012
3 United States IBM 17173224000 17173.224 2011
4 Japan Fujitsu 10510000000 10510.000 2011
5 United States IBM 8586612000 8586.612 2012

Primary goal is to visualize amount of computing power in teraFLOPS per country, and in megaFLOPS per capita by country.

In [181]:
tflops_per_country = {}
mflops_per_capita = {}

for country in nov_2013['country'].unique():
    country_df = nov_2013[nov_2013['country'] == country].head()
    country_tflops = sum(country_df['r-max tflops'])
    country_per_capita_mflops = sum(country_df['r-max mflops'])/population[country]
    
    tflops_per_country[country] = round(country_tflops, 4)
    mflops_per_capita[country] = round(country_per_capita_mflops, 4)
    
In [177]:
print tflops_per_country['China'], tflops_per_country['United States']
39267.3 52811.252
In [182]:
print mflops_per_capita['China'], mflops_per_capita['United States']
28.8359 166.4358
In [120]:
def hbar(title, item_count_dict, bar_color=dark2_colors[0], spacing=50, subplot=111):
    """
    Visualize sorted item and count data with a horizontal bar graph.
    
    Parameters
    -----------
    title: str
      the graph's title
        
    item_count_tuples: lst of tuples
      each tuple contains the word and its count
      
    bar_color: RGB triple (default: dark2_colors[0])
      the color of the bars
    
    spacing: int (default: 50)
      the spacing between the bars and their count labels
    """

    item_count_tuples = item_count_dict.items()
    get_count = operator.itemgetter(1)
    item_count_tuples = sorted(item_count_tuples, key=get_count, reverse=True)
    
    items, counts = [], []
    for item, count in item_count_tuples:
        items.append(item)
        counts.append(count)
    
    # plot
    pos = np.arange(len(items))
    
    if str(subplot)[2] == '1':
        plt.figure(figsize=(10, len(items)/3))
    
    plt.subplot(subplot)
    
    plt.title(title)
    plt.barh(pos, counts, color=bar_color)
    
    # add the numbers to the side of each bar
    for p, item, count in zip(pos, items, counts):
            plt.annotate(str(count), xy=(count + spacing, p + .5), va='center')
    
    # customize ticks
    ticks = plt.yticks(pos + .5, items)
    xt = plt.xticks()[0]
    plt.xticks(xt, [' '] * len(xt))
    
    # minimize chartjunk
    remove_border(left=False, bottom=False)
    plt.grid(axis = 'x', color ='white', linestyle='-')
    
    # set plot limits
    plt.ylim(pos.max() + 1, pos.min())
    
    plt.tight_layout()
In [174]:
log_tflops_per_country = {}

for country, tflops in tflops_per_country.iteritems():
    log_tflops_per_country[country] = round(math.log(tflops), 4)
In [179]:
hbar("TFLOPS per Country", tflops_per_country, subplot=121, spacing=1500)
hbar("Log TFLOPS per Country", log_tflops_per_country, subplot=122, bar_color=dark2_colors[1], spacing=0.5)
plt.tight_layout()
plt.subplots_adjust(wspace=1.2)
In [172]:
log_mflops_per_capita = {}

for country, mflops in mflops_per_capita.iteritems():
    log_mflops_per_capita[country] = round(math.log(mflops), 4)
In [194]:
hbar("MFLOPS per capita, by Country", mflops_per_capita, subplot=121, bar_color=dark2_colors[2], spacing=25)
hbar("Log MFLOPS per capita, by Country", log_mflops_per_capita, subplot=122, bar_color=dark2_colors[3], spacing=0.15)
plt.tight_layout()
plt.subplots_adjust(wspace=1.2)