In [54]:
%pylab inline
Populating the interactive namespace from numpy and matplotlib

This is inspired by Sumana's Colons: A Retrospective (http://www.harihareswara.net/sumana/2013/11/30/0)

The basic idea: take data directly from the source, parse it using python and make a pretty(-ish) graph with matplotlib.

In [1]:
# Step 1: Getting the data

import urllib
PulitzerHTML = urllib.urlopen("http://www.pulitzer.org/bycat/General-Nonfiction").read()
In [21]:
# This is what the basic format looks like
print PulitzerHTML[10198:10695]
<div class="citation-heading">
  <a name="winner_2009"></a><span class="year">2009</span> <span class="title">Douglas A. Blackmon</span> <span class="publication">Slavery by Another Name: The Re-Enslavement of Black Americans from the Civil War to World War II</span></div>

<div class="view-field view-data-field-citation-value">
  <p>A precise and eloquent work that examines a deliberate system of racial suppression and that rescues a multitude of atrocities from virtual obscurity.</p>
</div>
In [29]:
# Let's parse it using BeautifulSoup!
from BeautifulSoup import BeautifulSoup
soup = BeautifulSoup(PulitzerHTML)

winners = soup.findAll("div", attrs={'class': 'citation-heading'})
winners[0]
Out[29]:
<div class="citation-heading">
<a name="winner_2013"></a><span class="year">2013</span> <span class="title">Gilbert King</span> <span class="publication">Devil in the Grove: Thurgood Marshall, the Groveland Boys</span></div>
In [37]:
# The year can be determined using
year = int(winners[0].find('span', attrs={'class': 'year'}).text)
print year

# And the title can be determined using
title = winners[0].find('span', attrs={'class': 'publication'}).text
print title

# Finally, we can determine whether there is a colon in the title
hascolon = ":" in title
print hascolon
2013
Devil in the Grove: Thurgood Marshall, the Groveland Boys
True
In [46]:
# except this fails for 1979 and earlier:
year = int(winners[40].find('span', attrs={'class': 'year'}).text)
print year

# And the title can be determined using
title = winners[40].find('span', attrs={'class': 'publication'}).text
print title

# Finally, we can determine whether there is a colon in the title
hascolon = ":" in title
print hascolon
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-46-0da5dc330ec4> in <module>()
      4 
      5 # And the title can be determined using
----> 6 title = winners[40].find('span', attrs={'class': 'publication'}).text
      7 print title
      8 

AttributeError: 'NoneType' object has no attribute 'text'
1974
In [73]:
# This allows us to make a dictionary {2013: [True, ...], 2012: [...]} where the internal list represents,
# for each book in that year, whether there was a colon in the title.
# We also store the titles, as they will be useful later

from collections import defaultdict
colons_per_year = defaultdict(list)
titles = defaultdict(list)

for i,winner in enumerate(winners):
    year = int(winner.find('span', attrs={'class': 'year'}).text)
    
    if i == 40:
        # The Denial of Death by the late Ernest Becker has a slightly different format - work around it manually
        title = u'The Denial of Death'
    else:
        title = winner.find('span', attrs={'class': 'publication'}).text
    hascolon = ":" in title
    
    if i<5:
        print year, title, hascolon
    
    titles[year].append(title)
    colons_per_year[year].append(hascolon)
2013 Devil in the Grove: Thurgood Marshall, the Groveland Boys True
2012 The Swerve: How the World Became Modern True
2011 The Emperor of All Maladies: A Biography of Cancer True
2010 The Dead Hand: The Untold Story of the Cold War Arms Race and Its Dangerous Legacy True
2009 Slavery by Another Name: The Re-Enslavement of Black Americans from the Civil War to World War II True
In [74]:
# However, we cannot use data for before 1980, as the finalists are not known. Remove them
for key,value in colons_per_year.items():
    if key < 1980:
        del colons_per_year[key]
        del titles[key]
In [75]:
# Good, now we can summarize the data!
colon_fraction_per_year = {year: mean(colons) for year,colons in colons_per_year.items()}
In [77]:
# and we can get to plotting!
%pylab inline
figure()

barwidth = 0.7

for year,fraction in colon_fraction_per_year.items():
    percentage = 100*fraction
    bar(year-barwidth/2, percentage, width=barwidth, bottom=0, color='blue')
    bar(year-barwidth/2, 100-percentage, width=barwidth, bottom=percentage, color='red')

legend(["titles with colon", "titles without colon"], loc="lower right", framealpha=0.9, fancybox=True, shadow=True)
axis(xmin=1979, xmax=2014)

xlabel("year")
ylabel("% of titles")

title("Colons: A retrospective")
Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['title']
`%pylab --no-import-all` prevents importing * from pylab and numpy
Out[77]:
<matplotlib.text.Text at 0x784db30>
In [82]:
# But we can do more. How did the length of the title change?

for year,title_list in titles.items():
    lengths = [len(t) for t in title_list]
    errorbar(year, mean(lengths), yerr=std(lengths), color='k')
    
xlabel("year")
ylabel("title length in characters")
Out[82]:
<matplotlib.text.Text at 0x7ed3730>
In [87]:
# Or in words?

for year,title_list in titles.items():
    lengths = [len(t.split(" ")) for t in title_list]
    errorbar(year, mean(lengths), yerr=std(lengths), color='k')
    
xlabel("year")
ylabel("title length in words")
Out[87]:
<matplotlib.text.Text at 0x8d69a50>
In [88]:
# Or even average word length...

for year,title_list in titles.items():
    lengths = [len(t.replace(" ", ""))/len(t.split(" ")) for t in title_list]
    errorbar(year, mean(lengths), yerr=std(lengths), color='k')
    
xlabel("year")
ylabel("average word length in the title")

axis(ymin=0)
Out[88]:
(1980.0, 2015.0, 0, 8.0)
In [ ]: