%pylab inline
Populating the interactive namespace from numpy and matplotlib
This is inspired by Sumana's Colons: A Retrospective (http://www.harihareswara.net/sumana/2013/11/30/0)
The basic idea: take data directly from the source, parse it using Python, and make a pretty(-ish) graph with matplotlib.
# Step 1: Getting the data
import contextlib
import urllib

# Download the raw HTML of the General Nonfiction listing.
# urlopen() hands back a file-like object; closing() guarantees it is closed
# again, even if read() raises.
with contextlib.closing(
        urllib.urlopen("http://www.pulitzer.org/bycat/General-Nonfiction")) as response:
    PulitzerHTML = response.read()

# This is what the basic format looks like.
# NOTE: the slice offsets are hard-coded for the page as it was downloaded;
# they will point somewhere else if the page content changes.
print(PulitzerHTML[10198:10695])
<div class="citation-heading"> <a name="winner_2009"></a><span class="year">2009</span> <span class="title">Douglas A. Blackmon</span> <span class="publication">Slavery by Another Name: The Re-Enslavement of Black Americans from the Civil War to World War II</span></div> <div class="view-field view-data-field-citation-value"> <p>A precise and eloquent work that examines a deliberate system of racial suppression and that rescues a multitude of atrocities from virtual obscurity.</p> </div>
# Let's parse it using BeautifulSoup!
from BeautifulSoup import BeautifulSoup

# Build the parse tree, then collect every <div class="citation-heading">
# element; the year and title lookups further down are made on these.
soup = BeautifulSoup(PulitzerHTML)
heading_filter = {'class': 'citation-heading'}
winners = soup.findAll("div", attrs=heading_filter)

# Display the first match to check what was picked up.
winners[0]
<div class="citation-heading"> <a name="winner_2013"></a><span class="year">2013</span> <span class="title">Gilbert King</span> <span class="publication">Devil in the Grove: Thurgood Marshall, the Groveland Boys</span></div>
# Take the first entry apart as a worked example.
first_entry = winners[0]

# The year is the text of the <span class="year"> element.
year_span = first_entry.find('span', attrs={'class': 'year'})
year = int(year_span.text)
print(year)

# The book title is the text of the <span class="publication"> element.
title_span = first_entry.find('span', attrs={'class': 'publication'})
title = title_span.text
print(title)

# Finally, check whether the title contains a colon.
hascolon = ":" in title
print(hascolon)
2013 Devil in the Grove: Thurgood Marshall, the Groveland Boys True
# ...except that the same recipe fails for older entries (1979 and earlier,
# according to the original note). Entry 40 is used as the example here.
old_entry = winners[40]

# The year lookup still works.
year = int(old_entry.find('span', attrs={'class': 'year'}).text)
print(year)

# The title lookup does not: this entry has no <span class="publication">,
# so find() returns None and accessing .text raises AttributeError.
title = old_entry.find('span', attrs={'class': 'publication'}).text
print(title)

# Not reached for this entry because of the exception above.
hascolon = ":" in title
print(hascolon)
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-46-0da5dc330ec4> in <module>() 4 5 # And the title can be determined using ----> 6 title = winners[40].find('span', attrs={'class': 'publication'}).text 7 print title 8 AttributeError: 'NoneType' object has no attribute 'text'
1974
# Build two dictionaries keyed by year:
#   colons_per_year  {2013: [True, ...], 2012: [...]} - for each book in that
#                    year, whether there was a colon in the title
#   titles           {2013: [u'...', ...], ...} - the titles themselves, as
#                    they will be useful later
from collections import defaultdict

# Titles for entries whose heading lacks the <span class="publication">
# element, keyed by year. The Denial of Death by the late Ernest Becker (1974)
# has a slightly different format - work around it manually. Keying on the
# year rather than on the position in `winners` keeps the workaround valid
# when the entry's position in the listing shifts.
manual_titles = {1974: u'The Denial of Death'}

colons_per_year = defaultdict(list)
titles = defaultdict(list)
for i, winner in enumerate(winners):
    year = int(winner.find('span', attrs={'class': 'year'}).text)

    # Named book_title (not title) so this loop does not shadow pylab's
    # title() function, which is needed later for the plot heading.
    publication = winner.find('span', attrs={'class': 'publication'})
    if publication is None:
        # A KeyError here means another oddly formatted entry has appeared
        # and needs its own line in manual_titles.
        book_title = manual_titles[year]
    else:
        book_title = publication.text

    hascolon = ":" in book_title
    if i < 5:
        # Print the first few entries as a sanity check.
        print("%s %s %s" % (year, book_title, hascolon))

    titles[year].append(book_title)
    colons_per_year[year].append(hascolon)
2013 Devil in the Grove: Thurgood Marshall, the Groveland Boys True 2012 The Swerve: How the World Became Modern True 2011 The Emperor of All Maladies: A Biography of Cancer True 2010 The Dead Hand: The Untold Story of the Cold War Arms Race and Its Dangerous Legacy True 2009 Slavery by Another Name: The Re-Enslavement of Black Americans from the Civil War to World War II True
# However, we cannot use data from before 1980, as the finalists are not known. Remove them.
# Loop over a snapshot of the keys: deleting entries while iterating the
# dictionary itself is only safe when the iteration works on a copy, so the
# copy is made explicit here instead of relying on what .items() returns.
for key in list(colons_per_year):
    if key < 1980:
        del colons_per_year[key]
        del titles[key]
# Good, now we can summarize the data!
# The mean of a year's True/False flags is the fraction of that year's titles
# that contain a colon.
colon_fraction_per_year = dict(
    (yr, mean(flags)) for yr, flags in colons_per_year.items()
)
# and we can get to plotting!
%pylab inline
figure()
barwidth = 0.7
for year, fraction in colon_fraction_per_year.items():
    percentage = 100 * fraction
    # Two stacked bars per year: blue from 0 up to the colon percentage, red
    # from there up to 100. align='center' states explicitly that each bar is
    # centred on its year, instead of shifting x by half a bar width and
    # relying on the library's default alignment.
    bar(year, percentage, width=barwidth, bottom=0, color='blue', align='center')
    bar(year, 100 - percentage, width=barwidth, bottom=percentage, color='red', align='center')
# Only two labels are passed; presumably they attach to the first two bars
# drawn (one blue, one red) - confirm against the rendered legend.
legend(["titles with colon", "titles without colon"], loc="lower right", framealpha=0.9, fancybox=True, shadow=True)
axis(xmin=1979, xmax=2014)
xlabel("year")
ylabel("% of titles")
title("Colons: A retrospective")
Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['title'] `%pylab --no-import-all` prevents importing * from pylab and numpy
<matplotlib.text.Text at 0x784db30>
# But we can do more. How did the length of the title change?
# One point per year: the mean title length in characters, with the standard
# deviation across that year's titles as the error bar.
for year, year_titles in titles.items():
    char_counts = [len(book) for book in year_titles]
    centre = mean(char_counts)
    spread = std(char_counts)
    errorbar(year, centre, yerr=spread, color='k')
xlabel("year")
ylabel("title length in characters")
<matplotlib.text.Text at 0x7ed3730>
# Or in words?
# Same plot as for characters, but counting the pieces obtained by splitting
# each title on single spaces.
for year, year_titles in titles.items():
    word_counts = []
    for book in year_titles:
        pieces = book.split(" ")
        word_counts.append(len(pieces))
    errorbar(year, mean(word_counts), yerr=std(word_counts), color='k')
xlabel("year")
ylabel("title length in words")
<matplotlib.text.Text at 0x8d69a50>
# Or even average word length...
# For each title: number of non-space characters divided by number of words
# (pieces obtained by splitting on single spaces).
for year, title_list in titles.items():
    lengths = []
    for t in title_list:
        words = t.split(" ")
        letters = len(t.replace(" ", ""))
        # float() forces true division; dividing two ints would floor every
        # per-title average to a whole number under Python 2 semantics.
        lengths.append(float(letters) / len(words))
    errorbar(year, mean(lengths), yerr=std(lengths), color='k')
xlabel("year")
ylabel("average word length in the title")
axis(ymin=0)
(1980.0, 2015.0, 0, 8.0)