%pylab inline

# Step 1: Getting the data

import urllib
PulitzerHTML = urllib.urlopen("http://www.pulitzer.org/bycat/General-Nonfiction").read()

# This is what the basic format looks like
print PulitzerHTML[10198:10695]

# Let's parse it using BeautifulSoup!
from BeautifulSoup import BeautifulSoup
# Build the parse tree, then collect every <div class="citation-heading">
heading_filter = {'class': 'citation-heading'}
soup = BeautifulSoup(PulitzerHTML)
winners = soup.findAll("div", attrs=heading_filter)

# Peek at the first match. This is a bare expression: IPython echoes it when it
# is the last statement of a cell; run as a plain script it prints nothing.
winners[0]

# The year can be determined using
year = int(winners[0].find('span', attrs={'class': 'year'}).text)
print year

# And the title can be determined using
title = winners[0].find('span', attrs={'class': 'publication'}).text
print title

# Finally, we can determine whether there is a colon in the title
hascolon = ":" in title
print hascolon

# except this fails for 1979 and earlier:
year = int(winners[40].find('span', attrs={'class': 'year'}).text)
print year

# And the title can be determined using
title = winners[40].find('span', attrs={'class': 'publication'}).text
print title

# Finally, we can determine whether there is a colon in the title
hascolon = ":" in title
print hascolon

# This allows us to make a dictionary {2013: [True, ...], 2012: [...]} where the internal list represents,
# for each book in that year, whether there was a colon in the title.
# We also store the titles, as they will be useful later

from collections import defaultdict
colons_per_year = defaultdict(list)
titles = defaultdict(list)

for i,winner in enumerate(winners):
    year = int(winner.find('span', attrs={'class': 'year'}).text)
    
    if i == 40:
        # The Denial of Death by the late Ernest Becker has a slightly different format - work around it manually
        title = u'The Denial of Death'
    else:
        title = winner.find('span', attrs={'class': 'publication'}).text
    hascolon = ":" in title
    
    if i<5:
        print year, title, hascolon
    
    titles[year].append(title)
    colons_per_year[year].append(hascolon)

# Data from before 1980 cannot be used, as the finalists are not known, so
# drop those years from both dictionaries. The loop runs over a snapshot of
# the keys because entries are deleted inside it; the values are not needed.
for key in list(colons_per_year):
    if key < 1980:
        del colons_per_year[key]
        del titles[key]

# Collapse each year's list of True/False flags into one number per year:
# the mean of the flags, i.e. the fraction of that year's titles with a colon.
colon_fraction_per_year = dict(
    (year, mean(colons)) for year, colons in colons_per_year.items()
)

# and we can get to plotting!
%pylab inline
figure()

barwidth = 0.7

for year,fraction in colon_fraction_per_year.items():
    percentage = 100*fraction
    bar(year-barwidth/2, percentage, width=barwidth, bottom=0, color='blue')
    bar(year-barwidth/2, 100-percentage, width=barwidth, bottom=percentage, color='red')

legend(["titles with colon", "titles without colon"], loc="lower right", framealpha=0.9, fancybox=True, shadow=True)
axis(xmin=1979, xmax=2014)

xlabel("year")
ylabel("% of titles")

title("Colons: A retrospective")

# But we can do more. How did the length of the title change?
# For every year, plot the mean title length in characters, with the standard
# deviation of that year's lengths as the error bar.
for year, title_list in titles.items():
    char_counts = [len(t) for t in title_list]
    mean_length = mean(char_counts)
    length_spread = std(char_counts)
    errorbar(year, mean_length, yerr=length_spread, color='k')

ylabel("title length in characters")
xlabel("year")

# Or in words?
# Same plot, but a title's length is now the number of pieces obtained by
# splitting it on single spaces.
for year, title_list in titles.items():
    word_counts = [len(t.split(" ")) for t in title_list]
    mean_words = mean(word_counts)
    words_spread = std(word_counts)
    errorbar(year, mean_words, yerr=words_spread, color='k')

ylabel("title length in words")
xlabel("year")

# Or even average word length...
# Per title: the number of characters excluding spaces, divided by the number
# of space-separated words. The numerator is converted to float so the ratio
# is not truncated to a whole number by Python 2 integer division.
for year,title_list in titles.items():
    lengths = [float(len(t.replace(" ", "")))/len(t.split(" ")) for t in title_list]
    errorbar(year, mean(lengths), yerr=std(lengths), color='k')

xlabel("year")
ylabel("average word length in the title")

# Start the y axis at zero
axis(ymin=0)