%pylab inline # Step 1: Getting the data import urllib PulitzerHTML = urllib.urlopen("http://www.pulitzer.org/bycat/General-Nonfiction").read() # This is what the basic format looks like print PulitzerHTML[10198:10695] # Let's parse it using BeautifulSoup! from BeautifulSoup import BeautifulSoup soup = BeautifulSoup(PulitzerHTML) winners = soup.findAll("div", attrs={'class': 'citation-heading'}) winners[0] # The year can be determined using year = int(winners[0].find('span', attrs={'class': 'year'}).text) print year # And the title can be determined using title = winners[0].find('span', attrs={'class': 'publication'}).text print title # Finally, we can determine whether there is a colon in the title hascolon = ":" in title print hascolon # except this fails for 1979 and earlier: year = int(winners[40].find('span', attrs={'class': 'year'}).text) print year # And the title can be determined using title = winners[40].find('span', attrs={'class': 'publication'}).text print title # Finally, we can determine whether there is a colon in the title hascolon = ":" in title print hascolon # This allows us to make a dictionary {2013: [True, ...], 2012: [...]} where the internal list represents, # for each book in that year, whether there was a colon in the title. # We also store the titles, as they will be useful later from collections import defaultdict colons_per_year = defaultdict(list) titles = defaultdict(list) for i,winner in enumerate(winners): year = int(winner.find('span', attrs={'class': 'year'}).text) if i == 40: # The Denial of Death by the late Ernest Becker has a slightly different format - work around it manually title = u'The Denial of Death' else: title = winner.find('span', attrs={'class': 'publication'}).text hascolon = ":" in title if i<5: print year, title, hascolon titles[year].append(title) colons_per_year[year].append(hascolon) # However, we cannot use data for before 1980, as the finalists are not known. Remove them for key,value in colons_per_year.items(): if key < 1980: del colons_per_year[key] del titles[key] # Good, now we can summarize the data! colon_fraction_per_year = {year: mean(colons) for year,colons in colons_per_year.items()} # and we can get to plotting! %pylab inline figure() barwidth = 0.7 for year,fraction in colon_fraction_per_year.items(): percentage = 100*fraction bar(year-barwidth/2, percentage, width=barwidth, bottom=0, color='blue') bar(year-barwidth/2, 100-percentage, width=barwidth, bottom=percentage, color='red') legend(["titles with colon", "titles without colon"], loc="lower right", framealpha=0.9, fancybox=True, shadow=True) axis(xmin=1979, xmax=2014) xlabel("year") ylabel("% of titles") title("Colons: A retrospective") # But we can do more. How did the length of the title change? for year,title_list in titles.items(): lengths = [len(t) for t in title_list] errorbar(year, mean(lengths), yerr=std(lengths), color='k') xlabel("year") ylabel("title length in characters") # Or in words? for year,title_list in titles.items(): lengths = [len(t.split(" ")) for t in title_list] errorbar(year, mean(lengths), yerr=std(lengths), color='k') xlabel("year") ylabel("title length in words") # Or even average word length... for year,title_list in titles.items(): lengths = [len(t.replace(" ", ""))/len(t.split(" ")) for t in title_list] errorbar(year, mean(lengths), yerr=std(lengths), color='k') xlabel("year") ylabel("average word length in the title") axis(ymin=0)