""" Get 'Science or Fiction' data from The Skeptics' Guide to the Universe website. """
import urllib2
import contextlib
from time import sleep
from bs4 import BeautifulSoup
def _fiction_item_number(soup, episode):
    """Return which 'Science or Fiction' item of an episode was the fiction.

    soup is the parsed episode page; episode is the episode number and is
    only used in the warning message.  Returns the item number as an int,
    or None when the page has no usable three-item round.
    """
    detail = soup.find('div', {'class': 'podcast-detail'})
    if detail is None:
        # page layout is not what we expect; nothing to extract
        return None

    # find the 'Science or Fiction' section of the page
    # (if several headings match, the last one is used)
    scifi_title = None
    for title in detail.find_all('h3'):
        if 'Science or Fiction' in title.text:
            scifi_title = title
    # not all episodes have a round of scifi
    if scifi_title is None:
        return None

    # get the list of news items from section
    item_list = scifi_title.find_next('ul')
    if item_list is None:
        return None
    scifi_items = item_list.select('li')
    # only want to look at scifis with 3 items.
    if len(scifi_items) != 3:
        return None

    fiction_number = None
    for item in scifi_items:
        split_item = item.select('span')
        # need a label span ("... #N") and a verdict span
        if len(split_item) < 2:
            continue
        if split_item[1].text.strip().lower() != 'fiction':
            continue
        # the item number is whatever follows the first '#' in the label;
        # a label without '#' yields '' and is reported like any other
        # unparseable number
        label_parts = split_item[0].text.split('#')
        number_text = label_parts[1].strip() if len(label_parts) > 1 else ''
        try:
            fiction_number = int(number_text)
        except ValueError:
            print("ValueError with episode %s!" % episode)
    return fiction_number


# put data in dictionary, counts = {episode_num: fiction_num, ...}
counts = {}
# first episode with science or fiction info is episode 43.
for i in range(43, 481):
    url = "http://www.theskepticsguide.org/podcast/sgu/%s" % i
    try:
        with contextlib.closing(urllib2.urlopen(url)) as response:
            html = response.read()
    except urllib2.URLError as err:
        # HTTPError is a subclass of URLError; skip this episode instead of
        # throwing away everything scraped so far
        print("Could not fetch episode %s: %s" % (i, err))
    else:
        # NOTE(review): no parser is named, so bs4 picks whichever is
        # installed; consider pinning one for reproducible results.
        fiction_number = _fiction_item_number(BeautifulSoup(html), i)
        if fiction_number is not None:
            counts[i] = fiction_number
    # be very gentle on the website
    sleep(5)
# Output:
#   ValueError with episode 247!
# tally how often each position (item 1, 2 or 3) was the fiction one
values = counts.values()
n1, n2, n3 = (values.count(position) for position in (1, 2, 3))
# turn the tallies into fractions of all recorded episodes
total = float(len(values))
p1, p2, p3 = (tally / total for tally in (n1, n2, n3))
print("%s %s %s" % (n1, n2, n3))
print("%s %s %s" % (p1, p2, p3))
# Output:
#   128 119 133
#   0.336842105263 0.313157894737 0.35
# make a little plot
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
mpl.rcParams['savefig.dpi'] = 100 # make the graph display nicer
ind = [0]
plt.figure(figsize=(2.5,5))
plot1 = plt.bar(ind, p1, 0.2, color='#66c2a5') # colors from http://colorbrewer2.org/
plot2 = plt.bar(ind, p2, 0.2, color='#fc8d62', bottom=p1)
plot3 = plt.bar(ind, p3, 0.2, color='#8da0cb', bottom=p1+p2)
# remove x ticks
plt.tick_params(axis='x',which='both',bottom='off',top='off',labelbottom='off')
plt.ylabel('Distribution of Fiction')
plt.legend((plot3, plot2, plot1), ('Item 3', 'Item 2', 'Item 1'))
plt.show()
# the stacked bar does not really tell us anything, so check it numerically:
# run a chi-squared test on the three tallies to see whether the differences
# between the positions are actually significant
from scipy import stats
observations = [n1, n2, n3]
chisq, p = stats.chisquare(observations)
print("ChiSq: %s \n P: %s" % (chisq, p))
# Output:
#   ChiSq: 0.794736842105
#    P: 0.672086369247