print "This is the Jupyter notebook" print "It provides a platform for:" words = ['Open', 'Data', 'Science'] from random import shuffle for i in range(3): shuffle(words) print ' '.join(words) import urllib urllib.urlretrieve('https://github.com/sjmgarnier/R-vs-Python/archive/master.zip', 'master.zip') import zipfile zip = zipfile.ZipFile('./master.zip', 'r') for name in zip.namelist(): zip.extract(name, '.') import pandas as pd # import the pandas library into a namespace called pd film_deaths = pd.read_csv('./R-vs-Python-master/Deadliest movies scrape/code/film-death-counts-Python.csv') film_deaths.describe() film_deaths.describe? print film_deaths['Year'] #print film_deaths['Body_Count'] # this ensures the plot appears in the web browser %matplotlib inline import pylab as plt # this imports the plotting library in python plt.plot(film_deaths['Year'], film_deaths['Body_Count'], 'rx') plt.plot? film_deaths[film_deaths['Body_Count']>200] film_deaths[film_deaths['Body_Count']>200].sort(columns='Body_Count', ascending=False) film_deaths['Body_Count'].hist(bins=20) # histogram the data with 20 bins. plt.title('Histogram of Film Kill Count') plt.plot(film_deaths['Year'], film_deaths['Body_Count'], 'rx') ax = plt.gca() # obtain a handle to the current axis ax.set_yscale('log') # use a logarithmic death scale # give the plot some titles and labels plt.title('Film Deaths against Year') plt.ylabel('deaths') plt.xlabel('year') deaths = (film_deaths.Body_Count>40).sum() # number of positive outcomes (in sum True counts as 1, False counts as 0) total_films = film_deaths.Body_Count.count() prob_death = float(deaths)/float(total_films) print "Probability of deaths being greather than 40 is:", prob_death for year in [2000, 2002]: deaths = (film_deaths.Body_Count[film_deaths.Year==year]>40).sum() total_films = (film_deaths.Year==year).sum() prob_death = float(deaths)/float(total_films) print "Probability of deaths being greather than 40 in year", year, "is:", prob_death # Question 5 Answer Code # Write code for you answer to this question in this box # Do not delete these comments, otherwise you will get zero for this answer. # Make sure your code has run and the answer is correct *before* submitting your notebook for marking. year = 2000 deaths = (film_deaths.Body_Count[film_deaths.Year==year]>40).sum() total_films = film_deaths.Body_Count.count() # this is total number of films prob_death = float(deaths)/float(total_films) print "Probability of deaths being greather than 40 and year being", year, "is:", prob_death p_t = float((film_deaths.Year==2002).sum())/float(film_deaths.Body_Count.count()) p_y_given_t = float((film_deaths.Body_Count[film_deaths.Year==2002]>40).sum())/float((film_deaths.Year==2002).sum()) p_y_and_t = float((film_deaths.Body_Count[film_deaths.Year==2002]>40).sum())/float(film_deaths.Body_Count.count()) print "P(t) is", p_t print "P(y|t) is", p_y_given_t print "P(y,t) is", p_y_and_t # Question 6 Answer Code # Write code for you answer to this question in this box # Do not delete these comments, otherwise you will get zero for this answer. # Make sure your code has run and the answer is correct *before* submitting your notebook for marking. %matplotlib inline import numpy as np import pylab as pb years = xrange(film_deaths.Year.min(), film_deaths.Year.max()) p_y_and_t = [] for year in years: p_y_and_t.append(float((film_deaths.Body_Count[film_deaths.Year==year]>40).sum())/float(film_deaths.Body_Count.count())) p_y_and_t = np.asarray(p_y_and_t) p_y = p_y_and_t.sum() movies = pd.read_csv('./R-vs-Python-master/Deadliest movies scrape/code/film-death-counts-Python.csv') movies.columns !easy_install -U IMDbPY from imdb import IMDb ia = IMDb() for movie in ia.search_movie('python'): print movie from IPython.display import YouTubeVideo YouTubeVideo('GX8VLYUYScM')