print "This is the Jupyter notebook"
print "It provides a platform for:"
words = ['Open', 'Data', 'Science']
from random import shuffle
for i in range(3):
    shuffle(words)
    print ' '.join(words)


import urllib
import zipfile

# Download a zip snapshot of the R-vs-Python repository into the working directory.
urllib.urlretrieve('https://github.com/sjmgarnier/R-vs-Python/archive/master.zip', 'master.zip')

# Unpack every member of the archive into the current directory.
# The `with` block closes the archive when extraction finishes, and the handle
# is bound to `archive` rather than `zip` so the builtin zip() stays usable.
with zipfile.ZipFile('./master.zip', 'r') as archive:
    archive.extractall('.')

import pandas as pd # import the pandas library into a namespace called pd
# Load the film death-count CSV into a pandas DataFrame.
film_deaths = pd.read_csv('./R-vs-Python-master/Deadliest movies scrape/code/film-death-counts-Python.csv')


# Summarise the table with pandas' describe(); in a notebook the result of a
# bare expression like this is shown as the cell output.
film_deaths.describe()

# IPython-only syntax: a trailing `?` brings up the help for describe().
film_deaths.describe?

# Print the 'Year' column of the table.
print film_deaths['Year']
#print film_deaths['Body_Count']

# this ensures the plot appears in the web browser
%matplotlib inline 
import pylab as plt # this imports the plotting library in python

# Plot Body_Count against Year; the 'rx' format string draws red cross markers.
plt.plot(film_deaths['Year'], film_deaths['Body_Count'], 'rx')

# IPython-only syntax: a trailing `?` brings up the help for plt.plot.
plt.plot?

# Films whose body count exceeds 200.
film_deaths[film_deaths['Body_Count'] > 200]

# The same films ordered from the highest body count to the lowest.
# DataFrame.sort(columns=...) was deprecated and later removed from pandas;
# sort_values(by=...) is its replacement and returns a new, sorted frame.
film_deaths[film_deaths['Body_Count'] > 200].sort_values(by='Body_Count', ascending=False)

# Histogram of the Body_Count column using 20 bins.
film_deaths.Body_Count.hist(bins=20)
plt.title('Histogram of Film Kill Count')

# Body_Count against Year as red crosses, with a logarithmic death axis.
plt.plot(film_deaths.Year, film_deaths.Body_Count, 'rx')
ax = plt.gca()  # handle to the current axes
ax.set_yscale('log')
# Title and axis labels, set directly on the axes handle.
ax.set_title('Film Deaths against Year')
ax.set_ylabel('deaths')
ax.set_xlabel('year')

deaths = (film_deaths.Body_Count>40).sum()  # number of positive outcomes (in sum True counts as 1, False counts as 0)
total_films = film_deaths.Body_Count.count()

prob_death = float(deaths)/float(total_films)
print "Probability of deaths being greather than 40 is:", prob_death

for year in [2000, 2002]:
    deaths = (film_deaths.Body_Count[film_deaths.Year==year]>40).sum()
    total_films = (film_deaths.Year==year).sum()

    prob_death = float(deaths)/float(total_films)
    print "Probability of deaths being greather than 40 in year", year, "is:", prob_death

# Question 5 Answer Code
# Write code for you answer to this question in this box
# Do not delete these comments, otherwise you will get zero for this answer.
# Make sure your code has run and the answer is correct *before* submitting your notebook for marking.


year = 2000
deaths = (film_deaths.Body_Count[film_deaths.Year==year]>40).sum()
total_films = film_deaths.Body_Count.count() # this is total number of films
prob_death = float(deaths)/float(total_films)
print "Probability of deaths being greather than 40 and year being", year, "is:", prob_death

p_t = float((film_deaths.Year==2002).sum())/float(film_deaths.Body_Count.count())
p_y_given_t = float((film_deaths.Body_Count[film_deaths.Year==2002]>40).sum())/float((film_deaths.Year==2002).sum())
p_y_and_t = float((film_deaths.Body_Count[film_deaths.Year==2002]>40).sum())/float(film_deaths.Body_Count.count())

print "P(t) is", p_t
print "P(y|t) is", p_y_given_t
print "P(y,t) is", p_y_and_t

# Question 6 Answer Code
# Write code for you answer to this question in this box
# Do not delete these comments, otherwise you will get zero for this answer.
# Make sure your code has run and the answer is correct *before* submitting your notebook for marking.
%matplotlib inline
import numpy as np
import pylab as pb
# Every year from the earliest to the latest in the data, inclusive. range()
# stops before its end argument, so 1 is added to keep the final year; leaving
# it out would drop that year's films from the marginal below.
years = range(int(film_deaths.Year.min()), int(film_deaths.Year.max()) + 1)

# Denominator shared by every joint probability, computed once outside the loop.
n_films = float(film_deaths.Body_Count.count())

# P(y, t) for each year t: the fraction of all films that are from year t and
# have a body count above 40.
p_y_and_t = np.asarray([
    float((film_deaths.Body_Count[film_deaths.Year == year] > 40).sum()) / n_films
    for year in years
])

# Summing the joint probability over every year marginalises t out, giving P(y).
p_y = p_y_and_t.sum()


# Read the film death-count CSV into a DataFrame bound to the name `movies`.
movies = pd.read_csv('./R-vs-Python-master/Deadliest movies scrape/code/film-death-counts-Python.csv')
# Show the column labels of the table.
movies.columns

!easy_install -U IMDbPY

from imdb import IMDb
ia = IMDb()

for movie in ia.search_movie('python'):
    print movie 

from IPython.display import YouTubeVideo
# Create a display object for the YouTube video with id 'GX8VLYUYScM'; in a
# notebook, a bare trailing expression like this is shown as the cell output.
YouTubeVideo('GX8VLYUYScM')