#!/usr/bin/env python
# coding: utf-8
# # IMDb Scraping
#
# requests is a python library for dealing with web pages.
# http://docs.python-requests.org/en/v2.0-0/user/quickstart/
# In[3]:
import requests
from pattern import web
from BeautifulSoup import BeautifulSoup
# ### Two ways of making requests
#
# #### 1. Explicit url
# In[4]:
# Way 1: the query string (sort, start, title_type, year) is written out by
# hand as part of the url itself.
url = 'http://www.imdb.com/search/title?sort=num_votes,desc&start=1&title_type=feature&year=1950,2015'
# Issue the GET request; r is the response object for that url.
r = requests.get(url)
# Show the url the response reports for this request.
print r.url
# #### 2. Base url with GET dictionary
# params is a way to specify the query-string parameters that get added to the url
# In[5]:
# Way 2: keep only the base url here and pass the query parameters separately.
url = 'http://www.imdb.com/search/title'
# Same four parameters as the hand-written query string of the explicit url.
params = dict(sort='num_votes,desc', start=1, title_type='feature', year='1950,2015')
# requests appends params to url; this response (r) is the one parsed by the
# Pattern and BeautifulSoup examples further down.
r = requests.get(url, params=params)
print r.url # notice it constructs the full url for you
# ## Using Pattern
#
# check this out
# http://www.clips.ua.ac.be/pages/pattern-web
# r.text has the source code for the entire webpage. dom will now have all the tags from the source code.
# In[6]:
dom = web.Element(r.text)
for movie in dom.by_tag('td.title'):
title = movie.by_tag('a')[0].content #content of a tag is the stuff between the opening and closing tags
runtime = movie.by_tag('span.runtime')[0].content
rating = movie.by_tag('span.value')[0].content
genres = movie.by_tag('span.genre')[0].by_tag('a')
genre = [g.content for g in genres]
#could have as well done
#genre = []
#for g in genres:
# genre.append(g.content)
print title, runtime, rating, genre
# ## Using BeautifulSoup
#
# Beautiful Soup is a Python library for pulling data out of HTML and XML files.
# Check documentation here.
# http://www.crummy.com/software/BeautifulSoup/bs4/doc/
# In[7]:
bs = BeautifulSoup(r.text) #gives you the source code
for movie in bs.findAll('td','title'):
title = movie.find('a').contents[0] #use only find when you know you want only one value of a tag
runtime = movie.find('span','runtime').contents[0]
rating = movie.find('span','value').contents[0]
genres = movie.find('span','genre').findAll('a')
genre = [g.contents[0] for g in genres]
print title, runtime, rating, genre
#http://www.crummy.com/software/BeautifulSoup/bs4/doc/#calling-a-tag-is-like-calling-find-all
#
#
# #### So now how do you get the top 200 movies?
# You'll have to iterate over the start parameter in the get request function.
# syntax of xrange is xrange(start, stop, [step])
# It stops before the value reaches stop, so xrange(1, 200, 50) yields 1, 51, 101 and 151.
# In[ ]:
url = 'http://www.imdb.com/search/title'
for i in xrange(1,200,50):
params = dict(sort='num_votes,desc', start=i, title_type='feature', year='1950,2015')
r = requests.get(url, params=params)
dom = web.Element(r.text)
for movie in dom.by_tag('td.title'):
title = movie.by_tag('a')[0].content
year = movie.by_tag('span.year_type')[0].content
runtime = movie.by_tag('span.runtime')[0].content
rating = movie.by_tag('span.value')[0].content
genres = movie.by_tag('span.genre')[0].by_tag('a')
genre = [g.content for g in genres]
print title, year, runtime, rating, genre
# ## Writing the scraped data into a file
# Now, we'll write this data into a file for further analysis.
# In[ ]:
# Scrape the four result pages (start = 1, 51, 101, 151) and write one
# tab-separated row per movie to imdb_top_200.txt, in the column order
# title, year, rating, runtime, genre list.
url = 'http://www.imdb.com/search/title'
# Open with 'w' instead of 'a': in append mode every re-run stacked a fresh
# copy of the listing under the rows left by earlier runs, which is the most
# likely source of duplicated movies in the file.  The with-block also makes
# sure the file is closed if a request or a tag lookup raises part-way.
with open('imdb_top_200.txt', 'w') as imdb:
    for i in xrange(1, 200, 50):
        params = dict(sort='num_votes,desc', start=i, title_type='feature', year='1950,2015')
        r = requests.get(url, params=params)
        dom = web.Element(r.text)
        for movie in dom.by_tag('td.title'):
            title = movie.by_tag('a')[0].content
            year = movie.by_tag('span.year_type')[0].content
            runtime = movie.by_tag('span.runtime')[0].content
            rating = movie.by_tag('span.value')[0].content
            genres = movie.by_tag('span.genre')[0].by_tag('a')
            genre = [g.content for g in genres]
            line = title+'\t'+year+'\t'+str(rating)+'\t'+str(runtime)+'\t'+str(genre)+'\n'
            # A Python 2 file object encodes unicode text with the ASCII
            # codec, so a title containing a non-ASCII character would raise
            # UnicodeEncodeError.  Encode explicitly; byte strings pass
            # through unchanged.
            if isinstance(line, unicode):
                line = line.encode('utf-8')
            imdb.write(line)
# This pretty much gives me the top 200 movies but there are some redundant movies being written at the top of the file.
# Will have to look into that.