#!/usr/bin/env python
# coding: utf-8

# # IMDb Scraping
#
# requests is a Python library for dealing with web pages.
# http://docs.python-requests.org/en/v2.0-0/user/quickstart/

# In[3]:

import requests
from pattern import web
from BeautifulSoup import BeautifulSoup


# ### Two ways of making requests
#
# #### 1. Explicit url

# In[4]:

# Here the complete query string is written out by hand inside the address.
url = 'http://www.imdb.com/search/title?sort=num_votes,desc&start=1&title_type=feature&year=1950,2015'
r = requests.get(url)
print(r.url)


# #### 2. Base url with GET dictionary
# params is the way to hand the query-string fields to requests separately
# from the base url.

# In[5]:

url = 'http://www.imdb.com/search/title'
params = dict(
    sort='num_votes,desc',
    start=1,
    title_type='feature',
    year='1950,2015',
)
r = requests.get(url, params=params)
print(r.url)  # notice it constructs the full url for you


# ## Using Pattern
#
# check this out
# http://www.clips.ua.ac.be/pages/pattern-web
# r.text holds the source code of the entire webpage; dom then gives access
# to all the tags found in that source code.

# In[6]:

dom = web.Element(r.text)
for movie in dom.by_tag('td.title'):
    # The content of a tag is whatever sits between its opening and
    # closing tags.
    title = movie.by_tag('a')[0].content
    runtime = movie.by_tag('span.runtime')[0].content
    rating = movie.by_tag('span.value')[0].content
    genres = movie.by_tag('span.genre')[0].by_tag('a')
    # Could just as well be written as: genre = [g.content for g in genres]
    genre = []
    for g in genres:
        genre.append(g.content)
    # One formatted string gives the same space-separated output line.
    print('%s %s %s %s' % (title, runtime, rating, genre))


# ## Using BeautifulSoup
#
# Beautiful Soup is a Python library for pulling data out of HTML and XML files.
# Check documentation here.
# http://www.crummy.com/software/BeautifulSoup/bs4/doc/

# In[7]:

bs = BeautifulSoup(r.text)  # parsed from the page's source code
for movie in bs.findAll('td', 'title'):
    # find() hands back a single match, so use it when only one value of a
    # tag is wanted; findAll() returns every match.
    title = movie.find('a').contents[0]
    runtime = movie.find('span', 'runtime').contents[0]
    rating = movie.find('span', 'value').contents[0]
    genres = movie.find('span', 'genre').findAll('a')
    genre = []
    for g in genres:
        genre.append(g.contents[0])
    # One formatted string gives the same space-separated output line.
    print('%s %s %s %s' % (title, runtime, rating, genre))

# http://www.crummy.com/software/BeautifulSoup/bs4/doc/#calling-a-tag-is-like-calling-find-all
#
#
# #### So now how do you get the top 200 movies?
# You'll have to iterate over the start parameter in the get request function.
# syntax of xrange is xrange(start, stop, [step])
# It exits the loop before the iteration can reach stop.

# In[ ]:

def iter_top_movies(url, total=200, step=50):
    """Yield one (title, year, runtime, rating, genre) tuple per listed movie.

    Requests the search page at ``url`` repeatedly, sorted by number of
    votes, with the ``start`` offset running over
    ``xrange(1, total, step)`` (1, 51, 101, 151 with the defaults), and
    parses every ``td.title`` cell of each response with pattern.web.

    ``step`` is assumed to equal the number of rows the site returns per
    request; with the defaults that is 4 requests for 200 movies.

    ``genre`` is a list with the text of each genre link; the other four
    fields are the raw tag contents.  A row that lacks one of the expected
    tags raises IndexError, exactly as the inline loops did.
    """
    for start in xrange(1, total, step):
        params = dict(sort='num_votes,desc', start=start,
                      title_type='feature', year='1950,2015')
        r = requests.get(url, params=params)
        dom = web.Element(r.text)
        for movie in dom.by_tag('td.title'):
            title = movie.by_tag('a')[0].content
            year = movie.by_tag('span.year_type')[0].content
            runtime = movie.by_tag('span.runtime')[0].content
            rating = movie.by_tag('span.value')[0].content
            genres = movie.by_tag('span.genre')[0].by_tag('a')
            genre = [g.content for g in genres]
            yield title, year, runtime, rating, genre


url = 'http://www.imdb.com/search/title'
for title, year, runtime, rating, genre in iter_top_movies(url):
    print('%s %s %s %s %s' % (title, year, runtime, rating, genre))


# ## Writing the scraped data into a file
# Now, we'll write this data into a file for further analysis.

# In[ ]:

import io

url = 'http://www.imdb.com/search/title'
# 'w' truncates the file so each run starts empty, and the with-block closes
# it even if a request or a parse step raises part-way through.
# io.open with an explicit encoding lets titles with non-ASCII characters be
# written instead of failing on the implicit ASCII conversion.
with io.open('imdb_top_200.txt', 'w', encoding='utf-8') as imdb:
    for title, year, runtime, rating, genre in iter_top_movies(url):
        # Column order: title, year, rating, runtime, genre list.
        fields = [title, year, rating, runtime, str(genre)]
        imdb.write(u'\t'.join(fields) + u'\n')

# This gives the top 200 movies, one tab-separated row each.
# The file used to be opened in append mode ('a'), so rows from earlier
# (possibly partial) runs stayed at the top of the file; that is the likely
# source of the redundant movies seen there.  Opening with 'w' removes them.