#!/usr/bin/env python
# coding: utf-8

# # Easy Scraping
#
# * Author: [Pili Hu](http://hupili.net/)
# * Repo: [Easy Scraping in Python](https://github.com/hupili/workshop-easy-scraping)
# * Demo: scrapely, python-readability, pyQuery, httpie
#
# Prerequisites:
#
# * Python3
# * `pip install -r requirements.txt`

# Useful trick in IPython notebook

# In[1]:

import pprint
from IPython.core.display import HTML


# In[2]:

# The format string needs a `%s` placeholder; without one the `%` operator
# raises "TypeError: not all arguments converted during string formatting".
HTML('Logo of Initium Lab: <img src="%s">' % 'http://initiumlab.com/favicon-32x32.png')


# A small hack to allow longer output area

# In[3]:

get_ipython().run_cell_magic('javascript', '', '//IPython.OutputArea.auto_scroll_threshold = 9999;\nIPython.OutputArea.prototype._should_scroll = function(){return false;}\n')


# ## Readability
#
# We use a version ported to Python3:
#
# (already included in the `requirements.txt` file)

# In[4]:

from readability.readability import Document
import requests

# Download the page once and extract both the readable body and its title.
html = requests.get('http://initiumlab.com/').content
readable_article = Document(html).summary()
readable_title = Document(html).short_title()


# In[5]:

print(readable_article)


# In[6]:

HTML(readable_article)


# ## PyQuery
#
# Let's fix the above URL problems

# In[7]:

import pyquery

r = pyquery.PyQuery(readable_article)
r('p')


# In[8]:

r('video').attr('poster')


# In[9]:

r('video source').attr('src')


# In[10]:

# Prefix the attribute value with the site root so it becomes an absolute URL.
r('video').attr('poster', 'http://initiumlab.com/%s' % r('video').attr('poster'))


# In[11]:

r('video').attr('poster')


# In[12]:

# Same fix for the video source URL.
r('video source').attr('src', 'http://initiumlab.com/%s' % r('video source').attr('src'))


# In[13]:

r('video source').attr('src')


# In[14]:

r.html()


# In[15]:

get_ipython().run_cell_magic('javascript', '', '//IPython.OutputArea.auto_scroll_threshold = 9999;\nIPython.OutputArea.prototype._should_scroll = function(){return false;}\n')


# In[16]:

HTML(r.html())


# ## Scrapely

# In[17]:

from scrapely import Scraper

s = Scraper()


# In[18]:

help(s.train)


# In[19]:

import os
from urllib import parse


def get_localhost_url(url):
    """Download *url* into ``tmp/`` and return a localhost URL for the copy.

    The page is stored under ``tmp/`` using the URL-quoted form of *url* as
    the file name. The returned URL points at
    ``http://localhost:8888/files/<quoted path>?download=1``.

    NOTE(review): presumably port 8888 is the local notebook server serving
    the working directory under ``/files/`` -- confirm for your setup.
    """
    filename = parse.quote_plus(url)
    fullpath = 'tmp/%s' % filename
    html = requests.get(url).content
    # Make sure the target directory exists before writing into it.
    os.makedirs('tmp', exist_ok=True)
    # Use a context manager so the file is flushed and closed deterministically.
    with open(fullpath, 'wb') as f:
        f.write(html)
    # The path is quoted a second time because the file name itself already
    # contains percent-escapes that must survive URL decoding.
    return 'http://localhost:8888/files/%s?download=1' % parse.quote_plus(fullpath)


# In[20]:

training_url = 'http://initiumlab.com/blog/20150916-legco-eng/'
training_data = {'title': 'Legco Matrix Brief (English)',
                 'author': 'Initium Lab',
                 'date': '2015-09-16'}
s.train(get_localhost_url(training_url), training_data)


# In[21]:

testing_url = 'http://initiumlab.com/blog/20150901-data-journalism-for-the-blind/'
s.scrape(get_localhost_url(testing_url))


# In[22]:

testing_url = 'http://initiumlab.com/blog/20150922-jackathon3-review/'
s.scrape(get_localhost_url(testing_url))


# ## HTTPie & pQuery
#
# * Demo repo: https://github.com/hupili/60-data-science-book-visualisation
# * HTTPie: https://github.com/jkbrzt/httpie
# * pquery: https://github.com/hupili/pquery (CLI wrapper of PyQuery)

# In[23]:

get_ipython().system('ls -1')


# In[24]:

a = get_ipython().getoutput('ls -1')


# In[25]:

a


# In[26]:

get_ipython().run_cell_magic('sh', '', "http get 'http://httpbin.org/get' name==hupili at=='Hardcore scraping workshop!'\n")


# In[27]:

get_ipython().run_cell_magic('sh', '', "http get 'http://httpbin.org/get' name==hupili 'User-Agent: Arbitrarily name your user agent!'\n")


# HTTPie request construction. From `http --help`
#
# ```
#   ':' HTTP headers:
#       Referer:http://httpie.org Cookie:foo=bar User-Agent:bacon/1.0
#
#   '==' URL parameters to be appended to the request URI:
#       search==httpie
#
#   '=' Data fields to be serialized into a JSON object (with --json, -j)
#       or form data (with --form, -f):
#       name=HTTPie language=Python description='CLI HTTP client'
#
#   ':=' Non-string JSON data fields (only with --json, -j):
#       awesome:=true amount:=42 colors:='["red", "green", "blue"]'
#
#   '@' Form file fields (only with --form, -f):
#       cs@~/Documents/CV.pdf
#
#   '=@' A data field like '=', but takes a file path and embeds its content:
#       essay=@Documents/essay.txt
#
#   ':=@' A raw JSON field like ':=', but takes a file path and embeds its content:
#       package:=@./package.json
#
#   You can use a backslash to escape a colliding separator in the field name:
#       field-name-with\:colon=value
# ```

# In[28]:

get_ipython().run_cell_magic('sh', '', "http --body 'http://www.kdnuggets.com/2015/09/free-data-science-books.html' | head -n 5\n")


# In[29]:

get_ipython().run_cell_magic('sh', '', "http --body 'http://www.kdnuggets.com/2015/09/free-data-science-books.html' |\\\npquery '.three_ul li strong a' -p text |\\\nhead -n 5\n")


# In[30]:

get_ipython().run_cell_magic('sh', '', "http --body 'http://www.kdnuggets.com/2015/09/free-data-science-books.html' |\\\npquery '.three_ul li strong a' -p href |\\\nhead -n 5\n")


# In[31]:

get_ipython().run_cell_magic('sh', '', 'http --body \'http://www.kdnuggets.com/2015/09/free-data-science-books.html\' |\\\npquery \'.three_ul li strong a\' -f \'"{text}",{href}\' |\\\nhead -n 5\n')