#!/usr/bin/env python
# coding: utf-8
# # Easy Scraping
#
# * Author: [Pili Hu](http://hupili.net/)
# * Repo: [Easy Scraping in Python](https://github.com/hupili/workshop-easy-scraping)
# * Demo: scrapely, python-readability, pyQuery, httpie
#
# Prerequisites:
#
# * Python3
# * `pip install -r requirements.txt`
# Useful trick in IPython notebook
# In[1]:
import pprint
from IPython.core.display import HTML
# In[2]:
# Render the Initium Lab logo inline in the notebook.
# The template must contain a ``%s`` placeholder: applying ``%`` to a string
# without one raises "TypeError: not all arguments converted during string
# formatting", so the image tag that consumes the URL is spelled out here.
HTML('Logo of Initium Lab: <img src="%s" />' % 'http://initiumlab.com/favicon-32x32.png')
# A small hack to allow longer output area
# In[3]:
# Runs a `%%javascript` cell. The first JS line is commented out (`//`), so
# only the second one takes effect: it replaces
# `IPython.OutputArea.prototype._should_scroll` with a function that always
# returns false -- presumably so long outputs are never put in a scroll box.
get_ipython().run_cell_magic('javascript', '', '//IPython.OutputArea.auto_scroll_threshold = 9999;\nIPython.OutputArea.prototype._should_scroll = function(){return false;}\n')
# ## Readability
#
# We use a version ported to Python3:
#
# (already included in the `requirements.txt` file)
# In[4]:
from readability.readability import Document
import requests
# Download the raw response bytes of the Initium Lab home page.
html = requests.get('http://initiumlab.com/').content
# Each line below builds its own Document from the same bytes, so the page is
# handed to readability twice: once for the summary, once for the short title.
# `readable_article` is reused by the PyQuery cells further down.
readable_article = Document(html).summary()
readable_title = Document(html).short_title()
# In[5]:
# Print the summary exactly as readability returned it.
print(readable_article)
# In[6]:
# Wrap the same summary in an IPython HTML object. As a bare expression this
# is only displayed when it is the last statement of a notebook cell.
HTML(readable_article)
# ## PyQuery
#
# Let's fix the above URL problems
# In[7]:
import pyquery
# Imported here because this section is the first one that needs it: the
# video URLs below are resolved against the site root with `urljoin`.
from urllib.parse import urljoin

# Wrap the readability summary so it can be queried with CSS selectors.
r = pyquery.PyQuery(readable_article)
r('p')
# In[8]:
# Inspect the current (site-relative) poster image of the video element.
r('video').attr('poster')
# In[9]:
# Inspect the current (site-relative) source URL of the video.
r('video source').attr('src')
# In[10]:
# Make the poster URL absolute. `urljoin` is used instead of blindly prefixing
# the host: it leaves an already-absolute URL untouched (so re-running this
# cell is harmless) and does not produce '//' for a root-relative path.
r('video').attr('poster', urljoin('http://initiumlab.com/', r('video').attr('poster')))
# In[11]:
r('video').attr('poster')
# In[12]:
# Same treatment for the video source URL.
r('video source').attr('src', urljoin('http://initiumlab.com/', r('video source').attr('src')))
# In[13]:
r('video source').attr('src')
# In[14]:
# Serialised markup after the URL fixes.
r.html()
# In[15]:
# Same output-area scroll hack as in cell In[3]: override `_should_scroll`
# so that it always returns false.
get_ipython().run_cell_magic('javascript', '', '//IPython.OutputArea.auto_scroll_threshold = 9999;\nIPython.OutputArea.prototype._should_scroll = function(){return false;}\n')
# In[16]:
# Render the fixed markup in the notebook.
HTML(r.html())
# ## Scrapely
# In[17]:
from scrapely import Scraper
# One Scraper instance, `s`, is shared by the train and scrape cells below.
s = Scraper()
# In[18]:
# Print the built-in help text for the `train` method before using it.
help(s.train)
# In[19]:
from urllib import parse
def get_localhost_url(url):
    """Download *url* into ``tmp/`` and return a localhost URL for the copy.

    The page body is fetched with ``requests`` and written, as raw bytes, to
    ``tmp/<percent-encoded url>``. The returned string has the form
    ``http://localhost:8888/files/<percent-encoded path>?download=1`` --
    presumably the address under which a locally running notebook server
    exposes that file (confirm host/port against your setup).

    Args:
        url: Address of the page to download.

    Returns:
        The localhost URL string pointing at the saved copy.
    """
    # Function-scope import keeps this definition self-contained.
    import os

    # Percent-encode the whole URL so it is usable as a single file name.
    filename = parse.quote_plus(url)
    fullpath = 'tmp/%s' % filename
    html = requests.get(url).content
    # Create the target directory on first use instead of failing in open().
    os.makedirs('tmp', exist_ok=True)
    # `with` closes the handle (and flushes the bytes) deterministically.
    with open(fullpath, 'wb') as cached_page:
        cached_page.write(html)
    # The path is encoded a second time because it becomes part of a URL.
    return 'http://localhost:8888/files/%s?download=1' % parse.quote_plus(fullpath)
# In[20]:
# Train the scraper on one blog post: the mapping names each field to extract
# together with the value that field has on the training page.
training_url = 'http://initiumlab.com/blog/20150916-legco-eng/'
training_data = dict(
    title='Legco Matrix Brief (English)',
    author='Initium Lab',
    date='2015-09-16',
)
s.train(get_localhost_url(training_url), training_data)
# In[21]:
# Run the trained scraper against a locally saved copy of another post.
testing_url = 'http://initiumlab.com/blog/20150901-data-journalism-for-the-blind/'
s.scrape(get_localhost_url(testing_url))
# In[22]:
# ...and against one more post; `testing_url` is simply rebound.
testing_url = 'http://initiumlab.com/blog/20150922-jackathon3-review/'
s.scrape(get_localhost_url(testing_url))
# ## HTTPie & pQuery
#
# * Demo repo: https://github.com/hupili/60-data-science-book-visualisation
# * HTTPie: https://github.com/jkbrzt/httpie
# * pquery: https://github.com/hupili/pquery (CLI wrapper of PyQuery)
# In[23]:
# Run `ls -1` through the IPython shell escape.
get_ipython().system('ls -1')
# In[24]:
# Run the same command via `getoutput` and keep its result in `a`.
a = get_ipython().getoutput('ls -1')
# In[25]:
a
# In[26]:
# HTTPie request to httpbin with two `==` items (`name`, `at`); per the
# `http --help` excerpt below, `==` items are appended as URL parameters.
get_ipython().run_cell_magic('sh', '', "http get 'http://httpbin.org/get' name==hupili at=='Hardcore scraping workshop!'\n")
# In[27]:
# Same endpoint, this time with a `:` item, i.e. a custom User-Agent header.
get_ipython().run_cell_magic('sh', '', "http get 'http://httpbin.org/get' name==hupili 'User-Agent: Arbitrarily name your user agent!'\n")
# HTTPie request construction. From `http --help`
#
# ```
# ':' HTTP headers:
# Referer:http://httpie.org Cookie:foo=bar User-Agent:bacon/1.0
#
# '==' URL parameters to be appended to the request URI:
# search==httpie
#
# '=' Data fields to be serialized into a JSON object (with --json, -j)
# or form data (with --form, -f):
# name=HTTPie language=Python description='CLI HTTP client'
#
# ':=' Non-string JSON data fields (only with --json, -j):
# awesome:=true amount:=42 colors:='["red", "green", "blue"]'
#
# '@' Form file fields (only with --form, -f):
# cs@~/Documents/CV.pdf
#
# '=@' A data field like '=', but takes a file path and embeds its content:
# essay=@Documents/essay.txt
#
# ':=@' A raw JSON field like ':=', but takes a file path and embeds its content:
# package:=@./package.json
#
# You can use a backslash to escape a colliding separator in the field name:
# field-name-with\:colon=value
# ```
# In[28]:
# Fetch the page with `http --body` and show the first five lines of output.
get_ipython().run_cell_magic('sh', '', "http --body 'http://www.kdnuggets.com/2015/09/free-data-science-books.html' | head -n 5\n")
# In[29]:
# Pipe the page into pquery with the CSS selector `.three_ul li strong a`;
# `-p text` presumably prints the text of each match. Keep the first five.
get_ipython().run_cell_magic('sh', '', "http --body 'http://www.kdnuggets.com/2015/09/free-data-science-books.html' |\\\npquery '.three_ul li strong a' -p text |\\\nhead -n 5\n")
# In[30]:
# Same selector, but with `-p href` (presumably the link target of each match).
get_ipython().run_cell_magic('sh', '', "http --body 'http://www.kdnuggets.com/2015/09/free-data-science-books.html' |\\\npquery '.three_ul li strong a' -p href |\\\nhead -n 5\n")
# In[31]:
# Same selector with a `-f` template, `"{text}",{href}`, so each match is
# presumably emitted as one CSV-like line.
get_ipython().run_cell_magic('sh', '', 'http --body \'http://www.kdnuggets.com/2015/09/free-data-science-books.html\' |\\\npquery \'.three_ul li strong a\' -f \'"{text}",{href}\' |\\\nhead -n 5\n')