# This is the input block -- a full-blown Python shell
print('Look: I will be shown on output block')
Look: I will be shown on output block
import pprint
from IPython.core.display import HTML
HTML('Logo of Initium Lab: <img src="%s">' % 'http://initiumlab.com/favicon-32x32.png')
# Display any HTML easily
my_html = '''
I'm going to show you:
<ul>
<li> PyReadability </li>
<li> PyQuery </li>
<li> ... </li>
</ul>
'''
HTML(my_html)
%%javascript
//IPython.OutputArea.auto_scroll_threshold = 9999;
IPython.OutputArea.prototype._should_scroll = function(){return false;}
# I'm going to insert some slides here
from IPython.core.display import Image
Hong Kong Legislative Council:
https://theinitium.com/article/20150812-hongkong-legcoanalysis/
http://legco.initiumlab.com/matrix
Hong Kong District Council Election:
https://theinitium.com/project/20151012-hk-district-council-elections/
https://theinitium.com/project/20151019-hk-district-council-elections-2/
https://theinitium.com/project/20151029-hk-district-council-elections-3/
Image('assets/venn-skillset.png')
Image('assets/workflow-highlight-data-collection.png')
print('screenshot from: https://cerg1.ugc.edu.hk/cergprod/scrrm00541.jsp')
Image('assets/rgc-official-site.png')
screenshot from: https://cerg1.ugc.edu.hk/cergprod/scrrm00541.jsp
print('e.g. # of Hong Kong v.s. Non Hong Kong studies (Social Science)')
print('(Just draft labeling! -- cite the figure at your own risk)')
Image('assets/hk-non-hk-studies-humanities.png')
e.g. # of Hong Kong v.s. Non Hong Kong studies (Social Science) (Just draft labeling! -- cite the figure at your own risk)
%%sh
ls -1
Easy Scraping.html Easy Scraping.ipynb README.md Scrape More with Less Codes.ipynb assets hosts output path-list.txt requirements.txt tmp venv
%%sh
curl -s 'http://initiumlab.com/' | head -n 8
<!doctype html> <html class="theme-next use-motion "> <head> <meta charset="UTF-8"/> <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1" /> <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1"/>
import requests
# Fetch the homepage; .content gives the raw response body as bytes
html = requests.get('http://initiumlab.com/').content
# Peek at the first 500 bytes to confirm we actually got the page
html[:500]
b'<!doctype html>\n<html class="theme-next use-motion ">\n<head>\n \n\n<meta charset="UTF-8"/>\n<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1" />\n<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1"/>\n\n\n<meta http-equiv="Cache-Control" content="no-transform" />\n<meta http-equiv="Cache-Control" content="no-siteapp" />\n\n\n\n\n\n\n <link rel="stylesheet" type="text/css" href="./vendors/fancybox/source/jquery.fancybox.css?v=2.1.5"/>\n\n\n\n <link href=\'//fonts.google'
Not an easy task, generally:
%%sh
curl -s 'http://initiumlab.com/' | grep title
<link rel="alternate" href="./blog/feed.xml" title="Initium Lab" type="application/atom+xml" /> <meta property="og:title" content="Initium Lab"> <meta name="twitter:title" content="Initium Lab"> <title> Initium Lab </title> <div class='subtitle' id="titleWorks"> <div class='subtitle' id='titleBlogs'> <h1 class="post-title" itemprop="name headline"> <a class="post-title-link" href="./blog/20151025-jackathon-no-5/" itemprop="url"> </h1> <!-- h1.post-title --> <h1 class="post-title" itemprop="name headline"> <a class="post-title-link" href="./blog/20151015-3d-infographic-user-testing/" itemprop="url"> </h1> <!-- h1.post-title --> <h1 class="post-title" itemprop="name headline"> <a class="post-title-link" href="./blog/20151005-read-journalism/" itemprop="url"> </h1> <!-- h1.post-title --> <h1 class="post-title" itemprop="name headline"> <a class="post-title-link" href="./blog/20150925-react-in-1-hour-cuhk/" itemprop="url"> </h1> <!-- h1.post-title --> <h1 class="post-title" itemprop="name headline"> <a class="post-title-link" href="./blog/20150922-jackathon3-review/" itemprop="url"> </h1> <!-- h1.post-title --> <h1 class="post-title" itemprop="name headline"> <a class="post-title-link" href="./blog/20150916-legco-eng/" itemprop="url"> </h1> <!-- h1.post-title --> var disqus_title = '';
%%sh
curl -s 'http://initiumlab.com/' | grep '<title'
<title> Initium Lab </title>
[Y]
items will be involved in this talk.
For a mature project, you usually loop between Download and Parse; e.g. scrapy
is a widely used framework.
Keywords of this demo:
Human-friendly command tool written in Python
%%sh
http get http://initiumlab.com | head -n 50 | tail -n 10
<meta property="og:type" content="website"> <meta property="og:title" content="Initium Lab"> <meta property="og:url" content="http://initiumlab.com/blog/index.html"> <meta property="og:site_name" content="Initium Lab"> <meta property="og:description" content="The Website of Initium Lab, the exploratory arm of Initium Media"> <meta name="twitter:card" content="summary"> <meta name="twitter:title" content="Initium Lab"> <meta name="twitter:description" content="The Website of Initium Lab, the exploratory arm of Initium Media">
No. Nearly seamless integration:
lines = !http get http://initiumlab.com
lines[0:10]
['<!doctype html>', '<html class="theme-next use-motion ">', '<head>', ' ', '', '<meta charset="UTF-8"/>', '<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1" />', '<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1"/>', '', '']
':' HTTP headers:
Referer:http://httpie.org Cookie:foo=bar User-Agent:bacon/1.0
'==' URL parameters to be appended to the request URI:
search==httpie
'=' Data fields to be serialized into a JSON object (with --json, -j)
or form data (with --form, -f):
name=HTTPie language=Python description='CLI HTTP client'
':=' Non-string JSON data fields (only with --json, -j):
awesome:=true amount:=42 colors:='["red", "green", "blue"]'
'@' Form file fields (only with --form, -f):
cs@~/Documents/CV.pdf
'=@' A data field like '=', but takes a file path and embeds its content:
essay=@Documents/essay.txt
':=@' A raw JSON field like ':=', but takes a file path and embeds its content:
package:=@./package.json
You can use a backslash to escape a colliding separator in the field name:
field-name-with\:colon=value
%%sh
http get 'http://httpbin.org/get' name==hupili at=='Scrape more with less codes!'
{ "args": { "at": "Scrape more with less codes!", "name": "hupili" }, "headers": { "Accept": "*/*", "Accept-Encoding": "gzip, deflate", "Host": "httpbin.org", "User-Agent": "HTTPie/0.9.2" }, "origin": "118.140.67.6", "url": "http://httpbin.org/get?at=Scrape+more+with+less+codes!&name=hupili" }
%%sh
http post 'http://httpbin.org/post' name==hupili at=='Scrape more with less codes!'
{ "args": { "at": "Scrape more with less codes!", "name": "hupili" }, "data": "", "files": {}, "form": {}, "headers": { "Accept": "*/*", "Accept-Encoding": "gzip, deflate", "Content-Length": "0", "Host": "httpbin.org", "User-Agent": "HTTPie/0.9.2" }, "json": null, "origin": "118.140.67.6", "url": "http://httpbin.org/post?at=Scrape+more+with+less+codes!&name=hupili" }
Caveats: --ignore-stdin
is required in IPython notebook
Not a problem in command-line env.
Related issues: https://github.com/jkbrzt/httpie/issues/150
%%sh
http --form --ignore-stdin post 'http://httpbin.org/post' name==hupili at='Scrape more with less codes!'
{ "args": { "name": "hupili" }, "data": "", "files": {}, "form": { "at": "Scrape more with less codes!" }, "headers": { "Accept": "*/*", "Accept-Encoding": "gzip, deflate", "Content-Length": "33", "Content-Type": "application/x-www-form-urlencoded; charset=utf-8", "Host": "httpbin.org", "User-Agent": "HTTPie/0.9.2" }, "json": null, "origin": "118.140.67.6", "url": "http://httpbin.org/post?name=hupili" }
Note: "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
%%sh
http --form --ignore-stdin post 'http://httpbin.org/post' name=hupili at='Scrape more with less codes!'
{ "args": {}, "data": "", "files": {}, "form": { "at": "Scrape more with less codes!", "name": "hupili" }, "headers": { "Accept": "*/*", "Accept-Encoding": "gzip, deflate", "Content-Length": "45", "Content-Type": "application/x-www-form-urlencoded; charset=utf-8", "Host": "httpbin.org", "User-Agent": "HTTPie/0.9.2" }, "json": null, "origin": "118.140.67.6", "url": "http://httpbin.org/post" }
Note: "Content-Type": "application/json",
%%sh
http --ignore-stdin post 'http://httpbin.org/post' name=hupili at='Scrape more with less codes!'
{ "args": {}, "data": "{\"name\": \"hupili\", \"at\": \"Scrape more with less codes!\"}", "files": {}, "form": {}, "headers": { "Accept": "application/json", "Accept-Encoding": "gzip, deflate", "Content-Length": "56", "Content-Type": "application/json", "Host": "httpbin.org", "User-Agent": "HTTPie/0.9.2" }, "json": { "at": "Scrape more with less codes!", "name": "hupili" }, "origin": "118.140.67.6", "url": "http://httpbin.org/post" }
%%sh
http get https://cerg1.ugc.edu.hk/cergprod/scrrm00541.jsp | head -n 10
<HTML> <HEAD> <script language="JavaScript" src="validation.js"></script> <META http-equiv="Content-Type" content="text/html; charset=UTF-8"> <META name="GENERATOR" content="IBM WebSphere Studio"> <META http-equiv="Content-Style-Type" content="text/css">
Exception ignored in: <_io.TextIOWrapper name='<stdout>' mode='w' encoding='UTF-8'> BrokenPipeError: [Errno 32] Broken pipe
%%sh
http get https://cerg1.ugc.edu.hk/cergprod/scrrm00541.jsp | grep 155
Needs to find out why it doesn't give us the links
Image('assets/rgc-search-network-trace.png')
Now use HTTPie to easily construct the query
%%sh
http post https://cerg1.ugc.edu.hk/cergprod/scrrm00541.jsp | grep 155
%%sh
http --ignore-stdin --form post 'https://cerg1.ugc.edu.hk/cergprod/scrrm00541.jsp' 'mode=search' 'sScheme=1' | grep 155
<A HREF="scrrm00541.jsp?subject=&panel=&sScheme=1&mode=search&sStatus=&subject=&proj_id=&Old_proj_id=&proj_title=&isname=&ioname=&institution=&Year=&pages=155">155</A>
html_lines = !http --ignore-stdin --form post 'https://cerg1.ugc.edu.hk/cergprod/scrrm00541.jsp' 'mode=search' 'sScheme=1'
html_lines_with_a = list(filter(lambda l: '<A' in l, html_lines))
html_lines_with_a[:5]
['\t<A HREF="scrrm00541.jsp?subject=&panel=&sScheme=1&mode=search&sStatus=&subject=&proj_id=&Old_proj_id=&proj_title=&isname=&ioname=&institution=&Year=&pages=2">[Next Page]</A>', '\t<A HREF="scrrm00541.jsp?subject=&panel=&sScheme=1&mode=search&sStatus=&subject=&proj_id=&Old_proj_id=&proj_title=&isname=&ioname=&institution=&Year=&pages=906">[Last Page]</A>', '\t<A HREF="scrrm00541.jsp?subject=&panel=&sScheme=1&mode=search&sStatus=&subject=&proj_id=&Old_proj_id=&proj_title=&isname=&ioname=&institution=&Year=&pages=2">2</A>', '\t<A HREF="scrrm00541.jsp?subject=&panel=&sScheme=1&mode=search&sStatus=&subject=&proj_id=&Old_proj_id=&proj_title=&isname=&ioname=&institution=&Year=&pages=3">3</A>', '\t<A HREF="scrrm00541.jsp?subject=&panel=&sScheme=1&mode=search&sStatus=&subject=&proj_id=&Old_proj_id=&proj_title=&isname=&ioname=&institution=&Year=&pages=4">4</A>']
A wrapper around pyQuery
-- a Python library that allows you to manipulate HTML in jQuery style.
%%sh
http --ignore-stdin --form post 'https://cerg1.ugc.edu.hk/cergprod/scrrm00541.jsp' 'mode=search' 'sScheme=1' \
| grep 155
<A HREF="scrrm00541.jsp?subject=&panel=&sScheme=1&mode=search&sStatus=&subject=&proj_id=&Old_proj_id=&proj_title=&isname=&ioname=&institution=&Year=&pages=155">155</A>
%%sh
http --ignore-stdin --form post 'https://cerg1.ugc.edu.hk/cergprod/scrrm00541.jsp' 'mode=search' 'sScheme=1' \
| grep '<a'
Ignore case
%%sh
http --ignore-stdin --form post 'https://cerg1.ugc.edu.hk/cergprod/scrrm00541.jsp' 'mode=search' 'sScheme=1' \
| grep -i '<a' | head -n 5
<A HREF="scrrm00541.jsp?subject=&panel=&sScheme=1&mode=search&sStatus=&subject=&proj_id=&Old_proj_id=&proj_title=&isname=&ioname=&institution=&Year=&pages=2">[Next Page]</A> <A HREF="scrrm00541.jsp?subject=&panel=&sScheme=1&mode=search&sStatus=&subject=&proj_id=&Old_proj_id=&proj_title=&isname=&ioname=&institution=&Year=&pages=906">[Last Page]</A> <A HREF="scrrm00541.jsp?subject=&panel=&sScheme=1&mode=search&sStatus=&subject=&proj_id=&Old_proj_id=&proj_title=&isname=&ioname=&institution=&Year=&pages=2">2</A> <A HREF="scrrm00541.jsp?subject=&panel=&sScheme=1&mode=search&sStatus=&subject=&proj_id=&Old_proj_id=&proj_title=&isname=&ioname=&institution=&Year=&pages=3">3</A> <A HREF="scrrm00541.jsp?subject=&panel=&sScheme=1&mode=search&sStatus=&subject=&proj_id=&Old_proj_id=&proj_title=&isname=&ioname=&institution=&Year=&pages=4">4</A>
Exception ignored in: <_io.TextIOWrapper name='<stdout>' mode='w' encoding='UTF-8'> BrokenPipeError: [Errno 32] Broken pipe
%%sh
http --ignore-stdin --form post 'https://cerg1.ugc.edu.hk/cergprod/scrrm00541.jsp' 'mode=search' 'sScheme=1' \
| grep -i '<a' | grep -o 'HREF=".*"' | head -n 5
HREF="scrrm00541.jsp?subject=&panel=&sScheme=1&mode=search&sStatus=&subject=&proj_id=&Old_proj_id=&proj_title=&isname=&ioname=&institution=&Year=&pages=2" HREF="scrrm00541.jsp?subject=&panel=&sScheme=1&mode=search&sStatus=&subject=&proj_id=&Old_proj_id=&proj_title=&isname=&ioname=&institution=&Year=&pages=906" HREF="scrrm00541.jsp?subject=&panel=&sScheme=1&mode=search&sStatus=&subject=&proj_id=&Old_proj_id=&proj_title=&isname=&ioname=&institution=&Year=&pages=2" HREF="scrrm00541.jsp?subject=&panel=&sScheme=1&mode=search&sStatus=&subject=&proj_id=&Old_proj_id=&proj_title=&isname=&ioname=&institution=&Year=&pages=3" HREF="scrrm00541.jsp?subject=&panel=&sScheme=1&mode=search&sStatus=&subject=&proj_id=&Old_proj_id=&proj_title=&isname=&ioname=&institution=&Year=&pages=4"
Exception ignored in: <_io.TextIOWrapper name='<stdout>' mode='w' encoding='UTF-8'> BrokenPipeError: [Errno 32] Broken pipe
%%sh
http --ignore-stdin --form post 'https://cerg1.ugc.edu.hk/cergprod/scrrm00541.jsp' 'mode=search' 'sScheme=1' \
| grep -i '<a' | grep -o 'HREF=".*"' | cut -d'"' -f2 | head -n 5
scrrm00541.jsp?subject=&panel=&sScheme=1&mode=search&sStatus=&subject=&proj_id=&Old_proj_id=&proj_title=&isname=&ioname=&institution=&Year=&pages=2 scrrm00541.jsp?subject=&panel=&sScheme=1&mode=search&sStatus=&subject=&proj_id=&Old_proj_id=&proj_title=&isname=&ioname=&institution=&Year=&pages=906 scrrm00541.jsp?subject=&panel=&sScheme=1&mode=search&sStatus=&subject=&proj_id=&Old_proj_id=&proj_title=&isname=&ioname=&institution=&Year=&pages=2 scrrm00541.jsp?subject=&panel=&sScheme=1&mode=search&sStatus=&subject=&proj_id=&Old_proj_id=&proj_title=&isname=&ioname=&institution=&Year=&pages=3 scrrm00541.jsp?subject=&panel=&sScheme=1&mode=search&sStatus=&subject=&proj_id=&Old_proj_id=&proj_title=&isname=&ioname=&institution=&Year=&pages=4
Exception ignored in: <_io.TextIOWrapper name='<stdout>' mode='w' encoding='UTF-8'> BrokenPipeError: [Errno 32] Broken pipe
%%sh
http --ignore-stdin --form post 'https://cerg1.ugc.edu.hk/cergprod/scrrm00541.jsp' 'mode=search' 'sScheme=1' \
| pquery 'a' | head -n 5
{'tag': 'a', 'text': '[Next Page]', 'html': '[Next Page]', 'href': 'scrrm00541.jsp?subject=&panel=&sScheme=1&mode=search&sStatus=&subject=&proj_id=&Old_proj_id=&proj_title=&isname=&ioname=&institution=&Year=&pages=2'} {'tag': 'a', 'text': '[Last Page]', 'html': '[Last Page]', 'href': 'scrrm00541.jsp?subject=&panel=&sScheme=1&mode=search&sStatus=&subject=&proj_id=&Old_proj_id=&proj_title=&isname=&ioname=&institution=&Year=&pages=906'} {'tag': 'a', 'text': '2', 'html': '2', 'href': 'scrrm00541.jsp?subject=&panel=&sScheme=1&mode=search&sStatus=&subject=&proj_id=&Old_proj_id=&proj_title=&isname=&ioname=&institution=&Year=&pages=2'} {'tag': 'a', 'text': '3', 'html': '3', 'href': 'scrrm00541.jsp?subject=&panel=&sScheme=1&mode=search&sStatus=&subject=&proj_id=&Old_proj_id=&proj_title=&isname=&ioname=&institution=&Year=&pages=3'} {'tag': 'a', 'text': '4', 'html': '4', 'href': 'scrrm00541.jsp?subject=&panel=&sScheme=1&mode=search&sStatus=&subject=&proj_id=&Old_proj_id=&proj_title=&isname=&ioname=&institution=&Year=&pages=4'}
Traceback (most recent call last): File "/Users/hupili/Dropbox/Desktop-iMAC-initium/project/workshop-easy-scraping/venv/bin/pquery", line 121, in <module> array_output(data) File "/Users/hupili/Dropbox/Desktop-iMAC-initium/project/workshop-easy-scraping/venv/bin/pquery", line 56, in array_output sys.stdout.write(str(i) + '\n') BrokenPipeError: [Errno 32] Broken pipe Exception ignored in: <_io.TextIOWrapper name='<stdout>' mode='w' encoding='UTF-8'> BrokenPipeError: [Errno 32] Broken pipe
%%sh
http --ignore-stdin --form post 'https://cerg1.ugc.edu.hk/cergprod/scrrm00541.jsp' 'mode=search' 'sScheme=1' \
| pquery 'a' -p href | head -n 5
scrrm00541.jsp?subject=&panel=&sScheme=1&mode=search&sStatus=&subject=&proj_id=&Old_proj_id=&proj_title=&isname=&ioname=&institution=&Year=&pages=2 scrrm00541.jsp?subject=&panel=&sScheme=1&mode=search&sStatus=&subject=&proj_id=&Old_proj_id=&proj_title=&isname=&ioname=&institution=&Year=&pages=906 scrrm00541.jsp?subject=&panel=&sScheme=1&mode=search&sStatus=&subject=&proj_id=&Old_proj_id=&proj_title=&isname=&ioname=&institution=&Year=&pages=2 scrrm00541.jsp?subject=&panel=&sScheme=1&mode=search&sStatus=&subject=&proj_id=&Old_proj_id=&proj_title=&isname=&ioname=&institution=&Year=&pages=3 scrrm00541.jsp?subject=&panel=&sScheme=1&mode=search&sStatus=&subject=&proj_id=&Old_proj_id=&proj_title=&isname=&ioname=&institution=&Year=&pages=4
Traceback (most recent call last): File "/Users/hupili/Dropbox/Desktop-iMAC-initium/project/workshop-easy-scraping/venv/bin/pquery", line 121, in <module> array_output(data) File "/Users/hupili/Dropbox/Desktop-iMAC-initium/project/workshop-easy-scraping/venv/bin/pquery", line 56, in array_output sys.stdout.write(str(i) + '\n') BrokenPipeError: [Errno 32] Broken pipe Exception ignored in: <_io.TextIOWrapper name='<stdout>' mode='w' encoding='UTF-8'> BrokenPipeError: [Errno 32] Broken pipe
%%sh
http --ignore-stdin --form post 'https://cerg1.ugc.edu.hk/cergprod/scrrm00541.jsp' 'mode=search' 'sScheme=1' \
| pquery 'a' -p href | wc -l
907
Image('assets/rgc-index-list.png')
%%sh
http --ignore-stdin --form post 'https://cerg1.ugc.edu.hk/cergprod/scrrm00541.jsp' 'mode=search' 'sScheme=1' \
| pquery "table td[align='right'] a" -p href | wc -l
907
Scrape the info of 60 data science books and visualise their connection: http://www.kdnuggets.com/2015/09/free-data-science-books.html
%%sh
http --body 'http://www.kdnuggets.com/2015/09/free-data-science-books.html' |\
pquery '.three_ul li strong a' -f '"{text}",{href}' |\
head -n 8
"An Introduction to Data Science",https://docs.google.com/file/d/0B6iefdnF22XQeVZDSkxjZ0Z5VUE/edit?pli=1 "School of Data Handbook",http://schoolofdata.org/handbook/ "Data Jujitsu: The Art of Turning Data into Product",http://www.oreilly.com/data/free/data-jujitsu.csp "The Data Science Handbook",http://www.thedatasciencehandbook.com/#get-the-book "The Data Analytics Handbook",https://www.teamleada.com/handbook "Data Driven: Creating a Data Culture",http://www.oreilly.com/data/free/data-driven.csp "Building Data Science Teams",http://www.oreilly.com/data/free/building-data-science-teams.csp "Understanding the Chief Data Officer",http://www.oreilly.com/data/free/files/understanding-chief-data-officer.pdf
Exception ignored in: <_io.TextIOWrapper name='<stdout>' mode='w' encoding='UTF-8'> BrokenPipeError: [Errno 32] Broken pipe
Image('assets/data-science-books-graph.png')
%%sh
http --ignore-stdin --form post 'https://cerg1.ugc.edu.hk/cergprod/scrrm00541.jsp' 'mode=search' 'sScheme=1' \
| pquery "table td[align='right'] a" -p href > path-list.txt
Next, let's download them all
%%sh
tail -n 1 path-list.txt | xargs -I{} http "https://cerg1.ugc.edu.hk/cergprod/{}" \
| pquery 'table.styleTableContent' -p html | head -n 10
<tr class="styleTableHeader"> <td nowrap="nowrap" width="10%" align="center"><b>Project Number</b></td> <td nowrap="nowrap" width="50%" align="center"><b>Project Title</b></td> <td nowrap="nowrap" width="15%" align="center"><b>Principal Investigator</b></td> <td nowrap="nowrap" width="15%" align="center"><b>Status</b></td> </tr> <br/> <br/>
Exception ignored in: <_io.TextIOWrapper name='<stdout>' mode='w' encoding='UTF-8'> BrokenPipeError: [Errno 32] Broken pipe
%%sh
tail -n 1 path-list.txt | xargs -I{} http "https://cerg1.ugc.edu.hk/cergprod/{}" \
| pquery 'table.styleTableContent' -p html | pquery 'td' -p text
Revolution, Commercialism and Chineseness: The Reception and Appropriation of the Socialist Opera Films in Captialist-Colonial Hong Kong, 1954-1966 Dr Hui, Kwok Wai On-going Age Differences in Cognitive Control and Daily Control Strategies and Emotional Experiences: Implications on Physical and Emotional Health Dr Hou, Wai Kai On-going The Chinese Healthcare Reform in Provincial Perspective: A Comparative Study of Fujian and Shanxi Dr He, Jingwei Alex On-going The identification, abundance and sources of microplastics in the fluvial, littoral and marine environments of Hong Kong Dr Fok, Lincoln On-going Decoding the Role and Efficacy of Verbal Imagery in the Teaching and Learning of Singing: Case Studies in Greater China towards a Holistic Approach Dr Chen, Ti Wei On-going Linguistic Analysis of Mid-20th Century Hong Kong Cantonese by Constructing an Annotated Spoken Corpus Dr Chin, Chi On On-going Chinese morality: When propriety is part of the picture, what does morality mean? Testing and extending moral theory to fit lay concepts of a Confucian moral system Dr Buchtel, Emma Ellen Kathrina On-going
%time page_lines = !tail -n 10 path-list.txt | xargs -I{} http "https://cerg1.ugc.edu.hk/cergprod/{}"
CPU times: user 84.1 ms, sys: 51.5 ms, total: 136 ms Wall time: 7.45 s
%time page_lines = !tail -n 10 path-list.txt | xargs -I{} -P5 http "https://cerg1.ugc.edu.hk/cergprod/{}"
CPU times: user 78.4 ms, sys: 47.4 ms, total: 126 ms Wall time: 4.36 s
A) My early dirty work: https://github.com/hupili/Lightweight-Distributing-Toolset
In Perl. 4 years ago. Do not use
B) GNU Parallel: http://www.gnu.org/software/parallel/
Written in Perl. Only need SSH access to remote (or local machine)
Cool, but...
C) PSSH: https://code.google.com/p/parallel-ssh/
%%file hosts
localhost
Overwriting hosts
%%sh
cat hosts
localhost
%%sh
pssh -h hosts -o output/ 'echo hello PSSH'
[1] 14:14:43 [SUCCESS] localhost
%%sh
ls output/
localhost
%%sh
cat output/localhost
hello PSSH
Easier for tabular data.
%%sh
tail -n 1 path-list.txt | xargs -I{} http "https://cerg1.ugc.edu.hk/cergprod/{}" \
| pquery 'table.styleTableContent' -p html | head -n 5
<tr class="styleTableHeader"> <td nowrap="nowrap" width="10%" align="center"><b>Project Number</b></td> <td nowrap="nowrap" width="50%" align="center"><b>Project Title</b></td> <td nowrap="nowrap" width="15%" align="center"><b>Principal Investigator</b></td>
Exception ignored in: <_io.TextIOWrapper name='<stdout>' mode='w' encoding='UTF-8'> BrokenPipeError: [Errno 32] Broken pipe
table_html = !tail -n 1 path-list.txt | xargs -I{} http "https://cerg1.ugc.edu.hk/cergprod/{}" | pquery 'table.styleTableContent' -p html
import pandas as pd
df_projects = pd.read_html('<table>%s</table>' % '\n'.join(table_html))
df_projects[0]
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | Project Number | Project Title | Principal Investigator | Status |
1 | NaN | Revolution, Commercialism and Chineseness: The... | Dr Hui, Kwok Wai | On-going |
2 | NaN | Age Differences in Cognitive Control and Daily... | Dr Hou, Wai Kai | On-going |
3 | NaN | The Chinese Healthcare Reform in Provincial Pe... | Dr He, Jingwei Alex | On-going |
4 | NaN | The identification, abundance and sources of m... | Dr Fok, Lincoln | On-going |
5 | NaN | Decoding the Role and Efficacy of Verbal Image... | Dr Chen, Ti Wei | On-going |
6 | NaN | Linguistic Analysis of Mid-20th Century Hong K... | Dr Chin, Chi On | On-going |
7 | NaN | Chinese morality: When propriety is part of th... | Dr Buchtel, Emma Ellen Kathrina | On-going |
We use a version ported to Python3:
https://github.com/hyperlinkapp/python-readability
(already included in the requirements.txt
file)
from readability.readability import Document
import requests
# Download one blog post and run python-readability over it
html = requests.get('http://initiumlab.com/blog/20150922-jackathon3-review/').content
readable_article = Document(html).summary()  # extracted main-body HTML
readable_title = Document(html).short_title()  # cleaned-up page title
# Show the first 1000 characters of the extracted article HTML
print(readable_article[:1000])
<html><body><div><span itemprop="articleBody"><video controls="" poster="../../blog/20150922-jackathon3-review/jackathon3-timelapse-poster.png"><br/> <source src="../../blog/20150922-jackathon3-review/jackathon3-timelapse.mp4" type="video/mp4"><br/> <source src="../../blog/20150922-jackathon3-review/jackathon3-timelapse.webm" type="video/webm"><br/> Sorry, you browser does not support HTML5 video.<br/></source></source></video> <p>The video is also available on <a href="https://youtu.be/zFeSh2W1_C8" target="_blank" rel="external">YouTube</a> and <a href="http://v.youku.com/v_show/id_XMTM0MzM1MjEwMA==.html?from=y1.7-2" target="_blank" rel="external">Youku</a>.</p> <h2 id="What_did_we_do?">What did we do?</h2><p>Jackathon is short for “Journalism-Hackathon”. At Initium Lab, we aim to push limits of Journalism with Technology. We hold regular Jackathons to advance our knowledge and skills in using new technology for obtaining data, analysing information, and reporting.</p> <p>This wee
HTML(readable_article[:1000])
The video is also available on YouTube and Youku.
Jackathon is short for “Journalism-Hackathon”. At Initium Lab, we aim to push limits of Journalism with Technology. We hold regular Jackathons to advance our knowledge and skills in using new technology for obtaining data, analysing information, and reporting.
This wee
Let's fix the above URL problems
import pyquery
r = pyquery.PyQuery(readable_article)
r('p')
[<p>, <p>, <p>, <p>, <p>, <p>, <p>, <p>, <p>, <p>, <p>, <p>, <p>, <p>, <p>, <p>, <p>]
r('video').attr('poster')
'../../blog/20150922-jackathon3-review/jackathon3-timelapse-poster.png'
r('video source').attr('src')
'../../blog/20150922-jackathon3-review/jackathon3-timelapse.mp4'
r('video').attr('poster', 'http://initiumlab.com/%s' % r('video').attr('poster'))
[<video>]
r('video').attr('poster')
'http://initiumlab.com/../../blog/20150922-jackathon3-review/jackathon3-timelapse-poster.png'
r('video source').attr('src', 'http://initiumlab.com/%s' % r('video source').attr('src'))
[<source>, <source>]
r('video source').attr('src')
'http://initiumlab.com/../../blog/20150922-jackathon3-review/jackathon3-timelapse.mp4'
r.html()[:1000]
'<body><div><span itemprop="articleBody"><video controls="" poster="http://initiumlab.com/../../blog/20150922-jackathon3-review/jackathon3-timelapse-poster.png"><br/> <source src="http://initiumlab.com/../../blog/20150922-jackathon3-review/jackathon3-timelapse.mp4" type="video/mp4"><br/> <source src="http://initiumlab.com/../../blog/20150922-jackathon3-review/jackathon3-timelapse.mp4" type="video/webm"><br/> Sorry, you browser does not support HTML5 video.<br/></source></source></video>\n\n<p>The video is also available on <a href="https://youtu.be/zFeSh2W1_C8" target="_blank" rel="external">YouTube</a> and <a href="http://v.youku.com/v_show/id_XMTM0MzM1MjEwMA==.html?from=y1.7-2" target="_blank" rel="external">Youku</a>.</p>\n<h2 id="What_did_we_do?">What did we do?</h2><p>Jackathon is short for “Journalism-Hackathon”. At Initium Lab, we aim to push limits of Journalism with Technology. We hold regular Jackathons to advance our knowledge and skills in using new technology for obtaining '
%%javascript
//IPython.OutputArea.auto_scroll_threshold = 9999;
IPython.OutputArea.prototype._should_scroll = function(){return false;}
HTML(r.html()[:1000])
from scrapely import Scraper
s = Scraper()
help(s.train)
Help on method train in module scrapely: train(url, data, encoding=None) method of scrapely.Scraper instance
from urllib import parse
def get_localhost_url(url):
    """Download *url* into tmp/ and return a localhost URL serving the copy.

    The page is cached under tmp/ with a percent-encoded filename so the
    Jupyter file server ("/files/...") can serve it back (e.g. to scrapely).

    Parameters: url -- the remote page to cache.
    Returns: a "http://localhost:8888/files/..." URL for the cached file.
    """
    filename = parse.quote_plus(url)
    fullpath = 'tmp/%s' % filename
    html = requests.get(url).content
    # Use a context manager so the file handle is flushed and closed
    # deterministically (the original open(...).write(...) leaked it).
    with open(fullpath, 'wb') as f:
        f.write(html)
    return 'http://localhost:8888/files/%s?download=1' % parse.quote_plus(fullpath)
training_url = 'http://initiumlab.com/blog/20150916-legco-eng/'
training_data = {'title': 'Legco Matrix Brief (English)',
'author': 'Initium Lab',
'date': '2015-09-16'}
s.train(get_localhost_url(training_url), training_data)
testing_url = 'http://initiumlab.com/blog/20150901-data-journalism-for-the-blind/'
s.scrape(get_localhost_url(testing_url))
[{'date': ['\n 2015-09-01\n '], 'title': ['\n \n \n \n 可視化火了 盲人怎麼辦\n \n \n ']}]
testing_url = 'http://initiumlab.com/blog/20150922-jackathon3-review/'
s.scrape(get_localhost_url(testing_url))
[{'author': ['Initium Lab'], 'date': ['\n 2015-09-22\n '], 'title': ['\n \n \n \n Jackathon #3 -- Read a data science book in 8 hours\n \n \n ']}]
testing_url = 'http://initiumlab.com/blog/20151015-3d-infographic-user-testing/'
s.scrape(get_localhost_url(testing_url))
[{'date': ['\n 2015-10-15\n '], 'title': ['\n \n \n \n Infographic for the Blind: We Tried 3D Printing That Almost Worked\n \n \n ']}]
blogs = !http get http://initiumlab.com/blog/ | pquery 'a.post-title-link' -p href
blogs
['../blog/20151025-jackathon-no-5/', '../blog/20151015-3d-infographic-user-testing/', '../blog/20151015-Facebook-Signal-Review/', '../blog/20151012-visualization-via-jobs/', '../blog/20151012-what-is-colour/', '../blog/20151005-read-journalism/', '../blog/20150930-google-sheets-explore/', '../blog/20150925-react-in-1-hour-cuhk/', '../blog/20150922-jackathon3-review/', '../blog/20150916-legco-eng/']
# Scrape every blog post with the trained scrapely scraper `s`;
# each s.scrape(...) returns a list of extracted records, so extend().
infos = []
for b in blogs:
    infos.extend(s.scrape(get_localhost_url('http://initiumlab.com/blog/' + b)))
# Display the collected records
infos
[{'author': ['Initium Lab'], 'date': ['\n 2015-10-25\n '], 'title': ['\n \n \n \n Jackathon #5 -- Read a journalism book in 8 hours\n \n \n ']}, {'date': ['\n 2015-10-15\n '], 'title': ['\n \n \n \n Infographic for the Blind: We Tried 3D Printing That Almost Worked\n \n \n ']}, {'date': ['\n 2015-10-15\n '], 'title': ['\n \n \n \n 在Facebook找新聞線索?FB Signal搶鮮試用\n \n \n ']}, {'date': ['\n 2015-10-12\n '], 'title': ['\n \n \n \n 一張圖讀懂喬布斯數據化妝術\n \n \n ']}, {'date': ['\n 2015-10-12\n '], 'title': ['\n \n \n \n 數據新聞人,今夜我們談色\n \n \n ']}, {'author': ['Initium Lab'], 'date': ['\n 2015-10-05\n '], 'title': ['\n \n \n \n Jackathon #5: Read Journalism\n \n \n ']}, {'author': ['Chao Tianyi'], 'date': ['\n 2015-09-30\n '], 'title': ['\n \n \n \n 整日做表沒思路?Google幫你開腦洞\n \n \n ']}, {'author': ['Initium Lab'], 'date': ['\n 2015-09-25\n '], 'title': ['\n \n \n \n React in One Hour\n \n \n ']}, {'author': ['Initium Lab'], 'date': ['\n 2015-09-22\n '], 'title': ['\n \n \n \n Jackathon #3 -- Read a data science book in 8 hours\n \n \n ']}, {'author': ['Initium Lab'], 'date': ['\n 2015-09-16\n '], 'title': ['\n \n \n \n Legco Matrix Brief (English)\n \n \n ']}]
import pandas as pd
# Tabulate the scraped records
df_blogs = pd.DataFrame(infos)
# Titles come back as single-element lists with whitespace padding -- unwrap and strip
df_blogs['title'] = df_blogs['title'].apply(lambda x: x[0].strip())
df_blogs
author | date | title | |
---|---|---|---|
0 | [Initium Lab] | [\n 2015-10-25\n ] | Jackathon #5 -- Read a journalism book in 8 hours |
1 | NaN | [\n 2015-10-15\n ] | Infographic for the Blind: We Tried 3D Printin... |
2 | NaN | [\n 2015-10-15\n ] | 在Facebook找新聞線索?FB Signal搶鮮試用 |
3 | NaN | [\n 2015-10-12\n ] | 一張圖讀懂喬布斯數據化妝術 |
4 | NaN | [\n 2015-10-12\n ] | 數據新聞人,今夜我們談色 |
5 | [Initium Lab] | [\n 2015-10-05\n ] | Jackathon #5: Read Journalism |
6 | [Chao Tianyi] | [\n 2015-09-30\n ] | 整日做表沒思路?Google幫你開腦洞 |
7 | [Initium Lab] | [\n 2015-09-25\n ] | React in One Hour |
8 | [Initium Lab] | [\n 2015-09-22\n ] | Jackathon #3 -- Read a data science book in 8 ... |
9 | [Initium Lab] | [\n 2015-09-16\n ] | Legco Matrix Brief (English) |
Theme: scrape more with less codes
Keywords: quick and dirty hacks
Environment:
Human friendly HTTP interface:
HTTPie
requests
Scale-out
xargs -P
pssh
Manual parse:
pQuery
pyQuery (for FE people)
pandas (useful for tabular data)
Automatic parse, in Python REPL:
PyReadability: extracts the main body of a page
scrapely: learns patterns from your labelling