Author: Pili Hu
GitHub repo: https://github.com/hupili/legcohk
Related notebooks:
Compared with the above notebooks, this repo contains more compact notes covering the whole data mining flow -- from data collection to final visualization. Interpretations will not be provided in the notes directly. If you are interested, you can dump your thoughts on the issue tracker. You can also request other features there.
import requests
import pylab as pl
from pyquery import PyQuery as pq
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
import scipy
import pandas as pd
from lxml import etree
from functools import reduce
# Use a CJK-capable sans-serif font stack so Chinese member names render in plots.
matplotlib.rc('font', **{'sans-serif' : 'Helvetica, LiHei Pro, sans-serif', #'Arial',
'family' : 'sans-serif'})
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# Index pages listing LegCo council meetings for the 2012-2015 sessions.
seed_pages = [
    'http://www.legco.gov.hk/general/english/counmtg/yr12-16/mtg_1213.htm',
    'http://www.legco.gov.hk/general/english/counmtg/yr12-16/mtg_1314.htm',
    'http://www.legco.gov.hk/general/english/counmtg/yr12-16/mtg_1415.htm',
]
def crawl_seed(seed):
    """Scrape one index page and return anchor names of the form 'cm20YYMMDD'.

    NOTE(review): this relies on PyQuery's ``map`` dropping callback results
    that are ``None``, so anchors without a ``name`` attribute disappear
    before the prefix filter runs -- confirm against the pyquery docs.
    """
    doc = pq(seed)
    names = doc('a').map(lambda idx, el: el.attrib.get('name', None))
    return names.filter(lambda idx, name: name.startswith('cm20'))
# Concatenate the per-page results into one flat list of meeting ids.
# NOTE(review): relies on crawl_seed's return value supporting list
# concatenation via list.__add__ -- confirm (PyQuery objects subclass list).
meetings = reduce(list.__add__, map(crawl_seed, seed_pages), [])
print(meetings)
['cm20121010', 'cm20121017', 'cm20121024', 'cm20121031', 'cm20121101', 'cm20121107', 'cm20121114', 'cm20121121', 'cm20121128', 'cm20121205', 'cm20121210', 'cm20121212', 'cm20121219', 'cm20130109', 'cm20130116', 'cm20130117', 'cm20130123', 'cm20130130', 'cm20130206', 'cm20130220', 'cm20130227', 'cm20130320', 'cm20130327', 'cm20130417', 'cm20130424', 'cm20130508', 'cm20130509', 'cm20130515', 'cm20130522', 'cm20130529', 'cm20130605', 'cm20130619', 'cm20130626', 'cm20130703', 'cm20130710', 'cm20130711', 'cm20130717', 'cm20131009', 'cm20131016', 'cm20131017', 'cm20131023', 'cm20131030', 'cm20131106', 'cm20131113', 'cm20131120', 'cm20131127', 'cm20131204', 'cm20131211', 'cm20131218', 'cm20140108', 'cm20140115', 'cm20140116', 'cm20140122', 'cm20140212', 'cm20140219', 'cm20140226', 'cm20140319', 'cm20140326', 'cm20140409', 'cm20140416', 'cm20140430', 'cm20140507', 'cm20140514', 'cm20140521', 'cm20140522', 'cm20140528', 'cm20140604', 'cm20140611', 'cm20140618', 'cm20140625', 'cm20140702', 'cm20140703', 'cm20140709', 'cm20141008', 'cm20141015', 'cm20141016', 'cm20141022', 'cm20141029', 'cm20141105', 'cm20141112', 'cm20141120', 'cm20141126', 'cm20141203', 'cm20141210', 'cm20141217', 'cm20150107', 'cm20150114', 'cm20150115', 'cm20150121', 'cm20150128', 'cm20150204', 'cm20150211', 'cm20150225', 'cm20150318', 'cm20150325', 'cm20150326', 'cm20150415', 'cm20150422', 'cm20150429', 'cm20150506', 'cm20150513', 'cm20150520', 'cm20150527', 'cm20150603', 'cm20150610', 'cm20150617', 'cm20150624', 'cm20150708']
from IPython.core.display import clear_output
import sys
def crawl_xml(meeting):
# This logic is translated from the official JS code
yy, mm, dd = map(lambda i: int(meeting[i:(i + 2)]), [4, 6, 8])
if mm >= 10:
yr = 'yr%02d-%02d' % (yy, yy + 1)
else:
yr = 'yr%02d-%02d' % (yy - 1, yy)
prefix = 'http://www.legco.gov.hk'
url = '%(prefix)s/%(yr)s/chinese/counmtg/voting/cm_vote_20%(yy)02d%(mm)02d%(dd)02d.xml' % locals()
return requests.get(url)
# Download every meeting's voting XML, refreshing a simple text progress bar.
vote_xmls = []
for meeting in meetings:
    vote_xmls.append(crawl_xml(meeting))
    clear_output()
    done = len(vote_xmls)
    print('progress: %s/%s %s' % (done, len(meetings), '#' * done))
    sys.stdout.flush()
progress: 108/108 ############################################################################################################
# Drop failed requests (404 etc. -- not every meeting has a vote XML),
# then reduce each response to its raw XML payload.
vote_xmls = [r for r in vote_xmls if r.ok]
vote_xmls = [r.content for r in vote_xmls]
print(len(vote_xmls))
69
# Metadata fields extracted for each vote, useful for reviewing the result.
info_fields = [
    'vote-date',
    'vote-time',
    'motion-en',
    'mover-en',
    'mover-type',
    'vote-separate-mechanism',
]
def xml_to_records(xml):
    """Parse one voting XML document into flat (topic, member, vote, *info) tuples."""
    doc = etree.XML(xml)
    out = []
    for vote_node in doc.xpath('//legcohk-vote/meeting/vote'):
        info = [vote_node.xpath(f)[0].text for f in info_fields]
        date = info[0]
        # Topic id = vote date plus the vote's sequence number within the meeting.
        topic_id = '%s-%s' % (date, vote_node.attrib['number'])
        for member in vote_node.xpath('individual-votes/member'):
            # Chinese name used as member id (this targets a local blog post);
            # 'name-en' would be the English alternative for simplicity.
            member_id = member.attrib['name-ch']
            decision = member.xpath('vote')[0].text
            out.append((topic_id, member_id, decision) + tuple(info))
    return out
records = reduce(list.__add__, map(xml_to_records, vote_xmls), [])
# Sanity check: the same member appears under two near-identical spellings
# (強 vs 强); by the vote counts they seem to be the same person.
rec_guo_1 = [(rec[0], rec[1]) for rec in records if rec[1] == '郭偉強']
rec_guo_2 = [(rec[0], rec[1]) for rec in records if rec[1] == '郭偉强']
print(len(rec_guo_1))
print(len(rec_guo_2))
print(len(rec_guo_1) + len(rec_guo_2))
-c:1: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal -c:2: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
0 0 0
# Canonical-name fixes for members appearing under multiple spellings.
# More background:
# http://nbviewer.ipython.org/urls/course.ie.cuhk.edu.hk/~engg4030/tutorial/tutorial7/Legco-Preprocessing.ipynb
NAME_FIX = {
    'Dr Joseph LEE': 'Prof Joseph LEE',
    '郭偉强': '郭偉強',
}
def clean_record(t):
    """Return the record tuple with the member name normalized via NAME_FIX."""
    fields = list(t)
    # Field 1 is the member id; map known aliases to the canonical spelling.
    fields[1] = NAME_FIX.get(fields[1], fields[1])
    # Additional normalization rules, if any, would go here.
    return tuple(fields)
# Normalize every record and persist the full table (with motion metadata).
records = [clean_record(rec) for rec in records]
df = pd.DataFrame(data=records, columns=['topic_id', 'member_id', 'vote'] + info_fields)
df.to_csv('records-all-with-info.csv', encoding='utf-8')
df[:5]
topic_id | member_id | vote | vote-date | vote-time | motion-en | mover-en | mover-type | vote-separate-mechanism | |
---|---|---|---|---|---|---|---|---|---|
0 | 17/10/2012-1 | 曾鈺成 | Present | 17/10/2012 | 19:37:53 | AMENDMENT BY DR HON KENNETH CHAN TO HON IP KIN... | Dr Kenneth CHAN | Member | Yes |
1 | 17/10/2012-1 | 何俊仁 | Yes | 17/10/2012 | 19:37:53 | AMENDMENT BY DR HON KENNETH CHAN TO HON IP KIN... | Dr Kenneth CHAN | Member | Yes |
2 | 17/10/2012-1 | 李卓人 | Yes | 17/10/2012 | 19:37:53 | AMENDMENT BY DR HON KENNETH CHAN TO HON IP KIN... | Dr Kenneth CHAN | Member | Yes |
3 | 17/10/2012-1 | 涂謹申 | Yes | 17/10/2012 | 19:37:53 | AMENDMENT BY DR HON KENNETH CHAN TO HON IP KIN... | Dr Kenneth CHAN | Member | Yes |
4 | 17/10/2012-1 | 陳鑑林 | No | 17/10/2012 | 19:37:53 | AMENDMENT BY DR HON KENNETH CHAN TO HON IP KIN... | Dr Kenneth CHAN | Member | Yes |
# Keep only the core columns needed for the vote matrix and save a slim CSV.
core_cols = ['topic_id', 'member_id', 'vote']
df = df[core_cols]
df.to_csv('records-all.csv', encoding='utf-8')
df[:5]
topic_id | member_id | vote | |
---|---|---|---|
0 | 17/10/2012-1 | 曾鈺成 | Present |
1 | 17/10/2012-1 | 何俊仁 | Yes |
2 | 17/10/2012-1 | 李卓人 | Yes |
3 | 17/10/2012-1 | 涂謹申 | Yes |
4 | 17/10/2012-1 | 陳鑑林 | No |
# Quick dataset summary: unique topics, unique members, total records.
for label, series in (('topics', df['topic_id']), ('members', df['member_id'])):
    print('total # of %s:' % label, len(series.unique()))
print('total # of records:', len(df))
('total # of topics:', 2310) ('total # of members:', 71) ('total # of records:', 161700)
# The distinct vote categories present in the data.
print(df['vote'].unique())
['Present' 'Yes' 'No' 'Absent' 'Abstain']
# All member names (Chinese) after the NAME_FIX normalization.
print(df['member_id'].unique())
[u'\u66fe\u923a\u6210' u'\u4f55\u4fca\u4ec1' u'\u674e\u5353\u4eba' u'\u6d82\u8b39\u7533' u'\u9673\u9451\u6797' u'\u6881\u8000\u5fe0' u'\u5289\u7687\u767c' u'\u5289\u6167\u537f' u'\u8b5a\u8000\u5b97' u'\u77f3\u79ae\u8b19' u'\u5f35\u5b87\u4eba' u'\u99ae\u6aa2\u57fa' u'\u65b9\u525b' u'\u738b\u570b\u8208' u'\u674e\u570b\u9e9f' u'\u6797\u5065\u92d2' u'\u6881\u541b\u5f65' u'\u9ec3\u5b9a\u5149' u'\u6e6f\u5bb6\u9a4a' u'\u4f55\u79c0\u862d' u'\u674e\u6167\u743c' u'\u6797\u5927\u8f1d' u'\u9673\u514b\u52e4' u'\u9673\u5065\u6ce2' u'\u6881\u7f8e\u82ac' u'\u6881\u5bb6\u9a2e' u'\u5f35\u570b\u67f1' u'\u9ec3\u570b\u5065' u'\u8449\u570b\u8b19' u'\u8449\u5289\u6dd1\u5100' u'\u8b1d\u5049\u4fca' u'\u6881\u5bb6\u5091' u'\u6881\u570b\u96c4' u'\u9673\u5049\u696d' u'\u9ec3\u6bd3\u6c11' u'\u6bdb\u5b5f\u975c' u'\u7530\u5317\u8fb0' u'\u7530\u5317\u4fca' u'\u5433\u4eae\u661f' u'\u4f55\u4fca\u8ce2' u'\u6613\u5fd7\u660e' u'\u80e1\u5fd7\u5049' u'\u59da\u601d\u69ae' u'\u8303\u570b\u5a01' u'\u99ac\u9022\u570b' u'\u83ab\u4e43\u5149' u'\u9673\u5fd7\u5168' u'\u9673\u6052\u944c' u'\u9673\u5bb6\u6d1b' u'\u9673\u5a49\u5afb' u'\u6881\u5fd7\u7965' u'\u6881\u7e7c\u660c' u'\u9ea5\u7f8e\u5a1f' u'\u90ed\u5bb6\u9e92' u'\u90ed\u5049\u5f37' u'\u90ed\u69ae\u93d7' u'\u5f35\u83ef\u5cf0' u'\u5f35\u8d85\u96c4' u'\u55ae\u4ef2\u5055' u'\u9ec3\u78a7\u96f2' u'\u8449\u5efa\u6e90' u'\u845b\u73ee\u5e06' u'\u5ed6\u9577\u6c5f' u'\u6f58\u5146\u5e73' u'\u9127\u5bb6\u5f6a' u'\u8523\u9e97\u82b8' u'\u76e7\u5049\u570b' u'\u937e\u570b\u658c' u'\u937e\u6a39\u6839' u'\u8b1d\u5049\u9293' u'\u90ed\u5049\u5f3a']
# All topic ids (vote date + sequence number).
print(df['topic_id'].unique())
['17/10/2012-1' '17/10/2012-2' '17/10/2012-3' ..., '13/02/2015-7' '13/02/2015-8' '13/02/2015-9']
# Spot-check one member's vote distribution (the President, who mostly abstains as 'Present').
df[df['member_id'] == '曾鈺成']['vote'].value_counts()
/usr/local/lib/python2.7/site-packages/pandas/core/ops.py:558: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal result = lib.scalar_compare(x, y, op)
Series([], dtype: int64)
# Leader board: for each vote category, the five members who cast it most often.
board_pos = pd.DataFrame(index=range(0, 5))
for v in df['vote'].unique():
    # DataFrame.sort() was removed in pandas 0.20; sort_values is the
    # supported equivalent.
    count = (df[df['vote'] == v]
             .groupby('member_id').count()
             .sort_values('vote', ascending=False)['vote'])
    count = count.reset_index()[:5]
    # Materialize zip() -- it is a lazy iterator on Python 3 -- so the Series
    # gets concrete (member, count) tuples.
    board_pos[v] = pd.Series(list(zip(count['member_id'], count['vote'])),
                             index=range(0, 5))
board_pos
Present | Yes | No | Absent | Abstain | |
---|---|---|---|---|---|
0 | (曾鈺成, 2284) | (梁國雄, 1682) | (盧偉國, 1919) | (梁家騮, 2007) | (范國威, 426) |
1 | (張國柱, 794) | (陳偉業, 1342) | (譚耀宗, 1861) | (梁耀忠, 1881) | (馬逢國, 121) |
2 | (劉慧卿, 718) | (陳志全, 1331) | (何俊賢, 1861) | (湯家驊, 1881) | (李國麟, 115) |
3 | (何秀蘭, 708) | (莫乃光, 453) | (黃定光, 1851) | (涂謹申, 1822) | (葉國謙, 112) |
4 | (單仲偕, 702) | (黃毓民, 411) | (張華峰, 1807) | (葉建源, 1813) | (何俊賢, 110) |
# Reverse leader board: for each vote category, the five members who cast it least often.
board_neg = pd.DataFrame(index=range(0, 5))
for v in df['vote'].unique():
    # DataFrame.sort() was removed in pandas 0.20; sort_values is the
    # supported equivalent.
    count = (df[df['vote'] == v]
             .groupby('member_id').count()
             .sort_values('vote', ascending=True)['vote'])
    count = count.reset_index()[:5]
    # Materialize zip() -- it is a lazy iterator on Python 3 -- so the Series
    # gets concrete (member, count) tuples.
    board_neg[v] = pd.Series(list(zip(count['member_id'], count['vote'])),
                             index=range(0, 5))
board_neg
Present | Yes | No | Absent | Abstain | |
---|---|---|---|---|---|
0 | (葉國謙, 1) | (石禮謙, 95) | (涂謹申, 64) | (曾鈺成, 26) | (郭榮鏗, 13) |
1 | (梁耀忠, 1) | (劉皇發, 106) | (湯家驊, 67) | (潘兆平, 32) | (方剛, 13) |
2 | (田北俊, 1) | (梁家騮, 129) | (馮檢基, 70) | (譚耀宗, 50) | (郭偉强, 14) |
3 | (吳亮星, 1) | (郭偉強, 131) | (何俊仁, 71) | (郭偉強, 50) | (劉皇發, 15) |
4 | (陳志全, 1) | (林大輝, 132) | (葉建源, 72) | (何俊賢, 53) | (馮檢基, 16) |
# Pivot the flat records into a member x topic matrix of raw vote strings.
df_matrix = pd.DataFrame(index=df['member_id'].unique())
for topic, group in df.groupby('topic_id'):
    df_matrix[topic] = group.set_index('member_id')['vote']
df_matrix[:5]
01/02/2013-1 | 01/02/2013-2 | 01/02/2013-3 | 01/02/2013-4 | 01/02/2013-5 | 01/02/2013-6 | 01/02/2013-7 | 01/02/2013-8 | 03/07/2013-1 | 03/07/2013-10 | ... | 31/10/2012-2 | 31/10/2012-3 | 31/10/2012-4 | 31/10/2012-5 | 31/10/2012-6 | 31/10/2012-7 | 31/10/2012-8 | 31/10/2012-9 | 31/10/2014-38 | 31/10/2014-39 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
曾鈺成 | Present | Present | Present | Present | Present | Present | Present | Present | Present | Present | ... | Present | Present | Present | Present | Present | Present | Present | Present | Present | Present |
何俊仁 | Yes | Yes | Yes | Yes | No | Yes | Yes | No | Yes | Yes | ... | Yes | Yes | Yes | Yes | Yes | Yes | No | No | Yes | No |
李卓人 | Yes | Yes | Yes | Yes | Yes | Yes | Yes | No | Yes | Yes | ... | Absent | Absent | Absent | Absent | Absent | Absent | Absent | Absent | Yes | No |
涂謹申 | Yes | Yes | Yes | Yes | No | Yes | Yes | No | Yes | Yes | ... | Yes | Yes | Yes | Absent | Absent | Absent | Absent | Absent | Yes | No |
陳鑑林 | No | No | No | No | Abstain | No | No | Yes | No | Abstain | ... | Abstain | Abstain | Abstain | Abstain | Abstain | Abstain | Yes | Abstain | No | Yes |
5 rows × 2310 columns
def to_numeric(x):
    """Map a vote Series to numbers: 'Yes' -> 1, 'No' -> -1, anything else -> 0.

    Returns a new Series instead of mutating the argument in place: pandas
    does not guarantee that mutation inside DataFrame.apply propagates back
    (apply may hand the function a copy), so the caller assigns the result.
    """
    return x.map(lambda v: 1 if v == 'Yes' else (-1 if v == 'No' else 0))
df_matrix = df_matrix.apply(to_numeric)
df_matrix[:5]
01/02/2013-1 | 01/02/2013-2 | 01/02/2013-3 | 01/02/2013-4 | 01/02/2013-5 | 01/02/2013-6 | 01/02/2013-7 | 01/02/2013-8 | 03/07/2013-1 | 03/07/2013-10 | ... | 31/10/2012-2 | 31/10/2012-3 | 31/10/2012-4 | 31/10/2012-5 | 31/10/2012-6 | 31/10/2012-7 | 31/10/2012-8 | 31/10/2012-9 | 31/10/2014-38 | 31/10/2014-39 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
曾鈺成 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
何俊仁 | 1 | 1 | 1 | 1 | -1 | 1 | 1 | -1 | 1 | 1 | ... | 1 | 1 | 1 | 1 | 1 | 1 | -1 | -1 | 1 | -1 |
李卓人 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | -1 | 1 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -1 |
涂謹申 | 1 | 1 | 1 | 1 | -1 | 1 | 1 | -1 | 1 | 1 | ... | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | -1 |
陳鑑林 | -1 | -1 | -1 | -1 | 0 | -1 | -1 | 1 | -1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | -1 | 1 |
5 rows × 2310 columns
# Center the member-vote matrix and form the (topic x topic) scatter matrix
# for PCA.  DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is
# the supported replacement.
X = np.matrix(df_matrix.to_numpy()).astype('float')
X = X - np.mean(X, 0)
C = X.T * X
print(C.shape)
(2310, 2310)
import scipy.sparse.linalg
# Top-2 eigenvectors (largest magnitude) of C: the principal axes.
_eigvals, PA = scipy.sparse.linalg.eigs(C, k=2, which='LM', return_eigenvectors=True)
# Use the following one to pop up the 3D plot in another window.
# Good for interactive exploration.
# May or may not be available due to your desktop.
#%pylab
#matplotlib.use('TkAgg')
# Use the following one to plot inline (embedded in this notebook).
#%pylab inline
# Try 3D: top-3 principal axes.
PA = scipy.sparse.linalg.eigs(C, k=3, which='LM', return_eigenvectors=True)[1]
# Project data points onto the principal axes.
X_3D = PA.T * X.T
print(X_3D.shape)
# eigs returns complex-typed eigenvectors (imaginary parts ~0 for this real
# symmetric matrix); take the real part explicitly instead of letting
# astype('float') discard it with a ComplexWarning.
x = np.real(np.array(X_3D[0, :])).astype('float')
y = np.real(np.array(X_3D[1, :])).astype('float')
z = np.real(np.array(X_3D[2, :])).astype('float')
# 3D scatter of members projected onto the top-3 principal axes;
# picker=True enables point-picking in interactive backends.
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x, y, z, picker=True, s=100)
(3, 71)
-c:16: ComplexWarning: Casting complex values to real discards the imaginary part -c:17: ComplexWarning: Casting complex values to real discards the imaginary part -c:18: ComplexWarning: Casting complex values to real discards the imaginary part
<mpl_toolkits.mplot3d.art3d.Patch3DCollection at 0x10a0a0390>
matplotlib.rcParams.update({'font.size': 22})
# Eigenvector sign is arbitrary; flip PC1 so the spectrum reads naturally.
x = -x
df_pc1 = pd.DataFrame(x.flatten(), index=df_matrix.index, columns=['PC1'])
# DataFrame.sort() was removed in pandas 0.20; use sort_values instead.
df_pc1 = df_pc1.sort_values('PC1')
fig = plt.figure(figsize=(10, 30))
ax = fig.add_subplot(1, 1, 1)
ax.plot(df_pc1['PC1'], df_pc1.rank()['PC1'], 'd', markersize=16)
#yticks(df_pc1.rank()['PC1'], df_pc1.index)
# Label each point with the member's name, offset slightly from the marker.
for (_x, _y, _s) in zip(df_pc1['PC1'], df_pc1.rank()['PC1'], df_pc1.index):
    plt.annotate(_s, (_x, _y), xytext=(_x + 1, _y - 0.2))
plt.title('Spectrum from Principal Component 1')
plt.axis([-60, 50, 0, 75])
[-60, 50, 0, 75]