Author: Pili Hu
GitHub repo: https://github.com/hupili/legcohk
Related notebooks:
Compared with the above notebooks, this repo contains more compact notes covering the whole data mining flow -- from data collection to final visualization. Interpretations will not be provided in the notes directly. If you are interested, you can share your thoughts on the issue tracker. You can also request other features there.
%pylab inline
import requests
import pylab as pl
from pyquery import PyQuery as pq
import numpy as np
import matplotlib as plt
import scipy
import pandas as pd
from lxml import etree
Populating the interactive namespace from numpy and matplotlib
# Index pages that list the Council meetings; every meeting is expected to
# appear there as a named anchor.  Judging by the file names these are the
# 2012-13 and 2013-14 sessions -- TODO confirm.
seed_pages = [
'http://www.legco.gov.hk/general/english/counmtg/yr12-16/mtg_1213.htm',
'http://www.legco.gov.hk/general/english/counmtg/yr12-16/mtg_1314.htm'
]
def crawl_seed(seed):
    """Collect the meeting anchor names found on one index page.

    ``seed`` is handed straight to PyQuery.  The ``name`` attribute of every
    ``<a>`` element is gathered and only names starting with ``'cm20'`` are
    kept.  The result is the PyQuery collection produced by ``filter``.
    """
    page = pq(seed)
    anchor_names = page('a').map(lambda i, a: a.attrib.get('name', None))
    return anchor_names.filter(lambda i, s: s.startswith('cm20'))
# Flatten the per-page anchor collections into one plain list of meeting ids.
meetings = [_name for _seed in seed_pages for _name in crawl_seed(_seed)]
print(meetings)
['cm20121010', 'cm20121017', 'cm20121024', 'cm20121031', 'cm20121101', 'cm20121107', 'cm20121114', 'cm20121121', 'cm20121128', 'cm20121205', 'cm20121210', 'cm20121212', 'cm20121219', 'cm20130109', 'cm20130116', 'cm20130117', 'cm20130123', 'cm20130130', 'cm20130206', 'cm20130220', 'cm20130227', 'cm20130320', 'cm20130327', 'cm20130417', 'cm20130424', 'cm20130508', 'cm20130509', 'cm20130515', 'cm20130522', 'cm20130529', 'cm20130605', 'cm20130619', 'cm20130626', 'cm20130703', 'cm20130710', 'cm20130711', 'cm20130717', 'cm20131009', 'cm20131016', 'cm20131017', 'cm20131023', 'cm20131030', 'cm20131106', 'cm20131113', 'cm20131120', 'cm20131127', 'cm20131204', 'cm20131211', 'cm20131218', 'cm20140108', 'cm20140115', 'cm20140116', 'cm20140122', 'cm20140212', 'cm20140219', 'cm20140226', 'cm20140319', 'cm20140326', 'cm20140409', 'cm20140416', 'cm20140430', 'cm20140507', 'cm20140514', 'cm20140521', 'cm20140522', 'cm20140528', 'cm20140604', 'cm20140611', 'cm20140618', 'cm20140625', 'cm20140702', 'cm20140709']
from IPython.core.display import clear_output
import sys
def crawl_xml(meeting, timeout=30):
    """Download the voting-result XML of one Council meeting.

    The URL scheme is translated from the official JS code.

    Parameters
    ----------
    meeting : str
        Meeting id such as ``'cm20121017'``.  Characters 4-5, 6-7 and 8-9
        are read as the two-digit year, month and day.
    timeout : float, optional
        Seconds to wait for the server.  Without a timeout ``requests``
        waits forever, so a single stalled connection would hang the crawl.

    Returns
    -------
    requests.Response
        The response exactly as returned by ``requests.get``; its status is
        not checked here.
    """
    yy, mm, dd = [int(meeting[i:i + 2]) for i in (4, 6, 8)]
    # Meetings held from October onwards live in the folder named after the
    # year pair starting that year; earlier months belong to the pair that
    # started the year before.
    first_year = yy if mm >= 10 else yy - 1
    yr = 'yr%02d-%02d' % (first_year, first_year + 1)
    url = (
        'http://www.legco.gov.hk/%s/chinese/counmtg/voting/'
        'cm_vote_20%02d%02d%02d.xml' % (yr, yy, mm, dd)
    )
    return requests.get(url, timeout=timeout)
# Fetch the voting XML of every meeting, redrawing a text progress bar
# after each download.
vote_xmls = []
_total = len(meetings)
for _done, m in enumerate(meetings, 1):
    vote_xmls.append(crawl_xml(m))
    clear_output()
    print('progress: %s/%s %s' % (_done, _total, '#' * _done))
    sys.stdout.flush()
progress: 72/72 ########################################################################
# Keep only the body of responses whose status is OK; the others are dropped.
vote_xmls = [_resp.content for _resp in vote_xmls if _resp.ok]
print(len(vote_xmls))
46
# Information fields, useful for reviewing the result
info_fields = ['vote-date', 'vote-time', 'motion-en', 'mover-en', 'mover-type', 'vote-separate-mechanism']


def xml_to_records(xml):
    """Flatten one voting XML document into per-member vote records.

    Every ``vote`` element under ``legcohk-vote/meeting`` is a topic.  For
    each member listed under its ``individual-votes`` one tuple is produced:

        (topic_id, member_id, vote) + one text value per entry of info_fields

    ``topic_id`` is ``'<vote-date>-<number attribute of the topic>'``.
    """
    doc = etree.XML(xml)
    records = []
    for topic in doc.xpath('//legcohk-vote/meeting/vote'):
        info = tuple(topic.xpath(field)[0].text for field in info_fields)
        # info[0] is the 'vote-date' field
        topic_id = '%s-%s' % (info[0], topic.attrib['number'])
        records.extend(
            # Use the English name as member ID for simplicity
            (topic_id, member.attrib['name-en'], member.xpath('vote')[0].text) + info
            for member in topic.xpath('individual-votes/member')
        )
    return records
# Concatenate the record lists of all downloaded XML documents.
records = [_rec for _doc in vote_xmls for _rec in xml_to_records(_doc)]
# More:
# http://nbviewer.ipython.org/urls/course.ie.cuhk.edu.hk/~engg4030/tutorial/tutorial7/Legco-Preprocessing.ipynb
def clean_record(t):
    """Return record ``t`` as a tuple with the member name normalised.

    The second field is the member name.  'Dr Joseph LEE' is rewritten to
    'Prof Joseph LEE'; according to the numbers the two names seem to be
    the same person.  All other fields are passed through unchanged.
    """
    fields = list(t)
    fields[1] = 'Prof Joseph LEE' if fields[1] == 'Dr Joseph LEE' else fields[1]
    # Other normalization, if any, goes here.
    return tuple(fields)
# Normalise every record, then load them into a table and save a full copy.
records = [clean_record(_rec) for _rec in records]
_column_names = ['topic_id', 'member_id', 'vote'] + info_fields
df = pd.DataFrame(records, columns=_column_names)
df.to_csv('records-all-with-info.csv', encoding='utf-8')
df[:5]
topic_id | member_id | vote | vote-date | vote-time | motion-en | mover-en | mover-type | vote-separate-mechanism | |
---|---|---|---|---|---|---|---|---|---|
0 | 17/10/2012-1 | TSANG Yok-sing | Present | 17/10/2012 | 19:37:53 | AMENDMENT BY DR HON KENNETH CHAN TO HON IP KIN... | Dr Kenneth CHAN | Member | Yes |
1 | 17/10/2012-1 | Albert HO | Yes | 17/10/2012 | 19:37:53 | AMENDMENT BY DR HON KENNETH CHAN TO HON IP KIN... | Dr Kenneth CHAN | Member | Yes |
2 | 17/10/2012-1 | LEE Cheuk-yan | Yes | 17/10/2012 | 19:37:53 | AMENDMENT BY DR HON KENNETH CHAN TO HON IP KIN... | Dr Kenneth CHAN | Member | Yes |
3 | 17/10/2012-1 | James TO | Yes | 17/10/2012 | 19:37:53 | AMENDMENT BY DR HON KENNETH CHAN TO HON IP KIN... | Dr Kenneth CHAN | Member | Yes |
4 | 17/10/2012-1 | CHAN Kam-lam | No | 17/10/2012 | 19:37:53 | AMENDMENT BY DR HON KENNETH CHAN TO HON IP KIN... | Dr Kenneth CHAN | Member | Yes |
5 rows × 9 columns
# From here on only the (topic, member, vote) triple is needed.
_core_columns = ['topic_id', 'member_id', 'vote']
df = df[_core_columns]
df.to_csv('records-all.csv', encoding='utf-8')
df[:5]
topic_id | member_id | vote | |
---|---|---|---|
0 | 17/10/2012-1 | TSANG Yok-sing | Present |
1 | 17/10/2012-1 | Albert HO | Yes |
2 | 17/10/2012-1 | LEE Cheuk-yan | Yes |
3 | 17/10/2012-1 | James TO | Yes |
4 | 17/10/2012-1 | CHAN Kam-lam | No |
5 rows × 3 columns
# Size of the data set: distinct topics, distinct members, raw records.
print('total # of topics: %s' % len(df['topic_id'].unique()))
print('total # of members: %s' % len(df['member_id'].unique()))
print('total # of records: %s' % len(df))
total # of topics: 1055 total # of members: 70 total # of records: 73850
# Distinct vote values present in the records.
print(df['vote'].unique())
['Present' 'Yes' 'No' 'Absent' 'Abstain']
# Distinct member names, in order of first appearance.
print(df['member_id'].unique())
['TSANG Yok-sing' 'Albert HO' 'LEE Cheuk-yan' 'James TO' 'CHAN Kam-lam' 'LEUNG Yiu-chung' 'Dr LAU Wong-fat' 'Emily LAU' 'TAM Yiu-chung' 'Abraham SHEK' 'Tommy CHEUNG' 'Frederick FUNG' 'Vincent FANG' 'WONG Kwok-hing' 'Prof Joseph LEE' 'Jeffrey LAM' 'Andrew LEUNG' 'WONG Ting-kwong' 'Ronny TONG' 'Cyd HO' 'Starry LEE' 'Dr LAM Tai-fai' 'CHAN Hak-kan' 'CHAN Kin-por' 'Dr Priscilla LEUNG' 'Dr LEUNG Ka-lau' 'CHEUNG Kwok-che' 'WONG Kwok-kin' 'IP Kwok-him' 'Mrs Regina IP' 'Paul TSE' 'Alan LEONG' 'LEUNG Kwok-hung' 'Albert CHAN' 'WONG Yuk-man' 'Claudia MO' 'Michael TIEN' 'James TIEN' 'NG Leung-sing' 'Steven HO' 'Frankie YICK' 'WU Chi-wai' 'YIU Si-wing' 'Gary FAN' 'MA Fung-kwok' 'Charles Peter MOK' 'CHAN Chi-chuen' 'CHAN Han-pan' 'Dr Kenneth CHAN' 'CHAN Yuen-han' 'LEUNG Che-cheung' 'Kenneth LEUNG' 'Alice MAK' 'Dr KWOK Ka-ki' 'KWOK Wai-keung' 'Dennis KWOK' 'Christopher CHEUNG' 'Dr Fernando CHEUNG' 'SIN Chung-kai' 'Dr Helena WONG' 'IP Kin-yuen' 'Dr Elizabeth QUAT' 'Martin LIAO' 'POON Siu-ping' 'TANG Ka-piu' 'Dr CHIANG Lai-wan' 'Ir Dr LO Wai-kwok' 'CHUNG Kwok-pan' 'Christopher CHUNG' 'Tony TSE']
# Distinct topic ids, in order of first appearance.
print(df['topic_id'].unique())
['17/10/2012-1' '17/10/2012-2' '17/10/2012-3' ..., '27/03/2014-13' '27/03/2014-14' '16/04/2014-1']
# A leader board of voting types: for every vote value, the five members who
# cast it most often, shown as (member, count) pairs.
board_pos = pd.DataFrame(index=range(0, 5))
for _vote_type in df['vote'].unique():
    _cast = df[df['vote'] == _vote_type]
    _per_member = _cast.groupby('member_id').count()
    _top = _per_member.sort('vote', ascending=False)['vote'].reset_index()[:5]
    board_pos[_vote_type] = pd.Series(zip(_top['member_id'], _top['vote']), index=range(0, 5))
board_pos
Present | Yes | No | Absent | Abstain | |
---|---|---|---|---|---|
0 | (TSANG Yok-sing, 1054) | (LEUNG Kwok-hung, 685) | (YIU Si-wing, 856) | (Dr LEUNG Ka-lau, 803) | (Gary FAN, 238) |
1 | (Albert HO, 295) | (CHAN Chi-chuen, 672) | (Andrew LEUNG, 854) | (LEUNG Yiu-chung, 799) | (MA Fung-kwok, 113) |
2 | (Cyd HO, 209) | (Albert CHAN, 405) | (Ir Dr LO Wai-kwok, 853) | (WONG Yuk-man, 788) | (Prof Joseph LEE, 108) |
3 | (WU Chi-wai, 150) | (Charles Peter MOK, 264) | (Christopher CHEUNG, 842) | (Frederick FUNG, 776) | (IP Kwok-him, 105) |
4 | (Charles Peter MOK, 148) | (Gary FAN, 245) | (TAM Yiu-chung, 800) | (James TO, 763) | (CHAN Kam-lam, 102) |
5 rows × 5 columns
# The reverse leader board: for every vote value, the five members who cast
# it least often.  Only members with at least one such vote are counted,
# because the grouping runs on the already-filtered rows.
board_neg = pd.DataFrame(index=range(0, 5))
for _vote_type in df['vote'].unique():
    _cast = df[df['vote'] == _vote_type]
    _per_member = _cast.groupby('member_id').count()
    _bottom = _per_member.sort('vote', ascending=True)['vote'].reset_index()[:5]
    board_neg[_vote_type] = pd.Series(zip(_bottom['member_id'], _bottom['vote']), index=range(0, 5))
board_neg
Present | Yes | No | Absent | Abstain | |
---|---|---|---|---|---|
0 | (YIU Si-wing, 1) | (Dr LAU Wong-fat, 23) | (Prof Joseph LEE, 52) | (TSANG Yok-sing, 1) | (Frederick FUNG, 11) |
1 | (LEUNG Kwok-hung, 1) | (Abraham SHEK, 60) | (Claudia MO, 55) | (Ir Dr LO Wai-kwok, 19) | (Vincent FANG, 13) |
2 | (Andrew LEUNG, 1) | (Dr LAM Tai-fai, 65) | (Albert HO, 56) | (YIU Si-wing, 20) | (Dennis KWOK, 13) |
3 | (CHAN Kin-por, 1) | (Vincent FANG, 71) | (Dennis KWOK, 57) | (POON Siu-ping, 22) | (Claudia MO, 13) |
4 | (LEE Cheuk-yan, 2) | (Jeffrey LAM, 75) | (Ronny TONG, 57) | (TAM Yiu-chung, 30) | (Dr Kenneth CHAN, 14) |
5 rows × 5 columns
# Pivot the long record table into a member-by-topic matrix: one row per
# member (in order of first appearance) and one column per topic, filled
# with that member's vote on the topic.
_member_order = df['member_id'].unique()
df_matrix = pd.DataFrame(index=_member_order)
for _topic, _votes in df.groupby('topic_id'):
    df_matrix[_topic] = _votes.set_index('member_id')['vote']
df_matrix[:5]
01/02/2013-1 | 01/02/2013-2 | 01/02/2013-3 | 01/02/2013-4 | 01/02/2013-5 | 01/02/2013-6 | 01/02/2013-7 | 01/02/2013-8 | 03/07/2013-1 | 03/07/2013-10 | 03/07/2013-2 | 03/07/2013-3 | 03/07/2013-4 | 03/07/2013-5 | 03/07/2013-6 | 03/07/2013-7 | 03/07/2013-8 | 03/07/2013-9 | 04/07/2013-11 | 04/12/2013-1 | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
TSANG Yok-sing | Present | Present | Present | Present | Present | Present | Present | Present | Present | Present | Present | Present | Present | Present | Present | Present | Present | Present | Present | Present | ... |
Albert HO | Yes | Yes | Yes | Yes | No | Yes | Yes | No | Yes | Yes | No | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | No | ... |
LEE Cheuk-yan | Yes | Yes | Yes | Yes | Yes | Yes | Yes | No | Yes | Yes | No | Yes | Yes | Yes | Yes | Yes | Abstain | Yes | Yes | No | ... |
James TO | Yes | Yes | Yes | Yes | No | Yes | Yes | No | Yes | Yes | No | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | No | ... |
CHAN Kam-lam | No | No | No | No | Abstain | No | No | Yes | No | Abstain | Yes | Abstain | Abstain | Abstain | Abstain | Abstain | Abstain | Abstain | Abstain | Yes | ... |
5 rows × 1055 columns
def to_numeric(x):
    """Return the votes in Series ``x`` encoded as integers.

    'Yes' becomes 1, 'No' becomes -1 and every other value, including a
    missing entry, becomes 0.  A new Series is returned and ``x`` itself is
    not modified, because pandas does not support functions that mutate the
    object handed to them by ``DataFrame.apply``.
    """
    # Values absent from the mapping come back as NaN and are then zeroed.
    return x.map({'Yes': 1, 'No': -1}).fillna(0).astype(int)


# Assign the result back instead of relying on in-place edits reaching the
# frame: apply() may pass copies of the columns.
df_matrix = df_matrix.apply(to_numeric)
df_matrix[:5]
01/02/2013-1 | 01/02/2013-2 | 01/02/2013-3 | 01/02/2013-4 | 01/02/2013-5 | 01/02/2013-6 | 01/02/2013-7 | 01/02/2013-8 | 03/07/2013-1 | 03/07/2013-10 | 03/07/2013-2 | 03/07/2013-3 | 03/07/2013-4 | 03/07/2013-5 | 03/07/2013-6 | 03/07/2013-7 | 03/07/2013-8 | 03/07/2013-9 | 04/07/2013-11 | 04/12/2013-1 | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
TSANG Yok-sing | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... |
Albert HO | 1 | 1 | 1 | 1 | -1 | 1 | 1 | -1 | 1 | 1 | -1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | -1 | ... |
LEE Cheuk-yan | 1 | 1 | 1 | 1 | 1 | 1 | 1 | -1 | 1 | 1 | -1 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | -1 | ... |
James TO | 1 | 1 | 1 | 1 | -1 | 1 | 1 | -1 | 1 | 1 | -1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | -1 | ... |
CHAN Kam-lam | -1 | -1 | -1 | -1 | 0 | -1 | -1 | 1 | -1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... |
5 rows × 1055 columns
# Numeric member-by-topic matrix, one row per member.
X = matrix(df_matrix.as_matrix()).astype('float')
# Centre every topic (column) on its mean over the members.
X = X - X.mean(axis=0)
# Topic-by-topic scatter matrix of the centred data (not divided by the
# number of members).
C = X.T * X
print(C.shape)
(1055, 1055)
import scipy.sparse.linalg
# C is real and symmetric, so use the symmetric solver: it returns real
# eigenpairs, whereas the general-purpose `eigs` returns complex ones.
# Order the axes by decreasing eigenvalue explicitly so that column 0 is the
# dominant principal axis.
_vals_2d, _vecs_2d = scipy.sparse.linalg.eigsh(C, k=2, which='LM')
PA = _vecs_2d[:, np.argsort(_vals_2d)[::-1]]
# Use the following one to pop up the 3D plot in another window.
# Good for interactive exploration.
# May or may not be available due to your desktop.
#%pylab
#matplotlib.use('TkAgg')
# Use the following one to plot inline (embedded in this notebook).
#%pylab inline
# Try 3D
# C is real and symmetric, so the symmetric solver `eigsh` is the right
# tool: it yields real eigenpairs, which avoids the ComplexWarning raised
# when the complex output of `eigs` is cast to float below.
_vals_3d, _vecs_3d = scipy.sparse.linalg.eigsh(C, k=3, which='LM')
# Order the axes by decreasing eigenvalue so that row 0 of the projection
# really is the first principal component.
PA = _vecs_3d[:, np.argsort(_vals_3d)[::-1]]
# Project data points onto the principal axes
X_3D = PA.T * X.T
print(X_3D.shape)
# We intentionally add some disturbance for better visualization.
# Or else, some of the nodes are located in the same place.
# (Those who vote exactly the same)
X_3D = X_3D + randn(*tuple(X_3D.shape)) * 0.3
x = array(X_3D[0, :]).astype('float')
y = array(X_3D[1, :]).astype('float')
z = array(X_3D[2, :]).astype('float')
from mpl_toolkits.mplot3d import Axes3D
fig = figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x, y, z, picker=True, s=100)
(3, 70)
-c:19: ComplexWarning: Casting complex values to real discards the imaginary part -c:20: ComplexWarning: Casting complex values to real discards the imaginary part -c:21: ComplexWarning: Casting complex values to real discards the imaginary part
<mpl_toolkits.mplot3d.art3d.Patch3DCollection at 0x107bd0850>
# One row per member holding its coordinate on the first projected axis,
# ordered from lowest to highest.
df_pc1 = pd.DataFrame(x.flatten(), index=df_matrix.index, columns=['PC1'])
df_pc1 = df_pc1.sort('PC1')
# Rank once and reuse it for both the markers and the labels.
pc1_rank = df_pc1.rank()['PC1']
# Pass the size as a keyword argument.  `figure(figsize(12, 20))` only worked
# because IPython's `figsize` helper changes the global default size and
# returns None, which `figure` then receives as its figure number.
figure(figsize=(12, 20))
plot(df_pc1['PC1'], pc1_rank, 'd', markersize=10)
# Alternative labelling on the y axis instead of in-plot annotations:
#yticks(pc1_rank, df_pc1.index)
for (_x, _y, _s) in zip(df_pc1['PC1'], pc1_rank, df_pc1.index):
    # Shift each label slightly right of and below its marker.
    annotate(_s, (_x, _y), xytext=(_x + 0.5, _y - 0.2))
title('Spectrum from Principal Component 1')
# Hand-tuned limits for this data set (70 members on the y axis).
axis([-32, 28, 0, 71])
[-32, 28, 0, 71]