LegCoHK¶

GitHub repo: https://github.com/hupili/legcohk

Related notebooks:

PCA for ENGG4030: http://bit.ly/1riabfV
Recommender System for ENGG4030: http://bit.ly/QwNvLZ
Graph Analysis for ENGG4030: http://bit.ly/1mxjuqu

Compared with the above notebooks, this repo contains more compact notes covering the whole data mining flow -- from data collection to final visualization. Interpretations will note be provided in the notes directly. If you have interest, you can dump your thoughts on the issue tracker. You can also request other features there.

Preparation¶

In [1]:

%pylab inline
import requests
import pylab as pl
from pyquery import PyQuery as pq
import numpy as np
import matplotlib as plt
import scipy
import pandas as pd
from lxml import etree

Populating the interactive namespace from numpy and matplotlib

In [2]:

seed_pages = [
    'http://www.legco.gov.hk/general/english/counmtg/yr12-16/mtg_1213.htm',
    'http://www.legco.gov.hk/general/english/counmtg/yr12-16/mtg_1314.htm',
    'http://www.legco.gov.hk/general/english/counmtg/yr12-16/mtg_1415.htm'
]
def crawl_seed(seed):
    d = pq(seed)
    return d('a').map(lambda i, a: a.attrib.get('name', None)).filter(lambda i, s: s.startswith('cm20'))
meetings = []
for seed_page in seed_pages:
    meetings.extend(crawl_seed(seed_page))
print(meetings)

['cm20121010', 'cm20121017', 'cm20121024', 'cm20121031', 'cm20121101', 'cm20121107', 'cm20121114', 'cm20121121', 'cm20121128', 'cm20121205', 'cm20121210', 'cm20121212', 'cm20121219', 'cm20130109', 'cm20130116', 'cm20130117', 'cm20130123', 'cm20130130', 'cm20130206', 'cm20130220', 'cm20130227', 'cm20130320', 'cm20130327', 'cm20130417', 'cm20130424', 'cm20130508', 'cm20130509', 'cm20130515', 'cm20130522', 'cm20130529', 'cm20130605', 'cm20130619', 'cm20130626', 'cm20130703', 'cm20130710', 'cm20130711', 'cm20130717', 'cm20131009', 'cm20131016', 'cm20131017', 'cm20131023', 'cm20131030', 'cm20131106', 'cm20131113', 'cm20131120', 'cm20131127', 'cm20131204', 'cm20131211', 'cm20131218', 'cm20140108', 'cm20140115', 'cm20140116', 'cm20140122', 'cm20140212', 'cm20140219', 'cm20140226', 'cm20140319', 'cm20140326', 'cm20140409', 'cm20140416', 'cm20140430', 'cm20140507', 'cm20140514', 'cm20140521', 'cm20140522', 'cm20140528', 'cm20140604', 'cm20140611', 'cm20140618', 'cm20140625', 'cm20140702', 'cm20140703', 'cm20140709', 'cm20141008', 'cm20141015', 'cm20141016', 'cm20141022', 'cm20141029', 'cm20141105', 'cm20141112', 'cm20141120', 'cm20141126', 'cm20141203', 'cm20141210', 'cm20141217', 'cm20150107', 'cm20150114', 'cm20150115', 'cm20150121', 'cm20150128', 'cm20150204', 'cm20150211', 'cm20150225', 'cm20150318', 'cm20150325', 'cm20150326', 'cm20150415', 'cm20150422', 'cm20150429', 'cm20150506', 'cm20150513', 'cm20150520', 'cm20150527', 'cm20150603', 'cm20150610', 'cm20150617', 'cm20150624', 'cm20150708']

In [3]:

from IPython.core.display import clear_output
import sys

def crawl_xml(meeting):
    # This logic is translated from the official JS code
    yy, mm, dd = map(lambda i: int(meeting[i:(i + 2)]), [4, 6, 8])
    if mm >= 10:
        yr = 'yr%02d-%02d' % (yy, yy + 1)
    else:
        yr = 'yr%02d-%02d' % (yy - 1, yy)
    prefix = 'http://www.legco.gov.hk'
    url = '%(prefix)s/%(yr)s/chinese/counmtg/voting/cm_vote_20%(yy)02d%(mm)02d%(dd)02d.xml' % locals()
    return requests.get(url)

vote_xmls = []
for m in meetings:
    vote_xmls.append(crawl_xml(m))
    clear_output()
    print('progress: %s/%s %s' % (len(vote_xmls), len(meetings), '#' * len(vote_xmls)))
    sys.stdout.flush()

progress: 108/108 ############################################################################################################

In [4]:

vote_xmls = filter(lambda r: r.ok, vote_xmls)
vote_xmls = [r.content for r in vote_xmls]
print(len(vote_xmls))

In [5]:

# Information fields, useful for reviewing the result
info_fields = ['vote-date', 'vote-time', 'motion-en', 'mover-en', 'mover-type', 'vote-separate-mechanism']
def xml_to_records(xml):
    doc = etree.XML(xml)
    records = []
    for topic in doc.xpath('//legcohk-vote/meeting/vote'):
        info = [topic.xpath(f)[0].text for f in info_fields]
        date = info[0]
        topic_id = '%s-%s' % (date, topic.attrib['number'])
        for member in topic.xpath('individual-votes/member'):
            member_id = member.attrib['name-en'] # Use English name as ID for sipmlicity
            vote = member.xpath('vote')[0].text
            records.append((topic_id, member_id, vote) + tuple(info))
    return records

records = []
for vote_xml in vote_xmls:
    records.extend(xml_to_records(vote_xml))

In [6]:

# More:
# http://nbviewer.ipython.org/urls/course.ie.cuhk.edu.hk/~engg4030/tutorial/tutorial7/Legco-Preprocessing.ipynb
def clean_record(t):
    # According to the numbers, they seem to be the same person
    t = list(t)
    if t[1] == 'Dr Joseph LEE':
        t[1] = 'Prof Joseph LEE'
    # Other normalization if any
    # ...
    return tuple(t)
records = [clean_record(r) for r in records]

In [7]:

df = pd.DataFrame(records, columns = ['topic_id', 'member_id', 'vote'] + info_fields)
df.to_csv('records-all-with-info.csv', encoding='utf-8')
df[:5]

Out[7]:

	topic_id	member_id	vote	vote-date	vote-time	motion-en	mover-en	mover-type	vote-separate-mechanism
0	17/10/2012-1	TSANG Yok-sing	Present	17/10/2012	19:37:53	AMENDMENT BY DR HON KENNETH CHAN TO HON IP KIN...	Dr Kenneth CHAN	Member	Yes
1	17/10/2012-1	Albert HO	Yes	17/10/2012	19:37:53	AMENDMENT BY DR HON KENNETH CHAN TO HON IP KIN...	Dr Kenneth CHAN	Member	Yes
2	17/10/2012-1	LEE Cheuk-yan	Yes	17/10/2012	19:37:53	AMENDMENT BY DR HON KENNETH CHAN TO HON IP KIN...	Dr Kenneth CHAN	Member	Yes
3	17/10/2012-1	James TO	Yes	17/10/2012	19:37:53	AMENDMENT BY DR HON KENNETH CHAN TO HON IP KIN...	Dr Kenneth CHAN	Member	Yes
4	17/10/2012-1	CHAN Kam-lam	No	17/10/2012	19:37:53	AMENDMENT BY DR HON KENNETH CHAN TO HON IP KIN...	Dr Kenneth CHAN	Member	Yes

In [8]:

df = df[['topic_id', 'member_id', 'vote']]
df.to_csv('records-all.csv', encoding='utf-8')
df[:5]

Out[8]:

	topic_id	member_id	vote
0	17/10/2012-1	TSANG Yok-sing	Present
1	17/10/2012-1	Albert HO	Yes
2	17/10/2012-1	LEE Cheuk-yan	Yes
3	17/10/2012-1	James TO	Yes
4	17/10/2012-1	CHAN Kam-lam	No

Let's play¶

In [9]:

print('total # of topics:', len(df['topic_id'].unique()))
print('total # of members:',len(df['member_id'].unique()))
print('total # of records:', len(df))

('total # of topics:', 2310)
('total # of members:', 70)
('total # of records:', 161700)

In [10]:

print(df['vote'].unique())

['Present' 'Yes' 'No' 'Absent' 'Abstain']

In [11]:

print(df['member_id'].unique())

['TSANG Yok-sing' 'Albert HO' 'LEE Cheuk-yan' 'James TO' 'CHAN Kam-lam'
 'LEUNG Yiu-chung' 'Dr LAU Wong-fat' 'Emily LAU' 'TAM Yiu-chung'
 'Abraham SHEK' 'Tommy CHEUNG' 'Frederick FUNG' 'Vincent FANG'
 'WONG Kwok-hing' 'Prof Joseph LEE' 'Jeffrey LAM' 'Andrew LEUNG'
 'WONG Ting-kwong' 'Ronny TONG' 'Cyd HO' 'Starry LEE' 'Dr LAM Tai-fai'
 'CHAN Hak-kan' 'CHAN Kin-por' 'Dr Priscilla LEUNG' 'Dr LEUNG Ka-lau'
 'CHEUNG Kwok-che' 'WONG Kwok-kin' 'IP Kwok-him' 'Mrs Regina IP' 'Paul TSE'
 'Alan LEONG' 'LEUNG Kwok-hung' 'Albert CHAN' 'WONG Yuk-man' 'Claudia MO'
 'Michael TIEN' 'James TIEN' 'NG Leung-sing' 'Steven HO' 'Frankie YICK'
 'WU Chi-wai' 'YIU Si-wing' 'Gary FAN' 'MA Fung-kwok' 'Charles Peter MOK'
 'CHAN Chi-chuen' 'CHAN Han-pan' 'Dr Kenneth CHAN' 'CHAN Yuen-han'
 'LEUNG Che-cheung' 'Kenneth LEUNG' 'Alice MAK' 'Dr KWOK Ka-ki'
 'KWOK Wai-keung' 'Dennis KWOK' 'Christopher CHEUNG' 'Dr Fernando CHEUNG'
 'SIN Chung-kai' 'Dr Helena WONG' 'IP Kin-yuen' 'Dr Elizabeth QUAT'
 'Martin LIAO' 'POON Siu-ping' 'TANG Ka-piu' 'Dr CHIANG Lai-wan'
 'Ir Dr LO Wai-kwok' 'CHUNG Kwok-pan' 'Christopher CHUNG' 'Tony TSE']

In [12]:

print(df['topic_id'].unique())

['17/10/2012-1' '17/10/2012-2' '17/10/2012-3' ..., '13/02/2015-7'
 '13/02/2015-8' '13/02/2015-9']

In [13]:

# A leader board of voting types
board_pos = pd.DataFrame(index=range(0,5))
for v in df['vote'].unique():
    count = df[df['vote']==v].groupby('member_id').count().sort('vote', ascending=False)['vote']
    count = count.reset_index()[:5]
    board_pos[v] = pd.Series(list(zip(count['member_id'], count['vote'])), index=range(0,5))
board_pos

Out[13]:

	Present	Yes	No	Absent	Abstain
0	(TSANG Yok-sing, 2284)	(LEUNG Kwok-hung, 1682)	(Ir Dr LO Wai-kwok, 1919)	(Dr LEUNG Ka-lau, 2007)	(Gary FAN, 426)
1	(CHEUNG Kwok-che, 794)	(Albert CHAN, 1342)	(TAM Yiu-chung, 1861)	(Ronny TONG, 1881)	(MA Fung-kwok, 121)
2	(Emily LAU, 718)	(CHAN Chi-chuen, 1331)	(Steven HO, 1861)	(LEUNG Yiu-chung, 1881)	(Prof Joseph LEE, 115)
3	(Cyd HO, 708)	(Charles Peter MOK, 453)	(WONG Ting-kwong, 1851)	(James TO, 1822)	(IP Kwok-him, 112)
4	(SIN Chung-kai, 702)	(WONG Yuk-man, 411)	(Christopher CHEUNG, 1807)	(IP Kin-yuen, 1813)	(Steven HO, 110)

In [14]:

board_neg = pd.DataFrame(index=range(0,5))
for v in df['vote'].unique():
    count = df[df['vote']==v].groupby('member_id').count().sort('vote', ascending=True)['vote']
    count = count.reset_index()[:5]
    board_neg[v] = pd.Series(list(zip(count['member_id'], count['vote'])), index=range(0,5))
board_neg

Out[14]:

	Present	Yes	No	Absent	Abstain
0	(YIU Si-wing, 1)	(Abraham SHEK, 95)	(James TO, 64)	(TSANG Yok-sing, 26)	(Dennis KWOK, 13)
1	(James TIEN, 1)	(Dr LAU Wong-fat, 106)	(Ronny TONG, 67)	(POON Siu-ping, 32)	(Vincent FANG, 13)
2	(LEUNG Yiu-chung, 1)	(Dr LEUNG Ka-lau, 129)	(Frederick FUNG, 70)	(TAM Yiu-chung, 50)	(Dr LAU Wong-fat, 15)
3	(NG Leung-sing, 1)	(Dr LAM Tai-fai, 132)	(Albert HO, 71)	(Steven HO, 53)	(Frederick FUNG, 16)
4	(IP Kwok-him, 1)	(Vincent FANG, 162)	(IP Kin-yuen, 72)	(Ir Dr LO Wai-kwok, 55)	(Charles Peter MOK, 19)

In [15]:

df_matrix = pd.DataFrame(index=df['member_id'].unique())
for gn, g in df.groupby('topic_id'):
    df_matrix[gn] = g.set_index('member_id')['vote']
df_matrix[:5]

Out[15]:

	01/02/2013-1	01/02/2013-2	01/02/2013-3	01/02/2013-4	01/02/2013-5	01/02/2013-6	01/02/2013-7	01/02/2013-8	03/07/2013-1	03/07/2013-10	...	31/10/2012-2	31/10/2012-3	31/10/2012-4	31/10/2012-5	31/10/2012-6	31/10/2012-7	31/10/2012-8	31/10/2012-9	31/10/2014-38	31/10/2014-39
TSANG Yok-sing	Present	Present	Present	Present	Present	Present	Present	Present	Present	Present	...	Present	Present	Present	Present	Present	Present	Present	Present	Present	Present
Albert HO	Yes	Yes	Yes	Yes	No	Yes	Yes	No	Yes	Yes	...	Yes	Yes	Yes	Yes	Yes	Yes	No	No	Yes	No
LEE Cheuk-yan	Yes	Yes	Yes	Yes	Yes	Yes	Yes	No	Yes	Yes	...	Absent	Absent	Absent	Absent	Absent	Absent	Absent	Absent	Yes	No
James TO	Yes	Yes	Yes	Yes	No	Yes	Yes	No	Yes	Yes	...	Yes	Yes	Yes	Absent	Absent	Absent	Absent	Absent	Yes	No
CHAN Kam-lam	No	No	No	No	Abstain	No	No	Yes	No	Abstain	...	Abstain	Abstain	Abstain	Abstain	Abstain	Abstain	Yes	Abstain	No	Yes

5 rows × 2310 columns

In [16]:

def to_numeric(x):
    x[(x != 'Yes') & (x != 'No')] = 0
    x[x == 'Yes'] = 1
    x[x == 'No'] = -1
df_matrix.apply(to_numeric)
df_matrix[:5]

Out[16]:

	01/02/2013-1	01/02/2013-2	01/02/2013-3	01/02/2013-4	01/02/2013-5	01/02/2013-6	01/02/2013-7	01/02/2013-8	03/07/2013-1	03/07/2013-10	...	31/10/2012-2	31/10/2012-3	31/10/2012-4	31/10/2012-5	31/10/2012-6	31/10/2012-7	31/10/2012-8	31/10/2012-9	31/10/2014-38	31/10/2014-39
TSANG Yok-sing	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
Albert HO	1	1	1	1	-1	1	1	-1	1	1	...	1	1	1	1	1	1	-1	-1	1	-1
LEE Cheuk-yan	1	1	1	1	1	1	1	-1	1	1	...	0	0	0	0	0	0	0	0	1	-1
James TO	1	1	1	1	-1	1	1	-1	1	1	...	1	1	1	0	0	0	0	0	1	-1
CHAN Kam-lam	-1	-1	-1	-1	0	-1	-1	1	-1	0	...	0	0	0	0	0	0	1	0	-1	1

5 rows × 2310 columns

In [17]:

X = matrix(df_matrix.as_matrix()).astype('float')
X = X - mean(X, 0)
C = X.T * X
print(C.shape)

(2310, 2310)

In [18]:

import scipy.sparse.linalg
PA = scipy.sparse.linalg.eigs(C, k=2, which='LM', return_eigenvectors=True)[1]

In [19]:

# Use the following one to pop up the 3D plot in another window.
# Good for interactive exploration.
# May or may not be available due to your desktop.
#%pylab 
#matplotlib.use('TkAgg') 
# Use the following one to plot inline (embedded in this notebook).
#%pylab inline


# Try 3D
PA = scipy.sparse.linalg.eigs(C, k=3, which='LM', return_eigenvectors=True)[1]
# Project data points onto principle axis
X_3D = PA.T * X.T
print(X_3D.shape)
# We intentionally add some disturbance for better visualization.
# Or else, some of the nodes are located in the same place.
# (Those who vote exactly the same)
X_3D = X_3D + randn(*tuple(X_3D.shape)) * 0.3
x = array(X_3D[0, :]).astype('float')
y = array(X_3D[1, :]).astype('float')
z = array(X_3D[2, :]).astype('float')

from mpl_toolkits.mplot3d import Axes3D
fig = figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x, y, z, picker=True, s=100)

(3, 70)

-c:19: ComplexWarning: Casting complex values to real discards the imaginary part
-c:20: ComplexWarning: Casting complex values to real discards the imaginary part
-c:21: ComplexWarning: Casting complex values to real discards the imaginary part

Out[19]:

<mpl_toolkits.mplot3d.art3d.Patch3DCollection at 0x10bdde110>

In [20]:

x = -x
df_pc1 = pd.DataFrame(x.flatten(), index=df_matrix.index, columns=['PC1'])
df_pc1 = df_pc1.sort('PC1')
figure(figsize(12, 20))
plot(df_pc1['PC1'], df_pc1.rank()['PC1'], 'd', markersize=10)
#yticks(df_pc1.rank()['PC1'], df_pc1.index)
for (_x, _y, _s) in zip(df_pc1['PC1'], df_pc1.rank()['PC1'], df_pc1.index):
    annotate(_s, (_x, _y), xytext=(_x + 0.5, _y - 0.2))
title('Spectrum from Principal Component 1')
axis([-55, 60, 0, 72])

Out[20]:

[-55, 60, 0, 72]