Author: Pili Hu
GitHub repo: https://github.com/hupili/legcohk
Related notebooks:
Compared with the above notebooks, this repo contains more compact notes covering the whole data mining flow -- from data collection to final visualization. Interpretations will not be provided in the notes directly. If you are interested, you can dump your thoughts on the issue tracker. You can also request other features there.
import requests
import pylab as pl
from pyquery import PyQuery as pq
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
import scipy
import pandas as pd
from lxml import etree
from functools import reduce
# Use a CJK-capable sans-serif font stack so Chinese member names render in plots.
matplotlib.rc('font', **{'sans-serif' : 'Helvetica, LiHei Pro, sans-serif', #'Arial',
'family' : 'sans-serif'})
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# Index pages listing LegCo council meetings for the 2012-2015 sessions.
seed_pages = [
    'http://www.legco.gov.hk/general/english/counmtg/yr12-16/mtg_1213.htm',
    'http://www.legco.gov.hk/general/english/counmtg/yr12-16/mtg_1314.htm',
    'http://www.legco.gov.hk/general/english/counmtg/yr12-16/mtg_1415.htm',
]
def crawl_seed(seed):
    """Scrape one index page and return anchor names of the form 'cm20YYMMDD'.

    NOTE(review): this relies on PyQuery's ``map`` dropping callback results
    that are ``None``, so anchors without a ``name`` attribute disappear
    before the prefix filter runs -- confirm against the pyquery docs.
    """
    doc = pq(seed)
    names = doc('a').map(lambda idx, el: el.attrib.get('name', None))
    return names.filter(lambda idx, name: name.startswith('cm20'))
# Concatenate the per-page results into one flat list of meeting ids.
# NOTE(review): relies on crawl_seed's return value supporting list
# concatenation via list.__add__ -- confirm (PyQuery objects subclass list).
meetings = reduce(list.__add__, map(crawl_seed, seed_pages), [])
print(meetings)
['cm20121010', 'cm20121017', 'cm20121024', 'cm20121031', 'cm20121101', 'cm20121107', 'cm20121114', 'cm20121121', 'cm20121128', 'cm20121205', 'cm20121210', 'cm20121212', 'cm20121219', 'cm20130109', 'cm20130116', 'cm20130117', 'cm20130123', 'cm20130130', 'cm20130206', 'cm20130220', 'cm20130227', 'cm20130320', 'cm20130327', 'cm20130417', 'cm20130424', 'cm20130508', 'cm20130509', 'cm20130515', 'cm20130522', 'cm20130529', 'cm20130605', 'cm20130619', 'cm20130626', 'cm20130703', 'cm20130710', 'cm20130711', 'cm20130717', 'cm20131009', 'cm20131016', 'cm20131017', 'cm20131023', 'cm20131030', 'cm20131106', 'cm20131113', 'cm20131120', 'cm20131127', 'cm20131204', 'cm20131211', 'cm20131218', 'cm20140108', 'cm20140115', 'cm20140116', 'cm20140122', 'cm20140212', 'cm20140219', 'cm20140226', 'cm20140319', 'cm20140326', 'cm20140409', 'cm20140416', 'cm20140430', 'cm20140507', 'cm20140514', 'cm20140521', 'cm20140522', 'cm20140528', 'cm20140604', 'cm20140611', 'cm20140618', 'cm20140625', 'cm20140702', 'cm20140703', 'cm20140709', 'cm20141008', 'cm20141015', 'cm20141016', 'cm20141022', 'cm20141029', 'cm20141105', 'cm20141112', 'cm20141120', 'cm20141126', 'cm20141203', 'cm20141210', 'cm20141217', 'cm20150107', 'cm20150114', 'cm20150115', 'cm20150121', 'cm20150128', 'cm20150204', 'cm20150211', 'cm20150225', 'cm20150318', 'cm20150325', 'cm20150326', 'cm20150415', 'cm20150422', 'cm20150429', 'cm20150506', 'cm20150513', 'cm20150520', 'cm20150527', 'cm20150603', 'cm20150610', 'cm20150617', 'cm20150624', 'cm20150708']
from IPython.core.display import clear_output
import sys
def crawl_xml(meeting):
# This logic is translated from the official JS code
yy, mm, dd = map(lambda i: int(meeting[i:(i + 2)]), [4, 6, 8])
if mm >= 10:
yr = 'yr%02d-%02d' % (yy, yy + 1)
else:
yr = 'yr%02d-%02d' % (yy - 1, yy)
prefix = 'http://www.legco.gov.hk'
url = '%(prefix)s/%(yr)s/chinese/counmtg/voting/cm_vote_20%(yy)02d%(mm)02d%(dd)02d.xml' % locals()
return requests.get(url)
# Download every meeting's voting XML, refreshing a simple text progress bar.
vote_xmls = []
for meeting in meetings:
    vote_xmls.append(crawl_xml(meeting))
    clear_output()
    done = len(vote_xmls)
    print('progress: %s/%s %s' % (done, len(meetings), '#' * done))
    sys.stdout.flush()
progress: 108/108 ############################################################################################################
# Drop failed requests (404 etc. -- not every meeting has a vote XML),
# then reduce each response to its raw XML payload.
vote_xmls = [r for r in vote_xmls if r.ok]
vote_xmls = [r.content for r in vote_xmls]
print(len(vote_xmls))
69
# Metadata fields extracted for each vote, useful for reviewing the result.
info_fields = [
    'vote-date',
    'vote-time',
    'motion-en',
    'mover-en',
    'mover-type',
    'vote-separate-mechanism',
]
def xml_to_records(xml):
    """Parse one voting XML document into flat (topic, member, vote, *info) tuples."""
    doc = etree.XML(xml)
    out = []
    for vote_node in doc.xpath('//legcohk-vote/meeting/vote'):
        info = [vote_node.xpath(f)[0].text for f in info_fields]
        date = info[0]
        # Topic id = vote date plus the vote's sequence number within the meeting.
        topic_id = '%s-%s' % (date, vote_node.attrib['number'])
        for member in vote_node.xpath('individual-votes/member'):
            # Chinese name used as member id (this targets a local blog post);
            # 'name-en' would be the English alternative for simplicity.
            member_id = member.attrib['name-ch']
            decision = member.xpath('vote')[0].text
            out.append((topic_id, member_id, decision) + tuple(info))
    return out
records = reduce(list.__add__, map(xml_to_records, vote_xmls), [])
# Sanity check: the same member appears under two near-identical spellings
# (強 vs 强); by the vote counts they seem to be the same person.
rec_guo_1 = [(rec[0], rec[1]) for rec in records if rec[1] == '郭偉強']
rec_guo_2 = [(rec[0], rec[1]) for rec in records if rec[1] == '郭偉强']
print(len(rec_guo_1))
print(len(rec_guo_2))
print(len(rec_guo_1) + len(rec_guo_2))
-c:1: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal -c:2: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
0 0 0
# Canonical-name fixes for members appearing under multiple spellings.
# More background:
# http://nbviewer.ipython.org/urls/course.ie.cuhk.edu.hk/~engg4030/tutorial/tutorial7/Legco-Preprocessing.ipynb
NAME_FIX = {
    'Dr Joseph LEE': 'Prof Joseph LEE',
    '郭偉强': '郭偉強',
}
def clean_record(t):
    """Return the record tuple with the member name normalized via NAME_FIX."""
    fields = list(t)
    # Field 1 is the member id; map known aliases to the canonical spelling.
    fields[1] = NAME_FIX.get(fields[1], fields[1])
    # Additional normalization rules, if any, would go here.
    return tuple(fields)
# Normalize every record and persist the full table (with motion metadata).
records = [clean_record(rec) for rec in records]
df = pd.DataFrame(data=records, columns=['topic_id', 'member_id', 'vote'] + info_fields)
df.to_csv('records-all-with-info.csv', encoding='utf-8')
df[:5]
topic_id | member_id | vote | vote-date | vote-time | motion-en | mover-en | mover-type | vote-separate-mechanism | |
---|---|---|---|---|---|---|---|---|---|
0 | 17/10/2012-1 | 曾鈺成 | Present | 17/10/2012 | 19:37:53 | AMENDMENT BY DR HON KENNETH CHAN TO HON IP KIN... | Dr Kenneth CHAN | Member | Yes |
1 | 17/10/2012-1 | 何俊仁 | Yes | 17/10/2012 | 19:37:53 | AMENDMENT BY DR HON KENNETH CHAN TO HON IP KIN... | Dr Kenneth CHAN | Member | Yes |
2 | 17/10/2012-1 | 李卓人 | Yes | 17/10/2012 | 19:37:53 | AMENDMENT BY DR HON KENNETH CHAN TO HON IP KIN... | Dr Kenneth CHAN | Member | Yes |
3 | 17/10/2012-1 | 涂謹申 | Yes | 17/10/2012 | 19:37:53 | AMENDMENT BY DR HON KENNETH CHAN TO HON IP KIN... | Dr Kenneth CHAN | Member | Yes |
4 | 17/10/2012-1 | 陳鑑林 | No | 17/10/2012 | 19:37:53 | AMENDMENT BY DR HON KENNETH CHAN TO HON IP KIN... | Dr Kenneth CHAN | Member | Yes |
# Keep only the core columns needed for the vote matrix and save a slim CSV.
core_cols = ['topic_id', 'member_id', 'vote']
df = df[core_cols]
df.to_csv('records-all.csv', encoding='utf-8')
df[:5]
topic_id | member_id | vote | |
---|---|---|---|
0 | 17/10/2012-1 | 曾鈺成 | Present |
1 | 17/10/2012-1 | 何俊仁 | Yes |
2 | 17/10/2012-1 | 李卓人 | Yes |
3 | 17/10/2012-1 | 涂謹申 | Yes |
4 | 17/10/2012-1 | 陳鑑林 | No |
# Quick dataset summary: unique topics, unique members, total records.
for label, series in (('topics', df['topic_id']), ('members', df['member_id'])):
    print('total # of %s:' % label, len(series.unique()))
print('total # of records:', len(df))
('total # of topics:', 2310) ('total # of members:', 71) ('total # of records:', 161700)
# The distinct vote categories present in the data.
print(df['vote'].unique())
['Present' 'Yes' 'No' 'Absent' 'Abstain']
# All member names (Chinese) after the NAME_FIX normalization.
print(df['member_id'].unique())
[u'\u66fe\u923a\u6210' u'\u4f55\u4fca\u4ec1' u'\u674e\u5353\u4eba' u'\u6d82\u8b39\u7533' u'\u9673\u9451\u6797' u'\u6881\u8000\u5fe0' u'\u5289\u7687\u767c' u'\u5289\u6167\u537f' u'\u8b5a\u8000\u5b97' u'\u77f3\u79ae\u8b19' u'\u5f35\u5b87\u4eba' u'\u99ae\u6aa2\u57fa' u'\u65b9\u525b' u'\u738b\u570b\u8208' u'\u674e\u570b\u9e9f' u'\u6797\u5065\u92d2' u'\u6881\u541b\u5f65' u'\u9ec3\u5b9a\u5149' u'\u6e6f\u5bb6\u9a4a' u'\u4f55\u79c0\u862d' u'\u674e\u6167\u743c' u'\u6797\u5927\u8f1d' u'\u9673\u514b\u52e4' u'\u9673\u5065\u6ce2' u'\u6881\u7f8e\u82ac' u'\u6881\u5bb6\u9a2e' u'\u5f35\u570b\u67f1' u'\u9ec3\u570b\u5065' u'\u8449\u570b\u8b19' u'\u8449\u5289\u6dd1\u5100' u'\u8b1d\u5049\u4fca' u'\u6881\u5bb6\u5091' u'\u6881\u570b\u96c4' u'\u9673\u5049\u696d' u'\u9ec3\u6bd3\u6c11' u'\u6bdb\u5b5f\u975c' u'\u7530\u5317\u8fb0' u'\u7530\u5317\u4fca' u'\u5433\u4eae\u661f' u'\u4f55\u4fca\u8ce2' u'\u6613\u5fd7\u660e' u'\u80e1\u5fd7\u5049' u'\u59da\u601d\u69ae' u'\u8303\u570b\u5a01' u'\u99ac\u9022\u570b' u'\u83ab\u4e43\u5149' u'\u9673\u5fd7\u5168' u'\u9673\u6052\u944c' u'\u9673\u5bb6\u6d1b' u'\u9673\u5a49\u5afb' u'\u6881\u5fd7\u7965' u'\u6881\u7e7c\u660c' u'\u9ea5\u7f8e\u5a1f' u'\u90ed\u5bb6\u9e92' u'\u90ed\u5049\u5f37' u'\u90ed\u69ae\u93d7' u'\u5f35\u83ef\u5cf0' u'\u5f35\u8d85\u96c4' u'\u55ae\u4ef2\u5055' u'\u9ec3\u78a7\u96f2' u'\u8449\u5efa\u6e90' u'\u845b\u73ee\u5e06' u'\u5ed6\u9577\u6c5f' u'\u6f58\u5146\u5e73' u'\u9127\u5bb6\u5f6a' u'\u8523\u9e97\u82b8' u'\u76e7\u5049\u570b' u'\u937e\u570b\u658c' u'\u937e\u6a39\u6839' u'\u8b1d\u5049\u9293' u'\u90ed\u5049\u5f3a']
# All topic ids (vote date + sequence number).
print(df['topic_id'].unique())
['17/10/2012-1' '17/10/2012-2' '17/10/2012-3' ..., '13/02/2015-7' '13/02/2015-8' '13/02/2015-9']
# Spot-check one member's vote distribution (the President, who mostly abstains as 'Present').
df[df['member_id'] == '曾鈺成']['vote'].value_counts()
/usr/local/lib/python2.7/site-packages/pandas/core/ops.py:558: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal result = lib.scalar_compare(x, y, op)
Series([], dtype: int64)
# Leader board: for each vote category, the five members who cast it most often.
board_pos = pd.DataFrame(index=range(0, 5))
for v in df['vote'].unique():
    # DataFrame.sort() was removed in pandas 0.20; sort_values is the
    # supported equivalent.
    count = (df[df['vote'] == v]
             .groupby('member_id').count()
             .sort_values('vote', ascending=False)['vote'])
    count = count.reset_index()[:5]
    # Materialize zip() -- it is a lazy iterator on Python 3 -- so the Series
    # gets concrete (member, count) tuples.
    board_pos[v] = pd.Series(list(zip(count['member_id'], count['vote'])),
                             index=range(0, 5))
board_pos
Present | Yes | No | Absent | Abstain | |
---|---|---|---|---|---|
0 | (曾鈺成, 2284) | (梁國雄, 1682) | (盧偉國, 1919) | (梁家騮, 2007) | (范國威, 426) |
1 | (張國柱, 794) | (陳偉業, 1342) | (譚耀宗, 1861) | (梁耀忠, 1881) | (馬逢國, 121) |
2 | (劉慧卿, 718) | (陳志全, 1331) | (何俊賢, 1861) | (湯家驊, 1881) | (李國麟, 115) |
3 | (何秀蘭, 708) | (莫乃光, 453) | (黃定光, 1851) | (涂謹申, 1822) | (葉國謙, 112) |
4 | (單仲偕, 702) | (黃毓民, 411) | (張華峰, 1807) | (葉建源, 1813) | (何俊賢, 110) |
# Reverse leader board: for each vote category, the five members who cast it least often.
board_neg = pd.DataFrame(index=range(0, 5))
for v in df['vote'].unique():
    # DataFrame.sort() was removed in pandas 0.20; sort_values is the
    # supported equivalent.
    count = (df[df['vote'] == v]
             .groupby('member_id').count()
             .sort_values('vote', ascending=True)['vote'])
    count = count.reset_index()[:5]
    # Materialize zip() -- it is a lazy iterator on Python 3 -- so the Series
    # gets concrete (member, count) tuples.
    board_neg[v] = pd.Series(list(zip(count['member_id'], count['vote'])),
                             index=range(0, 5))
board_neg
Present | Yes | No | Absent | Abstain | |
---|---|---|---|---|---|
0 | (葉國謙, 1) | (石禮謙, 95) | (涂謹申, 64) | (曾鈺成, 26) | (郭榮鏗, 13) |
1 | (梁耀忠, 1) | (劉皇發, 106) | (湯家驊, 67) | (潘兆平, 32) | (方剛, 13) |
2 | (田北俊, 1) | (梁家騮, 129) | (馮檢基, 70) | (譚耀宗, 50) | (郭偉强, 14) |
3 | (吳亮星, 1) | (郭偉強, 131) | (何俊仁, 71) | (郭偉強, 50) | (劉皇發, 15) |
4 | (陳志全, 1) | (林大輝, 132) | (葉建源, 72) | (何俊賢, 53) | (馮檢基, 16) |
# Pivot the flat records into a member x topic matrix of raw vote strings.
df_matrix = pd.DataFrame(index=df['member_id'].unique())
for topic, group in df.groupby('topic_id'):
    df_matrix[topic] = group.set_index('member_id')['vote']
df_matrix[:5]
01/02/2013-1 | 01/02/2013-2 | 01/02/2013-3 | 01/02/2013-4 | 01/02/2013-5 | 01/02/2013-6 | 01/02/2013-7 | 01/02/2013-8 | 03/07/2013-1 | 03/07/2013-10 | ... | 31/10/2012-2 | 31/10/2012-3 | 31/10/2012-4 | 31/10/2012-5 | 31/10/2012-6 | 31/10/2012-7 | 31/10/2012-8 | 31/10/2012-9 | 31/10/2014-38 | 31/10/2014-39 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
曾鈺成 | Present | Present | Present | Present | Present | Present | Present | Present | Present | Present | ... | Present | Present | Present | Present | Present | Present | Present | Present | Present | Present |
何俊仁 | Yes | Yes | Yes | Yes | No | Yes | Yes | No | Yes | Yes | ... | Yes | Yes | Yes | Yes | Yes | Yes | No | No | Yes | No |
李卓人 | Yes | Yes | Yes | Yes | Yes | Yes | Yes | No | Yes | Yes | ... | Absent | Absent | Absent | Absent | Absent | Absent | Absent | Absent | Yes | No |
涂謹申 | Yes | Yes | Yes | Yes | No | Yes | Yes | No | Yes | Yes | ... | Yes | Yes | Yes | Absent | Absent | Absent | Absent | Absent | Yes | No |
陳鑑林 | No | No | No | No | Abstain | No | No | Yes | No | Abstain | ... | Abstain | Abstain | Abstain | Abstain | Abstain | Abstain | Yes | Abstain | No | Yes |
5 rows × 2310 columns
def to_numeric(x):
    """Map a vote Series to numbers: 'Yes' -> 1, 'No' -> -1, anything else -> 0.

    Returns a new Series instead of mutating the argument in place: pandas
    does not guarantee that mutation inside DataFrame.apply propagates back
    (apply may hand the function a copy), so the caller assigns the result.
    """
    return x.map(lambda v: 1 if v == 'Yes' else (-1 if v == 'No' else 0))
df_matrix = df_matrix.apply(to_numeric)
df_matrix[:5]
01/02/2013-1 | 01/02/2013-2 | 01/02/2013-3 | 01/02/2013-4 | 01/02/2013-5 | 01/02/2013-6 | 01/02/2013-7 | 01/02/2013-8 | 03/07/2013-1 | 03/07/2013-10 | ... | 31/10/2012-2 | 31/10/2012-3 | 31/10/2012-4 | 31/10/2012-5 | 31/10/2012-6 | 31/10/2012-7 | 31/10/2012-8 | 31/10/2012-9 | 31/10/2014-38 | 31/10/2014-39 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
曾鈺成 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
何俊仁 | 1 | 1 | 1 | 1 | -1 | 1 | 1 | -1 | 1 | 1 | ... | 1 | 1 | 1 | 1 | 1 | 1 | -1 | -1 | 1 | -1 |
李卓人 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | -1 | 1 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -1 |
涂謹申 | 1 | 1 | 1 | 1 | -1 | 1 | 1 | -1 | 1 | 1 | ... | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | -1 |
陳鑑林 | -1 | -1 | -1 | -1 | 0 | -1 | -1 | 1 | -1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | -1 | 1 |
5 rows × 2310 columns
# Center the member-vote matrix and form the (topic x topic) scatter matrix
# for PCA.  DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is
# the supported replacement.
X = np.matrix(df_matrix.to_numpy()).astype('float')
X = X - np.mean(X, 0)
C = X.T * X
print(C.shape)
(2310, 2310)
import scipy.sparse.linalg
# Top-2 eigenvectors (largest magnitude) of C: the principal axes.
_eigvals, PA = scipy.sparse.linalg.eigs(C, k=2, which='LM', return_eigenvectors=True)
# Use the following one to pop up the 3D plot in another window.
# Good for interactive exploration.
# May or may not be available due to your desktop.
#%pylab
#matplotlib.use('TkAgg')
# Use the following one to plot inline (embedded in this notebook).
#%pylab inline
# Try 3D: top-3 principal axes.
PA = scipy.sparse.linalg.eigs(C, k=3, which='LM', return_eigenvectors=True)[1]
# Project data points onto the principal axes.
X_3D = PA.T * X.T
print(X_3D.shape)
# eigs returns complex-typed eigenvectors (imaginary parts ~0 for this real
# symmetric matrix); take the real part explicitly instead of letting
# astype('float') discard it with a ComplexWarning.
x = np.real(np.array(X_3D[0, :])).astype('float')
y = np.real(np.array(X_3D[1, :])).astype('float')
z = np.real(np.array(X_3D[2, :])).astype('float')
# 3D scatter of members projected onto the top-3 principal axes;
# picker=True enables point-picking in interactive backends.
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x, y, z, picker=True, s=100)
(3, 71)
-c:16: ComplexWarning: Casting complex values to real discards the imaginary part -c:17: ComplexWarning: Casting complex values to real discards the imaginary part -c:18: ComplexWarning: Casting complex values to real discards the imaginary part
<mpl_toolkits.mplot3d.art3d.Patch3DCollection at 0x10a0a0390>
matplotlib.rcParams.update({'font.size': 22})
# Eigenvector sign is arbitrary; flip PC1 so the spectrum reads naturally.
x = -x
df_pc1 = pd.DataFrame(x.flatten(), index=df_matrix.index, columns=['PC1'])
# DataFrame.sort() was removed in pandas 0.20; use sort_values instead.
df_pc1 = df_pc1.sort_values('PC1')
fig = plt.figure(figsize=(10, 30))
ax = fig.add_subplot(1, 1, 1)
ax.plot(df_pc1['PC1'], df_pc1.rank()['PC1'], 'd', markersize=16)
#yticks(df_pc1.rank()['PC1'], df_pc1.index)
# Label each point with the member's name, offset slightly from the marker.
for (_x, _y, _s) in zip(df_pc1['PC1'], df_pc1.rank()['PC1'], df_pc1.index):
    plt.annotate(_s, (_x, _y), xytext=(_x + 1, _y - 0.2))
plt.title('Spectrum from Principal Component 1')
plt.axis([-60, 50, 0, 75])
[-60, 50, 0, 75]