In [2]:
import pandas as pd
import numpy as np
import scipy.stats
from __future__ import division
from sklearn import datasets
from sklearn.decomposition import FactorAnalysis
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
%pylab inline

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) # annoying pandas bug

Populating the interactive namespace from numpy and matplotlib


TruthCoin¶

https://github.com/psztorc/Truthcoin

A simplified example of multi-event resolution¶

• no reputation / stake. uniform vote weights
• only binary events / discrete outcomes.
In [21]:
# events are columns of the voter matrix:
#   col[0] = Obama is the u.s. president (2014),
#   col[1] = Brazil won the 2014 fifa world cup,
#   col[2] = Djokovic won the 2014 wimbledon tennis championship,
#   col[3] = MtGox exchange goes insolvent (1Q 2014)
#   col[4] = Professor bitcorn won his bet ("I predict that Bitcoin will trade for under \$10 a share by the first half of 2014")
#

VoterMatrix = np.matrix([
[1, 0, 1, 1, 0], # first voter
[1, 1, 1, 1, 0], # ignorant about sports
[1, 0, 1, 1, 0],
[0, 0, 1, 1, 0], # republican in denial
[1, 0, 1, 1, 1]]) # prof bitcorn

print VoterMatrix

features = ['outcome_1', 'outcome_2', 'outcome_3', 'outcome_4', 'outcome_5']
voteMatrix_pd = pd.DataFrame(VoterMatrix, columns=features)
#raw['class'] = y
print ' voteMatrix_pd:'
print voteMatrix_pd

[[1 0 1 1 0]
[1 1 1 1 0]
[1 0 1 1 0]
[0 0 1 1 0]
[1 0 1 1 1]]
voteMatrix_pd:
outcome_1  outcome_2  outcome_3  outcome_4  outcome_5
0          1          0          1          1          0
1          1          1          1          1          0
2          1          0          1          1          0
3          0          0          1          1          0
4          1          0          1          1          1

[5 rows x 5 columns]


Simple (non-SVD) event resolution¶

• using a uniform reputation / voter weighting
In [16]:
def reWeight(Vec):
    """Get the relative influence of numbers, treat NaN as influence-less.

    Parameters
    ----------
    Vec : array-like
        Numbers to turn into relative weights.  NaN entries are given
        zero influence.

    Returns
    -------
    numpy.ndarray
        Float array of the same shape, where the non-NaN entries are
        divided by their total.  If the total is 0 the division produces
        NaN/inf, matching the original element-wise loop's behavior.
    """
    vec2 = np.array(Vec, dtype=float)
    # np.isnan instead of the bare `isnan` from the %pylab star-import,
    # and a boolean mask instead of a Python loop over indices.
    vec2[np.isnan(vec2)] = 0.0
    return vec2 / np.sum(vec2)

# Sanity check: a uniform vector reweights to equal fractions (1/n each).
rew = reWeight(np.array([1,1,1,1]))
print "reweighted vector test. uniform vector", rew

# NOTE(review): the `def` header line was lost in this notebook's text dump
# (the function is called below at `getWeight(uniformWeight)`); restored to
# match the upstream Truthcoin reference implementation's GetWeight().
def getWeight(Vec, AddMean=0):
    """Takes an array (vector in practice), and returns proportional distance from zero.

    Parameters
    ----------
    Vec : array-like
        Input values; only their magnitudes matter.
    AddMean : int, optional
        If 1, add the mean of |Vec| to each element before normalizing.

    Returns
    -------
    numpy.ndarray
        |Vec| normalized to sum to 1.  An all-zero input is bumped to a
        uniform vector instead of dividing by zero.
    """
    New = np.abs(np.array(Vec, dtype=float))  # absolute value (np.abs, not pylab's)
    if AddMean == 1:                          # add the mean to each element
        New = New + np.mean(New)
    if np.sum(New) == 0:                      # catch the all-zero case
        New = New + 1
    return New / np.sum(New)                  # normalize

# Every voter gets the same weight (no stake/reputation yet), so the
# normalized reputation vector is simply 1/n per voter.
# NOTE(review): bare `array` comes from the %pylab star-import (np.array).
uniformWeight = array([[1]]*len(VoterMatrix))
print "\nuniform weights:\n", uniformWeight
uniformReputation = getWeight(uniformWeight)
print "\nuniform reputation:\n", uniformReputation

reweighted vector test. uniform vector [ 0.25  0.25  0.25  0.25]

uniform weights:
[[1]
[1]
[1]
[1]
[1]]

uniform reputation:
[[ 0.2]
[ 0.2]
[ 0.2]
[ 0.2]
[ 0.2]]


Measure each decision by taking a dot product: essentially this just uses the average vote value among all voters.

• an SVD result would change the reputation vector, but without SVD it's a simple uniform vector (all votes equal).
In [11]:
# port of GetDecisionOutcomes()  https://github.com/psztorc/Truthcoin/blob/master/lib/consensus/ConsensusMechanism.r#L139

# VoterMatrix
# we're using the regular matrix here. data is not even zero-centered.

# not sure what the mask is for.
# corresponds to https://github.com/psztorc/Truthcoin/blob/master/pylib/consensus/consensus.py#L113-L114
row = reWeight( rep [ matrix_mask_thingie ] )
print "row:", row
print "col:", col

decisions = []
for i in range(VoterMatrix.shape[1]):
col = np.array(col, dtype=float)
row = np.transpose(row)[0]
decisions.append(np.dot(col, row))

print "\ndecisions:"
print decisions

row: [[ 0.2]
[ 0.2]
[ 0.2]
[ 0.2]
[ 0.2]]
col: [[1 1 1 0 1]]

decisions:
[0.80000000000000004, 0.20000000000000001, 1.0, 1.0, 0.20000000000000001]

• that's the output for a simplified multi-decision resolution from votes on binary outcomes.
• map values between [0,1] to one of {0, 0.5, 1}
• in this simplified version there is no reputation or vote stake amounts. every vote is equal ([0.2, 0.2, 0.2, 0.2, 0.2])

• an extended method would incorporate vote stake/deposit amounts to weigh votes. if we also add scaled/continuous outcomes then the resulting consensus method would be a form of multi-decision SchellingCoin (or equivalently, TruthCoin without reputation).

SVD for Reputation based voting¶

• SVD operates on a covariance matrix. covariance calc needs data matrix of normalized continuous values
In [23]:
# normalize each feature/column to mean = 0, std = 1
# data matrix needs to be normalized to get covariance and SVD
normed = voteMatrix_pd.copy()
for col in features:
#normed[col] = normed[col].apply(lambda x: (x - normed[col].mean()) / normed[col].std())
normed[col] = normed[col].apply(lambda x: (x - normed[col].mean()))

# normed data matrix is only zero-centered (not auto-scaled)
print '\nNormalized dataset:'
print normed[:5]

Normalized dataset:
outcome_1  outcome_2  outcome_3  outcome_4  outcome_5
0        0.2       -0.2          0          0       -0.2
1        0.2        0.8          0          0       -0.2
2        0.2       -0.2          0          0       -0.2
3       -0.8       -0.2          0          0       -0.2
4        0.2       -0.2          0          0        0.8

[5 rows x 5 columns]


this covariance calc is from the original pca example

In [24]:
### since our data is already normalized, cov(x1, x2) = sum(x1*x2) / num_observations
#   ^^ old assumption from the original example. valid here??
cov_df = pd.DataFrame(index=features)
for colA in features:
column = []
for colB in features:
cov = normed[colA].cov(normed[colB])
column.append(cov)
cov_df[colA] = column

print 'Covariance matrix:'
print cov_df
# everybody agrees on outcomes 3 & 4 (tennis winner, mtgox solvency), so those columns have zero variance

Covariance matrix:
outcome_1  outcome_2  outcome_3  outcome_4  outcome_5
outcome_1       0.20       0.05          0          0       0.05
outcome_2       0.05       0.20          0          0      -0.05
outcome_3       0.00       0.00          0          0       0.00
outcome_4       0.00       0.00          0          0       0.00
outcome_5       0.05      -0.05          0          0       0.20

[5 rows x 5 columns]


3. Singular Value Decomposition (SVD)¶

[U]: Rows are the original features and columns are the PCA 'components'. Each cell gives the 'loading' of the feature on the corresponding component.

[S]: Represents how much variance is explained by each component.

In [8]:
# use numpy's SVD implementation
u, s, v = scipy.linalg.svd(cov_df)
print '\nExplained variance:\n', s

firstScore = np.transpose(np.dot(cov_df, u))[0]
print "\nfirstScore:"
print firstScore

Set1 = firstScore + abs(min(firstScore))
print "\nSet1:"
print Set1
Set2 = firstScore - max(firstScore)
print "Set2:"
print Set2

U: (feature loading for each component)
0             1        2  3  4
outcome_1 -0.816497 -8.088567e-17  0.57735  0  0
outcome_2 -0.408248 -7.071068e-01 -0.57735  0  0
outcome_3  0.000000  0.000000e+00  0.00000  1  0
outcome_4  0.000000  0.000000e+00  0.00000  0  1
outcome_5 -0.408248  7.071068e-01 -0.57735  0  0

[5 rows x 5 columns]

Explained variance:
[ 0.25  0.25  0.1   0.    0.  ]

firstScore:
[-0.20412415 -0.10206207  0.          0.         -0.10206207]

Set1:
[ 0.          0.10206207  0.20412415  0.20412415  0.10206207]
Set2:
[-0.20412415 -0.10206207  0.          0.         -0.10206207]

In [10]:
# note on these two sets: https://github.com/psztorc/Truthcoin/blob/master/lib/consensus/ConsensusMechanism.r#L40-L51
# Project the vote matrix onto each candidate score vector (Set1/Set2 from
# the SVD cell) and renormalize into per-decision weights; upstream then
# picks whichever set agrees better with the previous reputation.
New1 = getWeight(np.dot(Set1, voteMatrix_pd))
print "\nNew1:"
print New1
New2 = getWeight(np.dot(Set2, voteMatrix_pd))
print "New2:"
print New2

New1:
[ 0.22222222  0.05555556  0.33333333  0.33333333  0.05555556]
New2:
[ 0.28571429  0.07142857  0.28571429  0.28571429  0.07142857]