In [2]:
import pandas as pd
import numpy as np
import scipy.stats
from __future__ import division
from sklearn import datasets
from sklearn.decomposition import FactorAnalysis
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
%pylab inline

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) # annoying pandas bug
Populating the interactive namespace from numpy and matplotlib

TruthCoin

https://github.com/psztorc/Truthcoin

A simplified example of multi-event resolution

  • no reputation / stake. uniform vote weights
  • only binary events / discrete outcomes.
In [21]:
# events are columns of the voter matrix:
#   col[0] = Obama is the u.s. president (2014),
#   col[1] = Brazil won the 2014 fifa world cup,
#   col[2] = Djokovic won the 2014 wimbledon tennis championship,
#   col[3] = MtGox exchange goes insolvent (1Q 2014)
#   col[4] = Professor bitcorn won his bet ("I predict that Bitcoin will trade for under $10 a share by the first half of 2014")
#
# rows = voters; a cell of 1 means the voter reports the event as TRUE, 0 as FALSE.
# NOTE(review): np.matrix is deprecated in modern numpy -- np.array would be the
# forward-compatible choice, but downstream cells index this as a matrix.

VoterMatrix = np.matrix([
           [1, 0, 1, 1, 0], # first voter
           [1, 1, 1, 1, 0], # ignorant about sports
           [1, 0, 1, 1, 0],
           [0, 0, 1, 1, 0], # republican in denial
           [1, 0, 1, 1, 1]]) # prof bitcorn

print VoterMatrix

# wrap the same data in a DataFrame with named outcome columns for the
# normalization / covariance cells further down
features = ['outcome_1', 'outcome_2', 'outcome_3', 'outcome_4', 'outcome_5']
voteMatrix_pd = pd.DataFrame(VoterMatrix, columns=features)
#raw['class'] = y
print ' voteMatrix_pd:'
print voteMatrix_pd
[[1 0 1 1 0]
 [1 1 1 1 0]
 [1 0 1 1 0]
 [0 0 1 1 0]
 [1 0 1 1 1]]
 voteMatrix_pd:
   outcome_1  outcome_2  outcome_3  outcome_4  outcome_5
0          1          0          1          1          0
1          1          1          1          1          0
2          1          0          1          1          0
3          0          0          1          1          0
4          1          0          1          1          1

[5 rows x 5 columns]

Simple (non-SVD) event resolution

  • using a uniform reputation / voter weighting
In [16]:
def reWeight(Vec):
    """Get the relative influence of numbers, treat NaN as influence-less.

    Vec: array-like of weights; NaN entries contribute zero influence.
    Returns a float array of the same shape, rescaled to sum to 1.
    If every entry is 0 (or NaN), returns a uniform distribution instead
    of dividing by zero (consistent with getWeight's all-zero guard).
    """
    vec2 = np.array(Vec, dtype=float)
    vec2[np.isnan(vec2)] = 0.0       # NaN votes carry no influence
    total = np.sum(vec2)
    if total == 0:                   # avoid 0/0 -> NaN; fall back to uniform
        return np.full_like(vec2, 1.0 / vec2.size)
    return vec2 / total


# smoke-check: a uniform 4-vector reweights to equal shares (0.25 each)
rew = reWeight(np.array([1,1,1,1]))
print "reweighted vector test. uniform vector", rew

def getWeight(Vec, AddMean=0):
    """Takes an array (vector in practice), and returns proportional distance from zero.

    Vec: array-like of (possibly signed) weights.
    AddMean: if 1, add the mean of |Vec| to every element before
        normalizing (smooths extreme weights).
    Returns |Vec| normalized to sum to 1; an all-zero input yields a
    uniform vector instead of a division by zero.
    """
    New = np.abs(Vec)          # magnitude only; sign is ignored
    if AddMean == 1:           # optionally smooth by adding the mean
        # np.mean instead of the bare `mean` leaked in by %pylab
        New = New + np.mean(New)
    if np.sum(New) == 0:       # catch all-zero input: make it uniform, not NaN
        New = New + 1
    New = New / np.sum(New)    # normalize so weights sum to 1
    return(New)


# `array` comes from the %pylab star-import (it is numpy.array) --
# NOTE(review): an explicit np.array would not depend on the pylab namespace
uniformWeight = array([[1]]*len(VoterMatrix))
print "\nuniform weights:\n", uniformWeight
# uniform reputation: each of the 5 voters gets weight 1/5 = 0.2
uniformReputation = getWeight(uniformWeight)
print "\nuniform reputation:\n", uniformReputation
reweighted vector test. uniform vector [ 0.25  0.25  0.25  0.25]

uniform weights:
[[1]
 [1]
 [1]
 [1]
 [1]]

uniform reputation:
[[ 0.2]
 [ 0.2]
 [ 0.2]
 [ 0.2]
 [ 0.2]]

Weigh votes and resolve decisions.

Measure each decision by taking a dot product of its vote column with the reputation vector. Essentially this just uses the average vote value among all voters.

  • an SVD result would change the reputation vector, but without SVD it's a simple uniform vector (all votes equal).
In [11]:
# port of GetDecisionOutcomes()  https://github.com/psztorc/Truthcoin/blob/master/lib/consensus/ConsensusMechanism.r#L139

# VoterMatrix
# we're using the regular matrix here. data is not even zero-centered.

MaskedVoterMatrix = np.ma.masked_array(VoterMatrix, isnan(VoterMatrix))
matrix_mask_thingie = -MaskedVoterMatrix[...,0].mask
# not sure what the mask is for.
# corresponds to https://github.com/psztorc/Truthcoin/blob/master/pylib/consensus/consensus.py#L113-L114
row = reWeight( rep [ matrix_mask_thingie ] )
print "row:", row
col = MaskedVoterMatrix[matrix_mask_thingie, 0]
print "col:", col


decisions = []
for i in range(VoterMatrix.shape[1]):
    row = reWeight( rep [ -MaskedVoterMatrix[...,i].mask ] )
    col = MaskedVoterMatrix[ -MaskedVoterMatrix[...,i].mask, i]
    col = np.array(col, dtype=float)
    row = np.transpose(row)[0]
    decisions.append(np.dot(col, row))
    
print "\ndecisions:"
print decisions
row: [[ 0.2]
 [ 0.2]
 [ 0.2]
 [ 0.2]
 [ 0.2]]
col: [[1 1 1 0 1]]

decisions:
[0.80000000000000004, 0.20000000000000001, 1.0, 1.0, 0.20000000000000001]
  • that's the output for a simplified multi-decision resolution from votes on binary outcomes.
    • map values between [0,1] to one of {0, 0.5, 1}
  • in this simplified version there is no reputation or vote stake amounts. every vote is equal ([0.2, 0.2, 0.2, 0.2, 0.2])

  • an extended method would incorporate vote stake/deposit amounts to weigh votes. if we also add scaled/continuous outcomes then the resulting consensus method would be a form of multi-decision SchellingCoin (or equivalently, TruthCoin without reputation).

SVD for Reputation based voting

  • SVD operates on a covariance matrix. The covariance calculation needs a data matrix of normalized, continuous values.
In [23]:
# normalize each feature/column to mean = 0, std = 1
# data matrix needs to be normalized to get covariance and SVD
normed = voteMatrix_pd.copy()
for col in features:
    #normed[col] = normed[col].apply(lambda x: (x - normed[col].mean()) / normed[col].std())
    normed[col] = normed[col].apply(lambda x: (x - normed[col].mean()))

# normed data matrix is only zero-centered (not auto-scaled)
print '\nNormalized dataset:'
print normed[:5]
Normalized dataset:
   outcome_1  outcome_2  outcome_3  outcome_4  outcome_5
0        0.2       -0.2          0          0       -0.2
1        0.2        0.8          0          0       -0.2
2        0.2       -0.2          0          0       -0.2
3       -0.8       -0.2          0          0       -0.2
4        0.2       -0.2          0          0        0.8

[5 rows x 5 columns]

this covariance calc is from the original pca example

In [24]:
### since our data is already normalized, cov(x1, x2) = sum(x1*x2) / num_observations
#   ^^ old assumption from the original example. valid here??
cov_df = pd.DataFrame(index=features)
for colA in features:
    column = []
    for colB in features:
        cov = normed[colA].cov(normed[colB])
        column.append(cov)
    cov_df[colA] = column

print 'Covariance matrix:'
print cov_df
# everybody agrees on outcomes 3 & 4 (tennis winner, mtgox solvency), so those columns have zero variance
Covariance matrix:
           outcome_1  outcome_2  outcome_3  outcome_4  outcome_5
outcome_1       0.20       0.05          0          0       0.05
outcome_2       0.05       0.20          0          0      -0.05
outcome_3       0.00       0.00          0          0       0.00
outcome_4       0.00       0.00          0          0       0.00
outcome_5       0.05      -0.05          0          0       0.20

[5 rows x 5 columns]

3. Singular Value Decomposition (SVD)

[U]: Rows are the original features and columns are the PCA 'components'. Each cell gives the 'loading' of the feature on the corresponding component.

[S]: Represents how much variance is explained by each component.

In [8]:
# use scipy's SVD implementation -- NOTE(review): the original comment said
# "numpy's", but scipy.linalg.svd is what is called; scipy.linalg is only in
# scope here as a side effect of `import scipy.stats` at the top of the file.
u, s, v = scipy.linalg.svd(cov_df)
print 'U: (feature loading for each component)'
print pd.DataFrame(u, index=features) # first loading
print '\nExplained variance:\n', s

# project the covariance matrix onto the components; row 0 of the transpose
# is each feature's score on the first (largest-variance) component
firstScore = np.transpose(np.dot(cov_df, u))[0]
print "\nfirstScore:"
print firstScore

# two candidate shifts of the first-component scores: Set1 shifts so the
# minimum is 0, Set2 so the maximum is 0 (cf. the R consensus code linked
# in the next cell)
Set1 = firstScore + abs(min(firstScore))
print "\nSet1:"
print Set1
Set2 = firstScore - max(firstScore)
print "Set2:"
print Set2
U: (feature loading for each component)
                  0             1        2  3  4
outcome_1 -0.816497 -8.088567e-17  0.57735  0  0
outcome_2 -0.408248 -7.071068e-01 -0.57735  0  0
outcome_3  0.000000  0.000000e+00  0.00000  1  0
outcome_4  0.000000  0.000000e+00  0.00000  0  1
outcome_5 -0.408248  7.071068e-01 -0.57735  0  0

[5 rows x 5 columns]

Explained variance:
[ 0.25  0.25  0.1   0.    0.  ]

firstScore:
[-0.20412415 -0.10206207  0.          0.         -0.10206207]

Set1:
[ 0.          0.10206207  0.20412415  0.20412415  0.10206207]
Set2:
[-0.20412415 -0.10206207  0.          0.         -0.10206207]
In [10]:
# note on these two sets: https://github.com/psztorc/Truthcoin/blob/master/lib/consensus/ConsensusMechanism.r#L40-L51
# score each outcome by the dot product of the shifted first-component scores
# with the vote matrix, then renormalize with getWeight to get two candidate
# weight vectors (New1 from the min-shifted set, New2 from the max-shifted set)
New1 = getWeight(np.dot(Set1, voteMatrix_pd))
print "\nNew1:"
print New1
New2 = getWeight(np.dot(Set2, voteMatrix_pd))
print "New2:"
print New2
New1:
[ 0.22222222  0.05555556  0.33333333  0.33333333  0.05555556]
New2:
[ 0.28571429  0.07142857  0.28571429  0.28571429  0.07142857]

Incomplete: further calculations follow for adjusting voter reputations from the SVD scores.

In [ ]: