from __future__ import division
from __future__ import print_function

import warnings

import numpy as np
import pandas as pd
import scipy.stats

from sklearn import datasets
from sklearn.decomposition import FactorAnalysis
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA

%pylab inline
warnings.filterwarnings("ignore", category=DeprecationWarning) # annoying pandas bug
Populating the interactive namespace from numpy and matplotlib
**Reference implementation: the Truthcoin consensus mechanism — https://github.com/psztorc/Truthcoin**
# events are columns of the voter matrix:
# col[0] = Obama is the u.s. president (2014),
# col[1] = Brazil won the 2014 fifa world cup,
# col[2] = Djokovic won the 2014 wimbledon tennis championship,
# col[3] = MtGox exchange goes insolvent (1Q 2014)
# col[4] = Professor bitcorn won his bet ("I predict that Bitcoin will trade for under $10 a share by the first half of 2014")
#
# rows are voters; each entry is that voter's binary vote on the event's outcome
VoterMatrix = np.matrix([
[1, 0, 1, 1, 0], # first voter
[1, 1, 1, 1, 0], # ignorant about sports
[1, 0, 1, 1, 0],
[0, 0, 1, 1, 0], # republican in denial
[1, 0, 1, 1, 1]]) # prof bitcorn
print(VoterMatrix)
features = ['outcome_1', 'outcome_2', 'outcome_3', 'outcome_4', 'outcome_5']
voteMatrix_pd = pd.DataFrame(VoterMatrix, columns=features)
print(' voteMatrix_pd:')
print(voteMatrix_pd)
[[1 0 1 1 0] [1 1 1 1 0] [1 0 1 1 0] [0 0 1 1 0] [1 0 1 1 1]] voteMatrix_pd: outcome_1 outcome_2 outcome_3 outcome_4 outcome_5 0 1 0 1 1 0 1 1 1 1 1 0 2 1 0 1 1 0 3 0 0 1 1 0 4 1 0 1 1 1 [5 rows x 5 columns]
def reWeight(Vec):
    """Return each entry's relative share of the vector total.

    NaN entries are treated as influence-less (set to 0 before
    normalizing). If the non-NaN entries sum to zero, the division
    yields NaN/inf with a numpy warning, matching the original
    element-wise behavior.
    """
    weights = np.array(Vec, dtype=float)
    weights[np.isnan(weights)] = 0.0  # NaN = no influence
    # vectorized normalization replaces the original per-element loops;
    # np.isnan is used explicitly instead of the bare `isnan` that
    # depended on the %pylab star import
    return weights / np.sum(weights)
# sanity check: a uniform vector should reweight to equal shares
rew = reWeight(np.array([1, 1, 1, 1]))
print("reweighted vector test. uniform vector", rew)
def getWeight(Vec, AddMean=0):
    """Return each element's proportional (absolute) distance from zero.

    Parameters
    ----------
    Vec : array-like
        Input vector (or column vector); shape is preserved.
    AddMean : int, optional
        If 1, add the mean of |Vec| to every element before normalizing.

    Returns
    -------
    ndarray of the same shape as Vec whose entries sum to 1.
    """
    New = np.abs(Vec)                  # absolute value
    if AddMean == 1:                   # add the mean to each element
        New = New + np.mean(New)
    if np.sum(New) == 0:               # all-zero input: fall back to uniform
        New = New + 1
    return New / np.sum(New)           # normalize
# one unit of weight per voter; np.array is used explicitly instead of the
# bare `array` that depended on the %pylab star import
uniformWeight = np.array([[1]] * len(VoterMatrix))
print("\nuniform weights:\n", uniformWeight)
uniformReputation = getWeight(uniformWeight)
print("\nuniform reputation:\n", uniformReputation)
reweighted vector test. uniform vector [ 0.25 0.25 0.25 0.25] uniform weights: [[1] [1] [1] [1] [1]] uniform reputation: [[ 0.2] [ 0.2] [ 0.2] [ 0.2] [ 0.2]]
Measure each decision by taking the dot product of its vote column with the reputation vector. Since reputation is uniform here, this is simply the average vote value among all voters.
# port of GetDecisionOutcomes() https://github.com/psztorc/Truthcoin/blob/master/lib/consensus/ConsensusMechanism.r#L139
# VoterMatrix
# we're using the regular matrix here. data is not even zero-centered.
# BUGFIX: `rep` was never defined (NameError); the printed output (0.2 each)
# shows it must be the uniform reputation vector.
rep = uniformReputation
MaskedVoterMatrix = np.ma.masked_array(VoterMatrix, np.isnan(VoterMatrix))
# the mask marks NaN (non-participating) votes, so each decision is averaged
# only over the voters who actually voted on it; `~` inverts the boolean mask
# (unary `-` on a boolean array is an error in modern numpy)
matrix_mask_thingie = ~MaskedVoterMatrix[..., 0].mask
# corresponds to https://github.com/psztorc/Truthcoin/blob/master/pylib/consensus/consensus.py#L113-L114
row = reWeight(rep[matrix_mask_thingie])
print("row:", row)
col = MaskedVoterMatrix[matrix_mask_thingie, 0]
print("col:", col)
decisions = []
for i in range(VoterMatrix.shape[1]):
    active = ~MaskedVoterMatrix[..., i].mask   # voters who voted on decision i
    row = reWeight(rep[active])                # their renormalized reputations
    col = np.array(MaskedVoterMatrix[active, i], dtype=float)
    row = np.transpose(row)[0]
    decisions.append(np.dot(col, row))         # reputation-weighted outcome
print("\ndecisions:")
print(decisions)
row: [[ 0.2] [ 0.2] [ 0.2] [ 0.2] [ 0.2]] col: [[1 1 1 0 1]] decisions: [0.80000000000000004, 0.20000000000000001, 1.0, 1.0, 0.20000000000000001]
That's the output for a simplified multi-decision resolution from votes on binary outcomes.
In this simplified version there are no reputations or vote stake amounts: every vote is weighted equally ([0.2, 0.2, 0.2, 0.2, 0.2]).
An extended method would incorporate vote stake/deposit amounts to weigh votes. If we also add scaled/continuous outcomes, then the resulting consensus method would be a form of multi-decision SchellingCoin (or, equivalently, Truthcoin without reputation).
# zero-center each feature/column (mean = 0)
# the data matrix needs to be centered to get covariance and SVD
normed = voteMatrix_pd.copy()
for feature in features:
    # the std-scaling variant is intentionally disabled — zero-center only:
    # normed[feature] = (normed[feature] - normed[feature].mean()) / normed[feature].std()
    # vectorized subtraction replaces the original apply(lambda ...), which
    # recomputed the column mean once per element; loop var renamed from
    # `col`, which collides with the `col` variable used elsewhere at top level
    normed[feature] = normed[feature] - normed[feature].mean()
# normed data matrix is only zero-centered (not auto-scaled)
print('\nNormalized dataset:')
print(normed[:5])
Normalized dataset: outcome_1 outcome_2 outcome_3 outcome_4 outcome_5 0 0.2 -0.2 0 0 -0.2 1 0.2 0.8 0 0 -0.2 2 0.2 -0.2 0 0 -0.2 3 -0.8 -0.2 0 0 -0.2 4 0.2 -0.2 0 0 0.8 [5 rows x 5 columns]
This covariance calculation is adapted from the original PCA example.
### for zero-centered data, cov(x1, x2) = sum(x1*x2) / n is the population form
# ^^ note: pandas' Series.cov uses the sample estimator (divisor n - 1), so the
# values below differ from that formula by a factor of n / (n - 1)
# DataFrame.cov computes the pairwise sample covariance (ddof = 1) — the same
# values, in the same feature order, as the original nested loop of
# Series.cov calls, without the explicit O(k^2) Python loop.
cov_df = normed.cov()
print('Covariance matrix:')
print(cov_df)
# everybody agrees on outcomes 3 & 4 (tennis winner, mtgox solvency), so those columns have zero variance
Covariance matrix: outcome_1 outcome_2 outcome_3 outcome_4 outcome_5 outcome_1 0.20 0.05 0 0 0.05 outcome_2 0.05 0.20 0 0 -0.05 outcome_3 0.00 0.00 0 0 0.00 outcome_4 0.00 0.00 0 0 0.00 outcome_5 0.05 -0.05 0 0 0.20 [5 rows x 5 columns]
** [U]: Rows are the original features and columns are the PCA 'components'. Each cell gives the 'loading' of the feature on the corresponding component. **
** [S]: Represents how much variance is explained by each component. **
# use numpy's SVD implementation
# BUGFIX: the original called scipy.linalg.svd, but only scipy.stats was
# imported, so scipy.linalg is not guaranteed to be loaded; np.linalg.svd
# has the same (u, s, vh) return convention for this use. (SVD column signs
# are implementation-dependent either way.)
u, s, v = np.linalg.svd(cov_df)
print('U: (feature loading for each component)')
print(pd.DataFrame(u, index=features)) # first loading
print('\nExplained variance:\n', s)
# project onto the components and take the first principal score
firstScore = np.transpose(np.dot(cov_df, u))[0]
print("\nfirstScore:")
print(firstScore)
# two candidate shifts of the score vector:
# Set1 shifts so the minimum is zero; Set2 shifts so the maximum is zero
Set1 = firstScore + abs(min(firstScore))
print("\nSet1:")
print(Set1)
Set2 = firstScore - max(firstScore)
print("Set2:")
print(Set2)
U: (feature loading for each component) 0 1 2 3 4 outcome_1 -0.816497 -8.088567e-17 0.57735 0 0 outcome_2 -0.408248 -7.071068e-01 -0.57735 0 0 outcome_3 0.000000 0.000000e+00 0.00000 1 0 outcome_4 0.000000 0.000000e+00 0.00000 0 1 outcome_5 -0.408248 7.071068e-01 -0.57735 0 0 [5 rows x 5 columns] Explained variance: [ 0.25 0.25 0.1 0. 0. ] firstScore: [-0.20412415 -0.10206207 0. 0. -0.10206207] Set1: [ 0. 0.10206207 0.20412415 0.20412415 0.10206207] Set2: [-0.20412415 -0.10206207 0. 0. -0.10206207]
# note on these two sets: https://github.com/psztorc/Truthcoin/blob/master/lib/consensus/ConsensusMechanism.r#L40-L51
# re-weight the vote matrix by each candidate score vector
New1 = getWeight(np.dot(Set1, voteMatrix_pd))
print("\nNew1:")
print(New1)
New2 = getWeight(np.dot(Set2, voteMatrix_pd))
print("New2:")
print(New2)
New1: [ 0.22222222 0.05555556 0.33333333 0.33333333 0.05555556] New2: [ 0.28571429 0.07142857 0.28571429 0.28571429 0.07142857]