import json # a built-in Python module for dealing with JSON data
# Load every stint from the matchups file into `data` (a list with one parsed
# JSON document per non-empty line of the file).
#
# The encoding is given explicitly because JSON text is UTF-8, whereas open()
# would otherwise fall back to the platform's default encoding. Blank lines
# (for example a trailing newline at the end of the file) are skipped because
# json.loads raises on an empty string.
with open('/Users/evanzamir/PycharmProjects/sklearn_tutorial/matchups.json',
          encoding='utf-8') as units_file:
    data = [json.loads(line) for line in units_file if line.strip()]

# "Pretty" print the first stint to show what one record looks like.
# (Original author's note: in this data set it describes a matchup in a
# CHA-MIA game last season.)

from pprint import pprint
pprint(data[0])

# Small demonstration of how DictVectorizer works.
# Start from a handful of made-up units; each inner list is one five-man lineup.

units = [
    ['Stephen Curry', 'Klay Thompson', 'Harrison Barnes', 'Draymond Green', 'Andrew Bogut'],
    ['Stephen Curry', 'Klay Thompson', 'Harrison Barnes', 'Draymond Green', 'Andrew Bogut'],
    ['Shaun Livingston', 'Klay Thompson', 'Harrison Barnes', 'Draymond Green', 'Andrew Bogut'],
    ['Shaun Livingston', 'Klay Thompson', 'Andre Iguodala', 'Draymond Green', 'Andrew Bogut'],
    ['Leandro Barbosa', 'Shaun Livingston', 'Andre Iguodala', 'Harrison Barnes', 'Draymond Green'],
]

from sklearn.feature_extraction import DictVectorizer
v = DictVectorizer(sparse=False)

# Turn every unit into a dict that maps each player's name to the value 1.
# In the real data further down, the value is +1 for a home player and -1 for
# an away player.

list_dicts = [dict.fromkeys(unit, 1) for unit in units]
print(list_dicts)


# This is where the magic happens.
# fit_transform turns our list of dicts into a dense array (v was created with
# sparse=False) with one row per dict and one column per distinct player name
# in this fake data set.
# The variable v, an instance of DictVectorizer, keeps track of which column
# belongs to which player, which saves us from writing a metric ton of our own
# bookkeeping code.

X = v.fit_transform(list_dicts)
print(X)

# We can even take the inverse transform and get back a list of dicts!

x = v.inverse_transform(X)
print(x)

# And we can easily get the names of all the players in the data set!
# get_feature_names() was removed in scikit-learn 1.2; its replacement,
# get_feature_names_out(), exists from 1.0 on. Try the new method first and
# fall back to the old one so the script runs on either side of that change.

try:
    feature_names = v.get_feature_names_out()
except AttributeError:  # scikit-learn < 1.0
    feature_names = v.get_feature_names()
# Convert to plain str so the printed list looks the same on every version.
print([str(name) for name in feature_names])

# Build the regression inputs, one entry per retained stint:
#   units   - dict mapping player name -> +1 (home side) or -1 (away side)
#   points  - home-minus-away point margin per 100 possessions
#   weights - possessions in the stint, averaged over the two teams
units, points, weights = [], [], []

for record in data:
    home = record['home']
    away = record['away']

    home_stats = record[home]['stats']
    home_poss = home_stats['poss']
    away_stats = record[away]['stats']
    away_poss = away_stats['poss']
    total_poss = home_poss + away_poss

    # Margin scaled to 100 possessions. The extra 0.01 keeps the denominator
    # non-zero for a stint that records no possessions at all.
    margin = home_stats['pts'] - away_stats['pts']
    point_diff = 100 * margin / ((total_poss + 0.01) / 2.)

    # Home players first, then away players written on top with -1.
    stint = {name: 1 for name in record[home]['on']}
    stint.update({name: -1 for name in record[away]['on']})

    # To avoid some ill-conditioning, keep only stints whose combined
    # possessions are at least 2, i.e. an average of at least one per team.
    if total_poss >= 2:
        units.append(stint)
        points.append(point_diff)
        weights.append(total_poss / 2.)

print(len(units), len(points), len(weights))

# Now we employ DictVectorizer to do its magic on the real stints.

u = DictVectorizer(sparse=False)
u_mat = u.fit_transform(units)
# A giant dense array with one row per stint: +1 in the column of every home
# player on the floor, -1 for every away player, and a whole mess of 0's.
print(u_mat)
print(points[:25])  # just showing the first 25 stints
print(weights[:100])  # just showing the first 100 stints

# The first 25 players in the vectorizer's column order (DictVectorizer sorts
# feature names by default, so this is alphabetical).
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() (available since 1.0) and fall back for old releases.

try:
    feature_names = u.get_feature_names_out()
except AttributeError:  # scikit-learn < 1.0
    feature_names = u.get_feature_names()
# Keep `players` a plain list of str: the new method returns a NumPy array,
# which has no .index() and prints differently from a list of strings.
players = [str(name) for name in feature_names]
pprint(players[:25])

# Perform the inverse transform on one stint just to double check it makes
# sense. Only the first row is transformed; there is no need to convert the
# whole matrix back to dicts to look at a single entry.

pprint(u.inverse_transform(u_mat[:1]))

# numpy is needed for numpy.array below; without this import the RidgeCV line
# fails with a NameError.
import numpy
from sklearn import linear_model

# Ridge regression with built-in cross-validation: RidgeCV tries every
# candidate regularization strength in `alphas` with 5-fold CV and keeps the
# best one. Each stint is weighted by its average number of possessions.
alphas = numpy.array([0.01, 0.1, 1.0, 10, 100, 500, 1000, 2000, 5000])
clf = linear_model.RidgeCV(alphas=alphas, cv=5)
clf.fit(u_mat, points, sample_weight=weights)

# Here is the value of alpha that RidgeCV selected. We could probably add a few more values to test above too...
# but this is just a tutorial :)

print(clf.alpha_)

# Pair every player with his fitted coefficient. `players` is in the same
# order as the columns of the matrix the model was fitted on, so the i-th
# coefficient belongs to the i-th player. zip walks both in lockstep, which
# avoids a linear players.index() search per player (quadratic overall).
# The result is a list of (player, rating) tuples, highest rating first.
ratings = sorted(zip(players, clf.coef_), key=lambda pair: pair[1], reverse=True)

# Print a 1-based ranking with the rating rounded to two decimals.
for rank, (name, value) in enumerate(ratings, start=1):
    print(rank, "{}".format(name), "{0:.2f}".format(value))