"""Tutorial: rate players from lineup "stints" with DictVectorizer and RidgeCV.

Each stint in the input file names a home and an away team, the players each
team had on the floor, and each team's points and possessions.  The script
encodes every stint as a row of +1 (home player), -1 (away player) and 0
(not on the floor), then fits a cross-validated ridge regression of the
stint's point differential per 100 possessions on those indicators.  The
fitted coefficients are printed as player ratings, best first.
"""
import json  # a built-in Python module for dealing with JSON data
from operator import itemgetter
from pprint import pprint

# Location of the stint data: one JSON document per line.
MATCHUPS_PATH = '/Users/evanzamir/PycharmProjects/sklearn_tutorial/matchups.json'

# Regularization strengths that RidgeCV chooses between.  We could probably
# add a few more values to test too... but this is just a tutorial :)
RIDGE_ALPHAS = (0.01, 0.1, 1.0, 10.0, 100.0, 500.0, 1000.0, 2000.0, 5000.0)


def load_stints(path=MATCHUPS_PATH):
    """Return the list of stints stored in *path*.

    The file holds one JSON document per line.  Blank lines (for example a
    trailing newline at the end of the file) are skipped instead of being
    handed to ``json.loads``, which would raise on them.
    """
    with open(path, encoding='utf-8') as units_file:
        return [json.loads(line) for line in units_file if line.strip()]


def build_training_rows(stints, min_total_poss=2):
    """Turn raw stints into regression inputs.

    Args:
        stints: iterable of stint dicts.  Each has ``'home'`` and ``'away'``
            team keys, and under each team key an ``'on'`` list of player
            names plus ``'stats'`` with ``'poss'`` and ``'pts'``.
        min_total_poss: stints whose combined (home + away) possessions are
            below this value are dropped.  The default of 2 keeps stints that
            average at least one possession per team.

    Returns:
        A ``(units, points, weights)`` tuple of equally long lists:
        ``units`` holds one ``{player: +1 or -1}`` dict per stint (+1 for the
        home side, -1 for the away side), ``points`` the home-minus-away point
        differential per 100 possessions, and ``weights`` the average number
        of possessions in the stint.
    """
    units = []
    points = []
    weights = []
    for d in stints:
        home = d['home']
        away = d['away']
        home_poss = d[home]['stats']['poss']
        away_poss = d[away]['stats']['poss']
        total_poss = home_poss + away_poss

        # To avoid some ill-conditioning we only use stints with enough
        # possessions.
        if total_poss < min_total_poss:
            continue

        # The extra 0.01 keeps the denominator non-zero.
        point_diff = (100 * (d[home]['stats']['pts'] - d[away]['stats']['pts'])
                      / ((total_poss + 0.01) / 2.))

        # Home players first, then away players; update() lets an away entry
        # overwrite a home entry with the same name, as the original did.
        stint = {name: 1 for name in d[home]['on']}
        stint.update({name: -1 for name in d[away]['on']})

        units.append(stint)
        points.append(point_diff)
        weights.append(total_poss / 2.)
    return units, points, weights


def feature_names(vectorizer):
    """Return the fitted vectorizer's feature names as a list of ``str``.

    scikit-learn 1.2 removed ``get_feature_names()`` in favour of
    ``get_feature_names_out()``; the new method is used when it exists and the
    old one otherwise, so the script runs on both old and new releases.
    """
    getter = getattr(vectorizer, 'get_feature_names_out', None)
    if getter is None:
        getter = vectorizer.get_feature_names
    # The new API returns an array of NumPy strings; normalise to plain str.
    return [str(name) for name in getter()]


def rank_players(players, coefficients):
    """Pair each player with its coefficient, sorted by rating, best first.

    ``players`` and ``coefficients`` must be in the same (column) order.  The
    sort is stable, so tied players keep their original relative order.
    """
    return sorted(zip(players, coefficients), key=itemgetter(1), reverse=True)


def main():
    """Run the tutorial end to end, printing every intermediate result."""
    # scikit-learn is imported here rather than at module level so that the
    # pure-Python helpers above stay importable without it.
    from sklearn import linear_model
    from sklearn.feature_extraction import DictVectorizer

    # open the file and read the data!
    data = load_stints()

    # "pretty" print the first stint, which in this case describes a matchup
    # in a CHA-MIA game last season.
    pprint(data[0])

    # To demonstrate how DictVectorizer works...
    # Create a small list of lists with "fake" units
    units = [
        ['Stephen Curry', 'Klay Thompson', 'Harrison Barnes', 'Draymond Green', 'Andrew Bogut'],
        ['Stephen Curry', 'Klay Thompson', 'Harrison Barnes', 'Draymond Green', 'Andrew Bogut'],
        ['Shaun Livingston', 'Klay Thompson', 'Harrison Barnes', 'Draymond Green', 'Andrew Bogut'],
        ['Shaun Livingston', 'Klay Thompson', 'Andre Iguodala', 'Draymond Green', 'Andrew Bogut'],
        ['Leandro Barbosa', 'Shaun Livingston', 'Andre Iguodala', 'Harrison Barnes', 'Draymond Green'],
    ]
    v = DictVectorizer(sparse=False)

    # For each unit we want to create a dict containing the players' names as
    # keys and the value 1 as the value.  Later the value will be +1 if the
    # player is playing at home and -1 if he is away.
    list_dicts = [{name: 1 for name in unit} for unit in units]
    print(list_dicts)

    # This is where magic happens.
    # The function fit_transform turns our list of dicts into a list of
    # arrays containing enough columns for all the players in this fake data
    # set.  The variable v, an instance of DictVectorizer, is able to keep
    # track of all this bookkeeping for us.  Basically, it's a really useful
    # thing to have for what we're about to do and eliminates writing a
    # metric ton of our own code.
    X = v.fit_transform(list_dicts)
    print(X)

    # We can even take the inverse transform and get back the list of dicts!
    x = v.inverse_transform(X)
    print(x)

    # And we can easily get the names of all the players in the data set!
    # W00t
    print(feature_names(v))

    # Now build the real regression inputs from the stint data.
    units, points, weights = build_training_rows(data)
    print(len(units), len(points), len(weights))

    # Now we employ DictVectorizer to do its magic
    u = DictVectorizer(sparse=False)
    u_mat = u.fit_transform(units)
    # a giant list of lists where each array contains five +1's, five -1's,
    # and a whole mess of 0's
    print(u_mat)
    print(points[:25])  # just showing the first 25 stints
    print(weights[:100])  # just showing the first 100 stints

    # The first 25 players alphabetically in the data set
    players = feature_names(u)
    pprint(players[:25])

    # perform the inverse transform on one stint just to double check it
    # makes sense
    pprint(u.inverse_transform(u_mat)[:1])

    # RidgeCV accepts any array-like of alphas, so a plain tuple is enough
    # (the original wrapped the values in numpy.array without importing numpy).
    clf = linear_model.RidgeCV(alphas=RIDGE_ALPHAS, cv=5)
    clf.fit(u_mat, points, sample_weight=weights)

    # Here is the value of alpha that RidgeCV selected.
    print(clf.alpha_)

    # Columns of u_mat and entries of clf.coef_ share the order of `players`,
    # so zipping them pairs every player with his coefficient.
    ranked = rank_players(players, clf.coef_)
    for idx, (player, rating) in enumerate(ranked, start=1):
        print(idx, player, "{0:.2f}".format(rating))


if __name__ == "__main__":
    main()