#!/usr/bin/env python
# coding: utf-8

# # Getting Play by Play data from nba.com with players on the floor

# In[1]:

import json
import re
import urllib2

import pandas as pd

# base url for play by play for game id
GAME_BASE_URL = "http://stats.nba.com/stats/playbyplayv2?EndPeriod=10&EndRange=55800&GameID=&RangeType=2&Season=2014-15&SeasonType=Regular+Season&StartPeriod=1&StartRange=0"
# base url for moment data for event id
MOMENT_BASE_URL = "http://stats.nba.com/stats/locations_getmoments/?eventid=&gameid="

# First let's create a function to get the play by play for a single game.
#
# To see what it looks like as JSON check out this link - http://stats.nba.com/stats/playbyplayv2?EndPeriod=10&EndRange=55800&GameID=0021400001&RangeType=2&Season=2014-15&SeasonType=Regular+Season&StartPeriod=1&StartRange=0

# In[2]:

def getRawPbpForGame(game_id):
    """Return the raw play by play for one game as a pandas DataFrame.

    Args:
        game_id: game id as a string (e.g. "0021400001"); it is placed in the
            ``GameID`` query parameter of GAME_BASE_URL.

    Returns:
        A DataFrame with one row per event of every result set named
        'PlayByPlay' in the response, keyed by that result set's headers.
        The DataFrame is empty when no such result set is present.

    Raises:
        Whatever ``urllib2.urlopen`` / ``json.loads`` raise on a failed
        request or a malformed response; KeyError if 'resultSets' is missing.
    """
    # Fill in the value of the GameID query parameter.  A plain
    # str.replace("", game_id) would insert the id between every character
    # of the URL, so the parameter is targeted explicitly (whatever value it
    # currently holds is overwritten).
    url = re.sub(r"GameID=[^&]*",
                 lambda _match: "GameID=" + game_id,
                 GAME_BASE_URL,
                 count=1)

    response = urllib2.urlopen(url)
    try:
        data = json.loads(response.read())
    finally:
        # always release the connection, even if the body is not valid JSON
        response.close()

    plays = []
    for result_set in data['resultSets']:
        if result_set.get('name') != 'PlayByPlay':
            continue
        headers = result_set['headers']
        # each row is a positional list; pair it with the column headers
        plays.extend(dict(zip(headers, event)) for event in result_set['rowSet'])
    return pd.DataFrame(plays)

# In[3]:

getRawPbpForGame("0021400001").head()

# Let's make use of the player tracking moment data on nba.com to find the players on the floor for a given event.
#
# Here is what the data looks like for one event - http://stats.nba.com/stats/locations_getmoments/?eventid=1&gameid=0021400001
#
# For a single moment it looks like this:
#
# [1, 1414541586032, 720.0, 24.0, None, [[-1, -1, 47.4393, 25.94672, 10.65305], [1610612740, 201569, 48.29735, 18.68403, 0.0], [1610612740, 201600, 58.46317, 20.60878, 0.0], [1610612740, 201950, 64.77708, 25.10907, 0.0], [1610612740, 201936, 47.45251, 33.84332, 0.0], [1610612740, 203076, 48.0291, 24.93867, 0.0], [1610612753, 202696, 46.04217, 26.94592, 0.0], [1610612753, 203124, 28.82642, 25.30571, 0.0], [1610612753, 202699, 45.89401, 34.01147, 0.0], [1610612753, 203901, 19.42869, 25.41476, 0.0], [1610612753, 203095, 45.82049, 18.77761, 0.0]]]
#
#
# Here is what each element of this list represents:
#
# [period, unix timestamp in ms, seconds remaining in period, seconds remaining on shot clock, ?? (unknown), [list of player/ball data]]
#
# And each entry in the list of player/ball data represents:
#
# [team id, player id, x, y, z]
#
#
# For the ball the team id and player id are both -1.
# Only the ball has z coordinates
#
# Using this, we can find out who was on the floor for a given event id
#

# In[4]:

def getPlayersOnFloorForMoment(game_id, event_id):
    """Return the players on the floor for one event of a game.

    Only the first moment of the event is inspected.  In its player/ball
    list, entry 0 is skipped (the ball, per the notes above), entries 1-5
    are reported as the home team and entries 6-10 as the away team.

    Args:
        game_id: game id as a string; appended to the trailing ``gameid=``
            parameter of MOMENT_BASE_URL.
        event_id: event number; converted with ``str`` and placed in the
            ``eventid`` query parameter.

    Returns:
        dict with keys 'home_team_id', 'away_team_id', 'home_player_ids'
        and 'away_player_ids' (the last two are lists of five player ids).

    Raises:
        IndexError / KeyError when the response has no moments or fewer than
        eleven entries in the first moment; whatever ``urllib2.urlopen`` /
        ``json.loads`` raise on a failed request or malformed response.
    """
    # Local import so this function does not depend on the file's import cell.
    import re

    # Fill in the eventid query value.  A plain str.replace("", ...) would
    # insert the id between every character of the URL, so the parameter is
    # targeted explicitly (whatever value it currently holds is overwritten).
    url = re.sub(r"eventid=[^&]*",
                 lambda _match: "eventid=" + str(event_id),
                 MOMENT_BASE_URL,
                 count=1)
    url = url + game_id

    response = urllib2.urlopen(url)
    try:
        data = json.loads(response.read())
    finally:
        # always release the connection, even if the body is not valid JSON
        response.close()

    # element 5 of a moment is the list of [team id, player id, x, y, z]
    entities = data["moments"][0][5]

    players = {}
    players['home_team_id'] = entities[1][0]
    players['away_team_id'] = entities[6][0]
    players['home_player_ids'] = [entities[i][1] for i in range(1, 6)]
    players['away_player_ids'] = [entities[i][1] for i in range(6, 11)]
    return players

# In[5]:

getPlayersOnFloorForMoment("0021400001", 1)

# Rather than using the above function to get the players on the floor for every event id we can simplify things and use it to get the players that start each period and use the play by play data to substitute players in and out. In the play by play this is when EVENTMSGTYPE = 8. PLAYER1_ID is the player getting subbed out and PLAYER2_ID is the player getting subbed in.
# In[6]:

# Lineup columns, in the order a substitution is matched against them.
_LINEUP_COLUMNS = [
    'HOME_PLAYER1_ID', 'HOME_PLAYER2_ID', 'HOME_PLAYER3_ID',
    'HOME_PLAYER4_ID', 'HOME_PLAYER5_ID',
    'AWAY_PLAYER1_ID', 'AWAY_PLAYER2_ID', 'AWAY_PLAYER3_ID',
    'AWAY_PLAYER4_ID', 'AWAY_PLAYER5_ID',
]

# EVENTMSGTYPE value that marks a substitution in the play by play
_SUBSTITUTION_EVENT = 8


def _apply_substitutions(period):
    """Replay every substitution row over the lineup columns, in place.

    Expects a 0..n-1 index (the caller resets it) and the lineup columns
    already filled with the period starters.
    """
    sub_rows = period[period['EVENTMSGTYPE'] == _SUBSTITUTION_EVENT].index.tolist()
    for row in sub_rows:
        # ids are compared as strings because a slot may hold either the
        # starter's id from the moment data or an already-substituted string
        player_out = str(period.loc[row, 'PLAYER1_ID'])
        player_in = str(period.loc[row, 'PLAYER2_ID'])
        for column in _LINEUP_COLUMNS:
            if str(period.loc[row, column]) != player_out:
                continue
            # The incoming id is stored as a string (as before), so the
            # column must be able to hold mixed values; cast explicitly
            # rather than relying on implicit upcasting during assignment.
            if period[column].dtype != object:
                period[column] = period[column].astype(object)
            # label slice on the reset index: this row through end of period
            period.loc[row:, column] = player_in
            break  # only the first matching slot is replaced
    return period


def getPlayersOnFloorForPeriod(period):
    """Return the period's play by play with columns for players on the floor.

    Adds HOME_TEAM_ID, AWAY_TEAM_ID, HOME_PLAYER1_ID..HOME_PLAYER5_ID and
    AWAY_PLAYER1_ID..AWAY_PLAYER5_ID.  The starters come from
    getPlayersOnFloorForMoment for the period's first event; substitution
    rows (EVENTMSGTYPE == 8) then swap PLAYER1_ID out for PLAYER2_ID from
    that row to the end of the period.

    NOTE: the game is taken from the module-level ``game_id`` variable, which
    must be set before this function is called.

    Args:
        period: DataFrame with the play by play rows of a single period; it
            must contain EVENTNUM, PERIOD, EVENTMSGTYPE, PLAYER1_ID and
            PLAYER2_ID columns.

    Returns:
        A copy of ``period`` with a fresh 0..n-1 index and the new columns.
        Slots that saw a substitution hold the incoming id as a string.
    """
    period = period.reset_index(drop=True)

    start_event_num = period['EVENTNUM'].min()
    period_number = period['PERIOD'].mean()
    # For periods 1, 3 and any overtime the second event is used to look up
    # the starters.  NOTE(review): presumably the first event of those
    # periods has no usable moment data - confirm.
    if period_number == 1 or period_number == 3 or period_number > 4:
        start_event_num += 1

    period_starters = getPlayersOnFloorForMoment(game_id, start_event_num)
    period['HOME_TEAM_ID'] = period_starters["home_team_id"]
    period['AWAY_TEAM_ID'] = period_starters["away_team_id"]
    for slot in range(5):
        period['HOME_PLAYER%d_ID' % (slot + 1)] = period_starters['home_player_ids'][slot]
    for slot in range(5):
        period['AWAY_PLAYER%d_ID' % (slot + 1)] = period_starters['away_player_ids'][slot]

    # sub the appropriate players in and out for every substitution event
    return _apply_substitutions(period)

# Putting it all together we can get the play by play data with players on the floor for a full game

# In[7]:

game_id = "0021400001"
pbp = getRawPbpForGame(game_id)
pbp_with_lineups = pbp.groupby("PERIOD").apply(getPlayersOnFloorForPeriod)

# In[8]:

pbp_with_lineups[['HOME_PLAYER1_ID', 'HOME_PLAYER2_ID', 'HOME_PLAYER3_ID', 'HOME_PLAYER4_ID', 'HOME_PLAYER5_ID', 'AWAY_PLAYER1_ID', 'AWAY_PLAYER2_ID', 'AWAY_PLAYER3_ID', 'AWAY_PLAYER4_ID', 'AWAY_PLAYER5_ID']].head()

# In[9]:

# write to a csv file
pbp_with_lineups.to_csv('pbp_with_lineups.csv',index=False,header=True)

# In[ ]: