import json
import urllib2
import pandas as pd
# Play-by-play endpoint; "<game_id>" is a placeholder that callers replace
# with the id of the game they want.
GAME_BASE_URL = (
    "http://stats.nba.com/stats/playbyplayv2"
    "?EndPeriod=10&EndRange=55800&GameID=<game_id>"
    "&RangeType=2&Season=2014-15&SeasonType=Regular+Season"
    "&StartPeriod=1&StartRange=0"
)
# Moment-data endpoint; "<event_id>" is a placeholder for the event id, and
# the game id is appended to the end of the string by the caller.
MOMENT_BASE_URL = (
    "http://stats.nba.com/stats/locations_getmoments/"
    "?eventid=<event_id>&gameid="
)
First, let's create a function to get the play by play for a single game.
To see what it looks like as JSON check out this link - http://stats.nba.com/stats/playbyplayv2?EndPeriod=10&EndRange=55800&GameID=0021400001&RangeType=2&Season=2014-15&SeasonType=Regular+Season&StartPeriod=1&StartRange=0
def getRawPbpForGame(game_id):
    """Fetch the raw play by play for one game as a pandas data frame.

    Args:
        game_id: Game id string substituted for the ``<game_id>`` placeholder
            in ``GAME_BASE_URL`` (for example ``"0021400001"``).

    Returns:
        A ``pandas.DataFrame`` with one row per entry in the ``rowSet`` of
        every result set named ``PlayByPlay``; the columns are that result
        set's ``headers``. The frame is empty when the response contains no
        such result set.

    Raises:
        KeyError: If the decoded response has no ``resultSets`` key.
        Whatever ``urllib2.urlopen`` / ``json.loads`` raise on a failed
        request or an undecodable body is propagated unchanged.
    """
    url = GAME_BASE_URL.replace("<game_id>", game_id)
    response = urllib2.urlopen(url)
    try:
        data = json.loads(response.read())
    finally:
        # Close explicitly so the connection is released even when decoding
        # fails; Python 2's urllib2 responses cannot be used in a ``with``.
        response.close()

    plays = []
    for result_set in data['resultSets']:
        if result_set.get('name') != 'PlayByPlay':
            continue
        headers = result_set['headers']
        # Pair every event row with the column headers of its result set.
        plays.extend(dict(zip(headers, event)) for event in result_set['rowSet'])
    return pd.DataFrame(plays)
# Preview the first rows of the play by play for one game. This call downloads
# the data from stats.nba.com, so it needs network access.
getRawPbpForGame("0021400001").head()
EVENTMSGACTIONTYPE | EVENTMSGTYPE | EVENTNUM | GAME_ID | HOMEDESCRIPTION | NEUTRALDESCRIPTION | PCTIMESTRING | PERIOD | PERSON1TYPE | PERSON2TYPE | PERSON3TYPE | PLAYER1_ID | PLAYER1_NAME | PLAYER1_TEAM_ABBREVIATION | PLAYER1_TEAM_CITY | PLAYER1_TEAM_ID | PLAYER1_TEAM_NICKNAME | PLAYER2_ID | PLAYER2_NAME | PLAYER2_TEAM_ABBREVIATION | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 12 | 0 | 0021400001 | None | None | 12:00 | 1 | 0 | 0 | 0 | 0 | None | None | None | NaN | None | 0 | None | None | ... |
1 | 0 | 10 | 1 | 0021400001 | Jump Ball Davis vs. Vucevic: Tip to Holiday | None | 12:00 | 1 | 4 | 5 | 4 | 203076 | Anthony Davis | NOP | New Orleans | 1610612740 | Pelicans | 202696 | Nikola Vucevic | ORL | ... |
2 | 1 | 2 | 2 | 0021400001 | MISS Davis 20' Jump Shot | None | 11:43 | 1 | 4 | 0 | 0 | 203076 | Anthony Davis | NOP | New Orleans | 1610612740 | Pelicans | 0 | None | None | ... |
3 | 0 | 4 | 3 | 0021400001 | None | None | 11:42 | 1 | 5 | 0 | 0 | 203095 | Evan Fournier | ORL | Orlando | 1610612753 | Magic | 0 | None | None | ... |
4 | 66 | 1 | 4 | 0021400001 | None | None | 11:31 | 1 | 5 | 5 | 0 | 202696 | Nikola Vucevic | ORL | Orlando | 1610612753 | Magic | 203901 | Elfrid Payton | ORL | ... |
5 rows × 33 columns
Let's make use of the player tracking moment data on nba.com to find the players on the floor for a given event.
Here is what the data looks like for one event - http://stats.nba.com/stats/locations_getmoments/?eventid=1&gameid=0021400001
For a single moment it looks like this:
[1, 1414541586032, 720.0, 24.0, None, [[-1, -1, 47.4393, 25.94672, 10.65305], [1610612740, 201569, 48.29735, 18.68403, 0.0], [1610612740, 201600, 58.46317, 20.60878, 0.0], [1610612740, 201950, 64.77708, 25.10907, 0.0], [1610612740, 201936, 47.45251, 33.84332, 0.0], [1610612740, 203076, 48.0291, 24.93867, 0.0], [1610612753, 202696, 46.04217, 26.94592, 0.0], [1610612753, 203124, 28.82642, 25.30571, 0.0], [1610612753, 202699, 45.89401, 34.01147, 0.0], [1610612753, 203901, 19.42869, 25.41476, 0.0], [1610612753, 203095, 45.82049, 18.77761, 0.0]]]
Here is what each element of this list represents:
[period, unix timestamp in ms, seconds remaining in period, seconds remaining on shot clock, ??, [list of player/ball data]]
And then the list of player/ball data represents:
[team id, player id, x, y, z]
For the ball, the team id and player id are both -1. Only the ball has a meaningful z coordinate; in the sample above, z is 0.0 for every player.
Using this, we can find out who was on the floor for a given event id
def getPlayersOnFloorForMoment(game_id, event_id):
    """Return the team ids and on-floor player ids for one event of a game.

    Only the first moment of the event is inspected. Element 5 of a moment is
    the list of ``[team id, player id, x, y, z]`` entries, where entry 0 is
    the ball and entries 1-10 are the ten players.

    NOTE(review): entries 1-5 are treated as the home team and entries 6-10
    as the away team; this ordering is assumed from the feed, not checked.

    Args:
        game_id: Game id string, appended to ``MOMENT_BASE_URL``.
        event_id: Event id; converted with ``str`` and substituted for the
            ``<event_id>`` placeholder in ``MOMENT_BASE_URL``.

    Returns:
        A dict with the keys ``home_team_id``, ``away_team_id``,
        ``home_player_ids`` and ``away_player_ids``; the last two are lists
        of five player ids each.

    Raises:
        IndexError: If the event has no moments, or its first moment lists
            fewer than eleven ball/player entries.
        Whatever ``urllib2.urlopen`` / ``json.loads`` raise on a failed
        request or an undecodable body is propagated unchanged.
    """
    url = MOMENT_BASE_URL.replace("<event_id>", str(event_id)) + game_id
    response = urllib2.urlopen(url)
    try:
        data = json.loads(response.read())
    finally:
        # Close explicitly so the connection is released even when decoding
        # fails; Python 2's urllib2 responses cannot be used in a ``with``.
        response.close()

    entries = data["moments"][0][5]
    if len(entries) < 11:
        # Fail with a clear message instead of an opaque index error
        # somewhere in the middle of building the result.
        raise IndexError(
            "expected the ball plus 10 players in the first moment of event "
            "%s (game %s), found %d entries" % (event_id, game_id, len(entries))
        )

    home_entries = entries[1:6]
    away_entries = entries[6:11]
    return {
        'home_team_id': home_entries[0][0],
        'away_team_id': away_entries[0][0],
        'home_player_ids': [entry[1] for entry in home_entries],
        'away_player_ids': [entry[1] for entry in away_entries],
    }
# Example: look up who is on the floor for event 1 of game 0021400001.
# This call downloads the moment data, so it needs network access.
getPlayersOnFloorForMoment("0021400001", 1)
{'away_player_ids': [202696, 203124, 202699, 203901, 203095], 'away_team_id': 1610612753, 'home_player_ids': [201569, 201600, 201950, 201936, 203076], 'home_team_id': 1610612740}
Rather than calling the above function for every event id, we can simplify things: use it only to get the players who start each period, and then use the play by play data to substitute players in and out. In the play by play, substitutions are the events where EVENTMSGTYPE = 8; PLAYER1_ID is the player getting subbed out and PLAYER2_ID is the player getting subbed in.
def getPlayersOnFloorForPeriod(period):
    """Add team and on-floor player columns to one period of play by play.

    The players on the floor at the start of the period are looked up with
    ``getPlayersOnFloorForMoment`` and written to every row. Each
    substitution row (``EVENTMSGTYPE == 8``) then replaces the outgoing
    player (``PLAYER1_ID``) with the incoming player (``PLAYER2_ID``) from
    that row through the end of the period.

    NOTE: the game to query is read from the module-level ``game_id``
    variable, which must be set before this function is called.

    Args:
        period: Data frame holding the play by play rows of a single period.
            It must contain the ``EVENTNUM``, ``PERIOD``, ``EVENTMSGTYPE``,
            ``PLAYER1_ID`` and ``PLAYER2_ID`` columns.

    Returns:
        A copy of ``period`` with a fresh 0..n-1 index and the added columns
        ``HOME_TEAM_ID``, ``AWAY_TEAM_ID``, ``HOME_PLAYER1_ID`` ..
        ``HOME_PLAYER5_ID`` and ``AWAY_PLAYER1_ID`` .. ``AWAY_PLAYER5_ID``.
        Player ids keep the type they have in the source data; they are not
        converted to strings.
    """
    # A fresh 0..n-1 index lets the substitution loop slice "from this row to
    # the end of the period" by label.
    period = period.reset_index(drop=True)

    start_event_num = period['EVENTNUM'].min()
    period_number = period['PERIOD'].mean()
    # NOTE(review): periods 1, 3 and overtime query the second event instead
    # of the first; presumably the first event of those periods has no usable
    # moment data - TODO confirm.
    if period_number == 1 or period_number == 3 or period_number > 4:
        start_event_num += 1
    period_starters = getPlayersOnFloorForMoment(game_id, start_event_num)

    period['HOME_TEAM_ID'] = period_starters['home_team_id']
    period['AWAY_TEAM_ID'] = period_starters['away_team_id']

    # Seed the ten lineup columns with the players who start the period.
    lineup_columns = []
    for side in ('HOME', 'AWAY'):
        starter_ids = period_starters[side.lower() + '_player_ids']
        for slot in range(5):
            column = '%s_PLAYER%d_ID' % (side, slot + 1)
            period[column] = starter_ids[slot]
            lineup_columns.append(column)

    # Walk the substitutions in order; each one swaps the outgoing player for
    # the incoming one from its own row through the end of the period.
    for row in period.index[period['EVENTMSGTYPE'] == 8]:
        player_out = period.at[row, 'PLAYER1_ID']
        player_in = period.at[row, 'PLAYER2_ID']
        for column in lineup_columns:
            if period.at[row, column] == player_out:
                period.loc[row:, column] = player_in
                break
        # A substitution whose outgoing player is not in the tracked lineup
        # is left unapplied.
    return period
Putting it all together, we can get the play by play data with the players on the floor for a full game.
# getPlayersOnFloorForPeriod reads this module-level game_id, so it has to be
# set before the groupby/apply below runs.
game_id = "0021400001"
# Download the raw play by play for the game.
pbp = getRawPbpForGame(game_id)
# Process each period separately so that every period starts from its own lineup.
pbp_with_lineups = pbp.groupby("PERIOD").apply(getPlayersOnFloorForPeriod)
# Preview only the ten lineup columns for the first rows.
pbp_with_lineups[['HOME_PLAYER1_ID', 'HOME_PLAYER2_ID', 'HOME_PLAYER3_ID', 'HOME_PLAYER4_ID', 'HOME_PLAYER5_ID', 'AWAY_PLAYER1_ID', 'AWAY_PLAYER2_ID', 'AWAY_PLAYER3_ID', 'AWAY_PLAYER4_ID', 'AWAY_PLAYER5_ID']].head()
HOME_PLAYER1_ID | HOME_PLAYER2_ID | HOME_PLAYER3_ID | HOME_PLAYER4_ID | HOME_PLAYER5_ID | AWAY_PLAYER1_ID | AWAY_PLAYER2_ID | AWAY_PLAYER3_ID | AWAY_PLAYER4_ID | AWAY_PLAYER5_ID | ||
---|---|---|---|---|---|---|---|---|---|---|---|
PERIOD | |||||||||||
1 | 0 | 201569 | 201600 | 201950 | 201936 | 203076 | 202696 | 203124 | 202699 | 203901 | 203095 |
1 | 201569 | 201600 | 201950 | 201936 | 203076 | 202696 | 203124 | 202699 | 203901 | 203095 | |
2 | 201569 | 201600 | 201950 | 201936 | 203076 | 202696 | 203124 | 202699 | 203901 | 203095 | |
3 | 201569 | 201600 | 201950 | 201936 | 203076 | 202696 | 203124 | 202699 | 203901 | 203095 | |
4 | 201569 | 201600 | 201950 | 201936 | 203076 | 202696 | 203124 | 202699 | 203901 | 203095 |
5 rows × 10 columns
# Write the play by play with lineups to pbp_with_lineups.csv in the current
# working directory; header=True writes the column names and index=False
# leaves the row index out of the file.
pbp_with_lineups.to_csv('pbp_with_lineups.csv',index=False,header=True)