library(tidyverse)
library(jsonlite)
library(plotly)
library(DT)
library(bsplus)
# Import json files into data frames
nba_advanced_json <- fromJSON("nba_advanced_json",
simplifyDataFrame = TRUE,
flatten = TRUE)
nba_advanced_flat <- flatten(nba_advanced_json$resultSets)
nba_advanced <- data.frame(nba_advanced_flat$rowSet)
colnames(nba_advanced) <- t(data.frame(nba_advanced_flat$headers))
nba_assistedFGs_json <- fromJSON("nba_assistedFGs_json",
simplifyDataFrame = TRUE,
flatten = TRUE)
nba_assistedFGs_flat <- flatten(nba_assistedFGs_json$resultSets)
nba_assistedFGs <- data.frame(nba_assistedFGs_flat$rowSet)
colnames(nba_assistedFGs) <- t(data.frame(nba_assistedFGs_flat$headers))
nba_d2fgm_json <- fromJSON("nba_d2fgm_json",
simplifyDataFrame = TRUE,
flatten = TRUE)
nba_d2fgm_flat <- flatten(nba_d2fgm_json$resultSets)
nba_d2fgm <- data.frame(nba_d2fgm_flat$rowSet)
colnames(nba_d2fgm) <- t(data.frame(nba_d2fgm_flat$headers))
nba_d3fgm_json <- fromJSON("nba_d3fgm_json",
simplifyDataFrame = TRUE,
flatten = TRUE)
nba_d3fgm_flat <- flatten(nba_d3fgm_json$resultSets)
nba_d3fgm <- data.frame(nba_d3fgm_flat$rowSet)
colnames(nba_d3fgm) <- t(data.frame(nba_d3fgm_flat$headers))
nba_dfg_json <- fromJSON("nba_dfg_json",
simplifyDataFrame = TRUE,
flatten = TRUE)
nba_dfg_flat <- flatten(nba_dfg_json$resultSets)
nba_dfg <- data.frame(nba_dfg_flat$rowSet)
colnames(nba_dfg) <- t(data.frame(nba_dfg_flat$headers))
nba_plusminus_json <- fromJSON("nba_plusminus_json",
simplifyDataFrame = TRUE,
flatten = TRUE)
nba_plusminus_flat <- flatten(nba_plusminus_json$resultSets)
nba_plusminus <- data.frame(nba_plusminus_flat$rowSet)
colnames(nba_plusminus) <- t(data.frame(nba_plusminus_flat$headers))
# End json import
# import csv, edit column = name to match json files
nba_rpm <- read_csv("nba_rpm_csv.csv", col_types = "_?___????")
nba_rpm$Name <- gsub(", C", "", nba_rpm$Name)
nba_rpm$Name <- gsub(", PG", "", nba_rpm$Name)
nba_rpm$Name <- gsub(", SG", "", nba_rpm$Name)
nba_rpm$Name <- gsub(", PF", "", nba_rpm$Name)
nba_rpm$Name <- gsub(", SF", "", nba_rpm$Name)
# Merge relevant variables from json files and csv
merged_data <-
nba_advanced %>%
left_join(nba_assistedFGs[,c("PLAYER_ID","PCT_AST_FGM","PCT_UAST_FGM")],
by = "PLAYER_ID") %>%
left_join(nba_d2fgm[,c("PLAYER_NAME","DFG2M")],
by = "PLAYER_NAME") %>%
left_join(nba_d3fgm[,c("PLAYER_NAME","DFG3M")],
by = "PLAYER_NAME") %>%
left_join(nba_dfg[,c("PLAYER_NAME","D_FG_PCT")],
by = "PLAYER_NAME") %>%
left_join(nba_plusminus[,c("PLAYER_ID","PLUS_MINUS")],
by = "PLAYER_ID") %>%
left_join(nba_rpm, by = c("PLAYER_NAME" = "Name"))
# import team data from json files
nba_teambasic_json <- fromJSON("nba_teambasic_json",
simplifyDataFrame = TRUE,
flatten = TRUE)
nba_teambasic_flat <- flatten(nba_teambasic_json$resultSets)
nba_teambasic <- data.frame(nba_teambasic_flat$rowSet)
colnames(nba_teambasic) <- t(data.frame(nba_teambasic_flat$headers))
nba_teamadvanced_json <- fromJSON("nba_teamadvanced_json",
simplifyDataFrame = TRUE,
flatten = TRUE)
nba_teamadvanced_flat <- flatten(nba_teamadvanced_json$resultSets)
nba_teamadvanced <- data.frame(nba_teamadvanced_flat$rowSet)
colnames(nba_teamadvanced) <- t(data.frame(nba_teamadvanced_flat$headers))
nba_fourfactors_json <- fromJSON("nba_fourfactors_json",
simplifyDataFrame = TRUE,
flatten = TRUE)
nba_fourfactors_flat <- flatten(nba_fourfactors_json$resultSets)
nba_fourfactors <- data.frame(nba_fourfactors_flat$rowSet)
colnames(nba_fourfactors) <- t(data.frame(nba_fourfactors_flat$headers))
# Merge team data
merged_team_data <-
nba_fourfactors %>%
left_join(nba_teambasic[,c("TEAM_ID","PLUS_MINUS")],
by = "TEAM_ID") %>%
left_join(nba_teamadvanced[,c("TEAM_ID","EFG_PCT","TS_PCT","PIE",
"NET_RATING", "AST_PCT","AST_TO","AST_RATIO")],
by = "TEAM_ID")
# filter out unneeded variables
merged_data <- select(merged_data, -(GP_RANK:CFPARAMS))
merged_team_data <- select(merged_team_data, -(GP_RANK:CFPARAMS))
# match team data to players and player data to teams
player_data <-
merged_data %>%
left_join(mutate(merged_team_data,
TEAM_PIE = PIE,
TEAM_PLUS_MINUS = PLUS_MINUS,
TEAM_WINS = W)
[,c("TEAM_ID","TEAM_PIE","TEAM_PLUS_MINUS","TEAM_WINS")],
by = "TEAM_ID")
merged_team_data <-
mutate(merged_team_data, TEAM_RPM =
aggregate(RPM ~ TEAM_ABBREVIATION, player_data, sum)[,c("RPM")]) Anyone that follows professional basketball knows that the NBA has realized a talent boom in the last 5-10 years. Popularity, attention, and league-wide revenue has been growing, as well as punditry and data. Fans of the league are riveted by such important, profound questions such as “is Kevin Durant a cupcake?
More importantly, the NBA is on the cutting-edge of data collection in sports. Thanks to the NBA’s partnership with STATS SportVU, they are tracking data well beyond the box score most of us have seen in a newspaper. SportVU is a system of 6 cameras hung from the ceiling that track ball and player movement 25 times per second to produce metrics such as secondary assists, miles traveled, made field goals defended at rim, and one frequency on pick and rolls, etc.
The Golden State Warriors have received high praise this season as the best team ever. They are frequently said to play the right type of basketball, that they’re unselfish, and that they make their teammates better.
Basketball is one of the most popular sports in America and by far my personal favorite. The strategy on and off the court is fascinating. It’s easy to give a big contract to the player who scored the most points or the MVP. But Russell Westbrook didn’t make it out of the first round of the playoffs. It’s a complex game and we finally have the means to answer the really complex questions that keep fans coming back.
Lucky for us, the NBA has made the data they collect readily available on their website. Unfortunately, they do not make it readily available for download. This might require some text scraping, but some other fans have already done some work on this.
Variables of interest include:
Other variables will be considered as the model is constructed. I will use data from the 2016-2017 NBA regular season and postseason.
I’ll use comparative statistics to analyze team differences with and without certain players. I will likely use linear regression to weigh the different dynamics of a basketball game and which aspects of a player truly make his teammates better.
Given the nature of plus-minus statistics, being calculated at a team level, lineup level, and player level, I may find value in a hierarchical model.
Ultimately, there are a few assumptions I’ll have to more fully address:
Besides the aforementioned pundits and fans, basketball organizations should care, players should care, agents should care, everyone who gets paid from basketball related revenue should care. Team strategy off the court is proving critical . It’s hard to gain an edge in a league with a salary cap and paying the best players and getting the best coach doesn’t cut it anymore. Being able to answer questions like these allow teams to find value where others miss it and optimize the resources they have.
Further iterations on this question would allow teams to understand not only which players impact their teammates, but which types of teammates best suit star players. This could help organizations tailor their teams around a foundation (e.g. how the Cavs build around Lebron James after Kyrie Irving’s request for a trade, or how to build around the new pairing of Chris Paul and James Harden for the Rockets).
# Packages Required
library(tidyverse) # used for several functions in r, including dplyr, ggplot2, etc.
library(jsonlite) # used to convert JSON files to dataframes
library(plotly) # used for making graphs interactive and easier to interpret (especially when overplotted)
library(DT) # used for providing a data dictionary in HTMLI had to compile two data sets for Team data and Player data. I did this by downloading JSON files from the stats.nba.com website HTML and using the “jsonlite” package to scrape this data and flatten it into data frames.
I combined all of these data frames into two single data sets. I also pulled Real Plus/Minus data from ESPN, converted it into a .csv and imported that. I was able to merge all the JSON dataframes and the csv based on either a Player ID from nba.com or the Player Name. I had to clean up the ‘Name’ variable from ESPN to match the NBA data.
I dropped any unneccessary variables (the NBA JSON files provided separate variables to rank all the included variables, etc.). I started with a couple hundred variables, but pared this down to 24 variables for Team data and 43 variables for Player data that might be of interest later on.
datatable(player_data,
extensions = c("FixedColumns","Buttons","KeyTable","Scroller"),
options = list(orderClasses = TRUE,
fixedColumns = list(leftColumns = 5),
dom = "Bfrtip",
buttons = c("copy","csv","excel","colvis"),
keys = TRUE,
deferRender = FALSE,
scrollY = 750,
scrollX = 1000,
scroller = TRUE,
filter = "top",
orderMulti = TRUE,
columnDefs =
list(list(visible = FALSE, targets = c(1,3)))))#Creating Data Dictionary
VarType<- c("PLAYER_NAME",
"TEAM_ABBREVIATION",
"GP",
"W",
"L",
"MIN",
"OFF_RATING",
"DEF_RATING",
"NET_RATING",
"AST_PCT",
"AST_TO",
"AST_RATIO",
"OREB_PCT",
"DREB_PCT",
"REB_PCT",
"TM_TOV_PCT",
"EFG_PCT",
"TS_PCT",
"USG_PCT",
"PACE",
"PIE",
"FG_PCT",
"PCT_UAST_FGM",
"DFG2M",
"DFG3M",
"D_FG_PCT",
"PLUS_MINUS",
"ORPM",
"DRPM",
"RPM",
"WINS",
"TEAM_PIE",
"TEAM_PLUS_MINUS",
"TEAM_WINS")
VarDesc<-c("Player Name",
"3-letter abbreviation of team name",
"Games Played",
"Wins",
"Losses",
"Minutes Played",
"Offensive Rating (points per 100 possessions while player is on court)",
"Defensive Rating (points allowed per 100 possesions while player is on court)",
"Net Rating (point differential per 100 possessions while player is on court)",
"Assist Percentage (percentage of shots a player assists (AST / TmFGM - FGM))",
"Assist to Turnover Ratio",
"Assist Ratio (number of assists per 100 possessions used)",
"Offensive Rebound Percentage (percentage of available offensive rebounds a player grabbed while on floor)",
"Defensive Rebound Percentage (percentage of available defensive rebounds a player grabbed while on floor)",
"Overall Rebound Percentage (percentage of total rebounds a player grabbed while on floor)",
"Team Turnover Percentage (turnovers per 100 possessions used by a player)",
"Effective Field Goal Percentage (adjusts field goal percentage based on higher expected value of 3-pointers)",
"True Shooting Percentage (metric of shooting efficiency (points / points possible on possessions with FG or FT attempt))",
"Usage Rate (possessions used by a player when on the floor ((FGA + (.44*FTA) + TO) / Possessions)))",
"Pace of play (number of possessions per 48 minutes)",
"Player Impact Estimate (comparable to other advanced statistic ratings e.g. PER)",
"Field Goal Percentage",
"Percentage of FGs made that were unassisted",
"Made 2-point FGs a player defended",
"Made 3-point FGs a player defended",
"Defensive Field Goal Percentage",
"Plus Minus (point differential while player is on the floor)",
"Offensive Real Plus Minus",
"Defensive Real Plus Minus",
"Real Plus Minus",
"RPM Wins (similar to win share, or number of wins a player contributes to his team's win total)",
"Team Player Impact Estimate (overall PIE of the team a player is on)",
"Team Plus Minus (Overall Plus Minus of team a player is on)",
"Team Wins (overall wins of the team a player is on, including games in which a player did not play)")
Data_Dictionary<- as.data.frame(cbind(VarType,VarDesc))
colnames(Data_Dictionary)<-c("Data Type","Variable Description")
kable(Data_Dictionary, caption = "Data Dictionary")| Data Type | Variable Description |
|---|---|
| PLAYER_NAME | Player Name |
| TEAM_ABBREVIATION | 3-letter abbreviation of team name |
| GP | Games Played |
| W | Wins |
| L | Losses |
| MIN | Minutes Played |
| OFF_RATING | Offensive Rating (points per 100 possessions while player is on court) |
| DEF_RATING | Defensive Rating (points allowed per 100 possesions while player is on court) |
| NET_RATING | Net Rating (point differential per 100 possessions while player is on court) |
| AST_PCT | Assist Percentage (percentage of shots a player assists (AST / TmFGM - FGM)) |
| AST_TO | Assist to Turnover Ratio |
| AST_RATIO | Assist Ratio (number of assists per 100 possessions used) |
| OREB_PCT | Offensive Rebound Percentage (percentage of available offensive rebounds a player grabbed while on floor) |
| DREB_PCT | Defensive Rebound Percentage (percentage of available defensive rebounds a player grabbed while on floor) |
| REB_PCT | Overall Rebound Percentage (percentage of total rebounds a player grabbed while on floor) |
| TM_TOV_PCT | Team Turnover Percentage (turnovers per 100 possessions used by a player) |
| EFG_PCT | Effective Field Goal Percentage (adjusts field goal percentage based on higher expected value of 3-pointers) |
| TS_PCT | True Shooting Percentage (metric of shooting efficiency (points / points possible on possessions with FG or FT attempt)) |
| USG_PCT | Usage Rate (possessions used by a player when on the floor ((FGA + (.44*FTA) + TO) / Possessions))) |
| PACE | Pace of play (number of possessions per 48 minutes) |
| PIE | Player Impact Estimate (comparable to other advanced statistic ratings e.g. PER) |
| FG_PCT | Field Goal Percentage |
| PCT_UAST_FGM | Percentage of FGs made that were unassisted |
| DFG2M | Made 2-point FGs a player defended |
| DFG3M | Made 3-point FGs a player defended |
| D_FG_PCT | Defensive Field Goal Percentage |
| PLUS_MINUS | Plus Minus (point differential while player is on the floor) |
| ORPM | Offensive Real Plus Minus |
| DRPM | Defensive Real Plus Minus |
| RPM | Real Plus Minus |
| WINS | RPM Wins (similar to win share, or number of wins a player contributes to his team’s win total) |
| TEAM_PIE | Team Player Impact Estimate (overall PIE of the team a player is on) |
| TEAM_PLUS_MINUS | Team Plus Minus (Overall Plus Minus of team a player is on) |
| TEAM_WINS | Team Wins (overall wins of the team a player is on, including games in which a player did not play) |
datatable(merged_team_data,
extensions = c("FixedColumns","Buttons","KeyTable","Scroller"),
options = list(orderClasses = TRUE,
fixedColumns = list(leftColumns = 3),
dom = "Bfrtip",
buttons = c("copy","csv","excel","colvis"),
keys = TRUE,
deferRender = FALSE,
scrollY = 750,
scrollX = 1000,
scroller = TRUE,
filter = "top",
orderMulti = TRUE,
columnDefs =
list(list(visible = FALSE, targets = 1))))VarType<- c("TEAM_NAME",
"W",
"L",
"W_PCT",
"MIN",
"EFG_PCT",
"TM_TOV_PCT",
"OREB_PCT",
"OPP_EFG_PCT",
"OPP_TOV_PCT",
"OPP_OREB_PCT",
"PLUS_MINUS",
"TS_PCT",
"PIE",
"NET_RATING",
"AST_PCT",
"AST_TO",
"AST_RATIO",
"TEAM_RPM")
VarDesc<-c("Team Name",
"Wins",
"Losses",
"Win Percentage",
"Minutes",
"Effective Field Goal Percentage (adjusts field goal percentage based on higher expected value of 3-pointers)",
"Team Turnover Percentage (turnovers per 100 possessions)",
"Offensive Rebound Percentage (percentage of available offensive rebounds a team grabs)",
"Opponent's Effective Field Goal Percentage",
"Opponent's Turnover Percentage",
"Opponent's Offensive Rebound Percentage",
"Plus Minus (overall point differential)",
"True Shooting Percentage (metric of shooting efficiency (points / points possible on possessions with FG or FT attempt))",
"Player Impact Estimate",
"Net Rating",
"Assist Percentage",
"Assist to Turnovers",
"Assist to Turnover Ratio",
"Team Real Plus Minus")
Data_Dictionary<- as.data.frame(cbind(VarType,VarDesc))
colnames(Data_Dictionary)<-c("Variable","Description")
kable(Data_Dictionary, caption = "Data Dictionary")| Variable | Description |
|---|---|
| TEAM_NAME | Team Name |
| W | Wins |
| L | Losses |
| W_PCT | Win Percentage |
| MIN | Minutes |
| EFG_PCT | Effective Field Goal Percentage (adjusts field goal percentage based on higher expected value of 3-pointers) |
| TM_TOV_PCT | Team Turnover Percentage (turnovers per 100 possessions) |
| OREB_PCT | Offensive Rebound Percentage (percentage of available offensive rebounds a team grabs) |
| OPP_EFG_PCT | Opponent’s Effective Field Goal Percentage |
| OPP_TOV_PCT | Opponent’s Turnover Percentage |
| OPP_OREB_PCT | Opponent’s Offensive Rebound Percentage |
| PLUS_MINUS | Plus Minus (overall point differential) |
| TS_PCT | True Shooting Percentage (metric of shooting efficiency (points / points possible on possessions with FG or FT attempt)) |
| PIE | Player Impact Estimate |
| NET_RATING | Net Rating |
| AST_PCT | Assist Percentage |
| AST_TO | Assist to Turnovers |
| AST_RATIO | Assist to Turnover Ratio |
| TEAM_RPM | Team Real Plus Minus |
Some preliminary analysis shows that there is significant differences in the distributions of player talent across teams, which should be expected. This follows conventional wisdom (the Warriors are good, the Sixers not so much), but are a few interesting points that deserve further analysis. Such as whether the Warriors are better or worse than the sum of their parts (comparing the sum of player Real Plus/Minus, a predicted value, vs. actual Plus/Minus, an observed value), and how other teams compare in the same context.
# KEEP
ggplotly(
player_data %>%
arrange(desc(TEAM_PLUS_MINUS)) %>%
ggplot(aes(TEAM_ABBREVIATION, RPM)) +
geom_boxplot())# KEEP
ggplotly(
player_data %>%
arrange(desc(TEAM_PLUS_MINUS)) %>%
ggplot(aes(RPM, fill = TEAM_ABBREVIATION)) +
geom_density(alpha = .2))# KEEP
ggplotly(
merged_team_data %>%
ggplot(aes(TEAM_RPM, PLUS_MINUS, color = TEAM_NAME)) + geom_point())