library(tidyverse)
library(jsonlite)
library(plotly)
library(DT)
library(bsplus)

# Import json files into data frames

nba_advanced_json <- fromJSON("nba_advanced_json",
                              simplifyDataFrame = TRUE,
                              flatten = TRUE)
nba_advanced_flat <- flatten(nba_advanced_json$resultSets)
nba_advanced <- data.frame(nba_advanced_flat$rowSet)
colnames(nba_advanced) <- t(data.frame(nba_advanced_flat$headers))

nba_assistedFGs_json <- fromJSON("nba_assistedFGs_json",
                                simplifyDataFrame = TRUE,
                                flatten = TRUE)
nba_assistedFGs_flat <- flatten(nba_assistedFGs_json$resultSets)
nba_assistedFGs <- data.frame(nba_assistedFGs_flat$rowSet)
colnames(nba_assistedFGs) <- t(data.frame(nba_assistedFGs_flat$headers))

nba_d2fgm_json <- fromJSON("nba_d2fgm_json",
                                simplifyDataFrame = TRUE,
                                flatten = TRUE)
nba_d2fgm_flat <- flatten(nba_d2fgm_json$resultSets)
nba_d2fgm <- data.frame(nba_d2fgm_flat$rowSet)
colnames(nba_d2fgm) <- t(data.frame(nba_d2fgm_flat$headers))

nba_d3fgm_json <- fromJSON("nba_d3fgm_json",
                                simplifyDataFrame = TRUE,
                                flatten = TRUE)
nba_d3fgm_flat <- flatten(nba_d3fgm_json$resultSets)
nba_d3fgm <- data.frame(nba_d3fgm_flat$rowSet)
colnames(nba_d3fgm) <- t(data.frame(nba_d3fgm_flat$headers))

nba_dfg_json <- fromJSON("nba_dfg_json",
                                simplifyDataFrame = TRUE,
                                flatten = TRUE)
nba_dfg_flat <- flatten(nba_dfg_json$resultSets)
nba_dfg <- data.frame(nba_dfg_flat$rowSet)
colnames(nba_dfg) <- t(data.frame(nba_dfg_flat$headers))

nba_plusminus_json <- fromJSON("nba_plusminus_json",
                                simplifyDataFrame = TRUE,
                                flatten = TRUE)
nba_plusminus_flat <- flatten(nba_plusminus_json$resultSets)
nba_plusminus <- data.frame(nba_plusminus_flat$rowSet)
colnames(nba_plusminus) <- t(data.frame(nba_plusminus_flat$headers))

# End json import
# import csv, edit column = name to match json files
nba_rpm <- read_csv("nba_rpm_csv.csv", col_types = "_?___????")

nba_rpm$Name <- gsub(", C", "", nba_rpm$Name)
nba_rpm$Name <- gsub(", PG", "", nba_rpm$Name)
nba_rpm$Name <- gsub(", SG", "", nba_rpm$Name)
nba_rpm$Name <- gsub(", PF", "", nba_rpm$Name)
nba_rpm$Name <- gsub(", SF", "", nba_rpm$Name)

# Merge relevant variables from json files and csv

merged_data <- 
  nba_advanced %>%
  left_join(nba_assistedFGs[,c("PLAYER_ID","PCT_AST_FGM","PCT_UAST_FGM")],
          by = "PLAYER_ID") %>%
  left_join(nba_d2fgm[,c("PLAYER_NAME","DFG2M")],
            by = "PLAYER_NAME") %>%
  left_join(nba_d3fgm[,c("PLAYER_NAME","DFG3M")],
            by = "PLAYER_NAME") %>%
  left_join(nba_dfg[,c("PLAYER_NAME","D_FG_PCT")],
            by = "PLAYER_NAME") %>%
  left_join(nba_plusminus[,c("PLAYER_ID","PLUS_MINUS")],
            by = "PLAYER_ID") %>%
  left_join(nba_rpm, by = c("PLAYER_NAME" = "Name"))

# import team data from json files

nba_teambasic_json <- fromJSON("nba_teambasic_json",
                              simplifyDataFrame = TRUE,
                              flatten = TRUE)
nba_teambasic_flat <- flatten(nba_teambasic_json$resultSets)
nba_teambasic <- data.frame(nba_teambasic_flat$rowSet)
colnames(nba_teambasic) <- t(data.frame(nba_teambasic_flat$headers))

nba_teamadvanced_json <- fromJSON("nba_teamadvanced_json",
                               simplifyDataFrame = TRUE,
                               flatten = TRUE)
nba_teamadvanced_flat <- flatten(nba_teamadvanced_json$resultSets)
nba_teamadvanced <- data.frame(nba_teamadvanced_flat$rowSet)
colnames(nba_teamadvanced) <- t(data.frame(nba_teamadvanced_flat$headers))

nba_fourfactors_json <- fromJSON("nba_fourfactors_json",
                               simplifyDataFrame = TRUE,
                               flatten = TRUE)
nba_fourfactors_flat <- flatten(nba_fourfactors_json$resultSets)
nba_fourfactors <- data.frame(nba_fourfactors_flat$rowSet)
colnames(nba_fourfactors) <- t(data.frame(nba_fourfactors_flat$headers))

# Merge team data
merged_team_data <-
  nba_fourfactors %>%
  left_join(nba_teambasic[,c("TEAM_ID","PLUS_MINUS")],
            by = "TEAM_ID") %>%
  left_join(nba_teamadvanced[,c("TEAM_ID","EFG_PCT","TS_PCT","PIE",
                                "NET_RATING", "AST_PCT","AST_TO","AST_RATIO")],
            by = "TEAM_ID")
# filter out unneeded variables
merged_data <- select(merged_data, -(GP_RANK:CFPARAMS))
merged_team_data <- select(merged_team_data, -(GP_RANK:CFPARAMS))

# match team data to players and player data to teams
player_data <-
  merged_data %>%
  left_join(mutate(merged_team_data, 
                   TEAM_PIE = PIE, 
                   TEAM_PLUS_MINUS = PLUS_MINUS,
                   TEAM_WINS = W)
            [,c("TEAM_ID","TEAM_PIE","TEAM_PLUS_MINUS","TEAM_WINS")],
       by = "TEAM_ID")

merged_team_data <- 
  mutate(merged_team_data, TEAM_RPM = 
           aggregate(RPM ~ TEAM_ABBREVIATION, player_data, sum)[,c("RPM")])

Introduction

What’s the question?

Which NBA players make their teammates better?

Anyone that follows professional basketball knows that the NBA has realized a talent boom in the last 5-10 years. Popularity, attention, and league-wide revenue has been growing, as well as punditry and data. Fans of the league are riveted by such important, profound questions such as “is Kevin Durant a cupcake?

More importantly, the NBA is on the cutting-edge of data collection in sports. Thanks to the NBA’s partnership with STATS SportVU, they are tracking data well beyond the box score most of us have seen in a newspaper. SportVU is a system of 6 cameras hung from the ceiling that track ball and player movement 25 times per second to produce metrics such as secondary assists, miles traveled, made field goals defended at rim, and one frequency on pick and rolls, etc.

The Golden State Warriors have received high praise this season as the best team ever. They are frequently said to play the right type of basketball, that they’re unselfish, and that they make their teammates better.

Basketball is one of the most popular sports in America and by far my personal favorite. The strategy on and off the court is fascinating. It’s easy to give a big contract to the player who scored the most points or the MVP. But Russell Westbrook didn’t make it out of the first round of the playoffs. It’s a complex game and we finally have the means to answer the really complex questions that keep fans coming back.

What data do we need?

Lucky for us, the NBA has made the data they collect readily available on their website. Unfortunately, they do not make it readily available for download. This might require some text scraping, but some other fans have already done some work on this.

Variables of interest include:

  • Real Plus/Minus
  • Points Scored
  • Points Scored Against
  • Player Efficiency Rating (PER)
  • Usage Rates
  • Effective Field Goal %
  • True Shooting %

Other variables will be considered as the model is constructed. I will use data from the 2016-2017 NBA regular season and postseason.

What methodology can we use?

I’ll use comparative statistics to analyze team differences with and without certain players. I will likely use linear regression to weigh the different dynamics of a basketball game and which aspects of a player truly make his teammates better.

Given the nature of plus-minus statistics, being calculated at a team level, lineup level, and player level, I may find value in a hierarchical model.

Ultimately, there are a few assumptions I’ll have to more fully address:

  • How am I defining making your teammates ‘better’?
    • I’ll look at Real Plus/Minus while a particular player is on the court. This captures both an offensive and defensive perspective.
  • How will I measure a player’s impact on this definition of ‘better’?
    • I will try to extract a player’s direct impact on his team’s Real Plus/Minus (i.e. points he scores or allows himself)
  • And how do I determine the difference between making teammates better and a synergistic effect of particular teammates.
    • Comparing similar lineups with and without a particular player may shed light on this, but ultimately this question may not matter. The information is still useful in picking lineups that maximize a group of players’ value.

Who cares?

Besides the aforementioned pundits and fans, basketball organizations should care, players should care, agents should care, everyone who gets paid from basketball related revenue should care. Team strategy off the court is proving critical . It’s hard to gain an edge in a league with a salary cap and paying the best players and getting the best coach doesn’t cut it anymore. Being able to answer questions like these allow teams to find value where others miss it and optimize the resources they have.

Further iterations on this question would allow teams to understand not only which players impact their teammates, but which types of teammates best suit star players. This could help organizations tailor their teams around a foundation (e.g. how the Cavs build around Lebron James after Kyrie Irving’s request for a trade, or how to build around the new pairing of Chris Paul and James Harden for the Rockets).

Packages & Data Prep

Packages Used

  # Packages Required
  library(tidyverse) # used for several functions in r, including dplyr, ggplot2, etc.
  library(jsonlite) # used to convert JSON files to dataframes
  library(plotly) # used for making graphs interactive and easier to interpret (especially when overplotted)
  library(DT) # used for providing a data dictionary in HTML

Data Set

I had to compile two data sets for Team data and Player data. I did this by downloading JSON files from the stats.nba.com website HTML and using the “jsonlite” package to scrape this data and flatten it into data frames.

I combined all of these data frames into two single data sets. I also pulled Real Plus/Minus data from ESPN, converted it into a .csv and imported that. I was able to merge all the JSON dataframes and the csv based on either a Player ID from nba.com or the Player Name. I had to clean up the ‘Name’ variable from ESPN to match the NBA data.

I dropped any unneccessary variables (the NBA JSON files provided separate variables to rank all the included variables, etc.). I started with a couple hundred variables, but pared this down to 24 variables for Team data and 43 variables for Player data that might be of interest later on.

Player Data

datatable(player_data,
          extensions = c("FixedColumns","Buttons","KeyTable","Scroller"),
          options = list(orderClasses = TRUE,
                         fixedColumns = list(leftColumns = 5),
                         dom = "Bfrtip",
                         buttons = c("copy","csv","excel","colvis"),
                         keys = TRUE,
                         deferRender = FALSE,
                         scrollY = 750,
                         scrollX = 1000,
                         scroller = TRUE,
                         filter = "top",
                         orderMulti = TRUE,
                         columnDefs = 
                           list(list(visible = FALSE, targets = c(1,3)))))
#Creating Data Dictionary

VarType<- c("PLAYER_NAME",
            "TEAM_ABBREVIATION",
            "GP",
            "W",
            "L",
            "MIN",
            "OFF_RATING",
            "DEF_RATING",
            "NET_RATING",
            "AST_PCT",
            "AST_TO",
            "AST_RATIO",
            "OREB_PCT",
            "DREB_PCT",
            "REB_PCT",
            "TM_TOV_PCT",
            "EFG_PCT",
            "TS_PCT",
            "USG_PCT",
            "PACE",
            "PIE",
            "FG_PCT",
            "PCT_UAST_FGM",
            "DFG2M",
            "DFG3M",
            "D_FG_PCT",
            "PLUS_MINUS",
            "ORPM",
            "DRPM",
            "RPM",
            "WINS",
            "TEAM_PIE",
            "TEAM_PLUS_MINUS",
            "TEAM_WINS")

VarDesc<-c("Player Name",
           "3-letter abbreviation of team name",
           "Games Played",
           "Wins",
           "Losses",
           "Minutes Played",
           "Offensive Rating (points per 100 possessions while player is on court)",
           "Defensive Rating (points allowed per 100 possesions while player is on court)",
           "Net Rating (point differential per 100 possessions while player is on court)",
           "Assist Percentage (percentage of shots a player assists (AST / TmFGM - FGM))",
           "Assist to Turnover Ratio",
           "Assist Ratio (number of assists per 100 possessions used)",
           "Offensive Rebound Percentage (percentage of available offensive rebounds a player grabbed while on floor)",
           "Defensive Rebound Percentage (percentage of available defensive rebounds a player grabbed while on floor)",
           "Overall Rebound Percentage (percentage of total rebounds a player grabbed while on floor)",
           "Team Turnover Percentage (turnovers per 100 possessions used by a player)",
           "Effective Field Goal Percentage (adjusts field goal percentage based on higher expected value of 3-pointers)",
           "True Shooting Percentage (metric of shooting efficiency (points / points possible on possessions with FG or FT attempt))",
           "Usage Rate (possessions used by a player when on the floor ((FGA + (.44*FTA) + TO) / Possessions)))",
           "Pace of play (number of possessions per 48 minutes)",
           "Player Impact Estimate (comparable to other advanced statistic ratings e.g. PER)",
           "Field Goal Percentage",
           "Percentage of FGs made that were unassisted",
           "Made 2-point FGs a player defended",
           "Made 3-point FGs a player defended",
           "Defensive Field Goal Percentage",
           "Plus Minus (point differential while player is on the floor)",
           "Offensive Real Plus Minus",
           "Defensive Real Plus Minus",
           "Real Plus Minus",
           "RPM Wins (similar to win share, or number of wins a player contributes to his team's win total)",
           "Team Player Impact Estimate (overall PIE of the team a player is on)",
           "Team Plus Minus (Overall Plus Minus of team a player is on)",
           "Team Wins (overall wins of the team a player is on, including games in which a player did not play)")
Data_Dictionary<- as.data.frame(cbind(VarType,VarDesc))
colnames(Data_Dictionary)<-c("Data Type","Variable Description")
kable(Data_Dictionary, caption = "Data Dictionary")
Data Dictionary
Data Type Variable Description
PLAYER_NAME Player Name
TEAM_ABBREVIATION 3-letter abbreviation of team name
GP Games Played
W Wins
L Losses
MIN Minutes Played
OFF_RATING Offensive Rating (points per 100 possessions while player is on court)
DEF_RATING Defensive Rating (points allowed per 100 possesions while player is on court)
NET_RATING Net Rating (point differential per 100 possessions while player is on court)
AST_PCT Assist Percentage (percentage of shots a player assists (AST / TmFGM - FGM))
AST_TO Assist to Turnover Ratio
AST_RATIO Assist Ratio (number of assists per 100 possessions used)
OREB_PCT Offensive Rebound Percentage (percentage of available offensive rebounds a player grabbed while on floor)
DREB_PCT Defensive Rebound Percentage (percentage of available defensive rebounds a player grabbed while on floor)
REB_PCT Overall Rebound Percentage (percentage of total rebounds a player grabbed while on floor)
TM_TOV_PCT Team Turnover Percentage (turnovers per 100 possessions used by a player)
EFG_PCT Effective Field Goal Percentage (adjusts field goal percentage based on higher expected value of 3-pointers)
TS_PCT True Shooting Percentage (metric of shooting efficiency (points / points possible on possessions with FG or FT attempt))
USG_PCT Usage Rate (possessions used by a player when on the floor ((FGA + (.44*FTA) + TO) / Possessions)))
PACE Pace of play (number of possessions per 48 minutes)
PIE Player Impact Estimate (comparable to other advanced statistic ratings e.g. PER)
FG_PCT Field Goal Percentage
PCT_UAST_FGM Percentage of FGs made that were unassisted
DFG2M Made 2-point FGs a player defended
DFG3M Made 3-point FGs a player defended
D_FG_PCT Defensive Field Goal Percentage
PLUS_MINUS Plus Minus (point differential while player is on the floor)
ORPM Offensive Real Plus Minus
DRPM Defensive Real Plus Minus
RPM Real Plus Minus
WINS RPM Wins (similar to win share, or number of wins a player contributes to his team’s win total)
TEAM_PIE Team Player Impact Estimate (overall PIE of the team a player is on)
TEAM_PLUS_MINUS Team Plus Minus (Overall Plus Minus of team a player is on)
TEAM_WINS Team Wins (overall wins of the team a player is on, including games in which a player did not play)

Team Data

datatable(merged_team_data,
          extensions = c("FixedColumns","Buttons","KeyTable","Scroller"),
          options = list(orderClasses = TRUE,
                         fixedColumns = list(leftColumns = 3),
                         dom = "Bfrtip",
                         buttons = c("copy","csv","excel","colvis"),
                         keys = TRUE,
                         deferRender = FALSE,
                         scrollY = 750,
                         scrollX = 1000,
                         scroller = TRUE,
                         filter = "top",
                         orderMulti = TRUE,
                         columnDefs = 
                           list(list(visible = FALSE, targets = 1))))
VarType<- c("TEAM_NAME",
            "W",
            "L",
            "W_PCT",
            "MIN",
            "EFG_PCT",
            "TM_TOV_PCT",
            "OREB_PCT",
            "OPP_EFG_PCT",
            "OPP_TOV_PCT",
            "OPP_OREB_PCT",
            "PLUS_MINUS",
            "TS_PCT",
            "PIE",
            "NET_RATING",
            "AST_PCT",
            "AST_TO",
            "AST_RATIO",
            "TEAM_RPM")

VarDesc<-c("Team Name",
           "Wins",
           "Losses",
           "Win Percentage",
           "Minutes",
           "Effective Field Goal Percentage (adjusts field goal percentage based on higher expected value of 3-pointers)",
           "Team Turnover Percentage (turnovers per 100 possessions)",
           "Offensive Rebound Percentage (percentage of available offensive rebounds a team grabs)",
           "Opponent's Effective Field Goal Percentage",
           "Opponent's Turnover Percentage",
           "Opponent's Offensive Rebound Percentage",
           "Plus Minus (overall point differential)",
           "True Shooting Percentage (metric of shooting efficiency (points / points possible on possessions with FG or FT attempt))",
           "Player Impact Estimate",
           "Net Rating",
           "Assist Percentage",
           "Assist to Turnovers",
           "Assist to Turnover Ratio",
           "Team Real Plus Minus")
Data_Dictionary<- as.data.frame(cbind(VarType,VarDesc))
colnames(Data_Dictionary)<-c("Variable","Description")
kable(Data_Dictionary, caption = "Data Dictionary")
Data Dictionary
Variable Description
TEAM_NAME Team Name
W Wins
L Losses
W_PCT Win Percentage
MIN Minutes
EFG_PCT Effective Field Goal Percentage (adjusts field goal percentage based on higher expected value of 3-pointers)
TM_TOV_PCT Team Turnover Percentage (turnovers per 100 possessions)
OREB_PCT Offensive Rebound Percentage (percentage of available offensive rebounds a team grabs)
OPP_EFG_PCT Opponent’s Effective Field Goal Percentage
OPP_TOV_PCT Opponent’s Turnover Percentage
OPP_OREB_PCT Opponent’s Offensive Rebound Percentage
PLUS_MINUS Plus Minus (overall point differential)
TS_PCT True Shooting Percentage (metric of shooting efficiency (points / points possible on possessions with FG or FT attempt))
PIE Player Impact Estimate
NET_RATING Net Rating
AST_PCT Assist Percentage
AST_TO Assist to Turnovers
AST_RATIO Assist to Turnover Ratio
TEAM_RPM Team Real Plus Minus

Exploratory Analysis

Some preliminary analysis shows that there is significant differences in the distributions of player talent across teams, which should be expected. This follows conventional wisdom (the Warriors are good, the Sixers not so much), but are a few interesting points that deserve further analysis. Such as whether the Warriors are better or worse than the sum of their parts (comparing the sum of player Real Plus/Minus, a predicted value, vs. actual Plus/Minus, an observed value), and how other teams compare in the same context.

# KEEP
ggplotly(
  player_data %>%
  arrange(desc(TEAM_PLUS_MINUS)) %>%
  ggplot(aes(TEAM_ABBREVIATION, RPM)) +
    geom_boxplot())
# KEEP
ggplotly(
  player_data %>%
  arrange(desc(TEAM_PLUS_MINUS)) %>%
  ggplot(aes(RPM, fill = TEAM_ABBREVIATION)) +
  geom_density(alpha = .2))
# KEEP
ggplotly(
merged_team_data %>%
  ggplot(aes(TEAM_RPM, PLUS_MINUS, color = TEAM_NAME)) + geom_point())