Background

In the NBA, team success can be influenced by both individual player performance and team dynamics. Therefore we can look at how much a certain player contributes to each winning game for their team. This can be used to understand how valuable players are to the team.

Load Libraries

library(tidyverse)
library(dplyr)
library(ggplot2)
library(corrplot)
library(hoopR)
library(caret)
library(performance)
library(sjPlot)
library(lme4)

Import data

We are going to use the hoopR package from R and filter the data from the last 5 seasons.

# Load NBA player stats
NBA_seasons = 2020:2025

nba_data <- load_nba_player_box(NBA_seasons)

# Quick look
head(nba_data)
## # A tibble: 6 × 57
##     game_id season season_type game_date  game_date_time      athlete_id
##       <int>  <int>       <int> <date>     <dttm>                   <int>
## 1 401248438   2020           3 2020-10-11 2020-10-11 19:30:00       6583
## 2 401248438   2020           3 2020-10-11 2020-10-11 19:30:00       3988
## 3 401248438   2020           3 2020-10-11 2020-10-11 19:30:00       1966
## 4 401248438   2020           3 2020-10-11 2020-10-11 19:30:00    2581018
## 5 401248438   2020           3 2020-10-11 2020-10-11 19:30:00    2991350
## 6 401248438   2020           3 2020-10-11 2020-10-11 19:30:00       6461
## # ℹ 51 more variables: athlete_display_name <chr>, team_id <int>,
## #   team_name <chr>, team_location <chr>, team_short_display_name <chr>,
## #   minutes <dbl>, field_goals_made <int>, field_goals_attempted <int>,
## #   three_point_field_goals_made <int>,
## #   three_point_field_goals_attempted <int>, free_throws_made <int>,
## #   free_throws_attempted <int>, offensive_rebounds <int>,
## #   defensive_rebounds <int>, rebounds <int>, assists <int>, steals <int>, …
names(nba_data)
##  [1] "game_id"                           "season"                           
##  [3] "season_type"                       "game_date"                        
##  [5] "game_date_time"                    "athlete_id"                       
##  [7] "athlete_display_name"              "team_id"                          
##  [9] "team_name"                         "team_location"                    
## [11] "team_short_display_name"           "minutes"                          
## [13] "field_goals_made"                  "field_goals_attempted"            
## [15] "three_point_field_goals_made"      "three_point_field_goals_attempted"
## [17] "free_throws_made"                  "free_throws_attempted"            
## [19] "offensive_rebounds"                "defensive_rebounds"               
## [21] "rebounds"                          "assists"                          
## [23] "steals"                            "blocks"                           
## [25] "turnovers"                         "fouls"                            
## [27] "plus_minus"                        "points"                           
## [29] "starter"                           "ejected"                          
## [31] "did_not_play"                      "reason"                           
## [33] "active"                            "athlete_jersey"                   
## [35] "athlete_short_name"                "athlete_headshot_href"            
## [37] "athlete_position_name"             "athlete_position_abbreviation"    
## [39] "team_display_name"                 "team_uid"                         
## [41] "team_slug"                         "team_logo"                        
## [43] "team_abbreviation"                 "team_color"                       
## [45] "team_alternate_color"              "home_away"                        
## [47] "team_winner"                       "team_score"                       
## [49] "opponent_team_id"                  "opponent_team_name"               
## [51] "opponent_team_location"            "opponent_team_display_name"       
## [53] "opponent_team_abbreviation"        "opponent_team_logo"               
## [55] "opponent_team_color"               "opponent_team_alternate_color"    
## [57] "opponent_team_score"

Clean the data

As there is 57 variables in the data we need to clean the data and choose which variables are relevant to use and which are not.

nba_clean <- nba_data %>%
  select(game_id, game_date, team = team_abbreviation,
         player = athlete_display_name,
         points, assists, rebounds, steals, blocks, turnovers, fouls,
         minutes, plus_minus, team_winner, defensive_rebounds, free_throws_made,
         offensive_rebounds, three_point_field_goals_made, field_goals_made,
         opponent_team_score, field_goals_attempted,
         three_point_field_goals_attempted, free_throws_attempted, team_score
         ) %>%
  rename(win = team_winner, 
         ft_made = free_throws_made, 
         threes_made = three_point_field_goals_made, 
         fg_made = field_goals_made, 
         opp_score = opponent_team_score, 
         fg_att = field_goals_attempted, 
         three_att = three_point_field_goals_attempted, 
         ft_att = free_throws_attempted, 
         home_score = team_score)

# Change win to 0/1
nba_clean <- nba_clean %>%
  mutate(win = as.numeric(win))

Exploratory Data Analysis

We can now have a look at the data to learn what we are working with, identify any issues like missing data and help us decide what modelling procedure to take.

str(nba_clean)
## hoopR_dt [163,322 × 24] (S3: hoopR_data/tbl_df/tbl/data.table/data.frame)
##  $ game_id           : int [1:163322] 401248438 401248438 401248438 401248438 401248438 401248438 401248438 401248438 401248438 401248438 ...
##  $ game_date         : Date[1:163322], format: "2020-10-11" "2020-10-11" ...
##  $ team              : chr [1:163322] "LAL" "LAL" "LAL" "LAL" ...
##  $ player            : chr [1:163322] "Anthony Davis" "Danny Green" "LeBron James" "Kentavious Caldwell-Pope" ...
##  $ points            : int [1:163322] 19 11 28 17 4 3 0 2 3 19 ...
##  $ assists           : int [1:163322] 3 1 10 0 5 0 0 0 0 4 ...
##  $ rebounds          : int [1:163322] 15 5 14 2 3 2 0 1 0 4 ...
##  $ steals            : int [1:163322] 1 0 1 1 1 0 0 0 0 1 ...
##  $ blocks            : int [1:163322] 2 1 0 0 1 0 0 0 0 0 ...
##  $ turnovers         : int [1:163322] 3 0 1 2 1 0 0 1 0 4 ...
##  $ fouls             : int [1:163322] 4 2 3 5 2 2 0 3 0 1 ...
##  $ minutes           : num [1:163322] 35 25 41 33 33 17 1 22 1 30 ...
##  $ plus_minus        : chr [1:163322] "+18" "+5" "+18" "+8" ...
##  $ win               : num [1:163322] 1 1 1 1 1 1 1 1 1 1 ...
##  $ defensive_rebounds: int [1:163322] 11 3 11 1 2 1 0 1 0 4 ...
##  $ ft_made           : int [1:163322] 5 0 1 3 0 0 0 0 0 0 ...
##  $ offensive_rebounds: int [1:163322] 4 2 3 1 1 1 0 0 0 0 ...
##  $ threes_made       : int [1:163322] 0 3 1 2 0 1 0 0 1 3 ...
##  $ fg_made           : int [1:163322] 7 4 13 6 2 1 0 1 1 8 ...
##  $ opp_score         : int [1:163322] 93 93 93 93 93 93 93 93 93 93 ...
##  $ fg_att            : int [1:163322] 17 10 20 13 7 4 1 4 1 11 ...
##  $ three_att         : int [1:163322] 3 7 5 7 2 2 1 2 1 4 ...
##  $ ft_att            : int [1:163322] 7 0 4 3 0 0 0 0 0 0 ...
##  $ home_score        : int [1:163322] 106 106 106 106 106 106 106 106 106 106 ...
summary(nba_clean)
##     game_id            game_date              team              player         
##  Min.   :401160623   Min.   :2019-10-22   Length:163322      Length:163322     
##  1st Qu.:401267647   1st Qu.:2021-02-23   Class :character   Class :character  
##  Median :401360658   Median :2022-02-10   Mode  :character   Mode  :character  
##  Mean   :401388421   Mean   :2022-03-08                                        
##  3rd Qu.:401469260   3rd Qu.:2023-03-24                                        
##  Max.   :401672980   Max.   :2024-06-17                                        
##                                                                                
##      points         assists          rebounds          steals       
##  Min.   : 0.0    Min.   : 0.000   Min.   : 0.000   Min.   : 0.0000  
##  1st Qu.: 3.0    1st Qu.: 0.000   1st Qu.: 1.000   1st Qu.: 0.0000  
##  Median : 9.0    Median : 2.000   Median : 3.000   Median : 0.0000  
##  Mean   :10.5    Mean   : 2.339   Mean   : 4.107   Mean   : 0.6974  
##  3rd Qu.:16.0    3rd Qu.: 3.000   3rd Qu.: 6.000   3rd Qu.: 1.0000  
##  Max.   :73.0    Max.   :24.000   Max.   :31.000   Max.   :10.0000  
##  NA's   :29860   NA's   :29860    NA's   :29860    NA's   :29860    
##      blocks          turnovers          fouls          minutes     
##  Min.   : 0.0000   Min.   : 0.000   Min.   :0.000   Min.   : 0.00  
##  1st Qu.: 0.0000   1st Qu.: 0.000   1st Qu.:1.000   1st Qu.:15.00  
##  Median : 0.0000   Median : 1.000   Median :2.000   Median :24.00  
##  Mean   : 0.4514   Mean   : 1.234   Mean   :1.841   Mean   :22.56  
##  3rd Qu.: 1.0000   3rd Qu.: 2.000   3rd Qu.:3.000   3rd Qu.:32.00  
##  Max.   :10.0000   Max.   :12.000   Max.   :6.000   Max.   :57.00  
##  NA's   :29860     NA's   :29860    NA's   :29860   NA's   :29861  
##   plus_minus             win         defensive_rebounds    ft_made      
##  Length:163322      Min.   :0.0000   Min.   : 0.000     Min.   : 0.000  
##  Class :character   1st Qu.:0.0000   1st Qu.: 1.000     1st Qu.: 0.000  
##  Mode  :character   Median :1.0000   Median : 3.000     Median : 1.000  
##                     Mean   :0.5032   Mean   : 3.152     Mean   : 1.628  
##                     3rd Qu.:1.0000   3rd Qu.: 5.000     3rd Qu.: 2.000  
##                     Max.   :1.0000   Max.   :21.000     Max.   :26.000  
##                                      NA's   :29860      NA's   :29860   
##  offensive_rebounds  threes_made        fg_made         opp_score    
##  Min.   : 0.0000    Min.   : 0.000   Min.   : 0.000   Min.   : 70.0  
##  1st Qu.: 0.0000    1st Qu.: 0.000   1st Qu.: 1.000   1st Qu.:104.0  
##  Median : 0.0000    Median : 1.000   Median : 3.000   Median :112.0  
##  Mean   : 0.9553    Mean   : 1.168   Mean   : 3.853   Mean   :112.4  
##  3rd Qu.: 1.0000    3rd Qu.: 2.000   3rd Qu.: 6.000   3rd Qu.:121.0  
##  Max.   :15.0000    Max.   :16.000   Max.   :25.000   Max.   :211.0  
##  NA's   :29860      NA's   :29860    NA's   :29860                   
##      fg_att         three_att          ft_att        home_score   
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.00   Min.   : 70.0  
##  1st Qu.: 3.000   1st Qu.: 1.000   1st Qu.: 0.00   1st Qu.:104.0  
##  Median : 7.000   Median : 3.000   Median : 1.00   Median :112.0  
##  Mean   : 8.247   Mean   : 3.239   Mean   : 2.09   Mean   :112.5  
##  3rd Qu.:12.000   3rd Qu.: 5.000   3rd Qu.: 3.00   3rd Qu.:121.0  
##  Max.   :47.000   Max.   :27.000   Max.   :32.00   Max.   :211.0  
##  NA's   :29860    NA's   :29860    NA's   :29860
head(nba_clean)
## # A tibble: 6 × 24
##     game_id game_date  team  player        points assists rebounds steals blocks
##       <int> <date>     <chr> <chr>          <int>   <int>    <int>  <int>  <int>
## 1 401248438 2020-10-11 LAL   Anthony Davis     19       3       15      1      2
## 2 401248438 2020-10-11 LAL   Danny Green       11       1        5      0      1
## 3 401248438 2020-10-11 LAL   LeBron James      28      10       14      1      0
## 4 401248438 2020-10-11 LAL   Kentavious C…     17       0        2      1      0
## 5 401248438 2020-10-11 LAL   Alex Caruso        4       5        3      1      1
## 6 401248438 2020-10-11 LAL   Markieff Mor…      3       0        2      0      0
## # ℹ 15 more variables: turnovers <int>, fouls <int>, minutes <dbl>,
## #   plus_minus <chr>, win <dbl>, defensive_rebounds <int>, ft_made <int>,
## #   offensive_rebounds <int>, threes_made <int>, fg_made <int>,
## #   opp_score <int>, fg_att <int>, three_att <int>, ft_att <int>,
## #   home_score <int>
#Any missing values?
colSums(is.na(nba_clean))
##            game_id          game_date               team             player 
##                  0                  0                  0                  0 
##             points            assists           rebounds             steals 
##              29860              29860              29860              29860 
##             blocks          turnovers              fouls            minutes 
##              29860              29860              29860              29861 
##         plus_minus                win defensive_rebounds            ft_made 
##              29860                  0              29860              29860 
## offensive_rebounds        threes_made            fg_made          opp_score 
##              29860              29860              29860                  0 
##             fg_att          three_att             ft_att         home_score 
##              29860              29860              29860                  0

We can see here that quite a few variables have missing values, 36111 to be exact. So we can remove them from the data to make the data cleaner.

nba_clean <- nba_clean %>%
  filter(!is.na(minutes) & minutes > 0)

nba_clean <- nba_clean %>%
  mutate(
    fg_pct = ifelse(fg_att > 0, fg_made / fg_att, 0),
    ft_pct = ifelse(ft_att > 0, ft_made / ft_att, 0),
    three_pct = ifelse(three_att > 0, threes_made / three_att, 0),
    efficiency = (points + rebounds + assists + steals + blocks - turnovers) / minutes
  )

colSums(is.na(nba_clean))
##            game_id          game_date               team             player 
##                  0                  0                  0                  0 
##             points            assists           rebounds             steals 
##                  0                  0                  0                  0 
##             blocks          turnovers              fouls            minutes 
##                  0                  0                  0                  0 
##         plus_minus                win defensive_rebounds            ft_made 
##                  0                  0                  0                  0 
## offensive_rebounds        threes_made            fg_made          opp_score 
##                  0                  0                  0                  0 
##             fg_att          three_att             ft_att         home_score 
##                  0                  0                  0                  0 
##             fg_pct             ft_pct          three_pct         efficiency 
##                  0                  0                  0                  0

Lets now take a look at our data

#Average Stats
print(nba_clean %>%
  summarise(
    avg_points = mean(points),
    avg_assists = mean(assists),
    avg_rebounds = mean(rebounds),
    avg_steals = mean(steals),
    avg_blocks = mean(blocks),
    avg_turnovers = mean(turnovers),
    avg_minutes = mean(minutes)
  ))
## # A tibble: 1 × 7
##   avg_points avg_assists avg_rebounds avg_steals avg_blocks avg_turnovers
##        <dbl>       <dbl>        <dbl>      <dbl>      <dbl>         <dbl>
## 1       10.6        2.37         4.16      0.707      0.457          1.25
## # ℹ 1 more variable: avg_minutes <dbl>
# Histogram of points
ggplot(nba_clean, aes(x = points)) +
  geom_histogram(binwidth = 2, fill = "steelblue", color = "black") +
  labs(title = "Distribution of Player Points")

# Boxplot of assists by team
ggplot(nba_clean, aes(x = team, y = assists)) +
  geom_boxplot(fill = "lightgreen") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Assists Distribution by Team")

#Correlation
nba_num_cols <- c("points", "assists", "rebounds", "steals", "blocks", 
                  "turnovers","minutes", "defensive_rebounds", "ft_made",
                  "offensive_rebounds","threes_made", "fg_made", "opp_score",
                  "fg_att", "three_att", "ft_att", "home_score")

# Compute correlation matrix using complete observations
nba_cor_matrix <- cor(nba_clean[, nba_num_cols], use = "complete.obs")

# Plot correlation matrix
corrplot(nba_cor_matrix, method = "color", type = "upper",
         addCoef.col = "black", tl.col = "black", tl.srt = 45,
         tl.cex = 0.8, number.cex = 0.6)

Split the data

set.seed(123)

# Create data partition, 70% train, 30% test
split <- createDataPartition(nba_clean$win, p = 0.7, list = FALSE)

# Split data
train <- nba_clean[split, ]
test  <- nba_clean[-split, ]

Logistic Regression

We used logistic regression because the outcome (win/loss) we are trying to predict is binary.

model_1 <- glm(win ~ points + assists + rebounds + steals + blocks + turnovers +
                 minutes + ft_made + threes_made + fg_att + ft_att + three_att +
                 fouls,
           data = train,
           family = binomial)

#Check Summary and Accuracy of model
summary(model_1)
## 
## Call:
## glm(formula = win ~ points + assists + rebounds + steals + blocks + 
##     turnovers + minutes + ft_made + threes_made + fg_att + ft_att + 
##     three_att + fouls, family = binomial, data = train)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.117349   0.016254   7.220 5.22e-13 ***
## points       0.083791   0.003003  27.904  < 2e-16 ***
## assists      0.088253   0.003375  26.148  < 2e-16 ***
## rebounds     0.053894   0.002610  20.646  < 2e-16 ***
## steals       0.081559   0.007756  10.516  < 2e-16 ***
## blocks       0.102401   0.008917  11.484  < 2e-16 ***
## turnovers   -0.088299   0.005836 -15.130  < 2e-16 ***
## minutes     -0.018909   0.001232 -15.345  < 2e-16 ***
## ft_made      0.052801   0.010716   4.927 8.35e-07 ***
## threes_made  0.048781   0.011968   4.076 4.59e-05 ***
## fg_att      -0.108871   0.003937 -27.653  < 2e-16 ***
## ft_att      -0.087739   0.008751 -10.027  < 2e-16 ***
## three_att    0.004839   0.005851   0.827    0.408    
## fouls       -0.035179   0.005123  -6.867 6.57e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 127777  on 92171  degrees of freedom
## Residual deviance: 123873  on 92158  degrees of freedom
## AIC: 123901
## 
## Number of Fisher Scoring iterations: 4
model_performance(model_1)
## # Indices of model performance
## 
## AIC     |    AICc |     BIC | Tjur's R2 |  RMSE | Sigma | Log_loss | Score_log |   PCP
## --------------------------------------------------------------------------------------
## 1.2e+05 | 1.2e+05 | 1.2e+05 |     0.041 | 0.490 |     1 |    0.672 |      -Inf | 0.521

This model looks preety good having all predictors under 0.05. We can howeber adjust these predictors, removing some to see what the model does or even adding interaction effects.

#This model has removed any variable that has a correlation with another greater than 0.80
model_2 <- glm(win ~ points + assists + rebounds + steals + blocks + turnovers +
                 minutes + fouls,
           data = train,
           family = binomial)

#Check Summary and Accuracy of model
summary(model_2)
## 
## Call:
## glm(formula = win ~ points + assists + rebounds + steals + blocks + 
##     turnovers + minutes + fouls, family = binomial, data = train)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.138877   0.016158   8.595  < 2e-16 ***
## points       0.031902   0.001170  27.255  < 2e-16 ***
## assists      0.070241   0.003281  21.410  < 2e-16 ***
## rebounds     0.041882   0.002415  17.342  < 2e-16 ***
## steals       0.077768   0.007664  10.148  < 2e-16 ***
## blocks       0.111677   0.008777  12.723  < 2e-16 ***
## turnovers   -0.111195   0.005729 -19.410  < 2e-16 ***
## minutes     -0.032159   0.001115 -28.830  < 2e-16 ***
## fouls       -0.025542   0.005053  -5.054 4.32e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 127777  on 92171  degrees of freedom
## Residual deviance: 125605  on 92163  degrees of freedom
## AIC: 125623
## 
## Number of Fisher Scoring iterations: 4
model_performance(model_2)
## # Indices of model performance
## 
## AIC     |    AICc |     BIC | Tjur's R2 |  RMSE | Sigma | Log_loss | Score_log |   PCP
## --------------------------------------------------------------------------------------
## 1.3e+05 | 1.3e+05 | 1.3e+05 |     0.023 | 0.494 |     1 |    0.681 |      -Inf | 0.512

From this we can see that model 1 has a higher R squared value of 0.042 compared to model 2 (0.024). Therefore it is the slightly better model. However the value is still relatively small meaning that model explains about 4% of the variation in wins.

#Model using different values
model_3 <- glm(win ~ efficiency + fg_pct + ft_pct + three_pct,
           data = train,
           family = binomial)

#Check Summary and Accuracy of model
summary(model_3)
## 
## Call:
## glm(formula = win ~ efficiency + fg_pct + ft_pct + three_pct, 
##     family = binomial, data = train)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.41426    0.01568 -26.422  < 2e-16 ***
## efficiency   0.42783    0.02319  18.448  < 2e-16 ***
## fg_pct       0.21705    0.03246   6.686 2.30e-11 ***
## ft_pct      -0.11645    0.01633  -7.130 1.01e-12 ***
## three_pct    0.27876    0.02495  11.174  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 127777  on 92171  degrees of freedom
## Residual deviance: 126656  on 92167  degrees of freedom
## AIC: 126666
## 
## Number of Fisher Scoring iterations: 4
model_performance(model_3)
## # Indices of model performance
## 
## AIC     |    AICc |     BIC | Tjur's R2 |  RMSE | Sigma | Log_loss | Score_log |   PCP
## --------------------------------------------------------------------------------------
## 1.3e+05 | 1.3e+05 | 1.3e+05 |     0.012 | 0.497 |     1 |    0.687 |      -Inf | 0.506
#Model with a two-way interaction between points and minutes
model_4 <- glm(win ~ points * minutes + assists + rebounds + steals + blocks + 
                 turnovers,
             data = train, family = binomial)

#Check Summary and Accuracy of model
summary(model_4)
## 
## Call:
## glm(formula = win ~ points * minutes + assists + rebounds + steals + 
##     blocks + turnovers, family = binomial, data = train)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     5.809e-02  1.961e-02   2.963  0.00305 ** 
## points          4.904e-02  3.050e-03  16.082  < 2e-16 ***
## minutes        -3.142e-02  1.138e-03 -27.613  < 2e-16 ***
## assists         7.353e-02  3.289e-03  22.359  < 2e-16 ***
## rebounds        4.032e-02  2.408e-03  16.743  < 2e-16 ***
## steals          7.622e-02  7.664e-03   9.945  < 2e-16 ***
## blocks          1.083e-01  8.761e-03  12.360  < 2e-16 ***
## turnovers      -1.121e-01  5.707e-03 -19.639  < 2e-16 ***
## points:minutes -5.221e-04  8.716e-05  -5.990  2.1e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 127777  on 92171  degrees of freedom
## Residual deviance: 125595  on 92163  degrees of freedom
## AIC: 125613
## 
## Number of Fisher Scoring iterations: 4
model_performance(model_4)
## # Indices of model performance
## 
## AIC     |    AICc |     BIC | Tjur's R2 |  RMSE | Sigma | Log_loss | Score_log |   PCP
## --------------------------------------------------------------------------------------
## 1.3e+05 | 1.3e+05 | 1.3e+05 |     0.023 | 0.494 |     1 |    0.681 |      -Inf | 0.512
#Model with different interaction effects
model_5 <- glm(win ~ points + minutes + assists + rebounds * turnovers + steals +
                 blocks + ft_made * ft_att + threes_made + fg_att + three_att +
                 fouls,
           data = train,
           family = binomial)

#Check Summary and Accuracy of model
summary(model_5)
## 
## Call:
## glm(formula = win ~ points + minutes + assists + rebounds * turnovers + 
##     steals + blocks + ft_made * ft_att + threes_made + fg_att + 
##     three_att + fouls, family = binomial, data = train)
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         0.1251197  0.0168015   7.447 9.55e-14 ***
## points              0.0839588  0.0030058  27.933  < 2e-16 ***
## minutes            -0.0179278  0.0012689 -14.129  < 2e-16 ***
## assists             0.0878900  0.0033797  26.005  < 2e-16 ***
## rebounds            0.0525484  0.0034628  15.175  < 2e-16 ***
## turnovers          -0.0933019  0.0092525 -10.084  < 2e-16 ***
## steals              0.0815061  0.0077587  10.505  < 2e-16 ***
## blocks              0.1022477  0.0089203  11.462  < 2e-16 ***
## ft_made             0.0304654  0.0116080   2.625  0.00868 ** 
## ft_att             -0.0956234  0.0088921 -10.754  < 2e-16 ***
## threes_made         0.0484407  0.0119760   4.045 5.24e-05 ***
## fg_att             -0.1092304  0.0039409 -27.717  < 2e-16 ***
## three_att           0.0047401  0.0058543   0.810  0.41813    
## fouls              -0.0344572  0.0051282  -6.719 1.83e-11 ***
## rebounds:turnovers  0.0008128  0.0013109   0.620  0.53524    
## ft_made:ft_att      0.0032090  0.0006342   5.060 4.19e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 127777  on 92171  degrees of freedom
## Residual deviance: 123845  on 92156  degrees of freedom
## AIC: 123877
## 
## Number of Fisher Scoring iterations: 4
model_performance(model_5)
## # Indices of model performance
## 
## AIC     |    AICc |     BIC | Tjur's R2 |  RMSE | Sigma | Log_loss | Score_log |   PCP
## --------------------------------------------------------------------------------------
## 1.2e+05 | 1.2e+05 | 1.2e+05 |     0.041 | 0.490 |     1 |    0.672 |      -Inf | 0.521

#Compare Models

#Compare models
AIC(model_1, model_2, model_3, model_4, model_5)
##         df      AIC
## model_1 14 123900.7
## model_2  9 125623.0
## model_3  5 126666.1
## model_4  9 125612.7
## model_5 16 123876.9
tab_model(model_1, model_2, model_3, model_4, model_5)
  win win win win win
Predictors Odds Ratios CI p Odds Ratios CI p Odds Ratios CI p Odds Ratios CI p Odds Ratios CI p
(Intercept) 1.12 1.09 – 1.16 <0.001 1.15 1.11 – 1.19 <0.001 0.66 0.64 – 0.68 <0.001 1.06 1.02 – 1.10 0.003 1.13 1.10 – 1.17 <0.001
points 1.09 1.08 – 1.09 <0.001 1.03 1.03 – 1.03 <0.001 1.05 1.04 – 1.06 <0.001 1.09 1.08 – 1.09 <0.001
assists 1.09 1.09 – 1.10 <0.001 1.07 1.07 – 1.08 <0.001 1.08 1.07 – 1.08 <0.001 1.09 1.08 – 1.10 <0.001
rebounds 1.06 1.05 – 1.06 <0.001 1.04 1.04 – 1.05 <0.001 1.04 1.04 – 1.05 <0.001 1.05 1.05 – 1.06 <0.001
steals 1.08 1.07 – 1.10 <0.001 1.08 1.06 – 1.10 <0.001 1.08 1.06 – 1.10 <0.001 1.08 1.07 – 1.10 <0.001
blocks 1.11 1.09 – 1.13 <0.001 1.12 1.10 – 1.14 <0.001 1.11 1.10 – 1.13 <0.001 1.11 1.09 – 1.13 <0.001
turnovers 0.92 0.91 – 0.93 <0.001 0.89 0.88 – 0.90 <0.001 0.89 0.88 – 0.90 <0.001 0.91 0.89 – 0.93 <0.001
minutes 0.98 0.98 – 0.98 <0.001 0.97 0.97 – 0.97 <0.001 0.97 0.97 – 0.97 <0.001 0.98 0.98 – 0.98 <0.001
ft made 1.05 1.03 – 1.08 <0.001 1.03 1.01 – 1.05 0.009
threes made 1.05 1.03 – 1.07 <0.001 1.05 1.03 – 1.07 <0.001
fg att 0.90 0.89 – 0.90 <0.001 0.90 0.89 – 0.90 <0.001
ft att 0.92 0.90 – 0.93 <0.001 0.91 0.89 – 0.92 <0.001
three att 1.00 0.99 – 1.02 0.408 1.00 0.99 – 1.02 0.418
fouls 0.97 0.96 – 0.98 <0.001 0.97 0.97 – 0.98 <0.001 0.97 0.96 – 0.98 <0.001
efficiency 1.53 1.47 – 1.61 <0.001
fg pct 1.24 1.17 – 1.32 <0.001
ft pct 0.89 0.86 – 0.92 <0.001
three pct 1.32 1.26 – 1.39 <0.001
points × minutes 1.00 1.00 – 1.00 <0.001
rebounds × turnovers 1.00 1.00 – 1.00 0.535
ft made × ft att 1.00 1.00 – 1.00 <0.001
Observations 92172 92172 92172 92172 92172
R2 Tjur 0.041 0.023 0.012 0.023 0.041

Out of the 5 models tested, model 5 has a lower AIC value meaning that it has the best fit and complexity out of all models.

Check Predictive Performance

test_model <- test %>%
  select(points, minutes, assists, rebounds, turnovers, steals, blocks,
         ft_made, ft_att, threes_made, fg_att, three_att, fouls, win, player)

# Predict win probability
test_model$win_prob <- predict(model_5, newdata = test_model, type = "response")

# Classify as win/loss using 0.5 threshold
test_model$win_pred <- ifelse(test_model$win_prob > 0.5, 1, 0)

#Test accuracy through confusion matrix
confusionMatrix(factor(test_model$win_pred), factor(test_model$win))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 11190  8319
##          1  8457 11535
##                                           
##                Accuracy : 0.5753          
##                  95% CI : (0.5704, 0.5802)
##     No Information Rate : 0.5026          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.1505          
##                                           
##  Mcnemar's Test P-Value : 0.2902          
##                                           
##             Sensitivity : 0.5696          
##             Specificity : 0.5810          
##          Pos Pred Value : 0.5736          
##          Neg Pred Value : 0.5770          
##              Prevalence : 0.4974          
##          Detection Rate : 0.2833          
##    Detection Prevalence : 0.4939          
##       Balanced Accuracy : 0.5753          
##                                           
##        'Positive' Class : 0               
## 

The model is showing an accuracy value of 0.575, meaning that the model is 57% accurate. The model demonstrates moderate predictive performance, and there is room for improvement. For the purposes of learning and exploration, we will continue using this model.

Find player contribution

Players with higher predicted probabilities of winning are identified as the ones most strongly associated with team success.

test_model$win <- factor(test_model$win, levels = c(0, 1))

player_contrib <- test_model %>%
  group_by(player) %>%
  summarise(
    avg_win_prob = mean(win_prob, na.rm = TRUE),
    avg_efficiency = mean((points + rebounds + assists + steals + blocks - turnovers)/minutes, na.rm = TRUE),
    games_played = n()
  ) %>%
  arrange(desc(avg_win_prob))

#Filter out any players that have played less than 10 games
player_contrib <- player_contrib %>%
  filter(games_played >= 10) %>% 
  arrange(desc(avg_win_prob))


head(player_contrib)
## # A tibble: 6 × 4
##   player                avg_win_prob avg_efficiency games_played
##   <chr>                        <dbl>          <dbl>        <int>
## 1 Nikola Jokic                 0.674          1.27           149
## 2 James Harden                 0.628          1.05            97
## 3 Joel Embiid                  0.615          1.37           103
## 4 Giannis Antetokounmpo        0.612          1.44           113
## 5 Tyrese Haliburton            0.611          0.910           86
## 6 Walker Kessler               0.598          0.835           43

Graph reults

# Filter the top 25 players
top_players <- player_contrib %>%
  arrange(desc(avg_win_prob)) %>%
  slice_head(n = 25)

ggplot(top_players, aes(x = reorder(player, avg_win_prob), y = avg_win_prob)) +
  geom_col(fill = "purple") +
  coord_flip() +
  labs(
    title = "Top NBA Players Contributing to Team Success",
    x = "Player",
    y = "Team Success Contribution"
  ) +
  theme_minimal()

As we can see from this chart above, the player that contributes to the most to team success is Nikola Jokic. This makes sense as he is seen as one of the greatest players and centers of all time.