In the NBA, team success can be influenced by both individual player performance and team dynamics. Therefore we can look at how much a certain player contributes to each winning game for their team. This can be used to understand how valuable players are to the team.
library(tidyverse)
library(dplyr)
library(ggplot2)
library(corrplot)
library(hoopR)
library(caret)
library(performance)
library(sjPlot)
library(lme4)
We are going to use the hoopR package from R and filter the data from the last 5 seasons.
# Load NBA player stats
NBA_seasons = 2020:2025
nba_data <- load_nba_player_box(NBA_seasons)
# Quick look
head(nba_data)
## # A tibble: 6 × 57
## game_id season season_type game_date game_date_time athlete_id
## <int> <int> <int> <date> <dttm> <int>
## 1 401248438 2020 3 2020-10-11 2020-10-11 19:30:00 6583
## 2 401248438 2020 3 2020-10-11 2020-10-11 19:30:00 3988
## 3 401248438 2020 3 2020-10-11 2020-10-11 19:30:00 1966
## 4 401248438 2020 3 2020-10-11 2020-10-11 19:30:00 2581018
## 5 401248438 2020 3 2020-10-11 2020-10-11 19:30:00 2991350
## 6 401248438 2020 3 2020-10-11 2020-10-11 19:30:00 6461
## # ℹ 51 more variables: athlete_display_name <chr>, team_id <int>,
## # team_name <chr>, team_location <chr>, team_short_display_name <chr>,
## # minutes <dbl>, field_goals_made <int>, field_goals_attempted <int>,
## # three_point_field_goals_made <int>,
## # three_point_field_goals_attempted <int>, free_throws_made <int>,
## # free_throws_attempted <int>, offensive_rebounds <int>,
## # defensive_rebounds <int>, rebounds <int>, assists <int>, steals <int>, …
names(nba_data)
## [1] "game_id" "season"
## [3] "season_type" "game_date"
## [5] "game_date_time" "athlete_id"
## [7] "athlete_display_name" "team_id"
## [9] "team_name" "team_location"
## [11] "team_short_display_name" "minutes"
## [13] "field_goals_made" "field_goals_attempted"
## [15] "three_point_field_goals_made" "three_point_field_goals_attempted"
## [17] "free_throws_made" "free_throws_attempted"
## [19] "offensive_rebounds" "defensive_rebounds"
## [21] "rebounds" "assists"
## [23] "steals" "blocks"
## [25] "turnovers" "fouls"
## [27] "plus_minus" "points"
## [29] "starter" "ejected"
## [31] "did_not_play" "reason"
## [33] "active" "athlete_jersey"
## [35] "athlete_short_name" "athlete_headshot_href"
## [37] "athlete_position_name" "athlete_position_abbreviation"
## [39] "team_display_name" "team_uid"
## [41] "team_slug" "team_logo"
## [43] "team_abbreviation" "team_color"
## [45] "team_alternate_color" "home_away"
## [47] "team_winner" "team_score"
## [49] "opponent_team_id" "opponent_team_name"
## [51] "opponent_team_location" "opponent_team_display_name"
## [53] "opponent_team_abbreviation" "opponent_team_logo"
## [55] "opponent_team_color" "opponent_team_alternate_color"
## [57] "opponent_team_score"
As there is 57 variables in the data we need to clean the data and choose which variables are relevant to use and which are not.
nba_clean <- nba_data %>%
select(game_id, game_date, team = team_abbreviation,
player = athlete_display_name,
points, assists, rebounds, steals, blocks, turnovers, fouls,
minutes, plus_minus, team_winner, defensive_rebounds, free_throws_made,
offensive_rebounds, three_point_field_goals_made, field_goals_made,
opponent_team_score, field_goals_attempted,
three_point_field_goals_attempted, free_throws_attempted, team_score
) %>%
rename(win = team_winner,
ft_made = free_throws_made,
threes_made = three_point_field_goals_made,
fg_made = field_goals_made,
opp_score = opponent_team_score,
fg_att = field_goals_attempted,
three_att = three_point_field_goals_attempted,
ft_att = free_throws_attempted,
home_score = team_score)
# Change win to 0/1
nba_clean <- nba_clean %>%
mutate(win = as.numeric(win))
We can now have a look at the data to learn what we are working with, identify any issues like missing data and help us decide what modelling procedure to take.
str(nba_clean)
## hoopR_dt [163,322 × 24] (S3: hoopR_data/tbl_df/tbl/data.table/data.frame)
## $ game_id : int [1:163322] 401248438 401248438 401248438 401248438 401248438 401248438 401248438 401248438 401248438 401248438 ...
## $ game_date : Date[1:163322], format: "2020-10-11" "2020-10-11" ...
## $ team : chr [1:163322] "LAL" "LAL" "LAL" "LAL" ...
## $ player : chr [1:163322] "Anthony Davis" "Danny Green" "LeBron James" "Kentavious Caldwell-Pope" ...
## $ points : int [1:163322] 19 11 28 17 4 3 0 2 3 19 ...
## $ assists : int [1:163322] 3 1 10 0 5 0 0 0 0 4 ...
## $ rebounds : int [1:163322] 15 5 14 2 3 2 0 1 0 4 ...
## $ steals : int [1:163322] 1 0 1 1 1 0 0 0 0 1 ...
## $ blocks : int [1:163322] 2 1 0 0 1 0 0 0 0 0 ...
## $ turnovers : int [1:163322] 3 0 1 2 1 0 0 1 0 4 ...
## $ fouls : int [1:163322] 4 2 3 5 2 2 0 3 0 1 ...
## $ minutes : num [1:163322] 35 25 41 33 33 17 1 22 1 30 ...
## $ plus_minus : chr [1:163322] "+18" "+5" "+18" "+8" ...
## $ win : num [1:163322] 1 1 1 1 1 1 1 1 1 1 ...
## $ defensive_rebounds: int [1:163322] 11 3 11 1 2 1 0 1 0 4 ...
## $ ft_made : int [1:163322] 5 0 1 3 0 0 0 0 0 0 ...
## $ offensive_rebounds: int [1:163322] 4 2 3 1 1 1 0 0 0 0 ...
## $ threes_made : int [1:163322] 0 3 1 2 0 1 0 0 1 3 ...
## $ fg_made : int [1:163322] 7 4 13 6 2 1 0 1 1 8 ...
## $ opp_score : int [1:163322] 93 93 93 93 93 93 93 93 93 93 ...
## $ fg_att : int [1:163322] 17 10 20 13 7 4 1 4 1 11 ...
## $ three_att : int [1:163322] 3 7 5 7 2 2 1 2 1 4 ...
## $ ft_att : int [1:163322] 7 0 4 3 0 0 0 0 0 0 ...
## $ home_score : int [1:163322] 106 106 106 106 106 106 106 106 106 106 ...
summary(nba_clean)
## game_id game_date team player
## Min. :401160623 Min. :2019-10-22 Length:163322 Length:163322
## 1st Qu.:401267647 1st Qu.:2021-02-23 Class :character Class :character
## Median :401360658 Median :2022-02-10 Mode :character Mode :character
## Mean :401388421 Mean :2022-03-08
## 3rd Qu.:401469260 3rd Qu.:2023-03-24
## Max. :401672980 Max. :2024-06-17
##
## points assists rebounds steals
## Min. : 0.0 Min. : 0.000 Min. : 0.000 Min. : 0.0000
## 1st Qu.: 3.0 1st Qu.: 0.000 1st Qu.: 1.000 1st Qu.: 0.0000
## Median : 9.0 Median : 2.000 Median : 3.000 Median : 0.0000
## Mean :10.5 Mean : 2.339 Mean : 4.107 Mean : 0.6974
## 3rd Qu.:16.0 3rd Qu.: 3.000 3rd Qu.: 6.000 3rd Qu.: 1.0000
## Max. :73.0 Max. :24.000 Max. :31.000 Max. :10.0000
## NA's :29860 NA's :29860 NA's :29860 NA's :29860
## blocks turnovers fouls minutes
## Min. : 0.0000 Min. : 0.000 Min. :0.000 Min. : 0.00
## 1st Qu.: 0.0000 1st Qu.: 0.000 1st Qu.:1.000 1st Qu.:15.00
## Median : 0.0000 Median : 1.000 Median :2.000 Median :24.00
## Mean : 0.4514 Mean : 1.234 Mean :1.841 Mean :22.56
## 3rd Qu.: 1.0000 3rd Qu.: 2.000 3rd Qu.:3.000 3rd Qu.:32.00
## Max. :10.0000 Max. :12.000 Max. :6.000 Max. :57.00
## NA's :29860 NA's :29860 NA's :29860 NA's :29861
## plus_minus win defensive_rebounds ft_made
## Length:163322 Min. :0.0000 Min. : 0.000 Min. : 0.000
## Class :character 1st Qu.:0.0000 1st Qu.: 1.000 1st Qu.: 0.000
## Mode :character Median :1.0000 Median : 3.000 Median : 1.000
## Mean :0.5032 Mean : 3.152 Mean : 1.628
## 3rd Qu.:1.0000 3rd Qu.: 5.000 3rd Qu.: 2.000
## Max. :1.0000 Max. :21.000 Max. :26.000
## NA's :29860 NA's :29860
## offensive_rebounds threes_made fg_made opp_score
## Min. : 0.0000 Min. : 0.000 Min. : 0.000 Min. : 70.0
## 1st Qu.: 0.0000 1st Qu.: 0.000 1st Qu.: 1.000 1st Qu.:104.0
## Median : 0.0000 Median : 1.000 Median : 3.000 Median :112.0
## Mean : 0.9553 Mean : 1.168 Mean : 3.853 Mean :112.4
## 3rd Qu.: 1.0000 3rd Qu.: 2.000 3rd Qu.: 6.000 3rd Qu.:121.0
## Max. :15.0000 Max. :16.000 Max. :25.000 Max. :211.0
## NA's :29860 NA's :29860 NA's :29860
## fg_att three_att ft_att home_score
## Min. : 0.000 Min. : 0.000 Min. : 0.00 Min. : 70.0
## 1st Qu.: 3.000 1st Qu.: 1.000 1st Qu.: 0.00 1st Qu.:104.0
## Median : 7.000 Median : 3.000 Median : 1.00 Median :112.0
## Mean : 8.247 Mean : 3.239 Mean : 2.09 Mean :112.5
## 3rd Qu.:12.000 3rd Qu.: 5.000 3rd Qu.: 3.00 3rd Qu.:121.0
## Max. :47.000 Max. :27.000 Max. :32.00 Max. :211.0
## NA's :29860 NA's :29860 NA's :29860
head(nba_clean)
## # A tibble: 6 × 24
## game_id game_date team player points assists rebounds steals blocks
## <int> <date> <chr> <chr> <int> <int> <int> <int> <int>
## 1 401248438 2020-10-11 LAL Anthony Davis 19 3 15 1 2
## 2 401248438 2020-10-11 LAL Danny Green 11 1 5 0 1
## 3 401248438 2020-10-11 LAL LeBron James 28 10 14 1 0
## 4 401248438 2020-10-11 LAL Kentavious C… 17 0 2 1 0
## 5 401248438 2020-10-11 LAL Alex Caruso 4 5 3 1 1
## 6 401248438 2020-10-11 LAL Markieff Mor… 3 0 2 0 0
## # ℹ 15 more variables: turnovers <int>, fouls <int>, minutes <dbl>,
## # plus_minus <chr>, win <dbl>, defensive_rebounds <int>, ft_made <int>,
## # offensive_rebounds <int>, threes_made <int>, fg_made <int>,
## # opp_score <int>, fg_att <int>, three_att <int>, ft_att <int>,
## # home_score <int>
#Any missing values?
colSums(is.na(nba_clean))
## game_id game_date team player
## 0 0 0 0
## points assists rebounds steals
## 29860 29860 29860 29860
## blocks turnovers fouls minutes
## 29860 29860 29860 29861
## plus_minus win defensive_rebounds ft_made
## 29860 0 29860 29860
## offensive_rebounds threes_made fg_made opp_score
## 29860 29860 29860 0
## fg_att three_att ft_att home_score
## 29860 29860 29860 0
We can see here that quite a few variables have missing values, 36111 to be exact. So we can remove them from the data to make the data cleaner.
nba_clean <- nba_clean %>%
filter(!is.na(minutes) & minutes > 0)
nba_clean <- nba_clean %>%
mutate(
fg_pct = ifelse(fg_att > 0, fg_made / fg_att, 0),
ft_pct = ifelse(ft_att > 0, ft_made / ft_att, 0),
three_pct = ifelse(three_att > 0, threes_made / three_att, 0),
efficiency = (points + rebounds + assists + steals + blocks - turnovers) / minutes
)
colSums(is.na(nba_clean))
## game_id game_date team player
## 0 0 0 0
## points assists rebounds steals
## 0 0 0 0
## blocks turnovers fouls minutes
## 0 0 0 0
## plus_minus win defensive_rebounds ft_made
## 0 0 0 0
## offensive_rebounds threes_made fg_made opp_score
## 0 0 0 0
## fg_att three_att ft_att home_score
## 0 0 0 0
## fg_pct ft_pct three_pct efficiency
## 0 0 0 0
Lets now take a look at our data
#Average Stats
print(nba_clean %>%
summarise(
avg_points = mean(points),
avg_assists = mean(assists),
avg_rebounds = mean(rebounds),
avg_steals = mean(steals),
avg_blocks = mean(blocks),
avg_turnovers = mean(turnovers),
avg_minutes = mean(minutes)
))
## # A tibble: 1 × 7
## avg_points avg_assists avg_rebounds avg_steals avg_blocks avg_turnovers
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 10.6 2.37 4.16 0.707 0.457 1.25
## # ℹ 1 more variable: avg_minutes <dbl>
# Histogram of points
ggplot(nba_clean, aes(x = points)) +
geom_histogram(binwidth = 2, fill = "steelblue", color = "black") +
labs(title = "Distribution of Player Points")
# Boxplot of assists by team
ggplot(nba_clean, aes(x = team, y = assists)) +
geom_boxplot(fill = "lightgreen") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Assists Distribution by Team")
#Correlation
nba_num_cols <- c("points", "assists", "rebounds", "steals", "blocks",
"turnovers","minutes", "defensive_rebounds", "ft_made",
"offensive_rebounds","threes_made", "fg_made", "opp_score",
"fg_att", "three_att", "ft_att", "home_score")
# Compute correlation matrix using complete observations
nba_cor_matrix <- cor(nba_clean[, nba_num_cols], use = "complete.obs")
# Plot correlation matrix
corrplot(nba_cor_matrix, method = "color", type = "upper",
addCoef.col = "black", tl.col = "black", tl.srt = 45,
tl.cex = 0.8, number.cex = 0.6)
set.seed(123)
# Create data partition, 70% train, 30% test
split <- createDataPartition(nba_clean$win, p = 0.7, list = FALSE)
# Split data
train <- nba_clean[split, ]
test <- nba_clean[-split, ]
We used logistic regression because the outcome (win/loss) we are trying to predict is binary.
model_1 <- glm(win ~ points + assists + rebounds + steals + blocks + turnovers +
minutes + ft_made + threes_made + fg_att + ft_att + three_att +
fouls,
data = train,
family = binomial)
#Check Summary and Accuracy of model
summary(model_1)
##
## Call:
## glm(formula = win ~ points + assists + rebounds + steals + blocks +
## turnovers + minutes + ft_made + threes_made + fg_att + ft_att +
## three_att + fouls, family = binomial, data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.117349 0.016254 7.220 5.22e-13 ***
## points 0.083791 0.003003 27.904 < 2e-16 ***
## assists 0.088253 0.003375 26.148 < 2e-16 ***
## rebounds 0.053894 0.002610 20.646 < 2e-16 ***
## steals 0.081559 0.007756 10.516 < 2e-16 ***
## blocks 0.102401 0.008917 11.484 < 2e-16 ***
## turnovers -0.088299 0.005836 -15.130 < 2e-16 ***
## minutes -0.018909 0.001232 -15.345 < 2e-16 ***
## ft_made 0.052801 0.010716 4.927 8.35e-07 ***
## threes_made 0.048781 0.011968 4.076 4.59e-05 ***
## fg_att -0.108871 0.003937 -27.653 < 2e-16 ***
## ft_att -0.087739 0.008751 -10.027 < 2e-16 ***
## three_att 0.004839 0.005851 0.827 0.408
## fouls -0.035179 0.005123 -6.867 6.57e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 127777 on 92171 degrees of freedom
## Residual deviance: 123873 on 92158 degrees of freedom
## AIC: 123901
##
## Number of Fisher Scoring iterations: 4
model_performance(model_1)
## # Indices of model performance
##
## AIC | AICc | BIC | Tjur's R2 | RMSE | Sigma | Log_loss | Score_log | PCP
## --------------------------------------------------------------------------------------
## 1.2e+05 | 1.2e+05 | 1.2e+05 | 0.041 | 0.490 | 1 | 0.672 | -Inf | 0.521
This model looks preety good having all predictors under 0.05. We can howeber adjust these predictors, removing some to see what the model does or even adding interaction effects.
#This model has removed any variable that has a correlation with another greater than 0.80
model_2 <- glm(win ~ points + assists + rebounds + steals + blocks + turnovers +
minutes + fouls,
data = train,
family = binomial)
#Check Summary and Accuracy of model
summary(model_2)
##
## Call:
## glm(formula = win ~ points + assists + rebounds + steals + blocks +
## turnovers + minutes + fouls, family = binomial, data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.138877 0.016158 8.595 < 2e-16 ***
## points 0.031902 0.001170 27.255 < 2e-16 ***
## assists 0.070241 0.003281 21.410 < 2e-16 ***
## rebounds 0.041882 0.002415 17.342 < 2e-16 ***
## steals 0.077768 0.007664 10.148 < 2e-16 ***
## blocks 0.111677 0.008777 12.723 < 2e-16 ***
## turnovers -0.111195 0.005729 -19.410 < 2e-16 ***
## minutes -0.032159 0.001115 -28.830 < 2e-16 ***
## fouls -0.025542 0.005053 -5.054 4.32e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 127777 on 92171 degrees of freedom
## Residual deviance: 125605 on 92163 degrees of freedom
## AIC: 125623
##
## Number of Fisher Scoring iterations: 4
model_performance(model_2)
## # Indices of model performance
##
## AIC | AICc | BIC | Tjur's R2 | RMSE | Sigma | Log_loss | Score_log | PCP
## --------------------------------------------------------------------------------------
## 1.3e+05 | 1.3e+05 | 1.3e+05 | 0.023 | 0.494 | 1 | 0.681 | -Inf | 0.512
From this we can see that model 1 has a higher R squared value of 0.042 compared to model 2 (0.024). Therefore it is the slightly better model. However the value is still relatively small meaning that model explains about 4% of the variation in wins.
#Model using different values
model_3 <- glm(win ~ efficiency + fg_pct + ft_pct + three_pct,
data = train,
family = binomial)
#Check Summary and Accuracy of model
summary(model_3)
##
## Call:
## glm(formula = win ~ efficiency + fg_pct + ft_pct + three_pct,
## family = binomial, data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.41426 0.01568 -26.422 < 2e-16 ***
## efficiency 0.42783 0.02319 18.448 < 2e-16 ***
## fg_pct 0.21705 0.03246 6.686 2.30e-11 ***
## ft_pct -0.11645 0.01633 -7.130 1.01e-12 ***
## three_pct 0.27876 0.02495 11.174 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 127777 on 92171 degrees of freedom
## Residual deviance: 126656 on 92167 degrees of freedom
## AIC: 126666
##
## Number of Fisher Scoring iterations: 4
model_performance(model_3)
## # Indices of model performance
##
## AIC | AICc | BIC | Tjur's R2 | RMSE | Sigma | Log_loss | Score_log | PCP
## --------------------------------------------------------------------------------------
## 1.3e+05 | 1.3e+05 | 1.3e+05 | 0.012 | 0.497 | 1 | 0.687 | -Inf | 0.506
#Model with a two-way interaction between points and minutes
model_4 <- glm(win ~ points * minutes + assists + rebounds + steals + blocks +
turnovers,
data = train, family = binomial)
#Check Summary and Accuracy of model
summary(model_4)
##
## Call:
## glm(formula = win ~ points * minutes + assists + rebounds + steals +
## blocks + turnovers, family = binomial, data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.809e-02 1.961e-02 2.963 0.00305 **
## points 4.904e-02 3.050e-03 16.082 < 2e-16 ***
## minutes -3.142e-02 1.138e-03 -27.613 < 2e-16 ***
## assists 7.353e-02 3.289e-03 22.359 < 2e-16 ***
## rebounds 4.032e-02 2.408e-03 16.743 < 2e-16 ***
## steals 7.622e-02 7.664e-03 9.945 < 2e-16 ***
## blocks 1.083e-01 8.761e-03 12.360 < 2e-16 ***
## turnovers -1.121e-01 5.707e-03 -19.639 < 2e-16 ***
## points:minutes -5.221e-04 8.716e-05 -5.990 2.1e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 127777 on 92171 degrees of freedom
## Residual deviance: 125595 on 92163 degrees of freedom
## AIC: 125613
##
## Number of Fisher Scoring iterations: 4
model_performance(model_4)
## # Indices of model performance
##
## AIC | AICc | BIC | Tjur's R2 | RMSE | Sigma | Log_loss | Score_log | PCP
## --------------------------------------------------------------------------------------
## 1.3e+05 | 1.3e+05 | 1.3e+05 | 0.023 | 0.494 | 1 | 0.681 | -Inf | 0.512
#Model with different interaction effects
model_5 <- glm(win ~ points + minutes + assists + rebounds * turnovers + steals +
blocks + ft_made * ft_att + threes_made + fg_att + three_att +
fouls,
data = train,
family = binomial)
#Check Summary and Accuracy of model
summary(model_5)
##
## Call:
## glm(formula = win ~ points + minutes + assists + rebounds * turnovers +
## steals + blocks + ft_made * ft_att + threes_made + fg_att +
## three_att + fouls, family = binomial, data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.1251197 0.0168015 7.447 9.55e-14 ***
## points 0.0839588 0.0030058 27.933 < 2e-16 ***
## minutes -0.0179278 0.0012689 -14.129 < 2e-16 ***
## assists 0.0878900 0.0033797 26.005 < 2e-16 ***
## rebounds 0.0525484 0.0034628 15.175 < 2e-16 ***
## turnovers -0.0933019 0.0092525 -10.084 < 2e-16 ***
## steals 0.0815061 0.0077587 10.505 < 2e-16 ***
## blocks 0.1022477 0.0089203 11.462 < 2e-16 ***
## ft_made 0.0304654 0.0116080 2.625 0.00868 **
## ft_att -0.0956234 0.0088921 -10.754 < 2e-16 ***
## threes_made 0.0484407 0.0119760 4.045 5.24e-05 ***
## fg_att -0.1092304 0.0039409 -27.717 < 2e-16 ***
## three_att 0.0047401 0.0058543 0.810 0.41813
## fouls -0.0344572 0.0051282 -6.719 1.83e-11 ***
## rebounds:turnovers 0.0008128 0.0013109 0.620 0.53524
## ft_made:ft_att 0.0032090 0.0006342 5.060 4.19e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 127777 on 92171 degrees of freedom
## Residual deviance: 123845 on 92156 degrees of freedom
## AIC: 123877
##
## Number of Fisher Scoring iterations: 4
model_performance(model_5)
## # Indices of model performance
##
## AIC | AICc | BIC | Tjur's R2 | RMSE | Sigma | Log_loss | Score_log | PCP
## --------------------------------------------------------------------------------------
## 1.2e+05 | 1.2e+05 | 1.2e+05 | 0.041 | 0.490 | 1 | 0.672 | -Inf | 0.521
#Compare Models
#Compare models
AIC(model_1, model_2, model_3, model_4, model_5)
## df AIC
## model_1 14 123900.7
## model_2 9 125623.0
## model_3 5 126666.1
## model_4 9 125612.7
## model_5 16 123876.9
tab_model(model_1, model_2, model_3, model_4, model_5)
| win | win | win | win | win | |||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Predictors | Odds Ratios | CI | p | Odds Ratios | CI | p | Odds Ratios | CI | p | Odds Ratios | CI | p | Odds Ratios | CI | p |
| (Intercept) | 1.12 | 1.09 – 1.16 | <0.001 | 1.15 | 1.11 – 1.19 | <0.001 | 0.66 | 0.64 – 0.68 | <0.001 | 1.06 | 1.02 – 1.10 | 0.003 | 1.13 | 1.10 – 1.17 | <0.001 |
| points | 1.09 | 1.08 – 1.09 | <0.001 | 1.03 | 1.03 – 1.03 | <0.001 | 1.05 | 1.04 – 1.06 | <0.001 | 1.09 | 1.08 – 1.09 | <0.001 | |||
| assists | 1.09 | 1.09 – 1.10 | <0.001 | 1.07 | 1.07 – 1.08 | <0.001 | 1.08 | 1.07 – 1.08 | <0.001 | 1.09 | 1.08 – 1.10 | <0.001 | |||
| rebounds | 1.06 | 1.05 – 1.06 | <0.001 | 1.04 | 1.04 – 1.05 | <0.001 | 1.04 | 1.04 – 1.05 | <0.001 | 1.05 | 1.05 – 1.06 | <0.001 | |||
| steals | 1.08 | 1.07 – 1.10 | <0.001 | 1.08 | 1.06 – 1.10 | <0.001 | 1.08 | 1.06 – 1.10 | <0.001 | 1.08 | 1.07 – 1.10 | <0.001 | |||
| blocks | 1.11 | 1.09 – 1.13 | <0.001 | 1.12 | 1.10 – 1.14 | <0.001 | 1.11 | 1.10 – 1.13 | <0.001 | 1.11 | 1.09 – 1.13 | <0.001 | |||
| turnovers | 0.92 | 0.91 – 0.93 | <0.001 | 0.89 | 0.88 – 0.90 | <0.001 | 0.89 | 0.88 – 0.90 | <0.001 | 0.91 | 0.89 – 0.93 | <0.001 | |||
| minutes | 0.98 | 0.98 – 0.98 | <0.001 | 0.97 | 0.97 – 0.97 | <0.001 | 0.97 | 0.97 – 0.97 | <0.001 | 0.98 | 0.98 – 0.98 | <0.001 | |||
| ft made | 1.05 | 1.03 – 1.08 | <0.001 | 1.03 | 1.01 – 1.05 | 0.009 | |||||||||
| threes made | 1.05 | 1.03 – 1.07 | <0.001 | 1.05 | 1.03 – 1.07 | <0.001 | |||||||||
| fg att | 0.90 | 0.89 – 0.90 | <0.001 | 0.90 | 0.89 – 0.90 | <0.001 | |||||||||
| ft att | 0.92 | 0.90 – 0.93 | <0.001 | 0.91 | 0.89 – 0.92 | <0.001 | |||||||||
| three att | 1.00 | 0.99 – 1.02 | 0.408 | 1.00 | 0.99 – 1.02 | 0.418 | |||||||||
| fouls | 0.97 | 0.96 – 0.98 | <0.001 | 0.97 | 0.97 – 0.98 | <0.001 | 0.97 | 0.96 – 0.98 | <0.001 | ||||||
| efficiency | 1.53 | 1.47 – 1.61 | <0.001 | ||||||||||||
| fg pct | 1.24 | 1.17 – 1.32 | <0.001 | ||||||||||||
| ft pct | 0.89 | 0.86 – 0.92 | <0.001 | ||||||||||||
| three pct | 1.32 | 1.26 – 1.39 | <0.001 | ||||||||||||
| points × minutes | 1.00 | 1.00 – 1.00 | <0.001 | ||||||||||||
| rebounds × turnovers | 1.00 | 1.00 – 1.00 | 0.535 | ||||||||||||
| ft made × ft att | 1.00 | 1.00 – 1.00 | <0.001 | ||||||||||||
| Observations | 92172 | 92172 | 92172 | 92172 | 92172 | ||||||||||
| R2 Tjur | 0.041 | 0.023 | 0.012 | 0.023 | 0.041 | ||||||||||
Out of the 5 models tested, model 5 has a lower AIC value meaning that it has the best fit and complexity out of all models.
test_model <- test %>%
select(points, minutes, assists, rebounds, turnovers, steals, blocks,
ft_made, ft_att, threes_made, fg_att, three_att, fouls, win, player)
# Predict win probability
test_model$win_prob <- predict(model_5, newdata = test_model, type = "response")
# Classify as win/loss using 0.5 threshold
test_model$win_pred <- ifelse(test_model$win_prob > 0.5, 1, 0)
#Test accuracy through confusion matrix
confusionMatrix(factor(test_model$win_pred), factor(test_model$win))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 11190 8319
## 1 8457 11535
##
## Accuracy : 0.5753
## 95% CI : (0.5704, 0.5802)
## No Information Rate : 0.5026
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.1505
##
## Mcnemar's Test P-Value : 0.2902
##
## Sensitivity : 0.5696
## Specificity : 0.5810
## Pos Pred Value : 0.5736
## Neg Pred Value : 0.5770
## Prevalence : 0.4974
## Detection Rate : 0.2833
## Detection Prevalence : 0.4939
## Balanced Accuracy : 0.5753
##
## 'Positive' Class : 0
##
The model is showing an accuracy value of 0.575, meaning that the model is 57% accurate. The model demonstrates moderate predictive performance, and there is room for improvement. For the purposes of learning and exploration, we will continue using this model.
Players with higher predicted probabilities of winning are identified as the ones most strongly associated with team success.
test_model$win <- factor(test_model$win, levels = c(0, 1))
player_contrib <- test_model %>%
group_by(player) %>%
summarise(
avg_win_prob = mean(win_prob, na.rm = TRUE),
avg_efficiency = mean((points + rebounds + assists + steals + blocks - turnovers)/minutes, na.rm = TRUE),
games_played = n()
) %>%
arrange(desc(avg_win_prob))
#Filter out any players that have played less than 10 games
player_contrib <- player_contrib %>%
filter(games_played >= 10) %>%
arrange(desc(avg_win_prob))
head(player_contrib)
## # A tibble: 6 × 4
## player avg_win_prob avg_efficiency games_played
## <chr> <dbl> <dbl> <int>
## 1 Nikola Jokic 0.674 1.27 149
## 2 James Harden 0.628 1.05 97
## 3 Joel Embiid 0.615 1.37 103
## 4 Giannis Antetokounmpo 0.612 1.44 113
## 5 Tyrese Haliburton 0.611 0.910 86
## 6 Walker Kessler 0.598 0.835 43
# Filter the top 25 players
top_players <- player_contrib %>%
arrange(desc(avg_win_prob)) %>%
slice_head(n = 25)
ggplot(top_players, aes(x = reorder(player, avg_win_prob), y = avg_win_prob)) +
geom_col(fill = "purple") +
coord_flip() +
labs(
title = "Top NBA Players Contributing to Team Success",
x = "Player",
y = "Team Success Contribution"
) +
theme_minimal()