Loading the relevant packages
library(tidyverse)
library(ggplot2)
nba_data <- read.csv("all_seasons.csv")
glimpse(nba_data)
## Rows: 12,844
## Columns: 22
## $ X <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ player_name <chr> "Randy Livingston", "Gaylon Nickerson", "George Lync…
## $ team_abbreviation <chr> "HOU", "WAS", "VAN", "LAL", "DEN", "ORL", "WAS", "CH…
## $ age <dbl> 22, 28, 26, 30, 23, 33, 26, 30, 24, 24, 22, 31, 29, …
## $ player_height <dbl> 193.04, 190.50, 203.20, 203.20, 213.36, 198.12, 231.…
## $ player_weight <dbl> 94.80073, 86.18248, 103.41898, 102.05820, 119.74829,…
## $ college <chr> "Louisiana State", "Northwestern Oklahoma", "North C…
## $ country <chr> "USA", "USA", "USA", "USA", "USA", "USA", "USA", "US…
## $ draft_year <chr> "1996", "1994", "1993", "1989", "1995", "1985", "199…
## $ draft_round <chr> "2", "2", "1", "1", "1", "2", "2", "1", "1", "1", "1…
## $ draft_number <chr> "42", "34", "12", "7", "22", "47", "30", "4", "1", "…
## $ gp <int> 64, 4, 41, 64, 52, 80, 73, 79, 80, 80, 82, 65, 65, 4…
## $ pts <dbl> 3.9, 3.8, 8.3, 10.2, 2.8, 10.6, 10.6, 26.8, 21.1, 21…
## $ reb <dbl> 1.5, 1.3, 6.4, 2.8, 1.7, 2.2, 6.6, 4.0, 6.3, 9.0, 5.…
## $ ast <dbl> 2.4, 0.3, 1.9, 1.7, 0.3, 2.2, 0.4, 2.0, 3.1, 7.3, 1.…
## $ net_rating <dbl> 0.3, 8.9, -8.2, -2.7, -14.1, -5.8, 6.9, 3.2, -2.9, 6…
## $ oreb_pct <dbl> 0.042, 0.030, 0.106, 0.027, 0.102, 0.031, 0.098, 0.0…
## $ dreb_pct <dbl> 0.071, 0.111, 0.185, 0.111, 0.169, 0.064, 0.217, 0.0…
## $ usg_pct <dbl> 0.169, 0.174, 0.175, 0.206, 0.195, 0.203, 0.185, 0.2…
## $ ts_pct <dbl> 0.487, 0.497, 0.512, 0.527, 0.500, 0.503, 0.618, 0.6…
## $ ast_pct <dbl> 0.248, 0.043, 0.125, 0.125, 0.064, 0.143, 0.024, 0.0…
## $ season <chr> "1996-97", "1996-97", "1996-97", "1996-97", "1996-97…
# Remove all rows that have missing data, if there are any
nba_data <- nba_data %>%
drop_na()
# Replace missing values with 0
nba_data[is.na(nba_data)] <- 0
# Cleaning the data
nba_data <- nba_data %>%
mutate(
age = as.numeric(age),
player_height = as.numeric(player_height),
player_weight = as.numeric(player_weight),
pts = as.numeric(pts),
reb = as.numeric(reb),
ast = as.numeric(ast),
net_rating = as.numeric(net_rating),
oreb_pct = as.numeric(oreb_pct),
dreb_pct = as.numeric(dreb_pct),
usg_pct = as.numeric(usg_pct),
ts_pct = as.numeric(ts_pct),
ast_pct = as.numeric(ast_pct),
season = as.factor(season)
)
Creating variables
#total rebound percentage
nba_data <- nba_data %>%
mutate(total_reb_pct = oreb_pct + dreb_pct)
#points per game
nba_data <- nba_data %>%
mutate(points_per_game = pts / gp)
#height in metres
nba_data <- nba_data %>%
mutate(height_m = player_height / 100)
Numerical summaries
summary(nba_data %>%
select(player_height, player_weight, points_per_game, ast, total_reb_pct))
## player_height player_weight points_per_game ast
## Min. :160.0 Min. : 60.33 Min. : 0.00000 Min. : 0.000
## 1st Qu.:193.0 1st Qu.: 90.72 1st Qu.: 0.09074 1st Qu.: 0.600
## Median :200.7 Median : 99.79 Median : 0.13954 Median : 1.200
## Mean :200.6 Mean :100.26 Mean : 0.22954 Mean : 1.825
## 3rd Qu.:208.3 3rd Qu.:108.86 3rd Qu.: 0.22564 3rd Qu.: 2.400
## Max. :231.1 Max. :163.29 Max. :20.00000 Max. :11.700
## total_reb_pct
## Min. :0.0000
## 1st Qu.:0.1210
## Median :0.1740
## Mean :0.1947
## 3rd Qu.:0.2600
## Max. :1.4000
nba_data %>%
summarise(
height_mean = mean(player_height),
height_sd = sd(player_height),
weight_mean = mean(player_weight),
weight_sd = sd(player_weight),
ppg_mean = mean(points_per_game),
ppg_sd = sd(points_per_game),
ast_mean = mean(ast),
ast_sd = sd(ast),
reb_mean = mean(total_reb_pct),
reb_sd = sd(total_reb_pct)
)
## height_mean height_sd weight_mean weight_sd ppg_mean ppg_sd ast_mean
## 1 200.5551 9.11109 100.2633 12.42663 0.2295414 0.5509737 1.824681
## ast_sd reb_mean reb_sd
## 1 1.80084 0.1947191 0.09378032
Histogram for the distribution of points per game
ggplot(nba_data, aes(x = points_per_game)) +
geom_histogram(binwidth = 2) +
labs(title = "Distribution of Points per Game",
x= "Points per Game",
y= "Count"
) +
theme_bw()
I use a histogram to examine the distribution shape and detect any potential outliers. The histogram shows a skewed distribution, where most players score arelatively few number of points per game, while a small number of elite players score significantly more.
Density plot for assists
ggplot(nba_data, aes(x = ast)) +
geom_density() +
labs(title = "Density of Assists",
x= "Assists",
y= "Density"
) +
theme_bw()
I use a density plot to better visualise the distribution shape compared to a histogram. The density plot shows that most player have a low number of assists, with only a few players have a high number of assists. This interpretation could be reflected by differences in the player roles and positions.
Scatter plot for height v weight
ggplot(nba_data, aes(x = player_height, y = player_weight)) +
geom_point() +
geom_smooth(method = "lm") +
labs(title = "Height vs Weight",
x= "Player Height",
y= "Player Weight"
)+
theme_bw()
## `geom_smooth()` using formula = 'y ~ x'
I use a scatter plot to explore the relationship between two continuous variables and add a regression line to identify trends. The scatter plot shows a positive correlation between player height and player weight. The fitted regression line is used to help highlight the correlation in the data.
Barplot for total rebound percentage
ggplot(nba_data, aes(x = total_reb_pct)) +
geom_bar(fill = "steelblue", colour = "black") +
labs(title = "Distribution of Total Rebound Percentage",
x = "Total Rebound Percentage",
y = "Count"
) +
theme_bw()
I use a bar plot to show how frequently each value of Total Rebound Percentage occurs in the dataset. This is useful for visualising the distribution when values are discrete or grouped. The barplot is mainly concentrated between 0 to 0.5, with most players having a similar rebound percentage, but standout players can easily be seen beyond the concentrated area above 0.5.
Scatter plot for points v assists
ggplot(nba_data, aes(x = ast, y = points_per_game)) +
geom_point() +
labs(title = "Points per Game vs Assists",
x= "Assists",
y= "Points per Game"
) +
theme_bw()
This scatterplot is fairly spread out, with some players having a high number of assists and some scoring low. This plot highlights the different player roles within a team, with potentailly the playmakers scoring high and the defensive players having a low score.
These graphical and numerical summaries help us visualise the key patterns in the dataset. Variables such as points and assists are highly variable and skewed, while other attributes are more normally distributed, such as the relationship between height and weight.
Scatterplot of 2 variables
ggplot(nba_data, aes(x = ast, y = points_per_game)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", se = TRUE) +
facet_wrap(~ season) +
labs(
title = "Relationship Between Assists and Points per Game by Season",
x = "Assists per Game",
y = "Points per Game"
) +
theme_bw()
## `geom_smooth()` using formula = 'y ~ x'
I created a scatterplot to examine the relationship between two continuous variables. A linear trend line was added to identify any relationship, and faceting was used to display separate panels for each season. This is useful as the association between the two variables may differ across different seasons.
Bar chart of a continuous variable
library(dplyr)
# Creating a summary table
summary_data <- nba_data %>%
group_by(season) %>%
summarise(
mean_pts = mean(points_per_game),
se_pts = sd(points_per_game) / sqrt(n())
) %>%
arrange(desc(mean_pts))
# Reordering the factor levels
summary_data$season <- factor(summary_data$season,
levels = summary_data$season)
# Plotting the bar chart
ggplot(summary_data, aes(x = season, y = mean_pts)) +
geom_bar(stat = "identity") +
geom_errorbar(aes(ymin = mean_pts - se_pts,
ymax = mean_pts + se_pts),
width = 0.2) +
labs(title = "Average Points per Game by Season",
x = "Season",
y = "Mean Points per Game") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
The bar chart shows the highest to lowest mean points per game, and each bar is represented by a season. This plot allows comparison between mean performances across seasons and the ordering makes it easier to identify which seasons had the highest average points per game. The error bars show the variability and helps assess whether the differences between seasons are meaningful rather than due to a random variation.
Boxplot showing the distribution of a continuous variable
ggplot(nba_data, aes(x = season, y = points_per_game, colour = as.factor(draft_year))) +
geom_boxplot(outlier.shape = NA) +
geom_jitter(width = 0.2, alpha = 0.5) +
labs(title = "Distribution of Points per Game by Season",
x = "Season",
y = "Points per Game",
colour = "Draft Year") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
The boxplot helops summarise the distribution of points per game for each season. I used the jittered raw data points to visualise the individual observations and colour coded them by Draft Year to introduce a second categorical variable. The boxplots summarises the median, quartiles and the spread of the data, while the jittered points show the full distribution of the data.
Choosing variables- Response variable: points per game Continuous predictor: usage percentage Categorical predictor: draft round
Model without interaction
fit1 <- glm(points_per_game ~ usg_pct + draft_round,
family = "gaussian",
data = nba_data)
summary(fit1)
##
## Call:
## glm(formula = points_per_game ~ usg_pct + draft_round, family = "gaussian",
## data = nba_data)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.91958 0.22200 4.142 3.46e-05 ***
## usg_pct 1.77267 0.09203 19.261 < 2e-16 ***
## draft_round1 -1.04425 0.22144 -4.716 2.43e-06 ***
## draft_round2 -1.02484 0.22157 -4.625 3.78e-06 ***
## draft_round3 -1.05631 0.25239 -4.185 2.87e-05 ***
## draft_round4 -1.04250 0.27112 -3.845 0.000121 ***
## draft_round6 -1.04175 0.32832 -3.173 0.001512 **
## draft_round7 -1.03516 0.32834 -3.153 0.001621 **
## draft_round8 -0.96017 0.44290 -2.168 0.030184 *
## draft_roundUndrafted -0.92796 0.22163 -4.187 2.85e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.2939652)
##
## Null deviance: 3898.8 on 12843 degrees of freedom
## Residual deviance: 3772.7 on 12834 degrees of freedom
## AIC: 20737
##
## Number of Fisher Scoring iterations: 2
I can extract the coefficients using the coef function
coef(fit1)
## (Intercept) usg_pct draft_round1
## 0.9195848 1.7726666 -1.0442498
## draft_round2 draft_round3 draft_round4
## -1.0248398 -1.0563056 -1.0425045
## draft_round6 draft_round7 draft_round8
## -1.0417544 -1.0351577 -0.9601734
## draft_roundUndrafted
## -0.9279575
Here I plot the data, and the regressions line
#plot no interaction
ggplot(nba_data, aes(x = usg_pct, y = points_per_game, colour = draft_round)) +
geom_point() +
geom_abline(intercept = coef(fit1)[1], slope = coef(fit1)[3]) +
geom_abline(intercept = coef(fit1)[1] + coef(fit1)[2], slope = coef(fit1)[3]) +
labs(title = "Model without Interaction",
x = "Usage Percentage",
y = "Points per Game",
colour = "Draft Round")
This model assumes that the relationship between usage percentage and points per game is equal across all of the draft rounds.
Model with interaction
fit2 <- glm(points_per_game ~ usg_pct * draft_round,
family = "gaussian",
data = nba_data)
summary(fit2)
##
## Call:
## glm(formula = points_per_game ~ usg_pct * draft_round, family = "gaussian",
## data = nba_data)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.4579 0.9155 0.500 0.617
## usg_pct 4.2684 4.8017 0.889 0.374
## draft_round1 -0.5686 0.9158 -0.621 0.535
## draft_round2 -0.4777 0.9161 -0.521 0.602
## draft_round3 -0.3348 1.1253 -0.297 0.766
## draft_round4 -0.5997 0.9898 -0.606 0.545
## draft_round6 -0.6018 1.1381 -0.529 0.597
## draft_round7 -0.6220 1.9273 -0.323 0.747
## draft_round8 -0.4329 1.5430 -0.281 0.779
## draft_roundUndrafted -0.5889 0.9162 -0.643 0.520
## usg_pct:draft_round1 -2.5673 4.8032 -0.534 0.593
## usg_pct:draft_round2 -2.9871 4.8057 -0.622 0.534
## usg_pct:draft_round3 -4.1529 6.3146 -0.658 0.511
## usg_pct:draft_round4 -2.3610 5.3834 -0.439 0.661
## usg_pct:draft_round6 -2.3626 6.1631 -0.383 0.701
## usg_pct:draft_round7 -2.1397 13.2101 -0.162 0.871
## usg_pct:draft_round8 -4.2684 32.2927 -0.132 0.895
## usg_pct:draft_roundUndrafted -1.7575 4.8061 -0.366 0.715
##
## (Dispersion parameter for gaussian family taken to be 0.2936912)
##
## Null deviance: 3898.8 on 12843 degrees of freedom
## Residual deviance: 3766.9 on 12826 degrees of freedom
## AIC: 20733
##
## Number of Fisher Scoring iterations: 2
I can extract the coefficeients using the coef function
coef(fit2)
## (Intercept) usg_pct
## 0.4578698 4.2684232
## draft_round1 draft_round2
## -0.5685608 -0.4776565
## draft_round3 draft_round4
## -0.3347566 -0.5997344
## draft_round6 draft_round7
## -0.6017926 -0.6220107
## draft_round8 draft_roundUndrafted
## -0.4328698 -0.5888905
## usg_pct:draft_round1 usg_pct:draft_round2
## -2.5672860 -2.9871440
## usg_pct:draft_round3 usg_pct:draft_round4
## -4.1528610 -2.3609980
## usg_pct:draft_round6 usg_pct:draft_round7
## -2.3626282 -2.1396859
## usg_pct:draft_round8 usg_pct:draft_roundUndrafted
## -4.2684232 -1.7575239
#plot with interaction
ggplot(nba_data, aes(x=usg_pct, y=points_per_game, color=draft_round)) +
geom_point() +
geom_abline(intercept = coef(fit2)[1], slope = coef(fit2)[3]) +
geom_abline(intercept = coef(fit2)[1]+ coef(fit2)[2], slope = coef(fit2)[3] + coef(fit2)[4]) +
labs(title = "Model with Interaction",
x = "Usage Percentage",
y = "Points per Game",
colour = "Draft Round")
The fit of the second model is slightly better, as it’s AIC is slightly lower than the first model.
Comparing the models
AIC(fit1, fit2)
## df AIC
## fit1 11 20736.86
## fit2 19 20732.87
The AIC is used to compare the models, if the AIC is lower this indicates a better fit while penalising the complexity of the model.
Role of the iteration: In the model without interaction, the coefficient for assists represents the average effect of the usage percentage on points per game across all of the draft years. In the interaction model, the coefficient for usage percentage varies depending on the draft round. The interaction term indicates how the relationship between usage and points per game vary across different groups. A significant interaction term would suggest that the effect of usage percentage is not constant and depends on the draft round.
Comparison and evaluation of the models: The comparison between the two models shows whether including an interaction improves model fit. If the interaction model has a lower AIC, this indicates that the relationship between usage percentage and points per game differs across draft rounds. The model with interaction has a lower AIC, meaning that the relationship differes across draft rounds. The interaction term is important because it allows the effect of one predictor to depend on another, providing a more realistic representation of the data. Without having the interaction, the model assumes a constant relationship between all the groups, which may oversimplify the underlying patterns.
Creating a binary outcome variable
nba_data <- nba_data %>%
mutate(high_scorer = ifelse(points_per_game > 10, 1, 0),
high_scorer = as.factor(high_scorer))
We create a variable indicating whether a player is a high scorer
Fitting the logistic regression model.
logit_model <- glm(high_scorer ~ ast + total_reb_pct + height_m,
family = "binomial",
data = nba_data)
summary(logit_model)
##
## Call:
## glm(formula = high_scorer ~ ast + total_reb_pct + height_m, family = "binomial",
## data = nba_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -17.3409 11.2453 -1.542 0.1231
## ast 0.3192 0.1576 2.025 0.0428 *
## total_reb_pct -13.5196 7.0997 -1.904 0.0569 .
## height_m 5.6198 5.7935 0.970 0.3320
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 119.20 on 12843 degrees of freedom
## Residual deviance: 111.73 on 12840 degrees of freedom
## AIC: 119.73
##
## Number of Fisher Scoring iterations: 11
The logistic regression model is used when the response variable is binary. In the model, the probability represents the likelihood that the player is a high scorer
Evaluating the model
AIC(logit_model)
## [1] 119.7264
The AIC is used to compare the models, if the AIC is lower this indicates a better fit.
Using the predict() function to interpret results
nba_data$predicted_prob <- predict(logit_model, type = "response")
head(nba_data[, c("points_per_game", "ast", "total_reb_pct", "height_m", "predicted_prob")])
## points_per_game ast total_reb_pct height_m predicted_prob
## 1 0.06093750 2.4 0.113 1.9304 7.068322e-04
## 2 0.95000000 0.3 0.141 1.9050 2.147843e-04
## 3 0.20243902 1.9 0.291 2.0320 9.618633e-05
## 4 0.15937500 1.7 0.138 2.0320 7.135971e-04
## 5 0.05384615 0.3 0.271 2.1336 1.338694e-04
## 6 0.13250000 2.2 0.095 1.9812 1.124811e-03
The predict() function converts log-odds into probabilities between 0 and 1, making it easier to interpret. I also selected the columns to compare the player characteristics with the predicted probability from the logistic model.
Plotting the predicted probabilities
nba_data$predicted_prob <- predict(logit_model, newdata = nba_data, type = "response")
ggplot(nba_data, aes(x = ast, y = predicted_prob)) +
geom_point(alpha = 0.5) +
geom_smooth() +
labs(title = "Predicted Probability of Being a High Scorer",
x = "Assists",
y = "Predicted Probability"
) +
theme_bw()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
The graph shows that there is a high density of assists below 6. The blue upsloping trend line shows that at low assists values the probability is close to 0, however, the probability sharply increases after around 7-8 assists.
Using marginal effects
library(marginaleffects)
avg_slopes(logit_model)
##
## Term Estimate Std. Error z Pr(>|z|) S 2.5 % 97.5 %
## ast 0.000174 0.000108 1.612 0.107 3.2 -3.75e-05 0.000385
## height_m 0.003058 0.003355 0.911 0.362 1.5 -3.52e-03 0.009635
## total_reb_pct -0.007357 0.004752 -1.548 0.122 3.0 -1.67e-02 0.001958
##
## Type: response
## Comparison: dY/dX
The marginaleffects package provides predicted probabilities and marginal effects directly, making it easier to interpret how changes in predictors affect the probability of the outcome.
Interpretation: The logistic regression model shows how player characteristics such as assists, rebounding, and height influence the probability of being a high scorer. Using predicted probabilities simplifies interpretation compared to using log-odds, allowing us to understand the model in terms of probabilities. The model demonstrates how the generalised linear models can be used for continuous outcomes, as well as categorical outcomes.
Choicing model- Response Variable= points, Predictor= assist, Random effect= team abbreviation
Fitting Model 1
library(tidyverse)
library(lme4)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Attaching package: 'lme4'
## The following object is masked from 'package:marginaleffects':
##
## refit
library(lmerTest)
##
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
##
## lmer
## The following object is masked from 'package:stats':
##
## step
nba_clean <- nba_data %>%
select(pts, ast, team_abbreviation) %>%
drop_na() %>%
mutate(team_abbreviation = as.factor(team_abbreviation))
mmod1 <- lmer(pts ~ ast + (1 | team_abbreviation),
data = nba_data)
## boundary (singular) fit: see help('isSingular')
summary(mmod1)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: pts ~ ast + (1 | team_abbreviation)
## Data: nba_data
##
## REML criterion at convergence: 75079.2
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.3504 -0.6561 -0.1940 0.5230 4.7250
##
## Random effects:
## Groups Name Variance Std.Dev.
## team_abbreviation (Intercept) 0.00 0.000
## Residual 20.23 4.497
## Number of obs: 12844, groups: team_abbreviation, 36
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 4.163e+00 5.649e-02 1.284e+04 73.69 <2e-16 ***
## ast 2.219e+00 2.204e-02 1.284e+04 100.72 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## ast -0.712
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')
This model assumes that the relationship between assists and points per game is the same across all teams, but allows each team to have its own baseline scoring level through a random intercept.
Fitting Model 2
mmod2 <- lmer(pts ~ ast + total_reb_pct + (1 | team_abbreviation),
data = nba_data)
## boundary (singular) fit: see help('isSingular')
summary(mmod2)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: pts ~ ast + total_reb_pct + (1 | team_abbreviation)
## Data: nba_data
##
## REML criterion at convergence: 74352.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.7382 -0.6354 -0.1891 0.5157 4.8661
##
## Random effects:
## Groups Name Variance Std.Dev.
## team_abbreviation (Intercept) 0.00 0.000
## Residual 19.11 4.372
## Number of obs: 12844, groups: team_abbreviation, 36
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 1.561e+00 1.099e-01 1.284e+04 14.21 <2e-16 ***
## ast 2.393e+00 2.234e-02 1.284e+04 107.11 <2e-16 ***
## total_reb_pct 1.173e+01 4.291e-01 1.284e+04 27.35 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) ast
## ast -0.587
## totl_rb_pct -0.866 0.284
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')
This model now includes both assists and total rebound percentage as predictors, while still accounting for team-level variation using a random effect. This is a mixed model, combining fixed effects and random effects
Comparing the models
anova(mmod1, mmod2)
## refitting model(s) with ML (instead of REML)
## Data: nba_data
## Models:
## mmod1: pts ~ ast + (1 | team_abbreviation)
## mmod2: pts ~ ast + total_reb_pct + (1 | team_abbreviation)
## npar AIC BIC logLik -2*log(L) Chisq Df Pr(>Chisq)
## mmod1 4 75077 75107 -37534 75069
## mmod2 5 74352 74389 -37171 74342 727.01 1 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
I compare the two models by determining whether including the total rebound percentage improves the model significantly.
Evaluating the models
AIC(mmod1, mmod2)
## df AIC
## mmod1 4 75087.16
## mmod2 5 74362.12
The AIC is used to compare mixed models, with lower values indicating a better model, showing that model 2 is a better fit.
Using Homoscedasticity
plot(nba_clean$ast, mmod2@resp$wtres, xlim = c(0,15))
Normality of Residuals
qqnorm(mmod2@resp$wtres)
qqline(mmod2@resp$wtres)
Interpreting the results: In Model 1, the coefficient for assists represents the average increase in points associated with an additional assist, with the team-level variation held constant. In Model 2, both assists and rebound percentage contribute to predicting the scoring performance. The random effect captures variation between the teams. The standard deviation of the random intercept indicates how much theb average scoring differs across all of the teams. A larger value suggests that team context plays an important role in player performance.
Interpreting the random effects
ranef(mmod1)$team_abbreviation
## (Intercept)
## ATL 0
## BKN 0
## BOS 0
## CHA 0
## CHH 0
## CHI 0
## CLE 0
## DAL 0
## DEN 0
## DET 0
## GSW 0
## HOU 0
## IND 0
## LAC 0
## LAL 0
## MEM 0
## MIA 0
## MIL 0
## MIN 0
## NJN 0
## NOH 0
## NOK 0
## NOP 0
## NYK 0
## OKC 0
## ORL 0
## PHI 0
## PHX 0
## POR 0
## SAC 0
## SAS 0
## SEA 0
## TOR 0
## UTA 0
## VAN 0
## WAS 0
This shows that the model estimates almost no team variation after accounting for assists.
Comparison of the two models: Comparing the two models shows whether including the additional predictors improves the model’s performance. If Model 2 has a lower AIC, this indicates that rebound percentage provides additional power of explanation beyond assists alone. The inclusion of a random effect allows the model to account for unobserved differences between the teams, which reduces the bias and improves predictions. This demonstrates the advantage of mixed models, as they allow us to model both overall trends (fixed effects) and group-level variation (random effects) simultaneously.
Selecting Variables: points per game, assists, total rebound percentage and height
Preparing and scaling the data
library(dplyr)
library(cluster)
cluster_data <- nba_data %>%
select(points_per_game, ast, total_reb_pct, player_height) %>%
scale()
cluster_scaled <- scale(cluster_data)
Method 1- Hierarchical clustering :
dist_matrix <- dist(cluster_data, method = "euclidean")
#Create Hierarchical clustering solution
hc_model <- hclust(dist_matrix, method = "complete")
#Plot dendrogram
plot(hc_model, main = "Hierarchical Clustering Dendrogram")
# Cut into clusters (e.g. 3 clusters)
hc_clusters <- cutree(hc_model, k = 3)
Hierarchical clustering builds clusters by merging the observations based on the distance. It does not require specifying the number of clusters in advance; instead, the number of clusters is chosen by cutting the dendrogram.
Method 2- K-means clustering
set.seed(123)
kmeans_model <- kmeans(cluster_data, centers = 3)
kmeans_clusters <- kmeans_model$cluster
K-means clustering is a method that assigns observations to clusters based on their distance to cluster centres. The algorithm iteratively updates the cluster centres to minimise the within-cluster variation .
Comparing cluster solutions using the silhouette score
library(cluster)
sil_hc <- silhouette(hc_clusters, dist_matrix)
sil_km <- silhouette(kmeans_clusters, dist_matrix)
mean(sil_hc[, 3])
## [1] 0.7951831
mean(sil_km[, 3])
## [1] 0.3705343
The silhouette score measures how well each of the observations fit within its assigned cluster compared to the other clusters. Having higher values indicate a better-defined clusters . The method with the higher average silhouette value provides a better clustering structure.
Plotting silhouette for hierarchical clustering
hc <- hclust(dist_matrix, method = "ward.D2")
hc_clusters <- cutree(hc, k = 3)
table(hc_clusters)
## hc_clusters
## 1 2 3
## 3347 5717 3780
sil_hc <- silhouette(hc_clusters, dist_matrix)
plot(
sil_hc,
main = "Silhouette Plot of Hierarchical Clustering",
col = 1:3,
border = NA
)
abline(v = mean(sil_hc[, 3]), lty = 2)
Plotting Silhouette for k-means clustering
plot(
sil_km,
main = "Silhouette Plot of K-means Clustering",
col = 1:3,
border = NA
)
abline(v = mean(sil_km[,3]), lty =2)
Plot K-means Cluster for assists and points per game
library(ggplot2)
cluster_df <- as.data.frame(cluster_data)
cluster_df$hc_cluster <- as.factor(hc_clusters)
cluster_df$km_cluster <- as.factor(kmeans_clusters)
# K-means clusters
ggplot(cluster_df, aes(x = ast, y = points_per_game, colour = km_cluster)) +
geom_point() +
labs(title = "K-means Clustering",
x= "Assists",
y= "Points per Game",
colour= "K-means Cluster") +
theme_bw()
Interpreting and contrasting the clusters:
I applied two clustering methods to the same variables, which were hierarchial clusting and k-means clustering. The hierarchial clustering method creates clusters by linking the observations based on the distances and it produced a dendrogram that shows the structure of the data. However, k-means clustering separates the data into a fixed number of clusters by minimising the within-cluster variance. To evaluate the cluster quality, I used silhouette scores and I concluded that k-means clustering produced more distinct and well-separated clusters, since it had a higher silhouette score. While both methods had similar grouping, the differences in the cluster assignment show that the results depend on which algorithm is used.