Coursework Large Data

Loading Packages

Loading the relevant packages

library(tidyverse)
library(ggplot2)

Loading data

nba_data <- read.csv("all_seasons.csv")
glimpse(nba_data)

## Rows: 12,844
## Columns: 22
## $ X                 <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ player_name       <chr> "Randy Livingston", "Gaylon Nickerson", "George Lync…
## $ team_abbreviation <chr> "HOU", "WAS", "VAN", "LAL", "DEN", "ORL", "WAS", "CH…
## $ age               <dbl> 22, 28, 26, 30, 23, 33, 26, 30, 24, 24, 22, 31, 29, …
## $ player_height     <dbl> 193.04, 190.50, 203.20, 203.20, 213.36, 198.12, 231.…
## $ player_weight     <dbl> 94.80073, 86.18248, 103.41898, 102.05820, 119.74829,…
## $ college           <chr> "Louisiana State", "Northwestern Oklahoma", "North C…
## $ country           <chr> "USA", "USA", "USA", "USA", "USA", "USA", "USA", "US…
## $ draft_year        <chr> "1996", "1994", "1993", "1989", "1995", "1985", "199…
## $ draft_round       <chr> "2", "2", "1", "1", "1", "2", "2", "1", "1", "1", "1…
## $ draft_number      <chr> "42", "34", "12", "7", "22", "47", "30", "4", "1", "…
## $ gp                <int> 64, 4, 41, 64, 52, 80, 73, 79, 80, 80, 82, 65, 65, 4…
## $ pts               <dbl> 3.9, 3.8, 8.3, 10.2, 2.8, 10.6, 10.6, 26.8, 21.1, 21…
## $ reb               <dbl> 1.5, 1.3, 6.4, 2.8, 1.7, 2.2, 6.6, 4.0, 6.3, 9.0, 5.…
## $ ast               <dbl> 2.4, 0.3, 1.9, 1.7, 0.3, 2.2, 0.4, 2.0, 3.1, 7.3, 1.…
## $ net_rating        <dbl> 0.3, 8.9, -8.2, -2.7, -14.1, -5.8, 6.9, 3.2, -2.9, 6…
## $ oreb_pct          <dbl> 0.042, 0.030, 0.106, 0.027, 0.102, 0.031, 0.098, 0.0…
## $ dreb_pct          <dbl> 0.071, 0.111, 0.185, 0.111, 0.169, 0.064, 0.217, 0.0…
## $ usg_pct           <dbl> 0.169, 0.174, 0.175, 0.206, 0.195, 0.203, 0.185, 0.2…
## $ ts_pct            <dbl> 0.487, 0.497, 0.512, 0.527, 0.500, 0.503, 0.618, 0.6…
## $ ast_pct           <dbl> 0.248, 0.043, 0.125, 0.125, 0.064, 0.143, 0.024, 0.0…
## $ season            <chr> "1996-97", "1996-97", "1996-97", "1996-97", "1996-97…

Cleaning data

# Remove all rows that have missing data, if there are any
nba_data <- nba_data %>%
  drop_na()

# Replace missing values with 0
nba_data[is.na(nba_data)] <- 0

# Cleaning the data
nba_data <- nba_data %>%
  mutate(
    age = as.numeric(age),
    player_height = as.numeric(player_height),
    player_weight = as.numeric(player_weight),
    pts = as.numeric(pts),
    reb = as.numeric(reb),
    ast = as.numeric(ast),
    net_rating = as.numeric(net_rating),
    oreb_pct = as.numeric(oreb_pct),
    dreb_pct = as.numeric(dreb_pct),
    usg_pct = as.numeric(usg_pct),
    ts_pct = as.numeric(ts_pct),
    ast_pct = as.numeric(ast_pct),
    season = as.factor(season)
  )

Question 1

Creating variables

#total rebound percentage
nba_data <- nba_data %>%
  mutate(total_reb_pct = oreb_pct + dreb_pct)

#points per game
nba_data <- nba_data %>%
  mutate(points_per_game = pts / gp)
  

#height in metres 
nba_data <- nba_data %>%
  mutate(height_m = player_height / 100)

Numerical summaries

summary(nba_data %>%
  select(player_height, player_weight, points_per_game, ast, total_reb_pct))

##  player_height   player_weight    points_per_game         ast        
##  Min.   :160.0   Min.   : 60.33   Min.   : 0.00000   Min.   : 0.000  
##  1st Qu.:193.0   1st Qu.: 90.72   1st Qu.: 0.09074   1st Qu.: 0.600  
##  Median :200.7   Median : 99.79   Median : 0.13954   Median : 1.200  
##  Mean   :200.6   Mean   :100.26   Mean   : 0.22954   Mean   : 1.825  
##  3rd Qu.:208.3   3rd Qu.:108.86   3rd Qu.: 0.22564   3rd Qu.: 2.400  
##  Max.   :231.1   Max.   :163.29   Max.   :20.00000   Max.   :11.700  
##  total_reb_pct   
##  Min.   :0.0000  
##  1st Qu.:0.1210  
##  Median :0.1740  
##  Mean   :0.1947  
##  3rd Qu.:0.2600  
##  Max.   :1.4000

nba_data %>%
  summarise(
    height_mean = mean(player_height),
    height_sd   = sd(player_height),
    
    weight_mean = mean(player_weight),
    weight_sd   = sd(player_weight),
    
    ppg_mean = mean(points_per_game),
    ppg_sd   = sd(points_per_game),
    
    ast_mean = mean(ast),
    ast_sd   = sd(ast),
    
    reb_mean = mean(total_reb_pct),
    reb_sd   = sd(total_reb_pct)
  )

##   height_mean height_sd weight_mean weight_sd  ppg_mean    ppg_sd ast_mean
## 1    200.5551   9.11109    100.2633  12.42663 0.2295414 0.5509737 1.824681
##    ast_sd  reb_mean     reb_sd
## 1 1.80084 0.1947191 0.09378032

Histogram for the distribution of points per game

  ggplot(nba_data, aes(x = points_per_game)) +
  geom_histogram(binwidth = 2) +
  labs(title = "Distribution of Points per Game",
       x= "Points per Game",
       y= "Count"
       ) + 
  theme_bw()

I use a histogram to examine the distribution shape and detect any potential outliers. The histogram shows a skewed distribution, where most players score arelatively few number of points per game, while a small number of elite players score significantly more.

Density plot for assists

 ggplot(nba_data, aes(x = ast)) +
  geom_density() +
  labs(title = "Density of Assists",
       x= "Assists",
       y= "Density"
       ) + 
  theme_bw()

I use a density plot to better visualise the distribution shape compared to a histogram. The density plot shows that most player have a low number of assists, with only a few players have a high number of assists. This interpretation could be reflected by differences in the player roles and positions.

Scatter plot for height v weight

 ggplot(nba_data, aes(x = player_height, y = player_weight)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(title = "Height vs Weight",
       x= "Player Height",
       y= "Player Weight"
       )+ 
  theme_bw()

## `geom_smooth()` using formula = 'y ~ x'

I use a scatter plot to explore the relationship between two continuous variables and add a regression line to identify trends. The scatter plot shows a positive correlation between player height and player weight. The fitted regression line is used to help highlight the correlation in the data.

Barplot for total rebound percentage

ggplot(nba_data, aes(x = total_reb_pct)) +
  geom_bar(fill = "steelblue", colour = "black") +
  labs(title = "Distribution of Total Rebound Percentage",
       x = "Total Rebound Percentage",
       y = "Count"
       ) +
  theme_bw()

I use a bar plot to show how frequently each value of Total Rebound Percentage occurs in the dataset. This is useful for visualising the distribution when values are discrete or grouped. The barplot is mainly concentrated between 0 to 0.5, with most players having a similar rebound percentage, but standout players can easily be seen beyond the concentrated area above 0.5.

Scatter plot for points v assists

 ggplot(nba_data, aes(x = ast, y = points_per_game)) +
  geom_point() +
  labs(title = "Points per Game vs Assists",
       x= "Assists",
       y= "Points per Game"
       ) +
  theme_bw()

This scatterplot is fairly spread out, with some players having a high number of assists and some scoring low. This plot highlights the different player roles within a team, with potentailly the playmakers scoring high and the defensive players having a low score.

These graphical and numerical summaries help us visualise the key patterns in the dataset. Variables such as points and assists are highly variable and skewed, while other attributes are more normally distributed, such as the relationship between height and weight.

Question 2a

Scatterplot of 2 variables

ggplot(nba_data, aes(x = ast, y = points_per_game)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", se = TRUE) +
  facet_wrap(~ season) +
  labs(
    title = "Relationship Between Assists and Points per Game by Season",
    x = "Assists per Game",
    y = "Points per Game"
  ) +
  theme_bw()

## `geom_smooth()` using formula = 'y ~ x'

I created a scatterplot to examine the relationship between two continuous variables. A linear trend line was added to identify any relationship, and faceting was used to display separate panels for each season. This is useful as the association between the two variables may differ across different seasons.

Question 2b

Bar chart of a continuous variable

library(dplyr)

# Creating a summary table
summary_data <- nba_data %>%
  group_by(season) %>%
  summarise(
    mean_pts = mean(points_per_game),
    se_pts = sd(points_per_game) / sqrt(n())
  ) %>%
  arrange(desc(mean_pts))

# Reordering the factor levels
summary_data$season <- factor(summary_data$season,
                              levels = summary_data$season)

# Plotting the bar chart
ggplot(summary_data, aes(x = season, y = mean_pts)) +
  geom_bar(stat = "identity") +
  geom_errorbar(aes(ymin = mean_pts - se_pts,
                    ymax = mean_pts + se_pts),
                width = 0.2) +
  labs(title = "Average Points per Game by Season",
       x = "Season",
       y = "Mean Points per Game") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

The bar chart shows the highest to lowest mean points per game, and each bar is represented by a season. This plot allows comparison between mean performances across seasons and the ordering makes it easier to identify which seasons had the highest average points per game. The error bars show the variability and helps assess whether the differences between seasons are meaningful rather than due to a random variation.

Question 2c

Boxplot showing the distribution of a continuous variable

ggplot(nba_data, aes(x = season, y = points_per_game, colour = as.factor(draft_year))) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.5) +
  labs(title = "Distribution of Points per Game by Season",
       x = "Season",
       y = "Points per Game",
       colour = "Draft Year") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

The boxplot helops summarise the distribution of points per game for each season. I used the jittered raw data points to visualise the individual observations and colour coded them by Draft Year to introduce a second categorical variable. The boxplots summarises the median, quartiles and the spread of the data, while the jittered points show the full distribution of the data.

Question 3

Choosing variables- Response variable: points per game Continuous predictor: usage percentage Categorical predictor: draft round

Model without interaction

fit1 <- glm(points_per_game ~ usg_pct + draft_round,
            family = "gaussian",
            data = nba_data)
summary(fit1)

## 
## Call:
## glm(formula = points_per_game ~ usg_pct + draft_round, family = "gaussian", 
##     data = nba_data)
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           0.91958    0.22200   4.142 3.46e-05 ***
## usg_pct               1.77267    0.09203  19.261  < 2e-16 ***
## draft_round1         -1.04425    0.22144  -4.716 2.43e-06 ***
## draft_round2         -1.02484    0.22157  -4.625 3.78e-06 ***
## draft_round3         -1.05631    0.25239  -4.185 2.87e-05 ***
## draft_round4         -1.04250    0.27112  -3.845 0.000121 ***
## draft_round6         -1.04175    0.32832  -3.173 0.001512 ** 
## draft_round7         -1.03516    0.32834  -3.153 0.001621 ** 
## draft_round8         -0.96017    0.44290  -2.168 0.030184 *  
## draft_roundUndrafted -0.92796    0.22163  -4.187 2.85e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.2939652)
## 
##     Null deviance: 3898.8  on 12843  degrees of freedom
## Residual deviance: 3772.7  on 12834  degrees of freedom
## AIC: 20737
## 
## Number of Fisher Scoring iterations: 2

I can extract the coefficients using the coef function

coef(fit1)

##          (Intercept)              usg_pct         draft_round1 
##            0.9195848            1.7726666           -1.0442498 
##         draft_round2         draft_round3         draft_round4 
##           -1.0248398           -1.0563056           -1.0425045 
##         draft_round6         draft_round7         draft_round8 
##           -1.0417544           -1.0351577           -0.9601734 
## draft_roundUndrafted 
##           -0.9279575

Here I plot the data, and the regressions line

#plot no interaction
ggplot(nba_data, aes(x = usg_pct, y = points_per_game, colour = draft_round)) +
  geom_point() +
  geom_abline(intercept = coef(fit1)[1], slope = coef(fit1)[3]) +
  geom_abline(intercept = coef(fit1)[1] + coef(fit1)[2], slope = coef(fit1)[3]) +
  labs(title = "Model without Interaction",
       x = "Usage Percentage",
       y = "Points per Game",
       colour = "Draft Round")

This model assumes that the relationship between usage percentage and points per game is equal across all of the draft rounds.

Model with interaction

fit2 <- glm(points_per_game ~ usg_pct * draft_round,
            family = "gaussian",
            data = nba_data)

summary(fit2)

## 
## Call:
## glm(formula = points_per_game ~ usg_pct * draft_round, family = "gaussian", 
##     data = nba_data)
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)
## (Intercept)                    0.4579     0.9155   0.500    0.617
## usg_pct                        4.2684     4.8017   0.889    0.374
## draft_round1                  -0.5686     0.9158  -0.621    0.535
## draft_round2                  -0.4777     0.9161  -0.521    0.602
## draft_round3                  -0.3348     1.1253  -0.297    0.766
## draft_round4                  -0.5997     0.9898  -0.606    0.545
## draft_round6                  -0.6018     1.1381  -0.529    0.597
## draft_round7                  -0.6220     1.9273  -0.323    0.747
## draft_round8                  -0.4329     1.5430  -0.281    0.779
## draft_roundUndrafted          -0.5889     0.9162  -0.643    0.520
## usg_pct:draft_round1          -2.5673     4.8032  -0.534    0.593
## usg_pct:draft_round2          -2.9871     4.8057  -0.622    0.534
## usg_pct:draft_round3          -4.1529     6.3146  -0.658    0.511
## usg_pct:draft_round4          -2.3610     5.3834  -0.439    0.661
## usg_pct:draft_round6          -2.3626     6.1631  -0.383    0.701
## usg_pct:draft_round7          -2.1397    13.2101  -0.162    0.871
## usg_pct:draft_round8          -4.2684    32.2927  -0.132    0.895
## usg_pct:draft_roundUndrafted  -1.7575     4.8061  -0.366    0.715
## 
## (Dispersion parameter for gaussian family taken to be 0.2936912)
## 
##     Null deviance: 3898.8  on 12843  degrees of freedom
## Residual deviance: 3766.9  on 12826  degrees of freedom
## AIC: 20733
## 
## Number of Fisher Scoring iterations: 2

I can extract the coefficeients using the coef function

coef(fit2)

##                  (Intercept)                      usg_pct 
##                    0.4578698                    4.2684232 
##                 draft_round1                 draft_round2 
##                   -0.5685608                   -0.4776565 
##                 draft_round3                 draft_round4 
##                   -0.3347566                   -0.5997344 
##                 draft_round6                 draft_round7 
##                   -0.6017926                   -0.6220107 
##                 draft_round8         draft_roundUndrafted 
##                   -0.4328698                   -0.5888905 
##         usg_pct:draft_round1         usg_pct:draft_round2 
##                   -2.5672860                   -2.9871440 
##         usg_pct:draft_round3         usg_pct:draft_round4 
##                   -4.1528610                   -2.3609980 
##         usg_pct:draft_round6         usg_pct:draft_round7 
##                   -2.3626282                   -2.1396859 
##         usg_pct:draft_round8 usg_pct:draft_roundUndrafted 
##                   -4.2684232                   -1.7575239

#plot with interaction
ggplot(nba_data, aes(x=usg_pct, y=points_per_game, color=draft_round)) +
geom_point() +
geom_abline(intercept = coef(fit2)[1], slope = coef(fit2)[3]) +
geom_abline(intercept = coef(fit2)[1]+ coef(fit2)[2], slope = coef(fit2)[3] + coef(fit2)[4]) +
    labs(title = "Model with Interaction",
       x = "Usage Percentage",
       y = "Points per Game",
       colour = "Draft Round")

The fit of the second model is slightly better, as it’s AIC is slightly lower than the first model.

Comparing the models

AIC(fit1, fit2)

##      df      AIC
## fit1 11 20736.86
## fit2 19 20732.87

The AIC is used to compare the models, if the AIC is lower this indicates a better fit while penalising the complexity of the model.

Role of the iteration: In the model without interaction, the coefficient for assists represents the average effect of the usage percentage on points per game across all of the draft years. In the interaction model, the coefficient for usage percentage varies depending on the draft round. The interaction term indicates how the relationship between usage and points per game vary across different groups. A significant interaction term would suggest that the effect of usage percentage is not constant and depends on the draft round.

Comparison and evaluation of the models: The comparison between the two models shows whether including an interaction improves model fit. If the interaction model has a lower AIC, this indicates that the relationship between usage percentage and points per game differs across draft rounds. The model with interaction has a lower AIC, meaning that the relationship differes across draft rounds. The interaction term is important because it allows the effect of one predictor to depend on another, providing a more realistic representation of the data. Without having the interaction, the model assumes a constant relationship between all the groups, which may oversimplify the underlying patterns.

Question 4

Creating a binary outcome variable

nba_data <- nba_data %>%
  mutate(high_scorer = ifelse(points_per_game > 10, 1, 0),
         high_scorer = as.factor(high_scorer))

We create a variable indicating whether a player is a high scorer

Fitting the logistic regression model.

logit_model <- glm(high_scorer ~ ast + total_reb_pct + height_m,
                   family = "binomial",
                   data = nba_data)

summary(logit_model)

## 
## Call:
## glm(formula = high_scorer ~ ast + total_reb_pct + height_m, family = "binomial", 
##     data = nba_data)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)  
## (Intercept)   -17.3409    11.2453  -1.542   0.1231  
## ast             0.3192     0.1576   2.025   0.0428 *
## total_reb_pct -13.5196     7.0997  -1.904   0.0569 .
## height_m        5.6198     5.7935   0.970   0.3320  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 119.20  on 12843  degrees of freedom
## Residual deviance: 111.73  on 12840  degrees of freedom
## AIC: 119.73
## 
## Number of Fisher Scoring iterations: 11

The logistic regression model is used when the response variable is binary. In the model, the probability represents the likelihood that the player is a high scorer

Evaluating the model

AIC(logit_model)

## [1] 119.7264

The AIC is used to compare the models, if the AIC is lower this indicates a better fit.

Using the predict() function to interpret results

nba_data$predicted_prob <- predict(logit_model, type = "response")
head(nba_data[, c("points_per_game", "ast", "total_reb_pct", "height_m", "predicted_prob")])

##   points_per_game ast total_reb_pct height_m predicted_prob
## 1      0.06093750 2.4         0.113   1.9304   7.068322e-04
## 2      0.95000000 0.3         0.141   1.9050   2.147843e-04
## 3      0.20243902 1.9         0.291   2.0320   9.618633e-05
## 4      0.15937500 1.7         0.138   2.0320   7.135971e-04
## 5      0.05384615 0.3         0.271   2.1336   1.338694e-04
## 6      0.13250000 2.2         0.095   1.9812   1.124811e-03

The predict() function converts log-odds into probabilities between 0 and 1, making it easier to interpret. I also selected the columns to compare the player characteristics with the predicted probability from the logistic model.

Plotting the predicted probabilities

nba_data$predicted_prob <- predict(logit_model, newdata = nba_data, type = "response")

ggplot(nba_data, aes(x = ast, y = predicted_prob)) +
  geom_point(alpha = 0.5) +
  geom_smooth() +
  labs(title = "Predicted Probability of Being a High Scorer",
       x = "Assists",
       y = "Predicted Probability"
       ) + 
  theme_bw()

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

The graph shows that there is a high density of assists below 6. The blue upsloping trend line shows that at low assists values the probability is close to 0, however, the probability sharply increases after around 7-8 assists.

Using marginal effects

library(marginaleffects)

avg_slopes(logit_model)

## 
##           Term  Estimate Std. Error      z Pr(>|z|)   S     2.5 %   97.5 %
##  ast            0.000174   0.000108  1.612    0.107 3.2 -3.75e-05 0.000385
##  height_m       0.003058   0.003355  0.911    0.362 1.5 -3.52e-03 0.009635
##  total_reb_pct -0.007357   0.004752 -1.548    0.122 3.0 -1.67e-02 0.001958
## 
## Type: response
## Comparison: dY/dX

The marginaleffects package provides predicted probabilities and marginal effects directly, making it easier to interpret how changes in predictors affect the probability of the outcome.

Interpretation: The logistic regression model shows how player characteristics such as assists, rebounding, and height influence the probability of being a high scorer. Using predicted probabilities simplifies interpretation compared to using log-odds, allowing us to understand the model in terms of probabilities. The model demonstrates how the generalised linear models can be used for continuous outcomes, as well as categorical outcomes.

Question 5

Choicing model- Response Variable= points, Predictor= assist, Random effect= team abbreviation

Fitting Model 1

library(tidyverse)
library(lme4)

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

## 
## Attaching package: 'lme4'

## The following object is masked from 'package:marginaleffects':
## 
##     refit

library(lmerTest)

## 
## Attaching package: 'lmerTest'

## The following object is masked from 'package:lme4':
## 
##     lmer

## The following object is masked from 'package:stats':
## 
##     step

nba_clean <- nba_data %>%
  select(pts, ast, team_abbreviation) %>%
  drop_na() %>%
  mutate(team_abbreviation = as.factor(team_abbreviation))

mmod1 <- lmer(pts ~ ast + (1 | team_abbreviation),
               data = nba_data)

## boundary (singular) fit: see help('isSingular')

summary(mmod1)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: pts ~ ast + (1 | team_abbreviation)
##    Data: nba_data
## 
## REML criterion at convergence: 75079.2
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.3504 -0.6561 -0.1940  0.5230  4.7250 
## 
## Random effects:
##  Groups            Name        Variance Std.Dev.
##  team_abbreviation (Intercept)  0.00    0.000   
##  Residual                      20.23    4.497   
## Number of obs: 12844, groups:  team_abbreviation, 36
## 
## Fixed effects:
##              Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept) 4.163e+00  5.649e-02 1.284e+04   73.69   <2e-16 ***
## ast         2.219e+00  2.204e-02 1.284e+04  100.72   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##     (Intr)
## ast -0.712
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')

This model assumes that the relationship between assists and points per game is the same across all teams, but allows each team to have its own baseline scoring level through a random intercept.

Fitting Model 2

mmod2 <- lmer(pts ~ ast + total_reb_pct + (1 | team_abbreviation),
               data = nba_data)

## boundary (singular) fit: see help('isSingular')

summary(mmod2)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: pts ~ ast + total_reb_pct + (1 | team_abbreviation)
##    Data: nba_data
## 
## REML criterion at convergence: 74352.1
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.7382 -0.6354 -0.1891  0.5157  4.8661 
## 
## Random effects:
##  Groups            Name        Variance Std.Dev.
##  team_abbreviation (Intercept)  0.00    0.000   
##  Residual                      19.11    4.372   
## Number of obs: 12844, groups:  team_abbreviation, 36
## 
## Fixed effects:
##                Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept)   1.561e+00  1.099e-01 1.284e+04   14.21   <2e-16 ***
## ast           2.393e+00  2.234e-02 1.284e+04  107.11   <2e-16 ***
## total_reb_pct 1.173e+01  4.291e-01 1.284e+04   27.35   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) ast   
## ast         -0.587       
## totl_rb_pct -0.866  0.284
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')

This model now includes both assists and total rebound percentage as predictors, while still accounting for team-level variation using a random effect. This is a mixed model, combining fixed effects and random effects

Comparing the models

anova(mmod1, mmod2)

## refitting model(s) with ML (instead of REML)

## Data: nba_data
## Models:
## mmod1: pts ~ ast + (1 | team_abbreviation)
## mmod2: pts ~ ast + total_reb_pct + (1 | team_abbreviation)
##       npar   AIC   BIC logLik -2*log(L)  Chisq Df Pr(>Chisq)    
## mmod1    4 75077 75107 -37534     75069                         
## mmod2    5 74352 74389 -37171     74342 727.01  1  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

I compare the two models by determining whether including the total rebound percentage improves the model significantly.

Evaluating the models

AIC(mmod1, mmod2)

##       df      AIC
## mmod1  4 75087.16
## mmod2  5 74362.12

The AIC is used to compare mixed models, with lower values indicating a better model, showing that model 2 is a better fit.

Using Homoscedasticity

plot(nba_clean$ast, mmod2@resp$wtres, xlim = c(0,15))

Normality of Residuals

qqnorm(mmod2@resp$wtres)
qqline(mmod2@resp$wtres)

Interpreting the results: In Model 1, the coefficient for assists represents the average increase in points associated with an additional assist, with the team-level variation held constant. In Model 2, both assists and rebound percentage contribute to predicting the scoring performance. The random effect captures variation between the teams. The standard deviation of the random intercept indicates how much theb average scoring differs across all of the teams. A larger value suggests that team context plays an important role in player performance.

Interpreting the random effects

ranef(mmod1)$team_abbreviation

##     (Intercept)
## ATL           0
## BKN           0
## BOS           0
## CHA           0
## CHH           0
## CHI           0
## CLE           0
## DAL           0
## DEN           0
## DET           0
## GSW           0
## HOU           0
## IND           0
## LAC           0
## LAL           0
## MEM           0
## MIA           0
## MIL           0
## MIN           0
## NJN           0
## NOH           0
## NOK           0
## NOP           0
## NYK           0
## OKC           0
## ORL           0
## PHI           0
## PHX           0
## POR           0
## SAC           0
## SAS           0
## SEA           0
## TOR           0
## UTA           0
## VAN           0
## WAS           0

This shows that the model estimates almost no team variation after accounting for assists.

Comparison of the two models: Comparing the two models shows whether including the additional predictors improves the model’s performance. If Model 2 has a lower AIC, this indicates that rebound percentage provides additional power of explanation beyond assists alone. The inclusion of a random effect allows the model to account for unobserved differences between the teams, which reduces the bias and improves predictions. This demonstrates the advantage of mixed models, as they allow us to model both overall trends (fixed effects) and group-level variation (random effects) simultaneously.

Question 6

Selecting Variables: points per game, assists, total rebound percentage and height

Preparing and scaling the data

library(dplyr)
library(cluster)

cluster_data <- nba_data %>%
  select(points_per_game, ast, total_reb_pct, player_height) %>%
  scale()

cluster_scaled <- scale(cluster_data)

Method 1- Hierarchical clustering :

dist_matrix <- dist(cluster_data, method = "euclidean")

#Create Hierarchical clustering solution
hc_model <- hclust(dist_matrix, method = "complete")

#Plot dendrogram
plot(hc_model, main = "Hierarchical Clustering Dendrogram")

# Cut into clusters (e.g. 3 clusters)
hc_clusters <- cutree(hc_model, k = 3)

Hierarchical clustering builds clusters by merging the observations based on the distance. It does not require specifying the number of clusters in advance; instead, the number of clusters is chosen by cutting the dendrogram.

Method 2- K-means clustering

set.seed(123)

kmeans_model <- kmeans(cluster_data, centers = 3)

kmeans_clusters <- kmeans_model$cluster

K-means clustering is a method that assigns observations to clusters based on their distance to cluster centres. The algorithm iteratively updates the cluster centres to minimise the within-cluster variation .

Comparing cluster solutions using the silhouette score

library(cluster)

sil_hc <- silhouette(hc_clusters, dist_matrix)
sil_km <- silhouette(kmeans_clusters, dist_matrix)

mean(sil_hc[, 3])

## [1] 0.7951831

mean(sil_km[, 3])

## [1] 0.3705343

The silhouette score measures how well each of the observations fit within its assigned cluster compared to the other clusters. Having higher values indicate a better-defined clusters . The method with the higher average silhouette value provides a better clustering structure.

Plotting silhouette for hierarchical clustering

hc <- hclust(dist_matrix, method = "ward.D2")
hc_clusters <- cutree(hc, k = 3)

table(hc_clusters)

## hc_clusters
##    1    2    3 
## 3347 5717 3780

sil_hc <- silhouette(hc_clusters, dist_matrix)

plot(
  sil_hc,
  main = "Silhouette Plot of Hierarchical Clustering",
  col = 1:3,
  border = NA
)

abline(v = mean(sil_hc[, 3]), lty = 2)

Plotting Silhouette for k-means clustering

plot(
  sil_km,
  main = "Silhouette Plot of K-means Clustering",
  col = 1:3,
  border = NA
)

abline(v = mean(sil_km[,3]), lty =2)

Plot K-means Cluster for assists and points per game

library(ggplot2)

cluster_df <- as.data.frame(cluster_data)

cluster_df$hc_cluster <- as.factor(hc_clusters)
cluster_df$km_cluster <- as.factor(kmeans_clusters)


# K-means clusters
ggplot(cluster_df, aes(x = ast, y = points_per_game, colour = km_cluster)) +
  geom_point() +
  labs(title = "K-means Clustering",
       x= "Assists",
       y= "Points per Game",
       colour= "K-means Cluster") +
  theme_bw()

Interpreting and contrasting the clusters:

I applied two clustering methods to the same variables, which were hierarchial clusting and k-means clustering. The hierarchial clustering method creates clusters by linking the observations based on the distances and it produced a dendrogram that shows the structure of the data. However, k-means clustering separates the data into a fixed number of clusters by minimising the within-cluster variance. To evaluate the cluster quality, I used silhouette scores and I concluded that k-means clustering produced more distinct and well-separated clusters, since it had a higher silhouette score. While both methods had similar grouping, the differences in the cluster assignment show that the results depend on which algorithm is used.