This independent case study builds a Schedule Difficulty Index for the NBA using only public data. The goal is to quantify how schedule structure (back-to-backs, 4-in-6 stretches, rest-day differential, travel/load markers, and home/away context) relates to game difficulty and team performance.
What to expect in this report
- Data: Publicly available schedules and results (no proprietary prompts or datasets).
- Features: Engineered schedule signals (B2B, 4-in-6, rest differential, optional travel load).
- Methods: Reproducible R workflow (tidyverse, tidymodels); cross-validation; baseline comparisons.
- Outputs: A simple Schedule Difficulty Index, risk windows over a season, and example visuals.
- Interpretation: Which schedule factors tend to elevate risk and how results can inform planning (rotations, recovery, practice).
Disclaimer: This is an independent, unaffiliated
case study. It is not endorsed by any NBA team or organization.
No employer prompts, datasets, or proprietary materials are used here.
library(tidyverse)
# Input tables, all read from the working directory.
# Historical schedule; team, season, gamedate, opponent, home and win are
# the columns used further down.
schedule <- read_csv(file = "schedule.csv")
# Draft 2024-25 schedule (filtered to OKC and DEN below).
draft_schedule <- read_csv(file = "schedule_24_partial.csv")
# Loaded here but not referenced by any code visible in this report.
locations <- read_csv(file = "locations.csv")
# Per-game box-score rows keyed by off_team / def_team.
game_data <- read_csv(file = "team_game_data.csv")
QUESTION: 4-in-6 Frequency. Identify every game that is the 4th in the past 6 nights for OKC using a sliding, date-based window. Overlaps are allowed. The result is a season-level count of compressed stretches, plus a list of flagged game dates that can be overlaid on the schedule timeline.
# OKC's draft-schedule games in date order, keeping only the columns
# needed for the density calculation.
okc <- draft_schedule %>%
  filter(team == "OKC") %>%
  arrange(gamedate) %>%
  select(-c(season, opponent, home, win))

# For each game, count the OKC games whose date falls inside the six-night
# window ending on that game's date (the game itself is included).
okc_4in6 <- okc %>%
  mutate(
    games_in_6 = map_int(gamedate, function(game_day) {
      sum(gamedate <= game_day & gamedate >= game_day - days(5))
    }),
    # flag games that are exactly the 4th game inside the window
    is_4in6 = games_in_6 == 4
  )

# number of flagged games, i.e. 4-in-6 stretches (overlaps allowed)
sum(okc_4in6$is_4in6)
## [1] 26
ANSWER 1:
26 4-in-6 stretches in OKC’s draft schedule.
QUESTION: Find the average number of 4-in-6 stretches for a team in a season from 2014-15 to 2023-24. Adjust each team/season to per-82 games before taking final average.
# Keep only the three columns needed, sort by date, and group by
# team/season so that later window counts are computed per team-season.
group_sch <- schedule %>%
  select(season, gamedate, team) %>%
  arrange(gamedate) %>%
  group_by(team, season)
#double check if schedule is now grouped into each team and season and accurately in chronological order
#check_sch <- group_sch %>%
# filter(team == "OKC", season == "2018")
#check_sch <- check_sch %>%
# mutate(games_in_6 = map_int(gamedate, ~ sum(gamedate >= (.x - days(5)) & gamedate <= .x)),
# is_4in6 = (games_in_6 == 4)
# )
#sum(check_sch$is_4in6)
# Per team-season: count games within the trailing six-night window, flag
# the ones that are the 4th game in that window, then scale the count of
# flagged games to an 82-game season.
group_sch <- group_sch %>%
  mutate(
    games_in_6 = map_int(gamedate, function(game_day) {
      sum(gamedate <= game_day & gamedate >= game_day - days(5))
    }),
    is_4in6 = games_in_6 == 4
  ) %>%
  summarise(
    four_in_six = sum(is_4in6),                     # 4-in-6 stretches
    num_play_games = n(),                           # games played
    per82 = four_in_six / num_play_games * 82,      # per-82 adjustment
    .groups = "drop"
  )

# League-wide average: mean of the per-82 values over all team-season rows.
avg_4in6 <- group_sch %>%
  pull(per82) %>%
  mean()
avg_4in6
## [1] 25.09998
ANSWER 2:
25.1 4-in-6 stretches on average.
QUESTION: Out of the 30 NBA teams, find the highest and lowest average number of 4-in-6 stretches between 2014-15 and 2023-24. Adjust each team/season to per-82 games.
# Team-season rows for the first (2014) and last (2023) seasons only.
# NOTE(review): the question asks for each team's average per-82 count over
# all seasons 2014-15 through 2023-24; this keeps just two seasons, so the
# max/min below rank single team-seasons rather than 10-season team
# averages. Confirm intent, then group by team and average per82 instead.
two_seasons <- group_sch %>%
  filter(season %in% c(2014, 2023))
two_seasons
## # A tibble: 60 × 5
## team season four_in_six num_play_games per82
## <chr> <dbl> <int> <int> <dbl>
## 1 ATL 2014 32 82 32
## 2 ATL 2023 25 82 25
## 3 BKN 2014 30 82 30
## 4 BKN 2023 24 82 24
## 5 BOS 2014 37 82 37
## 6 BOS 2023 25 82 25
## 7 CHA 2014 38 82 38
## 8 CHA 2023 28 82 28
## 9 CHI 2014 29 82 29
## 10 CHI 2023 23 82 23
## # ℹ 50 more rows
# row(s) with the largest per82 value
slice_max(two_seasons, per82, n = 1)
## # A tibble: 1 × 5
## team season four_in_six num_play_games per82
## <chr> <dbl> <int> <int> <dbl>
## 1 CHA 2014 38 82 38
# row(s) with the smallest per82 value
slice_min(two_seasons, per82, n = 1)
## # A tibble: 1 × 5
## team season four_in_six num_play_games per82
## <chr> <dbl> <int> <int> <dbl>
## 1 DEN 2023 15 82 15
ANSWER 3:
QUESTION: Based on 2023-2024 season, find BKN’s defensive eFG% and their defensive eFG% that season in situations where their opponent was on the second night of back-to-back.
# Shooting lines of every opponent offense that faced BKN in season 2023.
bkn_2023 <- game_data %>%
  filter(season == 2023, def_team == "BKN") %>%
  arrange(gamedate) %>%
  select(fgmade, fgattempted, fg3made)

# Season totals allowed by BKN and the resulting defensive eFG%:
# eFG% = (FGM + 0.5 * 3PM) / FGA
def_eFG <- bkn_2023 %>%
  summarise(
    fgm = sum(fgmade),
    fg3m = sum(fg3made),
    fga = sum(fgattempted),
    def_eFG = (fgm + 0.5 * fg3m) / fga
  )
def_eFG
## # A tibble: 1 × 4
## fgm fg3m fga def_eFG
## <dbl> <dbl> <dbl> <dbl>
## 1 3410 1066 7255 0.543
# Season-2023 game rows, reduced to date, teams and shooting columns,
# sorted by date and grouped by the offensive team.
nba_sch <- game_data %>%
  filter(season == 2023) %>%
  select(gamedate, off_team, def_team, fgmade, fgattempted, fg3made) %>%
  arrange(gamedate) %>%
  group_by(off_team)
#double check if schedule is now grouped into each team and season and accurately in chronological order
#check_sch <- nba_sch %>%
# filter(off_team == "OKC")
#check_sch <- check_sch %>%
# mutate(games_b2b = map_int(gamedate, ~ sum(gamedate >= (.x - days(1)) & gamedate <= .x)),
# is_b2b = (games_b2b == 2)
# )
#sum(check_sch$is_b2b)
# Within each offensive team's games, count games played on the same day
# or the day before; a count of 2 marks the second night of a back-to-back.
b2b <- nba_sch %>%
  mutate(
    games_b2b = map_int(gamedate, function(game_day) {
      sum(gamedate <= game_day & gamedate >= game_day - days(1))
    }),
    is_b2b = games_b2b == 2
  )
# BKN's defensive eFG% restricted to games in which the opponent (the
# offensive team) was on the second night of a back-to-back.
opp_b2b <- b2b %>%
  ungroup() %>%
  # is_b2b is already logical: filter on it directly instead of comparing
  # it with the string "TRUE", which only worked through implicit coercion.
  filter(is_b2b, def_team == "BKN") %>%
  summarise(
    fgm = sum(fgmade),        # opponent field goals made
    fg3m = sum(fg3made),      # opponent three-pointers made
    fga = sum(fgattempted)    # opponent field goals attempted
  ) %>%
  mutate(def_eFG = (fgm + 0.5 * fg3m) / fga) # eFG% = (FGM + 0.5 * 3PM) / FGA
opp_b2b
## # A tibble: 1 × 4
## fgm fg3m fga def_eFG
## <dbl> <dbl> <dbl> <dbl>
## 1 650 217 1418 0.535
ANSWER 4:
QUESTION: Identify 2 trends in scheduling over time. How are the more recent schedules different from the schedules of the past? Include a visual (plot or styled table) highlighting or explaining each trend and include a brief written description of your findings.
ANSWER 5:
First Visualization
# Rebuild the team/season-grouped schedule from the raw table.
group_sch <- schedule %>%
  select(season, gamedate, team) %>%
  arrange(gamedate) %>%
  group_by(team, season)

# Per team-season: number of games that are the second night of a
# back-to-back (two games within a two-day window) and games played.
group_sch <- group_sch %>%
  mutate(
    games_b2b = map_int(gamedate, function(game_day) {
      sum(gamedate <= game_day & gamedate >= game_day - days(1))
    }),
    is_b2b = games_b2b == 2
  ) %>%
  summarise(
    num_b2b = sum(is_b2b),
    num_play_games = n(),
    .groups = "drop"
  )
#checking for league avg of b2b games for 2018 and 2020 season
#check_sn <- season %>%
# filter(season == "2018") %>%
# summarise(
# per82 = (num_b2b / num_play_games) * 82,
# .groups = "drop"
# )
#avg_2018 <- mean(check_sn$per82)
#check_sn <- season %>%
# filter(season == "2020") %>%
# summarise(
# per82 = (num_b2b / num_play_games) * 82,
# .groups = "drop"
# )
#avg_2020 <- mean(check_sn$per82)
# League average of per-82 back-to-back counts, one row per season.
season <- group_sch %>%
  mutate(b2b_per82 = (num_b2b / num_play_games) * 82) %>%
  group_by(season) %>%
  summarise(league_avg_per82 = mean(b2b_per82), .groups = "drop")
#season
library(ggplot2)
# Turn season into a factor (levels in order of appearance) so that it is
# treated as discrete and every season gets its own colour.
b2b_plot <- season %>%
  mutate(season = factor(season, levels = unique(season)))

# One point per season: league-average back-to-backs per team (per-82).
b2b_plot %>%
  ggplot(aes(x = season, y = league_avg_per82, color = season)) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "League Avg B2Bs per Team (per-82) by Season",
    subtitle = "NBA Seasons: 2014-2023",
    x = "Season",
    y = "Number of B2Bs per-82 games"
  )
A scheduling pattern that can be observed from this scatterplot is the decline over time in league average back-to-back games per team (per-82) from 2014 to 2023. From 2014 to 2019, the points follow a downward trend, falling to about 12 back-to-back games per team (per-82), which reflects the league’s efforts to reduce schedule compression. The 2019 and 2020 NBA seasons are interesting to consider because COVID shortened both seasons, affecting the total number of games played and the number of back-to-back games. More specifically, the 2020 season was unique: the scatterplot jumps to about 17 back-to-back games per team (per-82) because of the delayed start of the season. As the league was transitioning back from COVID-19, teams played about 10 fewer games (72) and played consecutive games in the same city against the same opponent to reduce travel. In the following three seasons, the league average held steady at around 13 to 14 back-to-back games per team (per-82), suggesting a new baseline for scheduling back-to-back games in future seasons.
Second Visualization
# Rebuild the team/season-grouped schedule from the raw table.
group_sch <- schedule %>%
  group_by(team, season) %>%
  arrange(gamedate) %>%
  select(season, gamedate, team)

# Per team-season: count games that are the 4th game within the trailing
# six-night window and scale that count to an 82-game season.
group_sch <- group_sch %>%
  mutate(
    games_in_6 = map_int(gamedate, ~ sum(gamedate >= (.x - days(5)) & gamedate <= .x)),
    is_4in6 = (games_in_6 == 4) # TRUE if 4th game in the six-night span
  ) %>%
  summarise(
    four_in_six = sum(is_4in6), # total number of 4-in-6 stretches
    num_play_games = n(),       # number of games played in the season
    # four_in_six is already one value per group here, so no extra sum()
    # is needed; same formula as the earlier per-82 calculation.
    per82 = (four_in_six / num_play_games) * 82,
    .groups = "drop"
  )
#group_sch
# Turn season into a factor (levels in order of appearance) so that it is
# treated as discrete and every season gets its own box and colour.
four_in_six_plot <- group_sch %>%
  mutate(season = factor(season, levels = unique(season)))

# Distribution of per-82 4-in-6 counts across teams for each season, with
# the seasonal mean overlaid as a point.
ggplot(four_in_six_plot, aes(x = season, y = per82, color = season)) +
  geom_boxplot() +
  # NOTE(review): fill has no visible effect on the default point shape;
  # use a fillable shape (e.g. shape = 21) if a white-filled marker was
  # intended.
  stat_summary(fun = mean, geom = "point", fill = "white") +
  labs(
    title = "League Avg 4-in-6 Stretches per Team (per-82) by Season",
    # labs() expects `subtitle`; the misspelt `subtitles` argument meant
    # the subtitle was never drawn.
    subtitle = "NBA Seasons: 2014-2023",
    x = "Season",
    y = "Number of 4-in-6 Stretches per-82 games"
  ) +
  theme_minimal()
From this boxplot, there is an overall decline from the 2014 season to the 2023 season in both the mean number and the distribution (median and IQR) of 4-in-6 stretches (per 82). As mentioned in the first visualization, the 2019 and 2020 NBA seasons were greatly impacted by COVID-19, so their distributions are wider than those of other seasons; more specifically, the count spikes in 2020 because of the league’s plan to reduce travel during the pandemic. If we ignore those two seasons, there is an interesting trend: from 2014 to 2016 the league averaged almost 30 4-in-6 stretches (per 82), which then drops sharply to almost 20 in the 2017 and 2018 seasons before rebounding slightly to the mid-20s by 2021-2023. Based on this observation, the league may have experimented with different scheduling methods prior to COVID; however, by 2023 it appears to be settling on around 23 or 24 4-in-6 stretches (per 82) for future seasons.
QUESTION: Design a plotting tool to help visualize a team’s schedule for a season. The plot covers the whole season and should help the viewer contextualize and understand a team’s schedule, potentially highlighting periods of excessive travel, dense blocks of games, or other schedule anomalies.
Then, use the tool to plot OKC and DEN’s provided 80-game 2024-25 schedules.
#we want to do scatter plot timeline
#this is the plan:
#x: game date (full season)
#y: games in last 6 nights (schedule density)
#different color for Home vs Away
#different shape for 2nd night of a B2B (triangle) vs not (circle)
#hover: opponent + Win/Loss (and any extras you want)
# Per-game plotting data for every team in the draft schedule:
#   venue      - "Home" / "Away" (point colour)
#   games_in_6 - games in the trailing six-night window (y axis)
#   is_b2b     - TRUE on the second night of a back-to-back (point shape)
#   text       - HTML tooltip shown on hover in the interactive plot
plot_tool <- draft_schedule %>%
  group_by(team) %>%
  arrange(gamedate) %>%
  mutate(
    # home/away flag and label
    location = (home == 1),
    venue = if_else(location, "Home", "Away"),
    # schedule density: games in the last 6 nights, this game included
    games_in_6 = map_int(gamedate, ~ sum(gamedate >= (.x - days(5)) & gamedate <= .x)),
    # two games inside a two-day window => second night of a back-to-back
    games_b2b = map_int(gamedate, ~ sum(gamedate >= (.x - days(1)) & gamedate <= .x)),
    is_b2b = (games_b2b == 2),
    # result label; games with no recorded result are shown as "TBD"
    # instead of the literal text "NA" in the tooltip
    result = (win == 1),
    record = if_else(result, "Win", "Loss", missing = "TBD"),
    # opponent (plain column reference; no .data pronoun needed here)
    opp = opponent,
    # tooltip for hovering over a point on the interactive plot
    text = paste0(
      format(gamedate, "%b %d, %Y"), " - ",
      if_else(location, "vs ", "@ "), opp, " (", record, ")",
      "<br>Games in last 6: <b>", games_in_6, "</b>",
      "<br>2nd of B2B: ", if_else(is_b2b, "Yes", "No")
    )
  ) %>%
  # drop the team grouping so it does not leak into later steps
  ungroup()
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
#OKC 2024-2025 schedule
# Static OKC timeline: x = game date, y = games in the last six nights,
# colour = venue, shape = second night of a back-to-back. The `text`
# aesthetic is not used by ggplot2 itself (hence its warning); it carries
# the tooltip for ggplotly().
okc <- plot_tool %>%
  filter(team == "OKC") %>%
  ggplot(aes(x = gamedate, y = games_in_6)) +
  geom_point(aes(color = venue, shape = is_b2b, text = text), size = 2) +
  theme_minimal() +
  labs(
    title = "OKC Team Schedule 2024-2025",
    x = "Game date (Full Season)",
    y = "Games in the Last 6 Nights (Schedule Density)"
  )
## Warning in geom_point(aes(color = venue, shape = is_b2b, text = text), size =
## 2): Ignoring unknown aesthetics: text
#DEN 2024-2025 schedule
# Static DEN timeline: x = game date, y = games in the last six nights,
# colour = venue, shape = second night of a back-to-back. The `text`
# aesthetic is not used by ggplot2 itself (hence its warning); it carries
# the tooltip for ggplotly().
den <- plot_tool %>%
  filter(team == "DEN") %>%
  ggplot(aes(x = gamedate, y = games_in_6)) +
  geom_point(aes(color = venue, shape = is_b2b, text = text), size = 2) +
  theme_minimal() +
  labs(
    title = "DEN Team Schedule 2024-2025",
    x = "Game date (Full Season)",
    y = "Games in the Last 6 Nights (Schedule Density)"
  )
## Warning in geom_point(aes(color = venue, shape = is_b2b, text = text), size =
## 2): Ignoring unknown aesthetics: text
ANSWER 6:
# Render both timelines as interactive plots; hover shows only `text`.
okc %>% ggplotly(tooltip = "text")
den %>% ggplotly(tooltip = "text")
QUESTION: Using your tool, find the best and worst part of OKC’s 2024-25 draft schedule. You can include context from past schedules, and use them to make a brief description about OKC’s schedule.
# Per-game plotting data for every team in the 2023 season, built the same
# way as for the draft schedule so the two seasons can be compared:
#   venue      - "Home" / "Away" (point colour)
#   games_in_6 - games in the trailing six-night window (y axis)
#   is_b2b     - TRUE on the second night of a back-to-back (point shape)
#   text       - HTML tooltip shown on hover in the interactive plot
plot_tool <- schedule %>%
  filter(season == 2023) %>%
  group_by(team) %>%
  arrange(gamedate) %>%
  mutate(
    # home/away flag and label
    location = (home == 1),
    venue = if_else(location, "Home", "Away"),
    # schedule density: games in the last 6 nights, this game included
    games_in_6 = map_int(gamedate, ~ sum(gamedate >= (.x - days(5)) & gamedate <= .x)),
    # two games inside a two-day window => second night of a back-to-back
    games_b2b = map_int(gamedate, ~ sum(gamedate >= (.x - days(1)) & gamedate <= .x)),
    is_b2b = (games_b2b == 2),
    # win/loss flag and label
    result = (win == 1),
    record = if_else(result, "Win", "Loss"),
    # opponent (plain column reference; no .data pronoun needed here)
    opp = opponent,
    # tooltip for hovering over a point on the interactive plot
    text = paste0(
      format(gamedate, "%b %d, %Y"), " - ",
      if_else(location, "vs ", "@ "), opp, " (", record, ")",
      "<br>Games in last 6: <b>", games_in_6, "</b>",
      "<br>2nd of B2B: ", if_else(is_b2b, "Yes", "No")
    )
  ) %>%
  # drop the team grouping so it does not leak into later steps
  ungroup()
library(plotly)
#OKC 2023-2024 schedule to compare to 2024-2025 above
# Static OKC timeline for 2023-24, drawn the same way as the 2024-25 plot
# (this overwrites the earlier `okc` plot object).
okc <- plot_tool %>%
  filter(team == "OKC") %>%
  ggplot(aes(x = gamedate, y = games_in_6)) +
  geom_point(aes(color = venue, shape = is_b2b, text = text), size = 2) +
  theme_minimal() +
  labs(
    title = "OKC Team Schedule 2023-2024",
    x = "Game date (Full Season)",
    y = "Games in the Last 6 Nights (Schedule Density)"
  )
## Warning in geom_point(aes(color = venue, shape = is_b2b, text = text), size =
## 2): Ignoring unknown aesthetics: text
# Interactive version of the 2023-24 OKC timeline; hover shows only `text`.
okc %>% ggplotly(tooltip = "text")
ANSWER 7:
Best: Observing the OKC Team Schedule 2024, there looks to be more blue
points when y = 3 or 4 compared to the OKC Team Schedule 2023,
indicating there have been more home games for the Thunder during dense
stretches. This is helpful for the team because during these dense
stretches, the players from OKC don’t have to travel, preventing travel
fatigue like disruption of sleep and circadian rhythm and benefiting
from home-court advantage with their fans.
Worst: Based on the OKC Team Schedule 2024, there are two tough stretches for the team: 1) Nov 19th to Dec 28th: 13 games, of which only 2 are home games and 11 are away games, with one game on the second night of a back-to-back (away -> away); 2) Jan 8th to Jan 17th: 6 games, of which only 1 is a home game and 5 are away games, with one game on the second night of a back-to-back (home -> away). During these stretches, players will most likely be dealing with heavy travel fatigue, which could impact their game performance and potentially the team’s record. With this in mind, the team can plan ahead to ensure its players are at their best.
QUESTION: Estimate how many more/fewer regular season wins each team has had due to schedule-related factors from 2019-20 through 2023-24. May consider the on-court strength of the scheduled opponents as well as the impact of travel/schedule density. Find the teams and estimates that benefited and struggled most from the schedule-related factors.
#schedule-related factors
# 1) Team on 2nd-of-B2B (−)
# 2) Team 6-night density overload (over 2) (−)
# 3) Rest differential (+ or -)
# 4) Opponent on 2nd-of-B2B (+) <- not needed anymore
# 5) Opponent 6-night density overload (over 2) (+) <- not needed anymore
#merit based for true win projection
# 1) fg%
# 2) ft%
# 3) OREB%
# 4) DREB%
# 5) net rtg
# 6) ast%
# 7) TOV%
# 8) win percentage of previous season <- not doing this anymore
# 9) track record percentage of previous season <- not doing this anymore
#output: y = m1x1 + m2x2 + ... + s1x3 + s2x4 + ...
#m1, m2: Coefficients for merit based features
#s1, s2: Coefficients for schedule based factors
#x1, x2: Merit based factors
#x3, x4: Schedule based factors
#y: Target (# games won by a team in a given season)
#Step 1: Fit model to predict y (linear regression) using actual data on merit and schedule (don't set schedule features to 0 yet)
#Step 2: Using the model (coefficients) obtained above, replace m1, m2, s1, s2,.. and set x3, x4,.. to 0. Then, get new y
#Step 3: Simplify table to 30 rows (one per team), total each team's actual and predicted wins across all games of all seasons, then subtract predicted from actual to get fewer/more wins
#Step 1
#schedule-related factors
# Schedule features per team-season (2019-2023), one row per game of the
# offensive team.
sched_team <- game_data %>%
  filter(season %in% 2019:2023) %>%
  group_by(off_team, season) %>%
  arrange(gamedate) %>%
  mutate(
    # days elapsed since the team's previous game (NA for the first game)
    team_rest_days = as.integer(gamedate - lag(gamedate)),
    # games within a two-day window: 2 means second night of a back-to-back
    team_b2b = map_int(gamedate, function(game_day) {
      sum(gamedate <= game_day & gamedate >= game_day - days(1))
    }),
    is_team_b2b = team_b2b == 2,
    # games within the trailing six-night window, this game included
    team_in_6 = map_int(gamedate, function(game_day) {
      sum(gamedate <= game_day & gamedate >= game_day - days(5))
    }),
    # season win total for the team, repeated on every row
    total_win = sum(off_win)
  ) %>%
  ungroup()
#mirror factors for opponents
# Mirror table: each team's own schedule features re-keyed as def_team so
# they can be joined back as the opponent's features for the same game.
opp_feats <- sched_team %>%
  transmute(
    season,
    gamedate,
    def_team = off_team,
    opp_rest_days = team_rest_days,
    is_opp_b2b = is_team_b2b,
    opp_in_6 = team_in_6
  )
# Attach opponent features to every game row and derive the rest gap.
# Missing rest values (no previous game in the season) are set to 0.
sched_based <- sched_team %>%
  left_join(opp_feats, by = c("season", "gamedate", "def_team")) %>%
  mutate(
    team_rest_days = coalesce(team_rest_days, 0L),
    opp_rest_days = coalesce(opp_rest_days, 0L),
    # positive when the team had more days since its last game than the opponent
    rest_diff = team_rest_days - opp_rest_days
  )
#merit-based factors
# Per-game "merit" (box-score) rates for the offensive team, 2019-2023.
merit_team <- game_data %>%
  filter(season %in% 2019:2023) %>%
  group_by(off_team, season) %>%
  arrange(gamedate) %>%
  mutate(
    fg_per = fgmade / fgattempted,            # field-goal percentage
    ft_per = ftmade / ftattempted,            # free-throw percentage
    oreb_per = reboffensive / reboundchance,  # offensive rebound rate
    dreb_per = rebdefensive / reboundchance,  # defensive rebound rate
    ortg = 100 * (points / possessions),      # points per 100 possessions
    ast_per = assists / fgmade,               # assists per made field goal
    tov_per = 100 * (turnovers / possessions) # turnovers per 100 possessions
  ) %>%
  ungroup()
#mirror factors for opponents
# Mirror table: each team's offensive rating re-keyed as def_team, so that
# after the join it reads as the rating allowed by the other side (drtg).
opp_feats <- merit_team %>%
  transmute(
    season,
    gamedate,
    def_team = off_team,
    drtg = ortg
  )
# Join the opponent's rating onto each game row and compute net rating.
merit_based <- left_join(
  merit_team,
  opp_feats,
  by = c("season", "gamedate", "def_team")
) %>%
  mutate(
    ft_per = replace_na(ft_per, 0), # missing free-throw rate becomes 0
    netrtg = ortg - drtg            # offensive minus defensive rating
  )
#combine into one big table of 11,658 games from 2019-2023 season
#decided not to include oreb_per and netrtg due to similar overlaps with other predictors (multicollinearity)
# Modelling table: one row per team-game with the outcome, the three
# schedule features and the five merit features kept for the model.
merit_cols <- merit_based %>%
  select(off_team, season, gamedate, off_win, fg_per, ft_per, dreb_per, ast_per, tov_per)

combined <- sched_based %>%
  select(off_team, season, gamedate, off_win, team_b2b, team_in_6, rest_diff) %>%
  left_join(merit_cols, by = c("off_team", "season", "gamedate", "off_win"))
combined
## # A tibble: 11,658 × 12
## off_team season gamedate off_win team_b2b team_in_6 rest_diff fg_per ft_per
## <chr> <dbl> <date> <dbl> <int> <int> <int> <dbl> <dbl>
## 1 LAC 2019 2019-10-22 1 1 1 0 0.519 0.708
## 2 LAL 2019 2019-10-22 0 1 1 0 0.435 0.714
## 3 NOP 2019 2019-10-22 0 1 1 0 0.422 0.85
## 4 TOR 2019 2019-10-22 1 1 1 0 0.408 0.842
## 5 DEN 2019 2019-10-23 1 1 1 0 0.420 0.815
## 6 POR 2019 2019-10-23 0 1 1 0 0.414 0.913
## 7 PHX 2019 2019-10-23 1 1 1 0 0.5 0.833
## 8 SAC 2019 2019-10-23 0 1 1 0 0.391 0.654
## 9 OKC 2019 2019-10-23 0 1 1 0 0.386 0.677
## 10 UTA 2019 2019-10-23 1 1 1 0 0.444 0.7
## # ℹ 11,648 more rows
## # ℹ 3 more variables: dreb_per <dbl>, ast_per <dbl>, tov_per <dbl>
#OLS linear regression model
# Ordinary least squares fit of the per-game outcome off_win on three schedule
# features (team_b2b, team_in_6, rest_diff) and five merit features
# (fg_per, ft_per, dreb_per, ast_per, tov_per), using every row of `combined`.
# NOTE(review): off_win appears to be a 0/1 indicator, so fitted values from
# a linear model are not constrained to [0, 1] - consider
# glm(family = binomial) if probabilities are needed.
model <- lm(off_win ~ team_b2b + team_in_6 + rest_diff + fg_per + ft_per + dreb_per + ast_per + tov_per, data = combined)
# print coefficient table and fit statistics
summary(model)
##
## Call:
## lm(formula = off_win ~ team_b2b + team_in_6 + rest_diff + fg_per +
## ft_per + dreb_per + ast_per + tov_per, data = combined)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.1591 -0.3762 -0.0018 0.3761 1.0761
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.874038 0.068993 -12.669 < 2e-16 ***
## team_b2b -0.033475 0.011116 -3.011 0.00261 **
## team_in_6 -0.014841 0.006009 -2.470 0.01354 *
## rest_diff 0.002591 0.001423 1.820 0.06875 .
## fg_per 4.197967 0.072964 57.535 < 2e-16 ***
## ft_per 0.484024 0.039535 12.243 < 2e-16 ***
## dreb_per -1.125909 0.057206 -19.682 < 2e-16 ***
## ast_per 0.261095 0.042712 6.113 1.01e-09 ***
## tov_per -0.015933 0.001079 -14.765 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.432 on 11649 degrees of freedom
## Multiple R-squared: 0.2541, Adjusted R-squared: 0.2536
## F-statistic: 496.1 on 8 and 11649 DF, p-value: < 2.2e-16
To quantify how the two types of predictors, schedule-related and team “merit”, relate to a team’s chance of winning a game, I fit an OLS linear regression on 11,658 games (2019-2023 seasons) with off_win as the response variable. My schedule-related predictors were team_b2b, team_in_6, and rest_diff. The team “merit” predictors are fg_per, ft_per, dreb_per, ast_per, and tov_per. Looking at the model, all but one predictor, rest_diff, are statistically significant with p-values less than 0.05, and the overall F-test is also statistically significant with a p-value less than 2.2e-16, indicating the predictors are meaningful to our model. However, the R-squared value is relatively low (0.2541), so the predictors don’t explain much of the variation in the response variable. These two observations are somewhat at odds with each other, so the model should be interpreted with caution. With the coefficients from the model, I am able to build a similar equation to predict a team’s chance of winning a game, but this time with the schedule-related predictors removed. That way, I can take the difference between the actual and predicted values for each team to see who benefitted most and least from the schedule-related factors.
#Step 2
#substitute coefficients from model into similar equation to predict team's chance of winning a game
#remove schedule-related predictors since we want to take difference of actual vs predicted to see the impact of schedule-related predictors
# Counterfactual prediction with the schedule terms switched off.
# Predict from the fitted model itself rather than re-typing its rounded
# coefficients, so the numbers always match `model` exactly. Setting the
# three schedule predictors to 0 removes their contribution, leaving the
# intercept plus the merit terms (the same equation as before).
no_schedule <- combined %>%
  mutate(team_b2b = 0, team_in_6 = 0, rest_diff = 0)

combined <- combined %>%
  mutate(
    win_prob = unname(predict(model, newdata = no_schedule)),
    # NOTE(review): rounding each game to 0/1 before summing discards the
    # probability information; summing win_prob directly would give
    # expected wins. Kept as-is so downstream results are unchanged.
    pred_win = round(win_prob)
  )
combined %>% select(off_team, season, off_win, win_prob, pred_win)
## # A tibble: 11,658 × 5
## off_team season off_win win_prob pred_win
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 LAC 2019 1 0.722 1
## 2 LAL 2019 0 0.381 0
## 3 NOP 2019 0 0.401 0
## 4 TOR 2019 1 0.475 0
## 5 DEN 2019 1 0.324 0
## 6 POR 2019 0 0.314 0
## 7 PHX 2019 1 0.754 1
## 8 SAC 2019 0 0.0174 0
## 9 OKC 2019 0 0.0727 0
## 10 UTA 2019 1 0.355 0
## # ℹ 11,648 more rows
#Step 3
#simplify table into 30 rows (30 teams) with total wins, total predicted wins, and difference across all games of all season
# One row per team: actual wins, predicted wins (schedule terms removed),
# and actual minus predicted, over all games of all seasons.
team_group <- combined %>%
  group_by(off_team) %>%
  summarise(
    total_win = sum(off_win),
    total_pred_win = sum(pred_win),
    .groups = "drop"
  ) %>%
  mutate(true_diff = total_win - total_pred_win)
team_group
## # A tibble: 30 × 4
## off_team total_win total_pred_win true_diff
## <chr> <dbl> <dbl> <dbl>
## 1 ATL 181 266 -85
## 2 BKN 204 259 -55
## 3 BOS 256 266 -10
## 4 CHA 147 205 -58
## 5 CHI 178 250 -72
## 6 CLE 184 245 -61
## 7 DAL 225 246 -21
## 8 DEN 251 303 -52
## 9 DET 94 194 -100
## 10 GSW 197 238 -41
## # ℹ 20 more rows
# team row(s) with the largest actual-minus-predicted difference
slice_max(team_group, true_diff, n = 1)
## # A tibble: 1 × 4
## off_team total_win total_pred_win true_diff
## <chr> <dbl> <dbl> <dbl>
## 1 MIL 260 255 5
# team row(s) with the smallest actual-minus-predicted difference
slice_min(team_group, true_diff, n = 1)
## # A tibble: 1 × 4
## off_team total_win total_pred_win true_diff
## <chr> <dbl> <dbl> <dbl>
## 1 WAS 144 249 -105
ANSWER 8: