Setup and Data

library(tidyverse)
schedule <- read_csv("schedule.csv")
draft_schedule <- read_csv("schedule_24_partial.csv")
locations <- read_csv("locations.csv")
game_data <- read_csv("team_game_data.csv")

Part 1 – Schedule Analysis

Question 1

QUESTION: 4-in-6 Frequency. Identify every game that is the 4th in the past 6 nights for OKC using a sliding, date-based window. Overlaps are allowed. The result is a season-level count of compressed stretches, plus a list of flagged game dates that can be overlaid on the schedule timeline.

okc <- draft_schedule %>%
  filter(team == "OKC") %>%
  arrange(gamedate) %>%
  select(-season, - opponent, - home, - win)

okc_4in6 <- okc %>%
  mutate(games_in_6 = map_int(gamedate, ~ sum(gamedate >= (.x - days(5)) & gamedate <= .x)), #assigns a number 1-4 to identify which game is 4th game in 6 nights
    is_4in6 = (games_in_6 == 4)) #TRUE if 4th game in 6 nights span

sum(okc_4in6$is_4in6) #total number of 4-in-6 stretches

## [1] 26

ANSWER 1:

26 4-in-6 stretches in OKC’s draft schedule.

Question 2

QUESTION: Find the average number of 4-in-6 stretches for a team in a season from 2014-15 to 2023-24. Adjust each team/season to per-82 games before taking final average.

group_sch <- schedule %>%
  group_by(team, season) %>%
  arrange(gamedate) %>%
  select(season, gamedate, team)

#double check if schedule is now grouped into each team and season and accurately in chronological order
#check_sch <- group_sch %>%
#  filter(team == "OKC", season == "2018")
#check_sch <- check_sch %>%
#  mutate(games_in_6 = map_int(gamedate, ~ sum(gamedate >= (.x - days(5)) & gamedate <= .x)),
#    is_4in6 = (games_in_6 == 4)
#  )
#sum(check_sch$is_4in6)

group_sch <- group_sch %>%
  mutate(games_in_6 = map_int(gamedate, ~ sum(gamedate >= (.x - days(5)) & gamedate <= .x)), #assigns a number 1-4 to identify which game is 4th game in 6 nightd
    is_4in6 = (games_in_6 == 4)) %>% #TRUE if 4th game in 6 nights span
  summarise(
    four_in_six = sum(is_4in6), #total number of 4-in-6 stretches
    num_play_games = n(), #number of games played in each season
    per82 = (four_in_six/num_play_games) * 82, #adjust to per-82 games
    .groups = "drop"
  )

avg_4in6 <-mean(group_sch$per82) #sum up per82 columns then divide by tibble row size 300
avg_4in6 #final average

## [1] 25.09998

ANSWER 2:

25.1 4-in-6 stretches on average.

Question 3

QUESTION: Out of the 30 NBA teams, find the highest and lowest average number of 4-in-6 stretches between 2014-15 and 2023-24. Adjust each team/season to per-82 games.

two_seasons <- group_sch %>% 
  filter(season == 2014 | season == 2023)
two_seasons

## # A tibble: 60 × 5
##    team  season four_in_six num_play_games per82
##    <chr>  <dbl>       <int>          <int> <dbl>
##  1 ATL     2014          32             82    32
##  2 ATL     2023          25             82    25
##  3 BKN     2014          30             82    30
##  4 BKN     2023          24             82    24
##  5 BOS     2014          37             82    37
##  6 BOS     2023          25             82    25
##  7 CHA     2014          38             82    38
##  8 CHA     2023          28             82    28
##  9 CHI     2014          29             82    29
## 10 CHI     2023          23             82    23
## # ℹ 50 more rows

two_seasons %>%
  slice_max(per82, n = 1) #find max per82 value and prints the whole row

## # A tibble: 1 × 5
##   team  season four_in_six num_play_games per82
##   <chr>  <dbl>       <int>          <int> <dbl>
## 1 CHA     2014          38             82    38

two_seasons %>%
  slice_min(per82, n = 1) #find min per82 value and prints the whole row

## # A tibble: 1 × 5
##   team  season four_in_six num_play_games per82
##   <chr>  <dbl>       <int>          <int> <dbl>
## 1 DEN     2023          15             82    15

ANSWER 3:

Most 4-in-6 stretches on average: CHA (38)
Fewest 4-in-6 stretches on average: DEN (15)

Question 4

QUESTION: Based on 2023-2024 season, find BKN’s defensive eFG% and their defensive eFG% that season in situations where their opponent was on the second night of back-to-back.

bkn_2023 <- game_data %>%
  filter(def_team == "BKN", season == 2023) %>%
  arrange(gamedate) %>%
  select(fgmade, fgattempted, fg3made)

def_eFG <- bkn_2023 %>%
  summarise(
    fgm = sum(fgmade), #total fg made for opponents vs BKN in 2023 season
    fg3m = sum(fg3made), #total 3 pointers made for opponents vs BKN in 2023 season
    fga = sum(fgattempted) #total fg attempted for opponents vs BKN in 2023 season
  ) %>%
  mutate(def_eFG = (fgm + 0.5 * fg3m)/fga) #eFG% formula: (FGM + 0.5 * 3PM) / FGA
def_eFG

## # A tibble: 1 × 4
##     fgm  fg3m   fga def_eFG
##   <dbl> <dbl> <dbl>   <dbl>
## 1  3410  1066  7255   0.543

nba_sch <- game_data %>%
  filter(season == 2023) %>%
  group_by(off_team) %>%
  arrange(gamedate) %>%
  select(gamedate, off_team, def_team, fgmade, fgattempted, fg3made)

#double check if schedule is now grouped into each team and season and accurately in chronological order
#check_sch <- nba_sch %>%
#  filter(off_team == "OKC")
#check_sch <- check_sch %>%
#  mutate(games_b2b = map_int(gamedate, ~ sum(gamedate >= (.x - days(1)) & gamedate <= .x)),
#    is_b2b = (games_b2b == 2)
#  )
#sum(check_sch$is_b2b)

b2b <- nba_sch %>%
  mutate(games_b2b = map_int(gamedate, ~ sum(gamedate >= (.x - days(1)) & gamedate <= .x)), #assigns a 1 if game is not 2nd night of a b2b and 2 if game is a 2nd night of a b2b
    is_b2b = (games_b2b == 2) #TRUE if 2th game in 2 nights span
)

opp_b2b <- b2b %>%
  ungroup() %>%
  filter(is_b2b == "TRUE", def_team == "BKN") %>%
  summarise(
    fgm = sum(fgmade), #total fg made for opponents on 2nd night of b2b vs BKN in 2023 season
    fg3m = sum(fg3made), #total 3 pointers made for opponents on 2nd night of b2b vs BKN in 2023 season
    fga = sum(fgattempted), #total fg attempted for opponents on 2nd night of b2b vs BKN in 2023 season
  ) %>%
  mutate(def_eFG = (fgm + 0.5 * fg3m)/fga) #eFG% formula: (FGM + 0.5 * 3PM) / FGA
opp_b2b

## # A tibble: 1 × 4
##     fgm  fg3m   fga def_eFG
##   <dbl> <dbl> <dbl>   <dbl>
## 1   650   217  1418   0.535

ANSWER 4:

BKN Defensive eFG%: 54.3%
When opponent on a B2B: 53.5%

Part 2 – Trends and Visualizations

Question 5

QUESTION: Identify 2 trends in scheduling over time. How are the more recent schedules different from the schedules of the past? Include a visual (plot or styled table) highlighting or explaining each trend and include a brief written description of your findings.

ANSWER 5:

First Visualization

group_sch <- schedule %>%
  group_by(team, season) %>%
  arrange(gamedate) %>%
  select(season, gamedate, team)

group_sch <- group_sch %>%
  mutate(games_b2b = map_int(gamedate, ~ sum(gamedate >= (.x - days(1)) & gamedate <= .x)), #assigns a 1 if game is not 2nd night of a b2b and 2 if game is a 2nd night of a b2b
    is_b2b = (games_b2b == 2)) %>% #TRUE if 2th game in 2 nights span
  summarise(
    num_b2b = sum(is_b2b), #total number of b2b games
    num_play_games = n(), #number of games played in each season
    .groups = "drop"
  )

#checking for league avg of b2b games for 2018 and 2020 season
#check_sn <- season %>%
#  filter(season == "2018") %>%
#  summarise(
#    per82 = (num_b2b / num_play_games) * 82, 
#    .groups = "drop"
#  )
#avg_2018 <- mean(check_sn$per82)

#check_sn <- season %>%
#  filter(season == "2020") %>%
#  summarise(
#    per82 = (num_b2b / num_play_games) * 82, 
#    .groups = "drop"
#  )
#avg_2020 <- mean(check_sn$per82)

season <- group_sch %>%
  group_by(season) %>%
  summarise(
    league_avg_per82 = mean((num_b2b/num_play_games) * 82), #take mean of each group "season"
    .groups = "drop"
  )
#season

library(ggplot2)
b2b_plot <- season %>%
  mutate(season = factor(season, levels = unique(season))) #make each season an ordered factor so that each point can have its own color on the plot to distinguish

ggplot(b2b_plot, aes(x = season, y = league_avg_per82, color = season)) + geom_point(size = 2) + labs(title = "League Avg B2Bs per Team (per-82) by Season", subtitle = "NBA Seasons: 2014-2023", x = "Season", y = "Number of B2Bs per-82 games") + theme_minimal()

A scheduling pattern that can be observed from this scatterplot is the decline over time in league average back-to-back games per team (per-82) from 2014 to 2023. From 2014 to 2019, the points follow a downward trend falling to about 12 back-to-back games per team (per-82), suggesting the league’s efforts to reduce schedule compression. The 2019 and 2020 NBA season are interesting to consider because COVID shortened the seasons, impacting the total number of games played and back-to-back games. More specifically, the 2020 season was unique since we see a jump on the scatterplot to about 17 back-to-back games per team (per-82) due to the delay of the season. As the league was transitioning back from COVID-19, they had about 10 less games (72) and had teams play consecutively in the same city against the same opponents to reduce travel. In the following three seasons, the league average was constant around 13 to 14 back-to-back games per team (per-82), suggesting a new baseline for scheduling back-to-back games for each team (per-82) in future seasons.

Second Visualization

group_sch <- schedule %>%
  group_by(team, season) %>%
  arrange(gamedate) %>%
  select(season, gamedate, team)

group_sch <- group_sch %>%
  mutate(games_in_6 = map_int(gamedate, ~ sum(gamedate >= (.x - days(5)) & gamedate <= .x)), #assigns a number 1-4 to identify which game is 4th game in 6 nights
    is_4in6 = (games_in_6 == 4)) %>% #TRUE if 4th game in 6 nights span
  summarise(
    four_in_six = sum(is_4in6), #total number of 4-in-6 stretches
    num_play_games = n(), #number of games played in each season
    per82 = sum(four_in_six)/num_play_games * 82, #adjust to per-82 games
    .groups = "drop"
  )

#group_sch

four_in_six_plot <- group_sch %>%
  mutate(season = factor(season, levels = unique(season))) #make each season an ordered factor so that each point can have its own color on the plot to distinguish

ggplot(four_in_six_plot, aes(x = season, y = per82, color = season)) + geom_boxplot() + stat_summary(fun = mean, geom = "point", fill = "white") + labs(title = "League Avg 4-in-6 Stretches per Team (per-82) by Season", subtitles = "NBA Seasons: 2014-2023", x = "Season", y = "Number of 4-in-6 Stretches per-82 games") + theme_minimal()

From this boxplot, there is an overall decline over time from the 2014 season to 2023 season from the mean number and distribution (median and IQR) of 4-in-6 stretches (per 82). As mentioned in the first visualization, the 2019 and 2020 NBA season were greatly impacted by COVID-19 so their distributions were larger than other seasons, and more specifically, spikes up in 2020 due to their plan to reduce traveling during the pandemic. If we were to ignore those two seasons, there is an interesting trend I noticed where from 2014-2016, the league averaged almost 30 4-in-6 stretches (per 82) then significantly drops to almost 20 4-in-6 stretches (per 82) in the 2017 and 2018 season, but then rebounds slightly to near mid-20s 4-in-6 stretches (per 82) by 2021-2023. Based on this observation, the league may have experimented different unique scheduling methods prior to COVID; however, by 2023, it seems they plan to be consistent with scheduling around 23 or 24 4-in-6 stretches (per 82) for future seasons.

Question 6

QUESTION: Design a plotting tool to help visualize a team’s schedule for a season. The plot covers the whole season and should help the viewer contextualize and understand a team’s schedule, potentially highlighting periods of excessive travel, dense blocks of games, or other schedule anomalies.

Then, use the tool to plot OKC and DEN’s provided 80-game 2024-25 schedules.

#we want to do scatter plot timeline
#this is the plan:
#x: game date (full season)
#y: games in last 6 nights (schedule density)
#different color for Home vs Away
#different shape for 2nd night of a B2B (triangle) vs not (circle)
#hover: opponent + Win/Loss (and any extras you want)

plot_tool <- draft_schedule %>%
  group_by(team) %>%
  arrange(gamedate) %>%
  mutate(
    #distinct home and away games for color later
    location = (home == 1),
    venue = if_else(location, "Home", "Away"),
    
    #y axis: games in last 6 nights (schedule density)
    games_in_6 = map_int(gamedate, ~ sum(gamedate >= (.x - days(5)) & gamedate <= .x)),
    
    #2nd game of b2b games
    games_b2b = map_int(gamedate, ~ sum(gamedate >= (.x - days(1)) & gamedate <= .x)),
    is_b2b = (games_b2b == 2), #TRUE if 2th game in 2 nights span
    
    #distinct wins and losses
    result = (win == 1),
    record = if_else(result, "Win", "Loss"),
    
    #opponent
    opp = .data[["opponent"]],
    
    #tooltip for hover over point on interactive plot
    text = paste0(
      format(gamedate, "%b %d, %Y"), " - ",
      if_else(location, "vs ", "@ "), opp, " (", record, ")",
      "<br>Games in last 6: <b>", games_in_6, "</b>",
      "<br>2nd of B2B: ", if_else(is_b2b, "Yes", "No")
    )
  )

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

#OKC 2024-2025 schedule
okc <- plot_tool %>%
  filter(team == "OKC") %>%
  ggplot(aes(x = gamedate, y = games_in_6)) + geom_point(aes(color = venue, shape = is_b2b, text = text), size = 2) + labs(title = "OKC Team Schedule 2024-2025", x = "Game date (Full Season)", y = "Games in the Last 6 Nights (Schedule Density)")+ theme_minimal()

## Warning in geom_point(aes(color = venue, shape = is_b2b, text = text), size =
## 2): Ignoring unknown aesthetics: text

#DEN 2024-2025 schedule
den <- plot_tool %>%
  filter(team == "DEN") %>%
  ggplot(aes(x = gamedate, y = games_in_6)) + geom_point(aes(color = venue, shape = is_b2b, text = text), size = 2) + labs(title = "DEN Team Schedule 2024-2025", x = "Game date (Full Season)", y = "Games in the Last 6 Nights (Schedule Density)")+ theme_minimal()

## Warning in geom_point(aes(color = venue, shape = is_b2b, text = text), size =
## 2): Ignoring unknown aesthetics: text

ANSWER 6:

ggplotly(okc, tooltip = "text")

ggplotly(den, tooltip = "text")

Question 7

QUESTION: Using your tool, find the best and worst part of OKC’s 2024-25 draft schedule. You can include context from past schedules, and use them to make a brief description about OKC’s schedule.

plot_tool <- schedule %>%
  filter(season == 2023) %>%
  group_by(team) %>%
  arrange(gamedate) %>%
  mutate(
    #distinct home and away games for color later
    location = (home == 1),
    venue = if_else(location, "Home", "Away"),
    
    #y axis: games in last 6 nights (schedule density)
    games_in_6 = map_int(gamedate, ~ sum(gamedate >= (.x - days(5)) & gamedate <= .x)),
    
    #2nd game of b2b games
    games_b2b = map_int(gamedate, ~ sum(gamedate >= (.x - days(1)) & gamedate <= .x)),
    is_b2b = (games_b2b == 2), #TRUE if 2th game in 2 nights span
    
    #distinct wins and losses
    result = (win == 1),
    record = if_else(result, "Win", "Loss"),
    
    #opponent
    opp = .data[["opponent"]],
    
    #tooltip for hover over point on interactive plot
    text = paste0(
      format(gamedate, "%b %d, %Y"), " - ",
      if_else(location, "vs ", "@ "), opp, " (", record, ")",
      "<br>Games in last 6: <b>", games_in_6, "</b>",
      "<br>2nd of B2B: ", if_else(is_b2b, "Yes", "No")
    )
  )

library(plotly)

#OKC 2023-2024 schedule to compare to 2024-2025 above
okc <- plot_tool %>%
  filter(team == "OKC") %>%
  ggplot(aes(x = gamedate, y = games_in_6)) + geom_point(aes(color = venue, shape = is_b2b, text = text), size = 2) + labs(title = "OKC Team Schedule 2023-2024", x = "Game date (Full Season)", y = "Games in the Last 6 Nights (Schedule Density)")+ theme_minimal()

## Warning in geom_point(aes(color = venue, shape = is_b2b, text = text), size =
## 2): Ignoring unknown aesthetics: text

ggplotly(okc, tooltip = "text")

ANSWER 7:
Best: Observing the OKC Team Schedule 2024, there looks to be more blue points when y = 3 or 4 compared to the OKC Team Schedule 2023, indicating there have been more home games for the Thunder during dense stretches. This is helpful for the team because during these dense stretches, the players from OKC don’t have to travel, preventing travel fatigue like disruption of sleep and circadian rhythm and benefiting from home-court advantage with their fans.

Worst: Based on the OKC Team Schedule 2024, there are two tough stretches the team faced: 1) Nov 19th to Dec 28th: 13 games where only 2 home games and 11 away games with one game on the second night of a back-to-back (away -> away), 2) Jan 8th to Jan 17th: 6 games where only 1 home game and 5 away games with one game on the second night of a back-to-back (home -> away). During these stretches, players will most likely be dealing with heavy travel fatigue, which could impact their game performance and potentially team record. With this in mind, the team can efficiently plan ahead for this to ensure their players are at their best.

Part 3 – Modeling

Question 8

QUESTION: Estimate how many more/fewer regular season wins each team has had due to schedule-related factors from 2019-20 though 2023-24. May consider the on-court strength of the scheduled opponents as well as the impact of travel/schedule density. Find the teams and estimates that benefited and struggled most from the schedule-related factors.

#schedule-related factors
# 1) Team on 2nd-of-B2B (−)
# 2) Team 6-night density overload (over 2) (−)
# 3) Rest differential (+ or -)
# 4) Opponent on 2nd-of-B2B (+)  <- not needed anymore
# 5) Opponent 6-night density overload (over 2) (+)  <- not needed anymore

#merit based for true win projection
# 1) fg%
# 2) ft%
# 3) OREB%
# 4) DREB%
# 5) net rtg
# 6) ast%
# 7) TOV%
# 8) win percentage of previous season <- not doing this anymore
# 9) track record percentage of previous season <- not doing this anymore

#output: y = m1x1 + m2x2 + ... + s1x3 + s2x4 + ...
#m1, m2: Coefficients for merit based features
#s1, s2: Coefficients for schedule based factors
#x1, x2: Merit based factors
#x3, x4: Schedule based factors
#y: Target (# games won by a team in a given season)


#Step 1: Fit model to predict y (linear regression) using actual data on merit and schedule (don't set schedule features to 0 yet)
#Step 2: Using the model (coefficients) obtained above, replace m1, m2, s1, s2,.. and set x3, x4,.. to 0. Then, get new y
#Step 3: Simplify table to 30 rows for teams and find each team's avg across all games of all seasons the subtract actual and predicted to get fewer/more wins

#Step 1
#schedule-related factors
sched_team <- game_data %>%
  filter(season %in% 2019:2023) %>%
  group_by(off_team, season) %>%
  arrange(gamedate) %>%
  mutate(
    team_rest_days = as.integer(gamedate - lag(gamedate)), #calculates number of rest days before team plays again
    team_b2b = map_int(gamedate, ~ sum(gamedate >= (.x - days(1)) & gamedate <= .x)), #assigns a 1 if game is not 2nd night of a b2b and 2 if game is a 2nd night of a b2b
    is_team_b2b = (team_b2b == 2), #TRUE if 2th game in 2 nights span
    team_in_6 = map_int(gamedate, ~ sum(gamedate >= (.x - days(5)) & gamedate <= .x)), #assigns a number to identify which game in 6 nights span
    total_win = sum(off_win) #total wins of that season for corresponding team
  ) %>%
  ungroup()

#mirror factors for opponents
opp_feats <- sched_team %>%
  transmute(
    season, gamedate,
    def_team = off_team,
    opp_rest_days = team_rest_days, #opponent's number of rest days in between
    is_opp_b2b = is_team_b2b, #check if opponent is playing on a b2b
    opp_in_6 = team_in_6 #check what game in 6 nights span for opponent
  )

sched_based <- sched_team %>%
  left_join(opp_feats, by = c("season", "gamedate", "def_team")) %>%
  mutate(
    team_rest_days = replace_na(team_rest_days, 0), #replace any NAs with 0
    opp_rest_days  = replace_na(opp_rest_days, 0), #replace any NAs with 0
    rest_diff = team_rest_days - opp_rest_days #rest difference between team and opponent
  )

#merit-based factors
merit_team <- game_data %>%
  filter(season %in% 2019:2023) %>%
  group_by(off_team, season) %>%
  arrange(gamedate) %>%
  mutate(
    # 1) fg%
    fg_per = fgmade/fgattempted,
    # 2) ft%
    ft_per = ftmade/ftattempted,
    # 3) OREB%
    oreb_per = reboffensive/reboundchance,
    # 4) DREB%
    dreb_per = rebdefensive/reboundchance,
    # 5) ORTG%
    ortg = (points/possessions) * 100, 
    # 6) ast%
    ast_per = assists/fgmade,
    # 7) TOV%
    tov_per = (turnovers/possessions) * 100
  ) %>%
  ungroup() 
  
#mirror factors for opponents
opp_feats <- merit_team %>%
  transmute(
    season, gamedate,
    def_team = off_team,
    drtg = ortg #finds team's defensive rating
  )

merit_based <- merit_team %>%
  left_join(opp_feats, by = c("season", "gamedate", "def_team")) %>%
  mutate(
    ft_per = replace_na(ft_per, 0), #replace NAs with 0
    netrtg = ortg - drtg #calculates difference of ortg and drtg for team net rating
  )

#combine into one big table of 11,658 games from 2019-2023 season
#decided not to include oreb_per and netrtg due to similar overlaps with other predictors (multicollinearity)
combined <- sched_based %>%
  select(off_team, season, gamedate, off_win, team_b2b, team_in_6, rest_diff) %>%
  left_join(merit_based %>%
    select(off_team, season, gamedate, off_win, fg_per, ft_per, dreb_per, ast_per, tov_per),
    by = c("off_team", "season", "gamedate", "off_win")
)
combined

## # A tibble: 11,658 × 12
##    off_team season gamedate   off_win team_b2b team_in_6 rest_diff fg_per ft_per
##    <chr>     <dbl> <date>       <dbl>    <int>     <int>     <int>  <dbl>  <dbl>
##  1 LAC        2019 2019-10-22       1        1         1         0  0.519  0.708
##  2 LAL        2019 2019-10-22       0        1         1         0  0.435  0.714
##  3 NOP        2019 2019-10-22       0        1         1         0  0.422  0.85 
##  4 TOR        2019 2019-10-22       1        1         1         0  0.408  0.842
##  5 DEN        2019 2019-10-23       1        1         1         0  0.420  0.815
##  6 POR        2019 2019-10-23       0        1         1         0  0.414  0.913
##  7 PHX        2019 2019-10-23       1        1         1         0  0.5    0.833
##  8 SAC        2019 2019-10-23       0        1         1         0  0.391  0.654
##  9 OKC        2019 2019-10-23       0        1         1         0  0.386  0.677
## 10 UTA        2019 2019-10-23       1        1         1         0  0.444  0.7  
## # ℹ 11,648 more rows
## # ℹ 3 more variables: dreb_per <dbl>, ast_per <dbl>, tov_per <dbl>

#OLS linear regression model
model <- lm(off_win ~ team_b2b + team_in_6 + rest_diff + fg_per + ft_per + dreb_per + ast_per + tov_per, data = combined)
summary(model)

## 
## Call:
## lm(formula = off_win ~ team_b2b + team_in_6 + rest_diff + fg_per + 
##     ft_per + dreb_per + ast_per + tov_per, data = combined)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1591 -0.3762 -0.0018  0.3761  1.0761 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.874038   0.068993 -12.669  < 2e-16 ***
## team_b2b    -0.033475   0.011116  -3.011  0.00261 ** 
## team_in_6   -0.014841   0.006009  -2.470  0.01354 *  
## rest_diff    0.002591   0.001423   1.820  0.06875 .  
## fg_per       4.197967   0.072964  57.535  < 2e-16 ***
## ft_per       0.484024   0.039535  12.243  < 2e-16 ***
## dreb_per    -1.125909   0.057206 -19.682  < 2e-16 ***
## ast_per      0.261095   0.042712   6.113 1.01e-09 ***
## tov_per     -0.015933   0.001079 -14.765  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.432 on 11649 degrees of freedom
## Multiple R-squared:  0.2541, Adjusted R-squared:  0.2536 
## F-statistic: 496.1 on 8 and 11649 DF,  p-value: < 2.2e-16

To quantify how the two different types of predictors, schedule-related and team “merit”, relates to a team’s chance of winning a game, I fit an OLS linear regression on 11, 658 games (2019-2023 season) with the response variable off_win. My schedule-related predictors were team_b2b, team_in_6, and rest_diff. As for team “merit” predictors, they are fg_per, ft_per, dreb_per, ast_per, and tov_per. Looking at the model, all but one predictor, rest_diff, are statistically significant with p-values less than 0.05, and the overall F-test is also statistically significant with p-value less than 2.2e-16, indicating the predictors are meaningful to our model. However, I noticed the R-squared value is relatively low (0.2541) so the predictors don’t explain much of the variation in the response variable. These two observations somewhat contradict with each other, which brings some concerns that we should take this model with caution. With the coefficients from the model, I am able to create a similar equation to predict a team’s chance of winning a game, but this time, I removed the schedule-related predictors. That way, I am able to take the difference between the actual and predicted to get a difference value for each team to see who benefitted most and least from the schedule-related factors.

#Step 2
#substitute coefficients from model into similar equation to predict team's chance of winning a game
#remove schedule-related predictors since we want to take difference of actual vs predicted to see the impact of schedule-related predictors
combined <- combined %>%
  mutate(win_prob = -0.874038 + (4.197967*fg_per) + (0.484024*ft_per) + (-1.125909*dreb_per) + (0.261095*ast_per) + (-0.015933*tov_per),
         pred_win = round(win_prob))
combined %>% select(off_team, season, off_win, win_prob, pred_win)

## # A tibble: 11,658 × 5
##    off_team season off_win win_prob pred_win
##    <chr>     <dbl>   <dbl>    <dbl>    <dbl>
##  1 LAC        2019       1   0.722         1
##  2 LAL        2019       0   0.381         0
##  3 NOP        2019       0   0.401         0
##  4 TOR        2019       1   0.475         0
##  5 DEN        2019       1   0.324         0
##  6 POR        2019       0   0.314         0
##  7 PHX        2019       1   0.754         1
##  8 SAC        2019       0   0.0174        0
##  9 OKC        2019       0   0.0727        0
## 10 UTA        2019       1   0.355         0
## # ℹ 11,648 more rows

#Step 3
#simplify table into 30 rows (30 teams) with total wins, total predicted wins, and difference across all games of all season
team_group <- combined %>%
  group_by(off_team) %>%
  summarise(
    total_win = sum(off_win), #total wins across all games of all seasons
    total_pred_win = sum(pred_win), #total predicted wins across all games of all seasons
    true_diff = total_win - total_pred_win, #difference between actual and predicted
    .groups = "drop"
  )
team_group

## # A tibble: 30 × 4
##    off_team total_win total_pred_win true_diff
##    <chr>        <dbl>          <dbl>     <dbl>
##  1 ATL            181            266       -85
##  2 BKN            204            259       -55
##  3 BOS            256            266       -10
##  4 CHA            147            205       -58
##  5 CHI            178            250       -72
##  6 CLE            184            245       -61
##  7 DAL            225            246       -21
##  8 DEN            251            303       -52
##  9 DET             94            194      -100
## 10 GSW            197            238       -41
## # ℹ 20 more rows

team_group %>%
  slice_max(true_diff, n = 1) #find max true_diff value and prints the whole row

## # A tibble: 1 × 4
##   off_team total_win total_pred_win true_diff
##   <chr>        <dbl>          <dbl>     <dbl>
## 1 MIL            260            255         5

team_group %>%
  slice_min(true_diff, n = 1) #find min true_diff value and prints the whole row

## # A tibble: 1 × 4
##   off_team total_win total_pred_win true_diff
##   <chr>        <dbl>          <dbl>     <dbl>
## 1 WAS            144            249      -105

ANSWER 8:

Most Helped by Schedule: MIL (+5 wins)
Most Hurt by Schedule: WAS (-105 wins)

NBA Schedule Modeling

Independent Case Study

Johnathan Pham

09/17/25

Introduction