Introduction

This independent case study builds a Schedule Difficulty Index for the NBA using only public data. The goal is to quantify how schedule structure, such as back-to-backs, 4-in-6 stretches, rest-day differential, travel/load markers, and home/away context relates to game difficulty and team performance.

What to expect in this report
- Data: Publicly available schedules and results (no proprietary prompts or datasets).
- Features: Engineered schedule signals (B2B, 4-in-6, rest differential, optional travel load).
- Methods: Reproducible R workflow (tidyverse, tidymodels); cross-validation; baseline comparisons.
- Outputs: A simple Schedule Difficulty Index, risk windows over a season, and example visuals.
- Interpretation: Which schedule factors tend to elevate risk and how results can inform planning (rotations, recovery, practice).

Disclaimer: This is an independent, unaffiliated case study. It is not endorsed by any NBA team or organization. No employer prompts, datasets, or proprietary materials are used here.

logo

Setup and Data

library(tidyverse)
schedule <- read_csv("schedule.csv")
draft_schedule <- read_csv("schedule_24_partial.csv")
locations <- read_csv("locations.csv")
game_data <- read_csv("team_game_data.csv")

Part 1 – Schedule Analysis

Question 1

QUESTION: 4-in-6 Frequency. Identify every game that is the 4th in the past 6 nights for OKC using a sliding, date-based window. Overlaps are allowed. The result is a season-level count of compressed stretches, plus a list of flagged game dates that can be overlaid on the schedule timeline.

okc <- draft_schedule %>%
  filter(team == "OKC") %>%
  arrange(gamedate) %>%
  select(-season, - opponent, - home, - win)

okc_4in6 <- okc %>%
  mutate(games_in_6 = map_int(gamedate, ~ sum(gamedate >= (.x - days(5)) & gamedate <= .x)), #assigns a number 1-4 to identify which game is 4th game in 6 nights
    is_4in6 = (games_in_6 == 4)) #TRUE if 4th game in 6 nights span

sum(okc_4in6$is_4in6) #total number of 4-in-6 stretches
## [1] 26

ANSWER 1:

26 4-in-6 stretches in OKC’s draft schedule.

Question 2

QUESTION: Find the average number of 4-in-6 stretches for a team in a season from 2014-15 to 2023-24. Adjust each team/season to per-82 games before taking final average.

group_sch <- schedule %>%
  group_by(team, season) %>%
  arrange(gamedate) %>%
  select(season, gamedate, team)

#double check if schedule is now grouped into each team and season and accurately in chronological order
#check_sch <- group_sch %>%
#  filter(team == "OKC", season == "2018")
#check_sch <- check_sch %>%
#  mutate(games_in_6 = map_int(gamedate, ~ sum(gamedate >= (.x - days(5)) & gamedate <= .x)),
#    is_4in6 = (games_in_6 == 4)
#  )
#sum(check_sch$is_4in6)

group_sch <- group_sch %>%
  mutate(games_in_6 = map_int(gamedate, ~ sum(gamedate >= (.x - days(5)) & gamedate <= .x)), #assigns a number 1-4 to identify which game is 4th game in 6 nightd
    is_4in6 = (games_in_6 == 4)) %>% #TRUE if 4th game in 6 nights span
  summarise(
    four_in_six = sum(is_4in6), #total number of 4-in-6 stretches
    num_play_games = n(), #number of games played in each season
    per82 = (four_in_six/num_play_games) * 82, #adjust to per-82 games
    .groups = "drop"
  )

avg_4in6 <-mean(group_sch$per82) #sum up per82 columns then divide by tibble row size 300
avg_4in6 #final average
## [1] 25.09998

ANSWER 2:

25.1 4-in-6 stretches on average.

Question 3

QUESTION: Out of the 30 NBA teams, find the highest and lowest average number of 4-in-6 stretches between 2014-15 and 2023-24. Adjust each team/season to per-82 games.

two_seasons <- group_sch %>% 
  filter(season == 2014 | season == 2023)
two_seasons
## # A tibble: 60 × 5
##    team  season four_in_six num_play_games per82
##    <chr>  <dbl>       <int>          <int> <dbl>
##  1 ATL     2014          32             82    32
##  2 ATL     2023          25             82    25
##  3 BKN     2014          30             82    30
##  4 BKN     2023          24             82    24
##  5 BOS     2014          37             82    37
##  6 BOS     2023          25             82    25
##  7 CHA     2014          38             82    38
##  8 CHA     2023          28             82    28
##  9 CHI     2014          29             82    29
## 10 CHI     2023          23             82    23
## # ℹ 50 more rows
two_seasons %>%
  slice_max(per82, n = 1) #find max per82 value and prints the whole row
## # A tibble: 1 × 5
##   team  season four_in_six num_play_games per82
##   <chr>  <dbl>       <int>          <int> <dbl>
## 1 CHA     2014          38             82    38
two_seasons %>%
  slice_min(per82, n = 1) #find min per82 value and prints the whole row
## # A tibble: 1 × 5
##   team  season four_in_six num_play_games per82
##   <chr>  <dbl>       <int>          <int> <dbl>
## 1 DEN     2023          15             82    15

ANSWER 3:

  • Most 4-in-6 stretches on average: CHA (38)
  • Fewest 4-in-6 stretches on average: DEN (15)

Question 4

QUESTION: Based on 2023-2024 season, find BKN’s defensive eFG% and their defensive eFG% that season in situations where their opponent was on the second night of back-to-back.

bkn_2023 <- game_data %>%
  filter(def_team == "BKN", season == 2023) %>%
  arrange(gamedate) %>%
  select(fgmade, fgattempted, fg3made)

def_eFG <- bkn_2023 %>%
  summarise(
    fgm = sum(fgmade), #total fg made for opponents vs BKN in 2023 season
    fg3m = sum(fg3made), #total 3 pointers made for opponents vs BKN in 2023 season
    fga = sum(fgattempted) #total fg attempted for opponents vs BKN in 2023 season
  ) %>%
  mutate(def_eFG = (fgm + 0.5 * fg3m)/fga) #eFG% formula: (FGM + 0.5 * 3PM) / FGA
def_eFG
## # A tibble: 1 × 4
##     fgm  fg3m   fga def_eFG
##   <dbl> <dbl> <dbl>   <dbl>
## 1  3410  1066  7255   0.543
nba_sch <- game_data %>%
  filter(season == 2023) %>%
  group_by(off_team) %>%
  arrange(gamedate) %>%
  select(gamedate, off_team, def_team, fgmade, fgattempted, fg3made)

#double check if schedule is now grouped into each team and season and accurately in chronological order
#check_sch <- nba_sch %>%
#  filter(off_team == "OKC")
#check_sch <- check_sch %>%
#  mutate(games_b2b = map_int(gamedate, ~ sum(gamedate >= (.x - days(1)) & gamedate <= .x)),
#    is_b2b = (games_b2b == 2)
#  )
#sum(check_sch$is_b2b)

b2b <- nba_sch %>%
  mutate(games_b2b = map_int(gamedate, ~ sum(gamedate >= (.x - days(1)) & gamedate <= .x)), #assigns a 1 if game is not 2nd night of a b2b and 2 if game is a 2nd night of a b2b
    is_b2b = (games_b2b == 2) #TRUE if 2th game in 2 nights span
)

opp_b2b <- b2b %>%
  ungroup() %>%
  filter(is_b2b == "TRUE", def_team == "BKN") %>%
  summarise(
    fgm = sum(fgmade), #total fg made for opponents on 2nd night of b2b vs BKN in 2023 season
    fg3m = sum(fg3made), #total 3 pointers made for opponents on 2nd night of b2b vs BKN in 2023 season
    fga = sum(fgattempted), #total fg attempted for opponents on 2nd night of b2b vs BKN in 2023 season
  ) %>%
  mutate(def_eFG = (fgm + 0.5 * fg3m)/fga) #eFG% formula: (FGM + 0.5 * 3PM) / FGA
opp_b2b
## # A tibble: 1 × 4
##     fgm  fg3m   fga def_eFG
##   <dbl> <dbl> <dbl>   <dbl>
## 1   650   217  1418   0.535

ANSWER 4:

  • BKN Defensive eFG%: 54.3%
  • When opponent on a B2B: 53.5%

Part 3 – Modeling

Question 8

QUESTION: Estimate how many more/fewer regular season wins each team has had due to schedule-related factors from 2019-20 though 2023-24. May consider the on-court strength of the scheduled opponents as well as the impact of travel/schedule density. Find the teams and estimates that benefited and struggled most from the schedule-related factors.

#schedule-related factors
# 1) Team on 2nd-of-B2B (−)
# 2) Team 6-night density overload (over 2) (−)
# 3) Rest differential (+ or -)
# 4) Opponent on 2nd-of-B2B (+)  <- not needed anymore
# 5) Opponent 6-night density overload (over 2) (+)  <- not needed anymore

#merit based for true win projection
# 1) fg%
# 2) ft%
# 3) OREB%
# 4) DREB%
# 5) net rtg
# 6) ast%
# 7) TOV%
# 8) win percentage of previous season <- not doing this anymore
# 9) track record percentage of previous season <- not doing this anymore

#output: y = m1x1 + m2x2 + ... + s1x3 + s2x4 + ...
#m1, m2: Coefficients for merit based features
#s1, s2: Coefficients for schedule based factors
#x1, x2: Merit based factors
#x3, x4: Schedule based factors
#y: Target (# games won by a team in a given season)


#Step 1: Fit model to predict y (linear regression) using actual data on merit and schedule (don't set schedule features to 0 yet)
#Step 2: Using the model (coefficients) obtained above, replace m1, m2, s1, s2,.. and set x3, x4,.. to 0. Then, get new y
#Step 3: Simplify table to 30 rows for teams and find each team's avg across all games of all seasons the subtract actual and predicted to get fewer/more wins
#Step 1
#schedule-related factors
sched_team <- game_data %>%
  filter(season %in% 2019:2023) %>%
  group_by(off_team, season) %>%
  arrange(gamedate) %>%
  mutate(
    team_rest_days = as.integer(gamedate - lag(gamedate)), #calculates number of rest days before team plays again
    team_b2b = map_int(gamedate, ~ sum(gamedate >= (.x - days(1)) & gamedate <= .x)), #assigns a 1 if game is not 2nd night of a b2b and 2 if game is a 2nd night of a b2b
    is_team_b2b = (team_b2b == 2), #TRUE if 2th game in 2 nights span
    team_in_6 = map_int(gamedate, ~ sum(gamedate >= (.x - days(5)) & gamedate <= .x)), #assigns a number to identify which game in 6 nights span
    total_win = sum(off_win) #total wins of that season for corresponding team
  ) %>%
  ungroup()

#mirror factors for opponents
opp_feats <- sched_team %>%
  transmute(
    season, gamedate,
    def_team = off_team,
    opp_rest_days = team_rest_days, #opponent's number of rest days in between
    is_opp_b2b = is_team_b2b, #check if opponent is playing on a b2b
    opp_in_6 = team_in_6 #check what game in 6 nights span for opponent
  )

sched_based <- sched_team %>%
  left_join(opp_feats, by = c("season", "gamedate", "def_team")) %>%
  mutate(
    team_rest_days = replace_na(team_rest_days, 0), #replace any NAs with 0
    opp_rest_days  = replace_na(opp_rest_days, 0), #replace any NAs with 0
    rest_diff = team_rest_days - opp_rest_days #rest difference between team and opponent
  )
#merit-based factors
merit_team <- game_data %>%
  filter(season %in% 2019:2023) %>%
  group_by(off_team, season) %>%
  arrange(gamedate) %>%
  mutate(
    # 1) fg%
    fg_per = fgmade/fgattempted,
    # 2) ft%
    ft_per = ftmade/ftattempted,
    # 3) OREB%
    oreb_per = reboffensive/reboundchance,
    # 4) DREB%
    dreb_per = rebdefensive/reboundchance,
    # 5) ORTG%
    ortg = (points/possessions) * 100, 
    # 6) ast%
    ast_per = assists/fgmade,
    # 7) TOV%
    tov_per = (turnovers/possessions) * 100
  ) %>%
  ungroup() 
  
#mirror factors for opponents
opp_feats <- merit_team %>%
  transmute(
    season, gamedate,
    def_team = off_team,
    drtg = ortg #finds team's defensive rating
  )

merit_based <- merit_team %>%
  left_join(opp_feats, by = c("season", "gamedate", "def_team")) %>%
  mutate(
    ft_per = replace_na(ft_per, 0), #replace NAs with 0
    netrtg = ortg - drtg #calculates difference of ortg and drtg for team net rating
  )
#combine into one big table of 11,658 games from 2019-2023 season
#decided not to include oreb_per and netrtg due to similar overlaps with other predictors (multicollinearity)
combined <- sched_based %>%
  select(off_team, season, gamedate, off_win, team_b2b, team_in_6, rest_diff) %>%
  left_join(merit_based %>%
    select(off_team, season, gamedate, off_win, fg_per, ft_per, dreb_per, ast_per, tov_per),
    by = c("off_team", "season", "gamedate", "off_win")
)
combined
## # A tibble: 11,658 × 12
##    off_team season gamedate   off_win team_b2b team_in_6 rest_diff fg_per ft_per
##    <chr>     <dbl> <date>       <dbl>    <int>     <int>     <int>  <dbl>  <dbl>
##  1 LAC        2019 2019-10-22       1        1         1         0  0.519  0.708
##  2 LAL        2019 2019-10-22       0        1         1         0  0.435  0.714
##  3 NOP        2019 2019-10-22       0        1         1         0  0.422  0.85 
##  4 TOR        2019 2019-10-22       1        1         1         0  0.408  0.842
##  5 DEN        2019 2019-10-23       1        1         1         0  0.420  0.815
##  6 POR        2019 2019-10-23       0        1         1         0  0.414  0.913
##  7 PHX        2019 2019-10-23       1        1         1         0  0.5    0.833
##  8 SAC        2019 2019-10-23       0        1         1         0  0.391  0.654
##  9 OKC        2019 2019-10-23       0        1         1         0  0.386  0.677
## 10 UTA        2019 2019-10-23       1        1         1         0  0.444  0.7  
## # ℹ 11,648 more rows
## # ℹ 3 more variables: dreb_per <dbl>, ast_per <dbl>, tov_per <dbl>
#OLS linear regression model
model <- lm(off_win ~ team_b2b + team_in_6 + rest_diff + fg_per + ft_per + dreb_per + ast_per + tov_per, data = combined)
summary(model)
## 
## Call:
## lm(formula = off_win ~ team_b2b + team_in_6 + rest_diff + fg_per + 
##     ft_per + dreb_per + ast_per + tov_per, data = combined)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1591 -0.3762 -0.0018  0.3761  1.0761 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.874038   0.068993 -12.669  < 2e-16 ***
## team_b2b    -0.033475   0.011116  -3.011  0.00261 ** 
## team_in_6   -0.014841   0.006009  -2.470  0.01354 *  
## rest_diff    0.002591   0.001423   1.820  0.06875 .  
## fg_per       4.197967   0.072964  57.535  < 2e-16 ***
## ft_per       0.484024   0.039535  12.243  < 2e-16 ***
## dreb_per    -1.125909   0.057206 -19.682  < 2e-16 ***
## ast_per      0.261095   0.042712   6.113 1.01e-09 ***
## tov_per     -0.015933   0.001079 -14.765  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.432 on 11649 degrees of freedom
## Multiple R-squared:  0.2541, Adjusted R-squared:  0.2536 
## F-statistic: 496.1 on 8 and 11649 DF,  p-value: < 2.2e-16

To quantify how the two different types of predictors, schedule-related and team “merit”, relates to a team’s chance of winning a game, I fit an OLS linear regression on 11, 658 games (2019-2023 season) with the response variable off_win. My schedule-related predictors were team_b2b, team_in_6, and rest_diff. As for team “merit” predictors, they are fg_per, ft_per, dreb_per, ast_per, and tov_per. Looking at the model, all but one predictor, rest_diff, are statistically significant with p-values less than 0.05, and the overall F-test is also statistically significant with p-value less than 2.2e-16, indicating the predictors are meaningful to our model. However, I noticed the R-squared value is relatively low (0.2541) so the predictors don’t explain much of the variation in the response variable. These two observations somewhat contradict with each other, which brings some concerns that we should take this model with caution. With the coefficients from the model, I am able to create a similar equation to predict a team’s chance of winning a game, but this time, I removed the schedule-related predictors. That way, I am able to take the difference between the actual and predicted to get a difference value for each team to see who benefitted most and least from the schedule-related factors.

#Step 2
#substitute coefficients from model into similar equation to predict team's chance of winning a game
#remove schedule-related predictors since we want to take difference of actual vs predicted to see the impact of schedule-related predictors
combined <- combined %>%
  mutate(win_prob = -0.874038 + (4.197967*fg_per) + (0.484024*ft_per) + (-1.125909*dreb_per) + (0.261095*ast_per) + (-0.015933*tov_per),
         pred_win = round(win_prob))
combined %>% select(off_team, season, off_win, win_prob, pred_win)
## # A tibble: 11,658 × 5
##    off_team season off_win win_prob pred_win
##    <chr>     <dbl>   <dbl>    <dbl>    <dbl>
##  1 LAC        2019       1   0.722         1
##  2 LAL        2019       0   0.381         0
##  3 NOP        2019       0   0.401         0
##  4 TOR        2019       1   0.475         0
##  5 DEN        2019       1   0.324         0
##  6 POR        2019       0   0.314         0
##  7 PHX        2019       1   0.754         1
##  8 SAC        2019       0   0.0174        0
##  9 OKC        2019       0   0.0727        0
## 10 UTA        2019       1   0.355         0
## # ℹ 11,648 more rows
#Step 3
#simplify table into 30 rows (30 teams) with total wins, total predicted wins, and difference across all games of all season
team_group <- combined %>%
  group_by(off_team) %>%
  summarise(
    total_win = sum(off_win), #total wins across all games of all seasons
    total_pred_win = sum(pred_win), #total predicted wins across all games of all seasons
    true_diff = total_win - total_pred_win, #difference between actual and predicted
    .groups = "drop"
  )
team_group 
## # A tibble: 30 × 4
##    off_team total_win total_pred_win true_diff
##    <chr>        <dbl>          <dbl>     <dbl>
##  1 ATL            181            266       -85
##  2 BKN            204            259       -55
##  3 BOS            256            266       -10
##  4 CHA            147            205       -58
##  5 CHI            178            250       -72
##  6 CLE            184            245       -61
##  7 DAL            225            246       -21
##  8 DEN            251            303       -52
##  9 DET             94            194      -100
## 10 GSW            197            238       -41
## # ℹ 20 more rows
team_group %>%
  slice_max(true_diff, n = 1) #find max true_diff value and prints the whole row
## # A tibble: 1 × 4
##   off_team total_win total_pred_win true_diff
##   <chr>        <dbl>          <dbl>     <dbl>
## 1 MIL            260            255         5
team_group %>%
  slice_min(true_diff, n = 1) #find min true_diff value and prints the whole row
## # A tibble: 1 × 4
##   off_team total_win total_pred_win true_diff
##   <chr>        <dbl>          <dbl>     <dbl>
## 1 WAS            144            249      -105

ANSWER 8:

  • Most Helped by Schedule: MIL (+5 wins)
  • Most Hurt by Schedule: WAS (-105 wins)