shot_logs <- read.csv("C:/Users/13177/OneDrive/Stats for Data Science/filtered_shot_logs.csv")
head(shot_logs)
## GAME_ID MATCHUP LOCATION W.L FINAL_MARGIN SHOT_NUMBER
## 1 21400899 MAR 04, 2015 - CHA @ BKN A W 24 1
## 2 21400899 MAR 04, 2015 - CHA @ BKN A W 24 2
## 3 21400899 MAR 04, 2015 - CHA @ BKN A W 24 3
## 4 21400899 MAR 04, 2015 - CHA @ BKN A W 24 4
## 5 21400899 MAR 04, 2015 - CHA @ BKN A W 24 5
## 6 21400899 MAR 04, 2015 - CHA @ BKN A W 24 6
## PERIOD GAME_CLOCK SHOT_CLOCK DRIBBLES TOUCH_TIME SHOT_DIST PTS_TYPE
## 1 1 1:09 10.8 2 1.9 7.7 2
## 2 1 0:14 3.4 0 0.8 28.2 3
## 3 1 0:00 NA 3 2.7 10.1 2
## 4 2 11:47 10.3 2 1.9 17.2 2
## 5 2 10:34 10.9 2 2.7 3.7 2
## 6 2 8:15 9.1 2 4.4 18.4 2
## SHOT_RESULT CLOSEST_DEFENDER CLOSEST_DEFENDER_PLAYER_ID CLOSE_DEF_DIST FGM
## 1 made Anderson, Alan 101187 1.3 1
## 2 missed Bogdanovic, Bojan 202711 6.1 0
## 3 missed Bogdanovic, Bojan 202711 0.9 0
## 4 missed Brown, Markel 203900 3.4 0
## 5 missed Young, Thaddeus 201152 1.1 0
## 6 missed Williams, Deron 101114 2.6 0
## PTS player_name player_id
## 1 2 brian roberts 203148
## 2 0 brian roberts 203148
## 3 0 brian roberts 203148
## 4 0 brian roberts 203148
## 5 0 brian roberts 203148
## 6 0 brian roberts 203148
The original shot logs dataset contains a GAME_CLOCK
column with clock time remaining in the period. Since there’s no full
game date, we’ll simulate a date structure using a made-up date plus
period to show change over time.
#Simulate a game date column#
shot_logs <- shot_logs |>
mutate(GAME_DATE = as.Date("2023-01-01") + as.numeric(PERIOD - 1),
GAME_TIME = as.numeric(ms(GAME_CLOCK)),
TOTAL_SECONDS_LEFT = (PERIOD - 1) * 720 + GAME_TIME)
We will analyze shot success (FGM) over time (by
game period), aggregating by the fake GAME_DATE
to simulate
a time-based trend.
#Aggregate average FGM per fake date (really per period)#
daily_summary <- shot_logs |>
group_by(GAME_DATE) |>
summarize(fgm_rate = mean(as.numeric(as.character(FGM)), na.rm = TRUE))
ts_data <- daily_summary |>
as_tsibble(index = GAME_DATE)
autoplot(ts_data, fgm_rate) +
labs(title = "FGM Rate Over Time (by Game Period)", x = "Game Date", y = "FGM Rate")
ts_lm <- ts_data |>
model(lm = TSLM(fgm_rate ~ trend()))
ts_lm |> report()
## Series: fgm_rate
## Model: TSLM
##
## Residuals:
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.481699 0.019200 25.088 1.88e-06 ***
## trend() -0.012942 0.004293 -3.014 0.0296 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02272 on 5 degrees of freedom
## Multiple R-squared: 0.6451, Adjusted R-squared: 0.5741
## F-statistic: 9.087 on 1 and 5 DF, p-value: 0.029606
#Fit the model#
model_fit <- ts_data |> model(lm = TSLM(fgm_rate ~ trend()))
#Add fitted values to the original data#
model_augmented <- model_fit |> augment()
#Plot the original series and the trend#
ggplot(model_augmented, aes(x = GAME_DATE)) +
geom_line(aes(y = fgm_rate), color = "blue", size = 1) +
geom_line(aes(y = .fitted), color = "red", linetype = "dashed") +
labs(
title = "FGM Rate with Linear Trend",
x = "Game Date",
y = "FGM Rate",
caption = "Red dashed line = trend"
) +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ts_data |>
model(SMOOTH = ETS(fgm_rate)) |>
components() |>
autoplot() +
labs(title = "Seasonality Detected via ETS")
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
ts_data |>
ACF(fgm_rate) |>
autoplot() +
labs(title = "Autocorrelation of FGM Rate")
GAME_DATE
) and
tracked FGM% as a response.