knitr::opts_chunk$set(echo = TRUE,
fig.align = "center")
# Loading in the necessary packages
pacman::p_load(tidyverse, broom)
# Reading in the data set
fg <- read.csv("field_goals_full.csv")
# Changing roof, result, and field into factors:
fg <-
fg |>
mutate(
across(
.cols = c(roof, result, field),
.fns = factor
),
field = relevel(field, ref = "stadium")
)
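To confirm the releveling worked, a quick check (output not shown) is to list the factor levels; the reference level should now appear first:
# "stadium" should be the first level listed after the relevel:
levels(fg$field)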
Start by summarizing the data into a form that can be used to fit the logistic regression model:
# Summarizing the data first:
fg_sum <-
fg |>
# Counts for each kick distance (X), field (Z), and result (Y) combination:
count(kick_distance, field, result) |>
# The sample size and probability a kick is successful at each XZ combo:
mutate(
.by = c(kick_distance, field),
dist_field_n = sum(n),
success_rate = n/dist_field_n
) |>
# Only keeping the made kicks, and only distance-field combos with more than 10 attempts
filter(
result == "made",
dist_field_n > 10
) |>
dplyr::select(-result)
fg_sum
## kick_distance field n dist_field_n success_rate
## 1 19 stadium 64 64 1.0000000
## 2 19 dome 26 26 1.0000000
## 3 20 stadium 145 145 1.0000000
## 4 20 dome 43 43 1.0000000
## 5 21 stadium 137 139 0.9856115
## 6 21 dome 63 63 1.0000000
## 7 22 stadium 176 183 0.9617486
## 8 22 dome 57 57 1.0000000
## 9 23 stadium 197 200 0.9850000
## 10 23 dome 72 72 1.0000000
## 11 24 stadium 155 161 0.9627329
## 12 24 dome 48 52 0.9230769
## 13 25 stadium 179 181 0.9889503
## 14 25 dome 41 42 0.9761905
## 15 26 stadium 174 175 0.9942857
## 16 26 dome 36 38 0.9473684
## 17 27 stadium 168 176 0.9545455
## 18 27 dome 68 68 1.0000000
## 19 28 stadium 194 199 0.9748744
## 20 28 dome 63 68 0.9264706
## 21 29 stadium 166 176 0.9431818
## 22 29 dome 72 74 0.9729730
## 23 30 stadium 192 205 0.9365854
## 24 30 dome 59 60 0.9833333
## 25 31 stadium 196 204 0.9607843
## 26 31 dome 58 59 0.9830508
## 27 32 stadium 177 188 0.9414894
## 28 32 dome 65 69 0.9420290
## 29 33 stadium 231 243 0.9506173
## 30 33 dome 72 73 0.9863014
## 31 34 stadium 172 196 0.8775510
## 32 34 dome 51 56 0.9107143
## 33 35 stadium 192 209 0.9186603
## 34 35 dome 57 63 0.9047619
## 35 36 stadium 187 214 0.8738318
## 36 36 dome 61 67 0.9104478
## 37 37 stadium 174 206 0.8446602
## 38 37 dome 66 74 0.8918919
## 39 38 stadium 205 236 0.8686441
## 40 38 dome 73 83 0.8795181
## 41 39 stadium 189 216 0.8750000
## 42 39 dome 57 63 0.9047619
## 43 40 stadium 189 220 0.8590909
## 44 40 dome 68 77 0.8831169
## 45 41 stadium 166 201 0.8258706
## 46 41 dome 59 69 0.8550725
## 47 42 stadium 174 210 0.8285714
## 48 42 dome 75 87 0.8620690
## 49 43 stadium 195 242 0.8057851
## 50 43 dome 66 82 0.8048780
## 51 44 stadium 169 214 0.7897196
## 52 44 dome 52 61 0.8524590
## 53 45 stadium 183 235 0.7787234
## 54 45 dome 51 63 0.8095238
## 55 46 stadium 180 232 0.7758621
## 56 46 dome 46 60 0.7666667
## 57 47 stadium 162 214 0.7570093
## 58 47 dome 51 64 0.7968750
## 59 48 stadium 193 278 0.6942446
## 60 48 dome 74 95 0.7789474
## 61 49 stadium 159 215 0.7395349
## 62 49 dome 57 74 0.7702703
## 63 50 stadium 134 181 0.7403315
## 64 50 dome 51 73 0.6986301
## 65 51 stadium 117 169 0.6923077
## 66 51 dome 47 65 0.7230769
## 67 52 stadium 107 174 0.6149425
## 68 52 dome 44 61 0.7213115
## 69 53 stadium 115 176 0.6534091
## 70 53 dome 52 73 0.7123288
## 71 54 stadium 68 110 0.6181818
## 72 54 dome 37 50 0.7400000
## 73 55 stadium 43 73 0.5890411
## 74 55 dome 26 47 0.5531915
## 75 56 stadium 28 51 0.5490196
## 76 56 dome 14 21 0.6666667
## 77 57 stadium 13 30 0.4333333
## 78 57 dome 9 15 0.6000000
## 79 58 stadium 8 24 0.3333333
## 80 58 dome 8 14 0.5714286
## 81 59 stadium 9 12 0.7500000
The scatterplot below displays the success rate against kicking distance, colored by roof type:
gg_fg <-
ggplot(
data = fg_sum,
mapping = aes(
x = kick_distance,
y = success_rate,
color = field
)
) +
geom_point(alpha = 0.5) +
theme_bw() +
labs(
x = "Kicking Distance (yards)",
y = "Success Rate",
color = "Roof Type"
) +
scale_y_continuous(labels = scales::label_percent()) +
theme(
legend.position = 'inside',
legend.position.inside = c(0.1,0.2)
)
gg_fg
To fit an additive model, we include both explanatory variables in
the formula with a + between them:
fg_glm_add <-
glm(
# The response is the proportion made; the weights supply the number of attempts
formula = success_rate ~ kick_distance + field,
weights = dist_field_n,
family = binomial,
data = fg_sum
)
# The intercept and slopes for the log-odds scale:
tidy(fg_glm_add)
## # A tibble: 3 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 5.98 0.165 36.2 6.35e-287
## 2 kick_distance -0.104 0.00368 -28.3 1.32e-175
## 3 fielddome 0.237 0.0720 3.29 9.86e- 4
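Plugging the rounded estimates into the additive model (where x is the kick distance in yards and z = 1 for a dome, 0 for a stadium) gives the fitted equation on the log-odds scale:
\[\log \left( \frac{\hat{\pi}}{1-\hat{\pi}} \right) = 5.98 - 0.104x + 0.237z\]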
# The intercept and slopes for the odds scale:
tidy(fg_glm_add, exponentiate = TRUE)
## # A tibble: 3 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 397. 0.165 36.2 6.35e-287
## 2 kick_distance 0.901 0.00368 -28.3 1.32e-175
## 3 fielddome 1.27 0.0720 3.29 9.86e- 4
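On the odds scale, each additional yard multiplies the odds of a successful kick by about 0.90 (roughly a 10% drop per yard), and kicking in a dome multiplies the odds by about 1.27 at any fixed distance. As a quick sanity check, the sketch below converts the fitted log-odds back to probabilities by hand for a 45-yard attempt (the distance is just an illustrative choice, not part of the original analysis):
# Fitted log-odds for a 45-yard attempt in a stadium vs. a dome:
lo_stadium <- coef(fg_glm_add)[["(Intercept)"]] + 45 * coef(fg_glm_add)[["kick_distance"]]
lo_dome    <- lo_stadium + coef(fg_glm_add)[["fielddome"]]
# plogis() is the inverse logit: exp(x) / (1 + exp(x))
plogis(c(stadium = lo_stadium, dome = lo_dome))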
Let’s add the predicted probability lines to the original scatterplot:
# Adding the predicted probabilities to the data set:
fg_sum <-
fg_sum |>
add_column(
add_prob = predict(object = fg_glm_add, type = "response")
)
# Now we can add the lines to the graph, assuming the lines are "parallel"
gg_fg +
geom_line(
data = fg_sum,
mapping = aes(y = add_prob),
show.legend = FALSE
)
If you want to fit an interaction model between X and Z, you just
need to change the + to a * in the formula
argument of glm():
fg_glm_int <-
glm(
formula = success_rate ~ kick_distance * field,
weights = dist_field_n,
family = binomial,
data = fg_sum
)
tidy(fg_glm_int, exponentiate = TRUE)
## # A tibble: 4 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 402. 0.188 31.8 2.88e-222
## 2 kick_distance 0.901 0.00421 -24.7 5.20e-135
## 3 fielddome 1.21 0.394 0.476 6.34e- 1
## 4 kick_distance:fielddome 1.00 0.00864 0.128 8.98e- 1
Let’s compare how different the lines are by adding the predicted lines to the scatterplot again, but this time for the interaction model:
# Adding the predicted probabilities to the data set:
fg_sum <-
fg_sum |>
mutate(
int_prob = predict(object = fg_glm_int, type = "response")
)
# Now we can add the lines to the graph, not assuming the lines are "parallel"
gg_fg +
geom_line(
data = fg_sum,
mapping = aes(y = int_prob),
show.legend = FALSE
)
Since the factor (field) is binary, we can just conduct a z-test (a Wald test) for the interaction coefficient using its estimate and standard error:
\[H_0: \beta_{12} = 0 \\ H_a: \beta_{12} \ne 0\]
tidy(fg_glm_int, exponentiate = TRUE) |>
  mutate(
    across(
      .cols = where(is.numeric),
      # An anonymous function avoids passing extra arguments through across()'s
      # `...`, which is deprecated as of dplyr 1.1.0
      .fns = \(x) round(x, digits = 4)
    )
  )
## # A tibble: 4 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 402. 0.188 31.8 0
## 2 kick_distance 0.901 0.0042 -24.7 0
## 3 fielddome 1.21 0.394 0.476 0.634
## 4 kick_distance:fielddome 1.00 0.0086 0.128 0.898
With a p-value of almost 0.9, we don’t have evidence that the interaction is needed!
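If we also want an interval estimate for the interaction odds ratio, tidy() can return confidence limits alongside the exponentiated estimates (a quick sketch; for a glm these are profile-likelihood intervals by default):
# Odds-ratio estimates with confidence intervals:
tidy(fg_glm_int, exponentiate = TRUE, conf.int = TRUE)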
What if the factor isn’t binary? Then we can’t conduct a simple z-test and need to compare deviances instead! The easiest way is with car::Anova(mod, type = 3):
car::Anova(mod = fg_glm_int, type = 3)
## Analysis of Deviance Table (Type III tests)
##
## Response: success_rate
## LR Chisq Df Pr(>Chisq)
## kick_distance 781.67 1 <2e-16 ***
## field 0.23 1 0.6325
## kick_distance:field 0.02 1 0.8978
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Since field is binary, we end up with essentially the same result as our simple z-test!
The p-values for both terms involving field are non-significant; does that mean we can conclude that field isn’t associated with the probability a field goal is successful?
NO! (maybe a little overly dramatic, but still, no)
If we want to use the interaction model to test whether field is associated with field goal success (after accounting for kick distance), we can compare its deviance to that of a glm without field at all!
\[\log \left( \frac{\pi}{1-\pi} \right) = \alpha + \beta_1x + \beta_2z + \beta_{12}xz\] vs
\[\log \left( \frac{\pi}{1-\pi} \right) = \alpha + \beta_1x\]
Which is a hypothesis test of:
\[H_0: \beta_2 = \beta_{12} = 0 \\ H_a: \text{at least one of } \beta_2, \beta_{12} \ne 0\]
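The test statistic is the drop in residual deviance between the reduced (distance-only) and full (interaction) models,
\[G = D_{\text{reduced}} - D_{\text{full}},\]
which under the null hypothesis follows a chi-square distribution with 2 degrees of freedom (one for each parameter being tested).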
fg_glm_dist <-
glm(
formula = success_rate ~ kick_distance,
weights = dist_field_n,
family = binomial,
data = fg_sum
)
bind_rows(
.id = "model",
"distance" = glance(fg_glm_dist),
"distance*field" = glance(fg_glm_int)
) |>
dplyr::select(deviance, df.residual) |>
summarize(
G = abs(diff(deviance)),
df = abs(diff(df.residual))
) |>
mutate(
`p-val` = pchisq(q = G, df = df, lower.tail = F)
)
## # A tibble: 1 × 3
## G df `p-val`
## <dbl> <int> <dbl>
## 1 11.1 2 0.00381
Since the p-value is small, we have strong evidence that the field type is associated with the probability a field goal attempt is successful.
Alternatively, we can use anova(simple_model, complex_model, test = "Chisq") to do the test for us:
anova(
fg_glm_dist,
fg_glm_int,
test = "Chisq"
) # test = "LRT" will give the same result
## Analysis of Deviance Table
##
## Model 1: success_rate ~ kick_distance
## Model 2: success_rate ~ kick_distance * field
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 79 106.120
## 2 77 94.978 2 11.143 0.003806 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
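Since the interaction term didn’t appear to be needed, a natural follow-up (a sketch only; output not shown) is to run the same deviance comparison against the additive model instead:
# Comparing the distance-only model to the additive model:
anova(
  fg_glm_dist,
  fg_glm_add,
  test = "Chisq"
)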