This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
Question 1 A)
library(readxl)
# Load data
data <- read_excel("ps22025.xlsx", skip = 1)
## New names:
## • `0` -> `0...7`
## • `1` -> `1...8`
## • `1` -> `1...9`
## • `` -> `...10`
## • `0` -> `0...11`
## • `0` -> `0...12`
## • `1` -> `1...13`
## • `0` -> `0...14`
## • `0` -> `0...15`
## • `0` -> `0...16`
# Print column names to confirm they match what we expect
cat("Column names:\n")
## Column names:
print(colnames(data))
## [1] "Wolfsburg" "Hoffenheim" "-"
## [4] "Left" "3" "Defended"
## [7] "0...7" "1...8" "1...9"
## [10] "...10" "0...11" "0...12"
## [13] "1...13" "0...14" "0...15"
## [16] "0...16" "-16.666666030883789" "18.333333969116211"
## [19] "11" "18"
# Preview first few rows
head(data)
## # A tibble: 6 × 20
## Wolfsburg Hoffenheim `-` Left `3` Defended `0...7` `1...8` `1...9` ...10
## <chr> <chr> <chr> <chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 RB Leipzig Stuttgart - Left 2 Miss 0 0 1 NA
## 2 Koln Hoffenheim - Head 3 Miss 0 0 1 NA
## 3 Mainz 05 Hoffenheim - Right 1 Goal 1 0 0 1
## 4 Hoffenheim Eintracht… - Head 4 Miss 0 0 1 NA
## 5 Borussia … Hoffenheim - Right 2 Goal 1 0 0 1
## 6 Wolfsburg RB Leipzig - Left 3 Miss 0 0 1 NA
## # ℹ 10 more variables: `0...11` <dbl>, `0...12` <dbl>, `1...13` <dbl>,
## # `0...14` <dbl>, `0...15` <dbl>, `0...16` <dbl>,
## # `-16.666666030883789` <dbl>, `18.333333969116211` <dbl>, `11` <dbl>,
## # `18` <dbl>
colnames(data) <- c("hometeam_team1", "awayteam_team2", "player", "bodypart", "defpressure",
"outcome", "goal", "blocked", "blockmiss", "finish",
"DefPress1", "DefPress2", "DefPress3", "DefPress4", "DefPress5",
"head", "location_y", "location_x", "awayid", "homeid")
# Confirm names after forced rename
print(colnames(data))
## [1] "hometeam_team1" "awayteam_team2" "player" "bodypart"
## [5] "defpressure" "outcome" "goal" "blocked"
## [9] "blockmiss" "finish" "DefPress1" "DefPress2"
## [13] "DefPress3" "DefPress4" "DefPress5" "head"
## [17] "location_y" "location_x" "awayid" "homeid"
# Proceed to regression
logit_model <- glm(goal ~ DefPress1 + DefPress2 + DefPress3 + DefPress4 + DefPress5,
data = data,
family = binomial(link = "logit"))
summary(logit_model)
##
## Call:
## glm(formula = goal ~ DefPress1 + DefPress2 + DefPress3 + DefPress4 +
## DefPress5, family = binomial(link = "logit"), data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.61496 0.59810 -4.372 1.23e-05 ***
## DefPress1 1.14367 0.60469 1.891 0.0586 .
## DefPress2 0.56307 0.60827 0.926 0.3546
## DefPress3 0.56033 0.60727 0.923 0.3562
## DefPress4 0.19459 0.61216 0.318 0.7506
## DefPress5 -0.07967 0.64117 -0.124 0.9011
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2678.2 on 3679 degrees of freedom
## Residual deviance: 2622.2 on 3674 degrees of freedom
## AIC: 2634.2
##
## Number of Fisher Scoring iterations: 5
1b) Null hypothesis: β1 =β4 alternative: β1 is not equal to β4 (there may be a difference between dfpress1 and defprss 4)
# find coefficient estimates and standard errors
coef_summary <- summary(logit_model)$coefficients
# find values for DefPress1 and DefPress4
beta1 <- coef_summary["DefPress1", "Estimate"]
se1 <- coef_summary["DefPress1", "Std. Error"]
beta4 <- coef_summary["DefPress4", "Estimate"]
se4 <- coef_summary["DefPress4", "Std. Error"]
# Compute the test statistic
z_score <- (beta1 - beta4) / sqrt(se1^2 + se4^2)
# Compute p-value for two-sided test
p_value <- 2 * (1 - pnorm(abs(z_score)))
cat("Beta1 (DefPress1):", beta1, "\n")
## Beta1 (DefPress1): 1.143672
cat("SE1 (DefPress1):", se1, "\n")
## SE1 (DefPress1): 0.604694
cat("Beta4 (DefPress4):", beta4, "\n")
## Beta4 (DefPress4): 0.1945916
cat("SE4 (DefPress4):", se4, "\n")
## SE4 (DefPress4): 0.6121566
# Compute the test statistic
z_score <- (beta1 - beta4) / sqrt(se1^2 + se4^2)
cat("Z-score for H0: Beta1 = Beta4 is", z_score, "\n")
## Z-score for H0: Beta1 = Beta4 is 1.102993
# Compute p-value for two-sided test
p_value <- 2 * (1 - pnorm(abs(z_score)))
cat("P-value for H0: Beta1 = Beta4 is", p_value, "\n")
## P-value for H0: Beta1 = Beta4 is 0.2700303
1c)
# Calculate total distance using Pythagorean Theorem
data$TotalDistance <- sqrt(data$location_x^2 + data$location_y^2)
# Check a few rows to make sure it worked
head(data[, c("location_x", "location_y", "TotalDistance")])
## # A tibble: 6 × 3
## location_x location_y TotalDistance
## <dbl> <dbl> <dbl>
## 1 27.3 14.7 31.0
## 2 12.7 9.33 15.7
## 3 9.67 -6.67 11.7
## 4 8.33 8.67 12.0
## 5 10 -1.67 10.1
## 6 28.7 -14 31.9
# Logistic regression including TotalDistance
logit_model_distance <- glm(goal ~ DefPress1 + DefPress2 + DefPress3 + DefPress4 + DefPress5 + TotalDistance,
data = data,
family = binomial(link = "logit"))
# Show regression summary
summary(logit_model_distance)
##
## Call:
## glm(formula = goal ~ DefPress1 + DefPress2 + DefPress3 + DefPress4 +
## DefPress5 + TotalDistance, family = binomial(link = "logit"),
## data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.952926 0.613459 -1.553 0.12034
## DefPress1 1.876865 0.615079 3.051 0.00228 **
## DefPress2 1.258501 0.618371 2.035 0.04183 *
## DefPress3 1.162879 0.616958 1.885 0.05945 .
## DefPress4 0.513763 0.621121 0.827 0.40815
## DefPress5 0.065476 0.650315 0.101 0.91980
## TotalDistance -0.115917 0.007221 -16.052 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2678.2 on 3679 degrees of freedom
## Residual deviance: 2266.4 on 3673 degrees of freedom
## AIC: 2280.4
##
## Number of Fisher Scoring iterations: 6
# find coefficient table
coef_summary_dist <- summary(logit_model_distance)$coefficients
# find TotalDistance coefficient and standard error
beta_distance <- coef_summary_dist["TotalDistance", "Estimate"]
se_distance <- coef_summary_dist["TotalDistance", "Std. Error"]
# Compute z-score and p-value
z_distance <- beta_distance / se_distance
p_distance <- 2 * (1 - pnorm(abs(z_distance)))
# Output
cat("Hypothesis Test for Total Distance\n")
## Hypothesis Test for Total Distance
cat("----------------------------------\n")
## ----------------------------------
cat("Beta (TotalDistance):", beta_distance, "\n")
## Beta (TotalDistance): -0.1159174
cat("SE (TotalDistance):", se_distance, "\n")
## SE (TotalDistance): 0.007221185
cat("Z-score:", z_distance, "\n")
## Z-score: -16.05241
cat("P-value:", p_distance, "\n")
## P-value: 0
1d)
# Calculate angle to goal using atan2 (gives angle in radians)
data$ShotAngle <- atan2(data$location_y, data$location_x)
# double check first few rows to make sure it worked
head(data[, c("location_x", "location_y", "ShotAngle")])
## # A tibble: 6 × 3
## location_x location_y ShotAngle
## <dbl> <dbl> <dbl>
## 1 27.3 14.7 0.492
## 2 12.7 9.33 0.635
## 3 9.67 -6.67 -0.604
## 4 8.33 8.67 0.805
## 5 10 -1.67 -0.165
## 6 28.7 -14 -0.454
logit_model_angle <- glm(goal ~ DefPress1 + DefPress2 + DefPress3 + DefPress4 + DefPress5 +
TotalDistance + ShotAngle,
data = data,
family = binomial(link = "logit"))
# Print summary so you can see all coefficients
summary(logit_model_angle)
##
## Call:
## glm(formula = goal ~ DefPress1 + DefPress2 + DefPress3 + DefPress4 +
## DefPress5 + TotalDistance + ShotAngle, family = binomial(link = "logit"),
## data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.954695 0.613457 -1.556 0.11965
## DefPress1 1.878412 0.615050 3.054 0.00226 **
## DefPress2 1.260330 0.618361 2.038 0.04153 *
## DefPress3 1.163294 0.616880 1.886 0.05933 .
## DefPress4 0.514947 0.621070 0.829 0.40703
## DefPress5 0.066823 0.650278 0.103 0.91815
## TotalDistance -0.115899 0.007221 -16.051 < 2e-16 ***
## ShotAngle 0.017722 0.090661 0.195 0.84502
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2678.2 on 3679 degrees of freedom
## Residual deviance: 2266.4 on 3672 degrees of freedom
## AIC: 2282.4
##
## Number of Fisher Scoring iterations: 6
# Make sure logit_model_angle exists first (this chunk depends on the model being fit already)
coef_summary_angle <- summary(logit_model_angle)$coefficients
# Extract the ShotAngle coefficient and standard error
beta_angle <- coef_summary_angle["ShotAngle", "Estimate"]
se_angle <- coef_summary_angle["ShotAngle", "Std. Error"]
# Compute the test statistic (Z-score) and p-value
z_angle <- beta_angle / se_angle
p_angle <- 2 * (1 - pnorm(abs(z_angle)))
# Output results
cat("Hypothesis Test for Shot Angle\n")
## Hypothesis Test for Shot Angle
cat("------------------------------\n")
## ------------------------------
cat("Beta (ShotAngle):", beta_angle, "\n")
## Beta (ShotAngle): 0.01772212
cat("SE (ShotAngle):", se_angle, "\n")
## SE (ShotAngle): 0.09066147
cat("Z-score:", z_angle, "\n")
## Z-score: 0.1954757
cat("P-value:", p_angle, "\n")
## P-value: 0.8450206
if (p_angle < 0.05) {
cat("Conclusion: ShotAngle has a statistically significant effect on goal probability.\n")
} else {
cat("Conclusion: ShotAngle does not have a statistically significant effect on goal probability.\n")
}
## Conclusion: ShotAngle does not have a statistically significant effect on goal probability.
1e)
new_shot <- data.frame(
DefPress1 = 0,
DefPress2 = 0,
DefPress3 = 1,
DefPress4 = 0,
DefPress5 = 0,
TotalDistance = 12,
ShotAngle = 0
)
predicted_probability <- predict(logit_model_angle, newdata = new_shot, type = "response")
cat("Predicted Probability of Scoring:", predicted_probability, "\n")
## Predicted Probability of Scoring: 0.2346594
1f)
# Summary of Part (a) Model
cat("Model (a) - Defensive Pressure Only\n")
## Model (a) - Defensive Pressure Only
summary(logit_model)
##
## Call:
## glm(formula = goal ~ DefPress1 + DefPress2 + DefPress3 + DefPress4 +
## DefPress5, family = binomial(link = "logit"), data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.61496 0.59810 -4.372 1.23e-05 ***
## DefPress1 1.14367 0.60469 1.891 0.0586 .
## DefPress2 0.56307 0.60827 0.926 0.3546
## DefPress3 0.56033 0.60727 0.923 0.3562
## DefPress4 0.19459 0.61216 0.318 0.7506
## DefPress5 -0.07967 0.64117 -0.124 0.9011
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2678.2 on 3679 degrees of freedom
## Residual deviance: 2622.2 on 3674 degrees of freedom
## AIC: 2634.2
##
## Number of Fisher Scoring iterations: 5
cat("\nModel (a) Deviance:", summary(logit_model)$deviance, "\n")
##
## Model (a) Deviance: 2622.171
cat("Model (a) AIC:", AIC(logit_model), "\n")
## Model (a) AIC: 2634.171
# Summary of Part (c) Model (with TotalDistance)
cat("\nModel (c) - Defensive Pressure + TotalDistance\n")
##
## Model (c) - Defensive Pressure + TotalDistance
summary(logit_model_distance)
##
## Call:
## glm(formula = goal ~ DefPress1 + DefPress2 + DefPress3 + DefPress4 +
## DefPress5 + TotalDistance, family = binomial(link = "logit"),
## data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.952926 0.613459 -1.553 0.12034
## DefPress1 1.876865 0.615079 3.051 0.00228 **
## DefPress2 1.258501 0.618371 2.035 0.04183 *
## DefPress3 1.162879 0.616958 1.885 0.05945 .
## DefPress4 0.513763 0.621121 0.827 0.40815
## DefPress5 0.065476 0.650315 0.101 0.91980
## TotalDistance -0.115917 0.007221 -16.052 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2678.2 on 3679 degrees of freedom
## Residual deviance: 2266.4 on 3673 degrees of freedom
## AIC: 2280.4
##
## Number of Fisher Scoring iterations: 6
cat("\nModel (c) Deviance:", summary(logit_model_distance)$deviance, "\n")
##
## Model (c) Deviance: 2266.444
cat("Model (c) AIC:", AIC(logit_model_distance), "\n")
## Model (c) AIC: 2280.444
# Summary of Part (d) Model (with TotalDistance + ShotAngle)
cat("\nModel (d) - Defensive Pressure + TotalDistance + ShotAngle\n")
##
## Model (d) - Defensive Pressure + TotalDistance + ShotAngle
summary(logit_model_angle)
##
## Call:
## glm(formula = goal ~ DefPress1 + DefPress2 + DefPress3 + DefPress4 +
## DefPress5 + TotalDistance + ShotAngle, family = binomial(link = "logit"),
## data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.954695 0.613457 -1.556 0.11965
## DefPress1 1.878412 0.615050 3.054 0.00226 **
## DefPress2 1.260330 0.618361 2.038 0.04153 *
## DefPress3 1.163294 0.616880 1.886 0.05933 .
## DefPress4 0.514947 0.621070 0.829 0.40703
## DefPress5 0.066823 0.650278 0.103 0.91815
## TotalDistance -0.115899 0.007221 -16.051 < 2e-16 ***
## ShotAngle 0.017722 0.090661 0.195 0.84502
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2678.2 on 3679 degrees of freedom
## Residual deviance: 2266.4 on 3672 degrees of freedom
## AIC: 2282.4
##
## Number of Fisher Scoring iterations: 6
cat("\nModel (d) Deviance:", summary(logit_model_angle)$deviance, "\n")
##
## Model (d) Deviance: 2266.405
cat("Model (d) AIC:", AIC(logit_model_angle), "\n")
## Model (d) AIC: 2282.405