R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

Question 1 A)

Add required libraries

library(readxl)

Read Data & Check Columns

# Load data
# NOTE(review): with skip = 1 and the default col_names = TRUE, read_excel()
# uses the first remaining row as the header. The column names printed below
# ("Wolfsburg", "Hoffenheim", "Left", "-16.666666030883789", ...) are shot
# values, so one observation appears to be consumed as names and never reaches
# the models. Confirm the sheet layout and consider passing col_names
# explicitly so that row is kept as data.
data <- read_excel("ps22025.xlsx", skip = 1)
## New names:
## • `0` -> `0...7`
## • `1` -> `1...8`
## • `1` -> `1...9`
## • `` -> `...10`
## • `0` -> `0...11`
## • `0` -> `0...12`
## • `1` -> `1...13`
## • `0` -> `0...14`
## • `0` -> `0...15`
## • `0` -> `0...16`
# Show the header exactly as imported so it can be compared with the
# expected layout
cat("Column names:\n")
## Column names:
print(names(data))
##  [1] "Wolfsburg"           "Hoffenheim"          "-"                  
##  [4] "Left"                "3"                   "Defended"           
##  [7] "0...7"               "1...8"               "1...9"              
## [10] "...10"               "0...11"              "0...12"             
## [13] "1...13"              "0...14"              "0...15"             
## [16] "0...16"              "-16.666666030883789" "18.333333969116211" 
## [19] "11"                  "18"
# Peek at the leading rows of the imported table
head(data, n = 6)
## # A tibble: 6 × 20
##   Wolfsburg  Hoffenheim `-`   Left    `3` Defended `0...7` `1...8` `1...9` ...10
##   <chr>      <chr>      <chr> <chr> <dbl> <chr>      <dbl>   <dbl>   <dbl> <dbl>
## 1 RB Leipzig Stuttgart  -     Left      2 Miss           0       0       1    NA
## 2 Koln       Hoffenheim -     Head      3 Miss           0       0       1    NA
## 3 Mainz 05   Hoffenheim -     Right     1 Goal           1       0       0     1
## 4 Hoffenheim Eintracht… -     Head      4 Miss           0       0       1    NA
## 5 Borussia … Hoffenheim -     Right     2 Goal           1       0       0     1
## 6 Wolfsburg  RB Leipzig -     Left      3 Miss           0       0       1    NA
## # ℹ 10 more variables: `0...11` <dbl>, `0...12` <dbl>, `1...13` <dbl>,
## #   `0...14` <dbl>, `0...15` <dbl>, `0...16` <dbl>,
## #   `-16.666666030883789` <dbl>, `18.333333969116211` <dbl>, `11` <dbl>,
## #   `18` <dbl>
# Replace the imported header with descriptive names, assigned by position
# (20 names for the 20 imported columns).
# NOTE(review): the position-to-name mapping is taken on trust here; verify
# it against the spreadsheet's own header row.
names(data) <- c(
  "hometeam_team1", "awayteam_team2", "player", "bodypart", "defpressure",
  "outcome", "goal", "blocked", "blockmiss", "finish",
  "DefPress1", "DefPress2", "DefPress3", "DefPress4", "DefPress5",
  "head", "location_y", "location_x", "awayid", "homeid"
)

# Print the header again to verify the rename took effect
print(names(data))
##  [1] "hometeam_team1" "awayteam_team2" "player"         "bodypart"      
##  [5] "defpressure"    "outcome"        "goal"           "blocked"       
##  [9] "blockmiss"      "finish"         "DefPress1"      "DefPress2"     
## [13] "DefPress3"      "DefPress4"      "DefPress5"      "head"          
## [17] "location_y"     "location_x"     "awayid"         "homeid"
# Proceed to regression
# Model (a): logistic regression of the goal indicator on the five DefPress
# columns. The call is left untouched because summary() echoes it verbatim.
# NOTE(review): DefPress1-DefPress5 look like dummy variables for levels of
# defensive pressure, with the omitted level absorbed by the intercept -
# confirm against the data description.
logit_model <- glm(goal ~ DefPress1 + DefPress2 + DefPress3 + DefPress4 + DefPress5, 
                   data = data, 
                   family = binomial(link = "logit"))

# Coefficient table, deviances and AIC for model (a)
summary(logit_model)
## 
## Call:
## glm(formula = goal ~ DefPress1 + DefPress2 + DefPress3 + DefPress4 + 
##     DefPress5, family = binomial(link = "logit"), data = data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.61496    0.59810  -4.372 1.23e-05 ***
## DefPress1    1.14367    0.60469   1.891   0.0586 .  
## DefPress2    0.56307    0.60827   0.926   0.3546    
## DefPress3    0.56033    0.60727   0.923   0.3562    
## DefPress4    0.19459    0.61216   0.318   0.7506    
## DefPress5   -0.07967    0.64117  -0.124   0.9011    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2678.2  on 3679  degrees of freedom
## Residual deviance: 2622.2  on 3674  degrees of freedom
## AIC: 2634.2
## 
## Number of Fisher Scoring iterations: 5

1b) Null hypothesis: β1 = β4. Alternative hypothesis: β1 ≠ β4 (the effects of DefPress1 and DefPress4 on the log-odds of scoring differ).

# Wald z-test of H0: beta_DefPress1 = beta_DefPress4 in logit_model.
#
# Both estimates come from the same fitted model, so they are generally
# correlated. The variance of their difference is therefore
#   Var(b1) + Var(b4) - 2 * Cov(b1, b4),
# and the covariance term must be taken from the model's variance-covariance
# matrix; sqrt(se1^2 + se4^2) alone is only valid for independent estimates.
coef_summary <- summary(logit_model)$coefficients
vcov_matrix <- vcov(logit_model)

# Estimates and standard errors for DefPress1 and DefPress4
beta1 <- coef_summary["DefPress1", "Estimate"]
se1 <- coef_summary["DefPress1", "Std. Error"]

beta4 <- coef_summary["DefPress4", "Estimate"]
se4 <- coef_summary["DefPress4", "Std. Error"]

cat("Beta1 (DefPress1):", beta1, "\n")
## Beta1 (DefPress1): 1.143672
cat("SE1 (DefPress1):", se1, "\n")
## SE1 (DefPress1): 0.604694
cat("Beta4 (DefPress4):", beta4, "\n")
## Beta4 (DefPress4): 0.1945916
cat("SE4 (DefPress4):", se4, "\n")
## SE4 (DefPress4): 0.6121566

# Standard error of (beta1 - beta4), including the covariance term
cov14 <- vcov_matrix["DefPress1", "DefPress4"]
se_diff <- sqrt(se1^2 + se4^2 - 2 * cov14)

# Test statistic
z_score <- (beta1 - beta4) / se_diff

cat("Z-score for H0: Beta1 = Beta4 is", z_score, "\n")
## (stale after the covariance fix - re-knit to regenerate this value)

# Two-sided p-value; the upper tail is requested directly so that a large
# |z| does not round the result to exactly 0
p_value <- 2 * pnorm(abs(z_score), lower.tail = FALSE)

cat("P-value for H0: Beta1 = Beta4 is", p_value, "\n")
## (stale after the covariance fix - re-knit to regenerate this value)

1c)

# Straight-line (Euclidean) distance of each shot location from the
# coordinate origin.
# NOTE(review): this equals the distance to goal only if location_x and
# location_y are measured from the goal - confirm against the data description.
data$TotalDistance <- with(data, sqrt(location_x^2 + location_y^2))

# Sanity check: show the inputs next to the derived column
head(data[c("location_x", "location_y", "TotalDistance")])
## # A tibble: 6 × 3
##   location_x location_y TotalDistance
##        <dbl>      <dbl>         <dbl>
## 1      27.3       14.7           31.0
## 2      12.7        9.33          15.7
## 3       9.67      -6.67          11.7
## 4       8.33       8.67          12.0
## 5      10         -1.67          10.1
## 6      28.7      -14             31.9
# Logistic regression including TotalDistance
# Model (c): the model (a) predictors plus TotalDistance. The call is left
# untouched because summary() echoes it verbatim in its output.
logit_model_distance <- glm(goal ~ DefPress1 + DefPress2 + DefPress3 + DefPress4 + DefPress5 + TotalDistance,
                            data = data,
                            family = binomial(link = "logit"))

# Show regression summary (coefficient table, deviances, AIC)
summary(logit_model_distance)
## 
## Call:
## glm(formula = goal ~ DefPress1 + DefPress2 + DefPress3 + DefPress4 + 
##     DefPress5 + TotalDistance, family = binomial(link = "logit"), 
##     data = data)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -0.952926   0.613459  -1.553  0.12034    
## DefPress1      1.876865   0.615079   3.051  0.00228 ** 
## DefPress2      1.258501   0.618371   2.035  0.04183 *  
## DefPress3      1.162879   0.616958   1.885  0.05945 .  
## DefPress4      0.513763   0.621121   0.827  0.40815    
## DefPress5      0.065476   0.650315   0.101  0.91980    
## TotalDistance -0.115917   0.007221 -16.052  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2678.2  on 3679  degrees of freedom
## Residual deviance: 2266.4  on 3673  degrees of freedom
## AIC: 2280.4
## 
## Number of Fisher Scoring iterations: 6
# Wald z-test of H0: beta_TotalDistance = 0 in logit_model_distance
coef_summary_dist <- summary(logit_model_distance)$coefficients

# TotalDistance coefficient and its standard error
beta_distance <- coef_summary_dist["TotalDistance", "Estimate"]
se_distance <- coef_summary_dist["TotalDistance", "Std. Error"]

# Test statistic
z_distance <- beta_distance / se_distance

# Two-sided p-value taken from the upper tail directly. Computing it as
# 1 - pnorm(|z|) rounds to exactly 0 once |z| is large (here about 16),
# which is why the p-value was previously reported as 0.
p_distance <- 2 * pnorm(abs(z_distance), lower.tail = FALSE)

# Output
cat("Hypothesis Test for Total Distance\n")
## Hypothesis Test for Total Distance
cat("----------------------------------\n")
## ----------------------------------
cat("Beta (TotalDistance):", beta_distance, "\n")
## Beta (TotalDistance): -0.1159174
cat("SE (TotalDistance):", se_distance, "\n")
## SE (TotalDistance): 0.007221185
cat("Z-score:", z_distance, "\n")
## Z-score: -16.05241
cat("P-value:", p_distance, "\n")
## (stale after the p-value fix - re-knit to regenerate; the value is tiny but nonzero)

1d)

# Signed angle (in radians) of each shot location as seen from the origin,
# measured from the x axis; atan2() yields negative values when
# location_y is negative.
# NOTE(review): because the angle is signed, the fitted coefficient contrasts
# the two sides of the pitch; if the intent is "how wide is the shot",
# abs() of this angle may be what is wanted - confirm.
data$ShotAngle <- with(data, atan2(location_y, location_x))

# Sanity check: show the inputs next to the derived column
head(data[c("location_x", "location_y", "ShotAngle")])
## # A tibble: 6 × 3
##   location_x location_y ShotAngle
##        <dbl>      <dbl>     <dbl>
## 1      27.3       14.7      0.492
## 2      12.7        9.33     0.635
## 3       9.67      -6.67    -0.604
## 4       8.33       8.67     0.805
## 5      10         -1.67    -0.165
## 6      28.7      -14       -0.454
# Model (d): the model (c) predictors plus ShotAngle. The call is left
# untouched because summary() echoes it verbatim in its output.
logit_model_angle <- glm(goal ~ DefPress1 + DefPress2 + DefPress3 + DefPress4 + DefPress5 +
                         TotalDistance + ShotAngle,
                         data = data,
                         family = binomial(link = "logit"))

# Print summary so you can see all coefficients
summary(logit_model_angle)
## 
## Call:
## glm(formula = goal ~ DefPress1 + DefPress2 + DefPress3 + DefPress4 + 
##     DefPress5 + TotalDistance + ShotAngle, family = binomial(link = "logit"), 
##     data = data)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -0.954695   0.613457  -1.556  0.11965    
## DefPress1      1.878412   0.615050   3.054  0.00226 ** 
## DefPress2      1.260330   0.618361   2.038  0.04153 *  
## DefPress3      1.163294   0.616880   1.886  0.05933 .  
## DefPress4      0.514947   0.621070   0.829  0.40703    
## DefPress5      0.066823   0.650278   0.103  0.91815    
## TotalDistance -0.115899   0.007221 -16.051  < 2e-16 ***
## ShotAngle      0.017722   0.090661   0.195  0.84502    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2678.2  on 3679  degrees of freedom
## Residual deviance: 2266.4  on 3672  degrees of freedom
## AIC: 2282.4
## 
## Number of Fisher Scoring iterations: 6
# Wald z-test of H0: beta_ShotAngle = 0 in logit_model_angle
# (requires logit_model_angle to have been fitted already)
coef_summary_angle <- coef(summary(logit_model_angle))

# Point estimate and standard error for ShotAngle
beta_angle <- coef_summary_angle["ShotAngle", "Estimate"]
se_angle <- coef_summary_angle["ShotAngle", "Std. Error"]

# Test statistic and two-sided p-value
z_angle <- beta_angle / se_angle
p_angle <- 2 * (1 - pnorm(abs(z_angle)))

# Report the results
cat("Hypothesis Test for Shot Angle\n")
## Hypothesis Test for Shot Angle
cat("------------------------------\n")
## ------------------------------
cat("Beta (ShotAngle):", beta_angle, "\n")
## Beta (ShotAngle): 0.01772212
cat("SE (ShotAngle):", se_angle, "\n")
## SE (ShotAngle): 0.09066147
cat("Z-score:", z_angle, "\n")
## Z-score: 0.1954757
cat("P-value:", p_angle, "\n")
## P-value: 0.8450206
# State the conclusion at the 5% significance level
cat(
  if (p_angle < 0.05) {
    "Conclusion: ShotAngle has a statistically significant effect on goal probability.\n"
  } else {
    "Conclusion: ShotAngle does not have a statistically significant effect on goal probability.\n"
  }
)
## Conclusion: ShotAngle does not have a statistically significant effect on goal probability.

1e)

# Hypothetical shot to score with model (d): only DefPress3 is set,
# TotalDistance is 12 and ShotAngle is 0
new_shot <- data.frame(
  DefPress1 = 0, DefPress2 = 0, DefPress3 = 1, DefPress4 = 0, DefPress5 = 0,
  TotalDistance = 12,
  ShotAngle = 0
)

# type = "response" returns the fitted probability rather than the log-odds
predicted_probability <- predict(
  logit_model_angle,
  newdata = new_shot,
  type = "response"
)

cat("Predicted Probability of Scoring:", predicted_probability, "\n")
## Predicted Probability of Scoring: 0.2346594

1f)

# Summary of Part (a) Model
# Reprints the summary of logit_model (DefPress1-DefPress5 only) so the three
# models can be compared in one place
cat("Model (a) - Defensive Pressure Only\n")
## Model (a) - Defensive Pressure Only
summary(logit_model)
## 
## Call:
## glm(formula = goal ~ DefPress1 + DefPress2 + DefPress3 + DefPress4 + 
##     DefPress5, family = binomial(link = "logit"), data = data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.61496    0.59810  -4.372 1.23e-05 ***
## DefPress1    1.14367    0.60469   1.891   0.0586 .  
## DefPress2    0.56307    0.60827   0.926   0.3546    
## DefPress3    0.56033    0.60727   0.923   0.3562    
## DefPress4    0.19459    0.61216   0.318   0.7506    
## DefPress5   -0.07967    0.64117  -0.124   0.9011    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2678.2  on 3679  degrees of freedom
## Residual deviance: 2622.2  on 3674  degrees of freedom
## AIC: 2634.2
## 
## Number of Fisher Scoring iterations: 5
# Fit statistics for model (a): residual deviance and AIC
cat("\nModel (a) Deviance:", deviance(logit_model), "\n")
## 
## Model (a) Deviance: 2622.171
cat("Model (a) AIC:", AIC(logit_model), "\n")
## Model (a) AIC: 2634.171
# Part (c) model: defensive pressure plus TotalDistance
cat("\nModel (c) - Defensive Pressure + TotalDistance\n")
## 
## Model (c) - Defensive Pressure + TotalDistance
summary(logit_model_distance)
## 
## Call:
## glm(formula = goal ~ DefPress1 + DefPress2 + DefPress3 + DefPress4 + 
##     DefPress5 + TotalDistance, family = binomial(link = "logit"), 
##     data = data)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -0.952926   0.613459  -1.553  0.12034    
## DefPress1      1.876865   0.615079   3.051  0.00228 ** 
## DefPress2      1.258501   0.618371   2.035  0.04183 *  
## DefPress3      1.162879   0.616958   1.885  0.05945 .  
## DefPress4      0.513763   0.621121   0.827  0.40815    
## DefPress5      0.065476   0.650315   0.101  0.91980    
## TotalDistance -0.115917   0.007221 -16.052  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2678.2  on 3679  degrees of freedom
## Residual deviance: 2266.4  on 3673  degrees of freedom
## AIC: 2280.4
## 
## Number of Fisher Scoring iterations: 6
# Fit statistics for model (c): residual deviance and AIC
cat("\nModel (c) Deviance:", deviance(logit_model_distance), "\n")
## 
## Model (c) Deviance: 2266.444
cat("Model (c) AIC:", AIC(logit_model_distance), "\n")
## Model (c) AIC: 2280.444
# Part (d) model: defensive pressure plus TotalDistance and ShotAngle
cat("\nModel (d) - Defensive Pressure + TotalDistance + ShotAngle\n")
## 
## Model (d) - Defensive Pressure + TotalDistance + ShotAngle
summary(logit_model_angle)
## 
## Call:
## glm(formula = goal ~ DefPress1 + DefPress2 + DefPress3 + DefPress4 + 
##     DefPress5 + TotalDistance + ShotAngle, family = binomial(link = "logit"), 
##     data = data)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -0.954695   0.613457  -1.556  0.11965    
## DefPress1      1.878412   0.615050   3.054  0.00226 ** 
## DefPress2      1.260330   0.618361   2.038  0.04153 *  
## DefPress3      1.163294   0.616880   1.886  0.05933 .  
## DefPress4      0.514947   0.621070   0.829  0.40703    
## DefPress5      0.066823   0.650278   0.103  0.91815    
## TotalDistance -0.115899   0.007221 -16.051  < 2e-16 ***
## ShotAngle      0.017722   0.090661   0.195  0.84502    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2678.2  on 3679  degrees of freedom
## Residual deviance: 2266.4  on 3672  degrees of freedom
## AIC: 2282.4
## 
## Number of Fisher Scoring iterations: 6
# Fit statistics for model (d): residual deviance and AIC
cat("\nModel (d) Deviance:", deviance(logit_model_angle), "\n")
## 
## Model (d) Deviance: 2266.405
cat("Model (d) AIC:", AIC(logit_model_angle), "\n")
## Model (d) AIC: 2282.405