library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the match-level CSV into a tibble.
# NOTE(review): absolute, machine-specific path; a project-relative path
# would make this document reproducible on other machines.
pl <- read_csv(file = "C:/Users/bfunk/Downloads/E0.csv")
## Rows: 380 Columns: 120
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr    (7): Div, Date, HomeTeam, AwayTeam, FTR, HTR, Referee
## dbl  (112): FTHG, FTAG, HTHG, HTAG, HS, AS, HST, AST, HF, AF, HC, AC, HY, AY...
## time   (1): Time
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

I am using home win/loss as my binary variable. In this data dive I am trying to figure out what affects home wins.

# Binary response: 1 when the full-time result (FTR) is "H", otherwise 0.
# A missing FTR stays NA.
pl <- mutate(pl, home_win = as.numeric(FTR == "H"))

Looking at home shots on target and how they relate to winning: there is a clear positive relationship.

# Home shots on target (HST) against the 0/1 home-win indicator.
# Points are jittered vertically only (width = 0, height = 0.1) so that
# overlapping matches at the same HST value remain visible.
# The trend line is a logistic (binomial GLM) fit rather than a straight
# line: the response is binary, so a linear fit can run outside [0, 1]
# and would not match the logistic model used in the rest of the analysis.
pl |>
  ggplot(aes(x = HST, y = home_win)) +
  geom_jitter(width = 0, height = 0.1, shape = "O", size = 3) +
  geom_smooth(
    method = "glm",
    method.args = list(family = binomial(link = "logit")),
    se = FALSE
  ) +
  labs(
    x = "Home Shots on Target",
    y = "Home Win"
  ) +
  scale_y_continuous(breaks = c(0, 1))
## `geom_smooth()` using formula = 'y ~ x'

I added two more variables that I believe affect winning, away shots on target (AST) and the Bet365 home-win odds (B365H), to the logistic model. The coefficients tell me that home shots on target positively affect winning, while the lower the Bet365 home odds or the away shots on target, the higher the chance that the home team wins. This is about what we would expect.

# Logistic regression of the home-win indicator on HST, AST and B365H.
model <- glm(
  formula = home_win ~ HST + AST + B365H,
  family = binomial(link = "logit"),
  data = pl
)

# Fitted coefficients on the log-odds scale.
coef(model)
## (Intercept)         HST         AST       B365H 
##   0.1686033   0.3473562  -0.3107811  -0.4223579
# Full model summary: coefficient table, deviances and AIC.
model |> summary()
## 
## Call:
## glm(formula = home_win ~ HST + AST + B365H, family = binomial(link = "logit"), 
##     data = pl)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.16860    0.50313   0.335 0.737544    
## HST          0.34736    0.06160   5.639 1.71e-08 ***
## AST         -0.31078    0.06869  -4.524 6.06e-06 ***
## B365H       -0.42236    0.12615  -3.348 0.000814 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 513.82  on 379  degrees of freedom
## Residual deviance: 382.20  on 376  degrees of freedom
## AIC: 390.2
## 
## Number of Fisher Scoring iterations: 5
# Exponentiate the log-odds coefficients to obtain odds ratios.
model |>
  coef() |>
  exp()
## (Intercept)         HST         AST       B365H 
##   1.1836505   1.4153207   0.7328743   0.6554994

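Each additional home shot on target (HST) increases the odds of a home win by about 41.5%, each additional away shot on target (AST) decreases the odds by about 26.7%, and each one-unit increase in the Bet365 home odds (B365H) decreases the odds by about 34.5%. HST has the largest z value (5.64), so according to this it is the most influential variable in this model; note that the predictors are measured on different scales, so their per-unit odds ratios are not directly comparable.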

# Coefficient table only: estimate, standard error, z value and p-value.
model |>
  summary() |>
  coef()
##               Estimate Std. Error    z value     Pr(>|z|)
## (Intercept)  0.1686033 0.50313186  0.3351076 7.375440e-01
## HST          0.3473562 0.06159630  5.6392377 1.708046e-08
## AST         -0.3107811 0.06869199 -4.5242699 6.060434e-06
## B365H       -0.4223579 0.12614842 -3.3481026 8.136688e-04

All three p-values are very small (well below 0.05), so all three coefficients are statistically significant. We can reject all three null hypotheses, each of which states that the respective variable has no association with the home result.

# Point estimate and standard error of the HST coefficient, taken from the
# fitted model's coefficient vector and variance-covariance matrix.
beta_hst <- coef(model)[["HST"]]
se_hst   <- sqrt(diag(vcov(model)))[["HST"]]

# Wald-type interval: estimate -/+ 1.96 standard errors (roughly 95%).
ci_hst <- beta_hst + c(-1, 1) * 1.96 * se_hst

# Interval on the log-odds scale.
ci_hst
## [1] 0.2266274 0.4680849
# The same interval exponentiated, i.e. on the odds-ratio scale.
exp(ci_hst)
## [1] 1.254362 1.596933

Log-odds scale: the 95% confidence interval for HST is 0.227 to 0.468. The interval does not contain 0, so HST is significant.

Odds-ratio scale: the interval is 1.254 to 1.597, so each additional home shot on target increases the odds of a home win by roughly 25% to 60%.