library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
pl <- read_csv("C:/Users/bfunk/Downloads/E0.csv")
## Rows: 380 Columns: 120
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr    (7): Div, Date, HomeTeam, AwayTeam, FTR, HTR, Referee
## dbl  (112): FTHG, FTAG, HTHG, HTAG, HS, AS, HST, AST, HF, AF, HC, AC, HY, AY...
## time   (1): Time
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

I wanted to model the relationship between home win odds (B365h) and home goals. I wondered if the favored team ends up scoring more goals and what does that relationship look like.

dat <- na.omit(pl[, c("FTHG", "B365H")])

model <- glm(FTHG ~ B365H, data = dat, family = poisson(link = "log"))

summary(model)
## 
## Call:
## glm(formula = FTHG ~ B365H, family = poisson(link = "log"), data = dat)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.94814    0.08877  10.680  < 2e-16 ***
## B365H       -0.21439    0.03424  -6.262 3.81e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 473.30  on 379  degrees of freedom
## Residual deviance: 423.61  on 378  degrees of freedom
## AIC: 1148.6
## 
## Number of Fisher Scoring iterations: 5

I went with poisson because FTHG is a count variable. The coefficient tells me that B365 is significant. So as home odds get better (lower) goals go up and the inverse is also true. e^coef gives me .81. So for every unit increase there is a 19 percent decrease in goals.

dat |>
  ggplot(aes(x = B365H, y = FTHG)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "glm",
              method.args = list(family = poisson(link = "log")),
              se = FALSE) +
  labs(title = "Home Goals vs Bet365 Home Odds")
## `geom_smooth()` using formula = 'y ~ x'

Right away the scatter plot looks quadratic with the highest goal values happening with the lowest win odds for home teams. I identified this as a graph built from (.5)^x which my poisson model resembles using logarithmic regression.

The next steps were me playing with my model i tried a more quadratic one since the relationship appeared that way.

dat <- pl |>
  select(Date, HomeTeam, AwayTeam, FTHG, B365H) |>
  drop_na() |>
  mutate(log_goals = log(FTHG + 1))
mod_quad <- glm(FTHG ~ B365H + I(B365H^2),
                data = dat,
                family = poisson(link = "log"))

summary(mod_quad)
## 
## Call:
## glm(formula = FTHG ~ B365H + I(B365H^2), family = poisson(link = "log"), 
##     data = dat)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.25691    0.15276   8.228  < 2e-16 ***
## B365H       -0.43120    0.09393  -4.591 4.41e-06 ***
## I(B365H^2)   0.02727    0.01052   2.593  0.00952 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 473.30  on 379  degrees of freedom
## Residual deviance: 417.91  on 377  degrees of freedom
## AIC: 1144.9
## 
## Number of Fisher Scoring iterations: 5

The second model I tried was attempting to find the quadratic model since the relationship looked quadratic to me. This one improved my scores so it fits better(AIC) but it looks like if this model was to continue to play out home goals would go up as the odds get worse which is not correct.

p2 <- dat |>
  ggplot(aes(x = B365H, y = FTHG)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "glm",
              formula = y ~ x + I(x^2),
              method.args = list(family = poisson(link = "log")),
              se = FALSE) +
  labs(title = "Home Goals vs Bet365 Home Odds",
       subtitle = "With quadratic term")

p2

Building data for diagnostics

lg <- glm(FTHG ~ log(B365H),
                data = dat,
                family = poisson(link = "log"))

summary(lg)
## 
## Call:
## glm(formula = FTHG ~ log(B365H), family = poisson(link = "log"), 
##     data = dat)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.94510    0.07969  11.860  < 2e-16 ***
## log(B365H)  -0.67584    0.09516  -7.102 1.23e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 473.30  on 379  degrees of freedom
## Residual deviance: 417.37  on 378  degrees of freedom
## AIC: 1142.3
## 
## Number of Fisher Scoring iterations: 5

This model used log(b365h) instead I wanted to see another option.With a lower AIC and residual deviance compared to the other I went with this one going forward. The line of best fit itself looks best with the data and makes the most sense but the model performance numbers are the most telling.

p3 <- dat |>
  ggplot(aes(x = B365H, y = FTHG)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "glm",
              formula = y ~ log(x),
              method.args = list(family = poisson(link = "log")),
              se = FALSE) +
  labs(title = "Home Goals vs Bet365 Home Odds",
       subtitle = "log(B365H)")

p3

aug_lg <- augment(lg) |>
  mutate(
    pearson_resid = residuals(lg, type = "pearson"),
    deviance_resid = residuals(lg, type = "deviance"),
    cooks_d = cooks.distance(lg)
  )

res_fit_plot <- aug_lg |>
  ggplot(aes(x = .fitted, y = pearson_resid)) +
  geom_point(alpha = 0.5) +
  geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
  labs(title = "Residual Plot",
       subtitle = "Pearson residuals vs fitted for log(B365H) model",
       x = "Fitted values",
       y = "Pearson residuals")

res_fit_plot

The residual plot is centered around 0 which is good but it has a strong shape that funnels downwards so it is not perfect.

qq_plot <- aug_lg |>
  ggplot(aes(sample = deviance_resid)) +
  stat_qq(alpha = 0.5) +
  stat_qq_line(color = "red") +
  labs(title = "QQ Plot",
       subtitle = "Deviance residuals for log(B365H) model")

qq_plot

The left side of the qq plot tails away on the left which makes sense since a lot of the data points are below the line on the lower end. Other than that to me it looks good it hugs the middle well.

plot(lg, which = 4, id.n = 5)

As far as cooks distance this one is pretty good, even the outliers are close to the average points. There would be no reason to takeout outliers.

dispersion_lg <- sum(residuals(lg, type = "pearson")^2) / df.residual(lg)
dispersion_lg
## [1] 0.9622801

The dispersion stat for this model is really close to one so over dispersion is not an issue.

Overall I tested a few models and I believe the log(B365H) is the best performing and according to diagnostics the model works well. The issue it runs into will exist within any model because the games where the home team is favored the most has a lot of variance. This is because it is easier to stack up goals as a heavy favorite, but at the same time a lot of the heavy favorite matches go similar to mild favorite matches. I think the log model still captures this variation the best.