#Importing libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(ggrepel)
library(boot)
library(broom)
library(lindia)
data <-read.csv('C:/Downloads/final_dataset.csv')
colnames(data)
## [1] "X" "Date" "HomeTeam" "AwayTeam"
## [5] "FTHG" "FTAG" "FTR" "HTGS"
## [9] "ATGS" "HTGC" "ATGC" "HTP"
## [13] "ATP" "HM1" "HM2" "HM3"
## [17] "HM4" "HM5" "AM1" "AM2"
## [21] "AM3" "AM4" "AM5" "MW"
## [25] "HTFormPtsStr" "ATFormPtsStr" "HTFormPts" "ATFormPts"
## [29] "HTWinStreak3" "HTWinStreak5" "HTLossStreak3" "HTLossStreak5"
## [33] "ATWinStreak3" "ATWinStreak5" "ATLossStreak3" "ATLossStreak5"
## [37] "HTGD" "ATGD" "DiffPts" "DiffFormPts"
data$Outcome <- ifelse(data$FTR == "H", 1, 0)
model <- glm(Outcome ~ HTGD + ATGD + DiffPts + DiffFormPts, data = data, family = binomial(link = "logit"))
summary(model)
##
## Call:
## glm(formula = Outcome ~ HTGD + ATGD + DiffPts + DiffFormPts,
## family = binomial(link = "logit"), data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.14421 0.02560 -5.633 1.77e-08 ***
## HTGD 0.50774 0.07538 6.736 1.63e-11 ***
## ATGD -0.48169 0.07576 -6.358 2.04e-10 ***
## DiffPts 0.47879 0.11796 4.059 4.93e-05 ***
## DiffFormPts -0.31953 0.10182 -3.138 0.0017 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9447.4 on 6839 degrees of freedom
## Residual deviance: 8727.9 on 6835 degrees of freedom
## AIC: 8737.9
##
## Number of Fisher Scoring iterations: 4
coef <- coef(summary(model))
se <- coef["HTGD", 2]
ci <- exp(confint(model, "HTGD"))
## Waiting for profiling to be done...
cat("Coefficient for HTGD:", coef["HTGD", 1], "\n")
## Coefficient for HTGD: 0.5077388
cat("Standard Error for HTGD:", se, "\n")
## Standard Error for HTGD: 0.07537998
cat("95% Confidence Interval for HTGD:", ci, "\n")
## 95% Confidence Interval for HTGD: 1.434214 1.927359
The model required 4 iterations to converge which is shown by the number of fiher scoring iterations.The convergence shows the model is stable and is well fitted to the data.
The model is a good fit as indicated by the lower residual deviance which suggests that it captures the underlying patterns in the data effectively.
library(ggplot2)
ggplot(data, aes(x = HTGD, y = Outcome)) +
geom_point() +
geom_smooth(method = "glm", method.args = list(family = "binomial")) +
xlab("Home Team Goal Difference (HTGD)") +
ylab("Probability of Home Team Winning (Outcome)") +
ggtitle("Scatter Plot: HTGD vs. Probability of Home Team Winning")
## `geom_smooth()` using formula = 'y ~ x'
data$MW_log <- log(data$MW)
model_with_transformation <- glm(Outcome ~ HTGD + ATGD + DiffPts + MW_log, data = data, family = binomial(link = "logit"))
summary(model_with_transformation)
##
## Call:
## glm(formula = Outcome ~ HTGD + ATGD + DiffPts + MW_log, family = binomial(link = "logit"),
## data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.29342 0.08491 -3.456 0.000549 ***
## HTGD 0.53917 0.07551 7.141 9.29e-13 ***
## ATGD -0.51295 0.07583 -6.765 1.34e-11 ***
## DiffPts 0.28958 0.10250 2.825 0.004727 **
## MW_log 0.05605 0.02978 1.882 0.059846 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9447.4 on 6839 degrees of freedom
## Residual deviance: 8734.1 on 6835 degrees of freedom
## AIC: 8744.1
##
## Number of Fisher Scoring iterations: 4
The logistic regression model with the log-transformed “MW_log” variable provides insights into the factors influencing the probability of a home team winning a football match.
While “MW_log” is included in the model it is not highly significant.
ggplot(data, aes(x = MW_log, y = Outcome)) +
geom_point() +
geom_smooth(method = "glm", method.args = list(family = "binomial")) +
xlab("Log-Transformed Matchweek (MW_log)") +
ylab("Probability of Home Team Winning (Outcome)") +
ggtitle("Scatter Plot: Log-Transformed MW vs. Probability of Home Team Winning")
## `geom_smooth()` using formula = 'y ~ x'