library(mosaic)
library(ResourceSelection)
library(tidyverse)
library(plotly)
library(rmarkdown)
library(DT)
library(rlang)
library(dplyr)
library(lattice)
library(alr4)
library(pander)

UEFA <- read.csv('C:/Users/flore/BYU-Idaho/BYU-I/Fall 22/Linear Regression/Statistics-Notebook-master/Statistics-Notebook-master/Analyses/Logistic Regression/ChampionsLeague.csv', header=TRUE)


UEFA <- UEFA %>% 
  select(team_home_score, possession_home) %>%
     mutate(
         scored = case_when(
           team_home_score %in% c(1,2,3,4,5) ~ 1,
           team_home_score %in% c(0) ~ 0)) %>% 
  mutate(possession_home = possession_home %>% stringr::str_sub(1,-2) %>% as.integer())


Background

The UEFA Champions League is the hardest and highest level competition in football. Having every year the champion clubs of the countries in Europe. The best players, stadiums, level, and teams face up in a group stage with 4 teams, each team playing versus all the teams in its group, one game home, one as visitor. The best two teams in the group stage will go to play offs played in two games, one home and one as visitor. Is well known in football that playing as local has advantages and being a visitor might get complicated. The locals have the fans support, they know their pitch, and don’t have hours of traveling on them, between many other variables. In this study we will analyze with a logistic regression if the possession of the home team is related with scoring at least one goal in their stadium.

The significance level α for this analysis is α = 0.05


Analysis

Hypotheses

I’ll use the following model to my analysis.

\[ P(Y_i = 1|\, x_i) = \frac{e^{\beta_0 + \beta_1 x_i}}{1+e^{\beta_0 + \beta_1 x_i}} = \pi_i \]

Hypotheses about the slope:

\[ H_0: \beta_1 = 0 \quad H_a: \beta_1 \neq 0 \]


Logistic Regression

Next I’ll perform the logistic regression analysis of the possession of the home team with if they scored (1) or if they didn’t score (0) at least one goal.

# Libraries
library(showtext)
library(ggplot2)
library(grid)
library(png)

# Enable custom fonts
showtext_auto()

# Load logo image
logo_path <- "C:/Users/flore/BYU-Idaho/BYU-I/Fall 22/Linear Regression/Statistics-Notebook-master/Statistics-Notebook-master/Analyses/Logistic Regression/uefachampionsleague.png"
logo <- readPNG(logo_path)

# Create grob for logo background
logo_grob <- rasterGrob(logo, width=unit(1,"npc"), height=unit(1,"npc"), interpolate=TRUE)

# UEFA Champions League colors
champions_blue <- "#1B2A6D"  # Dark blue background
champions_purple <- "#6A0DAD" # Purple for points and line
highlight_color <- "#A77BDA"  # Light purple for contrast behind points
light_grid_color <- "#B0C4DE" # Light blue for grid lines

# Create the plot with custom background
ggplot(data=UEFA, aes(x=possession_home, y=scored)) +
  # Add logo image as background
  annotation_custom(logo_grob, xmin=-Inf, xmax=Inf, ymin=-Inf, ymax=Inf) +
  # Add lighter, larger circles for contrast
  geom_point(aes(y=scored), color=highlight_color, size=6, alpha=0.25) +  # Background circles
  # Points and regression line with thicker size
  geom_point(color=champions_purple, size=6, alpha=0.9) +  # Main points
  geom_smooth(method="glm", method.args=list(family="binomial"), 
              se=FALSE, color=champions_purple, size=1.7, linetype="solid") +  # Thicker regression line
  # Add horizontal lines at 0.50 and 0.75
  geom_hline(yintercept = c(0.50, 0.75), color=light_grid_color, linetype="dashed", size=0.7) +
  # Title of the plot
  ggtitle("Probability of a Home Team Scoring Based on Possession") +
  # Theme of the plot
  theme_minimal(base_family="Arial") +
  theme(
    # Background of the plot
    plot.background = element_rect(fill=champions_blue, color=NA),
    panel.background = element_rect(fill=NA, color=NA),
    # Grid lines
    panel.grid.major.y = element_line(color=light_grid_color, size=0.5),
    panel.grid.minor.y = element_line(color=light_grid_color, size=0.25),
    panel.grid.major.x = element_blank(),  # Optional: Remove vertical grid lines
    # Titles and texts
    plot.title = element_text(size=22, color="white", face="bold", hjust=0.5), # Center the title
    axis.text = element_text(color="white", size=12),
    axis.title = element_text(color="white", size=14, face="bold")
  ) +
  # Add axis titles
  labs(
    x = "Home Team Possession (%)",
    y = "Scored (1 = Yes, 0 = No)"
  )

Numerical Summary:

For our numerical summary we have a sample size bigger than 30. Here’s a summary of the data:

  UEFA %>% 
  summarise(ave=mean(possession_home),sd=sd(possession_home),SampleSize=n()) %>% pander()
ave sd SampleSize
49.81 10.74 125
pander(favstats(possession_home ~ scored,data=UEFA))
scored min Q1 median Q3 max mean sd n missing
0 29 39 46 56.5 65 46.46 10.19 35 0
1 27 44 51 59 70 51.11 10.72 90 0

We can notice that at least 90 of the 125 local teams at least scored one goal. Independently of their posession, being local has a positive influence for that same team. The mean and median were about the same between 46% and 51% possession. So it seems our data has a perfectly symmetrical distribution.

Conclusion

As our p-value is 0.0318 for Beta 1, we can be confident to reject our null hypothesis, meaning then that \[H_a: \beta_1 \neq 0\] . There’s a slope which is positive according to our graph, which as longer the possession of the ball of the home team, it is more likely for them to score at least one goal.

The odds of success (scoring at least one goal) increase by the factor of \[e^\beta_1\] in every unit increased in possession \[x_i\] In this case by a factor of 1.04294662478.

Diagnostic Plots

UEFA_study <- glm(scored ~ possession_home, data = UEFA, family = binomial)

library(ResourceSelection)

hoslem.test(UEFA_study$y, UEFA_study$fitted, g=10) %>% pander()
Hosmer and Lemeshow goodness of fit (GOF) test: UEFA_study$y, UEFA_study$fitted
Test statistic df P value
13.82 8 0.08662

Because our p-value is bigger than our significance level (.05) we can say that the data is a good fit for this logistic regression analysis.