Importing the libraries

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(ggrepel)
library(dplyr)
library(ggplot2)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some

Importing the dataset

# Load the match-level dataset into `data`; the rest of the analysis reads it.
data <- read.csv(file = "C:/Downloads/final_dataset.csv")

Data Exploration

# Preview the first six rows to check column names and value formats.
head(data, n = 6L)
##   X     Date  HomeTeam      AwayTeam FTHG FTAG FTR HTGS ATGS HTGC ATGC HTP ATP
## 1 0 19/08/00  Charlton      Man City    4    0   H    0    0    0    0   0   0
## 2 1 19/08/00   Chelsea      West Ham    4    2   H    0    0    0    0   0   0
## 3 2 19/08/00  Coventry Middlesbrough    1    3  NH    0    0    0    0   0   0
## 4 3 19/08/00     Derby   Southampton    2    2  NH    0    0    0    0   0   0
## 5 4 19/08/00     Leeds       Everton    2    0   H    0    0    0    0   0   0
## 6 5 19/08/00 Leicester   Aston Villa    0    0  NH    0    0    0    0   0   0
##   HM1 HM2 HM3 HM4 HM5 AM1 AM2 AM3 AM4 AM5 MW HTFormPtsStr ATFormPtsStr
## 1   M   M   M   M   M   M   M   M   M   M  1        MMMMM        MMMMM
## 2   M   M   M   M   M   M   M   M   M   M  1        MMMMM        MMMMM
## 3   M   M   M   M   M   M   M   M   M   M  1        MMMMM        MMMMM
## 4   M   M   M   M   M   M   M   M   M   M  1        MMMMM        MMMMM
## 5   M   M   M   M   M   M   M   M   M   M  1        MMMMM        MMMMM
## 6   M   M   M   M   M   M   M   M   M   M  1        MMMMM        MMMMM
##   HTFormPts ATFormPts HTWinStreak3 HTWinStreak5 HTLossStreak3 HTLossStreak5
## 1         0         0            0            0             0             0
## 2         0         0            0            0             0             0
## 3         0         0            0            0             0             0
## 4         0         0            0            0             0             0
## 5         0         0            0            0             0             0
## 6         0         0            0            0             0             0
##   ATWinStreak3 ATWinStreak5 ATLossStreak3 ATLossStreak5 HTGD ATGD DiffPts
## 1            0            0             0             0    0    0       0
## 2            0            0             0             0    0    0       0
## 3            0            0             0             0    0    0       0
## 4            0            0             0             0    0    0       0
## 5            0            0             0             0    0    0       0
## 6            0            0             0             0    0    0       0
##   DiffFormPts
## 1           0
## 2           0
## 3           0
## 4           0
## 5           0
## 6           0
# Per-column summary: quartiles for numeric columns, length/class for text.
summary(object = data)
##        X            Date             HomeTeam           AwayTeam        
##  Min.   :   0   Length:6840        Length:6840        Length:6840       
##  1st Qu.:1710   Class :character   Class :character   Class :character  
##  Median :3420   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :3420                                                           
##  3rd Qu.:5129                                                           
##  Max.   :6839                                                           
##       FTHG            FTAG          FTR                 HTGS       
##  Min.   :0.000   Min.   :0.00   Length:6840        Min.   :  0.00  
##  1st Qu.:1.000   1st Qu.:0.00   Class :character   1st Qu.: 11.00  
##  Median :1.000   Median :1.00   Mode  :character   Median : 23.00  
##  Mean   :1.527   Mean   :1.13                      Mean   : 24.42  
##  3rd Qu.:2.000   3rd Qu.:2.00                      3rd Qu.: 35.00  
##  Max.   :9.000   Max.   :7.00                      Max.   :102.00  
##       ATGS             HTGC           ATGC            HTP        
##  Min.   :  0.00   Min.   : 0.0   Min.   : 0.00   Min.   :0.0000  
##  1st Qu.: 11.00   1st Qu.:11.0   1st Qu.:11.00   1st Qu.:0.8889  
##  Median : 23.00   Median :23.0   Median :23.00   Median :1.1724  
##  Mean   : 24.51   Mean   :24.5   Mean   :24.35   Mean   :1.2090  
##  3rd Qu.: 35.00   3rd Qu.:36.0   3rd Qu.:36.00   3rd Qu.:1.5556  
##  Max.   :105.00   Max.   :85.0   Max.   :82.00   Max.   :2.7368  
##       ATP             HM1                HM2                HM3           
##  Min.   :0.0000   Length:6840        Length:6840        Length:6840       
##  1st Qu.:0.9062   Class :character   Class :character   Class :character  
##  Median :1.1923   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1.2268                                                           
##  3rd Qu.:1.5625                                                           
##  Max.   :2.7619                                                           
##      HM4                HM5                AM1                AM2           
##  Length:6840        Length:6840        Length:6840        Length:6840       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##      AM3                AM4                AM5                  MW      
##  Length:6840        Length:6840        Length:6840        Min.   : 1.0  
##  Class :character   Class :character   Class :character   1st Qu.:10.0  
##  Mode  :character   Mode  :character   Mode  :character   Median :19.5  
##                                                           Mean   :19.5  
##                                                           3rd Qu.:29.0  
##                                                           Max.   :38.0  
##  HTFormPtsStr       ATFormPtsStr         HTFormPts        ATFormPts     
##  Length:6840        Length:6840        Min.   : 0.000   Min.   : 0.000  
##  Class :character   Class :character   1st Qu.: 4.000   1st Qu.: 4.000  
##  Mode  :character   Mode  :character   Median : 6.000   Median : 6.000  
##                                        Mean   : 6.243   Mean   : 6.414  
##                                        3rd Qu.: 9.000   3rd Qu.: 9.000  
##                                        Max.   :15.000   Max.   :15.000  
##   HTWinStreak3      HTWinStreak5     HTLossStreak3    HTLossStreak5    
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.00000   Median :0.0000   Median :0.00000  
##  Mean   :0.06228   Mean   :0.01798   Mean   :0.0576   Mean   :0.01433  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.0000   Max.   :1.00000  
##   ATWinStreak3      ATWinStreak5     ATLossStreak3     ATLossStreak5    
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.00000   Median :0.00000   Median :0.00000  
##  Mean   :0.06287   Mean   :0.01652   Mean   :0.05102   Mean   :0.01023  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.00000   Max.   :1.00000  
##       HTGD               ATGD             DiffPts          DiffFormPts      
##  Min.   :-3.00000   Min.   :-3.33333   Min.   :-2.36364   Min.   :-2.25000  
##  1st Qu.:-0.50000   1st Qu.:-0.46429   1st Qu.:-0.45161   1st Qu.:-0.17647  
##  Median :-0.07692   Median :-0.05000   Median : 0.00000   Median : 0.00000  
##  Mean   :-0.00969   Mean   : 0.01454   Mean   :-0.01775   Mean   :-0.01809  
##  3rd Qu.: 0.40000   3rd Qu.: 0.42105   3rd Qu.: 0.42857   3rd Qu.: 0.15385  
##  Max.   : 4.00000   Max.   : 3.50000   Max.   : 2.28571   Max.   : 2.25000

Finding the average and maximum goals per match, and the match and team counts

# Goals in a match = home full-time goals (FTHG) + away full-time goals (FTAG)
# taken from the same row.
mean_goals <- with(data, mean(FTHG + FTAG))
max_goals <- with(data, max(FTHG + FTAG))
total_matches <- dim(data)[1]
# A club is counted once whether it appears as home side, away side, or both.
total_teams <- length(union(data$HomeTeam, data$AwayTeam))

cat("Average goals per match:", mean_goals, "\n", sep = " ")
## Average goals per match: 2.657749
cat("Maximum goals in a match:", max_goals, "\n", sep = " ")
## Maximum goals in a match: 11
cat("Total matches:", total_matches, "\n", sep = " ")
## Total matches: 6840
cat("Total teams:", total_teams, "\n", sep = " ")
## Total teams: 44

Team performance analysis

# Columns kept unchanged so any later code that reads them still works.
# NOTE(review): HTGS/ATGS/HTGC/ATGC look like running pre-match totals (they
# are 0 in the first rows while FTHG/FTAG are not), so these sums mix one
# side's goals scored with the other side's goals conceded. They are not used
# for the plots below; confirm their intended meaning before relying on them.
data$HTTotalGoals <- data$HTGS + data$ATGC
data$ATTotalGoals <- data$ATGS + data$HTGC

# One row per team per match, holding the full-time goals that team scored and
# conceded in that match. Home and away fixtures are stacked so each team's
# totals cover every match it played.
team_match_goals <- rbind(
  data.frame(Team = data$HomeTeam, Scored = data$FTHG, Conceded = data$FTAG),
  data.frame(Team = data$AwayTeam, Scored = data$FTAG, Conceded = data$FTHG)
)
# Collapse to a single row per team; plotting one bar per team avoids stacking
# thousands of per-match segments on top of each other.
team_goals <- aggregate(
  cbind(Scored, Conceded) ~ Team,
  data = team_match_goals,
  FUN = sum
)

Goals scored by each team

# Bars are ordered by value and flipped so all team names stay readable.
ggplot(team_goals, aes(x = reorder(Team, Scored), y = Scored)) +
  geom_col(fill = "blue") +
  coord_flip() +
  ggtitle("Total Goals Scored by each Team") +
  xlab("Team") + ylab("Total Goals")

Goals conceded by each team

ggplot(team_goals, aes(x = reorder(Team, Conceded), y = Conceded)) +
  geom_col(fill = "red") +
  coord_flip() +
  ggtitle("Total Goals Conceded by Each Team") +
  xlab("Team") + ylab("Total Goals")

Team form analysis

# Columns kept unchanged so any later code that reads them still works.
# NOTE(review): HTP/ATP never exceed 3 in summary(data) (max 2.74 / 2.76), so
# they already appear to be points per match; dividing by MW again would
# normalise twice. Confirm against the dataset documentation. The plots below
# therefore average HTP/ATP directly instead of using these columns.
data$HTAvgPoints <- data$HTP / data$MW
data$ATAvgPoints <- data$ATP / data$MW

# One mean per team. A stacked bar of per-match rows would show a sum, not
# the average the plot titles promise.
home_points <- aggregate(HTP ~ HomeTeam, data = data, FUN = mean)
away_points <- aggregate(ATP ~ AwayTeam, data = data, FUN = mean)

Average points for each team

# Mean points-per-match figure over the rows where the team is the home side.
ggplot(home_points, aes(x = reorder(HomeTeam, HTP), y = HTP)) +
  geom_col(fill = "brown") +
  coord_flip() +
  ggtitle("Average Points for Each Team") +
  xlab("Team") + ylab("Average Points")

Average points conceded by each team

# NOTE(review): this shows the away side's own mean points-per-match figure
# (ATP); the "conceded" wording in the title is kept from the original.
ggplot(away_points, aes(x = reorder(AwayTeam, ATP), y = ATP)) +
  geom_col(fill = "orange") +
  coord_flip() +
  ggtitle("Average Points Conceded by Each Team") +
  xlab("Team") + ylab("Average Points")

Goal difference analysis

# Per-match gap between the two sides' goal-difference figures, seen from the
# home side (HTGoalDiff) and from the away side (ATGoalDiff = -HTGoalDiff).
data$HTGoalDiff <- data$HTGD - data$ATGD
data$ATGoalDiff <- data$ATGD - data$HTGD

# Reduce to one mean per team before plotting. Stacking one signed value per
# match puts positive segments above zero and negative segments below it, so
# the resulting bar shows neither a net nor an average figure.
home_goal_diff <- aggregate(HTGoalDiff ~ HomeTeam, data = data, FUN = mean)
away_goal_diff <- aggregate(ATGoalDiff ~ AwayTeam, data = data, FUN = mean)

Goal difference of Home Teams

# Bars are ordered by value and flipped so all team names stay readable.
ggplot(home_goal_diff, aes(x = reorder(HomeTeam, HTGoalDiff), y = HTGoalDiff)) +
  geom_col(fill = "blue") +
  coord_flip() +
  ggtitle("Goal Difference for Home Teams") +
  xlab("Team") + ylab("Goal Difference")

Goal difference of Away Teams

ggplot(away_goal_diff, aes(x = reorder(AwayTeam, ATGoalDiff), y = ATGoalDiff)) +
  geom_col(fill = "red") +
  coord_flip() +
  ggtitle("Goal Difference for Away Teams") +
  xlab("Team") + ylab("Goal Difference")

Linear model to predict the number of goals scored by the home team

# Ordinary least-squares fit of home full-time goals (FTHG) on six predictors
# taken from the same match row.
model <- lm(
  formula = FTHG ~ HTGS + ATGC + HTFormPts + ATFormPts + HTWinStreak3 +
    ATLossStreak3,
  data = data
)
# Coefficient table, residual quantiles and fit statistics.
summary(object = model)
## 
## Call:
## lm(formula = FTHG ~ HTGS + ATGC + HTFormPts + ATFormPts + HTWinStreak3 + 
##     ATLossStreak3, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6961 -0.9328 -0.2326  0.6829  7.3374 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.474604   0.044699  32.990  < 2e-16 ***
## HTGS           0.011192   0.001623   6.894 5.93e-12 ***
## ATGC          -0.003113   0.001537  -2.025   0.0429 *  
## HTFormPts      0.048237   0.005447   8.856  < 2e-16 ***
## ATFormPts     -0.067724   0.004701 -14.405  < 2e-16 ***
## HTWinStreak3  -0.043024   0.071178  -0.604   0.5456    
## ATLossStreak3 -0.169904   0.072971  -2.328   0.0199 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.26 on 6833 degrees of freedom
## Multiple R-squared:  0.05889,    Adjusted R-squared:  0.05806 
## F-statistic: 71.26 on 6 and 6833 DF,  p-value: < 2.2e-16
# In-sample predictions: the model is scored on the same rows it was fitted to.
predictions <- predict(object = model, newdata = data)

Visualization comparing actual and predicted home team goals

# Scatter of observed home goals (x) against the model's predictions (y).
plot(
  x = data$FTHG,
  y = predictions,
  main = "Actual vs Predicted Home Team Goals",
  xlab = "Actual Goals",
  ylab = "Predicted Goals",
  col = "blue"
)

# Error metrics on the fitted rows, in goals: root-mean-squared error and
# mean absolute error between predictions and observed home goals.
rmse <- sqrt(mean((predictions - data$FTHG)^2))
mae <- mean(abs(predictions - data$FTHG))
print(rmse)
## [1] 1.259026
print(mae)
## [1] 0.9941052

Interpretation:

Audience Benefits: