library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(ggrepel)
library(dplyr)
library(ggplot2)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
# Read the match dataset from a local CSV into a data frame.
# NOTE(review): the path is absolute and machine-specific; adjust it when
# running this script on another machine.
csv_path <- "C:/Downloads/final_dataset.csv"
data <- read.csv(file = csv_path)
# Preview the first rows to check that the columns were parsed as expected.
head(x = data)
## X Date HomeTeam AwayTeam FTHG FTAG FTR HTGS ATGS HTGC ATGC HTP ATP
## 1 0 19/08/00 Charlton Man City 4 0 H 0 0 0 0 0 0
## 2 1 19/08/00 Chelsea West Ham 4 2 H 0 0 0 0 0 0
## 3 2 19/08/00 Coventry Middlesbrough 1 3 NH 0 0 0 0 0 0
## 4 3 19/08/00 Derby Southampton 2 2 NH 0 0 0 0 0 0
## 5 4 19/08/00 Leeds Everton 2 0 H 0 0 0 0 0 0
## 6 5 19/08/00 Leicester Aston Villa 0 0 NH 0 0 0 0 0 0
## HM1 HM2 HM3 HM4 HM5 AM1 AM2 AM3 AM4 AM5 MW HTFormPtsStr ATFormPtsStr
## 1 M M M M M M M M M M 1 MMMMM MMMMM
## 2 M M M M M M M M M M 1 MMMMM MMMMM
## 3 M M M M M M M M M M 1 MMMMM MMMMM
## 4 M M M M M M M M M M 1 MMMMM MMMMM
## 5 M M M M M M M M M M 1 MMMMM MMMMM
## 6 M M M M M M M M M M 1 MMMMM MMMMM
## HTFormPts ATFormPts HTWinStreak3 HTWinStreak5 HTLossStreak3 HTLossStreak5
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 0 0 0 0 0 0
## 6 0 0 0 0 0 0
## ATWinStreak3 ATWinStreak5 ATLossStreak3 ATLossStreak5 HTGD ATGD DiffPts
## 1 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0
## DiffFormPts
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
# Column-by-column overview: min/quartiles/mean/max for numeric columns and
# length/class/mode for character columns.
summary(data)
## X Date HomeTeam AwayTeam
## Min. : 0 Length:6840 Length:6840 Length:6840
## 1st Qu.:1710 Class :character Class :character Class :character
## Median :3420 Mode :character Mode :character Mode :character
## Mean :3420
## 3rd Qu.:5129
## Max. :6839
## FTHG FTAG FTR HTGS
## Min. :0.000 Min. :0.00 Length:6840 Min. : 0.00
## 1st Qu.:1.000 1st Qu.:0.00 Class :character 1st Qu.: 11.00
## Median :1.000 Median :1.00 Mode :character Median : 23.00
## Mean :1.527 Mean :1.13 Mean : 24.42
## 3rd Qu.:2.000 3rd Qu.:2.00 3rd Qu.: 35.00
## Max. :9.000 Max. :7.00 Max. :102.00
## ATGS HTGC ATGC HTP
## Min. : 0.00 Min. : 0.0 Min. : 0.00 Min. :0.0000
## 1st Qu.: 11.00 1st Qu.:11.0 1st Qu.:11.00 1st Qu.:0.8889
## Median : 23.00 Median :23.0 Median :23.00 Median :1.1724
## Mean : 24.51 Mean :24.5 Mean :24.35 Mean :1.2090
## 3rd Qu.: 35.00 3rd Qu.:36.0 3rd Qu.:36.00 3rd Qu.:1.5556
## Max. :105.00 Max. :85.0 Max. :82.00 Max. :2.7368
## ATP HM1 HM2 HM3
## Min. :0.0000 Length:6840 Length:6840 Length:6840
## 1st Qu.:0.9062 Class :character Class :character Class :character
## Median :1.1923 Mode :character Mode :character Mode :character
## Mean :1.2268
## 3rd Qu.:1.5625
## Max. :2.7619
## HM4 HM5 AM1 AM2
## Length:6840 Length:6840 Length:6840 Length:6840
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## AM3 AM4 AM5 MW
## Length:6840 Length:6840 Length:6840 Min. : 1.0
## Class :character Class :character Class :character 1st Qu.:10.0
## Mode :character Mode :character Mode :character Median :19.5
## Mean :19.5
## 3rd Qu.:29.0
## Max. :38.0
## HTFormPtsStr ATFormPtsStr HTFormPts ATFormPts
## Length:6840 Length:6840 Min. : 0.000 Min. : 0.000
## Class :character Class :character 1st Qu.: 4.000 1st Qu.: 4.000
## Mode :character Mode :character Median : 6.000 Median : 6.000
## Mean : 6.243 Mean : 6.414
## 3rd Qu.: 9.000 3rd Qu.: 9.000
## Max. :15.000 Max. :15.000
## HTWinStreak3 HTWinStreak5 HTLossStreak3 HTLossStreak5
## Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.0000 Median :0.00000
## Mean :0.06228 Mean :0.01798 Mean :0.0576 Mean :0.01433
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :1.00000
## ATWinStreak3 ATWinStreak5 ATLossStreak3 ATLossStreak5
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.06287 Mean :0.01652 Mean :0.05102 Mean :0.01023
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.00000
## HTGD ATGD DiffPts DiffFormPts
## Min. :-3.00000 Min. :-3.33333 Min. :-2.36364 Min. :-2.25000
## 1st Qu.:-0.50000 1st Qu.:-0.46429 1st Qu.:-0.45161 1st Qu.:-0.17647
## Median :-0.07692 Median :-0.05000 Median : 0.00000 Median : 0.00000
## Mean :-0.00969 Mean : 0.01454 Mean :-0.01775 Mean :-0.01809
## 3rd Qu.: 0.40000 3rd Qu.: 0.42105 3rd Qu.: 0.42857 3rd Qu.: 0.15385
## Max. : 4.00000 Max. : 3.50000 Max. : 2.28571 Max. : 2.25000
# Headline figures for the dataset: goals per match, number of matches and
# number of distinct teams.
match_goals <- data$FTHG + data$FTAG  # FTHG + FTAG for each row
mean_goals <- mean(match_goals)
max_goals <- max(match_goals)
total_matches <- nrow(data)
# A team is counted once whether it appears in HomeTeam, AwayTeam, or both.
total_teams <- length(union(data$HomeTeam, data$AwayTeam))
cat("Average goals per match:", mean_goals, "\n")
## Average goals per match: 2.657749
cat("Maximum goals in a match:", max_goals, "\n")
## Maximum goals in a match: 11
cat("Total matches:", total_matches, "\n")
## Total matches: 6840
cat("Total teams:", total_teams, "\n")
## Total teams: 44
# Legacy columns, kept so that any code reading them keeps working. They add
# HTGS + ATGC (and ATGS + HTGC) row by row and are no longer used for plotting.
data$HTTotalGoals <- data$HTGS + data$ATGC
data$ATTotalGoals <- data$ATGS + data$HTGC

# Per-team goal totals built from the goals of each match (FTHG / FTAG).
# The earlier plots stacked the HTGS/ATGC-based columns for every row with
# stat = "identity". Those inputs are 0 in matchweek 1 and reach ~100, so they
# appear to be running totals (TODO confirm against the dataset description);
# summing them over matches does not give a team's goal count, and the
# "conceded" plot was built from away goals scored + home goals conceded.
# Each match contributes two rows below: one for the home side and one for
# the away side.
team_goals <- bind_rows(
  tibble(Team = data$HomeTeam, Scored = data$FTHG, Conceded = data$FTAG),
  tibble(Team = data$AwayTeam, Scored = data$FTAG, Conceded = data$FTHG)
) %>%
  group_by(Team) %>%
  summarise(
    TotalScored = sum(Scored),
    TotalConceded = sum(Conceded),
    .groups = "drop"
  )

# Horizontal, sorted bars keep the team names readable.
ggplot(team_goals, aes(x = reorder(Team, TotalScored), y = TotalScored)) +
  geom_col(fill = "blue") +
  coord_flip() +
  ggtitle("Total Goals Scored by each Team") +
  xlab("Team") + ylab("Total Goals")
ggplot(team_goals, aes(x = reorder(Team, TotalConceded), y = TotalConceded)) +
  geom_col(fill = "red") +
  coord_flip() +
  ggtitle("Total Goals Conceded by Each Team") +
  xlab("Team") + ylab("Total Goals")
# HTP / ATP already range from 0 to about 2.76 (see the summary(data) output),
# i.e. they are on a points-per-match scale, so they are used as-is rather
# than being divided by the matchweek (MW) a second time.
data$HTAvgPoints <- data$HTP
data$ATAvgPoints <- data$ATP

# One bar per team showing the mean over that team's rows. The earlier
# stat = "identity" bars stacked one segment per match, so their height was a
# sum over matches rather than an average.
ggplot(data, aes(x = HomeTeam, y = HTAvgPoints)) +
  stat_summary(fun = mean, geom = "bar", fill = "brown") +
  coord_flip() +
  ggtitle("Average Points for Each Team") +
  xlab("Team") + ylab("Average Points")
# ATP is plotted per away team; the previous title called these points
# "conceded", which does not match the column being plotted.
ggplot(data, aes(x = AwayTeam, y = ATAvgPoints)) +
  stat_summary(fun = mean, geom = "bar", fill = "orange") +
  coord_flip() +
  ggtitle("Average Points for Each Away Team") +
  xlab("Team") + ylab("Average Points")
# Relative goal difference for each match: the home side's HTGD minus the away
# side's ATGD, and the same quantity from the away side's point of view.
data$HTGoalDiff <- data$HTGD - data$ATGD
data$ATGoalDiff <- data$ATGD - data$HTGD

# Plot the per-team mean. With stat = "identity" every match became its own
# bar segment, and positive and negative segments were stacked in opposite
# directions, so the bars did not represent a goal difference.
ggplot(data, aes(x = HomeTeam, y = HTGoalDiff)) +
  stat_summary(fun = mean, geom = "bar", fill = "blue") +
  coord_flip() +
  ggtitle("Goal Difference for Home Teams") +
  xlab("Team") + ylab("Mean Goal Difference")
ggplot(data, aes(x = AwayTeam, y = ATGoalDiff)) +
  stat_summary(fun = mean, geom = "bar", fill = "red") +
  coord_flip() +
  ggtitle("Goal Difference for Away Teams") +
  xlab("Team") + ylab("Mean Goal Difference")
# Ordinary least squares fit of FTHG on six predictors taken from `data`.
# The formula is written inline so that the printed Call shows it in full.
model <- lm(
  formula = FTHG ~ HTGS + ATGC + HTFormPts + ATFormPts + HTWinStreak3 +
    ATLossStreak3,
  data = data
)
# Coefficient table, residual quantiles and overall fit statistics.
summary(model)
##
## Call:
## lm(formula = FTHG ~ HTGS + ATGC + HTFormPts + ATFormPts + HTWinStreak3 +
## ATLossStreak3, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6961 -0.9328 -0.2326 0.6829 7.3374
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.474604 0.044699 32.990 < 2e-16 ***
## HTGS 0.011192 0.001623 6.894 5.93e-12 ***
## ATGC -0.003113 0.001537 -2.025 0.0429 *
## HTFormPts 0.048237 0.005447 8.856 < 2e-16 ***
## ATFormPts -0.067724 0.004701 -14.405 < 2e-16 ***
## HTWinStreak3 -0.043024 0.071178 -0.604 0.5456
## ATLossStreak3 -0.169904 0.072971 -2.328 0.0199 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.26 on 6833 degrees of freedom
## Multiple R-squared: 0.05889, Adjusted R-squared: 0.05806
## F-statistic: 71.26 on 6 and 6833 DF, p-value: < 2.2e-16
# In-sample evaluation: predictions are made for the same rows the model was
# fitted on, so the error figures below are not out-of-sample estimates.
predictions <- predict(object = model, newdata = data)
plot(
  x = data$FTHG, y = predictions,
  main = "Actual vs Predicted Home Team Goals",
  xlab = "Actual Goals", ylab = "Predicted Goals",
  col = "blue"
)
# Both error metrics are derived from one vector of prediction errors.
prediction_error <- data$FTHG - predictions
rmse <- sqrt(mean(prediction_error^2))
mae <- mean(abs(prediction_error))
rmse
## [1] 1.259026
mae
## [1] 0.9941052
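The model suggests that the home team’s goals scored (HTGS), the away team’s goals conceded (ATGC), both teams’ recent form points (HTFormPts, ATFormPts), and the away team’s losing streak (ATLossStreak3) are statistically significant predictors of the number of goals scored by the home team, whereas the home team’s winning streak (HTWinStreak3) is not (p ≈ 0.55).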
However, the model’s overall explanatory power is limited: the R-squared of about 0.059 means it explains only about 6% of the variance in home-team goals.
Team managers and fantasy game players can use the model to gain insights into potential factors influencing a team’s goal-scoring performance.
Fans might find it interesting to understand which aspects of team performance are statistically associated with goal-scoring.