Forecasting projects
Install packages
install.packages("tidyverse", repos = "https://cloud.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//RtmpwesZom/downloaded_packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
install.packages("dyplyr", repos = "https://cloud.r-project.org")
## Warning: package 'dyplyr' is not available for this version of R
##
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
library(dplyr)
install.packages("corrplot", repos = "https://cloud.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//RtmpwesZom/downloaded_packages
library(corrplot)
## corrplot 0.92 loaded
install.packages("ggplot2", repos = "https://cloud.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//RtmpwesZom/downloaded_packages
library(ggplot2)
library(psych)
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
Read data
Dataset 1 for the spi
global_ranking <- read_csv("gbl_rankings2.csv")
## Rows: 641 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): name, league
## dbl (5): rank, prev_rank, off, def, spi
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Dataset 2 for recent matches
recent_matches <- read_csv("latest_matches.csv")
## Rows: 11737 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): league, team1, team2
## dbl (19): season, league_id, spi1, spi2, prob1, prob2, probtie, proj_score1...
## date (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Dataset 3 for past matches
hist_matches <- read_csv("hist_matches.csv")
## Rows: 68913 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): league, team1, team2
## dbl (19): season, league_id, spi1, spi2, prob1, prob2, probtie, proj_score1...
## date (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Cleaning the dataset 1 for Global ranking (Deleting rank and previus
rank variable)
global_ranking2 <- global_ranking[,-1:-2]
Deleting all the other teams outside the spanish league.
spanish_ranking <- global_ranking2 %>% filter(grepl('Spanish Primera Division', league))
Cleaning dataset 2 for latest matchest (Selecting only the team from
the spanish primera division) It only have the 22 season.
lst_season <- recent_matches %>% filter(grepl('1869', league_id))
Cleaning dataset 3 for history of matches (Selecting only the teams
from the Spanish primera divison)
all_seasons <- hist_matches %>% filter(grepl('1869', league_id))
Exploring my datasets
Creating a summary of statistics for spanish ranking
describe(spanish_ranking)
## vars n mean sd median trimmed mad min max range skew kurtosis
## name* 1 20 10.50 5.92 10.50 10.50 7.41 1.00 20.00 19.00 0.00 -1.38
## league* 2 20 1.00 0.00 1.00 1.00 0.00 1.00 1.00 0.00 NaN NaN
## off 3 20 1.83 0.30 1.73 1.78 0.20 1.51 2.56 1.05 1.06 0.03
## def 4 20 0.84 0.23 0.81 0.84 0.22 0.43 1.27 0.84 0.16 -0.93
## spi 5 20 67.71 9.21 67.03 67.00 11.20 53.94 86.40 32.46 0.46 -0.87
## se
## name* 1.32
## league* 0.00
## off 0.07
## def 0.05
## spi 2.06
Creating a summary of statistics for all_seasons
describe(all_seasons)
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## vars n mean sd median trimmed mad min max
## season 1 2660 2019.00 2.00 2019.00 2019.00 2.97 2016.00 2022.00
## date 2 2660 NaN NA NA NaN NA Inf -Inf
## league_id 3 2660 1869.00 0.00 1869.00 1869.00 0.00 1869.00 1869.00
## league* 4 2660 1.00 0.00 1.00 1.00 0.00 1.00 1.00
## team1* 5 2660 15.59 9.07 16.00 15.54 11.86 1.00 30.00
## team2* 6 2660 15.59 9.07 16.00 15.54 11.86 1.00 30.00
## spi1 7 2660 71.77 9.53 70.82 71.43 8.64 43.58 96.57
## spi2 8 2660 71.73 9.53 70.76 71.38 8.52 43.91 96.69
## prob1 9 2660 0.46 0.17 0.45 0.45 0.15 0.06 0.98
## prob2 10 2660 0.29 0.15 0.27 0.28 0.13 0.00 0.83
## probtie 11 2660 0.25 0.05 0.26 0.26 0.03 0.02 0.35
## proj_score1 12 2660 1.51 0.47 1.41 1.44 0.33 0.62 4.03
## proj_score2 13 2660 1.11 0.46 1.02 1.05 0.34 0.20 3.42
## importance1 14 2612 35.03 25.32 33.05 33.28 26.32 0.00 100.00
## importance2 15 2612 34.37 24.78 32.55 32.73 26.46 0.00 100.00
## score1 16 2660 1.48 1.26 1.00 1.34 1.48 0.00 8.00
## score2 17 2660 1.13 1.11 1.00 0.97 1.48 0.00 6.00
## xg1 18 2658 1.53 0.87 1.40 1.44 0.80 0.08 7.04
## xg2 19 2658 1.16 0.77 0.99 1.07 0.68 0.00 5.02
## nsxg1 20 2658 1.39 0.71 1.27 1.31 0.58 0.10 6.89
## nsxg2 21 2658 1.08 0.60 0.97 1.01 0.50 0.02 5.92
## adj_score1 22 2658 1.48 1.23 1.05 1.36 1.56 0.00 7.59
## adj_score2 23 2658 1.14 1.09 1.05 0.99 1.56 0.00 6.30
## range skew kurtosis se
## season 6.00 0.00 -1.25 0.04
## date -Inf NA NA NA
## league_id 0.00 NaN NaN 0.00
## league* 0.00 NaN NaN 0.00
## team1* 29.00 0.00 -1.33 0.18
## team2* 29.00 0.00 -1.33 0.18
## spi1 52.99 0.32 -0.13 0.18
## spi2 52.78 0.32 -0.12 0.18
## prob1 0.91 0.36 0.20 0.00
## prob2 0.83 0.80 0.69 0.00
## probtie 0.33 -1.60 2.90 0.00
## proj_score1 3.41 1.55 3.19 0.01
## proj_score2 3.22 1.35 2.65 0.01
## importance1 100.00 0.48 -0.35 0.50
## importance2 100.00 0.49 -0.29 0.48
## score1 8.00 0.96 1.10 0.02
## score2 6.00 1.13 1.44 0.02
## xg1 6.96 1.13 2.17 0.02
## xg2 5.02 1.24 2.04 0.01
## nsxg1 6.79 1.53 4.42 0.01
## nsxg2 5.90 1.65 5.78 0.01
## adj_score1 7.59 0.86 0.89 0.02
## adj_score2 6.30 1.04 1.25 0.02
Checking for missing values spanish_ranking
sum(is.na(spanish_ranking))
## [1] 0
Checking for missing values all_seasons
sum(is.na(all_seasons))
## [1] 108
# Tell us which rows and columns are missing
# which(is.na(all_seasons), arr.ind = TRUE)
clean_all_seasons <- na.omit(all_seasons)
Creating a correlation table for clean_all_seasons
numeric_data <- clean_all_seasons %>% select_if(is.numeric)
cm <- cor(numeric_data, use = "complete.obs")
## Warning in cor(numeric_data, use = "complete.obs"): the standard deviation is
## zero
corrplot(cm, method = "circle")

Creating new features for the model
clean_all_seasons$date <- ymd(clean_all_seasons$date)
clean_all_seasons$season <- as.factor(clean_all_seasons$season)
Calculting the points for each game
clean_all_seasons <- clean_all_seasons %>%
mutate(team1_points = case_when(
score1 > score2 ~ 3,
score1 == score2 ~ 1,
TRUE ~ 0 ),
team2_points = case_when(
score2 > score1 ~ 3,
score1 == score2 ~ 1,
TRUE ~ 0))
Aggregate the points by team and season
team_points <- clean_all_seasons %>%
select(season, team1, team1_points) %>%
rename(team = team1, points = team1_points) %>%
bind_rows(
clean_all_seasons %>%
select(season, team2, team2_points) %>%
rename(team = team2, points = team2_points)
) %>%
group_by(season, team) %>%
summarize(total_points = sum(points))
## `summarise()` has grouped output by 'season'. You can override using the
## `.groups` argument.
Creating even more features to add the vairables from
clean_all_season to the modeldata ( We still have to add more
variables)
avg_goals_scored <- clean_all_seasons %>%
group_by(team1) %>%
summarize(avg_goals_scored = mean(score1, na.rm = TRUE)) %>%
rename(team = team1)
avg_goals_conceded <- clean_all_seasons %>%
group_by(team2) %>%
summarize(avg_goals_conceded = mean(score2, na.rm = TRUE)) %>%
rename(team = team2)
# Merging these features with the spanish_ranking dataset
model_data <- spanish_ranking %>%
left_join(avg_goals_scored, by = c("name" = "team")) %>%
left_join(avg_goals_conceded, by = c("name" = "team"))
# Now merge with team_points to get total_points
model_data <- merge(model_data, team_points, by.x = "name", by.y = "team")
We have to join both of our variable to reduce the
multicolinearity.
# Creating a new variable, for example: goal difference
model_data$goal_difference <- model_data$avg_goals_scored - model_data$avg_goals_conceded
Creating lag variables
model_data <- model_data %>%
arrange(name, season) %>% # Ensure data is ordered first
group_by(name) %>%
mutate(prev_season_points = lag(total_points, n = 1)) %>%
ungroup()
Cleaning the na values
model_data <- na.omit(model_data)
Splitting the data to test on the 2022 season
train_data <- model_data %>% filter(season != "2022")
test_data <- model_data %>% filter(season == "2022")
Creating the first model using a simple linear regression as our
starting point. ( Other models that we want to applay are Random forest
and maybe time series regression since we have yearly data)
Model 1 Linear regression
Predict using the test data
model1_predictions <- predict(lm_model, newdata = test_data)
actual_points <- test_data$total_points
Calculate the RMSE
rmse <- sqrt(mean(( model1_predictions - actual_points)^2))
print(paste("RMSE:", rmse))
## [1] "RMSE: 7.24784474255032"
Creating a graph to vizualize the results
# Create a dataframe for plotting
plot_data <- data.frame(Actual = actual_points, Predicted = model1_predictions)
# Scatter plot
ggplot(plot_data, aes(x = Actual, y = Predicted)) +
geom_point(color = "blue") +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
theme_minimal() +
labs(title = "Actual vs Predicted Total Points", x = "Actual Total Points", y = "Predicted Total Points")

Creating a table to have a more clear view of the results
comparison_table <- data.frame(Actual = actual_points, Predicted = model1_predictions, Team = test_data$name)
comparison_table$Difference = comparison_table$Actual - comparison_table$Predicted
print(comparison_table)
## Actual Predicted Team Difference
## 1 51 56.77623 Athletic Bilbao -5.7762291
## 2 77 69.11422 Atletico Madrid 7.8857757
## 3 88 80.36174 Barcelona 7.6382598
## 4 42 39.33600 Cadiz 2.6639991
## 5 43 50.36756 Celta Vigo -7.3675594
## 6 25 40.66319 Elche -15.6631871
## 7 37 41.10839 Espanyol -4.1083919
## 8 42 45.99088 Getafe -3.9908816
## 9 49 44.16239 Girona FC 4.8376104
## 10 50 43.13161 Mallorca 6.8683860
## 11 53 46.81570 Osasuna 6.1843049
## 12 49 49.14726 Rayo Vallecano -0.1472558
## 13 60 54.72878 Real Betis 5.2712154
## 14 78 81.69105 Real Madrid -3.6910534
## 15 71 58.26109 Real Sociedad 12.7389134
## 16 40 35.10094 Real Valladolid 4.8990626
## 17 49 57.90245 Sevilla FC -8.9024548
## 18 42 51.05519 Valencia -9.0551946
## 19 64 61.37394 Villarreal 2.6260557
Run a regular regression and run the model without doing the test
split to see how well it predicts the next season.
Creating grearter weight for the lastest season:
model_data$latest_season <- if_else(model_data$season == "2022", 1, 0)
# Filter out the 2022 data
model_data_2022 <- model_data %>% filter(season == "2022")
# Prepare the 2023 prediction data
model_data_2023 <- model_data_2022 %>%
mutate(season = "2023",
latest_season = 1, # Since 2023 is the latest season
total_points = NA) # total_points is what we want to predict
Creating the model
lm_model_f <- lm(total_points ~ off + def + spi + goal_difference + latest_season + prev_season_points , data = model_data )
summary(lm_model_f)
##
## Call:
## lm(formula = total_points ~ off + def + spi + goal_difference +
## latest_season + prev_season_points, data = model_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.4062 -5.6877 -0.4172 5.3925 21.7595
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 119.97547 78.60227 1.526 0.1309
## off 51.20417 22.11006 2.316 0.0231 *
## def -62.44058 32.19721 -1.939 0.0560 .
## spi -1.84486 1.34830 -1.368 0.1751
## goal_difference 10.30919 10.91494 0.945 0.3478
## latest_season 0.21214 2.22922 0.095 0.9244
## prev_season_points 0.24435 0.09895 2.469 0.0157 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.454 on 80 degrees of freedom
## Multiple R-squared: 0.7246, Adjusted R-squared: 0.7039
## F-statistic: 35.08 on 6 and 80 DF, p-value: < 2.2e-16
# Predictions for 2023
predictions_2023 <- predict(lm_model_f, newdata = model_data_2023)
# Adding predictions to the 2023 data
model_data_2023$predicted_points <- predictions_2023
Model_results <- data.frame(Predicted2023 = model_data_2023$predicted_points, Team = model_data_2023$name)
print(Model_results)
## Predicted2023 Team
## 1 57.29343 Athletic Bilbao
## 2 69.55057 Atletico Madrid
## 3 81.32522 Barcelona
## 4 39.85911 Cadiz
## 5 50.28450 Celta Vigo
## 6 40.36242 Elche
## 7 40.93052 Espanyol
## 8 46.03568 Getafe
## 9 45.44971 Girona FC
## 10 42.68106 Mallorca
## 11 47.33621 Osasuna
## 12 47.80430 Rayo Vallecano
## 13 54.32746 Real Betis
## 14 82.60059 Real Madrid
## 15 59.50566 Real Sociedad
## 16 34.99441 Real Valladolid
## 17 57.65568 Sevilla FC
## 18 51.14048 Valencia
## 19 60.86297 Villarreal
testing a third model using the averge of all our predictions
Creating the model
lm_model_f2 <- lm(total_points ~ off + def + spi + goal_difference + latest_season + prev_season_points , data = model_data)
summary(lm_model_f2)
##
## Call:
## lm(formula = total_points ~ off + def + spi + goal_difference +
## latest_season + prev_season_points, data = model_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.4062 -5.6877 -0.4172 5.3925 21.7595
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 119.97547 78.60227 1.526 0.1309
## off 51.20417 22.11006 2.316 0.0231 *
## def -62.44058 32.19721 -1.939 0.0560 .
## spi -1.84486 1.34830 -1.368 0.1751
## goal_difference 10.30919 10.91494 0.945 0.3478
## latest_season 0.21214 2.22922 0.095 0.9244
## prev_season_points 0.24435 0.09895 2.469 0.0157 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.454 on 80 degrees of freedom
## Multiple R-squared: 0.7246, Adjusted R-squared: 0.7039
## F-statistic: 35.08 on 6 and 80 DF, p-value: < 2.2e-16
model3_predictions <- predict(lm_model_f2)
comparison_table3 <- data.frame( Predicted = model3_predictions, Team = model_data$name)
print(comparison_table3)
## Predicted Team
## 1 56.83694 Athletic Bilbao
## 2 54.14911 Athletic Bilbao
## 3 56.59259 Athletic Bilbao
## 4 56.10390 Athletic Bilbao
## 5 54.88216 Athletic Bilbao
## 6 57.29343 Athletic Bilbao
## 7 70.07148 Atletico Madrid
## 8 70.56017 Atletico Madrid
## 9 70.56017 Atletico Madrid
## 10 69.09408 Atletico Madrid
## 11 73.00365 Atletico Madrid
## 12 69.55057 Atletico Madrid
## 13 82.82351 Barcelona
## 14 85.26699 Barcelona
## 15 84.53395 Barcelona
## 16 83.31221 Barcelona
## 17 82.33482 Barcelona
## 18 81.32522 Barcelona
## 19 40.86871 Cadiz
## 20 39.85911 Cadiz
## 21 48.85062 Celta Vigo
## 22 50.80540 Celta Vigo
## 23 48.85062 Celta Vigo
## 24 47.87323 Celta Vigo
## 25 51.78280 Celta Vigo
## 26 50.28450 Celta Vigo
## 27 39.41724 Elche
## 28 40.36242 Elche
## 29 43.65056 Espanyol
## 30 42.42882 Espanyol
## 31 43.40621 Espanyol
## 32 36.56447 Espanyol
## 33 40.93052 Espanyol
## 34 49.00007 Getafe
## 35 50.71050 Getafe
## 36 49.48876 Getafe
## 37 45.57919 Getafe
## 38 46.03568 Getafe
## 39 48.41409 Girona FC
## 40 45.44971 Girona FC
## 41 41.00283 Mallorca
## 42 42.68106 Mallorca
## 43 40.28233 Osasuna
## 44 48.34581 Osasuna
## 45 46.39103 Osasuna
## 46 47.33621 Osasuna
## 47 45.14867 Rayo Vallecano
## 48 47.80430 Rayo Vallecano
## 49 46.78488 Real Betis
## 50 52.16054 Real Betis
## 51 50.45010 Real Betis
## 52 48.25097 Real Betis
## 53 53.13793 Real Betis
## 54 54.32746 Real Betis
## 55 81.41106 Real Madrid
## 56 79.94497 Real Madrid
## 57 77.99019 Real Madrid
## 58 82.63280 Real Madrid
## 59 81.16671 Real Madrid
## 60 82.60059 Real Madrid
## 61 58.07178 Real Sociedad
## 62 56.11700 Real Sociedad
## 63 56.36134 Real Sociedad
## 64 57.82743 Real Sociedad
## 65 59.29352 Real Sociedad
## 66 59.50566 Real Sociedad
## 67 37.22575 Real Valladolid
## 68 37.47010 Real Valladolid
## 69 34.99441 Real Valladolid
## 70 57.19919 Sevilla FC
## 71 53.77832 Sevilla FC
## 72 54.75572 Sevilla FC
## 73 57.44354 Sevilla FC
## 74 58.17659 Sevilla FC
## 75 57.65568 Sevilla FC
## 76 48.24051 Valencia
## 77 56.30399 Valencia
## 78 54.10486 Valencia
## 79 52.15008 Valencia
## 80 49.70660 Valencia
## 81 51.14048 Valencia
## 82 60.16213 Villarreal
## 83 61.13952 Villarreal
## 84 56.98561 Villarreal
## 85 60.16213 Villarreal
## 86 60.40648 Villarreal
## 87 60.86297 Villarreal
# Calculate the average predicted points for each team
average_predictions <- comparison_table3 %>%
group_by(Team) %>%
summarize(Average_Predicted = mean(Predicted, na.rm = TRUE))
# View the resulting table
print(average_predictions)
## # A tibble: 19 × 2
## Team Average_Predicted
## <chr> <dbl>
## 1 Athletic Bilbao 56.0
## 2 Atletico Madrid 70.5
## 3 Barcelona 83.3
## 4 Cadiz 40.4
## 5 Celta Vigo 49.7
## 6 Elche 39.9
## 7 Espanyol 41.4
## 8 Getafe 48.2
## 9 Girona FC 46.9
## 10 Mallorca 41.8
## 11 Osasuna 45.6
## 12 Rayo Vallecano 46.5
## 13 Real Betis 50.9
## 14 Real Madrid 81.0
## 15 Real Sociedad 57.9
## 16 Real Valladolid 36.6
## 17 Sevilla FC 56.5
## 18 Valencia 51.9
## 19 Villarreal 60.0