Forecasting projects

Install packages

install.packages("tidyverse", repos = "https://cloud.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//RtmpwesZom/downloaded_packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
install.packages("dyplyr", repos = "https://cloud.r-project.org")
## Warning: package 'dyplyr' is not available for this version of R
## 
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
library(dplyr)
install.packages("corrplot", repos = "https://cloud.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//RtmpwesZom/downloaded_packages
library(corrplot)
## corrplot 0.92 loaded
install.packages("ggplot2", repos = "https://cloud.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//RtmpwesZom/downloaded_packages
library(ggplot2)
library(psych)
## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

Read data

Dataset 1 for the spi

global_ranking <- read_csv("gbl_rankings2.csv")
## Rows: 641 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): name, league
## dbl (5): rank, prev_rank, off, def, spi
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Dataset 2 for recent matches

recent_matches <- read_csv("latest_matches.csv")
## Rows: 11737 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (3): league, team1, team2
## dbl  (19): season, league_id, spi1, spi2, prob1, prob2, probtie, proj_score1...
## date  (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Dataset 3 for past matches

hist_matches <- read_csv("hist_matches.csv")
## Rows: 68913 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (3): league, team1, team2
## dbl  (19): season, league_id, spi1, spi2, prob1, prob2, probtie, proj_score1...
## date  (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Cleaning the dataset 1 for Global ranking (Deleting rank and previus rank variable)

global_ranking2 <- global_ranking[,-1:-2]

Deleting all the other teams outside the spanish league.

spanish_ranking <- global_ranking2 %>% filter(grepl('Spanish Primera Division', league))

Cleaning dataset 2 for latest matchest (Selecting only the team from the spanish primera division) It only have the 22 season.

lst_season <- recent_matches %>% filter(grepl('1869', league_id))

Cleaning dataset 3 for history of matches (Selecting only the teams from the Spanish primera divison)

all_seasons <- hist_matches %>% filter(grepl('1869', league_id))

Exploring my datasets

Creating a summary of statistics for spanish ranking

describe(spanish_ranking)
##         vars  n  mean   sd median trimmed   mad   min   max range skew kurtosis
## name*      1 20 10.50 5.92  10.50   10.50  7.41  1.00 20.00 19.00 0.00    -1.38
## league*    2 20  1.00 0.00   1.00    1.00  0.00  1.00  1.00  0.00  NaN      NaN
## off        3 20  1.83 0.30   1.73    1.78  0.20  1.51  2.56  1.05 1.06     0.03
## def        4 20  0.84 0.23   0.81    0.84  0.22  0.43  1.27  0.84 0.16    -0.93
## spi        5 20 67.71 9.21  67.03   67.00 11.20 53.94 86.40 32.46 0.46    -0.87
##           se
## name*   1.32
## league* 0.00
## off     0.07
## def     0.05
## spi     2.06

Creating a summary of statistics for all_seasons

describe(all_seasons)
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
##             vars    n    mean    sd  median trimmed   mad     min     max
## season         1 2660 2019.00  2.00 2019.00 2019.00  2.97 2016.00 2022.00
## date           2 2660     NaN    NA      NA     NaN    NA     Inf    -Inf
## league_id      3 2660 1869.00  0.00 1869.00 1869.00  0.00 1869.00 1869.00
## league*        4 2660    1.00  0.00    1.00    1.00  0.00    1.00    1.00
## team1*         5 2660   15.59  9.07   16.00   15.54 11.86    1.00   30.00
## team2*         6 2660   15.59  9.07   16.00   15.54 11.86    1.00   30.00
## spi1           7 2660   71.77  9.53   70.82   71.43  8.64   43.58   96.57
## spi2           8 2660   71.73  9.53   70.76   71.38  8.52   43.91   96.69
## prob1          9 2660    0.46  0.17    0.45    0.45  0.15    0.06    0.98
## prob2         10 2660    0.29  0.15    0.27    0.28  0.13    0.00    0.83
## probtie       11 2660    0.25  0.05    0.26    0.26  0.03    0.02    0.35
## proj_score1   12 2660    1.51  0.47    1.41    1.44  0.33    0.62    4.03
## proj_score2   13 2660    1.11  0.46    1.02    1.05  0.34    0.20    3.42
## importance1   14 2612   35.03 25.32   33.05   33.28 26.32    0.00  100.00
## importance2   15 2612   34.37 24.78   32.55   32.73 26.46    0.00  100.00
## score1        16 2660    1.48  1.26    1.00    1.34  1.48    0.00    8.00
## score2        17 2660    1.13  1.11    1.00    0.97  1.48    0.00    6.00
## xg1           18 2658    1.53  0.87    1.40    1.44  0.80    0.08    7.04
## xg2           19 2658    1.16  0.77    0.99    1.07  0.68    0.00    5.02
## nsxg1         20 2658    1.39  0.71    1.27    1.31  0.58    0.10    6.89
## nsxg2         21 2658    1.08  0.60    0.97    1.01  0.50    0.02    5.92
## adj_score1    22 2658    1.48  1.23    1.05    1.36  1.56    0.00    7.59
## adj_score2    23 2658    1.14  1.09    1.05    0.99  1.56    0.00    6.30
##              range  skew kurtosis   se
## season        6.00  0.00    -1.25 0.04
## date          -Inf    NA       NA   NA
## league_id     0.00   NaN      NaN 0.00
## league*       0.00   NaN      NaN 0.00
## team1*       29.00  0.00    -1.33 0.18
## team2*       29.00  0.00    -1.33 0.18
## spi1         52.99  0.32    -0.13 0.18
## spi2         52.78  0.32    -0.12 0.18
## prob1         0.91  0.36     0.20 0.00
## prob2         0.83  0.80     0.69 0.00
## probtie       0.33 -1.60     2.90 0.00
## proj_score1   3.41  1.55     3.19 0.01
## proj_score2   3.22  1.35     2.65 0.01
## importance1 100.00  0.48    -0.35 0.50
## importance2 100.00  0.49    -0.29 0.48
## score1        8.00  0.96     1.10 0.02
## score2        6.00  1.13     1.44 0.02
## xg1           6.96  1.13     2.17 0.02
## xg2           5.02  1.24     2.04 0.01
## nsxg1         6.79  1.53     4.42 0.01
## nsxg2         5.90  1.65     5.78 0.01
## adj_score1    7.59  0.86     0.89 0.02
## adj_score2    6.30  1.04     1.25 0.02

Checking for missing values spanish_ranking

sum(is.na(spanish_ranking))
## [1] 0

Checking for missing values all_seasons

sum(is.na(all_seasons))
## [1] 108
# Tell us which rows and columns are missing 
# which(is.na(all_seasons), arr.ind = TRUE)

clean_all_seasons <- na.omit(all_seasons)

Creating a correlation table for clean_all_seasons

numeric_data <- clean_all_seasons %>% select_if(is.numeric)
cm <- cor(numeric_data, use = "complete.obs")
## Warning in cor(numeric_data, use = "complete.obs"): the standard deviation is
## zero
corrplot(cm, method = "circle")

Creating new features for the model

clean_all_seasons$date <- ymd(clean_all_seasons$date)
clean_all_seasons$season <- as.factor(clean_all_seasons$season)

Calculting the points for each game

clean_all_seasons <- clean_all_seasons %>%
  mutate(team1_points = case_when(
    score1 > score2 ~ 3,
    score1 == score2 ~ 1,
    TRUE ~ 0 ),
  team2_points = case_when(
    score2 > score1 ~ 3,
    score1 == score2 ~ 1,
    TRUE ~ 0))

Aggregate the points by team and season

team_points <- clean_all_seasons %>% 
  select(season, team1, team1_points) %>%
  rename(team = team1, points = team1_points) %>%
  bind_rows(
    clean_all_seasons %>%
      select(season, team2, team2_points) %>%
      rename(team = team2, points = team2_points)
  ) %>%
  group_by(season, team) %>%
  summarize(total_points = sum(points))
## `summarise()` has grouped output by 'season'. You can override using the
## `.groups` argument.

Creating even more features to add the vairables from clean_all_season to the modeldata ( We still have to add more variables)

avg_goals_scored <- clean_all_seasons %>%
  group_by(team1) %>%
  summarize(avg_goals_scored = mean(score1, na.rm = TRUE)) %>%
  rename(team = team1)

avg_goals_conceded <- clean_all_seasons %>%
  group_by(team2) %>%
  summarize(avg_goals_conceded = mean(score2, na.rm = TRUE)) %>%
  rename(team = team2)

# Merging these features with the spanish_ranking dataset

model_data <- spanish_ranking %>%
  left_join(avg_goals_scored, by = c("name" = "team")) %>%
  left_join(avg_goals_conceded, by = c("name" = "team"))

# Now merge with team_points to get total_points
model_data <- merge(model_data, team_points, by.x = "name", by.y = "team")

We have to join both of our variable to reduce the multicolinearity.

# Creating a new variable, for example: goal difference

model_data$goal_difference <- model_data$avg_goals_scored - model_data$avg_goals_conceded

Creating lag variables

model_data <- model_data %>%
  arrange(name, season) %>% # Ensure data is ordered first
  group_by(name) %>%
  mutate(prev_season_points = lag(total_points, n = 1)) %>%
  ungroup()

Cleaning the na values

model_data <- na.omit(model_data)

Splitting the data to test on the 2022 season

train_data <- model_data %>% filter(season != "2022")
test_data <- model_data %>% filter(season == "2022")

Creating the first model using a simple linear regression as our starting point. ( Other models that we want to applay are Random forest and maybe time series regression since we have yearly data)

Model 1 Linear regression

Here we are suggesting that latest_season has a significant coefficient in the model, it suggests that there is a difference in how the most recent season’s data relates to the total points compared to other seasons. Meaning that recent performance is a stronger (or weaker) indicator of future performance.

train_data$latest_season <- if_else(train_data$season == "2021", 1, 0)
test_data$latest_season <- if_else(test_data$season == "2021", 1, 0)
lm_model <- lm(total_points ~ off + def + spi + goal_difference + latest_season + prev_season_points , data = train_data)
summary(lm_model)
## 
## Call:
## lm(formula = total_points ~ off + def + spi + goal_difference + 
##     latest_season + prev_season_points, data = train_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.9355  -5.0055  -0.8201   5.1940  21.7719 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)  
## (Intercept)        128.88611   97.47102   1.322   0.1910  
## off                 53.26239   27.09144   1.966   0.0539 .
## def                -64.89713   39.64576  -1.637   0.1068  
## spi                 -2.03792    1.66782  -1.222   0.2264  
## goal_difference     15.61564   13.83604   1.129   0.2635  
## latest_season        0.09577    2.53508   0.038   0.9700  
## prev_season_points   0.25701    0.11175   2.300   0.0249 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.825 on 61 degrees of freedom
## Multiple R-squared:  0.7013, Adjusted R-squared:  0.6719 
## F-statistic: 23.87 on 6 and 61 DF,  p-value: 2.55e-14

Predict using the test data

model1_predictions <- predict(lm_model, newdata = test_data)
actual_points <- test_data$total_points

Calculate the RMSE

rmse <- sqrt(mean(( model1_predictions - actual_points)^2))
print(paste("RMSE:", rmse))
## [1] "RMSE: 7.24784474255032"

Creating a graph to vizualize the results

# Create a dataframe for plotting

plot_data <- data.frame(Actual = actual_points, Predicted = model1_predictions)

# Scatter plot

ggplot(plot_data, aes(x = Actual, y = Predicted)) +
  geom_point(color = "blue") +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  theme_minimal() +
  labs(title = "Actual vs Predicted Total Points", x = "Actual Total Points", y = "Predicted Total Points")

Creating a table to have a more clear view of the results

comparison_table <- data.frame(Actual = actual_points, Predicted = model1_predictions, Team = test_data$name)

comparison_table$Difference = comparison_table$Actual - comparison_table$Predicted
print(comparison_table)
##    Actual Predicted            Team  Difference
## 1      51  56.77623 Athletic Bilbao  -5.7762291
## 2      77  69.11422 Atletico Madrid   7.8857757
## 3      88  80.36174       Barcelona   7.6382598
## 4      42  39.33600           Cadiz   2.6639991
## 5      43  50.36756      Celta Vigo  -7.3675594
## 6      25  40.66319           Elche -15.6631871
## 7      37  41.10839        Espanyol  -4.1083919
## 8      42  45.99088          Getafe  -3.9908816
## 9      49  44.16239       Girona FC   4.8376104
## 10     50  43.13161        Mallorca   6.8683860
## 11     53  46.81570         Osasuna   6.1843049
## 12     49  49.14726  Rayo Vallecano  -0.1472558
## 13     60  54.72878      Real Betis   5.2712154
## 14     78  81.69105     Real Madrid  -3.6910534
## 15     71  58.26109   Real Sociedad  12.7389134
## 16     40  35.10094 Real Valladolid   4.8990626
## 17     49  57.90245      Sevilla FC  -8.9024548
## 18     42  51.05519        Valencia  -9.0551946
## 19     64  61.37394      Villarreal   2.6260557

Run a regular regression and run the model without doing the test split to see how well it predicts the next season.

Creating grearter weight for the lastest season:

model_data$latest_season <- if_else(model_data$season == "2022", 1, 0)
# Filter out the 2022 data

model_data_2022 <- model_data %>% filter(season == "2022")

# Prepare the 2023 prediction data

model_data_2023 <- model_data_2022 %>%
  mutate(season = "2023",
         latest_season = 1, # Since 2023 is the latest season
         total_points = NA) # total_points is what we want to predict

Creating the model

lm_model_f <- lm(total_points ~ off + def + spi + goal_difference + latest_season + prev_season_points  , data = model_data )
summary(lm_model_f)
## 
## Call:
## lm(formula = total_points ~ off + def + spi + goal_difference + 
##     latest_season + prev_season_points, data = model_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.4062  -5.6877  -0.4172   5.3925  21.7595 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)  
## (Intercept)        119.97547   78.60227   1.526   0.1309  
## off                 51.20417   22.11006   2.316   0.0231 *
## def                -62.44058   32.19721  -1.939   0.0560 .
## spi                 -1.84486    1.34830  -1.368   0.1751  
## goal_difference     10.30919   10.91494   0.945   0.3478  
## latest_season        0.21214    2.22922   0.095   0.9244  
## prev_season_points   0.24435    0.09895   2.469   0.0157 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.454 on 80 degrees of freedom
## Multiple R-squared:  0.7246, Adjusted R-squared:  0.7039 
## F-statistic: 35.08 on 6 and 80 DF,  p-value: < 2.2e-16
# Predictions for 2023
predictions_2023 <- predict(lm_model_f, newdata = model_data_2023)
# Adding predictions to the 2023 data
model_data_2023$predicted_points <- predictions_2023
Model_results <- data.frame(Predicted2023 = model_data_2023$predicted_points, Team = model_data_2023$name)
print(Model_results)
##    Predicted2023            Team
## 1       57.29343 Athletic Bilbao
## 2       69.55057 Atletico Madrid
## 3       81.32522       Barcelona
## 4       39.85911           Cadiz
## 5       50.28450      Celta Vigo
## 6       40.36242           Elche
## 7       40.93052        Espanyol
## 8       46.03568          Getafe
## 9       45.44971       Girona FC
## 10      42.68106        Mallorca
## 11      47.33621         Osasuna
## 12      47.80430  Rayo Vallecano
## 13      54.32746      Real Betis
## 14      82.60059     Real Madrid
## 15      59.50566   Real Sociedad
## 16      34.99441 Real Valladolid
## 17      57.65568      Sevilla FC
## 18      51.14048        Valencia
## 19      60.86297      Villarreal

testing a third model using the averge of all our predictions

Creating the model

lm_model_f2 <- lm(total_points ~ off + def + spi + goal_difference + latest_season + prev_season_points  , data = model_data)
summary(lm_model_f2)
## 
## Call:
## lm(formula = total_points ~ off + def + spi + goal_difference + 
##     latest_season + prev_season_points, data = model_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.4062  -5.6877  -0.4172   5.3925  21.7595 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)  
## (Intercept)        119.97547   78.60227   1.526   0.1309  
## off                 51.20417   22.11006   2.316   0.0231 *
## def                -62.44058   32.19721  -1.939   0.0560 .
## spi                 -1.84486    1.34830  -1.368   0.1751  
## goal_difference     10.30919   10.91494   0.945   0.3478  
## latest_season        0.21214    2.22922   0.095   0.9244  
## prev_season_points   0.24435    0.09895   2.469   0.0157 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.454 on 80 degrees of freedom
## Multiple R-squared:  0.7246, Adjusted R-squared:  0.7039 
## F-statistic: 35.08 on 6 and 80 DF,  p-value: < 2.2e-16
model3_predictions <- predict(lm_model_f2)
comparison_table3 <- data.frame( Predicted = model3_predictions, Team = model_data$name)


print(comparison_table3)
##    Predicted            Team
## 1   56.83694 Athletic Bilbao
## 2   54.14911 Athletic Bilbao
## 3   56.59259 Athletic Bilbao
## 4   56.10390 Athletic Bilbao
## 5   54.88216 Athletic Bilbao
## 6   57.29343 Athletic Bilbao
## 7   70.07148 Atletico Madrid
## 8   70.56017 Atletico Madrid
## 9   70.56017 Atletico Madrid
## 10  69.09408 Atletico Madrid
## 11  73.00365 Atletico Madrid
## 12  69.55057 Atletico Madrid
## 13  82.82351       Barcelona
## 14  85.26699       Barcelona
## 15  84.53395       Barcelona
## 16  83.31221       Barcelona
## 17  82.33482       Barcelona
## 18  81.32522       Barcelona
## 19  40.86871           Cadiz
## 20  39.85911           Cadiz
## 21  48.85062      Celta Vigo
## 22  50.80540      Celta Vigo
## 23  48.85062      Celta Vigo
## 24  47.87323      Celta Vigo
## 25  51.78280      Celta Vigo
## 26  50.28450      Celta Vigo
## 27  39.41724           Elche
## 28  40.36242           Elche
## 29  43.65056        Espanyol
## 30  42.42882        Espanyol
## 31  43.40621        Espanyol
## 32  36.56447        Espanyol
## 33  40.93052        Espanyol
## 34  49.00007          Getafe
## 35  50.71050          Getafe
## 36  49.48876          Getafe
## 37  45.57919          Getafe
## 38  46.03568          Getafe
## 39  48.41409       Girona FC
## 40  45.44971       Girona FC
## 41  41.00283        Mallorca
## 42  42.68106        Mallorca
## 43  40.28233         Osasuna
## 44  48.34581         Osasuna
## 45  46.39103         Osasuna
## 46  47.33621         Osasuna
## 47  45.14867  Rayo Vallecano
## 48  47.80430  Rayo Vallecano
## 49  46.78488      Real Betis
## 50  52.16054      Real Betis
## 51  50.45010      Real Betis
## 52  48.25097      Real Betis
## 53  53.13793      Real Betis
## 54  54.32746      Real Betis
## 55  81.41106     Real Madrid
## 56  79.94497     Real Madrid
## 57  77.99019     Real Madrid
## 58  82.63280     Real Madrid
## 59  81.16671     Real Madrid
## 60  82.60059     Real Madrid
## 61  58.07178   Real Sociedad
## 62  56.11700   Real Sociedad
## 63  56.36134   Real Sociedad
## 64  57.82743   Real Sociedad
## 65  59.29352   Real Sociedad
## 66  59.50566   Real Sociedad
## 67  37.22575 Real Valladolid
## 68  37.47010 Real Valladolid
## 69  34.99441 Real Valladolid
## 70  57.19919      Sevilla FC
## 71  53.77832      Sevilla FC
## 72  54.75572      Sevilla FC
## 73  57.44354      Sevilla FC
## 74  58.17659      Sevilla FC
## 75  57.65568      Sevilla FC
## 76  48.24051        Valencia
## 77  56.30399        Valencia
## 78  54.10486        Valencia
## 79  52.15008        Valencia
## 80  49.70660        Valencia
## 81  51.14048        Valencia
## 82  60.16213      Villarreal
## 83  61.13952      Villarreal
## 84  56.98561      Villarreal
## 85  60.16213      Villarreal
## 86  60.40648      Villarreal
## 87  60.86297      Villarreal
# Calculate the average predicted points for each team
average_predictions <- comparison_table3 %>%
  group_by(Team) %>%
  summarize(Average_Predicted = mean(Predicted, na.rm = TRUE))

# View the resulting table
print(average_predictions)
## # A tibble: 19 × 2
##    Team            Average_Predicted
##    <chr>                       <dbl>
##  1 Athletic Bilbao              56.0
##  2 Atletico Madrid              70.5
##  3 Barcelona                    83.3
##  4 Cadiz                        40.4
##  5 Celta Vigo                   49.7
##  6 Elche                        39.9
##  7 Espanyol                     41.4
##  8 Getafe                       48.2
##  9 Girona FC                    46.9
## 10 Mallorca                     41.8
## 11 Osasuna                      45.6
## 12 Rayo Vallecano               46.5
## 13 Real Betis                   50.9
## 14 Real Madrid                  81.0
## 15 Real Sociedad                57.9
## 16 Real Valladolid              36.6
## 17 Sevilla FC                   56.5
## 18 Valencia                     51.9
## 19 Villarreal                   60.0