This R Markdown demonstrates how to use regression model in predicting the number of goals Erling Haaland will score in Manchester City. This prediction is done under some assumptions such as: Haaland will play all if not most of the matches, and Manchester City will perform as good as the average of Man City’s last three seasons.
All the data used here are from fbref.com.
library(rvest)
library(tidyverse)
library(GGally)
library(janitor)
library(DataExplorer)
library(caret)
library(performance)
library(car)
library(MLmetrics)
library(see)
library(miscTools)First, let’s read the table of Haaland goal.
# haaland_2122 <- read_html("https://fbref.com/en/players/1f44ac21/matchlogs/s11193/summary/Erling-Haaland-Match-Logs#matchlogs_11193")
#
# haaland_2021 <- read_html("https://fbref.com/en/players/1f44ac21/matchlogs/s10737/summary/Erling-Haaland-Match-Logs#matchlogs_10737")Use rvest to create table.
# haaland_2122_table <- haaland_2122 %>% html_elements(".stats_table") %>% html_table()
# haaland_2122_table <- as.data.frame(haaland_2122_table)
# rename headers and clean dataframe
# haaland_2122_table <- haaland_2122_table %>%
# rename(round = Var.4,
# start = Var.9,
# minutes = Var.11,
# goals = Performance
# ) %>%
# select(round, start, minutes, goals) %>% filter(minutes!="Min") %>%
# mutate(across(.cols=c("minutes", "goals"),
# .fns = as.numeric),
# start = as.factor(start)) %>% drop_na()
# ```
# ```{r}
# haaland_2021_table <- haaland_2021 %>% html_elements(".stats_table") %>% html_table()
# haaland_2021_table <- as.data.frame(haaland_2021_table)
# rename headers and clean dataframe
# haaland_2021_table <- haaland_2021_table %>%
# rename(round = Var.4,
# start = Var.9,
# minutes = Var.11,
# goals = Performance
# ) %>%
# select(round, start, minutes, goals) %>% filter(minutes!="Min") %>%
# mutate(across(.cols=c("minutes", "goals"),
# .fns = as.numeric),
# start = as.factor(start)) %>% drop_na()# haaland_2122_table
# haaland_2021_tableSave the table into RDS in anticipation of being banned from fbref from scraping their site too often.
# saveRDS(haaland_2122_table, file = "h_t_2122.RDS")
# saveRDS(haaland_2021_table, file = "h_t_2021.RDS")haaland_2122_table <- readRDS("h_t_2122.RDS")
haaland_2021_table <- readRDS("h_t_2021.RDS")Import the html.
# dortmund_2122 <- read_html("https://fbref.com/en/squads/add600ae/2021-2022/matchlogs/s11193/shooting/Dortmund-Match-Logs-Bundesliga#matchlogs_for")
#
# dortmund_2021 <- read_html("https://fbref.com/en/squads/add600ae/2020-2021/matchlogs/s10737/shooting/Dortmund-Match-Logs-Bundesliga#matchlogs_for")Create dataframe.
# dortmund_2122_table_scoring <-
# dortmund_2122 %>% html_elements(".stats_table#matchlogs_for") %>% html_table() %>% as.data.frame()
#
# dortmund_2021_table_scoring <-
# dortmund_2021 %>% html_elements(".stats_table#matchlogs_for") %>% html_table() %>% as.data.frame()Save the table into RDS in anticipation of being banned from fbref from scraping their site too often.
# saveRDS(dortmund_2122_table_scoring, file = "hd_scoring_2122.RDS")
# saveRDS(dortmund_2021_table_scoring, file = "hd_scoring_2021.RDS")dortmund_2122_table_scoring <- readRDS("hd_scoring_2122.RDS")
dortmund_2021_table_scoring <- readRDS("hd_scoring_2021.RDS")Pull the first row as vector for headers.
dort_headers <- dortmund_2122_table_scoring %>% slice(1) %>% as.character()Clean headers and select column.
dortmund_2122_table_scoring <-
dortmund_2122_table_scoring %>%
rename_with(~dort_headers, .cols = everything()) %>% clean_names()
dortmund_2122_table_scoring <-
dortmund_2122_table_scoring %>%
filter(date != "Date") %>%
mutate(venue = as.factor(venue),
result = as.factor(result),
goals_for = as.numeric(gf),
goals_against = as.numeric(ga),
shots = as.numeric(sh),
shots_ontarget = as.numeric(so_t),
shots_ontarget_percent = as.numeric(so_t_percent),
goals_pershot = as.numeric(g_sh),
goals_pershot_ontarget = as.numeric(g_so_t),
shots_avg_dist = as.numeric(dist),
shots_freekick = as.numeric(fk),
penalty_kick = as.numeric(pk),
penalty_kick_att = as.numeric(p_katt),
expected_goals = as.numeric(x_g),
nonpenalty_expected_goals = as.numeric(npx_g),
nonpenalty_expected_goals_pershot = as.numeric(npx_g_sh),
goals_minus_expected_goals = as.numeric(g_x_g),
nonpenalty.goals_minus_nonpenalty.expectedgoals = as.numeric(np_g_x_g)
) %>% select(round, venue, result, goals_for, goals_against, shots, shots_ontarget, shots_ontarget_percent,
goals_pershot, goals_pershot_ontarget, shots_avg_dist, shots_freekick, penalty_kick,
penalty_kick_att, expected_goals, nonpenalty_expected_goals,
nonpenalty_expected_goals_pershot, goals_minus_expected_goals,
nonpenalty.goals_minus_nonpenalty.expectedgoals)dortmund_2021_table_scoring <-
dortmund_2021_table_scoring %>%
rename_with(~dort_headers, .cols = everything()) %>% clean_names()
dortmund_2021_table_scoring <-
dortmund_2021_table_scoring %>%
filter(date != "Date") %>%
mutate(venue = as.factor(venue),
result = as.factor(result),
goals_for = as.numeric(gf),
goals_against = as.numeric(ga),
shots = as.numeric(sh),
shots_ontarget = as.numeric(so_t),
shots_ontarget_percent = as.numeric(so_t_percent),
goals_pershot = as.numeric(g_sh),
goals_pershot_ontarget = as.numeric(g_so_t),
shots_avg_dist = as.numeric(dist),
shots_freekick = as.numeric(fk),
penalty_kick = as.numeric(pk),
penalty_kick_att = as.numeric(p_katt),
expected_goals = as.numeric(x_g),
nonpenalty_expected_goals = as.numeric(npx_g),
nonpenalty_expected_goals_pershot = as.numeric(npx_g_sh),
goals_minus_expected_goals = as.numeric(g_x_g),
nonpenalty.goals_minus_nonpenalty.expectedgoals = as.numeric(np_g_x_g)
) %>% select(round, venue, result, goals_for, goals_against, shots, shots_ontarget, shots_ontarget_percent,
goals_pershot, goals_pershot_ontarget, shots_avg_dist, shots_freekick, penalty_kick,
penalty_kick_att, expected_goals, nonpenalty_expected_goals,
nonpenalty_expected_goals_pershot, goals_minus_expected_goals,
nonpenalty.goals_minus_nonpenalty.expectedgoals)Join Haaland personal scoring match stats and Dortmund shooting match stats.
haaland_dortmund_scoring_2122 <-
haaland_2122_table %>%
left_join(dortmund_2122_table_scoring,
suffix = c(".haaland", ".dortmund"),
by = "round")
haaland_dortmund_scoring_2021 <-
haaland_2021_table %>%
left_join(dortmund_2021_table_scoring,
suffix = c(".haaland", ".dortmund"),
by = "round")Import from fbref.
# dortmund_2122 <- read_html("https://fbref.com/en/squads/add600ae/2021-2022/matchlogs/s11193/passing/Dortmund-Match-Logs-Bundesliga#matchlogs_for")
# dortmund_2021 <- read_html("https://fbref.com/en/squads/add600ae/2020-2021/matchlogs/s10737/passing/Dortmund-Match-Logs-Bundesliga#matchlogs_for")Use rvest to create dataframe.
# dortmund_2122_table_passing <-
# dortmund_2122 %>% html_elements(".stats_table#matchlogs_for") %>% html_table() %>% as.data.frame()
# dortmund_2021_table_passing <-
# dortmund_2021 %>% html_elements(".stats_table#matchlogs_for") %>% html_table() %>% as.data.frame()Save the table into RDS in anticipation of being banned from fbref from scraping their site too often.
# saveRDS(dortmund_2122_table_passing, file = "hd_passing_2122.RDS")
# saveRDS(dortmund_2021_table_passing, file = "hd_passing_2021.RDS")dortmund_2122_table_passing <- readRDS("hd_passing_2122.RDS")
dortmund_2021_table_passing <- readRDS("hd_passing_2021.RDS")Remove excess features.
dortmund_2122_table_passing <- dortmund_2122_table_passing %>% select(-c(15:23)) %>%
select(-c("For.Dortmund","For.Dortmund.1","For.Dortmund.3","For.Dortmund.4", "For.Dortmund.5",
"For.Dortmund.6","For.Dortmund.7","For.Dortmund.8","Var.31"))
dortmund_2021_table_passing <- dortmund_2021_table_passing %>% select(-c(15:23)) %>%
select(-c("For.Dortmund","For.Dortmund.1","For.Dortmund.3","For.Dortmund.4", "For.Dortmund.5",
"For.Dortmund.6","For.Dortmund.7","For.Dortmund.8","Var.31"))Clean headers.
dort_headers <- dortmund_2122_table_passing %>% slice(1) %>% as.character()dortmund_2122_table_passing <-
dortmund_2122_table_passing %>%
rename_with(~dort_headers, .cols = everything()) %>% clean_names()
dortmund_2021_table_passing <-
dortmund_2021_table_passing %>%
rename_with(~dort_headers, .cols = everything()) %>% clean_names() Rename headers.
dortmund_2122_table_passing <-
dortmund_2122_table_passing %>%
rename(passes_completed = cmp,
passes_attempted = att,
passes_completion_percentage = cmp_percent,
passes_total_distance = tot_dist,
passes_total_progressive_distance = prg_dist,
assists = ast,
expected_goals_assisted = x_a,
passes_directy_led_to_shot = kp,
completed_passes_to_thirdhalf = x1_3,
completed_passes_to_18yardbox = ppa,
completed_crosses_to_18yardbox = crs_pa,
progressive_passes = prog
)
dortmund_2021_table_passing <-
dortmund_2021_table_passing %>%
rename(passes_completed = cmp,
passes_attempted = att,
passes_completion_percentage = cmp_percent,
passes_total_distance = tot_dist,
passes_total_progressive_distance = prg_dist,
assists = ast,
expected_goals_assisted = x_a,
passes_directy_led_to_shot = kp,
completed_passes_to_thirdhalf = x1_3,
completed_passes_to_18yardbox = ppa,
completed_crosses_to_18yardbox = crs_pa,
progressive_passes = prog
)Eliminate the first row.
dortmund_2122_table_passing <- dortmund_2122_table_passing[-1,]
dortmund_2021_table_passing <- dortmund_2021_table_passing[-1,]Adjust the data types.
dortmund_2122_table_passing <-
dortmund_2122_table_passing %>%
mutate(across(.cols = 2:13,
.fns = as.numeric)
)
dortmund_2021_table_passing <-
dortmund_2021_table_passing %>%
mutate(across(.cols = 2:13,
.fns = as.numeric)
)Join the haaland table into the dortmund passing table by round.
haaland_dortmund_passing_2122 <-
haaland_2122_table %>%
left_join(dortmund_2122_table_passing,
by = "round")
haaland_dortmund_passing_2021 <-
haaland_2021_table %>%
left_join(dortmund_2021_table_passing,
by = "round")Import from fbref.
# dortmund_2122 <- read_html("https://fbref.com/en/squads/add600ae/2021-2022/matchlogs/s11193/defense/Dortmund-Match-Logs-Bundesliga#matchlogs_for")
#
# dortmund_2021 <- read_html("https://fbref.com/en/squads/add600ae/2020-2021/matchlogs/s10737/defense/Dortmund-Match-Logs-Bundesliga#matchlogs_for")Use rvest to create dataframe.
# dortmund_2122_table_defense <-
# dortmund_2122 %>% html_elements(".stats_table#matchlogs_for") %>% html_table() %>% as.data.frame()
# dortmund_2021_table_defense <-
# dortmund_2021 %>% html_elements(".stats_table#matchlogs_for") %>% html_table() %>% as.data.frame()Save the table into RDS in anticipation of being banned from fbref from scraping their site too often.
# saveRDS(dortmund_2122_table_defense, file = "hd_defense_2122.RDS")
# saveRDS(dortmund_2021_table_defense, file = "hd_defense_2021.RDS")dortmund_2122_table_defense <- readRDS("hd_defense_2122.RDS")
dortmund_2021_table_defense <- readRDS("hd_defense_2021.RDS")Remove excess features.
dortmund_2122_table_defense <- dortmund_2122_table_defense %>%
select(-c(4:9)) %>%
select(-c("For.Dortmund",
"For.Dortmund.1",
"Vs.Dribbles",
"Vs.Dribbles.1",
"Var.30",
"Var.33"))
dortmund_2021_table_defense <- dortmund_2021_table_defense %>%
select(-c(4:9)) %>%
select(-c("For.Dortmund",
"For.Dortmund.1",
"Vs.Dribbles",
"Vs.Dribbles.1",
"Var.30",
"Var.33"))Rename headers and clean names.
dortmund_2122_table_defense <-
dortmund_2122_table_defense %>%
rename(round = For.Dortmund.2,
players_tackled = Tackles,
tackles_won_possession = Tackles.1,
tackles_def3rd = Tackles.2,
tackles_mid3rd = Tackles.3,
tackles_att3rd = Tackles.4,
percentage_dribblers_tackled = Vs.Dribbles.2,
dribbled_past = Vs.Dribbles.3,
apply_pressures = Pressures,
gained_possession_within5secsofpressure = Pressures.1,
succesful_pressure_percentage = Pressures.2,
pressures_def3rd = Pressures.3,
pressures_mid3rd = Pressures.4,
pressures_att3rd = Pressures.5,
blocks = Blocks,
blocks_shots = Blocks.1,
blocks_shots_ontarget = Blocks.2,
blocks_passes = Blocks.3,
interceptions = Var.29,
clearances = Var.31,
error_led_toshot = Var.32
) %>%
clean_names()dortmund_2021_table_defense <-
dortmund_2021_table_defense %>%
rename(round = For.Dortmund.2,
players_tackled = Tackles,
tackles_won_possession = Tackles.1,
tackles_def3rd = Tackles.2,
tackles_mid3rd = Tackles.3,
tackles_att3rd = Tackles.4,
percentage_dribblers_tackled = Vs.Dribbles.2,
dribbled_past = Vs.Dribbles.3,
apply_pressures = Pressures,
gained_possession_within5secsofpressure = Pressures.1,
succesful_pressure_percentage = Pressures.2,
pressures_def3rd = Pressures.3,
pressures_mid3rd = Pressures.4,
pressures_att3rd = Pressures.5,
blocks = Blocks,
blocks_shots = Blocks.1,
blocks_shots_ontarget = Blocks.2,
blocks_passes = Blocks.3,
interceptions = Var.29,
clearances = Var.31,
error_led_toshot = Var.32
) %>%
clean_names()Eliminate the first row.
dortmund_2122_table_defense <- dortmund_2122_table_defense[-1,]
dortmund_2021_table_defense <- dortmund_2021_table_defense[-1,]Adjust the data types.
dortmund_2122_table_defense <-
dortmund_2122_table_defense %>%
mutate(across(.cols = 2:length(names(dortmund_2122_table_defense)),
.fns = as.numeric)
)
dortmund_2021_table_defense <-
dortmund_2021_table_defense %>%
mutate(across(.cols = 2:length(names(dortmund_2021_table_defense)),
.fns = as.numeric)
)head(dortmund_2122_table_defense)## round players_tackled tackles_won_possession tackles_def3rd
## 2 Matchweek 1 17 12 6
## 3 Matchweek 2 16 15 8
## 4 Matchweek 3 13 8 6
## 5 Matchweek 4 22 16 8
## 6 Matchweek 5 12 7 6
## 7 Matchweek 6 16 11 5
## tackles_mid3rd tackles_att3rd percentage_dribblers_tackled dribbled_past
## 2 9 2 66.7 3
## 3 8 0 44.4 5
## 4 7 0 26.7 11
## 5 12 2 50.0 9
## 6 4 2 64.3 5
## 7 9 2 45.5 6
## apply_pressures gained_possession_within5secsofpressure
## 2 127 49
## 3 129 50
## 4 151 43
## 5 159 68
## 6 110 36
## 7 114 36
## succesful_pressure_percentage pressures_def3rd pressures_mid3rd
## 2 38.6 27 65
## 3 38.8 37 50
## 4 28.5 56 60
## 5 42.8 38 81
## 6 32.7 32 46
## 7 31.6 46 56
## pressures_att3rd blocks blocks_shots blocks_shots_ontarget blocks_passes
## 2 35 21 4 0 17
## 3 42 14 2 0 12
## 4 35 22 3 0 19
## 5 40 10 2 0 8
## 6 32 7 0 0 7
## 7 12 16 8 0 8
## interceptions clearances error_led_toshot
## 2 21 14 0
## 3 4 7 0
## 4 8 30 0
## 5 17 17 0
## 6 11 23 0
## 7 19 19 0
head(dortmund_2021_table_defense)## round players_tackled tackles_won_possession tackles_def3rd
## 2 Matchweek 1 14 12 9
## 3 Matchweek 2 8 5 4
## 4 Matchweek 3 17 12 7
## 5 Matchweek 4 27 23 10
## 6 Matchweek 5 17 9 10
## 7 Matchweek 6 14 9 2
## tackles_mid3rd tackles_att3rd percentage_dribblers_tackled dribbled_past
## 2 5 0 30.0 7
## 3 1 3 31.3 11
## 4 8 2 33.3 8
## 5 11 6 53.3 7
## 6 5 2 33.3 8
## 7 7 5 33.3 10
## apply_pressures gained_possession_within5secsofpressure
## 2 164 54
## 3 110 39
## 4 168 50
## 5 152 64
## 6 67 27
## 7 108 36
## succesful_pressure_percentage pressures_def3rd pressures_mid3rd
## 2 32.9 55 86
## 3 35.5 23 45
## 4 29.8 40 76
## 5 42.1 38 76
## 6 40.3 18 30
## 7 33.3 14 57
## pressures_att3rd blocks blocks_shots blocks_shots_ontarget blocks_passes
## 2 23 15 3 0 12
## 3 42 9 1 0 8
## 4 52 9 4 0 5
## 5 38 18 2 0 16
## 6 19 15 2 0 13
## 7 37 12 0 0 12
## interceptions clearances error_led_toshot
## 2 9 21 0
## 3 4 12 0
## 4 2 15 1
## 5 5 24 0
## 6 12 3 0
## 7 4 9 0
Join the haaland table into the dortmund passing table by round.
haaland_dortmund_defense_2122 <-
haaland_2122_table %>%
left_join(dortmund_2122_table_defense,
by = "round")
haaland_dortmund_defense_2021 <-
haaland_2021_table %>%
left_join(dortmund_2021_table_defense,
by = "round")Import from fbref.
# dortmund_2122 <- read_html("https://fbref.com/en/squads/add600ae/2021-2022/matchlogs/s11193/possession/Dortmund-Match-Logs-Bundesliga#matchlogs_for")
#
# dortmund_2021 <- read_html("https://fbref.com/en/squads/add600ae/2020-2021/matchlogs/s10737/possession/Dortmund-Match-Logs-Bundesliga#matchlogs_for")Use rvest to create dataframe.
# dortmund_2122_table_possession <-
# dortmund_2122 %>% html_elements(".stats_table#matchlogs_for") %>% html_table() %>% as.data.frame()
# dortmund_2021_table_possession <-
# dortmund_2021 %>% html_elements(".stats_table#matchlogs_for") %>% html_table() %>% as.data.frame()Save the table into RDS in anticipation of being banned from fbref from scraping their site too often.
# saveRDS(dortmund_2122_table_possession, file = "hd_possession_2122.RDS")
# saveRDS(dortmund_2021_table_possession, file = "hd_possession_2021.RDS")dortmund_2122_table_possession <- readRDS("hd_possession_2122.RDS")
dortmund_2021_table_possession <- readRDS("hd_possession_2021.RDS")Remove excess features.
dortmund_2122_table_possession <- dortmund_2122_table_possession %>%
select(-c(4:9)) %>%
select(-c("For.Dortmund","For.Dortmund.1","Var.35"))
dortmund_2021_table_possession <- dortmund_2021_table_possession %>%
select(-c(4:9)) %>%
select(-c("For.Dortmund","For.Dortmund.1","Var.35"))Rename headers and clean names.
dortmund_2122_table_possession <-
dortmund_2122_table_possession %>%
rename(round = For.Dortmund.2,
possession = For.Dortmund.9,
touches = Touches,
t_def_pen = Touches.1,
t_def3rd = Touches.2,
t_mid3rd = Touches.3,
t_att3rd = Touches.4,
t_att_pen = Touches.5,
t_live_ball = Touches.6,
dribbles_completed = Dribbles,
dribbles_attempted = Dribbles.1,
completed_dribbles_percentage = Dribbles.2,
foe_dribbled_past = Dribbles.3,
foe_nutmegged = Dribbles.4,
carries = Carries,
progressive_carries = Carries.1,
carries_distance_yards = Carries.2,
progressive_carries_distance_yards = Carries.3,
carries_to3rdpitch = Carries.4,
carries_to18yardsbox = Carries.5,
failed_possession_gain = Carries.6,
dispossessed = Carries.7,
passing_target = Receiving,
passes_received = Receiving.1,
passes_received_percentage = Receiving.2,
progressive_passes_received = Receiving.3
) %>%
clean_names()dortmund_2021_table_possession <-
dortmund_2021_table_possession %>%
rename(round = For.Dortmund.2,
possession = For.Dortmund.9,
touches = Touches,
t_def_pen = Touches.1,
t_def3rd = Touches.2,
t_mid3rd = Touches.3,
t_att3rd = Touches.4,
t_att_pen = Touches.5,
t_live_ball = Touches.6,
dribbles_completed = Dribbles,
dribbles_attempted = Dribbles.1,
completed_dribbles_percentage = Dribbles.2,
foe_dribbled_past = Dribbles.3,
foe_nutmegged = Dribbles.4,
carries = Carries,
progressive_carries = Carries.1,
carries_distance_yards = Carries.2,
progressive_carries_distance_yards = Carries.3,
carries_to3rdpitch = Carries.4,
carries_to18yardsbox = Carries.5,
failed_possession_gain = Carries.6,
dispossessed = Carries.7,
passing_target = Receiving,
passes_received = Receiving.1,
passes_received_percentage = Receiving.2,
progressive_passes_received = Receiving.3
) %>%
clean_names()Eliminate the first row.
dortmund_2122_table_possession <- dortmund_2122_table_possession[-1,]
dortmund_2021_table_possession <- dortmund_2021_table_possession[-1,]Adjust the data types.
dortmund_2122_table_possession <-
dortmund_2122_table_possession %>%
mutate(across(.cols = 2:length(names(dortmund_2122_table_possession)),
.fns = as.numeric)
)
dortmund_2021_table_possession <-
dortmund_2021_table_possession %>%
mutate(across(.cols = 2:length(names(dortmund_2021_table_possession)),
.fns = as.numeric)
)head(dortmund_2122_table_possession)## round possession touches t_def_pen t_def3rd t_mid3rd t_att3rd t_att_pen
## 2 Matchweek 1 60 774 105 283 372 157 37
## 3 Matchweek 2 77 860 71 264 423 239 35
## 4 Matchweek 3 59 734 75 193 347 221 48
## 5 Matchweek 4 58 680 60 213 348 176 27
## 6 Matchweek 5 63 756 65 217 389 201 22
## 7 Matchweek 6 45 591 73 242 290 90 11
## t_live_ball dribbles_completed dribbles_attempted
## 2 720 11 17
## 3 807 9 17
## 4 686 10 19
## 5 635 11 14
## 6 716 10 16
## 7 546 11 22
## completed_dribbles_percentage foe_dribbled_past foe_nutmegged carries
## 2 64.7 14 0 520
## 3 52.9 11 0 554
## 4 52.6 10 1 477
## 5 78.6 11 0 493
## 6 62.5 11 1 571
## 7 50.0 11 0 357
## progressive_carries carries_distance_yards progressive_carries_distance_yards
## 2 2260 1185 35
## 3 3064 1697 77
## 4 2334 1203 61
## 5 2328 1465 56
## 6 2760 1409 56
## 7 1762 807 34
## carries_to3rdpitch carries_to18yardsbox failed_possession_gain dispossessed
## 2 10 7 13 17
## 3 21 6 17 7
## 4 7 7 19 15
## 5 23 4 15 12
## 6 20 4 11 14
## 7 6 1 11 14
## passing_target passes_received passes_received_percentage
## 2 559 532 95.2
## 3 745 653 87.7
## 4 563 509 90.4
## 5 549 470 85.6
## 6 630 562 89.2
## 7 457 383 83.8
## progressive_passes_received
## 2 45
## 3 53
## 4 56
## 5 52
## 6 46
## 7 23
head(dortmund_2021_table_possession)## round possession touches t_def_pen t_def3rd t_mid3rd t_att3rd t_att_pen
## 2 Matchweek 1 48 667 72 258 352 107 20
## 3 Matchweek 2 79 992 48 163 522 363 39
## 4 Matchweek 3 66 924 78 303 485 196 36
## 5 Matchweek 4 55 732 69 247 334 191 34
## 6 Matchweek 5 73 970 37 195 550 279 43
## 7 Matchweek 6 73 960 40 193 560 274 43
## t_live_ball dribbles_completed dribbles_attempted
## 2 631 15 20
## 3 936 22 37
## 4 885 15 18
## 5 683 6 14
## 6 920 24 34
## 7 905 17 24
## completed_dribbles_percentage foe_dribbled_past foe_nutmegged carries
## 2 75.0 16 1 441
## 3 59.5 23 0 757
## 4 83.3 15 0 679
## 5 42.9 6 1 463
## 6 70.6 25 1 805
## 7 70.8 17 3 680
## progressive_carries carries_distance_yards progressive_carries_distance_yards
## 2 2439 1193 37
## 3 3855 2024 91
## 4 3157 1551 66
## 5 2418 1224 43
## 6 3329 1497 84
## 7 3297 1875 96
## carries_to3rdpitch carries_to18yardsbox failed_possession_gain dispossessed
## 2 16 2 10 13
## 3 23 5 10 11
## 4 24 8 13 10
## 5 15 5 13 15
## 6 18 11 12 10
## 7 22 4 10 13
## passing_target passes_received passes_received_percentage
## 2 555 489 88.1
## 3 871 800 91.8
## 4 753 733 97.3
## 5 584 505 86.5
## 6 801 758 94.6
## 7 836 743 88.9
## progressive_passes_received
## 2 39
## 3 90
## 4 50
## 5 43
## 6 46
## 7 57
Join the haaland table into the dortmund possession table by round.
haaland_dortmund_possession_2122 <-
haaland_2122_table %>%
left_join(dortmund_2122_table_possession,
by = "round")
haaland_dortmund_possession_2021 <-
haaland_2021_table %>%
left_join(dortmund_2021_table_possession,
by = "round")Import from fbref.
# dortmund_2122 <- read_html("https://fbref.com/en/squads/add600ae/2021-2022/matchlogs/s11193/misc/Dortmund-Match-Logs-Bundesliga#matchlogs_for")
#
# dortmund_2021 <- read_html("https://fbref.com/en/squads/add600ae/2020-2021/matchlogs/s10737/misc/Dortmund-Match-Logs-Bundesliga#matchlogs_for")Use rvest to create dataframe.
# dortmund_2122_table_misc <-
# dortmund_2122 %>% html_elements(".stats_table#matchlogs_for") %>% html_table() %>% as.data.frame()
# dortmund_2021_table_misc <-
# dortmund_2021 %>% html_elements(".stats_table#matchlogs_for") %>% html_table() %>% as.data.frame()Save the table into RDS in anticipation of being banned from fbref from scraping their site too often.
# saveRDS(dortmund_2122_table_misc, file = "hd_misc_2122.RDS")
# saveRDS(dortmund_2021_table_misc, file = "hd_misc_2021.RDS")dortmund_2122_table_misc <- readRDS("hd_misc_2122.RDS")
dortmund_2021_table_misc <- readRDS("hd_misc_2021.RDS")Remove excess features.
dortmund_2122_table_misc <- dortmund_2122_table_misc %>%
select(-c(4:9)) %>%
select(-c("For.Dortmund","For.Dortmund.1","Var.26","Performance.7","Performance.8"))
dortmund_2021_table_misc <- dortmund_2021_table_misc %>%
select(-c(4:9)) %>%
select(-c("For.Dortmund","For.Dortmund.1","Var.26","Performance.7","Performance.8"))Rename headers and clean names.
dortmund_2122_table_misc <-
dortmund_2122_table_misc %>%
rename(round = For.Dortmund.2,
card_y = Performance,
card_r = Performance.1,
card_y2 = Performance.2,
fouls_committed = Performance.3,
fouls_drawn = Performance.4,
offsides = Performance.5,
crosses = Performance.6,
penalty_kicks_won = Performance.9,
penalty_kicks_conceded = Performance.10,
own_goals = Performance.11,
loose_ball_recovered = Performance.12,
aerial_duels_won = Aerial.Duels,
aerial_duels_lost = Aerial.Duels.1,
percentage_of_aerial_duels_won = Aerial.Duels.2
) %>%
clean_names()dortmund_2021_table_misc <-
dortmund_2021_table_misc %>%
rename(round = For.Dortmund.2,
card_y = Performance,
card_r = Performance.1,
card_y2 = Performance.2,
fouls_committed = Performance.3,
fouls_drawn = Performance.4,
offsides = Performance.5,
crosses = Performance.6,
penalty_kicks_won = Performance.9,
penalty_kicks_conceded = Performance.10,
own_goals = Performance.11,
loose_ball_recovered = Performance.12,
aerial_duels_won = Aerial.Duels,
aerial_duels_lost = Aerial.Duels.1,
percentage_of_aerial_duels_won = Aerial.Duels.2
) %>%
clean_names()Eliminate the first row.
dortmund_2122_table_misc <- dortmund_2122_table_misc[-1,]
dortmund_2021_table_misc <- dortmund_2021_table_misc[-1,]Adjust the data types.
dortmund_2122_table_misc <-
dortmund_2122_table_misc %>%
mutate(across(.cols = 2:length(names(dortmund_2122_table_misc)),
.fns = as.numeric)
)
dortmund_2021_table_misc <-
dortmund_2021_table_misc %>%
mutate(across(.cols = 2:length(names(dortmund_2021_table_misc)),
.fns = as.numeric)
)head(dortmund_2122_table_misc)## round card_y card_r card_y2 fouls_committed fouls_drawn offsides
## 2 Matchweek 1 0 0 0 5 19 4
## 3 Matchweek 2 4 0 0 14 14 1
## 4 Matchweek 3 2 0 0 10 12 4
## 5 Matchweek 4 3 0 0 13 12 0
## 6 Matchweek 5 2 0 0 11 12 0
## 7 Matchweek 6 4 1 1 15 17 3
## crosses penalty_kicks_won penalty_kicks_conceded own_goals
## 2 6 0 0 1
## 3 15 0 0 0
## 4 15 0 0 0
## 5 8 1 0 0
## 6 9 0 1 0
## 7 7 0 0 0
## loose_ball_recovered aerial_duels_won aerial_duels_lost
## 2 92 10 14
## 3 96 17 14
## 4 90 18 10
## 5 80 9 6
## 6 83 18 16
## 7 75 5 13
## percentage_of_aerial_duels_won
## 2 41.7
## 3 54.8
## 4 64.3
## 5 60.0
## 6 52.9
## 7 27.8
head(dortmund_2021_table_misc)## round card_y card_r card_y2 fouls_committed fouls_drawn offsides
## 2 Matchweek 1 1 0 0 11 9 2
## 3 Matchweek 2 4 0 0 8 9 0
## 4 Matchweek 3 1 0 0 8 12 1
## 5 Matchweek 4 1 0 0 11 13 3
## 6 Matchweek 5 0 0 0 5 19 0
## 7 Matchweek 6 1 0 0 13 13 4
## crosses penalty_kicks_won penalty_kicks_conceded own_goals
## 2 5 1 0 0
## 3 25 0 0 0
## 4 8 0 0 0
## 5 10 0 0 0
## 6 6 0 0 0
## 7 19 0 0 0
## loose_ball_recovered aerial_duels_won aerial_duels_lost
## 2 76 2 11
## 3 95 10 11
## 4 98 16 8
## 5 94 21 11
## 6 105 10 11
## 7 114 15 20
## percentage_of_aerial_duels_won
## 2 15.4
## 3 47.6
## 4 66.7
## 5 65.6
## 6 47.6
## 7 42.9
Join the haaland table into the dortmund possession table by round.
haaland_dortmund_misc_2122 <-
haaland_2122_table %>%
left_join(dortmund_2122_table_misc,
by = "round")
haaland_dortmund_misc_2021 <-
haaland_2021_table %>%
left_join(dortmund_2021_table_misc,
by = "round")# mc_2122_scr <-
# read_html("https://fbref.com/en/squads/b8fd03ef/2021-2022/matchlogs/s11160/shooting/Manchester-City-Match-Logs-Premier-League#matchlogs_for") %>% html_elements(".stats_table#matchlogs_for") %>%
# html_table() %>%
# as.data.frame()
# saveRDS(mc_2122_scr, "mc_2122_scr.RDS")
#
# mc_2122_psg <-
# read_html("https://fbref.com/en/squads/b8fd03ef/2021-2022/matchlogs/s11160/passing/Manchester-City-Match-Logs-Premier-League#matchlogs_for") %>%
# html_table() %>%
# as.data.frame()
# saveRDS(mc_2122_psg, "mc_2122_psg.RDS")
#
# mc_2122_dfs <-
# read_html("https://fbref.com/en/squads/b8fd03ef/2021-2022/matchlogs/s11160/defense/Manchester-City-Match-Logs-Premier-League#matchlogs_for") %>%
# html_table() %>%
# as.data.frame()
# saveRDS(mc_2122_dfs, "mc_2122_dfs.RDS")
#
# mc_2122_pos <-
# read_html("https://fbref.com/en/squads/b8fd03ef/2021-2022/matchlogs/s11160/possession/Manchester-City-Match-Logs-Premier-League#matchlogs_for") %>%
# html_table() %>%
# as.data.frame()
# saveRDS(mc_2122_pos, "mc_2122_pos.RDS")
#
# mc_2122_misc <-
# read_html("https://fbref.com/en/squads/b8fd03ef/2021-2022/matchlogs/s11160/misc/Manchester-City-Match-Logs-Premier-League#matchlogs_for") %>%
# html_table() %>%
# as.data.frame()
# saveRDS(mc_2122_misc, "mc_2122_misc.RDS")# mc_2021_scr <-
# read_html("https://fbref.com/en/squads/b8fd03ef/2020-2021/matchlogs/s10728/shooting/Manchester-City-Match-Logs-Premier-League#matchlogs_for") %>%
# html_table() %>%
# as.data.frame()
# saveRDS(mc_2021_scr, "mc_2021_scr.RDS")
#
# mc_2021_psg <-
# read_html("https://fbref.com/en/squads/b8fd03ef/2020-2021/matchlogs/s10728/passing/Manchester-City-Match-Logs-Premier-League#matchlogs_for") %>%
# html_table() %>%
# as.data.frame()
# saveRDS(mc_2021_psg, "mc_2021_psg.RDS")
#
# mc_2021_dfs <-
# read_html("https://fbref.com/en/squads/b8fd03ef/2020-2021/matchlogs/s10728/defense/Manchester-City-Match-Logs-Premier-League#matchlogs_for") %>%
# html_table() %>%
# as.data.frame()
# saveRDS(mc_2021_dfs, "mc_2021_dfs.RDS")
#
# mc_2021_pos <-
# read_html("https://fbref.com/en/squads/b8fd03ef/2020-2021/matchlogs/s10728/possession/Manchester-City-Match-Logs-Premier-League#matchlogs_for") %>%
# html_table() %>%
# as.data.frame()
# saveRDS(mc_2021_pos, "mc_2021_pos.RDS")
#
# mc_2021_misc <-
# read_html("https://fbref.com/en/squads/b8fd03ef/2020-2021/matchlogs/s10728/misc/Manchester-City-Match-Logs-Premier-League#matchlogs_for") %>%
# html_table() %>%
# as.data.frame()
# saveRDS(mc_2021_misc, "mc_2021_misc.RDS") # mc_1920_scr <-
# read_html("https://fbref.com/en/squads/b8fd03ef/2019-2020/matchlogs/s3232/shooting/Manchester-City-Match-Logs-Premier-League#matchlogs_for") %>%
# html_table() %>%
# as.data.frame()
# saveRDS(mc_1920_scr, "mc_1920_scr.RDS")
#
# mc_1920_psg <-
# read_html("https://fbref.com/en/squads/b8fd03ef/2019-2020/matchlogs/s3232/passing/Manchester-City-Match-Logs-Premier-League#matchlogs_for") %>%
# html_table() %>%
# as.data.frame()
# saveRDS(mc_1920_psg, "mc_1920_psg.RDS")
#
# mc_1920_dfs <-
# read_html("https://fbref.com/en/squads/b8fd03ef/2019-2020/matchlogs/s3232/defense/Manchester-City-Match-Logs-Premier-League#matchlogs_for") %>%
# html_table() %>%
# as.data.frame()
# saveRDS(mc_1920_dfs, "mc_1920_dfs.RDS")
#
# mc_1920_pos <-
# read_html("https://fbref.com/en/squads/b8fd03ef/2019-2020/matchlogs/s3232/possession/Manchester-City-Match-Logs-Premier-League#matchlogs_for") %>%
# html_table() %>%
# as.data.frame()
# saveRDS(mc_1920_pos, "mc_1920_pos.RDS")
#
# mc_1920_misc <-
# read_html("https://fbref.com/en/squads/b8fd03ef/2019-2020/matchlogs/s3232/misc/Manchester-City-Match-Logs-Premier-League#matchlogs_for") %>%
# html_table() %>%
# as.data.frame()
# saveRDS(mc_1920_misc, "mc_1920_misc.RDS")mc_1920_misc <- readRDS("mc_1920_misc.RDS")
mc_1920_pos <- readRDS("mc_1920_pos.RDS")
mc_1920_dfs <- readRDS("mc_1920_dfs.RDS")
mc_1920_psg <- readRDS("mc_1920_psg.RDS")
mc_1920_scr <- readRDS("mc_1920_scr.RDS")
mc_2021_misc <- readRDS("mc_2021_misc.RDS")
mc_2021_pos <- readRDS("mc_2021_pos.RDS")
mc_2021_dfs <- readRDS("mc_2021_dfs.RDS")
mc_2021_psg <- readRDS("mc_2021_psg.RDS")
mc_2021_scr <- readRDS("mc_2021_scr.RDS")
mc_2122_misc <- readRDS("mc_2122_misc.RDS")
mc_2122_pos <- readRDS("mc_2122_pos.RDS")
mc_2122_dfs <- readRDS("mc_2122_dfs.RDS")
mc_2122_psg <- readRDS("mc_2122_psg.RDS")
mc_2122_scr <- readRDS("mc_2122_scr.RDS")In this phase, I only merge Haaland and Dortmund data. The merging and processing for Manchester City data is done at the later stage after figuring out what explanatory variables to include in the model.
haaland_dortmund_scoring <-
haaland_dortmund_scoring_2021 %>%
rbind(haaland_dortmund_scoring_2122)
haaland_dortmund_passing <-
haaland_dortmund_passing_2021 %>%
rbind(haaland_dortmund_passing_2122)
haaland_dortmund_defense <-
haaland_dortmund_defense_2021 %>%
rbind(haaland_dortmund_defense_2122)
haaland_dortmund_possession <-
haaland_dortmund_possession_2021 %>%
rbind(haaland_dortmund_possession_2122)
haaland_dortmund_misc <-
haaland_dortmund_misc_2021 %>%
rbind(haaland_dortmund_misc_2122)plot_bar(haaland_dortmund_scoring[,-1])plot_bar(haaland_dortmund_passing[,-1])plot_bar(haaland_dortmund_defense[,-1])plot_bar(haaland_dortmund_possession[,-1])plot_bar(haaland_dortmund_misc[,-1])plot_histogram(haaland_dortmund_scoring)plot_histogram(haaland_dortmund_passing)plot_histogram(haaland_dortmund_defense)plot_histogram(haaland_dortmund_possession)plot_histogram(haaland_dortmund_misc)hist(haaland_dortmund_scoring$goals,breaks = 4)ggplot(haaland_dortmund_scoring)+
geom_density(aes(x=goals))The distribution of goals count is resembling a poisson distribution. It’s possibly worth a try to model the prediction as a poisson model. For now, let’s see the value of mean and var.
mean(haaland_dortmund_scoring$goals)## [1] 0.9423077
var(haaland_dortmund_scoring$goals)## [1] 1.075038
When the value of var is more than the mean, the distribution is overdispersed. This overdispersion violates the assumption of a poisson model. But let’s not be too hasty, we will only need to dive deeper into this, IF the poisson model perform better than linear model.
Let’s select features by looking into correlations of the features.
ggcorr(haaland_dortmund_scoring,
label = TRUE,
label_size = 2.9,
hjust = 1,
layout.exp = 7,)ggcorr(haaland_dortmund_passing,
label = TRUE,
label_size = 2.9,
hjust = 1,
layout.exp = 7,)ggcorr(haaland_dortmund_defense,
label = TRUE,
label_size = 2.9,
hjust = 1,
layout.exp = 7,)ggcorr(haaland_dortmund_possession,
label = TRUE,
label_size = 2.9,
hjust = 1,
layout.exp = 7,)ggcorr(haaland_dortmund_misc,
label = TRUE,
label_size = 2.9,
hjust = 1,
layout.exp = 7,)i1 <- sapply(haaland_dortmund_scoring, is.numeric)
y1 <- "goals" #change it to actual column name
x1 <- setdiff(names(haaland_dortmund_scoring)[i1], y1)
sc <- as.data.frame(cor(haaland_dortmund_scoring[x1], haaland_dortmund_scoring[[y1]]))
sc %>% arrange(desc(V1))## V1
## goals_for 0.67977339
## goals_minus_expected_goals 0.54966730
## goals_pershot 0.52802457
## nonpenalty.goals_minus_nonpenalty.expectedgoals 0.50122485
## goals_pershot_ontarget 0.46835153
## shots_ontarget_percent 0.39261746
## penalty_kick 0.31560611
## nonpenalty_expected_goals_pershot 0.27354567
## expected_goals 0.21207968
## shots_freekick 0.19493543
## penalty_kick_att 0.18565765
## nonpenalty_expected_goals 0.15961622
## shots_ontarget 0.15219927
## goals_against 0.14561715
## minutes 0.13267334
## shots_avg_dist 0.01827763
## shots -0.17674966
i1 <- sapply(haaland_dortmund_passing, is.numeric)
n1 <- "goals" #change it to actual column name
m1 <- setdiff(names(haaland_dortmund_passing)[i1], n1)
ps <- as.data.frame(cor(haaland_dortmund_passing[m1], haaland_dortmund_passing[[n1]]))
ps %>% arrange(desc(V1))## V1
## assists 0.4683390854
## passes_completion_percentage 0.1385364960
## minutes 0.1326733381
## expected_goals_assisted 0.1041133055
## completed_crosses_to_18yardbox 0.0439491853
## completed_passes_to_18yardbox 0.0008611117
## passes_completed -0.0303892494
## passes_total_distance -0.0353332685
## passes_total_progressive_distance -0.0588232754
## passes_attempted -0.0745441536
## progressive_passes -0.0771225940
## completed_passes_to_thirdhalf -0.0924217325
## passes_directy_led_to_shot -0.1650121479
i1 <- sapply(haaland_dortmund_defense, is.numeric)
b1 <- "goals" #change it to actual column name
a1 <- setdiff(names(haaland_dortmund_defense)[i1], b1)
df <- as.data.frame(cor(haaland_dortmund_defense[a1], haaland_dortmund_defense[[b1]]))
df %>% arrange(desc(V1))## V1
## pressures_mid3rd 0.25459214
## error_led_toshot 0.21375993
## apply_pressures 0.20578037
## blocks_shots 0.19669387
## pressures_def3rd 0.19254375
## clearances 0.13728243
## minutes 0.13267334
## tackles_att3rd 0.12001725
## gained_possession_within5secsofpressure 0.11514390
## tackles_won_possession 0.10794441
## tackles_mid3rd 0.08812434
## players_tackled 0.08574533
## dribbled_past 0.07739570
## blocks 0.03123582
## interceptions -0.01503459
## tackles_def3rd -0.03657160
## blocks_passes -0.06165204
## pressures_att3rd -0.06704470
## succesful_pressure_percentage -0.09964455
## blocks_shots_ontarget -0.12850274
## percentage_dribblers_tackled -0.13011715
i1 <- sapply(haaland_dortmund_possession, is.numeric)
k1 <- "goals" #change it to actual column name
j1 <- setdiff(names(haaland_dortmund_possession)[i1], k1)
pss <- as.data.frame(cor(haaland_dortmund_possession[j1], haaland_dortmund_possession[[k1]]))
pss %>% arrange(desc(V1))## V1
## dispossessed 0.416182267
## completed_dribbles_percentage 0.365471883
## t_def3rd 0.148237291
## minutes 0.132673338
## passes_received_percentage 0.115462349
## dribbles_completed 0.090530514
## t_def_pen 0.063197788
## foe_dribbled_past 0.051260065
## carries 0.002509543
## passes_received -0.030389249
## failed_possession_gain -0.030851384
## t_mid3rd -0.048865375
## t_live_ball -0.050086896
## touches -0.067071211
## t_att_pen -0.077003401
## passing_target -0.078452253
## progressive_passes_received -0.079897923
## dribbles_attempted -0.085182747
## carries_to3rdpitch -0.108664480
## possession -0.110992047
## carries_to18yardsbox -0.115550560
## foe_nutmegged -0.119303859
## t_att3rd -0.145418283
## progressive_carries_distance_yards -0.152553214
## carries_distance_yards -0.155470775
## progressive_carries -0.160567277
i1 <- sapply(haaland_dortmund_misc, is.numeric)
u1 <- "goals" #change it to actual column name
t1 <- setdiff(names(haaland_dortmund_misc)[i1], u1)
mis <- as.data.frame(cor(haaland_dortmund_misc[t1], haaland_dortmund_misc[[u1]]))
mis %>% arrange(desc(V1))## V1
## penalty_kicks_conceded 0.36005842
## card_r 0.14423777
## card_y2 0.14423777
## minutes 0.13267334
## fouls_committed 0.09807890
## own_goals 0.01123706
## percentage_of_aerial_duels_won -0.02751473
## penalty_kicks_won -0.02795040
## card_y -0.06956626
## offsides -0.09453684
## fouls_drawn -0.11180918
## crosses -0.16667117
## aerial_duels_won -0.16847194
## aerial_duels_lost -0.18370010
## loose_ball_recovered -0.22906936
haaland_dortmund_all <-
haaland_dortmund_scoring[,-1:-3] %>%
cbind(haaland_dortmund_passing[,-1:-4],
haaland_dortmund_defense[,-1:-4],
haaland_dortmund_possession[,-1:-4],
haaland_dortmund_misc[,-1:-4]) i1 <- sapply(haaland_dortmund_all, is.numeric)
f1 <- "goals" # targeted column
g1 <- setdiff(names(haaland_dortmund_all)[i1], f1)
hd <- as.data.frame(cor(haaland_dortmund_all[g1], haaland_dortmund_all[[f1]]))
hd %>% arrange(desc(V1))## V1
## goals_for 0.6797733939
## goals_minus_expected_goals 0.5496673042
## goals_pershot 0.5280245659
## nonpenalty.goals_minus_nonpenalty.expectedgoals 0.5012248475
## goals_pershot_ontarget 0.4683515304
## assists 0.4683390854
## dispossessed 0.4161822666
## shots_ontarget_percent 0.3926174634
## completed_dribbles_percentage 0.3654718834
## penalty_kicks_conceded 0.3600584172
## penalty_kick 0.3156061130
## nonpenalty_expected_goals_pershot 0.2735456676
## pressures_mid3rd 0.2545921380
## error_led_toshot 0.2137599329
## expected_goals 0.2120796752
## apply_pressures 0.2057803662
## blocks_shots 0.1966938731
## shots_freekick 0.1949354254
## pressures_def3rd 0.1925437453
## penalty_kick_att 0.1856576471
## nonpenalty_expected_goals 0.1596162175
## shots_ontarget 0.1521992731
## t_def3rd 0.1482372908
## goals_against 0.1456171459
## card_r 0.1442377678
## card_y2 0.1442377678
## passes_completion_percentage 0.1385364960
## clearances 0.1372824298
## tackles_att3rd 0.1200172493
## passes_received_percentage 0.1154623492
## gained_possession_within5secsofpressure 0.1151439009
## tackles_won_possession 0.1079444082
## expected_goals_assisted 0.1041133055
## fouls_committed 0.0980788954
## dribbles_completed 0.0905305136
## tackles_mid3rd 0.0881243392
## players_tackled 0.0857453274
## dribbled_past 0.0773957016
## t_def_pen 0.0631977884
## foe_dribbled_past 0.0512600655
## completed_crosses_to_18yardbox 0.0439491853
## blocks 0.0312358161
## shots_avg_dist 0.0182776299
## own_goals 0.0112370585
## carries 0.0025095426
## completed_passes_to_18yardbox 0.0008611117
## interceptions -0.0150345921
## percentage_of_aerial_duels_won -0.0275147293
## penalty_kicks_won -0.0279504041
## passes_completed -0.0303892494
## passes_received -0.0303892494
## failed_possession_gain -0.0308513843
## passes_total_distance -0.0353332685
## tackles_def3rd -0.0365716043
## t_mid3rd -0.0488653752
## t_live_ball -0.0500868962
## passes_total_progressive_distance -0.0588232754
## blocks_passes -0.0616520444
## pressures_att3rd -0.0670447045
## touches -0.0670712109
## card_y -0.0695662576
## passes_attempted -0.0745441536
## t_att_pen -0.0770034010
## progressive_passes -0.0771225940
## passing_target -0.0784522531
## progressive_passes_received -0.0798979235
## dribbles_attempted -0.0851827469
## completed_passes_to_thirdhalf -0.0924217325
## offsides -0.0945368447
## succesful_pressure_percentage -0.0996445491
## carries_to3rdpitch -0.1086644800
## possession -0.1109920472
## fouls_drawn -0.1118091779
## carries_to18yardsbox -0.1155505601
## foe_nutmegged -0.1193038592
## blocks_shots_ontarget -0.1285027386
## percentage_dribblers_tackled -0.1301171543
## t_att3rd -0.1454182832
## progressive_carries_distance_yards -0.1525532138
## carries_distance_yards -0.1554707749
## progressive_carries -0.1605672767
## passes_directy_led_to_shot -0.1650121479
## crosses -0.1666711737
## aerial_duels_won -0.1684719434
## shots -0.1767496616
## aerial_duels_lost -0.1837001049
## loose_ball_recovered -0.2290693567
I select features for the model with 4 methods:
One feature from each top and bottom correlation table.
Two features from each top and bottom correlation table.
Top 5 and bottom 5 features from the compiled correlation table.
Top 10 and bottom 10 features from the compiled correlation table.
haaland_dortmund_1ofeach<-
haaland_dortmund_all %>%
select(
c(goals,
goals_for,
shots,
assists,
passes_directy_led_to_shot,
pressures_mid3rd,
percentage_dribblers_tackled,
dispossessed,
progressive_carries,
penalty_kicks_conceded,
loose_ball_recovered
)
)
haaland_dortmund_2ofeach<-
haaland_dortmund_all %>%
select(
c(goals,
goals_for,
shots,
assists,
passes_directy_led_to_shot,
pressures_mid3rd,
percentage_dribblers_tackled,
dispossessed,
progressive_carries,
penalty_kicks_conceded,
loose_ball_recovered,
goals_minus_expected_goals,
shots_avg_dist,
passes_completion_percentage,
passes_directy_led_to_shot,
error_led_toshot,
blocks_shots_ontarget,
completed_dribbles_percentage,
carries_distance_yards,
card_r,
aerial_duels_lost
)
)
haaland_dortmund_5ofall<-
haaland_dortmund_all %>%
select(
c(goals,
goals_for,
goals_minus_expected_goals,
goals_pershot,
nonpenalty.goals_minus_nonpenalty.expectedgoals,
goals_pershot_ontarget,
loose_ball_recovered,
aerial_duels_lost,
shots,
aerial_duels_won,
crosses
)
)
haaland_dortmund_10ofall<-
haaland_dortmund_all %>%
select(
c(goals,
goals_for,
goals_minus_expected_goals,
goals_pershot,
nonpenalty.goals_minus_nonpenalty.expectedgoals,
goals_pershot_ontarget,
loose_ball_recovered,
aerial_duels_lost,
shots,
aerial_duels_won,
crosses,
assists,
dispossessed,
shots_ontarget_percent,
completed_dribbles_percentage,
penalty_kicks_conceded,
progressive_carries,
passes_directy_led_to_shot,
carries_distance_yards,
progressive_carries_distance_yards,
t_att3rd
)
)Index and split data.
RNGkind(sample.kind = "Rounding")
set.seed(999)
# index sampling
index <- sample(x = nrow(haaland_dortmund_1ofeach),
size = nrow(haaland_dortmund_1ofeach)*0.8)
# splitting
train_1ofeach <- na.omit(haaland_dortmund_1ofeach[index,])
test_1ofeach <- na.omit(haaland_dortmund_1ofeach[-index,])
# index sampling
index <- sample(x = nrow(haaland_dortmund_2ofeach),
size = nrow(haaland_dortmund_2ofeach)*0.8)
# splitting
train_2ofeach <- na.omit(haaland_dortmund_2ofeach[index,])
test_2ofeach <- na.omit(haaland_dortmund_2ofeach[-index,])
# index sampling
index <- sample(x = nrow(haaland_dortmund_5ofall),
size = nrow(haaland_dortmund_5ofall)*0.8)
# splitting
train_5ofall <- na.omit(haaland_dortmund_5ofall[index,])
test_5ofall <- na.omit(haaland_dortmund_5ofall[-index,])
# index sampling
index <- sample(x = nrow(haaland_dortmund_10ofall),
size = nrow(haaland_dortmund_10ofall)*0.8)
# splitting
train_10ofall <- na.omit(haaland_dortmund_10ofall[index,])
test_10ofall <- na.omit(haaland_dortmund_10ofall[-index,])Since the histogram and density plot in the EDA shown poisson distribution, here I will use both linear and poisson regression.
linear_model_all_1ofeach <- lm(goals ~ ., data=train_1ofeach)
linear_model_null_1ofeach <- lm(goals ~ 1, data=train_1ofeach)
poisson_model_all_1ofeach <- glm(goals ~., data=train_1ofeach, family = poisson)
poisson_model_null_1ofeach <- glm(goals ~1, data=train_1ofeach, family = poisson)
linear_model_all_2ofeach <- lm(goals ~ ., data=train_2ofeach)
linear_model_null_2ofeach <- lm(goals ~ 1, data=train_2ofeach)
poisson_model_all_2ofeach <- glm(goals ~., data=train_2ofeach, family = poisson)
poisson_model_null_2ofeach <- glm(goals ~1, data=train_2ofeach, family = poisson)
linear_model_all_5ofall <- lm(goals ~ ., data=train_5ofall)
linear_model_null_5ofall <- lm(goals ~ 1, data=train_5ofall)
poisson_model_all_5ofall <- glm(goals ~., data=train_5ofall, family = poisson)
poisson_model_null_5ofall <- glm(goals ~1, data=train_5ofall, family = poisson)
linear_model_all_10ofall <- lm(goals ~ ., data=train_10ofall)
linear_model_null_10ofall <- lm(goals ~ 1, data=train_10ofall)
poisson_model_all_10ofall <- glm(goals ~., data=train_10ofall, family = poisson)
poisson_model_null_10ofall <- glm(goals ~1, data=train_10ofall, family = poisson)Use stepwise regression to find suitable features.
linear_model_stepboth_1ofeach <-
step(object = linear_model_null_1ofeach, direction = "both",
scope = list(lower = linear_model_null_1ofeach,
upper = linear_model_all_1ofeach), trace = 0)
linear_model_stepback_1ofeach <-
step(object = linear_model_all_1ofeach,
direction = "backward",
trace = 0)
linear_model_stepfor_1ofeach <-
step(object = linear_model_null_1ofeach, direction = "forward",
scope = list(lower = linear_model_null_1ofeach,
upper = linear_model_all_1ofeach),
trace = 0)
poisson_model_stepboth_1ofeach <-
step(object = poisson_model_null_1ofeach, direction = "both",
scope = list(lower = poisson_model_null_1ofeach,
upper = poisson_model_all_1ofeach), trace = 0)
poisson_model_stepback_1ofeach <-
step(object = poisson_model_all_1ofeach,
direction = "backward",
trace = 0)
poisson_model_stepfor_1ofeach <-
step(object = poisson_model_null_1ofeach, direction = "forward",
scope = list(lower = poisson_model_null_1ofeach,
upper = poisson_model_all_1ofeach),
trace = 0)
###
linear_model_stepboth_2ofeach <-
step(object = linear_model_null_2ofeach, direction = "both",
scope = list(lower = linear_model_null_2ofeach,
upper = linear_model_all_2ofeach), trace = 0)
linear_model_stepback_2ofeach <-
step(object = linear_model_all_2ofeach,
direction = "backward",
trace = 0)
linear_model_stepfor_2ofeach <-
step(object = linear_model_null_2ofeach, direction = "forward",
scope = list(lower = linear_model_null_2ofeach,
upper = linear_model_all_2ofeach),
trace = 0)
poisson_model_stepboth_2ofeach <-
step(object = poisson_model_null_2ofeach, direction = "both",
scope = list(lower = poisson_model_null_2ofeach,
upper = poisson_model_all_2ofeach), trace = 0)
poisson_model_stepback_2ofeach <-
step(object = poisson_model_all_2ofeach,
direction = "backward",
trace = 0)
poisson_model_stepfor_2ofeach <-
step(object = poisson_model_null_2ofeach, direction = "forward",
scope = list(lower = poisson_model_null_2ofeach,
upper = poisson_model_all_2ofeach),
trace = 0)
###
linear_model_stepboth_5ofall <-
step(object = linear_model_null_5ofall, direction = "both",
scope = list(lower = linear_model_null_5ofall,
upper = linear_model_all_5ofall), trace = 0)
linear_model_stepback_5ofall <-
step(object = linear_model_all_5ofall,
direction = "backward",
trace = 0)
linear_model_stepfor_5ofall <-
step(object = linear_model_null_5ofall, direction = "forward",
scope = list(lower = linear_model_null_5ofall,
upper = linear_model_all_5ofall),
trace = 0)
poisson_model_stepboth_5ofall <-
step(object = poisson_model_null_5ofall, direction = "both",
scope = list(lower = poisson_model_null_5ofall,
upper = poisson_model_all_5ofall), trace = 0)
poisson_model_stepback_5ofall <-
step(object = poisson_model_all_5ofall,
direction = "backward",
trace = 0)
poisson_model_stepfor_5ofall <-
step(object = poisson_model_null_5ofall, direction = "forward",
scope = list(lower = poisson_model_null_5ofall,
upper = poisson_model_all_5ofall),
trace = 0)
###
linear_model_stepboth_10ofall <-
step(object = linear_model_null_10ofall, direction = "both",
scope = list(lower = linear_model_null_10ofall,
upper = linear_model_all_10ofall), trace = 0)
linear_model_stepback_10ofall <-
step(object = linear_model_all_10ofall,
direction = "backward",
trace = 0)
linear_model_stepfor_10ofall <-
step(object = linear_model_null_10ofall, direction = "forward",
scope = list(lower = linear_model_null_10ofall,
upper = linear_model_all_10ofall),
trace = 0)
poisson_model_stepboth_10ofall <-
step(object = poisson_model_null_10ofall, direction = "both",
scope = list(lower = poisson_model_null_10ofall,
upper = poisson_model_all_10ofall), trace = 0)
poisson_model_stepback_10ofall <-
step(object = poisson_model_all_10ofall,
direction = "backward",
trace = 0)
poisson_model_stepfor_10ofall <-
step(object = poisson_model_null_10ofall, direction = "forward",
scope = list(lower = poisson_model_null_10ofall,
upper = poisson_model_all_10ofall),
trace = 0)Let’s compare between all the model we’ve got from stepwise method.
compare_performance(linear_model_stepboth_1ofeach,
linear_model_stepback_1ofeach,
linear_model_stepfor_1ofeach,
poisson_model_stepboth_1ofeach,
poisson_model_stepback_1ofeach,
poisson_model_stepfor_1ofeach
#metrics = "all"
)## # Comparison of Model Performance Indices
##
## Name | Model | AIC | AIC weights | BIC | BIC weights | RMSE | Sigma | Nagelkerke's R2 | Score_log | Score_spherical | R2 | R2 (adj.)
## -------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## linear_model_stepboth_1ofeach | lm | 84.519 | 0.221 | 96.514 | 0.169 | 0.572 | 0.619 | | | | 0.632 | 0.579
## linear_model_stepback_1ofeach | lm | 84.519 | 0.221 | 96.514 | 0.169 | 0.572 | 0.619 | | | | 0.632 | 0.579
## linear_model_stepfor_1ofeach | lm | 84.519 | 0.221 | 96.514 | 0.169 | 0.572 | 0.619 | | | | 0.632 | 0.579
## poisson_model_stepboth_1ofeach | glm | 85.505 | 0.135 | 95.786 | 0.243 | 0.591 | 0.802 | 0.713 | -0.896 | 0.136 | |
## poisson_model_stepback_1ofeach | glm | 86.930 | 0.066 | 102.353 | 0.009 | 0.509 | 0.749 | 0.785 | -0.841 | 0.136 | |
## poisson_model_stepfor_1ofeach | glm | 85.505 | 0.135 | 95.786 | 0.243 | 0.591 | 0.802 | 0.713 | -0.896 | 0.136 | |
compare_performance(linear_model_stepboth_2ofeach,
linear_model_stepback_2ofeach,
linear_model_stepfor_2ofeach,
poisson_model_stepboth_2ofeach,
poisson_model_stepback_2ofeach,
poisson_model_stepfor_2ofeach
#metrics = "all"
)## # Comparison of Model Performance Indices
##
## Name | Model | AIC | AIC weights | BIC | BIC weights | RMSE | Sigma | Nagelkerke's R2 | Score_log | Score_spherical | R2 | R2 (adj.)
## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## linear_model_stepboth_2ofeach | lm | 75.302 | 0.282 | 90.724 | 0.456 | 0.487 | 0.542 | | | | 0.779 | 0.732
## linear_model_stepback_2ofeach | lm | 74.425 | 0.437 | 94.988 | 0.054 | 0.448 | 0.523 | | | | 0.813 | 0.751
## linear_model_stepfor_2ofeach | lm | 75.302 | 0.282 | 90.724 | 0.456 | 0.487 | 0.542 | | | | 0.779 | 0.732
## poisson_model_stepboth_2ofeach | glm | 91.269 | < 0.001 | 98.124 | 0.011 | 0.688 | 0.855 | 0.655 | -1.015 | 0.138 | |
## poisson_model_stepback_2ofeach | glm | 91.269 | < 0.001 | 98.124 | 0.011 | 0.688 | 0.855 | 0.655 | -1.015 | 0.138 | |
## poisson_model_stepfor_2ofeach | glm | 91.269 | < 0.001 | 98.124 | 0.011 | 0.688 | 0.855 | 0.655 | -1.015 | 0.138 | |
compare_performance(linear_model_stepboth_5ofall,
linear_model_stepback_5ofall,
linear_model_stepfor_5ofall,
poisson_model_stepboth_5ofall,
poisson_model_stepback_5ofall,
poisson_model_stepfor_5ofall
# metrics = "all"
)## # Comparison of Model Performance Indices
##
## Name | Model | AIC | AIC weights | BIC | BIC weights | RMSE | Sigma | Nagelkerke's R2 | Score_log | Score_spherical | R2 | R2 (adj.)
## -------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## linear_model_stepboth_5ofall | lm | 104.742 | 0.025 | 111.596 | 0.018 | 0.787 | 0.818 | | | | 0.421 | 0.391
## linear_model_stepback_5ofall | lm | 103.423 | 0.049 | 113.704 | 0.006 | 0.738 | 0.787 | | | | 0.492 | 0.435
## linear_model_stepfor_5ofall | lm | 104.742 | 0.025 | 111.596 | 0.018 | 0.787 | 0.818 | | | | 0.421 | 0.391
## poisson_model_stepboth_5ofall | glm | 101.036 | 0.160 | 106.177 | 0.271 | 0.884 | 0.976 | 0.453 | -1.159 | 0.139 | |
## poisson_model_stepback_5ofall | glm | 98.460 | 0.581 | 105.314 | 0.417 | 0.790 | 0.924 | 0.551 | -1.103 | 0.135 | |
## poisson_model_stepfor_5ofall | glm | 101.036 | 0.160 | 106.177 | 0.271 | 0.884 | 0.976 | 0.453 | -1.159 | 0.139 | |
compare_performance(linear_model_stepboth_10ofall,
linear_model_stepback_10ofall,
linear_model_stepfor_10ofall,
poisson_model_stepboth_10ofall,
poisson_model_stepback_10ofall,
poisson_model_stepfor_10ofall
# metrics = "all"
)## # Comparison of Model Performance Indices
##
## Name | Model | AIC | AIC weights | BIC | BIC weights | RMSE | Sigma | Nagelkerke's R2 | Score_log | Score_spherical | R2 | R2 (adj.)
## -------------------------------------------------------------------------------------------------------------------------------------------------------------------------
## linear_model_stepboth_10ofall | lm | 87.614 | 0.319 | 106.463 | 0.019 | 0.539 | 0.619 | | | | 0.736 | 0.659
## linear_model_stepback_10ofall | lm | 87.614 | 0.319 | 106.463 | 0.019 | 0.539 | 0.619 | | | | 0.736 | 0.659
## linear_model_stepfor_10ofall | lm | 87.614 | 0.319 | 106.463 | 0.019 | 0.539 | 0.619 | | | | 0.736 | 0.659
## poisson_model_stepboth_10ofall | glm | 93.218 | 0.019 | 100.072 | 0.471 | 0.712 | 0.915 | 0.609 | -1.039 | 0.136 | |
## poisson_model_stepback_10ofall | glm | 95.938 | 0.005 | 114.787 | < 0.001 | 0.583 | 0.810 | 0.787 | -0.902 | 0.134 | |
## poisson_model_stepfor_10ofall | glm | 93.218 | 0.019 | 100.072 | 0.471 | 0.712 | 0.915 | 0.609 | -1.039 | 0.136 | |
compare_performance(linear_model_stepboth_1ofeach,
linear_model_stepback_1ofeach,
linear_model_stepfor_1ofeach,
poisson_model_stepboth_1ofeach,
poisson_model_stepback_1ofeach,
poisson_model_stepfor_1ofeach,
linear_model_stepboth_2ofeach,
linear_model_stepback_2ofeach,
linear_model_stepfor_2ofeach,
poisson_model_stepboth_2ofeach,
poisson_model_stepback_2ofeach,
poisson_model_stepfor_2ofeach,
linear_model_stepboth_5ofall,
linear_model_stepback_5ofall,
linear_model_stepfor_5ofall,
poisson_model_stepboth_5ofall,
poisson_model_stepback_5ofall,
poisson_model_stepfor_5ofall,
linear_model_stepboth_10ofall,
linear_model_stepback_10ofall,
linear_model_stepfor_10ofall,
poisson_model_stepboth_10ofall,
poisson_model_stepback_10ofall,
poisson_model_stepfor_10ofall,
rank = TRUE,
verbose = T
)## # Comparison of Model Performance Indices
##
## Name | Model | RMSE | Sigma | AIC weights | BIC weights | Performance-Score
## ------------------------------------------------------------------------------------------------------
## linear_model_stepboth_2ofeach | lm | 0.487 | 0.542 | 0.278 | 0.393 | 87.82%
## linear_model_stepfor_2ofeach | lm | 0.487 | 0.542 | 0.278 | 0.393 | 87.82%
## linear_model_stepback_2ofeach | lm | 0.448 | 0.523 | 0.430 | 0.047 | 77.97%
## linear_model_stepback_10ofall | lm | 0.539 | 0.619 | < 0.001 | < 0.001 | 39.52%
## linear_model_stepboth_10ofall | lm | 0.539 | 0.619 | < 0.001 | < 0.001 | 39.52%
## linear_model_stepfor_10ofall | lm | 0.539 | 0.619 | < 0.001 | < 0.001 | 39.52%
## linear_model_stepback_1ofeach | lm | 0.572 | 0.619 | 0.003 | 0.022 | 39.14%
## linear_model_stepboth_1ofeach | lm | 0.572 | 0.619 | 0.003 | 0.022 | 39.14%
## linear_model_stepfor_1ofeach | lm | 0.572 | 0.619 | 0.003 | 0.022 | 39.14%
## poisson_model_stepback_1ofeach | glm | 0.509 | 0.749 | < 0.001 | 0.001 | 34.14%
## poisson_model_stepboth_1ofeach | glm | 0.591 | 0.802 | 0.002 | 0.031 | 28.44%
## poisson_model_stepfor_1ofeach | glm | 0.591 | 0.802 | 0.002 | 0.031 | 28.44%
## poisson_model_stepback_10ofall | glm | 0.583 | 0.810 | < 0.001 | < 0.001 | 26.36%
## linear_model_stepback_5ofall | lm | 0.738 | 0.787 | < 0.001 | < 0.001 | 18.79%
## poisson_model_stepboth_2ofeach | glm | 0.688 | 0.855 | < 0.001 | 0.010 | 18.57%
## poisson_model_stepback_2ofeach | glm | 0.688 | 0.855 | < 0.001 | 0.010 | 18.57%
## poisson_model_stepfor_2ofeach | glm | 0.688 | 0.855 | < 0.001 | 0.010 | 18.57%
## linear_model_stepboth_5ofall | lm | 0.787 | 0.818 | < 0.001 | < 0.001 | 14.28%
## linear_model_stepfor_5ofall | lm | 0.787 | 0.818 | < 0.001 | < 0.001 | 14.28%
## poisson_model_stepboth_10ofall | glm | 0.712 | 0.915 | < 0.001 | 0.004 | 13.46%
## poisson_model_stepfor_10ofall | glm | 0.712 | 0.915 | < 0.001 | 0.004 | 13.46%
## poisson_model_stepback_5ofall | glm | 0.790 | 0.924 | < 0.001 | < 0.001 | 8.28%
## poisson_model_stepboth_5ofall | glm | 0.884 | 0.976 | < 0.001 | < 0.001 | 0.01%
## poisson_model_stepfor_5ofall | glm | 0.884 | 0.976 | < 0.001 | < 0.001 | 0.01%
Despite having poisson distribution when we inspected the target variable earlier, the function above shows us that linear_model_stepboth_2ofeach gives the best performance. Another thing to note that, all linear models in each predictors group are doing better than their poisson models.
Let’s take a look at the comparison between the best of each features group.
plot(compare_performance(linear_model_stepback_1ofeach,
linear_model_stepboth_2ofeach,
linear_model_stepback_5ofall,
linear_model_stepback_10ofall
)
)
In general, linear_model_stepboth_2ofeach is far superior above the
rest.
Since the comparison above didn’t show Adjusted R-squared explicitly, let’s take the best performing model out of each features selection methods and compare their Adjusted R-squared values.
summary(linear_model_stepback_1ofeach)$adj.r.squared## [1] 0.5789923
summary(linear_model_stepboth_2ofeach)$adj.r.squared## [1] 0.7324671
summary(linear_model_stepback_5ofall)$adj.r.squared## [1] 0.4350874
summary(linear_model_stepback_10ofall)$adj.r.squared## [1] 0.6587668
Before we continue looking deeper on our best-performance model on the training dataset, let’s test, the best of each 4 features selection methods on the dataset.
1ofeach: linear_model_stepback_1ofeach with previously 39.14% performance score
2ofeach: linear_model_stepboth_2ofeach with previously 87.82% performance score
5ofall: linear_model_stepback_5ofall with previously 18.79% performance score
10ofall: linear_model_stepback_10ofall with previously 39.52% performance score
test_1ofeach <- test_1ofeach %>% mutate(pred = predict(linear_model_stepback_1ofeach,
test_1ofeach)) %>% floor()
test_2ofeach <- test_2ofeach %>% mutate(pred = predict(linear_model_stepboth_2ofeach,
test_2ofeach)) %>% floor()
test_5ofall <- test_5ofall %>% mutate(pred = predict(linear_model_stepback_5ofall,
test_5ofall)) %>% floor()
test_10ofall <- test_10ofall %>% mutate(pred = predict(linear_model_stepback_10ofall,
test_10ofall)) %>% floor() test_1ofeach[,c("goals","pred")] %>% summarise(total_goal = sum(goals), total_pred = sum(pred),
error = abs(total_goal - total_pred)) ## total_goal total_pred error
## 1 13 5 8
test_2ofeach[,c("goals","pred")] %>% summarise(total_goal = sum(goals), total_pred = sum(pred),
error = abs(total_goal - total_pred))## total_goal total_pred error
## 1 8 -2 10
test_5ofall[,c("goals","pred")] %>% summarise(total_goal = sum(goals), total_pred = sum(pred),
error = abs(total_goal - total_pred))## total_goal total_pred error
## 1 6 2 4
test_10ofall[,c("goals","pred")] %>% summarise(total_goal = sum(goals), total_pred = sum(pred),
error = abs(total_goal - total_pred))## total_goal total_pred error
## 1 9 4 5
In terms of total errors, linear_model_stepback_1ofeach is better than linear_model_stepboth_2ofeach on the test data.
MAE(y_pred = test_1ofeach$pred, y_true = test_1ofeach$goals)## [1] 0.7272727
MAE(y_pred = test_2ofeach$pred, y_true = test_2ofeach$goals)## [1] 0.9090909
MAE(y_pred = test_5ofall$pred, y_true = test_5ofall$goals)## [1] 0.3636364
MAE(y_pred = test_10ofall$pred, y_true = test_10ofall$goals)## [1] 0.4545455
Another upset, linear_model_stepback_5ofall is better than linear_model_stepboth_2ofeach on the data test when it comes to MAE.
MSE(y_pred = test_1ofeach$pred, y_true = test_1ofeach$goals)## [1] 0.9090909
MSE(y_pred = test_2ofeach$pred, y_true = test_2ofeach$goals)## [1] 1.272727
MSE(y_pred = test_5ofall$pred, y_true = test_5ofall$goals)## [1] 0.3636364
MSE(y_pred = test_10ofall$pred, y_true = test_10ofall$goals)## [1] 0.6363636
Once again, the linear_model_stepback_1ofeach is better than linear_model_stepboth_2ofeach on the test data.
rSquared(y = test_1ofeach$goals, resid = test_1ofeach$goals-test_1ofeach$pred)## [,1]
## [1,] 0.4329897
rSquared(y = test_2ofeach$goals, resid = test_2ofeach$goals-test_2ofeach$pred)## [,1]
## [1,] -0.375
rSquared(y = test_5ofall$goals, resid = test_5ofall$goals-test_5ofall$pred)## [,1]
## [1,] 0.5416667
rSquared(y = test_10ofall$goals, resid = test_10ofall$goals-test_10ofall$pred)## [,1]
## [1,] 0.2735849
Even in the R-squared aspect, linear_model_stepback_1ofeac stomped over linear_model_stepboth_2ofeach when it comes to the test data.
Despite the superior performance on training data, linear_model_stepboth_2ofeach has been beaten repeatedly on the test data by linear_model_stepback_1ofeach. More predictors don’t necessarily affect better result. So, in the next step, we’ll pick linear_model_stepback_1ofeach as our model.
summary(linear_model_stepback_1ofeach)##
## Call:
## lm(formula = goals ~ goals_for + shots + percentage_dribblers_tackled +
## dispossessed + penalty_kicks_conceded, data = train_1ofeach)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.29459 -0.39311 0.01308 0.38746 1.42160
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.03322 0.48520 0.068 0.9458
## goals_for 0.49415 0.08654 5.710 1.86e-06 ***
## shots -0.03989 0.02045 -1.951 0.0591 .
## percentage_dribblers_tackled -0.01828 0.00695 -2.631 0.0126 *
## dispossessed 0.07582 0.02676 2.833 0.0076 **
## penalty_kicks_conceded 0.48482 0.26451 1.833 0.0753 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6189 on 35 degrees of freedom
## Multiple R-squared: 0.6316, Adjusted R-squared: 0.579
## F-statistic: 12 on 5 and 35 DF, p-value: 8.261e-07
The most statistically significant coefficients in the linear_model_stepback_1ofeach above is goals_for, then dispossessed comes second. Goals_for refers to the number of goals scored by the team. It is understandable that goals_for has postitive estimate to the number of goals scored by Erling Haaland, the dispossessed predictor is a bit weird. Dispossessed metric refers to the number of times a player loses control of the ball after being tackled by an opposing player (does not include attempted dribbles). It’s probably counter intuitive to say that the more any teammate gets dispossessed, the more Haaland scores, but on the other hand, this might be an indirect effect of the team’s playing style. A team that play more aggressive has an increased chance of being dispossessed compared to a team that play safely with sideways and backward passes to hold on to the ball. Another curious feature that seems counter intuitive is penalty_kicks_conceded. It refers to the number of event the enemy having a chance at penalty in front of their goal. It seems to affect Haaland goal scoring stats positively. Perhaps an event that put his team on edge, actually fuels fire in him to score more in order to overturned the situation.
RMSE of the train data is 0.572.
performance(linear_model_stepback_1ofeach)## # Indices of model performance
##
## AIC | BIC | R2 | R2 (adj.) | RMSE | Sigma
## ---------------------------------------------------
## 84.519 | 96.514 | 0.632 | 0.579 | 0.572 | 0.619
Let’s see the RMSE of the test data.
test_1ofeach$pred <- predict(linear_model_stepback_1ofeach, newdata = test_1ofeach)RMSE(y_pred = test_1ofeach$pred, y_true = test_1ofeach$goals)## [1] 0.5863103
There is a slight bigger RMSE in the test data than the train data. The model is slightly overfit.
Let’s see how does it fare with the assumptions for linear regression. First, we test for its linearity between the response variable and explanatory variables.
cor.test(train_2ofeach$goals, train_2ofeach$goals_for)##
## Pearson's product-moment correlation
##
## data: train_2ofeach$goals and train_2ofeach$goals_for
## t = 5.9393, df = 39, p-value = 6.288e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4841484 0.8224314
## sample estimates:
## cor
## 0.6891509
cor.test(train_2ofeach$goals, train_2ofeach$dispossessed)##
## Pearson's product-moment correlation
##
## data: train_2ofeach$goals and train_2ofeach$dispossessed
## t = 3.1039, df = 39, p-value = 0.003548
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1592257 0.6620690
## sample estimates:
## cor
## 0.4450738
cor.test(train_2ofeach$goals, train_2ofeach$penalty_kicks_conceded)##
## Pearson's product-moment correlation
##
## data: train_2ofeach$goals and train_2ofeach$penalty_kicks_conceded
## t = 2.8558, df = 39, p-value = 0.006846
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1241015 0.6414543
## sample estimates:
## cor
## 0.4158739
cor.test(train_2ofeach$goals, train_2ofeach$percentage_dribblers_tackled)##
## Pearson's product-moment correlation
##
## data: train_2ofeach$goals and train_2ofeach$percentage_dribblers_tackled
## t = -0.38308, df = 39, p-value = 0.7037
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.3620580 0.2511538
## sample estimates:
## cor
## -0.06122755
cor.test(train_2ofeach$goals, train_2ofeach$pressures_mid3rd)##
## Pearson's product-moment correlation
##
## data: train_2ofeach$goals and train_2ofeach$pressures_mid3rd
## t = 1.3239, df = 39, p-value = 0.1932
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1071008 0.4841441
## sample estimates:
## cor
## 0.2073828
cor.test(train_2ofeach$goals, train_2ofeach$assists)##
## Pearson's product-moment correlation
##
## data: train_2ofeach$goals and train_2ofeach$assists
## t = 2.9858, df = 39, p-value = 0.004867
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1426175 0.6524154
## sample estimates:
## cor
## 0.4313422
cor.test(train_2ofeach$goals, train_2ofeach$loose_ball_recovered)##
## Pearson's product-moment correlation
##
## data: train_2ofeach$goals and train_2ofeach$loose_ball_recovered
## t = -1.0461, df = 39, p-value = 0.302
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.4499886 0.1500683
## sample estimates:
## cor
## -0.1652096
Since the alternative hypothesis all resulted in true, these explanatory variables are linear enough to pass the test.
We hope to see that the errors from our model are distributed normally.
hist(linear_model_stepback_1ofeach$residuals, breaks = 5)
The histogram seems good, now let’s apply shapiro test. If the p-value
failed to reject NULL, it is normally distributed
shapiro.test(linear_model_stepback_1ofeach$residuals)##
## Shapiro-Wilk normality test
##
## data: linear_model_stepback_1ofeach$residuals
## W = 0.98778, p-value = 0.9323
Since the p-value is above the 0.05, we can fairly say our model has errors that are distributed normally.
We expect our errors’ variance to be constant. Let’s observe if there’s any pattern at all.
plot(train_1ofeach$goals, linear_model_stepboth_1ofeach$residuals)
It seems there’s a pattern here. To be sure, let’s apply Breusch-Pagan
hypothesis test. We hope the resulting p-value will be above alpha. If
we fail to reject the null hypothesis, it means our errors have no
pattern.
library(lmtest)
bptest(linear_model_stepback_1ofeach)##
## studentized Breusch-Pagan test
##
## data: linear_model_stepback_1ofeach
## BP = 11.609, df = 5, p-value = 0.04056
With the p-value of 0.04, we can say that there’s a pattern.
We don’t want our predictors/explanatory variables to have strong correlations between them. To verify this, let’s see their Variance Inflation Factor (VIF).
vif(linear_model_stepback_1ofeach)## goals_for shots
## 1.051090 1.009494
## percentage_dribblers_tackled dispossessed
## 1.068329 1.108631
## penalty_kicks_conceded
## 1.060369
Since there are no value above 10, we can say that this model’s predictors have no multicollinearity amongst them.
Since the model has problem of heteroscedasticity, we can try to transform the response variable into log, log2, log10, or sqrt.
train_1ofeach <- train_1ofeach %>%
mutate(log_goals = log(goals),
log_goals = ifelse(log_goals < 0, 0, log_goals),
log2_goals = log2(goals),
log2_goals = ifelse(log2_goals < 0, 0, log2_goals),
log10_goals = log10(goals),
log10_goals = ifelse(log10_goals < 0, 0, log10_goals),
sqrt_goals = sqrt(goals),
sqrt_goals = ifelse(sqrt_goals < 0, 0, sqrt_goals))
train_1ofeach[,c("goals","log_goals","log2_goals","log10_goals","sqrt_goals")] ## goals log_goals log2_goals log10_goals sqrt_goals
## 21 2 0.6931472 1.000000 0.3010300 1.414214
## 30 0 0.0000000 0.000000 0.0000000 0.000000
## 5 1 0.0000000 0.000000 0.0000000 1.000000
## 42 1 0.0000000 0.000000 0.0000000 1.000000
## 38 2 0.6931472 1.000000 0.3010300 1.414214
## 6 1 0.0000000 0.000000 0.0000000 1.000000
## 28 2 0.6931472 1.000000 0.3010300 1.414214
## 4 0 0.0000000 0.000000 0.0000000 0.000000
## 18 0 0.0000000 0.000000 0.0000000 0.000000
## 27 0 0.0000000 0.000000 0.0000000 0.000000
## 12 0 0.0000000 0.000000 0.0000000 0.000000
## 23 0 0.0000000 0.000000 0.0000000 0.000000
## 2 0 0.0000000 0.000000 0.0000000 0.000000
## 39 0 0.0000000 0.000000 0.0000000 0.000000
## 45 0 0.0000000 0.000000 0.0000000 0.000000
## 51 0 0.0000000 0.000000 0.0000000 0.000000
## 47 0 0.0000000 0.000000 0.0000000 0.000000
## 29 2 0.6931472 1.000000 0.3010300 1.414214
## 50 3 1.0986123 1.584963 0.4771213 1.732051
## 3 2 0.6931472 1.000000 0.3010300 1.414214
## 37 0 0.0000000 0.000000 0.0000000 0.000000
## 20 0 0.0000000 0.000000 0.0000000 0.000000
## 17 2 0.6931472 1.000000 0.3010300 1.414214
## 10 2 0.6931472 1.000000 0.3010300 1.414214
## 24 2 0.6931472 1.000000 0.3010300 1.414214
## 41 2 0.6931472 1.000000 0.3010300 1.414214
## 15 0 0.0000000 0.000000 0.0000000 0.000000
## 16 1 0.0000000 0.000000 0.0000000 1.000000
## 31 1 0.0000000 0.000000 0.0000000 1.000000
## 11 0 0.0000000 0.000000 0.0000000 0.000000
## 40 0 0.0000000 0.000000 0.0000000 0.000000
## 34 2 0.6931472 1.000000 0.3010300 1.414214
## 43 0 0.0000000 0.000000 0.0000000 0.000000
## 49 0 0.0000000 0.000000 0.0000000 0.000000
## 35 1 0.0000000 0.000000 0.0000000 1.000000
## 26 2 0.6931472 1.000000 0.3010300 1.414214
## 14 0 0.0000000 0.000000 0.0000000 0.000000
## 19 2 0.6931472 1.000000 0.3010300 1.414214
## 1 2 0.6931472 1.000000 0.3010300 1.414214
## 36 1 0.0000000 0.000000 0.0000000 1.000000
## 22 0 0.0000000 0.000000 0.0000000 0.000000
Remodel.
linear_model_stepback_1ofeach_log <- lm(formula = log_goals ~ goals_for + shots + percentage_dribblers_tackled + dispossessed + penalty_kicks_conceded,
data = train_1ofeach)
linear_model_stepback_1ofeach_log2 <- lm(formula = log2_goals ~ goals_for + shots + percentage_dribblers_tackled + dispossessed + penalty_kicks_conceded,
data = train_1ofeach)
linear_model_stepback_1ofeach_log10 <- lm(formula = log10_goals ~ goals_for + shots + percentage_dribblers_tackled + dispossessed + penalty_kicks_conceded,
data = train_1ofeach)
linear_model_stepback_1ofeach_sqrt <- lm(formula = sqrt_goals ~ goals_for + shots + percentage_dribblers_tackled + dispossessed + penalty_kicks_conceded,
data = train_1ofeach)Let’s reevaluate the homoscedasticity assumption.
bptest(linear_model_stepback_1ofeach)##
## studentized Breusch-Pagan test
##
## data: linear_model_stepback_1ofeach
## BP = 11.609, df = 5, p-value = 0.04056
bptest(linear_model_stepback_1ofeach_log)##
## studentized Breusch-Pagan test
##
## data: linear_model_stepback_1ofeach_log
## BP = 9.436, df = 5, p-value = 0.09289
bptest(linear_model_stepback_1ofeach_log2)##
## studentized Breusch-Pagan test
##
## data: linear_model_stepback_1ofeach_log2
## BP = 9.436, df = 5, p-value = 0.09289
bptest(linear_model_stepback_1ofeach_log10)##
## studentized Breusch-Pagan test
##
## data: linear_model_stepback_1ofeach_log10
## BP = 9.436, df = 5, p-value = 0.09289
bptest(linear_model_stepback_1ofeach_sqrt)##
## studentized Breusch-Pagan test
##
## data: linear_model_stepback_1ofeach_sqrt
## BP = 11.061, df = 5, p-value = 0.05017
The transformed models are now fulfilling the assumption of homoscedasticity.
comp <- compare_performance(linear_model_stepback_1ofeach,
linear_model_stepback_1ofeach_log,
linear_model_stepback_1ofeach_log2,
linear_model_stepback_1ofeach_log10,
linear_model_stepback_1ofeach_sqrt, rank = T)
comp## # Comparison of Model Performance Indices
##
## Name | Model | R2 | R2 (adj.) | RMSE | Sigma | AIC weights | BIC weights | Performance-Score
## -------------------------------------------------------------------------------------------------------------------------------
## linear_model_stepback_1ofeach_log10 | lm | 0.491 | 0.418 | 0.108 | 0.117 | 1.000 | 1.000 | 66.67%
## linear_model_stepback_1ofeach_sqrt | lm | 0.646 | 0.595 | 0.395 | 0.427 | < 0.001 | < 0.001 | 46.04%
## linear_model_stepback_1ofeach | lm | 0.632 | 0.579 | 0.572 | 0.619 | < 0.001 | < 0.001 | 30.24%
## linear_model_stepback_1ofeach_log | lm | 0.491 | 0.418 | 0.248 | 0.269 | < 0.001 | < 0.001 | 23.24%
## linear_model_stepback_1ofeach_log2 | lm | 0.491 | 0.418 | 0.358 | 0.388 | < 0.001 | < 0.001 | 15.35%
plot(comp)It seems there’s a trade-off between Adjusted R-squared and RMSE. This time we pick RMSE, since it’s also has the better performance score as well. So we pick linear_model_stepback_1ofeach_log10.
Before predicting the goals, we have to prepare the predictors. I need to extract the features as shown in our model from Manchester City’s 19/20, 20/21, and 21/22 data.
goals_for <- (as.numeric(mc_1920_scr[-c(1,40),7]) +
as.numeric(mc_2021_scr[-c(1,40),7]) +
as.numeric(mc_2122_scr[-c(1,40),7])
)/3
shots <- (as.numeric(mc_1920_scr[-c(1,40),"Standard.1"]) +
as.numeric(mc_2021_scr[-c(1,40),"Standard.1"]) +
as.numeric(mc_2122_scr[-c(1,40),"Standard.1"])
)/3
percentage_dribblers_tackled <- (as.numeric(mc_1920_dfs[-c(1,40),"Vs.Dribbles.6"]) +
as.numeric(mc_2021_dfs[-c(1,40),"Vs.Dribbles.6"]) +
as.numeric(mc_2122_dfs[-c(1,40),"Vs.Dribbles.6"])
)/3
dispossessed <- (as.numeric(mc_1920_pos[-c(1,40),"Carries.7"]) +
as.numeric(mc_2021_pos[-c(1,40),"Carries.7"]) +
as.numeric(mc_2122_pos[-c(1,40),"Carries.7"])
)/3
penalty_kicks_conceded <- (as.numeric(mc_1920_misc[-c(1,40),"Performance.23"]) +
as.numeric(mc_2021_misc[-c(1,40),"Performance.23"]) +
as.numeric(mc_2122_misc[-c(1,40),"Performance.23"])
)/3Then create a dataframe for them.
mc_predictors <- data.frame(
goals_for,
shots,
percentage_dribblers_tackled,
dispossessed,
penalty_kicks_conceded
)Let’s predict!
mc_predictors <- mc_predictors %>%
mutate(pred_log10 = predict(linear_model_stepback_1ofeach_log10,
mc_predictors,
interval = "prediction",
level=0.90)) %>%
mutate(pred = floor(10^pred_log10))
mc_predictors$pred## fit lwr upr
## 1 1 0 2
## 2 1 0 2
## 3 1 0 1
## 4 1 0 1
## 5 0 0 1
## 6 1 0 2
## 7 1 0 1
## 8 1 0 1
## 9 1 0 2
## 10 1 0 1
## 11 1 0 1
## 12 1 0 1
## 13 1 0 1
## 14 1 0 2
## 15 1 0 1
## 16 0 0 1
## 17 1 0 2
## 18 1 0 2
## 19 2 1 3
## 20 1 0 1
## 21 1 0 1
## 22 1 1 2
## 23 1 0 2
## 24 1 0 2
## 25 1 0 1
## 26 1 0 1
## 27 1 0 1
## 28 0 0 1
## 29 1 0 2
## 30 1 1 2
## 31 1 0 1
## 32 1 0 2
## 33 1 0 1
## 34 1 0 2
## 35 1 0 2
## 36 1 0 2
## 37 1 0 2
## 38 1 0 2
With the prediction interval involved, total_gol_lower implies a very bad condition, that might regard him as a flop; he would still score 5 though. The total_goal_upper implies a perfect condition where he is at his full capacity at all times capable of scoring an impossible 58 tally.
mc_predictors %>% summarise(total_goal = sum(pred[,"fit"]),
total_goal_lower = sum(pred[,"lwr"]),
total_goal_upper = sum(pred[,"upr"])) ## total_goal total_goal_lower total_goal_upper
## 1 36 3 58
IF:
THEN:
In the last 2 seasons, Erling Haaland, played 76% of all the Dortmund matches. Let’s predict how many goals he will score if he play 76% matches in Manchester City.
RNGkind(sample.kind = "Rounding")
set.seed(123456)
index_76 <- sample(x = nrow(mc_predictors),
size = nrow(mc_predictors)*0.76)
mc_predictors_76 <- mc_predictors[index_76,] %>%
mutate(pred_log10 = predict(linear_model_stepback_1ofeach_log10,
mc_predictors[index_76,],
interval = "prediction",
level=0.90)) %>%
mutate(pred = floor(10^pred_log10))
mc_predictors_76 %>% summarise(total_goal = sum(pred[,"fit"]),
total_goal_lower = sum(pred[,"lwr"]),
total_goal_upper = sum(pred[,"upr"]))## total_goal total_goal_lower total_goal_upper
## 1 25 2 41
IF:
THEN: