library(tidyverse)
## -- Attaching packages --------------------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts ------------------------------------------------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(readr)

joined <- read_csv("joined.csv")
## Parsed with column specification:
## cols(
##   last_season_position = col_double(),
##   last_season_won = col_double(),
##   last_season_drew = col_double(),
##   last_season_lost = col_double(),
##   last_season_goals_scored = col_double(),
##   last_season_goals_conceeded = col_double(),
##   last_season_points = col_double(),
##   team = col_character(),
##   squad_size = col_double(),
##   average_age = col_double(),
##   foreigners = col_double(),
##   market_value = col_double(),
##   average_market_value = col_double()
## )
results_18_19 <- read_csv("results_18_19.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Date = col_date(format = ""),
##   HomeTeam = col_character(),
##   AwayTeam = col_character(),
##   FTR = col_character(),
##   HTR = col_character(),
##   Referee = col_character()
## )
## See spec(...) for full column specifications.
Arsenal <- read_csv("Arsenal.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Date = col_date(format = ""),
##   team = col_character(),
##   opposition_team = col_character(),
##   FTR = col_character(),
##   HTR = col_character(),
##   Referee = col_character()
## )
## See spec(...) for full column specifications.
Bournemouth <- read_csv("Bournemouth.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Date = col_date(format = ""),
##   team = col_character(),
##   opposition_team = col_character(),
##   FTR = col_character(),
##   HTR = col_character(),
##   Referee = col_character()
## )
## See spec(...) for full column specifications.
Brighton <- read_csv("Brighton.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Date = col_date(format = ""),
##   team = col_character(),
##   opposition_team = col_character(),
##   FTR = col_character(),
##   HTR = col_character(),
##   Referee = col_character()
## )
## See spec(...) for full column specifications.
Burnley <- read_csv("Burnley.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Date = col_date(format = ""),
##   team = col_character(),
##   opposition_team = col_character(),
##   FTR = col_character(),
##   HTR = col_character(),
##   Referee = col_character()
## )
## See spec(...) for full column specifications.
Cardiff <- read_csv("Cardiff.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Date = col_date(format = ""),
##   team = col_character(),
##   opposition_team = col_character(),
##   FTR = col_character(),
##   HTR = col_character(),
##   Referee = col_character()
## )
## See spec(...) for full column specifications.
Chelsea <- read_csv("Chelsea.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Date = col_date(format = ""),
##   team = col_character(),
##   opposition_team = col_character(),
##   FTR = col_character(),
##   HTR = col_character(),
##   Referee = col_character()
## )
## See spec(...) for full column specifications.
Crystal_Palace <- read_csv("Crystal_Palace.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Date = col_date(format = ""),
##   team = col_character(),
##   opposition_team = col_character(),
##   FTR = col_character(),
##   HTR = col_character(),
##   Referee = col_character()
## )
## See spec(...) for full column specifications.
Everton <- read_csv("Everton.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Date = col_date(format = ""),
##   team = col_character(),
##   opposition_team = col_character(),
##   FTR = col_character(),
##   HTR = col_character(),
##   Referee = col_character()
## )
## See spec(...) for full column specifications.
Fulham <- read_csv("Fulham.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Date = col_date(format = ""),
##   team = col_character(),
##   opposition_team = col_character(),
##   FTR = col_character(),
##   HTR = col_character(),
##   Referee = col_character()
## )
## See spec(...) for full column specifications.
Huddersfield <- read_csv("Huddersfield.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Date = col_date(format = ""),
##   team = col_character(),
##   opposition_team = col_character(),
##   FTR = col_character(),
##   HTR = col_character(),
##   Referee = col_character()
## )
## See spec(...) for full column specifications.
Leicester <- read_csv("Leicester.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Date = col_date(format = ""),
##   team = col_character(),
##   opposition_team = col_character(),
##   FTR = col_character(),
##   HTR = col_character(),
##   Referee = col_character()
## )
## See spec(...) for full column specifications.
Liverpool <- read_csv("Liverpool.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Date = col_date(format = ""),
##   team = col_character(),
##   opposition_team = col_character(),
##   FTR = col_character(),
##   HTR = col_character(),
##   Referee = col_character()
## )
## See spec(...) for full column specifications.
Man_City <- read_csv("Man_City.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Date = col_date(format = ""),
##   team = col_character(),
##   opposition_team = col_character(),
##   FTR = col_character(),
##   HTR = col_character(),
##   Referee = col_character()
## )
## See spec(...) for full column specifications.
Man_United <- read_csv("Man_United.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Date = col_date(format = ""),
##   team = col_character(),
##   opposition_team = col_character(),
##   FTR = col_character(),
##   HTR = col_character(),
##   Referee = col_character()
## )
## See spec(...) for full column specifications.
Newcastle <- read_csv("Newcastle.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Date = col_date(format = ""),
##   team = col_character(),
##   opposition_team = col_character(),
##   FTR = col_character(),
##   HTR = col_character(),
##   Referee = col_character()
## )
## See spec(...) for full column specifications.
Southampton <- read_csv("Southampton.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Date = col_date(format = ""),
##   team = col_character(),
##   opposition_team = col_character(),
##   FTR = col_character(),
##   HTR = col_character(),
##   Referee = col_character()
## )
## See spec(...) for full column specifications.
Tottenham <- read_csv("Tottenham.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Date = col_date(format = ""),
##   team = col_character(),
##   opposition_team = col_character(),
##   FTR = col_character(),
##   HTR = col_character(),
##   Referee = col_character()
## )
## See spec(...) for full column specifications.
Watford <- read_csv("Watford.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Date = col_date(format = ""),
##   team = col_character(),
##   opposition_team = col_character(),
##   FTR = col_character(),
##   HTR = col_character(),
##   Referee = col_character()
## )
## See spec(...) for full column specifications.
West_Ham <- read_csv("West_Ham.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Date = col_date(format = ""),
##   team = col_character(),
##   opposition_team = col_character(),
##   FTR = col_character(),
##   HTR = col_character(),
##   Referee = col_character()
## )
## See spec(...) for full column specifications.
Wolves <- read_csv("Wolves.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Date = col_date(format = ""),
##   team = col_character(),
##   opposition_team = col_character(),
##   FTR = col_character(),
##   HTR = col_character(),
##   Referee = col_character()
## )
## See spec(...) for full column specifications.

Point in time merge

Align club names

joined$team[joined$team == "Manchester United"] <- "Man United"
joined$team[joined$team == "Manchester City"] <- "Man City"
joined$team[joined$team == "Tottenham Hotspur"] <- "Tottenham"
joined$team[joined$team == "Leicester City"] <- "Leicester"
joined$team[joined$team == "Newcastle United"] <- "Newcastle"
joined$team[joined$team == "West Ham United"] <- "West Ham"
joined$team[joined$team == "Brighton & Hove Albion"] <- "Brighton"
joined$team[joined$team == "Huddersfield Town"] <- "Huddersfield"
joined$team[joined$team == "Wolverhampton Wanderers"] <- "Wolves"
joined$team[joined$team == "Cardiff City"] <- "Cardiff"

Create home and away versions of this point in time data

joined_home <- joined %>% rename_all(paste0, "_home") %>% rename(HomeTeam = team_home)
joined_away <- joined %>% rename_all(paste0, "_away") %>% rename(AwayTeam = team_away)

Merge

results_18_19 <- results_18_19 %>% 
  left_join(joined_home, by = "HomeTeam") %>% 
  left_join(joined_away, by = "AwayTeam")

Lag results so form is applicable to prior results

Arsenal <- Arsenal %>% 
  select(1:3, contains("average"), contains("total")) %>% 
  mutate_at(-1:-3, ~lag(., 1))

Bournemouth <- Bournemouth %>% 
  select(1:3, contains("average"), contains("total")) %>% 
  mutate_at(-1:-3, ~lag(., 1))

Brighton <- Brighton %>% 
  select(1:3, contains("average"), contains("total")) %>% 
  mutate_at(-1:-3, ~lag(., 1))

Burnley <- Burnley %>% 
  select(1:3, contains("average"), contains("total")) %>% 
  mutate_at(-1:-3, ~lag(., 1))

Cardiff <- Cardiff %>% 
  select(1:3, contains("average"), contains("total")) %>% 
  mutate_at(-1:-3, ~lag(., 1))

Chelsea <- Chelsea %>% 
  select(1:3, contains("average"), contains("total")) %>% 
  mutate_at(-1:-3, ~lag(., 1))

Crystal_Palace <- Crystal_Palace %>% 
  select(1:3, contains("average"), contains("total")) %>% 
  mutate_at(-1:-3, ~lag(., 1))

Everton <- Everton %>% 
  select(1:3, contains("average"), contains("total")) %>% 
  mutate_at(-1:-3, ~lag(., 1))

Fulham <- Fulham %>% 
  select(1:3, contains("average"), contains("total")) %>% 
  mutate_at(-1:-3, ~lag(., 1))

Huddersfield <- Huddersfield %>% 
  select(1:3, contains("average"), contains("total")) %>% 
  mutate_at(-1:-3, ~lag(., 1))

Leicester <- Leicester %>% 
  select(1:3, contains("average"), contains("total")) %>% 
  mutate_at(-1:-3, ~lag(., 1))

Liverpool <- Liverpool %>% 
  select(1:3, contains("average"), contains("total")) %>% 
  mutate_at(-1:-3, ~lag(., 1))

Man_City <- Man_City %>% 
  select(1:3, contains("average"), contains("total")) %>% 
  mutate_at(-1:-3, ~lag(., 1))

Man_United <- Man_United %>% 
  select(1:3, contains("average"), contains("total")) %>% 
  mutate_at(-1:-3, ~lag(., 1))

Newcastle <- Newcastle %>% 
  select(1:3, contains("average"), contains("total")) %>% 
  mutate_at(-1:-3, ~lag(., 1))

Southampton <- Southampton %>% 
  select(1:3, contains("average"), contains("total")) %>% 
  mutate_at(-1:-3, ~lag(., 1))

Tottenham <- Tottenham %>% 
  select(1:3, contains("average"), contains("total")) %>% 
  mutate_at(-1:-3, ~lag(., 1))

Watford <- Watford %>% 
  select(1:3, contains("average"), contains("total")) %>% 
  mutate_at(-1:-3, ~lag(., 1))

West_Ham <- West_Ham %>% 
  select(1:3, contains("average"), contains("total")) %>% 
  mutate_at(-1:-3, ~lag(., 1))

Wolves <- Wolves %>% 
  select(1:3, contains("average"), contains("total")) %>% 
  mutate_at(-1:-3, ~lag(., 1))
all <- rbind(Arsenal, Bournemouth, Brighton, Burnley, Cardiff, Chelsea, Crystal_Palace, Everton, Fulham, Huddersfield, Leicester, Liverpool, Man_City, Man_United, Newcastle, Southampton, Tottenham, Watford, West_Ham, Wolves)

create 2 keys to merge

all_home <- all %>% 
mutate(key = paste0(Date,team)) %>% 
rename_all(paste0, "_home") 

all_away <- all %>% 
mutate(key = paste0(Date,team)) %>% 
  rename_all(paste0, "_away")
results_18_19 <- results_18_19 %>% 
  mutate(key_home = paste0(Date,HomeTeam), key_away = paste0(Date,AwayTeam)) 

Merge on these keys

combined <- results_18_19 %>% 
  left_join(all_home, by = "key_home") %>% 
  left_join(all_away, by = "key_away") 

Drop these columns

combined <- combined %>% 
  mutate(home_score_difference = FTHG - FTAG) %>% 
select(-1, -5:-26, -Date_home, -team_home, -Date_away, -team_away, -opposition_team_home, -opposition_team_away)

This is the data for ML

write_csv(combined, “combined.csv”)