library(engsoccerdata)
library(rvest)
library(tidyverse)
library(data.table)
library(magrittr)
library(zoo)
#download and munge data as per https://github.com/hkhare42/recipe_testing/blob/master/adjusted_table.ipynb
data <- read.csv('http://www.football-data.co.uk/mmz4281/1819/E0.csv') %>%
  #take only columns we care about
  select(date = Date, home = HomeTeam, away = AwayTeam, home_goals = FTHG, away_goals = FTAG) %>%
  #convert dates and filter to before last weekend
  mutate(date = as.Date(date, "%d/%m/%y")) %>%
  filter(date < as.Date("2018-11-24")) %>%
  #melt data by team and bind back opponents
  setDT() %>%
  melt.data.table(measure.vars = c("home", "away"), variable.name = "venue", value.name = "team") %>%
  .[, opponent := team[order(-as.numeric(venue), date)]] %>%
  #calculate points per match
  .[(venue == "home" & home_goals > away_goals) | (venue == "away" & home_goals < away_goals), points := 3] %>%
  .[(venue == "home" & home_goals < away_goals) | (venue == "away" & home_goals > away_goals), points := 0] %>%
  .[home_goals == away_goals, points := 1] %>%
  #reselect columns
  .[, c("team", "venue", "opponent", "points")]

#get the teams
teams <- data$team %>% unique() %>% sort()

#create a matrix of all team combinations
team_matrix <- matrix(data = 0,
                      nrow = length(teams)/2 * (length(teams)-1), 
                      ncol = length(teams),
                      #name columbs as single teams and rows as combinations
                      dimnames = list(
                        c(lapply(seq(length(teams)/2 * (length(teams)-1)), function(n) {
                          paste0(combn(teams, 2)[,n], collapse = "-")
                          })),
                        c(teams)
                        ))

#add in 1/ -1 for each team combinations
for(row in seq(nrow(team_matrix))) {
  sub_teams <- str_split(rownames(team_matrix)[row], "-")[[1]]
  team_matrix[row,unlist(lapply(sub_teams, grep, colnames(team_matrix)))] <- c(1, -1)
}
#get the points per game difference for any team pair combination
get_ppg_diff <- function(row) {
  #the pair of teams
  sub_teams <- str_split(rownames(team_matrix)[row], "-")[[1]]
  
  #the matches they have in common
  common_matches <- data[team %in% sub_teams] %>%
    .[, opponent := replace(opponent, opponent %in% sub_teams, "vs.")] %>%
    #shouldn't include single head to head matches but will do for consistency with published work
    .[duplicated(paste(venue, opponent))|duplicated(paste(venue, opponent), fromLast = TRUE)|opponent == "vs."]
    #.[duplicated(paste(venue, opponent))|duplicated(paste(venue, opponent), fromLast = TRUE)]

  #calculate the points per game
  if(nrow(common_matches) > 0) {
    points <- common_matches %>% group_by(team) %>% summarise(sum(points)) 
    ppg_diff <- -diff(points$`sum(points)`) / (nrow(common_matches)/2)
  } else {
    ppg_diff <- 0
  }
  
  return(ppg_diff)
}

#run the function
ppg_differences <- lapply(seq(nrow(team_matrix)), get_ppg_diff)
#least squares fit using the 1/-1 matrix and the points per game differences
lst_sq_fit <- lsfit(team_matrix, unlist(ppg_differences))
## Warning in lsfit(team_matrix, unlist(ppg_differences)): 'X' matrix was
## collinear
#take the coefficients (ignoring the intercept)
lst_sq_coefficients <- lst_sq_fit$coefficients %>%
  .[2:length(.)]

lst_sq_coefficients
##        Arsenal    Bournemouth       Brighton        Burnley        Cardiff 
##     0.77539683     0.05681704    -0.23688179    -0.89296157    -1.00046992 
##        Chelsea Crystal Palace        Everton         Fulham   Huddersfield 
##     0.74511696    -0.68477235     0.16010025    -1.29455096    -0.87860693 
##      Leicester      Liverpool       Man City     Man United      Newcastle 
##    -0.03932957     1.10078112     1.32493943     0.38302632    -0.88852966 
##    Southampton      Tottenham        Watford       West Ham         Wolves 
##    -1.00163325     0.97050125     0.04180242    -0.34511069     0.00000000
#take the coefficients * the mean point per game for the last team (coefficient == 0) and multiply by matches played
points <- (lst_sq_coefficients + mean(data[team == last(names(lst_sq_coefficients))]$points)) * table(data$team)
#sort the new table
sort(points, decreasing = TRUE)
## 
##       Man City      Liverpool      Tottenham        Arsenal        Chelsea 
##     31.8992732     29.2093734     27.6460150     25.3047619     24.9414035 
##     Man United        Everton    Bournemouth        Watford         Wolves 
##     20.5963158     17.9212030     16.6818045     16.5016291     16.0000000 
##      Leicester       Brighton       West Ham Crystal Palace   Huddersfield 
##     15.5280451     13.1574185     11.8586717      7.7827318      5.4567168 
##      Newcastle        Burnley        Cardiff    Southampton         Fulham 
##      5.3376441      5.2844612      3.9943609      3.9804010      0.4653885