Data

This document analyzes the performance of Manchester United in home games. The document visually represents the wins,draws,and losses using geocoding.

Load required packages

library(tidyr)
library(dplyr)
library("RSQLite")
library(ggplot2)
library(gganimate)
library(qdapTools)
library(lubridate)
library(ggmap)
library(rvest)
library(stringr)
library(fuzzyjoin)
library(ggrepel)

Import the dataset

The database is an SQLLite database. So Importing all the required tables

con = dbConnect(drv = RSQLite::SQLite(), dbname = "database.sqlite")
tables <- dbListTables(con)

tables <- tables[tables != "sqlite_sequence"]
DataFrames <- vector("list", length = length(tables))
Country <-
  dbGetQuery(conn = con,
             statement = ("SELECT * FROM 'Country'"))
League <-
  dbGetQuery(conn = con, statement = ("SELECT * FROM 'League'"))
Match <-
  dbGetQuery(conn = con, statement = ("SELECT * FROM 'Match'"))
Player <-
  dbGetQuery(conn = con, statement = ("SELECT * FROM 'Player'"))
Player_Stats <-
  dbGetQuery(conn = con,
             statement = ("SELECT * FROM 'Player_Stats'"))
Team <- dbGetQuery(conn = con, statement = ("SELECT * FROM 'Team'"))

Get cities of the team for geo location

The SQLLite database does not contain city information of the clubs. Getting the loacation information by webscraping and using Gmaps to get the geocodes of the clubs

team.cities <-
  "https://en.wikipedia.org/wiki/List_of_Premier_League_clubs"
team.cities.df <- team.cities %>%
  read_html() %>%
  html_nodes(xpath = '//*[@id="mw-content-text"]/table[1]') %>%
  html_table()
team.cities.df <- team.cities.df[[1]]

url <- "https://liaison.reuters.com/tools/sports-team-codes"
team.codes <-
  url %>% read_html() %>% html_nodes(xpath = '//*[@id="Category"]/tbody/tr[3]/td/table') %>%
  html_table()

team.codes <- team.codes[[2]]
names(team.codes)[1:2] <- c("club_code", "Club")

Data Exploration of home games

Explore the data and cleanse it in the required format for geocode analysis

manutd.hme <-
  Team %>% filter(team_long_name == "Manchester United") %>% inner_join(Match, by =
  c("team_api_id" = "home_team_api_id")) %>%
  select(
  one_of(
  "team_short_name",
  "season",
  "date",
  "away_team_api_id",
  "home_team_goal",
  "away_team_goal",
  "match_api_id"
  )
  ) %>% mutate(result = ifelse (
  home_team_goal > away_team_goal,
  "Win",
  ifelse (home_team_goal < away_team_goal, "Loss", "Draw")
  ))
  
  
  
  
  manutd.hme <-
  manutd.hme %>% inner_join(Team, by = c("away_team_api_id" = "team_api_id")) %>%
  select(-one_of("id")) %>% mutate(manager = ifelse(
  season %in% c(
  "2008/2009",
  "2009/2010",
  "2010/2011",
  "2011/2012",
  "2012/2013"
  ),
  "SAF",
  ifelse(season ==
  "2013/2014",
  "DM", "LVG")
  ),
  game = "home") %>% dplyr::rename(
  home_team = team_short_name.x,
  goal_scored = home_team_goal,
  goal_conceded = away_team_goal,
  away_team = team_short_name.y
  )

**Data cleansing of geocode data*

Cleansing the data obtained through webscraping before plotting them using Gmaps. The webscraped data contains missing information that needs to be corrected. Using Fuzzyjoin to join the datasets as the Club names are not having a common nomenclature.

team.coord <-
manutd.hme %>% distinct(away_team, team_long_name, away_team_api_id) %>% rename(Club =
team_long_name)

team.coord <-
team.coord %>% stringdist_left_join(team.cities.df, by = "Club", max_dist =
4) %>% rename(city =
`Town or City`) %>% select(one_of("away_team", "Club.x", "Club.y", "city"))

team.coord$city[team.coord$away_team == "WHU"] <-
"London Borough of Newham"
team.coord$city[team.coord$away_team == "WIG"] <- "Wigan"
team.coord$city[team.coord$away_team == "TOT"] <-
"London Borough of Haringey"
team.coord$city[team.coord$away_team == "BIR"] <- "Birmingham"
team.coord$city[team.coord$away_team == "NOR"] <- "Norwich"
team.coord$city[team.coord$away_team == "CAR"] <- "Cardiff"
team.coord$city[team.coord$away_team == "LEI"] <- "Leicester"
team.coord$city[team.coord$away_team == "STK"] <- "Stoke-on-Trent"

team.coord <-
team.coord %>% rename(club = Club.x)  %>% filter(!((away_team == "POR" &
city == "Boscombe, Bournemouth") | (away_team ==
"BOU" & city == "Portsmouth") | (away_team ==
"MCI" & city == "Leicester") | (away_team ==
"BUR" & city == "Barnsley")
)) %>% distinct(away_team, club, city)

team.coord$geocode <- paste0(team.coord$city, ",england")

team.coord.location <- geocode(team.coord$geocode)
team.coord <- cbind(team.coord, team.coord.location)
manutd.hme <- manutd.hme %>% left_join(team.coord, by = "away_team")

manutd.hme.saf <-
manutd.hme %>% filter(season %in% c("2011/2012", "2012/2013"))
manutd.hme.dm <- manutd.hme %>% filter(season == "2013/2014")
manutd.hme.lvg <-
manutd.hme %>% filter(season %in% c("2014/2015", "2015/2016"))
 gmap<-get_map("england",zoom=6) 

Comparing Saf last season with LVG and DM seasons

ggmap(gmap) + geom_point(aes(x = lon, y = lat, color = result), data = manutd.hme.saf) +
geom_label_repel(
aes(label = away_team, fill = result),
size = 2.5,
angle = 45,
data = manutd.hme.saf
) + theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
labs(title = "SAF last two season home games", x = "", y = "")+scale_fill_manual(values = c("deepskyblue4", "firebrick", "darkolivegreen4"))

David Moyes home games

ggmap(gmap) + geom_point(aes(x = lon, y = lat, color = result), data = manutd.hme.dm) +
  geom_label_repel(
  aes(label = away_team, fill = result),
  size = 2.5,
  angle = 45,
  data = manutd.hme.dm
  ) +theme(plot.title = element_text(hjust = 0.5, face = "bold")) + labs(title = "David Moyes home games", x = "", y = "")+scale_fill_manual(values = c("deepskyblue4", "firebrick", "darkolivegreen4"))

LVG home Games

ggmap(gmap) + geom_point(aes(x = lon, y = lat, color = result), data = manutd.hme.lvg) +
  geom_label_repel(
  aes(label = away_team, fill = result),
  size = 2.5,
  angle = 45,
  data = manutd.hme.lvg
  ) + theme(plot.title = element_text(hjust = 0.5, face = "bold")) + labs(title =
  "LVG home games", x = "", y = "")+scale_fill_manual(values = c("deepskyblue4", "firebrick", "darkolivegreen4"))

Observation

It is not a surprise that SAF is the most successful manager and it is largely due to his winning streak. He lost very little at home, making Old Trafford a fortress. On the other hand, David Moyes’s short reign was dotted with losses at home against key rivals.

Visualization showing analysis by month. Compares results by month, goals scored, and goals conceded by month

ggplot(manutd.hme) + aes(month(date, label = TRUE)) + geom_histogram(stat =
                                                                       'count', fill = "firebrick") + facet_wrap(~ result) +
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  labs(title = "United results by Month", x = "", y = "")

ggplot(manutd.hme) + aes(month(date, label = TRUE),goal_scored)+geom_boxplot(fill="firebrick")+
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  labs(title = "Goals scored by month", x = "", y = "")

ggplot(manutd.hme) + aes(month(date, label = TRUE),goal_conceded)+geom_boxplot(fill="firebrick")+
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  labs(title = "Goals conceded by month", x = "", y = "")

Observations

United’s success is largely due to the fact that we always outscore our opponents. It is pretty evident from the boxplots that we score more goals than we concede