This document analyzes the performance of Manchester United in home games. The document visually represents the wins,draws,and losses using geocoding.
Load required packages
library(tidyr)
library(dplyr)
library("RSQLite")
library(ggplot2)
library(gganimate)
library(qdapTools)
library(lubridate)
library(ggmap)
library(rvest)
library(stringr)
library(fuzzyjoin)
library(ggrepel)Import the dataset
The database is an SQLLite database. So Importing all the required tables
con = dbConnect(drv = RSQLite::SQLite(), dbname = "database.sqlite")
tables <- dbListTables(con)
tables <- tables[tables != "sqlite_sequence"]
DataFrames <- vector("list", length = length(tables))
Country <-
dbGetQuery(conn = con,
statement = ("SELECT * FROM 'Country'"))
League <-
dbGetQuery(conn = con, statement = ("SELECT * FROM 'League'"))
Match <-
dbGetQuery(conn = con, statement = ("SELECT * FROM 'Match'"))
Player <-
dbGetQuery(conn = con, statement = ("SELECT * FROM 'Player'"))
Player_Stats <-
dbGetQuery(conn = con,
statement = ("SELECT * FROM 'Player_Stats'"))
Team <- dbGetQuery(conn = con, statement = ("SELECT * FROM 'Team'"))Get cities of the team for geo location
The SQLLite database does not contain city information of the clubs. Getting the loacation information by webscraping and using Gmaps to get the geocodes of the clubs
team.cities <-
"https://en.wikipedia.org/wiki/List_of_Premier_League_clubs"
team.cities.df <- team.cities %>%
read_html() %>%
html_nodes(xpath = '//*[@id="mw-content-text"]/table[1]') %>%
html_table()
team.cities.df <- team.cities.df[[1]]
url <- "https://liaison.reuters.com/tools/sports-team-codes"
team.codes <-
url %>% read_html() %>% html_nodes(xpath = '//*[@id="Category"]/tbody/tr[3]/td/table') %>%
html_table()
team.codes <- team.codes[[2]]
names(team.codes)[1:2] <- c("club_code", "Club")Data Exploration of home games
Explore the data and cleanse it in the required format for geocode analysis
manutd.hme <-
Team %>% filter(team_long_name == "Manchester United") %>% inner_join(Match, by =
c("team_api_id" = "home_team_api_id")) %>%
select(
one_of(
"team_short_name",
"season",
"date",
"away_team_api_id",
"home_team_goal",
"away_team_goal",
"match_api_id"
)
) %>% mutate(result = ifelse (
home_team_goal > away_team_goal,
"Win",
ifelse (home_team_goal < away_team_goal, "Loss", "Draw")
))
manutd.hme <-
manutd.hme %>% inner_join(Team, by = c("away_team_api_id" = "team_api_id")) %>%
select(-one_of("id")) %>% mutate(manager = ifelse(
season %in% c(
"2008/2009",
"2009/2010",
"2010/2011",
"2011/2012",
"2012/2013"
),
"SAF",
ifelse(season ==
"2013/2014",
"DM", "LVG")
),
game = "home") %>% dplyr::rename(
home_team = team_short_name.x,
goal_scored = home_team_goal,
goal_conceded = away_team_goal,
away_team = team_short_name.y
)**Data cleansing of geocode data*
Cleansing the data obtained through webscraping before plotting them using Gmaps. The webscraped data contains missing information that needs to be corrected. Using Fuzzyjoin to join the datasets as the Club names are not having a common nomenclature.
team.coord <-
manutd.hme %>% distinct(away_team, team_long_name, away_team_api_id) %>% rename(Club =
team_long_name)
team.coord <-
team.coord %>% stringdist_left_join(team.cities.df, by = "Club", max_dist =
4) %>% rename(city =
`Town or City`) %>% select(one_of("away_team", "Club.x", "Club.y", "city"))
team.coord$city[team.coord$away_team == "WHU"] <-
"London Borough of Newham"
team.coord$city[team.coord$away_team == "WIG"] <- "Wigan"
team.coord$city[team.coord$away_team == "TOT"] <-
"London Borough of Haringey"
team.coord$city[team.coord$away_team == "BIR"] <- "Birmingham"
team.coord$city[team.coord$away_team == "NOR"] <- "Norwich"
team.coord$city[team.coord$away_team == "CAR"] <- "Cardiff"
team.coord$city[team.coord$away_team == "LEI"] <- "Leicester"
team.coord$city[team.coord$away_team == "STK"] <- "Stoke-on-Trent"
team.coord <-
team.coord %>% rename(club = Club.x) %>% filter(!((away_team == "POR" &
city == "Boscombe, Bournemouth") | (away_team ==
"BOU" & city == "Portsmouth") | (away_team ==
"MCI" & city == "Leicester") | (away_team ==
"BUR" & city == "Barnsley")
)) %>% distinct(away_team, club, city)
team.coord$geocode <- paste0(team.coord$city, ",england")
team.coord.location <- geocode(team.coord$geocode)
team.coord <- cbind(team.coord, team.coord.location)
manutd.hme <- manutd.hme %>% left_join(team.coord, by = "away_team")
manutd.hme.saf <-
manutd.hme %>% filter(season %in% c("2011/2012", "2012/2013"))
manutd.hme.dm <- manutd.hme %>% filter(season == "2013/2014")
manutd.hme.lvg <-
manutd.hme %>% filter(season %in% c("2014/2015", "2015/2016"))
gmap<-get_map("england",zoom=6) Comparing Saf last season with LVG and DM seasons
ggmap(gmap) + geom_point(aes(x = lon, y = lat, color = result), data = manutd.hme.saf) +
geom_label_repel(
aes(label = away_team, fill = result),
size = 2.5,
angle = 45,
data = manutd.hme.saf
) + theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
labs(title = "SAF last two season home games", x = "", y = "")+scale_fill_manual(values = c("deepskyblue4", "firebrick", "darkolivegreen4"))David Moyes home games
ggmap(gmap) + geom_point(aes(x = lon, y = lat, color = result), data = manutd.hme.dm) +
geom_label_repel(
aes(label = away_team, fill = result),
size = 2.5,
angle = 45,
data = manutd.hme.dm
) +theme(plot.title = element_text(hjust = 0.5, face = "bold")) + labs(title = "David Moyes home games", x = "", y = "")+scale_fill_manual(values = c("deepskyblue4", "firebrick", "darkolivegreen4"))LVG home Games
ggmap(gmap) + geom_point(aes(x = lon, y = lat, color = result), data = manutd.hme.lvg) +
geom_label_repel(
aes(label = away_team, fill = result),
size = 2.5,
angle = 45,
data = manutd.hme.lvg
) + theme(plot.title = element_text(hjust = 0.5, face = "bold")) + labs(title =
"LVG home games", x = "", y = "")+scale_fill_manual(values = c("deepskyblue4", "firebrick", "darkolivegreen4"))Observation
It is not a surprise that SAF is the most successful manager and it is largely due to his winning streak. He lost very little at home, making Old Trafford a fortress. On the other hand, David Moyes’s short reign was dotted with losses at home against key rivals.
Visualization showing analysis by month. Compares results by month, goals scored, and goals conceded by month
ggplot(manutd.hme) + aes(month(date, label = TRUE)) + geom_histogram(stat =
'count', fill = "firebrick") + facet_wrap(~ result) +
theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
labs(title = "United results by Month", x = "", y = "")ggplot(manutd.hme) + aes(month(date, label = TRUE),goal_scored)+geom_boxplot(fill="firebrick")+
theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
labs(title = "Goals scored by month", x = "", y = "")ggplot(manutd.hme) + aes(month(date, label = TRUE),goal_conceded)+geom_boxplot(fill="firebrick")+
theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
labs(title = "Goals conceded by month", x = "", y = "")Observations
United’s success is largely due to the fact that we always outscore our opponents. It is pretty evident from the boxplots that we score more goals than we concede