We read the data directly from the ESPN website. The steps below require a web-scraping package to extract the HTML tables. We collect Premier League data for the five seasons from 2018-2019 to 2022-2023. Every team in the league played 38 matches in each season.
# Load data from the web using a web-scraping package
library(rvest)
library(tidyverse)
library(vtable)
# Libraries
library(tidyverse)
library(hrbrthemes)
library(viridis)
# Read the data
# Season 2022-2023 standings page (the URL identifies a season by the year it
# starts in).
url <- "https://www.espn.com/soccer/standings/_/league/ENG.1/season/2022"
webpage <- read_html(url)
# establish nodes to extract the data from html tables
sb_table <- html_nodes(webpage, "table")
# Parse the tables once and reuse the result: table 2 supplies the statistics,
# table 1 the team cells.
tables22 <- html_table(sb_table, fill = TRUE)
pmr <- tables22[[2]]
# Store the raw team cell as a plain column rather than nesting a whole tibble
# inside `Team`. The column is only carried along; it is dropped further down.
pmr$Team <- tables22[[1]][[1]]
# Give the statistics readable names. `GP` deliberately keeps its name: the
# previous code mapped both `Games_Played` and `Goal_Difference` to `GD`, so
# `Games_Played` never took effect, and every season table must keep the same
# column names to be stacked later.
pmr_league22 <- pmr %>%
  select(Team, GP, W, D, L, "F", A, GD, P) %>%
  rename(
    Wins = W,
    Draw = D,
    Loss = L,
    Goal_scored = "F",
    Goals_Conceided = A,
    Goal_Difference = GD,
    Points = P
  )
# Club names in final 2022-2023 table order. They are attached to the rows by
# position, so the order must follow the standings of this particular season.
# NOTE(review): the names are still hard-coded; scraping them from the page
# would remove the dependency on row order.
teams22 <- c(
  "Manchester City", "Arsenal", "Manchester United", "Newcastle United",
  "Liverpool", "Brighton & Hove Albion", "Aston Villa", "Tottenham Hotspur",
  "Brentford", "Fulham", "Crystal Palace", "Chelsea",
  "Wolverhampton Wanderers", "West Ham United", "AFC Bournemouth",
  "Nottingham Forest", "Everton", "Leicester City", "Leeds United",
  "Southampton"
)
# Fail loudly if the scraped table does not have one row per listed club.
stopifnot(length(teams22) == nrow(pmr_league22))
prm_league22 <- pmr_league22 %>%
  mutate(English_team = teams22, season = "2022-2023") %>%
  select(English_team, GP:Points, season)
# Adding data for 5 seasons
# Season 2021-2022 standings page (the URL identifies a season by the year it
# starts in).
url21 <- "https://www.espn.com/soccer/standings/_/league/ENG.1/season/2021"
webpage21 <- read_html(url21)
sb_table21 <- html_nodes(webpage21, "table")
# Parse the tables once: table 2 supplies the statistics, table 1 the team
# cells.
tables21 <- html_table(sb_table21, fill = TRUE)
pmr21 <- tables21[[2]]
# Raw team cell kept as a plain column; it is dropped further down.
pmr21$Team <- tables21[[1]][[1]]
# `GP` deliberately keeps its name: the previous code mapped both
# `Games_Played` and `Goal_Difference` to `GD`, so `Games_Played` never took
# effect, and every season table must keep the same column names to be stacked.
pmr_league21 <- pmr21 %>%
  select(Team, GP, W, D, L, "F", A, GD, P) %>%
  rename(
    Wins = W,
    Draw = D,
    Loss = L,
    Goal_scored = "F",
    Goals_Conceided = A,
    Goal_Difference = GD,
    Points = P
  )
# Club names in final 2021-2022 table order. The names are attached to the rows
# by position, so each season needs its own list: reusing the 2022-2023 list
# labelled, for example, the second row (28 wins, 8 draws, 2 losses) as Arsenal
# instead of Liverpool and dropped the clubs relegated that season.
# NOTE(review): the names are still hard-coded; scraping them from the page
# would remove the dependency on row order.
teams21 <- c(
  "Manchester City", "Liverpool", "Chelsea", "Tottenham Hotspur",
  "Arsenal", "Manchester United", "West Ham United", "Leicester City",
  "Brighton & Hove Albion", "Wolverhampton Wanderers", "Newcastle United",
  "Crystal Palace", "Brentford", "Aston Villa", "Southampton", "Everton",
  "Leeds United", "Burnley", "Watford", "Norwich City"
)
# Fail loudly if the scraped table does not have one row per listed club.
stopifnot(length(teams21) == nrow(pmr_league21))
prm_league21 <- pmr_league21 %>%
  mutate(English_team = teams21, season = "2021-2022") %>%
  select(English_team, GP:Points, season)
prm_league21
## # A tibble: 20 × 10
## English_team GP Wins Draw Loss Goal_scored Goals_Conceided
## <chr> <int> <int> <int> <int> <int> <int>
## 1 Manchester City 38 29 6 3 99 26
## 2 Liverpool 38 28 8 2 94 26
## 3 Chelsea 38 21 11 6 76 33
## 4 Tottenham Hotspur 38 22 5 11 69 40
## 5 Arsenal 38 22 3 13 61 48
## 6 Manchester United 38 16 10 12 57 57
## 7 West Ham United 38 16 8 14 60 51
## 8 Leicester City 38 14 10 14 62 59
## 9 Brighton & Hove Albion 38 12 15 11 42 44
## 10 Wolverhampton Wanderers 38 15 6 17 38 43
## 11 Newcastle United 38 13 10 15 44 62
## 12 Crystal Palace 38 11 15 12 50 46
## 13 Brentford 38 13 7 18 48 56
## 14 Aston Villa 38 13 6 19 52 54
## 15 Southampton 38 9 13 16 43 67
## 16 Everton 38 11 6 21 43 66
## 17 Leeds United 38 9 11 18 42 79
## 18 Burnley 38 7 14 17 34 53
## 19 Watford 38 6 5 27 34 77
## 20 Norwich City 38 5 7 26 23 84
## # ℹ 3 more variables: Goal_Difference <int>, Points <int>, season <chr>
#######################################################
# Season 2020-2021 standings page (the URL identifies a season by the year it
# starts in).
url20 <- "https://www.espn.com/soccer/standings/_/league/ENG.1/season/2020"
webpage20 <- read_html(url20)
sb_table20 <- html_nodes(webpage20, "table")
# Parse the tables once: table 2 supplies the statistics, table 1 the team
# cells.
tables20 <- html_table(sb_table20, fill = TRUE)
pmr20 <- tables20[[2]]
# Raw team cell kept as a plain column; it is dropped further down.
pmr20$Team <- tables20[[1]][[1]]
# `GP` deliberately keeps its name: the previous code mapped both
# `Games_Played` and `Goal_Difference` to `GD`, so `Games_Played` never took
# effect, and every season table must keep the same column names to be stacked.
pmr_league20 <- pmr20 %>%
  select(Team, GP, W, D, L, "F", A, GD, P) %>%
  rename(
    Wins = W,
    Draw = D,
    Loss = L,
    Goal_scored = "F",
    Goals_Conceided = A,
    Goal_Difference = GD,
    Points = P
  )
# Club names in final 2020-2021 table order. The names are attached to the rows
# by position, so each season needs its own list: reusing the 2022-2023 list
# labelled, for example, the last row (7 wins, 2 draws, 29 losses) as
# Southampton instead of Sheffield United.
# NOTE(review): the names are still hard-coded; scraping them from the page
# would remove the dependency on row order.
teams20 <- c(
  "Manchester City", "Manchester United", "Liverpool", "Chelsea",
  "Leicester City", "West Ham United", "Tottenham Hotspur", "Arsenal",
  "Leeds United", "Everton", "Aston Villa", "Newcastle United",
  "Wolverhampton Wanderers", "Crystal Palace", "Southampton",
  "Brighton & Hove Albion", "Burnley", "Fulham", "West Bromwich Albion",
  "Sheffield United"
)
# Fail loudly if the scraped table does not have one row per listed club.
stopifnot(length(teams20) == nrow(pmr_league20))
prm_league20 <- pmr_league20 %>%
  mutate(English_team = teams20, season = "2020-2021") %>%
  select(English_team, GP:Points, season)
prm_league20
## # A tibble: 20 × 10
## English_team GP Wins Draw Loss Goal_scored Goals_Conceided
## <chr> <int> <int> <int> <int> <int> <int>
## 1 Manchester City 38 27 5 6 83 32
## 2 Manchester United 38 21 11 6 73 44
## 3 Liverpool 38 20 9 9 68 42
## 4 Chelsea 38 19 10 9 58 36
## 5 Leicester City 38 20 6 12 68 50
## 6 West Ham United 38 19 8 11 62 47
## 7 Tottenham Hotspur 38 18 8 12 68 45
## 8 Arsenal 38 18 7 13 55 39
## 9 Leeds United 38 18 5 15 62 54
## 10 Everton 38 17 8 13 47 48
## 11 Aston Villa 38 16 7 15 55 46
## 12 Newcastle United 38 12 9 17 46 62
## 13 Wolverhampton Wanderers 38 12 9 17 36 52
## 14 Crystal Palace 38 12 8 18 41 66
## 15 Southampton 38 12 7 19 47 68
## 16 Brighton & Hove Albion 38 9 14 15 40 46
## 17 Burnley 38 10 9 19 33 55
## 18 Fulham 38 5 13 20 27 53
## 19 West Bromwich Albion 38 5 11 22 35 76
## 20 Sheffield United 38 7 2 29 20 63
## # ℹ 3 more variables: Goal_Difference <int>, Points <int>, season <chr>
##################################################################
# Season 2019-2020 standings page (the URL identifies a season by the year it
# starts in).
url19 <- "https://www.espn.com/soccer/standings/_/league/ENG.1/season/2019"
webpage19 <- read_html(url19)
sb_table19 <- html_nodes(webpage19, "table")
# Parse the tables once: table 2 supplies the statistics, table 1 the team
# cells.
tables19 <- html_table(sb_table19, fill = TRUE)
pmr19 <- tables19[[2]]
# Raw team cell kept as a plain column; it is dropped further down.
pmr19$Team <- tables19[[1]][[1]]
# `GP` deliberately keeps its name: the previous code mapped both
# `Games_Played` and `Goal_Difference` to `GD`, so `Games_Played` never took
# effect, and every season table must keep the same column names to be stacked.
pmr_league19 <- pmr19 %>%
  select(Team, GP, W, D, L, "F", A, GD, P) %>%
  rename(
    Wins = W,
    Draw = D,
    Loss = L,
    Goal_scored = "F",
    Goals_Conceided = A,
    Goal_Difference = GD,
    Points = P
  )
# Club names in final 2019-2020 table order. The names are attached to the rows
# by position, so each season needs its own list: reusing the 2022-2023 list
# labelled, for example, the first row (32 wins, 3 draws, 3 losses) as
# Manchester City instead of Liverpool.
# NOTE(review): the names are still hard-coded; scraping them from the page
# would remove the dependency on row order.
teams19 <- c(
  "Liverpool", "Manchester City", "Manchester United", "Chelsea",
  "Leicester City", "Tottenham Hotspur", "Wolverhampton Wanderers",
  "Arsenal", "Sheffield United", "Burnley", "Southampton", "Everton",
  "Newcastle United", "Crystal Palace", "Brighton & Hove Albion",
  "West Ham United", "Aston Villa", "AFC Bournemouth", "Watford",
  "Norwich City"
)
# Fail loudly if the scraped table does not have one row per listed club.
stopifnot(length(teams19) == nrow(pmr_league19))
prm_league19 <- pmr_league19 %>%
  mutate(English_team = teams19, season = "2019-2020") %>%
  select(English_team, GP:Points, season)
prm_league19
## # A tibble: 20 × 10
## English_team GP Wins Draw Loss Goal_scored Goals_Conceided
## <chr> <int> <int> <int> <int> <int> <int>
## 1 Liverpool 38 32 3 3 85 33
## 2 Manchester City 38 26 3 9 102 35
## 3 Manchester United 38 18 12 8 66 36
## 4 Chelsea 38 20 6 12 69 54
## 5 Leicester City 38 18 8 12 67 41
## 6 Tottenham Hotspur 38 16 11 11 61 47
## 7 Wolverhampton Wanderers 38 15 14 9 51 40
## 8 Arsenal 38 14 14 10 56 48
## 9 Sheffield United 38 14 12 12 39 39
## 10 Burnley 38 15 9 14 43 50
## 11 Southampton 38 15 7 16 51 60
## 12 Everton 38 13 10 15 44 56
## 13 Newcastle United 38 11 11 16 38 58
## 14 Crystal Palace 38 11 10 17 31 50
## 15 Brighton & Hove Albion 38 9 14 15 39 54
## 16 West Ham United 38 10 9 19 49 62
## 17 Aston Villa 38 9 8 21 41 67
## 18 AFC Bournemouth 38 9 7 22 40 65
## 19 Watford 38 8 10 20 36 64
## 20 Norwich City 38 5 6 27 26 75
## # ℹ 3 more variables: Goal_Difference <int>, Points <int>, season <chr>
###################################################################
# Season 2018-2019 standings page (the URL identifies a season by the year it
# starts in).
url18 <- "https://www.espn.com/soccer/standings/_/league/ENG.1/season/2018"
webpage18 <- read_html(url18)
sb_table18 <- html_nodes(webpage18, "table")
# Parse the tables once: table 2 supplies the statistics, table 1 the team
# cells.
tables18 <- html_table(sb_table18, fill = TRUE)
pmr18 <- tables18[[2]]
# Raw team cell kept as a plain column; it is dropped further down.
pmr18$Team <- tables18[[1]][[1]]
# `GP` deliberately keeps its name: the previous code mapped both
# `Games_Played` and `Goal_Difference` to `GD`, so `Games_Played` never took
# effect, and every season table must keep the same column names to be stacked.
pmr_league18 <- pmr18 %>%
  select(Team, GP, W, D, L, "F", A, GD, P) %>%
  rename(
    Wins = W,
    Draw = D,
    Loss = L,
    Goal_scored = "F",
    Goals_Conceided = A,
    Goal_Difference = GD,
    Points = P
  )
# Club names in final 2018-2019 table order. The names are attached to the rows
# by position, so each season needs its own list: reusing the 2022-2023 list
# labelled, for example, the second row (30 wins, 7 draws, 1 loss) as Arsenal
# instead of Liverpool and listed clubs that were not in the league that season.
# NOTE(review): the names are still hard-coded; scraping them from the page
# would remove the dependency on row order.
teams18 <- c(
  "Manchester City", "Liverpool", "Chelsea", "Tottenham Hotspur",
  "Arsenal", "Manchester United", "Wolverhampton Wanderers", "Everton",
  "Leicester City", "West Ham United", "Watford", "Crystal Palace",
  "Newcastle United", "AFC Bournemouth", "Burnley", "Southampton",
  "Brighton & Hove Albion", "Cardiff City", "Fulham", "Huddersfield Town"
)
# Fail loudly if the scraped table does not have one row per listed club.
stopifnot(length(teams18) == nrow(pmr_league18))
prm_league18 <- pmr_league18 %>%
  mutate(English_team = teams18, season = "2018-2019") %>%
  select(English_team, GP:Points, season)
prm_league18
## # A tibble: 20 × 10
## English_team GP Wins Draw Loss Goal_scored Goals_Conceided
## <chr> <int> <int> <int> <int> <int> <int>
## 1 Manchester City 38 32 2 4 95 23
## 2 Liverpool 38 30 7 1 89 22
## 3 Chelsea 38 21 9 8 63 39
## 4 Tottenham Hotspur 38 23 2 13 67 39
## 5 Arsenal 38 21 7 10 73 51
## 6 Manchester United 38 19 9 10 65 54
## 7 Wolverhampton Wanderers 38 16 9 13 47 46
## 8 Everton 38 15 9 14 54 46
## 9 Leicester City 38 15 7 16 51 48
## 10 West Ham United 38 15 7 16 52 55
## 11 Watford 38 14 8 16 52 59
## 12 Crystal Palace 38 14 7 17 51 53
## 13 Newcastle United 38 12 9 17 42 48
## 14 AFC Bournemouth 38 13 6 19 56 70
## 15 Burnley 38 11 7 20 45 68
## 16 Southampton 38 9 12 17 45 65
## 17 Brighton & Hove Albion 38 9 9 20 35 60
## 18 Cardiff City 38 10 4 24 34 69
## 19 Fulham 38 7 5 26 34 81
## 20 Huddersfield Town 38 3 7 28 22 76
## # ℹ 3 more variables: Goal_Difference <int>, Points <int>, season <chr>
# Stack the five season tables (newest first) into one data set with one row
# per team and season.
team_stat <- bind_rows(
  prm_league22,
  prm_league21,
  prm_league20,
  prm_league19,
  prm_league18
)
# Peek at the first rows of the combined table.
team_stat %>% head()
## # A tibble: 6 × 10
## English_team GP Wins Draw Loss Goal_scored Goals_Conceided
## <chr> <int> <int> <int> <int> <int> <int>
## 1 Manchester City 38 28 5 5 94 33
## 2 Arsenal 38 26 6 6 88 43
## 3 Manchester United 38 23 6 9 58 43
## 4 Newcastle United 38 19 14 5 68 33
## 5 Liverpool 38 19 10 9 75 47
## 6 Brighton & Hove Albion 38 18 8 12 72 53
## # ℹ 3 more variables: Goal_Difference <int>, Points <int>, season <chr>
# Summary-statistics table for every column (`st()` is vtable's short alias
# for `sumtable()`).
sumtable(team_stat, title = "Summary Statistics for 5 Seasons")
Variable | N | Mean | Std. Dev. | Min | Pctl. 25 | Pctl. 75 | Max |
---|---|---|---|---|---|---|---|
GP | 100 | 38 | 0 | 38 | 38 | 38 | 38 |
Wins | 100 | 15 | 6.4 | 3 | 10 | 18 | 32 |
Draw | 100 | 8.4 | 3 | 2 | 6.8 | 10 | 15 |
Loss | 100 | 15 | 6 | 1 | 11 | 19 | 29 |
Goal_scored | 100 | 53 | 18 | 20 | 40 | 64 | 102 |
Goals_Conceided | 100 | 53 | 13 | 22 | 45 | 62 | 84 |
Goal_Difference | 100 | 0 | 29 | -61 | -21 | 15 | 73 |
Points | 100 | 53 | 18 | 16 | 40 | 65 | 99 |
season | 100 | ||||||
… 2018-2019 | 20 | 20% | |||||
… 2019-2020 | 20 | 20% | |||||
… 2020-2021 | 20 | 20% | |||||
… 2021-2022 | 20 | 20% | |||||
… 2022-2023 | 20 | 20% |
Which team is going to win the Premier League this season? What is the most significant trait that helps a team win the league? Is it defense or attack?
# Number of cases (rows) in the combined data set: one row per team and season.
nrow(team_stat)
## [1] 100
We have 100 cases. Each case is one team's record for one season (20 teams in each of 5 seasons), so a team that stayed in the league appears up to 5 times.
We collected the data directly from the ESPN website using web-scraping techniques. We modified the data set to add the season and to rename the variables.
This is an observational study. The data are recorded during the games and summarized at the end of every season.
ESPN Internet Ventures. (n.d.). 2018-19 English premier league standings. ESPN. https://www.espn.com/soccer/standings/_/league/ENG.1/season/2018
ESPN Internet Ventures. (n.d.). 2019-20 English premier league standings. ESPN. https://www.espn.com/soccer/standings/_/league/ENG.1/season/2019
ESPN Internet Ventures. (n.d.). 2020-21 English premier league standings. ESPN. https://www.espn.com/soccer/standings/_/league/ENG.1/season/2020
ESPN Internet Ventures. (n.d.). 2021-22 English premier league standings. ESPN. https://www.espn.com/soccer/standings/_/league/ENG.1/season/2021
ESPN Internet Ventures. (n.d.). 2022-23 English premier league standings. ESPN. https://www.espn.com/soccer/standings/_/league/ENG.1/season/2022
The dependent variable, Points, is quantitative (numeric).
The independent variables are English_team, Goal_scored and Wins. English_team is qualitative; Wins and Goal_scored are both quantitative.
Provide summary statistics for each the variables. Also include appropriate visualizations related to your research question (e.g. scatter plot, boxplots, etc). This step requires the use of R, hence a code chunk is provided below. Insert more code chunks as needed.
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Pairwise scatter plots, densities and correlations for the three variables
# of interest.
team_stat %>%
  ggpairs(columns = c("Wins", "Goal_scored", "Points"))
# Barplot
# Keep the 30 team-season rows with the most goals scored, as a plain data
# frame; `team` is reused by the boxplot below.
team <- as.data.frame(head(arrange(team_stat, desc(Goal_scored)), 30))
# Horizontal bars of goals scored per club. A club that appears in several of
# the 30 rows gets its rows stacked into one bar (ggplot2's default position).
ggplot(team, aes(x = English_team, y = Goal_scored)) +
  geom_col() +
  coord_flip()
# BoxPlot
# Points of the 10 best team-season records. The rows are ranked by Points
# here, matching the title and axis label; the previous version took the first
# 10 rows of `team`, which is ranked by goals scored.
team_stat %>%
  arrange(desc(Points)) %>%
  head(10) %>%
  ggplot(aes(x = English_team, y = Points, fill = English_team)) +
  geom_boxplot() +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6) +
  # Overlay the individual records, since each club contributes few rows.
  geom_jitter(color = "black", size = 0.4, alpha = 0.9) +
  theme_ipsum() +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 11)
  ) +
  ggtitle("Points distributions for 5 seasons top rated-Teams") +
  xlab("Teams with Maximum Points collected")
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## not found in Windows font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## not found in Windows font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
# Summary of the qualitative variable. For a character column summary() only
# reports length, class and mode (see the output below), not per-club counts.
summary(team_stat$English_team)
## Length Class Mode
## 100 character character
# Five-number summary plus mean of wins per team and season.
summary(team_stat$Wins)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.00 10.00 14.00 14.79 18.25 32.00
# Five-number summary plus mean of goals scored per team and season.
summary(team_stat$Goal_scored)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20.00 39.75 51.00 52.85 63.50 102.00
# Five-number summary plus mean of the dependent variable, Points.
summary(team_stat$Points)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 16.00 39.75 51.50 52.79 65.25 99.00
The summary statistics give us a clear picture of the kind of data we are working with. Across the 100 team-season records, a team collects about 52.8 points on average (median 51.5), which goes with an average of about 15 wins per season. Our GGally graph shows how strongly these variables are correlated. We trim the data to the 10 best records to see how the points of the teams with the highest points are distributed in the data set.