Data Preparation

We read the data directly from the ESPN website. The steps below require a web-scraping package to extract the HTML tables. We collect Premier League standings for the five seasons from 2018-19 through 2022-23. Every team played 38 matches in each season.

# Load the packages used to scrape the data from the web
library(rvest)
library(tidyverse)
library(vtable)
# Libraries
library(tidyverse)
library(hrbrthemes)
library(viridis)
# Season 2022-2023 ----
# Download the ESPN standings page for the season that started in 2022.
url <- "https://www.espn.com/soccer/standings/_/league/ENG.1/season/2022"
webpage <- read_html(url)

# The page is parsed into a list of HTML tables: the first table carries the
# team labels and the second one the per-team statistics.
sb_table <- html_nodes(webpage, "table")
pmr <- html_table(sb_table, fill = TRUE)[[2]]

# Read the club names from the page itself, in table order, so every row is
# labelled with the club it actually belongs to (no hand-typed name list).
# NOTE(review): assumes ESPN renders the full club name in a `.hide-mobile`
# element inside the first table -- confirm against the live markup.
team_names22 <- sb_table[[1]] %>%
  html_nodes(".hide-mobile") %>%
  html_text(trim = TRUE)
if (length(team_names22) != nrow(pmr)) {
  stop(
    "Scraped ", length(team_names22), " team names for ", nrow(pmr),
    " standings rows (season 2022); the ESPN page layout may have changed.",
    call. = FALSE
  )
}
pmr$Team <- team_names22

# Keep the standings columns and give them descriptive names. `GP` is left
# un-renamed on purpose: the season tables are row-bound later and must all
# share the same column names.
pmr_league22 <- pmr %>%
  select(Team, GP, W, D, L, "F", A, GD, P) %>%
  rename(
    Wins = W,
    Draw = D,
    Loss = L,
    Goal_scored = "F",
    Goals_Conceided = A,
    Goal_Difference = GD,
    Points = P
  )

# Final season table: club name first, then the statistics, then the season.
prm_league22 <- pmr_league22 %>%
  mutate(English_team = Team, season = "2022-2023") %>%
  select(English_team, GP:Points, season)
# Season 2021-2022 ----
# Download the ESPN standings page for the season that started in 2021.
url21 <- "https://www.espn.com/soccer/standings/_/league/ENG.1/season/2021"
webpage21 <- read_html(url21)

# The page is parsed into a list of HTML tables: the first table carries the
# team labels and the second one the per-team statistics.
sb_table21 <- html_nodes(webpage21, "table")
pmr21 <- html_table(sb_table21, fill = TRUE)[[2]]

# Read the club names from this season's page, in table order. A fixed list
# of names copied from another season would mislabel the rows, because both
# the finishing order and the clubs in the league differ between seasons.
# NOTE(review): assumes ESPN renders the full club name in a `.hide-mobile`
# element inside the first table -- confirm against the live markup.
team_names21 <- sb_table21[[1]] %>%
  html_nodes(".hide-mobile") %>%
  html_text(trim = TRUE)
if (length(team_names21) != nrow(pmr21)) {
  stop(
    "Scraped ", length(team_names21), " team names for ", nrow(pmr21),
    " standings rows (season 2021); the ESPN page layout may have changed.",
    call. = FALSE
  )
}
pmr21$Team <- team_names21

# Keep the standings columns and give them descriptive names. `GP` is left
# un-renamed on purpose: the season tables are row-bound later and must all
# share the same column names.
pmr_league21 <- pmr21 %>%
  select(Team, GP, W, D, L, "F", A, GD, P) %>%
  rename(
    Wins = W,
    Draw = D,
    Loss = L,
    Goal_scored = "F",
    Goals_Conceided = A,
    Goal_Difference = GD,
    Points = P
  )

# Final season table: club name first, then the statistics, then the season.
prm_league21 <- pmr_league21 %>%
  mutate(English_team = Team, season = "2021-2022") %>%
  select(English_team, GP:Points, season)
prm_league21
## # A tibble: 20 × 10
##    English_team               GP  Wins  Draw  Loss Goal_scored Goals_Conceided
##    <chr>                   <int> <int> <int> <int>       <int>           <int>
##  1 Manchester City            38    29     6     3          99              26
##  2 Arsenal                    38    28     8     2          94              26
##  3 Manchester United          38    21    11     6          76              33
##  4 Newcastle United           38    22     5    11          69              40
##  5 Liverpool                  38    22     3    13          61              48
##  6 Brighton & Hove Albion     38    16    10    12          57              57
##  7 Aston Villa                38    16     8    14          60              51
##  8 Tottenham Hotspur          38    14    10    14          62              59
##  9 Brentford                  38    12    15    11          42              44
## 10 Fulham                     38    15     6    17          38              43
## 11 Crystal Palace             38    13    10    15          44              62
## 12 Chelsea                    38    11    15    12          50              46
## 13 Wolverhampton Wanderers    38    13     7    18          48              56
## 14 West Ham United            38    13     6    19          52              54
## 15 AFC Bournemouth            38     9    13    16          43              67
## 16 Nottingham Forest          38    11     6    21          43              66
## 17 Everton                    38     9    11    18          42              79
## 18 Leicester City             38     7    14    17          34              53
## 19 Leeds United               38     6     5    27          34              77
## 20 Southampton                38     5     7    26          23              84
## # ℹ 3 more variables: Goal_Difference <int>, Points <int>, season <chr>
#######################################################
# Season 2020-2021 ----
# Download the ESPN standings page for the season that started in 2020.
url20 <- "https://www.espn.com/soccer/standings/_/league/ENG.1/season/2020"
webpage20 <- read_html(url20)

# The page is parsed into a list of HTML tables: the first table carries the
# team labels and the second one the per-team statistics.
sb_table20 <- html_nodes(webpage20, "table")
pmr20 <- html_table(sb_table20, fill = TRUE)[[2]]

# Read the club names from this season's page, in table order. A fixed list
# of names copied from another season would mislabel the rows, because both
# the finishing order and the clubs in the league differ between seasons.
# NOTE(review): assumes ESPN renders the full club name in a `.hide-mobile`
# element inside the first table -- confirm against the live markup.
team_names20 <- sb_table20[[1]] %>%
  html_nodes(".hide-mobile") %>%
  html_text(trim = TRUE)
if (length(team_names20) != nrow(pmr20)) {
  stop(
    "Scraped ", length(team_names20), " team names for ", nrow(pmr20),
    " standings rows (season 2020); the ESPN page layout may have changed.",
    call. = FALSE
  )
}
pmr20$Team <- team_names20

# Keep the standings columns and give them descriptive names. `GP` is left
# un-renamed on purpose: the season tables are row-bound later and must all
# share the same column names.
pmr_league20 <- pmr20 %>%
  select(Team, GP, W, D, L, "F", A, GD, P) %>%
  rename(
    Wins = W,
    Draw = D,
    Loss = L,
    Goal_scored = "F",
    Goals_Conceided = A,
    Goal_Difference = GD,
    Points = P
  )

# Final season table: club name first, then the statistics, then the season.
prm_league20 <- pmr_league20 %>%
  mutate(English_team = Team, season = "2020-2021") %>%
  select(English_team, GP:Points, season)
prm_league20
## # A tibble: 20 × 10
##    English_team               GP  Wins  Draw  Loss Goal_scored Goals_Conceided
##    <chr>                   <int> <int> <int> <int>       <int>           <int>
##  1 Manchester City            38    27     5     6          83              32
##  2 Arsenal                    38    21    11     6          73              44
##  3 Manchester United          38    20     9     9          68              42
##  4 Newcastle United           38    19    10     9          58              36
##  5 Liverpool                  38    20     6    12          68              50
##  6 Brighton & Hove Albion     38    19     8    11          62              47
##  7 Aston Villa                38    18     8    12          68              45
##  8 Tottenham Hotspur          38    18     7    13          55              39
##  9 Brentford                  38    18     5    15          62              54
## 10 Fulham                     38    17     8    13          47              48
## 11 Crystal Palace             38    16     7    15          55              46
## 12 Chelsea                    38    12     9    17          46              62
## 13 Wolverhampton Wanderers    38    12     9    17          36              52
## 14 West Ham United            38    12     8    18          41              66
## 15 AFC Bournemouth            38    12     7    19          47              68
## 16 Nottingham Forest          38     9    14    15          40              46
## 17 Everton                    38    10     9    19          33              55
## 18 Leicester City             38     5    13    20          27              53
## 19 Leeds United               38     5    11    22          35              76
## 20 Southampton                38     7     2    29          20              63
## # ℹ 3 more variables: Goal_Difference <int>, Points <int>, season <chr>
##################################################################
# Season 2019-2020 ----
# Download the ESPN standings page for the season that started in 2019.
url19 <- "https://www.espn.com/soccer/standings/_/league/ENG.1/season/2019"
webpage19 <- read_html(url19)

# The page is parsed into a list of HTML tables: the first table carries the
# team labels and the second one the per-team statistics.
sb_table19 <- html_nodes(webpage19, "table")
pmr19 <- html_table(sb_table19, fill = TRUE)[[2]]

# Read the club names from this season's page, in table order. A fixed list
# of names copied from another season would mislabel the rows, because both
# the finishing order and the clubs in the league differ between seasons.
# NOTE(review): assumes ESPN renders the full club name in a `.hide-mobile`
# element inside the first table -- confirm against the live markup.
team_names19 <- sb_table19[[1]] %>%
  html_nodes(".hide-mobile") %>%
  html_text(trim = TRUE)
if (length(team_names19) != nrow(pmr19)) {
  stop(
    "Scraped ", length(team_names19), " team names for ", nrow(pmr19),
    " standings rows (season 2019); the ESPN page layout may have changed.",
    call. = FALSE
  )
}
pmr19$Team <- team_names19

# Keep the standings columns and give them descriptive names. `GP` is left
# un-renamed on purpose: the season tables are row-bound later and must all
# share the same column names.
pmr_league19 <- pmr19 %>%
  select(Team, GP, W, D, L, "F", A, GD, P) %>%
  rename(
    Wins = W,
    Draw = D,
    Loss = L,
    Goal_scored = "F",
    Goals_Conceided = A,
    Goal_Difference = GD,
    Points = P
  )

# Final season table: club name first, then the statistics, then the season.
prm_league19 <- pmr_league19 %>%
  mutate(English_team = Team, season = "2019-2020") %>%
  select(English_team, GP:Points, season)
prm_league19
## # A tibble: 20 × 10
##    English_team               GP  Wins  Draw  Loss Goal_scored Goals_Conceided
##    <chr>                   <int> <int> <int> <int>       <int>           <int>
##  1 Manchester City            38    32     3     3          85              33
##  2 Arsenal                    38    26     3     9         102              35
##  3 Manchester United          38    18    12     8          66              36
##  4 Newcastle United           38    20     6    12          69              54
##  5 Liverpool                  38    18     8    12          67              41
##  6 Brighton & Hove Albion     38    16    11    11          61              47
##  7 Aston Villa                38    15    14     9          51              40
##  8 Tottenham Hotspur          38    14    14    10          56              48
##  9 Brentford                  38    14    12    12          39              39
## 10 Fulham                     38    15     9    14          43              50
## 11 Crystal Palace             38    15     7    16          51              60
## 12 Chelsea                    38    13    10    15          44              56
## 13 Wolverhampton Wanderers    38    11    11    16          38              58
## 14 West Ham United            38    11    10    17          31              50
## 15 AFC Bournemouth            38     9    14    15          39              54
## 16 Nottingham Forest          38    10     9    19          49              62
## 17 Everton                    38     9     8    21          41              67
## 18 Leicester City             38     9     7    22          40              65
## 19 Leeds United               38     8    10    20          36              64
## 20 Southampton                38     5     6    27          26              75
## # ℹ 3 more variables: Goal_Difference <int>, Points <int>, season <chr>
###################################################################
# Season 2018-2019 ----
# Download the ESPN standings page for the season that started in 2018.
url18 <- "https://www.espn.com/soccer/standings/_/league/ENG.1/season/2018"
webpage18 <- read_html(url18)

# The page is parsed into a list of HTML tables: the first table carries the
# team labels and the second one the per-team statistics.
sb_table18 <- html_nodes(webpage18, "table")
pmr18 <- html_table(sb_table18, fill = TRUE)[[2]]

# Read the club names from this season's page, in table order. A fixed list
# of names copied from another season would mislabel the rows, because both
# the finishing order and the clubs in the league differ between seasons.
# NOTE(review): assumes ESPN renders the full club name in a `.hide-mobile`
# element inside the first table -- confirm against the live markup.
team_names18 <- sb_table18[[1]] %>%
  html_nodes(".hide-mobile") %>%
  html_text(trim = TRUE)
if (length(team_names18) != nrow(pmr18)) {
  stop(
    "Scraped ", length(team_names18), " team names for ", nrow(pmr18),
    " standings rows (season 2018); the ESPN page layout may have changed.",
    call. = FALSE
  )
}
pmr18$Team <- team_names18

# Keep the standings columns and give them descriptive names. `GP` is left
# un-renamed on purpose: the season tables are row-bound later and must all
# share the same column names.
pmr_league18 <- pmr18 %>%
  select(Team, GP, W, D, L, "F", A, GD, P) %>%
  rename(
    Wins = W,
    Draw = D,
    Loss = L,
    Goal_scored = "F",
    Goals_Conceided = A,
    Goal_Difference = GD,
    Points = P
  )

# Final season table: club name first, then the statistics, then the season.
prm_league18 <- pmr_league18 %>%
  mutate(English_team = Team, season = "2018-2019") %>%
  select(English_team, GP:Points, season)
prm_league18
## # A tibble: 20 × 10
##    English_team               GP  Wins  Draw  Loss Goal_scored Goals_Conceided
##    <chr>                   <int> <int> <int> <int>       <int>           <int>
##  1 Manchester City            38    32     2     4          95              23
##  2 Arsenal                    38    30     7     1          89              22
##  3 Manchester United          38    21     9     8          63              39
##  4 Newcastle United           38    23     2    13          67              39
##  5 Liverpool                  38    21     7    10          73              51
##  6 Brighton & Hove Albion     38    19     9    10          65              54
##  7 Aston Villa                38    16     9    13          47              46
##  8 Tottenham Hotspur          38    15     9    14          54              46
##  9 Brentford                  38    15     7    16          51              48
## 10 Fulham                     38    15     7    16          52              55
## 11 Crystal Palace             38    14     8    16          52              59
## 12 Chelsea                    38    14     7    17          51              53
## 13 Wolverhampton Wanderers    38    12     9    17          42              48
## 14 West Ham United            38    13     6    19          56              70
## 15 AFC Bournemouth            38    11     7    20          45              68
## 16 Nottingham Forest          38     9    12    17          45              65
## 17 Everton                    38     9     9    20          35              60
## 18 Leicester City             38    10     4    24          34              69
## 19 Leeds United               38     7     5    26          34              81
## 20 Southampton                38     3     7    28          22              76
## # ℹ 3 more variables: Goal_Difference <int>, Points <int>, season <chr>
# Stack the five season tables (newest season first) into one data set,
# then preview its first rows.
team_stat <- prm_league22 %>%
  rbind(
    prm_league21,
    prm_league20,
    prm_league19,
    prm_league18
  )
team_stat %>% head()
## # A tibble: 6 × 10
##   English_team              GP  Wins  Draw  Loss Goal_scored Goals_Conceided
##   <chr>                  <int> <int> <int> <int>       <int>           <int>
## 1 Manchester City           38    28     5     5          94              33
## 2 Arsenal                   38    26     6     6          88              43
## 3 Manchester United         38    23     6     9          58              43
## 4 Newcastle United          38    19    14     5          68              33
## 5 Liverpool                 38    19    10     9          75              47
## 6 Brighton & Hove Albion    38    18     8    12          72              53
## # ℹ 3 more variables: Goal_Difference <int>, Points <int>, season <chr>
# Summary-statistics table (vtable) for every column of the combined data
vtable::st(
  data = team_stat,
  title = "Summary Statistics  for 5 Seasons"
)
Summary Statistics for 5 Seasons
Variable N Mean Std. Dev. Min Pctl. 25 Pctl. 75 Max
GP 100 38 0 38 38 38 38
Wins 100 15 6.4 3 10 18 32
Draw 100 8.4 3 2 6.8 10 15
Loss 100 15 6 1 11 19 29
Goal_scored 100 53 18 20 40 64 102
Goals_Conceided 100 53 13 22 45 62 84
Goal_Difference 100 0 29 -61 -21 15 73
Points 100 53 18 16 40 65 99
season 100
… 2018-2019 20 20%
… 2019-2020 20 20%
… 2020-2021 20 20%
… 2021-2022 20 20%
… 2022-2023 20 20%

Research question

Which team is going to win the Premier League this season? What are the most significant traits that can help a team win the league? Is it defense or attack?

Cases

# Number of cases: one row per team per season
team_stat %>% nrow()
## [1] 100

We have 100 cases: 20 teams in each of 5 seasons, so a team that stayed in the league throughout appears 5 times.

Data collection

We collected the data directly from the ESPN website using web-scraping techniques. We then modified the data set by adding a season column and renaming the variables.

Type of study

This is an observational study. The data are recorded during the games and summarized at the end of every season.

Data Source

ESPN Internet Ventures. (n.d.). 2018-19 English premier league standings. ESPN. https://www.espn.com/soccer/standings/_/league/ENG.1/season/2018

ESPN Internet Ventures. (n.d.). 2019-20 English premier league standings. ESPN. https://www.espn.com/soccer/standings/_/league/ENG.1/season/2019

ESPN Internet Ventures. (n.d.). 2020-21 English premier league standings. ESPN. https://www.espn.com/soccer/standings/_/league/ENG.1/season/2020

ESPN Internet Ventures. (n.d.). 2021-22 English premier league standings. ESPN. https://www.espn.com/soccer/standings/_/league/ENG.1/season/2021

ESPN Internet Ventures. (n.d.). 2022-23 English premier league standings. ESPN. https://www.espn.com/soccer/standings/_/league/ENG.1/season/2022

Dependent Variable

The dependent variable, Points, is quantitative (numeric).

Independent Variable(s)

The independent variables are Team (the English_team column), Goal_scored, and Wins. Team is qualitative; Wins and Goal_scored are both quantitative.

Relevant summary statistics

Provide summary statistics for each the variables. Also include appropriate visualizations related to your research question (e.g. scatter plot, boxplots, etc). This step requires the use of R, hence a code chunk is provided below. Insert more code chunks as needed.

library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Pairs plot of Wins, Goal_scored and Points to show how these variables
# are correlated with each other
team_stat %>%
  ggpairs(columns = c("Wins", "Goal_scored", "Points"))

# Bar plot ----
# Keep the 30 team-season rows with the most goals scored, as a plain
# data frame.
team <- team_stat %>%
  arrange(desc(Goal_scored)) %>%
  head(30) %>%
  as.data.frame()

# Horizontal bars of goals scored per team. A team with several rows among
# the 30 gets its values stacked into one bar.
ggplot(team, aes(x = English_team, y = Goal_scored)) +
  geom_col() +
  coord_flip()

# Box plot ----
# The title and axis label describe the top-rated teams by points, so rank
# the team-season rows of the full data set by Points before keeping the
# best 10 (rather than reusing rows that were ranked by goals scored).
team_stat %>%
  arrange(desc(Points)) %>%
  head(10) %>%
  ggplot(aes(x = English_team, y = Points, fill = English_team)) +
  geom_boxplot() +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6) +
  # Overlay the individual season records on top of each box
  geom_jitter(color = "black", size = 0.4, alpha = 0.9) +
  theme_ipsum() +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 11)
  ) +
  ggtitle("Points distributions for 5 seasons top rated-Teams") +
  xlab("Teams with Maximum Points collected")
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## not found in Windows font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## not found in Windows font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

# Per-variable summaries of the columns used in the analysis
summary(team_stat[["English_team"]])
##    Length     Class      Mode 
##       100 character character
summary(team_stat[["Wins"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3.00   10.00   14.00   14.79   18.25   32.00
summary(team_stat[["Goal_scored"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   20.00   39.75   51.00   52.85   63.50  102.00
summary(team_stat[["Points"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   16.00   39.75   51.50   52.79   65.25   99.00

The summary statistics give us a clear picture of the kind of data we are working with. Across the 100 team-seasons, the mean is about 52.8 points (median 51.5) and about 15 wins per season. Our GGally pairs plot shows how strongly these variables are correlated. For the box plot, we trim the data to the 10 best records to see how points are distributed among the teams with the highest point totals.