# R libraries
library(reticulate)
library(tidyverse)
library(forcats)
# Py modules
import pandas as pd
import requests
from io import StringIO
url = "https://fbref.com/en/comps/9/keepers/Premier-League-Stats"
# Fetch the page. Fail loudly on an HTTP error instead of handing an error
# page to the HTML parser, and do not wait forever on a stalled connection.
response = requests.get(url, timeout=30)
response.raise_for_status()
# Remove the comment markers from the response text so that any table markup
# wrapped in <!-- ... --> is visible to the HTML parser
stripped_comment_tags = StringIO(response.text.replace('<!--', '').replace('-->', ''))
# Squad-level keeper table
df1 = pd.read_html(stripped_comment_tags, attrs={'id': 'stats_squads_keeper_for'})[0]
# The first read_html call consumed the buffer; rewind it so the second call
# parses the document from the start instead of from end-of-buffer
stripped_comment_tags.seek(0)
# Player-level keeper table
df2 = pd.read_html(stripped_comment_tags, attrs={'id': 'stats_keeper'})[0]
# Print the squad-level data frame
print(df1)
## Unnamed: 0_level_0 Unnamed: 1_level_0 Playing Time ... Penalty Kicks
## Squad # Pl MP Starts ... PKA PKsv PKm Save%
## 0 Arsenal 2 35 35 ... 2 1 0 33.3
## 1 Aston Villa 2 35 35 ... 1 0 0 0.0
## 2 Bournemouth 3 35 35 ... 5 1 0 16.7
## 3 Brentford 2 35 35 ... 2 0 0 0.0
## 4 Brighton 2 34 34 ... 6 0 0 0.0
## 5 Burnley 2 35 35 ... 6 0 0 0.0
## 6 Chelsea 2 34 34 ... 3 1 1 25.0
## 7 Crystal Palace 3 35 35 ... 3 0 0 0.0
## 8 Everton 1 35 35 ... 8 0 0 0.0
## 9 Fulham 1 35 35 ... 6 0 0 0.0
## 10 Liverpool 2 35 35 ... 1 0 0 0.0
## 11 Luton Town 1 35 35 ... 3 0 0 0.0
## 12 Manchester City 2 34 34 ... 3 0 0 0.0
## 13 Manchester Utd 1 34 34 ... 7 0 0 0.0
## 14 Newcastle Utd 3 34 34 ... 4 1 0 20.0
## 15 Nott'ham Forest 3 35 35 ... 3 0 0 0.0
## 16 Sheffield Utd 2 35 35 ... 5 0 2 0.0
## 17 Tottenham 1 34 34 ... 6 0 0 0.0
## 18 West Ham 2 35 35 ... 8 2 0 20.0
## 19 Wolves 2 35 35 ... 6 0 0 0.0
##
## [20 rows x 21 columns]
# Print the player-level keeper data frame.
# Note: the output below contains a repeated header line inside the data
# (the row at index 25 reads "Rk Player ... Save% Matches").
print(df2)
## Unnamed: 0_level_0 Unnamed: 1_level_0 ... Penalty Kicks Unnamed: 26_level_0
## Rk Player ... Save% Matches
## 0 1 Alisson ... 0.0 Matches
## 1 2 Alphonse Areola ... 28.6 Matches
## 2 3 Daniel Bentley ... NaN Matches
## 3 4 Martin Dúbravka ... 20.0 Matches
## 4 5 Ederson ... 0.0 Matches
## 5 6 Łukasz Fabiański ... 0.0 Matches
## 6 7 Mark Flekken ... 0.0 Matches
## 7 8 Wes Foderingham ... 0.0 Matches
## 8 9 Ivo Grbić ... NaN Matches
## 9 10 Dean Henderson ... 0.0 Matches
## 10 11 Sam Johnstone ... 0.0 Matches
## 11 12 Thomas Kaminski ... 0.0 Matches
## 12 13 Loris Karius ... NaN Matches
## 13 14 Caoimhín Kelleher ... NaN Matches
## 14 15 Bernd Leno ... 0.0 Matches
## 15 16 Emiliano Martínez ... 0.0 Matches
## 16 17 Remi Matthews ... NaN Matches
## 17 18 Arijanet Muric ... 0.0 Matches
## 18 19 Neto ... 16.7 Matches
## 19 20 Robin Olsen ... NaN Matches
## 20 21 André Onana ... 0.0 Matches
## 21 22 Stefan Ortega ... NaN Matches
## 22 23 Đorđe Petrović ... 0.0 Matches
## 23 24 Jordan Pickford ... 0.0 Matches
## 24 25 Nick Pope ... NaN Matches
## 25 Rk Player ... Save% Matches
## 26 26 Ionuț Radu ... NaN Matches
## 27 27 Aaron Ramsdale ... NaN Matches
## 28 28 David Raya ... 33.3 Matches
## 29 29 José Sá ... 0.0 Matches
## 30 30 Robert Sánchez ... 33.3 Matches
## 31 31 Matz Sels ... NaN Matches
## 32 32 Jason Steele ... 0.0 Matches
## 33 33 Thomas Strakosha ... NaN Matches
## 34 34 James Trafford ... 0.0 Matches
## 35 35 Mark Travers ... NaN Matches
## 36 36 Matt Turner ... 0.0 Matches
## 37 37 Bart Verbruggen ... 0.0 Matches
## 38 38 Guglielmo Vicario ... 0.0 Matches
## 39 39 Odisseas Vlachodimos ... 0.0 Matches
##
## [40 rows x 27 columns]
# Bring the pandas data frames into the R session via reticulate's `py` object
rdf1 <- py$df1
rdf2 <- py$df2
# Flatten the column names: strip quotes, parentheses and spaces, then turn
# every comma into an underscore
names(rdf1) <- gsub(",", "_", gsub("['() ]", "", names(rdf1)))
# Short names for the columns used by the plots, selected by position
names(rdf1)[c(1, 7, 12)] <- c("Squad", "GA", "Wins")
# Bar chart of goals against (GA) per squad, bars sorted from highest to
# lowest GA, with the value printed above each bar.
ggplot(rdf1, aes(x = fct_reorder(Squad, -GA), y = GA)) +
  geom_col() +
  # Headroom above the tallest bar so its value label stays inside the panel
  scale_y_continuous(limits = c(0, max(rdf1$GA) + 5)) +
  geom_text(aes(label = GA), vjust = -0.2) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 9)
  ) +
  labs(
    title = "Teams 2023-2024 Premier League",
    x = "Squad",
    y = "Goals Against" # fixed typo: was "Goals Agains"
  )

# Bar chart of wins per squad, sorted from most to fewest wins, with the
# win count printed above each bar.
ggplot(data = rdf1, mapping = aes(x = fct_reorder(Squad, -Wins), y = Wins)) +
  geom_col() +
  geom_text(aes(label = Wins), vjust = -0.2) +
  # Upper limit sits 3 above the largest value to leave room for the labels
  scale_y_continuous(limits = c(0, max(rdf1$Wins) + 3)) +
  labs(
    title = "Wins 2023-2024 Premier League",
    x = "Squad",
    y = "Wins"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 9)
  )

# Inspect rdf2's column names. As the output below shows, each name is a
# stringified two-element tuple, e.g. "('Performance', 'GA')", and the
# positions listed here are what the renaming further down relies on.
names(rdf2)
## [1] "('Unnamed: 0_level_0', 'Rk')" "('Unnamed: 1_level_0', 'Player')"
## [3] "('Unnamed: 2_level_0', 'Nation')" "('Unnamed: 3_level_0', 'Pos')"
## [5] "('Unnamed: 4_level_0', 'Squad')" "('Unnamed: 5_level_0', 'Age')"
## [7] "('Unnamed: 6_level_0', 'Born')" "('Playing Time', 'MP')"
## [9] "('Playing Time', 'Starts')" "('Playing Time', 'Min')"
## [11] "('Playing Time', '90s')" "('Performance', 'GA')"
## [13] "('Performance', 'GA90')" "('Performance', 'SoTA')"
## [15] "('Performance', 'Saves')" "('Performance', 'Save%')"
## [17] "('Performance', 'W')" "('Performance', 'D')"
## [19] "('Performance', 'L')" "('Performance', 'CS')"
## [21] "('Performance', 'CS%')" "('Penalty Kicks', 'PKatt')"
## [23] "('Penalty Kicks', 'PKA')" "('Penalty Kicks', 'PKsv')"
## [25] "('Penalty Kicks', 'PKm')" "('Penalty Kicks', 'Save%')"
## [27] "('Unnamed: 26_level_0', 'Matches')"
# Flatten the stringified tuple column names: strip quotes, parentheses and
# spaces, then turn the comma between the two header levels into an underscore
names(rdf2) <- gsub(",", "_", gsub("['() ]", "", names(rdf2)))
# Short names for the columns used below, selected by position
# (positions match the names(rdf2) listing: 1 Rk, 2 Player, 3 Nation,
# 5 Squad, 7 Born, 8 MP, 17 W)
names(rdf2)[c(1, 2, 3, 5, 7, 8, 17)] <-
  c("RK", "Player", "Nation", "Squad", "Born", "MP", "Wins")
rdf2 <- rdf2 |>
  # The scraped table repeats its header line inside the body (the row whose
  # Player value is literally "Player"). Drop it so it is not counted as a
  # keeper/squad in the plots and so as.numeric() is not fed header text,
  # which is what produces "NAs introduced by coercion".
  filter(Player != "Player") |>
  mutate(
    MP = as.numeric(MP),
    Wins = as.numeric(Wins)
  )
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `MP = as.numeric(MP)`.
## Caused by warning:
## ! NAs introduced by coercion
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Wins = as.numeric(Wins)`.
## Caused by warning:
## ! NAs introduced by coercion
# Bar chart of wins per keeper, sorted from most to fewest wins.
# Missing win counts are displayed as 0.
player_wins <- rdf2 |>
  mutate(Wins = case_when(
    is.na(Wins) ~ 0,
    TRUE ~ Wins
  ))
ggplot(player_wins, aes(x = fct_reorder(Player, -Wins), y = Wins)) +
  geom_col() +
  # Derive the upper limit from the data (as the rdf1 plots do) rather than a
  # fixed 23: the page is scraped live, and values outside fixed limits are
  # removed from the plot instead of being drawn
  scale_y_continuous(limits = c(0, max(player_wins$Wins) + 3)) +
  geom_text(aes(label = Wins), vjust = -0.2) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 9)
  ) +
  labs(
    title = "Players wins 2023-2024 Premier League",
    x = "Players",
    y = "Wins"
  )

# Wins per keeper for a hand-picked set of squads, one facet per squad.
# Missing win counts are displayed as 0.
selected_squads <- c(
  "Crystal Palace",
  "Newcastle Utd",
  "Nott'ham Forest",
  "Arsenal",
  "Aston Villa",
  "Bournemouth",
  "Brentford",
  "Brighton"
)
selected_player_wins <- rdf2 |>
  mutate(Wins = case_when(
    is.na(Wins) ~ 0,
    TRUE ~ Wins
  )) |>
  filter(Squad %in% selected_squads)
ggplot(selected_player_wins, aes(x = fct_reorder(Player, -Wins), y = Wins)) +
  geom_col() +
  # Derive the upper limit from the data (as the rdf1 plots do) rather than a
  # fixed 23: the page is scraped live, and values outside fixed limits are
  # removed from the plot instead of being drawn
  scale_y_continuous(limits = c(0, max(selected_player_wins$Wins) + 3)) +
  geom_text(aes(label = Wins), vjust = -0.2) +
  theme_minimal() +
  # Free scales let each facet show only its own squad's keepers on the x axis
  facet_wrap(~Squad, scales = "free") +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 9)
  ) +
  labs(
    title = "Players wins 2023-2024 Premier League",
    x = "Players",
    y = "Wins"
  )

# Wins per squad, obtained by summing the keepers' win counts within each
# squad, sorted from most to fewest wins. Missing win counts count as 0.
squad_wins <- rdf2 |>
  mutate(Wins = case_when(
    is.na(Wins) ~ 0,
    TRUE ~ Wins
  )) |>
  group_by(Squad) |>
  summarise(Wins = sum(Wins))
ggplot(squad_wins, aes(x = fct_reorder(Squad, -Wins), y = Wins)) +
  geom_col() +
  # Derive the upper limit from the data (as the rdf1 plots do) rather than a
  # fixed 25: the page is scraped live, and values outside fixed limits are
  # removed from the plot instead of being drawn
  scale_y_continuous(limits = c(0, max(squad_wins$Wins) + 3)) +
  geom_text(aes(label = Wins), vjust = -0.2) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 9)
  ) +
  labs(
    title = "Squad wins 2023-2024 Premier League",
    x = "Squad",
    y = "Wins"
  )

# Interesting links
# https://medium.com/data-science-at-microsoft/collaborating-between-python-and-r-using-reticulate-25246b367957