Scraping with Python and plotting with R

# R libraries
library(reticulate)
library(tidyverse)
library(forcats)

# Py modules 
import pandas as pd
import requests
from io import StringIO

url = "https://fbref.com/en/comps/9/keepers/Premier-League-Stats"

# Download the page once and remove the HTML comment markers from the response
# text (presumably some tables are shipped inside comments -- confirm on the site).
response = requests.get(url, timeout=30)
response.raise_for_status()  # stop on an HTTP error status instead of parsing an error page
html = response.text.replace('<!--', '').replace('-->', '')

# Kept under its original name in case a later chunk still refers to it.
stripped_comment_tags = StringIO(html)

# Each read_html call gets its own buffer: a StringIO is exhausted after it has
# been read once, so sharing one object makes the second parse depend on pandas
# rewinding the stream internally.
df1 = pd.read_html(StringIO(html), attrs={'id': 'stats_squads_keeper_for'})[0]
df2 = pd.read_html(StringIO(html), attrs={'id': 'stats_keeper'})[0]

# Print data frames Py
print(df1)

##    Unnamed: 0_level_0 Unnamed: 1_level_0 Playing Time         ... Penalty Kicks               
##                 Squad               # Pl           MP Starts  ...           PKA PKsv PKm Save%
## 0             Arsenal                  2           35     35  ...             2    1   0  33.3
## 1         Aston Villa                  2           35     35  ...             1    0   0   0.0
## 2         Bournemouth                  3           35     35  ...             5    1   0  16.7
## 3           Brentford                  2           35     35  ...             2    0   0   0.0
## 4            Brighton                  2           34     34  ...             6    0   0   0.0
## 5             Burnley                  2           35     35  ...             6    0   0   0.0
## 6             Chelsea                  2           34     34  ...             3    1   1  25.0
## 7      Crystal Palace                  3           35     35  ...             3    0   0   0.0
## 8             Everton                  1           35     35  ...             8    0   0   0.0
## 9              Fulham                  1           35     35  ...             6    0   0   0.0
## 10          Liverpool                  2           35     35  ...             1    0   0   0.0
## 11         Luton Town                  1           35     35  ...             3    0   0   0.0
## 12    Manchester City                  2           34     34  ...             3    0   0   0.0
## 13     Manchester Utd                  1           34     34  ...             7    0   0   0.0
## 14      Newcastle Utd                  3           34     34  ...             4    1   0  20.0
## 15    Nott'ham Forest                  3           35     35  ...             3    0   0   0.0
## 16      Sheffield Utd                  2           35     35  ...             5    0   2   0.0
## 17          Tottenham                  1           34     34  ...             6    0   0   0.0
## 18           West Ham                  2           35     35  ...             8    2   0  20.0
## 19             Wolves                  2           35     35  ...             6    0   0   0.0
## 
## [20 rows x 21 columns]

# Show the player-level keeper table.
# NOTE(review): the output below contains a repeated header row (index 25, "Rk" / "Player"),
# so the columns presumably arrive as strings -- confirm before using them as numbers.
print(df2)

##    Unnamed: 0_level_0    Unnamed: 1_level_0  ... Penalty Kicks Unnamed: 26_level_0
##                    Rk                Player  ...         Save%             Matches
## 0                   1               Alisson  ...           0.0             Matches
## 1                   2       Alphonse Areola  ...          28.6             Matches
## 2                   3        Daniel Bentley  ...           NaN             Matches
## 3                   4       Martin Dúbravka  ...          20.0             Matches
## 4                   5               Ederson  ...           0.0             Matches
## 5                   6      Łukasz Fabiański  ...           0.0             Matches
## 6                   7          Mark Flekken  ...           0.0             Matches
## 7                   8       Wes Foderingham  ...           0.0             Matches
## 8                   9             Ivo Grbić  ...           NaN             Matches
## 9                  10        Dean Henderson  ...           0.0             Matches
## 10                 11         Sam Johnstone  ...           0.0             Matches
## 11                 12       Thomas Kaminski  ...           0.0             Matches
## 12                 13          Loris Karius  ...           NaN             Matches
## 13                 14     Caoimhín Kelleher  ...           NaN             Matches
## 14                 15            Bernd Leno  ...           0.0             Matches
## 15                 16     Emiliano Martínez  ...           0.0             Matches
## 16                 17         Remi Matthews  ...           NaN             Matches
## 17                 18        Arijanet Muric  ...           0.0             Matches
## 18                 19                  Neto  ...          16.7             Matches
## 19                 20           Robin Olsen  ...           NaN             Matches
## 20                 21           André Onana  ...           0.0             Matches
## 21                 22         Stefan Ortega  ...           NaN             Matches
## 22                 23        Đorđe Petrović  ...           0.0             Matches
## 23                 24       Jordan Pickford  ...           0.0             Matches
## 24                 25             Nick Pope  ...           NaN             Matches
## 25                 Rk                Player  ...         Save%             Matches
## 26                 26            Ionuț Radu  ...           NaN             Matches
## 27                 27        Aaron Ramsdale  ...           NaN             Matches
## 28                 28            David Raya  ...          33.3             Matches
## 29                 29               José Sá  ...           0.0             Matches
## 30                 30        Robert Sánchez  ...          33.3             Matches
## 31                 31             Matz Sels  ...           NaN             Matches
## 32                 32          Jason Steele  ...           0.0             Matches
## 33                 33      Thomas Strakosha  ...           NaN             Matches
## 34                 34        James Trafford  ...           0.0             Matches
## 35                 35          Mark Travers  ...           NaN             Matches
## 36                 36           Matt Turner  ...           0.0             Matches
## 37                 37       Bart Verbruggen  ...           0.0             Matches
## 38                 38     Guglielmo Vicario  ...           0.0             Matches
## 39                 39  Odisseas Vlachodimos  ...           0.0             Matches
## 
## [40 rows x 27 columns]

# Bring the pandas data frames over from the Python session as R data frames
rdf1 <- py$df1
rdf2 <- py$df2

# Flatten the column names: drop quotes, parentheses and spaces first,
# then turn the remaining comma separator into an underscore.
flat_names <- gsub(pattern = "['() ]", replacement = "", names(rdf1))
flat_names <- gsub(pattern = ",", replacement = "_", flat_names)
names(rdf1) <- flat_names

# Short names for the three columns used by the plots, addressed by position.
names(rdf1)[c(1, 7, 12)] <- c('Squad', 'GA', 'Wins')

# Bar chart of goals against per squad, bars ordered from most to fewest conceded.
rdf1 |>
  ggplot(aes(x = fct_reorder(Squad, -GA), y = GA)) +
  geom_col() +
  # Upper limit is taken from the data plus headroom for the labels above the bars.
  scale_y_continuous(limits = c(0, max(rdf1$GA) + 5)) +
  geom_text(aes(label = GA), vjust = -0.2) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 9)) +
  labs(title = 'Teams 2023-2024 Premier League',
       x = 'Squad',
       y = 'Goals Against')

# Bar chart of wins per squad, bars ordered from most to fewest wins.
ggplot(data = rdf1,
       mapping = aes(x = fct_reorder(Squad, -Wins), y = Wins)) +
  geom_col() +
  geom_text(mapping = aes(label = Wins), vjust = -0.2) +
  # Upper limit follows the data, with room for the labels drawn above the bars.
  scale_y_continuous(limits = c(0, max(rdf1$Wins) + 3)) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 9)
  ) +
  labs(
    x = 'Squad',
    y = 'Wins',
    title = 'Wins 2023-2024 Premier League'
  )

# List the raw column names of the player table before they are cleaned up (see output below).
names(rdf2)

##  [1] "('Unnamed: 0_level_0', 'Rk')"       "('Unnamed: 1_level_0', 'Player')"  
##  [3] "('Unnamed: 2_level_0', 'Nation')"   "('Unnamed: 3_level_0', 'Pos')"     
##  [5] "('Unnamed: 4_level_0', 'Squad')"    "('Unnamed: 5_level_0', 'Age')"     
##  [7] "('Unnamed: 6_level_0', 'Born')"     "('Playing Time', 'MP')"            
##  [9] "('Playing Time', 'Starts')"         "('Playing Time', 'Min')"           
## [11] "('Playing Time', '90s')"            "('Performance', 'GA')"             
## [13] "('Performance', 'GA90')"            "('Performance', 'SoTA')"           
## [15] "('Performance', 'Saves')"           "('Performance', 'Save%')"          
## [17] "('Performance', 'W')"               "('Performance', 'D')"              
## [19] "('Performance', 'L')"               "('Performance', 'CS')"             
## [21] "('Performance', 'CS%')"             "('Penalty Kicks', 'PKatt')"        
## [23] "('Penalty Kicks', 'PKA')"           "('Penalty Kicks', 'PKsv')"         
## [25] "('Penalty Kicks', 'PKm')"           "('Penalty Kicks', 'Save%')"        
## [27] "('Unnamed: 26_level_0', 'Matches')"

# Flatten the tuple-style column names listed above into plain identifiers.
rdf2 <- rdf2 %>%
  setNames(names(.) %>%
             gsub(pattern = "['() ]", replacement = "", .) %>%  # drop quotes, parentheses and spaces
             gsub(pattern = ",", replacement = "_"))            # comma separator -> underscore

# Short names for the columns used below, addressed by position.
names(rdf2)[1] <- 'RK'
names(rdf2)[2] <- 'Player'
names(rdf2)[3] <- 'Nation'
names(rdf2)[5] <- 'Squad'
names(rdf2)[7] <- 'Born'
names(rdf2)[8] <- 'MP'
names(rdf2)[17] <- 'Wins'

# The scraped table repeats its header line inside the body (index 25 in the
# printed output, where the Player column holds the text "Player"). Drop such rows
# before converting; otherwise they are coerced to NA and later appear in the
# plots as a keeper called "Player" and a squad called "Squad".
rdf2 <- rdf2 |>
  filter(is.na(Player) | Player != 'Player') |>
  mutate(MP = as.numeric(MP),
         Wins = as.numeric(Wins))

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `MP = as.numeric(MP)`.
## Caused by warning:
## ! NAs introduced by coercion

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Wins = as.numeric(Wins)`.
## Caused by warning:
## ! NAs introduced by coercion

# Wins per goalkeeper, bars ordered from most to fewest wins.
# Missing win counts are shown as 0.
keeper_wins <- rdf2 |>
  mutate(Wins = coalesce(Wins, 0))

keeper_wins |>
  ggplot(aes(x = fct_reorder(Player, -Wins), y = Wins)) +
  geom_col() +
  # Take the upper limit from the data: with a fixed limit, ggplot drops any bar
  # whose value exceeds it, which happens as soon as the scraped numbers grow.
  scale_y_continuous(limits = c(0, max(keeper_wins$Wins) + 3)) +
  geom_text(aes(label = Wins), vjust = -0.2) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 9)) +
  labs(title = 'Players wins 2023-2024 Premier League',
       x = 'Players',
       y = 'Wins')

# Wins per goalkeeper for a selection of squads, one panel per squad.
# Missing win counts are shown as 0.
keeper_wins_selected <- rdf2 |>
  mutate(Wins = coalesce(Wins, 0)) |>
  filter(Squad %in% c('Crystal Palace',
                      'Newcastle Utd',
                      "Nott'ham Forest",
                      'Arsenal',
                      'Aston Villa',
                      'Bournemouth',
                      'Brentford',
                      'Brighton'))

keeper_wins_selected |>
  ggplot(aes(x = fct_reorder(Player, -Wins), y = Wins)) +
  geom_col() +
  # Shared upper limit taken from the data: with a fixed limit, ggplot drops any
  # bar whose value exceeds it.
  scale_y_continuous(limits = c(0, max(keeper_wins_selected$Wins) + 3)) +
  geom_text(aes(label = Wins), vjust = -0.2) +
  theme_minimal() +
  facet_wrap(~Squad, scales = 'free') +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 9)) +
  labs(title = 'Players wins 2023-2024 Premier League',
       x = 'Players',
       y = 'Wins')

# Wins per squad, obtained by summing the wins of each squad's goalkeepers.
# Missing win counts are counted as 0.
squad_wins <- rdf2 |>
  mutate(Wins = coalesce(Wins, 0)) |>
  group_by(Squad) |>
  summarise(Wins = sum(Wins))

squad_wins |>
  ggplot(aes(x = fct_reorder(Squad, -Wins), y = Wins)) +
  geom_col() +
  # Take the upper limit from the data: with a fixed limit, ggplot drops the bar
  # of any squad whose total exceeds it.
  scale_y_continuous(limits = c(0, max(squad_wins$Wins) + 3)) +
  geom_text(aes(label = Wins), vjust = -0.2) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 9)) +
  labs(title = 'Squad wins 2023-2024 Premier League',
       x = 'Squad',
       y = 'Wins')

# Interesting links
# https://medium.com/data-science-at-microsoft/collaborating-between-python-and-r-using-reticulate-25246b367957

Scraping with Python and plotting with R

M.Acosta

2024-05-03