Tool to get download fbref 2021 stats into a single data frame
Loading libraries
library(tidyverse,quietly=TRUE)
library(rvest,quietly=TRUE)
library(janitor,quietly=TRUE)
library(dplyr,quietly=TRUE)
library(prismatic,quietly=TRUE)
library(broom,quietly=TRUE)
library(purrr,quietly=TRUE)
Scraping data
urls = c('https://fbref.com/en/comps/Big5/stats/players/Big-5-European-Leagues-Stats',
'https://fbref.com/en/comps/Big5/shooting/players/Big-5-European-Leagues-Stats',
'https://fbref.com/en/comps/Big5/passing/players/Big-5-European-Leagues-Stats',
'https://fbref.com/en/comps/Big5/passing_types/players/Big-5-European-Leagues-Stats',
'https://fbref.com/en/comps/Big5/defense/players/Big-5-European-Leagues-Stats',
'https://fbref.com/en/comps/Big5/gca/players/Big-5-European-Leagues-Stats',
'https://fbref.com/en/comps/Big5/possession/players/Big-5-European-Leagues-Stats',
'https://fbref.com/en/comps/Big5/playingtime/players/Big-5-European-Leagues-Stats',
'https://fbref.com/en/comps/Big5/misc/players/Big-5-European-Leagues-Stats')
stats=read_html(urls[1],as.data.frame=TRUE,stringAsFactors=TRUE) %>% html_nodes("table") %>% .[[1]] %>% html_table(fill=TRUE)
shooting=read_html(urls[2],as.data.frame=TRUE,stringAsFactors=TRUE) %>% html_nodes("table") %>% .[[1]] %>% html_table(fill=TRUE)
passing=read_html(urls[3],as.data.frame=TRUE,stringAsFactors=TRUE) %>% html_nodes("table") %>% .[[1]] %>% html_table(fill=TRUE)
passing_types=read_html(urls[4],as.data.frame=TRUE,stringAsFactors=TRUE) %>% html_nodes("table") %>% .[[1]] %>% html_table(fill=TRUE)
defence=read_html(urls[5],as.data.frame=TRUE,stringAsFactors=TRUE) %>% html_nodes("table") %>% .[[1]] %>% html_table(fill=TRUE)
gca=read_html(urls[6],as.data.frame=TRUE,stringAsFactors=TRUE) %>% html_nodes("table") %>% .[[1]] %>% html_table(fill=TRUE)
possession=read_html(urls[7],as.data.frame=TRUE,stringAsFactors=TRUE) %>% html_nodes("table") %>% .[[1]] %>% html_table(fill=TRUE)
playingtime=read_html(urls[8],as.data.frame=TRUE,stringAsFactors=TRUE) %>% html_nodes("table") %>% .[[1]] %>% html_table(fill=TRUE)
misc=read_html(urls[9],as.data.frame=TRUE,stringAsFactors=TRUE) %>% html_nodes("table") %>% .[[1]] %>% html_table(fill=TRUE)
Cleaning data - fbref has data entries that are the header repeated for readibility on site - this code removes non player rows all in one
list=list(stats,shooting,passing,passing_types,defence,gca,possession,misc)
# Changes first row to header, then removes duplicate rows of header that fbref uses to display table
list=lapply(list,function(df){
names(df)=as.matrix(df[1,])
df=df[-1,]
df$Rk=as.numeric(df$Rk)
df=df[!is.na(df$Rk),]
})
Renaming headings to more clear names and preparing data for merging, removing final column (hyperlink column on fbref), changing characters to integers
# Remove matches column
list=lapply(list,function(df){
df=df[1:length(df)-1]
})
# Cleans the stats dataframe - comes with some addition stats we dont want
list[[1]]=subset(list[[1]],select=-(9:11))
# converts data into numeric form
for (i in 1:8){
list[[i]][,9:length(list[[i]])]=sapply(list[[i]][,9:length(list[[i]])],as.numeric)
}
# NAMING COLUMNS TO AVOID DUPLICATES FOR MERGING
names(list[[1]])=c('Rk','Player','Nation','Pos','Squad','Comp','Age','Born','90s','Gls','Ast','npG','PK','PKA','YCrd','RCrd','Gls p90','Ast p90','GandA p90','npG p90','npGandA p90','xG','npxG','xA','npxG+xA','xG p90','xA p90','xG+xA p90','npxG p90','npxG+xA p90')
names(list[[2]])=c('Rk','Player','Nation','Pos','Squad','Comp','Age','Born','90s','Gls','Sh','SoT','SoT%','Sh p90','SoT p90','Gls pSh','Gls pSoT','Sh Dist','Sh FK','PK','PKA','xG','npxG','npxG pSh','G-xG','npG-xG')
names(list[[3]])=c('Rk','Player','Nation','Pos','Squad','Comp','Age','Born','90s','PassCmp','PassA','Pass%','PassTotDist','PassProgDist','ShortPassCmp','ShortPassA','ShortPass%','MedPassCmp','MedPassA','MedPass%','LongPassCmp','LongPassA','LongPass%','Ast','xA','A-xA','Key Passes','F3 Passes','PA Passes','PA Crosses','Prog Passes')
names(list[[4]])=c('Rk','Player','Nation','Pos','Squad','Comp','Age','Born','90s','PassA','Live Passes','Dead Passes','Passes FK','Thruballs','Pressed Passes','Switches','Crs','CK','InCK','OutCK','StCK','Ground Passes','Low Passes','High Passes','LF Passes','RF Passes','Head Passes','Throwins','OthBP Passes','PassCmp','PassOff','PassOut','PassInt','PassBlockedbyOpp')
names(list[[5]])=c('Rk','Player','Nation','Pos','Squad','Comp','Age','Born','90s','TacklesA','TacklesW','TklDef3rd','TklMid3rd','TklAtt3rd','1v1W','1v1A','1v1%','Past','Pressures','Succ_Press','Succ_Press%','PressDef3rd','PressMid3rd','PressAtt3rd','Blocks','ShBlocked','ShSv','PassBlocked','Int','Tkl+Int','Clr','Err')
names(list[[6]])=c('Rk','Player','Nation','Pos','Squad','Comp','Age','Born','90s','SCA','SCA90','SCAPassLive','SCAPassDead','SCADrib','SCASh','SCAFld','SCADef','GCA','GCA90','GCAPassLive','GCAPassDead','GCADrib','GCASh','GCAFld','GCADef')
names(list[[7]])=c('Rk','Player','Nation','Pos','Squad','Comp','Age','Born','90s','Touches','Touches DefPA','Touches Def3','Touches Mid3','Touches Att3','Touches AttPA','Touches Live','DribCmp','DribA','Drib%','PlayersPast','Megs','Carries','CarryTotDist','CarryProgDist','Prog Carries','F3 Carries','PA Carries','Mis','Dis','RecA','RecCmp','Rec%','Prog Rec')
names(list[[8]])=c('Rk','Player','Nation','Pos','Squad','Comp','Age','Born','90s','YCrd','RCrd','2YCrd','Fls','Fld','Off','Crs','Int','TacklesW','PKwon','PKcon','OG','Recov','AerW','AerL','Aer%')
Clearing duplicate rows - I chose to keep row where player has higher mins for simplicity. You could pick another method to merge rows.
# order by mins, filter duplicates, order by RK
for (i in 1:8){
list[[i]]=list[[i]][order(list[[i]]$`90s`,decreasing=TRUE),]
list[[i]]=list[[i]][!duplicated(list[[i]]$Player),]
list[[i]]=list[[i]][order(list[[i]]$Rk,decreasing=FALSE),]
}
Merging data into one final data frame - first remove Rk because these are not the same across the data frames (fbref is weird about their Rk columns across multiple tables)
# Removing Rk
list=lapply(list,function(df){
df=df[,2:length(df)]
})
# merging dataframes into data21
data21 = full_join(list[[1]],list[[2]])
for (i in 3:8){
data21 = full_join(data21,list[[i]])
}
Printing final data frame - from here can export to Excel or continue analysis in R
head(data21)
## # A tibble: 6 x 158
## Player Nation Pos Squad Comp Age Born `90s` Gls Ast npG PK
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Max Aar~ eng ENG DF Norwi~ eng P~ 22-0~ 2000 26.6 0 1 0 0
## 2 Yunis A~ ma MAR DF Reims fr Li~ 34-1~ 1987 24.1 1 0 1 0
## 3 Salis A~ gh GHA MF Clerm~ fr Li~ 21-3~ 2000 22.9 0 0 0 0
## 4 Laurent~ fr FRA MF Lorie~ fr Li~ 29-0~ 1993 23.2 0 2 0 0
## 5 Charles~ fr FRA FW Saint~ fr Li~ 21-3~ 2000 0.5 0 0 0 0
## 6 Dickson~ ng NGA FW Greut~ de Bu~ 23-1~ 1998 7.1 0 1 0 0
## # ... with 146 more variables: PKA <dbl>, YCrd <dbl>, RCrd <dbl>,
## # Gls p90 <dbl>, Ast p90 <dbl>, GandA p90 <dbl>, npG p90 <dbl>,
## # npGandA p90 <dbl>, xG <dbl>, npxG <dbl>, xA <dbl>, npxG+xA <dbl>,
## # xG p90 <dbl>, xA p90 <dbl>, xG+xA p90 <dbl>, npxG p90 <dbl>,
## # npxG+xA p90 <dbl>, Sh <dbl>, SoT <dbl>, SoT% <dbl>, Sh p90 <dbl>,
## # SoT p90 <dbl>, Gls pSh <dbl>, Gls pSoT <dbl>, Sh Dist <dbl>, Sh FK <dbl>,
## # npxG pSh <dbl>, G-xG <dbl>, npG-xG <dbl>, PassCmp <dbl>, PassA <dbl>,
## # Pass% <dbl>, PassTotDist <dbl>, PassProgDist <dbl>, ShortPassCmp <dbl>,
## # ShortPassA <dbl>, ShortPass% <dbl>, MedPassCmp <dbl>, MedPassA <dbl>,
## # MedPass% <dbl>, LongPassCmp <dbl>, LongPassA <dbl>, LongPass% <dbl>,
## # A-xA <dbl>, Key Passes <dbl>, F3 Passes <dbl>, PA Passes <dbl>,
## # PA Crosses <dbl>, Prog Passes <dbl>, Live Passes <dbl>, Dead Passes <dbl>,
## # Passes FK <dbl>, Thruballs <dbl>, Pressed Passes <dbl>, Switches <dbl>,
## # Crs <dbl>, CK <dbl>, InCK <dbl>, OutCK <dbl>, StCK <dbl>,
## # Ground Passes <dbl>, Low Passes <dbl>, High Passes <dbl>, LF Passes <dbl>,
## # RF Passes <dbl>, Head Passes <dbl>, Throwins <dbl>, OthBP Passes <dbl>,
## # PassOff <dbl>, PassOut <dbl>, PassInt <dbl>, PassBlockedbyOpp <dbl>,
## # TacklesA <dbl>, TacklesW <dbl>, TklDef3rd <dbl>, TklMid3rd <dbl>,
## # TklAtt3rd <dbl>, 1v1W <dbl>, 1v1A <dbl>, 1v1% <dbl>, Past <dbl>,
## # Pressures <dbl>, Succ_Press <dbl>, Succ_Press% <dbl>, PressDef3rd <dbl>,
## # PressMid3rd <dbl>, PressAtt3rd <dbl>, Blocks <dbl>, ShBlocked <dbl>,
## # ShSv <dbl>, PassBlocked <dbl>, Int <dbl>, Tkl+Int <dbl>, Clr <dbl>,
## # Err <dbl>, SCA <dbl>, SCA90 <dbl>, SCAPassLive <dbl>, SCAPassDead <dbl>,
## # SCADrib <dbl>, ...