I have a Kaggle dataset with information on the soccer players in FIFA 2019. The problem is that it contains each player's Club but not the League that club plays in. However, an older dataset I have contains both club and league information. I will join the two dataframes in order to add the League information to the current dataset I will be working with.

library(psych) #for describe function to work
library(knitr)
library(ggplot2)
library(data.table) #for rename columns
library(magrittr) #for specialized pipes
library(dplyr)
library(plyr)#for joining dataframes

Data Preprocessing of 2019 FIFA Dataset

# Load the 2019 FIFA player dataset and preview the first rows.
df <- read.csv("data.csv")
head(df)

# Keep only the columns needed for the analysis.
fifadf <- df[, c(
  "Name", "Club", "Position", "Potential", "Special", "Age",
  "Nationality", "International.Reputation", "Value", "Wage", "Overall"
), drop = FALSE]

# Optional filter (disabled): keep only the high-potential players.
# fifadf <- fifadf[which(fifadf$Potential > 90), ]

# Treat the identifying text columns as categorical variables.
cols <- c("Name", "Club", "Nationality")
fifadf[cols] <- lapply(fifadf[cols], function(column) factor(column))
head(fifadf)

Data Preprocessing of League Dataset

# Load the older dataset, which has both a `league` and a `club` column.
leaguedf <- read.csv("complete.csv")
head(leaguedf)

# Build a club -> league lookup table with one row per distinct league/club
# pair, renaming the columns so that `Club` matches the key used in `fifadf`.
# No group_by() is needed for de-duplication, and renaming inside select()
# avoids data.table::setnames() modifying a grouped tibble by reference (which
# would leave its grouping metadata pointing at the old column names).
a <- leaguedf %>%
  dplyr::select(League = league, Club = club) %>%
  dplyr::distinct()

# Sort by league and keep the result as a plain data.frame with fresh row names.
a <- as.data.frame(a[order(a$League), ])
rownames(a) <- NULL

Join two dataframes

library(plyr)
# Inner join: keep only the players whose Club also appears in the lookup
# table `a`, and attach that club's League to each of them.
# The key is spelled out so the join cannot silently start matching on any
# other column the two tables might come to share.
# NOTE(review): if a club name appears under more than one league in `a`,
# its players are duplicated once per league; verify `a$Club` is unique.
dat <- plyr::join(fifadf, a, by = "Club", type = "inner")
head(dat)

Examine how many players are in each Club

# plyr was attached after dplyr, so plyr's summarise()/arrange() mask the
# dplyr versions that the grouped pipelines below rely on. Detach plyr, but
# only if it is still attached, so re-running this section does not error.
if ("package:plyr" %in% search()) {
  detach("package:plyr", character.only = TRUE)
}

# Make sure both grouping keys are factors.
dat[c("League", "Club")] <- lapply(dat[c("League", "Club")], factor)

# Count the players in every (League, Club) pair, largest clubs first, and
# save the result as `b` (one row per club; `no_rows` is the player count).
b <- dat %>%
  dplyr::group_by(League, Club) %>%
  dplyr::summarise(no_rows = dplyr::n()) %>%
  dplyr::ungroup() %>% # drop the leftover League grouping
  dplyr::arrange(dplyr::desc(no_rows))

b

Examine how many Clubs are in each League

# `b` holds one row per (League, Club) pair, so the number of rows per League
# is the number of clubs that league has in our dataset. Show the leagues
# with the most clubs first.
b %>%
  dplyr::group_by(League) %>%
  dplyr::summarise(no_rows = dplyr::n()) %>%
  dplyr::arrange(dplyr::desc(no_rows))