I have a Kaggle dataset with information on FIFA 2019 soccer players. The problem is that it contains only Club information, not League information. However, another, less current dataset I have contains both club and league information. I will join the two data frames in order to bring League information into the current dataset I will be working with.
library(psych) #for describe function to work
library(knitr)
library(ggplot2)
library(data.table) #for rename columns
library(magrittr) #for specialized pipes
library(dplyr)
library(plyr)#for joining dataframes
# Load the 2019 FIFA player data and preview the first rows.
df <- read.csv("data.csv")
head(df)

# Keep only the columns used in this analysis.
fifadf <- df[, c(
  "Name", "Club", "Position", "Potential", "Special", "Age", "Nationality",
  "International.Reputation", "Value", "Wage", "Overall"
)]

# Optional filter, currently disabled: keep only the high-potential players.
# fifadf <- fifadf[which(fifadf$Potential > 90), ]

# Store the identifying text columns as factors.
cols <- c("Name", "Club", "Nationality")
fifadf[cols] <- lapply(cols, function(col_name) factor(fifadf[[col_name]]))
head(fifadf)
# Load the older dataset, which records both the club and its league.
leaguedf <- read.csv("complete.csv")
head(leaguedf)

# Build the League/Club lookup table `a`: one row per distinct league/club
# pair, sorted by league, with the columns renamed to League and Club.
#
# Base R is used on purpose. Renaming a grouped tibble by reference with
# data.table::setnames() leaves its grouping metadata pointing at the old
# column names, and the grouping was never needed to de-duplicate the rows.
# It also avoids magrittr's extract() alias, which tidyr::extract() would mask.
a <- unique(leaguedf[c("league", "club")])
names(a) <- c("League", "Club")
a <- a[order(a$League), , drop = FALSE]
rownames(a) <- NULL # renumber rows after de-duplicating and sorting
# Attach the League to every player by matching on Club. The inner join keeps
# only players whose club has an entry in the lookup table `a`.
#
# `a` holds one row per League/Club pair, so a club listed under several
# leagues would duplicate its players in the joined result; warn if so.
if (anyDuplicated(a$Club) > 0) {
  warning(
    "Some clubs are listed under more than one league; ",
    "their players will appear once per league after the join."
  )
}

# Call join() through the plyr namespace and name the key explicitly, so the
# step does not depend on plyr being attached or on which columns the two
# data frames happen to share.
dat <- plyr::join(fifadf, a, by = "Club", type = "inner")
head(dat)

# plyr masks dplyr verbs such as summarise() and arrange() when it is attached
# after dplyr, so remove it from the search path before the dplyr pipelines
# that follow. The guard avoids an error if plyr is not attached.
if ("package:plyr" %in% search()) {
  detach("package:plyr", character.only = TRUE)
}
# Count the players in each League/Club pair and list the largest squads
# first. The result is stored in `b` and printed.
dat$League <- factor(dat$League)
dat$Club <- factor(dat$Club)
b <- dat %>%
  group_by(League, Club) %>%
  summarise(no_rows = n()) %>%
  arrange(desc(no_rows))
b
# Count how many clubs each league contributes to the dataset, largest first.
# `b` has one row per League/Club pair, so the row count per League is the
# number of clubs in that league.
b %>%
  group_by(League) %>%
  summarise(no_rows = n()) %>%
  arrange(desc(no_rows))