library(dplyr)
library(stringr)
library(measurements)
library(FactoMineR)
library(factoextra)
feetToCm <- function(x){
feet <- as.numeric(str_split(x, "'", simplify = T)[1])
inch <- as.numeric(str_split(x, "'", simplify = T)[2])
cm <- conv_unit(feet, from = "ft", to = "cm") +
conv_unit(inch, from = "inch", to = "cm")
cm
}
posRating <- function(x){
pr <- as.numeric(str_split(x, "\\+", simplify = T)[1]) +
as.numeric(str_split(x, "\\+", simplify = T)[2])
pr
}We are using FIFA 2019 Player dataset from Kaggle:
This data includes latest edition FIFA 2019 players attributes like Age, Nationality, Overall, Potential, Club, Value, Wage, Preferred Foot, International Reputation, Weak Foot, Skill Moves, Work Rate, Position, Jersey Number, Joined, Loaned From, Contract Valid Until, Height, Weight, LS, ST, RS, LW, LF, CF, RF, RW, LAM, CAM, RAM, LM, LCM, CM, RCM, RM, LWB, LDM, CDM, RDM, RWB, LB, LCB, CB, RCB, RB, Crossing, Finishing, Heading, Accuracy, ShortPassing, Volleys, Dribbling, Curve, FKAccuracy, LongPassing, BallControl, Acceleration, SprintSpeed, Agility, Reactions, Balance, ShotPower, Jumping, Stamina, Strength, LongShots, Aggression, Interceptions, Positioning, Vision, Penalties, Composure, Marking, StandingTackle, SlidingTackle, GKDiving, GKHandling, GKKicking, GKPositioning, GKReflexes, and Release Clause.
We are going to predict the Player Value based on these variables.
f <- read.csv("fifa19.csv")We’re going to remove column line number, ID, Name, Photo, Flag, Club Logo, and Player Real Face, as we don’t need those variables for our prediction model
f <- f[, c("Age", "Value","Wage", "Height", "Weight", "Release.Clause")]
# f <- f[,-c(1,2,3,5,7,11,21)]f$Value <- as.numeric(str_extract_all(f$Value,"\\(?[0-9,.]+\\)?", simplify = T))
f$Wage <- as.numeric(str_extract_all(f$Wage,"\\(?[0-9,.]+\\)?", simplify = T))
f$Release.Clause <- as.numeric(str_extract_all(f$Release.Clause,"\\(?[0-9,.]+\\)?", simplify = T))f$Height <- unlist(lapply(f$Height, feetToCm))
f$Weight <- as.numeric(str_remove(f$Weight, "lbs"))f <- f[complete.cases(f),]f.scale <- scale(f)
f.pc <- PCA(f.scale, graph = F)f.pc$eig[,3]## comp 1 comp 2 comp 3 comp 4 comp 5 comp 6
## 30.68356 55.72634 71.75883 84.37016 96.17137 100.00000
As we can see from the result, if we want to get 80% of the information at the least, it will be optimal to choose 4 PCs.
f.new <- as.data.frame(reconst(f.pc, ncp=4))Iterate K for 3 <= K <= 7, as a practical optimal number of K
set.seed(2902)
res <- data.frame(k=integer(),
withinss=numeric(),
tot.withinss=numeric(),
betweenss=numeric(),
totss=numeric(),
bet.per.tot=numeric()
)
for (i in 3:7) {
km <- kmeans(f.new, i)
nr <- data.frame(
k=i,
withinss=km$withinss,
tot.withinss=km$tot.withinss,
betweenss=km$betweenss,
totss=km$totss,
size=km$size,
bet.per.tot = km$betweenss/km$totss)
res <-rbind(res, nr)
}res %>% group_by(k) %>% summarise(bet.per.tot = max(bet.per.tot))## # A tibble: 5 x 2
## k bet.per.tot
## <int> <dbl>
## 1 3 0.377
## 2 4 0.454
## 3 5 0.529
## 4 6 0.583
## 5 7 0.626
As we can see from the above data, the betweenSS/totSS value is quite good regardless for K number > 4 (above 50%)
Analyzing the elbow plot of the tot.withinss, the trend is good if we pick the K number 5
plot(y=res$tot.withinss, type="b", x = res$k)km <- kmeans(f.scale,5)
fviz_cluster(km, f.scale, labelsize = 0)f$Clust <- as.factor(km$cluster)
knitr::kable(f %>% group_by(Clust) %>%
summarise_all(mean))| Clust | Age | Value | Wage | Height | Weight | Release.Clause |
|---|---|---|---|---|---|---|
| 1 | 20.62807 | 136.52603 | 3.829934 | 183.7927 | 167.4745 | 174.14112 |
| 2 | 23.40874 | 534.73612 | 2.731721 | 175.6028 | 152.7457 | 356.40459 |
| 3 | 28.71961 | 18.57459 | 29.671048 | 186.3881 | 180.4677 | 30.86302 |
| 4 | 26.67077 | 29.76741 | 13.555180 | 175.3295 | 154.4943 | 46.01405 |
| 5 | 27.31231 | 523.03887 | 3.262300 | 185.9288 | 177.1147 | 458.42618 |
From the above data, we can see that