1 Initialization

1.1 Library Call & Function Definition

library(dplyr)
library(stringr)
library(measurements)
library(FactoMineR)
library(factoextra)

feetToCm <- function(x){
  feet <- as.numeric(str_split(x, "'", simplify = T)[1])
  inch <- as.numeric(str_split(x, "'", simplify = T)[2])
  cm <- conv_unit(feet, from = "ft", to = "cm") +
    conv_unit(inch, from = "inch", to = "cm")
  cm
}

posRating <- function(x){
  pr <- as.numeric(str_split(x, "\\+", simplify = T)[1]) +
    as.numeric(str_split(x, "\\+", simplify = T)[2])
  pr
}

1.2 Data Pre-Processing

1.2.1 Read Data Source

We are using FIFA 2019 Player dataset from Kaggle:

This data includes latest edition FIFA 2019 players attributes like Age, Nationality, Overall, Potential, Club, Value, Wage, Preferred Foot, International Reputation, Weak Foot, Skill Moves, Work Rate, Position, Jersey Number, Joined, Loaned From, Contract Valid Until, Height, Weight, LS, ST, RS, LW, LF, CF, RF, RW, LAM, CAM, RAM, LM, LCM, CM, RCM, RM, LWB, LDM, CDM, RDM, RWB, LB, LCB, CB, RCB, RB, Crossing, Finishing, Heading, Accuracy, ShortPassing, Volleys, Dribbling, Curve, FKAccuracy, LongPassing, BallControl, Acceleration, SprintSpeed, Agility, Reactions, Balance, ShotPower, Jumping, Stamina, Strength, LongShots, Aggression, Interceptions, Positioning, Vision, Penalties, Composure, Marking, StandingTackle, SlidingTackle, GKDiving, GKHandling, GKKicking, GKPositioning, GKReflexes, and Release Clause.

We are going to predict the Player Value based on these variables.

f <- read.csv("fifa19.csv")

1.2.2 Remove Un-necessary Column

We’re going to remove column line number, ID, Name, Photo, Flag, Club Logo, and Player Real Face, as we don’t need those variables for our prediction model

f <- f[, c("Age", "Value","Wage", "Height", "Weight", "Release.Clause")]
# f <- f[,-c(1,2,3,5,7,11,21)]

1.2.3 Extract Value, Wage & Release Clause number from currency format

f$Value <- as.numeric(str_extract_all(f$Value,"\\(?[0-9,.]+\\)?", simplify = T))
f$Wage <- as.numeric(str_extract_all(f$Wage,"\\(?[0-9,.]+\\)?", simplify = T))
f$Release.Clause <- as.numeric(str_extract_all(f$Release.Clause,"\\(?[0-9,.]+\\)?", simplify = T))

1.2.4 Convert Height from Imperial to Metric unit, and Extract Number from weight

f$Height <- unlist(lapply(f$Height, feetToCm))
f$Weight <- as.numeric(str_remove(f$Weight, "lbs"))

1.2.5 Remove NA rows

f <- f[complete.cases(f),]

2 Finding Principal Components

f.scale <- scale(f)
f.pc <- PCA(f.scale, graph = F)

2.1 Determine the number of dimensions (NCP)

f.pc$eig[,3]
##    comp 1    comp 2    comp 3    comp 4    comp 5    comp 6 
##  30.68356  55.72634  71.75883  84.37016  96.17137 100.00000

As we can see from the result, if we want to get 80% of the information at the least, it will be optimal to choose 4 PCs.

2.2 Reconstruct the dataset using 4 NCP

f.new <- as.data.frame(reconst(f.pc, ncp=4))

3 K-Means Clustering

Iterate K for 3 <= K <= 7, as a practical optimal number of K

set.seed(2902)
res <- data.frame(k=integer(),
                  withinss=numeric(),
                  tot.withinss=numeric(),
                  betweenss=numeric(),
                  totss=numeric(),
                  bet.per.tot=numeric()
                  )
for (i in 3:7) {
  km <- kmeans(f.new, i)
  nr <- data.frame(
    k=i,
    withinss=km$withinss, 
    tot.withinss=km$tot.withinss, 
    betweenss=km$betweenss, 
    totss=km$totss, 
    size=km$size,
    bet.per.tot = km$betweenss/km$totss)
  res <-rbind(res, nr)
}

3.1 Analyze BetweenSS/TotSS value

res %>% group_by(k) %>% summarise(bet.per.tot = max(bet.per.tot))
## # A tibble: 5 x 2
##       k bet.per.tot
##   <int>       <dbl>
## 1     3       0.377
## 2     4       0.454
## 3     5       0.529
## 4     6       0.583
## 5     7       0.626

As we can see from the above data, the betweenSS/totSS value is quite good regardless for K number > 4 (above 50%)

3.2 Find the optimal K

Analyzing the elbow plot of the tot.withinss, the trend is good if we pick the K number 5

plot(y=res$tot.withinss, type="b", x = res$k)

km <- kmeans(f.scale,5)
fviz_cluster(km, f.scale, labelsize = 0)

4 Cluster Profiling

f$Clust <- as.factor(km$cluster)
knitr::kable(f %>% group_by(Clust) %>% 
  summarise_all(mean))
Clust Age Value Wage Height Weight Release.Clause
1 20.62807 136.52603 3.829934 183.7927 167.4745 174.14112
2 23.40874 534.73612 2.731721 175.6028 152.7457 356.40459
3 28.71961 18.57459 29.671048 186.3881 180.4677 30.86302
4 26.67077 29.76741 13.555180 175.3295 154.4943 46.01405
5 27.31231 523.03887 3.262300 185.9288 177.1147 458.42618

From the above data, we can see that

  1. Cluster 1 contains group of player with lowest Age
  2. Cluster 2 contains group of player with highest Market Value
  3. Cluster 3 contains group of player with highest Wage, highest in Age, highest in Weight and lowest in Market Value (we can see that this is the group of veteran world class player)
  4. Cluster 4 contains group of player with average stats
  5. Cluster 5 contains group with highest Release Clause