1 Initialization

1.1 Library Call & Function Definition

library(caret)
library(dplyr)
library(stringr)
library(measurements)
library(e1071)

feetToCm <- function(x){
  feet <- as.numeric(str_split(x, "'", simplify = T)[1])
  inch <- as.numeric(str_split(x, "'", simplify = T)[2])
  cm <- conv_unit(feet, from = "ft", to = "cm") +
    conv_unit(inch, from = "inch", to = "cm")
  cm
}

posRating <- function(x){
  pr <- as.numeric(str_split(x, "\\+", simplify = T)[1]) +
    as.numeric(str_split(x, "\\+", simplify = T)[2])
  pr
}

1.2 Data Pre-Processing

1.2.1 Read Data Source

We are using FIFA 2019 Player dataset from Kaggle:

This data includes latest edition FIFA 2019 players attributes like Age, Nationality, Overall, Potential, Club, Value, Wage, Preferred Foot, International Reputation, Weak Foot, Skill Moves, Work Rate, Position, Jersey Number, Joined, Loaned From, Contract Valid Until, Height, Weight, LS, ST, RS, LW, LF, CF, RF, RW, LAM, CAM, RAM, LM, LCM, CM, RCM, RM, LWB, LDM, CDM, RDM, RWB, LB, LCB, CB, RCB, RB, Crossing, Finishing, Heading, Accuracy, ShortPassing, Volleys, Dribbling, Curve, FKAccuracy, LongPassing, BallControl, Acceleration, SprintSpeed, Agility, Reactions, Balance, ShotPower, Jumping, Stamina, Strength, LongShots, Aggression, Interceptions, Positioning, Vision, Penalties, Composure, Marking, StandingTackle, SlidingTackle, GKDiving, GKHandling, GKKicking, GKPositioning, GKReflexes, and Release Clause.

We are going to predict the Player Value based on these variables.

f <- read.csv("fifa19.csv")

1.2.2 Remove Un-necessary Column

We’re going to remove column line number, ID, Name, Photo, Flag, Club Logo, and Player Real Face, as we don’t need those variables for our prediction model

f <- f[,-c(1,2,3,5,7,11,21)]

1.2.3 Remove NA rows and Subset 10% of The whole data to speed-up Rando Forest Modeling

f <- f[complete.cases(f),]
set.seed(2902)
f <- f[sample(16643, 16643*0.1), ]

1.2.4 Extract Value, Wage & Release Clause number from currency format

f$Value <- as.numeric(str_extract_all(f$Value,"\\(?[0-9,.]+\\)?", simplify = T))
f$Wage <- as.numeric(str_extract_all(f$Wage,"\\(?[0-9,.]+\\)?", simplify = T))
f$Release.Clause <- as.numeric(str_extract_all(f$Release.Clause,"\\(?[0-9,.]+\\)?", simplify = T))

1.2.5 Convert Height from Imperial to Metric unit, and Extract Number from weight

f$Height <- unlist(lapply(f$Height, feetToCm))
f$Weight <- as.numeric(str_remove(f$Weight, "lbs"))

1.2.6 Convert Position Rating Value

f[,22:47] <- apply(f[,22:47], c(1,2), posRating)

1.2.7 Convert Contract Validity

f$Contract.Valid.Until <- 
  as.integer(str_sub(f$Contract.Valid.Until, start = -4, end = -1))

1.2.8 Generate final Dataset

As the origin of Value column is a numeric, we’re going to normalize it into 5 factors based on it’s quantile value. After that, we’re going to remove variables with near zero variance.

breaks.v <- quantile(f$Value, probs = c(0,20,40,60,80,100)/100)
f.v <- f
f.v$Value <- cut(f$Value, breaks = breaks.v)
levels(f.v$Value) <- c("Very Cheap", "Cheap", "Mainstream", "Expensive", "Very Expensive")
f.v <- f.v[,c(6,1:5,7:82)]
n0v.v <- nearZeroVar(f.v)
f.v <- f.v[,-n0v.v]
f.v <- f.v[complete.cases(f.v),]

1.2.9 Prepare Data Train and Data Test

set.seed(2902)

f_intrain <- sample(nrow(f.v), nrow(f.v)*0.8)
f.v_train <- f.v[f_intrain, ]
f.v_test <- f.v[-f_intrain, ]

1.2.10 Check Data Train and Data Test

prop.table(table(f.v_train$Value))
## 
##     Very Cheap          Cheap     Mainstream      Expensive Very Expensive 
##      0.1979742      0.2145488      0.1952118      0.1970534      0.1952118
prop.table(table(f.v_test$Value))
## 
##     Very Cheap          Cheap     Mainstream      Expensive Very Expensive 
##      0.2022059      0.1838235      0.1875000      0.2205882      0.2058824

2 Random Forest Model

2.1 Create Random Forest Model

We’ve done multiple trial experiments on creating the Random Forest with different combination of Number of Folds and Repeats. In order to save the processing time,

set.seed(1507)
ctrl21 <- trainControl(method="repeatedcv", number=2, repeats=1)
ctrl22 <- trainControl(method="repeatedcv", number=2, repeats=2)
ctrl53 <- trainControl(method="repeatedcv", number=5, repeats=3)
ctrl105 <- trainControl(method="repeatedcv", number=10, repeats=5)

saveRDS(train(Value ~ ., data=f.v_train, method="rf", trControl = ctrl21),
        "fv_Lite_2N1R.RDS")
saveRDS(train(Value ~ ., data=f.v_train, method="rf", trControl = ctrl22),
        "fv_Lite_2N2R.RDS")
saveRDS(train(Value ~ ., data=f.v_train, method="rf", trControl = ctrl53),
        "fv_Lite_5N3R.RDS")
saveRDS(train(Value ~ ., data=f.v_train, method="rf", trControl = ctrl105),
        "fv_Lite_10N5R.RDS")

2.2 Read Random Forest Model

3 Model Evaluation

We’re going to evaluate all four models we created before

f21 <- readRDS("fv_Lite_2N1R.RDS")
f22 <- readRDS("fv_Lite_2N2R.RDS")
f53 <- readRDS("fv_Lite_5N3R.RDS")
f105 <- readRDS("fv_Lite_10N5R.RDS")

3.1 Final Model Error Plot and Model Summary

plot(f21$finalModel)



f21$finalModel



plot(f22$finalModel)



f22$finalModel



plot(f53$finalModel)



f53$finalModel



plot(f105$finalModel)



f105$finalModel



3.2 Confusion Matrix Approach

We could also calculate the confusion matrix for those model, to preview how good the model on predicting unseen data

confusionMatrix(predict(f21, f.v_test[,-1]), f.v_test[,1])
## Confusion Matrix and Statistics
## 
##                 Reference
## Prediction       Very Cheap Cheap Mainstream Expensive Very Expensive
##   Very Cheap             50     8          0         0              2
##   Cheap                   0    42          5         0              0
##   Mainstream              0     0         36         1              0
##   Expensive               0     0         10        51              2
##   Very Expensive          5     0          0         8             52
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8493          
##                  95% CI : (0.8011, 0.8896)
##     No Information Rate : 0.2206          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8111          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Very Cheap Class: Cheap Class: Mainstream
## Sensitivity                     0.9091       0.8400            0.7059
## Specificity                     0.9539       0.9775            0.9955
## Pos Pred Value                  0.8333       0.8936            0.9730
## Neg Pred Value                  0.9764       0.9644            0.9362
## Prevalence                      0.2022       0.1838            0.1875
## Detection Rate                  0.1838       0.1544            0.1324
## Detection Prevalence            0.2206       0.1728            0.1360
## Balanced Accuracy               0.9315       0.9087            0.8507
##                      Class: Expensive Class: Very Expensive
## Sensitivity                    0.8500                0.9286
## Specificity                    0.9434                0.9398
## Pos Pred Value                 0.8095                0.8000
## Neg Pred Value                 0.9569                0.9807
## Prevalence                     0.2206                0.2059
## Detection Rate                 0.1875                0.1912
## Detection Prevalence           0.2316                0.2390
## Balanced Accuracy              0.8967                0.9342
confusionMatrix(predict(f22, f.v_test[,-1]), f.v_test[,1])
## Confusion Matrix and Statistics
## 
##                 Reference
## Prediction       Very Cheap Cheap Mainstream Expensive Very Expensive
##   Very Cheap             50     8          0         0              2
##   Cheap                   0    42          5         0              0
##   Mainstream              0     0         34         1              0
##   Expensive               0     0         12        51              2
##   Very Expensive          5     0          0         8             52
## 
## Overall Statistics
##                                          
##                Accuracy : 0.8419         
##                  95% CI : (0.793, 0.8832)
##     No Information Rate : 0.2206         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.8018         
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: Very Cheap Class: Cheap Class: Mainstream
## Sensitivity                     0.9091       0.8400            0.6667
## Specificity                     0.9539       0.9775            0.9955
## Pos Pred Value                  0.8333       0.8936            0.9714
## Neg Pred Value                  0.9764       0.9644            0.9283
## Prevalence                      0.2022       0.1838            0.1875
## Detection Rate                  0.1838       0.1544            0.1250
## Detection Prevalence            0.2206       0.1728            0.1287
## Balanced Accuracy               0.9315       0.9087            0.8311
##                      Class: Expensive Class: Very Expensive
## Sensitivity                    0.8500                0.9286
## Specificity                    0.9340                0.9398
## Pos Pred Value                 0.7846                0.8000
## Neg Pred Value                 0.9565                0.9807
## Prevalence                     0.2206                0.2059
## Detection Rate                 0.1875                0.1912
## Detection Prevalence           0.2390                0.2390
## Balanced Accuracy              0.8920                0.9342
confusionMatrix(predict(f53, f.v_test[,-1]), f.v_test[,1])
## Confusion Matrix and Statistics
## 
##                 Reference
## Prediction       Very Cheap Cheap Mainstream Expensive Very Expensive
##   Very Cheap             50     8          0         0              2
##   Cheap                   0    42          5         0              0
##   Mainstream              0     0         34         0              0
##   Expensive               0     0         12        52              2
##   Very Expensive          5     0          0         8             52
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8456          
##                  95% CI : (0.7971, 0.8864)
##     No Information Rate : 0.2206          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8064          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Very Cheap Class: Cheap Class: Mainstream
## Sensitivity                     0.9091       0.8400            0.6667
## Specificity                     0.9539       0.9775            1.0000
## Pos Pred Value                  0.8333       0.8936            1.0000
## Neg Pred Value                  0.9764       0.9644            0.9286
## Prevalence                      0.2022       0.1838            0.1875
## Detection Rate                  0.1838       0.1544            0.1250
## Detection Prevalence            0.2206       0.1728            0.1250
## Balanced Accuracy               0.9315       0.9087            0.8333
##                      Class: Expensive Class: Very Expensive
## Sensitivity                    0.8667                0.9286
## Specificity                    0.9340                0.9398
## Pos Pred Value                 0.7879                0.8000
## Neg Pred Value                 0.9612                0.9807
## Prevalence                     0.2206                0.2059
## Detection Rate                 0.1912                0.1912
## Detection Prevalence           0.2426                0.2390
## Balanced Accuracy              0.9003                0.9342
confusionMatrix(predict(f105, f.v_test[,-1]), f.v_test[,1])
## Confusion Matrix and Statistics
## 
##                 Reference
## Prediction       Very Cheap Cheap Mainstream Expensive Very Expensive
##   Very Cheap             50     8          0         0              2
##   Cheap                   0    42          5         0              0
##   Mainstream              0     0         34         1              0
##   Expensive               0     0         12        51              2
##   Very Expensive          5     0          0         8             52
## 
## Overall Statistics
##                                          
##                Accuracy : 0.8419         
##                  95% CI : (0.793, 0.8832)
##     No Information Rate : 0.2206         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.8018         
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: Very Cheap Class: Cheap Class: Mainstream
## Sensitivity                     0.9091       0.8400            0.6667
## Specificity                     0.9539       0.9775            0.9955
## Pos Pred Value                  0.8333       0.8936            0.9714
## Neg Pred Value                  0.9764       0.9644            0.9283
## Prevalence                      0.2022       0.1838            0.1875
## Detection Rate                  0.1838       0.1544            0.1250
## Detection Prevalence            0.2206       0.1728            0.1287
## Balanced Accuracy               0.9315       0.9087            0.8311
##                      Class: Expensive Class: Very Expensive
## Sensitivity                    0.8500                0.9286
## Specificity                    0.9340                0.9398
## Pos Pred Value                 0.7846                0.8000
## Neg Pred Value                 0.9565                0.9807
## Prevalence                     0.2206                0.2059
## Detection Rate                 0.1875                0.1912
## Detection Prevalence           0.2390                0.2390
## Balanced Accuracy              0.8920                0.9342


## Conclusion Having try to create the model with different iteration, we could see that the best model was using 5 folds and 3 repeats, with the least OOB (11.97%). Furthermore, while inspecting the confusion matrix of each model, we could also see that the chosen model have the best accuracy (85.56%)

4 Naive Bayes Model

Creating naive bayes model and confusion matrix for our data model yields:

f.v_nb <- naiveBayes(Value ~ ., f.v_train)
confusionMatrix(predict(f.v_nb, f.v_test[,-1]), f.v_test[,1])
## Confusion Matrix and Statistics
## 
##                 Reference
## Prediction       Very Cheap Cheap Mainstream Expensive Very Expensive
##   Very Cheap             43     5          0         7             18
##   Cheap                   8    43          2         0              0
##   Mainstream              0     0         19        15              6
##   Expensive               2     0         28        28             14
##   Very Expensive          2     2          2        10             18
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5551          
##                  95% CI : (0.4939, 0.6152)
##     No Information Rate : 0.2206          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4427          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Very Cheap Class: Cheap Class: Mainstream
## Sensitivity                     0.7818       0.8600           0.37255
## Specificity                     0.8618       0.9550           0.90498
## Pos Pred Value                  0.5890       0.8113           0.47500
## Neg Pred Value                  0.9397       0.9680           0.86207
## Prevalence                      0.2022       0.1838           0.18750
## Detection Rate                  0.1581       0.1581           0.06985
## Detection Prevalence            0.2684       0.1949           0.14706
## Balanced Accuracy               0.8218       0.9075           0.63876
##                      Class: Expensive Class: Very Expensive
## Sensitivity                    0.4667               0.32143
## Specificity                    0.7925               0.92593
## Pos Pred Value                 0.3889               0.52941
## Neg Pred Value                 0.8400               0.84034
## Prevalence                     0.2206               0.20588
## Detection Rate                 0.1029               0.06618
## Detection Prevalence           0.2647               0.12500
## Balanced Accuracy              0.6296               0.62368


The accuracy is getting near the random forest model, therefore we could using random forest model instead of this one