Read data

Read the data from the kaggle website https://www.kaggle.com/karangadiya/fifa19.

library("readr")
fifa19 <- as.data.frame(read_csv("G:/MODEL-STUDIO/data.csv"))
#fifa19 <- fifa18[,-1]

Data Preprocessing

Transform Value into a standard numeric.

fifa19$Value <- substr(fifa19$Value,2,200)
fifa19$ValueNum <- sapply(as.character(fifa19$Value), function(x) {
  unit <- substr(x, nchar(x), nchar(x))
  if (unit == "M") return (as.numeric(substr(x, 1, nchar(x)-1)) * 1000000)
  if (unit == "K") return (as.numeric(substr(x, 1, nchar(x)-1)) * 1000)
  as.numeric(x)
})
rownames(fifa19) <- make.names(fifa19$Name, unique = TRUE)

Feature selection

Let’s select only features related to player characteristics.

fifa19_selected <- fifa19[ ,c(4,8,14:18,55:88,90)]

fifa19_selected$`Preferred Foot` <- factor(fifa19_selected$`Preferred Foot`)

Create a gbm model

Let’s use gbm library to create a gbm model with 250 trees 3 levels deep.

set.seed(1313)
library("gbm")
fifa_gbm <- gbm(ValueNum~.-Overall, data = fifa19_selected, n.trees = 250, interaction.depth = 3)
## Distribution not specified, assuming gaussian ...

Create a DALEX explainer

Let’s wrap gbm model into a DALEX explainer.

library("DALEX")
fifa_gbm_exp <- explain(fifa_gbm, 
                        data = fifa19_selected, 
                        y = fifa19_selected$ValueNum^2, 
                        predict_function = function(m,x) 
                          predict(m, x, n.trees = 250)^2)
## Preparation of a new explainer is initiated
##   -> model label       :  gbm  (default)
##   -> data              :  18207  rows  42  cols 
##   -> target variable   :  18207  values 
##   -> predict function  :  function(m, x) predict(m, x, n.trees = 250)^2 
##   -> predicted values  :  numerical, min =  226416.2 , mean =  3.52238e+13 , max =  1.206726e+16  
##   -> residual function :  difference between y and yhat (default)
##   -> residuals         :  numerical, min =  -1.136101e+15 , mean =  1.889204e+12 , max =  1.974986e+15  
## A new explainer has been created!

Feature Importance explainer

Calculate Feature Importnace explainer.

library("ingredients")
fifa_feat <- ingredients::feature_importance(fifa_gbm_exp)
plot(fifa_feat, max_vars = 12)

Partial Dependency explainer

Calculate Partial Dependency explainer.

fifa19_pd <- ingredients::partial_dependency(fifa_gbm_exp, variables = "Age")
plot(fifa19_pd)

Ceteris Paribus explainer

Calculate Ceteris Paribus explainer.

fifa19_cp_pg <- ingredients::ceteris_paribus(fifa_gbm_exp, new_observation = fifa19_selected["P..Gulácsi",], variables = "Age", variable_splits = list(Age = seq(18,45,0.1)))
plot(fifa19_cp_pg)

Break Down explainer

Calculate Break Down explainer.

library("iBreakDown")
fifa_pg_gbm <- break_down(fifa_gbm_exp, new_observation = fifa19_selected["P..Gulácsi",])
plot(fifa_pg_gbm)

fifa_pg_gbm$label = "Break Down for Péter Gulácsi (GBM model)"
library("ggplot2")
library("scales")
plot(fifa_pg_gbm, digits = 0) +  
  scale_y_continuous(labels = dollar_format(suffix = "€", prefix = ""), name = "Estimated value", limits = 1000000*c(2,13), breaks = 1000000*c(2.5,5,7.5,10))

modelStudio app

Calculate modelStudio dashboard.

library("modelStudio")
options(
    parallelMap.default.mode        = "socket",
    parallelMap.default.cpus        = 4,
    parallelMap.default.show.info   = FALSE
)
nationality <- c("England", "France", "Argentina", "Spain", "Germany", "Sweden", "Netherlands", "Japan", "Poland", "United States", "Mexico", "Denmark", "Saudi Arabia", "Norway", "Argentina", "Portugal", "Brasil")
library(dplyr)
fifa19 %>% 
  filter(Nationality %in% nationality) %>%
  group_by(Nationality) %>%
  arrange(-ValueNum) %>%
  top_n(2) %>%
  select(Name, Nationality) %>%
  as.data.frame() -> players
  
selected <- c("Cristiano.Ronaldo", "L..Messi","R..Lewandowski", "W..Szczęsny", "P..Gulácsi","A..Szalai", "Neymar.Jr",
"P..Dybala", "H..Kane", "K..Mbappé", "A..Griezmann", 
"T..Kroos", "Isco", "C..Eriksen", "De.Gea", 
"L..Sané", "R..Sterling", "V..van.Dijk", "M..Depay", 
"E..Forsberg", "H..Lozano", "T..Delaney", "A..Christensen", "J..Corona", 
"C..Pulisic", "S..Kagawa", "M..Elyounoussi", "V..Lindelöf", 
"T..Inui", "J..Brooks", "S..Nakajima", "R..Jarstein", "S..Al.Faraj", 
"S..Al.Dawsari", "Bernardo.Silva") 
names_sel <- c("Cristiano Ronaldo (PT)", "Lionel Messi (AR)", "Robert Lewandowski (PL)", "Wojciech Szczęsny (PL)", "Péter Gulácsi (HU)","Ádám Szalai (HU)", "Neymar Jr (BR)",
"Paulo Dybala (AR)", "Harry Kane (UK)", "Kylian Mbappé (FR)", "Antoine Griezmann (FR)", "Toni Kroos (DE)", "Isco (ES)", "Christian Eriksen (DK)", 
"De Gea (ES)", "Leroy Sané (DE)", "Raheem Sterling (UK)", "Virgil van Dijk (NL)", "Memphis Depay (NL)", "Emil Forsberg (SE)", "Hirving Lozano (MX)", "Thomas Delaney (DK)", "Andreas Christensen (DK)", "Jesús Corona (MX)", "Christian Pulisic (US)", "Shinji Kagawa (JP)", "Mohamed Elyounoussi (NO)", "Victor Lindelöf (SE)", "Takashi Inui (JP)", "John Brooks (US)", "Shoya Nakajima (JP)", "Rune Almenning Jarstein (NO)", "Salman Al Faraj (SA)", "Salem Al Dawsari (SA)", "Bernardo Silva (PT)")
fifa_selected6 <- fifa19_selected[selected, ]
rownames(fifa_selected6) <- names_sel
fifa_selected6 <- fifa_selected6[c(1,35,2,8,3:7,9:34),]
modelStudio(fifa_gbm_exp, 
                         new_observation = fifa_selected6,
                         B = 5, N = 300,
                         parallel = TRUE,
                         options = modelStudioOptions(margin_left = 125, margin_ytitle = 90),
                         digits = 0)