1 IDL

library(tidyverse)
library(DALEX)
library(keras)
library(titanic)
library(fastDummies)
library(h2o)
set.seed(123)

# Data preparation and cleaning
titanic_small <- titanic_train %>%
  select(Survived, Pclass, Sex, Age, SibSp, Parch, Fare, Embarked) %>%
  mutate_at(c("Survived", "Sex", "Embarked"), as.factor) %>%
  mutate(Family_members = SibSp + Parch) %>% # Calculate family members
  na.omit() %>%
  dummy_cols() %>%
  select(-Sex, -Embarked, -Survived_0, -Survived_1, -Parch, -SibSp)
print(head(titanic_small))
##   Survived Pclass Age    Fare Family_members Sex_female Sex_male Embarked_
## 1        0      3  22  7.2500              1          0        1         0
## 2        1      1  38 71.2833              1          1        0         0
## 3        1      3  26  7.9250              0          1        0         0
## 4        1      1  35 53.1000              1          1        0         0
## 5        0      3  35  8.0500              0          0        1         0
## 6        0      1  54 51.8625              0          0        1         0
##   Embarked_C Embarked_Q Embarked_S
## 1          0          0          1
## 2          1          0          0
## 3          0          0          1
## 4          0          0          1
## 5          0          0          1
## 6          0          0          1
# Data preprocessing for Keras
titanic_small_y <- titanic_small %>% select(Survived) %>% mutate(Survived = as.numeric(as.character(Survived))) %>% as.matrix()
titanic_small_x <- titanic_small %>% select(-Survived) %>% as.matrix()


h2o.init()
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         17 minutes 22 seconds 
##     H2O cluster timezone:       America/Chicago 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.28.0.2 
##     H2O cluster version age:    1 month and 4 days  
##     H2O cluster name:           H2O_started_from_R_subas_osn291 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   0.60 GB 
##     H2O cluster total cores:    8 
##     H2O cluster allowed cores:  8 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Amazon S3, Algos, AutoML, Core V3, TargetEncoder, Core V4 
##     R Version:                  R version 3.6.0 (2019-04-26)
h2o.no_progress()

titanic_h2o <- as.h2o(titanic_small, destination_frame = "titanic_small")

model_h2o <- h2o.deeplearning(x = 2:11,
                              y = 1,
                              training_frame = "titanic_small",
                              activation = "Rectifier", # ReLU as activation functions
                              hidden = c(16, 8), # Two hidden layers with 16 and 8 neurons
                              epochs = 100,
                              rate = 0.01, # Learning rate
                              adaptive_rate = FALSE, # Simple SGD
                              loss = "CrossEntropy")


model_keras <- keras_model_sequential() %>% # Initialization
  layer_dense(units = 16, # 16 neurons in first hidden layer
              activation = "relu", # ReLU as activation function
              input_shape = c(10)) %>% # Ten inputs
  layer_dense(units = 8, activation = "relu") %>%
  layer_dense(units = 1, activation = "sigmoid")

model_keras %>% compile(
  optimizer = optimizer_sgd(lr = 0.01), # Simple SGD with learning rate 0.01
  loss = "binary_crossentropy",
  metrics = c("accuracy")
)

history <- model_keras %>% fit(
  titanic_small_x,
  titanic_small_y,
  epochs = 100,
  validation_split = 0.2
)

henry <- data.frame(
  Pclass = 1,
  Age = 8,
  Family_members = 0,
  Fare = 72,
  Sex_male = 1,
  Sex_female = 0,
  Embarked_S = 0,
  Embarked_C = 1,
  Embarked_Q = 0,
  Embarked_ = 0
)

henry_h2o <- as.h2o(henry, destination_frame = "henry")
henry_keras <- as.matrix(henry)
predict(model_h2o, henry_h2o) %>% print()
##   predict        p0        p1
## 1       1 0.3269735 0.6730265
## 
## [1 row x 3 columns]
predict(model_keras, henry_keras) %>% print()
##             [,1]
## [1,] 0.007204166
custom_predict_h2o <- function(model, newdata)  {
  newdata_h2o <- as.h2o(newdata)
  res <- as.data.frame(h2o.predict(model, newdata_h2o))
  return(as.numeric(res$p1))
}

explainer_titanic_h2o <- explain(model = model_h2o, data = titanic_small[ , -1], y = as.numeric(as.character(titanic_small$Survived)),
                                 predict_function = custom_predict_h2o, label = "MLP_h2o", colorize = FALSE)
## Preparation of a new explainer is initiated
##   -> model label       :  MLP_h2o 
##   -> data              :  714  rows  10  cols 
##   -> target variable   :  714  values 
##   -> model_info        :  package Model of class: H2OBinomialModel package unrecognized , ver. Unknown , task regression (  default  ) 
##   -> predict function  :  custom_predict_h2o 
##   -> predicted values  :  numerical, min =  0.001006115 , mean =  0.3679414 , max =  0.9999982  
##   -> residual function :  difference between y and yhat (  default  )
##   -> residuals         :  numerical, min =  -0.9671573 , mean =  0.03822109 , max =  0.9911428  
##   A new explainer has been created!
explainer_titanic_keras <- explain(model = model_keras, data = titanic_small_x, y = as.numeric(titanic_small_y),
                                   predict_function = predict, label = "MLP_keras", colorize = FALSE)
## Preparation of a new explainer is initiated
##   -> model label       :  MLP_keras 
##   -> data              :  714  rows  10  cols 
##   -> target variable   :  714  values 
##   -> model_info        :  package Model of class: keras.engine.sequential.Sequential package unrecognized , ver. Unknown , task regression (  default  ) 
##   -> predict function  :  predict 
##   -> predicted values  :  predict function returns multiple columns:  1  (  WARNING  ) some of functionalities may not work 
##   -> residual function :  difference between y and yhat (  default  )
##   -> residuals         :  numerical, min =  -0.5650429 , mean =  0.05039686 , max =  0.9536537  
##   A new explainer has been created!
mp_titinic_h2o <- model_performance(explainer_titanic_h2o)
mp_titanic_keras <- model_performance(explainer_titanic_keras)


mp_titinic_h2o
## Measures for:  regression
## mse      : 0.1405328 
## rmse     : 0.374877 
## r2       : 0.4173467 
## mad      : 0.1055516
## 
## Residuals:
##          0%         10%         20%         30%         40%         50% 
## -0.96715730 -0.33671202 -0.12363344 -0.09299802 -0.07156105 -0.04507381 
##         60%         70%         80%         90%        100% 
##  0.01005732  0.08841553  0.26663513  0.63452530  0.99114279
mp_titanic_keras
## Measures for:  regression
## mse      : 0.2052927 
## rmse     : 0.4530924 
## r2       : 0.14885 
## mad      : 0.4349571
## 
## Residuals:
##         0%        10%        20%        30%        40%        50%        60% 
## -0.5650429 -0.4774194 -0.3342696 -0.2892744 -0.2374322 -0.1530977  0.4349571 
##        70%        80%        90%       100% 
##  0.4349571  0.5323707  0.6816853  0.9536537
library(ingredients)

plot(mp_titinic_h2o, mp_titanic_keras)

plot(mp_titinic_h2o, mp_titanic_keras, geom = "boxplot")

vi_titinic_h2o <- variable_importance(explainer_titanic_h2o)
vi_titinic_keras <- variable_importance(explainer_titanic_keras)

plot(vi_titinic_h2o, vi_titinic_keras)

vi_titinic_h2o <- variable_importance(explainer_titanic_h2o, type="difference")
vi_titinic_keras <- variable_importance(explainer_titanic_keras, type="difference")
plot(vi_titinic_h2o, vi_titinic_keras)

vr_age_h2o  <- variable_effect(explainer_titanic_h2o, variable =  "Age")
vr_age_keras  <- variable_effect(explainer_titanic_keras, variable =  "Age")
plot(vr_age_h2o, vr_age_keras)