IDL
library(tidyverse)
library(DALEX)
library(keras)
library(titanic)
library(fastDummies)
library(h2o)
set.seed(123)
# Data preparation and cleaning
titanic_small <- titanic_train %>%
select(Survived, Pclass, Sex, Age, SibSp, Parch, Fare, Embarked) %>%
mutate_at(c("Survived", "Sex", "Embarked"), as.factor) %>%
mutate(Family_members = SibSp + Parch) %>% # Calculate family members
na.omit() %>%
dummy_cols() %>%
select(-Sex, -Embarked, -Survived_0, -Survived_1, -Parch, -SibSp)
print(head(titanic_small))
## Survived Pclass Age Fare Family_members Sex_female Sex_male Embarked_
## 1 0 3 22 7.2500 1 0 1 0
## 2 1 1 38 71.2833 1 1 0 0
## 3 1 3 26 7.9250 0 1 0 0
## 4 1 1 35 53.1000 1 1 0 0
## 5 0 3 35 8.0500 0 0 1 0
## 6 0 1 54 51.8625 0 0 1 0
## Embarked_C Embarked_Q Embarked_S
## 1 0 0 1
## 2 1 0 0
## 3 0 0 1
## 4 0 0 1
## 5 0 0 1
## 6 0 0 1
# Data preprocessing for Keras
titanic_small_y <- titanic_small %>% select(Survived) %>% mutate(Survived = as.numeric(as.character(Survived))) %>% as.matrix()
titanic_small_x <- titanic_small %>% select(-Survived) %>% as.matrix()
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 17 minutes 22 seconds
## H2O cluster timezone: America/Chicago
## H2O data parsing timezone: UTC
## H2O cluster version: 3.28.0.2
## H2O cluster version age: 1 month and 4 days
## H2O cluster name: H2O_started_from_R_subas_osn291
## H2O cluster total nodes: 1
## H2O cluster total memory: 0.60 GB
## H2O cluster total cores: 8
## H2O cluster allowed cores: 8
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Amazon S3, Algos, AutoML, Core V3, TargetEncoder, Core V4
## R Version: R version 3.6.0 (2019-04-26)
h2o.no_progress()
titanic_h2o <- as.h2o(titanic_small, destination_frame = "titanic_small")
model_h2o <- h2o.deeplearning(x = 2:11,
y = 1,
training_frame = "titanic_small",
activation = "Rectifier", # ReLU as activation functions
hidden = c(16, 8), # Two hidden layers with 16 and 8 neurons
epochs = 100,
rate = 0.01, # Learning rate
adaptive_rate = FALSE, # Simple SGD
loss = "CrossEntropy")
model_keras <- keras_model_sequential() %>% # Initialization
layer_dense(units = 16, # 16 neurons in first hidden layer
activation = "relu", # ReLU as activation function
input_shape = c(10)) %>% # Ten inputs
layer_dense(units = 8, activation = "relu") %>%
layer_dense(units = 1, activation = "sigmoid")
model_keras %>% compile(
optimizer = optimizer_sgd(lr = 0.01), # Simple SGD with learning rate 0.01
loss = "binary_crossentropy",
metrics = c("accuracy")
)
history <- model_keras %>% fit(
titanic_small_x,
titanic_small_y,
epochs = 100,
validation_split = 0.2
)
henry <- data.frame(
Pclass = 1,
Age = 8,
Family_members = 0,
Fare = 72,
Sex_male = 1,
Sex_female = 0,
Embarked_S = 0,
Embarked_C = 1,
Embarked_Q = 0,
Embarked_ = 0
)
henry_h2o <- as.h2o(henry, destination_frame = "henry")
henry_keras <- as.matrix(henry)
predict(model_h2o, henry_h2o) %>% print()
## predict p0 p1
## 1 1 0.3269735 0.6730265
##
## [1 row x 3 columns]
predict(model_keras, henry_keras) %>% print()
## [,1]
## [1,] 0.007204166
custom_predict_h2o <- function(model, newdata) {
newdata_h2o <- as.h2o(newdata)
res <- as.data.frame(h2o.predict(model, newdata_h2o))
return(as.numeric(res$p1))
}
explainer_titanic_h2o <- explain(model = model_h2o, data = titanic_small[ , -1], y = as.numeric(as.character(titanic_small$Survived)),
predict_function = custom_predict_h2o, label = "MLP_h2o", colorize = FALSE)
## Preparation of a new explainer is initiated
## -> model label : MLP_h2o
## -> data : 714 rows 10 cols
## -> target variable : 714 values
## -> model_info : package Model of class: H2OBinomialModel package unrecognized , ver. Unknown , task regression ( default )
## -> predict function : custom_predict_h2o
## -> predicted values : numerical, min = 0.001006115 , mean = 0.3679414 , max = 0.9999982
## -> residual function : difference between y and yhat ( default )
## -> residuals : numerical, min = -0.9671573 , mean = 0.03822109 , max = 0.9911428
## A new explainer has been created!
explainer_titanic_keras <- explain(model = model_keras, data = titanic_small_x, y = as.numeric(titanic_small_y),
predict_function = predict, label = "MLP_keras", colorize = FALSE)
## Preparation of a new explainer is initiated
## -> model label : MLP_keras
## -> data : 714 rows 10 cols
## -> target variable : 714 values
## -> model_info : package Model of class: keras.engine.sequential.Sequential package unrecognized , ver. Unknown , task regression ( default )
## -> predict function : predict
## -> predicted values : predict function returns multiple columns: 1 ( WARNING ) some of functionalities may not work
## -> residual function : difference between y and yhat ( default )
## -> residuals : numerical, min = -0.5650429 , mean = 0.05039686 , max = 0.9536537
## A new explainer has been created!
mp_titinic_h2o <- model_performance(explainer_titanic_h2o)
mp_titanic_keras <- model_performance(explainer_titanic_keras)
mp_titinic_h2o
## Measures for: regression
## mse : 0.1405328
## rmse : 0.374877
## r2 : 0.4173467
## mad : 0.1055516
##
## Residuals:
## 0% 10% 20% 30% 40% 50%
## -0.96715730 -0.33671202 -0.12363344 -0.09299802 -0.07156105 -0.04507381
## 60% 70% 80% 90% 100%
## 0.01005732 0.08841553 0.26663513 0.63452530 0.99114279
## Measures for: regression
## mse : 0.2052927
## rmse : 0.4530924
## r2 : 0.14885
## mad : 0.4349571
##
## Residuals:
## 0% 10% 20% 30% 40% 50% 60%
## -0.5650429 -0.4774194 -0.3342696 -0.2892744 -0.2374322 -0.1530977 0.4349571
## 70% 80% 90% 100%
## 0.4349571 0.5323707 0.6816853 0.9536537
library(ingredients)
plot(mp_titinic_h2o, mp_titanic_keras)

plot(mp_titinic_h2o, mp_titanic_keras, geom = "boxplot")

vi_titinic_h2o <- variable_importance(explainer_titanic_h2o)
vi_titinic_keras <- variable_importance(explainer_titanic_keras)
plot(vi_titinic_h2o, vi_titinic_keras)

vi_titinic_h2o <- variable_importance(explainer_titanic_h2o, type="difference")
vi_titinic_keras <- variable_importance(explainer_titanic_keras, type="difference")
plot(vi_titinic_h2o, vi_titinic_keras)

vr_age_h2o <- variable_effect(explainer_titanic_h2o, variable = "Age")
vr_age_keras <- variable_effect(explainer_titanic_keras, variable = "Age")
plot(vr_age_h2o, vr_age_keras)
