# Load required packages
pacman::p_load(rsample, mlr, reticulate, data.table, tidyverse, keras, tensorflow,
               lime, tidyquant, recipes, yardstick, corrr, knitr)
The data is available at the UCI Machine Learning Repository. It consists of eight predictor variables and an outcome variable, class. The dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases; the objective is to predict, based on diagnostic measurements, whether a patient has diabetes.
url <- "http://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
data <- fread(url) # alternative: read_csv(url)
colnames(data) <- c("preg", "plas", "pres", "skin", "test", "mass", "pedi", "age", "class")
data %>% glimpse()
## Observations: 768
## Variables: 9
## $ preg <int> 6, 1, 8, 1, 0, 5, 3, 10, 2, 8, 4, 10, 10, 1, 5, 7, 0, 7,...
## $ plas <int> 148, 85, 183, 89, 137, 116, 78, 115, 197, 125, 110, 168,...
## $ pres <int> 72, 66, 64, 66, 40, 74, 50, 0, 70, 96, 92, 74, 80, 60, 7...
## $ skin <int> 35, 29, 0, 23, 35, 0, 32, 0, 45, 0, 0, 0, 0, 23, 19, 0, ...
## $ test <int> 0, 0, 0, 94, 168, 0, 88, 0, 543, 0, 0, 0, 0, 846, 175, 0...
## $ mass <dbl> 33.6, 26.6, 23.3, 28.1, 43.1, 25.6, 31.0, 35.3, 30.5, 0....
## $ pedi <dbl> 0.627, 0.351, 0.672, 0.167, 2.288, 0.201, 0.248, 0.134, ...
## $ age <int> 50, 31, 32, 21, 33, 30, 26, 29, 53, 54, 30, 34, 57, 59, ...
## $ class <int> 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0,...
mlr::summarizeColumns(data) %>% kable()
name | type | na | mean | disp | median | mad | min | max | nlevs |
---|---|---|---|---|---|---|---|---|---|
preg | integer | 0 | 3.8450521 | 3.3695781 | 3.0000 | 2.9652000 | 0.000 | 17.00 | 0 |
plas | integer | 0 | 120.8945312 | 31.9726182 | 117.0000 | 29.6520000 | 0.000 | 199.00 | 0 |
pres | integer | 0 | 69.1054688 | 19.3558072 | 72.0000 | 11.8608000 | 0.000 | 122.00 | 0 |
skin | integer | 0 | 20.5364583 | 15.9522176 | 23.0000 | 17.7912000 | 0.000 | 99.00 | 0 |
test | integer | 0 | 79.7994792 | 115.2440024 | 30.5000 | 45.2193000 | 0.000 | 846.00 | 0 |
mass | numeric | 0 | 31.9925781 | 7.8841603 | 32.0000 | 6.8199600 | 0.000 | 67.10 | 0 |
pedi | numeric | 0 | 0.4718763 | 0.3313286 | 0.3725 | 0.2483355 | 0.078 | 2.42 | 0 |
age | integer | 0 | 33.2408854 | 11.7602315 | 29.0000 | 10.3782000 | 21.000 | 81.00 | 0 |
class | integer | 0 | 0.3489583 | 0.4769514 | 0.0000 | 0.0000000 | 0.000 | 1.00 | 0 |
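Note that several measurements (plas, pres, skin, test, mass) have a minimum of 0, which is physiologically impossible and most likely encodes missing values in this dataset. A quick check of how widespread these zeros are (an illustrative aside, not part of the original analysis):
# Count zeros in columns where 0 is not a plausible measurement
sapply(data[, .(plas, pres, skin, test, mass)], function(x) sum(x == 0))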
The rsample package provides useful sampling methods. Its initial_split() function splits a data set into training and testing sets and returns a special rsplit object.
# Split test/training sets
set.seed(100)
train_test_split <- initial_split(data, prop = 0.8)
train_test_split %>% head()
## $data
## preg plas pres skin test mass pedi age class
## 1: 6 148 72 35 0 33.6 0.627 50 1
## 2: 1 85 66 29 0 26.6 0.351 31 0
## 3: 8 183 64 0 0 23.3 0.672 32 1
## 4: 1 89 66 23 94 28.1 0.167 21 0
## 5: 0 137 40 35 168 43.1 2.288 33 1
## ---
## 764: 10 101 76 48 180 32.9 0.171 63 0
## 765: 2 122 70 27 0 36.8 0.340 27 0
## 766: 5 121 72 23 112 26.2 0.245 30 0
## 767: 1 126 60 0 0 30.1 0.349 47 1
## 768: 1 93 70 31 0 30.4 0.315 23 0
##
## $in_id
## [1] 1 2 3 4 5 6 7 9 10 12 14 15 16 17 18 20 22
## [18] 23 24 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
## [35] 45 46 47 48 49 50 51 52 53 56 57 58 59 60 61 62 63
## [52] 65 67 68 69 70 71 72 73 74 75 76 78 79 80 81 82 83
## [69] 84 87 89 91 92 93 94 96 97 98 99 100 101 102 103 104 105
## [86] 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122
## [103] 123 125 126 127 129 132 134 135 136 137 138 139 140 141 142 144 145
## [120] 146 147 148 149 151 153 155 156 158 159 160 161 162 163 165 166 167
## [137] 168 171 173 174 175 176 177 178 179 180 182 183 184 185 186 187 188
## [154] 189 190 191 192 193 194 195 196 199 200 201 202 203 204 207 208 209
## [171] 210 211 213 214 215 216 217 218 220 222 223 224 225 226 228 230 231
## [188] 232 233 234 235 236 239 240 242 244 245 246 247 248 249 250 251 253
## [205] 254 256 259 261 262 264 265 266 267 268 271 272 273 274 275 276 277
## [222] 278 279 280 281 283 284 285 286 287 289 290 291 293 294 295 296 297
## [239] 298 300 302 304 305 307 308 309 310 311 312 315 318 319 321 322 323
## [256] 324 325 326 327 328 330 331 332 333 334 335 336 337 338 339 340 341
## [273] 342 343 344 345 346 347 349 350 351 352 355 357 359 360 362 363 364
## [290] 365 366 367 369 371 372 373 374 375 376 378 379 381 382 383 384 385
## [307] 386 387 388 389 390 391 392 393 394 396 398 400 403 404 405 406 408
## [324] 409 410 411 413 414 415 417 418 419 420 421 422 426 427 428 429 430
## [341] 431 432 434 435 436 438 439 440 441 442 443 444 445 446 447 448 449
## [358] 450 451 452 453 454 456 457 459 462 463 465 466 467 468 469 471 472
## [375] 475 476 478 479 480 481 482 483 484 485 487 488 489 490 491 492 493
## [392] 494 495 496 497 498 500 502 503 505 506 507 508 509 510 512 513 514
## [409] 515 516 519 520 521 522 523 524 525 526 527 528 529 530 532 533 534
## [426] 535 537 538 539 540 541 542 543 544 545 547 549 550 552 553 554 555
## [443] 556 557 559 560 561 562 563 566 567 569 570 571 573 574 577 578 579
## [460] 580 581 582 583 584 585 586 587 588 589 590 591 594 595 596 597 599
## [477] 601 602 603 604 605 606 607 608 609 610 612 613 616 617 618 619 621
## [494] 622 623 624 625 626 627 628 630 633 634 635 636 637 640 641 642 644
## [511] 645 646 647 648 649 650 651 653 655 656 657 658 659 660 661 662 664
## [528] 665 666 667 669 670 671 672 675 676 677 678 680 681 682 683 684 687
## [545] 688 689 690 691 692 693 694 695 696 697 698 699 700 703 704 705 706
## [562] 707 708 709 710 711 712 713 715 716 718 719 721 722 724 725 726 727
## [579] 729 730 731 732 733 734 735 736 737 738 739 740 741 743 745 746 748
## [596] 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765
## [613] 766 767 768
##
## $out_id
## [1] NA
##
## $id
## # A tibble: 1 x 1
## id
## <chr>
## 1 Resample1
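Since the outcome classes are imbalanced (about 35% positive), a stratified split that preserves the class proportions in both sets can be worth considering. A sketch using rsample's strata argument (an option, not what is done in this project):
# Stratified train/test split (alternative to the plain split above)
strat_split <- initial_split(data, prop = 0.8, strata = "class")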
We can retrieve our training and testing sets with the training() and testing() functions.
# Retrieve train and test sets
train_tbl <- training(train_test_split)
test_tbl  <- testing(train_test_split)
The outcome variable class can be stored as y_train and y_test, which are needed for modeling our ANN. They should be converted to numeric variables if they are not already, since the Keras modeling functions accept numeric input only. In our case the response is already a binary numeric variable.
# Response variables for training and testing sets
y_train <- train_tbl$class
y_test  <- test_tbl$class
We standardize the features so that they are centered around 0 with a standard deviation of 1. This matters not only when comparing measurements that have different units; it is also a general requirement for many machine learning algorithms. Gradient descent, the well-known optimization technique behind many machine learning algorithms, updates features with different scales at different rates. Standardizing the features allows them to be updated at a similar pace. Because ANNs are trained with gradient descent, standardization is necessary for artificial neural networks.
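Concretely, standardization replaces each feature x with (x - mean(x)) / sd(x). A quick illustration on a single predictor (for exposition only; the recipe below does this for all predictors at once):
plas_std <- (train_tbl$plas - mean(train_tbl$plas)) / sd(train_tbl$plas)
round(c(mean = mean(plas_std), sd = sd(plas_std)), 3) # centered at 0, sd 1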
The recipes package makes preprocessing data for machine learning tasks extremely convenient. In this recipe we discretize the outcome and then normalize the predictors by centering them at 0 and scaling them to unit variance. The commented-out step_dummy() shows how nominal data (e.g. character or factor variables) could be converted into one or more numeric binary model terms for the levels of the original data (one-hot encoding).
# Create recipe
rec_obj <- recipe(class ~ ., data = train_tbl) %>%
  step_discretize(class, options = list(cuts = 4)) %>%
  # step_log(test) %>%
  # step_dummy(all_nominal(), -all_outcomes()) %>%
  # step_dummy(class) %>%
  step_center(all_predictors(), -all_outcomes()) %>%
  step_scale(all_predictors(), -all_outcomes()) %>%
  prep(data = train_tbl)
# Print the recipe object
rec_obj
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 8
##
## Training data contained 615 data points and no missing data.
##
## Steps:
##
## Dummy variables from class [trained]
## Centering for preg, plas, pres, skin, test, mass, ... [trained]
## Scaling for preg, plas, pres, skin, test, mass, ... [trained]
We continue our data preparation by using the bake() function with the recipe we just created. This completes the preprocessing step, and the data is ready for the machine learning algorithm.
# Predictors (note: newer versions of recipes use the argument new_data in place of newdata)
x_train_tbl <- bake(rec_obj, newdata = train_tbl) %>% select(-class)
x_test_tbl  <- bake(rec_obj, newdata = test_tbl) %>% select(-class)
glimpse(x_train_tbl)
## Observations: 615
## Variables: 8
## $ preg <dbl> 0.6538424, -0.8288754, 1.2469295, -0.8288754, -1.1254190,...
## $ plas <dbl> 0.83603153, -1.08167859, 1.90142605, -0.95991922, 0.50119...
## $ pres <dbl> 0.16287213, -0.15294404, -0.25821609, -0.15294404, -1.521...
## $ skin <dbl> 0.89471385, 0.52454969, -1.26457706, 0.15438554, 0.894713...
## $ test <dbl> -0.67815105, -0.67815105, -0.67815105, 0.13941469, 0.7830...
## $ mass <dbl> 0.20095418, -0.66216839, -1.06906903, -0.47721355, 1.3723...
## $ pedi <dbl> 0.44982881, -0.36765646, 0.58311445, -0.91264664, 5.36954...
## $ age <dbl> 1.44674188, -0.18913763, -0.10303871, -1.05012685, -0.016...
Multi-Layer Perceptron

A multilayer perceptron (MLP) is a class of feedforward artificial neural network. An MLP consists of three or more layers of nodes; except for the input nodes, each node is a neuron that uses a nonlinear activation function. (A perceptron is a single-neuron model that was a precursor to larger neural networks.) An MLP is trained with a supervised learning technique called backpropagation. Its multiple layers and nonlinear activations distinguish it from a linear perceptron and let it separate data that is not linearly separable. MLPs are quite versatile, as they can be used for regression as well as binary and multi-class classification tasks. The core data structure of Keras is a model, a way to organize layers. The simplest type of model is the Sequential model, a linear stack of layers.
Initialize a sequential model: The first step is to initialize a sequential model with keras_model_sequential(). The sequential model is composed of a linear stack of layers.
Apply layers to the sequential model: Layers consist of the input layer, hidden layers, and an output layer. The input layer receives the data, the hidden layers transform it through their activation functions, and the result is returned through the output layer.
Hidden Layers: Hidden layers form the neural network nodes that enable non-linear activation using weights. The hidden layers are created using layer_dense(). In this project we add two hidden layers with units = 16, the number of nodes. We select kernel_initializer = "uniform" and activation = "relu" for both layers. The first layer needs input_shape = 8, the number of features in the training data. These parameters can be optimized through a process called hyperparameter tuning.
Dropout Layers: Dropout layers are used to control overfitting. During each training update, dropout randomly sets a fraction of the layer's units to zero, which prevents the network from relying too heavily on any single node. We use the layer_dropout() function to add two dropout layers with rate = 0.3, dropping 30% of the units at each update.
Output Layer: The output layer specifies the shape of the output and the method of assimilating the learned information. The output layer is applied using layer_dense(). For binary outcomes, the shape should be units = 1; for multi-class classification, the units should correspond to the number of classes. We set kernel_initializer = "uniform" and activation = "sigmoid" (common for binary classification), whereas the softmax activation together with the categorical_crossentropy loss function is used for multi-class target variables.
Compile the model: The model is then compiled with compile(). A common optimization algorithm of choice is adaptive moment estimation (adam). We select loss = "binary_crossentropy" since this is a binary classification problem, and metrics = c("accuracy") to be evaluated during training and testing.
# Building our Artificial Neural Network
model_keras <- keras_model_sequential()
model_keras %>%
  # First hidden layer
  layer_dense(
    units              = 16,
    kernel_initializer = "uniform",
    activation         = "relu",
    input_shape        = ncol(x_train_tbl)) %>%
  # Dropout to prevent overfitting
  layer_dropout(rate = 0.3) %>%
  # Second hidden layer
  layer_dense(
    units              = 16,
    kernel_initializer = "uniform",
    activation         = "relu") %>%
  # Dropout to prevent overfitting
  layer_dropout(rate = 0.3) %>%
  # Output layer
  layer_dense(
    units              = 1,
    kernel_initializer = "uniform",
    activation         = "sigmoid") %>%
  # Compile ANN
  compile(
    optimizer = 'adam',
    loss      = 'binary_crossentropy',
    metrics   = 'accuracy'
  )
model_keras
## Model
## ___________________________________________________________________________
## Layer (type) Output Shape Param #
## ===========================================================================
## dense_1 (Dense) (None, 16) 144
## ___________________________________________________________________________
## dropout_1 (Dropout) (None, 16) 0
## ___________________________________________________________________________
## dense_2 (Dense) (None, 16) 272
## ___________________________________________________________________________
## dropout_2 (Dropout) (None, 16) 0
## ___________________________________________________________________________
## dense_3 (Dense) (None, 1) 17
## ===========================================================================
## Total params: 433
## Trainable params: 433
## Non-trainable params: 0
## ___________________________________________________________________________
We use the fit() function to run the ANN on our training data. The object is our model, and x and y are our training data in matrix and numeric vector forms, respectively. batch_size = 615 sets the number of samples per gradient update within each epoch, and epochs = 100 controls the number of training cycles; an epoch is one full pass of the algorithm through the training set. The model is updated each time a batch is processed, so it can be updated multiple times during one epoch; if batch_size is set equal to the number of training samples, the model is updated once per epoch. Typically we want to keep the batch size high, since this decreases the error within each training cycle. We set validation_split = 0.30 to hold out 30% of the data for model validation, which lets us monitor (and guard against) overfitting. With this split only 430 of the 615 rows are used for fitting, so our batch of 615 still amounts to a single update per epoch.
# Fit the keras model to the training data
history <- fit(
  object = model_keras,
  x = as.matrix(x_train_tbl),
  y = y_train,
  batch_size = dim(x_train_tbl)[1],
  epochs = 100,
  validation_split = 0.30,
  verbose = 0 # suppress progress bars
)
# Optionally set a global default for viewing metrics during training:
# options(keras.view_metrics = FALSE)

# Fit again, this time logging the run for TensorBoard
history <- model_keras %>% fit(
  x = as.matrix(x_train_tbl),
  y = y_train,
  batch_size = dim(x_train_tbl)[1],
  epochs = 100,
  validation_split = 0.30,
  callbacks = callback_tensorboard("/Users/nanaakwasiabayieboateng/Documents/memphisclassesbooks/DataMiningscience/Deeplearning/logs"),
  verbose = 1 # show progress bars
)
TensorBoard automatically includes all runs logged within the subdirectories of the specified log_dir.
callback_tensorboard(log_dir ="~/Documents/memphisclassesbooks/DataMiningscience/Deeplearning/logs/run_b" )
## <keras.callbacks.TensorBoard>
TensorBoard is then launched with
tensorboard("logs")
## Started TensorBoard at http://127.0.0.1:3846
or multiple log directories
tensorboard(c("logs/run_a","logs/run_b"))
## Started TensorBoard at http://127.0.0.1:3846
# Print a summary of the training history
print(history)
## Trained on 430 samples, validated on 185 samples (batch_size=615, epochs=100)
## Final epoch (plot to see history):
## val_loss: 0.4448
## val_acc: 0.7946
## loss: 0.4826
## acc: 0.7535
We can visualize the Keras training history with the plot() function. When the training and validation accuracy (and loss) begin to converge to the same value, it is a sign that the model is not overfitting, and we can stop at the number of epochs where this begins to happen. The loss for both validation and training is also heading downwards, so the model fit is good.
# Plot the training/validation history of our Keras model
plot(history) + theme_bw()
(Figure: training and validation loss and accuracy by epoch)
The plot above can be split into two separate plots: one for the model loss and another for the model accuracy.
library(RColorBrewer)
# Plot the model loss on the training data
plot(history$metrics$loss, main = "Model Loss", xlab = "epoch", ylab = "loss",
     col = "#FB8072", type = "l", lwd = 3)
# Add the model loss on the validation data
lines(history$metrics$val_loss, col = "#8DD3C7", lwd = 3)
# Add legend
legend("topright", c("train", "validation"), col = c("#FB8072", "#8DD3C7"), lty = c(1, 1))
(Figure: model loss for training and validation sets)
# Plot the model accuracy on the training data
plot(history$metrics$acc, main = "Model Accuracy", xlab = "epoch", ylab = "accuracy",
     col = "#FB8072", type = "l", lwd = 3,
     ylim = c(0.65, max(history$metrics$acc)), xlim = c(0, 100))
# Add the model accuracy on the validation data
lines(history$metrics$val_acc, col = "#8DD3C7", lwd = 3)
# Add legend
legend("topright", c("train", "validation"), col = c("#FB8072", "#8DD3C7"), lty = c(1, 1))
(Figure: model accuracy for training and validation sets)
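Rather than eyeballing the convergence point in these plots, training can also be stopped automatically once the validation loss stops improving. A sketch using Keras's early-stopping callback (the patience value is an assumption to tune; note that calling fit() again continues training the already-fitted model):
history_es <- model_keras %>% fit(
  x = as.matrix(x_train_tbl),
  y = y_train,
  batch_size = dim(x_train_tbl)[1],
  epochs = 100,
  validation_split = 0.30,
  callbacks = list(callback_early_stopping(monitor = "val_loss", patience = 10)),
  verbose = 0
)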
predict_classes() generates class values as a matrix of ones and zeros; predict_proba() generates the class probabilities as a numeric matrix indicating the probability of each class.
# Predicted class
keras_class <- predict_classes(object = model_keras, x = as.matrix(x_test_tbl)) %>%
  as.vector()
# Predicted class probability
keras_prob <- predict_proba(object = model_keras, x = as.matrix(x_test_tbl)) %>%
  as.vector()
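Note that predict_classes() and predict_proba() were removed from keras for newer TensorFlow versions; if they are unavailable, the same results can be obtained with the generic predict() and a manual threshold:
# Equivalent using predict() (newer keras/TensorFlow versions)
keras_prob  <- predict(model_keras, as.matrix(x_test_tbl)) %>% as.vector()
keras_class <- ifelse(keras_prob > 0.5, 1, 0)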
We can obtain the confusion matrix and other model metrics with the help of the caret package. Our model accuracy of about 76% is good, though not excellent.
library(caret)
# Alternative: ModelMetrics::confusionMatrix(keras_class, y_test)
# Depending on your caret version, factor inputs may be required:
# caret::confusionMatrix(factor(keras_class), factor(y_test))
caret::confusionMatrix(keras_class, y_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 87 28
## 1 9 29
##
## Accuracy : 0.7582
## 95% CI : (0.6824, 0.8237)
## No Information Rate : 0.6275
## P-Value [Acc > NIR] : 0.0003978
##
## Kappa : 0.4452
## Mcnemar's Test P-Value : 0.0030846
##
## Sensitivity : 0.9062
## Specificity : 0.5088
## Pos Pred Value : 0.7565
## Neg Pred Value : 0.7632
## Prevalence : 0.6275
## Detection Rate : 0.5686
## Detection Prevalence : 0.7516
## Balanced Accuracy : 0.7075
##
## 'Positive' Class : 0
##
We can also get the ROC Area Under the Curve (AUC) with the help of the ModelMetrics package. AUC is a good metric for comparing different classifiers; random guessing has an AUC of 0.5. Our model has an AUC of about 0.83, which is much better than random guessing.
library(ModelMetrics)
# AUC
auc(y_test, keras_prob)
## [1] 0.8349781
Sensitivity and specificity are statistical measures of the performance of a binary classification test.
Sensitivity (also called the true positive rate, the recall, or probability of detection in some fields) measures the proportion of positives that are correctly identified as such (e.g. the percentage of sick people who are correctly identified as having the condition). Specificity (also called the true negative rate) measures the proportion of negatives that are correctly identified as such (e.g. the percentage of healthy people who are correctly identified as not having the condition). Note that caret took 0 (no diabetes) as the positive class above, so the reported sensitivity of 0.91 is the rate at which we correctly identify those without diabetes, while the specificity of 0.51 is the rate at which we correctly identify those with diabetes.
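These quantities can be read straight off the confusion matrix above (rows are predictions, columns are the reference, positive class = 0):
TP <- 87; FN <- 9; FP <- 28; TN <- 29
TP / (TP + FN) # sensitivity = 0.9062
TN / (TN + FP) # specificity = 0.5088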
The F1-score is the harmonic mean of precision and recall. Machine learning classifier thresholds are often adjusted to maximize the F1-score.
# F1-Statistic
ModelMetrics::f1Score(y_test, keras_prob)
## [1] 0.6105263
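This value can be reproduced from the confusion matrix. Unlike caret above, ModelMetrics::f1Score() treats 1 as the positive class:
precision <- 29 / (29 + 9)   # TP / (TP + FP) for class 1
recall    <- 29 / (29 + 28)  # TP / (TP + FN) for class 1
2 * precision * recall / (precision + recall) # 0.6106, matching f1Score()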
The ROC curve looks good: the further it sits above the diagonal line, or the closer it comes to the upper-left corner, the better. The diagonal line represents random prediction.
library(verification)
# plot the ROC curve
roc.plot(y_test, keras_prob)
(Figure: ROC curve)
Local Interpretable Model-agnostic Explanations (LIME) is used to explain the predictions of black-box classifiers. For any given prediction and any given classifier, it determines a small set of features in the original data that drove the outcome of the prediction. To use it with keras we need two functions:
model_type: Used to tell lime what type of model we are dealing with. It could be classification, regression, survival, etc.
predict_model: Used to allow lime to perform predictions that its algorithm can interpret.
class(model_keras)
## [1] "keras.models.Sequential" "keras.engine.training.Model"
## [3] "keras.engine.topology.Container" "keras.engine.topology.Layer"
## [5] "python.builtin.object"
First, the model_type() function. Its only input is x, the Keras model. The function simply returns "classification", which tells LIME we are classifying.
# Setup lime::model_type() function for keras
model_type.keras.models.Sequential <- function(x, ...) {
  "classification"
}
Next, the predict_model() function, which wraps keras::predict_proba(). The inputs are the model x, newdata (a data frame object), and type, which is not used here but can be used to switch the output type. The output must be in the format of classification probabilities.
# Setup lime::predict_model() function for keras
predict_model.keras.models.Sequential <- function(x, newdata, type, ...) {
  pred <- predict_proba(object = x, x = as.matrix(newdata))
  data.frame(one = pred, zero = 1 - pred)
}
# Test our predict_model() function
predict_model(x = model_keras, newdata = x_test_tbl, type = 'raw') %>%
  tibble::as_tibble() %>%
  head()
## # A tibble: 6 x 2
## one zero
## <dbl> <dbl>
## 1 0.6431847 0.3568153
## 2 0.2369795 0.7630205
## 3 0.6358837 0.3641163
## 4 0.4474712 0.5525288
## 5 0.4900604 0.5099396
## 6 0.5611347 0.4388653
The central function of lime is lime(). It creates the explainer that is used by the explain() function to explain the model's predictions. Its inputs include the training data, the model, and bin_continuous, which controls whether continuous variables are binned when making the explanation.
# Run lime() on training set
explainer <- lime::lime(
  x = x_train_tbl,
  model = model_keras,
  bin_continuous = FALSE
)
After the explainer has been created using the lime() function, it can be used to explain the result of the model on new observations. The explain() function takes new observations along with the explainer and returns a data.frame with prediction explanations, one observation per row. The returned explanations can then be visualised in a number of ways, e.g. with plot_features(). We set n_labels = 1 because we care about explaining a single class, and n_features = 4 to return the top four features that are critical to each case. Finally, kernel_width = 0.5 sets the width of the exponential kernel used to convert distances to similarities.
# Run explain() on explainer
explanation <- lime::explain(
  x_test_tbl[1:20, ],
  explainer = explainer,
  n_labels = 1,
  n_features = 4,
  kernel_width = 0.5
)
This shows how the feature variables explain the class variable.
explanation %>% head() %>% kable()
model_type | case | label | label_prob | model_r2 | model_intercept | model_prediction | feature | feature_value | feature_weight | feature_desc | data | prediction |
---|---|---|---|---|---|---|---|---|---|---|---|---|
classification | 1 | one | 0.6431847 | 0.8929047 | 0.3182050 | 0.6431779 | skin | -1.2645771 | 0.0436563 | skin | 1.8400166, -0.1684833, -3.6269219, -1.2645771, -0.6781510, 0.4105697, -1.0103894, -0.3613355 | 0.6431847, 0.3568153 |
classification | 1 | one | 0.6431847 | 0.8929047 | 0.3182050 | 0.6431779 | preg | 1.8400166 | 0.0585127 | preg | 1.8400166, -0.1684833, -3.6269219, -1.2645771, -0.6781510, 0.4105697, -1.0103894, -0.3613355 | 0.6431847, 0.3568153 |
classification | 1 | one | 0.6431847 | 0.8929047 | 0.3182050 | 0.6431779 | mass | 0.4105697 | 0.1153761 | mass | 1.8400166, -0.1684833, -3.6269219, -1.2645771, -0.6781510, 0.4105697, -1.0103894, -0.3613355 | 0.6431847, 0.3568153 |
classification | 1 | one | 0.6431847 | 0.8929047 | 0.3182050 | 0.6431779 | pres | -3.6269219 | -0.0620762 | pres | 1.8400166, -0.1684833, -3.6269219, -1.2645771, -0.6781510, 0.4105697, -1.0103894, -0.3613355 | 0.6431847, 0.3568153 |
classification | 2 | zero | 0.7630205 | 0.6115207 | 0.6098223 | 0.7501195 | test | -0.6781510 | 0.0296440 | test | 0.06075527, -0.32068251, 1.21559269, -1.26457706, -0.67815105, 0.69416708, -0.84156096, -0.27523655 | 0.2369795, 0.7630205 |
classification | 2 | zero | 0.7630205 | 0.6115207 | 0.6098223 | 0.7501195 | plas | -0.3206825 | -0.1554969 | plas | 0.06075527, -0.32068251, 1.21559269, -1.26457706, -0.67815105, 0.69416708, -0.84156096, -0.27523655 | 0.2369795, 0.7630205 |
This can also be represented graphically with the plot_features() function.
plot_features(explanation) +
labs(title = "LIME Feature Importance Visualization",
subtitle = " ")
(Figure: LIME feature importance visualization for the first 20 test cases)
The plot_explanations() function produces a facetted heatmap of all case/label/feature combinations.
plot_explanations(explanation) +
labs(title = "LIME Feature Importance Heatmap",
subtitle = "")
(Figure: LIME feature importance heatmap)
We can perform a correlation analysis on the training set to identify which features correlate with class. The corrr package, which performs tidy correlations with the correlate() function, can help us achieve this.
# Feature correlations to class
corrr_analysis <- x_train_tbl %>%
  mutate(class = y_train) %>%
  correlate() %>%
  focus(class) %>%
  rename(feature = rowname) %>%
  arrange(abs(class)) %>%
  mutate(feature = as_factor(feature))
corrr_analysis
## # A tibble: 8 x 2
## feature class
## <fctr> <dbl>
## 1 pres 0.07070969
## 2 skin 0.09643693
## 3 test 0.13091732
## 4 pedi 0.19267151
## 5 preg 0.20251775
## 6 age 0.22710150
## 7 mass 0.31170609
## 8 plas 0.46653104
The correlation visualization helps in distinguishing which features have a linear relationship with class.
# Correlation visualization
corrr_analysis %>%
  ggplot(aes(x = class, y = reorder(feature, desc(class)))) +
  geom_point() +
  # theme_tq() +
  # Positive correlations - contribute to class
  geom_segment(aes(xend = 0, yend = feature),
               color = palette_light()[[2]],
               data = corrr_analysis) +
  geom_point(color = palette_light()[[2]],
             data = corrr_analysis) +
  theme_bw() +
  labs(title = "Class Correlation Analysis",
       subtitle = "Positive Correlations (contribute to class)",
       y = "Feature Importance") +
  # Vertical lines
  geom_vline(xintercept = 0, color = palette_light()[[5]], size = 1, linetype = 2) +
  geom_vline(xintercept = 0.25, color = palette_light()[[5]], size = 1, linetype = 2) +
  # Weak correlations (below 0.1) highlighted separately
  geom_segment(aes(xend = 0, yend = feature),
               color = palette_light()[[1]],
               data = corrr_analysis %>% filter(class < 0.1)) +
  geom_point(color = palette_light()[[1]],
             data = corrr_analysis %>% filter(class < 0.1))
(Figure: class correlation analysis)
The features mass and plas have the strongest correlation with the outcome variable class. All features are positively correlated with class.
library(corrplot)
corrplot(cor(data))
(Figure: correlation matrix of all variables)
library(GGally)
ggpairs(data, title = "Within Variables")
(Figure: pairwise plots of all variables)
Finally, we can use the save_model_hdf5() and load_model_hdf5() functions to save the model to disk and load it back into the workspace:
save_model_hdf5(model_keras, "~/Documents/memphisclassesbooks/DataMiningscience/Deeplearning/my_model.h5")
model_keras <- load_model_hdf5("~/Documents/memphisclassesbooks/DataMiningscience/Deeplearning/my_model.h5")
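Because any new data must be run through the same preprocessing before prediction, it is also worth persisting the recipe alongside the model. A minimal sketch using base R's saveRDS() (the path is just an example):
saveRDS(rec_obj, "~/Documents/memphisclassesbooks/DataMiningscience/Deeplearning/rec_obj.rds")
rec_obj <- readRDS("~/Documents/memphisclassesbooks/DataMiningscience/Deeplearning/rec_obj.rds")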