library(caret)
library(rpart.plot)
knitr::opts_knit$set(root.dir = rprojroot::find_rstudio_root_file())
setwd(knitr::opts_knit$get("root.dir"))
source("R/flowshop.R")
source("R/models/model_utils.R")
recommendation_data <- loadModelData("MH recommendation")
trainRPart <- function(mh) {
set.seed(123)
input_data <- getInputCols(recommendation_data)
# input_data <- removeCorrelatedFeatures(input_data)
mh_rec <- mh
out_col_name <- str_to_lower(paste0('rec_', mh_rec))
output_data <- factor(recommendation_data[,out_col_name],
labels = c('no', 'yes'))
train_idxs <- createDataPartition(output_data,
p = 0.8,
list = FALSE)
train_input_data <- input_data[train_idxs,]
train_output_data <- output_data[train_idxs]
test_input_data <- input_data[-train_idxs,]
test_output_data <- output_data[-train_idxs]
train_result <- train(
x = train_input_data,
y = train_output_data,
method = 'rf',
metric = 'Kappa',
trControl = trainControl(
method = 'repeatedcv',
number = 10,
repeats = 10,
classProbs = F
)
)
final_model <- train_result$finalModel
predicted_test_output <- predict(final_model, test_input_data, type = "class")
perf <- postResample(predicted_test_output, test_output_data)
perf['Precision'] <- precision(predicted_test_output, reference = test_output_data)
perf['Recall'] <- recall(predicted_test_output, reference = test_output_data)
perf['F1'] <- F_meas(predicted_test_output, reference = test_output_data)
cm <- confusionMatrix(predicted_test_output, test_output_data)
list(
model = final_model,
confusion_matrix = cm,
performance = perf,
test = list(
predicted = predicted_test_output,
reference = test_output_data
)
)
}
r_part_models <- map(ALL_MHS, trainRPart)
names(r_part_models) <- ALL_MHS
performances <- bind_rows(map(r_part_models, ~ as.list(.x$performance)))
performances$MH <- ALL_MHS
predictions <- map(r_part_models, ~ .x$test) %>%
as_tibble() %>%
mutate(rowname = c("predicted", "reference")) %>%
gather(MH, value, -rowname) %>%
spread(rowname, value) %>%
unnest() %>%
group_by(MH) %>%
mutate(instance_id = 1:n())
knitr::kable(
performances
)
| Accuracy | Kappa | Precision | Recall | F1 | MH |
|---|---|---|---|---|---|
| 0.9175258 | 0.4263007 | 0.6666667 | 0.3589744 | 0.4666667 | IHC |
| 0.8891753 | 0.1204133 | 0.4000000 | 0.0975610 | 0.1568627 | ISA |
| 0.8788660 | 0.6691102 | 0.7865169 | 0.7142857 | 0.7486631 | TS |
| 0.8427835 | 0.5702665 | 0.6847826 | 0.6631579 | 0.6737968 | ACO |
| 0.9252577 | 0.3193806 | 0.5000000 | 0.2758621 | 0.3555556 | ILS |
| 0.9432990 | 0.3592554 | 0.4117647 | 0.3684211 | 0.3888889 | IG |
cm <- confusionMatrix(predictions$predicted, predictions$reference)
knitr::kable(
t(c(cm$overall[c('Accuracy', 'Kappa')],
cm$byClass[c('Precision', 'Recall', 'F1')]))
)
| Accuracy | Kappa | Precision | Recall | F1 |
|---|---|---|---|---|
| 0.8994845 | 0.5305311 | 0.677551 | 0.517134 | 0.5865724 |
predictions_by_instance <- predictions %>%
group_by(instance_id) %>%
nest() %>%
mutate(
predicted_set = map(
data,
~pull(filter(.x, as.integer(predicted) == 2), MH)
),
reference_set = map(
data,
~ pull(filter(.x, as.integer(reference) == 2), MH)
)
) %>%
select(-data)
hammingLoss <- function(pred, ref, M = 2) {
symm_set_diff <- c(setdiff(pred, ref), setdiff(ref, pred))
length(symm_set_diff) / M
}
classAccuracy <- function(pred, ref) {
setequal(pred, ref)
}
macroPrecision <- function(pred, ref) {
length(intersect(pred, ref)) / length(ref)
}
macroRecall <- function(pred, ref) {
length(intersect(pred, ref)) / length(pred)
}
macroF1 <- function(pred, ref) {
2 * length(intersect(pred, ref)) / (length(pred) + length(ref))
}
macroAccuracy <- function(pred, ref) {
length(intersect(pred, ref)) / length(union(pred, ref))
}
M <- length(ALL_MHS)
micro_performances <- predictions_by_instance %>%
mutate(
"Hamming loss" = map2_dbl(predicted_set, reference_set,
~hammingLoss(.x, .y, M = M)),
"Classification Acc." = map2_dbl(predicted_set, reference_set,
~classAccuracy(.x, .y)),
Precision = map2_dbl(predicted_set, reference_set,
~macroPrecision(.x, .y)),
Recall = map2_dbl(predicted_set, reference_set,
~macroRecall(.x, .y)),
F1 = map2_dbl(predicted_set, reference_set,
~macroF1(.x, .y)),
Accuracy = map2_dbl(predicted_set, reference_set,
~macroAccuracy(.x, .y))
) %>%
select(-instance_id, -predicted_set, -reference_set) %>%
ungroup() %>%
summarise_all(funs(mean))
knitr::kable(micro_performances)
| Hamming loss | Classification Acc. | Precision | Recall | F1 | Accuracy |
|---|---|---|---|---|---|
| 0.1005155 | 0.5386598 | 0.9610825 | 0.9253436 | 0.9364668 | 0.8919674 |
walk(ALL_MHS, function(mh) {
model_dt <- r_part_models[[mh]]
cat('\n\n### ', mh, ' recommendation\n\n')
plt <- varImp(model_dt$model) %>%
rownames_to_column('Feature') %>%
ggplot() +
geom_col(aes(x = reorder(Feature, Overall), y = Overall)) +
coord_flip() +
labs(x = NULL, y = NULL)
plot(plt)
cat('\n```\n')
print(model_dt$confusion_matrix)
cat('\n```\n')
})
Confusion Matrix and Statistics
Reference
Prediction no yes
no 14 7
yes 25 342
Accuracy : 0.9175
95% CI : (0.8856, 0.9429)
No Information Rate : 0.8995
P-Value [Acc > NIR] : 0.134958
Kappa : 0.4263
Mcnemar's Test P-Value : 0.002654
Sensitivity : 0.35897
Specificity : 0.97994
Pos Pred Value : 0.66667
Neg Pred Value : 0.93188
Prevalence : 0.10052
Detection Rate : 0.03608
Detection Prevalence : 0.05412
Balanced Accuracy : 0.66946
'Positive' Class : no
Confusion Matrix and Statistics
Reference
Prediction no yes
no 4 6
yes 37 341
Accuracy : 0.8892
95% CI : (0.8536, 0.9186)
No Information Rate : 0.8943
P-Value [Acc > NIR] : 0.6667
Kappa : 0.1204
Mcnemar's Test P-Value : 4.763e-06
Sensitivity : 0.09756
Specificity : 0.98271
Pos Pred Value : 0.40000
Neg Pred Value : 0.90212
Prevalence : 0.10567
Detection Rate : 0.01031
Detection Prevalence : 0.02577
Balanced Accuracy : 0.54013
'Positive' Class : no
Confusion Matrix and Statistics
Reference
Prediction no yes
no 70 19
yes 28 271
Accuracy : 0.8789
95% CI : (0.8422, 0.9096)
No Information Rate : 0.7474
P-Value [Acc > NIR] : 1.051e-10
Kappa : 0.6691
Mcnemar's Test P-Value : 0.2432
Sensitivity : 0.7143
Specificity : 0.9345
Pos Pred Value : 0.7865
Neg Pred Value : 0.9064
Prevalence : 0.2526
Detection Rate : 0.1804
Detection Prevalence : 0.2294
Balanced Accuracy : 0.8244
'Positive' Class : no
Confusion Matrix and Statistics
Reference
Prediction no yes
no 63 29
yes 32 264
Accuracy : 0.8428
95% CI : (0.8027, 0.8776)
No Information Rate : 0.7552
P-Value [Acc > NIR] : 1.763e-05
Kappa : 0.5703
Mcnemar's Test P-Value : 0.7979
Sensitivity : 0.6632
Specificity : 0.9010
Pos Pred Value : 0.6848
Neg Pred Value : 0.8919
Prevalence : 0.2448
Detection Rate : 0.1624
Detection Prevalence : 0.2371
Balanced Accuracy : 0.7821
'Positive' Class : no
Confusion Matrix and Statistics
Reference
Prediction no yes
no 8 8
yes 21 351
Accuracy : 0.9253
95% CI : (0.8944, 0.9494)
No Information Rate : 0.9253
P-Value [Acc > NIR] : 0.54921
Kappa : 0.3194
Mcnemar's Test P-Value : 0.02586
Sensitivity : 0.27586
Specificity : 0.97772
Pos Pred Value : 0.50000
Neg Pred Value : 0.94355
Prevalence : 0.07474
Detection Rate : 0.02062
Detection Prevalence : 0.04124
Balanced Accuracy : 0.62679
'Positive' Class : no
Confusion Matrix and Statistics
Reference
Prediction no yes
no 7 10
yes 12 359
Accuracy : 0.9433
95% CI : (0.9154, 0.9641)
No Information Rate : 0.951
P-Value [Acc > NIR] : 0.7983
Kappa : 0.3593
Mcnemar's Test P-Value : 0.8312
Sensitivity : 0.36842
Specificity : 0.97290
Pos Pred Value : 0.41176
Neg Pred Value : 0.96765
Prevalence : 0.04897
Detection Rate : 0.01804
Detection Prevalence : 0.04381
Balanced Accuracy : 0.67066
'Positive' Class : no