library(caret)
library(rpart.plot)
knitr::opts_knit$set(root.dir = rprojroot::find_rstudio_root_file())
setwd(knitr::opts_knit$get("root.dir"))
source("R/flowshop.R")
source("R/models/model_utils.R")
recommendation_data <- loadModelData("MH recommendation")
trainRPart <- function(mh) {
set.seed(123)
input_data <- getInputCols(recommendation_data)
# input_data <- removeCorrelatedFeatures(input_data)
mh_rec <- mh
out_col_name <- str_to_lower(paste0('rec_', mh_rec))
output_data <- factor(recommendation_data[,out_col_name],
labels = c('no', 'yes'))
train_idxs <- createDataPartition(output_data,
p = 0.8,
list = FALSE)
train_input_data <- input_data[train_idxs,]
train_output_data <- output_data[train_idxs]
test_input_data <- input_data[-train_idxs,]
test_output_data <- output_data[-train_idxs]
train_result <- train(
x = train_input_data,
y = train_output_data,
method = 'rpart',
metric = 'Kappa',
trControl = trainControl(
method = 'repeatedcv',
number = 10,
repeats = 10,
classProbs = F
)
)
final_model <- train_result$finalModel
predicted_test_output <- predict(final_model, test_input_data, type = "class")
perf <- postResample(predicted_test_output, test_output_data)
perf['Precision'] <- precision(predicted_test_output, reference = test_output_data)
perf['Recall'] <- recall(predicted_test_output, reference = test_output_data)
perf['F1'] <- F_meas(predicted_test_output, reference = test_output_data)
cm <- confusionMatrix(predicted_test_output, test_output_data)
list(
model = final_model,
confusion_matrix = cm,
performance = perf,
test = list(
predicted = predicted_test_output,
reference = test_output_data
)
)
}
r_part_models <- map(ALL_MHS, trainRPart)
names(r_part_models) <- ALL_MHS
performances <- bind_rows(map(r_part_models, ~ as.list(.x$performance)))
performances$MH <- ALL_MHS
predictions <- map(r_part_models, ~ .x$test) %>%
as_tibble() %>%
mutate(rowname = c("predicted", "reference")) %>%
gather(MH, value, -rowname) %>%
spread(rowname, value) %>%
unnest() %>%
group_by(MH) %>%
mutate(instance_id = 1:n())
knitr::kable(
performances
)
| Accuracy | Kappa | Precision | Recall | F1 | MH |
|---|---|---|---|---|---|
| 0.9046392 | 0.1848739 | 0.6250000 | 0.1282051 | 0.2127660 | IHC |
| 0.8840206 | 0.0488124 | 0.2500000 | 0.0487805 | 0.0816327 | ISA |
| 0.8350515 | 0.5026438 | 0.7741935 | 0.4897959 | 0.6000000 | TS |
| 0.7809278 | 0.2886722 | 0.6041667 | 0.3052632 | 0.4055944 | ACO |
| 0.9175258 | 0.1657035 | 0.3636364 | 0.1379310 | 0.2000000 | ILS |
| 0.9664948 | 0.5041290 | 0.8750000 | 0.3684211 | 0.5185185 | IG |
cm <- confusionMatrix(predictions$predicted, predictions$reference)
knitr::kable(
t(c(cm$overall[c('Accuracy', 'Kappa')],
cm$byClass[c('Precision', 'Recall', 'F1')]))
)
| Accuracy | Kappa | Precision | Recall | F1 |
|---|---|---|---|---|
| 0.8814433 | 0.3521323 | 0.6551724 | 0.2959502 | 0.4077253 |
predictions_by_instance <- predictions %>%
group_by(instance_id) %>%
nest() %>%
mutate(
predicted_set = map(
data,
~pull(filter(.x, as.integer(predicted) == 2), MH)
),
reference_set = map(
data,
~ pull(filter(.x, as.integer(reference) == 2), MH)
)
) %>%
select(-data)
hammingLoss <- function(pred, ref, M = 2) {
symm_set_diff <- c(setdiff(pred, ref), setdiff(ref, pred))
length(symm_set_diff) / M
}
classAccuracy <- function(pred, ref) {
setequal(pred, ref)
}
macroPrecision <- function(pred, ref) {
length(intersect(pred, ref)) / length(ref)
}
macroRecall <- function(pred, ref) {
length(intersect(pred, ref)) / length(pred)
}
macroF1 <- function(pred, ref) {
2 * length(intersect(pred, ref)) / (length(pred) + length(ref))
}
macroAccuracy <- function(pred, ref) {
length(intersect(pred, ref)) / length(union(pred, ref))
}
M <- length(ALL_MHS)
micro_performances <- predictions_by_instance %>%
mutate(
"Hamming loss" = map2_dbl(predicted_set, reference_set,
~hammingLoss(.x, .y, M = M)),
"Classification Acc." = map2_dbl(predicted_set, reference_set,
~classAccuracy(.x, .y)),
Precision = map2_dbl(predicted_set, reference_set,
~macroPrecision(.x, .y)),
Recall = map2_dbl(predicted_set, reference_set,
~macroRecall(.x, .y)),
F1 = map2_dbl(predicted_set, reference_set,
~macroF1(.x, .y)),
Accuracy = map2_dbl(predicted_set, reference_set,
~macroAccuracy(.x, .y))
) %>%
select(-instance_id, -predicted_set, -reference_set) %>%
ungroup() %>%
summarise_all(funs(mean))
knitr::kable(micro_performances)
| Hamming loss | Classification Acc. | Precision | Recall | F1 | Accuracy |
|---|---|---|---|---|---|
| 0.1185567 | 0.4768041 | 0.9762027 | 0.897079 | 0.9272554 | 0.8763746 |
walk(ALL_MHS, function(mh) {
model_dt <- r_part_models[[mh]]
cat('\n\n### ', mh, ' recommendation\n\n')
rpart.plot(model_dt$model)
cat('\n```\n')
print(model_dt$confusion_matrix)
cat('\n```\n')
})
Confusion Matrix and Statistics
Reference
Prediction no yes
no 5 3
yes 34 346
Accuracy : 0.9046
95% CI : (0.871, 0.932)
No Information Rate : 0.8995
P-Value [Acc > NIR] : 0.4083
Kappa : 0.1849
Mcnemar's Test P-Value : 8.14e-07
Sensitivity : 0.12821
Specificity : 0.99140
Pos Pred Value : 0.62500
Neg Pred Value : 0.91053
Prevalence : 0.10052
Detection Rate : 0.01289
Detection Prevalence : 0.02062
Balanced Accuracy : 0.55980
'Positive' Class : no
Confusion Matrix and Statistics
Reference
Prediction no yes
no 2 6
yes 39 341
Accuracy : 0.884
95% CI : (0.8479, 0.9141)
No Information Rate : 0.8943
P-Value [Acc > NIR] : 0.7743
Kappa : 0.0488
Mcnemar's Test P-Value : 1.84e-06
Sensitivity : 0.048780
Specificity : 0.982709
Pos Pred Value : 0.250000
Neg Pred Value : 0.897368
Prevalence : 0.105670
Detection Rate : 0.005155
Detection Prevalence : 0.020619
Balanced Accuracy : 0.515745
'Positive' Class : no
Confusion Matrix and Statistics
Reference
Prediction no yes
no 48 14
yes 50 276
Accuracy : 0.8351
95% CI : (0.7943, 0.8706)
No Information Rate : 0.7474
P-Value [Acc > NIR] : 2.215e-05
Kappa : 0.5026
Mcnemar's Test P-Value : 1.214e-05
Sensitivity : 0.4898
Specificity : 0.9517
Pos Pred Value : 0.7742
Neg Pred Value : 0.8466
Prevalence : 0.2526
Detection Rate : 0.1237
Detection Prevalence : 0.1598
Balanced Accuracy : 0.7208
'Positive' Class : no
Confusion Matrix and Statistics
Reference
Prediction no yes
no 29 19
yes 66 274
Accuracy : 0.7809
95% CI : (0.7364, 0.8211)
No Information Rate : 0.7552
P-Value [Acc > NIR] : 0.1304
Kappa : 0.2887
Mcnemar's Test P-Value : 6.057e-07
Sensitivity : 0.30526
Specificity : 0.93515
Pos Pred Value : 0.60417
Neg Pred Value : 0.80588
Prevalence : 0.24485
Detection Rate : 0.07474
Detection Prevalence : 0.12371
Balanced Accuracy : 0.62021
'Positive' Class : no
Confusion Matrix and Statistics
Reference
Prediction no yes
no 4 7
yes 25 352
Accuracy : 0.9175
95% CI : (0.8856, 0.9429)
No Information Rate : 0.9253
P-Value [Acc > NIR] : 0.755102
Kappa : 0.1657
Mcnemar's Test P-Value : 0.002654
Sensitivity : 0.13793
Specificity : 0.98050
Pos Pred Value : 0.36364
Neg Pred Value : 0.93369
Prevalence : 0.07474
Detection Rate : 0.01031
Detection Prevalence : 0.02835
Balanced Accuracy : 0.55922
'Positive' Class : no
Confusion Matrix and Statistics
Reference
Prediction no yes
no 7 1
yes 12 368
Accuracy : 0.9665
95% CI : (0.9434, 0.982)
No Information Rate : 0.951
P-Value [Acc > NIR] : 0.092778
Kappa : 0.5041
Mcnemar's Test P-Value : 0.005546
Sensitivity : 0.36842
Specificity : 0.99729
Pos Pred Value : 0.87500
Neg Pred Value : 0.96842
Prevalence : 0.04897
Detection Rate : 0.01804
Detection Prevalence : 0.02062
Balanced Accuracy : 0.68286
'Positive' Class : no