library(tidyverse)
fraction <- 0.95 # fraction of classes in the negative class
population_size <- 10000
# binarize by thresholding
score <- function(s) as.factor(s > quantile(s, fraction, names=FALSE))
# clamp to [0,1]
clamp <- function(s) pmin(pmax(s, 0), 1)
# plot
plotx <- function(s) {
ggplot(data.frame(a, s), aes(a, s)) +
geom_point() +
geom_vline(xintercept = fraction) +
geom_hline(yintercept = fraction)
}
#' Stratified the sampling
#'
#' @param s predicted class labels
#' @param sampling_fraction fraction of data to be sampled
#' @param pos_fraction fraction of sampled data that should belong to the positive class
#'
#' @return list of actual class labels and predicted class labels for sampled data
sample_on_output <- function(s, sampling_fraction=.05, pos_fraction=0.5) {
n_samples <- ceiling(length(a) * sampling_fraction)
n_pos_class_sampled <- pos_fraction * n_samples
n_neg_class_sampled <- n_samples - n_pos_class_sampled
index_pos_class <- which(s == "TRUE")
index_neg_class <- which(s == "FALSE")
n_pos_class <- length(index_pos_class)
n_neg_class <- length(index_neg_class)
index_pos_class_sampled <- sample(index_pos_class, n_pos_class_sampled)
index_neg_class_sampled <- sample(index_neg_class, n_neg_class_sampled)
index_both_classes_sampled <- c(index_pos_class_sampled, index_neg_class_sampled)
return(list(a = actual[index_both_classes_sampled], s = s[index_both_classes_sampled]))
}
#' Uniform sampling
#'
#' @param s predicted class labels
#' @param sampling_fraction fraction of data to be sampled
#'
#' @return list of actual class labels and predicted class labels for sampled data
sample_uniform <- function(s, sampling_fraction=.05) {
n_samples <- ceiling(length(a) * sampling_fraction)
index_both_classes_sampled <- sample(seq(length(a)), n_samples)
return(list(a = actual[index_both_classes_sampled], s = s[index_both_classes_sampled]))
}
a <- runif(population_size)
actual <- score(a)
r <- runif(population_size)
plotx(r)

predicted_random <- score(r)
caret::confusionMatrix(predicted_random, actual, positive = "TRUE")
Confusion Matrix and Statistics
Reference
Prediction FALSE TRUE
FALSE 9025 475
TRUE 475 25
Accuracy : 0.905
95% CI : (0.8991, 0.9107)
No Information Rate : 0.95
P-Value [Acc > NIR] : 1
Kappa : 0
Mcnemar's Test P-Value : 1
Sensitivity : 0.0500
Specificity : 0.9500
Pos Pred Value : 0.0500
Neg Pred Value : 0.9500
Prevalence : 0.0500
Detection Rate : 0.0025
Detection Prevalence : 0.0500
Balanced Accuracy : 0.5000
'Positive' Class : TRUE
predicted_random_sampled_biased <- sample_on_output(predicted_random)
caret::confusionMatrix(predicted_random_sampled_biased$a,
predicted_random_sampled_biased$s,
positive = "TRUE")
Confusion Matrix and Statistics
Reference
Prediction FALSE TRUE
FALSE 233 237
TRUE 17 13
Accuracy : 0.492
95% CI : (0.4473, 0.5368)
No Information Rate : 0.5
P-Value [Acc > NIR] : 0.6563
Kappa : -0.016
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.0520
Specificity : 0.9320
Pos Pred Value : 0.4333
Neg Pred Value : 0.4957
Prevalence : 0.5000
Detection Rate : 0.0260
Detection Prevalence : 0.0600
Balanced Accuracy : 0.4920
'Positive' Class : TRUE
predicted_random_sampled_uniform <- sample_uniform(predicted_random)
caret::confusionMatrix(predicted_random_sampled_uniform$a,
predicted_random_sampled_uniform$s,
positive = "TRUE")
Confusion Matrix and Statistics
Reference
Prediction FALSE TRUE
FALSE 452 18
TRUE 30 0
Accuracy : 0.904
95% CI : (0.8747, 0.9284)
No Information Rate : 0.964
P-Value [Acc > NIR] : 1.0000
Kappa : -0.0471
Mcnemar's Test P-Value : 0.1124
Sensitivity : 0.0000
Specificity : 0.9378
Pos Pred Value : 0.0000
Neg Pred Value : 0.9617
Prevalence : 0.0360
Detection Rate : 0.0000
Detection Prevalence : 0.0600
Balanced Accuracy : 0.4689
'Positive' Class : TRUE
p <- clamp(a + (runif(population_size)-0.5)/10)
plotx(p)

predicted_prettygood <- score(p)
caret::confusionMatrix(predicted_prettygood, actual, positive = "TRUE")
Confusion Matrix and Statistics
Reference
Prediction FALSE TRUE
FALSE 9386 114
TRUE 114 386
Accuracy : 0.9772
95% CI : (0.9741, 0.98)
No Information Rate : 0.95
P-Value [Acc > NIR] : <2e-16
Kappa : 0.76
Mcnemar's Test P-Value : 1
Sensitivity : 0.7720
Specificity : 0.9880
Pos Pred Value : 0.7720
Neg Pred Value : 0.9880
Prevalence : 0.0500
Detection Rate : 0.0386
Detection Prevalence : 0.0500
Balanced Accuracy : 0.8800
'Positive' Class : TRUE
predicted_prettygood_sampled_biased <- sample_on_output(predicted_prettygood)
caret::confusionMatrix(
predicted_prettygood_sampled_biased$a,
predicted_prettygood_sampled_biased$s,
positive = "TRUE"
)
Confusion Matrix and Statistics
Reference
Prediction FALSE TRUE
FALSE 244 55
TRUE 6 195
Accuracy : 0.878
95% CI : (0.8461, 0.9054)
No Information Rate : 0.5
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.756
Mcnemar's Test P-Value : 7.958e-10
Sensitivity : 0.7800
Specificity : 0.9760
Pos Pred Value : 0.9701
Neg Pred Value : 0.8161
Prevalence : 0.5000
Detection Rate : 0.3900
Detection Prevalence : 0.4020
Balanced Accuracy : 0.8780
'Positive' Class : TRUE
predicted_prettygood_sampled_uniform <- sample_uniform(predicted_prettygood)
caret::confusionMatrix(
predicted_prettygood_sampled_uniform$a,
predicted_prettygood_sampled_uniform$s,
positive = "TRUE"
)
Confusion Matrix and Statistics
Reference
Prediction FALSE TRUE
FALSE 468 7
TRUE 4 21
Accuracy : 0.978
95% CI : (0.961, 0.989)
No Information Rate : 0.944
P-Value [Acc > NIR] : 0.0001677
Kappa : 0.7809
Mcnemar's Test P-Value : 0.5464936
Sensitivity : 0.7500
Specificity : 0.9915
Pos Pred Value : 0.8400
Neg Pred Value : 0.9853
Prevalence : 0.0560
Detection Rate : 0.0420
Detection Prevalence : 0.0500
Balanced Accuracy : 0.8708
'Positive' Class : TRUE
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKYGBge3IgbWVzc2FnZT1GQUxTRX0KbGlicmFyeSh0aWR5dmVyc2UpCmBgYAoKYGBge3J9CmZyYWN0aW9uIDwtIDAuOTUgIyBmcmFjdGlvbiBvZiBjbGFzc2VzIGluIHRoZSBuZWdhdGl2ZSBjbGFzcwpwb3B1bGF0aW9uX3NpemUgPC0gMTAwMDAgCgojIGJpbmFyaXplIGJ5IHRocmVzaG9sZGluZwpzY29yZSA8LSBmdW5jdGlvbihzKSBhcy5mYWN0b3IocyA+IHF1YW50aWxlKHMsIGZyYWN0aW9uLCBuYW1lcz1GQUxTRSkpCgojIGNsYW1wIHRvIFswLDFdCmNsYW1wIDwtIGZ1bmN0aW9uKHMpIHBtaW4ocG1heChzLCAwKSwgMSkKCiMgcGxvdCAKcGxvdHggPC0gZnVuY3Rpb24ocykgewogIGdncGxvdChkYXRhLmZyYW1lKGEsIHMpLCBhZXMoYSwgcykpICsKICAgIGdlb21fcG9pbnQoKSArCiAgICBnZW9tX3ZsaW5lKHhpbnRlcmNlcHQgPSBmcmFjdGlvbikgKwogICAgZ2VvbV9obGluZSh5aW50ZXJjZXB0ID0gZnJhY3Rpb24pCn0KCiMnIFN0cmF0aWZpZWQgdGhlIHNhbXBsaW5nIAojJwojJyBAcGFyYW0gcyBwcmVkaWN0ZWQgY2xhc3MgbGFiZWxzIAojJyBAcGFyYW0gc2FtcGxpbmdfZnJhY3Rpb24gZnJhY3Rpb24gb2YgZGF0YSB0byBiZSBzYW1wbGVkCiMnIEBwYXJhbSBwb3NfZnJhY3Rpb24gZnJhY3Rpb24gb2Ygc2FtcGxlZCBkYXRhIHRoYXQgc2hvdWxkIGJlbG9uZyB0byB0aGUgcG9zaXRpdmUgY2xhc3MKIycKIycgQHJldHVybiBsaXN0IG9mIGFjdHVhbCBjbGFzcyBsYWJlbHMgYW5kIHByZWRpY3RlZCBjbGFzcyBsYWJlbHMgZm9yIHNhbXBsZWQgZGF0YQpzYW1wbGVfb25fb3V0cHV0IDwtIGZ1bmN0aW9uKHMsIHNhbXBsaW5nX2ZyYWN0aW9uPS4wNSwgcG9zX2ZyYWN0aW9uPTAuNSkgewogIG5fc2FtcGxlcyA8LSBjZWlsaW5nKGxlbmd0aChhKSAqIHNhbXBsaW5nX2ZyYWN0aW9uKQogIG5fcG9zX2NsYXNzX3NhbXBsZWQgPC0gcG9zX2ZyYWN0aW9uICogbl9zYW1wbGVzCiAgbl9uZWdfY2xhc3Nfc2FtcGxlZCA8LSBuX3NhbXBsZXMgLSBuX3Bvc19jbGFzc19zYW1wbGVkCiAgCiAgaW5kZXhfcG9zX2NsYXNzIDwtIHdoaWNoKHMgPT0gIlRSVUUiKQogIGluZGV4X25lZ19jbGFzcyA8LSB3aGljaChzID09ICJGQUxTRSIpCiAgCiAgbl9wb3NfY2xhc3MgPC0gbGVuZ3RoKGluZGV4X3Bvc19jbGFzcykKICBuX25lZ19jbGFzcyA8LSBsZW5ndGgoaW5kZXhfbmVnX2NsYXNzKQoKICBpbmRleF9wb3NfY2xhc3Nfc2FtcGxlZCA8LSBzYW1wbGUoaW5kZXhfcG9zX2NsYXNzLCAgbl9wb3NfY2xhc3Nfc2FtcGxlZCkKICBpbmRleF9uZWdfY2xhc3Nfc2FtcGxlZCA8LSBzYW1wbGUoaW5kZXhfbmVnX2NsYXNzLCBuX25lZ19jbGFzc19zYW1wbGVkKQogIGluZGV4X2JvdGhfY2xhc3Nlc19zYW1wbGVkIDwtIGMoaW5kZXhfcG9zX2NsYXNzX3NhbXBsZWQsIGluZGV4X25lZ19jbGFzc19zYW1wbGVkKQogIHJldHVybihsaXN0KGEgPSBhY3R1YWxbaW5kZXhfYm90aF9jbGFzc2VzX3NhbXBsZWRdLCBzID0gc1tpbmRleF9ib3RoX2NsYXNzZXNfc2FtcGxlZF0pKQp9CgojJyBVbmlmb3JtIHNhbXBsaW5nIAojJwojJyBAcGFyYW0gcyBwcmVkaWN0ZWQgY2xhc3MgbGFiZWxzIAojJyBAcGFyYW0gc2FtcGxpbmdfZnJhY3Rpb24gZnJhY3Rpb24gb2YgZGF0YSB0byBiZSBzYW1wbGVkCiMnCiMnIEByZXR1cm4gbGlzdCBvZiBhY3R1YWwgY2xhc3MgbGFiZWxzIGFuZCBwcmVkaWN0ZWQgY2xhc3MgbGFiZWxzIGZvciBzYW1wbGVkIGRhdGEKc2FtcGxlX3VuaWZvcm0gPC0gZnVuY3Rpb24ocywgc2FtcGxpbmdfZnJhY3Rpb249LjA1KSB7CiAgbl9zYW1wbGVzIDwtIGNlaWxpbmcobGVuZ3RoKGEpICogc2FtcGxpbmdfZnJhY3Rpb24pCgogIGluZGV4X2JvdGhfY2xhc3Nlc19zYW1wbGVkIDwtIHNhbXBsZShzZXEobGVuZ3RoKGEpKSwgbl9zYW1wbGVzKQogIAogIHJldHVybihsaXN0KGEgPSBhY3R1YWxbaW5kZXhfYm90aF9jbGFzc2VzX3NhbXBsZWRdLCBzID0gc1tpbmRleF9ib3RoX2NsYXNzZXNfc2FtcGxlZF0pKQp9CgpgYGAKCgpgYGB7cn0KYSA8LSBydW5pZihwb3B1bGF0aW9uX3NpemUpCmFjdHVhbCA8LSBzY29yZShhKQpgYGAKCgpgYGB7cn0KciA8LSBydW5pZihwb3B1bGF0aW9uX3NpemUpIAoKcGxvdHgocikKYGBgCgoKYGBge3J9CnByZWRpY3RlZF9yYW5kb20gPC0gc2NvcmUocikKCmNhcmV0Ojpjb25mdXNpb25NYXRyaXgocHJlZGljdGVkX3JhbmRvbSwgYWN0dWFsLCBwb3NpdGl2ZSA9ICJUUlVFIikKCmBgYAoKCmBgYHtyfQpwcmVkaWN0ZWRfcmFuZG9tX3NhbXBsZWRfYmlhc2VkIDwtIHNhbXBsZV9vbl9vdXRwdXQocHJlZGljdGVkX3JhbmRvbSkKCmNhcmV0Ojpjb25mdXNpb25NYXRyaXgocHJlZGljdGVkX3JhbmRvbV9zYW1wbGVkX2JpYXNlZCRhLAogICAgICAgICAgICAgICAgICAgICAgIHByZWRpY3RlZF9yYW5kb21fc2FtcGxlZF9iaWFzZWQkcywKICAgICAgICAgICAgICAgICAgICAgICBwb3NpdGl2ZSA9ICJUUlVFIikKCmBgYAoKYGBge3J9CnByZWRpY3RlZF9yYW5kb21fc2FtcGxlZF91bmlmb3JtIDwtIHNhbXBsZV91bmlmb3JtKHByZWRpY3RlZF9yYW5kb20pCgpjYXJldDo6Y29uZnVzaW9uTWF0cml4KHByZWRpY3RlZF9yYW5kb21fc2FtcGxlZF91bmlmb3JtJGEsCiAgICAgICAgICAgICAgICAgICAgICAgcHJlZGljdGVkX3JhbmRvbV9zYW1wbGVkX3VuaWZvcm0kcywKICAgICAgICAgICAgICAgICAgICAgICBwb3NpdGl2ZSA9ICJUUlVFIikKYGBgCgpgYGB7cn0KcCA8LSBjbGFtcChhICsgKHJ1bmlmKHBvcHVsYXRpb25fc2l6ZSktMC41KS8xMCkKCnBsb3R4KHApCmBgYAoKCmBgYHtyfQpwcmVkaWN0ZWRfcHJldHR5Z29vZCA8LSBzY29yZShwKQoKY2FyZXQ6OmNvbmZ1c2lvbk1hdHJpeChwcmVkaWN0ZWRfcHJldHR5Z29vZCwgYWN0dWFsLCBwb3NpdGl2ZSA9ICJUUlVFIikKCmBgYAoKYGBge3J9CnByZWRpY3RlZF9wcmV0dHlnb29kX3NhbXBsZWRfYmlhc2VkIDwtIHNhbXBsZV9vbl9vdXRwdXQocHJlZGljdGVkX3ByZXR0eWdvb2QpCgpjYXJldDo6Y29uZnVzaW9uTWF0cml4KAogIHByZWRpY3RlZF9wcmV0dHlnb29kX3NhbXBsZWRfYmlhc2VkJGEsCiAgcHJlZGljdGVkX3ByZXR0eWdvb2Rfc2FtcGxlZF9iaWFzZWQkcywKICBwb3NpdGl2ZSA9ICJUUlVFIgopCmBgYApgYGB7cn0KcHJlZGljdGVkX3ByZXR0eWdvb2Rfc2FtcGxlZF91bmlmb3JtIDwtIHNhbXBsZV91bmlmb3JtKHByZWRpY3RlZF9wcmV0dHlnb29kKQoKY2FyZXQ6OmNvbmZ1c2lvbk1hdHJpeCgKICBwcmVkaWN0ZWRfcHJldHR5Z29vZF9zYW1wbGVkX3VuaWZvcm0kYSwKICBwcmVkaWN0ZWRfcHJldHR5Z29vZF9zYW1wbGVkX3VuaWZvcm0kcywKICBwb3NpdGl2ZSA9ICJUUlVFIgopCmBgYAoK