This step loads the data and computes the actual count and proportion of males and females.
# Read the prediction results, then count the rows for each value of `sex`.
df <- read.csv(file = "C:/data/predictions.csv")
table(df$sex)
##
## female male
## 39 54
# Share of each sex among the counted rows, rounded to two decimal places.
ratio <- table(df$sex)
ratio <- round(ratio / sum(ratio), digits = 2)
ratio
##
## female male
## 0.42 0.58
The majority class is male, so the null accuracy (always predicting the majority class) is 0.58 and the null error rate is 0.42. The bar chart below shows these proportions.
# Derive the plotted proportions from `df` instead of retyping them as
# literals, so the chart cannot drift out of sync with the data.
ratio <- round(prop.table(table(df$sex)), 2)
# Keep `ratio` a plain named numeric vector (names are the sex labels).
ratio <- setNames(as.numeric(ratio), names(ratio))

# One row per sex with its proportion, in the shape ggplot expects.
plot <- data.frame(
  sex = names(ratio),
  prop = as.numeric(ratio)
)

# Bar chart of the proportions, with each value printed above its bar.
ggplot(data = plot, aes(x = sex, y = prop, fill = sex)) +
  geom_col() +
  geom_text(aes(label = prop), vjust = -0.3)
## 3. Threshold
# Predicted labels at three cut-offs on `.pred_female` (presumably the
# predicted probability of "female" -- confirm against the data source):
# 1 when the value exceeds the cut-off, 0 otherwise.
thres_02 <- as.numeric(df$.pred_female > 0.2)
thres_05 <- as.numeric(df$.pred_female > 0.5)
thres_08 <- as.numeric(df$.pred_female > 0.8)

# Observed labels on the same coding: 1 = female, 0 = anything else.
actual <- as.numeric(df$sex == "female")
#' Compute classification metrics from a 2 x 2 confusion table.
#'
#' Cells are read by position: rows are the predicted class and columns the
#' actual class, with the negative class first and the positive class second
#' (the layout produced by `table(predicted, actual)` on 0/1 labels).
#'
#' @param metrics A 2 x 2 table or matrix of counts.
#' @return A named numeric vector with elements `Accuracy`, `Precision`,
#'   `Recall` and `F1`. A metric is `NaN` when its denominator is zero.
get_metrics <- function(metrics) {
  # Fail loudly on any other shape: a larger table would otherwise be scored
  # from its top-left corner only, and a smaller one would raise a cryptic
  # "subscript out of bounds" error.
  if (!identical(dim(metrics), c(2L, 2L))) {
    stop(
      "`metrics` must be a 2 x 2 confusion table ",
      "(rows = predicted, columns = actual).",
      call. = FALSE
    )
  }

  TN <- metrics[1, 1]
  FN <- metrics[1, 2]
  FP <- metrics[2, 1]
  TP <- metrics[2, 2]

  accuracy <- (TP + TN) / (TP + TN + FP + FN)
  precision <- TP / (TP + FP)
  recall <- TP / (TP + FN)
  f1 <- 2 * (precision * recall) / (precision + recall)

  c(Accuracy = accuracy, Precision = precision, Recall = recall, F1 = f1)
}
# Cross-tabulate predicted against actual 0/1 labels.
# Both vectors are converted to factors with fixed levels 0 and 1 so the
# result is always 2 x 2 (rows = predicted, columns = actual), even when a
# threshold assigns every case to a single class. get_metrics() reads the
# cells by position and would otherwise fail on a one-row table.
confusion_table <- function(predicted, actual) {
  table(
    predicted = factor(predicted, levels = c(0, 1)),
    Actual = factor(actual, levels = c(0, 1))
  )
}

# Confusion table and metrics for each threshold.
table_02 <- confusion_table(thres_02, actual)
me_02 <- get_metrics(table_02)
table_05 <- confusion_table(thres_05, actual)
me_05 <- get_metrics(table_05)
table_08 <- confusion_table(thres_08, actual)
me_08 <- get_metrics(table_08)
# One row per threshold, one column per metric.
# Each column must take its third value from `me_08`; reading `me_05` there
# would simply repeat the 0.5 row for Precision, Recall and F1.
summary <- data.frame(
  Threshold = c(0.2, 0.5, 0.8),
  Accuracy = c(me_02["Accuracy"], me_05["Accuracy"], me_08["Accuracy"]),
  Precision = c(me_02["Precision"], me_05["Precision"], me_08["Precision"]),
  Recall = c(me_02["Recall"], me_05["Recall"], me_08["Recall"]),
  F1 = c(me_02["F1"], me_05["F1"], me_08["F1"])
)
show(summary)
## Threshold Accuracy Precision Recall F1
## 1 0.2 0.9139785 0.8604651 0.9487179 0.9024390
## 2 0.5 0.9354839 0.9230769 0.9230769 0.9230769
## 3 0.8 0.9462366 0.9230769 0.9230769 0.9230769
A threshold of 0.2 suits settings such as screening for possible cancer patients: even a low predicted probability should be flagged, because missing a real cancer patient (a false negative) is far more costly than a false alarm. A threshold of 0.8 suits settings such as sending luxury-product promotions or real-estate ads only to people who are likely able to afford them; otherwise the campaign wastes time and resources.