1. Load the CSV file

This step loads the data and computes the actual count and proportion of males and females.

# Load the prediction results and count the rows in each `sex` class.
df <- read.csv(file = "C:/data/predictions.csv")
table(df[["sex"]])
## 
## female   male 
##     39     54
# Share of each `sex` class, rounded to two decimals for display.
ratio <- round(prop.table(table(df[["sex"]])), digits = 2)
print(ratio)
## 
## female   male 
##   0.42   0.58

2. Calculate null error rate and plot

The majority class is male, so a model that always predicts "male" would be right 58% of the time: the null accuracy is 0.58 and the null error rate is 0.42. The bar chart below shows these class proportions.

# Build the plotting data from `ratio`, the proportions computed from the
# data, rather than retyping the numbers by hand. This keeps the chart in
# step with the table if the CSV changes.
plot <- data.frame(
  sex = names(ratio),
  prop = as.numeric(ratio)
)
# One bar per class, with its proportion printed just above the bar.
ggplot(data = plot, aes(x = sex, y = prop, fill = sex)) +
  geom_col() +
  geom_text(aes(label = prop), vjust = -0.3)

3. Threshold

# Turn the predicted probability of "female" into hard 0/1 labels at three
# cut-offs (1 = predicted female), and encode the true label the same way.
thres_02 <- ifelse(test = df[[".pred_female"]] > 0.2, yes = 1, no = 0)
thres_05 <- ifelse(test = df[[".pred_female"]] > 0.5, yes = 1, no = 0)
thres_08 <- ifelse(test = df[[".pred_female"]] > 0.8, yes = 1, no = 0)
actual <- ifelse(test = df[["sex"]] == "female", yes = 1, no = 0)
# Compute accuracy, precision, recall and F1 from a 2x2 confusion matrix.
#
# `metrics` is indexed as [predicted, actual] with the negative class first:
#   [1, 1] true negatives    [1, 2] false negatives
#   [2, 1] false positives   [2, 2] true positives
#
# Returns a named numeric vector with elements Accuracy, Precision,
# Recall and F1.
get_metrics <- function(metrics) {
  true_pos <- metrics[2, 2]
  false_pos <- metrics[2, 1]
  false_neg <- metrics[1, 2]
  true_neg <- metrics[1, 1]

  # Share of all cases that were classified correctly.
  n_correct <- true_pos + true_neg
  acc <- n_correct / (n_correct + false_pos + false_neg)

  # Precision: of the predicted positives, how many were right.
  prec <- true_pos / (true_pos + false_pos)
  # Recall: of the actual positives, how many were found.
  rec <- true_pos / (true_pos + false_neg)
  # F1 is the harmonic mean of precision and recall.
  f1_score <- 2 * (prec * rec) / (prec + rec)

  c(Accuracy = acc, Precision = prec, Recall = rec, F1 = f1_score)
}

Table for threshold 0.2

# Cross-tabulate predicted vs. actual labels at threshold 0.2.
# Both vectors become factors with fixed levels 0 and 1 so the table is
# always 2x2, even if the threshold puts every row in one class;
# get_metrics() indexes rows and columns 1 and 2 and would otherwise fail.
table_02 <- table(
  predicted = factor(thres_02, levels = c(0, 1)),
  Actual = factor(actual, levels = c(0, 1))
)
me_02 <- get_metrics(table_02)

Table for threshold 0.5

# Cross-tabulate predicted vs. actual labels at threshold 0.5.
# Both vectors become factors with fixed levels 0 and 1 so the table is
# always 2x2, even if the threshold puts every row in one class;
# get_metrics() indexes rows and columns 1 and 2 and would otherwise fail.
table_05 <- table(
  predicted = factor(thres_05, levels = c(0, 1)),
  Actual = factor(actual, levels = c(0, 1))
)
me_05 <- get_metrics(table_05)

Table for threshold 0.8

# Cross-tabulate predicted vs. actual labels at threshold 0.8.
# Both vectors become factors with fixed levels 0 and 1 so the table is
# always 2x2, even if the threshold puts every row in one class;
# get_metrics() indexes rows and columns 1 and 2 and would otherwise fail.
table_08 <- table(
  predicted = factor(thres_08, levels = c(0, 1)),
  Actual = factor(actual, levels = c(0, 1))
)
me_08 <- get_metrics(table_08)

4. Summary table for three thresholds

# Collect the metrics for the three thresholds into one comparison table.
# Row i of every column must come from the metrics vector of threshold i
# (me_02, me_05, me_08); in particular the 0.8 row reads from me_08 in all
# four metric columns, not from me_05.
summary <- data.frame(
  Threshold = c(0.2, 0.5, 0.8),
  Accuracy = c(me_02["Accuracy"], me_05["Accuracy"], me_08["Accuracy"]),
  Precision = c(me_02["Precision"], me_05["Precision"], me_08["Precision"]),
  Recall = c(me_02["Recall"], me_05["Recall"], me_08["Recall"]),
  F1 = c(me_02["F1"], me_05["F1"], me_08["F1"])
)
show(summary)
##   Threshold  Accuracy Precision    Recall        F1
## 1       0.2 0.9139785 0.8604651 0.9487179 0.9024390
## 2       0.5 0.9354839 0.9230769 0.9230769 0.9230769
## 3       0.8 0.9462366 0.9230769 0.9230769 0.9230769

5. Cases for threshold 0.2 and 0.8

Threshold 0.2 suits cases where missing a true positive is very costly, such as screening patients for possible cancer: even a low predicted probability should be flagged for follow-up, because a missed cancer patient (a false negative) is far worse than a false alarm. Threshold 0.8 suits cases where false positives are the main cost, such as sending luxury-product promotions or real-estate ads only to people who are very likely able to afford them; contacting everyone else would waste time and resources.