Setup
# If needed (in Console, not here): install.packages(c("readr","dplyr","ggplot2","knitr"))
library(readr)
library(dplyr)
library(ggplot2)
library(knitr)
Load Data
# Option A: Load directly from GitHub (recommended)
df <- read_csv(
  "https://raw.githubusercontent.com/acatlin/data/master/penguin_predictions.csv",
  show_col_types = FALSE
)
# Option B: Local file (uncomment if you saved penguin_predictions.csv alongside this Rmd)
# df <- read_csv("penguin_predictions.csv", show_col_types = FALSE)
glimpse(df)
## Rows: 93
## Columns: 3
## $ .pred_female <dbl> 0.99217462, 0.95423945, 0.98473504, 0.18702056, 0.9947012…
## $ .pred_class <chr> "female", "female", "female", "male", "female", "female",…
## $ sex <chr> "female", "female", "female", "female", "female", "female…
head(df, 10)
## # A tibble: 10 × 3
## .pred_female .pred_class sex
## <dbl> <chr> <chr>
## 1 0.992 female female
## 2 0.954 female female
## 3 0.985 female female
## 4 0.187 male female
## 5 0.995 female female
## 6 1.000 female female
## 7 0.959 female female
## 8 1.000 female female
## 9 1.000 female female
## 10 0.339 male female
Prepare Columns
# The file includes:
# - .pred_female : model probability of "female"
# - .pred_class  : predicted class label ("female"/"male")
# - sex          : actual label ("female"/"male")
df_clean <- df %>%
  mutate(
    actual = if_else(sex == "female", 1L, 0L),  # 1 = female (positive class), 0 = male
    prob   = `.pred_female`                     # shorter alias for the probability column
  )
# Quick sanity check
df_clean %>% count(sex)
## # A tibble: 2 × 2
## sex n
## <chr> <int>
## 1 female 39
## 2 male 54
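As an optional cross-check (a small sketch, assuming `.pred_class` was generated with the usual 0.5 probability cutoff), the stored class labels can be compared against thresholding the probabilities directly:

# Proportion of rows where thresholding .pred_female at 0.5 reproduces .pred_class
df_clean %>%
  mutate(pred_at_50 = if_else(prob >= 0.5, "female", "male")) %>%
  summarise(agreement = mean(pred_at_50 == .pred_class))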
Task 1 — Null Error Rate + Actual Distribution Plot
# Majority class proportion (baseline accuracy) and null error rate
class_counts <- df_clean %>% count(sex, name = "n")
N <- sum(class_counts$n)
majority_prop <- max(class_counts$n) / N
null_error_rate <- 1 - majority_prop
# Show numbers nicely
kable(class_counts, caption = "Actual Class Distribution (sex)")
Actual Class Distribution (sex)

| sex    |  n |
|:-------|---:|
| female | 39 |
| male   | 54 |
kable(
  data.frame(
    N = N,
    majority_class_accuracy = round(majority_prop, 4),
    null_error_rate = round(null_error_rate, 4)
  ),
  caption = "Null Error Rate Summary"
)
Null Error Rate Summary

|  N | majority_class_accuracy | null_error_rate |
|---:|------------------------:|----------------:|
| 93 |                  0.5806 |          0.4194 |
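As a quick arithmetic check of the summary above (numbers taken from the class counts): the majority class is male with 54 of 93 penguins, so always predicting male gives 54/93 accuracy and a 39/93 error rate.

# Hand check of the baseline: always predict the majority class ("male")
c(majority_class_accuracy = 54 / 93, null_error_rate = 39 / 93)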
# Plot the distribution of the actual variable
ggplot(df_clean, aes(x = sex)) +
  geom_bar() +
  labs(title = "Distribution of Actual Class (sex)", x = "Actual Class", y = "Count") +
  theme_minimal()

Helper — Confusion Matrix + Metrics at a Threshold
confusion_and_metrics <- function(data, threshold = 0.5) {
  # Classify as positive (female = 1) when the predicted probability
  # meets or exceeds the threshold
  preds  <- if_else(data$prob >= threshold, 1L, 0L)
  actual <- data$actual

  # Confusion-matrix cell counts
  TP <- sum(preds == 1L & actual == 1L)
  FP <- sum(preds == 1L & actual == 0L)
  TN <- sum(preds == 0L & actual == 0L)
  FN <- sum(preds == 0L & actual == 1L)

  # Metrics, guarding against division by zero
  accuracy  <- (TP + TN) / (TP + FP + TN + FN)
  precision <- ifelse((TP + FP) > 0, TP / (TP + FP), NA_real_)
  recall    <- ifelse((TP + FN) > 0, TP / (TP + FN), NA_real_)
  f1        <- ifelse(!is.na(precision + recall) && (precision + recall) > 0,
                      2 * precision * recall / (precision + recall), NA_real_)

  list(
    counts = data.frame(
      Threshold = threshold,
      TP = TP, FP = FP, TN = TN, FN = FN
    ),
    matrix = data.frame(
      `Predicted\\Actual` = c("Predicted Positive", "Predicted Negative"),
      `Actual Positive (1)` = c(TP, FN),
      `Actual Negative (0)` = c(FP, TN),
      check.names = FALSE
    ),
    metrics = data.frame(
      Threshold = threshold,
      Accuracy = round(accuracy, 4),
      Precision = round(precision, 4),
      Recall = round(recall, 4),
      F1 = round(f1, 4)
    )
  )
}
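A quick usage sketch of the helper (the 0.5 cutoff here is only an illustration); it returns the raw cell counts, a two-row matrix formatted for display, and the rounded metrics:

demo <- confusion_and_metrics(df_clean, threshold = 0.5)
demo$counts   # TP, FP, TN, FN at this cutoff
demo$metrics  # Accuracy, Precision, Recall, F1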
Task 2 — Confusion Matrices at 0.2, 0.5, 0.8
res_02 <- confusion_and_metrics(df_clean, threshold = 0.2)
res_05 <- confusion_and_metrics(df_clean, threshold = 0.5)
res_08 <- confusion_and_metrics(df_clean, threshold = 0.8)
kable(res_02$matrix, caption = "Confusion Matrix (Threshold = 0.2)")
Confusion Matrix (Threshold = 0.2)

| Predicted\Actual   | Actual Positive (1) | Actual Negative (0) |
|:-------------------|--------------------:|--------------------:|
| Predicted Positive |                  37 |                   6 |
| Predicted Negative |                   2 |                  48 |
kable(res_05$matrix, caption = "Confusion Matrix (Threshold = 0.5)")
Confusion Matrix (Threshold = 0.5)

| Predicted\Actual   | Actual Positive (1) | Actual Negative (0) |
|:-------------------|--------------------:|--------------------:|
| Predicted Positive |                  36 |                   3 |
| Predicted Negative |                   3 |                  51 |
kable(res_08$matrix, caption = "Confusion Matrix (Threshold = 0.8)")
Confusion Matrix (Threshold = 0.8)

| Predicted\Actual   | Actual Positive (1) | Actual Negative (0) |
|:-------------------|--------------------:|--------------------:|
| Predicted Positive |                  36 |                   2 |
| Predicted Negative |                   3 |                  52 |
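The same counts can also be assembled in one pass, a small sketch that reuses the helper over a vector of thresholds:

# Collect TP/FP/TN/FN for several cutoffs at once
thresholds <- c(0.2, 0.5, 0.8)
all_counts <- bind_rows(
  lapply(thresholds, function(t) confusion_and_metrics(df_clean, threshold = t)$counts)
)
kable(all_counts, caption = "TP/FP/TN/FN by Threshold")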
Task 3 — Accuracy, Precision, Recall, F1 (all thresholds)
metrics_table <- bind_rows(res_02$metrics, res_05$metrics, res_08$metrics)
kable(metrics_table, caption = "Performance Metrics by Threshold")
Performance Metrics by Threshold

| Threshold | Accuracy | Precision | Recall |     F1 |
|----------:|---------:|----------:|-------:|-------:|
|       0.2 |   0.9140 |    0.8605 | 0.9487 | 0.9024 |
|       0.5 |   0.9355 |    0.9231 | 0.9231 | 0.9231 |
|       0.8 |   0.9462 |    0.9474 | 0.9231 | 0.9351 |
# Identify the threshold with the highest F1 score
best <- metrics_table[which.max(metrics_table$F1), ]
kable(best, caption = "Best Threshold Based on F1 Score")
Best Threshold Based on F1 Score

|   | Threshold | Accuracy | Precision | Recall |     F1 |
|:--|----------:|---------:|----------:|-------:|-------:|
| 3 |       0.8 |   0.9462 |    0.9474 | 0.9231 | 0.9351 |
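To make the F1 arithmetic explicit, the 0.8 row can be recomputed by hand from its confusion matrix (TP = 36, FP = 2, FN = 3):

p <- 36 / 38          # precision = TP / (TP + FP)
r <- 36 / 39          # recall    = TP / (TP + FN)
2 * p * r / (p + r)   # F1, approximately 0.9351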
Task 4 — When to choose 0.2 vs 0.8
Short answer:

- A 0.2 threshold is useful when false negatives are costly and you want high recall: you would rather flag more penguins as female, even at the cost of extra false positives, than miss actual females. Think medical screening or an initial candidate filter.
- A 0.8 threshold is useful when false positives are costly and you need high precision: you only want to label a penguin as female when the model is very confident. Think resource-intensive follow-ups or publishing high-confidence results. (The threshold sweep sketched after this list shows how the two metrics trade off as the cutoff moves.)
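A minimal sketch of that tradeoff, reusing confusion_and_metrics() from above to sweep a grid of cutoffs and plot precision and recall side by side:

# Sweep thresholds from 0.05 to 0.95 and collect the metrics at each cutoff
sweep <- bind_rows(
  lapply(seq(0.05, 0.95, by = 0.05),
         function(t) confusion_and_metrics(df_clean, threshold = t)$metrics)
)

ggplot(sweep, aes(x = Threshold)) +
  geom_line(aes(y = Precision, colour = "Precision")) +
  geom_line(aes(y = Recall, colour = "Recall")) +
  labs(title = "Precision and Recall Across Thresholds",
       y = "Metric value", colour = NULL) +
  theme_minimal()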
Based on F1, the best-performing threshold here is 0.8 (F1 = 0.9351). F1 rewards a balance of precision and recall, but the final choice of threshold should still depend on whether false positives or false negatives are more costly for the application.