# --- Load required packages ---
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(readr)
library(knitr)
# --- Load the CSV ---
penguins <- read_csv("https://raw.githubusercontent.com/acatlin/data/refs/heads/master/penguin_predictions.csv")
## Rows: 93 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): .pred_class, sex
## dbl (1): .pred_female
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Quick look
head(penguins)
## # A tibble: 6 × 3
##   .pred_female .pred_class sex   
##          <dbl> <chr>       <chr> 
## 1        0.992 female      female
## 2        0.954 female      female
## 3        0.985 female      female
## 4        0.187 male        female
## 5        0.995 female      female
## 6        1.00  female      female
# Count actual classes
class_counts <- penguins %>%
  count(sex)
# Calculate null error rate (majority class)
majority_class <- max(class_counts$n)
total_obs <- sum(class_counts$n)
null_error_rate <- 1 - (majority_class / total_obs)
null_error_rate
## [1] 0.4193548
# Distribution plot
ggplot(penguins, aes(x = sex, fill = sex)) +
  geom_bar() +
  theme_minimal() +
  labs(title = "Distribution of Actual Sex in Penguin Dataset",
       x = "Sex",
       y = "Count")
### Null Error Rate

The null error rate is the error rate we would get by always predicting the majority class. It provides a baseline for judging whether the model actually performs better than naive guessing. In this dataset the majority class is male (54 of the 93 penguins), so always predicting male would misclassify the 39 females, giving a null error rate of 39/93 ≈ 0.419.
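To make the baseline comparison concrete, here is a minimal sketch (an illustrative addition reusing the objects above, not part of the original output) that contrasts the null error rate with the model's error rate at the default 0.5 cutoff:

# --- Baseline comparison (illustrative sketch) ---
# Error rate of the model at a 0.5 cutoff: predict "female" when
# .pred_female > 0.5, then compare against the actual sex
model_error_rate_05 <- penguins %>%
  mutate(pred_sex = ifelse(.pred_female > 0.5, "female", "male")) %>%
  summarise(error_rate = mean(pred_sex != sex)) %>%
  pull(error_rate)

# A useful model should sit well below the ~0.419 null baseline
c(null = null_error_rate, model = model_error_rate_05)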
# Function to compute confusion matrix
conf_matrix <- function(df, threshold) {
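  # "female" is treated as the positive class: 1 = female, 0 = male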
  df <- df %>%
    mutate(pred_class = ifelse(.pred_female > threshold, 1, 0),
           actual_class = ifelse(sex == "female", 1, 0))
  
  TP <- sum(df$pred_class == 1 & df$actual_class == 1)
  FP <- sum(df$pred_class == 1 & df$actual_class == 0)
  TN <- sum(df$pred_class == 0 & df$actual_class == 0)
  FN <- sum(df$pred_class == 0 & df$actual_class == 1)
  
  tibble(
    Threshold = threshold,
    TP = TP,
    FP = FP,
    TN = TN,
    FN = FN
  )
}
# Compute for thresholds 0.2, 0.5, 0.8
cm_02 <- conf_matrix(penguins, 0.2)
cm_05 <- conf_matrix(penguins, 0.5)
cm_08 <- conf_matrix(penguins, 0.8)
# Combine for display
confusion_matrices <- bind_rows(cm_02, cm_05, cm_08)
kable(confusion_matrices)
| Threshold | TP | FP | TN | FN | 
|---|---|---|---|---|
| 0.2 | 37 | 6 | 48 | 2 | 
| 0.5 | 36 | 3 | 51 | 3 | 
| 0.8 | 36 | 2 | 52 | 3 | 
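As a quick sanity check (an illustrative addition, not part of the original output), each row of the table should account for all 93 observations:

# Sanity check: TP + FP + TN + FN should equal nrow(penguins) for every threshold
confusion_matrices %>%
  mutate(Total = TP + FP + TN + FN)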
# Function to calculate metrics
calc_metrics <- function(cm) {
  accuracy <- (cm$TP + cm$TN) / (cm$TP + cm$FP + cm$TN + cm$FN)
  precision <- ifelse(cm$TP + cm$FP == 0, NA, cm$TP / (cm$TP + cm$FP))
  recall <- ifelse(cm$TP + cm$FN == 0, NA, cm$TP / (cm$TP + cm$FN))
  f1 <- ifelse(is.na(precision) | is.na(recall) | (precision + recall) == 0, NA,
               2 * precision * recall / (precision + recall))
  
  tibble(
    Threshold = cm$Threshold,
    Accuracy = round(accuracy, 3),
    Precision = round(precision, 3),
    Recall = round(recall, 3),
    F1_Score = round(f1, 3)
  )
}
metrics_table <- bind_rows(
  calc_metrics(cm_02),
  calc_metrics(cm_05),
  calc_metrics(cm_08)
)
kable(metrics_table)
| Threshold | Accuracy | Precision | Recall | F1_Score | 
|---|---|---|---|---|
| 0.2 | 0.914 | 0.860 | 0.949 | 0.902 | 
| 0.5 | 0.935 | 0.923 | 0.923 | 0.923 | 
| 0.8 | 0.946 | 0.947 | 0.923 | 0.935 | 
Threshold 0.2: Useful when missing a positive case (a false negative) is costly, since a low threshold favors recall over precision. For example, when identifying female penguins for a breeding program, we want to catch as many females as possible, even if some males are falsely included.

Threshold 0.8: Useful when false positives are costly, since a high threshold favors precision over recall. For example, if only highly confident female predictions are sent for medical testing, we want fewer false alarms even if we miss some females.
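The same trade-off can be traced across the full range of cutoffs. A short sketch (an illustrative addition that reuses the conf_matrix() and calc_metrics() helpers defined above) sweeps a grid of thresholds and plots precision against recall:

# --- Illustrative threshold sweep (not part of the original analysis) ---
thresholds <- seq(0.1, 0.9, by = 0.05)

threshold_sweep <- bind_rows(lapply(thresholds, function(t) {
  calc_metrics(conf_matrix(penguins, t))
}))

ggplot(threshold_sweep, aes(x = Threshold)) +
  geom_line(aes(y = Precision, colour = "Precision")) +
  geom_line(aes(y = Recall, colour = "Recall")) +
  theme_minimal() +
  labs(title = "Precision and Recall Across Classification Thresholds",
       y = "Metric value",
       colour = "Metric")

As the threshold rises, precision tends to increase while recall falls, the same pattern visible in the 0.2 / 0.5 / 0.8 metrics table above.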