# --- Load required packages ---
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(readr)
library(knitr)
# --- Load the CSV ---
penguins <- read_csv("https://raw.githubusercontent.com/acatlin/data/refs/heads/master/penguin_predictions.csv")
## Rows: 93 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): .pred_class, sex
## dbl (1): .pred_female
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
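If the column-specification message is unwanted on re-runs, readr can be told to stay quiet; a minimal optional sketch (not part of the original analysis):

# Optional: silence the column-specification message on future reads
penguins <- read_csv(
  "https://raw.githubusercontent.com/acatlin/data/refs/heads/master/penguin_predictions.csv",
  show_col_types = FALSE
)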
# Quick look
head(penguins)
## # A tibble: 6 × 3
##   .pred_female .pred_class sex
##          <dbl> <chr>       <chr>
## 1        0.992 female      female
## 2        0.954 female      female
## 3        0.985 female      female
## 4        0.187 male        female
## 5        0.995 female      female
## 6        1.00  female      female
# Count actual classes
class_counts <- penguins %>%
  count(sex)
# Calculate null error rate (majority class)
majority_class <- max(class_counts$n)
total_obs <- sum(class_counts$n)
null_error_rate <- 1 - (majority_class / total_obs)
null_error_rate
## [1] 0.4193548
# Distribution plot
ggplot(penguins, aes(x = sex, fill = sex)) +
  geom_bar() +
  theme_minimal() +
  labs(title = "Distribution of Actual Sex in Penguin Dataset",
       x = "Sex",
       y = "Count")
### Null Error Rate

The null error rate is the error rate we would get by always predicting the majority class. It provides a baseline for judging whether the model actually performs better than naive guessing.
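As a sanity check, the same number can be computed directly from the raw labels; this one-liner (a sketch, assuming the `penguins` tibble loaded above) should reproduce the 0.419 value:

# Null error rate computed directly from the actual labels:
# 1 minus the share of the majority class ("male", 54 of 93)
1 - max(table(penguins$sex)) / nrow(penguins)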
# Function to compute confusion matrix
conf_matrix <- function(df, threshold) {
  # Encode predictions and ground truth as 1 = female (positive class), 0 = male
  df <- df %>%
    mutate(pred_class   = ifelse(.pred_female > threshold, 1, 0),
           actual_class = ifelse(sex == "female", 1, 0))
  TP <- sum(df$pred_class == 1 & df$actual_class == 1)  # true positives
  FP <- sum(df$pred_class == 1 & df$actual_class == 0)  # false positives
  TN <- sum(df$pred_class == 0 & df$actual_class == 0)  # true negatives
  FN <- sum(df$pred_class == 0 & df$actual_class == 1)  # false negatives
  tibble(
    Threshold = threshold,
    TP = TP,
    FP = FP,
    TN = TN,
    FN = FN
  )
}
# Compute for thresholds 0.2, 0.5, 0.8
cm_02 <- conf_matrix(penguins, 0.2)
cm_05 <- conf_matrix(penguins, 0.5)
cm_08 <- conf_matrix(penguins, 0.8)
# Combine for display
confusion_matrices <- bind_rows(cm_02, cm_05, cm_08)
kable(confusion_matrices)
| Threshold | TP | FP | TN | FN |
|----------:|---:|---:|---:|---:|
|       0.2 | 37 |  6 | 48 |  2 |
|       0.5 | 36 |  3 | 51 |  3 |
|       0.8 | 36 |  2 | 52 |  3 |
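As a cross-check, the 0.5 row can also be reproduced with base R's `table()`. If `.pred_class` in the source file was generated with the usual 0.5 cutoff, the same counts should appear (a hedged sketch, not part of the original output):

# Cross-check of the 0.5-threshold confusion matrix using base R
# (rows = predicted class, columns = actual class)
with(penguins, table(predicted = ifelse(.pred_female > 0.5, "female", "male"),
                     actual = sex))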
# Function to calculate metrics
calc_metrics <- function(cm) {
  accuracy  <- (cm$TP + cm$TN) / (cm$TP + cm$FP + cm$TN + cm$FN)
  precision <- ifelse(cm$TP + cm$FP == 0, NA, cm$TP / (cm$TP + cm$FP))
  recall    <- ifelse(cm$TP + cm$FN == 0, NA, cm$TP / (cm$TP + cm$FN))
  f1 <- ifelse(is.na(precision) | is.na(recall) | (precision + recall) == 0, NA,
               2 * precision * recall / (precision + recall))
  tibble(
    Threshold = cm$Threshold,
    Accuracy  = round(accuracy, 3),
    Precision = round(precision, 3),
    Recall    = round(recall, 3),
    F1_Score  = round(f1, 3)
  )
}
metrics_table <- bind_rows(
  calc_metrics(cm_02),
  calc_metrics(cm_05),
  calc_metrics(cm_08)
)
kable(metrics_table)
| Threshold | Accuracy | Precision | Recall | F1_Score |
|----------:|---------:|----------:|-------:|---------:|
|       0.2 |    0.914 |     0.860 |  0.949 |    0.902 |
|       0.5 |    0.935 |     0.923 |  0.923 |    0.923 |
|       0.8 |    0.946 |     0.947 |  0.923 |    0.935 |
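For readers who prefer a packaged implementation, the 0.5-threshold metrics can be cross-checked with the yardstick package; this is a sketch under the assumption that yardstick is installed (it is not loaded above), with "female" as the first factor level and therefore the positive class:

# Optional cross-check with yardstick (assumes the package is installed)
library(yardstick)
penguins %>%
  mutate(truth    = factor(sex, levels = c("female", "male")),
         estimate = factor(ifelse(.pred_female > 0.5, "female", "male"),
                           levels = c("female", "male"))) %>%
  conf_mat(truth, estimate) %>%
  summary()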
Threshold 0.2: Useful when missing a positive case (a false negative) is costly. For example, when identifying female penguins for a breeding program, we want to catch as many females as possible, even if some males are incorrectly included.

Threshold 0.8: Useful when false positives are costly. For example, if only highly confident female predictions are sent for medical testing, we want fewer false alarms even if we miss some females.
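To see how these trade-offs move between the extremes, the helper functions defined above can be swept over a grid of thresholds; a short sketch (the 0.1-0.9 grid is an arbitrary choice, not from the original analysis):

# Sweep a grid of thresholds with the conf_matrix() and calc_metrics()
# helpers defined earlier to trace the precision/recall trade-off
thresholds <- seq(0.1, 0.9, by = 0.1)
threshold_sweep <- bind_rows(
  lapply(thresholds, function(t) calc_metrics(conf_matrix(penguins, t)))
)
kable(threshold_sweep)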