Setup
# If needed (in Console, not here): install.packages(c("readr","dplyr","ggplot2","knitr"))
library(readr)
library(dplyr)
library(ggplot2)
library(knitr)
Load Data
# Option A: Load directly from GitHub (recommended)
df <- read_csv(
  "https://raw.githubusercontent.com/acatlin/data/master/penguin_predictions.csv",
  show_col_types = FALSE
)
# Option B: Local file (uncomment if you saved penguin_predictions.csv alongside this Rmd)
# df <- read_csv("penguin_predictions.csv", show_col_types = FALSE)
glimpse(df)
## Rows: 93
## Columns: 3
## $ .pred_female <dbl> 0.99217462, 0.95423945, 0.98473504, 0.18702056, 0.9947012…
## $ .pred_class <chr> "female", "female", "female", "male", "female", "female",…
## $ sex <chr> "female", "female", "female", "female", "female", "female…
head(df, 10)
## # A tibble: 10 × 3
## .pred_female .pred_class sex
## <dbl> <chr> <chr>
## 1 0.992 female female
## 2 0.954 female female
## 3 0.985 female female
## 4 0.187 male female
## 5 0.995 female female
## 6 1.000 female female
## 7 0.959 female female
## 8 1.000 female female
## 9 1.000 female female
## 10 0.339 male female
Prepare Columns
# The file includes:
# - .pred_female : model probability of "female"
# - .pred_class  : predicted class label ("female"/"male")
# - sex          : actual label ("female"/"male")
df_clean <- df %>%
  mutate(
    actual = if_else(sex == "female", 1L, 0L),  # 1 = female (positive class), 0 = male
    prob   = `.pred_female`                     # shorter alias for the probability column
  )
# Quick sanity check
df_clean %>% count(sex)
## # A tibble: 2 × 2
## sex n
## <chr> <int>
## 1 female 39
## 2 male 54
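As an optional cross-check (a small sketch, assuming `.pred_class` was generated with the usual 0.5 probability cutoff), the stored class labels can be compared against thresholding the probabilities directly:

# Proportion of rows where thresholding .pred_female at 0.5 reproduces .pred_class
df_clean %>%
  mutate(pred_at_50 = if_else(prob >= 0.5, "female", "male")) %>%
  summarise(agreement = mean(pred_at_50 == .pred_class))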
Task 1 — Null Error Rate + Actual Distribution Plot
# Majority class proportion (baseline accuracy) and null error rate
class_counts <- df_clean %>% count(sex, name = "n")
N <- sum(class_counts$n)
majority_prop <- max(class_counts$n) / N
null_error_rate <- 1 - majority_prop
# Show numbers nicely
kable(class_counts, caption = "Actual Class Distribution (sex)")
Actual Class Distribution (sex)

| sex    |  n |
|:-------|---:|
| female | 39 |
| male   | 54 |
kable(
  data.frame(
    N = N,
    majority_class_accuracy = round(majority_prop, 4),
    null_error_rate = round(null_error_rate, 4)
  ),
  caption = "Null Error Rate Summary"
)
Null Error Rate Summary

|  N | majority_class_accuracy | null_error_rate |
|---:|------------------------:|----------------:|
| 93 |                  0.5806 |          0.4194 |
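As a quick arithmetic check of the summary above (numbers taken from the class counts): the majority class is male with 54 of 93 penguins, so always predicting male gives 54/93 accuracy and a 39/93 error rate.

# Hand check of the baseline: always predict the majority class ("male")
c(majority_class_accuracy = 54 / 93, null_error_rate = 39 / 93)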
# Plot the distribution of the actual variable
ggplot(df_clean, aes(x = sex)) +
  geom_bar() +
  labs(title = "Distribution of Actual Class (sex)", x = "Actual Class", y = "Count") +
  theme_minimal()

Helper — Confusion Matrix + Metrics at a Threshold
confusion_and_metrics <- function(data, threshold = 0.5) {
  # Classify as positive (female = 1) when the predicted probability
  # meets or exceeds the threshold
  preds  <- if_else(data$prob >= threshold, 1L, 0L)
  actual <- data$actual

  # Confusion-matrix cell counts
  TP <- sum(preds == 1L & actual == 1L)
  FP <- sum(preds == 1L & actual == 0L)
  TN <- sum(preds == 0L & actual == 0L)
  FN <- sum(preds == 0L & actual == 1L)

  # Metrics, guarding against division by zero
  accuracy  <- (TP + TN) / (TP + FP + TN + FN)
  precision <- ifelse((TP + FP) > 0, TP / (TP + FP), NA_real_)
  recall    <- ifelse((TP + FN) > 0, TP / (TP + FN), NA_real_)
  f1        <- ifelse(!is.na(precision + recall) && (precision + recall) > 0,
                      2 * precision * recall / (precision + recall), NA_real_)

  list(
    counts = data.frame(
      Threshold = threshold,
      TP = TP, FP = FP, TN = TN, FN = FN
    ),
    matrix = data.frame(
      `Predicted\\Actual` = c("Predicted Positive", "Predicted Negative"),
      `Actual Positive (1)` = c(TP, FN),
      `Actual Negative (0)` = c(FP, TN),
      check.names = FALSE
    ),
    metrics = data.frame(
      Threshold = threshold,
      Accuracy = round(accuracy, 4),
      Precision = round(precision, 4),
      Recall = round(recall, 4),
      F1 = round(f1, 4)
    )
  )
}
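A quick usage sketch of the helper (the 0.5 cutoff here is only an illustration); it returns the raw cell counts, a two-row matrix formatted for display, and the rounded metrics:

demo <- confusion_and_metrics(df_clean, threshold = 0.5)
demo$counts   # TP, FP, TN, FN at this cutoff
demo$metrics  # Accuracy, Precision, Recall, F1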
Task 2 — Confusion Matrices at 0.2, 0.5, 0.8
res_02 <- confusion_and_metrics(df_clean, threshold = 0.2)
res_05 <- confusion_and_metrics(df_clean, threshold = 0.5)
res_08 <- confusion_and_metrics(df_clean, threshold = 0.8)
kable(res_02$matrix, caption = "Confusion Matrix (Threshold = 0.2)")
Confusion Matrix (Threshold = 0.2)

| Predicted\Actual   | Actual Positive (1) | Actual Negative (0) |
|:-------------------|--------------------:|--------------------:|
| Predicted Positive |                  37 |                   6 |
| Predicted Negative |                   2 |                  48 |
kable(res_05$matrix, caption = "Confusion Matrix (Threshold = 0.5)")
Confusion Matrix (Threshold = 0.5)

| Predicted\Actual   | Actual Positive (1) | Actual Negative (0) |
|:-------------------|--------------------:|--------------------:|
| Predicted Positive |                  36 |                   3 |
| Predicted Negative |                   3 |                  51 |
kable(res_08$matrix, caption = "Confusion Matrix (Threshold = 0.8)")
Confusion Matrix (Threshold = 0.8)

| Predicted\Actual   | Actual Positive (1) | Actual Negative (0) |
|:-------------------|--------------------:|--------------------:|
| Predicted Positive |                  36 |                   2 |
| Predicted Negative |                   3 |                  52 |
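The same counts can also be assembled in one pass, a small sketch that reuses the helper over a vector of thresholds:

# Collect TP/FP/TN/FN for several cutoffs at once
thresholds <- c(0.2, 0.5, 0.8)
all_counts <- bind_rows(
  lapply(thresholds, function(t) confusion_and_metrics(df_clean, threshold = t)$counts)
)
kable(all_counts, caption = "TP/FP/TN/FN by Threshold")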
Task 3 — Accuracy, Precision, Recall, F1 (all thresholds)
metrics_table <- bind_rows(res_02$metrics, res_05$metrics, res_08$metrics)
kable(metrics_table, caption = "Performance Metrics by Threshold")
Performance Metrics by Threshold

| Threshold | Accuracy | Precision | Recall |     F1 |
|----------:|---------:|----------:|-------:|-------:|
|       0.2 |   0.9140 |    0.8605 | 0.9487 | 0.9024 |
|       0.5 |   0.9355 |    0.9231 | 0.9231 | 0.9231 |
|       0.8 |   0.9462 |    0.9474 | 0.9231 | 0.9351 |
# Identify the threshold with the highest F1 score
best <- metrics_table[which.max(metrics_table$F1), ]
kable(best, caption = "Best Threshold Based on F1 Score")
Best Threshold Based on F1 Score

|   | Threshold | Accuracy | Precision | Recall |     F1 |
|:--|----------:|---------:|----------:|-------:|-------:|
| 3 |       0.8 |   0.9462 |    0.9474 | 0.9231 | 0.9351 |
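To make the F1 arithmetic explicit, the 0.8 row can be recomputed by hand from its confusion matrix (TP = 36, FP = 2, FN = 3):

p <- 36 / 38          # precision = TP / (TP + FP)
r <- 36 / 39          # recall    = TP / (TP + FN)
2 * p * r / (p + r)   # F1, approximately 0.9351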
Task 4 — When to choose 0.2 vs 0.8
Short answer:

- A 0.2 threshold is useful when false negatives are costly and you want high recall: you would rather flag more penguins as female, even at the cost of extra false positives, than miss actual females. Think medical screening or an initial candidate filter.
- A 0.8 threshold is useful when false positives are costly and you need high precision: you only want to label a penguin as female when the model is very confident. Think resource-intensive follow-ups or publishing high-confidence results. (The threshold sweep sketched after this list shows how the two metrics trade off as the cutoff moves.)
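A minimal sketch of that tradeoff, reusing confusion_and_metrics() from above to sweep a grid of cutoffs and plot precision and recall side by side:

# Sweep thresholds from 0.05 to 0.95 and collect the metrics at each cutoff
sweep <- bind_rows(
  lapply(seq(0.05, 0.95, by = 0.05),
         function(t) confusion_and_metrics(df_clean, threshold = t)$metrics)
)

ggplot(sweep, aes(x = Threshold)) +
  geom_line(aes(y = Precision, colour = "Precision")) +
  geom_line(aes(y = Recall, colour = "Recall")) +
  labs(title = "Precision and Recall Across Thresholds",
       y = "Metric value", colour = NULL) +
  theme_minimal()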
Based on F1, the best-performing threshold here is 0.8 (F1 = 0.9351). F1 rewards a balance of precision and recall, but the final choice of threshold should still depend on whether false positives or false negatives are more costly for the application.