# --- Load required packages ---
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(readr)
library(knitr)
# --- Load the CSV ---
penguins <- read_csv("https://raw.githubusercontent.com/acatlin/data/refs/heads/master/penguin_predictions.csv")
## Rows: 93 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): .pred_class, sex
## dbl (1): .pred_female
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Quick look
head(penguins)
## # A tibble: 6 × 3
##   .pred_female .pred_class sex   
##          <dbl> <chr>       <chr> 
## 1        0.992 female      female
## 2        0.954 female      female
## 3        0.985 female      female
## 4        0.187 male        female
## 5        0.995 female      female
## 6        1.00  female      female
# Count actual classes
class_counts <- penguins %>%
  count(sex)
# Calculate null error rate (majority class)
majority_class <- max(class_counts$n)
total_obs <- sum(class_counts$n)
null_error_rate <- 1 - (majority_class / total_obs)
null_error_rate
## [1] 0.4193548
# Distribution plot
ggplot(penguins, aes(x = sex, fill = sex)) +
  geom_bar() +
  theme_minimal() +
  labs(title = "Distribution of Actual Sex in Penguin Dataset",
       x = "Sex",
       y = "Count")
### Null Error Rate

The null error rate is the error rate we would get by always predicting the majority class. It provides a baseline for judging whether the model actually performs better than naive guessing. In this dataset the majority class is male (54 of the 93 penguins), so always predicting male would misclassify the 39 females, giving a null error rate of 39/93 ≈ 0.419.
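To make the baseline comparison concrete, here is a minimal sketch (an illustrative addition reusing the objects above, not part of the original output) that contrasts the null error rate with the model's error rate at the default 0.5 cutoff:

# --- Baseline comparison (illustrative sketch) ---
# Error rate of the model at a 0.5 cutoff: predict "female" when
# .pred_female > 0.5, then compare against the actual sex
model_error_rate_05 <- penguins %>%
  mutate(pred_sex = ifelse(.pred_female > 0.5, "female", "male")) %>%
  summarise(error_rate = mean(pred_sex != sex)) %>%
  pull(error_rate)

# A useful model should sit well below the ~0.419 null baseline
c(null = null_error_rate, model = model_error_rate_05)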
# Function to compute confusion matrix
conf_matrix <- function(df, threshold) {
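  # "female" is treated as the positive class: 1 = female, 0 = male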
  df <- df %>%
    mutate(pred_class = ifelse(.pred_female > threshold, 1, 0),
           actual_class = ifelse(sex == "female", 1, 0))
  
  TP <- sum(df$pred_class == 1 & df$actual_class == 1)
  FP <- sum(df$pred_class == 1 & df$actual_class == 0)
  TN <- sum(df$pred_class == 0 & df$actual_class == 0)
  FN <- sum(df$pred_class == 0 & df$actual_class == 1)
  
  tibble(
    Threshold = threshold,
    TP = TP,
    FP = FP,
    TN = TN,
    FN = FN
  )
}
# Compute for thresholds 0.2, 0.5, 0.8
cm_02 <- conf_matrix(penguins, 0.2)
cm_05 <- conf_matrix(penguins, 0.5)
cm_08 <- conf_matrix(penguins, 0.8)
# Combine for display
confusion_matrices <- bind_rows(cm_02, cm_05, cm_08)
kable(confusion_matrices)
| Threshold | TP | FP | TN | FN | 
|---|---|---|---|---|
| 0.2 | 37 | 6 | 48 | 2 | 
| 0.5 | 36 | 3 | 51 | 3 | 
| 0.8 | 36 | 2 | 52 | 3 | 
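As a quick sanity check (an illustrative addition, not part of the original output), each row of the table should account for all 93 observations:

# Sanity check: TP + FP + TN + FN should equal nrow(penguins) for every threshold
confusion_matrices %>%
  mutate(Total = TP + FP + TN + FN)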
# Function to calculate metrics
calc_metrics <- function(cm) {
  accuracy <- (cm$TP + cm$TN) / (cm$TP + cm$FP + cm$TN + cm$FN)
  precision <- ifelse(cm$TP + cm$FP == 0, NA, cm$TP / (cm$TP + cm$FP))
  recall <- ifelse(cm$TP + cm$FN == 0, NA, cm$TP / (cm$TP + cm$FN))
  f1 <- ifelse(is.na(precision) | is.na(recall) | (precision + recall) == 0, NA,
               2 * precision * recall / (precision + recall))
  
  tibble(
    Threshold = cm$Threshold,
    Accuracy = round(accuracy, 3),
    Precision = round(precision, 3),
    Recall = round(recall, 3),
    F1_Score = round(f1, 3)
  )
}
metrics_table <- bind_rows(
  calc_metrics(cm_02),
  calc_metrics(cm_05),
  calc_metrics(cm_08)
)
kable(metrics_table)
| Threshold | Accuracy | Precision | Recall | F1_Score | 
|---|---|---|---|---|
| 0.2 | 0.914 | 0.860 | 0.949 | 0.902 | 
| 0.5 | 0.935 | 0.923 | 0.923 | 0.923 | 
| 0.8 | 0.946 | 0.947 | 0.923 | 0.935 | 
Threshold 0.2: Useful when missing a positive case (a false negative) is costly, since a low threshold favors recall over precision. For example, when identifying female penguins for a breeding program, we want to catch as many females as possible, even if some males are falsely included.

Threshold 0.8: Useful when false positives are costly, since a high threshold favors precision over recall. For example, if only highly confident female predictions are sent for medical testing, we want fewer false alarms even if we miss some females.
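The same trade-off can be traced across the full range of cutoffs. A short sketch (an illustrative addition that reuses the conf_matrix() and calc_metrics() helpers defined above) sweeps a grid of thresholds and plots precision against recall:

# --- Illustrative threshold sweep (not part of the original analysis) ---
thresholds <- seq(0.1, 0.9, by = 0.05)

threshold_sweep <- bind_rows(lapply(thresholds, function(t) {
  calc_metrics(conf_matrix(penguins, t))
}))

ggplot(threshold_sweep, aes(x = Threshold)) +
  geom_line(aes(y = Precision, colour = "Precision")) +
  geom_line(aes(y = Recall, colour = "Recall")) +
  theme_minimal() +
  labs(title = "Precision and Recall Across Classification Thresholds",
       y = "Metric value",
       colour = "Metric")

As the threshold rises, precision tends to increase while recall falls, the same pattern visible in the 0.2 / 0.5 / 0.8 metrics table above.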