# Question 1 ----

# Load the model predictions; the file is read from the working directory.
penguins <- read_csv(file = "penguin_predictions.csv")
## Rows: 93 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): .pred_class, sex
## dbl (1): .pred_female
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Tag every prediction with two labels:
#   prediction_result - "True" when the predicted class equals the recorded
#                       sex, otherwise "False"
#   binary_pred       - "Positive" when the model predicted "female",
#                       otherwise "Negative" ("female" is the positive class)
penguins <- penguins |>
  mutate(
    prediction_result = ifelse(.pred_class == sex, "True", "False"),
    binary_pred = ifelse(.pred_class == "female", "Positive", "Negative")
  )

# Confusion-matrix cell counts: one row per (prediction_result, binary_pred)
# combination that occurs in the data, with the number of rows in column `n`.
# count() returns an ungrouped tibble, so no grouping leaks into later steps
# and no `.groups` message is printed.
penguins_summary <- penguins |>
  count(prediction_result, binary_pred)
# Number of observations in one confusion-matrix cell of `penguins_summary`.
#   result: value of prediction_result to match ("True" / "False")
#   pred:   value of binary_pred to match ("Positive" / "Negative")
# A combination that never occurs has no row in the summary; summing over the
# empty selection yields 0 rather than a zero-row result that would break the
# arithmetic below. which() also drops any NA comparisons.
cell_count <- function(result, pred) {
  hit <- penguins_summary$prediction_result == result &
    penguins_summary$binary_pred == pred
  sum(penguins_summary$n[which(hit)])
}

# False positives + true negatives: observations whose recorded sex is not
# "female". Reported as "male" below -- assumes sex and .pred_class only take
# the values "male" and "female"; confirm against the data.
fp_tn <- cell_count("False", "Positive") + cell_count("True", "Negative")

# False negatives + true positives: under the same two-class assumption,
# the observations whose recorded sex is "female".
tp_fn <- cell_count("False", "Negative") + cell_count("True", "Positive")

# nrow() gives a plain number; count(penguins) would give a one-cell tibble
# and turn percent_error into a list column.
n_total <- nrow(penguins)

# Share of each class as a percentage of all observations, rounded to two
# decimals. Kept as a plain data.frame with columns `id` and `percent_error`.
null_err <- data.frame(
  id = c("male", "female"),
  percent_error = round(c(fp_tn, tp_fn) / n_total * 100, digits = 2)
)




# Bar chart of percent_error for each id in `null_err`, one coloured bar per
# id with the legend suppressed. geom_col() draws bars whose heights are the
# y values themselves (same as geom_bar(stat = "identity")).
ggplot(null_err, aes(x = id, y = percent_error, fill = id)) +
  geom_col(width = 0.6, show.legend = FALSE) +
  labs(x = "actuals") +
  # Print the percentage in white; vjust = 1 anchors the top of the text at
  # the bar height so the label sits inside the bar.
  geom_text(
    aes(label = percent_error),
    colour = "white",
    size = 5.4,
    vjust = 1,
    position = position_dodge(0.1)
  )

D