library(readr)
library(RCurl)
library(ggplot2)
# Read the dataset
URL <- "https://raw.githubusercontent.com/acatlin/data/master/penguin_predictions.csv"
URL_handle <- RCurl::getURL(URL)
# read.csv() on the downloaded text already returns a data frame
df <- read.csv(text = URL_handle, header = TRUE, sep = ",")
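# Since readr is already loaded, an equivalent one-step alternative
# (a sketch; read_csv can fetch the URL directly) would be:
# df <- as.data.frame(readr::read_csv(URL, show_col_types = FALSE))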
pillar::glimpse(df)
## Rows: 93
## Columns: 3
## $ .pred_female <dbl> 0.99217462, 0.95423945, 0.98473504, 0.18702056, 0.9947012…
## $ .pred_class <chr> "female", "female", "female", "male", "female", "female",…
## $ sex <chr> "female", "female", "female", "female", "female", "female…
print("This is the size of the dataframen and let's take a look at its contents")
## [1] "This is the size of the dataframen and let's take a look at its contents"
head(df)
# Calculate the frequency table of the actual outcome variable.
freq_table <- table(df$sex)
# Calculate the majority class count and total observations
majority_count <- max(freq_table)
total_count <- sum(freq_table)
# Compute the null error rate: the proportion of observations in the majority class.
null_error_rate <- majority_count / total_count
# Print the null error rate
print(paste("Null Error Rate:", round(null_error_rate, 3)))
## [1] "Null Error Rate: 0.581"
# The null error rate is 0.581 (58.1%), meaning that if we always
# predicted the majority class (male), we would be correct 58.1% of
# the time. The null error rate serves as a baseline performance
# metric: a classification model is only useful if it performs
# better than the null error rate.
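# As a quick sanity check (a minimal sketch, using the default 0.5
# threshold), we can compare the model's accuracy against this baseline:
model_accuracy <- mean(ifelse(df$.pred_female >= 0.5, "female", "male") == df$sex)
model_accuracy > null_error_rate  # TRUE means the model beats the baseline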
# Create a bar plot to show the distribution of the actual outcome variable
ggplot(df, aes(x = sex, fill = sex)) +
  geom_bar() +
  labs(title = "Distribution of Actual Sex in the Dataset",
       x = "Sex",
       y = "Count") +
  theme_minimal()

#2
# Confusion matrix
# Define the thresholds to evaluate
thresholds <- c(0.2, 0.5, 0.8)
# Loop through each threshold and compute the confusion matrix
for (thr in thresholds) {
  # Generate predicted class: if .pred_female is at least the threshold,
  # predict "female", otherwise "male"
  predicted <- ifelse(df$.pred_female >= thr, "female", "male")
  # Calculate confusion matrix components (treating "female" as positive)
  TP <- sum(predicted == "female" & df$sex == "female")
  FP <- sum(predicted == "female" & df$sex == "male")
  TN <- sum(predicted == "male" & df$sex == "male")
  FN <- sum(predicted == "male" & df$sex == "female")
  # Print out the confusion matrix in a readable format
  cat("-------------------------------------------------\n")
  cat("Threshold:", thr, "\n")
  cat("-------------------------------------------------\n")
  cat(sprintf("%20s %15s\n", "Actual: female", "Actual: male"))
  cat(sprintf("Predicted female: %5d %15d\n", TP, FP))
  cat(sprintf("Predicted male: %5d %15d\n", FN, TN))
  cat("\n")
}
## -------------------------------------------------
## Threshold: 0.2
## -------------------------------------------------
## Actual: female Actual: male
## Predicted female: 37 6
## Predicted male: 2 48
##
## -------------------------------------------------
## Threshold: 0.5
## -------------------------------------------------
## Actual: female Actual: male
## Predicted female: 36 3
## Predicted male: 3 51
##
## -------------------------------------------------
## Threshold: 0.8
## -------------------------------------------------
## Actual: female Actual: male
## Predicted female: 36 2
## Predicted male: 3 52
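# Cross-check (a minimal sketch): base R's table() builds the same
# confusion matrix in one call, shown here for the 0.5 threshold.
pred_05 <- factor(ifelse(df$.pred_female >= 0.5, "female", "male"),
                  levels = c("female", "male"))
table(Predicted = pred_05, Actual = factor(df$sex, levels = c("female", "male")))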
#3
# Create an empty data frame to store the metrics
metrics_table <- data.frame(
  Threshold = thresholds,
  Accuracy = NA,
  Precision = NA,
  Recall = NA,
  F1 = NA
)
# Loop through each threshold, compute the confusion matrix, and derive the metrics
for (i in seq_along(thresholds)) {
  thr <- thresholds[i]
  # Generate predicted class: if .pred_female is at least the threshold,
  # predict "female", otherwise "male"
  predicted <- ifelse(df$.pred_female >= thr, "female", "male")
  # Calculate confusion matrix components (treating "female" as positive)
  TP <- sum(predicted == "female" & df$sex == "female")
  FP <- sum(predicted == "female" & df$sex == "male")
  TN <- sum(predicted == "male" & df$sex == "male")
  FN <- sum(predicted == "male" & df$sex == "female")
  # Calculate the evaluation metrics
  accuracy <- (TP + TN) / (TP + TN + FP + FN)
  precision <- if ((TP + FP) > 0) TP / (TP + FP) else NA
  recall <- if ((TP + FN) > 0) TP / (TP + FN) else NA
  f1 <- if (!is.na(precision) && !is.na(recall) && (precision + recall) > 0) {
    2 * precision * recall / (precision + recall)
  } else {
    NA
  }
  # Store the metrics in the table
  metrics_table[i, "Accuracy"] <- round(accuracy, 3)
  metrics_table[i, "Precision"] <- round(precision, 3)
  metrics_table[i, "Recall"] <- round(recall, 3)
  metrics_table[i, "F1"] <- round(f1, 3)
}
# Display the resulting table
print(metrics_table)
## Threshold Accuracy Precision Recall F1
## 1 0.2 0.914 0.860 0.949 0.902
## 2 0.5 0.935 0.923 0.923 0.923
## 3 0.8 0.946 0.947 0.923 0.935
## At threshold 0.2 the recall is highest (~0.95), since more observations
## are classified as female, but the precision is lowest (~0.86). As the
## threshold increases to 0.8, precision (~0.95) and overall accuracy (0.946)
## improve, at the cost of a slight drop in recall (~0.92).
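# Optional cross-check (a sketch; assumes the caret package is installed,
# which is not loaded above): caret::confusionMatrix() with
# mode = "prec_recall" reports accuracy, precision, recall, and F1 for a
# single threshold and should agree with the 0.5 row of metrics_table.
library(caret)
pred_05 <- factor(ifelse(df$.pred_female >= 0.5, "female", "male"),
                  levels = c("female", "male"))
confusionMatrix(pred_05, factor(df$sex, levels = c("female", "male")),
                positive = "female", mode = "prec_recall")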