In this homework assignment, you will work through various classification metrics. You will be asked to create functions in R to carry out the various calculations. You will also investigate some functions in packages that will let you obtain the equivalent results. Finally, you will create graphical output that also can be used to evaluate the output of classification models, such as binary logistic regression.
Supplemental Material
Deliverables (100 Points)
Instructions
Complete each of the following steps as instructed:
# Reading in the data.
url <- "https://raw.githubusercontent.com/jhnboyy/CUNY_SPS_WORK/refs/heads/main/Spring2025/DATA621/DATA621_Homework2/classification-output-data.csv"
df <- read.csv(url)
head(df)
## pregnant glucose diastolic skinfold insulin bmi pedigree age class
## 1 7 124 70 33 215 25.5 0.161 37 0
## 2 2 122 76 27 200 35.9 0.483 26 0
## 3 3 107 62 13 48 22.9 0.678 23 1
## 4 1 91 64 24 0 29.2 0.192 21 0
## 5 4 83 86 19 0 29.3 0.317 34 0
## 6 1 100 74 12 46 19.5 0.149 28 0
## scored.class scored.probability
## 1 0 0.32845226
## 2 0 0.27319044
## 3 0 0.10966039
## 4 0 0.05599835
## 5 0 0.10049072
## 6 0 0.05515460
Use the table() function to get the raw confusion matrix for this scored dataset. Make sure you understand the output. In particular, do the rows represent the actual or predicted class? The columns?
Output breakdown (treating 1 as the positive class and 0 as the negative class, matching the convention used in the functions below):
The table shown below correctly classifies about 81% of the rows (146 of 181) and misclassifies about 19% (35 of 181); these are raw counts, computed before applying the formal accuracy formula. The correct classifications are 27 true positives and 119 true negatives, while the incorrect classifications are 30 false negatives and 5 false positives. The rows of the table represent the actual (observed) classes, while the columns represent the predicted classes.
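The raw matrix below can be reproduced with a call along these lines (the chunk that produced it is not shown, so the exact call is assumed; the argument order matches the helper functions used later, putting the actual class in the rows and the predicted class in the columns):
table(df$class, df$scored.class)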
##
## 0 1
## 0 119 5
## 1 30 27
\[ \text{Accuracy} = \frac{TP + TN}{TP + FP + TN + FN} \]
## Function creation
library(dplyr) # needed for filter() and pull() in the helper functions below
accuracy_function <- function(df){
confusion_matrix <- table(df$class, df$scored.class)
cm_df <- data.frame(confusion_matrix)
colnames(cm_df)[colnames(cm_df) == "Var1"] <- "Actual"
colnames(cm_df)[colnames(cm_df) == "Var2"] <- "Predicted"
TP <- cm_df |> filter(Actual == 1, Predicted == 1) |> select(Freq) |> pull(Freq)
TN <- cm_df |> filter(Actual == 0, Predicted == 0) |> select(Freq) |> pull(Freq)
FP <- cm_df |> filter(Actual == 0, Predicted == 1) |> select(Freq) |> pull(Freq)
FN <- cm_df |> filter(Actual == 1, Predicted == 0) |> select(Freq) |> pull(Freq)
accuracy <- (TP + TN) / (TP + FN + TN + FP)
return(accuracy)
}
## Usage Testing
accuracy <- accuracy_function(df)
#print(accuracy)
print(paste(round(accuracy * 100, 2), "% accurate."))
## [1] "80.66 % accurate."
\[ \text{Classification Error Rate} = \frac{FP + FN}{TP + FP + TN + FN} \]
Verify that the accuracy and the classification error rate sum to one.
## Function Creation
error_rate_func <- function(df){
confusion_matrix <- table(df$class, df$scored.class)
cm_df <- data.frame(confusion_matrix)
colnames(cm_df)[colnames(cm_df) == "Var1"] <- "Actual"
colnames(cm_df)[colnames(cm_df) == "Var2"] <- "Predicted"
TP <- cm_df |> filter(Actual == 1, Predicted == 1) |> select(Freq) |> pull(Freq)
TN <- cm_df |> filter(Actual == 0, Predicted == 0) |> select(Freq) |> pull(Freq)
FP <- cm_df |> filter(Actual == 0, Predicted == 1) |> select(Freq) |> pull(Freq)
FN <- cm_df |> filter(Actual == 1, Predicted == 0) |> select(Freq) |> pull(Freq)
error_rate <- (FP + FN) / (TP + FP + TN + FN)
return(error_rate)
}
## Testing Usage
error_rate <- error_rate_func(df)
accuracy <- accuracy_function(df)
print(paste("The error rate of the data is", round((error_rate * 100), 2), "percent."))
## [1] "The error rate of the data is 19.34 percent."
## [1] "Verification: Accuracy + Error Rate = 1"
\[ \text{Precision} = \frac{TP}{TP + FP} \]
# Function Creation
precision_function <- function(df){
confusion_matrix <- table(df$class, df$scored.class)
cm_df <- data.frame(confusion_matrix)
colnames(cm_df)[colnames(cm_df) == "Var1"] <- "Actual"
colnames(cm_df)[colnames(cm_df) == "Var2"] <- "Predicted"
TP <- cm_df |> filter(Actual == 1, Predicted == 1) |> select(Freq) |> pull(Freq)
FP <- cm_df |> filter(Actual == 0, Predicted == 1) |> select(Freq) |> pull(Freq)
precision <- TP / (TP + FP)
return(precision)
}
# Test Usage
precision <- precision_function(df)
print(paste("The precision of the predictions in the data is", round((precision * 100), 2), "percent."))
## [1] "The precision of the predictions in the data is 84.38 percent."
\[ \text{Sensitivity} = \frac{TP}{TP + FN} \]
# Function Creation
sensitivity_function <- function(df){
confusion_matrix <- table(df$class, df$scored.class)
cm_df <- data.frame(confusion_matrix)
colnames(cm_df)[colnames(cm_df) == "Var1"] <- "Actual"
colnames(cm_df)[colnames(cm_df) == "Var2"] <- "Predicted"
TP <- cm_df |> filter(Actual == 1, Predicted == 1) |> select(Freq) |> pull(Freq)
FN <- cm_df |> filter(Actual == 1, Predicted == 0) |> select(Freq) |> pull(Freq)
sensitivity <- TP / (TP + FN)
return(sensitivity)
}
# Test Usage
sensitivity <- sensitivity_function(df)
#print(sensitivity)
print(paste("The prediction sensitivity for this data is", round((sensitivity * 100), 2), "percent."))
## [1] "The prediction sensitivity for this data is 47.37 percent."
\[ \text{Specificity} = \frac{TN}{TN + FP} \]
# Function Creation
specificity_function <- function(df){
confusion_matrix <- table(df$class, df$scored.class)
cm_df <- data.frame(confusion_matrix)
colnames(cm_df)[colnames(cm_df) == "Var1"] <- "Actual"
colnames(cm_df)[colnames(cm_df) == "Var2"] <- "Predicted"
TN <- cm_df |> filter(Actual == 0, Predicted == 0) |> select(Freq) |> pull(Freq)
FP <- cm_df |> filter(Actual == 0, Predicted == 1) |> select(Freq) |> pull(Freq)
specificity <- TN / (TN + FP)
return(specificity)
}
# Test Usage
specificity <- specificity_function(df)
#print(specificity)
print(paste("The prediction specificity for this data is", round((specificity * 100), 2), "percent."))
## [1] "The prediction specificity for this data is 95.97 percent."
\[ F1 \text{ Score} = \frac{2 \times \text{Precision} \times \text{Sensitivity}}{\text{Precision} + \text{Sensitivity}} \]
## Function Creation
f1_function <- function(df){
confusion_matrix <- table(df$class, df$scored.class)
cm_df <- data.frame(confusion_matrix)
colnames(cm_df)[colnames(cm_df) == "Var1"] <- "Actual"
colnames(cm_df)[colnames(cm_df) == "Var2"] <- "Predicted"
TP <- cm_df |> filter(Actual == 1, Predicted == 1) |> select(Freq) |> pull(Freq)
FP <- cm_df |> filter(Actual == 0, Predicted == 1) |> select(Freq) |> pull(Freq)
FN <- cm_df |> filter(Actual == 1, Predicted == 0) |> select(Freq) |> pull(Freq)
# Compute precision and sensitivity
sensitivity <- TP / (TP + FN)
precision <- TP / (TP + FP)
# Compute F1-score
f1_score <- (2 * precision * sensitivity) / (precision + sensitivity)
return(f1_score)
}
## Function Usage
f1_score <- f1_function(df)
#print(f1_score)
print(paste("The F1 score for this data's predictions is", round(f1_score, 2)))
## [1] "The F1 score for this data's predictions is 0.61"
An F1 score always lies between 0 and 1. Sensitivity is the count of true positives divided by all actual positives (TP + FN), and precision is the count of true positives divided by all predicted positives (TP + FP), so each is a proportion in [0, 1]. Feeding two such proportions into the F1 formula keeps the result bounded, as the argument below makes precise.
Precision (P) = TP / (TP + FP) and Sensitivity (S) = TP / (TP + FN) are both in [0,1]. The F1 formula, 2PS / (P + S), is constrained: if P = 0 or S = 0, F1 = 0; if P = S = 1, F1 = 1. For 0 < P, S < 1, 2PS < P + S (since PS < (P + S)/2), so F1 < 1.
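A compact way to confirm the upper bound (assuming P > 0 and S > 0) uses the AM-GM inequality, P + S ≥ 2·sqrt(PS):
\[ 0 \le F_1 = \frac{2PS}{P+S} \le \frac{2PS}{2\sqrt{PS}} = \sqrt{PS} \le 1 \]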
roc_function <- function(df) {
thresholds <- seq(0, 1, by = 0.01)
n_thresh <- length(thresholds)
# Preallocate TPR and FPR vectors
tpr <- numeric(n_thresh)
fpr <- numeric(n_thresh)
# Compute confusion matrix components
for (i in seq_along(thresholds)) {
pred_class <- df$scored.probability >= thresholds[i]
TP <- sum(df$class == 1 & pred_class)
FN <- sum(df$class == 1 & !pred_class)
FP <- sum(df$class == 0 & pred_class)
TN <- sum(df$class == 0 & !pred_class)
# Compute TPR and FPR
tpr[i] <- ifelse(TP + FN > 0, TP / (TP + FN), 0)
fpr[i] <- ifelse(FP + TN > 0, FP / (FP + TN), 0)
}
# Compute AUC using the trapezoidal rule and ensure positive value
auc_value <- abs(sum(diff(fpr) * (tpr[-1] + tpr[-length(tpr)]) / 2))
# Plot ROC Curve
plot(fpr, tpr, type = "l", col = "blue", lwd = 2,
xlab = "False Positive Rate", ylab = "True Positive Rate",
main = "Custom ROC Curve")
abline(a = 0, b = 1, col = "gray", lty = 2)
text(0.6, 0.2, paste("AUC =", round(auc_value, 4)), col = "blue")
return(list(fpr = fpr, tpr = tpr, auc = auc_value))
}
# Test function
roc_result <- roc_function(df)
cat("Custom AUC:", round(roc_result$auc, 4), "\n")
## Custom AUC: 0.8489
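As a sanity check on the grid-based estimate (a sketch not in the original), the AUC can also be computed directly from its rank interpretation: the probability that a randomly chosen positive case is scored above a randomly chosen negative case, with ties counted as one half.
# Rank-based (Mann-Whitney) AUC: no threshold grid required
pos <- df$scored.probability[df$class == 1]
neg <- df$scored.probability[df$class == 0]
auc_rank <- mean(outer(pos, neg, ">") + 0.5 * outer(pos, neg, "=="))
round(auc_rank, 4) # should land close to the grid-based AUC above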
# Load libraries
library(knitr)
library(kableExtra)
# Calculate all classification metrics
accuracy <- accuracy_function(df)
error_rate <- error_rate_func(df)
precision <- precision_function(df)
sensitivity <- sensitivity_function(df)
specificity <- specificity_function(df)
f1_score <- f1_function(df)
# Create a data frame for displaying results, rounded to 2 decimal places;
# percent-based metrics are scaled to %, while F1 and AUC stay on their native 0-1 scale
metrics_df <- data.frame(
Metric = c("Accuracy (%)", "Error Rate (%)", "Precision (%)", "Sensitivity / Recall (%)",
"Specificity (%)", "F1 Score", "AUC"),
Value = round(c(accuracy * 100, error_rate * 100,
precision * 100, sensitivity * 100,
specificity * 100, f1_score,
roc_result$auc), 2)
)
cat("### Classification Metrics from Custom Functions\n")
## ### Classification Metrics from Custom Functions
kable(metrics_df, format = "html",
col.names = c("Metric", "Value"),
align = c("l", "r")) %>% # Ensures right-alignment of values
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = FALSE, position = "center") %>%
column_spec(1, bold = TRUE) %>%
column_spec(2, width = "10em")
| Metric | Value |
|---|---|
| Accuracy (%) | 80.66 |
| Error Rate (%) | 19.34 |
| Precision (%) | 84.38 |
| Sensitivity / Recall (%) | 47.37 |
| Specificity (%) | 95.97 |
| F1 Score | 0.61 |
| AUC | 0.85 |
# Verify that Accuracy + Error Rate = 1
verification_df <- data.frame(
Verification = "Accuracy + Error Rate",
Value = round(accuracy + error_rate, 2)
)
cat("\n### Verification\n")
##
## ### Verification
kable(verification_df, format = "html",
align = c("l", "r")) %>% # Ensures right-alignment
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE, position = "center") %>%
column_spec(1, bold = TRUE) %>%
column_spec(2, width = "10em")
| Verification | Value |
|---|---|
| Accuracy + Error Rate | 1 |
# Load libraries
library(caret)
library(knitr)
library(kableExtra)
# Prepare the data for caret's confusion matrix
actual <- factor(df$class, levels = c(0, 1), labels = c("Negative", "Positive"))
predicted <- factor(df$scored.class, levels = c(0, 1), labels = c("Negative", "Positive"))
# Generate confusion matrix using caret
caret_conf_matrix <- confusionMatrix(data = predicted, reference = actual, positive = "Positive")
# Extract accuracy, sensitivity, and specificity using caret functions
caret_accuracy <- caret_conf_matrix$overall["Accuracy"]
caret_sensitivity <- caret::sensitivity(data = predicted, reference = actual, positive = "Positive")
# Note: caret::specificity() identifies the negative class via `negative`, not `positive`
caret_specificity <- caret::specificity(data = predicted, reference = actual, negative = "Negative")
# Calculate custom function results
custom_accuracy <- accuracy_function(df)
custom_sensitivity <- sensitivity_function(df)
custom_specificity <- specificity_function(df)
# Create data frame for caret results
caret_metrics_df <- data.frame(
Metric = c("Caret Accuracy", "Caret Sensitivity", "Caret Specificity"),
Value = c(round(caret_accuracy * 100, 2),
round(caret_sensitivity * 100, 2),
round(caret_specificity * 100, 2))
)
# Create data frame for custom function results
custom_metrics_df <- data.frame(
Metric = c("Custom Accuracy", "Custom Sensitivity", "Custom Specificity"),
Value = c(round(custom_accuracy * 100, 2),
round(custom_sensitivity * 100, 2),
round(custom_specificity * 100, 2))
)
# Create data frame for comparison
comparison_df <- data.frame(
Metric = c("Difference in Accuracy (Caret - Custom)",
"Difference in Sensitivity (Caret - Custom)",
"Difference in Specificity (Caret - Custom)"),
Value = c(round((caret_accuracy - custom_accuracy) * 100, 2),
round((caret_sensitivity - custom_sensitivity) * 100, 2),
round((caret_specificity - custom_specificity) * 100, 2))
)
# Print results as formatted tables
cat("### Caret Classification Metrics\n")
## ### Caret Classification Metrics
kable(caret_metrics_df, format = "html",
col.names = c("Metric", "Value (%)")) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = FALSE, position = "center") %>%
column_spec(1, bold = TRUE) %>%
column_spec(2, width = "10em")
| Metric | Value (%) |
|---|---|
| Caret Accuracy | 80.66 |
| Caret Sensitivity | 47.37 |
| Caret Specificity | 95.97 |
cat("\n### Custom Function Classification Metrics\n")
##
## ### Custom Function Classification Metrics
kable(custom_metrics_df, format = "html",
col.names = c("Metric", "Value (%)")) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = FALSE, position = "center") %>%
column_spec(1, bold = TRUE) %>%
column_spec(2, width = "10em")
| Metric | Value (%) |
|---|---|
| Custom Accuracy | 80.66 |
| Custom Sensitivity | 47.37 |
| Custom Specificity | 95.97 |
cat("\n### Comparison of Caret vs Custom Functions\n")
##
## ### Comparison of Caret vs Custom Functions
kable(comparison_df, format = "html",
col.names = c("Metric", "Value (%)")) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = FALSE, position = "center") %>%
column_spec(1, bold = TRUE) %>%
column_spec(2, width = "10em")
| Metric | Value (%) |
|---|---|
| Difference in Accuracy (Caret - Custom) | 0.0 |
| Difference in Sensitivity (Caret - Custom) | 0.0 |
| Difference in Specificity (Caret - Custom) | 0.0 |
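As one more cross-check (a short sketch reusing the caret_conf_matrix object created above; not part of the original output), the same sensitivity and specificity can be read straight from the byClass slot of the confusionMatrix result:
# Values stored inside the confusionMatrix object itself
round(caret_conf_matrix$byClass[c("Sensitivity", "Specificity")] * 100, 2)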
# Load libraries
library(pROC)
# Generate ROC curve using pROC
roc_obj <- roc(response = df$class, predictor = df$scored.probability, direction = "<")
# Plot ROC using pROC
plot(roc_obj,
main = "ROC Curve using pROC",
col = "blue",
lwd = 2,
print.auc = TRUE,
auc.polygon = TRUE,
grid = TRUE,
legacy.axes = TRUE, # plot 1 - specificity on the x-axis so the label below is accurate
xlab = "False Positive Rate (1 - Specificity)",
ylab = "True Positive Rate (Sensitivity)",
asp = 1)
# Extract AUC value from pROC
pROC_auc <- auc(roc_obj)
cat("AUC from pROC:", round(pROC_auc, 4), "\n")
## AUC from pROC: 0.8503
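Optionally (a sketch using pROC's ci.auc() function; this was not part of the original output), a confidence interval can be attached to the AUC estimate:
# 95% DeLong confidence interval for the AUC
ci.auc(roc_obj)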
# Manual ROC function (same grid-based approach as roc_function above, without the plot)
manual_roc <- function(df) {
thresholds <- seq(0, 1, by = 0.01)
n_thresh <- length(thresholds)
# TPR and FPR vectors
tpr <- numeric(n_thresh)
fpr <- numeric(n_thresh)
# Compute confusion matrix components
for (i in seq_along(thresholds)) {
pred_class <- df$scored.probability >= thresholds[i]
TP <- sum(df$class == 1 & pred_class)
FN <- sum(df$class == 1 & !pred_class)
FP <- sum(df$class == 0 & pred_class)
TN <- sum(df$class == 0 & !pred_class)
# Compute TPR and FPR
tpr[i] <- ifelse(TP + FN > 0, TP / (TP + FN), 0)
fpr[i] <- ifelse(FP + TN > 0, FP / (FP + TN), 0)
}
# Compute AUC using the trapezoidal rule and ensure it's always positive
auc_value <- abs(sum(diff(fpr) * (tpr[-1] + tpr[-length(tpr)]) / 2))
return(list(fpr = fpr, tpr = tpr, auc = auc_value))
}
# Run manual ROC calculation
manual_result <- manual_roc(df)
cat("Manual AUC:", round(manual_result$auc, 4), "\n")
## Manual AUC: 0.8489
# Plot manual ROC curve
plot(manual_result$fpr, manual_result$tpr,
type = "l",
col = "red",
lwd = 2,
xlab = "False Positive Rate",
ylab = "True Positive Rate",
main = "Manual ROC Curve")
abline(a = 0, b = 1, col = "gray", lty = 2)
text(0.6, 0.2, paste("AUC =", round(manual_result$auc, 4)), col = "red")
cat("\nComparison:\n")
cat("Difference in AUC (pROC - Manual):", round(pROC_auc - manual_result$auc, 4), "\n")
##
## Comparison:
## Difference in AUC (pROC - Manual): 0.0014
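To compare the two curves visually (a small sketch reusing roc_obj and manual_result from above; not part of the original output):
# Overlay the manual grid-based curve (red, dashed) on the pROC curve (blue)
plot(roc_obj, legacy.axes = TRUE, col = "blue", lwd = 2,
     main = "pROC vs Manual ROC Curve")
lines(manual_result$fpr, manual_result$tpr, col = "red", lwd = 2, lty = 2)
legend("bottomright", legend = c("pROC", "Manual grid"),
       col = c("blue", "red"), lty = c(1, 2), lwd = 2)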