The following R Markdown document contains CUNY SPS DATA 621 Fall 2025 context for the Homework 02 assignment.
library(tidyverse)
library(knitr)
library(caret)
library(pROC)
# Import the provided data
classification_raw <- read_csv("https://raw.githubusercontent.com/evanskaylie/DATA621/refs/heads/main/classification-output-data.csv")
# Preview the provided data
head(classification_raw)
## # A tibble: 6 × 11
## pregnant glucose diastolic skinfold insulin bmi pedigree age class
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7 124 70 33 215 25.5 0.161 37 0
## 2 2 122 76 27 200 35.9 0.483 26 0
## 3 3 107 62 13 48 22.9 0.678 23 1
## 4 1 91 64 24 0 29.2 0.192 21 0
## 5 4 83 86 19 0 29.3 0.317 34 0
## 6 1 100 74 12 46 19.5 0.149 28 0
## # ℹ 2 more variables: scored.class <dbl>, scored.probability <dbl>
colSums(is.na(classification_raw))
## pregnant glucose diastolic skinfold
## 0 0 0 0
## insulin bmi pedigree age
## 0 0 0 0
## class scored.class scored.probability
## 0 0 0
The data looks clean and there are no missing values.
Make sure you understand the output. In particular, do the rows represent the actual or predicted class? The columns? In the table below, the rows represent the predicted class (scored.class) and the columns represent the actual class (class).
# Save the columns we need to use
class_df <- classification_raw |>
  select(class, scored.class, scored.probability)

# Format the scored class and class features in a table
class_df |>
  select(scored.class, class) |>
  mutate(
    scored.class = recode(scored.class,
                          '0' = 'Predicted Negative',
                          '1' = 'Predicted Positive'),
    class = recode(class,
                   '0' = 'Actual Negative',
                   '1' = 'Actual Positive')
  ) |>
  table()
## class
## scored.class Actual Negative Actual Positive
## Predicted Negative 119 30
## Predicted Positive 5 27
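As a quick check on this orientation, the arguments to table() can be named, and the first argument always fills the rows (the Predicted/Actual labels below are my own, not part of the assignment output):

# Named arguments become the dimension labels; the first argument defines the rows
table(Predicted = class_df$scored.class, Actual = class_df$class)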
# Function to get accuracy of predictions based on predictions and actual classifications
accuracy <- function(x){
  TP <- sum(x$class == 1 & x$scored.class == 1)
  TN <- sum(x$class == 0 & x$scored.class == 0)
  FP <- sum(x$class == 0 & x$scored.class == 1)
  FN <- sum(x$class == 1 & x$scored.class == 0)
  round((TP + TN) / (TP + FP + TN + FN), 4)
}
# Call the function
accuracy(class_df)
## [1] 0.8066
# Function to get classification error rate of predictions
class_error_rate <- function(x){
  TP <- sum(x$class == 1 & x$scored.class == 1)
  TN <- sum(x$class == 0 & x$scored.class == 0)
  FP <- sum(x$class == 0 & x$scored.class == 1)
  FN <- sum(x$class == 1 & x$scored.class == 0)
  round((FP + FN) / (TP + FP + TN + FN), 4)
}
# Call the function
class_error_rate(class_df)
## [1] 0.1934
Verify that the accuracy and the classification error rate sum to one. They must, because (TP + TN) + (FP + FN) equals the total number of observations, so the two fractions share the same denominator and their numerators add up to it.
# The two rates should add to 1
class_error_rate(class_df) + accuracy(class_df)
## [1] 1
# Function to get precision
precision <- function(x){
  TP <- sum(x$class == 1 & x$scored.class == 1)
  FP <- sum(x$class == 0 & x$scored.class == 1)
  round(TP / (TP + FP), 4)
}
# Call the function
precision(class_df)
## [1] 0.8438
# Function to get sensitivity
sensitivity <- function(x){
  TP <- sum(x$class == 1 & x$scored.class == 1)
  FN <- sum(x$class == 1 & x$scored.class == 0)
  round(TP / (TP + FN), 4)
}
# Call the function
sensitivity(class_df)
## [1] 0.4737
# Function to get specificity
specificity <- function(x){
  TN <- sum(x$class == 0 & x$scored.class == 0)
  FP <- sum(x$class == 0 & x$scored.class == 1)
  round(TN / (TN + FP), 4)
}
# Call the function
specificity(class_df)
## [1] 0.9597
# Function to get F1 score
f1_score <- function(x){
  TP <- sum(x$class == 1 & x$scored.class == 1)
  FP <- sum(x$class == 0 & x$scored.class == 1)
  FN <- sum(x$class == 1 & x$scored.class == 0)
  precision <- round(TP / (TP + FP), 4)
  sensitivity <- round(TP / (TP + FN), 4)
  round((2 * precision * sensitivity) / (precision + sensitivity), 4)
}
# Call the function
f1_score(class_df)
## [1] 0.6068
precision = True Positives / (True Positives + False Positives)
sensitivity = True Positives / (True Positives + False Negatives)
F1 Score = (2 * precision * sensitivity) / (precision + sensitivity)

Because the numerator of both precision and sensitivity (true positives) also appears in each denominator, and all of the counts are non-negative, both metrics always fall between 0 and 1.

Precision:
With no false positives (and at least one true positive), precision is 1.
With no true positives (but some false positives), precision is 0.

Sensitivity:
With no false negatives (and at least one true positive), sensitivity is 1.
With no true positives (but some false negatives), sensitivity is 0.

F1 Score:
The F1 score is the harmonic mean of precision and sensitivity, so it can never exceed the larger of the two; when both equal 1, the F1 score reaches its maximum of 1.
When there are no true positives, precision and sensitivity are both 0 and the F1 score is undefined (division by zero).
So, whenever it is defined, the F1 score lies between 0 (exclusive) and 1 (inclusive). A quick numerical check is sketched below.
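As a sanity check on these bounds, here is a minimal sketch of my own (not required by the assignment) that draws random confusion-matrix counts with at least one true positive and confirms every resulting F1 score stays in (0, 1]:

# Simulate random confusion-matrix counts and check the F1 bounds
set.seed(621)
sim_f1 <- replicate(10000, {
  TP <- sample(1:100, 1)   # at least one true positive so F1 is defined
  FP <- sample(0:100, 1)
  FN <- sample(0:100, 1)
  p <- TP / (TP + FP)
  s <- TP / (TP + FN)
  (2 * p * s) / (p + s)
})
# Both checks should return TRUE
all(sim_f1 > 0)
all(sim_f1 <= 1)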
# Build function to get ROC curve
generate_roc_curve <- function(df, true_col, prob_col) {
  # Calculate ROC curve and AUC
  roc_curve <- roc(df[[true_col]], df[[prob_col]])
  auc_value <- auc(roc_curve)
  # Plot ROC curve; legacy.axes = TRUE plots 1 - specificity so the
  # x-axis matches the False Positive Rate label
  plot(roc_curve,
       legacy.axes = TRUE,
       main = "ROC Curve",
       xlab = "False Positive Rate",
       ylab = "True Positive Rate",
       col = "salmon",
       lwd = 2)
  abline(h = seq(0, 1, by = 0.1), v = seq(0, 1, by = 0.1), col = "lightgray", lty = 2)
  grid()
  # Return a list with the ROC object and the AUC value
  result_list <- list(ROC_Curve_Plot = roc_curve, AUC = auc_value)
  return(result_list)
}
# Call the function
result <- generate_roc_curve(class_df, "class", "scored.probability")
# Save metrics from the functions above in a named vector
metrics <- c(accuracy(class_df), class_error_rate(class_df), precision(class_df),
             sensitivity(class_df), specificity(class_df), f1_score(class_df))
# Label each metric
names(metrics) <- c("Accuracy", "Classification Error Rate", "Precision",
                    "Sensitivity", "Specificity", "F1 Score")
# Display the metrics as a table with kable
kable(metrics, col.names = "Metrics")
|                           | Metrics |
|---------------------------|---------|
| Accuracy                  | 0.8066  |
| Classification Error Rate | 0.1934  |
| Precision                 | 0.8438  |
| Sensitivity               | 0.4737  |
| Specificity               | 0.9597  |
| F1 Score                  | 0.6068  |
# Compute the confusion matrix with caret, treating 1 as the positive class
cmatrix <- confusionMatrix(data = as.factor(class_df$scored.class),
                           reference = as.factor(class_df$class),
                           positive = "1")
# Get values from the caret confusion matrix
caret_package <- c(round(cmatrix$overall["Accuracy"], 4),
                   round(cmatrix$byClass["Sensitivity"], 4),
                   round(cmatrix$byClass["Specificity"], 4))
# Get values from the functions above
above_functions <- c(accuracy(class_df), sensitivity(class_df), specificity(class_df))
# Create a table to compare the two sets of values
comparison_table <- cbind(caret_package, above_functions)
kable(comparison_table)
|             | caret_package | above_functions |
|-------------|---------------|-----------------|
| Accuracy    | 0.8066        | 0.8066          |
| Sensitivity | 0.4737        | 0.4737          |
| Specificity | 0.9597        | 0.9597          |
My functions return the same accuracy, sensitivity, and specificity values as the caret package.
# Create plot with pROC package
par(mfrow = c(1, 2))
plot(roc(class_df$class, class_df$scored.probability),
     legacy.axes = TRUE,  # plot 1 - specificity so the x-axis matches the FPR label
     main = "pROC Calculated Curve",
     xlab = "False Positive Rate",
     ylab = "True Positive Rate",
     col = "skyblue",
     lwd = 2)
abline(h = seq(0, 1, by = 0.1), v = seq(0, 1, by = 0.1), col = "lightgray", lty = 2)
grid()
# Compare that to my own plot
generate_roc_curve(class_df, "class", "scored.probability")
## $ROC_Curve_Plot
##
## Call:
## roc.default(response = df[[true_col]], predictor = df[[prob_col]])
##
## Data: df[[prob_col]] in 124 controls (df[[true_col]] 0) < 57 cases (df[[true_col]] 1).
## Area under the curve: 0.8503
##
## $AUC
## Area under the curve: 0.8503
The ROC curve produced by my function matches the one calculated directly with pROC, and both report an AUC of 0.8503.
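For a bit more intuition about what pROC is computing, below is a rough from-scratch cross-check of my own (manual_auc is a hypothetical helper, not part of the assignment): it sweeps probability thresholds from 0 to 1, computes the true positive rate and false positive rate at each threshold, and approximates the area under the curve with the trapezoidal rule. Because it uses a fixed 0.01 threshold grid, the result should land close to, but not exactly at, the pROC AUC of 0.8503.

# From-scratch AUC approximation: sweep thresholds, compute TPR/FPR, apply the trapezoidal rule
manual_auc <- function(df, true_col, prob_col) {
  thresholds <- seq(0, 1, by = 0.01)
  rates <- sapply(thresholds, function(t) {
    pred <- as.numeric(df[[prob_col]] >= t)
    c(tpr = sum(pred == 1 & df[[true_col]] == 1) / sum(df[[true_col]] == 1),
      fpr = sum(pred == 1 & df[[true_col]] == 0) / sum(df[[true_col]] == 0))
  })
  tpr <- rates["tpr", ]
  fpr <- rates["fpr", ]
  # Sort points by increasing FPR (then TPR) and integrate TPR over FPR
  ord <- order(fpr, tpr)
  sum(diff(fpr[ord]) * (head(tpr[ord], -1) + tail(tpr[ord], -1)) / 2)
}
manual_auc(class_df, "class", "scored.probability")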