Introduction

This R Markdown document contains my work for the CUNY SPS DATA 621 Fall 2025 Homework 02 assignment.

Data Preparation

Load Libraries

library(tidyverse)
library(knitr)
library(caret)
library(pROC)

1.

Download the classification output data set

# Import the provided data
classification_raw <- read_csv("https://raw.githubusercontent.com/evanskaylie/DATA621/refs/heads/main/classification-output-data.csv")

# Preview the provided data
head(classification_raw)
## # A tibble: 6 × 11
##   pregnant glucose diastolic skinfold insulin   bmi pedigree   age class
##      <dbl>   <dbl>     <dbl>    <dbl>   <dbl> <dbl>    <dbl> <dbl> <dbl>
## 1        7     124        70       33     215  25.5    0.161    37     0
## 2        2     122        76       27     200  35.9    0.483    26     0
## 3        3     107        62       13      48  22.9    0.678    23     1
## 4        1      91        64       24       0  29.2    0.192    21     0
## 5        4      83        86       19       0  29.3    0.317    34     0
## 6        1     100        74       12      46  19.5    0.149    28     0
## # ℹ 2 more variables: scored.class <dbl>, scored.probability <dbl>
colSums(is.na(classification_raw))
##           pregnant            glucose          diastolic           skinfold 
##                  0                  0                  0                  0 
##            insulin                bmi           pedigree                age 
##                  0                  0                  0                  0 
##              class       scored.class scored.probability 
##                  0                  0                  0

The data looks clean and there are no missing values.

2.

Use the table() function to get the raw confusion matrix for this scored dataset.

Make sure you understand the output. In particular, do the rows represent the actual or predicted class? The columns?

In the table below, the rows represent the predicted class (scored.class) and the columns represent the actual class (class).

# Save the columns we need to use
class_df <- classification_raw |>
  select(class, scored.class, scored.probability)

# Format the stored class and class features in a table
class_df |>
  select(scored.class, class) |>
  mutate(
         scored.class = recode(scored.class,
                               '0' = 'Predicted Negative', 
                               '1' = 'Predicted Positive'),
         class = recode(class,
                        '0' = 'Actual Negative', 
                        '1' = 'Actual Positive')
         ) |>
  table()
##                     class
## scored.class         Actual Negative Actual Positive
##   Predicted Negative             119              30
##   Predicted Positive               5              27
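
As a quick sanity check on this orientation (a small sketch of my own, reusing the class_df columns above), the four cells can also be counted directly; the values should match the table: TP = 27, TN = 119, FP = 5, FN = 30.

# Count the four confusion-matrix cells straight from the data
TP <- sum(class_df$class == 1 & class_df$scored.class == 1)  # expected 27
TN <- sum(class_df$class == 0 & class_df$scored.class == 0)  # expected 119
FP <- sum(class_df$class == 0 & class_df$scored.class == 1)  # expected 5
FN <- sum(class_df$class == 1 & class_df$scored.class == 0)  # expected 30
c(TP = TP, TN = TN, FP = FP, FN = FN)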

3.

Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the accuracy of the predictions.

# Function to get accuracy of predictions based on predictions and actual classifications
accuracy <- function(x){
  TP <- sum(x$class == 1 & x$scored.class == 1)
  TN <- sum(x$class == 0 & x$scored.class == 0)
  FP <- sum(x$class == 0 & x$scored.class == 1)
  FN <- sum(x$class == 1 & x$scored.class == 0)
  round((TP + TN) / (TP + FP + TN + FN), 4)  
}

# Call the function
accuracy(class_df)
## [1] 0.8066
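
As a quick cross-check (my addition, assuming the same class_df columns), accuracy is simply the share of rows where the prediction matches the actual class, which should reproduce the 0.8066 above.

# Proportion of rows where the predicted class equals the actual class
round(mean(class_df$class == class_df$scored.class), 4)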

4.

Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the classification error rate of the predictions.

# Function to get classification error rate of predictions
class_error_rate <- function(x){
  TP <- sum(x$class == 1 & x$scored.class == 1)
  TN <- sum(x$class == 0 & x$scored.class == 0)
  FP <- sum(x$class == 0 & x$scored.class == 1)
  FN <- sum(x$class == 1 & x$scored.class == 0)
  round((FP + FN)/(TP + FP + TN + FN), 4)
}

# Call the function
class_error_rate(class_df)
## [1] 0.1934

Verify that you get an accuracy and an error rate that sum to one.

# The two rates should add to 1
class_error_rate(class_df) + accuracy(class_df)
## [1] 1

5.

Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the precision of the predictions.

# Function to get precision
precision <- function(x){
  TP <- sum(x$class == 1 & x$scored.class == 1)
  TN <- sum(x$class == 0 & x$scored.class == 0)
  FP <- sum(x$class == 0 & x$scored.class == 1)
  round(TP/(TP + FP), 4)
}

# Call the function
precision(class_df)
## [1] 0.8438
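
One edge case worth noting: if a model never predicts the positive class, TP + FP is 0 and the ratio above returns NaN. Below is a variant with a simple guard (the guard and the name precision_safe are my additions, not part of the assignment).

# Precision with a guard for the case of zero positive predictions
precision_safe <- function(x){
  TP <- sum(x$class == 1 & x$scored.class == 1)
  FP <- sum(x$class == 0 & x$scored.class == 1)
  if ((TP + FP) == 0) return(NA_real_)  # no positive predictions at all
  round(TP / (TP + FP), 4)
}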

6.

Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the sensitivity of the predictions. Sensitivity is also known as recall.

# Function to get sensitivity
sensitivity <- function(x){
  TP <- sum(x$class == 1 & x$scored.class == 1)
  TN <- sum(x$class == 0 & x$scored.class == 0)
  FN <- sum(x$class == 1 & x$scored.class == 0)
  round(TP/(TP + FN), 4)
}

# Call the function
sensitivity(class_df)
## [1] 0.4737

7.

Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the specificity of the predictions.

# Function to get specificity
specificity <- function(x){
  TN <- sum(x$class == 0 & x$scored.class == 0)
  FP <- sum(x$class == 0 & x$scored.class == 1)
  round(TN/(TN + FP), 4)
}

# Call the function
specificity(class_df)
## [1] 0.9597

8.

Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the F1 score of the predictions.

# Function to get F1 score
f1_score <- function(x){
  TP <- sum(x$class == 1 & x$scored.class == 1)
  TN <- sum(x$class == 0 & x$scored.class == 0)
  FP <- sum(x$class == 0 & x$scored.class == 1)
  FN <- sum(x$class == 1 & x$scored.class == 0)
  
  precision <- round(TP/(TP + FP), 4)
  sensitivity <- round(TP/(TP + FN), 4)
  
  round((2 * precision * sensitivity)/(precision + sensitivity), 4)
}

# Call the function
f1_score(class_df)
## [1] 0.6068
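
Because the precision and sensitivity functions from questions 5 and 6 are already defined, the same F1 value can be cross-checked by combining them directly (a small sketch of my own; it should also return 0.6068, up to rounding).

# F1 as the harmonic mean of the previously computed precision and sensitivity
p <- precision(class_df)
s <- sensitivity(class_df)
round((2 * p * s) / (p + s), 4)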

9.

Show that the F1 score will always be between 0 and 1.

precision = True Positives / (True Positives + False Positives)

sensitivity = True Positives / (True Positives + False Negatives)

F1 Score = (2 * precision * sensitivity) / (precision + sensitivity)

Because the denominator of each ratio is its numerator plus a non-negative count, precision and sensitivity always fall between 0 and 1.

Precision:

  • With no false positives (and at least one true positive), precision is 1.

  • With no true positives (but some false positives), precision is 0.

Sensitivity:

  • With no false negatives (and at least one true positive), sensitivity is 1.

  • With no true positives (but some false negatives), sensitivity is 0.

F1 Score:

  • When precision and sensitivity are both 1, the F1 score is 1, and it can never exceed 1.

  • When precision and sensitivity are both 0, the F1 score is undefined (division by zero).

  • Whenever the F1 score is defined, it falls between 0 (exclusive) and 1 (inclusive); a short derivation of the upper bound follows this list.
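
To make the upper bound explicit, here is a short derivation (my own addition), writing P for precision and S for sensitivity, both in the interval (0, 1] whenever the score is defined:

$$
PS \le P \quad\text{and}\quad PS \le S \;\Longrightarrow\; 2PS \le P + S \;\Longrightarrow\; F_1 = \frac{2PS}{P + S} \le 1.
$$

Since P and S are both strictly positive whenever the score is defined, the numerator and denominator are positive, so the F1 score is also strictly greater than 0.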

10.

Write a function that generates an ROC curve from a data set with a true classification column (class in our example) and a probability column (scored.probability in our example). Your function should return a list that includes the plot of the ROC curve and a vector that contains the calculated area under the curve (AUC).

# Build function to get ROC curve
generate_roc_curve <- function(df, true_col, prob_col) {
  # Calculate ROC curve and AUC
  roc_curve <- roc(df[[true_col]], df[[prob_col]])
  auc_value <- auc(roc_curve)
  
  # Plot the ROC curve
  plot(roc_curve, 
       main = "ROC Curve",
       xlab = "False Positive Rate",
       ylab = "True Positive Rate", 
       col = "salmon", 
       lwd = 2)
  abline(h = seq(0, 1, by = 0.1), v = seq(0, 1, by = 0.1), col = "lightgray", lty = 2)
  grid()
  
  # Return a list with the ROC object and the AUC value
  result_list <- list(ROC_Curve_Plot = roc_curve, AUC = auc_value)
  return(result_list)
}

# Call the function
result <- generate_roc_curve(class_df, "class", "scored.probability")
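
For comparison, here is a minimal sketch (my own addition, not required by the assignment) of building the ROC curve without pROC: step candidate thresholds from 0 to 1, compute the true and false positive rates at each one, and approximate the AUC with the trapezoidal rule. The 0.01 step size and the name manual_roc are assumptions.

# Build the ROC curve by hand: TPR/FPR at each threshold, trapezoidal AUC
manual_roc <- function(df, true_col, prob_col, step = 0.01) {
  actual <- df[[true_col]]
  prob <- df[[prob_col]]
  thresholds <- seq(0, 1, by = step)

  # True positive rate and false positive rate at each threshold
  tpr <- sapply(thresholds, function(t) sum(prob >= t & actual == 1) / sum(actual == 1))
  fpr <- sapply(thresholds, function(t) sum(prob >= t & actual == 0) / sum(actual == 0))

  # Order the points left to right before integrating
  ord <- order(fpr)
  auc_value <- sum(diff(fpr[ord]) * (head(tpr[ord], -1) + tail(tpr[ord], -1)) / 2)

  plot(fpr, tpr, type = "l", main = "Manually Computed ROC Curve",
       xlab = "False Positive Rate", ylab = "True Positive Rate",
       col = "salmon", lwd = 2)
  abline(0, 1, lty = 2, col = "lightgray")

  list(FPR = fpr, TPR = tpr, AUC = auc_value)
}

# The manual AUC should land close to the pROC value of 0.8503 reported in question 13
manual_roc(class_df, "class", "scored.probability")$AUC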

11.

Use your created R functions and the provided classification output data set to produce all of the classification metrics discussed above.

# Save the metrics from the functions above in a named vector
metrics <- c(accuracy(class_df), class_error_rate(class_df), precision(class_df),
             sensitivity(class_df), specificity(class_df), f1_score(class_df))

# Label each metric in the vector
names(metrics) <- c("Accuracy", "Classification Error Rate", "Precision", "Sensitivity", "Specificity", "F1 Score")

# Render the named vector as a table with kable
kable(metrics, col.names = "Metrics")
                              Metrics
Accuracy                       0.8066
Classification Error Rate      0.1934
Precision                      0.8438
Sensitivity                    0.4737
Specificity                    0.9597
F1 Score                       0.6068

12.

Investigate the caret package. In particular, consider the functions confusionMatrix, sensitivity, and specificity. Apply the functions to the data set. How do the results compare with your own functions?

cmatrix <- confusionMatrix(data = as.factor(class_df$scored.class), 
                           reference = as.factor(class_df$class), 
                           positive = "1")

# Get values from caret confusion matrix
caret_package <- c(round(cmatrix$overall["Accuracy"], 4),
                   round(cmatrix$byClass["Sensitivity"], 4),
                   round(cmatrix$byClass["Specificity"], 4))

# Get values from above functions
above_functions <- c(accuracy(class_df), sensitivity(class_df), specificity(class_df))

# Create table to compare the two values
comparison_table <- cbind(caret_package, above_functions)
kable(comparison_table)
              caret_package   above_functions
Accuracy             0.8066            0.8066
Sensitivity          0.4737            0.4737
Specificity          0.9597            0.9597

The results from my functions above match the values calculated with caret.
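
The question also points to caret's own sensitivity() and specificity() functions. Because the functions I wrote in questions 6 and 7 mask them, the sketch below (my addition) calls them through the caret:: namespace with the same factors and positive level as the confusionMatrix() call; they should return the same 0.4737 and 0.9597.

# Call caret's sensitivity()/specificity() directly, since my own functions mask them
caret::sensitivity(data = as.factor(class_df$scored.class),
                   reference = as.factor(class_df$class),
                   positive = "1")
caret::specificity(data = as.factor(class_df$scored.class),
                   reference = as.factor(class_df$class),
                   negative = "0")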

13.

Investigate the pROC package. Use it to generate an ROC curve for the data set. How do the results compare with your own functions?

# Create plot with pROC package
par(mfrow = c(1, 2))

plot(roc(class_df$class, class_df$scored.probability), 
     main = "pROC Calculated Curve",
     xlab = "False Positive Rate",
     ylab = "True Positive Rate", 
     col = "skyblue", 
     lwd = 2)
abline(h = seq(0, 1, by = 0.1), v = seq(0, 1, by = 0.1), col = "lightgray", lty = 2)
grid()

# Compare that to my own plot
generate_roc_curve(class_df, "class", "scored.probability")

## $ROC_Curve_Plot
## 
## Call:
## roc.default(response = df[[true_col]], predictor = df[[prob_col]])
## 
## Data: df[[prob_col]] in 124 controls (df[[true_col]] 0) < 57 cases (df[[true_col]] 1).
## Area under the curve: 0.8503
## 
## $AUC
## Area under the curve: 0.8503

The ROC curve from my function matches the one calculated directly with pROC, and both report an AUC of 0.8503.
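
As one final numeric check (my addition), the AUC stored in the list returned by my function can be printed next to a fresh pROC calculation; both should show the 0.8503 value reported above.

# Both AUC values should agree (0.8503 per the output above)
auc(roc(class_df$class, class_df$scored.probability))
result$AUC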