DATA 621 HW 2

Instructions: Complete each of the following steps as instructed.

STEP 1 — Download the Data Set

Download the classification output data set. The data are loaded and inspected below.

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
df <- read_csv("classification-output-data.csv")
## Rows: 181 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (11): pregnant, glucose, diastolic, skinfold, insulin, bmi, pedigree, ag...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(df)
## Rows: 181
## Columns: 11
## $ pregnant           <dbl> 7, 2, 3, 1, 4, 1, 9, 8, 1, 2, 5, 5, 13, 0, 7, 12, 0…
## $ glucose            <dbl> 124, 122, 107, 91, 83, 100, 89, 120, 79, 123, 88, 1…
## $ diastolic          <dbl> 70, 76, 62, 64, 86, 74, 62, 78, 60, 48, 78, 72, 60,…
## $ skinfold           <dbl> 33, 27, 13, 24, 19, 12, 0, 0, 42, 32, 30, 43, 0, 26…
## $ insulin            <dbl> 215, 200, 48, 0, 0, 46, 0, 0, 48, 165, 0, 75, 0, 50…
## $ bmi                <dbl> 25.5, 35.9, 22.9, 29.2, 29.3, 19.5, 22.5, 25.0, 43.…
## $ pedigree           <dbl> 0.161, 0.483, 0.678, 0.192, 0.317, 0.149, 0.142, 0.…
## $ age                <dbl> 37, 26, 23, 21, 34, 28, 33, 64, 23, 26, 37, 33, 41,…
## $ class              <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, …
## $ scored.class       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, …
## $ scored.probability <dbl> 0.32845226, 0.27319044, 0.10966039, 0.05599835, 0.1…
head(df)

STEP 2 — Confusion Matrix Using table()

The data set has three key columns we will use:

- class: the actual class for the observation
- scored.class: the predicted class for the observation (based on a threshold of 0.5)
- scored.probability: the predicted probability of success for the observation

Use the table() function to get the raw confusion matrix for this scored data set. Make sure you understand the output. In particular, do the rows represent the actual or predicted class? The columns?

conf_matrix <- table(df$class, df$`scored.class`)
conf_matrix
##    
##       0   1
##   0 119   5
##   1  30  27
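Because table() places its first argument on the rows, the rows of this matrix are the actual classes (class) and the columns are the predicted classes (scored.class). For example, the 30 in the lower-left cell counts observations that are actually 1 but were predicted 0.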

STEP 3 — Function for Accuracy

Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the accuracy of the predictions.

Accuracy = (TP + TN) / (TP + FP + TN + FN)
accuracy <- function(data, actual, predicted) {
  
  # Build confusion matrix
  cm <- table(data[[actual]], data[[predicted]])
  
  TP <- cm["1", "1"]
  TN <- cm["0", "0"]
  FP <- cm["0", "1"]
  FN <- cm["1", "0"]
  
  (TP + TN) / (TP + FP + TN + FN)
}

accuracy(df, "class", "scored.class")
## [1] 0.8066298
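As a quick cross-check, accuracy is simply the proportion of rows where the predicted class matches the actual class, so the value above should be reproduced by:

# Cross-check: accuracy as the share of matching predictions
mean(df$class == df$scored.class)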

STEP 4 — Classification Error Rate

Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the classification error rate of the predictions.

Classification Error Rate = (FP + FN) / (TP + FP + TN + FN)

Verify that the accuracy and the error rate sum to one.
classification_error <- function(data, actual, predicted) {
  
  cm <- table(data[[actual]], data[[predicted]])
  
  FP <- cm["0", "1"]
  FN <- cm["1", "0"]
  Total <- sum(cm)
  
  (FP + FN) / Total
}

classification_error(df, "class", "scored.class")
## [1] 0.1933702
accuracy(df, "class", "scored.class") +
  classification_error(df, "class", "scored.class")
## [1] 1

STEP 5 — Precision

Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the precision of the predictions.

Precision = TP / (TP + FP)
precision <- function(data, actual, predicted) {
  
  cm <- table(data[[actual]], data[[predicted]])
  
  TP <- cm["1", "1"]
  FP <- cm["0", "1"]
  
  TP / (TP + FP)
}

precision(df, "class", "scored.class")
## [1] 0.84375

STEP 6 — Sensitivity (Recall)

Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the sensitivity of the predictions. Sensitivity is also known as recall.

Sensitivity = TP / (TP + FN)
sensitivity <- function(data, actual, predicted) {
  
  cm <- table(data[[actual]], data[[predicted]])
  
  TP <- cm["1", "1"]
  FN <- cm["1", "0"]
  
  TP / (TP + FN)
}

sensitivity(df, "class", "scored.class")
## [1] 0.4736842

STEP 7 — Specificity

Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the specificity of the predictions.

Specificity = TN / (TN + FP)
specificity <- function(data, actual, predicted) {
  
  cm <- table(data[[actual]], data[[predicted]])
  
  TN <- cm["0", "0"]
  FP <- cm["0", "1"]
  
  TN / (TN + FP)
}

specificity(df, "class", "scored.class")
## [1] 0.9596774

STEP 8 — F1 Score

Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the F1 score of the predictions.

F1 Score = (2 × Precision × Sensitivity) / (Precision + Sensitivity)
f1_score <- function(data, actual, predicted) {
  
  P <- precision(data, actual, predicted)
  R <- sensitivity(data, actual, predicted)
  
  (2 * P * R) / (P + R)
}

f1_score(df, "class", "scored.class")
## [1] 0.6067416

STEP 9 — Show F1 Is Always Between 0 and 1

Let P denote precision and R denote sensitivity (recall), with 0 < P < 1 and 0 < R < 1.

Because R < 1, the product PR is less than P; because P < 1, PR is also less than R.

Adding these two inequalities gives 2PR < P + R. Dividing both sides by the positive quantity P + R yields

F1 = 2PR / (P + R) < 1.

Both the numerator 2PR and the denominator P + R are strictly positive, so F1 > 0.

Thus 0 < F1 < 1: whenever precision and sensitivity lie strictly between 0 and 1, the F1 score is always between 0 and 1.
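As an informal numerical check, drawing random precision and sensitivity values from the open interval (0, 1) should always produce an F1 value inside (0, 1) (the sample size and seed below are arbitrary):

# Sanity check: random P and R in (0, 1) should always give 0 < F1 < 1
set.seed(621)
P <- runif(10000)
R <- runif(10000)
F1 <- (2 * P * R) / (P + R)
range(F1)
all(F1 > 0 & F1 < 1)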

STEP 10 — Custom ROC & AUC Function

Write a function that generates an ROC curve from a data set with a true classification column (class in our example) and a probability column (scored.probability in our example). Your function should return a list that includes the plot of the ROC curve and a vector that contains the calculated area under the curve (AUC). Note that I recommend using a sequence of thresholds ranging from 0 to 1 at 0.01 intervals.
roc_curve <- function(data, actual, prob_col) {
  
  thresholds <- seq(0, 1, by = 0.01)
  TPR <- c()
  FPR <- c()
  
  for (t in thresholds) {
    
    # Predicted class at threshold t
    pred_temp <- ifelse(data[[prob_col]] >= t, 1, 0)
    
    # Force both to have levels 0 and 1 so table is always 2x2
    actual_fac <- factor(data[[actual]], levels = c(0, 1))
    pred_fac   <- factor(pred_temp,       levels = c(0, 1))
    
    cm <- table(actual_fac, pred_fac)
    
    TP <- cm["1", "1"]
    FP <- cm["0", "1"]
    TN <- cm["0", "0"]
    FN <- cm["1", "0"]
    
    # Safely compute TPR and FPR (avoid division by zero)
    if ((TP + FN) == 0) {
      tpr <- 0
    } else {
      tpr <- TP / (TP + FN)
    }
    
    if ((FP + TN) == 0) {
      fpr <- 0
    } else {
      fpr <- FP / (FP + TN)
    }
    
    TPR <- c(TPR, tpr)
    FPR <- c(FPR, fpr)
  }
  
  # AUC via the trapezoidal rule. Because the thresholds increase from 0 to 1,
  # FPR decreases along the vectors, so the signed trapezoid sum comes out
  # negative; take the absolute value to report the conventional positive AUC.
  auc <- abs(sum(diff(FPR) * (head(TPR, -1) + tail(TPR, -1)) / 2))
  
  # Plot ROC
  plot(FPR, TPR, type = "l",
       xlab = "False Positive Rate",
       ylab = "True Positive Rate",
       main = "ROC Curve")
  
  # Return everything useful
  return(list(
    auc = auc,
    thresholds = thresholds,
    TPR = TPR,
    FPR = FPR
  ))
}

# Run it
roc_results <- roc_curve(df, "class", "scored.probability")

roc_results$auc
## [1] 0.8488964
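The returned list also carries the threshold grid and the corresponding TPR/FPR vectors, which can be inspected directly:

# Peek at the first few points of the custom ROC curve
head(data.frame(threshold = roc_results$thresholds,
                FPR = roc_results$FPR,
                TPR = roc_results$TPR))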

STEP 11 — Use Your Functions to Produce All Metrics

Use your created R functions and the provided classification output data set to produce all of the classification metrics discussed above.
accuracy(df, "class", "scored.class")
## [1] 0.8066298
classification_error(df, "class", "scored.class")
## [1] 0.1933702
precision(df, "class", "scored.class")
## [1] 0.84375
sensitivity(df, "class", "scored.class")
## [1] 0.4736842
specificity(df, "class", "scored.class")
## [1] 0.9596774
f1_score(df, "class", "scored.class")
## [1] 0.6067416
roc_results <- roc_curve(df, "class", "scored.probability")

roc_results$auc
## [1] 0.8488964

STEP 12 — caret Package Comparison

Investigate the caret package. In particular, consider the functions confusionMatrix, sensitivity, and specificity. Apply the functions to the data set. How do the results compare with your own functions?
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following objects are masked _by_ '.GlobalEnv':
## 
##     precision, sensitivity, specificity
## The following object is masked from 'package:purrr':
## 
##     lift
# confusionMatrix(data = predicted, reference = actual), with "1" as the positive class
caret_results <- confusionMatrix(
  factor(df$scored.class),
  factor(df$class),
  positive = "1"
)

# caret's sensitivity(): TP / (TP + FN) for the positive class "1"
caret_sens <- caret::sensitivity(
  data = factor(df$scored.class),
  reference = factor(df$class),
  positive = "1"
)

# caret's specificity(): TN / (TN + FP) for the negative class "0"
caret_spec <- caret::specificity(
  data = factor(df$scored.class),
  reference = factor(df$class),
  negative = "0"
)
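As a quick comparison, caret's results can be printed next to the custom functions. Both are built from the same confusion matrix with "1" treated as the positive class, so the sensitivity and specificity values should agree with those computed earlier:

# Compare caret's metrics with the custom functions defined above
caret_results
c(caret = caret_sens, custom = sensitivity(df, "class", "scored.class"))
c(caret = caret_spec, custom = specificity(df, "class", "scored.class"))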

STEP 13 — pROC Package Comparison

Investigate the pROC package. Use it to generate an ROC curve for the data set. How do the results compare with your own functions?
roc_obj <- roc(df$class, df$scored.probability)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc_obj)
## Area under the curve: 0.8503
plot(roc_obj, main="pROC ROC Curve")
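The pROC AUC of 0.8503 is close to the 0.8489 obtained from the custom roc_curve() function. A small gap like this is expected because the custom curve only evaluates thresholds on a fixed 0.01 grid, whereas pROC places a cut-off at every observed predicted probability, so its curve is slightly finer-grained.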