library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(ggplot2)

#1 Download Data

df <- read.csv("classification-output-data.csv")

str(df)
## 'data.frame':    181 obs. of  11 variables:
##  $ pregnant          : int  7 2 3 1 4 1 9 8 1 2 ...
##  $ glucose           : int  124 122 107 91 83 100 89 120 79 123 ...
##  $ diastolic         : int  70 76 62 64 86 74 62 78 60 48 ...
##  $ skinfold          : int  33 27 13 24 19 12 0 0 42 32 ...
##  $ insulin           : int  215 200 48 0 0 46 0 0 48 165 ...
##  $ bmi               : num  25.5 35.9 22.9 29.2 29.3 19.5 22.5 25 43.5 42.1 ...
##  $ pedigree          : num  0.161 0.483 0.678 0.192 0.317 0.149 0.142 0.409 0.678 0.52 ...
##  $ age               : int  37 26 23 21 34 28 33 64 23 26 ...
##  $ class             : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ scored.class      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ scored.probability: num  0.328 0.273 0.11 0.056 0.1 ...
head(df)
##   pregnant glucose diastolic skinfold insulin  bmi pedigree age class
## 1        7     124        70       33     215 25.5    0.161  37     0
## 2        2     122        76       27     200 35.9    0.483  26     0
## 3        3     107        62       13      48 22.9    0.678  23     1
## 4        1      91        64       24       0 29.2    0.192  21     0
## 5        4      83        86       19       0 29.3    0.317  34     0
## 6        1     100        74       12      46 19.5    0.149  28     0
##   scored.class scored.probability
## 1            0         0.32845226
## 2            0         0.27319044
## 3            0         0.10966039
## 4            0         0.05599835
## 5            0         0.10049072
## 6            0         0.05515460

#2 Create Confusion Matrix

cm_raw <- table(Actual = df$class, Predicted = df$scored.class)
cm_raw
##       Predicted
## Actual   0   1
##      0 119   5
##      1  30  27
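
Reading the table with class = 1 as the positive class (rows are actual, columns are predicted): TN = 119, FP = 5, FN = 30, TP = 27. A minimal sketch that pulls the cells out by their dimnames, assuming the 0/1 coding used in this data set:

# Extract the four counts by name (rows = Actual, columns = Predicted)
TN <- cm_raw["0", "0"]; FP <- cm_raw["0", "1"]
FN <- cm_raw["1", "0"]; TP <- cm_raw["1", "1"]
c(TP = TP, FP = FP, TN = TN, FN = FN)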

Functions

#3 Accuracy

\[ \text{Accuracy} = \frac{TP + TN}{TP + FP + TN + FN} \]

#4 Classification Error Rate

\[ \text{Classification Error Rate} = \frac{FP + FN}{TP + FP + TN + FN} \]

#5 Precision

\[ \text{Precision} = \frac{TP}{TP + FP} \]

#6 Sensitivity

\[ \text{Sensitivity} = \frac{TP}{TP + FN} \]

#7 Specificity

\[ \text{Specificity} = \frac{TN}{TN + FP} \]

#8 F1 Score

\[ F1 \text{ Score} = \frac{2 \times \text{Precision} \times \text{Sensitivity}}{\text{Precision} + \text{Sensitivity}} \]

# Confusion counts
confusion_counts <- function(actual, pred, pos = 1) {
  TP <- sum(actual == pos & pred == pos)
  FP <- sum(actual != pos & pred == pos)
  TN <- sum(actual != pos & pred != pos)
  FN <- sum(actual == pos & pred != pos)
  list(TP = TP, FP = FP, TN = TN, FN = FN)
}

# Accuracy
accuracy_fn <- function(df, pos = 1) {
  cc <- confusion_counts(df$class, df$scored.class, pos)
  (cc$TP + cc$TN) / (cc$TP + cc$FP + cc$TN + cc$FN)
}

# Error rate
error_rate_fn <- function(df, pos = 1) {
  cc <- confusion_counts(df$class, df$scored.class, pos)
  (cc$FP + cc$FN) / (cc$TP + cc$FP + cc$TN + cc$FN)
}

# Precision
precision_fn <- function(df, pos = 1) {
  cc <- confusion_counts(df$class, df$scored.class, pos)
  cc$TP / (cc$TP + cc$FP)
}

# Sensitivity
sensitivity_fn <- function(df, pos = 1) {
  cc <- confusion_counts(df$class, df$scored.class, pos)
  cc$TP / (cc$TP + cc$FN)
}

# Specificity
specificity_fn <- function(df, pos = 1) {
  cc <- confusion_counts(df$class, df$scored.class, pos)
  cc$TN / (cc$TN + cc$FP)
}

# F1 Score
f1_fn <- function(df, pos = 1) {
  prec <- precision_fn(df, pos)
  rec <- sensitivity_fn(df, pos)
  2 * prec * rec / (prec + rec)
}
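
As a quick sanity check before moving on, the counts from the confusion matrix in #2 (TP = 27, FP = 5, TN = 119, FN = 30) can be plugged into the formulas directly; for example, accuracy should come out to (27 + 119) / 181, roughly 0.8066. A small sketch of that check:

# Sanity check: recover the counts and recompute accuracy by hand
cc_check <- confusion_counts(df$class, df$scored.class)
stopifnot(cc_check$TP == 27, cc_check$FP == 5, cc_check$TN == 119, cc_check$FN == 30)
(cc_check$TP + cc_check$TN) / sum(unlist(cc_check))  # 146 / 181, about 0.8066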

#9 Bounds of the F1 Score

Before we move on, consider the following question: what are the bounds on the F1 score? Show that the F1 score will always be between 0 and 1.

The F1 score is defined as \[ F1 = \frac{2PS}{P + S} \] where P is Precision and S is Sensitivity. By definition, both P and S are proportions, so their values must be between 0 and 1.

Lower Bound: Since P and S are non-negative, the numerator (2PS) is non-negative, and the denominator (P + S) is positive whenever the score is defined (if P + S = 0, F1 is undefined rather than negative). Therefore, the F1 score must be >= 0.

Upper Bound: We can prove the upper bound by starting with the fact that the square of any real number is non-negative. \[ (P - S)^2 \ge 0 \]

Expanding this gives: \[ P^2 - 2PS + S^2 \ge 0 \]

Add 4PS to both sides of the inequality: \[ P^2 + 2PS + S^2 \ge 4PS \]

The left side is now a perfect square, (P+S)^2: \[ (P + S)^2 \ge 4PS \]

Since P and S are non-negative, we can divide both sides by P+S (assuming it is not zero): \[ P + S \ge \frac{4PS}{P+S} \]

Now, divide both sides by 2: \[ \frac{P + S}{2} \ge \frac{2PS}{P+S} \]

The term on the right is the F1 score. The term on the left is the arithmetic mean of P and S. Since both P and S are less than or equal to 1, their arithmetic mean must also be less than or equal to 1. Therefore: \[ 1 \ge \frac{P + S}{2} \ge \frac{2PS}{P+S} = F1 \]

This proves that F1 <= 1. Combined with the lower bound, we have 0 <= F1 <= 1.
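
As an informal complement to the proof (not a substitute for it), a quick numerical sketch evaluates F1 over a grid of precision and sensitivity values in (0, 1] and confirms the result stays within [0, 1]:

# Numerical check of the F1 bounds over a grid of P and S values
grid <- expand.grid(P = seq(0.01, 1, by = 0.01), S = seq(0.01, 1, by = 0.01))
grid$F1 <- 2 * grid$P * grid$S / (grid$P + grid$S)
range(grid$F1)  # should lie within [0, 1]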

#10 ROC curve and AUC

# Positive class label used throughout
pos_label <- 1

roc_from_probs <- function(df, pos = 1, thresholds = seq(0, 1, by = 0.01)) {
  actual <- df$class
  probs  <- df$scored.probability
  tpr <- fpr <- numeric(length(thresholds))

  for (i in seq_along(thresholds)) {
    thr <- thresholds[i]
    pred <- ifelse(probs >= thr, pos, 1 - pos)  # assumes 0/1 class coding
    cc <- confusion_counts(actual, pred, pos)
    tpr[i] <- cc$TP / (cc$TP + cc$FN)
    fpr[i] <- cc$FP / (cc$FP + cc$TN)
  }

  # Put FPR/TPR in ascending order of FPR for the trapezoidal rule
  fpr_sorted <- rev(fpr)
  tpr_sorted <- rev(tpr)
  
  # Calculate AUC
  auc_est <- sum(diff(fpr_sorted) * (head(tpr_sorted, -1) + tail(tpr_sorted, -1)) / 2)

  df_roc <- data.frame(Threshold = thresholds, FPR = fpr, TPR = tpr)
  p <- ggplot(df_roc, aes(FPR, TPR)) +
    geom_line(color = "blue") +
    geom_abline(linetype = "dashed", color = "gray") +
    theme_minimal() +
    labs(title = paste0("Custom ROC Curve (AUC = ", round(auc_est, 4), ")"),
         x = "False Positive Rate (1 - Specificity)",
         y = "True Positive Rate (Sensitivity)")

  list(auc = auc_est, plot = p)
}

roc_res <- roc_from_probs(df, pos_label)
roc_res$auc
## [1] 0.8488964
roc_res$plot

The ROC curve and AUC produced by this custom function are consistent with the results from the pROC package (see #13). The AUC values differ only slightly (0.8489 vs. 0.8503) because the custom function evaluates the curve on a fixed grid of 101 thresholds and approximates the area with the trapezoidal rule, which supports the accuracy of our implementation.

#11 Classification metrics

acc  <- accuracy_fn(df, pos_label)
err  <- error_rate_fn(df, pos_label)
prec <- precision_fn(df, pos_label)
sens <- sensitivity_fn(df, pos_label)
spec <- specificity_fn(df, pos_label)
f1   <- f1_fn(df, pos_label)

metrics <- data.frame(
  Metric = c("Accuracy", "Error Rate", "Precision", "Sensitivity (Recall)", "Specificity", "F1 Score"),
  Value  = round(c(acc, err, prec, sens, spec, f1), 4)
)

# Display metrics neatly
knitr::kable(metrics, caption = "Classification Metrics Summary")
Classification Metrics Summary

|Metric               |  Value|
|:--------------------|------:|
|Accuracy             | 0.8066|
|Error Rate           | 0.1934|
|Precision            | 0.8438|
|Sensitivity (Recall) | 0.4737|
|Specificity          | 0.9597|
|F1 Score             | 0.6067|
# Check that Accuracy + Error Rate = 1
check_sum <- acc + err
cat("Accuracy + Error Rate =", round(check_sum, 4))
## Accuracy + Error Rate = 1

#12 Comparison with caret Package

The results from caret's confusionMatrix function validate the calculations from our own functions. The Sensitivity and Specificity values produced by caret are identical to the values we calculated manually, which confirms that the helper function confusion_counts and the metric functions sensitivity_fn and specificity_fn are implemented correctly.

df$class <- factor(df$class)
df$scored.class <- factor(df$scored.class, levels = levels(df$class))

cm <- confusionMatrix(df$scored.class, df$class, positive = "1")
cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 119  30
##          1   5  27
##                                           
##                Accuracy : 0.8066          
##                  95% CI : (0.7415, 0.8615)
##     No Information Rate : 0.6851          
##     P-Value [Acc > NIR] : 0.0001712       
##                                           
##                   Kappa : 0.4916          
##                                           
##  Mcnemar's Test P-Value : 4.976e-05       
##                                           
##             Sensitivity : 0.4737          
##             Specificity : 0.9597          
##          Pos Pred Value : 0.8438          
##          Neg Pred Value : 0.7987          
##              Prevalence : 0.3149          
##          Detection Rate : 0.1492          
##    Detection Prevalence : 0.1768          
##       Balanced Accuracy : 0.7167          
##                                           
##        'Positive' Class : 1               
## 
caret_sens <- sensitivity(df$scored.class, df$class, positive = "1")
caret_spec <- specificity(df$scored.class, df$class, negative = "0")

caret_results <- data.frame(
  Metric = c("Caret Sensitivity", "Caret Specificity"),
  Value  = round(c(caret_sens, caret_spec), 4)
)
caret_results
##              Metric  Value
## 1 Caret Sensitivity 0.4737
## 2 Caret Specificity 0.9597
roc_obj <- roc(response = df$class,
               predictor = df$scored.probability,
               levels = c("0", "1"),
               direction = "<")

auc_val <- auc(roc_obj)
plot(roc_obj, main = paste("pROC ROC Curve (AUC =", round(auc_val, 4), ")"))

auc_val
## Area under the curve: 0.8503

The results from the caret functions match the values produced by our custom functions for accuracy, sensitivity, and specificity, which confirms that the manual implementations are correct. The sketch below places the values side by side.
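
For this comparison, the custom values from #11 can be placed next to the corresponding entries of the confusionMatrix object; this sketch assumes cm, acc, sens, and spec from the chunks above are still in the workspace:

# Compare custom metrics with the values stored in caret's confusionMatrix object
comparison <- data.frame(
  Metric = c("Accuracy", "Sensitivity", "Specificity"),
  Custom = round(c(acc, sens, spec), 4),
  Caret  = round(as.numeric(c(cm$overall["Accuracy"],
                              cm$byClass["Sensitivity"],
                              cm$byClass["Specificity"])), 4)
)
comparison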

#13 Comparison with pROC Package

The pROC package is a specialized and highly optimized tool for ROC analysis. The Area Under the Curve (AUC) it calculated is almost identical to the AUC from our custom roc_from_probs function.

Custom function AUC: 0.8489; pROC package AUC: 0.8503.

The two estimates are very close; the small gap comes from the custom function's fixed threshold grid and trapezoidal approximation. This confirms the correctness of the custom ROC function.
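
As a final check, the two AUC estimates can be printed side by side; this sketch reuses roc_res and auc_val from the earlier chunks:

# Side-by-side comparison of the custom AUC and the pROC AUC
data.frame(
  Method = c("Custom roc_from_probs", "pROC"),
  AUC    = round(c(roc_res$auc, as.numeric(auc_val)), 4)
)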