# Load required libraries for validation
library(caret)
library(pROC)

1 Introduction

This analysis evaluates the performance of a binary classification model by implementing core classification metrics from scratch and validating those results against established R packages such as caret and pROC. Using the provided dataset (classification-output-data.csv), I compute and interpret accuracy, error rate, precision, recall/sensitivity, specificity, F1 score, and ROC/AUC to assess model behavior across multiple dimensions. These metrics are essential for understanding how well a model distinguishes between positive and negative classes, especially when considering false positives, false negatives, and threshold-dependent performance. The analysis also includes a manually constructed ROC curve and trapezoidal AUC approximation, followed by comparisons with package-generated outputs to confirm correctness and deepen understanding of classification evaluation methods.

Key variables in the dataset include:

class — the actual/true class

scored.class — the predicted class using a 0.5 threshold

scored.probability — the predicted probability of the positive class

2 Data Load and Exploration

# Load the classification output dataset
df <- read.csv("classification-output-data.csv", stringsAsFactors = FALSE)

# Display structure and summary statistics
str(df)
## 'data.frame':    181 obs. of  11 variables:
##  $ pregnant          : int  7 2 3 1 4 1 9 8 1 2 ...
##  $ glucose           : int  124 122 107 91 83 100 89 120 79 123 ...
##  $ diastolic         : int  70 76 62 64 86 74 62 78 60 48 ...
##  $ skinfold          : int  33 27 13 24 19 12 0 0 42 32 ...
##  $ insulin           : int  215 200 48 0 0 46 0 0 48 165 ...
##  $ bmi               : num  25.5 35.9 22.9 29.2 29.3 19.5 22.5 25 43.5 42.1 ...
##  $ pedigree          : num  0.161 0.483 0.678 0.192 0.317 0.149 0.142 0.409 0.678 0.52 ...
##  $ age               : int  37 26 23 21 34 28 33 64 23 26 ...
##  $ class             : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ scored.class      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ scored.probability: num  0.328 0.273 0.11 0.056 0.1 ...
head(df)
summary(df)
##     pregnant         glucose        diastolic        skinfold   
##  Min.   : 0.000   Min.   : 57.0   Min.   : 38.0   Min.   : 0.0  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 64.0   1st Qu.: 0.0  
##  Median : 3.000   Median :112.0   Median : 70.0   Median :22.0  
##  Mean   : 3.862   Mean   :118.3   Mean   : 71.7   Mean   :19.8  
##  3rd Qu.: 6.000   3rd Qu.:136.0   3rd Qu.: 78.0   3rd Qu.:32.0  
##  Max.   :15.000   Max.   :197.0   Max.   :104.0   Max.   :54.0  
##     insulin            bmi           pedigree           age       
##  Min.   :  0.00   Min.   :19.40   Min.   :0.0850   Min.   :21.00  
##  1st Qu.:  0.00   1st Qu.:26.30   1st Qu.:0.2570   1st Qu.:24.00  
##  Median :  0.00   Median :31.60   Median :0.3910   Median :30.00  
##  Mean   : 63.77   Mean   :31.58   Mean   :0.4496   Mean   :33.31  
##  3rd Qu.:105.00   3rd Qu.:36.00   3rd Qu.:0.5800   3rd Qu.:41.00  
##  Max.   :543.00   Max.   :50.00   Max.   :2.2880   Max.   :67.00  
##      class         scored.class    scored.probability
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.02323   
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.11702   
##  Median :0.0000   Median :0.0000   Median :0.23999   
##  Mean   :0.3149   Mean   :0.1768   Mean   :0.30373   
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:0.43093   
##  Max.   :1.0000   Max.   :1.0000   Max.   :0.94633

2.1 Confusion Matrix (Raw table())

The confusion matrix is the foundation of all classification metrics. Using table(actual, predicted), rows represent actual classes and columns represent predicted classes.

# Convert to factors for proper table construction
df$class <- as.factor(df$class)
df$scored.class <- as.factor(df$scored.class)

# Ensure both factors share the same levels and order
lvl <- union(levels(df$class), levels(df$scored.class))
df$class <- factor(df$class, levels = lvl)
df$scored.class <- factor(df$scored.class, levels = lvl)

# Generate confusion matrix with clear labels
tbl <- table(Actual = df$class, Predicted = df$scored.class)
tbl
##       Predicted
## Actual   0   1
##      0 119   5
##      1  30  27

Interpretation: The confusion matrix reveals that our classifier is conservative in predicting the positive class. Out of 57 actual positive cases (class = 1), we correctly identified 27 (TP = 27) while missing 30 (FN = 30). However, when the model predicts positive, it is usually correct—only 5 false positives (FP = 5) occurred out of 124 actual negatives. This pattern suggests the default 0.5 threshold may be too high, prioritizing precision over sensitivity.
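
For reference, the four counts used in this interpretation can be read straight from tbl; a minimal sketch, assuming the positive class is labeled "1" as in this dataset:

# Pull the four cells from the Actual x Predicted table above
TP <- tbl["1", "1"]  # 27  true positives
FP <- tbl["0", "1"]  #  5  false positives
TN <- tbl["0", "0"]  # 119 true negatives
FN <- tbl["1", "0"]  # 30  false negatives
c(TP = TP, FP = FP, TN = TN, FN = FN)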

3 Metric Functions (From Scratch)

We implement each metric by extracting \(TP\), \(FP\), \(TN\), and \(FN\) from the confusion matrix. These functions follow the standard formulas for binary classification evaluation.

#' Helper function to extract confusion matrix counts
#' @param data Dataframe containing actual and predicted columns
#' @param actual Name of the actual class column
#' @param predicted Name of the predicted class column
#' @param positive The positive class label (default: last factor level)
#' @return List containing TP, FP, TN, FN, and class labels
get_cm_counts <- function(data, actual = "class", predicted = "scored.class", positive = NULL) {
  a <- factor(data[[actual]])
  p <- factor(data[[predicted]], levels = levels(a))
  cm <- table(a, p)
  
  # Determine positive class (default to last level, e.g., "1" in c("0","1"))
  if (is.null(positive)) {
    positive <- tail(levels(a), 1)
  }
  pos <- positive
  neg <- setdiff(levels(a), pos)[1]

  # Extract counts from confusion matrix
  TP <- cm[pos, pos]  # True Positives: actually positive, predicted positive
  TN <- cm[neg, neg]  # True Negatives: actually negative, predicted negative
  FP <- cm[neg, pos]  # False Positives: actually negative, predicted positive
  FN <- cm[pos, neg]  # False Negatives: actually positive, predicted negative

  list(TP = as.numeric(TP), FP = as.numeric(FP),
       TN = as.numeric(TN), FN = as.numeric(FN),
       positive = pos, negative = neg, cm = cm)
}

#' Calculate accuracy: proportion of correct predictions
#' Formula: (TP + TN) / (TP + TN + FP + FN)
accuracy <- function(data, positive = NULL) {
  cts <- get_cm_counts(data, positive = positive)
  with(cts, (TP + TN) / (TP + TN + FP + FN))
}

#' Calculate classification error rate: proportion of incorrect predictions
#' Formula: (FP + FN) / (TP + TN + FP + FN) = 1 - Accuracy
error_rate <- function(data, positive = NULL) {
  1 - accuracy(data, positive = positive)
}

#' Calculate precision: proportion of positive predictions that are correct
#' Formula: TP / (TP + FP)
precision <- function(data, positive = NULL) {
  cts <- get_cm_counts(data, positive = positive)
  with(cts, TP / (TP + FP))
}

#' Calculate sensitivity (recall): proportion of actual positives correctly identified
#' Formula: TP / (TP + FN)
sensitivity <- function(data, positive = NULL) {
  cts <- get_cm_counts(data, positive = positive)
  with(cts, TP / (TP + FN))
}

#' Calculate specificity: proportion of actual negatives correctly identified
#' Formula: TN / (TN + FP)
specificity <- function(data, positive = NULL) {
  cts <- get_cm_counts(data, positive = positive)
  with(cts, TN / (TN + FP))
}

#' Calculate F1 score: harmonic mean of precision and sensitivity
#' Formula: 2 * (Precision * Sensitivity) / (Precision + Sensitivity)
#' Convention: F1 = 0 when precision and sensitivity are both zero (or undefined)
f1_score <- function(data, positive = NULL) {
  p <- precision(data, positive = positive)
  r <- sensitivity(data, positive = positive)
  if (is.na(p + r) || (p + r) == 0) return(0)  # apply the F1 = 0 convention (see Section 4)
  2 * (p * r) / (p + r)
}

3.1 Metric Values

# Set positive label: prefer "1" if it exists, otherwise use last factor level
pos_label <- if ("1" %in% levels(df$class)) "1" else tail(levels(df$class), 1)

# Calculate all metrics using custom functions
acc  <- accuracy(df, positive = pos_label)
err  <- error_rate(df, positive = pos_label)
prec <- precision(df, positive = pos_label)
sens <- sensitivity(df, positive = pos_label)
spec <- specificity(df, positive = pos_label)
f1   <- f1_score(df, positive = pos_label)

# Display results in a clean summary table
metrics_summary <- data.frame(
  Metric = c("Accuracy", "Error Rate", "Precision", "Sensitivity (Recall)", 
             "Specificity", "F1 Score"),
  Value = round(c(acc, err, prec, sens, spec, f1), 4),
  Percentage = paste0(round(c(acc, err, prec, sens, spec, f1) * 100, 2), "%")
)
metrics_summary

3.1.1 Verification: Accuracy + Error Rate = 1

# Verify that accuracy and error rate sum to 1
cat("Accuracy:", round(acc, 4), "\n")
## Accuracy: 0.8066
cat("Error Rate:", round(err, 4), "\n")
## Error Rate: 0.1934
cat("Sum:", round(acc + err, 4), "\n")
## Sum: 1

Key Observations:

  • High Specificity (95.97%) but Low Sensitivity (47.37%): The model excels at identifying true negatives but misses over half of the positive cases. This trade-off is critical depending on the use case—if this were medical diagnosis or fraud detection, missing positive cases (false negatives) could be very costly.

  • High Precision (84.38%): When the model predicts positive, it is correct 84% of the time, which minimizes false alarms. Combined with low sensitivity, this indicates a conservative classifier.

  • Moderate F1 Score (60.67%): The harmonic mean reflects the imbalance between precision and recall. To improve this metric, we would need to lower the classification threshold below 0.5 to capture more true positives, though this would increase false positives.
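
To make the threshold point concrete, the sketch below (not part of the assignment deliverables) rescores the data at a hypothetical cutoff of 0.3 and recomputes the metrics with the functions defined above; the specific cutoff is illustrative only.

# Illustrative only: rescore at a lower, hypothetical threshold of 0.3 and recompute the metrics
df_low <- df
df_low$scored.class <- factor(ifelse(df_low$scored.probability >= 0.3, "1", "0"),
                              levels = levels(df$class))
data.frame(
  Metric = c("Sensitivity", "Specificity", "Precision", "F1 Score"),
  Value  = round(c(sensitivity(df_low, positive = pos_label),
                   specificity(df_low, positive = pos_label),
                   precision(df_low, positive = pos_label),
                   f1_score(df_low, positive = pos_label)), 4)
)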

4 Bounds on F1 Score (\(0 \le F1 \le 1\))

For precision \(P\) and recall \(R\) where \(0 \le P \le 1\) and \(0 \le R \le 1\), the F1 score is defined as the harmonic mean:

\[F1 = \frac{2PR}{P + R}\]

Proof that \(0 \le F1 \le 1\):

Lower Bound (\(F1 \ge 0\)): Since \(P \ge 0\) and \(R \ge 0\), the numerator \(2PR \ge 0\). The denominator \(P + R \ge 0\). By convention, when \(P = R = 0\), we define \(F1 = 0\) (since there are no positive predictions or actual positives). Thus, \(F1 \ge 0\).

Upper Bound (\(F1 \le 1\)): We need to show that \(2PR \le P + R\) for \(0 \le P, R \le 1\).

Rearranging: \(2PR \le P + R\)

\(\Rightarrow 0 \le P + R - 2PR\)

\(\Rightarrow 0 \le P(1 - R) + R(1 - P)\)

Since \(0 \le P, R \le 1\), we have \((1-R) \ge 0\) and \((1-P) \ge 0\). Therefore, both terms are non-negative, and their sum is non-negative. This confirms \(2PR \le P + R\), so \(F1 = \frac{2PR}{P+R} \le 1\).

Alternatively, by the AM-GM inequality: The arithmetic mean is always greater than or equal to the geometric mean:

\[\frac{P + R}{2} \ge \sqrt{PR}\]

Squaring both sides: \(\frac{(P+R)^2}{4} \ge PR\)

Dividing both sides by \(\frac{P+R}{4} > 0\) (the degenerate case \(P = R = 0\) is handled by the convention above): \(P + R \ge \frac{4PR}{P+R} = 2 \cdot \frac{2PR}{P+R} = 2 \cdot F1\)

Therefore: \(F1 \le \frac{P+R}{2} \le 1\) (since \(P \le 1\) and \(R \le 1\)).

\(\square\)
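
As an informal numerical check of this bound (not a substitute for the proof), we can evaluate the F1 formula over a grid of precision/recall values and confirm it never leaves \([0, 1]\):

# Informal check: evaluate F1 on a grid of (P, R) values in [0, 1]
grid <- expand.grid(P = seq(0, 1, by = 0.05), R = seq(0, 1, by = 0.05))
f1_vals <- with(grid, ifelse(P + R == 0, 0, 2 * P * R / (P + R)))  # F1 = 0 convention at P = R = 0
range(f1_vals)  # should lie within [0, 1]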

5 ROC Curve and AUC Function

Per the assignment requirements, we create a function that generates an ROC curve and returns a list containing the plot and the calculated AUC value.

#' Generate ROC curve and calculate AUC
#' @param data Dataframe containing actual class and probability columns
#' @param actual_col Name of the actual class column
#' @param prob_col Name of the predicted probability column
#' @param positive The positive class label
#' @param thresholds Sequence of thresholds to evaluate (default: 0 to 1 by 0.01)
#' @return List containing: roc_data (dataframe), auc (numeric), and plot
generate_roc <- function(data, 
                         actual_col = "class", 
                         prob_col = "scored.probability", 
                         positive = "1",
                         thresholds = seq(0, 1, by = 0.01)) {
  
  # Validate inputs
  stopifnot(actual_col %in% names(data))
  stopifnot(prob_col %in% names(data))
  
  # Extract probability scores and convert actual to binary
  probs <- as.numeric(data[[prob_col]])
  y_true <- as.integer(data[[actual_col]] == positive)
  
  # Calculate TPR and FPR at each threshold
  roc_points <- lapply(thresholds, function(t) {
    y_hat <- ifelse(probs >= t, 1, 0)
    
    TP <- sum(y_true == 1 & y_hat == 1)
    TN <- sum(y_true == 0 & y_hat == 0)
    FP <- sum(y_true == 0 & y_hat == 1)
    FN <- sum(y_true == 1 & y_hat == 0)
    
    # Calculate rates (handle edge cases)
    TPR <- if ((TP + FN) == 0) NA else TP / (TP + FN)
    FPR <- if ((FP + TN) == 0) NA else FP / (FP + TN)
    
    c(threshold = t, TPR = TPR, FPR = FPR)
  })
  
  # Combine into dataframe and remove incomplete cases
  roc_df <- as.data.frame(do.call(rbind, roc_points))
  roc_df <- roc_df[complete.cases(roc_df), ]
  
  # Sort by FPR for proper AUC calculation
  ord <- order(roc_df$FPR, roc_df$TPR)
  x <- roc_df$FPR[ord]
  y <- roc_df$TPR[ord]
  
  # Calculate AUC using trapezoidal rule
  auc_value <- sum(diff(x) * (head(y, -1) + tail(y, -1)) / 2)
  
  # Define the ROC plot as a function so it can be drawn later via the returned list
  roc_plot <- function() {
    plot(x, y, type = "l", lwd = 2, col = "blue",
         xlab = "False Positive Rate (1 - Specificity)", 
         ylab = "True Positive Rate (Sensitivity)",
         main = sprintf("ROC Curve (AUC = %.4f)", auc_value))
    abline(0, 1, lty = 2, col = "gray50")
    legend("bottomright", 
           legend = c(sprintf("Model (AUC = %.4f)", auc_value), "Random Chance"),
           lty = c(1, 2), lwd = c(2, 1), col = c("blue", "gray50"))
  }
  
  # Return list with all components
  return(list(
    roc_data = roc_df,
    auc = auc_value,
    plot = roc_plot
  ))
}

5.1 Apply ROC Function to Dataset

# Generate ROC curve using our custom function
roc_result <- generate_roc(df, 
                           actual_col = "class", 
                           prob_col = "scored.probability", 
                           positive = pos_label)

# Display the ROC plot
roc_result$plot()

# Display the AUC value
cat("Area Under the Curve (AUC):", round(roc_result$auc, 4), "\n")
## Area Under the Curve (AUC): 0.8489
# Preview the ROC data points
head(roc_result$roc_data)

AUC Interpretation: An AUC of 0.8489 indicates good discriminative ability—the model can distinguish between positive and negative cases substantially better than random chance (AUC = 0.5). The ROC curve shows that we could potentially improve sensitivity by lowering the threshold, though this would come at the cost of reduced specificity (more false positives). The curve’s position well above the diagonal confirms the model has genuine predictive power.
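
As an optional follow-up (not required by the assignment), the ROC grid returned by our function can be used to locate the threshold that maximizes Youden's J statistic (TPR minus FPR), one common way to choose an operating point:

# Illustrative: find the threshold on our grid that maximizes Youden's J = TPR - FPR
roc_pts <- roc_result$roc_data
roc_pts$J <- roc_pts$TPR - roc_pts$FPR
roc_pts[which.max(roc_pts$J), ]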

6 Package Validation: caret and pROC

6.1 Caret Package Comparison

# Generate confusion matrix using caret
cm <- caret::confusionMatrix(data = df$scored.class,
                             reference = df$class,
                             positive = pos_label)
cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 119  30
##          1   5  27
##                                           
##                Accuracy : 0.8066          
##                  95% CI : (0.7415, 0.8615)
##     No Information Rate : 0.6851          
##     P-Value [Acc > NIR] : 0.0001712       
##                                           
##                   Kappa : 0.4916          
##                                           
##  Mcnemar's Test P-Value : 4.976e-05       
##                                           
##             Sensitivity : 0.4737          
##             Specificity : 0.9597          
##          Pos Pred Value : 0.8438          
##          Neg Pred Value : 0.7987          
##              Prevalence : 0.3149          
##          Detection Rate : 0.1492          
##    Detection Prevalence : 0.1768          
##       Balanced Accuracy : 0.7167          
##                                           
##        'Positive' Class : 1               
## 
# Extract caret's sensitivity and specificity
caret_sens <- caret::sensitivity(data = df$scored.class,
                                 reference = df$class,
                                 positive = pos_label)
caret_spec <- caret::specificity(data = df$scored.class,
                                 reference = df$class,
                                 negative = setdiff(levels(df$class), pos_label)[1])

cat("Caret Sensitivity:", round(caret_sens, 4), "\n")
## Caret Sensitivity: 0.4737
cat("Caret Specificity:", round(caret_spec, 4), "\n")
## Caret Specificity: 0.9597

6.2 pROC Package Comparison

# Generate ROC curve using pROC
roc_obj <- pROC::roc(response = df$class,
                     predictor = df$scored.probability,
                     levels = c(setdiff(levels(df$class), pos_label)[1], pos_label))

# Plot pROC's ROC curve
plot(roc_obj, legacy.axes = TRUE, 
     main = sprintf("pROC ROC Curve (AUC = %.4f)", pROC::auc(roc_obj)))

# Display pROC's AUC
cat("pROC AUC:", round(as.numeric(pROC::auc(roc_obj)), 4), "\n")
## pROC AUC: 0.8503

6.3 Comparison Summary

# Create comparison table
comparison_df <- data.frame(
  Metric = c("Sensitivity", "Specificity", "Accuracy", "AUC"),
  Custom_Function = round(c(sens, spec, acc, roc_result$auc), 4),
  Package_Result = round(c(caret_sens, caret_spec, 
                           as.numeric(cm$overall["Accuracy"]),
                           as.numeric(pROC::auc(roc_obj))), 4)
)
comparison_df$Match <- comparison_df$Custom_Function == comparison_df$Package_Result |
                       abs(comparison_df$Custom_Function - comparison_df$Package_Result) < 0.01
comparison_df

7 Discussion

7.1 Confusion Matrix Orientation

Using table(actual, predicted), rows represent actual classes and columns represent predicted classes. This orientation is crucial for correctly calculating TP, TN, FP, and FN values, which form the foundation of all classification metrics.
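
As a quick check of this orientation (using the objects created above), caret::confusionMatrix reports the transposed layout, with predictions in rows and the reference in columns, so transposing our table should reproduce caret's counts:

# Our tbl has Actual in rows and Predicted in columns; caret's cm$table has Prediction in rows
t(tbl)     # transpose of table(Actual, Predicted)
cm$table   # caret's Prediction x Reference table -- the counts should agree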

7.2 Function Validation

Our custom functions matched caret outputs exactly, confirming correct implementation:

  • Sensitivity: 0.4737 (custom) vs. 0.4737 (caret)
  • Specificity: 0.9597 (custom) vs. 0.9597 (caret)
  • Accuracy: 0.8066 (custom) vs. 0.8066 (caret)

The manual ROC/AUC calculation (0.8489) was very close to pROC::auc (0.8503). The small difference arises because our curve is evaluated on a fixed grid of thresholds (steps of 0.01), which slightly coarsens the curve, whereas pROC builds the curve from every observed probability. This level of agreement validates our understanding of the underlying mathematics.
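
One way to see this (a quick sketch reusing generate_roc from Section 5) is to pass every observed probability, plus the endpoints, as the threshold grid; the trapezoidal AUC should then land essentially on pROC's value:

# Sketch: rebuild the manual ROC using every observed probability as a threshold
fine_thresholds <- sort(unique(c(0, df$scored.probability, 1)))
generate_roc(df, positive = pos_label, thresholds = fine_thresholds)$auc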

7.3 Model Performance Assessment

This classifier exhibits a clear precision-recall trade-off that favors precision over recall:

  • Strength: High specificity (95.97%) means very few false alarms when predicting negative cases
  • Weakness: Low sensitivity (47.37%) means we miss many true positive cases
  • Implication: The default 0.5 probability threshold may not be optimal for applications where false negatives are costly

7.4 Practical Recommendations

  1. Threshold Optimization: Consider lowering the classification threshold below 0.5 to improve sensitivity if the cost of missing positive cases outweighs the cost of false alarms.

  2. Context-Driven Metrics: Choose evaluation metrics based on the specific application. For fraud detection or disease screening, sensitivity may be more important than overall accuracy.

  3. ROC Analysis: The AUC of ~0.85 indicates good model quality, but there is room for improvement through feature engineering, algorithm selection, or addressing class imbalance.

7.5 Key Takeaways

  • Classification metrics must align with business objectives—accuracy alone can be misleading, especially for imbalanced datasets.
  • The ROC curve provides comprehensive insight into model performance across all possible thresholds.
  • Package validation confirms our implementations are mathematically sound and builds confidence in applying these concepts to real-world problems.
  • Understanding the precision-recall trade-off is essential for making informed decisions about model deployment.

7.6 Final Reflection

Overall, this analysis highlights the importance of examining multiple performance metrics rather than relying on accuracy alone. Precision and sensitivity clarify how the model behaves with respect to false positives and false negatives, while specificity provides insight into the model’s ability to correctly identify negative cases. The F1 score offers a balanced view when precision and recall trade off, and the ROC/AUC analysis helps evaluate discrimination performance across all classification thresholds. The strong alignment between my manually coded metrics and the outputs from caret and pROC confirms that the underlying confusion matrix logic and model evaluation steps were implemented correctly. In practice, metric selection should depend on the business context—for example, prioritizing sensitivity in healthcare settings where missing a positive case carries higher risk. This assignment reinforces how essential it is to interpret classification results holistically to make informed, context-aware decisions about model performance.

8 Appendix: Session Information

sessionInfo()
## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods  
## [7] base     
## 
## other attached packages:
## [1] pROC_1.19.0.1  caret_7.0-1    lattice_0.22-7 ggplot2_4.0.0 
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6         xfun_0.53            bslib_0.9.0         
##  [4] recipes_1.3.1        vctrs_0.6.5          tools_4.5.1         
##  [7] generics_0.1.4       stats4_4.5.1         parallel_4.5.1      
## [10] proxy_0.4-27         tibble_3.3.0         pkgconfig_2.0.3     
## [13] ModelMetrics_1.2.2.2 Matrix_1.7-4         data.table_1.17.8   
## [16] RColorBrewer_1.1-3   S7_0.2.0             lifecycle_1.0.4     
## [19] compiler_4.5.1       farver_2.1.2         stringr_1.5.2       
## [22] codetools_0.2-20     htmltools_0.5.8.1    class_7.3-23        
## [25] sass_0.4.10          yaml_2.3.10          prodlim_2025.04.28  
## [28] pillar_1.11.1        jquerylib_0.1.4      MASS_7.3-65         
## [31] cachem_1.1.0         gower_1.0.2          iterators_1.0.14    
## [34] rpart_4.1.24         foreach_1.5.2        nlme_3.1-168        
## [37] parallelly_1.45.1    lava_1.8.1           tidyselect_1.2.1    
## [40] digest_0.6.37        stringi_1.8.7        future_1.67.0       
## [43] dplyr_1.1.4          reshape2_1.4.4       purrr_1.1.0         
## [46] listenv_0.9.1        splines_4.5.1        fastmap_1.2.0       
## [49] grid_4.5.1           cli_3.6.5            magrittr_2.0.4      
## [52] survival_3.8-3       e1071_1.7-16         future.apply_1.20.0 
## [55] withr_3.0.2          scales_1.4.0         lubridate_1.9.4     
## [58] timechange_0.3.0     rmarkdown_2.29       globals_0.18.0      
## [61] nnet_7.3-20          timeDate_4041.110    evaluate_1.0.5      
## [64] knitr_1.50           hardhat_1.4.2        rlang_1.1.6         
## [67] Rcpp_1.1.0           glue_1.8.0           ipred_0.9-15        
## [70] rstudioapi_0.17.1    jsonlite_2.0.0       R6_2.6.1            
## [73] plyr_1.8.9