Instructions
Complete each of the following steps as instructed.
STEP 1 - Download the Data Set
1. Download the classification output data set.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
df <- read_csv("classification-output-data.csv")
## Rows: 181 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (11): pregnant, glucose, diastolic, skinfold, insulin, bmi, pedigree, ag...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(df)
## Rows: 181
## Columns: 11
## $ pregnant           <dbl> 7, 2, 3, 1, 4, 1, 9, 8, 1, 2, 5, 5, 13, 0, 7, 12, 0…
## $ glucose            <dbl> 124, 122, 107, 91, 83, 100, 89, 120, 79, 123, 88, 1…
## $ diastolic          <dbl> 70, 76, 62, 64, 86, 74, 62, 78, 60, 48, 78, 72, 60,…
## $ skinfold           <dbl> 33, 27, 13, 24, 19, 12, 0, 0, 42, 32, 30, 43, 0, 26…
## $ insulin            <dbl> 215, 200, 48, 0, 0, 46, 0, 0, 48, 165, 0, 75, 0, 50…
## $ bmi                <dbl> 25.5, 35.9, 22.9, 29.2, 29.3, 19.5, 22.5, 25.0, 43.…
## $ pedigree           <dbl> 0.161, 0.483, 0.678, 0.192, 0.317, 0.149, 0.142, 0.…
## $ age                <dbl> 37, 26, 23, 21, 34, 28, 33, 64, 23, 26, 37, 33, 41,…
## $ class              <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, …
## $ scored.class       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, …
## $ scored.probability <dbl> 0.32845226, 0.27319044, 0.10966039, 0.05599835, 0.1…
head(df)
STEP 2 - Confusion Matrix Using table()
2. The data set has three key columns we will use:
- class: the actual class for the observation
- scored.class: the predicted class for the observation (based on a threshold of 0.5)
- scored.probability: the predicted probability of success for the observation
Use the table() function to get the raw confusion matrix for this scored data set. Make sure you understand the output. In particular, do the rows represent the actual or predicted class? The columns?
conf_matrix <- table(df$class, df$`scored.class`)
conf_matrix
##
## 0 1
## 0 119 5
## 1 30 27
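Because df$class was passed to table() first, the rows of this matrix are the actual classes and the columns are the predicted classes. As an optional sanity check, naming the arguments to table() attaches labels to the dimensions so this is explicit (the labels actual and predicted below are my own, not column names in the data):
table(actual = df$class, predicted = df$scored.class)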
STEP 3 - Function for Accuracy
accuracy <- function(data, actual, predicted) {
# Build confusion matrix
cm <- table(data[[actual]], data[[predicted]])
TP <- cm["1", "1"]
TN <- cm["0", "0"]
FP <- cm["0", "1"]
FN <- cm["1", "0"]
(TP + TN) / (TP + FP + TN + FN)
}
accuracy(df, "class", "scored.class")
## [1] 0.8066298
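An optional one-line cross-check (not part of the assignment): accuracy is simply the proportion of observations whose predicted class matches the actual class, so the following should reproduce the value above.
# Proportion of correct predictions; should match accuracy() above
mean(df$class == df$scored.class)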
STEP 4 - Classification Error Rate
classification_error <- function(data, actual, predicted) {
cm <- table(data[[actual]], data[[predicted]])
FP <- cm["0", "1"]
FN <- cm["1", "0"]
Total <- sum(cm)
(FP + FN) / Total
}
classification_error(df, "class", "scored.class")
## [1] 0.1933702
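# Check: accuracy and the classification error rate should sum to 1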
accuracy(df, "class", "scored.class") +
classification_error(df, "class", "scored.class")
## [1] 1
STEP 5 - Precision
precision <- function(data, actual, predicted) {
cm <- table(data[[actual]], data[[predicted]])
TP <- cm["1", "1"]
FP <- cm["0", "1"]
TP / (TP + FP)
}
precision(df, "class", "scored.class")
## [1] 0.84375
STEP 6 - Sensitivity (Recall)
sensitivity <- function(data, actual, predicted) {
cm <- table(data[[actual]], data[[predicted]])
TP <- cm["1", "1"]
FN <- cm["1", "0"]
TP / (TP + FN)
}
sensitivity(df, "class", "scored.class")
## [1] 0.4736842
STEP 7 - Specificity
specificity <- function(data, actual, predicted) {
cm <- table(data[[actual]], data[[predicted]])
TN <- cm["0", "0"]
FP <- cm["0", "1"]
TN / (TN + FP)
}
specificity(df, "class", "scored.class")
## [1] 0.9596774
STEP 8 - F1 Score
f1_score <- function(data, actual, predicted) {
P <- precision(data, actual, predicted)
R <- sensitivity(data, actual, predicted)
(2 * P * R) / (P + R)
}
f1_score(df, "class", "scored.class")
## [1] 0.6067416
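As an optional cross-check, F1 can also be written directly in terms of the confusion-matrix counts as 2TP / (2TP + FP + FN); evaluating that expression on the same table should reproduce the value above. This is just an algebraic rearrangement of the precision/recall form, not a separate metric.
# F1 = 2TP / (2TP + FP + FN), using the same confusion matrix as above
cm <- table(df$class, df$scored.class)
2 * cm["1", "1"] / (2 * cm["1", "1"] + cm["0", "1"] + cm["1", "0"])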
STEP 9 - Show F1 Is Always Between 0 and 1
Let P denote precision and R denote sensitivity, with 0 < P ≤ 1 and 0 < R ≤ 1, and recall that
F1 = 2PR / (P + R)
Because R ≤ 1, we have PR ≤ P, and because P ≤ 1, we have PR ≤ R. Adding these two inequalities gives 2PR ≤ P + R, so
F1 = 2PR / (P + R) ≤ 1
Both the numerator 2PR and the denominator P + R are strictly positive whenever P > 0 and R > 0, so F1 > 0.
Therefore 0 < F1 ≤ 1: the F1 score always lies between 0 and 1, and it equals 1 only when P = R = 1.
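A quick numerical spot-check of this bound (not required by the assignment): evaluating the F1 formula over a grid of precision and recall values in (0, 1] should give a range that never leaves (0, 1].
# Evaluate F1 over a grid of P and R in (0, 1] and inspect the range
grid <- expand.grid(P = seq(0.01, 1, by = 0.01), R = seq(0.01, 1, by = 0.01))
range(with(grid, 2 * P * R / (P + R)))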
STEP 10 - Custom ROC & AUC Function
roc_curve <- function(data, actual, prob_col) {
thresholds <- seq(0, 1, by = 0.01)
TPR <- c()
FPR <- c()
for (t in thresholds) {
# Predicted class at threshold t
pred_temp <- ifelse(data[[prob_col]] >= t, 1, 0)
# Force both to have levels 0 and 1 so table is always 2x2
actual_fac <- factor(data[[actual]], levels = c(0, 1))
pred_fac <- factor(pred_temp, levels = c(0, 1))
cm <- table(actual_fac, pred_fac)
TP <- cm["1", "1"]
FP <- cm["0", "1"]
TN <- cm["0", "0"]
FN <- cm["1", "0"]
# Safely compute TPR and FPR (avoid division by zero)
if ((TP + FN) == 0) {
tpr <- 0
} else {
tpr <- TP / (TP + FN)
}
if ((FP + TN) == 0) {
fpr <- 0
} else {
fpr <- FP / (FP + TN)
}
TPR <- c(TPR, tpr)
FPR <- c(FPR, fpr)
}
# AUC via the trapezoidal rule. The thresholds run from 0 to 1, so FPR and TPR
# are stored in decreasing order; reverse them before integrating, otherwise
# the signed sum comes out negative.
auc <- sum(diff(rev(FPR)) * (head(rev(TPR), -1) + tail(rev(TPR), -1)) / 2)
# Plot ROC
plot(FPR, TPR, type = "l",
xlab = "False Positive Rate",
ylab = "True Positive Rate",
main = "ROC Curve")
# Return everything useful
return(list(
auc = auc,
thresholds = thresholds,
TPR = TPR,
FPR = FPR
))
}
# Run it
roc_results <- roc_curve(df, "class", "scored.probability")
roc_results$auc
## [1] 0.8488964
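As a further cross-check on the custom AUC (not part of the assignment), the AUC can also be estimated through its rank interpretation: the probability that a randomly chosen positive case receives a higher predicted probability than a randomly chosen negative case, counting ties as one half. Because this uses every observed score as a cutoff rather than a 0.01 grid, it should land close to the value above and agree with pROC's value in Step 12.
pos <- df$scored.probability[df$class == 1]
neg <- df$scored.probability[df$class == 0]
# P(score of a positive > score of a negative) + 0.5 * P(tie)
mean(outer(pos, neg, ">")) + 0.5 * mean(outer(pos, neg, "=="))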
STEP 11 - Use Your Functions to Produce All Metrics
accuracy(df, "class", "scored.class")
## [1] 0.8066298
classification_error(df, "class", "scored.class")
## [1] 0.1933702
precision(df, "class", "scored.class")
## [1] 0.84375
sensitivity(df, "class", "scored.class")
## [1] 0.4736842
specificity(df, "class", "scored.class")
## [1] 0.9596774
f1_score(df, "class", "scored.class")
## [1] 0.6067416
roc_results <- roc_curve(df, "class", "scored.probability")
roc_results$auc
## [1] 0.8488964
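Optionally, the individual calls can be gathered into a single tibble so all of the metrics appear in one table (the metric and value column names below are my own choice, not part of the assignment):
tibble(
  metric = c("Accuracy", "Classification error rate", "Precision",
             "Sensitivity", "Specificity", "F1 score", "AUC"),
  value  = c(accuracy(df, "class", "scored.class"),
             classification_error(df, "class", "scored.class"),
             precision(df, "class", "scored.class"),
             sensitivity(df, "class", "scored.class"),
             specificity(df, "class", "scored.class"),
             f1_score(df, "class", "scored.class"),
             roc_results$auc)
)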
STEP 12 - caret Package Comparison
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked _by_ '.GlobalEnv':
##
## precision, sensitivity, specificity
## The following object is masked from 'package:purrr':
##
## lift
caret_results <- confusionMatrix(
factor(df$scored.class),
factor(df$class),
positive = "1"
)
caret_sens <- caret::sensitivity(
data = factor(df$scored.class),
reference = factor(df$class),
positive = "1"
)
caret_spec <- caret::specificity(
data = factor(df$scored.class),
reference = factor(df$class),
negative = "0"
)
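To make the comparison explicit, the caret values can be printed next to the hand-rolled ones; they should agree to rounding (the caret and custom names below are just display labels):
c(caret = caret_sens, custom = sensitivity(df, "class", "scored.class"))
c(caret = caret_spec, custom = specificity(df, "class", "scored.class"))
# caret's overall accuracy, for comparison with accuracy() above
caret_results$overall["Accuracy"]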
roc_obj <- roc(df$class, df$scored.probability)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc_obj)
## Area under the curve: 0.8503
plot(roc_obj, main="pROC ROC Curve")