raw <- read_csv('https://raw.githubusercontent.com/kglan/MSDS/main/DATA621/HW2/classification-output-data.csv', col_names = TRUE)
## Rows: 181 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (11): pregnant, glucose, diastolic, skinfold, insulin, bmi, pedigree, ag...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df <- raw[, c("class", "scored.class", "scored.probability")]
The data set has three key columns we will use:

* class: the actual class for the observation
* scored.class: the predicted class for the observation (based on a threshold of 0.5)
* scored.probability: the predicted probability of success for the observation

Use the table() function to get the raw confusion matrix for this scored dataset. Make sure you understand the output. In particular, do the rows represent the actual or predicted class? The columns?
confusion_matrix <- table(df$class, df$scored.class)
print(confusion_matrix)
##
## 0 1
## 0 119 5
## 1 30 27
The rows represent the actual class labels, while the columns represent the predicted class labels.
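One quick way to confirm the orientation is to name the dimensions in the call to table(); the labels "Actual" and "Predicted" below are purely illustrative and not part of the original output.

# Naming the arguments attaches dimension labels to the confusion matrix,
# making it explicit that rows are actual classes and columns are predicted classes
labeled_cm <- table(Actual = df$class, Predicted = df$scored.class)
print(labeled_cm)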
Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the accuracy of the predictions.
\[ \text{Accuracy} = \frac{\text{TP} + \text{TN}}{\text{TP} + \text{FP} + \text{TN} + \text{FN}} \]
get_accuracy <- function(x) {
  # Count true positives, false positives, true negatives, and false negatives
  TP <- sum(x$class == 1 & x$scored.class == 1)
  FP <- sum(x$class == 0 & x$scored.class == 1)
  TN <- sum(x$class == 0 & x$scored.class == 0)
  FN <- sum(x$class == 1 & x$scored.class == 0)
  accuracy <- (TP + TN) / (TP + FP + TN + FN)
  return(accuracy)
}
accuracy <- get_accuracy(df)
print(paste("Accuracy:", accuracy))
## [1] "Accuracy: 0.806629834254144"
\[ \text{Classification Error Rate} = \frac{\text{FP} + \text{FN}}{\text{TP} + \text{FP} + \text{TN} + \text{FN}} \]
get_error_rate <- function(x) {
  TP <- sum(x$class == 1 & x$scored.class == 1)
  FP <- sum(x$class == 0 & x$scored.class == 1)
  TN <- sum(x$class == 0 & x$scored.class == 0)
  FN <- sum(x$class == 1 & x$scored.class == 0)
  error_rate <- (FP + FN) / (TP + FP + TN + FN)
  return(error_rate)
}
error_rate <- get_error_rate(df)
print(paste("Error Rate:", error_rate))
## [1] "Error Rate: 0.193370165745856"
sum_accuracy_error <- accuracy + error_rate
print(paste("Sum of Accuracy and Error Rate:", sum_accuracy_error))
## [1] "Sum of Accuracy and Error Rate: 1"
\[ \text{Precision} = \frac{\text{TP}}{\text{TP} + \text{FP}} \]
get_precision <- function(x) {
  TP <- sum(x$class == 1 & x$scored.class == 1)
  FP <- sum(x$class == 0 & x$scored.class == 1)
  precision <- TP / (TP + FP)
  return(precision)
}
precision <- get_precision(df)
print(paste("Precision:", precision))
## [1] "Precision: 0.84375"
\[ \text{Sensitivity (Recall)} = \frac{\text{TP}}{\text{TP} + \text{FN}} \]
get_sensitivity <- function(x) {
  TP <- sum(x$class == 1 & x$scored.class == 1)
  FN <- sum(x$class == 1 & x$scored.class == 0)
  sensitivity <- TP / (TP + FN)
  return(sensitivity)
}
sensitivity <- get_sensitivity(df)
print(paste("Sensitivity (Recall):", sensitivity))
## [1] "Sensitivity (Recall): 0.473684210526316"
\[ \text{Specificity} = \frac{\text{TN}}{\text{TN} + \text{FP}} \]
get_specificity <- function(x) {
  TN <- sum(x$class == 0 & x$scored.class == 0)
  FP <- sum(x$class == 0 & x$scored.class == 1)
  specificity <- TN / (TN + FP)
  return(specificity)
}
specificity <- get_specificity(df)
print(paste("Specificity:", specificity))
## [1] "Specificity: 0.959677419354839"
\[ F1 \text{ Score} = \frac{2 \times \text{Precision} \times \text{Sensitivity}}{\text{Precision} + \text{Sensitivity}} \]
get_f1_score <- function(x) {
  TP <- sum(x$class == 1 & x$scored.class == 1)
  FP <- sum(x$class == 0 & x$scored.class == 1)
  FN <- sum(x$class == 1 & x$scored.class == 0)
  precision <- TP / (TP + FP)
  sensitivity <- TP / (TP + FN)
  f1_score <- 2 * (precision * sensitivity) / (precision + sensitivity)
  return(f1_score)
}
f1_score <- get_f1_score(df)
print(paste("F1 Score:", f1_score))
## [1] "F1 Score: 0.606741573033708"
Precision and sensitivity are both non-negative, and both appear in the numerator and denominator of the F1 score, so the F1 score can never be negative; that gives us the lower bound of 0. Each is also a ratio of true positives to a count at least as large (TP / (TP + FP) and TP / (TP + FN)), so each is at most 1. The F1 score is largest when both equal 1: the numerator is 2 x 1 x 1 = 2 and the denominator is 1 + 1 = 2, giving 2/2 = 1. Therefore the F1 score always falls between 0 and 1.
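As a quick numeric sanity check (an illustrative sketch, not one of the assigned questions), we can evaluate the F1 formula over a grid of precision and sensitivity values in (0, 1] and confirm the result never leaves the interval [0, 1].

# Evaluate F1 over a grid of precision/sensitivity values in (0, 1]
# and check that every result falls between 0 and 1
p_vals <- seq(0.01, 1, by = 0.01)
grid <- expand.grid(precision = p_vals, sensitivity = p_vals)
f1_vals <- 2 * grid$precision * grid$sensitivity / (grid$precision + grid$sensitivity)
range(f1_vals)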
ROC <- function(x, y){
  # Order the actual classes by predicted probability, highest first
  x <- x[order(y, decreasing = TRUE)]
  # Cumulative true positive and false positive rates at each threshold
  TPR <- cumsum(x) / sum(x)
  FPR <- cumsum(!x) / sum(!x)
  xy <- data.frame(TPR, FPR, x)
  FPR_df <- c(diff(xy$FPR), 0)
  TPR_df <- c(diff(xy$TPR), 0)
  # Trapezoidal approximation of the area under the curve
  AUC <- round(sum(xy$TPR * FPR_df) + sum(TPR_df * FPR_df)/2, 4)
  plot(xy$FPR, xy$TPR, type = "l",
       main = "ROC Curve",
       xlab = "False Positive Rate",
       ylab = "True Positive Rate")
  abline(a = 0, b = 1)
  legend(.6, .4, AUC, title = "AUC")
}
ROC(df$class, df$scored.probability)
metrics <- c(get_accuracy(df), get_error_rate(df), get_precision(df), get_sensitivity(df), get_specificity(df), get_f1_score(df))
names(metrics) <- c("Accuracy", "Classification Error Rate", "Precision", "Sensitivity", "Specificity", "F1 Score")
kable(metrics, col.names = "Metrics")
| | Metrics |
|---|---|
| Accuracy | 0.8066298 |
| Classification Error Rate | 0.1933702 |
| Precision | 0.8437500 |
| Sensitivity | 0.4736842 |
| Specificity | 0.9596774 |
| F1 Score | 0.6067416 |
df12 <- df %>%
  select(scored.class, class) %>%
  mutate(scored.class = as.factor(scored.class),
         class = as.factor(class))
# Compare caret's confusionMatrix results with the hand-written functions
cm <- confusionMatrix(df12$scored.class, df12$class, positive = "1")
caret_package <- c(cm$overall["Accuracy"], cm$byClass["Sensitivity"], cm$byClass["Specificity"])
personal_functions <- c(get_accuracy(df12), get_sensitivity(df12), get_specificity(df12))
combo <- cbind(caret_package, personal_functions)
kable(combo)
| | caret_package | personal_functions |
|---|---|---|
| Accuracy | 0.8066298 | 0.8066298 |
| Sensitivity | 0.4736842 | 0.4736842 |
| Specificity | 0.9596774 | 0.9596774 |
roc_curve <- roc(df$class, df$scored.probability)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Plot the ROC curve
plot(roc_curve, main = "ROC Curve", print.auc = TRUE)
# Re-plot with the custom ROC function for comparison
ROC(df$class, df$scored.probability)
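As an extra check (not part of the original output), pROC's auc() can be called on the roc object and compared against the AUC shown in the legend of the custom plot; the two values should agree closely.

# Extract the AUC from the pROC object; this should closely match
# the trapezoidal AUC computed inside the custom ROC() function
auc(roc_curve)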