1
# Load the scored classification data set straight from GitHub; the first row
# of the CSV holds the column names.
raw <- read_csv(
  "https://raw.githubusercontent.com/kglan/MSDS/main/DATA621/HW2/classification-output-data.csv",
  col_names = TRUE
)
## Rows: 181 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (11): pregnant, glucose, diastolic, skinfold, insulin, bmi, pedigree, ag...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep only the actual class, the predicted class and the predicted probability.
keep_cols <- c("class", "scored.class", "scored.probability")
df <- raw[keep_cols]
2
# Cross-tabulate the two label columns: the first argument (class) indexes the
# rows and the second (scored.class) indexes the columns.
confusion_matrix <- table(df[["class"]], df[["scored.class"]])
print(confusion_matrix)
##
## 0 1
## 0 119 5
## 1 30 27
Rows represent the actual labels (`class`), while columns represent the
predicted labels (`scored.class`).
3
# Accuracy: share of scored observations whose predicted label equals the
# actual label, i.e. (TP + TN) / (TP + FP + TN + FN).
#
# x: data frame with columns `class` (actual) and `scored.class` (predicted),
#    both compared against the values 0 and 1.
# Returns a single number.
get_accuracy <- function(x) {
  actual <- x$class
  predicted <- x$scored.class

  true_pos <- sum(actual == 1 & predicted == 1)
  false_pos <- sum(actual == 0 & predicted == 1)
  true_neg <- sum(actual == 0 & predicted == 0)
  false_neg <- sum(actual == 1 & predicted == 0)

  n_correct <- true_pos + true_neg
  n_scored <- true_pos + false_pos + true_neg + false_neg
  n_correct / n_scored
}
# Accuracy of the scored classifications in df.
accuracy <- get_accuracy(df)
print(paste0("Accuracy: ", accuracy))
## [1] "Accuracy: 0.806629834254144"
4
# Classification error rate: share of scored observations whose predicted
# label differs from the actual label, i.e. (FP + FN) / (TP + FP + TN + FN).
#
# x: data frame with columns `class` (actual) and `scored.class` (predicted),
#    both compared against the values 0 and 1.
# Returns a single number.
get_error_rate <- function(x) {
  actual <- x$class
  predicted <- x$scored.class

  true_pos <- sum(actual == 1 & predicted == 1)
  false_pos <- sum(actual == 0 & predicted == 1)
  true_neg <- sum(actual == 0 & predicted == 0)
  false_neg <- sum(actual == 1 & predicted == 0)

  n_wrong <- false_pos + false_neg
  n_scored <- true_pos + false_pos + true_neg + false_neg
  n_wrong / n_scored
}
# Classification error rate of the scored classifications in df.
error_rate <- get_error_rate(df)
print(paste0("Error Rate: ", error_rate))
## [1] "Error Rate: 0.193370165745856"
# Sanity check: accuracy and error rate are complements, so they add up to 1.
sumaccuracyerror <- error_rate + accuracy
print(paste0("Sum of Accuracy and Error Rate: ", sumaccuracyerror))
## [1] "Sum of Accuracy and Error Rate: 1"
5
# Precision: share of predicted positives that are actual positives,
# i.e. TP / (TP + FP).
#
# x: data frame with columns `class` (actual) and `scored.class` (predicted),
#    both compared against the values 0 and 1.
# Returns a single number.
get_precision <- function(x) {
  actual <- x$class
  predicted <- x$scored.class

  true_pos <- sum(actual == 1 & predicted == 1)
  false_pos <- sum(actual == 0 & predicted == 1)

  true_pos / (true_pos + false_pos)
}
# Precision of the scored classifications in df.
precision <- get_precision(df)
print(paste0("Precision: ", precision))
## [1] "Precision: 0.84375"
6
# Sensitivity (recall): share of actual positives that were predicted
# positive, i.e. TP / (TP + FN).
#
# x: data frame with columns `class` (actual) and `scored.class` (predicted),
#    both compared against the values 0 and 1.
# Returns a single number.
get_sensitivity <- function(x) {
  actual <- x$class
  predicted <- x$scored.class

  true_pos <- sum(actual == 1 & predicted == 1)
  false_neg <- sum(actual == 1 & predicted == 0)

  true_pos / (true_pos + false_neg)
}
# Sensitivity (recall) of the scored classifications in df.
sensitivity <- get_sensitivity(df)
print(paste0("Sensitivity (Recall): ", sensitivity))
## [1] "Sensitivity (Recall): 0.473684210526316"
7
# Specificity: share of actual negatives that were predicted negative,
# i.e. TN / (TN + FP).
#
# x: data frame with columns `class` (actual) and `scored.class` (predicted),
#    both compared against the values 0 and 1.
# Returns a single number.
get_specificity <- function(x) {
  actual <- x$class
  predicted <- x$scored.class

  true_neg <- sum(actual == 0 & predicted == 0)
  false_pos <- sum(actual == 0 & predicted == 1)

  true_neg / (true_neg + false_pos)
}
# Specificity of the scored classifications in df.
specificity <- get_specificity(df)
print(paste0("Specificity: ", specificity))
## [1] "Specificity: 0.959677419354839"
8
# F1 score: 2 * precision * sensitivity / (precision + sensitivity), with
# precision = TP / (TP + FP) and sensitivity = TP / (TP + FN).
#
# x: data frame with columns `class` (actual) and `scored.class` (predicted),
#    both compared against the values 0 and 1.
# Returns a single number.
get_f1_score <- function(x) {
  actual <- x$class
  predicted <- x$scored.class

  true_pos <- sum(actual == 1 & predicted == 1)
  false_pos <- sum(actual == 0 & predicted == 1)
  false_neg <- sum(actual == 1 & predicted == 0)

  prec <- true_pos / (true_pos + false_pos)
  sens <- true_pos / (true_pos + false_neg)

  numerator <- 2 * (prec * sens)
  numerator / (prec + sens)
}
# F1 score of the scored classifications in df.
f1_score <- get_f1_score(df)
print(paste0("F1 Score: ", f1_score))
## [1] "F1 Score: 0.606741573033708"
9
10
#' Plot a ROC curve and annotate it with the area under the curve (AUC).
#'
#' Observations are ranked by decreasing score. One curve point is produced
#' per distinct score value, so tied scores form a single (possibly diagonal)
#' segment and the AUC no longer depends on the row order of the input. The
#' curve always starts at (0, 0).
#'
#' @param x Actual class labels, coded 0/1 (or logical); 1 is the positive
#'   class.
#' @param y Scores (e.g. predicted probabilities) for the positive class; must
#'   have the same length as `x`.
#' @return Invisibly, the value returned by the final `legend()` call. The
#'   function is called for its side effect of drawing the plot.
ROC <- function(x, y) {
  stopifnot(length(x) == length(y))

  # Drop incomplete pairs; a single NA would otherwise turn every cumulative
  # rate into NA.
  complete <- !is.na(x) & !is.na(y)
  x <- x[complete]
  y <- y[complete]

  n_pos <- sum(x)
  n_neg <- sum(!x)
  if (n_pos == 0 || n_neg == 0) {
    stop("ROC() needs at least one positive and one negative observation",
         call. = FALSE)
  }

  ranking <- order(y, decreasing = TRUE)
  x <- x[ranking]
  y <- y[ranking]

  # Tied scores share one threshold: only the last row in each run of equal
  # scores corresponds to a point on the curve.
  at_threshold <- !duplicated(y, fromLast = TRUE)

  TPR <- c(0, (cumsum(x) / n_pos)[at_threshold])
  FPR <- c(0, (cumsum(!x) / n_neg)[at_threshold])
  xy <- data.frame(TPR, FPR)

  # Trapezoidal rule over consecutive curve points.
  n_pts <- nrow(xy)
  AUC <- round(
    sum(diff(xy$FPR) * (xy$TPR[-1] + xy$TPR[-n_pts]) / 2),
    4
  )

  plot(xy$FPR, xy$TPR, type = "l",
       main = "ROC Curve",
       xlab = "False Positive Rate",
       ylab = "True Positive Rate")
  abline(a = 0, b = 1)  # chance line
  legend(.6, .4, AUC, title = "AUC")
}
# Draw the hand-written ROC curve for the scored data.
ROC(x = df$class, y = df$scored.probability)

11
# Collect every hand-written metric in one named vector and render it as a
# one-column table.
metrics <- c(
  "Accuracy" = get_accuracy(df),
  "Classification Error Rate" = get_error_rate(df),
  "Precision" = get_precision(df),
  "Sensitivity" = get_sensitivity(df),
  "Specificity" = get_specificity(df),
  "F1 Score" = get_f1_score(df)
)
kable(metrics, col.names = "Metrics")
|                          |   Metrics|
|:-------------------------|---------:|
|Accuracy                  | 0.8066298|
|Classification Error Rate | 0.1933702|
|Precision                 | 0.8437500|
|Sensitivity               | 0.4736842|
|Specificity               | 0.9596774|
|F1 Score                  | 0.6067416|
12
# caret's confusionMatrix() needs factors, so build a two-column copy of the
# label columns converted to factors.
df12 <- df %>%
  transmute(
    scored.class = as.factor(scored.class),
    class = as.factor(class)
  )

# NOTE(review): the result is stored under the name `c`, which masks base::c
# as a variable (calls to c() still resolve to the function). The name is kept
# because code outside this section may read it.
c <- confusionMatrix(
  data = df12$scored.class,
  reference = df12$class,
  positive = "1"
)

# Side-by-side comparison: caret's figures vs. the hand-written functions.
caret_package <- c(
  c$overall["Accuracy"],
  c$byClass[c("Sensitivity", "Specificity")]
)
written_function <- c(
  get_accuracy(df12),
  get_sensitivity(df12),
  get_specificity(df12)
)
combo <- cbind(
  caret_package = caret_package,
  written_function = written_function
)
kable(combo)
|            | caret_package| written_function|
|:-----------|-------------:|----------------:|
|Accuracy    |     0.8066298|        0.8066298|
|Sensitivity |     0.4736842|        0.4736842|
|Specificity |     0.9596774|        0.9596774|
13
# Build the pROC curve object from the actual labels and predicted probabilities.
roc_curve <- roc(response = df$class, predictor = df$scored.probability)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Draw the pROC curve and print its AUC on the plot
plot(
  roc_curve,
  main = "ROC Curve",
  print.auc = TRUE
)

# Redraw the hand-written ROC curve for comparison with the pROC plot.
ROC(x = df$class, y = df$scored.probability)
