1

# Download the scored classification data; the first row holds column names.
raw <- read_csv(
  file = "https://raw.githubusercontent.com/kglan/MSDS/main/DATA621/HW2/classification-output-data.csv",
  col_names = TRUE
)
## Rows: 181 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (11): pregnant, glucose, diastolic, skinfold, insulin, bmi, pedigree, ag...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep only the actual label, the predicted label and the predicted score.
keep_cols <- c("class", "scored.class", "scored.probability")
df <- raw[keep_cols]
rm(keep_cols)

2

# Cross-tabulate labels: the first argument (class) indexes the rows and the
# second (scored.class) indexes the columns.
confusion_matrix <- table(df[["class"]], df[["scored.class"]])
print(confusion_matrix)
##    
##       0   1
##   0 119   5
##   1  30  27

Rows represent the actual labels (`class`), while columns represent the predicted labels (`scored.class`).

3

# Accuracy: share of rows whose predicted label equals the actual label.
#
# x: data frame with columns `class` (actual) and `scored.class` (predicted),
#    both compared against 0 and 1.
# Returns (TP + TN) / (TP + FP + TN + FN) as a single number.
get_accuracy <- function(x) {
  actual <- x$class
  predicted <- x$scored.class

  true_pos <- sum(actual == 1 & predicted == 1)
  true_neg <- sum(actual == 0 & predicted == 0)
  false_pos <- sum(actual == 0 & predicted == 1)
  false_neg <- sum(actual == 1 & predicted == 0)

  n_correct <- true_pos + true_neg
  n_correct / (n_correct + false_pos + false_neg)
}

# Accuracy of the scored data; stored in `accuracy` because the sum check
# after the error rate reuses it.
accuracy <- get_accuracy(df)
print(paste("Accuracy:", accuracy))
## [1] "Accuracy: 0.806629834254144"

4

# Classification error rate: share of rows whose predicted label differs from
# the actual label.
#
# x: data frame with columns `class` (actual) and `scored.class` (predicted),
#    both compared against 0 and 1.
# Returns (FP + FN) / (TP + FP + TN + FN) as a single number.
get_error_rate <- function(x) {
  is_pos <- x$class == 1
  is_neg <- x$class == 0
  said_pos <- x$scored.class == 1
  said_neg <- x$scored.class == 0

  n_wrong <- sum(is_neg & said_pos) + sum(is_pos & said_neg)
  n_right <- sum(is_pos & said_pos) + sum(is_neg & said_neg)
  n_wrong / (n_wrong + n_right)
}


# Error rate of the scored data.
error_rate <- get_error_rate(df)
print(paste("Error Rate:", error_rate))
## [1] "Error Rate: 0.193370165745856"
# Sanity check: accuracy and error rate share the same denominator and split
# its rows between them, so the two should add up to 1.
sumaccuracyerror <- accuracy + error_rate
print(paste("Sum of Accuracy and Error Rate:", sumaccuracyerror))
## [1] "Sum of Accuracy and Error Rate: 1"

5

# Precision: of the rows predicted positive, the share that really are
# positive.
#
# x: data frame with columns `class` (actual) and `scored.class` (predicted),
#    both compared against 0 and 1.
# Returns TP / (TP + FP); NaN when nothing was predicted positive.
get_precision <- function(x) {
  flagged <- x$scored.class == 1
  hits <- sum(flagged & x$class == 1)
  false_alarms <- sum(flagged & x$class == 0)
  hits / (hits + false_alarms)
}
# Precision of the scored data.
precision <- get_precision(df)
print(paste("Precision:", precision))
## [1] "Precision: 0.84375"

6

# Sensitivity (recall, true positive rate): of the actual positives, the share
# that were predicted positive.
#
# x: data frame with columns `class` (actual) and `scored.class` (predicted),
#    both compared against 0 and 1.
# Returns TP / (TP + FN); NaN when there are no actual positives.
get_sensitivity <- function(x) {
  actual_pos <- x$class == 1
  caught <- sum(actual_pos & x$scored.class == 1)
  missed <- sum(actual_pos & x$scored.class == 0)
  caught / (caught + missed)
}

# Sensitivity (recall) of the scored data.
sensitivity <- get_sensitivity(df)
print(paste("Sensitivity (Recall):", sensitivity))
## [1] "Sensitivity (Recall): 0.473684210526316"

7

# Specificity (true negative rate): of the actual negatives, the share that
# were predicted negative.
#
# x: data frame with columns `class` (actual) and `scored.class` (predicted),
#    both compared against 0 and 1.
# Returns TN / (TN + FP); NaN when there are no actual negatives.
get_specificity <- function(x) {
  actual_neg <- x$class == 0
  cleared <- sum(actual_neg & x$scored.class == 0)
  false_alarms <- sum(actual_neg & x$scored.class == 1)
  cleared / (cleared + false_alarms)
}

# Specificity of the scored data.
specificity <- get_specificity(df)
print(paste("Specificity:", specificity))
## [1] "Specificity: 0.959677419354839"

8

# F1 score: harmonic mean of precision and sensitivity.
#
# x: data frame with columns `class` (actual) and `scored.class` (predicted),
#    both compared against 0 and 1.
# Returns 2 * TP / (2 * TP + FP + FN) as a single number. This is algebraically
# the same as 2 * precision * sensitivity / (precision + sensitivity), but it
# stays defined (0) when there are no true positives while FP or FN are
# non-zero; the ratio-of-ratios form yields 0 / 0 = NaN in that case.
# The result is NaN only when TP, FP and FN are all zero.
get_f1_score <- function(x) {
  TP <- sum(x$class == 1 & x$scored.class == 1)
  FP <- sum(x$class == 0 & x$scored.class == 1)
  FN <- sum(x$class == 1 & x$scored.class == 0)

  2 * TP / (2 * TP + FP + FN)
}

# F1 score of the scored data.
f1_score <- get_f1_score(df)
print(paste("F1 Score:", f1_score))
## [1] "F1 Score: 0.606741573033708"

9

10

# Plot an ROC curve and report the area under it.
#
# x: actual labels, coded 0/1 (numeric, logical, or a factor with those levels).
# y: predicted scores, one per label; higher means "more likely positive".
#
# Draws the curve with a 45-degree reference line and a legend showing the AUC
# rounded to 4 decimals. Invisibly returns legend()'s placement list (`rect`,
# `text`) extended with `AUC` (the rounded area) and `curve` (data frame of
# TPR/FPR points, starting at the origin).
#
# Stops if the inputs differ in length, contain NA, hold labels other than
# 0/1, or lack either class (the rates would be 0 / 0).
ROC <- function(x, y) {
  stopifnot(
    length(x) == length(y),
    !anyNA(x),
    !anyNA(y),
    all(x %in% c(0, 1))
  )
  is_pos <- x == 1
  n_pos <- sum(is_pos)
  n_neg <- sum(!is_pos)
  if (n_pos == 0 || n_neg == 0) {
    stop("ROC() needs at least one positive and one negative label.",
         call. = FALSE)
  }

  ord <- order(y, decreasing = TRUE)
  is_pos <- is_pos[ord]
  y <- y[ord]

  # Rows sharing a score cross any threshold together, so only the cumulative
  # counts at the end of each run of tied scores are real curve points.
  # Stepping through ties row by row would make the AUC depend on row order.
  last_of_tie <- !duplicated(y, fromLast = TRUE)

  # Prepend the origin (threshold above every score: nothing predicted positive).
  TPR <- c(0, cumsum(is_pos)[last_of_tie] / n_pos)
  FPR <- c(0, cumsum(!is_pos)[last_of_tie] / n_neg)
  xy <- data.frame(TPR, FPR)

  # Trapezoidal rule over consecutive curve points.
  mean_height <- (head(xy$TPR, -1) + tail(xy$TPR, -1)) / 2
  AUC <- round(sum(diff(xy$FPR) * mean_height), 4)

  plot(xy$FPR, xy$TPR, type = "l",
       main = "ROC Curve",
       xlab = "False Positive Rate",
       ylab = "True Positive Rate")
  abline(a = 0, b = 1)
  placement <- legend(.6, .4, AUC, title = "AUC")

  invisible(c(placement, list(AUC = AUC, curve = xy)))
}

# Hand-written ROC curve: actual labels against predicted probabilities.
ROC(x = df$class, y = df$scored.probability)

11

# Collect every hand-written metric into one named vector and render it as a
# single-column table.
metrics <- c(
  "Accuracy" = get_accuracy(df),
  "Classification Error Rate" = get_error_rate(df),
  "Precision" = get_precision(df),
  "Sensitivity" = get_sensitivity(df),
  "Specificity" = get_specificity(df),
  "F1 Score" = get_f1_score(df)
)
kable(metrics, col.names = "Metrics")
Metrics
Accuracy 0.8066298
Classification Error Rate 0.1933702
Precision 0.8437500
Sensitivity 0.4736842
Specificity 0.9596774
F1 Score 0.6067416

12

# confusionMatrix() is given factors here, so convert both label columns.
df12 <- df %>%
  select(scored.class, class) %>%
  mutate(across(everything(), as.factor))

# Predictions first, reference (actual) labels second; "1" is the positive
# class, matching the hand-written functions.
# NOTE(review): the name `c` shadows base::c() as a variable; calls to c(...)
# still resolve to the function, but a rename would be clearer.
c <- confusionMatrix(
  data = df12$scored.class,
  reference = df12$class,
  positive = "1"
)

# Same three metrics from caret and from the hand-written functions, side by
# side.
caret_package <- c(
  c$overall["Accuracy"],
  c$byClass[c("Sensitivity", "Specificity")]
)
written_function <- c(
  get_accuracy(df12),
  get_sensitivity(df12),
  get_specificity(df12)
)

combo <- cbind(caret_package, written_function)
kable(combo)
caret_package written_function
Accuracy 0.8066298 0.8066298
Sensitivity 0.4736842 0.4736842
Specificity 0.9596774 0.9596774

13

# Build the ROC object with roc(): actual labels first, predicted
# probabilities second. The echoed messages below show the level and direction
# it picked automatically.
roc_curve <- roc(df$class, df$scored.probability)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Plot the ROC curve
plot(roc_curve, main = "ROC Curve", print.auc = TRUE)

# Redraw the hand-written curve on the same inputs for a visual comparison.
ROC(df$class,df$scored.probability)