1. Dataset

# Packages used below: readr (read_csv), dplyr (select/mutate and %>%),
# knitr (kable), caret (confusionMatrix), pROC (roc)
library(readr)
library(dplyr)
library(knitr)
library(caret)
library(pROC)

raw <- read_csv('https://raw.githubusercontent.com/kglan/MSDS/main/DATA621/HW2/classification-output-data.csv', col_names = TRUE)
## Rows: 181 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (11): pregnant, glucose, diastolic, skinfold, insulin, bmi, pedigree, ag...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df <- raw[, c("class", "scored.class", "scored.probability")]

2. Confusion Matrix

The data set has three key columns we will use:

* class: the actual class for the observation
* scored.class: the predicted class for the observation (based on a threshold of 0.5)
* scored.probability: the predicted probability of success for the observation

Use the table() function to get the raw confusion matrix for this scored dataset. Make sure you understand the output. In particular, do the rows represent the actual or predicted class? The columns?

confusion_matrix <- table(df$class, df$scored.class)
print(confusion_matrix)
##    
##       0   1
##   0 119   5
##   1  30  27

The rows represent the actual labels and the columns represent the predicted labels.
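
As an optional check (not part of the original output), naming the arguments to table() attaches dimension labels to the printout, which makes the row/column orientation explicit:

# Named arguments become dimension names: rows come from the first argument
# (the actual class), columns from the second (the predicted class).
table(Actual = df$class, Predicted = df$scored.class)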

3. Accuracy

Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the accuracy of the predictions.

\[ \text{Accuracy} = \frac{\text{TP} + \text{TN}}{\text{TP} + \text{FP} + \text{TN} + \text{FN}} \]

get_accuracy <- function(x) {
  TP <- sum(x$class == 1 & x$scored.class == 1)
  FP <- sum(x$class == 0 & x$scored.class == 1)
  TN <- sum(x$class == 0 & x$scored.class == 0)
  FN <- sum(x$class == 1 & x$scored.class == 0)
  
  accuracy <- (TP + TN) / (TP + FP + TN + FN)
  
  return(accuracy)
}

accuracy <- get_accuracy(df)
print(paste("Accuracy:", accuracy))
## [1] "Accuracy: 0.806629834254144"

4. Classification Error Rate

\[ \text{Classification Error Rate} = \frac{\text{FP} + \text{FN}}{\text{TP} + \text{FP} + \text{TN} + \text{FN}} \]

get_error_rate <- function(x) {
  TP <- sum(x$class == 1 & x$scored.class == 1)
  FP <- sum(x$class == 0 & x$scored.class == 1)
  TN <- sum(x$class == 0 & x$scored.class == 0)
  FN <- sum(x$class == 1 & x$scored.class == 0)
  error_rate <- (FP + FN) / (TP + FP + TN + FN)
  return(error_rate)
}


error_rate <- get_error_rate(df)
print(paste("Error Rate:", error_rate))
## [1] "Error Rate: 0.193370165745856"
sumaccuracyerror <- accuracy + error_rate
print(paste("Sum of Accuracy and Error Rate:", sumaccuracyerror))
## [1] "Sum of Accuracy and Error Rate: 1"

5. Precision

\[ \text{Precision} = \frac{\text{TP}}{\text{TP} + \text{FP}} \]

get_precision <- function(x) {
  TP <- sum(x$class == 1 & x$scored.class == 1)
  FP <- sum(x$class == 0 & x$scored.class == 1)
  precision <- TP / (TP + FP)
  return(precision)
}
precision <- get_precision(df)
print(paste("Precision:", precision))
## [1] "Precision: 0.84375"

6. Sensitivity (Recall)

\[ \text{Sensitivity (Recall)} = \frac{\text{TP}}{\text{TP} + \text{FN}} \]

get_sensitivity <- function(x) {
  TP <- sum(x$class == 1 & x$scored.class == 1)
  FN <- sum(x$class == 1 & x$scored.class == 0)
  sensitivity <- TP / (TP + FN)
  return(sensitivity)
}

sensitivity <- get_sensitivity(df)
print(paste("Sensitivity (Recall):", sensitivity))
## [1] "Sensitivity (Recall): 0.473684210526316"

7. Specificity

\[ \text{Specificity} = \frac{\text{TN}}{\text{TN} + \text{FP}} \]

get_specificity <- function(x) {
  TN <- sum(x$class == 0 & x$scored.class == 0)
  FP <- sum(x$class == 0 & x$scored.class == 1)
  specificity <- TN / (TN + FP)
  return(specificity)
}

specificity <- get_specificity(df)
print(paste("Specificity:", specificity))
## [1] "Specificity: 0.959677419354839"

8. F1 Score

\[ F1 \text{ Score} = \frac{2 \times \text{Precision} \times \text{Sensitivity}}{\text{Precision} + \text{Sensitivity}} \]

get_f1_score <- function(x) {
  TP <- sum(x$class == 1 & x$scored.class == 1)
  FP <- sum(x$class == 0 & x$scored.class == 1)
  FN <- sum(x$class == 1 & x$scored.class == 0)
  
  precision <- TP / (TP + FP)
  sensitivity <- TP / (TP + FN)
  
  f1_score <- 2 * (precision * sensitivity) / (precision + sensitivity)
  return(f1_score)
}

f1_score <- get_f1_score(df)
print(paste("F1 Score:", f1_score))
## [1] "F1 Score: 0.606741573033708"

9. Bounds of the F1 Score

Precision and sensitivity are each ratios of non-negative counts in which the numerator (TP) can never exceed the denominator, so both lie between 0 and 1. The F1 score is the harmonic mean of the two, and a harmonic mean always falls between the smallest and largest of its inputs, so the F1 score is likewise bounded between 0 and 1. The lower bound is approached as either precision or sensitivity approaches 0, and the upper bound is reached when both equal 1: 2 x 1 x 1 = 2 while 1 + 1 = 2, thus 2/2 = 1.
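
As a quick numerical sanity check (not part of the original output), the bounds can be verified by evaluating the F1 formula over randomly drawn precision/sensitivity pairs:

# Illustrative check only: the F1 formula stays within (0, 1] whenever
# precision and sensitivity are both in (0, 1].
set.seed(621)
p <- runif(10000, min = .Machine$double.eps, max = 1)  # simulated precision values
s <- runif(10000, min = .Machine$double.eps, max = 1)  # simulated sensitivity values
f1 <- 2 * p * s / (p + s)
range(f1)              # never drops to 0 or below, never exceeds 1
all(f1 > 0 & f1 <= 1)  # TRUE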

10. ROC Curve

ROC <- function(x, y){
  # Sort the actual labels by descending predicted probability; sweeping the
  # threshold from high to low, the cumulative positive rates trace the ROC curve.
  x <- x[order(y, decreasing = TRUE)]
  TPR <- cumsum(x) / sum(x)
  FPR <- cumsum(!x) / sum(!x)
  xy <- data.frame(TPR, FPR, x)
  
  # Approximate the area under the curve with the trapezoidal rule
  FPR_df <- c(diff(xy$FPR), 0)
  TPR_df <- c(diff(xy$TPR), 0)
  AUC <- round(sum(xy$TPR * FPR_df) + sum(TPR_df * FPR_df) / 2, 4)
  
  plot(xy$FPR, xy$TPR, type = "l",
       main = "ROC Curve",
       xlab = "False Positive Rate",
       ylab = "True Positive Rate")
  abline(a = 0, b = 1)
  legend(.6, .4, AUC, title = "AUC")
}

ROC(df$class, df$scored.probability)

11. Metrics

metrics <- c(get_accuracy(df), get_error_rate(df), get_precision(df), get_sensitivity(df), get_specificity(df), get_f1_score(df))
names(metrics) <- c("Accuracy", "Classification Error Rate", "Precision", "Sensitivity", "Specificity", "F1 Score")
kable(metrics, col.names = "Metrics")
|                           |   Metrics|
|:--------------------------|---------:|
|Accuracy                   | 0.8066298|
|Classification Error Rate  | 0.1933702|
|Precision                  | 0.8437500|
|Sensitivity                | 0.4736842|
|Specificity                | 0.9596774|
|F1 Score                   | 0.6067416|

12. Caret Comparison

df12 <- df %>%
  select(scored.class, class) %>%
  mutate(scored.class = as.factor(scored.class), 
         class = as.factor(class))

cm <- confusionMatrix(df12$scored.class, df12$class, positive = "1")

caret_package <- c(cm$overall["Accuracy"], cm$byClass["Sensitivity"], cm$byClass["Specificity"])
personal_functions <- c(get_accuracy(df12), get_sensitivity(df12), get_specificity(df12))

combo <- cbind(caret_package, personal_functions)
kable(combo)
|            | caret_package| personal_functions|
|:-----------|-------------:|------------------:|
|Accuracy    |     0.8066298|          0.8066298|
|Sensitivity |     0.4736842|          0.4736842|
|Specificity |     0.9596774|          0.9596774|

13. ROC Curve Comparison

roc_curve <- roc(df$class, df$scored.probability)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Plot the ROC curve
plot(roc_curve, main = "ROC Curve", print.auc = TRUE)

ROC(df$class, df$scored.probability)
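
For a direct numeric comparison with the custom ROC() output above, the AUC can also be pulled from the pROC object (an optional check; its printed value is not reproduced here):

# Optional: report pROC's AUC alongside the AUC computed by the custom ROC() function
auc(roc_curve)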