#1 Download the classification output data set (attached in Blackboard to the assignment).
# Load the packages used below: dplyr for glimpse(), caret for confusionMatrix(), pROC for roc()/auc()
library(dplyr)
library(caret)
library(pROC)

data <- read.csv("https://raw.githubusercontent.com/Angelogallardo05/DAta-621-HW2/refs/heads/main/classification-output-data.csv")
glimpse(data)
## Rows: 181
## Columns: 11
## $ pregnant <int> 7, 2, 3, 1, 4, 1, 9, 8, 1, 2, 5, 5, 13, 0, 7, 12, 0…
## $ glucose <int> 124, 122, 107, 91, 83, 100, 89, 120, 79, 123, 88, 1…
## $ diastolic <int> 70, 76, 62, 64, 86, 74, 62, 78, 60, 48, 78, 72, 60,…
## $ skinfold <int> 33, 27, 13, 24, 19, 12, 0, 0, 42, 32, 30, 43, 0, 26…
## $ insulin <int> 215, 200, 48, 0, 0, 46, 0, 0, 48, 165, 0, 75, 0, 50…
## $ bmi <dbl> 25.5, 35.9, 22.9, 29.2, 29.3, 19.5, 22.5, 25.0, 43.…
## $ pedigree <dbl> 0.161, 0.483, 0.678, 0.192, 0.317, 0.149, 0.142, 0.…
## $ age <int> 37, 26, 23, 21, 34, 28, 33, 64, 23, 26, 37, 33, 41,…
## $ class <int> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, …
## $ scored.class <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, …
## $ scored.probability <dbl> 0.32845226, 0.27319044, 0.10966039, 0.05599835, 0.1…
#2 The data set has three key columns we will use:
#   class: the actual class for the observation
#   scored.class: the predicted class for the observation (based on a threshold of 0.5)
#   scored.probability: the predicted probability of success for the observation
# Use the table() function to get the raw confusion matrix for this scored data set. Make sure you understand
# the output. In particular, do the rows represent the actual or predicted class? The columns?
confusion_matrix <- table(Actual = data$class, Predicted = data$scored.class)
print(confusion_matrix)
## Predicted
## Actual 0 1
## 0 119 5
## 1 30 27
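# Because the table() call labels its first argument Actual and its second Predicted, the rows of the
# matrix are the actual classes and the columns are the predicted classes. As an optional sanity check,
# addmargins() (base R) appends the row and column totals:
addmargins(confusion_matrix)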
#3 Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the accuracy of the predictions.
calculate_accuracy <- function(data) {
  confusion_matrix <- table(Actual = data$class, Predicted = data$scored.class)
  TN <- confusion_matrix[1, 1] # True Negatives
  TP <- confusion_matrix[2, 2] # True Positives
  FP <- confusion_matrix[1, 2] # False Positives
  FN <- confusion_matrix[2, 1] # False Negatives
  # Calculate accuracy
  accuracy <- (TP + TN) / (TP + TN + FP + FN)
  return(accuracy)
}
accuracy <- calculate_accuracy(data)
print(accuracy)
## [1] 0.8066298
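# Optional sanity check: accuracy is simply the proportion of observations whose predicted class
# matches the actual class, so this base R one-liner should agree with the value above.
mean(data$class == data$scored.class)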
#4 Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the classification error rate of the predictions.
calculate_error_rate <- function(data) {
  confusion_matrix <- table(Actual = data$class, Predicted = data$scored.class)
  TN <- confusion_matrix[1, 1] # True Negatives
  TP <- confusion_matrix[2, 2] # True Positives
  FP <- confusion_matrix[1, 2] # False Positives
  FN <- confusion_matrix[2, 1] # False Negatives
  error_rate <- (FP + FN) / (TP + FP + TN + FN)
  return(error_rate)
}
# Combined helper that returns both accuracy and the error rate so we can verify that they sum to 1
calculate_metrics <- function(data) {
  confusion_matrix <- table(Actual = data$class, Predicted = data$scored.class)
  TN <- confusion_matrix[1, 1] # True Negatives
  TP <- confusion_matrix[2, 2] # True Positives
  FP <- confusion_matrix[1, 2] # False Positives
  FN <- confusion_matrix[2, 1] # False Negatives
  accuracy <- (TP + TN) / (TP + TN + FP + FN)
  error_rate <- (FP + FN) / (TP + FP + TN + FN)
  return(list(accuracy = accuracy, error_rate = error_rate))
}
metrics <- calculate_metrics(data)
print(metrics$accuracy)
## [1] 0.8066298
print(metrics$error_rate)
## [1] 0.1933702
# Verify the sum (all.equal() is used instead of == to avoid fragile floating-point comparisons)
if (isTRUE(all.equal(metrics$accuracy + metrics$error_rate, 1))) {
  print("Accuracy and error rate sum to 1.")
} else {
  print("There is an error in the calculations.")
}
## [1] "Accuracy and error rate sum to 1."
#5 Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the precision of the predictions.
calculate_precision <- function(data) {
  confusion_matrix <- table(Actual = data$class, Predicted = data$scored.class)
  TP <- confusion_matrix[2, 2]
  FP <- confusion_matrix[1, 2]
  precision <- TP / (TP + FP)
  return(precision)
}
precision <- calculate_precision(data)
print(precision)
## [1] 0.84375
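# Cross-check against the confusion matrix in #2: with TP = 27 and FP = 5,
# precision = TP / (TP + FP) = 27 / 32 = 0.84375, matching the value above.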
#6 Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the sensitivity of the predictions. Sensitivity is also known as recall.
calculate_sensitivity <- function(data) {
  confusion_matrix <- table(Actual = data$class, Predicted = data$scored.class)
  TP <- confusion_matrix[2, 2]
  FN <- confusion_matrix[2, 1]
  sensitivity <- TP / (TP + FN)
  return(sensitivity)
}
sensitivity <- calculate_sensitivity(data)
print(sensitivity)
## [1] 0.4736842
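# Cross-check against the confusion matrix: with TP = 27 and FN = 30,
# sensitivity = TP / (TP + FN) = 27 / 57, approximately 0.4737, matching the value above.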
#7 Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the specificity of the predictions.
calculate_specificity <- function(data) {
  confusion_matrix <- table(Actual = data$class, Predicted = data$scored.class)
  TN <- confusion_matrix[1, 1] # True Negatives
  FP <- confusion_matrix[1, 2] # False Positives
  specificity <- TN / (TN + FP)
  return(specificity)
}
specificity <- calculate_specificity(data)
print(specificity)
## [1] 0.9596774
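# Cross-check against the confusion matrix: with TN = 119 and FP = 5,
# specificity = TN / (TN + FP) = 119 / 124, approximately 0.9597, matching the value above.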
#8 Write a function that takes the data set as a dataframe, with actual and predicted classifications identified, and returns the F1 score of the predictions.
calculate_f1_score <- function(data) {
  confusion_matrix <- table(Actual = data$class, Predicted = data$scored.class)
  TP <- confusion_matrix[2, 2] # True Positives
  FP <- confusion_matrix[1, 2] # False Positives
  FN <- confusion_matrix[2, 1] # False Negatives
  # Calculate Precision
  precision <- TP / (TP + FP)
  # Calculate Sensitivity (Recall)
  sensitivity <- TP / (TP + FN)
  # Calculate F1 Score
  f1_score <- (2 * precision * sensitivity) / (precision + sensitivity)
  return(f1_score)
}
f1_score <- calculate_f1_score(data)
print(f1_score)
## [1] 0.6067416
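# Cross-check: F1 is the harmonic mean of precision and sensitivity,
# 2 * 0.84375 * 0.4736842 / (0.84375 + 0.4736842), approximately 0.6067, matching the value above.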
#9 What are the bounds on the F1 score? Show that the F1 score will always be between 0 and 1.
# (Hint: If 0 < a < 1 and 0 < b < 1 then ab < a.)
# Helper that computes F1 directly from precision and sensitivity values
# (named f1_from_pr so it does not mask the data-based calculate_f1_score() defined above)
f1_from_pr <- function(precision, sensitivity) {
  if (precision + sensitivity == 0) {
    return(0)
  }
  return(2 * precision * sensitivity / (precision + sensitivity))
}
precision_values <- seq(0, 1, by = 0.1)
sensitivity_values <- seq(0, 1, by = 0.1)
f1_scores <- outer(precision_values, sensitivity_values, Vectorize(f1_from_pr))
f1_table <- expand.grid(precision = precision_values, sensitivity = sensitivity_values)
f1_table$F1_score <- as.vector(f1_scores)
print(f1_table)
## precision sensitivity F1_score
## 1 0.0 0.0 0.0000000
## 2 0.1 0.0 0.0000000
## 3 0.2 0.0 0.0000000
## 4 0.3 0.0 0.0000000
## 5 0.4 0.0 0.0000000
## 6 0.5 0.0 0.0000000
## 7 0.6 0.0 0.0000000
## 8 0.7 0.0 0.0000000
## 9 0.8 0.0 0.0000000
## 10 0.9 0.0 0.0000000
## 11 1.0 0.0 0.0000000
## 12 0.0 0.1 0.0000000
## 13 0.1 0.1 0.1000000
## 14 0.2 0.1 0.1333333
## 15 0.3 0.1 0.1500000
## 16 0.4 0.1 0.1600000
## 17 0.5 0.1 0.1666667
## 18 0.6 0.1 0.1714286
## 19 0.7 0.1 0.1750000
## 20 0.8 0.1 0.1777778
## 21 0.9 0.1 0.1800000
## 22 1.0 0.1 0.1818182
## 23 0.0 0.2 0.0000000
## 24 0.1 0.2 0.1333333
## 25 0.2 0.2 0.2000000
## 26 0.3 0.2 0.2400000
## 27 0.4 0.2 0.2666667
## 28 0.5 0.2 0.2857143
## 29 0.6 0.2 0.3000000
## 30 0.7 0.2 0.3111111
## 31 0.8 0.2 0.3200000
## 32 0.9 0.2 0.3272727
## 33 1.0 0.2 0.3333333
## 34 0.0 0.3 0.0000000
## 35 0.1 0.3 0.1500000
## 36 0.2 0.3 0.2400000
## 37 0.3 0.3 0.3000000
## 38 0.4 0.3 0.3428571
## 39 0.5 0.3 0.3750000
## 40 0.6 0.3 0.4000000
## 41 0.7 0.3 0.4200000
## 42 0.8 0.3 0.4363636
## 43 0.9 0.3 0.4500000
## 44 1.0 0.3 0.4615385
## 45 0.0 0.4 0.0000000
## 46 0.1 0.4 0.1600000
## 47 0.2 0.4 0.2666667
## 48 0.3 0.4 0.3428571
## 49 0.4 0.4 0.4000000
## 50 0.5 0.4 0.4444444
## 51 0.6 0.4 0.4800000
## 52 0.7 0.4 0.5090909
## 53 0.8 0.4 0.5333333
## 54 0.9 0.4 0.5538462
## 55 1.0 0.4 0.5714286
## 56 0.0 0.5 0.0000000
## 57 0.1 0.5 0.1666667
## 58 0.2 0.5 0.2857143
## 59 0.3 0.5 0.3750000
## 60 0.4 0.5 0.4444444
## 61 0.5 0.5 0.5000000
## 62 0.6 0.5 0.5454545
## 63 0.7 0.5 0.5833333
## 64 0.8 0.5 0.6153846
## 65 0.9 0.5 0.6428571
## 66 1.0 0.5 0.6666667
## 67 0.0 0.6 0.0000000
## 68 0.1 0.6 0.1714286
## 69 0.2 0.6 0.3000000
## 70 0.3 0.6 0.4000000
## 71 0.4 0.6 0.4800000
## 72 0.5 0.6 0.5454545
## 73 0.6 0.6 0.6000000
## 74 0.7 0.6 0.6461538
## 75 0.8 0.6 0.6857143
## 76 0.9 0.6 0.7200000
## 77 1.0 0.6 0.7500000
## 78 0.0 0.7 0.0000000
## 79 0.1 0.7 0.1750000
## 80 0.2 0.7 0.3111111
## 81 0.3 0.7 0.4200000
## 82 0.4 0.7 0.5090909
## 83 0.5 0.7 0.5833333
## 84 0.6 0.7 0.6461538
## 85 0.7 0.7 0.7000000
## 86 0.8 0.7 0.7466667
## 87 0.9 0.7 0.7875000
## 88 1.0 0.7 0.8235294
## 89 0.0 0.8 0.0000000
## 90 0.1 0.8 0.1777778
## 91 0.2 0.8 0.3200000
## 92 0.3 0.8 0.4363636
## 93 0.4 0.8 0.5333333
## 94 0.5 0.8 0.6153846
## 95 0.6 0.8 0.6857143
## 96 0.7 0.8 0.7466667
## 97 0.8 0.8 0.8000000
## 98 0.9 0.8 0.8470588
## 99 1.0 0.8 0.8888889
## 100 0.0 0.9 0.0000000
## 101 0.1 0.9 0.1800000
## 102 0.2 0.9 0.3272727
## 103 0.3 0.9 0.4500000
## 104 0.4 0.9 0.5538462
## 105 0.5 0.9 0.6428571
## 106 0.6 0.9 0.7200000
## 107 0.7 0.9 0.7875000
## 108 0.8 0.9 0.8470588
## 109 0.9 0.9 0.9000000
## 110 1.0 0.9 0.9473684
## 111 0.0 1.0 0.0000000
## 112 0.1 1.0 0.1818182
## 113 0.2 1.0 0.3333333
## 114 0.3 1.0 0.4615385
## 115 0.4 1.0 0.5714286
## 116 0.5 1.0 0.6666667
## 117 0.6 1.0 0.7500000
## 118 0.7 1.0 0.8235294
## 119 0.8 1.0 0.8888889
## 120 0.9 1.0 0.9473684
## 121 1.0 1.0 1.0000000
min_f1 <- min(f1_table$F1_score)
max_f1 <- max(f1_table$F1_score)
cat("Minimum F1 Score:", min_f1, "\n")
## Minimum F1 Score: 0
cat("Maximum F1 Score:", max_f1, "\n")
## Maximum F1 Score: 1
if (min_f1 >= 0 && max_f1 <= 1) {
  cat("The F1 score is always between 0 and 1.\n")
} else {
  cat("There is an error in the F1 score calculation.\n")
}
## The F1 score is always between 0 and 1.
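# Algebraic argument (following the hint): for 0 < precision < 1 and 0 < sensitivity < 1, the numerator
# 2 * precision * sensitivity is positive, so F1 > 0. For the upper bound, the hint gives
# precision * sensitivity < precision and precision * sensitivity < sensitivity; adding these,
# 2 * precision * sensitivity < precision + sensitivity, so F1 < 1. The boundary values 0 and 1
# appear in the table only because the grid above includes the endpoints 0 and 1 themselves.
# Quick numeric confirmation over the grid already computed:
all(f1_table$F1_score >= 0 & f1_table$F1_score <= 1)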
#10 Write a function that generates an ROC curve from a data set with a true classification column (class in
# our example) and a probability column (scored.probability in our example). Your function should return a
# list that includes the plot of the ROC curve and a vector that contains the calculated area under the curve
# (AUC). Note that I recommend using a sequence of thresholds ranging from 0 to 1 at 0.01 intervals.
generate_roc_curve <- function(data, actual_col, probability_col) {
  actual <- data[[actual_col]]
  predicted_probabilities <- data[[probability_col]]
  roc_curve <- roc(actual, predicted_probabilities, plot = TRUE,
                   print.auc = TRUE, legacy.axes = TRUE,
                   col = "blue", lwd = 2, main = "ROC Curve")
  auc_value <- auc(roc_curve)
  return(list(roc_plot = roc_curve, auc_value = auc_value))
}
result <- generate_roc_curve(data, actual_col = "class", probability_col = "scored.probability")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
cat("AUC value:", result$auc_value, "\n")
## AUC value: 0.8503113
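# The function above leans on pROC. A minimal, hand-rolled sketch of the threshold-sweep approach the
# prompt recommends (thresholds from 0 to 1 in 0.01 steps), using only base R and approximating the AUC
# with the trapezoidal rule; the result should be close to the pROC value above, with small differences
# possible at this grid resolution. The helper name manual_roc is introduced here for illustration.
manual_roc <- function(data, actual_col = "class", probability_col = "scored.probability") {
  actual <- data[[actual_col]]
  probs <- data[[probability_col]]
  thresholds <- seq(0, 1, by = 0.01)
  # Sensitivity (TPR) and false positive rate (1 - specificity) at each threshold
  tpr <- sapply(thresholds, function(t) sum(probs >= t & actual == 1) / sum(actual == 1))
  fpr <- sapply(thresholds, function(t) sum(probs >= t & actual == 0) / sum(actual == 0))
  # Reverse so the points run from (0, 0) to (1, 1) as the threshold decreases
  tpr_r <- rev(tpr)
  fpr_r <- rev(fpr)
  # Trapezoidal approximation of the area under the curve
  auc_manual <- sum(diff(fpr_r) * (head(tpr_r, -1) + tail(tpr_r, -1)) / 2)
  plot(fpr_r, tpr_r, type = "l", col = "blue", lwd = 2,
       xlab = "False Positive Rate", ylab = "True Positive Rate", main = "Manual ROC Curve")
  abline(0, 1, lty = 2)
  return(list(tpr = tpr_r, fpr = fpr_r, auc = auc_manual))
}
manual_result <- manual_roc(data)
cat("Manual AUC approximation:", manual_result$auc, "\n")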
#11 Use the functions created above to produce all of the classification metrics for the data set.
# Print the results
cat("Accuracy:", accuracy, "\n")
## Accuracy: 0.8066298
cat("Error Rate:", metrics$error_rate, "\n")
## Error Rate: 0.1933702
cat("Precision:", precision, "\n")
## Precision: 0.84375
cat("Sensitivity (Recall):", sensitivity, "\n")
## Sensitivity (Recall): 0.4736842
cat("Specificity:", specificity, "\n")
## Specificity: 0.9596774
cat("F1 Score:", f1_score, "\n")
## F1 Score: 0.6067416
cat("AUC:",result$auc_value, "\n")
## AUC: 0.8503113
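# Optional sketch: collect the metrics computed above into one summary data frame for easier reading.
metrics_summary <- data.frame(
  Metric = c("Accuracy", "Error Rate", "Precision", "Sensitivity (Recall)",
             "Specificity", "F1 Score", "AUC"),
  Value = c(accuracy, metrics$error_rate, precision, sensitivity,
            specificity, f1_score, as.numeric(result$auc_value))
)
print(metrics_summary)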
#12 Investigate the caret package. In particular, consider the functions confusionMatrix, sensitivity, and specificity. Apply the functions to the data set. How do the results compare with your own functions?
cm <- confusionMatrix(factor(data$scored.class), factor(data$class))
print(cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 119 30
## 1 5 27
##
## Accuracy : 0.8066
## 95% CI : (0.7415, 0.8615)
## No Information Rate : 0.6851
## P-Value [Acc > NIR] : 0.0001712
##
## Kappa : 0.4916
##
## Mcnemar's Test P-Value : 4.976e-05
##
## Sensitivity : 0.9597
## Specificity : 0.4737
## Pos Pred Value : 0.7987
## Neg Pred Value : 0.8438
## Prevalence : 0.6851
## Detection Rate : 0.6575
## Detection Prevalence : 0.8232
## Balanced Accuracy : 0.7167
##
## 'Positive' Class : 0
##
accuracy_caret <- cm$overall['Accuracy']
sensitivity_caret <- cm$byClass["Sensitivity"]
specificity_caret <- cm$byClass["Specificity"]
cat("Caret Accuracy:", accuracy_caret, "\n")
## Caret Accuracy: 0.8066298
cat("Caret Sensitivity (Recall):", sensitivity_caret, "\n")
## Caret Sensitivity (Recall): 0.9596774
cat("Caret Specificity:", specificity_caret, "\n")
## Caret Specificity: 0.4736842
cat("Custom Accuracy:", accuracy, "\n")
## Custom Accuracy: 0.8066298
cat("Custom Sensitivity (Recall):", sensitivity, "\n")
## Custom Sensitivity (Recall): 0.4736842
cat("Custom Specificity:", specificity, "\n")
## Custom Specificity: 0.9596774
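# The caret sensitivity and specificity look swapped relative to the custom functions because
# confusionMatrix() treats the first factor level ("0") as the positive class by default, while the
# custom functions treat class 1 as positive. Passing the documented positive argument should bring
# the two sets of numbers into agreement:
cm_pos1 <- confusionMatrix(factor(data$scored.class), factor(data$class), positive = "1")
cm_pos1$byClass[c("Sensitivity", "Specificity")]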
#13 Investigate the pROC package. Use it to generate an ROC curve for the data set. How do the results compare with your own functions?
roc_result <- roc(data$class, data$scored.probability)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_result, main="ROC Curve", col="blue", lwd=2)
auc_value <- auc(roc_result)
text(0.5, 0.5, paste("AUC =", round(auc_value, 2)), adj=c(0.5, 0.5))
cat("AUC from pROC:", auc_value, "\n")
## AUC from pROC: 0.8503113
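# Comparison: the AUC reported here by pROC (0.8503) is identical to the value returned by the
# generate_roc_curve() function in #10, since that function wraps pROC's roc()/auc(), and it should
# also be close to any manual threshold-sweep approximation of the curve.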