In this homework assignment, you will work through various classification metrics. You will be asked to create functions in R to carry out the calculations, and you will also investigate functions in packages that obtain the equivalent results. Finally, you will create graphical output that can also be used to evaluate the output of classification models, such as binary logistic regression.
classification_output_data <- read.csv("/Users/Michele/Desktop/classification_output_data.csv", header=TRUE)
summary(classification_output_data)
##     pregnant         glucose        diastolic        skinfold
##  Min.   : 0.000   Min.   : 57.0   Min.   : 38.0   Min.   : 0.0
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 64.0   1st Qu.: 0.0
##  Median : 3.000   Median :112.0   Median : 70.0   Median :22.0
##  Mean   : 3.862   Mean   :118.3   Mean   : 71.7   Mean   :19.8
##  3rd Qu.: 6.000   3rd Qu.:136.0   3rd Qu.: 78.0   3rd Qu.:32.0
##  Max.   :15.000   Max.   :197.0   Max.   :104.0   Max.   :54.0
##     insulin           bmi           pedigree           age
##  Min.   :  0.00   Min.   :19.40   Min.   :0.0850   Min.   :21.00
##  1st Qu.:  0.00   1st Qu.:26.30   1st Qu.:0.2570   1st Qu.:24.00
##  Median :  0.00   Median :31.60   Median :0.3910   Median :30.00
##  Mean   : 63.77   Mean   :31.58   Mean   :0.4496   Mean   :33.31
##  3rd Qu.:105.00   3rd Qu.:36.00   3rd Qu.:0.5800   3rd Qu.:41.00
##  Max.   :543.00   Max.   :50.00   Max.   :2.2880   Max.   :67.00
##      class          scored.class    scored.probability
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.02323
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.11702
##  Median :0.0000   Median :0.0000   Median :0.23999
##  Mean   :0.3149   Mean   :0.1768   Mean   :0.30373
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:0.43093
##  Max.   :1.0000   Max.   :1.0000   Max.   :0.94633
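Only the last three columns are used in this assignment: class is the actual label, scored.class is the predicted label, and scored.probability is the model's predicted probability that an observation belongs to class 1.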
(confusion_matrix <- table("Actual"=classification_output_data$class, "Predicted"=classification_output_data$scored.class))
##       Predicted
## Actual   0   1
##      0 119   5
##      1  30  27
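Reading the matrix off in the order the functions below use: TN = 119 (actual 0, predicted 0), FN = 30 (actual 1, predicted 0), FP = 5 (actual 0, predicted 1), and TP = 27 (actual 1, predicted 1), for 181 observations in total.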
accuracy <- function(df){
  # data.frame() unrolls the 2x2 table column-major,
  # so Freq is ordered TN, FN, FP, TP
  cols <- c("TN", "FN", "FP", "TP")
  confusion_matrix <- table("Actual" = df$class, "Predicted" = df$scored.class)
  confusion_matrix <- data.frame(confusion_matrix, index = cols)
  # Accuracy = (TP + TN) / total
  accuracy_value <- (confusion_matrix$Freq[4] + confusion_matrix$Freq[1]) / sum(confusion_matrix$Freq)
  return(accuracy_value)
}
(Accuracy <- accuracy(classification_output_data))
## [1] 0.8066298
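This matches a hand calculation from the confusion matrix: accuracy = (TP + TN) / total = (27 + 119) / 181 ≈ 0.8066.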
error <- function(df){
  cols <- c("TN", "FN", "FP", "TP")
  confusion_matrix <- table("Actual" = df$class, "Predicted" = df$scored.class)
  confusion_matrix <- data.frame(confusion_matrix, index = cols)
  # Classification error rate = (FN + FP) / total
  error_value <- (confusion_matrix$Freq[2] + confusion_matrix$Freq[3]) / sum(confusion_matrix$Freq)
  return(error_value)
}
(Error <- error(classification_output_data))
## [1] 0.1933702
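By hand: error rate = (FP + FN) / total = (5 + 30) / 181 ≈ 0.1934.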
Accuracy + Error
## [1] 1
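Accuracy and error rate sum to exactly 1 because every prediction lands in exactly one cell of the confusion matrix, either correct (TP, TN) or incorrect (FP, FN).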
precision <- function(df){
  cols <- c("TN", "FN", "FP", "TP")
  confusion_matrix <- table("Actual" = df$class, "Predicted" = df$scored.class)
  confusion_matrix <- data.frame(confusion_matrix, index = cols)
  # Precision = TP / (TP + FP)
  precision_value <- confusion_matrix$Freq[4] / (confusion_matrix$Freq[4] + confusion_matrix$Freq[3])
  return(precision_value)
}
precision(classification_output_data)
## [1] 0.84375
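By hand: precision = TP / (TP + FP) = 27 / (27 + 5) = 0.84375.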
sensitivity <- function(df){
  cols <- c("TN", "FN", "FP", "TP")
  confusion_matrix <- table("Actual" = df$class, "Predicted" = df$scored.class)
  confusion_matrix <- data.frame(confusion_matrix, index = cols)
  # Sensitivity (recall) = TP / (TP + FN)
  sensitivity_value <- confusion_matrix$Freq[4] / (confusion_matrix$Freq[4] + confusion_matrix$Freq[2])
  return(sensitivity_value)
}
(sensitivity(classification_output_data))
## [1] 0.4736842
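By hand: sensitivity = TP / (TP + FN) = 27 / (27 + 30) ≈ 0.4737.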
specificity <- function(df){
  cols <- c("TN", "FN", "FP", "TP")
  confusion_matrix <- table("Actual" = df$class, "Predicted" = df$scored.class)
  confusion_matrix <- data.frame(confusion_matrix, index = cols)
  # Specificity = TN / (TN + FP)
  specificity_value <- confusion_matrix$Freq[1] / (confusion_matrix$Freq[1] + confusion_matrix$Freq[3])
  return(specificity_value)
}
(specificity(classification_output_data))
## [1] 0.9596774
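By hand: specificity = TN / (TN + FP) = 119 / (119 + 5) ≈ 0.9597.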
f1_score <- function(df){
  precision_value <- precision(df)
  sensitivity_value <- sensitivity(df)
  # F1 is the harmonic mean of precision and sensitivity
  F1_Score <- (2 * precision_value * sensitivity_value) / (precision_value + sensitivity_value)
  return(F1_Score)
}
(f1_score(classification_output_data))
## [1] 0.6067416
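By hand: 2(0.84375)(0.4737) / (0.84375 + 0.4737) ≈ 0.6067, the harmonic mean of the precision and sensitivity computed above.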
f1_function <- function(precision, sensitivity){
  # F1 computed directly from a given precision and sensitivity,
  # used here to check the endpoints of the F1 range
  f1score <- (2 * precision * sensitivity) / (precision + sensitivity)
  return(f1score)
}
(f1_function(0, .5))
## [1] 0
(f1_function(1, 1))
## [1] 1
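These endpoint checks illustrate the general bounds 0 ≤ F1 ≤ 1: since 0 ≤ p, s ≤ 1, we have ps ≤ p and ps ≤ s, so 2ps ≤ p + s and F1 = 2ps / (p + s) ≤ 1, while F1 ≥ 0 because every term is nonnegative (provided p + s > 0 so the ratio is defined).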
library(ggplot2)
roc_plot <- function(df, probability) {
  threshold <- seq(0, 1, 0.01)
  pos <- sum(df$class == 1)
  neg <- sum(df$class == 0)
  # At each threshold, TPR is the share of actual positives scored above it,
  # and FPR is the share of actual negatives scored above it
  TPR <- sapply(threshold, function(t) sum(df[[probability]] > t & df$class == 1) / pos)
  FPR <- sapply(threshold, function(t) sum(df[[probability]] > t & df$class == 0) / neg)
  classification_data <- data.frame(TPR, FPR)
  roc_curve <- ggplot(classification_data, aes(FPR, TPR)) +
    geom_line() +
    geom_abline(intercept = 0) +
    ggtitle("ROC Curve for Classification data")
  # Trapezoid rule: each segment's area is its width (the drop in FPR)
  # times the average of its two TPR endpoints
  height <- (TPR[-1] + TPR[-length(TPR)]) / 2
  width <- -diff(FPR)
  AUC <- sum(height * width)
  return(list(AUC = AUC, roc_curve))
}
roc_plot(classification_output_data, "scored.probability")
## $AUC
## [1] 0.8488964
##
## [[2]]
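An AUC of roughly 0.85 means the model ranks a randomly chosen positive case above a randomly chosen negative case about 85% of the time.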
Accuracy <- accuracy(classification_output_data)
Error <- error(classification_output_data)
Precision <- precision(classification_output_data)
Sensitivity <- sensitivity(classification_output_data)
Specificity <- specificity(classification_output_data)
F1_score <- f1_score(classification_output_data)
ROC <- roc_plot(classification_output_data, "scored.probability")
AUC <- ROC$AUC
classification_data <- t(data.frame(Accuracy, Error, Precision, Sensitivity, Specificity, F1_score, AUC))
classification_data
##                  [,1]
## Accuracy    0.8066298
## Error       0.1933702
## Precision   0.8437500
## Sensitivity 0.4736842
## Specificity 0.9596774
## F1_score    0.6067416
## AUC         0.8488964
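Next, these hand-rolled results are verified against the caret and pROC packages.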
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked _by_ '.GlobalEnv':
##
## precision, sensitivity, specificity
caret <- confusionMatrix(factor(classification_output_data$scored.class), factor(classification_output_data$class), positive = '1')
caret
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 119  30
##          1   5  27
## 
##                Accuracy : 0.8066
##                  95% CI : (0.7415, 0.8615)
##     No Information Rate : 0.6851
##     P-Value [Acc > NIR] : 0.0001712
## 
##                   Kappa : 0.4916
##  Mcnemar's Test P-Value : 4.976e-05
## 
##             Sensitivity : 0.4737
##             Specificity : 0.9597
##          Pos Pred Value : 0.8438
##          Neg Pred Value : 0.7987
##              Prevalence : 0.3149
##          Detection Rate : 0.1492
##    Detection Prevalence : 0.1768
##       Balanced Accuracy : 0.7167
## 
##        'Positive' Class : 1
## 
Sensitivity == caret$byClass["Sensitivity"]
## Sensitivity
## TRUE
Specificity == caret$byClass["Specificity"]
## Specificity
## TRUE
Accuracy == caret$overall["Accuracy"]
## Accuracy
## TRUE
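caret reports the equivalent of our precision function as the positive predictive value. As one more check beyond the three above, it can be compared directly; all.equal (rather than ==) is the safer comparison here, since caret derives the value from sensitivity, specificity, and prevalence, which can differ from TP / (TP + FP) in the last floating-point digits. The check below should return TRUE.
# Compare our precision to caret's positive predictive value
isTRUE(all.equal(Precision, unname(caret$byClass["Pos Pred Value"])))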
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
roc_curve <- roc(classification_output_data$class, classification_output_data$scored.probability)
plot(roc_curve, main = "ROC Curve for Classification data")
roc_curve$auc
## Area under the curve: 0.8503
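The pROC value of 0.8503 is slightly higher than the 0.8489 from our roc_plot function because pROC evaluates the curve at every distinct predicted probability, while our function samples only 101 evenly spaced thresholds, which coarsens the curve slightly.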