Parts I and II:
data <- read.csv('https://raw.githubusercontent.com/st3vejobs/DATA-621/main/classification-output-data.csv')
keys <- subset(data, select = c("class","scored.class","scored.probability"))
# Exploratory: a raw 3-way table that still includes the continuous probabilities;
# the labeled 2-way confusion matrix is rebuilt below
raw_cm <- table(keys)
cm <- data.frame(raw_cm)
# Relabel the 0/1 codes so the confusion matrix is self-describing
new_keys <- data.frame(keys$scored.probability)
new_keys$class <- keys$class
new_keys$class[new_keys$class == 1] <- 'Positive'
new_keys$class[new_keys$class == 0] <- 'Negative'
new_keys$scored.class <- keys$scored.class
new_keys$scored.class[new_keys$scored.class == 1] <- 'Predicted Positive'
new_keys$scored.class[new_keys$scored.class == 0] <- 'Predicted Negative'
new_keys_tbl <- subset(new_keys, select = c("class", "scored.class"))
tbl_keys <- table(new_keys_tbl)
tbl_keys
##           scored.class
## class      Predicted Negative Predicted Positive
##   Negative                119                  5
##   Positive                 30                 27
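For reference, the same confusion matrix can be produced directly from the raw 0/1 columns; a minimal base R equivalent (the Actual/Predicted labels are my own):
# Actual class on the rows, predicted class on the columns
table(Actual = data$class, Predicted = data$scored.class)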
Part III:
The data frame passed to each function below must contain columns titled 'class', 'scored.class', and 'scored.probability'.
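Every function in Parts III through VIII extracts the same four confusion-matrix counts, so they could share a single helper; a minimal sketch, with confusion_counts being a hypothetical name of my own:
# Hypothetical helper: compute the four confusion-matrix cells once.
# Assumes numeric 0/1 columns named 'class' and 'scored.class'.
confusion_counts <- function(dataframe) {
  list(
    TP = sum(dataframe$class == 1 & dataframe$scored.class == 1),
    FP = sum(dataframe$class == 0 & dataframe$scored.class == 1),
    TN = sum(dataframe$class == 0 & dataframe$scored.class == 0),
    FN = sum(dataframe$class == 1 & dataframe$scored.class == 0)
  )
}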
accuracy <- function(dataframe){
  keys <- data.frame(subset(dataframe, select = c('class','scored.class','scored.probability')))
  keys$class[keys$class == 1] <- "Positive"
  keys$class[keys$class == 0] <- "Negative"
  keys$scored.class[keys$scored.class == 1] <- "Predicted Positive"
  keys$scored.class[keys$scored.class == 0] <- "Predicted Negative"
  keys_tbl <- subset(keys, select = c("class","scored.class"))
  tbl_keys <- table(keys_tbl)
  # table() sorts levels alphabetically, so in column-major order the cells are:
  # [1] Negative/Predicted Negative (TN), [2] Positive/Predicted Negative (FN),
  # [3] Negative/Predicted Positive (FP), [4] Positive/Predicted Positive (TP)
  TP <- as.numeric(tbl_keys[4])
  FP <- as.numeric(tbl_keys[3])
  TN <- as.numeric(tbl_keys[1])
  FN <- as.numeric(tbl_keys[2])
  acc <- (TP + TN)/(TP + FP + TN + FN)
  return(acc)
}
acc <- accuracy(data)
acc
## [1] 0.8066298
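As a quick sanity check (not part of the required function), accuracy is simply the fraction of rows where the prediction agrees with the actual class:
# Should match accuracy(data)
mean(data$class == data$scored.class)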
Part IV:
class_error_rate <- function(dataframe){
  keys <- data.frame(subset(dataframe, select = c('class','scored.class','scored.probability')))
  keys$class[keys$class == 1] <- "Positive"
  keys$class[keys$class == 0] <- "Negative"
  keys$scored.class[keys$scored.class == 1] <- "Predicted Positive"
  keys$scored.class[keys$scored.class == 0] <- "Predicted Negative"
  keys_tbl <- subset(keys, select = c("class","scored.class"))
  tbl_keys <- table(keys_tbl)
  # Cells indexed as in accuracy() above
  TP <- as.numeric(tbl_keys[4])
  FP <- as.numeric(tbl_keys[3])
  TN <- as.numeric(tbl_keys[1])
  FN <- as.numeric(tbl_keys[2])
  cer <- (FP + FN)/(TP + FP + TN + FN)
  return(cer)
}
cer <- class_error_rate(data)
cer
## [1] 0.1933702
verify <- function(accuracy, error_rate){
  if(accuracy + error_rate == 1){
    return(TRUE)
  }
  else return(FALSE)
}
verify(acc, cer)
## [1] TRUE
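One caveat: verify() compares doubles with ==, which happens to work here but can fail due to floating-point rounding; a tolerance-based variant (verify_tol is my own name) is safer in general:
# Tolerance-based check, robust to floating-point error
verify_tol <- function(accuracy, error_rate) {
  isTRUE(all.equal(accuracy + error_rate, 1))
}
verify_tol(acc, cer)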
Part V:
precision <- function(dataframe){
  keys <- data.frame(subset(dataframe, select = c('class','scored.class','scored.probability')))
  keys$class[keys$class == 1] <- "Positive"
  keys$class[keys$class == 0] <- "Negative"
  keys$scored.class[keys$scored.class == 1] <- "Predicted Positive"
  keys$scored.class[keys$scored.class == 0] <- "Predicted Negative"
  keys_tbl <- subset(keys, select = c("class","scored.class"))
  tbl_keys <- table(keys_tbl)
  # Cells indexed as in accuracy() above
  TP <- as.numeric(tbl_keys[4])
  FP <- as.numeric(tbl_keys[3])
  TN <- as.numeric(tbl_keys[1])
  FN <- as.numeric(tbl_keys[2])
  prec <- (TP)/(TP + FP)
  return(prec)
}
precision(data)
## [1] 0.84375
Part VI:
sensitivity <- function(dataframe){
  keys <- data.frame(subset(dataframe, select = c('class','scored.class','scored.probability')))
  keys$class[keys$class == 1] <- "Positive"
  keys$class[keys$class == 0] <- "Negative"
  keys$scored.class[keys$scored.class == 1] <- "Predicted Positive"
  keys$scored.class[keys$scored.class == 0] <- "Predicted Negative"
  keys_tbl <- subset(keys, select = c("class","scored.class"))
  tbl_keys <- table(keys_tbl)
  # Cells indexed as in accuracy() above
  TP <- as.numeric(tbl_keys[4])
  FP <- as.numeric(tbl_keys[3])
  TN <- as.numeric(tbl_keys[1])
  FN <- as.numeric(tbl_keys[2])
  sens <- (TP)/(TP + FN)
  return(sens)
}
sensitivity(data)
## [1] 0.4736842
Part VII:
specificity <- function(dataframe){
  keys <- data.frame(subset(dataframe, select = c('class','scored.class','scored.probability')))
  keys$class[keys$class == 1] <- "Positive"
  keys$class[keys$class == 0] <- "Negative"
  keys$scored.class[keys$scored.class == 1] <- "Predicted Positive"
  keys$scored.class[keys$scored.class == 0] <- "Predicted Negative"
  keys_tbl <- subset(keys, select = c("class","scored.class"))
  tbl_keys <- table(keys_tbl)
  # Cells indexed as in accuracy() above
  TP <- as.numeric(tbl_keys[4])
  FP <- as.numeric(tbl_keys[3])
  TN <- as.numeric(tbl_keys[1])
  FN <- as.numeric(tbl_keys[2])
  spec <- (TN)/(TN + FP)
  return(spec)
}
specificity(data)
## [1] 0.9596774
Part VIII:
f1_score <- function(dataframe){
  keys <- data.frame(subset(dataframe, select = c('class','scored.class','scored.probability')))
  keys$class[keys$class == 1] <- "Positive"
  keys$class[keys$class == 0] <- "Negative"
  keys$scored.class[keys$scored.class == 1] <- "Predicted Positive"
  keys$scored.class[keys$scored.class == 0] <- "Predicted Negative"
  keys_tbl <- subset(keys, select = c("class","scored.class"))
  tbl_keys <- table(keys_tbl)
  # Cells indexed as in accuracy() above
  TP <- as.numeric(tbl_keys[4])
  FP <- as.numeric(tbl_keys[3])
  TN <- as.numeric(tbl_keys[1])
  FN <- as.numeric(tbl_keys[2])
  prec <- (TP)/(TP + FP)
  sens <- (TP)/(TP + FN)
  f1 <- (2 * prec * sens)/(prec + sens)
  return(f1)
}
f1_score(data)
## [1] 0.6067416
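Equivalently, the F1 score can be computed straight from the confusion counts via the identity \(F_1 = \frac{2TP}{2TP + FP + FN}\); with TP = 27, FP = 5, and FN = 30 from the table above, this reproduces the same value:
# F1 from the confusion counts: 54/89
(2 * 27) / (2 * 27 + 5 + 30)
## [1] 0.6067416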
Part IX:
The bounds on the F1 score can be determined by examining the extreme values of the numerator and denominator of the formula.
\(F_1 = \frac{2 \times Precision \times Sensitivity}{Precision + Sensitivity}\)
\(Precision = \frac{TP}{TP + FP}\)
The upper limit for precision is 1: when FP = 0, the ratio reduces to \(\frac{TP}{TP} = 1\), and with FP held fixed, precision approaches 1 as TP grows without bound. The lower limit is 0: with TP held fixed, precision approaches 0 as FP grows without bound.
\(\lim_{TP\to\infty} (Precision) = 1 \qquad \lim_{FP\to\infty} (Precision) = 0\)
\(Sensitivity = \frac{TP}{TP + FN}\)
Sensitivity behaves the same way, with FN in place of FP. As TP increases toward infinity, sensitivity converges to 1; as FN increases toward infinity, sensitivity converges to 0.
\(\lim_{TP\to\infty} (Sensitivity) = 1 \qquad \lim_{FN\to\infty} (Sensitivity) = 0\)
From the hint:
If \(0 < \alpha < 1\) and \(0 < b < 1\), then \(\alpha b < \alpha\).
Let \(\alpha = Precision\)
Let \(b = Sensitivity\)
Precision and sensitivity both lie in (0, 1], so the conditions of the hint are satisfied. Rewriting the F1 formula in terms of \(\alpha\) and \(b\):
\(F_1 = \frac{2 \alpha b}{\alpha + b}\)
Applying the hint \(\alpha b < \alpha\) to the numerator:
\(F_1 = \frac{2 \alpha b}{\alpha + b} < \frac{2 \alpha}{\alpha + b}\)
The upper limits for \(\alpha\) and \(b\) are both 1, so:
\(\lim_{\alpha, b \to 1} \frac{2 \alpha}{\alpha + b} = \frac{2 \alpha}{\alpha + \alpha} = \frac{2 \alpha}{2 \alpha} = 1\)
The F1 score therefore never exceeds 1, reaching it only when precision and sensitivity are both exactly 1. Similarly, as \(\alpha\) and \(b\) approach 0, the numerator \(2 \alpha b\) vanishes faster than the denominator (with \(\alpha = b\), the expression simplifies to \(\alpha\)), so:
\(\lim_{\alpha, b \to 0} (F_1) = 0\)
The bounds of the F1 score are therefore \([0, 1]\).
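The algebra can also be spot-checked numerically; the sketch below evaluates \(F_1\) over a grid of precision and sensitivity values in (0, 1) and confirms the results stay strictly inside that interval:
# Evaluate F1 over a grid of (precision, sensitivity) pairs in (0, 1)
grid <- expand.grid(prec = seq(0.01, 0.99, by = 0.01),
                    sens = seq(0.01, 0.99, by = 0.01))
f1_vals <- with(grid, 2 * prec * sens / (prec + sens))
range(f1_vals)  # strictly between 0 and 1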
Part X:
library(ggplot2)

ROC <- function(dataframe){
  keys <- data.frame(subset(dataframe, select = c('class','scored.class','scored.probability')))
  # Sweep the classification threshold by sorting on predicted probability, highest first
  keys <- keys[order(keys$scored.probability, decreasing = TRUE), ]
  # Cumulative true/false positive rates as the threshold drops
  keys$y_true_pos_fraction <- cumsum(keys$class) / sum(keys$class)
  keys$x_false_pos_fraction <- cumsum(!keys$class) / sum(!keys$class)
  rownames(keys) <- seq(nrow(keys))
  # Step sizes between consecutive points (0 appended for the final row)
  keys$y_tp_abs <- c(diff(keys$y_true_pos_fraction), 0)
  keys$x_fp_abs <- c(diff(keys$x_false_pos_fraction), 0)
  # Trapezoidal rule: the rectangle under each step plus half the rise
  AUC <- sum(keys$y_true_pos_fraction * keys$x_fp_abs) +
    sum(keys$y_tp_abs * keys$x_fp_abs) / 2
  figure <- ggplot(keys, aes(x = x_false_pos_fraction, y = y_true_pos_fraction)) +
    geom_line() +
    ggtitle(paste0("ROC, Area Under the Curve: ", round(AUC, 2))) +
    xlab("False Positive Rate") +
    ylab("True Positive Rate") +
    theme(plot.title = element_text(hjust = 0.5))
  return(figure)
}
ROC(data)
(Figure: ROC curve produced by the custom function, with the trapezoidal AUC shown in the title)
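The trapezoidal AUC can be cross-checked with the rank-based (Mann-Whitney) formulation, sketched below; rank() assigns midranks, so tied probabilities are handled gracefully:
# AUC as a normalized Mann-Whitney U statistic
p  <- data$scored.probability
y  <- data$class
n1 <- sum(y == 1)  # positives
n0 <- sum(y == 0)  # negatives
(sum(rank(p)[y == 1]) - n1 * (n1 + 1) / 2) / (n1 * n0)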
Part XI:
# Assemble all six metrics into a one-row summary table
summary_table <- data.frame(
  Accuracy                  = round(accuracy(data), 3),
  Classification_Error_Rate = round(class_error_rate(data), 3),
  Precision                 = round(precision(data), 3),
  Sensitivity               = round(sensitivity(data), 3),
  Specificity               = round(specificity(data), 3),
  F1_Score                  = round(f1_score(data), 3)
)
summary_table
##   Accuracy Classification_Error_Rate Precision Sensitivity Specificity F1_Score
## 1    0.807                     0.193     0.844       0.474        0.96    0.607
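If the document is knit to HTML or PDF, the one-row table could also be rendered more cleanly with knitr::kable(summary_table).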
Part XII:
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked _by_ '.GlobalEnv':
##
## precision, sensitivity, specificity
## The following object is masked from 'package:purrr':
##
## lift
# Keep an untouched copy: the custom functions above expect numeric 0/1 columns
data_raw <- data
# caret::confusionMatrix() requires factors
data$scored.class <- as.factor(data$scored.class)
data$class <- as.factor(data$class)
confusionMatrix(data$scored.class, data$class, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 119 30
## 1 5 27
##
## Accuracy : 0.8066
## 95% CI : (0.7415, 0.8615)
## No Information Rate : 0.6851
## P-Value [Acc > NIR] : 0.0001712
##
## Kappa : 0.4916
##
## Mcnemar's Test P-Value : 4.976e-05
##
## Sensitivity : 0.4737
## Specificity : 0.9597
## Pos Pred Value : 0.8438
## Neg Pred Value : 0.7987
## Prevalence : 0.3149
## Detection Rate : 0.1492
## Detection Prevalence : 0.1768
## Balanced Accuracy : 0.7167
##
## 'Positive' Class : 1
##
car_sens <- caret::sensitivity(data$scored.class, data$class, positive = "1")
car_sens
## [1] 0.4736842
car_spec <- caret::specificity(data$scored.class, data$class, negative = "0")
car_spec
## [1] 0.9596774
The sensitivity matches the function I created, as does the specificity.
# Compare against the custom function on data_raw, whose columns are still numeric 0/1
(car_sens - sensitivity(data_raw)) / car_sens
## [1] 0
(car_spec - specificity(data_raw)) / car_spec
## [1] 0
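Alternatively, the custom metric functions could be made factor-proof by coercing the columns back to numeric before comparing; a sketch of that pattern (to_numeric01 and sens_robust are my own names, not part of the assignment):
# Coerce possibly-factor 0/1 columns back to numeric before counting
to_numeric01 <- function(x) as.numeric(as.character(x))
sens_robust <- function(dataframe) {
  actual    <- to_numeric01(dataframe$class)
  predicted <- to_numeric01(dataframe$scored.class)
  sum(actual == 1 & predicted == 1) / sum(actual == 1)
}
sens_robust(data)  # works even on the factor-converted copy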
Part XIII:
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
plot(roc(data$class, data$scored.probability), print.auc = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
(Figure: ROC curve from pROC, with the AUC printed on the plot)
That was the output using the pROC package. Below is the output using the function I created.
ROC(data_raw)
(Figure: ROC curve from the custom function; its title shows the trapezoidal AUC for comparison)
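For a purely numeric comparison (rather than eyeballing the two plots), pROC's auc() can be called with the response and predictor directly:
# Numeric AUC from pROC, for comparison with the custom function's title
pROC::auc(data_raw$class, data_raw$scored.probability)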