Question 1
data <- fread("https://raw.githubusercontent.com/baroncurtin2/data621/master/week2/classification-output-data.csv") %>%
as_tibble() %>% select(class:scored.probability)
Question 2
summary(data)
## class scored.class scored.probability
## Min. :0.0000 Min. :0.0000 Min. :0.02323
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.11702
## Median :0.0000 Median :0.0000 Median :0.23999
## Mean :0.3149 Mean :0.1768 Mean :0.30373
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.43093
## Max. :1.0000 Max. :1.0000 Max. :0.94633
with(data, table(class, scored.class))
## scored.class
## class 0 1
## 0 119 5
## 1 30 27
- The rows represent the actual classes
- The columns represent the predicted classes (a relabelled version of the table is shown below)
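As a quick check of that orientation, the same cross-tabulation can be rebuilt with named dimensions. This is a minimal sketch using base R's table(); the actual/predicted labels are added purely for illustration and are not part of the assignment output.
# relabel the confusion matrix dimensions for readability
with(data, table(actual = class, predicted = scored.class))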
# confusion matrix helper function
conf_mtx <- function(df) {
df %>% summarise(tp = sum(class == 1 & scored.class == 1), tn = sum(class ==
0 & scored.class == 0), fp = sum(class == 0 & scored.class == 1), fn = sum(class ==
1 & scored.class == 0))
}
Question 3
accuracy <- function(df) {
# mtx
x <- conf_mtx(df)
# return accuracy
return((x$tp + x$tn)/nrow(df))
}
# get accuracy
accuracy(data)
## [1] 0.8066298
Question 4
cer <- function(df) {
# mtx
x <- conf_mtx(df)
# return classification error rate
return((x$fp + x$fn)/nrow(df))
}
# get cer
cer(data)
## [1] 0.1933702
# sum to 1 check
accuracy(data) + cer(data) == 1
## [1] TRUE
Question 5
precision <- function(df) {
# mtx
x <- conf_mtx(df)
# return precision
return(x$tp/(x$tp + x$fp))
}
# get precision
precision(data)
## [1] 0.84375
Question 6
sensitivity <- function(df, opt = "class", threshold = 0.5) {
# df has 3 columns: 1st = actual class / 2nd = predicted class / 3rd = predicted
# probability
# true positive
tp <- if_else(opt == "class", sum(df[, 1] == 1 & df[, 2] == 1), sum(df[,
1] == 1 & df[, 3] > threshold))
# false negative
fn <- if_else(opt == "class", sum(df[, 1] == 1 & df[, 2] == 0), sum(df[,
1] == 1 & df[, 3] <= threshold))
# return sensitivity
return(tp/(tp + fn))
}
sensitivity(data)
## [1] 0.4736842
Question 7
specificity <- function(df, opt = "class", threshold = 0.5) {
# df has 3 columns: 1st = actual class / 2nd = predicted class / 3rd = predicted
# probability
# true negatives
tn <- if_else(opt == "class", sum(df[, 1] == 0 & df[, 2] == 0), sum(df[,
1] == 0 & df[, 3] <= threshold))
# false positives
fp <- if_else(opt == "class", sum(df[, 1] == 0 & df[, 2] == 1), sum(df[,
1] == 0 & df[, 3] > threshold))
# return specificity
return(tn/(tn + fp))
}
specificity(data)
## [1] 0.9596774
Question 8
f1score <- function(df) {
return((2 * precision(df) * sensitivity(df))/(precision(df) + sensitivity(df)))
}
f1score(data)
## [1] 0.6067416
Question 9
p <- runif(1000, min = 0, max = 1)
s <- runif(1000, min = 0, max = 1)
f <- (2 * p * s)/(p + s)
summary(f)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000246 0.2229556 0.4120884 0.4232514 0.6083694 0.9845176
data_frame(a = p, b = s, ab = p * s, proof = ab < a) %>% summary()
## a b ab proof
## Min. :0.001583 Min. :0.0000123 Min. :0.000003 Mode:logical
## 1st Qu.:0.268201 1st Qu.:0.2591696 1st Qu.:0.073824 TRUE:1000
## Median :0.528290 Median :0.5203246 Median :0.205934
## Mean :0.514900 Mean :0.5147638 Mean :0.262177
## 3rd Qu.:0.764076 3rd Qu.:0.7699889 3rd Qu.:0.394673
## Max. :0.997271 Max. :0.9987308 Max. :0.969319
- A simulation of 1,000 random precision/sensitivity pairs was created to show that the F1 score always falls between 0 and 1
- Because 0 < a < 1 and 0 < b < 1 imply a*b < a (and likewise a*b < b), the proof column is TRUE for all 1,000 rows; adding the two inequalities gives 2ab < a + b, so F1 = 2ab/(a + b) is strictly between 0 and 1 (a direct check appears below)
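The bound can also be confirmed directly on the simulated vectors. This is a minimal sketch that reuses the p, s, and f objects generated above; both expressions should evaluate to TRUE.
# every simulated F1 score lies strictly between 0 and 1
all(f > 0 & f < 1)
# algebraic reason: p*s < p and p*s < s whenever 0 < p, s < 1,
# so 2*p*s < p + s and therefore 2*p*s/(p + s) < 1
all(2 * p * s < p + s)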
Question 10
rocCurve <- function(df, int = 100) {
# df layout: 1st column = actual class / 3rd column = scored.probability
thresholds <- seq(0, 1, by = 1/int)
# generate sensitivity at each threshold
sens <- thresholds %>% map_dbl(function(x) sensitivity(df = df, opt = "prob",
threshold = x)) %>% sort()
# generate 1 - specificity at each threshold
spec <- thresholds %>% map_dbl(function(x) (1 - specificity(df = df, opt = "prob",
threshold = x))) %>% sort()
# generate plot
plotdf <- data_frame(x = spec, y = sens)
g <- ggplot(plotdf, aes(x = x, y = y)) + geom_line() + geom_abline(intercept = 0,
slope = 1) + labs(x = "1 - Specificity", y = "Sensitivity", title = "Custom ROC Curve")
# approximate area under the curve with the trapezoidal rule
auc <- sum(diff(spec) * rollmean(sens, 2))
# return list of items
return(list(plot = g, AUC = auc))
}
myCurve <- rocCurve(data) %>% print
## $plot
##
## $AUC
## [1] 0.8488964
Question 11
metric <- c("accuracy", "cer", "precision", "sensitivity", "specificity", "f1score") %>%
data_frame(metric = ., value = map_dbl(metric, .f = function(x) get(x)(df = data)))
Question 12
cMatrix <- confusionMatrix(as.factor(data$scored.class), as.factor(data$class),
positive = "1", mode = "everything") %>% print
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 119 30
## 1 5 27
##
## Accuracy : 0.8066
## 95% CI : (0.7415, 0.8615)
## No Information Rate : 0.6851
## P-Value [Acc > NIR] : 0.0001712
##
## Kappa : 0.4916
## Mcnemar's Test P-Value : 4.976e-05
##
## Sensitivity : 0.4737
## Specificity : 0.9597
## Pos Pred Value : 0.8438
## Neg Pred Value : 0.7987
## Precision : 0.8438
## Recall : 0.4737
## F1 : 0.6067
## Prevalence : 0.3149
## Detection Rate : 0.1492
## Detection Prevalence : 0.1768
## Balanced Accuracy : 0.7167
##
## 'Positive' Class : 1
##
# compare to homemade functions
metric %>% bind_cols(., caret = c(cMatrix$overall[[1]], 1 - cMatrix$overall[[1]],
cMatrix$byClass[c(5, 1, 2, 7)])) %>% mutate(match = round(value, 8) == round(caret,
8))
- All six custom metrics match the corresponding caret values to eight decimal places (a tolerance-based version of the check is sketched below)
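An equivalent comparison can be made without rounding by using dplyr's near(), which tests equality within a small numeric tolerance. This is a sketch that simply reuses the metric and cMatrix objects built above.
# tolerance-based comparison instead of rounding to 8 decimal places
metric %>% bind_cols(., caret = c(cMatrix$overall[[1]], 1 - cMatrix$overall[[1]],
cMatrix$byClass[c(5, 1, 2, 7)])) %>% mutate(match = near(value, caret))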
Question 13
curveROC <- roc(data$class, data$scored.probability)
plot(curveROC, legacy.axes = T, main = "pROC")
data_frame(custom = c(myCurve$AUC), ROC = as.numeric(curveROC$auc)) %>% mutate(match = ROC -
custom)
Code Appendix
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(tidy = TRUE)
knitr::opts_chunk$set(warning = FALSE)
libs <- c("knitr", "tidyverse", "magrittr", "data.table", "kableExtra", "caret",
"pROC", "zoo")
loadPkg <- function(x) {
if (!require(x, character.only = T))
install.packages(x, dependencies = T)
require(x, character.only = T)
}
lapply(libs, loadPkg)
data <- fread("https://raw.githubusercontent.com/baroncurtin2/data621/master/week2/classification-output-data.csv") %>%
as_tibble() %>% select(class:scored.probability)
summary(data)
with(data, table(class, scored.class))
# confusion matrix helper function
conf_mtx <- function(df) {
df %>% summarise(tp = sum(class == 1 & scored.class == 1), tn = sum(class ==
0 & scored.class == 0), fp = sum(class == 0 & scored.class == 1), fn = sum(class ==
1 & scored.class == 0))
}
accuracy <- function(df) {
# mtx
x <- conf_mtx(df)
# return accuracy
return((x$tp + x$tn)/nrow(df))
}
# get accuracy
accuracy(data)
cer <- function(df) {
# mtx
x <- conf_mtx(df)
# return classification error rate
return((x$fp + x$fn)/nrow(df))
}
# get cer
cer(data)
# sum to 1 check
accuracy(data) + cer(data) == 1
precision <- function(df) {
# mtx
x <- conf_mtx(df)
# return precision
return(x$tp/(x$tp + x$fp))
}
# get precision
precision(data)
sensitivity <- function(df, opt = "class", threshold = 0.5) {
# df has 3 columns: 1st = actual class / 2nd = predicted class / 3rd = predicted
# probability
# true positive
tp <- if_else(opt == "class", sum(df[, 1] == 1 & df[, 2] == 1), sum(df[,
1] == 1 & df[, 3] > threshold))
# false negative
fn <- if_else(opt == "class", sum(df[, 1] == 1 & df[, 2] == 0), sum(df[,
1] == 1 & df[, 3] <= threshold))
# return sensitivity
return(tp/(tp + fn))
}
sensitivity(data)
specificity <- function(df, opt = "class", threshold = 0.5) {
# df has 3 columns: 1st = actual class / 2nd = predicted class / 3rd = predicted
# probability
# true negatives
tn <- if_else(opt == "class", sum(df[, 1] == 0 & df[, 2] == 0), sum(df[,
1] == 0 & df[, 3] <= threshold))
# false positives
fp <- if_else(opt == "class", sum(df[, 1] == 0 & df[, 2] == 1), sum(df[,
1] == 0 & df[, 3] > threshold))
# return specificity
return(tn/(tn + fp))
}
specificity(data)
f1score <- function(df) {
return((2 * precision(df) * sensitivity(df))/(precision(df) + sensitivity(df)))
}
f1score(data)
p <- runif(1000, min = 0, max = 1)
s <- runif(1000, min = 0, max = 1)
f <- (2 * p * s)/(p + s)
summary(f)
data_frame(a = p, b = s, ab = p * s, proof = ab < a) %>% summary()
rocCurve <- function(df, int = 100) {
# df layout: 1st column = actual class / 3rd column = scored.probability
thresholds <- seq(0, 1, by = 1/int)
# generate sensitivity at each threshold
sens <- thresholds %>% map_dbl(function(x) sensitivity(df = df, opt = "prob",
threshold = x)) %>% sort()
# generate 1 - specificity at each threshold
spec <- thresholds %>% map_dbl(function(x) (1 - specificity(df = df, opt = "prob",
threshold = x))) %>% sort()
# generate plot
plotdf <- data_frame(x = spec, y = sens)
g <- ggplot(plotdf, aes(x = x, y = y)) + geom_line() + geom_abline(intercept = 0,
slope = 1) + labs(x = "1 - Specificity", y = "Sensitivity", title = "Custom ROC Curve")
# approximate area under the curve with the trapezoidal rule
auc <- sum(diff(spec) * rollmean(sens, 2))
# return list of items
return(list(plot = g, AUC = auc))
}
myCurve <- rocCurve(data) %>% print
metric <- c("accuracy", "cer", "precision", "sensitivity", "specificity", "f1score") %>%
data_frame(metric = ., value = map_dbl(metric, .f = function(x) get(x)(df = data)))
cMatrix <- confusionMatrix(as.factor(data$scored.class), as.factor(data$class),
positive = "1", mode = "everything") %>% print
# compare to homemade functions
metric %>% bind_cols(., caret = c(cMatrix$overall[[1]], 1 - cMatrix$overall[[1]],
cMatrix$byClass[c(5, 1, 2, 7)])) %>% mutate(match = round(value, 8) == round(caret,
8))
curveROC <- roc(data$class, data$scored.probability)
plot(curveROC, legacy.axes = T, main = "pROC")
data_frame(custom = c(myCurve$AUC), ROC = as.numeric(curveROC$auc)) %>% mutate(match = ROC -
custom)