Question 1
data <- fread("https://raw.githubusercontent.com/baroncurtin2/data621/master/week2/classification-output-data.csv") %>%
as_tibble() %>% select(class:scored.probability)
Question 2
summary(data)
## class scored.class scored.probability
## Min. :0.0000 Min. :0.0000 Min. :0.02323
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.11702
## Median :0.0000 Median :0.0000 Median :0.23999
## Mean :0.3149 Mean :0.1768 Mean :0.30373
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.43093
## Max. :1.0000 Max. :1.0000 Max. :0.94633
with(data, table(class, scored.class))
## scored.class
## class 0 1
## 0 119 5
## 1 30 27
- The rows represent the actual classes
- The columns represent the predicted classes (a relabelled version of the table is shown below)
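As a quick check of that orientation, the same cross-tabulation can be rebuilt with named dimensions. This is a minimal sketch using base R's table(); the actual/predicted labels are added purely for illustration and are not part of the assignment output.
# relabel the confusion matrix dimensions for readability
with(data, table(actual = class, predicted = scored.class))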
# confusion matrix helper function
conf_mtx <- function(df) {
df %>% summarise(tp = sum(class == 1 & scored.class == 1), tn = sum(class ==
0 & scored.class == 0), fp = sum(class == 0 & scored.class == 1), fn = sum(class ==
1 & scored.class == 0))
}
Question 3
accuracy <- function(df) {
# mtx
x <- conf_mtx(df)
# return accuracy
return((x$tp + x$tn)/nrow(df))
}
# get accuracy
accuracy(data)
## [1] 0.8066298
Question 4
cer <- function(df) {
# mtx
x <- conf_mtx(df)
# return classification error rate
return((x$fp + x$fn)/nrow(df))
}
# get cer
cer(data)
## [1] 0.1933702
# sum to 1 check
accuracy(data) + cer(data) == 1
## [1] TRUE
Question 5
precision <- function(df) {
# mtx
x <- conf_mtx(df)
# return precision
return(x$tp/(x$tp + x$fp))
}
# get precision
precision(data)
## [1] 0.84375
Question 6
sensitivity <- function(df, opt = "class", threshold = 0.5) {
# df has 3 columns: 1st = actual class / 2nd = predicted class / 3rd = predicted
# probability
# true positive
tp <- if_else(opt == "class", sum(df[, 1] == 1 & df[, 2] == 1), sum(df[,
1] == 1 & df[, 3] > threshold))
# false negative
fn <- if_else(opt == "class", sum(df[, 1] == 1 & df[, 2] == 0), sum(df[,
1] == 1 & df[, 3] <= threshold))
# return sensitivity
return(tp/(tp + fn))
}
sensitivity(data)
## [1] 0.4736842
Question 7
specificity <- function(df, opt = "class", threshold = 0.5) {
# df has 3 columns: 1st = actual class / 2nd = predicted class / 3rd = predicted
# probability
# true negatives
tn <- if_else(opt == "class", sum(df[, 1] == 0 & df[, 2] == 0), sum(df[,
1] == 0 & df[, 3] <= threshold))
# false positives
fp <- if_else(opt == "class", sum(df[, 1] == 0 & df[, 2] == 1), sum(df[,
1] == 0 & df[, 3] > threshold))
# return specificity
return(tn/(tn + fp))
}
specificity(data)
## [1] 0.9596774
Question 8
f1score <- function(df) {
return((2 * precision(df) * sensitivity(df))/(precision(df) + sensitivity(df)))
}
f1score(data)
## [1] 0.6067416
Question 9
p <- runif(1000, min = 0, max = 1)
s <- runif(1000, min = 0, max = 1)
f <- (2 * p * s)/(p + s)
summary(f)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000246 0.2229556 0.4120884 0.4232514 0.6083694 0.9845176
data_frame(a = p, b = s, ab = p * s, proof = ab < a) %>% summary()
## a b ab proof
## Min. :0.001583 Min. :0.0000123 Min. :0.000003 Mode:logical
## 1st Qu.:0.268201 1st Qu.:0.2591696 1st Qu.:0.073824 TRUE:1000
## Median :0.528290 Median :0.5203246 Median :0.205934
## Mean :0.514900 Mean :0.5147638 Mean :0.262177
## 3rd Qu.:0.764076 3rd Qu.:0.7699889 3rd Qu.:0.394673
## Max. :0.997271 Max. :0.9987308 Max. :0.969319
- A simulation of 1,000 random precision/sensitivity pairs was created to show that the F1 score always falls between 0 and 1
- Because 0 < a < 1 and 0 < b < 1 imply a*b < a (and likewise a*b < b), the proof column is TRUE for all 1,000 rows; adding the two inequalities gives 2ab < a + b, so F1 = 2ab/(a + b) is strictly between 0 and 1 (a direct check appears below)
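The bound can also be confirmed directly on the simulated vectors. This is a minimal sketch that reuses the p, s, and f objects generated above; both expressions should evaluate to TRUE.
# every simulated F1 score lies strictly between 0 and 1
all(f > 0 & f < 1)
# algebraic reason: p*s < p and p*s < s whenever 0 < p, s < 1,
# so 2*p*s < p + s and therefore 2*p*s/(p + s) < 1
all(2 * p * s < p + s)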
Question 10
rocCurve <- function(df, int = 100) {
# df layout: 1st column = actual class / 3rd column = scored.probability
thresholds <- seq(0, 1, by = 1/int)
# generate sensitivity at each threshold
sens <- thresholds %>% map_dbl(function(x) sensitivity(df = df, opt = "prob",
threshold = x)) %>% sort()
# generate 1 - specificity at each threshold
spec <- thresholds %>% map_dbl(function(x) (1 - specificity(df = df, opt = "prob",
threshold = x))) %>% sort()
# generate plot
plotdf <- data_frame(x = spec, y = sens)
g <- ggplot(plotdf, aes(x = x, y = y)) + geom_line() + geom_abline(intercept = 0,
slope = 1) + labs(x = "1 - Specificity", y = "Sensitivity", title = "Custom ROC Curve")
# approximate area under the curve with the trapezoidal rule
auc <- sum(diff(spec) * rollmean(sens, 2))
# return list of items
return(list(plot = g, AUC = auc))
}
myCurve <- rocCurve(data) %>% print
## $plot
##
## $AUC
## [1] 0.8488964
Question 11
metric <- c("accuracy", "cer", "precision", "sensitivity", "specificity", "f1score") %>%
data_frame(metric = ., value = map_dbl(metric, .f = function(x) get(x)(df = data)))
Question 12
cMatrix <- confusionMatrix(as.factor(data$scored.class), as.factor(data$class),
positive = "1", mode = "everything") %>% print
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 119 30
## 1 5 27
##
## Accuracy : 0.8066
## 95% CI : (0.7415, 0.8615)
## No Information Rate : 0.6851
## P-Value [Acc > NIR] : 0.0001712
##
## Kappa : 0.4916
## Mcnemar's Test P-Value : 4.976e-05
##
## Sensitivity : 0.4737
## Specificity : 0.9597
## Pos Pred Value : 0.8438
## Neg Pred Value : 0.7987
## Precision : 0.8438
## Recall : 0.4737
## F1 : 0.6067
## Prevalence : 0.3149
## Detection Rate : 0.1492
## Detection Prevalence : 0.1768
## Balanced Accuracy : 0.7167
##
## 'Positive' Class : 1
##
# compare to homemade functions
metric %>% bind_cols(., caret = c(cMatrix$overall[[1]], 1 - cMatrix$overall[[1]],
cMatrix$byClass[c(5, 1, 2, 7)])) %>% mutate(match = round(value, 8) == round(caret,
8))
- All six custom metrics match the corresponding caret values to eight decimal places (a tolerance-based version of the check is sketched below)
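An equivalent comparison can be made without rounding by using dplyr's near(), which tests equality within a small numeric tolerance. This is a sketch that simply reuses the metric and cMatrix objects built above.
# tolerance-based comparison instead of rounding to 8 decimal places
metric %>% bind_cols(., caret = c(cMatrix$overall[[1]], 1 - cMatrix$overall[[1]],
cMatrix$byClass[c(5, 1, 2, 7)])) %>% mutate(match = near(value, caret))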
Question 13
curveROC <- roc(data$class, data$scored.probability)
plot(curveROC, legacy.axes = T, main = "pROC")
data_frame(custom = c(myCurve$AUC), ROC = as.numeric(curveROC$auc)) %>% mutate(match = ROC -
custom)
Code Appendix
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(tidy = TRUE)
knitr::opts_chunk$set(warning = FALSE)
libs <- c("knitr", "tidyverse", "magrittr", "data.table", "kableExtra", "caret",
"pROC", "zoo")
loadPkg <- function(x) {
if (!require(x, character.only = T))
install.packages(x, dependencies = T)
require(x, character.only = T)
}
lapply(libs, loadPkg)
data <- fread("https://raw.githubusercontent.com/baroncurtin2/data621/master/week2/classification-output-data.csv") %>%
as_tibble() %>% select(class:scored.probability)
summary(data)
with(data, table(class, scored.class))
# confusion matrix helper function
conf_mtx <- function(df) {
df %>% summarise(tp = sum(class == 1 & scored.class == 1), tn = sum(class ==
0 & scored.class == 0), fp = sum(class == 0 & scored.class == 1), fn = sum(class ==
1 & scored.class == 0))
}
accuracy <- function(df) {
# mtx
x <- conf_mtx(df)
# return accuracy
return((x$tp + x$tn)/nrow(df))
}
# get accuracy
accuracy(data)
cer <- function(df) {
# mtx
x <- conf_mtx(df)
# return classification error rate
return((x$fp + x$fn)/nrow(df))
}
# get cer
cer(data)
# sum to 1 check
accuracy(data) + cer(data) == 1
precision <- function(df) {
# mtx
x <- conf_mtx(df)
# return precision
return(x$tp/(x$tp + x$fp))
}
# get precision
precision(data)
sensitivity <- function(df, opt = "class", threshold = 0.5) {
# df has 3 columns: 1st = actual class / 2nd = predicted class / 3rd = predicted
# probability
# true positive
tp <- if_else(opt == "class", sum(df[, 1] == 1 & df[, 2] == 1), sum(df[,
1] == 1 & df[, 3] > threshold))
# false negative
fn <- if_else(opt == "class", sum(df[, 1] == 1 & df[, 2] == 0), sum(df[,
1] == 1 & df[, 3] <= threshold))
# return sensitivity
return(tp/(tp + fn))
}
sensitivity(data)
specificity <- function(df, opt = "class", threshold = 0.5) {
# df has 3 columns: 1st = actual class / 2nd = predicted class / 3rd = predicted
# probability
# true negatives
tn <- if_else(opt == "class", sum(df[, 1] == 0 & df[, 2] == 0), sum(df[,
1] == 0 & df[, 3] <= threshold))
# false positives
fp <- if_else(opt == "class", sum(df[, 1] == 0 & df[, 2] == 1), sum(df[,
1] == 0 & df[, 3] > threshold))
# return specificity
return(tn/(tn + fp))
}
specificity(data)
f1score <- function(df) {
return((2 * precision(df) * sensitivity(df))/(precision(df) + sensitivity(df)))
}
f1score(data)
p <- runif(1000, min = 0, max = 1)
s <- runif(1000, min = 0, max = 1)
f <- (2 * p * s)/(p + s)
summary(f)
data_frame(a = p, b = s, ab = p * s, proof = ab < a) %>% summary()
rocCurve <- function(df, int = 100) {
# df layout: 1st column = actual class / 3rd column = scored.probability
thresholds <- seq(0, 1, by = 1/int)
# generate sensitivity at each threshold
sens <- thresholds %>% map_dbl(function(x) sensitivity(df = df, opt = "prob",
threshold = x)) %>% sort()
# generate 1 - specificity at each threshold
spec <- thresholds %>% map_dbl(function(x) (1 - specificity(df = df, opt = "prob",
threshold = x))) %>% sort()
# generate plot
plotdf <- data_frame(x = spec, y = sens)
g <- ggplot(plotdf, aes(x = x, y = y)) + geom_line() + geom_abline(intercept = 0,
slope = 1) + labs(x = "1 - Specificity", y = "Sensitivity", title = "Custom ROC Curve")
# approximate area under the curve with the trapezoidal rule
auc <- sum(diff(spec) * rollmean(sens, 2))
# return list of items
return(list(plot = g, AUC = auc))
}
myCurve <- rocCurve(data) %>% print
metric <- c("accuracy", "cer", "precision", "sensitivity", "specificity", "f1score") %>%
data_frame(metric = ., value = map_dbl(metric, .f = function(x) get(x)(df = data)))
cMatrix <- confusionMatrix(as.factor(data$scored.class), as.factor(data$class),
positive = "1", mode = "everything") %>% print
# compare to homemade functions
metric %>% bind_cols(., caret = c(cMatrix$overall[[1]], 1 - cMatrix$overall[[1]],
cMatrix$byClass[c(5, 1, 2, 7)])) %>% mutate(match = round(value, 8) == round(caret,
8))
curveROC <- roc(data$class, data$scored.probability)
plot(curveROC, legacy.axes = T, main = "pROC")
data_frame(custom = c(myCurve$AUC), ROC = as.numeric(curveROC$auc)) %>% mutate(match = ROC -
custom)