df <- read.csv("classification-output-data.csv", header = TRUE)
dfSubset <- df %>% dplyr::select(class, scored.class, scored.probability)
rawConfusionMatrix <- with(dfSubset, table(scored.class, class))
rawConfusionMatrix
##             class
## scored.class   0   1
##            0 119  30
##            1   5  27

The confusion matrix summarizes the classification output: the columns hold the actual classes (class) and the rows hold the predicted classes (scored.class). The model produces 30 false negatives compared with only 5 false positives.
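
For reference, the four cells can also be read straight off the table (a minimal sketch; the tp/tn/fp/fn names are only illustrative):

# rows are the predictions (scored.class), columns are the actuals (class)
tp <- rawConfusionMatrix["1", "1"]   # 27 true positives
tn <- rawConfusionMatrix["0", "0"]   # 119 true negatives
fp <- rawConfusionMatrix["1", "0"]   # 5 false positives
fn <- rawConfusionMatrix["0", "1"]   # 30 false negatives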

accuracy <- function(x, actual = "actual", predicted = "predicted"){
        
        # flag true positives and true negatives using the supplied column names
        x <- as.data.frame(x) %>%
                dplyr::select(dplyr::all_of(c(actual, predicted))) %>%
                dplyr::mutate(TP = dplyr::case_when(.data[[actual]] == 1 & .data[[predicted]] == 1 ~ 1, TRUE ~ 0),
                              TN = dplyr::case_when(.data[[actual]] == 0 & .data[[predicted]] == 0 ~ 1, TRUE ~ 0))
        
        # accuracy = (TP + TN) / total observations
        a <- sum(x$TP, x$TN) / nrow(x)
        return(a)
        
}

print(paste0("The accuracy is ", accuracy(df, actual = 'class', predicted = 'scored.class')))
## [1] "The accuracy is 0.806629834254144"
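
As a quick sanity check against the confusion matrix, $\text{Accuracy} = \frac{TP + TN}{n} = \frac{27 + 119}{181} \approx 0.8066$, matching the function output.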
classError <- function(x, actual = "actual", predicted = "predicted"){
        
        # flag false positives and false negatives using the supplied column names
        x <- as.data.frame(x) %>%
                dplyr::select(dplyr::all_of(c(actual, predicted))) %>%
                dplyr::mutate(FP = dplyr::case_when(.data[[actual]] == 0 & .data[[predicted]] == 1 ~ 1, TRUE ~ 0),
                              FN = dplyr::case_when(.data[[actual]] == 1 & .data[[predicted]] == 0 ~ 1, TRUE ~ 0))
        
        # classification error rate = (FP + FN) / total observations
        e <- sum(x$FP, x$FN) / nrow(x)
        return(e)
        
}

print(paste0("The classification error rate (which is equivalent to 1 minus the accuracy) is ", classError(df, actual = 'class', predicted = 'scored.class')))
## [1] "The classification error rate (which is equivalent to 1 minus the accuracy) is 0.193370165745856"

# verification (all.equal() avoids an exact floating-point comparison)
if(isTRUE(all.equal(accuracy(df, "class", "scored.class") + classError(df, "class", "scored.class"), 1))){
        print("Verification Success: accuracy plus classification error rate is equal to 1.")
}
## [1] "Verification Success: accuracy plus classification error rate is equal to 1."
precision <- function(x, actual = "actual", predicted = "predicted"){
        
        # flag true positives and false positives using the supplied column names
        x <- as.data.frame(x) %>%
                dplyr::select(dplyr::all_of(c(actual, predicted))) %>%
                dplyr::mutate(TP = dplyr::case_when(.data[[actual]] == 1 & .data[[predicted]] == 1 ~ 1, TRUE ~ 0),
                              FP = dplyr::case_when(.data[[actual]] == 0 & .data[[predicted]] == 1 ~ 1, TRUE ~ 0))
        
        # precision = TP / (TP + FP)
        p <- sum(x$TP) / sum(x$TP, x$FP)
        return(p)
        
}

print(paste0("The precision is ", precision(df, actual = 'class', predicted = 'scored.class')))
## [1] "The precision is 0.84375"
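
From the confusion matrix, $\text{Precision} = \frac{TP}{TP + FP} = \frac{27}{27 + 5} = 0.84375$, consistent with the output above.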
sensitivity <- function(x, actual = "actual", predicted = "predicted"){
        
        # flag true positives and false negatives using the supplied column names
        x <- as.data.frame(x) %>%
                dplyr::select(dplyr::all_of(c(actual, predicted))) %>%
                dplyr::mutate(TP = dplyr::case_when(.data[[actual]] == 1 & .data[[predicted]] == 1 ~ 1, TRUE ~ 0),
                              FN = dplyr::case_when(.data[[actual]] == 1 & .data[[predicted]] == 0 ~ 1, TRUE ~ 0))
        
        # sensitivity (recall) = TP / (TP + FN)
        s <- sum(x$TP) / sum(x$TP, x$FN)
        return(s)
        
}

print(paste0("The sensitivity aka recall is ", sensitivity(df, actual = 'class', predicted = 'scored.class')))
## [1] "The sensitivity aka recall is 0.473684210526316"
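
Checked against the counts, $\text{Sensitivity} = \frac{TP}{TP + FN} = \frac{27}{27 + 30} \approx 0.4737$.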
specificity <- function(x, actual = "actual", predicted = "predicted"){
        
        # flag true negatives and false positives using the supplied column names
        x <- as.data.frame(x) %>%
                dplyr::select(dplyr::all_of(c(actual, predicted))) %>%
                dplyr::mutate(TN = dplyr::case_when(.data[[actual]] == 0 & .data[[predicted]] == 0 ~ 1, TRUE ~ 0),
                              FP = dplyr::case_when(.data[[actual]] == 0 & .data[[predicted]] == 1 ~ 1, TRUE ~ 0))
        
        # specificity = TN / (TN + FP)
        sp <- sum(x$TN) / sum(x$TN, x$FP)
        return(sp)
        
}

print(paste0("The specificity is ", specificity(df, actual = 'class', predicted = 'scored.class')))
## [1] "The specificity is 0.959677419354839"
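
Again from the counts, $\text{Specificity} = \frac{TN}{TN + FP} = \frac{119}{119 + 5} \approx 0.9597$.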
f1Score <- function(x, actual = "actual", predicted = "predicted"){
        
        p = precision(x, actual, predicted)
        s = sensitivity(x, actual, predicted)
        
        f1 = (2 * p * s) / (p + s)
        return(f1)
        
}

print(paste0("The F1 score is ", f1Score(df, actual = 'class', predicted = 'scored.class')))
## [1] "The F1 score is 0.606741573033708"
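
Combining the two values above, $F_1 = \frac{2 \cdot 0.84375 \cdot 0.4737}{0.84375 + 0.4737} \approx 0.6067$, matching the function output.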
The F1 score always falls between 0 and 1. Writing it in terms of the confusion-matrix counts,

$$F_1 = \frac{2 \cdot Precision \cdot Sensitivity}{Precision + Sensitivity} = \frac{2TP}{2TP + FN + FP},$$

three cases cover the possible outcomes:

1) If we have classified $n$ true positives and no false negatives or false positives ($TP = n > 0$, $FN = FP = 0$), the formula becomes
$$F_1 = \frac{2TP}{2TP + 0 + 0} = \frac{2TP}{2TP} = 1.$$

2) If there are no true positives ($TP = 0$) but at least one false negative and/or false positive, the numerator vanishes while the denominator stays positive:
$$F_1 = \frac{2TP}{2TP + FN + FP} = \frac{0}{0 + FN + FP} = 0.$$

3) If $TP$, $FN$, and $FP$ are all non-zero, the denominator is strictly greater than the numerator, so
$$0 < \frac{2TP}{2TP + FN + FP} < 1.$$

Hence the F1 score is always a value between 0 and 1.
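
A quick numerical illustration of the three cases, using a hypothetical f1FromCounts() helper with made-up counts:

# hypothetical helper mirroring the count form of the F1 score
f1FromCounts <- function(TP, FN, FP) (2 * TP) / (2 * TP + FN + FP)

f1FromCounts(TP = 10, FN = 0, FP = 0)   # case 1: returns 1
f1FromCounts(TP = 0,  FN = 4, FP = 3)   # case 2: returns 0
f1FromCounts(TP = 10, FN = 4, FP = 3)   # case 3: strictly between 0 and 1 (20/27)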
ROC <- function(x, actual = "actual", prob = "prob"){
        
        # keep only the actual class and the predicted probability
        dat <- as.data.frame(x) %>%
                dplyr::select(dplyr::all_of(c(actual, prob)))
        
        # define thresholds from 0 to 1 in 0.01 increments
        t <- seq(0, 1, by = 0.01)
        
        # temp data frame: one 0/1 prediction column per threshold
        d <- data.frame(temp = 1:nrow(dat))
        fpr <- seq_along(t)
        tpr <- seq_along(t)
        
        for(i in t){
                prob.df <- as.numeric(dat[[2]] > i)
                d <- cbind(d, prob.df)
        }
        
        for(j in 1:length(t)){
                classes.type.df <- factor(dat[[1]], levels = c(0, 1))
                prediction_types.df <- factor(d[, j + 1], levels = c(0, 1))  # j + 1 skips the temp column
                ct <- table(classes.type.df, prediction_types.df)
                
                sen_a <- ct[2, 2] / (ct[2, 2] + ct[2, 1])   # sensitivity = TP / (TP + FN)
                spe_a <- ct[1, 1] / (ct[1, 1] + ct[1, 2])   # specificity = TN / (TN + FP)
                tpr[j] <- sen_a
                fpr[j] <- 1 - spe_a
        }
        
        roc.df <- data.frame(false_positive_rate = fpr, true_positive_rate = tpr)
        roc.visuals <- ggplot(roc.df, aes(x = false_positive_rate, y = true_positive_rate)) + 
                geom_step() + 
                geom_abline(slope = 1, intercept = 0, colour = "red", lty = 2)   
        
        # rank-based AUC: 1 minus the fraction of (positive, negative) pairs
        # in which the negative case is ranked above the positive one (ties ignored)
        myauc <- function(outcome, prob){
                n <- length(prob)
                positives_sum <- sum(outcome)
                z <- data.frame(out = outcome, prob = prob)
                z <- z[order(-z$prob), ] %>% dplyr::mutate(above = (1:n) - cumsum(out))
                return(1 - sum(z$above * z$out) / (positives_sum * (n - positives_sum)))
        }       
        
        auc_final <- myauc(dat[[1]], dat[[2]])
        
        results <- list("Area under curve" = auc_final, "Plot" = roc.visuals)
        
        results
}
## [1] "The accuracy is 0.806629834254144"
## [1] "The classification error rate is 0.193370165745856"
## [1] "The precision is 0.84375"
## [1] "The sensitivity is 0.473684210526316"
## [1] "The specificity is 0.959677419354839"
## [1] "The F1 score is 0.606741573033708"
## [1] "The AUC is 0.850311262026033"
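
For completeness, a minimal usage sketch of the ROC() function defined above (the rocOut object name is only illustrative):

rocOut <- ROC(df, actual = "class", prob = "scored.probability")
rocOut$`Area under curve`   # hand-rolled AUC estimate
rocOut$Plot                 # ggplot ROC curve

Next, the caret and pROC packages are used to verify these results.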

caret_matrix <- caret::confusionMatrix(rawConfusionMatrix[2:1, 2:1])

print(caret_matrix)
## Confusion Matrix and Statistics
## 
##             class
## scored.class   1   0
##            1  27   5
##            0  30 119
##                                           
##                Accuracy : 0.8066          
##                  95% CI : (0.7415, 0.8615)
##     No Information Rate : 0.6851          
##     P-Value [Acc > NIR] : 0.0001712       
##                                           
##                   Kappa : 0.4916          
##                                           
##  Mcnemar's Test P-Value : 4.976e-05       
##                                           
##             Sensitivity : 0.4737          
##             Specificity : 0.9597          
##          Pos Pred Value : 0.8438          
##          Neg Pred Value : 0.7987          
##              Prevalence : 0.3149          
##          Detection Rate : 0.1492          
##    Detection Prevalence : 0.1768          
##       Balanced Accuracy : 0.7167          
##                                           
##        'Positive' Class : 1               
## 

caret_matrix$table
##             class
## scored.class   1   0
##            1  27   5
##            0  30 119

Comparing these statistics with my function outputs above, the results are identical.
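
The same check can be made programmatically; a small sketch pulling the matching statistics out of the caret object (assuming the metric functions defined earlier are still in scope):

c(caret = unname(caret_matrix$overall["Accuracy"]),
  my_function = accuracy(df, actual = "class", predicted = "scored.class"))

c(caret = unname(caret_matrix$byClass["Sensitivity"]),
  my_function = sensitivity(df, actual = "class", predicted = "scored.class"))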

# AUC
print(paste0("The AUC is ", pROC::auc(pROC::roc(df$class, df$scored.probability))))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## [1] "The AUC is 0.850311262026033"

# plot
plot(pROC::roc(df$class, df$scored.probability), main = "pROC package : ROC Curve")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

The AUC result is also identical; however, ggplot seems to produce a better-looking visualization.
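
If a ggplot-based version of the pROC curve is preferred, the package also provides ggroc(); a sketch, assuming ggplot2 is already loaded (it is used by the ROC() function above):

pROC::ggroc(pROC::roc(df$class, df$scored.probability)) +
        ggtitle("pROC package : ROC Curve (ggplot2)")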