HHernandez_DATA621

library(dplyr)
library(ggplot2)

# 1. Load the dataset
# Read the scored classification output from the working directory and print
# a per-column summary as a quick sanity check.
classout <- read.csv(file = "classification-output-data.csv", header = TRUE)
summary(classout)

##     pregnant         glucose        diastolic        skinfold   
##  Min.   : 0.000   Min.   : 57.0   Min.   : 38.0   Min.   : 0.0  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 64.0   1st Qu.: 0.0  
##  Median : 3.000   Median :112.0   Median : 70.0   Median :22.0  
##  Mean   : 3.862   Mean   :118.3   Mean   : 71.7   Mean   :19.8  
##  3rd Qu.: 6.000   3rd Qu.:136.0   3rd Qu.: 78.0   3rd Qu.:32.0  
##  Max.   :15.000   Max.   :197.0   Max.   :104.0   Max.   :54.0  
##     insulin            bmi           pedigree           age       
##  Min.   :  0.00   Min.   :19.40   Min.   :0.0850   Min.   :21.00  
##  1st Qu.:  0.00   1st Qu.:26.30   1st Qu.:0.2570   1st Qu.:24.00  
##  Median :  0.00   Median :31.60   Median :0.3910   Median :30.00  
##  Mean   : 63.77   Mean   :31.58   Mean   :0.4496   Mean   :33.31  
##  3rd Qu.:105.00   3rd Qu.:36.00   3rd Qu.:0.5800   3rd Qu.:41.00  
##  Max.   :543.00   Max.   :50.00   Max.   :2.2880   Max.   :67.00  
##      class         scored.class    scored.probability
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.02323   
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.11702   
##  Median :0.0000   Median :0.0000   Median :0.23999   
##  Mean   :0.3149   Mean   :0.1768   Mean   :0.30373   
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:0.43093   
##  Max.   :1.0000   Max.   :1.0000   Max.   :0.94633

#2. Raw Confusion Matrix

## Rows represent Predicted Class and Columns represent Actual Class.
## Class 1 is the positive class, so both factors are ordered (1, 0): the
## first row is Predicted Positive (PP) and the first column is Actual
## Positive (AP). Labelling through factor() ties each label to its value
## instead of relying on table()'s default 0-then-1 ordering.

ct <- table(
  factor(classout$scored.class, levels = c(1, 0), labels = c("PP", "PN")),
  factor(classout$class, levels = c(1, 0), labels = c("AP", "AN"))
)

ct

##     
##       AP  AN
##   PP  27   5
##   PN  30 119

# Confusion Matrix function
#
# Tabulates predicted labels (scored.class) against actual labels (class) and
# returns the four cell counts as a one-row data frame with columns
# TP, TN, FP, FN. Class 1 is the positive class: TP counts rows where both
# labels are 1, TN rows where both are 0.
#
# a: data frame with 0/1 columns `class` and `scored.class`.
#
# NOTE(review): the echoed outputs in this file for confmx, precision,
# sensitivity, specificity and f1score were generated while 0 was labelled as
# the positive class; re-run the script to refresh them.
confmx <- function(a) {
  # Fixing the levels to (1, 0) binds each label to its value and keeps a
  # zero-count row/column when a class never occurs, so the lookups below
  # cannot fail on a degenerate 1 x 2 or 2 x 1 table.
  predicted <- factor(a$scored.class, levels = c(1, 0), labels = c("PP", "PN"))
  actual <- factor(a$class, levels = c(1, 0), labels = c("AP", "AN"))
  x <- table(predicted, actual)

  data.frame(
    TP = x["PP", "AP"],
    TN = x["PN", "AN"],
    FP = x["PP", "AN"],
    FN = x["PN", "AP"]
  )
}

confmx(classout)

##    TP TN FP FN
## 1 119 27 30  5

# 3. Accuracy function
# Share of observations classified correctly: (TP + TN) over all four
# confusion-matrix counts returned by confmx().
accuracy <- function(a) {
  counts <- confmx(a)
  correct <- counts$TP + counts$TN
  total <- counts$TP + counts$FP + counts$TN + counts$FN
  correct / total
}

accuracy(classout)

## [1] 0.8066298

# 4. Error Rate function
# Share of observations classified incorrectly: (FP + FN) over all four
# confusion-matrix counts returned by confmx().
errorrate <- function(a) {
  counts <- confmx(a)
  wrong <- counts$FP + counts$FN
  total <- counts$TP + counts$FP + counts$TN + counts$FN
  wrong / total
}

errorrate(classout)

## [1] 0.1933702

# Verifying Accuracy + ErrorRate = 1
accuracy(classout)+errorrate(classout)

## [1] 1

# 5. Precision function
# TP / (TP + FP), evaluated on the counts returned by confmx().
precision <- function(a) {
  with(confmx(a), TP / (TP + FP))
}

precision(classout)

## [1] 0.7986577

# 6. Sensitivity (Recall) function
# TP / (TP + FN), evaluated on the counts returned by confmx().
sensitivity <- function(a) {
  with(confmx(a), TP / (TP + FN))
}

sensitivity(classout)

## [1] 0.9596774

# 7. Specificity function
# TN / (TN + FP), evaluated on the counts returned by confmx().
specificity <- function(a) {
  with(confmx(a), TN / (TN + FP))
}

specificity(classout)

## [1] 0.4736842

# 8. F1 Score function
# Harmonic mean of precision(a) and sensitivity(a): 2 * P * R / (P + R).
f1score <- function(a) {
  p <- precision(a)
  r <- sensitivity(a)
  (2 * p * r) / (p + r)
}

f1score(classout)

## [1] 0.8717949

9. What are the bounds on the F1 score? Show that the F1 score will always be between 0 and 1

The F1 score is a classifier metric that combines precision and recall into a single mean that emphasizes the lower of the two values; it seeks a balance between precision and recall.

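The F1 score is the harmonic mean of precision (P) and recall (R), i.e. the reciprocal of the arithmetic mean of their reciprocals: F1 = 2PR / (P + R). Because of that, the result is not sensitive to extremely large values. On the other hand, extremely low values have a significant influence on the result.

Bounds: P and R are proportions, so 0 <= P <= 1 and 0 <= R <= 1. The numerator 2PR and the denominator P + R are both non-negative, so F1 >= 0 (the lower bound is reached when there are no true positives, where F1 is taken to be 0 by convention). Since PR <= P and PR <= R, adding the two inequalities gives 2PR <= P + R, and therefore F1 <= 1, with equality only when P = R = 1. Hence 0 <= F1 <= 1.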

#10. ROC function

# Builds the points of an ROC curve.
# The actual classes are ranked by descending scored.probability; the running
# share of positives gives TPR and the running share of negatives gives FPR.
# Returns a data frame with columns TPR, FPR and classes (the ranked labels).
roc_func <- function(a) {
  by_score <- order(a$scored.probability, decreasing = TRUE)
  ranked <- a$class[by_score]
  negatives <- !ranked
  data.frame(
    TPR = cumsum(ranked) / sum(ranked),
    FPR = cumsum(negatives) / sum(negatives),
    classes = ranked
  )
}

# Compute the ROC points for the data set and draw TPR against FPR
roc_df <- roc_func(classout)

ggplot(roc_df, aes(x = FPR, y = TPR)) +
  geom_line(color = "blue") +
  labs(y = "TPR", x = "FPR")

# AUC

# Area under the ROC curve.
# Ranks the actual classes by descending scored.probability, forms the running
# TPR/FPR, and sums TPR times each step in FPR plus half the product of the
# TPR and FPR steps.
roc_auc <- function(a) {
  ranked <- a$class[order(a$scored.probability, decreasing = TRUE)]
  tpr <- cumsum(ranked) / sum(ranked)
  fpr <- cumsum(!ranked) / sum(!ranked)
  # Step to the next point; the last point has no successor, hence the 0
  step_fpr <- c(diff(fpr), 0)
  step_tpr <- c(diff(tpr), 0)
  rect_part <- sum(tpr * step_fpr)
  tri_part <- sum(step_tpr * step_fpr) / 2
  rect_part + tri_part
}
  
roc_auc(classout)

## [1] 0.8503113

11. All the functions were already run

#12. Use caret package
#install.packages('caret', dependencies = TRUE)
library(caret)

# caret takes the first factor level ("0") as the positive class unless told
# otherwise. The positive outcome in this data set is class 1, so it is named
# explicitly; without it sensitivity and specificity are reported for class 0.
confusionMatrix(
  data = as.factor(classout$scored.class),
  reference = as.factor(classout$class),
  positive = "1"
)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 119  30
##          1   5  27
##                                           
##                Accuracy : 0.8066          
##                  95% CI : (0.7415, 0.8615)
##     No Information Rate : 0.6851          
##     P-Value [Acc > NIR] : 0.0001712       
##                                           
##                   Kappa : 0.4916          
##                                           
##  Mcnemar's Test P-Value : 4.976e-05       
##                                           
##             Sensitivity : 0.4737          
##             Specificity : 0.9597          
##          Pos Pred Value : 0.8438          
##          Neg Pred Value : 0.7987          
##              Prevalence : 0.3149          
##          Detection Rate : 0.1492          
##    Detection Prevalence : 0.1768          
##       Balanced Accuracy : 0.7167          
##                                           
##        'Positive' Class : 1               
##

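The confusion-matrix counts and the accuracy from the custom functions are the same as those from caret's built-in confusionMatrix(). Sensitivity and specificity are also the same, provided both use the same positive class: caret treats the first factor level (0) as positive unless positive = "1" is passed.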

#13. Use pROC package
#install.packages('pROC')
library(pROC)

# The ROC curve must be built from the continuous scored.probability. Passing
# the already-thresholded scored.class leaves a single cut-off, so the "curve"
# is one point and its AUC collapses to the balanced accuracy (0.7167).
# levels = c(0, 1) with direction = "<" states that controls (0) score lower
# than cases (1).
rocobj <- roc(
  response = classout$class,
  predictor = classout$scored.probability,
  levels = c(0, 1),
  direction = "<"
)
plot(rocobj, col = "blue")

auc(rocobj)

## Area under the curve: 0.8503

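The ROC curve and AUC from the custom functions and from pROC agree when pROC is given the continuous scored.probability; they differ only if pROC is fed the thresholded scored.class, which yields a single-point curve whose AUC equals the balanced accuracy (0.7167). The axes are equivalent: the custom plot uses TPR against FPR, while pROC plots Sensitivity (TPR) against Specificity (1 - FPR) on a reversed x-axis.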

HHernandez_DATA621 - HOMEWORK2

humbertohp

October 4, 2019

9. What are the bounds on the F1 score? Show that the F1 score will always be between 0 and 1

The F1 score is a classifier metric that combines precision and recall into a single mean that emphasizes the lower of the two values; it seeks a balance between precision and recall.

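The F1 score is the harmonic mean of precision (P) and recall (R), i.e. the reciprocal of the arithmetic mean of their reciprocals: F1 = 2PR / (P + R). Because of that, the result is not sensitive to extremely large values. On the other hand, extremely low values have a significant influence on the result.

Bounds: P and R are proportions, so 0 <= P <= 1 and 0 <= R <= 1. The numerator 2PR and the denominator P + R are both non-negative, so F1 >= 0 (the lower bound is reached when there are no true positives, where F1 is taken to be 0 by convention). Since PR <= P and PR <= R, adding the two inequalities gives 2PR <= P + R, and therefore F1 <= 1, with equality only when P = R = 1. Hence 0 <= F1 <= 1.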

11. All the functions were already run

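The confusion-matrix counts and the accuracy from the custom functions are the same as those from caret's built-in confusionMatrix(). Sensitivity and specificity are also the same, provided both use the same positive class: caret treats the first factor level (0) as positive unless positive = "1" is passed.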

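The ROC curve and AUC from the custom functions and from pROC agree when pROC is given the continuous scored.probability; they differ only if pROC is fed the thresholded scored.class, which yields a single-point curve whose AUC equals the balanced accuracy (0.7167). The axes are equivalent: the custom plot uses TPR against FPR, while pROC plots Sensitivity (TPR) against Specificity (1 - FPR) on a reversed x-axis.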