Input Dataset

The data combines the original predictor variables with the actual label (class), the model's predicted label (scored.class), and the model's predicted probability of the positive class (scored.probability). read_csv() comes from readr, attached here via the tidyverse.

library(tidyverse)  # provides read_csv() and the %>% pipe used below
df <- read_csv("classification-output-data.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   pregnant = col_double(),
##   glucose = col_double(),
##   diastolic = col_double(),
##   skinfold = col_double(),
##   insulin = col_double(),
##   bmi = col_double(),
##   pedigree = col_double(),
##   age = col_double(),
##   class = col_double(),
##   scored.class = col_double(),
##   scored.probability = col_double()
## )
summary(df)
##     pregnant         glucose        diastolic        skinfold   
##  Min.   : 0.000   Min.   : 57.0   Min.   : 38.0   Min.   : 0.0  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 64.0   1st Qu.: 0.0  
##  Median : 3.000   Median :112.0   Median : 70.0   Median :22.0  
##  Mean   : 3.862   Mean   :118.3   Mean   : 71.7   Mean   :19.8  
##  3rd Qu.: 6.000   3rd Qu.:136.0   3rd Qu.: 78.0   3rd Qu.:32.0  
##  Max.   :15.000   Max.   :197.0   Max.   :104.0   Max.   :54.0  
##     insulin            bmi           pedigree           age       
##  Min.   :  0.00   Min.   :19.40   Min.   :0.0850   Min.   :21.00  
##  1st Qu.:  0.00   1st Qu.:26.30   1st Qu.:0.2570   1st Qu.:24.00  
##  Median :  0.00   Median :31.60   Median :0.3910   Median :30.00  
##  Mean   : 63.77   Mean   :31.58   Mean   :0.4496   Mean   :33.31  
##  3rd Qu.:105.00   3rd Qu.:36.00   3rd Qu.:0.5800   3rd Qu.:41.00  
##  Max.   :543.00   Max.   :50.00   Max.   :2.2880   Max.   :67.00  
##      class         scored.class    scored.probability
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.02323   
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.11702   
##  Median :0.0000   Median :0.0000   Median :0.23999   
##  Mean   :0.3149   Mean   :0.1768   Mean   :0.30373   
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:0.43093   
##  Max.   :1.0000   Max.   :1.0000   Max.   :0.94633

Confusion Matrix

In the table below the rows are the actual classes (class) and the columns are the predicted classes (scored.class).

table(df$class, df$scored.class)
##    
##       0   1
##   0 119   5
##   1  30  27
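To make the cell positions explicit (actuals on the rows, predictions on the columns), the four counts can be pulled out by index. This is a quick sanity check, assuming the row/column order shown above; every function that follows relies on this same mapping:

cm <- table(df$class, df$scored.class)
TN <- cm[1, 1]  # actual 0, predicted 0 -> 119
FP <- cm[1, 2]  # actual 0, predicted 1 -> 5
FN <- cm[2, 1]  # actual 1, predicted 0 -> 30
TP <- cm[2, 2]  # actual 1, predicted 1 -> 27
c(TN = TN, FP = FP, FN = FN, TP = TP)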

Accuracy

The formula for Accuracy is

\[Accuracy = \frac{TP+TN}{TP+FP+TN+FN}\]

pred_accuracy <- function(df){
  # Rebuild the confusion matrix: rows = actual, columns = predicted
  df_tbl <- table(df$class, df$scored.class)
  true_negatives  <- df_tbl[1, 1]
  false_positives <- df_tbl[1, 2]
  false_negatives <- df_tbl[2, 1]
  true_positives  <- df_tbl[2, 2]
  accuracy <- (true_positives + true_negatives) /
    (true_negatives + false_positives + false_negatives + true_positives)
  return(accuracy)
}
pred_accuracy(df)
## [1] 0.8066298
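Plugging the counts from the confusion matrix into the formula reproduces the value returned above:

\[Accuracy = \frac{27 + 119}{27 + 5 + 119 + 30} = \frac{146}{181} \approx 0.8066\]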

Classification Error Rate (CER)

The formula for the Classification Error Rate is

\[CER = \frac{FP+FN}{TP+FP+TN+FN}\]

pred_CER <- function(df){
  # Rebuild the confusion matrix: rows = actual, columns = predicted
  df_tbl <- table(df$class, df$scored.class)
  true_negatives  <- df_tbl[1, 1]
  false_positives <- df_tbl[1, 2]
  false_negatives <- df_tbl[2, 1]
  true_positives  <- df_tbl[2, 2]
  CER <- (false_positives + false_negatives) /
    (true_negatives + false_positives + false_negatives + true_positives)
  return(CER)
}
pred_CER(df)
## [1] 0.1933702
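The classification error rate is the complement of accuracy, as the counts confirm:

\[CER = \frac{5 + 30}{27 + 5 + 119 + 30} = \frac{35}{181} \approx 0.1934 = 1 - Accuracy\]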

Precision

The formula for Precision is

\[Precision = \frac{TP}{TP+FP}\]

pred_precision <- function(df){
  df_tbl <- table(df$class, df$scored.class)
  false_positives <- df_tbl[1, 2]
  true_positives  <- df_tbl[2, 2]
  precision <- true_positives / (false_positives + true_positives)
  return(precision)
}
pred_precision(df)
## [1] 0.84375
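With 27 true positives and 5 false positives:

\[Precision = \frac{27}{27 + 5} = \frac{27}{32} = 0.84375\]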

Sensitivity

The formula for Sensitivity (also known as recall, or the true positive rate) is

\[Sensitivity = \frac{TP}{TP+FN}\]

pred_sensitivity <- function(df){
  df_tbl <- table(df$class, df$scored.class)
  false_negatives <- df_tbl[2, 1]
  true_positives  <- df_tbl[2, 2]
  sensitivity <- true_positives / (false_negatives + true_positives)
  return(sensitivity)
}
pred_sensitivity(df)
## [1] 0.4736842
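With 27 true positives and 30 false negatives:

\[Sensitivity = \frac{27}{27 + 30} = \frac{27}{57} \approx 0.4737\]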

Specificity

The formula for Specificity (the true negative rate) is

\[Specificity = \frac{TN}{TN+FP}\]

pred_specificity <- function(df){
  df_tbl <- table(df$class, df$scored.class)
  true_negatives  <- df_tbl[1, 1]
  false_positives <- df_tbl[1, 2]
  specificity <- true_negatives / (true_negatives + false_positives)
  return(specificity)
}
pred_specificity(df)
## [1] 0.9596774
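With 119 true negatives and 5 false positives:

\[Specificity = \frac{119}{119 + 5} = \frac{119}{124} \approx 0.9597\]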

F1 Score

The formula for the F1 Score is

\[F_1 = \frac{2 \cdot Precision \cdot Sensitivity}{Precision + Sensitivity}\]

pred_f_score <- function(df){
  # Compute each component once, then take the harmonic mean
  precision   <- pred_precision(df)
  sensitivity <- pred_sensitivity(df)
  f_score <- (2 * precision * sensitivity) / (precision + sensitivity)
  return(f_score)
}
pred_f_score(df)
## [1] 0.6067416
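Substituting the precision (0.84375) and sensitivity (0.4736842) computed above:

\[F_1 = \frac{2 \cdot 0.84375 \cdot 0.4736842}{0.84375 + 0.4736842} \approx 0.6067\]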

F1 Score Interpretation

The F1 score is 0.607. Because the F1 score is the harmonic mean of precision and sensitivity, and both of those quantities lie in (0, 1], the F1 score itself must satisfy 0 < F1 ≤ 1; our value sits well inside that range.
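A quick numerical check of that bound (a sketch, not evaluated in this report): draw random precision and sensitivity pairs in (0, 1] and confirm the harmonic mean never leaves the interval.

set.seed(123)
p  <- runif(10000, min = .Machine$double.eps, max = 1)
s  <- runif(10000, min = .Machine$double.eps, max = 1)
f1 <- 2 * p * s / (p + s)
all(f1 > 0 & f1 <= 1)  # evaluates to TRUE: the harmonic mean stays in (0, 1]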

ROC Curve - Manual Calculation

The function below sweeps a classification threshold from 0 to 1 in steps of 0.01, computes the sensitivity and 1-specificity of the resulting predictions at each threshold, plots the curve, and approximates the area under it with a rectangle rule.

roc_function <- function(d){
  # Totals of actual positives and negatives
  allPos <- sum(d$class == 1, na.rm = TRUE)
  allNeg <- sum(d$class == 0, na.rm = TRUE)
  # Thresholds to sweep over
  threshold <- seq(0, 1, 0.01)
  # Sensitivity (y) and 1 - specificity (x) at each threshold
  x <- c()
  y <- c()
  for (i in seq_along(threshold)) {
    TP <- sum(d$scored.probability >= threshold[i] & d$class == 1, na.rm = TRUE)
    TN <- sum(d$scored.probability <  threshold[i] & d$class == 0, na.rm = TRUE)
    y[i] <- TP / allPos
    x[i] <- 1 - TN / allNeg
  }
  plot(x, y, type = "s", xlim = c(-0.5, 1.5),
       main = "ROC Curve from function",
       xlab = "1-Specificity",
       ylab = "Sensitivity")
  abline(0, 1)  # reference line for a no-skill classifier
  # Rectangle-rule approximation of the area under the curve
  xd <- c(0, abs(diff(x)))
  fAuc <- sum(xd * y)
  print(paste0("Area under the curve: ", fAuc))
}

roc_function(df)

## [1] "Area under the curve: 0.843803056027165"

Metrics Output

library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.0.4
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
result <- data.frame(Accuracy    = pred_accuracy(df),
                     CER         = pred_CER(df),
                     Precision   = pred_precision(df),
                     Sensitivity = pred_sensitivity(df),
                     Specificity = pred_specificity(df),
                     F_Score     = pred_f_score(df))

result %>%
  kbl() %>%
  kable_material_dark()

 Accuracy        CER  Precision  Sensitivity  Specificity    F_Score
0.8066298  0.1933702    0.84375    0.4736842    0.9596774  0.6067416
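An equivalent, more compact way to build the same one-row table (a sketch) is to map over a named list of the metric functions defined above:

metric_fns <- list(Accuracy = pred_accuracy, CER = pred_CER,
                   Precision = pred_precision, Sensitivity = pred_sensitivity,
                   Specificity = pred_specificity, F_Score = pred_f_score)
as.data.frame(lapply(metric_fns, function(f) f(df)))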

Caret Package

library(caret)
## Warning: package 'caret' was built under R version 4.0.4
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:AUC':
## 
##     sensitivity, specificity
confusionMatrix(data=as.factor(df$scored.class),reference=as.factor(df$class), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 119  30
##          1   5  27
##                                           
##                Accuracy : 0.8066          
##                  95% CI : (0.7415, 0.8615)
##     No Information Rate : 0.6851          
##     P-Value [Acc > NIR] : 0.0001712       
##                                           
##                   Kappa : 0.4916          
##                                           
##  Mcnemar's Test P-Value : 4.976e-05       
##                                           
##             Sensitivity : 0.4737          
##             Specificity : 0.9597          
##          Pos Pred Value : 0.8438          
##          Neg Pred Value : 0.7987          
##              Prevalence : 0.3149          
##          Detection Rate : 0.1492          
##    Detection Prevalence : 0.1768          
##       Balanced Accuracy : 0.7167          
##                                           
##        'Positive' Class : 1               
## 
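caret's Sensitivity (0.4737), Specificity (0.9597), and Pos Pred Value (0.8438) match the hand-rolled functions above. The per-class statistics live in the $byClass slot of the returned object, so the same comparison can be pulled out programmatically (a sketch):

cm_caret <- confusionMatrix(data = as.factor(df$scored.class),
                            reference = as.factor(df$class), positive = "1")
cm_caret$byClass[c("Sensitivity", "Specificity", "Precision", "F1")]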

ROC Curve - pROC Package

pred_roc_curve <- function(labels, predictions){
  library(pROC)
  pROC_obj <- roc(labels, predictions,
                  smoothed = TRUE,
                  # arguments for ci
                  ci = TRUE, ci.alpha = 0.9, stratified = FALSE,
                  # arguments for plot
                  plot = TRUE, auc.polygon = TRUE, max.auc.polygon = TRUE,
                  grid = TRUE, print.auc = TRUE, show.thres = TRUE)
  # Confidence interval of the sensitivity, plotted as a shape and as bars
  sens.ci <- ci.se(pROC_obj)
  plot(sens.ci, type = "shape", col = "lightblue")
  plot(sens.ci, type = "bars")
  # Return the roc object invisibly so the caller can inspect the AUC
  invisible(pROC_obj)
}
pred_roc_curve(df$class, df$scored.probability)
## Warning: package 'pROC' was built under R version 4.0.4
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:AUC':
## 
##     auc, roc
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Warning in plot.ci.se(sens.ci, type = "shape", col = "lightblue"): Low
## definition shape.
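Because pred_roc_curve() returns the pROC object invisibly, the numeric AUC can also be captured for later use (a sketch, not evaluated here):

roc_obj <- pred_roc_curve(df$class, df$scored.probability)
pROC::auc(roc_obj)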