Loading Data

library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
data<-read.csv("https://raw.githubusercontent.com/scottogden10/621_hw/master/classification-output-data.csv")
head(data)
##   pregnant glucose diastolic skinfold insulin  bmi pedigree age class
## 1        7     124        70       33     215 25.5    0.161  37     0
## 2        2     122        76       27     200 35.9    0.483  26     0
## 3        3     107        62       13      48 22.9    0.678  23     1
## 4        1      91        64       24       0 29.2    0.192  21     0
## 5        4      83        86       19       0 29.3    0.317  34     0
## 6        1     100        74       12      46 19.5    0.149  28     0
##   scored.class scored.probability
## 1            0         0.32845226
## 2            0         0.27319044
## 3            0         0.10966039
## 4            0         0.05599835
## 5            0         0.10049072
## 6            0         0.05515460
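
Before building any metrics, it is worth a quick check of the data's shape (a sketch, output omitted):

nrow(data)         # number of scored observations
table(data$class)  # balance of the actual class labels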

Table of Actual Class vs Scored Class

tab<-table(data$class,data$scored.class)
rownames(tab)<-c("Real Negative","Real Positive")
colnames(tab)<-c("Model Negative","Model Positive")
tab
##                
##                 Model Negative Model Positive
##   Real Negative            119              5
##   Real Positive             30             27

Confusion Matrix Function

Confus.Matrix <- function(data) {
  tab <- table(data$class,data$scored.class)
  rownames(tab) <- c("Real Negative","Real Positive")
  colnames(tab) <- c("Model Negative","Model Positive")
  return(tab)
}

Confus.Matrix(data=data)
##                
##                 Model Negative Model Positive
##   Real Negative            119              5
##   Real Positive             30             27
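
The four cells of this table map onto the usual confusion-matrix counts, and the metric functions below index them by position. As a small sketch (not part of the original output), the counts can be pulled out explicitly:

tab <- Confus.Matrix(data=data)
tn <- tab[1,1]  # real negative, model negative (true negatives)
fp <- tab[1,2]  # real negative, model positive (false positives)
fn <- tab[2,1]  # real positive, model negative (false negatives)
tp <- tab[2,2]  # real positive, model positive (true positives)
c(TN=tn, FP=fp, FN=fn, TP=tp)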

All Metrics Function (Problems 3-8)

Sens<-function(data) {
  tab <- table(data$class,data$scored.class)
  tn<-tab[1,1]
  tp<-tab[2,2]
  fn<-tab[2,1]
  fp<-tab[1,2]
  
  # Note: this is tn/(tn+fp) under the naming above; it matches caret's
  # reported Sensitivity in Problem 12, where class 0 is the positive class.
  sens<-tn/(tn+fp)

  return(sens)
  
}

Spec<-function(data) {
  tab <- table(data$class,data$scored.class)
  tn<-tab[1,1]
  tp<-tab[2,2]
  fn<-tab[2,1]
  fp<-tab[1,2]
  
  # Note: this is tp/(tp+fn) under the naming above; it matches caret's
  # reported Specificity in Problem 12 (class 0 as the positive class).
  spec<-tp/(tp+fn)

  return(spec)
  
}

CER<-function(data) {
  tab <- table(data$class,data$scored.class)
  tn<-tab[1,1]
  tp<-tab[2,2]
  fn<-tab[2,1]
  fp<-tab[1,2]
  
  cer<-(fp+fn)/(tp+tn+fn+fp)

  return(cer)
  
}

PREC<-function(data) {
  tab <- table(data$class,data$scored.class)
  tn<-tab[1,1]
  tp<-tab[2,2]
  fn<-tab[2,1]
  fp<-tab[1,2]
  
  # Precision of the model's "1" predictions; because caret treats class 0 as
  # the positive class (Problem 12), this value appears there as Neg Pred Value.
  prec<-tp/(tp+fp)

  return(prec)
  
}

ACC<-function(data) {
  tab <- table(data$class,data$scored.class)
  tn<-tab[1,1]
  tp<-tab[2,2]
  fn<-tab[2,1]
  fp<-tab[1,2]
  
  acc<-(tp+tn)/(tp+tn+fn+fp)

  return(acc)
  
}

F1<-function(data) {
  tab <- table(data$class,data$scored.class)
  tn<-tab[1,1]
  tp<-tab[2,2]
  fn<-tab[2,1]
  fp<-tab[1,2]
  
  # F1 is the harmonic mean of the precision and sensitivity computed above
  prec<-tp/(tp+fp)
  sens<-tn/(tn+fp)
  f1<-2*prec*sens/(prec+sens)

  return(f1)
  
}
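
Each of these helpers takes the same scored data frame, so they can be called individually as well as through the combined Metrics function below (calls shown as a sketch, output omitted):

Sens(data)
Spec(data)
CER(data)
PREC(data)
ACC(data)
F1(data)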

Metrics<-function(data) {
  tab <- table(data$class,data$scored.class)
  tn<-tab[1,1]
  tp<-tab[2,2]
  fn<-tab[2,1]
  fp<-tab[1,2]
  
  acc<-(tp+tn)/(tp+tn+fn+fp)
  cer<-(fp+fn)/(tp+tn+fn+fp)
  prec<-tp/(tp+fp)
  spec<-tp/(tp+fn)
  sens<-tn/(tn+fp)
  f1<-2*prec*sens/(prec+sens)
  
  results<-data.frame(list(acc,cer,prec,sens,spec,f1))
  colnames(results) <- c("Accuracy","Classification Error Rate","Precision","Sensitivity","Specificity","F1 Score")
  results<-data.frame(t(results))
  colnames(results)<-"Result"
  return(results)
  
}

Metrics(data)
##                              Result
## Accuracy                  0.8066298
## Classification Error Rate 0.1933702
## Precision                 0.8437500
## Sensitivity               0.9596774
## Specificity               0.4736842
## F1 Score                  0.8979877
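
As a quick cross-check (a sketch, not run in the original), accuracy can also be computed directly from the two class columns, and the classification error rate should be its complement:

acc_check <- mean(data$class == data$scored.class)  # share of rows where prediction matches actual
c(Accuracy = acc_check, Error = 1 - acc_check)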

Problem 9

The F1 score attains its minimum of zero when either precision or sensitivity is zero. Neither precision nor sensitivity can be negative, since tp, tn, fn, and fp are all non-negative counts, so F1 = 2ps/(p+s) can never be negative.

F1 is maximized at 1, which occurs when precision = sensitivity = 1. Note also that p <= 1 and s <= 1, since the numerator of each is part of its denominator.

Proof by contradiction. Suppose F1 could exceed 1:

2ps/(p+s) > 1

Re-arrange: 2ps > p + s

Divide both sides by sqrt(ps), and note that 2*sqrt(ps) <= 2 because p <= 1 and s <= 1:

2 >= 2*sqrt(ps) > (p + s)/sqrt(ps) = sqrt(s/p) + sqrt(p/s)

Let x = sqrt(s/p), which is positive, so

2 > x + 1/x

This is a contradiction, since f(x) = x + 1/x never attains a value lower than its global minimum of f(1) = 2 for x > 0. See:

f'(x) = 1 - 1/x^2 = 0

=> x = +/-1 (but x = sqrt(s/p) is always positive), and f(1) = 2.

Second derivative test:

f''(x) = 2/x^3, so f''(1) = 2 > 0, which implies a minimum.

So f(x) >= 2 for all positive x. This contradicts 2 > x + 1/x, so the assumption that F1 exceeds 1 is impossible, and 0 <= F1 <= 1. QED.
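
As a numeric sanity check of the bound (not a substitute for the proof), the F1 formula can be evaluated over a grid of precision and sensitivity values in (0, 1] to confirm it never leaves [0, 1]:

p <- seq(0.01, 1, by = 0.01)
grid <- expand.grid(prec = p, sens = p)
f1_vals <- with(grid, 2 * prec * sens / (prec + sens))
range(f1_vals)  # should stay within (0, 1]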

Problem 10

ROC_Scott<- function(data,t) {

  n <- round(1/t)     # number of threshold steps
  se <- numeric(n)
  sp <- numeric(n)
  for (i in 1:n)
  {
    # Classify at threshold t*i, then score with caret's sensitivity/specificity
    # (class 0 is treated as the positive level by default)
    pred <- as.factor(as.numeric(data$scored.probability > t*i))
    se[i] <- sensitivity(reference=as.factor(data$class), data=pred)
    sp[i] <- specificity(reference=as.factor(data$class), data=pred)
  }
  ## Area of rectangles under the curve
  b1 <- se[-1]
  x1 <- sp[-1]
  x2 <- sp[-n]

  auc <- sum(b1*(x2-x1))
  plot(y=se, x=1-sp, xlab="1-Specificity", ylab="Sensitivity", main="Scott ROC", type="l")
  return(paste0("AUC=", round(auc,3)))
}

#ROC_Scott(data=data,t=.01)
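
An alternative to summing rectangles is the trapezoidal rule applied to the (1-specificity, sensitivity) points. The helper below is only a sketch: the name trap_auc and its arguments are hypothetical, and it assumes se and sp vectors like the ones built inside ROC_Scott.

trap_auc <- function(se, sp) {
  x <- 1 - sp                    # false positive rate axis
  o <- order(x)                  # integrate along increasing x
  x <- x[o]
  y <- se[o]
  sum(diff(x) * (head(y, -1) + tail(y, -1)) / 2)  # sum of trapezoid areas
}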

Problem 11

Metrics(data)
##                              Result
## Accuracy                  0.8066298
## Classification Error Rate 0.1933702
## Precision                 0.8437500
## Sensitivity               0.9596774
## Specificity               0.4736842
## F1 Score                  0.8979877
ROC_Scott(data,t=0.01)

## [1] "AUC=0.854"

Problem 12

Note that the confusion matrix counts, accuracy, sensitivity, and specificity reported by caret below all match the values from the Metrics function above; caret treats class 0 as the positive class by default.

confusionMatrix(data=data$scored.class,reference = data$class)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 119  30
##          1   5  27
##                                           
##                Accuracy : 0.8066          
##                  95% CI : (0.7415, 0.8615)
##     No Information Rate : 0.6851          
##     P-Value [Acc > NIR] : 0.0001712       
##                                           
##                   Kappa : 0.4916          
##  Mcnemar's Test P-Value : 4.976e-05       
##                                           
##             Sensitivity : 0.9597          
##             Specificity : 0.4737          
##          Pos Pred Value : 0.7987          
##          Neg Pred Value : 0.8438          
##              Prevalence : 0.6851          
##          Detection Rate : 0.6575          
##    Detection Prevalence : 0.8232          
##       Balanced Accuracy : 0.7167          
##                                           
##        'Positive' Class : 0               
## 
Metrics(data)
##                              Result
## Accuracy                  0.8066298
## Classification Error Rate 0.1933702
## Precision                 0.8437500
## Sensitivity               0.9596774
## Specificity               0.4736842
## F1 Score                  0.8979877
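
Because caret's default positive class is 0, the Precision from Metrics (0.8438) shows up in caret's output as Neg Pred Value. If class 1 were wanted as the positive class instead, confusionMatrix accepts a positive argument (sketch only, output not shown):

confusionMatrix(data = as.factor(data$scored.class),
                reference = as.factor(data$class),
                positive = "1")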

Problem 13

roc(data$class, data$scored.probability,plot=TRUE)

## 
## Call:
## roc.default(response = data$class, predictor = data$scored.probability,     plot = TRUE)
## 
## Data: data$scored.probability in 124 controls (data$class 0) < 57 cases (data$class 1).
## Area under the curve: 0.8503

AUC=0.8503

This is very close to the AUC of 0.854 produced by the homemade ROC function in Problem 11.
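
For completeness, the pROC object can also be stored so its AUC is available as a number rather than read off the printout (a sketch, not part of the original run):

roc_obj <- roc(data$class, data$scored.probability)
auc(roc_obj)  # area under the curve, as reported above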