# Overview

In this homework assignment, you will work through various classification metrics. You will be asked to create functions in R to carry out the calculations, investigate functions in existing packages that produce equivalent results, and finally create graphical output, such as an ROC curve, that can also be used to evaluate the output of classification models such as binary logistic regression.
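All of the metrics below are standard functions of the four confusion-matrix cells: true positives (TP), true negatives (TN), false positives (FP), and false negatives (FN). For reference, the definitions used throughout are:

$$\text{Accuracy} = \frac{TP + TN}{TP + FP + TN + FN}, \qquad \text{CER} = \frac{FP + FN}{TP + FP + TN + FN}$$

$$\text{Precision} = \frac{TP}{TP + FP}, \qquad \text{Sensitivity} = \frac{TP}{TP + FN}, \qquad \text{Specificity} = \frac{TN}{TN + FP}$$

$$F_1 = \frac{2 \cdot \text{Precision} \cdot \text{Sensitivity}}{\text{Precision} + \text{Sensitivity}}$$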
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(ggplot2)
data <- read.csv("https://raw.githubusercontent.com/Riteshlohiya/Data621-Assignment2/master/classification-output-data.csv", stringsAsFactors = FALSE, sep = ",", header = TRUE)
summary(data)
## pregnant glucose diastolic skinfold
## Min. : 0.000 Min. : 57.0 Min. : 38.0 Min. : 0.0
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 64.0 1st Qu.: 0.0
## Median : 3.000 Median :112.0 Median : 70.0 Median :22.0
## Mean : 3.862 Mean :118.3 Mean : 71.7 Mean :19.8
## 3rd Qu.: 6.000 3rd Qu.:136.0 3rd Qu.: 78.0 3rd Qu.:32.0
## Max. :15.000 Max. :197.0 Max. :104.0 Max. :54.0
## insulin bmi pedigree age
## Min. : 0.00 Min. :19.40 Min. :0.0850 Min. :21.00
## 1st Qu.: 0.00 1st Qu.:26.30 1st Qu.:0.2570 1st Qu.:24.00
## Median : 0.00 Median :31.60 Median :0.3910 Median :30.00
## Mean : 63.77 Mean :31.58 Mean :0.4496 Mean :33.31
## 3rd Qu.:105.00 3rd Qu.:36.00 3rd Qu.:0.5800 3rd Qu.:41.00
## Max. :543.00 Max. :50.00 Max. :2.2880 Max. :67.00
## class scored.class scored.probability
## Min. :0.0000 Min. :0.0000 Min. :0.02323
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.11702
## Median :0.0000 Median :0.0000 Median :0.23999
## Mean :0.3149 Mean :0.1768 Mean :0.30373
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.43093
## Max. :1.0000 Max. :1.0000 Max. :0.94633
The raw confusion matrix cross-tabulates the predicted class (scored.class, rows) against the actual class (class, columns):

r <- table(data$scored.class, data$class)
knitr::kable(r)

|   |   0 |  1 |
|:--|----:|---:|
|0  | 119 | 30 |
|1  |   5 | 27 |
Accuracy <- function(data) {
  # Confusion matrix: rows = actual class, columns = predicted class
  tb <- table(data$class, data$scored.class)
  TN <- tb[1, 1]
  TP <- tb[2, 2]
  FN <- tb[2, 1]
  FP <- tb[1, 2]
  # Proportion of all cases classified correctly
  return((TP + TN) / (TP + FP + TN + FN))
}
Accuracy(data)
## [1] 0.8066298
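As a hand check against the confusion table above (TP = 27, TN = 119, FP = 5, FN = 30):

$$\text{Accuracy} = \frac{27 + 119}{181} = \frac{146}{181} \approx 0.8066$$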
CER <- function(data) {
  tb <- table(data$class, data$scored.class)
  TN <- tb[1, 1]
  TP <- tb[2, 2]
  FN <- tb[2, 1]
  FP <- tb[1, 2]
  # Classification error rate: proportion misclassified (1 - accuracy)
  return((FP + FN) / (TP + FP + TN + FN))
}
CER(data)
## [1] 0.1933702
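Again by hand:

$$\text{CER} = \frac{5 + 30}{181} = \frac{35}{181} \approx 0.1934$$

Note that the accuracy and the classification error rate sum to one: 0.8066 + 0.1934 = 1.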
Precision <- function(data) {
  tb <- table(data$class, data$scored.class)
  TP <- tb[2, 2]
  FP <- tb[1, 2]
  # Share of predicted positives that are actually positive
  return(TP / (TP + FP))
}
Precision(data)
## [1] 0.84375
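By hand: $$\text{Precision} = \frac{27}{27 + 5} = \frac{27}{32} = 0.84375$$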
Sensitivity <- function(data) {
  tb <- table(data$class, data$scored.class)
  TP <- tb[2, 2]
  FN <- tb[2, 1]
  # Sensitivity (recall): share of actual positives correctly identified
  return(TP / (TP + FN))
}
Sensitivity(data)
## [1] 0.4736842
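By hand: $$\text{Sensitivity} = \frac{27}{27 + 30} = \frac{27}{57} \approx 0.4737$$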
Specificity <- function(data) {
  tb <- table(data$class, data$scored.class)
  TN <- tb[1, 1]
  FP <- tb[1, 2]
  # Share of actual negatives correctly identified
  return(TN / (TN + FP))
}
Specificity(data)
## [1] 0.9596774
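By hand: $$\text{Specificity} = \frac{119}{119 + 5} = \frac{119}{124} \approx 0.9597$$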
F1_score <- function(data) {
  tb <- table(data$class, data$scored.class)
  TP <- tb[2, 2]
  FN <- tb[2, 1]
  FP <- tb[1, 2]
  Precision <- TP / (TP + FP)
  Sensitivity <- TP / (TP + FN)
  # F1: harmonic mean of precision and sensitivity
  return((2 * Precision * Sensitivity) / (Precision + Sensitivity))
}
F1_score(data)
## [1] 0.6067416
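By hand, using the equivalent closed form $F_1 = \frac{2TP}{2TP + FP + FN}$:

$$F_1 = \frac{2 \cdot 27}{2 \cdot 27 + 5 + 30} = \frac{54}{89} \approx 0.6067$$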
Both precision and sensitivity are bounded between 0 and 1, and the F1 score is their harmonic mean, so the F1 score is also bounded between 0 and 1.
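Formally: with $0 < P \le 1$ and $0 < S \le 1$ (both must be positive for $F_1$ to be defined), the numerator $2PS$ is positive, and $2PS \le P + S$ because $P + S - 2PS = P(1 - S) + S(1 - P) \ge 0$. Hence $0 < F_1 = \frac{2PS}{P + S} \le 1$, with $F_1 = 1$ only when $P = S = 1$.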
ROC <- function(labels, scores) {
  # Sort the true labels by decreasing score, so the curve sweeps the
  # classification threshold from highest to lowest
  labels <- labels[order(scores, decreasing = TRUE)]
  result <- data.frame(TPR = cumsum(labels) / sum(labels),
                       FPR = cumsum(!labels) / sum(!labels),
                       labels)
  FPR_df <- c(diff(result$FPR), 0)
  TPR_df <- c(diff(result$TPR), 0)
  # AUC by the trapezoidal rule: rectangles plus triangular corrections
  AUC <- round(sum(result$TPR * FPR_df) + sum(TPR_df * FPR_df) / 2, 4)
  plot(result$FPR, result$TPR, type = "l", main = "ROC Curve",
       ylab = "Sensitivity", xlab = "1 - Specificity")
  abline(a = 0, b = 1)
  legend(0.6, 0.2, AUC, title = "AUC")
}
ROC(data$class,data$scored.probability)
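As a quick sanity check of the hand-written function, consider a hypothetical toy example (the data below are made up for illustration): a score vector that separates the classes perfectly should produce a curve that hugs the top-left corner with AUC = 1.

toy_labels <- c(0, 0, 1, 1)         # hypothetical labels
toy_scores <- c(0.1, 0.2, 0.8, 0.9) # scores that separate the classes perfectly
ROC(toy_labels, toy_scores)         # should report AUC = 1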
The hand-written functions together produce the full set of classification metrics:
Accuracy(data)
## [1] 0.8066298
CER(data)
## [1] 0.1933702
Precision(data)
## [1] 0.84375
Sensitivity(data)
## [1] 0.4736842
Specificity(data)
## [1] 0.9596774
F1_score(data)
## [1] 0.6067416
confusionMatrix(as.factor(data$scored.class), as.factor(data$class), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 119 30
## 1 5 27
##
## Accuracy : 0.8066
## 95% CI : (0.7415, 0.8615)
## No Information Rate : 0.6851
## P-Value [Acc > NIR] : 0.0001712
##
## Kappa : 0.4916
## Mcnemar's Test P-Value : 4.976e-05
##
## Sensitivity : 0.4737
## Specificity : 0.9597
## Pos Pred Value : 0.8438
## Neg Pred Value : 0.7987
## Prevalence : 0.3149
## Detection Rate : 0.1492
## Detection Prevalence : 0.1768
## Balanced Accuracy : 0.7167
##
## 'Positive' Class : 1
##
The caret results match the hand-written functions exactly (to rounding): accuracy 0.8066, sensitivity 0.4737, specificity 0.9597, and positive predictive value (precision) 0.8438.
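The agreement can also be checked programmatically. A minimal sketch (the object name cm is ours, not caret's):

cm <- confusionMatrix(as.factor(data$scored.class), as.factor(data$class),
                      positive = "1")
# Differences between caret's named statistics and the hand-written
# functions; all should be (numerically) zero
c(accuracy    = unname(cm$overall["Accuracy"])       - Accuracy(data),
  sensitivity = unname(cm$byClass["Sensitivity"])    - Sensitivity(data),
  specificity = unname(cm$byClass["Specificity"])    - Specificity(data),
  precision   = unname(cm$byClass["Pos Pred Value"]) - Precision(data))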
Finally, plot the pROC curve next to the hand-written ROC curve for visual comparison:
par(mfrow = c(1, 2))
plot(roc(data$class, data$scored.probability), print.auc = TRUE)
ROC(data$class, data$scored.probability)
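For a direct numeric comparison, pROC's auc() extracts the AUC that the trapezoidal calculation in ROC() approximates; the two values should agree to rounding.

auc(roc(data$class, data$scored.probability))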