LOAD DATA INTO ENVIRONMENT
diabetes <- read.csv("diabetes.csv")
head(diabetes)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 4 0.167 21 0
## 5 2.288 33 1
## 6 0.201 30 0
str(diabetes)
## 'data.frame': 768 obs. of 9 variables:
## $ Pregnancies : int 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : int 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure : int 72 66 64 66 40 74 50 0 70 96 ...
## $ SkinThickness : int 35 29 0 23 35 0 32 0 45 0 ...
## $ Insulin : int 0 0 0 94 168 0 88 0 543 0 ...
## $ BMI : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ DiabetesPedigreeFunction: num 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : int 50 31 32 21 33 30 26 29 53 54 ...
## $ Outcome : int 1 0 1 0 1 0 1 0 1 1 ...
CONVERT DEPENDENT VARIABLE FROM INTEGER TO FACTOR
diabetes$Outcome <- as.factor(diabetes$Outcome)
diabetes$Age <- cut(diabetes$Age, breaks = c(0, 30, 50, Inf), labels = c("c1", "c2","c3"))
str(diabetes)
## 'data.frame': 768 obs. of 9 variables:
## $ Pregnancies : int 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : int 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure : int 72 66 64 66 40 74 50 0 70 96 ...
## $ SkinThickness : int 35 29 0 23 35 0 32 0 45 0 ...
## $ Insulin : int 0 0 0 94 168 0 88 0 543 0 ...
## $ BMI : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ DiabetesPedigreeFunction: num 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : Factor w/ 3 levels "c1","c2","c3": 2 2 2 1 2 1 1 1 3 3 ...
## $ Outcome : Factor w/ 2 levels "0","1": 2 1 2 1 2 1 2 1 2 2 ...
GENERATE FULL MODEL
riskmodel <- glm(Outcome~., family = binomial, data = diabetes)
summary(riskmodel)
##
## Call:
## glm(formula = Outcome ~ ., family = binomial, data = diabetes)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7161 -0.6989 -0.3996 0.7208 2.7977
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.9945959 0.7052938 -11.335 < 2e-16 ***
## Pregnancies 0.0792825 0.0333120 2.380 0.01731 *
## Glucose 0.0352211 0.0037244 9.457 < 2e-16 ***
## BloodPressure -0.0138835 0.0053167 -2.611 0.00902 **
## SkinThickness 0.0004863 0.0069611 0.070 0.94430
## Insulin -0.0009968 0.0009092 -1.096 0.27291
## BMI 0.0868958 0.0153378 5.665 1.47e-08 ***
## DiabetesPedigreeFunction 0.8423325 0.3030783 2.779 0.00545 **
## Agec2 0.9457453 0.2412908 3.920 8.87e-05 ***
## Agec3 0.5628264 0.3387560 1.661 0.09662 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 993.48 on 767 degrees of freedom
## Residual deviance: 710.44 on 758 degrees of freedom
## AIC: 730.44
##
## Number of Fisher Scoring iterations: 5
CHECK FOR VARIANCES - NULL VS FULL MODEL
null <- glm(Outcome~1, family = binomial, data = diabetes)
anova(null, riskmodel, test = "Chisq")
## Analysis of Deviance Table
##
## Model 1: Outcome ~ 1
## Model 2: Outcome ~ Pregnancies + Glucose + BloodPressure + SkinThickness +
## Insulin + BMI + DiabetesPedigreeFunction + Age
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 767 993.48
## 2 758 710.44 9 283.05 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
APPEND PREDICTION PROBABILITY VECTOR TO DATAFRAME
diabetes$predprob <- round(fitted(riskmodel), 2)
head(diabetes)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## DiabetesPedigreeFunction Age Outcome predprob
## 1 0.627 c2 1 0.75
## 2 0.351 c2 0 0.09
## 3 0.672 c2 1 0.85
## 4 0.167 c1 0 0.04
## 5 2.288 c2 1 0.94
## 6 0.201 c1 0 0.10
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(gplots)
ROCRPRED <- prediction(diabetes$predprob, diabetes$Outcome)
ROCEPERF <- performance(ROCRPRED, "tpr", "fpr"); ROCEPERF
## An object of class "performance"
## Slot "x.name":
## [1] "False positive rate"
##
## Slot "y.name":
## [1] "True positive rate"
##
## Slot "alpha.name":
## [1] "Cutoff"
##
## Slot "x.values":
## [[1]]
## [1] 0.000 0.000 0.004 0.004 0.004 0.004 0.006 0.006 0.010 0.010 0.010
## [12] 0.012 0.012 0.014 0.014 0.016 0.018 0.018 0.020 0.022 0.022 0.024
## [23] 0.028 0.030 0.034 0.034 0.036 0.036 0.040 0.044 0.050 0.054 0.056
## [34] 0.060 0.062 0.064 0.064 0.068 0.074 0.076 0.080 0.080 0.082 0.092
## [45] 0.096 0.100 0.110 0.118 0.124 0.126 0.126 0.128 0.134 0.146 0.154
## [56] 0.162 0.172 0.180 0.188 0.200 0.212 0.218 0.228 0.240 0.248 0.256
## [67] 0.270 0.276 0.280 0.288 0.294 0.310 0.314 0.328 0.350 0.370 0.392
## [78] 0.406 0.426 0.448 0.470 0.482 0.518 0.540 0.566 0.590 0.612 0.644
## [89] 0.698 0.740 0.770 0.814 0.844 0.872 0.924 0.956 0.980 0.994 1.000
##
##
## Slot "y.values":
## [[1]]
## [1] 0.000000000 0.003731343 0.014925373 0.022388060 0.033582090
## [6] 0.048507463 0.059701493 0.074626866 0.097014925 0.111940299
## [11] 0.123134328 0.126865672 0.134328358 0.164179104 0.175373134
## [16] 0.186567164 0.194029851 0.212686567 0.223880597 0.250000000
## [21] 0.264925373 0.279850746 0.294776119 0.324626866 0.343283582
## [26] 0.358208955 0.373134328 0.384328358 0.388059701 0.414179104
## [31] 0.425373134 0.432835821 0.440298507 0.444029851 0.455223881
## [36] 0.470149254 0.473880597 0.481343284 0.496268657 0.507462687
## [41] 0.514925373 0.522388060 0.526119403 0.537313433 0.544776119
## [46] 0.544776119 0.563432836 0.578358209 0.597014925 0.604477612
## [51] 0.615671642 0.649253731 0.660447761 0.675373134 0.682835821
## [56] 0.690298507 0.697761194 0.712686567 0.712686567 0.723880597
## [61] 0.735074627 0.738805970 0.746268657 0.746268657 0.761194030
## [66] 0.772388060 0.776119403 0.798507463 0.809701493 0.824626866
## [71] 0.828358209 0.847014925 0.854477612 0.865671642 0.876865672
## [76] 0.880597015 0.895522388 0.899253731 0.902985075 0.917910448
## [81] 0.921641791 0.921641791 0.929104478 0.940298507 0.947761194
## [86] 0.962686567 0.970149254 0.973880597 0.973880597 0.977611940
## [91] 0.981343284 0.981343284 0.981343284 0.988805970 0.988805970
## [96] 0.996268657 1.000000000 1.000000000 1.000000000
##
##
## Slot "alpha.values":
## [[1]]
## [1] Inf 0.99 0.97 0.96 0.95 0.94 0.93 0.92 0.91 0.90 0.89 0.88 0.87 0.86
## [15] 0.85 0.84 0.83 0.82 0.81 0.80 0.79 0.78 0.77 0.76 0.75 0.74 0.73 0.72
## [29] 0.71 0.70 0.69 0.68 0.67 0.66 0.65 0.64 0.63 0.62 0.61 0.59 0.58 0.57
## [43] 0.56 0.55 0.54 0.53 0.52 0.51 0.50 0.49 0.48 0.47 0.46 0.45 0.44 0.43
## [57] 0.42 0.41 0.40 0.39 0.38 0.37 0.36 0.35 0.34 0.33 0.32 0.31 0.30 0.29
## [71] 0.28 0.27 0.26 0.25 0.24 0.23 0.22 0.21 0.20 0.19 0.18 0.17 0.16 0.15
## [85] 0.14 0.13 0.12 0.11 0.10 0.09 0.08 0.07 0.06 0.05 0.04 0.03 0.02 0.01
## [99] 0.00
GENERATE AOC-ROC PLOT
plot(ROCEPERF, colorize = T, print.cutoffs.at = seq(0.1, by=0.1))
AUC
auc <- performance(ROCRPRED, "auc");
auc@y.values
## [[1]]
## [1] 0.8464328
CLASSIFICATION OF PREDICTION OUTCOME
diabetes$pred_Outcome <- ifelse(diabetes$predprob>0.4,1,0)
head(diabetes)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## DiabetesPedigreeFunction Age Outcome predprob pred_Outcome
## 1 0.627 c2 1 0.75 1
## 2 0.351 c2 0 0.09 0
## 3 0.672 c2 1 0.85 1
## 4 0.167 c1 0 0.04 0
## 5 2.288 c2 1 0.94 1
## 6 0.201 c1 0 0.10 0
BUILD CONFUSION MATRIX
cMatrix <- table(diabetes$pred_Outcome, diabetes$Outcome) ; cMatrix
##
## 0 1
## 0 410 77
## 1 90 191
CALCULATE SENSITIVITY & SPECIFICITY
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
sensitivity(cMatrix)
## [1] 0.82
specificity(cMatrix)
## [1] 0.7126866
HOSMER AND LEMESHOW GOODNESS OF FIT (GOF) TEST
library(ResourceSelection)
## ResourceSelection 0.3-4 2019-01-08
hltest<- hoslem.test(diabetes$Outcome, fitted(riskmodel),g=10)
## Warning in Ops.factor(1, y): '-' not meaningful for factors
hltest
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: diabetes$Outcome, fitted(riskmodel)
## X-squared = 768, df = 8, p-value < 2.2e-16
EXTRACTING COEEFICIENTS OF MODEL
coef(riskmodel)
## (Intercept) Pregnancies Glucose
## -7.9945958769 0.0792825133 0.0352210556
## BloodPressure SkinThickness Insulin
## -0.0138835342 0.0004863145 -0.0009968294
## BMI DiabetesPedigreeFunction Agec2
## 0.0868957531 0.8423325194 0.9457452877
## Agec3
## 0.5628263516
exp(coef(riskmodel))
## (Intercept) Pregnancies Glucose
## 0.0003372804 1.0825101024 1.0358486636
## BloodPressure SkinThickness Insulin
## 0.9862123976 1.0004864328 0.9990036673
## BMI DiabetesPedigreeFunction Agec2
## 1.0907829630 2.3217762539 2.5747315793
## Agec3
## 1.7556275153