LOAD DATA INTO ENVIRONMENT

diabetes <- read.csv("diabetes.csv")
head(diabetes)
##   Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 1           6     148            72            35       0 33.6
## 2           1      85            66            29       0 26.6
## 3           8     183            64             0       0 23.3
## 4           1      89            66            23      94 28.1
## 5           0     137            40            35     168 43.1
## 6           5     116            74             0       0 25.6
##   DiabetesPedigreeFunction Age Outcome
## 1                    0.627  50       1
## 2                    0.351  31       0
## 3                    0.672  32       1
## 4                    0.167  21       0
## 5                    2.288  33       1
## 6                    0.201  30       0
str(diabetes)
## 'data.frame':    768 obs. of  9 variables:
##  $ Pregnancies             : int  6 1 8 1 0 5 3 10 2 8 ...
##  $ Glucose                 : int  148 85 183 89 137 116 78 115 197 125 ...
##  $ BloodPressure           : int  72 66 64 66 40 74 50 0 70 96 ...
##  $ SkinThickness           : int  35 29 0 23 35 0 32 0 45 0 ...
##  $ Insulin                 : int  0 0 0 94 168 0 88 0 543 0 ...
##  $ BMI                     : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ DiabetesPedigreeFunction: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : int  50 31 32 21 33 30 26 29 53 54 ...
##  $ Outcome                 : int  1 0 1 0 1 0 1 0 1 1 ...

CONVERT DEPENDENT VARIABLE FROM INTEGER TO FACTOR

diabetes$Outcome <- as.factor(diabetes$Outcome)
diabetes$Age <- cut(diabetes$Age, breaks = c(0, 30, 50, Inf), labels = c("c1", "c2","c3"))
str(diabetes)               
## 'data.frame':    768 obs. of  9 variables:
##  $ Pregnancies             : int  6 1 8 1 0 5 3 10 2 8 ...
##  $ Glucose                 : int  148 85 183 89 137 116 78 115 197 125 ...
##  $ BloodPressure           : int  72 66 64 66 40 74 50 0 70 96 ...
##  $ SkinThickness           : int  35 29 0 23 35 0 32 0 45 0 ...
##  $ Insulin                 : int  0 0 0 94 168 0 88 0 543 0 ...
##  $ BMI                     : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ DiabetesPedigreeFunction: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : Factor w/ 3 levels "c1","c2","c3": 2 2 2 1 2 1 1 1 3 3 ...
##  $ Outcome                 : Factor w/ 2 levels "0","1": 2 1 2 1 2 1 2 1 2 2 ...

GENERATE FULL MODEL

riskmodel <- glm(Outcome~., family = binomial, data = diabetes)
summary(riskmodel)
## 
## Call:
## glm(formula = Outcome ~ ., family = binomial, data = diabetes)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7161  -0.6989  -0.3996   0.7208   2.7977  
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -7.9945959  0.7052938 -11.335  < 2e-16 ***
## Pregnancies               0.0792825  0.0333120   2.380  0.01731 *  
## Glucose                   0.0352211  0.0037244   9.457  < 2e-16 ***
## BloodPressure            -0.0138835  0.0053167  -2.611  0.00902 ** 
## SkinThickness             0.0004863  0.0069611   0.070  0.94430    
## Insulin                  -0.0009968  0.0009092  -1.096  0.27291    
## BMI                       0.0868958  0.0153378   5.665 1.47e-08 ***
## DiabetesPedigreeFunction  0.8423325  0.3030783   2.779  0.00545 ** 
## Agec2                     0.9457453  0.2412908   3.920 8.87e-05 ***
## Agec3                     0.5628264  0.3387560   1.661  0.09662 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 993.48  on 767  degrees of freedom
## Residual deviance: 710.44  on 758  degrees of freedom
## AIC: 730.44
## 
## Number of Fisher Scoring iterations: 5

CHECK FOR VARIANCES - NULL VS FULL MODEL

null <- glm(Outcome~1, family = binomial, data = diabetes)
anova(null, riskmodel, test = "Chisq")
## Analysis of Deviance Table
## 
## Model 1: Outcome ~ 1
## Model 2: Outcome ~ Pregnancies + Glucose + BloodPressure + SkinThickness + 
##     Insulin + BMI + DiabetesPedigreeFunction + Age
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1       767     993.48                          
## 2       758     710.44  9   283.05 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

APPEND PREDICTION PROBABILITY VECTOR TO DATAFRAME

diabetes$predprob <- round(fitted(riskmodel), 2)
head(diabetes)
##   Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 1           6     148            72            35       0 33.6
## 2           1      85            66            29       0 26.6
## 3           8     183            64             0       0 23.3
## 4           1      89            66            23      94 28.1
## 5           0     137            40            35     168 43.1
## 6           5     116            74             0       0 25.6
##   DiabetesPedigreeFunction Age Outcome predprob
## 1                    0.627  c2       1     0.75
## 2                    0.351  c2       0     0.09
## 3                    0.672  c2       1     0.85
## 4                    0.167  c1       0     0.04
## 5                    2.288  c2       1     0.94
## 6                    0.201  c1       0     0.10

AUC ROC CURVE

library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(gplots)
ROCRPRED <- prediction(diabetes$predprob, diabetes$Outcome)
ROCEPERF <- performance(ROCRPRED, "tpr", "fpr"); ROCEPERF
## An object of class "performance"
## Slot "x.name":
## [1] "False positive rate"
## 
## Slot "y.name":
## [1] "True positive rate"
## 
## Slot "alpha.name":
## [1] "Cutoff"
## 
## Slot "x.values":
## [[1]]
##  [1] 0.000 0.000 0.004 0.004 0.004 0.004 0.006 0.006 0.010 0.010 0.010
## [12] 0.012 0.012 0.014 0.014 0.016 0.018 0.018 0.020 0.022 0.022 0.024
## [23] 0.028 0.030 0.034 0.034 0.036 0.036 0.040 0.044 0.050 0.054 0.056
## [34] 0.060 0.062 0.064 0.064 0.068 0.074 0.076 0.080 0.080 0.082 0.092
## [45] 0.096 0.100 0.110 0.118 0.124 0.126 0.126 0.128 0.134 0.146 0.154
## [56] 0.162 0.172 0.180 0.188 0.200 0.212 0.218 0.228 0.240 0.248 0.256
## [67] 0.270 0.276 0.280 0.288 0.294 0.310 0.314 0.328 0.350 0.370 0.392
## [78] 0.406 0.426 0.448 0.470 0.482 0.518 0.540 0.566 0.590 0.612 0.644
## [89] 0.698 0.740 0.770 0.814 0.844 0.872 0.924 0.956 0.980 0.994 1.000
## 
## 
## Slot "y.values":
## [[1]]
##  [1] 0.000000000 0.003731343 0.014925373 0.022388060 0.033582090
##  [6] 0.048507463 0.059701493 0.074626866 0.097014925 0.111940299
## [11] 0.123134328 0.126865672 0.134328358 0.164179104 0.175373134
## [16] 0.186567164 0.194029851 0.212686567 0.223880597 0.250000000
## [21] 0.264925373 0.279850746 0.294776119 0.324626866 0.343283582
## [26] 0.358208955 0.373134328 0.384328358 0.388059701 0.414179104
## [31] 0.425373134 0.432835821 0.440298507 0.444029851 0.455223881
## [36] 0.470149254 0.473880597 0.481343284 0.496268657 0.507462687
## [41] 0.514925373 0.522388060 0.526119403 0.537313433 0.544776119
## [46] 0.544776119 0.563432836 0.578358209 0.597014925 0.604477612
## [51] 0.615671642 0.649253731 0.660447761 0.675373134 0.682835821
## [56] 0.690298507 0.697761194 0.712686567 0.712686567 0.723880597
## [61] 0.735074627 0.738805970 0.746268657 0.746268657 0.761194030
## [66] 0.772388060 0.776119403 0.798507463 0.809701493 0.824626866
## [71] 0.828358209 0.847014925 0.854477612 0.865671642 0.876865672
## [76] 0.880597015 0.895522388 0.899253731 0.902985075 0.917910448
## [81] 0.921641791 0.921641791 0.929104478 0.940298507 0.947761194
## [86] 0.962686567 0.970149254 0.973880597 0.973880597 0.977611940
## [91] 0.981343284 0.981343284 0.981343284 0.988805970 0.988805970
## [96] 0.996268657 1.000000000 1.000000000 1.000000000
## 
## 
## Slot "alpha.values":
## [[1]]
##  [1]  Inf 0.99 0.97 0.96 0.95 0.94 0.93 0.92 0.91 0.90 0.89 0.88 0.87 0.86
## [15] 0.85 0.84 0.83 0.82 0.81 0.80 0.79 0.78 0.77 0.76 0.75 0.74 0.73 0.72
## [29] 0.71 0.70 0.69 0.68 0.67 0.66 0.65 0.64 0.63 0.62 0.61 0.59 0.58 0.57
## [43] 0.56 0.55 0.54 0.53 0.52 0.51 0.50 0.49 0.48 0.47 0.46 0.45 0.44 0.43
## [57] 0.42 0.41 0.40 0.39 0.38 0.37 0.36 0.35 0.34 0.33 0.32 0.31 0.30 0.29
## [71] 0.28 0.27 0.26 0.25 0.24 0.23 0.22 0.21 0.20 0.19 0.18 0.17 0.16 0.15
## [85] 0.14 0.13 0.12 0.11 0.10 0.09 0.08 0.07 0.06 0.05 0.04 0.03 0.02 0.01
## [99] 0.00

GENERATE AOC-ROC PLOT

plot(ROCEPERF, colorize = T, print.cutoffs.at = seq(0.1, by=0.1))

AUC

auc <- performance(ROCRPRED, "auc");
auc@y.values
## [[1]]
## [1] 0.8464328

CLASSIFICATION OF PREDICTION OUTCOME

diabetes$pred_Outcome <- ifelse(diabetes$predprob>0.4,1,0)
head(diabetes)
##   Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 1           6     148            72            35       0 33.6
## 2           1      85            66            29       0 26.6
## 3           8     183            64             0       0 23.3
## 4           1      89            66            23      94 28.1
## 5           0     137            40            35     168 43.1
## 6           5     116            74             0       0 25.6
##   DiabetesPedigreeFunction Age Outcome predprob pred_Outcome
## 1                    0.627  c2       1     0.75            1
## 2                    0.351  c2       0     0.09            0
## 3                    0.672  c2       1     0.85            1
## 4                    0.167  c1       0     0.04            0
## 5                    2.288  c2       1     0.94            1
## 6                    0.201  c1       0     0.10            0

BUILD CONFUSION MATRIX

cMatrix <- table(diabetes$pred_Outcome, diabetes$Outcome) ; cMatrix
##    
##       0   1
##   0 410  77
##   1  90 191

CALCULATE SENSITIVITY & SPECIFICITY

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
sensitivity(cMatrix)
## [1] 0.82
specificity(cMatrix)
## [1] 0.7126866

HOSMER AND LEMESHOW GOODNESS OF FIT (GOF) TEST

library(ResourceSelection)
## ResourceSelection 0.3-4   2019-01-08
hltest<- hoslem.test(diabetes$Outcome, fitted(riskmodel),g=10)
## Warning in Ops.factor(1, y): '-' not meaningful for factors
hltest
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  diabetes$Outcome, fitted(riskmodel)
## X-squared = 768, df = 8, p-value < 2.2e-16

EXTRACTING COEEFICIENTS OF MODEL

coef(riskmodel)
##              (Intercept)              Pregnancies                  Glucose 
##            -7.9945958769             0.0792825133             0.0352210556 
##            BloodPressure            SkinThickness                  Insulin 
##            -0.0138835342             0.0004863145            -0.0009968294 
##                      BMI DiabetesPedigreeFunction                    Agec2 
##             0.0868957531             0.8423325194             0.9457452877 
##                    Agec3 
##             0.5628263516
exp(coef(riskmodel))
##              (Intercept)              Pregnancies                  Glucose 
##             0.0003372804             1.0825101024             1.0358486636 
##            BloodPressure            SkinThickness                  Insulin 
##             0.9862123976             1.0004864328             0.9990036673 
##                      BMI DiabetesPedigreeFunction                    Agec2 
##             1.0907829630             2.3217762539             2.5747315793 
##                    Agec3 
##             1.7556275153