diab = read.csv("/Users/nirajanbudhathoki/OneDrive - Central Michigan University/diabetes.csv")
head(diab)
##   Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 1           6     148            72            35       0 33.6
## 2           1      85            66            29       0 26.6
## 3           8     183            64             0       0 23.3
## 4           1      89            66            23      94 28.1
## 5           0     137            40            35     168 43.1
## 6           5     116            74             0       0 25.6
##   DiabetesPedigreeFunction Age Outcome
## 1                    0.627  50       1
## 2                    0.351  31       0
## 3                    0.672  32       1
## 4                    0.167  21       0
## 5                    2.288  33       1
## 6                    0.201  30       0

Value of 0 in the dataset refer missing values. Impute missing values with median for all predictors except “Pregnancies”.

myfun = function(x){
  ifelse(x == 0,median(x),x)
}

imp.median <- data.frame(
  sapply(diab[,-c(1,9)],myfun))
finaldata = cbind(diab[,c(9,1)],imp.median)
head(finaldata)
##   Outcome Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 1       1           6     148            72            35    30.5 33.6
## 2       0           1      85            66            29    30.5 26.6
## 3       1           8     183            64            23    30.5 23.3
## 4       0           1      89            66            23    94.0 28.1
## 5       1           0     137            40            35   168.0 43.1
## 6       0           5     116            74            23    30.5 25.6
##   DiabetesPedigreeFunction Age
## 1                    0.627  50
## 2                    0.351  31
## 3                    0.672  32
## 4                    0.167  21
## 5                    2.288  33
## 6                    0.201  30
set.seed(111)
train = sample(1:nrow(finaldata),nrow(finaldata)*0.8)  #80% data on training set
finaldata.train = finaldata[train,]
finaldata.test = finaldata[-train,]

X1 = scale(finaldata.train[,-1])*0.5+0 colMeans(X1) apply(X1,2,sd)

# logistic regression
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
finaldata.train$Outcome = as.factor(finaldata.train$Outcome)
finaldata.test$Outcome = as.factor(finaldata.test$Outcome)
fitControl <- trainControl(method = "cv", number = 5)
mod_logistic <- train(Outcome~., 
                      data = finaldata.train, 
                      method = "glm", 
                      family = "binomial", 
                      trControl = fitControl)
summary(mod_logistic)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5388  -0.7210  -0.4010   0.7269   2.1637  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -9.083586   0.903645 -10.052  < 2e-16 ***
## Pregnancies               0.123175   0.036194   3.403 0.000666 ***
## Glucose                   0.037800   0.004294   8.802  < 2e-16 ***
## BloodPressure            -0.010432   0.009596  -1.087 0.276989    
## SkinThickness             0.001827   0.013643   0.134 0.893446    
## Insulin                  -0.001505   0.001027  -1.466 0.142632    
## BMI                       0.098233   0.019188   5.120 3.06e-07 ***
## DiabetesPedigreeFunction  0.638807   0.326422   1.957 0.050347 .  
## Age                       0.014501   0.010559   1.373 0.169665    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 792.69  on 613  degrees of freedom
## Residual deviance: 572.61  on 605  degrees of freedom
## AIC: 590.61
## 
## Number of Fisher Scoring iterations: 5
logistic.pred <- predict(mod_logistic, newdata = finaldata.test,type ="raw")
confusionMatrix(data = logistic.pred, finaldata.test$Outcome, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 90 24
##          1  9 31
##                                           
##                Accuracy : 0.7857          
##                  95% CI : (0.7124, 0.8477)
##     No Information Rate : 0.6429          
##     P-Value [Acc > NIR] : 8.822e-05       
##                                           
##                   Kappa : 0.5032          
##                                           
##  Mcnemar's Test P-Value : 0.01481         
##                                           
##             Sensitivity : 0.5636          
##             Specificity : 0.9091          
##          Pos Pred Value : 0.7750          
##          Neg Pred Value : 0.7895          
##              Prevalence : 0.3571          
##          Detection Rate : 0.2013          
##    Detection Prevalence : 0.2597          
##       Balanced Accuracy : 0.7364          
##                                           
##        'Positive' Class : 1               
## 
roc(finaldata.test$Outcome, as.numeric(logistic.pred),plot=TRUE,legacy.axes =TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

## 
## Call:
## roc.default(response = finaldata.test$Outcome, predictor = as.numeric(logistic.pred),     plot = TRUE, legacy.axes = TRUE)
## 
## Data: as.numeric(logistic.pred) in 99 controls (finaldata.test$Outcome 0) < 55 cases (finaldata.test$Outcome 1).
## Area under the curve: 0.7364
roc(finaldata.test$Outcome, as.numeric(logistic.pred))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## 
## Call:
## roc.default(response = finaldata.test$Outcome, predictor = as.numeric(logistic.pred))
## 
## Data: as.numeric(logistic.pred) in 99 controls (finaldata.test$Outcome 0) < 55 cases (finaldata.test$Outcome 1).
## Area under the curve: 0.7364
par(pty = "s")
imp = varImp(mod_logistic)
plot(imp)

With only significant variables

mod_logistic <- train(Outcome~Pregnancies+Glucose+BMI+DiabetesPedigreeFunction, 
                      data = finaldata.train, 
                      method = "glm", 
                      family = "binomial", 
                      trControl = fitControl)
summary(mod_logistic)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6145  -0.7336  -0.4064   0.7329   2.1783  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -9.006203   0.775224 -11.618  < 2e-16 ***
## Pregnancies               0.145565   0.031145   4.674 2.96e-06 ***
## Glucose                   0.036161   0.003844   9.408  < 2e-16 ***
## BMI                       0.089010   0.015962   5.576 2.45e-08 ***
## DiabetesPedigreeFunction  0.608931   0.321176   1.896    0.058 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 792.69  on 613  degrees of freedom
## Residual deviance: 577.30  on 609  degrees of freedom
## AIC: 587.3
## 
## Number of Fisher Scoring iterations: 5
logistic.pred <- predict(mod_logistic, newdata = finaldata.test,type ="raw")
confusionMatrix(data = logistic.pred, finaldata.test$Outcome, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 88 26
##          1 11 29
##                                           
##                Accuracy : 0.7597          
##                  95% CI : (0.6844, 0.8248)
##     No Information Rate : 0.6429          
##     P-Value [Acc > NIR] : 0.00125         
##                                           
##                   Kappa : 0.443           
##                                           
##  Mcnemar's Test P-Value : 0.02136         
##                                           
##             Sensitivity : 0.5273          
##             Specificity : 0.8889          
##          Pos Pred Value : 0.7250          
##          Neg Pred Value : 0.7719          
##              Prevalence : 0.3571          
##          Detection Rate : 0.1883          
##    Detection Prevalence : 0.2597          
##       Balanced Accuracy : 0.7081          
##                                           
##        'Positive' Class : 1               
## 
roc(finaldata.test$Outcome, as.numeric(logistic.pred),plot=TRUE,legacy.axes =TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

## 
## Call:
## roc.default(response = finaldata.test$Outcome, predictor = as.numeric(logistic.pred),     plot = TRUE, legacy.axes = TRUE)
## 
## Data: as.numeric(logistic.pred) in 99 controls (finaldata.test$Outcome 0) < 55 cases (finaldata.test$Outcome 1).
## Area under the curve: 0.7081
roc(finaldata.test$Outcome, as.numeric(logistic.pred))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## 
## Call:
## roc.default(response = finaldata.test$Outcome, predictor = as.numeric(logistic.pred))
## 
## Data: as.numeric(logistic.pred) in 99 controls (finaldata.test$Outcome 0) < 55 cases (finaldata.test$Outcome 1).
## Area under the curve: 0.7081
par(pty = "s")
imp = varImp(mod_logistic)
plot(imp)

Without cross validation

mymod = glm(Outcome~.,data=finaldata.train,family = "binomial")
summary(mymod)
## 
## Call:
## glm(formula = Outcome ~ ., family = "binomial", data = finaldata.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5388  -0.7210  -0.4010   0.7269   2.1637  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -9.083586   0.903645 -10.052  < 2e-16 ***
## Pregnancies               0.123175   0.036194   3.403 0.000666 ***
## Glucose                   0.037800   0.004294   8.802  < 2e-16 ***
## BloodPressure            -0.010432   0.009596  -1.087 0.276989    
## SkinThickness             0.001827   0.013643   0.134 0.893446    
## Insulin                  -0.001505   0.001027  -1.466 0.142632    
## BMI                       0.098233   0.019188   5.120 3.06e-07 ***
## DiabetesPedigreeFunction  0.638807   0.326422   1.957 0.050347 .  
## Age                       0.014501   0.010559   1.373 0.169665    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 792.69  on 613  degrees of freedom
## Residual deviance: 572.61  on 605  degrees of freedom
## AIC: 590.61
## 
## Number of Fisher Scoring iterations: 5
pred = predict(mymod, newdata=finaldata.test,type = "response")

(tab0.5 = table(pred > 0.5, finaldata.test$Outcome))
##        
##          0  1
##   FALSE 90 24
##   TRUE   9 31
sum(diag(tab0.5)) / sum(tab0.5) 
## [1] 0.7857143
roc(finaldata.test$Outcome, as.numeric(pred),plot=TRUE,legacy.axes =TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

## 
## Call:
## roc.default(response = finaldata.test$Outcome, predictor = as.numeric(pred),     plot = TRUE, legacy.axes = TRUE)
## 
## Data: as.numeric(pred) in 99 controls (finaldata.test$Outcome 0) < 55 cases (finaldata.test$Outcome 1).
## Area under the curve: 0.8494

With only significant variables

mymod1 = glm(Outcome~Pregnancies+Glucose+BMI+DiabetesPedigreeFunction,data=finaldata.train,
             family = "binomial")

summary(mymod1)
## 
## Call:
## glm(formula = Outcome ~ Pregnancies + Glucose + BMI + DiabetesPedigreeFunction, 
##     family = "binomial", data = finaldata.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6145  -0.7336  -0.4064   0.7329   2.1783  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -9.006203   0.775224 -11.618  < 2e-16 ***
## Pregnancies               0.145565   0.031145   4.674 2.96e-06 ***
## Glucose                   0.036161   0.003844   9.408  < 2e-16 ***
## BMI                       0.089010   0.015962   5.576 2.45e-08 ***
## DiabetesPedigreeFunction  0.608931   0.321176   1.896    0.058 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 792.69  on 613  degrees of freedom
## Residual deviance: 577.30  on 609  degrees of freedom
## AIC: 587.3
## 
## Number of Fisher Scoring iterations: 5
pred1 = predict(mymod1, newdata=finaldata.test,type = "response")

(tab0.5 = table(pred1 > 0.5, finaldata.test$Outcome))
##        
##          0  1
##   FALSE 88 26
##   TRUE  11 29
sum(diag(tab0.5)) / sum(tab0.5)
## [1] 0.7597403
roc(finaldata.test$Outcome, as.numeric(pred1),plot=TRUE,legacy.axes =TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

## 
## Call:
## roc.default(response = finaldata.test$Outcome, predictor = as.numeric(pred1),     plot = TRUE, legacy.axes = TRUE)
## 
## Data: as.numeric(pred1) in 99 controls (finaldata.test$Outcome 0) < 55 cases (finaldata.test$Outcome 1).
## Area under the curve: 0.8494