diab = read.csv("/Users/nirajanbudhathoki/OneDrive - Central Michigan University/diabetes.csv")
head(diab)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 4 0.167 21 0
## 5 2.288 33 1
## 6 0.201 30 0
Value of 0 in the dataset refer missing values. Impute missing values with median for all predictors except “Pregnancies”.
myfun = function(x){
ifelse(x == 0,median(x),x)
}
imp.median <- data.frame(
sapply(diab[,-c(1,9)],myfun))
finaldata = cbind(diab[,c(9,1)],imp.median)
head(finaldata)
## Outcome Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 1 6 148 72 35 30.5 33.6
## 2 0 1 85 66 29 30.5 26.6
## 3 1 8 183 64 23 30.5 23.3
## 4 0 1 89 66 23 94.0 28.1
## 5 1 0 137 40 35 168.0 43.1
## 6 0 5 116 74 23 30.5 25.6
## DiabetesPedigreeFunction Age
## 1 0.627 50
## 2 0.351 31
## 3 0.672 32
## 4 0.167 21
## 5 2.288 33
## 6 0.201 30
set.seed(111)
train = sample(1:nrow(finaldata),nrow(finaldata)*0.8) #80% data on training set
finaldata.train = finaldata[train,]
finaldata.test = finaldata[-train,]
X1 = scale(finaldata.train[,-1])*0.5+0 colMeans(X1) apply(X1,2,sd)
# logistic regression
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
finaldata.train$Outcome = as.factor(finaldata.train$Outcome)
finaldata.test$Outcome = as.factor(finaldata.test$Outcome)
fitControl <- trainControl(method = "cv", number = 5)
mod_logistic <- train(Outcome~.,
data = finaldata.train,
method = "glm",
family = "binomial",
trControl = fitControl)
summary(mod_logistic)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5388 -0.7210 -0.4010 0.7269 2.1637
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.083586 0.903645 -10.052 < 2e-16 ***
## Pregnancies 0.123175 0.036194 3.403 0.000666 ***
## Glucose 0.037800 0.004294 8.802 < 2e-16 ***
## BloodPressure -0.010432 0.009596 -1.087 0.276989
## SkinThickness 0.001827 0.013643 0.134 0.893446
## Insulin -0.001505 0.001027 -1.466 0.142632
## BMI 0.098233 0.019188 5.120 3.06e-07 ***
## DiabetesPedigreeFunction 0.638807 0.326422 1.957 0.050347 .
## Age 0.014501 0.010559 1.373 0.169665
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 792.69 on 613 degrees of freedom
## Residual deviance: 572.61 on 605 degrees of freedom
## AIC: 590.61
##
## Number of Fisher Scoring iterations: 5
logistic.pred <- predict(mod_logistic, newdata = finaldata.test,type ="raw")
confusionMatrix(data = logistic.pred, finaldata.test$Outcome, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 90 24
## 1 9 31
##
## Accuracy : 0.7857
## 95% CI : (0.7124, 0.8477)
## No Information Rate : 0.6429
## P-Value [Acc > NIR] : 8.822e-05
##
## Kappa : 0.5032
##
## Mcnemar's Test P-Value : 0.01481
##
## Sensitivity : 0.5636
## Specificity : 0.9091
## Pos Pred Value : 0.7750
## Neg Pred Value : 0.7895
## Prevalence : 0.3571
## Detection Rate : 0.2013
## Detection Prevalence : 0.2597
## Balanced Accuracy : 0.7364
##
## 'Positive' Class : 1
##
roc(finaldata.test$Outcome, as.numeric(logistic.pred),plot=TRUE,legacy.axes =TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## roc.default(response = finaldata.test$Outcome, predictor = as.numeric(logistic.pred), plot = TRUE, legacy.axes = TRUE)
##
## Data: as.numeric(logistic.pred) in 99 controls (finaldata.test$Outcome 0) < 55 cases (finaldata.test$Outcome 1).
## Area under the curve: 0.7364
roc(finaldata.test$Outcome, as.numeric(logistic.pred))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## roc.default(response = finaldata.test$Outcome, predictor = as.numeric(logistic.pred))
##
## Data: as.numeric(logistic.pred) in 99 controls (finaldata.test$Outcome 0) < 55 cases (finaldata.test$Outcome 1).
## Area under the curve: 0.7364
par(pty = "s")
imp = varImp(mod_logistic)
plot(imp)
With only significant variables
mod_logistic <- train(Outcome~Pregnancies+Glucose+BMI+DiabetesPedigreeFunction,
data = finaldata.train,
method = "glm",
family = "binomial",
trControl = fitControl)
summary(mod_logistic)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6145 -0.7336 -0.4064 0.7329 2.1783
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.006203 0.775224 -11.618 < 2e-16 ***
## Pregnancies 0.145565 0.031145 4.674 2.96e-06 ***
## Glucose 0.036161 0.003844 9.408 < 2e-16 ***
## BMI 0.089010 0.015962 5.576 2.45e-08 ***
## DiabetesPedigreeFunction 0.608931 0.321176 1.896 0.058 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 792.69 on 613 degrees of freedom
## Residual deviance: 577.30 on 609 degrees of freedom
## AIC: 587.3
##
## Number of Fisher Scoring iterations: 5
logistic.pred <- predict(mod_logistic, newdata = finaldata.test,type ="raw")
confusionMatrix(data = logistic.pred, finaldata.test$Outcome, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 88 26
## 1 11 29
##
## Accuracy : 0.7597
## 95% CI : (0.6844, 0.8248)
## No Information Rate : 0.6429
## P-Value [Acc > NIR] : 0.00125
##
## Kappa : 0.443
##
## Mcnemar's Test P-Value : 0.02136
##
## Sensitivity : 0.5273
## Specificity : 0.8889
## Pos Pred Value : 0.7250
## Neg Pred Value : 0.7719
## Prevalence : 0.3571
## Detection Rate : 0.1883
## Detection Prevalence : 0.2597
## Balanced Accuracy : 0.7081
##
## 'Positive' Class : 1
##
roc(finaldata.test$Outcome, as.numeric(logistic.pred),plot=TRUE,legacy.axes =TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## roc.default(response = finaldata.test$Outcome, predictor = as.numeric(logistic.pred), plot = TRUE, legacy.axes = TRUE)
##
## Data: as.numeric(logistic.pred) in 99 controls (finaldata.test$Outcome 0) < 55 cases (finaldata.test$Outcome 1).
## Area under the curve: 0.7081
roc(finaldata.test$Outcome, as.numeric(logistic.pred))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## roc.default(response = finaldata.test$Outcome, predictor = as.numeric(logistic.pred))
##
## Data: as.numeric(logistic.pred) in 99 controls (finaldata.test$Outcome 0) < 55 cases (finaldata.test$Outcome 1).
## Area under the curve: 0.7081
par(pty = "s")
imp = varImp(mod_logistic)
plot(imp)
Without cross validation
mymod = glm(Outcome~.,data=finaldata.train,family = "binomial")
summary(mymod)
##
## Call:
## glm(formula = Outcome ~ ., family = "binomial", data = finaldata.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5388 -0.7210 -0.4010 0.7269 2.1637
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.083586 0.903645 -10.052 < 2e-16 ***
## Pregnancies 0.123175 0.036194 3.403 0.000666 ***
## Glucose 0.037800 0.004294 8.802 < 2e-16 ***
## BloodPressure -0.010432 0.009596 -1.087 0.276989
## SkinThickness 0.001827 0.013643 0.134 0.893446
## Insulin -0.001505 0.001027 -1.466 0.142632
## BMI 0.098233 0.019188 5.120 3.06e-07 ***
## DiabetesPedigreeFunction 0.638807 0.326422 1.957 0.050347 .
## Age 0.014501 0.010559 1.373 0.169665
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 792.69 on 613 degrees of freedom
## Residual deviance: 572.61 on 605 degrees of freedom
## AIC: 590.61
##
## Number of Fisher Scoring iterations: 5
pred = predict(mymod, newdata=finaldata.test,type = "response")
(tab0.5 = table(pred > 0.5, finaldata.test$Outcome))
##
## 0 1
## FALSE 90 24
## TRUE 9 31
sum(diag(tab0.5)) / sum(tab0.5)
## [1] 0.7857143
roc(finaldata.test$Outcome, as.numeric(pred),plot=TRUE,legacy.axes =TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## roc.default(response = finaldata.test$Outcome, predictor = as.numeric(pred), plot = TRUE, legacy.axes = TRUE)
##
## Data: as.numeric(pred) in 99 controls (finaldata.test$Outcome 0) < 55 cases (finaldata.test$Outcome 1).
## Area under the curve: 0.8494
With only significant variables
mymod1 = glm(Outcome~Pregnancies+Glucose+BMI+DiabetesPedigreeFunction,data=finaldata.train,
family = "binomial")
summary(mymod1)
##
## Call:
## glm(formula = Outcome ~ Pregnancies + Glucose + BMI + DiabetesPedigreeFunction,
## family = "binomial", data = finaldata.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6145 -0.7336 -0.4064 0.7329 2.1783
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.006203 0.775224 -11.618 < 2e-16 ***
## Pregnancies 0.145565 0.031145 4.674 2.96e-06 ***
## Glucose 0.036161 0.003844 9.408 < 2e-16 ***
## BMI 0.089010 0.015962 5.576 2.45e-08 ***
## DiabetesPedigreeFunction 0.608931 0.321176 1.896 0.058 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 792.69 on 613 degrees of freedom
## Residual deviance: 577.30 on 609 degrees of freedom
## AIC: 587.3
##
## Number of Fisher Scoring iterations: 5
pred1 = predict(mymod1, newdata=finaldata.test,type = "response")
(tab0.5 = table(pred1 > 0.5, finaldata.test$Outcome))
##
## 0 1
## FALSE 88 26
## TRUE 11 29
sum(diag(tab0.5)) / sum(tab0.5)
## [1] 0.7597403
roc(finaldata.test$Outcome, as.numeric(pred1),plot=TRUE,legacy.axes =TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## roc.default(response = finaldata.test$Outcome, predictor = as.numeric(pred1), plot = TRUE, legacy.axes = TRUE)
##
## Data: as.numeric(pred1) in 99 controls (finaldata.test$Outcome 0) < 55 cases (finaldata.test$Outcome 1).
## Area under the curve: 0.8494