Project Objective- predict an individual has diabetes or not where the independent variable-'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness','Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age' and the dependent Variable-'outcome'(0-no, 1-yes)
#########################
#### Import Dataset ####
########################
diab_data <- read.csv(file= "C:/Users/WASIM/Documents/diabetes.csv", header=TRUE)
str(diab_data)
## 'data.frame': 768 obs. of 9 variables:
## $ Pregnancies : int 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : int 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure : int 72 66 64 66 40 74 50 0 70 96 ...
## $ SkinThickness : int 35 29 0 23 35 0 32 0 45 0 ...
## $ Insulin : int 0 0 0 94 168 0 88 0 543 0 ...
## $ BMI : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ DiabetesPedigreeFunction: num 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : int 50 31 32 21 33 30 26 29 53 54 ...
## $ Outcome : int 1 0 1 0 1 0 1 0 1 1 ...
diab_data$Outcome <- as.factor(diab_data$Outcome)
str(diab_data)
## 'data.frame': 768 obs. of 9 variables:
## $ Pregnancies : int 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : int 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure : int 72 66 64 66 40 74 50 0 70 96 ...
## $ SkinThickness : int 35 29 0 23 35 0 32 0 45 0 ...
## $ Insulin : int 0 0 0 94 168 0 88 0 543 0 ...
## $ BMI : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ DiabetesPedigreeFunction: num 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : int 50 31 32 21 33 30 26 29 53 54 ...
## $ Outcome : Factor w/ 2 levels "0","1": 2 1 2 1 2 1 2 1 2 2 ...
###########################################
#### Split dataset into test and train ####
###########################################
set.seed(789)
p<- c(0.9,0.1)
sample_diab_data <- sample(2, nrow(diab_data),replace = TRUE, prob = p)
train_data <- diab_data[sample_diab_data==1, ]
test_data <- diab_data[sample_diab_data==2, ]
#######################
#### Model fitting####
#######################
train_model<-glm(Outcome~ Pregnancies+Glucose+BloodPressure+SkinThickness+Insulin+BMI+DiabetesPedigreeFunction+Age,
data=train_data, family= 'binomial')
summary(train_model)
##
## Call:
## glm(formula = Outcome ~ Pregnancies + Glucose + BloodPressure +
## SkinThickness + Insulin + BMI + DiabetesPedigreeFunction +
## Age, family = "binomial", data = train_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5071 -0.7378 -0.4230 0.7415 2.8163
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.2717155 0.7539469 -10.971 < 2e-16 ***
## Pregnancies 0.1190562 0.0342033 3.481 0.00050 ***
## Glucose 0.0331355 0.0038136 8.689 < 2e-16 ***
## BloodPressure -0.0163122 0.0055712 -2.928 0.00341 **
## SkinThickness 0.0006007 0.0072855 0.082 0.93428
## Insulin -0.0014143 0.0009306 -1.520 0.12856
## BMI 0.0999325 0.0164000 6.093 1.11e-09 ***
## DiabetesPedigreeFunction 0.8954655 0.3156768 2.837 0.00456 **
## Age 0.0164474 0.0098323 1.673 0.09437 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 889.91 on 690 degrees of freedom
## Residual deviance: 650.66 on 682 degrees of freedom
## AIC: 668.66
##
## Number of Fisher Scoring iterations: 5
It is evident from summary that 'SkinThickness', 'Insulin' and 'Age' are statistically insignificant. So we drop those variables and rerun the model.
####################################################
#### Drop statistically insignificant varibles ####
####################################################
train_model<-glm(Outcome~ Pregnancies+Glucose+BloodPressure+BMI+DiabetesPedigreeFunction,
data=train_data, family= 'binomial')
summary(train_model)
##
## Call:
## glm(formula = Outcome ~ Pregnancies + Glucose + BloodPressure +
## BMI + DiabetesPedigreeFunction, family = "binomial", data = train_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7087 -0.7489 -0.4276 0.7164 2.8444
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.752343 0.708099 -10.948 < 2e-16 ***
## Pregnancies 0.153757 0.029502 5.212 1.87e-07 ***
## Glucose 0.032424 0.003461 9.368 < 2e-16 ***
## BloodPressure -0.014888 0.005338 -2.789 0.00528 **
## BMI 0.093946 0.015217 6.174 6.67e-10 ***
## DiabetesPedigreeFunction 0.853092 0.308504 2.765 0.00569 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 889.91 on 690 degrees of freedom
## Residual deviance: 656.81 on 685 degrees of freedom
## AIC: 668.81
##
## Number of Fisher Scoring iterations: 5
Further it appears that 'BloodPressure' is not a statistically significant predictor. So we drop it from our final model
train_model<-glm(Outcome~ Pregnancies+Glucose+BMI+DiabetesPedigreeFunction,
data=train_data, family= 'binomial')
summary(train_model)
##
## Call:
## glm(formula = Outcome ~ Pregnancies + Glucose + BMI + DiabetesPedigreeFunction,
## family = "binomial", data = train_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6654 -0.7430 -0.4348 0.7700 2.7845
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.332608 0.688536 -12.102 < 2e-16 ***
## Pregnancies 0.139399 0.028614 4.872 1.11e-06 ***
## Glucose 0.031605 0.003415 9.256 < 2e-16 ***
## BMI 0.085137 0.014787 5.757 8.54e-09 ***
## DiabetesPedigreeFunction 0.836113 0.304788 2.743 0.00608 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 889.91 on 690 degrees of freedom
## Residual deviance: 664.81 on 686 degrees of freedom
## AIC: 674.81
##
## Number of Fisher Scoring iterations: 5
####################
#### Prediction ####
####################
pred <- predict(train_model,train_data,type = 'response')
head(pred)
## 1 2 3 4 5 6
## 0.63788184 0.04979995 0.75247811 0.05476523 0.82916456 0.16494865
head(train_data)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 4 0.167 21 0
## 5 2.288 33 1
## 6 0.201 30 0
#####################################################################
#### Model Accuaracy & Misclassified percentage for training data####
#####################################################################
Pred_accuracy <-ifelse(pred> 0.6, 1,0)
Pred_accuracy_table <- table(Predicted= Pred_accuracy, Real= train_data$Outcome )
Pred_accuracy_table
## Real
## Predicted 0 1
## 0 423 129
## 1 30 109
Percentage_misclass_train <-(1-sum(diag(Pred_accuracy_table))/sum(Pred_accuracy_table))*100
Percentage_misclass_train
## [1] 23.01013
#################################################################
#### Model Accuaracy & Misclassified percentage for test data####
#################################################################
pred_test <- predict(train_model,test_data,type = 'response')
Pred_accuracy <-ifelse(pred_test> 0.6, 1,0)
Pred_accuracy_table <- table(Predicted= Pred_accuracy, Real= test_data$Outcome )
Pred_accuracy_table
## Real
## Predicted 0 1
## 0 43 13
## 1 4 17
Percentage_misclass_test <-(1-sum(diag(Pred_accuracy_table))/sum(Pred_accuracy_table))*100
Percentage_misclass_test
## [1] 22.07792