#I. Data exploration #a.data overview
pacman::p_load(tidyverse, caret, corrplot, caTools,knitr,car, ROCR,IRdisplay, e1071, earth, riv, woe)
data = read.csv('diabetes.csv')
head(data,4)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 4 0.167 21 0
str(data)
## 'data.frame': 768 obs. of 9 variables:
## $ Pregnancies : int 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : int 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure : int 72 66 64 66 40 74 50 0 70 96 ...
## $ SkinThickness : int 35 29 0 23 35 0 32 0 45 0 ...
## $ Insulin : int 0 0 0 94 168 0 88 0 543 0 ...
## $ BMI : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ DiabetesPedigreeFunction: num 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : int 50 31 32 21 33 30 26 29 53 54 ...
## $ Outcome : int 1 0 1 0 1 0 1 0 1 1 ...
summary(data)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## Outcome
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
#b.Summary, correlation from the correlation, we don’t see signigicant collinearity between independent variables from the last column, the Glucose has the highest relationship with the Outcome.
cor(data)
## Pregnancies Glucose BloodPressure SkinThickness
## Pregnancies 1.00000000 0.12945867 0.14128198 -0.08167177
## Glucose 0.12945867 1.00000000 0.15258959 0.05732789
## BloodPressure 0.14128198 0.15258959 1.00000000 0.20737054
## SkinThickness -0.08167177 0.05732789 0.20737054 1.00000000
## Insulin -0.07353461 0.33135711 0.08893338 0.43678257
## BMI 0.01768309 0.22107107 0.28180529 0.39257320
## DiabetesPedigreeFunction -0.03352267 0.13733730 0.04126495 0.18392757
## Age 0.54434123 0.26351432 0.23952795 -0.11397026
## Outcome 0.22189815 0.46658140 0.06506836 0.07475223
## Insulin BMI DiabetesPedigreeFunction
## Pregnancies -0.07353461 0.01768309 -0.03352267
## Glucose 0.33135711 0.22107107 0.13733730
## BloodPressure 0.08893338 0.28180529 0.04126495
## SkinThickness 0.43678257 0.39257320 0.18392757
## Insulin 1.00000000 0.19785906 0.18507093
## BMI 0.19785906 1.00000000 0.14064695
## DiabetesPedigreeFunction 0.18507093 0.14064695 1.00000000
## Age -0.04216295 0.03624187 0.03356131
## Outcome 0.13054795 0.29269466 0.17384407
## Age Outcome
## Pregnancies 0.54434123 0.22189815
## Glucose 0.26351432 0.46658140
## BloodPressure 0.23952795 0.06506836
## SkinThickness -0.11397026 0.07475223
## Insulin -0.04216295 0.13054795
## BMI 0.03624187 0.29269466
## DiabetesPedigreeFunction 0.03356131 0.17384407
## Age 1.00000000 0.23835598
## Outcome 0.23835598 1.00000000
pairs(~Pregnancies+Glucose+BloodPressure+SkinThickness+Insulin+BMI+DiabetesPedigreeFunction+Age,data=data, main="Scatterplot Matrix")
set the variables to factors (categorical data)
data$Outcome <- factor(data$Outcome)
str(data)
## 'data.frame': 768 obs. of 9 variables:
## $ Pregnancies : int 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : int 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure : int 72 66 64 66 40 74 50 0 70 96 ...
## $ SkinThickness : int 35 29 0 23 35 0 32 0 45 0 ...
## $ Insulin : int 0 0 0 94 168 0 88 0 543 0 ...
## $ BMI : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ DiabetesPedigreeFunction: num 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : int 50 31 32 21 33 30 26 29 53 54 ...
## $ Outcome : Factor w/ 2 levels "0","1": 2 1 2 1 2 1 2 1 2 2 ...
#c.Check whether the data distribution is balanced
data %>%
group_by(Outcome)%>%
summarise(per = n()/nrow(data))%>%
ggplot(aes(x=Outcome, y=per, fill = Outcome)) +
geom_bar(stat='identity') +
geom_text(aes(label = round(per, 2)), vjust = 2)
We can find that the data is almost balanced with 65 percent of 0 and 35 percent of 1
#II. Fitting the model #a.splitting into test and train data sets
#training and testing
#set initial seed
set.seed(8)
# create a boolean flag to split data
# sample.split from caTools
splitData = sample.split(data$Outcome, SplitRatio = 0.7)
#split_data
# create train and test datasets
train_set = data[splitData,]
head(train_set)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 7 3 78 50 32 88 31.0
## 8 10 115 0 0 0 35.3
## 9 2 197 70 45 543 30.5
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 7 0.248 26 1
## 8 0.134 29 0
## 9 0.158 53 1
nrow(train_set)/nrow(data)
## [1] 0.7005208
dim(train_set)
## [1] 538 9
test_set = data[!splitData,]
head(test_set)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## 10 8 125 96 0 0 0.0
## 13 10 139 80 0 0 27.1
## 14 1 189 60 23 846 30.1
## DiabetesPedigreeFunction Age Outcome
## 4 0.167 21 0
## 5 2.288 33 1
## 6 0.201 30 0
## 10 0.232 54 1
## 13 1.441 57 0
## 14 0.398 59 1
nrow(test_set)/nrow(data)
## [1] 0.2994792
dim(test_set)
## [1] 230 9
#b.Observe how well the model fits the data, e.g., AIC, p-values Since the training set has 538 pieces of data, the amount of data is large enough, so we can directly build the model with the training set.
#use all independent variables
model = glm(Outcome ~ ., data = train_set, family = binomial)
summary(model)
##
## Call:
## glm(formula = Outcome ~ ., family = binomial, data = train_set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5701 -0.7364 -0.4175 0.7408 2.3150
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.209874 0.846460 -9.699 < 2e-16 ***
## Pregnancies 0.142680 0.038089 3.746 0.00018 ***
## Glucose 0.036859 0.004522 8.152 3.59e-16 ***
## BloodPressure -0.014746 0.006354 -2.321 0.02031 *
## SkinThickness 0.001492 0.008452 0.176 0.85991
## Insulin -0.002527 0.001131 -2.234 0.02551 *
## BMI 0.092165 0.018143 5.080 3.78e-07 ***
## DiabetesPedigreeFunction 0.861031 0.373380 2.306 0.02111 *
## Age 0.005372 0.010948 0.491 0.62367
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 696.28 on 537 degrees of freedom
## Residual deviance: 505.20 on 529 degrees of freedom
## AIC: 523.2
##
## Number of Fisher Scoring iterations: 5
# check for multi-collinearity
vif(model)
## Pregnancies Glucose BloodPressure
## 1.363305 1.295265 1.193943
## SkinThickness Insulin BMI
## 1.556465 1.651165 1.216055
## DiabetesPedigreeFunction Age
## 1.047335 1.484524
Take out significant variables (p-value <0.05) to build a new model
#use significant independent variables
model_improved=glm(Outcome ~ Pregnancies+ Glucose+ BloodPressure+ BMI+ DiabetesPedigreeFunction, data = train_set, family = binomial)
summary(model_improved)
##
## Call:
## glm(formula = Outcome ~ Pregnancies + Glucose + BloodPressure +
## BMI + DiabetesPedigreeFunction, family = binomial, data = train_set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6524 -0.7483 -0.4113 0.7425 2.2371
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.638016 0.787246 -9.702 < 2e-16 ***
## Pregnancies 0.162646 0.033159 4.905 9.34e-07 ***
## Glucose 0.033781 0.003988 8.471 < 2e-16 ***
## BloodPressure -0.014198 0.006047 -2.348 0.0189 *
## BMI 0.084731 0.016901 5.013 5.35e-07 ***
## DiabetesPedigreeFunction 0.731780 0.365311 2.003 0.0452 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 696.28 on 537 degrees of freedom
## Residual deviance: 512.08 on 532 degrees of freedom
## AIC: 524.08
##
## Number of Fisher Scoring iterations: 5
# check for multi-collinearity
vif(model_improved)
## Pregnancies Glucose BloodPressure
## 1.051283 1.026198 1.125642
## BMI DiabetesPedigreeFunction
## 1.083058 1.003814
We can see that both AIC and vifs turns down, so this model is better than before, and all vifs are lower than 5.
#III. Interpret the model (what variables are important, does it make sense) #a. Accuracy of model We use both the train set and the test set to verify the accuracy of the model’s predictions
# predict on the train set
trainPredict = predict(model_improved, newdata = train_set,
type = 'response')
# assign 0s or 1s for the values
p_class = ifelse(trainPredict > 0.5, 1,0)
CM_Train=confusionMatrix(table(p_class,train_set$Outcome), positive='1')
CM_Train
## Confusion Matrix and Statistics
##
##
## p_class 0 1
## 0 308 79
## 1 42 109
##
## Accuracy : 0.7751
## 95% CI : (0.7374, 0.8097)
## No Information Rate : 0.6506
## P-Value [Acc > NIR] : 2.402e-10
##
## Kappa : 0.4817
##
## Mcnemar's Test P-Value : 0.001065
##
## Sensitivity : 0.5798
## Specificity : 0.8800
## Pos Pred Value : 0.7219
## Neg Pred Value : 0.7959
## Prevalence : 0.3494
## Detection Rate : 0.2026
## Detection Prevalence : 0.2807
## Balanced Accuracy : 0.7299
##
## 'Positive' Class : 1
##
# predict on the test set
testPredict = predict(model_improved, newdata = test_set, type = 'response')
# assign 0s or 1s for the values
p_class = ifelse(testPredict > 0.5, 1,0)
CM_test=confusionMatrix(table(p_class,test_set$Outcome), positive='1')
CM_test
## Confusion Matrix and Statistics
##
##
## p_class 0 1
## 0 132 35
## 1 18 45
##
## Accuracy : 0.7696
## 95% CI : (0.7097, 0.8224)
## No Information Rate : 0.6522
## P-Value [Acc > NIR] : 7.748e-05
##
## Kappa : 0.4656
##
## Mcnemar's Test P-Value : 0.02797
##
## Sensitivity : 0.5625
## Specificity : 0.8800
## Pos Pred Value : 0.7143
## Neg Pred Value : 0.7904
## Prevalence : 0.3478
## Detection Rate : 0.1957
## Detection Prevalence : 0.2739
## Balanced Accuracy : 0.7212
##
## 'Positive' Class : 1
##
We can see that the accuracy of test set is 0.7739, is very close to the accuracy of train set (0.7751), so this model is very robust. In addition, both accuracy rates are higher than 0.7, so the prediction performance of the model is very good.
#b. Confusion matrix the confusion matrix of the training set and the test set can also be seen from the above results. From the data of confusion matrix, we can further draw some evaluation indicators:
CM_Train["table"]
## $table
##
## p_class 0 1
## 0 308 79
## 1 42 109
Recall_Train=109/(42+109)
Recall_Train
## [1] 0.7218543
Precision_Train=109/(109+79)
Precision_Train
## [1] 0.5797872
CM_test["table"]
## $table
##
## p_class 0 1
## 0 132 35
## 1 18 45
Recall_test=45/(18+45)
Recall_test
## [1] 0.7142857
Precision_test=45/(35+45)
Precision_test
## [1] 0.5625
#c. lift chart
pred = prediction(trainPredict, train_set$Outcome )
perf = performance( pred, "lift", "rpp" ) #RPP=Rate of positive prediction
plot(perf, main="lift curve", xlab = 'Proportion of Customers (sorted prob)')
from lift curve, we think those part which value is above or around 2 is good. so the top 30% proportion is great.
#d. draw roc.curve
library(ROSE)
## Loaded ROSE 0.0-4
roc.curve(train_set$Outcome,trainPredict)
## Area under the curve (AUC): 0.836
The ROC curve seems good, and the AUC is 0.836(relatively large), so our prediction model is good.