PA Workshop Diabetes exercise

#I. Data exploration #a.data overview

pacman::p_load(tidyverse, caret, corrplot, caTools,knitr,car, ROCR,IRdisplay, e1071, earth, riv, woe)

data =  read.csv('diabetes.csv')
head(data,4)
##   Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 1           6     148            72            35       0 33.6
## 2           1      85            66            29       0 26.6
## 3           8     183            64             0       0 23.3
## 4           1      89            66            23      94 28.1
##   DiabetesPedigreeFunction Age Outcome
## 1                    0.627  50       1
## 2                    0.351  31       0
## 3                    0.672  32       1
## 4                    0.167  21       0
str(data)
## 'data.frame':    768 obs. of  9 variables:
##  $ Pregnancies             : int  6 1 8 1 0 5 3 10 2 8 ...
##  $ Glucose                 : int  148 85 183 89 137 116 78 115 197 125 ...
##  $ BloodPressure           : int  72 66 64 66 40 74 50 0 70 96 ...
##  $ SkinThickness           : int  35 29 0 23 35 0 32 0 45 0 ...
##  $ Insulin                 : int  0 0 0 94 168 0 88 0 543 0 ...
##  $ BMI                     : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ DiabetesPedigreeFunction: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : int  50 31 32 21 33 30 26 29 53 54 ...
##  $ Outcome                 : int  1 0 1 0 1 0 1 0 1 1 ...
summary(data)
##   Pregnancies        Glucose      BloodPressure    SkinThickness  
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     Insulin           BMI        DiabetesPedigreeFunction      Age       
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780           Min.   :21.00  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437           1st Qu.:24.00  
##  Median : 30.5   Median :32.00   Median :0.3725           Median :29.00  
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719           Mean   :33.24  
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262           3rd Qu.:41.00  
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200           Max.   :81.00  
##     Outcome     
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.349  
##  3rd Qu.:1.000  
##  Max.   :1.000

#b.Summary, correlation from the correlation, we don’t see signigicant collinearity between independent variables from the last column, the Glucose has the highest relationship with the Outcome.

cor(data)
##                          Pregnancies    Glucose BloodPressure SkinThickness
## Pregnancies               1.00000000 0.12945867    0.14128198   -0.08167177
## Glucose                   0.12945867 1.00000000    0.15258959    0.05732789
## BloodPressure             0.14128198 0.15258959    1.00000000    0.20737054
## SkinThickness            -0.08167177 0.05732789    0.20737054    1.00000000
## Insulin                  -0.07353461 0.33135711    0.08893338    0.43678257
## BMI                       0.01768309 0.22107107    0.28180529    0.39257320
## DiabetesPedigreeFunction -0.03352267 0.13733730    0.04126495    0.18392757
## Age                       0.54434123 0.26351432    0.23952795   -0.11397026
## Outcome                   0.22189815 0.46658140    0.06506836    0.07475223
##                              Insulin        BMI DiabetesPedigreeFunction
## Pregnancies              -0.07353461 0.01768309              -0.03352267
## Glucose                   0.33135711 0.22107107               0.13733730
## BloodPressure             0.08893338 0.28180529               0.04126495
## SkinThickness             0.43678257 0.39257320               0.18392757
## Insulin                   1.00000000 0.19785906               0.18507093
## BMI                       0.19785906 1.00000000               0.14064695
## DiabetesPedigreeFunction  0.18507093 0.14064695               1.00000000
## Age                      -0.04216295 0.03624187               0.03356131
## Outcome                   0.13054795 0.29269466               0.17384407
##                                  Age    Outcome
## Pregnancies               0.54434123 0.22189815
## Glucose                   0.26351432 0.46658140
## BloodPressure             0.23952795 0.06506836
## SkinThickness            -0.11397026 0.07475223
## Insulin                  -0.04216295 0.13054795
## BMI                       0.03624187 0.29269466
## DiabetesPedigreeFunction  0.03356131 0.17384407
## Age                       1.00000000 0.23835598
## Outcome                   0.23835598 1.00000000
pairs(~Pregnancies+Glucose+BloodPressure+SkinThickness+Insulin+BMI+DiabetesPedigreeFunction+Age,data=data, main="Scatterplot Matrix")

set the variables to factors (categorical data)

data$Outcome <- factor(data$Outcome)
str(data)
## 'data.frame':    768 obs. of  9 variables:
##  $ Pregnancies             : int  6 1 8 1 0 5 3 10 2 8 ...
##  $ Glucose                 : int  148 85 183 89 137 116 78 115 197 125 ...
##  $ BloodPressure           : int  72 66 64 66 40 74 50 0 70 96 ...
##  $ SkinThickness           : int  35 29 0 23 35 0 32 0 45 0 ...
##  $ Insulin                 : int  0 0 0 94 168 0 88 0 543 0 ...
##  $ BMI                     : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ DiabetesPedigreeFunction: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : int  50 31 32 21 33 30 26 29 53 54 ...
##  $ Outcome                 : Factor w/ 2 levels "0","1": 2 1 2 1 2 1 2 1 2 2 ...

#c.Check whether the data distribution is balanced

data %>%
  group_by(Outcome)%>%
  summarise(per = n()/nrow(data))%>%
  ggplot(aes(x=Outcome, y=per, fill = Outcome)) + 
  geom_bar(stat='identity') +
  geom_text(aes(label = round(per, 2)), vjust = 2)

We can find that the data is almost balanced with 65 percent of 0 and 35 percent of 1

#II. Fitting the model #a.splitting into test and train data sets

#training and testing
#set initial seed
set.seed(8)

# create a boolean flag to split data
# sample.split from caTools
splitData = sample.split(data$Outcome, SplitRatio = 0.7)

#split_data
# create train and test datasets
train_set = data[splitData,]
head(train_set)
##   Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 1           6     148            72            35       0 33.6
## 2           1      85            66            29       0 26.6
## 3           8     183            64             0       0 23.3
## 7           3      78            50            32      88 31.0
## 8          10     115             0             0       0 35.3
## 9           2     197            70            45     543 30.5
##   DiabetesPedigreeFunction Age Outcome
## 1                    0.627  50       1
## 2                    0.351  31       0
## 3                    0.672  32       1
## 7                    0.248  26       1
## 8                    0.134  29       0
## 9                    0.158  53       1
nrow(train_set)/nrow(data)
## [1] 0.7005208
dim(train_set)
## [1] 538   9
test_set = data[!splitData,]
head(test_set)
##    Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 4            1      89            66            23      94 28.1
## 5            0     137            40            35     168 43.1
## 6            5     116            74             0       0 25.6
## 10           8     125            96             0       0  0.0
## 13          10     139            80             0       0 27.1
## 14           1     189            60            23     846 30.1
##    DiabetesPedigreeFunction Age Outcome
## 4                     0.167  21       0
## 5                     2.288  33       1
## 6                     0.201  30       0
## 10                    0.232  54       1
## 13                    1.441  57       0
## 14                    0.398  59       1
nrow(test_set)/nrow(data)
## [1] 0.2994792
dim(test_set)
## [1] 230   9

#b.Observe how well the model fits the data, e.g., AIC, p-values Since the training set has 538 pieces of data, the amount of data is large enough, so we can directly build the model with the training set.

#use all independent variables 
model = glm(Outcome ~ ., data = train_set, family = binomial)
summary(model)
## 
## Call:
## glm(formula = Outcome ~ ., family = binomial, data = train_set)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5701  -0.7364  -0.4175   0.7408   2.3150  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -8.209874   0.846460  -9.699  < 2e-16 ***
## Pregnancies               0.142680   0.038089   3.746  0.00018 ***
## Glucose                   0.036859   0.004522   8.152 3.59e-16 ***
## BloodPressure            -0.014746   0.006354  -2.321  0.02031 *  
## SkinThickness             0.001492   0.008452   0.176  0.85991    
## Insulin                  -0.002527   0.001131  -2.234  0.02551 *  
## BMI                       0.092165   0.018143   5.080 3.78e-07 ***
## DiabetesPedigreeFunction  0.861031   0.373380   2.306  0.02111 *  
## Age                       0.005372   0.010948   0.491  0.62367    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 696.28  on 537  degrees of freedom
## Residual deviance: 505.20  on 529  degrees of freedom
## AIC: 523.2
## 
## Number of Fisher Scoring iterations: 5
# check for multi-collinearity
vif(model)
##              Pregnancies                  Glucose            BloodPressure 
##                 1.363305                 1.295265                 1.193943 
##            SkinThickness                  Insulin                      BMI 
##                 1.556465                 1.651165                 1.216055 
## DiabetesPedigreeFunction                      Age 
##                 1.047335                 1.484524

Take out significant variables (p-value <0.05) to build a new model

#use significant independent variables
model_improved=glm(Outcome ~ Pregnancies+ Glucose+ BloodPressure+ BMI+ DiabetesPedigreeFunction, data = train_set, family = binomial)
summary(model_improved)
## 
## Call:
## glm(formula = Outcome ~ Pregnancies + Glucose + BloodPressure + 
##     BMI + DiabetesPedigreeFunction, family = binomial, data = train_set)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6524  -0.7483  -0.4113   0.7425   2.2371  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -7.638016   0.787246  -9.702  < 2e-16 ***
## Pregnancies               0.162646   0.033159   4.905 9.34e-07 ***
## Glucose                   0.033781   0.003988   8.471  < 2e-16 ***
## BloodPressure            -0.014198   0.006047  -2.348   0.0189 *  
## BMI                       0.084731   0.016901   5.013 5.35e-07 ***
## DiabetesPedigreeFunction  0.731780   0.365311   2.003   0.0452 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 696.28  on 537  degrees of freedom
## Residual deviance: 512.08  on 532  degrees of freedom
## AIC: 524.08
## 
## Number of Fisher Scoring iterations: 5
# check for multi-collinearity
vif(model_improved)
##              Pregnancies                  Glucose            BloodPressure 
##                 1.051283                 1.026198                 1.125642 
##                      BMI DiabetesPedigreeFunction 
##                 1.083058                 1.003814

We can see that both AIC and vifs turns down, so this model is better than before, and all vifs are lower than 5.

#III. Interpret the model (what variables are important, does it make sense) #a. Accuracy of model We use both the train set and the test set to verify the accuracy of the model’s predictions

# predict on the train set
trainPredict = predict(model_improved, newdata = train_set, 
                       type = 'response')
# assign 0s or 1s for the values
p_class = ifelse(trainPredict > 0.5, 1,0)
CM_Train=confusionMatrix(table(p_class,train_set$Outcome), positive='1')
CM_Train
## Confusion Matrix and Statistics
## 
##        
## p_class   0   1
##       0 308  79
##       1  42 109
##                                           
##                Accuracy : 0.7751          
##                  95% CI : (0.7374, 0.8097)
##     No Information Rate : 0.6506          
##     P-Value [Acc > NIR] : 2.402e-10       
##                                           
##                   Kappa : 0.4817          
##                                           
##  Mcnemar's Test P-Value : 0.001065        
##                                           
##             Sensitivity : 0.5798          
##             Specificity : 0.8800          
##          Pos Pred Value : 0.7219          
##          Neg Pred Value : 0.7959          
##              Prevalence : 0.3494          
##          Detection Rate : 0.2026          
##    Detection Prevalence : 0.2807          
##       Balanced Accuracy : 0.7299          
##                                           
##        'Positive' Class : 1               
## 
# predict on the test set
testPredict = predict(model_improved, newdata = test_set, type = 'response')
# assign 0s or 1s for the values
p_class = ifelse(testPredict > 0.5, 1,0)
CM_test=confusionMatrix(table(p_class,test_set$Outcome), positive='1')
CM_test
## Confusion Matrix and Statistics
## 
##        
## p_class   0   1
##       0 132  35
##       1  18  45
##                                           
##                Accuracy : 0.7696          
##                  95% CI : (0.7097, 0.8224)
##     No Information Rate : 0.6522          
##     P-Value [Acc > NIR] : 7.748e-05       
##                                           
##                   Kappa : 0.4656          
##                                           
##  Mcnemar's Test P-Value : 0.02797         
##                                           
##             Sensitivity : 0.5625          
##             Specificity : 0.8800          
##          Pos Pred Value : 0.7143          
##          Neg Pred Value : 0.7904          
##              Prevalence : 0.3478          
##          Detection Rate : 0.1957          
##    Detection Prevalence : 0.2739          
##       Balanced Accuracy : 0.7212          
##                                           
##        'Positive' Class : 1               
## 

We can see that the accuracy of test set is 0.7739, is very close to the accuracy of train set (0.7751), so this model is very robust. In addition, both accuracy rates are higher than 0.7, so the prediction performance of the model is very good.

#b. Confusion matrix the confusion matrix of the training set and the test set can also be seen from the above results. From the data of confusion matrix, we can further draw some evaluation indicators:

CM_Train["table"]
## $table
##        
## p_class   0   1
##       0 308  79
##       1  42 109
Recall_Train=109/(42+109)
Recall_Train
## [1] 0.7218543
Precision_Train=109/(109+79)
Precision_Train
## [1] 0.5797872
CM_test["table"]
## $table
##        
## p_class   0   1
##       0 132  35
##       1  18  45
Recall_test=45/(18+45)
Recall_test
## [1] 0.7142857
Precision_test=45/(35+45)
Precision_test
## [1] 0.5625

#c. lift chart

pred = prediction(trainPredict, train_set$Outcome )
perf = performance( pred, "lift", "rpp" ) #RPP=Rate of positive prediction 

plot(perf, main="lift curve", xlab = 'Proportion of Customers (sorted prob)')

from lift curve, we think those part which value is above or around 2 is good. so the top 30% proportion is great.

#d. draw roc.curve

library(ROSE)
## Loaded ROSE 0.0-4
roc.curve(train_set$Outcome,trainPredict)

## Area under the curve (AUC): 0.836

The ROC curve seems good, and the AUC is 0.836(relatively large), so our prediction model is good.