1 Load libraries

library(ggplot2)
library(dplyr)
library(gridExtra)
library(corrplot)
diabetes <- read.csv('diabetes.csv')
dim(diabetes) # number of rows and columns
## [1] 768   9
str(diabetes) # data structure 
## 'data.frame':    768 obs. of  9 variables:
##  $ Pregnancies             : int  6 1 8 1 0 5 3 10 2 8 ...
##  $ Glucose                 : int  148 85 183 89 137 116 78 115 197 125 ...
##  $ BloodPressure           : int  72 66 64 66 40 74 50 0 70 96 ...
##  $ SkinThickness           : int  35 29 0 23 35 0 32 0 45 0 ...
##  $ Insulin                 : int  0 0 0 94 168 0 88 0 543 0 ...
##  $ BMI                     : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ DiabetesPedigreeFunction: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : int  50 31 32 21 33 30 26 29 53 54 ...
##  $ Outcome                 : int  1 0 1 0 1 0 1 0 1 1 ...
head(diabetes) # first six rows
##   Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 1           6     148            72            35       0 33.6
## 2           1      85            66            29       0 26.6
## 3           8     183            64             0       0 23.3
## 4           1      89            66            23      94 28.1
## 5           0     137            40            35     168 43.1
## 6           5     116            74             0       0 25.6
##   DiabetesPedigreeFunction Age Outcome
## 1                    0.627  50       1
## 2                    0.351  31       0
## 3                    0.672  32       1
## 4                    0.167  21       0
## 5                    2.288  33       1
## 6                    0.201  30       0

2 Exploratory Data Analysis and Feature Selection

2.1 Missing values

cat("Number of missing value:", sum(is.na(diabetes)), "\n")
## Number of missing value: 0
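Although is.na() reports no explicit missing values, the head() output above shows zeros in columns such as Glucose, SkinThickness, and Insulin, where a value of zero is not a plausible measurement and may stand in for a missing one. A quick sketch to count them (the choice of columns is our assumption about which variables cannot truly be zero):

zero.cols <- c("Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI")
colSums(diabetes[, zero.cols] == 0) # zeros that likely encode missing values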

2.2 Statistical summary

summary(diabetes)
##   Pregnancies        Glucose      BloodPressure    SkinThickness  
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     Insulin           BMI        DiabetesPedigreeFunction      Age       
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780           Min.   :21.00  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437           1st Qu.:24.00  
##  Median : 30.5   Median :32.00   Median :0.3725           Median :29.00  
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719           Mean   :33.24  
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262           3rd Qu.:41.00  
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200           Max.   :81.00  
##     Outcome     
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.349  
##  3rd Qu.:1.000  
##  Max.   :1.000

2.3 Histogram of numeric variables

# after_stat() replaces the deprecated ..count.. notation in current ggplot2
p1 <- ggplot(diabetes, aes(x = Pregnancies)) + ggtitle("Number of times pregnant") +
  geom_histogram(aes(y = after_stat(100 * count / sum(count))), binwidth = 1, colour = "black", fill = "white") + ylab("Percentage")

p2 <- ggplot(diabetes, aes(x = Glucose)) + ggtitle("Glucose") +
  geom_histogram(aes(y = after_stat(100 * count / sum(count))), binwidth = 5, colour = "black", fill = "white") + ylab("Percentage")

p3 <- ggplot(diabetes, aes(x = BloodPressure)) + ggtitle("Blood Pressure") +
  geom_histogram(aes(y = after_stat(100 * count / sum(count))), binwidth = 2, colour = "black", fill = "white") + ylab("Percentage")

p4 <- ggplot(diabetes, aes(x = SkinThickness)) + ggtitle("Skin Thickness") +
  geom_histogram(aes(y = after_stat(100 * count / sum(count))), binwidth = 2, colour = "black", fill = "white") + ylab("Percentage")

p5 <- ggplot(diabetes, aes(x = Insulin)) + ggtitle("Insulin") +
  geom_histogram(aes(y = after_stat(100 * count / sum(count))), binwidth = 20, colour = "black", fill = "white") + ylab("Percentage")

p6 <- ggplot(diabetes, aes(x = BMI)) + ggtitle("Body Mass Index") +
  geom_histogram(aes(y = after_stat(100 * count / sum(count))), binwidth = 1, colour = "black", fill = "white") + ylab("Percentage")

p7 <- ggplot(diabetes, aes(x = DiabetesPedigreeFunction)) + ggtitle("Diabetes Pedigree Function") +
  geom_histogram(aes(y = after_stat(100 * count / sum(count))), colour = "black", fill = "white") + ylab("Percentage") # default binning

p8 <- ggplot(diabetes, aes(x = Age)) + ggtitle("Age") +
  geom_histogram(aes(y = after_stat(100 * count / sum(count))), binwidth = 1, colour = "black", fill = "white") + ylab("Percentage")

grid.arrange(p1, p2, p3, p4, p5, p6, p7, p8, ncol = 2)

Note: All of the variables appear to have reasonably broad distributions, so all will be kept for the regression analysis.

2.4 Correlation between numeric variables

numeric.var <- sapply(diabetes, is.numeric)
corr.matrix <- cor(diabetes[, numeric.var])

corrplot(corr.matrix, main = "\n\nCorrelation Plot for Numerical Variables",
         order = "hclust", tl.col = "black", tl.srt = 45, tl.cex = 0.5, cl.cex = 0.5)

Note: The variables are not strongly correlated with one another, so multicollinearity should not be a concern for the regression model.
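To check the exact values behind the plot rather than eyeballing the colors, the matrix can be printed directly:

round(corr.matrix, 2) # numeric view of the pairwise correlations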

2.5 Correlation with outcome variable

# Use data = rather than attach() so the search path stays clean
par(mfrow = c(2, 4))

boxplot(Pregnancies ~ Outcome, data = diabetes, main = "No. of Pregnancies vs. Diabetes",
        xlab = "Outcome", ylab = "Pregnancies")
boxplot(Glucose ~ Outcome, data = diabetes, main = "Glucose vs. Diabetes",
        xlab = "Outcome", ylab = "Glucose")
boxplot(BloodPressure ~ Outcome, data = diabetes, main = "Blood Pressure vs. Diabetes",
        xlab = "Outcome", ylab = "Blood Pressure")
boxplot(SkinThickness ~ Outcome, data = diabetes, main = "Skin Thickness vs. Diabetes",
        xlab = "Outcome", ylab = "Skin Thickness")
boxplot(Insulin ~ Outcome, data = diabetes, main = "Insulin vs. Diabetes",
        xlab = "Outcome", ylab = "Insulin")
boxplot(BMI ~ Outcome, data = diabetes, main = "BMI vs. Diabetes",
        xlab = "Outcome", ylab = "BMI")
boxplot(DiabetesPedigreeFunction ~ Outcome, data = diabetes,
        main = "Diabetes Pedigree Function vs. Diabetes",
        xlab = "Outcome", ylab = "DiabetesPedigreeFunction")
boxplot(Age ~ Outcome, data = diabetes, main = "Age vs. Diabetes",
        xlab = "Outcome", ylab = "Age")

Note: 1. Blood pressure and skin thickness show little variation between the two outcome groups, so they will be excluded from the model. 2. The other variables show some degree of association with the outcome, so they will be kept.
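The per-group means tell the same story numerically; a quick check with the already-loaded dplyr (a sketch, assuming dplyr >= 1.0 for across()):

diabetes %>%
  group_by(Outcome) %>%
  summarise(across(everything(), mean)) # mean of every predictor within each outcome group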

3 Logistic Regression

3.1 Model

diabetes$BloodPressure <- NULL # drop the two weak predictors identified above
diabetes$SkinThickness <- NULL

train <- diabetes[1:540, ]   # first 540 rows (~70%) for training
test <- diabetes[541:768, ]  # remaining 228 rows for testing
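This sequential split assumes the rows are in no particular order. If that is uncertain, a randomized split of the same sizes is safer (shown commented out so the printed results below stay reproducible; the seed value is arbitrary):

# set.seed(42)
# train.idx <- sample(nrow(diabetes), 540)
# train <- diabetes[train.idx, ]
# test <- diabetes[-train.idx, ]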

model <- glm(Outcome ~ ., family = binomial(link = 'logit'), data = train)

summary(model)
## 
## Call:
## glm(formula = Outcome ~ ., family = binomial(link = "logit"), 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4366  -0.7741  -0.4312   0.8021   2.7310  
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -8.3461752  0.8157916 -10.231  < 2e-16 ***
## Pregnancies               0.1246856  0.0373214   3.341 0.000835 ***
## Glucose                   0.0315778  0.0042497   7.431 1.08e-13 ***
## Insulin                  -0.0013400  0.0009441  -1.419 0.155781    
## BMI                       0.0881521  0.0164090   5.372 7.78e-08 ***
## DiabetesPedigreeFunction  0.9642132  0.3430094   2.811 0.004938 ** 
## Age                       0.0018904  0.0107225   0.176 0.860053    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 700.47  on 539  degrees of freedom
## Residual deviance: 526.56  on 533  degrees of freedom
## AIC: 540.56
## 
## Number of Fisher Scoring iterations: 5

Note: The three most relevant features are “Glucose”, “BMI” and “Pregnancies”, as indicated by their low p-values.

“Insulin” and “Age” do not appear statistically significant.
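Since logistic regression coefficients are on the log-odds scale, exponentiating them yields odds ratios, which are often easier to interpret:

exp(coef(model)) # odds ratio per one-unit increase in each predictor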

3.2 Anova test

anova(model, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Outcome
## 
## Terms added sequentially (first to last)
## 
## 
##                          Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                                       539     700.47              
## Pregnancies               1   26.314       538     674.16 2.901e-07 ***
## Glucose                   1  102.960       537     571.20 < 2.2e-16 ***
## Insulin                   1    0.062       536     571.14  0.803341    
## BMI                       1   36.135       535     535.00 1.841e-09 ***
## DiabetesPedigreeFunction  1    8.414       534     526.59  0.003723 ** 
## Age                       1    0.031       533     526.56  0.860201    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Note: Adding Insulin and Age has little effect on the residual deviance, which is consistent with their non-significant coefficients above.
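One way to act on this is to refit the model without Insulin and Age and test the two models against each other (a sketch; the reduced-model name is illustrative):

model.reduced <- glm(Outcome ~ Pregnancies + Glucose + BMI + DiabetesPedigreeFunction,
                     family = binomial(link = 'logit'), data = train)
anova(model.reduced, model, test = "Chisq") # likelihood-ratio test for the dropped terms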

3.3 Test set accuracy

fitted.results <- predict(model, newdata = test, type = 'response')
fitted.results <- ifelse(fitted.results > 0.5, 1, 0) # classify at a 0.5 threshold
misClassError <- mean(fitted.results != test$Outcome)

print(paste('Accuracy', 1 - misClassError))
## [1] "Accuracy 0.789473684210526"

4 Decision Tree

4.1 Model

library(rpart)
model2 <- rpart(Outcome ~ Pregnancies + Glucose + BMI + DiabetesPedigreeFunction, 
                data = train, 
                method="class")

plot(model2, uniform = TRUE,
     main = "Classification Tree for Diabetes")

text(model2, use.n = TRUE, all = TRUE, cex = .8)

Note: If a person’s BMI is less than 45.4 and her diabetes pedigree function is less than 0.8745, then she is more likely to have diabetes.
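The base plot()/text() rendering can be hard to read. If the rpart.plot package is available (an assumption; it is not loaded above), it draws a cleaner version of the same tree:

library(rpart.plot)
rpart.plot(model2, main = "Classification Tree for Diabetes")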

4.2 Confusion table & accuracy

treePred <- predict(model2, test, type = 'class')
table(treePred, test$Outcome) # rows: predicted class, columns: actual outcome
##         
## treePred   0   1
##        0 121  29
##        1  29  49
mean(treePred == test$Outcome)
## [1] 0.745614

Note: 1. The logistic regression model shows better performance than the decision tree (78.9% vs. 74.6% test accuracy). 2. A known disadvantage of decision trees is overfitting; generalization can often be improved by techniques such as pruning (see the sketch below).
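A minimal pruning sketch using rpart’s built-in cross-validation table: choose the complexity parameter that minimizes the cross-validated error (xerror), prune to it, and re-score the test set:

printcp(model2) # cross-validated error at each complexity-parameter value
best.cp <- model2$cptable[which.min(model2$cptable[, "xerror"]), "CP"]
model2.pruned <- prune(model2, cp = best.cp)

prunedPred <- predict(model2.pruned, test, type = 'class')
mean(prunedPred == test$Outcome) # accuracy of the pruned tree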