Read & describe data

# Load sheet 1 of the workbook into a data frame.
data <- read.xlsx("labW9.xlsx", 1)
# Show each column's type and its first few values.
str(data)
## 'data.frame':    768 obs. of  9 variables:
##  $ Pregnancies             : num  6 1 8 1 0 5 3 10 2 8 ...
##  $ Glucose                 : num  148 85 183 89 137 116 78 115 197 125 ...
##  $ BloodPressure           : num  72 66 64 66 40 74 50 0 70 96 ...
##  $ SkinThickness           : num  35 29 0 23 35 0 32 0 45 0 ...
##  $ Insulin                 : num  0 0 0 94 168 0 88 0 543 0 ...
##  $ BMI                     : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ DiabetesPedigreeFunction: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : num  50 31 32 21 33 30 26 29 53 54 ...
##  $ Outcome                 : num  1 0 1 0 1 0 1 0 1 1 ...
# Number of rows and columns.
dim(data)
## [1] 768   9
# Per-column summary statistics (min, quartiles, mean, max).
# The column names are not passed as a second argument: for a data frame that
# position is `maxsum`, which expects a number, so a character vector there
# is at best ignored.
summary(data)
##   Pregnancies        Glucose      BloodPressure    SkinThickness  
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     Insulin           BMI        DiabetesPedigreeFunction      Age       
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780           Min.   :21.00  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437           1st Qu.:24.00  
##  Median : 30.5   Median :32.00   Median :0.3725           Median :29.00  
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719           Mean   :33.24  
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262           3rd Qu.:41.00  
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200           Max.   :81.00  
##     Outcome     
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.349  
##  3rd Qu.:1.000  
##  Max.   :1.000
# Count NA cells across the whole data frame.
# NOTE(review): the summary above shows a minimum of 0 for Glucose,
# BloodPressure, SkinThickness, Insulin and BMI; these zeros may be missing
# measurements encoded as 0 rather than real values - TODO confirm.
sum(is.na(data)) # no NA values
## [1] 0

Plot data

# Pairwise plot of all columns of the data frame.
plot(data)

# Bar chart of the number of rows in each Outcome class.
ggplot(data, aes(x = factor(Outcome))) +
  geom_bar()

Distribution of numerical variables

# Names of the eight predictor columns (every column except Outcome).
num_col <- colnames(data)[1:8]

# One horizontal boxplot per predictor. Iterating over the column names with
# lapply() gives each plot its own `col` binding, so no local() workaround is
# needed, and `.data[[col]]` lets ggplot look the column up by name instead
# of embedding a copy of data[, i] in the aesthetic mapping.
dis_plot <- lapply(num_col, function(col) {
  ggplot(data, mapping = aes(x = .data[[col]])) +
    geom_boxplot() +
    xlab(col)
})

# Arrange all boxplots in a single grid.
wrap_plots(dis_plot)

Data cleaning: Remove outliers using z-score

# Absolute z-score of every cell, computed column by column.
z_scores <- as.data.frame(
  lapply(data, function(x) abs(x - mean(x)) / sd(x))
)

# Keep the ORIGINAL rows in which no column lies more than 3 standard
# deviations from its mean. The z-scores are used only as a row filter;
# subsetting z_scores itself would replace every measurement (including
# Outcome) with its z-score and corrupt all downstream analysis.
data_clean <- data[rowSums(z_scores > 3) == 0, ]

dim(data_clean)
## [1] 688   9

Visualize the relationship between each numerical variable and the outcome

# Boxplots of each predictor split by Outcome, drawn in a 2 x 4 grid.
# Columns are read from data_clean explicitly instead of via attach(), which
# would leave a copy of the columns on the search path where it can mask, or
# be masked by, other objects.
box_specs <- data.frame(
  col = c(
    "Pregnancies", "Glucose", "BloodPressure", "SkinThickness",
    "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"
  ),
  main = c(
    "No. of Pregnancies vs. Diabetes",
    "Glucose vs. Diabetes",
    "Blood Pressure vs. Diabetes",
    "Skin Thickness vs. Diabetes",
    "Insulin vs. Diabetes",
    "BMI vs. Diabetes",
    "Diabetes Pedigree Function vs. Diabetes",
    "Age vs. Diabetes"
  ),
  ylab = c(
    "Pregnancies", "Glucose", "Blood Pressure", "Skin Thickness",
    "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"
  ),
  stringsAsFactors = FALSE
)

# par() returns the previous settings, which are restored after plotting so
# that later base-graphics plots are not squeezed into one cell of the grid.
old_par <- par(mfrow = c(2, 4))
for (i in seq_len(nrow(box_specs))) {
  boxplot(
    data_clean[[box_specs$col[i]]] ~ data_clean$Outcome,
    main = box_specs$main[i],
    xlab = "Outcome",
    ylab = box_specs$ylab[i]
  )
}
par(old_par)

Correlation between variables

# Flag the numeric columns, correlate them pairwise, and draw the matrix.
numeric.var <- vapply(data_clean, is.numeric, logical(1))
corr.matrix <- cor(data_clean[, numeric.var])
ggcorrplot(corr.matrix)

Split data into training and testing

# Fix the RNG seed so the random 70/30 split (and everything fitted on it)
# is reproducible from run to run.
set.seed(123)

# Stratify on Outcome as a factor so that sampling is done within each class
# rather than within percentile groups of a numeric vector.
TrainingIndex <- createDataPartition(
  factor(data_clean$Outcome),
  p = 0.7,
  list = FALSE
)
TrainingSet <- data_clean[TrainingIndex, ] # Training Set
TestingSet <- data_clean[-TrainingIndex, ] # Test Set

Train a decision tree

# Fit a classification tree (method = "class") for Outcome on the eight
# predictors, using the training rows only.
model <- rpart(
  Outcome ~ Pregnancies + Glucose + BMI + Insulin + BloodPressure +
    SkinThickness + Age + DiabetesPedigreeFunction,
  data = TrainingSet,
  method = "class"
)

Plot the model

# Draw the fitted tree, then add text labels to its nodes.
plot(model, uniform = TRUE, main = "Classification Tree for Diabetes")
text(model, use.n = TRUE, all = TRUE, cex = 0.8)

Test the model using testing data

# Predicted class label for every row of the held-out test set.
treePred <- predict(model, newdata = TestingSet, type = "class")

Confusion matrix and accuracy of the model

# Cross-tabulate predicted class (rows) against the test-set Outcome (columns).
table(treePred, TestingSet$Outcome)
##                    
## treePred            0.731643412690454 1.3650063669598
##   0.731643412690454               110              43
##   1.3650063669598                  25              28
# Accuracy: the share of test rows whose predicted class equals Outcome.
mean(treePred == TestingSet$Outcome)
## [1] 0.6699029