Read & describe data
# Load the workbook into a data frame.
# NOTE(review): read.xlsx() comes from a package attached outside this view
# (presumably xlsx or openxlsx); the positional 1 presumably selects the first
# sheet. Confirm the package before naming that argument.
data <- read.xlsx("labW9.xlsx", 1)
# Print each column's name, type and first few values.
str(data)
## 'data.frame': 768 obs. of 9 variables:
## $ Pregnancies : num 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : num 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure : num 72 66 64 66 40 74 50 0 70 96 ...
## $ SkinThickness : num 35 29 0 23 35 0 32 0 45 0 ...
## $ Insulin : num 0 0 0 94 168 0 88 0 543 0 ...
## $ BMI : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ DiabetesPedigreeFunction: num 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : num 50 31 32 21 33 30 26 29 53 54 ...
## $ Outcome : num 1 0 1 0 1 0 1 0 1 1 ...
# Report the number of rows and columns.
dim(data)
## [1] 768 9
# Per-column summary statistics.
# The second positional argument of summary() on a data frame is `maxsum`
# (a count used when summarising factor levels), so passing the column names
# there had no meaningful effect; the call only needs the data frame.
summary(data)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## Outcome
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
# Count NA cells across the whole data frame.
# NOTE(review): the summary output shows a minimum of 0 for Glucose,
# BloodPressure, SkinThickness, Insulin and BMI. Presumably those zeros stand
# in for missing readings, which is.na() cannot detect; confirm with the data
# source before treating the data as complete.
sum(is.na(data)) # no NA value
## [1] 0
Plot data
# Pairwise plot of all columns of the data frame.
plot(data)

# Bar chart of the number of observations per Outcome value.
ggplot(data, aes(x = factor(Outcome))) +
  geom_bar()

Distribution of numerical variables
# Names of the predictor columns (everything except the Outcome label).
num_col <- setdiff(colnames(data), "Outcome")

# One horizontal box plot per predictor.
# Columns are referenced through the `.data` pronoun instead of indexing the
# data frame inside aes(); because each plot is built inside its own function
# call, the column name is captured per plot and no local() workaround is
# needed.
dis_plot <- lapply(num_col, function(col) {
  ggplot(data, mapping = aes(x = .data[[col]])) +
    geom_boxplot() +
    xlab(col)
})

# Arrange all box plots in a single grid.
wrap_plots(dis_plot)

Data cleaning: Remove outliers using z-score
# Remove outliers: drop every row in which any predictor lies more than
# `z_threshold` standard deviations from its column mean.
z_threshold <- 3
predictor_cols <- setdiff(colnames(data), "Outcome")

# Absolute z-scores of the predictors only. The Outcome label is a class
# indicator, not a measurement, so it is neither screened nor rescaled.
z_scores <- as.data.frame(abs(scale(data[predictor_cols])))

# Apply the outlier mask to the ORIGINAL data. Subsetting `z_scores` itself
# replaced every value (including Outcome) with its z-score, so all later
# plots and the model were built on standardised values instead of the data.
data_clean <- data[rowSums(z_scores > z_threshold) == 0, ]
dim(data_clean)
## [1] 688 9
Visualize the relationship between each numerical variable and the outcome
# Box plots of each predictor split by Outcome, drawn in a 2 x 4 grid.
# Columns are read directly from `data_clean` rather than via attach(), which
# was never detached and left the columns on the search path where they could
# mask or be masked by other objects.
box_vars <- c(
  "Pregnancies", "Glucose", "BloodPressure", "SkinThickness",
  "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"
)
box_titles <- c(
  "No. of Pregnancies vs. Diabetes",
  "Glucose vs. Diabetes",
  "Blood Pressure vs. Diabetes",
  "Skin Thickness vs. Diabetes",
  "Insulin vs. Diabetes",
  "BMI vs. Diabetes",
  "Diabetes Pedigree Function vs. Diabetes",
  "Age vs. Diabetes"
)
box_ylabs <- c(
  "Pregnancies", "Glucose", "Blood Pressure", "Skin Thickness",
  "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"
)

# par() returns the previous settings, so the layout can be restored afterwards
# and later plots are not forced into the 2 x 4 grid.
old_par <- par(mfrow = c(2, 4))
for (k in seq_along(box_vars)) {
  boxplot(
    data_clean[[box_vars[k]]] ~ data_clean[["Outcome"]],
    main = box_titles[k],
    xlab = "Outcome",
    ylab = box_ylabs[k]
  )
}
par(old_par)

Correlation between variables
# Flag the numeric columns, then compute their pairwise correlations.
numeric.var <- vapply(data_clean, is.numeric, logical(1))
numeric_part <- data_clean[, numeric.var]
corr.matrix <- cor(numeric_part)

# Draw the correlation matrix as a heat map.
ggcorrplot(corr.matrix)

Split data into training and testing sets
# Fix the RNG seed so the 70/30 split, and therefore the fitted tree and the
# reported accuracy, can be reproduced from run to run.
set.seed(123)

# Pass Outcome as a factor so the partition is stratified by class.
# NOTE(review): per the caret documentation a numeric y is binned into
# quantile groups instead; confirm against the installed caret version.
TrainingIndex <- createDataPartition(
  factor(data_clean$Outcome),
  p = 0.7,
  list = FALSE
)
TrainingSet <- data_clean[TrainingIndex, ] # Training Set
TestingSet <- data_clean[-TrainingIndex, ] # Test Set
Train a decision tree
# Fit a classification tree predicting Outcome from all eight predictors,
# using the training rows only.
model <- rpart(
  formula = Outcome ~ Pregnancies + Glucose + BMI + Insulin +
    BloodPressure + SkinThickness + Age + DiabetesPedigreeFunction,
  data = TrainingSet,
  method = "class"
)
Plot the model
# Draw the fitted tree with uniform vertical spacing between levels.
plot(
  model,
  uniform = TRUE,
  main = "Classification Tree for Diabetes"
)
# Label every node (all = TRUE) and include observation counts (use.n = TRUE).
text(
  model,
  use.n = TRUE,
  all = TRUE,
  cex = 0.8
)

Test the model using testing data
# Predict the class label of every row in the held-out test set.
treePred <- predict(
  model,
  newdata = TestingSet,
  type = "class"
)
Confusion matrix and accuracy of the model
# Cross-tabulate predicted classes (rows) against the observed Outcome
# values (columns).
table(treePred, TestingSet[["Outcome"]])
##
## treePred 0.731643412690454 1.3650063669598
## 0.731643412690454 110 43
## 1.3650063669598 25 28
# Accuracy: the share of test rows whose predicted class equals the observed
# Outcome.
mean(treePred == TestingSet[["Outcome"]])
## [1] 0.6699029