Libraries to load

library(ggplot2)
library(tidyverse)
library(mlbench)
library(gridExtra)
library(grid)
library(MASS)
library(e1071)
library(caret)
library(VIM)
library(car)
library(formatR)

load data:

data(PimaIndiansDiabetes2, package = "mlbench")

1 IDA Pima Indians Diabetes data

dim(PimaIndiansDiabetes2)
## [1] 768   9
head(PimaIndiansDiabetes2)
##   pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1        6     148       72      35      NA 33.6    0.627  50      pos
## 2        1      85       66      29      NA 26.6    0.351  31      neg
## 3        8     183       64      NA      NA 23.3    0.672  32      pos
## 4        1      89       66      23      94 28.1    0.167  21      neg
## 5        0     137       40      35     168 43.1    2.288  33      pos
## 6        5     116       74      NA      NA 25.6    0.201  30      neg
levels(PimaIndiansDiabetes2$diabetes)
## [1] "neg" "pos"

sum(is.na(PimaIndiansDiabetes2))
## [1] 652
aggr(PimaIndiansDiabetes2, prop = F, numbers = T)

We have 768 cases in PimaIndiansDiabetes2 dataset. But we will only use 392 complete cases and disgard 376 cases with missing data.

diabetes_data <- na.omit(PimaIndiansDiabetes2)
dim(diabetes_data)
## [1] 392   9

2 Logistic Regression

logistic.model <- glm(diabetes ~ ., data = diabetes_data, family = binomial(link = "logit"))
summary(logistic.model)
## 
## Call:
## glm(formula = diabetes ~ ., family = binomial(link = "logit"), 
##     data = diabetes_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7823  -0.6603  -0.3642   0.6409   2.5612  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.004e+01  1.218e+00  -8.246  < 2e-16 ***
## pregnant     8.216e-02  5.543e-02   1.482  0.13825    
## glucose      3.827e-02  5.768e-03   6.635 3.24e-11 ***
## pressure    -1.420e-03  1.183e-02  -0.120  0.90446    
## triceps      1.122e-02  1.708e-02   0.657  0.51128    
## insulin     -8.253e-04  1.306e-03  -0.632  0.52757    
## mass         7.054e-02  2.734e-02   2.580  0.00989 ** 
## pedigree     1.141e+00  4.274e-01   2.669  0.00760 ** 
## age          3.395e-02  1.838e-02   1.847  0.06474 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 498.10  on 391  degrees of freedom
## Residual deviance: 344.02  on 383  degrees of freedom
## AIC: 362.02
## 
## Number of Fisher Scoring iterations: 5

Here we see glucose*** is the most informative to explain the diabetes class.

2.1 Logistic regression : compute the accuracy

pred.classes <- ifelse(predict(logistic.model) > 0, "pos", "neg")
# calculate classification accuracy (in percentage %)
mean(pred.classes == diabetes_data$diabetes) * 100
## [1] 78.31633

So the accuracy rate is 78.31633.

3 Classifier comparisons

3.1 Partition the data

set.seed(555)
index <- createDataPartition(diabetes_data$diabetes, p = 0.75, list = FALSE)
train_diabetes <- diabetes_data[index, ]
test_diabetes <- diabetes_data[-index, ]
dim(train_diabetes)
## [1] 295   9
dim(test_diabetes)
## [1] 97  9

3.2 Train classifiers

logistic regression

logisticReg <- train(diabetes ~ ., data = train_diabetes, method = "glm", trControl = trainControl(method = "repeatedcv", 
    repeats = 5))
# logisticReg$results
# accuracy for train data
logisticReg_train.pred <- predict(logisticReg, newdata = train_diabetes)
accuracy_logisticReg_train <- confusionMatrix(data = logisticReg_train.pred, reference = train_diabetes$diabetes)$overall["Accuracy"]

LDA

lda <- train(diabetes ~ ., data = train_diabetes, method = "lda", trControl = trainControl(method = "repeatedcv", 
    repeats = 5))
# lda$results
# accuracy for train data
lda_train.pred <- predict(lda, newdata = train_diabetes)
accuracy_lda_train <- confusionMatrix(data = lda_train.pred, reference = train_diabetes$diabetes)$overall["Accuracy"]

KNN

knn <- train(diabetes ~ ., data = train_diabetes, method = "knn", trControl = trainControl(method = "repeatedcv", 
    repeats = 5))

# knn$results
# accuracy for train data
knn_train.pred <- predict(knn, newdata = train_diabetes)
accuracy_knn_train <- confusionMatrix(data = knn_train.pred, reference = train_diabetes$diabetes)$overall["Accuracy"]

SVM

svm <- train(diabetes ~ ., data = train_diabetes, method = "svmLinearWeights", trControl = trainControl(method = "repeatedcv", 
    repeats = 5))
# svm$results
# accuracy for train data
svm_train.pred <- predict(svm, newdata = train_diabetes)
accuracy_svm_train <- confusionMatrix(data = svm_train.pred, reference = train_diabetes$diabetes)$overall["Accuracy"]

So the accuracy rate for test data set is :

model_name <- c("logistic regression", "LDA", "KNN", "SVM")
accuracy <- c(round(accuracy_logisticReg_train, 5) * 100, round(accuracy_lda_train, 
    5) * 100, round(accuracy_knn_train, 5) * 100, round(accuracy_svm_train, 5) * 
    100)

accuracy_combine <- data.frame(model_name, accuracy)
knitr::kable(accuracy_combine)
model_name accuracy
logistic regression 78.983
LDA 78.644
KNN 78.644
SVM 77.966

3.3 Assess on Test data

The accuracy for test set: 0.8041237(SVM) > 0.7938144(logistic regression) = 0.7938144(LDA) > 0.7628866(KNN)

logistic regression

logisticReg.pred <- predict(logisticReg, newdata = test_diabetes)
head(logisticReg.pred)
## [1] pos neg pos neg pos neg
## Levels: neg pos
confusionMatrix(data = logisticReg.pred, reference = test_diabetes$diabetes)$overall["Accuracy"]
##  Accuracy 
## 0.7938144

LDA

lda.pred <- predict(lda, newdata = test_diabetes)
head(lda.pred)
## [1] pos neg pos neg pos neg
## Levels: neg pos
confusionMatrix(data = lda.pred, reference = test_diabetes$diabetes)$overall["Accuracy"]
##  Accuracy 
## 0.7938144

KNN

knn.pred <- predict(knn, newdata = test_diabetes)
head(knn.pred)
## [1] neg neg pos neg pos neg
## Levels: neg pos
confusionMatrix(data = knn.pred, reference = test_diabetes$diabetes)$overall["Accuracy"]
##  Accuracy 
## 0.7628866

SVM

svm.pred <- predict(svm, newdata = test_diabetes)
head(svm.pred)
## [1] pos neg pos neg pos neg
## Levels: neg pos
confusionMatrix(data = svm.pred, reference = test_diabetes$diabetes)$overall["Accuracy"]
##  Accuracy 
## 0.8041237

4 Visualize the boundaries created by the classifiers

Model construct as follows:

4.1 Linear decision boundaries

Linear decision boundaries for logistic regression

lgm.model <- glm(diabetes ~ glucose + mass, data = diabetes_data, family = binomial(link = "logit"))
logit.coeff = coefficients(lgm.model)
logit.slope <- -logit.coeff["glucose"]/logit.coeff["mass"]
logit.intercept <- -logit.coeff["(Intercept)"]/logit.coeff["mass"]

plot(x = diabetes_data$glucose, y = diabetes_data$mass, col = c("red", "blue")[diabetes_data$diabetes], 
    ylim = c(15, 70), xlim = c(50, 200), main = "Linear decision boundaries for logistic regression", 
    xlab = "glucose", ylab = "mass", pch = 20)
abline(logit.intercept, logit.slope, lty = "dotted")

Linear decision boundaries for linear SVM

svm.model <- svm(diabetes ~ glucose + mass, data = diabetes_data, kernel = "linear", 
    type = "C-classification", scale = FALSE)

dat <- expand.grid(glucose = seq(50, 200, by = 1), mass = seq(15, 70, by = 1))
prediction2 <- predict(svm.model, dat)
plot(x = diabetes_data$glucose, y = diabetes_data$mass, col = c("red", "blue")[diabetes_data$diabetes], 
    ylim = c(15, 70), xlim = c(50, 200), main = "Linear decision boundaries for linear SVM", 
    xlab = "glucose", ylab = "mass", pch = 20)
points(svm.model$SV, col = "orange", cex = 2)


w <- t(svm.model$coefs) %*% svm.model$SV
beta_0 <- -svm.model$rho
abline(a = -beta_0/w[1, 2], b = -w[1, 1]/w[1, 2], col = "black", lty = 1)
abline(a = (-beta_0 - 1)/w[1, 2], b = -w[1, 1]/w[1, 2], col = "red", lty = 3)
abline(a = (-beta_0 + 1)/w[1, 2], b = -w[1, 1]/w[1, 2], col = "red", lty = 3)

4.2 Nonlinear decision boundaries

Nonlinear decision boundaries for kNN

knn.model <- train(diabetes ~ glucose + mass, data = diabetes_data, method = "knn", 
    trControl = trainControl(method = "repeatedcv", repeats = 5))
prediction3 <- predict(knn.model, dat)

plot(x = diabetes_data$glucose, y = diabetes_data$mass, ylim = c(15, 70), xlim = c(50, 
    200), type = "n", main = "Nonlinear decision boundaries for KNN", xlab = "glucose", 
    ylab = "mass", pch = 20)
points(dat, col = ifelse(prediction3 == "neg", "red", "blue"), cex = 0.3, pch = 16)
points(x = diabetes_data$glucose, y = diabetes_data$mass, col = c("red", "blue")[diabetes_data$diabetes], 
    pch = 16)

Nonlinear decision boundaries for SVM


svm.model2 <- svm(diabetes ~ glucose + mass, data = diabetes_data, kernel = "radial", 
    type = "C-classification")
prediction4 <- predict(svm.model2, dat)

plot(x = diabetes_data$glucose, y = diabetes_data$mass, ylim = c(15, 70), xlim = c(50, 
    200), type = "n", main = "Nonlinear decision boundaries for SVM", xlab = "glucose", 
    ylab = "mass", pch = 20)
points(dat, col = ifelse(prediction4 == "neg", "red", "blue"), cex = 0.3, pch = 16)
points(x = diabetes_data$glucose, y = diabetes_data$mass, col = c("red", "blue")[diabetes_data$diabetes], 
    pch = 16)

APP. Student Info

Course: STAT5003_Computational Statistical Methods
Assignment: Lab Week 5
Student Name: Yujun Yao(June Yao)
SID: 500316995
Email: