Assignment_week05_June Yao_500316995
Libraries to load
library(ggplot2)
library(tidyverse)
library(mlbench)
library(gridExtra)
library(grid)
library(MASS)
library(e1071)
library(caret)
library(VIM)
library(car)
library(formatR)
Load data:
1 IDA (Initial Data Analysis): Pima Indians Diabetes data
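The chunk that loads the data is not echoed in the output above; a minimal sketch, assuming the dataset comes from the mlbench package already attached in the library list:
data(PimaIndiansDiabetes2, package = "mlbench")  # Pima data, with impossible zero values coded as NA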
dim(PimaIndiansDiabetes2)
## [1] 768 9
head(PimaIndiansDiabetes2)
## pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1 6 148 72 35 NA 33.6 0.627 50 pos
## 2 1 85 66 29 NA 26.6 0.351 31 neg
## 3 8 183 64 NA NA 23.3 0.672 32 pos
## 4 1 89 66 23 94 28.1 0.167 21 neg
## 5 0 137 40 35 168 43.1 2.288 33 pos
## 6 5 116 74 NA NA 25.6 0.201 30 neg
levels(PimaIndiansDiabetes2$diabetes)
## [1] "neg" "pos"
sum(is.na(PimaIndiansDiabetes2))
## [1] 652
aggr(PimaIndiansDiabetes2, prop = F, numbers = T)
The PimaIndiansDiabetes2 dataset contains 768 cases, but we only use the 392 complete cases and discard the 376 cases with missing data.
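The chunk that builds the complete-case subset is not echoed; a minimal sketch, assuming the complete cases are stored in the diabetes_data object used in the following sections:
# Keep only the complete cases (392 rows, as noted above) for modelling
diabetes_data <- na.omit(PimaIndiansDiabetes2)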
2 Logistic Regression
logistic.model <- glm(diabetes ~ ., data = diabetes_data, family = binomial(link = "logit"))
summary(logistic.model)
##
## Call:
## glm(formula = diabetes ~ ., family = binomial(link = "logit"),
## data = diabetes_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7823 -0.6603 -0.3642 0.6409 2.5612
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.004e+01 1.218e+00 -8.246 < 2e-16 ***
## pregnant 8.216e-02 5.543e-02 1.482 0.13825
## glucose 3.827e-02 5.768e-03 6.635 3.24e-11 ***
## pressure -1.420e-03 1.183e-02 -0.120 0.90446
## triceps 1.122e-02 1.708e-02 0.657 0.51128
## insulin -8.253e-04 1.306e-03 -0.632 0.52757
## mass 7.054e-02 2.734e-02 2.580 0.00989 **
## pedigree 1.141e+00 4.274e-01 2.669 0.00760 **
## age 3.395e-02 1.838e-02 1.847 0.06474 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 498.10 on 391 degrees of freedom
## Residual deviance: 344.02 on 383 degrees of freedom
## AIC: 362.02
##
## Number of Fisher Scoring iterations: 5
Here we see that glucose (***) is the most informative predictor of the diabetes class; mass and pedigree are also significant at the 0.01 level.
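One way to back this up (a minimal sketch using the fitted model above) is to sort the coefficient table by its Wald-test p-values:
# Rank predictors by p-value; the smallest p-value flags the most informative term
coef_table <- summary(logistic.model)$coefficients
coef_table[order(coef_table[, "Pr(>|z|)"]), ]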
3 Classifier comparisons
3.1 Partition the data
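The partitioning chunk is not echoed; a minimal sketch of a stratified split into the train_diabetes and test_diabetes objects used below (the seed and the 75/25 split proportion are assumptions):
set.seed(5003)  # assumed seed; the original value is not shown
inTrain <- createDataPartition(diabetes_data$diabetes, p = 0.75, list = FALSE)
train_diabetes <- diabetes_data[inTrain, ]
test_diabetes <- diabetes_data[-inTrain, ]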
3.2 Train classifiers
logistic regression
LDA
KNN
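The training chunks for logistic regression, LDA and KNN are not echoed; a minimal sketch of how they could be fitted with caret, mirroring the SVM chunk below (the model object names ctrl, logisticReg, lda_fit and knn_fit are placeholders; only the accuracy_*_train names are taken from the code further down):
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
# Logistic regression (diabetes is a factor, so caret fits a binomial glm)
logisticReg <- train(diabetes ~ ., data = train_diabetes, method = "glm", trControl = ctrl)
logisticReg_train.pred <- predict(logisticReg, newdata = train_diabetes)
accuracy_logisticReg_train <- confusionMatrix(data = logisticReg_train.pred,
    reference = train_diabetes$diabetes)$overall["Accuracy"]
# Linear discriminant analysis
lda_fit <- train(diabetes ~ ., data = train_diabetes, method = "lda", trControl = ctrl)
lda_train.pred <- predict(lda_fit, newdata = train_diabetes)
accuracy_lda_train <- confusionMatrix(data = lda_train.pred,
    reference = train_diabetes$diabetes)$overall["Accuracy"]
# k-nearest neighbours (k tuned by repeated cross-validation)
knn_fit <- train(diabetes ~ ., data = train_diabetes, method = "knn", trControl = ctrl)
knn_train.pred <- predict(knn_fit, newdata = train_diabetes)
accuracy_knn_train <- confusionMatrix(data = knn_train.pred,
    reference = train_diabetes$diabetes)$overall["Accuracy"]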
SVM
svm <- train(diabetes ~ ., data = train_diabetes, method = "svmLinearWeights", trControl = trainControl(method = "repeatedcv",
repeats = 5))
# svm$results
# accuracy for train data
svm_train.pred <- predict(svm, newdata = train_diabetes)
accuracy_svm_train <- confusionMatrix(data = svm_train.pred, reference = train_diabetes$diabetes)$overall["Accuracy"]
So the accuracy rates on the training data set are:
model_name <- c("logistic regression", "LDA", "KNN", "SVM")
accuracy <- c(round(accuracy_logisticReg_train, 5) * 100, round(accuracy_lda_train,
5) * 100, round(accuracy_knn_train, 5) * 100, round(accuracy_svm_train, 5) *
100)
accuracy_combine <- data.frame(model_name, accuracy)
knitr::kable(accuracy_combine)
| model_name | accuracy |
|---|---|
| logistic regression | 78.983 |
| LDA | 78.644 |
| KNN | 78.644 |
| SVM | 77.966 |
3.3 Assess on Test data
The accuracies on the test set are: 0.8041237 (SVM) > 0.7938144 (logistic regression) = 0.7938144 (LDA) > 0.7628866 (KNN); a sketch of how these are computed follows the model headings below.
logistic regression
LDA
KNN
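The per-model test-set chunks are not echoed; a minimal sketch that evaluates each fitted model on the held-out data, assuming the model objects from the Section 3.2 sketch (logisticReg, lda_fit, knn_fit) together with the svm object above and the test_diabetes split:
# Accuracy on the test data, computed the same way as the training accuracy
test_models <- list(logistic = logisticReg, LDA = lda_fit, KNN = knn_fit, SVM = svm)
sapply(test_models, function(fit) {
    pred <- predict(fit, newdata = test_diabetes)
    confusionMatrix(data = pred, reference = test_diabetes$diabetes)$overall["Accuracy"]
})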
4 Visualize the boundaries created by the classifiers
The models are constructed as follows, using glucose and mass as the two predictor variables:
4.1 Linear decision boundaries
Linear decision boundaries for logistic regression
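The dotted boundary is the set of points where the fitted probability equals 0.5, i.e. where b0 + b1 * glucose + b2 * mass = 0, which rearranges to mass = -(b0 + b1 * glucose) / b2; the intercept and slope computed below follow directly from this.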
lgm.model <- glm(diabetes ~ glucose + mass, data = diabetes_data, family = binomial(link = "logit"))
logit.coeff = coefficients(lgm.model)
logit.slope <- -logit.coeff["glucose"]/logit.coeff["mass"]
logit.intercept <- -logit.coeff["(Intercept)"]/logit.coeff["mass"]
plot(x = diabetes_data$glucose, y = diabetes_data$mass, col = c("red", "blue")[diabetes_data$diabetes],
ylim = c(15, 70), xlim = c(50, 200), main = "Linear decision boundaries for logistic regression",
xlab = "glucose", ylab = "mass", pch = 20)
abline(logit.intercept, logit.slope, lty = "dotted")
Linear decision boundaries for linear SVM
svm.model <- svm(diabetes ~ glucose + mass, data = diabetes_data, kernel = "linear",
type = "C-classification", scale = FALSE)
dat <- expand.grid(glucose = seq(50, 200, by = 1), mass = seq(15, 70, by = 1))
prediction2 <- predict(svm.model, dat)
plot(x = diabetes_data$glucose, y = diabetes_data$mass, col = c("red", "blue")[diabetes_data$diabetes],
ylim = c(15, 70), xlim = c(50, 200), main = "Linear decision boundaries for linear SVM",
xlab = "glucose", ylab = "mass", pch = 20)
points(svm.model$SV, col = "orange", cex = 2)
w <- t(svm.model$coefs) %*% svm.model$SV
beta_0 <- -svm.model$rho
abline(a = -beta_0/w[1, 2], b = -w[1, 1]/w[1, 2], col = "black", lty = 1)
abline(a = (-beta_0 - 1)/w[1, 2], b = -w[1, 1]/w[1, 2], col = "red", lty = 3)
abline(a = (-beta_0 + 1)/w[1, 2], b = -w[1, 1]/w[1, 2], col = "red", lty = 3)
The solid black line is the separating hyperplane w.x - rho = 0, with w recovered from the support vectors (highlighted in orange), and the dotted red lines are the margin boundaries w.x - rho = +1 and -1.
4.2 Nonlinear decision boundaries
Nonlinear decision boundaries for kNN
knn.model <- train(diabetes ~ glucose + mass, data = diabetes_data, method = "knn",
trControl = trainControl(method = "repeatedcv", repeats = 5))
prediction3 <- predict(knn.model, dat)
plot(x = diabetes_data$glucose, y = diabetes_data$mass, ylim = c(15, 70), xlim = c(50,
200), type = "n", main = "Nonlinear decision boundaries for KNN", xlab = "glucose",
ylab = "mass", pch = 20)
points(dat, col = ifelse(prediction3 == "neg", "red", "blue"), cex = 0.3, pch = 16)
points(x = diabetes_data$glucose, y = diabetes_data$mass, col = c("red", "blue")[diabetes_data$diabetes],
pch = 16)
Nonlinear decision boundaries for SVM
svm.model2 <- svm(diabetes ~ glucose + mass, data = diabetes_data, kernel = "radial",
type = "C-classification")
prediction4 <- predict(svm.model2, dat)
plot(x = diabetes_data$glucose, y = diabetes_data$mass, ylim = c(15, 70), xlim = c(50,
200), type = "n", main = "Nonlinear decision boundaries for SVM", xlab = "glucose",
ylab = "mass", pch = 20)
points(dat, col = ifelse(prediction4 == "neg", "red", "blue"), cex = 0.3, pch = 16)
points(x = diabetes_data$glucose, y = diabetes_data$mass, col = c("red", "blue")[diabetes_data$diabetes],
pch = 16)
APP. Student Info
Course: STAT5003_Computational Statistical Methods
Assignment: Lab Week 5
Student Name: Yujun Yao (June Yao)
SID: 500316995
Email: yyao2983@uni.sydney.edu.au