##Part1:
library(MASS)
train <- rbind(Pima.tr, Pima.tr2)
test <- Pima.te
train$type <- as.integer(train$type) - 1L
test$type <- as.integer(test$type) - 1L
# Print out the missing value
sapply(train, function(x) sum(is.na(x)))
## npreg glu bp skin bmi ped age type
## 0 0 13 98 3 0 0 0
##Part2:
library(GGally)
## Warning: package 'GGally' was built under R version 3.6.3
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
#Plot a scatterplot matrix between all the explanatory variables
pairs(subset(train, select = -c(type)), col = as.factor(train$type))
#Use jitter to make graph more informative
library(ggplot2)
ggplot(train, aes(x = age, y = type)) +
geom_jitter(width = 0.5, height = 0.03, alpha = .2) +
geom_smooth(method = "glm", se = FALSE,
method.args = list(family = "binomial")) +
labs(y = expression(hat(P)(Diabetic)))
##Part3: #Print out the coefficients and p-value.
library(broom)
## Warning: package 'broom' was built under R version 3.6.3
lg1 <- glm(type ~ age + bmi, data = train, family = binomial)
tidy(lg1)
## # A tibble: 3 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -5.80 0.687 -8.45 3.02e-17
## 2 age 0.0538 0.00904 5.95 2.63e- 9
## 3 bmi 0.103 0.0173 5.93 2.97e- 9
##Part 4 #test the model fitted?
lgs_fun <- function(par, x) {
1 / (1 + exp(-x %*% par))
}
lgs_fun(lg1$coefficients, c(1, 35, 32))
## [,1]
## [1,] 0.3470908
lgs_fun(lg1$coefficients, c(1, 35, 22))
## [,1]
## [1,] 0.1600962
##Part 5 #Calculate the odds
exp(predict(lg1, response = "link", newdata = data.frame(age = 55, bmi = 37)))
## 1
## 2.605789
##Part 6 #Remove the missing value first and then calculate
cm1 <- table(train$type[!is.na(train$bmi)], predict(lg1, type = "response") >= 0.5)
cm1
##
## FALSE TRUE
## 0 276 48
## 1 114 59
# Predicition Accurancy
sum(diag(cm1)) / sum(cm1)
## [1] 0.6740443
##Part 7 # prediciton accurancy in Test set
sapply(test, function(x) sum(is.na(x)))
## npreg glu bp skin bmi ped age type
## 0 0 0 0 0 0 0 0
library(GGally)
test_pred <- predict(lg1, type = "response", newdata = test)
test_predm <- lgs_fun(lg1$coefficients, as.matrix(cbind(1, test[, c("age", "bmi")])))
cm1_test <- table(test$type,
test_pred >= 0.5)
cm1_test
##
## FALSE TRUE
## 0 195 28
## 1 67 42
sum(diag(cm1_test)) / sum(cm1_test)
## [1] 0.7138554
cm2_test = table(predicted = as.numeric(test_pred >= 0.5), actual = test$type)
library(caret)
train_con_mat = confusionMatrix(cm2_test)
c(train_con_mat$overall["Accuracy"])
## Accuracy
## 0.7138554
##Part8 #Draw ROC and calculate AUC
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.6.3
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.6.3
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(gplots)
# first install.packages("ROCR")
# test the accuracy sensitivity, spcificity
c(train_con_mat$byClass["Sensitivity"],
train_con_mat$byClass["Specificity"])
## Sensitivity Specificity
## 0.8744395 0.3853211
# ROC curve
predROCR <- prediction(test_predm, test$type)
perfROCR <- performance(predROCR, "tpr", "fpr")
plot(perfROCR, colorize = TRUE)
# Calculate the AUC
performance(predROCR, "auc")@y.values
## [[1]]
## [1] 0.7558522