Title: question 1 of final exam
# part1
library(MASS)
library(ggplot2)
train <- rbind(Pima.tr, Pima.tr2)
test <- Pima.te
train$type <- as.integer(train$type) - 1L
test$type <- as.integer(test$type) - 1L
# Missing values?
sapply(train, function(x) sum(is.na(x)))
## npreg glu bp skin bmi ped age type
## 0 0 13 98 3 0 0 0
#It shows bp(13) and skin(98) have missing values.
# part2
library(GGally)
## Warning: package 'GGally' was built under R version 3.6.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
pairs(subset(train, select = -c(type)), col = as.factor(train$type))
##from the matrix, skin has a positive linear relation with bmi.
library(ggplot2)
ggplot(train, aes(x = age, y = type)) +
geom_jitter(width = 0.5, height = 0.03, alpha = .2) +
geom_smooth(method = "glm", se = FALSE,
method.args = list(family = "binomial")) +
labs(y = expression(hat(P)(Diabetic)))
#After plotting type as a function of age,I observed there is also a positive linear relation with type.
library(broom)
## Warning: package 'broom' was built under R version 3.6.3
lg1 <- glm(type ~ age + bmi, data = train, family = binomial)
tidy(lg1)
## # A tibble: 3 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -5.80 0.687 -8.45 3.02e-17
## 2 age 0.0538 0.00904 5.95 2.63e- 9
## 3 bmi 0.103 0.0173 5.93 2.97e- 9
#coefficients of age is 0.05383783 and bmi is 0.10256607.
#p-value of age is 2.628863e-09.p-value of bmi is 2.972812e-09.
lgs_fun <- function(par, x) {
1 / (1 + exp(-x %*% par)) # x %*% par is linear algebra equivalent of b_0 + b_1*age + b2*bmi
}
lgs_fun(lg1$coefficients, c(1, 35, 32))
## [,1]
## [1,] 0.3470908
lgs_fun(lg1$coefficients, c(1, 35, 22))
## [,1]
## [1,] 0.1600962
#when bmi is 32,predict value is 0.3470908.
# when bmi is 22,predict value is 0.1600962.
exp(predict(lg1, response = "link", newdata = data.frame(age = 55, bmi = 37)))
## 1
## 2.605789
#odds that a woman in our sample is diabetic given age 55 and a bmi 37 is 2.605789.
cm1 <- table(train$type[!is.na(train$bmi)], predict(lg1, type = "response") >= 0.5)
cm1
##
## FALSE TRUE
## 0 276 48
## 1 114 59
# Prediction accuracy
sum(diag(cm1)) / sum(cm1)
## [1] 0.6740443
#Since the bmi has the missing value, I only consider the not missing value. Prediction accuracy is 0.6740443.
#to check if dataset has missing value
sapply(test, function(x) sum(is.na(x)))
## npreg glu bp skin bmi ped age type
## 0 0 0 0 0 0 0 0
#predition
test_pred <- predict(lg1, type = "response", newdata = test)
test_predm <- lgs_fun(lg1$coefficients, as.matrix(cbind(1, test[, c("age", "bmi")])))
cm1_test <- table(test$type,test_pred >= 0.5)
cm1_test
##
## FALSE TRUE
## 0 195 28
## 1 67 42
#get the prediction accuarcy
# Prediction accuracy
cm2_test = table(predicted = as.numeric(test_pred >= 0.5), actual = test$type)
library(caret)
## Warning: package 'caret' was built under R version 3.6.3
## Loading required package: lattice
train_con_mat = confusionMatrix(cm2_test)
c(train_con_mat$overall["Accuracy"],
train_con_mat$byClass["Sensitivity"],
train_con_mat$byClass["Specificity"])
## Accuracy Sensitivity Specificity
## 0.7138554 0.8744395 0.3853211
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.6.3
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
# ROC curve
predROCR <- prediction(test_predm, test$type)
perfROCR <- performance(predROCR, "tpr", "fpr")
plot(perfROCR, colorize = TRUE)
performance(predROCR, "auc")@y.values
## [[1]]
## [1] 0.7558522
#AUC score:0.7558522