Title: question 1 of final exam

part1:

# part1
library(MASS)
library(ggplot2)
train <- rbind(Pima.tr, Pima.tr2)
test  <- Pima.te
train$type <- as.integer(train$type) - 1L
test$type <- as.integer(test$type) - 1L
# Missing values?
sapply(train, function(x) sum(is.na(x)))
## npreg   glu    bp  skin   bmi   ped   age  type 
##     0     0    13    98     3     0     0     0
#It shows bp(13) and skin(98) have missing values.
# part2

library(GGally)
## Warning: package 'GGally' was built under R version 3.6.3
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
pairs(subset(train, select = -c(type)), col = as.factor(train$type))

##from the matrix, skin has a positive linear relation with bmi.
library(ggplot2)
ggplot(train, aes(x = age, y = type)) +
  geom_jitter(width = 0.5, height = 0.03, alpha = .2) +
  geom_smooth(method = "glm", se = FALSE,
              method.args = list(family = "binomial")) +
  labs(y = expression(hat(P)(Diabetic)))

#After plotting type as a function of age,I observed there is also a positive linear relation with type.

part 3:

library(broom)
## Warning: package 'broom' was built under R version 3.6.3
lg1 <- glm(type ~ age + bmi, data = train, family = binomial)
tidy(lg1)
## # A tibble: 3 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)  -5.80     0.687       -8.45 3.02e-17
## 2 age           0.0538   0.00904      5.95 2.63e- 9
## 3 bmi           0.103    0.0173       5.93 2.97e- 9
#coefficients of age is 0.05383783 and bmi is 0.10256607.
#p-value of age is 2.628863e-09.p-value of bmi is 2.972812e-09.

part 4:

lgs_fun   <- function(par, x) {
            1 / (1 + exp(-x %*% par)) # x %*% par is linear algebra equivalent of b_0 + b_1*age + b2*bmi
    }
lgs_fun(lg1$coefficients, c(1, 35, 32))
##           [,1]
## [1,] 0.3470908
lgs_fun(lg1$coefficients, c(1, 35, 22))
##           [,1]
## [1,] 0.1600962
#when bmi is 32,predict value is 0.3470908.
# when bmi is 22,predict value is 0.1600962.

part 5:

exp(predict(lg1, response = "link", newdata = data.frame(age = 55, bmi = 37)))
##        1 
## 2.605789
#odds that a woman in our sample is diabetic given age 55 and a bmi 37 is 2.605789.

part 6:

cm1 <- table(train$type[!is.na(train$bmi)], predict(lg1, type = "response") >= 0.5)
cm1
##    
##     FALSE TRUE
##   0   276   48
##   1   114   59
# Prediction accuracy
sum(diag(cm1)) / sum(cm1)
## [1] 0.6740443
#Since the bmi has the missing value, I only consider the not missing value. Prediction accuracy is 0.6740443.

part 7:

#to check if dataset has missing value
sapply(test, function(x) sum(is.na(x))) 
## npreg   glu    bp  skin   bmi   ped   age  type 
##     0     0     0     0     0     0     0     0
#predition
test_pred <- predict(lg1, type = "response", newdata = test)
test_predm <- lgs_fun(lg1$coefficients, as.matrix(cbind(1, test[, c("age", "bmi")])))
cm1_test <- table(test$type,test_pred >= 0.5)
cm1_test
##    
##     FALSE TRUE
##   0   195   28
##   1    67   42
#get the prediction accuarcy
# Prediction accuracy

cm2_test = table(predicted = as.numeric(test_pred >= 0.5), actual = test$type)
library(caret)
## Warning: package 'caret' was built under R version 3.6.3
## Loading required package: lattice
train_con_mat = confusionMatrix(cm2_test)
c(train_con_mat$overall["Accuracy"], 
  train_con_mat$byClass["Sensitivity"], 
  train_con_mat$byClass["Specificity"])
##    Accuracy Sensitivity Specificity 
##   0.7138554   0.8744395   0.3853211

part 8:

library(ROCR) 
## Warning: package 'ROCR' was built under R version 3.6.3
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
# ROC curve


predROCR <- prediction(test_predm, test$type)
perfROCR <- performance(predROCR, "tpr", "fpr")
plot(perfROCR, colorize = TRUE)

performance(predROCR, "auc")@y.values
## [[1]]
## [1] 0.7558522
#AUC score:0.7558522