##Part1:

library(MASS)
train <- rbind(Pima.tr, Pima.tr2)
test  <- Pima.te
train$type <- as.integer(train$type) - 1L
test$type <- as.integer(test$type) - 1L
# Print out the missing value
sapply(train, function(x) sum(is.na(x)))
## npreg   glu    bp  skin   bmi   ped   age  type 
##     0     0    13    98     3     0     0     0

##Part2:

library(GGally)
## Warning: package 'GGally' was built under R version 3.6.3
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
#Plot a scatterplot matrix between all the explanatory variables
pairs(subset(train, select = -c(type)), col = as.factor(train$type))

#Use jitter to make graph more informative
library(ggplot2)
ggplot(train, aes(x = age, y = type)) +
  geom_jitter(width = 0.5, height = 0.03, alpha = .2) +
  geom_smooth(method = "glm", se = FALSE,
              method.args = list(family = "binomial")) +
  labs(y = expression(hat(P)(Diabetic)))

##Part3: #Print out the coefficients and p-value.

library(broom)
## Warning: package 'broom' was built under R version 3.6.3
lg1 <- glm(type ~ age + bmi, data = train, family = binomial)
tidy(lg1)
## # A tibble: 3 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)  -5.80     0.687       -8.45 3.02e-17
## 2 age           0.0538   0.00904      5.95 2.63e- 9
## 3 bmi           0.103    0.0173       5.93 2.97e- 9

##Part 4 #test the model fitted?

lgs_fun   <- function(par, x) {
  1 / (1 + exp(-x %*% par)) 
}
lgs_fun(lg1$coefficients, c(1, 35, 32))
##           [,1]
## [1,] 0.3470908
lgs_fun(lg1$coefficients, c(1, 35, 22))
##           [,1]
## [1,] 0.1600962

##Part 5 #Calculate the odds

exp(predict(lg1, response = "link", newdata = data.frame(age = 55, bmi = 37)))
##        1 
## 2.605789

##Part 6 #Remove the missing value first and then calculate

cm1 <- table(train$type[!is.na(train$bmi)], predict(lg1, type = "response") >= 0.5)
cm1
##    
##     FALSE TRUE
##   0   276   48
##   1   114   59
# Predicition Accurancy
sum(diag(cm1)) / sum(cm1)
## [1] 0.6740443

##Part 7 # prediciton accurancy in Test set

sapply(test, function(x) sum(is.na(x))) 
## npreg   glu    bp  skin   bmi   ped   age  type 
##     0     0     0     0     0     0     0     0
library(GGally)
test_pred <- predict(lg1, type = "response", newdata = test)
test_predm <- lgs_fun(lg1$coefficients, as.matrix(cbind(1, test[, c("age", "bmi")])))
cm1_test <- table(test$type,
                  test_pred >= 0.5)
cm1_test
##    
##     FALSE TRUE
##   0   195   28
##   1    67   42
sum(diag(cm1_test)) / sum(cm1_test)
## [1] 0.7138554
cm2_test = table(predicted = as.numeric(test_pred >= 0.5), actual = test$type)
library(caret)
train_con_mat = confusionMatrix(cm2_test)
c(train_con_mat$overall["Accuracy"])
##  Accuracy 
## 0.7138554

##Part8 #Draw ROC and calculate AUC

library(ROCR)
## Warning: package 'ROCR' was built under R version 3.6.3
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.6.3
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(gplots)
# first install.packages("ROCR")
# test the accuracy sensitivity, spcificity
c(train_con_mat$byClass["Sensitivity"], 
  train_con_mat$byClass["Specificity"])
## Sensitivity Specificity 
##   0.8744395   0.3853211
# ROC curve
predROCR <- prediction(test_predm, test$type)
perfROCR <- performance(predROCR, "tpr", "fpr")
plot(perfROCR, colorize = TRUE)

# Calculate the AUC
performance(predROCR, "auc")@y.values
## [[1]]
## [1] 0.7558522