ANLY 510 Final Exam

##Part1:

library(MASS)
train <- rbind(Pima.tr, Pima.tr2)
test  <- Pima.te
train$type <- as.integer(train$type) - 1L
test$type <- as.integer(test$type) - 1L
# Print out the missing value
sapply(train, function(x) sum(is.na(x)))

## npreg   glu    bp  skin   bmi   ped   age  type 
##     0     0    13    98     3     0     0     0

##Part2:

library(GGally)

## Warning: package 'GGally' was built under R version 3.6.3

## Loading required package: ggplot2

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

#Plot a scatterplot matrix between all the explanatory variables
pairs(subset(train, select = -c(type)), col = as.factor(train$type))

#Use jitter to make graph more informative
library(ggplot2)
ggplot(train, aes(x = age, y = type)) +
  geom_jitter(width = 0.5, height = 0.03, alpha = .2) +
  geom_smooth(method = "glm", se = FALSE,
              method.args = list(family = "binomial")) +
  labs(y = expression(hat(P)(Diabetic)))

##Part3: #Print out the coefficients and p-value.

library(broom)

## Warning: package 'broom' was built under R version 3.6.3

lg1 <- glm(type ~ age + bmi, data = train, family = binomial)
tidy(lg1)

## # A tibble: 3 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)  -5.80     0.687       -8.45 3.02e-17
## 2 age           0.0538   0.00904      5.95 2.63e- 9
## 3 bmi           0.103    0.0173       5.93 2.97e- 9

##Part 4 #test the model fitted?

lgs_fun   <- function(par, x) {
  1 / (1 + exp(-x %*% par)) 
}
lgs_fun(lg1$coefficients, c(1, 35, 32))

##           [,1]
## [1,] 0.3470908

lgs_fun(lg1$coefficients, c(1, 35, 22))

##           [,1]
## [1,] 0.1600962

##Part 5 #Calculate the odds

exp(predict(lg1, response = "link", newdata = data.frame(age = 55, bmi = 37)))

##        1 
## 2.605789

##Part 6 #Remove the missing value first and then calculate

cm1 <- table(train$type[!is.na(train$bmi)], predict(lg1, type = "response") >= 0.5)
cm1

##    
##     FALSE TRUE
##   0   276   48
##   1   114   59

# Predicition Accurancy
sum(diag(cm1)) / sum(cm1)

## [1] 0.6740443

##Part 7 # prediciton accurancy in Test set

sapply(test, function(x) sum(is.na(x)))

## npreg   glu    bp  skin   bmi   ped   age  type 
##     0     0     0     0     0     0     0     0

library(GGally)
test_pred <- predict(lg1, type = "response", newdata = test)
test_predm <- lgs_fun(lg1$coefficients, as.matrix(cbind(1, test[, c("age", "bmi")])))
cm1_test <- table(test$type,
                  test_pred >= 0.5)
cm1_test

##    
##     FALSE TRUE
##   0   195   28
##   1    67   42

sum(diag(cm1_test)) / sum(cm1_test)

## [1] 0.7138554

cm2_test = table(predicted = as.numeric(test_pred >= 0.5), actual = test$type)
library(caret)
train_con_mat = confusionMatrix(cm2_test)
c(train_con_mat$overall["Accuracy"])

##  Accuracy 
## 0.7138554

##Part8 #Draw ROC and calculate AUC

library(ROCR)

## Warning: package 'ROCR' was built under R version 3.6.3

## Loading required package: gplots

## Warning: package 'gplots' was built under R version 3.6.3

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

library(gplots)
# first install.packages("ROCR")
# test the accuracy sensitivity, spcificity
c(train_con_mat$byClass["Sensitivity"], 
  train_con_mat$byClass["Specificity"])

## Sensitivity Specificity 
##   0.8744395   0.3853211

# ROC curve
predROCR <- prediction(test_predm, test$type)
perfROCR <- performance(predROCR, "tpr", "fpr")
plot(perfROCR, colorize = TRUE)

# Calculate the AUC
performance(predROCR, "auc")@y.values

## [[1]]
## [1] 0.7558522

ANLY 510 Final Exam

Jiayi Zhang