1. Predict legendary Pokémon using SVM: One-Hot-Encoding vs. KNN imputation (hint: you can reference Week 10’s RPubs to complete this assignment)

(1a) Load required packages and data

library(dplyr)
library(ggplot2)
library(tidyr)
library(caret)
library(skimr)
library(psych)
library(e1071)
library(data.table)
library(Matrix)
library(keras)
poke <- read.csv("https://www.dropbox.com/s/znbta9u9tub2ox9/pokemon.csv?dl=1")



(1b) Subset the following variables (columns) into a new dataframe, rename type1 to type and check out how many unique “types” are in this variable

poke <- as_tibble(poke) # tbl_df() is deprecated; as_tibble() is the current equivalent

# A tibble is a data frame with stricter subsetting and tidier printing. (The Week 10
# Rmd also renames the classification column; since it is not used here, I skip that step.)
classify_legendary = select(poke, is_legendary, hp, weight_kg, height_m, sp_attack, sp_defense, type1, capture_rate)
names(classify_legendary) ## column 7 is 'type1'; rename it to 'type' below
## [1] "is_legendary" "hp"           "weight_kg"    "height_m"     "sp_attack"   
## [6] "sp_defense"   "type1"        "capture_rate"
colnames(classify_legendary)[7] <- 'type'
names(classify_legendary) ## checked rename
## [1] "is_legendary" "hp"           "weight_kg"    "height_m"     "sp_attack"   
## [6] "sp_defense"   "type"         "capture_rate"
print(unique(classify_legendary$type)) ## check out how many unique “types” are in this variable
##  [1] "grass"    "fire"     "water"    "bug"      "normal"   "poison"  
##  [7] "electric" "ground"   "fairy"    "fighting" "psychic"  "rock"    
## [13] "ghost"    "ice"      "dragon"   "dark"     "steel"    "flying"

(1c) Replace NA values in weight_kg and height_m with 0, convert capture_rate to numeric class, convert is_legendary to factor class

classify_legendary$capture_rate <- as.numeric(classify_legendary$capture_rate) # non-numeric entries are coerced to NA (with a warning)
classify_legendary$is_legendary <- as.factor(classify_legendary$is_legendary)
classify_legendary$weight_kg[is.na(classify_legendary$weight_kg)] <- 0
classify_legendary$height_m[is.na(classify_legendary$height_m)] <- 0
colSums(is.na(classify_legendary))
## is_legendary           hp    weight_kg     height_m    sp_attack   sp_defense 
##            0            0            0            0            0            0 
##         type capture_rate 
##            0            1
which(is.na(classify_legendary$capture_rate) == TRUE)
## [1] 774

While converting capture_rate to numeric, I found that row 774 held two values in a single cell, which coerced to NA, so I address that before the next step.

## original value in row 774's capture_rate: "30 (Meteorite)255 (Core)"
## per the note at the end of the sheet, the first value (30) is the one to use

classify_legendary[774, c("capture_rate")] <- 30
colSums(is.na(classify_legendary)) ## check na value
## is_legendary           hp    weight_kg     height_m    sp_attack   sp_defense 
##            0            0            0            0            0            0 
##         type capture_rate 
##            0            0
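As an aside, the NA replacement in 1c can be written more compactly with tidyr (already loaded); a sketch equivalent to the two base-R assignments above:

classify_legendary <- replace_na(classify_legendary, list(weight_kg = 0, height_m = 0)) # same effect as the two assignments in 1c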

(1d) One-hot-encode (OHE) the type variable (note that this will also erase the original column). After that, how many additional columns of variables are generated from the OHE? What are they?

dummies_model <- dummyVars(is_legendary ~ ., data = classify_legendary)
data_mat <- predict(dummies_model, newdata = classify_legendary) # one-hot-encode all categorical variables
classify_legendary_ohe <- data.frame(data_mat)

## re-attach the outcome column, which dummyVars() dropped
classify_legendary_ohe <- cbind(classify_legendary$is_legendary, classify_legendary_ohe)
colnames(classify_legendary_ohe)[1] <- "is_legendary"
## count the columns that exist after OHE but not before
## (a nested-loop version of a set difference on the column names)
count = 0
for (i in names(classify_legendary_ohe)){
  exist = FALSE  
    for (j in names(classify_legendary)){
        if (i == j){
          exist = TRUE
        }
    }
  
  if (exist == FALSE){
    print(i)
    count = count + 1
  }
}
## [1] "typebug"
## [1] "typedark"
## [1] "typedragon"
## [1] "typeelectric"
## [1] "typefairy"
## [1] "typefighting"
## [1] "typefire"
## [1] "typeflying"
## [1] "typeghost"
## [1] "typegrass"
## [1] "typeground"
## [1] "typeice"
## [1] "typenormal"
## [1] "typepoison"
## [1] "typepsychic"
## [1] "typerock"
## [1] "typesteel"
## [1] "typewater"
print(paste0("new columns generated by one hot encoding: ", count))
## [1] "new columns generated by one hot encoding: 18"

(1e) Execute the following commands

(i) Do a 75-25 train-test split (remember to set seed to 123)
set.seed(123)
train_row_numbers <- createDataPartition(classify_legendary_ohe$is_legendary, p=0.75, list=FALSE)

train <- classify_legendary_ohe[train_row_numbers, ]
test <- classify_legendary_ohe[-train_row_numbers, ]
(ii) Using is_legendary as the outcome variable, train a SVM model on the training data
svm.fit <- svm(is_legendary~., 
                 scale = TRUE, # the default; numeric predictors are scaled to zero mean and unit variance
               kernel = "radial", # other kernels are available
               degree = 3, # only used when kernel = "polynomial"
                # gamma = if (is.vector(x)) 1 else 1 / ncol(x), # you can override the default gamma
               cost = 1, # the regularization constant C (cost of constraint violation)
               probability = FALSE, # whether to output probability predictions
               na.action = na.omit,
                 data = train)
(iii) Predict the trained model on the testing data
predict_test_svm <- predict(svm.fit, test)
(iv) Generate confusion matrix and calculate accuracy score
confusion_svm <- table(Predicted = predict_test_svm, 
               Actual = test$is_legendary)

print(confusion_svm)
##          Actual
## Predicted   0   1
##         0 181  16
##         1   1   1
print(paste0('acc score: ', (confusion_svm[1, 1] + confusion_svm[2, 2]) / sum(confusion_svm) * 100))
## [1] "acc score: 91.4572864321608"

(1f) Repeat steps 1a-1b, convert capture_rate to numeric class, convert is_legendary to factor class. Now check your dataframe using colSums(is.na(your dataframe)) to see whether missing values exist in any of the columns, then use the KNN method to impute missing values in those columns (you will need to load the RANN package), and check again with colSums(is.na()). Finally, repeat step 1e to generate the confusion matrix and accuracy score for the KNN-imputed SVM prediction. Answer this question: compared to the one-hot-encoded SVM model, does the KNN-imputed model have higher prediction accuracy?

poke_1 <- read.csv("https://www.dropbox.com/s/znbta9u9tub2ox9/pokemon.csv?dl=1")
poke_1 = tbl_df(poke_1)
KNN_classify_legendary = select(poke_1, is_legendary, hp, weight_kg, height_m, sp_attack, sp_defense, type1, capture_rate)
colnames(KNN_classify_legendary)[7] <- 'type'
KNN_classify_legendary$capture_rate <- as.numeric(KNN_classify_legendary$capture_rate)
KNN_classify_legendary$is_legendary <- as.factor(KNN_classify_legendary$is_legendary)
colSums(is.na(KNN_classify_legendary))
## is_legendary           hp    weight_kg     height_m    sp_attack   sp_defense 
##            0            0           20           20            0            0 
##         type capture_rate 
##            0            1
library(RANN)
pre_process_missing_data <- preProcess(as.data.frame(KNN_classify_legendary), method="knnImpute")

KNN_classify_legendary <- predict(pre_process_missing_data, newdata = KNN_classify_legendary)
colSums(is.na(KNN_classify_legendary))
## is_legendary           hp    weight_kg     height_m    sp_attack   sp_defense 
##            0            0            0            0            0            0 
##         type capture_rate 
##            0            0
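Note that method = "knnImpute" does more than fill in NAs: caret centers and scales every numeric column, because neighbors are found in standardized space, so this pipeline differs from the OHE one by more than the imputation. The fitted preProcess object records this:

pre_process_missing_data$method # lists center and scale alongside knnImpute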
set.seed(123)
KNN_train_row_numbers <- createDataPartition(KNN_classify_legendary$is_legendary, p=0.75, list=FALSE)
KNN_train <- KNN_classify_legendary[KNN_train_row_numbers, ]
KNN_test <- KNN_classify_legendary[-KNN_train_row_numbers, ]
KNN_svm.fit <- svm(is_legendary~., 
                 scale = TRUE, # the default; numeric predictors are scaled to zero mean and unit variance
               kernel = "radial", # other kernels are available
               degree = 3, # only used when kernel = "polynomial"
                # gamma = if (is.vector(x)) 1 else 1 / ncol(x), # you can override the default gamma
               cost = 1, # the regularization constant C (cost of constraint violation)
               probability = FALSE, # whether to output probability predictions
               na.action = na.omit,
                 data = KNN_train)
KNN_predict_test_svm <- predict(KNN_svm.fit, KNN_test)

KNN_confusion_svm <- table(Predicted = KNN_predict_test_svm, 
               Actual = KNN_test$is_legendary)


print(paste0('The KNN-imputed model has higher prediction accuracy. KNN acc score: ', (KNN_confusion_svm[1, 1] + KNN_confusion_svm[2, 2]) / sum(KNN_confusion_svm) * 100))
## [1] "The KNN-imputed model has higher prediction accuracy. KNN acc score: 93.4673366834171"

Answer: yes. The KNN-imputed SVM reaches 93.47% test accuracy versus 91.46% for the one-hot-encoded model, so the KNN-imputed model predicts more accurately here (keeping in mind that the knnImpute pipeline also standardized the predictors, as noted above).

2. Compare prediction accuracy using ROC and AUC metrics

(2a) Load required packages and data

library(e1071)
library(caret)
library(gbm)
library(pROC)
library(plotROC)
library(ISLR)
data('Default')
head(Default)
##   default student   balance    income
## 1      No      No  729.5265 44361.625
## 2      No     Yes  817.1804 12106.135
## 3      No      No 1073.5492 31767.139
## 4      No      No  529.2506 35704.494
## 5      No      No  785.6559 38463.496
## 6      No     Yes  919.5885  7491.559
colSums(is.na(Default))
## default student balance  income 
##       0       0       0       0
Default$default <- ifelse(Default$default == "No", 0, 1)
Default$student <- ifelse(Default$student == "No", 0, 1)
(i) Do a 75-25 train-test split (set seed to 123 for all models)
set.seed(123)

default_train_row_numbers <- createDataPartition(Default$default, p=0.75, list=FALSE)

default_train <- Default[default_train_row_numbers, ]
default_test <- Default[-default_train_row_numbers, ]
(ii) Train a logit, a probit, an SVM, and a logistic gradient boosting machine (GBM) model on the training data, using default as the outcome variable and all other variables as explanatory variables.

logit model

logit_deft <- glm(default ~ ., family = binomial, data = default_train)

probit model

probit_deft <- glm(default ~ ., family = binomial(link = "probit"), data = default_train)

GBM

library(rsample) 
library(gbm)
library(xgboost)
library(pdp)
library(ggplot2)
library(purrr)
## tune hyperparameters via grid search over shrinkage, depth, minimum node size,
## and bagging fraction

hyper_grid <- expand.grid(
 shrinkage = c(.01, .1, .2),
 interaction.depth = c(1, 3, 5),
 n.minobsinnode = c(5, 10, 15),
 bag.fraction = c(.5, .75, 1),
 optimal_trees = 0, # to be filled in by the tuning loop below
 min_RMSE = 0)      # to be filled in by the tuning loop below
## iterate over all 81 (3^4) combinations to find the best hyperparameter set

for(i in 1:nrow(hyper_grid)) {
  
  # reproducibility
  set.seed(123)
  
  # train model
  gbm.tune <- gbm(
    default ~ .,
    data = default_train,
    distribution = "bernoulli",
    cv.folds = 5,
    n.trees = 500,
    interaction.depth = hyper_grid$interaction.depth[i],
    shrinkage = hyper_grid$shrinkage[i],
    n.minobsinnode = hyper_grid$n.minobsinnode[i],
    bag.fraction = hyper_grid$bag.fraction[i],
    train.fraction = .75,
    n.cores = NULL, # will use all cores by default
    verbose = FALSE
  )
  
  # record the iteration count that minimizes the validation (not training) error
  hyper_grid$optimal_trees[i] <- which.min(gbm.tune$valid.error)
  hyper_grid$min_RMSE[i] <- sqrt(min(gbm.tune$valid.error)) # sqrt of the minimum validation deviance
}
optimal_para <- hyper_grid %>% dplyr::arrange(min_RMSE) %>% head(1)

# define a list of optimal hyperparameters
shrink <- optimal_para$shrinkage
ntrees <- optimal_para$optimal_trees
minobs <- optimal_para$n.minobsinnode
bagfrac <- optimal_para$bag.fraction
depth <- optimal_para$interaction.depth

# train final GBM model
gbm_deft <- gbm(
  default ~ .,
  data = default_train,
  distribution = "bernoulli",
  cv.folds = 5,
  n.trees = ntrees,
  interaction.depth = depth,
  shrinkage = shrink,
  n.minobsinnode = minobs,
  bag.fraction = bagfrac, 
  n.cores = NULL, # will use all cores by default
  verbose = FALSE
  )  
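Because the final model is fit with cv.folds = 5, the CV-selected number of iterations can be double-checked with gbm.perf(); a quick sketch:

best_iter <- gbm.perf(gbm_deft, method = "cv") # plots train/CV deviance and returns the CV-optimal iteration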

SVM

## convert the outcome to factor so svm() performs classification rather than regression
default_test$default <- as.factor(default_test$default)
class(default_test$default)
## [1] "factor"
default_train$default <- as.factor(default_train$default)
class(default_train$default)
## [1] "factor"
svm_deft <- svm(default ~ ., 
                 scale = TRUE, # the default; numeric predictors are scaled to zero mean and unit variance
               kernel = "radial", # other kernels are available
               degree = 3, # only used when kernel = "polynomial"
                # gamma = if (is.vector(x)) 1 else 1 / ncol(x), # you can override the default gamma
               cost = 1, # the regularization constant C (cost of constraint violation)
               probability = FALSE, # whether to output probability predictions
               na.action = na.omit,
                 data = default_train)
(iii) Predict the trained model on the testing data for all four models
pred_logit_deft <- predict(logit_deft, default_test, type = "response")
pred_probit_deft <- predict(probit_deft, default_test, type = "response")
pred_svm_deft <- predict(svm_deft, default_test)
pred_gbm_deft <- predict(gbm_deft, default_test, type = "response") # with n.trees unspecified, predict.gbm falls back to the CV-selected iteration and prints a message
(iv) Generate confusion matrices and calculate accuracy scores for all four models.

(convert the logit, probit, and GBM probability predictions to 0/1 at a 0.5 threshold)

convert_gbm_pred <- as.numeric(pred_gbm_deft > 0.5)

confmat_gbm <- table(Predicted = convert_gbm_pred, Actual = default_test$default)

(confmat_gbm[1 ,1] + confmat_gbm[2, 2]) / sum(confmat_gbm)
## [1] 0.9736
convert_logit_pred <- as.numeric(pred_logit_deft > 0.5)

confmat_logit <- table(Predicted = convert_logit_pred, Actual = default_test$default)

(confmat_logit[1 ,1] + confmat_logit[2, 2]) / sum(confmat_logit)
## [1] 0.9732
convert_probit_pred <- as.numeric(pred_probit_deft > 0.5)

confmat_probit <- table(Predicted = convert_probit_pred, Actual = default_test$default)

(confmat_probit[1 ,1] + confmat_probit[2, 2]) / sum(confmat_probit)
## [1] 0.9736
confmat_svm <- table(Predicted = pred_svm_deft, Actual = default_test$default)

(confmat_svm[1 ,1] + confmat_svm[2, 2]) / sum(confmat_svm)
## [1] 0.9704
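The four scores can also be tabulated in one pass; a small sketch over the confusion matrices created above:

acc <- function(cm) sum(diag(cm)) / sum(cm) # accuracy = diagonal total over table total
sapply(list(logit = confmat_logit, probit = confmat_probit,
            gbm = confmat_gbm, svm = confmat_svm), acc)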

(2c) Generate the ROC object for all four models

library(pROC) 
library(plotROC)  
ROC_logit <- roc(default_test$default, convert_logit_pred)
ROC_probit <- roc(default_test$default, convert_probit_pred)
ROC_svm <- roc(default_test$default, as.numeric(pred_svm_deft))
ROC_gbm <- roc(default_test$default, convert_gbm_pred)
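One caveat: roc() is fed hard 0/1 predictions here, so each curve has a single operating point and the AUCs below understate the models' ranking ability. ROC analysis is normally run on continuous scores; a sketch for the logit model (probit and GBM work the same way, while the SVM would need refitting with probability = TRUE):

ROC_logit_prob <- roc(default_test$default, pred_logit_deft) # ROC from predicted probabilities, not thresholded labels
auc(ROC_logit_prob)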

(2d) Calculate the AUC for all four models from the ROC objects you just generated, which model has the largest AUC?

auc(ROC_logit)
## Area under the curve: 0.6445
auc(ROC_probit)
## Area under the curve: 0.6388
auc(ROC_svm)
## Area under the curve: 0.5841
auc(ROC_gbm)
## Area under the curve: 0.6447

The gradient boosting machine (GBM) has the largest AUC (0.6447).

(2e) Plot all the ROC of all four models using plot.roc(), from your response in 2d and through your visual inspection, which model has the best predictive power?

par(mar=c(1,1,1,1))

plot(ROC_logit, xlim = c(1.1, -0.1))
lines(ROC_probit, col="blue")
lines(ROC_svm, col="red")
lines(ROC_gbm, col="green")
legend(0.2, 0.5,
       title = "model   AUC",
       legend = c("logit:   0.6445", "probit: 0.6388", "SVM:  0.5841", "GBM:  0.6447"),
       col = c("black", "blue", "red", "green"),
       lty = 1, # all four curves are drawn as solid lines above
       cex = 0.7)
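pROC also ships a ggplot2-based alternative to plot.roc(); a sketch drawing all four curves in one panel with an automatic legend:

ggroc(list(logit = ROC_logit, probit = ROC_probit, SVM = ROC_svm, GBM = ROC_gbm)) # ggplot2 rendering of the same ROC objects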

Overall, the gradient boosting machine has the best predictive power by AUC (0.6447), though the logit model (0.6445) is visually almost indistinguishable from it.

3. Resampling methods: K-fold vs. Bootstrapping

library(dplyr)
library(rsample)
library(boot)
library(purrr)

(3a) Estimate a logistic regression model of the following form:
Default = student + balance + income + income^2 + … + income^n
Using 10-fold cross-validation, evaluate the variable income up to a 10th-degree polynomial, and use the cross-validated results to find out at which polynomial degree the model produces the lowest mean prediction error (delta).

kfcv_error <- function(x) {
  set.seed(123)
  logit_deft_2 <- glm(default ~ student + balance + poly(income, x, raw = FALSE), family = binomial, data = default_train)
  cv.glm(default_train, logit_deft_2, K = 10)$delta[1] # 10 folds; delta[1] is the raw CV estimate of prediction error
}

me_li <- 1:10 %>% map_dbl(kfcv_error)
me_li
##  [1] 0.02147460 0.02150658 0.02153383 0.02157427 0.02160413 0.02162116
##  [7] 0.02164158 0.02165694 0.02165978 0.02171254
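Note that delta[1] is cv.glm()'s raw cross-validated estimate of its default cost, mean squared prediction error, not a misclassification rate. If an error rate were wanted instead, a cost function could be supplied, as in the boot documentation's example; a hypothetical sketch (not what is run above):

miscl_cost <- function(r, pi = 0) mean(abs(r - pi) > 0.5) # misclassification at a 0.5 cutoff
fit_deg1 <- glm(default ~ student + balance + poly(income, 1),
                family = binomial, data = default_train)
cv.glm(default_train, fit_deg1, cost = miscl_cost, K = 10)$delta[1]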
min_degree <- which.min(me_li) # index (= degree) of the smallest CV error
min_me <- me_li[min_degree]

print(paste0("The lowest mean prediction error ", min_me, " occurs at degree ", min_degree, "."))
## [1] "The lowest mean prediction error 0.0214745998145375 occurs at degree 1."

(3b) Based on the result of 3a, estimate the same logistic regression model with the polynomial of income set exactly at that degree, and use 1,000 bootstrap replications to get simulated standard errors (you will need to use boot()). Show your result and tell us which variable (except for the intercept term) has the largest “bias.”

result <- function(data, index) {
  # statistic for boot(): refit the logit on the bootstrap sample selected by index
  # (no set.seed() needed here; glm fitting is deterministic and boot() draws the indices)
  logit_deft_3 <- glm(default ~ student + balance + income,
                      family = binomial, data = data, subset = index)
  coef(logit_deft_3)
}

set.seed(123)
boot(default_train, result, R = 1000)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = default_train, statistic = result, R = 1000)
## 
## 
## Bootstrap Statistics :
##          original        bias     std. error
## t1* -1.107857e+01 -3.623368e-02 5.653469e-01
## t2* -5.506691e-01 -1.486520e-02 2.704693e-01
## t3*  5.772478e-03  2.829435e-05 2.660595e-04
## t4*  7.082431e-06 -2.660294e-07 9.532645e-06
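To read the bias column programmatically rather than by eye, the boot() result can be saved and inspected; a sketch (boot_out is a hypothetical name; re-running under the same seed reproduces the table above):

set.seed(123)
boot_out <- boot(default_train, result, R = 1000)
bias <- colMeans(boot_out$t) - boot_out$t0 # bootstrap bias per coefficient
names(bias) <- c("(Intercept)", "student", "balance", "income")
bias[-1][which.max(abs(bias[-1]))] # largest absolute bias, intercept excluded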

Excluding the intercept (t1*), the largest bias in absolute terms belongs to t2*, the student coefficient (about -1.49e-02); the biases of balance (t3*) and income (t4*) are orders of magnitude smaller.